diff options
Diffstat (limited to 'arch/x86')
122 files changed, 3309 insertions, 1153 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6d4774f203d0..c5ff296bc5d1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -75,6 +75,7 @@ config X86 select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO + select ARCH_SUPPORTS_ACPI select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 select ARCH_USE_BUILTIN_BSWAP @@ -123,6 +124,7 @@ config X86 select HAVE_ARCH_MMAP_RND_BITS if MMU select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT select HAVE_ARCH_COMPAT_MMAP_BASES if MMU && COMPAT + select HAVE_ARCH_PREL32_RELOCATIONS select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_THREAD_STRUCT_WHITELIST select HAVE_ARCH_TRACEHOOK @@ -178,7 +180,8 @@ config X86 select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP - select HAVE_RCU_TABLE_FREE + select HAVE_RCU_TABLE_FREE if PARAVIRT + select HAVE_RCU_TABLE_INVALIDATE if HAVE_RCU_TABLE_FREE select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION select HAVE_STACKPROTECTOR if CC_HAS_SANE_STACKPROTECTOR @@ -187,6 +190,7 @@ config X86 select HAVE_SYSCALL_TRACEPOINTS select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_USER_RETURN_NOTIFIER + select HOTPLUG_SMT if SMP select IRQ_FORCED_THREADING select NEED_SG_DMA_LENGTH select PCI_LOCKLESS_CONFIG @@ -345,8 +349,6 @@ config PGTABLE_LEVELS default 3 if X86_PAE default 2 -source "init/Kconfig" - config CC_HAS_SANE_STACKPROTECTOR bool default $(success,$(srctree)/scripts/gcc-x86_64-has-stack-protector.sh $(CC)) if 64BIT @@ -355,8 +357,6 @@ config CC_HAS_SANE_STACKPROTECTOR We have to make sure stack protector is unconditionally disabled if the compiler produces broken code. -source "kernel/Kconfig.freezer" - menu "Processor type and features" config ZONE_DMA @@ -1043,8 +1043,6 @@ config SCHED_MC_PRIO If unsure say Y here. -source "kernel/Kconfig.preempt" - config UP_LATE_INIT def_bool y depends on !SMP && X86_LOCAL_APIC @@ -1638,8 +1636,6 @@ config ILLEGAL_POINTER_VALUE default 0 if X86_32 default 0xdead000000000000 if X86_64 -source "mm/Kconfig" - config X86_PMEM_LEGACY_DEVICE bool @@ -2865,9 +2861,7 @@ config X86_SYSFB endmenu -menu "Executable file formats / Emulations" - -source "fs/Kconfig.binfmt" +menu "Binary Emulations" config IA32_EMULATION bool "IA32 Emulation" @@ -2937,20 +2931,6 @@ config X86_DMA_REMAP config HAVE_GENERIC_GUP def_bool y -source "net/Kconfig" - -source "drivers/Kconfig" - source "drivers/firmware/Kconfig" -source "fs/Kconfig" - -source "arch/x86/Kconfig.debug" - -source "security/Kconfig" - -source "crypto/Kconfig" - source "arch/x86/kvm/Kconfig" - -source "lib/Kconfig" diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index c6dd1d980081..7d68f0c7cfb1 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -1,11 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 -menu "Kernel hacking" config TRACE_IRQFLAGS_SUPPORT def_bool y -source "lib/Kconfig.debug" - config EARLY_PRINTK_USB bool @@ -410,5 +407,3 @@ endchoice config FRAME_POINTER depends on !UNWINDER_ORC && !UNWINDER_GUESS bool - -endmenu diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 7e3c07d6ad42..94859241bc3e 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -219,7 +219,7 @@ sha256_ni_instr :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA2 KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr) KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr) -LDFLAGS := -m elf_$(UTS_MACHINE) +KBUILD_LDFLAGS := -m elf_$(UTS_MACHINE) # # The 64-bit kernel must be aligned to 2MB. Pass -z max-page-size=0x200000 to @@ -227,7 +227,7 @@ LDFLAGS := -m elf_$(UTS_MACHINE) # by the linker. # ifdef CONFIG_X86_64 -LDFLAGS += $(call ld-option, -z max-page-size=0x200000) +KBUILD_LDFLAGS += $(call ld-option, -z max-page-size=0x200000) endif # Speed up the build diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um index 45af19921ebd..91085a08de6c 100644 --- a/arch/x86/Makefile.um +++ b/arch/x86/Makefile.um @@ -4,7 +4,7 @@ core-y += arch/x86/crypto/ ifeq ($(CONFIG_X86_32),y) START := 0x8048000 -LDFLAGS += -m elf_i386 +KBUILD_LDFLAGS += -m elf_i386 ELF_ARCH := i386 ELF_FORMAT := elf32-i386 CHECKFLAGS += -D__i386__ @@ -13,8 +13,6 @@ KBUILD_CFLAGS += $(call cc-option,-m32) KBUILD_AFLAGS += $(call cc-option,-m32) LINK-y += $(call cc-option,-m32) -export LDFLAGS - LDS_EXTRA := -Ui386 export LDS_EXTRA @@ -45,7 +43,7 @@ KBUILD_CFLAGS += -fno-builtin -m64 CHECKFLAGS += -m64 -D__x86_64__ KBUILD_AFLAGS += -m64 -LDFLAGS += -m elf_x86_64 +KBUILD_LDFLAGS += -m elf_x86_64 KBUILD_CPPFLAGS += -m64 ELF_ARCH := i386:x86-64 diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 169c2feda14a..28764dacf018 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -42,16 +42,16 @@ KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n UBSAN_SANITIZE :=n -LDFLAGS := -m elf_$(UTS_MACHINE) +KBUILD_LDFLAGS := -m elf_$(UTS_MACHINE) # Compressed kernel should be built as PIE since it may be loaded at any # address by the bootloader. ifeq ($(CONFIG_X86_32),y) -LDFLAGS += $(call ld-option, -pie) $(call ld-option, --no-dynamic-linker) +KBUILD_LDFLAGS += $(call ld-option, -pie) $(call ld-option, --no-dynamic-linker) else # To build 64-bit compressed kernel as PIE, we disable relocation # overflow check to avoid relocation overflow error with a new linker # command-line option, -z noreloc-overflow. -LDFLAGS += $(shell $(LD) --help 2>&1 | grep -q "\-z noreloc-overflow" \ +KBUILD_LDFLAGS += $(shell $(LD) --help 2>&1 | grep -q "\-z noreloc-overflow" \ && echo "-z noreloc-overflow -pie --no-dynamic-linker") endif LDFLAGS_vmlinux := -T diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 302517929932..d1e19f358b6e 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -23,11 +23,8 @@ * _ctype[] in lib/ctype.c is needed by isspace() of linux/ctype.h. * While both lib/ctype.c and lib/cmdline.c will bring EXPORT_SYMBOL * which is meaningless and will cause compiling error in some cases. - * So do not include linux/export.h and define EXPORT_SYMBOL(sym) - * as empty. */ -#define _LINUX_EXPORT_H -#define EXPORT_SYMBOL(sym) +#define __DISABLE_EXPORTS #include "misc.h" #include "error.h" diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index 2ddbe3a1868b..3582ae885ee1 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -154,8 +154,7 @@ static struct shash_alg ghash_alg = { .cra_name = "__ghash", .cra_driver_name = "__ghash-pclmulqdqni", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_SHASH | - CRYPTO_ALG_INTERNAL, + .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = GHASH_BLOCK_SIZE, .cra_ctxsize = sizeof(struct ghash_ctx), .cra_module = THIS_MODULE, @@ -315,9 +314,8 @@ static struct ahash_alg ghash_async_alg = { .cra_driver_name = "ghash-clmulni", .cra_priority = 400, .cra_ctxsize = sizeof(struct ghash_async_ctx), - .cra_flags = CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC, + .cra_flags = CRYPTO_ALG_ASYNC, .cra_blocksize = GHASH_BLOCK_SIZE, - .cra_type = &crypto_ahash_type, .cra_module = THIS_MODULE, .cra_init = ghash_async_init_tfm, .cra_exit = ghash_async_exit_tfm, diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index 790377797544..f012b7e28ad1 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -169,7 +169,6 @@ static struct shash_alg alg = { .cra_name = "poly1305", .cra_driver_name = "poly1305-simd", .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = POLY1305_BLOCK_SIZE, .cra_module = THIS_MODULE, }, diff --git a/arch/x86/crypto/sha1-mb/sha1_mb.c b/arch/x86/crypto/sha1-mb/sha1_mb.c index e17655ffde79..b93805664c1d 100644 --- a/arch/x86/crypto/sha1-mb/sha1_mb.c +++ b/arch/x86/crypto/sha1-mb/sha1_mb.c @@ -746,9 +746,8 @@ static struct ahash_alg sha1_mb_areq_alg = { * algo may not have completed before hashing thread * sleep */ - .cra_flags = CRYPTO_ALG_TYPE_AHASH | - CRYPTO_ALG_ASYNC | - CRYPTO_ALG_INTERNAL, + .cra_flags = CRYPTO_ALG_ASYNC | + CRYPTO_ALG_INTERNAL, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT @@ -871,10 +870,16 @@ static struct ahash_alg sha1_mb_async_alg = { .base = { .cra_name = "sha1", .cra_driver_name = "sha1_mb", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC, + /* + * Low priority, since with few concurrent hash requests + * this is extremely slow due to the flush delay. Users + * whose workloads would benefit from this can request + * it explicitly by driver name, or can increase its + * priority at runtime using NETLINK_CRYPTO. + */ + .cra_priority = 50, + .cra_flags = CRYPTO_ALG_ASYNC, .cra_blocksize = SHA1_BLOCK_SIZE, - .cra_type = &crypto_ahash_type, .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT(sha1_mb_async_alg.halg.base.cra_list), .cra_init = sha1_mb_async_init_tfm, diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index fc61739150e7..7391c7de72c7 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -104,7 +104,6 @@ static struct shash_alg sha1_ssse3_alg = { .cra_name = "sha1", .cra_driver_name = "sha1-ssse3", .cra_priority = 150, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -157,7 +156,6 @@ static struct shash_alg sha1_avx_alg = { .cra_name = "sha1", .cra_driver_name = "sha1-avx", .cra_priority = 160, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -249,7 +247,6 @@ static struct shash_alg sha1_avx2_alg = { .cra_name = "sha1", .cra_driver_name = "sha1-avx2", .cra_priority = 170, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -307,7 +304,6 @@ static struct shash_alg sha1_ni_alg = { .cra_name = "sha1", .cra_driver_name = "sha1-ni", .cra_priority = 250, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, } diff --git a/arch/x86/crypto/sha256-mb/sha256_mb.c b/arch/x86/crypto/sha256-mb/sha256_mb.c index 4c46ac1b6653..97c5fc43e115 100644 --- a/arch/x86/crypto/sha256-mb/sha256_mb.c +++ b/arch/x86/crypto/sha256-mb/sha256_mb.c @@ -745,9 +745,8 @@ static struct ahash_alg sha256_mb_areq_alg = { * algo may not have completed before hashing thread * sleep */ - .cra_flags = CRYPTO_ALG_TYPE_AHASH | - CRYPTO_ALG_ASYNC | - CRYPTO_ALG_INTERNAL, + .cra_flags = CRYPTO_ALG_ASYNC | + CRYPTO_ALG_INTERNAL, .cra_blocksize = SHA256_BLOCK_SIZE, .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT @@ -870,11 +869,16 @@ static struct ahash_alg sha256_mb_async_alg = { .base = { .cra_name = "sha256", .cra_driver_name = "sha256_mb", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_AHASH | - CRYPTO_ALG_ASYNC, + /* + * Low priority, since with few concurrent hash requests + * this is extremely slow due to the flush delay. Users + * whose workloads would benefit from this can request + * it explicitly by driver name, or can increase its + * priority at runtime using NETLINK_CRYPTO. + */ + .cra_priority = 50, + .cra_flags = CRYPTO_ALG_ASYNC, .cra_blocksize = SHA256_BLOCK_SIZE, - .cra_type = &crypto_ahash_type, .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT (sha256_mb_async_alg.halg.base.cra_list), diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S index 16c4ccb1f154..d2364c55bbde 100644 --- a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S +++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S @@ -265,7 +265,7 @@ ENTRY(sha256_mb_mgr_get_comp_job_avx2) vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0 vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0 vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0 - vmovd _args_digest(state , idx, 4) , %xmm0 + vmovd _args_digest+4*32(state, idx, 4), %xmm1 vpinsrd $1, _args_digest+5*32(state, idx, 4), %xmm1, %xmm1 vpinsrd $2, _args_digest+6*32(state, idx, 4), %xmm1, %xmm1 vpinsrd $3, _args_digest+7*32(state, idx, 4), %xmm1, %xmm1 diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index 9e79baf03a4b..773a873d2b28 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -109,7 +109,6 @@ static struct shash_alg sha256_ssse3_algs[] = { { .cra_name = "sha256", .cra_driver_name = "sha256-ssse3", .cra_priority = 150, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA256_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -124,7 +123,6 @@ static struct shash_alg sha256_ssse3_algs[] = { { .cra_name = "sha224", .cra_driver_name = "sha224-ssse3", .cra_priority = 150, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA224_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -177,7 +175,6 @@ static struct shash_alg sha256_avx_algs[] = { { .cra_name = "sha256", .cra_driver_name = "sha256-avx", .cra_priority = 160, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA256_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -192,7 +189,6 @@ static struct shash_alg sha256_avx_algs[] = { { .cra_name = "sha224", .cra_driver_name = "sha224-avx", .cra_priority = 160, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA224_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -261,7 +257,6 @@ static struct shash_alg sha256_avx2_algs[] = { { .cra_name = "sha256", .cra_driver_name = "sha256-avx2", .cra_priority = 170, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA256_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -276,7 +271,6 @@ static struct shash_alg sha256_avx2_algs[] = { { .cra_name = "sha224", .cra_driver_name = "sha224-avx2", .cra_priority = 170, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA224_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -343,7 +337,6 @@ static struct shash_alg sha256_ni_algs[] = { { .cra_name = "sha256", .cra_driver_name = "sha256-ni", .cra_priority = 250, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA256_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -358,7 +351,6 @@ static struct shash_alg sha256_ni_algs[] = { { .cra_name = "sha224", .cra_driver_name = "sha224-ni", .cra_priority = 250, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA224_BLOCK_SIZE, .cra_module = THIS_MODULE, } diff --git a/arch/x86/crypto/sha512-mb/sha512_mb.c b/arch/x86/crypto/sha512-mb/sha512_mb.c index 39e2bbdc1836..26b85678012d 100644 --- a/arch/x86/crypto/sha512-mb/sha512_mb.c +++ b/arch/x86/crypto/sha512-mb/sha512_mb.c @@ -778,9 +778,8 @@ static struct ahash_alg sha512_mb_areq_alg = { * algo may not have completed before hashing thread * sleep */ - .cra_flags = CRYPTO_ALG_TYPE_AHASH | - CRYPTO_ALG_ASYNC | - CRYPTO_ALG_INTERNAL, + .cra_flags = CRYPTO_ALG_ASYNC | + CRYPTO_ALG_INTERNAL, .cra_blocksize = SHA512_BLOCK_SIZE, .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT @@ -904,11 +903,16 @@ static struct ahash_alg sha512_mb_async_alg = { .base = { .cra_name = "sha512", .cra_driver_name = "sha512_mb", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_AHASH | - CRYPTO_ALG_ASYNC, + /* + * Low priority, since with few concurrent hash requests + * this is extremely slow due to the flush delay. Users + * whose workloads would benefit from this can request + * it explicitly by driver name, or can increase its + * priority at runtime using NETLINK_CRYPTO. + */ + .cra_priority = 50, + .cra_flags = CRYPTO_ALG_ASYNC, .cra_blocksize = SHA512_BLOCK_SIZE, - .cra_type = &crypto_ahash_type, .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT (sha512_mb_async_alg.halg.base.cra_list), diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index 2b0e2a6825f3..f1b811b60ba6 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -109,7 +109,6 @@ static struct shash_alg sha512_ssse3_algs[] = { { .cra_name = "sha512", .cra_driver_name = "sha512-ssse3", .cra_priority = 150, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA512_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -124,7 +123,6 @@ static struct shash_alg sha512_ssse3_algs[] = { { .cra_name = "sha384", .cra_driver_name = "sha384-ssse3", .cra_priority = 150, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA384_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -188,7 +186,6 @@ static struct shash_alg sha512_avx_algs[] = { { .cra_name = "sha512", .cra_driver_name = "sha512-avx", .cra_priority = 160, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA512_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -203,7 +200,6 @@ static struct shash_alg sha512_avx_algs[] = { { .cra_name = "sha384", .cra_driver_name = "sha384-avx", .cra_priority = 160, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA384_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -261,7 +257,6 @@ static struct shash_alg sha512_avx2_algs[] = { { .cra_name = "sha512", .cra_driver_name = "sha512-avx2", .cra_priority = 170, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA512_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -276,7 +271,6 @@ static struct shash_alg sha512_avx2_algs[] = { { .cra_name = "sha384", .cra_driver_name = "sha384-avx2", .cra_priority = 170, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, .cra_blocksize = SHA384_BLOCK_SIZE, .cra_module = THIS_MODULE, } diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 9f695f517747..fa3f439f0a92 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -68,9 +68,9 @@ $(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \ -fno-omit-frame-pointer -foptimize-sibling-calls \ - -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO + -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO $(RETPOLINE_VDSO_CFLAGS) -$(vobjs): KBUILD_CFLAGS := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS)) $(CFL) +$(vobjs): KBUILD_CFLAGS := $(filter-out $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL) # # vDSO code runs in userspace and -pg doesn't help with profiling anyway. @@ -132,11 +132,13 @@ KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 := $(filter-out $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector) KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) KBUILD_CFLAGS_32 += -fno-omit-frame-pointer KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING +KBUILD_CFLAGS_32 += $(RETPOLINE_VDSO_CFLAGS) $(obj)/vdso32.so.dbg: KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) $(obj)/vdso32.so.dbg: FORCE \ diff --git a/arch/x86/entry/vdso/vdso-note.S b/arch/x86/entry/vdso/vdso-note.S index 79a071e4357e..79423170118f 100644 --- a/arch/x86/entry/vdso/vdso-note.S +++ b/arch/x86/entry/vdso/vdso-note.S @@ -3,6 +3,7 @@ * Here we can supply some information useful to userland. */ +#include <linux/build-salt.h> #include <linux/uts.h> #include <linux/version.h> #include <linux/elfnote.h> @@ -10,3 +11,5 @@ ELFNOTE_START(Linux, 0, "a") .long LINUX_VERSION_CODE ELFNOTE_END + +BUILD_SALT diff --git a/arch/x86/entry/vdso/vdso32/note.S b/arch/x86/entry/vdso/vdso32/note.S index 9fd51f206314..e78047d119f6 100644 --- a/arch/x86/entry/vdso/vdso32/note.S +++ b/arch/x86/entry/vdso/vdso32/note.S @@ -4,6 +4,7 @@ * Here we can supply some information useful to userland. */ +#include <linux/build-salt.h> #include <linux/version.h> #include <linux/elfnote.h> @@ -14,6 +15,8 @@ ELFNOTE_START(Linux, 0, "a") .long LINUX_VERSION_CODE ELFNOTE_END +BUILD_SALT + #ifdef CONFIG_XEN /* * Add a special note telling glibc's dynamic linker a fake hardware diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile index b173d404e3df..b21ee65c4101 100644 --- a/arch/x86/hyperv/Makefile +++ b/arch/x86/hyperv/Makefile @@ -1,2 +1,2 @@ -obj-y := hv_init.o mmu.o +obj-y := hv_init.o mmu.o nested.o obj-$(CONFIG_X86_64) += hv_apic.o diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 1ff420217298..20c876c7c5bf 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -333,7 +333,7 @@ void __init hyperv_init(void) * Register Hyper-V specific clocksource. */ #ifdef CONFIG_HYPERV_TSCPAGE - if (ms_hyperv.features & HV_X64_MSR_REFERENCE_TSC_AVAILABLE) { + if (ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE) { union hv_x64_msr_hypercall_contents tsc_msr; tsc_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL); @@ -362,7 +362,7 @@ register_msr_cs: */ hyperv_cs = &hyperv_cs_msr; - if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE) + if (ms_hyperv.features & HV_MSR_TIME_REF_COUNT_AVAILABLE) clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100); return; @@ -426,6 +426,33 @@ void hyperv_report_panic(struct pt_regs *regs, long err) } EXPORT_SYMBOL_GPL(hyperv_report_panic); +/** + * hyperv_report_panic_msg - report panic message to Hyper-V + * @pa: physical address of the panic page containing the message + * @size: size of the message in the page + */ +void hyperv_report_panic_msg(phys_addr_t pa, size_t size) +{ + /* + * P3 to contain the physical address of the panic page & P4 to + * contain the size of the panic data in that page. Rest of the + * registers are no-op when the NOTIFY_MSG flag is set. + */ + wrmsrl(HV_X64_MSR_CRASH_P0, 0); + wrmsrl(HV_X64_MSR_CRASH_P1, 0); + wrmsrl(HV_X64_MSR_CRASH_P2, 0); + wrmsrl(HV_X64_MSR_CRASH_P3, pa); + wrmsrl(HV_X64_MSR_CRASH_P4, size); + + /* + * Let Hyper-V know there is crash data available along with + * the panic message. + */ + wrmsrl(HV_X64_MSR_CRASH_CTL, + (HV_CRASH_CTL_CRASH_NOTIFY | HV_CRASH_CTL_CRASH_NOTIFY_MSG)); +} +EXPORT_SYMBOL_GPL(hyperv_report_panic_msg); + bool hv_is_hyperv_initialized(void) { union hv_x64_msr_hypercall_contents hypercall_msr; diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c index 1147e1fed7ff..ef5f29f913d7 100644 --- a/arch/x86/hyperv/mmu.c +++ b/arch/x86/hyperv/mmu.c @@ -9,6 +9,7 @@ #include <asm/mshyperv.h> #include <asm/msr.h> #include <asm/tlbflush.h> +#include <asm/tlb.h> #define CREATE_TRACE_POINTS #include <asm/trace/hyperv.h> @@ -231,4 +232,5 @@ void hyperv_setup_mmu_ops(void) pr_info("Using hypercall for remote TLB flush\n"); pv_mmu_ops.flush_tlb_others = hyperv_flush_tlb_others; + pv_mmu_ops.tlb_remove_table = tlb_remove_table; } diff --git a/arch/x86/hyperv/nested.c b/arch/x86/hyperv/nested.c new file mode 100644 index 000000000000..b8e60cc50461 --- /dev/null +++ b/arch/x86/hyperv/nested.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Hyper-V nested virtualization code. + * + * Copyright (C) 2018, Microsoft, Inc. + * + * Author : Lan Tianyu <Tianyu.Lan@microsoft.com> + */ + + +#include <linux/types.h> +#include <asm/hyperv-tlfs.h> +#include <asm/mshyperv.h> +#include <asm/tlbflush.h> + +#include <asm/trace/hyperv.h> + +int hyperv_flush_guest_mapping(u64 as) +{ + struct hv_guest_mapping_flush **flush_pcpu; + struct hv_guest_mapping_flush *flush; + u64 status; + unsigned long flags; + int ret = -ENOTSUPP; + + if (!hv_hypercall_pg) + goto fault; + + local_irq_save(flags); + + flush_pcpu = (struct hv_guest_mapping_flush **) + this_cpu_ptr(hyperv_pcpu_input_arg); + + flush = *flush_pcpu; + + if (unlikely(!flush)) { + local_irq_restore(flags); + goto fault; + } + + flush->address_space = as; + flush->flags = 0; + + status = hv_do_hypercall(HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE, + flush, NULL); + local_irq_restore(flags); + + if (!(status & HV_HYPERCALL_RESULT_MASK)) + ret = 0; + +fault: + trace_hyperv_nested_flush_guest_mapping(as, ret); + return ret; +} +EXPORT_SYMBOL_GPL(hyperv_flush_guest_mapping); diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index de690c2d2e33..a0ab9ab61c75 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -8,5 +8,6 @@ generated-y += xen-hypercalls.h generic-y += dma-contiguous.h generic-y += early_ioremap.h +generic-y += export.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 74a9e06b6cfd..130e81e10fc7 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -10,6 +10,7 @@ #include <asm/fixmap.h> #include <asm/mpspec.h> #include <asm/msr.h> +#include <asm/hardirq.h> #define ARCH_APICTIMER_STOPS_ON_C3 1 @@ -502,12 +503,19 @@ extern int default_check_phys_apicid_present(int phys_apicid); #endif /* CONFIG_X86_LOCAL_APIC */ +#ifdef CONFIG_SMP +bool apic_id_is_primary_thread(unsigned int id); +#else +static inline bool apic_id_is_primary_thread(unsigned int id) { return false; } +#endif + extern void irq_enter(void); extern void irq_exit(void); static inline void entering_irq(void) { irq_enter(); + kvm_set_cpu_l1tf_flush_l1d(); } static inline void entering_ack_irq(void) @@ -520,6 +528,7 @@ static inline void ipi_entering_ack_irq(void) { irq_enter(); ack_APIC_irq(); + kvm_set_cpu_l1tf_flush_l1d(); } static inline void exiting_irq(void) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index b5c60faf8429..89a048c2faec 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -219,7 +219,8 @@ #define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */ #define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ #define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */ -#define X86_FEATURE_IBRS_ENHANCED ( 7*32+29) /* Enhanced IBRS */ +#define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */ +#define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ @@ -342,6 +343,7 @@ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */ #define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */ #define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */ @@ -374,5 +376,6 @@ #define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */ #define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */ #define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */ +#define X86_BUG_L1TF X86_BUG(18) /* CPU is affected by L1 Terminal Fault */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h index 0ab2ab27ad1f..b825cb201251 100644 --- a/arch/x86/include/asm/dmi.h +++ b/arch/x86/include/asm/dmi.h @@ -4,8 +4,8 @@ #include <linux/compiler.h> #include <linux/init.h> +#include <linux/io.h> -#include <asm/io.h> #include <asm/setup.h> static __always_inline __init void *dmi_alloc(unsigned len) diff --git a/arch/x86/include/asm/export.h b/arch/x86/include/asm/export.h deleted file mode 100644 index 2a51d66689c5..000000000000 --- a/arch/x86/include/asm/export.h +++ /dev/null @@ -1,5 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifdef CONFIG_64BIT -#define KSYM_ALIGN 16 -#endif -#include <asm-generic/export.h> diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 740a428acf1e..d9069bb26c7f 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -3,10 +3,12 @@ #define _ASM_X86_HARDIRQ_H #include <linux/threads.h> -#include <linux/irq.h> typedef struct { - unsigned int __softirq_pending; + u16 __softirq_pending; +#if IS_ENABLED(CONFIG_KVM_INTEL) + u8 kvm_cpu_l1tf_flush_l1d; +#endif unsigned int __nmi_count; /* arch dependent */ #ifdef CONFIG_X86_LOCAL_APIC unsigned int apic_timer_irqs; /* arch dependent */ @@ -58,4 +60,24 @@ extern u64 arch_irq_stat_cpu(unsigned int cpu); extern u64 arch_irq_stat(void); #define arch_irq_stat arch_irq_stat + +#if IS_ENABLED(CONFIG_KVM_INTEL) +static inline void kvm_set_cpu_l1tf_flush_l1d(void) +{ + __this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1); +} + +static inline void kvm_clear_cpu_l1tf_flush_l1d(void) +{ + __this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 0); +} + +static inline bool kvm_get_cpu_l1tf_flush_l1d(void) +{ + return __this_cpu_read(irq_stat.kvm_cpu_l1tf_flush_l1d); +} +#else /* !IS_ENABLED(CONFIG_KVM_INTEL) */ +static inline void kvm_set_cpu_l1tf_flush_l1d(void) { } +#endif /* IS_ENABLED(CONFIG_KVM_INTEL) */ + #endif /* _ASM_X86_HARDIRQ_H */ diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index b8c89265baf0..e977b6b3a538 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -35,9 +35,9 @@ /* VP Runtime (HV_X64_MSR_VP_RUNTIME) available */ #define HV_X64_MSR_VP_RUNTIME_AVAILABLE (1 << 0) /* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/ -#define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1) +#define HV_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1) /* Partition reference TSC MSR is available */ -#define HV_X64_MSR_REFERENCE_TSC_AVAILABLE (1 << 9) +#define HV_MSR_REFERENCE_TSC_AVAILABLE (1 << 9) /* A partition's reference time stamp counter (TSC) page */ #define HV_X64_MSR_REFERENCE_TSC 0x40000021 @@ -60,7 +60,7 @@ * Synthetic Timer MSRs (HV_X64_MSR_STIMER0_CONFIG through * HV_X64_MSR_STIMER3_COUNT) available */ -#define HV_X64_MSR_SYNTIMER_AVAILABLE (1 << 3) +#define HV_MSR_SYNTIMER_AVAILABLE (1 << 3) /* * APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR) * are available @@ -86,7 +86,7 @@ #define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE (1 << 10) /* stimer Direct Mode is available */ -#define HV_X64_STIMER_DIRECT_MODE_AVAILABLE (1 << 19) +#define HV_STIMER_DIRECT_MODE_AVAILABLE (1 << 19) /* * Feature identification: EBX indicates which flags were specified at @@ -160,9 +160,9 @@ #define HV_X64_RELAXED_TIMING_RECOMMENDED (1 << 5) /* - * Virtual APIC support + * Recommend not using Auto End-Of-Interrupt feature */ -#define HV_X64_DEPRECATING_AEOI_RECOMMENDED (1 << 9) +#define HV_DEPRECATING_AEOI_RECOMMENDED (1 << 9) /* * Recommend using cluster IPI hypercalls. @@ -176,9 +176,10 @@ #define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED (1 << 14) /* - * Crash notification flag. + * Crash notification flags. */ -#define HV_CRASH_CTL_CRASH_NOTIFY (1ULL << 63) +#define HV_CRASH_CTL_CRASH_NOTIFY_MSG BIT_ULL(62) +#define HV_CRASH_CTL_CRASH_NOTIFY BIT_ULL(63) /* MSR used to identify the guest OS. */ #define HV_X64_MSR_GUEST_OS_ID 0x40000000 @@ -309,6 +310,7 @@ struct ms_hyperv_tsc_page { #define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106 /* Nested features (CPUID 0x4000000A) EAX */ +#define HV_X64_NESTED_GUEST_MAPPING_FLUSH BIT(18) #define HV_X64_NESTED_MSR_BITMAP BIT(19) struct hv_reenlightenment_control { @@ -350,6 +352,7 @@ struct hv_tsc_emulation_status { #define HVCALL_SEND_IPI_EX 0x0015 #define HVCALL_POST_MESSAGE 0x005c #define HVCALL_SIGNAL_EVENT 0x005d +#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af #define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001 #define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12 @@ -741,6 +744,12 @@ struct ipi_arg_ex { struct hv_vpset vp_set; }; +/* HvFlushGuestPhysicalAddressSpace hypercalls */ +struct hv_guest_mapping_flush { + u64 address_space; + u64 flags; +}; + /* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */ struct hv_tlb_flush { u64 address_space; diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index 5cdcdbd4d892..89789e8c80f6 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -3,6 +3,7 @@ #define _ASM_X86_I8259_H #include <linux/delay.h> +#include <asm/io.h> extern unsigned int cached_irq_mask; diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 023b4a9fc846..5f26962eff42 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -33,6 +33,11 @@ enum irq_remap_cap { IRQ_POSTING_CAP = 0, }; +enum { + IRQ_REMAP_XAPIC_MODE, + IRQ_REMAP_X2APIC_MODE, +}; + struct vcpu_data { u64 pi_desc_addr; /* Physical address of PI Descriptor */ u32 vector; /* Guest vector of the interrupt */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c13cd28d9d1b..00ddb0c9e612 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -17,6 +17,7 @@ #include <linux/tracepoint.h> #include <linux/cpumask.h> #include <linux/irq_work.h> +#include <linux/irq.h> #include <linux/kvm.h> #include <linux/kvm_para.h> @@ -54,6 +55,7 @@ #define KVM_REQ_TRIPLE_FAULT KVM_ARCH_REQ(2) #define KVM_REQ_MMU_SYNC KVM_ARCH_REQ(3) #define KVM_REQ_CLOCK_UPDATE KVM_ARCH_REQ(4) +#define KVM_REQ_LOAD_CR3 KVM_ARCH_REQ(5) #define KVM_REQ_EVENT KVM_ARCH_REQ(6) #define KVM_REQ_APF_HALT KVM_ARCH_REQ(7) #define KVM_REQ_STEAL_UPDATE KVM_ARCH_REQ(8) @@ -75,13 +77,13 @@ #define KVM_REQ_HV_EXIT KVM_ARCH_REQ(21) #define KVM_REQ_HV_STIMER KVM_ARCH_REQ(22) #define KVM_REQ_LOAD_EOI_EXITMAP KVM_ARCH_REQ(23) +#define KVM_REQ_GET_VMCS12_PAGES KVM_ARCH_REQ(24) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) -#define CR3_PCID_INVD BIT_64(63) #define CR4_RESERVED_BITS \ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ @@ -325,6 +327,16 @@ struct rsvd_bits_validate { u64 bad_mt_xwr; }; +struct kvm_mmu_root_info { + gpa_t cr3; + hpa_t hpa; +}; + +#define KVM_MMU_ROOT_INFO_INVALID \ + ((struct kvm_mmu_root_info) { .cr3 = INVALID_PAGE, .hpa = INVALID_PAGE }) + +#define KVM_MMU_NUM_PREV_ROOTS 3 + /* * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit, * and 2-level 32-bit). The kvm_mmu structure abstracts the details of the @@ -344,7 +356,7 @@ struct kvm_mmu { struct x86_exception *exception); int (*sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp); - void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); + void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa); void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *pte); hpa_t root_hpa; @@ -353,6 +365,7 @@ struct kvm_mmu { u8 shadow_root_level; u8 ept_ad; bool direct_map; + struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS]; /* * Bitmap; bit set = permission fault @@ -713,6 +726,9 @@ struct kvm_vcpu_arch { /* be preempted when it's in kernel-mode(cpl=0) */ bool preempted_in_kernel; + + /* Flush the L1 Data cache for L1TF mitigation on VMENTER */ + bool l1tf_flush_l1d; }; struct kvm_lpage_info { @@ -881,6 +897,7 @@ struct kvm_vcpu_stat { u64 signal_exits; u64 irq_window_exits; u64 nmi_window_exits; + u64 l1d_flush; u64 halt_exits; u64 halt_successful_poll; u64 halt_attempted_poll; @@ -973,6 +990,15 @@ struct kvm_x86_ops { void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); void (*tlb_flush)(struct kvm_vcpu *vcpu, bool invalidate_gpa); + int (*tlb_remote_flush)(struct kvm *kvm); + + /* + * Flush any TLB entries associated with the given GVA. + * Does not need to flush GPA->HPA mappings. + * Can potentially get non-canonical addresses through INVLPGs, which + * the implementation may choose to ignore if appropriate. + */ + void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr); void (*run)(struct kvm_vcpu *vcpu); int (*handle_exit)(struct kvm_vcpu *vcpu); @@ -1085,6 +1111,14 @@ struct kvm_x86_ops { void (*setup_mce)(struct kvm_vcpu *vcpu); + int (*get_nested_state)(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + unsigned user_data_size); + int (*set_nested_state)(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + struct kvm_nested_state *kvm_state); + void (*get_vmcs12_pages)(struct kvm_vcpu *vcpu); + int (*smi_allowed)(struct kvm_vcpu *vcpu); int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate); int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase); @@ -1117,6 +1151,16 @@ static inline void kvm_arch_free_vm(struct kvm *kvm) return kvm_x86_ops->vm_free(kvm); } +#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB +static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm) +{ + if (kvm_x86_ops->tlb_remote_flush && + !kvm_x86_ops->tlb_remote_flush(kvm)) + return 0; + else + return -ENOTSUPP; +} + int kvm_mmu_module_init(void); void kvm_mmu_module_exit(void); @@ -1268,6 +1312,10 @@ static inline int __kvm_irq_line_state(unsigned long *irq_state, return !!(*irq_state); } +#define KVM_MMU_ROOT_CURRENT BIT(0) +#define KVM_MMU_ROOT_PREVIOUS(i) BIT(1+i) +#define KVM_MMU_ROOTS_ALL (~0UL) + int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level); void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); @@ -1279,7 +1327,7 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); -void kvm_mmu_free_roots(struct kvm_vcpu *vcpu); +void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free); gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, struct x86_exception *exception); gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, @@ -1298,7 +1346,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code, void *insn, int insn_len); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu); +void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush); void kvm_enable_tdp(void); void kvm_disable_tdp(void); @@ -1413,6 +1462,11 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v); void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu); +int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, + unsigned long ipi_bitmap_high, int min, + unsigned long icr, int op_64_bit); + +u64 kvm_get_arch_capabilities(void); void kvm_define_shared_msr(unsigned index, u32 msr); int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 8c7b3e5a2d01..3a17107594c8 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -148,6 +148,7 @@ enum mce_notifier_prios { MCE_PRIO_LOWEST = 0, }; +struct notifier_block; extern void mce_register_decode_chain(struct notifier_block *nb); extern void mce_unregister_decode_chain(struct notifier_block *nb); diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 19886fef1dfc..f37704497d8f 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -76,8 +76,10 @@ static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) } } -#define hv_init_timer(timer, tick) wrmsrl(timer, tick) -#define hv_init_timer_config(config, val) wrmsrl(config, val) +#define hv_init_timer(timer, tick) \ + wrmsrl(HV_X64_MSR_STIMER0_COUNT + (2*timer), tick) +#define hv_init_timer_config(timer, val) \ + wrmsrl(HV_X64_MSR_STIMER0_CONFIG + (2*timer), val) #define hv_get_simp(val) rdmsrl(HV_X64_MSR_SIMP, val) #define hv_set_simp(val) wrmsrl(HV_X64_MSR_SIMP, val) @@ -90,8 +92,13 @@ static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) #define hv_get_vp_index(index) rdmsrl(HV_X64_MSR_VP_INDEX, index) -#define hv_get_synint_state(int_num, val) rdmsrl(int_num, val) -#define hv_set_synint_state(int_num, val) wrmsrl(int_num, val) +#define hv_get_synint_state(int_num, val) \ + rdmsrl(HV_X64_MSR_SINT0 + int_num, val) +#define hv_set_synint_state(int_num, val) \ + wrmsrl(HV_X64_MSR_SINT0 + int_num, val) + +#define hv_get_crash_ctl(val) \ + rdmsrl(HV_X64_MSR_CRASH_CTL, val) void hyperv_callback_vector(void); void hyperv_reenlightenment_vector(void); @@ -332,6 +339,7 @@ static inline int cpumask_to_vpset(struct hv_vpset *vpset, void __init hyperv_init(void); void hyperv_setup_mmu_ops(void); void hyperv_report_panic(struct pt_regs *regs, long err); +void hyperv_report_panic_msg(phys_addr_t pa, size_t size); bool hv_is_hyperv_initialized(void); void hyperv_cleanup(void); @@ -339,6 +347,7 @@ void hyperv_reenlightenment_intr(struct pt_regs *regs); void set_hv_tscchange_cb(void (*cb)(void)); void clear_hv_tscchange_cb(void); void hyperv_stop_tsc_emulation(void); +int hyperv_flush_guest_mapping(u64 as); #ifdef CONFIG_X86_64 void hv_apic_init(void); @@ -358,6 +367,7 @@ static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu) { return NULL; } +static inline int hyperv_flush_guest_mapping(u64 as) { return -1; } #endif /* CONFIG_HYPERV */ #ifdef CONFIG_HYPERV_TSCPAGE diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 68b2c3150de1..4731f0cf97c5 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -70,12 +70,19 @@ #define MSR_IA32_ARCH_CAPABILITIES 0x0000010a #define ARCH_CAP_RDCL_NO (1 << 0) /* Not susceptible to Meltdown */ #define ARCH_CAP_IBRS_ALL (1 << 1) /* Enhanced IBRS support */ +#define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH (1 << 3) /* Skip L1D flush on vmentry */ #define ARCH_CAP_SSB_NO (1 << 4) /* * Not susceptible to Speculative Store Bypass * attack, so no Speculative Store Bypass * control required. */ +#define MSR_IA32_FLUSH_CMD 0x0000010b +#define L1D_FLUSH (1 << 0) /* + * Writeback and invalidate the + * L1 data cache. + */ + #define MSR_IA32_BBL_CR_CTL 0x00000119 #define MSR_IA32_BBL_CR_CTL3 0x0000011e diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index aa30c3241ea7..0d5c739eebd7 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -29,8 +29,13 @@ #define N_EXCEPTION_STACKS 1 #ifdef CONFIG_X86_PAE -/* 44=32+12, the limit we can fit into an unsigned long pfn */ -#define __PHYSICAL_MASK_SHIFT 44 +/* + * This is beyond the 44 bit limit imposed by the 32bit long pfns, + * but we need the full mask to make sure inverted PROT_NONE + * entries have all the host bits set in a guest. + * The real limit is still 44 bits. + */ +#define __PHYSICAL_MASK_SHIFT 52 #define __VIRTUAL_MASK_SHIFT 32 #else /* !CONFIG_X86_PAE */ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index d49bbf4bb5c8..e375d4266b53 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -309,6 +309,11 @@ static inline void flush_tlb_others(const struct cpumask *cpumask, PVOP_VCALL2(pv_mmu_ops.flush_tlb_others, cpumask, info); } +static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) +{ + PVOP_VCALL2(pv_mmu_ops.tlb_remove_table, tlb, table); +} + static inline int paravirt_pgd_alloc(struct mm_struct *mm) { return PVOP_CALL1(int, pv_mmu_ops.pgd_alloc, mm); diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 180bc0bff0fb..4b75acc23b30 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -54,6 +54,7 @@ struct desc_struct; struct task_struct; struct cpumask; struct flush_tlb_info; +struct mmu_gather; /* * Wrapper type for pointers to code which uses the non-standard @@ -222,6 +223,8 @@ struct pv_mmu_ops { void (*flush_tlb_others)(const struct cpumask *cpus, const struct flush_tlb_info *info); + void (*tlb_remove_table)(struct mmu_gather *tlb, void *table); + /* Hooks for allocating and freeing a pagetable top-level */ int (*pgd_alloc)(struct mm_struct *mm); void (*pgd_free)(struct mm_struct *mm, pgd_t *pgd); diff --git a/arch/x86/include/asm/pci-direct.h b/arch/x86/include/asm/pci-direct.h index e1084f71a295..94597a3cf3d0 100644 --- a/arch/x86/include/asm/pci-direct.h +++ b/arch/x86/include/asm/pci-direct.h @@ -15,8 +15,4 @@ extern void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val); extern void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val); extern int early_pci_allowed(void); - -extern unsigned int pci_early_dump_regs; -extern void early_dump_pci_device(u8 bus, u8 slot, u8 func); -extern void early_dump_pci_devices(void); #endif /* _ASM_X86_PCI_DIRECT_H */ diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index c399ea5eea41..24c6cf5f16b7 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -104,4 +104,21 @@ static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshi #define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low }) #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) +/* No inverted PFNs on 2 level page tables */ + +static inline u64 protnone_mask(u64 val) +{ + return 0; +} + +static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask) +{ + return val; +} + +static inline bool __pte_needs_invert(u64 val) +{ + return false; +} + #endif /* _ASM_X86_PGTABLE_2LEVEL_H */ diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index f2ca3139ca22..a564084c6141 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -248,12 +248,43 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp) #endif /* Encode and de-code a swap entry */ +#define SWP_TYPE_BITS 5 + +#define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 1) + +/* We always extract/encode the offset by shifting it all the way up, and then down again */ +#define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT + SWP_TYPE_BITS) + #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5) #define __swp_type(x) (((x).val) & 0x1f) #define __swp_offset(x) ((x).val >> 5) #define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5}) -#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high }) -#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } }) + +/* + * Normally, __swp_entry() converts from arch-independent swp_entry_t to + * arch-dependent swp_entry_t, and __swp_entry_to_pte() just stores the result + * to pte. But here we have 32bit swp_entry_t and 64bit pte, and need to use the + * whole 64 bits. Thus, we shift the "real" arch-dependent conversion to + * __swp_entry_to_pte() through the following helper macro based on 64bit + * __swp_entry(). + */ +#define __swp_pteval_entry(type, offset) ((pteval_t) { \ + (~(pteval_t)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \ + | ((pteval_t)(type) << (64 - SWP_TYPE_BITS)) }) + +#define __swp_entry_to_pte(x) ((pte_t){ .pte = \ + __swp_pteval_entry(__swp_type(x), __swp_offset(x)) }) +/* + * Analogically, __pte_to_swp_entry() doesn't just extract the arch-dependent + * swp_entry_t, but also has to convert it from 64bit to the 32bit + * intermediate representation, using the following macros based on 64bit + * __swp_type() and __swp_offset(). + */ +#define __pteval_swp_type(x) ((unsigned long)((x).pte >> (64 - SWP_TYPE_BITS))) +#define __pteval_swp_offset(x) ((unsigned long)(~((x).pte) << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT)) + +#define __pte_to_swp_entry(pte) (__swp_entry(__pteval_swp_type(pte), \ + __pteval_swp_offset(pte))) #define gup_get_pte gup_get_pte /* @@ -302,4 +333,6 @@ static inline pte_t gup_get_pte(pte_t *ptep) return pte; } +#include <asm/pgtable-invert.h> + #endif /* _ASM_X86_PGTABLE_3LEVEL_H */ diff --git a/arch/x86/include/asm/pgtable-invert.h b/arch/x86/include/asm/pgtable-invert.h new file mode 100644 index 000000000000..a0c1525f1b6f --- /dev/null +++ b/arch/x86/include/asm/pgtable-invert.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_PGTABLE_INVERT_H +#define _ASM_PGTABLE_INVERT_H 1 + +#ifndef __ASSEMBLY__ + +/* + * A clear pte value is special, and doesn't get inverted. + * + * Note that even users that only pass a pgprot_t (rather + * than a full pte) won't trigger the special zero case, + * because even PAGE_NONE has _PAGE_PROTNONE | _PAGE_ACCESSED + * set. So the all zero case really is limited to just the + * cleared page table entry case. + */ +static inline bool __pte_needs_invert(u64 val) +{ + return val && !(val & _PAGE_PRESENT); +} + +/* Get a mask to xor with the page table entry to get the correct pfn. */ +static inline u64 protnone_mask(u64 val) +{ + return __pte_needs_invert(val) ? ~0ull : 0; +} + +static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask) +{ + /* + * When a PTE transitions from NONE to !NONE or vice-versa + * invert the PFN part to stop speculation. + * pte_pfn undoes this when needed. + */ + if (__pte_needs_invert(oldval) != __pte_needs_invert(val)) + val = (val & ~mask) | (~val & mask); + return val; +} + +#endif /* __ASSEMBLY__ */ + +#endif diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index a1cb3339da8d..e4ffa565a69f 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -188,19 +188,29 @@ static inline int pte_special(pte_t pte) return pte_flags(pte) & _PAGE_SPECIAL; } +/* Entries that were set to PROT_NONE are inverted */ + +static inline u64 protnone_mask(u64 val); + static inline unsigned long pte_pfn(pte_t pte) { - return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT; + phys_addr_t pfn = pte_val(pte); + pfn ^= protnone_mask(pfn); + return (pfn & PTE_PFN_MASK) >> PAGE_SHIFT; } static inline unsigned long pmd_pfn(pmd_t pmd) { - return (pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT; + phys_addr_t pfn = pmd_val(pmd); + pfn ^= protnone_mask(pfn); + return (pfn & pmd_pfn_mask(pmd)) >> PAGE_SHIFT; } static inline unsigned long pud_pfn(pud_t pud) { - return (pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT; + phys_addr_t pfn = pud_val(pud); + pfn ^= protnone_mask(pfn); + return (pfn & pud_pfn_mask(pud)) >> PAGE_SHIFT; } static inline unsigned long p4d_pfn(p4d_t p4d) @@ -403,11 +413,6 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_RW); } -static inline pmd_t pmd_mknotpresent(pmd_t pmd) -{ - return pmd_clear_flags(pmd, _PAGE_PRESENT | _PAGE_PROTNONE); -} - static inline pud_t pud_set_flags(pud_t pud, pudval_t set) { pudval_t v = native_pud_val(pud); @@ -462,11 +467,6 @@ static inline pud_t pud_mkwrite(pud_t pud) return pud_set_flags(pud, _PAGE_RW); } -static inline pud_t pud_mknotpresent(pud_t pud) -{ - return pud_clear_flags(pud, _PAGE_PRESENT | _PAGE_PROTNONE); -} - #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline int pte_soft_dirty(pte_t pte) { @@ -548,25 +548,45 @@ static inline pgprotval_t check_pgprot(pgprot_t pgprot) static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) { - return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) | - check_pgprot(pgprot)); + phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT; + pfn ^= protnone_mask(pgprot_val(pgprot)); + pfn &= PTE_PFN_MASK; + return __pte(pfn | check_pgprot(pgprot)); } static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) { - return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) | - check_pgprot(pgprot)); + phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT; + pfn ^= protnone_mask(pgprot_val(pgprot)); + pfn &= PHYSICAL_PMD_PAGE_MASK; + return __pmd(pfn | check_pgprot(pgprot)); } static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot) { - return __pud(((phys_addr_t)page_nr << PAGE_SHIFT) | - check_pgprot(pgprot)); + phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT; + pfn ^= protnone_mask(pgprot_val(pgprot)); + pfn &= PHYSICAL_PUD_PAGE_MASK; + return __pud(pfn | check_pgprot(pgprot)); } +static inline pmd_t pmd_mknotpresent(pmd_t pmd) +{ + return pfn_pmd(pmd_pfn(pmd), + __pgprot(pmd_flags(pmd) & ~(_PAGE_PRESENT|_PAGE_PROTNONE))); +} + +static inline pud_t pud_mknotpresent(pud_t pud) +{ + return pfn_pud(pud_pfn(pud), + __pgprot(pud_flags(pud) & ~(_PAGE_PRESENT|_PAGE_PROTNONE))); +} + +static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask); + static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { - pteval_t val = pte_val(pte); + pteval_t val = pte_val(pte), oldval = val; /* * Chop off the NX bit (if present), and add the NX portion of @@ -574,17 +594,17 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) */ val &= _PAGE_CHG_MASK; val |= check_pgprot(newprot) & ~_PAGE_CHG_MASK; - + val = flip_protnone_guard(oldval, val, PTE_PFN_MASK); return __pte(val); } static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) { - pmdval_t val = pmd_val(pmd); + pmdval_t val = pmd_val(pmd), oldval = val; val &= _HPAGE_CHG_MASK; val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK; - + val = flip_protnone_guard(oldval, val, PHYSICAL_PMD_PAGE_MASK); return __pmd(val); } @@ -1410,6 +1430,14 @@ static inline bool pud_access_permitted(pud_t pud, bool write) return __pte_access_permitted(pud_val(pud), write); } +#define __HAVE_ARCH_PFN_MODIFY_ALLOWED 1 +extern bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot); + +static inline bool arch_has_pfn_modify_check(void) +{ + return boot_cpu_has_bug(X86_BUG_L1TF); +} + #include <asm-generic/pgtable.h> #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index acb6970e7bcf..f773d5e6c8cc 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -188,7 +188,7 @@ extern void sync_global_pgds(unsigned long start, unsigned long end); * * | ... | 11| 10| 9|8|7|6|5| 4| 3|2| 1|0| <- bit number * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names - * | OFFSET (14->63) | TYPE (9-13) |0|0|X|X| X| X|X|SD|0| <- swp entry + * | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| X|X|SD|0| <- swp entry * * G (8) is aliased and used as a PROT_NONE indicator for * !present ptes. We need to start storing swap entries above @@ -201,20 +201,34 @@ extern void sync_global_pgds(unsigned long start, unsigned long end); * * Bit 7 in swp entry should be 0 because pmd_present checks not only P, * but also L and G. + * + * The offset is inverted by a binary not operation to make the high + * physical bits set. */ -#define SWP_TYPE_FIRST_BIT (_PAGE_BIT_PROTNONE + 1) -#define SWP_TYPE_BITS 5 -/* Place the offset above the type: */ -#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS) +#define SWP_TYPE_BITS 5 + +#define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 1) + +/* We always extract/encode the offset by shifting it all the way up, and then down again */ +#define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT+SWP_TYPE_BITS) #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) -#define __swp_type(x) (((x).val >> (SWP_TYPE_FIRST_BIT)) \ - & ((1U << SWP_TYPE_BITS) - 1)) -#define __swp_offset(x) ((x).val >> SWP_OFFSET_FIRST_BIT) -#define __swp_entry(type, offset) ((swp_entry_t) { \ - ((type) << (SWP_TYPE_FIRST_BIT)) \ - | ((offset) << SWP_OFFSET_FIRST_BIT) }) +/* Extract the high bits for type */ +#define __swp_type(x) ((x).val >> (64 - SWP_TYPE_BITS)) + +/* Shift up (to get rid of type), then down to get value */ +#define __swp_offset(x) (~(x).val << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT) + +/* + * Shift the offset up "too far" by TYPE bits, then down again + * The offset is inverted by a binary not operation to make the high + * physical bits set. + */ +#define __swp_entry(type, offset) ((swp_entry_t) { \ + (~(unsigned long)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \ + | ((unsigned long)(type) << (64-SWP_TYPE_BITS)) }) + #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) }) #define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val((pmd)) }) #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) @@ -258,5 +272,7 @@ static inline bool gup_fast_permitted(unsigned long start, int nr_pages, return true; } +#include <asm/pgtable-invert.h> + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_64_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 59663c08c949..c24297268ebc 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -181,6 +181,11 @@ extern const struct seq_operations cpuinfo_op; extern void cpu_detect(struct cpuinfo_x86 *c); +static inline unsigned long long l1tf_pfn_limit(void) +{ + return BIT_ULL(boot_cpu_data.x86_phys_bits - 1 - PAGE_SHIFT); +} + extern void early_cpu_init(void); extern void identify_boot_cpu(void); extern void identify_secondary_cpu(struct cpuinfo_x86 *); @@ -978,4 +983,16 @@ bool xen_set_default_idle(void); void stop_this_cpu(void *dummy); void df_debug(struct pt_regs *regs, long error_code); void microcode_check(void); + +enum l1tf_mitigations { + L1TF_MITIGATION_OFF, + L1TF_MITIGATION_FLUSH_NOWARN, + L1TF_MITIGATION_FLUSH, + L1TF_MITIGATION_FLUSH_NOSMT, + L1TF_MITIGATION_FULL, + L1TF_MITIGATION_FULL_FORCE +}; + +extern enum l1tf_mitigations l1tf_mitigation; + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index 34cffcef7375..07a25753e85c 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -89,4 +89,46 @@ extern int kernel_set_to_readonly; void set_kernel_text_rw(void); void set_kernel_text_ro(void); +#ifdef CONFIG_X86_64 +static inline int set_mce_nospec(unsigned long pfn) +{ + unsigned long decoy_addr; + int rc; + + /* + * Mark the linear address as UC to make sure we don't log more + * errors because of speculative access to the page. + * We would like to just call: + * set_memory_uc((unsigned long)pfn_to_kaddr(pfn), 1); + * but doing that would radically increase the odds of a + * speculative access to the poison page because we'd have + * the virtual address of the kernel 1:1 mapping sitting + * around in registers. + * Instead we get tricky. We create a non-canonical address + * that looks just like the one we want, but has bit 63 flipped. + * This relies on set_memory_uc() properly sanitizing any __pa() + * results with __PHYSICAL_MASK or PTE_PFN_MASK. + */ + decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); + + rc = set_memory_uc(decoy_addr, 1); + if (rc) + pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); + return rc; +} +#define set_mce_nospec set_mce_nospec + +/* Restore full speculative operation to the pfn. */ +static inline int clear_mce_nospec(unsigned long pfn) +{ + return set_memory_wb((unsigned long) pfn_to_kaddr(pfn), 1); +} +#define clear_mce_nospec clear_mce_nospec +#else +/* + * Few people would run a 32-bit kernel on a machine that supports + * recoverable errors because they have too much memory to boot 32-bit. + */ +#endif + #endif /* _ASM_X86_SET_MEMORY_H */ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 511bf5fae8b8..29c9da6c62fc 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -148,6 +148,22 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) #define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr) #endif +static inline bool tlb_defer_switch_to_init_mm(void) +{ + /* + * If we have PCID, then switching to init_mm is reasonably + * fast. If we don't have PCID, then switching to init_mm is + * quite slow, so we try to defer it in the hopes that we can + * avoid it entirely. The latter approach runs the risk of + * receiving otherwise unnecessary IPIs. + * + * This choice is just a heuristic. The tlb code can handle this + * function returning true or false regardless of whether we have + * PCID. + */ + return !static_cpu_has(X86_FEATURE_PCID); +} + struct tlb_context { u64 ctx_id; u64 tlb_gen; @@ -536,11 +552,9 @@ extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch); #ifndef CONFIG_PARAVIRT #define flush_tlb_others(mask, info) \ native_flush_tlb_others(mask, info) -#endif -extern void tlb_flush_remove_tables(struct mm_struct *mm); -extern void tlb_flush_remove_tables_local(void *arg); - -#define HAVE_TLB_FLUSH_REMOVE_TABLES +#define paravirt_tlb_remove_table(tlb, page) \ + tlb_remove_page(tlb, (void *)(page)) +#endif #endif /* _ASM_X86_TLBFLUSH_H */ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index c1d2a9892352..453cf38a1c33 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -123,13 +123,17 @@ static inline int topology_max_smt_threads(void) } int topology_update_package_map(unsigned int apicid, unsigned int cpu); -extern int topology_phys_to_logical_pkg(unsigned int pkg); +int topology_phys_to_logical_pkg(unsigned int pkg); +bool topology_is_primary_thread(unsigned int cpu); +bool topology_smt_supported(void); #else #define topology_max_packages() (1) static inline int topology_update_package_map(unsigned int apicid, unsigned int cpu) { return 0; } static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; } static inline int topology_max_smt_threads(void) { return 1; } +static inline bool topology_is_primary_thread(unsigned int cpu) { return true; } +static inline bool topology_smt_supported(void) { return false; } #endif static inline void arch_fix_phys_package_id(int num, u32 slot) diff --git a/arch/x86/include/asm/trace/hyperv.h b/arch/x86/include/asm/trace/hyperv.h index 9c0d4b588e3f..2e6245a023ef 100644 --- a/arch/x86/include/asm/trace/hyperv.h +++ b/arch/x86/include/asm/trace/hyperv.h @@ -28,6 +28,20 @@ TRACE_EVENT(hyperv_mmu_flush_tlb_others, __entry->addr, __entry->end) ); +TRACE_EVENT(hyperv_nested_flush_guest_mapping, + TP_PROTO(u64 as, int ret), + TP_ARGS(as, ret), + + TP_STRUCT__entry( + __field(u64, as) + __field(int, ret) + ), + TP_fast_assign(__entry->as = as; + __entry->ret = ret; + ), + TP_printk("address space %llx ret %d", __entry->as, __entry->ret) + ); + TRACE_EVENT(hyperv_send_ipi_mask, TP_PROTO(const struct cpumask *cpus, int vector), diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 6aa8499e1f62..9527ba5d62da 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -74,6 +74,7 @@ #define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 #define SECONDARY_EXEC_ENABLE_VMFUNC 0x00002000 #define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 +#define SECONDARY_EXEC_ENCLS_EXITING 0x00008000 #define SECONDARY_EXEC_RDSEED_EXITING 0x00010000 #define SECONDARY_EXEC_ENABLE_PML 0x00020000 #define SECONDARY_EXEC_XSAVES 0x00100000 @@ -213,6 +214,8 @@ enum vmcs_field { VMWRITE_BITMAP_HIGH = 0x00002029, XSS_EXIT_BITMAP = 0x0000202C, XSS_EXIT_BITMAP_HIGH = 0x0000202D, + ENCLS_EXITING_BITMAP = 0x0000202E, + ENCLS_EXITING_BITMAP_HIGH = 0x0000202F, TSC_MULTIPLIER = 0x00002032, TSC_MULTIPLIER_HIGH = 0x00002033, GUEST_PHYSICAL_ADDRESS = 0x00002400, @@ -576,4 +579,15 @@ enum vm_instruction_error_number { VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID = 28, }; +enum vmx_l1d_flush_state { + VMENTER_L1D_FLUSH_AUTO, + VMENTER_L1D_FLUSH_NEVER, + VMENTER_L1D_FLUSH_COND, + VMENTER_L1D_FLUSH_ALWAYS, + VMENTER_L1D_FLUSH_EPT_DISABLED, + VMENTER_L1D_FLUSH_NOT_REQUIRED, +}; + +extern enum vmx_l1d_flush_state l1tf_vmx_mitigation; + #endif diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index bfd882617613..ef05bea7010d 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -197,36 +197,38 @@ extern struct { char _entry[32]; } hypercall_page[]; (type)__res; \ }) -#define _hypercall5(type, name, a1, a2, a3, a4, a5) \ -({ \ - __HYPERCALL_DECLS; \ - __HYPERCALL_5ARG(a1, a2, a3, a4, a5); \ - asm volatile (__HYPERCALL \ - : __HYPERCALL_5PARAM \ - : __HYPERCALL_ENTRY(name) \ - : __HYPERCALL_CLOBBER5); \ - (type)__res; \ -}) - static inline long -privcmd_call(unsigned call, - unsigned long a1, unsigned long a2, - unsigned long a3, unsigned long a4, - unsigned long a5) +xen_single_call(unsigned int call, + unsigned long a1, unsigned long a2, + unsigned long a3, unsigned long a4, + unsigned long a5) { __HYPERCALL_DECLS; __HYPERCALL_5ARG(a1, a2, a3, a4, a5); - stac(); asm volatile(CALL_NOSPEC : __HYPERCALL_5PARAM : [thunk_target] "a" (&hypercall_page[call]) : __HYPERCALL_CLOBBER5); - clac(); return (long)__res; } +static inline long +privcmd_call(unsigned int call, + unsigned long a1, unsigned long a2, + unsigned long a3, unsigned long a4, + unsigned long a5) +{ + long res; + + stac(); + res = xen_single_call(call, a1, a2, a3, a4, a5); + clac(); + + return res; +} + static inline int HYPERVISOR_set_trap_table(struct trap_info *table) { @@ -254,47 +256,12 @@ HYPERVISOR_set_gdt(unsigned long *frame_list, int entries) } static inline int -HYPERVISOR_stack_switch(unsigned long ss, unsigned long esp) -{ - return _hypercall2(int, stack_switch, ss, esp); -} - -#ifdef CONFIG_X86_32 -static inline int -HYPERVISOR_set_callbacks(unsigned long event_selector, - unsigned long event_address, - unsigned long failsafe_selector, - unsigned long failsafe_address) -{ - return _hypercall4(int, set_callbacks, - event_selector, event_address, - failsafe_selector, failsafe_address); -} -#else /* CONFIG_X86_64 */ -static inline int -HYPERVISOR_set_callbacks(unsigned long event_address, - unsigned long failsafe_address, - unsigned long syscall_address) -{ - return _hypercall3(int, set_callbacks, - event_address, failsafe_address, - syscall_address); -} -#endif /* CONFIG_X86_{32,64} */ - -static inline int HYPERVISOR_callback_op(int cmd, void *arg) { return _hypercall2(int, callback_op, cmd, arg); } static inline int -HYPERVISOR_fpu_taskswitch(int set) -{ - return _hypercall1(int, fpu_taskswitch, set); -} - -static inline int HYPERVISOR_sched_op(int cmd, void *arg) { return _hypercall2(int, sched_op, cmd, arg); @@ -406,19 +373,6 @@ HYPERVISOR_grant_table_op(unsigned int cmd, void *uop, unsigned int count) } static inline int -HYPERVISOR_update_va_mapping_otherdomain(unsigned long va, pte_t new_val, - unsigned long flags, domid_t domid) -{ - if (sizeof(new_val) == sizeof(long)) - return _hypercall4(int, update_va_mapping_otherdomain, va, - new_val.pte, flags, domid); - else - return _hypercall5(int, update_va_mapping_otherdomain, va, - new_val.pte, new_val.pte >> 32, - flags, domid); -} - -static inline int HYPERVISOR_vm_assist(unsigned int cmd, unsigned int type) { return _hypercall2(int, vm_assist, cmd, type); @@ -452,12 +406,6 @@ HYPERVISOR_suspend(unsigned long start_info_mfn) return _hypercall3(int, sched_op, SCHEDOP_shutdown, &r, start_info_mfn); } -static inline int -HYPERVISOR_nmi_op(unsigned long op, unsigned long arg) -{ - return _hypercall2(int, nmi_op, op, arg); -} - static inline unsigned long __must_check HYPERVISOR_hvm_op(int op, void *arg) { @@ -516,39 +464,6 @@ MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va, } static inline void -MULTI_grant_table_op(struct multicall_entry *mcl, unsigned int cmd, - void *uop, unsigned int count) -{ - mcl->op = __HYPERVISOR_grant_table_op; - mcl->args[0] = cmd; - mcl->args[1] = (unsigned long)uop; - mcl->args[2] = count; - - trace_xen_mc_entry(mcl, 3); -} - -static inline void -MULTI_update_va_mapping_otherdomain(struct multicall_entry *mcl, unsigned long va, - pte_t new_val, unsigned long flags, - domid_t domid) -{ - mcl->op = __HYPERVISOR_update_va_mapping_otherdomain; - mcl->args[0] = va; - if (sizeof(new_val) == sizeof(long)) { - mcl->args[1] = new_val.pte; - mcl->args[2] = flags; - mcl->args[3] = domid; - } else { - mcl->args[1] = new_val.pte; - mcl->args[2] = new_val.pte >> 32; - mcl->args[3] = flags; - mcl->args[4] = domid; - } - - trace_xen_mc_entry(mcl, sizeof(new_val) == sizeof(long) ? 4 : 5); -} - -static inline void MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr, struct desc_struct desc) { @@ -569,16 +484,6 @@ MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr, } static inline void -MULTI_memory_op(struct multicall_entry *mcl, unsigned int cmd, void *arg) -{ - mcl->op = __HYPERVISOR_memory_op; - mcl->args[0] = cmd; - mcl->args[1] = (unsigned long)arg; - - trace_xen_mc_entry(mcl, 2); -} - -static inline void MULTI_mmu_update(struct multicall_entry *mcl, struct mmu_update *req, int count, int *success_count, domid_t domid) { @@ -605,16 +510,6 @@ MULTI_mmuext_op(struct multicall_entry *mcl, struct mmuext_op *op, int count, } static inline void -MULTI_set_gdt(struct multicall_entry *mcl, unsigned long *frames, int entries) -{ - mcl->op = __HYPERVISOR_set_gdt; - mcl->args[0] = (unsigned long)frames; - mcl->args[1] = entries; - - trace_xen_mc_entry(mcl, 2); -} - -static inline void MULTI_stack_switch(struct multicall_entry *mcl, unsigned long ss, unsigned long esp) { diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index c535c2fdea13..86299efa804a 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -378,4 +378,41 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) #define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1) +#define KVM_STATE_NESTED_GUEST_MODE 0x00000001 +#define KVM_STATE_NESTED_RUN_PENDING 0x00000002 + +#define KVM_STATE_NESTED_SMM_GUEST_MODE 0x00000001 +#define KVM_STATE_NESTED_SMM_VMXON 0x00000002 + +struct kvm_vmx_nested_state { + __u64 vmxon_pa; + __u64 vmcs_pa; + + struct { + __u16 flags; + } smm; +}; + +/* for KVM_CAP_NESTED_STATE */ +struct kvm_nested_state { + /* KVM_STATE_* flags */ + __u16 flags; + + /* 0 for VMX, 1 for SVM. */ + __u16 format; + + /* 128 for SVM, 128 + VMCS size for VMX. */ + __u32 size; + + union { + /* VMXON, VMCS */ + struct kvm_vmx_nested_state vmx; + + /* Pad the header to 128 bytes. */ + __u8 pad[120]; + }; + + __u8 data[0]; +}; + #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 0ede697c3961..19980ec1a316 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -28,6 +28,7 @@ #define KVM_FEATURE_PV_UNHALT 7 #define KVM_FEATURE_PV_TLB_FLUSH 9 #define KVM_FEATURE_ASYNC_PF_VMEXIT 10 +#define KVM_FEATURE_PV_SEND_IPI 11 #define KVM_HINTS_REALTIME 0 diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index dde437f5d14f..158ad1483c43 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -108,7 +108,7 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx) cx->type); } snprintf(cx->desc, - ACPI_CX_DESC_LEN, "ACPI FFH INTEL MWAIT 0x%x", + ACPI_CX_DESC_LEN, "ACPI FFH MWAIT 0x%x", cx->address); out: return retval; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 07fa222f0c52..84132eddb5a8 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -56,6 +56,7 @@ #include <asm/hypervisor.h> #include <asm/cpu_device_id.h> #include <asm/intel-family.h> +#include <asm/irq_regs.h> unsigned int num_processors; @@ -2192,6 +2193,23 @@ static int cpuid_to_apicid[] = { [0 ... NR_CPUS - 1] = -1, }; +#ifdef CONFIG_SMP +/** + * apic_id_is_primary_thread - Check whether APIC ID belongs to a primary thread + * @id: APIC ID to check + */ +bool apic_id_is_primary_thread(unsigned int apicid) +{ + u32 mask; + + if (smp_num_siblings == 1) + return true; + /* Isolate the SMT bit(s) in the APICID and check for 0 */ + mask = (1U << (fls(smp_num_siblings) - 1)) - 1; + return !(apicid & mask); +} +#endif + /* * Should use this API to allocate logical CPU IDs to keep nr_logical_cpuids * and cpuid_to_apicid[] synchronized. diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3982f79d2377..ff0d14cd9e82 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -33,6 +33,7 @@ #include <linux/mm.h> #include <linux/interrupt.h> +#include <linux/irq.h> #include <linux/init.h> #include <linux/delay.h> #include <linux/sched.h> diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index ce503c99f5c4..72a94401f9e0 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -12,6 +12,7 @@ */ #include <linux/mm.h> #include <linux/interrupt.h> +#include <linux/irq.h> #include <linux/pci.h> #include <linux/dmar.h> #include <linux/hpet.h> diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 0954315842c0..9f148e3d45b4 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -11,6 +11,7 @@ * published by the Free Software Foundation. */ #include <linux/interrupt.h> +#include <linux/irq.h> #include <linux/seq_file.h> #include <linux/init.h> #include <linux/compiler.h> diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index b732438c1a1e..22ab408177b2 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -313,6 +313,13 @@ static void legacy_fixup_core_id(struct cpuinfo_x86 *c) c->cpu_core_id %= cus_per_node; } + +static void amd_get_topology_early(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_TOPOEXT)) + smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1; +} + /* * Fixup core topology information for * (1) AMD multi-node processors @@ -332,7 +339,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c) cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); node_id = ecx & 0xff; - smp_num_siblings = ((ebx >> 8) & 0xff) + 1; if (c->x86 == 0x15) c->cu_id = ebx & 0xff; @@ -611,6 +617,7 @@ clear_sev: static void early_init_amd(struct cpuinfo_x86 *c) { + u64 value; u32 dummy; early_init_amd_mc(c); @@ -689,6 +696,22 @@ static void early_init_amd(struct cpuinfo_x86 *c) set_cpu_bug(c, X86_BUG_AMD_E400); early_detect_mem_encrypt(c); + + /* Re-enable TopologyExtensions if switched off by BIOS */ + if (c->x86 == 0x15 && + (c->x86_model >= 0x10 && c->x86_model <= 0x6f) && + !cpu_has(c, X86_FEATURE_TOPOEXT)) { + + if (msr_set_bit(0xc0011005, 54) > 0) { + rdmsrl(0xc0011005, value); + if (value & BIT_64(54)) { + set_cpu_cap(c, X86_FEATURE_TOPOEXT); + pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); + } + } + } + + amd_get_topology_early(c); } static void init_amd_k8(struct cpuinfo_x86 *c) @@ -780,19 +803,6 @@ static void init_amd_bd(struct cpuinfo_x86 *c) { u64 value; - /* re-enable TopologyExtensions if switched off by BIOS */ - if ((c->x86_model >= 0x10) && (c->x86_model <= 0x6f) && - !cpu_has(c, X86_FEATURE_TOPOEXT)) { - - if (msr_set_bit(0xc0011005, 54) > 0) { - rdmsrl(0xc0011005, value); - if (value & BIT_64(54)) { - set_cpu_cap(c, X86_FEATURE_TOPOEXT); - pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); - } - } - } - /* * The way access filter has a performance penalty on some workloads. * Disable it on the affected CPUs. @@ -856,16 +866,9 @@ static void init_amd(struct cpuinfo_x86 *c) cpu_detect_cache_sizes(c); - /* Multi core CPU? */ - if (c->extended_cpuid_level >= 0x80000008) { - amd_detect_cmp(c); - amd_get_topology(c); - srat_detect_node(c); - } - -#ifdef CONFIG_X86_32 - detect_ht(c); -#endif + amd_detect_cmp(c); + amd_get_topology(c); + srat_detect_node(c); init_amd_cacheinfo(c); diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 405a9a61bb89..4c2313d0b9ca 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -22,15 +22,18 @@ #include <asm/processor-flags.h> #include <asm/fpu/internal.h> #include <asm/msr.h> +#include <asm/vmx.h> #include <asm/paravirt.h> #include <asm/alternative.h> #include <asm/pgtable.h> #include <asm/set_memory.h> #include <asm/intel-family.h> +#include <asm/e820/api.h> #include <asm/hypervisor.h> static void __init spectre_v2_select_mitigation(void); static void __init ssb_select_mitigation(void); +static void __init l1tf_select_mitigation(void); /* * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any @@ -56,6 +59,12 @@ void __init check_bugs(void) { identify_boot_cpu(); + /* + * identify_boot_cpu() initialized SMT support information, let the + * core code know. + */ + cpu_smt_check_topology_early(); + if (!IS_ENABLED(CONFIG_SMP)) { pr_info("CPU: "); print_cpu_info(&boot_cpu_data); @@ -82,6 +91,8 @@ void __init check_bugs(void) */ ssb_select_mitigation(); + l1tf_select_mitigation(); + #ifdef CONFIG_X86_32 /* * Check whether we are able to run this kernel safely on SMP. @@ -646,8 +657,124 @@ void x86_spec_ctrl_setup_ap(void) x86_amd_ssb_disable(); } +#undef pr_fmt +#define pr_fmt(fmt) "L1TF: " fmt + +/* Default mitigation for L1TF-affected CPUs */ +enum l1tf_mitigations l1tf_mitigation __ro_after_init = L1TF_MITIGATION_FLUSH; +#if IS_ENABLED(CONFIG_KVM_INTEL) +EXPORT_SYMBOL_GPL(l1tf_mitigation); +#endif +enum vmx_l1d_flush_state l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; +EXPORT_SYMBOL_GPL(l1tf_vmx_mitigation); + +static void __init l1tf_select_mitigation(void) +{ + u64 half_pa; + + if (!boot_cpu_has_bug(X86_BUG_L1TF)) + return; + + switch (l1tf_mitigation) { + case L1TF_MITIGATION_OFF: + case L1TF_MITIGATION_FLUSH_NOWARN: + case L1TF_MITIGATION_FLUSH: + break; + case L1TF_MITIGATION_FLUSH_NOSMT: + case L1TF_MITIGATION_FULL: + cpu_smt_disable(false); + break; + case L1TF_MITIGATION_FULL_FORCE: + cpu_smt_disable(true); + break; + } + +#if CONFIG_PGTABLE_LEVELS == 2 + pr_warn("Kernel not compiled for PAE. No mitigation for L1TF\n"); + return; +#endif + + /* + * This is extremely unlikely to happen because almost all + * systems have far more MAX_PA/2 than RAM can be fit into + * DIMM slots. + */ + half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT; + if (e820__mapped_any(half_pa, ULLONG_MAX - half_pa, E820_TYPE_RAM)) { + pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n"); + pr_info("You may make it effective by booting the kernel with mem=%llu parameter.\n", + half_pa); + pr_info("However, doing so will make a part of your RAM unusable.\n"); + pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html might help you decide.\n"); + return; + } + + setup_force_cpu_cap(X86_FEATURE_L1TF_PTEINV); +} + +static int __init l1tf_cmdline(char *str) +{ + if (!boot_cpu_has_bug(X86_BUG_L1TF)) + return 0; + + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) + l1tf_mitigation = L1TF_MITIGATION_OFF; + else if (!strcmp(str, "flush,nowarn")) + l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOWARN; + else if (!strcmp(str, "flush")) + l1tf_mitigation = L1TF_MITIGATION_FLUSH; + else if (!strcmp(str, "flush,nosmt")) + l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT; + else if (!strcmp(str, "full")) + l1tf_mitigation = L1TF_MITIGATION_FULL; + else if (!strcmp(str, "full,force")) + l1tf_mitigation = L1TF_MITIGATION_FULL_FORCE; + + return 0; +} +early_param("l1tf", l1tf_cmdline); + +#undef pr_fmt + #ifdef CONFIG_SYSFS +#define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion" + +#if IS_ENABLED(CONFIG_KVM_INTEL) +static const char *l1tf_vmx_states[] = { + [VMENTER_L1D_FLUSH_AUTO] = "auto", + [VMENTER_L1D_FLUSH_NEVER] = "vulnerable", + [VMENTER_L1D_FLUSH_COND] = "conditional cache flushes", + [VMENTER_L1D_FLUSH_ALWAYS] = "cache flushes", + [VMENTER_L1D_FLUSH_EPT_DISABLED] = "EPT disabled", + [VMENTER_L1D_FLUSH_NOT_REQUIRED] = "flush not necessary" +}; + +static ssize_t l1tf_show_state(char *buf) +{ + if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) + return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG); + + if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_EPT_DISABLED || + (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER && + cpu_smt_control == CPU_SMT_ENABLED)) + return sprintf(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG, + l1tf_vmx_states[l1tf_vmx_mitigation]); + + return sprintf(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG, + l1tf_vmx_states[l1tf_vmx_mitigation], + cpu_smt_control == CPU_SMT_ENABLED ? "vulnerable" : "disabled"); +} +#else +static ssize_t l1tf_show_state(char *buf) +{ + return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG); +} +#endif + static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { @@ -676,6 +803,10 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_SPEC_STORE_BYPASS: return sprintf(buf, "%s\n", ssb_strings[ssb_mode]); + case X86_BUG_L1TF: + if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV)) + return l1tf_show_state(buf); + break; default: break; } @@ -702,4 +833,9 @@ ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute * { return cpu_show_common(dev, attr, buf, X86_BUG_SPEC_STORE_BYPASS); } + +ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_L1TF); +} #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index ba6b8bb1c036..84dee5ab745a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -661,33 +661,36 @@ static void cpu_detect_tlb(struct cpuinfo_x86 *c) tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]); } -void detect_ht(struct cpuinfo_x86 *c) +int detect_ht_early(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP u32 eax, ebx, ecx, edx; - int index_msb, core_bits; - static bool printed; if (!cpu_has(c, X86_FEATURE_HT)) - return; + return -1; if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) - goto out; + return -1; if (cpu_has(c, X86_FEATURE_XTOPOLOGY)) - return; + return -1; cpuid(1, &eax, &ebx, &ecx, &edx); smp_num_siblings = (ebx & 0xff0000) >> 16; - - if (smp_num_siblings == 1) { + if (smp_num_siblings == 1) pr_info_once("CPU0: Hyper-Threading is disabled\n"); - goto out; - } +#endif + return 0; +} - if (smp_num_siblings <= 1) - goto out; +void detect_ht(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + int index_msb, core_bits; + + if (detect_ht_early(c) < 0) + return; index_msb = get_count_order(smp_num_siblings); c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb); @@ -700,15 +703,6 @@ void detect_ht(struct cpuinfo_x86 *c) c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) & ((1 << core_bits) - 1); - -out: - if (!printed && (c->x86_max_cores * smp_num_siblings) > 1) { - pr_info("CPU: Physical Processor ID: %d\n", - c->phys_proc_id); - pr_info("CPU: Processor Core ID: %d\n", - c->cpu_core_id); - printed = 1; - } #endif } @@ -911,7 +905,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c) apply_forced_caps(c); } -static void get_cpu_address_sizes(struct cpuinfo_x86 *c) +void get_cpu_address_sizes(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; @@ -987,6 +981,21 @@ static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = { {} }; +static const __initconst struct x86_cpu_id cpu_no_l1tf[] = { + /* in addition to cpu_no_speculation */ + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MOOREFIELD }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_DENVERTON }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GEMINI_LAKE }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM }, + {} +}; + static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) { u64 ia32_cap = 0; @@ -1016,6 +1025,11 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) return; setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); + + if (x86_match_cpu(cpu_no_l1tf)) + return; + + setup_force_cpu_bug(X86_BUG_L1TF); } /* diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 38216f678fc3..7b229afa0a37 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -46,6 +46,7 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[], *const __x86_cpu_dev_end[]; extern void get_cpu_cap(struct cpuinfo_x86 *c); +extern void get_cpu_address_sizes(struct cpuinfo_x86 *c); extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); extern u32 get_scattered_cpuid_leaf(unsigned int level, @@ -55,7 +56,9 @@ extern void init_intel_cacheinfo(struct cpuinfo_x86 *c); extern void init_amd_cacheinfo(struct cpuinfo_x86 *c); extern void detect_num_cpu_cores(struct cpuinfo_x86 *c); +extern int detect_extended_topology_early(struct cpuinfo_x86 *c); extern int detect_extended_topology(struct cpuinfo_x86 *c); +extern int detect_ht_early(struct cpuinfo_x86 *c); extern void detect_ht(struct cpuinfo_x86 *c); unsigned int aperfmperf_get_khz(int cpu); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index c050cd6066af..401e8c133108 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -301,6 +301,13 @@ static void early_init_intel(struct cpuinfo_x86 *c) } check_mpx_erratum(c); + + /* + * Get the number of SMT siblings early from the extended topology + * leaf, if available. Otherwise try the legacy SMT detection. + */ + if (detect_extended_topology_early(c) < 0) + detect_ht_early(c); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index d6d7ea7349d0..b799c00bef09 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -204,6 +204,7 @@ static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft) int ret; kn = __kernfs_create_file(parent_kn, rft->name, rft->mode, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, rft->kf_ops, rft, NULL, NULL); if (IS_ERR(kn)) return PTR_ERR(kn); @@ -2095,7 +2096,8 @@ static int mon_addfile(struct kernfs_node *parent_kn, const char *name, struct kernfs_node *kn; int ret = 0; - kn = __kernfs_create_file(parent_kn, name, 0444, 0, + kn = __kernfs_create_file(parent_kn, name, 0444, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, &kf_mondata_ops, priv, NULL, NULL); if (IS_ERR(kn)) return PTR_ERR(kn); diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 374d1aa66952..ceb67cd5918f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -113,21 +113,6 @@ static inline void mce_register_injector_chain(struct notifier_block *nb) { } static inline void mce_unregister_injector_chain(struct notifier_block *nb) { } #endif -#ifndef CONFIG_X86_64 -/* - * On 32-bit systems it would be difficult to safely unmap a poison page - * from the kernel 1:1 map because there are no non-canonical addresses that - * we can use to refer to the address without risking a speculative access. - * However, this isn't much of an issue because: - * 1) Few unmappable pages are in the 1:1 map. Most are in HIGHMEM which - * are only mapped into the kernel as needed - * 2) Few people would run a 32-bit kernel on a machine that supports - * recoverable errors because they have too much memory to boot 32-bit. - */ -static inline void mce_unmap_kpfn(unsigned long pfn) {} -#define mce_unmap_kpfn mce_unmap_kpfn -#endif - struct mca_config { bool dont_log_ce; bool cmci_disabled; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 4b767284b7f5..953b3ce92dcc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -42,6 +42,7 @@ #include <linux/irq_work.h> #include <linux/export.h> #include <linux/jump_label.h> +#include <linux/set_memory.h> #include <asm/intel-family.h> #include <asm/processor.h> @@ -50,7 +51,6 @@ #include <asm/mce.h> #include <asm/msr.h> #include <asm/reboot.h> -#include <asm/set_memory.h> #include "mce-internal.h" @@ -108,10 +108,6 @@ static struct irq_work mce_irq_work; static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); -#ifndef mce_unmap_kpfn -static void mce_unmap_kpfn(unsigned long pfn); -#endif - /* * CPU/chipset specific EDAC code can register a notifier call here to print * MCE errors in a human-readable form. @@ -602,7 +598,7 @@ static int srao_decode_notifier(struct notifier_block *nb, unsigned long val, if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) { pfn = mce->addr >> PAGE_SHIFT; if (!memory_failure(pfn, 0)) - mce_unmap_kpfn(pfn); + set_mce_nospec(pfn); } return NOTIFY_OK; @@ -1072,38 +1068,10 @@ static int do_memory_failure(struct mce *m) if (ret) pr_err("Memory error not recovered"); else - mce_unmap_kpfn(m->addr >> PAGE_SHIFT); + set_mce_nospec(m->addr >> PAGE_SHIFT); return ret; } -#ifndef mce_unmap_kpfn -static void mce_unmap_kpfn(unsigned long pfn) -{ - unsigned long decoy_addr; - - /* - * Unmap this page from the kernel 1:1 mappings to make sure - * we don't log more errors because of speculative access to - * the page. - * We would like to just call: - * set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1); - * but doing that would radically increase the odds of a - * speculative access to the poison page because we'd have - * the virtual address of the kernel 1:1 mapping sitting - * around in registers. - * Instead we get tricky. We create a non-canonical address - * that looks just like the one we want, but has bit 63 flipped. - * This relies on set_memory_np() not checking whether we passed - * a legal address. - */ - - decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); - - if (set_memory_np(decoy_addr, 1)) - pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); -} -#endif - /* * Cases where we avoid rendezvous handler timeout: diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 08286269fd24..b9bc8a1a584e 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -509,12 +509,20 @@ static struct platform_device *microcode_pdev; static int check_online_cpus(void) { - if (num_online_cpus() == num_present_cpus()) - return 0; + unsigned int cpu; - pr_err("Not all CPUs online, aborting microcode update.\n"); + /* + * Make sure all CPUs are online. It's fine for SMT to be disabled if + * all the primary threads are still online. + */ + for_each_present_cpu(cpu) { + if (topology_is_primary_thread(cpu) && !cpu_online(cpu)) { + pr_err("Not all CPUs online, aborting microcode update.\n"); + return -EINVAL; + } + } - return -EINVAL; + return 0; } static atomic_t late_cpus_in; diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 031082c96db8..ad12733f6058 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -41,7 +41,7 @@ static void (*hv_stimer0_handler)(void); static void (*hv_kexec_handler)(void); static void (*hv_crash_handler)(struct pt_regs *regs); -void hyperv_vector_handler(struct pt_regs *regs) +__visible void __irq_entry hyperv_vector_handler(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -50,7 +50,7 @@ void hyperv_vector_handler(struct pt_regs *regs) if (vmbus_handler) vmbus_handler(); - if (ms_hyperv.hints & HV_X64_DEPRECATING_AEOI_RECOMMENDED) + if (ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED) ack_APIC_irq(); exiting_irq(); @@ -300,7 +300,7 @@ static void __init ms_hyperv_init_platform(void) hyperv_reenlightenment_vector); /* Setup the IDT for stimer0 */ - if (ms_hyperv.misc_features & HV_X64_STIMER_DIRECT_MODE_AVAILABLE) + if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) alloc_intr_gate(HYPERV_STIMER0_VECTOR, hv_stimer0_callback_vector); #endif diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 81c0afb39d0a..71ca064e3794 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -22,18 +22,10 @@ #define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) #define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff) -/* - * Check for extended topology enumeration cpuid leaf 0xb and if it - * exists, use it for populating initial_apicid and cpu topology - * detection. - */ -int detect_extended_topology(struct cpuinfo_x86 *c) +int detect_extended_topology_early(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP - unsigned int eax, ebx, ecx, edx, sub_index; - unsigned int ht_mask_width, core_plus_mask_width; - unsigned int core_select_mask, core_level_siblings; - static bool printed; + unsigned int eax, ebx, ecx, edx; if (c->cpuid_level < 0xb) return -1; @@ -52,10 +44,30 @@ int detect_extended_topology(struct cpuinfo_x86 *c) * initial apic id, which also represents 32-bit extended x2apic id. */ c->initial_apicid = edx; + smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx); +#endif + return 0; +} + +/* + * Check for extended topology enumeration cpuid leaf 0xb and if it + * exists, use it for populating initial_apicid and cpu topology + * detection. + */ +int detect_extended_topology(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + unsigned int eax, ebx, ecx, edx, sub_index; + unsigned int ht_mask_width, core_plus_mask_width; + unsigned int core_select_mask, core_level_siblings; + + if (detect_extended_topology_early(c) < 0) + return -1; /* * Populate HT related information from sub-leaf level 0. */ + cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx); core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx); core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); @@ -86,15 +98,6 @@ int detect_extended_topology(struct cpuinfo_x86 *c) c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); c->x86_max_cores = (core_level_siblings / smp_num_siblings); - - if (!printed) { - pr_info("CPU: Physical Processor ID: %d\n", - c->phys_proc_id); - if (c->x86_max_cores > 1) - pr_info("CPU: Processor Core ID: %d\n", - c->cpu_core_id); - printed = 1; - } #endif return 0; } diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index da5d8ac60062..50d5848bf22e 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -338,6 +338,18 @@ static resource_size_t __init gen3_stolen_base(int num, int slot, int func, return bsm & INTEL_BSM_MASK; } +static resource_size_t __init gen11_stolen_base(int num, int slot, int func, + resource_size_t stolen_size) +{ + u64 bsm; + + bsm = read_pci_config(num, slot, func, INTEL_GEN11_BSM_DW0); + bsm &= INTEL_BSM_MASK; + bsm |= (u64)read_pci_config(num, slot, func, INTEL_GEN11_BSM_DW1) << 32; + + return bsm; +} + static resource_size_t __init i830_stolen_size(int num, int slot, int func) { u16 gmch_ctrl; @@ -498,6 +510,11 @@ static const struct intel_early_ops chv_early_ops __initconst = { .stolen_size = chv_stolen_size, }; +static const struct intel_early_ops gen11_early_ops __initconst = { + .stolen_base = gen11_stolen_base, + .stolen_size = gen9_stolen_size, +}; + static const struct pci_device_id intel_early_ids[] __initconst = { INTEL_I830_IDS(&i830_early_ops), INTEL_I845G_IDS(&i845_early_ops), @@ -529,6 +546,7 @@ static const struct pci_device_id intel_early_ids[] __initconst = { INTEL_CFL_IDS(&gen9_early_ops), INTEL_GLK_IDS(&gen9_early_ops), INTEL_CNL_IDS(&gen9_early_ops), + INTEL_ICL_11_IDS(&gen11_early_ops), }; struct resource intel_graphics_stolen_res __ro_after_init = DEFINE_RES_MEM(0, 0); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index f92a6593de1e..2ea85b32421a 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -10,6 +10,7 @@ #include <asm/fpu/signal.h> #include <asm/fpu/types.h> #include <asm/traps.h> +#include <asm/irq_regs.h> #include <linux/hardirq.h> #include <linux/pkeys.h> diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 346b24883911..b0acb22e5a46 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -1,6 +1,7 @@ #include <linux/clocksource.h> #include <linux/clockchips.h> #include <linux/interrupt.h> +#include <linux/irq.h> #include <linux/export.h> #include <linux/delay.h> #include <linux/errno.h> diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 86c4439f9d74..519649ddf100 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -5,6 +5,7 @@ #include <linux/sched.h> #include <linux/ioport.h> #include <linux/interrupt.h> +#include <linux/irq.h> #include <linux/timex.h> #include <linux/random.h> #include <linux/init.h> diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 74383a3780dc..01adea278a71 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -8,6 +8,7 @@ #include <asm/traps.h> #include <asm/proto.h> #include <asm/desc.h> +#include <asm/hw_irq.h> struct idt_data { unsigned int vector; diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 328d027d829d..59b5f2ea7c2f 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -10,6 +10,7 @@ #include <linux/ftrace.h> #include <linux/delay.h> #include <linux/export.h> +#include <linux/irq.h> #include <asm/apic.h> #include <asm/io_apic.h> diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index c1bdbd3d3232..95600a99ae93 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -11,6 +11,7 @@ #include <linux/seq_file.h> #include <linux/interrupt.h> +#include <linux/irq.h> #include <linux/kernel_stat.h> #include <linux/notifier.h> #include <linux/cpu.h> diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index d86e344f5b3d..0469cd078db1 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -11,6 +11,7 @@ #include <linux/kernel_stat.h> #include <linux/interrupt.h> +#include <linux/irq.h> #include <linux/seq_file.h> #include <linux/delay.h> #include <linux/ftrace.h> diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 772196c1b8c4..a0693b71cfc1 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -5,6 +5,7 @@ #include <linux/sched.h> #include <linux/ioport.h> #include <linux/interrupt.h> +#include <linux/irq.h> #include <linux/timex.h> #include <linux/random.h> #include <linux/kprobes.h> diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 7326078eaa7a..278cd07228dd 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -532,7 +532,7 @@ static int bzImage64_cleanup(void *loader_data) static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len) { return verify_pefile_signature(kernel, kernel_len, - NULL, + VERIFY_USE_SECONDARY_KEYRING, VERIFYING_KEXEC_PE_SIGNATURE); } #endif diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 09aaabb2bbf1..d9b71924c23c 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -45,6 +45,7 @@ #include <asm/apic.h> #include <asm/apicdef.h> #include <asm/hypervisor.h> +#include <asm/tlb.h> static int kvmapf = 1; @@ -444,6 +445,98 @@ static void __init sev_map_percpu_data(void) } #ifdef CONFIG_SMP +#define KVM_IPI_CLUSTER_SIZE (2 * BITS_PER_LONG) + +static void __send_ipi_mask(const struct cpumask *mask, int vector) +{ + unsigned long flags; + int cpu, apic_id, icr; + int min = 0, max = 0; +#ifdef CONFIG_X86_64 + __uint128_t ipi_bitmap = 0; +#else + u64 ipi_bitmap = 0; +#endif + + if (cpumask_empty(mask)) + return; + + local_irq_save(flags); + + switch (vector) { + default: + icr = APIC_DM_FIXED | vector; + break; + case NMI_VECTOR: + icr = APIC_DM_NMI; + break; + } + + for_each_cpu(cpu, mask) { + apic_id = per_cpu(x86_cpu_to_apicid, cpu); + if (!ipi_bitmap) { + min = max = apic_id; + } else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) { + ipi_bitmap <<= min - apic_id; + min = apic_id; + } else if (apic_id < min + KVM_IPI_CLUSTER_SIZE) { + max = apic_id < max ? max : apic_id; + } else { + kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, + (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr); + min = max = apic_id; + ipi_bitmap = 0; + } + __set_bit(apic_id - min, (unsigned long *)&ipi_bitmap); + } + + if (ipi_bitmap) { + kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, + (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr); + } + + local_irq_restore(flags); +} + +static void kvm_send_ipi_mask(const struct cpumask *mask, int vector) +{ + __send_ipi_mask(mask, vector); +} + +static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector) +{ + unsigned int this_cpu = smp_processor_id(); + struct cpumask new_mask; + const struct cpumask *local_mask; + + cpumask_copy(&new_mask, mask); + cpumask_clear_cpu(this_cpu, &new_mask); + local_mask = &new_mask; + __send_ipi_mask(local_mask, vector); +} + +static void kvm_send_ipi_allbutself(int vector) +{ + kvm_send_ipi_mask_allbutself(cpu_online_mask, vector); +} + +static void kvm_send_ipi_all(int vector) +{ + __send_ipi_mask(cpu_online_mask, vector); +} + +/* + * Set the IPI entry points + */ +static void kvm_setup_pv_ipi(void) +{ + apic->send_IPI_mask = kvm_send_ipi_mask; + apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself; + apic->send_IPI_allbutself = kvm_send_ipi_allbutself; + apic->send_IPI_all = kvm_send_ipi_all; + pr_info("KVM setup pv IPIs\n"); +} + static void __init kvm_smp_prepare_cpus(unsigned int max_cpus) { native_smp_prepare_cpus(max_cpus); @@ -544,8 +637,10 @@ static void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && !kvm_para_has_hint(KVM_HINTS_REALTIME) && - kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) + kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others; + pv_mmu_ops.tlb_remove_table = tlb_remove_table; + } if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) apic_set_eoi_write(kvm_guest_apic_eoi_write); @@ -611,13 +706,27 @@ static uint32_t __init kvm_detect(void) return kvm_cpuid_base(); } +static void __init kvm_apic_init(void) +{ +#if defined(CONFIG_SMP) + if (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI)) + kvm_setup_pv_ipi(); +#endif +} + +static void __init kvm_init_platform(void) +{ + kvmclock_init(); + x86_platform.apic_post_init = kvm_apic_init; +} + const __initconst struct hypervisor_x86 x86_hyper_kvm = { .name = "KVM", .detect = kvm_detect, .type = X86_HYPER_KVM, - .init.init_platform = kvmclock_init, .init.guest_late_init = kvm_guest_init, .init.x2apic_available = kvm_para_available, + .init.init_platform = kvm_init_platform, }; static __init int activate_jump_labels(void) @@ -736,6 +845,10 @@ void __init kvm_spinlock_init(void) if (kvm_para_has_hint(KVM_HINTS_REALTIME)) return; + /* Don't use the pvqspinlock code if there is only 1 vCPU. */ + if (num_possible_cpus() == 1) + return; + __pv_init_lock_hash(); pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index d2edd7e6c294..1e6764648af3 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -27,6 +27,7 @@ #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/mm.h> +#include <linux/slab.h> #include <asm/hypervisor.h> #include <asm/mem_encrypt.h> diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 930c88341e4e..afdb303285f8 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -41,6 +41,7 @@ #include <asm/tlbflush.h> #include <asm/timer.h> #include <asm/special_insns.h> +#include <asm/tlb.h> /* * nop stub, which must not clobber anything *including the stack* to @@ -409,6 +410,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = { .flush_tlb_kernel = native_flush_tlb_global, .flush_tlb_one_user = native_flush_tlb_one_user, .flush_tlb_others = native_flush_tlb_others, + .tlb_remove_table = (void (*)(struct mmu_gather *, void *))tlb_remove_page, .pgd_alloc = __paravirt_pgd_alloc, .pgd_free = paravirt_nop, diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index ab5d9dd668d2..7ba73fe0d917 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -40,8 +40,14 @@ int iommu_detected __read_mostly = 0; * devices and allow every device to access to whole physical memory. This is * useful if a user wants to use an IOMMU only for KVM device assignment to * guests and not for driver dma translation. + * It is also possible to disable by default in kernel config, and enable with + * iommu=nopt at boot time. */ +#ifdef CONFIG_IOMMU_DEFAULT_PASSTHROUGH +int iommu_pass_through __read_mostly = 1; +#else int iommu_pass_through __read_mostly; +#endif extern struct iommu_table_entry __iommu_table[], __iommu_table_end[]; @@ -135,6 +141,8 @@ static __init int iommu_setup(char *p) #endif if (!strncmp(p, "pt", 2)) iommu_pass_through = 1; + if (!strncmp(p, "nopt", 4)) + iommu_pass_through = 0; gart_parse_options(p); @@ -155,9 +163,6 @@ static int __init pci_iommu_init(void) { struct iommu_table_entry *p; -#ifdef CONFIG_PCI - dma_debug_add_bus(&pci_bus_type); -#endif x86_init.iommu.iommu_init(); for (p = __iommu_table; p < __iommu_table_end; p++) { @@ -175,7 +180,7 @@ rootfs_initcall(pci_iommu_init); static int via_no_dac_cb(struct pci_dev *pdev, void *data) { - pdev->dev.dma_32bit_limit = true; + pdev->dev.bus_dma_mask = DMA_BIT_MASK(32); return 0; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 476e3ddf8890..a451bc374b9b 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -384,6 +384,7 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) start_thread_common(regs, new_ip, new_sp, __USER_CS, __USER_DS, 0); } +EXPORT_SYMBOL_GPL(start_thread); #ifdef CONFIG_COMPAT void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 5d32c55aeb8b..b4866badb235 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -823,6 +823,12 @@ void __init setup_arch(char **cmdline_p) memblock_reserve(__pa_symbol(_text), (unsigned long)__bss_stop - (unsigned long)_text); + /* + * Make sure page 0 is always reserved because on systems with + * L1TF its contents can be leaked to user processes. + */ + memblock_reserve(0, PAGE_SIZE); + early_reserve_initrd(); /* @@ -993,11 +999,6 @@ void __init setup_arch(char **cmdline_p) setup_clear_cpu_cap(X86_FEATURE_APIC); } -#ifdef CONFIG_PCI - if (pci_early_dump_regs) - early_dump_pci_devices(); -#endif - e820__reserve_setup_data(); e820__finish_early_params(); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 5c574dff4c1a..04adc8d60aed 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -261,6 +261,7 @@ __visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); inc_irq_stat(irq_resched_count); + kvm_set_cpu_l1tf_flush_l1d(); if (trace_resched_ipi_enabled()) { /* diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index db9656e13ea0..f02ecaf97904 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -80,6 +80,7 @@ #include <asm/intel-family.h> #include <asm/cpu_device_id.h> #include <asm/spec-ctrl.h> +#include <asm/hw_irq.h> /* representing HT siblings of each logical CPU */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); @@ -271,6 +272,23 @@ static void notrace start_secondary(void *unused) } /** + * topology_is_primary_thread - Check whether CPU is the primary SMT thread + * @cpu: CPU to check + */ +bool topology_is_primary_thread(unsigned int cpu) +{ + return apic_id_is_primary_thread(per_cpu(x86_cpu_to_apicid, cpu)); +} + +/** + * topology_smt_supported - Check whether SMT is supported by the CPUs + */ +bool topology_smt_supported(void) +{ + return smp_num_siblings > 1; +} + +/** * topology_phys_to_logical_pkg - Map a physical package id to a logical * * Returns logical package id or -1 if not found diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 774ebafa97c4..be01328eb755 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -12,6 +12,7 @@ #include <linux/clockchips.h> #include <linux/interrupt.h> +#include <linux/irq.h> #include <linux/i8253.h> #include <linux/time.h> #include <linux/export.h> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 7e042e3d47fd..7bcfa61375c0 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -621,7 +621,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) | (1 << KVM_FEATURE_PV_UNHALT) | (1 << KVM_FEATURE_PV_TLB_FLUSH) | - (1 << KVM_FEATURE_ASYNC_PF_VMEXIT); + (1 << KVM_FEATURE_ASYNC_PF_VMEXIT) | + (1 << KVM_FEATURE_PV_SEND_IPI); if (sched_info_on()) entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 4c4f4263420c..106482da6388 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4191,7 +4191,7 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt) maxphyaddr = 36; rsvd = rsvd_bits(maxphyaddr, 63); if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PCIDE) - rsvd &= ~CR3_PCID_INVD; + rsvd &= ~X86_CR3_PCID_NOFLUSH; } if (new_val & rsvd) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index af8caf965baa..01d209ab5481 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -235,7 +235,7 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, struct kvm_vcpu *vcpu = synic_to_vcpu(synic); int ret; - if (!synic->active) + if (!synic->active && !host) return 1; trace_kvm_hv_synic_set_msr(vcpu->vcpu_id, msr, data, host); @@ -295,11 +295,12 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, return ret; } -static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata) +static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata, + bool host) { int ret; - if (!synic->active) + if (!synic->active && !host) return 1; ret = 0; @@ -1014,6 +1015,11 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, case HV_X64_MSR_TSC_EMULATION_STATUS: hv->hv_tsc_emulation_status = data; break; + case HV_X64_MSR_TIME_REF_COUNT: + /* read-only, but still ignore it if host-initiated */ + if (!host) + return 1; + break; default: vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", msr, data); @@ -1101,6 +1107,12 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return stimer_set_count(vcpu_to_stimer(vcpu, timer_index), data, host); } + case HV_X64_MSR_TSC_FREQUENCY: + case HV_X64_MSR_APIC_FREQUENCY: + /* read-only, but still ignore it if host-initiated */ + if (!host) + return 1; + break; default: vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", msr, data); @@ -1156,7 +1168,8 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return 0; } -static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, + bool host) { u64 data = 0; struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; @@ -1183,7 +1196,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case HV_X64_MSR_SIMP: case HV_X64_MSR_EOM: case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: - return synic_get_msr(vcpu_to_synic(vcpu), msr, pdata); + return synic_get_msr(vcpu_to_synic(vcpu), msr, pdata, host); case HV_X64_MSR_STIMER0_CONFIG: case HV_X64_MSR_STIMER1_CONFIG: case HV_X64_MSR_STIMER2_CONFIG: @@ -1229,7 +1242,7 @@ int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return kvm_hv_set_msr(vcpu, msr, data, host); } -int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { if (kvm_hv_msr_partition_wide(msr)) { int r; @@ -1239,7 +1252,7 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock); return r; } else - return kvm_hv_get_msr(vcpu, msr, pdata); + return kvm_hv_get_msr(vcpu, msr, pdata, host); } static __always_inline int get_sparse_bank_no(u64 valid_bank_mask, int bank_no) diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 837465d69c6d..d6aa969e20f1 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -48,7 +48,7 @@ static inline struct kvm_vcpu *synic_to_vcpu(struct kvm_vcpu_hv_synic *synic) } int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host); -int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); +int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host); bool kvm_hv_hypercall_enabled(struct kvm *kvm); int kvm_hv_hypercall(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index d536d457517b..0cefba28c864 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -547,6 +547,46 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, irq->level, irq->trig_mode, dest_map); } +int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, + unsigned long ipi_bitmap_high, int min, + unsigned long icr, int op_64_bit) +{ + int i; + struct kvm_apic_map *map; + struct kvm_vcpu *vcpu; + struct kvm_lapic_irq irq = {0}; + int cluster_size = op_64_bit ? 64 : 32; + int count = 0; + + irq.vector = icr & APIC_VECTOR_MASK; + irq.delivery_mode = icr & APIC_MODE_MASK; + irq.level = (icr & APIC_INT_ASSERT) != 0; + irq.trig_mode = icr & APIC_INT_LEVELTRIG; + + if (icr & APIC_DEST_MASK) + return -KVM_EINVAL; + if (icr & APIC_SHORT_MASK) + return -KVM_EINVAL; + + rcu_read_lock(); + map = rcu_dereference(kvm->arch.apic_map); + + /* Bits above cluster_size are masked in the caller. */ + for_each_set_bit(i, &ipi_bitmap_low, BITS_PER_LONG) { + vcpu = map->phys_map[min + i]->vcpu; + count += kvm_apic_set_irq(vcpu, &irq, NULL); + } + + min += cluster_size; + for_each_set_bit(i, &ipi_bitmap_high, BITS_PER_LONG) { + vcpu = map->phys_map[min + i]->vcpu; + count += kvm_apic_set_irq(vcpu, &irq, NULL); + } + + rcu_read_unlock(); + return count; +} + static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) { diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 6b8f11521c41..a282321329b5 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -178,7 +178,24 @@ struct kvm_shadow_walk_iterator { unsigned index; }; -#define for_each_shadow_entry(_vcpu, _addr, _walker) \ +static const union kvm_mmu_page_role mmu_base_role_mask = { + .cr0_wp = 1, + .cr4_pae = 1, + .nxe = 1, + .smep_andnot_wp = 1, + .smap_andnot_wp = 1, + .smm = 1, + .guest_mode = 1, + .ad_disabled = 1, +}; + +#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker) \ + for (shadow_walk_init_using_root(&(_walker), (_vcpu), \ + (_root), (_addr)); \ + shadow_walk_okay(&(_walker)); \ + shadow_walk_next(&(_walker))) + +#define for_each_shadow_entry(_vcpu, _addr, _walker) \ for (shadow_walk_init(&(_walker), _vcpu, _addr); \ shadow_walk_okay(&(_walker)); \ shadow_walk_next(&(_walker))) @@ -221,7 +238,20 @@ static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK | PT64_EPT_EXECUTABLE_MASK; static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT; +/* + * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order + * to guard against L1TF attacks. + */ +static u64 __read_mostly shadow_nonpresent_or_rsvd_mask; + +/* + * The number of high-order 1 bits to use in the mask above. + */ +static const u64 shadow_nonpresent_or_rsvd_mask_len = 5; + static void mmu_spte_set(u64 *sptep, u64 spte); +static union kvm_mmu_page_role +kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value) { @@ -308,9 +338,13 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, { unsigned int gen = kvm_current_mmio_generation(vcpu); u64 mask = generation_mmio_spte_mask(gen); + u64 gpa = gfn << PAGE_SHIFT; access &= ACC_WRITE_MASK | ACC_USER_MASK; - mask |= shadow_mmio_value | access | gfn << PAGE_SHIFT; + mask |= shadow_mmio_value | access; + mask |= gpa | shadow_nonpresent_or_rsvd_mask; + mask |= (gpa & shadow_nonpresent_or_rsvd_mask) + << shadow_nonpresent_or_rsvd_mask_len; trace_mark_mmio_spte(sptep, gfn, access, gen); mmu_spte_set(sptep, mask); @@ -323,8 +357,14 @@ static bool is_mmio_spte(u64 spte) static gfn_t get_mmio_spte_gfn(u64 spte) { - u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask; - return (spte & ~mask) >> PAGE_SHIFT; + u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask | + shadow_nonpresent_or_rsvd_mask; + u64 gpa = spte & ~mask; + + gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len) + & shadow_nonpresent_or_rsvd_mask; + + return gpa >> PAGE_SHIFT; } static unsigned get_mmio_spte_access(u64 spte) @@ -381,7 +421,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); -static void kvm_mmu_clear_all_pte_masks(void) +static void kvm_mmu_reset_all_pte_masks(void) { shadow_user_mask = 0; shadow_accessed_mask = 0; @@ -391,6 +431,18 @@ static void kvm_mmu_clear_all_pte_masks(void) shadow_mmio_mask = 0; shadow_present_mask = 0; shadow_acc_track_mask = 0; + + /* + * If the CPU has 46 or less physical address bits, then set an + * appropriate mask to guard against L1TF attacks. Otherwise, it is + * assumed that the CPU is not vulnerable to L1TF. + */ + if (boot_cpu_data.x86_phys_bits < + 52 - shadow_nonpresent_or_rsvd_mask_len) + shadow_nonpresent_or_rsvd_mask = + rsvd_bits(boot_cpu_data.x86_phys_bits - + shadow_nonpresent_or_rsvd_mask_len, + boot_cpu_data.x86_phys_bits - 1); } static int is_cpuid_PSE36(void) @@ -1986,7 +2038,7 @@ static int nonpaging_sync_page(struct kvm_vcpu *vcpu, return 0; } -static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) +static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root) { } @@ -2117,12 +2169,8 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { - if (sp->role.cr4_pae != !!is_pae(vcpu)) { - kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); - return false; - } - - if (vcpu->arch.mmu.sync_page(vcpu, sp) == 0) { + if (sp->role.cr4_pae != !!is_pae(vcpu) + || vcpu->arch.mmu.sync_page(vcpu, sp) == 0) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return false; } @@ -2392,11 +2440,12 @@ out: return sp; } -static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, - struct kvm_vcpu *vcpu, u64 addr) +static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator, + struct kvm_vcpu *vcpu, hpa_t root, + u64 addr) { iterator->addr = addr; - iterator->shadow_addr = vcpu->arch.mmu.root_hpa; + iterator->shadow_addr = root; iterator->level = vcpu->arch.mmu.shadow_root_level; if (iterator->level == PT64_ROOT_4LEVEL && @@ -2405,6 +2454,12 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, --iterator->level; if (iterator->level == PT32E_ROOT_LEVEL) { + /* + * prev_root is currently only used for 64-bit hosts. So only + * the active root_hpa is valid here. + */ + BUG_ON(root != vcpu->arch.mmu.root_hpa); + iterator->shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; iterator->shadow_addr &= PT64_BASE_ADDR_MASK; @@ -2414,6 +2469,13 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, } } +static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, + struct kvm_vcpu *vcpu, u64 addr) +{ + shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu.root_hpa, + addr); +} + static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) { if (iterator->level < PT_PAGE_TABLE_LEVEL) @@ -2702,6 +2764,45 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_unsync_page(vcpu, sp); } + /* + * We need to ensure that the marking of unsync pages is visible + * before the SPTE is updated to allow writes because + * kvm_mmu_sync_roots() checks the unsync flags without holding + * the MMU lock and so can race with this. If the SPTE was updated + * before the page had been marked as unsync-ed, something like the + * following could happen: + * + * CPU 1 CPU 2 + * --------------------------------------------------------------------- + * 1.2 Host updates SPTE + * to be writable + * 2.1 Guest writes a GPTE for GVA X. + * (GPTE being in the guest page table shadowed + * by the SP from CPU 1.) + * This reads SPTE during the page table walk. + * Since SPTE.W is read as 1, there is no + * fault. + * + * 2.2 Guest issues TLB flush. + * That causes a VM Exit. + * + * 2.3 kvm_mmu_sync_pages() reads sp->unsync. + * Since it is false, so it just returns. + * + * 2.4 Guest accesses GVA X. + * Since the mapping in the SP was not updated, + * so the old mapping for GVA X incorrectly + * gets used. + * 1.1 Host marks SP + * as unsync + * (sp->unsync = true) + * + * The write barrier below ensures that 1.1 happens before 1.2 and thus + * the situation in 2.4 does not arise. The implicit barrier in 2.2 + * pairs with this write barrier. + */ + smp_wmb(); + return false; } @@ -2724,6 +2825,10 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) return true; } +/* Bits which may be returned by set_spte() */ +#define SET_SPTE_WRITE_PROTECTED_PT BIT(0) +#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) + static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, int level, gfn_t gfn, kvm_pfn_t pfn, bool speculative, @@ -2800,7 +2905,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { pgprintk("%s: found shadow page for %llx, marking ro\n", __func__, gfn); - ret = 1; + ret |= SET_SPTE_WRITE_PROTECTED_PT; pte_access &= ~ACC_WRITE_MASK; spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); } @@ -2816,7 +2921,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, set_pte: if (mmu_spte_update(sptep, spte)) - kvm_flush_remote_tlbs(vcpu->kvm); + ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; done: return ret; } @@ -2827,7 +2932,9 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, { int was_rmapped = 0; int rmap_count; + int set_spte_ret; int ret = RET_PF_RETRY; + bool flush = false; pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, *sptep, write_fault, gfn); @@ -2844,22 +2951,25 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, child = page_header(pte & PT64_BASE_ADDR_MASK); drop_parent_pte(child, sptep); - kvm_flush_remote_tlbs(vcpu->kvm); + flush = true; } else if (pfn != spte_to_pfn(*sptep)) { pgprintk("hfn old %llx new %llx\n", spte_to_pfn(*sptep), pfn); drop_spte(vcpu->kvm, sptep); - kvm_flush_remote_tlbs(vcpu->kvm); + flush = true; } else was_rmapped = 1; } - if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative, - true, host_writable)) { + set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn, + speculative, true, host_writable); + if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) { if (write_fault) ret = RET_PF_EMULATE; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } + if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush) + kvm_flush_remote_tlbs(vcpu->kvm); if (unlikely(is_mmio_spte(*sptep))) ret = RET_PF_EMULATE; @@ -3358,26 +3468,47 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, *root_hpa = INVALID_PAGE; } -void kvm_mmu_free_roots(struct kvm_vcpu *vcpu) +/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */ +void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free) { int i; LIST_HEAD(invalid_list); struct kvm_mmu *mmu = &vcpu->arch.mmu; + bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT; - if (!VALID_PAGE(mmu->root_hpa)) - return; + BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG); + + /* Before acquiring the MMU lock, see if we need to do any real work. */ + if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) { + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) && + VALID_PAGE(mmu->prev_roots[i].hpa)) + break; + + if (i == KVM_MMU_NUM_PREV_ROOTS) + return; + } spin_lock(&vcpu->kvm->mmu_lock); - if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && - (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) { - mmu_free_root_page(vcpu->kvm, &mmu->root_hpa, &invalid_list); - } else { - for (i = 0; i < 4; ++i) - if (mmu->pae_root[i] != 0) - mmu_free_root_page(vcpu->kvm, &mmu->pae_root[i], - &invalid_list); - mmu->root_hpa = INVALID_PAGE; + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) + mmu_free_root_page(vcpu->kvm, &mmu->prev_roots[i].hpa, + &invalid_list); + + if (free_active_root) { + if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && + (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) { + mmu_free_root_page(vcpu->kvm, &mmu->root_hpa, + &invalid_list); + } else { + for (i = 0; i < 4; ++i) + if (mmu->pae_root[i] != 0) + mmu_free_root_page(vcpu->kvm, + &mmu->pae_root[i], + &invalid_list); + mmu->root_hpa = INVALID_PAGE; + } } kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); @@ -3546,7 +3677,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) return mmu_alloc_shadow_roots(vcpu); } -static void mmu_sync_roots(struct kvm_vcpu *vcpu) +void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) { int i; struct kvm_mmu_page *sp; @@ -3558,14 +3689,39 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) return; vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); - kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); + if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; + sp = page_header(root); + + /* + * Even if another CPU was marking the SP as unsync-ed + * simultaneously, any guest page table changes are not + * guaranteed to be visible anyway until this VCPU issues a TLB + * flush strictly after those changes are made. We only need to + * ensure that the other CPU sets these flags before any actual + * changes to the page tables are made. The comments in + * mmu_need_write_protect() describe what could go wrong if this + * requirement isn't satisfied. + */ + if (!smp_load_acquire(&sp->unsync) && + !smp_load_acquire(&sp->unsync_children)) + return; + + spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); + mmu_sync_children(vcpu, sp); + kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); + spin_unlock(&vcpu->kvm->mmu_lock); return; } + + spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); + for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; @@ -3575,13 +3731,8 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) mmu_sync_children(vcpu, sp); } } - kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); -} -void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) -{ - spin_lock(&vcpu->kvm->mmu_lock); - mmu_sync_roots(vcpu); + kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); spin_unlock(&vcpu->kvm->mmu_lock); } EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots); @@ -3840,6 +3991,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, { int r = 1; + vcpu->arch.l1tf_flush_l1d = true; switch (vcpu->arch.apf.host_apf_reason) { default: trace_kvm_page_fault(fault_address, error_code); @@ -3947,16 +4099,107 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu, context->update_pte = nonpaging_update_pte; context->root_level = 0; context->shadow_root_level = PT32E_ROOT_LEVEL; - context->root_hpa = INVALID_PAGE; context->direct_map = true; context->nx = false; } -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu) +/* + * Find out if a previously cached root matching the new CR3/role is available. + * The current root is also inserted into the cache. + * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is + * returned. + * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and + * false is returned. This root should now be freed by the caller. + */ +static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3, + union kvm_mmu_page_role new_role) +{ + uint i; + struct kvm_mmu_root_info root; + struct kvm_mmu *mmu = &vcpu->arch.mmu; + + root.cr3 = mmu->get_cr3(vcpu); + root.hpa = mmu->root_hpa; + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { + swap(root, mmu->prev_roots[i]); + + if (new_cr3 == root.cr3 && VALID_PAGE(root.hpa) && + page_header(root.hpa) != NULL && + new_role.word == page_header(root.hpa)->role.word) + break; + } + + mmu->root_hpa = root.hpa; + + return i < KVM_MMU_NUM_PREV_ROOTS; +} + +static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, + union kvm_mmu_page_role new_role, + bool skip_tlb_flush) { - kvm_mmu_free_roots(vcpu); + struct kvm_mmu *mmu = &vcpu->arch.mmu; + + /* + * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid + * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs + * later if necessary. + */ + if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && + mmu->root_level >= PT64_ROOT_4LEVEL) { + if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT)) + return false; + + if (cached_root_available(vcpu, new_cr3, new_role)) { + /* + * It is possible that the cached previous root page is + * obsolete because of a change in the MMU + * generation number. However, that is accompanied by + * KVM_REQ_MMU_RELOAD, which will free the root that we + * have set here and allocate a new one. + */ + + kvm_make_request(KVM_REQ_LOAD_CR3, vcpu); + if (!skip_tlb_flush) { + kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); + kvm_x86_ops->tlb_flush(vcpu, true); + } + + /* + * The last MMIO access's GVA and GPA are cached in the + * VCPU. When switching to a new CR3, that GVA->GPA + * mapping may no longer be valid. So clear any cached + * MMIO info even when we don't need to sync the shadow + * page tables. + */ + vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); + + __clear_sp_write_flooding_count( + page_header(mmu->root_hpa)); + + return true; + } + } + + return false; } +static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, + union kvm_mmu_page_role new_role, + bool skip_tlb_flush) +{ + if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush)) + kvm_mmu_free_roots(vcpu, KVM_MMU_ROOT_CURRENT); +} + +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush) +{ + __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu), + skip_tlb_flush); +} +EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3); + static unsigned long get_cr3(struct kvm_vcpu *vcpu) { return kvm_read_cr3(vcpu); @@ -4431,7 +4674,6 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu, context->invlpg = paging64_invlpg; context->update_pte = paging64_update_pte; context->shadow_root_level = level; - context->root_hpa = INVALID_PAGE; context->direct_map = false; } @@ -4461,7 +4703,6 @@ static void paging32_init_context(struct kvm_vcpu *vcpu, context->invlpg = paging32_invlpg; context->update_pte = paging32_update_pte; context->shadow_root_level = PT32E_ROOT_LEVEL; - context->root_hpa = INVALID_PAGE; context->direct_map = false; } @@ -4471,20 +4712,32 @@ static void paging32E_init_context(struct kvm_vcpu *vcpu, paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL); } +static union kvm_mmu_page_role +kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu) +{ + union kvm_mmu_page_role role = {0}; + + role.guest_mode = is_guest_mode(vcpu); + role.smm = is_smm(vcpu); + role.ad_disabled = (shadow_accessed_mask == 0); + role.level = kvm_x86_ops->get_tdp_level(vcpu); + role.direct = true; + role.access = ACC_ALL; + + return role; +} + static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) { struct kvm_mmu *context = &vcpu->arch.mmu; - context->base_role.word = 0; - context->base_role.guest_mode = is_guest_mode(vcpu); - context->base_role.smm = is_smm(vcpu); - context->base_role.ad_disabled = (shadow_accessed_mask == 0); + context->base_role.word = mmu_base_role_mask.word & + kvm_calc_tdp_mmu_root_page_role(vcpu).word; context->page_fault = tdp_page_fault; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; context->update_pte = nonpaging_update_pte; context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu); - context->root_hpa = INVALID_PAGE; context->direct_map = true; context->set_cr3 = kvm_x86_ops->set_tdp_cr3; context->get_cr3 = get_cr3; @@ -4519,13 +4772,36 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) reset_tdp_shadow_zero_bits_mask(vcpu, context); } -void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) +static union kvm_mmu_page_role +kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu) { + union kvm_mmu_page_role role = {0}; bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); bool smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); - struct kvm_mmu *context = &vcpu->arch.mmu; - MMU_WARN_ON(VALID_PAGE(context->root_hpa)); + role.nxe = is_nx(vcpu); + role.cr4_pae = !!is_pae(vcpu); + role.cr0_wp = is_write_protection(vcpu); + role.smep_andnot_wp = smep && !is_write_protection(vcpu); + role.smap_andnot_wp = smap && !is_write_protection(vcpu); + role.guest_mode = is_guest_mode(vcpu); + role.smm = is_smm(vcpu); + role.direct = !is_paging(vcpu); + role.access = ACC_ALL; + + if (!is_long_mode(vcpu)) + role.level = PT32E_ROOT_LEVEL; + else if (is_la57_mode(vcpu)) + role.level = PT64_ROOT_5LEVEL; + else + role.level = PT64_ROOT_4LEVEL; + + return role; +} + +void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *context = &vcpu->arch.mmu; if (!is_paging(vcpu)) nonpaging_init_context(vcpu, context); @@ -4536,26 +4812,34 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) else paging32_init_context(vcpu, context); - context->base_role.nxe = is_nx(vcpu); - context->base_role.cr4_pae = !!is_pae(vcpu); - context->base_role.cr0_wp = is_write_protection(vcpu); - context->base_role.smep_andnot_wp - = smep && !is_write_protection(vcpu); - context->base_role.smap_andnot_wp - = smap && !is_write_protection(vcpu); - context->base_role.guest_mode = is_guest_mode(vcpu); - context->base_role.smm = is_smm(vcpu); + context->base_role.word = mmu_base_role_mask.word & + kvm_calc_shadow_mmu_root_page_role(vcpu).word; reset_shadow_zero_bits_mask(vcpu, context); } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); +static union kvm_mmu_page_role +kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty) +{ + union kvm_mmu_page_role role = vcpu->arch.mmu.base_role; + + role.level = PT64_ROOT_4LEVEL; + role.direct = false; + role.ad_disabled = !accessed_dirty; + role.guest_mode = true; + role.access = ACC_ALL; + + return role; +} + void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, - bool accessed_dirty) + bool accessed_dirty, gpa_t new_eptp) { struct kvm_mmu *context = &vcpu->arch.mmu; + union kvm_mmu_page_role root_page_role = + kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty); - MMU_WARN_ON(VALID_PAGE(context->root_hpa)); - + __kvm_mmu_new_cr3(vcpu, new_eptp, root_page_role, false); context->shadow_root_level = PT64_ROOT_4LEVEL; context->nx = true; @@ -4566,10 +4850,8 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, context->invlpg = ept_invlpg; context->update_pte = ept_update_pte; context->root_level = PT64_ROOT_4LEVEL; - context->root_hpa = INVALID_PAGE; context->direct_map = false; - context->base_role.ad_disabled = !accessed_dirty; - context->base_role.guest_mode = 1; + context->base_role.word = root_page_role.word & mmu_base_role_mask.word; update_permission_bitmask(vcpu, context, true); update_pkru_bitmask(vcpu, context, true); update_last_nonleaf_level(vcpu, context); @@ -4632,8 +4914,17 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) update_last_nonleaf_level(vcpu, g_context); } -static void init_kvm_mmu(struct kvm_vcpu *vcpu) +void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots) { + if (reset_roots) { + uint i; + + vcpu->arch.mmu.root_hpa = INVALID_PAGE; + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; + } + if (mmu_is_nested(vcpu)) init_kvm_nested_mmu(vcpu); else if (tdp_enabled) @@ -4641,11 +4932,21 @@ static void init_kvm_mmu(struct kvm_vcpu *vcpu) else init_kvm_softmmu(vcpu); } +EXPORT_SYMBOL_GPL(kvm_init_mmu); + +static union kvm_mmu_page_role +kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu) +{ + if (tdp_enabled) + return kvm_calc_tdp_mmu_root_page_role(vcpu); + else + return kvm_calc_shadow_mmu_root_page_role(vcpu); +} void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) { kvm_mmu_unload(vcpu); - init_kvm_mmu(vcpu); + kvm_init_mmu(vcpu, true); } EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); @@ -4660,8 +4961,8 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) kvm_mmu_sync_roots(vcpu); if (r) goto out; - /* set_cr3() should ensure TLB has been flushed */ - vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa); + kvm_mmu_load_cr3(vcpu); + kvm_x86_ops->tlb_flush(vcpu, true); out: return r; } @@ -4669,7 +4970,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load); void kvm_mmu_unload(struct kvm_vcpu *vcpu) { - kvm_mmu_free_roots(vcpu); + kvm_mmu_free_roots(vcpu, KVM_MMU_ROOTS_ALL); WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); } EXPORT_SYMBOL_GPL(kvm_mmu_unload); @@ -4822,16 +5123,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, u64 entry, gentry, *spte; int npte; bool remote_flush, local_flush; - union kvm_mmu_page_role mask = { }; - - mask.cr0_wp = 1; - mask.cr4_pae = 1; - mask.nxe = 1; - mask.smep_andnot_wp = 1; - mask.smap_andnot_wp = 1; - mask.smm = 1; - mask.guest_mode = 1; - mask.ad_disabled = 1; /* * If we don't have indirect shadow pages, it means no page is @@ -4875,7 +5166,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, mmu_page_zap_pte(vcpu->kvm, sp, spte); if (gentry && !((sp->role.word ^ vcpu->arch.mmu.base_role.word) - & mask.word) && rmap_can_add(vcpu)) + & mmu_base_role_mask.word) && rmap_can_add(vcpu)) mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); if (need_remote_flush(entry, *spte)) remote_flush = true; @@ -5000,12 +5291,67 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { - vcpu->arch.mmu.invlpg(vcpu, gva); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + struct kvm_mmu *mmu = &vcpu->arch.mmu; + int i; + + /* INVLPG on a * non-canonical address is a NOP according to the SDM. */ + if (is_noncanonical_address(gva, vcpu)) + return; + + mmu->invlpg(vcpu, gva, mmu->root_hpa); + + /* + * INVLPG is required to invalidate any global mappings for the VA, + * irrespective of PCID. Since it would take us roughly similar amount + * of work to determine whether any of the prev_root mappings of the VA + * is marked global, or to just sync it blindly, so we might as well + * just always sync it. + * + * Mappings not reachable via the current cr3 or the prev_roots will be + * synced when switching to that cr3, so nothing needs to be done here + * for them. + */ + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if (VALID_PAGE(mmu->prev_roots[i].hpa)) + mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); + + kvm_x86_ops->tlb_flush_gva(vcpu, gva); ++vcpu->stat.invlpg; } EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); +void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) +{ + struct kvm_mmu *mmu = &vcpu->arch.mmu; + bool tlb_flush = false; + uint i; + + if (pcid == kvm_get_active_pcid(vcpu)) { + mmu->invlpg(vcpu, gva, mmu->root_hpa); + tlb_flush = true; + } + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { + if (VALID_PAGE(mmu->prev_roots[i].hpa) && + pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].cr3)) { + mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); + tlb_flush = true; + } + } + + if (tlb_flush) + kvm_x86_ops->tlb_flush_gva(vcpu, gva); + + ++vcpu->stat.invlpg; + + /* + * Mappings not reachable via the current cr3 or the prev_roots will be + * synced when switching to that cr3, so nothing needs to be done here + * for them. + */ +} +EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva); + void kvm_enable_tdp(void) { tdp_enabled = true; @@ -5029,6 +5375,9 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) struct page *page; int i; + if (tdp_enabled) + return 0; + /* * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. * Therefore we need to allocate shadow page tables in the first @@ -5047,11 +5396,16 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) int kvm_mmu_create(struct kvm_vcpu *vcpu) { + uint i; + vcpu->arch.walk_mmu = &vcpu->arch.mmu; vcpu->arch.mmu.root_hpa = INVALID_PAGE; vcpu->arch.mmu.translate_gpa = translate_gpa; vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; + return alloc_mmu_pages(vcpu); } @@ -5059,7 +5413,7 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu) { MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); - init_kvm_mmu(vcpu); + kvm_init_mmu(vcpu, true); } static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, @@ -5499,7 +5853,7 @@ int kvm_mmu_module_init(void) { int ret = -ENOMEM; - kvm_mmu_clear_all_pte_masks(); + kvm_mmu_reset_all_pte_masks(); pte_list_desc_cache = kmem_cache_create("pte_list_desc", sizeof(struct pte_list_desc), diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 5b408c0ad612..1fab69c0b2f3 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -61,9 +61,10 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value); void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); +void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots); void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, - bool accessed_dirty); + bool accessed_dirty, gpa_t new_eptp); bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, u64 fault_address, char *insn, int insn_len); @@ -85,6 +86,27 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) return kvm_mmu_load(vcpu); } +static inline unsigned long kvm_get_pcid(struct kvm_vcpu *vcpu, gpa_t cr3) +{ + BUILD_BUG_ON((X86_CR3_PCID_MASK & PAGE_MASK) != 0); + + return kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE) + ? cr3 & X86_CR3_PCID_MASK + : 0; +} + +static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu) +{ + return kvm_get_pcid(vcpu, kvm_read_cr3(vcpu)); +} + +static inline void kvm_mmu_load_cr3(struct kvm_vcpu *vcpu) +{ + if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) + vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa | + kvm_get_active_pcid(vcpu)); +} + /* * Currently, we have two sorts of write-protection, a) the first one * write-protects guest page to sync the guest modification, b) another one is diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 6288e9d7068e..14ffd973df54 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -181,7 +181,7 @@ no_present: * set bit 0 if execute only is supported. Here, we repurpose ACC_USER_MASK * to signify readability since it isn't used in the EPT case */ -static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte) +static inline unsigned FNAME(gpte_access)(u64 gpte) { unsigned access; #if PTTYPE == PTTYPE_EPT @@ -394,8 +394,8 @@ retry_walk: accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0; /* Convert to ACC_*_MASK flags for struct guest_walker. */ - walker->pt_access = FNAME(gpte_access)(vcpu, pt_access ^ walk_nx_mask); - walker->pte_access = FNAME(gpte_access)(vcpu, pte_access ^ walk_nx_mask); + walker->pt_access = FNAME(gpte_access)(pt_access ^ walk_nx_mask); + walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask); errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access); if (unlikely(errcode)) goto error; @@ -508,7 +508,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); gfn = gpte_to_gfn(gpte); - pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + pte_access = sp->role.access & FNAME(gpte_access)(gpte); FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte); pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, no_dirty_log && (pte_access & ACC_WRITE_MASK)); @@ -856,7 +856,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t); } -static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) +static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; @@ -871,13 +871,13 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) */ mmu_topup_memory_caches(vcpu); - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) { + if (!VALID_PAGE(root_hpa)) { WARN_ON(1); return; } spin_lock(&vcpu->kvm->mmu_lock); - for_each_shadow_entry(vcpu, gva, iterator) { + for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) { level = iterator.level; sptep = iterator.sptep; @@ -968,6 +968,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) int i, nr_present = 0; bool host_writable; gpa_t first_pte_gpa; + int set_spte_ret = 0; /* direct kvm_mmu_page can not be unsync. */ BUG_ON(sp->role.direct); @@ -1002,7 +1003,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) gfn = gpte_to_gfn(gpte); pte_access = sp->role.access; - pte_access &= FNAME(gpte_access)(vcpu, gpte); + pte_access &= FNAME(gpte_access)(gpte); FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte); if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access, @@ -1024,12 +1025,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; - set_spte(vcpu, &sp->spt[i], pte_access, - PT_PAGE_TABLE_LEVEL, gfn, - spte_to_pfn(sp->spt[i]), true, false, - host_writable); + set_spte_ret |= set_spte(vcpu, &sp->spt[i], + pte_access, PT_PAGE_TABLE_LEVEL, + gfn, spte_to_pfn(sp->spt[i]), + true, false, host_writable); } + if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH) + kvm_flush_remote_tlbs(vcpu->kvm); + return nr_present; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f059a73f0fd0..6276140044d0 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2884,7 +2884,6 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu, svm->vmcb->control.nested_cr3 = __sme_set(root); mark_dirty(svm->vmcb, VMCB_NPT); - svm_flush_tlb(vcpu, true); } static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, @@ -5435,6 +5434,13 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) svm->asid_generation--; } +static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + invlpga(gva, svm->vmcb->control.asid); +} + static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) { } @@ -5580,8 +5586,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) clgi(); - local_irq_enable(); - /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if * it's non-zero. Since vmentry is serialising on affected CPUs, there @@ -5590,6 +5594,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) */ x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl); + local_irq_enable(); + asm volatile ( "push %%" _ASM_BP "; \n\t" "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t" @@ -5712,12 +5718,12 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); - x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl); - reload_tss(vcpu); local_irq_disable(); + x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl); + vcpu->arch.cr2 = svm->vmcb->save.cr2; vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; @@ -5766,7 +5772,6 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) svm->vmcb->save.cr3 = __sme_set(root); mark_dirty(svm->vmcb, VMCB_CR); - svm_flush_tlb(vcpu, true); } static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) @@ -5779,8 +5784,6 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) /* Also sync guest cr3 here in case we live migrate */ svm->vmcb->save.cr3 = kvm_read_cr3(vcpu); mark_dirty(svm->vmcb, VMCB_CR); - - svm_flush_tlb(vcpu, true); } static int is_disabled(void) @@ -7090,6 +7093,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .set_rflags = svm_set_rflags, .tlb_flush = svm_flush_tlb, + .tlb_flush_gva = svm_flush_tlb_gva, .run = svm_vcpu_run, .handle_exit = handle_exit, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5d8e317c2b04..1d26f3c4985b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -38,6 +38,7 @@ #include "kvm_cache_regs.h" #include "x86.h" +#include <asm/asm.h> #include <asm/cpu.h> #include <asm/io.h> #include <asm/desc.h> @@ -188,23 +189,204 @@ module_param(ple_window_max, uint, 0444); extern const ulong vmx_return; +static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); +static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); +static DEFINE_MUTEX(vmx_l1d_flush_mutex); + +/* Storage for pre module init parameter parsing */ +static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO; + +static const struct { + const char *option; + bool for_parse; +} vmentry_l1d_param[] = { + [VMENTER_L1D_FLUSH_AUTO] = {"auto", true}, + [VMENTER_L1D_FLUSH_NEVER] = {"never", true}, + [VMENTER_L1D_FLUSH_COND] = {"cond", true}, + [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true}, + [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false}, + [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false}, +}; + +#define L1D_CACHE_ORDER 4 +static void *vmx_l1d_flush_pages; + +static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) +{ + struct page *page; + unsigned int i; + + if (!enable_ept) { + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED; + return 0; + } + + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) { + u64 msr; + + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr); + if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; + return 0; + } + } + + /* If set to auto use the default l1tf mitigation method */ + if (l1tf == VMENTER_L1D_FLUSH_AUTO) { + switch (l1tf_mitigation) { + case L1TF_MITIGATION_OFF: + l1tf = VMENTER_L1D_FLUSH_NEVER; + break; + case L1TF_MITIGATION_FLUSH_NOWARN: + case L1TF_MITIGATION_FLUSH: + case L1TF_MITIGATION_FLUSH_NOSMT: + l1tf = VMENTER_L1D_FLUSH_COND; + break; + case L1TF_MITIGATION_FULL: + case L1TF_MITIGATION_FULL_FORCE: + l1tf = VMENTER_L1D_FLUSH_ALWAYS; + break; + } + } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) { + l1tf = VMENTER_L1D_FLUSH_ALWAYS; + } + + if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages && + !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) { + page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER); + if (!page) + return -ENOMEM; + vmx_l1d_flush_pages = page_address(page); + + /* + * Initialize each page with a different pattern in + * order to protect against KSM in the nested + * virtualization case. + */ + for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) { + memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1, + PAGE_SIZE); + } + } + + l1tf_vmx_mitigation = l1tf; + + if (l1tf != VMENTER_L1D_FLUSH_NEVER) + static_branch_enable(&vmx_l1d_should_flush); + else + static_branch_disable(&vmx_l1d_should_flush); + + if (l1tf == VMENTER_L1D_FLUSH_COND) + static_branch_enable(&vmx_l1d_flush_cond); + else + static_branch_disable(&vmx_l1d_flush_cond); + return 0; +} + +static int vmentry_l1d_flush_parse(const char *s) +{ + unsigned int i; + + if (s) { + for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) { + if (vmentry_l1d_param[i].for_parse && + sysfs_streq(s, vmentry_l1d_param[i].option)) + return i; + } + } + return -EINVAL; +} + +static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) +{ + int l1tf, ret; + + l1tf = vmentry_l1d_flush_parse(s); + if (l1tf < 0) + return l1tf; + + if (!boot_cpu_has(X86_BUG_L1TF)) + return 0; + + /* + * Has vmx_init() run already? If not then this is the pre init + * parameter parsing. In that case just store the value and let + * vmx_init() do the proper setup after enable_ept has been + * established. + */ + if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) { + vmentry_l1d_flush_param = l1tf; + return 0; + } + + mutex_lock(&vmx_l1d_flush_mutex); + ret = vmx_setup_l1d_flush(l1tf); + mutex_unlock(&vmx_l1d_flush_mutex); + return ret; +} + +static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) +{ + if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param))) + return sprintf(s, "???\n"); + + return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); +} + +static const struct kernel_param_ops vmentry_l1d_flush_ops = { + .set = vmentry_l1d_flush_set, + .get = vmentry_l1d_flush_get, +}; +module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); + +enum ept_pointers_status { + EPT_POINTERS_CHECK = 0, + EPT_POINTERS_MATCH = 1, + EPT_POINTERS_MISMATCH = 2 +}; + struct kvm_vmx { struct kvm kvm; unsigned int tss_addr; bool ept_identity_pagetable_done; gpa_t ept_identity_map_addr; + + enum ept_pointers_status ept_pointers_match; + spinlock_t ept_pointer_lock; }; #define NR_AUTOLOAD_MSRS 8 +struct vmcs_hdr { + u32 revision_id:31; + u32 shadow_vmcs:1; +}; + struct vmcs { - u32 revision_id; + struct vmcs_hdr hdr; u32 abort; char data[0]; }; /* + * vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT + * and whose values change infrequently, but are not constant. I.e. this is + * used as a write-through cache of the corresponding VMCS fields. + */ +struct vmcs_host_state { + unsigned long cr3; /* May not match real cr3 */ + unsigned long cr4; /* May not match real cr4 */ + unsigned long gs_base; + unsigned long fs_base; + + u16 fs_sel, gs_sel, ldt_sel; +#ifdef CONFIG_X86_64 + u16 ds_sel, es_sel; +#endif +}; + +/* * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs * loaded on this CPU (so we can clear them if the CPU goes down). @@ -215,14 +397,13 @@ struct loaded_vmcs { int cpu; bool launched; bool nmi_known_unmasked; - unsigned long vmcs_host_cr3; /* May not match real cr3 */ - unsigned long vmcs_host_cr4; /* May not match real cr4 */ /* Support for vnmi-less CPUs */ int soft_vnmi_blocked; ktime_t entry_time; s64 vnmi_blocked_time; unsigned long *msr_bitmap; struct list_head loaded_vmcss_on_cpu_link; + struct vmcs_host_state host_state; }; struct shared_msr_entry { @@ -253,7 +434,7 @@ struct __packed vmcs12 { /* According to the Intel spec, a VMCS region must start with the * following two fields. Then follow implementation-specific data. */ - u32 revision_id; + struct vmcs_hdr hdr; u32 abort; u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ @@ -421,7 +602,7 @@ struct __packed vmcs12 { "Offset of " #field " in struct vmcs12 has changed.") static inline void vmx_check_vmcs12_offsets(void) { - CHECK_OFFSET(revision_id, 0); + CHECK_OFFSET(hdr, 0); CHECK_OFFSET(abort, 4); CHECK_OFFSET(launch_state, 8); CHECK_OFFSET(io_bitmap_a, 40); @@ -640,6 +821,12 @@ struct nested_vmx { */ struct vmcs12 *cached_vmcs12; /* + * Cache of the guest's shadow VMCS, existing outside of guest + * memory. Loaded from guest memory during VM entry. Flushed + * to guest memory during VM exit. + */ + struct vmcs12 *cached_shadow_vmcs12; + /* * Indicates if the shadow vmcs must be updated with the * data hold by vmcs12 */ @@ -757,6 +944,11 @@ static inline int pi_test_sn(struct pi_desc *pi_desc) (unsigned long *)&pi_desc->control); } +struct vmx_msrs { + unsigned int nr; + struct vmx_msr_entry val[NR_AUTOLOAD_MSRS]; +}; + struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; @@ -784,26 +976,20 @@ struct vcpu_vmx { /* * loaded_vmcs points to the VMCS currently used in this vcpu. For a * non-nested (L1) guest, it always points to vmcs01. For a nested - * guest (L2), it points to a different VMCS. + * guest (L2), it points to a different VMCS. loaded_cpu_state points + * to the VMCS whose state is loaded into the CPU registers that only + * need to be switched when transitioning to/from the kernel; a NULL + * value indicates that host state is loaded. */ struct loaded_vmcs vmcs01; struct loaded_vmcs *loaded_vmcs; + struct loaded_vmcs *loaded_cpu_state; bool __launched; /* temporary, used in vmx_vcpu_run */ struct msr_autoload { - unsigned nr; - struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS]; - struct vmx_msr_entry host[NR_AUTOLOAD_MSRS]; + struct vmx_msrs guest; + struct vmx_msrs host; } msr_autoload; - struct { - int loaded; - u16 fs_sel, gs_sel, ldt_sel; -#ifdef CONFIG_X86_64 - u16 ds_sel, es_sel; -#endif - int gs_ldt_reload_needed; - int fs_reload_needed; - u64 msr_host_bndcfgs; - } host_state; + struct { int vm86_active; ulong save_rflags; @@ -853,6 +1039,7 @@ struct vcpu_vmx { */ u64 msr_ia32_feature_control; u64 msr_ia32_feature_control_valid_bits; + u64 ept_pointer; }; enum segment_cache_field { @@ -1072,6 +1259,11 @@ static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) return to_vmx(vcpu)->nested.cached_vmcs12; } +static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.cached_shadow_vmcs12; +} + static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu); static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); @@ -1342,6 +1534,48 @@ static void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) * GUEST_IA32_RTIT_CTL = 0x00002814, */ } + +/* check_ept_pointer() should be under protection of ept_pointer_lock. */ +static void check_ept_pointer_match(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + u64 tmp_eptp = INVALID_PAGE; + int i; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!VALID_PAGE(tmp_eptp)) { + tmp_eptp = to_vmx(vcpu)->ept_pointer; + } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) { + to_kvm_vmx(kvm)->ept_pointers_match + = EPT_POINTERS_MISMATCH; + return; + } + } + + to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH; +} + +static int vmx_hv_remote_flush_tlb(struct kvm *kvm) +{ + int ret; + + spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); + + if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK) + check_ept_pointer_match(kvm); + + if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) { + ret = -ENOTSUPP; + goto out; + } + + ret = hyperv_flush_guest_mapping( + to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer); + +out: + spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); + return ret; +} #else /* !IS_ENABLED(CONFIG_HYPERV) */ static inline void evmcs_write64(unsigned long field, u64 value) {} static inline void evmcs_write32(unsigned long field, u32 value) {} @@ -1456,6 +1690,12 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; } +static inline bool cpu_has_vmx_encls_vmexit(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENCLS_EXITING; +} + /* * Comment's format: document - errata name - stepping - processor name. * Refer from @@ -1716,6 +1956,12 @@ static inline bool nested_cpu_supports_monitor_trap_flag(struct kvm_vcpu *vcpu) CPU_BASED_MONITOR_TRAP_FLAG; } +static inline bool nested_cpu_has_vmx_shadow_vmcs(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.msrs.secondary_ctls_high & + SECONDARY_EXEC_SHADOW_VMCS; +} + static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit) { return vmcs12->cpu_based_vm_exec_control & bit; @@ -1796,6 +2042,11 @@ static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12) VMX_VMFUNC_EPTP_SWITCHING); } +static inline bool nested_cpu_has_shadow_vmcs(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS); +} + static inline bool is_nmi(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -1826,11 +2077,12 @@ static inline void __invvpid(int ext, u16 vpid, gva_t gva) u64 rsvd : 48; u64 gva; } operand = { vpid, 0, gva }; + bool error; - asm volatile (__ex(ASM_VMX_INVVPID) - /* CF==1 or ZF==1 --> rc = -1 */ - "; ja 1f ; ud2 ; 1:" - : : "a"(&operand), "c"(ext) : "cc", "memory"); + asm volatile (__ex(ASM_VMX_INVVPID) CC_SET(na) + : CC_OUT(na) (error) : "a"(&operand), "c"(ext) + : "memory"); + BUG_ON(error); } static inline void __invept(int ext, u64 eptp, gpa_t gpa) @@ -1838,11 +2090,12 @@ static inline void __invept(int ext, u64 eptp, gpa_t gpa) struct { u64 eptp, gpa; } operand = {eptp, gpa}; + bool error; - asm volatile (__ex(ASM_VMX_INVEPT) - /* CF==1 or ZF==1 --> rc = -1 */ - "; ja 1f ; ud2 ; 1:\n" - : : "a" (&operand), "c" (ext) : "cc", "memory"); + asm volatile (__ex(ASM_VMX_INVEPT) CC_SET(na) + : CC_OUT(na) (error) : "a" (&operand), "c" (ext) + : "memory"); + BUG_ON(error); } static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) @@ -1858,12 +2111,12 @@ static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) static void vmcs_clear(struct vmcs *vmcs) { u64 phys_addr = __pa(vmcs); - u8 error; + bool error; - asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0" - : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) - : "cc", "memory"); - if (error) + asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) CC_SET(na) + : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr) + : "memory"); + if (unlikely(error)) printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", vmcs, phys_addr); } @@ -1880,15 +2133,15 @@ static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) static void vmcs_load(struct vmcs *vmcs) { u64 phys_addr = __pa(vmcs); - u8 error; + bool error; if (static_branch_unlikely(&enable_evmcs)) return evmcs_load(phys_addr); - asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" - : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) - : "cc", "memory"); - if (error) + asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) CC_SET(na) + : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr) + : "memory"); + if (unlikely(error)) printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n", vmcs, phys_addr); } @@ -1966,6 +2219,19 @@ static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) __loaded_vmcs_clear, loaded_vmcs, 1); } +static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr) +{ + if (vpid == 0) + return true; + + if (cpu_has_vmx_invvpid_individual_addr()) { + __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr); + return true; + } + + return false; +} + static inline void vpid_sync_vcpu_single(int vpid) { if (vpid == 0) @@ -2100,10 +2366,10 @@ static noinline void vmwrite_error(unsigned long field, unsigned long value) static __always_inline void __vmcs_writel(unsigned long field, unsigned long value) { - u8 error; + bool error; - asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0" - : "=q"(error) : "a"(value), "d"(field) : "cc"); + asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) CC_SET(na) + : CC_OUT(na) (error) : "a"(value), "d"(field)); if (unlikely(error)) vmwrite_error(field, value); } @@ -2377,9 +2643,20 @@ static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, vm_exit_controls_clearbit(vmx, exit); } +static int find_msr(struct vmx_msrs *m, unsigned int msr) +{ + unsigned int i; + + for (i = 0; i < m->nr; ++i) { + if (m->val[i].index == msr) + return i; + } + return -ENOENT; +} + static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) { - unsigned i; + int i; struct msr_autoload *m = &vmx->msr_autoload; switch (msr) { @@ -2400,18 +2677,21 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) } break; } + i = find_msr(&m->guest, msr); + if (i < 0) + goto skip_guest; + --m->guest.nr; + m->guest.val[i] = m->guest.val[m->guest.nr]; + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); - for (i = 0; i < m->nr; ++i) - if (m->guest[i].index == msr) - break; - - if (i == m->nr) +skip_guest: + i = find_msr(&m->host, msr); + if (i < 0) return; - --m->nr; - m->guest[i] = m->guest[m->nr]; - m->host[i] = m->host[m->nr]; - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr); - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr); + + --m->host.nr; + m->host.val[i] = m->host.val[m->host.nr]; + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); } static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, @@ -2426,9 +2706,9 @@ static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, } static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, - u64 guest_val, u64 host_val) + u64 guest_val, u64 host_val, bool entry_only) { - unsigned i; + int i, j = 0; struct msr_autoload *m = &vmx->msr_autoload; switch (msr) { @@ -2463,24 +2743,31 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, wrmsrl(MSR_IA32_PEBS_ENABLE, 0); } - for (i = 0; i < m->nr; ++i) - if (m->guest[i].index == msr) - break; + i = find_msr(&m->guest, msr); + if (!entry_only) + j = find_msr(&m->host, msr); - if (i == NR_AUTOLOAD_MSRS) { + if (i == NR_AUTOLOAD_MSRS || j == NR_AUTOLOAD_MSRS) { printk_once(KERN_WARNING "Not enough msr switch entries. " "Can't add msr %x\n", msr); return; - } else if (i == m->nr) { - ++m->nr; - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr); - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr); } + if (i < 0) { + i = m->guest.nr++; + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); + } + m->guest.val[i].index = msr; + m->guest.val[i].value = guest_val; - m->guest[i].index = msr; - m->guest[i].value = guest_val; - m->host[i].index = msr; - m->host[i].value = host_val; + if (entry_only) + return; + + if (j < 0) { + j = m->host.nr++; + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); + } + m->host.val[j].index = msr; + m->host.val[j].value = host_val; } static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) @@ -2524,7 +2811,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) guest_efer &= ~EFER_LME; if (guest_efer != host_efer) add_atomic_switch_msr(vmx, MSR_EFER, - guest_efer, host_efer); + guest_efer, host_efer, false); return false; } else { guest_efer &= ~ignore_bits; @@ -2566,121 +2853,150 @@ static unsigned long segment_base(u16 selector) } #endif -static void vmx_save_host_state(struct kvm_vcpu *vcpu) +static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs_host_state *host_state; #ifdef CONFIG_X86_64 int cpu = raw_smp_processor_id(); - unsigned long fs_base, kernel_gs_base; #endif + unsigned long fs_base, gs_base; + u16 fs_sel, gs_sel; int i; - if (vmx->host_state.loaded) + if (vmx->loaded_cpu_state) return; - vmx->host_state.loaded = 1; + vmx->loaded_cpu_state = vmx->loaded_vmcs; + host_state = &vmx->loaded_cpu_state->host_state; + /* * Set host fs and gs selectors. Unfortunately, 22.2.3 does not * allow segment selectors with cpl > 0 or ti == 1. */ - vmx->host_state.ldt_sel = kvm_read_ldt(); - vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; + host_state->ldt_sel = kvm_read_ldt(); #ifdef CONFIG_X86_64 + savesegment(ds, host_state->ds_sel); + savesegment(es, host_state->es_sel); + + gs_base = cpu_kernelmode_gs_base(cpu); if (likely(is_64bit_mm(current->mm))) { save_fsgs_for_kvm(); - vmx->host_state.fs_sel = current->thread.fsindex; - vmx->host_state.gs_sel = current->thread.gsindex; + fs_sel = current->thread.fsindex; + gs_sel = current->thread.gsindex; fs_base = current->thread.fsbase; - kernel_gs_base = current->thread.gsbase; + vmx->msr_host_kernel_gs_base = current->thread.gsbase; } else { -#endif - savesegment(fs, vmx->host_state.fs_sel); - savesegment(gs, vmx->host_state.gs_sel); -#ifdef CONFIG_X86_64 + savesegment(fs, fs_sel); + savesegment(gs, gs_sel); fs_base = read_msr(MSR_FS_BASE); - kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); - } -#endif - if (!(vmx->host_state.fs_sel & 7)) { - vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); - vmx->host_state.fs_reload_needed = 0; - } else { - vmcs_write16(HOST_FS_SELECTOR, 0); - vmx->host_state.fs_reload_needed = 1; - } - if (!(vmx->host_state.gs_sel & 7)) - vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); - else { - vmcs_write16(HOST_GS_SELECTOR, 0); - vmx->host_state.gs_ldt_reload_needed = 1; + vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); } -#ifdef CONFIG_X86_64 - savesegment(ds, vmx->host_state.ds_sel); - savesegment(es, vmx->host_state.es_sel); - - vmcs_writel(HOST_FS_BASE, fs_base); - vmcs_writel(HOST_GS_BASE, cpu_kernelmode_gs_base(cpu)); - - vmx->msr_host_kernel_gs_base = kernel_gs_base; if (is_long_mode(&vmx->vcpu)) wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #else - vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel)); - vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel)); + savesegment(fs, fs_sel); + savesegment(gs, gs_sel); + fs_base = segment_base(fs_sel); + gs_base = segment_base(gs_sel); #endif - if (boot_cpu_has(X86_FEATURE_MPX)) - rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); + + if (unlikely(fs_sel != host_state->fs_sel)) { + if (!(fs_sel & 7)) + vmcs_write16(HOST_FS_SELECTOR, fs_sel); + else + vmcs_write16(HOST_FS_SELECTOR, 0); + host_state->fs_sel = fs_sel; + } + if (unlikely(gs_sel != host_state->gs_sel)) { + if (!(gs_sel & 7)) + vmcs_write16(HOST_GS_SELECTOR, gs_sel); + else + vmcs_write16(HOST_GS_SELECTOR, 0); + host_state->gs_sel = gs_sel; + } + if (unlikely(fs_base != host_state->fs_base)) { + vmcs_writel(HOST_FS_BASE, fs_base); + host_state->fs_base = fs_base; + } + if (unlikely(gs_base != host_state->gs_base)) { + vmcs_writel(HOST_GS_BASE, gs_base); + host_state->gs_base = gs_base; + } + for (i = 0; i < vmx->save_nmsrs; ++i) kvm_set_shared_msr(vmx->guest_msrs[i].index, vmx->guest_msrs[i].data, vmx->guest_msrs[i].mask); } -static void __vmx_load_host_state(struct vcpu_vmx *vmx) +static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) { - if (!vmx->host_state.loaded) + struct vmcs_host_state *host_state; + + if (!vmx->loaded_cpu_state) return; + WARN_ON_ONCE(vmx->loaded_cpu_state != vmx->loaded_vmcs); + host_state = &vmx->loaded_cpu_state->host_state; + ++vmx->vcpu.stat.host_state_reload; - vmx->host_state.loaded = 0; + vmx->loaded_cpu_state = NULL; + #ifdef CONFIG_X86_64 if (is_long_mode(&vmx->vcpu)) rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #endif - if (vmx->host_state.gs_ldt_reload_needed) { - kvm_load_ldt(vmx->host_state.ldt_sel); + if (host_state->ldt_sel || (host_state->gs_sel & 7)) { + kvm_load_ldt(host_state->ldt_sel); #ifdef CONFIG_X86_64 - load_gs_index(vmx->host_state.gs_sel); + load_gs_index(host_state->gs_sel); #else - loadsegment(gs, vmx->host_state.gs_sel); + loadsegment(gs, host_state->gs_sel); #endif } - if (vmx->host_state.fs_reload_needed) - loadsegment(fs, vmx->host_state.fs_sel); + if (host_state->fs_sel & 7) + loadsegment(fs, host_state->fs_sel); #ifdef CONFIG_X86_64 - if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) { - loadsegment(ds, vmx->host_state.ds_sel); - loadsegment(es, vmx->host_state.es_sel); + if (unlikely(host_state->ds_sel | host_state->es_sel)) { + loadsegment(ds, host_state->ds_sel); + loadsegment(es, host_state->es_sel); } #endif invalidate_tss_limit(); #ifdef CONFIG_X86_64 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); #endif - if (vmx->host_state.msr_host_bndcfgs) - wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); load_fixmap_gdt(raw_smp_processor_id()); } -static void vmx_load_host_state(struct vcpu_vmx *vmx) +#ifdef CONFIG_X86_64 +static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) { - preempt_disable(); - __vmx_load_host_state(vmx); - preempt_enable(); + if (is_long_mode(&vmx->vcpu)) { + preempt_disable(); + if (vmx->loaded_cpu_state) + rdmsrl(MSR_KERNEL_GS_BASE, + vmx->msr_guest_kernel_gs_base); + preempt_enable(); + } + return vmx->msr_guest_kernel_gs_base; } +static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) +{ + if (is_long_mode(&vmx->vcpu)) { + preempt_disable(); + if (vmx->loaded_cpu_state) + wrmsrl(MSR_KERNEL_GS_BASE, data); + preempt_enable(); + } + vmx->msr_guest_kernel_gs_base = data; +} +#endif + static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); @@ -2822,7 +3138,7 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { vmx_vcpu_pi_put(vcpu); - __vmx_load_host_state(to_vmx(vcpu)); + vmx_prepare_switch_to_host(to_vmx(vcpu)); } static bool emulation_required(struct kvm_vcpu *vcpu) @@ -3043,7 +3359,7 @@ static bool vmx_rdtscp_supported(void) static bool vmx_invpcid_supported(void) { - return cpu_has_vmx_invpcid() && enable_ept; + return cpu_has_vmx_invpcid(); } /* @@ -3286,6 +3602,12 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv) SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_WBINVD_EXITING; + /* + * We can emulate "VMCS shadowing," even if the hardware + * doesn't support it. + */ + msrs->secondary_ctls_high |= + SECONDARY_EXEC_SHADOW_VMCS; if (enable_ept) { /* nested EPT: emulate EPT also to L1 */ @@ -3753,8 +4075,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vmcs_readl(GUEST_GS_BASE); break; case MSR_KERNEL_GS_BASE: - vmx_load_host_state(vmx); - msr_info->data = vmx->msr_guest_kernel_gs_base; + msr_info->data = vmx_read_guest_kernel_gs_base(vmx); break; #endif case MSR_EFER: @@ -3854,8 +4175,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmcs_writel(GUEST_GS_BASE, data); break; case MSR_KERNEL_GS_BASE: - vmx_load_host_state(vmx); - vmx->msr_guest_kernel_gs_base = data; + vmx_write_guest_kernel_gs_base(vmx, data); break; #endif case MSR_IA32_SYSENTER_CS: @@ -3987,7 +4307,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.ia32_xss = data; if (vcpu->arch.ia32_xss != host_xss) add_atomic_switch_msr(vmx, MSR_IA32_XSS, - vcpu->arch.ia32_xss, host_xss); + vcpu->arch.ia32_xss, host_xss, false); else clear_atomic_switch_msr(vmx, MSR_IA32_XSS); break; @@ -4243,7 +4563,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_RDRAND_EXITING | SECONDARY_EXEC_ENABLE_PML | SECONDARY_EXEC_TSC_SCALING | - SECONDARY_EXEC_ENABLE_VMFUNC; + SECONDARY_EXEC_ENABLE_VMFUNC | + SECONDARY_EXEC_ENCLS_EXITING; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -4390,7 +4711,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) return 0; } -static struct vmcs *alloc_vmcs_cpu(int cpu) +static struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu) { int node = cpu_to_node(cpu); struct page *pages; @@ -4404,10 +4725,12 @@ static struct vmcs *alloc_vmcs_cpu(int cpu) /* KVM supports Enlightened VMCS v1 only */ if (static_branch_unlikely(&enable_evmcs)) - vmcs->revision_id = KVM_EVMCS_VERSION; + vmcs->hdr.revision_id = KVM_EVMCS_VERSION; else - vmcs->revision_id = vmcs_config.revision_id; + vmcs->hdr.revision_id = vmcs_config.revision_id; + if (shadow) + vmcs->hdr.shadow_vmcs = 1; return vmcs; } @@ -4431,14 +4754,14 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) WARN_ON(loaded_vmcs->shadow_vmcs != NULL); } -static struct vmcs *alloc_vmcs(void) +static struct vmcs *alloc_vmcs(bool shadow) { - return alloc_vmcs_cpu(raw_smp_processor_id()); + return alloc_vmcs_cpu(shadow, raw_smp_processor_id()); } static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) { - loaded_vmcs->vmcs = alloc_vmcs(); + loaded_vmcs->vmcs = alloc_vmcs(false); if (!loaded_vmcs->vmcs) return -ENOMEM; @@ -4460,6 +4783,9 @@ static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) evmcs->hv_enlightenments_control.msr_bitmap = 1; } } + + memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state)); + return 0; out_vmcs: @@ -4569,7 +4895,7 @@ static __init int alloc_kvm_area(void) for_each_possible_cpu(cpu) { struct vmcs *vmcs; - vmcs = alloc_vmcs_cpu(cpu); + vmcs = alloc_vmcs_cpu(false, cpu); if (!vmcs) { free_kvm_area(); return -ENOMEM; @@ -4586,7 +4912,7 @@ static __init int alloc_kvm_area(void) * physical CPU. */ if (static_branch_unlikely(&enable_evmcs)) - vmcs->revision_id = vmcs_config.revision_id; + vmcs->hdr.revision_id = vmcs_config.revision_id; per_cpu(vmxarea, cpu) = vmcs; } @@ -4743,10 +5069,18 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) return; /* - * Force kernel_gs_base reloading before EFER changes, as control - * of this msr depends on is_long_mode(). + * MSR_KERNEL_GS_BASE is not intercepted when the guest is in + * 64-bit mode as a 64-bit kernel may frequently access the + * MSR. This means we need to manually save/restore the MSR + * when switching between guest and host state, but only if + * the guest is in 64-bit mode. Sync our cached value if the + * guest is transitioning to 32-bit mode and the CPU contains + * guest state, i.e. the cache is stale. */ - vmx_load_host_state(to_vmx(vcpu)); +#ifdef CONFIG_X86_64 + if (!(efer & EFER_LMA)) + (void)vmx_read_guest_kernel_gs_base(vmx); +#endif vcpu->arch.efer = efer; if (efer & EFER_LMA) { vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); @@ -4803,6 +5137,20 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa); } +static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) +{ + int vpid = to_vmx(vcpu)->vpid; + + if (!vpid_sync_vcpu_addr(vpid, addr)) + vpid_sync_context(vpid); + + /* + * If VPIDs are not supported or enabled, then the above is a no-op. + * But we don't really need a TLB flush in that case anyway, because + * each VM entry/exit includes an implicit flush when VPID is 0. + */ +} + static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) { ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; @@ -4984,6 +5332,7 @@ static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { + struct kvm *kvm = vcpu->kvm; unsigned long guest_cr3; u64 eptp; @@ -4991,15 +5340,23 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (enable_ept) { eptp = construct_eptp(vcpu, cr3); vmcs_write64(EPT_POINTER, eptp); + + if (kvm_x86_ops->tlb_remote_flush) { + spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); + to_vmx(vcpu)->ept_pointer = eptp; + to_kvm_vmx(kvm)->ept_pointers_match + = EPT_POINTERS_CHECK; + spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); + } + if (enable_unrestricted_guest || is_paging(vcpu) || is_guest_mode(vcpu)) guest_cr3 = kvm_read_cr3(vcpu); else - guest_cr3 = to_kvm_vmx(vcpu->kvm)->ept_identity_map_addr; + guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; ept_load_pdptrs(vcpu); } - vmx_flush_tlb(vcpu, true); vmcs_writel(GUEST_CR3, guest_cr3); } @@ -5935,19 +6292,19 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) */ cr3 = __read_cr3(); vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ - vmx->loaded_vmcs->vmcs_host_cr3 = cr3; + vmx->loaded_vmcs->host_state.cr3 = cr3; /* Save the most likely value for this task's CR4 in the VMCS. */ cr4 = cr4_read_shadow(); vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ - vmx->loaded_vmcs->vmcs_host_cr4 = cr4; + vmx->loaded_vmcs->host_state.cr4 = cr4; vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ #ifdef CONFIG_X86_64 /* * Load null selectors, so we can avoid reloading them in - * __vmx_load_host_state(), in case userspace uses the null selectors - * too (the expected case). + * vmx_prepare_switch_to_host(), in case userspace uses + * the null selectors too (the expected case). */ vmcs_write16(HOST_DS_SELECTOR, 0); vmcs_write16(HOST_ES_SELECTOR, 0); @@ -6072,8 +6429,6 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) if (!enable_ept) { exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; enable_unrestricted_guest = 0; - /* Enable INVPCID for non-ept guests may cause performance regression. */ - exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; } if (!enable_unrestricted_guest) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; @@ -6202,9 +6557,6 @@ static void ept_set_mmio_spte_mask(void) */ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) { -#ifdef CONFIG_X86_64 - unsigned long a; -#endif int i; if (enable_shadow_vmcs) { @@ -6259,24 +6611,17 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ vmx_set_constant_host_state(vmx); -#ifdef CONFIG_X86_64 - rdmsrl(MSR_FS_BASE, a); - vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ - rdmsrl(MSR_GS_BASE, a); - vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */ -#else vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ -#endif if (cpu_has_vmx_vmfunc()) vmcs_write64(VM_FUNCTION_CONTROL, 0); vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); - vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); + vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); - vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); + vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); @@ -6296,8 +6641,7 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) ++vmx->nmsrs; } - if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, vmx->arch_capabilities); + vmx->arch_capabilities = kvm_get_arch_capabilities(); vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl); @@ -6317,6 +6661,9 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); } + + if (cpu_has_vmx_encls_vmexit()) + vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); } static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -7502,6 +7849,7 @@ static void vmx_enable_tdp(void) static __init int hardware_setup(void) { + unsigned long host_bndcfgs; int r = -ENOMEM, i; rdmsrl_safe(MSR_EFER, &host_efer); @@ -7526,6 +7874,11 @@ static __init int hardware_setup(void) if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); + if (boot_cpu_has(X86_FEATURE_MPX)) { + rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs); + WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost"); + } + if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) enable_vpid = 0; @@ -7562,6 +7915,12 @@ static __init int hardware_setup(void) if (enable_ept && !cpu_has_vmx_ept_2m_page()) kvm_disable_largepages(); +#if IS_ENABLED(CONFIG_HYPERV) + if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH + && enable_ept) + kvm_x86_ops->tlb_remote_flush = vmx_hv_remote_flush_tlb; +#endif + if (!cpu_has_vmx_ple()) { ple_gap = 0; ple_window = 0; @@ -7588,6 +7947,11 @@ static __init int hardware_setup(void) else kvm_disable_tdp(); + if (!nested) { + kvm_x86_ops->get_nested_state = NULL; + kvm_x86_ops->set_nested_state = NULL; + } + /* * Only enable PML when hardware supports PML feature, and both EPT * and EPT A/D bit features are enabled -- PML depends on them to work. @@ -7864,10 +8228,35 @@ static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer) return 0; } +/* + * Allocate a shadow VMCS and associate it with the currently loaded + * VMCS, unless such a shadow VMCS already exists. The newly allocated + * VMCS is also VMCLEARed, so that it is ready for use. + */ +static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; + + /* + * We should allocate a shadow vmcs for vmcs01 only when L1 + * executes VMXON and free it when L1 executes VMXOFF. + * As it is invalid to execute VMXON twice, we shouldn't reach + * here when vmcs01 already have an allocated shadow vmcs. + */ + WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs); + + if (!loaded_vmcs->shadow_vmcs) { + loaded_vmcs->shadow_vmcs = alloc_vmcs(true); + if (loaded_vmcs->shadow_vmcs) + vmcs_clear(loaded_vmcs->shadow_vmcs); + } + return loaded_vmcs->shadow_vmcs; +} + static int enter_vmx_operation(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmcs *shadow_vmcs; int r; r = alloc_loaded_vmcs(&vmx->nested.vmcs02); @@ -7878,16 +8267,12 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) if (!vmx->nested.cached_vmcs12) goto out_cached_vmcs12; - if (enable_shadow_vmcs) { - shadow_vmcs = alloc_vmcs(); - if (!shadow_vmcs) - goto out_shadow_vmcs; - /* mark vmcs as shadow */ - shadow_vmcs->revision_id |= (1u << 31); - /* init shadow vmcs */ - vmcs_clear(shadow_vmcs); - vmx->vmcs01.shadow_vmcs = shadow_vmcs; - } + vmx->nested.cached_shadow_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); + if (!vmx->nested.cached_shadow_vmcs12) + goto out_cached_shadow_vmcs12; + + if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) + goto out_shadow_vmcs; hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); @@ -7899,6 +8284,9 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) return 0; out_shadow_vmcs: + kfree(vmx->nested.cached_shadow_vmcs12); + +out_cached_shadow_vmcs12: kfree(vmx->nested.cached_vmcs12); out_cached_vmcs12: @@ -7941,7 +8329,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) /* CPL=0 must be checked manually. */ if (vmx_get_cpl(vcpu)) { - kvm_queue_exception(vcpu, UD_VECTOR); + kvm_inject_gp(vcpu, 0); return 1; } @@ -8004,15 +8392,16 @@ static int handle_vmon(struct kvm_vcpu *vcpu) */ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) { - if (vmx_get_cpl(vcpu)) { + if (!to_vmx(vcpu)->nested.vmxon) { kvm_queue_exception(vcpu, UD_VECTOR); return 0; } - if (!to_vmx(vcpu)->nested.vmxon) { - kvm_queue_exception(vcpu, UD_VECTOR); + if (vmx_get_cpl(vcpu)) { + kvm_inject_gp(vcpu, 0); return 0; } + return 1; } @@ -8065,6 +8454,7 @@ static void free_nested(struct vcpu_vmx *vmx) vmx->vmcs01.shadow_vmcs = NULL; } kfree(vmx->nested.cached_vmcs12); + kfree(vmx->nested.cached_shadow_vmcs12); /* Unpin physical memory we referred to in the vmcs02 */ if (vmx->nested.apic_access_page) { kvm_release_page_dirty(vmx->nested.apic_access_page); @@ -8150,7 +8540,7 @@ static int handle_vmresume(struct kvm_vcpu *vcpu) * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of * 64-bit fields are to be returned). */ -static inline int vmcs12_read_any(struct kvm_vcpu *vcpu, +static inline int vmcs12_read_any(struct vmcs12 *vmcs12, unsigned long field, u64 *ret) { short offset = vmcs_field_to_offset(field); @@ -8159,7 +8549,7 @@ static inline int vmcs12_read_any(struct kvm_vcpu *vcpu, if (offset < 0) return offset; - p = ((char *)(get_vmcs12(vcpu))) + offset; + p = (char *)vmcs12 + offset; switch (vmcs_field_width(field)) { case VMCS_FIELD_WIDTH_NATURAL_WIDTH: @@ -8181,10 +8571,10 @@ static inline int vmcs12_read_any(struct kvm_vcpu *vcpu, } -static inline int vmcs12_write_any(struct kvm_vcpu *vcpu, +static inline int vmcs12_write_any(struct vmcs12 *vmcs12, unsigned long field, u64 field_value){ short offset = vmcs_field_to_offset(field); - char *p = ((char *) get_vmcs12(vcpu)) + offset; + char *p = (char *)vmcs12 + offset; if (offset < 0) return offset; @@ -8237,7 +8627,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) for (i = 0; i < max_fields[q]; i++) { field = fields[q][i]; field_value = __vmcs_readl(field); - vmcs12_write_any(&vmx->vcpu, field, field_value); + vmcs12_write_any(get_vmcs12(&vmx->vcpu), field, field_value); } /* * Skip the VM-exit information fields if they are read-only. @@ -8272,7 +8662,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) for (q = 0; q < ARRAY_SIZE(fields); q++) { for (i = 0; i < max_fields[q]; i++) { field = fields[q][i]; - vmcs12_read_any(&vmx->vcpu, field, &field_value); + vmcs12_read_any(get_vmcs12(&vmx->vcpu), field, &field_value); __vmcs_writel(field, field_value); } } @@ -8302,6 +8692,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); gva_t gva = 0; + struct vmcs12 *vmcs12; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -8309,10 +8700,24 @@ static int handle_vmread(struct kvm_vcpu *vcpu) if (!nested_vmx_check_vmcs12(vcpu)) return kvm_skip_emulated_instruction(vcpu); + if (!is_guest_mode(vcpu)) + vmcs12 = get_vmcs12(vcpu); + else { + /* + * When vmcs->vmcs_link_pointer is -1ull, any VMREAD + * to shadowed-field sets the ALU flags for VMfailInvalid. + */ + if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) { + nested_vmx_failInvalid(vcpu); + return kvm_skip_emulated_instruction(vcpu); + } + vmcs12 = get_shadow_vmcs12(vcpu); + } + /* Decode instruction info and find the field to read */ field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); /* Read the field, zero-extended to a u64 field_value */ - if (vmcs12_read_any(vcpu, field, &field_value) < 0) { + if (vmcs12_read_any(vmcs12, field, &field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); return kvm_skip_emulated_instruction(vcpu); } @@ -8354,6 +8759,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) */ u64 field_value = 0; struct x86_exception e; + struct vmcs12 *vmcs12; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -8388,23 +8794,44 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } - if (vmcs12_write_any(vcpu, field, field_value) < 0) { + if (!is_guest_mode(vcpu)) + vmcs12 = get_vmcs12(vcpu); + else { + /* + * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE + * to shadowed-field sets the ALU flags for VMfailInvalid. + */ + if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) { + nested_vmx_failInvalid(vcpu); + return kvm_skip_emulated_instruction(vcpu); + } + vmcs12 = get_shadow_vmcs12(vcpu); + + } + + if (vmcs12_write_any(vmcs12, field, field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); return kvm_skip_emulated_instruction(vcpu); } - switch (field) { + /* + * Do not track vmcs12 dirty-state if in guest-mode + * as we actually dirty shadow vmcs12 instead of vmcs12. + */ + if (!is_guest_mode(vcpu)) { + switch (field) { #define SHADOW_FIELD_RW(x) case x: #include "vmx_shadow_fields.h" - /* - * The fields that can be updated by L1 without a vmexit are - * always updated in the vmcs02, the others go down the slow - * path of prepare_vmcs02. - */ - break; - default: - vmx->nested.dirty_vmcs12 = true; - break; + /* + * The fields that can be updated by L1 without a vmexit are + * always updated in the vmcs02, the others go down the slow + * path of prepare_vmcs02. + */ + break; + default: + vmx->nested.dirty_vmcs12 = true; + break; + } } nested_vmx_succeed(vcpu); @@ -8455,7 +8882,9 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } new_vmcs12 = kmap(page); - if (new_vmcs12->revision_id != VMCS12_REVISION) { + if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || + (new_vmcs12->hdr.shadow_vmcs && + !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { kunmap(page); kvm_release_page_clean(page); nested_vmx_failValid(vcpu, @@ -8653,6 +9082,105 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } +static int handle_invpcid(struct kvm_vcpu *vcpu) +{ + u32 vmx_instruction_info; + unsigned long type; + bool pcid_enabled; + gva_t gva; + struct x86_exception e; + unsigned i; + unsigned long roots_to_free = 0; + struct { + u64 pcid; + u64 gla; + } operand; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); + + if (type > 3) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + /* According to the Intel instruction reference, the memory operand + * is read even if it isn't needed (e.g., for type==all) + */ + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + vmx_instruction_info, false, &gva)) + return 1; + + if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + + if (operand.pcid >> 12 != 0) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); + + switch (type) { + case INVPCID_TYPE_INDIV_ADDR: + if ((!pcid_enabled && (operand.pcid != 0)) || + is_noncanonical_address(operand.gla, vcpu)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid); + return kvm_skip_emulated_instruction(vcpu); + + case INVPCID_TYPE_SINGLE_CTXT: + if (!pcid_enabled && (operand.pcid != 0)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + if (kvm_get_active_pcid(vcpu) == operand.pcid) { + kvm_mmu_sync_roots(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if (kvm_get_pcid(vcpu, vcpu->arch.mmu.prev_roots[i].cr3) + == operand.pcid) + roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); + + kvm_mmu_free_roots(vcpu, roots_to_free); + /* + * If neither the current cr3 nor any of the prev_roots use the + * given PCID, then nothing needs to be done here because a + * resync will happen anyway before switching to any other CR3. + */ + + return kvm_skip_emulated_instruction(vcpu); + + case INVPCID_TYPE_ALL_NON_GLOBAL: + /* + * Currently, KVM doesn't mark global entries in the shadow + * page tables, so a non-global flush just degenerates to a + * global flush. If needed, we could optimize this later by + * keeping track of global entries in shadow page tables. + */ + + /* fall-through */ + case INVPCID_TYPE_ALL_INCL_GLOBAL: + kvm_mmu_unload(vcpu); + return kvm_skip_emulated_instruction(vcpu); + + default: + BUG(); /* We have already checked above that type <= 3 */ + } +} + static int handle_pml_full(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; @@ -8802,6 +9330,17 @@ fail: return 1; } +static int handle_encls(struct kvm_vcpu *vcpu) +{ + /* + * SGX virtualization is not yet supported. There is no software + * enable bit for SGX, so we have to trap ENCLS and inject a #UD + * to prevent the guest from executing ENCLS. + */ + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -8856,8 +9395,10 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_XSAVES] = handle_xsaves, [EXIT_REASON_XRSTORS] = handle_xrstors, [EXIT_REASON_PML_FULL] = handle_pml_full, + [EXIT_REASON_INVPCID] = handle_invpcid, [EXIT_REASON_VMFUNC] = handle_vmfunc, [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, + [EXIT_REASON_ENCLS] = handle_encls, }; static const int kvm_vmx_max_exit_handlers = @@ -9028,6 +9569,30 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, return false; } +static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12, gpa_t bitmap) +{ + u32 vmx_instruction_info; + unsigned long field; + u8 b; + + if (!nested_cpu_has_shadow_vmcs(vmcs12)) + return true; + + /* Decode instruction info and find the field to access */ + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); + + /* Out-of-range fields always cause a VM exit from L2 to L1 */ + if (field >> 15) + return true; + + if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) + return true; + + return 1 & (b >> (field & 7)); +} + /* * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we * should handle it ourselves in L0 (and then continue L2). Only call this @@ -9112,10 +9677,15 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); + case EXIT_REASON_VMREAD: + return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, + vmcs12->vmread_bitmap); + case EXIT_REASON_VMWRITE: + return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, + vmcs12->vmwrite_bitmap); case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: - case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: - case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: + case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: /* @@ -9199,6 +9769,9 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) case EXIT_REASON_VMFUNC: /* VM functions are emulated through L2->L0 vmexits. */ return false; + case EXIT_REASON_ENCLS: + /* SGX is never exposed to L1 */ + return false; default: return true; } @@ -9548,6 +10121,76 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) } } +/* + * Software based L1D cache flush which is used when microcode providing + * the cache control MSR is not loaded. + * + * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to + * flush it is required to read in 64 KiB because the replacement algorithm + * is not exactly LRU. This could be sized at runtime via topology + * information but as all relevant affected CPUs have 32KiB L1D cache size + * there is no point in doing so. + */ +static void vmx_l1d_flush(struct kvm_vcpu *vcpu) +{ + int size = PAGE_SIZE << L1D_CACHE_ORDER; + + /* + * This code is only executed when the the flush mode is 'cond' or + * 'always' + */ + if (static_branch_likely(&vmx_l1d_flush_cond)) { + bool flush_l1d; + + /* + * Clear the per-vcpu flush bit, it gets set again + * either from vcpu_run() or from one of the unsafe + * VMEXIT handlers. + */ + flush_l1d = vcpu->arch.l1tf_flush_l1d; + vcpu->arch.l1tf_flush_l1d = false; + + /* + * Clear the per-cpu flush bit, it gets set again from + * the interrupt handlers. + */ + flush_l1d |= kvm_get_cpu_l1tf_flush_l1d(); + kvm_clear_cpu_l1tf_flush_l1d(); + + if (!flush_l1d) + return; + } + + vcpu->stat.l1d_flush++; + + if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { + wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); + return; + } + + asm volatile( + /* First ensure the pages are in the TLB */ + "xorl %%eax, %%eax\n" + ".Lpopulate_tlb:\n\t" + "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" + "addl $4096, %%eax\n\t" + "cmpl %%eax, %[size]\n\t" + "jne .Lpopulate_tlb\n\t" + "xorl %%eax, %%eax\n\t" + "cpuid\n\t" + /* Now fill the cache */ + "xorl %%eax, %%eax\n" + ".Lfill_cache:\n" + "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" + "addl $64, %%eax\n\t" + "cmpl %%eax, %[size]\n\t" + "jne .Lfill_cache\n\t" + "lfence\n" + :: [flush_pages] "r" (vmx_l1d_flush_pages), + [size] "r" (size) + : "eax", "ebx", "ecx", "edx"); +} + static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); @@ -9949,7 +10592,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) clear_atomic_switch_msr(vmx, msrs[i].msr); else add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, - msrs[i].host); + msrs[i].host, false); } static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) @@ -10003,15 +10646,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); cr3 = __get_current_cr3_fast(); - if (unlikely(cr3 != vmx->loaded_vmcs->vmcs_host_cr3)) { + if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { vmcs_writel(HOST_CR3, cr3); - vmx->loaded_vmcs->vmcs_host_cr3 = cr3; + vmx->loaded_vmcs->host_state.cr3 = cr3; } cr4 = cr4_read_shadow(); - if (unlikely(cr4 != vmx->loaded_vmcs->vmcs_host_cr4)) { + if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { vmcs_writel(HOST_CR4, cr4); - vmx->loaded_vmcs->vmcs_host_cr4 = cr4; + vmx->loaded_vmcs->host_state.cr4 = cr4; } /* When single-stepping over STI and MOV SS, we must clear the @@ -10044,6 +10687,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) evmcs_rsp = static_branch_unlikely(&enable_evmcs) ? (unsigned long)¤t_evmcs->host_rsp : 0; + if (static_branch_unlikely(&vmx_l1d_should_flush)) + vmx_l1d_flush(vcpu); + asm( /* Store host registers */ "push %%" _ASM_DX "; push %%" _ASM_BP ";" @@ -10204,9 +10850,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) * The sysexit path does not restore ds/es, so we must set them to * a reasonable value ourselves. * - * We can't defer this to vmx_load_host_state() since that function - * may be executed in interrupt context, which saves and restore segments - * around it, nullifying its effect. + * We can't defer this to vmx_prepare_switch_to_host() since that + * function may be executed in interrupt context, which saves and + * restore segments around it, nullifying its effect. */ loadsegment(ds, __USER_DS); loadsegment(es, __USER_DS); @@ -10267,8 +10913,8 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) return; cpu = get_cpu(); - vmx->loaded_vmcs = vmcs; vmx_vcpu_put(vcpu); + vmx->loaded_vmcs = vmcs; vmx_vcpu_load(vcpu, cpu); put_cpu(); } @@ -10403,10 +11049,39 @@ free_vcpu: return ERR_PTR(err); } +#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" +#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" + static int vmx_vm_init(struct kvm *kvm) { + spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock); + if (!ple_gap) kvm->arch.pause_in_guest = true; + + if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) { + switch (l1tf_mitigation) { + case L1TF_MITIGATION_OFF: + case L1TF_MITIGATION_FLUSH_NOWARN: + /* 'I explicitly don't care' is set */ + break; + case L1TF_MITIGATION_FLUSH: + case L1TF_MITIGATION_FLUSH_NOSMT: + case L1TF_MITIGATION_FULL: + /* + * Warn upon starting the first VM in a potentially + * insecure environment. + */ + if (cpu_smt_control == CPU_SMT_ENABLED) + pr_warn_once(L1TF_MSG_SMT); + if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER) + pr_warn_once(L1TF_MSG_L1D); + break; + case L1TF_MITIGATION_FULL_FORCE: + /* Flush is enforced */ + break; + } + } return 0; } @@ -10605,11 +11280,11 @@ static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) if (!valid_ept_address(vcpu, nested_ept_get_cr3(vcpu))) return 1; - kvm_mmu_unload(vcpu); kvm_init_shadow_ept_mmu(vcpu, to_vmx(vcpu)->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT, - nested_ept_ad_enabled(vcpu)); + nested_ept_ad_enabled(vcpu), + nested_ept_get_cr3(vcpu)); vcpu->arch.mmu.set_cr3 = vmx_set_cr3; vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; @@ -10657,9 +11332,9 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12); -static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) +static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); struct page *page; u64 hpa; @@ -10900,6 +11575,38 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, return true; } +static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vmcs12 *shadow; + struct page *page; + + if (!nested_cpu_has_shadow_vmcs(vmcs12) || + vmcs12->vmcs_link_pointer == -1ull) + return; + + shadow = get_shadow_vmcs12(vcpu); + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); + + memcpy(shadow, kmap(page), VMCS12_SIZE); + + kunmap(page); + kvm_release_page_clean(page); +} + +static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!nested_cpu_has_shadow_vmcs(vmcs12) || + vmcs12->vmcs_link_pointer == -1ull) + return; + + kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer, + get_shadow_vmcs12(vcpu), VMCS12_SIZE); +} + static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { @@ -10957,11 +11664,12 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, unsigned long count_field, unsigned long addr_field) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); int maxphyaddr; u64 count, addr; - if (vmcs12_read_any(vcpu, count_field, &count) || - vmcs12_read_any(vcpu, addr_field, &addr)) { + if (vmcs12_read_any(vmcs12, count_field, &count) || + vmcs12_read_any(vmcs12, addr_field, &addr)) { WARN_ON(1); return -EINVAL; } @@ -11011,6 +11719,19 @@ static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, return 0; } +static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has_shadow_vmcs(vmcs12)) + return 0; + + if (!page_address_valid(vcpu, vmcs12->vmread_bitmap) || + !page_address_valid(vcpu, vmcs12->vmwrite_bitmap)) + return -EINVAL; + + return 0; +} + static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, struct vmx_msr_entry *e) { @@ -11160,12 +11881,16 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne return 1; } } - - vcpu->arch.cr3 = cr3; - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); } - kvm_mmu_reset_context(vcpu); + if (!nested_ept) + kvm_mmu_new_cr3(vcpu, cr3, false); + + vcpu->arch.cr3 = cr3; + __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + + kvm_init_mmu(vcpu, false); + return 0; } @@ -11252,7 +11977,8 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * Set host-state according to L0's settings (vmcs12 is irrelevant here) * Some constant fields are set here by vmx_set_constant_host_state(). * Other fields are different per CPU, and will be set later when - * vmx_vcpu_load() is called, and when vmx_save_host_state() is called. + * vmx_vcpu_load() is called, and when vmx_prepare_switch_to_guest() + * is called. */ vmx_set_constant_host_state(vmx); @@ -11260,10 +11986,10 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * Set the MSR load/store lists to match L0's settings. */ vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr); - vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr); - vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); + vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); + vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); set_cr4_guest_host_mask(vmx); @@ -11324,11 +12050,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); - /* - * Not in vmcs02: GUEST_PML_INDEX, HOST_FS_SELECTOR, HOST_GS_SELECTOR, - * HOST_FS_BASE, HOST_GS_BASE. - */ - if (vmx->nested.nested_run_pending && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); @@ -11393,6 +12114,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, exec_control |= vmcs12_exec_ctrl; } + /* VMCS shadowing for L2 is emulated for now */ + exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; + if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) vmcs_write16(GUEST_INTR_STATUS, vmcs12->guest_intr_status); @@ -11405,6 +12129,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) vmcs_write64(APIC_ACCESS_ADDR, -1ull); + if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) + vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } @@ -11612,6 +12339,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (nested_vmx_check_pml_controls(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, vmx->nested.msrs.procbased_ctls_low, vmx->nested.msrs.procbased_ctls_high) || @@ -11712,6 +12442,33 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) return 0; } +static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + int r; + struct page *page; + struct vmcs12 *shadow; + + if (vmcs12->vmcs_link_pointer == -1ull) + return 0; + + if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)) + return -EINVAL; + + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); + if (is_error_page(page)) + return -EINVAL; + + r = 0; + shadow = kmap(page); + if (shadow->hdr.revision_id != VMCS12_REVISION || + shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)) + r = -EINVAL; + kunmap(page); + kvm_release_page_clean(page); + return r; +} + static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, u32 *exit_qual) { @@ -11723,8 +12480,7 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) return 1; - if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS) && - vmcs12->vmcs_link_pointer != -1ull) { + if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR; return 1; } @@ -11771,12 +12527,17 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, return 0; } -static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu) +/* + * If exit_qual is NULL, this is being called from state restore (either RSM + * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. + */ +static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - u32 exit_qual; - int r; + bool from_vmentry = !!exit_qual; + u32 dummy_exit_qual; + int r = 0; enter_guest_mode(vcpu); @@ -11790,17 +12551,28 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu) vcpu->arch.tsc_offset += vmcs12->tsc_offset; r = EXIT_REASON_INVALID_STATE; - if (prepare_vmcs02(vcpu, vmcs12, &exit_qual)) + if (prepare_vmcs02(vcpu, vmcs12, from_vmentry ? exit_qual : &dummy_exit_qual)) goto fail; - nested_get_vmcs12_pages(vcpu, vmcs12); + if (from_vmentry) { + nested_get_vmcs12_pages(vcpu); - r = EXIT_REASON_MSR_LOAD_FAIL; - exit_qual = nested_vmx_load_msr(vcpu, - vmcs12->vm_entry_msr_load_addr, - vmcs12->vm_entry_msr_load_count); - if (exit_qual) - goto fail; + r = EXIT_REASON_MSR_LOAD_FAIL; + *exit_qual = nested_vmx_load_msr(vcpu, + vmcs12->vm_entry_msr_load_addr, + vmcs12->vm_entry_msr_load_count); + if (*exit_qual) + goto fail; + } else { + /* + * The MMU is not initialized to point at the right entities yet and + * "get pages" would need to read data from the guest (i.e. we will + * need to perform gpa to hpa translation). Request a call + * to nested_get_vmcs12_pages before the next VM-entry. The MSRs + * have already been set at vmentry time and should not be reset. + */ + kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); + } /* * Note no nested_vmx_succeed or nested_vmx_fail here. At this point @@ -11815,8 +12587,7 @@ fail: vcpu->arch.tsc_offset -= vmcs12->tsc_offset; leave_guest_mode(vcpu); vmx_switch_vmcs(vcpu, &vmx->vmcs01); - nested_vmx_entry_failure(vcpu, vmcs12, r, exit_qual); - return 1; + return r; } /* @@ -11839,6 +12610,17 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) vmcs12 = get_vmcs12(vcpu); + /* + * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact + * that there *is* a valid VMCS pointer, RFLAGS.CF is set + * rather than RFLAGS.ZF, and no error number is stored to the + * VM-instruction error field. + */ + if (vmcs12->hdr.shadow_vmcs) { + nested_vmx_failInvalid(vcpu); + goto out; + } + if (enable_shadow_vmcs) copy_shadow_to_vmcs12(vmx); @@ -11893,12 +12675,28 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) */ vmx->nested.nested_run_pending = 1; - ret = enter_vmx_non_root_mode(vcpu); + ret = enter_vmx_non_root_mode(vcpu, &exit_qual); if (ret) { + nested_vmx_entry_failure(vcpu, vmcs12, ret, exit_qual); vmx->nested.nested_run_pending = 0; - return ret; + return 1; } + /* Hide L1D cache contents from the nested guest. */ + vmx->vcpu.arch.l1tf_flush_l1d = true; + + /* + * Must happen outside of enter_vmx_non_root_mode() as it will + * also be used as part of restoring nVMX state for + * snapshot restore (migration). + * + * In this flow, it is assumed that vmcs12 cache was + * trasferred as part of captured nVMX state and should + * therefore not be read from guest memory (which may not + * exist on destination host yet). + */ + nested_cache_shadow_vmcs12(vcpu, vmcs12); + /* * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken * by event injection, halt vcpu. @@ -12408,6 +13206,17 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, exit_qualification); + /* + * Must happen outside of sync_vmcs12() as it will + * also be used to capture vmcs12 cache as part of + * capturing nVMX state for snapshot (migration). + * + * Otherwise, this flush will dirty guest memory at a + * point it is already assumed by user-space to be + * immutable. + */ + nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); + if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, vmcs12->vm_exit_msr_store_count)) nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); @@ -12419,8 +13228,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmx_segment_cache_clear(vmx); /* Update any VMCS fields that might have changed while L2 ran */ - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr); - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr); + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); if (vmx->hv_deadline_tsc == -1) vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, @@ -12982,7 +13791,7 @@ static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase) if (vmx->nested.smm.guest_mode) { vcpu->arch.hflags &= ~HF_SMM_MASK; - ret = enter_vmx_non_root_mode(vcpu); + ret = enter_vmx_non_root_mode(vcpu, NULL); vcpu->arch.hflags |= HF_SMM_MASK; if (ret) return ret; @@ -12997,6 +13806,199 @@ static int enable_smi_window(struct kvm_vcpu *vcpu) return 0; } +static int vmx_get_nested_state(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + u32 user_data_size) +{ + struct vcpu_vmx *vmx; + struct vmcs12 *vmcs12; + struct kvm_nested_state kvm_state = { + .flags = 0, + .format = 0, + .size = sizeof(kvm_state), + .vmx.vmxon_pa = -1ull, + .vmx.vmcs_pa = -1ull, + }; + + if (!vcpu) + return kvm_state.size + 2 * VMCS12_SIZE; + + vmx = to_vmx(vcpu); + vmcs12 = get_vmcs12(vcpu); + if (nested_vmx_allowed(vcpu) && + (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { + kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr; + kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr; + + if (vmx->nested.current_vmptr != -1ull) { + kvm_state.size += VMCS12_SIZE; + + if (is_guest_mode(vcpu) && + nested_cpu_has_shadow_vmcs(vmcs12) && + vmcs12->vmcs_link_pointer != -1ull) + kvm_state.size += VMCS12_SIZE; + } + + if (vmx->nested.smm.vmxon) + kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; + + if (vmx->nested.smm.guest_mode) + kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; + + if (is_guest_mode(vcpu)) { + kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; + + if (vmx->nested.nested_run_pending) + kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; + } + } + + if (user_data_size < kvm_state.size) + goto out; + + if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) + return -EFAULT; + + if (vmx->nested.current_vmptr == -1ull) + goto out; + + /* + * When running L2, the authoritative vmcs12 state is in the + * vmcs02. When running L1, the authoritative vmcs12 state is + * in the shadow vmcs linked to vmcs01, unless + * sync_shadow_vmcs is set, in which case, the authoritative + * vmcs12 state is in the vmcs12 already. + */ + if (is_guest_mode(vcpu)) + sync_vmcs12(vcpu, vmcs12); + else if (enable_shadow_vmcs && !vmx->nested.sync_shadow_vmcs) + copy_shadow_to_vmcs12(vmx); + + if (copy_to_user(user_kvm_nested_state->data, vmcs12, sizeof(*vmcs12))) + return -EFAULT; + + if (nested_cpu_has_shadow_vmcs(vmcs12) && + vmcs12->vmcs_link_pointer != -1ull) { + if (copy_to_user(user_kvm_nested_state->data + VMCS12_SIZE, + get_shadow_vmcs12(vcpu), sizeof(*vmcs12))) + return -EFAULT; + } + +out: + return kvm_state.size; +} + +static int vmx_set_nested_state(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + struct kvm_nested_state *kvm_state) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs12 *vmcs12; + u32 exit_qual; + int ret; + + if (kvm_state->format != 0) + return -EINVAL; + + if (!nested_vmx_allowed(vcpu)) + return kvm_state->vmx.vmxon_pa == -1ull ? 0 : -EINVAL; + + if (kvm_state->vmx.vmxon_pa == -1ull) { + if (kvm_state->vmx.smm.flags) + return -EINVAL; + + if (kvm_state->vmx.vmcs_pa != -1ull) + return -EINVAL; + + vmx_leave_nested(vcpu); + return 0; + } + + if (!page_address_valid(vcpu, kvm_state->vmx.vmxon_pa)) + return -EINVAL; + + if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12)) + return -EINVAL; + + if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa || + !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa)) + return -EINVAL; + + if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && + (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) + return -EINVAL; + + if (kvm_state->vmx.smm.flags & + ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) + return -EINVAL; + + if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && + !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) + return -EINVAL; + + vmx_leave_nested(vcpu); + if (kvm_state->vmx.vmxon_pa == -1ull) + return 0; + + vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa; + ret = enter_vmx_operation(vcpu); + if (ret) + return ret; + + set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa); + + if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { + vmx->nested.smm.vmxon = true; + vmx->nested.vmxon = false; + + if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) + vmx->nested.smm.guest_mode = true; + } + + vmcs12 = get_vmcs12(vcpu); + if (copy_from_user(vmcs12, user_kvm_nested_state->data, sizeof(*vmcs12))) + return -EFAULT; + + if (vmcs12->hdr.revision_id != VMCS12_REVISION) + return -EINVAL; + + if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) + return 0; + + vmx->nested.nested_run_pending = + !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); + + if (nested_cpu_has_shadow_vmcs(vmcs12) && + vmcs12->vmcs_link_pointer != -1ull) { + struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); + if (kvm_state->size < sizeof(kvm_state) + 2 * sizeof(*vmcs12)) + return -EINVAL; + + if (copy_from_user(shadow_vmcs12, + user_kvm_nested_state->data + VMCS12_SIZE, + sizeof(*vmcs12))) + return -EFAULT; + + if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || + !shadow_vmcs12->hdr.shadow_vmcs) + return -EINVAL; + } + + if (check_vmentry_prereqs(vcpu, vmcs12) || + check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) + return -EINVAL; + + if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING) + vmx->nested.nested_run_pending = 1; + + vmx->nested.dirty_vmcs12 = true; + ret = enter_vmx_non_root_mode(vcpu, NULL); + if (ret) + return -EINVAL; + + return 0; +} + static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -13016,7 +14018,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .vcpu_free = vmx_free_vcpu, .vcpu_reset = vmx_vcpu_reset, - .prepare_guest_switch = vmx_save_host_state, + .prepare_guest_switch = vmx_prepare_switch_to_guest, .vcpu_load = vmx_vcpu_load, .vcpu_put = vmx_vcpu_put, @@ -13049,6 +14051,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .set_rflags = vmx_set_rflags, .tlb_flush = vmx_flush_tlb, + .tlb_flush_gva = vmx_flush_tlb_gva, .run = vmx_vcpu_run, .handle_exit = vmx_handle_exit, @@ -13131,12 +14134,61 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .setup_mce = vmx_setup_mce, + .get_nested_state = vmx_get_nested_state, + .set_nested_state = vmx_set_nested_state, + .get_vmcs12_pages = nested_get_vmcs12_pages, + .smi_allowed = vmx_smi_allowed, .pre_enter_smm = vmx_pre_enter_smm, .pre_leave_smm = vmx_pre_leave_smm, .enable_smi_window = enable_smi_window, }; +static void vmx_cleanup_l1d_flush(void) +{ + if (vmx_l1d_flush_pages) { + free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER); + vmx_l1d_flush_pages = NULL; + } + /* Restore state so sysfs ignores VMX */ + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; +} + +static void vmx_exit(void) +{ +#ifdef CONFIG_KEXEC_CORE + RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); + synchronize_rcu(); +#endif + + kvm_exit(); + +#if IS_ENABLED(CONFIG_HYPERV) + if (static_branch_unlikely(&enable_evmcs)) { + int cpu; + struct hv_vp_assist_page *vp_ap; + /* + * Reset everything to support using non-enlightened VMCS + * access later (e.g. when we reload the module with + * enlightened_vmcs=0) + */ + for_each_online_cpu(cpu) { + vp_ap = hv_get_vp_assist_page(cpu); + + if (!vp_ap) + continue; + + vp_ap->current_nested_vmcs = 0; + vp_ap->enlighten_vmentry = 0; + } + + static_branch_disable(&enable_evmcs); + } +#endif + vmx_cleanup_l1d_flush(); +} +module_exit(vmx_exit); + static int __init vmx_init(void) { int r; @@ -13171,10 +14223,25 @@ static int __init vmx_init(void) #endif r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), - __alignof__(struct vcpu_vmx), THIS_MODULE); + __alignof__(struct vcpu_vmx), THIS_MODULE); if (r) return r; + /* + * Must be called after kvm_init() so enable_ept is properly set + * up. Hand the parameter mitigation value in which was stored in + * the pre module init parser. If no parameter was given, it will + * contain 'auto' which will be turned into the default 'cond' + * mitigation mode. + */ + if (boot_cpu_has(X86_BUG_L1TF)) { + r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); + if (r) { + vmx_exit(); + return r; + } + } + #ifdef CONFIG_KEXEC_CORE rcu_assign_pointer(crash_vmclear_loaded_vmcss, crash_vmclear_local_loaded_vmcss); @@ -13183,39 +14250,4 @@ static int __init vmx_init(void) return 0; } - -static void __exit vmx_exit(void) -{ -#ifdef CONFIG_KEXEC_CORE - RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); - synchronize_rcu(); -#endif - - kvm_exit(); - -#if IS_ENABLED(CONFIG_HYPERV) - if (static_branch_unlikely(&enable_evmcs)) { - int cpu; - struct hv_vp_assist_page *vp_ap; - /* - * Reset everything to support using non-enlightened VMCS - * access later (e.g. when we reload the module with - * enlightened_vmcs=0) - */ - for_each_online_cpu(cpu) { - vp_ap = hv_get_vp_assist_page(cpu); - - if (!vp_ap) - continue; - - vp_ap->current_nested_vmcs = 0; - vp_ap->enlighten_vmentry = 0; - } - - static_branch_disable(&enable_evmcs); - } -#endif -} - -module_init(vmx_init) -module_exit(vmx_exit) +module_init(vmx_init); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2b812b3c5088..506bd2b4b8bb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -195,6 +195,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "irq_injections", VCPU_STAT(irq_injections) }, { "nmi_injections", VCPU_STAT(nmi_injections) }, { "req_event", VCPU_STAT(req_event) }, + { "l1d_flush", VCPU_STAT(l1d_flush) }, { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, { "mmu_pte_write", VM_STAT(mmu_pte_write) }, { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, @@ -847,16 +848,21 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4); int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { + bool skip_tlb_flush = false; #ifdef CONFIG_X86_64 bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); - if (pcid_enabled) - cr3 &= ~CR3_PCID_INVD; + if (pcid_enabled) { + skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH; + cr3 &= ~X86_CR3_PCID_NOFLUSH; + } #endif if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { - kvm_mmu_sync_roots(vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + if (!skip_tlb_flush) { + kvm_mmu_sync_roots(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } return 0; } @@ -867,9 +873,10 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) return 1; + kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush); vcpu->arch.cr3 = cr3; __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); - kvm_mmu_new_cr3(vcpu); + return 0; } EXPORT_SYMBOL_GPL(kvm_set_cr3); @@ -1102,11 +1109,35 @@ static u32 msr_based_features[] = { static unsigned int num_msr_based_features; +u64 kvm_get_arch_capabilities(void) +{ + u64 data; + + rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data); + + /* + * If we're doing cache flushes (either "always" or "cond") + * we will do one whenever the guest does a vmlaunch/vmresume. + * If an outer hypervisor is doing the cache flush for us + * (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that + * capability to the guest too, and if EPT is disabled we're not + * vulnerable. Overall, only VMENTER_L1D_FLUSH_NEVER will + * require a nested hypervisor to do a flush of its own. + */ + if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER) + data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH; + + return data; +} +EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities); + static int kvm_get_msr_feature(struct kvm_msr_entry *msr) { switch (msr->index) { - case MSR_IA32_UCODE_REV: case MSR_IA32_ARCH_CAPABILITIES: + msr->data = kvm_get_arch_capabilities(); + break; + case MSR_IA32_UCODE_REV: rdmsrl_safe(msr->index, &msr->data); break; default: @@ -2160,10 +2191,11 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.mcg_status = data; break; case MSR_IA32_MCG_CTL: - if (!(mcg_cap & MCG_CTL_P)) + if (!(mcg_cap & MCG_CTL_P) && + (data || !msr_info->host_initiated)) return 1; if (data != 0 && data != ~(u64)0) - return -1; + return 1; vcpu->arch.mcg_ctl = data; break; default: @@ -2551,7 +2583,7 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) } EXPORT_SYMBOL_GPL(kvm_get_msr); -static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { u64 data; u64 mcg_cap = vcpu->arch.mcg_cap; @@ -2566,7 +2598,7 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) data = vcpu->arch.mcg_cap; break; case MSR_IA32_MCG_CTL: - if (!(mcg_cap & MCG_CTL_P)) + if (!(mcg_cap & MCG_CTL_P) && !host) return 1; data = vcpu->arch.mcg_ctl; break; @@ -2699,7 +2731,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: - return get_msr_mce(vcpu, msr_info->index, &msr_info->data); + return get_msr_mce(vcpu, msr_info->index, &msr_info->data, + msr_info->host_initiated); case MSR_K7_CLK_CTL: /* * Provide expected ramp-up count for K7. All other @@ -2720,7 +2753,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case HV_X64_MSR_TSC_EMULATION_CONTROL: case HV_X64_MSR_TSC_EMULATION_STATUS: return kvm_hv_get_msr_common(vcpu, - msr_info->index, &msr_info->data); + msr_info->index, &msr_info->data, + msr_info->host_initiated); break; case MSR_IA32_BBL_CR_CTL3: /* This legacy MSR exists but isn't fully documented in current @@ -2944,6 +2978,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_X2APIC_API: r = KVM_X2APIC_API_VALID_FLAGS; break; + case KVM_CAP_NESTED_STATE: + r = kvm_x86_ops->get_nested_state ? + kvm_x86_ops->get_nested_state(NULL, 0, 0) : 0; + break; default: break; } @@ -3960,6 +3998,56 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap); break; } + case KVM_GET_NESTED_STATE: { + struct kvm_nested_state __user *user_kvm_nested_state = argp; + u32 user_data_size; + + r = -EINVAL; + if (!kvm_x86_ops->get_nested_state) + break; + + BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size)); + if (get_user(user_data_size, &user_kvm_nested_state->size)) + return -EFAULT; + + r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state, + user_data_size); + if (r < 0) + return r; + + if (r > user_data_size) { + if (put_user(r, &user_kvm_nested_state->size)) + return -EFAULT; + return -E2BIG; + } + r = 0; + break; + } + case KVM_SET_NESTED_STATE: { + struct kvm_nested_state __user *user_kvm_nested_state = argp; + struct kvm_nested_state kvm_state; + + r = -EINVAL; + if (!kvm_x86_ops->set_nested_state) + break; + + if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state))) + return -EFAULT; + + if (kvm_state.size < sizeof(kvm_state)) + return -EINVAL; + + if (kvm_state.flags & + ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE)) + return -EINVAL; + + /* nested_run_pending implies guest_mode. */ + if (kvm_state.flags == KVM_STATE_NESTED_RUN_PENDING) + return -EINVAL; + + r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state); + break; + } default: r = -EINVAL; } @@ -4876,6 +4964,9 @@ static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *v int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { + /* kvm_write_guest_virt_system can pull in tons of pages. */ + vcpu->arch.l1tf_flush_l1d = true; + return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, PFERR_WRITE_MASK, exception); } @@ -6052,6 +6143,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, bool writeback = true; bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable; + vcpu->arch.l1tf_flush_l1d = true; + /* * Clear write_fault_to_shadow_pgtable here to ensure it is * never reused. @@ -6473,20 +6566,22 @@ static void kvm_set_mmio_spte_mask(void) * Set the reserved bits and the present bit of an paging-structure * entry to generate page fault with PFER.RSV = 1. */ - /* Mask the reserved physical address bits. */ - mask = rsvd_bits(maxphyaddr, 51); + + /* + * Mask the uppermost physical address bit, which would be reserved as + * long as the supported physical address width is less than 52. + */ + mask = 1ull << 51; /* Set the present bit. */ mask |= 1ull; -#ifdef CONFIG_X86_64 /* * If reserved bit is not supported, clear the present bit to disable * mmio page fault. */ - if (maxphyaddr == 52) + if (IS_ENABLED(CONFIG_X86_64) && maxphyaddr == 52) mask &= ~1ull; -#endif kvm_mmu_set_mmio_spte_mask(mask, mask); } @@ -6739,6 +6834,9 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) case KVM_HC_CLOCK_PAIRING: ret = kvm_pv_clock_pairing(vcpu, a0, a1); break; + case KVM_HC_SEND_IPI: + ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit); + break; #endif default: ret = -KVM_ENOSYS; @@ -7205,8 +7303,9 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu) kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); } -void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, - unsigned long start, unsigned long end) +int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, + unsigned long start, unsigned long end, + bool blockable) { unsigned long apic_address; @@ -7217,6 +7316,8 @@ void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); if (start <= apic_address && apic_address < end) kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); + + return 0; } void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) @@ -7257,6 +7358,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_immediate_exit = false; if (kvm_request_pending(vcpu)) { + if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) + kvm_x86_ops->get_vmcs12_pages(vcpu); if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) kvm_mmu_unload(vcpu); if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) @@ -7272,6 +7375,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) kvm_mmu_sync_roots(vcpu); + if (kvm_check_request(KVM_REQ_LOAD_CR3, vcpu)) + kvm_mmu_load_cr3(vcpu); if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) kvm_vcpu_flush_tlb(vcpu, true); if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { @@ -7581,6 +7686,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu) struct kvm *kvm = vcpu->kvm; vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + vcpu->arch.l1tf_flush_l1d = true; for (;;) { if (kvm_vcpu_running(vcpu)) { @@ -7982,6 +8088,10 @@ EXPORT_SYMBOL_GPL(kvm_task_switch); static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { + if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && + (sregs->cr4 & X86_CR4_OSXSAVE)) + return -EINVAL; + if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) { /* * When EFER.LME and CR0.PG are set, the processor is in @@ -8012,10 +8122,6 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) struct desc_ptr dt; int ret = -EINVAL; - if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && - (sregs->cr4 & X86_CR4_OSXSAVE)) - goto out; - if (kvm_valid_sregs(vcpu, sregs)) goto out; @@ -8700,6 +8806,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) { + vcpu->arch.l1tf_flush_l1d = true; kvm_x86_ops->sched_in(vcpu, cpu); } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index db1c042e9853..b9123c497e0a 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -16,6 +16,7 @@ #include <linux/prefetch.h> /* prefetchw */ #include <linux/context_tracking.h> /* exception_enter(), ... */ #include <linux/uaccess.h> /* faulthandler_disabled() */ +#include <linux/mm_types.h> #include <asm/cpufeature.h> /* boot_cpu_has, ... */ #include <asm/traps.h> /* dotraplinkage, ... */ @@ -999,7 +1000,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, static noinline void mm_fault_error(struct pt_regs *regs, unsigned long error_code, - unsigned long address, u32 *pkey, unsigned int fault) + unsigned long address, u32 *pkey, vm_fault_t fault) { if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) { no_context(regs, error_code, address, 0, 0); @@ -1213,7 +1214,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, struct vm_area_struct *vma; struct task_struct *tsk; struct mm_struct *mm; - int fault, major = 0; + vm_fault_t fault, major = 0; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; u32 pkey; diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 74b157ac078d..7a8fc26c1115 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -4,6 +4,8 @@ #include <linux/swap.h> #include <linux/memblock.h> #include <linux/bootmem.h> /* for max_low_pfn */ +#include <linux/swapfile.h> +#include <linux/swapops.h> #include <asm/set_memory.h> #include <asm/e820/api.h> @@ -97,15 +99,22 @@ __ref void *alloc_low_pages(unsigned int num) } if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) { - unsigned long ret; - if (min_pfn_mapped >= max_pfn_mapped) - panic("alloc_low_pages: ran out of memory"); - ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT, + unsigned long ret = 0; + + if (min_pfn_mapped < max_pfn_mapped) { + ret = memblock_find_in_range( + min_pfn_mapped << PAGE_SHIFT, max_pfn_mapped << PAGE_SHIFT, PAGE_SIZE * num , PAGE_SIZE); + } + if (ret) + memblock_reserve(ret, PAGE_SIZE * num); + else if (can_use_brk_pgt) + ret = __pa(extend_brk(PAGE_SIZE * num, PAGE_SIZE)); + if (!ret) panic("alloc_low_pages: can not alloc memory"); - memblock_reserve(ret, PAGE_SIZE * num); + pfn = ret >> PAGE_SHIFT; } else { pfn = pgt_buf_end; @@ -911,3 +920,26 @@ void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache) __cachemode2pte_tbl[cache] = __cm_idx2pte(entry); __pte2cachemode_tbl[entry] = cache; } + +#ifdef CONFIG_SWAP +unsigned long max_swapfile_size(void) +{ + unsigned long pages; + + pages = generic_max_swapfile_size(); + + if (boot_cpu_has_bug(X86_BUG_L1TF)) { + /* Limit the swap file size to MAX_PA/2 for L1TF workaround */ + unsigned long long l1tf_limit = l1tf_pfn_limit(); + /* + * We encode swap offsets also with 3 bits below those for pfn + * which makes the usable limit higher. + */ +#if CONFIG_PGTABLE_LEVELS > 2 + l1tf_limit <<= PAGE_SHIFT - SWP_OFFSET_FIRST_BIT; +#endif + pages = min_t(unsigned long long, l1tf_limit, pages); + } + return pages; +} +#endif diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 7c8686709636..79eb55ce69a9 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -126,24 +126,29 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long addr) static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old) { + pmd_t new_pmd; pmdval_t v = pmd_val(*pmd); if (clear) { - *old = v & _PAGE_PRESENT; - v &= ~_PAGE_PRESENT; - } else /* presume this has been called with clear==true previously */ - v |= *old; - set_pmd(pmd, __pmd(v)); + *old = v; + new_pmd = pmd_mknotpresent(*pmd); + } else { + /* Presume this has been called with clear==true previously */ + new_pmd = __pmd(*old); + } + set_pmd(pmd, new_pmd); } static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old) { pteval_t v = pte_val(*pte); if (clear) { - *old = v & _PAGE_PRESENT; - v &= ~_PAGE_PRESENT; - } else /* presume this has been called with clear==true previously */ - v |= *old; - set_pte_atomic(pte, __pte(v)); + *old = v; + /* Nothing should care about address */ + pte_clear(&init_mm, 0, pte); + } else { + /* Presume this has been called with clear==true previously */ + set_pte_atomic(pte, __pte(*old)); + } } static int clear_page_presence(struct kmmio_fault_page *f, bool clear) diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 48c591251600..1e95d57760cf 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -240,3 +240,24 @@ int valid_mmap_phys_addr_range(unsigned long pfn, size_t count) return phys_addr_valid(addr + count - 1); } + +/* + * Only allow root to set high MMIO mappings to PROT_NONE. + * This prevents an unpriv. user to set them to PROT_NONE and invert + * them, then pointing to valid memory for L1TF speculation. + * + * Note: for locked down kernels may want to disable the root override. + */ +bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) +{ + if (!boot_cpu_has_bug(X86_BUG_L1TF)) + return true; + if (!__pte_needs_invert(pgprot_val(prot))) + return true; + /* If it's real memory always allow */ + if (pfn_valid(pfn)) + return true; + if (pfn >= l1tf_pfn_limit() && !capable(CAP_SYS_ADMIN)) + return false; + return true; +} diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 0a74996a1149..8d6c34fe49be 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1015,8 +1015,8 @@ static long populate_pmd(struct cpa_data *cpa, pmd = pmd_offset(pud, start); - set_pmd(pmd, __pmd(cpa->pfn << PAGE_SHIFT | _PAGE_PSE | - massage_pgprot(pmd_pgprot))); + set_pmd(pmd, pmd_mkhuge(pfn_pmd(cpa->pfn, + canon_pgprot(pmd_pgprot)))); start += PMD_SIZE; cpa->pfn += PMD_SIZE >> PAGE_SHIFT; @@ -1088,8 +1088,8 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d, * Map everything starting from the Gb boundary, possibly with 1G pages */ while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) { - set_pud(pud, __pud(cpa->pfn << PAGE_SHIFT | _PAGE_PSE | - massage_pgprot(pud_pgprot))); + set_pud(pud, pud_mkhuge(pfn_pud(cpa->pfn, + canon_pgprot(pud_pgprot)))); start += PUD_SIZE; cpa->pfn += PUD_SIZE >> PAGE_SHIFT; diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 1555bd7d3449..3d0c83ef6aab 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -512,6 +512,17 @@ static int free_ram_pages_type(u64 start, u64 end) return 0; } +static u64 sanitize_phys(u64 address) +{ + /* + * When changing the memtype for pages containing poison allow + * for a "decoy" virtual address (bit 63 clear) passed to + * set_memory_X(). __pa() on a "decoy" address results in a + * physical address with bit 63 set. + */ + return address & __PHYSICAL_MASK; +} + /* * req_type typically has one of the: * - _PAGE_CACHE_MODE_WB @@ -533,6 +544,8 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type, int is_range_ram; int err = 0; + start = sanitize_phys(start); + end = sanitize_phys(end); BUG_ON(start >= end); /* end is exclusive */ if (!pat_enabled()) { @@ -609,6 +622,9 @@ int free_memtype(u64 start, u64 end) if (!pat_enabled()) return 0; + start = sanitize_phys(start); + end = sanitize_phys(end); + /* Low ISA region is always mapped WB. No need to track */ if (x86_platform.is_untracked_pat_range(start, end)) return 0; diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 3ef095c70ae3..e848a4811785 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -63,7 +63,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { pgtable_page_dtor(pte); paravirt_release_pte(page_to_pfn(pte)); - tlb_remove_table(tlb, pte); + paravirt_tlb_remove_table(tlb, pte); } #if CONFIG_PGTABLE_LEVELS > 2 @@ -79,21 +79,21 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) tlb->need_flush_all = 1; #endif pgtable_pmd_page_dtor(page); - tlb_remove_table(tlb, page); + paravirt_tlb_remove_table(tlb, page); } #if CONFIG_PGTABLE_LEVELS > 3 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); - tlb_remove_table(tlb, virt_to_page(pud)); + paravirt_tlb_remove_table(tlb, virt_to_page(pud)); } #if CONFIG_PGTABLE_LEVELS > 4 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) { paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); - tlb_remove_table(tlb, virt_to_page(p4d)); + paravirt_tlb_remove_table(tlb, virt_to_page(p4d)); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* CONFIG_PGTABLE_LEVELS > 3 */ diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index d58b4aba9510..31341ae7309f 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -45,6 +45,7 @@ #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/desc.h> +#include <asm/sections.h> #undef pr_fmt #define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 752dbf4e0e50..9517d1b2a281 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -7,7 +7,6 @@ #include <linux/export.h> #include <linux/cpu.h> #include <linux/debugfs.h> -#include <linux/gfp.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> @@ -186,11 +185,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, { struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); - bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy); unsigned cpu = smp_processor_id(); u64 next_tlb_gen; - bool need_flush; - u16 new_asid; /* * NB: The scheduler will call us with prev == next when switching @@ -244,41 +240,20 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, next->context.ctx_id); /* - * Even in lazy TLB mode, the CPU should stay set in the - * mm_cpumask. The TLB shootdown code can figure out from - * from cpu_tlbstate.is_lazy whether or not to send an IPI. + * We don't currently support having a real mm loaded without + * our cpu set in mm_cpumask(). We have all the bookkeeping + * in place to figure out whether we would need to flush + * if our cpu were cleared in mm_cpumask(), but we don't + * currently use it. */ if (WARN_ON_ONCE(real_prev != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next)))) cpumask_set_cpu(cpu, mm_cpumask(next)); - /* - * If the CPU is not in lazy TLB mode, we are just switching - * from one thread in a process to another thread in the same - * process. No TLB flush required. - */ - if (!was_lazy) - return; - - /* - * Read the tlb_gen to check whether a flush is needed. - * If the TLB is up to date, just use it. - * The barrier synchronizes with the tlb_gen increment in - * the TLB shootdown code. - */ - smp_mb(); - next_tlb_gen = atomic64_read(&next->context.tlb_gen); - if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) == - next_tlb_gen) - return; - - /* - * TLB contents went out of date while we were in lazy - * mode. Fall through to the TLB switching code below. - */ - new_asid = prev_asid; - need_flush = true; + return; } else { + u16 new_asid; + bool need_flush; u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id); /* @@ -329,41 +304,41 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, next_tlb_gen = atomic64_read(&next->context.tlb_gen); choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); - } - if (need_flush) { - this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); - this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); - load_new_mm_cr3(next->pgd, new_asid, true); + if (need_flush) { + this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); + load_new_mm_cr3(next->pgd, new_asid, true); + + /* + * NB: This gets called via leave_mm() in the idle path + * where RCU functions differently. Tracing normally + * uses RCU, so we need to use the _rcuidle variant. + * + * (There is no good reason for this. The idle code should + * be rearranged to call this before rcu_idle_enter().) + */ + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + } else { + /* The new ASID is already up to date. */ + load_new_mm_cr3(next->pgd, new_asid, false); + + /* See above wrt _rcuidle. */ + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); + } /* - * NB: This gets called via leave_mm() in the idle path - * where RCU functions differently. Tracing normally - * uses RCU, so we need to use the _rcuidle variant. - * - * (There is no good reason for this. The idle code should - * be rearranged to call this before rcu_idle_enter().) + * Record last user mm's context id, so we can avoid + * flushing branch buffer with IBPB if we switch back + * to the same user. */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); - } else { - /* The new ASID is already up to date. */ - load_new_mm_cr3(next->pgd, new_asid, false); + if (next != &init_mm) + this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id); - /* See above wrt _rcuidle. */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); + this_cpu_write(cpu_tlbstate.loaded_mm, next); + this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); } - /* - * Record last user mm's context id, so we can avoid - * flushing branch buffer with IBPB if we switch back - * to the same user. - */ - if (next != &init_mm) - this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id); - - this_cpu_write(cpu_tlbstate.loaded_mm, next); - this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); - load_mm_cr4(next); switch_ldt(real_prev, next); } @@ -386,7 +361,20 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) return; - this_cpu_write(cpu_tlbstate.is_lazy, true); + if (tlb_defer_switch_to_init_mm()) { + /* + * There's a significant optimization that may be possible + * here. We have accurate enough TLB flush tracking that we + * don't need to maintain coherence of TLB per se when we're + * lazy. We do, however, need to maintain coherence of + * paging-structure caches. We could, in principle, leave our + * old mm loaded and only switch to init_mm when + * tlb_remove_page() happens. + */ + this_cpu_write(cpu_tlbstate.is_lazy, true); + } else { + switch_mm(NULL, &init_mm, NULL); + } } /* @@ -473,9 +461,6 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, * paging-structure cache to avoid speculatively reading * garbage into our TLB. Since switching to init_mm is barely * slower than a minimal flush, just switch to init_mm. - * - * This should be rare, with native_flush_tlb_others skipping - * IPIs to lazy TLB mode CPUs. */ switch_mm_irqs_off(NULL, &init_mm, NULL); return; @@ -582,9 +567,6 @@ static void flush_tlb_func_remote(void *info) void native_flush_tlb_others(const struct cpumask *cpumask, const struct flush_tlb_info *info) { - cpumask_var_t lazymask; - unsigned int cpu; - count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); if (info->end == TLB_FLUSH_ALL) trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL); @@ -608,6 +590,8 @@ void native_flush_tlb_others(const struct cpumask *cpumask, * that UV should be updated so that smp_call_function_many(), * etc, are optimal on UV. */ + unsigned int cpu; + cpu = smp_processor_id(); cpumask = uv_flush_tlb_others(cpumask, info); if (cpumask) @@ -615,29 +599,8 @@ void native_flush_tlb_others(const struct cpumask *cpumask, (void *)info, 1); return; } - - /* - * A temporary cpumask is used in order to skip sending IPIs - * to CPUs in lazy TLB state, while keeping them in mm_cpumask(mm). - * If the allocation fails, simply IPI every CPU in mm_cpumask. - */ - if (!alloc_cpumask_var(&lazymask, GFP_ATOMIC)) { - smp_call_function_many(cpumask, flush_tlb_func_remote, - (void *)info, 1); - return; - } - - cpumask_copy(lazymask, cpumask); - - for_each_cpu(cpu, lazymask) { - if (per_cpu(cpu_tlbstate.is_lazy, cpu)) - cpumask_clear_cpu(cpu, lazymask); - } - - smp_call_function_many(lazymask, flush_tlb_func_remote, + smp_call_function_many(cpumask, flush_tlb_func_remote, (void *)info, 1); - - free_cpumask_var(lazymask); } /* @@ -690,68 +653,6 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, put_cpu(); } -void tlb_flush_remove_tables_local(void *arg) -{ - struct mm_struct *mm = arg; - - if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm && - this_cpu_read(cpu_tlbstate.is_lazy)) { - /* - * We're in lazy mode. We need to at least flush our - * paging-structure cache to avoid speculatively reading - * garbage into our TLB. Since switching to init_mm is barely - * slower than a minimal flush, just switch to init_mm. - */ - switch_mm_irqs_off(NULL, &init_mm, NULL); - } -} - -static void mm_fill_lazy_tlb_cpu_mask(struct mm_struct *mm, - struct cpumask *lazy_cpus) -{ - int cpu; - - for_each_cpu(cpu, mm_cpumask(mm)) { - if (!per_cpu(cpu_tlbstate.is_lazy, cpu)) - cpumask_set_cpu(cpu, lazy_cpus); - } -} - -void tlb_flush_remove_tables(struct mm_struct *mm) -{ - int cpu = get_cpu(); - cpumask_var_t lazy_cpus; - - if (cpumask_any_but(mm_cpumask(mm), cpu) >= nr_cpu_ids) { - put_cpu(); - return; - } - - if (!zalloc_cpumask_var(&lazy_cpus, GFP_ATOMIC)) { - /* - * If the cpumask allocation fails, do a brute force flush - * on all the CPUs that have this mm loaded. - */ - smp_call_function_many(mm_cpumask(mm), - tlb_flush_remove_tables_local, (void *)mm, 1); - put_cpu(); - return; - } - - /* - * CPUs with !is_lazy either received a TLB flush IPI while the user - * pages in this address range were unmapped, or have context switched - * and reloaded %CR3 since then. - * - * Shootdown IPIs at page table freeing time only need to be sent to - * CPUs that may have out of date TLB contents. - */ - mm_fill_lazy_tlb_cpu_mask(mm, lazy_cpus); - smp_call_function_many(lazy_cpus, - tlb_flush_remove_tables_local, (void *)mm, 1); - free_cpumask_var(lazy_cpus); - put_cpu(); -} static void do_flush_tlb_all(void *info) { diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 563049c483a1..d4ec117c1142 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -22,7 +22,6 @@ unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 | PCI_PROBE_MMCONF; -unsigned int pci_early_dump_regs; static int pci_bf_sort; int pci_routeirq; int noioapicquirk; @@ -599,9 +598,6 @@ char *__init pcibios_setup(char *str) pci_probe |= PCI_BIG_ROOT_WINDOW; return NULL; #endif - } else if (!strcmp(str, "earlydump")) { - pci_early_dump_regs = 1; - return NULL; } else if (!strcmp(str, "routeirq")) { pci_routeirq = 1; return NULL; diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c index e5f753cbb1c3..f5fc953e5848 100644 --- a/arch/x86/pci/early.c +++ b/arch/x86/pci/early.c @@ -57,47 +57,3 @@ int early_pci_allowed(void) PCI_PROBE_CONF1; } -void early_dump_pci_device(u8 bus, u8 slot, u8 func) -{ - u32 value[256 / 4]; - int i; - - pr_info("pci 0000:%02x:%02x.%d config space:\n", bus, slot, func); - - for (i = 0; i < 256; i += 4) - value[i / 4] = read_pci_config(bus, slot, func, i); - - print_hex_dump(KERN_INFO, "", DUMP_PREFIX_OFFSET, 16, 1, value, 256, false); -} - -void early_dump_pci_devices(void) -{ - unsigned bus, slot, func; - - if (!early_pci_allowed()) - return; - - for (bus = 0; bus < 256; bus++) { - for (slot = 0; slot < 32; slot++) { - for (func = 0; func < 8; func++) { - u32 class; - u8 type; - - class = read_pci_config(bus, slot, func, - PCI_CLASS_REVISION); - if (class == 0xffffffff) - continue; - - early_dump_pci_device(bus, slot, func); - - if (func == 0) { - type = read_pci_config_byte(bus, slot, - func, - PCI_HEADER_TYPE); - if (!(type & 0x80)) - break; - } - } - } - } -} diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c index 4f5fa65a1011..2acd6be13375 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c @@ -18,6 +18,7 @@ #include <asm/intel-mid.h> #include <asm/intel_scu_ipc.h> #include <asm/io_apic.h> +#include <asm/hw_irq.h> #define TANGIER_EXT_TIMER0_MSI 12 diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index e26dfad507c8..a4130b84d1ff 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1285,6 +1285,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs) struct msg_desc msgdesc; ack_APIC_irq(); + kvm_set_cpu_l1tf_flush_l1d(); time_start = get_cycles(); bcp = &per_cpu(bau_control, smp_processor_id()); diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 67ccf64c8bd8..f8e3b668d20b 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -233,29 +233,35 @@ struct restore_data_record { */ static int get_e820_md5(struct e820_table *table, void *buf) { - struct scatterlist sg; - struct crypto_ahash *tfm; + struct crypto_shash *tfm; + struct shash_desc *desc; int size; int ret = 0; - tfm = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC); + tfm = crypto_alloc_shash("md5", 0, 0); if (IS_ERR(tfm)) return -ENOMEM; - { - AHASH_REQUEST_ON_STACK(req, tfm); - size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry) * table->nr_entries; - ahash_request_set_tfm(req, tfm); - sg_init_one(&sg, (u8 *)table, size); - ahash_request_set_callback(req, 0, NULL, NULL); - ahash_request_set_crypt(req, &sg, buf, size); - - if (crypto_ahash_digest(req)) - ret = -EINVAL; - ahash_request_zero(req); + desc = kmalloc(sizeof(struct shash_desc) + crypto_shash_descsize(tfm), + GFP_KERNEL); + if (!desc) { + ret = -ENOMEM; + goto free_tfm; } - crypto_free_ahash(tfm); + desc->tfm = tfm; + desc->flags = 0; + + size = offsetof(struct e820_table, entries) + + sizeof(struct e820_entry) * table->nr_entries; + + if (crypto_shash_digest(desc, (u8 *)table, size, buf)) + ret = -EINVAL; + + kzfree(desc); + +free_tfm: + crypto_free_shash(tfm); return ret; } diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 81a8e33115ad..3cf302b26332 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -28,9 +28,8 @@ $(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE targets += kexec-purgatory.c -CMD_BIN2C = $(objtree)/scripts/basic/bin2c quiet_cmd_bin2c = BIN2C $@ - cmd_bin2c = $(CMD_BIN2C) kexec_purgatory < $< > $@ + cmd_bin2c = $(objtree)/scripts/bin2c kexec_purgatory < $< > $@ $(obj)/kexec-purgatory.c: $(obj)/purgatory.ro FORCE $(call if_changed,bin2c) diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig index 9d529f22fd9d..f518b4744ff8 100644 --- a/arch/x86/um/Kconfig +++ b/arch/x86/um/Kconfig @@ -1,13 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -mainmenu "User Mode Linux/$(SUBARCH) $(KERNELVERSION) Kernel Configuration" - -comment "Compiler: $(CC_VERSION_TEXT)" - -source "scripts/Kconfig.include" - -source "arch/um/Kconfig.common" - -menu "UML-specific options" menu "Host processor type and features" @@ -66,9 +57,3 @@ config ARCH_REUSE_HOST_VSYSCALL_AREA config GENERIC_HWEIGHT def_bool y - -source "arch/um/Kconfig.um" - -endmenu - -source "arch/um/Kconfig.rest" diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 3b5318505c69..2eeddd814653 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -3,6 +3,7 @@ #endif #include <linux/cpu.h> #include <linux/kexec.h> +#include <linux/slab.h> #include <xen/features.h> #include <xen/page.h> diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 105a57d73701..52a7c3faee0c 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -122,6 +122,8 @@ static void __init xen_banner(void) static void __init xen_pv_init_platform(void) { + populate_extra_pte(fix_to_virt(FIX_PARAVIRT_BOOTMAP)); + set_fixmap(FIX_PARAVIRT_BOOTMAP, xen_start_info->shared_info); HYPERVISOR_shared_info = (void *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); @@ -1170,13 +1172,13 @@ static void __init xen_boot_params_init_edd(void) * we do this, we have to be careful not to call any stack-protected * function, which is most of the kernel. */ -static void xen_setup_gdt(int cpu) +static void __init xen_setup_gdt(int cpu) { pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot; pv_cpu_ops.load_gdt = xen_load_gdt_boot; - setup_stack_canary_segment(0); - switch_to_new_gdt(0); + setup_stack_canary_segment(cpu); + switch_to_new_gdt(cpu); pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry; pv_cpu_ops.load_gdt = xen_load_gdt; @@ -1256,6 +1258,9 @@ asmlinkage __visible void __init xen_start_kernel(void) get_cpu_cap(&boot_cpu_data); x86_configure_nx(); + /* Determine virtual and physical address sizes */ + get_cpu_address_sizes(&boot_cpu_data); + /* Let's presume PV guests always boot on vCPU with id 0. */ per_cpu(xen_vcpu_id, 0) = 0; @@ -1382,8 +1387,11 @@ asmlinkage __visible void __init xen_start_kernel(void) xen_boot_params_init_edd(); } - add_preferred_console("tty", 0, NULL); + if (!boot_params.screen_info.orig_video_isVGA) + add_preferred_console("tty", 0, NULL); add_preferred_console("hvc", 0, NULL); + if (boot_params.screen_info.orig_video_isVGA) + add_preferred_console("tty", 0, NULL); #ifdef CONFIG_PCI /* PCI BIOS service won't work from a PV guest. */ diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 52206ad81e4b..45b700ac5fe7 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -67,6 +67,7 @@ #include <asm/init.h> #include <asm/pat.h> #include <asm/smp.h> +#include <asm/tlb.h> #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> @@ -2171,6 +2172,8 @@ void __init xen_relocate_p2m(void) #else /* !CONFIG_X86_64 */ static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD); +RESERVE_BRK(fixup_kernel_pmd, PAGE_SIZE); +RESERVE_BRK(fixup_kernel_pte, PAGE_SIZE); static void __init xen_write_cr3_init(unsigned long cr3) { @@ -2397,6 +2400,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { .flush_tlb_kernel = xen_flush_tlb, .flush_tlb_one_user = xen_flush_tlb_one_user, .flush_tlb_others = xen_flush_tlb_others, + .tlb_remove_table = tlb_remove_table, .pgd_alloc = xen_pgd_alloc, .pgd_free = xen_pgd_free, diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index dc502ca8263e..2bce7958ce8b 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -80,9 +80,9 @@ void xen_mc_flush(void) and just do the call directly. */ mc = &b->entries[0]; - mc->result = privcmd_call(mc->op, - mc->args[0], mc->args[1], mc->args[2], - mc->args[3], mc->args[4]); + mc->result = xen_single_call(mc->op, mc->args[0], mc->args[1], + mc->args[2], mc->args[3], + mc->args[4]); ret = mc->result < 0; break; diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 6e0d2086eacb..1163e33121fb 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -906,37 +906,6 @@ char * __init xen_memory_setup(void) } /* - * Machine specific memory setup for auto-translated guests. - */ -char * __init xen_auto_xlated_memory_setup(void) -{ - struct xen_memory_map memmap; - int i; - int rc; - - memmap.nr_entries = ARRAY_SIZE(xen_e820_table.entries); - set_xen_guest_handle(memmap.buffer, xen_e820_table.entries); - - rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); - if (rc < 0) - panic("No memory map (%d)\n", rc); - - xen_e820_table.nr_entries = memmap.nr_entries; - - e820__update_table(&xen_e820_table); - - for (i = 0; i < xen_e820_table.nr_entries; i++) - e820__range_add(xen_e820_table.entries[i].addr, xen_e820_table.entries[i].size, xen_e820_table.entries[i].type); - - /* Remove p2m info, it is not needed. */ - xen_start_info->mfn_list = 0; - xen_start_info->first_p2m_pfn = 0; - xen_start_info->nr_p2m_frames = 0; - - return "Xen"; -} - -/* * Set the bit indicating "nosegneg" library variants should be used. * We only need to bother in pure 32-bit mode; compat 32-bit processes * can have un-truncated segments, so wrapping around is allowed. diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index cd97a62394e7..973f10e05211 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -130,6 +130,10 @@ PV_CALLEE_SAVE_REGS_THUNK(xen_vcpu_stolen); void __init xen_init_spinlocks(void) { + /* Don't need to use pvqspinlock code if there is only 1 vCPU. */ + if (num_possible_cpus() == 1) + xen_pvspin = false; + if (!xen_pvspin) { printk(KERN_DEBUG "xen: PV spinlocks disabled\n"); return; diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index e78684597f57..0e60bd918695 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -50,7 +50,6 @@ void __init xen_inv_extra_mem(void); void __init xen_remap_memory(void); phys_addr_t __init xen_find_free_area(phys_addr_t size); char * __init xen_memory_setup(void); -char * xen_auto_xlated_memory_setup(void); void __init xen_arch_setup(void); void xen_enable_sysenter(void); void xen_enable_syscall(void); |