diff options
Diffstat (limited to 'arch/x86')
115 files changed, 5307 insertions, 2144 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5aa4c2ecf5c7..f9920f1341c8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -94,7 +94,6 @@ config X86 select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_UBSAN_SANITIZE_ALL - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HAS_DEBUG_WX select ARCH_HAS_ZONE_DMA_SET if EXPERT select ARCH_HAVE_NMI_SAFE_CMPXCHG @@ -186,8 +185,8 @@ config X86 select HAVE_ASM_MODVERSIONS select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL - select HAVE_CONTEXT_TRACKING if X86_64 - select HAVE_CONTEXT_TRACKING_OFFSTACK if HAVE_CONTEXT_TRACKING + select HAVE_CONTEXT_TRACKING_USER if X86_64 + select HAVE_CONTEXT_TRACKING_USER_OFFSTACK if HAVE_CONTEXT_TRACKING_USER select HAVE_C_RECORDMCOUNT select HAVE_OBJTOOL_MCOUNT if HAVE_OBJTOOL select HAVE_BUILDTIME_MCOUNT_SORT @@ -280,7 +279,6 @@ config X86 select TRACE_IRQFLAGS_SUPPORT select TRACE_IRQFLAGS_NMI_SUPPORT select USER_STACKTRACE_SUPPORT - select VIRT_TO_BUS select HAVE_ARCH_KCSAN if X86_64 select X86_FEATURE_NAMES if PROC_FS select PROC_PID_ARCH_STATUS if PROC_FS @@ -1812,15 +1810,6 @@ config ARCH_USES_PG_UNCACHED def_bool y depends on X86_PAT -config ARCH_RANDOM - def_bool y - prompt "x86 architectural random number generator" if EXPERT - help - Enable the x86 architectural RDRAND instruction - (Intel Bull Mountain technology) to generate random numbers. - If supported, this is a high bandwidth, cryptographically - secure hardware random number generator. - config X86_UMIP def_bool y prompt "User Mode Instruction Prevention" if EXPERT diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 98a4852ed6a0..7207219509f6 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -135,7 +135,6 @@ CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_DEBUG_DEVRES=y CONFIG_CONNECTOR=y -CONFIG_EFI_VARS=y CONFIG_EFI_CAPSULE_LOADER=y CONFIG_BLK_DEV_LOOP=y CONFIG_VIRTIO_BLK=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 69784505a7a8..5ce67b73e218 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -134,7 +134,6 @@ CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_DEBUG_DEVRES=y CONFIG_CONNECTOR=y -CONFIG_EFI_VARS=y CONFIG_BLK_DEV_LOOP=y CONFIG_VIRTIO_BLK=y CONFIG_BLK_DEV_SD=y diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 2831685adf6f..04d07ab744b2 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -61,14 +61,15 @@ sha256-ssse3-$(CONFIG_AS_SHA256_NI) += sha256_ni_asm.o obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o -obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o -blake2s-x86_64-y := blake2s-shash.o -obj-$(if $(CONFIG_CRYPTO_BLAKE2S_X86),y) += libblake2s-x86_64.o +obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += libblake2s-x86_64.o libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o +obj-$(CONFIG_CRYPTO_POLYVAL_CLMUL_NI) += polyval-clmulni.o +polyval-clmulni-y := polyval-clmulni_asm.o polyval-clmulni_glue.o + obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o crc32c-intel-y := crc32c-intel_glue.o crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S index 43852ba6e19c..2402b9418cd7 100644 --- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S +++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S @@ -23,6 +23,11 @@ #define VMOVDQ vmovdqu +/* + * Note: the "x" prefix in these aliases means "this is an xmm register". The + * alias prefixes have no relation to XCTR where the "X" prefix means "XOR + * counter". + */ #define xdata0 %xmm0 #define xdata1 %xmm1 #define xdata2 %xmm2 @@ -31,8 +36,10 @@ #define xdata5 %xmm5 #define xdata6 %xmm6 #define xdata7 %xmm7 -#define xcounter %xmm8 -#define xbyteswap %xmm9 +#define xcounter %xmm8 // CTR mode only +#define xiv %xmm8 // XCTR mode only +#define xbyteswap %xmm9 // CTR mode only +#define xtmp %xmm9 // XCTR mode only #define xkey0 %xmm10 #define xkey4 %xmm11 #define xkey8 %xmm12 @@ -45,7 +52,7 @@ #define p_keys %rdx #define p_out %rcx #define num_bytes %r8 - +#define counter %r9 // XCTR mode only #define tmp %r10 #define DDQ_DATA 0 #define XDATA 1 @@ -102,7 +109,7 @@ ddq_add_8: * do_aes num_in_par load_keys key_len * This increments p_in, but not p_out */ -.macro do_aes b, k, key_len +.macro do_aes b, k, key_len, xctr .set by, \b .set load_keys, \k .set klen, \key_len @@ -111,29 +118,48 @@ ddq_add_8: vmovdqa 0*16(p_keys), xkey0 .endif - vpshufb xbyteswap, xcounter, xdata0 - - .set i, 1 - .rept (by - 1) - club XDATA, i - vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata - vptest ddq_low_msk(%rip), var_xdata - jnz 1f - vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata - vpaddq ddq_high_add_1(%rip), xcounter, xcounter - 1: - vpshufb xbyteswap, var_xdata, var_xdata - .set i, (i +1) - .endr + .if \xctr + movq counter, xtmp + .set i, 0 + .rept (by) + club XDATA, i + vpaddq (ddq_add_1 + 16 * i)(%rip), xtmp, var_xdata + .set i, (i +1) + .endr + .set i, 0 + .rept (by) + club XDATA, i + vpxor xiv, var_xdata, var_xdata + .set i, (i +1) + .endr + .else + vpshufb xbyteswap, xcounter, xdata0 + .set i, 1 + .rept (by - 1) + club XDATA, i + vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata + vptest ddq_low_msk(%rip), var_xdata + jnz 1f + vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata + vpaddq ddq_high_add_1(%rip), xcounter, xcounter + 1: + vpshufb xbyteswap, var_xdata, var_xdata + .set i, (i +1) + .endr + .endif vmovdqa 1*16(p_keys), xkeyA vpxor xkey0, xdata0, xdata0 - vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter - vptest ddq_low_msk(%rip), xcounter - jnz 1f - vpaddq ddq_high_add_1(%rip), xcounter, xcounter - 1: + .if \xctr + add $by, counter + .else + vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter + vptest ddq_low_msk(%rip), xcounter + jnz 1f + vpaddq ddq_high_add_1(%rip), xcounter, xcounter + 1: + .endif .set i, 1 .rept (by - 1) @@ -371,94 +397,99 @@ ddq_add_8: .endr .endm -.macro do_aes_load val, key_len - do_aes \val, 1, \key_len +.macro do_aes_load val, key_len, xctr + do_aes \val, 1, \key_len, \xctr .endm -.macro do_aes_noload val, key_len - do_aes \val, 0, \key_len +.macro do_aes_noload val, key_len, xctr + do_aes \val, 0, \key_len, \xctr .endm /* main body of aes ctr load */ -.macro do_aes_ctrmain key_len +.macro do_aes_ctrmain key_len, xctr cmp $16, num_bytes - jb .Ldo_return2\key_len + jb .Ldo_return2\xctr\key_len - vmovdqa byteswap_const(%rip), xbyteswap - vmovdqu (p_iv), xcounter - vpshufb xbyteswap, xcounter, xcounter + .if \xctr + shr $4, counter + vmovdqu (p_iv), xiv + .else + vmovdqa byteswap_const(%rip), xbyteswap + vmovdqu (p_iv), xcounter + vpshufb xbyteswap, xcounter, xcounter + .endif mov num_bytes, tmp and $(7*16), tmp - jz .Lmult_of_8_blks\key_len + jz .Lmult_of_8_blks\xctr\key_len /* 1 <= tmp <= 7 */ cmp $(4*16), tmp - jg .Lgt4\key_len - je .Leq4\key_len + jg .Lgt4\xctr\key_len + je .Leq4\xctr\key_len -.Llt4\key_len: +.Llt4\xctr\key_len: cmp $(2*16), tmp - jg .Leq3\key_len - je .Leq2\key_len + jg .Leq3\xctr\key_len + je .Leq2\xctr\key_len -.Leq1\key_len: - do_aes_load 1, \key_len +.Leq1\xctr\key_len: + do_aes_load 1, \key_len, \xctr add $(1*16), p_out and $(~7*16), num_bytes - jz .Ldo_return2\key_len - jmp .Lmain_loop2\key_len + jz .Ldo_return2\xctr\key_len + jmp .Lmain_loop2\xctr\key_len -.Leq2\key_len: - do_aes_load 2, \key_len +.Leq2\xctr\key_len: + do_aes_load 2, \key_len, \xctr add $(2*16), p_out and $(~7*16), num_bytes - jz .Ldo_return2\key_len - jmp .Lmain_loop2\key_len + jz .Ldo_return2\xctr\key_len + jmp .Lmain_loop2\xctr\key_len -.Leq3\key_len: - do_aes_load 3, \key_len +.Leq3\xctr\key_len: + do_aes_load 3, \key_len, \xctr add $(3*16), p_out and $(~7*16), num_bytes - jz .Ldo_return2\key_len - jmp .Lmain_loop2\key_len + jz .Ldo_return2\xctr\key_len + jmp .Lmain_loop2\xctr\key_len -.Leq4\key_len: - do_aes_load 4, \key_len +.Leq4\xctr\key_len: + do_aes_load 4, \key_len, \xctr add $(4*16), p_out and $(~7*16), num_bytes - jz .Ldo_return2\key_len - jmp .Lmain_loop2\key_len + jz .Ldo_return2\xctr\key_len + jmp .Lmain_loop2\xctr\key_len -.Lgt4\key_len: +.Lgt4\xctr\key_len: cmp $(6*16), tmp - jg .Leq7\key_len - je .Leq6\key_len + jg .Leq7\xctr\key_len + je .Leq6\xctr\key_len -.Leq5\key_len: - do_aes_load 5, \key_len +.Leq5\xctr\key_len: + do_aes_load 5, \key_len, \xctr add $(5*16), p_out and $(~7*16), num_bytes - jz .Ldo_return2\key_len - jmp .Lmain_loop2\key_len + jz .Ldo_return2\xctr\key_len + jmp .Lmain_loop2\xctr\key_len -.Leq6\key_len: - do_aes_load 6, \key_len +.Leq6\xctr\key_len: + do_aes_load 6, \key_len, \xctr add $(6*16), p_out and $(~7*16), num_bytes - jz .Ldo_return2\key_len - jmp .Lmain_loop2\key_len + jz .Ldo_return2\xctr\key_len + jmp .Lmain_loop2\xctr\key_len -.Leq7\key_len: - do_aes_load 7, \key_len +.Leq7\xctr\key_len: + do_aes_load 7, \key_len, \xctr add $(7*16), p_out and $(~7*16), num_bytes - jz .Ldo_return2\key_len - jmp .Lmain_loop2\key_len + jz .Ldo_return2\xctr\key_len + jmp .Lmain_loop2\xctr\key_len -.Lmult_of_8_blks\key_len: +.Lmult_of_8_blks\xctr\key_len: .if (\key_len != KEY_128) vmovdqa 0*16(p_keys), xkey0 vmovdqa 4*16(p_keys), xkey4 @@ -471,17 +502,19 @@ ddq_add_8: vmovdqa 9*16(p_keys), xkey12 .endif .align 16 -.Lmain_loop2\key_len: +.Lmain_loop2\xctr\key_len: /* num_bytes is a multiple of 8 and >0 */ - do_aes_noload 8, \key_len + do_aes_noload 8, \key_len, \xctr add $(8*16), p_out sub $(8*16), num_bytes - jne .Lmain_loop2\key_len + jne .Lmain_loop2\xctr\key_len -.Ldo_return2\key_len: - /* return updated IV */ - vpshufb xbyteswap, xcounter, xcounter - vmovdqu xcounter, (p_iv) +.Ldo_return2\xctr\key_len: + .if !\xctr + /* return updated IV */ + vpshufb xbyteswap, xcounter, xcounter + vmovdqu xcounter, (p_iv) + .endif RET .endm @@ -494,7 +527,7 @@ ddq_add_8: */ SYM_FUNC_START(aes_ctr_enc_128_avx_by8) /* call the aes main loop */ - do_aes_ctrmain KEY_128 + do_aes_ctrmain KEY_128 0 SYM_FUNC_END(aes_ctr_enc_128_avx_by8) @@ -507,7 +540,7 @@ SYM_FUNC_END(aes_ctr_enc_128_avx_by8) */ SYM_FUNC_START(aes_ctr_enc_192_avx_by8) /* call the aes main loop */ - do_aes_ctrmain KEY_192 + do_aes_ctrmain KEY_192 0 SYM_FUNC_END(aes_ctr_enc_192_avx_by8) @@ -520,6 +553,45 @@ SYM_FUNC_END(aes_ctr_enc_192_avx_by8) */ SYM_FUNC_START(aes_ctr_enc_256_avx_by8) /* call the aes main loop */ - do_aes_ctrmain KEY_256 + do_aes_ctrmain KEY_256 0 SYM_FUNC_END(aes_ctr_enc_256_avx_by8) + +/* + * routine to do AES128 XCTR enc/decrypt "by8" + * XMM registers are clobbered. + * Saving/restoring must be done at a higher level + * aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, const void *keys, + * u8* out, unsigned int num_bytes, unsigned int byte_ctr) + */ +SYM_FUNC_START(aes_xctr_enc_128_avx_by8) + /* call the aes main loop */ + do_aes_ctrmain KEY_128 1 + +SYM_FUNC_END(aes_xctr_enc_128_avx_by8) + +/* + * routine to do AES192 XCTR enc/decrypt "by8" + * XMM registers are clobbered. + * Saving/restoring must be done at a higher level + * aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, const void *keys, + * u8* out, unsigned int num_bytes, unsigned int byte_ctr) + */ +SYM_FUNC_START(aes_xctr_enc_192_avx_by8) + /* call the aes main loop */ + do_aes_ctrmain KEY_192 1 + +SYM_FUNC_END(aes_xctr_enc_192_avx_by8) + +/* + * routine to do AES256 XCTR enc/decrypt "by8" + * XMM registers are clobbered. + * Saving/restoring must be done at a higher level + * aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, const void *keys, + * u8* out, unsigned int num_bytes, unsigned int byte_ctr) + */ +SYM_FUNC_START(aes_xctr_enc_256_avx_by8) + /* call the aes main loop */ + do_aes_ctrmain KEY_256 1 + +SYM_FUNC_END(aes_xctr_enc_256_avx_by8) diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 41901ba9d3a2..a5b0cb3efeba 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -135,6 +135,20 @@ asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv, void *keys, u8 *out, unsigned int num_bytes); asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv, void *keys, u8 *out, unsigned int num_bytes); + + +asmlinkage void aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, + const void *keys, u8 *out, unsigned int num_bytes, + unsigned int byte_ctr); + +asmlinkage void aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, + const void *keys, u8 *out, unsigned int num_bytes, + unsigned int byte_ctr); + +asmlinkage void aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, + const void *keys, u8 *out, unsigned int num_bytes, + unsigned int byte_ctr); + /* * asmlinkage void aesni_gcm_init_avx_gen2() * gcm_data *my_ctx_data, context data @@ -527,6 +541,59 @@ static int ctr_crypt(struct skcipher_request *req) return err; } +static void aesni_xctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, unsigned int len, u8 *iv, + unsigned int byte_ctr) +{ + if (ctx->key_length == AES_KEYSIZE_128) + aes_xctr_enc_128_avx_by8(in, iv, (void *)ctx, out, len, + byte_ctr); + else if (ctx->key_length == AES_KEYSIZE_192) + aes_xctr_enc_192_avx_by8(in, iv, (void *)ctx, out, len, + byte_ctr); + else + aes_xctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len, + byte_ctr); +} + +static int xctr_crypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm)); + u8 keystream[AES_BLOCK_SIZE]; + struct skcipher_walk walk; + unsigned int nbytes; + unsigned int byte_ctr = 0; + int err; + __le32 block[AES_BLOCK_SIZE / sizeof(__le32)]; + + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) > 0) { + kernel_fpu_begin(); + if (nbytes & AES_BLOCK_MASK) + aesni_xctr_enc_avx_tfm(ctx, walk.dst.virt.addr, + walk.src.virt.addr, nbytes & AES_BLOCK_MASK, + walk.iv, byte_ctr); + nbytes &= ~AES_BLOCK_MASK; + byte_ctr += walk.nbytes - nbytes; + + if (walk.nbytes == walk.total && nbytes > 0) { + memcpy(block, walk.iv, AES_BLOCK_SIZE); + block[0] ^= cpu_to_le32(1 + byte_ctr / AES_BLOCK_SIZE); + aesni_enc(ctx, keystream, (u8 *)block); + crypto_xor_cpy(walk.dst.virt.addr + walk.nbytes - + nbytes, walk.src.virt.addr + walk.nbytes + - nbytes, keystream, nbytes); + byte_ctr += nbytes; + nbytes = 0; + } + kernel_fpu_end(); + err = skcipher_walk_done(&walk, nbytes); + } + return err; +} + static int rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len) { @@ -1051,6 +1118,33 @@ static struct simd_skcipher_alg *aesni_simd_skciphers[ARRAY_SIZE(aesni_skciphers)]; #ifdef CONFIG_X86_64 +/* + * XCTR does not have a non-AVX implementation, so it must be enabled + * conditionally. + */ +static struct skcipher_alg aesni_xctr = { + .base = { + .cra_name = "__xctr(aes)", + .cra_driver_name = "__xctr-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = CRYPTO_AES_CTX_SIZE, + .cra_module = THIS_MODULE, + }, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .chunksize = AES_BLOCK_SIZE, + .setkey = aesni_skcipher_setkey, + .encrypt = xctr_crypt, + .decrypt = xctr_crypt, +}; + +static struct simd_skcipher_alg *aesni_simd_xctr; +#endif /* CONFIG_X86_64 */ + +#ifdef CONFIG_X86_64 static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key, unsigned int key_len) { @@ -1163,7 +1257,7 @@ static int __init aesni_init(void) static_call_update(aesni_ctr_enc_tfm, aesni_ctr_enc_avx_tfm); pr_info("AES CTR mode by8 optimization enabled\n"); } -#endif +#endif /* CONFIG_X86_64 */ err = crypto_register_alg(&aesni_cipher_alg); if (err) @@ -1180,8 +1274,22 @@ static int __init aesni_init(void) if (err) goto unregister_skciphers; +#ifdef CONFIG_X86_64 + if (boot_cpu_has(X86_FEATURE_AVX)) + err = simd_register_skciphers_compat(&aesni_xctr, 1, + &aesni_simd_xctr); + if (err) + goto unregister_aeads; +#endif /* CONFIG_X86_64 */ + return 0; +#ifdef CONFIG_X86_64 +unregister_aeads: + simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads), + aesni_simd_aeads); +#endif /* CONFIG_X86_64 */ + unregister_skciphers: simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), aesni_simd_skciphers); @@ -1197,6 +1305,10 @@ static void __exit aesni_exit(void) simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), aesni_simd_skciphers); crypto_unregister_alg(&aesni_cipher_alg); +#ifdef CONFIG_X86_64 + if (boot_cpu_has(X86_FEATURE_AVX)) + simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr); +#endif /* CONFIG_X86_64 */ } late_initcall(aesni_init); diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c index 69853c13e8fb..aaba21230528 100644 --- a/arch/x86/crypto/blake2s-glue.c +++ b/arch/x86/crypto/blake2s-glue.c @@ -4,7 +4,6 @@ */ #include <crypto/internal/blake2s.h> -#include <crypto/internal/simd.h> #include <linux/types.h> #include <linux/jump_label.h> @@ -33,7 +32,7 @@ void blake2s_compress(struct blake2s_state *state, const u8 *block, /* SIMD disables preemption, so relax after processing each page. */ BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8); - if (!static_branch_likely(&blake2s_use_ssse3) || !crypto_simd_usable()) { + if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) { blake2s_compress_generic(state, block, nblocks, inc); return; } diff --git a/arch/x86/crypto/blake2s-shash.c b/arch/x86/crypto/blake2s-shash.c deleted file mode 100644 index 59ae28abe35c..000000000000 --- a/arch/x86/crypto/blake2s-shash.c +++ /dev/null @@ -1,77 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. - */ - -#include <crypto/internal/blake2s.h> -#include <crypto/internal/simd.h> -#include <crypto/internal/hash.h> - -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/sizes.h> - -#include <asm/cpufeature.h> -#include <asm/processor.h> - -static int crypto_blake2s_update_x86(struct shash_desc *desc, - const u8 *in, unsigned int inlen) -{ - return crypto_blake2s_update(desc, in, inlen, false); -} - -static int crypto_blake2s_final_x86(struct shash_desc *desc, u8 *out) -{ - return crypto_blake2s_final(desc, out, false); -} - -#define BLAKE2S_ALG(name, driver_name, digest_size) \ - { \ - .base.cra_name = name, \ - .base.cra_driver_name = driver_name, \ - .base.cra_priority = 200, \ - .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \ - .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, \ - .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), \ - .base.cra_module = THIS_MODULE, \ - .digestsize = digest_size, \ - .setkey = crypto_blake2s_setkey, \ - .init = crypto_blake2s_init, \ - .update = crypto_blake2s_update_x86, \ - .final = crypto_blake2s_final_x86, \ - .descsize = sizeof(struct blake2s_state), \ - } - -static struct shash_alg blake2s_algs[] = { - BLAKE2S_ALG("blake2s-128", "blake2s-128-x86", BLAKE2S_128_HASH_SIZE), - BLAKE2S_ALG("blake2s-160", "blake2s-160-x86", BLAKE2S_160_HASH_SIZE), - BLAKE2S_ALG("blake2s-224", "blake2s-224-x86", BLAKE2S_224_HASH_SIZE), - BLAKE2S_ALG("blake2s-256", "blake2s-256-x86", BLAKE2S_256_HASH_SIZE), -}; - -static int __init blake2s_mod_init(void) -{ - if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3)) - return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs)); - return 0; -} - -static void __exit blake2s_mod_exit(void) -{ - if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3)) - crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs)); -} - -module_init(blake2s_mod_init); -module_exit(blake2s_mod_exit); - -MODULE_ALIAS_CRYPTO("blake2s-128"); -MODULE_ALIAS_CRYPTO("blake2s-128-x86"); -MODULE_ALIAS_CRYPTO("blake2s-160"); -MODULE_ALIAS_CRYPTO("blake2s-160-x86"); -MODULE_ALIAS_CRYPTO("blake2s-224"); -MODULE_ALIAS_CRYPTO("blake2s-224-x86"); -MODULE_ALIAS_CRYPTO("blake2s-256"); -MODULE_ALIAS_CRYPTO("blake2s-256-x86"); -MODULE_LICENSE("GPL v2"); diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index ba06322c1e39..019c64c1340a 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c @@ -144,7 +144,7 @@ static int cbc_encrypt(struct skcipher_request *req) err = skcipher_walk_virt(&walk, req, false); - while ((nbytes = walk.nbytes)) { + while (walk.nbytes) { nbytes = __cbc_encrypt(ctx, &walk); err = skcipher_walk_done(&walk, nbytes); } @@ -225,7 +225,7 @@ static int cbc_decrypt(struct skcipher_request *req) err = skcipher_walk_virt(&walk, req, false); - while ((nbytes = walk.nbytes)) { + while (walk.nbytes) { nbytes = __cbc_decrypt(ctx, &walk); err = skcipher_walk_done(&walk, nbytes); } diff --git a/arch/x86/crypto/crc32-pclmul_asm.S b/arch/x86/crypto/crc32-pclmul_asm.S index c392a6edbfff..ca53e96996ac 100644 --- a/arch/x86/crypto/crc32-pclmul_asm.S +++ b/arch/x86/crypto/crc32-pclmul_asm.S @@ -1,26 +1,4 @@ -/* GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see http://www.gnu.org/licenses - * - * Please visit http://www.xyratex.com/contact if you need additional - * information or have any questions. - * - * GPL HEADER END - */ - +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright 2012 Xyratex Technology Limited * diff --git a/arch/x86/crypto/polyval-clmulni_asm.S b/arch/x86/crypto/polyval-clmulni_asm.S new file mode 100644 index 000000000000..a6ebe4e7dd2b --- /dev/null +++ b/arch/x86/crypto/polyval-clmulni_asm.S @@ -0,0 +1,321 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2021 Google LLC + */ +/* + * This is an efficient implementation of POLYVAL using intel PCLMULQDQ-NI + * instructions. It works on 8 blocks at a time, by precomputing the first 8 + * keys powers h^8, ..., h^1 in the POLYVAL finite field. This precomputation + * allows us to split finite field multiplication into two steps. + * + * In the first step, we consider h^i, m_i as normal polynomials of degree less + * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication + * is simply polynomial multiplication. + * + * In the second step, we compute the reduction of p(x) modulo the finite field + * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1. + * + * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where + * multiplication is finite field multiplication. The advantage is that the + * two-step process only requires 1 finite field reduction for every 8 + * polynomial multiplications. Further parallelism is gained by interleaving the + * multiplications and polynomial reductions. + */ + +#include <linux/linkage.h> +#include <asm/frame.h> + +#define STRIDE_BLOCKS 8 + +#define GSTAR %xmm7 +#define PL %xmm8 +#define PH %xmm9 +#define TMP_XMM %xmm11 +#define LO %xmm12 +#define HI %xmm13 +#define MI %xmm14 +#define SUM %xmm15 + +#define KEY_POWERS %rdi +#define MSG %rsi +#define BLOCKS_LEFT %rdx +#define ACCUMULATOR %rcx +#define TMP %rax + +.section .rodata.cst16.gstar, "aM", @progbits, 16 +.align 16 + +.Lgstar: + .quad 0xc200000000000000, 0xc200000000000000 + +.text + +/* + * Performs schoolbook1_iteration on two lists of 128-bit polynomials of length + * count pointed to by MSG and KEY_POWERS. + */ +.macro schoolbook1 count + .set i, 0 + .rept (\count) + schoolbook1_iteration i 0 + .set i, (i +1) + .endr +.endm + +/* + * Computes the product of two 128-bit polynomials at the memory locations + * specified by (MSG + 16*i) and (KEY_POWERS + 16*i) and XORs the components of + * the 256-bit product into LO, MI, HI. + * + * Given: + * X = [X_1 : X_0] + * Y = [Y_1 : Y_0] + * + * We compute: + * LO += X_0 * Y_0 + * MI += X_0 * Y_1 + X_1 * Y_0 + * HI += X_1 * Y_1 + * + * Later, the 256-bit result can be extracted as: + * [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0] + * This step is done when computing the polynomial reduction for efficiency + * reasons. + * + * If xor_sum == 1, then also XOR the value of SUM into m_0. This avoids an + * extra multiplication of SUM and h^8. + */ +.macro schoolbook1_iteration i xor_sum + movups (16*\i)(MSG), %xmm0 + .if (\i == 0 && \xor_sum == 1) + pxor SUM, %xmm0 + .endif + vpclmulqdq $0x01, (16*\i)(KEY_POWERS), %xmm0, %xmm2 + vpclmulqdq $0x00, (16*\i)(KEY_POWERS), %xmm0, %xmm1 + vpclmulqdq $0x10, (16*\i)(KEY_POWERS), %xmm0, %xmm3 + vpclmulqdq $0x11, (16*\i)(KEY_POWERS), %xmm0, %xmm4 + vpxor %xmm2, MI, MI + vpxor %xmm1, LO, LO + vpxor %xmm4, HI, HI + vpxor %xmm3, MI, MI +.endm + +/* + * Performs the same computation as schoolbook1_iteration, except we expect the + * arguments to already be loaded into xmm0 and xmm1 and we set the result + * registers LO, MI, and HI directly rather than XOR'ing into them. + */ +.macro schoolbook1_noload + vpclmulqdq $0x01, %xmm0, %xmm1, MI + vpclmulqdq $0x10, %xmm0, %xmm1, %xmm2 + vpclmulqdq $0x00, %xmm0, %xmm1, LO + vpclmulqdq $0x11, %xmm0, %xmm1, HI + vpxor %xmm2, MI, MI +.endm + +/* + * Computes the 256-bit polynomial represented by LO, HI, MI. Stores + * the result in PL, PH. + * [PH : PL] = [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0] + */ +.macro schoolbook2 + vpslldq $8, MI, PL + vpsrldq $8, MI, PH + pxor LO, PL + pxor HI, PH +.endm + +/* + * Computes the 128-bit reduction of PH : PL. Stores the result in dest. + * + * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) = + * x^128 + x^127 + x^126 + x^121 + 1. + * + * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the + * product of two 128-bit polynomials in Montgomery form. We need to reduce it + * mod g(x). Also, since polynomials in Montgomery form have an "extra" factor + * of x^128, this product has two extra factors of x^128. To get it back into + * Montgomery form, we need to remove one of these factors by dividing by x^128. + * + * To accomplish both of these goals, we add multiples of g(x) that cancel out + * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low + * bits are zero, the polynomial division by x^128 can be done by right shifting. + * + * Since the only nonzero term in the low 64 bits of g(x) is the constant term, + * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can + * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 + + * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to + * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T + * = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191. + * + * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits + * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1 + * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) * + * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 : + * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0). + * + * So our final computation is: + * T = T_1 : T_0 = g*(x) * P_0 + * V = V_1 : V_0 = g*(x) * (P_1 + T_0) + * p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0 + * + * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0 + * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 : + * T_1 into dest. This allows us to reuse P_1 + T_0 when computing V. + */ +.macro montgomery_reduction dest + vpclmulqdq $0x00, PL, GSTAR, TMP_XMM # TMP_XMM = T_1 : T_0 = P_0 * g*(x) + pshufd $0b01001110, TMP_XMM, TMP_XMM # TMP_XMM = T_0 : T_1 + pxor PL, TMP_XMM # TMP_XMM = P_1 + T_0 : P_0 + T_1 + pxor TMP_XMM, PH # PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1 + pclmulqdq $0x11, GSTAR, TMP_XMM # TMP_XMM = V_1 : V_0 = V = [(P_1 + T_0) * g*(x)] + vpxor TMP_XMM, PH, \dest +.endm + +/* + * Compute schoolbook multiplication for 8 blocks + * m_0h^8 + ... + m_7h^1 + * + * If reduce is set, also computes the montgomery reduction of the + * previous full_stride call and XORs with the first message block. + * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1. + * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0. + */ +.macro full_stride reduce + pxor LO, LO + pxor HI, HI + pxor MI, MI + + schoolbook1_iteration 7 0 + .if \reduce + vpclmulqdq $0x00, PL, GSTAR, TMP_XMM + .endif + + schoolbook1_iteration 6 0 + .if \reduce + pshufd $0b01001110, TMP_XMM, TMP_XMM + .endif + + schoolbook1_iteration 5 0 + .if \reduce + pxor PL, TMP_XMM + .endif + + schoolbook1_iteration 4 0 + .if \reduce + pxor TMP_XMM, PH + .endif + + schoolbook1_iteration 3 0 + .if \reduce + pclmulqdq $0x11, GSTAR, TMP_XMM + .endif + + schoolbook1_iteration 2 0 + .if \reduce + vpxor TMP_XMM, PH, SUM + .endif + + schoolbook1_iteration 1 0 + + schoolbook1_iteration 0 1 + + addq $(8*16), MSG + schoolbook2 +.endm + +/* + * Process BLOCKS_LEFT blocks, where 0 < BLOCKS_LEFT < STRIDE_BLOCKS + */ +.macro partial_stride + mov BLOCKS_LEFT, TMP + shlq $4, TMP + addq $(16*STRIDE_BLOCKS), KEY_POWERS + subq TMP, KEY_POWERS + + movups (MSG), %xmm0 + pxor SUM, %xmm0 + movaps (KEY_POWERS), %xmm1 + schoolbook1_noload + dec BLOCKS_LEFT + addq $16, MSG + addq $16, KEY_POWERS + + test $4, BLOCKS_LEFT + jz .Lpartial4BlocksDone + schoolbook1 4 + addq $(4*16), MSG + addq $(4*16), KEY_POWERS +.Lpartial4BlocksDone: + test $2, BLOCKS_LEFT + jz .Lpartial2BlocksDone + schoolbook1 2 + addq $(2*16), MSG + addq $(2*16), KEY_POWERS +.Lpartial2BlocksDone: + test $1, BLOCKS_LEFT + jz .LpartialDone + schoolbook1 1 +.LpartialDone: + schoolbook2 + montgomery_reduction SUM +.endm + +/* + * Perform montgomery multiplication in GF(2^128) and store result in op1. + * + * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1 + * If op1, op2 are in montgomery form, this computes the montgomery + * form of op1*op2. + * + * void clmul_polyval_mul(u8 *op1, const u8 *op2); + */ +SYM_FUNC_START(clmul_polyval_mul) + FRAME_BEGIN + vmovdqa .Lgstar(%rip), GSTAR + movups (%rdi), %xmm0 + movups (%rsi), %xmm1 + schoolbook1_noload + schoolbook2 + montgomery_reduction SUM + movups SUM, (%rdi) + FRAME_END + RET +SYM_FUNC_END(clmul_polyval_mul) + +/* + * Perform polynomial evaluation as specified by POLYVAL. This computes: + * h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1} + * where n=nblocks, h is the hash key, and m_i are the message blocks. + * + * rdi - pointer to precomputed key powers h^8 ... h^1 + * rsi - pointer to message blocks + * rdx - number of blocks to hash + * rcx - pointer to the accumulator + * + * void clmul_polyval_update(const struct polyval_tfm_ctx *keys, + * const u8 *in, size_t nblocks, u8 *accumulator); + */ +SYM_FUNC_START(clmul_polyval_update) + FRAME_BEGIN + vmovdqa .Lgstar(%rip), GSTAR + movups (ACCUMULATOR), SUM + subq $STRIDE_BLOCKS, BLOCKS_LEFT + js .LstrideLoopExit + full_stride 0 + subq $STRIDE_BLOCKS, BLOCKS_LEFT + js .LstrideLoopExitReduce +.LstrideLoop: + full_stride 1 + subq $STRIDE_BLOCKS, BLOCKS_LEFT + jns .LstrideLoop +.LstrideLoopExitReduce: + montgomery_reduction SUM +.LstrideLoopExit: + add $STRIDE_BLOCKS, BLOCKS_LEFT + jz .LskipPartial + partial_stride +.LskipPartial: + movups SUM, (ACCUMULATOR) + FRAME_END + RET +SYM_FUNC_END(clmul_polyval_update) diff --git a/arch/x86/crypto/polyval-clmulni_glue.c b/arch/x86/crypto/polyval-clmulni_glue.c new file mode 100644 index 000000000000..b7664d018851 --- /dev/null +++ b/arch/x86/crypto/polyval-clmulni_glue.c @@ -0,0 +1,203 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Glue code for POLYVAL using PCMULQDQ-NI + * + * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen <mh1@iki.fi> + * Copyright (c) 2009 Intel Corp. + * Author: Huang Ying <ying.huang@intel.com> + * Copyright 2021 Google LLC + */ + +/* + * Glue code based on ghash-clmulni-intel_glue.c. + * + * This implementation of POLYVAL uses montgomery multiplication + * accelerated by PCLMULQDQ-NI to implement the finite field + * operations. + */ + +#include <crypto/algapi.h> +#include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> +#include <crypto/polyval.h> +#include <linux/crypto.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <asm/cpu_device_id.h> +#include <asm/simd.h> + +#define NUM_KEY_POWERS 8 + +struct polyval_tfm_ctx { + /* + * These powers must be in the order h^8, ..., h^1. + */ + u8 key_powers[NUM_KEY_POWERS][POLYVAL_BLOCK_SIZE]; +}; + +struct polyval_desc_ctx { + u8 buffer[POLYVAL_BLOCK_SIZE]; + u32 bytes; +}; + +asmlinkage void clmul_polyval_update(const struct polyval_tfm_ctx *keys, + const u8 *in, size_t nblocks, u8 *accumulator); +asmlinkage void clmul_polyval_mul(u8 *op1, const u8 *op2); + +static void internal_polyval_update(const struct polyval_tfm_ctx *keys, + const u8 *in, size_t nblocks, u8 *accumulator) +{ + if (likely(crypto_simd_usable())) { + kernel_fpu_begin(); + clmul_polyval_update(keys, in, nblocks, accumulator); + kernel_fpu_end(); + } else { + polyval_update_non4k(keys->key_powers[NUM_KEY_POWERS-1], in, + nblocks, accumulator); + } +} + +static void internal_polyval_mul(u8 *op1, const u8 *op2) +{ + if (likely(crypto_simd_usable())) { + kernel_fpu_begin(); + clmul_polyval_mul(op1, op2); + kernel_fpu_end(); + } else { + polyval_mul_non4k(op1, op2); + } +} + +static int polyval_x86_setkey(struct crypto_shash *tfm, + const u8 *key, unsigned int keylen) +{ + struct polyval_tfm_ctx *tctx = crypto_shash_ctx(tfm); + int i; + + if (keylen != POLYVAL_BLOCK_SIZE) + return -EINVAL; + + memcpy(tctx->key_powers[NUM_KEY_POWERS-1], key, POLYVAL_BLOCK_SIZE); + + for (i = NUM_KEY_POWERS-2; i >= 0; i--) { + memcpy(tctx->key_powers[i], key, POLYVAL_BLOCK_SIZE); + internal_polyval_mul(tctx->key_powers[i], + tctx->key_powers[i+1]); + } + + return 0; +} + +static int polyval_x86_init(struct shash_desc *desc) +{ + struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); + + memset(dctx, 0, sizeof(*dctx)); + + return 0; +} + +static int polyval_x86_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); + const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + u8 *pos; + unsigned int nblocks; + unsigned int n; + + if (dctx->bytes) { + n = min(srclen, dctx->bytes); + pos = dctx->buffer + POLYVAL_BLOCK_SIZE - dctx->bytes; + + dctx->bytes -= n; + srclen -= n; + + while (n--) + *pos++ ^= *src++; + + if (!dctx->bytes) + internal_polyval_mul(dctx->buffer, + tctx->key_powers[NUM_KEY_POWERS-1]); + } + + while (srclen >= POLYVAL_BLOCK_SIZE) { + /* Allow rescheduling every 4K bytes. */ + nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE; + internal_polyval_update(tctx, src, nblocks, dctx->buffer); + srclen -= nblocks * POLYVAL_BLOCK_SIZE; + src += nblocks * POLYVAL_BLOCK_SIZE; + } + + if (srclen) { + dctx->bytes = POLYVAL_BLOCK_SIZE - srclen; + pos = dctx->buffer; + while (srclen--) + *pos++ ^= *src++; + } + + return 0; +} + +static int polyval_x86_final(struct shash_desc *desc, u8 *dst) +{ + struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); + const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + + if (dctx->bytes) { + internal_polyval_mul(dctx->buffer, + tctx->key_powers[NUM_KEY_POWERS-1]); + } + + memcpy(dst, dctx->buffer, POLYVAL_BLOCK_SIZE); + + return 0; +} + +static struct shash_alg polyval_alg = { + .digestsize = POLYVAL_DIGEST_SIZE, + .init = polyval_x86_init, + .update = polyval_x86_update, + .final = polyval_x86_final, + .setkey = polyval_x86_setkey, + .descsize = sizeof(struct polyval_desc_ctx), + .base = { + .cra_name = "polyval", + .cra_driver_name = "polyval-clmulni", + .cra_priority = 200, + .cra_blocksize = POLYVAL_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct polyval_tfm_ctx), + .cra_module = THIS_MODULE, + }, +}; + +__maybe_unused static const struct x86_cpu_id pcmul_cpu_id[] = { + X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, pcmul_cpu_id); + +static int __init polyval_clmulni_mod_init(void) +{ + if (!x86_match_cpu(pcmul_cpu_id)) + return -ENODEV; + + if (!boot_cpu_has(X86_FEATURE_AVX)) + return -ENODEV; + + return crypto_register_shash(&polyval_alg); +} + +static void __exit polyval_clmulni_mod_exit(void) +{ + crypto_unregister_shash(&polyval_alg); +} + +module_init(polyval_clmulni_mod_init); +module_exit(polyval_clmulni_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("POLYVAL hash function accelerated by PCLMULQDQ-NI"); +MODULE_ALIAS_CRYPTO("polyval"); +MODULE_ALIAS_CRYPTO("polyval-clmulni"); diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c index edfe9780f6d1..90d15f2a7205 100644 --- a/arch/x86/entry/vdso/vdso2c.c +++ b/arch/x86/entry/vdso/vdso2c.c @@ -1,7 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * vdso2c - A vdso image preparation tool * Copyright (c) 2014 Andy Lutomirski and others - * Licensed under the GPL v2 * * vdso2c requires stripped and unstripped input. It would be trivial * to fully strip the input in here, but, for reasons described below, diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 30788894124f..f969410d0c90 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -693,9 +693,9 @@ void x86_pmu_disable_all(void) } } -struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr) +struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data) { - return static_call(x86_pmu_guest_get_msrs)(nr); + return static_call(x86_pmu_guest_get_msrs)(nr, data); } EXPORT_SYMBOL_GPL(perf_guest_get_msrs); @@ -2103,14 +2103,15 @@ static int __init init_hw_perf_events(void) } if (err != 0) { pr_cont("no PMU driver, software events only.\n"); - return 0; + err = 0; + goto out_bad_pmu; } pmu_check_apic(); /* sanity check that the hardware exists or is emulated */ if (!check_hw_exists(&pmu, x86_pmu.num_counters, x86_pmu.num_counters_fixed)) - return 0; + goto out_bad_pmu; pr_cont("%s PMU driver.\n", x86_pmu.name); @@ -2219,6 +2220,8 @@ out1: cpuhp_remove_state(CPUHP_AP_PERF_X86_STARTING); out: cpuhp_remove_state(CPUHP_PERF_X86_PREPARE); +out_bad_pmu: + memset(&x86_pmu, 0, sizeof(x86_pmu)); return err; } early_initcall(init_hw_perf_events); @@ -2990,6 +2993,11 @@ unsigned long perf_misc_flags(struct pt_regs *regs) void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) { + if (!x86_pmu_initialized()) { + memset(cap, 0, sizeof(*cap)); + return; + } + cap->version = x86_pmu.version; /* * KVM doesn't support the hybrid PMU yet. @@ -3002,5 +3010,17 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) cap->bit_width_fixed = x86_pmu.cntval_bits; cap->events_mask = (unsigned int)x86_pmu.events_maskl; cap->events_mask_len = x86_pmu.events_mask_len; + cap->pebs_ept = x86_pmu.pebs_ept; } EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability); + +u64 perf_get_hw_event_config(int hw_event) +{ + int max = x86_pmu.max_events; + + if (hw_event < max) + return x86_pmu.event_map(array_index_nospec(hw_event, max)); + + return 0; +} +EXPORT_SYMBOL_GPL(perf_get_hw_event_config); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index bd8b98857609..2db93498ff71 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -14,6 +14,7 @@ #include <linux/slab.h> #include <linux/export.h> #include <linux/nmi.h> +#include <linux/kvm_host.h> #include <asm/cpufeature.h> #include <asm/hardirq.h> @@ -2852,6 +2853,47 @@ static void intel_pmu_reset(void) local_irq_restore(flags); } +/* + * We may be running with guest PEBS events created by KVM, and the + * PEBS records are logged into the guest's DS and invisible to host. + * + * In the case of guest PEBS overflow, we only trigger a fake event + * to emulate the PEBS overflow PMI for guest PEBS counters in KVM. + * The guest will then vm-entry and check the guest DS area to read + * the guest PEBS records. + * + * The contents and other behavior of the guest event do not matter. + */ +static void x86_pmu_handle_guest_pebs(struct pt_regs *regs, + struct perf_sample_data *data) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 guest_pebs_idxs = cpuc->pebs_enabled & ~cpuc->intel_ctrl_host_mask; + struct perf_event *event = NULL; + int bit; + + if (!unlikely(perf_guest_state())) + return; + + if (!x86_pmu.pebs_ept || !x86_pmu.pebs_active || + !guest_pebs_idxs) + return; + + for_each_set_bit(bit, (unsigned long *)&guest_pebs_idxs, + INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed) { + event = cpuc->events[bit]; + if (!event->attr.precise_ip) + continue; + + perf_sample_data_init(data, 0, event->hw.last_period); + if (perf_event_overflow(event, data, regs)) + x86_pmu_stop(event, 0); + + /* Inject one fake event is enough. */ + break; + } +} + static int handle_pmi_common(struct pt_regs *regs, u64 status) { struct perf_sample_data data; @@ -2891,10 +2933,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) * counters from the GLOBAL_STATUS mask and we always process PEBS * events via drain_pebs(). */ - if (x86_pmu.flags & PMU_FL_PEBS_ALL) - status &= ~cpuc->pebs_enabled; - else - status &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK); + status &= ~(cpuc->pebs_enabled & x86_pmu.pebs_capable); /* * PEBS overflow sets bit 62 in the global status register @@ -2903,6 +2942,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) u64 pebs_enabled = cpuc->pebs_enabled; handled++; + x86_pmu_handle_guest_pebs(regs, &data); x86_pmu.drain_pebs(regs, &data); status &= intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI; @@ -3930,40 +3970,98 @@ static int intel_pmu_hw_config(struct perf_event *event) return 0; } -static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr) +/* + * Currently, the only caller of this function is the atomic_switch_perf_msrs(). + * The host perf conext helps to prepare the values of the real hardware for + * a set of msrs that need to be switched atomically in a vmx transaction. + * + * For example, the pseudocode needed to add a new msr should look like: + * + * arr[(*nr)++] = (struct perf_guest_switch_msr){ + * .msr = the hardware msr address, + * .host = the value the hardware has when it doesn't run a guest, + * .guest = the value the hardware has when it runs a guest, + * }; + * + * These values have nothing to do with the emulated values the guest sees + * when it uses {RD,WR}MSR, which should be handled by the KVM context, + * specifically in the intel_pmu_{get,set}_msr(). + */ +static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; + struct kvm_pmu *kvm_pmu = (struct kvm_pmu *)data; u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl); + u64 pebs_mask = cpuc->pebs_enabled & x86_pmu.pebs_capable; + int global_ctrl, pebs_enable; + + *nr = 0; + global_ctrl = (*nr)++; + arr[global_ctrl] = (struct perf_guest_switch_msr){ + .msr = MSR_CORE_PERF_GLOBAL_CTRL, + .host = intel_ctrl & ~cpuc->intel_ctrl_guest_mask, + .guest = intel_ctrl & (~cpuc->intel_ctrl_host_mask | ~pebs_mask), + }; - arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL; - arr[0].host = intel_ctrl & ~cpuc->intel_ctrl_guest_mask; - arr[0].guest = intel_ctrl & ~cpuc->intel_ctrl_host_mask; - if (x86_pmu.flags & PMU_FL_PEBS_ALL) - arr[0].guest &= ~cpuc->pebs_enabled; - else - arr[0].guest &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK); - *nr = 1; + if (!x86_pmu.pebs) + return arr; - if (x86_pmu.pebs && x86_pmu.pebs_no_isolation) { - /* - * If PMU counter has PEBS enabled it is not enough to - * disable counter on a guest entry since PEBS memory - * write can overshoot guest entry and corrupt guest - * memory. Disabling PEBS solves the problem. - * - * Don't do this if the CPU already enforces it. - */ - arr[1].msr = MSR_IA32_PEBS_ENABLE; - arr[1].host = cpuc->pebs_enabled; - arr[1].guest = 0; - *nr = 2; + /* + * If PMU counter has PEBS enabled it is not enough to + * disable counter on a guest entry since PEBS memory + * write can overshoot guest entry and corrupt guest + * memory. Disabling PEBS solves the problem. + * + * Don't do this if the CPU already enforces it. + */ + if (x86_pmu.pebs_no_isolation) { + arr[(*nr)++] = (struct perf_guest_switch_msr){ + .msr = MSR_IA32_PEBS_ENABLE, + .host = cpuc->pebs_enabled, + .guest = 0, + }; + return arr; + } + + if (!kvm_pmu || !x86_pmu.pebs_ept) + return arr; + + arr[(*nr)++] = (struct perf_guest_switch_msr){ + .msr = MSR_IA32_DS_AREA, + .host = (unsigned long)cpuc->ds, + .guest = kvm_pmu->ds_area, + }; + + if (x86_pmu.intel_cap.pebs_baseline) { + arr[(*nr)++] = (struct perf_guest_switch_msr){ + .msr = MSR_PEBS_DATA_CFG, + .host = cpuc->pebs_data_cfg, + .guest = kvm_pmu->pebs_data_cfg, + }; + } + + pebs_enable = (*nr)++; + arr[pebs_enable] = (struct perf_guest_switch_msr){ + .msr = MSR_IA32_PEBS_ENABLE, + .host = cpuc->pebs_enabled & ~cpuc->intel_ctrl_guest_mask, + .guest = pebs_mask & ~cpuc->intel_ctrl_host_mask, + }; + + if (arr[pebs_enable].host) { + /* Disable guest PEBS if host PEBS is enabled. */ + arr[pebs_enable].guest = 0; + } else { + /* Disable guest PEBS for cross-mapped PEBS counters. */ + arr[pebs_enable].guest &= ~kvm_pmu->host_cross_mapped_mask; + /* Set hw GLOBAL_CTRL bits for PEBS counter when it runs for guest */ + arr[global_ctrl].guest |= arr[pebs_enable].guest; } return arr; } -static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr) +static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr, void *data) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; @@ -5650,6 +5748,7 @@ __init int intel_pmu_init(void) x86_pmu.events_mask_len = eax.split.mask_length; x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters); + x86_pmu.pebs_capable = PEBS_COUNTER_MASK; /* * Quirk: v2 perfmon does not report fixed-purpose events, so @@ -5834,6 +5933,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_aliases = NULL; x86_pmu.pebs_prec_dist = true; x86_pmu.lbr_pt_coexist = true; + x86_pmu.pebs_capable = ~0ULL; x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_PEBS_ALL; x86_pmu.get_event_constraints = glp_get_event_constraints; @@ -6138,6 +6238,7 @@ __init int intel_pmu_init(void) case INTEL_FAM6_ICELAKE_X: case INTEL_FAM6_ICELAKE_D: + x86_pmu.pebs_ept = 1; pmem = true; fallthrough; case INTEL_FAM6_ICELAKE_L: @@ -6190,6 +6291,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_aliases = NULL; x86_pmu.pebs_prec_dist = true; x86_pmu.pebs_block = true; + x86_pmu.pebs_capable = ~0ULL; x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_NO_HT_SHARING; x86_pmu.flags |= PMU_FL_PEBS_ALL; @@ -6235,6 +6337,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_aliases = NULL; x86_pmu.pebs_prec_dist = true; x86_pmu.pebs_block = true; + x86_pmu.pebs_capable = ~0ULL; x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_NO_HT_SHARING; x86_pmu.flags |= PMU_FL_PEBS_ALL; @@ -6399,8 +6502,7 @@ __init int intel_pmu_init(void) x86_pmu.intel_ctrl); /* * Access LBR MSR may cause #GP under certain circumstances. - * E.g. KVM doesn't support LBR MSR - * Check all LBT MSR here. + * Check all LBR MSR here. * Disable LBR access if any LBR MSRs can not be accessed. */ if (x86_pmu.lbr_tos && !check_msr(x86_pmu.lbr_tos, 0x3UL)) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index ca2f8bfe6ff1..ba3d24a6a4ec 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -828,7 +828,8 @@ struct x86_pmu { pebs_prec_dist :1, pebs_no_tlb :1, pebs_no_isolation :1, - pebs_block :1; + pebs_block :1, + pebs_ept :1; int pebs_record_size; int pebs_buffer_size; int max_pebs_events; @@ -838,6 +839,7 @@ struct x86_pmu { u64 (*pebs_latency_data)(struct perf_event *event, u64 status); unsigned long large_pebs_flags; u64 rtm_abort_event; + u64 pebs_capable; /* * Intel LBR @@ -913,7 +915,7 @@ struct x86_pmu { /* * Intel host/guest support (KVM) */ - struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr); + struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr, void *data); /* * Check period value for PERF_EVENT_IOC_PERIOD ioctl. diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index db2d92fb44da..fb8b2c088681 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -46,7 +46,7 @@ static void hv_apic_icr_write(u32 low, u32 id) { u64 reg_val; - reg_val = SET_APIC_DEST_FIELD(id); + reg_val = SET_XAPIC_DEST_FIELD(id); reg_val = reg_val << 32; reg_val |= low; diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 92035eb3afee..68d213e83fcc 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -89,8 +89,8 @@ #define APIC_DM_EXTINT 0x00700 #define APIC_VECTOR_MASK 0x000FF #define APIC_ICR2 0x310 -#define GET_APIC_DEST_FIELD(x) (((x) >> 24) & 0xFF) -#define SET_APIC_DEST_FIELD(x) ((x) << 24) +#define GET_XAPIC_DEST_FIELD(x) (((x) >> 24) & 0xFF) +#define SET_XAPIC_DEST_FIELD(x) ((x) << 24) #define APIC_LVTT 0x320 #define APIC_LVTTHMR 0x330 #define APIC_LVTPC 0x340 diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h index ebc248e49549..02bae8e0758b 100644 --- a/arch/x86/include/asm/archrandom.h +++ b/arch/x86/include/asm/archrandom.h @@ -31,20 +31,6 @@ static inline bool __must_check rdrand_long(unsigned long *v) return false; } -static inline bool __must_check rdrand_int(unsigned int *v) -{ - bool ok; - unsigned int retry = RDRAND_RETRY_LOOPS; - do { - asm volatile("rdrand %[out]" - CC_SET(c) - : CC_OUT(c) (ok), [out] "=r" (*v)); - if (ok) - return true; - } while (--retry); - return false; -} - static inline bool __must_check rdseed_long(unsigned long *v) { bool ok; @@ -54,48 +40,23 @@ static inline bool __must_check rdseed_long(unsigned long *v) return ok; } -static inline bool __must_check rdseed_int(unsigned int *v) -{ - bool ok; - asm volatile("rdseed %[out]" - CC_SET(c) - : CC_OUT(c) (ok), [out] "=r" (*v)); - return ok; -} - /* * These are the generic interfaces; they must not be declared if the - * stubs in <linux/random.h> are to be invoked, - * i.e. CONFIG_ARCH_RANDOM is not defined. + * stubs in <linux/random.h> are to be invoked. */ -#ifdef CONFIG_ARCH_RANDOM -static inline bool __must_check arch_get_random_long(unsigned long *v) +static inline size_t __must_check arch_get_random_longs(unsigned long *v, size_t max_longs) { - return static_cpu_has(X86_FEATURE_RDRAND) ? rdrand_long(v) : false; + return max_longs && static_cpu_has(X86_FEATURE_RDRAND) && rdrand_long(v) ? 1 : 0; } -static inline bool __must_check arch_get_random_int(unsigned int *v) +static inline size_t __must_check arch_get_random_seed_longs(unsigned long *v, size_t max_longs) { - return static_cpu_has(X86_FEATURE_RDRAND) ? rdrand_int(v) : false; + return max_longs && static_cpu_has(X86_FEATURE_RDSEED) && rdseed_long(v) ? 1 : 0; } -static inline bool __must_check arch_get_random_seed_long(unsigned long *v) -{ - return static_cpu_has(X86_FEATURE_RDSEED) ? rdseed_long(v) : false; -} - -static inline bool __must_check arch_get_random_seed_int(unsigned int *v) -{ - return static_cpu_has(X86_FEATURE_RDSEED) ? rdseed_int(v) : false; -} - -extern void x86_init_rdrand(struct cpuinfo_x86 *c); - -#else /* !CONFIG_ARCH_RANDOM */ - -static inline void x86_init_rdrand(struct cpuinfo_x86 *c) { } - -#endif /* !CONFIG_ARCH_RANDOM */ +#ifndef CONFIG_UML +void x86_init_rdrand(struct cpuinfo_x86 *c); +#endif #endif /* ASM_X86_ARCHRANDOM_H */ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 5fe7f6c8a7a4..14ed039dff55 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -353,6 +353,7 @@ #define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ #define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ #define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ +#define X86_FEATURE_X2AVIC (15*32+18) /* Virtual x2apic */ #define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* Virtual SPEC_CTRL */ #define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */ diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h index 8e95aa4b0d17..8ae6e0e11b8b 100644 --- a/arch/x86/include/asm/dma.h +++ b/arch/x86/include/asm/dma.h @@ -307,12 +307,4 @@ extern int request_dma(unsigned int dmanr, const char *device_id); extern void free_dma(unsigned int dmanr); #endif -/* From PCI */ - -#ifdef CONFIG_PCI -extern int isa_dma_bridge_buggy; -#else -#define isa_dma_bridge_buggy (0) -#endif - #endif /* _ASM_X86_DMA_H */ diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 9636742a80f2..233ae6986d6f 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -100,8 +100,6 @@ static inline void efi_fpu_end(void) efi_fpu_end(); \ }) -#define arch_efi_call_virt(p, f, args...) p->f(args) - #else /* !CONFIG_X86_32 */ #define EFI_LOADER_SIGNATURE "EL64" @@ -121,6 +119,7 @@ extern asmlinkage u64 __efi_call(void *fp, ...); efi_enter_mm(); \ }) +#undef arch_efi_call_virt #define arch_efi_call_virt(p, f, args...) ({ \ u64 ret, ibt = ibt_save(); \ ret = efi_call((void *)p->f, args); \ @@ -383,7 +382,6 @@ static inline bool efi_is_64bit(void) extern bool efi_reboot_required(void); extern bool efi_is_table_address(unsigned long phys_addr); -extern void efi_find_mirror(void); extern void efi_reserve_boot_services(void); #else static inline void parse_efi_setup(u64 phys_addr, u32 data_len) {} @@ -395,9 +393,6 @@ static inline bool efi_is_table_address(unsigned long phys_addr) { return false; } -static inline void efi_find_mirror(void) -{ -} static inline void efi_reserve_boot_services(void) { } diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 1870b99c3356..e9025640f634 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -170,15 +170,6 @@ static inline unsigned int isa_virt_to_bus(volatile void *address) #define isa_bus_to_virt phys_to_virt /* - * However PCI ones are not necessarily 1:1 and therefore these interfaces - * are forbidden in portable PCI drivers. - * - * Allow them on x86 for legacy drivers, though. - */ -#define virt_to_bus virt_to_phys -#define bus_to_virt phys_to_virt - -/* * The default ioremap() behavior is non-cached; if you need something * else, you probably want one of the following. */ diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 6ad8d946cd3e..a3760ca796aa 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -186,6 +186,12 @@ extern int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, extern void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages); #define arch_kexec_pre_free_pages arch_kexec_pre_free_pages +void arch_kexec_protect_crashkres(void); +#define arch_kexec_protect_crashkres arch_kexec_protect_crashkres + +void arch_kexec_unprotect_crashkres(void); +#define arch_kexec_unprotect_crashkres arch_kexec_unprotect_crashkres + #ifdef CONFIG_KEXEC_FILE struct purgatory_info; int arch_kexec_apply_relocations_add(struct purgatory_info *pi, @@ -193,6 +199,12 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, const Elf_Shdr *relsec, const Elf_Shdr *symtab); #define arch_kexec_apply_relocations_add arch_kexec_apply_relocations_add + +void *arch_kexec_kernel_image_load(struct kimage *image); +#define arch_kexec_kernel_image_load arch_kexec_kernel_image_load + +int arch_kimage_file_post_load_cleanup(struct kimage *image); +#define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup #endif #endif diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index da47f60a4650..51f777071584 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -21,6 +21,7 @@ KVM_X86_OP(has_emulated_msr) KVM_X86_OP(vcpu_after_set_cpuid) KVM_X86_OP(vm_init) KVM_X86_OP_OPTIONAL(vm_destroy) +KVM_X86_OP_OPTIONAL_RET0(vcpu_precreate) KVM_X86_OP(vcpu_create) KVM_X86_OP(vcpu_free) KVM_X86_OP(vcpu_reset) @@ -87,7 +88,7 @@ KVM_X86_OP(deliver_interrupt) KVM_X86_OP_OPTIONAL(sync_pir_to_irr) KVM_X86_OP_OPTIONAL_RET0(set_tss_addr) KVM_X86_OP_OPTIONAL_RET0(set_identity_map_addr) -KVM_X86_OP(get_mt_mask) +KVM_X86_OP_OPTIONAL_RET0(get_mt_mask) KVM_X86_OP(load_mmu_pgd) KVM_X86_OP(has_wbinvd_exit) KVM_X86_OP(get_l2_tsc_offset) diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h index fdfd8e06fee6..c17e3e96fc1d 100644 --- a/arch/x86/include/asm/kvm-x86-pmu-ops.h +++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h @@ -12,7 +12,7 @@ BUILD_BUG_ON(1) * a NULL definition, for example if "static_call_cond()" will be used * at the call sites. */ -KVM_X86_PMU_OP(pmc_perf_hw_id) +KVM_X86_PMU_OP(hw_event_available) KVM_X86_PMU_OP(pmc_is_enabled) KVM_X86_PMU_OP(pmc_idx_to_pmc) KVM_X86_PMU_OP(rdpmc_ecx_to_pmc) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9217bd6cf0d1..e8281d64a431 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -65,6 +65,9 @@ #define KVM_BUS_LOCK_DETECTION_VALID_MODE (KVM_BUS_LOCK_DETECTION_OFF | \ KVM_BUS_LOCK_DETECTION_EXIT) +#define KVM_X86_NOTIFY_VMEXIT_VALID_BITS (KVM_X86_NOTIFY_VMEXIT_ENABLED | \ + KVM_X86_NOTIFY_VMEXIT_USER) + /* x86-specific vcpu->requests bit members */ #define KVM_REQ_MIGRATE_TIMER KVM_ARCH_REQ(0) #define KVM_REQ_REPORT_TPR_ACCESS KVM_ARCH_REQ(1) @@ -126,7 +129,6 @@ #define INVALID_PAGE (~(hpa_t)0) #define VALID_PAGE(x) ((x) != INVALID_PAGE) -#define UNMAPPED_GVA (~(gpa_t)0) #define INVALID_GPA (~(gpa_t)0) /* KVM Hugepage definitions for x86 */ @@ -505,6 +507,7 @@ struct kvm_pmu { unsigned nr_arch_fixed_counters; unsigned available_event_types; u64 fixed_ctr_ctrl; + u64 fixed_ctr_ctrl_mask; u64 global_ctrl; u64 global_status; u64 counter_bitmask[2]; @@ -520,6 +523,21 @@ struct kvm_pmu { DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX); DECLARE_BITMAP(pmc_in_use, X86_PMC_IDX_MAX); + u64 ds_area; + u64 pebs_enable; + u64 pebs_enable_mask; + u64 pebs_data_cfg; + u64 pebs_data_cfg_mask; + + /* + * If a guest counter is cross-mapped to host counter with different + * index, its PEBS capability will be temporarily disabled. + * + * The user should make sure that this mask is updated + * after disabling interrupts and before perf_guest_get_msrs(); + */ + u64 host_cross_mapped_mask; + /* * The gate to release perf_events not marked in * pmc_in_use only once in a vcpu time slice. @@ -644,7 +662,6 @@ struct kvm_vcpu_arch { u64 efer; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ - bool apicv_active; bool load_eoi_exitmap_pending; DECLARE_BITMAP(ioapic_handled_vectors, 256); unsigned long apic_attention; @@ -695,7 +712,7 @@ struct kvm_vcpu_arch { struct kvm_mmu_memory_cache mmu_pte_list_desc_cache; struct kvm_mmu_memory_cache mmu_shadow_page_cache; - struct kvm_mmu_memory_cache mmu_gfn_array_cache; + struct kvm_mmu_memory_cache mmu_shadowed_info_cache; struct kvm_mmu_memory_cache mmu_page_header_cache; /* @@ -808,6 +825,7 @@ struct kvm_vcpu_arch { u64 mcg_ctl; u64 mcg_ext_ctl; u64 *mce_banks; + u64 *mci_ctl2_banks; /* Cache MMIO info */ u64 mmio_gva; @@ -1111,11 +1129,6 @@ enum kvm_apicv_inhibit { APICV_INHIBIT_REASON_PIT_REINJ, /* - * AVIC is inhibited because the guest has x2apic in its CPUID. - */ - APICV_INHIBIT_REASON_X2APIC, - - /* * AVIC is disabled because SEV doesn't support it. */ APICV_INHIBIT_REASON_SEV, @@ -1222,8 +1235,13 @@ struct kvm_arch { bool guest_can_read_msr_platform_info; bool exception_payload_enabled; + bool triple_fault_event; + bool bus_lock_detection_enabled; bool enable_pmu; + + u32 notify_window; + u32 notify_vmexit_flags; /* * If exit_on_emulation_error is set, and the in-kernel instruction * emulator fails to emulate an instruction, allow userspace @@ -1307,6 +1325,36 @@ struct kvm_arch { hpa_t hv_root_tdp; spinlock_t hv_root_tdp_lock; #endif + /* + * VM-scope maximum vCPU ID. Used to determine the size of structures + * that increase along with the maximum vCPU ID, in which case, using + * the global KVM_MAX_VCPU_IDS may lead to significant memory waste. + */ + u32 max_vcpu_ids; + + bool disable_nx_huge_pages; + + /* + * Memory caches used to allocate shadow pages when performing eager + * page splitting. No need for a shadowed_info_cache since eager page + * splitting only allocates direct shadow pages. + * + * Protected by kvm->slots_lock. + */ + struct kvm_mmu_memory_cache split_shadow_page_cache; + struct kvm_mmu_memory_cache split_page_header_cache; + + /* + * Memory cache used to allocate pte_list_desc structs while splitting + * huge pages. In the worst case, to split one huge page, 512 + * pte_list_desc structs are needed to add each lower level leaf sptep + * to the rmap plus 1 to extend the parent_ptes rmap of the lower level + * page table. + * + * Protected by kvm->slots_lock. + */ +#define SPLIT_DESC_CACHE_MIN_NR_OBJECTS (SPTE_ENT_PER_PAGE + 1) + struct kvm_mmu_memory_cache split_desc_cache; }; struct kvm_vm_stat { @@ -1367,6 +1415,7 @@ struct kvm_vcpu_stat { u64 preemption_reported; u64 preemption_other; u64 guest_mode; + u64 notify_window_exits; }; struct x86_instruction_info; @@ -1407,6 +1456,7 @@ struct kvm_x86_ops { void (*vm_destroy)(struct kvm *kvm); /* Create, but do not attach this VCPU */ + int (*vcpu_precreate)(struct kvm *kvm); int (*vcpu_create)(struct kvm_vcpu *vcpu); void (*vcpu_free)(struct kvm_vcpu *vcpu); void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event); @@ -1471,7 +1521,7 @@ struct kvm_x86_ops { u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu); void (*patch_hypercall)(struct kvm_vcpu *vcpu, unsigned char *hypercall_addr); - void (*inject_irq)(struct kvm_vcpu *vcpu); + void (*inject_irq)(struct kvm_vcpu *vcpu, bool reinjected); void (*inject_nmi)(struct kvm_vcpu *vcpu); void (*queue_exception)(struct kvm_vcpu *vcpu); void (*cancel_injection)(struct kvm_vcpu *vcpu); @@ -1485,7 +1535,7 @@ struct kvm_x86_ops { bool (*check_apicv_inhibit_reasons)(enum kvm_apicv_inhibit reason); void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); - void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); + void (*hwapic_isr_update)(int isr); bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu); void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu); @@ -1495,7 +1545,7 @@ struct kvm_x86_ops { int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr); - u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); + u8 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); @@ -1705,21 +1755,6 @@ extern bool tdp_enabled; u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu); -/* control of guest tsc rate supported? */ -extern bool kvm_has_tsc_control; -/* maximum supported tsc_khz for guests */ -extern u32 kvm_max_guest_tsc_khz; -/* number of bits of the fractional part of the TSC scaling ratio */ -extern u8 kvm_tsc_scaling_ratio_frac_bits; -/* maximum allowed value of TSC scaling ratio */ -extern u64 kvm_max_tsc_scaling_ratio; -/* 1ull << kvm_tsc_scaling_ratio_frac_bits */ -extern u64 kvm_default_tsc_scaling_ratio; -/* bus lock detection supported? */ -extern bool kvm_has_bus_lock_exit; - -extern u64 kvm_mce_cap_supported; - /* * EMULTYPE_NO_DECODE - Set when re-emulating an instruction (after completing * userspace I/O) to indicate that the emulation context @@ -2060,6 +2095,7 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); KVM_X86_QUIRK_LAPIC_MMIO_HOLE | \ KVM_X86_QUIRK_OUT_7E_INC_RIP | \ KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \ - KVM_X86_QUIRK_FIX_HYPERCALL_INSN) + KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \ + KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 88ceaf3648b3..72ca90552b6a 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -89,6 +89,8 @@ static inline void mem_encrypt_free_decrypted_mem(void) { } /* Architecture __weak replacement functions */ void __init mem_encrypt_init(void); +void add_encrypt_protection_map(void); + /* * The __sme_pa() and __sme_pa_nodebug() macros are meant for use when * writing to or comparing values from the cr3 register. Having the diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index cc615be27a54..182b2a1f71fe 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -231,6 +231,12 @@ #define PERF_CAP_PT_IDX 16 #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 +#define PERF_CAP_PEBS_TRAP BIT_ULL(6) +#define PERF_CAP_ARCH_REG BIT_ULL(7) +#define PERF_CAP_PEBS_FORMAT 0xf00 +#define PERF_CAP_PEBS_BASELINE BIT_ULL(14) +#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \ + PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE) #define MSR_IA32_RTIT_CTL 0x00000570 #define RTIT_CTL_TRACEEN BIT(0) @@ -388,6 +394,7 @@ #define MSR_TURBO_ACTIVATION_RATIO 0x0000064C #define MSR_PLATFORM_ENERGY_STATUS 0x0000064D +#define MSR_SECONDARY_TURBO_RATIO_LIMIT 0x00000650 #define MSR_PKG_WEIGHTED_CORE_C0_RES 0x00000658 #define MSR_PKG_ANY_CORE_C0_RES 0x00000659 @@ -1018,6 +1025,7 @@ #define MSR_IA32_VMX_TRUE_EXIT_CTLS 0x0000048f #define MSR_IA32_VMX_TRUE_ENTRY_CTLS 0x00000490 #define MSR_IA32_VMX_VMFUNC 0x00000491 +#define MSR_IA32_VMX_PROCBASED_CTLS3 0x00000492 /* VMX_BASIC bits and bitmasks */ #define VMX_BASIC_VMCS_SIZE_SHIFT 32 diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index f3fd5928bcbb..736793d65bcb 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -105,9 +105,6 @@ static inline void early_quirks(void) { } extern void pci_iommu_alloc(void); -/* generic pci stuff */ -#include <asm-generic/pci.h> - #ifdef CONFIG_NUMA /* Returns the node based on pci bus */ static inline int __pcibus_to_node(const struct pci_bus *bus) diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 34348ae41cdb..f6fc8dd51ef4 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -222,6 +222,7 @@ struct x86_pmu_capability { int bit_width_fixed; unsigned int events_mask; int events_mask_len; + unsigned int pebs_ept :1; }; /* @@ -520,6 +521,7 @@ struct x86_pmu_lbr { }; extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap); +extern u64 perf_get_hw_event_config(int hw_event); extern void perf_check_microcode(void); extern void perf_clear_dirty_counters(void); extern int x86_perf_rdpmc_index(struct perf_event *event); @@ -529,15 +531,20 @@ static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) memset(cap, 0, sizeof(*cap)); } +static inline u64 perf_get_hw_event_config(int hw_event) +{ + return 0; +} + static inline void perf_events_lapic_init(void) { } static inline void perf_check_microcode(void) { } #endif #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) -extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); +extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data); extern int x86_perf_get_lbr(struct x86_pmu_lbr *lbr); #else -struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); +struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data); static inline int x86_perf_get_lbr(struct x86_pmu_lbr *lbr) { return -1; diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index bdaf8391e2e0..aa174fed3a71 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -230,25 +230,6 @@ enum page_cache_mode { #endif /* __ASSEMBLY__ */ -/* xwr */ -#define __P000 PAGE_NONE -#define __P001 PAGE_READONLY -#define __P010 PAGE_COPY -#define __P011 PAGE_COPY -#define __P100 PAGE_READONLY_EXEC -#define __P101 PAGE_READONLY_EXEC -#define __P110 PAGE_COPY_EXEC -#define __P111 PAGE_COPY_EXEC - -#define __S000 PAGE_NONE -#define __S001 PAGE_READONLY -#define __S010 PAGE_SHARED -#define __S011 PAGE_SHARED -#define __S100 PAGE_READONLY_EXEC -#define __S101 PAGE_READONLY_EXEC -#define __S110 PAGE_SHARED_EXEC -#define __S111 PAGE_SHARED_EXEC - /* * early identity mapping pte attrib macros. */ diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h index 3f9334ef67cd..eae20fa52b93 100644 --- a/arch/x86/include/asm/sgx.h +++ b/arch/x86/include/asm/sgx.h @@ -65,17 +65,22 @@ enum sgx_encls_function { /** * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV + * %SGX_EPC_PAGE_CONFLICT: Page is being written by other ENCLS function. * %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not * been completed yet. * %SGX_CHILD_PRESENT SECS has child pages present in the EPC. * %SGX_INVALID_EINITTOKEN: EINITTOKEN is invalid and enclave signer's * public key does not match IA32_SGXLEPUBKEYHASH. + * %SGX_PAGE_NOT_MODIFIABLE: The EPC page cannot be modified because it + * is in the PENDING or MODIFIED state. * %SGX_UNMASKED_EVENT: An unmasked event, e.g. INTR, was received */ enum sgx_return_code { + SGX_EPC_PAGE_CONFLICT = 7, SGX_NOT_TRACKED = 11, SGX_CHILD_PRESENT = 13, SGX_INVALID_EINITTOKEN = 16, + SGX_PAGE_NOT_MODIFIABLE = 20, SGX_UNMASKED_EVENT = 128, }; @@ -234,6 +239,9 @@ struct sgx_pageinfo { * %SGX_PAGE_TYPE_REG: a regular page * %SGX_PAGE_TYPE_VA: a VA page * %SGX_PAGE_TYPE_TRIM: a page in trimmed state + * + * Make sure when making changes to this enum that its values can still fit + * in the bitfield within &struct sgx_encl_page */ enum sgx_page_type { SGX_PAGE_TYPE_SECS, diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 1b07fba11704..0361626841bc 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -195,6 +195,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define AVIC_ENABLE_SHIFT 31 #define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT) +#define X2APIC_MODE_SHIFT 30 +#define X2APIC_MODE_MASK (1 << X2APIC_MODE_SHIFT) + #define LBR_CTL_ENABLE_MASK BIT_ULL(0) #define VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK BIT_ULL(1) @@ -253,12 +256,19 @@ enum avic_ipi_failure_cause { AVIC_IPI_FAILURE_INVALID_BACKING_PAGE, }; +#define AVIC_PHYSICAL_MAX_INDEX_MASK GENMASK_ULL(9, 0) + +/* + * For AVIC, the max index allowed for physical APIC ID + * table is 0xff (255). + */ +#define AVIC_MAX_PHYSICAL_ID 0XFEULL /* - * 0xff is broadcast, so the max index allowed for physical APIC ID - * table is 0xfe. APIC IDs above 0xff are reserved. + * For x2AVIC, the max index allowed for physical APIC ID + * table is 0x1ff (511). */ -#define AVIC_MAX_PHYSICAL_ID_COUNT 0xff +#define X2AVIC_MAX_PHYSICAL_ID 0x1FFUL #define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 6c343c6a1855..c371ef695fcc 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -31,6 +31,7 @@ #define CPU_BASED_RDTSC_EXITING VMCS_CONTROL_BIT(RDTSC_EXITING) #define CPU_BASED_CR3_LOAD_EXITING VMCS_CONTROL_BIT(CR3_LOAD_EXITING) #define CPU_BASED_CR3_STORE_EXITING VMCS_CONTROL_BIT(CR3_STORE_EXITING) +#define CPU_BASED_ACTIVATE_TERTIARY_CONTROLS VMCS_CONTROL_BIT(TERTIARY_CONTROLS) #define CPU_BASED_CR8_LOAD_EXITING VMCS_CONTROL_BIT(CR8_LOAD_EXITING) #define CPU_BASED_CR8_STORE_EXITING VMCS_CONTROL_BIT(CR8_STORE_EXITING) #define CPU_BASED_TPR_SHADOW VMCS_CONTROL_BIT(VIRTUAL_TPR) @@ -74,6 +75,12 @@ #define SECONDARY_EXEC_TSC_SCALING VMCS_CONTROL_BIT(TSC_SCALING) #define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE VMCS_CONTROL_BIT(USR_WAIT_PAUSE) #define SECONDARY_EXEC_BUS_LOCK_DETECTION VMCS_CONTROL_BIT(BUS_LOCK_DETECTION) +#define SECONDARY_EXEC_NOTIFY_VM_EXITING VMCS_CONTROL_BIT(NOTIFY_VM_EXITING) + +/* + * Definitions of Tertiary Processor-Based VM-Execution Controls. + */ +#define TERTIARY_EXEC_IPI_VIRT VMCS_CONTROL_BIT(IPI_VIRT) #define PIN_BASED_EXT_INTR_MASK VMCS_CONTROL_BIT(INTR_EXITING) #define PIN_BASED_NMI_EXITING VMCS_CONTROL_BIT(NMI_EXITING) @@ -158,6 +165,7 @@ static inline int vmx_misc_mseg_revid(u64 vmx_misc) enum vmcs_field { VIRTUAL_PROCESSOR_ID = 0x00000000, POSTED_INTR_NV = 0x00000002, + LAST_PID_POINTER_INDEX = 0x00000008, GUEST_ES_SELECTOR = 0x00000800, GUEST_CS_SELECTOR = 0x00000802, GUEST_SS_SELECTOR = 0x00000804, @@ -221,6 +229,10 @@ enum vmcs_field { ENCLS_EXITING_BITMAP_HIGH = 0x0000202F, TSC_MULTIPLIER = 0x00002032, TSC_MULTIPLIER_HIGH = 0x00002033, + TERTIARY_VM_EXEC_CONTROL = 0x00002034, + TERTIARY_VM_EXEC_CONTROL_HIGH = 0x00002035, + PID_POINTER_TABLE = 0x00002042, + PID_POINTER_TABLE_HIGH = 0x00002043, GUEST_PHYSICAL_ADDRESS = 0x00002400, GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, VMCS_LINK_POINTER = 0x00002800, @@ -269,6 +281,7 @@ enum vmcs_field { SECONDARY_VM_EXEC_CONTROL = 0x0000401e, PLE_GAP = 0x00004020, PLE_WINDOW = 0x00004022, + NOTIFY_WINDOW = 0x00004024, VM_INSTRUCTION_ERROR = 0x00004400, VM_EXIT_REASON = 0x00004402, VM_EXIT_INTR_INFO = 0x00004404, @@ -554,6 +567,11 @@ enum vm_entry_failure_code { #define EPT_VIOLATION_GVA_TRANSLATED (1 << EPT_VIOLATION_GVA_TRANSLATED_BIT) /* + * Exit Qualifications for NOTIFY VM EXIT + */ +#define NOTIFY_VM_CONTEXT_INVALID BIT(0) + +/* * VM-instruction error numbers */ enum vm_instruction_error_number { diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h index d9a74681a77d..c6a7eed03914 100644 --- a/arch/x86/include/asm/vmxfeatures.h +++ b/arch/x86/include/asm/vmxfeatures.h @@ -5,7 +5,7 @@ /* * Defines VMX CPU feature bits */ -#define NVMXINTS 3 /* N 32-bit words worth of info */ +#define NVMXINTS 5 /* N 32-bit words worth of info */ /* * Note: If the comment begins with a quoted string, that string is used @@ -43,6 +43,7 @@ #define VMX_FEATURE_RDTSC_EXITING ( 1*32+ 12) /* "" VM-Exit on RDTSC */ #define VMX_FEATURE_CR3_LOAD_EXITING ( 1*32+ 15) /* "" VM-Exit on writes to CR3 */ #define VMX_FEATURE_CR3_STORE_EXITING ( 1*32+ 16) /* "" VM-Exit on reads from CR3 */ +#define VMX_FEATURE_TERTIARY_CONTROLS ( 1*32+ 17) /* "" Enable Tertiary VM-Execution Controls */ #define VMX_FEATURE_CR8_LOAD_EXITING ( 1*32+ 19) /* "" VM-Exit on writes to CR8 */ #define VMX_FEATURE_CR8_STORE_EXITING ( 1*32+ 20) /* "" VM-Exit on reads from CR8 */ #define VMX_FEATURE_VIRTUAL_TPR ( 1*32+ 21) /* "vtpr" TPR virtualization, a.k.a. TPR shadow */ @@ -84,5 +85,8 @@ #define VMX_FEATURE_USR_WAIT_PAUSE ( 2*32+ 26) /* Enable TPAUSE, UMONITOR, UMWAIT in guest */ #define VMX_FEATURE_ENCLV_EXITING ( 2*32+ 28) /* "" VM-Exit on ENCLV (leaf dependent) */ #define VMX_FEATURE_BUS_LOCK_DETECTION ( 2*32+ 30) /* "" VM-Exit when bus lock caused */ +#define VMX_FEATURE_NOTIFY_VM_EXITING ( 2*32+ 31) /* VM-Exit when no event windows after notify window */ +/* Tertiary Processor-Based VM-Execution Controls, word 3 */ +#define VMX_FEATURE_IPI_VIRT ( 3*32+ 4) /* Enable IPI virtualization */ #endif /* _ASM_X86_VMXFEATURES_H */ diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 342290624040..01d19fc22346 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -53,7 +53,7 @@ struct setup_data { __u64 next; __u32 type; __u32 len; - __u8 data[0]; + __u8 data[]; }; /* extensible setup indirect data node */ diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 21614807a2cb..46de10a809ec 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -198,13 +198,13 @@ struct kvm_msrs { __u32 nmsrs; /* number of msrs in entries */ __u32 pad; - struct kvm_msr_entry entries[0]; + struct kvm_msr_entry entries[]; }; /* for KVM_GET_MSR_INDEX_LIST */ struct kvm_msr_list { __u32 nmsrs; /* number of msrs in entries */ - __u32 indices[0]; + __u32 indices[]; }; /* Maximum size of any access bitmap in bytes */ @@ -241,7 +241,7 @@ struct kvm_cpuid_entry { struct kvm_cpuid { __u32 nent; __u32 padding; - struct kvm_cpuid_entry entries[0]; + struct kvm_cpuid_entry entries[]; }; struct kvm_cpuid_entry2 { @@ -263,7 +263,7 @@ struct kvm_cpuid_entry2 { struct kvm_cpuid2 { __u32 nent; __u32 padding; - struct kvm_cpuid_entry2 entries[0]; + struct kvm_cpuid_entry2 entries[]; }; /* for KVM_GET_PIT and KVM_SET_PIT */ @@ -306,7 +306,8 @@ struct kvm_pit_state { struct kvm_pit_channel_state channels[3]; }; -#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001 +#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001 +#define KVM_PIT_FLAGS_SPEAKER_DATA_ON 0x00000002 struct kvm_pit_state2 { struct kvm_pit_channel_state channels[3]; @@ -325,6 +326,7 @@ struct kvm_reinject_control { #define KVM_VCPUEVENT_VALID_SHADOW 0x00000004 #define KVM_VCPUEVENT_VALID_SMM 0x00000008 #define KVM_VCPUEVENT_VALID_PAYLOAD 0x00000010 +#define KVM_VCPUEVENT_VALID_TRIPLE_FAULT 0x00000020 /* Interrupt shadow states */ #define KVM_X86_SHADOW_INT_MOV_SS 0x01 @@ -359,7 +361,10 @@ struct kvm_vcpu_events { __u8 smm_inside_nmi; __u8 latched_init; } smi; - __u8 reserved[27]; + struct { + __u8 pending; + } triple_fault; + __u8 reserved[26]; __u8 exception_has_payload; __u64 exception_payload; }; @@ -389,7 +394,7 @@ struct kvm_xsave { * the contents of CPUID leaf 0xD on the host. */ __u32 region[1024]; - __u32 extra[0]; + __u32 extra[]; }; #define KVM_MAX_XCRS 16 @@ -434,6 +439,7 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3) #define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4) #define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5) +#define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6) #define KVM_STATE_NESTED_FORMAT_VMX 0 #define KVM_STATE_NESTED_FORMAT_SVM 1 @@ -516,7 +522,7 @@ struct kvm_pmu_event_filter { __u32 fixed_counter_bitmap; __u32 flags; __u32 pad[4]; - __u64 events[0]; + __u64 events[]; }; #define KVM_PMU_EVENT_ALLOW 0 diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h index f4b81587e90b..2dd35bbdc822 100644 --- a/arch/x86/include/uapi/asm/sgx.h +++ b/arch/x86/include/uapi/asm/sgx.h @@ -29,6 +29,12 @@ enum sgx_page_flags { _IOW(SGX_MAGIC, 0x03, struct sgx_enclave_provision) #define SGX_IOC_VEPC_REMOVE_ALL \ _IO(SGX_MAGIC, 0x04) +#define SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS \ + _IOWR(SGX_MAGIC, 0x05, struct sgx_enclave_restrict_permissions) +#define SGX_IOC_ENCLAVE_MODIFY_TYPES \ + _IOWR(SGX_MAGIC, 0x06, struct sgx_enclave_modify_types) +#define SGX_IOC_ENCLAVE_REMOVE_PAGES \ + _IOWR(SGX_MAGIC, 0x07, struct sgx_enclave_remove_pages) /** * struct sgx_enclave_create - parameter structure for the @@ -76,6 +82,62 @@ struct sgx_enclave_provision { __u64 fd; }; +/** + * struct sgx_enclave_restrict_permissions - parameters for ioctl + * %SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS + * @offset: starting page offset (page aligned relative to enclave base + * address defined in SECS) + * @length: length of memory (multiple of the page size) + * @permissions:new permission bits for pages in range described by @offset + * and @length + * @result: (output) SGX result code of ENCLS[EMODPR] function + * @count: (output) bytes successfully changed (multiple of page size) + */ +struct sgx_enclave_restrict_permissions { + __u64 offset; + __u64 length; + __u64 permissions; + __u64 result; + __u64 count; +}; + +/** + * struct sgx_enclave_modify_types - parameters for ioctl + * %SGX_IOC_ENCLAVE_MODIFY_TYPES + * @offset: starting page offset (page aligned relative to enclave base + * address defined in SECS) + * @length: length of memory (multiple of the page size) + * @page_type: new type for pages in range described by @offset and @length + * @result: (output) SGX result code of ENCLS[EMODT] function + * @count: (output) bytes successfully changed (multiple of page size) + */ +struct sgx_enclave_modify_types { + __u64 offset; + __u64 length; + __u64 page_type; + __u64 result; + __u64 count; +}; + +/** + * struct sgx_enclave_remove_pages - %SGX_IOC_ENCLAVE_REMOVE_PAGES parameters + * @offset: starting page offset (page aligned relative to enclave base + * address defined in SECS) + * @length: length of memory (multiple of the page size) + * @count: (output) bytes successfully changed (multiple of page size) + * + * Regular (PT_REG) or TCS (PT_TCS) can be removed from an initialized + * enclave if the system supports SGX2. First, the %SGX_IOC_ENCLAVE_MODIFY_TYPES + * ioctl() should be used to change the page type to PT_TRIM. After that + * succeeds ENCLU[EACCEPT] should be run from within the enclave and then + * %SGX_IOC_ENCLAVE_REMOVE_PAGES can be used to complete the page removal. + */ +struct sgx_enclave_remove_pages { + __u64 offset; + __u64 length; + __u64 count; +}; + struct sgx_enclave_run; /** diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 946d761adbd3..a5faf6d88f1b 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -91,6 +91,7 @@ #define EXIT_REASON_UMWAIT 67 #define EXIT_REASON_TPAUSE 68 #define EXIT_REASON_BUS_LOCK 74 +#define EXIT_REASON_NOTIFY 75 #define VMX_EXIT_REASONS \ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ @@ -153,7 +154,8 @@ { EXIT_REASON_XRSTORS, "XRSTORS" }, \ { EXIT_REASON_UMWAIT, "UMWAIT" }, \ { EXIT_REASON_TPAUSE, "TPAUSE" }, \ - { EXIT_REASON_BUS_LOCK, "BUS_LOCK" } + { EXIT_REASON_BUS_LOCK, "BUS_LOCK" }, \ + { EXIT_REASON_NOTIFY, "NOTIFY" } #define VMX_EXIT_REASON_FLAGS \ { VMX_EXIT_REASONS_FAILED_VMENTRY, "FAILED_VMENTRY" } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 189d3a5e471a..a4347605ab00 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -275,7 +275,7 @@ void native_apic_icr_write(u32 low, u32 id) unsigned long flags; local_irq_save(flags); - apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id)); + apic_write(APIC_ICR2, SET_XAPIC_DEST_FIELD(id)); apic_write(APIC_ICR, low); local_irq_restore(flags); } diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index d1fb874fbe64..2a6509e8c840 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -99,7 +99,7 @@ sendmask: static inline int __prepare_ICR2(unsigned int mask) { - return SET_APIC_DEST_FIELD(mask); + return SET_XAPIC_DEST_FIELD(mask); } static inline void __xapic_wait_icr_idle(void) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 35d5288394cb..48276c0e479d 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -808,7 +808,7 @@ static void clear_rdrand_cpuid_bit(struct cpuinfo_x86 *c) return; /* - * The nordrand option can clear X86_FEATURE_RDRAND, so check for + * The self-test can clear X86_FEATURE_RDRAND, so check for * RDRAND support using the CPUID function directly. */ if (!(cpuid_ecx(1) & BIT(30)) || rdrand_force) diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 7227c15299d0..9651275aecd1 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/bitops.h> #include <linux/delay.h> +#include <linux/isa-dma.h> #include <linux/pci.h> #include <asm/dma.h> #include <linux/io.h> diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c index da696eb4821a..993697e71854 100644 --- a/arch/x86/kernel/cpu/feat_ctl.c +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -15,6 +15,8 @@ enum vmx_feature_leafs { MISC_FEATURES = 0, PRIMARY_CTLS, SECONDARY_CTLS, + TERTIARY_CTLS_LOW, + TERTIARY_CTLS_HIGH, NR_VMX_FEATURE_WORDS, }; @@ -22,7 +24,7 @@ enum vmx_feature_leafs { static void init_vmx_capabilities(struct cpuinfo_x86 *c) { - u32 supported, funcs, ept, vpid, ign; + u32 supported, funcs, ept, vpid, ign, low, high; BUILD_BUG_ON(NVMXINTS != NR_VMX_FEATURE_WORDS); @@ -42,6 +44,11 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c) rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS2, &ign, &supported); c->vmx_capability[SECONDARY_CTLS] = supported; + /* All 64 bits of tertiary controls MSR are allowed-1 settings. */ + rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS3, &low, &high); + c->vmx_capability[TERTIARY_CTLS_LOW] = low; + c->vmx_capability[TERTIARY_CTLS_HIGH] = high; + rdmsr(MSR_IA32_VMX_PINBASED_CTLS, ign, supported); rdmsr_safe(MSR_IA32_VMX_VMFUNC, &ign, &funcs); diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index c4be62058dd9..26a427fa84ea 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -11,56 +11,39 @@ #include <asm/archrandom.h> #include <asm/sections.h> -static int __init x86_rdrand_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_RDRAND); - setup_clear_cpu_cap(X86_FEATURE_RDSEED); - return 1; -} -__setup("nordrand", x86_rdrand_setup); - /* * RDRAND has Built-In-Self-Test (BIST) that runs on every invocation. - * Run the instruction a few times as a sanity check. - * If it fails, it is simple to disable RDRAND here. + * Run the instruction a few times as a sanity check. Also make sure + * it's not outputting the same value over and over, which has happened + * as a result of past CPU bugs. + * + * If it fails, it is simple to disable RDRAND and RDSEED here. */ -#define SANITY_CHECK_LOOPS 8 -#ifdef CONFIG_ARCH_RANDOM void x86_init_rdrand(struct cpuinfo_x86 *c) { - unsigned int changed = 0; - unsigned long tmp, prev; - int i; + enum { SAMPLES = 8, MIN_CHANGE = 5 }; + unsigned long sample, prev; + bool failure = false; + size_t i, changed; if (!cpu_has(c, X86_FEATURE_RDRAND)) return; - for (i = 0; i < SANITY_CHECK_LOOPS; i++) { - if (!rdrand_long(&tmp)) { - clear_cpu_cap(c, X86_FEATURE_RDRAND); - pr_warn_once("rdrand: disabled\n"); - return; + for (changed = 0, i = 0; i < SAMPLES; ++i) { + if (!rdrand_long(&sample)) { + failure = true; + break; } + changed += i && sample != prev; + prev = sample; } + if (changed < MIN_CHANGE) + failure = true; - /* - * Stupid sanity-check whether RDRAND does *actually* generate - * some at least random-looking data. - */ - prev = tmp; - for (i = 0; i < SANITY_CHECK_LOOPS; i++) { - if (rdrand_long(&tmp)) { - if (prev != tmp) - changed++; - - prev = tmp; - } + if (failure) { + clear_cpu_cap(c, X86_FEATURE_RDRAND); + clear_cpu_cap(c, X86_FEATURE_RDSEED); + pr_emerg("RDRAND is not reliable on this platform; disabling.\n"); } - - if (WARN_ON_ONCE(!changed)) - pr_emerg( -"RDRAND gives funky smelling output, might consider not using it by booting with \"nordrand\""); - } -#endif diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 19876ebfb504..24c1bb8eb196 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -232,25 +232,10 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page, return epc_page; } -static struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, - unsigned long addr, - unsigned long vm_flags) +static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, + struct sgx_encl_page *entry) { - unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC); struct sgx_epc_page *epc_page; - struct sgx_encl_page *entry; - - entry = xa_load(&encl->page_array, PFN_DOWN(addr)); - if (!entry) - return ERR_PTR(-EFAULT); - - /* - * Verify that the faulted page has equal or higher build time - * permissions than the VMA permissions (i.e. the subset of {VM_READ, - * VM_WRITE, VM_EXECUTE} in vma->vm_flags). - */ - if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits) - return ERR_PTR(-EFAULT); /* Entry successfully located. */ if (entry->epc_page) { @@ -276,6 +261,146 @@ static struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, return entry; } +static struct sgx_encl_page *sgx_encl_load_page_in_vma(struct sgx_encl *encl, + unsigned long addr, + unsigned long vm_flags) +{ + unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC); + struct sgx_encl_page *entry; + + entry = xa_load(&encl->page_array, PFN_DOWN(addr)); + if (!entry) + return ERR_PTR(-EFAULT); + + /* + * Verify that the page has equal or higher build time + * permissions than the VMA permissions (i.e. the subset of {VM_READ, + * VM_WRITE, VM_EXECUTE} in vma->vm_flags). + */ + if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits) + return ERR_PTR(-EFAULT); + + return __sgx_encl_load_page(encl, entry); +} + +struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, + unsigned long addr) +{ + struct sgx_encl_page *entry; + + entry = xa_load(&encl->page_array, PFN_DOWN(addr)); + if (!entry) + return ERR_PTR(-EFAULT); + + return __sgx_encl_load_page(encl, entry); +} + +/** + * sgx_encl_eaug_page() - Dynamically add page to initialized enclave + * @vma: VMA obtained from fault info from where page is accessed + * @encl: enclave accessing the page + * @addr: address that triggered the page fault + * + * When an initialized enclave accesses a page with no backing EPC page + * on a SGX2 system then the EPC can be added dynamically via the SGX2 + * ENCLS[EAUG] instruction. + * + * Returns: Appropriate vm_fault_t: VM_FAULT_NOPAGE when PTE was installed + * successfully, VM_FAULT_SIGBUS or VM_FAULT_OOM as error otherwise. + */ +static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma, + struct sgx_encl *encl, unsigned long addr) +{ + vm_fault_t vmret = VM_FAULT_SIGBUS; + struct sgx_pageinfo pginfo = {0}; + struct sgx_encl_page *encl_page; + struct sgx_epc_page *epc_page; + struct sgx_va_page *va_page; + unsigned long phys_addr; + u64 secinfo_flags; + int ret; + + if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) + return VM_FAULT_SIGBUS; + + /* + * Ignore internal permission checking for dynamically added pages. + * They matter only for data added during the pre-initialization + * phase. The enclave decides the permissions by the means of + * EACCEPT, EACCEPTCOPY and EMODPE. + */ + secinfo_flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X; + encl_page = sgx_encl_page_alloc(encl, addr - encl->base, secinfo_flags); + if (IS_ERR(encl_page)) + return VM_FAULT_OOM; + + mutex_lock(&encl->lock); + + epc_page = sgx_alloc_epc_page(encl_page, false); + if (IS_ERR(epc_page)) { + if (PTR_ERR(epc_page) == -EBUSY) + vmret = VM_FAULT_NOPAGE; + goto err_out_unlock; + } + + va_page = sgx_encl_grow(encl, false); + if (IS_ERR(va_page)) + goto err_out_epc; + + if (va_page) + list_add(&va_page->list, &encl->va_pages); + + ret = xa_insert(&encl->page_array, PFN_DOWN(encl_page->desc), + encl_page, GFP_KERNEL); + /* + * If ret == -EBUSY then page was created in another flow while + * running without encl->lock + */ + if (ret) + goto err_out_shrink; + + pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page); + pginfo.addr = encl_page->desc & PAGE_MASK; + pginfo.metadata = 0; + + ret = __eaug(&pginfo, sgx_get_epc_virt_addr(epc_page)); + if (ret) + goto err_out; + + encl_page->encl = encl; + encl_page->epc_page = epc_page; + encl_page->type = SGX_PAGE_TYPE_REG; + encl->secs_child_cnt++; + + sgx_mark_page_reclaimable(encl_page->epc_page); + + phys_addr = sgx_get_epc_phys_addr(epc_page); + /* + * Do not undo everything when creating PTE entry fails - next #PF + * would find page ready for a PTE. + */ + vmret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr)); + if (vmret != VM_FAULT_NOPAGE) { + mutex_unlock(&encl->lock); + return VM_FAULT_SIGBUS; + } + mutex_unlock(&encl->lock); + return VM_FAULT_NOPAGE; + +err_out: + xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc)); + +err_out_shrink: + sgx_encl_shrink(encl, va_page); +err_out_epc: + sgx_encl_free_epc_page(epc_page); +err_out_unlock: + mutex_unlock(&encl->lock); + kfree(encl_page); + + return vmret; +} + static vm_fault_t sgx_vma_fault(struct vm_fault *vmf) { unsigned long addr = (unsigned long)vmf->address; @@ -295,9 +420,20 @@ static vm_fault_t sgx_vma_fault(struct vm_fault *vmf) if (unlikely(!encl)) return VM_FAULT_SIGBUS; + /* + * The page_array keeps track of all enclave pages, whether they + * are swapped out or not. If there is no entry for this page and + * the system supports SGX2 then it is possible to dynamically add + * a new enclave page. This is only possible for an initialized + * enclave that will be checked for right away. + */ + if (cpu_feature_enabled(X86_FEATURE_SGX2) && + (!xa_load(&encl->page_array, PFN_DOWN(addr)))) + return sgx_encl_eaug_page(vma, encl, addr); + mutex_lock(&encl->lock); - entry = sgx_encl_load_page(encl, addr, vma->vm_flags); + entry = sgx_encl_load_page_in_vma(encl, addr, vma->vm_flags); if (IS_ERR(entry)) { mutex_unlock(&encl->lock); @@ -367,6 +503,11 @@ int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, XA_STATE(xas, &encl->page_array, PFN_DOWN(start)); + /* Disallow mapping outside enclave's address range. */ + if (test_bit(SGX_ENCL_INITIALIZED, &encl->flags) && + (start < encl->base || end > encl->base + encl->size)) + return -EACCES; + /* * Disallow READ_IMPLIES_EXEC tasks as their VMA permissions might * conflict with the enclave page permissions. @@ -445,7 +586,7 @@ static struct sgx_encl_page *sgx_encl_reserve_page(struct sgx_encl *encl, for ( ; ; ) { mutex_lock(&encl->lock); - entry = sgx_encl_load_page(encl, addr, vm_flags); + entry = sgx_encl_load_page_in_vma(encl, addr, vm_flags); if (PTR_ERR(entry) != -EBUSY) break; @@ -687,7 +828,7 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm) spin_lock(&encl->mm_lock); list_add_rcu(&encl_mm->list, &encl->mm_list); - /* Pairs with smp_rmb() in sgx_reclaimer_block(). */ + /* Pairs with smp_rmb() in sgx_zap_enclave_ptes(). */ smp_wmb(); encl->mm_list_version++; spin_unlock(&encl->mm_lock); @@ -695,6 +836,73 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm) return 0; } +/** + * sgx_encl_cpumask() - Query which CPUs might be accessing the enclave + * @encl: the enclave + * + * Some SGX functions require that no cached linear-to-physical address + * mappings are present before they can succeed. For example, ENCLS[EWB] + * copies a page from the enclave page cache to regular main memory but + * it fails if it cannot ensure that there are no cached + * linear-to-physical address mappings referring to the page. + * + * SGX hardware flushes all cached linear-to-physical mappings on a CPU + * when an enclave is exited via ENCLU[EEXIT] or an Asynchronous Enclave + * Exit (AEX). Exiting an enclave will thus ensure cached linear-to-physical + * address mappings are cleared but coordination with the tracking done within + * the SGX hardware is needed to support the SGX functions that depend on this + * cache clearing. + * + * When the ENCLS[ETRACK] function is issued on an enclave the hardware + * tracks threads operating inside the enclave at that time. The SGX + * hardware tracking require that all the identified threads must have + * exited the enclave in order to flush the mappings before a function such + * as ENCLS[EWB] will be permitted + * + * The following flow is used to support SGX functions that require that + * no cached linear-to-physical address mappings are present: + * 1) Execute ENCLS[ETRACK] to initiate hardware tracking. + * 2) Use this function (sgx_encl_cpumask()) to query which CPUs might be + * accessing the enclave. + * 3) Send IPI to identified CPUs, kicking them out of the enclave and + * thus flushing all locally cached linear-to-physical address mappings. + * 4) Execute SGX function. + * + * Context: It is required to call this function after ENCLS[ETRACK]. + * This will ensure that if any new mm appears (racing with + * sgx_encl_mm_add()) then the new mm will enter into the + * enclave with fresh linear-to-physical address mappings. + * + * It is required that all IPIs are completed before a new + * ENCLS[ETRACK] is issued so be sure to protect steps 1 to 3 + * of the above flow with the enclave's mutex. + * + * Return: cpumask of CPUs that might be accessing @encl + */ +const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl) +{ + cpumask_t *cpumask = &encl->cpumask; + struct sgx_encl_mm *encl_mm; + int idx; + + cpumask_clear(cpumask); + + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { + if (!mmget_not_zero(encl_mm->mm)) + continue; + + cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm)); + + mmput_async(encl_mm->mm); + } + + srcu_read_unlock(&encl->srcu, idx); + + return cpumask; +} + static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl, pgoff_t index) { @@ -735,7 +943,6 @@ static int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, return PTR_ERR(pcmd); } - backing->page_index = page_index; backing->contents = contents; backing->pcmd = pcmd; backing->pcmd_offset = page_pcmd_off & (PAGE_SIZE - 1); @@ -902,8 +1109,85 @@ int sgx_encl_test_and_clear_young(struct mm_struct *mm, return ret; } +struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl, + unsigned long offset, + u64 secinfo_flags) +{ + struct sgx_encl_page *encl_page; + unsigned long prot; + + encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL); + if (!encl_page) + return ERR_PTR(-ENOMEM); + + encl_page->desc = encl->base + offset; + encl_page->encl = encl; + + prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ) | + _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) | + _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC); + + /* + * TCS pages must always RW set for CPU access while the SECINFO + * permissions are *always* zero - the CPU ignores the user provided + * values and silently overwrites them with zero permissions. + */ + if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS) + prot |= PROT_READ | PROT_WRITE; + + /* Calculate maximum of the VM flags for the page. */ + encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0); + + return encl_page; +} + +/** + * sgx_zap_enclave_ptes() - remove PTEs mapping the address from enclave + * @encl: the enclave + * @addr: page aligned pointer to single page for which PTEs will be removed + * + * Multiple VMAs may have an enclave page mapped. Remove the PTE mapping + * @addr from each VMA. Ensure that page fault handler is ready to handle + * new mappings of @addr before calling this function. + */ +void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr) +{ + unsigned long mm_list_version; + struct sgx_encl_mm *encl_mm; + struct vm_area_struct *vma; + int idx, ret; + + do { + mm_list_version = encl->mm_list_version; + + /* Pairs with smp_wmb() in sgx_encl_mm_add(). */ + smp_rmb(); + + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { + if (!mmget_not_zero(encl_mm->mm)) + continue; + + mmap_read_lock(encl_mm->mm); + + ret = sgx_encl_find(encl_mm->mm, addr, &vma); + if (!ret && encl == vma->vm_private_data) + zap_vma_ptes(vma, addr, PAGE_SIZE); + + mmap_read_unlock(encl_mm->mm); + + mmput_async(encl_mm->mm); + } + + srcu_read_unlock(&encl->srcu, idx); + } while (unlikely(encl->mm_list_version != mm_list_version)); +} + /** * sgx_alloc_va_page() - Allocate a Version Array (VA) page + * @reclaim: Reclaim EPC pages directly if none available. Enclave + * mutex should not be held if this is set. * * Allocate a free EPC page and convert it to a Version Array (VA) page. * @@ -911,12 +1195,12 @@ int sgx_encl_test_and_clear_young(struct mm_struct *mm, * a VA page, * -errno otherwise */ -struct sgx_epc_page *sgx_alloc_va_page(void) +struct sgx_epc_page *sgx_alloc_va_page(bool reclaim) { struct sgx_epc_page *epc_page; int ret; - epc_page = sgx_alloc_epc_page(NULL, true); + epc_page = sgx_alloc_epc_page(NULL, reclaim); if (IS_ERR(epc_page)) return ERR_CAST(epc_page); diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index 332ef3568267..a65a952116fd 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -27,7 +27,8 @@ struct sgx_encl_page { unsigned long desc; - unsigned long vm_max_prot_bits; + unsigned long vm_max_prot_bits:8; + enum sgx_page_type type:16; struct sgx_epc_page *epc_page; struct sgx_encl *encl; struct sgx_va_page *va_page; @@ -78,7 +79,6 @@ struct sgx_va_page { }; struct sgx_backing { - pgoff_t page_index; struct page *contents; struct page *pcmd; unsigned long pcmd_offset; @@ -106,6 +106,7 @@ int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, bool current_is_ksgxd(void); void sgx_encl_release(struct kref *ref); int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm); +const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl); int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index, struct sgx_backing *backing); int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index, @@ -113,11 +114,18 @@ int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index, void sgx_encl_put_backing(struct sgx_backing *backing); int sgx_encl_test_and_clear_young(struct mm_struct *mm, struct sgx_encl_page *page); - -struct sgx_epc_page *sgx_alloc_va_page(void); +struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl, + unsigned long offset, + u64 secinfo_flags); +void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr); +struct sgx_epc_page *sgx_alloc_va_page(bool reclaim); unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page); void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset); bool sgx_va_page_full(struct sgx_va_page *va_page); void sgx_encl_free_epc_page(struct sgx_epc_page *page); +struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, + unsigned long addr); +struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl, bool reclaim); +void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page); #endif /* _X86_ENCL_H */ diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h index fa04a73daf9c..99004b02e2ed 100644 --- a/arch/x86/kernel/cpu/sgx/encls.h +++ b/arch/x86/kernel/cpu/sgx/encls.h @@ -136,57 +136,71 @@ static inline bool encls_failed(int ret) ret; \ }) +/* Initialize an EPC page into an SGX Enclave Control Structure (SECS) page. */ static inline int __ecreate(struct sgx_pageinfo *pginfo, void *secs) { return __encls_2(ECREATE, pginfo, secs); } +/* Hash a 256 byte region of an enclave page to SECS:MRENCLAVE. */ static inline int __eextend(void *secs, void *addr) { return __encls_2(EEXTEND, secs, addr); } +/* + * Associate an EPC page to an enclave either as a REG or TCS page + * populated with the provided data. + */ static inline int __eadd(struct sgx_pageinfo *pginfo, void *addr) { return __encls_2(EADD, pginfo, addr); } +/* Finalize enclave build, initialize enclave for user code execution. */ static inline int __einit(void *sigstruct, void *token, void *secs) { return __encls_ret_3(EINIT, sigstruct, secs, token); } +/* Disassociate EPC page from its enclave and mark it as unused. */ static inline int __eremove(void *addr) { return __encls_ret_1(EREMOVE, addr); } +/* Copy data to an EPC page belonging to a debug enclave. */ static inline int __edbgwr(void *addr, unsigned long *data) { return __encls_2(EDGBWR, *data, addr); } +/* Copy data from an EPC page belonging to a debug enclave. */ static inline int __edbgrd(void *addr, unsigned long *data) { return __encls_1_1(EDGBRD, *data, addr); } +/* Track that software has completed the required TLB address clears. */ static inline int __etrack(void *addr) { return __encls_ret_1(ETRACK, addr); } +/* Load, verify, and unblock an EPC page. */ static inline int __eldu(struct sgx_pageinfo *pginfo, void *addr, void *va) { return __encls_ret_3(ELDU, pginfo, addr, va); } +/* Make EPC page inaccessible to enclave, ready to be written to memory. */ static inline int __eblock(void *addr) { return __encls_ret_1(EBLOCK, addr); } +/* Initialize an EPC page into a Version Array (VA) page. */ static inline int __epa(void *addr) { unsigned long rbx = SGX_PAGE_TYPE_VA; @@ -194,10 +208,29 @@ static inline int __epa(void *addr) return __encls_2(EPA, rbx, addr); } +/* Invalidate an EPC page and write it out to main memory. */ static inline int __ewb(struct sgx_pageinfo *pginfo, void *addr, void *va) { return __encls_ret_3(EWB, pginfo, addr, va); } +/* Restrict the EPCM permissions of an EPC page. */ +static inline int __emodpr(struct sgx_secinfo *secinfo, void *addr) +{ + return __encls_ret_2(EMODPR, secinfo, addr); +} + +/* Change the type of an EPC page. */ +static inline int __emodt(struct sgx_secinfo *secinfo, void *addr) +{ + return __encls_ret_2(EMODT, secinfo, addr); +} + +/* Zero a page of EPC memory and add it to an initialized enclave. */ +static inline int __eaug(struct sgx_pageinfo *pginfo, void *addr) +{ + return __encls_2(EAUG, pginfo, addr); +} + #endif /* _X86_ENCLS_H */ diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 83df20e3e633..ebe79d60619f 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -17,7 +17,7 @@ #include "encl.h" #include "encls.h" -static struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl) +struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl, bool reclaim) { struct sgx_va_page *va_page = NULL; void *err; @@ -30,7 +30,7 @@ static struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl) if (!va_page) return ERR_PTR(-ENOMEM); - va_page->epc_page = sgx_alloc_va_page(); + va_page->epc_page = sgx_alloc_va_page(reclaim); if (IS_ERR(va_page->epc_page)) { err = ERR_CAST(va_page->epc_page); kfree(va_page); @@ -43,7 +43,7 @@ static struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl) return va_page; } -static void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page) +void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page) { encl->page_cnt--; @@ -64,7 +64,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) struct file *backing; long ret; - va_page = sgx_encl_grow(encl); + va_page = sgx_encl_grow(encl, true); if (IS_ERR(va_page)) return PTR_ERR(va_page); else if (va_page) @@ -107,6 +107,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) set_bit(SGX_ENCL_DEBUG, &encl->flags); encl->secs.encl = encl; + encl->secs.type = SGX_PAGE_TYPE_SECS; encl->base = secs->base; encl->size = secs->size; encl->attributes = secs->attributes; @@ -168,38 +169,6 @@ static long sgx_ioc_enclave_create(struct sgx_encl *encl, void __user *arg) return ret; } -static struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl, - unsigned long offset, - u64 secinfo_flags) -{ - struct sgx_encl_page *encl_page; - unsigned long prot; - - encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL); - if (!encl_page) - return ERR_PTR(-ENOMEM); - - encl_page->desc = encl->base + offset; - encl_page->encl = encl; - - prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ) | - _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) | - _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC); - - /* - * TCS pages must always RW set for CPU access while the SECINFO - * permissions are *always* zero - the CPU ignores the user provided - * values and silently overwrites them with zero permissions. - */ - if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS) - prot |= PROT_READ | PROT_WRITE; - - /* Calculate maximum of the VM flags for the page. */ - encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0); - - return encl_page; -} - static int sgx_validate_secinfo(struct sgx_secinfo *secinfo) { u64 perm = secinfo->flags & SGX_SECINFO_PERMISSION_MASK; @@ -306,7 +275,7 @@ static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src, return PTR_ERR(epc_page); } - va_page = sgx_encl_grow(encl); + va_page = sgx_encl_grow(encl, true); if (IS_ERR(va_page)) { ret = PTR_ERR(va_page); goto err_out_free; @@ -344,6 +313,7 @@ static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src, */ encl_page->encl = encl; encl_page->epc_page = epc_page; + encl_page->type = (secinfo->flags & SGX_SECINFO_PAGE_TYPE_MASK) >> 8; encl->secs_child_cnt++; if (flags & SGX_PAGE_MEASURE) { @@ -372,6 +342,26 @@ err_out_free: return ret; } +/* + * Ensure user provided offset and length values are valid for + * an enclave. + */ +static int sgx_validate_offset_length(struct sgx_encl *encl, + unsigned long offset, + unsigned long length) +{ + if (!IS_ALIGNED(offset, PAGE_SIZE)) + return -EINVAL; + + if (!length || !IS_ALIGNED(length, PAGE_SIZE)) + return -EINVAL; + + if (offset + length - PAGE_SIZE >= encl->size) + return -EINVAL; + + return 0; +} + /** * sgx_ioc_enclave_add_pages() - The handler for %SGX_IOC_ENCLAVE_ADD_PAGES * @encl: an enclave pointer @@ -425,14 +415,10 @@ static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg) if (copy_from_user(&add_arg, arg, sizeof(add_arg))) return -EFAULT; - if (!IS_ALIGNED(add_arg.offset, PAGE_SIZE) || - !IS_ALIGNED(add_arg.src, PAGE_SIZE)) - return -EINVAL; - - if (!add_arg.length || add_arg.length & (PAGE_SIZE - 1)) + if (!IS_ALIGNED(add_arg.src, PAGE_SIZE)) return -EINVAL; - if (add_arg.offset + add_arg.length - PAGE_SIZE >= encl->size) + if (sgx_validate_offset_length(encl, add_arg.offset, add_arg.length)) return -EINVAL; if (copy_from_user(&secinfo, (void __user *)add_arg.secinfo, @@ -674,6 +660,565 @@ static long sgx_ioc_enclave_provision(struct sgx_encl *encl, void __user *arg) return sgx_set_attribute(&encl->attributes_mask, params.fd); } +/* + * Ensure enclave is ready for SGX2 functions. Readiness is checked + * by ensuring the hardware supports SGX2 and the enclave is initialized + * and thus able to handle requests to modify pages within it. + */ +static int sgx_ioc_sgx2_ready(struct sgx_encl *encl) +{ + if (!(cpu_feature_enabled(X86_FEATURE_SGX2))) + return -ENODEV; + + if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) + return -EINVAL; + + return 0; +} + +/* + * Some SGX functions require that no cached linear-to-physical address + * mappings are present before they can succeed. Collaborate with + * hardware via ENCLS[ETRACK] to ensure that all cached + * linear-to-physical address mappings belonging to all threads of + * the enclave are cleared. See sgx_encl_cpumask() for details. + * + * Must be called with enclave's mutex held from the time the + * SGX function requiring that no cached linear-to-physical mappings + * are present is executed until this ETRACK flow is complete. + */ +static int sgx_enclave_etrack(struct sgx_encl *encl) +{ + void *epc_virt; + int ret; + + epc_virt = sgx_get_epc_virt_addr(encl->secs.epc_page); + ret = __etrack(epc_virt); + if (ret) { + /* + * ETRACK only fails when there is an OS issue. For + * example, two consecutive ETRACK was sent without + * completed IPI between. + */ + pr_err_once("ETRACK returned %d (0x%x)", ret, ret); + /* + * Send IPIs to kick CPUs out of the enclave and + * try ETRACK again. + */ + on_each_cpu_mask(sgx_encl_cpumask(encl), sgx_ipi_cb, NULL, 1); + ret = __etrack(epc_virt); + if (ret) { + pr_err_once("ETRACK repeat returned %d (0x%x)", + ret, ret); + return -EFAULT; + } + } + on_each_cpu_mask(sgx_encl_cpumask(encl), sgx_ipi_cb, NULL, 1); + + return 0; +} + +/** + * sgx_enclave_restrict_permissions() - Restrict EPCM permissions + * @encl: Enclave to which the pages belong. + * @modp: Checked parameters from user on which pages need modifying and + * their new permissions. + * + * Return: + * - 0: Success. + * - -errno: Otherwise. + */ +static long +sgx_enclave_restrict_permissions(struct sgx_encl *encl, + struct sgx_enclave_restrict_permissions *modp) +{ + struct sgx_encl_page *entry; + struct sgx_secinfo secinfo; + unsigned long addr; + unsigned long c; + void *epc_virt; + int ret; + + memset(&secinfo, 0, sizeof(secinfo)); + secinfo.flags = modp->permissions & SGX_SECINFO_PERMISSION_MASK; + + for (c = 0 ; c < modp->length; c += PAGE_SIZE) { + addr = encl->base + modp->offset + c; + + sgx_reclaim_direct(); + + mutex_lock(&encl->lock); + + entry = sgx_encl_load_page(encl, addr); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT; + goto out_unlock; + } + + /* + * Changing EPCM permissions is only supported on regular + * SGX pages. Attempting this change on other pages will + * result in #PF. + */ + if (entry->type != SGX_PAGE_TYPE_REG) { + ret = -EINVAL; + goto out_unlock; + } + + /* + * Apart from ensuring that read-access remains, do not verify + * the permission bits requested. Kernel has no control over + * how EPCM permissions can be relaxed from within the enclave. + * ENCLS[EMODPR] can only remove existing EPCM permissions, + * attempting to set new permissions will be ignored by the + * hardware. + */ + + /* Change EPCM permissions. */ + epc_virt = sgx_get_epc_virt_addr(entry->epc_page); + ret = __emodpr(&secinfo, epc_virt); + if (encls_faulted(ret)) { + /* + * All possible faults should be avoidable: + * parameters have been checked, will only change + * permissions of a regular page, and no concurrent + * SGX1/SGX2 ENCLS instructions since these + * are protected with mutex. + */ + pr_err_once("EMODPR encountered exception %d\n", + ENCLS_TRAPNR(ret)); + ret = -EFAULT; + goto out_unlock; + } + if (encls_failed(ret)) { + modp->result = ret; + ret = -EFAULT; + goto out_unlock; + } + + ret = sgx_enclave_etrack(encl); + if (ret) { + ret = -EFAULT; + goto out_unlock; + } + + mutex_unlock(&encl->lock); + } + + ret = 0; + goto out; + +out_unlock: + mutex_unlock(&encl->lock); +out: + modp->count = c; + + return ret; +} + +/** + * sgx_ioc_enclave_restrict_permissions() - handler for + * %SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS + * @encl: an enclave pointer + * @arg: userspace pointer to a &struct sgx_enclave_restrict_permissions + * instance + * + * SGX2 distinguishes between relaxing and restricting the enclave page + * permissions maintained by the hardware (EPCM permissions) of pages + * belonging to an initialized enclave (after SGX_IOC_ENCLAVE_INIT). + * + * EPCM permissions cannot be restricted from within the enclave, the enclave + * requires the kernel to run the privileged level 0 instructions ENCLS[EMODPR] + * and ENCLS[ETRACK]. An attempt to relax EPCM permissions with this call + * will be ignored by the hardware. + * + * Return: + * - 0: Success + * - -errno: Otherwise + */ +static long sgx_ioc_enclave_restrict_permissions(struct sgx_encl *encl, + void __user *arg) +{ + struct sgx_enclave_restrict_permissions params; + long ret; + + ret = sgx_ioc_sgx2_ready(encl); + if (ret) + return ret; + + if (copy_from_user(¶ms, arg, sizeof(params))) + return -EFAULT; + + if (sgx_validate_offset_length(encl, params.offset, params.length)) + return -EINVAL; + + if (params.permissions & ~SGX_SECINFO_PERMISSION_MASK) + return -EINVAL; + + /* + * Fail early if invalid permissions requested to prevent ENCLS[EMODPR] + * from faulting later when the CPU does the same check. + */ + if ((params.permissions & SGX_SECINFO_W) && + !(params.permissions & SGX_SECINFO_R)) + return -EINVAL; + + if (params.result || params.count) + return -EINVAL; + + ret = sgx_enclave_restrict_permissions(encl, ¶ms); + + if (copy_to_user(arg, ¶ms, sizeof(params))) + return -EFAULT; + + return ret; +} + +/** + * sgx_enclave_modify_types() - Modify type of SGX enclave pages + * @encl: Enclave to which the pages belong. + * @modt: Checked parameters from user about which pages need modifying + * and their new page type. + * + * Return: + * - 0: Success + * - -errno: Otherwise + */ +static long sgx_enclave_modify_types(struct sgx_encl *encl, + struct sgx_enclave_modify_types *modt) +{ + unsigned long max_prot_restore; + enum sgx_page_type page_type; + struct sgx_encl_page *entry; + struct sgx_secinfo secinfo; + unsigned long prot; + unsigned long addr; + unsigned long c; + void *epc_virt; + int ret; + + page_type = modt->page_type & SGX_PAGE_TYPE_MASK; + + /* + * The only new page types allowed by hardware are PT_TCS and PT_TRIM. + */ + if (page_type != SGX_PAGE_TYPE_TCS && page_type != SGX_PAGE_TYPE_TRIM) + return -EINVAL; + + memset(&secinfo, 0, sizeof(secinfo)); + + secinfo.flags = page_type << 8; + + for (c = 0 ; c < modt->length; c += PAGE_SIZE) { + addr = encl->base + modt->offset + c; + + sgx_reclaim_direct(); + + mutex_lock(&encl->lock); + + entry = sgx_encl_load_page(encl, addr); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT; + goto out_unlock; + } + + /* + * Borrow the logic from the Intel SDM. Regular pages + * (SGX_PAGE_TYPE_REG) can change type to SGX_PAGE_TYPE_TCS + * or SGX_PAGE_TYPE_TRIM but TCS pages can only be trimmed. + * CET pages not supported yet. + */ + if (!(entry->type == SGX_PAGE_TYPE_REG || + (entry->type == SGX_PAGE_TYPE_TCS && + page_type == SGX_PAGE_TYPE_TRIM))) { + ret = -EINVAL; + goto out_unlock; + } + + max_prot_restore = entry->vm_max_prot_bits; + + /* + * Once a regular page becomes a TCS page it cannot be + * changed back. So the maximum allowed protection reflects + * the TCS page that is always RW from kernel perspective but + * will be inaccessible from within enclave. Before doing + * so, do make sure that the new page type continues to + * respect the originally vetted page permissions. + */ + if (entry->type == SGX_PAGE_TYPE_REG && + page_type == SGX_PAGE_TYPE_TCS) { + if (~entry->vm_max_prot_bits & (VM_READ | VM_WRITE)) { + ret = -EPERM; + goto out_unlock; + } + prot = PROT_READ | PROT_WRITE; + entry->vm_max_prot_bits = calc_vm_prot_bits(prot, 0); + + /* + * Prevent page from being reclaimed while mutex + * is released. + */ + if (sgx_unmark_page_reclaimable(entry->epc_page)) { + ret = -EAGAIN; + goto out_entry_changed; + } + + /* + * Do not keep encl->lock because of dependency on + * mmap_lock acquired in sgx_zap_enclave_ptes(). + */ + mutex_unlock(&encl->lock); + + sgx_zap_enclave_ptes(encl, addr); + + mutex_lock(&encl->lock); + + sgx_mark_page_reclaimable(entry->epc_page); + } + + /* Change EPC type */ + epc_virt = sgx_get_epc_virt_addr(entry->epc_page); + ret = __emodt(&secinfo, epc_virt); + if (encls_faulted(ret)) { + /* + * All possible faults should be avoidable: + * parameters have been checked, will only change + * valid page types, and no concurrent + * SGX1/SGX2 ENCLS instructions since these are + * protected with mutex. + */ + pr_err_once("EMODT encountered exception %d\n", + ENCLS_TRAPNR(ret)); + ret = -EFAULT; + goto out_entry_changed; + } + if (encls_failed(ret)) { + modt->result = ret; + ret = -EFAULT; + goto out_entry_changed; + } + + ret = sgx_enclave_etrack(encl); + if (ret) { + ret = -EFAULT; + goto out_unlock; + } + + entry->type = page_type; + + mutex_unlock(&encl->lock); + } + + ret = 0; + goto out; + +out_entry_changed: + entry->vm_max_prot_bits = max_prot_restore; +out_unlock: + mutex_unlock(&encl->lock); +out: + modt->count = c; + + return ret; +} + +/** + * sgx_ioc_enclave_modify_types() - handler for %SGX_IOC_ENCLAVE_MODIFY_TYPES + * @encl: an enclave pointer + * @arg: userspace pointer to a &struct sgx_enclave_modify_types instance + * + * Ability to change the enclave page type supports the following use cases: + * + * * It is possible to add TCS pages to an enclave by changing the type of + * regular pages (%SGX_PAGE_TYPE_REG) to TCS (%SGX_PAGE_TYPE_TCS) pages. + * With this support the number of threads supported by an initialized + * enclave can be increased dynamically. + * + * * Regular or TCS pages can dynamically be removed from an initialized + * enclave by changing the page type to %SGX_PAGE_TYPE_TRIM. Changing the + * page type to %SGX_PAGE_TYPE_TRIM marks the page for removal with actual + * removal done by handler of %SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl() called + * after ENCLU[EACCEPT] is run on %SGX_PAGE_TYPE_TRIM page from within the + * enclave. + * + * Return: + * - 0: Success + * - -errno: Otherwise + */ +static long sgx_ioc_enclave_modify_types(struct sgx_encl *encl, + void __user *arg) +{ + struct sgx_enclave_modify_types params; + long ret; + + ret = sgx_ioc_sgx2_ready(encl); + if (ret) + return ret; + + if (copy_from_user(¶ms, arg, sizeof(params))) + return -EFAULT; + + if (sgx_validate_offset_length(encl, params.offset, params.length)) + return -EINVAL; + + if (params.page_type & ~SGX_PAGE_TYPE_MASK) + return -EINVAL; + + if (params.result || params.count) + return -EINVAL; + + ret = sgx_enclave_modify_types(encl, ¶ms); + + if (copy_to_user(arg, ¶ms, sizeof(params))) + return -EFAULT; + + return ret; +} + +/** + * sgx_encl_remove_pages() - Remove trimmed pages from SGX enclave + * @encl: Enclave to which the pages belong + * @params: Checked parameters from user on which pages need to be removed + * + * Return: + * - 0: Success. + * - -errno: Otherwise. + */ +static long sgx_encl_remove_pages(struct sgx_encl *encl, + struct sgx_enclave_remove_pages *params) +{ + struct sgx_encl_page *entry; + struct sgx_secinfo secinfo; + unsigned long addr; + unsigned long c; + void *epc_virt; + int ret; + + memset(&secinfo, 0, sizeof(secinfo)); + secinfo.flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X; + + for (c = 0 ; c < params->length; c += PAGE_SIZE) { + addr = encl->base + params->offset + c; + + sgx_reclaim_direct(); + + mutex_lock(&encl->lock); + + entry = sgx_encl_load_page(encl, addr); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT; + goto out_unlock; + } + + if (entry->type != SGX_PAGE_TYPE_TRIM) { + ret = -EPERM; + goto out_unlock; + } + + /* + * ENCLS[EMODPR] is a no-op instruction used to inform if + * ENCLU[EACCEPT] was run from within the enclave. If + * ENCLS[EMODPR] is run with RWX on a trimmed page that is + * not yet accepted then it will return + * %SGX_PAGE_NOT_MODIFIABLE, after the trimmed page is + * accepted the instruction will encounter a page fault. + */ + epc_virt = sgx_get_epc_virt_addr(entry->epc_page); + ret = __emodpr(&secinfo, epc_virt); + if (!encls_faulted(ret) || ENCLS_TRAPNR(ret) != X86_TRAP_PF) { + ret = -EPERM; + goto out_unlock; + } + + if (sgx_unmark_page_reclaimable(entry->epc_page)) { + ret = -EBUSY; + goto out_unlock; + } + + /* + * Do not keep encl->lock because of dependency on + * mmap_lock acquired in sgx_zap_enclave_ptes(). + */ + mutex_unlock(&encl->lock); + + sgx_zap_enclave_ptes(encl, addr); + + mutex_lock(&encl->lock); + + sgx_encl_free_epc_page(entry->epc_page); + encl->secs_child_cnt--; + entry->epc_page = NULL; + xa_erase(&encl->page_array, PFN_DOWN(entry->desc)); + sgx_encl_shrink(encl, NULL); + kfree(entry); + + mutex_unlock(&encl->lock); + } + + ret = 0; + goto out; + +out_unlock: + mutex_unlock(&encl->lock); +out: + params->count = c; + + return ret; +} + +/** + * sgx_ioc_enclave_remove_pages() - handler for %SGX_IOC_ENCLAVE_REMOVE_PAGES + * @encl: an enclave pointer + * @arg: userspace pointer to &struct sgx_enclave_remove_pages instance + * + * Final step of the flow removing pages from an initialized enclave. The + * complete flow is: + * + * 1) User changes the type of the pages to be removed to %SGX_PAGE_TYPE_TRIM + * using the %SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl(). + * 2) User approves the page removal by running ENCLU[EACCEPT] from within + * the enclave. + * 3) User initiates actual page removal using the + * %SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl() that is handled here. + * + * First remove any page table entries pointing to the page and then proceed + * with the actual removal of the enclave page and data in support of it. + * + * VA pages are not affected by this removal. It is thus possible that the + * enclave may end up with more VA pages than needed to support all its + * pages. + * + * Return: + * - 0: Success + * - -errno: Otherwise + */ +static long sgx_ioc_enclave_remove_pages(struct sgx_encl *encl, + void __user *arg) +{ + struct sgx_enclave_remove_pages params; + long ret; + + ret = sgx_ioc_sgx2_ready(encl); + if (ret) + return ret; + + if (copy_from_user(¶ms, arg, sizeof(params))) + return -EFAULT; + + if (sgx_validate_offset_length(encl, params.offset, params.length)) + return -EINVAL; + + if (params.count) + return -EINVAL; + + ret = sgx_encl_remove_pages(encl, ¶ms); + + if (copy_to_user(arg, ¶ms, sizeof(params))) + return -EFAULT; + + return ret; +} + long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) { struct sgx_encl *encl = filep->private_data; @@ -695,6 +1240,16 @@ long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) case SGX_IOC_ENCLAVE_PROVISION: ret = sgx_ioc_enclave_provision(encl, (void __user *)arg); break; + case SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS: + ret = sgx_ioc_enclave_restrict_permissions(encl, + (void __user *)arg); + break; + case SGX_IOC_ENCLAVE_MODIFY_TYPES: + ret = sgx_ioc_enclave_modify_types(encl, (void __user *)arg); + break; + case SGX_IOC_ENCLAVE_REMOVE_PAGES: + ret = sgx_ioc_enclave_remove_pages(encl, (void __user *)arg); + break; default: ret = -ENOIOCTLCMD; break; diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index a78652d43e61..515e2a5f25bb 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -137,36 +137,9 @@ static void sgx_reclaimer_block(struct sgx_epc_page *epc_page) struct sgx_encl_page *page = epc_page->owner; unsigned long addr = page->desc & PAGE_MASK; struct sgx_encl *encl = page->encl; - unsigned long mm_list_version; - struct sgx_encl_mm *encl_mm; - struct vm_area_struct *vma; - int idx, ret; - - do { - mm_list_version = encl->mm_list_version; - - /* Pairs with smp_rmb() in sgx_encl_mm_add(). */ - smp_rmb(); - - idx = srcu_read_lock(&encl->srcu); - - list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { - if (!mmget_not_zero(encl_mm->mm)) - continue; - - mmap_read_lock(encl_mm->mm); - - ret = sgx_encl_find(encl_mm->mm, addr, &vma); - if (!ret && encl == vma->vm_private_data) - zap_vma_ptes(vma, addr, PAGE_SIZE); - - mmap_read_unlock(encl_mm->mm); - - mmput_async(encl_mm->mm); - } + int ret; - srcu_read_unlock(&encl->srcu, idx); - } while (unlikely(encl->mm_list_version != mm_list_version)); + sgx_zap_enclave_ptes(encl, addr); mutex_lock(&encl->lock); @@ -201,39 +174,10 @@ static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot, return ret; } -static void sgx_ipi_cb(void *info) +void sgx_ipi_cb(void *info) { } -static const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl) -{ - cpumask_t *cpumask = &encl->cpumask; - struct sgx_encl_mm *encl_mm; - int idx; - - /* - * Can race with sgx_encl_mm_add(), but ETRACK has already been - * executed, which means that the CPUs running in the new mm will enter - * into the enclave with a fresh epoch. - */ - cpumask_clear(cpumask); - - idx = srcu_read_lock(&encl->srcu); - - list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { - if (!mmget_not_zero(encl_mm->mm)) - continue; - - cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm)); - - mmput_async(encl_mm->mm); - } - - srcu_read_unlock(&encl->srcu, idx); - - return cpumask; -} - /* * Swap page to the regular memory transformed to the blocked state by using * EBLOCK, which means that it can no longer be referenced (no new TLB entries). @@ -280,7 +224,7 @@ static void sgx_encl_ewb(struct sgx_epc_page *epc_page, * miss cpus that entered the enclave between * generating the mask and incrementing epoch. */ - on_each_cpu_mask(sgx_encl_ewb_cpumask(encl), + on_each_cpu_mask(sgx_encl_cpumask(encl), sgx_ipi_cb, NULL, 1); ret = __sgx_encl_ewb(epc_page, va_slot, backing); } @@ -431,6 +375,17 @@ static bool sgx_should_reclaim(unsigned long watermark) !list_empty(&sgx_active_page_list); } +/* + * sgx_reclaim_direct() should be called (without enclave's mutex held) + * in locations where SGX memory resources might be low and might be + * needed in order to make forward progress. + */ +void sgx_reclaim_direct(void) +{ + if (sgx_should_reclaim(SGX_NR_LOW_PAGES)) + sgx_reclaim_pages(); +} + static int ksgxd(void *p) { set_freezable(); diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index 0f17def9fe6f..0f2020653fba 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -86,10 +86,13 @@ static inline void *sgx_get_epc_virt_addr(struct sgx_epc_page *page) struct sgx_epc_page *__sgx_alloc_epc_page(void); void sgx_free_epc_page(struct sgx_epc_page *page); +void sgx_reclaim_direct(void); void sgx_mark_page_reclaimable(struct sgx_epc_page *page); int sgx_unmark_page_reclaimable(struct sgx_epc_page *page); struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim); +void sgx_ipi_cb(void *info); + #ifdef CONFIG_X86_SGX_KVM int __init sgx_vepc_init(void); #else diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 4fe7af58cfe1..9417d5aa7305 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -100,7 +100,7 @@ static void init_espfix_random(void) * This is run before the entropy pools are initialized, * but this is hopefully better than nothing. */ - if (!arch_get_random_long(&rand)) { + if (!arch_get_random_longs(&rand, 1)) { /* The constant is an arbitrary large prime */ rand = rdtsc(); rand *= 0xc345c6b72fd16123UL; diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 24b9fa89aa27..bd165004776d 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -91,6 +91,7 @@ static int ftrace_verify_code(unsigned long ip, const char *old_code) /* Make sure it is what we expect it to be */ if (memcmp(cur_code, old_code, MCOUNT_INSN_SIZE) != 0) { + ftrace_expected = old_code; WARN_ON(1); return -EINVAL; } diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index b9bdb40364a6..6b58610a1552 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -17,7 +17,6 @@ #include <linux/kernel.h> #include <linux/mm.h> #include <linux/efi.h> -#include <linux/verification.h> #include <linux/random.h> #include <asm/bootparam.h> @@ -596,28 +595,11 @@ static int bzImage64_cleanup(void *loader_data) return 0; } -#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG -static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len) -{ - int ret; - - ret = verify_pefile_signature(kernel, kernel_len, - VERIFY_USE_SECONDARY_KEYRING, - VERIFYING_KEXEC_PE_SIGNATURE); - if (ret == -ENOKEY && IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING)) { - ret = verify_pefile_signature(kernel, kernel_len, - VERIFY_USE_PLATFORM_KEYRING, - VERIFYING_KEXEC_PE_SIGNATURE); - } - return ret; -} -#endif - const struct kexec_file_ops kexec_bzImage64_ops = { .probe = bzImage64_probe, .load = bzImage64_load, .cleanup = bzImage64_cleanup, #ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG - .verify_sig = bzImage64_verify_sig, + .verify_sig = kexec_kernel_verify_pe_sig, #endif }; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 1a3658f7e6d9..d4e48b4a438b 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -236,8 +236,7 @@ again: raw_spin_unlock(&b->lock); /* A dummy token might be allocated and ultimately not used. */ - if (dummy) - kfree(dummy); + kfree(dummy); } EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake); diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 0c1154a1c403..3bacd935f840 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -6,7 +6,6 @@ * Copyright (c) 2006-2009, Intel Corporation */ -#include <linux/intel-iommu.h> #include <linux/init_task.h> #include <linux/spinlock.h> #include <linux/export.h> @@ -516,17 +515,3 @@ struct acpi_table_header *tboot_get_dmar_table(struct acpi_table_header *dmar_tb return dmar_tbl; } - -int tboot_force_iommu(void) -{ - if (!tboot_enabled()) - return 0; - - if (no_iommu || dmar_disabled) - pr_warn("Forcing Intel-IOMMU to enabled\n"); - - dmar_disabled = 0; - no_iommu = 0; - - return 1; -} diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index de6d44e07e34..75dcf7a72605 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -67,9 +67,17 @@ u32 xstate_required_size(u64 xstate_bv, bool compacted) #define F feature_bit #define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0) +/* + * Magic value used by KVM when querying userspace-provided CPUID entries and + * doesn't care about the CPIUD index because the index of the function in + * question is not significant. Note, this magic value must have at least one + * bit set in bits[63:32] and must be consumed as a u64 by cpuid_entry2_find() + * to avoid false positives when processing guest CPUID input. + */ +#define KVM_CPUID_INDEX_NOT_SIGNIFICANT -1ull static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( - struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index) + struct kvm_cpuid_entry2 *entries, int nent, u32 function, u64 index) { struct kvm_cpuid_entry2 *e; int i; @@ -77,9 +85,31 @@ static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( for (i = 0; i < nent; i++) { e = &entries[i]; - if (e->function == function && - (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) || e->index == index)) + if (e->function != function) + continue; + + /* + * If the index isn't significant, use the first entry with a + * matching function. It's userspace's responsibilty to not + * provide "duplicate" entries in all cases. + */ + if (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) || e->index == index) + return e; + + + /* + * Similarly, use the first matching entry if KVM is doing a + * lookup (as opposed to emulating CPUID) for a function that's + * architecturally defined as not having a significant index. + */ + if (index == KVM_CPUID_INDEX_NOT_SIGNIFICANT) { + /* + * Direct lookups from KVM should not diverge from what + * KVM defines internally (the architectural behavior). + */ + WARN_ON_ONCE(cpuid_function_is_indexed(function)); return e; + } } return NULL; @@ -96,7 +126,8 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu, * The existing code assumes virtual address is 48-bit or 57-bit in the * canonical address checks; exit if it is ever changed. */ - best = cpuid_entry2_find(entries, nent, 0x80000008, 0); + best = cpuid_entry2_find(entries, nent, 0x80000008, + KVM_CPUID_INDEX_NOT_SIGNIFICANT); if (best) { int vaddr_bits = (best->eax & 0xff00) >> 8; @@ -151,7 +182,7 @@ static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu) vcpu->arch.kvm_cpuid_base = 0; for_each_possible_hypervisor_cpuid_base(function) { - entry = kvm_find_cpuid_entry(vcpu, function, 0); + entry = kvm_find_cpuid_entry(vcpu, function); if (entry) { u32 signature[3]; @@ -177,7 +208,8 @@ static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *v if (!base) return NULL; - return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES, 0); + return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES, + KVM_CPUID_INDEX_NOT_SIGNIFICANT); } static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu) @@ -200,7 +232,7 @@ void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) /* * Calculate guest's supported XCR0 taking into account guest CPUID data and - * supported_xcr0 (comprised of host configuration and KVM_SUPPORTED_XCR0). + * KVM's supported XCR0 (comprised of host's XCR0 and KVM_SUPPORTED_XCR0). */ static u64 cpuid_get_supported_xcr0(struct kvm_cpuid_entry2 *entries, int nent) { @@ -210,7 +242,7 @@ static u64 cpuid_get_supported_xcr0(struct kvm_cpuid_entry2 *entries, int nent) if (!best) return 0; - return (best->eax | ((u64)best->edx << 32)) & supported_xcr0; + return (best->eax | ((u64)best->edx << 32)) & kvm_caps.supported_xcr0; } static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries, @@ -219,7 +251,7 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e struct kvm_cpuid_entry2 *best; u64 guest_supported_xcr0 = cpuid_get_supported_xcr0(entries, nent); - best = cpuid_entry2_find(entries, nent, 1, 0); + best = cpuid_entry2_find(entries, nent, 1, KVM_CPUID_INDEX_NOT_SIGNIFICANT); if (best) { /* Update OSXSAVE bit */ if (boot_cpu_has(X86_FEATURE_XSAVE)) @@ -250,7 +282,7 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) { - best = cpuid_entry2_find(entries, nent, 0x1, 0); + best = cpuid_entry2_find(entries, nent, 0x1, KVM_CPUID_INDEX_NOT_SIGNIFICANT); if (best) cpuid_entry_change(best, X86_FEATURE_MWAIT, vcpu->arch.ia32_misc_enable_msr & @@ -285,7 +317,7 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) struct kvm_cpuid_entry2 *best; u64 guest_supported_xcr0; - best = kvm_find_cpuid_entry(vcpu, 1, 0); + best = kvm_find_cpuid_entry(vcpu, 1); if (best && apic) { if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER)) apic->lapic_timer.timer_mode_mask = 3 << 17; @@ -325,10 +357,10 @@ int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0); + best = kvm_find_cpuid_entry(vcpu, 0x80000000); if (!best || best->eax < 0x80000008) goto not_found; - best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); + best = kvm_find_cpuid_entry(vcpu, 0x80000008); if (best) return best->eax & 0xff; not_found: @@ -868,7 +900,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) case 9: break; case 0xa: { /* Architectural Performance Monitoring */ - struct x86_pmu_capability cap; union cpuid10_eax eax; union cpuid10_edx edx; @@ -877,30 +908,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) break; } - perf_get_x86_pmu_capability(&cap); + eax.split.version_id = kvm_pmu_cap.version; + eax.split.num_counters = kvm_pmu_cap.num_counters_gp; + eax.split.bit_width = kvm_pmu_cap.bit_width_gp; + eax.split.mask_length = kvm_pmu_cap.events_mask_len; + edx.split.num_counters_fixed = kvm_pmu_cap.num_counters_fixed; + edx.split.bit_width_fixed = kvm_pmu_cap.bit_width_fixed; - /* - * The guest architecture pmu is only supported if the architecture - * pmu exists on the host and the module parameters allow it. - */ - if (!cap.version || !enable_pmu) - memset(&cap, 0, sizeof(cap)); - - eax.split.version_id = min(cap.version, 2); - eax.split.num_counters = cap.num_counters_gp; - eax.split.bit_width = cap.bit_width_gp; - eax.split.mask_length = cap.events_mask_len; - - edx.split.num_counters_fixed = - min(cap.num_counters_fixed, KVM_PMC_MAX_FIXED); - edx.split.bit_width_fixed = cap.bit_width_fixed; - if (cap.version) + if (kvm_pmu_cap.version) edx.split.anythread_deprecated = 1; edx.split.reserved1 = 0; edx.split.reserved2 = 0; entry->eax = eax.full; - entry->ebx = cap.events_mask; + entry->ebx = kvm_pmu_cap.events_mask; entry->ecx = 0; entry->edx = edx.full; break; @@ -923,8 +944,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) } break; case 0xd: { - u64 permitted_xcr0 = supported_xcr0 & xstate_get_guest_group_perm(); - u64 permitted_xss = supported_xss; + u64 permitted_xcr0 = kvm_caps.supported_xcr0 & xstate_get_guest_group_perm(); + u64 permitted_xss = kvm_caps.supported_xss; entry->eax &= permitted_xcr0; entry->ebx = xstate_required_size(permitted_xcr0, false); @@ -1313,12 +1334,20 @@ out_free: return r; } -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, - u32 function, u32 index) +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, + u32 function, u32 index) { return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent, function, index); } +EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry_index); + +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, + u32 function) +{ + return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent, + function, KVM_CPUID_INDEX_NOT_SIGNIFICANT); +} EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); /* @@ -1355,7 +1384,7 @@ get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index) struct kvm_cpuid_entry2 *basic, *class; u32 function = *fn_ptr; - basic = kvm_find_cpuid_entry(vcpu, 0, 0); + basic = kvm_find_cpuid_entry(vcpu, 0); if (!basic) return NULL; @@ -1364,11 +1393,11 @@ get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index) return NULL; if (function >= 0x40000000 && function <= 0x4fffffff) - class = kvm_find_cpuid_entry(vcpu, function & 0xffffff00, 0); + class = kvm_find_cpuid_entry(vcpu, function & 0xffffff00); else if (function >= 0xc0000000) - class = kvm_find_cpuid_entry(vcpu, 0xc0000000, 0); + class = kvm_find_cpuid_entry(vcpu, 0xc0000000); else - class = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); + class = kvm_find_cpuid_entry(vcpu, function & 0x80000000); if (class && function <= class->eax) return NULL; @@ -1386,7 +1415,7 @@ get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index) * the effective CPUID entry is the max basic leaf. Note, the index of * the original requested leaf is observed! */ - return kvm_find_cpuid_entry(vcpu, basic->eax, index); + return kvm_find_cpuid_entry_index(vcpu, basic->eax, index); } bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, @@ -1396,7 +1425,7 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, struct kvm_cpuid_entry2 *entry; bool exact, used_max_basic = false; - entry = kvm_find_cpuid_entry(vcpu, function, index); + entry = kvm_find_cpuid_entry_index(vcpu, function, index); exact = !!entry; if (!entry && !exact_only) { @@ -1425,7 +1454,7 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, * exists. EDX can be copied from any existing index. */ if (function == 0xb || function == 0x1f) { - entry = kvm_find_cpuid_entry(vcpu, function, 1); + entry = kvm_find_cpuid_entry_index(vcpu, function, 1); if (entry) { *ecx = index & 0xff; *edx = entry->edx; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 8a770b481d9d..b1658c0de847 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -13,8 +13,10 @@ void kvm_set_cpu_caps(void); void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu); void kvm_update_pv_runtime(struct kvm_vcpu *vcpu); +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, + u32 function, u32 index); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, - u32 function, u32 index); + u32 function); int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries, unsigned int type); @@ -76,7 +78,7 @@ static __always_inline u32 *guest_cpuid_get_register(struct kvm_vcpu *vcpu, const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature); struct kvm_cpuid_entry2 *entry; - entry = kvm_find_cpuid_entry(vcpu, cpuid.function, cpuid.index); + entry = kvm_find_cpuid_entry_index(vcpu, cpuid.function, cpuid.index); if (!entry) return NULL; @@ -109,7 +111,7 @@ static inline bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0, 0); + best = kvm_find_cpuid_entry(vcpu, 0); return best && (is_guest_vendor_amd(best->ebx, best->ecx, best->edx) || is_guest_vendor_hygon(best->ebx, best->ecx, best->edx)); @@ -119,7 +121,7 @@ static inline bool guest_cpuid_is_intel(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0, 0); + best = kvm_find_cpuid_entry(vcpu, 0); return best && is_guest_vendor_intel(best->ebx, best->ecx, best->edx); } @@ -127,7 +129,7 @@ static inline int guest_cpuid_family(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0x1, 0); + best = kvm_find_cpuid_entry(vcpu, 0x1); if (!best) return -1; @@ -138,18 +140,23 @@ static inline int guest_cpuid_model(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0x1, 0); + best = kvm_find_cpuid_entry(vcpu, 0x1); if (!best) return -1; return x86_model(best->eax); } +static inline bool cpuid_model_is_consistent(struct kvm_vcpu *vcpu) +{ + return boot_cpu_data.x86_model == guest_cpuid_model(vcpu); +} + static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 0x1, 0); + best = kvm_find_cpuid_entry(vcpu, 0x1); if (!best) return -1; diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index 9240b3b7f8dd..cfed36aba2f7 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -48,7 +48,7 @@ DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_fops, vcpu_get_tsc_scaling_ratio, NULL, static int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val) { - *val = kvm_tsc_scaling_ratio_frac_bits; + *val = kvm_caps.tsc_scaling_ratio_frac_bits; return 0; } @@ -66,7 +66,7 @@ void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_ debugfs_dentry, vcpu, &vcpu_timer_advance_ns_fops); - if (kvm_has_tsc_control) { + if (kvm_caps.has_tsc_control) { debugfs_create_file("tsc-scaling-ratio", 0444, debugfs_dentry, vcpu, &vcpu_tsc_scaling_fops); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f8382abe22ff..047c583596bb 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -244,6 +244,9 @@ enum x86_transfer_type { static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr) { + if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt)) + nr &= NR_EMULATOR_GPRS - 1; + if (!(ctxt->regs_valid & (1 << nr))) { ctxt->regs_valid |= 1 << nr; ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr); @@ -253,6 +256,12 @@ static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr) static ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr) { + if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt)) + nr &= NR_EMULATOR_GPRS - 1; + + BUILD_BUG_ON(sizeof(ctxt->regs_dirty) * BITS_PER_BYTE < NR_EMULATOR_GPRS); + BUILD_BUG_ON(sizeof(ctxt->regs_valid) * BITS_PER_BYTE < NR_EMULATOR_GPRS); + ctxt->regs_valid |= 1 << nr; ctxt->regs_dirty |= 1 << nr; return &ctxt->_regs[nr]; @@ -266,9 +275,10 @@ static ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr) static void writeback_registers(struct x86_emulate_ctxt *ctxt) { + unsigned long dirty = ctxt->regs_dirty; unsigned reg; - for_each_set_bit(reg, (ulong *)&ctxt->regs_dirty, 16) + for_each_set_bit(reg, &dirty, NR_EMULATOR_GPRS) ctxt->ops->write_gpr(ctxt, reg, ctxt->_regs[reg]); } @@ -615,7 +625,9 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, u32 error, bool valid) { - WARN_ON(vec > 0x1f); + if (KVM_EMULATOR_BUG_ON(vec > 0x1f, ctxt)) + return X86EMUL_UNHANDLEABLE; + ctxt->exception.vector = vec; ctxt->exception.error_code = error; ctxt->exception.error_code_valid = valid; @@ -1362,7 +1374,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, if (mc->pos < mc->end) goto read_cached; - WARN_ON((mc->end + size) >= sizeof(mc->data)); + if (KVM_EMULATOR_BUG_ON((mc->end + size) >= sizeof(mc->data), ctxt)) + return X86EMUL_UNHANDLEABLE; rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, size, &ctxt->exception); @@ -1687,16 +1700,6 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, case VCPU_SREG_TR: if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9)) goto exception; - if (!seg_desc.p) { - err_vec = NP_VECTOR; - goto exception; - } - old_desc = seg_desc; - seg_desc.type |= 2; /* busy */ - ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc, - sizeof(seg_desc), &ctxt->exception); - if (ret != X86EMUL_CONTINUE) - return ret; break; case VCPU_SREG_LDTR: if (seg_desc.s || seg_desc.type != 2) @@ -1734,8 +1737,17 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (ret != X86EMUL_CONTINUE) return ret; if (emul_is_noncanonical_address(get_desc_base(&seg_desc) | - ((u64)base3 << 32), ctxt)) - return emulate_gp(ctxt, 0); + ((u64)base3 << 32), ctxt)) + return emulate_gp(ctxt, err_code); + } + + if (seg == VCPU_SREG_TR) { + old_desc = seg_desc; + seg_desc.type |= 2; /* busy */ + ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc, + sizeof(seg_desc), &ctxt->exception); + if (ret != X86EMUL_CONTINUE) + return ret; } load: ctxt->ops->set_segment(ctxt, selector, &seg_desc, base3, seg); @@ -2432,7 +2444,7 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7ff4) | X86_EFLAGS_FIXED; ctxt->_eip = GET_SMSTATE(u32, smstate, 0x7ff0); - for (i = 0; i < 8; i++) + for (i = 0; i < NR_EMULATOR_GPRS; i++) *reg_write(ctxt, i) = GET_SMSTATE(u32, smstate, 0x7fd0 + i * 4); val = GET_SMSTATE(u32, smstate, 0x7fcc); @@ -2489,7 +2501,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u16 selector; int i, r; - for (i = 0; i < 16; i++) + for (i = 0; i < NR_EMULATOR_GPRS; i++) *reg_write(ctxt, i) = GET_SMSTATE(u64, smstate, 0x7ff8 - i * 8); ctxt->_eip = GET_SMSTATE(u64, smstate, 0x7f78); @@ -5719,7 +5731,8 @@ writeback: done: if (rc == X86EMUL_PROPAGATE_FAULT) { - WARN_ON(ctxt->exception.vector > 0x1f); + if (KVM_EMULATOR_BUG_ON(ctxt->exception.vector > 0x1f, ctxt)) + return EMULATION_FAILED; ctxt->have_exception = true; } if (rc == X86EMUL_INTERCEPTED) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index e2e95a6fccfd..ed804447589c 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1992,7 +1992,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) struct kvm_cpuid_entry2 *entry; struct kvm_vcpu_hv *hv_vcpu; - entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE, 0); + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE); if (entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX) { vcpu->arch.hyperv_enabled = true; } else { @@ -2005,7 +2005,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) hv_vcpu = to_hv_vcpu(vcpu); - entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES, 0); + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES); if (entry) { hv_vcpu->cpuid_cache.features_eax = entry->eax; hv_vcpu->cpuid_cache.features_ebx = entry->ebx; @@ -2016,7 +2016,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) hv_vcpu->cpuid_cache.features_edx = 0; } - entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO, 0); + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO); if (entry) { hv_vcpu->cpuid_cache.enlightenments_eax = entry->eax; hv_vcpu->cpuid_cache.enlightenments_ebx = entry->ebx; @@ -2025,7 +2025,7 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) hv_vcpu->cpuid_cache.enlightenments_ebx = 0; } - entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES, 0); + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES); if (entry) hv_vcpu->cpuid_cache.syndbg_cap_eax = entry->eax; else diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 1c83076091af..e0a7a0e7a73c 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -591,7 +591,10 @@ static int speaker_ioport_write(struct kvm_vcpu *vcpu, return -EOPNOTSUPP; mutex_lock(&pit_state->lock); - pit_state->speaker_data_on = (val >> 1) & 1; + if (val & (1 << 1)) + pit_state->flags |= KVM_PIT_FLAGS_SPEAKER_DATA_ON; + else + pit_state->flags &= ~KVM_PIT_FLAGS_SPEAKER_DATA_ON; pit_set_gate(pit, 2, val & 1); mutex_unlock(&pit_state->lock); return 0; @@ -612,8 +615,9 @@ static int speaker_ioport_read(struct kvm_vcpu *vcpu, refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1; mutex_lock(&pit_state->lock); - ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(pit, 2) | - (pit_get_out(pit, 2) << 5) | (refresh_clock << 4)); + ret = (!!(pit_state->flags & KVM_PIT_FLAGS_SPEAKER_DATA_ON) << 1) | + pit_get_gate(pit, 2) | (pit_get_out(pit, 2) << 5) | + (refresh_clock << 4); if (len > sizeof(ret)) len = sizeof(ret); memcpy(data, (char *)&ret, len); diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 394d9527da7e..a768212ba821 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -29,7 +29,6 @@ struct kvm_kpit_state { bool is_periodic; s64 period; /* unit: ns */ struct hrtimer timer; - u32 speaker_data_on; struct mutex lock; atomic_t reinject; diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 8dff25d267b7..89246446d6aa 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -89,6 +89,7 @@ struct x86_instruction_info { #define X86EMUL_INTERCEPTED 6 /* Intercepted by nested VMCB/VMCS */ struct x86_emulate_ops { + void (*vm_bugged)(struct x86_emulate_ctxt *ctxt); /* * read_gpr: read a general purpose register (rax - r15) * @@ -301,6 +302,18 @@ struct fastop; typedef void (*fastop_t)(struct fastop *); +/* + * The emulator's _regs array tracks only the GPRs, i.e. excludes RIP. RIP is + * tracked/accessed via _eip, and except for RIP relative addressing, which + * also uses _eip, RIP cannot be a register operand nor can it be an operand in + * a ModRM or SIB byte. + */ +#ifdef CONFIG_X86_64 +#define NR_EMULATOR_GPRS 16 +#else +#define NR_EMULATOR_GPRS 8 +#endif + struct x86_emulate_ctxt { void *vcpu; const struct x86_emulate_ops *ops; @@ -345,9 +358,9 @@ struct x86_emulate_ctxt { u8 lock_prefix; u8 rep_prefix; /* bitmaps of registers in _regs[] that can be read */ - u32 regs_valid; + u16 regs_valid; /* bitmaps of registers in _regs[] that have been written */ - u32 regs_dirty; + u16 regs_dirty; /* modrm */ u8 modrm; u8 modrm_mod; @@ -363,7 +376,7 @@ struct x86_emulate_ctxt { struct operand src2; struct operand dst; struct operand memop; - unsigned long _regs[NR_VCPU_REGS]; + unsigned long _regs[NR_EMULATOR_GPRS]; struct operand *memopp; struct fetch_cache fetch; struct read_cache io_read; @@ -371,6 +384,15 @@ struct x86_emulate_ctxt { bool is_branch; }; +#define KVM_EMULATOR_BUG_ON(cond, ctxt) \ +({ \ + int __ret = (cond); \ + \ + if (WARN_ON_ONCE(__ret)) \ + ctxt->ops->vm_bugged(ctxt); \ + unlikely(__ret); \ +}) + /* Repeat String Operation Prefix */ #define REPE_PREFIX 0xf3 #define REPNE_PREFIX 0xf2 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0e68b4c937fc..e2ce3556915e 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -27,6 +27,7 @@ #include <linux/math64.h> #include <linux/slab.h> #include <asm/processor.h> +#include <asm/mce.h> #include <asm/msr.h> #include <asm/page.h> #include <asm/current.h> @@ -54,7 +55,7 @@ #define PRIo64 "o" /* 14 is the version for Xeon and Pentium 8.4.8*/ -#define APIC_VERSION (0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16)) +#define APIC_VERSION 0x14UL #define LAPIC_MMIO_LENGTH (1 << 12) /* followed define is not in apicdef.h */ #define MAX_APIC_VECTOR 256 @@ -67,6 +68,8 @@ static bool lapic_timer_advance_dynamic __read_mostly; #define LAPIC_TIMER_ADVANCE_NS_MAX 5000 /* step-by-step approximation to mitigate fluctuation */ #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 +static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data); +static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data); static inline void __kvm_lapic_set_reg(char *regs, int reg_off, u32 val) { @@ -398,14 +401,26 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val) return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI; } +static inline bool kvm_lapic_lvt_supported(struct kvm_lapic *apic, int lvt_index) +{ + return apic->nr_lvt_entries > lvt_index; +} + +static inline int kvm_apic_calc_nr_lvt_entries(struct kvm_vcpu *vcpu) +{ + return KVM_APIC_MAX_NR_LVT_ENTRIES - !(vcpu->arch.mcg_cap & MCG_CMCI_P); +} + void kvm_apic_set_version(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - u32 v = APIC_VERSION; + u32 v = 0; if (!lapic_in_kernel(vcpu)) return; + v = APIC_VERSION | ((apic->nr_lvt_entries - 1) << 16); + /* * KVM emulates 82093AA datasheet (with in-kernel IOAPIC implementation) * which doesn't have EOI register; Some buggy OSes (e.g. Windows with @@ -419,12 +434,33 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) kvm_lapic_set_reg(apic, APIC_LVR, v); } -static const unsigned int apic_lvt_mask[KVM_APIC_LVT_NUM] = { - LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */ - LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ - LVT_MASK | APIC_MODE_MASK, /* LVTPC */ - LINT_MASK, LINT_MASK, /* LVT0-1 */ - LVT_MASK /* LVTERR */ +void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu) +{ + int nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu); + struct kvm_lapic *apic = vcpu->arch.apic; + int i; + + if (!lapic_in_kernel(vcpu) || nr_lvt_entries == apic->nr_lvt_entries) + return; + + /* Initialize/mask any "new" LVT entries. */ + for (i = apic->nr_lvt_entries; i < nr_lvt_entries; i++) + kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED); + + apic->nr_lvt_entries = nr_lvt_entries; + + /* The number of LVT entries is reflected in the version register. */ + kvm_apic_set_version(vcpu); +} + +static const unsigned int apic_lvt_mask[KVM_APIC_MAX_NR_LVT_ENTRIES] = { + [LVT_TIMER] = LVT_MASK, /* timer mode mask added at runtime */ + [LVT_THERMAL_MONITOR] = LVT_MASK | APIC_MODE_MASK, + [LVT_PERFORMANCE_COUNTER] = LVT_MASK | APIC_MODE_MASK, + [LVT_LINT0] = LINT_MASK, + [LVT_LINT1] = LINT_MASK, + [LVT_ERROR] = LVT_MASK, + [LVT_CMCI] = LVT_MASK | APIC_MODE_MASK }; static int find_highest_vector(void *bitmap) @@ -518,14 +554,11 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic) static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) { - struct kvm_vcpu *vcpu; - - vcpu = apic->vcpu; - - if (unlikely(vcpu->arch.apicv_active)) { + if (unlikely(apic->apicv_active)) { /* need to update RVI */ kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR); - static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic)); + static_call_cond(kvm_x86_hwapic_irr_update)(apic->vcpu, + apic_find_highest_irr(apic)); } else { apic->irr_pending = false; kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR); @@ -542,20 +575,16 @@ EXPORT_SYMBOL_GPL(kvm_apic_clear_irr); static inline void apic_set_isr(int vec, struct kvm_lapic *apic) { - struct kvm_vcpu *vcpu; - if (__apic_test_and_set_vector(vec, apic->regs + APIC_ISR)) return; - vcpu = apic->vcpu; - /* * With APIC virtualization enabled, all caching is disabled * because the processor can modify ISR under the hood. Instead * just set SVI. */ - if (unlikely(vcpu->arch.apicv_active)) - static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, vec); + if (unlikely(apic->apicv_active)) + static_call_cond(kvm_x86_hwapic_isr_update)(vec); else { ++apic->isr_count; BUG_ON(apic->isr_count > MAX_APIC_VECTOR); @@ -589,12 +618,9 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic) static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) { - struct kvm_vcpu *vcpu; if (!__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR)) return; - vcpu = apic->vcpu; - /* * We do get here for APIC virtualization enabled if the guest * uses the Hyper-V APIC enlightenment. In this case we may need @@ -602,8 +628,8 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) * on the other hand isr_count and highest_isr_cache are unused * and must be left alone. */ - if (unlikely(vcpu->arch.apicv_active)) - static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); + if (unlikely(apic->apicv_active)) + static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic)); else { --apic->isr_count; BUG_ON(apic->isr_count < 0); @@ -801,17 +827,17 @@ static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda) if (kvm_apic_broadcast(apic, mda)) return true; - if (apic_x2apic_mode(apic)) - return mda == kvm_x2apic_id(apic); - /* - * Hotplug hack: Make LAPIC in xAPIC mode also accept interrupts as if - * it were in x2APIC mode. Hotplugged VCPUs start in xAPIC mode and - * this allows unique addressing of VCPUs with APIC ID over 0xff. - * The 0xff condition is needed because writeable xAPIC ID. + * Hotplug hack: Accept interrupts for vCPUs in xAPIC mode as if they + * were in x2APIC mode if the target APIC ID can't be encoded as an + * xAPIC ID. This allows unique addressing of hotplugged vCPUs (which + * start in xAPIC mode) with an APIC ID that is unaddressable in xAPIC + * mode. Match the x2APIC ID if and only if the target APIC ID can't + * be encoded in xAPIC to avoid spurious matches against a vCPU that + * changed its (addressable) xAPIC ID (which is writable). */ - if (kvm_x2apic_id(apic) > 0xff && mda == kvm_x2apic_id(apic)) - return true; + if (apic_x2apic_mode(apic) || mda > 0xff) + return mda == kvm_x2apic_id(apic); return mda == kvm_xapic_id(apic); } @@ -1325,7 +1351,7 @@ void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high) if (apic_x2apic_mode(apic)) irq.dest_id = icr_high; else - irq.dest_id = GET_APIC_DEST_FIELD(icr_high); + irq.dest_id = GET_XAPIC_DEST_FIELD(icr_high); trace_kvm_apic_ipi(icr_low, irq.dest_id); @@ -1444,6 +1470,9 @@ static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, APIC_REG_MASK(APIC_TMCCT) | APIC_REG_MASK(APIC_TDCR); + if (kvm_lapic_lvt_supported(apic, LVT_CMCI)) + valid_reg_mask |= APIC_REG_MASK(APIC_LVTCMCI); + /* * ARBPRI and ICR2 are not valid in x2APIC mode. WARN if KVM reads ICR * in x2APIC mode as it's an 8-byte register in x2APIC and needs to be @@ -1583,7 +1612,7 @@ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) int vec = reg & APIC_VECTOR_MASK; void *bitmap = apic->regs + APIC_ISR; - if (vcpu->arch.apicv_active) + if (apic->apicv_active) bitmap = apic->regs + APIC_IRR; if (apic_test_vector(vec, bitmap)) @@ -1602,7 +1631,7 @@ static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles) * that __delay() uses delay_tsc whenever the hardware has TSC, thus * always for VMX enabled hardware. */ - if (vcpu->arch.tsc_scaling_ratio == kvm_default_tsc_scaling_ratio) { + if (vcpu->arch.tsc_scaling_ratio == kvm_caps.default_tsc_scaling_ratio) { __delay(min(guest_cycles, nsec_to_cycles(vcpu, timer_advance_ns))); } else { @@ -1700,7 +1729,7 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn) if (apic_lvtt_tscdeadline(apic) || ktimer->hv_timer_in_use) ktimer->expired_tscdeadline = ktimer->tscdeadline; - if (!from_timer_fn && vcpu->arch.apicv_active) { + if (!from_timer_fn && apic->apicv_active) { WARN_ON(kvm_get_running_vcpu() != vcpu); kvm_apic_inject_pending_timer_irqs(apic); return; @@ -2052,6 +2081,16 @@ static void kvm_lapic_xapic_id_updated(struct kvm_lapic *apic) kvm_set_apicv_inhibit(apic->vcpu->kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED); } +static int get_lvt_index(u32 reg) +{ + if (reg == APIC_LVTCMCI) + return LVT_CMCI; + if (reg < APIC_LVTT || reg > APIC_LVTERR) + return -1; + return array_index_nospec( + (reg - APIC_LVTT) >> 4, KVM_APIC_MAX_NR_LVT_ENTRIES); +} + static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) { int ret = 0; @@ -2098,13 +2137,10 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) apic_set_spiv(apic, val & mask); if (!(val & APIC_SPIV_APIC_ENABLED)) { int i; - u32 lvt_val; - for (i = 0; i < KVM_APIC_LVT_NUM; i++) { - lvt_val = kvm_lapic_get_reg(apic, - APIC_LVTT + 0x10 * i); - kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, - lvt_val | APIC_LVT_MASKED); + for (i = 0; i < apic->nr_lvt_entries; i++) { + kvm_lapic_set_reg(apic, APIC_LVTx(i), + kvm_lapic_get_reg(apic, APIC_LVTx(i)) | APIC_LVT_MASKED); } apic_update_lvtt(apic); atomic_set(&apic->lapic_timer.pending, 0); @@ -2133,16 +2169,15 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_LVTTHMR: case APIC_LVTPC: case APIC_LVT1: - case APIC_LVTERR: { - /* TODO: Check vector */ - size_t size; - u32 index; - + case APIC_LVTERR: + case APIC_LVTCMCI: { + u32 index = get_lvt_index(reg); + if (!kvm_lapic_lvt_supported(apic, index)) { + ret = 1; + break; + } if (!kvm_apic_sw_enabled(apic)) val |= APIC_LVT_MASKED; - size = ARRAY_SIZE(apic_lvt_mask); - index = array_index_nospec( - (reg - APIC_LVTT) >> 4, size); val &= apic_lvt_mask[index]; kvm_lapic_set_reg(apic, reg, val); break; @@ -2246,10 +2281,26 @@ EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); /* emulate APIC access in a trap manner */ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) { - u32 val = kvm_lapic_get_reg(vcpu->arch.apic, offset); + struct kvm_lapic *apic = vcpu->arch.apic; + u64 val; + + if (apic_x2apic_mode(apic)) + kvm_lapic_msr_read(apic, offset, &val); + else + val = kvm_lapic_get_reg(apic, offset); - /* TODO: optimize to just emulate side effect w/o one more write */ - kvm_lapic_reg_write(vcpu->arch.apic, offset, val); + /* + * ICR is a single 64-bit register when x2APIC is enabled. For legacy + * xAPIC, ICR writes need to go down the common (slightly slower) path + * to get the upper half from ICR2. + */ + if (apic_x2apic_mode(apic) && offset == APIC_ICR) { + kvm_apic_send_ipi(apic, (u32)val, (u32)(val >> 32)); + trace_kvm_apic_write(APIC_ICR, val); + } else { + /* TODO: optimize to just emulate side effect w/o one more write */ + kvm_lapic_reg_write(apic, offset, (u32)val); + } } EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode); @@ -2344,8 +2395,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) if (((old_value ^ value) & X2APIC_ENABLE) && (value & X2APIC_ENABLE)) kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id); - if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) + if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) { + kvm_vcpu_update_apicv(vcpu); static_call_cond(kvm_x86_set_virtual_apic_mode)(vcpu); + } apic->base_address = apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_BASE; @@ -2361,7 +2414,7 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (vcpu->arch.apicv_active) { + if (apic->apicv_active) { /* irr_pending is always true when apicv is activated. */ apic->irr_pending = true; apic->isr_count = 1; @@ -2401,8 +2454,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); kvm_apic_set_version(apic->vcpu); - for (i = 0; i < KVM_APIC_LVT_NUM; i++) - kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); + for (i = 0; i < apic->nr_lvt_entries; i++) + kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED); apic_update_lvtt(apic); if (kvm_vcpu_is_reset_bsp(vcpu) && kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED)) @@ -2436,10 +2489,10 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.pv_eoi.msr_val = 0; apic_update_ppr(apic); - if (vcpu->arch.apicv_active) { + if (apic->apicv_active) { static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu); static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, -1); - static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, -1); + static_call_cond(kvm_x86_hwapic_isr_update)(-1); } vcpu->arch.apic_arb_prio = 0; @@ -2532,6 +2585,8 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) } apic->vcpu = vcpu; + apic->nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu); + hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); apic->lapic_timer.timer.function = apic_timer_fn; @@ -2716,10 +2771,10 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) kvm_lapic_set_reg(apic, APIC_TMCCT, 0); kvm_apic_update_apicv(vcpu); apic->highest_isr_cache = -1; - if (vcpu->arch.apicv_active) { + if (apic->apicv_active) { static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu); static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic)); - static_call_cond(kvm_x86_hwapic_isr_update)(vcpu, apic_find_highest_isr(apic)); + static_call_cond(kvm_x86_hwapic_isr_update)(apic_find_highest_isr(apic)); } kvm_make_request(KVM_REQ_EVENT, vcpu); if (ioapic_in_kernel(vcpu->kvm)) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 65bb2a8cf145..117a46df5cc1 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -10,7 +10,6 @@ #define KVM_APIC_INIT 0 #define KVM_APIC_SIPI 1 -#define KVM_APIC_LVT_NUM 6 #define APIC_SHORT_MASK 0xc0000 #define APIC_DEST_NOSHORT 0x0 @@ -29,6 +28,20 @@ enum lapic_mode { LAPIC_MODE_X2APIC = MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE, }; +enum lapic_lvt_entry { + LVT_TIMER, + LVT_THERMAL_MONITOR, + LVT_PERFORMANCE_COUNTER, + LVT_LINT0, + LVT_LINT1, + LVT_ERROR, + LVT_CMCI, + + KVM_APIC_MAX_NR_LVT_ENTRIES, +}; + +#define APIC_LVTx(x) ((x) == LVT_CMCI ? APIC_LVTCMCI : APIC_LVTT + 0x10 * (x)) + struct kvm_timer { struct hrtimer timer; s64 period; /* unit: ns */ @@ -48,6 +61,7 @@ struct kvm_lapic { struct kvm_timer lapic_timer; u32 divide_count; struct kvm_vcpu *vcpu; + bool apicv_active; bool sw_enabled; bool irr_pending; bool lvt0_in_nmi_mode; @@ -65,6 +79,7 @@ struct kvm_lapic { struct gfn_to_hva_cache vapic_cache; unsigned long pending_events; unsigned int sipi_vector; + int nr_lvt_entries; }; struct dest_map; @@ -84,6 +99,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); void kvm_recalculate_apic_map(struct kvm *kvm); void kvm_apic_set_version(struct kvm_vcpu *vcpu); +void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu); bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int shorthand, unsigned int dest, int dest_mode); int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); @@ -204,7 +220,7 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic) static inline bool kvm_vcpu_apicv_active(struct kvm_vcpu *vcpu) { - return vcpu->arch.apic && vcpu->arch.apicv_active; + return lapic_in_kernel(vcpu) && vcpu->arch.apic->apicv_active; } static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index f8192864b496..a99acec925eb 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -6,11 +6,6 @@ #include "kvm_cache_regs.h" #include "cpuid.h" -#define PT64_PT_BITS 9 -#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) -#define PT32_PT_BITS 10 -#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) - #define PT_WRITABLE_SHIFT 1 #define PT_USER_SHIFT 2 @@ -34,11 +29,6 @@ #define PT_DIR_PAT_SHIFT 12 #define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT) -#define PT32_DIR_PSE36_SIZE 4 -#define PT32_DIR_PSE36_SHIFT 13 -#define PT32_DIR_PSE36_MASK \ - (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) - #define PT64_ROOT_5LEVEL 5 #define PT64_ROOT_4LEVEL 4 #define PT32_ROOT_LEVEL 2 diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 17252f39bd7c..06ac8c7cef67 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -53,8 +53,6 @@ #include <asm/kvm_page_track.h> #include "trace.h" -#include "paging.h" - extern bool itlb_multihit_kvm_mitigation; int __read_mostly nx_huge_pages = -1; @@ -111,26 +109,6 @@ module_param(dbg, bool, 0644); #define PTE_PREFETCH_NUM 8 -#define PT32_LEVEL_BITS 10 - -#define PT32_LEVEL_SHIFT(level) \ - (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS) - -#define PT32_LVL_OFFSET_MASK(level) \ - (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT32_LEVEL_BITS))) - 1)) - -#define PT32_INDEX(address, level)\ - (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) - - -#define PT32_BASE_ADDR_MASK PAGE_MASK -#define PT32_DIR_BASE_ADDR_MASK \ - (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) -#define PT32_LVL_ADDR_MASK(level) \ - (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT32_LEVEL_BITS))) - 1)) - #include <trace/events/kvm.h> /* make pte_list_desc fit well in cache lines */ @@ -326,13 +304,6 @@ static int is_cpuid_PSE36(void) return 1; } -static gfn_t pse36_gfn_delta(u32 gpte) -{ - int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; - - return (gpte & PT32_DIR_PSE36_MASK) << shift; -} - #ifdef CONFIG_X86_64 static void __set_spte(u64 *sptep, u64 spte) { @@ -432,7 +403,7 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) * The idea using the light way get the spte on x86_32 guest is from * gup_get_pte (mm/gup.c). * - * An spte tlb flush may be pending, because kvm_set_pte_rmapp + * An spte tlb flush may be pending, because kvm_set_pte_rmap * coalesces them and we are running out of the MMU lock. Therefore * we need to protect against in-progress updates of the spte. * @@ -558,11 +529,12 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) * state bits, it is used to clear the last level sptep. * Returns the old PTE. */ -static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) +static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) { kvm_pfn_t pfn; u64 old_spte = *sptep; int level = sptep_to_sp(sptep)->role.level; + struct page *page; if (!is_shadow_present_pte(old_spte) || !spte_has_volatile_bits(old_spte)) @@ -578,11 +550,13 @@ static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) pfn = spte_to_pfn(old_spte); /* - * KVM does not hold the refcount of the page used by - * kvm mmu, before reclaiming the page, we should - * unmap it from mmu first. + * KVM doesn't hold a reference to any pages mapped into the guest, and + * instead uses the mmu_notifier to ensure that KVM unmaps any pages + * before they are reclaimed. Sanity check that, if the pfn is backed + * by a refcounted page, the refcount is elevated. */ - WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn))); + page = kvm_pfn_to_refcounted_page(pfn); + WARN_ON(page && !page_count(page)); if (is_accessed_spte(old_spte)) kvm_set_pfn_accessed(pfn); @@ -682,7 +656,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect) if (r) return r; if (maybe_indirect) { - r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_gfn_array_cache, + r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadowed_info_cache, PT64_ROOT_MAX_LEVEL); if (r) return r; @@ -695,48 +669,79 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) { kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache); kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache); - kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache); + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache); kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); } -static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) -{ - return kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache); -} - static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) { kmem_cache_free(pte_list_desc_cache, pte_list_desc); } +static bool sp_has_gptes(struct kvm_mmu_page *sp); + static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) { if (sp->role.passthrough) return sp->gfn; if (!sp->role.direct) - return sp->gfns[index]; + return sp->shadowed_translation[index] >> PAGE_SHIFT; - return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS)); + return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS)); } -static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) +/* + * For leaf SPTEs, fetch the *guest* access permissions being shadowed. Note + * that the SPTE itself may have a more constrained access permissions that + * what the guest enforces. For example, a guest may create an executable + * huge PTE but KVM may disallow execution to mitigate iTLB multihit. + */ +static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index) { - if (sp->role.passthrough) { - WARN_ON_ONCE(gfn != sp->gfn); - return; - } + if (sp_has_gptes(sp)) + return sp->shadowed_translation[index] & ACC_ALL; - if (!sp->role.direct) { - sp->gfns[index] = gfn; + /* + * For direct MMUs (e.g. TDP or non-paging guests) or passthrough SPs, + * KVM is not shadowing any guest page tables, so the "guest access + * permissions" are just ACC_ALL. + * + * For direct SPs in indirect MMUs (shadow paging), i.e. when KVM + * is shadowing a guest huge page with small pages, the guest access + * permissions being shadowed are the access permissions of the huge + * page. + * + * In both cases, sp->role.access contains the correct access bits. + */ + return sp->role.access; +} + +static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index, + gfn_t gfn, unsigned int access) +{ + if (sp_has_gptes(sp)) { + sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access; return; } - if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index))) - pr_err_ratelimited("gfn mismatch under direct page %llx " - "(expected %llx, got %llx)\n", - sp->gfn, - kvm_mmu_page_get_gfn(sp, index), gfn); + WARN_ONCE(access != kvm_mmu_page_get_access(sp, index), + "access mismatch under %s page %llx (expected %u, got %u)\n", + sp->role.passthrough ? "passthrough" : "direct", + sp->gfn, kvm_mmu_page_get_access(sp, index), access); + + WARN_ONCE(gfn != kvm_mmu_page_get_gfn(sp, index), + "gfn mismatch under %s page %llx (expected %llx, got %llx)\n", + sp->role.passthrough ? "passthrough" : "direct", + sp->gfn, kvm_mmu_page_get_gfn(sp, index), gfn); +} + +static void kvm_mmu_page_set_access(struct kvm_mmu_page *sp, int index, + unsigned int access) +{ + gfn_t gfn = kvm_mmu_page_get_gfn(sp, index); + + kvm_mmu_page_set_translation(sp, index, gfn, access); } /* @@ -792,6 +797,9 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) KVM_PAGE_TRACK_WRITE); kvm_mmu_gfn_disallow_lpage(slot, gfn); + + if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K)) + kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); } void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) @@ -855,7 +863,7 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, /* * Returns the number of pointers in the rmap chain, not counting the new one. */ -static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, +static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; @@ -866,7 +874,7 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, rmap_head->val = (unsigned long)spte; } else if (!(rmap_head->val & 1)) { rmap_printk("%p %llx 1->many\n", spte, *spte); - desc = mmu_alloc_pte_list_desc(vcpu); + desc = kvm_mmu_memory_cache_alloc(cache); desc->sptes[0] = (u64 *)rmap_head->val; desc->sptes[1] = spte; desc->spte_count = 2; @@ -878,7 +886,7 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, while (desc->spte_count == PTE_LIST_EXT) { count += PTE_LIST_EXT; if (!desc->more) { - desc->more = mmu_alloc_pte_list_desc(vcpu); + desc->more = kvm_mmu_memory_cache_alloc(cache); desc = desc->more; desc->spte_count = 0; break; @@ -913,7 +921,7 @@ pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, mmu_free_pte_list_desc(desc); } -static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) +static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; struct pte_list_desc *prev_desc; @@ -949,15 +957,16 @@ static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) } } -static void pte_list_remove(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - u64 *sptep) +static void kvm_zap_one_rmap_spte(struct kvm *kvm, + struct kvm_rmap_head *rmap_head, u64 *sptep) { mmu_spte_clear_track_bits(kvm, sptep); - __pte_list_remove(sptep, rmap_head); + pte_list_remove(sptep, rmap_head); } -/* Return true if rmap existed, false otherwise */ -static bool pte_list_destroy(struct kvm *kvm, struct kvm_rmap_head *rmap_head) +/* Return true if at least one SPTE was zapped, false otherwise */ +static bool kvm_zap_all_rmap_sptes(struct kvm *kvm, + struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc, *next; int i; @@ -1030,7 +1039,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) struct kvm_rmap_head *rmap_head; sp = sptep_to_sp(spte); - gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); + gfn = kvm_mmu_page_get_gfn(sp, spte_index(spte)); /* * Unlike rmap_add, rmap_remove does not run in the context of a vCPU @@ -1042,7 +1051,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) slot = __gfn_to_memslot(slots, gfn); rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); - __pte_list_remove(spte, rmap_head); + pte_list_remove(spte, rmap_head); } /* @@ -1129,26 +1138,18 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) rmap_remove(kvm, sptep); } - -static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) +static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush) { - if (is_large_pte(*sptep)) { - WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K); - drop_spte(kvm, sptep); - return true; - } + struct kvm_mmu_page *sp; - return false; -} + sp = sptep_to_sp(sptep); + WARN_ON(sp->role.level == PG_LEVEL_4K); -static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) -{ - if (__drop_large_spte(vcpu->kvm, sptep)) { - struct kvm_mmu_page *sp = sptep_to_sp(sptep); + drop_spte(kvm, sptep); - kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, + if (flush) + kvm_flush_remote_tlbs_with_address(kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); - } } /* @@ -1383,22 +1384,22 @@ static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn) return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K); } -static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - const struct kvm_memory_slot *slot) +static bool __kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + const struct kvm_memory_slot *slot) { - return pte_list_destroy(kvm, rmap_head); + return kvm_zap_all_rmap_sptes(kvm, rmap_head); } -static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - pte_t unused) +static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, int level, + pte_t unused) { - return kvm_zap_rmapp(kvm, rmap_head, slot); + return __kvm_zap_rmap(kvm, rmap_head, slot); } -static bool kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - pte_t pte) +static bool kvm_set_pte_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, int level, + pte_t pte) { u64 *sptep; struct rmap_iterator iter; @@ -1417,7 +1418,7 @@ restart: need_flush = true; if (pte_write(pte)) { - pte_list_remove(kvm, rmap_head, sptep); + kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); goto restart; } else { new_spte = kvm_mmu_changed_pte_notifier_make_spte( @@ -1529,7 +1530,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) bool flush = false; if (kvm_memslots_have_rmaps(kvm)) - flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp); + flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap); if (is_tdp_mmu_enabled(kvm)) flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); @@ -1542,7 +1543,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) bool flush = false; if (kvm_memslots_have_rmaps(kvm)) - flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp); + flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmap); if (is_tdp_mmu_enabled(kvm)) flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range); @@ -1550,9 +1551,9 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) return flush; } -static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - pte_t unused) +static bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, int level, + pte_t unused) { u64 *sptep; struct rmap_iterator iter; @@ -1564,9 +1565,9 @@ static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, return young; } -static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot, gfn_t gfn, - int level, pte_t unused) +static bool kvm_test_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + struct kvm_memory_slot *slot, gfn_t gfn, + int level, pte_t unused) { u64 *sptep; struct rmap_iterator iter; @@ -1579,31 +1580,43 @@ static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, #define RMAP_RECYCLE_THRESHOLD 1000 -static void rmap_add(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, - u64 *spte, gfn_t gfn) +static void __rmap_add(struct kvm *kvm, + struct kvm_mmu_memory_cache *cache, + const struct kvm_memory_slot *slot, + u64 *spte, gfn_t gfn, unsigned int access) { struct kvm_mmu_page *sp; struct kvm_rmap_head *rmap_head; int rmap_count; sp = sptep_to_sp(spte); - kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); + kvm_mmu_page_set_translation(sp, spte_index(spte), gfn, access); + kvm_update_page_stats(kvm, sp->role.level, 1); + rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); - rmap_count = pte_list_add(vcpu, spte, rmap_head); + rmap_count = pte_list_add(cache, spte, rmap_head); if (rmap_count > RMAP_RECYCLE_THRESHOLD) { - kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0)); + kvm_zap_all_rmap_sptes(kvm, rmap_head); kvm_flush_remote_tlbs_with_address( - vcpu->kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); + kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); } } +static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot, + u64 *spte, gfn_t gfn, unsigned int access) +{ + struct kvm_mmu_memory_cache *cache = &vcpu->arch.mmu_pte_list_desc_cache; + + __rmap_add(vcpu->kvm, cache, slot, spte, gfn, access); +} + bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { bool young = false; if (kvm_memslots_have_rmaps(kvm)) - young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp); + young = kvm_handle_gfn_range(kvm, range, kvm_age_rmap); if (is_tdp_mmu_enabled(kvm)) young |= kvm_tdp_mmu_age_gfn_range(kvm, range); @@ -1616,7 +1629,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) bool young = false; if (kvm_memslots_have_rmaps(kvm)) - young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp); + young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmap); if (is_tdp_mmu_enabled(kvm)) young |= kvm_tdp_mmu_test_age_gfn(kvm, range); @@ -1652,14 +1665,14 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr) percpu_counter_add(&kvm_total_used_mmu_pages, nr); } -static void kvm_mmu_free_page(struct kvm_mmu_page *sp) +static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp) { MMU_WARN_ON(!is_empty_shadow_page(sp->spt)); hlist_del(&sp->hash_link); list_del(&sp->link); free_page((unsigned long)sp->spt); if (!sp->role.direct) - free_page((unsigned long)sp->gfns); + free_page((unsigned long)sp->shadowed_translation); kmem_cache_free(mmu_page_header_cache, sp); } @@ -1668,19 +1681,19 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn) return hash_64(gfn, KVM_MMU_HASH_SHIFT); } -static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, +static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache, struct kvm_mmu_page *sp, u64 *parent_pte) { if (!parent_pte) return; - pte_list_add(vcpu, parent_pte, &sp->parent_ptes); + pte_list_add(cache, parent_pte, &sp->parent_ptes); } static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, u64 *parent_pte) { - __pte_list_remove(parent_pte, &sp->parent_ptes); + pte_list_remove(parent_pte, &sp->parent_ptes); } static void drop_parent_pte(struct kvm_mmu_page *sp, @@ -1690,27 +1703,6 @@ static void drop_parent_pte(struct kvm_mmu_page *sp, mmu_spte_clear_no_track(parent_pte); } -static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct) -{ - struct kvm_mmu_page *sp; - - sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); - sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); - if (!direct) - sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache); - set_page_private(virt_to_page(sp->spt), (unsigned long)sp); - - /* - * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages() - * depends on valid pages being added to the head of the list. See - * comments in kvm_zap_obsolete_pages(). - */ - sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; - list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); - kvm_mod_used_mmu_pages(vcpu->kvm, +1); - return sp; -} - static void mark_unsync(u64 *spte); static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) { @@ -1725,11 +1717,9 @@ static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) static void mark_unsync(u64 *spte) { struct kvm_mmu_page *sp; - unsigned int index; sp = sptep_to_sp(spte); - index = spte - sp->spt; - if (__test_and_set_bit(index, sp->unsync_child_bitmap)) + if (__test_and_set_bit(spte_index(spte), sp->unsync_child_bitmap)) return; if (sp->unsync_children++) return; @@ -1789,7 +1779,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, continue; } - child = to_shadow_page(ent & PT64_BASE_ADDR_MASK); + child = to_shadow_page(ent & SPTE_BASE_ADDR_MASK); if (child->unsync_children) { if (mmu_pages_add(pvec, child, i)) @@ -2019,36 +2009,24 @@ static void clear_sp_write_flooding_count(u64 *spte) __clear_sp_write_flooding_count(sptep_to_sp(spte)); } -static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, - gfn_t gfn, - gva_t gaddr, - unsigned level, - int direct, - unsigned int access) +/* + * The vCPU is required when finding indirect shadow pages; the shadow + * page may already exist and syncing it needs the vCPU pointer in + * order to read guest page tables. Direct shadow pages are never + * unsync, thus @vcpu can be NULL if @role.direct is true. + */ +static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm, + struct kvm_vcpu *vcpu, + gfn_t gfn, + struct hlist_head *sp_list, + union kvm_mmu_page_role role) { - bool direct_mmu = vcpu->arch.mmu->root_role.direct; - union kvm_mmu_page_role role; - struct hlist_head *sp_list; - unsigned quadrant; struct kvm_mmu_page *sp; int ret; int collisions = 0; LIST_HEAD(invalid_list); - role = vcpu->arch.mmu->root_role; - role.level = level; - role.direct = direct; - role.access = access; - if (role.has_4_byte_gpte) { - quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); - quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; - role.quadrant = quadrant; - } - if (level <= vcpu->arch.mmu->cpu_role.base.level) - role.passthrough = 0; - - sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]; - for_each_valid_sp(vcpu->kvm, sp, sp_list) { + for_each_valid_sp(kvm, sp, sp_list) { if (sp->gfn != gfn) { collisions++; continue; @@ -2064,16 +2042,20 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, * Unsync pages must not be left as is, because the new * upper-level page will be write-protected. */ - if (level > PG_LEVEL_4K && sp->unsync) - kvm_mmu_prepare_zap_page(vcpu->kvm, sp, + if (role.level > PG_LEVEL_4K && sp->unsync) + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); continue; } - if (direct_mmu) - goto trace_get_page; + /* unsync and write-flooding only apply to indirect SPs. */ + if (sp->role.direct) + goto out; if (sp->unsync) { + if (KVM_BUG_ON(!vcpu, kvm)) + break; + /* * The page is good, but is stale. kvm_sync_page does * get the latest guest state, but (unlike mmu_unsync_children) @@ -2092,37 +2074,160 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, WARN_ON(!list_empty(&invalid_list)); if (ret > 0) - kvm_flush_remote_tlbs(vcpu->kvm); + kvm_flush_remote_tlbs(kvm); } __clear_sp_write_flooding_count(sp); -trace_get_page: - trace_kvm_mmu_get_page(sp, false); goto out; } - ++vcpu->kvm->stat.mmu_cache_miss; + sp = NULL; + ++kvm->stat.mmu_cache_miss; + +out: + kvm_mmu_commit_zap_page(kvm, &invalid_list); + + if (collisions > kvm->stat.max_mmu_page_hash_collisions) + kvm->stat.max_mmu_page_hash_collisions = collisions; + return sp; +} + +/* Caches used when allocating a new shadow page. */ +struct shadow_page_caches { + struct kvm_mmu_memory_cache *page_header_cache; + struct kvm_mmu_memory_cache *shadow_page_cache; + struct kvm_mmu_memory_cache *shadowed_info_cache; +}; + +static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm, + struct shadow_page_caches *caches, + gfn_t gfn, + struct hlist_head *sp_list, + union kvm_mmu_page_role role) +{ + struct kvm_mmu_page *sp; + + sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache); + sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache); + if (!role.direct) + sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache); + + set_page_private(virt_to_page(sp->spt), (unsigned long)sp); - sp = kvm_mmu_alloc_page(vcpu, direct); + /* + * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages() + * depends on valid pages being added to the head of the list. See + * comments in kvm_zap_obsolete_pages(). + */ + sp->mmu_valid_gen = kvm->arch.mmu_valid_gen; + list_add(&sp->link, &kvm->arch.active_mmu_pages); + kvm_mod_used_mmu_pages(kvm, +1); sp->gfn = gfn; sp->role = role; hlist_add_head(&sp->hash_link, sp_list); - if (sp_has_gptes(sp)) { - account_shadowed(vcpu->kvm, sp); - if (level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn)) - kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1); + if (sp_has_gptes(sp)) + account_shadowed(kvm, sp); + + return sp; +} + +/* Note, @vcpu may be NULL if @role.direct is true; see kvm_mmu_find_shadow_page. */ +static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm, + struct kvm_vcpu *vcpu, + struct shadow_page_caches *caches, + gfn_t gfn, + union kvm_mmu_page_role role) +{ + struct hlist_head *sp_list; + struct kvm_mmu_page *sp; + bool created = false; + + sp_list = &kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]; + + sp = kvm_mmu_find_shadow_page(kvm, vcpu, gfn, sp_list, role); + if (!sp) { + created = true; + sp = kvm_mmu_alloc_shadow_page(kvm, caches, gfn, sp_list, role); } - trace_kvm_mmu_get_page(sp, true); -out: - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions) - vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions; + trace_kvm_mmu_get_page(sp, created); return sp; } +static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu, + gfn_t gfn, + union kvm_mmu_page_role role) +{ + struct shadow_page_caches caches = { + .page_header_cache = &vcpu->arch.mmu_page_header_cache, + .shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache, + .shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache, + }; + + return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role); +} + +static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct, + unsigned int access) +{ + struct kvm_mmu_page *parent_sp = sptep_to_sp(sptep); + union kvm_mmu_page_role role; + + role = parent_sp->role; + role.level--; + role.access = access; + role.direct = direct; + role.passthrough = 0; + + /* + * If the guest has 4-byte PTEs then that means it's using 32-bit, + * 2-level, non-PAE paging. KVM shadows such guests with PAE paging + * (i.e. 8-byte PTEs). The difference in PTE size means that KVM must + * shadow each guest page table with multiple shadow page tables, which + * requires extra bookkeeping in the role. + * + * Specifically, to shadow the guest's page directory (which covers a + * 4GiB address space), KVM uses 4 PAE page directories, each mapping + * 1GiB of the address space. @role.quadrant encodes which quarter of + * the address space each maps. + * + * To shadow the guest's page tables (which each map a 4MiB region), KVM + * uses 2 PAE page tables, each mapping a 2MiB region. For these, + * @role.quadrant encodes which half of the region they map. + * + * Concretely, a 4-byte PDE consumes bits 31:22, while an 8-byte PDE + * consumes bits 29:21. To consume bits 31:30, KVM's uses 4 shadow + * PDPTEs; those 4 PAE page directories are pre-allocated and their + * quadrant is assigned in mmu_alloc_root(). A 4-byte PTE consumes + * bits 21:12, while an 8-byte PTE consumes bits 20:12. To consume + * bit 21 in the PTE (the child here), KVM propagates that bit to the + * quadrant, i.e. sets quadrant to '0' or '1'. The parent 8-byte PDE + * covers bit 21 (see above), thus the quadrant is calculated from the + * _least_ significant bit of the PDE index. + */ + if (role.has_4_byte_gpte) { + WARN_ON_ONCE(role.level != PG_LEVEL_4K); + role.quadrant = spte_index(sptep) & 1; + } + + return role; +} + +static struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu, + u64 *sptep, gfn_t gfn, + bool direct, unsigned int access) +{ + union kvm_mmu_page_role role; + + if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) + return ERR_PTR(-EEXIST); + + role = kvm_mmu_child_role(sptep, direct, access); + return kvm_mmu_get_shadow_page(vcpu, gfn, role); +} + static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator, struct kvm_vcpu *vcpu, hpa_t root, u64 addr) @@ -2145,7 +2250,7 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato iterator->shadow_addr = vcpu->arch.mmu->pae_root[(addr >> 30) & 3]; - iterator->shadow_addr &= PT64_BASE_ADDR_MASK; + iterator->shadow_addr &= SPTE_BASE_ADDR_MASK; --iterator->level; if (!iterator->shadow_addr) iterator->level = 0; @@ -2164,7 +2269,7 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) if (iterator->level < PG_LEVEL_4K) return false; - iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); + iterator->index = SPTE_INDEX(iterator->addr, iterator->level); iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; return true; } @@ -2177,7 +2282,7 @@ static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator, return; } - iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK; + iterator->shadow_addr = spte & SPTE_BASE_ADDR_MASK; --iterator->level; } @@ -2186,23 +2291,38 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) __shadow_walk_next(iterator, *iterator->sptep); } -static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, - struct kvm_mmu_page *sp) +static void __link_shadow_page(struct kvm *kvm, + struct kvm_mmu_memory_cache *cache, u64 *sptep, + struct kvm_mmu_page *sp, bool flush) { u64 spte; BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); + /* + * If an SPTE is present already, it must be a leaf and therefore + * a large one. Drop it, and flush the TLB if needed, before + * installing sp. + */ + if (is_shadow_present_pte(*sptep)) + drop_large_spte(kvm, sptep, flush); + spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp)); mmu_spte_set(sptep, spte); - mmu_page_add_parent_pte(vcpu, sp, sptep); + mmu_page_add_parent_pte(cache, sp, sptep); if (sp->unsync_children || sp->unsync) mark_unsync(sptep); } +static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, + struct kvm_mmu_page *sp) +{ + __link_shadow_page(vcpu->kvm, &vcpu->arch.mmu_pte_list_desc_cache, sptep, sp, true); +} + static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned direct_access) { @@ -2216,7 +2336,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, * so we should update the spte at this point to get * a new sp with the correct access. */ - child = to_shadow_page(*sptep & PT64_BASE_ADDR_MASK); + child = to_shadow_page(*sptep & SPTE_BASE_ADDR_MASK); if (child->role.access == direct_access) return; @@ -2237,7 +2357,7 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, if (is_last_spte(pte, sp->role.level)) { drop_spte(kvm, spte); } else { - child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); + child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK); drop_parent_pte(child, spte); /* @@ -2263,7 +2383,7 @@ static int kvm_mmu_page_unlink_children(struct kvm *kvm, int zapped = 0; unsigned i; - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) + for (i = 0; i < SPTE_ENT_PER_PAGE; ++i) zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list); return zapped; @@ -2396,7 +2516,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, list_for_each_entry_safe(sp, nsp, invalid_list, link) { WARN_ON(!sp->role.invalid || sp->root_count); - kvm_mmu_free_page(sp); + kvm_mmu_free_shadow_page(sp); } } @@ -2676,7 +2796,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, struct kvm_mmu_page *child; u64 pte = *sptep; - child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); + child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK); drop_parent_pte(child, sptep); flush = true; } else if (pfn != spte_to_pfn(*sptep)) { @@ -2711,8 +2831,10 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, if (!was_rmapped) { WARN_ON_ONCE(ret == RET_PF_SPURIOUS); - kvm_update_page_stats(vcpu->kvm, level, 1); - rmap_add(vcpu, slot, sptep, gfn); + rmap_add(vcpu, slot, sptep, gfn, pte_access); + } else { + /* Already rmapped but the pte_access bits may have changed. */ + kvm_mmu_page_set_access(sp, spte_index(sptep), pte_access); } return ret; @@ -2728,7 +2850,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, int i, ret; gfn_t gfn; - gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); + gfn = kvm_mmu_page_get_gfn(sp, spte_index(start)); slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK); if (!slot) return -1; @@ -2754,7 +2876,7 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, WARN_ON(!sp->role.direct); - i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); + i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1); spte = sp->spt + i; for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { @@ -2798,20 +2920,42 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) __direct_pte_prefetch(vcpu, sp, sptep); } -static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, +/* + * Lookup the mapping level for @gfn in the current mm. + * + * WARNING! Use of host_pfn_mapping_level() requires the caller and the end + * consumer to be tied into KVM's handlers for MMU notifier events! + * + * There are several ways to safely use this helper: + * + * - Check mmu_notifier_retry_hva() after grabbing the mapping level, before + * consuming it. In this case, mmu_lock doesn't need to be held during the + * lookup, but it does need to be held while checking the MMU notifier. + * + * - Hold mmu_lock AND ensure there is no in-progress MMU notifier invalidation + * event for the hva. This can be done by explicit checking the MMU notifier + * or by ensuring that KVM already has a valid mapping that covers the hva. + * + * - Do not use the result to install new mappings, e.g. use the host mapping + * level only to decide whether or not to zap an entry. In this case, it's + * not required to hold mmu_lock (though it's highly likely the caller will + * want to hold mmu_lock anyways, e.g. to modify SPTEs). + * + * Note! The lookup can still race with modifications to host page tables, but + * the above "rules" ensure KVM will not _consume_ the result of the walk if a + * race with the primary MMU occurs. + */ +static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, const struct kvm_memory_slot *slot) { + int level = PG_LEVEL_4K; unsigned long hva; unsigned long flags; - int level = PG_LEVEL_4K; pgd_t pgd; p4d_t p4d; pud_t pud; pmd_t pmd; - if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn)) - return PG_LEVEL_4K; - /* * Note, using the already-retrieved memslot and __gfn_to_hva_memslot() * is not solely for performance, it's also necessary to avoid the @@ -2823,16 +2967,19 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, hva = __gfn_to_hva_memslot(slot, gfn); /* - * Lookup the mapping level in the current mm. The information - * may become stale soon, but it is safe to use as long as - * 1) mmu_notifier_retry was checked after taking mmu_lock, and - * 2) mmu_lock is taken now. - * - * We still need to disable IRQs to prevent concurrent tear down - * of page tables. + * Disable IRQs to prevent concurrent tear down of host page tables, + * e.g. if the primary MMU promotes a P*D to a huge page and then frees + * the original page table. */ local_irq_save(flags); + /* + * Read each entry once. As above, a non-leaf entry can be promoted to + * a huge page _during_ this walk. Re-reading the entry could send the + * walk into the weeks, e.g. p*d_large() returns false (sees the old + * value) and then p*d_offset() walks into the target huge page instead + * of the old page table (sees the new value). + */ pgd = READ_ONCE(*pgd_offset(kvm->mm, hva)); if (pgd_none(pgd)) goto out; @@ -2864,7 +3011,7 @@ out: int kvm_mmu_max_mapping_level(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn, - kvm_pfn_t pfn, int max_level) + int max_level) { struct kvm_lpage_info *linfo; int host_level; @@ -2879,7 +3026,7 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, if (max_level == PG_LEVEL_4K) return PG_LEVEL_4K; - host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot); + host_level = host_pfn_mapping_level(kvm, gfn, slot); return min(host_level, max_level); } @@ -2893,7 +3040,7 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (unlikely(fault->max_level == PG_LEVEL_4K)) return; - if (is_error_noslot_pfn(fault->pfn) || kvm_is_reserved_pfn(fault->pfn)) + if (is_error_noslot_pfn(fault->pfn)) return; if (kvm_slot_dirty_track_enabled(slot)) @@ -2904,8 +3051,7 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault * level, which will be used to do precise, accurate accounting. */ fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, - fault->gfn, fault->pfn, - fault->max_level); + fault->gfn, fault->max_level); if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed) return; @@ -2961,13 +3107,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) if (it.level == fault->goal_level) break; - drop_large_spte(vcpu, it.sptep); - if (is_shadow_present_pte(*it.sptep)) + sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_ALL); + if (sp == ERR_PTR(-EEXIST)) continue; - sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr, - it.level - 1, true, ACC_ALL); - link_shadow_page(vcpu, it.sptep, sp); if (fault->is_tdp && fault->huge_page_disallowed && fault->req_level >= it.level) @@ -3095,7 +3238,7 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, * * Compare with set_spte where instead shadow_dirty_mask is set. */ - if (cmpxchg64(sptep, old_spte, new_spte) != old_spte) + if (!try_cmpxchg64(sptep, &old_spte, new_spte)) return false; if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) @@ -3265,7 +3408,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, if (!VALID_PAGE(*root_hpa)) return; - sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK); + sp = to_shadow_page(*root_hpa & SPTE_BASE_ADDR_MASK); if (WARN_ON(!sp)) return; @@ -3369,12 +3512,19 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) return ret; } -static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva, - u8 level, bool direct) +static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant, + u8 level) { + union kvm_mmu_page_role role = vcpu->arch.mmu->root_role; struct kvm_mmu_page *sp; - sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL); + role.level = level; + role.quadrant = quadrant; + + WARN_ON_ONCE(quadrant && !role.has_4_byte_gpte); + WARN_ON_ONCE(role.direct && role.has_4_byte_gpte); + + sp = kvm_mmu_get_shadow_page(vcpu, gfn, role); ++sp->root_count; return __pa(sp->spt); @@ -3397,7 +3547,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu); mmu->root.hpa = root; } else if (shadow_root_level >= PT64_ROOT_4LEVEL) { - root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true); + root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level); mmu->root.hpa = root; } else if (shadow_root_level == PT32E_ROOT_LEVEL) { if (WARN_ON_ONCE(!mmu->pae_root)) { @@ -3408,8 +3558,8 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) for (i = 0; i < 4; ++i) { WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i])); - root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), - i << 30, PT32_ROOT_LEVEL, true); + root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), 0, + PT32_ROOT_LEVEL); mmu->pae_root[i] = root | PT_PRESENT_MASK | shadow_me_value; } @@ -3493,9 +3643,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) struct kvm_mmu *mmu = vcpu->arch.mmu; u64 pdptrs[4], pm_mask; gfn_t root_gfn, root_pgd; + int quadrant, i, r; hpa_t root; - unsigned i; - int r; root_pgd = mmu->get_guest_pgd(vcpu); root_gfn = root_pgd >> PAGE_SHIFT; @@ -3533,7 +3682,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) */ if (mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) { root = mmu_alloc_root(vcpu, root_gfn, 0, - mmu->root_role.level, false); + mmu->root_role.level); mmu->root.hpa = root; goto set_root_pgd; } @@ -3578,8 +3727,15 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) root_gfn = pdptrs[i] >> PAGE_SHIFT; } - root = mmu_alloc_root(vcpu, root_gfn, i << 30, - PT32_ROOT_LEVEL, false); + /* + * If shadowing 32-bit non-PAE page tables, each PAE page + * directory maps one quarter of the guest's non-PAE page + * directory. Othwerise each PAE page direct shadows one guest + * PAE page directory so that quadrant should be 0. + */ + quadrant = (mmu->cpu_role.base.level == PT32_ROOT_LEVEL) ? i : 0; + + root = mmu_alloc_root(vcpu, root_gfn, quadrant, PT32_ROOT_LEVEL); mmu->pae_root[i] = root | pm_mask; } @@ -3737,7 +3893,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) hpa_t root = vcpu->arch.mmu->pae_root[i]; if (IS_VALID_PAE_ROOT(root)) { - root &= PT64_BASE_ADDR_MASK; + root &= SPTE_BASE_ADDR_MASK; sp = to_shadow_page(root); mmu_sync_children(vcpu, sp, true); } @@ -4155,14 +4311,26 @@ EXPORT_SYMBOL_GPL(kvm_handle_page_fault); int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - while (fault->max_level > PG_LEVEL_4K) { - int page_num = KVM_PAGES_PER_HPAGE(fault->max_level); - gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1); - - if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num)) - break; + /* + * If the guest's MTRRs may be used to compute the "real" memtype, + * restrict the mapping level to ensure KVM uses a consistent memtype + * across the entire mapping. If the host MTRRs are ignored by TDP + * (shadow_memtype_mask is non-zero), and the VM has non-coherent DMA + * (DMA doesn't snoop CPU caches), KVM's ABI is to honor the memtype + * from the guest's MTRRs so that guest accesses to memory that is + * DMA'd aren't cached against the guest's wishes. + * + * Note, KVM may still ultimately ignore guest MTRRs for certain PFNs, + * e.g. KVM will force UC memtype for host MMIO. + */ + if (shadow_memtype_mask && kvm_arch_has_noncoherent_dma(vcpu->kvm)) { + for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) { + int page_num = KVM_PAGES_PER_HPAGE(fault->max_level); + gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1); - --fault->max_level; + if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num)) + break; + } } return direct_page_fault(vcpu, fault); @@ -4567,7 +4735,7 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context) if (boot_cpu_is_amd()) __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), - context->root_role.level, false, + context->root_role.level, true, boot_cpu_has(X86_FEATURE_GBPAGES), false, true); else @@ -5199,11 +5367,11 @@ static bool need_remote_flush(u64 old, u64 new) return false; if (!is_shadow_present_pte(new)) return true; - if ((old ^ new) & PT64_BASE_ADDR_MASK) + if ((old ^ new) & SPTE_BASE_ADDR_MASK) return true; old ^= shadow_nx_mask; new ^= shadow_nx_mask; - return (old & ~new & PT64_PERM_MASK) != 0; + return (old & ~new & SPTE_PERM_MASK) != 0; } static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, @@ -5328,13 +5496,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); - /* - * No need to care whether allocation memory is successful - * or not since pte prefetch is skipped if it does not have - * enough objects in the cache. - */ - mmu_topup_memory_caches(vcpu, true); - write_lock(&vcpu->kvm->mmu_lock); gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes); @@ -5819,9 +5980,25 @@ int kvm_mmu_init_vm(struct kvm *kvm) node->track_write = kvm_mmu_pte_write; node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; kvm_page_track_register_notifier(kvm, node); + + kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache; + kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO; + + kvm->arch.split_shadow_page_cache.gfp_zero = __GFP_ZERO; + + kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache; + kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO; + return 0; } +static void mmu_free_vm_memory_caches(struct kvm *kvm) +{ + kvm_mmu_free_memory_cache(&kvm->arch.split_desc_cache); + kvm_mmu_free_memory_cache(&kvm->arch.split_page_header_cache); + kvm_mmu_free_memory_cache(&kvm->arch.split_shadow_page_cache); +} + void kvm_mmu_uninit_vm(struct kvm *kvm) { struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; @@ -5829,9 +6006,11 @@ void kvm_mmu_uninit_vm(struct kvm *kvm) kvm_page_track_unregister_notifier(kvm, node); kvm_mmu_uninit_tdp_mmu(kvm); + + mmu_free_vm_memory_caches(kvm); } -static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) +static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { const struct kvm_memory_slot *memslot; struct kvm_memslots *slots; @@ -5853,8 +6032,7 @@ static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) if (WARN_ON_ONCE(start >= end)) continue; - flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, - + flush = slot_handle_level_range(kvm, memslot, __kvm_zap_rmap, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, start, end - 1, true, flush); } @@ -5879,7 +6057,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) kvm_inc_notifier_count(kvm, gfn_start, gfn_end); - flush = __kvm_zap_rmaps(kvm, gfn_start, gfn_end); + flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end); if (is_tdp_mmu_enabled(kvm)) { for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) @@ -5950,15 +6128,249 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); } +static inline bool need_topup(struct kvm_mmu_memory_cache *cache, int min) +{ + return kvm_mmu_memory_cache_nr_free_objects(cache) < min; +} + +static bool need_topup_split_caches_or_resched(struct kvm *kvm) +{ + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) + return true; + + /* + * In the worst case, SPLIT_DESC_CACHE_MIN_NR_OBJECTS descriptors are needed + * to split a single huge page. Calculating how many are actually needed + * is possible but not worth the complexity. + */ + return need_topup(&kvm->arch.split_desc_cache, SPLIT_DESC_CACHE_MIN_NR_OBJECTS) || + need_topup(&kvm->arch.split_page_header_cache, 1) || + need_topup(&kvm->arch.split_shadow_page_cache, 1); +} + +static int topup_split_caches(struct kvm *kvm) +{ + /* + * Allocating rmap list entries when splitting huge pages for nested + * MMUs is uncommon as KVM needs to use a list if and only if there is + * more than one rmap entry for a gfn, i.e. requires an L1 gfn to be + * aliased by multiple L2 gfns and/or from multiple nested roots with + * different roles. Aliasing gfns when using TDP is atypical for VMMs; + * a few gfns are often aliased during boot, e.g. when remapping BIOS, + * but aliasing rarely occurs post-boot or for many gfns. If there is + * only one rmap entry, rmap->val points directly at that one entry and + * doesn't need to allocate a list. Buffer the cache by the default + * capacity so that KVM doesn't have to drop mmu_lock to topup if KVM + * encounters an aliased gfn or two. + */ + const int capacity = SPLIT_DESC_CACHE_MIN_NR_OBJECTS + + KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE; + int r; + + lockdep_assert_held(&kvm->slots_lock); + + r = __kvm_mmu_topup_memory_cache(&kvm->arch.split_desc_cache, capacity, + SPLIT_DESC_CACHE_MIN_NR_OBJECTS); + if (r) + return r; + + r = kvm_mmu_topup_memory_cache(&kvm->arch.split_page_header_cache, 1); + if (r) + return r; + + return kvm_mmu_topup_memory_cache(&kvm->arch.split_shadow_page_cache, 1); +} + +static struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm, u64 *huge_sptep) +{ + struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep); + struct shadow_page_caches caches = {}; + union kvm_mmu_page_role role; + unsigned int access; + gfn_t gfn; + + gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep)); + access = kvm_mmu_page_get_access(huge_sp, spte_index(huge_sptep)); + + /* + * Note, huge page splitting always uses direct shadow pages, regardless + * of whether the huge page itself is mapped by a direct or indirect + * shadow page, since the huge page region itself is being directly + * mapped with smaller pages. + */ + role = kvm_mmu_child_role(huge_sptep, /*direct=*/true, access); + + /* Direct SPs do not require a shadowed_info_cache. */ + caches.page_header_cache = &kvm->arch.split_page_header_cache; + caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache; + + /* Safe to pass NULL for vCPU since requesting a direct SP. */ + return __kvm_mmu_get_shadow_page(kvm, NULL, &caches, gfn, role); +} + +static void shadow_mmu_split_huge_page(struct kvm *kvm, + const struct kvm_memory_slot *slot, + u64 *huge_sptep) + +{ + struct kvm_mmu_memory_cache *cache = &kvm->arch.split_desc_cache; + u64 huge_spte = READ_ONCE(*huge_sptep); + struct kvm_mmu_page *sp; + bool flush = false; + u64 *sptep, spte; + gfn_t gfn; + int index; + + sp = shadow_mmu_get_sp_for_split(kvm, huge_sptep); + + for (index = 0; index < SPTE_ENT_PER_PAGE; index++) { + sptep = &sp->spt[index]; + gfn = kvm_mmu_page_get_gfn(sp, index); + + /* + * The SP may already have populated SPTEs, e.g. if this huge + * page is aliased by multiple sptes with the same access + * permissions. These entries are guaranteed to map the same + * gfn-to-pfn translation since the SP is direct, so no need to + * modify them. + * + * However, if a given SPTE points to a lower level page table, + * that lower level page table may only be partially populated. + * Installing such SPTEs would effectively unmap a potion of the + * huge page. Unmapping guest memory always requires a TLB flush + * since a subsequent operation on the unmapped regions would + * fail to detect the need to flush. + */ + if (is_shadow_present_pte(*sptep)) { + flush |= !is_last_spte(*sptep, sp->role.level); + continue; + } + + spte = make_huge_page_split_spte(kvm, huge_spte, sp->role, index); + mmu_spte_set(sptep, spte); + __rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access); + } + + __link_shadow_page(kvm, cache, huge_sptep, sp, flush); +} + +static int shadow_mmu_try_split_huge_page(struct kvm *kvm, + const struct kvm_memory_slot *slot, + u64 *huge_sptep) +{ + struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep); + int level, r = 0; + gfn_t gfn; + u64 spte; + + /* Grab information for the tracepoint before dropping the MMU lock. */ + gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep)); + level = huge_sp->role.level; + spte = *huge_sptep; + + if (kvm_mmu_available_pages(kvm) <= KVM_MIN_FREE_MMU_PAGES) { + r = -ENOSPC; + goto out; + } + + if (need_topup_split_caches_or_resched(kvm)) { + write_unlock(&kvm->mmu_lock); + cond_resched(); + /* + * If the topup succeeds, return -EAGAIN to indicate that the + * rmap iterator should be restarted because the MMU lock was + * dropped. + */ + r = topup_split_caches(kvm) ?: -EAGAIN; + write_lock(&kvm->mmu_lock); + goto out; + } + + shadow_mmu_split_huge_page(kvm, slot, huge_sptep); + +out: + trace_kvm_mmu_split_huge_page(gfn, spte, level, r); + return r; +} + +static bool shadow_mmu_try_split_huge_pages(struct kvm *kvm, + struct kvm_rmap_head *rmap_head, + const struct kvm_memory_slot *slot) +{ + struct rmap_iterator iter; + struct kvm_mmu_page *sp; + u64 *huge_sptep; + int r; + +restart: + for_each_rmap_spte(rmap_head, &iter, huge_sptep) { + sp = sptep_to_sp(huge_sptep); + + /* TDP MMU is enabled, so rmap only contains nested MMU SPs. */ + if (WARN_ON_ONCE(!sp->role.guest_mode)) + continue; + + /* The rmaps should never contain non-leaf SPTEs. */ + if (WARN_ON_ONCE(!is_large_pte(*huge_sptep))) + continue; + + /* SPs with level >PG_LEVEL_4K should never by unsync. */ + if (WARN_ON_ONCE(sp->unsync)) + continue; + + /* Don't bother splitting huge pages on invalid SPs. */ + if (sp->role.invalid) + continue; + + r = shadow_mmu_try_split_huge_page(kvm, slot, huge_sptep); + + /* + * The split succeeded or needs to be retried because the MMU + * lock was dropped. Either way, restart the iterator to get it + * back into a consistent state. + */ + if (!r || r == -EAGAIN) + goto restart; + + /* The split failed and shouldn't be retried (e.g. -ENOMEM). */ + break; + } + + return false; +} + +static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm, + const struct kvm_memory_slot *slot, + gfn_t start, gfn_t end, + int target_level) +{ + int level; + + /* + * Split huge pages starting with KVM_MAX_HUGEPAGE_LEVEL and working + * down to the target level. This ensures pages are recursively split + * all the way to the target level. There's no need to split pages + * already at the target level. + */ + for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--) { + slot_handle_level_range(kvm, slot, shadow_mmu_try_split_huge_pages, + level, level, start, end - 1, true, false); + } +} + /* Must be called with the mmu_lock held in write-mode. */ void kvm_mmu_try_split_huge_pages(struct kvm *kvm, const struct kvm_memory_slot *memslot, u64 start, u64 end, int target_level) { - if (is_tdp_mmu_enabled(kvm)) - kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, - target_level, false); + if (!is_tdp_mmu_enabled(kvm)) + return; + + if (kvm_memslots_have_rmaps(kvm)) + kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level); + + kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, false); /* * A TLB flush is unnecessary at this point for the same resons as in @@ -5973,12 +6385,19 @@ void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm, u64 start = memslot->base_gfn; u64 end = start + memslot->npages; - if (is_tdp_mmu_enabled(kvm)) { - read_lock(&kvm->mmu_lock); - kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true); - read_unlock(&kvm->mmu_lock); + if (!is_tdp_mmu_enabled(kvm)) + return; + + if (kvm_memslots_have_rmaps(kvm)) { + write_lock(&kvm->mmu_lock); + kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level); + write_unlock(&kvm->mmu_lock); } + read_lock(&kvm->mmu_lock); + kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true); + read_unlock(&kvm->mmu_lock); + /* * No TLB flush is necessary here. KVM will flush TLBs after * write-protecting and/or clearing dirty on the newly split SPTEs to @@ -5997,13 +6416,11 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, u64 *sptep; struct rmap_iterator iter; int need_tlb_flush = 0; - kvm_pfn_t pfn; struct kvm_mmu_page *sp; restart: for_each_rmap_spte(rmap_head, &iter, sptep) { sp = sptep_to_sp(sptep); - pfn = spte_to_pfn(*sptep); /* * We cannot do huge page mapping for indirect shadow pages, @@ -6012,10 +6429,10 @@ restart: * the guest, and the guest page table is using 4K page size * mapping if the indirect sp has level = 1. */ - if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && + if (sp->role.direct && sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, - pfn, PG_LEVEL_NUM)) { - pte_list_remove(kvm, rmap_head, sptep); + PG_LEVEL_NUM)) { + kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); if (kvm_available_flush_tlb_with_range()) kvm_flush_remote_tlbs_with_address(kvm, sp->gfn, @@ -6030,18 +6447,24 @@ restart: return need_tlb_flush; } +static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm, + const struct kvm_memory_slot *slot) +{ + /* + * Note, use KVM_MAX_HUGEPAGE_LEVEL - 1 since there's no need to zap + * pages that are already mapped at the maximum hugepage level. + */ + if (slot_handle_level(kvm, slot, kvm_mmu_zap_collapsible_spte, + PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true)) + kvm_arch_flush_remote_tlbs_memslot(kvm, slot); +} + void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *slot) { if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); - /* - * Zap only 4k SPTEs since the legacy MMU only supports dirty - * logging at a 4k granularity and never creates collapsible - * 2m SPTEs during dirty logging. - */ - if (slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true)) - kvm_arch_flush_remote_tlbs_memslot(kvm, slot); + kvm_rmap_zap_collapsible_sptes(kvm, slot); write_unlock(&kvm->mmu_lock); } @@ -6317,7 +6740,7 @@ int kvm_mmu_vendor_module_init(void) if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL)) goto out; - ret = register_shrinker(&mmu_shrinker); + ret = register_shrinker(&mmu_shrinker, "x86-mmu"); if (ret) goto out; diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index bd2a26897b97..582def531d4d 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -20,6 +20,20 @@ extern bool dbg; #define MMU_WARN_ON(x) do { } while (0) #endif +/* Page table builder macros common to shadow (host) PTEs and guest PTEs. */ +#define __PT_LEVEL_SHIFT(level, bits_per_level) \ + (PAGE_SHIFT + ((level) - 1) * (bits_per_level)) +#define __PT_INDEX(address, level, bits_per_level) \ + (((address) >> __PT_LEVEL_SHIFT(level, bits_per_level)) & ((1 << (bits_per_level)) - 1)) + +#define __PT_LVL_ADDR_MASK(base_addr_mask, level, bits_per_level) \ + ((base_addr_mask) & ~((1ULL << (PAGE_SHIFT + (((level) - 1) * (bits_per_level)))) - 1)) + +#define __PT_LVL_OFFSET_MASK(base_addr_mask, level, bits_per_level) \ + ((base_addr_mask) & ((1ULL << (PAGE_SHIFT + (((level) - 1) * (bits_per_level)))) - 1)) + +#define __PT_ENT_PER_PAGE(bits_per_level) (1 << (bits_per_level)) + /* * Unlike regular MMU roots, PAE "roots", a.k.a. PDPTEs/PDPTRs, have a PRESENT * bit, and thus are guaranteed to be non-zero when valid. And, when a guest @@ -53,8 +67,21 @@ struct kvm_mmu_page { gfn_t gfn; u64 *spt; - /* hold the gfn of each spte inside spt */ - gfn_t *gfns; + + /* + * Stores the result of the guest translation being shadowed by each + * SPTE. KVM shadows two types of guest translations: nGPA -> GPA + * (shadow EPT/NPT) and GVA -> GPA (traditional shadow paging). In both + * cases the result of the translation is a GPA and a set of access + * constraints. + * + * The GFN is stored in the upper bits (PAGE_SHIFT) and the shadowed + * access permissions are stored in the lower bits. Note, for + * convenience and uniformity across guests, the access permissions are + * stored in KVM format (e.g. ACC_EXEC_MASK) not the raw guest format. + */ + u64 *shadowed_translation; + /* Currently serving as active root */ union { int root_count; @@ -141,9 +168,9 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, unsigned int pte_list_count(struct kvm_rmap_head *rmap_head); extern int nx_huge_pages; -static inline bool is_nx_huge_page_enabled(void) +static inline bool is_nx_huge_page_enabled(struct kvm *kvm) { - return READ_ONCE(nx_huge_pages); + return READ_ONCE(nx_huge_pages) && !kvm->arch.disable_nx_huge_pages; } struct kvm_page_fault { @@ -242,7 +269,8 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, .user = err & PFERR_USER_MASK, .prefetch = prefetch, .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault), - .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(), + .nx_huge_page_workaround_enabled = + is_nx_huge_page_enabled(vcpu->kvm), .max_level = KVM_MAX_HUGEPAGE_LEVEL, .req_level = PG_LEVEL_4K, @@ -281,7 +309,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int kvm_mmu_max_mapping_level(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn, - kvm_pfn_t pfn, int max_level); + int max_level); void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level); diff --git a/arch/x86/kvm/mmu/paging.h b/arch/x86/kvm/mmu/paging.h deleted file mode 100644 index de8ab323bb70..000000000000 --- a/arch/x86/kvm/mmu/paging.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* Shadow paging constants/helpers that don't need to be #undef'd. */ -#ifndef __KVM_X86_PAGING_H -#define __KVM_X86_PAGING_H - -#define GUEST_PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) -#define PT64_LVL_ADDR_MASK(level) \ - (GUEST_PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT64_LEVEL_BITS))) - 1)) -#define PT64_LVL_OFFSET_MASK(level) \ - (GUEST_PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT64_LEVEL_BITS))) - 1)) -#endif /* __KVM_X86_PAGING_H */ - diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index db80f7ccaa4e..f5958071220c 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -16,25 +16,21 @@ */ /* - * We need the mmu code to access both 32-bit and 64-bit guest ptes, - * so the code in this file is compiled twice, once per pte size. + * The MMU needs to be able to access/walk 32-bit and 64-bit guest page tables, + * as well as guest EPT tables, so the code in this file is compiled thrice, + * once per guest PTE type. The per-type defines are #undef'd at the end. */ #if PTTYPE == 64 #define pt_element_t u64 #define guest_walker guest_walker64 #define FNAME(name) paging##64_##name - #define PT_BASE_ADDR_MASK GUEST_PT64_BASE_ADDR_MASK - #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) - #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) - #define PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define PT_LEVEL_BITS PT64_LEVEL_BITS + #define PT_LEVEL_BITS 9 #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #define PT_HAVE_ACCESSED_DIRTY(mmu) true #ifdef CONFIG_X86_64 #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL - #define CMPXCHG "cmpxchgq" #else #define PT_MAX_FULL_LEVELS 2 #endif @@ -42,36 +38,35 @@ #define pt_element_t u32 #define guest_walker guest_walker32 #define FNAME(name) paging##32_##name - #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK - #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl) - #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl) - #define PT_INDEX(addr, level) PT32_INDEX(addr, level) - #define PT_LEVEL_BITS PT32_LEVEL_BITS + #define PT_LEVEL_BITS 10 #define PT_MAX_FULL_LEVELS 2 #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #define PT_HAVE_ACCESSED_DIRTY(mmu) true - #define CMPXCHG "cmpxchgl" + + #define PT32_DIR_PSE36_SIZE 4 + #define PT32_DIR_PSE36_SHIFT 13 + #define PT32_DIR_PSE36_MASK \ + (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) #elif PTTYPE == PTTYPE_EPT #define pt_element_t u64 #define guest_walker guest_walkerEPT #define FNAME(name) ept_##name - #define PT_BASE_ADDR_MASK GUEST_PT64_BASE_ADDR_MASK - #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) - #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) - #define PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define PT_LEVEL_BITS PT64_LEVEL_BITS + #define PT_LEVEL_BITS 9 #define PT_GUEST_DIRTY_SHIFT 9 #define PT_GUEST_ACCESSED_SHIFT 8 #define PT_HAVE_ACCESSED_DIRTY(mmu) (!(mmu)->cpu_role.base.ad_disabled) - #ifdef CONFIG_X86_64 - #define CMPXCHG "cmpxchgq" - #endif #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL #else #error Invalid PTTYPE value #endif +/* Common logic, but per-type values. These also need to be undefined. */ +#define PT_BASE_ADDR_MASK ((pt_element_t)(((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))) +#define PT_LVL_ADDR_MASK(lvl) __PT_LVL_ADDR_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS) +#define PT_LVL_OFFSET_MASK(lvl) __PT_LVL_OFFSET_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS) +#define PT_INDEX(addr, lvl) __PT_INDEX(addr, lvl, PT_LEVEL_BITS) + #define PT_GUEST_DIRTY_MASK (1 << PT_GUEST_DIRTY_SHIFT) #define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT) @@ -97,6 +92,15 @@ struct guest_walker { struct x86_exception fault; }; +#if PTTYPE == 32 +static inline gfn_t pse36_gfn_delta(u32 gpte) +{ + int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; + + return (gpte & PT32_DIR_PSE36_MASK) << shift; +} +#endif + static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) { return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; @@ -374,7 +378,7 @@ retry_walk: * information to fix the exit_qualification or exit_info_1 * fields. */ - if (unlikely(real_gpa == UNMAPPED_GVA)) + if (unlikely(real_gpa == INVALID_GPA)) return 0; host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gpa_to_gfn(real_gpa), @@ -421,11 +425,13 @@ retry_walk: gfn = gpte_to_gfn_lvl(pte, walker->level); gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT; - if (PTTYPE == 32 && walker->level > PG_LEVEL_4K && is_cpuid_PSE36()) +#if PTTYPE == 32 + if (walker->level > PG_LEVEL_4K && is_cpuid_PSE36()) gfn += pse36_gfn_delta(pte); +#endif real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), access, &walker->fault); - if (real_gpa == UNMAPPED_GVA) + if (real_gpa == INVALID_GPA) return 0; walker->gfn = real_gpa >> PAGE_SHIFT; @@ -589,7 +595,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, if (sp->role.direct) return __direct_pte_prefetch(vcpu, sp, sptep); - i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); + i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1); spte = sp->spt + i; for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { @@ -642,14 +648,13 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, gfn_t table_gfn; clear_sp_write_flooding_count(it.sptep); - drop_large_spte(vcpu, it.sptep); - - sp = NULL; - if (!is_shadow_present_pte(*it.sptep)) { - table_gfn = gw->table_gfn[it.level - 2]; - access = gw->pt_access[it.level - 2]; - sp = kvm_mmu_get_page(vcpu, table_gfn, fault->addr, - it.level-1, false, access); + + table_gfn = gw->table_gfn[it.level - 2]; + access = gw->pt_access[it.level - 2]; + sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn, + false, access); + + if (sp != ERR_PTR(-EEXIST)) { /* * We must synchronize the pagetable before linking it * because the guest doesn't need to flush tlb when @@ -678,7 +683,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, if (FNAME(gpte_changed)(vcpu, gw, it.level - 1)) goto out_gpte_changed; - if (sp) + if (sp != ERR_PTR(-EEXIST)) link_shadow_page(vcpu, it.sptep, sp); } @@ -702,16 +707,15 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, validate_direct_spte(vcpu, it.sptep, direct_access); - drop_large_spte(vcpu, it.sptep); + sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, + true, direct_access); + if (sp == ERR_PTR(-EEXIST)) + continue; - if (!is_shadow_present_pte(*it.sptep)) { - sp = kvm_mmu_get_page(vcpu, base_gfn, fault->addr, - it.level - 1, true, direct_access); - link_shadow_page(vcpu, it.sptep, sp); - if (fault->huge_page_disallowed && - fault->req_level >= it.level) - account_huge_nx_page(vcpu->kvm, sp); - } + link_shadow_page(vcpu, it.sptep, sp); + if (fault->huge_page_disallowed && + fault->req_level >= it.level) + account_huge_nx_page(vcpu->kvm, sp); } if (WARN_ON_ONCE(it.level != fault->goal_level)) @@ -888,7 +892,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) WARN_ON(sp->role.level != PG_LEVEL_4K); if (PTTYPE == 32) - offset = sp->role.quadrant << PT64_LEVEL_BITS; + offset = sp->role.quadrant << SPTE_LEVEL_BITS; return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t); } @@ -929,7 +933,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) break; pte_gpa = FNAME(get_level1_sp_gpa)(sp); - pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); + pte_gpa += spte_index(sptep) * sizeof(pt_element_t); mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL); if (is_shadow_present_pte(old_spte)) @@ -958,7 +962,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, struct x86_exception *exception) { struct guest_walker walker; - gpa_t gpa = UNMAPPED_GVA; + gpa_t gpa = INVALID_GPA; int r; #ifndef CONFIG_X86_64 @@ -978,7 +982,8 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, } /* - * Using the cached information from sp->gfns is safe because: + * Using the information in sp->shadowed_translation (kvm_mmu_page_get_gfn()) is + * safe because: * - The spte has a reference to the struct page, so the pfn for a given gfn * can't change unless all sptes pointing to it are nuked first. * @@ -1023,7 +1028,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) first_pte_gpa = FNAME(get_level1_sp_gpa)(sp); - for (i = 0; i < PT64_ENT_PER_PAGE; i++) { + for (i = 0; i < SPTE_ENT_PER_PAGE; i++) { u64 *sptep, spte; struct kvm_memory_slot *slot; unsigned pte_access; @@ -1053,12 +1058,23 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access)) continue; - if (gfn != sp->gfns[i]) { + /* + * Drop the SPTE if the new protections would result in a RWX=0 + * SPTE or if the gfn is changing. The RWX=0 case only affects + * EPT with execute-only support, i.e. EPT without an effective + * "present" bit, as all other paging modes will create a + * read-only SPTE if pte_access is zero. + */ + if ((!pte_access && !shadow_present_mask) || + gfn != kvm_mmu_page_get_gfn(sp, i)) { drop_spte(vcpu->kvm, &sp->spt[i]); flush = true; continue; } + /* Update the shadowed access bits in case they changed. */ + kvm_mmu_page_set_access(sp, i, pte_access); + sptep = &sp->spt[i]; spte = *sptep; host_writable = spte & shadow_host_writable_mask; @@ -1070,6 +1086,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) flush |= mmu_spte_update(sptep, spte); } + /* + * Note, any flush is purely for KVM's correctness, e.g. when dropping + * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier + * unmap or dirty logging event doesn't fail to flush. The guest is + * responsible for flushing the TLB to ensure any changes in protection + * bits are recognized, i.e. until the guest flushes or page faults on + * a relevant address, KVM is architecturally allowed to let vCPUs use + * cached translations with the old protection bits. + */ return flush; } @@ -1084,7 +1109,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) #undef PT_MAX_FULL_LEVELS #undef gpte_to_gfn #undef gpte_to_gfn_lvl -#undef CMPXCHG #undef PT_GUEST_ACCESSED_MASK #undef PT_GUEST_DIRTY_MASK #undef PT_GUEST_DIRTY_SHIFT diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index b5960bbde7f7..7314d27d57a4 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -33,6 +33,7 @@ u64 __read_mostly shadow_mmio_value; u64 __read_mostly shadow_mmio_mask; u64 __read_mostly shadow_mmio_access_mask; u64 __read_mostly shadow_present_mask; +u64 __read_mostly shadow_memtype_mask; u64 __read_mostly shadow_me_value; u64 __read_mostly shadow_me_mask; u64 __read_mostly shadow_acc_track_mask; @@ -129,6 +130,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 spte = SPTE_MMU_PRESENT_MASK; bool wrprot = false; + WARN_ON_ONCE(!pte_access && !shadow_present_mask); + if (sp->role.ad_disabled) spte |= SPTE_TDP_AD_DISABLED_MASK; else if (kvm_mmu_page_ad_need_write_protect(sp)) @@ -145,7 +148,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, spte |= spte_shadow_accessed_mask(spte); if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) && - is_nx_huge_page_enabled()) { + is_nx_huge_page_enabled(vcpu->kvm)) { pte_access &= ~ACC_EXEC_MASK; } @@ -159,10 +162,10 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (level > PG_LEVEL_4K) spte |= PT_PAGE_SIZE_MASK; - if (tdp_enabled) - spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn, - kvm_is_mmio_pfn(pfn)); + if (shadow_memtype_mask) + spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn, + kvm_is_mmio_pfn(pfn)); if (host_writable) spte |= shadow_host_writable_mask; else @@ -244,10 +247,10 @@ static u64 make_spte_executable(u64 spte) * This is used during huge page splitting to build the SPTEs that make up the * new page table. */ -u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index) +u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, union kvm_mmu_page_role role, + int index) { u64 child_spte; - int child_level; if (WARN_ON_ONCE(!is_shadow_present_pte(huge_spte))) return 0; @@ -256,23 +259,23 @@ u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index) return 0; child_spte = huge_spte; - child_level = huge_level - 1; /* * The child_spte already has the base address of the huge page being * split. So we just have to OR in the offset to the page at the next * lower level for the given index. */ - child_spte |= (index * KVM_PAGES_PER_HPAGE(child_level)) << PAGE_SHIFT; + child_spte |= (index * KVM_PAGES_PER_HPAGE(role.level)) << PAGE_SHIFT; - if (child_level == PG_LEVEL_4K) { + if (role.level == PG_LEVEL_4K) { child_spte &= ~PT_PAGE_SIZE_MASK; /* - * When splitting to a 4K page, mark the page executable as the - * NX hugepage mitigation no longer applies. + * When splitting to a 4K page where execution is allowed, mark + * the page executable as the NX hugepage mitigation no longer + * applies. */ - if (is_nx_huge_page_enabled()) + if ((role.access & ACC_EXEC_MASK) && is_nx_huge_page_enabled(kvm)) child_spte = make_spte_executable(child_spte); } @@ -299,7 +302,7 @@ u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn) { u64 new_spte; - new_spte = old_spte & ~PT64_BASE_ADDR_MASK; + new_spte = old_spte & ~SPTE_BASE_ADDR_MASK; new_spte |= (u64)new_pfn << PAGE_SHIFT; new_spte &= ~PT_WRITABLE_MASK; @@ -389,6 +392,13 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) shadow_nx_mask = 0ull; shadow_x_mask = VMX_EPT_EXECUTABLE_MASK; shadow_present_mask = has_exec_only ? 0ull : VMX_EPT_READABLE_MASK; + /* + * EPT overrides the host MTRRs, and so KVM must program the desired + * memtype directly into the SPTEs. Note, this mask is just the mask + * of all bits that factor into the memtype, the actual memtype must be + * dynamically calculated, e.g. to ensure host MMIO is mapped UC. + */ + shadow_memtype_mask = VMX_EPT_MT_MASK | VMX_EPT_IPAT_BIT; shadow_acc_track_mask = VMX_EPT_RWX_MASK; shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE; shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE; @@ -439,6 +449,13 @@ void kvm_mmu_reset_all_pte_masks(void) shadow_nx_mask = PT64_NX_MASK; shadow_x_mask = 0; shadow_present_mask = PT_PRESENT_MASK; + + /* + * For shadow paging and NPT, KVM uses PAT entry '0' to encode WB + * memtype in the SPTEs, i.e. relies on host MTRRs to provide the + * correct memtype (WB is the "weakest" memtype). + */ + shadow_memtype_mask = 0; shadow_acc_track_mask = 0; shadow_me_mask = 0; shadow_me_value = 0; diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 0127bb6e3c7d..cabe3fbb4f39 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -36,12 +36,12 @@ extern bool __read_mostly enable_mmio_caching; static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK -#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) +#define SPTE_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) #else -#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) +#define SPTE_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) #endif -#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ +#define SPTE_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ | shadow_x_mask | shadow_nx_mask | shadow_me_mask) #define ACC_EXEC_MASK 1 @@ -50,17 +50,13 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) /* The mask for the R/X bits in EPT PTEs */ -#define PT64_EPT_READABLE_MASK 0x1ull -#define PT64_EPT_EXECUTABLE_MASK 0x4ull +#define SPTE_EPT_READABLE_MASK 0x1ull +#define SPTE_EPT_EXECUTABLE_MASK 0x4ull -#define PT64_LEVEL_BITS 9 - -#define PT64_LEVEL_SHIFT(level) \ - (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) - -#define PT64_INDEX(address, level)\ - (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) -#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) +#define SPTE_LEVEL_BITS 9 +#define SPTE_LEVEL_SHIFT(level) __PT_LEVEL_SHIFT(level, SPTE_LEVEL_BITS) +#define SPTE_INDEX(address, level) __PT_INDEX(address, level, SPTE_LEVEL_BITS) +#define SPTE_ENT_PER_PAGE __PT_ENT_PER_PAGE(SPTE_LEVEL_BITS) /* * The mask/shift to use for saving the original R/X bits when marking the PTE @@ -69,8 +65,8 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); * restored only when a write is attempted to the page. This mask obviously * must not overlap the A/D type mask. */ -#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (PT64_EPT_READABLE_MASK | \ - PT64_EPT_EXECUTABLE_MASK) +#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (SPTE_EPT_READABLE_MASK | \ + SPTE_EPT_EXECUTABLE_MASK) #define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 54 #define SHADOW_ACC_TRACK_SAVED_MASK (SHADOW_ACC_TRACK_SAVED_BITS_MASK << \ SHADOW_ACC_TRACK_SAVED_BITS_SHIFT) @@ -151,6 +147,7 @@ extern u64 __read_mostly shadow_mmio_value; extern u64 __read_mostly shadow_mmio_mask; extern u64 __read_mostly shadow_mmio_access_mask; extern u64 __read_mostly shadow_present_mask; +extern u64 __read_mostly shadow_memtype_mask; extern u64 __read_mostly shadow_me_value; extern u64 __read_mostly shadow_me_mask; @@ -194,6 +191,12 @@ static inline bool is_removed_spte(u64 spte) return spte == REMOVED_SPTE; } +/* Get an SPTE's index into its parent's page table (and the spt array). */ +static inline int spte_index(u64 *sptep) +{ + return ((unsigned long)sptep / sizeof(*sptep)) & (SPTE_ENT_PER_PAGE - 1); +} + /* * In some cases, we need to preserve the GFN of a non-present or reserved * SPTE when we usurp the upper five bits of the physical address space to @@ -282,7 +285,7 @@ static inline bool is_executable_pte(u64 spte) static inline kvm_pfn_t spte_to_pfn(u64 pte) { - return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; + return (pte & SPTE_BASE_ADDR_MASK) >> PAGE_SHIFT; } static inline bool is_accessed_spte(u64 spte) @@ -425,7 +428,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool prefetch, bool can_unsync, bool host_writable, u64 *new_spte); -u64 make_huge_page_split_spte(u64 huge_spte, int huge_level, int index); +u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, + union kvm_mmu_page_role role, int index); u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access); u64 mark_spte_for_access_track(u64 spte); diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index ee4802d7b36c..39b48e7d7d1a 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -11,7 +11,7 @@ static void tdp_iter_refresh_sptep(struct tdp_iter *iter) { iter->sptep = iter->pt_path[iter->level - 1] + - SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level); + SPTE_INDEX(iter->gfn << PAGE_SHIFT, iter->level); iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep); } @@ -116,8 +116,8 @@ static bool try_step_side(struct tdp_iter *iter) * Check if the iterator is already at the end of the current page * table. */ - if (SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level) == - (PT64_ENT_PER_PAGE - 1)) + if (SPTE_INDEX(iter->gfn << PAGE_SHIFT, iter->level) == + (SPTE_ENT_PER_PAGE - 1)) return false; iter->gfn += KVM_PAGES_PER_HPAGE(iter->level); @@ -146,15 +146,6 @@ static bool try_step_up(struct tdp_iter *iter) } /* - * Step the iterator back up a level in the paging structure. Should only be - * used when the iterator is below the root level. - */ -void tdp_iter_step_up(struct tdp_iter *iter) -{ - WARN_ON(!try_step_up(iter)); -} - -/* * Step to the next SPTE in a pre-order traversal of the paging structure. * To get to the next SPTE, the iterator either steps down towards the goal * GFN, if at a present, non-last-level SPTE, or over to a SPTE mapping a diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index adfca0cf94d3..f0af385c56e0 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -114,6 +114,5 @@ void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root, int min_level, gfn_t next_last_level_gfn); void tdp_iter_next(struct tdp_iter *iter); void tdp_iter_restart(struct tdp_iter *iter); -void tdp_iter_step_up(struct tdp_iter *iter); #endif /* __KVM_X86_MMU_TDP_ITER_H */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 7b9265d67131..bf2ccf9debca 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -425,7 +425,7 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared) tdp_mmu_unlink_sp(kvm, sp, shared); - for (i = 0; i < PT64_ENT_PER_PAGE; i++) { + for (i = 0; i < SPTE_ENT_PER_PAGE; i++) { tdp_ptep_t sptep = pt + i; gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level); u64 old_spte; @@ -633,7 +633,6 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm, u64 new_spte) { u64 *sptep = rcu_dereference(iter->sptep); - u64 old_spte; /* * The caller is responsible for ensuring the old SPTE is not a REMOVED @@ -649,17 +648,8 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm, * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and * does not hold the mmu_lock. */ - old_spte = cmpxchg64(sptep, iter->old_spte, new_spte); - if (old_spte != iter->old_spte) { - /* - * The page table entry was modified by a different logical - * CPU. Refresh iter->old_spte with the current value so the - * caller operates on fresh data, e.g. if it retries - * tdp_mmu_set_spte_atomic(). - */ - iter->old_spte = old_spte; + if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte)) return -EBUSY; - } __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte, new_spte, iter->level, true); @@ -934,9 +924,6 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) } /* - * Zap leafs SPTEs for the range of gfns, [start, end). Returns true if SPTEs - * have been cleared and a TLB flush is needed before releasing the MMU lock. - * * If can_yield is true, will release the MMU lock and reschedule if the * scheduler needs the CPU or there is contention on the MMU lock. If this * function cannot yield, it will not release the MMU lock or reschedule and @@ -979,10 +966,9 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root, } /* - * Tears down the mappings for the range of gfns, [start, end), and frees the - * non-root pages mapping GFNs strictly within that range. Returns true if - * SPTEs have been cleared and a TLB flush is needed before releasing the - * MMU lock. + * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns + * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or + * more SPTEs were zapped since the MMU lock was last acquired. */ bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end, bool can_yield, bool flush) @@ -1487,8 +1473,8 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, * No need for atomics when writing to sp->spt since the page table has * not been linked in yet and thus is not reachable from any other CPU. */ - for (i = 0; i < PT64_ENT_PER_PAGE; i++) - sp->spt[i] = make_huge_page_split_spte(huge_spte, level, i); + for (i = 0; i < SPTE_ENT_PER_PAGE; i++) + sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i); /* * Replace the huge spte with a pointer to the populated lower level @@ -1507,7 +1493,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, * are overwriting from the page stats. But we have to manually update * the page stats with the new present child pages. */ - kvm_update_page_stats(kvm, level - 1, PT64_ENT_PER_PAGE); + kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE); out: trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret); @@ -1731,10 +1717,6 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); } -/* - * Clear leaf entries which could be replaced by large mappings, for - * GFNs within the slot. - */ static void zap_collapsible_spte_range(struct kvm *kvm, struct kvm_mmu_page *root, const struct kvm_memory_slot *slot) @@ -1743,61 +1725,52 @@ static void zap_collapsible_spte_range(struct kvm *kvm, gfn_t end = start + slot->npages; struct tdp_iter iter; int max_mapping_level; - kvm_pfn_t pfn; rcu_read_lock(); - tdp_root_for_each_pte(iter, root, start, end) { + for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) { +retry: if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) continue; - if (!is_shadow_present_pte(iter.old_spte) || - !is_last_spte(iter.old_spte, iter.level)) + if (iter.level > KVM_MAX_HUGEPAGE_LEVEL || + !is_shadow_present_pte(iter.old_spte)) continue; /* - * This is a leaf SPTE. Check if the PFN it maps can - * be mapped at a higher level. + * Don't zap leaf SPTEs, if a leaf SPTE could be replaced with + * a large page size, then its parent would have been zapped + * instead of stepping down. */ - pfn = spte_to_pfn(iter.old_spte); - - if (kvm_is_reserved_pfn(pfn)) + if (is_last_spte(iter.old_spte, iter.level)) continue; - max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, - iter.gfn, pfn, PG_LEVEL_NUM); - - WARN_ON(max_mapping_level < iter.level); - /* - * If this page is already mapped at the highest - * viable level, there's nothing more to do. + * If iter.gfn resides outside of the slot, i.e. the page for + * the current level overlaps but is not contained by the slot, + * then the SPTE can't be made huge. More importantly, trying + * to query that info from slot->arch.lpage_info will cause an + * out-of-bounds access. */ - if (max_mapping_level == iter.level) + if (iter.gfn < start || iter.gfn >= end) continue; - /* - * The page can be remapped at a higher level, so step - * up to zap the parent SPTE. - */ - while (max_mapping_level > iter.level) - tdp_iter_step_up(&iter); + max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, + iter.gfn, PG_LEVEL_NUM); + if (max_mapping_level < iter.level) + continue; /* Note, a successful atomic zap also does a remote TLB flush. */ - tdp_mmu_zap_spte_atomic(kvm, &iter); - - /* - * If the atomic zap fails, the iter will recurse back into - * the same subtree to retry. - */ + if (tdp_mmu_zap_spte_atomic(kvm, &iter)) + goto retry; } rcu_read_unlock(); } /* - * Clear non-leaf entries (and free associated page tables) which could - * be replaced by large mappings, for GFNs within the slot. + * Zap non-leaf SPTEs (and free their associated page tables) which could + * be replaced by huge pages, for GFNs within the slot. */ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *slot) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 3f868fed9114..02f9e4f245bd 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -16,6 +16,7 @@ #include <linux/bsearch.h> #include <linux/sort.h> #include <asm/perf_event.h> +#include <asm/cpu_device_id.h> #include "x86.h" #include "cpuid.h" #include "lapic.h" @@ -24,6 +25,15 @@ /* This is enough to filter the vast majority of currently defined events. */ #define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300 +struct x86_pmu_capability __read_mostly kvm_pmu_cap; +EXPORT_SYMBOL_GPL(kvm_pmu_cap); + +static const struct x86_cpu_id vmx_icl_pebs_cpu[] = { + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, NULL), + {} +}; + /* NOTE: * - Each perf counter is defined as "struct kvm_pmc"; * - There are two types of perf counters: general purpose (gp) and fixed. @@ -34,7 +44,9 @@ * However AMD doesn't support fixed-counters; * - There are three types of index to access perf counters (PMC): * 1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD - * has MSR_K7_PERFCTRn. + * has MSR_K7_PERFCTRn and, for families 15H and later, + * MSR_F15H_PERF_CTRn, where MSR_F15H_PERF_CTR[0-3] are + * aliased to MSR_K7_PERFCTRn. * 2. MSR Index (named idx): This normally is used by RDPMC instruction. * For instance AMD RDPMC instruction uses 0000_0003h in ECX to access * C001_0007h (MSR_K7_PERCTR3). Intel has a similar mechanism, except @@ -46,7 +58,8 @@ * between pmc and perf counters is as the following: * * Intel: [0 .. INTEL_PMC_MAX_GENERIC-1] <=> gp counters * [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed - * * AMD: [0 .. AMD64_NUM_COUNTERS-1] <=> gp counters + * * AMD: [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H + * and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters */ static struct kvm_pmu_ops kvm_pmu_ops __read_mostly; @@ -86,15 +99,22 @@ static void kvm_pmi_trigger_fn(struct irq_work *irq_work) static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); + bool skip_pmi = false; /* Ignore counters that have been reprogrammed already. */ if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) return; - __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); + if (pmc->perf_event && pmc->perf_event->attr.precise_ip) { + /* Indicate PEBS overflow PMI to guest. */ + skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT, + (unsigned long *)&pmu->global_status); + } else { + __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); + } kvm_make_request(KVM_REQ_PMU, pmc->vcpu); - if (!pmc->intr) + if (!pmc->intr || skip_pmi) return; /* @@ -124,6 +144,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config, bool exclude_user, bool exclude_kernel, bool intr) { + struct kvm_pmu *pmu = pmc_to_pmu(pmc); struct perf_event *event; struct perf_event_attr attr = { .type = type, @@ -135,9 +156,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, .exclude_kernel = exclude_kernel, .config = config, }; - - if (type == PERF_TYPE_HARDWARE && config >= PERF_COUNT_HW_MAX) - return; + bool pebs = test_bit(pmc->idx, (unsigned long *)&pmu->pebs_enable); attr.sample_period = get_sample_period(pmc, pmc->counter); @@ -150,6 +169,25 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, */ attr.sample_period = 0; } + if (pebs) { + /* + * The non-zero precision level of guest event makes the ordinary + * guest event becomes a guest PEBS event and triggers the host + * PEBS PMI handler to determine whether the PEBS overflow PMI + * comes from the host counters or the guest. + * + * For most PEBS hardware events, the difference in the software + * precision levels of guest and host PEBS events will not affect + * the accuracy of the PEBS profiling result, because the "event IP" + * in the PEBS record is calibrated on the guest side. + * + * On Icelake everything is fine. Other hardware (GLC+, TNT+) that + * could possibly care here is unsupported and needs changes. + */ + attr.precise_ip = 1; + if (x86_match_cpu(vmx_icl_pebs_cpu) && pmc->idx == 32) + attr.precise_ip = 3; + } event = perf_event_create_kernel_counter(&attr, -1, current, kvm_perf_overflow, pmc); @@ -163,7 +201,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, pmc_to_pmu(pmc)->event_count++; clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); pmc->is_paused = false; - pmc->intr = intr; + pmc->intr = intr || pebs; } static void pmc_pause_counter(struct kvm_pmc *pmc) @@ -189,6 +227,10 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) get_sample_period(pmc, pmc->counter))) return false; + if (!test_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->pebs_enable) && + pmc->perf_event->attr.precise_ip) + return false; + /* reuse perf_event to serve as pmc_reprogram_counter() does*/ perf_event_enable(pmc->perf_event); pmc->is_paused = false; @@ -205,115 +247,83 @@ static int cmp_u64(const void *pa, const void *pb) return (a > b) - (a < b); } -void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) +static bool check_pmu_event_filter(struct kvm_pmc *pmc) { - u64 config; - u32 type = PERF_TYPE_RAW; - struct kvm *kvm = pmc->vcpu->kvm; struct kvm_pmu_event_filter *filter; - struct kvm_pmu *pmu = vcpu_to_pmu(pmc->vcpu); + struct kvm *kvm = pmc->vcpu->kvm; bool allow_event = true; + __u64 key; + int idx; - if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) - printk_once("kvm pmu: pin control bit is ignored\n"); - - pmc->eventsel = eventsel; - - pmc_pause_counter(pmc); - - if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc)) - return; + if (!static_call(kvm_x86_pmu_hw_event_available)(pmc)) + return false; filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); - if (filter) { - __u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB; + if (!filter) + goto out; + if (pmc_is_gp(pmc)) { + key = pmc->eventsel & AMD64_RAW_EVENT_MASK_NB; if (bsearch(&key, filter->events, filter->nevents, sizeof(__u64), cmp_u64)) allow_event = filter->action == KVM_PMU_EVENT_ALLOW; else allow_event = filter->action == KVM_PMU_EVENT_DENY; + } else { + idx = pmc->idx - INTEL_PMC_IDX_FIXED; + if (filter->action == KVM_PMU_EVENT_DENY && + test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) + allow_event = false; + if (filter->action == KVM_PMU_EVENT_ALLOW && + !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) + allow_event = false; } - if (!allow_event) - return; - - if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE | - ARCH_PERFMON_EVENTSEL_INV | - ARCH_PERFMON_EVENTSEL_CMASK | - HSW_IN_TX | - HSW_IN_TX_CHECKPOINTED))) { - config = static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc); - if (config != PERF_COUNT_HW_MAX) - type = PERF_TYPE_HARDWARE; - } - - if (type == PERF_TYPE_RAW) - config = eventsel & pmu->raw_event_mask; - - if (pmc->current_config == eventsel && pmc_resume_counter(pmc)) - return; - - pmc_release_perf_event(pmc); - pmc->current_config = eventsel; - pmc_reprogram_counter(pmc, type, config, - !(eventsel & ARCH_PERFMON_EVENTSEL_USR), - !(eventsel & ARCH_PERFMON_EVENTSEL_OS), - eventsel & ARCH_PERFMON_EVENTSEL_INT); +out: + return allow_event; } -EXPORT_SYMBOL_GPL(reprogram_gp_counter); -void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) +void reprogram_counter(struct kvm_pmc *pmc) { - unsigned en_field = ctrl & 0x3; - bool pmi = ctrl & 0x8; - struct kvm_pmu_event_filter *filter; - struct kvm *kvm = pmc->vcpu->kvm; + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + u64 eventsel = pmc->eventsel; + u64 new_config = eventsel; + u8 fixed_ctr_ctrl; pmc_pause_counter(pmc); - if (!en_field || !pmc_is_enabled(pmc)) + if (!pmc_speculative_in_use(pmc) || !pmc_is_enabled(pmc)) return; - filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); - if (filter) { - if (filter->action == KVM_PMU_EVENT_DENY && - test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) - return; - if (filter->action == KVM_PMU_EVENT_ALLOW && - !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap)) - return; - } - - if (pmc->current_config == (u64)ctrl && pmc_resume_counter(pmc)) + if (!check_pmu_event_filter(pmc)) return; - pmc_release_perf_event(pmc); - - pmc->current_config = (u64)ctrl; - pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE, - static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc), - !(en_field & 0x2), /* exclude user */ - !(en_field & 0x1), /* exclude kernel */ - pmi); -} -EXPORT_SYMBOL_GPL(reprogram_fixed_counter); + if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) + printk_once("kvm pmu: pin control bit is ignored\n"); -void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx) -{ - struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, pmc_idx); + if (pmc_is_fixed(pmc)) { + fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, + pmc->idx - INTEL_PMC_IDX_FIXED); + if (fixed_ctr_ctrl & 0x1) + eventsel |= ARCH_PERFMON_EVENTSEL_OS; + if (fixed_ctr_ctrl & 0x2) + eventsel |= ARCH_PERFMON_EVENTSEL_USR; + if (fixed_ctr_ctrl & 0x8) + eventsel |= ARCH_PERFMON_EVENTSEL_INT; + new_config = (u64)fixed_ctr_ctrl; + } - if (!pmc) + if (pmc->current_config == new_config && pmc_resume_counter(pmc)) return; - if (pmc_is_gp(pmc)) - reprogram_gp_counter(pmc, pmc->eventsel); - else { - int idx = pmc_idx - INTEL_PMC_IDX_FIXED; - u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx); + pmc_release_perf_event(pmc); - reprogram_fixed_counter(pmc, ctrl, idx); - } + pmc->current_config = new_config; + pmc_reprogram_counter(pmc, PERF_TYPE_RAW, + (eventsel & pmu->raw_event_mask), + !(eventsel & ARCH_PERFMON_EVENTSEL_USR), + !(eventsel & ARCH_PERFMON_EVENTSEL_OS), + eventsel & ARCH_PERFMON_EVENTSEL_INT); } EXPORT_SYMBOL_GPL(reprogram_counter); @@ -329,8 +339,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) clear_bit(bit, pmu->reprogram_pmi); continue; } - - reprogram_counter(pmu, bit); + reprogram_counter(pmc); } /* @@ -471,17 +480,6 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu) kvm_pmu_refresh(vcpu); } -static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) -{ - struct kvm_pmu *pmu = pmc_to_pmu(pmc); - - if (pmc_is_fixed(pmc)) - return fixed_ctrl_field(pmu->fixed_ctr_ctrl, - pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3; - - return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE; -} - /* Release perf_events for vPMCs that have been unused for a full time slice. */ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu) { @@ -514,13 +512,12 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu) static void kvm_pmu_incr_counter(struct kvm_pmc *pmc) { - struct kvm_pmu *pmu = pmc_to_pmu(pmc); u64 prev_count; prev_count = pmc->counter; pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc); - reprogram_counter(pmu, pmc->idx); + reprogram_counter(pmc); if (pmc->counter < prev_count) __kvm_perf_overflow(pmc, false); } @@ -528,13 +525,8 @@ static void kvm_pmu_incr_counter(struct kvm_pmc *pmc) static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc, unsigned int perf_hw_id) { - u64 old_eventsel = pmc->eventsel; - unsigned int config; - - pmc->eventsel &= (ARCH_PERFMON_EVENTSEL_EVENT | ARCH_PERFMON_EVENTSEL_UMASK); - config = static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc); - pmc->eventsel = old_eventsel; - return config == perf_hw_id; + return !((pmc->eventsel ^ perf_get_hw_event_config(perf_hw_id)) & + AMD64_RAW_EVENT_MASK_NB); } static inline bool cpl_is_matched(struct kvm_pmc *pmc) diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index e745f443b6a8..5cc5721f260b 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -8,6 +8,9 @@ #define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu)) #define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu) +#define MSR_IA32_MISC_ENABLE_PMU_RO_MASK (MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | \ + MSR_IA32_MISC_ENABLE_BTS_UNAVAIL) + /* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */ #define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf) @@ -22,7 +25,7 @@ struct kvm_event_hw_type_mapping { }; struct kvm_pmu_ops { - unsigned int (*pmc_perf_hw_id)(struct kvm_pmc *pmc); + bool (*hw_event_available)(struct kvm_pmc *pmc); bool (*pmc_is_enabled)(struct kvm_pmc *pmc); struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx); struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu, @@ -144,9 +147,43 @@ static inline void pmc_update_sample_period(struct kvm_pmc *pmc) get_sample_period(pmc, pmc->counter)); } -void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel); -void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx); -void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx); +static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) +{ + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + + if (pmc_is_fixed(pmc)) + return fixed_ctrl_field(pmu->fixed_ctr_ctrl, + pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3; + + return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE; +} + +extern struct x86_pmu_capability kvm_pmu_cap; + +static inline void kvm_init_pmu_capability(void) +{ + bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; + + perf_get_x86_pmu_capability(&kvm_pmu_cap); + + /* + * For Intel, only support guest architectural pmu + * on a host with architectural pmu. + */ + if ((is_intel && !kvm_pmu_cap.version) || !kvm_pmu_cap.num_counters_gp) + enable_pmu = false; + + if (!enable_pmu) { + memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap)); + return; + } + + kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2); + kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed, + KVM_PMC_MAX_FIXED); +} + +void reprogram_counter(struct kvm_pmc *pmc); void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index d1bc5820ea46..6919dee69f18 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -40,6 +40,9 @@ #define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK) #define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK) +static bool force_avic; +module_param_unsafe(force_avic, bool, 0444); + /* Note: * This hash table is used to map VM_ID to a struct kvm_svm, * when handling AMD IOMMU GALOG notification to schedule in @@ -50,6 +53,7 @@ static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); static u32 next_vm_id = 0; static bool next_vm_id_wrapped = 0; static DEFINE_SPINLOCK(svm_vm_data_hash_lock); +enum avic_modes avic_mode; /* * This is a wrapper of struct amd_iommu_ir_data. @@ -59,6 +63,54 @@ struct amd_svm_iommu_ir { void *data; /* Storing pointer to struct amd_ir_data */ }; +static void avic_activate_vmcb(struct vcpu_svm *svm) +{ + struct vmcb *vmcb = svm->vmcb01.ptr; + + vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK); + vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK; + + vmcb->control.int_ctl |= AVIC_ENABLE_MASK; + + /* Note: + * KVM can support hybrid-AVIC mode, where KVM emulates x2APIC + * MSR accesses, while interrupt injection to a running vCPU + * can be achieved using AVIC doorbell. The AVIC hardware still + * accelerate MMIO accesses, but this does not cause any harm + * as the guest is not supposed to access xAPIC mmio when uses x2APIC. + */ + if (apic_x2apic_mode(svm->vcpu.arch.apic) && + avic_mode == AVIC_MODE_X2) { + vmcb->control.int_ctl |= X2APIC_MODE_MASK; + vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID; + /* Disabling MSR intercept for x2APIC registers */ + svm_set_x2apic_msr_interception(svm, false); + } else { + /* For xAVIC and hybrid-xAVIC modes */ + vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID; + /* Enabling MSR intercept for x2APIC registers */ + svm_set_x2apic_msr_interception(svm, true); + } +} + +static void avic_deactivate_vmcb(struct vcpu_svm *svm) +{ + struct vmcb *vmcb = svm->vmcb01.ptr; + + vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK); + vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK; + + /* + * If running nested and the guest uses its own MSR bitmap, there + * is no need to update L0's msr bitmap + */ + if (is_guest_mode(&svm->vcpu) && + vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)) + return; + + /* Enabling MSR intercept for x2APIC registers */ + svm_set_x2apic_msr_interception(svm, true); +} /* Note: * This function is called from IOMMU driver to notify @@ -175,13 +227,12 @@ void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb) vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK; vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK; - vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT; vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK; if (kvm_apicv_activated(svm->vcpu.kvm)) - vmcb->control.int_ctl |= AVIC_ENABLE_MASK; + avic_activate_vmcb(svm); else - vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; + avic_deactivate_vmcb(svm); } static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, @@ -190,7 +241,8 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, u64 *avic_physical_id_table; struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); - if (index >= AVIC_MAX_PHYSICAL_ID_COUNT) + if ((avic_mode == AVIC_MODE_X1 && index > AVIC_MAX_PHYSICAL_ID) || + (avic_mode == AVIC_MODE_X2 && index > X2AVIC_MAX_PHYSICAL_ID)) return NULL; avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page); @@ -237,7 +289,8 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) int id = vcpu->vcpu_id; struct vcpu_svm *svm = to_svm(vcpu); - if (id >= AVIC_MAX_PHYSICAL_ID_COUNT) + if ((avic_mode == AVIC_MODE_X1 && id > AVIC_MAX_PHYSICAL_ID) || + (avic_mode == AVIC_MODE_X2 && id > X2AVIC_MAX_PHYSICAL_ID)) return -EINVAL; if (!vcpu->arch.apic->regs) @@ -279,8 +332,10 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu) */ int cpu = READ_ONCE(vcpu->cpu); - if (cpu != get_cpu()) + if (cpu != get_cpu()) { wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); + trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu)); + } put_cpu(); } @@ -303,7 +358,7 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source if (apic_x2apic_mode(source)) dest = icrh; else - dest = GET_APIC_DEST_FIELD(icrh); + dest = GET_XAPIC_DEST_FIELD(icrh); if (dest_mode == APIC_DEST_PHYSICAL) { /* broadcast destination, use slow path */ @@ -345,9 +400,7 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source logid_index = cluster + __ffs(bitmap); - if (apic_x2apic_mode(source)) { - l1_physical_id = logid_index; - } else { + if (!apic_x2apic_mode(source)) { u32 *avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page); @@ -362,6 +415,23 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source l1_physical_id = logid_entry & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; + } else { + /* + * For x2APIC logical mode, cannot leverage the index. + * Instead, calculate physical ID from logical ID in ICRH. + */ + int cluster = (icrh & 0xffff0000) >> 16; + int apic = ffs(icrh & 0xffff) - 1; + + /* + * If the x2APIC logical ID sub-field (i.e. icrh[15:0]) + * contains anything but a single bit, we cannot use the + * fast path, because it is limited to a single vCPU. + */ + if (apic < 0 || icrh != (1 << apic)) + return -EINVAL; + + l1_physical_id = (cluster << 4) + apic; } } @@ -396,9 +466,15 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, * since entered the guest will have processed pending IRQs at VMRUN. */ kvm_for_each_vcpu(i, vcpu, kvm) { + u32 dest; + + if (apic_x2apic_mode(vcpu->arch.apic)) + dest = icrh; + else + dest = GET_XAPIC_DEST_FIELD(icrh); + if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK, - GET_APIC_DEST_FIELD(icrh), - icrl & APIC_DEST_MASK)) { + dest, icrl & APIC_DEST_MASK)) { vcpu->arch.apic->irr_pending = true; svm_complete_interrupt_delivery(vcpu, icrl & APIC_MODE_MASK, @@ -514,8 +590,13 @@ static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); bool flat = svm->dfr_reg == APIC_DFR_FLAT; - u32 *entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat); + u32 *entry; + + /* Note: x2AVIC does not use logical APIC ID table */ + if (apic_x2apic_mode(vcpu->arch.apic)) + return; + entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat); if (entry) clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry); } @@ -527,6 +608,10 @@ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR); u32 id = kvm_xapic_id(vcpu->arch.apic); + /* AVIC does not support LDR update for x2APIC */ + if (apic_x2apic_mode(vcpu->arch.apic)) + return 0; + if (ldr == svm->ldr_reg) return 0; @@ -654,6 +739,18 @@ void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu) avic_handle_ldr_update(vcpu); } +void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu) +{ + if (!lapic_in_kernel(vcpu) || avic_mode == AVIC_MODE_NONE) + return; + + if (kvm_get_apic_mode(vcpu) == LAPIC_MODE_INVALID) { + WARN_ONCE(true, "Invalid local APIC state (vcpu_id=%d)", vcpu->vcpu_id); + return; + } + avic_refresh_apicv_exec_ctrl(vcpu); +} + static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) { int ret = 0; @@ -906,7 +1003,6 @@ bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason) BIT(APICV_INHIBIT_REASON_NESTED) | BIT(APICV_INHIBIT_REASON_IRQWIN) | BIT(APICV_INHIBIT_REASON_PIT_REINJ) | - BIT(APICV_INHIBIT_REASON_X2APIC) | BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | BIT(APICV_INHIBIT_REASON_SEV) | BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | @@ -968,7 +1064,6 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) return; entry = READ_ONCE(*(svm->avic_physical_id_cache)); - WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK); @@ -1016,9 +1111,9 @@ void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) * accordingly before re-activating. */ avic_apicv_post_state_restore(vcpu); - vmcb->control.int_ctl |= AVIC_ENABLE_MASK; + avic_activate_vmcb(svm); } else { - vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; + avic_deactivate_vmcb(svm); } vmcb_mark_dirty(vmcb, VMCB_AVIC); @@ -1058,3 +1153,44 @@ void avic_vcpu_unblocking(struct kvm_vcpu *vcpu) avic_vcpu_load(vcpu, vcpu->cpu); } + +/* + * Note: + * - The module param avic enable both xAPIC and x2APIC mode. + * - Hypervisor can support both xAVIC and x2AVIC in the same guest. + * - The mode can be switched at run-time. + */ +bool avic_hardware_setup(struct kvm_x86_ops *x86_ops) +{ + if (!npt_enabled) + return false; + + if (boot_cpu_has(X86_FEATURE_AVIC)) { + avic_mode = AVIC_MODE_X1; + pr_info("AVIC enabled\n"); + } else if (force_avic) { + /* + * Some older systems does not advertise AVIC support. + * See Revision Guide for specific AMD processor for more detail. + */ + avic_mode = AVIC_MODE_X1; + pr_warn("AVIC is not supported in CPUID but force enabled"); + pr_warn("Your system might crash and burn"); + } + + /* AVIC is a prerequisite for x2AVIC. */ + if (boot_cpu_has(X86_FEATURE_X2AVIC)) { + if (avic_mode == AVIC_MODE_X1) { + avic_mode = AVIC_MODE_X2; + pr_info("x2AVIC enabled\n"); + } else { + pr_warn(FW_BUG "Cannot support x2AVIC due to AVIC is disabled"); + pr_warn(FW_BUG "Try enable AVIC using force_avic option"); + } + } + + if (avic_mode != AVIC_MODE_NONE) + amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); + + return !!avic_mode; +} diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index ba7cd26f438f..76dcc8a3e849 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -230,6 +230,11 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) break; p = msrpm_offsets[i]; + + /* x2apic msrs are intercepted always for the nested guest */ + if (is_x2apic_msrpm_offset(p)) + continue; + offset = svm->nested.ctl.msrpm_base_pa + (p * 4); if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4)) @@ -320,7 +325,8 @@ static bool __nested_vmcb_check_save(struct kvm_vcpu *vcpu, return false; } - if (CC(!kvm_is_valid_cr4(vcpu, save->cr4))) + /* Note, SVM doesn't have any additional restrictions on CR4. */ + if (CC(!__kvm_is_valid_cr4(vcpu, save->cr4))) return false; if (CC(!kvm_valid_efer(vcpu, save->efer))) @@ -371,6 +377,7 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu, to->nested_ctl = from->nested_ctl; to->event_inj = from->event_inj; to->event_inj_err = from->event_inj_err; + to->next_rip = from->next_rip; to->nested_cr3 = from->nested_cr3; to->virt_ext = from->virt_ext; to->pause_filter_count = from->pause_filter_count; @@ -608,7 +615,33 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 } } -static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) +static inline bool is_evtinj_soft(u32 evtinj) +{ + u32 type = evtinj & SVM_EVTINJ_TYPE_MASK; + u8 vector = evtinj & SVM_EVTINJ_VEC_MASK; + + if (!(evtinj & SVM_EVTINJ_VALID)) + return false; + + if (type == SVM_EVTINJ_TYPE_SOFT) + return true; + + return type == SVM_EVTINJ_TYPE_EXEPT && kvm_exception_is_soft(vector); +} + +static bool is_evtinj_nmi(u32 evtinj) +{ + u32 type = evtinj & SVM_EVTINJ_TYPE_MASK; + + if (!(evtinj & SVM_EVTINJ_VALID)) + return false; + + return type == SVM_EVTINJ_TYPE_NMI; +} + +static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, + unsigned long vmcb12_rip, + unsigned long vmcb12_csbase) { u32 int_ctl_vmcb01_bits = V_INTR_MASKING_MASK; u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK; @@ -650,7 +683,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) vmcb02->control.tsc_offset = vcpu->arch.tsc_offset; - if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) { + if (svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) { WARN_ON(!svm->tsc_scaling_enabled); nested_svm_update_tsc_ratio_msr(vcpu); } @@ -664,6 +697,30 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) vmcb02->control.event_inj = svm->nested.ctl.event_inj; vmcb02->control.event_inj_err = svm->nested.ctl.event_inj_err; + /* + * next_rip is consumed on VMRUN as the return address pushed on the + * stack for injected soft exceptions/interrupts. If nrips is exposed + * to L1, take it verbatim from vmcb12. If nrips is supported in + * hardware but not exposed to L1, stuff the actual L2 RIP to emulate + * what a nrips=0 CPU would do (L1 is responsible for advancing RIP + * prior to injecting the event). + */ + if (svm->nrips_enabled) + vmcb02->control.next_rip = svm->nested.ctl.next_rip; + else if (boot_cpu_has(X86_FEATURE_NRIPS)) + vmcb02->control.next_rip = vmcb12_rip; + + svm->nmi_l1_to_l2 = is_evtinj_nmi(vmcb02->control.event_inj); + if (is_evtinj_soft(vmcb02->control.event_inj)) { + svm->soft_int_injected = true; + svm->soft_int_csbase = vmcb12_csbase; + svm->soft_int_old_rip = vmcb12_rip; + if (svm->nrips_enabled) + svm->soft_int_next_rip = svm->nested.ctl.next_rip; + else + svm->soft_int_next_rip = vmcb12_rip; + } + vmcb02->control.virt_ext = vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK; if (svm->lbrv_enabled) @@ -745,7 +802,7 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr); svm_switch_vmcb(svm, &svm->nested.vmcb02); - nested_vmcb02_prepare_control(svm); + nested_vmcb02_prepare_control(svm, vmcb12->save.rip, vmcb12->save.cs.base); nested_vmcb02_prepare_save(svm, vmcb12); ret = nested_svm_load_cr3(&svm->vcpu, svm->nested.save.cr3, @@ -834,6 +891,8 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) out_exit_err: svm->nested.nested_run_pending = 0; + svm->nmi_l1_to_l2 = false; + svm->soft_int_injected = false; svm->vmcb->control.exit_code = SVM_EXIT_ERR; svm->vmcb->control.exit_code_hi = 0; @@ -982,7 +1041,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS); } - if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) { + if (svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) { WARN_ON(!svm->tsc_scaling_enabled); vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio); @@ -1421,6 +1480,7 @@ static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst, dst->nested_ctl = from->nested_ctl; dst->event_inj = from->event_inj; dst->event_inj_err = from->event_inj_err; + dst->next_rip = from->next_rip; dst->nested_cr3 = from->nested_cr3; dst->virt_ext = from->virt_ext; dst->pause_filter_count = from->pause_filter_count; @@ -1605,7 +1665,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, nested_copy_vmcb_control_to_cache(svm, ctl); svm_switch_vmcb(svm, &svm->nested.vmcb02); - nested_vmcb02_prepare_control(svm); + nested_vmcb02_prepare_control(svm, svm->vmcb->save.rip, svm->vmcb->save.cs.base); /* * While the nested guest CR3 is already checked and set by diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 136039fc6d01..f24613a108c5 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -33,34 +33,6 @@ enum index { INDEX_ERROR, }; -/* duplicated from amd_perfmon_event_map, K7 and above should work. */ -static struct kvm_event_hw_type_mapping amd_event_mapping[] = { - [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES }, - [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, - [2] = { 0x7d, 0x07, PERF_COUNT_HW_CACHE_REFERENCES }, - [3] = { 0x7e, 0x07, PERF_COUNT_HW_CACHE_MISSES }, - [4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, - [5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, - [6] = { 0xd0, 0x00, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, - [7] = { 0xd1, 0x00, PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, -}; - -/* duplicated from amd_f17h_perfmon_event_map. */ -static struct kvm_event_hw_type_mapping amd_f17h_event_mapping[] = { - [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES }, - [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, - [2] = { 0x60, 0xff, PERF_COUNT_HW_CACHE_REFERENCES }, - [3] = { 0x64, 0x09, PERF_COUNT_HW_CACHE_MISSES }, - [4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, - [5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, - [6] = { 0x87, 0x02, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, - [7] = { 0x87, 0x01, PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, -}; - -/* amd_pmc_perf_hw_id depends on these being the same size */ -static_assert(ARRAY_SIZE(amd_event_mapping) == - ARRAY_SIZE(amd_f17h_event_mapping)); - static unsigned int get_msr_base(struct kvm_pmu *pmu, enum pmu_type type) { struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); @@ -154,31 +126,9 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, return &pmu->gp_counters[msr_to_index(msr)]; } -static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc) +static bool amd_hw_event_available(struct kvm_pmc *pmc) { - struct kvm_event_hw_type_mapping *event_mapping; - u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT; - u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; - int i; - - /* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */ - if (WARN_ON(pmc_is_fixed(pmc))) - return PERF_COUNT_HW_MAX; - - if (guest_cpuid_family(pmc->vcpu) >= 0x17) - event_mapping = amd_f17h_event_mapping; - else - event_mapping = amd_event_mapping; - - for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++) - if (event_mapping[i].eventsel == event_select - && event_mapping[i].unit_mask == unit_mask) - break; - - if (i == ARRAY_SIZE(amd_event_mapping)) - return PERF_COUNT_HW_MAX; - - return event_mapping[i].event_type; + return true; } /* check if a PMC is enabled by comparing it against global_ctrl bits. Because @@ -286,8 +236,10 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL); if (pmc) { data &= ~pmu->reserved_bits; - if (data != pmc->eventsel) - reprogram_gp_counter(pmc, data); + if (data != pmc->eventsel) { + pmc->eventsel = data; + reprogram_counter(pmc); + } return 0; } @@ -343,7 +295,7 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu) } struct kvm_pmu_ops amd_pmu_ops __initdata = { - .pmc_perf_hw_id = amd_pmc_perf_hw_id, + .hw_event_available = amd_hw_event_available, .pmc_is_enabled = amd_pmc_is_enabled, .pmc_idx_to_pmc = amd_pmc_idx_to_pmc, .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc, diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 0c240ed04f96..b0e793e7d85c 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -603,6 +603,9 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) save->xss = svm->vcpu.arch.ia32_xss; save->dr6 = svm->vcpu.arch.dr6; + pr_debug("Virtual Machine Save Area (VMSA):\n"); + print_hex_dump(KERN_CONT, "", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false); + return 0; } @@ -1606,38 +1609,35 @@ static int sev_lock_vcpus_for_migration(struct kvm *kvm, { struct kvm_vcpu *vcpu; unsigned long i, j; - bool first = true; kvm_for_each_vcpu(i, vcpu, kvm) { if (mutex_lock_killable_nested(&vcpu->mutex, role)) goto out_unlock; - if (first) { +#ifdef CONFIG_PROVE_LOCKING + if (!i) /* * Reset the role to one that avoids colliding with * the role used for the first vcpu mutex. */ role = SEV_NR_MIGRATION_ROLES; - first = false; - } else { + else mutex_release(&vcpu->mutex.dep_map, _THIS_IP_); - } +#endif } return 0; out_unlock: - first = true; kvm_for_each_vcpu(j, vcpu, kvm) { if (i == j) break; - if (first) - first = false; - else +#ifdef CONFIG_PROVE_LOCKING + if (j) mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_); - +#endif mutex_unlock(&vcpu->mutex); } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 44bbf25dfeb9..38f873cb6f2c 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -74,6 +74,8 @@ static uint64_t osvw_len = 4, osvw_status; static DEFINE_PER_CPU(u64, current_tsc_ratio); +#define X2APIC_MSR(x) (APIC_BASE_MSR + (x >> 4)) + static const struct svm_direct_access_msrs { u32 index; /* Index of the MSR */ bool always; /* True if intercept is initially cleared */ @@ -100,6 +102,38 @@ static const struct svm_direct_access_msrs { { .index = MSR_IA32_CR_PAT, .always = false }, { .index = MSR_AMD64_SEV_ES_GHCB, .always = true }, { .index = MSR_TSC_AUX, .always = false }, + { .index = X2APIC_MSR(APIC_ID), .always = false }, + { .index = X2APIC_MSR(APIC_LVR), .always = false }, + { .index = X2APIC_MSR(APIC_TASKPRI), .always = false }, + { .index = X2APIC_MSR(APIC_ARBPRI), .always = false }, + { .index = X2APIC_MSR(APIC_PROCPRI), .always = false }, + { .index = X2APIC_MSR(APIC_EOI), .always = false }, + { .index = X2APIC_MSR(APIC_RRR), .always = false }, + { .index = X2APIC_MSR(APIC_LDR), .always = false }, + { .index = X2APIC_MSR(APIC_DFR), .always = false }, + { .index = X2APIC_MSR(APIC_SPIV), .always = false }, + { .index = X2APIC_MSR(APIC_ISR), .always = false }, + { .index = X2APIC_MSR(APIC_TMR), .always = false }, + { .index = X2APIC_MSR(APIC_IRR), .always = false }, + { .index = X2APIC_MSR(APIC_ESR), .always = false }, + { .index = X2APIC_MSR(APIC_ICR), .always = false }, + { .index = X2APIC_MSR(APIC_ICR2), .always = false }, + + /* + * Note: + * AMD does not virtualize APIC TSC-deadline timer mode, but it is + * emulated by KVM. When setting APIC LVTT (0x832) register bit 18, + * the AVIC hardware would generate GP fault. Therefore, always + * intercept the MSR 0x832, and do not setup direct_access_msr. + */ + { .index = X2APIC_MSR(APIC_LVTTHMR), .always = false }, + { .index = X2APIC_MSR(APIC_LVTPC), .always = false }, + { .index = X2APIC_MSR(APIC_LVT0), .always = false }, + { .index = X2APIC_MSR(APIC_LVT1), .always = false }, + { .index = X2APIC_MSR(APIC_LVTERR), .always = false }, + { .index = X2APIC_MSR(APIC_TMICT), .always = false }, + { .index = X2APIC_MSR(APIC_TMCCT), .always = false }, + { .index = X2APIC_MSR(APIC_TDCR), .always = false }, { .index = MSR_INVALID, .always = false }, }; @@ -188,9 +222,6 @@ module_param(tsc_scaling, int, 0444); static bool avic; module_param(avic, bool, 0444); -static bool force_avic; -module_param_unsafe(force_avic, bool, 0444); - bool __read_mostly dump_invalid_vmcb; module_param(dump_invalid_vmcb, bool, 0644); @@ -342,9 +373,11 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) } -static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu) +static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu, + bool commit_side_effects) { struct vcpu_svm *svm = to_svm(vcpu); + unsigned long old_rflags; /* * SEV-ES does not expose the next RIP. The RIP update is controlled by @@ -359,18 +392,75 @@ static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu) } if (!svm->next_rip) { + if (unlikely(!commit_side_effects)) + old_rflags = svm->vmcb->save.rflags; + if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP)) return 0; + + if (unlikely(!commit_side_effects)) + svm->vmcb->save.rflags = old_rflags; } else { kvm_rip_write(vcpu, svm->next_rip); } done: - svm_set_interrupt_shadow(vcpu, 0); + if (likely(commit_side_effects)) + svm_set_interrupt_shadow(vcpu, 0); return 1; } +static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu) +{ + return __svm_skip_emulated_instruction(vcpu, true); +} + +static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu) +{ + unsigned long rip, old_rip = kvm_rip_read(vcpu); + struct vcpu_svm *svm = to_svm(vcpu); + + /* + * Due to architectural shortcomings, the CPU doesn't always provide + * NextRIP, e.g. if KVM intercepted an exception that occurred while + * the CPU was vectoring an INTO/INT3 in the guest. Temporarily skip + * the instruction even if NextRIP is supported to acquire the next + * RIP so that it can be shoved into the NextRIP field, otherwise + * hardware will fail to advance guest RIP during event injection. + * Drop the exception/interrupt if emulation fails and effectively + * retry the instruction, it's the least awful option. If NRIPS is + * in use, the skip must not commit any side effects such as clearing + * the interrupt shadow or RFLAGS.RF. + */ + if (!__svm_skip_emulated_instruction(vcpu, !nrips)) + return -EIO; + + rip = kvm_rip_read(vcpu); + + /* + * Save the injection information, even when using next_rip, as the + * VMCB's next_rip will be lost (cleared on VM-Exit) if the injection + * doesn't complete due to a VM-Exit occurring while the CPU is + * vectoring the event. Decoding the instruction isn't guaranteed to + * work as there may be no backing instruction, e.g. if the event is + * being injected by L1 for L2, or if the guest is patching INT3 into + * a different instruction. + */ + svm->soft_int_injected = true; + svm->soft_int_csbase = svm->vmcb->save.cs.base; + svm->soft_int_old_rip = old_rip; + svm->soft_int_next_rip = rip; + + if (nrips) + kvm_rip_write(vcpu, old_rip); + + if (static_cpu_has(X86_FEATURE_NRIPS)) + svm->vmcb->control.next_rip = rip; + + return 0; +} + static void svm_queue_exception(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -380,21 +470,9 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu) kvm_deliver_exception_payload(vcpu); - if (nr == BP_VECTOR && !nrips) { - unsigned long rip, old_rip = kvm_rip_read(vcpu); - - /* - * For guest debugging where we have to reinject #BP if some - * INT3 is guest-owned: - * Emulate nRIP by moving RIP forward. Will fail if injection - * raises a fault that is not intercepted. Still better than - * failing in all cases. - */ - (void)svm_skip_emulated_instruction(vcpu); - rip = kvm_rip_read(vcpu); - svm->int3_rip = rip + svm->vmcb->save.cs.base; - svm->int3_injected = rip - old_rip; - } + if (kvm_exception_is_soft(nr) && + svm_update_soft_interrupt_rip(vcpu)) + return; svm->vmcb->control.event_inj = nr | SVM_EVTINJ_VALID @@ -736,6 +814,29 @@ void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm) } } +void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept) +{ + int i; + + if (intercept == svm->x2avic_msrs_intercepted) + return; + + if (avic_mode != AVIC_MODE_X2 || + !apic_x2apic_mode(svm->vcpu.arch.apic)) + return; + + for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) { + int index = direct_access_msrs[i].index; + + if ((index < APIC_BASE_MSR) || + (index > APIC_BASE_MSR + 0xff)) + continue; + set_msr_interception(&svm->vcpu, svm->msrpm, index, + !intercept, !intercept); + } + + svm->x2avic_msrs_intercepted = intercept; +} void svm_vcpu_free_msrpm(u32 *msrpm) { @@ -1231,7 +1332,7 @@ static void __svm_vcpu_reset(struct kvm_vcpu *vcpu) svm_init_osvw(vcpu); vcpu->arch.microcode_version = 0x01000065; - svm->tsc_ratio_msr = kvm_default_tsc_scaling_ratio; + svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio; if (sev_es_guest(vcpu->kvm)) sev_es_vcpu_reset(svm); @@ -1299,6 +1400,8 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu) goto error_free_vmsa_page; } + svm->x2avic_msrs_intercepted = true; + svm->vmcb01.ptr = page_address(vmcb01_page); svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT); svm_switch_vmcb(svm, &svm->vmcb01); @@ -2345,6 +2448,7 @@ static int task_switch_interception(struct kvm_vcpu *vcpu) kvm_clear_exception_queue(vcpu); break; case SVM_EXITINTINFO_TYPE_INTR: + case SVM_EXITINTINFO_TYPE_SOFT: kvm_clear_interrupt_queue(vcpu); break; default: @@ -3375,35 +3479,49 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; + + if (svm->nmi_l1_to_l2) + return; + vcpu->arch.hflags |= HF_NMI_MASK; if (!sev_es_guest(vcpu->kvm)) svm_set_intercept(svm, INTERCEPT_IRET); ++vcpu->stat.nmi_injections; } -static void svm_inject_irq(struct kvm_vcpu *vcpu) +static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) { struct vcpu_svm *svm = to_svm(vcpu); + u32 type; + + if (vcpu->arch.interrupt.soft) { + if (svm_update_soft_interrupt_rip(vcpu)) + return; - BUG_ON(!(gif_set(svm))); + type = SVM_EVTINJ_TYPE_SOFT; + } else { + type = SVM_EVTINJ_TYPE_INTR; + } - trace_kvm_inj_virq(vcpu->arch.interrupt.nr); + trace_kvm_inj_virq(vcpu->arch.interrupt.nr, + vcpu->arch.interrupt.soft, reinjected); ++vcpu->stat.irq_injections; svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | - SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; + SVM_EVTINJ_VALID | type; } void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, int trig_mode, int vector) { /* - * vcpu->arch.apicv_active must be read after vcpu->mode. + * apic->apicv_active must be read after vcpu->mode. * Pairs with smp_store_release in vcpu_enter_guest. */ bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE); - if (!READ_ONCE(vcpu->arch.apicv_active)) { + /* Note, this is called iff the local APIC is in-kernel. */ + if (!READ_ONCE(vcpu->arch.apic->apicv_active)) { /* Process the interrupt via inject_pending_event */ kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); @@ -3668,15 +3786,49 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; } +static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector, + int type) +{ + bool is_exception = (type == SVM_EXITINTINFO_TYPE_EXEPT); + bool is_soft = (type == SVM_EXITINTINFO_TYPE_SOFT); + struct vcpu_svm *svm = to_svm(vcpu); + + /* + * If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's + * associated with the original soft exception/interrupt. next_rip is + * cleared on all exits that can occur while vectoring an event, so KVM + * needs to manually set next_rip for re-injection. Unlike the !nrips + * case below, this needs to be done if and only if KVM is re-injecting + * the same event, i.e. if the event is a soft exception/interrupt, + * otherwise next_rip is unused on VMRUN. + */ + if (nrips && (is_soft || (is_exception && kvm_exception_is_soft(vector))) && + kvm_is_linear_rip(vcpu, svm->soft_int_old_rip + svm->soft_int_csbase)) + svm->vmcb->control.next_rip = svm->soft_int_next_rip; + /* + * If NRIPS isn't enabled, KVM must manually advance RIP prior to + * injecting the soft exception/interrupt. That advancement needs to + * be unwound if vectoring didn't complete. Note, the new event may + * not be the injected event, e.g. if KVM injected an INTn, the INTn + * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will + * be the reported vectored event, but RIP still needs to be unwound. + */ + else if (!nrips && (is_soft || is_exception) && + kvm_is_linear_rip(vcpu, svm->soft_int_next_rip + svm->soft_int_csbase)) + kvm_rip_write(vcpu, svm->soft_int_old_rip); +} + static void svm_complete_interrupts(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); u8 vector; int type; u32 exitintinfo = svm->vmcb->control.exit_int_info; - unsigned int3_injected = svm->int3_injected; + bool nmi_l1_to_l2 = svm->nmi_l1_to_l2; + bool soft_int_injected = svm->soft_int_injected; - svm->int3_injected = 0; + svm->nmi_l1_to_l2 = false; + svm->soft_int_injected = false; /* * If we've made progress since setting HF_IRET_MASK, we've @@ -3701,9 +3853,13 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; + if (soft_int_injected) + svm_complete_soft_interrupt(vcpu, vector, type); + switch (type) { case SVM_EXITINTINFO_TYPE_NMI: vcpu->arch.nmi_injected = true; + svm->nmi_l1_to_l2 = nmi_l1_to_l2; break; case SVM_EXITINTINFO_TYPE_EXEPT: /* @@ -3712,18 +3868,6 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) if (vector == X86_TRAP_VC) break; - /* - * In case of software exceptions, do not reinject the vector, - * but re-execute the instruction instead. Rewind RIP first - * if we emulated INT3 before. - */ - if (kvm_exception_is_soft(vector)) { - if (vector == BP_VECTOR && int3_injected && - kvm_is_linear_rip(vcpu, svm->int3_rip)) - kvm_rip_write(vcpu, - kvm_rip_read(vcpu) - int3_injected); - break; - } if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { u32 err = svm->vmcb->control.exit_int_info_err; kvm_requeue_exception_e(vcpu, vector, err); @@ -3734,9 +3878,13 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu) case SVM_EXITINTINFO_TYPE_INTR: kvm_queue_interrupt(vcpu, vector, false); break; + case SVM_EXITINTINFO_TYPE_SOFT: + kvm_queue_interrupt(vcpu, vector, true); + break; default: break; } + } static void svm_cancel_injection(struct kvm_vcpu *vcpu) @@ -3952,7 +4100,7 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, hv_track_root_tdp(vcpu, root_hpa); cr3 = vcpu->arch.cr3; - } else if (vcpu->arch.mmu->root_role.level >= PT64_ROOT_4LEVEL) { + } else if (root_level >= PT64_ROOT_4LEVEL) { cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu); } else { /* PCID in the guest should be impossible with a 32-bit MMU. */ @@ -4013,16 +4161,10 @@ static bool svm_has_emulated_msr(struct kvm *kvm, u32 index) return true; } -static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) -{ - return 0; -} - static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_cpuid_entry2 *best; - struct kvm *kvm = vcpu->kvm; vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && boot_cpu_has(X86_FEATURE_XSAVE) && @@ -4049,19 +4191,11 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) /* For sev guests, the memory encryption bit is not reserved in CR3. */ if (sev_guest(vcpu->kvm)) { - best = kvm_find_cpuid_entry(vcpu, 0x8000001F, 0); + best = kvm_find_cpuid_entry(vcpu, 0x8000001F); if (best) vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f)); } - if (kvm_vcpu_apicv_active(vcpu)) { - /* - * AVIC does not work with an x2APIC mode guest. If the X2APIC feature - * is exposed to the guest, disable AVIC. - */ - if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC)) - kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_X2APIC); - } init_vmcb_after_set_cpuid(vcpu); } @@ -4673,11 +4807,11 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .enable_nmi_window = svm_enable_nmi_window, .enable_irq_window = svm_enable_irq_window, .update_cr8_intercept = svm_update_cr8_intercept, + .set_virtual_apic_mode = avic_set_virtual_apic_mode, .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl, .check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons, .apicv_post_state_restore = avic_apicv_post_state_restore, - .get_mt_mask = svm_get_mt_mask, .get_exit_info = svm_get_exit_info, .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid, @@ -4773,7 +4907,7 @@ static __init void svm_set_cpu_caps(void) { kvm_set_cpu_caps(); - supported_xss = 0; + kvm_caps.supported_xss = 0; /* CPUID 0x80000001 and 0x8000000A (SVM features) */ if (nested) { @@ -4849,7 +4983,8 @@ static __init int svm_hardware_setup(void) init_msrpm_offsets(); - supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); + kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | + XFEATURE_MASK_BNDCSR); if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) kvm_enable_efer_bits(EFER_FFXSR); @@ -4859,11 +4994,11 @@ static __init int svm_hardware_setup(void) tsc_scaling = false; } else { pr_info("TSC scaling supported\n"); - kvm_has_tsc_control = true; + kvm_caps.has_tsc_control = true; } } - kvm_max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX; - kvm_tsc_scaling_ratio_frac_bits = 32; + kvm_caps.max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX; + kvm_caps.tsc_scaling_ratio_frac_bits = 32; tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX); @@ -4917,17 +5052,9 @@ static __init int svm_hardware_setup(void) nrips = false; } - enable_apicv = avic = avic && npt_enabled && (boot_cpu_has(X86_FEATURE_AVIC) || force_avic); + enable_apicv = avic = avic && avic_hardware_setup(&svm_x86_ops); - if (enable_apicv) { - if (!boot_cpu_has(X86_FEATURE_AVIC)) { - pr_warn("AVIC is not supported in CPUID but force enabled"); - pr_warn("Your system might crash and burn"); - } else - pr_info("AVIC enabled\n"); - - amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); - } else { + if (!enable_apicv) { svm_x86_ops.vcpu_blocking = NULL; svm_x86_ops.vcpu_unblocking = NULL; svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 9223ac100ef5..6a7686bf6900 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -29,13 +29,21 @@ #define IOPM_SIZE PAGE_SIZE * 3 #define MSRPM_SIZE PAGE_SIZE * 2 -#define MAX_DIRECT_ACCESS_MSRS 21 -#define MSRPM_OFFSETS 16 +#define MAX_DIRECT_ACCESS_MSRS 46 +#define MSRPM_OFFSETS 32 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; extern int vgif; extern bool intercept_smi; +enum avic_modes { + AVIC_MODE_NONE = 0, + AVIC_MODE_X1, + AVIC_MODE_X2, +}; + +extern enum avic_modes avic_mode; + /* * Clean bits in VMCB. * VMCB_ALL_CLEAN_MASK might also need to @@ -139,6 +147,7 @@ struct vmcb_ctrl_area_cached { u64 nested_ctl; u32 event_inj; u32 event_inj_err; + u64 next_rip; u64 nested_cr3; u64 virt_ext; u32 clean; @@ -228,9 +237,12 @@ struct vcpu_svm { bool nmi_singlestep; u64 nmi_singlestep_guest_rflags; + bool nmi_l1_to_l2; - unsigned int3_injected; - unsigned long int3_rip; + unsigned long soft_int_csbase; + unsigned long soft_int_old_rip; + unsigned long soft_int_next_rip; + bool soft_int_injected; /* optional nested SVM features that are enabled for this guest */ bool nrips_enabled : 1; @@ -264,6 +276,8 @@ struct vcpu_svm { struct vcpu_sev_es_state sev_es; bool guest_state_loaded; + + bool x2avic_msrs_intercepted; }; struct svm_cpu_data { @@ -509,6 +523,15 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm) return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE; } +static inline bool is_x2apic_msrpm_offset(u32 offset) +{ + /* 4 msrs per u8, and 4 u8 in u32 */ + u32 msr = offset * 16; + + return (msr >= APIC_BASE_MSR) && + (msr < (APIC_BASE_MSR + 0x100)); +} + /* svm.c */ #define MSR_INVALID 0xffffffffU @@ -534,6 +557,7 @@ void svm_set_gif(struct vcpu_svm *svm, bool value); int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code); void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, int read, int write); +void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool disable); void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, int trig_mode, int vec); @@ -603,6 +627,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops; /* avic.c */ +bool avic_hardware_setup(struct kvm_x86_ops *ops); int avic_ga_log_notifier(u32 ga_tag); void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); @@ -613,18 +638,16 @@ int avic_init_vcpu(struct vcpu_svm *svm); void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu); void avic_vcpu_put(struct kvm_vcpu *vcpu); void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu); -void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu); void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu); bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason); -void avic_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr); -void avic_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr); -bool avic_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu); int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); void avic_vcpu_blocking(struct kvm_vcpu *vcpu); void avic_vcpu_unblocking(struct kvm_vcpu *vcpu); void avic_ring_doorbell(struct kvm_vcpu *vcpu); unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu); +void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu); + /* sev.c */ diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index de4762517569..2120d7c060a9 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -154,7 +154,7 @@ TRACE_EVENT(kvm_xen_hypercall, TRACE_EVENT(kvm_pio, TP_PROTO(unsigned int rw, unsigned int port, unsigned int size, - unsigned int count, void *data), + unsigned int count, const void *data), TP_ARGS(rw, port, size, count, data), TP_STRUCT__entry( @@ -333,18 +333,24 @@ TRACE_EVENT_KVM_EXIT(kvm_exit); * Tracepoint for kvm interrupt injection: */ TRACE_EVENT(kvm_inj_virq, - TP_PROTO(unsigned int irq), - TP_ARGS(irq), + TP_PROTO(unsigned int vector, bool soft, bool reinjected), + TP_ARGS(vector, soft, reinjected), TP_STRUCT__entry( - __field( unsigned int, irq ) + __field( unsigned int, vector ) + __field( bool, soft ) + __field( bool, reinjected ) ), TP_fast_assign( - __entry->irq = irq; + __entry->vector = vector; + __entry->soft = soft; + __entry->reinjected = reinjected; ), - TP_printk("irq %u", __entry->irq) + TP_printk("%s 0x%x%s", + __entry->soft ? "Soft/INTn" : "IRQ", __entry->vector, + __entry->reinjected ? " [reinjected]" : "") ); #define EXS(x) { x##_VECTOR, "#" #x } @@ -358,25 +364,30 @@ TRACE_EVENT(kvm_inj_virq, * Tracepoint for kvm interrupt injection: */ TRACE_EVENT(kvm_inj_exception, - TP_PROTO(unsigned exception, bool has_error, unsigned error_code), - TP_ARGS(exception, has_error, error_code), + TP_PROTO(unsigned exception, bool has_error, unsigned error_code, + bool reinjected), + TP_ARGS(exception, has_error, error_code, reinjected), TP_STRUCT__entry( __field( u8, exception ) __field( u8, has_error ) __field( u32, error_code ) + __field( bool, reinjected ) ), TP_fast_assign( __entry->exception = exception; __entry->has_error = has_error; __entry->error_code = error_code; + __entry->reinjected = reinjected; ), - TP_printk("%s (0x%x)", + TP_printk("%s%s%s%s%s", __print_symbolic(__entry->exception, kvm_trace_sym_exc), - /* FIXME: don't print error_code if not present */ - __entry->has_error ? __entry->error_code : 0) + !__entry->has_error ? "" : " (", + !__entry->has_error ? "" : __print_symbolic(__entry->error_code, { }), + !__entry->has_error ? "" : ")", + __entry->reinjected ? " [reinjected]" : "") ); /* @@ -1479,6 +1490,24 @@ TRACE_EVENT(kvm_avic_kick_vcpu_slowpath, __entry->icrh, __entry->icrl, __entry->index) ); +TRACE_EVENT(kvm_avic_doorbell, + TP_PROTO(u32 vcpuid, u32 apicid), + TP_ARGS(vcpuid, apicid), + + TP_STRUCT__entry( + __field(u32, vcpuid) + __field(u32, apicid) + ), + + TP_fast_assign( + __entry->vcpuid = vcpuid; + __entry->apicid = apicid; + ), + + TP_printk("vcpuid=%u, apicid=%u", + __entry->vcpuid, __entry->apicid) +); + TRACE_EVENT(kvm_hv_timer_state, TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_in_use), TP_ARGS(vcpu_id, hv_timer_in_use), diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index c0e24826a86f..c5e5dfef69c7 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -6,6 +6,8 @@ #include "../lapic.h" #include "../x86.h" +#include "../pmu.h" +#include "../cpuid.h" extern bool __read_mostly enable_vpid; extern bool __read_mostly flexpriority_enabled; @@ -13,6 +15,7 @@ extern bool __read_mostly enable_ept; extern bool __read_mostly enable_unrestricted_guest; extern bool __read_mostly enable_ept_ad_bits; extern bool __read_mostly enable_pml; +extern bool __read_mostly enable_ipiv; extern int __read_mostly pt_mode; #define PT_MODE_SYSTEM 0 @@ -59,6 +62,7 @@ struct vmcs_config { u32 pin_based_exec_ctrl; u32 cpu_based_exec_ctrl; u32 cpu_based_2nd_exec_ctrl; + u64 cpu_based_3rd_exec_ctrl; u32 vmexit_ctrl; u32 vmentry_ctrl; struct nested_vmx_msrs nested; @@ -94,20 +98,17 @@ static inline bool cpu_has_vmx_posted_intr(void) static inline bool cpu_has_load_ia32_efer(void) { - return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_EFER) && - (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_EFER); + return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_EFER; } static inline bool cpu_has_load_perf_global_ctrl(void) { - return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && - (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); + return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; } static inline bool cpu_has_vmx_mpx(void) { - return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) && - (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS); + return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS; } static inline bool cpu_has_vmx_tpr_shadow(void) @@ -131,6 +132,12 @@ static inline bool cpu_has_secondary_exec_ctrls(void) CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; } +static inline bool cpu_has_tertiary_exec_ctrls(void) +{ + return vmcs_config.cpu_based_exec_ctrl & + CPU_BASED_ACTIVATE_TERTIARY_CONTROLS; +} + static inline bool cpu_has_vmx_virtualize_apic_accesses(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -276,6 +283,11 @@ static inline bool cpu_has_vmx_apicv(void) cpu_has_vmx_posted_intr(); } +static inline bool cpu_has_vmx_ipiv(void) +{ + return vmcs_config.cpu_based_3rd_exec_ctrl & TERTIARY_EXEC_IPI_VIRT; +} + static inline bool cpu_has_vmx_flexpriority(void) { return cpu_has_vmx_tpr_shadow() && @@ -363,7 +375,6 @@ static inline bool cpu_has_vmx_intel_pt(void) rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); return (vmx_msr & MSR_IA32_VMX_MISC_INTEL_PT) && (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PT_USE_GPA) && - (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_IA32_RTIT_CTL) && (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL); } @@ -385,23 +396,31 @@ static inline bool vmx_pt_mode_is_host_guest(void) return pt_mode == PT_MODE_HOST_GUEST; } +static inline bool vmx_pebs_supported(void) +{ + return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept; +} + static inline u64 vmx_get_perf_capabilities(void) { - u64 perf_cap = 0; + u64 perf_cap = PMU_CAP_FW_WRITES; + u64 host_perf_cap = 0; if (!enable_pmu) - return perf_cap; + return 0; if (boot_cpu_has(X86_FEATURE_PDCM)) - rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap); + rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); - perf_cap &= PMU_CAP_LBR_FMT; + perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; - /* - * Since counters are virtualized, KVM would support full - * width counting unconditionally, even if the host lacks it. - */ - return PMU_CAP_FW_WRITES | perf_cap; + if (vmx_pebs_supported()) { + perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK; + if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4) + perf_cap &= ~PERF_CAP_PEBS_BASELINE; + } + + return perf_cap; } static inline u64 vmx_supported_debugctl(void) @@ -417,4 +436,10 @@ static inline u64 vmx_supported_debugctl(void) return debugctl; } +static inline bool cpu_has_notify_vmexit(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_NOTIFY_VM_EXITING; +} + #endif /* __KVM_X86_VMX_CAPS_H */ diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index 87e3dc10edf4..6a61b1ae7942 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -297,8 +297,10 @@ const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1); #if IS_ENABLED(CONFIG_HYPERV) __init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) { + vmcs_conf->cpu_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_EXEC_CTRL; vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL; vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC; + vmcs_conf->cpu_based_3rd_exec_ctrl = 0; vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index 8d70f9aea94b..f886a8ff0342 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -50,6 +50,7 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs); */ #define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \ PIN_BASED_VMX_PREEMPTION_TIMER) +#define EVMCS1_UNSUPPORTED_EXEC_CTRL (CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) #define EVMCS1_UNSUPPORTED_2NDEXEC \ (SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index ab135f9ef52f..ddd4367d4826 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -311,11 +311,12 @@ static void free_nested(struct kvm_vcpu *vcpu) vmx->nested.cached_vmcs12 = NULL; kfree(vmx->nested.cached_shadow_vmcs12); vmx->nested.cached_shadow_vmcs12 = NULL; - /* Unpin physical memory we referred to in the vmcs02 */ - if (vmx->nested.apic_access_page) { - kvm_release_page_clean(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = NULL; - } + /* + * Unpin physical memory we referred to in the vmcs02. The APIC access + * page's backing page (yeah, confusing) shouldn't actually be accessed, + * and if it is written, the contents are irrelevant. + */ + kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); vmx->nested.pi_desc = NULL; @@ -1223,7 +1224,7 @@ static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | /* reserved */ BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); - u64 vmx_basic = vmx->nested.msrs.basic; + u64 vmx_basic = vmcs_config.nested.basic; if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) return -EINVAL; @@ -1246,36 +1247,42 @@ static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) return 0; } -static int -vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index, + u32 **low, u32 **high) { - u64 supported; - u32 *lowp, *highp; - switch (msr_index) { case MSR_IA32_VMX_TRUE_PINBASED_CTLS: - lowp = &vmx->nested.msrs.pinbased_ctls_low; - highp = &vmx->nested.msrs.pinbased_ctls_high; + *low = &msrs->pinbased_ctls_low; + *high = &msrs->pinbased_ctls_high; break; case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: - lowp = &vmx->nested.msrs.procbased_ctls_low; - highp = &vmx->nested.msrs.procbased_ctls_high; + *low = &msrs->procbased_ctls_low; + *high = &msrs->procbased_ctls_high; break; case MSR_IA32_VMX_TRUE_EXIT_CTLS: - lowp = &vmx->nested.msrs.exit_ctls_low; - highp = &vmx->nested.msrs.exit_ctls_high; + *low = &msrs->exit_ctls_low; + *high = &msrs->exit_ctls_high; break; case MSR_IA32_VMX_TRUE_ENTRY_CTLS: - lowp = &vmx->nested.msrs.entry_ctls_low; - highp = &vmx->nested.msrs.entry_ctls_high; + *low = &msrs->entry_ctls_low; + *high = &msrs->entry_ctls_high; break; case MSR_IA32_VMX_PROCBASED_CTLS2: - lowp = &vmx->nested.msrs.secondary_ctls_low; - highp = &vmx->nested.msrs.secondary_ctls_high; + *low = &msrs->secondary_ctls_low; + *high = &msrs->secondary_ctls_high; break; default: BUG(); } +} + +static int +vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +{ + u32 *lowp, *highp; + u64 supported; + + vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp); supported = vmx_control_msr(*lowp, *highp); @@ -1287,6 +1294,7 @@ vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) return -EINVAL; + vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp); *lowp = data; *highp = data >> 32; return 0; @@ -1300,10 +1308,8 @@ static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | /* reserved */ GENMASK_ULL(13, 9) | BIT_ULL(31); - u64 vmx_misc; - - vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, - vmx->nested.msrs.misc_high); + u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low, + vmcs_config.nested.misc_high); if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) return -EINVAL; @@ -1331,10 +1337,8 @@ static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) { - u64 vmx_ept_vpid_cap; - - vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps, - vmx->nested.msrs.vpid_caps); + u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps, + vmcs_config.nested.vpid_caps); /* Every bit is either reserved or a feature bit. */ if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) @@ -1345,20 +1349,21 @@ static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) return 0; } -static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index) { - u64 *msr; - switch (msr_index) { case MSR_IA32_VMX_CR0_FIXED0: - msr = &vmx->nested.msrs.cr0_fixed0; - break; + return &msrs->cr0_fixed0; case MSR_IA32_VMX_CR4_FIXED0: - msr = &vmx->nested.msrs.cr4_fixed0; - break; + return &msrs->cr4_fixed0; default: BUG(); } +} + +static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +{ + const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index); /* * 1 bits (which indicates bits which "must-be-1" during VMX operation) @@ -1367,7 +1372,7 @@ static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) if (!is_bitwise_subset(data, *msr, -1ULL)) return -EINVAL; - *msr = data; + *vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data; return 0; } @@ -1428,7 +1433,7 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) vmx->nested.msrs.vmcs_enum = data; return 0; case MSR_IA32_VMX_VMFUNC: - if (data & ~vmx->nested.msrs.vmfunc_controls) + if (data & ~vmcs_config.nested.vmfunc_controls) return -EINVAL; vmx->nested.msrs.vmfunc_controls = data; return 0; @@ -2133,6 +2138,8 @@ static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) { + struct kvm *kvm = vmx->vcpu.kvm; + /* * If vmcs02 hasn't been initialized, set the constant vmcs02 state * according to L0's settings (vmcs12 is irrelevant here). Host @@ -2175,6 +2182,9 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) if (cpu_has_vmx_encls_vmexit()) vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA); + if (kvm_notify_vmexit_enabled(kvm)) + vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); + /* * Set the MSR load/store lists to match L0's settings. Only the * addresses are constant (for vmcs02), the counts can change based @@ -2514,11 +2524,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); } else { kvm_set_dr(vcpu, 7, vcpu->arch.dr7); - vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); + vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl); } if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) - vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); + vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs); vmx_set_rflags(vcpu, vmcs12->guest_rflags); /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the @@ -2547,7 +2557,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmx_get_l2_tsc_multiplier(vcpu)); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); - if (kvm_has_tsc_control) + if (kvm_caps.has_tsc_control) vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); nested_vmx_transition_tlb_flush(vcpu, vmcs12, true); @@ -2613,6 +2623,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && + intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, vmcs12->guest_ia32_perf_global_ctrl))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; @@ -3158,8 +3169,6 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_host_map *map; - struct page *page; - u64 hpa; if (!vcpu->arch.pdptrs_from_userspace && !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { @@ -3174,23 +3183,12 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { - /* - * Translate L1 physical address to host physical - * address for vmcs02. Keep the page pinned, so this - * physical address remains valid. We keep a reference - * to it so we can release it later. - */ - if (vmx->nested.apic_access_page) { /* shouldn't happen */ - kvm_release_page_clean(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = NULL; - } - page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr); - if (!is_error_page(page)) { - vmx->nested.apic_access_page = page; - hpa = page_to_phys(vmx->nested.apic_access_page); - vmcs_write64(APIC_ACCESS_ADDR, hpa); + map = &vmx->nested.apic_access_page_map; + + if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) { + vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn)); } else { - pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n", + pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n", __func__); vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = @@ -3373,11 +3371,13 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); - if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) - vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + if (!vmx->nested.nested_run_pending || + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) + vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); if (kvm_mpx_supported() && - !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) - vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); + (!vmx->nested.nested_run_pending || + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) + vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS); /* * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* @@ -4096,8 +4096,6 @@ static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); vmcs12->guest_pending_dbg_exceptions = vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); - if (kvm_mpx_supported()) - vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; } @@ -4336,7 +4334,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); vcpu->arch.pat = vmcs12->host_ia32_pat; } - if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) + if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && + intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, vmcs12->host_ia32_perf_global_ctrl)); @@ -4609,7 +4608,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); - if (kvm_has_tsc_control) + if (kvm_caps.has_tsc_control) vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); if (vmx->nested.l1_tpr_threshold != -1) @@ -4626,10 +4625,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, } /* Unpin physical memory we referred to in vmcs02 */ - if (vmx->nested.apic_access_page) { - kvm_release_page_clean(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = NULL; - } + kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); vmx->nested.pi_desc = NULL; @@ -4828,28 +4824,6 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, return 0; } -void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu, - bool vcpu_has_perf_global_ctrl) -{ - struct vcpu_vmx *vmx; - - if (!nested_vmx_allowed(vcpu)) - return; - - vmx = to_vmx(vcpu); - if (vcpu_has_perf_global_ctrl) { - vmx->nested.msrs.entry_ctls_high |= - VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; - vmx->nested.msrs.exit_ctls_high |= - VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; - } else { - vmx->nested.msrs.entry_ctls_high &= - ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; - vmx->nested.msrs.exit_ctls_high &= - ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; - } -} - static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, int *ret) { @@ -4952,7 +4926,7 @@ out_vmcs02: } /* Emulate the VMXON instruction. */ -static int handle_vmon(struct kvm_vcpu *vcpu) +static int handle_vmxon(struct kvm_vcpu *vcpu) { int ret; gpa_t vmptr; @@ -4962,20 +4936,25 @@ static int handle_vmon(struct kvm_vcpu *vcpu) | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; /* - * The Intel VMX Instruction Reference lists a bunch of bits that are - * prerequisite to running VMXON, most notably cr4.VMXE must be set to - * 1 (see vmx_is_valid_cr4() for when we allow the guest to set this). - * Otherwise, we should fail with #UD. But most faulting conditions - * have already been checked by hardware, prior to the VM-exit for - * VMXON. We do test guest cr4.VMXE because processor CR4 always has - * that bit set to 1 in non-root mode. + * Note, KVM cannot rely on hardware to perform the CR0/CR4 #UD checks + * that have higher priority than VM-Exit (see Intel SDM's pseudocode + * for VMXON), as KVM must load valid CR0/CR4 values into hardware while + * running the guest, i.e. KVM needs to check the _guest_ values. + * + * Rely on hardware for the other two pre-VM-Exit checks, !VM86 and + * !COMPATIBILITY modes. KVM may run the guest in VM86 to emulate Real + * Mode, but KVM will never take the guest out of those modes. */ - if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { + if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || + !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } - /* CPL=0 must be checked manually. */ + /* + * CPL=0 and all other checks that are lower priority than VM-Exit must + * be checked manually. + */ if (vmx_get_cpl(vcpu)) { kvm_inject_gp(vcpu, 0); return 1; @@ -5044,7 +5023,7 @@ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) } /* Emulate the VMXOFF instruction */ -static int handle_vmoff(struct kvm_vcpu *vcpu) +static int handle_vmxoff(struct kvm_vcpu *vcpu) { if (!nested_vmx_check_permission(vcpu)) return 1; @@ -6111,6 +6090,9 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); case EXIT_REASON_ENCLS: return nested_vmx_exit_handled_encls(vcpu, vmcs12); + case EXIT_REASON_NOTIFY: + /* Notify VM exit is not exposed to L1 */ + return false; default: return true; } @@ -6775,6 +6757,9 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); + if (vmx_umip_emulated()) + msrs->cr4_fixed1 |= X86_CR4_UMIP; + msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr(); } @@ -6818,8 +6803,8 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) exit_handlers[EXIT_REASON_VMREAD] = handle_vmread; exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume; exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite; - exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff; - exit_handlers[EXIT_REASON_VMON] = handle_vmon; + exit_handlers[EXIT_REASON_VMOFF] = handle_vmxoff; + exit_handlers[EXIT_REASON_VMON] = handle_vmxon; exit_handlers[EXIT_REASON_INVEPT] = handle_invept; exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index c92cea0b8ccc..88b00a7359e4 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -32,8 +32,6 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata); int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, u32 vmx_instruction_info, bool wr, int len, gva_t *ret); -void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu, - bool vcpu_has_perf_global_ctrl); void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu); bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, int size); @@ -281,7 +279,8 @@ static inline bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val) u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0; u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1; - return fixed_bits_valid(val, fixed0, fixed1); + return fixed_bits_valid(val, fixed0, fixed1) && + __kvm_is_valid_cr4(vcpu, val); } /* No difference in the restrictions on guest and host CR4 in VMX operation. */ diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 37e9eb32e3d9..862c1a4d971b 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -37,23 +37,35 @@ static int fixed_pmc_events[] = {1, 0, 7}; static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) { + struct kvm_pmc *pmc; + u8 old_fixed_ctr_ctrl = pmu->fixed_ctr_ctrl; int i; + pmu->fixed_ctr_ctrl = data; for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { u8 new_ctrl = fixed_ctrl_field(data, i); - u8 old_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, i); - struct kvm_pmc *pmc; - - pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i); + u8 old_ctrl = fixed_ctrl_field(old_fixed_ctr_ctrl, i); if (old_ctrl == new_ctrl) continue; + pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i); + __set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use); - reprogram_fixed_counter(pmc, new_ctrl, i); + reprogram_counter(pmc); } +} - pmu->fixed_ctr_ctrl = data; +static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) +{ + if (pmc_idx < INTEL_PMC_IDX_FIXED) { + return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx, + MSR_P6_EVNTSEL0); + } else { + u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED; + + return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0); + } } /* function is called when global control register has been updated. */ @@ -61,14 +73,18 @@ static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data) { int bit; u64 diff = pmu->global_ctrl ^ data; + struct kvm_pmc *pmc; pmu->global_ctrl = data; - for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) - reprogram_counter(pmu, bit); + for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) { + pmc = intel_pmc_idx_to_pmc(pmu, bit); + if (pmc) + reprogram_counter(pmc); + } } -static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc) +static bool intel_hw_event_available(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT; @@ -82,15 +98,12 @@ static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc) /* disable event that reported as not present by cpuid */ if ((i < 7) && !(pmu->available_event_types & (1 << i))) - return PERF_COUNT_HW_MAX + 1; + return false; break; } - if (i == ARRAY_SIZE(intel_arch_events)) - return PERF_COUNT_HW_MAX; - - return intel_arch_events[i].event_type; + return true; } /* check if a PMC is enabled by comparing it with globl_ctrl bits. */ @@ -98,19 +111,10 @@ static bool intel_pmc_is_enabled(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); - return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); -} - -static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) -{ - if (pmc_idx < INTEL_PMC_IDX_FIXED) - return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx, - MSR_P6_EVNTSEL0); - else { - u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED; + if (!intel_pmu_has_perf_global_ctrl(pmu)) + return true; - return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0); - } + return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); } static bool intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) @@ -167,16 +171,6 @@ static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr) return get_gp_pmc(pmu, msr, MSR_IA32_PMC0); } -bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu) -{ - /* - * As a first step, a guest could only enable LBR feature if its - * cpu model is the same as the host because the LBR registers - * would be pass-through to the guest and they're model specific. - */ - return boot_cpu_data.x86_model == guest_cpuid_model(vcpu); -} - bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu) { struct x86_pmu_lbr *lbr = vcpu_to_lbr_records(vcpu); @@ -205,6 +199,7 @@ static bool intel_pmu_is_valid_lbr_msr(struct kvm_vcpu *vcpu, u32 index) static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + u64 perf_capabilities; int ret; switch (msr) { @@ -212,7 +207,18 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) case MSR_CORE_PERF_GLOBAL_STATUS: case MSR_CORE_PERF_GLOBAL_CTRL: case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - ret = pmu->version > 1; + return intel_pmu_has_perf_global_ctrl(pmu); + break; + case MSR_IA32_PEBS_ENABLE: + ret = vcpu_get_perf_capabilities(vcpu) & PERF_CAP_PEBS_FORMAT; + break; + case MSR_IA32_DS_AREA: + ret = guest_cpuid_has(vcpu, X86_FEATURE_DS); + break; + case MSR_PEBS_DATA_CFG: + perf_capabilities = vcpu_get_perf_capabilities(vcpu); + ret = (perf_capabilities & PERF_CAP_PEBS_BASELINE) && + ((perf_capabilities & PERF_CAP_PEBS_FORMAT) > 3); break; default: ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || @@ -361,6 +367,15 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_CORE_PERF_GLOBAL_OVF_CTRL: msr_info->data = 0; return 0; + case MSR_IA32_PEBS_ENABLE: + msr_info->data = pmu->pebs_enable; + return 0; + case MSR_IA32_DS_AREA: + msr_info->data = pmu->ds_area; + return 0; + case MSR_PEBS_DATA_CFG: + msr_info->data = pmu->pebs_data_cfg; + return 0; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { @@ -395,7 +410,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_CORE_PERF_FIXED_CTR_CTRL: if (pmu->fixed_ctr_ctrl == data) return 0; - if (!(data & 0xfffffffffffff444ull)) { + if (!(data & pmu->fixed_ctr_ctrl_mask)) { reprogram_fixed_counters(pmu, data); return 0; } @@ -421,6 +436,29 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 0; } break; + case MSR_IA32_PEBS_ENABLE: + if (pmu->pebs_enable == data) + return 0; + if (!(data & pmu->pebs_enable_mask)) { + pmu->pebs_enable = data; + return 0; + } + break; + case MSR_IA32_DS_AREA: + if (msr_info->host_initiated && data && !guest_cpuid_has(vcpu, X86_FEATURE_DS)) + return 1; + if (is_noncanonical_address(data, vcpu)) + return 1; + pmu->ds_area = data; + return 0; + case MSR_PEBS_DATA_CFG: + if (pmu->pebs_data_cfg == data) + return 0; + if (!(data & pmu->pebs_data_cfg_mask)) { + pmu->pebs_data_cfg = data; + return 0; + } + break; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { @@ -445,7 +483,8 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) (pmu->raw_event_mask & HSW_IN_TX_CHECKPOINTED)) reserved_bits ^= HSW_IN_TX_CHECKPOINTED; if (!(data & reserved_bits)) { - reprogram_gp_counter(pmc, data); + pmc->eventsel = data; + reprogram_counter(pmc); return 0; } } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false)) @@ -474,11 +513,12 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); - - struct x86_pmu_capability x86_pmu; struct kvm_cpuid_entry2 *entry; union cpuid10_eax eax; union cpuid10_edx edx; + u64 perf_capabilities; + u64 counter_mask; + int i; pmu->nr_arch_gp_counters = 0; pmu->nr_arch_fixed_counters = 0; @@ -487,8 +527,13 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->version = 0; pmu->reserved_bits = 0xffffffff00200000ull; pmu->raw_event_mask = X86_RAW_EVENT_MASK; + pmu->global_ctrl_mask = ~0ull; + pmu->global_ovf_ctrl_mask = ~0ull; + pmu->fixed_ctr_ctrl_mask = ~0ull; + pmu->pebs_enable_mask = ~0ull; + pmu->pebs_data_cfg_mask = ~0ull; - entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); + entry = kvm_find_cpuid_entry(vcpu, 0xa); if (!entry || !vcpu->kvm->arch.enable_pmu) return; eax.full = entry->eax; @@ -498,13 +543,13 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) if (!pmu->version) return; - perf_get_x86_pmu_capability(&x86_pmu); - pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters, - x86_pmu.num_counters_gp); - eax.split.bit_width = min_t(int, eax.split.bit_width, x86_pmu.bit_width_gp); + kvm_pmu_cap.num_counters_gp); + eax.split.bit_width = min_t(int, eax.split.bit_width, + kvm_pmu_cap.bit_width_gp); pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1; - eax.split.mask_length = min_t(int, eax.split.mask_length, x86_pmu.events_mask_len); + eax.split.mask_length = min_t(int, eax.split.mask_length, + kvm_pmu_cap.events_mask_len); pmu->available_event_types = ~entry->ebx & ((1ull << eax.split.mask_length) - 1); @@ -514,17 +559,19 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->nr_arch_fixed_counters = min3(ARRAY_SIZE(fixed_pmc_events), (size_t) edx.split.num_counters_fixed, - (size_t) x86_pmu.num_counters_fixed); - edx.split.bit_width_fixed = min_t(int, - edx.split.bit_width_fixed, x86_pmu.bit_width_fixed); + (size_t)kvm_pmu_cap.num_counters_fixed); + edx.split.bit_width_fixed = min_t(int, edx.split.bit_width_fixed, + kvm_pmu_cap.bit_width_fixed); pmu->counter_bitmask[KVM_PMC_FIXED] = ((u64)1 << edx.split.bit_width_fixed) - 1; setup_fixed_pmc_eventsel(pmu); } - pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) | - (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); - pmu->global_ctrl_mask = ~pmu->global_ctrl; + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) + pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4)); + counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) | + (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED)); + pmu->global_ctrl_mask = counter_mask; pmu->global_ovf_ctrl_mask = pmu->global_ctrl_mask & ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF | MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD); @@ -532,7 +579,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->global_ovf_ctrl_mask &= ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI; - entry = kvm_find_cpuid_entry(vcpu, 7, 0); + entry = kvm_find_cpuid_entry_index(vcpu, 7, 0); if (entry && (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) { @@ -545,16 +592,29 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters); - nested_vmx_pmu_refresh(vcpu, - intel_is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)); - - if (intel_pmu_lbr_is_compatible(vcpu)) + if (cpuid_model_is_consistent(vcpu)) x86_perf_get_lbr(&lbr_desc->records); else lbr_desc->records.nr = 0; if (lbr_desc->records.nr) bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_IDX_FIXED_VLBR, 1); + + perf_capabilities = vcpu_get_perf_capabilities(vcpu); + if (perf_capabilities & PERF_CAP_PEBS_FORMAT) { + if (perf_capabilities & PERF_CAP_PEBS_BASELINE) { + pmu->pebs_enable_mask = counter_mask; + pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE; + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { + pmu->fixed_ctr_ctrl_mask &= + ~(1ULL << (INTEL_PMC_IDX_FIXED + i * 4)); + } + pmu->pebs_data_cfg_mask = ~0xff00000full; + } else { + pmu->pebs_enable_mask = + ~((1ull << pmu->nr_arch_gp_counters) - 1); + } + } } static void intel_pmu_init(struct kvm_vcpu *vcpu) @@ -719,8 +779,28 @@ static void intel_pmu_cleanup(struct kvm_vcpu *vcpu) intel_pmu_release_guest_lbr_event(vcpu); } +void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu) +{ + struct kvm_pmc *pmc = NULL; + int bit; + + for_each_set_bit(bit, (unsigned long *)&pmu->global_ctrl, + X86_PMC_IDX_MAX) { + pmc = intel_pmc_idx_to_pmc(pmu, bit); + + if (!pmc || !pmc_speculative_in_use(pmc) || + !intel_pmc_is_enabled(pmc)) + continue; + + if (pmc->perf_event && pmc->idx != pmc->perf_event->hw.idx) { + pmu->host_cross_mapped_mask |= + BIT_ULL(pmc->perf_event->hw.idx); + } + } +} + struct kvm_pmu_ops intel_pmu_ops __initdata = { - .pmc_perf_hw_id = intel_pmc_perf_hw_id, + .hw_event_available = intel_hw_event_available, .pmc_is_enabled = intel_pmc_is_enabled, .pmc_idx_to_pmc = intel_pmc_idx_to_pmc, .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc, diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 07e5fcf5a5aa..1b56c5e5c9fb 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -34,7 +34,7 @@ static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) return &(to_vmx(vcpu)->pi_desc); } -static int pi_try_set_control(struct pi_desc *pi_desc, u64 old, u64 new) +static int pi_try_set_control(struct pi_desc *pi_desc, u64 *pold, u64 new) { /* * PID.ON can be set at any time by a different vCPU or by hardware, @@ -42,7 +42,7 @@ static int pi_try_set_control(struct pi_desc *pi_desc, u64 old, u64 new) * update must be retried with a fresh snapshot an ON change causes * the cmpxchg to fail. */ - if (cmpxchg64(&pi_desc->control, old, new) != old) + if (!try_cmpxchg64(&pi_desc->control, pold, new)) return -EBUSY; return 0; @@ -96,8 +96,9 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) if (!x2apic_mode) dest = (dest << 8) & 0xFF00; + old.control = READ_ONCE(pi_desc->control); do { - old.control = new.control = READ_ONCE(pi_desc->control); + new.control = old.control; /* * Clear SN (as above) and refresh the destination APIC ID to @@ -111,7 +112,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) * descriptor was modified on "put" to use the wakeup vector. */ new.nv = POSTED_INTR_VECTOR; - } while (pi_try_set_control(pi_desc, old.control, new.control)); + } while (pi_try_set_control(pi_desc, &old.control, new.control)); local_irq_restore(flags); @@ -156,12 +157,12 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) WARN(pi_desc->sn, "PI descriptor SN field set before blocking"); + old.control = READ_ONCE(pi_desc->control); do { - old.control = new.control = READ_ONCE(pi_desc->control); - /* set 'NV' to 'wakeup vector' */ + new.control = old.control; new.nv = POSTED_INTR_WAKEUP_VECTOR; - } while (pi_try_set_control(pi_desc, old.control, new.control)); + } while (pi_try_set_control(pi_desc, &old.control, new.control)); /* * Send a wakeup IPI to this CPU if an interrupt may have been posted @@ -177,11 +178,24 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) local_irq_restore(flags); } +static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu) +{ + /* + * The default posted interrupt vector does nothing when + * invoked outside guest mode. Return whether a blocked vCPU + * can be the target of posted interrupts, as is the case when + * using either IPI virtualization or VT-d PI, so that the + * notification vector is switched to the one that calls + * back to the pi_wakeup_handler() function. + */ + return vmx_can_use_ipiv(vcpu) || vmx_can_use_vtd_pi(vcpu->kvm); +} + void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - if (!vmx_can_use_vtd_pi(vcpu->kvm)) + if (!vmx_needs_pi_wakeup(vcpu)) return; if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu)) diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index 9a45d5c9f116..26992076552e 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -5,6 +5,8 @@ #define POSTED_INTR_ON 0 #define POSTED_INTR_SN 1 +#define PID_TABLE_ENTRY_VALID 1 + /* Posted-Interrupt Descriptor */ struct pi_desc { u32 pir[8]; /* Posted interrupt requested */ diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index 35e7ec91ae86..aba8cebdc587 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -79,7 +79,7 @@ static int sgx_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t gva, bool write, else *gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, &ex); - if (*gpa == UNMAPPED_GVA) { + if (*gpa == INVALID_GPA) { kvm_inject_emulated_page_fault(vcpu, &ex); return -EFAULT; } @@ -148,8 +148,8 @@ static int __handle_encls_ecreate(struct kvm_vcpu *vcpu, u8 max_size_log2; int trapnr, ret; - sgx_12_0 = kvm_find_cpuid_entry(vcpu, 0x12, 0); - sgx_12_1 = kvm_find_cpuid_entry(vcpu, 0x12, 1); + sgx_12_0 = kvm_find_cpuid_entry_index(vcpu, 0x12, 0); + sgx_12_1 = kvm_find_cpuid_entry_index(vcpu, 0x12, 1); if (!sgx_12_0 || !sgx_12_1) { kvm_prepare_emulation_failure_exit(vcpu); return 0; @@ -431,7 +431,7 @@ static bool sgx_intercept_encls_ecreate(struct kvm_vcpu *vcpu) if (!vcpu->kvm->arch.sgx_provisioning_allowed) return true; - guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 0); + guest_cpuid = kvm_find_cpuid_entry_index(vcpu, 0x12, 0); if (!guest_cpuid) return true; @@ -439,7 +439,7 @@ static bool sgx_intercept_encls_ecreate(struct kvm_vcpu *vcpu) if (guest_cpuid->ebx != ebx || guest_cpuid->edx != edx) return true; - guest_cpuid = kvm_find_cpuid_entry(vcpu, 0x12, 1); + guest_cpuid = kvm_find_cpuid_entry_index(vcpu, 0x12, 1); if (!guest_cpuid) return true; diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 2b9d7a7e83f7..ac290a44a693 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -50,6 +50,7 @@ struct vmcs_controls_shadow { u32 pin; u32 exec; u32 secondary_exec; + u64 tertiary_exec; }; /* diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index be7c19374fdd..d7f8331d6f7e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -105,6 +105,9 @@ module_param(fasteoi, bool, S_IRUGO); module_param(enable_apicv, bool, S_IRUGO); +bool __read_mostly enable_ipiv = true; +module_param(enable_ipiv, bool, 0444); + /* * If nested=1, nested virtualization is supported, i.e., guests may use * VMX and be a hypervisor for its own guests. If nested=0, guests may not @@ -116,6 +119,9 @@ module_param(nested, bool, S_IRUGO); bool __read_mostly enable_pml = 1; module_param_named(pml, enable_pml, bool, S_IRUGO); +static bool __read_mostly error_on_inconsistent_vmcs_config = true; +module_param(error_on_inconsistent_vmcs_config, bool, 0444); + static bool __read_mostly dump_invalid_vmcs = 0; module_param(dump_invalid_vmcs, bool, 0644); @@ -443,18 +449,20 @@ asmlinkage void vmread_error(unsigned long field, bool fault) noinline void vmwrite_error(unsigned long field, unsigned long value) { - vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%d\n", + vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%u\n", field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr) { - vmx_insn_failed("kvm: vmclear failed: %p/%llx\n", vmcs, phys_addr); + vmx_insn_failed("kvm: vmclear failed: %p/%llx err=%u\n", + vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr) { - vmx_insn_failed("kvm: vmptrld failed: %p/%llx\n", vmcs, phys_addr); + vmx_insn_failed("kvm: vmptrld failed: %p/%llx err=%u\n", + vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); } noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva) @@ -1787,7 +1795,7 @@ u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) return vmcs12->tsc_multiplier; - return kvm_default_tsc_scaling_ratio; + return kvm_caps.default_tsc_scaling_ratio; } static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) @@ -2111,6 +2119,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (is_noncanonical_address(data & PAGE_MASK, vcpu) || (data & MSR_IA32_BNDCFGS_RSVD)) return 1; + + if (is_guest_mode(vcpu) && + ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) || + (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS))) + get_vmcs12(vcpu)->guest_bndcfgs = data; + vmcs_write64(GUEST_BNDCFGS, data); break; case MSR_IA32_UMWAIT_CONTROL: @@ -2312,7 +2326,18 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if ((data & PMU_CAP_LBR_FMT) != (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT)) return 1; - if (!intel_pmu_lbr_is_compatible(vcpu)) + if (!cpuid_model_is_consistent(vcpu)) + return 1; + } + if (data & PERF_CAP_PEBS_FORMAT) { + if ((data & PERF_CAP_PEBS_MASK) != + (vmx_get_perf_capabilities() & PERF_CAP_PEBS_MASK)) + return 1; + if (!guest_cpuid_has(vcpu, X86_FEATURE_DS)) + return 1; + if (!guest_cpuid_has(vcpu, X86_FEATURE_DTES64)) + return 1; + if (!cpuid_model_is_consistent(vcpu)) return 1; } ret = kvm_set_msr_common(vcpu, msr_info); @@ -2489,6 +2514,15 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, return 0; } +static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) +{ + u64 allowed; + + rdmsrl(msr, allowed); + + return ctl_opt & allowed; +} + static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, struct vmx_capability *vmx_cap) { @@ -2497,8 +2531,26 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, u32 _pin_based_exec_control = 0; u32 _cpu_based_exec_control = 0; u32 _cpu_based_2nd_exec_control = 0; + u64 _cpu_based_3rd_exec_control = 0; u32 _vmexit_control = 0; u32 _vmentry_control = 0; + int i; + + /* + * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory. + * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always + * intercepts writes to PAT and EFER, i.e. never enables those controls. + */ + struct { + u32 entry_control; + u32 exit_control; + } const vmcs_entry_exit_pairs[] = { + { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL }, + { VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT }, + { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER }, + { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS }, + { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL }, + }; memset(vmcs_conf, 0, sizeof(*vmcs_conf)); min = CPU_BASED_HLT_EXITING | @@ -2518,7 +2570,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, opt = CPU_BASED_TPR_SHADOW | CPU_BASED_USE_MSR_BITMAPS | - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS | + CPU_BASED_ACTIVATE_TERTIARY_CONTROLS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, &_cpu_based_exec_control) < 0) return -EIO; @@ -2551,7 +2604,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX | SECONDARY_EXEC_ENABLE_VMFUNC | - SECONDARY_EXEC_BUS_LOCK_DETECTION; + SECONDARY_EXEC_BUS_LOCK_DETECTION | + SECONDARY_EXEC_NOTIFY_VM_EXITING; if (cpu_has_sgx()) opt2 |= SECONDARY_EXEC_ENCLS_EXITING; if (adjust_vmx_controls(min2, opt2, @@ -2581,15 +2635,30 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, CPU_BASED_CR3_STORE_EXITING | CPU_BASED_INVLPG_EXITING); } else if (vmx_cap->ept) { - vmx_cap->ept = 0; pr_warn_once("EPT CAP should not exist if not support " "1-setting enable EPT VM-execution control\n"); + + if (error_on_inconsistent_vmcs_config) + return -EIO; + + vmx_cap->ept = 0; } if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && - vmx_cap->vpid) { - vmx_cap->vpid = 0; + vmx_cap->vpid) { pr_warn_once("VPID CAP should not exist if not support " "1-setting enable VPID VM-execution control\n"); + + if (error_on_inconsistent_vmcs_config) + return -EIO; + + vmx_cap->vpid = 0; + } + + if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) { + u64 opt3 = TERTIARY_EXEC_IPI_VIRT; + + _cpu_based_3rd_exec_control = adjust_vmx_controls64(opt3, + MSR_IA32_VMX_PROCBASED_CTLS3); } min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT; @@ -2630,6 +2699,23 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, &_vmentry_control) < 0) return -EIO; + for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) { + u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control; + u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control; + + if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl)) + continue; + + pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n", + _vmentry_control & n_ctrl, _vmexit_control & x_ctrl); + + if (error_on_inconsistent_vmcs_config) + return -EIO; + + _vmentry_control &= ~n_ctrl; + _vmexit_control &= ~x_ctrl; + } + /* * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they * can't be used due to an errata where VM Exit may incorrectly clear @@ -2678,6 +2764,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; + vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control; vmcs_conf->vmexit_ctrl = _vmexit_control; vmcs_conf->vmentry_ctrl = _vmentry_control; @@ -3230,8 +3317,8 @@ static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { /* * We operate under the default treatment of SMM, so VMX cannot be - * enabled under SMM. Note, whether or not VMXE is allowed at all is - * handled by kvm_is_valid_cr4(). + * enabled under SMM. Note, whether or not VMXE is allowed at all, + * i.e. is a reserved bit, is handled by common x86 code. */ if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu)) return false; @@ -3702,7 +3789,7 @@ static int init_rmode_identity_map(struct kvm *kvm) } /* Set up identity-mapping pagetable for EPT in real mode */ - for (i = 0; i < PT32_ENT_PER_PAGE; i++) { + for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) { tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) { @@ -3932,6 +4019,8 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW); vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); + if (enable_ipiv) + vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW); } } @@ -3977,20 +4066,26 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) u32 i; /* - * Set intercept permissions for all potentially passed through MSRs - * again. They will automatically get filtered through the MSR filter, - * so we are back in sync after this. + * Redo intercept permissions for MSRs that KVM is passing through to + * the guest. Disabling interception will check the new MSR filter and + * ensure that KVM enables interception if usersepace wants to filter + * the MSR. MSRs that KVM is already intercepting don't need to be + * refreshed since KVM is going to intercept them regardless of what + * userspace wants. */ for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) { u32 msr = vmx_possible_passthrough_msrs[i]; - bool read = test_bit(i, vmx->shadow_msr_intercept.read); - bool write = test_bit(i, vmx->shadow_msr_intercept.write); - vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_R, read); - vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_W, write); + if (!test_bit(i, vmx->shadow_msr_intercept.read)) + vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R); + + if (!test_bit(i, vmx->shadow_msr_intercept.write)) + vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W); } - pt_update_intercept_for_msr(vcpu); + /* PT MSRs can be passed through iff PT is exposed to the guest. */ + if (vmx_pt_mode_is_host_guest()) + pt_update_intercept_for_msr(vcpu); } static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, @@ -4085,7 +4180,8 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) if (!r) return 0; - if (!vcpu->arch.apicv_active) + /* Note, this is called iff the local APIC is in-kernel. */ + if (!vcpu->arch.apic->apicv_active) return -1; if (pi_test_and_set_pir(vector, &vmx->pi_desc)) @@ -4259,15 +4355,19 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) } pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); - if (cpu_has_secondary_exec_ctrls()) { - if (kvm_vcpu_apicv_active(vcpu)) - secondary_exec_controls_setbit(vmx, - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); - else - secondary_exec_controls_clearbit(vmx, - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + + if (kvm_vcpu_apicv_active(vcpu)) { + secondary_exec_controls_setbit(vmx, + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + if (enable_ipiv) + tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT); + } else { + secondary_exec_controls_clearbit(vmx, + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + if (enable_ipiv) + tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT); } vmx_update_msr_bitmap_x2apic(vcpu); @@ -4299,6 +4399,20 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) return exec_control; } +static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx) +{ + u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl; + + /* + * IPI virtualization relies on APICv. Disable IPI virtualization if + * APICv is inhibited. + */ + if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu)) + exec_control &= ~TERTIARY_EXEC_IPI_VIRT; + + return exec_control; +} + /* * Adjust a single secondary execution control bit to intercept/allow an * instruction in the guest. This is usually done based on whether or not a @@ -4441,13 +4555,48 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) if (!vcpu->kvm->arch.bus_lock_detection_enabled) exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION; + if (!kvm_notify_vmexit_enabled(vcpu->kvm)) + exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING; + return exec_control; } +static inline int vmx_get_pid_table_order(struct kvm *kvm) +{ + return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table)); +} + +static int vmx_alloc_ipiv_pid_table(struct kvm *kvm) +{ + struct page *pages; + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); + + if (!irqchip_in_kernel(kvm) || !enable_ipiv) + return 0; + + if (kvm_vmx->pid_table) + return 0; + + pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, vmx_get_pid_table_order(kvm)); + if (!pages) + return -ENOMEM; + + kvm_vmx->pid_table = (void *)page_address(pages); + return 0; +} + +static int vmx_vcpu_precreate(struct kvm *kvm) +{ + return vmx_alloc_ipiv_pid_table(kvm); +} + #define VMX_XSS_EXIT_BITMAP 0 static void init_vmcs(struct vcpu_vmx *vmx) { + struct kvm *kvm = vmx->vcpu.kvm; + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); + if (nested) nested_vmx_set_vmcs_shadowing_bitmap(); @@ -4464,6 +4613,9 @@ static void init_vmcs(struct vcpu_vmx *vmx) if (cpu_has_secondary_exec_ctrls()) secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx)); + if (cpu_has_tertiary_exec_ctrls()) + tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx)); + if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); vmcs_write64(EOI_EXIT_BITMAP1, 0); @@ -4476,12 +4628,20 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); } - if (!kvm_pause_in_guest(vmx->vcpu.kvm)) { + if (vmx_can_use_ipiv(&vmx->vcpu)) { + vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table)); + vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1); + } + + if (!kvm_pause_in_guest(kvm)) { vmcs_write32(PLE_GAP, ple_gap); vmx->ple_window = ple_window; vmx->ple_window_dirty = true; } + if (kvm_notify_vmexit_enabled(kvm)) + vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ @@ -4652,13 +4812,13 @@ static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu) exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); } -static void vmx_inject_irq(struct kvm_vcpu *vcpu) +static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) { struct vcpu_vmx *vmx = to_vmx(vcpu); uint32_t intr; int irq = vcpu->arch.interrupt.nr; - trace_kvm_inj_virq(irq); + trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected); ++vcpu->stat.irq_injections; if (vmx->rmode.vm86_active) { @@ -5770,6 +5930,32 @@ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) return 1; } +static int handle_notify(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qual = vmx_get_exit_qual(vcpu); + bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID; + + ++vcpu->stat.notify_window_exits; + + /* + * Notify VM exit happened while executing iret from NMI, + * "blocked by NMI" bit has to be set before next VM entry. + */ + if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI)) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + + if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER || + context_invalid) { + vcpu->run->exit_reason = KVM_EXIT_NOTIFY; + vcpu->run->notify.flags = context_invalid ? + KVM_NOTIFY_CONTEXT_INVALID : 0; + return 0; + } + + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -5827,6 +6013,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, [EXIT_REASON_ENCLS] = handle_encls, [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit, + [EXIT_REASON_NOTIFY] = handle_notify, }; static const int kvm_vmx_max_exit_handlers = @@ -5924,6 +6111,7 @@ void dump_vmcs(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); u32 vmentry_ctl, vmexit_ctl; u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control; + u64 tertiary_exec_control; unsigned long cr4; int efer_slot; @@ -5937,9 +6125,16 @@ void dump_vmcs(struct kvm_vcpu *vcpu) cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); cr4 = vmcs_readl(GUEST_CR4); - secondary_exec_control = 0; + if (cpu_has_secondary_exec_ctrls()) secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + else + secondary_exec_control = 0; + + if (cpu_has_tertiary_exec_ctrls()) + tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL); + else + tertiary_exec_control = 0; pr_err("VMCS %p, last attempted VM-entry on CPU %d\n", vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu); @@ -6039,9 +6234,10 @@ void dump_vmcs(struct kvm_vcpu *vcpu) vmx_dump_msrs("host autoload", &vmx->msr_autoload.host); pr_err("*** Control State ***\n"); - pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n", - pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control); - pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl); + pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n", + cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control); + pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n", + pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl); pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", vmcs_read32(EXCEPTION_BITMAP), vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), @@ -6191,7 +6387,8 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) exit_reason.basic != EXIT_REASON_EPT_VIOLATION && exit_reason.basic != EXIT_REASON_PML_FULL && exit_reason.basic != EXIT_REASON_APIC_ACCESS && - exit_reason.basic != EXIT_REASON_TASK_SWITCH)) { + exit_reason.basic != EXIT_REASON_TASK_SWITCH && + exit_reason.basic != EXIT_REASON_NOTIFY)) { int ndata = 3; vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; @@ -6453,7 +6650,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) put_page(page); } -static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) +static void vmx_hwapic_isr_update(int max_isr) { u16 status; u8 old; @@ -6783,9 +6980,14 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) { int i, nr_msrs; struct perf_guest_switch_msr *msrs; + struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu); + + pmu->host_cross_mapped_mask = 0; + if (pmu->pebs_enable & pmu->global_ctrl) + intel_pmu_cross_mapped_check(pmu); /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */ - msrs = perf_guest_get_msrs(&nr_msrs); + msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu); if (!msrs) return; @@ -7166,6 +7368,10 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu) goto free_vmcs; } + if (vmx_can_use_ipiv(vcpu)) + WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id], + __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID); + return 0; free_vmcs: @@ -7234,7 +7440,7 @@ static int __init vmx_check_processor_compat(void) return 0; } -static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) +static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { u8 cache; @@ -7310,7 +7516,7 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \ } while (0) - entry = kvm_find_cpuid_entry(vcpu, 0x1, 0); + entry = kvm_find_cpuid_entry(vcpu, 0x1); cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME)); cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME)); cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC)); @@ -7326,7 +7532,7 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID)); cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE)); - entry = kvm_find_cpuid_entry(vcpu, 0x7, 0); + entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0); cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE)); cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP)); cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP)); @@ -7337,23 +7543,6 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) #undef cr4_fixed1_update } -static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (kvm_mpx_supported()) { - bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX); - - if (mpx_enabled) { - vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; - vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; - } else { - vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS; - vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS; - } - } -} - static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -7361,7 +7550,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) int i; for (i = 0; i < PT_CPUID_LEAVES; i++) { - best = kvm_find_cpuid_entry(vcpu, 0x14, i); + best = kvm_find_cpuid_entry_index(vcpu, 0x14, i); if (!best) return; vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax; @@ -7445,10 +7634,8 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX); - if (nested_vmx_allowed(vcpu)) { + if (nested_vmx_allowed(vcpu)) nested_vmx_cr_fixed1_bits_update(vcpu); - nested_vmx_entry_exit_ctls_update(vcpu); - } if (boot_cpu_has(X86_FEATURE_INTEL_PT) && guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT)) @@ -7502,6 +7689,13 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_clear(X86_FEATURE_INVPCID); if (vmx_pt_mode_is_host_guest()) kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); + if (vmx_pebs_supported()) { + kvm_cpu_cap_check_and_set(X86_FEATURE_DS); + kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64); + } + + if (!enable_pmu) + kvm_cpu_cap_clear(X86_FEATURE_PDCM); if (!enable_sgx) { kvm_cpu_cap_clear(X86_FEATURE_SGX); @@ -7514,7 +7708,7 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_UMIP); /* CPUID 0xD.1 */ - supported_xss = 0; + kvm_caps.supported_xss = 0; if (!cpu_has_vmx_xsaves()) kvm_cpu_cap_clear(X86_FEATURE_XSAVES); @@ -7655,9 +7849,9 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, delta_tsc = 0; /* Convert to host delta tsc if tsc scaling is enabled */ - if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && + if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio && delta_tsc && u64_shl_div_u64(delta_tsc, - kvm_tsc_scaling_ratio_frac_bits, + kvm_caps.tsc_scaling_ratio_frac_bits, vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc)) return -ERANGE; @@ -7729,6 +7923,13 @@ static int vmx_enter_smm(struct kvm_vcpu *vcpu, char *smstate) { struct vcpu_vmx *vmx = to_vmx(vcpu); + /* + * TODO: Implement custom flows for forcing the vCPU out/in of L2 on + * SMI and RSM. Using the common VM-Exit + VM-Enter routines is wrong + * SMI and RSM only modify state that is saved and restored via SMRAM. + * E.g. most MSRs are left untouched, but many are modified by VM-Exit + * and VM-Enter, and thus L2's values may be corrupted on SMI+RSM. + */ vmx->nested.smm.guest_mode = is_guest_mode(vcpu); if (vmx->nested.smm.guest_mode) nested_vmx_vmexit(vcpu, -1, 0, 0); @@ -7802,6 +8003,13 @@ static bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason) return supported & BIT(reason); } +static void vmx_vm_destroy(struct kvm *kvm) +{ + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); + + free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm)); +} + static struct kvm_x86_ops vmx_x86_ops __initdata = { .name = "kvm_intel", @@ -7813,7 +8021,9 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .vm_size = sizeof(struct kvm_vmx), .vm_init = vmx_vm_init, + .vm_destroy = vmx_vm_destroy, + .vcpu_precreate = vmx_vcpu_precreate, .vcpu_create = vmx_vcpu_create, .vcpu_free = vmx_vcpu_free, .vcpu_reset = vmx_vcpu_reset, @@ -8027,8 +8237,8 @@ static __init int hardware_setup(void) } if (!cpu_has_vmx_mpx()) - supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | - XFEATURE_MASK_BNDCSR); + kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | + XFEATURE_MASK_BNDCSR); if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) @@ -8091,12 +8301,16 @@ static __init int hardware_setup(void) if (!enable_apicv) vmx_x86_ops.sync_pir_to_irr = NULL; + if (!enable_apicv || !cpu_has_vmx_ipiv()) + enable_ipiv = false; + if (cpu_has_vmx_tsc_scaling()) - kvm_has_tsc_control = true; + kvm_caps.has_tsc_control = true; - kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; - kvm_tsc_scaling_ratio_frac_bits = 48; - kvm_has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); + kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; + kvm_caps.tsc_scaling_ratio_frac_bits = 48; + kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); + kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit(); set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ @@ -8153,11 +8367,12 @@ static __init int hardware_setup(void) vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit; } - kvm_mce_cap_supported |= MCG_LMCE_P; + kvm_caps.supported_mce_cap |= MCG_LMCE_P; + kvm_caps.supported_mce_cap |= MCG_CMCI_P; if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST) return -EINVAL; - if (!enable_ept || !cpu_has_vmx_intel_pt()) + if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt()) pt_mode = PT_MODE_SYSTEM; if (pt_mode == PT_MODE_HOST_GUEST) vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 1e7f9453894b..fb8e3480a9d7 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -92,10 +92,22 @@ union vmx_exit_reason { u32 full; }; +static inline bool intel_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu) +{ + /* + * Architecturally, Intel's SDM states that IA32_PERF_GLOBAL_CTRL is + * supported if "CPUID.0AH: EAX[7:0] > 0", i.e. if the PMU version is + * greater than zero. However, KVM only exposes and emulates the MSR + * to/for the guest if the guest PMU supports at least "Architectural + * Performance Monitoring Version 2". + */ + return pmu->version > 1; +} + #define vcpu_to_lbr_desc(vcpu) (&to_vmx(vcpu)->lbr_desc) #define vcpu_to_lbr_records(vcpu) (&to_vmx(vcpu)->lbr_desc.records) -bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu); +void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu); bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu); int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu); @@ -205,7 +217,7 @@ struct nested_vmx { * Guest pages referred to in the vmcs02 with host-physical * pointers, so we must keep them pinned while L2 runs. */ - struct page *apic_access_page; + struct kvm_host_map apic_access_page_map; struct kvm_host_map virtual_apic_map; struct kvm_host_map pi_desc_map; @@ -220,9 +232,18 @@ struct nested_vmx { bool has_preemption_timer_deadline; bool preemption_timer_expired; - /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ - u64 vmcs01_debugctl; - u64 vmcs01_guest_bndcfgs; + /* + * Used to snapshot MSRs that are conditionally loaded on VM-Enter in + * order to propagate the guest's pre-VM-Enter value into vmcs02. For + * emulation of VMLAUNCH/VMRESUME, the snapshot will be of L1's value. + * For KVM_SET_NESTED_STATE, the snapshot is of L2's value, _if_ + * userspace restores MSRs before nested state. If userspace restores + * MSRs after nested state, the snapshot holds garbage, but KVM can't + * detect that, and the garbage value in vmcs02 will be overwritten by + * MSR restoration in any case. + */ + u64 pre_vmenter_debugctl; + u64 pre_vmenter_bndcfgs; /* to migrate it to L1 if L2 writes to L1's CR8 directly */ int l1_tpr_threshold; @@ -369,6 +390,8 @@ struct kvm_vmx { unsigned int tss_addr; bool ept_identity_pagetable_done; gpa_t ept_identity_map_addr; + /* Posted Interrupt Descriptor (PID) table for IPI virtualization */ + u64 *pid_table; }; bool nested_vmx_allowed(struct kvm_vcpu *vcpu); @@ -462,35 +485,36 @@ static inline u8 vmx_get_rvi(void) return vmcs_read16(GUEST_INTR_STATUS) & 0xff; } -#define BUILD_CONTROLS_SHADOW(lname, uname) \ -static inline void lname##_controls_set(struct vcpu_vmx *vmx, u32 val) \ -{ \ - if (vmx->loaded_vmcs->controls_shadow.lname != val) { \ - vmcs_write32(uname, val); \ - vmx->loaded_vmcs->controls_shadow.lname = val; \ - } \ -} \ -static inline u32 __##lname##_controls_get(struct loaded_vmcs *vmcs) \ -{ \ - return vmcs->controls_shadow.lname; \ -} \ -static inline u32 lname##_controls_get(struct vcpu_vmx *vmx) \ -{ \ - return __##lname##_controls_get(vmx->loaded_vmcs); \ -} \ -static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u32 val) \ -{ \ - lname##_controls_set(vmx, lname##_controls_get(vmx) | val); \ -} \ -static inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u32 val) \ -{ \ - lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val); \ +#define BUILD_CONTROLS_SHADOW(lname, uname, bits) \ +static inline void lname##_controls_set(struct vcpu_vmx *vmx, u##bits val) \ +{ \ + if (vmx->loaded_vmcs->controls_shadow.lname != val) { \ + vmcs_write##bits(uname, val); \ + vmx->loaded_vmcs->controls_shadow.lname = val; \ + } \ +} \ +static inline u##bits __##lname##_controls_get(struct loaded_vmcs *vmcs) \ +{ \ + return vmcs->controls_shadow.lname; \ +} \ +static inline u##bits lname##_controls_get(struct vcpu_vmx *vmx) \ +{ \ + return __##lname##_controls_get(vmx->loaded_vmcs); \ +} \ +static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u##bits val) \ +{ \ + lname##_controls_set(vmx, lname##_controls_get(vmx) | val); \ +} \ +static inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u##bits val) \ +{ \ + lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val); \ } -BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS) -BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS) -BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL) -BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL) -BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL) +BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS, 32) +BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS, 32) +BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL, 32) +BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL, 32) +BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL, 32) +BUILD_CONTROLS_SHADOW(tertiary_exec, TERTIARY_VM_EXEC_CONTROL, 64) /* * VMX_REGS_LAZY_LOAD_SET - The set of registers that will be updated in the @@ -586,4 +610,9 @@ static inline int vmx_get_instr_info_reg2(u32 vmx_instr_info) return (vmx_instr_info >> 28) & 0xf; } +static inline bool vmx_can_use_ipiv(struct kvm_vcpu *vcpu) +{ + return lapic_in_kernel(vcpu) && enable_ipiv; +} + #endif /* __KVM_X86_VMX_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e5fa335a4ea7..79a8a74b6b2a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -41,7 +41,6 @@ #include <linux/mman.h> #include <linux/highmem.h> #include <linux/iommu.h> -#include <linux/intel-iommu.h> #include <linux/cpufreq.h> #include <linux/user-return-notifier.h> #include <linux/srcu.h> @@ -87,8 +86,11 @@ #define MAX_IO_MSRS 256 #define KVM_MAX_MCE_BANKS 32 -u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P; -EXPORT_SYMBOL_GPL(kvm_mce_cap_supported); + +struct kvm_caps kvm_caps __read_mostly = { + .supported_mce_cap = MCG_CTL_P | MCG_SER_P, +}; +EXPORT_SYMBOL_GPL(kvm_caps); #define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e)) @@ -151,19 +153,6 @@ module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); static bool __read_mostly kvmclock_periodic_sync = true; module_param(kvmclock_periodic_sync, bool, S_IRUGO); -bool __read_mostly kvm_has_tsc_control; -EXPORT_SYMBOL_GPL(kvm_has_tsc_control); -u32 __read_mostly kvm_max_guest_tsc_khz; -EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); -u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits; -EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits); -u64 __read_mostly kvm_max_tsc_scaling_ratio; -EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio); -u64 __read_mostly kvm_default_tsc_scaling_ratio; -EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio); -bool __read_mostly kvm_has_bus_lock_exit; -EXPORT_SYMBOL_GPL(kvm_has_bus_lock_exit); - /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ static u32 __read_mostly tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); @@ -235,8 +224,6 @@ EXPORT_SYMBOL_GPL(enable_apicv); u64 __read_mostly host_xss; EXPORT_SYMBOL_GPL(host_xss); -u64 __read_mostly supported_xss; -EXPORT_SYMBOL_GPL(supported_xss); const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS(), @@ -298,7 +285,8 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, directed_yield_successful), STATS_DESC_COUNTER(VCPU, preemption_reported), STATS_DESC_COUNTER(VCPU, preemption_other), - STATS_DESC_IBOOLEAN(VCPU, guest_mode) + STATS_DESC_IBOOLEAN(VCPU, guest_mode), + STATS_DESC_COUNTER(VCPU, notify_window_exits), }; const struct kvm_stats_header kvm_vcpu_stats_header = { @@ -311,8 +299,6 @@ const struct kvm_stats_header kvm_vcpu_stats_header = { }; u64 __read_mostly host_xcr0; -u64 __read_mostly supported_xcr0; -EXPORT_SYMBOL_GPL(supported_xcr0); static struct kmem_cache *x86_emulator_cache; @@ -862,7 +848,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) */ real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(pdpt_gfn), PFERR_USER_MASK | PFERR_WRITE_MASK, NULL); - if (real_gpa == UNMAPPED_GVA) + if (real_gpa == INVALID_GPA) return 0; /* Note the offset, PDPTRs are 32 byte aligned when using PAE paging. */ @@ -1094,7 +1080,7 @@ int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv); -bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { if (cr4 & cr4_reserved_bits) return false; @@ -1102,9 +1088,15 @@ bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (cr4 & vcpu->arch.cr4_guest_rsvd_bits) return false; - return static_call(kvm_x86_is_valid_cr4)(vcpu, cr4); + return true; +} +EXPORT_SYMBOL_GPL(__kvm_is_valid_cr4); + +static bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + return __kvm_is_valid_cr4(vcpu, cr4) && + static_call(kvm_x86_is_valid_cr4)(vcpu, cr4); } -EXPORT_SYMBOL_GPL(kvm_is_valid_cr4); void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4) { @@ -1450,6 +1442,7 @@ static const u32 msrs_to_save_all[] = { MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, + MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG, MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, @@ -2051,13 +2044,6 @@ int kvm_emulate_invd(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_invd); -int kvm_emulate_mwait(struct kvm_vcpu *vcpu) -{ - pr_warn_once("kvm: MWAIT instruction emulated as NOP!\n"); - return kvm_emulate_as_nop(vcpu); -} -EXPORT_SYMBOL_GPL(kvm_emulate_mwait); - int kvm_handle_invalid_op(struct kvm_vcpu *vcpu) { kvm_queue_exception(vcpu, UD_VECTOR); @@ -2065,11 +2051,26 @@ int kvm_handle_invalid_op(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_handle_invalid_op); -int kvm_emulate_monitor(struct kvm_vcpu *vcpu) + +static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn) { - pr_warn_once("kvm: MONITOR instruction emulated as NOP!\n"); + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) && + !guest_cpuid_has(vcpu, X86_FEATURE_MWAIT)) + return kvm_handle_invalid_op(vcpu); + + pr_warn_once("kvm: %s instruction emulated as NOP!\n", insn); return kvm_emulate_as_nop(vcpu); } +int kvm_emulate_mwait(struct kvm_vcpu *vcpu) +{ + return kvm_emulate_monitor_mwait(vcpu, "MWAIT"); +} +EXPORT_SYMBOL_GPL(kvm_emulate_mwait); + +int kvm_emulate_monitor(struct kvm_vcpu *vcpu) +{ + return kvm_emulate_monitor_mwait(vcpu, "MONITOR"); +} EXPORT_SYMBOL_GPL(kvm_emulate_monitor); static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) @@ -2349,12 +2350,12 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) /* Guest TSC same frequency as host TSC? */ if (!scale) { - kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio); + kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio); return 0; } /* TSC scaling supported? */ - if (!kvm_has_tsc_control) { + if (!kvm_caps.has_tsc_control) { if (user_tsc_khz > tsc_khz) { vcpu->arch.tsc_catchup = 1; vcpu->arch.tsc_always_catchup = 1; @@ -2366,10 +2367,10 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) } /* TSC scaling required - calculate ratio */ - ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits, + ratio = mul_u64_u32_div(1ULL << kvm_caps.tsc_scaling_ratio_frac_bits, user_tsc_khz, tsc_khz); - if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) { + if (ratio == 0 || ratio >= kvm_caps.max_tsc_scaling_ratio) { pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n", user_tsc_khz); return -1; @@ -2387,7 +2388,7 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) /* tsc_khz can be zero if TSC calibration fails */ if (user_tsc_khz == 0) { /* set tsc_scaling_ratio to a safe value */ - kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio); + kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio); return -1; } @@ -2464,18 +2465,18 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) * (frac) represent the fractional part, ie. ratio represents a fixed * point number (mult + frac * 2^(-N)). * - * N equals to kvm_tsc_scaling_ratio_frac_bits. + * N equals to kvm_caps.tsc_scaling_ratio_frac_bits. */ static inline u64 __scale_tsc(u64 ratio, u64 tsc) { - return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits); + return mul_u64_u64_shr(tsc, ratio, kvm_caps.tsc_scaling_ratio_frac_bits); } u64 kvm_scale_tsc(u64 tsc, u64 ratio) { u64 _tsc = tsc; - if (ratio != kvm_default_tsc_scaling_ratio) + if (ratio != kvm_caps.default_tsc_scaling_ratio) _tsc = __scale_tsc(ratio, tsc); return _tsc; @@ -2502,11 +2503,11 @@ u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier) { u64 nested_offset; - if (l2_multiplier == kvm_default_tsc_scaling_ratio) + if (l2_multiplier == kvm_caps.default_tsc_scaling_ratio) nested_offset = l1_offset; else nested_offset = mul_s64_u64_shr((s64) l1_offset, l2_multiplier, - kvm_tsc_scaling_ratio_frac_bits); + kvm_caps.tsc_scaling_ratio_frac_bits); nested_offset += l2_offset; return nested_offset; @@ -2515,9 +2516,9 @@ EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset); u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier) { - if (l2_multiplier != kvm_default_tsc_scaling_ratio) + if (l2_multiplier != kvm_caps.default_tsc_scaling_ratio) return mul_u64_u64_shr(l1_multiplier, l2_multiplier, - kvm_tsc_scaling_ratio_frac_bits); + kvm_caps.tsc_scaling_ratio_frac_bits); return l1_multiplier; } @@ -2559,7 +2560,7 @@ static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multipli else vcpu->arch.tsc_scaling_ratio = l1_multiplier; - if (kvm_has_tsc_control) + if (kvm_caps.has_tsc_control) static_call(kvm_x86_write_tsc_multiplier)( vcpu, vcpu->arch.tsc_scaling_ratio); } @@ -2695,7 +2696,7 @@ static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) { - if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio) + if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio) WARN_ON(adjustment < 0); adjustment = kvm_scale_tsc((u64) adjustment, vcpu->arch.l1_tsc_scaling_ratio); @@ -3108,7 +3109,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) /* With all the info we got, fill in the values */ - if (kvm_has_tsc_control) + if (kvm_caps.has_tsc_control) tgt_tsc_khz = kvm_scale_tsc(tgt_tsc_khz, v->arch.l1_tsc_scaling_ratio); @@ -3198,6 +3199,16 @@ static void kvmclock_sync_fn(struct work_struct *work) KVMCLOCK_SYNC_PERIOD); } +/* These helpers are safe iff @msr is known to be an MCx bank MSR. */ +static bool is_mci_control_msr(u32 msr) +{ + return (msr & 3) == 0; +} +static bool is_mci_status_msr(u32 msr) +{ + return (msr & 3) == 1; +} + /* * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP. */ @@ -3216,6 +3227,7 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) unsigned bank_num = mcg_cap & 0xff; u32 msr = msr_info->index; u64 data = msr_info->data; + u32 offset, last_msr; switch (msr) { case MSR_IA32_MCG_STATUS: @@ -3229,32 +3241,53 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.mcg_ctl = data; break; - default: - if (msr >= MSR_IA32_MC0_CTL && - msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = array_index_nospec( - msr - MSR_IA32_MC0_CTL, - MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); - - /* only 0 or all 1s can be written to IA32_MCi_CTL - * some Linux kernels though clear bit 10 in bank 4 to - * workaround a BIOS/GART TBL issue on AMD K8s, ignore - * this to avoid an uncatched #GP in the guest - */ - if ((offset & 0x3) == 0 && - data != 0 && (data | (1 << 10)) != ~(u64)0) - return -1; - - /* MCi_STATUS */ - if (!msr_info->host_initiated && - (offset & 0x3) == 1 && data != 0) { - if (!can_set_mci_status(vcpu)) - return -1; - } + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1; + if (msr > last_msr) + return 1; - vcpu->arch.mce_banks[offset] = data; - break; - } + if (!(mcg_cap & MCG_CMCI_P) && (data || !msr_info->host_initiated)) + return 1; + /* An attempt to write a 1 to a reserved bit raises #GP */ + if (data & ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK)) + return 1; + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2, + last_msr + 1 - MSR_IA32_MC0_CTL2); + vcpu->arch.mci_ctl2_banks[offset] = data; + break; + case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL(bank_num) - 1; + if (msr > last_msr) + return 1; + + /* + * Only 0 or all 1s can be written to IA32_MCi_CTL, all other + * values are architecturally undefined. But, some Linux + * kernels clear bit 10 in bank 4 to workaround a BIOS/GART TLB + * issue on AMD K8s, allow bit 10 to be clear when setting all + * other bits in order to avoid an uncaught #GP in the guest. + * + * UNIXWARE clears bit 0 of MC1_CTL to ignore correctable, + * single-bit ECC data errors. + */ + if (is_mci_control_msr(msr) && + data != 0 && (data | (1 << 10) | 1) != ~(u64)0) + return 1; + + /* + * All CPUs allow writing 0 to MCi_STATUS MSRs to clear the MSR. + * AMD-based CPUs allow non-zero values, but if and only if + * HWCR[McStatusWrEn] is set. + */ + if (!msr_info->host_initiated && is_mci_status_msr(msr) && + data != 0 && !can_set_mci_status(vcpu)) + return 1; + + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL, + last_msr + 1 - MSR_IA32_MC0_CTL); + vcpu->arch.mce_banks[offset] = data; + break; + default: return 1; } return 0; @@ -3538,7 +3571,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; } break; - case 0x200 ... 0x2ff: + case 0x200 ... MSR_IA32_MC0_CTL2 - 1: + case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff: return kvm_mtrr_set_msr(vcpu, msr, data); case MSR_IA32_APICBASE: return kvm_set_apic_base(vcpu, msr_info); @@ -3560,9 +3594,21 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.ia32_tsc_adjust_msr = data; } break; - case MSR_IA32_MISC_ENABLE: + case MSR_IA32_MISC_ENABLE: { + u64 old_val = vcpu->arch.ia32_misc_enable_msr; + + if (!msr_info->host_initiated) { + /* RO bits */ + if ((old_val ^ data) & MSR_IA32_MISC_ENABLE_PMU_RO_MASK) + return 1; + + /* R bits, i.e. writes are ignored, but don't fault. */ + data = data & ~MSR_IA32_MISC_ENABLE_EMON; + data |= old_val & MSR_IA32_MISC_ENABLE_EMON; + } + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) && - ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) { + ((old_val ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) { if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3)) return 1; vcpu->arch.ia32_misc_enable_msr = data; @@ -3571,6 +3617,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.ia32_misc_enable_msr = data; } break; + } case MSR_IA32_SMBASE: if (!msr_info->host_initiated) return 1; @@ -3597,7 +3644,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than * XSAVES/XRSTORS to save/restore PT MSRs. */ - if (data & ~supported_xss) + if (data & ~kvm_caps.supported_xss) return 1; vcpu->arch.ia32_xss = data; kvm_update_cpuid_runtime(vcpu); @@ -3695,6 +3742,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: return set_msr_mce(vcpu, msr_info); case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: @@ -3785,6 +3833,17 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.guest_fpu.xfd_err = data; break; #endif + case MSR_IA32_PEBS_ENABLE: + case MSR_IA32_DS_AREA: + case MSR_PEBS_DATA_CFG: + case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: + if (kvm_pmu_is_valid_msr(vcpu, msr)) + return kvm_pmu_set_msr(vcpu, msr_info); + /* + * Userspace is allowed to write '0' to MSRs that KVM reports + * as to-be-saved, even if an MSRs isn't fully supported. + */ + return !msr_info->host_initiated || data; default: if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); @@ -3799,6 +3858,7 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) u64 data; u64 mcg_cap = vcpu->arch.mcg_cap; unsigned bank_num = mcg_cap & 0xff; + u32 offset, last_msr; switch (msr) { case MSR_IA32_P5_MC_ADDR: @@ -3816,16 +3876,27 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) case MSR_IA32_MCG_STATUS: data = vcpu->arch.mcg_status; break; - default: - if (msr >= MSR_IA32_MC0_CTL && - msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = array_index_nospec( - msr - MSR_IA32_MC0_CTL, - MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1; + if (msr > last_msr) + return 1; - data = vcpu->arch.mce_banks[offset]; - break; - } + if (!(mcg_cap & MCG_CMCI_P) && !host) + return 1; + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2, + last_msr + 1 - MSR_IA32_MC0_CTL2); + data = vcpu->arch.mci_ctl2_banks[offset]; + break; + case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL(bank_num) - 1; + if (msr > last_msr) + return 1; + + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL, + last_msr + 1 - MSR_IA32_MC0_CTL); + data = vcpu->arch.mce_banks[offset]; + break; + default: return 1; } *pdata = data; @@ -3865,9 +3936,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */ msr_info->data = 0; break; + case MSR_IA32_PEBS_ENABLE: + case MSR_IA32_DS_AREA: + case MSR_PEBS_DATA_CFG: case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); + /* + * Userspace is allowed to read MSRs that KVM reports as + * to-be-saved, even if an MSR isn't fully supported. + */ if (!msr_info->host_initiated) return 1; msr_info->data = 0; @@ -3922,7 +4000,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; } case MSR_MTRRcap: - case 0x200 ... 0x2ff: + case 0x200 ... MSR_IA32_MC0_CTL2 - 1: + case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff: return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data); case 0xcd: /* fsb frequency */ msr_info->data = 3; @@ -4038,6 +4117,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: return get_msr_mce(vcpu, msr_info->index, &msr_info->data, msr_info->host_initiated); case MSR_IA32_XSS: @@ -4280,6 +4360,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_GET_MSR_FEATURES: case KVM_CAP_MSR_PLATFORM_INFO: case KVM_CAP_EXCEPTION_PAYLOAD: + case KVM_CAP_X86_TRIPLE_FAULT_EVENT: case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_LAST_CPU: case KVM_CAP_X86_USER_SPACE_MSR: @@ -4296,6 +4377,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SYS_ATTRIBUTES: case KVM_CAP_VAPIC: case KVM_CAP_ENABLE_CAP: + case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES: r = 1; break; case KVM_CAP_EXIT_HYPERCALL: @@ -4357,7 +4439,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; case KVM_CAP_TSC_CONTROL: case KVM_CAP_VM_TSC_CONTROL: - r = kvm_has_tsc_control; + r = kvm_caps.has_tsc_control; break; case KVM_CAP_X2APIC_API: r = KVM_X2APIC_API_VALID_FLAGS; @@ -4379,7 +4461,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = sched_info_on(); break; case KVM_CAP_X86_BUS_LOCK_EXIT: - if (kvm_has_bus_lock_exit) + if (kvm_caps.has_bus_lock_exit) r = KVM_BUS_LOCK_DETECTION_OFF | KVM_BUS_LOCK_DETECTION_EXIT; else @@ -4388,17 +4470,20 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_XSAVE2: { u64 guest_perm = xstate_get_guest_group_perm(); - r = xstate_required_size(supported_xcr0 & guest_perm, false); + r = xstate_required_size(kvm_caps.supported_xcr0 & guest_perm, false); if (r < sizeof(struct kvm_xsave)) r = sizeof(struct kvm_xsave); break; + } case KVM_CAP_PMU_CAPABILITY: r = enable_pmu ? KVM_CAP_PMU_VALID_MASK : 0; break; - } case KVM_CAP_DISABLE_QUIRKS2: r = KVM_X86_VALID_QUIRKS; break; + case KVM_CAP_X86_NOTIFY_VMEXIT: + r = kvm_caps.has_notify_vmexit; + break; default: break; } @@ -4426,7 +4511,7 @@ static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr) switch (attr->attr) { case KVM_X86_XCOMP_GUEST_SUPP: - if (put_user(supported_xcr0, uaddr)) + if (put_user(kvm_caps.supported_xcr0, uaddr)) return -EFAULT; return 0; default: @@ -4503,8 +4588,8 @@ long kvm_arch_dev_ioctl(struct file *filp, } case KVM_X86_GET_MCE_CAP_SUPPORTED: r = -EFAULT; - if (copy_to_user(argp, &kvm_mce_cap_supported, - sizeof(kvm_mce_cap_supported))) + if (copy_to_user(argp, &kvm_caps.supported_mce_cap, + sizeof(kvm_caps.supported_mce_cap))) goto out; r = 0; break; @@ -4803,22 +4888,63 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, r = -EINVAL; if (!bank_num || bank_num > KVM_MAX_MCE_BANKS) goto out; - if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000)) + if (mcg_cap & ~(kvm_caps.supported_mce_cap | 0xff | 0xff0000)) goto out; r = 0; vcpu->arch.mcg_cap = mcg_cap; /* Init IA32_MCG_CTL to all 1s */ if (mcg_cap & MCG_CTL_P) vcpu->arch.mcg_ctl = ~(u64)0; - /* Init IA32_MCi_CTL to all 1s */ - for (bank = 0; bank < bank_num; bank++) + /* Init IA32_MCi_CTL to all 1s, IA32_MCi_CTL2 to all 0s */ + for (bank = 0; bank < bank_num; bank++) { vcpu->arch.mce_banks[bank*4] = ~(u64)0; + if (mcg_cap & MCG_CMCI_P) + vcpu->arch.mci_ctl2_banks[bank] = 0; + } + + kvm_apic_after_set_mcg_cap(vcpu); static_call(kvm_x86_setup_mce)(vcpu); out: return r; } +/* + * Validate this is an UCNA (uncorrectable no action) error by checking the + * MCG_STATUS and MCi_STATUS registers: + * - none of the bits for Machine Check Exceptions are set + * - both the VAL (valid) and UC (uncorrectable) bits are set + * MCI_STATUS_PCC - Processor Context Corrupted + * MCI_STATUS_S - Signaled as a Machine Check Exception + * MCI_STATUS_AR - Software recoverable Action Required + */ +static bool is_ucna(struct kvm_x86_mce *mce) +{ + return !mce->mcg_status && + !(mce->status & (MCI_STATUS_PCC | MCI_STATUS_S | MCI_STATUS_AR)) && + (mce->status & MCI_STATUS_VAL) && + (mce->status & MCI_STATUS_UC); +} + +static int kvm_vcpu_x86_set_ucna(struct kvm_vcpu *vcpu, struct kvm_x86_mce *mce, u64* banks) +{ + u64 mcg_cap = vcpu->arch.mcg_cap; + + banks[1] = mce->status; + banks[2] = mce->addr; + banks[3] = mce->misc; + vcpu->arch.mcg_status = mce->mcg_status; + + if (!(mcg_cap & MCG_CMCI_P) || + !(vcpu->arch.mci_ctl2_banks[mce->bank] & MCI_CTL2_CMCI_EN)) + return 0; + + if (lapic_in_kernel(vcpu)) + kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTCMCI); + + return 0; +} + static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, struct kvm_x86_mce *mce) { @@ -4828,6 +4954,12 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL)) return -EINVAL; + + banks += array_index_nospec(4 * mce->bank, 4 * bank_num); + + if (is_ucna(mce)) + return kvm_vcpu_x86_set_ucna(vcpu, mce, banks); + /* * if IA32_MCG_CTL is not all 1s, the uncorrected error * reporting is disabled @@ -4835,7 +4967,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) && vcpu->arch.mcg_ctl != ~(u64)0) return 0; - banks += 4 * mce->bank; /* * if IA32_MCi_CTL is not all 1s, the uncorrected error * reporting is disabled for the bank @@ -4941,6 +5072,10 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, | KVM_VCPUEVENT_VALID_SMM); if (vcpu->kvm->arch.exception_payload_enabled) events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD; + if (vcpu->kvm->arch.triple_fault_event) { + events->triple_fault.pending = kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu); + events->flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT; + } memset(&events->reserved, 0, sizeof(events->reserved)); } @@ -4954,7 +5089,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, | KVM_VCPUEVENT_VALID_SIPI_VECTOR | KVM_VCPUEVENT_VALID_SHADOW | KVM_VCPUEVENT_VALID_SMM - | KVM_VCPUEVENT_VALID_PAYLOAD)) + | KVM_VCPUEVENT_VALID_PAYLOAD + | KVM_VCPUEVENT_VALID_TRIPLE_FAULT)) return -EINVAL; if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) { @@ -5027,6 +5163,15 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, } } + if (events->flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) { + if (!vcpu->kvm->arch.triple_fault_event) + return -EINVAL; + if (events->triple_fault.pending) + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + else + kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); + } + kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; @@ -5095,7 +5240,8 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu, guest_xsave->region, - supported_xcr0, &vcpu->arch.pkru); + kvm_caps.supported_xcr0, + &vcpu->arch.pkru); } static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, @@ -5600,8 +5746,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; user_tsc_khz = (u32)arg; - if (kvm_has_tsc_control && - user_tsc_khz >= kvm_max_guest_tsc_khz) + if (kvm_caps.has_tsc_control && + user_tsc_khz >= kvm_caps.max_guest_tsc_khz) goto out; if (user_tsc_khz == 0) @@ -6028,6 +6174,10 @@ split_irqchip_unlock: kvm->arch.exception_payload_enabled = cap->args[0]; r = 0; break; + case KVM_CAP_X86_TRIPLE_FAULT_EVENT: + kvm->arch.triple_fault_event = cap->args[0]; + r = 0; + break; case KVM_CAP_X86_USER_SPACE_MSR: r = -EINVAL; if (cap->args[0] & ~(KVM_MSR_EXIT_REASON_INVAL | @@ -6046,7 +6196,7 @@ split_irqchip_unlock: (cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT)) break; - if (kvm_has_bus_lock_exit && + if (kvm_caps.has_bus_lock_exit && cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT) kvm->arch.bus_lock_detection_enabled = true; r = 0; @@ -6109,6 +6259,65 @@ split_irqchip_unlock: } mutex_unlock(&kvm->lock); break; + case KVM_CAP_MAX_VCPU_ID: + r = -EINVAL; + if (cap->args[0] > KVM_MAX_VCPU_IDS) + break; + + mutex_lock(&kvm->lock); + if (kvm->arch.max_vcpu_ids == cap->args[0]) { + r = 0; + } else if (!kvm->arch.max_vcpu_ids) { + kvm->arch.max_vcpu_ids = cap->args[0]; + r = 0; + } + mutex_unlock(&kvm->lock); + break; + case KVM_CAP_X86_NOTIFY_VMEXIT: + r = -EINVAL; + if ((u32)cap->args[0] & ~KVM_X86_NOTIFY_VMEXIT_VALID_BITS) + break; + if (!kvm_caps.has_notify_vmexit) + break; + if (!((u32)cap->args[0] & KVM_X86_NOTIFY_VMEXIT_ENABLED)) + break; + mutex_lock(&kvm->lock); + if (!kvm->created_vcpus) { + kvm->arch.notify_window = cap->args[0] >> 32; + kvm->arch.notify_vmexit_flags = (u32)cap->args[0]; + r = 0; + } + mutex_unlock(&kvm->lock); + break; + case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES: + r = -EINVAL; + + /* + * Since the risk of disabling NX hugepages is a guest crashing + * the system, ensure the userspace process has permission to + * reboot the system. + * + * Note that unlike the reboot() syscall, the process must have + * this capability in the root namespace because exposing + * /dev/kvm into a container does not limit the scope of the + * iTLB multihit bug to that container. In other words, + * this must use capable(), not ns_capable(). + */ + if (!capable(CAP_SYS_BOOT)) { + r = -EPERM; + break; + } + + if (cap->args[0]) + break; + + mutex_lock(&kvm->lock); + if (!kvm->created_vcpus) { + kvm->arch.disable_nx_huge_pages = true; + r = 0; + } + mutex_unlock(&kvm->lock); + break; default: r = -EINVAL; break; @@ -6584,8 +6793,8 @@ set_pit2_out: r = -EINVAL; user_tsc_khz = (u32)arg; - if (kvm_has_tsc_control && - user_tsc_khz >= kvm_max_guest_tsc_khz) + if (kvm_caps.has_tsc_control && + user_tsc_khz >= kvm_caps.max_guest_tsc_khz) goto out; if (user_tsc_khz == 0) @@ -6660,15 +6869,12 @@ out: static void kvm_init_msr_list(void) { - struct x86_pmu_capability x86_pmu; u32 dummy[2]; unsigned i; BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3, "Please update the fixed PMCs in msrs_to_saved_all[]"); - perf_get_x86_pmu_capability(&x86_pmu); - num_msrs_to_save = 0; num_emulated_msrs = 0; num_msr_based_features = 0; @@ -6720,12 +6926,12 @@ static void kvm_init_msr_list(void) break; case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17: if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >= - min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) + min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) continue; break; case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17: if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= - min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) + min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) continue; break; case MSR_IA32_XFD: @@ -6882,7 +7088,7 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; - if (gpa == UNMAPPED_GVA) + if (gpa == INVALID_GPA) return X86EMUL_PROPAGATE_FAULT; ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data, offset, toread); @@ -6913,7 +7119,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, /* Inline kvm_read_guest_virt_helper for speed. */ gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access|PFERR_FETCH_MASK, exception); - if (unlikely(gpa == UNMAPPED_GVA)) + if (unlikely(gpa == INVALID_GPA)) return X86EMUL_PROPAGATE_FAULT; offset = addr & (PAGE_SIZE-1); @@ -6983,7 +7189,7 @@ static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; - if (gpa == UNMAPPED_GVA) + if (gpa == INVALID_GPA) return X86EMUL_PROPAGATE_FAULT; ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite); if (ret < 0) { @@ -7094,7 +7300,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, *gpa = mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); - if (*gpa == UNMAPPED_GVA) + if (*gpa == INVALID_GPA) return -1; return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write); @@ -7331,7 +7537,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); - if (gpa == UNMAPPED_GVA || + if (gpa == INVALID_GPA || (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) goto emul_write; @@ -7385,36 +7591,47 @@ emul_write: return emulator_write_emulated(ctxt, addr, new, bytes, exception); } -static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) +static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, + unsigned short port, void *data, + unsigned int count, bool in) { - int r = 0, i; + unsigned i; + int r; - for (i = 0; i < vcpu->arch.pio.count; i++) { - if (vcpu->arch.pio.in) - r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port, - vcpu->arch.pio.size, pd); + WARN_ON_ONCE(vcpu->arch.pio.count); + for (i = 0; i < count; i++) { + if (in) + r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, port, size, data); else - r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, - vcpu->arch.pio.port, vcpu->arch.pio.size, - pd); - if (r) + r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, port, size, data); + + if (r) { + if (i == 0) + goto userspace_io; + + /* + * Userspace must have unregistered the device while PIO + * was running. Drop writes / read as 0. + */ + if (in) + memset(data, 0, size * (count - i)); break; - pd += vcpu->arch.pio.size; + } + + data += size; } - return r; -} + return 1; -static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, - unsigned short port, - unsigned int count, bool in) -{ +userspace_io: vcpu->arch.pio.port = port; vcpu->arch.pio.in = in; - vcpu->arch.pio.count = count; + vcpu->arch.pio.count = count; vcpu->arch.pio.size = size; - if (!kernel_pio(vcpu, vcpu->arch.pio_data)) - return 1; + if (in) + memset(vcpu->arch.pio_data, 0, size * count); + else + memcpy(vcpu->arch.pio_data, data, size * count); vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; @@ -7422,30 +7639,33 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; vcpu->run->io.count = count; vcpu->run->io.port = port; - return 0; } -static int __emulator_pio_in(struct kvm_vcpu *vcpu, int size, - unsigned short port, unsigned int count) +static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, + unsigned short port, void *val, unsigned int count) { - WARN_ON(vcpu->arch.pio.count); - memset(vcpu->arch.pio_data, 0, size * count); - return emulator_pio_in_out(vcpu, size, port, count, true); + int r = emulator_pio_in_out(vcpu, size, port, val, count, true); + if (r) + trace_kvm_pio(KVM_PIO_IN, port, size, count, val); + + return r; } static void complete_emulator_pio_in(struct kvm_vcpu *vcpu, void *val) { int size = vcpu->arch.pio.size; - unsigned count = vcpu->arch.pio.count; + unsigned int count = vcpu->arch.pio.count; memcpy(val, vcpu->arch.pio_data, size * count); trace_kvm_pio(KVM_PIO_IN, vcpu->arch.pio.port, size, count, vcpu->arch.pio_data); vcpu->arch.pio.count = 0; } -static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, - unsigned short port, void *val, unsigned int count) +static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, void *val, + unsigned int count) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); if (vcpu->arch.pio.count) { /* * Complete a previous iteration that required userspace I/O. @@ -7454,39 +7674,19 @@ static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, * shenanigans as KVM doesn't support modifying the rep count, * and the emulator ensures @count doesn't overflow the buffer. */ - } else { - int r = __emulator_pio_in(vcpu, size, port, count); - if (!r) - return r; - - /* Results already available, fall through. */ + complete_emulator_pio_in(vcpu, val); + return 1; } - complete_emulator_pio_in(vcpu, val); - return 1; -} - -static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, - int size, unsigned short port, void *val, - unsigned int count) -{ - return emulator_pio_in(emul_to_vcpu(ctxt), size, port, val, count); - + return emulator_pio_in(vcpu, size, port, val, count); } static int emulator_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port, const void *val, unsigned int count) { - int ret; - - memcpy(vcpu->arch.pio_data, val, size * count); - trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data); - ret = emulator_pio_in_out(vcpu, size, port, count, false); - if (ret) - vcpu->arch.pio.count = 0; - - return ret; + trace_kvm_pio(KVM_PIO_OUT, port, size, count, val); + return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); } static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, @@ -7868,7 +8068,16 @@ static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr) return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr); } +static void emulator_vm_bugged(struct x86_emulate_ctxt *ctxt) +{ + struct kvm *kvm = emul_to_vcpu(ctxt)->kvm; + + if (!kvm->vm_bugged) + kvm_vm_bugged(kvm); +} + static const struct x86_emulate_ops emulate_ops = { + .vm_bugged = emulator_vm_bugged, .read_gpr = emulator_read_gpr, .write_gpr = emulator_write_gpr, .read_std = emulator_read_std, @@ -8145,7 +8354,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * If the mapping is invalid in guest, let cpu retry * it to generate fault. */ - if (gpa == UNMAPPED_GVA) + if (gpa == INVALID_GPA) return true; } @@ -8672,11 +8881,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu) /* For size less than 4 we merge, else we zero extend */ val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0; - /* - * Since vcpu->arch.pio.count == 1 let emulator_pio_in perform - * the copy and tracing - */ - emulator_pio_in(vcpu, vcpu->arch.pio.size, vcpu->arch.pio.port, &val, 1); + complete_emulator_pio_in(vcpu, &val); kvm_rax_write(vcpu, val); return kvm_skip_emulated_instruction(vcpu); @@ -8751,7 +8956,7 @@ static void kvm_hyperv_tsc_notifier(void) /* TSC frequency always matches when on Hyper-V */ for_each_present_cpu(cpu) per_cpu(cpu_tsc_khz, cpu) = tsc_khz; - kvm_max_guest_tsc_khz = tsc_khz; + kvm_caps.max_guest_tsc_khz = tsc_khz; list_for_each_entry(kvm, &vm_list, vm_list) { __kvm_start_pvclock_update(kvm); @@ -8952,25 +9157,23 @@ static struct notifier_block pvclock_gtod_notifier = { int kvm_arch_init(void *opaque) { struct kvm_x86_init_ops *ops = opaque; + u64 host_pat; int r; if (kvm_x86_ops.hardware_enable) { pr_err("kvm: already loaded vendor module '%s'\n", kvm_x86_ops.name); - r = -EEXIST; - goto out; + return -EEXIST; } if (!ops->cpu_has_kvm_support()) { pr_err_ratelimited("kvm: no hardware support for '%s'\n", ops->runtime_ops->name); - r = -EOPNOTSUPP; - goto out; + return -EOPNOTSUPP; } if (ops->disabled_by_bios()) { pr_err_ratelimited("kvm: support for '%s' disabled by bios\n", ops->runtime_ops->name); - r = -EOPNOTSUPP; - goto out; + return -EOPNOTSUPP; } /* @@ -8980,27 +9183,37 @@ int kvm_arch_init(void *opaque) */ if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) { printk(KERN_ERR "kvm: inadequate fpu\n"); - r = -EOPNOTSUPP; - goto out; + return -EOPNOTSUPP; } if (IS_ENABLED(CONFIG_PREEMPT_RT) && !boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n"); - r = -EOPNOTSUPP; - goto out; + return -EOPNOTSUPP; } - r = -ENOMEM; + /* + * KVM assumes that PAT entry '0' encodes WB memtype and simply zeroes + * the PAT bits in SPTEs. Bail if PAT[0] is programmed to something + * other than WB. Note, EPT doesn't utilize the PAT, but don't bother + * with an exception. PAT[0] is set to WB on RESET and also by the + * kernel, i.e. failure indicates a kernel bug or broken firmware. + */ + if (rdmsrl_safe(MSR_IA32_CR_PAT, &host_pat) || + (host_pat & GENMASK(2, 0)) != 6) { + pr_err("kvm: host PAT[0] is not WB\n"); + return -EIO; + } x86_emulator_cache = kvm_alloc_emulator_cache(); if (!x86_emulator_cache) { pr_err("kvm: failed to allocate cache for x86 emulator\n"); - goto out; + return -ENOMEM; } user_return_msrs = alloc_percpu(struct kvm_user_return_msrs); if (!user_return_msrs) { printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n"); + r = -ENOMEM; goto out_free_x86_emulator_cache; } kvm_nr_uret_msrs = 0; @@ -9013,7 +9226,7 @@ int kvm_arch_init(void *opaque) if (boot_cpu_has(X86_FEATURE_XSAVE)) { host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0; + kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0; } if (pi_inject_timer == -1) @@ -9031,7 +9244,6 @@ out_free_percpu: free_percpu(user_return_msrs); out_free_x86_emulator_cache: kmem_cache_destroy(x86_emulator_cache); -out: return r; } @@ -9406,7 +9618,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) if (!lapic_in_kernel(vcpu)) return; - if (vcpu->arch.apicv_active) + if (vcpu->arch.apic->apicv_active) return; if (!vcpu->arch.apic->vapic_addr) @@ -9435,6 +9647,11 @@ int kvm_check_nested_events(struct kvm_vcpu *vcpu) static void kvm_inject_exception(struct kvm_vcpu *vcpu) { + trace_kvm_inj_exception(vcpu->arch.exception.nr, + vcpu->arch.exception.has_error_code, + vcpu->arch.exception.error_code, + vcpu->arch.exception.injected); + if (vcpu->arch.exception.error_code && !is_protmode(vcpu)) vcpu->arch.exception.error_code = false; static_call(kvm_x86_queue_exception)(vcpu); @@ -9470,7 +9687,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) static_call(kvm_x86_inject_nmi)(vcpu); can_inject = false; } else if (vcpu->arch.interrupt.injected) { - static_call(kvm_x86_inject_irq)(vcpu); + static_call(kvm_x86_inject_irq)(vcpu, true); can_inject = false; } } @@ -9492,13 +9709,6 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) /* try to inject new event if pending */ if (vcpu->arch.exception.pending) { - trace_kvm_inj_exception(vcpu->arch.exception.nr, - vcpu->arch.exception.has_error_code, - vcpu->arch.exception.error_code); - - vcpu->arch.exception.pending = false; - vcpu->arch.exception.injected = true; - if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT) __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) | X86_EFLAGS_RF); @@ -9512,6 +9722,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) } kvm_inject_exception(vcpu); + + vcpu->arch.exception.pending = false; + vcpu->arch.exception.injected = true; + can_inject = false; } @@ -9564,7 +9778,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) goto out; if (r) { kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false); - static_call(kvm_x86_inject_irq)(vcpu); + static_call(kvm_x86_inject_irq)(vcpu, false); WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0); } if (kvm_cpu_has_injectable_intr(vcpu)) @@ -9857,6 +10071,7 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm) void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) { + struct kvm_lapic *apic = vcpu->arch.apic; bool activate; if (!lapic_in_kernel(vcpu)) @@ -9865,12 +10080,14 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) down_read(&vcpu->kvm->arch.apicv_update_lock); preempt_disable(); - activate = kvm_vcpu_apicv_activated(vcpu); + /* Do not activate APICV when APIC is disabled */ + activate = kvm_vcpu_apicv_activated(vcpu) && + (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED); - if (vcpu->arch.apicv_active == activate) + if (apic->apicv_active == activate) goto out; - vcpu->arch.apicv_active = activate; + apic->apicv_active = activate; kvm_apic_update_apicv(vcpu); static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu); @@ -9880,7 +10097,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) * still active when the interrupt got accepted. Make sure * inject_pending_event() is called to check for that. */ - if (!vcpu->arch.apicv_active) + if (!apic->apicv_active) kvm_make_request(KVM_REQ_EVENT, vcpu); out: @@ -10275,7 +10492,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * per-VM state, and responsing vCPUs must wait for the update * to complete before servicing KVM_REQ_APICV_UPDATE. */ - WARN_ON_ONCE(kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)); + WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) && + (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED)); exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu); if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) @@ -10654,8 +10872,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) r = cui(vcpu); if (r <= 0) goto out; - } else - WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); + } else { + WARN_ON_ONCE(vcpu->arch.pio.count); + WARN_ON_ONCE(vcpu->mmio_needed); + } if (kvm_run->immediate_exit) { r = -EINTR; @@ -11183,7 +11403,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); srcu_read_unlock(&vcpu->kvm->srcu, idx); tr->physical_address = gpa; - tr->valid = gpa != UNMAPPED_GVA; + tr->valid = gpa != INVALID_GPA; tr->writeable = 1; tr->usermode = 0; @@ -11276,11 +11496,17 @@ static int sync_regs(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) { - if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) + if (kvm_check_tsc_unstable() && kvm->created_vcpus) pr_warn_once("kvm: SMP vm created on host with unstable TSC; " "guest TSC will not be reliable\n"); - return 0; + if (!kvm->arch.max_vcpu_ids) + kvm->arch.max_vcpu_ids = KVM_MAX_VCPU_IDS; + + if (id >= kvm->arch.max_vcpu_ids) + return -EINVAL; + + return static_call(kvm_x86_vcpu_precreate)(kvm); } int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) @@ -11317,7 +11543,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) * will ensure the vCPU gets the correct state before VM-Entry. */ if (enable_apicv) { - vcpu->arch.apicv_active = true; + vcpu->arch.apic->apicv_active = true; kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); } } else @@ -11330,9 +11556,11 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) goto fail_free_lapic; vcpu->arch.pio_data = page_address(page); - vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, + vcpu->arch.mce_banks = kcalloc(KVM_MAX_MCE_BANKS * 4, sizeof(u64), GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.mce_banks) + vcpu->arch.mci_ctl2_banks = kcalloc(KVM_MAX_MCE_BANKS, sizeof(u64), + GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.mce_banks || !vcpu->arch.mci_ctl2_banks) goto fail_free_pio_data; vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; @@ -11386,6 +11614,7 @@ free_wbinvd_dirty_mask: free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); fail_free_mce_banks: kfree(vcpu->arch.mce_banks); + kfree(vcpu->arch.mci_ctl2_banks); fail_free_pio_data: free_page((unsigned long)vcpu->arch.pio_data); fail_free_lapic: @@ -11431,6 +11660,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_hv_vcpu_uninit(vcpu); kvm_pmu_destroy(vcpu); kfree(vcpu->arch.mce_banks); + kfree(vcpu->arch.mci_ctl2_banks); kvm_free_lapic(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_mmu_destroy(vcpu); @@ -11510,6 +11740,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.smbase = 0x30000; vcpu->arch.msr_misc_features_enables = 0; + vcpu->arch.ia32_misc_enable_msr = MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | + MSR_IA32_MISC_ENABLE_BTS_UNAVAIL; __kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP); __kvm_set_msr(vcpu, MSR_IA32_XSS, 0, true); @@ -11526,7 +11758,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) * i.e. it's impossible for kvm_find_cpuid_entry() to find a valid entry * on RESET. But, go through the motions in case that's ever remedied. */ - cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1, 0); + cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1); kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600); static_call(kvm_x86_vcpu_reset)(vcpu, init_event); @@ -11717,6 +11949,8 @@ int kvm_arch_hardware_setup(void *opaque) if (boot_cpu_has(X86_FEATURE_XSAVES)) rdmsrl(MSR_IA32_XSS, host_xss); + kvm_init_pmu_capability(); + r = ops->hardware_setup(); if (r != 0) return r; @@ -11726,13 +11960,13 @@ int kvm_arch_hardware_setup(void *opaque) kvm_register_perf_callbacks(ops->handle_intel_pt_intr); if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) - supported_xss = 0; + kvm_caps.supported_xss = 0; #define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f) cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_); #undef __kvm_cpu_cap_has - if (kvm_has_tsc_control) { + if (kvm_caps.has_tsc_control) { /* * Make sure the user can only configure tsc_khz values that * fit into a signed integer. @@ -11740,10 +11974,10 @@ int kvm_arch_hardware_setup(void *opaque) * be 1 on all machines. */ u64 max = min(0x7fffffffULL, - __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz)); - kvm_max_guest_tsc_khz = max; + __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz)); + kvm_caps.max_guest_tsc_khz = max; } - kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits; + kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits; kvm_init_msr_list(); return 0; } @@ -12331,7 +12565,8 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) { - if (vcpu->arch.apicv_active && static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu)) + if (kvm_vcpu_apicv_active(vcpu) && + static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu)) return true; return false; @@ -12773,7 +13008,7 @@ void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_c (PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK); if (!(error_code & PFERR_PRESENT_MASK) || - mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != UNMAPPED_GVA) { + mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != INVALID_GPA) { /* * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page * tables probably do not match the TLB. Just proceed @@ -12998,6 +13233,12 @@ int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes, } EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_read); +static void advance_sev_es_emulated_pio(struct kvm_vcpu *vcpu, unsigned count, int size) +{ + vcpu->arch.sev_pio_count -= count; + vcpu->arch.sev_pio_data += count * size; +} + static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size, unsigned int port); @@ -13021,8 +13262,7 @@ static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size, int ret = emulator_pio_out(vcpu, size, port, vcpu->arch.sev_pio_data, count); /* memcpy done already by emulator_pio_out. */ - vcpu->arch.sev_pio_count -= count; - vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size; + advance_sev_es_emulated_pio(vcpu, count, size); if (!ret) break; @@ -13038,20 +13278,14 @@ static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size, static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size, unsigned int port); -static void advance_sev_es_emulated_ins(struct kvm_vcpu *vcpu) -{ - unsigned count = vcpu->arch.pio.count; - complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data); - vcpu->arch.sev_pio_count -= count; - vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size; -} - static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu) { + unsigned count = vcpu->arch.pio.count; int size = vcpu->arch.pio.size; int port = vcpu->arch.pio.port; - advance_sev_es_emulated_ins(vcpu); + complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data); + advance_sev_es_emulated_pio(vcpu, count, size); if (vcpu->arch.sev_pio_count) return kvm_sev_es_ins(vcpu, size, port); return 1; @@ -13063,11 +13297,11 @@ static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size, for (;;) { unsigned int count = min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count); - if (!__emulator_pio_in(vcpu, size, port, count)) + if (!emulator_pio_in(vcpu, size, port, vcpu->arch.sev_pio_data, count)) break; /* Emulation done by the kernel. */ - advance_sev_es_emulated_ins(vcpu); + advance_sev_es_emulated_pio(vcpu, count, size); if (!vcpu->arch.sev_pio_count) return 1; } @@ -13110,6 +13344,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_kick_vcpu_slowpath); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_doorbell); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_accept_irq); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 588792f00334..1926d2cb8e79 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -8,6 +8,27 @@ #include "kvm_cache_regs.h" #include "kvm_emulate.h" +struct kvm_caps { + /* control of guest tsc rate supported? */ + bool has_tsc_control; + /* maximum supported tsc_khz for guests */ + u32 max_guest_tsc_khz; + /* number of bits of the fractional part of the TSC scaling ratio */ + u8 tsc_scaling_ratio_frac_bits; + /* maximum allowed value of TSC scaling ratio */ + u64 max_tsc_scaling_ratio; + /* 1ull << kvm_caps.tsc_scaling_ratio_frac_bits */ + u64 default_tsc_scaling_ratio; + /* bus lock detection supported? */ + bool has_bus_lock_exit; + /* notify VM exit supported? */ + bool has_notify_vmexit; + + u64 supported_mce_cap; + u64 supported_xcr0; + u64 supported_xss; +}; + void kvm_spurious_fault(void); #define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \ @@ -283,14 +304,15 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); extern u64 host_xcr0; -extern u64 supported_xcr0; extern u64 host_xss; -extern u64 supported_xss; + +extern struct kvm_caps kvm_caps; + extern bool enable_pmu; static inline bool kvm_mpx_supported(void) { - return (supported_xcr0 & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)) + return (kvm_caps.supported_xcr0 & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)) == (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); } @@ -344,6 +366,11 @@ static inline bool kvm_cstate_in_guest(struct kvm *kvm) return kvm->arch.cstate_in_guest; } +static inline bool kvm_notify_vmexit_enabled(struct kvm *kvm) +{ + return kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_ENABLED; +} + enum kvm_intr_type { /* Values are arbitrary, but must be non-zero. */ KVM_HANDLING_IRQ = 1, @@ -407,7 +434,7 @@ static inline void kvm_machine_check(void) void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu); void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); int kvm_spec_ctrl_test_value(u64 value); -bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); +bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, struct x86_exception *e); int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva); diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 610beba35907..a0c05ccbf4b1 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -1049,7 +1049,7 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, else vcpu->arch.xen.poll_evtchn = -1; - set_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask); + set_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask); if (!wait_pending_event(vcpu, sched_poll.nr_ports, ports)) { vcpu->arch.mp_state = KVM_MP_STATE_HALTED; @@ -1071,7 +1071,7 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, *r = 0; out: /* Really, this is only needed in case of timeout */ - clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask); + clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask); if (unlikely(sched_poll.nr_ports > 1)) kfree(ports); @@ -1311,7 +1311,7 @@ static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port) int poll_evtchn = vcpu->arch.xen.poll_evtchn; if ((poll_evtchn == port || poll_evtchn == -1) && - test_and_clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask)) { + test_and_clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask)) { kvm_make_request(KVM_REQ_UNBLOCK, vcpu); kvm_vcpu_kick(vcpu); } @@ -1344,7 +1344,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) vcpu = kvm_get_vcpu_by_id(kvm, xe->vcpu_id); if (!vcpu) return -EINVAL; - WRITE_ONCE(xe->vcpu_idx, kvm_vcpu_get_idx(vcpu)); + WRITE_ONCE(xe->vcpu_idx, vcpu->vcpu_idx); } if (!vcpu->arch.xen.vcpu_info_cache.active) @@ -1540,7 +1540,7 @@ int kvm_xen_setup_evtchn(struct kvm *kvm, */ vcpu = kvm_get_vcpu_by_id(kvm, ue->u.xen_evtchn.vcpu); if (vcpu) - e->xen_evtchn.vcpu_idx = kvm_vcpu_get_idx(vcpu); + e->xen_evtchn.vcpu_idx = vcpu->vcpu_idx; else e->xen_evtchn.vcpu_idx = -1; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index fad8faa29d04..fa71a5d12e87 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1408,6 +1408,10 @@ good_area: return; } + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + return; + /* * If we need to retry the mmap_lock has already been released, * and if there is a fatal signal pending there is no guarantee @@ -1526,7 +1530,7 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) /* * Entry handling for valid #PF from kernel mode is slightly - * different: RCU is already watching and rcu_irq_enter() must not + * different: RCU is already watching and ct_irq_enter() must not * be invoked because a kernel fault on a user space address might * sleep. * diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index a0d023cb4292..509408da0da1 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -19,44 +19,6 @@ #include <asm/tlbflush.h> #include <asm/elf.h> -#if 0 /* This is just for testing */ -struct page * -follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) -{ - unsigned long start = address; - int length = 1; - int nr; - struct page *page; - struct vm_area_struct *vma; - - vma = find_vma(mm, addr); - if (!vma || !is_vm_hugetlb_page(vma)) - return ERR_PTR(-EINVAL); - - pte = huge_pte_offset(mm, address, vma_mmu_pagesize(vma)); - - /* hugetlb should be locked, and hence, prefaulted */ - WARN_ON(!pte || pte_none(*pte)); - - page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; - - WARN_ON(!PageHead(page)); - - return page; -} - -int pmd_huge(pmd_t pmd) -{ - return 0; -} - -int pud_huge(pud_t pud) -{ - return 0; -} - -#else - /* * pmd_huge() returns 1 if @pmd is hugetlb related entry, that is normal * hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry. @@ -72,7 +34,6 @@ int pud_huge(pud_t pud) { return !!(pud_val(pud) & _PAGE_PSE); } -#endif #ifdef CONFIG_HUGETLB_PAGE static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c index f6d038e2cd8e..9c4d8dbcb129 100644 --- a/arch/x86/mm/mem_encrypt_amd.c +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -20,12 +20,13 @@ #include <linux/bitops.h> #include <linux/dma-mapping.h> #include <linux/virtio_config.h> +#include <linux/virtio_anchor.h> #include <linux/cc_platform.h> -#include <linux/platform-feature.h> #include <asm/tlbflush.h> #include <asm/fixmap.h> #include <asm/setup.h> +#include <asm/mem_encrypt.h> #include <asm/bootparam.h> #include <asm/set_memory.h> #include <asm/cacheflush.h> @@ -245,7 +246,7 @@ void __init sev_setup_arch(void) swiotlb_adjust_size(size); /* Set restricted memory access for virtio. */ - platform_set(PLATFORM_VIRTIO_RESTRICTED_MEM_ACCESS); + virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc); } static unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot) @@ -486,8 +487,6 @@ void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, boo void __init sme_early_init(void) { - unsigned int i; - if (!sme_me_mask) return; @@ -496,8 +495,7 @@ void __init sme_early_init(void) __supported_pte_mask = __sme_set(__supported_pte_mask); /* Update the protection map with memory encryption mask */ - for (i = 0; i < ARRAY_SIZE(protection_map); i++) - protection_map[i] = pgprot_encrypted(protection_map[i]); + add_encrypt_protection_map(); x86_platform.guest.enc_status_change_prepare = amd_enc_status_change_prepare; x86_platform.guest.enc_status_change_finish = amd_enc_status_change_finish; diff --git a/arch/x86/mm/pgprot.c b/arch/x86/mm/pgprot.c index 763742782286..c84bd9540b16 100644 --- a/arch/x86/mm/pgprot.c +++ b/arch/x86/mm/pgprot.c @@ -3,6 +3,34 @@ #include <linux/export.h> #include <linux/mm.h> #include <asm/pgtable.h> +#include <asm/mem_encrypt.h> + +static pgprot_t protection_map[16] __ro_after_init = { + [VM_NONE] = PAGE_NONE, + [VM_READ] = PAGE_READONLY, + [VM_WRITE] = PAGE_COPY, + [VM_WRITE | VM_READ] = PAGE_COPY, + [VM_EXEC] = PAGE_READONLY_EXEC, + [VM_EXEC | VM_READ] = PAGE_READONLY_EXEC, + [VM_EXEC | VM_WRITE] = PAGE_COPY_EXEC, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY_EXEC, + [VM_SHARED] = PAGE_NONE, + [VM_SHARED | VM_READ] = PAGE_READONLY, + [VM_SHARED | VM_WRITE] = PAGE_SHARED, + [VM_SHARED | VM_WRITE | VM_READ] = PAGE_SHARED, + [VM_SHARED | VM_EXEC] = PAGE_READONLY_EXEC, + [VM_SHARED | VM_EXEC | VM_READ] = PAGE_READONLY_EXEC, + [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_SHARED_EXEC, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED_EXEC +}; + +void add_encrypt_protection_map(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(protection_map); i++) + protection_map[i] = pgprot_encrypted(protection_map[i]); +} pgprot_t vm_get_page_prot(unsigned long vm_flags) { diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index b808c9a80d1b..c1f6c1c51d99 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1785,6 +1785,10 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, struct bpf_tramp_link *l, int stack_size, int run_ctx_off, bool save_ret) { + void (*exit)(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx) = __bpf_prog_exit; + u64 (*enter)(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx) = __bpf_prog_enter; u8 *prog = *pprog; u8 *jmp_insn; int ctx_cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie); @@ -1803,15 +1807,21 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, */ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_1, -run_ctx_off + ctx_cookie_off); + if (p->aux->sleepable) { + enter = __bpf_prog_enter_sleepable; + exit = __bpf_prog_exit_sleepable; + } else if (p->expected_attach_type == BPF_LSM_CGROUP) { + enter = __bpf_prog_enter_lsm_cgroup; + exit = __bpf_prog_exit_lsm_cgroup; + } + /* arg1: mov rdi, progs[i] */ emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); /* arg2: lea rsi, [rbp - ctx_cookie_off] */ EMIT4(0x48, 0x8D, 0x75, -run_ctx_off); - if (emit_call(&prog, - p->aux->sleepable ? __bpf_prog_enter_sleepable : - __bpf_prog_enter, prog)) - return -EINVAL; + if (emit_call(&prog, enter, prog)) + return -EINVAL; /* remember prog start time returned by __bpf_prog_enter */ emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); @@ -1855,10 +1865,8 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); /* arg3: lea rdx, [rbp - run_ctx_off] */ EMIT4(0x48, 0x8D, 0x55, -run_ctx_off); - if (emit_call(&prog, - p->aux->sleepable ? __bpf_prog_exit_sleepable : - __bpf_prog_exit, prog)) - return -EINVAL; + if (emit_call(&prog, exit, prog)) + return -EINVAL; *pprog = prog; return 0; @@ -1942,23 +1950,6 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, return 0; } -static bool is_valid_bpf_tramp_flags(unsigned int flags) -{ - if ((flags & BPF_TRAMP_F_RESTORE_REGS) && - (flags & BPF_TRAMP_F_SKIP_FRAME)) - return false; - - /* - * BPF_TRAMP_F_RET_FENTRY_RET is only used by bpf_struct_ops, - * and it must be used alone. - */ - if ((flags & BPF_TRAMP_F_RET_FENTRY_RET) && - (flags & ~BPF_TRAMP_F_RET_FENTRY_RET)) - return false; - - return true; -} - /* Example: * __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev); * its 'struct btf_func_model' will be nr_args=2 @@ -2037,9 +2028,6 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i if (nr_args > 6) return -ENOTSUPP; - if (!is_valid_bpf_tramp_flags(flags)) - return -EINVAL; - /* Generated trampoline stack layout: * * RBP + 8 [ return address ] @@ -2145,10 +2133,15 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i if (flags & BPF_TRAMP_F_CALL_ORIG) { restore_regs(m, &prog, nr_args, regs_off); - /* call original function */ - if (emit_call(&prog, orig_call, prog)) { - ret = -EINVAL; - goto cleanup; + if (flags & BPF_TRAMP_F_ORIG_STACK) { + emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, 8); + EMIT2(0xff, 0xd0); /* call *rax */ + } else { + /* call original function */ + if (emit_call(&prog, orig_call, prog)) { + ret = -EINVAL; + goto cleanup; + } } /* remember return value in a stack for bpf prog to access */ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); @@ -2506,3 +2499,34 @@ void *bpf_arch_text_copy(void *dst, void *src, size_t len) return ERR_PTR(-EINVAL); return dst; } + +/* Indicate the JIT backend supports mixing bpf2bpf and tailcalls. */ +bool bpf_jit_supports_subprog_tailcalls(void) +{ + return true; +} + +void bpf_jit_free(struct bpf_prog *prog) +{ + if (prog->jited) { + struct x64_jit_data *jit_data = prog->aux->jit_data; + struct bpf_binary_header *hdr; + + /* + * If we fail the final pass of JIT (from jit_subprogs), + * the program may not be finalized yet. Call finalize here + * before freeing it. + */ + if (jit_data) { + bpf_jit_binary_pack_finalize(prog, jit_data->header, + jit_data->rw_header); + kvfree(jit_data->addrs); + kfree(jit_data); + } + hdr = bpf_jit_binary_pack_hdr(prog); + bpf_jit_binary_pack_free(hdr, NULL); + WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog)); + } + + bpf_prog_unlock_free(prog); +} diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 1591d67e0bcd..6e598bd78eef 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -108,29 +108,6 @@ static int __init setup_add_efi_memmap(char *arg) } early_param("add_efi_memmap", setup_add_efi_memmap); -void __init efi_find_mirror(void) -{ - efi_memory_desc_t *md; - u64 mirror_size = 0, total_size = 0; - - if (!efi_enabled(EFI_MEMMAP)) - return; - - for_each_efi_memory_desc(md) { - unsigned long long start = md->phys_addr; - unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; - - total_size += size; - if (md->attribute & EFI_MEMORY_MORE_RELIABLE) { - memblock_mark_mirror(start, size); - mirror_size += size; - } - } - if (mirror_size) - pr_info("Memory: %lldM/%lldM mirrored memory\n", - mirror_size>>20, total_size>>20); -} - /* * Tell the kernel about the EFI memory map. This might include * more than the max 128 entries that can fit in the passed in e820 diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig index 1bcd42c53039..186f13268401 100644 --- a/arch/x86/um/Kconfig +++ b/arch/x86/um/Kconfig @@ -32,12 +32,12 @@ config 3_LEVEL_PGTABLES bool "Three-level pagetables" if !64BIT default 64BIT help - Three-level pagetables will let UML have more than 4G of physical - memory. All the memory that can't be mapped directly will be treated - as high memory. + Three-level pagetables will let UML have more than 4G of physical + memory. All the memory that can't be mapped directly will be treated + as high memory. - However, this it experimental on 32-bit architectures, so if unsure say - N (on x86-64 it's automatically enabled, instead, as it's safe there). + However, this it experimental on 32-bit architectures, so if unsure say + N (on x86-64 it's automatically enabled, instead, as it's safe there). config ARCH_HAS_SC_SIGNALS def_bool !64BIT diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index ba5789c35809..f778e37494ba 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -28,7 +28,8 @@ else obj-y += syscalls_64.o vdso/ -subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o ../entry/thunk_64.o +subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o ../entry/thunk_64.o \ + ../lib/memmove_64.o ../lib/memset_64.o endif diff --git a/arch/x86/um/mem_32.c b/arch/x86/um/mem_32.c index 19c5dbd46770..cafd01f730da 100644 --- a/arch/x86/um/mem_32.c +++ b/arch/x86/um/mem_32.c @@ -17,7 +17,7 @@ static int __init gate_vma_init(void) gate_vma.vm_start = FIXADDR_USER_START; gate_vma.vm_end = FIXADDR_USER_END; gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; - gate_vma.vm_page_prot = __P101; + gate_vma.vm_page_prot = PAGE_READONLY; return 0; } diff --git a/arch/x86/um/shared/sysdep/stub_64.h b/arch/x86/um/shared/sysdep/stub_64.h index e9c4b2b38803..92ea1670cf1c 100644 --- a/arch/x86/um/shared/sysdep/stub_64.h +++ b/arch/x86/um/shared/sysdep/stub_64.h @@ -8,6 +8,7 @@ #include <sysdep/ptrace_user.h> #include <generated/asm-offsets.h> +#include <linux/stddef.h> #define STUB_MMAP_NR __NR_mmap #define MMAP_OFFSET(o) (o) diff --git a/arch/x86/um/sysrq_64.c b/arch/x86/um/sysrq_64.c index 903ad91b624f..ef1eb4f4f612 100644 --- a/arch/x86/um/sysrq_64.c +++ b/arch/x86/um/sysrq_64.c @@ -19,8 +19,8 @@ void show_regs(struct pt_regs *regs) print_modules(); printk(KERN_INFO "Pid: %d, comm: %.20s %s %s\n", task_pid_nr(current), current->comm, print_tainted(), init_utsname()->release); - printk(KERN_INFO "RIP: %04lx:[<%016lx>]\n", PT_REGS_CS(regs) & 0xffff, - PT_REGS_IP(regs)); + printk(KERN_INFO "RIP: %04lx:%pS\n", PT_REGS_CS(regs) & 0xffff, + (void *)PT_REGS_IP(regs)); printk(KERN_INFO "RSP: %016lx EFLAGS: %08lx\n", PT_REGS_SP(regs), PT_REGS_EFLAGS(regs)); printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n", diff --git a/arch/x86/um/vdso/Makefile b/arch/x86/um/vdso/Makefile index 5943387e3f35..8c0396fd0e6f 100644 --- a/arch/x86/um/vdso/Makefile +++ b/arch/x86/um/vdso/Makefile @@ -3,6 +3,9 @@ # Building vDSO images for x86. # +# do not instrument on vdso because KASAN is not compatible with user mode +KASAN_SANITIZE := n + # Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. KCOV_INSTRUMENT := n diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 8b71b1dd7639..28762f800596 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -4,6 +4,7 @@ #include <linux/cpu.h> #include <linux/kexec.h> #include <linux/memblock.h> +#include <linux/virtio_anchor.h> #include <xen/features.h> #include <xen/events.h> @@ -195,7 +196,8 @@ static void __init xen_hvm_guest_init(void) if (xen_pv_domain()) return; - xen_set_restricted_virtio_memory_access(); + if (IS_ENABLED(CONFIG_XEN_VIRTIO_FORCE_GRANT)) + virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc); init_hvm_pv_info(); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 70fb2ea85e90..0ed2e487a693 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -31,6 +31,7 @@ #include <linux/gfp.h> #include <linux/edd.h> #include <linux/reboot.h> +#include <linux/virtio_anchor.h> #include <xen/xen.h> #include <xen/events.h> @@ -109,7 +110,9 @@ static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc); static void __init xen_pv_init_platform(void) { - xen_set_restricted_virtio_memory_access(); + /* PV guests can't operate virtio devices without grants. */ + if (IS_ENABLED(CONFIG_XEN_VIRTIO)) + virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc); populate_extra_pte(fix_to_virt(FIX_PARAVIRT_BOOTMAP)); |