68 files changed, 4035 insertions, 451 deletions
| diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 255084c65138..f6946b81f74a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1523,6 +1523,7 @@ config AMD_MEM_ENCRYPT  	select DYNAMIC_PHYSICAL_MASK  	select ARCH_USE_MEMREMAP_PROT  	select ARCH_HAS_FORCE_DMA_UNENCRYPTED +	select INSTRUCTION_DECODER  	help  	  Say yes to enable support for the encryption of system memory.  	  This requires an AMD processor that supports Secure Memory diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 4fb989ef5665..ee249088cbfe 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -32,7 +32,7 @@ KBUILD_CFLAGS := -m$(BITS) -O2  KBUILD_CFLAGS += -fno-strict-aliasing -fPIE  KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING  cflags-$(CONFIG_X86_32) := -march=i386 -cflags-$(CONFIG_X86_64) := -mcmodel=small +cflags-$(CONFIG_X86_64) := -mcmodel=small -mno-red-zone  KBUILD_CFLAGS += $(cflags-y)  KBUILD_CFLAGS += -mno-mmx -mno-sse  KBUILD_CFLAGS += -ffreestanding @@ -47,6 +47,11 @@ KBUILD_CFLAGS += -D__DISABLE_EXPORTS  KBUILD_CFLAGS += $(call as-option,-Wa$(comma)-mrelax-relocations=no)  KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h +# sev-es.c indirectly inludes inat-table.h which is generated during +# compilation and stored in $(objtree). Add the directory to the includes so +# that the compiler finds it even with out-of-tree builds (make O=/some/path). +CFLAGS_sev-es.o += -I$(objtree)/arch/x86/lib/ +  KBUILD_AFLAGS  := $(KBUILD_CFLAGS) -D__ASSEMBLY__  GCOV_PROFILE := n  UBSAN_SANITIZE :=n @@ -81,9 +86,11 @@ vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/kernel_info.o $(obj)/head_$(BITS).o  vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o  vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o  ifdef CONFIG_X86_64 -	vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr_64.o +	vmlinux-objs-y += $(obj)/ident_map_64.o +	vmlinux-objs-y += $(obj)/idt_64.o $(obj)/idt_handlers_64.o  	vmlinux-objs-y += $(obj)/mem_encrypt.o  	vmlinux-objs-y += $(obj)/pgtable_64.o +	vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev-es.o  endif  vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o diff --git a/arch/x86/boot/compressed/cpuflags.c b/arch/x86/boot/compressed/cpuflags.c index 6448a8196d32..0cc1323896d1 100644 --- a/arch/x86/boot/compressed/cpuflags.c +++ b/arch/x86/boot/compressed/cpuflags.c @@ -1,6 +1,4 @@  // SPDX-License-Identifier: GPL-2.0 -#ifdef CONFIG_RANDOMIZE_BASE -  #include "../cpuflags.c"  bool has_cpuflag(int flag) @@ -9,5 +7,3 @@ bool has_cpuflag(int flag)  	return test_bit(flag, cpu.flags);  } - -#endif diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 9e46729cf162..1c80f1738fd9 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -33,6 +33,7 @@  #include <asm/processor-flags.h>  #include <asm/asm-offsets.h>  #include <asm/bootparam.h> +#include <asm/desc_defs.h>  #include "pgtable.h"  /* @@ -415,6 +416,10 @@ SYM_CODE_START(startup_64)  .Lon_kernel_cs: +	pushq	%rsi +	call	load_stage1_idt +	popq	%rsi +  	/*  	 * paging_prepare() sets up the trampoline and checks if we need to  	 * enable 5-level paging. @@ -528,6 +533,21 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)  	rep	stosq  /* + * If running as an SEV guest, the encryption mask is required in the + * page-table setup code below. 
When the guest also has SEV-ES enabled + * set_sev_encryption_mask() will cause #VC exceptions, but the stage2 + * handler can't map its GHCB because the page-table is not set up yet. + * So set up the encryption mask here while still on the stage1 #VC + * handler. Then load stage2 IDT and switch to the kernel's own + * page-table. + */ +	pushq	%rsi +	call	set_sev_encryption_mask +	call	load_stage2_idt +	call	initialize_identity_maps +	popq	%rsi + +/*   * Do the extraction, and jump to the new kernel..   */  	pushq	%rsi			/* Save the real mode argument */ @@ -659,10 +679,21 @@ SYM_DATA_START_LOCAL(gdt)  	.quad   0x0000000000000000	/* TS continued */  SYM_DATA_END_LABEL(gdt, SYM_L_LOCAL, gdt_end) +SYM_DATA_START(boot_idt_desc) +	.word	boot_idt_end - boot_idt - 1 +	.quad	0 +SYM_DATA_END(boot_idt_desc) +	.balign 8 +SYM_DATA_START(boot_idt) +	.rept	BOOT_IDT_ENTRIES +	.quad	0 +	.quad	0 +	.endr +SYM_DATA_END_LABEL(boot_idt, SYM_L_GLOBAL, boot_idt_end) +  #ifdef CONFIG_EFI_STUB  SYM_DATA(image_offset, .long 0)  #endif -  #ifdef CONFIG_EFI_MIXED  SYM_DATA_LOCAL(efi32_boot_args, .long 0, 0, 0)  SYM_DATA(efi_is64, .byte 1) diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c new file mode 100644 index 000000000000..063a60edcf99 --- /dev/null +++ b/arch/x86/boot/compressed/ident_map_64.c @@ -0,0 +1,349 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This code is used on x86_64 to create page table identity mappings on + * demand by building up a new set of page tables (or appending to the + * existing ones), and then switching over to them when ready. + * + * Copyright (C) 2015-2016  Yinghai Lu + * Copyright (C)      2016  Kees Cook + */ + +/* + * Since we're dealing with identity mappings, physical and virtual + * addresses are the same, so override these defines which are ultimately + * used by the headers in misc.h. + */ +#define __pa(x)  ((unsigned long)(x)) +#define __va(x)  ((void *)((unsigned long)(x))) + +/* No PAGE_TABLE_ISOLATION support needed either: */ +#undef CONFIG_PAGE_TABLE_ISOLATION + +#include "error.h" +#include "misc.h" + +/* These actually do the work of building the kernel identity maps. */ +#include <linux/pgtable.h> +#include <asm/cmpxchg.h> +#include <asm/trap_pf.h> +#include <asm/trapnr.h> +#include <asm/init.h> +/* Use the static base for this part of the boot process */ +#undef __PAGE_OFFSET +#define __PAGE_OFFSET __PAGE_OFFSET_BASE +#include "../../mm/ident_map.c" + +#ifdef CONFIG_X86_5LEVEL +unsigned int __pgtable_l5_enabled; +unsigned int pgdir_shift = 39; +unsigned int ptrs_per_p4d = 1; +#endif + +/* Used by PAGE_KERN* macros: */ +pteval_t __default_kernel_pte_mask __read_mostly = ~0; + +/* Used to track our page table allocation area. */ +struct alloc_pgt_data { +	unsigned char *pgt_buf; +	unsigned long pgt_buf_size; +	unsigned long pgt_buf_offset; +}; + +/* + * Allocates space for a page table entry, using struct alloc_pgt_data + * above. Besides the local callers, this is used as the allocation + * callback in mapping_info below. + */ +static void *alloc_pgt_page(void *context) +{ +	struct alloc_pgt_data *pages = (struct alloc_pgt_data *)context; +	unsigned char *entry; + +	/* Validate there is space available for a new page. 
*/ +	if (pages->pgt_buf_offset >= pages->pgt_buf_size) { +		debug_putstr("out of pgt_buf in " __FILE__ "!?\n"); +		debug_putaddr(pages->pgt_buf_offset); +		debug_putaddr(pages->pgt_buf_size); +		return NULL; +	} + +	entry = pages->pgt_buf + pages->pgt_buf_offset; +	pages->pgt_buf_offset += PAGE_SIZE; + +	return entry; +} + +/* Used to track our allocated page tables. */ +static struct alloc_pgt_data pgt_data; + +/* The top level page table entry pointer. */ +static unsigned long top_level_pgt; + +phys_addr_t physical_mask = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; + +/* + * Mapping information structure passed to kernel_ident_mapping_init(). + * Due to relocation, pointers must be assigned at run time not build time. + */ +static struct x86_mapping_info mapping_info; + +/* + * Adds the specified range to the identity mappings. + */ +static void add_identity_map(unsigned long start, unsigned long end) +{ +	int ret; + +	/* Align boundary to 2M. */ +	start = round_down(start, PMD_SIZE); +	end = round_up(end, PMD_SIZE); +	if (start >= end) +		return; + +	/* Build the mapping. */ +	ret = kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt, start, end); +	if (ret) +		error("Error: kernel_ident_mapping_init() failed\n"); +} + +/* Locates and clears a region for a new top level page table. */ +void initialize_identity_maps(void) +{ +	/* Exclude the encryption mask from __PHYSICAL_MASK */ +	physical_mask &= ~sme_me_mask; + +	/* Init mapping_info with run-time function/buffer pointers. */ +	mapping_info.alloc_pgt_page = alloc_pgt_page; +	mapping_info.context = &pgt_data; +	mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask; +	mapping_info.kernpg_flag = _KERNPG_TABLE; + +	/* +	 * It should be impossible for this not to already be true, +	 * but since calling this a second time would rewind the other +	 * counters, let's just make sure this is reset too. +	 */ +	pgt_data.pgt_buf_offset = 0; + +	/* +	 * If we came here via startup_32(), cr3 will be _pgtable already +	 * and we must append to the existing area instead of entirely +	 * overwriting it. +	 * +	 * With 5-level paging, we use '_pgtable' to allocate the p4d page table, +	 * the top-level page table is allocated separately. +	 * +	 * p4d_offset(top_level_pgt, 0) would cover both the 4- and 5-level +	 * cases. On 4-level paging it's equal to 'top_level_pgt'. +	 */ +	top_level_pgt = read_cr3_pa(); +	if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) { +		pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE; +		pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE; +		memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); +	} else { +		pgt_data.pgt_buf = _pgtable; +		pgt_data.pgt_buf_size = BOOT_PGT_SIZE; +		memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); +		top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data); +	} + +	/* +	 * New page-table is set up - map the kernel image and load it +	 * into cr3. +	 */ +	add_identity_map((unsigned long)_head, (unsigned long)_end); +	write_cr3(top_level_pgt); +} + +/* + * This switches the page tables to the new level4 that has been built + * via calls to add_identity_map() above. If booted via startup_32(), + * this is effectively a no-op. 
+ */ +void finalize_identity_maps(void) +{ +	write_cr3(top_level_pgt); +} + +static pte_t *split_large_pmd(struct x86_mapping_info *info, +			      pmd_t *pmdp, unsigned long __address) +{ +	unsigned long page_flags; +	unsigned long address; +	pte_t *pte; +	pmd_t pmd; +	int i; + +	pte = (pte_t *)info->alloc_pgt_page(info->context); +	if (!pte) +		return NULL; + +	address     = __address & PMD_MASK; +	/* No large page - clear PSE flag */ +	page_flags  = info->page_flag & ~_PAGE_PSE; + +	/* Populate the PTEs */ +	for (i = 0; i < PTRS_PER_PMD; i++) { +		set_pte(&pte[i], __pte(address | page_flags)); +		address += PAGE_SIZE; +	} + +	/* +	 * Ideally we need to clear the large PMD first and do a TLB +	 * flush before we write the new PMD. But the 2M range of the +	 * PMD might contain the code we execute and/or the stack +	 * we are on, so we can't do that. But that should be safe here +	 * because we are going from large to small mappings and we are +	 * also the only user of the page-table, so there is no chance +	 * of a TLB multihit. +	 */ +	pmd = __pmd((unsigned long)pte | info->kernpg_flag); +	set_pmd(pmdp, pmd); +	/* Flush TLB to establish the new PMD */ +	write_cr3(top_level_pgt); + +	return pte + pte_index(__address); +} + +static void clflush_page(unsigned long address) +{ +	unsigned int flush_size; +	char *cl, *start, *end; + +	/* +	 * Hardcode cl-size to 64 - CPUID can't be used here because that might +	 * cause another #VC exception and the GHCB is not ready to use yet. +	 */ +	flush_size = 64; +	start      = (char *)(address & PAGE_MASK); +	end        = start + PAGE_SIZE; + +	/* +	 * First make sure there are no pending writes on the cache-lines to +	 * flush. +	 */ +	asm volatile("mfence" : : : "memory"); + +	for (cl = start; cl != end; cl += flush_size) +		clflush(cl); +} + +static int set_clr_page_flags(struct x86_mapping_info *info, +			      unsigned long address, +			      pteval_t set, pteval_t clr) +{ +	pgd_t *pgdp = (pgd_t *)top_level_pgt; +	p4d_t *p4dp; +	pud_t *pudp; +	pmd_t *pmdp; +	pte_t *ptep, pte; + +	/* +	 * First make sure there is a PMD mapping for 'address'. +	 * It should already exist, but keep things generic. +	 * +	 * To map the page just read from it and fault it in if there is no +	 * mapping yet. add_identity_map() can't be called here because that +	 * would unconditionally map the address on PMD level, destroying any +	 * PTE-level mappings that might already exist. Use assembly here so +	 * the access won't be optimized away. +	 */ +	asm volatile("mov %[address], %%r9" +		     :: [address] "g" (*(unsigned long *)address) +		     : "r9", "memory"); + +	/* +	 * The page is mapped at least with PMD size - so skip checks and walk +	 * directly to the PMD. +	 */ +	p4dp = p4d_offset(pgdp, address); +	pudp = pud_offset(p4dp, address); +	pmdp = pmd_offset(pudp, address); + +	if (pmd_large(*pmdp)) +		ptep = split_large_pmd(info, pmdp, address); +	else +		ptep = pte_offset_kernel(pmdp, address); + +	if (!ptep) +		return -ENOMEM; + +	/* +	 * Changing encryption attributes of a page requires to flush it from +	 * the caches. 
+	 */ +	if ((set | clr) & _PAGE_ENC) +		clflush_page(address); + +	/* Update PTE */ +	pte = *ptep; +	pte = pte_set_flags(pte, set); +	pte = pte_clear_flags(pte, clr); +	set_pte(ptep, pte); + +	/* Flush TLB after changing encryption attribute */ +	write_cr3(top_level_pgt); + +	return 0; +} + +int set_page_decrypted(unsigned long address) +{ +	return set_clr_page_flags(&mapping_info, address, 0, _PAGE_ENC); +} + +int set_page_encrypted(unsigned long address) +{ +	return set_clr_page_flags(&mapping_info, address, _PAGE_ENC, 0); +} + +int set_page_non_present(unsigned long address) +{ +	return set_clr_page_flags(&mapping_info, address, 0, _PAGE_PRESENT); +} + +static void do_pf_error(const char *msg, unsigned long error_code, +			unsigned long address, unsigned long ip) +{ +	error_putstr(msg); + +	error_putstr("\nError Code: "); +	error_puthex(error_code); +	error_putstr("\nCR2: 0x"); +	error_puthex(address); +	error_putstr("\nRIP relative to _head: 0x"); +	error_puthex(ip - (unsigned long)_head); +	error_putstr("\n"); + +	error("Stopping.\n"); +} + +void do_boot_page_fault(struct pt_regs *regs, unsigned long error_code) +{ +	unsigned long address = native_read_cr2(); +	unsigned long end; +	bool ghcb_fault; + +	ghcb_fault = sev_es_check_ghcb_fault(address); + +	address   &= PMD_MASK; +	end        = address + PMD_SIZE; + +	/* +	 * Check for unexpected error codes. Unexpected are: +	 *	- Faults on present pages +	 *	- User faults +	 *	- Reserved bits set +	 */ +	if (error_code & (X86_PF_PROT | X86_PF_USER | X86_PF_RSVD)) +		do_pf_error("Unexpected page-fault:", error_code, address, regs->ip); +	else if (ghcb_fault) +		do_pf_error("Page-fault on GHCB page:", error_code, address, regs->ip); + +	/* +	 * Error code is sane - now identity map the 2M region around +	 * the faulting address. 
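As a standalone sketch of the arithmetic described above (illustration only, not part of this patch; it assumes 2 MiB PMD pages and uses an arbitrary example address), the faulting address is rounded down to its PMD boundary and the whole 2 MiB region around it is identity-mapped:

#include <stdio.h>

#define PMD_SIZE	(2UL * 1024 * 1024)	/* one 2 MiB large page */
#define PMD_MASK	(~(PMD_SIZE - 1))

int main(void)
{
	unsigned long cr2   = 0x12345678UL;	/* example faulting address */
	unsigned long start = cr2 & PMD_MASK;	/* round down to the PMD boundary */
	unsigned long end   = start + PMD_SIZE;	/* cover one full 2 MiB region */

	/* the boot #PF handler then calls add_identity_map() for this region */
	printf("#PF at %#lx -> identity-map [%#lx, %#lx)\n", cr2, start, end);
	return 0;
}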
+	 */ +	add_identity_map(address, end); +} diff --git a/arch/x86/boot/compressed/idt_64.c b/arch/x86/boot/compressed/idt_64.c new file mode 100644 index 000000000000..804a502ee0d2 --- /dev/null +++ b/arch/x86/boot/compressed/idt_64.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <asm/trap_pf.h> +#include <asm/segment.h> +#include <asm/trapnr.h> +#include "misc.h" + +static void set_idt_entry(int vector, void (*handler)(void)) +{ +	unsigned long address = (unsigned long)handler; +	gate_desc entry; + +	memset(&entry, 0, sizeof(entry)); + +	entry.offset_low    = (u16)(address & 0xffff); +	entry.segment       = __KERNEL_CS; +	entry.bits.type     = GATE_TRAP; +	entry.bits.p        = 1; +	entry.offset_middle = (u16)((address >> 16) & 0xffff); +	entry.offset_high   = (u32)(address >> 32); + +	memcpy(&boot_idt[vector], &entry, sizeof(entry)); +} + +/* Have this here so we don't need to include <asm/desc.h> */ +static void load_boot_idt(const struct desc_ptr *dtr) +{ +	asm volatile("lidt %0"::"m" (*dtr)); +} + +/* Setup IDT before kernel jumping to  .Lrelocated */ +void load_stage1_idt(void) +{ +	boot_idt_desc.address = (unsigned long)boot_idt; + + +	if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) +		set_idt_entry(X86_TRAP_VC, boot_stage1_vc); + +	load_boot_idt(&boot_idt_desc); +} + +/* Setup IDT after kernel jumping to  .Lrelocated */ +void load_stage2_idt(void) +{ +	boot_idt_desc.address = (unsigned long)boot_idt; + +	set_idt_entry(X86_TRAP_PF, boot_page_fault); + +#ifdef CONFIG_AMD_MEM_ENCRYPT +	set_idt_entry(X86_TRAP_VC, boot_stage2_vc); +#endif + +	load_boot_idt(&boot_idt_desc); +} diff --git a/arch/x86/boot/compressed/idt_handlers_64.S b/arch/x86/boot/compressed/idt_handlers_64.S new file mode 100644 index 000000000000..22890e199f5b --- /dev/null +++ b/arch/x86/boot/compressed/idt_handlers_64.S @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Early IDT handler entry points + * + * Copyright (C) 2019 SUSE + * + * Author: Joerg Roedel <jroedel@suse.de> + */ + +#include <asm/segment.h> + +/* For ORIG_RAX */ +#include "../../entry/calling.h" + +.macro EXCEPTION_HANDLER name function error_code=0 +SYM_FUNC_START(\name) + +	/* Build pt_regs */ +	.if \error_code == 0 +	pushq   $0 +	.endif + +	pushq   %rdi +	pushq   %rsi +	pushq   %rdx +	pushq   %rcx +	pushq   %rax +	pushq   %r8 +	pushq   %r9 +	pushq   %r10 +	pushq   %r11 +	pushq   %rbx +	pushq   %rbp +	pushq   %r12 +	pushq   %r13 +	pushq   %r14 +	pushq   %r15 + +	/* Call handler with pt_regs */ +	movq    %rsp, %rdi +	/* Error code is second parameter */ +	movq	ORIG_RAX(%rsp), %rsi +	call    \function + +	/* Restore regs */ +	popq    %r15 +	popq    %r14 +	popq    %r13 +	popq    %r12 +	popq    %rbp +	popq    %rbx +	popq    %r11 +	popq    %r10 +	popq    %r9 +	popq    %r8 +	popq    %rax +	popq    %rcx +	popq    %rdx +	popq    %rsi +	popq    %rdi + +	/* Remove error code and return */ +	addq    $8, %rsp + +	iretq +SYM_FUNC_END(\name) +	.endm + +	.text +	.code64 + +EXCEPTION_HANDLER	boot_page_fault do_boot_page_fault error_code=1 + +#ifdef CONFIG_AMD_MEM_ENCRYPT +EXCEPTION_HANDLER	boot_stage1_vc do_vc_no_ghcb		error_code=1 +EXCEPTION_HANDLER	boot_stage2_vc do_boot_stage2_vc	error_code=1 +#endif diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 877970d76249..b59547ce5b19 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -40,17 +40,8 @@  #include <asm/setup.h>	/* For COMMAND_LINE_SIZE */  #undef _SETUP -#ifdef CONFIG_X86_5LEVEL -unsigned int 
__pgtable_l5_enabled; -unsigned int pgdir_shift __ro_after_init = 39; -unsigned int ptrs_per_p4d __ro_after_init = 1; -#endif -  extern unsigned long get_cmd_line_ptr(void); -/* Used by PAGE_KERN* macros: */ -pteval_t __default_kernel_pte_mask __read_mostly = ~0; -  /* Simplified build-specific string for starting entropy. */  static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@"  		LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; @@ -406,8 +397,6 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size,  	 */  	mem_avoid[MEM_AVOID_ZO_RANGE].start = input;  	mem_avoid[MEM_AVOID_ZO_RANGE].size = (output + init_size) - input; -	add_identity_map(mem_avoid[MEM_AVOID_ZO_RANGE].start, -			 mem_avoid[MEM_AVOID_ZO_RANGE].size);  	/* Avoid initrd. */  	initrd_start  = (u64)boot_params->ext_ramdisk_image << 32; @@ -425,15 +414,11 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size,  		cmd_line_size = strnlen((char *)cmd_line, COMMAND_LINE_SIZE-1) + 1;  		mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line;  		mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size; -		add_identity_map(mem_avoid[MEM_AVOID_CMDLINE].start, -				 mem_avoid[MEM_AVOID_CMDLINE].size);  	}  	/* Avoid boot parameters. */  	mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params;  	mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params); -	add_identity_map(mem_avoid[MEM_AVOID_BOOTPARAMS].start, -			 mem_avoid[MEM_AVOID_BOOTPARAMS].size);  	/* We don't need to set a mapping for setup_data. */ @@ -442,11 +427,6 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size,  	/* Enumerate the immovable memory regions */  	num_immovable_mem = count_immovable_mem_regions(); - -#ifdef CONFIG_X86_VERBOSE_BOOTUP -	/* Make sure video RAM can be used. */ -	add_identity_map(0, PMD_SIZE); -#endif  }  /* @@ -870,9 +850,6 @@ void choose_random_location(unsigned long input,  	boot_params->hdr.loadflags |= KASLR_FLAG; -	/* Prepare to add new identity pagetables on demand. */ -	initialize_identity_maps(); -  	if (IS_ENABLED(CONFIG_X86_32))  		mem_limit = KERNEL_IMAGE_SIZE;  	else @@ -896,19 +873,8 @@ void choose_random_location(unsigned long input,  		warn("Physical KASLR disabled: no suitable memory region!");  	} else {  		/* Update the new physical address location. */ -		if (*output != random_addr) { -			add_identity_map(random_addr, output_size); +		if (*output != random_addr)  			*output = random_addr; -		} - -		/* -		 * This loads the identity mapping page table. -		 * This should only be done if a new physical address -		 * is found for the kernel, otherwise we should keep -		 * the old page table to make it be like the "nokaslr" -		 * case. -		 */ -		finalize_identity_maps();  	} diff --git a/arch/x86/boot/compressed/kaslr_64.c b/arch/x86/boot/compressed/kaslr_64.c deleted file mode 100644 index f9c5c13d979b..000000000000 --- a/arch/x86/boot/compressed/kaslr_64.c +++ /dev/null @@ -1,153 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * This code is used on x86_64 to create page table identity mappings on - * demand by building up a new set of page tables (or appending to the - * existing ones), and then switching over to them when ready. - * - * Copyright (C) 2015-2016  Yinghai Lu - * Copyright (C)      2016  Kees Cook - */ - -/* - * Since we're dealing with identity mappings, physical and virtual - * addresses are the same, so override these defines which are ultimately - * used by the headers in misc.h. 
- */ -#define __pa(x)  ((unsigned long)(x)) -#define __va(x)  ((void *)((unsigned long)(x))) - -/* No PAGE_TABLE_ISOLATION support needed either: */ -#undef CONFIG_PAGE_TABLE_ISOLATION - -#include "misc.h" - -/* These actually do the work of building the kernel identity maps. */ -#include <linux/pgtable.h> -#include <asm/init.h> -/* Use the static base for this part of the boot process */ -#undef __PAGE_OFFSET -#define __PAGE_OFFSET __PAGE_OFFSET_BASE -#include "../../mm/ident_map.c" - -/* Used to track our page table allocation area. */ -struct alloc_pgt_data { -	unsigned char *pgt_buf; -	unsigned long pgt_buf_size; -	unsigned long pgt_buf_offset; -}; - -/* - * Allocates space for a page table entry, using struct alloc_pgt_data - * above. Besides the local callers, this is used as the allocation - * callback in mapping_info below. - */ -static void *alloc_pgt_page(void *context) -{ -	struct alloc_pgt_data *pages = (struct alloc_pgt_data *)context; -	unsigned char *entry; - -	/* Validate there is space available for a new page. */ -	if (pages->pgt_buf_offset >= pages->pgt_buf_size) { -		debug_putstr("out of pgt_buf in " __FILE__ "!?\n"); -		debug_putaddr(pages->pgt_buf_offset); -		debug_putaddr(pages->pgt_buf_size); -		return NULL; -	} - -	entry = pages->pgt_buf + pages->pgt_buf_offset; -	pages->pgt_buf_offset += PAGE_SIZE; - -	return entry; -} - -/* Used to track our allocated page tables. */ -static struct alloc_pgt_data pgt_data; - -/* The top level page table entry pointer. */ -static unsigned long top_level_pgt; - -phys_addr_t physical_mask = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; - -/* - * Mapping information structure passed to kernel_ident_mapping_init(). - * Due to relocation, pointers must be assigned at run time not build time. - */ -static struct x86_mapping_info mapping_info; - -/* Locates and clears a region for a new top level page table. */ -void initialize_identity_maps(void) -{ -	/* If running as an SEV guest, the encryption mask is required. */ -	set_sev_encryption_mask(); - -	/* Exclude the encryption mask from __PHYSICAL_MASK */ -	physical_mask &= ~sme_me_mask; - -	/* Init mapping_info with run-time function/buffer pointers. */ -	mapping_info.alloc_pgt_page = alloc_pgt_page; -	mapping_info.context = &pgt_data; -	mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask; -	mapping_info.kernpg_flag = _KERNPG_TABLE; - -	/* -	 * It should be impossible for this not to already be true, -	 * but since calling this a second time would rewind the other -	 * counters, let's just make sure this is reset too. -	 */ -	pgt_data.pgt_buf_offset = 0; - -	/* -	 * If we came here via startup_32(), cr3 will be _pgtable already -	 * and we must append to the existing area instead of entirely -	 * overwriting it. -	 * -	 * With 5-level paging, we use '_pgtable' to allocate the p4d page table, -	 * the top-level page table is allocated separately. -	 * -	 * p4d_offset(top_level_pgt, 0) would cover both the 4- and 5-level -	 * cases. On 4-level paging it's equal to 'top_level_pgt'. 
-	 */ -	top_level_pgt = read_cr3_pa(); -	if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) { -		debug_putstr("booted via startup_32()\n"); -		pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE; -		pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE; -		memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); -	} else { -		debug_putstr("booted via startup_64()\n"); -		pgt_data.pgt_buf = _pgtable; -		pgt_data.pgt_buf_size = BOOT_PGT_SIZE; -		memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); -		top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data); -	} -} - -/* - * Adds the specified range to what will become the new identity mappings. - * Once all ranges have been added, the new mapping is activated by calling - * finalize_identity_maps() below. - */ -void add_identity_map(unsigned long start, unsigned long size) -{ -	unsigned long end = start + size; - -	/* Align boundary to 2M. */ -	start = round_down(start, PMD_SIZE); -	end = round_up(end, PMD_SIZE); -	if (start >= end) -		return; - -	/* Build the mapping. */ -	kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt, -				  start, end); -} - -/* - * This switches the page tables to the new level4 that has been built - * via calls to add_identity_map() above. If booted via startup_32(), - * this is effectively a no-op. - */ -void finalize_identity_maps(void) -{ -	write_cr3(top_level_pgt); -} diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index e478e40fbe5a..267e7f93050e 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -442,6 +442,13 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,  	parse_elf(output);  	handle_relocations(output, output_len, virt_addr);  	debug_putstr("done.\nBooting the kernel.\n"); + +	/* +	 * Flush GHCB from cache and map it encrypted again when running as +	 * SEV-ES guest. 
+	 */ +	sev_es_shutdown_ghcb(); +  	return output;  } diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 3efce27ba35c..6d31f1b4c4d1 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -23,6 +23,7 @@  #include <asm/page.h>  #include <asm/boot.h>  #include <asm/bootparam.h> +#include <asm/desc_defs.h>  #define BOOT_CTYPE_H  #include <linux/acpi.h> @@ -36,6 +37,9 @@  #define memptr unsigned  #endif +/* boot/compressed/vmlinux start and end markers */ +extern char _head[], _end[]; +  /* misc.c */  extern memptr free_mem_ptr;  extern memptr free_mem_end_ptr; @@ -81,8 +85,6 @@ void choose_random_location(unsigned long input,  			    unsigned long *output,  			    unsigned long output_size,  			    unsigned long *virt_addr); -/* cpuflags.c */ -bool has_cpuflag(int flag);  #else  static inline void choose_random_location(unsigned long input,  					  unsigned long input_size, @@ -93,18 +95,14 @@ static inline void choose_random_location(unsigned long input,  }  #endif +/* cpuflags.c */ +bool has_cpuflag(int flag); +  #ifdef CONFIG_X86_64 -void initialize_identity_maps(void); -void add_identity_map(unsigned long start, unsigned long size); -void finalize_identity_maps(void); +extern int set_page_decrypted(unsigned long address); +extern int set_page_encrypted(unsigned long address); +extern int set_page_non_present(unsigned long address);  extern unsigned char _pgtable[]; -#else -static inline void initialize_identity_maps(void) -{ } -static inline void add_identity_map(unsigned long start, unsigned long size) -{ } -static inline void finalize_identity_maps(void) -{ }  #endif  #ifdef CONFIG_EARLY_PRINTK @@ -119,6 +117,17 @@ static inline void console_init(void)  void set_sev_encryption_mask(void); +#ifdef CONFIG_AMD_MEM_ENCRYPT +void sev_es_shutdown_ghcb(void); +extern bool sev_es_check_ghcb_fault(unsigned long address); +#else +static inline void sev_es_shutdown_ghcb(void) { } +static inline bool sev_es_check_ghcb_fault(unsigned long address) +{ +	return false; +} +#endif +  /* acpi.c */  #ifdef CONFIG_ACPI  acpi_physical_address get_rsdp_addr(void); @@ -133,4 +142,21 @@ int count_immovable_mem_regions(void);  static inline int count_immovable_mem_regions(void) { return 0; }  #endif +/* ident_map_64.c */ +#ifdef CONFIG_X86_5LEVEL +extern unsigned int __pgtable_l5_enabled, pgdir_shift, ptrs_per_p4d; +#endif + +/* Used by PAGE_KERN* macros: */ +extern pteval_t __default_kernel_pte_mask; + +/* idt_64.c */ +extern gate_desc boot_idt[BOOT_IDT_ENTRIES]; +extern struct desc_ptr boot_idt_desc; + +/* IDT Entry Points */ +void boot_page_fault(void); +void boot_stage1_vc(void); +void boot_stage2_vc(void); +  #endif /* BOOT_COMPRESSED_MISC_H */ diff --git a/arch/x86/boot/compressed/sev-es.c b/arch/x86/boot/compressed/sev-es.c new file mode 100644 index 000000000000..954cb2702e23 --- /dev/null +++ b/arch/x86/boot/compressed/sev-es.c @@ -0,0 +1,214 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AMD Encrypted Register State Support + * + * Author: Joerg Roedel <jroedel@suse.de> + */ + +/* + * misc.h needs to be first because it knows how to include the other kernel + * headers in the pre-decompression code in a way that does not break + * compilation. 
+ */ +#include "misc.h" + +#include <asm/pgtable_types.h> +#include <asm/sev-es.h> +#include <asm/trapnr.h> +#include <asm/trap_pf.h> +#include <asm/msr-index.h> +#include <asm/fpu/xcr.h> +#include <asm/ptrace.h> +#include <asm/svm.h> + +#include "error.h" + +struct ghcb boot_ghcb_page __aligned(PAGE_SIZE); +struct ghcb *boot_ghcb; + +/* + * Copy a version of this function here - insn-eval.c can't be used in + * pre-decompression code. + */ +static bool insn_has_rep_prefix(struct insn *insn) +{ +	int i; + +	insn_get_prefixes(insn); + +	for (i = 0; i < insn->prefixes.nbytes; i++) { +		insn_byte_t p = insn->prefixes.bytes[i]; + +		if (p == 0xf2 || p == 0xf3) +			return true; +	} + +	return false; +} + +/* + * Only a dummy for insn_get_seg_base() - Early boot-code is 64bit only and + * doesn't use segments. + */ +static unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx) +{ +	return 0UL; +} + +static inline u64 sev_es_rd_ghcb_msr(void) +{ +	unsigned long low, high; + +	asm volatile("rdmsr" : "=a" (low), "=d" (high) : +			"c" (MSR_AMD64_SEV_ES_GHCB)); + +	return ((high << 32) | low); +} + +static inline void sev_es_wr_ghcb_msr(u64 val) +{ +	u32 low, high; + +	low  = val & 0xffffffffUL; +	high = val >> 32; + +	asm volatile("wrmsr" : : "c" (MSR_AMD64_SEV_ES_GHCB), +			"a"(low), "d" (high) : "memory"); +} + +static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) +{ +	char buffer[MAX_INSN_SIZE]; +	enum es_result ret; + +	memcpy(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); + +	insn_init(&ctxt->insn, buffer, MAX_INSN_SIZE, 1); +	insn_get_length(&ctxt->insn); + +	ret = ctxt->insn.immediate.got ? ES_OK : ES_DECODE_FAILED; + +	return ret; +} + +static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, +				   void *dst, char *buf, size_t size) +{ +	memcpy(dst, buf, size); + +	return ES_OK; +} + +static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, +				  void *src, char *buf, size_t size) +{ +	memcpy(buf, src, size); + +	return ES_OK; +} + +#undef __init +#undef __pa +#define __init +#define __pa(x)	((unsigned long)(x)) + +#define __BOOT_COMPRESSED + +/* Basic instruction decoding support needed */ +#include "../../lib/inat.c" +#include "../../lib/insn.c" + +/* Include code for early handlers */ +#include "../../kernel/sev-es-shared.c" + +static bool early_setup_sev_es(void) +{ +	if (!sev_es_negotiate_protocol()) +		sev_es_terminate(GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED); + +	if (set_page_decrypted((unsigned long)&boot_ghcb_page)) +		return false; + +	/* Page is now mapped decrypted, clear it */ +	memset(&boot_ghcb_page, 0, sizeof(boot_ghcb_page)); + +	boot_ghcb = &boot_ghcb_page; + +	/* Initialize lookup tables for the instruction decoder */ +	inat_init_tables(); + +	return true; +} + +void sev_es_shutdown_ghcb(void) +{ +	if (!boot_ghcb) +		return; + +	if (!sev_es_check_cpu_features()) +		error("SEV-ES CPU Features missing."); + +	/* +	 * GHCB Page must be flushed from the cache and mapped encrypted again. +	 * Otherwise the running kernel will see strange cache effects when +	 * trying to use that page. +	 */ +	if (set_page_encrypted((unsigned long)&boot_ghcb_page)) +		error("Can't map GHCB page encrypted"); + +	/* +	 * GHCB page is mapped encrypted again and flushed from the cache. +	 * Mark it non-present now to catch bugs when #VC exceptions trigger +	 * after this point. 
+	 */ +	if (set_page_non_present((unsigned long)&boot_ghcb_page)) +		error("Can't unmap GHCB page"); +} + +bool sev_es_check_ghcb_fault(unsigned long address) +{ +	/* Check whether the fault was on the GHCB page */ +	return ((address & PAGE_MASK) == (unsigned long)&boot_ghcb_page); +} + +void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code) +{ +	struct es_em_ctxt ctxt; +	enum es_result result; + +	if (!boot_ghcb && !early_setup_sev_es()) +		sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); + +	vc_ghcb_invalidate(boot_ghcb); +	result = vc_init_em_ctxt(&ctxt, regs, exit_code); +	if (result != ES_OK) +		goto finish; + +	switch (exit_code) { +	case SVM_EXIT_RDTSC: +	case SVM_EXIT_RDTSCP: +		result = vc_handle_rdtsc(boot_ghcb, &ctxt, exit_code); +		break; +	case SVM_EXIT_IOIO: +		result = vc_handle_ioio(boot_ghcb, &ctxt); +		break; +	case SVM_EXIT_CPUID: +		result = vc_handle_cpuid(boot_ghcb, &ctxt); +		break; +	default: +		result = ES_UNSUPPORTED; +		break; +	} + +finish: +	if (result == ES_OK) { +		vc_finish_insn(&ctxt); +	} else if (result != ES_RETRY) { +		/* +		 * For now, just halt the machine. That makes debugging easier, +		 * later we just call sev_es_terminate() here. +		 */ +		while (true) +			asm volatile("hlt\n"); +	} +} diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 826e73488308..cad08703c4ad 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -101,6 +101,8 @@ SYM_CODE_START(entry_SYSCALL_64)  	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp  	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp +SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL) +  	/* Construct struct pt_regs on stack */  	pushq	$__USER_DS				/* pt_regs->ss */  	pushq	PER_CPU_VAR(cpu_tss_rw + TSS_sp2)	/* pt_regs->sp */ @@ -446,6 +448,84 @@ _ASM_NOKPROBE(\asmsym)  SYM_CODE_END(\asmsym)  .endm +#ifdef CONFIG_AMD_MEM_ENCRYPT +/** + * idtentry_vc - Macro to generate entry stub for #VC + * @vector:		Vector number + * @asmsym:		ASM symbol for the entry point + * @cfunc:		C function to be called + * + * The macro emits code to set up the kernel context for #VC. The #VC handler + * runs on an IST stack and needs to be able to cause nested #VC exceptions. + * + * To make this work the #VC entry code tries its best to pretend it doesn't use + * an IST stack by switching to the task stack if coming from user-space (which + * includes early SYSCALL entry path) or back to the stack in the IRET frame if + * entered from kernel-mode. + * + * If entered from kernel-mode the return stack is validated first, and if it is + * not safe to use (e.g. because it points to the entry stack) the #VC handler + * will switch to a fall-back stack (VC2) and call a special handler function. + * + * The macro is only used for one vector, but it is planned to be extended in + * the future for the #HV exception. + */ +.macro idtentry_vc vector asmsym cfunc +SYM_CODE_START(\asmsym) +	UNWIND_HINT_IRET_REGS +	ASM_CLAC + +	/* +	 * If the entry is from userspace, switch stacks and treat it as +	 * a normal entry. +	 */ +	testb	$3, CS-ORIG_RAX(%rsp) +	jnz	.Lfrom_usermode_switch_stack_\@ + +	/* +	 * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX. +	 * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS +	 */ +	call	paranoid_entry + +	UNWIND_HINT_REGS + +	/* +	 * Switch off the IST stack to make it free for nested exceptions. The +	 * vc_switch_off_ist() function will switch back to the interrupted +	 * stack if it is safe to do so. If not it switches to the VC fall-back +	 * stack. 
+	 */ +	movq	%rsp, %rdi		/* pt_regs pointer */ +	call	vc_switch_off_ist +	movq	%rax, %rsp		/* Switch to new stack */ + +	UNWIND_HINT_REGS + +	/* Update pt_regs */ +	movq	ORIG_RAX(%rsp), %rsi	/* get error code into 2nd argument*/ +	movq	$-1, ORIG_RAX(%rsp)	/* no syscall to restart */ + +	movq	%rsp, %rdi		/* pt_regs pointer */ + +	call	\cfunc + +	/* +	 * No need to switch back to the IST stack. The current stack is either +	 * identical to the stack in the IRET frame or the VC fall-back stack, +	 * so it is definitly mapped even with PTI enabled. +	 */ +	jmp	paranoid_exit + +	/* Switch to the regular task stack */ +.Lfrom_usermode_switch_stack_\@: +	idtentry_body safe_stack_\cfunc, has_error_code=1 + +_ASM_NOKPROBE(\asmsym) +SYM_CODE_END(\asmsym) +.endm +#endif +  /*   * Double fault entry. Straight paranoid. No checks from which context   * this comes because for the espfix induced #DF this would do the wrong diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index 8902fdb7de13..3d52b094850a 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -11,25 +11,29 @@  #ifdef CONFIG_X86_64  /* Macro to enforce the same ordering and stack sizes */ -#define ESTACKS_MEMBERS(guardsize)		\ -	char	DF_stack_guard[guardsize];	\ -	char	DF_stack[EXCEPTION_STKSZ];	\ -	char	NMI_stack_guard[guardsize];	\ -	char	NMI_stack[EXCEPTION_STKSZ];	\ -	char	DB_stack_guard[guardsize];	\ -	char	DB_stack[EXCEPTION_STKSZ];	\ -	char	MCE_stack_guard[guardsize];	\ -	char	MCE_stack[EXCEPTION_STKSZ];	\ -	char	IST_top_guard[guardsize];	\ +#define ESTACKS_MEMBERS(guardsize, optional_stack_size)		\ +	char	DF_stack_guard[guardsize];			\ +	char	DF_stack[EXCEPTION_STKSZ];			\ +	char	NMI_stack_guard[guardsize];			\ +	char	NMI_stack[EXCEPTION_STKSZ];			\ +	char	DB_stack_guard[guardsize];			\ +	char	DB_stack[EXCEPTION_STKSZ];			\ +	char	MCE_stack_guard[guardsize];			\ +	char	MCE_stack[EXCEPTION_STKSZ];			\ +	char	VC_stack_guard[guardsize];			\ +	char	VC_stack[optional_stack_size];			\ +	char	VC2_stack_guard[guardsize];			\ +	char	VC2_stack[optional_stack_size];			\ +	char	IST_top_guard[guardsize];			\  /* The exception stacks' physical storage. No guard pages required */  struct exception_stacks { -	ESTACKS_MEMBERS(0) +	ESTACKS_MEMBERS(0, 0)  };  /* The effective cpu entry area mapping with guard pages. 
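To make the new 'optional_stack_size' parameter concrete, here is a hand-expanded sketch (illustration only, trimmed to the VC members; the other stacks are unchanged) of what the two users of ESTACKS_MEMBERS() turn into: zero-sized VC slots in the statically allocated backing store, and full-sized, guard-separated slots in the cpu-entry-area layout shown just below.

/* ESTACKS_MEMBERS(0, 0) -- physical backing store */
struct exception_stacks_sketch {
	/* ... DF/NMI/DB/MCE stacks as before ... */
	char	VC_stack_guard[0];
	char	VC_stack[0];
	char	VC2_stack_guard[0];
	char	VC2_stack[0];
	char	IST_top_guard[0];
};

/* ESTACKS_MEMBERS(PAGE_SIZE, EXCEPTION_STKSZ) -- cpu-entry-area mapping */
struct cea_exception_stacks_sketch {
	/* ... DF/NMI/DB/MCE stacks as before ... */
	char	VC_stack_guard[PAGE_SIZE];
	char	VC_stack[EXCEPTION_STKSZ];
	char	VC2_stack_guard[PAGE_SIZE];
	char	VC2_stack[EXCEPTION_STKSZ];
	char	IST_top_guard[PAGE_SIZE];
};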
*/  struct cea_exception_stacks { -	ESTACKS_MEMBERS(PAGE_SIZE) +	ESTACKS_MEMBERS(PAGE_SIZE, EXCEPTION_STKSZ)  };  /* @@ -40,6 +44,8 @@ enum exception_stack_ordering {  	ESTACK_NMI,  	ESTACK_DB,  	ESTACK_MCE, +	ESTACK_VC, +	ESTACK_VC2,  	N_EXCEPTION_STACKS  }; @@ -139,4 +145,7 @@ static inline struct entry_stack *cpu_entry_stack(int cpu)  #define __this_cpu_ist_top_va(name)					\  	CEA_ESTACK_TOP(__this_cpu_read(cea_exception_stacks), name) +#define __this_cpu_ist_bottom_va(name)					\ +	CEA_ESTACK_BOT(__this_cpu_read(cea_exception_stacks), name) +  #endif diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 7b0afd5e6c57..dad350d42ecf 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -236,6 +236,7 @@  #define X86_FEATURE_EPT_AD		( 8*32+17) /* Intel Extended Page Table access-dirty bit */  #define X86_FEATURE_VMCALL		( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */  #define X86_FEATURE_VMW_VMMCALL		( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */ +#define X86_FEATURE_SEV_ES		( 8*32+20) /* AMD Secure Encrypted Virtualization - Encrypted State */  /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */  #define X86_FEATURE_FSGSBASE		( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 1ced11d31932..476082a83d1c 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -383,6 +383,33 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit)  void alloc_intr_gate(unsigned int n, const void *addr); +static inline void init_idt_data(struct idt_data *data, unsigned int n, +				 const void *addr) +{ +	BUG_ON(n > 0xFF); + +	memset(data, 0, sizeof(*data)); +	data->vector	= n; +	data->addr	= addr; +	data->segment	= __KERNEL_CS; +	data->bits.type	= GATE_INTERRUPT; +	data->bits.p	= 1; +} + +static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d) +{ +	unsigned long addr = (unsigned long) d->addr; + +	gate->offset_low	= (u16) addr; +	gate->segment		= (u16) d->segment; +	gate->bits		= d->bits; +	gate->offset_middle	= (u16) (addr >> 16); +#ifdef CONFIG_X86_64 +	gate->offset_high	= (u32) (addr >> 32); +	gate->reserved		= 0; +#endif +} +  extern unsigned long system_vectors[];  extern void load_current_idt(void); diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h index a91f3b6e4f2a..f7e7099af595 100644 --- a/arch/x86/include/asm/desc_defs.h +++ b/arch/x86/include/asm/desc_defs.h @@ -74,6 +74,13 @@ struct idt_bits {  			p	: 1;  } __attribute__((packed)); +struct idt_data { +	unsigned int	vector; +	unsigned int	segment; +	struct idt_bits	bits; +	const void	*addr; +}; +  struct gate_struct {  	u16		offset_low;  	u16		segment; @@ -109,6 +116,9 @@ struct desc_ptr {  #endif /* !__ASSEMBLY__ */ +/* Boot IDT definitions */ +#define	BOOT_IDT_ENTRIES	32 +  /* Access rights as returned by LAR */  #define AR_TYPE_RODATA		(0 * (1 << 9))  #define AR_TYPE_RWDATA		(1 * (1 << 9)) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index eb1ed3bd8d96..8d33ad80704f 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -19,6 +19,7 @@  #include <asm/user.h>  #include <asm/fpu/api.h>  #include <asm/fpu/xstate.h> +#include <asm/fpu/xcr.h>  #include <asm/cpufeature.h>  #include <asm/trace/fpu.h> @@ -592,33 +593,4 @@ static inline void 
switch_fpu_finish(struct fpu *new_fpu)  	update_pasid();  } -/* - * MXCSR and XCR definitions: - */ - -static inline void ldmxcsr(u32 mxcsr) -{ -	asm volatile("ldmxcsr %0" :: "m" (mxcsr)); -} - -extern unsigned int mxcsr_feature_mask; - -#define XCR_XFEATURE_ENABLED_MASK	0x00000000 - -static inline u64 xgetbv(u32 index) -{ -	u32 eax, edx; - -	asm volatile("xgetbv" : "=a" (eax), "=d" (edx) : "c" (index)); -	return eax + ((u64)edx << 32); -} - -static inline void xsetbv(u32 index, u64 value) -{ -	u32 eax = value; -	u32 edx = value >> 32; - -	asm volatile("xsetbv" :: "a" (eax), "d" (edx), "c" (index)); -} -  #endif /* _ASM_X86_FPU_INTERNAL_H */ diff --git a/arch/x86/include/asm/fpu/xcr.h b/arch/x86/include/asm/fpu/xcr.h new file mode 100644 index 000000000000..1c7ab8d95da5 --- /dev/null +++ b/arch/x86/include/asm/fpu/xcr.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_FPU_XCR_H +#define _ASM_X86_FPU_XCR_H + +/* + * MXCSR and XCR definitions: + */ + +static inline void ldmxcsr(u32 mxcsr) +{ +	asm volatile("ldmxcsr %0" :: "m" (mxcsr)); +} + +extern unsigned int mxcsr_feature_mask; + +#define XCR_XFEATURE_ENABLED_MASK	0x00000000 + +static inline u64 xgetbv(u32 index) +{ +	u32 eax, edx; + +	asm volatile("xgetbv" : "=a" (eax), "=d" (edx) : "c" (index)); +	return eax + ((u64)edx << 32); +} + +static inline void xsetbv(u32 index, u64 value) +{ +	u32 eax = value; +	u32 edx = value >> 32; + +	asm volatile("xsetbv" :: "a" (eax), "d" (edx), "c" (index)); +} + +#endif /* _ASM_X86_FPU_XCR_H */ diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index cdd41d039cd1..b2442eb0ac2f 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -309,6 +309,19 @@ static __always_inline void __##func(struct pt_regs *regs)  	__visible void noist_##func(struct pt_regs *regs)  /** + * DECLARE_IDTENTRY_VC - Declare functions for the VC entry point + * @vector:	Vector number (ignored for C) + * @func:	Function name of the entry point + * + * Maps to DECLARE_IDTENTRY_RAW_ERRORCODE, but declares also the + * safe_stack C handler. + */ +#define DECLARE_IDTENTRY_VC(vector, func)				\ +	DECLARE_IDTENTRY_RAW_ERRORCODE(vector, func);			\ +	__visible noinstr void ist_##func(struct pt_regs *regs, unsigned long error_code);	\ +	__visible noinstr void safe_stack_##func(struct pt_regs *regs, unsigned long error_code) + +/**   * DEFINE_IDTENTRY_IST - Emit code for IST entry points   * @func:	Function name of the entry point   * @@ -347,6 +360,35 @@ static __always_inline void __##func(struct pt_regs *regs)  #define DEFINE_IDTENTRY_DF(func)					\  	DEFINE_IDTENTRY_RAW_ERRORCODE(func) +/** + * DEFINE_IDTENTRY_VC_SAFE_STACK - Emit code for VMM communication handler +				   which runs on a safe stack. 
+ * @func:	Function name of the entry point + * + * Maps to DEFINE_IDTENTRY_RAW_ERRORCODE + */ +#define DEFINE_IDTENTRY_VC_SAFE_STACK(func)				\ +	DEFINE_IDTENTRY_RAW_ERRORCODE(safe_stack_##func) + +/** + * DEFINE_IDTENTRY_VC_IST - Emit code for VMM communication handler +			    which runs on the VC fall-back stack + * @func:	Function name of the entry point + * + * Maps to DEFINE_IDTENTRY_RAW_ERRORCODE + */ +#define DEFINE_IDTENTRY_VC_IST(func)				\ +	DEFINE_IDTENTRY_RAW_ERRORCODE(ist_##func) + +/** + * DEFINE_IDTENTRY_VC - Emit code for VMM communication handler + * @func:	Function name of the entry point + * + * Maps to DEFINE_IDTENTRY_RAW_ERRORCODE + */ +#define DEFINE_IDTENTRY_VC(func)					\ +	DEFINE_IDTENTRY_RAW_ERRORCODE(func) +  #else	/* CONFIG_X86_64 */  /** @@ -433,6 +475,9 @@ __visible noinstr void func(struct pt_regs *regs,			\  # define DECLARE_IDTENTRY_XENCB(vector, func)				\  	DECLARE_IDTENTRY(vector, func) +# define DECLARE_IDTENTRY_VC(vector, func)				\ +	idtentry_vc vector asm_##func func +  #else  # define DECLARE_IDTENTRY_MCE(vector, func)				\  	DECLARE_IDTENTRY(vector, func) @@ -564,6 +609,11 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_DB,	xenpv_exc_debug);  /* #DF */  DECLARE_IDTENTRY_DF(X86_TRAP_DF,	exc_double_fault); +/* #VC */ +#ifdef CONFIG_AMD_MEM_ENCRYPT +DECLARE_IDTENTRY_VC(X86_TRAP_VC,	exc_vmm_communication); +#endif +  #ifdef CONFIG_XEN_PV  DECLARE_IDTENTRY_XENCB(X86_TRAP_OTHER,	exc_xen_hypervisor_callback);  #endif diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h index 2b6ccf2c49f1..a0f839aa144d 100644 --- a/arch/x86/include/asm/insn-eval.h +++ b/arch/x86/include/asm/insn-eval.h @@ -15,9 +15,15 @@  #define INSN_CODE_SEG_OPND_SZ(params) (params & 0xf)  #define INSN_CODE_SEG_PARAMS(oper_sz, addr_sz) (oper_sz | (addr_sz << 4)) +bool insn_has_rep_prefix(struct insn *insn);  void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs);  int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs); +int insn_get_modrm_reg_off(struct insn *insn, struct pt_regs *regs);  unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx);  int insn_get_code_seg_params(struct pt_regs *regs); +int insn_fetch_from_user(struct pt_regs *regs, +			 unsigned char buf[MAX_INSN_SIZE]); +bool insn_decode(struct insn *insn, struct pt_regs *regs, +		 unsigned char buf[MAX_INSN_SIZE], int buf_size);  #endif /* _ASM_X86_INSN_EVAL_H */ diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 5049f6c22683..c9f5df0a1c10 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -19,6 +19,7 @@  #ifdef CONFIG_AMD_MEM_ENCRYPT  extern u64 sme_me_mask; +extern u64 sev_status;  extern bool sev_enabled;  void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr, @@ -48,8 +49,10 @@ void __init mem_encrypt_free_decrypted_mem(void);  /* Architecture __weak replacement functions */  void __init mem_encrypt_init(void); +void __init sev_es_init_vc_handling(void);  bool sme_active(void);  bool sev_active(void); +bool sev_es_active(void);  #define __bss_decrypted __attribute__((__section__(".bss..decrypted"))) @@ -70,8 +73,10 @@ static inline void __init sme_early_init(void) { }  static inline void __init sme_encrypt_kernel(struct boot_params *bp) { }  static inline void __init sme_enable(struct boot_params *bp) { } +static inline void sev_es_init_vc_handling(void) { }  static inline bool sme_active(void) { return false; }  static inline bool sev_active(void) { return false; } +static 
inline bool sev_es_active(void) { return false; }  static inline int __init  early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; } diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index c07a70ce7ffd..972a34d93505 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -470,9 +470,12 @@  #define MSR_AMD64_ICIBSEXTDCTL		0xc001103c  #define MSR_AMD64_IBSOPDATA4		0xc001103d  #define MSR_AMD64_IBS_REG_COUNT_MAX	8 /* includes MSR_AMD64_IBSBRTARGET */ +#define MSR_AMD64_SEV_ES_GHCB		0xc0010130  #define MSR_AMD64_SEV			0xc0010131  #define MSR_AMD64_SEV_ENABLED_BIT	0 +#define MSR_AMD64_SEV_ES_ENABLED_BIT	1  #define MSR_AMD64_SEV_ENABLED		BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT) +#define MSR_AMD64_SEV_ES_ENABLED	BIT_ULL(MSR_AMD64_SEV_ES_ENABLED_BIT)  #define MSR_AMD64_VIRT_SPEC_CTRL	0xc001011f diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 288b065955b7..d0c6c10c18a0 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -28,6 +28,7 @@  #define	IST_INDEX_NMI		1  #define	IST_INDEX_DB		2  #define	IST_INDEX_MCE		3 +#define	IST_INDEX_VC		4  /*   * Set __PAGE_OFFSET to the most negative possible address + diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 5e0dcc20614d..a02c67291cfc 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -28,7 +28,7 @@  #include <asm-generic/pgtable_uffd.h>  extern pgd_t early_top_pgt[PTRS_PER_PGD]; -int __init __early_make_pgtable(unsigned long address, pmdval_t pmd); +bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd);  void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm);  void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm, diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index d8a82e650810..5ac507586769 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -696,6 +696,7 @@ extern void load_direct_gdt(int);  extern void load_fixmap_gdt(int);  extern void load_percpu_segment(int);  extern void cpu_init(void); +extern void cpu_init_exception_handling(void);  extern void cr4_init(void);  static inline unsigned long get_debugctlmsr(void) diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 28996fe19301..2c35f1c01a2d 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -10,6 +10,7 @@ void syscall_init(void);  #ifdef CONFIG_X86_64  void entry_SYSCALL_64(void); +void entry_SYSCALL_64_safe_stack(void);  long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2);  #endif diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index b35030eeec36..5db5d083c873 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -21,6 +21,9 @@ struct real_mode_header {  	/* SMP trampoline */  	u32	trampoline_start;  	u32	trampoline_header; +#ifdef CONFIG_AMD_MEM_ENCRYPT +	u32	sev_es_trampoline_start; +#endif  #ifdef CONFIG_X86_64  	u32	trampoline_pgd;  #endif @@ -57,6 +60,9 @@ extern unsigned char real_mode_blob_end[];  extern unsigned long initial_code;  extern unsigned long initial_gs;  extern unsigned long initial_stack; +#ifdef CONFIG_AMD_MEM_ENCRYPT +extern unsigned long initial_vc_handler; +#endif  extern unsigned char real_mode_blob[];  extern unsigned char real_mode_relocs[]; @@ -66,6 +72,7 @@ extern 
unsigned char startup_32_smp[];  extern unsigned char boot_gdt[];  #else  extern unsigned char secondary_startup_64[]; +extern unsigned char secondary_startup_64_no_verify[];  #endif  static inline size_t real_mode_size_needed(void) diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 517920928989..7fdd4facfce7 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -226,7 +226,7 @@  #define NUM_EXCEPTION_VECTORS		32  /* Bitmask of exception vectors which push an error code on the stack: */ -#define EXCEPTION_ERRCODE_MASK		0x00027d00 +#define EXCEPTION_ERRCODE_MASK		0x20027d00  #define GDT_SIZE			(GDT_ENTRIES*8)  #define GDT_ENTRY_TLS_ENTRIES		3 diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 84b645cc8bc9..7d7a064af6ff 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -39,6 +39,8 @@ void vsmp_init(void);  static inline void vsmp_init(void) { }  #endif +struct pt_regs; +  void setup_bios_corruption_check(void);  void early_platform_quirks(void); @@ -48,7 +50,9 @@ extern void reserve_standard_io_resources(void);  extern void i386_reserve_resources(void);  extern unsigned long __startup_64(unsigned long physaddr, struct boot_params *bp);  extern unsigned long __startup_secondary_64(void); -extern int early_make_pgtable(unsigned long address); +extern void startup_64_setup_env(unsigned long physbase); +extern void early_setup_idt(void); +extern void __init do_early_exception(struct pt_regs *regs, int trapnr);  #ifdef CONFIG_X86_INTEL_MID  extern void x86_intel_mid_early_setup(void); diff --git a/arch/x86/include/asm/sev-es.h b/arch/x86/include/asm/sev-es.h new file mode 100644 index 000000000000..cf1d957c7091 --- /dev/null +++ b/arch/x86/include/asm/sev-es.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * AMD Encrypted Register State Support + * + * Author: Joerg Roedel <jroedel@suse.de> + */ + +#ifndef __ASM_ENCRYPTED_STATE_H +#define __ASM_ENCRYPTED_STATE_H + +#include <linux/types.h> +#include <asm/insn.h> + +#define GHCB_SEV_INFO		0x001UL +#define GHCB_SEV_INFO_REQ	0x002UL +#define		GHCB_INFO(v)		((v) & 0xfffUL) +#define		GHCB_PROTO_MAX(v)	(((v) >> 48) & 0xffffUL) +#define		GHCB_PROTO_MIN(v)	(((v) >> 32) & 0xffffUL) +#define		GHCB_PROTO_OUR		0x0001UL +#define GHCB_SEV_CPUID_REQ	0x004UL +#define		GHCB_CPUID_REQ_EAX	0 +#define		GHCB_CPUID_REQ_EBX	1 +#define		GHCB_CPUID_REQ_ECX	2 +#define		GHCB_CPUID_REQ_EDX	3 +#define		GHCB_CPUID_REQ(fn, reg) (GHCB_SEV_CPUID_REQ | \ +					(((unsigned long)reg & 3) << 30) | \ +					(((unsigned long)fn) << 32)) + +#define	GHCB_PROTOCOL_MAX	0x0001UL +#define GHCB_DEFAULT_USAGE	0x0000UL + +#define GHCB_SEV_CPUID_RESP	0x005UL +#define GHCB_SEV_TERMINATE	0x100UL +#define		GHCB_SEV_TERMINATE_REASON(reason_set, reason_val)	\ +			(((((u64)reason_set) &  0x7) << 12) |		\ +			 ((((u64)reason_val) & 0xff) << 16)) +#define		GHCB_SEV_ES_REASON_GENERAL_REQUEST	0 +#define		GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED	1 + +#define	GHCB_SEV_GHCB_RESP_CODE(v)	((v) & 0xfff) +#define	VMGEXIT()			{ asm volatile("rep; vmmcall\n\r"); } + +enum es_result { +	ES_OK,			/* All good */ +	ES_UNSUPPORTED,		/* Requested operation not supported */ +	ES_VMM_ERROR,		/* Unexpected state from the VMM */ +	ES_DECODE_FAILED,	/* Instruction decoding failed */ +	ES_EXCEPTION,		/* Instruction caused exception */ +	ES_RETRY,		/* Retry instruction emulation */ +}; + +struct es_fault_info { +	unsigned long vector; +	unsigned long error_code; +	unsigned long cr2; +}; + 
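The request/response constants above combine with the GHCB MSR accessors from boot/compressed/sev-es.c into the MSR-based GHCB protocol used before a full GHCB page exists. The helper below is only an illustrative sketch of that flow (the real implementation lives in sev-es-shared.c, which is not part of this excerpt); it assumes sev_es_wr_ghcb_msr()/sev_es_rd_ghcb_msr() and sev_es_terminate() from elsewhere in the series and can only execute inside an SEV-ES guest:

/* Sketch: fetch one CPUID register via the GHCB MSR protocol. */
static u32 ghcb_msr_cpuid(u32 fn, int reg)
{
	u64 val;

	/* Encode the CPUID leaf and register index into the request value */
	sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, reg));
	VMGEXIT();			/* world-switch to the hypervisor */
	val = sev_es_rd_ghcb_msr();

	/* Bits 11:0 carry the response code, bits 63:32 the register value */
	if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP)
		sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);

	return val >> 32;
}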
+struct pt_regs; + +/* ES instruction emulation context */ +struct es_em_ctxt { +	struct pt_regs *regs; +	struct insn insn; +	struct es_fault_info fi; +}; + +void do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code); + +static inline u64 lower_bits(u64 val, unsigned int bits) +{ +	u64 mask = (1ULL << bits) - 1; + +	return (val & mask); +} + +struct real_mode_header; +enum stack_type; + +/* Early IDT entry points for #VC handler */ +extern void vc_no_ghcb(void); +extern void vc_boot_ghcb(void); +extern bool handle_vc_boot_ghcb(struct pt_regs *regs); + +#ifdef CONFIG_AMD_MEM_ENCRYPT +extern struct static_key_false sev_es_enable_key; +extern void __sev_es_ist_enter(struct pt_regs *regs); +extern void __sev_es_ist_exit(void); +static __always_inline void sev_es_ist_enter(struct pt_regs *regs) +{ +	if (static_branch_unlikely(&sev_es_enable_key)) +		__sev_es_ist_enter(regs); +} +static __always_inline void sev_es_ist_exit(void) +{ +	if (static_branch_unlikely(&sev_es_enable_key)) +		__sev_es_ist_exit(); +} +extern int sev_es_setup_ap_jump_table(struct real_mode_header *rmh); +extern void __sev_es_nmi_complete(void); +static __always_inline void sev_es_nmi_complete(void) +{ +	if (static_branch_unlikely(&sev_es_enable_key)) +		__sev_es_nmi_complete(); +} +extern int __init sev_es_efi_map_ghcbs(pgd_t *pgd); +#else +static inline void sev_es_ist_enter(struct pt_regs *regs) { } +static inline void sev_es_ist_exit(void) { } +static inline int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { return 0; } +static inline void sev_es_nmi_complete(void) { } +static inline int sev_es_efi_map_ghcbs(pgd_t *pgd) { return 0; } +#endif + +#endif diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 5ae5a68e469d..49600643faba 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -35,6 +35,8 @@ bool in_entry_stack(unsigned long *stack, struct stack_info *info);  int get_stack_info(unsigned long *stack, struct task_struct *task,  		   struct stack_info *info, unsigned long *visit_mask); +bool get_stack_info_noinstr(unsigned long *stack, struct task_struct *task, +			    struct stack_info *info);  const char *stack_type_name(enum stack_type type); diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 8a1f5382a4ea..cf13f9e78585 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -150,14 +150,14 @@ struct __attribute__ ((__packed__)) vmcb_control_area {  #define SVM_NESTED_CTL_NP_ENABLE	BIT(0)  #define SVM_NESTED_CTL_SEV_ENABLE	BIT(1) -struct __attribute__ ((__packed__)) vmcb_seg { +struct vmcb_seg {  	u16 selector;  	u16 attrib;  	u32 limit;  	u64 base; -}; +} __packed; -struct __attribute__ ((__packed__)) vmcb_save_area { +struct vmcb_save_area {  	struct vmcb_seg es;  	struct vmcb_seg cs;  	struct vmcb_seg ss; @@ -200,20 +200,67 @@ struct __attribute__ ((__packed__)) vmcb_save_area {  	u64 br_to;  	u64 last_excp_from;  	u64 last_excp_to; -}; +	/* +	 * The following part of the save area is valid only for +	 * SEV-ES guests when referenced through the GHCB. 
+	 */ +	u8 reserved_7[104]; +	u64 reserved_8;		/* rax already available at 0x01f8 */ +	u64 rcx; +	u64 rdx; +	u64 rbx; +	u64 reserved_9;		/* rsp already available at 0x01d8 */ +	u64 rbp; +	u64 rsi; +	u64 rdi; +	u64 r8; +	u64 r9; +	u64 r10; +	u64 r11; +	u64 r12; +	u64 r13; +	u64 r14; +	u64 r15; +	u8 reserved_10[16]; +	u64 sw_exit_code; +	u64 sw_exit_info_1; +	u64 sw_exit_info_2; +	u64 sw_scratch; +	u8 reserved_11[56]; +	u64 xcr0; +	u8 valid_bitmap[16]; +	u64 x87_state_gpa; +} __packed; + +struct ghcb { +	struct vmcb_save_area save; +	u8 reserved_save[2048 - sizeof(struct vmcb_save_area)]; + +	u8 shared_buffer[2032]; + +	u8 reserved_1[10]; +	u16 protocol_version;	/* negotiated SEV-ES/GHCB protocol version */ +	u32 ghcb_usage; +} __packed; + + +#define EXPECTED_VMCB_SAVE_AREA_SIZE		1032 +#define EXPECTED_VMCB_CONTROL_AREA_SIZE		256 +#define EXPECTED_GHCB_SIZE			PAGE_SIZE  static inline void __unused_size_checks(void)  { -	BUILD_BUG_ON(sizeof(struct vmcb_save_area) != 0x298); -	BUILD_BUG_ON(sizeof(struct vmcb_control_area) != 256); +	BUILD_BUG_ON(sizeof(struct vmcb_save_area)	!= EXPECTED_VMCB_SAVE_AREA_SIZE); +	BUILD_BUG_ON(sizeof(struct vmcb_control_area)	!= EXPECTED_VMCB_CONTROL_AREA_SIZE); +	BUILD_BUG_ON(sizeof(struct ghcb)		!= EXPECTED_GHCB_SIZE);  } -struct __attribute__ ((__packed__)) vmcb { +struct vmcb {  	struct vmcb_control_area control;  	u8 reserved_control[1024 - sizeof(struct vmcb_control_area)];  	struct vmcb_save_area save; -}; +} __packed;  #define SVM_CPUID_FUNC 0x8000000a @@ -298,4 +345,47 @@ struct __attribute__ ((__packed__)) vmcb {  #define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP) +/* GHCB Accessor functions */ + +#define GHCB_BITMAP_IDX(field)							\ +	(offsetof(struct vmcb_save_area, field) / sizeof(u64)) + +#define DEFINE_GHCB_ACCESSORS(field)						\ +	static inline bool ghcb_##field##_is_valid(const struct ghcb *ghcb)	\ +	{									\ +		return test_bit(GHCB_BITMAP_IDX(field),				\ +				(unsigned long *)&ghcb->save.valid_bitmap);	\ +	}									\ +										\ +	static inline void ghcb_set_##field(struct ghcb *ghcb, u64 value)	\ +	{									\ +		__set_bit(GHCB_BITMAP_IDX(field),				\ +			  (unsigned long *)&ghcb->save.valid_bitmap);		\ +		ghcb->save.field = value;					\ +	} + +DEFINE_GHCB_ACCESSORS(cpl) +DEFINE_GHCB_ACCESSORS(rip) +DEFINE_GHCB_ACCESSORS(rsp) +DEFINE_GHCB_ACCESSORS(rax) +DEFINE_GHCB_ACCESSORS(rcx) +DEFINE_GHCB_ACCESSORS(rdx) +DEFINE_GHCB_ACCESSORS(rbx) +DEFINE_GHCB_ACCESSORS(rbp) +DEFINE_GHCB_ACCESSORS(rsi) +DEFINE_GHCB_ACCESSORS(rdi) +DEFINE_GHCB_ACCESSORS(r8) +DEFINE_GHCB_ACCESSORS(r9) +DEFINE_GHCB_ACCESSORS(r10) +DEFINE_GHCB_ACCESSORS(r11) +DEFINE_GHCB_ACCESSORS(r12) +DEFINE_GHCB_ACCESSORS(r13) +DEFINE_GHCB_ACCESSORS(r14) +DEFINE_GHCB_ACCESSORS(r15) +DEFINE_GHCB_ACCESSORS(sw_exit_code) +DEFINE_GHCB_ACCESSORS(sw_exit_info_1) +DEFINE_GHCB_ACCESSORS(sw_exit_info_2) +DEFINE_GHCB_ACCESSORS(sw_scratch) +DEFINE_GHCB_ACCESSORS(xcr0) +  #endif diff --git a/arch/x86/include/asm/trap_pf.h b/arch/x86/include/asm/trap_pf.h new file mode 100644 index 000000000000..305bc1214aef --- /dev/null +++ b/arch/x86/include/asm/trap_pf.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_TRAP_PF_H +#define _ASM_X86_TRAP_PF_H + +/* + * Page fault error code bits: + * + *   bit 0 ==	 0: no page found	1: protection fault + *   bit 1 ==	 0: read access		1: write access + *   bit 2 ==	 0: kernel-mode access	1: user-mode access + *   bit 3 ==				1: use of reserved bit detected + *   bit 4 ==				1: fault was an instruction fetch + *   bit 5 ==				1: 
protection keys block access + */ +enum x86_pf_error_code { +	X86_PF_PROT	=		1 << 0, +	X86_PF_WRITE	=		1 << 1, +	X86_PF_USER	=		1 << 2, +	X86_PF_RSVD	=		1 << 3, +	X86_PF_INSTR	=		1 << 4, +	X86_PF_PK	=		1 << 5, +}; + +#endif /* _ASM_X86_TRAP_PF_H */ diff --git a/arch/x86/include/asm/trapnr.h b/arch/x86/include/asm/trapnr.h index 082f45631fa9..f5d2325aa0b7 100644 --- a/arch/x86/include/asm/trapnr.h +++ b/arch/x86/include/asm/trapnr.h @@ -26,6 +26,7 @@  #define X86_TRAP_XF		19	/* SIMD Floating-Point Exception */  #define X86_TRAP_VE		20	/* Virtualization Exception */  #define X86_TRAP_CP		21	/* Control Protection Exception */ +#define X86_TRAP_VC		29	/* VMM Communication Exception */  #define X86_TRAP_IRET		32	/* IRET Exception */  #endif diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index df0b7bfc1234..7f7200021bd1 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -8,12 +8,14 @@  #include <asm/debugreg.h>  #include <asm/idtentry.h>  #include <asm/siginfo.h>			/* TRAP_TRACE, ... */ +#include <asm/trap_pf.h>  #ifdef CONFIG_X86_64  asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs);  asmlinkage __visible notrace  struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s);  void __init trap_init(void); +asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *eregs);  #endif  #ifdef CONFIG_X86_F00F_BUG @@ -43,22 +45,4 @@ void __noreturn handle_stack_overflow(const char *message,  				      unsigned long fault_address);  #endif -/* - * Page fault error code bits: - * - *   bit 0 ==	 0: no page found	1: protection fault - *   bit 1 ==	 0: read access		1: write access - *   bit 2 ==	 0: kernel-mode access	1: user-mode access - *   bit 3 ==				1: use of reserved bit detected - *   bit 4 ==				1: fault was an instruction fetch - *   bit 5 ==				1: protection keys block access - */ -enum x86_pf_error_code { -	X86_PF_PROT	=		1 << 0, -	X86_PF_WRITE	=		1 << 1, -	X86_PF_USER	=		1 << 2, -	X86_PF_RSVD	=		1 << 3, -	X86_PF_INSTR	=		1 << 4, -	X86_PF_PK	=		1 << 5, -};  #endif /* _ASM_X86_TRAPS_H */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 397196fae24d..dde5b3f1e7cd 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -4,8 +4,10 @@  #include <asm/bootparam.h> +struct ghcb;  struct mpc_bus;  struct mpc_cpu; +struct pt_regs;  struct mpc_table;  struct cpuinfo_x86;  struct irq_domain; @@ -229,10 +231,22 @@ struct x86_legacy_features {  /**   * struct x86_hyper_runtime - x86 hypervisor specific runtime callbacks   * - * @pin_vcpu:		pin current vcpu to specified physical cpu (run rarely) + * @pin_vcpu:			pin current vcpu to specified physical + *				cpu (run rarely) + * @sev_es_hcall_prepare:	Load additional hypervisor-specific + *				state into the GHCB when doing a VMMCALL under + *				SEV-ES. Called from the #VC exception handler. + * @sev_es_hcall_finish:	Copies state from the GHCB back into the + *				processor (or pt_regs). Also runs checks on the + *				state returned from the hypervisor after a + *				VMMCALL under SEV-ES.  Needs to return 'false' + *				if the checks fail.  Called from the #VC + *				exception handler.   
*/  struct x86_hyper_runtime {  	void (*pin_vcpu)(int cpu); +	void (*sev_es_hcall_prepare)(struct ghcb *ghcb, struct pt_regs *regs); +	bool (*sev_es_hcall_finish)(struct ghcb *ghcb, struct pt_regs *regs);  };  /** diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index 2e8a30f06c74..a7a3403645e5 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -29,6 +29,7 @@  #define SVM_EXIT_WRITE_DR6     0x036  #define SVM_EXIT_WRITE_DR7     0x037  #define SVM_EXIT_EXCP_BASE     0x040 +#define SVM_EXIT_LAST_EXCP     0x05f  #define SVM_EXIT_INTR          0x060  #define SVM_EXIT_NMI           0x061  #define SVM_EXIT_SMI           0x062 @@ -80,6 +81,16 @@  #define SVM_EXIT_AVIC_INCOMPLETE_IPI		0x401  #define SVM_EXIT_AVIC_UNACCELERATED_ACCESS	0x402 +/* SEV-ES software-defined VMGEXIT events */ +#define SVM_VMGEXIT_MMIO_READ			0x80000001 +#define SVM_VMGEXIT_MMIO_WRITE			0x80000002 +#define SVM_VMGEXIT_NMI_COMPLETE		0x80000003 +#define SVM_VMGEXIT_AP_HLT_LOOP			0x80000004 +#define SVM_VMGEXIT_AP_JUMP_TABLE		0x80000005 +#define SVM_VMGEXIT_SET_AP_JUMP_TABLE		0 +#define SVM_VMGEXIT_GET_AP_JUMP_TABLE		1 +#define SVM_VMGEXIT_UNSUPPORTED_EVENT		0x8000ffff +  #define SVM_EXIT_ERR           -1  #define SVM_EXIT_REASONS \ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index de09af019e23..04ceea8f4a89 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -20,6 +20,7 @@ CFLAGS_REMOVE_kvmclock.o = -pg  CFLAGS_REMOVE_ftrace.o = -pg  CFLAGS_REMOVE_early_printk.o = -pg  CFLAGS_REMOVE_head64.o = -pg +CFLAGS_REMOVE_sev-es.o = -pg  endif  KASAN_SANITIZE_head$(BITS).o				:= n @@ -27,6 +28,7 @@ KASAN_SANITIZE_dumpstack.o				:= n  KASAN_SANITIZE_dumpstack_$(BITS).o			:= n  KASAN_SANITIZE_stacktrace.o				:= n  KASAN_SANITIZE_paravirt.o				:= n +KASAN_SANITIZE_sev-es.o					:= n  # With some compiler versions the generated code results in boot hangs, caused  # by several compilation units. To be safe, disable all instrumentation. @@ -146,6 +148,7 @@ obj-$(CONFIG_UNWINDER_ORC)		+= unwind_orc.o  obj-$(CONFIG_UNWINDER_FRAME_POINTER)	+= unwind_frame.o  obj-$(CONFIG_UNWINDER_GUESS)		+= unwind_guess.o +obj-$(CONFIG_AMD_MEM_ENCRYPT)		+= sev-es.o  ###  # 64 bit specific files  ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index dcc3d943c68f..6062ce586b95 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -614,7 +614,7 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)  	 *	      If BIOS has not enabled SME then don't advertise the  	 *	      SME feature (set in scattered.c).  	 *   For SEV: If BIOS has not enabled SEV then don't advertise the -	 *            SEV feature (set in scattered.c). +	 *            SEV and SEV_ES feature (set in scattered.c).  	 *  	 *   In all cases, since support for SME and SEV requires long mode,  	 *   don't advertise the feature under CONFIG_X86_32. 
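Editorial note, not part of the patch: the two x86_hyper_runtime callbacks documented above form the contract between the generic #VC VMMCALL handling and a hypervisor. sev_es_hcall_prepare() loads whatever registers the hypercall ABI needs into the GHCB via the ghcb_set_*() accessors, and sev_es_hcall_finish() validates the returned state with the ghcb_*_is_valid() helpers, copies it back into pt_regs, and returns false if the checks fail. As a hedged sketch (the real implementations for VMware and KVM appear in later hunks of this patch), a hypothetical hypervisor whose hypercall passes its argument and result in RBX could wire the callbacks up like this:

	/* Illustrative sketch only, for a hypothetical hypervisor ABI. */
	static void example_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs)
	{
		/* Expose the hypercall input register through the shared GHCB */
		ghcb_set_rbx(ghcb, regs->bx);
	}

	static bool example_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs)
	{
		/* The hypervisor must have written RBX back into the GHCB */
		if (!ghcb_rbx_is_valid(ghcb))
			return false;	/* report failure to the #VC handler */

		regs->bx = ghcb->save.rbx;
		return true;
	}

Such callbacks would then be hooked into the hypervisor's x86_hyper_* structure under CONFIG_AMD_MEM_ENCRYPT through .runtime.sev_es_hcall_prepare and .runtime.sev_es_hcall_finish, exactly as the VMware and KVM hunks below do.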
@@ -645,6 +645,7 @@ clear_all:  		setup_clear_cpu_cap(X86_FEATURE_SME);  clear_sev:  		setup_clear_cpu_cap(X86_FEATURE_SEV); +		setup_clear_cpu_cap(X86_FEATURE_SEV_ES);  	}  } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c51158914ea2..35ad8480c464 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1876,6 +1876,8 @@ static inline void tss_setup_ist(struct tss_struct *tss)  	tss->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI);  	tss->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB);  	tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE); +	/* Only mapped when SEV-ES is active */ +	tss->x86_tss.ist[IST_INDEX_VC] = __this_cpu_ist_top_va(VC);  }  #else /* CONFIG_X86_64 */ @@ -1908,6 +1910,29 @@ static inline void tss_setup_io_bitmap(struct tss_struct *tss)  }  /* + * Setup everything needed to handle exceptions from the IDT, including the IST + * exceptions which use paranoid_entry(). + */ +void cpu_init_exception_handling(void) +{ +	struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); +	int cpu = raw_smp_processor_id(); + +	/* paranoid_entry() gets the CPU number from the GDT */ +	setup_getcpu(cpu); + +	/* IST vectors need TSS to be set up. */ +	tss_setup_ist(tss); +	tss_setup_io_bitmap(tss); +	set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); + +	load_TR_desc(); + +	/* Finally load the IDT */ +	load_current_idt(); +} + +/*   * cpu_init() initializes state that is per-CPU. Some data is already   * initialized (naturally) in the bootstrap process, such as the GDT   * and IDT. We reload them nevertheless, this function acts as a diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 2eb0a8c44b35..866c9a9bcdee 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -42,6 +42,7 @@ static const struct cpuid_bit cpuid_bits[] = {  	{ X86_FEATURE_MBA,		CPUID_EBX,  6, 0x80000008, 0 },  	{ X86_FEATURE_SME,		CPUID_EAX,  0, 0x8000001f, 0 },  	{ X86_FEATURE_SEV,		CPUID_EAX,  1, 0x8000001f, 0 }, +	{ X86_FEATURE_SEV_ES,		CPUID_EAX,  3, 0x8000001f, 0 },  	{ X86_FEATURE_SME_COHERENT,	CPUID_EAX, 10, 0x8000001f, 0 },  	{ 0, 0, 0, 0, 0 }  }; diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 9b6fafa69be9..924571fe5864 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -33,6 +33,7 @@  #include <asm/timer.h>  #include <asm/apic.h>  #include <asm/vmware.h> +#include <asm/svm.h>  #undef pr_fmt  #define pr_fmt(fmt)	"vmware: " fmt @@ -476,10 +477,49 @@ static bool __init vmware_legacy_x2apic_available(void)  	       (eax & (1 << VMWARE_CMD_LEGACY_X2APIC)) != 0;  } +#ifdef CONFIG_AMD_MEM_ENCRYPT +static void vmware_sev_es_hcall_prepare(struct ghcb *ghcb, +					struct pt_regs *regs) +{ +	/* Copy VMWARE specific Hypercall parameters to the GHCB */ +	ghcb_set_rip(ghcb, regs->ip); +	ghcb_set_rbx(ghcb, regs->bx); +	ghcb_set_rcx(ghcb, regs->cx); +	ghcb_set_rdx(ghcb, regs->dx); +	ghcb_set_rsi(ghcb, regs->si); +	ghcb_set_rdi(ghcb, regs->di); +	ghcb_set_rbp(ghcb, regs->bp); +} + +static bool vmware_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs) +{ +	if (!(ghcb_rbx_is_valid(ghcb) && +	      ghcb_rcx_is_valid(ghcb) && +	      ghcb_rdx_is_valid(ghcb) && +	      ghcb_rsi_is_valid(ghcb) && +	      ghcb_rdi_is_valid(ghcb) && +	      ghcb_rbp_is_valid(ghcb))) +		return false; + +	regs->bx = ghcb->save.rbx; +	regs->cx = ghcb->save.rcx; +	regs->dx = ghcb->save.rdx; +	regs->si = ghcb->save.rsi; +	regs->di = 
ghcb->save.rdi; +	regs->bp = ghcb->save.rbp; + +	return true; +} +#endif +  const __initconst struct hypervisor_x86 x86_hyper_vmware = { -	.name			= "VMware", -	.detect			= vmware_platform, -	.type			= X86_HYPER_VMWARE, -	.init.init_platform	= vmware_platform_setup, -	.init.x2apic_available	= vmware_legacy_x2apic_available, +	.name				= "VMware", +	.detect				= vmware_platform, +	.type				= X86_HYPER_VMWARE, +	.init.init_platform		= vmware_platform_setup, +	.init.x2apic_available		= vmware_legacy_x2apic_available, +#ifdef CONFIG_AMD_MEM_ENCRYPT +	.runtime.sev_es_hcall_prepare	= vmware_sev_es_hcall_prepare, +	.runtime.sev_es_hcall_finish	= vmware_sev_es_hcall_finish, +#endif  }; diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index ea8d51ec251b..25c06b67e7e0 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -29,8 +29,8 @@ static int die_counter;  static struct pt_regs exec_summary_regs; -bool in_task_stack(unsigned long *stack, struct task_struct *task, -		   struct stack_info *info) +bool noinstr in_task_stack(unsigned long *stack, struct task_struct *task, +			   struct stack_info *info)  {  	unsigned long *begin = task_stack_page(task);  	unsigned long *end   = task_stack_page(task) + THREAD_SIZE; @@ -46,7 +46,8 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task,  	return true;  } -bool in_entry_stack(unsigned long *stack, struct stack_info *info) +/* Called from get_stack_info_noinstr - so must be noinstr too */ +bool noinstr in_entry_stack(unsigned long *stack, struct stack_info *info)  {  	struct entry_stack *ss = cpu_entry_stack(smp_processor_id()); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 4a94d38cd141..1dd851397bd9 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -24,11 +24,13 @@ static const char * const exception_stack_names[] = {  		[ ESTACK_NMI	]	= "NMI",  		[ ESTACK_DB	]	= "#DB",  		[ ESTACK_MCE	]	= "#MC", +		[ ESTACK_VC	]	= "#VC", +		[ ESTACK_VC2	]	= "#VC2",  };  const char *stack_type_name(enum stack_type type)  { -	BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); +	BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);  	if (type == STACK_TYPE_IRQ)  		return "IRQ"; @@ -79,16 +81,18 @@ struct estack_pages estack_pages[CEA_ESTACK_PAGES] ____cacheline_aligned = {  	EPAGERANGE(NMI),  	EPAGERANGE(DB),  	EPAGERANGE(MCE), +	EPAGERANGE(VC), +	EPAGERANGE(VC2),  }; -static bool in_exception_stack(unsigned long *stack, struct stack_info *info) +static __always_inline bool in_exception_stack(unsigned long *stack, struct stack_info *info)  {  	unsigned long begin, end, stk = (unsigned long)stack;  	const struct estack_pages *ep;  	struct pt_regs *regs;  	unsigned int k; -	BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); +	BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);  	begin = (unsigned long)__this_cpu_read(cea_exception_stacks);  	/* @@ -122,7 +126,7 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info)  	return true;  } -static bool in_irq_stack(unsigned long *stack, struct stack_info *info) +static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info *info)  {  	unsigned long *end   = (unsigned long *)this_cpu_read(hardirq_stack_ptr);  	unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long)); @@ -147,32 +151,38 @@ static bool in_irq_stack(unsigned long *stack, struct stack_info *info)  	return true;  } -int get_stack_info(unsigned long *stack, struct task_struct *task, -		   struct stack_info *info, unsigned long *visit_mask) +bool 
noinstr get_stack_info_noinstr(unsigned long *stack, struct task_struct *task, +				    struct stack_info *info)  { -	if (!stack) -		goto unknown; - -	task = task ? : current; -  	if (in_task_stack(stack, task, info)) -		goto recursion_check; +		return true;  	if (task != current) -		goto unknown; +		return false;  	if (in_exception_stack(stack, info)) -		goto recursion_check; +		return true;  	if (in_irq_stack(stack, info)) -		goto recursion_check; +		return true;  	if (in_entry_stack(stack, info)) -		goto recursion_check; +		return true; + +	return false; +} + +int get_stack_info(unsigned long *stack, struct task_struct *task, +		   struct stack_info *info, unsigned long *visit_mask) +{ +	task = task ? : current; -	goto unknown; +	if (!stack) +		goto unknown; + +	if (!get_stack_info_noinstr(stack, task, info)) +		goto unknown; -recursion_check:  	/*  	 * Make sure we don't iterate through any given stack more than once.  	 * If it comes up a second time then there's something wrong going on: diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index cbb71c1b574f..4199f25c0063 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -36,6 +36,11 @@  #include <asm/microcode.h>  #include <asm/kasan.h>  #include <asm/fixmap.h> +#include <asm/realmode.h> +#include <asm/desc.h> +#include <asm/extable.h> +#include <asm/trapnr.h> +#include <asm/sev-es.h>  /*   * Manage page tables very early on. @@ -61,6 +66,24 @@ unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4;  EXPORT_SYMBOL(vmemmap_base);  #endif +/* + * GDT used on the boot CPU before switching to virtual addresses. + */ +static struct desc_struct startup_gdt[GDT_ENTRIES] = { +	[GDT_ENTRY_KERNEL32_CS]         = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff), +	[GDT_ENTRY_KERNEL_CS]           = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff), +	[GDT_ENTRY_KERNEL_DS]           = GDT_ENTRY_INIT(0xc093, 0, 0xfffff), +}; + +/* + * Address needs to be set at runtime because it references the startup_gdt + * while the kernel still uses a direct mapping. + */ +static struct desc_ptr startup_gdt_descr = { +	.size = sizeof(startup_gdt), +	.address = 0, +}; +  #define __head	__section(.head.text)  static void __head *fixup_pointer(void *ptr, unsigned long physaddr) @@ -297,7 +320,7 @@ static void __init reset_early_page_tables(void)  }  /* Create a new PMD entry */ -int __init __early_make_pgtable(unsigned long address, pmdval_t pmd) +bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd)  {  	unsigned long physaddr = address - __PAGE_OFFSET;  	pgdval_t pgd, *pgd_p; @@ -307,7 +330,7 @@ int __init __early_make_pgtable(unsigned long address, pmdval_t pmd)  	/* Invalid address or early pgt is done ?  
*/  	if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt)) -		return -1; +		return false;  again:  	pgd_p = &early_top_pgt[pgd_index(address)].pgd; @@ -364,10 +387,10 @@ again:  	}  	pmd_p[pmd_index(address)] = pmd; -	return 0; +	return true;  } -int __init early_make_pgtable(unsigned long address) +static bool __init early_make_pgtable(unsigned long address)  {  	unsigned long physaddr = address - __PAGE_OFFSET;  	pmdval_t pmd; @@ -377,6 +400,19 @@ int __init early_make_pgtable(unsigned long address)  	return __early_make_pgtable(address, pmd);  } +void __init do_early_exception(struct pt_regs *regs, int trapnr) +{ +	if (trapnr == X86_TRAP_PF && +	    early_make_pgtable(native_read_cr2())) +		return; + +	if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT) && +	    trapnr == X86_TRAP_VC && handle_vc_boot_ghcb(regs)) +		return; + +	early_fixup_exception(regs, trapnr); +} +  /* Don't add a printk in there. printk relies on the PDA which is not initialized      yet. */  static void __init clear_bss(void) @@ -489,3 +525,81 @@ void __init x86_64_start_reservations(char *real_mode_data)  	start_kernel();  } + +/* + * Data structures and code used for IDT setup in head_64.S. The bringup-IDT is + * used until the idt_table takes over. On the boot CPU this happens in + * x86_64_start_kernel(), on secondary CPUs in start_secondary(). In both cases + * this happens in the functions called from head_64.S. + * + * The idt_table can't be used that early because all the code modifying it is + * in idt.c and can be instrumented by tracing or KASAN, which both don't work + * during early CPU bringup. Also the idt_table has the runtime vectors + * configured which require certain CPU state to be setup already (like TSS), + * which also hasn't happened yet in early CPU bringup. + */ +static gate_desc bringup_idt_table[NUM_EXCEPTION_VECTORS] __page_aligned_data; + +static struct desc_ptr bringup_idt_descr = { +	.size		= (NUM_EXCEPTION_VECTORS * sizeof(gate_desc)) - 1, +	.address	= 0, /* Set at runtime */ +}; + +static void set_bringup_idt_handler(gate_desc *idt, int n, void *handler) +{ +#ifdef CONFIG_AMD_MEM_ENCRYPT +	struct idt_data data; +	gate_desc desc; + +	init_idt_data(&data, n, handler); +	idt_init_desc(&desc, &data); +	native_write_idt_entry(idt, n, &desc); +#endif +} + +/* This runs while still in the direct mapping */ +static void startup_64_load_idt(unsigned long physbase) +{ +	struct desc_ptr *desc = fixup_pointer(&bringup_idt_descr, physbase); +	gate_desc *idt = fixup_pointer(bringup_idt_table, physbase); + + +	if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) { +		void *handler; + +		/* VMM Communication Exception */ +		handler = fixup_pointer(vc_no_ghcb, physbase); +		set_bringup_idt_handler(idt, X86_TRAP_VC, handler); +	} + +	desc->address = (unsigned long)idt; +	native_load_idt(desc); +} + +/* This is used when running on kernel addresses */ +void early_setup_idt(void) +{ +	/* VMM Communication Exception */ +	if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) +		set_bringup_idt_handler(bringup_idt_table, X86_TRAP_VC, vc_boot_ghcb); + +	bringup_idt_descr.address = (unsigned long)bringup_idt_table; +	native_load_idt(&bringup_idt_descr); +} + +/* + * Setup boot CPU state needed before kernel switches to virtual addresses. 
+ */ +void __head startup_64_setup_env(unsigned long physbase) +{ +	/* Load GDT */ +	startup_gdt_descr.address = (unsigned long)fixup_pointer(startup_gdt, physbase); +	native_load_gdt(&startup_gdt_descr); + +	/* New GDT is live - reload data segment registers */ +	asm volatile("movl %%eax, %%ds\n" +		     "movl %%eax, %%ss\n" +		     "movl %%eax, %%es\n" : : "a"(__KERNEL_DS) : "memory"); + +	startup_64_load_idt(physbase); +} diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 16da4ac01597..7eb2a1c87969 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -73,6 +73,20 @@ SYM_CODE_START_NOALIGN(startup_64)  	/* Set up the stack for verify_cpu(), similar to initial_stack below */  	leaq	(__end_init_task - SIZEOF_PTREGS)(%rip), %rsp +	leaq	_text(%rip), %rdi +	pushq	%rsi +	call	startup_64_setup_env +	popq	%rsi + +	/* Now switch to __KERNEL_CS so IRET works reliably */ +	pushq	$__KERNEL_CS +	leaq	.Lon_kernel_cs(%rip), %rax +	pushq	%rax +	lretq + +.Lon_kernel_cs: +	UNWIND_HINT_EMPTY +  	/* Sanitize CPU configuration */  	call verify_cpu @@ -112,6 +126,18 @@ SYM_CODE_START(secondary_startup_64)  	call verify_cpu  	/* +	 * The secondary_startup_64_no_verify entry point is only used by +	 * SEV-ES guests. In those guests the call to verify_cpu() would cause +	 * #VC exceptions which can not be handled at this stage of secondary +	 * CPU bringup. +	 * +	 * All non SEV-ES systems, especially Intel systems, need to execute +	 * verify_cpu() above to make sure NX is enabled. +	 */ +SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) +	UNWIND_HINT_EMPTY + +	/*  	 * Retrieve the modifier (SME encryption mask if SME is active) to be  	 * added to the initial pgdir entry that will be programmed into CR3.  	 */ @@ -144,33 +170,6 @@ SYM_CODE_START(secondary_startup_64)  1:  	UNWIND_HINT_EMPTY -	/* Check if nx is implemented */ -	movl	$0x80000001, %eax -	cpuid -	movl	%edx,%edi - -	/* Setup EFER (Extended Feature Enable Register) */ -	movl	$MSR_EFER, %ecx -	rdmsr -	btsl	$_EFER_SCE, %eax	/* Enable System Call */ -	btl	$20,%edi		/* No Execute supported? */ -	jnc     1f -	btsl	$_EFER_NX, %eax -	btsq	$_PAGE_BIT_NX,early_pmd_flags(%rip) -1:	wrmsr				/* Make changes effective */ - -	/* Setup cr0 */ -	movl	$CR0_STATE, %eax -	/* Make changes effective */ -	movq	%rax, %cr0 - -	/* Setup a boot time stack */ -	movq initial_stack(%rip), %rsp - -	/* zero EFLAGS after setting rsp */ -	pushq $0 -	popfq -  	/*  	 * We must switch to a new descriptor in kernel space for the GDT  	 * because soon the kernel won't have access anymore to the userspace @@ -205,6 +204,41 @@ SYM_CODE_START(secondary_startup_64)  	movl	initial_gs+4(%rip),%edx  	wrmsr +	/* +	 * Setup a boot time stack - Any secondary CPU will have lost its stack +	 * by now because the cr3-switch above unmaps the real-mode stack +	 */ +	movq initial_stack(%rip), %rsp + +	/* Setup and Load IDT */ +	pushq	%rsi +	call	early_setup_idt +	popq	%rsi + +	/* Check if nx is implemented */ +	movl	$0x80000001, %eax +	cpuid +	movl	%edx,%edi + +	/* Setup EFER (Extended Feature Enable Register) */ +	movl	$MSR_EFER, %ecx +	rdmsr +	btsl	$_EFER_SCE, %eax	/* Enable System Call */ +	btl	$20,%edi		/* No Execute supported? 
*/ +	jnc     1f +	btsl	$_EFER_NX, %eax +	btsq	$_PAGE_BIT_NX,early_pmd_flags(%rip) +1:	wrmsr				/* Make changes effective */ + +	/* Setup cr0 */ +	movl	$CR0_STATE, %eax +	/* Make changes effective */ +	movq	%rax, %cr0 + +	/* zero EFLAGS after setting rsp */ +	pushq $0 +	popfq +  	/* rsi is pointer to real mode structure with interesting info.  	   pass it to C */  	movq	%rsi, %rdi @@ -259,11 +293,47 @@ SYM_CODE_START(start_cpu0)  SYM_CODE_END(start_cpu0)  #endif +#ifdef CONFIG_AMD_MEM_ENCRYPT +/* + * VC Exception handler used during early boot when running on kernel + * addresses, but before the switch to the idt_table can be made. + * The early_idt_handler_array can't be used here because it calls into a lot + * of __init code and this handler is also used during CPU offlining/onlining. + * Therefore this handler ends up in the .text section so that it stays around + * when .init.text is freed. + */ +SYM_CODE_START_NOALIGN(vc_boot_ghcb) +	UNWIND_HINT_IRET_REGS offset=8 + +	/* Build pt_regs */ +	PUSH_AND_CLEAR_REGS + +	/* Call C handler */ +	movq    %rsp, %rdi +	movq	ORIG_RAX(%rsp), %rsi +	movq	initial_vc_handler(%rip), %rax +	ANNOTATE_RETPOLINE_SAFE +	call	*%rax + +	/* Unwind pt_regs */ +	POP_REGS + +	/* Remove Error Code */ +	addq    $8, %rsp + +	/* Pure iret required here - don't use INTERRUPT_RETURN */ +	iretq +SYM_CODE_END(vc_boot_ghcb) +#endif +  	/* Both SMP bootup and ACPI suspend change these variables */  	__REFDATA  	.balign	8  SYM_DATA(initial_code,	.quad x86_64_start_kernel)  SYM_DATA(initial_gs,	.quad INIT_PER_CPU_VAR(fixed_percpu_data)) +#ifdef CONFIG_AMD_MEM_ENCRYPT +SYM_DATA(initial_vc_handler,	.quad handle_vc_boot_ghcb) +#endif  /*   * The SIZEOF_PTREGS gap is a convention which helps the in-kernel unwinder @@ -319,22 +389,43 @@ SYM_CODE_START_LOCAL(early_idt_handler_common)  	pushq %r15				/* pt_regs->r15 */  	UNWIND_HINT_REGS -	cmpq $14,%rsi		/* Page fault? */ -	jnz 10f -	GET_CR2_INTO(%rdi)	/* can clobber %rax if pv */ -	call early_make_pgtable -	andl %eax,%eax -	jz 20f			/* All good */ - -10:  	movq %rsp,%rdi		/* RDI = pt_regs; RSI is already trapnr */ -	call early_fixup_exception +	call do_early_exception -20:  	decl early_recursion_flag(%rip)  	jmp restore_regs_and_return_to_kernel  SYM_CODE_END(early_idt_handler_common) +#ifdef CONFIG_AMD_MEM_ENCRYPT +/* + * VC Exception handler used during very early boot. The + * early_idt_handler_array can't be used because it returns via the + * paravirtualized INTERRUPT_RETURN and pv-ops don't work that early. + * + * This handler will end up in the .init.text section and not be + * available to boot secondary CPUs. 
+ */ +SYM_CODE_START_NOALIGN(vc_no_ghcb) +	UNWIND_HINT_IRET_REGS offset=8 + +	/* Build pt_regs */ +	PUSH_AND_CLEAR_REGS + +	/* Call C handler */ +	movq    %rsp, %rdi +	movq	ORIG_RAX(%rsp), %rsi +	call    do_vc_no_ghcb + +	/* Unwind pt_regs */ +	POP_REGS + +	/* Remove Error Code */ +	addq    $8, %rsp + +	/* Pure iret required here - don't use INTERRUPT_RETURN */ +	iretq +SYM_CODE_END(vc_no_ghcb) +#endif  #define SYM_DATA_START_PAGE_ALIGNED(name)			\  	SYM_START(name, SYM_L_GLOBAL, .balign PAGE_SIZE) diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 1bffb87dcfdc..ee1a283f8e96 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -11,13 +11,6 @@  #include <asm/desc.h>  #include <asm/hw_irq.h> -struct idt_data { -	unsigned int	vector; -	unsigned int	segment; -	struct idt_bits	bits; -	const void	*addr; -}; -  #define DPL0		0x0  #define DPL3		0x3 @@ -175,20 +168,6 @@ bool idt_is_f00f_address(unsigned long address)  }  #endif -static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d) -{ -	unsigned long addr = (unsigned long) d->addr; - -	gate->offset_low	= (u16) addr; -	gate->segment		= (u16) d->segment; -	gate->bits		= d->bits; -	gate->offset_middle	= (u16) (addr >> 16); -#ifdef CONFIG_X86_64 -	gate->offset_high	= (u32) (addr >> 32); -	gate->reserved		= 0; -#endif -} -  static __init void  idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sys)  { @@ -206,14 +185,7 @@ static __init void set_intr_gate(unsigned int n, const void *addr)  {  	struct idt_data data; -	BUG_ON(n > 0xFF); - -	memset(&data, 0, sizeof(data)); -	data.vector	= n; -	data.addr	= addr; -	data.segment	= __KERNEL_CS; -	data.bits.type	= GATE_INTERRUPT; -	data.bits.p	= 1; +	init_idt_data(&data, n, addr);  	idt_setup_from_table(idt_table, &data, 1, false);  } @@ -254,11 +226,14 @@ static const __initconst struct idt_data early_pf_idts[] = {   * cpu_init() when the TSS has been initialized.   
*/  static const __initconst struct idt_data ist_idts[] = { -	ISTG(X86_TRAP_DB,	asm_exc_debug,		IST_INDEX_DB), -	ISTG(X86_TRAP_NMI,	asm_exc_nmi,		IST_INDEX_NMI), -	ISTG(X86_TRAP_DF,	asm_exc_double_fault,	IST_INDEX_DF), +	ISTG(X86_TRAP_DB,	asm_exc_debug,			IST_INDEX_DB), +	ISTG(X86_TRAP_NMI,	asm_exc_nmi,			IST_INDEX_NMI), +	ISTG(X86_TRAP_DF,	asm_exc_double_fault,		IST_INDEX_DF),  #ifdef CONFIG_X86_MCE -	ISTG(X86_TRAP_MC,	asm_exc_machine_check,	IST_INDEX_MCE), +	ISTG(X86_TRAP_MC,	asm_exc_machine_check,		IST_INDEX_MCE), +#endif +#ifdef CONFIG_AMD_MEM_ENCRYPT +	ISTG(X86_TRAP_VC,	asm_exc_vmm_communication,	IST_INDEX_VC),  #endif  }; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 9663ba31347c..1c0f2560a41c 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -36,6 +36,8 @@  #include <asm/hypervisor.h>  #include <asm/tlb.h>  #include <asm/cpuidle_haltpoll.h> +#include <asm/ptrace.h> +#include <asm/svm.h>  DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled); @@ -744,13 +746,34 @@ static void __init kvm_init_platform(void)  	x86_platform.apic_post_init = kvm_apic_init;  } +#if defined(CONFIG_AMD_MEM_ENCRYPT) +static void kvm_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs) +{ +	/* RAX and CPL are already in the GHCB */ +	ghcb_set_rbx(ghcb, regs->bx); +	ghcb_set_rcx(ghcb, regs->cx); +	ghcb_set_rdx(ghcb, regs->dx); +	ghcb_set_rsi(ghcb, regs->si); +} + +static bool kvm_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs) +{ +	/* No checking of the return state needed */ +	return true; +} +#endif +  const __initconst struct hypervisor_x86 x86_hyper_kvm = { -	.name			= "KVM", -	.detect			= kvm_detect, -	.type			= X86_HYPER_KVM, -	.init.guest_late_init	= kvm_guest_init, -	.init.x2apic_available	= kvm_para_available, -	.init.init_platform	= kvm_init_platform, +	.name				= "KVM", +	.detect				= kvm_detect, +	.type				= X86_HYPER_KVM, +	.init.guest_late_init		= kvm_guest_init, +	.init.x2apic_available		= kvm_para_available, +	.init.init_platform		= kvm_init_platform, +#if defined(CONFIG_AMD_MEM_ENCRYPT) +	.runtime.sev_es_hcall_prepare	= kvm_sev_es_hcall_prepare, +	.runtime.sev_es_hcall_finish	= kvm_sev_es_hcall_finish, +#endif  };  static __init int activate_jump_labels(void) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 47381666d6a5..4bc77aaf1303 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -33,6 +33,7 @@  #include <asm/reboot.h>  #include <asm/cache.h>  #include <asm/nospec-branch.h> +#include <asm/sev-es.h>  #define CREATE_TRACE_POINTS  #include <trace/events/nmi.h> @@ -476,6 +477,12 @@ DEFINE_IDTENTRY_RAW(exc_nmi)  {  	bool irq_state; +	/* +	 * Re-enable NMIs right here when running as an SEV-ES guest. This might +	 * cause nested NMIs, but those can be handled safely. +	 */ +	sev_es_nmi_complete(); +  	if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id()))  		return; @@ -487,6 +494,12 @@ DEFINE_IDTENTRY_RAW(exc_nmi)  	this_cpu_write(nmi_cr2, read_cr2());  nmi_restart: +	/* +	 * Needs to happen before DR7 is accessed, because the hypervisor can +	 * intercept DR7 reads/writes, turning those into #VC exceptions. 
+	 */ +	sev_es_ist_enter(regs); +  	this_cpu_write(nmi_dr7, local_db_save());  	irq_state = idtentry_enter_nmi(regs); @@ -500,6 +513,8 @@ nmi_restart:  	local_db_restore(this_cpu_read(nmi_dr7)); +	sev_es_ist_exit(); +  	if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))  		write_cr2(this_cpu_read(nmi_cr2));  	if (this_cpu_dec_return(nmi_state)) diff --git a/arch/x86/kernel/sev-es-shared.c b/arch/x86/kernel/sev-es-shared.c new file mode 100644 index 000000000000..5f83ccaab877 --- /dev/null +++ b/arch/x86/kernel/sev-es-shared.c @@ -0,0 +1,507 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AMD Encrypted Register State Support + * + * Author: Joerg Roedel <jroedel@suse.de> + * + * This file is not compiled stand-alone. It contains code shared + * between the pre-decompression boot code and the running Linux kernel + * and is included directly into both code-bases. + */ + +#ifndef __BOOT_COMPRESSED +#define error(v)	pr_err(v) +#define has_cpuflag(f)	boot_cpu_has(f) +#endif + +static bool __init sev_es_check_cpu_features(void) +{ +	if (!has_cpuflag(X86_FEATURE_RDRAND)) { +		error("RDRAND instruction not supported - no trusted source of randomness available\n"); +		return false; +	} + +	return true; +} + +static void sev_es_terminate(unsigned int reason) +{ +	u64 val = GHCB_SEV_TERMINATE; + +	/* +	 * Tell the hypervisor what went wrong - only reason-set 0 is +	 * currently supported. +	 */ +	val |= GHCB_SEV_TERMINATE_REASON(0, reason); + +	/* Request Guest Termination from Hypvervisor */ +	sev_es_wr_ghcb_msr(val); +	VMGEXIT(); + +	while (true) +		asm volatile("hlt\n" : : : "memory"); +} + +static bool sev_es_negotiate_protocol(void) +{ +	u64 val; + +	/* Do the GHCB protocol version negotiation */ +	sev_es_wr_ghcb_msr(GHCB_SEV_INFO_REQ); +	VMGEXIT(); +	val = sev_es_rd_ghcb_msr(); + +	if (GHCB_INFO(val) != GHCB_SEV_INFO) +		return false; + +	if (GHCB_PROTO_MAX(val) < GHCB_PROTO_OUR || +	    GHCB_PROTO_MIN(val) > GHCB_PROTO_OUR) +		return false; + +	return true; +} + +static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) +{ +	memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); +} + +static bool vc_decoding_needed(unsigned long exit_code) +{ +	/* Exceptions don't require to decode the instruction */ +	return !(exit_code >= SVM_EXIT_EXCP_BASE && +		 exit_code <= SVM_EXIT_LAST_EXCP); +} + +static enum es_result vc_init_em_ctxt(struct es_em_ctxt *ctxt, +				      struct pt_regs *regs, +				      unsigned long exit_code) +{ +	enum es_result ret = ES_OK; + +	memset(ctxt, 0, sizeof(*ctxt)); +	ctxt->regs = regs; + +	if (vc_decoding_needed(exit_code)) +		ret = vc_decode_insn(ctxt); + +	return ret; +} + +static void vc_finish_insn(struct es_em_ctxt *ctxt) +{ +	ctxt->regs->ip += ctxt->insn.length; +} + +static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, +					  struct es_em_ctxt *ctxt, +					  u64 exit_code, u64 exit_info_1, +					  u64 exit_info_2) +{ +	enum es_result ret; + +	/* Fill in protocol and format specifiers */ +	ghcb->protocol_version = GHCB_PROTOCOL_MAX; +	ghcb->ghcb_usage       = GHCB_DEFAULT_USAGE; + +	ghcb_set_sw_exit_code(ghcb, exit_code); +	ghcb_set_sw_exit_info_1(ghcb, exit_info_1); +	ghcb_set_sw_exit_info_2(ghcb, exit_info_2); + +	sev_es_wr_ghcb_msr(__pa(ghcb)); +	VMGEXIT(); + +	if ((ghcb->save.sw_exit_info_1 & 0xffffffff) == 1) { +		u64 info = ghcb->save.sw_exit_info_2; +		unsigned long v; + +		info = ghcb->save.sw_exit_info_2; +		v = info & SVM_EVTINJ_VEC_MASK; + +		/* Check if exception information from hypervisor is sane. 
*/ +		if ((info & SVM_EVTINJ_VALID) && +		    ((v == X86_TRAP_GP) || (v == X86_TRAP_UD)) && +		    ((info & SVM_EVTINJ_TYPE_MASK) == SVM_EVTINJ_TYPE_EXEPT)) { +			ctxt->fi.vector = v; +			if (info & SVM_EVTINJ_VALID_ERR) +				ctxt->fi.error_code = info >> 32; +			ret = ES_EXCEPTION; +		} else { +			ret = ES_VMM_ERROR; +		} +	} else { +		ret = ES_OK; +	} + +	return ret; +} + +/* + * Boot VC Handler - This is the first VC handler during boot, there is no GHCB + * page yet, so it only supports the MSR based communication with the + * hypervisor and only the CPUID exit-code. + */ +void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) +{ +	unsigned int fn = lower_bits(regs->ax, 32); +	unsigned long val; + +	/* Only CPUID is supported via MSR protocol */ +	if (exit_code != SVM_EXIT_CPUID) +		goto fail; + +	sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EAX)); +	VMGEXIT(); +	val = sev_es_rd_ghcb_msr(); +	if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) +		goto fail; +	regs->ax = val >> 32; + +	sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EBX)); +	VMGEXIT(); +	val = sev_es_rd_ghcb_msr(); +	if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) +		goto fail; +	regs->bx = val >> 32; + +	sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_ECX)); +	VMGEXIT(); +	val = sev_es_rd_ghcb_msr(); +	if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) +		goto fail; +	regs->cx = val >> 32; + +	sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EDX)); +	VMGEXIT(); +	val = sev_es_rd_ghcb_msr(); +	if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) +		goto fail; +	regs->dx = val >> 32; + +	/* Skip over the CPUID two-byte opcode */ +	regs->ip += 2; + +	return; + +fail: +	sev_es_wr_ghcb_msr(GHCB_SEV_TERMINATE); +	VMGEXIT(); + +	/* Shouldn't get here - if we do halt the machine */ +	while (true) +		asm volatile("hlt\n"); +} + +static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt, +					  void *src, char *buf, +					  unsigned int data_size, +					  unsigned int count, +					  bool backwards) +{ +	int i, b = backwards ? -1 : 1; +	enum es_result ret = ES_OK; + +	for (i = 0; i < count; i++) { +		void *s = src + (i * data_size * b); +		char *d = buf + (i * data_size); + +		ret = vc_read_mem(ctxt, s, d, data_size); +		if (ret != ES_OK) +			break; +	} + +	return ret; +} + +static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt, +					   void *dst, char *buf, +					   unsigned int data_size, +					   unsigned int count, +					   bool backwards) +{ +	int i, s = backwards ? 
-1 : 1; +	enum es_result ret = ES_OK; + +	for (i = 0; i < count; i++) { +		void *d = dst + (i * data_size * s); +		char *b = buf + (i * data_size); + +		ret = vc_write_mem(ctxt, d, b, data_size); +		if (ret != ES_OK) +			break; +	} + +	return ret; +} + +#define IOIO_TYPE_STR  BIT(2) +#define IOIO_TYPE_IN   1 +#define IOIO_TYPE_INS  (IOIO_TYPE_IN | IOIO_TYPE_STR) +#define IOIO_TYPE_OUT  0 +#define IOIO_TYPE_OUTS (IOIO_TYPE_OUT | IOIO_TYPE_STR) + +#define IOIO_REP       BIT(3) + +#define IOIO_ADDR_64   BIT(9) +#define IOIO_ADDR_32   BIT(8) +#define IOIO_ADDR_16   BIT(7) + +#define IOIO_DATA_32   BIT(6) +#define IOIO_DATA_16   BIT(5) +#define IOIO_DATA_8    BIT(4) + +#define IOIO_SEG_ES    (0 << 10) +#define IOIO_SEG_DS    (3 << 10) + +static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) +{ +	struct insn *insn = &ctxt->insn; +	*exitinfo = 0; + +	switch (insn->opcode.bytes[0]) { +	/* INS opcodes */ +	case 0x6c: +	case 0x6d: +		*exitinfo |= IOIO_TYPE_INS; +		*exitinfo |= IOIO_SEG_ES; +		*exitinfo |= (ctxt->regs->dx & 0xffff) << 16; +		break; + +	/* OUTS opcodes */ +	case 0x6e: +	case 0x6f: +		*exitinfo |= IOIO_TYPE_OUTS; +		*exitinfo |= IOIO_SEG_DS; +		*exitinfo |= (ctxt->regs->dx & 0xffff) << 16; +		break; + +	/* IN immediate opcodes */ +	case 0xe4: +	case 0xe5: +		*exitinfo |= IOIO_TYPE_IN; +		*exitinfo |= (u64)insn->immediate.value << 16; +		break; + +	/* OUT immediate opcodes */ +	case 0xe6: +	case 0xe7: +		*exitinfo |= IOIO_TYPE_OUT; +		*exitinfo |= (u64)insn->immediate.value << 16; +		break; + +	/* IN register opcodes */ +	case 0xec: +	case 0xed: +		*exitinfo |= IOIO_TYPE_IN; +		*exitinfo |= (ctxt->regs->dx & 0xffff) << 16; +		break; + +	/* OUT register opcodes */ +	case 0xee: +	case 0xef: +		*exitinfo |= IOIO_TYPE_OUT; +		*exitinfo |= (ctxt->regs->dx & 0xffff) << 16; +		break; + +	default: +		return ES_DECODE_FAILED; +	} + +	switch (insn->opcode.bytes[0]) { +	case 0x6c: +	case 0x6e: +	case 0xe4: +	case 0xe6: +	case 0xec: +	case 0xee: +		/* Single byte opcodes */ +		*exitinfo |= IOIO_DATA_8; +		break; +	default: +		/* Length determined by instruction parsing */ +		*exitinfo |= (insn->opnd_bytes == 2) ? IOIO_DATA_16 +						     : IOIO_DATA_32; +	} +	switch (insn->addr_bytes) { +	case 2: +		*exitinfo |= IOIO_ADDR_16; +		break; +	case 4: +		*exitinfo |= IOIO_ADDR_32; +		break; +	case 8: +		*exitinfo |= IOIO_ADDR_64; +		break; +	} + +	if (insn_has_rep_prefix(insn)) +		*exitinfo |= IOIO_REP; + +	return ES_OK; +} + +static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ +	struct pt_regs *regs = ctxt->regs; +	u64 exit_info_1, exit_info_2; +	enum es_result ret; + +	ret = vc_ioio_exitinfo(ctxt, &exit_info_1); +	if (ret != ES_OK) +		return ret; + +	if (exit_info_1 & IOIO_TYPE_STR) { + +		/* (REP) INS/OUTS */ + +		bool df = ((regs->flags & X86_EFLAGS_DF) == X86_EFLAGS_DF); +		unsigned int io_bytes, exit_bytes; +		unsigned int ghcb_count, op_count; +		unsigned long es_base; +		u64 sw_scratch; + +		/* +		 * For the string variants with rep prefix the amount of in/out +		 * operations per #VC exception is limited so that the kernel +		 * has a chance to take interrupts and re-schedule while the +		 * instruction is emulated. +		 */ +		io_bytes   = (exit_info_1 >> 4) & 0x7; +		ghcb_count = sizeof(ghcb->shared_buffer) / io_bytes; + +		op_count    = (exit_info_1 & IOIO_REP) ? 
regs->cx : 1; +		exit_info_2 = min(op_count, ghcb_count); +		exit_bytes  = exit_info_2 * io_bytes; + +		es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES); + +		/* Read bytes of OUTS into the shared buffer */ +		if (!(exit_info_1 & IOIO_TYPE_IN)) { +			ret = vc_insn_string_read(ctxt, +					       (void *)(es_base + regs->si), +					       ghcb->shared_buffer, io_bytes, +					       exit_info_2, df); +			if (ret) +				return ret; +		} + +		/* +		 * Issue an VMGEXIT to the HV to consume the bytes from the +		 * shared buffer or to have it write them into the shared buffer +		 * depending on the instruction: OUTS or INS. +		 */ +		sw_scratch = __pa(ghcb) + offsetof(struct ghcb, shared_buffer); +		ghcb_set_sw_scratch(ghcb, sw_scratch); +		ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, +					  exit_info_1, exit_info_2); +		if (ret != ES_OK) +			return ret; + +		/* Read bytes from shared buffer into the guest's destination. */ +		if (exit_info_1 & IOIO_TYPE_IN) { +			ret = vc_insn_string_write(ctxt, +						   (void *)(es_base + regs->di), +						   ghcb->shared_buffer, io_bytes, +						   exit_info_2, df); +			if (ret) +				return ret; + +			if (df) +				regs->di -= exit_bytes; +			else +				regs->di += exit_bytes; +		} else { +			if (df) +				regs->si -= exit_bytes; +			else +				regs->si += exit_bytes; +		} + +		if (exit_info_1 & IOIO_REP) +			regs->cx -= exit_info_2; + +		ret = regs->cx ? ES_RETRY : ES_OK; + +	} else { + +		/* IN/OUT into/from rAX */ + +		int bits = (exit_info_1 & 0x70) >> 1; +		u64 rax = 0; + +		if (!(exit_info_1 & IOIO_TYPE_IN)) +			rax = lower_bits(regs->ax, bits); + +		ghcb_set_rax(ghcb, rax); + +		ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, exit_info_1, 0); +		if (ret != ES_OK) +			return ret; + +		if (exit_info_1 & IOIO_TYPE_IN) { +			if (!ghcb_rax_is_valid(ghcb)) +				return ES_VMM_ERROR; +			regs->ax = lower_bits(ghcb->save.rax, bits); +		} +	} + +	return ret; +} + +static enum es_result vc_handle_cpuid(struct ghcb *ghcb, +				      struct es_em_ctxt *ctxt) +{ +	struct pt_regs *regs = ctxt->regs; +	u32 cr4 = native_read_cr4(); +	enum es_result ret; + +	ghcb_set_rax(ghcb, regs->ax); +	ghcb_set_rcx(ghcb, regs->cx); + +	if (cr4 & X86_CR4_OSXSAVE) +		/* Safe to read xcr0 */ +		ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK)); +	else +		/* xgetbv will cause #GP - use reset value for xcr0 */ +		ghcb_set_xcr0(ghcb, 1); + +	ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); +	if (ret != ES_OK) +		return ret; + +	if (!(ghcb_rax_is_valid(ghcb) && +	      ghcb_rbx_is_valid(ghcb) && +	      ghcb_rcx_is_valid(ghcb) && +	      ghcb_rdx_is_valid(ghcb))) +		return ES_VMM_ERROR; + +	regs->ax = ghcb->save.rax; +	regs->bx = ghcb->save.rbx; +	regs->cx = ghcb->save.rcx; +	regs->dx = ghcb->save.rdx; + +	return ES_OK; +} + +static enum es_result vc_handle_rdtsc(struct ghcb *ghcb, +				      struct es_em_ctxt *ctxt, +				      unsigned long exit_code) +{ +	bool rdtscp = (exit_code == SVM_EXIT_RDTSCP); +	enum es_result ret; + +	ret = sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, 0, 0); +	if (ret != ES_OK) +		return ret; + +	if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb) && +	     (!rdtscp || ghcb_rcx_is_valid(ghcb)))) +		return ES_VMM_ERROR; + +	ctxt->regs->ax = ghcb->save.rax; +	ctxt->regs->dx = ghcb->save.rdx; +	if (rdtscp) +		ctxt->regs->cx = ghcb->save.rcx; + +	return ES_OK; +} diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c new file mode 100644 index 000000000000..4a96726fbaf8 --- /dev/null +++ b/arch/x86/kernel/sev-es.c @@ 
-0,0 +1,1404 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2019 SUSE + * + * Author: Joerg Roedel <jroedel@suse.de> + */ + +#define pr_fmt(fmt)	"SEV-ES: " fmt + +#include <linux/sched/debug.h>	/* For show_regs() */ +#include <linux/percpu-defs.h> +#include <linux/mem_encrypt.h> +#include <linux/lockdep.h> +#include <linux/printk.h> +#include <linux/mm_types.h> +#include <linux/set_memory.h> +#include <linux/memblock.h> +#include <linux/kernel.h> +#include <linux/mm.h> + +#include <asm/cpu_entry_area.h> +#include <asm/stacktrace.h> +#include <asm/sev-es.h> +#include <asm/insn-eval.h> +#include <asm/fpu/internal.h> +#include <asm/processor.h> +#include <asm/realmode.h> +#include <asm/traps.h> +#include <asm/svm.h> +#include <asm/smp.h> +#include <asm/cpu.h> + +#define DR7_RESET_VALUE        0x400 + +/* For early boot hypervisor communication in SEV-ES enabled guests */ +static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); + +/* + * Needs to be in the .data section because we need it NULL before bss is + * cleared + */ +static struct ghcb __initdata *boot_ghcb; + +/* #VC handler runtime per-CPU data */ +struct sev_es_runtime_data { +	struct ghcb ghcb_page; + +	/* Physical storage for the per-CPU IST stack of the #VC handler */ +	char ist_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE); + +	/* +	 * Physical storage for the per-CPU fall-back stack of the #VC handler. +	 * The fall-back stack is used when it is not safe to switch back to the +	 * interrupted stack in the #VC entry code. +	 */ +	char fallback_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE); + +	/* +	 * Reserve one page per CPU as backup storage for the unencrypted GHCB. +	 * It is needed when an NMI happens while the #VC handler uses the real +	 * GHCB, and the NMI handler itself is causing another #VC exception. In +	 * that case the GHCB content of the first handler needs to be backed up +	 * and restored. +	 */ +	struct ghcb backup_ghcb; + +	/* +	 * Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions. +	 * There is no need for it to be atomic, because nothing is written to +	 * the GHCB between the read and the write of ghcb_active. So it is safe +	 * to use it when a nested #VC exception happens before the write. +	 * +	 * This is necessary for example in the #VC->NMI->#VC case when the NMI +	 * happens while the first #VC handler uses the GHCB. When the NMI code +	 * raises a second #VC handler it might overwrite the contents of the +	 * GHCB written by the first handler. To avoid this the content of the +	 * GHCB is saved and restored when the GHCB is detected to be in use +	 * already. +	 */ +	bool ghcb_active; +	bool backup_ghcb_active; + +	/* +	 * Cached DR7 value - write it on DR7 writes and return it on reads. +	 * That value will never make it to the real hardware DR7 as debugging +	 * is currently unsupported in SEV-ES guests. 
+	 */ +	unsigned long dr7; +}; + +struct ghcb_state { +	struct ghcb *ghcb; +}; + +static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data); +DEFINE_STATIC_KEY_FALSE(sev_es_enable_key); + +/* Needed in vc_early_forward_exception */ +void do_early_exception(struct pt_regs *regs, int trapnr); + +static void __init setup_vc_stacks(int cpu) +{ +	struct sev_es_runtime_data *data; +	struct cpu_entry_area *cea; +	unsigned long vaddr; +	phys_addr_t pa; + +	data = per_cpu(runtime_data, cpu); +	cea  = get_cpu_entry_area(cpu); + +	/* Map #VC IST stack */ +	vaddr = CEA_ESTACK_BOT(&cea->estacks, VC); +	pa    = __pa(data->ist_stack); +	cea_set_pte((void *)vaddr, pa, PAGE_KERNEL); + +	/* Map VC fall-back stack */ +	vaddr = CEA_ESTACK_BOT(&cea->estacks, VC2); +	pa    = __pa(data->fallback_stack); +	cea_set_pte((void *)vaddr, pa, PAGE_KERNEL); +} + +static __always_inline bool on_vc_stack(unsigned long sp) +{ +	return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC))); +} + +/* + * This function handles the case when an NMI is raised in the #VC exception + * handler entry code. In this case, the IST entry for #VC must be adjusted, so + * that any subsequent #VC exception will not overwrite the stack contents of the + * interrupted #VC handler. + * + * The IST entry is adjusted unconditionally so that it can be also be + * unconditionally adjusted back in sev_es_ist_exit(). Otherwise a nested + * sev_es_ist_exit() call may adjust back the IST entry too early. + */ +void noinstr __sev_es_ist_enter(struct pt_regs *regs) +{ +	unsigned long old_ist, new_ist; + +	/* Read old IST entry */ +	old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); + +	/* Make room on the IST stack */ +	if (on_vc_stack(regs->sp)) +		new_ist = ALIGN_DOWN(regs->sp, 8) - sizeof(old_ist); +	else +		new_ist = old_ist - sizeof(old_ist); + +	/* Store old IST entry */ +	*(unsigned long *)new_ist = old_ist; + +	/* Set new IST entry */ +	this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist); +} + +void noinstr __sev_es_ist_exit(void) +{ +	unsigned long ist; + +	/* Read IST entry */ +	ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); + +	if (WARN_ON(ist == __this_cpu_ist_top_va(VC))) +		return; + +	/* Read back old IST entry and write it to the TSS */ +	this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist); +} + +static __always_inline struct ghcb *sev_es_get_ghcb(struct ghcb_state *state) +{ +	struct sev_es_runtime_data *data; +	struct ghcb *ghcb; + +	data = this_cpu_read(runtime_data); +	ghcb = &data->ghcb_page; + +	if (unlikely(data->ghcb_active)) { +		/* GHCB is already in use - save its contents */ + +		if (unlikely(data->backup_ghcb_active)) +			return NULL; + +		/* Mark backup_ghcb active before writing to it */ +		data->backup_ghcb_active = true; + +		state->ghcb = &data->backup_ghcb; + +		/* Backup GHCB content */ +		*state->ghcb = *ghcb; +	} else { +		state->ghcb = NULL; +		data->ghcb_active = true; +	} + +	return ghcb; +} + +static __always_inline void sev_es_put_ghcb(struct ghcb_state *state) +{ +	struct sev_es_runtime_data *data; +	struct ghcb *ghcb; + +	data = this_cpu_read(runtime_data); +	ghcb = &data->ghcb_page; + +	if (state->ghcb) { +		/* Restore GHCB from Backup */ +		*ghcb = *state->ghcb; +		data->backup_ghcb_active = false; +		state->ghcb = NULL; +	} else { +		data->ghcb_active = false; +	} +} + +/* Needed in vc_early_forward_exception */ +void do_early_exception(struct pt_regs *regs, int trapnr); + +static inline u64 sev_es_rd_ghcb_msr(void) 
+{ +	return __rdmsr(MSR_AMD64_SEV_ES_GHCB); +} + +static inline void sev_es_wr_ghcb_msr(u64 val) +{ +	u32 low, high; + +	low  = (u32)(val); +	high = (u32)(val >> 32); + +	native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high); +} + +static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt, +				unsigned char *buffer) +{ +	return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); +} + +static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) +{ +	char buffer[MAX_INSN_SIZE]; +	enum es_result ret; +	int res; + +	if (user_mode(ctxt->regs)) { +		res = insn_fetch_from_user(ctxt->regs, buffer); +		if (!res) { +			ctxt->fi.vector     = X86_TRAP_PF; +			ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER; +			ctxt->fi.cr2        = ctxt->regs->ip; +			return ES_EXCEPTION; +		} + +		if (!insn_decode(&ctxt->insn, ctxt->regs, buffer, res)) +			return ES_DECODE_FAILED; +	} else { +		res = vc_fetch_insn_kernel(ctxt, buffer); +		if (res) { +			ctxt->fi.vector     = X86_TRAP_PF; +			ctxt->fi.error_code = X86_PF_INSTR; +			ctxt->fi.cr2        = ctxt->regs->ip; +			return ES_EXCEPTION; +		} + +		insn_init(&ctxt->insn, buffer, MAX_INSN_SIZE - res, 1); +		insn_get_length(&ctxt->insn); +	} + +	ret = ctxt->insn.immediate.got ? ES_OK : ES_DECODE_FAILED; + +	return ret; +} + +static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, +				   char *dst, char *buf, size_t size) +{ +	unsigned long error_code = X86_PF_PROT | X86_PF_WRITE; +	char __user *target = (char __user *)dst; +	u64 d8; +	u32 d4; +	u16 d2; +	u8  d1; + +	switch (size) { +	case 1: +		memcpy(&d1, buf, 1); +		if (put_user(d1, target)) +			goto fault; +		break; +	case 2: +		memcpy(&d2, buf, 2); +		if (put_user(d2, target)) +			goto fault; +		break; +	case 4: +		memcpy(&d4, buf, 4); +		if (put_user(d4, target)) +			goto fault; +		break; +	case 8: +		memcpy(&d8, buf, 8); +		if (put_user(d8, target)) +			goto fault; +		break; +	default: +		WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); +		return ES_UNSUPPORTED; +	} + +	return ES_OK; + +fault: +	if (user_mode(ctxt->regs)) +		error_code |= X86_PF_USER; + +	ctxt->fi.vector = X86_TRAP_PF; +	ctxt->fi.error_code = error_code; +	ctxt->fi.cr2 = (unsigned long)dst; + +	return ES_EXCEPTION; +} + +static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, +				  char *src, char *buf, size_t size) +{ +	unsigned long error_code = X86_PF_PROT; +	char __user *s = (char __user *)src; +	u64 d8; +	u32 d4; +	u16 d2; +	u8  d1; + +	switch (size) { +	case 1: +		if (get_user(d1, s)) +			goto fault; +		memcpy(buf, &d1, 1); +		break; +	case 2: +		if (get_user(d2, s)) +			goto fault; +		memcpy(buf, &d2, 2); +		break; +	case 4: +		if (get_user(d4, s)) +			goto fault; +		memcpy(buf, &d4, 4); +		break; +	case 8: +		if (get_user(d8, s)) +			goto fault; +		memcpy(buf, &d8, 8); +		break; +	default: +		WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); +		return ES_UNSUPPORTED; +	} + +	return ES_OK; + +fault: +	if (user_mode(ctxt->regs)) +		error_code |= X86_PF_USER; + +	ctxt->fi.vector = X86_TRAP_PF; +	ctxt->fi.error_code = error_code; +	ctxt->fi.cr2 = (unsigned long)src; + +	return ES_EXCEPTION; +} + +static bool vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt, +				 unsigned long vaddr, phys_addr_t *paddr) +{ +	unsigned long va = (unsigned long)vaddr; +	unsigned int level; +	phys_addr_t pa; +	pgd_t *pgd; +	pte_t *pte; + +	pgd = __va(read_cr3_pa()); +	pgd = &pgd[pgd_index(va)]; +	pte = lookup_address_in_pgd(pgd, va, &level); +	if (!pte) { +		ctxt->fi.vector     = X86_TRAP_PF; +		
ctxt->fi.cr2        = vaddr; +		ctxt->fi.error_code = 0; + +		if (user_mode(ctxt->regs)) +			ctxt->fi.error_code |= X86_PF_USER; + +		return false; +	} + +	pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; +	pa |= va & ~page_level_mask(level); + +	*paddr = pa; + +	return true; +} + +/* Include code shared with pre-decompression boot stage */ +#include "sev-es-shared.c" + +void noinstr __sev_es_nmi_complete(void) +{ +	struct ghcb_state state; +	struct ghcb *ghcb; + +	ghcb = sev_es_get_ghcb(&state); + +	vc_ghcb_invalidate(ghcb); +	ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE); +	ghcb_set_sw_exit_info_1(ghcb, 0); +	ghcb_set_sw_exit_info_2(ghcb, 0); + +	sev_es_wr_ghcb_msr(__pa_nodebug(ghcb)); +	VMGEXIT(); + +	sev_es_put_ghcb(&state); +} + +static u64 get_jump_table_addr(void) +{ +	struct ghcb_state state; +	unsigned long flags; +	struct ghcb *ghcb; +	u64 ret = 0; + +	local_irq_save(flags); + +	ghcb = sev_es_get_ghcb(&state); + +	vc_ghcb_invalidate(ghcb); +	ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE); +	ghcb_set_sw_exit_info_1(ghcb, SVM_VMGEXIT_GET_AP_JUMP_TABLE); +	ghcb_set_sw_exit_info_2(ghcb, 0); + +	sev_es_wr_ghcb_msr(__pa(ghcb)); +	VMGEXIT(); + +	if (ghcb_sw_exit_info_1_is_valid(ghcb) && +	    ghcb_sw_exit_info_2_is_valid(ghcb)) +		ret = ghcb->save.sw_exit_info_2; + +	sev_es_put_ghcb(&state); + +	local_irq_restore(flags); + +	return ret; +} + +int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) +{ +	u16 startup_cs, startup_ip; +	phys_addr_t jump_table_pa; +	u64 jump_table_addr; +	u16 __iomem *jump_table; + +	jump_table_addr = get_jump_table_addr(); + +	/* On UP guests there is no jump table so this is not a failure */ +	if (!jump_table_addr) +		return 0; + +	/* Check if AP Jump Table is page-aligned */ +	if (jump_table_addr & ~PAGE_MASK) +		return -EINVAL; + +	jump_table_pa = jump_table_addr & PAGE_MASK; + +	startup_cs = (u16)(rmh->trampoline_start >> 4); +	startup_ip = (u16)(rmh->sev_es_trampoline_start - +			   rmh->trampoline_start); + +	jump_table = ioremap_encrypted(jump_table_pa, PAGE_SIZE); +	if (!jump_table) +		return -EIO; + +	writew(startup_ip, &jump_table[0]); +	writew(startup_cs, &jump_table[1]); + +	iounmap(jump_table); + +	return 0; +} + +/* + * This is needed by the OVMF UEFI firmware which will use whatever it finds in + * the GHCB MSR as its GHCB to talk to the hypervisor. So make sure the per-cpu + * runtime GHCBs used by the kernel are also mapped in the EFI page-table. + */ +int __init sev_es_efi_map_ghcbs(pgd_t *pgd) +{ +	struct sev_es_runtime_data *data; +	unsigned long address, pflags; +	int cpu; +	u64 pfn; + +	if (!sev_es_active()) +		return 0; + +	pflags = _PAGE_NX | _PAGE_RW; + +	for_each_possible_cpu(cpu) { +		data = per_cpu(runtime_data, cpu); + +		address = __pa(&data->ghcb_page); +		pfn = address >> PAGE_SHIFT; + +		if (kernel_map_pages_in_pgd(pgd, pfn, address, 1, pflags)) +			return 1; +	} + +	return 0; +} + +static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ +	struct pt_regs *regs = ctxt->regs; +	enum es_result ret; +	u64 exit_info_1; + +	/* Is it a WRMSR? */ +	exit_info_1 = (ctxt->insn.opcode.bytes[1] == 0x30) ? 
1 : 0; + +	ghcb_set_rcx(ghcb, regs->cx); +	if (exit_info_1) { +		ghcb_set_rax(ghcb, regs->ax); +		ghcb_set_rdx(ghcb, regs->dx); +	} + +	ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, exit_info_1, 0); + +	if ((ret == ES_OK) && (!exit_info_1)) { +		regs->ax = ghcb->save.rax; +		regs->dx = ghcb->save.rdx; +	} + +	return ret; +} + +/* + * This function runs on the first #VC exception after the kernel + * switched to virtual addresses. + */ +static bool __init sev_es_setup_ghcb(void) +{ +	/* First make sure the hypervisor talks a supported protocol. */ +	if (!sev_es_negotiate_protocol()) +		return false; + +	/* +	 * Clear the boot_ghcb. The first exception comes in before the bss +	 * section is cleared. +	 */ +	memset(&boot_ghcb_page, 0, PAGE_SIZE); + +	/* Alright - Make the boot-ghcb public */ +	boot_ghcb = &boot_ghcb_page; + +	return true; +} + +#ifdef CONFIG_HOTPLUG_CPU +static void sev_es_ap_hlt_loop(void) +{ +	struct ghcb_state state; +	struct ghcb *ghcb; + +	ghcb = sev_es_get_ghcb(&state); + +	while (true) { +		vc_ghcb_invalidate(ghcb); +		ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_HLT_LOOP); +		ghcb_set_sw_exit_info_1(ghcb, 0); +		ghcb_set_sw_exit_info_2(ghcb, 0); + +		sev_es_wr_ghcb_msr(__pa(ghcb)); +		VMGEXIT(); + +		/* Wakeup signal? */ +		if (ghcb_sw_exit_info_2_is_valid(ghcb) && +		    ghcb->save.sw_exit_info_2) +			break; +	} + +	sev_es_put_ghcb(&state); +} + +/* + * Play_dead handler when running under SEV-ES. This is needed because + * the hypervisor can't deliver an SIPI request to restart the AP. + * Instead the kernel has to issue a VMGEXIT to halt the VCPU until the + * hypervisor wakes it up again. + */ +static void sev_es_play_dead(void) +{ +	play_dead_common(); + +	/* IRQs now disabled */ + +	sev_es_ap_hlt_loop(); + +	/* +	 * If we get here, the VCPU was woken up again. Jump to CPU +	 * startup code to get it back online. 
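
sev_es_ap_hlt_loop() above, like __sev_es_nmi_complete() and get_jump_table_addr() earlier, follows one common request pattern against the GHCB. Factored out purely for illustration (the helper name and the zero-on-invalid return convention are mine, not part of the patch), the pattern looks like this:

static u64 ghcb_do_request(struct ghcb *ghcb, u64 exit_code, u64 info_1, u64 info_2)
{
	/* Caller must own the GHCB (sev_es_get_ghcb()) and run with IRQs off */
	vc_ghcb_invalidate(ghcb);
	ghcb_set_sw_exit_code(ghcb, exit_code);
	ghcb_set_sw_exit_info_1(ghcb, info_1);
	ghcb_set_sw_exit_info_2(ghcb, info_2);

	/* Tell the hypervisor where the GHCB lives and exit to it */
	sev_es_wr_ghcb_msr(__pa(ghcb));
	VMGEXIT();

	/* Results, if any, come back in sw_exit_info_2 */
	return ghcb_sw_exit_info_2_is_valid(ghcb) ? ghcb->save.sw_exit_info_2 : 0;
}

The HLT loop simply re-issues such a request until the hypervisor reports a non-zero wake-up value.
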
+	 */ +	start_cpu0(); +} +#else  /* CONFIG_HOTPLUG_CPU */ +#define sev_es_play_dead	native_play_dead +#endif /* CONFIG_HOTPLUG_CPU */ + +#ifdef CONFIG_SMP +static void __init sev_es_setup_play_dead(void) +{ +	smp_ops.play_dead = sev_es_play_dead; +} +#else +static inline void sev_es_setup_play_dead(void) { } +#endif + +static void __init alloc_runtime_data(int cpu) +{ +	struct sev_es_runtime_data *data; + +	data = memblock_alloc(sizeof(*data), PAGE_SIZE); +	if (!data) +		panic("Can't allocate SEV-ES runtime data"); + +	per_cpu(runtime_data, cpu) = data; +} + +static void __init init_ghcb(int cpu) +{ +	struct sev_es_runtime_data *data; +	int err; + +	data = per_cpu(runtime_data, cpu); + +	err = early_set_memory_decrypted((unsigned long)&data->ghcb_page, +					 sizeof(data->ghcb_page)); +	if (err) +		panic("Can't map GHCBs unencrypted"); + +	memset(&data->ghcb_page, 0, sizeof(data->ghcb_page)); + +	data->ghcb_active = false; +	data->backup_ghcb_active = false; +} + +void __init sev_es_init_vc_handling(void) +{ +	int cpu; + +	BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE); + +	if (!sev_es_active()) +		return; + +	if (!sev_es_check_cpu_features()) +		panic("SEV-ES CPU Features missing"); + +	/* Enable SEV-ES special handling */ +	static_branch_enable(&sev_es_enable_key); + +	/* Initialize per-cpu GHCB pages */ +	for_each_possible_cpu(cpu) { +		alloc_runtime_data(cpu); +		init_ghcb(cpu); +		setup_vc_stacks(cpu); +	} + +	sev_es_setup_play_dead(); + +	/* Secondary CPUs use the runtime #VC handler */ +	initial_vc_handler = (unsigned long)safe_stack_exc_vmm_communication; +} + +static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt) +{ +	int trapnr = ctxt->fi.vector; + +	if (trapnr == X86_TRAP_PF) +		native_write_cr2(ctxt->fi.cr2); + +	ctxt->regs->orig_ax = ctxt->fi.error_code; +	do_early_exception(ctxt->regs, trapnr); +} + +static long *vc_insn_get_reg(struct es_em_ctxt *ctxt) +{ +	long *reg_array; +	int offset; + +	reg_array = (long *)ctxt->regs; +	offset    = insn_get_modrm_reg_off(&ctxt->insn, ctxt->regs); + +	if (offset < 0) +		return NULL; + +	offset /= sizeof(long); + +	return reg_array + offset; +} + +static long *vc_insn_get_rm(struct es_em_ctxt *ctxt) +{ +	long *reg_array; +	int offset; + +	reg_array = (long *)ctxt->regs; +	offset    = insn_get_modrm_rm_off(&ctxt->insn, ctxt->regs); + +	if (offset < 0) +		return NULL; + +	offset /= sizeof(long); + +	return reg_array + offset; +} +static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt, +				 unsigned int bytes, bool read) +{ +	u64 exit_code, exit_info_1, exit_info_2; +	unsigned long ghcb_pa = __pa(ghcb); +	phys_addr_t paddr; +	void __user *ref; + +	ref = insn_get_addr_ref(&ctxt->insn, ctxt->regs); +	if (ref == (void __user *)-1L) +		return ES_UNSUPPORTED; + +	exit_code = read ? 
SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE; + +	if (!vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr)) { +		if (!read) +			ctxt->fi.error_code |= X86_PF_WRITE; + +		return ES_EXCEPTION; +	} + +	exit_info_1 = paddr; +	/* Can never be greater than 8 */ +	exit_info_2 = bytes; + +	ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer)); + +	return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2); +} + +static enum es_result vc_handle_mmio_twobyte_ops(struct ghcb *ghcb, +						 struct es_em_ctxt *ctxt) +{ +	struct insn *insn = &ctxt->insn; +	unsigned int bytes = 0; +	enum es_result ret; +	int sign_byte; +	long *reg_data; + +	switch (insn->opcode.bytes[1]) { +		/* MMIO Read w/ zero-extension */ +	case 0xb6: +		bytes = 1; +		fallthrough; +	case 0xb7: +		if (!bytes) +			bytes = 2; + +		ret = vc_do_mmio(ghcb, ctxt, bytes, true); +		if (ret) +			break; + +		/* Zero extend based on operand size */ +		reg_data = vc_insn_get_reg(ctxt); +		if (!reg_data) +			return ES_DECODE_FAILED; + +		memset(reg_data, 0, insn->opnd_bytes); + +		memcpy(reg_data, ghcb->shared_buffer, bytes); +		break; + +		/* MMIO Read w/ sign-extension */ +	case 0xbe: +		bytes = 1; +		fallthrough; +	case 0xbf: +		if (!bytes) +			bytes = 2; + +		ret = vc_do_mmio(ghcb, ctxt, bytes, true); +		if (ret) +			break; + +		/* Sign extend based on operand size */ +		reg_data = vc_insn_get_reg(ctxt); +		if (!reg_data) +			return ES_DECODE_FAILED; + +		if (bytes == 1) { +			u8 *val = (u8 *)ghcb->shared_buffer; + +			sign_byte = (*val & 0x80) ? 0xff : 0x00; +		} else { +			u16 *val = (u16 *)ghcb->shared_buffer; + +			sign_byte = (*val & 0x8000) ? 0xff : 0x00; +		} +		memset(reg_data, sign_byte, insn->opnd_bytes); + +		memcpy(reg_data, ghcb->shared_buffer, bytes); +		break; + +	default: +		ret = ES_UNSUPPORTED; +	} + +	return ret; +} + +/* + * The MOVS instruction has two memory operands, which raises the + * problem that it is not known whether the access to the source or the + * destination caused the #VC exception (and hence whether an MMIO read + * or write operation needs to be emulated). + * + * Instead of playing games with walking page-tables and trying to guess + * whether the source or destination is an MMIO range, split the move + * into two operations, a read and a write with only one memory operand. + * This will cause a nested #VC exception on the MMIO address which can + * then be handled. + * + * This implementation has the benefit that it also supports MOVS where + * source _and_ destination are MMIO regions. + * + * It will slow MOVS on MMIO down a lot, but in SEV-ES guests it is a + * rare operation. If it turns out to be a performance problem the split + * operations can be moved to memcpy_fromio() and memcpy_toio(). 
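
For illustration, the per-iteration register bookkeeping that the handler below performs for (REP) MOVS can be modelled in plain C as follows (a standalone sketch, not kernel code; df mirrors EFLAGS.DF and width is the operand size in bytes):

struct movs_regs {
	unsigned long si, di, cx;
};

/*
 * One MOVS step: advance SI/DI by the operand width, backwards when the
 * direction flag is set, and consume one REP count. A non-zero return
 * means the instruction has to be retried (the ES_RETRY case below).
 */
static int movs_step(struct movs_regs *r, int width, int df, int rep)
{
	long off = df ? -width : width;

	r->si += off;
	r->di += off;

	if (rep)
		r->cx -= 1;

	return rep && r->cx != 0;
}
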
+ */ +static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt, +					  unsigned int bytes) +{ +	unsigned long ds_base, es_base; +	unsigned char *src, *dst; +	unsigned char buffer[8]; +	enum es_result ret; +	bool rep; +	int off; + +	ds_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_DS); +	es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES); + +	if (ds_base == -1L || es_base == -1L) { +		ctxt->fi.vector = X86_TRAP_GP; +		ctxt->fi.error_code = 0; +		return ES_EXCEPTION; +	} + +	src = ds_base + (unsigned char *)ctxt->regs->si; +	dst = es_base + (unsigned char *)ctxt->regs->di; + +	ret = vc_read_mem(ctxt, src, buffer, bytes); +	if (ret != ES_OK) +		return ret; + +	ret = vc_write_mem(ctxt, dst, buffer, bytes); +	if (ret != ES_OK) +		return ret; + +	if (ctxt->regs->flags & X86_EFLAGS_DF) +		off = -bytes; +	else +		off =  bytes; + +	ctxt->regs->si += off; +	ctxt->regs->di += off; + +	rep = insn_has_rep_prefix(&ctxt->insn); +	if (rep) +		ctxt->regs->cx -= 1; + +	if (!rep || ctxt->regs->cx == 0) +		return ES_OK; +	else +		return ES_RETRY; +} + +static enum es_result vc_handle_mmio(struct ghcb *ghcb, +				     struct es_em_ctxt *ctxt) +{ +	struct insn *insn = &ctxt->insn; +	unsigned int bytes = 0; +	enum es_result ret; +	long *reg_data; + +	switch (insn->opcode.bytes[0]) { +	/* MMIO Write */ +	case 0x88: +		bytes = 1; +		fallthrough; +	case 0x89: +		if (!bytes) +			bytes = insn->opnd_bytes; + +		reg_data = vc_insn_get_reg(ctxt); +		if (!reg_data) +			return ES_DECODE_FAILED; + +		memcpy(ghcb->shared_buffer, reg_data, bytes); + +		ret = vc_do_mmio(ghcb, ctxt, bytes, false); +		break; + +	case 0xc6: +		bytes = 1; +		fallthrough; +	case 0xc7: +		if (!bytes) +			bytes = insn->opnd_bytes; + +		memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes); + +		ret = vc_do_mmio(ghcb, ctxt, bytes, false); +		break; + +		/* MMIO Read */ +	case 0x8a: +		bytes = 1; +		fallthrough; +	case 0x8b: +		if (!bytes) +			bytes = insn->opnd_bytes; + +		ret = vc_do_mmio(ghcb, ctxt, bytes, true); +		if (ret) +			break; + +		reg_data = vc_insn_get_reg(ctxt); +		if (!reg_data) +			return ES_DECODE_FAILED; + +		/* Zero-extend for 32-bit operation */ +		if (bytes == 4) +			*reg_data = 0; + +		memcpy(reg_data, ghcb->shared_buffer, bytes); +		break; + +		/* MOVS instruction */ +	case 0xa4: +		bytes = 1; +		fallthrough; +	case 0xa5: +		if (!bytes) +			bytes = insn->opnd_bytes; + +		ret = vc_handle_mmio_movs(ctxt, bytes); +		break; +		/* Two-Byte Opcodes */ +	case 0x0f: +		ret = vc_handle_mmio_twobyte_ops(ghcb, ctxt); +		break; +	default: +		ret = ES_UNSUPPORTED; +	} + +	return ret; +} + +static enum es_result vc_handle_dr7_write(struct ghcb *ghcb, +					  struct es_em_ctxt *ctxt) +{ +	struct sev_es_runtime_data *data = this_cpu_read(runtime_data); +	long val, *reg = vc_insn_get_rm(ctxt); +	enum es_result ret; + +	if (!reg) +		return ES_DECODE_FAILED; + +	val = *reg; + +	/* Upper 32 bits must be written as zeroes */ +	if (val >> 32) { +		ctxt->fi.vector = X86_TRAP_GP; +		ctxt->fi.error_code = 0; +		return ES_EXCEPTION; +	} + +	/* Clear out other reserved bits and set bit 10 */ +	val = (val & 0xffff23ffL) | BIT(10); + +	/* Early non-zero writes to DR7 are not supported */ +	if (!data && (val & ~DR7_RESET_VALUE)) +		return ES_UNSUPPORTED; + +	/* Using a value of 0 for ExitInfo1 means RAX holds the value */ +	ghcb_set_rax(ghcb, val); +	ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0); +	if (ret != ES_OK) +		return ret; + +	if (data) +		data->dr7 = val; + +	return ES_OK; +} + +static enum es_result 
vc_handle_dr7_read(struct ghcb *ghcb, +					 struct es_em_ctxt *ctxt) +{ +	struct sev_es_runtime_data *data = this_cpu_read(runtime_data); +	long *reg = vc_insn_get_rm(ctxt); + +	if (!reg) +		return ES_DECODE_FAILED; + +	if (data) +		*reg = data->dr7; +	else +		*reg = DR7_RESET_VALUE; + +	return ES_OK; +} + +static enum es_result vc_handle_wbinvd(struct ghcb *ghcb, +				       struct es_em_ctxt *ctxt) +{ +	return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0); +} + +static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ +	enum es_result ret; + +	ghcb_set_rcx(ghcb, ctxt->regs->cx); + +	ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0); +	if (ret != ES_OK) +		return ret; + +	if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb))) +		return ES_VMM_ERROR; + +	ctxt->regs->ax = ghcb->save.rax; +	ctxt->regs->dx = ghcb->save.rdx; + +	return ES_OK; +} + +static enum es_result vc_handle_monitor(struct ghcb *ghcb, +					struct es_em_ctxt *ctxt) +{ +	/* +	 * Treat it as a NOP and do not leak a physical address to the +	 * hypervisor. +	 */ +	return ES_OK; +} + +static enum es_result vc_handle_mwait(struct ghcb *ghcb, +				      struct es_em_ctxt *ctxt) +{ +	/* Treat the same as MONITOR/MONITORX */ +	return ES_OK; +} + +static enum es_result vc_handle_vmmcall(struct ghcb *ghcb, +					struct es_em_ctxt *ctxt) +{ +	enum es_result ret; + +	ghcb_set_rax(ghcb, ctxt->regs->ax); +	ghcb_set_cpl(ghcb, user_mode(ctxt->regs) ? 3 : 0); + +	if (x86_platform.hyper.sev_es_hcall_prepare) +		x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs); + +	ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0); +	if (ret != ES_OK) +		return ret; + +	if (!ghcb_rax_is_valid(ghcb)) +		return ES_VMM_ERROR; + +	ctxt->regs->ax = ghcb->save.rax; + +	/* +	 * Call sev_es_hcall_finish() after regs->ax is already set. +	 * This allows the hypervisor handler to overwrite it again if +	 * necessary. +	 */ +	if (x86_platform.hyper.sev_es_hcall_finish && +	    !x86_platform.hyper.sev_es_hcall_finish(ghcb, ctxt->regs)) +		return ES_VMM_ERROR; + +	return ES_OK; +} + +static enum es_result vc_handle_trap_ac(struct ghcb *ghcb, +					struct es_em_ctxt *ctxt) +{ +	/* +	 * Calling ecx_alignment_check() directly does not work, because it +	 * enables IRQs and the GHCB is active. Forward the exception and call +	 * it later from vc_forward_exception(). +	 */ +	ctxt->fi.vector = X86_TRAP_AC; +	ctxt->fi.error_code = 0; +	return ES_EXCEPTION; +} + +static __always_inline void vc_handle_trap_db(struct pt_regs *regs) +{ +	if (user_mode(regs)) +		noist_exc_debug(regs); +	else +		exc_debug(regs); +} + +static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt, +					 struct ghcb *ghcb, +					 unsigned long exit_code) +{ +	enum es_result result; + +	switch (exit_code) { +	case SVM_EXIT_READ_DR7: +		result = vc_handle_dr7_read(ghcb, ctxt); +		break; +	case SVM_EXIT_WRITE_DR7: +		result = vc_handle_dr7_write(ghcb, ctxt); +		break; +	case SVM_EXIT_EXCP_BASE + X86_TRAP_AC: +		result = vc_handle_trap_ac(ghcb, ctxt); +		break; +	case SVM_EXIT_RDTSC: +	case SVM_EXIT_RDTSCP: +		result = vc_handle_rdtsc(ghcb, ctxt, exit_code); +		break; +	case SVM_EXIT_RDPMC: +		result = vc_handle_rdpmc(ghcb, ctxt); +		break; +	case SVM_EXIT_INVD: +		pr_err_ratelimited("#VC exception for INVD??? 
Seriously???\n"); +		result = ES_UNSUPPORTED; +		break; +	case SVM_EXIT_CPUID: +		result = vc_handle_cpuid(ghcb, ctxt); +		break; +	case SVM_EXIT_IOIO: +		result = vc_handle_ioio(ghcb, ctxt); +		break; +	case SVM_EXIT_MSR: +		result = vc_handle_msr(ghcb, ctxt); +		break; +	case SVM_EXIT_VMMCALL: +		result = vc_handle_vmmcall(ghcb, ctxt); +		break; +	case SVM_EXIT_WBINVD: +		result = vc_handle_wbinvd(ghcb, ctxt); +		break; +	case SVM_EXIT_MONITOR: +		result = vc_handle_monitor(ghcb, ctxt); +		break; +	case SVM_EXIT_MWAIT: +		result = vc_handle_mwait(ghcb, ctxt); +		break; +	case SVM_EXIT_NPF: +		result = vc_handle_mmio(ghcb, ctxt); +		break; +	default: +		/* +		 * Unexpected #VC exception +		 */ +		result = ES_UNSUPPORTED; +	} + +	return result; +} + +static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt) +{ +	long error_code = ctxt->fi.error_code; +	int trapnr = ctxt->fi.vector; + +	ctxt->regs->orig_ax = ctxt->fi.error_code; + +	switch (trapnr) { +	case X86_TRAP_GP: +		exc_general_protection(ctxt->regs, error_code); +		break; +	case X86_TRAP_UD: +		exc_invalid_op(ctxt->regs); +		break; +	case X86_TRAP_AC: +		exc_alignment_check(ctxt->regs, error_code); +		break; +	default: +		pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n"); +		BUG(); +	} +} + +static __always_inline bool on_vc_fallback_stack(struct pt_regs *regs) +{ +	unsigned long sp = (unsigned long)regs; + +	return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2)); +} + +/* + * Main #VC exception handler. It is called when the entry code was able to + * switch off the IST to a safe kernel stack. + * + * With the current implementation it is always possible to switch to a safe + * stack because #VC exceptions only happen at known places, like intercepted + * instructions or accesses to MMIO areas/IO ports. They can also happen with + * code instrumentation when the hypervisor intercepts #DB, but the critical + * paths are forbidden to be instrumented, so #DB exceptions currently also + * only happen in safe places. + */ +DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) +{ +	struct sev_es_runtime_data *data = this_cpu_read(runtime_data); +	struct ghcb_state state; +	struct es_em_ctxt ctxt; +	enum es_result result; +	struct ghcb *ghcb; + +	lockdep_assert_irqs_disabled(); + +	/* +	 * Handle #DB before calling into !noinstr code to avoid recursive #DB. +	 */ +	if (error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB) { +		vc_handle_trap_db(regs); +		return; +	} + +	instrumentation_begin(); + +	/* +	 * This is invoked through an interrupt gate, so IRQs are disabled. The +	 * code below might walk page-tables for user or kernel addresses, so +	 * keep the IRQs disabled to protect us against concurrent TLB flushes. +	 */ + +	ghcb = sev_es_get_ghcb(&state); +	if (!ghcb) { +		/* +		 * Mark GHCBs inactive so that panic() is able to print the +		 * message. +		 */ +		data->ghcb_active        = false; +		data->backup_ghcb_active = false; + +		panic("Unable to handle #VC exception! 
GHCB and Backup GHCB are already in use"); +	} + +	vc_ghcb_invalidate(ghcb); +	result = vc_init_em_ctxt(&ctxt, regs, error_code); + +	if (result == ES_OK) +		result = vc_handle_exitcode(&ctxt, ghcb, error_code); + +	sev_es_put_ghcb(&state); + +	/* Done - now check the result */ +	switch (result) { +	case ES_OK: +		vc_finish_insn(&ctxt); +		break; +	case ES_UNSUPPORTED: +		pr_err_ratelimited("Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n", +				   error_code, regs->ip); +		goto fail; +	case ES_VMM_ERROR: +		pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", +				   error_code, regs->ip); +		goto fail; +	case ES_DECODE_FAILED: +		pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", +				   error_code, regs->ip); +		goto fail; +	case ES_EXCEPTION: +		vc_forward_exception(&ctxt); +		break; +	case ES_RETRY: +		/* Nothing to do */ +		break; +	default: +		pr_emerg("Unknown result in %s():%d\n", __func__, result); +		/* +		 * Emulating the instruction which caused the #VC exception +		 * failed - can't continue so print debug information +		 */ +		BUG(); +	} + +out: +	instrumentation_end(); + +	return; + +fail: +	if (user_mode(regs)) { +		/* +		 * Do not kill the machine if user-space triggered the +		 * exception. Send SIGBUS instead and let user-space deal with +		 * it. +		 */ +		force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0); +	} else { +		pr_emerg("PANIC: Unhandled #VC exception in kernel space (result=%d)\n", +			 result); + +		/* Show some debug info */ +		show_regs(regs); + +		/* Ask hypervisor to sev_es_terminate */ +		sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); + +		/* If that fails and we get here - just panic */ +		panic("Returned from Terminate-Request to Hypervisor\n"); +	} + +	goto out; +} + +/* This handler runs on the #VC fall-back stack. 
It can cause further #VC exceptions */ +DEFINE_IDTENTRY_VC_IST(exc_vmm_communication) +{ +	instrumentation_begin(); +	panic("Can't handle #VC exception from unsupported context\n"); +	instrumentation_end(); +} + +DEFINE_IDTENTRY_VC(exc_vmm_communication) +{ +	if (likely(!on_vc_fallback_stack(regs))) +		safe_stack_exc_vmm_communication(regs, error_code); +	else +		ist_exc_vmm_communication(regs, error_code); +} + +bool __init handle_vc_boot_ghcb(struct pt_regs *regs) +{ +	unsigned long exit_code = regs->orig_ax; +	struct es_em_ctxt ctxt; +	enum es_result result; + +	/* Do initial setup or terminate the guest */ +	if (unlikely(boot_ghcb == NULL && !sev_es_setup_ghcb())) +		sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); + +	vc_ghcb_invalidate(boot_ghcb); + +	result = vc_init_em_ctxt(&ctxt, regs, exit_code); +	if (result == ES_OK) +		result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code); + +	/* Done - now check the result */ +	switch (result) { +	case ES_OK: +		vc_finish_insn(&ctxt); +		break; +	case ES_UNSUPPORTED: +		early_printk("PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n", +				exit_code, regs->ip); +		goto fail; +	case ES_VMM_ERROR: +		early_printk("PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", +				exit_code, regs->ip); +		goto fail; +	case ES_DECODE_FAILED: +		early_printk("PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", +				exit_code, regs->ip); +		goto fail; +	case ES_EXCEPTION: +		vc_early_forward_exception(&ctxt); +		break; +	case ES_RETRY: +		/* Nothing to do */ +		break; +	default: +		BUG(); +	} + +	return true; + +fail: +	show_regs(regs); + +	while (true) +		halt(); +} diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index f5ef689dd62a..de776b2e6046 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -227,7 +227,7 @@ static void notrace start_secondary(void *unused)  	load_cr3(swapper_pg_dir);  	__flush_tlb_all();  #endif -	load_current_idt(); +	cpu_init_exception_handling();  	cpu_init();  	x86_cpuinit.early_percpu_clock_init();  	preempt_disable(); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ec3a2572843f..3c70fb34028b 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -43,6 +43,7 @@  #include <asm/stacktrace.h>  #include <asm/processor.h>  #include <asm/debugreg.h> +#include <asm/realmode.h>  #include <asm/text-patching.h>  #include <asm/ftrace.h>  #include <asm/traps.h> @@ -673,6 +674,50 @@ asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs)  	return regs;  } +#ifdef CONFIG_AMD_MEM_ENCRYPT +asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *regs) +{ +	unsigned long sp, *stack; +	struct stack_info info; +	struct pt_regs *regs_ret; + +	/* +	 * In the SYSCALL entry path the RSP value comes from user-space - don't +	 * trust it and switch to the current kernel stack +	 */ +	if (regs->ip >= (unsigned long)entry_SYSCALL_64 && +	    regs->ip <  (unsigned long)entry_SYSCALL_64_safe_stack) { +		sp = this_cpu_read(cpu_current_top_of_stack); +		goto sync; +	} + +	/* +	 * From here on the RSP value is trusted. Now check whether entry +	 * happened from a safe stack. Not safe are the entry or unknown stacks, +	 * use the fall-back stack instead in this case. 
+	 */ +	sp    = regs->sp; +	stack = (unsigned long *)sp; + +	if (!get_stack_info_noinstr(stack, current, &info) || info.type == STACK_TYPE_ENTRY || +	    info.type >= STACK_TYPE_EXCEPTION_LAST) +		sp = __this_cpu_ist_top_va(VC2); + +sync: +	/* +	 * Found a safe stack - switch to it as if the entry didn't happen via +	 * IST stack. The code below only copies pt_regs, the real switch happens +	 * in assembly code. +	 */ +	sp = ALIGN_DOWN(sp, 8) - sizeof(*regs_ret); + +	regs_ret = (struct pt_regs *)sp; +	*regs_ret = *regs; + +	return regs_ret; +} +#endif +  struct bad_iret_stack {  	void *error_entry_ret;  	struct pt_regs regs; @@ -1082,6 +1127,9 @@ void __init trap_init(void)  	/* Init cpu_entry_area before IST entries are set up */  	setup_cpu_entry_areas(); +	/* Init GHCB memory pages when running as an SEV-ES guest */ +	sev_es_init_vc_handling(); +  	idt_setup_traps();  	/* diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index 2c304fd0bb1a..f6225bf22c02 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -335,63 +335,28 @@ static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs)   */  bool fixup_umip_exception(struct pt_regs *regs)  { -	int not_copied, nr_copied, reg_offset, dummy_data_size, umip_inst; -	unsigned long seg_base = 0, *reg_addr; +	int nr_copied, reg_offset, dummy_data_size, umip_inst;  	/* 10 bytes is the maximum size of the result of UMIP instructions */  	unsigned char dummy_data[10] = { 0 };  	unsigned char buf[MAX_INSN_SIZE]; +	unsigned long *reg_addr;  	void __user *uaddr;  	struct insn insn; -	int seg_defs;  	if (!regs)  		return false; -	/* -	 * If not in user-space long mode, a custom code segment could be in -	 * use. This is true in protected mode (if the process defined a local -	 * descriptor table), or virtual-8086 mode. In most of the cases -	 * seg_base will be zero as in USER_CS. -	 */ -	if (!user_64bit_mode(regs)) -		seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS); - -	if (seg_base == -1L) -		return false; - -	not_copied = copy_from_user(buf, (void __user *)(seg_base + regs->ip), -				    sizeof(buf)); -	nr_copied = sizeof(buf) - not_copied; +	nr_copied = insn_fetch_from_user(regs, buf);  	/* -	 * The copy_from_user above could have failed if user code is protected -	 * by a memory protection key. Give up on emulation in such a case. -	 * Should we issue a page fault? +	 * The insn_fetch_from_user above could have failed if user code +	 * is protected by a memory protection key. Give up on emulation +	 * in such a case.  Should we issue a page fault?  	 */  	if (!nr_copied)  		return false; -	insn_init(&insn, buf, nr_copied, user_64bit_mode(regs)); - -	/* -	 * Override the default operand and address sizes with what is specified -	 * in the code segment descriptor. The instruction decoder only sets -	 * the address size it to either 4 or 8 address bytes and does nothing -	 * for the operand bytes. This OK for most of the cases, but we could -	 * have special cases where, for instance, a 16-bit code segment -	 * descriptor is used. -	 * If there is an address override prefix, the instruction decoder -	 * correctly updates these values, even for 16-bit defaults. 
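
The removed open-coded sequence here moves into two new helpers in insn-eval.c (see further down in this diff): insn_fetch_from_user() copies the bytes at the faulting RIP and insn_decode() decodes them with the correct address and operand sizes. A caller such as fixup_umip_exception() then reduces to the following pattern (a sketch; the wrapper name is illustrative):

static bool fetch_and_decode(struct pt_regs *regs, struct insn *insn)
{
	unsigned char buf[MAX_INSN_SIZE];
	int nr_copied;

	/* Copy up to MAX_INSN_SIZE bytes starting at the faulting RIP */
	nr_copied = insn_fetch_from_user(regs, buf);
	if (!nr_copied)
		return false;

	/* Decode and fix up address/operand sizes from the code segment */
	if (!insn_decode(insn, regs, buf, nr_copied))
		return false;

	/* insn->length, insn->opnd_bytes etc. are now valid */
	return true;
}
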
-	 */ -	seg_defs = insn_get_code_seg_params(regs); -	if (seg_defs == -EINVAL) -		return false; - -	insn.addr_bytes = INSN_CODE_SEG_ADDR_SZ(seg_defs); -	insn.opnd_bytes = INSN_CODE_SEG_OPND_SZ(seg_defs); - -	insn_get_length(&insn); -	if (nr_copied < insn.length) +	if (!insn_decode(&insn, regs, buf, nr_copied))  		return false;  	umip_inst = identify_insn(&insn); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index e90bc436f584..598a769f1961 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1062,10 +1062,14 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,  	struct vmcb *hsave = svm->nested.hsave;  	struct vmcb __user *user_vmcb = (struct vmcb __user *)  		&user_kvm_nested_state->data.svm[0]; -	struct vmcb_control_area ctl; -	struct vmcb_save_area save; +	struct vmcb_control_area *ctl; +	struct vmcb_save_area *save; +	int ret;  	u32 cr0; +	BUILD_BUG_ON(sizeof(struct vmcb_control_area) + sizeof(struct vmcb_save_area) > +		     KVM_STATE_NESTED_SVM_VMCB_SIZE); +  	if (kvm_state->format != KVM_STATE_NESTED_FORMAT_SVM)  		return -EINVAL; @@ -1097,13 +1101,22 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,  		return -EINVAL;  	if (kvm_state->size < sizeof(*kvm_state) + KVM_STATE_NESTED_SVM_VMCB_SIZE)  		return -EINVAL; -	if (copy_from_user(&ctl, &user_vmcb->control, sizeof(ctl))) -		return -EFAULT; -	if (copy_from_user(&save, &user_vmcb->save, sizeof(save))) -		return -EFAULT; -	if (!nested_vmcb_check_controls(&ctl)) -		return -EINVAL; +	ret  = -ENOMEM; +	ctl  = kzalloc(sizeof(*ctl),  GFP_KERNEL); +	save = kzalloc(sizeof(*save), GFP_KERNEL); +	if (!ctl || !save) +		goto out_free; + +	ret = -EFAULT; +	if (copy_from_user(ctl, &user_vmcb->control, sizeof(*ctl))) +		goto out_free; +	if (copy_from_user(save, &user_vmcb->save, sizeof(*save))) +		goto out_free; + +	ret = -EINVAL; +	if (!nested_vmcb_check_controls(ctl)) +		goto out_free;  	/*  	 * Processor state contains L2 state.  Check that it is @@ -1111,15 +1124,15 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,  	 */  	cr0 = kvm_read_cr0(vcpu);          if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW)) -                return -EINVAL; +		goto out_free;  	/*  	 * Validate host state saved from before VMRUN (see  	 * nested_svm_check_permissions).  	 * TODO: validate reserved bits for all saved state.  	 */ -	if (!(save.cr0 & X86_CR0_PG)) -		return -EINVAL; +	if (!(save->cr0 & X86_CR0_PG)) +		goto out_free;  	/*  	 * All checks done, we can enter guest mode.  L1 control fields @@ -1128,10 +1141,10 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,  	 * contains saved L1 state.  	 
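
The conversion in svm_set_nested_state() copies the VMCB control and save areas through heap buffers instead of the stack (the structures are large), and the single out_free label keeps the many error returns manageable. Stripped of the validation details, the shape of the function body is roughly this (a sketch of the pattern, not the full code):

	struct vmcb_control_area *ctl;
	struct vmcb_save_area *save;
	int ret = -ENOMEM;

	ctl  = kzalloc(sizeof(*ctl),  GFP_KERNEL);
	save = kzalloc(sizeof(*save), GFP_KERNEL);
	if (!ctl || !save)
		goto out_free;

	ret = -EFAULT;
	if (copy_from_user(ctl, &user_vmcb->control, sizeof(*ctl)) ||
	    copy_from_user(save, &user_vmcb->save, sizeof(*save)))
		goto out_free;

	/* ... validate and consume *ctl and *save ... */
	ret = 0;

out_free:
	kfree(save);	/* kfree(NULL) is a no-op, so partial allocation is fine */
	kfree(ctl);
	return ret;
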
*/  	copy_vmcb_control_area(&hsave->control, &svm->vmcb->control); -	hsave->save = save; +	hsave->save = *save;  	svm->nested.vmcb = kvm_state->hdr.svm.vmcb_pa; -	load_nested_vmcb_control(svm, &ctl); +	load_nested_vmcb_control(svm, ctl);  	nested_prepare_vmcb_control(svm);  	if (!nested_svm_vmrun_msrpm(svm)) @@ -1139,7 +1152,13 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,  out_set_gif:  	svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET)); -	return 0; + +	ret = 0; +out_free: +	kfree(save); +	kfree(ctl); + +	return ret;  }  struct kvm_x86_nested_ops svm_nested_ops = { diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index dec6d5bc603c..9709c98d0d6c 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4176,6 +4176,8 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = {  static int __init svm_init(void)  { +	__unused_size_checks(); +  	return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm),  			__alignof__(struct vcpu_svm), THIS_MODULE);  } diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index 5e69603ff63f..58f7fb95c7f4 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -20,6 +20,7 @@  enum reg_type {  	REG_TYPE_RM = 0, +	REG_TYPE_REG,  	REG_TYPE_INDEX,  	REG_TYPE_BASE,  }; @@ -53,6 +54,30 @@ static bool is_string_insn(struct insn *insn)  }  /** + * insn_has_rep_prefix() - Determine if instruction has a REP prefix + * @insn:	Instruction containing the prefix to inspect + * + * Returns: + * + * true if the instruction has a REP prefix, false if not. + */ +bool insn_has_rep_prefix(struct insn *insn) +{ +	int i; + +	insn_get_prefixes(insn); + +	for (i = 0; i < insn->prefixes.nbytes; i++) { +		insn_byte_t p = insn->prefixes.bytes[i]; + +		if (p == 0xf2 || p == 0xf3) +			return true; +	} + +	return false; +} + +/**   * get_seg_reg_override_idx() - obtain segment register override index   * @insn:	Valid instruction with segment override prefixes   * @@ -439,6 +464,13 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs,  			regno += 8;  		break; +	case REG_TYPE_REG: +		regno = X86_MODRM_REG(insn->modrm.value); + +		if (X86_REX_R(insn->rex_prefix.value)) +			regno += 8; +		break; +  	case REG_TYPE_INDEX:  		regno = X86_SIB_INDEX(insn->sib.value);  		if (X86_REX_X(insn->rex_prefix.value)) @@ -808,6 +840,21 @@ int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs)  }  /** + * insn_get_modrm_reg_off() - Obtain register in reg part of the ModRM byte + * @insn:	Instruction containing the ModRM byte + * @regs:	Register values as seen when entering kernel mode + * + * Returns: + * + * The register indicated by the reg part of the ModRM byte. The + * register is obtained as an offset from the base of pt_regs. + */ +int insn_get_modrm_reg_off(struct insn *insn, struct pt_regs *regs) +{ +	return get_reg_offset(insn, regs, REG_TYPE_REG); +} + +/**   * get_seg_base_limit() - obtain base address and limit of a segment   * @insn:	Instruction. Must be valid.   * @regs:	Register values as seen when entering kernel mode @@ -1367,3 +1414,86 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs)  		return (void __user *)-1L;  	}  } + +/** + * insn_fetch_from_user() - Copy instruction bytes from user-space memory + * @regs:	Structure with register values as seen when entering kernel mode + * @buf:	Array to store the fetched instruction + * + * Gets the linear address of the instruction and copies the instruction bytes + * to the buf. 
+ * + * Returns: + * + * Number of instruction bytes copied. + * + * 0 if nothing was copied. + */ +int insn_fetch_from_user(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]) +{ +	unsigned long seg_base = 0; +	int not_copied; + +	/* +	 * If not in user-space long mode, a custom code segment could be in +	 * use. This is true in protected mode (if the process defined a local +	 * descriptor table), or virtual-8086 mode. In most of the cases +	 * seg_base will be zero as in USER_CS. +	 */ +	if (!user_64bit_mode(regs)) { +		seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS); +		if (seg_base == -1L) +			return 0; +	} + + +	not_copied = copy_from_user(buf, (void __user *)(seg_base + regs->ip), +				    MAX_INSN_SIZE); + +	return MAX_INSN_SIZE - not_copied; +} + +/** + * insn_decode() - Decode an instruction + * @insn:	Structure to store decoded instruction + * @regs:	Structure with register values as seen when entering kernel mode + * @buf:	Buffer containing the instruction bytes + * @buf_size:   Number of instruction bytes available in buf + * + * Decodes the instruction provided in buf and stores the decoding results in + * insn. Also determines the correct address and operand sizes. + * + * Returns: + * + * True if instruction was decoded, False otherwise. + */ +bool insn_decode(struct insn *insn, struct pt_regs *regs, +		 unsigned char buf[MAX_INSN_SIZE], int buf_size) +{ +	int seg_defs; + +	insn_init(insn, buf, buf_size, user_64bit_mode(regs)); + +	/* +	 * Override the default operand and address sizes with what is specified +	 * in the code segment descriptor. The instruction decoder only sets +	 * the address size it to either 4 or 8 address bytes and does nothing +	 * for the operand bytes. This OK for most of the cases, but we could +	 * have special cases where, for instance, a 16-bit code segment +	 * descriptor is used. +	 * If there is an address override prefix, the instruction decoder +	 * correctly updates these values, even for 16-bit defaults. +	 */ +	seg_defs = insn_get_code_seg_params(regs); +	if (seg_defs == -EINVAL) +		return false; + +	insn->addr_bytes = INSN_CODE_SEG_ADDR_SZ(seg_defs); +	insn->opnd_bytes = INSN_CODE_SEG_OPND_SZ(seg_defs); + +	insn_get_length(insn); +	if (buf_size < insn->length) +		return false; + +	return true; +} diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index 770b613790b3..f5e1e60c9095 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -21,7 +21,8 @@ DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks);  DECLARE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack);  #endif -struct cpu_entry_area *get_cpu_entry_area(int cpu) +/* Is called from entry code, so must be noinstr */ +noinstr struct cpu_entry_area *get_cpu_entry_area(int cpu)  {  	unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE;  	BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0); diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 5829457f7ca3..b93d6cd08a7f 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -5,6 +5,7 @@  #include <xen/xen.h>  #include <asm/fpu/internal.h> +#include <asm/sev-es.h>  #include <asm/traps.h>  #include <asm/kdebug.h> diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 9f1177edc2e7..ebb7edc8bc0a 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -38,6 +38,7 @@   * section is later cleared.   
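
sev_status, added below, caches the raw SEV status MSR so that sev_active() and sev_es_active() further down become simple bit tests instead of repeated MSR reads; the value is saved exactly once in sme_enable(), early enough that no #VC handling is required for it. A minimal sketch of the relationship, assuming the MSR_AMD64_SEV* bit definitions (bit 0 reports SEV, bit 1 reports SEV-ES):

static bool status_reports_sev_es(u64 status)
{
	/* SEV-ES is only meaningful when SEV itself is enabled */
	return (status & MSR_AMD64_SEV_ENABLED) &&
	       (status & MSR_AMD64_SEV_ES_ENABLED);
}

Here, status is the value sme_enable() already reads from the SEV status MSR (MSR_AMD64_SEV) and now stores in sev_status.
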
*/  u64 sme_me_mask __section(.data) = 0; +u64 sev_status __section(.data) = 0;  EXPORT_SYMBOL(sme_me_mask);  DEFINE_STATIC_KEY_FALSE(sev_enable_key);  EXPORT_SYMBOL_GPL(sev_enable_key); @@ -347,7 +348,13 @@ bool sme_active(void)  bool sev_active(void)  { -	return sme_me_mask && sev_enabled; +	return sev_status & MSR_AMD64_SEV_ENABLED; +} + +/* Needs to be called from non-instrumentable code */ +bool noinstr sev_es_active(void) +{ +	return sev_status & MSR_AMD64_SEV_ES_ENABLED;  }  /* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */ @@ -400,6 +407,31 @@ void __init mem_encrypt_free_decrypted_mem(void)  	free_init_pages("unused decrypted", vaddr, vaddr_end);  } +static void print_mem_encrypt_feature_info(void) +{ +	pr_info("AMD Memory Encryption Features active:"); + +	/* Secure Memory Encryption */ +	if (sme_active()) { +		/* +		 * SME is mutually exclusive with any of the SEV +		 * features below. +		 */ +		pr_cont(" SME\n"); +		return; +	} + +	/* Secure Encrypted Virtualization */ +	if (sev_active()) +		pr_cont(" SEV"); + +	/* Encrypted Register State */ +	if (sev_es_active()) +		pr_cont(" SEV-ES"); + +	pr_cont("\n"); +} +  /* Architecture __weak replacement functions */  void __init mem_encrypt_init(void)  { @@ -415,8 +447,6 @@ void __init mem_encrypt_init(void)  	if (sev_active())  		static_branch_enable(&sev_enable_key); -	pr_info("AMD %s active\n", -		sev_active() ? "Secure Encrypted Virtualization (SEV)" -			     : "Secure Memory Encryption (SME)"); +	print_mem_encrypt_feature_info();  } diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index e2b0e2ac07bb..68d75379e06a 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -540,6 +540,9 @@ void __init sme_enable(struct boot_params *bp)  		if (!(msr & MSR_AMD64_SEV_ENABLED))  			return; +		/* Save SEV_STATUS to avoid reading MSR again */ +		sev_status = msr; +  		/* SEV state cannot be controlled by a command line option */  		sme_me_mask = me_mask;  		sev_enabled = true; diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 6af4da1149ba..8f5759df7776 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -47,6 +47,7 @@  #include <asm/realmode.h>  #include <asm/time.h>  #include <asm/pgalloc.h> +#include <asm/sev-es.h>  /*   * We allocate runtime services regions top-down, starting from -4G, i.e. @@ -230,6 +231,15 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)  	}  	/* +	 * When SEV-ES is active, the GHCB as set by the kernel will be used +	 * by firmware. Create a 1:1 unencrypted mapping for each GHCB. +	 */ +	if (sev_es_efi_map_ghcbs(pgd)) { +		pr_err("Failed to create 1:1 mapping for the GHCBs!\n"); +		return 1; +	} + +	/*  	 * When making calls to the firmware everything needs to be 1:1  	 * mapped and addressable with 32-bit pointers. 
Map the kernel  	 * text and allocate a new stack because we can't rely on the diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 1ed1208931e0..22fda7d99159 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -9,6 +9,7 @@  #include <asm/realmode.h>  #include <asm/tlbflush.h>  #include <asm/crash.h> +#include <asm/sev-es.h>  struct real_mode_header *real_mode_header;  u32 *trampoline_cr4_features; @@ -38,6 +39,25 @@ void __init reserve_real_mode(void)  	crash_reserve_low_1M();  } +static void sme_sev_setup_real_mode(struct trampoline_header *th) +{ +#ifdef CONFIG_AMD_MEM_ENCRYPT +	if (sme_active()) +		th->flags |= TH_FLAGS_SME_ACTIVE; + +	if (sev_es_active()) { +		/* +		 * Skip the call to verify_cpu() in secondary_startup_64 as it +		 * will cause #VC exceptions when the AP can't handle them yet. +		 */ +		th->start = (u64) secondary_startup_64_no_verify; + +		if (sev_es_setup_ap_jump_table(real_mode_header)) +			panic("Failed to get/update SEV-ES AP Jump Table"); +	} +#endif +} +  static void __init setup_real_mode(void)  {  	u16 real_mode_seg; @@ -104,13 +124,13 @@ static void __init setup_real_mode(void)  	*trampoline_cr4_features = mmu_cr4_features;  	trampoline_header->flags = 0; -	if (sme_active()) -		trampoline_header->flags |= TH_FLAGS_SME_ACTIVE;  	trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);  	trampoline_pgd[0] = trampoline_pgd_entry.pgd;  	trampoline_pgd[511] = init_top_pgt[511].pgd;  #endif + +	sme_sev_setup_real_mode(trampoline_header);  }  /* diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S index af04512c02d9..8c1db5bf5d78 100644 --- a/arch/x86/realmode/rm/header.S +++ b/arch/x86/realmode/rm/header.S @@ -20,6 +20,9 @@ SYM_DATA_START(real_mode_header)  	/* SMP trampoline */  	.long	pa_trampoline_start  	.long	pa_trampoline_header +#ifdef CONFIG_AMD_MEM_ENCRYPT +	.long	pa_sev_es_trampoline_start +#endif  #ifdef CONFIG_X86_64  	.long	pa_trampoline_pgd;  #endif diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index 251758ed7443..84c5d1b33d10 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -56,6 +56,7 @@ SYM_CODE_START(trampoline_start)  	testl   %eax, %eax		# Check for return code  	jnz	no_longmode +.Lswitch_to_protected:  	/*  	 * GDT tables in non default location kernel can be beyond 16MB and  	 * lgdt will not be able to load the address as in real mode default @@ -80,6 +81,25 @@ no_longmode:  	jmp no_longmode  SYM_CODE_END(trampoline_start) +#ifdef CONFIG_AMD_MEM_ENCRYPT +/* SEV-ES supports non-zero IP for entry points - no alignment needed */ +SYM_CODE_START(sev_es_trampoline_start) +	cli			# We should be safe anyway + +	LJMPW_RM(1f) +1: +	mov	%cs, %ax	# Code and data in the same place +	mov	%ax, %ds +	mov	%ax, %es +	mov	%ax, %ss + +	# Setup stack +	movl	$rm_stack_end, %esp + +	jmp	.Lswitch_to_protected +SYM_CODE_END(sev_es_trampoline_start) +#endif	/* CONFIG_AMD_MEM_ENCRYPT */ +  #include "../kernel/verify_cpu.S"  	.section ".text32","ax" diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk index a42015b305f4..af38469afd14 100644 --- a/arch/x86/tools/gen-insn-attr-x86.awk +++ b/arch/x86/tools/gen-insn-attr-x86.awk @@ -362,6 +362,9 @@ function convert_operands(count,opnd,       i,j,imm,mod)  END {  	if (awkchecked != "")  		exit 1 + +	print "#ifndef __BOOT_COMPRESSED\n" +  	# print escape opcode map's array  	print "/* Escape opcode map array */"  	print "const 
insn_attr_t * const inat_escape_tables[INAT_ESC_MAX + 1]" \ @@ -388,6 +391,51 @@ END {  		for (j = 0; j < max_lprefix; j++)  			if (atable[i,j])  				print "	["i"]["j"] = "atable[i,j]"," -	print "};" +	print "};\n" + +	print "#else /* !__BOOT_COMPRESSED */\n" + +	print "/* Escape opcode map array */" +	print "static const insn_attr_t *inat_escape_tables[INAT_ESC_MAX + 1]" \ +	      "[INAT_LSTPFX_MAX + 1];" +	print "" + +	print "/* Group opcode map array */" +	print "static const insn_attr_t *inat_group_tables[INAT_GRP_MAX + 1]"\ +	      "[INAT_LSTPFX_MAX + 1];" +	print "" + +	print "/* AVX opcode map array */" +	print "static const insn_attr_t *inat_avx_tables[X86_VEX_M_MAX + 1]"\ +	      "[INAT_LSTPFX_MAX + 1];" +	print "" + +	print "static void inat_init_tables(void)" +	print "{" + +	# print escape opcode map's array +	print "\t/* Print Escape opcode map array */" +	for (i = 0; i < geid; i++) +		for (j = 0; j < max_lprefix; j++) +			if (etable[i,j]) +				print "\tinat_escape_tables["i"]["j"] = "etable[i,j]";" +	print "" + +	# print group opcode map's array +	print "\t/* Print Group opcode map array */" +	for (i = 0; i < ggid; i++) +		for (j = 0; j < max_lprefix; j++) +			if (gtable[i,j]) +				print "\tinat_group_tables["i"]["j"] = "gtable[i,j]";" +	print "" +	# print AVX opcode map's array +	print "\t/* Print AVX opcode map array */" +	for (i = 0; i < gaid; i++) +		for (j = 0; j < max_lprefix; j++) +			if (atable[i,j]) +				print "\tinat_avx_tables["i"]["j"] = "atable[i,j]";" + +	print "}" +	print "#endif"  } diff --git a/tools/arch/x86/tools/gen-insn-attr-x86.awk b/tools/arch/x86/tools/gen-insn-attr-x86.awk index a42015b305f4..af38469afd14 100644 --- a/tools/arch/x86/tools/gen-insn-attr-x86.awk +++ b/tools/arch/x86/tools/gen-insn-attr-x86.awk @@ -362,6 +362,9 @@ function convert_operands(count,opnd,       i,j,imm,mod)  END {  	if (awkchecked != "")  		exit 1 + +	print "#ifndef __BOOT_COMPRESSED\n" +  	# print escape opcode map's array  	print "/* Escape opcode map array */"  	print "const insn_attr_t * const inat_escape_tables[INAT_ESC_MAX + 1]" \ @@ -388,6 +391,51 @@ END {  		for (j = 0; j < max_lprefix; j++)  			if (atable[i,j])  				print "	["i"]["j"] = "atable[i,j]"," -	print "};" +	print "};\n" + +	print "#else /* !__BOOT_COMPRESSED */\n" + +	print "/* Escape opcode map array */" +	print "static const insn_attr_t *inat_escape_tables[INAT_ESC_MAX + 1]" \ +	      "[INAT_LSTPFX_MAX + 1];" +	print "" + +	print "/* Group opcode map array */" +	print "static const insn_attr_t *inat_group_tables[INAT_GRP_MAX + 1]"\ +	      "[INAT_LSTPFX_MAX + 1];" +	print "" + +	print "/* AVX opcode map array */" +	print "static const insn_attr_t *inat_avx_tables[X86_VEX_M_MAX + 1]"\ +	      "[INAT_LSTPFX_MAX + 1];" +	print "" + +	print "static void inat_init_tables(void)" +	print "{" + +	# print escape opcode map's array +	print "\t/* Print Escape opcode map array */" +	for (i = 0; i < geid; i++) +		for (j = 0; j < max_lprefix; j++) +			if (etable[i,j]) +				print "\tinat_escape_tables["i"]["j"] = "etable[i,j]";" +	print "" + +	# print group opcode map's array +	print "\t/* Print Group opcode map array */" +	for (i = 0; i < ggid; i++) +		for (j = 0; j < max_lprefix; j++) +			if (gtable[i,j]) +				print "\tinat_group_tables["i"]["j"] = "gtable[i,j]";" +	print "" +	# print AVX opcode map's array +	print "\t/* Print AVX opcode map array */" +	for (i = 0; i < gaid; i++) +		for (j = 0; j < max_lprefix; j++) +			if (atable[i,j]) +				print "\tinat_avx_tables["i"]["j"] = "atable[i,j]";" + +	print "}" +	print 
"#endif"  } | 
