diff options
44 files changed, 1448 insertions, 307 deletions
| diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ee272423ec20..f9ae63863f94 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -887,9 +887,11 @@ config INTEL_TDX_GUEST  	bool "Intel TDX (Trust Domain Extensions) - Guest Support"  	depends on X86_64 && CPU_SUP_INTEL  	depends on X86_X2APIC +	depends on EFI_STUB  	select ARCH_HAS_CC_PLATFORM  	select X86_MEM_ENCRYPT  	select X86_MCE +	select UNACCEPTED_MEMORY  	help  	  Support running as a guest under Intel TDX.  Without this support,  	  the guest kernel can not boot or run under TDX. @@ -1544,11 +1546,13 @@ config X86_MEM_ENCRYPT  config AMD_MEM_ENCRYPT  	bool "AMD Secure Memory Encryption (SME) support"  	depends on X86_64 && CPU_SUP_AMD +	depends on EFI_STUB  	select DMA_COHERENT_POOL  	select ARCH_USE_MEMREMAP_PROT  	select INSTRUCTION_DECODER  	select ARCH_HAS_CC_PLATFORM  	select X86_MEM_ENCRYPT +	select UNACCEPTED_MEMORY  	help  	  Say yes to enable support for the encryption of system memory.  	  This requires an AMD processor that supports Secure Memory diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 0f78dbbbdcdd..40d2ff503079 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -106,7 +106,8 @@ ifdef CONFIG_X86_64  endif  vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o -vmlinux-objs-$(CONFIG_INTEL_TDX_GUEST) += $(obj)/tdx.o $(obj)/tdcall.o +vmlinux-objs-$(CONFIG_INTEL_TDX_GUEST) += $(obj)/tdx.o $(obj)/tdcall.o $(obj)/tdx-shared.o +vmlinux-objs-$(CONFIG_UNACCEPTED_MEMORY) += $(obj)/mem.o  vmlinux-objs-$(CONFIG_EFI) += $(obj)/efi.o  vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_mixed.o diff --git a/arch/x86/boot/compressed/efi.h b/arch/x86/boot/compressed/efi.h index 7db2f41b54cd..866c0af8b5b9 100644 --- a/arch/x86/boot/compressed/efi.h +++ b/arch/x86/boot/compressed/efi.h @@ -16,6 +16,7 @@ typedef guid_t efi_guid_t __aligned(__alignof__(u32));  #define ACPI_TABLE_GUID				EFI_GUID(0xeb9d2d30, 0x2d88, 0x11d3,  0x9a, 
0x16, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d)  #define ACPI_20_TABLE_GUID			EFI_GUID(0x8868e871, 0xe4f1, 0x11d3,  0xbc, 0x22, 0x00, 0x80, 0xc7, 0x3c, 0x88, 0x81)  #define EFI_CC_BLOB_GUID			EFI_GUID(0x067b1f5f, 0xcf26, 0x44c5, 0x85, 0x54, 0x93, 0xd7, 0x77, 0x91, 0x2d, 0x42) +#define LINUX_EFI_UNACCEPTED_MEM_TABLE_GUID	EFI_GUID(0xd5d1de3c, 0x105c, 0x44f9,  0x9e, 0xa9, 0xbc, 0xef, 0x98, 0x12, 0x00, 0x31)  #define EFI32_LOADER_SIGNATURE	"EL32"  #define EFI64_LOADER_SIGNATURE	"EL64" @@ -32,6 +33,7 @@ typedef	struct {  } efi_table_hdr_t;  #define EFI_CONVENTIONAL_MEMORY		 7 +#define EFI_UNACCEPTED_MEMORY		15  #define EFI_MEMORY_MORE_RELIABLE \  				((u64)0x0000000000010000ULL)	/* higher reliability */ @@ -104,6 +106,14 @@ struct efi_setup_data {  	u64 reserved[8];  }; +struct efi_unaccepted_memory { +	u32 version; +	u32 unit_size; +	u64 phys_base; +	u64 size; +	unsigned long bitmap[]; +}; +  static inline int efi_guidcmp (efi_guid_t left, efi_guid_t right)  {  	return memcmp(&left, &right, sizeof (efi_guid_t)); diff --git a/arch/x86/boot/compressed/error.c b/arch/x86/boot/compressed/error.c index c881878e56d3..5313c5cb2b80 100644 --- a/arch/x86/boot/compressed/error.c +++ b/arch/x86/boot/compressed/error.c @@ -22,3 +22,22 @@ void error(char *m)  	while (1)  		asm("hlt");  } + +/* EFI libstub  provides vsnprintf() */ +#ifdef CONFIG_EFI_STUB +void panic(const char *fmt, ...) +{ +	static char buf[1024]; +	va_list args; +	int len; + +	va_start(args, fmt); +	len = vsnprintf(buf, sizeof(buf), fmt, args); +	va_end(args); + +	if (len && buf[len - 1] == '\n') +		buf[len - 1] = '\0'; + +	error(buf); +} +#endif diff --git a/arch/x86/boot/compressed/error.h b/arch/x86/boot/compressed/error.h index 1de5821184f1..86fe33b93715 100644 --- a/arch/x86/boot/compressed/error.h +++ b/arch/x86/boot/compressed/error.h @@ -6,5 +6,6 @@  void warn(char *m);  void error(char *m) __noreturn; +void panic(const char *fmt, ...) 
__noreturn __cold;  #endif /* BOOT_COMPRESSED_ERROR_H */ diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 454757fbdfe5..9193acf0e9cd 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -672,6 +672,33 @@ static bool process_mem_region(struct mem_vector *region,  }  #ifdef CONFIG_EFI + +/* + * Only EFI_CONVENTIONAL_MEMORY and EFI_UNACCEPTED_MEMORY (if supported) are + * guaranteed to be free. + * + * Pick free memory more conservatively than the EFI spec allows: according to + * the spec, EFI_BOOT_SERVICES_{CODE|DATA} are also free memory and thus + * available to place the kernel image into, but in practice there's firmware + * where using that memory leads to crashes. Buggy vendor EFI code registers + * for an event that triggers on SetVirtualAddressMap(). The handler assumes + * that EFI_BOOT_SERVICES_DATA memory has not been touched by loader yet, which + * is probably true for Windows. + * + * Preserve EFI_BOOT_SERVICES_* regions until after SetVirtualAddressMap(). + */ +static inline bool memory_type_is_free(efi_memory_desc_t *md) +{ +	if (md->type == EFI_CONVENTIONAL_MEMORY) +		return true; + +	if (IS_ENABLED(CONFIG_UNACCEPTED_MEMORY) && +	    md->type == EFI_UNACCEPTED_MEMORY) +		    return true; + +	return false; +} +  /*   * Returns true if we processed the EFI memmap, which we prefer over the E820   * table if it is available. @@ -716,18 +743,7 @@ process_efi_entries(unsigned long minimum, unsigned long image_size)  	for (i = 0; i < nr_desc; i++) {  		md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i); -		/* -		 * Here we are more conservative in picking free memory than -		 * the EFI spec allows: -		 * -		 * According to the spec, EFI_BOOT_SERVICES_{CODE|DATA} are also -		 * free memory and thus available to place the kernel image into, -		 * but in practice there's firmware where using that memory leads -		 * to crashes. 
-		 * -		 * Only EFI_CONVENTIONAL_MEMORY is guaranteed to be free. -		 */ -		if (md->type != EFI_CONVENTIONAL_MEMORY) +		if (!memory_type_is_free(md))  			continue;  		if (efi_soft_reserve_enabled() && diff --git a/arch/x86/boot/compressed/mem.c b/arch/x86/boot/compressed/mem.c new file mode 100644 index 000000000000..3c1609245f2a --- /dev/null +++ b/arch/x86/boot/compressed/mem.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "error.h" +#include "misc.h" +#include "tdx.h" +#include "sev.h" +#include <asm/shared/tdx.h> + +/* + * accept_memory() and process_unaccepted_memory() called from EFI stub which + * runs before decompresser and its early_tdx_detect(). + * + * Enumerate TDX directly from the early users. + */ +static bool early_is_tdx_guest(void) +{ +	static bool once; +	static bool is_tdx; + +	if (!IS_ENABLED(CONFIG_INTEL_TDX_GUEST)) +		return false; + +	if (!once) { +		u32 eax, sig[3]; + +		cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, +			    &sig[0], &sig[2],  &sig[1]); +		is_tdx = !memcmp(TDX_IDENT, sig, sizeof(sig)); +		once = true; +	} + +	return is_tdx; +} + +void arch_accept_memory(phys_addr_t start, phys_addr_t end) +{ +	/* Platform-specific memory-acceptance call goes here */ +	if (early_is_tdx_guest()) { +		if (!tdx_accept_memory(start, end)) +			panic("TDX: Failed to accept memory\n"); +	} else if (sev_snp_enabled()) { +		snp_accept_memory(start, end); +	} else { +		error("Cannot accept memory: unknown platform\n"); +	} +} + +bool init_unaccepted_memory(void) +{ +	guid_t guid = LINUX_EFI_UNACCEPTED_MEM_TABLE_GUID; +	struct efi_unaccepted_memory *table; +	unsigned long cfg_table_pa; +	unsigned int cfg_table_len; +	enum efi_type et; +	int ret; + +	et = efi_get_type(boot_params); +	if (et == EFI_TYPE_NONE) +		return false; + +	ret = efi_get_conf_table(boot_params, &cfg_table_pa, &cfg_table_len); +	if (ret) { +		warn("EFI config table not found."); +		return false; +	} + +	table = (void *)efi_find_vendor_table(boot_params, 
cfg_table_pa, +					      cfg_table_len, guid); +	if (!table) +		return false; + +	if (table->version != 1) +		error("Unknown version of unaccepted memory table\n"); + +	/* +	 * In many cases unaccepted_table is already set by EFI stub, but it +	 * has to be initialized again to cover cases when the table is not +	 * allocated by EFI stub or EFI stub copied the kernel image with +	 * efi_relocate_kernel() before the variable is set. +	 * +	 * It must be initialized before the first usage of accept_memory(). +	 */ +	unaccepted_table = table; + +	return true; +} diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 014ff222bf4b..94b7abcf624b 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -455,6 +455,12 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,  #endif  	debug_putstr("\nDecompressing Linux... "); + +	if (init_unaccepted_memory()) { +		debug_putstr("Accepting memory... "); +		accept_memory(__pa(output), __pa(output) + needed_size); +	} +  	__decompress(input_data, input_len, NULL, NULL, output, output_len,  			NULL, error);  	entry_offset = parse_elf(output); diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 2f155a0e3041..964fe903a1cd 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -247,4 +247,14 @@ static inline unsigned long efi_find_vendor_table(struct boot_params *bp,  }  #endif /* CONFIG_EFI */ +#ifdef CONFIG_UNACCEPTED_MEMORY +bool init_unaccepted_memory(void); +#else +static inline bool init_unaccepted_memory(void) { return false; } +#endif + +/* Defined in EFI stub */ +extern struct efi_unaccepted_memory *unaccepted_table; +void accept_memory(phys_addr_t start, phys_addr_t end); +  #endif /* BOOT_COMPRESSED_MISC_H */ diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 014b89c89088..09dc8c187b3c 100644 --- a/arch/x86/boot/compressed/sev.c +++ 
b/arch/x86/boot/compressed/sev.c @@ -115,7 +115,7 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,  /* Include code for early handlers */  #include "../../kernel/sev-shared.c" -static inline bool sev_snp_enabled(void) +bool sev_snp_enabled(void)  {  	return sev_status & MSR_AMD64_SEV_SNP_ENABLED;  } @@ -181,6 +181,58 @@ static bool early_setup_ghcb(void)  	return true;  } +static phys_addr_t __snp_accept_memory(struct snp_psc_desc *desc, +				       phys_addr_t pa, phys_addr_t pa_end) +{ +	struct psc_hdr *hdr; +	struct psc_entry *e; +	unsigned int i; + +	hdr = &desc->hdr; +	memset(hdr, 0, sizeof(*hdr)); + +	e = desc->entries; + +	i = 0; +	while (pa < pa_end && i < VMGEXIT_PSC_MAX_ENTRY) { +		hdr->end_entry = i; + +		e->gfn = pa >> PAGE_SHIFT; +		e->operation = SNP_PAGE_STATE_PRIVATE; +		if (IS_ALIGNED(pa, PMD_SIZE) && (pa_end - pa) >= PMD_SIZE) { +			e->pagesize = RMP_PG_SIZE_2M; +			pa += PMD_SIZE; +		} else { +			e->pagesize = RMP_PG_SIZE_4K; +			pa += PAGE_SIZE; +		} + +		e++; +		i++; +	} + +	if (vmgexit_psc(boot_ghcb, desc)) +		sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); + +	pvalidate_pages(desc); + +	return pa; +} + +void snp_accept_memory(phys_addr_t start, phys_addr_t end) +{ +	struct snp_psc_desc desc = {}; +	unsigned int i; +	phys_addr_t pa; + +	if (!boot_ghcb && !early_setup_ghcb()) +		sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); + +	pa = start; +	while (pa < end) +		pa = __snp_accept_memory(&desc, pa, end); +} +  void sev_es_shutdown_ghcb(void)  {  	if (!boot_ghcb) diff --git a/arch/x86/boot/compressed/sev.h b/arch/x86/boot/compressed/sev.h new file mode 100644 index 000000000000..fc725a981b09 --- /dev/null +++ b/arch/x86/boot/compressed/sev.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * AMD SEV header for early boot related functions. 
+ * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + */ + +#ifndef BOOT_COMPRESSED_SEV_H +#define BOOT_COMPRESSED_SEV_H + +#ifdef CONFIG_AMD_MEM_ENCRYPT + +bool sev_snp_enabled(void); +void snp_accept_memory(phys_addr_t start, phys_addr_t end); + +#else + +static inline bool sev_snp_enabled(void) { return false; } +static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { } + +#endif + +#endif diff --git a/arch/x86/boot/compressed/tdx-shared.c b/arch/x86/boot/compressed/tdx-shared.c new file mode 100644 index 000000000000..5ac43762fe13 --- /dev/null +++ b/arch/x86/boot/compressed/tdx-shared.c @@ -0,0 +1,2 @@ +#include "error.h" +#include "../../coco/tdx/tdx-shared.c" diff --git a/arch/x86/coco/tdx/Makefile b/arch/x86/coco/tdx/Makefile index 46c55998557d..2c7dcbf1458b 100644 --- a/arch/x86/coco/tdx/Makefile +++ b/arch/x86/coco/tdx/Makefile @@ -1,3 +1,3 @@  # SPDX-License-Identifier: GPL-2.0 -obj-y += tdx.o tdcall.o +obj-y += tdx.o tdx-shared.o tdcall.o diff --git a/arch/x86/coco/tdx/tdx-shared.c b/arch/x86/coco/tdx/tdx-shared.c new file mode 100644 index 000000000000..ef20ddc37b58 --- /dev/null +++ b/arch/x86/coco/tdx/tdx-shared.c @@ -0,0 +1,71 @@ +#include <asm/tdx.h> +#include <asm/pgtable.h> + +static unsigned long try_accept_one(phys_addr_t start, unsigned long len, +				    enum pg_level pg_level) +{ +	unsigned long accept_size = page_level_size(pg_level); +	u64 tdcall_rcx; +	u8 page_size; + +	if (!IS_ALIGNED(start, accept_size)) +		return 0; + +	if (len < accept_size) +		return 0; + +	/* +	 * Pass the page physical address to the TDX module to accept the +	 * pending, private page. +	 * +	 * Bits 2:0 of RCX encode page size: 0 - 4K, 1 - 2M, 2 - 1G. 
+	 */ +	switch (pg_level) { +	case PG_LEVEL_4K: +		page_size = 0; +		break; +	case PG_LEVEL_2M: +		page_size = 1; +		break; +	case PG_LEVEL_1G: +		page_size = 2; +		break; +	default: +		return 0; +	} + +	tdcall_rcx = start | page_size; +	if (__tdx_module_call(TDX_ACCEPT_PAGE, tdcall_rcx, 0, 0, 0, NULL)) +		return 0; + +	return accept_size; +} + +bool tdx_accept_memory(phys_addr_t start, phys_addr_t end) +{ +	/* +	 * For shared->private conversion, accept the page using +	 * TDX_ACCEPT_PAGE TDX module call. +	 */ +	while (start < end) { +		unsigned long len = end - start; +		unsigned long accept_size; + +		/* +		 * Try larger accepts first. It gives chance to VMM to keep +		 * 1G/2M Secure EPT entries where possible and speeds up +		 * process by cutting number of hypercalls (if successful). +		 */ + +		accept_size = try_accept_one(start, len, PG_LEVEL_1G); +		if (!accept_size) +			accept_size = try_accept_one(start, len, PG_LEVEL_2M); +		if (!accept_size) +			accept_size = try_accept_one(start, len, PG_LEVEL_4K); +		if (!accept_size) +			return false; +		start += accept_size; +	} + +	return true; +} diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 27ce10c9ff61..fe67c027a27b 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -14,20 +14,6 @@  #include <asm/insn-eval.h>  #include <asm/pgtable.h> -/* TDX module Call Leaf IDs */ -#define TDX_GET_INFO			1 -#define TDX_GET_VEINFO			3 -#define TDX_GET_REPORT			4 -#define TDX_ACCEPT_PAGE			6 -#define TDX_WR				8 - -/* TDCS fields. To be used by TDG.VM.WR and TDG.VM.RD module calls */ -#define TDCS_NOTIFY_ENABLES		0x9100000000000010 - -/* TDX hypercall Leaf IDs */ -#define TDVMCALL_MAP_GPA		0x10001 -#define TDVMCALL_REPORT_FATAL_ERROR	0x10003 -  /* MMIO direction */  #define EPT_READ	0  #define EPT_WRITE	1 @@ -51,24 +37,6 @@  #define TDREPORT_SUBTYPE_0	0 -/* - * Wrapper for standard use of __tdx_hypercall with no output aside from - * return code. 
- */ -static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15) -{ -	struct tdx_hypercall_args args = { -		.r10 = TDX_HYPERCALL_STANDARD, -		.r11 = fn, -		.r12 = r12, -		.r13 = r13, -		.r14 = r14, -		.r15 = r15, -	}; - -	return __tdx_hypercall(&args); -} -  /* Called from __tdx_hypercall() for unrecoverable failure */  noinstr void __tdx_hypercall_failed(void)  { @@ -745,47 +713,6 @@ static bool tdx_cache_flush_required(void)  	return true;  } -static bool try_accept_one(phys_addr_t *start, unsigned long len, -			  enum pg_level pg_level) -{ -	unsigned long accept_size = page_level_size(pg_level); -	u64 tdcall_rcx; -	u8 page_size; - -	if (!IS_ALIGNED(*start, accept_size)) -		return false; - -	if (len < accept_size) -		return false; - -	/* -	 * Pass the page physical address to the TDX module to accept the -	 * pending, private page. -	 * -	 * Bits 2:0 of RCX encode page size: 0 - 4K, 1 - 2M, 2 - 1G. -	 */ -	switch (pg_level) { -	case PG_LEVEL_4K: -		page_size = 0; -		break; -	case PG_LEVEL_2M: -		page_size = 1; -		break; -	case PG_LEVEL_1G: -		page_size = 2; -		break; -	default: -		return false; -	} - -	tdcall_rcx = *start | page_size; -	if (__tdx_module_call(TDX_ACCEPT_PAGE, tdcall_rcx, 0, 0, 0, NULL)) -		return false; - -	*start += accept_size; -	return true; -} -  /*   * Inform the VMM of the guest's intent for this physical page: shared with   * the VMM or private to the guest.  The VMM is expected to change its mapping @@ -810,32 +737,9 @@ static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc)  	if (_tdx_hypercall(TDVMCALL_MAP_GPA, start, end - start, 0, 0))  		return false; -	/* private->shared conversion  requires only MapGPA call */ -	if (!enc) -		return true; - -	/* -	 * For shared->private conversion, accept the page using -	 * TDX_ACCEPT_PAGE TDX module call. -	 */ -	while (start < end) { -		unsigned long len = end - start; - -		/* -		 * Try larger accepts first. 
It gives chance to VMM to keep -		 * 1G/2M SEPT entries where possible and speeds up process by -		 * cutting number of hypercalls (if successful). -		 */ - -		if (try_accept_one(&start, len, PG_LEVEL_1G)) -			continue; - -		if (try_accept_one(&start, len, PG_LEVEL_2M)) -			continue; - -		if (!try_accept_one(&start, len, PG_LEVEL_4K)) -			return false; -	} +	/* shared->private conversion requires memory to be accepted before use */ +	if (enc) +		return tdx_accept_memory(start, end);  	return true;  } diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 419280d263d2..8b4be7cecdb8 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -31,6 +31,8 @@ extern unsigned long efi_mixed_mode_stack_pa;  #define ARCH_EFI_IRQ_FLAGS_MASK	X86_EFLAGS_IF +#define EFI_UNACCEPTED_UNIT_SIZE PMD_SIZE +  /*   * The EFI services are called through variadic functions in many cases. These   * functions are implemented in assembler and support only a fixed number of diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 0759af9b1acf..b463fcbd4b90 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -106,8 +106,13 @@ enum psc_op {  #define GHCB_HV_FT_SNP			BIT_ULL(0)  #define GHCB_HV_FT_SNP_AP_CREATION	BIT_ULL(1) -/* SNP Page State Change NAE event */ -#define VMGEXIT_PSC_MAX_ENTRY		253 +/* + * SNP Page State Change NAE event + *   The VMGEXIT_PSC_MAX_ENTRY determines the size of the PSC structure, which + *   is a local stack variable in set_pages_state(). Do not increase this value + *   without evaluating the impact to stack usage. 
+ */ +#define VMGEXIT_PSC_MAX_ENTRY		64  struct psc_hdr {  	u16 cur_entry; diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 13dc2a9d23c1..86e1296e87f5 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -80,11 +80,15 @@ extern void vc_no_ghcb(void);  extern void vc_boot_ghcb(void);  extern bool handle_vc_boot_ghcb(struct pt_regs *regs); +/* PVALIDATE return codes */ +#define PVALIDATE_FAIL_SIZEMISMATCH	6 +  /* Software defined (when rFlags.CF = 1) */  #define PVALIDATE_FAIL_NOUPDATE		255  /* RMP page size */  #define RMP_PG_SIZE_4K			0 +#define RMP_PG_SIZE_2M			1  #define RMPADJUST_VMSA_PAGE_BIT		BIT(16) @@ -192,16 +196,17 @@ struct snp_guest_request_ioctl;  void setup_ghcb(void);  void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, -					 unsigned int npages); +					 unsigned long npages);  void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, -					unsigned int npages); +					unsigned long npages);  void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op); -void snp_set_memory_shared(unsigned long vaddr, unsigned int npages); -void snp_set_memory_private(unsigned long vaddr, unsigned int npages); +void snp_set_memory_shared(unsigned long vaddr, unsigned long npages); +void snp_set_memory_private(unsigned long vaddr, unsigned long npages);  void snp_set_wakeup_secondary_cpu(void);  bool snp_init(struct boot_params *bp);  void __init __noreturn snp_abort(void);  int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio); +void snp_accept_memory(phys_addr_t start, phys_addr_t end);  #else  static inline void sev_es_ist_enter(struct pt_regs *regs) { }  static inline void sev_es_ist_exit(void) { } @@ -212,12 +217,12 @@ static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate)  static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) { 
return 0; }  static inline void setup_ghcb(void) { }  static inline void __init -early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned int npages) { } +early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned long npages) { }  static inline void __init -early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned int npages) { } +early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned long npages) { }  static inline void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) { } -static inline void snp_set_memory_shared(unsigned long vaddr, unsigned int npages) { } -static inline void snp_set_memory_private(unsigned long vaddr, unsigned int npages) { } +static inline void snp_set_memory_shared(unsigned long vaddr, unsigned long npages) { } +static inline void snp_set_memory_private(unsigned long vaddr, unsigned long npages) { }  static inline void snp_set_wakeup_secondary_cpu(void) { }  static inline bool snp_init(struct boot_params *bp) { return false; }  static inline void snp_abort(void) { } @@ -225,6 +230,8 @@ static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *in  {  	return -ENOTTY;  } + +static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { }  #endif  #endif diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h index 2631e01f6e0f..19228beb4894 100644 --- a/arch/x86/include/asm/shared/tdx.h +++ b/arch/x86/include/asm/shared/tdx.h @@ -10,6 +10,20 @@  #define TDX_CPUID_LEAF_ID	0x21  #define TDX_IDENT		"IntelTDX    " +/* TDX module Call Leaf IDs */ +#define TDX_GET_INFO			1 +#define TDX_GET_VEINFO			3 +#define TDX_GET_REPORT			4 +#define TDX_ACCEPT_PAGE			6 +#define TDX_WR				8 + +/* TDCS fields. 
To be used by TDG.VM.WR and TDG.VM.RD module calls */ +#define TDCS_NOTIFY_ENABLES		0x9100000000000010 + +/* TDX hypercall Leaf IDs */ +#define TDVMCALL_MAP_GPA		0x10001 +#define TDVMCALL_REPORT_FATAL_ERROR	0x10003 +  #ifndef __ASSEMBLY__  /* @@ -37,8 +51,47 @@ struct tdx_hypercall_args {  u64 __tdx_hypercall(struct tdx_hypercall_args *args);  u64 __tdx_hypercall_ret(struct tdx_hypercall_args *args); +/* + * Wrapper for standard use of __tdx_hypercall with no output aside from + * return code. + */ +static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15) +{ +	struct tdx_hypercall_args args = { +		.r10 = TDX_HYPERCALL_STANDARD, +		.r11 = fn, +		.r12 = r12, +		.r13 = r13, +		.r14 = r14, +		.r15 = r15, +	}; + +	return __tdx_hypercall(&args); +} + +  /* Called from __tdx_hypercall() for unrecoverable failure */  void __tdx_hypercall_failed(void); +/* + * Used in __tdx_module_call() to gather the output registers' values of the + * TDCALL instruction when requesting services from the TDX module. This is a + * software only structure and not part of the TDX module/VMM ABI + */ +struct tdx_module_output { +	u64 rcx; +	u64 rdx; +	u64 r8; +	u64 r9; +	u64 r10; +	u64 r11; +}; + +/* Used to communicate with the TDX module */ +u64 __tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9, +		      struct tdx_module_output *out); + +bool tdx_accept_memory(phys_addr_t start, phys_addr_t end); +  #endif /* !__ASSEMBLY__ */  #endif /* _ASM_X86_SHARED_TDX_H */ diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 28d889c9aa16..603e6d1e9d4a 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -5,6 +5,8 @@  #include <linux/init.h>  #include <linux/bits.h> + +#include <asm/errno.h>  #include <asm/ptrace.h>  #include <asm/shared/tdx.h> @@ -21,21 +23,6 @@  #ifndef __ASSEMBLY__  /* - * Used to gather the output registers values of the TDCALL and SEAMCALL - * instructions when requesting services from the TDX module. 
- * - * This is a software only structure and not part of the TDX module/VMM ABI. - */ -struct tdx_module_output { -	u64 rcx; -	u64 rdx; -	u64 r8; -	u64 r9; -	u64 r10; -	u64 r11; -}; - -/*   * Used by the #VE exception handler to gather the #VE exception   * info from the TDX module. This is a software only structure   * and not part of the TDX module/VMM ABI. @@ -55,10 +42,6 @@ struct ve_info {  void __init tdx_early_init(void); -/* Used to communicate with the TDX module */ -u64 __tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9, -		      struct tdx_module_output *out); -  void tdx_get_ve_info(struct ve_info *ve);  bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve); diff --git a/arch/x86/include/asm/unaccepted_memory.h b/arch/x86/include/asm/unaccepted_memory.h new file mode 100644 index 000000000000..f5937e9866ac --- /dev/null +++ b/arch/x86/include/asm/unaccepted_memory.h @@ -0,0 +1,27 @@ +#ifndef _ASM_X86_UNACCEPTED_MEMORY_H +#define _ASM_X86_UNACCEPTED_MEMORY_H + +#include <linux/efi.h> +#include <asm/tdx.h> +#include <asm/sev.h> + +static inline void arch_accept_memory(phys_addr_t start, phys_addr_t end) +{ +	/* Platform-specific memory-acceptance call goes here */ +	if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { +		if (!tdx_accept_memory(start, end)) +			panic("TDX: Failed to accept memory\n"); +	} else if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) { +		snp_accept_memory(start, end); +	} else { +		panic("Cannot accept memory: unknown platform\n"); +	} +} + +static inline struct efi_unaccepted_memory *efi_get_unaccepted_table(void) +{ +	if (efi.unaccepted == EFI_INVALID_TABLE_ADDR) +		return NULL; +	return __va(efi.unaccepted); +} +#endif diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index 3a5b0c9c4fcc..2eabccde94fb 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -12,6 +12,9 @@  #ifndef __BOOT_COMPRESSED  #define error(v)	pr_err(v)  #define has_cpuflag(f)	
boot_cpu_has(f) +#else +#undef WARN +#define WARN(condition, format...) (!!(condition))  #endif  /* I/O parameters for CPUID-related helpers */ @@ -991,3 +994,103 @@ static void __init setup_cpuid_table(const struct cc_blob_sev_info *cc_info)  			cpuid_ext_range_max = fn->eax;  	}  } + +static void pvalidate_pages(struct snp_psc_desc *desc) +{ +	struct psc_entry *e; +	unsigned long vaddr; +	unsigned int size; +	unsigned int i; +	bool validate; +	int rc; + +	for (i = 0; i <= desc->hdr.end_entry; i++) { +		e = &desc->entries[i]; + +		vaddr = (unsigned long)pfn_to_kaddr(e->gfn); +		size = e->pagesize ? RMP_PG_SIZE_2M : RMP_PG_SIZE_4K; +		validate = e->operation == SNP_PAGE_STATE_PRIVATE; + +		rc = pvalidate(vaddr, size, validate); +		if (rc == PVALIDATE_FAIL_SIZEMISMATCH && size == RMP_PG_SIZE_2M) { +			unsigned long vaddr_end = vaddr + PMD_SIZE; + +			for (; vaddr < vaddr_end; vaddr += PAGE_SIZE) { +				rc = pvalidate(vaddr, RMP_PG_SIZE_4K, validate); +				if (rc) +					break; +			} +		} + +		if (rc) { +			WARN(1, "Failed to validate address 0x%lx ret %d", vaddr, rc); +			sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); +		} +	} +} + +static int vmgexit_psc(struct ghcb *ghcb, struct snp_psc_desc *desc) +{ +	int cur_entry, end_entry, ret = 0; +	struct snp_psc_desc *data; +	struct es_em_ctxt ctxt; + +	vc_ghcb_invalidate(ghcb); + +	/* Copy the input desc into GHCB shared buffer */ +	data = (struct snp_psc_desc *)ghcb->shared_buffer; +	memcpy(ghcb->shared_buffer, desc, min_t(int, GHCB_SHARED_BUF_SIZE, sizeof(*desc))); + +	/* +	 * As per the GHCB specification, the hypervisor can resume the guest +	 * before processing all the entries. Check whether all the entries +	 * are processed. If not, then keep retrying. Note, the hypervisor +	 * will update the data memory directly to indicate the status, so +	 * reference the data->hdr everywhere. 
+	 * +	 * The strategy here is to wait for the hypervisor to change the page +	 * state in the RMP table before guest accesses the memory pages. If the +	 * page state change was not successful, then later memory access will +	 * result in a crash. +	 */ +	cur_entry = data->hdr.cur_entry; +	end_entry = data->hdr.end_entry; + +	while (data->hdr.cur_entry <= data->hdr.end_entry) { +		ghcb_set_sw_scratch(ghcb, (u64)__pa(data)); + +		/* This will advance the shared buffer data points to. */ +		ret = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_PSC, 0, 0); + +		/* +		 * Page State Change VMGEXIT can pass error code through +		 * exit_info_2. +		 */ +		if (WARN(ret || ghcb->save.sw_exit_info_2, +			 "SNP: PSC failed ret=%d exit_info_2=%llx\n", +			 ret, ghcb->save.sw_exit_info_2)) { +			ret = 1; +			goto out; +		} + +		/* Verify that reserved bit is not set */ +		if (WARN(data->hdr.reserved, "Reserved bit is set in the PSC header\n")) { +			ret = 1; +			goto out; +		} + +		/* +		 * Sanity check that entry processing is not going backwards. +		 * This will happen only if hypervisor is tricking us. +		 */ +		if (WARN(data->hdr.end_entry > end_entry || cur_entry > data->hdr.cur_entry, +"SNP: PSC processing going backward, end_entry %d (got %d) cur_entry %d (got %d)\n", +			 end_entry, data->hdr.end_entry, cur_entry, data->hdr.cur_entry)) { +			ret = 1; +			goto out; +		} +	} + +out: +	return ret; +} diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index d095a5d3c14a..a0af3908ed49 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -119,7 +119,19 @@ static DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa);  struct sev_config {  	__u64 debug		: 1, -	      __reserved	: 63; + +	      /* +	       * A flag used by __set_pages_state() that indicates when the +	       * per-CPU GHCB has been created and registered and thus can be +	       * used by the BSP instead of the early boot GHCB. 
+	       * +	       * For APs, the per-CPU GHCB is created before they are started +	       * and registered upon startup, so this flag can be used globally +	       * for the BSP and APs. +	       */ +	      ghcbs_initialized	: 1, + +	      __reserved	: 62;  };  static struct sev_config sev_cfg __read_mostly; @@ -645,32 +657,26 @@ static u64 __init get_jump_table_addr(void)  	return ret;  } -static void pvalidate_pages(unsigned long vaddr, unsigned int npages, bool validate) -{ -	unsigned long vaddr_end; -	int rc; - -	vaddr = vaddr & PAGE_MASK; -	vaddr_end = vaddr + (npages << PAGE_SHIFT); - -	while (vaddr < vaddr_end) { -		rc = pvalidate(vaddr, RMP_PG_SIZE_4K, validate); -		if (WARN(rc, "Failed to validate address 0x%lx ret %d", vaddr, rc)) -			sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); - -		vaddr = vaddr + PAGE_SIZE; -	} -} - -static void __init early_set_pages_state(unsigned long paddr, unsigned int npages, enum psc_op op) +static void early_set_pages_state(unsigned long vaddr, unsigned long paddr, +				  unsigned long npages, enum psc_op op)  {  	unsigned long paddr_end;  	u64 val; +	int ret; + +	vaddr = vaddr & PAGE_MASK;  	paddr = paddr & PAGE_MASK;  	paddr_end = paddr + (npages << PAGE_SHIFT);  	while (paddr < paddr_end) { +		if (op == SNP_PAGE_STATE_SHARED) { +			/* Page validation must be rescinded before changing to shared */ +			ret = pvalidate(vaddr, RMP_PG_SIZE_4K, false); +			if (WARN(ret, "Failed to validate address 0x%lx ret %d", paddr, ret)) +				goto e_term; +		} +  		/*  		 * Use the MSR protocol because this function can be called before  		 * the GHCB is established. 
@@ -691,7 +697,15 @@ static void __init early_set_pages_state(unsigned long paddr, unsigned int npage  			 paddr, GHCB_MSR_PSC_RESP_VAL(val)))  			goto e_term; -		paddr = paddr + PAGE_SIZE; +		if (op == SNP_PAGE_STATE_PRIVATE) { +			/* Page validation must be performed after changing to private */ +			ret = pvalidate(vaddr, RMP_PG_SIZE_4K, true); +			if (WARN(ret, "Failed to validate address 0x%lx ret %d", paddr, ret)) +				goto e_term; +		} + +		vaddr += PAGE_SIZE; +		paddr += PAGE_SIZE;  	}  	return; @@ -701,7 +715,7 @@ e_term:  }  void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, -					 unsigned int npages) +					 unsigned long npages)  {  	/*  	 * This can be invoked in early boot while running identity mapped, so @@ -716,14 +730,11 @@ void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long padd  	  * Ask the hypervisor to mark the memory pages as private in the RMP  	  * table.  	  */ -	early_set_pages_state(paddr, npages, SNP_PAGE_STATE_PRIVATE); - -	/* Validate the memory pages after they've been added in the RMP table. */ -	pvalidate_pages(vaddr, npages, true); +	early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_PRIVATE);  }  void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, -					unsigned int npages) +					unsigned long npages)  {  	/*  	 * This can be invoked in early boot while running identity mapped, so @@ -734,11 +745,8 @@ void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr  	if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED))  		return; -	/* Invalidate the memory pages before they are marked shared in the RMP table. */ -	pvalidate_pages(vaddr, npages, false); -  	 /* Ask hypervisor to mark the memory pages shared in the RMP table. 
*/ -	early_set_pages_state(paddr, npages, SNP_PAGE_STATE_SHARED); +	early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_SHARED);  }  void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) @@ -756,96 +764,16 @@ void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op  		WARN(1, "invalid memory op %d\n", op);  } -static int vmgexit_psc(struct snp_psc_desc *desc) +static unsigned long __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr, +				       unsigned long vaddr_end, int op)  { -	int cur_entry, end_entry, ret = 0; -	struct snp_psc_desc *data;  	struct ghcb_state state; -	struct es_em_ctxt ctxt; -	unsigned long flags; -	struct ghcb *ghcb; - -	/* -	 * __sev_get_ghcb() needs to run with IRQs disabled because it is using -	 * a per-CPU GHCB. -	 */ -	local_irq_save(flags); - -	ghcb = __sev_get_ghcb(&state); -	if (!ghcb) { -		ret = 1; -		goto out_unlock; -	} - -	/* Copy the input desc into GHCB shared buffer */ -	data = (struct snp_psc_desc *)ghcb->shared_buffer; -	memcpy(ghcb->shared_buffer, desc, min_t(int, GHCB_SHARED_BUF_SIZE, sizeof(*desc))); - -	/* -	 * As per the GHCB specification, the hypervisor can resume the guest -	 * before processing all the entries. Check whether all the entries -	 * are processed. If not, then keep retrying. Note, the hypervisor -	 * will update the data memory directly to indicate the status, so -	 * reference the data->hdr everywhere. -	 * -	 * The strategy here is to wait for the hypervisor to change the page -	 * state in the RMP table before guest accesses the memory pages. If the -	 * page state change was not successful, then later memory access will -	 * result in a crash. -	 */ -	cur_entry = data->hdr.cur_entry; -	end_entry = data->hdr.end_entry; - -	while (data->hdr.cur_entry <= data->hdr.end_entry) { -		ghcb_set_sw_scratch(ghcb, (u64)__pa(data)); - -		/* This will advance the shared buffer data points to. 
*/ -		ret = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_PSC, 0, 0); - -		/* -		 * Page State Change VMGEXIT can pass error code through -		 * exit_info_2. -		 */ -		if (WARN(ret || ghcb->save.sw_exit_info_2, -			 "SNP: PSC failed ret=%d exit_info_2=%llx\n", -			 ret, ghcb->save.sw_exit_info_2)) { -			ret = 1; -			goto out; -		} - -		/* Verify that reserved bit is not set */ -		if (WARN(data->hdr.reserved, "Reserved bit is set in the PSC header\n")) { -			ret = 1; -			goto out; -		} - -		/* -		 * Sanity check that entry processing is not going backwards. -		 * This will happen only if hypervisor is tricking us. -		 */ -		if (WARN(data->hdr.end_entry > end_entry || cur_entry > data->hdr.cur_entry, -"SNP: PSC processing going backward, end_entry %d (got %d) cur_entry %d (got %d)\n", -			 end_entry, data->hdr.end_entry, cur_entry, data->hdr.cur_entry)) { -			ret = 1; -			goto out; -		} -	} - -out: -	__sev_put_ghcb(&state); - -out_unlock: -	local_irq_restore(flags); - -	return ret; -} - -static void __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr, -			      unsigned long vaddr_end, int op) -{ +	bool use_large_entry;  	struct psc_hdr *hdr;  	struct psc_entry *e; +	unsigned long flags;  	unsigned long pfn; +	struct ghcb *ghcb;  	int i;  	hdr = &data->hdr; @@ -854,74 +782,104 @@ static void __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr,  	memset(data, 0, sizeof(*data));  	i = 0; -	while (vaddr < vaddr_end) { -		if (is_vmalloc_addr((void *)vaddr)) +	while (vaddr < vaddr_end && i < ARRAY_SIZE(data->entries)) { +		hdr->end_entry = i; + +		if (is_vmalloc_addr((void *)vaddr)) {  			pfn = vmalloc_to_pfn((void *)vaddr); -		else +			use_large_entry = false; +		} else {  			pfn = __pa(vaddr) >> PAGE_SHIFT; +			use_large_entry = true; +		}  		e->gfn = pfn;  		e->operation = op; -		hdr->end_entry = i; -		/* -		 * Current SNP implementation doesn't keep track of the RMP page -		 * size so use 4K for simplicity. 
-		 */ -		e->pagesize = RMP_PG_SIZE_4K; +		if (use_large_entry && IS_ALIGNED(vaddr, PMD_SIZE) && +		    (vaddr_end - vaddr) >= PMD_SIZE) { +			e->pagesize = RMP_PG_SIZE_2M; +			vaddr += PMD_SIZE; +		} else { +			e->pagesize = RMP_PG_SIZE_4K; +			vaddr += PAGE_SIZE; +		} -		vaddr = vaddr + PAGE_SIZE;  		e++;  		i++;  	} -	if (vmgexit_psc(data)) +	/* Page validation must be rescinded before changing to shared */ +	if (op == SNP_PAGE_STATE_SHARED) +		pvalidate_pages(data); + +	local_irq_save(flags); + +	if (sev_cfg.ghcbs_initialized) +		ghcb = __sev_get_ghcb(&state); +	else +		ghcb = boot_ghcb; + +	/* Invoke the hypervisor to perform the page state changes */ +	if (!ghcb || vmgexit_psc(ghcb, data))  		sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); + +	if (sev_cfg.ghcbs_initialized) +		__sev_put_ghcb(&state); + +	local_irq_restore(flags); + +	/* Page validation must be performed after changing to private */ +	if (op == SNP_PAGE_STATE_PRIVATE) +		pvalidate_pages(data); + +	return vaddr;  } -static void set_pages_state(unsigned long vaddr, unsigned int npages, int op) +static void set_pages_state(unsigned long vaddr, unsigned long npages, int op)  { -	unsigned long vaddr_end, next_vaddr; -	struct snp_psc_desc *desc; +	struct snp_psc_desc desc; +	unsigned long vaddr_end; -	desc = kmalloc(sizeof(*desc), GFP_KERNEL_ACCOUNT); -	if (!desc) -		panic("SNP: failed to allocate memory for PSC descriptor\n"); +	/* Use the MSR protocol when a GHCB is not available. */ +	if (!boot_ghcb) +		return early_set_pages_state(vaddr, __pa(vaddr), npages, op);  	vaddr = vaddr & PAGE_MASK;  	vaddr_end = vaddr + (npages << PAGE_SHIFT); -	while (vaddr < vaddr_end) { -		/* Calculate the last vaddr that fits in one struct snp_psc_desc. 
*/ -		next_vaddr = min_t(unsigned long, vaddr_end, -				   (VMGEXIT_PSC_MAX_ENTRY * PAGE_SIZE) + vaddr); - -		__set_pages_state(desc, vaddr, next_vaddr, op); - -		vaddr = next_vaddr; -	} - -	kfree(desc); +	while (vaddr < vaddr_end) +		vaddr = __set_pages_state(&desc, vaddr, vaddr_end, op);  } -void snp_set_memory_shared(unsigned long vaddr, unsigned int npages) +void snp_set_memory_shared(unsigned long vaddr, unsigned long npages)  {  	if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))  		return; -	pvalidate_pages(vaddr, npages, false); -  	set_pages_state(vaddr, npages, SNP_PAGE_STATE_SHARED);  } -void snp_set_memory_private(unsigned long vaddr, unsigned int npages) +void snp_set_memory_private(unsigned long vaddr, unsigned long npages)  {  	if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))  		return;  	set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE); +} + +void snp_accept_memory(phys_addr_t start, phys_addr_t end) +{ +	unsigned long vaddr; +	unsigned int npages; -	pvalidate_pages(vaddr, npages, true); +	if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) +		return; + +	vaddr = (unsigned long)__va(start); +	npages = (end - start) >> PAGE_SHIFT; + +	set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE);  }  static int snp_set_vmsa(void *va, bool vmsa) @@ -1267,6 +1225,8 @@ void setup_ghcb(void)  		if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))  			snp_register_per_cpu_ghcb(); +		sev_cfg.ghcbs_initialized = true; +  		return;  	} diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index f3f2d87cce1b..e9f99c56f3ce 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -96,6 +96,9 @@ static const unsigned long * const efi_tables[] = {  #ifdef CONFIG_EFI_COCO_SECRET  	&efi.coco_secret,  #endif +#ifdef CONFIG_UNACCEPTED_MEMORY +	&efi.unaccepted, +#endif  };  u64 efi_setup;		/* efi setup_data physical address */ diff --git a/drivers/base/node.c b/drivers/base/node.c index b46db17124f3..655975946ef6 100644 --- a/drivers/base/node.c +++ 
b/drivers/base/node.c @@ -449,6 +449,9 @@ static ssize_t node_read_meminfo(struct device *dev,  			     "Node %d FileHugePages: %8lu kB\n"  			     "Node %d FilePmdMapped: %8lu kB\n"  #endif +#ifdef CONFIG_UNACCEPTED_MEMORY +			     "Node %d Unaccepted:     %8lu kB\n" +#endif  			     ,  			     nid, K(node_page_state(pgdat, NR_FILE_DIRTY)),  			     nid, K(node_page_state(pgdat, NR_WRITEBACK)), @@ -478,6 +481,10 @@ static ssize_t node_read_meminfo(struct device *dev,  			     nid, K(node_page_state(pgdat, NR_FILE_THPS)),  			     nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED))  #endif +#ifdef CONFIG_UNACCEPTED_MEMORY +			     , +			     nid, K(sum_zone_node_page_state(nid, NR_UNACCEPTED)) +#endif  			    );  	len += hugetlb_report_node_meminfo(buf, len, nid);  	return len; diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index 043ca31c114e..231f1c70d1db 100644 --- a/drivers/firmware/efi/Kconfig +++ b/drivers/firmware/efi/Kconfig @@ -269,6 +269,20 @@ config EFI_COCO_SECRET  	  virt/coco/efi_secret module to access the secrets, which in turn  	  allows userspace programs to access the injected secrets. +config UNACCEPTED_MEMORY +	bool +	depends on EFI_STUB +	help +	   Some Virtual Machine platforms, such as Intel TDX, require +	   some memory to be "accepted" by the guest before it can be used. +	   This mechanism helps prevent malicious hosts from making changes +	   to guest memory. + +	   UEFI specification v2.9 introduced EFI_UNACCEPTED_MEMORY memory type. + +	   This option adds support for unaccepted memory and makes such memory +	   usable by the kernel. 
+  config EFI_EMBEDDED_FIRMWARE  	bool  	select CRYPTO_LIB_SHA256 diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile index b51f2a4c821e..e489fefd23da 100644 --- a/drivers/firmware/efi/Makefile +++ b/drivers/firmware/efi/Makefile @@ -41,3 +41,4 @@ obj-$(CONFIG_EFI_CAPSULE_LOADER)	+= capsule-loader.o  obj-$(CONFIG_EFI_EARLYCON)		+= earlycon.o  obj-$(CONFIG_UEFI_CPER_ARM)		+= cper-arm.o  obj-$(CONFIG_UEFI_CPER_X86)		+= cper-x86.o +obj-$(CONFIG_UNACCEPTED_MEMORY)		+= unaccepted_memory.o diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 34b9e7876538..3a6ee7bb06f1 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -50,6 +50,9 @@ struct efi __read_mostly efi = {  #ifdef CONFIG_EFI_COCO_SECRET  	.coco_secret		= EFI_INVALID_TABLE_ADDR,  #endif +#ifdef CONFIG_UNACCEPTED_MEMORY +	.unaccepted		= EFI_INVALID_TABLE_ADDR, +#endif  };  EXPORT_SYMBOL(efi); @@ -584,6 +587,9 @@ static const efi_config_table_type_t common_tables[] __initconst = {  #ifdef CONFIG_EFI_COCO_SECRET  	{LINUX_EFI_COCO_SECRET_AREA_GUID,	&efi.coco_secret,	"CocoSecret"	},  #endif +#ifdef CONFIG_UNACCEPTED_MEMORY +	{LINUX_EFI_UNACCEPTED_MEM_TABLE_GUID,	&efi.unaccepted,	"Unaccepted"	}, +#endif  #ifdef CONFIG_EFI_GENERIC_STUB  	{LINUX_EFI_SCREEN_INFO_TABLE_GUID,	&screen_info_table			},  #endif @@ -738,6 +744,25 @@ int __init efi_config_parse_tables(const efi_config_table_t *config_tables,  		}  	} +	if (IS_ENABLED(CONFIG_UNACCEPTED_MEMORY) && +	    efi.unaccepted != EFI_INVALID_TABLE_ADDR) { +		struct efi_unaccepted_memory *unaccepted; + +		unaccepted = early_memremap(efi.unaccepted, sizeof(*unaccepted)); +		if (unaccepted) { +			unsigned long size; + +			if (unaccepted->version == 1) { +				size = sizeof(*unaccepted) + unaccepted->size; +				memblock_reserve(efi.unaccepted, size); +			} else { +				efi.unaccepted = EFI_INVALID_TABLE_ADDR; +			} + +			early_memunmap(unaccepted, sizeof(*unaccepted)); +		} +	} +  	return 0;  } @@ -822,6 +847,7 
@@ static __initdata char memory_type_name[][13] = {  	"MMIO Port",  	"PAL Code",  	"Persistent", +	"Unaccepted",  };  char * __init efi_md_typeattr_format(char *buf, size_t size, diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index 3abb2b357482..16d64a34d1e1 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -96,6 +96,8 @@ CFLAGS_arm32-stub.o		:= -DTEXT_OFFSET=$(TEXT_OFFSET)  zboot-obj-$(CONFIG_RISCV)	:= lib-clz_ctz.o lib-ashldi3.o  lib-$(CONFIG_EFI_ZBOOT)		+= zboot.o $(zboot-obj-y) +lib-$(CONFIG_UNACCEPTED_MEMORY) += unaccepted_memory.o bitmap.o find.o +  extra-y				:= $(lib-y)  lib-y				:= $(patsubst %.o,%.stub.o,$(lib-y)) diff --git a/drivers/firmware/efi/libstub/bitmap.c b/drivers/firmware/efi/libstub/bitmap.c new file mode 100644 index 000000000000..5c9bba0d549b --- /dev/null +++ b/drivers/firmware/efi/libstub/bitmap.c @@ -0,0 +1,41 @@ +#include <linux/bitmap.h> + +void __bitmap_set(unsigned long *map, unsigned int start, int len) +{ +	unsigned long *p = map + BIT_WORD(start); +	const unsigned int size = start + len; +	int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG); +	unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start); + +	while (len - bits_to_set >= 0) { +		*p |= mask_to_set; +		len -= bits_to_set; +		bits_to_set = BITS_PER_LONG; +		mask_to_set = ~0UL; +		p++; +	} +	if (len) { +		mask_to_set &= BITMAP_LAST_WORD_MASK(size); +		*p |= mask_to_set; +	} +} + +void __bitmap_clear(unsigned long *map, unsigned int start, int len) +{ +	unsigned long *p = map + BIT_WORD(start); +	const unsigned int size = start + len; +	int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); +	unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start); + +	while (len - bits_to_clear >= 0) { +		*p &= ~mask_to_clear; +		len -= bits_to_clear; +		bits_to_clear = BITS_PER_LONG; +		mask_to_clear = ~0UL; +		p++; +	} +	if (len) { +		mask_to_clear &= BITMAP_LAST_WORD_MASK(size); +	
	*p &= ~mask_to_clear; +	} +} diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index 54a2822cae77..6aa38a1bf126 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -1136,4 +1136,10 @@ void efi_remap_image(unsigned long image_base, unsigned alloc_size,  asmlinkage efi_status_t __efiapi  efi_zboot_entry(efi_handle_t handle, efi_system_table_t *systab); +efi_status_t allocate_unaccepted_bitmap(__u32 nr_desc, +					struct efi_boot_memmap *map); +void process_unaccepted_memory(u64 start, u64 end); +void accept_memory(phys_addr_t start, phys_addr_t end); +void arch_accept_memory(phys_addr_t start, phys_addr_t end); +  #endif diff --git a/drivers/firmware/efi/libstub/find.c b/drivers/firmware/efi/libstub/find.c new file mode 100644 index 000000000000..4e7740d28987 --- /dev/null +++ b/drivers/firmware/efi/libstub/find.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/bitmap.h> +#include <linux/math.h> +#include <linux/minmax.h> + +/* + * Common helper for find_next_bit() function family + * @FETCH: The expression that fetches and pre-processes each word of bitmap(s) + * @MUNGE: The expression that post-processes a word containing found bit (may be empty) + * @size: The bitmap size in bits + * @start: The bitnumber to start searching at + */ +#define FIND_NEXT_BIT(FETCH, MUNGE, size, start)				\ +({										\ +	unsigned long mask, idx, tmp, sz = (size), __start = (start);		\ +										\ +	if (unlikely(__start >= sz))						\ +		goto out;							\ +										\ +	mask = MUNGE(BITMAP_FIRST_WORD_MASK(__start));				\ +	idx = __start / BITS_PER_LONG;						\ +										\ +	for (tmp = (FETCH) & mask; !tmp; tmp = (FETCH)) {			\ +		if ((idx + 1) * BITS_PER_LONG >= sz)				\ +			goto out;						\ +		idx++;								\ +	}									\ +										\ +	sz = min(idx * BITS_PER_LONG + __ffs(MUNGE(tmp)), sz);			\ +out:										\ +	sz;									\ +}) + +unsigned long 
_find_next_bit(const unsigned long *addr, unsigned long nbits, unsigned long start) +{ +	return FIND_NEXT_BIT(addr[idx], /* nop */, nbits, start); +} + +unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits, +					 unsigned long start) +{ +	return FIND_NEXT_BIT(~addr[idx], /* nop */, nbits, start); +} diff --git a/drivers/firmware/efi/libstub/unaccepted_memory.c b/drivers/firmware/efi/libstub/unaccepted_memory.c new file mode 100644 index 000000000000..ca61f4733ea5 --- /dev/null +++ b/drivers/firmware/efi/libstub/unaccepted_memory.c @@ -0,0 +1,222 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/efi.h> +#include <asm/efi.h> +#include "efistub.h" + +struct efi_unaccepted_memory *unaccepted_table; + +efi_status_t allocate_unaccepted_bitmap(__u32 nr_desc, +					struct efi_boot_memmap *map) +{ +	efi_guid_t unaccepted_table_guid = LINUX_EFI_UNACCEPTED_MEM_TABLE_GUID; +	u64 unaccepted_start = ULLONG_MAX, unaccepted_end = 0, bitmap_size; +	efi_status_t status; +	int i; + +	/* Check if the table is already installed */ +	unaccepted_table = get_efi_config_table(unaccepted_table_guid); +	if (unaccepted_table) { +		if (unaccepted_table->version != 1) { +			efi_err("Unknown version of unaccepted memory table\n"); +			return EFI_UNSUPPORTED; +		} +		return EFI_SUCCESS; +	} + +	/* Check if there's any unaccepted memory and find the max address */ +	for (i = 0; i < nr_desc; i++) { +		efi_memory_desc_t *d; +		unsigned long m = (unsigned long)map->map; + +		d = efi_early_memdesc_ptr(m, map->desc_size, i); +		if (d->type != EFI_UNACCEPTED_MEMORY) +			continue; + +		unaccepted_start = min(unaccepted_start, d->phys_addr); +		unaccepted_end = max(unaccepted_end, +				     d->phys_addr + d->num_pages * PAGE_SIZE); +	} + +	if (unaccepted_start == ULLONG_MAX) +		return EFI_SUCCESS; + +	unaccepted_start = round_down(unaccepted_start, +				      EFI_UNACCEPTED_UNIT_SIZE); +	unaccepted_end = round_up(unaccepted_end, EFI_UNACCEPTED_UNIT_SIZE); + +	/* 
+	 * If unaccepted memory is present, allocate a bitmap to track what +	 * memory has to be accepted before access. +	 * +	 * One bit in the bitmap represents 2MiB in the address space: +	 * A 4k bitmap can track 64GiB of physical address space. +	 * +	 * In the worst case scenario -- a huge hole in the middle of the +	 * address space -- It needs 256MiB to handle 4PiB of the address +	 * space. +	 * +	 * The bitmap will be populated in setup_e820() according to the memory +	 * map after efi_exit_boot_services(). +	 */ +	bitmap_size = DIV_ROUND_UP(unaccepted_end - unaccepted_start, +				   EFI_UNACCEPTED_UNIT_SIZE * BITS_PER_BYTE); + +	status = efi_bs_call(allocate_pool, EFI_LOADER_DATA, +			     sizeof(*unaccepted_table) + bitmap_size, +			     (void **)&unaccepted_table); +	if (status != EFI_SUCCESS) { +		efi_err("Failed to allocate unaccepted memory config table\n"); +		return status; +	} + +	unaccepted_table->version = 1; +	unaccepted_table->unit_size = EFI_UNACCEPTED_UNIT_SIZE; +	unaccepted_table->phys_base = unaccepted_start; +	unaccepted_table->size = bitmap_size; +	memset(unaccepted_table->bitmap, 0, bitmap_size); + +	status = efi_bs_call(install_configuration_table, +			     &unaccepted_table_guid, unaccepted_table); +	if (status != EFI_SUCCESS) { +		efi_bs_call(free_pool, unaccepted_table); +		efi_err("Failed to install unaccepted memory config table!\n"); +	} + +	return status; +} + +/* + * The accepted memory bitmap only works at unit_size granularity.  Take + * unaligned start/end addresses and either: + *  1. Accepts the memory immediately and in its entirety + *  2. Accepts unaligned parts, and marks *some* aligned part unaccepted + * + * The function will never reach the bitmap_set() with zero bits to set. 
+ */ +void process_unaccepted_memory(u64 start, u64 end) +{ +	u64 unit_size = unaccepted_table->unit_size; +	u64 unit_mask = unaccepted_table->unit_size - 1; +	u64 bitmap_size = unaccepted_table->size; + +	/* +	 * Ensure that at least one bit will be set in the bitmap by +	 * immediately accepting all regions under 2*unit_size.  This is +	 * imprecise and may immediately accept some areas that could +	 * have been represented in the bitmap.  But, results in simpler +	 * code below +	 * +	 * Consider case like this (assuming unit_size == 2MB): +	 * +	 * | 4k | 2044k |    2048k   | +	 * ^ 0x0        ^ 2MB        ^ 4MB +	 * +	 * Only the first 4k has been accepted. The 0MB->2MB region can not be +	 * represented in the bitmap. The 2MB->4MB region can be represented in +	 * the bitmap. But, the 0MB->4MB region is <2*unit_size and will be +	 * immediately accepted in its entirety. +	 */ +	if (end - start < 2 * unit_size) { +		arch_accept_memory(start, end); +		return; +	} + +	/* +	 * No matter how the start and end are aligned, at least one unaccepted +	 * unit_size area will remain to be marked in the bitmap. +	 */ + +	/* Immediately accept a <unit_size piece at the start: */ +	if (start & unit_mask) { +		arch_accept_memory(start, round_up(start, unit_size)); +		start = round_up(start, unit_size); +	} + +	/* Immediately accept a <unit_size piece at the end: */ +	if (end & unit_mask) { +		arch_accept_memory(round_down(end, unit_size), end); +		end = round_down(end, unit_size); +	} + +	/* +	 * Accept part of the range that is before phys_base and cannot be recorded +	 * into the bitmap. 
+	 */ +	if (start < unaccepted_table->phys_base) { +		arch_accept_memory(start, +				   min(unaccepted_table->phys_base, end)); +		start = unaccepted_table->phys_base; +	} + +	/* Nothing to record */ +	if (end < unaccepted_table->phys_base) +		return; + +	/* Translate to offsets from the beginning of the bitmap */ +	start -= unaccepted_table->phys_base; +	end -= unaccepted_table->phys_base; + +	/* Accept memory that doesn't fit into bitmap */ +	if (end > bitmap_size * unit_size * BITS_PER_BYTE) { +		unsigned long phys_start, phys_end; + +		phys_start = bitmap_size * unit_size * BITS_PER_BYTE + +			     unaccepted_table->phys_base; +		phys_end = end + unaccepted_table->phys_base; + +		arch_accept_memory(phys_start, phys_end); +		end = bitmap_size * unit_size * BITS_PER_BYTE; +	} + +	/* +	 * 'start' and 'end' are now both unit_size-aligned. +	 * Record the range as being unaccepted: +	 */ +	bitmap_set(unaccepted_table->bitmap, +		   start / unit_size, (end - start) / unit_size); +} + +void accept_memory(phys_addr_t start, phys_addr_t end) +{ +	unsigned long range_start, range_end; +	unsigned long bitmap_size; +	u64 unit_size; + +	if (!unaccepted_table) +		return; + +	unit_size = unaccepted_table->unit_size; + +	/* +	 * Only care for the part of the range that is represented +	 * in the bitmap. 
+	 */ +	if (start < unaccepted_table->phys_base) +		start = unaccepted_table->phys_base; +	if (end < unaccepted_table->phys_base) +		return; + +	/* Translate to offsets from the beginning of the bitmap */ +	start -= unaccepted_table->phys_base; +	end -= unaccepted_table->phys_base; + +	/* Make sure not to overrun the bitmap */ +	if (end > unaccepted_table->size * unit_size * BITS_PER_BYTE) +		end = unaccepted_table->size * unit_size * BITS_PER_BYTE; + +	range_start = start / unit_size; +	bitmap_size = DIV_ROUND_UP(end, unit_size); + +	for_each_set_bitrange_from(range_start, range_end, +				   unaccepted_table->bitmap, bitmap_size) { +		unsigned long phys_start, phys_end; + +		phys_start = range_start * unit_size + unaccepted_table->phys_base; +		phys_end = range_end * unit_size + unaccepted_table->phys_base; + +		arch_accept_memory(phys_start, phys_end); +		bitmap_clear(unaccepted_table->bitmap, +			     range_start, range_end - range_start); +	} +} diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c index a0bfd31358ba..220be75a5cdc 100644 --- a/drivers/firmware/efi/libstub/x86-stub.c +++ b/drivers/firmware/efi/libstub/x86-stub.c @@ -26,6 +26,17 @@ const efi_dxe_services_table_t *efi_dxe_table;  u32 image_offset __section(".data");  static efi_loaded_image_t *image = NULL; +typedef union sev_memory_acceptance_protocol sev_memory_acceptance_protocol_t; +union sev_memory_acceptance_protocol { +	struct { +		efi_status_t (__efiapi * allow_unaccepted_memory)( +			sev_memory_acceptance_protocol_t *); +	}; +	struct { +		u32 allow_unaccepted_memory; +	} mixed_mode; +}; +  static efi_status_t  preserve_pci_rom_image(efi_pci_io_protocol_t *pci, struct pci_setup_rom **__rom)  { @@ -310,6 +321,29 @@ setup_memory_protection(unsigned long image_base, unsigned long image_size)  #endif  } +static void setup_unaccepted_memory(void) +{ +	efi_guid_t mem_acceptance_proto = OVMF_SEV_MEMORY_ACCEPTANCE_PROTOCOL_GUID; +	
sev_memory_acceptance_protocol_t *proto; +	efi_status_t status; + +	if (!IS_ENABLED(CONFIG_UNACCEPTED_MEMORY)) +		return; + +	/* +	 * Enable unaccepted memory before calling exit boot services in order +	 * for the UEFI to not accept all memory on EBS. +	 */ +	status = efi_bs_call(locate_protocol, &mem_acceptance_proto, NULL, +			     (void **)&proto); +	if (status != EFI_SUCCESS) +		return; + +	status = efi_call_proto(proto, allow_unaccepted_memory); +	if (status != EFI_SUCCESS) +		efi_err("Memory acceptance protocol failed\n"); +} +  static const efi_char16_t apple[] = L"Apple";  static void setup_quirks(struct boot_params *boot_params, @@ -613,6 +647,16 @@ setup_e820(struct boot_params *params, struct setup_data *e820ext, u32 e820ext_s  			e820_type = E820_TYPE_PMEM;  			break; +		case EFI_UNACCEPTED_MEMORY: +			if (!IS_ENABLED(CONFIG_UNACCEPTED_MEMORY)) { +				efi_warn_once( +"The system has unaccepted memory,  but kernel does not support it\nConsider enabling CONFIG_UNACCEPTED_MEMORY\n"); +				continue; +			} +			e820_type = E820_TYPE_RAM; +			process_unaccepted_memory(d->phys_addr, +						  d->phys_addr + PAGE_SIZE * d->num_pages); +			break;  		default:  			continue;  		} @@ -681,28 +725,27 @@ static efi_status_t allocate_e820(struct boot_params *params,  				  struct setup_data **e820ext,  				  u32 *e820ext_size)  { -	unsigned long map_size, desc_size, map_key; +	struct efi_boot_memmap *map;  	efi_status_t status; -	__u32 nr_desc, desc_version; - -	/* Only need the size of the mem map and size of each mem descriptor */ -	map_size = 0; -	status = efi_bs_call(get_memory_map, &map_size, NULL, &map_key, -			     &desc_size, &desc_version); -	if (status != EFI_BUFFER_TOO_SMALL) -		return (status != EFI_SUCCESS) ? 
status : EFI_UNSUPPORTED; +	__u32 nr_desc; -	nr_desc = map_size / desc_size + EFI_MMAP_NR_SLACK_SLOTS; +	status = efi_get_memory_map(&map, false); +	if (status != EFI_SUCCESS) +		return status; -	if (nr_desc > ARRAY_SIZE(params->e820_table)) { -		u32 nr_e820ext = nr_desc - ARRAY_SIZE(params->e820_table); +	nr_desc = map->map_size / map->desc_size; +	if (nr_desc > ARRAY_SIZE(params->e820_table) - EFI_MMAP_NR_SLACK_SLOTS) { +		u32 nr_e820ext = nr_desc - ARRAY_SIZE(params->e820_table) + +				 EFI_MMAP_NR_SLACK_SLOTS;  		status = alloc_e820ext(nr_e820ext, e820ext, e820ext_size); -		if (status != EFI_SUCCESS) -			return status;  	} -	return EFI_SUCCESS; +	if (IS_ENABLED(CONFIG_UNACCEPTED_MEMORY) && status == EFI_SUCCESS) +		status = allocate_unaccepted_bitmap(nr_desc, map); + +	efi_bs_call(free_pool, map); +	return status;  }  struct exit_boot_struct { @@ -899,6 +942,8 @@ asmlinkage unsigned long efi_main(efi_handle_t handle,  	setup_quirks(boot_params, bzimage_addr, buffer_end - buffer_start); +	setup_unaccepted_memory(); +  	status = exit_boot(boot_params, handle);  	if (status != EFI_SUCCESS) {  		efi_err("exit_boot() failed!\n"); diff --git a/drivers/firmware/efi/unaccepted_memory.c b/drivers/firmware/efi/unaccepted_memory.c new file mode 100644 index 000000000000..853f7dc3c21d --- /dev/null +++ b/drivers/firmware/efi/unaccepted_memory.c @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/efi.h> +#include <linux/memblock.h> +#include <linux/spinlock.h> +#include <asm/unaccepted_memory.h> + +/* Protects unaccepted memory bitmap */ +static DEFINE_SPINLOCK(unaccepted_memory_lock); + +/* + * accept_memory() -- Consult bitmap and accept the memory if needed. + * + * Only memory that is explicitly marked as unaccepted in the bitmap requires + * an action. All the remaining memory is implicitly accepted and doesn't need + * acceptance. 
+ * + * No need to accept: + *  - anything if the system has no unaccepted table; + *  - memory that is below phys_base; + *  - memory that is above the memory that is addressable by the bitmap; + */ +void accept_memory(phys_addr_t start, phys_addr_t end) +{ +	struct efi_unaccepted_memory *unaccepted; +	unsigned long range_start, range_end; +	unsigned long flags; +	u64 unit_size; + +	unaccepted = efi_get_unaccepted_table(); +	if (!unaccepted) +		return; + +	unit_size = unaccepted->unit_size; + +	/* +	 * Only care for the part of the range that is represented +	 * in the bitmap. +	 */ +	if (start < unaccepted->phys_base) +		start = unaccepted->phys_base; +	if (end < unaccepted->phys_base) +		return; + +	/* Translate to offsets from the beginning of the bitmap */ +	start -= unaccepted->phys_base; +	end -= unaccepted->phys_base; + +	/* +	 * load_unaligned_zeropad() can lead to unwanted loads across page +	 * boundaries. The unwanted loads are typically harmless. But, they +	 * might be made to totally unrelated or even unmapped memory. +	 * load_unaligned_zeropad() relies on exception fixup (#PF, #GP and now +	 * #VE) to recover from these unwanted loads. +	 * +	 * But, this approach does not work for unaccepted memory. For TDX, a +	 * load from unaccepted memory will not lead to a recoverable exception +	 * within the guest. The guest will exit to the VMM where the only +	 * recourse is to terminate the guest. +	 * +	 * There are two parts to fix this issue and comprehensively avoid +	 * access to unaccepted memory. Together these ensure that an extra +	 * "guard" page is accepted in addition to the memory that needs to be +	 * used: +	 * +	 * 1. Implicitly extend the range_contains_unaccepted_memory(start, end) +	 *    checks up to end+unit_size if 'end' is aligned on a unit_size +	 *    boundary. +	 * +	 * 2. Implicitly extend accept_memory(start, end) to end+unit_size if +	 *    'end' is aligned on a unit_size boundary. 
(immediately following +	 *    this comment) +	 */ +	if (!(end % unit_size)) +		end += unit_size; + +	/* Make sure not to overrun the bitmap */ +	if (end > unaccepted->size * unit_size * BITS_PER_BYTE) +		end = unaccepted->size * unit_size * BITS_PER_BYTE; + +	range_start = start / unit_size; + +	spin_lock_irqsave(&unaccepted_memory_lock, flags); +	for_each_set_bitrange_from(range_start, range_end, unaccepted->bitmap, +				   DIV_ROUND_UP(end, unit_size)) { +		unsigned long phys_start, phys_end; +		unsigned long len = range_end - range_start; + +		phys_start = range_start * unit_size + unaccepted->phys_base; +		phys_end = range_end * unit_size + unaccepted->phys_base; + +		arch_accept_memory(phys_start, phys_end); +		bitmap_clear(unaccepted->bitmap, range_start, len); +	} +	spin_unlock_irqrestore(&unaccepted_memory_lock, flags); +} + +bool range_contains_unaccepted_memory(phys_addr_t start, phys_addr_t end) +{ +	struct efi_unaccepted_memory *unaccepted; +	unsigned long flags; +	bool ret = false; +	u64 unit_size; + +	unaccepted = efi_get_unaccepted_table(); +	if (!unaccepted) +		return false; + +	unit_size = unaccepted->unit_size; + +	/* +	 * Only care for the part of the range that is represented +	 * in the bitmap. +	 */ +	if (start < unaccepted->phys_base) +		start = unaccepted->phys_base; +	if (end < unaccepted->phys_base) +		return false; + +	/* Translate to offsets from the beginning of the bitmap */ +	start -= unaccepted->phys_base; +	end -= unaccepted->phys_base; + +	/* +	 * Also consider the unaccepted state of the *next* page. See fix #1 in +	 * the comment on load_unaligned_zeropad() in accept_memory(). 
+	 */ +	if (!(end % unit_size)) +		end += unit_size; + +	/* Make sure not to overrun the bitmap */ +	if (end > unaccepted->size * unit_size * BITS_PER_BYTE) +		end = unaccepted->size * unit_size * BITS_PER_BYTE; + +	spin_lock_irqsave(&unaccepted_memory_lock, flags); +	while (start < end) { +		if (test_bit(start / unit_size, unaccepted->bitmap)) { +			ret = true; +			break; +		} + +		start += unit_size; +	} +	spin_unlock_irqrestore(&unaccepted_memory_lock, flags); + +	return ret; +} diff --git a/drivers/virt/coco/sev-guest/Kconfig b/drivers/virt/coco/sev-guest/Kconfig index f9db0799ae67..da2d7ca531f0 100644 --- a/drivers/virt/coco/sev-guest/Kconfig +++ b/drivers/virt/coco/sev-guest/Kconfig @@ -2,6 +2,7 @@ config SEV_GUEST  	tristate "AMD SEV Guest driver"  	default m  	depends on AMD_MEM_ENCRYPT +	select CRYPTO  	select CRYPTO_AEAD2  	select CRYPTO_GCM  	help diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index b43d0bd42762..8dca4d6d96c7 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -168,6 +168,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v)  		    global_zone_page_state(NR_FREE_CMA_PAGES));  #endif +#ifdef CONFIG_UNACCEPTED_MEMORY +	show_val_kb(m, "Unaccepted:     ", +		    global_zone_page_state(NR_UNACCEPTED)); +#endif +  	hugetlb_report_meminfo(m);  	arch_report_meminfo(m); diff --git a/include/linux/efi.h b/include/linux/efi.h index 571d1a6e1b74..18d83a613635 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -108,7 +108,8 @@ typedef	struct {  #define EFI_MEMORY_MAPPED_IO_PORT_SPACE	12  #define EFI_PAL_CODE			13  #define EFI_PERSISTENT_MEMORY		14 -#define EFI_MAX_MEMORY_TYPE		15 +#define EFI_UNACCEPTED_MEMORY		15 +#define EFI_MAX_MEMORY_TYPE		16  /* Attribute values: */  #define EFI_MEMORY_UC		((u64)0x0000000000000001ULL)	/* uncached */ @@ -417,6 +418,7 @@ void efi_native_runtime_setup(void);  #define LINUX_EFI_MOK_VARIABLE_TABLE_GUID	EFI_GUID(0xc451ed2b, 0x9694, 0x45d3,  0xba, 0xba, 0xed, 0x9f, 0x89, 0x88, 0xa3, 0x89) 
 #define LINUX_EFI_COCO_SECRET_AREA_GUID		EFI_GUID(0xadf956ad, 0xe98c, 0x484c,  0xae, 0x11, 0xb5, 0x1c, 0x7d, 0x33, 0x64, 0x47)  #define LINUX_EFI_BOOT_MEMMAP_GUID		EFI_GUID(0x800f683f, 0xd08b, 0x423a,  0xa2, 0x93, 0x96, 0x5c, 0x3c, 0x6f, 0xe2, 0xb4) +#define LINUX_EFI_UNACCEPTED_MEM_TABLE_GUID	EFI_GUID(0xd5d1de3c, 0x105c, 0x44f9,  0x9e, 0xa9, 0xbc, 0xef, 0x98, 0x12, 0x00, 0x31)  #define RISCV_EFI_BOOT_PROTOCOL_GUID		EFI_GUID(0xccd15fec, 0x6f73, 0x4eec,  0x83, 0x95, 0x3e, 0x69, 0xe4, 0xb9, 0x40, 0xbf) @@ -435,6 +437,9 @@ void efi_native_runtime_setup(void);  #define DELLEMC_EFI_RCI2_TABLE_GUID		EFI_GUID(0x2d9f28a2, 0xa886, 0x456a,  0x97, 0xa8, 0xf1, 0x1e, 0xf2, 0x4f, 0xf4, 0x55)  #define AMD_SEV_MEM_ENCRYPT_GUID		EFI_GUID(0x0cf29b71, 0x9e51, 0x433a,  0xa3, 0xb7, 0x81, 0xf3, 0xab, 0x16, 0xb8, 0x75) +/* OVMF protocol GUIDs */ +#define OVMF_SEV_MEMORY_ACCEPTANCE_PROTOCOL_GUID	EFI_GUID(0xc5a010fe, 0x38a7, 0x4531,  0x8a, 0x4a, 0x05, 0x00, 0xd2, 0xfd, 0x16, 0x49) +  typedef struct {  	efi_guid_t guid;  	u64 table; @@ -534,6 +539,14 @@ struct efi_boot_memmap {  	efi_memory_desc_t	map[];  }; +struct efi_unaccepted_memory { +	u32 version; +	u32 unit_size; +	u64 phys_base; +	u64 size; +	unsigned long bitmap[]; +}; +  /*   * Architecture independent structure for describing a memory map for the   * benefit of efi_memmap_init_early(), and for passing context between @@ -636,6 +649,7 @@ extern struct efi {  	unsigned long			tpm_final_log;		/* TPM2 Final Events Log table */  	unsigned long			mokvar_table;		/* MOK variable config table */  	unsigned long			coco_secret;		/* Confidential computing secret table */ +	unsigned long			unaccepted;		/* Unaccepted memory table */  	efi_get_time_t			*get_time;  	efi_set_time_t			*set_time; diff --git a/include/linux/mm.h b/include/linux/mm.h index 200068d98686..fec149585985 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3839,4 +3839,23 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start,  }  #endif +#ifdef 
CONFIG_UNACCEPTED_MEMORY + +bool range_contains_unaccepted_memory(phys_addr_t start, phys_addr_t end); +void accept_memory(phys_addr_t start, phys_addr_t end); + +#else + +static inline bool range_contains_unaccepted_memory(phys_addr_t start, +						    phys_addr_t end) +{ +	return false; +} + +static inline void accept_memory(phys_addr_t start, phys_addr_t end) +{ +} + +#endif +  #endif /* _LINUX_MM_H */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a4889c9d4055..6c1c2fc13017 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -143,6 +143,9 @@ enum zone_stat_item {  	NR_ZSPAGES,		/* allocated in zsmalloc */  #endif  	NR_FREE_CMA_PAGES, +#ifdef CONFIG_UNACCEPTED_MEMORY +	NR_UNACCEPTED, +#endif  	NR_VM_ZONE_STAT_ITEMS };  enum node_stat_item { @@ -910,6 +913,11 @@ struct zone {  	/* free areas of different sizes */  	struct free_area	free_area[MAX_ORDER + 1]; +#ifdef CONFIG_UNACCEPTED_MEMORY +	/* Pages to be accepted. All pages on the list are MAX_ORDER */ +	struct list_head	unaccepted_pages; +#endif +  	/* zone flags, see below */  	unsigned long		flags; diff --git a/mm/memblock.c b/mm/memblock.c index 3feafea06ab2..50b921119600 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1436,6 +1436,15 @@ done:  		 */  		kmemleak_alloc_phys(found, size, 0); +	/* +	 * Some Virtual Machine platforms, such as Intel TDX or AMD SEV-SNP, +	 * require memory to be accepted before it can be used by the +	 * guest. +	 * +	 * Accept the memory of the allocated buffer. 
+	 */ +	accept_memory(found, found + size); +  	return found;  } diff --git a/mm/mm_init.c b/mm/mm_init.c index 7f7f9c677854..1cfc08e25f93 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1375,6 +1375,10 @@ static void __meminit zone_init_free_lists(struct zone *zone)  		INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);  		zone->free_area[order].nr_free = 0;  	} + +#ifdef CONFIG_UNACCEPTED_MEMORY +	INIT_LIST_HEAD(&zone->unaccepted_pages); +#endif  }  void __meminit init_currently_empty_zone(struct zone *zone, @@ -1960,6 +1964,9 @@ static void __init deferred_free_range(unsigned long pfn,  		return;  	} +	/* Accept chunks smaller than MAX_ORDER upfront */ +	accept_memory(PFN_PHYS(pfn), PFN_PHYS(pfn + nr_pages)); +  	for (i = 0; i < nr_pages; i++, page++, pfn++) {  		if (pageblock_aligned(pfn))  			set_pageblock_migratetype(page, MIGRATE_MOVABLE); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 47421bedc12b..d239fba3f31c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -387,6 +387,12 @@ EXPORT_SYMBOL(nr_node_ids);  EXPORT_SYMBOL(nr_online_nodes);  #endif +static bool page_contains_unaccepted(struct page *page, unsigned int order); +static void accept_page(struct page *page, unsigned int order); +static bool try_to_accept_memory(struct zone *zone, unsigned int order); +static inline bool has_unaccepted_memory(void); +static bool __free_unaccepted(struct page *page); +  int page_group_by_mobility_disabled __read_mostly;  #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT @@ -1481,6 +1487,13 @@ void __free_pages_core(struct page *page, unsigned int order)  	atomic_long_add(nr_pages, &page_zone(page)->managed_pages); +	if (page_contains_unaccepted(page, order)) { +		if (order == MAX_ORDER && __free_unaccepted(page)) +			return; + +		accept_page(page, order); +	} +  	/*  	 * Bypass PCP and place fresh pages right to the tail, primarily  	 * relevant for memory onlining. 
@@ -3159,6 +3172,9 @@ static inline long __zone_watermark_unusable_free(struct zone *z,  	if (!(alloc_flags & ALLOC_CMA))  		unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);  #endif +#ifdef CONFIG_UNACCEPTED_MEMORY +	unusable_free += zone_page_state(z, NR_UNACCEPTED); +#endif  	return unusable_free;  } @@ -3458,6 +3474,11 @@ retry:  				       gfp_mask)) {  			int ret; +			if (has_unaccepted_memory()) { +				if (try_to_accept_memory(zone, order)) +					goto try_this_zone; +			} +  #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT  			/*  			 * Watermark failed for this zone, but see if we can @@ -3510,6 +3531,11 @@ try_this_zone:  			return page;  		} else { +			if (has_unaccepted_memory()) { +				if (try_to_accept_memory(zone, order)) +					goto try_this_zone; +			} +  #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT  			/* Try again if zone has deferred pages */  			if (deferred_pages_enabled()) { @@ -7215,3 +7241,150 @@ bool has_managed_dma(void)  	return false;  }  #endif /* CONFIG_ZONE_DMA */ + +#ifdef CONFIG_UNACCEPTED_MEMORY + +/* Counts number of zones with unaccepted pages. 
*/ +static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages); + +static bool lazy_accept = true; + +static int __init accept_memory_parse(char *p) +{ +	if (!strcmp(p, "lazy")) { +		lazy_accept = true; +		return 0; +	} else if (!strcmp(p, "eager")) { +		lazy_accept = false; +		return 0; +	} else { +		return -EINVAL; +	} +} +early_param("accept_memory", accept_memory_parse); + +static bool page_contains_unaccepted(struct page *page, unsigned int order) +{ +	phys_addr_t start = page_to_phys(page); +	phys_addr_t end = start + (PAGE_SIZE << order); + +	return range_contains_unaccepted_memory(start, end); +} + +static void accept_page(struct page *page, unsigned int order) +{ +	phys_addr_t start = page_to_phys(page); + +	accept_memory(start, start + (PAGE_SIZE << order)); +} + +static bool try_to_accept_memory_one(struct zone *zone) +{ +	unsigned long flags; +	struct page *page; +	bool last; + +	if (list_empty(&zone->unaccepted_pages)) +		return false; + +	spin_lock_irqsave(&zone->lock, flags); +	page = list_first_entry_or_null(&zone->unaccepted_pages, +					struct page, lru); +	if (!page) { +		spin_unlock_irqrestore(&zone->lock, flags); +		return false; +	} + +	list_del(&page->lru); +	last = list_empty(&zone->unaccepted_pages); + +	__mod_zone_freepage_state(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); +	__mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES); +	spin_unlock_irqrestore(&zone->lock, flags); + +	accept_page(page, MAX_ORDER); + +	__free_pages_ok(page, MAX_ORDER, FPI_TO_TAIL); + +	if (last) +		static_branch_dec(&zones_with_unaccepted_pages); + +	return true; +} + +static bool try_to_accept_memory(struct zone *zone, unsigned int order) +{ +	long to_accept; +	int ret = false; + +	/* How much to accept to get to high watermark? 
*/ +	to_accept = high_wmark_pages(zone) - +		    (zone_page_state(zone, NR_FREE_PAGES) - +		    __zone_watermark_unusable_free(zone, order, 0)); + +	/* Accept at least one page */ +	do { +		if (!try_to_accept_memory_one(zone)) +			break; +		ret = true; +		to_accept -= MAX_ORDER_NR_PAGES; +	} while (to_accept > 0); + +	return ret; +} + +static inline bool has_unaccepted_memory(void) +{ +	return static_branch_unlikely(&zones_with_unaccepted_pages); +} + +static bool __free_unaccepted(struct page *page) +{ +	struct zone *zone = page_zone(page); +	unsigned long flags; +	bool first = false; + +	if (!lazy_accept) +		return false; + +	spin_lock_irqsave(&zone->lock, flags); +	first = list_empty(&zone->unaccepted_pages); +	list_add_tail(&page->lru, &zone->unaccepted_pages); +	__mod_zone_freepage_state(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE); +	__mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES); +	spin_unlock_irqrestore(&zone->lock, flags); + +	if (first) +		static_branch_inc(&zones_with_unaccepted_pages); + +	return true; +} + +#else + +static bool page_contains_unaccepted(struct page *page, unsigned int order) +{ +	return false; +} + +static void accept_page(struct page *page, unsigned int order) +{ +} + +static bool try_to_accept_memory(struct zone *zone, unsigned int order) +{ +	return false; +} + +static inline bool has_unaccepted_memory(void) +{ +	return false; +} + +static bool __free_unaccepted(struct page *page) +{ +	BUILD_BUG(); +	return false; +} + +#endif /* CONFIG_UNACCEPTED_MEMORY */ diff --git a/mm/vmstat.c b/mm/vmstat.c index c28046371b45..282349cabf01 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1180,6 +1180,9 @@ const char * const vmstat_text[] = {  	"nr_zspages",  #endif  	"nr_free_cma", +#ifdef CONFIG_UNACCEPTED_MEMORY +	"nr_unaccepted", +#endif  	/* enum numa_stat_item counters */  #ifdef CONFIG_NUMA | 
