Diffstat (limited to 'arch/x86/boot')
62 files changed, 3690 insertions, 2114 deletions
diff --git a/arch/x86/boot/.gitignore b/arch/x86/boot/.gitignore index 1189be057ebd..070ef534c915 100644 --- a/arch/x86/boot/.gitignore +++ b/arch/x86/boot/.gitignore @@ -12,3 +12,4 @@ fdimage mtools.conf image.iso hdimage +tools/ diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 9860ca5979f8..3f9fb3698d66 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -9,19 +9,6 @@ # Changed by many, many contributors over the years. # -# Sanitizer runtimes are unavailable and cannot be linked for early boot code. -KASAN_SANITIZE := n -KCSAN_SANITIZE := n -KMSAN_SANITIZE := n -OBJECT_FILES_NON_STANDARD := y - -# Kernel does not boot with kcov instrumentation here. -# One of the problems observed was insertion of __sanitizer_cov_trace_pc() -# callback into middle of per-cpu data enabling code. Thus the callback observed -# inconsistent state and crashed. We are interested mostly in syscall coverage, -# so boot code is not interesting anyway. -KCOV_INSTRUMENT := n - # If you want to preset the SVGA mode, uncomment the next line and # set SVGA_MODE to whatever number you want. # Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode. @@ -48,42 +35,35 @@ setup-y += video-vesa.o setup-y += video-bios.o targets += $(setup-y) -hostprogs := tools/build hostprogs += mkcpustr HOST_EXTRACFLAGS += -I$(srctree)/tools/include \ -include include/generated/autoconf.h \ -D__EXPORTED_HEADERS__ -ifdef CONFIG_X86_FEATURE_NAMES $(obj)/cpu.o: $(obj)/cpustr.h quiet_cmd_cpustr = CPUSTR $@ cmd_cpustr = $(obj)/mkcpustr > $@ $(obj)/cpustr.h: $(obj)/mkcpustr FORCE $(call if_changed,cpustr) -endif targets += cpustr.h # --------------------------------------------------------------------------- KBUILD_CFLAGS := $(REALMODE_CFLAGS) -D_SETUP KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ -KBUILD_CFLAGS += $(call cc-option,-fmacro-prefix-map=$(srctree)/=) KBUILD_CFLAGS += -fno-asynchronous-unwind-tables -GCOV_PROFILE := n -UBSAN_SANITIZE := n +KBUILD_CFLAGS += $(CONFIG_CC_IMPLICIT_FALLTHROUGH) $(obj)/bzImage: asflags-y := $(SVGA_MODE) quiet_cmd_image = BUILD $@ -silent_redirect_image = >/dev/null -cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin \ - $(obj)/zoffset.h $@ $($(quiet)redirect_image) + cmd_image = (dd if=$< bs=4k conv=sync status=none; cat $(filter-out $<,$(real-prereqs))) >$@ -$(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/tools/build FORCE +$(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin FORCE $(call if_changed,image) - @$(kecho) 'Kernel: $@ is ready' ' (#'`cat .version`')' + @$(kecho) 'Kernel: $@ is ready' ' (#'$(or $(KBUILD_BUILD_VERSION),`cat .version`)')' OBJCOPYFLAGS_vmlinux.bin := -O binary -R .note -R .comment -S $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE @@ -91,7 +71,7 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE SETUP_OBJS = $(addprefix $(obj)/,$(setup-y)) -sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [a-zA-Z] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|efi32_pe_entry\|input_data\|kernel_info\|_end\|_ehead\|_text\|z_.*\)$$/\#define ZO_\2 0x\1/p' +sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [a-zA-Z] \(startup_32\|efi.._stub_entry\|efi\(32\)\?_pe_entry\|input_data\|kernel_info\|_end\|_ehead\|_text\|_e\?data\|_e\?sbat\|z_.*\)$$/\#define ZO_\2 0x\1/p' quiet_cmd_zoffset = ZOFFSET $@ cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@ @@ -131,7 +111,7 @@ targets += mtools.conf # genimage.sh requires bash, but it also has a bunch of other # external dependencies. 
quiet_cmd_genimage = GENIMAGE $3 -cmd_genimage = $(BASH) $(srctree)/$(src)/genimage.sh $2 $3 $(obj)/bzImage \ + cmd_genimage = $(BASH) $(src)/genimage.sh $2 $3 $(obj)/bzImage \ $(obj)/mtools.conf '$(FDARGS)' $(FDINITRD) PHONY += bzdisk fdimage fdimage144 fdimage288 hdimage isoimage diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c index a2b6b428922a..bda042933a05 100644 --- a/arch/x86/boot/a20.c +++ b/arch/x86/boot/a20.c @@ -135,29 +135,29 @@ int enable_a20(void) (legacy free, etc.) */ if (a20_test_short()) return 0; - + /* Next, try the BIOS (INT 0x15, AX=0x2401) */ enable_a20_bios(); if (a20_test_short()) return 0; - + /* Try enabling A20 through the keyboard controller */ kbc_err = empty_8042(); if (a20_test_short()) return 0; /* BIOS worked, but with delayed reaction */ - + if (!kbc_err) { enable_a20_kbc(); if (a20_test_long()) return 0; } - + /* Finally, try enabling the "fast A20 gate" */ enable_a20_fast(); if (a20_test_long()) return 0; } - + return -1; } diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S index 5521ea12f44e..cf4a6155714e 100644 --- a/arch/x86/boot/bioscall.S +++ b/arch/x86/boot/bioscall.S @@ -32,7 +32,7 @@ intcall: movw %dx, %si movw %sp, %di movw $11, %cx - rep; movsd + rep movsl /* Pop full state from the stack */ popal @@ -67,7 +67,7 @@ intcall: jz 4f movw %sp, %si movw $11, %cx - rep; movsd + rep movsl 4: addw $44, %sp /* Restore state and return */ diff --git a/arch/x86/boot/bitops.h b/arch/x86/boot/bitops.h index 8518ae214c9b..79e15971529d 100644 --- a/arch/x86/boot/bitops.h +++ b/arch/x86/boot/bitops.h @@ -27,7 +27,7 @@ static inline bool variable_test_bit(int nr, const void *addr) bool v; const u32 *p = addr; - asm("btl %2,%1" CC_SET(c) : CC_OUT(c) (v) : "m" (*p), "Ir" (nr)); + asm("btl %2,%1" : "=@ccc" (v) : "m" (*p), "Ir" (nr)); return v; } diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 148ba5c5106e..8e3eab34dff4 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -16,7 +16,7 @@ #define STACK_SIZE 1024 /* Minimum number of bytes for stack */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/stdarg.h> #include <linux/types.h> @@ -34,7 +34,7 @@ extern struct setup_header hdr; extern struct boot_params boot_params; -#define cpu_relax() asm volatile("rep; nop") +#define cpu_relax() asm volatile("pause") static inline void io_delay(void) { @@ -155,15 +155,15 @@ static inline void wrgs32(u32 v, addr_t addr) static inline bool memcmp_fs(const void *s1, addr_t s2, size_t len) { bool diff; - asm volatile("fs; repe; cmpsb" CC_SET(nz) - : CC_OUT(nz) (diff), "+D" (s1), "+S" (s2), "+c" (len)); + asm volatile("fs repe cmpsb" + : "=@ccnz" (diff), "+D" (s1), "+S" (s2), "+c" (len)); return diff; } static inline bool memcmp_gs(const void *s1, addr_t s2, size_t len) { bool diff; - asm volatile("gs; repe; cmpsb" CC_SET(nz) - : CC_OUT(nz) (diff), "+D" (s1), "+S" (s2), "+c" (len)); + asm volatile("gs repe cmpsb" + : "=@ccnz" (diff), "+D" (s1), "+S" (s2), "+c" (len)); return diff; } @@ -193,8 +193,6 @@ static inline bool heap_free(size_t n) void copy_to_fs(addr_t dst, void *src, size_t len); void *copy_from_fs(void *dst, addr_t src, size_t len); -void copy_to_gs(addr_t dst, void *src, size_t len); -void *copy_from_gs(void *dst, addr_t src, size_t len); /* a20.c */ int enable_a20(void); @@ -305,7 +303,6 @@ void initregs(struct biosregs *regs); int strcmp(const char *str1, const char *str2); int strncmp(const char *cs, const char *ct, size_t count); size_t strnlen(const char *s, size_t maxlen); -unsigned int atou(const char 
*s); unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base); size_t strlen(const char *s); char *strchr(const char *s, int c); @@ -328,6 +325,6 @@ void probe_cards(int unsafe); /* video-vesa.c */ void vesa_store_edid(void); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* BOOT_BOOT_H */ diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 3a261abb6d15..68f9d7a1683b 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -17,15 +17,6 @@ # (see scripts/Makefile.lib size_append) # compressed vmlinux.bin.all + u32 size of vmlinux.bin.all -# Sanitizer runtimes are unavailable and cannot be linked for early boot code. -KASAN_SANITIZE := n -KCSAN_SANITIZE := n -KMSAN_SANITIZE := n -OBJECT_FILES_NON_STANDARD := y - -# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. -KCOV_INSTRUMENT := n - targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 vmlinux.bin.zst @@ -34,6 +25,7 @@ targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ # avoid errors with '-march=i386', and future flags may depend on the target to # be valid. KBUILD_CFLAGS := -m$(BITS) -O2 $(CLANG_FLAGS) +KBUILD_CFLAGS += -std=gnu11 -fms-extensions KBUILD_CFLAGS += -fno-strict-aliasing -fPIE KBUILD_CFLAGS += -Wundef KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING @@ -44,23 +36,23 @@ KBUILD_CFLAGS += -mno-mmx -mno-sse KBUILD_CFLAGS += -ffreestanding -fshort-wchar KBUILD_CFLAGS += -fno-stack-protector KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) -KBUILD_CFLAGS += $(call cc-disable-warning, gnu) +ifdef CONFIG_CC_IS_CLANG +KBUILD_CFLAGS += -Wno-gnu +KBUILD_CFLAGS += -Wno-microsoft-anon-tag +endif KBUILD_CFLAGS += -Wno-pointer-sign -KBUILD_CFLAGS += $(call cc-option,-fmacro-prefix-map=$(srctree)/=) KBUILD_CFLAGS += -fno-asynchronous-unwind-tables KBUILD_CFLAGS += -D__DISABLE_EXPORTS # Disable relocation relaxation in case the link is not PIE. -KBUILD_CFLAGS += $(call as-option,-Wa$(comma)-mrelax-relocations=no) +KBUILD_CFLAGS += $(call cc-option,-Wa$(comma)-mrelax-relocations=no) KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h -# sev.c indirectly inludes inat-table.h which is generated during +# sev-decode-insn.c indirectly includes inat-table.c which is generated during # compilation and stored in $(objtree). Add the directory to the includes so # that the compiler finds it even with out-of-tree builds (make O=/some/path). -CFLAGS_sev.o += -I$(objtree)/arch/x86/lib/ +CFLAGS_sev-handle-vc.o += -I$(objtree)/arch/x86/lib/ KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ -GCOV_PROFILE := n -UBSAN_SANITIZE :=n KBUILD_LDFLAGS := -m elf_$(UTS_MACHINE) KBUILD_LDFLAGS += $(call ld-option,--no-ld-generated-unwind-info) @@ -68,18 +60,23 @@ KBUILD_LDFLAGS += $(call ld-option,--no-ld-generated-unwind-info) # address by the bootloader. 
LDFLAGS_vmlinux := -pie $(call ld-option, --no-dynamic-linker) ifdef CONFIG_LD_ORPHAN_WARN -LDFLAGS_vmlinux += --orphan-handling=warn +LDFLAGS_vmlinux += --orphan-handling=$(CONFIG_LD_ORPHAN_WARN_LEVEL) endif LDFLAGS_vmlinux += -z noexecstack ifeq ($(CONFIG_LD_IS_BFD),y) LDFLAGS_vmlinux += $(call ld-option,--no-warn-rwx-segments) endif +ifeq ($(CONFIG_EFI_STUB),y) +# ensure that the static EFI stub library will be pulled in, even if it is +# never referenced explicitly from the startup code +LDFLAGS_vmlinux += -u efi_pe_entry +endif LDFLAGS_vmlinux += -T hostprogs := mkpiggy HOST_EXTRACFLAGS += -I$(srctree)/tools/include -sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p' +sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABbCDGRSTtVW] \(_text\|__start_rodata\|_sinittext\|__inittext_end\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p' quiet_cmd_voffset = VOFFSET $@ cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@ @@ -100,19 +97,25 @@ vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o ifdef CONFIG_X86_64 vmlinux-objs-y += $(obj)/ident_map_64.o vmlinux-objs-y += $(obj)/idt_64.o $(obj)/idt_handlers_64.o - vmlinux-objs-y += $(obj)/mem_encrypt.o + vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/mem_encrypt.o vmlinux-objs-y += $(obj)/pgtable_64.o - vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev.o + vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev.o $(obj)/sev-handle-vc.o endif vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o -vmlinux-objs-$(CONFIG_INTEL_TDX_GUEST) += $(obj)/tdx.o $(obj)/tdcall.o +vmlinux-objs-$(CONFIG_INTEL_TDX_GUEST) += $(obj)/tdx.o $(obj)/tdcall.o $(obj)/tdx-shared.o +vmlinux-objs-$(CONFIG_UNACCEPTED_MEMORY) += $(obj)/mem.o -vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o vmlinux-objs-$(CONFIG_EFI) += $(obj)/efi.o -efi-obj-$(CONFIG_EFI_STUB) = $(objtree)/drivers/firmware/efi/libstub/lib.a +vmlinux-libs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a +vmlinux-libs-$(CONFIG_X86_64) += $(objtree)/arch/x86/boot/startup/lib.a +vmlinux-objs-$(CONFIG_EFI_SBAT) += $(obj)/sbat.o -$(obj)/vmlinux: $(vmlinux-objs-y) $(efi-obj-y) FORCE +ifdef CONFIG_EFI_SBAT +$(obj)/sbat.o: $(CONFIG_EFI_SBAT_FILE) +endif + +$(obj)/vmlinux: $(vmlinux-objs-y) $(vmlinux-libs-y) FORCE $(call if_changed,ld) OBJCOPYFLAGS_vmlinux.bin := -R .comment -S @@ -124,7 +127,8 @@ targets += $(patsubst $(obj)/%,%,$(vmlinux-objs-y)) vmlinux.bin.all vmlinux.relo CMD_RELOCS = arch/x86/tools/relocs quiet_cmd_relocs = RELOCS $@ cmd_relocs = $(CMD_RELOCS) $< > $@;$(CMD_RELOCS) --abs-relocs $< -$(obj)/vmlinux.relocs: vmlinux FORCE + +$(obj)/vmlinux.relocs: vmlinux.unstripped FORCE $(call if_changed,relocs) vmlinux.bin.all-y := $(obj)/vmlinux.bin diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c index 9caf89063e77..f196b1d1ddf8 100644 --- a/arch/x86/boot/compressed/acpi.c +++ b/arch/x86/boot/compressed/acpi.c @@ -5,6 +5,8 @@ #include "../string.h" #include "efi.h" +#include <asm/bootparam.h> + #include <linux/numa.h> /* @@ -30,13 +32,13 @@ __efi_get_rsdp_addr(unsigned long cfg_tbl_pa, unsigned int cfg_tbl_len) * Search EFI system tables for RSDP. Preferred is ACPI_20_TABLE_GUID to * ACPI_TABLE_GUID because it has more features. 
*/ - rsdp_addr = efi_find_vendor_table(boot_params, cfg_tbl_pa, cfg_tbl_len, + rsdp_addr = efi_find_vendor_table(boot_params_ptr, cfg_tbl_pa, cfg_tbl_len, ACPI_20_TABLE_GUID); if (rsdp_addr) return (acpi_physical_address)rsdp_addr; /* No ACPI_20_TABLE_GUID found, fallback to ACPI_TABLE_GUID. */ - rsdp_addr = efi_find_vendor_table(boot_params, cfg_tbl_pa, cfg_tbl_len, + rsdp_addr = efi_find_vendor_table(boot_params_ptr, cfg_tbl_pa, cfg_tbl_len, ACPI_TABLE_GUID); if (rsdp_addr) return (acpi_physical_address)rsdp_addr; @@ -56,15 +58,15 @@ static acpi_physical_address efi_get_rsdp_addr(void) enum efi_type et; int ret; - et = efi_get_type(boot_params); + et = efi_get_type(boot_params_ptr); if (et == EFI_TYPE_NONE) return 0; - systab_pa = efi_get_system_table(boot_params); + systab_pa = efi_get_system_table(boot_params_ptr); if (!systab_pa) error("EFI support advertised, but unable to locate system table."); - ret = efi_get_conf_table(boot_params, &cfg_tbl_pa, &cfg_tbl_len); + ret = efi_get_conf_table(boot_params_ptr, &cfg_tbl_pa, &cfg_tbl_len); if (ret || !cfg_tbl_pa) error("EFI config table not found."); @@ -156,7 +158,7 @@ acpi_physical_address get_rsdp_addr(void) { acpi_physical_address pa; - pa = boot_params->acpi_rsdp_addr; + pa = boot_params_ptr->acpi_rsdp_addr; if (!pa) pa = efi_get_rsdp_addr(); @@ -178,7 +180,7 @@ static unsigned long get_cmdline_acpi_rsdp(void) { unsigned long addr = 0; -#ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_CORE char val[MAX_ADDR_LEN] = { }; int ret; @@ -210,7 +212,7 @@ static unsigned long get_acpi_srat_table(void) rsdp = (struct acpi_table_rsdp *)get_cmdline_acpi_rsdp(); if (!rsdp) rsdp = (struct acpi_table_rsdp *)(long) - boot_params->acpi_rsdp_addr; + boot_params_ptr->acpi_rsdp_addr; if (!rsdp) return 0; diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c index f1add5d85da9..e162d7f59cc5 100644 --- a/arch/x86/boot/compressed/cmdline.c +++ b/arch/x86/boot/compressed/cmdline.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include "misc.h" +#include <asm/bootparam.h> + static unsigned long fs; static inline void set_fs(unsigned long seg) { @@ -14,9 +16,9 @@ static inline char rdfs8(addr_t addr) #include "../cmdline.c" unsigned long get_cmd_line_ptr(void) { - unsigned long cmd_line_ptr = boot_params->hdr.cmd_line_ptr; + unsigned long cmd_line_ptr = boot_params_ptr->hdr.cmd_line_ptr; - cmd_line_ptr |= (u64)boot_params->ext_cmd_line_ptr << 32; + cmd_line_ptr |= (u64)boot_params_ptr->ext_cmd_line_ptr << 32; return cmd_line_ptr; } diff --git a/arch/x86/boot/compressed/efi.c b/arch/x86/boot/compressed/efi.c index 6edd034b0b30..f2e50f9758e6 100644 --- a/arch/x86/boot/compressed/efi.c +++ b/arch/x86/boot/compressed/efi.c @@ -7,6 +7,8 @@ #include "misc.h" +#include <asm/bootparam.h> + /** * efi_get_type - Given a pointer to boot_params, determine the type of EFI environment. 
* diff --git a/arch/x86/boot/compressed/efi.h b/arch/x86/boot/compressed/efi.h index 7db2f41b54cd..b22300970f97 100644 --- a/arch/x86/boot/compressed/efi.h +++ b/arch/x86/boot/compressed/efi.h @@ -16,6 +16,7 @@ typedef guid_t efi_guid_t __aligned(__alignof__(u32)); #define ACPI_TABLE_GUID EFI_GUID(0xeb9d2d30, 0x2d88, 0x11d3, 0x9a, 0x16, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d) #define ACPI_20_TABLE_GUID EFI_GUID(0x8868e871, 0xe4f1, 0x11d3, 0xbc, 0x22, 0x00, 0x80, 0xc7, 0x3c, 0x88, 0x81) #define EFI_CC_BLOB_GUID EFI_GUID(0x067b1f5f, 0xcf26, 0x44c5, 0x85, 0x54, 0x93, 0xd7, 0x77, 0x91, 0x2d, 0x42) +#define LINUX_EFI_UNACCEPTED_MEM_TABLE_GUID EFI_GUID(0xd5d1de3c, 0x105c, 0x44f9, 0x9e, 0xa9, 0xbc, 0xef, 0x98, 0x12, 0x00, 0x31) #define EFI32_LOADER_SIGNATURE "EL32" #define EFI64_LOADER_SIGNATURE "EL64" @@ -32,6 +33,7 @@ typedef struct { } efi_table_hdr_t; #define EFI_CONVENTIONAL_MEMORY 7 +#define EFI_UNACCEPTED_MEMORY 15 #define EFI_MEMORY_MORE_RELIABLE \ ((u64)0x0000000000010000ULL) /* higher reliability */ @@ -95,13 +97,12 @@ typedef struct { u32 tables; } efi_system_table_32_t; -/* kexec external ABI */ -struct efi_setup_data { - u64 fw_vendor; - u64 __unused; - u64 tables; - u64 smbios; - u64 reserved[8]; +struct efi_unaccepted_memory { + u32 version; + u32 unit_size; + u64 phys_base; + u64 size; + unsigned long bitmap[]; }; static inline int efi_guidcmp (efi_guid_t left, efi_guid_t right) diff --git a/arch/x86/boot/compressed/efi_thunk_64.S b/arch/x86/boot/compressed/efi_thunk_64.S deleted file mode 100644 index 67e7edcdfea8..000000000000 --- a/arch/x86/boot/compressed/efi_thunk_64.S +++ /dev/null @@ -1,195 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2014, 2015 Intel Corporation; author Matt Fleming - * - * Early support for invoking 32-bit EFI services from a 64-bit kernel. - * - * Because this thunking occurs before ExitBootServices() we have to - * restore the firmware's 32-bit GDT and IDT before we make EFI service - * calls. - * - * On the plus side, we don't have to worry about mangling 64-bit - * addresses into 32-bits because we're executing with an identity - * mapped pagetable and haven't transitioned to 64-bit virtual addresses - * yet. - */ - -#include <linux/linkage.h> -#include <asm/msr.h> -#include <asm/page_types.h> -#include <asm/processor-flags.h> -#include <asm/segment.h> - - .code64 - .text -SYM_FUNC_START(__efi64_thunk) - push %rbp - push %rbx - - movl %ds, %eax - push %rax - movl %es, %eax - push %rax - movl %ss, %eax - push %rax - - /* Copy args passed on stack */ - movq 0x30(%rsp), %rbp - movq 0x38(%rsp), %rbx - movq 0x40(%rsp), %rax - - /* - * Convert x86-64 ABI params to i386 ABI - */ - subq $64, %rsp - movl %esi, 0x0(%rsp) - movl %edx, 0x4(%rsp) - movl %ecx, 0x8(%rsp) - movl %r8d, 0xc(%rsp) - movl %r9d, 0x10(%rsp) - movl %ebp, 0x14(%rsp) - movl %ebx, 0x18(%rsp) - movl %eax, 0x1c(%rsp) - - leaq 0x20(%rsp), %rbx - sgdt (%rbx) - - addq $16, %rbx - sidt (%rbx) - - leaq 1f(%rip), %rbp - - /* - * Switch to IDT and GDT with 32-bit segments. This is the firmware GDT - * and IDT that was installed when the kernel started executing. The - * pointers were saved at the EFI stub entry point in head_64.S. - * - * Pass the saved DS selector to the 32-bit code, and use far return to - * restore the saved CS selector. 
- */ - leaq efi32_boot_idt(%rip), %rax - lidt (%rax) - leaq efi32_boot_gdt(%rip), %rax - lgdt (%rax) - - movzwl efi32_boot_ds(%rip), %edx - movzwq efi32_boot_cs(%rip), %rax - pushq %rax - leaq efi_enter32(%rip), %rax - pushq %rax - lretq - -1: addq $64, %rsp - movq %rdi, %rax - - pop %rbx - movl %ebx, %ss - pop %rbx - movl %ebx, %es - pop %rbx - movl %ebx, %ds - /* Clear out 32-bit selector from FS and GS */ - xorl %ebx, %ebx - movl %ebx, %fs - movl %ebx, %gs - - /* - * Convert 32-bit status code into 64-bit. - */ - roll $1, %eax - rorq $1, %rax - - pop %rbx - pop %rbp - RET -SYM_FUNC_END(__efi64_thunk) - - .code32 -/* - * EFI service pointer must be in %edi. - * - * The stack should represent the 32-bit calling convention. - */ -SYM_FUNC_START_LOCAL(efi_enter32) - /* Load firmware selector into data and stack segment registers */ - movl %edx, %ds - movl %edx, %es - movl %edx, %fs - movl %edx, %gs - movl %edx, %ss - - /* Reload pgtables */ - movl %cr3, %eax - movl %eax, %cr3 - - /* Disable paging */ - movl %cr0, %eax - btrl $X86_CR0_PG_BIT, %eax - movl %eax, %cr0 - - /* Disable long mode via EFER */ - movl $MSR_EFER, %ecx - rdmsr - btrl $_EFER_LME, %eax - wrmsr - - call *%edi - - /* We must preserve return value */ - movl %eax, %edi - - /* - * Some firmware will return with interrupts enabled. Be sure to - * disable them before we switch GDTs and IDTs. - */ - cli - - lidtl (%ebx) - subl $16, %ebx - - lgdtl (%ebx) - - movl %cr4, %eax - btsl $(X86_CR4_PAE_BIT), %eax - movl %eax, %cr4 - - movl %cr3, %eax - movl %eax, %cr3 - - movl $MSR_EFER, %ecx - rdmsr - btsl $_EFER_LME, %eax - wrmsr - - xorl %eax, %eax - lldt %ax - - pushl $__KERNEL_CS - pushl %ebp - - /* Enable paging */ - movl %cr0, %eax - btsl $X86_CR0_PG_BIT, %eax - movl %eax, %cr0 - lret -SYM_FUNC_END(efi_enter32) - - .data - .balign 8 -SYM_DATA_START(efi32_boot_gdt) - .word 0 - .quad 0 -SYM_DATA_END(efi32_boot_gdt) - -SYM_DATA_START(efi32_boot_idt) - .word 0 - .quad 0 -SYM_DATA_END(efi32_boot_idt) - -SYM_DATA_START(efi32_boot_cs) - .word 0 -SYM_DATA_END(efi32_boot_cs) - -SYM_DATA_START(efi32_boot_ds) - .word 0 -SYM_DATA_END(efi32_boot_ds) diff --git a/arch/x86/boot/compressed/error.c b/arch/x86/boot/compressed/error.c index c881878e56d3..19a8251de506 100644 --- a/arch/x86/boot/compressed/error.c +++ b/arch/x86/boot/compressed/error.c @@ -7,7 +7,7 @@ #include "misc.h" #include "error.h" -void warn(char *m) +void warn(const char *m) { error_putstr("\n\n"); error_putstr(m); @@ -22,3 +22,22 @@ void error(char *m) while (1) asm("hlt"); } + +/* EFI libstub provides vsnprintf() */ +#ifdef CONFIG_EFI_STUB +void panic(const char *fmt, ...) +{ + static char buf[1024]; + va_list args; + int len; + + va_start(args, fmt); + len = vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + + if (len && buf[len - 1] == '\n') + buf[len - 1] = '\0'; + + error(buf); +} +#endif diff --git a/arch/x86/boot/compressed/error.h b/arch/x86/boot/compressed/error.h index 1de5821184f1..31f9e080d61a 100644 --- a/arch/x86/boot/compressed/error.h +++ b/arch/x86/boot/compressed/error.h @@ -4,7 +4,8 @@ #include <linux/compiler.h> -void warn(char *m); +void warn(const char *m); void error(char *m) __noreturn; +void panic(const char *fmt, ...) 
__noreturn __cold; #endif /* BOOT_COMPRESSED_ERROR_H */ diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 3b354eb9516d..1cfe9802a42f 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -84,19 +84,6 @@ SYM_FUNC_START(startup_32) #ifdef CONFIG_RELOCATABLE leal startup_32@GOTOFF(%edx), %ebx - -#ifdef CONFIG_EFI_STUB -/* - * If we were loaded via the EFI LoadImage service, startup_32() will be at an - * offset to the start of the space allocated for the image. efi_pe_entry() will - * set up image_offset to tell us where the image actually starts, so that we - * can use the full available buffer. - * image_offset = startup_32 - image_base - * Otherwise image_offset will be zero and has no effect on the calculations. - */ - subl image_offset@GOTOFF(%edx), %ebx -#endif - movl BP_kernel_alignment(%esi), %eax decl %eax addl %eax, %ebx @@ -150,17 +137,6 @@ SYM_FUNC_START(startup_32) jmp *%eax SYM_FUNC_END(startup_32) -#ifdef CONFIG_EFI_STUB -SYM_FUNC_START(efi32_stub_entry) - add $0x4, %esp - movl 8(%esp), %esi /* save boot_params pointer */ - call efi_main - /* efi_main returns the possibly relocated address of startup_32 */ - jmp *%eax -SYM_FUNC_END(efi32_stub_entry) -SYM_FUNC_ALIAS(efi_stub_entry, efi32_stub_entry) -#endif - .text SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) @@ -179,15 +155,9 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) */ /* push arguments for extract_kernel: */ - pushl output_len@GOTOFF(%ebx) /* decompressed length, end of relocs */ pushl %ebp /* output address */ - pushl input_len@GOTOFF(%ebx) /* input_len */ - leal input_data@GOTOFF(%ebx), %eax - pushl %eax /* input_data */ - leal boot_heap@GOTOFF(%ebx), %eax - pushl %eax /* heap area */ pushl %esi /* real mode pointer */ - call extract_kernel /* returns kernel location in %eax */ + call extract_kernel /* returns kernel entry point in %eax */ addl $24, %esp /* @@ -208,17 +178,11 @@ SYM_DATA_START_LOCAL(gdt) .quad 0x00cf92000000ffff /* __KERNEL_DS */ SYM_DATA_END_LABEL(gdt, SYM_L_LOCAL, gdt_end) -#ifdef CONFIG_EFI_STUB -SYM_DATA(image_offset, .long 0) -#endif - /* * Stack and heap for uncompression */ .bss .balign 4 -boot_heap: - .fill BOOT_HEAP_SIZE, 1, 0 boot_stack: .fill BOOT_STACK_SIZE, 1, 0 boot_stack_end: diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index d33f060900d2..d9dab940ff62 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -35,7 +35,14 @@ #include <asm/bootparam.h> #include <asm/desc_defs.h> #include <asm/trapnr.h> -#include "pgtable.h" + +/* + * Fix alignment at 16 bytes. Following CONFIG_FUNCTION_ALIGNMENT will result + * in assembly errors due to trying to move .org backward due to the excessive + * alignment. + */ +#undef __ALIGN +#define __ALIGN .balign 16, 0x90 /* * Locally defined symbols should be marked hidden: @@ -118,7 +125,9 @@ SYM_FUNC_START(startup_32) 1: /* Setup Exception handling for SEV-ES */ +#ifdef CONFIG_AMD_MEM_ENCRYPT call startup32_load_idt +#endif /* Make sure cpu supports long mode. */ call verify_cpu @@ -136,19 +145,6 @@ SYM_FUNC_START(startup_32) #ifdef CONFIG_RELOCATABLE movl %ebp, %ebx - -#ifdef CONFIG_EFI_STUB -/* - * If we were loaded via the EFI LoadImage service, startup_32 will be at an - * offset to the start of the space allocated for the image. efi_pe_entry will - * set up image_offset to tell us where the image actually starts, so that we - * can use the full available buffer. 
- * image_offset = startup_32 - image_base - * Otherwise image_offset will be zero and has no effect on the calculations. - */ - subl rva(image_offset)(%ebp), %ebx -#endif - movl BP_kernel_alignment(%esi), %eax decl %eax addl %eax, %ebx @@ -178,12 +174,13 @@ SYM_FUNC_START(startup_32) */ /* * If SEV is active then set the encryption mask in the page tables. - * This will insure that when the kernel is copied and decompressed + * This will ensure that when the kernel is copied and decompressed * it will be done so encrypted. */ - call get_sev_encryption_bit xorl %edx, %edx #ifdef CONFIG_AMD_MEM_ENCRYPT + call get_sev_encryption_bit + xorl %edx, %edx testl %eax, %eax jz 1f subl $32, %eax /* Encryption bit is always above bit 31 */ @@ -249,6 +246,11 @@ SYM_FUNC_START(startup_32) movl $__BOOT_TSS, %eax ltr %ax +#ifdef CONFIG_AMD_MEM_ENCRYPT + /* Check if the C-bit position is correct when SEV is active */ + call startup32_check_sev_cbit +#endif + /* * Setup for the jump to 64bit mode * @@ -260,31 +262,6 @@ SYM_FUNC_START(startup_32) * used to perform that far jump. */ leal rva(startup_64)(%ebp), %eax -#ifdef CONFIG_EFI_MIXED - movl rva(efi32_boot_args)(%ebp), %edi - testl %edi, %edi - jz 1f - leal rva(efi64_stub_entry)(%ebp), %eax - movl rva(efi32_boot_args+4)(%ebp), %esi - movl rva(efi32_boot_args+8)(%ebp), %edx // saved bootparams pointer - testl %edx, %edx - jnz 1f - /* - * efi_pe_entry uses MS calling convention, which requires 32 bytes of - * shadow space on the stack even if all arguments are passed in - * registers. We also need an additional 8 bytes for the space that - * would be occupied by the return address, and this also results in - * the correct stack alignment for entry. - */ - subl $40, %esp - leal rva(efi_pe_entry)(%ebp), %eax - movl %edi, %ecx // MS calling convention - movl %esi, %edx -1: -#endif - /* Check if the C-bit position is correct when SEV is active */ - call startup32_check_sev_cbit - pushl $__KERNEL_CS pushl %eax @@ -296,41 +273,6 @@ SYM_FUNC_START(startup_32) lret SYM_FUNC_END(startup_32) -#ifdef CONFIG_EFI_MIXED - .org 0x190 -SYM_FUNC_START(efi32_stub_entry) - add $0x4, %esp /* Discard return address */ - popl %ecx - popl %edx - popl %esi - - call 1f -1: pop %ebp - subl $ rva(1b), %ebp - - movl %esi, rva(efi32_boot_args+8)(%ebp) -SYM_INNER_LABEL(efi32_pe_stub_entry, SYM_L_LOCAL) - movl %ecx, rva(efi32_boot_args)(%ebp) - movl %edx, rva(efi32_boot_args+4)(%ebp) - movb $0, rva(efi_is64)(%ebp) - - /* Save firmware GDTR and code/data selectors */ - sgdtl rva(efi32_boot_gdt)(%ebp) - movw %cs, rva(efi32_boot_cs)(%ebp) - movw %ds, rva(efi32_boot_ds)(%ebp) - - /* Store firmware IDT descriptor */ - sidtl rva(efi32_boot_idt)(%ebp) - - /* Disable paging */ - movl %cr0, %eax - btrl $X86_CR0_PG_BIT, %eax - movl %eax, %cr0 - - jmp startup_32 -SYM_FUNC_END(efi32_stub_entry) -#endif - .code64 .org 0x200 SYM_CODE_START(startup_64) @@ -372,20 +314,6 @@ SYM_CODE_START(startup_64) /* Start with the delta to where the kernel will run at. */ #ifdef CONFIG_RELOCATABLE leaq startup_32(%rip) /* - $startup_32 */, %rbp - -#ifdef CONFIG_EFI_STUB -/* - * If we were loaded via the EFI LoadImage service, startup_32 will be at an - * offset to the start of the space allocated for the image. efi_pe_entry will - * set up image_offset to tell us where the image actually starts, so that we - * can use the full available buffer. - * image_offset = startup_32 - image_base - * Otherwise image_offset will be zero and has no effect on the calculations. 
- */ - movl image_offset(%rip), %eax - subq %rax, %rbp -#endif - movl BP_kernel_alignment(%rsi), %eax decl %eax addq %rax, %rbp @@ -424,10 +352,6 @@ SYM_CODE_START(startup_64) * For the trampoline, we need the top page table to reside in lower * memory as we don't have a way to load 64-bit values into CR3 in * 32-bit mode. - * - * We go though the trampoline even if we don't have to: if we're - * already in a desired paging mode. This way the trampoline code gets - * tested on every boot. */ /* Make sure we have GDT with 32-bit code segment */ @@ -442,10 +366,14 @@ SYM_CODE_START(startup_64) lretq .Lon_kernel_cs: + /* + * RSI holds a pointer to a boot_params structure provided by the + * loader, and this needs to be preserved across C function calls. So + * move it into a callee saved register. + */ + movq %rsi, %r15 - pushq %rsi call load_stage1_idt - popq %rsi #ifdef CONFIG_AMD_MEM_ENCRYPT /* @@ -456,63 +384,29 @@ SYM_CODE_START(startup_64) * CPUID instructions being issued, so go ahead and do that now via * sev_enable(), which will also handle the rest of the SEV-related * detection/setup to ensure that has been done in advance of any dependent - * code. + * code. Pass the boot_params pointer as the first argument. */ - pushq %rsi - movq %rsi, %rdi /* real mode address */ + movq %r15, %rdi call sev_enable - popq %rsi #endif - /* - * paging_prepare() sets up the trampoline and checks if we need to - * enable 5-level paging. - * - * paging_prepare() returns a two-quadword structure which lands - * into RDX:RAX: - * - Address of the trampoline is returned in RAX. - * - Non zero RDX means trampoline needs to enable 5-level - * paging. - * - * RSI holds real mode data and needs to be preserved across - * this function call. - */ - pushq %rsi - movq %rsi, %rdi /* real mode address */ - call paging_prepare - popq %rsi - - /* Save the trampoline address in RCX */ - movq %rax, %rcx - - /* - * Load the address of trampoline_return() into RDI. - * It will be used by the trampoline to return to the main code. - */ - leaq trampoline_return(%rip), %rdi - - /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */ - pushq $__KERNEL32_CS - leaq TRAMPOLINE_32BIT_CODE_OFFSET(%rax), %rax - pushq %rax - lretq -trampoline_return: - /* Restore the stack, the 32-bit trampoline uses its own stack */ - leaq rva(boot_stack_end)(%rbx), %rsp + /* Preserve only the CR4 bits that must be preserved, and clear the rest */ + movq %cr4, %rax + andl $(X86_CR4_PAE | X86_CR4_MCE | X86_CR4_LA57), %eax + movq %rax, %cr4 /* - * cleanup_trampoline() would restore trampoline memory. + * configure_5level_paging() updates the number of paging levels using + * a trampoline in 32-bit addressable memory if the current number does + * not match the desired number. * - * RDI is address of the page table to use instead of page table - * in trampoline memory (if required). - * - * RSI holds real mode data and needs to be preserved across - * this function call. + * Pass the boot_params pointer as the first argument. The second + * argument is the relocated address of the page table to use instead + * of the page table in trampoline memory (if required). */ - pushq %rsi - leaq rva(top_pgtable)(%rbx), %rdi - call cleanup_trampoline - popq %rsi + movq %r15, %rdi + leaq rva(top_pgtable)(%rbx), %rsi + call configure_5level_paging /* Zero EFLAGS */ pushq $0 @@ -522,7 +416,6 @@ trampoline_return: * Copy the compressed kernel to the end of our buffer * where decompression in place becomes safe. 
*/ - pushq %rsi leaq (_bss-8)(%rip), %rsi leaq rva(_bss-8)(%rbx), %rdi movl $(_bss - startup_32), %ecx @@ -530,7 +423,6 @@ trampoline_return: std rep movsq cld - popq %rsi /* * The GDT may get overwritten either during the copy we just did or @@ -549,19 +441,6 @@ trampoline_return: jmp *%rax SYM_CODE_END(startup_64) -#ifdef CONFIG_EFI_STUB - .org 0x390 -SYM_FUNC_START(efi64_stub_entry) - and $~0xf, %rsp /* realign the stack */ - movq %rdx, %rbx /* save boot_params pointer */ - call efi_main - movq %rbx,%rsi - leaq rva(startup_64)(%rax), %rax - jmp *%rax -SYM_FUNC_END(efi64_stub_entry) -SYM_FUNC_ALIAS(efi_stub_entry, efi64_stub_entry) -#endif - .text SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) @@ -575,137 +454,28 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) shrq $3, %rcx rep stosq - pushq %rsi call load_stage2_idt /* Pass boot_params to initialize_identity_maps() */ - movq (%rsp), %rdi + movq %r15, %rdi call initialize_identity_maps - popq %rsi /* * Do the extraction, and jump to the new kernel.. */ - pushq %rsi /* Save the real mode argument */ - movq %rsi, %rdi /* real mode address */ - leaq boot_heap(%rip), %rsi /* malloc area for uncompression */ - leaq input_data(%rip), %rdx /* input_data */ - movl input_len(%rip), %ecx /* input_len */ - movq %rbp, %r8 /* output target address */ - movl output_len(%rip), %r9d /* decompressed length, end of relocs */ - call extract_kernel /* returns kernel location in %rax */ - popq %rsi + /* pass struct boot_params pointer and output target address */ + movq %r15, %rdi + movq %rbp, %rsi + call extract_kernel /* returns kernel entry point in %rax */ /* * Jump to the decompressed kernel. */ + movq %r15, %rsi jmp *%rax SYM_FUNC_END(.Lrelocated) .code32 -/* - * This is the 32-bit trampoline that will be copied over to low memory. - * - * RDI contains the return address (might be above 4G). - * ECX contains the base address of the trampoline memory. - * Non zero RDX means trampoline needs to enable 5-level paging. - */ -SYM_CODE_START(trampoline_32bit_src) - /* Set up data and stack segments */ - movl $__KERNEL_DS, %eax - movl %eax, %ds - movl %eax, %ss - - /* Set up new stack */ - leal TRAMPOLINE_32BIT_STACK_END(%ecx), %esp - - /* Disable paging */ - movl %cr0, %eax - btrl $X86_CR0_PG_BIT, %eax - movl %eax, %cr0 - - /* Check what paging mode we want to be in after the trampoline */ - testl %edx, %edx - jz 1f - - /* We want 5-level paging: don't touch CR3 if it already points to 5-level page tables */ - movl %cr4, %eax - testl $X86_CR4_LA57, %eax - jnz 3f - jmp 2f -1: - /* We want 4-level paging: don't touch CR3 if it already points to 4-level page tables */ - movl %cr4, %eax - testl $X86_CR4_LA57, %eax - jz 3f -2: - /* Point CR3 to the trampoline's new top level page table */ - leal TRAMPOLINE_32BIT_PGTABLE_OFFSET(%ecx), %eax - movl %eax, %cr3 -3: - /* Set EFER.LME=1 as a precaution in case hypervsior pulls the rug */ - pushl %ecx - pushl %edx - movl $MSR_EFER, %ecx - rdmsr - btsl $_EFER_LME, %eax - /* Avoid writing EFER if no change was made (for TDX guest) */ - jc 1f - wrmsr -1: popl %edx - popl %ecx - -#ifdef CONFIG_X86_MCE - /* - * Preserve CR4.MCE if the kernel will enable #MC support. - * Clearing MCE may fault in some environments (that also force #MC - * support). Any machine check that occurs before #MC support is fully - * configured will crash the system regardless of the CR4.MCE value set - * here. 
- */ - movl %cr4, %eax - andl $X86_CR4_MCE, %eax -#else - movl $0, %eax -#endif - - /* Enable PAE and LA57 (if required) paging modes */ - orl $X86_CR4_PAE, %eax - testl %edx, %edx - jz 1f - orl $X86_CR4_LA57, %eax -1: - movl %eax, %cr4 - - /* Calculate address of paging_enabled() once we are executing in the trampoline */ - leal .Lpaging_enabled - trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_OFFSET(%ecx), %eax - - /* Prepare the stack for far return to Long Mode */ - pushl $__KERNEL_CS - pushl %eax - - /* Enable paging again. */ - movl %cr0, %eax - btsl $X86_CR0_PG_BIT, %eax - movl %eax, %cr0 - - lret -SYM_CODE_END(trampoline_32bit_src) - - .code64 -SYM_FUNC_START_LOCAL_NOALIGN(.Lpaging_enabled) - /* Return from the trampoline */ - jmp *%rdi -SYM_FUNC_END(.Lpaging_enabled) - - /* - * The trampoline code has a size limit. - * Make sure we fail to compile if the trampoline code grows - * beyond TRAMPOLINE_32BIT_CODE_SIZE bytes. - */ - .org trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE - - .code32 SYM_FUNC_START_LOCAL_NOALIGN(.Lno_longmode) /* This isn't an x86-64 CPU, so hang intentionally, we cannot continue */ 1: @@ -713,6 +483,7 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lno_longmode) jmp 1b SYM_FUNC_END(.Lno_longmode) + .globl verify_cpu #include "../../kernel/verify_cpu.S" .data @@ -744,249 +515,11 @@ SYM_DATA_START(boot_idt) .endr SYM_DATA_END_LABEL(boot_idt, SYM_L_GLOBAL, boot_idt_end) -#ifdef CONFIG_AMD_MEM_ENCRYPT -SYM_DATA_START(boot32_idt_desc) - .word boot32_idt_end - boot32_idt - 1 - .long 0 -SYM_DATA_END(boot32_idt_desc) - .balign 8 -SYM_DATA_START(boot32_idt) - .rept 32 - .quad 0 - .endr -SYM_DATA_END_LABEL(boot32_idt, SYM_L_GLOBAL, boot32_idt_end) -#endif - -#ifdef CONFIG_EFI_STUB -SYM_DATA(image_offset, .long 0) -#endif -#ifdef CONFIG_EFI_MIXED -SYM_DATA_LOCAL(efi32_boot_args, .long 0, 0, 0) -SYM_DATA(efi_is64, .byte 1) - -#define ST32_boottime 60 // offsetof(efi_system_table_32_t, boottime) -#define BS32_handle_protocol 88 // offsetof(efi_boot_services_32_t, handle_protocol) -#define LI32_image_base 32 // offsetof(efi_loaded_image_32_t, image_base) - - __HEAD - .code32 -SYM_FUNC_START(efi32_pe_entry) -/* - * efi_status_t efi32_pe_entry(efi_handle_t image_handle, - * efi_system_table_32_t *sys_table) - */ - - pushl %ebp - movl %esp, %ebp - pushl %eax // dummy push to allocate loaded_image - - pushl %ebx // save callee-save registers - pushl %edi - - call verify_cpu // check for long mode support - testl %eax, %eax - movl $0x80000003, %eax // EFI_UNSUPPORTED - jnz 2f - - call 1f -1: pop %ebx - subl $ rva(1b), %ebx - - /* Get the loaded image protocol pointer from the image handle */ - leal -4(%ebp), %eax - pushl %eax // &loaded_image - leal rva(loaded_image_proto)(%ebx), %eax - pushl %eax // pass the GUID address - pushl 8(%ebp) // pass the image handle - - /* - * Note the alignment of the stack frame. 
- * sys_table - * handle <-- 16-byte aligned on entry by ABI - * return address - * frame pointer - * loaded_image <-- local variable - * saved %ebx <-- 16-byte aligned here - * saved %edi - * &loaded_image - * &loaded_image_proto - * handle <-- 16-byte aligned for call to handle_protocol - */ - - movl 12(%ebp), %eax // sys_table - movl ST32_boottime(%eax), %eax // sys_table->boottime - call *BS32_handle_protocol(%eax) // sys_table->boottime->handle_protocol - addl $12, %esp // restore argument space - testl %eax, %eax - jnz 2f - - movl 8(%ebp), %ecx // image_handle - movl 12(%ebp), %edx // sys_table - movl -4(%ebp), %esi // loaded_image - movl LI32_image_base(%esi), %esi // loaded_image->image_base - movl %ebx, %ebp // startup_32 for efi32_pe_stub_entry - /* - * We need to set the image_offset variable here since startup_32() will - * use it before we get to the 64-bit efi_pe_entry() in C code. - */ - subl %esi, %ebx - movl %ebx, rva(image_offset)(%ebp) // save image_offset - jmp efi32_pe_stub_entry - -2: popl %edi // restore callee-save registers - popl %ebx - leave - RET -SYM_FUNC_END(efi32_pe_entry) - - .section ".rodata" - /* EFI loaded image protocol GUID */ - .balign 4 -SYM_DATA_START_LOCAL(loaded_image_proto) - .long 0x5b1b31a1 - .word 0x9562, 0x11d2 - .byte 0x8e, 0x3f, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b -SYM_DATA_END(loaded_image_proto) -#endif - -#ifdef CONFIG_AMD_MEM_ENCRYPT - __HEAD - .code32 -/* - * Write an IDT entry into boot32_idt - * - * Parameters: - * - * %eax: Handler address - * %edx: Vector number - * - * Physical offset is expected in %ebp - */ -SYM_FUNC_START(startup32_set_idt_entry) - push %ebx - push %ecx - - /* IDT entry address to %ebx */ - leal rva(boot32_idt)(%ebp), %ebx - shl $3, %edx - addl %edx, %ebx - - /* Build IDT entry, lower 4 bytes */ - movl %eax, %edx - andl $0x0000ffff, %edx # Target code segment offset [15:0] - movl $__KERNEL32_CS, %ecx # Target code segment selector - shl $16, %ecx - orl %ecx, %edx - - /* Store lower 4 bytes to IDT */ - movl %edx, (%ebx) - - /* Build IDT entry, upper 4 bytes */ - movl %eax, %edx - andl $0xffff0000, %edx # Target code segment offset [31:16] - orl $0x00008e00, %edx # Present, Type 32-bit Interrupt Gate - - /* Store upper 4 bytes to IDT */ - movl %edx, 4(%ebx) - - pop %ecx - pop %ebx - RET -SYM_FUNC_END(startup32_set_idt_entry) -#endif - -SYM_FUNC_START(startup32_load_idt) -#ifdef CONFIG_AMD_MEM_ENCRYPT - /* #VC handler */ - leal rva(startup32_vc_handler)(%ebp), %eax - movl $X86_TRAP_VC, %edx - call startup32_set_idt_entry - - /* Load IDT */ - leal rva(boot32_idt)(%ebp), %eax - movl %eax, rva(boot32_idt_desc+2)(%ebp) - lidt rva(boot32_idt_desc)(%ebp) -#endif - RET -SYM_FUNC_END(startup32_load_idt) - -/* - * Check for the correct C-bit position when the startup_32 boot-path is used. - * - * The check makes use of the fact that all memory is encrypted when paging is - * disabled. The function creates 64 bits of random data using the RDRAND - * instruction. RDRAND is mandatory for SEV guests, so always available. If the - * hypervisor violates that the kernel will crash right here. - * - * The 64 bits of random data are stored to a memory location and at the same - * time kept in the %eax and %ebx registers. Since encryption is always active - * when paging is off the random data will be stored encrypted in main memory. - * - * Then paging is enabled. When the C-bit position is correct all memory is - * still mapped encrypted and comparing the register values with memory will - * succeed. 
An incorrect C-bit position will map all memory unencrypted, so that - * the compare will use the encrypted random data and fail. - */ -SYM_FUNC_START(startup32_check_sev_cbit) -#ifdef CONFIG_AMD_MEM_ENCRYPT - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - - /* Check for non-zero sev_status */ - movl rva(sev_status)(%ebp), %eax - testl %eax, %eax - jz 4f - - /* - * Get two 32-bit random values - Don't bail out if RDRAND fails - * because it is better to prevent forward progress if no random value - * can be gathered. - */ -1: rdrand %eax - jnc 1b -2: rdrand %ebx - jnc 2b - - /* Store to memory and keep it in the registers */ - movl %eax, rva(sev_check_data)(%ebp) - movl %ebx, rva(sev_check_data+4)(%ebp) - - /* Enable paging to see if encryption is active */ - movl %cr0, %edx /* Backup %cr0 in %edx */ - movl $(X86_CR0_PG | X86_CR0_PE), %ecx /* Enable Paging and Protected mode */ - movl %ecx, %cr0 - - cmpl %eax, rva(sev_check_data)(%ebp) - jne 3f - cmpl %ebx, rva(sev_check_data+4)(%ebp) - jne 3f - - movl %edx, %cr0 /* Restore previous %cr0 */ - - jmp 4f - -3: /* Check failed - hlt the machine */ - hlt - jmp 3b - -4: - popl %edx - popl %ecx - popl %ebx - popl %eax -#endif - RET -SYM_FUNC_END(startup32_check_sev_cbit) - /* * Stack and heap for uncompression */ .bss .balign 4 -SYM_DATA_LOCAL(boot_heap, .fill BOOT_HEAP_SIZE, 1, 0) - SYM_DATA_START_LOCAL(boot_stack) .fill BOOT_STACK_SIZE, 1, 0 .balign 16 diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c index d4a314cc50d6..dfb9c2deb77c 100644 --- a/arch/x86/boot/compressed/ident_map_64.c +++ b/arch/x86/boot/compressed/ident_map_64.c @@ -8,16 +8,8 @@ * Copyright (C) 2016 Kees Cook */ -/* - * Since we're dealing with identity mappings, physical and virtual - * addresses are the same, so override these defines which are ultimately - * used by the headers in misc.h. - */ -#define __pa(x) ((unsigned long)(x)) -#define __va(x) ((void *)((unsigned long)(x))) - -/* No PAGE_TABLE_ISOLATION support needed either: */ -#undef CONFIG_PAGE_TABLE_ISOLATION +/* No MITIGATION_PAGE_TABLE_ISOLATION support needed either: */ +#undef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION #include "error.h" #include "misc.h" @@ -67,6 +59,14 @@ static void *alloc_pgt_page(void *context) return NULL; } + /* Consumed more tables than expected? */ + if (pages->pgt_buf_offset == BOOT_PGT_SIZE_WARN) { + debug_putstr("pgt_buf running low in " __FILE__ "\n"); + debug_putstr("Need to raise BOOT_PGT_SIZE?\n"); + debug_putaddr(pages->pgt_buf_offset); + debug_putaddr(pages->pgt_buf_size); + } + entry = pages->pgt_buf + pages->pgt_buf_offset; pages->pgt_buf_offset += PAGE_SIZE; @@ -159,8 +159,9 @@ void initialize_identity_maps(void *rmode) * or does not touch all the pages covering them. */ kernel_add_identity_map((unsigned long)_head, (unsigned long)_end); - boot_params = rmode; - kernel_add_identity_map((unsigned long)boot_params, (unsigned long)(boot_params + 1)); + boot_params_ptr = rmode; + kernel_add_identity_map((unsigned long)boot_params_ptr, + (unsigned long)(boot_params_ptr + 1)); cmdline = get_cmd_line_ptr(); kernel_add_identity_map(cmdline, cmdline + COMMAND_LINE_SIZE); @@ -168,7 +169,7 @@ void initialize_identity_maps(void *rmode) * Also map the setup_data entries passed via boot_params in case they * need to be accessed by uncompressed kernel via the identity mapping. 
*/ - sd = (struct setup_data *)boot_params->hdr.setup_data; + sd = (struct setup_data *)boot_params_ptr->hdr.setup_data; while (sd) { unsigned long sd_addr = (unsigned long)sd; @@ -180,6 +181,12 @@ void initialize_identity_maps(void *rmode) /* Load the new page-table. */ write_cr3(top_level_pgt); + + /* + * Now that the required page table mappings are established and a + * GHCB can be used, check for SNP guest/HV feature compatibility. + */ + snp_check_features(); } static pte_t *split_large_pmd(struct x86_mapping_info *info, @@ -277,7 +284,7 @@ static int set_clr_page_flags(struct x86_mapping_info *info, pudp = pud_offset(p4dp, address); pmdp = pmd_offset(pudp, address); - if (pmd_large(*pmdp)) + if (pmd_leaf(*pmdp)) ptep = split_large_pmd(info, pmdp, address); else ptep = pte_offset_kernel(pmdp, address); @@ -379,3 +386,8 @@ void do_boot_page_fault(struct pt_regs *regs, unsigned long error_code) */ kernel_add_identity_map(address, end); } + +void do_boot_nmi_trap(struct pt_regs *regs, unsigned long error_code) +{ + spurious_nmi_count++; +} diff --git a/arch/x86/boot/compressed/idt_64.c b/arch/x86/boot/compressed/idt_64.c index 6debb816e83d..d100284bbef4 100644 --- a/arch/x86/boot/compressed/idt_64.c +++ b/arch/x86/boot/compressed/idt_64.c @@ -61,9 +61,17 @@ void load_stage2_idt(void) boot_idt_desc.address = (unsigned long)boot_idt; set_idt_entry(X86_TRAP_PF, boot_page_fault); + set_idt_entry(X86_TRAP_NMI, boot_nmi_trap); #ifdef CONFIG_AMD_MEM_ENCRYPT - set_idt_entry(X86_TRAP_VC, boot_stage2_vc); + /* + * Clear the second stage #VC handler in case guest types + * needing #VC have not been detected. + */ + if (sev_status & BIT(1)) + set_idt_entry(X86_TRAP_VC, boot_stage2_vc); + else + set_idt_entry(X86_TRAP_VC, NULL); #endif load_boot_idt(&boot_idt_desc); diff --git a/arch/x86/boot/compressed/idt_handlers_64.S b/arch/x86/boot/compressed/idt_handlers_64.S index 22890e199f5b..4d03c8562f63 100644 --- a/arch/x86/boot/compressed/idt_handlers_64.S +++ b/arch/x86/boot/compressed/idt_handlers_64.S @@ -70,6 +70,7 @@ SYM_FUNC_END(\name) .code64 EXCEPTION_HANDLER boot_page_fault do_boot_page_fault error_code=1 +EXCEPTION_HANDLER boot_nmi_trap do_boot_nmi_trap error_code=0 #ifdef CONFIG_AMD_MEM_ENCRYPT EXCEPTION_HANDLER boot_stage1_vc do_vc_no_ghcb error_code=1 diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index e476bcbd9b42..3b0948ad449f 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -25,10 +25,6 @@ #include "efi.h" #include <generated/compile.h> -#include <linux/module.h> -#include <linux/uts.h> -#include <linux/utsname.h> -#include <linux/ctype.h> #include <generated/utsversion.h> #include <generated/utsrelease.h> @@ -63,7 +59,7 @@ static unsigned long get_boot_seed(void) unsigned long hash = 0; hash = rotate_xor(hash, build_str, sizeof(build_str)); - hash = rotate_xor(hash, boot_params, sizeof(*boot_params)); + hash = rotate_xor(hash, boot_params_ptr, sizeof(*boot_params_ptr)); return hash; } @@ -119,13 +115,8 @@ char *skip_spaces(const char *str) #include "../../../../lib/ctype.c" #include "../../../../lib/cmdline.c" -enum parse_mode { - PARSE_MEMMAP, - PARSE_EFI, -}; - static int -parse_memmap(char *p, u64 *start, u64 *size, enum parse_mode mode) +parse_memmap(char *p, u64 *start, u64 *size) { char *oldp; @@ -148,29 +139,11 @@ parse_memmap(char *p, u64 *start, u64 *size, enum parse_mode mode) *start = memparse(p + 1, &p); return 0; case '@': - if (mode == PARSE_MEMMAP) { - /* - * memmap=nn@ss specifies usable region, 
should - * be skipped - */ - *size = 0; - } else { - u64 flags; - - /* - * efi_fake_mem=nn@ss:attr the attr specifies - * flags that might imply a soft-reservation. - */ - *start = memparse(p + 1, &p); - if (p && *p == ':') { - p++; - if (kstrtoull(p, 0, &flags) < 0) - *size = 0; - else if (flags & EFI_MEMORY_SP) - return 0; - } - *size = 0; - } + /* + * memmap=nn@ss specifies usable region, should + * be skipped + */ + *size = 0; fallthrough; default: /* @@ -185,7 +158,7 @@ parse_memmap(char *p, u64 *start, u64 *size, enum parse_mode mode) return -EINVAL; } -static void mem_avoid_memmap(enum parse_mode mode, char *str) +static void mem_avoid_memmap(char *str) { static int i; @@ -200,7 +173,7 @@ static void mem_avoid_memmap(enum parse_mode mode, char *str) if (k) *k++ = 0; - rc = parse_memmap(str, &start, &size, mode); + rc = parse_memmap(str, &start, &size); if (rc < 0) break; str = k; @@ -281,7 +254,7 @@ static void handle_mem_options(void) break; if (!strcmp(param, "memmap")) { - mem_avoid_memmap(PARSE_MEMMAP, val); + mem_avoid_memmap(val); } else if (IS_ENABLED(CONFIG_X86_64) && strstr(param, "hugepages")) { parse_gb_huge_pages(param, val); } else if (!strcmp(param, "mem")) { @@ -295,8 +268,6 @@ static void handle_mem_options(void) if (mem_size < mem_limit) mem_limit = mem_size; - } else if (!strcmp(param, "efi_fake_mem")) { - mem_avoid_memmap(PARSE_EFI, val); } } @@ -383,7 +354,7 @@ static void handle_mem_options(void) static void mem_avoid_init(unsigned long input, unsigned long input_size, unsigned long output) { - unsigned long init_size = boot_params->hdr.init_size; + unsigned long init_size = boot_params_ptr->hdr.init_size; u64 initrd_start, initrd_size; unsigned long cmd_line, cmd_line_size; @@ -395,10 +366,10 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, mem_avoid[MEM_AVOID_ZO_RANGE].size = (output + init_size) - input; /* Avoid initrd. */ - initrd_start = (u64)boot_params->ext_ramdisk_image << 32; - initrd_start |= boot_params->hdr.ramdisk_image; - initrd_size = (u64)boot_params->ext_ramdisk_size << 32; - initrd_size |= boot_params->hdr.ramdisk_size; + initrd_start = (u64)boot_params_ptr->ext_ramdisk_image << 32; + initrd_start |= boot_params_ptr->hdr.ramdisk_image; + initrd_size = (u64)boot_params_ptr->ext_ramdisk_size << 32; + initrd_size |= boot_params_ptr->hdr.ramdisk_size; mem_avoid[MEM_AVOID_INITRD].start = initrd_start; mem_avoid[MEM_AVOID_INITRD].size = initrd_size; /* No need to set mapping for initrd, it will be handled in VO. */ @@ -413,8 +384,8 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, } /* Avoid boot parameters. */ - mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params; - mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params); + mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params_ptr; + mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params_ptr); /* We don't need to set a mapping for setup_data. */ @@ -447,7 +418,7 @@ static bool mem_avoid_overlap(struct mem_vector *img, } /* Avoid all entries in the setup_data linked list. */ - ptr = (struct setup_data *)(unsigned long)boot_params->hdr.setup_data; + ptr = (struct setup_data *)(unsigned long)boot_params_ptr->hdr.setup_data; while (ptr) { struct mem_vector avoid; @@ -668,10 +639,37 @@ static bool process_mem_region(struct mem_vector *region, } } #endif - return 0; + return false; } #ifdef CONFIG_EFI + +/* + * Only EFI_CONVENTIONAL_MEMORY and EFI_UNACCEPTED_MEMORY (if supported) are + * guaranteed to be free. 
+ * + * Pick free memory more conservatively than the EFI spec allows: according to + * the spec, EFI_BOOT_SERVICES_{CODE|DATA} are also free memory and thus + * available to place the kernel image into, but in practice there's firmware + * where using that memory leads to crashes. Buggy vendor EFI code registers + * for an event that triggers on SetVirtualAddressMap(). The handler assumes + * that EFI_BOOT_SERVICES_DATA memory has not been touched by loader yet, which + * is probably true for Windows. + * + * Preserve EFI_BOOT_SERVICES_* regions until after SetVirtualAddressMap(). + */ +static inline bool memory_type_is_free(efi_memory_desc_t *md) +{ + if (md->type == EFI_CONVENTIONAL_MEMORY) + return true; + + if (IS_ENABLED(CONFIG_UNACCEPTED_MEMORY) && + md->type == EFI_UNACCEPTED_MEMORY) + return true; + + return false; +} + /* * Returns true if we processed the EFI memmap, which we prefer over the E820 * table if it is available. @@ -679,7 +677,7 @@ static bool process_mem_region(struct mem_vector *region, static bool process_efi_entries(unsigned long minimum, unsigned long image_size) { - struct efi_info *e = &boot_params->efi_info; + struct efi_info *e = &boot_params_ptr->efi_info; bool efi_mirror_found = false; struct mem_vector region; efi_memory_desc_t *md; @@ -716,18 +714,7 @@ process_efi_entries(unsigned long minimum, unsigned long image_size) for (i = 0; i < nr_desc; i++) { md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i); - /* - * Here we are more conservative in picking free memory than - * the EFI spec allows: - * - * According to the spec, EFI_BOOT_SERVICES_{CODE|DATA} are also - * free memory and thus available to place the kernel image into, - * but in practice there's firmware where using that memory leads - * to crashes. - * - * Only EFI_CONVENTIONAL_MEMORY is guaranteed to be free. - */ - if (md->type != EFI_CONVENTIONAL_MEMORY) + if (!memory_type_is_free(md)) continue; if (efi_soft_reserve_enabled() && @@ -761,8 +748,8 @@ static void process_e820_entries(unsigned long minimum, struct boot_e820_entry *entry; /* Verify potential e820 positions, appending to slots list. */ - for (i = 0; i < boot_params->e820_entries; i++) { - entry = &boot_params->e820_table[i]; + for (i = 0; i < boot_params_ptr->e820_entries; i++) { + entry = &boot_params_ptr->e820_table[i]; /* Skip non-RAM entries. */ if (entry->type != E820_TYPE_RAM) continue; @@ -773,6 +760,49 @@ static void process_e820_entries(unsigned long minimum, } } +/* + * If KHO is active, only process its scratch areas to ensure we are not + * stepping onto preserved memory. 
+ */ +static bool process_kho_entries(unsigned long minimum, unsigned long image_size) +{ + struct kho_scratch *kho_scratch; + struct setup_data *ptr; + struct kho_data *kho; + int i, nr_areas = 0; + + if (!IS_ENABLED(CONFIG_KEXEC_HANDOVER)) + return false; + + ptr = (struct setup_data *)(unsigned long)boot_params_ptr->hdr.setup_data; + while (ptr) { + if (ptr->type == SETUP_KEXEC_KHO) { + kho = (struct kho_data *)(unsigned long)ptr->data; + kho_scratch = (void *)(unsigned long)kho->scratch_addr; + nr_areas = kho->scratch_size / sizeof(*kho_scratch); + break; + } + + ptr = (struct setup_data *)(unsigned long)ptr->next; + } + + if (!nr_areas) + return false; + + for (i = 0; i < nr_areas; i++) { + struct kho_scratch *area = &kho_scratch[i]; + struct mem_vector region = { + .start = area->addr, + .size = area->size, + }; + + if (process_mem_region(®ion, minimum, image_size)) + break; + } + + return true; +} + static unsigned long find_random_phys_addr(unsigned long minimum, unsigned long image_size) { @@ -788,7 +818,12 @@ static unsigned long find_random_phys_addr(unsigned long minimum, return 0; } - if (!process_efi_entries(minimum, image_size)) + /* + * During kexec handover only process KHO scratch areas that are known + * not to contain any data that must be preserved. + */ + if (!process_kho_entries(minimum, image_size) && + !process_efi_entries(minimum, image_size)) process_e820_entries(minimum, image_size); phys_addr = slots_fetch_random(); @@ -836,7 +871,7 @@ void choose_random_location(unsigned long input, return; } - boot_params->hdr.loadflags |= KASLR_FLAG; + boot_params_ptr->hdr.loadflags |= KASLR_FLAG; if (IS_ENABLED(CONFIG_X86_32)) mem_limit = KERNEL_IMAGE_SIZE; diff --git a/arch/x86/boot/compressed/mem.c b/arch/x86/boot/compressed/mem.c new file mode 100644 index 000000000000..0e9f84ab4bdc --- /dev/null +++ b/arch/x86/boot/compressed/mem.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "error.h" +#include "misc.h" +#include "tdx.h" +#include "sev.h" +#include <asm/shared/tdx.h> + +/* + * accept_memory() and process_unaccepted_memory() called from EFI stub which + * runs before decompressor and its early_tdx_detect(). + * + * Enumerate TDX directly from the early users. 
+ */ +static bool early_is_tdx_guest(void) +{ + static bool once; + static bool is_tdx; + + if (!IS_ENABLED(CONFIG_INTEL_TDX_GUEST)) + return false; + + if (!once) { + u32 eax, sig[3]; + + cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, + &sig[0], &sig[2], &sig[1]); + is_tdx = !memcmp(TDX_IDENT, sig, sizeof(sig)); + once = true; + } + + return is_tdx; +} + +void arch_accept_memory(phys_addr_t start, phys_addr_t end) +{ + /* Platform-specific memory-acceptance call goes here */ + if (early_is_tdx_guest()) { + if (!tdx_accept_memory(start, end)) + panic("TDX: Failed to accept memory\n"); + } else if (early_is_sevsnp_guest()) { + snp_accept_memory(start, end); + } else { + error("Cannot accept memory: unknown platform\n"); + } +} + +bool init_unaccepted_memory(void) +{ + guid_t guid = LINUX_EFI_UNACCEPTED_MEM_TABLE_GUID; + struct efi_unaccepted_memory *table; + unsigned long cfg_table_pa; + unsigned int cfg_table_len; + enum efi_type et; + int ret; + + et = efi_get_type(boot_params_ptr); + if (et == EFI_TYPE_NONE) + return false; + + ret = efi_get_conf_table(boot_params_ptr, &cfg_table_pa, &cfg_table_len); + if (ret) { + warn("EFI config table not found."); + return false; + } + + table = (void *)efi_find_vendor_table(boot_params_ptr, cfg_table_pa, + cfg_table_len, guid); + if (!table) + return false; + + if (table->version != 1) + error("Unknown version of unaccepted memory table\n"); + + /* + * In many cases unaccepted_table is already set by EFI stub, but it + * has to be initialized again to cover cases when the table is not + * allocated by EFI stub or EFI stub copied the kernel image with + * efi_relocate_kernel() before the variable is set. + * + * It must be initialized before the first usage of accept_memory(). + */ + unaccepted_table = table; + + return true; +} diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S index a73e4d783cae..32f7cc8a8625 100644 --- a/arch/x86/boot/compressed/mem_encrypt.S +++ b/arch/x86/boot/compressed/mem_encrypt.S @@ -12,16 +12,13 @@ #include <asm/processor-flags.h> #include <asm/msr.h> #include <asm/asm-offsets.h> +#include <asm/segment.h> +#include <asm/trapnr.h> .text .code32 SYM_FUNC_START(get_sev_encryption_bit) - xor %eax, %eax - -#ifdef CONFIG_AMD_MEM_ENCRYPT push %ebx - push %ecx - push %edx movl $0x80000000, %eax /* CPUID to check the highest leaf */ cpuid @@ -52,12 +49,7 @@ SYM_FUNC_START(get_sev_encryption_bit) xor %eax, %eax .Lsev_exit: - pop %edx - pop %ecx pop %ebx - -#endif /* CONFIG_AMD_MEM_ENCRYPT */ - RET SYM_FUNC_END(get_sev_encryption_bit) @@ -98,7 +90,7 @@ SYM_CODE_START_LOCAL(sev_es_req_cpuid) jmp 1b SYM_CODE_END(sev_es_req_cpuid) -SYM_CODE_START(startup32_vc_handler) +SYM_CODE_START_LOCAL(startup32_vc_handler) pushl %eax pushl %ebx pushl %ecx @@ -184,15 +176,149 @@ SYM_CODE_START(startup32_vc_handler) jmp .Lfail SYM_CODE_END(startup32_vc_handler) +/* + * Write an IDT entry into boot32_idt + * + * Parameters: + * + * %eax: Handler address + * %edx: Vector number + * %ecx: IDT address + */ +SYM_FUNC_START_LOCAL(startup32_set_idt_entry) + /* IDT entry address to %ecx */ + leal (%ecx, %edx, 8), %ecx + + /* Build IDT entry, lower 4 bytes */ + movl %eax, %edx + andl $0x0000ffff, %edx # Target code segment offset [15:0] + orl $(__KERNEL32_CS << 16), %edx # Target code segment selector + + /* Store lower 4 bytes to IDT */ + movl %edx, (%ecx) + + /* Build IDT entry, upper 4 bytes */ + movl %eax, %edx + andl $0xffff0000, %edx # Target code segment offset [31:16] + orl $0x00008e00, %edx # Present, Type 32-bit 
Interrupt Gate + + /* Store upper 4 bytes to IDT */ + movl %edx, 4(%ecx) + + RET +SYM_FUNC_END(startup32_set_idt_entry) + +SYM_FUNC_START(startup32_load_idt) + push %ebp + push %ebx + + call 1f +1: pop %ebp + + leal (boot32_idt - 1b)(%ebp), %ebx + + /* #VC handler */ + leal (startup32_vc_handler - 1b)(%ebp), %eax + movl $X86_TRAP_VC, %edx + movl %ebx, %ecx + call startup32_set_idt_entry + + /* Load IDT */ + leal (boot32_idt_desc - 1b)(%ebp), %ecx + movl %ebx, 2(%ecx) + lidt (%ecx) + + pop %ebx + pop %ebp + RET +SYM_FUNC_END(startup32_load_idt) + +/* + * Check for the correct C-bit position when the startup_32 boot-path is used. + * + * The check makes use of the fact that all memory is encrypted when paging is + * disabled. The function creates 64 bits of random data using the RDRAND + * instruction. RDRAND is mandatory for SEV guests, so always available. If the + * hypervisor violates that the kernel will crash right here. + * + * The 64 bits of random data are stored to a memory location and at the same + * time kept in the %eax and %ebx registers. Since encryption is always active + * when paging is off the random data will be stored encrypted in main memory. + * + * Then paging is enabled. When the C-bit position is correct all memory is + * still mapped encrypted and comparing the register values with memory will + * succeed. An incorrect C-bit position will map all memory unencrypted, so that + * the compare will use the encrypted random data and fail. + */ +SYM_FUNC_START(startup32_check_sev_cbit) + pushl %ebx + pushl %ebp + + call 0f +0: popl %ebp + + /* Check for non-zero sev_status */ + movl (sev_status - 0b)(%ebp), %eax + testl %eax, %eax + jz 4f + + /* + * Get two 32-bit random values - Don't bail out if RDRAND fails + * because it is better to prevent forward progress if no random value + * can be gathered. + */ +1: rdrand %eax + jnc 1b +2: rdrand %ebx + jnc 2b + + /* Store to memory and keep it in the registers */ + leal (sev_check_data - 0b)(%ebp), %ebp + movl %eax, 0(%ebp) + movl %ebx, 4(%ebp) + + /* Enable paging to see if encryption is active */ + movl %cr0, %edx /* Backup %cr0 in %edx */ + movl $(X86_CR0_PG | X86_CR0_PE), %ecx /* Enable Paging and Protected mode */ + movl %ecx, %cr0 + + cmpl %eax, 0(%ebp) + jne 3f + cmpl %ebx, 4(%ebp) + jne 3f + + movl %edx, %cr0 /* Restore previous %cr0 */ + + jmp 4f + +3: /* Check failed - hlt the machine */ + hlt + jmp 3b + +4: + popl %ebp + popl %ebx + RET +SYM_FUNC_END(startup32_check_sev_cbit) + .code64 #include "../../kernel/sev_verify_cbit.S" .data -#ifdef CONFIG_AMD_MEM_ENCRYPT .balign 8 SYM_DATA(sme_me_mask, .quad 0) SYM_DATA(sev_status, .quad 0) SYM_DATA(sev_check_data, .quad 0) -#endif + +SYM_DATA_START_LOCAL(boot32_idt) + .rept 32 + .quad 0 + .endr +SYM_DATA_END(boot32_idt) + +SYM_DATA_START_LOCAL(boot32_idt_desc) + .word . 
- boot32_idt - 1 + .long 0 +SYM_DATA_END(boot32_idt_desc) diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index cf690d8712f4..0f41ca0e52c0 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -14,7 +14,6 @@ #include "misc.h" #include "error.h" -#include "pgtable.h" #include "../string.h" #include "../voffset.h" #include <asm/bootparam_utils.h> @@ -46,12 +45,13 @@ void *memmove(void *dest, const void *src, size_t n); /* * This is set up by the setup-routine at boot-time */ -struct boot_params *boot_params; +struct boot_params *boot_params_ptr; struct port_io_ops pio_ops; memptr free_mem_ptr; memptr free_mem_end_ptr; +int spurious_nmi_count; static char *vidmem; static int vidport; @@ -132,8 +132,8 @@ void __putstr(const char *s) if (lines == 0 || cols == 0) return; - x = boot_params->screen_info.orig_x; - y = boot_params->screen_info.orig_y; + x = boot_params_ptr->screen_info.orig_x; + y = boot_params_ptr->screen_info.orig_y; while ((c = *s++) != '\0') { if (c == '\n') { @@ -154,8 +154,8 @@ void __putstr(const char *s) } } - boot_params->screen_info.orig_x = x; - boot_params->screen_info.orig_y = y; + boot_params_ptr->screen_info.orig_x = x; + boot_params_ptr->screen_info.orig_y = y; pos = (x + cols * y) * 2; /* Update cursor position */ outb(14, vidport); @@ -164,21 +164,34 @@ void __putstr(const char *s) outb(0xff & (pos >> 1), vidport+1); } -void __puthex(unsigned long value) +static noinline void __putnum(unsigned long value, unsigned int base, + int mindig) { - char alpha[2] = "0"; - int bits; + char buf[8*sizeof(value)+1]; + char *p; - for (bits = sizeof(value) * 8 - 4; bits >= 0; bits -= 4) { - unsigned long digit = (value >> bits) & 0xf; + p = buf + sizeof(buf); + *--p = '\0'; - if (digit < 0xA) - alpha[0] = '0' + digit; - else - alpha[0] = 'a' + (digit - 0xA); + while (mindig-- > 0 || value) { + unsigned char digit = value % base; + digit += (digit >= 10) ? ('a'-10) : '0'; + *--p = digit; - __putstr(alpha); + value /= base; } + + __putstr(p); +} + +void __puthex(unsigned long value) +{ + __putnum(value, 16, sizeof(value)*2); +} + +void __putdec(unsigned long value) +{ + __putnum(value, 10, 1); } #ifdef CONFIG_X86_NEED_RELOCS @@ -221,7 +234,7 @@ static void handle_relocations(void *output, unsigned long output_len, /* * Process relocations: 32 bit relocations first then 64 bit after. - * Three sets of binary relocations are added to the end of the kernel + * Two sets of binary relocations are added to the end of the kernel * before compression. Each relocation table entry is the kernel * address of the location which needs to be updated stored as a * 32-bit value which is sign extended to 64 bits. @@ -231,8 +244,6 @@ static void handle_relocations(void *output, unsigned long output_len, * kernel bits... 
* 0 - zero terminator for 64 bit relocations * 64 bit relocation repeated - * 0 - zero terminator for inverse 32 bit relocations - * 32 bit inverse relocation repeated * 0 - zero terminator for 32 bit relocations * 32 bit relocation repeated * @@ -249,16 +260,6 @@ static void handle_relocations(void *output, unsigned long output_len, *(uint32_t *)ptr += delta; } #ifdef CONFIG_X86_64 - while (*--reloc) { - long extended = *reloc; - extended += map; - - ptr = (unsigned long)extended; - if (ptr < min_addr || ptr > max_addr) - error("inverse 32-bit relocation outside of kernel!\n"); - - *(int32_t *)ptr -= delta; - } for (reloc--; *reloc; reloc--) { long extended = *reloc; extended += map; @@ -277,7 +278,7 @@ static inline void handle_relocations(void *output, unsigned long output_len, { } #endif -static void parse_elf(void *output) +static size_t parse_elf(void *output) { #ifdef CONFIG_X86_64 Elf64_Ehdr ehdr; @@ -293,10 +294,8 @@ static void parse_elf(void *output) if (ehdr.e_ident[EI_MAG0] != ELFMAG0 || ehdr.e_ident[EI_MAG1] != ELFMAG1 || ehdr.e_ident[EI_MAG2] != ELFMAG2 || - ehdr.e_ident[EI_MAG3] != ELFMAG3) { + ehdr.e_ident[EI_MAG3] != ELFMAG3) error("Kernel is not a valid ELF file"); - return; - } debug_putstr("Parsing ELF... "); @@ -328,6 +327,64 @@ static void parse_elf(void *output) } free(phdrs); + + return ehdr.e_entry - LOAD_PHYSICAL_ADDR; +} + +const unsigned long kernel_text_size = VO___start_rodata - VO__text; +const unsigned long kernel_inittext_offset = VO__sinittext - VO__text; +const unsigned long kernel_inittext_size = VO___inittext_end - VO__sinittext; +const unsigned long kernel_total_size = VO__end - VO__text; + +static u8 boot_heap[BOOT_HEAP_SIZE] __aligned(4); + +extern unsigned char input_data[]; +extern unsigned int input_len, output_len; + +unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr, + void (*error)(char *x)) +{ + unsigned long entry; + + if (!free_mem_ptr) { + free_mem_ptr = (unsigned long)boot_heap; + free_mem_end_ptr = (unsigned long)boot_heap + sizeof(boot_heap); + } + + if (__decompress(input_data, input_len, NULL, NULL, outbuf, output_len, + NULL, error) < 0) + return ULONG_MAX; + + entry = parse_elf(outbuf); + handle_relocations(outbuf, output_len, virt_addr); + + return entry; +} + +/* + * Set the memory encryption xloadflag based on the mem_encrypt= command line + * parameter, if provided. + */ +static void parse_mem_encrypt(struct setup_header *hdr) +{ + int on = cmdline_find_option_bool("mem_encrypt=on"); + int off = cmdline_find_option_bool("mem_encrypt=off"); + + if (on > off) + hdr->xloadflags |= XLF_MEM_ENCRYPTION; +} + +static void early_sev_detect(void) +{ + /* + * Accessing video memory causes guest termination because + * the boot stage2 #VC handler of SEV-ES/SNP guests does not + * support MMIO handling and kexec -c adds screen_info to the + * boot parameters passed to the kexec kernel, which causes + * console output to be dumped to both video and serial. 
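The __putnum() helper added to misc.c above generalises the old nibble loop of __puthex(): digits are produced least-significant first into the tail of a buffer, with mindig providing zero padding, which is what makes __putdec() possible. A user-space re-creation of that loop, kept close to the patch on purpose:

#include <stdio.h>

static void putnum(unsigned long value, unsigned int base, int mindig)
{
        char buf[8 * sizeof(value) + 1];
        char *p = buf + sizeof(buf);

        *--p = '\0';
        while (mindig-- > 0 || value) {
                unsigned char digit = value % base;

                *--p = (char)(digit + (digit >= 10 ? 'a' - 10 : '0'));
                value /= base;
        }
        fputs(p, stdout);
}

int main(void)
{
        putnum(0x1234, 16, 2 * sizeof(unsigned long)); /* zero-padded hex */
        putchar('\n');
        putnum(42, 10, 1);                             /* plain decimal */
        putchar('\n');
        return 0;
}

Building the string backwards avoids a separate reverse pass and works for any base without extra tables.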
+ */ + if (sev_status & MSR_AMD64_SEV_ES_ENABLED) + lines = cols = 0; } /* @@ -347,25 +404,24 @@ static void parse_elf(void *output) * |-------uncompressed kernel image---------| * */ -asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, - unsigned char *input_data, - unsigned long input_len, - unsigned char *output, - unsigned long output_len) +asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output) { - const unsigned long kernel_total_size = VO__end - VO__text; unsigned long virt_addr = LOAD_PHYSICAL_ADDR; + memptr heap = (memptr)boot_heap; unsigned long needed_size; + size_t entry_offset; /* Retain x86 boot parameters pointer passed from startup_32/64. */ - boot_params = rmode; + boot_params_ptr = rmode; /* Clear flags intended for solely in-kernel use. */ - boot_params->hdr.loadflags &= ~KASLR_FLAG; + boot_params_ptr->hdr.loadflags &= ~KASLR_FLAG; - sanitize_boot_params(boot_params); + parse_mem_encrypt(&boot_params_ptr->hdr); - if (boot_params->screen_info.orig_video_mode == 7) { + sanitize_boot_params(boot_params_ptr); + + if (boot_params_ptr->screen_info.orig_video_mode == 7) { vidmem = (char *) 0xb0000; vidport = 0x3b4; } else { @@ -373,8 +429,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, vidport = 0x3d4; } - lines = boot_params->screen_info.orig_video_lines; - cols = boot_params->screen_info.orig_video_cols; + lines = boot_params_ptr->screen_info.orig_video_lines; + cols = boot_params_ptr->screen_info.orig_video_cols; init_default_io_ops(); @@ -386,6 +442,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, */ early_tdx_detect(); + early_sev_detect(); + console_init(); /* @@ -393,7 +451,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, * so that early debugging output from the RSDP parsing code can be * collected. */ - boot_params->acpi_rsdp_addr = get_rsdp_addr(); + boot_params_ptr->acpi_rsdp_addr = get_rsdp_addr(); debug_putstr("early console in extract_kernel\n"); @@ -411,7 +469,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, * entries. This ensures the full mapped area is usable RAM * and doesn't include any reserved areas. */ - needed_size = max(output_len, kernel_total_size); + needed_size = max_t(unsigned long, output_len, kernel_total_size); #ifdef CONFIG_X86_64 needed_size = ALIGN(needed_size, MIN_KERNEL_ALIGN); #endif @@ -442,7 +500,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, #ifdef CONFIG_X86_64 if (heap > 0x3fffffffffffUL) error("Destination address too large"); - if (virt_addr + max(output_len, kernel_total_size) > KERNEL_IMAGE_SIZE) + if (virt_addr + needed_size > KERNEL_IMAGE_SIZE) error("Destination virtual address is beyond the kernel mapping area"); #else if (heap > ((-__PAGE_OFFSET-(128<<20)-1) & 0x7fffffff)) @@ -454,19 +512,26 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, #endif debug_putstr("\nDecompressing Linux... "); - __decompress(input_data, input_len, NULL, NULL, output, output_len, - NULL, error); - parse_elf(output); - handle_relocations(output, output_len, virt_addr); - debug_putstr("done.\nBooting the kernel.\n"); + + if (init_unaccepted_memory()) { + debug_putstr("Accepting memory... 
"); + accept_memory(__pa(output), needed_size); + } + + entry_offset = decompress_kernel(output, virt_addr, error); + + debug_putstr("done.\nBooting the kernel (entry_offset: 0x"); + debug_puthex(entry_offset); + debug_putstr(").\n"); /* Disable exception handling before booting the kernel */ cleanup_exception_handling(); - return output; -} + if (spurious_nmi_count) { + error_putstr("Spurious early NMIs ignored: "); + error_putdec(spurious_nmi_count); + error_putstr("\n"); + } -void fortify_panic(const char *name) -{ - error("detected buffer overflow"); + return output + entry_offset; } diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 62208ec04ca4..fd855e32c9b9 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -19,6 +19,15 @@ /* cpu_feature_enabled() cannot be used this early */ #define USE_EARLY_PGTABLE_L5 +/* + * Boot stub deals with identity mappings, physical and virtual addresses are + * the same, so override these defines. + * + * <asm/page.h> will not define them if they are already defined. + */ +#define __pa(x) ((unsigned long)(x)) +#define __va(x) ((void *)((unsigned long)(x))) + #include <linux/linkage.h> #include <linux/screen_info.h> #include <linux/elf.h> @@ -50,13 +59,15 @@ extern char _head[], _end[]; /* misc.c */ extern memptr free_mem_ptr; extern memptr free_mem_end_ptr; +extern int spurious_nmi_count; void *malloc(int size); void free(void *where); -extern struct boot_params *boot_params; void __putstr(const char *s); void __puthex(unsigned long value); +void __putdec(unsigned long value); #define error_putstr(__x) __putstr(__x) #define error_puthex(__x) __puthex(__x) +#define error_putdec(__x) __putdec(__x) #ifdef CONFIG_X86_VERBOSE_BOOTUP @@ -125,24 +136,23 @@ static inline void console_init(void) #endif #ifdef CONFIG_AMD_MEM_ENCRYPT +struct es_em_ctxt; +struct insn; + void sev_enable(struct boot_params *bp); +void snp_check_features(void); void sev_es_shutdown_ghcb(void); extern bool sev_es_check_ghcb_fault(unsigned long address); void snp_set_page_private(unsigned long paddr); void snp_set_page_shared(unsigned long paddr); void sev_prep_identity_maps(unsigned long top_level_pgt); + +enum es_result vc_decode_insn(struct es_em_ctxt *ctxt); +bool insn_has_rep_prefix(struct insn *insn); +void sev_insn_decode_init(void); +bool early_setup_ghcb(void); #else -static inline void sev_enable(struct boot_params *bp) -{ - /* - * bp->cc_blob_address should only be set by boot/compressed kernel. - * Initialize it to 0 unconditionally (thus here in this stub too) to - * ensure that uninitialized values from buggy bootloaders aren't - * propagated. 
- */ - if (bp) - bp->cc_blob_address = 0; -} +static inline void snp_check_features(void) { } static inline void sev_es_shutdown_ghcb(void) { } static inline bool sev_es_check_ghcb_fault(unsigned long address) { @@ -168,9 +178,7 @@ static inline int count_immovable_mem_regions(void) { return 0; } #endif /* ident_map_64.c */ -#ifdef CONFIG_X86_5LEVEL extern unsigned int __pgtable_l5_enabled, pgdir_shift, ptrs_per_p4d; -#endif extern void kernel_add_identity_map(unsigned long start, unsigned long end); /* Used by PAGE_KERN* macros: */ @@ -188,6 +196,7 @@ static inline void cleanup_exception_handling(void) { } /* IDT Entry Points */ void boot_page_fault(void); +void boot_nmi_trap(void); void boot_stage1_vc(void); void boot_stage2_vc(void); @@ -236,4 +245,14 @@ static inline unsigned long efi_find_vendor_table(struct boot_params *bp, } #endif /* CONFIG_EFI */ +#ifdef CONFIG_UNACCEPTED_MEMORY +bool init_unaccepted_memory(void); +#else +static inline bool init_unaccepted_memory(void) { return false; } +#endif + +/* Defined in EFI stub */ +extern struct efi_unaccepted_memory *unaccepted_table; +void accept_memory(phys_addr_t start, unsigned long size); + #endif /* BOOT_COMPRESSED_MISC_H */ diff --git a/arch/x86/boot/compressed/pgtable.h b/arch/x86/boot/compressed/pgtable.h deleted file mode 100644 index cc9b2529a086..000000000000 --- a/arch/x86/boot/compressed/pgtable.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef BOOT_COMPRESSED_PAGETABLE_H -#define BOOT_COMPRESSED_PAGETABLE_H - -#define TRAMPOLINE_32BIT_SIZE (2 * PAGE_SIZE) - -#define TRAMPOLINE_32BIT_PGTABLE_OFFSET 0 - -#define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE -#define TRAMPOLINE_32BIT_CODE_SIZE 0x80 - -#define TRAMPOLINE_32BIT_STACK_END TRAMPOLINE_32BIT_SIZE - -#ifndef __ASSEMBLER__ - -extern unsigned long *trampoline_32bit; - -extern void trampoline_32bit_src(void *return_ptr); - -#endif /* __ASSEMBLER__ */ -#endif /* BOOT_COMPRESSED_PAGETABLE_H */ diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index 2ac12ff4111b..0e89e197e112 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -1,25 +1,20 @@ // SPDX-License-Identifier: GPL-2.0 #include "misc.h" +#include <asm/bootparam.h> +#include <asm/bootparam_utils.h> #include <asm/e820/types.h> +#include <asm/pgtable.h> #include <asm/processor.h> -#include "pgtable.h" #include "../string.h" #include "efi.h" #define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */ #define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */ -#ifdef CONFIG_X86_5LEVEL /* __pgtable_l5_enabled needs to be in .data to avoid being cleared along with .bss */ unsigned int __section(".data") __pgtable_l5_enabled; unsigned int __section(".data") pgdir_shift = 39; unsigned int __section(".data") ptrs_per_p4d = 1; -#endif - -struct paging_config { - unsigned long trampoline_start; - unsigned long l5_required; -}; /* Buffer to preserve trampoline memory */ static char trampoline_save[TRAMPOLINE_32BIT_SIZE]; @@ -29,11 +24,10 @@ static char trampoline_save[TRAMPOLINE_32BIT_SIZE]; * purposes. * * Avoid putting the pointer into .bss as it will be cleared between - * paging_prepare() and extract_kernel(). + * configure_5level_paging() and extract_kernel(). 
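Both parse_mem_encrypt() earlier in this patch and the no5lvl test in configure_5level_paging() below lean on cmdline_find_option_bool(), which reports where the option was found (0 when absent) so that the later of two conflicting options wins. A hedged user-space sketch of that idea; find_option_bool() here is a simplified stand-in for the boot cmdline parser and ignores quoting and word boundaries.

#include <stdio.h>
#include <string.h>

static int find_option_bool(const char *cmdline, const char *opt)
{
        const char *p = cmdline, *last = NULL;

        while ((p = strstr(p, opt)) != NULL) {
                last = p;               /* remember the last occurrence */
                p += strlen(opt);
        }
        return last ? (int)(last - cmdline) + 1 : 0;
}

int main(void)
{
        const char *cmdline = "quiet mem_encrypt=on splash mem_encrypt=off";
        int on = find_option_bool(cmdline, "mem_encrypt=on");
        int off = find_option_bool(cmdline, "mem_encrypt=off");

        /* the later option wins, so this prints "no" */
        printf("encryption requested: %s\n", on > off ? "yes" : "no");
        return 0;
}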
*/ unsigned long *trampoline_32bit __section(".data"); -extern struct boot_params *boot_params; int cmdline_find_option_bool(const char *option); static unsigned long find_trampoline_placement(void) @@ -54,7 +48,7 @@ static unsigned long find_trampoline_placement(void) * * Only look for values in the legacy ROM for non-EFI system. */ - signature = (char *)&boot_params->efi_info.efi_loader_signature; + signature = (char *)&boot_params_ptr->efi_info.efi_loader_signature; if (strncmp(signature, EFI32_LOADER_SIGNATURE, 4) && strncmp(signature, EFI64_LOADER_SIGNATURE, 4)) { ebda_start = *(unsigned short *)0x40e << 4; @@ -70,10 +64,10 @@ static unsigned long find_trampoline_placement(void) bios_start = round_down(bios_start, PAGE_SIZE); /* Find the first usable memory region under bios_start. */ - for (i = boot_params->e820_entries - 1; i >= 0; i--) { + for (i = boot_params_ptr->e820_entries - 1; i >= 0; i--) { unsigned long new = bios_start; - entry = &boot_params->e820_table[i]; + entry = &boot_params_ptr->e820_table[i]; /* Skip all entries above bios_start. */ if (bios_start <= entry->addr) @@ -106,35 +100,42 @@ static unsigned long find_trampoline_placement(void) return bios_start - TRAMPOLINE_32BIT_SIZE; } -struct paging_config paging_prepare(void *rmode) +asmlinkage void configure_5level_paging(struct boot_params *bp, void *pgtable) { - struct paging_config paging_config = {}; + void (*toggle_la57)(void *cr3); + bool l5_required = false; /* Initialize boot_params. Required for cmdline_find_option_bool(). */ - boot_params = rmode; + sanitize_boot_params(bp); + boot_params_ptr = bp; /* * Check if LA57 is desired and supported. * * There are several parts to the check: - * - if the kernel supports 5-level paging: CONFIG_X86_5LEVEL=y * - if user asked to disable 5-level paging: no5lvl in cmdline * - if the machine supports 5-level paging: * + CPUID leaf 7 is supported * + the leaf has the feature bit set - * - * That's substitute for boot_cpu_has() in early boot code. */ - if (IS_ENABLED(CONFIG_X86_5LEVEL) && - !cmdline_find_option_bool("no5lvl") && - native_cpuid_eax(0) >= 7 && - (native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) { - paging_config.l5_required = 1; + if (!cmdline_find_option_bool("no5lvl") && + native_cpuid_eax(0) >= 7 && (native_cpuid_ecx(7) & BIT(16))) { + l5_required = true; + + /* Initialize variables for 5-level paging */ + __pgtable_l5_enabled = 1; + pgdir_shift = 48; + ptrs_per_p4d = 512; } - paging_config.trampoline_start = find_trampoline_placement(); + /* + * The trampoline will not be used if the paging mode is already set to + * the desired one. + */ + if (l5_required == !!(native_read_cr4() & X86_CR4_LA57)) + return; - trampoline_32bit = (unsigned long *)paging_config.trampoline_start; + trampoline_32bit = (unsigned long *)find_trampoline_placement(); /* Preserve trampoline memory */ memcpy(trampoline_save, trampoline_32bit, TRAMPOLINE_32BIT_SIZE); @@ -143,34 +144,35 @@ struct paging_config paging_prepare(void *rmode) memset(trampoline_32bit, 0, TRAMPOLINE_32BIT_SIZE); /* Copy trampoline code in place */ - memcpy(trampoline_32bit + TRAMPOLINE_32BIT_CODE_OFFSET / sizeof(unsigned long), + toggle_la57 = memcpy(trampoline_32bit + + TRAMPOLINE_32BIT_CODE_OFFSET / sizeof(unsigned long), &trampoline_32bit_src, TRAMPOLINE_32BIT_CODE_SIZE); /* + * Avoid the need for a stack in the 32-bit trampoline code, by using + * LJMP rather than LRET to return back to long mode. 
LJMP takes an + * immediate absolute address, which needs to be adjusted based on the + * placement of the trampoline. + */ + *(u32 *)((u8 *)toggle_la57 + trampoline_ljmp_imm_offset) += + (unsigned long)toggle_la57; + + /* * The code below prepares page table in trampoline memory. * * The new page table will be used by trampoline code for switching * from 4- to 5-level paging or vice versa. - * - * If switching is not required, the page table is unused: trampoline - * code wouldn't touch CR3. */ - /* - * We are not going to use the page table in trampoline memory if we - * are already in the desired paging mode. - */ - if (paging_config.l5_required == !!(native_read_cr4() & X86_CR4_LA57)) - goto out; - - if (paging_config.l5_required) { + if (l5_required) { /* * For 4- to 5-level paging transition, set up current CR3 as * the first and the only entry in a new top-level page table. */ - trampoline_32bit[TRAMPOLINE_32BIT_PGTABLE_OFFSET] = __native_read_cr3() | _PAGE_TABLE_NOENC; + *trampoline_32bit = native_read_cr3_pa() | _PAGE_TABLE_NOENC; } else { - unsigned long src; + u64 *new_cr3; + pgd_t *pgdp; /* * For 5- to 4-level paging transition, copy page table pointed @@ -180,39 +182,19 @@ struct paging_config paging_prepare(void *rmode) * We cannot just point to the page table from trampoline as it * may be above 4G. */ - src = *(unsigned long *)__native_read_cr3() & PAGE_MASK; - memcpy(trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET / sizeof(unsigned long), - (void *)src, PAGE_SIZE); + pgdp = (pgd_t *)native_read_cr3_pa(); + new_cr3 = (u64 *)(native_pgd_val(pgdp[0]) & PTE_PFN_MASK); + memcpy(trampoline_32bit, new_cr3, PAGE_SIZE); } -out: - return paging_config; -} - -void cleanup_trampoline(void *pgtable) -{ - void *trampoline_pgtable; - - trampoline_pgtable = trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET / sizeof(unsigned long); + toggle_la57(trampoline_32bit); /* - * Move the top level page table out of trampoline memory, - * if it's there. + * Move the top level page table out of trampoline memory. */ - if ((void *)__native_read_cr3() == trampoline_pgtable) { - memcpy(pgtable, trampoline_pgtable, PAGE_SIZE); - native_write_cr3((unsigned long)pgtable); - } + memcpy(pgtable, trampoline_32bit, PAGE_SIZE); + native_write_cr3((unsigned long)pgtable); /* Restore trampoline memory */ memcpy(trampoline_32bit, trampoline_save, TRAMPOLINE_32BIT_SIZE); - - /* Initialize variables for 5-level paging */ -#ifdef CONFIG_X86_5LEVEL - if (__read_cr4() & X86_CR4_LA57) { - __pgtable_l5_enabled = 1; - pgdir_shift = 48; - ptrs_per_p4d = 512; - } -#endif } diff --git a/arch/x86/boot/compressed/sbat.S b/arch/x86/boot/compressed/sbat.S new file mode 100644 index 000000000000..838f70a997dd --- /dev/null +++ b/arch/x86/boot/compressed/sbat.S @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Embed SBAT data in the kernel. 
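The LJMP fix-up in configure_5level_paging() above is effectively a one-entry relocation: after the trampoline template is copied into low memory, the 32-bit immediate inside the copy is incremented by the copy's own address so the jump target becomes absolute. A small sketch of that patch-after-copy pattern; the blob bytes and the immediate offset are invented, and the address is truncated to 32 bits only because the real trampoline is guaranteed to sit below 4 GiB.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define IMM_OFFSET 2    /* where the 32-bit immediate lives inside the blob */

int main(void)
{
        /* template blob: the immediate starts out relative to the blob */
        unsigned char template[8] = { 0x90, 0x90, 0x10, 0x00, 0x00, 0x00, 0x90, 0x90 };
        unsigned char copy[sizeof(template)];
        uint32_t imm;

        memcpy(copy, template, sizeof(copy));

        /* patch: relative offset + placement address = absolute target */
        memcpy(&imm, copy + IMM_OFFSET, sizeof(imm));   /* little-endian, as on x86 */
        imm += (uint32_t)(uintptr_t)copy;
        memcpy(copy + IMM_OFFSET, &imm, sizeof(imm));

        printf("patched immediate: %#x\n", (unsigned int)imm);
        return 0;
}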
+ */ + .pushsection ".sbat", "a", @progbits + .incbin CONFIG_EFI_SBAT_FILE + .popsection diff --git a/arch/x86/boot/compressed/sev-handle-vc.c b/arch/x86/boot/compressed/sev-handle-vc.c new file mode 100644 index 000000000000..030001b46554 --- /dev/null +++ b/arch/x86/boot/compressed/sev-handle-vc.c @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "misc.h" +#include "error.h" +#include "sev.h" + +#include <linux/kernel.h> +#include <linux/string.h> +#include <asm/insn.h> +#include <asm/pgtable_types.h> +#include <asm/ptrace.h> +#include <asm/sev.h> +#include <asm/trapnr.h> +#include <asm/trap_pf.h> +#include <asm/fpu/xcr.h> + +#define __BOOT_COMPRESSED +#undef __init +#define __init + +/* Basic instruction decoding support needed */ +#include "../../lib/inat.c" +#include "../../lib/insn.c" + +/* + * Copy a version of this function here - insn-eval.c can't be used in + * pre-decompression code. + */ +bool insn_has_rep_prefix(struct insn *insn) +{ + insn_byte_t p; + + insn_get_prefixes(insn); + + for_each_insn_prefix(insn, p) { + if (p == 0xf2 || p == 0xf3) + return true; + } + + return false; +} + +enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) +{ + char buffer[MAX_INSN_SIZE]; + int ret; + + memcpy(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); + + ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64); + if (ret < 0) + return ES_DECODE_FAILED; + + return ES_OK; +} + +extern void sev_insn_decode_init(void) __alias(inat_init_tables); + +/* + * Only a dummy for insn_get_seg_base() - Early boot-code is 64bit only and + * doesn't use segments. + */ +static unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx) +{ + return 0UL; +} + +static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, + void *dst, char *buf, size_t size) +{ + memcpy(dst, buf, size); + + return ES_OK; +} + +static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, + void *src, char *buf, size_t size) +{ + memcpy(buf, src, size); + + return ES_OK; +} + +static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size) +{ + return ES_OK; +} + +static bool fault_in_kernel_space(unsigned long address) +{ + return false; +} + +#define sev_printk(fmt, ...) 
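insn_has_rep_prefix() above only needs to know whether a REP/REPNE legacy prefix (0xf3/0xf2) precedes the opcode. A stand-alone approximation that scans raw instruction bytes instead of using the kernel's struct insn decoder; the prefix handling is intentionally simplified and this is an illustration, not a decoder.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

static bool has_rep_prefix(const unsigned char *insn, size_t len)
{
        for (size_t i = 0; i < len; i++) {
                if (insn[i] == 0xf2 || insn[i] == 0xf3)
                        return true;
                /* stop at the first byte that is not a legacy prefix */
                if (insn[i] != 0x66 && insn[i] != 0x67 && insn[i] != 0xf0 &&
                    insn[i] != 0x2e && insn[i] != 0x36 && insn[i] != 0x3e &&
                    insn[i] != 0x26 && insn[i] != 0x64 && insn[i] != 0x65)
                        break;
        }
        return false;
}

int main(void)
{
        const unsigned char rep_outsb[] = { 0xf3, 0x6e };      /* rep outsb */
        const unsigned char plain_out[] = { 0xee };            /* out %al,%dx */

        printf("%d %d\n", has_rep_prefix(rep_outsb, sizeof(rep_outsb)),
               has_rep_prefix(plain_out, sizeof(plain_out)));  /* 1 0 */
        return 0;
}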
+ +#include "../../coco/sev/vc-shared.c" + +void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code) +{ + struct es_em_ctxt ctxt; + enum es_result result; + + if (!boot_ghcb && !early_setup_ghcb()) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); + + vc_ghcb_invalidate(boot_ghcb); + result = vc_init_em_ctxt(&ctxt, regs, exit_code); + if (result != ES_OK) + goto finish; + + result = vc_check_opcode_bytes(&ctxt, exit_code); + if (result != ES_OK) + goto finish; + + switch (exit_code) { + case SVM_EXIT_RDTSC: + case SVM_EXIT_RDTSCP: + result = vc_handle_rdtsc(boot_ghcb, &ctxt, exit_code); + break; + case SVM_EXIT_IOIO: + result = vc_handle_ioio(boot_ghcb, &ctxt); + break; + case SVM_EXIT_CPUID: + result = vc_handle_cpuid(boot_ghcb, &ctxt); + break; + default: + result = ES_UNSUPPORTED; + break; + } + +finish: + if (result == ES_OK) + vc_finish_insn(&ctxt); + else if (result != ES_RETRY) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); +} diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index c93930d5ccbd..c8c1464b3a56 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -12,7 +12,9 @@ */ #include "misc.h" +#include <asm/bootparam.h> #include <asm/pgtable_types.h> +#include <asm/shared/msr.h> #include <asm/sev.h> #include <asm/trapnr.h> #include <asm/trap_pf.h> @@ -20,150 +22,61 @@ #include <asm/fpu/xcr.h> #include <asm/ptrace.h> #include <asm/svm.h> -#include <asm/cpuid.h> +#include <asm/cpuid/api.h> #include "error.h" -#include "../msr.h" +#include "sev.h" -struct ghcb boot_ghcb_page __aligned(PAGE_SIZE); +static struct ghcb boot_ghcb_page __aligned(PAGE_SIZE); struct ghcb *boot_ghcb; -/* - * Copy a version of this function here - insn-eval.c can't be used in - * pre-decompression code. - */ -static bool insn_has_rep_prefix(struct insn *insn) -{ - insn_byte_t p; - int i; - - insn_get_prefixes(insn); - - for_each_insn_prefix(insn, i, p) { - if (p == 0xf2 || p == 0xf3) - return true; - } - - return false; -} - -/* - * Only a dummy for insn_get_seg_base() - Early boot-code is 64bit only and - * doesn't use segments. 
- */ -static unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx) -{ - return 0UL; -} - -static inline u64 sev_es_rd_ghcb_msr(void) -{ - struct msr m; - - boot_rdmsr(MSR_AMD64_SEV_ES_GHCB, &m); - - return m.q; -} - -static inline void sev_es_wr_ghcb_msr(u64 val) -{ - struct msr m; - - m.q = val; - boot_wrmsr(MSR_AMD64_SEV_ES_GHCB, &m); -} - -static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) -{ - char buffer[MAX_INSN_SIZE]; - int ret; - - memcpy(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); - - ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64); - if (ret < 0) - return ES_DECODE_FAILED; - - return ES_OK; -} - -static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, - void *dst, char *buf, size_t size) -{ - memcpy(dst, buf, size); - - return ES_OK; -} - -static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, - void *src, char *buf, size_t size) -{ - memcpy(buf, src, size); - - return ES_OK; -} - #undef __init -#undef __pa #define __init -#define __pa(x) ((unsigned long)(x)) #define __BOOT_COMPRESSED -/* Basic instruction decoding support needed */ -#include "../../lib/inat.c" -#include "../../lib/insn.c" +u8 snp_vmpl; +u16 ghcb_version; + +u64 boot_svsm_caa_pa; /* Include code for early handlers */ -#include "../../kernel/sev-shared.c" +#include "../../boot/startup/sev-shared.c" -static inline bool sev_snp_enabled(void) +static bool sev_snp_enabled(void) { return sev_status & MSR_AMD64_SEV_SNP_ENABLED; } -static void __page_state_change(unsigned long paddr, enum psc_op op) +void snp_set_page_private(unsigned long paddr) { - u64 val; + struct psc_desc d = { + SNP_PAGE_STATE_PRIVATE, + (struct svsm_ca *)boot_svsm_caa_pa, + boot_svsm_caa_pa + }; if (!sev_snp_enabled()) return; - /* - * If private -> shared then invalidate the page before requesting the - * state change in the RMP table. - */ - if (op == SNP_PAGE_STATE_SHARED && pvalidate(paddr, RMP_PG_SIZE_4K, 0)) - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); - - /* Issue VMGEXIT to change the page state in RMP table. */ - sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); - VMGEXIT(); - - /* Read the response of the VMGEXIT. */ - val = sev_es_rd_ghcb_msr(); - if ((GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP) || GHCB_MSR_PSC_RESP_VAL(val)) - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); - - /* - * Now that page state is changed in the RMP table, validate it so that it is - * consistent with the RMP entry. 
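The comment in the removed open-coded __page_state_change() above documents the ordering rule for SNP page-state changes: a page must be PVALIDATE-invalidated before it is converted to shared, and PVALIDATE-validated only after it has been converted to private in the RMP. A host-side model of that sequencing, with stub helpers that merely print what a real guest would issue:

#include <stdbool.h>
#include <stdio.h>

enum psc_op { TO_SHARED, TO_PRIVATE };

static void pvalidate_page(unsigned long paddr, bool validate)
{
        printf("  pvalidate %#lx -> %d\n", paddr, validate);
}

static void rmp_state_change(unsigned long paddr, enum psc_op op)
{
        printf("  RMP change %#lx -> %s\n", paddr,
               op == TO_SHARED ? "shared" : "private");
}

static void page_state_change(unsigned long paddr, enum psc_op op)
{
        if (op == TO_SHARED)
                pvalidate_page(paddr, false);   /* invalidate first */

        rmp_state_change(paddr, op);

        if (op == TO_PRIVATE)
                pvalidate_page(paddr, true);    /* validate after */
}

int main(void)
{
        puts("make page shared:");
        page_state_change(0x200000, TO_SHARED);
        puts("make page private:");
        page_state_change(0x200000, TO_PRIVATE);
        return 0;
}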
- */ - if (op == SNP_PAGE_STATE_PRIVATE && pvalidate(paddr, RMP_PG_SIZE_4K, 1)) - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); -} - -void snp_set_page_private(unsigned long paddr) -{ - __page_state_change(paddr, SNP_PAGE_STATE_PRIVATE); + __page_state_change(paddr, paddr, &d); } void snp_set_page_shared(unsigned long paddr) { - __page_state_change(paddr, SNP_PAGE_STATE_SHARED); + struct psc_desc d = { + SNP_PAGE_STATE_SHARED, + (struct svsm_ca *)boot_svsm_caa_pa, + boot_svsm_caa_pa + }; + + if (!sev_snp_enabled()) + return; + + __page_state_change(paddr, paddr, &d); } -static bool early_setup_ghcb(void) +bool early_setup_ghcb(void) { if (set_page_decrypted((unsigned long)&boot_ghcb_page)) return false; @@ -174,7 +87,7 @@ static bool early_setup_ghcb(void) boot_ghcb = &boot_ghcb_page; /* Initialize lookup tables for the instruction decoder */ - inat_init_tables(); + sev_insn_decode_init(); /* SNP guest requires the GHCB GPA must be registered */ if (sev_snp_enabled()) @@ -183,6 +96,18 @@ static bool early_setup_ghcb(void) return true; } +void snp_accept_memory(phys_addr_t start, phys_addr_t end) +{ + struct psc_desc d = { + SNP_PAGE_STATE_PRIVATE, + (struct svsm_ca *)boot_svsm_caa_pa, + boot_svsm_caa_pa + }; + + for (phys_addr_t pa = start; pa < end; pa += PAGE_SIZE) + __page_state_change(pa, pa, &d); +} + void sev_es_shutdown_ghcb(void) { if (!boot_ghcb) @@ -192,6 +117,16 @@ void sev_es_shutdown_ghcb(void) error("SEV-ES CPU Features missing."); /* + * This denotes whether to use the GHCB MSR protocol or the GHCB + * shared page to perform a GHCB request. Since the GHCB page is + * being changed to encrypted, it can't be used to perform GHCB + * requests. Clear the boot_ghcb variable so that the GHCB MSR + * protocol is used to change the GHCB page over to an encrypted + * page. + */ + boot_ghcb = NULL; + + /* * GHCB Page must be flushed from the cache and mapped encrypted again. * Otherwise the running kernel will see strange cache effects when * trying to use that page. 
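snp_accept_memory() above is the same page-state machinery applied across a range: the span is walked in 4 KiB steps and every page gets a private conversion. A tiny host-side model of that loop, with change_page() standing in for __page_state_change():

#include <stdio.h>

#define PAGE_SIZE 4096UL

typedef unsigned long long phys_addr_t;

static unsigned long accept_range(phys_addr_t start, phys_addr_t end,
                                  void (*change_page)(phys_addr_t pa))
{
        unsigned long n = 0;

        for (phys_addr_t pa = start; pa < end; pa += PAGE_SIZE, n++)
                change_page(pa);
        return n;
}

static void change_page(phys_addr_t pa)
{
        (void)pa;       /* real code issues a page-state-change request here */
}

int main(void)
{
        printf("pages accepted: %lu\n",
               accept_range(0x100000, 0x100000 + 16 * PAGE_SIZE, change_page));
        return 0;
}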
@@ -208,140 +143,94 @@ void sev_es_shutdown_ghcb(void) error("Can't unmap GHCB page"); } +static void __noreturn sev_es_ghcb_terminate(struct ghcb *ghcb, unsigned int set, + unsigned int reason, u64 exit_info_2) +{ + u64 exit_info_1 = SVM_VMGEXIT_TERM_REASON(set, reason); + + vc_ghcb_invalidate(ghcb); + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_TERM_REQUEST); + ghcb_set_sw_exit_info_1(ghcb, exit_info_1); + ghcb_set_sw_exit_info_2(ghcb, exit_info_2); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + while (true) + asm volatile("hlt\n" : : : "memory"); +} + bool sev_es_check_ghcb_fault(unsigned long address) { /* Check whether the fault was on the GHCB page */ return ((address & PAGE_MASK) == (unsigned long)&boot_ghcb_page); } -void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code) -{ - struct es_em_ctxt ctxt; - enum es_result result; - - if (!boot_ghcb && !early_setup_ghcb()) - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); - - vc_ghcb_invalidate(boot_ghcb); - result = vc_init_em_ctxt(&ctxt, regs, exit_code); - if (result != ES_OK) - goto finish; - - switch (exit_code) { - case SVM_EXIT_RDTSC: - case SVM_EXIT_RDTSCP: - result = vc_handle_rdtsc(boot_ghcb, &ctxt, exit_code); - break; - case SVM_EXIT_IOIO: - result = vc_handle_ioio(boot_ghcb, &ctxt); - break; - case SVM_EXIT_CPUID: - result = vc_handle_cpuid(boot_ghcb, &ctxt); - break; - default: - result = ES_UNSUPPORTED; - break; - } +/* + * SNP_FEATURES_IMPL_REQ is the mask of SNP features that will need + * guest side implementation for proper functioning of the guest. If any + * of these features are enabled in the hypervisor but are lacking guest + * side implementation, the behavior of the guest will be undefined. The + * guest could fail in non-obvious way making it difficult to debug. + * + * As the behavior of reserved feature bits is unknown to be on the + * safe side add them to the required features mask. + */ +#define SNP_FEATURES_IMPL_REQ (MSR_AMD64_SNP_VTOM | \ + MSR_AMD64_SNP_REFLECT_VC | \ + MSR_AMD64_SNP_RESTRICTED_INJ | \ + MSR_AMD64_SNP_ALT_INJ | \ + MSR_AMD64_SNP_DEBUG_SWAP | \ + MSR_AMD64_SNP_VMPL_SSS | \ + MSR_AMD64_SNP_SECURE_TSC | \ + MSR_AMD64_SNP_VMGEXIT_PARAM | \ + MSR_AMD64_SNP_VMSA_REG_PROT | \ + MSR_AMD64_SNP_RESERVED_BIT13 | \ + MSR_AMD64_SNP_RESERVED_BIT15 | \ + MSR_AMD64_SNP_SECURE_AVIC | \ + MSR_AMD64_SNP_RESERVED_MASK) + +#ifdef CONFIG_AMD_SECURE_AVIC +#define SNP_FEATURE_SECURE_AVIC MSR_AMD64_SNP_SECURE_AVIC +#else +#define SNP_FEATURE_SECURE_AVIC 0 +#endif -finish: - if (result == ES_OK) - vc_finish_insn(&ctxt); - else if (result != ES_RETRY) - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); -} +/* + * SNP_FEATURES_PRESENT is the mask of SNP features that are implemented + * by the guest kernel. As and when a new feature is implemented in the + * guest kernel, a corresponding bit should be added to the mask. + */ +#define SNP_FEATURES_PRESENT (MSR_AMD64_SNP_DEBUG_SWAP | \ + MSR_AMD64_SNP_SECURE_TSC | \ + SNP_FEATURE_SECURE_AVIC) -static void enforce_vmpl0(void) +u64 snp_get_unsupported_features(u64 status) { - u64 attrs; - int err; + if (!(status & MSR_AMD64_SEV_SNP_ENABLED)) + return 0; - /* - * RMPADJUST modifies RMP permissions of a lesser-privileged (numerically - * higher) privilege level. Here, clear the VMPL1 permission mask of the - * GHCB page. If the guest is not running at VMPL0, this will fail. - * - * If the guest is running at VMPL0, it will succeed. 
Even if that operation - * modifies permission bits, it is still ok to do so currently because Linux - * SNP guests are supported only on VMPL0 so VMPL1 or higher permission masks - * changing is a don't-care. - */ - attrs = 1; - if (rmpadjust((unsigned long)&boot_ghcb_page, RMP_PG_SIZE_4K, attrs)) - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_NOT_VMPL0); + return status & SNP_FEATURES_IMPL_REQ & ~SNP_FEATURES_PRESENT; } -void sev_enable(struct boot_params *bp) +void snp_check_features(void) { - unsigned int eax, ebx, ecx, edx; - struct msr m; - bool snp; - - /* - * bp->cc_blob_address should only be set by boot/compressed kernel. - * Initialize it to 0 to ensure that uninitialized values from - * buggy bootloaders aren't propagated. - */ - if (bp) - bp->cc_blob_address = 0; - - /* - * Setup/preliminary detection of SNP. This will be sanity-checked - * against CPUID/MSR values later. - */ - snp = snp_init(bp); - - /* Check for the SME/SEV support leaf */ - eax = 0x80000000; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - if (eax < 0x8000001f) - return; - - /* - * Check for the SME/SEV feature: - * CPUID Fn8000_001F[EAX] - * - Bit 0 - Secure Memory Encryption support - * - Bit 1 - Secure Encrypted Virtualization support - * CPUID Fn8000_001F[EBX] - * - Bits 5:0 - Pagetable bit position used to indicate encryption - */ - eax = 0x8000001f; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - /* Check whether SEV is supported */ - if (!(eax & BIT(1))) { - if (snp) - error("SEV-SNP support indicated by CC blob, but not CPUID."); - return; - } - - /* Set the SME mask if this is an SEV guest. */ - boot_rdmsr(MSR_AMD64_SEV, &m); - sev_status = m.q; - if (!(sev_status & MSR_AMD64_SEV_ENABLED)) - return; - - /* Negotiate the GHCB protocol version. */ - if (sev_status & MSR_AMD64_SEV_ES_ENABLED) { - if (!sev_es_negotiate_protocol()) - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_PROT_UNSUPPORTED); - } + u64 unsupported; /* - * SNP is supported in v2 of the GHCB spec which mandates support for HV - * features. + * Terminate the boot if hypervisor has enabled any feature lacking + * guest side implementation. Pass on the unsupported features mask through + * EXIT_INFO_2 of the GHCB protocol so that those features can be reported + * as part of the guest boot failure. */ - if (sev_status & MSR_AMD64_SEV_SNP_ENABLED) { - if (!(get_hv_features() & GHCB_HV_FT_SNP)) + unsupported = snp_get_unsupported_features(sev_status); + if (unsupported) { + if (ghcb_version < 2 || (!boot_ghcb && !early_setup_ghcb())) sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); - enforce_vmpl0(); + sev_es_ghcb_terminate(boot_ghcb, SEV_TERM_SET_GEN, + GHCB_SNP_UNSUPPORTED, unsupported); } - - if (snp && !(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) - error("SEV-SNP supported indicated by CC blob, but not SEV status MSR."); - - sme_me_mask = BIT_ULL(ebx & 0x3f); } /* Search for Confidential Computing blob in the EFI config table. */ @@ -393,7 +282,7 @@ found_cc_info: * Indicate SNP based on presence of SNP-specific CC blob. Subsequent checks * will verify the SNP CPUID/MSR bits. */ -bool snp_init(struct boot_params *bp) +static bool early_snp_init(struct boot_params *bp) { struct cc_blob_sev_info *cc_info; @@ -414,6 +303,13 @@ bool snp_init(struct boot_params *bp) setup_cpuid_table(cc_info); /* + * Record the SVSM Calling Area (CA) address if the guest is not + * running at VMPL0. The CA will be used to communicate with the + * SVSM and request its services. 
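snp_get_unsupported_features() above is pure mask algebra: anything the hypervisor enabled (the status MSR) that needs guest support (the required mask) but is not implemented (the present mask) must abort the boot. A sketch of that computation with made-up feature bits:

#include <stdint.h>
#include <stdio.h>

#define FEAT_A (1ULL << 3)      /* implemented by the guest */
#define FEAT_B (1ULL << 4)      /* not implemented */
#define FEAT_C (1ULL << 7)      /* not implemented */

#define REQUIRED (FEAT_A | FEAT_B | FEAT_C)
#define PRESENT  (FEAT_A)

static uint64_t unsupported_features(uint64_t status)
{
        return status & REQUIRED & ~PRESENT;
}

int main(void)
{
        uint64_t status = FEAT_A | FEAT_C;      /* what the hypervisor enabled */

        printf("unsupported mask: %#llx\n",
               (unsigned long long)unsupported_features(status));      /* 0x80 */
        return 0;
}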
+ */ + svsm_setup_ca(cc_info, rip_rel_ptr(&boot_ghcb_page)); + + /* * Pass run-time kernel a pointer to CC info via boot_params so EFI * config table doesn't need to be searched again during early startup * phase. @@ -423,6 +319,138 @@ bool snp_init(struct boot_params *bp) return true; } +/* + * sev_check_cpu_support - Check for SEV support in the CPU capabilities + * + * Returns < 0 if SEV is not supported, otherwise the position of the + * encryption bit in the page table descriptors. + */ +static int sev_check_cpu_support(void) +{ + unsigned int eax, ebx, ecx, edx; + + /* Check for the SME/SEV support leaf */ + eax = 0x80000000; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + if (eax < 0x8000001f) + return -ENODEV; + + /* + * Check for the SME/SEV feature: + * CPUID Fn8000_001F[EAX] + * - Bit 0 - Secure Memory Encryption support + * - Bit 1 - Secure Encrypted Virtualization support + * CPUID Fn8000_001F[EBX] + * - Bits 5:0 - Pagetable bit position used to indicate encryption + */ + eax = 0x8000001f; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + /* Check whether SEV is supported */ + if (!(eax & BIT(1))) + return -ENODEV; + + sev_snp_needs_sfw = !(ebx & BIT(31)); + + return ebx & 0x3f; +} + +void sev_enable(struct boot_params *bp) +{ + struct msr m; + int bitpos; + bool snp; + + /* + * bp->cc_blob_address should only be set by boot/compressed kernel. + * Initialize it to 0 to ensure that uninitialized values from + * buggy bootloaders aren't propagated. + */ + if (bp) + bp->cc_blob_address = 0; + + /* + * Do an initial SEV capability check before early_snp_init() which + * loads the CPUID page and the same checks afterwards are done + * without the hypervisor and are trustworthy. + * + * If the HV fakes SEV support, the guest will crash'n'burn + * which is good enough. + */ + + if (sev_check_cpu_support() < 0) + return; + + /* + * Setup/preliminary detection of SNP. This will be sanity-checked + * against CPUID/MSR values later. + */ + snp = early_snp_init(bp); + + /* Now repeat the checks with the SNP CPUID table. */ + + bitpos = sev_check_cpu_support(); + if (bitpos < 0) { + if (snp) + error("SEV-SNP support indicated by CC blob, but not CPUID."); + return; + } + + /* Set the SME mask if this is an SEV guest. */ + raw_rdmsr(MSR_AMD64_SEV, &m); + sev_status = m.q; + if (!(sev_status & MSR_AMD64_SEV_ENABLED)) + return; + + /* Negotiate the GHCB protocol version. */ + if (sev_status & MSR_AMD64_SEV_ES_ENABLED) { + if (!sev_es_negotiate_protocol()) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_PROT_UNSUPPORTED); + } + + /* + * SNP is supported in v2 of the GHCB spec which mandates support for HV + * features. + */ + if (sev_status & MSR_AMD64_SEV_SNP_ENABLED) { + u64 hv_features; + + hv_features = get_hv_features(); + if (!(hv_features & GHCB_HV_FT_SNP)) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); + + /* + * Running at VMPL0 is required unless an SVSM is present and + * the hypervisor supports the required SVSM GHCB events. + */ + if (snp_vmpl && !(hv_features & GHCB_HV_FT_SNP_MULTI_VMPL)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_NOT_VMPL0); + } + + if (snp && !(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) + error("SEV-SNP supported indicated by CC blob, but not SEV status MSR."); + + sme_me_mask = BIT_ULL(bitpos); +} + +/* + * sev_get_status - Retrieve the SEV status mask + * + * Returns 0 if the CPU is not SEV capable, otherwise the value of the + * AMD64_SEV MSR. 
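As the comment block in sev_check_cpu_support() above spells out, CPUID Fn8000_001F EBX[5:0] encodes the page-table bit position used to mark a page encrypted, and sev_enable() turns that into sme_me_mask with BIT_ULL(). A sketch of that derivation using a made-up sample EBX value rather than a real CPUID read:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t ebx = 0x173;                   /* sample: C-bit at position 51 */
        int bitpos = ebx & 0x3f;
        uint64_t sme_me_mask = 1ULL << bitpos;

        printf("C-bit position: %d, mask: %#llx\n",
               bitpos, (unsigned long long)sme_me_mask);
        return 0;
}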
+ */ +u64 sev_get_status(void) +{ + struct msr m; + + if (sev_check_cpu_support() < 0) + return 0; + + raw_rdmsr(MSR_AMD64_SEV, &m); + return m.q; +} + void sev_prep_identity_maps(unsigned long top_level_pgt) { /* @@ -432,7 +460,7 @@ void sev_prep_identity_maps(unsigned long top_level_pgt) * accessed after switchover. */ if (sev_snp_enabled()) { - unsigned long cc_info_pa = boot_params->cc_blob_address; + unsigned long cc_info_pa = boot_params_ptr->cc_blob_address; struct cc_blob_sev_info *cc_info; kernel_add_identity_map(cc_info_pa, cc_info_pa + sizeof(*cc_info)); @@ -443,3 +471,42 @@ void sev_prep_identity_maps(unsigned long top_level_pgt) sev_verify_cbit(top_level_pgt); } + +bool early_is_sevsnp_guest(void) +{ + static bool sevsnp; + + if (sevsnp) + return true; + + if (!(sev_get_status() & MSR_AMD64_SEV_SNP_ENABLED)) + return false; + + sevsnp = true; + + if (!snp_vmpl) { + unsigned int eax, ebx, ecx, edx; + + /* + * CPUID Fn8000_001F_EAX[28] - SVSM support + */ + eax = 0x8000001f; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + if (eax & BIT(28)) { + struct msr m; + + /* Obtain the address of the calling area to use */ + raw_rdmsr(MSR_SVSM_CAA, &m); + boot_svsm_caa_pa = m.q; + + /* + * The real VMPL level cannot be discovered, but the + * memory acceptance routines make no use of that so + * any non-zero value suffices here. + */ + snp_vmpl = U8_MAX; + } + } + return true; +} diff --git a/arch/x86/boot/compressed/sev.h b/arch/x86/boot/compressed/sev.h new file mode 100644 index 000000000000..22637b416b46 --- /dev/null +++ b/arch/x86/boot/compressed/sev.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * AMD SEV header for early boot related functions. + * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + */ + +#ifndef BOOT_COMPRESSED_SEV_H +#define BOOT_COMPRESSED_SEV_H + +#ifdef CONFIG_AMD_MEM_ENCRYPT + +#include <asm/shared/msr.h> + +void snp_accept_memory(phys_addr_t start, phys_addr_t end); +u64 sev_get_status(void); +bool early_is_sevsnp_guest(void); + +static inline u64 sev_es_rd_ghcb_msr(void) +{ + struct msr m; + + raw_rdmsr(MSR_AMD64_SEV_ES_GHCB, &m); + + return m.q; +} + +static inline void sev_es_wr_ghcb_msr(u64 val) +{ + struct msr m; + + m.q = val; + raw_wrmsr(MSR_AMD64_SEV_ES_GHCB, &m); +} + +#else + +static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { } +static inline u64 sev_get_status(void) { return 0; } +static inline bool early_is_sevsnp_guest(void) { return false; } + +#endif + +#endif diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c index 81fc1eaa3229..9af19d9614cb 100644 --- a/arch/x86/boot/compressed/string.c +++ b/arch/x86/boot/compressed/string.c @@ -15,9 +15,9 @@ static void *____memcpy(void *dest, const void *src, size_t n) { int d0, d1, d2; asm volatile( - "rep ; movsl\n\t" + "rep movsl\n\t" "movl %4,%%ecx\n\t" - "rep ; movsb\n\t" + "rep movsb" : "=&c" (d0), "=&D" (d1), "=&S" (d2) : "0" (n >> 2), "g" (n & 3), "1" (dest), "2" (src) : "memory"); @@ -29,9 +29,9 @@ static void *____memcpy(void *dest, const void *src, size_t n) { long d0, d1, d2; asm volatile( - "rep ; movsq\n\t" + "rep movsq\n\t" "movq %4,%%rcx\n\t" - "rep ; movsb\n\t" + "rep movsb" : "=&c" (d0), "=&D" (d1), "=&S" (d2) : "0" (n >> 3), "g" (n & 7), "1" (dest), "2" (src) : "memory"); diff --git a/arch/x86/boot/compressed/tdx-shared.c b/arch/x86/boot/compressed/tdx-shared.c new file mode 100644 index 000000000000..5ac43762fe13 --- /dev/null +++ b/arch/x86/boot/compressed/tdx-shared.c @@ -0,0 +1,2 @@ +#include "error.h" 
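The raw_rdmsr()/raw_wrmsr() helpers used throughout this patch move MSR contents through struct msr, which overlays a 64-bit view (m.q) with the 32-bit halves (m.l/m.h) that RDMSR/WRMSR exchange in EDX:EAX. A little-endian user-space model of that overlay; the union below is a stand-in for the kernel's struct msr, not its definition.

#include <stdint.h>
#include <stdio.h>

union msr_val {
        uint64_t q;
        struct { uint32_t l, h; };      /* low/high halves on x86 */
};

int main(void)
{
        union msr_val m = { .q = 0x123456789abcdef0ULL };

        m.l &= ~(1u << 15);             /* clear a bit in the low 32 bits only */

        printf("q=%#llx l=%#x h=%#x\n",
               (unsigned long long)m.q, m.l, m.h);
        return 0;
}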
+#include "../../coco/tdx/tdx-shared.c" diff --git a/arch/x86/boot/compressed/tdx.c b/arch/x86/boot/compressed/tdx.c index 918a7606f53c..8451d6a1030c 100644 --- a/arch/x86/boot/compressed/tdx.c +++ b/arch/x86/boot/compressed/tdx.c @@ -18,15 +18,15 @@ void __tdx_hypercall_failed(void) static inline unsigned int tdx_io_in(int size, u16 port) { - struct tdx_hypercall_args args = { + struct tdx_module_args args = { .r10 = TDX_HYPERCALL_STANDARD, - .r11 = EXIT_REASON_IO_INSTRUCTION, + .r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION), .r12 = size, .r13 = 0, .r14 = port, }; - if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT)) + if (__tdx_hypercall(&args)) return UINT_MAX; return args.r11; @@ -34,16 +34,16 @@ static inline unsigned int tdx_io_in(int size, u16 port) static inline void tdx_io_out(int size, u16 port, u32 value) { - struct tdx_hypercall_args args = { + struct tdx_module_args args = { .r10 = TDX_HYPERCALL_STANDARD, - .r11 = EXIT_REASON_IO_INSTRUCTION, + .r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION), .r12 = size, .r13 = 1, .r14 = port, .r15 = value, }; - __tdx_hypercall(&args, 0); + __tdx_hypercall(&args); } static inline u8 tdx_inb(u16 port) diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S index 112b2375d021..587ce3e7c504 100644 --- a/arch/x86/boot/compressed/vmlinux.lds.S +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -34,6 +34,7 @@ SECTIONS _text = .; /* Text */ *(.text) *(.text.*) + *(.noinstr.text) _etext = . ; } .rodata : { @@ -42,11 +43,21 @@ SECTIONS *(.rodata.*) _erodata = . ; } - .data : { +#ifdef CONFIG_EFI_SBAT + .sbat : ALIGN(0x1000) { + _sbat = . ; + *(.sbat) + _esbat = ALIGN(0x1000); + . = _esbat; + } +#endif + .data : ALIGN(0x1000) { _data = . ; *(.data) *(.data.*) - *(.bss.efistub) + + /* Add 4 bytes of extra space for the obsolete CRC-32 checksum */ + . = ALIGN(. + 4, 0x200); _edata = . ; } . 
= ALIGN(L1_CACHE_BYTES); diff --git a/arch/x86/boot/copy.S b/arch/x86/boot/copy.S index 6afd05e819d2..3973a67cd04e 100644 --- a/arch/x86/boot/copy.S +++ b/arch/x86/boot/copy.S @@ -22,10 +22,10 @@ SYM_FUNC_START_NOALIGN(memcpy) movw %dx, %si pushw %cx shrw $2, %cx - rep; movsl + rep movsl popw %cx andw $3, %cx - rep; movsb + rep movsb popw %di popw %si retl @@ -38,10 +38,10 @@ SYM_FUNC_START_NOALIGN(memset) imull $0x01010101,%eax pushw %cx shrw $2, %cx - rep; stosl + rep stosl popw %cx andw $3, %cx - rep; stosb + rep stosb popw %di retl SYM_FUNC_END(memset) diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c index 0bbf4f3707d2..feb6dbd7ca86 100644 --- a/arch/x86/boot/cpu.c +++ b/arch/x86/boot/cpu.c @@ -14,9 +14,7 @@ */ #include "boot.h" -#ifdef CONFIG_X86_FEATURE_NAMES #include "cpustr.h" -#endif static char *cpu_name(int level) { @@ -35,7 +33,6 @@ static char *cpu_name(int level) static void show_cap_strs(u32 *err_flags) { int i, j; -#ifdef CONFIG_X86_FEATURE_NAMES const unsigned char *msg_strs = (const unsigned char *)x86_cap_strs; for (i = 0; i < NCAPINTS; i++) { u32 e = err_flags[i]; @@ -58,16 +55,6 @@ static void show_cap_strs(u32 *err_flags) e >>= 1; } } -#else - for (i = 0; i < NCAPINTS; i++) { - u32 e = err_flags[i]; - for (j = 0; j < 32; j++) { - if (e & 1) - printf("%d:%d ", i, j); - e >>= 1; - } - } -#endif } int validate_cpu(void) diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c index fed8d13ce252..2e1bb936cba2 100644 --- a/arch/x86/boot/cpucheck.c +++ b/arch/x86/boot/cpucheck.c @@ -22,12 +22,13 @@ # include "boot.h" #endif #include <linux/types.h> +#include <asm/cpufeaturemasks.h> #include <asm/intel-family.h> #include <asm/processor-flags.h> -#include <asm/required-features.h> #include <asm/msr-index.h> +#include <asm/shared/msr.h> + #include "string.h" -#include "msr.h" static u32 err_flags[NCAPINTS]; @@ -133,9 +134,9 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr) struct msr m; - boot_rdmsr(MSR_K7_HWCR, &m); + raw_rdmsr(MSR_K7_HWCR, &m); m.l &= ~(1 << 15); - boot_wrmsr(MSR_K7_HWCR, &m); + raw_wrmsr(MSR_K7_HWCR, &m); get_cpuflags(); /* Make sure it really did something */ err = check_cpuflags(); @@ -147,9 +148,9 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr) struct msr m; - boot_rdmsr(MSR_VIA_FCR, &m); + raw_rdmsr(MSR_VIA_FCR, &m); m.l |= (1 << 1) | (1 << 7); - boot_wrmsr(MSR_VIA_FCR, &m); + raw_wrmsr(MSR_VIA_FCR, &m); set_bit(X86_FEATURE_CX8, cpu.flags); err = check_cpuflags(); @@ -159,14 +160,14 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr) struct msr m, m_tmp; u32 level = 1; - boot_rdmsr(0x80860004, &m); + raw_rdmsr(0x80860004, &m); m_tmp = m; m_tmp.l = ~0; - boot_wrmsr(0x80860004, &m_tmp); + raw_wrmsr(0x80860004, &m_tmp); asm("cpuid" : "+a" (level), "=d" (cpu.flags[0]) : : "ecx", "ebx"); - boot_wrmsr(0x80860004, &m); + raw_wrmsr(0x80860004, &m); err = check_cpuflags(); } else if (err == 0x01 && @@ -203,7 +204,7 @@ int check_knl_erratum(void) */ if (!is_intel() || cpu.family != 6 || - cpu.model != INTEL_FAM6_XEON_PHI_KNL) + cpu.model != 0x57 /*INTEL_XEON_PHI_KNL*/) return 0; /* diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c index a83d67ec627d..916bac09b464 100644 --- a/arch/x86/boot/cpuflags.c +++ b/arch/x86/boot/cpuflags.c @@ -3,7 +3,6 @@ #include "bitops.h" #include <asm/processor-flags.h> -#include <asm/required-features.h> #include <asm/msr-index.h> #include "cpuflags.h" @@ -29,55 +28,38 @@ static int has_fpu(void) return fsw == 0 && (fcw & 
0x103f) == 0x003f; } +#ifdef CONFIG_X86_32 /* * For building the 16-bit code we want to explicitly specify 32-bit * push/pop operations, rather than just saying 'pushf' or 'popf' and - * letting the compiler choose. But this is also included from the - * compressed/ directory where it may be 64-bit code, and thus needs - * to be 'pushfq' or 'popfq' in that case. + * letting the compiler choose. */ -#ifdef __x86_64__ -#define PUSHF "pushfq" -#define POPF "popfq" -#else -#define PUSHF "pushfl" -#define POPF "popfl" -#endif - -int has_eflag(unsigned long mask) +bool has_eflag(unsigned long mask) { unsigned long f0, f1; - asm volatile(PUSHF " \n\t" - PUSHF " \n\t" + asm volatile("pushfl \n\t" + "pushfl \n\t" "pop %0 \n\t" "mov %0,%1 \n\t" "xor %2,%1 \n\t" "push %1 \n\t" - POPF " \n\t" - PUSHF " \n\t" + "popfl \n\t" + "pushfl \n\t" "pop %1 \n\t" - POPF + "popfl" : "=&r" (f0), "=&r" (f1) : "ri" (mask)); return !!((f0^f1) & mask); } - -/* Handle x86_32 PIC using ebx. */ -#if defined(__i386__) && defined(__PIC__) -# define EBX_REG "=r" -#else -# define EBX_REG "=b" #endif void cpuid_count(u32 id, u32 count, u32 *a, u32 *b, u32 *c, u32 *d) { - asm volatile(".ifnc %%ebx,%3 ; movl %%ebx,%3 ; .endif \n\t" - "cpuid \n\t" - ".ifnc %%ebx,%3 ; xchgl %%ebx,%3 ; .endif \n\t" - : "=a" (*a), "=c" (*c), "=d" (*d), EBX_REG (*b) - : "a" (id), "c" (count) + asm volatile("cpuid" + : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d) + : "0" (id), "2" (count) ); } diff --git a/arch/x86/boot/cpuflags.h b/arch/x86/boot/cpuflags.h index 475b8fde90f7..a398d9204ad0 100644 --- a/arch/x86/boot/cpuflags.h +++ b/arch/x86/boot/cpuflags.h @@ -15,8 +15,13 @@ struct cpu_features { extern struct cpu_features cpu; extern u32 cpu_vendor[3]; -int has_eflag(unsigned long mask); +#ifdef CONFIG_X86_32 +bool has_eflag(unsigned long mask); +#else +static inline bool has_eflag(unsigned long mask) { return true; } +#endif void get_cpuflags(void); void cpuid_count(u32 id, u32 count, u32 *a, u32 *b, u32 *c, u32 *d); +bool has_cpuflag(int flag); #endif diff --git a/arch/x86/boot/genimage.sh b/arch/x86/boot/genimage.sh index c9299aeb7333..3882ead513f7 100644 --- a/arch/x86/boot/genimage.sh +++ b/arch/x86/boot/genimage.sh @@ -22,6 +22,7 @@ # This script requires: # bash # syslinux +# genisoimage # mtools (for fdimage* and hdimage) # edk2/OVMF (for hdimage) # @@ -251,7 +252,9 @@ geniso() { cp "$isolinux" "$ldlinux" "$tmp_dir" cp "$FBZIMAGE" "$tmp_dir"/linux echo default linux "$KCMDLINE" > "$tmp_dir"/isolinux.cfg - cp "${FDINITRDS[@]}" "$tmp_dir"/ + if [ ${#FDINITRDS[@]} -gt 0 ]; then + cp "${FDINITRDS[@]}" "$tmp_dir"/ + fi genisoimage -J -r -appid 'LINUX_BOOT' -input-charset=utf-8 \ -quiet -o "$FIMAGE" -b isolinux.bin \ -c boot.cat -no-emul-boot -boot-load-size 4 \ diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index f912d7770130..9bea5a1e2c52 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -36,76 +36,31 @@ SYSSEG = 0x1000 /* historical load address >> 4 */ #define ROOT_RDONLY 1 #endif + .set salign, 0x1000 + .set falign, 0x200 + .code16 .section ".bstext", "ax" - - .global bootsect_start -bootsect_start: #ifdef CONFIG_EFI_STUB # "MZ", MS-DOS header - .word MZ_MAGIC -#endif - - # Normalize the start address - ljmp $BOOTSEG, $start2 - -start2: - movw %cs, %ax - movw %ax, %ds - movw %ax, %es - movw %ax, %ss - xorw %sp, %sp - sti - cld - - movw $bugger_off_msg, %si - -msg_loop: - lodsb - andb %al, %al - jz bs_die - movb $0xe, %ah - movw $7, %bx - int $0x10 - jmp msg_loop - -bs_die: - # Allow the user to press a key, then 
reboot - xorw %ax, %ax - int $0x16 - int $0x19 - - # int 0x19 should never return. In case it does anyway, - # invoke the BIOS reset code... - ljmp $0xf000,$0xfff0 - -#ifdef CONFIG_EFI_STUB - .org 0x3c + .word IMAGE_DOS_SIGNATURE + .org 0x38 # # Offset to the PE header. # + .long LINUX_PE_MAGIC .long pe_header -#endif /* CONFIG_EFI_STUB */ - - .section ".bsdata", "a" -bugger_off_msg: - .ascii "Use a boot loader.\r\n" - .ascii "\n" - .ascii "Remove disk and press any key to reboot...\r\n" - .byte 0 - -#ifdef CONFIG_EFI_STUB pe_header: - .long PE_MAGIC + .long IMAGE_NT_SIGNATURE coff_header: #ifdef CONFIG_X86_32 .set image_file_add_flags, IMAGE_FILE_32BIT_MACHINE - .set pe_opt_magic, PE_OPT_MAGIC_PE32 + .set pe_opt_magic, IMAGE_NT_OPTIONAL_HDR32_MAGIC .word IMAGE_FILE_MACHINE_I386 #else .set image_file_add_flags, 0 - .set pe_opt_magic, PE_OPT_MAGIC_PE32PLUS + .set pe_opt_magic, IMAGE_NT_OPTIONAL_HDR64_MAGIC .word IMAGE_FILE_MACHINE_AMD64 #endif .word section_count # nr_sections @@ -123,30 +78,26 @@ optional_header: .byte 0x02 # MajorLinkerVersion .byte 0x14 # MinorLinkerVersion - # Filled in by build.c - .long 0 # SizeOfCode + .long ZO__data # SizeOfCode - .long 0 # SizeOfInitializedData + .long ZO__end - ZO__data # SizeOfInitializedData .long 0 # SizeOfUninitializedData - # Filled in by build.c - .long 0x0000 # AddressOfEntryPoint + .long setup_size + ZO_efi_pe_entry # AddressOfEntryPoint - .long 0x0200 # BaseOfCode + .long setup_size # BaseOfCode #ifdef CONFIG_X86_32 .long 0 # data #endif extra_header_fields: - # PE specification requires ImageBase to be 64k aligned - .set image_base, (LOAD_PHYSICAL_ADDR + 0xffff) & ~0xffff #ifdef CONFIG_X86_32 - .long image_base # ImageBase + .long 0 # ImageBase #else - .quad image_base # ImageBase + .quad 0 # ImageBase #endif - .long 0x20 # SectionAlignment - .long 0x20 # FileAlignment + .long salign # SectionAlignment + .long falign # FileAlignment .word 0 # MajorOperatingSystemVersion .word 0 # MinorOperatingSystemVersion .word LINUX_EFISTUB_MAJOR_VERSION # MajorImageVersion @@ -155,19 +106,12 @@ extra_header_fields: .word 0 # MinorSubsystemVersion .long 0 # Win32VersionValue - # - # The size of the bzImage is written in tools/build.c - # - .long 0 # SizeOfImage + .long setup_size + ZO__end # SizeOfImage - .long 0x200 # SizeOfHeaders + .long salign # SizeOfHeaders .long 0 # CheckSum .word IMAGE_SUBSYSTEM_EFI_APPLICATION # Subsystem (EFI application) -#ifdef CONFIG_EFI_DXE_MEM_ATTRIBUTES - .word IMAGE_DLL_CHARACTERISTICS_NX_COMPAT # DllCharacteristics -#else - .word 0 # DllCharacteristics -#endif + .word IMAGE_DLLCHARACTERISTICS_NX_COMPAT # DllCharacteristics #ifdef CONFIG_X86_32 .long 0 # SizeOfStackReserve .long 0 # SizeOfStackCommit @@ -191,87 +135,90 @@ extra_header_fields: # Section table section_table: - # - # The offset & size fields are filled in by build.c. - # .ascii ".setup" .byte 0 .byte 0 - .long 0 - .long 0x0 # startup_{32,64} - .long 0 # Size of initialized data - # on disk - .long 0x0 # startup_{32,64} - .long 0 # PointerToRelocations - .long 0 # PointerToLineNumbers - .word 0 # NumberOfRelocations - .word 0 # NumberOfLineNumbers - .long IMAGE_SCN_CNT_CODE | \ - IMAGE_SCN_MEM_READ | \ - IMAGE_SCN_MEM_EXECUTE | \ - IMAGE_SCN_ALIGN_16BYTES # Characteristics + .long pecompat_fstart - salign # VirtualSize + .long salign # VirtualAddress + .long pecompat_fstart - salign # SizeOfRawData + .long salign # PointerToRawData - # - # The EFI application loader requires a relocation section - # because EFI applications must be relocatable. 
The .reloc - # offset & size fields are filled in by build.c. - # - .ascii ".reloc" - .byte 0 - .byte 0 - .long 0 - .long 0 - .long 0 # SizeOfRawData - .long 0 # PointerToRawData - .long 0 # PointerToRelocations - .long 0 # PointerToLineNumbers - .word 0 # NumberOfRelocations - .word 0 # NumberOfLineNumbers + .long 0, 0, 0 .long IMAGE_SCN_CNT_INITIALIZED_DATA | \ IMAGE_SCN_MEM_READ | \ - IMAGE_SCN_MEM_DISCARDABLE | \ - IMAGE_SCN_ALIGN_1BYTES # Characteristics + IMAGE_SCN_MEM_DISCARDABLE # Characteristics #ifdef CONFIG_EFI_MIXED - # - # The offset & size fields are filled in by build.c. - # .asciz ".compat" - .long 0 - .long 0x0 - .long 0 # Size of initialized data - # on disk - .long 0x0 - .long 0 # PointerToRelocations - .long 0 # PointerToLineNumbers - .word 0 # NumberOfRelocations - .word 0 # NumberOfLineNumbers + + .long pecompat_fsize # VirtualSize + .long pecompat_fstart # VirtualAddress + .long pecompat_fsize # SizeOfRawData + .long pecompat_fstart # PointerToRawData + + .long 0, 0, 0 .long IMAGE_SCN_CNT_INITIALIZED_DATA | \ IMAGE_SCN_MEM_READ | \ - IMAGE_SCN_MEM_DISCARDABLE | \ - IMAGE_SCN_ALIGN_1BYTES # Characteristics + IMAGE_SCN_MEM_DISCARDABLE # Characteristics + + /* + * Put the IA-32 machine type and the associated entry point address in + * the .compat section, so loaders can figure out which other execution + * modes this image supports. + */ + .pushsection ".pecompat", "a", @progbits + .balign salign + .globl pecompat_fstart +pecompat_fstart: + .byte 0x1 # Version + .byte 8 # Size + .word IMAGE_FILE_MACHINE_I386 # PE machine type + .long setup_size + ZO_efi32_pe_entry # Entrypoint + .byte 0x0 # Sentinel + .popsection +#else + .set pecompat_fstart, setup_size #endif - - # - # The offset & size fields are filled in by build.c. - # - .ascii ".text" - .byte 0 - .byte 0 - .byte 0 - .long 0 - .long 0x0 # startup_{32,64} - .long 0 # Size of initialized data - # on disk - .long 0x0 # startup_{32,64} + .ascii ".text\0\0\0" + .long textsize # VirtualSize + .long setup_size # VirtualAddress + .long textsize # SizeOfRawData + .long setup_size # PointerToRawData .long 0 # PointerToRelocations .long 0 # PointerToLineNumbers .word 0 # NumberOfRelocations .word 0 # NumberOfLineNumbers .long IMAGE_SCN_CNT_CODE | \ IMAGE_SCN_MEM_READ | \ - IMAGE_SCN_MEM_EXECUTE | \ - IMAGE_SCN_ALIGN_16BYTES # Characteristics + IMAGE_SCN_MEM_EXECUTE # Characteristics + +#ifdef CONFIG_EFI_SBAT + .ascii ".sbat\0\0\0" + .long ZO__esbat - ZO__sbat # VirtualSize + .long setup_size + ZO__sbat # VirtualAddress + .long ZO__esbat - ZO__sbat # SizeOfRawData + .long setup_size + ZO__sbat # PointerToRawData + + .long 0, 0, 0 + .long IMAGE_SCN_CNT_INITIALIZED_DATA | \ + IMAGE_SCN_MEM_READ | \ + IMAGE_SCN_MEM_DISCARDABLE # Characteristics + + .set textsize, ZO__sbat +#else + .set textsize, ZO__data +#endif + + .ascii ".data\0\0\0" + .long ZO__end - ZO__data # VirtualSize + .long setup_size + ZO__data # VirtualAddress + .long ZO__edata - ZO__data # SizeOfRawData + .long setup_size + ZO__data # PointerToRawData + + .long 0, 0, 0 + .long IMAGE_SCN_CNT_INITIALIZED_DATA | \ + IMAGE_SCN_MEM_READ | \ + IMAGE_SCN_MEM_WRITE # Characteristics .set section_count, (. 
- section_table) / 40 #endif /* CONFIG_EFI_STUB */ @@ -285,12 +232,12 @@ sentinel: .byte 0xff, 0xff /* Used to detect broken loaders */ .globl hdr hdr: -setup_sects: .byte 0 /* Filled in by build.c */ + .byte setup_sects - 1 root_flags: .word ROOT_RDONLY -syssize: .long 0 /* Filled in by build.c */ +syssize: .long ZO__edata / 16 ram_size: .word 0 /* Obsolete */ vid_mode: .word SVGA_MODE -root_dev: .word 0 /* Filled in by build.c */ +root_dev: .word 0 /* Default to major/minor 0/0 */ boot_flag: .word 0xAA55 # offset 512, entry point @@ -320,7 +267,7 @@ start_sys_seg: .word SYSSEG # obsolete and meaningless, but just type_of_loader: .byte 0 # 0 means ancient bootloader, newer # bootloaders know to change this. - # See Documentation/x86/boot.rst for + # See Documentation/arch/x86/boot.rst for # assigned ids # flags, unused bits must be zero (RFU) bit within loadflags @@ -406,7 +353,7 @@ xloadflags: # define XLF1 0 #endif -#ifdef CONFIG_EFI_STUB +#ifdef CONFIG_EFI_HANDOVER_PROTOCOL # ifdef CONFIG_EFI_MIXED # define XLF23 (XLF_EFI_HANDOVER_32|XLF_EFI_HANDOVER_64) # else @@ -427,12 +374,8 @@ xloadflags: #endif #ifdef CONFIG_X86_64 -#ifdef CONFIG_X86_5LEVEL #define XLF56 (XLF_5LEVEL|XLF_5LEVEL_ENABLED) #else -#define XLF56 XLF_5LEVEL -#endif -#else #define XLF56 0 #endif @@ -578,9 +521,25 @@ pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr # define INIT_SIZE VO_INIT_SIZE #endif + .macro __handover_offset +#ifndef CONFIG_EFI_HANDOVER_PROTOCOL + .long 0 +#elif !defined(CONFIG_X86_64) + .long ZO_efi32_stub_entry +#else + /* Yes, this is really how we defined it :( */ + .long ZO_efi64_stub_entry - 0x200 +#ifdef CONFIG_EFI_MIXED + .if ZO_efi32_stub_entry != ZO_efi64_stub_entry - 0x200 + .error "32-bit and 64-bit EFI entry points do not match" + .endif +#endif +#endif + .endm + init_size: .long INIT_SIZE # kernel initialization size -handover_offset: .long 0 # Filled in by build.c -kernel_info_offset: .long 0 # Filled in by build.c +handover_offset: __handover_offset +kernel_info_offset: .long ZO_kernel_info # End of setup header ##################################################### @@ -635,7 +594,7 @@ start_of_setup: xorl %eax, %eax subw %di, %cx shrw $2, %cx - rep; stosl + rep stosl # Jump to C code (should not return) calll main diff --git a/arch/x86/boot/install.sh b/arch/x86/boot/install.sh index 0849f4b42745..93784abcd66d 100755 --- a/arch/x86/boot/install.sh +++ b/arch/x86/boot/install.sh @@ -16,6 +16,8 @@ # $3 - kernel map file # $4 - default install path (blank if root directory) +set -e + if [ -f $4/vmlinuz ]; then mv $4/vmlinuz $4/vmlinuz.old fi diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c index c4ea5258ab55..9d0fea18d3c8 100644 --- a/arch/x86/boot/main.c +++ b/arch/x86/boot/main.c @@ -27,34 +27,32 @@ char *heap_end = _end; /* Default end of heap = no heap */ * screws up the old-style command line protocol, adjust by * filling in the new-style command line pointer instead. */ - static void copy_boot_params(void) { struct old_cmdline { u16 cl_magic; u16 cl_offset; }; - const struct old_cmdline * const oldcmd = - absolute_pointer(OLD_CL_ADDRESS); + const struct old_cmdline * const oldcmd = absolute_pointer(OLD_CL_ADDRESS); BUILD_BUG_ON(sizeof(boot_params) != 4096); memcpy(&boot_params.hdr, &hdr, sizeof(hdr)); - if (!boot_params.hdr.cmd_line_ptr && - oldcmd->cl_magic == OLD_CL_MAGIC) { - /* Old-style command line protocol. 
*/ + if (!boot_params.hdr.cmd_line_ptr && oldcmd->cl_magic == OLD_CL_MAGIC) { + /* Old-style command line protocol */ u16 cmdline_seg; - /* Figure out if the command line falls in the region - of memory that an old kernel would have copied up - to 0x90000... */ + /* + * Figure out if the command line falls in the region + * of memory that an old kernel would have copied up + * to 0x90000... + */ if (oldcmd->cl_offset < boot_params.hdr.setup_move_size) cmdline_seg = ds(); else cmdline_seg = 0x9000; - boot_params.hdr.cmd_line_ptr = - (cmdline_seg << 4) + oldcmd->cl_offset; + boot_params.hdr.cmd_line_ptr = (cmdline_seg << 4) + oldcmd->cl_offset; } } @@ -66,6 +64,7 @@ static void copy_boot_params(void) static void keyboard_init(void) { struct biosregs ireg, oreg; + initregs(&ireg); ireg.ah = 0x02; /* Get keyboard status */ @@ -83,8 +82,10 @@ static void query_ist(void) { struct biosregs ireg, oreg; - /* Some older BIOSes apparently crash on this call, so filter - it from machines too old to have SpeedStep at all. */ + /* + * Some older BIOSes apparently crash on this call, so filter + * it from machines too old to have SpeedStep at all. + */ if (cpu.level < 6) return; @@ -119,17 +120,13 @@ static void init_heap(void) char *stack_end; if (boot_params.hdr.loadflags & CAN_USE_HEAP) { - asm("leal %P1(%%esp),%0" - : "=r" (stack_end) : "i" (-STACK_SIZE)); - - heap_end = (char *) - ((size_t)boot_params.hdr.heap_end_ptr + 0x200); + stack_end = (char *) (current_stack_pointer - STACK_SIZE); + heap_end = (char *) ((size_t)boot_params.hdr.heap_end_ptr + 0x200); if (heap_end > stack_end) heap_end = stack_end; } else { /* Boot protocol 2.00 only, no heap available */ - puts("WARNING: Ancient bootloader, some functionality " - "may be limited!\n"); + puts("WARNING: Ancient bootloader, some functionality may be limited!\n"); } } @@ -150,12 +147,11 @@ void main(void) /* Make sure we have all the proper CPU support */ if (validate_cpu()) { - puts("Unable to boot - please use a kernel appropriate " - "for your CPU.\n"); + puts("Unable to boot - please use a kernel appropriate for your CPU.\n"); die(); } - /* Tell the BIOS what CPU mode we intend to run in. */ + /* Tell the BIOS what CPU mode we intend to run in */ set_bios_mode(); /* Detect memory layout */ diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c index da0ccc5de538..22d730b227e3 100644 --- a/arch/x86/boot/mkcpustr.c +++ b/arch/x86/boot/mkcpustr.c @@ -12,8 +12,6 @@ #include <stdio.h> -#include "../include/asm/required-features.h" -#include "../include/asm/disabled-features.h" #include "../include/asm/cpufeatures.h" #include "../include/asm/vmxfeatures.h" #include "../kernel/cpu/capflags.c" @@ -23,6 +21,7 @@ int main(void) int i, j; const char *str; + printf("#include <asm/cpufeaturemasks.h>\n\n"); printf("static const char x86_cap_strs[] =\n"); for (i = 0; i < NCAPINTS; i++) { diff --git a/arch/x86/boot/msr.h b/arch/x86/boot/msr.h deleted file mode 100644 index aed66f7ae199..000000000000 --- a/arch/x86/boot/msr.h +++ /dev/null @@ -1,26 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Helpers/definitions related to MSR access. - */ - -#ifndef BOOT_MSR_H -#define BOOT_MSR_H - -#include <asm/shared/msr.h> - -/* - * The kernel proper already defines rdmsr()/wrmsr(), but they are not for the - * boot kernel since they rely on tracepoint/exception handling infrastructure - * that's not available here. 
- */ -static inline void boot_rdmsr(unsigned int reg, struct msr *m) -{ - asm volatile("rdmsr" : "=a" (m->l), "=d" (m->h) : "c" (reg)); -} - -static inline void boot_wrmsr(unsigned int reg, const struct msr *m) -{ - asm volatile("wrmsr" : : "c" (reg), "a"(m->l), "d" (m->h) : "memory"); -} - -#endif /* BOOT_MSR_H */ diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c index 40031a614712..5941f930f6c5 100644 --- a/arch/x86/boot/pm.c +++ b/arch/x86/boot/pm.c @@ -11,6 +11,7 @@ */ #include "boot.h" +#include <asm/desc_defs.h> #include <asm/segment.h> /* @@ -67,13 +68,13 @@ static void setup_gdt(void) being 8-byte unaligned. Intel recommends 16 byte alignment. */ static const u64 boot_gdt[] __attribute__((aligned(16))) = { /* CS: code, read/execute, 4 GB, base 0 */ - [GDT_ENTRY_BOOT_CS] = GDT_ENTRY(0xc09b, 0, 0xfffff), + [GDT_ENTRY_BOOT_CS] = GDT_ENTRY(DESC_CODE32, 0, 0xfffff), /* DS: data, read/write, 4 GB, base 0 */ - [GDT_ENTRY_BOOT_DS] = GDT_ENTRY(0xc093, 0, 0xfffff), + [GDT_ENTRY_BOOT_DS] = GDT_ENTRY(DESC_DATA32, 0, 0xfffff), /* TSS: 32-bit tss, 104 bytes, base 4096 */ /* We only have a TSS here to keep Intel VT happy; we don't actually use it for anything. */ - [GDT_ENTRY_BOOT_TSS] = GDT_ENTRY(0x0089, 4096, 103), + [GDT_ENTRY_BOOT_TSS] = GDT_ENTRY(DESC_TSS32, 4096, 103), }; /* Xen HVM incorrectly stores a pointer to the gdt_ptr, instead of the gdt_ptr contents. Thus, make it static so it will diff --git a/arch/x86/boot/printf.c b/arch/x86/boot/printf.c index 1237beeb9540..51dc14b714f6 100644 --- a/arch/x86/boot/printf.c +++ b/arch/x86/boot/printf.c @@ -246,6 +246,7 @@ int vsprintf(char *buf, const char *fmt, va_list args) case 'x': flags |= SMALL; + fallthrough; case 'X': base = 16; break; @@ -253,6 +254,8 @@ int vsprintf(char *buf, const char *fmt, va_list args) case 'd': case 'i': flags |= SIGN; + break; + case 'u': break; diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld index 49546c247ae2..e1d594a60204 100644 --- a/arch/x86/boot/setup.ld +++ b/arch/x86/boot/setup.ld @@ -10,10 +10,11 @@ ENTRY(_start) SECTIONS { . = 0; - .bstext : { *(.bstext) } - .bsdata : { *(.bsdata) } + .bstext : { + *(.bstext) + . = 495; + } =0xffffffff - . = 495; .header : { *(.header) } .entrytext : { *(.entrytext) } .inittext : { *(.inittext) } @@ -23,6 +24,9 @@ SECTIONS .text : { *(.text .text.*) } .text32 : { *(.text32) } + .pecompat : { *(.pecompat) } + PROVIDE(pecompat_fsize = setup_size - pecompat_fstart); + . = ALIGN(16); .rodata : { *(.rodata*) } @@ -38,8 +42,12 @@ SECTIONS .signature : { setup_sig = .; LONG(0x5a5aaa55) - } + setup_size = ALIGN(ABSOLUTE(.), 4096); + setup_sects = ABSOLUTE(setup_size / 512); + ASSERT(setup_sects >= 5, "The setup must be at least 5 sectors in size"); + ASSERT(setup_sects <= 64, "The setup must be at most 64 sectors in size"); + } . 
= ALIGN(16); .bss : diff --git a/arch/x86/boot/startup/Makefile b/arch/x86/boot/startup/Makefile new file mode 100644 index 000000000000..5e499cfb29b5 --- /dev/null +++ b/arch/x86/boot/startup/Makefile @@ -0,0 +1,52 @@ +# SPDX-License-Identifier: GPL-2.0 + +KBUILD_AFLAGS += -D__DISABLE_EXPORTS +KBUILD_CFLAGS += -D__DISABLE_EXPORTS -mcmodel=small -fPIC \ + -Os -DDISABLE_BRANCH_PROFILING \ + $(DISABLE_STACKLEAK_PLUGIN) \ + $(DISABLE_LATENT_ENTROPY_PLUGIN) \ + -fno-stack-protector -D__NO_FORTIFY \ + -fno-jump-tables \ + -include $(srctree)/include/linux/hidden.h + +# disable ftrace hooks and LTO +KBUILD_CFLAGS := $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS)) +KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO),$(KBUILD_CFLAGS)) +KASAN_SANITIZE := n +KCSAN_SANITIZE := n +KMSAN_SANITIZE := n +UBSAN_SANITIZE := n +KCOV_INSTRUMENT := n + +obj-$(CONFIG_X86_64) += gdt_idt.o map_kernel.o +obj-$(CONFIG_AMD_MEM_ENCRYPT) += sme.o sev-startup.o +pi-objs := $(patsubst %.o,$(obj)/%.o,$(obj-y)) + +lib-$(CONFIG_X86_64) += la57toggle.o +lib-$(CONFIG_EFI_MIXED) += efi-mixed.o + +# +# Disable objtool validation for all library code, which is intended +# to be linked into the decompressor or the EFI stub but not vmlinux +# +$(patsubst %.o,$(obj)/%.o,$(lib-y)): OBJECT_FILES_NON_STANDARD := y + +# +# Invoke objtool for each object individually to check for absolute +# relocations, even if other objtool actions are being deferred. +# +$(pi-objs): objtool-enabled = 1 +$(pi-objs): objtool-args = $(if $(delay-objtool),--dry-run,$(objtool-args-y)) --noabs + +# +# Confine the startup code by prefixing all symbols with __pi_ (for position +# independent). This ensures that startup code can only call other startup +# code, or code that has explicitly been made accessible to it via a symbol +# alias. +# +$(obj)/%.pi.o: OBJCOPYFLAGS := --prefix-symbols=__pi_ +$(obj)/%.pi.o: $(obj)/%.o FORCE + $(call if_changed,objcopy) + +targets += $(obj-y) +obj-y := $(patsubst %.o,%.pi.o,$(obj-y)) diff --git a/arch/x86/boot/startup/efi-mixed.S b/arch/x86/boot/startup/efi-mixed.S new file mode 100644 index 000000000000..e04ed99bc449 --- /dev/null +++ b/arch/x86/boot/startup/efi-mixed.S @@ -0,0 +1,253 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2014, 2015 Intel Corporation; author Matt Fleming + * + * Early support for invoking 32-bit EFI services from a 64-bit kernel. + * + * Because this thunking occurs before ExitBootServices() we have to + * restore the firmware's 32-bit GDT and IDT before we make EFI service + * calls. + * + * On the plus side, we don't have to worry about mangling 64-bit + * addresses into 32-bits because we're executing with an identity + * mapped pagetable and haven't transitioned to 64-bit virtual addresses + * yet. 
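A standalone sketch of the invariant the comment above relies on: while boot-services memory is identity mapped below 4 GiB, a 64-bit pointer can be handed to 32-bit firmware by plain truncation. The helper name is illustrative, not part of the kernel sources:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /*
     * Narrow a 64-bit address for a 32-bit EFI service call. This is only
     * legal while the address is identity mapped and below 4 GiB, which is
     * exactly what the mixed-mode thunk depends on.
     */
    static uint32_t to_ia32_ptr(uint64_t addr)
    {
        assert(addr <= UINT32_MAX);   /* would be a bug after leaving the identity map */
        return (uint32_t)addr;
    }

    int main(void)
    {
        uint64_t boot_params_pa = 0x00100000;   /* example: firmware allocation at 1 MiB */

        printf("32-bit view: %#x\n", to_ia32_ptr(boot_params_pa));
        return 0;
    }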
+ */ + +#include <linux/linkage.h> +#include <asm/desc_defs.h> +#include <asm/msr.h> +#include <asm/page_types.h> +#include <asm/pgtable_types.h> +#include <asm/processor-flags.h> +#include <asm/segment.h> + + .text + .code32 +#ifdef CONFIG_EFI_HANDOVER_PROTOCOL +SYM_FUNC_START(efi32_stub_entry) + call 1f +1: popl %ecx + + /* Clear BSS */ + xorl %eax, %eax + leal (_bss - 1b)(%ecx), %edi + leal (_ebss - 1b)(%ecx), %ecx + subl %edi, %ecx + shrl $2, %ecx + cld + rep stosl + + add $0x4, %esp /* Discard return address */ + movl 8(%esp), %ebx /* struct boot_params pointer */ + jmp efi32_startup +SYM_FUNC_END(efi32_stub_entry) +#endif + +/* + * Called using a far call from __efi64_thunk() below, using the x86_64 SysV + * ABI (except for R8/R9 which are inaccessible to 32-bit code - EAX/EBX are + * used instead). EBP+16 points to the arguments passed via the stack. + * + * The first argument (EDI) is a pointer to the boot service or protocol, to + * which the remaining arguments are passed, each truncated to 32 bits. + */ +SYM_FUNC_START_LOCAL(efi_enter32) + /* + * Convert x86-64 SysV ABI params to i386 ABI + */ + pushl 32(%ebp) /* Up to 3 args passed via the stack */ + pushl 24(%ebp) + pushl 16(%ebp) + pushl %ebx /* R9 */ + pushl %eax /* R8 */ + pushl %ecx + pushl %edx + pushl %esi + + /* Disable paging */ + movl %cr0, %eax + btrl $X86_CR0_PG_BIT, %eax + movl %eax, %cr0 + + /* Disable long mode via EFER */ + movl $MSR_EFER, %ecx + rdmsr + btrl $_EFER_LME, %eax + wrmsr + + call *%edi + + /* We must preserve return value */ + movl %eax, %edi + + call efi32_enable_long_mode + + addl $32, %esp + movl %edi, %eax + lret +SYM_FUNC_END(efi_enter32) + + .code64 +SYM_FUNC_START(__efi64_thunk) + push %rbp + movl %esp, %ebp + push %rbx + + /* Move args #5 and #6 into 32-bit accessible registers */ + movl %r8d, %eax + movl %r9d, %ebx + + lcalll *efi32_call(%rip) + + pop %rbx + pop %rbp + RET +SYM_FUNC_END(__efi64_thunk) + + .code32 +SYM_FUNC_START_LOCAL(efi32_enable_long_mode) + movl %cr4, %eax + btsl $(X86_CR4_PAE_BIT), %eax + movl %eax, %cr4 + + movl $MSR_EFER, %ecx + rdmsr + btsl $_EFER_LME, %eax + wrmsr + + /* Disable interrupts - the firmware's IDT does not work in long mode */ + cli + + /* Enable paging */ + movl %cr0, %eax + btsl $X86_CR0_PG_BIT, %eax + movl %eax, %cr0 + ret +SYM_FUNC_END(efi32_enable_long_mode) + +/* + * This is the common EFI stub entry point for mixed mode. It sets up the GDT + * and page tables needed for 64-bit execution, after which it calls the + * common 64-bit EFI entrypoint efi_stub_entry(). + * + * Arguments: 0(%esp) image handle + * 4(%esp) EFI system table pointer + * %ebx struct boot_params pointer (or NULL) + * + * Since this is the point of no return for ordinary execution, no registers + * are considered live except for the function parameters. [Note that the EFI + * stub may still exit and return to the firmware using the Exit() EFI boot + * service.] 
+ */ +SYM_FUNC_START_LOCAL(efi32_startup) + movl %esp, %ebp + + subl $8, %esp + sgdtl (%esp) /* Save GDT descriptor to the stack */ + movl 2(%esp), %esi /* Existing GDT pointer */ + movzwl (%esp), %ecx /* Existing GDT limit */ + inc %ecx /* Existing GDT size */ + andl $~7, %ecx /* Ensure size is multiple of 8 */ + + subl %ecx, %esp /* Allocate new GDT */ + andl $~15, %esp /* Realign the stack */ + movl %esp, %edi /* New GDT address */ + leal 7(%ecx), %eax /* New GDT limit */ + pushw %cx /* Push 64-bit CS (for LJMP below) */ + pushl %edi /* Push new GDT address */ + pushw %ax /* Push new GDT limit */ + + /* Copy GDT to the stack and add a 64-bit code segment at the end */ + movl $GDT_ENTRY(DESC_CODE64, 0, 0xfffff) & 0xffffffff, (%edi,%ecx) + movl $GDT_ENTRY(DESC_CODE64, 0, 0xfffff) >> 32, 4(%edi,%ecx) + shrl $2, %ecx + cld + rep movsl /* Copy the firmware GDT */ + lgdtl (%esp) /* Switch to the new GDT */ + + call 1f +1: pop %edi + + /* Record mixed mode entry */ + movb $0x0, (efi_is64 - 1b)(%edi) + + /* Set up indirect far call to re-enter 32-bit mode */ + leal (efi32_call - 1b)(%edi), %eax + addl %eax, (%eax) + movw %cs, 4(%eax) + + /* Disable paging */ + movl %cr0, %eax + btrl $X86_CR0_PG_BIT, %eax + movl %eax, %cr0 + + /* Set up 1:1 mapping */ + leal (pte - 1b)(%edi), %eax + movl $_PAGE_PRESENT | _PAGE_RW | _PAGE_PSE, %ecx + leal (_PAGE_PRESENT | _PAGE_RW)(%eax), %edx +2: movl %ecx, (%eax) + addl $8, %eax + addl $PMD_SIZE, %ecx + jnc 2b + + movl $PAGE_SIZE, %ecx + .irpc l, 0123 + movl %edx, \l * 8(%eax) + addl %ecx, %edx + .endr + addl %ecx, %eax + movl %edx, (%eax) + movl %eax, %cr3 + + call efi32_enable_long_mode + + /* Set up far jump to 64-bit mode (CS is already on the stack) */ + leal (efi_stub_entry - 1b)(%edi), %eax + movl %eax, 2(%esp) + + movl 0(%ebp), %edi + movl 4(%ebp), %esi + movl %ebx, %edx + ljmpl *2(%esp) +SYM_FUNC_END(efi32_startup) + +/* + * efi_status_t efi32_pe_entry(efi_handle_t image_handle, + * efi_system_table_32_t *sys_table) + */ +SYM_FUNC_START(efi32_pe_entry) + pushl %ebx // save callee-save registers + + /* Check whether the CPU supports long mode */ + movl $0x80000001, %eax // assume extended info support + cpuid + btl $29, %edx // check long mode bit + jnc 1f + leal 8(%esp), %esp // preserve stack alignment + xor %ebx, %ebx // no struct boot_params pointer + jmp efi32_startup // only ESP and EBX remain live +1: movl $0x80000003, %eax // EFI_UNSUPPORTED + popl %ebx + RET +SYM_FUNC_END(efi32_pe_entry) + +#ifdef CONFIG_EFI_HANDOVER_PROTOCOL + .org efi32_stub_entry + 0x200 + .code64 +SYM_FUNC_START_NOALIGN(efi64_stub_entry) + jmp efi_handover_entry +SYM_FUNC_END(efi64_stub_entry) +#endif + + .data + .balign 8 +SYM_DATA_START_LOCAL(efi32_call) + .long efi_enter32 - . + .word 0x0 +SYM_DATA_END(efi32_call) +SYM_DATA(efi_is64, .byte 1) + + .bss + .balign PAGE_SIZE +SYM_DATA_LOCAL(pte, .fill 6 * PAGE_SIZE, 1, 0) diff --git a/arch/x86/boot/startup/exports.h b/arch/x86/boot/startup/exports.h new file mode 100644 index 000000000000..01d2363dc445 --- /dev/null +++ b/arch/x86/boot/startup/exports.h @@ -0,0 +1,14 @@ + +/* + * The symbols below are functions that are implemented by the startup code, + * but called at runtime by the SEV code residing in the core kernel. 
+ */ +PROVIDE(early_set_pages_state = __pi_early_set_pages_state); +PROVIDE(early_snp_set_memory_private = __pi_early_snp_set_memory_private); +PROVIDE(early_snp_set_memory_shared = __pi_early_snp_set_memory_shared); +PROVIDE(get_hv_features = __pi_get_hv_features); +PROVIDE(sev_es_terminate = __pi_sev_es_terminate); +PROVIDE(snp_cpuid = __pi_snp_cpuid); +PROVIDE(snp_cpuid_get_table = __pi_snp_cpuid_get_table); +PROVIDE(svsm_issue_call = __pi_svsm_issue_call); +PROVIDE(svsm_process_result_codes = __pi_svsm_process_result_codes); diff --git a/arch/x86/boot/startup/gdt_idt.c b/arch/x86/boot/startup/gdt_idt.c new file mode 100644 index 000000000000..d16102abdaec --- /dev/null +++ b/arch/x86/boot/startup/gdt_idt.c @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/linkage.h> +#include <linux/types.h> + +#include <asm/desc.h> +#include <asm/init.h> +#include <asm/setup.h> +#include <asm/sev.h> +#include <asm/trapnr.h> + +/* + * Data structures and code used for IDT setup in head_64.S. The bringup-IDT is + * used until the idt_table takes over. On the boot CPU this happens in + * x86_64_start_kernel(), on secondary CPUs in start_secondary(). In both cases + * this happens in the functions called from head_64.S. + * + * The idt_table can't be used that early because all the code modifying it is + * in idt.c and can be instrumented by tracing or KASAN, which both don't work + * during early CPU bringup. Also the idt_table has the runtime vectors + * configured which require certain CPU state to be setup already (like TSS), + * which also hasn't happened yet in early CPU bringup. + */ +static gate_desc bringup_idt_table[NUM_EXCEPTION_VECTORS] __page_aligned_data; + +/* This may run while still in the direct mapping */ +void startup_64_load_idt(void *vc_handler) +{ + struct desc_ptr desc = { + .address = (unsigned long)rip_rel_ptr(bringup_idt_table), + .size = sizeof(bringup_idt_table) - 1, + }; + struct idt_data data; + gate_desc idt_desc; + + /* @vc_handler is set only for a VMM Communication Exception */ + if (vc_handler) { + init_idt_data(&data, X86_TRAP_VC, vc_handler); + idt_init_desc(&idt_desc, &data); + native_write_idt_entry((gate_desc *)desc.address, X86_TRAP_VC, &idt_desc); + } + + native_load_idt(&desc); +} + +/* + * Setup boot CPU state needed before kernel switches to virtual addresses. + */ +void __init startup_64_setup_gdt_idt(void) +{ + struct gdt_page *gp = rip_rel_ptr((void *)(__force unsigned long)&gdt_page); + void *handler = NULL; + + struct desc_ptr startup_gdt_descr = { + .address = (unsigned long)gp->gdt, + .size = GDT_SIZE - 1, + }; + + /* Load GDT */ + native_load_gdt(&startup_gdt_descr); + + /* New GDT is live - reload data segment registers */ + asm volatile("movl %%eax, %%ds\n" + "movl %%eax, %%ss\n" + "movl %%eax, %%es\n" : : "a"(__KERNEL_DS) : "memory"); + + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) + handler = rip_rel_ptr(vc_no_ghcb); + + startup_64_load_idt(handler); +} diff --git a/arch/x86/boot/startup/la57toggle.S b/arch/x86/boot/startup/la57toggle.S new file mode 100644 index 000000000000..370075b4d95b --- /dev/null +++ b/arch/x86/boot/startup/la57toggle.S @@ -0,0 +1,111 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/linkage.h> +#include <asm/segment.h> +#include <asm/boot.h> +#include <asm/msr.h> +#include <asm/processor-flags.h> + +/* + * This is the 32-bit trampoline that will be copied over to low memory. It + * will be called using the ordinary 64-bit calling convention from code + * running in 64-bit mode. 
+ * + * Return address is at the top of the stack (might be above 4G). + * The first argument (EDI) contains the address of the temporary PGD level + * page table in 32-bit addressable memory which will be programmed into + * register CR3. + */ + + .section ".rodata", "a", @progbits +SYM_CODE_START(trampoline_32bit_src) + /* + * Preserve callee save 64-bit registers on the stack: this is + * necessary because the architecture does not guarantee that GPRs will + * retain their full 64-bit values across a 32-bit mode switch. + */ + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbp + pushq %rbx + + /* Preserve top half of RSP in a legacy mode GPR to avoid truncation */ + movq %rsp, %rbx + shrq $32, %rbx + + /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */ + pushq $__KERNEL32_CS + leaq 0f(%rip), %rax + pushq %rax + lretq + + /* + * The 32-bit code below will do a far jump back to long mode and end + * up here after reconfiguring the number of paging levels. First, the + * stack pointer needs to be restored to its full 64-bit value before + * the callee save register contents can be popped from the stack. + */ +.Lret: + shlq $32, %rbx + orq %rbx, %rsp + + /* Restore the preserved 64-bit registers */ + popq %rbx + popq %rbp + popq %r12 + popq %r13 + popq %r14 + popq %r15 + retq + + .code32 +0: + /* Disable paging */ + movl %cr0, %eax + btrl $X86_CR0_PG_BIT, %eax + movl %eax, %cr0 + + /* Point CR3 to the trampoline's new top level page table */ + movl %edi, %cr3 + + /* Set EFER.LME=1 as a precaution in case hypervsior pulls the rug */ + movl $MSR_EFER, %ecx + rdmsr + btsl $_EFER_LME, %eax + /* Avoid writing EFER if no change was made (for TDX guest) */ + jc 1f + wrmsr +1: + /* Toggle CR4.LA57 */ + movl %cr4, %eax + btcl $X86_CR4_LA57_BIT, %eax + movl %eax, %cr4 + + /* Enable paging again. */ + movl %cr0, %eax + btsl $X86_CR0_PG_BIT, %eax + movl %eax, %cr0 + + /* + * Return to the 64-bit calling code using LJMP rather than LRET, to + * avoid the need for a 32-bit addressable stack. The destination + * address will be adjusted after the template code is copied into a + * 32-bit addressable buffer. + */ +.Ljmp: ljmpl $__KERNEL_CS, $(.Lret - trampoline_32bit_src) +SYM_CODE_END(trampoline_32bit_src) + +/* + * This symbol is placed right after trampoline_32bit_src() so its address can + * be used to infer the size of the trampoline code. + */ +SYM_DATA(trampoline_ljmp_imm_offset, .word .Ljmp + 1 - trampoline_32bit_src) + + /* + * The trampoline code has a size limit. + * Make sure we fail to compile if the trampoline code grows + * beyond TRAMPOLINE_32BIT_CODE_SIZE bytes. + */ + .org trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE diff --git a/arch/x86/boot/startup/map_kernel.c b/arch/x86/boot/startup/map_kernel.c new file mode 100644 index 000000000000..83ba98d61572 --- /dev/null +++ b/arch/x86/boot/startup/map_kernel.c @@ -0,0 +1,217 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/init.h> +#include <linux/linkage.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/pgtable.h> + +#include <asm/init.h> +#include <asm/sections.h> +#include <asm/setup.h> +#include <asm/sev.h> + +extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; +extern unsigned int next_early_pgt; + +static inline bool check_la57_support(void) +{ + /* + * 5-level paging is detected and enabled at kernel decompression + * stage. Only check if it has been enabled there. 
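A compilable sketch of the geometry that check_la57_support() derives from CR4 alone, using the architectural CR4.LA57 bit (bit 12); the 4-level fallback values (shift 39, folded P4D) are the usual ones and are shown only for illustration:

    #include <stdint.h>
    #include <stdio.h>

    #define X86_CR4_LA57    (1UL << 12)    /* architectural CR4.LA57 bit */

    struct paging_geometry {
        int levels;          /* number of translation levels */
        int pgdir_shift;     /* address bits translated below the top-level index */
        int ptrs_per_p4d;    /* entries per P4D table (1 = level folded away) */
    };

    static struct paging_geometry geometry_from_cr4(uint64_t cr4)
    {
        if (cr4 & X86_CR4_LA57)
            return (struct paging_geometry){ 5, 48, 512 };
        return (struct paging_geometry){ 4, 39, 1 };
    }

    int main(void)
    {
        struct paging_geometry g = geometry_from_cr4(X86_CR4_LA57);

        printf("levels=%d pgdir_shift=%d ptrs_per_p4d=%d\n",
               g.levels, g.pgdir_shift, g.ptrs_per_p4d);
        return 0;
    }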
+ */ + if (!(native_read_cr4() & X86_CR4_LA57)) + return false; + + __pgtable_l5_enabled = 1; + pgdir_shift = 48; + ptrs_per_p4d = 512; + + return true; +} + +static unsigned long __init sme_postprocess_startup(struct boot_params *bp, + pmdval_t *pmd, + unsigned long p2v_offset) +{ + unsigned long paddr, paddr_end; + int i; + + /* Encrypt the kernel and related (if SME is active) */ + sme_encrypt_kernel(bp); + + /* + * Clear the memory encryption mask from the .bss..decrypted section. + * The bss section will be memset to zero later in the initialization so + * there is no need to zero it after changing the memory encryption + * attribute. + */ + if (sme_get_me_mask()) { + paddr = (unsigned long)rip_rel_ptr(__start_bss_decrypted); + paddr_end = (unsigned long)rip_rel_ptr(__end_bss_decrypted); + + for (; paddr < paddr_end; paddr += PMD_SIZE) { + /* + * On SNP, transition the page to shared in the RMP table so that + * it is consistent with the page table attribute change. + * + * __start_bss_decrypted has a virtual address in the high range + * mapping (kernel .text). PVALIDATE, by way of + * early_snp_set_memory_shared(), requires a valid virtual + * address but the kernel is currently running off of the identity + * mapping so use the PA to get a *currently* valid virtual address. + */ + early_snp_set_memory_shared(paddr, paddr, PTRS_PER_PMD); + + i = pmd_index(paddr - p2v_offset); + pmd[i] -= sme_get_me_mask(); + } + } + + /* + * Return the SME encryption mask (if SME is active) to be used as a + * modifier for the initial pgdir entry programmed into CR3. + */ + return sme_get_me_mask(); +} + +/* + * This code is compiled using PIC codegen because it will execute from the + * early 1:1 mapping of memory, which deviates from the mapping expected by the + * linker. Due to this deviation, taking the address of a global variable will + * produce an ambiguous result when using the plain & operator. Instead, + * rip_rel_ptr() must be used, which will return the RIP-relative address in + * the 1:1 mapping of memory. Kernel virtual addresses can be determined by + * subtracting p2v_offset from the RIP-relative address. + */ +unsigned long __init __startup_64(unsigned long p2v_offset, + struct boot_params *bp) +{ + pmd_t (*early_pgts)[PTRS_PER_PMD] = rip_rel_ptr(early_dynamic_pgts); + unsigned long physaddr = (unsigned long)rip_rel_ptr(_text); + unsigned long va_text, va_end; + unsigned long pgtable_flags; + unsigned long load_delta; + pgdval_t *pgd; + p4dval_t *p4d; + pudval_t *pud; + pmdval_t *pmd, pmd_entry; + bool la57; + int i; + + la57 = check_la57_support(); + + /* Is the address too large? */ + if (physaddr >> MAX_PHYSMEM_BITS) + for (;;); + + /* + * Compute the delta between the address I am compiled to run at + * and the address I am actually running at. + */ + phys_base = load_delta = __START_KERNEL_map + p2v_offset; + + /* Is the address not 2M aligned? 
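The delta computed just above is "where the image actually sits minus where it was linked to sit", expressed via p2v_offset, and it must be a multiple of 2 MiB because the kernel is mapped with PMD-sized entries. A worked sketch with made-up addresses:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define START_KERNEL_MAP    0xffffffff80000000ULL    /* base of the kernel text mapping */
    #define PMD_SIZE            (2ULL << 20)             /* kernel is mapped with 2 MiB entries */

    int main(void)
    {
        /* Made-up numbers: linked to sit at 16 MiB, actually loaded at 64 MiB */
        uint64_t va_text    = START_KERNEL_MAP + 0x1000000;   /* link-time virtual address of _text */
        uint64_t physaddr   = 0x4000000;                      /* where the bootloader put _text */

        uint64_t p2v_offset = physaddr - va_text;             /* wraps; fine in unsigned math */
        uint64_t load_delta = START_KERNEL_MAP + p2v_offset;  /* == physaddr minus link-time phys addr */

        printf("load_delta = %#" PRIx64 "\n", load_delta);    /* prints 0x3000000 */

        if (load_delta & (PMD_SIZE - 1)) {    /* same condition as load_delta & ~PMD_MASK */
            fprintf(stderr, "not 2 MiB aligned: the early code would just hang here\n");
            return EXIT_FAILURE;
        }
        return EXIT_SUCCESS;
    }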
*/ + if (load_delta & ~PMD_MASK) + for (;;); + + va_text = physaddr - p2v_offset; + va_end = (unsigned long)rip_rel_ptr(_end) - p2v_offset; + + /* Include the SME encryption mask in the fixup value */ + load_delta += sme_get_me_mask(); + + /* Fixup the physical addresses in the page table */ + + pgd = rip_rel_ptr(early_top_pgt); + pgd[pgd_index(__START_KERNEL_map)] += load_delta; + + if (la57) { + p4d = (p4dval_t *)rip_rel_ptr(level4_kernel_pgt); + p4d[MAX_PTRS_PER_P4D - 1] += load_delta; + + pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d | _PAGE_TABLE; + } + + level3_kernel_pgt[PTRS_PER_PUD - 2].pud += load_delta; + level3_kernel_pgt[PTRS_PER_PUD - 1].pud += load_delta; + + for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--) + level2_fixmap_pgt[i].pmd += load_delta; + + /* + * Set up the identity mapping for the switchover. These + * entries should *NOT* have the global bit set! This also + * creates a bunch of nonsense entries but that is fine -- + * it avoids problems around wraparound. + */ + + pud = &early_pgts[0]->pmd; + pmd = &early_pgts[1]->pmd; + next_early_pgt = 2; + + pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask(); + + if (la57) { + p4d = &early_pgts[next_early_pgt++]->pmd; + + i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; + pgd[i + 0] = (pgdval_t)p4d + pgtable_flags; + pgd[i + 1] = (pgdval_t)p4d + pgtable_flags; + + i = physaddr >> P4D_SHIFT; + p4d[(i + 0) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags; + p4d[(i + 1) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags; + } else { + i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; + pgd[i + 0] = (pgdval_t)pud + pgtable_flags; + pgd[i + 1] = (pgdval_t)pud + pgtable_flags; + } + + i = physaddr >> PUD_SHIFT; + pud[(i + 0) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags; + pud[(i + 1) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags; + + pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL; + pmd_entry += sme_get_me_mask(); + pmd_entry += physaddr; + + for (i = 0; i < DIV_ROUND_UP(va_end - va_text, PMD_SIZE); i++) { + int idx = i + (physaddr >> PMD_SHIFT); + + pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE; + } + + /* + * Fixup the kernel text+data virtual addresses. Note that + * we might write invalid pmds, when the kernel is relocated + * cleanup_highmap() fixes this up along with the mappings + * beyond _end. + * + * Only the region occupied by the kernel image has so far + * been checked against the table of usable memory regions + * provided by the firmware, so invalidate pages outside that + * region. A page table entry that maps to a reserved area of + * memory would allow processor speculation into that area, + * and on some hardware (particularly the UV platform) even + * speculative access to some reserved areas is caught as an + * error, causing the BIOS to halt the system. 
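The identity-mapping setup earlier in this hunk indexes each paging level with plain shifts and modulo-512 wraps. A standalone sketch with concrete numbers (4-level shift constants, shown for illustration) makes it easy to see which slots those loops touch:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define PGDIR_SHIFT     39    /* 4-level paging; 48 with LA57 */
    #define PUD_SHIFT       30
    #define PMD_SHIFT       21
    #define PTRS_PER_TABLE  512

    int main(void)
    {
        uint64_t physaddr = 0x4000000;    /* illustrative load address: 64 MiB */

        printf("pgd index: %" PRIu64 "\n", (physaddr >> PGDIR_SHIFT) % PTRS_PER_TABLE);
        printf("pud index: %" PRIu64 "\n", (physaddr >> PUD_SHIFT) % PTRS_PER_TABLE);
        printf("pmd index: %" PRIu64 "\n", (physaddr >> PMD_SHIFT) % PTRS_PER_TABLE);

        /*
         * Two consecutive entries are written at each level (pgd[i] and
         * pgd[i + 1], and so on), so the mapping stays valid even when the
         * image straddles a 1 GiB or 512 GiB boundary.
         */
        return 0;
    }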
+ */ + + pmd = rip_rel_ptr(level2_kernel_pgt); + + /* invalidate pages before the kernel image */ + for (i = 0; i < pmd_index(va_text); i++) + pmd[i] &= ~_PAGE_PRESENT; + + /* fixup pages that are part of the kernel image */ + for (; i <= pmd_index(va_end); i++) + if (pmd[i] & _PAGE_PRESENT) + pmd[i] += load_delta; + + /* invalidate pages after the kernel image */ + for (; i < PTRS_PER_PMD; i++) + pmd[i] &= ~_PAGE_PRESENT; + + return sme_postprocess_startup(bp, pmd, p2v_offset); +} diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c new file mode 100644 index 000000000000..a0fa8bb2b945 --- /dev/null +++ b/arch/x86/boot/startup/sev-shared.c @@ -0,0 +1,762 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AMD Encrypted Register State Support + * + * Author: Joerg Roedel <jroedel@suse.de> + * + * This file is not compiled stand-alone. It contains code shared + * between the pre-decompression boot code and the running Linux kernel + * and is included directly into both code-bases. + */ + +#include <asm/setup_data.h> + +#ifndef __BOOT_COMPRESSED +#define has_cpuflag(f) cpu_feature_enabled(f) +#else +#undef WARN +#define WARN(condition, format...) (!!(condition)) +#endif + +/* Copy of the SNP firmware's CPUID page. */ +static struct snp_cpuid_table cpuid_table_copy __ro_after_init; + +/* + * These will be initialized based on CPUID table so that non-present + * all-zero leaves (for sparse tables) can be differentiated from + * invalid/out-of-range leaves. This is needed since all-zero leaves + * still need to be post-processed. + */ +static u32 cpuid_std_range_max __ro_after_init; +static u32 cpuid_hyp_range_max __ro_after_init; +static u32 cpuid_ext_range_max __ro_after_init; + +bool sev_snp_needs_sfw; + +void __noreturn +sev_es_terminate(unsigned int set, unsigned int reason) +{ + u64 val = GHCB_MSR_TERM_REQ; + + /* Tell the hypervisor what went wrong. */ + val |= GHCB_SEV_TERM_REASON(set, reason); + + /* Request Guest Termination from Hypervisor */ + sev_es_wr_ghcb_msr(val); + VMGEXIT(); + + while (true) + asm volatile("hlt\n" : : : "memory"); +} + +/* + * The hypervisor features are available from GHCB version 2 onward. + */ +u64 __init get_hv_features(void) +{ + u64 val; + + if (ghcb_version < 2) + return 0; + + sev_es_wr_ghcb_msr(GHCB_MSR_HV_FT_REQ); + VMGEXIT(); + + val = sev_es_rd_ghcb_msr(); + if (GHCB_RESP_CODE(val) != GHCB_MSR_HV_FT_RESP) + return 0; + + return GHCB_MSR_HV_FT_RESP_VAL(val); +} + +int svsm_process_result_codes(struct svsm_call *call) +{ + switch (call->rax_out) { + case SVSM_SUCCESS: + return 0; + case SVSM_ERR_INCOMPLETE: + case SVSM_ERR_BUSY: + return -EAGAIN; + default: + return -EINVAL; + } +} + +/* + * Issue a VMGEXIT to call the SVSM: + * - Load the SVSM register state (RAX, RCX, RDX, R8 and R9) + * - Set the CA call pending field to 1 + * - Issue VMGEXIT + * - Save the SVSM return register state (RAX, RCX, RDX, R8 and R9) + * - Perform atomic exchange of the CA call pending field + * + * - See the "Secure VM Service Module for SEV-SNP Guests" specification for + * details on the calling convention. + * - The calling convention loosely follows the Microsoft X64 calling + * convention by putting arguments in RCX, RDX, R8 and R9. + * - RAX specifies the SVSM protocol/callid as input and the return code + * as output. 
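A sketch of the RAX encoding described above, assuming the protocol number occupies the upper 32 bits and the call identifier the lower 32 bits, which matches the "Protocol 0, Call ID 1" annotation on the PVALIDATE call later in this file; the helper name is illustrative:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Assumed layout: RAX[63:32] = protocol number, RAX[31:0] = call id */
    static uint64_t svsm_rax(uint32_t protocol, uint32_t call_id)
    {
        return ((uint64_t)protocol << 32) | call_id;
    }

    int main(void)
    {
        /* Core protocol (0), PVALIDATE call (1) */
        printf("rax = %#018" PRIx64 "\n", svsm_rax(0, 1));
        return 0;
    }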
+ */ +void svsm_issue_call(struct svsm_call *call, u8 *pending) +{ + register unsigned long rax asm("rax") = call->rax; + register unsigned long rcx asm("rcx") = call->rcx; + register unsigned long rdx asm("rdx") = call->rdx; + register unsigned long r8 asm("r8") = call->r8; + register unsigned long r9 asm("r9") = call->r9; + + call->caa->call_pending = 1; + + asm volatile("rep; vmmcall\n\t" + : "+r" (rax), "+r" (rcx), "+r" (rdx), "+r" (r8), "+r" (r9) + : : "memory"); + + *pending = xchg(&call->caa->call_pending, *pending); + + call->rax_out = rax; + call->rcx_out = rcx; + call->rdx_out = rdx; + call->r8_out = r8; + call->r9_out = r9; +} + +int svsm_perform_msr_protocol(struct svsm_call *call) +{ + u8 pending = 0; + u64 val, resp; + + /* + * When using the MSR protocol, be sure to save and restore + * the current MSR value. + */ + val = sev_es_rd_ghcb_msr(); + + sev_es_wr_ghcb_msr(GHCB_MSR_VMPL_REQ_LEVEL(0)); + + svsm_issue_call(call, &pending); + + resp = sev_es_rd_ghcb_msr(); + + sev_es_wr_ghcb_msr(val); + + if (pending) + return -EINVAL; + + if (GHCB_RESP_CODE(resp) != GHCB_MSR_VMPL_RESP) + return -EINVAL; + + if (GHCB_MSR_VMPL_RESP_VAL(resp)) + return -EINVAL; + + return svsm_process_result_codes(call); +} + +static int __sev_cpuid_hv(u32 fn, int reg_idx, u32 *reg) +{ + u64 val; + + sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, reg_idx)); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) + return -EIO; + + *reg = (val >> 32); + + return 0; +} + +static int __sev_cpuid_hv_msr(struct cpuid_leaf *leaf) +{ + int ret; + + /* + * MSR protocol does not support fetching non-zero subfunctions, but is + * sufficient to handle current early-boot cases. Should that change, + * make sure to report an error rather than ignoring the index and + * grabbing random values. If this issue arises in the future, handling + * can be added here to use GHCB-page protocol for cases that occur late + * enough in boot that GHCB page is available. + */ + if (cpuid_function_is_indexed(leaf->fn) && leaf->subfn) + return -EINVAL; + + ret = __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EAX, &leaf->eax); + ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EBX, &leaf->ebx); + ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_ECX, &leaf->ecx); + ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EDX, &leaf->edx); + + return ret; +} + + + +/* + * This may be called early while still running on the initial identity + * mapping. Use RIP-relative addressing to obtain the correct address + * while running with the initial identity mapping as well as the + * switch-over to kernel virtual addresses later. + */ +const struct snp_cpuid_table *snp_cpuid_get_table(void) +{ + return rip_rel_ptr(&cpuid_table_copy); +} + +/* + * The SNP Firmware ABI, Revision 0.9, Section 7.1, details the use of + * XCR0_IN and XSS_IN to encode multiple versions of 0xD subfunctions 0 + * and 1 based on the corresponding features enabled by a particular + * combination of XCR0 and XSS registers so that a guest can look up the + * version corresponding to the features currently enabled in its XCR0/XSS + * registers. The only values that differ between these versions/table + * entries is the enabled XSAVE area size advertised via EBX. 
+ * + * While hypervisors may choose to make use of this support, it is more + * robust/secure for a guest to simply find the entry corresponding to the + * base/legacy XSAVE area size (XCR0=1 or XCR0=3), and then calculate the + * XSAVE area size using subfunctions 2 through 64, as documented in APM + * Volume 3, Rev 3.31, Appendix E.3.8, which is what is done here. + * + * Since base/legacy XSAVE area size is documented as 0x240, use that value + * directly rather than relying on the base size in the CPUID table. + * + * Return: XSAVE area size on success, 0 otherwise. + */ +static u32 snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + u64 xfeatures_found = 0; + u32 xsave_size = 0x240; + int i; + + for (i = 0; i < cpuid_table->count; i++) { + const struct snp_cpuid_fn *e = &cpuid_table->fn[i]; + + if (!(e->eax_in == 0xD && e->ecx_in > 1 && e->ecx_in < 64)) + continue; + if (!(xfeatures_en & (BIT_ULL(e->ecx_in)))) + continue; + if (xfeatures_found & (BIT_ULL(e->ecx_in))) + continue; + + xfeatures_found |= (BIT_ULL(e->ecx_in)); + + if (compacted) + xsave_size += e->eax; + else + xsave_size = max(xsave_size, e->eax + e->ebx); + } + + /* + * Either the guest set unsupported XCR0/XSS bits, or the corresponding + * entries in the CPUID table were not present. This is not a valid + * state to be in. + */ + if (xfeatures_found != (xfeatures_en & GENMASK_ULL(63, 2))) + return 0; + + return xsave_size; +} + +static bool +snp_cpuid_get_validated_func(struct cpuid_leaf *leaf) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + int i; + + for (i = 0; i < cpuid_table->count; i++) { + const struct snp_cpuid_fn *e = &cpuid_table->fn[i]; + + if (e->eax_in != leaf->fn) + continue; + + if (cpuid_function_is_indexed(leaf->fn) && e->ecx_in != leaf->subfn) + continue; + + /* + * For 0xD subfunctions 0 and 1, only use the entry corresponding + * to the base/legacy XSAVE area size (XCR0=1 or XCR0=3, XSS=0). + * See the comments above snp_cpuid_calc_xsave_size() for more + * details. 
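A compilable model of the calculation performed by snp_cpuid_calc_xsave_size() above: walk the 0xD subleaves 2 through 63 that are enabled in the feature mask, add up component sizes for the compacted format or take the maximum of offset plus size otherwise, starting from the 0x240-byte legacy area. The table values in main() are made up for illustration:

    #include <stdint.h>
    #include <stdio.h>

    struct xsave_leaf {
        uint32_t subleaf;    /* CPUID 0xD subleaf number (ECX input) */
        uint32_t size;       /* EAX: size of this state component */
        uint32_t offset;     /* EBX: offset in the non-compacted format */
    };

    static uint32_t xsave_area_size(const struct xsave_leaf *leaves, int count,
                                    uint64_t xfeatures, int compacted)
    {
        uint32_t xsave_size = 0x240;    /* base/legacy XSAVE area */
        int i;

        for (i = 0; i < count; i++) {
            const struct xsave_leaf *l = &leaves[i];

            if (l->subleaf < 2 || l->subleaf > 63)
                continue;
            if (!(xfeatures & (1ULL << l->subleaf)))
                continue;

            if (compacted)
                xsave_size += l->size;
            else if (l->offset + l->size > xsave_size)
                xsave_size = l->offset + l->size;
        }
        return xsave_size;
    }

    int main(void)
    {
        /* Made-up table: AVX state (feature 2), 256 bytes at offset 0x240 */
        struct xsave_leaf table[] = { { 2, 256, 0x240 } };

        printf("size = %#x\n", xsave_area_size(table, 1, 1ULL << 2, 0));
        return 0;
    }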
+ */ + if (e->eax_in == 0xD && (e->ecx_in == 0 || e->ecx_in == 1)) + if (!(e->xcr0_in == 1 || e->xcr0_in == 3) || e->xss_in) + continue; + + leaf->eax = e->eax; + leaf->ebx = e->ebx; + leaf->ecx = e->ecx; + leaf->edx = e->edx; + + return true; + } + + return false; +} + +static void snp_cpuid_hv_msr(void *ctx, struct cpuid_leaf *leaf) +{ + if (__sev_cpuid_hv_msr(leaf)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV); +} + +static int +snp_cpuid_postprocess(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf), + void *ctx, struct cpuid_leaf *leaf) +{ + struct cpuid_leaf leaf_hv = *leaf; + + switch (leaf->fn) { + case 0x1: + cpuid_fn(ctx, &leaf_hv); + + /* initial APIC ID */ + leaf->ebx = (leaf_hv.ebx & GENMASK(31, 24)) | (leaf->ebx & GENMASK(23, 0)); + /* APIC enabled bit */ + leaf->edx = (leaf_hv.edx & BIT(9)) | (leaf->edx & ~BIT(9)); + + /* OSXSAVE enabled bit */ + if (native_read_cr4() & X86_CR4_OSXSAVE) + leaf->ecx |= BIT(27); + break; + case 0x7: + /* OSPKE enabled bit */ + leaf->ecx &= ~BIT(4); + if (native_read_cr4() & X86_CR4_PKE) + leaf->ecx |= BIT(4); + break; + case 0xB: + leaf_hv.subfn = 0; + cpuid_fn(ctx, &leaf_hv); + + /* extended APIC ID */ + leaf->edx = leaf_hv.edx; + break; + case 0xD: { + bool compacted = false; + u64 xcr0 = 1, xss = 0; + u32 xsave_size; + + if (leaf->subfn != 0 && leaf->subfn != 1) + return 0; + + if (native_read_cr4() & X86_CR4_OSXSAVE) + xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + if (leaf->subfn == 1) { + /* Get XSS value if XSAVES is enabled. */ + if (leaf->eax & BIT(3)) { + unsigned long lo, hi; + + asm volatile("rdmsr" : "=a" (lo), "=d" (hi) + : "c" (MSR_IA32_XSS)); + xss = (hi << 32) | lo; + } + + /* + * The PPR and APM aren't clear on what size should be + * encoded in 0xD:0x1:EBX when compaction is not enabled + * by either XSAVEC (feature bit 1) or XSAVES (feature + * bit 3) since SNP-capable hardware has these feature + * bits fixed as 1. KVM sets it to 0 in this case, but + * to avoid this becoming an issue it's safer to simply + * treat this as unsupported for SNP guests. + */ + if (!(leaf->eax & (BIT(1) | BIT(3)))) + return -EINVAL; + + compacted = true; + } + + xsave_size = snp_cpuid_calc_xsave_size(xcr0 | xss, compacted); + if (!xsave_size) + return -EINVAL; + + leaf->ebx = xsave_size; + } + break; + case 0x8000001E: + cpuid_fn(ctx, &leaf_hv); + + /* extended APIC ID */ + leaf->eax = leaf_hv.eax; + /* compute ID */ + leaf->ebx = (leaf->ebx & GENMASK(31, 8)) | (leaf_hv.ebx & GENMASK(7, 0)); + /* node ID */ + leaf->ecx = (leaf->ecx & GENMASK(31, 8)) | (leaf_hv.ecx & GENMASK(7, 0)); + break; + default: + /* No fix-ups needed, use values as-is. */ + break; + } + + return 0; +} + +/* + * Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value + * should be treated as fatal by caller. + */ +int snp_cpuid(void (*cpuid_fn)(void *ctx, struct cpuid_leaf *leaf), + void *ctx, struct cpuid_leaf *leaf) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + + if (!cpuid_table->count) + return -EOPNOTSUPP; + + if (!snp_cpuid_get_validated_func(leaf)) { + /* + * Some hypervisors will avoid keeping track of CPUID entries + * where all values are zero, since they can be handled the + * same as out-of-range values (all-zero). This is useful here + * as well as it allows virtually all guest configurations to + * work using a single SNP CPUID table. 
+ * + * To allow for this, there is a need to distinguish between + * out-of-range entries and in-range zero entries, since the + * CPUID table entries are only a template that may need to be + * augmented with additional values for things like + * CPU-specific information during post-processing. So if it's + * not in the table, set the values to zero. Then, if they are + * within a valid CPUID range, proceed with post-processing + * using zeros as the initial values. Otherwise, skip + * post-processing and just return zeros immediately. + */ + leaf->eax = leaf->ebx = leaf->ecx = leaf->edx = 0; + + /* Skip post-processing for out-of-range zero leafs. */ + if (!(leaf->fn <= cpuid_std_range_max || + (leaf->fn >= 0x40000000 && leaf->fn <= cpuid_hyp_range_max) || + (leaf->fn >= 0x80000000 && leaf->fn <= cpuid_ext_range_max))) + return 0; + } + + return snp_cpuid_postprocess(cpuid_fn, ctx, leaf); +} + +/* + * Boot VC Handler - This is the first VC handler during boot, there is no GHCB + * page yet, so it only supports the MSR based communication with the + * hypervisor and only the CPUID exit-code. + */ +void do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) +{ + unsigned int subfn = lower_bits(regs->cx, 32); + unsigned int fn = lower_bits(regs->ax, 32); + u16 opcode = *(unsigned short *)regs->ip; + struct cpuid_leaf leaf; + int ret; + + /* Only CPUID is supported via MSR protocol */ + if (exit_code != SVM_EXIT_CPUID) + goto fail; + + /* Is it really a CPUID insn? */ + if (opcode != 0xa20f) + goto fail; + + leaf.fn = fn; + leaf.subfn = subfn; + + /* + * If SNP is active, then snp_cpuid() uses the CPUID table to obtain the + * CPUID values (with possible HV interaction during post-processing of + * the values). But if SNP is not active (no CPUID table present), then + * snp_cpuid() returns -EOPNOTSUPP so that an SEV-ES guest can call the + * HV to obtain the CPUID information. + */ + ret = snp_cpuid(snp_cpuid_hv_msr, NULL, &leaf); + if (!ret) + goto cpuid_done; + + if (ret != -EOPNOTSUPP) + goto fail; + + /* + * This is reached by a SEV-ES guest and needs to invoke the HV for + * the CPUID data. + */ + if (__sev_cpuid_hv_msr(&leaf)) + goto fail; + +cpuid_done: + regs->ax = leaf.eax; + regs->bx = leaf.ebx; + regs->cx = leaf.ecx; + regs->dx = leaf.edx; + + /* + * This is a VC handler and the #VC is only raised when SEV-ES is + * active, which means SEV must be active too. Do sanity checks on the + * CPUID results to make sure the hypervisor does not trick the kernel + * into the no-sev path. This could map sensitive data unencrypted and + * make it accessible to the hypervisor. + * + * In particular, check for: + * - Availability of CPUID leaf 0x8000001f + * - SEV CPUID bit. + * + * The hypervisor might still report the wrong C-bit position, but this + * can't be checked here. + */ + + if (fn == 0x80000000 && (regs->ax < 0x8000001f)) + /* SEV leaf check */ + goto fail; + else if ((fn == 0x8000001f && !(regs->ax & BIT(1)))) + /* SEV bit */ + goto fail; + + /* Skip over the CPUID two-byte opcode */ + regs->ip += 2; + + return; + +fail: + /* Terminate the guest */ + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); +} + +struct cc_setup_data { + struct setup_data header; + u32 cc_blob_address; +}; + +/* + * Search for a Confidential Computing blob passed in as a setup_data entry + * via the Linux Boot Protocol. 
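The search described above follows a singly linked list of setup_data entries whose head is boot_params.hdr.setup_data. A standalone model of that walk; the struct layout (64-bit next pointer, 32-bit type and length, payload after the header) follows the boot protocol, and the SETUP_CC_BLOB value is shown only for illustration:

    #include <stdint.h>
    #include <stdio.h>

    #define SETUP_CC_BLOB  7    /* setup_data type that carries the CC blob address */

    struct setup_data {
        uint64_t next;    /* physical address of the next entry, 0 = end of list */
        uint32_t type;
        uint32_t len;
        /* 'len' bytes of payload follow */
    };

    /* Walk the chain and return the payload of the first entry of @type */
    static const uint8_t *find_setup_data(uint64_t head, uint32_t type)
    {
        const struct setup_data *sd = (const struct setup_data *)(uintptr_t)head;

        while (sd) {
            if (sd->type == type)
                return (const uint8_t *)(sd + 1);
            sd = (const struct setup_data *)(uintptr_t)sd->next;
        }
        return NULL;
    }

    int main(void)
    {
        /* Build a two-entry chain in ordinary memory for demonstration */
        struct {
            struct setup_data hdr;
            uint32_t payload;
        } blob = { { 0, SETUP_CC_BLOB, sizeof(uint32_t) }, 0x12345678 };
        struct setup_data first = { (uint64_t)(uintptr_t)&blob, 0, 0 };

        const uint8_t *p = find_setup_data((uint64_t)(uintptr_t)&first, SETUP_CC_BLOB);

        printf("cc blob payload: %#x\n", p ? *(const uint32_t *)p : 0u);
        return 0;
    }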
+ */ +static __init +struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp) +{ + struct cc_setup_data *sd = NULL; + struct setup_data *hdr; + + hdr = (struct setup_data *)bp->hdr.setup_data; + + while (hdr) { + if (hdr->type == SETUP_CC_BLOB) { + sd = (struct cc_setup_data *)hdr; + return (struct cc_blob_sev_info *)(unsigned long)sd->cc_blob_address; + } + hdr = (struct setup_data *)hdr->next; + } + + return NULL; +} + +/* + * Initialize the kernel's copy of the SNP CPUID table, and set up the + * pointer that will be used to access it. + * + * Maintaining a direct mapping of the SNP CPUID table used by firmware would + * be possible as an alternative, but the approach is brittle since the + * mapping needs to be updated in sync with all the changes to virtual memory + * layout and related mapping facilities throughout the boot process. + */ +static void __init setup_cpuid_table(const struct cc_blob_sev_info *cc_info) +{ + const struct snp_cpuid_table *cpuid_table_fw, *cpuid_table; + int i; + + if (!cc_info || !cc_info->cpuid_phys || cc_info->cpuid_len < PAGE_SIZE) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID); + + cpuid_table_fw = (const struct snp_cpuid_table *)cc_info->cpuid_phys; + if (!cpuid_table_fw->count || cpuid_table_fw->count > SNP_CPUID_COUNT_MAX) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID); + + cpuid_table = snp_cpuid_get_table(); + memcpy((void *)cpuid_table, cpuid_table_fw, sizeof(*cpuid_table)); + + /* Initialize CPUID ranges for range-checking. */ + for (i = 0; i < cpuid_table->count; i++) { + const struct snp_cpuid_fn *fn = &cpuid_table->fn[i]; + + if (fn->eax_in == 0x0) + cpuid_std_range_max = fn->eax; + else if (fn->eax_in == 0x40000000) + cpuid_hyp_range_max = fn->eax; + else if (fn->eax_in == 0x80000000) + cpuid_ext_range_max = fn->eax; + } +} + +static int svsm_call_msr_protocol(struct svsm_call *call) +{ + int ret; + + do { + ret = svsm_perform_msr_protocol(call); + } while (ret == -EAGAIN); + + return ret; +} + +static void svsm_pval_4k_page(unsigned long paddr, bool validate, + struct svsm_ca *caa, u64 caa_pa) +{ + struct svsm_pvalidate_call *pc; + struct svsm_call call = {}; + unsigned long flags; + u64 pc_pa; + + /* + * This can be called very early in the boot, use native functions in + * order to avoid paravirt issues. + */ + flags = native_local_irq_save(); + + call.caa = caa; + + pc = (struct svsm_pvalidate_call *)call.caa->svsm_buffer; + pc_pa = caa_pa + offsetof(struct svsm_ca, svsm_buffer); + + pc->num_entries = 1; + pc->cur_index = 0; + pc->entry[0].page_size = RMP_PG_SIZE_4K; + pc->entry[0].action = validate; + pc->entry[0].ignore_cf = 0; + pc->entry[0].rsvd = 0; + pc->entry[0].pfn = paddr >> PAGE_SHIFT; + + /* Protocol 0, Call ID 1 */ + call.rax = SVSM_CORE_CALL(SVSM_CORE_PVALIDATE); + call.rcx = pc_pa; + + /* + * Use the MSR protocol exclusively, so that this code is usable in + * startup code where VA/PA translations of the GHCB page's address may + * be problematic. 
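The MSR protocol mentioned above exchanges a single 64-bit value through the GHCB MSR: a small request/response code in the low bits and the payload above it (the CPUID helper earlier in this file, for instance, reads its result from bits 63:32). A decoding sketch that assumes the conventional 12-bit code field; the response value below is made up:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Assumed split: response code in the low 12 bits, payload in the high bits */
    #define GHCB_MSR_INFO_MASK  0xfffULL

    static uint64_t ghcb_resp_code(uint64_t msr_val)
    {
        return msr_val & GHCB_MSR_INFO_MASK;
    }

    static uint32_t ghcb_cpuid_result(uint64_t msr_val)
    {
        return (uint32_t)(msr_val >> 32);    /* mirrors '*reg = (val >> 32)' above */
    }

    int main(void)
    {
        uint64_t fake_resp = ((uint64_t)0x80000021 << 32) | 0x005;   /* illustrative only */

        printf("code=%#" PRIx64 " result=%#x\n",
               ghcb_resp_code(fake_resp), ghcb_cpuid_result(fake_resp));
        return 0;
    }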
+ */ + if (svsm_call_msr_protocol(&call)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); + + native_local_irq_restore(flags); +} + +static void pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, + bool validate, struct svsm_ca *caa, u64 caa_pa) +{ + int ret; + + if (snp_vmpl) { + svsm_pval_4k_page(paddr, validate, caa, caa_pa); + } else { + ret = pvalidate(vaddr, RMP_PG_SIZE_4K, validate); + if (ret) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); + } + + /* + * If validating memory (making it private) and affected by the + * cache-coherency vulnerability, perform the cache eviction mitigation. + */ + if (validate && sev_snp_needs_sfw) + sev_evict_cache((void *)vaddr, 1); +} + +static void __page_state_change(unsigned long vaddr, unsigned long paddr, + const struct psc_desc *desc) +{ + u64 val, msr; + + /* + * If private -> shared then invalidate the page before requesting the + * state change in the RMP table. + */ + if (desc->op == SNP_PAGE_STATE_SHARED) + pvalidate_4k_page(vaddr, paddr, false, desc->ca, desc->caa_pa); + + /* Save the current GHCB MSR value */ + msr = sev_es_rd_ghcb_msr(); + + /* Issue VMGEXIT to change the page state in RMP table. */ + sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, desc->op)); + VMGEXIT(); + + /* Read the response of the VMGEXIT. */ + val = sev_es_rd_ghcb_msr(); + if ((GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP) || GHCB_MSR_PSC_RESP_VAL(val)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); + + /* Restore the GHCB MSR value */ + sev_es_wr_ghcb_msr(msr); + + /* + * Now that page state is changed in the RMP table, validate it so that it is + * consistent with the RMP entry. + */ + if (desc->op == SNP_PAGE_STATE_PRIVATE) + pvalidate_4k_page(vaddr, paddr, true, desc->ca, desc->caa_pa); +} + +/* + * Maintain the GPA of the SVSM Calling Area (CA) in order to utilize the SVSM + * services needed when not running in VMPL0. + */ +static bool __init svsm_setup_ca(const struct cc_blob_sev_info *cc_info, + void *page) +{ + struct snp_secrets_page *secrets_page; + struct snp_cpuid_table *cpuid_table; + unsigned int i; + u64 caa; + + BUILD_BUG_ON(sizeof(*secrets_page) != PAGE_SIZE); + + /* + * Check if running at VMPL0. + * + * Use RMPADJUST (see the rmpadjust() function for a description of what + * the instruction does) to update the VMPL1 permissions of a page. If + * the guest is running at VMPL0, this will succeed and implies there is + * no SVSM. If the guest is running at any other VMPL, this will fail. + * Linux SNP guests only ever run at a single VMPL level so permission mask + * changes of a lesser-privileged VMPL are a don't-care. + * + * Use a rip-relative reference to obtain the proper address, since this + * routine is running identity mapped when called, both by the decompressor + * code and the early kernel code. + */ + if (!rmpadjust((unsigned long)page, RMP_PG_SIZE_4K, 1)) + return false; + + /* + * Not running at VMPL0, ensure everything has been properly supplied + * for running under an SVSM. 
+ */ + if (!cc_info || !cc_info->secrets_phys || cc_info->secrets_len != PAGE_SIZE) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SECRETS_PAGE); + + secrets_page = (struct snp_secrets_page *)cc_info->secrets_phys; + if (!secrets_page->svsm_size) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_NO_SVSM); + + if (!secrets_page->svsm_guest_vmpl) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_VMPL0); + + snp_vmpl = secrets_page->svsm_guest_vmpl; + + caa = secrets_page->svsm_caa; + + /* + * An open-coded PAGE_ALIGNED() in order to avoid including + * kernel-proper headers into the decompressor. + */ + if (caa & (PAGE_SIZE - 1)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_CAA); + + boot_svsm_caa_pa = caa; + + /* Advertise the SVSM presence via CPUID. */ + cpuid_table = (struct snp_cpuid_table *)snp_cpuid_get_table(); + for (i = 0; i < cpuid_table->count; i++) { + struct snp_cpuid_fn *fn = &cpuid_table->fn[i]; + + if (fn->eax_in == 0x8000001f) + fn->eax |= BIT(28); + } + + return true; +} diff --git a/arch/x86/boot/startup/sev-startup.c b/arch/x86/boot/startup/sev-startup.c new file mode 100644 index 000000000000..09725428d3e6 --- /dev/null +++ b/arch/x86/boot/startup/sev-startup.c @@ -0,0 +1,220 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2019 SUSE + * + * Author: Joerg Roedel <jroedel@suse.de> + */ + +#define pr_fmt(fmt) "SEV: " fmt + +#include <linux/percpu-defs.h> +#include <linux/cc_platform.h> +#include <linux/printk.h> +#include <linux/mm_types.h> +#include <linux/set_memory.h> +#include <linux/memblock.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/cpumask.h> +#include <linux/efi.h> +#include <linux/io.h> +#include <linux/psp-sev.h> +#include <uapi/linux/sev-guest.h> + +#include <asm/init.h> +#include <asm/cpu_entry_area.h> +#include <asm/stacktrace.h> +#include <asm/sev.h> +#include <asm/sev-internal.h> +#include <asm/insn-eval.h> +#include <asm/fpu/xcr.h> +#include <asm/processor.h> +#include <asm/realmode.h> +#include <asm/setup.h> +#include <asm/traps.h> +#include <asm/svm.h> +#include <asm/smp.h> +#include <asm/cpu.h> +#include <asm/apic.h> +#include <asm/cpuid/api.h> +#include <asm/cmdline.h> + +/* Include code shared with pre-decompression boot stage */ +#include "sev-shared.c" + +void +early_set_pages_state(unsigned long vaddr, unsigned long paddr, + unsigned long npages, const struct psc_desc *desc) +{ + unsigned long paddr_end; + + vaddr = vaddr & PAGE_MASK; + + paddr = paddr & PAGE_MASK; + paddr_end = paddr + (npages << PAGE_SHIFT); + + while (paddr < paddr_end) { + __page_state_change(vaddr, paddr, desc); + + vaddr += PAGE_SIZE; + paddr += PAGE_SIZE; + } +} + +void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, + unsigned long npages) +{ + struct psc_desc d = { + SNP_PAGE_STATE_PRIVATE, + rip_rel_ptr(&boot_svsm_ca_page), + boot_svsm_caa_pa + }; + + /* + * This can be invoked in early boot while running identity mapped, so + * use an open coded check for SNP instead of using cc_platform_has(). + * This eliminates worries about jump tables or checking boot_cpu_data + * in the cc_platform_has() function. + */ + if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) + return; + + /* + * Ask the hypervisor to mark the memory pages as private in the RMP + * table. 
+ */ + early_set_pages_state(vaddr, paddr, npages, &d); +} + +void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, + unsigned long npages) +{ + struct psc_desc d = { + SNP_PAGE_STATE_SHARED, + rip_rel_ptr(&boot_svsm_ca_page), + boot_svsm_caa_pa + }; + + /* + * This can be invoked in early boot while running identity mapped, so + * use an open coded check for SNP instead of using cc_platform_has(). + * This eliminates worries about jump tables or checking boot_cpu_data + * in the cc_platform_has() function. + */ + if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) + return; + + /* Ask hypervisor to mark the memory pages shared in the RMP table. */ + early_set_pages_state(vaddr, paddr, npages, &d); +} + +/* + * Initial set up of SNP relies on information provided by the + * Confidential Computing blob, which can be passed to the kernel + * in the following ways, depending on how it is booted: + * + * - when booted via the boot/decompress kernel: + * - via boot_params + * + * - when booted directly by firmware/bootloader (e.g. CONFIG_PVH): + * - via a setup_data entry, as defined by the Linux Boot Protocol + * + * Scan for the blob in that order. + */ +static struct cc_blob_sev_info *__init find_cc_blob(struct boot_params *bp) +{ + struct cc_blob_sev_info *cc_info; + + /* Boot kernel would have passed the CC blob via boot_params. */ + if (bp->cc_blob_address) { + cc_info = (struct cc_blob_sev_info *)(unsigned long)bp->cc_blob_address; + goto found_cc_info; + } + + /* + * If kernel was booted directly, without the use of the + * boot/decompression kernel, the CC blob may have been passed via + * setup_data instead. + */ + cc_info = find_cc_blob_setup_data(bp); + if (!cc_info) + return NULL; + +found_cc_info: + if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); + + return cc_info; +} + +static void __init svsm_setup(struct cc_blob_sev_info *cc_info) +{ + struct snp_secrets_page *secrets = (void *)cc_info->secrets_phys; + struct svsm_call call = {}; + u64 pa; + + /* + * Record the SVSM Calling Area address (CAA) if the guest is not + * running at VMPL0. The CA will be used to communicate with the + * SVSM to perform the SVSM services. + */ + if (!svsm_setup_ca(cc_info, rip_rel_ptr(&boot_svsm_ca_page))) + return; + + /* + * It is very early in the boot and the kernel is running identity + * mapped but without having adjusted the pagetables to where the + * kernel was loaded (physbase), so the get the CA address using + * RIP-relative addressing. + */ + pa = (u64)rip_rel_ptr(&boot_svsm_ca_page); + + /* + * Switch over to the boot SVSM CA while the current CA is still 1:1 + * mapped and thus addressable with VA == PA. There is no GHCB at this + * point so use the MSR protocol. 
+ * + * SVSM_CORE_REMAP_CA call: + * RAX = 0 (Protocol=0, CallID=0) + * RCX = New CA GPA + */ + call.caa = (struct svsm_ca *)secrets->svsm_caa; + call.rax = SVSM_CORE_CALL(SVSM_CORE_REMAP_CA); + call.rcx = pa; + + if (svsm_call_msr_protocol(&call)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_CA_REMAP_FAIL); + + boot_svsm_caa_pa = pa; +} + +bool __init snp_init(struct boot_params *bp) +{ + struct cc_blob_sev_info *cc_info; + + if (!bp) + return false; + + cc_info = find_cc_blob(bp); + if (!cc_info) + return false; + + if (cc_info->secrets_phys && cc_info->secrets_len == PAGE_SIZE) + sev_secrets_pa = cc_info->secrets_phys; + else + return false; + + setup_cpuid_table(cc_info); + + svsm_setup(cc_info); + + /* + * The CC blob will be used later to access the secrets page. Cache + * it here like the boot kernel does. + */ + bp->cc_blob_address = (u32)(unsigned long)cc_info; + + return true; +} diff --git a/arch/x86/boot/startup/sme.c b/arch/x86/boot/startup/sme.c new file mode 100644 index 000000000000..e7ea65f3f1d6 --- /dev/null +++ b/arch/x86/boot/startup/sme.c @@ -0,0 +1,575 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + */ + +/* + * Since we're dealing with identity mappings, physical and virtual + * addresses are the same, so override these defines which are ultimately + * used by the headers in misc.h. + */ +#define __pa(x) ((unsigned long)(x)) +#define __va(x) ((void *)((unsigned long)(x))) + +/* + * Special hack: we have to be careful, because no indirections are + * allowed here, and paravirt_ops is a kind of one. As it will only run in + * baremetal anyway, we just keep it from happening. (This list needs to + * be extended when new paravirt and debugging variants are added.) + */ +#undef CONFIG_PARAVIRT +#undef CONFIG_PARAVIRT_XXL +#undef CONFIG_PARAVIRT_SPINLOCKS + +/* + * This code runs before CPU feature bits are set. By default, the + * pgtable_l5_enabled() function uses bit X86_FEATURE_LA57 to determine if + * 5-level paging is active, so that won't work here. USE_EARLY_PGTABLE_L5 + * is provided to handle this situation and, instead, use a variable that + * has been set by the early boot code. + */ +#define USE_EARLY_PGTABLE_L5 + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/mem_encrypt.h> +#include <linux/cc_platform.h> + +#include <asm/init.h> +#include <asm/setup.h> +#include <asm/sections.h> +#include <asm/coco.h> +#include <asm/sev.h> + +#define PGD_FLAGS _KERNPG_TABLE_NOENC +#define P4D_FLAGS _KERNPG_TABLE_NOENC +#define PUD_FLAGS _KERNPG_TABLE_NOENC +#define PMD_FLAGS _KERNPG_TABLE_NOENC + +#define PMD_FLAGS_LARGE (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL) + +#define PMD_FLAGS_DEC PMD_FLAGS_LARGE +#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_LARGE_CACHE_MASK) | \ + (_PAGE_PAT_LARGE | _PAGE_PWT)) + +#define PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC) + +#define PTE_FLAGS (__PAGE_KERNEL_EXEC & ~_PAGE_GLOBAL) + +#define PTE_FLAGS_DEC PTE_FLAGS +#define PTE_FLAGS_DEC_WP ((PTE_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \ + (_PAGE_PAT | _PAGE_PWT)) + +#define PTE_FLAGS_ENC (PTE_FLAGS | _PAGE_ENC) + +struct sme_populate_pgd_data { + void *pgtable_area; + pgd_t *pgd; + + pmdval_t pmd_flags; + pteval_t pte_flags; + unsigned long paddr; + + unsigned long vaddr; + unsigned long vaddr_end; +}; + +/* + * This work area lives in the .init.scratch section, which lives outside of + * the kernel proper. 
It is sized to hold the intermediate copy buffer and + * more than enough pagetable pages. + * + * By using this section, the kernel can be encrypted in place and it + * avoids any possibility of boot parameters or initramfs images being + * placed such that the in-place encryption logic overwrites them. This + * section is 2MB aligned to allow for simple pagetable setup using only + * PMD entries (see vmlinux.lds.S). + */ +static char sme_workarea[2 * PMD_SIZE] __section(".init.scratch"); + +static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd) +{ + unsigned long pgd_start, pgd_end, pgd_size; + pgd_t *pgd_p; + + pgd_start = ppd->vaddr & PGDIR_MASK; + pgd_end = ppd->vaddr_end & PGDIR_MASK; + + pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1) * sizeof(pgd_t); + + pgd_p = ppd->pgd + pgd_index(ppd->vaddr); + + memset(pgd_p, 0, pgd_size); +} + +static pud_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + pgd = ppd->pgd + pgd_index(ppd->vaddr); + if (pgd_none(*pgd)) { + p4d = ppd->pgtable_area; + memset(p4d, 0, sizeof(*p4d) * PTRS_PER_P4D); + ppd->pgtable_area += sizeof(*p4d) * PTRS_PER_P4D; + set_pgd(pgd, __pgd(PGD_FLAGS | __pa(p4d))); + } + + p4d = p4d_offset(pgd, ppd->vaddr); + if (p4d_none(*p4d)) { + pud = ppd->pgtable_area; + memset(pud, 0, sizeof(*pud) * PTRS_PER_PUD); + ppd->pgtable_area += sizeof(*pud) * PTRS_PER_PUD; + set_p4d(p4d, __p4d(P4D_FLAGS | __pa(pud))); + } + + pud = pud_offset(p4d, ppd->vaddr); + if (pud_none(*pud)) { + pmd = ppd->pgtable_area; + memset(pmd, 0, sizeof(*pmd) * PTRS_PER_PMD); + ppd->pgtable_area += sizeof(*pmd) * PTRS_PER_PMD; + set_pud(pud, __pud(PUD_FLAGS | __pa(pmd))); + } + + if (pud_leaf(*pud)) + return NULL; + + return pud; +} + +static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd) +{ + pud_t *pud; + pmd_t *pmd; + + pud = sme_prepare_pgd(ppd); + if (!pud) + return; + + pmd = pmd_offset(pud, ppd->vaddr); + if (pmd_leaf(*pmd)) + return; + + set_pmd(pmd, __pmd(ppd->paddr | ppd->pmd_flags)); +} + +static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd) +{ + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pud = sme_prepare_pgd(ppd); + if (!pud) + return; + + pmd = pmd_offset(pud, ppd->vaddr); + if (pmd_none(*pmd)) { + pte = ppd->pgtable_area; + memset(pte, 0, sizeof(*pte) * PTRS_PER_PTE); + ppd->pgtable_area += sizeof(*pte) * PTRS_PER_PTE; + set_pmd(pmd, __pmd(PMD_FLAGS | __pa(pte))); + } + + if (pmd_leaf(*pmd)) + return; + + pte = pte_offset_kernel(pmd, ppd->vaddr); + if (pte_none(*pte)) + set_pte(pte, __pte(ppd->paddr | ppd->pte_flags)); +} + +static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd) +{ + while (ppd->vaddr < ppd->vaddr_end) { + sme_populate_pgd_large(ppd); + + ppd->vaddr += PMD_SIZE; + ppd->paddr += PMD_SIZE; + } +} + +static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd) +{ + while (ppd->vaddr < ppd->vaddr_end) { + sme_populate_pgd(ppd); + + ppd->vaddr += PAGE_SIZE; + ppd->paddr += PAGE_SIZE; + } +} + +static void __init __sme_map_range(struct sme_populate_pgd_data *ppd, + pmdval_t pmd_flags, pteval_t pte_flags) +{ + unsigned long vaddr_end; + + ppd->pmd_flags = pmd_flags; + ppd->pte_flags = pte_flags; + + /* Save original end value since we modify the struct value */ + vaddr_end = ppd->vaddr_end; + + /* If start is not 2MB aligned, create PTE entries */ + ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_SIZE); + __sme_map_range_pte(ppd); + + /* Create PMD entries */ + ppd->vaddr_end = 
vaddr_end & PMD_MASK; + __sme_map_range_pmd(ppd); + + /* If end is not 2MB aligned, create PTE entries */ + ppd->vaddr_end = vaddr_end; + __sme_map_range_pte(ppd); +} + +static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd) +{ + __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC); +} + +static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd) +{ + __sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC); +} + +static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd) +{ + __sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP); +} + +static unsigned long __init sme_pgtable_calc(unsigned long len) +{ + unsigned long entries = 0, tables = 0; + + /* + * Perform a relatively simplistic calculation of the pagetable + * entries that are needed. Those mappings will be covered mostly + * by 2MB PMD entries so we can conservatively calculate the required + * number of P4D, PUD and PMD structures needed to perform the + * mappings. For mappings that are not 2MB aligned, PTE mappings + * would be needed for the start and end portion of the address range + * that fall outside of the 2MB alignment. This results in, at most, + * two extra pages to hold PTE entries for each range that is mapped. + * Incrementing the count for each covers the case where the addresses + * cross entries. + */ + + /* PGDIR_SIZE is equal to P4D_SIZE on 4-level machine. */ + if (PTRS_PER_P4D > 1) + entries += (DIV_ROUND_UP(len, PGDIR_SIZE) + 1) * sizeof(p4d_t) * PTRS_PER_P4D; + entries += (DIV_ROUND_UP(len, P4D_SIZE) + 1) * sizeof(pud_t) * PTRS_PER_PUD; + entries += (DIV_ROUND_UP(len, PUD_SIZE) + 1) * sizeof(pmd_t) * PTRS_PER_PMD; + entries += 2 * sizeof(pte_t) * PTRS_PER_PTE; + + /* + * Now calculate the added pagetable structures needed to populate + * the new pagetables. + */ + + if (PTRS_PER_P4D > 1) + tables += DIV_ROUND_UP(entries, PGDIR_SIZE) * sizeof(p4d_t) * PTRS_PER_P4D; + tables += DIV_ROUND_UP(entries, P4D_SIZE) * sizeof(pud_t) * PTRS_PER_PUD; + tables += DIV_ROUND_UP(entries, PUD_SIZE) * sizeof(pmd_t) * PTRS_PER_PMD; + + return entries + tables; +} + +void __init sme_encrypt_kernel(struct boot_params *bp) +{ + unsigned long workarea_start, workarea_end, workarea_len; + unsigned long execute_start, execute_end, execute_len; + unsigned long kernel_start, kernel_end, kernel_len; + unsigned long initrd_start, initrd_end, initrd_len; + struct sme_populate_pgd_data ppd; + unsigned long pgtable_area_len; + unsigned long decrypted_base; + + /* + * This is early code, use an open coded check for SME instead of + * using cc_platform_has(). This eliminates worries about removing + * instrumentation or checking boot_cpu_data in the cc_platform_has() + * function. + */ + if (!sme_get_me_mask() || sev_status & MSR_AMD64_SEV_ENABLED) + return; + + /* + * Prepare for encrypting the kernel and initrd by building new + * pagetables with the necessary attributes needed to encrypt the + * kernel in place. + * + * One range of virtual addresses will map the memory occupied + * by the kernel and initrd as encrypted. + * + * Another range of virtual addresses will map the memory occupied + * by the kernel and initrd as decrypted and write-protected. + * + * The use of write-protect attribute will prevent any of the + * memory from being cached. 
+ */ + + kernel_start = (unsigned long)rip_rel_ptr(_text); + kernel_end = ALIGN((unsigned long)rip_rel_ptr(_end), PMD_SIZE); + kernel_len = kernel_end - kernel_start; + + initrd_start = 0; + initrd_end = 0; + initrd_len = 0; +#ifdef CONFIG_BLK_DEV_INITRD + initrd_len = (unsigned long)bp->hdr.ramdisk_size | + ((unsigned long)bp->ext_ramdisk_size << 32); + if (initrd_len) { + initrd_start = (unsigned long)bp->hdr.ramdisk_image | + ((unsigned long)bp->ext_ramdisk_image << 32); + initrd_end = PAGE_ALIGN(initrd_start + initrd_len); + initrd_len = initrd_end - initrd_start; + } +#endif + + /* + * Calculate required number of workarea bytes needed: + * executable encryption area size: + * stack page (PAGE_SIZE) + * encryption routine page (PAGE_SIZE) + * intermediate copy buffer (PMD_SIZE) + * pagetable structures for the encryption of the kernel + * pagetable structures for workarea (in case not currently mapped) + */ + execute_start = workarea_start = (unsigned long)rip_rel_ptr(sme_workarea); + execute_end = execute_start + (PAGE_SIZE * 2) + PMD_SIZE; + execute_len = execute_end - execute_start; + + /* + * One PGD for both encrypted and decrypted mappings and a set of + * PUDs and PMDs for each of the encrypted and decrypted mappings. + */ + pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD; + pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2; + if (initrd_len) + pgtable_area_len += sme_pgtable_calc(initrd_len) * 2; + + /* PUDs and PMDs needed in the current pagetables for the workarea */ + pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len); + + /* + * The total workarea includes the executable encryption area and + * the pagetable area. The start of the workarea is already 2MB + * aligned, align the end of the workarea on a 2MB boundary so that + * we don't try to create/allocate PTE entries from the workarea + * before it is mapped. + */ + workarea_len = execute_len + pgtable_area_len; + workarea_end = ALIGN(workarea_start + workarea_len, PMD_SIZE); + + /* + * Set the address to the start of where newly created pagetable + * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable + * structures are created when the workarea is added to the current + * pagetables and when the new encrypted and decrypted kernel + * mappings are populated. + */ + ppd.pgtable_area = (void *)execute_end; + + /* + * Make sure the current pagetable structure has entries for + * addressing the workarea. + */ + ppd.pgd = (pgd_t *)native_read_cr3_pa(); + ppd.paddr = workarea_start; + ppd.vaddr = workarea_start; + ppd.vaddr_end = workarea_end; + sme_map_range_decrypted(&ppd); + + /* Flush the TLB - no globals so cr3 is enough */ + native_write_cr3(__native_read_cr3()); + + /* + * A new pagetable structure is being built to allow for the kernel + * and initrd to be encrypted. It starts with an empty PGD that will + * then be populated with new PUDs and PMDs as the encrypted and + * decrypted kernel mappings are created. + */ + ppd.pgd = ppd.pgtable_area; + memset(ppd.pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD); + ppd.pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD; + + /* + * A different PGD index/entry must be used to get different + * pagetable entries for the decrypted mapping. Choose the next + * PGD index and convert it to a virtual address to be used as + * the base of the mapping. 
+ */ + decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1); + if (initrd_len) { + unsigned long check_base; + + check_base = (pgd_index(initrd_end) + 1) & (PTRS_PER_PGD - 1); + decrypted_base = max(decrypted_base, check_base); + } + decrypted_base <<= PGDIR_SHIFT; + + /* Add encrypted kernel (identity) mappings */ + ppd.paddr = kernel_start; + ppd.vaddr = kernel_start; + ppd.vaddr_end = kernel_end; + sme_map_range_encrypted(&ppd); + + /* Add decrypted, write-protected kernel (non-identity) mappings */ + ppd.paddr = kernel_start; + ppd.vaddr = kernel_start + decrypted_base; + ppd.vaddr_end = kernel_end + decrypted_base; + sme_map_range_decrypted_wp(&ppd); + + if (initrd_len) { + /* Add encrypted initrd (identity) mappings */ + ppd.paddr = initrd_start; + ppd.vaddr = initrd_start; + ppd.vaddr_end = initrd_end; + sme_map_range_encrypted(&ppd); + /* + * Add decrypted, write-protected initrd (non-identity) mappings + */ + ppd.paddr = initrd_start; + ppd.vaddr = initrd_start + decrypted_base; + ppd.vaddr_end = initrd_end + decrypted_base; + sme_map_range_decrypted_wp(&ppd); + } + + /* Add decrypted workarea mappings to both kernel mappings */ + ppd.paddr = workarea_start; + ppd.vaddr = workarea_start; + ppd.vaddr_end = workarea_end; + sme_map_range_decrypted(&ppd); + + ppd.paddr = workarea_start; + ppd.vaddr = workarea_start + decrypted_base; + ppd.vaddr_end = workarea_end + decrypted_base; + sme_map_range_decrypted(&ppd); + + /* Perform the encryption */ + sme_encrypt_execute(kernel_start, kernel_start + decrypted_base, + kernel_len, workarea_start, (unsigned long)ppd.pgd); + + if (initrd_len) + sme_encrypt_execute(initrd_start, initrd_start + decrypted_base, + initrd_len, workarea_start, + (unsigned long)ppd.pgd); + + /* + * At this point we are running encrypted. Remove the mappings for + * the decrypted areas - all that is needed for this is to remove + * the PGD entry/entries. + */ + ppd.vaddr = kernel_start + decrypted_base; + ppd.vaddr_end = kernel_end + decrypted_base; + sme_clear_pgd(&ppd); + + if (initrd_len) { + ppd.vaddr = initrd_start + decrypted_base; + ppd.vaddr_end = initrd_end + decrypted_base; + sme_clear_pgd(&ppd); + } + + ppd.vaddr = workarea_start + decrypted_base; + ppd.vaddr_end = workarea_end + decrypted_base; + sme_clear_pgd(&ppd); + + /* Flush the TLB - no globals so cr3 is enough */ + native_write_cr3(__native_read_cr3()); +} + +void __init sme_enable(struct boot_params *bp) +{ + unsigned int eax, ebx, ecx, edx; + unsigned long feature_mask; + unsigned long me_mask; + bool snp_en; + u64 msr; + + snp_en = snp_init(bp); + + /* Check for the SME/SEV support leaf */ + eax = 0x80000000; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + if (eax < 0x8000001f) + return; + +#define AMD_SME_BIT BIT(0) +#define AMD_SEV_BIT BIT(1) + + /* + * Check for the SME/SEV feature: + * CPUID Fn8000_001F[EAX] + * - Bit 0 - Secure Memory Encryption support + * - Bit 1 - Secure Encrypted Virtualization support + * CPUID Fn8000_001F[EBX] + * - Bits 5:0 - Pagetable bit position used to indicate encryption + */ + eax = 0x8000001f; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + /* Check whether SEV or SME is supported */ + if (!(eax & (AMD_SEV_BIT | AMD_SME_BIT))) + return; + + me_mask = 1UL << (ebx & 0x3f); + sev_snp_needs_sfw = !(ebx & BIT(31)); + + /* Check the SEV MSR whether SEV or SME is enabled */ + sev_status = msr = native_rdmsrq(MSR_AMD64_SEV); + feature_mask = (msr & MSR_AMD64_SEV_ENABLED) ? 
AMD_SEV_BIT : AMD_SME_BIT; + + /* + * Any discrepancies between the presence of a CC blob and SNP + * enablement abort the guest. + */ + if (snp_en ^ !!(msr & MSR_AMD64_SEV_SNP_ENABLED)) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); + + /* Check if memory encryption is enabled */ + if (feature_mask == AMD_SME_BIT) { + if (!(bp->hdr.xloadflags & XLF_MEM_ENCRYPTION)) + return; + + /* + * No SME if Hypervisor bit is set. This check is here to + * prevent a guest from trying to enable SME. For running as a + * KVM guest the MSR_AMD64_SYSCFG will be sufficient, but there + * might be other hypervisors which emulate that MSR as non-zero + * or even pass it through to the guest. + * A malicious hypervisor can still trick a guest into this + * path, but there is no way to protect against that. + */ + eax = 1; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + if (ecx & BIT(31)) + return; + + /* For SME, check the SYSCFG MSR */ + msr = native_rdmsrq(MSR_AMD64_SYSCFG); + if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) + return; + } + + sme_me_mask = me_mask; + physical_mask &= ~me_mask; + cc_vendor = CC_VENDOR_AMD; + cc_set_mask(me_mask); +} + +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION +/* Local version for startup code, which never operates on user page tables */ +pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd) +{ + return pgd; +} +#endif diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index 8a3fff9128bb..b25c6a9303b7 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -32,8 +32,8 @@ int memcmp(const void *s1, const void *s2, size_t len) { bool diff; - asm("repe; cmpsb" CC_SET(nz) - : CC_OUT(nz) (diff), "+D" (s1), "+S" (s2), "+c" (len)); + asm("repe cmpsb" + : "=@ccnz" (diff), "+D" (s1), "+S" (s2), "+c" (len)); return diff; } @@ -49,7 +49,7 @@ int strcmp(const char *str1, const char *str2) { const unsigned char *s1 = (const unsigned char *)str1; const unsigned char *s2 = (const unsigned char *)str2; - int delta = 0; + int delta; while (*s1 || *s2) { delta = *s1 - *s2; @@ -88,14 +88,6 @@ size_t strnlen(const char *s, size_t maxlen) return (es - s); } -unsigned int atou(const char *s) -{ - unsigned int i = 0; - while (isdigit(*s)) - i = i * 10 + (*s++ - '0'); - return i; -} - /* Works only for digits and letters, but small and fast */ #define TOLOWER(x) ((x) | 0x20) @@ -350,7 +342,7 @@ static int _kstrtoul(const char *s, unsigned int base, unsigned long *res) } /** - * kstrtoul - convert a string to an unsigned long + * boot_kstrtoul - convert a string to an unsigned long * @s: The start of the string. The string must be null-terminated, and may also * include a single newline before its terminating null. The first character * may also be a plus sign, but not a minus sign. 
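The string.c hunk above switches memcmp() from the CC_SET()/CC_OUT() macros to a bare "=@ccnz" flag-output operand, letting the compiler consume ZF directly after the repe cmpsb instead of going through a setcc. As a quick illustration only — a standalone, x86-only user-space sketch with a hypothetical boot_style_memcmp() wrapper, not kernel code — the same constraint can be exercised like this:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Mirrors the boot memcmp() above: "=@ccnz" makes the compiler read ZF
 * as the result instead of materialising it with a setcc sequence. */
static int boot_style_memcmp(const void *s1, const void *s2, size_t len)
{
	bool diff;

	/* repe cmpsb compares bytes while ZF stays set; diff ends up true
	 * when the last comparison left ZF clear (i.e. a mismatch). */
	asm("repe cmpsb"
	    : "=@ccnz" (diff), "+D" (s1), "+S" (s2), "+c" (len));
	return diff;
}

int main(void)
{
	/* Prints "0 1": equal buffers, then a mismatch in the last byte. */
	printf("%d %d\n",
	       boot_style_memcmp("abc", "abc", 3),
	       boot_style_memcmp("abc", "abd", 3));
	return 0;
}

Like the boot version, the sketch inherits the len == 0 quirk: with no iterations executed, ZF is simply whatever it was before the instruction.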
diff --git a/arch/x86/boot/string.h b/arch/x86/boot/string.h index e5d2c6b8c2f1..a5b05ebc037d 100644 --- a/arch/x86/boot/string.h +++ b/arch/x86/boot/string.h @@ -24,7 +24,6 @@ extern size_t strlen(const char *s); extern char *strstr(const char *s1, const char *s2); extern char *strchr(const char *s, int c); extern size_t strnlen(const char *s, size_t maxlen); -extern unsigned int atou(const char *s); extern unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base); long simple_strtol(const char *cp, char **endp, unsigned int base); diff --git a/arch/x86/boot/tools/.gitignore b/arch/x86/boot/tools/.gitignore deleted file mode 100644 index ae91f4d0d78b..000000000000 --- a/arch/x86/boot/tools/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -build diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c deleted file mode 100644 index a3725ad46c5a..000000000000 --- a/arch/x86/boot/tools/build.c +++ /dev/null @@ -1,500 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 1997 Martin Mares - * Copyright (C) 2007 H. Peter Anvin - */ - -/* - * This file builds a disk-image from three different files: - * - * - setup: 8086 machine code, sets up system parm - * - system: 80386 code for actual system - * - zoffset.h: header with ZO_* defines - * - * It does some checking that all files are of the correct type, and writes - * the result to the specified destination, removing headers and padding to - * the right amount. It also writes some system data to stdout. - */ - -/* - * Changes by tytso to allow root device specification - * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 - * Cross compiling fixes by Gertjan van Wingerde, July 1996 - * Rewritten by Martin Mares, April 1997 - * Substantially overhauled by H. 
Peter Anvin, April 2007 - */ - -#include <stdio.h> -#include <string.h> -#include <stdlib.h> -#include <stdarg.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/mman.h> -#include <tools/le_byteshift.h> - -typedef unsigned char u8; -typedef unsigned short u16; -typedef unsigned int u32; - -#define DEFAULT_MAJOR_ROOT 0 -#define DEFAULT_MINOR_ROOT 0 -#define DEFAULT_ROOT_DEV (DEFAULT_MAJOR_ROOT << 8 | DEFAULT_MINOR_ROOT) - -/* Minimal number of setup sectors */ -#define SETUP_SECT_MIN 5 -#define SETUP_SECT_MAX 64 - -/* This must be large enough to hold the entire setup */ -u8 buf[SETUP_SECT_MAX*512]; - -#define PECOFF_RELOC_RESERVE 0x20 - -#ifdef CONFIG_EFI_MIXED -#define PECOFF_COMPAT_RESERVE 0x20 -#else -#define PECOFF_COMPAT_RESERVE 0x0 -#endif - -static unsigned long efi32_stub_entry; -static unsigned long efi64_stub_entry; -static unsigned long efi_pe_entry; -static unsigned long efi32_pe_entry; -static unsigned long kernel_info; -static unsigned long startup_64; -static unsigned long _ehead; -static unsigned long _end; - -/*----------------------------------------------------------------------*/ - -static const u32 crctab32[] = { - 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, - 0x706af48f, 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, - 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, - 0x90bf1d91, 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, - 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, 0x136c9856, - 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, - 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, - 0xa2677172, 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, - 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, - 0x45df5c75, 0xdcd60dcf, 0xabd13d59, 0x26d930ac, 0x51de003a, - 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, 0xcfba9599, - 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, - 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, - 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, - 0x9fbfe4a5, 0xe8b8d433, 0x7807c9a2, 0x0f00f934, 0x9609a88e, - 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01, - 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed, - 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950, - 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, - 0xfbd44c65, 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, - 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, - 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5, - 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, 0xbe0b1010, - 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, - 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, - 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, - 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af, 0x04db2615, - 0x73dc1683, 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, - 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, 0xf00f9344, - 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, - 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, - 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, - 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, - 0xa6bc5767, 0x3fb506dd, 0x48b2364b, 0xd80d2bda, 0xaf0a1b4c, - 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, 0x316e8eef, - 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, - 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, - 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 
0xb5d0cf31, - 0x2cd99e8b, 0x5bdeae1d, 0x9b64c2b0, 0xec63f226, 0x756aa39c, - 0x026d930a, 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713, - 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b, - 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242, - 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, - 0x18b74777, 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, - 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, 0xa00ae278, - 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7, - 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, 0x40df0b66, - 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, - 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, - 0xcdd70693, 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, - 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, - 0x2d02ef8d -}; - -static u32 partial_crc32_one(u8 c, u32 crc) -{ - return crctab32[(crc ^ c) & 0xff] ^ (crc >> 8); -} - -static u32 partial_crc32(const u8 *s, int len, u32 crc) -{ - while (len--) - crc = partial_crc32_one(*s++, crc); - return crc; -} - -static void die(const char * str, ...) -{ - va_list args; - va_start(args, str); - vfprintf(stderr, str, args); - va_end(args); - fputc('\n', stderr); - exit(1); -} - -static void usage(void) -{ - die("Usage: build setup system zoffset.h image"); -} - -#ifdef CONFIG_EFI_STUB - -static void update_pecoff_section_header_fields(char *section_name, u32 vma, u32 size, u32 datasz, u32 offset) -{ - unsigned int pe_header; - unsigned short num_sections; - u8 *section; - - pe_header = get_unaligned_le32(&buf[0x3c]); - num_sections = get_unaligned_le16(&buf[pe_header + 6]); - -#ifdef CONFIG_X86_32 - section = &buf[pe_header + 0xa8]; -#else - section = &buf[pe_header + 0xb8]; -#endif - - while (num_sections > 0) { - if (strncmp((char*)section, section_name, 8) == 0) { - /* section header size field */ - put_unaligned_le32(size, section + 0x8); - - /* section header vma field */ - put_unaligned_le32(vma, section + 0xc); - - /* section header 'size of initialised data' field */ - put_unaligned_le32(datasz, section + 0x10); - - /* section header 'file offset' field */ - put_unaligned_le32(offset, section + 0x14); - - break; - } - section += 0x28; - num_sections--; - } -} - -static void update_pecoff_section_header(char *section_name, u32 offset, u32 size) -{ - update_pecoff_section_header_fields(section_name, offset, size, size, offset); -} - -static void update_pecoff_setup_and_reloc(unsigned int size) -{ - u32 setup_offset = 0x200; - u32 reloc_offset = size - PECOFF_RELOC_RESERVE - PECOFF_COMPAT_RESERVE; -#ifdef CONFIG_EFI_MIXED - u32 compat_offset = reloc_offset + PECOFF_RELOC_RESERVE; -#endif - u32 setup_size = reloc_offset - setup_offset; - - update_pecoff_section_header(".setup", setup_offset, setup_size); - update_pecoff_section_header(".reloc", reloc_offset, PECOFF_RELOC_RESERVE); - - /* - * Modify .reloc section contents with a single entry. The - * relocation is applied to offset 10 of the relocation section. - */ - put_unaligned_le32(reloc_offset + 10, &buf[reloc_offset]); - put_unaligned_le32(10, &buf[reloc_offset + 4]); - -#ifdef CONFIG_EFI_MIXED - update_pecoff_section_header(".compat", compat_offset, PECOFF_COMPAT_RESERVE); - - /* - * Put the IA-32 machine type (0x14c) and the associated entry point - * address in the .compat section, so loaders can figure out which other - * execution modes this image supports. 
- */ - buf[compat_offset] = 0x1; - buf[compat_offset + 1] = 0x8; - put_unaligned_le16(0x14c, &buf[compat_offset + 2]); - put_unaligned_le32(efi32_pe_entry + size, &buf[compat_offset + 4]); -#endif -} - -static void update_pecoff_text(unsigned int text_start, unsigned int file_sz, - unsigned int init_sz) -{ - unsigned int pe_header; - unsigned int text_sz = file_sz - text_start; - unsigned int bss_sz = init_sz - file_sz; - - pe_header = get_unaligned_le32(&buf[0x3c]); - - /* - * The PE/COFF loader may load the image at an address which is - * misaligned with respect to the kernel_alignment field in the setup - * header. - * - * In order to avoid relocating the kernel to correct the misalignment, - * add slack to allow the buffer to be aligned within the declared size - * of the image. - */ - bss_sz += CONFIG_PHYSICAL_ALIGN; - init_sz += CONFIG_PHYSICAL_ALIGN; - - /* - * Size of code: Subtract the size of the first sector (512 bytes) - * which includes the header. - */ - put_unaligned_le32(file_sz - 512 + bss_sz, &buf[pe_header + 0x1c]); - - /* Size of image */ - put_unaligned_le32(init_sz, &buf[pe_header + 0x50]); - - /* - * Address of entry point for PE/COFF executable - */ - put_unaligned_le32(text_start + efi_pe_entry, &buf[pe_header + 0x28]); - - update_pecoff_section_header_fields(".text", text_start, text_sz + bss_sz, - text_sz, text_start); -} - -static int reserve_pecoff_reloc_section(int c) -{ - /* Reserve 0x20 bytes for .reloc section */ - memset(buf+c, 0, PECOFF_RELOC_RESERVE); - return PECOFF_RELOC_RESERVE; -} - -static void efi_stub_defaults(void) -{ - /* Defaults for old kernel */ -#ifdef CONFIG_X86_32 - efi_pe_entry = 0x10; -#else - efi_pe_entry = 0x210; - startup_64 = 0x200; -#endif -} - -static void efi_stub_entry_update(void) -{ - unsigned long addr = efi32_stub_entry; - -#ifdef CONFIG_X86_64 - /* Yes, this is really how we defined it :( */ - addr = efi64_stub_entry - 0x200; -#endif - -#ifdef CONFIG_EFI_MIXED - if (efi32_stub_entry != addr) - die("32-bit and 64-bit EFI entry points do not match\n"); -#endif - put_unaligned_le32(addr, &buf[0x264]); -} - -#else - -static inline void update_pecoff_setup_and_reloc(unsigned int size) {} -static inline void update_pecoff_text(unsigned int text_start, - unsigned int file_sz, - unsigned int init_sz) {} -static inline void efi_stub_defaults(void) {} -static inline void efi_stub_entry_update(void) {} - -static inline int reserve_pecoff_reloc_section(int c) -{ - return 0; -} -#endif /* CONFIG_EFI_STUB */ - -static int reserve_pecoff_compat_section(int c) -{ - /* Reserve 0x20 bytes for .compat section */ - memset(buf+c, 0, PECOFF_COMPAT_RESERVE); - return PECOFF_COMPAT_RESERVE; -} - -/* - * Parse zoffset.h and find the entry points. We could just #include zoffset.h - * but that would mean tools/build would have to be rebuilt every time. It's - * not as if parsing it is hard... 
- */ -#define PARSE_ZOFS(p, sym) do { \ - if (!strncmp(p, "#define ZO_" #sym " ", 11+sizeof(#sym))) \ - sym = strtoul(p + 11 + sizeof(#sym), NULL, 16); \ -} while (0) - -static void parse_zoffset(char *fname) -{ - FILE *file; - char *p; - int c; - - file = fopen(fname, "r"); - if (!file) - die("Unable to open `%s': %m", fname); - c = fread(buf, 1, sizeof(buf) - 1, file); - if (ferror(file)) - die("read-error on `zoffset.h'"); - fclose(file); - buf[c] = 0; - - p = (char *)buf; - - while (p && *p) { - PARSE_ZOFS(p, efi32_stub_entry); - PARSE_ZOFS(p, efi64_stub_entry); - PARSE_ZOFS(p, efi_pe_entry); - PARSE_ZOFS(p, efi32_pe_entry); - PARSE_ZOFS(p, kernel_info); - PARSE_ZOFS(p, startup_64); - PARSE_ZOFS(p, _ehead); - PARSE_ZOFS(p, _end); - - p = strchr(p, '\n'); - while (p && (*p == '\r' || *p == '\n')) - p++; - } -} - -int main(int argc, char ** argv) -{ - unsigned int i, sz, setup_sectors, init_sz; - int c; - u32 sys_size; - struct stat sb; - FILE *file, *dest; - int fd; - void *kernel; - u32 crc = 0xffffffffUL; - - efi_stub_defaults(); - - if (argc != 5) - usage(); - parse_zoffset(argv[3]); - - dest = fopen(argv[4], "w"); - if (!dest) - die("Unable to write `%s': %m", argv[4]); - - /* Copy the setup code */ - file = fopen(argv[1], "r"); - if (!file) - die("Unable to open `%s': %m", argv[1]); - c = fread(buf, 1, sizeof(buf), file); - if (ferror(file)) - die("read-error on `setup'"); - if (c < 1024) - die("The setup must be at least 1024 bytes"); - if (get_unaligned_le16(&buf[510]) != 0xAA55) - die("Boot block hasn't got boot flag (0xAA55)"); - fclose(file); - - c += reserve_pecoff_compat_section(c); - c += reserve_pecoff_reloc_section(c); - - /* Pad unused space with zeros */ - setup_sectors = (c + 511) / 512; - if (setup_sectors < SETUP_SECT_MIN) - setup_sectors = SETUP_SECT_MIN; - i = setup_sectors*512; - memset(buf+c, 0, i-c); - - update_pecoff_setup_and_reloc(i); - - /* Set the default root device */ - put_unaligned_le16(DEFAULT_ROOT_DEV, &buf[508]); - - /* Open and stat the kernel file */ - fd = open(argv[2], O_RDONLY); - if (fd < 0) - die("Unable to open `%s': %m", argv[2]); - if (fstat(fd, &sb)) - die("Unable to stat `%s': %m", argv[2]); - sz = sb.st_size; - kernel = mmap(NULL, sz, PROT_READ, MAP_SHARED, fd, 0); - if (kernel == MAP_FAILED) - die("Unable to mmap '%s': %m", argv[2]); - /* Number of 16-byte paragraphs, including space for a 4-byte CRC */ - sys_size = (sz + 15 + 4) / 16; -#ifdef CONFIG_EFI_STUB - /* - * COFF requires minimum 32-byte alignment of sections, and - * adding a signature is problematic without that alignment. - */ - sys_size = (sys_size + 1) & ~1; -#endif - - /* Patch the setup code with the appropriate size parameters */ - buf[0x1f1] = setup_sectors-1; - put_unaligned_le32(sys_size, &buf[0x1f4]); - - init_sz = get_unaligned_le32(&buf[0x260]); -#ifdef CONFIG_EFI_STUB - /* - * The decompression buffer will start at ImageBase. When relocating - * the compressed kernel to its end, we must ensure that the head - * section does not get overwritten. The head section occupies - * [i, i + _ehead), and the destination is [init_sz - _end, init_sz). - * - * At present these should never overlap, because 'i' is at most 32k - * because of SETUP_SECT_MAX, '_ehead' is less than 1k, and the - * calculation of INIT_SIZE in boot/header.S ensures that - * 'init_sz - _end' is at least 64k. - * - * For future-proofing, increase init_sz if necessary. 
- */ - - if (init_sz - _end < i + _ehead) { - init_sz = (i + _ehead + _end + 4095) & ~4095; - put_unaligned_le32(init_sz, &buf[0x260]); - } -#endif - update_pecoff_text(setup_sectors * 512, i + (sys_size * 16), init_sz); - - efi_stub_entry_update(); - - /* Update kernel_info offset. */ - put_unaligned_le32(kernel_info, &buf[0x268]); - - crc = partial_crc32(buf, i, crc); - if (fwrite(buf, 1, i, dest) != i) - die("Writing setup failed"); - - /* Copy the kernel code */ - crc = partial_crc32(kernel, sz, crc); - if (fwrite(kernel, 1, sz, dest) != sz) - die("Writing kernel failed"); - - /* Add padding leaving 4 bytes for the checksum */ - while (sz++ < (sys_size*16) - 4) { - crc = partial_crc32_one('\0', crc); - if (fwrite("\0", 1, 1, dest) != 1) - die("Writing padding failed"); - } - - /* Write the CRC */ - put_unaligned_le32(crc, buf); - if (fwrite(buf, 1, 4, dest) != 4) - die("Writing CRC failed"); - - /* Catch any delayed write failures */ - if (fclose(dest)) - die("Writing image failed"); - - close(fd); - - /* Everything is OK */ - return 0; -} diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c index f2e96905b3fe..0641c8c46aee 100644 --- a/arch/x86/boot/video.c +++ b/arch/x86/boot/video.c @@ -292,7 +292,7 @@ static void restore_screen(void) "shrw %%cx ; " "jnc 1f ; " "stosw \n\t" - "1: rep;stosl ; " + "1: rep stosl ; " "popw %%es" : "+D" (dst), "+c" (npad) : "bdS" (video_segment), |
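For reference, the checksum the deleted tools/build.c appended to the image is the standard reflected CRC-32 (polynomial 0xEDB88320); its hard-coded crctab32[] is the usual 256-entry table for that polynomial. The sketch below is a standalone illustration, not part of the tree: it regenerates the table at runtime and folds bytes exactly the way partial_crc32_one()/partial_crc32() did, starting from 0xffffffff and, like build.c, writing the running value without a final inversion.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t crctab32[256];

/* Rebuild the table build.c carried as a literal array. */
static void crc32_init(void)
{
	for (uint32_t i = 0; i < 256; i++) {
		uint32_t c = i;

		for (int k = 0; k < 8; k++)
			c = (c & 1) ? 0xEDB88320U ^ (c >> 1) : c >> 1;
		crctab32[i] = c;	/* crctab32[1] == 0x77073096, as above */
	}
}

/* Same per-byte fold as partial_crc32_one()/partial_crc32() in build.c. */
static uint32_t partial_crc32(const uint8_t *s, size_t len, uint32_t crc)
{
	while (len--)
		crc = crctab32[(crc ^ *s++) & 0xff] ^ (crc >> 8);
	return crc;
}

int main(void)
{
	const char *msg = "bzImage";	/* arbitrary sample input */

	crc32_init();
	/* build.c seeded with 0xffffffff and skipped the final XOR. */
	printf("%08x\n",
	       partial_crc32((const uint8_t *)msg, strlen(msg), 0xffffffffU));
	return 0;
}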
