Diffstat (limited to 'arch/x86/boot/compressed')
37 files changed, 4820 insertions, 1966 deletions
diff --git a/arch/x86/boot/compressed/.gitignore b/arch/x86/boot/compressed/.gitignore index 4a46fab7162e..25805199a506 100644 --- a/arch/x86/boot/compressed/.gitignore +++ b/arch/x86/boot/compressed/.gitignore @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only relocs vmlinux.bin.all vmlinux.relocs diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index dcd90df10ab4..68f9d7a1683b 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -1,54 +1,134 @@ +# SPDX-License-Identifier: GPL-2.0 # # linux/arch/x86/boot/compressed/Makefile # # create a compressed vmlinux image from the original vmlinux # +# vmlinuz is: +# decompression code (*.o) +# asm globals (piggy.S), including: +# vmlinux.bin.(gz|bz2|lzma|...) +# +# vmlinux.bin is: +# vmlinux stripped of debugging and comments +# vmlinux.bin.all is: +# vmlinux.bin + vmlinux.relocs +# vmlinux.bin.(gz|bz2|lzma|...) is: +# (see scripts/Makefile.lib size_append) +# compressed vmlinux.bin.all + u32 size of vmlinux.bin.all targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ - vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 - -KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 -KBUILD_CFLAGS += -fno-strict-aliasing -fPIC + vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 vmlinux.bin.zst + +# CLANG_FLAGS must come before any cc-disable-warning or cc-option calls in +# case of cross compiling, as it has the '--target=' flag, which is needed to +# avoid errors with '-march=i386', and future flags may depend on the target to +# be valid. +KBUILD_CFLAGS := -m$(BITS) -O2 $(CLANG_FLAGS) +KBUILD_CFLAGS += -std=gnu11 -fms-extensions +KBUILD_CFLAGS += -fno-strict-aliasing -fPIE +KBUILD_CFLAGS += -Wundef KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING cflags-$(CONFIG_X86_32) := -march=i386 -cflags-$(CONFIG_X86_64) := -mcmodel=small +cflags-$(CONFIG_X86_64) := -mcmodel=small -mno-red-zone KBUILD_CFLAGS += $(cflags-y) -KBUILD_CFLAGS += $(call cc-option,-ffreestanding) -KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector) +KBUILD_CFLAGS += -mno-mmx -mno-sse +KBUILD_CFLAGS += -ffreestanding -fshort-wchar +KBUILD_CFLAGS += -fno-stack-protector +KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) +ifdef CONFIG_CC_IS_CLANG +KBUILD_CFLAGS += -Wno-gnu +KBUILD_CFLAGS += -Wno-microsoft-anon-tag +endif +KBUILD_CFLAGS += -Wno-pointer-sign +KBUILD_CFLAGS += -fno-asynchronous-unwind-tables +KBUILD_CFLAGS += -D__DISABLE_EXPORTS +# Disable relocation relaxation in case the link is not PIE. +KBUILD_CFLAGS += $(call cc-option,-Wa$(comma)-mrelax-relocations=no) +KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h + +# sev-decode-insn.c indirectly includes inat-table.c which is generated during +# compilation and stored in $(objtree). Add the directory to the includes so +# that the compiler finds it even with out-of-tree builds (make O=/some/path). +CFLAGS_sev-handle-vc.o += -I$(objtree)/arch/x86/lib/ KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ -GCOV_PROFILE := n -LDFLAGS := -m elf_$(UTS_MACHINE) -LDFLAGS_vmlinux := -T +KBUILD_LDFLAGS := -m elf_$(UTS_MACHINE) +KBUILD_LDFLAGS += $(call ld-option,--no-ld-generated-unwind-info) +# Compressed kernel should be built as PIE since it may be loaded at any +# address by the bootloader. 
+LDFLAGS_vmlinux := -pie $(call ld-option, --no-dynamic-linker) +ifdef CONFIG_LD_ORPHAN_WARN +LDFLAGS_vmlinux += --orphan-handling=$(CONFIG_LD_ORPHAN_WARN_LEVEL) +endif +LDFLAGS_vmlinux += -z noexecstack +ifeq ($(CONFIG_LD_IS_BFD),y) +LDFLAGS_vmlinux += $(call ld-option,--no-warn-rwx-segments) +endif +ifeq ($(CONFIG_EFI_STUB),y) +# ensure that the static EFI stub library will be pulled in, even if it is +# never referenced explicitly from the startup code +LDFLAGS_vmlinux += -u efi_pe_entry +endif +LDFLAGS_vmlinux += -T -hostprogs-y := mkpiggy +hostprogs := mkpiggy HOST_EXTRACFLAGS += -I$(srctree)/tools/include -VMLINUX_OBJS = $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \ - $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o \ - $(obj)/piggy.o +sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABbCDGRSTtVW] \(_text\|__start_rodata\|_sinittext\|__inittext_end\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p' + +quiet_cmd_voffset = VOFFSET $@ + cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@ + +targets += ../voffset.h -$(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone +$(obj)/../voffset.h: vmlinux FORCE + $(call if_changed,voffset) -ifeq ($(CONFIG_EFI_STUB), y) - VMLINUX_OBJS += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o +$(obj)/misc.o: $(obj)/../voffset.h + +vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/kernel_info.o $(obj)/head_$(BITS).o \ + $(obj)/misc.o $(obj)/string.o $(obj)/cmdline.o $(obj)/error.o \ + $(obj)/piggy.o $(obj)/cpuflags.o + +vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o +vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o +ifdef CONFIG_X86_64 + vmlinux-objs-y += $(obj)/ident_map_64.o + vmlinux-objs-y += $(obj)/idt_64.o $(obj)/idt_handlers_64.o + vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/mem_encrypt.o + vmlinux-objs-y += $(obj)/pgtable_64.o + vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev.o $(obj)/sev-handle-vc.o endif -$(obj)/vmlinux: $(VMLINUX_OBJS) FORCE +vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o +vmlinux-objs-$(CONFIG_INTEL_TDX_GUEST) += $(obj)/tdx.o $(obj)/tdcall.o $(obj)/tdx-shared.o +vmlinux-objs-$(CONFIG_UNACCEPTED_MEMORY) += $(obj)/mem.o + +vmlinux-objs-$(CONFIG_EFI) += $(obj)/efi.o +vmlinux-libs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a +vmlinux-libs-$(CONFIG_X86_64) += $(objtree)/arch/x86/boot/startup/lib.a +vmlinux-objs-$(CONFIG_EFI_SBAT) += $(obj)/sbat.o + +ifdef CONFIG_EFI_SBAT +$(obj)/sbat.o: $(CONFIG_EFI_SBAT_FILE) +endif + +$(obj)/vmlinux: $(vmlinux-objs-y) $(vmlinux-libs-y) FORCE $(call if_changed,ld) - @: OBJCOPYFLAGS_vmlinux.bin := -R .comment -S $(obj)/vmlinux.bin: vmlinux FORCE $(call if_changed,objcopy) -targets += $(patsubst $(obj)/%,%,$(VMLINUX_OBJS)) vmlinux.bin.all vmlinux.relocs +targets += $(patsubst $(obj)/%,%,$(vmlinux-objs-y)) vmlinux.bin.all vmlinux.relocs CMD_RELOCS = arch/x86/tools/relocs quiet_cmd_relocs = RELOCS $@ cmd_relocs = $(CMD_RELOCS) $< > $@;$(CMD_RELOCS) --abs-relocs $< -$(obj)/vmlinux.relocs: vmlinux FORCE + +$(obj)/vmlinux.relocs: vmlinux.unstripped FORCE $(call if_changed,relocs) vmlinux.bin.all-y := $(obj)/vmlinux.bin @@ -57,15 +137,17 @@ vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs $(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE $(call if_changed,gzip) $(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE - $(call if_changed,bzip2) + $(call if_changed,bzip2_with_size) $(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE - $(call if_changed,lzma) + $(call if_changed,lzma_with_size) $(obj)/vmlinux.bin.xz: 
$(vmlinux.bin.all-y) FORCE - $(call if_changed,xzkern) + $(call if_changed,xzkern_with_size) $(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE - $(call if_changed,lzo) + $(call if_changed,lzo_with_size) $(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE - $(call if_changed,lz4) + $(call if_changed,lz4_with_size) +$(obj)/vmlinux.bin.zst: $(vmlinux.bin.all-y) FORCE + $(call if_changed,zstd22_with_size) suffix-$(CONFIG_KERNEL_GZIP) := gz suffix-$(CONFIG_KERNEL_BZIP2) := bz2 @@ -73,9 +155,10 @@ suffix-$(CONFIG_KERNEL_LZMA) := lzma suffix-$(CONFIG_KERNEL_XZ) := xz suffix-$(CONFIG_KERNEL_LZO) := lzo suffix-$(CONFIG_KERNEL_LZ4) := lz4 +suffix-$(CONFIG_KERNEL_ZSTD) := zst quiet_cmd_mkpiggy = MKPIGGY $@ - cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false ) + cmd_mkpiggy = $(obj)/mkpiggy $< > $@ targets += piggy.S $(obj)/piggy.S: $(obj)/vmlinux.bin.$(suffix-y) $(obj)/mkpiggy FORCE diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c new file mode 100644 index 000000000000..f196b1d1ddf8 --- /dev/null +++ b/arch/x86/boot/compressed/acpi.c @@ -0,0 +1,317 @@ +// SPDX-License-Identifier: GPL-2.0 +#define BOOT_CTYPE_H +#include "misc.h" +#include "error.h" +#include "../string.h" +#include "efi.h" + +#include <asm/bootparam.h> + +#include <linux/numa.h> + +/* + * Longest parameter of 'acpi=' is 'copy_dsdt', plus an extra '\0' + * for termination. + */ +#define MAX_ACPI_ARG_LENGTH 10 + +/* + * Immovable memory regions representation. Max amount of memory regions is + * MAX_NUMNODES*2. + */ +struct mem_vector immovable_mem[MAX_NUMNODES*2]; + +static acpi_physical_address +__efi_get_rsdp_addr(unsigned long cfg_tbl_pa, unsigned int cfg_tbl_len) +{ +#ifdef CONFIG_EFI + unsigned long rsdp_addr; + int ret; + + /* + * Search EFI system tables for RSDP. Preferred is ACPI_20_TABLE_GUID to + * ACPI_TABLE_GUID because it has more features. + */ + rsdp_addr = efi_find_vendor_table(boot_params_ptr, cfg_tbl_pa, cfg_tbl_len, + ACPI_20_TABLE_GUID); + if (rsdp_addr) + return (acpi_physical_address)rsdp_addr; + + /* No ACPI_20_TABLE_GUID found, fallback to ACPI_TABLE_GUID. */ + rsdp_addr = efi_find_vendor_table(boot_params_ptr, cfg_tbl_pa, cfg_tbl_len, + ACPI_TABLE_GUID); + if (rsdp_addr) + return (acpi_physical_address)rsdp_addr; + + debug_putstr("Error getting RSDP address.\n"); +#endif + return 0; +} + +static acpi_physical_address efi_get_rsdp_addr(void) +{ +#ifdef CONFIG_EFI + unsigned long cfg_tbl_pa = 0; + unsigned int cfg_tbl_len; + unsigned long systab_pa; + unsigned int nr_tables; + enum efi_type et; + int ret; + + et = efi_get_type(boot_params_ptr); + if (et == EFI_TYPE_NONE) + return 0; + + systab_pa = efi_get_system_table(boot_params_ptr); + if (!systab_pa) + error("EFI support advertised, but unable to locate system table."); + + ret = efi_get_conf_table(boot_params_ptr, &cfg_tbl_pa, &cfg_tbl_len); + if (ret || !cfg_tbl_pa) + error("EFI config table not found."); + + return __efi_get_rsdp_addr(cfg_tbl_pa, cfg_tbl_len); +#else + return 0; +#endif +} + +static u8 compute_checksum(u8 *buffer, u32 length) +{ + u8 *end = buffer + length; + u8 sum = 0; + + while (buffer < end) + sum += *(buffer++); + + return sum; +} + +/* Search a block of memory for the RSDP signature. 
*/ +static u8 *scan_mem_for_rsdp(u8 *start, u32 length) +{ + struct acpi_table_rsdp *rsdp; + u8 *address, *end; + + end = start + length; + + /* Search from given start address for the requested length */ + for (address = start; address < end; address += ACPI_RSDP_SCAN_STEP) { + /* + * Both RSDP signature and checksum must be correct. + * Note: Sometimes there exists more than one RSDP in memory; + * the valid RSDP has a valid checksum, all others have an + * invalid checksum. + */ + rsdp = (struct acpi_table_rsdp *)address; + + /* BAD Signature */ + if (!ACPI_VALIDATE_RSDP_SIG(rsdp->signature)) + continue; + + /* Check the standard checksum */ + if (compute_checksum((u8 *)rsdp, ACPI_RSDP_CHECKSUM_LENGTH)) + continue; + + /* Check extended checksum if table version >= 2 */ + if ((rsdp->revision >= 2) && + (compute_checksum((u8 *)rsdp, ACPI_RSDP_XCHECKSUM_LENGTH))) + continue; + + /* Signature and checksum valid, we have found a real RSDP */ + return address; + } + return NULL; +} + +/* Search RSDP address in EBDA. */ +static acpi_physical_address bios_get_rsdp_addr(void) +{ + unsigned long address; + u8 *rsdp; + + /* Get the location of the Extended BIOS Data Area (EBDA) */ + address = *(u16 *)ACPI_EBDA_PTR_LOCATION; + address <<= 4; + + /* + * Search EBDA paragraphs (EBDA is required to be a minimum of + * 1K length) + */ + if (address > 0x400) { + rsdp = scan_mem_for_rsdp((u8 *)address, ACPI_EBDA_WINDOW_SIZE); + if (rsdp) + return (acpi_physical_address)(unsigned long)rsdp; + } + + /* Search upper memory: 16-byte boundaries in E0000h-FFFFFh */ + rsdp = scan_mem_for_rsdp((u8 *) ACPI_HI_RSDP_WINDOW_BASE, + ACPI_HI_RSDP_WINDOW_SIZE); + if (rsdp) + return (acpi_physical_address)(unsigned long)rsdp; + + return 0; +} + +/* Return RSDP address on success, otherwise 0. */ +acpi_physical_address get_rsdp_addr(void) +{ + acpi_physical_address pa; + + pa = boot_params_ptr->acpi_rsdp_addr; + + if (!pa) + pa = efi_get_rsdp_addr(); + + if (!pa) + pa = bios_get_rsdp_addr(); + + return pa; +} + +#if defined(CONFIG_RANDOMIZE_BASE) && defined(CONFIG_MEMORY_HOTREMOVE) +/* + * Max length of 64-bit hex address string is 19, prefix "0x" + 16 hex + * digits, and '\0' for termination. + */ +#define MAX_ADDR_LEN 19 + +static unsigned long get_cmdline_acpi_rsdp(void) +{ + unsigned long addr = 0; + +#ifdef CONFIG_KEXEC_CORE + char val[MAX_ADDR_LEN] = { }; + int ret; + + ret = cmdline_find_option("acpi_rsdp", val, MAX_ADDR_LEN); + if (ret < 0) + return 0; + + if (boot_kstrtoul(val, 16, &addr)) + return 0; +#endif + return addr; +} + +/* Compute SRAT address from RSDP. */ +static unsigned long get_acpi_srat_table(void) +{ + unsigned long root_table, acpi_table; + struct acpi_table_header *header; + struct acpi_table_rsdp *rsdp; + u32 num_entries, size, len; + char arg[10]; + u8 *entry; + + /* + * Check whether we were given an RSDP on the command line. We don't + * stash this in boot params because the kernel itself may have + * different ideas about whether to trust a command-line parameter. 
+ */ + rsdp = (struct acpi_table_rsdp *)get_cmdline_acpi_rsdp(); + if (!rsdp) + rsdp = (struct acpi_table_rsdp *)(long) + boot_params_ptr->acpi_rsdp_addr; + + if (!rsdp) + return 0; + + /* Get ACPI root table from RSDP.*/ + if (!(cmdline_find_option("acpi", arg, sizeof(arg)) == 4 && + !strncmp(arg, "rsdt", 4)) && + rsdp->xsdt_physical_address && + rsdp->revision > 1) { + root_table = rsdp->xsdt_physical_address; + size = ACPI_XSDT_ENTRY_SIZE; + } else { + root_table = rsdp->rsdt_physical_address; + size = ACPI_RSDT_ENTRY_SIZE; + } + + if (!root_table) + return 0; + + header = (struct acpi_table_header *)root_table; + len = header->length; + if (len < sizeof(struct acpi_table_header) + size) + return 0; + + num_entries = (len - sizeof(struct acpi_table_header)) / size; + entry = (u8 *)(root_table + sizeof(struct acpi_table_header)); + + while (num_entries--) { + if (size == ACPI_RSDT_ENTRY_SIZE) + acpi_table = *(u32 *)entry; + else + acpi_table = *(u64 *)entry; + + if (acpi_table) { + header = (struct acpi_table_header *)acpi_table; + + if (ACPI_COMPARE_NAMESEG(header->signature, ACPI_SIG_SRAT)) + return acpi_table; + } + entry += size; + } + return 0; +} + +/** + * count_immovable_mem_regions - Parse SRAT and cache the immovable + * memory regions into the immovable_mem array. + * + * Return the number of immovable memory regions on success, 0 on failure: + * + * - Too many immovable memory regions + * - ACPI off or no SRAT found + * - No immovable memory region found. + */ +int count_immovable_mem_regions(void) +{ + unsigned long table_addr, table_end, table; + struct acpi_subtable_header *sub_table; + struct acpi_table_header *table_header; + char arg[MAX_ACPI_ARG_LENGTH]; + int num = 0; + + if (cmdline_find_option("acpi", arg, sizeof(arg)) == 3 && + !strncmp(arg, "off", 3)) + return 0; + + table_addr = get_acpi_srat_table(); + if (!table_addr) + return 0; + + table_header = (struct acpi_table_header *)table_addr; + table_end = table_addr + table_header->length; + table = table_addr + sizeof(struct acpi_table_srat); + + while (table + sizeof(struct acpi_subtable_header) < table_end) { + + sub_table = (struct acpi_subtable_header *)table; + if (!sub_table->length) { + debug_putstr("Invalid zero length SRAT subtable.\n"); + return 0; + } + + if (sub_table->type == ACPI_SRAT_TYPE_MEMORY_AFFINITY) { + struct acpi_srat_mem_affinity *ma; + + ma = (struct acpi_srat_mem_affinity *)sub_table; + if (!(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && ma->length) { + immovable_mem[num].start = ma->base_address; + immovable_mem[num].size = ma->length; + num++; + } + + if (num >= MAX_NUMNODES*2) { + debug_putstr("Too many immovable memory regions, aborting.\n"); + return 0; + } + } + table += sub_table->length; + } + return num; +} +#endif /* CONFIG_RANDOMIZE_BASE && CONFIG_MEMORY_HOTREMOVE */ diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c index bffd73b45b1f..e162d7f59cc5 100644 --- a/arch/x86/boot/compressed/cmdline.c +++ b/arch/x86/boot/compressed/cmdline.c @@ -1,6 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 #include "misc.h" -#ifdef CONFIG_EARLY_PRINTK +#include <asm/bootparam.h> static unsigned long fs; static inline void set_fs(unsigned long seg) @@ -13,11 +14,11 @@ static inline char rdfs8(addr_t addr) return *((char *)(fs + addr)); } #include "../cmdline.c" -static unsigned long get_cmd_line_ptr(void) +unsigned long get_cmd_line_ptr(void) { - unsigned long cmd_line_ptr = real_mode->hdr.cmd_line_ptr; + unsigned long cmd_line_ptr = 
boot_params_ptr->hdr.cmd_line_ptr; - cmd_line_ptr |= (u64)real_mode->ext_cmd_line_ptr << 32; + cmd_line_ptr |= (u64)boot_params_ptr->ext_cmd_line_ptr << 32; return cmd_line_ptr; } @@ -29,5 +30,3 @@ int cmdline_find_option_bool(const char *option) { return __cmdline_find_option_bool(get_cmd_line_ptr(), option); } - -#endif diff --git a/arch/x86/boot/compressed/cpuflags.c b/arch/x86/boot/compressed/cpuflags.c new file mode 100644 index 000000000000..0cc1323896d1 --- /dev/null +++ b/arch/x86/boot/compressed/cpuflags.c @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "../cpuflags.c" + +bool has_cpuflag(int flag) +{ + get_cpuflags(); + + return test_bit(flag, cpu.flags); +} diff --git a/arch/x86/boot/compressed/early_serial_console.c b/arch/x86/boot/compressed/early_serial_console.c index d3d003cb5481..70a8d1706d0f 100644 --- a/arch/x86/boot/compressed/early_serial_console.c +++ b/arch/x86/boot/compressed/early_serial_console.c @@ -1,9 +1,6 @@ #include "misc.h" -#ifdef CONFIG_EARLY_PRINTK - -int early_serial_base; +/* This might be accessed before .bss is cleared, so use .data instead. */ +int early_serial_base __section(".data"); #include "../early_serial_console.c" - -#endif diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c deleted file mode 100644 index d606463aa6d6..000000000000 --- a/arch/x86/boot/compressed/eboot.c +++ /dev/null @@ -1,1278 +0,0 @@ -/* ----------------------------------------------------------------------- - * - * Copyright 2011 Intel Corporation; author Matt Fleming - * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2. - * - * ----------------------------------------------------------------------- */ - -#include <linux/efi.h> -#include <linux/pci.h> -#include <asm/efi.h> -#include <asm/setup.h> -#include <asm/desc.h> - -#undef memcpy /* Use memcpy from misc.c */ - -#include "eboot.h" - -static efi_system_table_t *sys_table; - -static void efi_char16_printk(efi_char16_t *str) -{ - struct efi_simple_text_output_protocol *out; - - out = (struct efi_simple_text_output_protocol *)sys_table->con_out; - efi_call_phys2(out->output_string, out, str); -} - -static void efi_printk(char *str) -{ - char *s8; - - for (s8 = str; *s8; s8++) { - efi_char16_t ch[2] = { 0 }; - - ch[0] = *s8; - if (*s8 == '\n') { - efi_char16_t nl[2] = { '\r', 0 }; - efi_char16_printk(nl); - } - - efi_char16_printk(ch); - } -} - -static efi_status_t __get_map(efi_memory_desc_t **map, unsigned long *map_size, - unsigned long *desc_size) -{ - efi_memory_desc_t *m = NULL; - efi_status_t status; - unsigned long key; - u32 desc_version; - - *map_size = sizeof(*m) * 32; -again: - /* - * Add an additional efi_memory_desc_t because we're doing an - * allocation which may be in a new descriptor region. - */ - *map_size += sizeof(*m); - status = efi_call_phys3(sys_table->boottime->allocate_pool, - EFI_LOADER_DATA, *map_size, (void **)&m); - if (status != EFI_SUCCESS) - goto fail; - - status = efi_call_phys5(sys_table->boottime->get_memory_map, map_size, - m, &key, desc_size, &desc_version); - if (status == EFI_BUFFER_TOO_SMALL) { - efi_call_phys1(sys_table->boottime->free_pool, m); - goto again; - } - - if (status != EFI_SUCCESS) - efi_call_phys1(sys_table->boottime->free_pool, m); - -fail: - *map = m; - return status; -} - -/* - * Allocate at the highest possible address that is not above 'max'. 
- */ -static efi_status_t high_alloc(unsigned long size, unsigned long align, - unsigned long *addr, unsigned long max) -{ - unsigned long map_size, desc_size; - efi_memory_desc_t *map; - efi_status_t status; - unsigned long nr_pages; - u64 max_addr = 0; - int i; - - status = __get_map(&map, &map_size, &desc_size); - if (status != EFI_SUCCESS) - goto fail; - - nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE; -again: - for (i = 0; i < map_size / desc_size; i++) { - efi_memory_desc_t *desc; - unsigned long m = (unsigned long)map; - u64 start, end; - - desc = (efi_memory_desc_t *)(m + (i * desc_size)); - if (desc->type != EFI_CONVENTIONAL_MEMORY) - continue; - - if (desc->num_pages < nr_pages) - continue; - - start = desc->phys_addr; - end = start + desc->num_pages * (1UL << EFI_PAGE_SHIFT); - - if ((start + size) > end || (start + size) > max) - continue; - - if (end - size > max) - end = max; - - if (round_down(end - size, align) < start) - continue; - - start = round_down(end - size, align); - - /* - * Don't allocate at 0x0. It will confuse code that - * checks pointers against NULL. - */ - if (start == 0x0) - continue; - - if (start > max_addr) - max_addr = start; - } - - if (!max_addr) - status = EFI_NOT_FOUND; - else { - status = efi_call_phys4(sys_table->boottime->allocate_pages, - EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA, - nr_pages, &max_addr); - if (status != EFI_SUCCESS) { - max = max_addr; - max_addr = 0; - goto again; - } - - *addr = max_addr; - } - -free_pool: - efi_call_phys1(sys_table->boottime->free_pool, map); - -fail: - return status; -} - -/* - * Allocate at the lowest possible address. - */ -static efi_status_t low_alloc(unsigned long size, unsigned long align, - unsigned long *addr) -{ - unsigned long map_size, desc_size; - efi_memory_desc_t *map; - efi_status_t status; - unsigned long nr_pages; - int i; - - status = __get_map(&map, &map_size, &desc_size); - if (status != EFI_SUCCESS) - goto fail; - - nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE; - for (i = 0; i < map_size / desc_size; i++) { - efi_memory_desc_t *desc; - unsigned long m = (unsigned long)map; - u64 start, end; - - desc = (efi_memory_desc_t *)(m + (i * desc_size)); - - if (desc->type != EFI_CONVENTIONAL_MEMORY) - continue; - - if (desc->num_pages < nr_pages) - continue; - - start = desc->phys_addr; - end = start + desc->num_pages * (1UL << EFI_PAGE_SHIFT); - - /* - * Don't allocate at 0x0. It will confuse code that - * checks pointers against NULL. Skip the first 8 - * bytes so we start at a nice even number. 
- */ - if (start == 0x0) - start += 8; - - start = round_up(start, align); - if ((start + size) > end) - continue; - - status = efi_call_phys4(sys_table->boottime->allocate_pages, - EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA, - nr_pages, &start); - if (status == EFI_SUCCESS) { - *addr = start; - break; - } - } - - if (i == map_size / desc_size) - status = EFI_NOT_FOUND; - -free_pool: - efi_call_phys1(sys_table->boottime->free_pool, map); -fail: - return status; -} - -static void low_free(unsigned long size, unsigned long addr) -{ - unsigned long nr_pages; - - nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE; - efi_call_phys2(sys_table->boottime->free_pages, addr, size); -} - -static void find_bits(unsigned long mask, u8 *pos, u8 *size) -{ - u8 first, len; - - first = 0; - len = 0; - - if (mask) { - while (!(mask & 0x1)) { - mask = mask >> 1; - first++; - } - - while (mask & 0x1) { - mask = mask >> 1; - len++; - } - } - - *pos = first; - *size = len; -} - -static efi_status_t setup_efi_pci(struct boot_params *params) -{ - efi_pci_io_protocol *pci; - efi_status_t status; - void **pci_handle; - efi_guid_t pci_proto = EFI_PCI_IO_PROTOCOL_GUID; - unsigned long nr_pci, size = 0; - int i; - struct setup_data *data; - - data = (struct setup_data *)(unsigned long)params->hdr.setup_data; - - while (data && data->next) - data = (struct setup_data *)(unsigned long)data->next; - - status = efi_call_phys5(sys_table->boottime->locate_handle, - EFI_LOCATE_BY_PROTOCOL, &pci_proto, - NULL, &size, pci_handle); - - if (status == EFI_BUFFER_TOO_SMALL) { - status = efi_call_phys3(sys_table->boottime->allocate_pool, - EFI_LOADER_DATA, size, &pci_handle); - - if (status != EFI_SUCCESS) - return status; - - status = efi_call_phys5(sys_table->boottime->locate_handle, - EFI_LOCATE_BY_PROTOCOL, &pci_proto, - NULL, &size, pci_handle); - } - - if (status != EFI_SUCCESS) - goto free_handle; - - nr_pci = size / sizeof(void *); - for (i = 0; i < nr_pci; i++) { - void *h = pci_handle[i]; - uint64_t attributes; - struct pci_setup_rom *rom; - - status = efi_call_phys3(sys_table->boottime->handle_protocol, - h, &pci_proto, &pci); - - if (status != EFI_SUCCESS) - continue; - - if (!pci) - continue; - -#ifdef CONFIG_X86_64 - status = efi_call_phys4(pci->attributes, pci, - EfiPciIoAttributeOperationGet, 0, - &attributes); -#else - status = efi_call_phys5(pci->attributes, pci, - EfiPciIoAttributeOperationGet, 0, 0, - &attributes); -#endif - if (status != EFI_SUCCESS) - continue; - - if (!pci->romimage || !pci->romsize) - continue; - - size = pci->romsize + sizeof(*rom); - - status = efi_call_phys3(sys_table->boottime->allocate_pool, - EFI_LOADER_DATA, size, &rom); - - if (status != EFI_SUCCESS) - continue; - - rom->data.type = SETUP_PCI; - rom->data.len = size - sizeof(struct setup_data); - rom->data.next = 0; - rom->pcilen = pci->romsize; - - status = efi_call_phys5(pci->pci.read, pci, - EfiPciIoWidthUint16, PCI_VENDOR_ID, - 1, &(rom->vendor)); - - if (status != EFI_SUCCESS) - goto free_struct; - - status = efi_call_phys5(pci->pci.read, pci, - EfiPciIoWidthUint16, PCI_DEVICE_ID, - 1, &(rom->devid)); - - if (status != EFI_SUCCESS) - goto free_struct; - - status = efi_call_phys5(pci->get_location, pci, - &(rom->segment), &(rom->bus), - &(rom->device), &(rom->function)); - - if (status != EFI_SUCCESS) - goto free_struct; - - memcpy(rom->romdata, pci->romimage, pci->romsize); - - if (data) - data->next = (unsigned long)rom; - else - params->hdr.setup_data = (unsigned long)rom; - - data = (struct setup_data *)rom; - - continue; - 
free_struct: - efi_call_phys1(sys_table->boottime->free_pool, rom); - } - -free_handle: - efi_call_phys1(sys_table->boottime->free_pool, pci_handle); - return status; -} - -/* - * See if we have Graphics Output Protocol - */ -static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto, - unsigned long size) -{ - struct efi_graphics_output_protocol *gop, *first_gop; - struct efi_pixel_bitmask pixel_info; - unsigned long nr_gops; - efi_status_t status; - void **gop_handle; - u16 width, height; - u32 fb_base, fb_size; - u32 pixels_per_scan_line; - int pixel_format; - int i; - - status = efi_call_phys3(sys_table->boottime->allocate_pool, - EFI_LOADER_DATA, size, &gop_handle); - if (status != EFI_SUCCESS) - return status; - - status = efi_call_phys5(sys_table->boottime->locate_handle, - EFI_LOCATE_BY_PROTOCOL, proto, - NULL, &size, gop_handle); - if (status != EFI_SUCCESS) - goto free_handle; - - first_gop = NULL; - - nr_gops = size / sizeof(void *); - for (i = 0; i < nr_gops; i++) { - struct efi_graphics_output_mode_info *info; - efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID; - bool conout_found = false; - void *dummy; - void *h = gop_handle[i]; - - status = efi_call_phys3(sys_table->boottime->handle_protocol, - h, proto, &gop); - if (status != EFI_SUCCESS) - continue; - - status = efi_call_phys3(sys_table->boottime->handle_protocol, - h, &conout_proto, &dummy); - - if (status == EFI_SUCCESS) - conout_found = true; - - status = efi_call_phys4(gop->query_mode, gop, - gop->mode->mode, &size, &info); - if (status == EFI_SUCCESS && (!first_gop || conout_found)) { - /* - * Systems that use the UEFI Console Splitter may - * provide multiple GOP devices, not all of which are - * backed by real hardware. The workaround is to search - * for a GOP implementing the ConOut protocol, and if - * one isn't found, to just fall back to the first GOP. - */ - width = info->horizontal_resolution; - height = info->vertical_resolution; - fb_base = gop->mode->frame_buffer_base; - fb_size = gop->mode->frame_buffer_size; - pixel_format = info->pixel_format; - pixel_info = info->pixel_information; - pixels_per_scan_line = info->pixels_per_scan_line; - - /* - * Once we've found a GOP supporting ConOut, - * don't bother looking any further. - */ - first_gop = gop; - if (conout_found) - break; - } - } - - /* Did we find any GOPs? 
*/ - if (!first_gop) - goto free_handle; - - /* EFI framebuffer */ - si->orig_video_isVGA = VIDEO_TYPE_EFI; - - si->lfb_width = width; - si->lfb_height = height; - si->lfb_base = fb_base; - si->pages = 1; - - if (pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) { - si->lfb_depth = 32; - si->lfb_linelength = pixels_per_scan_line * 4; - si->red_size = 8; - si->red_pos = 0; - si->green_size = 8; - si->green_pos = 8; - si->blue_size = 8; - si->blue_pos = 16; - si->rsvd_size = 8; - si->rsvd_pos = 24; - } else if (pixel_format == PIXEL_BGR_RESERVED_8BIT_PER_COLOR) { - si->lfb_depth = 32; - si->lfb_linelength = pixels_per_scan_line * 4; - si->red_size = 8; - si->red_pos = 16; - si->green_size = 8; - si->green_pos = 8; - si->blue_size = 8; - si->blue_pos = 0; - si->rsvd_size = 8; - si->rsvd_pos = 24; - } else if (pixel_format == PIXEL_BIT_MASK) { - find_bits(pixel_info.red_mask, &si->red_pos, &si->red_size); - find_bits(pixel_info.green_mask, &si->green_pos, - &si->green_size); - find_bits(pixel_info.blue_mask, &si->blue_pos, &si->blue_size); - find_bits(pixel_info.reserved_mask, &si->rsvd_pos, - &si->rsvd_size); - si->lfb_depth = si->red_size + si->green_size + - si->blue_size + si->rsvd_size; - si->lfb_linelength = (pixels_per_scan_line * si->lfb_depth) / 8; - } else { - si->lfb_depth = 4; - si->lfb_linelength = si->lfb_width / 2; - si->red_size = 0; - si->red_pos = 0; - si->green_size = 0; - si->green_pos = 0; - si->blue_size = 0; - si->blue_pos = 0; - si->rsvd_size = 0; - si->rsvd_pos = 0; - } - - si->lfb_size = si->lfb_linelength * si->lfb_height; - - si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS; - -free_handle: - efi_call_phys1(sys_table->boottime->free_pool, gop_handle); - return status; -} - -/* - * See if we have Universal Graphics Adapter (UGA) protocol - */ -static efi_status_t setup_uga(struct screen_info *si, efi_guid_t *uga_proto, - unsigned long size) -{ - struct efi_uga_draw_protocol *uga, *first_uga; - unsigned long nr_ugas; - efi_status_t status; - u32 width, height; - void **uga_handle = NULL; - int i; - - status = efi_call_phys3(sys_table->boottime->allocate_pool, - EFI_LOADER_DATA, size, &uga_handle); - if (status != EFI_SUCCESS) - return status; - - status = efi_call_phys5(sys_table->boottime->locate_handle, - EFI_LOCATE_BY_PROTOCOL, uga_proto, - NULL, &size, uga_handle); - if (status != EFI_SUCCESS) - goto free_handle; - - first_uga = NULL; - - nr_ugas = size / sizeof(void *); - for (i = 0; i < nr_ugas; i++) { - efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID; - void *handle = uga_handle[i]; - u32 w, h, depth, refresh; - void *pciio; - - status = efi_call_phys3(sys_table->boottime->handle_protocol, - handle, uga_proto, &uga); - if (status != EFI_SUCCESS) - continue; - - efi_call_phys3(sys_table->boottime->handle_protocol, - handle, &pciio_proto, &pciio); - - status = efi_call_phys5(uga->get_mode, uga, &w, &h, - &depth, &refresh); - if (status == EFI_SUCCESS && (!first_uga || pciio)) { - width = w; - height = h; - - /* - * Once we've found a UGA supporting PCIIO, - * don't bother looking any further. 
- */ - if (pciio) - break; - - first_uga = uga; - } - } - - if (!first_uga) - goto free_handle; - - /* EFI framebuffer */ - si->orig_video_isVGA = VIDEO_TYPE_EFI; - - si->lfb_depth = 32; - si->lfb_width = width; - si->lfb_height = height; - - si->red_size = 8; - si->red_pos = 16; - si->green_size = 8; - si->green_pos = 8; - si->blue_size = 8; - si->blue_pos = 0; - si->rsvd_size = 8; - si->rsvd_pos = 24; - - -free_handle: - efi_call_phys1(sys_table->boottime->free_pool, uga_handle); - return status; -} - -void setup_graphics(struct boot_params *boot_params) -{ - efi_guid_t graphics_proto = EFI_GRAPHICS_OUTPUT_PROTOCOL_GUID; - struct screen_info *si; - efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID; - efi_status_t status; - unsigned long size; - void **gop_handle = NULL; - void **uga_handle = NULL; - - si = &boot_params->screen_info; - memset(si, 0, sizeof(*si)); - - size = 0; - status = efi_call_phys5(sys_table->boottime->locate_handle, - EFI_LOCATE_BY_PROTOCOL, &graphics_proto, - NULL, &size, gop_handle); - if (status == EFI_BUFFER_TOO_SMALL) - status = setup_gop(si, &graphics_proto, size); - - if (status != EFI_SUCCESS) { - size = 0; - status = efi_call_phys5(sys_table->boottime->locate_handle, - EFI_LOCATE_BY_PROTOCOL, &uga_proto, - NULL, &size, uga_handle); - if (status == EFI_BUFFER_TOO_SMALL) - setup_uga(si, &uga_proto, size); - } -} - -struct initrd { - efi_file_handle_t *handle; - u64 size; -}; - -/* - * Check the cmdline for a LILO-style initrd= arguments. - * - * We only support loading an initrd from the same filesystem as the - * kernel image. - */ -static efi_status_t handle_ramdisks(efi_loaded_image_t *image, - struct setup_header *hdr) -{ - struct initrd *initrds; - unsigned long initrd_addr; - efi_guid_t fs_proto = EFI_FILE_SYSTEM_GUID; - u64 initrd_total; - efi_file_io_interface_t *io; - efi_file_handle_t *fh; - efi_status_t status; - int nr_initrds; - char *str; - int i, j, k; - - initrd_addr = 0; - initrd_total = 0; - - str = (char *)(unsigned long)hdr->cmd_line_ptr; - - j = 0; /* See close_handles */ - - if (!str || !*str) - return EFI_SUCCESS; - - for (nr_initrds = 0; *str; nr_initrds++) { - str = strstr(str, "initrd="); - if (!str) - break; - - str += 7; - - /* Skip any leading slashes */ - while (*str == '/' || *str == '\\') - str++; - - while (*str && *str != ' ' && *str != '\n') - str++; - } - - if (!nr_initrds) - return EFI_SUCCESS; - - status = efi_call_phys3(sys_table->boottime->allocate_pool, - EFI_LOADER_DATA, - nr_initrds * sizeof(*initrds), - &initrds); - if (status != EFI_SUCCESS) { - efi_printk("Failed to alloc mem for initrds\n"); - goto fail; - } - - str = (char *)(unsigned long)hdr->cmd_line_ptr; - for (i = 0; i < nr_initrds; i++) { - struct initrd *initrd; - efi_file_handle_t *h; - efi_file_info_t *info; - efi_char16_t filename_16[256]; - unsigned long info_sz; - efi_guid_t info_guid = EFI_FILE_INFO_ID; - efi_char16_t *p; - u64 file_sz; - - str = strstr(str, "initrd="); - if (!str) - break; - - str += 7; - - initrd = &initrds[i]; - p = filename_16; - - /* Skip any leading slashes */ - while (*str == '/' || *str == '\\') - str++; - - while (*str && *str != ' ' && *str != '\n') { - if ((u8 *)p >= (u8 *)filename_16 + sizeof(filename_16)) - break; - - if (*str == '/') { - *p++ = '\\'; - *str++; - } else { - *p++ = *str++; - } - } - - *p = '\0'; - - /* Only open the volume once. 
*/ - if (!i) { - efi_boot_services_t *boottime; - - boottime = sys_table->boottime; - - status = efi_call_phys3(boottime->handle_protocol, - image->device_handle, &fs_proto, &io); - if (status != EFI_SUCCESS) { - efi_printk("Failed to handle fs_proto\n"); - goto free_initrds; - } - - status = efi_call_phys2(io->open_volume, io, &fh); - if (status != EFI_SUCCESS) { - efi_printk("Failed to open volume\n"); - goto free_initrds; - } - } - - status = efi_call_phys5(fh->open, fh, &h, filename_16, - EFI_FILE_MODE_READ, (u64)0); - if (status != EFI_SUCCESS) { - efi_printk("Failed to open initrd file: "); - efi_char16_printk(filename_16); - efi_printk("\n"); - goto close_handles; - } - - initrd->handle = h; - - info_sz = 0; - status = efi_call_phys4(h->get_info, h, &info_guid, - &info_sz, NULL); - if (status != EFI_BUFFER_TOO_SMALL) { - efi_printk("Failed to get initrd info size\n"); - goto close_handles; - } - -grow: - status = efi_call_phys3(sys_table->boottime->allocate_pool, - EFI_LOADER_DATA, info_sz, &info); - if (status != EFI_SUCCESS) { - efi_printk("Failed to alloc mem for initrd info\n"); - goto close_handles; - } - - status = efi_call_phys4(h->get_info, h, &info_guid, - &info_sz, info); - if (status == EFI_BUFFER_TOO_SMALL) { - efi_call_phys1(sys_table->boottime->free_pool, info); - goto grow; - } - - file_sz = info->file_size; - efi_call_phys1(sys_table->boottime->free_pool, info); - - if (status != EFI_SUCCESS) { - efi_printk("Failed to get initrd info\n"); - goto close_handles; - } - - initrd->size = file_sz; - initrd_total += file_sz; - } - - if (initrd_total) { - unsigned long addr; - - /* - * Multiple initrd's need to be at consecutive - * addresses in memory, so allocate enough memory for - * all the initrd's. - */ - status = high_alloc(initrd_total, 0x1000, - &initrd_addr, hdr->initrd_addr_max); - if (status != EFI_SUCCESS) { - efi_printk("Failed to alloc highmem for initrds\n"); - goto close_handles; - } - - /* We've run out of free low memory. */ - if (initrd_addr > hdr->initrd_addr_max) { - efi_printk("We've run out of free low memory\n"); - status = EFI_INVALID_PARAMETER; - goto free_initrd_total; - } - - addr = initrd_addr; - for (j = 0; j < nr_initrds; j++) { - u64 size; - - size = initrds[j].size; - while (size) { - u64 chunksize; - if (size > EFI_READ_CHUNK_SIZE) - chunksize = EFI_READ_CHUNK_SIZE; - else - chunksize = size; - status = efi_call_phys3(fh->read, - initrds[j].handle, - &chunksize, addr); - if (status != EFI_SUCCESS) { - efi_printk("Failed to read initrd\n"); - goto free_initrd_total; - } - addr += chunksize; - size -= chunksize; - } - - efi_call_phys1(fh->close, initrds[j].handle); - } - - } - - efi_call_phys1(sys_table->boottime->free_pool, initrds); - - hdr->ramdisk_image = initrd_addr; - hdr->ramdisk_size = initrd_total; - - return status; - -free_initrd_total: - low_free(initrd_total, initrd_addr); - -close_handles: - for (k = j; k < i; k++) - efi_call_phys1(fh->close, initrds[k].handle); -free_initrds: - efi_call_phys1(sys_table->boottime->free_pool, initrds); -fail: - hdr->ramdisk_image = 0; - hdr->ramdisk_size = 0; - - return status; -} - -/* - * Because the x86 boot code expects to be passed a boot_params we - * need to create one ourselves (usually the bootloader would create - * one for us). 
- */ -struct boot_params *make_boot_params(void *handle, efi_system_table_t *_table) -{ - struct boot_params *boot_params; - struct sys_desc_table *sdt; - struct apm_bios_info *bi; - struct setup_header *hdr; - struct efi_info *efi; - efi_loaded_image_t *image; - void *options; - u32 load_options_size; - efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID; - int options_size = 0; - efi_status_t status; - unsigned long cmdline; - u16 *s2; - u8 *s1; - int i; - - sys_table = _table; - - /* Check if we were booted by the EFI firmware */ - if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) - return NULL; - - status = efi_call_phys3(sys_table->boottime->handle_protocol, - handle, &proto, (void *)&image); - if (status != EFI_SUCCESS) { - efi_printk("Failed to get handle for LOADED_IMAGE_PROTOCOL\n"); - return NULL; - } - - status = low_alloc(0x4000, 1, (unsigned long *)&boot_params); - if (status != EFI_SUCCESS) { - efi_printk("Failed to alloc lowmem for boot params\n"); - return NULL; - } - - memset(boot_params, 0x0, 0x4000); - - hdr = &boot_params->hdr; - efi = &boot_params->efi_info; - bi = &boot_params->apm_bios_info; - sdt = &boot_params->sys_desc_table; - - /* Copy the second sector to boot_params */ - memcpy(&hdr->jump, image->image_base + 512, 512); - - /* - * Fill out some of the header fields ourselves because the - * EFI firmware loader doesn't load the first sector. - */ - hdr->root_flags = 1; - hdr->vid_mode = 0xffff; - hdr->boot_flag = 0xAA55; - - hdr->code32_start = (__u64)(unsigned long)image->image_base; - - hdr->type_of_loader = 0x21; - - /* Convert unicode cmdline to ascii */ - options = image->load_options; - load_options_size = image->load_options_size / 2; /* ASCII */ - cmdline = 0; - s2 = (u16 *)options; - - if (s2) { - while (*s2 && *s2 != '\n' && options_size < load_options_size) { - s2++; - options_size++; - } - - if (options_size) { - if (options_size > hdr->cmdline_size) - options_size = hdr->cmdline_size; - - options_size++; /* NUL termination */ - - status = low_alloc(options_size, 1, &cmdline); - if (status != EFI_SUCCESS) { - efi_printk("Failed to alloc mem for cmdline\n"); - goto fail; - } - - s1 = (u8 *)(unsigned long)cmdline; - s2 = (u16 *)options; - - for (i = 0; i < options_size - 1; i++) - *s1++ = *s2++; - - *s1 = '\0'; - } - } - - hdr->cmd_line_ptr = cmdline; - - hdr->ramdisk_image = 0; - hdr->ramdisk_size = 0; - - /* Clear APM BIOS info */ - memset(bi, 0, sizeof(*bi)); - - memset(sdt, 0, sizeof(*sdt)); - - status = handle_ramdisks(image, hdr); - if (status != EFI_SUCCESS) - goto fail2; - - return boot_params; -fail2: - if (options_size) - low_free(options_size, hdr->cmd_line_ptr); -fail: - low_free(0x4000, (unsigned long)boot_params); - return NULL; -} - -static efi_status_t exit_boot(struct boot_params *boot_params, - void *handle) -{ - struct efi_info *efi = &boot_params->efi_info; - struct e820entry *e820_map = &boot_params->e820_map[0]; - struct e820entry *prev = NULL; - unsigned long size, key, desc_size, _size; - efi_memory_desc_t *mem_map; - efi_status_t status; - __u32 desc_version; - bool called_exit = false; - u8 nr_entries; - int i; - - size = sizeof(*mem_map) * 32; - -again: - size += sizeof(*mem_map) * 2; - _size = size; - status = low_alloc(size, 1, (unsigned long *)&mem_map); - if (status != EFI_SUCCESS) - return status; - -get_map: - status = efi_call_phys5(sys_table->boottime->get_memory_map, &size, - mem_map, &key, &desc_size, &desc_version); - if (status == EFI_BUFFER_TOO_SMALL) { - low_free(_size, (unsigned long)mem_map); - goto 
again; - } - - if (status != EFI_SUCCESS) - goto free_mem_map; - - memcpy(&efi->efi_loader_signature, EFI_LOADER_SIGNATURE, sizeof(__u32)); - efi->efi_systab = (unsigned long)sys_table; - efi->efi_memdesc_size = desc_size; - efi->efi_memdesc_version = desc_version; - efi->efi_memmap = (unsigned long)mem_map; - efi->efi_memmap_size = size; - -#ifdef CONFIG_X86_64 - efi->efi_systab_hi = (unsigned long)sys_table >> 32; - efi->efi_memmap_hi = (unsigned long)mem_map >> 32; -#endif - - /* Might as well exit boot services now */ - status = efi_call_phys2(sys_table->boottime->exit_boot_services, - handle, key); - if (status != EFI_SUCCESS) { - /* - * ExitBootServices() will fail if any of the event - * handlers change the memory map. In which case, we - * must be prepared to retry, but only once so that - * we're guaranteed to exit on repeated failures instead - * of spinning forever. - */ - if (called_exit) - goto free_mem_map; - - called_exit = true; - goto get_map; - } - - /* Historic? */ - boot_params->alt_mem_k = 32 * 1024; - - /* - * Convert the EFI memory map to E820. - */ - nr_entries = 0; - for (i = 0; i < size / desc_size; i++) { - efi_memory_desc_t *d; - unsigned int e820_type = 0; - unsigned long m = (unsigned long)mem_map; - - d = (efi_memory_desc_t *)(m + (i * desc_size)); - switch (d->type) { - case EFI_RESERVED_TYPE: - case EFI_RUNTIME_SERVICES_CODE: - case EFI_RUNTIME_SERVICES_DATA: - case EFI_MEMORY_MAPPED_IO: - case EFI_MEMORY_MAPPED_IO_PORT_SPACE: - case EFI_PAL_CODE: - e820_type = E820_RESERVED; - break; - - case EFI_UNUSABLE_MEMORY: - e820_type = E820_UNUSABLE; - break; - - case EFI_ACPI_RECLAIM_MEMORY: - e820_type = E820_ACPI; - break; - - case EFI_LOADER_CODE: - case EFI_LOADER_DATA: - case EFI_BOOT_SERVICES_CODE: - case EFI_BOOT_SERVICES_DATA: - case EFI_CONVENTIONAL_MEMORY: - e820_type = E820_RAM; - break; - - case EFI_ACPI_MEMORY_NVS: - e820_type = E820_NVS; - break; - - default: - continue; - } - - /* Merge adjacent mappings */ - if (prev && prev->type == e820_type && - (prev->addr + prev->size) == d->phys_addr) - prev->size += d->num_pages << 12; - else { - e820_map->addr = d->phys_addr; - e820_map->size = d->num_pages << 12; - e820_map->type = e820_type; - prev = e820_map++; - nr_entries++; - } - } - - boot_params->e820_entries = nr_entries; - - return EFI_SUCCESS; - -free_mem_map: - low_free(_size, (unsigned long)mem_map); - return status; -} - -static efi_status_t relocate_kernel(struct setup_header *hdr) -{ - unsigned long start, nr_pages; - efi_status_t status; - - /* - * The EFI firmware loader could have placed the kernel image - * anywhere in memory, but the kernel has various restrictions - * on the max physical address it can run at. Attempt to move - * the kernel to boot_params.pref_address, or as low as - * possible. - */ - start = hdr->pref_address; - nr_pages = round_up(hdr->init_size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE; - - status = efi_call_phys4(sys_table->boottime->allocate_pages, - EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA, - nr_pages, &start); - if (status != EFI_SUCCESS) { - status = low_alloc(hdr->init_size, hdr->kernel_alignment, - &start); - if (status != EFI_SUCCESS) - efi_printk("Failed to alloc mem for kernel\n"); - } - - if (status == EFI_SUCCESS) - memcpy((void *)start, (void *)(unsigned long)hdr->code32_start, - hdr->init_size); - - hdr->pref_address = hdr->code32_start; - hdr->code32_start = (__u32)start; - - return status; -} - -/* - * On success we return a pointer to a boot_params structure, and NULL - * on failure. 
- */ -struct boot_params *efi_main(void *handle, efi_system_table_t *_table, - struct boot_params *boot_params) -{ - struct desc_ptr *gdt, *idt; - efi_loaded_image_t *image; - struct setup_header *hdr = &boot_params->hdr; - efi_status_t status; - struct desc_struct *desc; - - sys_table = _table; - - /* Check if we were booted by the EFI firmware */ - if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) - goto fail; - - setup_graphics(boot_params); - - setup_efi_pci(boot_params); - - status = efi_call_phys3(sys_table->boottime->allocate_pool, - EFI_LOADER_DATA, sizeof(*gdt), - (void **)&gdt); - if (status != EFI_SUCCESS) { - efi_printk("Failed to alloc mem for gdt structure\n"); - goto fail; - } - - gdt->size = 0x800; - status = low_alloc(gdt->size, 8, (unsigned long *)&gdt->address); - if (status != EFI_SUCCESS) { - efi_printk("Failed to alloc mem for gdt\n"); - goto fail; - } - - status = efi_call_phys3(sys_table->boottime->allocate_pool, - EFI_LOADER_DATA, sizeof(*idt), - (void **)&idt); - if (status != EFI_SUCCESS) { - efi_printk("Failed to alloc mem for idt structure\n"); - goto fail; - } - - idt->size = 0; - idt->address = 0; - - /* - * If the kernel isn't already loaded at the preferred load - * address, relocate it. - */ - if (hdr->pref_address != hdr->code32_start) { - status = relocate_kernel(hdr); - - if (status != EFI_SUCCESS) - goto fail; - } - - status = exit_boot(boot_params, handle); - if (status != EFI_SUCCESS) - goto fail; - - memset((char *)gdt->address, 0x0, gdt->size); - desc = (struct desc_struct *)gdt->address; - - /* The first GDT is a dummy and the second is unused. */ - desc += 2; - - desc->limit0 = 0xffff; - desc->base0 = 0x0000; - desc->base1 = 0x0000; - desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ; - desc->s = DESC_TYPE_CODE_DATA; - desc->dpl = 0; - desc->p = 1; - desc->limit = 0xf; - desc->avl = 0; - desc->l = 0; - desc->d = SEG_OP_SIZE_32BIT; - desc->g = SEG_GRANULARITY_4KB; - desc->base2 = 0x00; - - desc++; - desc->limit0 = 0xffff; - desc->base0 = 0x0000; - desc->base1 = 0x0000; - desc->type = SEG_TYPE_DATA | SEG_TYPE_READ_WRITE; - desc->s = DESC_TYPE_CODE_DATA; - desc->dpl = 0; - desc->p = 1; - desc->limit = 0xf; - desc->avl = 0; - desc->l = 0; - desc->d = SEG_OP_SIZE_32BIT; - desc->g = SEG_GRANULARITY_4KB; - desc->base2 = 0x00; - -#ifdef CONFIG_X86_64 - /* Task segment value */ - desc++; - desc->limit0 = 0x0000; - desc->base0 = 0x0000; - desc->base1 = 0x0000; - desc->type = SEG_TYPE_TSS; - desc->s = 0; - desc->dpl = 0; - desc->p = 1; - desc->limit = 0x0; - desc->avl = 0; - desc->l = 0; - desc->d = 0; - desc->g = SEG_GRANULARITY_4KB; - desc->base2 = 0x00; -#endif /* CONFIG_X86_64 */ - - asm volatile ("lidt %0" : : "m" (*idt)); - asm volatile ("lgdt %0" : : "m" (*gdt)); - - asm volatile("cli"); - - return boot_params; -fail: - return NULL; -} diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h deleted file mode 100644 index e5b0a8f91c5f..000000000000 --- a/arch/x86/boot/compressed/eboot.h +++ /dev/null @@ -1,71 +0,0 @@ -#ifndef BOOT_COMPRESSED_EBOOT_H -#define BOOT_COMPRESSED_EBOOT_H - -#define SEG_TYPE_DATA (0 << 3) -#define SEG_TYPE_READ_WRITE (1 << 1) -#define SEG_TYPE_CODE (1 << 3) -#define SEG_TYPE_EXEC_READ (1 << 1) -#define SEG_TYPE_TSS ((1 << 3) | (1 << 0)) -#define SEG_OP_SIZE_32BIT (1 << 0) -#define SEG_GRANULARITY_4KB (1 << 0) - -#define DESC_TYPE_CODE_DATA (1 << 0) - -#define EFI_PAGE_SIZE (1UL << EFI_PAGE_SHIFT) -#define EFI_READ_CHUNK_SIZE (1024 * 1024) - -#define EFI_CONSOLE_OUT_DEVICE_GUID \ - 
EFI_GUID(0xd3b36f2c, 0xd551, 0x11d4, 0x9a, 0x46, 0x0, 0x90, 0x27, \ - 0x3f, 0xc1, 0x4d) - -#define PIXEL_RGB_RESERVED_8BIT_PER_COLOR 0 -#define PIXEL_BGR_RESERVED_8BIT_PER_COLOR 1 -#define PIXEL_BIT_MASK 2 -#define PIXEL_BLT_ONLY 3 -#define PIXEL_FORMAT_MAX 4 - -struct efi_pixel_bitmask { - u32 red_mask; - u32 green_mask; - u32 blue_mask; - u32 reserved_mask; -}; - -struct efi_graphics_output_mode_info { - u32 version; - u32 horizontal_resolution; - u32 vertical_resolution; - int pixel_format; - struct efi_pixel_bitmask pixel_information; - u32 pixels_per_scan_line; -} __packed; - -struct efi_graphics_output_protocol_mode { - u32 max_mode; - u32 mode; - unsigned long info; - unsigned long size_of_info; - u64 frame_buffer_base; - unsigned long frame_buffer_size; -} __packed; - -struct efi_graphics_output_protocol { - void *query_mode; - unsigned long set_mode; - unsigned long blt; - struct efi_graphics_output_protocol_mode *mode; -}; - -struct efi_uga_draw_protocol { - void *get_mode; - void *set_mode; - void *blt; -}; - -struct efi_simple_text_output_protocol { - void *reset; - void *output_string; - void *test_string; -}; - -#endif /* BOOT_COMPRESSED_EBOOT_H */ diff --git a/arch/x86/boot/compressed/efi.c b/arch/x86/boot/compressed/efi.c new file mode 100644 index 000000000000..f2e50f9758e6 --- /dev/null +++ b/arch/x86/boot/compressed/efi.c @@ -0,0 +1,236 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Helpers for early access to EFI configuration table. + * + * Originally derived from arch/x86/boot/compressed/acpi.c + */ + +#include "misc.h" + +#include <asm/bootparam.h> + +/** + * efi_get_type - Given a pointer to boot_params, determine the type of EFI environment. + * + * @bp: pointer to boot_params + * + * Return: EFI_TYPE_{32,64} for valid EFI environments, EFI_TYPE_NONE otherwise. + */ +enum efi_type efi_get_type(struct boot_params *bp) +{ + struct efi_info *ei; + enum efi_type et; + const char *sig; + + ei = &bp->efi_info; + sig = (char *)&ei->efi_loader_signature; + + if (!strncmp(sig, EFI64_LOADER_SIGNATURE, 4)) { + et = EFI_TYPE_64; + } else if (!strncmp(sig, EFI32_LOADER_SIGNATURE, 4)) { + et = EFI_TYPE_32; + } else { + debug_putstr("No EFI environment detected.\n"); + et = EFI_TYPE_NONE; + } + +#ifndef CONFIG_X86_64 + /* + * Existing callers like acpi.c treat this case as an indicator to + * fall-through to non-EFI, rather than an error, so maintain that + * functionality here as well. + */ + if (ei->efi_systab_hi || ei->efi_memmap_hi) { + debug_putstr("EFI system table is located above 4GB and cannot be accessed.\n"); + et = EFI_TYPE_NONE; + } +#endif + + return et; +} + +/** + * efi_get_system_table - Given a pointer to boot_params, retrieve the physical address + * of the EFI system table. + * + * @bp: pointer to boot_params + * + * Return: EFI system table address on success. On error, return 0. + */ +unsigned long efi_get_system_table(struct boot_params *bp) +{ + unsigned long sys_tbl_pa; + struct efi_info *ei; + enum efi_type et; + + /* Get systab from boot params. */ + ei = &bp->efi_info; +#ifdef CONFIG_X86_64 + sys_tbl_pa = ei->efi_systab | ((__u64)ei->efi_systab_hi << 32); +#else + sys_tbl_pa = ei->efi_systab; +#endif + if (!sys_tbl_pa) { + debug_putstr("EFI system table not found."); + return 0; + } + + return sys_tbl_pa; +} + +/* + * EFI config table address changes to virtual address after boot, which may + * not be accessible for the kexec'd kernel. 
To address this, kexec provides + * the initial physical address via a struct setup_data entry, which is + * checked for here, along with some sanity checks. + */ +static struct efi_setup_data *get_kexec_setup_data(struct boot_params *bp, + enum efi_type et) +{ +#ifdef CONFIG_X86_64 + struct efi_setup_data *esd = NULL; + struct setup_data *data; + u64 pa_data; + + pa_data = bp->hdr.setup_data; + while (pa_data) { + data = (struct setup_data *)pa_data; + if (data->type == SETUP_EFI) { + esd = (struct efi_setup_data *)(pa_data + sizeof(struct setup_data)); + break; + } + + pa_data = data->next; + } + + /* + * Original ACPI code falls back to attempting normal EFI boot in these + * cases, so maintain existing behavior by indicating non-kexec + * environment to the caller, but print them for debugging. + */ + if (esd && !esd->tables) { + debug_putstr("kexec EFI environment missing valid configuration table.\n"); + return NULL; + } + + return esd; +#endif + return NULL; +} + +/** + * efi_get_conf_table - Given a pointer to boot_params, locate and return the physical + * address of EFI configuration table. + * + * @bp: pointer to boot_params + * @cfg_tbl_pa: location to store physical address of config table + * @cfg_tbl_len: location to store number of config table entries + * + * Return: 0 on success. On error, return params are left unchanged. + */ +int efi_get_conf_table(struct boot_params *bp, unsigned long *cfg_tbl_pa, + unsigned int *cfg_tbl_len) +{ + unsigned long sys_tbl_pa; + enum efi_type et; + int ret; + + if (!cfg_tbl_pa || !cfg_tbl_len) + return -EINVAL; + + sys_tbl_pa = efi_get_system_table(bp); + if (!sys_tbl_pa) + return -EINVAL; + + /* Handle EFI bitness properly */ + et = efi_get_type(bp); + if (et == EFI_TYPE_64) { + efi_system_table_64_t *stbl = (efi_system_table_64_t *)sys_tbl_pa; + struct efi_setup_data *esd; + + /* kexec provides an alternative EFI conf table, check for it. */ + esd = get_kexec_setup_data(bp, et); + + *cfg_tbl_pa = esd ? esd->tables : stbl->tables; + *cfg_tbl_len = stbl->nr_tables; + } else if (et == EFI_TYPE_32) { + efi_system_table_32_t *stbl = (efi_system_table_32_t *)sys_tbl_pa; + + *cfg_tbl_pa = stbl->tables; + *cfg_tbl_len = stbl->nr_tables; + } else { + return -EINVAL; + } + + return 0; +} + +/* Get vendor table address/guid from EFI config table at the given index */ +static int get_vendor_table(void *cfg_tbl, unsigned int idx, + unsigned long *vendor_tbl_pa, + efi_guid_t *vendor_tbl_guid, + enum efi_type et) +{ + if (et == EFI_TYPE_64) { + efi_config_table_64_t *tbl_entry = (efi_config_table_64_t *)cfg_tbl + idx; + + if (!IS_ENABLED(CONFIG_X86_64) && tbl_entry->table >> 32) { + debug_putstr("Error: EFI config table entry located above 4GB.\n"); + return -EINVAL; + } + + *vendor_tbl_pa = tbl_entry->table; + *vendor_tbl_guid = tbl_entry->guid; + + } else if (et == EFI_TYPE_32) { + efi_config_table_32_t *tbl_entry = (efi_config_table_32_t *)cfg_tbl + idx; + + *vendor_tbl_pa = tbl_entry->table; + *vendor_tbl_guid = tbl_entry->guid; + } else { + return -EINVAL; + } + + return 0; +} + +/** + * efi_find_vendor_table - Given EFI config table, search it for the physical + * address of the vendor table associated with GUID. + * + * @bp: pointer to boot_params + * @cfg_tbl_pa: pointer to EFI configuration table + * @cfg_tbl_len: number of entries in EFI configuration table + * @guid: GUID of vendor table + * + * Return: vendor table address on success. On error, return 0. 
+ */ +unsigned long efi_find_vendor_table(struct boot_params *bp, + unsigned long cfg_tbl_pa, + unsigned int cfg_tbl_len, + efi_guid_t guid) +{ + enum efi_type et; + unsigned int i; + + et = efi_get_type(bp); + if (et == EFI_TYPE_NONE) + return 0; + + for (i = 0; i < cfg_tbl_len; i++) { + unsigned long vendor_tbl_pa; + efi_guid_t vendor_tbl_guid; + int ret; + + ret = get_vendor_table((void *)cfg_tbl_pa, i, + &vendor_tbl_pa, + &vendor_tbl_guid, et); + if (ret) + return 0; + + if (!efi_guidcmp(guid, vendor_tbl_guid)) + return vendor_tbl_pa; + } + + return 0; +} diff --git a/arch/x86/boot/compressed/efi.h b/arch/x86/boot/compressed/efi.h new file mode 100644 index 000000000000..b22300970f97 --- /dev/null +++ b/arch/x86/boot/compressed/efi.h @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef BOOT_COMPRESSED_EFI_H +#define BOOT_COMPRESSED_EFI_H + +#if defined(_LINUX_EFI_H) || defined(_ASM_X86_EFI_H) +#error Please do not include kernel proper namespace headers +#endif + +typedef guid_t efi_guid_t __aligned(__alignof__(u32)); + +#define EFI_GUID(a, b, c, d...) (efi_guid_t){ { \ + (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \ + (b) & 0xff, ((b) >> 8) & 0xff, \ + (c) & 0xff, ((c) >> 8) & 0xff, d } } + +#define ACPI_TABLE_GUID EFI_GUID(0xeb9d2d30, 0x2d88, 0x11d3, 0x9a, 0x16, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d) +#define ACPI_20_TABLE_GUID EFI_GUID(0x8868e871, 0xe4f1, 0x11d3, 0xbc, 0x22, 0x00, 0x80, 0xc7, 0x3c, 0x88, 0x81) +#define EFI_CC_BLOB_GUID EFI_GUID(0x067b1f5f, 0xcf26, 0x44c5, 0x85, 0x54, 0x93, 0xd7, 0x77, 0x91, 0x2d, 0x42) +#define LINUX_EFI_UNACCEPTED_MEM_TABLE_GUID EFI_GUID(0xd5d1de3c, 0x105c, 0x44f9, 0x9e, 0xa9, 0xbc, 0xef, 0x98, 0x12, 0x00, 0x31) + +#define EFI32_LOADER_SIGNATURE "EL32" +#define EFI64_LOADER_SIGNATURE "EL64" + +/* + * Generic EFI table header + */ +typedef struct { + u64 signature; + u32 revision; + u32 headersize; + u32 crc32; + u32 reserved; +} efi_table_hdr_t; + +#define EFI_CONVENTIONAL_MEMORY 7 +#define EFI_UNACCEPTED_MEMORY 15 + +#define EFI_MEMORY_MORE_RELIABLE \ + ((u64)0x0000000000010000ULL) /* higher reliability */ +#define EFI_MEMORY_SP ((u64)0x0000000000040000ULL) /* soft reserved */ + +#define EFI_PAGE_SHIFT 12 + +typedef struct { + u32 type; + u32 pad; + u64 phys_addr; + u64 virt_addr; + u64 num_pages; + u64 attribute; +} efi_memory_desc_t; + +#define efi_early_memdesc_ptr(map, desc_size, n) \ + (efi_memory_desc_t *)((void *)(map) + ((n) * (desc_size))) + +typedef struct { + efi_guid_t guid; + u64 table; +} efi_config_table_64_t; + +typedef struct { + efi_guid_t guid; + u32 table; +} efi_config_table_32_t; + +typedef struct { + efi_table_hdr_t hdr; + u64 fw_vendor; /* physical addr of CHAR16 vendor string */ + u32 fw_revision; + u32 __pad1; + u64 con_in_handle; + u64 con_in; + u64 con_out_handle; + u64 con_out; + u64 stderr_handle; + u64 stderr; + u64 runtime; + u64 boottime; + u32 nr_tables; + u32 __pad2; + u64 tables; +} efi_system_table_64_t; + +typedef struct { + efi_table_hdr_t hdr; + u32 fw_vendor; /* physical addr of CHAR16 vendor string */ + u32 fw_revision; + u32 con_in_handle; + u32 con_in; + u32 con_out_handle; + u32 con_out; + u32 stderr_handle; + u32 stderr; + u32 runtime; + u32 boottime; + u32 nr_tables; + u32 tables; +} efi_system_table_32_t; + +struct efi_unaccepted_memory { + u32 version; + u32 unit_size; + u64 phys_base; + u64 size; + unsigned long bitmap[]; +}; + +static inline int efi_guidcmp (efi_guid_t left, efi_guid_t right) +{ + return memcmp(&left, &right, sizeof (efi_guid_t)); +} + +#ifdef 
CONFIG_EFI +bool __pure __efi_soft_reserve_enabled(void); + +static inline bool __pure efi_soft_reserve_enabled(void) +{ + return IS_ENABLED(CONFIG_EFI_SOFT_RESERVE) + && __efi_soft_reserve_enabled(); +} +#else +static inline bool efi_soft_reserve_enabled(void) +{ + return false; +} +#endif /* CONFIG_EFI */ +#endif /* BOOT_COMPRESSED_EFI_H */ diff --git a/arch/x86/boot/compressed/efi_stub_32.S b/arch/x86/boot/compressed/efi_stub_32.S deleted file mode 100644 index a53440e81d52..000000000000 --- a/arch/x86/boot/compressed/efi_stub_32.S +++ /dev/null @@ -1,86 +0,0 @@ -/* - * EFI call stub for IA32. - * - * This stub allows us to make EFI calls in physical mode with interrupts - * turned off. Note that this implementation is different from the one in - * arch/x86/platform/efi/efi_stub_32.S because we're _already_ in physical - * mode at this point. - */ - -#include <linux/linkage.h> -#include <asm/page_types.h> - -/* - * efi_call_phys(void *, ...) is a function with variable parameters. - * All the callers of this function assure that all the parameters are 4-bytes. - */ - -/* - * In gcc calling convention, EBX, ESP, EBP, ESI and EDI are all callee save. - * So we'd better save all of them at the beginning of this function and restore - * at the end no matter how many we use, because we can not assure EFI runtime - * service functions will comply with gcc calling convention, too. - */ - -.text -ENTRY(efi_call_phys) - /* - * 0. The function can only be called in Linux kernel. So CS has been - * set to 0x0010, DS and SS have been set to 0x0018. In EFI, I found - * the values of these registers are the same. And, the corresponding - * GDT entries are identical. So I will do nothing about segment reg - * and GDT, but change GDT base register in prelog and epilog. - */ - - /* - * 1. Because we haven't been relocated by this point we need to - * use relative addressing. - */ - call 1f -1: popl %edx - subl $1b, %edx - - /* - * 2. Now on the top of stack is the return - * address in the caller of efi_call_phys(), then parameter 1, - * parameter 2, ..., param n. To make things easy, we save the return - * address of efi_call_phys in a global variable. - */ - popl %ecx - movl %ecx, saved_return_addr(%edx) - /* get the function pointer into ECX*/ - popl %ecx - movl %ecx, efi_rt_function_ptr(%edx) - - /* - * 3. Call the physical function. - */ - call *%ecx - - /* - * 4. Balance the stack. And because EAX contain the return value, - * we'd better not clobber it. We need to calculate our address - * again because %ecx and %edx are not preserved across EFI function - * calls. - */ - call 1f -1: popl %edx - subl $1b, %edx - - movl efi_rt_function_ptr(%edx), %ecx - pushl %ecx - - /* - * 10. Push the saved return address onto the stack and return. 
- */ - movl saved_return_addr(%edx), %ecx - pushl %ecx - ret -ENDPROC(efi_call_phys) -.previous - -.data -saved_return_addr: - .long 0 -efi_rt_function_ptr: - .long 0 diff --git a/arch/x86/boot/compressed/efi_stub_64.S b/arch/x86/boot/compressed/efi_stub_64.S deleted file mode 100644 index cedc60de86eb..000000000000 --- a/arch/x86/boot/compressed/efi_stub_64.S +++ /dev/null @@ -1 +0,0 @@ -#include "../../platform/efi/efi_stub_64.S" diff --git a/arch/x86/boot/compressed/error.c b/arch/x86/boot/compressed/error.c new file mode 100644 index 000000000000..19a8251de506 --- /dev/null +++ b/arch/x86/boot/compressed/error.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Callers outside of misc.c need access to the error reporting routines, + * but the *_putstr() functions need to stay in misc.c because of how + * memcpy() and memmove() are defined for the compressed boot environment. + */ +#include "misc.h" +#include "error.h" + +void warn(const char *m) +{ + error_putstr("\n\n"); + error_putstr(m); + error_putstr("\n\n"); +} + +void error(char *m) +{ + warn(m); + error_putstr(" -- System halted"); + + while (1) + asm("hlt"); +} + +/* EFI libstub provides vsnprintf() */ +#ifdef CONFIG_EFI_STUB +void panic(const char *fmt, ...) +{ + static char buf[1024]; + va_list args; + int len; + + va_start(args, fmt); + len = vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + + if (len && buf[len - 1] == '\n') + buf[len - 1] = '\0'; + + error(buf); +} +#endif diff --git a/arch/x86/boot/compressed/error.h b/arch/x86/boot/compressed/error.h new file mode 100644 index 000000000000..31f9e080d61a --- /dev/null +++ b/arch/x86/boot/compressed/error.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef BOOT_COMPRESSED_ERROR_H +#define BOOT_COMPRESSED_ERROR_H + +#include <linux/compiler.h> + +void warn(const char *m); +void error(char *m) __noreturn; +void panic(const char *fmt, ...) __noreturn __cold; + +#endif /* BOOT_COMPRESSED_ERROR_H */ diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 1e3184f6072f..1cfe9802a42f 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * linux/boot/head.S * @@ -29,67 +30,22 @@ #include <asm/page_types.h> #include <asm/boot.h> #include <asm/asm-offsets.h> +#include <asm/bootparam.h> - __HEAD -ENTRY(startup_32) -#ifdef CONFIG_EFI_STUB - jmp preferred_addr - - /* - * We don't need the return address, so set up the stack so - * efi_main() can find its arguments. - */ -ENTRY(efi_pe_entry) - add $0x4, %esp - - call make_boot_params - cmpl $0, %eax - je 1f - movl 0x4(%esp), %esi - movl (%esp), %ecx - pushl %eax - pushl %esi - pushl %ecx - sub $0x4, %esp - -ENTRY(efi_stub_entry) - add $0x4, %esp - call efi_main - cmpl $0, %eax - movl %eax, %esi - jne 2f -1: - /* EFI init failed, so hang. */ - hlt - jmp 1b -2: - call 3f -3: - popl %eax - subl $3b, %eax - subl BP_pref_address(%esi), %eax - add BP_code32_start(%esi), %eax - leal preferred_addr(%eax), %eax - jmp *%eax +/* + * These symbols needed to be marked as .hidden to prevent the BFD linker from + * generating R_386_32 (rather than R_386_RELATIVE) relocations for them when + * the 32-bit compressed kernel is linked as PIE. This is no longer necessary, + * but it doesn't hurt to keep them .hidden. 
+ */ + .hidden _bss + .hidden _ebss + .hidden _end -preferred_addr: -#endif + __HEAD +SYM_FUNC_START(startup_32) cld - /* - * Test KEEP_SEGMENTS flag to see if the bootloader is asking - * us to not reload segments - */ - testb $(1<<6), BP_loadflags(%esi) - jnz 1f - cli - movl $__BOOT_DS, %eax - movl %eax, %ds - movl %eax, %es - movl %eax, %fs - movl %eax, %gs - movl %eax, %ss -1: /* * Calculate the delta between where we were compiled to run @@ -101,31 +57,51 @@ preferred_addr: */ leal (BP_scratch+4)(%esi), %esp call 1f -1: popl %ebp - subl $1b, %ebp +1: popl %edx + addl $_GLOBAL_OFFSET_TABLE_+(.-1b), %edx + + /* Load new GDT */ + leal gdt@GOTOFF(%edx), %eax + movl %eax, 2(%eax) + lgdt (%eax) + + /* Load segment registers with our descriptors */ + movl $__BOOT_DS, %eax + movl %eax, %ds + movl %eax, %es + movl %eax, %fs + movl %eax, %gs + movl %eax, %ss /* - * %ebp contains the address we are loaded at by the boot loader and %ebx - * contains the address where we should move the kernel image temporarily - * for safe in-place decompression. + * %edx contains the address we are loaded at by the boot loader (plus the + * offset to the GOT). The below code calculates %ebx to be the address where + * we should move the kernel image temporarily for safe in-place decompression + * (again, plus the offset to the GOT). + * + * %ebp is calculated to be the address that the kernel will be decompressed to. */ #ifdef CONFIG_RELOCATABLE - movl %ebp, %ebx + leal startup_32@GOTOFF(%edx), %ebx movl BP_kernel_alignment(%esi), %eax decl %eax addl %eax, %ebx notl %eax andl %eax, %ebx -#else - movl $LOAD_PHYSICAL_ADDR, %ebx + cmpl $LOAD_PHYSICAL_ADDR, %ebx + jae 1f #endif + movl $LOAD_PHYSICAL_ADDR, %ebx +1: + movl %ebx, %ebp // Save the output address for later /* Target address to relocate to for decompression */ - addl $z_extract_offset, %ebx + addl BP_init_size(%esi), %ebx + subl $_end@GOTOFF, %ebx /* Set up the stack */ - leal boot_stack_end(%ebx), %esp + leal boot_stack_end@GOTOFF(%ebx), %esp /* Zero EFLAGS */ pushl $0 @@ -136,8 +112,8 @@ preferred_addr: * where decompression in place becomes safe. */ pushl %esi - leal (_bss-4)(%ebp), %esi - leal (_bss-4)(%ebx), %edi + leal (_bss@GOTOFF-4)(%edx), %esi + leal (_bss@GOTOFF-4)(%ebx), %edi movl $(_bss - startup_32), %ecx shrl $2, %ecx std @@ -145,93 +121,68 @@ preferred_addr: cld popl %esi + /* + * The GDT may get overwritten either during the copy we just did or + * during extract_kernel below. To avoid any issues, repoint the GDTR + * to the new copy of the GDT. + */ + leal gdt@GOTOFF(%ebx), %eax + movl %eax, 2(%eax) + lgdt (%eax) + /* * Jump to the relocated address. */ - leal relocated(%ebx), %eax + leal .Lrelocated@GOTOFF(%ebx), %eax jmp *%eax -ENDPROC(startup_32) +SYM_FUNC_END(startup_32) .text -relocated: +SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) /* * Clear BSS (stack is currently empty) */ xorl %eax, %eax - leal _bss(%ebx), %edi - leal _ebss(%ebx), %ecx + leal _bss@GOTOFF(%ebx), %edi + leal _ebss@GOTOFF(%ebx), %ecx subl %edi, %ecx shrl $2, %ecx rep stosl /* - * Adjust our own GOT - */ - leal _got(%ebx), %edx - leal _egot(%ebx), %ecx -1: - cmpl %ecx, %edx - jae 2f - addl %ebx, (%edx) - addl $4, %edx - jmp 1b -2: - -/* - * Do the decompression, and jump to the new kernel.. 
- */ - leal z_extract_offset_negative(%ebx), %ebp - /* push arguments for decompress_kernel: */ - pushl %ebp /* output address */ - pushl $z_input_len /* input_len */ - leal input_data(%ebx), %eax - pushl %eax /* input_data */ - leal boot_heap(%ebx), %eax - pushl %eax /* heap area */ - pushl %esi /* real mode pointer */ - call decompress_kernel - addl $20, %esp - -#if CONFIG_RELOCATABLE -/* - * Find the address of the relocations. - */ - leal z_output_len(%ebp), %edi - -/* - * Calculate the delta between where vmlinux was compiled to run - * and where it was actually loaded. - */ - movl %ebp, %ebx - subl $LOAD_PHYSICAL_ADDR, %ebx - jz 2f /* Nothing to be done if loaded at compiled addr. */ -/* - * Process relocations. + * Do the extraction, and jump to the new kernel.. */ + /* push arguments for extract_kernel: */ -1: subl $4, %edi - movl (%edi), %ecx - testl %ecx, %ecx - jz 2f - addl %ebx, -__PAGE_OFFSET(%ebx, %ecx) - jmp 1b -2: -#endif + pushl %ebp /* output address */ + pushl %esi /* real mode pointer */ + call extract_kernel /* returns kernel entry point in %eax */ + addl $24, %esp /* - * Jump to the decompressed kernel. + * Jump to the extracted kernel. */ xorl %ebx, %ebx - jmp *%ebp + jmp *%eax +SYM_FUNC_END(.Lrelocated) + + .data + .balign 8 +SYM_DATA_START_LOCAL(gdt) + .word gdt_end - gdt - 1 + .long 0 + .word 0 + .quad 0x0000000000000000 /* Reserved */ + .quad 0x00cf9a000000ffff /* __KERNEL_CS */ + .quad 0x00cf92000000ffff /* __KERNEL_DS */ +SYM_DATA_END_LABEL(gdt, SYM_L_LOCAL, gdt_end) /* * Stack and heap for uncompression */ .bss .balign 4 -boot_heap: - .fill BOOT_HEAP_SIZE, 1, 0 boot_stack: .fill BOOT_STACK_SIZE, 1, 0 boot_stack_end: diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 06e71c2c16bf..d9dab940ff62 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * linux/boot/head.S * @@ -31,10 +32,54 @@ #include <asm/msr.h> #include <asm/processor-flags.h> #include <asm/asm-offsets.h> +#include <asm/bootparam.h> +#include <asm/desc_defs.h> +#include <asm/trapnr.h> + +/* + * Fix alignment at 16 bytes. Following CONFIG_FUNCTION_ALIGNMENT will result + * in assembly errors due to trying to move .org backward due to the excessive + * alignment. + */ +#undef __ALIGN +#define __ALIGN .balign 16, 0x90 + +/* + * Locally defined symbols should be marked hidden: + */ + .hidden _bss + .hidden _ebss + .hidden _end __HEAD + +/* + * This macro gives the relative virtual address of X, i.e. the offset of X + * from startup_32. This is the same as the link-time virtual address of X, + * since startup_32 is at 0, but defining it this way tells the + * assembler/linker that we do not want the actual run-time address of X. This + * prevents the linker from trying to create unwanted run-time relocation + * entries for the reference when the compressed kernel is linked as PIE. + * + * A reference X(%reg) will result in the link-time VA of X being stored with + * the instruction, and a run-time R_X86_64_RELATIVE relocation entry that + * adds the 64-bit base address where the kernel is loaded. + * + * Replacing it with (X-startup_32)(%reg) results in the offset being stored, + * and no run-time relocation. + * + * The macro should be used as a displacement with a base register containing + * the run-time address of startup_32 [i.e. rva(X)(%reg)], or as an immediate + * [$ rva(X)]. 
+ * + * This macro can only be used from within the .head.text section, since the + * expression requires startup_32 to be in the same section as the code being + * assembled. + */ +#define rva(X) ((X) - startup_32) + .code32 -ENTRY(startup_32) +SYM_FUNC_START(startup_32) /* * 32bit entry is 0 and it is ABI so immutable! * If we come here directly from a bootloader, @@ -42,19 +87,7 @@ ENTRY(startup_32) * all need to be under the 4G limit. */ cld - /* - * Test KEEP_SEGMENTS flag to see if the bootloader is asking - * us to not reload segments - */ - testb $(1<<6), BP_loadflags(%esi) - jnz 1f - cli - movl $(__BOOT_DS), %eax - movl %eax, %ds - movl %eax, %es - movl %eax, %ss -1: /* * Calculate the delta between where we were compiled to run @@ -67,16 +100,39 @@ ENTRY(startup_32) leal (BP_scratch+4)(%esi), %esp call 1f 1: popl %ebp - subl $1b, %ebp + subl $ rva(1b), %ebp + + /* Load new GDT with the 64bit segments using 32bit descriptor */ + leal rva(gdt)(%ebp), %eax + movl %eax, 2(%eax) + lgdt (%eax) + + /* Load segment registers with our descriptors */ + movl $__BOOT_DS, %eax + movl %eax, %ds + movl %eax, %es + movl %eax, %fs + movl %eax, %gs + movl %eax, %ss + + /* Setup a stack and load CS from current GDT */ + leal rva(boot_stack_end)(%ebp), %esp -/* setup a stack and make sure cpu supports long mode. */ - movl $boot_stack_end, %eax - addl %ebp, %eax - movl %eax, %esp + pushl $__KERNEL32_CS + leal rva(1f)(%ebp), %eax + pushl %eax + lretl +1: + + /* Setup Exception handling for SEV-ES */ +#ifdef CONFIG_AMD_MEM_ENCRYPT + call startup32_load_idt +#endif + /* Make sure cpu supports long mode. */ call verify_cpu testl %eax, %eax - jnz no_longmode + jnz .Lno_longmode /* * Compute the delta between where we were compiled to run at @@ -94,62 +150,88 @@ ENTRY(startup_32) addl %eax, %ebx notl %eax andl %eax, %ebx -#else - movl $LOAD_PHYSICAL_ADDR, %ebx + cmpl $LOAD_PHYSICAL_ADDR, %ebx + jae 1f #endif + movl $LOAD_PHYSICAL_ADDR, %ebx +1: /* Target address to relocate to for decompression */ - addl $z_extract_offset, %ebx + addl BP_init_size(%esi), %ebx + subl $ rva(_end), %ebx /* * Prepare for entering 64 bit mode */ - /* Load new GDT with the 64bit segments using 32bit descriptor */ - leal gdt(%ebp), %eax - movl %eax, gdt+2(%ebp) - lgdt gdt(%ebp) - /* Enable PAE mode */ - movl $(X86_CR4_PAE), %eax + movl %cr4, %eax + orl $X86_CR4_PAE, %eax movl %eax, %cr4 /* * Build early 4G boot pagetable */ + /* + * If SEV is active then set the encryption mask in the page tables. + * This will ensure that when the kernel is copied and decompressed + * it will be done so encrypted. + */ + xorl %edx, %edx +#ifdef CONFIG_AMD_MEM_ENCRYPT + call get_sev_encryption_bit + xorl %edx, %edx + testl %eax, %eax + jz 1f + subl $32, %eax /* Encryption bit is always above bit 31 */ + bts %eax, %edx /* Set encryption mask for page tables */ + /* + * Set MSR_AMD64_SEV_ENABLED_BIT in sev_status so that + * startup32_check_sev_cbit() will do a check. sev_enable() will + * initialize sev_status with all the bits reported by + * MSR_AMD_SEV_STATUS later, but only MSR_AMD64_SEV_ENABLED_BIT + * needs to be set for now. 
+ */ + movl $1, rva(sev_status)(%ebp) +1: +#endif + /* Initialize Page tables to 0 */ - leal pgtable(%ebx), %edi + leal rva(pgtable)(%ebx), %edi xorl %eax, %eax - movl $((4096*6)/4), %ecx + movl $(BOOT_INIT_PGT_SIZE/4), %ecx rep stosl /* Build Level 4 */ - leal pgtable + 0(%ebx), %edi + leal rva(pgtable + 0)(%ebx), %edi leal 0x1007 (%edi), %eax movl %eax, 0(%edi) + addl %edx, 4(%edi) /* Build Level 3 */ - leal pgtable + 0x1000(%ebx), %edi + leal rva(pgtable + 0x1000)(%ebx), %edi leal 0x1007(%edi), %eax movl $4, %ecx 1: movl %eax, 0x00(%edi) + addl %edx, 0x04(%edi) addl $0x00001000, %eax addl $8, %edi decl %ecx jnz 1b /* Build Level 2 */ - leal pgtable + 0x2000(%ebx), %edi + leal rva(pgtable + 0x2000)(%ebx), %edi movl $0x00000183, %eax movl $2048, %ecx 1: movl %eax, 0(%edi) + addl %edx, 4(%edi) addl $0x00200000, %eax addl $8, %edi decl %ecx jnz 1b /* Enable the boot page tables */ - leal pgtable(%ebx), %eax + leal rva(pgtable)(%ebx), %eax movl %eax, %cr3 /* Enable Long mode in EFER (Extended Feature Enable Register) */ @@ -161,34 +243,39 @@ ENTRY(startup_32) /* After gdt is loaded */ xorl %eax, %eax lldt %ax - movl $0x20, %eax + movl $__BOOT_TSS, %eax ltr %ax +#ifdef CONFIG_AMD_MEM_ENCRYPT + /* Check if the C-bit position is correct when SEV is active */ + call startup32_check_sev_cbit +#endif + /* * Setup for the jump to 64bit mode * - * When the jump is performend we will be in long mode but + * When the jump is performed we will be in long mode but * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1 * (and in turn EFER.LMA = 1). To jump into 64bit mode we use * the new gdt/idt that has __KERNEL_CS with CS.L = 1. * We place all of the values on our mini stack so lret can * used to perform that far jump. */ + leal rva(startup_64)(%ebp), %eax pushl $__KERNEL_CS - leal startup_64(%ebp), %eax pushl %eax /* Enter paged protected Mode, activating Long Mode */ - movl $(X86_CR0_PG | X86_CR0_PE), %eax /* Enable Paging and Protected mode */ + movl $CR0_STATE, %eax movl %eax, %cr0 /* Jump from 32bit compatibility mode into 64bit mode. */ lret -ENDPROC(startup_32) +SYM_FUNC_END(startup_32) .code64 .org 0x200 -ENTRY(startup_64) +SYM_CODE_START(startup_64) /* * 64bit entry is 0x200 and it is ABI so immutable! * We come here either from startup_32 or directly from a @@ -199,46 +286,9 @@ ENTRY(startup_64) * that maps our entire kernel(text+data+bss+brk), zero page * and command line. */ -#ifdef CONFIG_EFI_STUB - /* - * The entry point for the PE/COFF executable is efi_pe_entry, so - * only legacy boot loaders will execute this jmp. - */ - jmp preferred_addr - -ENTRY(efi_pe_entry) - mov %rcx, %rdi - mov %rdx, %rsi - pushq %rdi - pushq %rsi - call make_boot_params - cmpq $0,%rax - je 1f - mov %rax, %rdx - popq %rsi - popq %rdi - -ENTRY(efi_stub_entry) - call efi_main - movq %rax,%rsi - cmpq $0,%rax - jne 2f -1: - /* EFI init failed, so hang. */ - hlt - jmp 1b -2: - call 3f -3: - popq %rax - subq $3b, %rax - subq BP_pref_address(%rsi), %rax - add BP_code32_start(%esi), %eax - leaq preferred_addr(%rax), %rax - jmp *%rax -preferred_addr: -#endif + cld + cli /* Setup data segments. 
*/ xorl %eax, %eax @@ -269,15 +319,94 @@ preferred_addr: addq %rax, %rbp notq %rax andq %rax, %rbp -#else - movq $LOAD_PHYSICAL_ADDR, %rbp + cmpq $LOAD_PHYSICAL_ADDR, %rbp + jae 1f #endif + movq $LOAD_PHYSICAL_ADDR, %rbp +1: /* Target address to relocate to for decompression */ - leaq z_extract_offset(%rbp), %rbx + movl BP_init_size(%rsi), %ebx + subl $ rva(_end), %ebx + addq %rbp, %rbx /* Set up the stack */ - leaq boot_stack_end(%rbx), %rsp + leaq rva(boot_stack_end)(%rbx), %rsp + + /* + * At this point we are in long mode with 4-level paging enabled, + * but we might want to enable 5-level paging or vice versa. + * + * The problem is that we cannot do it directly. Setting or clearing + * CR4.LA57 in long mode would trigger #GP. So we need to switch off + * long mode and paging first. + * + * We also need a trampoline in lower memory to switch over from + * 4- to 5-level paging for cases when the bootloader puts the kernel + * above 4G, but didn't enable 5-level paging for us. + * + * The same trampoline can be used to switch from 5- to 4-level paging + * mode, like when starting 4-level paging kernel via kexec() when + * original kernel worked in 5-level paging mode. + * + * For the trampoline, we need the top page table to reside in lower + * memory as we don't have a way to load 64-bit values into CR3 in + * 32-bit mode. + */ + + /* Make sure we have GDT with 32-bit code segment */ + leaq gdt64(%rip), %rax + addq %rax, 2(%rax) + lgdt (%rax) + + /* Reload CS so IRET returns to a CS actually in the GDT */ + pushq $__KERNEL_CS + leaq .Lon_kernel_cs(%rip), %rax + pushq %rax + lretq + +.Lon_kernel_cs: + /* + * RSI holds a pointer to a boot_params structure provided by the + * loader, and this needs to be preserved across C function calls. So + * move it into a callee saved register. + */ + movq %rsi, %r15 + + call load_stage1_idt + +#ifdef CONFIG_AMD_MEM_ENCRYPT + /* + * Now that the stage1 interrupt handlers are set up, #VC exceptions from + * CPUID instructions can be properly handled for SEV-ES guests. + * + * For SEV-SNP, the CPUID table also needs to be set up in advance of any + * CPUID instructions being issued, so go ahead and do that now via + * sev_enable(), which will also handle the rest of the SEV-related + * detection/setup to ensure that has been done in advance of any dependent + * code. Pass the boot_params pointer as the first argument. + */ + movq %r15, %rdi + call sev_enable +#endif + + /* Preserve only the CR4 bits that must be preserved, and clear the rest */ + movq %cr4, %rax + andl $(X86_CR4_PAE | X86_CR4_MCE | X86_CR4_LA57), %eax + movq %rax, %cr4 + + /* + * configure_5level_paging() updates the number of paging levels using + * a trampoline in 32-bit addressable memory if the current number does + * not match the desired number. + * + * Pass the boot_params pointer as the first argument. The second + * argument is the relocated address of the page table to use instead + * of the page table in trampoline memory (if required). + */ + movq %r15, %rdi + leaq rva(top_pgtable)(%rbx), %rsi + call configure_5level_paging /* Zero EFLAGS */ pushq $0 @@ -287,24 +416,33 @@ preferred_addr: * Copy the compressed kernel to the end of our buffer * where decompression in place becomes safe. 
*/ - pushq %rsi leaq (_bss-8)(%rip), %rsi - leaq (_bss-8)(%rbx), %rdi - movq $_bss /* - $startup_32 */, %rcx - shrq $3, %rcx + leaq rva(_bss-8)(%rbx), %rdi + movl $(_bss - startup_32), %ecx + shrl $3, %ecx std rep movsq cld - popq %rsi + + /* + * The GDT may get overwritten either during the copy we just did or + * during extract_kernel below. To avoid any issues, repoint the GDTR + * to the new copy of the GDT. + */ + leaq rva(gdt64)(%rbx), %rax + leaq rva(gdt)(%rbx), %rdx + movq %rdx, 2(%rax) + lgdt (%rax) /* * Jump to the relocated address. */ - leaq relocated(%rbx), %rax + leaq rva(.Lrelocated)(%rbx), %rax jmp *%rax +SYM_CODE_END(startup_64) .text -relocated: +SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) /* * Clear BSS (stack is currently empty) @@ -316,72 +454,86 @@ relocated: shrq $3, %rcx rep stosq + call load_stage2_idt + + /* Pass boot_params to initialize_identity_maps() */ + movq %r15, %rdi + call initialize_identity_maps + /* - * Adjust our own GOT - */ - leaq _got(%rip), %rdx - leaq _egot(%rip), %rcx -1: - cmpq %rcx, %rdx - jae 2f - addq %rbx, (%rdx) - addq $8, %rdx - jmp 1b -2: - -/* - * Do the decompression, and jump to the new kernel.. + * Do the extraction, and jump to the new kernel.. */ - pushq %rsi /* Save the real mode argument */ - movq %rsi, %rdi /* real mode address */ - leaq boot_heap(%rip), %rsi /* malloc area for uncompression */ - leaq input_data(%rip), %rdx /* input_data */ - movl $z_input_len, %ecx /* input_len */ - movq %rbp, %r8 /* output target address */ - call decompress_kernel - popq %rsi + /* pass struct boot_params pointer and output target address */ + movq %r15, %rdi + movq %rbp, %rsi + call extract_kernel /* returns kernel entry point in %rax */ /* * Jump to the decompressed kernel. */ - jmp *%rbp + movq %r15, %rsi + jmp *%rax +SYM_FUNC_END(.Lrelocated) .code32 -no_longmode: - /* This isn't an x86-64 CPU so hang */ +SYM_FUNC_START_LOCAL_NOALIGN(.Lno_longmode) + /* This isn't an x86-64 CPU, so hang intentionally, we cannot continue */ 1: hlt jmp 1b +SYM_FUNC_END(.Lno_longmode) + .globl verify_cpu #include "../../kernel/verify_cpu.S" .data -gdt: - .word gdt_end - gdt - .long gdt +SYM_DATA_START_LOCAL(gdt64) + .word gdt_end - gdt - 1 + .quad gdt - gdt64 +SYM_DATA_END(gdt64) + .balign 8 +SYM_DATA_START_LOCAL(gdt) + .word gdt_end - gdt - 1 + .long 0 .word 0 - .quad 0x0000000000000000 /* NULL descriptor */ + .quad 0x00cf9a000000ffff /* __KERNEL32_CS */ .quad 0x00af9a000000ffff /* __KERNEL_CS */ .quad 0x00cf92000000ffff /* __KERNEL_DS */ .quad 0x0080890000000000 /* TS descriptor */ .quad 0x0000000000000000 /* TS continued */ -gdt_end: +SYM_DATA_END_LABEL(gdt, SYM_L_LOCAL, gdt_end) + +SYM_DATA_START(boot_idt_desc) + .word boot_idt_end - boot_idt - 1 + .quad 0 +SYM_DATA_END(boot_idt_desc) + .balign 8 +SYM_DATA_START(boot_idt) + .rept BOOT_IDT_ENTRIES + .quad 0 + .quad 0 + .endr +SYM_DATA_END_LABEL(boot_idt, SYM_L_GLOBAL, boot_idt_end) /* * Stack and heap for uncompression */ .bss .balign 4 -boot_heap: - .fill BOOT_HEAP_SIZE, 1, 0 -boot_stack: +SYM_DATA_START_LOCAL(boot_stack) .fill BOOT_STACK_SIZE, 1, 0 -boot_stack_end: + .balign 16 +SYM_DATA_END_LABEL(boot_stack, SYM_L_LOCAL, boot_stack_end) /* * Space for page tables (not in .bss so not zeroed) */ - .section ".pgtable","a",@nobits + .section ".pgtable","aw",@nobits .balign 4096 -pgtable: - .fill 6*4096, 1, 0 +SYM_DATA_LOCAL(pgtable, .fill BOOT_PGT_SIZE, 1, 0) + +/* + * The page table is going to be used instead of page table in the trampoline + * memory. 
+ */ +SYM_DATA_LOCAL(top_pgtable, .fill PAGE_SIZE, 1, 0) diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c new file mode 100644 index 000000000000..dfb9c2deb77c --- /dev/null +++ b/arch/x86/boot/compressed/ident_map_64.c @@ -0,0 +1,393 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This code is used on x86_64 to create page table identity mappings on + * demand by building up a new set of page tables (or appending to the + * existing ones), and then switching over to them when ready. + * + * Copyright (C) 2015-2016 Yinghai Lu + * Copyright (C) 2016 Kees Cook + */ + +/* No MITIGATION_PAGE_TABLE_ISOLATION support needed either: */ +#undef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION + +#include "error.h" +#include "misc.h" + +/* These actually do the work of building the kernel identity maps. */ +#include <linux/pgtable.h> +#include <asm/cmpxchg.h> +#include <asm/trap_pf.h> +#include <asm/trapnr.h> +#include <asm/init.h> +/* Use the static base for this part of the boot process */ +#undef __PAGE_OFFSET +#define __PAGE_OFFSET __PAGE_OFFSET_BASE +#include "../../mm/ident_map.c" + +#define _SETUP +#include <asm/setup.h> /* For COMMAND_LINE_SIZE */ +#undef _SETUP + +extern unsigned long get_cmd_line_ptr(void); + +/* Used by PAGE_KERN* macros: */ +pteval_t __default_kernel_pte_mask __read_mostly = ~0; + +/* Used to track our page table allocation area. */ +struct alloc_pgt_data { + unsigned char *pgt_buf; + unsigned long pgt_buf_size; + unsigned long pgt_buf_offset; +}; + +/* + * Allocates space for a page table entry, using struct alloc_pgt_data + * above. Besides the local callers, this is used as the allocation + * callback in mapping_info below. + */ +static void *alloc_pgt_page(void *context) +{ + struct alloc_pgt_data *pages = (struct alloc_pgt_data *)context; + unsigned char *entry; + + /* Validate there is space available for a new page. */ + if (pages->pgt_buf_offset >= pages->pgt_buf_size) { + debug_putstr("out of pgt_buf in " __FILE__ "!?\n"); + debug_putaddr(pages->pgt_buf_offset); + debug_putaddr(pages->pgt_buf_size); + return NULL; + } + + /* Consumed more tables than expected? */ + if (pages->pgt_buf_offset == BOOT_PGT_SIZE_WARN) { + debug_putstr("pgt_buf running low in " __FILE__ "\n"); + debug_putstr("Need to raise BOOT_PGT_SIZE?\n"); + debug_putaddr(pages->pgt_buf_offset); + debug_putaddr(pages->pgt_buf_size); + } + + entry = pages->pgt_buf + pages->pgt_buf_offset; + pages->pgt_buf_offset += PAGE_SIZE; + + return entry; +} + +/* Used to track our allocated page tables. */ +static struct alloc_pgt_data pgt_data; + +/* The top level page table entry pointer. */ +static unsigned long top_level_pgt; + +phys_addr_t physical_mask = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; + +/* + * Mapping information structure passed to kernel_ident_mapping_init(). + * Due to relocation, pointers must be assigned at run time not build time. + */ +static struct x86_mapping_info mapping_info; + +/* + * Adds the specified range to the identity mappings. + */ +void kernel_add_identity_map(unsigned long start, unsigned long end) +{ + int ret; + + /* Align boundary to 2M. */ + start = round_down(start, PMD_SIZE); + end = round_up(end, PMD_SIZE); + if (start >= end) + return; + + /* Build the mapping. */ + ret = kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt, start, end); + if (ret) + error("Error: kernel_ident_mapping_init() failed\n"); +} + +/* Locates and clears a region for a new top level page table. 
*/ +void initialize_identity_maps(void *rmode) +{ + unsigned long cmdline; + struct setup_data *sd; + + /* Exclude the encryption mask from __PHYSICAL_MASK */ + physical_mask &= ~sme_me_mask; + + /* Init mapping_info with run-time function/buffer pointers. */ + mapping_info.alloc_pgt_page = alloc_pgt_page; + mapping_info.context = &pgt_data; + mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask; + mapping_info.kernpg_flag = _KERNPG_TABLE; + + /* + * It should be impossible for this not to already be true, + * but since calling this a second time would rewind the other + * counters, let's just make sure this is reset too. + */ + pgt_data.pgt_buf_offset = 0; + + /* + * If we came here via startup_32(), cr3 will be _pgtable already + * and we must append to the existing area instead of entirely + * overwriting it. + * + * With 5-level paging, we use '_pgtable' to allocate the p4d page table, + * the top-level page table is allocated separately. + * + * p4d_offset(top_level_pgt, 0) would cover both the 4- and 5-level + * cases. On 4-level paging it's equal to 'top_level_pgt'. + */ + top_level_pgt = read_cr3_pa(); + if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) { + pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE; + pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE; + memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); + } else { + pgt_data.pgt_buf = _pgtable; + pgt_data.pgt_buf_size = BOOT_PGT_SIZE; + memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); + top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data); + } + + /* + * New page-table is set up - map the kernel image, boot_params and the + * command line. The uncompressed kernel requires boot_params and the + * command line to be mapped in the identity mapping. Map them + * explicitly here in case the compressed kernel does not touch them, + * or does not touch all the pages covering them. + */ + kernel_add_identity_map((unsigned long)_head, (unsigned long)_end); + boot_params_ptr = rmode; + kernel_add_identity_map((unsigned long)boot_params_ptr, + (unsigned long)(boot_params_ptr + 1)); + cmdline = get_cmd_line_ptr(); + kernel_add_identity_map(cmdline, cmdline + COMMAND_LINE_SIZE); + + /* + * Also map the setup_data entries passed via boot_params in case they + * need to be accessed by uncompressed kernel via the identity mapping. + */ + sd = (struct setup_data *)boot_params_ptr->hdr.setup_data; + while (sd) { + unsigned long sd_addr = (unsigned long)sd; + + kernel_add_identity_map(sd_addr, sd_addr + sizeof(*sd) + sd->len); + sd = (struct setup_data *)sd->next; + } + + sev_prep_identity_maps(top_level_pgt); + + /* Load the new page-table. */ + write_cr3(top_level_pgt); + + /* + * Now that the required page table mappings are established and a + * GHCB can be used, check for SNP guest/HV feature compatibility. + */ + snp_check_features(); +} + +static pte_t *split_large_pmd(struct x86_mapping_info *info, + pmd_t *pmdp, unsigned long __address) +{ + unsigned long page_flags; + unsigned long address; + pte_t *pte; + pmd_t pmd; + int i; + + pte = (pte_t *)info->alloc_pgt_page(info->context); + if (!pte) + return NULL; + + address = __address & PMD_MASK; + /* No large page - clear PSE flag */ + page_flags = info->page_flag & ~_PAGE_PSE; + + /* Populate the PTEs */ + for (i = 0; i < PTRS_PER_PMD; i++) { + set_pte(&pte[i], __pte(address | page_flags)); + address += PAGE_SIZE; + } + + /* + * Ideally we need to clear the large PMD first and do a TLB + * flush before we write the new PMD. 
But the 2M range of the + * PMD might contain the code we execute and/or the stack + * we are on, so we can't do that. But that should be safe here + * because we are going from large to small mappings and we are + * also the only user of the page-table, so there is no chance + * of a TLB multihit. + */ + pmd = __pmd((unsigned long)pte | info->kernpg_flag); + set_pmd(pmdp, pmd); + /* Flush TLB to establish the new PMD */ + write_cr3(top_level_pgt); + + return pte + pte_index(__address); +} + +static void clflush_page(unsigned long address) +{ + unsigned int flush_size; + char *cl, *start, *end; + + /* + * Hardcode cl-size to 64 - CPUID can't be used here because that might + * cause another #VC exception and the GHCB is not ready to use yet. + */ + flush_size = 64; + start = (char *)(address & PAGE_MASK); + end = start + PAGE_SIZE; + + /* + * First make sure there are no pending writes on the cache-lines to + * flush. + */ + asm volatile("mfence" : : : "memory"); + + for (cl = start; cl != end; cl += flush_size) + clflush(cl); +} + +static int set_clr_page_flags(struct x86_mapping_info *info, + unsigned long address, + pteval_t set, pteval_t clr) +{ + pgd_t *pgdp = (pgd_t *)top_level_pgt; + p4d_t *p4dp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep, pte; + + /* + * First make sure there is a PMD mapping for 'address'. + * It should already exist, but keep things generic. + * + * To map the page just read from it and fault it in if there is no + * mapping yet. kernel_add_identity_map() can't be called here because + * that would unconditionally map the address on PMD level, destroying + * any PTE-level mappings that might already exist. Use assembly here + * so the access won't be optimized away. + */ + asm volatile("mov %[address], %%r9" + :: [address] "g" (*(unsigned long *)address) + : "r9", "memory"); + + /* + * The page is mapped at least with PMD size - so skip checks and walk + * directly to the PMD. + */ + p4dp = p4d_offset(pgdp, address); + pudp = pud_offset(p4dp, address); + pmdp = pmd_offset(pudp, address); + + if (pmd_leaf(*pmdp)) + ptep = split_large_pmd(info, pmdp, address); + else + ptep = pte_offset_kernel(pmdp, address); + + if (!ptep) + return -ENOMEM; + + /* + * Changing encryption attributes of a page requires to flush it from + * the caches. + */ + if ((set | clr) & _PAGE_ENC) { + clflush_page(address); + + /* + * If the encryption attribute is being cleared, change the page state + * to shared in the RMP table. + */ + if (clr) + snp_set_page_shared(__pa(address & PAGE_MASK)); + } + + /* Update PTE */ + pte = *ptep; + pte = pte_set_flags(pte, set); + pte = pte_clear_flags(pte, clr); + set_pte(ptep, pte); + + /* + * If the encryption attribute is being set, then change the page state to + * private in the RMP entry. The page state change must be done after the PTE + * is updated. 
+ */ + if (set & _PAGE_ENC) + snp_set_page_private(__pa(address & PAGE_MASK)); + + /* Flush TLB after changing encryption attribute */ + write_cr3(top_level_pgt); + + return 0; +} + +int set_page_decrypted(unsigned long address) +{ + return set_clr_page_flags(&mapping_info, address, 0, _PAGE_ENC); +} + +int set_page_encrypted(unsigned long address) +{ + return set_clr_page_flags(&mapping_info, address, _PAGE_ENC, 0); +} + +int set_page_non_present(unsigned long address) +{ + return set_clr_page_flags(&mapping_info, address, 0, _PAGE_PRESENT); +} + +static void do_pf_error(const char *msg, unsigned long error_code, + unsigned long address, unsigned long ip) +{ + error_putstr(msg); + + error_putstr("\nError Code: "); + error_puthex(error_code); + error_putstr("\nCR2: 0x"); + error_puthex(address); + error_putstr("\nRIP relative to _head: 0x"); + error_puthex(ip - (unsigned long)_head); + error_putstr("\n"); + + error("Stopping.\n"); +} + +void do_boot_page_fault(struct pt_regs *regs, unsigned long error_code) +{ + unsigned long address = native_read_cr2(); + unsigned long end; + bool ghcb_fault; + + ghcb_fault = sev_es_check_ghcb_fault(address); + + address &= PMD_MASK; + end = address + PMD_SIZE; + + /* + * Check for unexpected error codes. Unexpected are: + * - Faults on present pages + * - User faults + * - Reserved bits set + */ + if (error_code & (X86_PF_PROT | X86_PF_USER | X86_PF_RSVD)) + do_pf_error("Unexpected page-fault:", error_code, address, regs->ip); + else if (ghcb_fault) + do_pf_error("Page-fault on GHCB page:", error_code, address, regs->ip); + + /* + * Error code is sane - now identity map the 2M region around + * the faulting address. + */ + kernel_add_identity_map(address, end); +} + +void do_boot_nmi_trap(struct pt_regs *regs, unsigned long error_code) +{ + spurious_nmi_count++; +} diff --git a/arch/x86/boot/compressed/idt_64.c b/arch/x86/boot/compressed/idt_64.c new file mode 100644 index 000000000000..d100284bbef4 --- /dev/null +++ b/arch/x86/boot/compressed/idt_64.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <asm/trap_pf.h> +#include <asm/segment.h> +#include <asm/trapnr.h> +#include "misc.h" + +static void set_idt_entry(int vector, void (*handler)(void)) +{ + unsigned long address = (unsigned long)handler; + gate_desc entry; + + memset(&entry, 0, sizeof(entry)); + + entry.offset_low = (u16)(address & 0xffff); + entry.segment = __KERNEL_CS; + entry.bits.type = GATE_TRAP; + entry.bits.p = 1; + entry.offset_middle = (u16)((address >> 16) & 0xffff); + entry.offset_high = (u32)(address >> 32); + + memcpy(&boot_idt[vector], &entry, sizeof(entry)); +} + +/* Have this here so we don't need to include <asm/desc.h> */ +static void load_boot_idt(const struct desc_ptr *dtr) +{ + asm volatile("lidt %0"::"m" (*dtr)); +} + +/* Setup IDT before kernel jumping to .Lrelocated */ +void load_stage1_idt(void) +{ + boot_idt_desc.address = (unsigned long)boot_idt; + + + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) + set_idt_entry(X86_TRAP_VC, boot_stage1_vc); + + load_boot_idt(&boot_idt_desc); +} + +/* + * Setup IDT after kernel jumping to .Lrelocated. + * + * initialize_identity_maps() needs a #PF handler to be setup + * in order to be able to fault-in identity mapping ranges; see + * do_boot_page_fault(). + * + * This #PF handler setup needs to happen in load_stage2_idt() where the + * IDT is loaded and there the #VC IDT entry gets setup too. 
+ * + * In order to be able to handle #VCs, one needs a GHCB which + * gets setup with an already set up pagetable, which is done in + * initialize_identity_maps(). And there's the catch 22: the boot #VC + * handler do_boot_stage2_vc() needs to call early_setup_ghcb() itself + * (and, especially set_page_decrypted()) because the SEV-ES setup code + * cannot initialize a GHCB as there's no #PF handler yet... + */ +void load_stage2_idt(void) +{ + boot_idt_desc.address = (unsigned long)boot_idt; + + set_idt_entry(X86_TRAP_PF, boot_page_fault); + set_idt_entry(X86_TRAP_NMI, boot_nmi_trap); + +#ifdef CONFIG_AMD_MEM_ENCRYPT + /* + * Clear the second stage #VC handler in case guest types + * needing #VC have not been detected. + */ + if (sev_status & BIT(1)) + set_idt_entry(X86_TRAP_VC, boot_stage2_vc); + else + set_idt_entry(X86_TRAP_VC, NULL); +#endif + + load_boot_idt(&boot_idt_desc); +} + +void cleanup_exception_handling(void) +{ + /* + * Flush GHCB from cache and map it encrypted again when running as + * SEV-ES guest. + */ + sev_es_shutdown_ghcb(); + + /* Set a null-idt, disabling #PF and #VC handling */ + boot_idt_desc.size = 0; + boot_idt_desc.address = 0; + load_boot_idt(&boot_idt_desc); +} diff --git a/arch/x86/boot/compressed/idt_handlers_64.S b/arch/x86/boot/compressed/idt_handlers_64.S new file mode 100644 index 000000000000..4d03c8562f63 --- /dev/null +++ b/arch/x86/boot/compressed/idt_handlers_64.S @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Early IDT handler entry points + * + * Copyright (C) 2019 SUSE + * + * Author: Joerg Roedel <jroedel@suse.de> + */ + +#include <asm/segment.h> + +/* For ORIG_RAX */ +#include "../../entry/calling.h" + +.macro EXCEPTION_HANDLER name function error_code=0 +SYM_FUNC_START(\name) + + /* Build pt_regs */ + .if \error_code == 0 + pushq $0 + .endif + + pushq %rdi + pushq %rsi + pushq %rdx + pushq %rcx + pushq %rax + pushq %r8 + pushq %r9 + pushq %r10 + pushq %r11 + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + /* Call handler with pt_regs */ + movq %rsp, %rdi + /* Error code is second parameter */ + movq ORIG_RAX(%rsp), %rsi + call \function + + /* Restore regs */ + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + popq %r11 + popq %r10 + popq %r9 + popq %r8 + popq %rax + popq %rcx + popq %rdx + popq %rsi + popq %rdi + + /* Remove error code and return */ + addq $8, %rsp + + iretq +SYM_FUNC_END(\name) + .endm + + .text + .code64 + +EXCEPTION_HANDLER boot_page_fault do_boot_page_fault error_code=1 +EXCEPTION_HANDLER boot_nmi_trap do_boot_nmi_trap error_code=0 + +#ifdef CONFIG_AMD_MEM_ENCRYPT +EXCEPTION_HANDLER boot_stage1_vc do_vc_no_ghcb error_code=1 +EXCEPTION_HANDLER boot_stage2_vc do_boot_stage2_vc error_code=1 +#endif diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c new file mode 100644 index 000000000000..3b0948ad449f --- /dev/null +++ b/arch/x86/boot/compressed/kaslr.c @@ -0,0 +1,908 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * kaslr.c + * + * This contains the routines needed to generate a reasonable level of + * entropy to choose a randomized kernel base address offset in support + * of Kernel Address Space Layout Randomization (KASLR). Additionally + * handles walking the physical memory maps (and tracking memory regions + * to avoid) in order to select a physical memory location that can + * contain the entire properly aligned running kernel image. 
+ * + */ + +/* + * isspace() in linux/ctype.h is expected by next_args() to filter + * out "space/lf/tab". While boot/ctype.h conflicts with linux/ctype.h, + * since isdigit() is implemented in both of them. Hence disable it + * here. + */ +#define BOOT_CTYPE_H + +#include "misc.h" +#include "error.h" +#include "../string.h" +#include "efi.h" + +#include <generated/compile.h> +#include <generated/utsversion.h> +#include <generated/utsrelease.h> + +#define _SETUP +#include <asm/setup.h> /* For COMMAND_LINE_SIZE */ +#undef _SETUP + +extern unsigned long get_cmd_line_ptr(void); + +/* Simplified build-specific string for starting entropy. */ +static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" + LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; + +static unsigned long rotate_xor(unsigned long hash, const void *area, + size_t size) +{ + size_t i; + unsigned long *ptr = (unsigned long *)area; + + for (i = 0; i < size / sizeof(hash); i++) { + /* Rotate by odd number of bits and XOR. */ + hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7); + hash ^= ptr[i]; + } + + return hash; +} + +/* Attempt to create a simple but unpredictable starting entropy. */ +static unsigned long get_boot_seed(void) +{ + unsigned long hash = 0; + + hash = rotate_xor(hash, build_str, sizeof(build_str)); + hash = rotate_xor(hash, boot_params_ptr, sizeof(*boot_params_ptr)); + + return hash; +} + +#define KASLR_COMPRESSED_BOOT +#include "../../lib/kaslr.c" + + +/* Only supporting at most 4 unusable memmap regions with kaslr */ +#define MAX_MEMMAP_REGIONS 4 + +static bool memmap_too_large; + + +/* + * Store memory limit: MAXMEM on 64-bit and KERNEL_IMAGE_SIZE on 32-bit. + * It may be reduced by "mem=nn[KMG]" or "memmap=nn[KMG]" command line options. + */ +static u64 mem_limit; + +/* Number of immovable memory regions */ +static int num_immovable_mem; + +enum mem_avoid_index { + MEM_AVOID_ZO_RANGE = 0, + MEM_AVOID_INITRD, + MEM_AVOID_CMDLINE, + MEM_AVOID_BOOTPARAMS, + MEM_AVOID_MEMMAP_BEGIN, + MEM_AVOID_MEMMAP_END = MEM_AVOID_MEMMAP_BEGIN + MAX_MEMMAP_REGIONS - 1, + MEM_AVOID_MAX, +}; + +static struct mem_vector mem_avoid[MEM_AVOID_MAX]; + +static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two) +{ + /* Item one is entirely before item two. */ + if (one->start + one->size <= two->start) + return false; + /* Item one is entirely after item two. */ + if (one->start >= two->start + two->size) + return false; + return true; +} + +char *skip_spaces(const char *str) +{ + while (isspace(*str)) + ++str; + return (char *)str; +} +#include "../../../../lib/ctype.c" +#include "../../../../lib/cmdline.c" + +static int +parse_memmap(char *p, u64 *start, u64 *size) +{ + char *oldp; + + if (!p) + return -EINVAL; + + /* We don't care about this option here */ + if (!strncmp(p, "exactmap", 8)) + return -EINVAL; + + oldp = p; + *size = memparse(p, &p); + if (p == oldp) + return -EINVAL; + + switch (*p) { + case '#': + case '$': + case '!': + *start = memparse(p + 1, &p); + return 0; + case '@': + /* + * memmap=nn@ss specifies usable region, should + * be skipped + */ + *size = 0; + fallthrough; + default: + /* + * If w/o offset, only size specified, memmap=nn[KMG] has the + * same behaviour as mem=nn[KMG]. It limits the max address + * system can use. Region above the limit should be avoided. 
+ */ + *start = 0; + return 0; + } + + return -EINVAL; +} + +static void mem_avoid_memmap(char *str) +{ + static int i; + + if (i >= MAX_MEMMAP_REGIONS) + return; + + while (str && (i < MAX_MEMMAP_REGIONS)) { + int rc; + u64 start, size; + char *k = strchr(str, ','); + + if (k) + *k++ = 0; + + rc = parse_memmap(str, &start, &size); + if (rc < 0) + break; + str = k; + + if (start == 0) { + /* Store the specified memory limit if size > 0 */ + if (size > 0 && size < mem_limit) + mem_limit = size; + + continue; + } + + mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].start = start; + mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].size = size; + i++; + } + + /* More than 4 memmaps, fail kaslr */ + if ((i >= MAX_MEMMAP_REGIONS) && str) + memmap_too_large = true; +} + +/* Store the number of 1GB huge pages which users specified: */ +static unsigned long max_gb_huge_pages; + +static void parse_gb_huge_pages(char *param, char *val) +{ + static bool gbpage_sz; + char *p; + + if (!strcmp(param, "hugepagesz")) { + p = val; + if (memparse(p, &p) != PUD_SIZE) { + gbpage_sz = false; + return; + } + + if (gbpage_sz) + warn("Repeatedly set hugeTLB page size of 1G!\n"); + gbpage_sz = true; + return; + } + + if (!strcmp(param, "hugepages") && gbpage_sz) { + p = val; + max_gb_huge_pages = simple_strtoull(p, &p, 0); + return; + } +} + +static void handle_mem_options(void) +{ + char *args = (char *)get_cmd_line_ptr(); + size_t len; + char *tmp_cmdline; + char *param, *val; + u64 mem_size; + + if (!args) + return; + + len = strnlen(args, COMMAND_LINE_SIZE-1); + tmp_cmdline = malloc(len + 1); + if (!tmp_cmdline) + error("Failed to allocate space for tmp_cmdline"); + + memcpy(tmp_cmdline, args, len); + tmp_cmdline[len] = 0; + args = tmp_cmdline; + + /* Chew leading spaces */ + args = skip_spaces(args); + + while (*args) { + args = next_arg(args, ¶m, &val); + /* Stop at -- */ + if (!val && strcmp(param, "--") == 0) + break; + + if (!strcmp(param, "memmap")) { + mem_avoid_memmap(val); + } else if (IS_ENABLED(CONFIG_X86_64) && strstr(param, "hugepages")) { + parse_gb_huge_pages(param, val); + } else if (!strcmp(param, "mem")) { + char *p = val; + + if (!strcmp(p, "nopentium")) + continue; + mem_size = memparse(p, &p); + if (mem_size == 0) + break; + + if (mem_size < mem_limit) + mem_limit = mem_size; + } + } + + free(tmp_cmdline); + return; +} + +/* + * In theory, KASLR can put the kernel anywhere in the range of [16M, MAXMEM) + * on 64-bit, and [16M, KERNEL_IMAGE_SIZE) on 32-bit. + * + * The mem_avoid array is used to store the ranges that need to be avoided + * when KASLR searches for an appropriate random address. We must avoid any + * regions that are unsafe to overlap with during decompression, and other + * things like the initrd, cmdline and boot_params. This comment seeks to + * explain mem_avoid as clearly as possible since incorrect mem_avoid + * memory ranges lead to really hard to debug boot failures. + * + * The initrd, cmdline, and boot_params are trivial to identify for + * avoiding. They are MEM_AVOID_INITRD, MEM_AVOID_CMDLINE, and + * MEM_AVOID_BOOTPARAMS respectively below. + * + * What is not obvious how to avoid is the range of memory that is used + * during decompression (MEM_AVOID_ZO_RANGE below). This range must cover + * the compressed kernel (ZO) and its run space, which is used to extract + * the uncompressed kernel (VO) and relocs. + * + * ZO's full run size sits against the end of the decompression buffer, so + * we can calculate where text, data, bss, etc of ZO are positioned more + * easily. 
+ * + * For additional background, the decompression calculations can be found + * in header.S, and the memory diagram is based on the one found in misc.c. + * + * The following conditions are already enforced by the image layouts and + * associated code: + * - input + input_size >= output + output_size + * - kernel_total_size <= init_size + * - kernel_total_size <= output_size (see Note below) + * - output + init_size >= output + output_size + * + * (Note that kernel_total_size and output_size have no fundamental + * relationship, but output_size is passed to choose_random_location + * as a maximum of the two. The diagram is showing a case where + * kernel_total_size is larger than output_size, but this case is + * handled by bumping output_size.) + * + * The above conditions can be illustrated by a diagram: + * + * 0 output input input+input_size output+init_size + * | | | | | + * | | | | | + * |-----|--------|--------|--------------|-----------|--|-------------| + * | | | + * | | | + * output+init_size-ZO_INIT_SIZE output+output_size output+kernel_total_size + * + * [output, output+init_size) is the entire memory range used for + * extracting the compressed image. + * + * [output, output+kernel_total_size) is the range needed for the + * uncompressed kernel (VO) and its run size (bss, brk, etc). + * + * [output, output+output_size) is VO plus relocs (i.e. the entire + * uncompressed payload contained by ZO). This is the area of the buffer + * written to during decompression. + * + * [output+init_size-ZO_INIT_SIZE, output+init_size) is the worst-case + * range of the copied ZO and decompression code. (i.e. the range + * covered backwards of size ZO_INIT_SIZE, starting from output+init_size.) + * + * [input, input+input_size) is the original copied compressed image (ZO) + * (i.e. it does not include its run size). This range must be avoided + * because it contains the data used for decompression. + * + * [input+input_size, output+init_size) is [_text, _end) for ZO. This + * range includes ZO's heap and stack, and must be avoided since it + * performs the decompression. + * + * Since the above two ranges need to be avoided and they are adjacent, + * they can be merged, resulting in: [input, output+init_size) which + * becomes the MEM_AVOID_ZO_RANGE below. + */ +static void mem_avoid_init(unsigned long input, unsigned long input_size, + unsigned long output) +{ + unsigned long init_size = boot_params_ptr->hdr.init_size; + u64 initrd_start, initrd_size; + unsigned long cmd_line, cmd_line_size; + + /* + * Avoid the region that is unsafe to overlap during + * decompression. + */ + mem_avoid[MEM_AVOID_ZO_RANGE].start = input; + mem_avoid[MEM_AVOID_ZO_RANGE].size = (output + init_size) - input; + + /* Avoid initrd. */ + initrd_start = (u64)boot_params_ptr->ext_ramdisk_image << 32; + initrd_start |= boot_params_ptr->hdr.ramdisk_image; + initrd_size = (u64)boot_params_ptr->ext_ramdisk_size << 32; + initrd_size |= boot_params_ptr->hdr.ramdisk_size; + mem_avoid[MEM_AVOID_INITRD].start = initrd_start; + mem_avoid[MEM_AVOID_INITRD].size = initrd_size; + /* No need to set mapping for initrd, it will be handled in VO. */ + + /* Avoid kernel command line. */ + cmd_line = get_cmd_line_ptr(); + /* Calculate size of cmd_line. */ + if (cmd_line) { + cmd_line_size = strnlen((char *)cmd_line, COMMAND_LINE_SIZE-1) + 1; + mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line; + mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size; + } + + /* Avoid boot parameters. 
*/ + mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params_ptr; + mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params_ptr); + + /* We don't need to set a mapping for setup_data. */ + + /* Mark the memmap regions we need to avoid */ + handle_mem_options(); + + /* Enumerate the immovable memory regions */ + num_immovable_mem = count_immovable_mem_regions(); +} + +/* + * Does this memory vector overlap a known avoided area? If so, record the + * overlap region with the lowest address. + */ +static bool mem_avoid_overlap(struct mem_vector *img, + struct mem_vector *overlap) +{ + int i; + struct setup_data *ptr; + u64 earliest = img->start + img->size; + bool is_overlapping = false; + + for (i = 0; i < MEM_AVOID_MAX; i++) { + if (mem_overlaps(img, &mem_avoid[i]) && + mem_avoid[i].start < earliest) { + *overlap = mem_avoid[i]; + earliest = overlap->start; + is_overlapping = true; + } + } + + /* Avoid all entries in the setup_data linked list. */ + ptr = (struct setup_data *)(unsigned long)boot_params_ptr->hdr.setup_data; + while (ptr) { + struct mem_vector avoid; + + avoid.start = (unsigned long)ptr; + avoid.size = sizeof(*ptr) + ptr->len; + + if (mem_overlaps(img, &avoid) && (avoid.start < earliest)) { + *overlap = avoid; + earliest = overlap->start; + is_overlapping = true; + } + + if (ptr->type == SETUP_INDIRECT && + ((struct setup_indirect *)ptr->data)->type != SETUP_INDIRECT) { + avoid.start = ((struct setup_indirect *)ptr->data)->addr; + avoid.size = ((struct setup_indirect *)ptr->data)->len; + + if (mem_overlaps(img, &avoid) && (avoid.start < earliest)) { + *overlap = avoid; + earliest = overlap->start; + is_overlapping = true; + } + } + + ptr = (struct setup_data *)(unsigned long)ptr->next; + } + + return is_overlapping; +} + +struct slot_area { + u64 addr; + unsigned long num; +}; + +#define MAX_SLOT_AREA 100 + +static struct slot_area slot_areas[MAX_SLOT_AREA]; +static unsigned int slot_area_index; +static unsigned long slot_max; + +static void store_slot_info(struct mem_vector *region, unsigned long image_size) +{ + struct slot_area slot_area; + + if (slot_area_index == MAX_SLOT_AREA) + return; + + slot_area.addr = region->start; + slot_area.num = 1 + (region->size - image_size) / CONFIG_PHYSICAL_ALIGN; + + slot_areas[slot_area_index++] = slot_area; + slot_max += slot_area.num; +} + +/* + * Skip as many 1GB huge pages as possible in the passed region + * according to the number which users specified: + */ +static void +process_gb_huge_pages(struct mem_vector *region, unsigned long image_size) +{ + u64 pud_start, pud_end; + unsigned long gb_huge_pages; + struct mem_vector tmp; + + if (!IS_ENABLED(CONFIG_X86_64) || !max_gb_huge_pages) { + store_slot_info(region, image_size); + return; + } + + /* Are there any 1GB pages in the region? */ + pud_start = ALIGN(region->start, PUD_SIZE); + pud_end = ALIGN_DOWN(region->start + region->size, PUD_SIZE); + + /* No good 1GB huge pages found: */ + if (pud_start >= pud_end) { + store_slot_info(region, image_size); + return; + } + + /* Check if the head part of the region is usable. */ + if (pud_start >= region->start + image_size) { + tmp.start = region->start; + tmp.size = pud_start - region->start; + store_slot_info(&tmp, image_size); + } + + /* Skip the good 1GB pages. 
*/ + gb_huge_pages = (pud_end - pud_start) >> PUD_SHIFT; + if (gb_huge_pages > max_gb_huge_pages) { + pud_end = pud_start + (max_gb_huge_pages << PUD_SHIFT); + max_gb_huge_pages = 0; + } else { + max_gb_huge_pages -= gb_huge_pages; + } + + /* Check if the tail part of the region is usable. */ + if (region->start + region->size >= pud_end + image_size) { + tmp.start = pud_end; + tmp.size = region->start + region->size - pud_end; + store_slot_info(&tmp, image_size); + } +} + +static u64 slots_fetch_random(void) +{ + unsigned long slot; + unsigned int i; + + /* Handle case of no slots stored. */ + if (slot_max == 0) + return 0; + + slot = kaslr_get_random_long("Physical") % slot_max; + + for (i = 0; i < slot_area_index; i++) { + if (slot >= slot_areas[i].num) { + slot -= slot_areas[i].num; + continue; + } + return slot_areas[i].addr + ((u64)slot * CONFIG_PHYSICAL_ALIGN); + } + + if (i == slot_area_index) + debug_putstr("slots_fetch_random() failed!?\n"); + return 0; +} + +static void __process_mem_region(struct mem_vector *entry, + unsigned long minimum, + unsigned long image_size) +{ + struct mem_vector region, overlap; + u64 region_end; + + /* Enforce minimum and memory limit. */ + region.start = max_t(u64, entry->start, minimum); + region_end = min(entry->start + entry->size, mem_limit); + + /* Give up if slot area array is full. */ + while (slot_area_index < MAX_SLOT_AREA) { + /* Potentially raise address to meet alignment needs. */ + region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN); + + /* Did we raise the address above the passed in memory entry? */ + if (region.start > region_end) + return; + + /* Reduce size by any delta from the original address. */ + region.size = region_end - region.start; + + /* Return if region can't contain decompressed kernel */ + if (region.size < image_size) + return; + + /* If nothing overlaps, store the region and return. */ + if (!mem_avoid_overlap(®ion, &overlap)) { + process_gb_huge_pages(®ion, image_size); + return; + } + + /* Store beginning of region if holds at least image_size. */ + if (overlap.start >= region.start + image_size) { + region.size = overlap.start - region.start; + process_gb_huge_pages(®ion, image_size); + } + + /* Clip off the overlapping region and start over. */ + region.start = overlap.start + overlap.size; + } +} + +static bool process_mem_region(struct mem_vector *region, + unsigned long minimum, + unsigned long image_size) +{ + int i; + /* + * If no immovable memory found, or MEMORY_HOTREMOVE disabled, + * use @region directly. + */ + if (!num_immovable_mem) { + __process_mem_region(region, minimum, image_size); + + if (slot_area_index == MAX_SLOT_AREA) { + debug_putstr("Aborted e820/efi memmap scan (slot_areas full)!\n"); + return true; + } + return false; + } + +#if defined(CONFIG_MEMORY_HOTREMOVE) && defined(CONFIG_ACPI) + /* + * If immovable memory found, filter the intersection between + * immovable memory and @region. 
+ */ + for (i = 0; i < num_immovable_mem; i++) { + u64 start, end, entry_end, region_end; + struct mem_vector entry; + + if (!mem_overlaps(region, &immovable_mem[i])) + continue; + + start = immovable_mem[i].start; + end = start + immovable_mem[i].size; + region_end = region->start + region->size; + + entry.start = clamp(region->start, start, end); + entry_end = clamp(region_end, start, end); + entry.size = entry_end - entry.start; + + __process_mem_region(&entry, minimum, image_size); + + if (slot_area_index == MAX_SLOT_AREA) { + debug_putstr("Aborted e820/efi memmap scan when walking immovable regions(slot_areas full)!\n"); + return true; + } + } +#endif + return false; +} + +#ifdef CONFIG_EFI + +/* + * Only EFI_CONVENTIONAL_MEMORY and EFI_UNACCEPTED_MEMORY (if supported) are + * guaranteed to be free. + * + * Pick free memory more conservatively than the EFI spec allows: according to + * the spec, EFI_BOOT_SERVICES_{CODE|DATA} are also free memory and thus + * available to place the kernel image into, but in practice there's firmware + * where using that memory leads to crashes. Buggy vendor EFI code registers + * for an event that triggers on SetVirtualAddressMap(). The handler assumes + * that EFI_BOOT_SERVICES_DATA memory has not been touched by loader yet, which + * is probably true for Windows. + * + * Preserve EFI_BOOT_SERVICES_* regions until after SetVirtualAddressMap(). + */ +static inline bool memory_type_is_free(efi_memory_desc_t *md) +{ + if (md->type == EFI_CONVENTIONAL_MEMORY) + return true; + + if (IS_ENABLED(CONFIG_UNACCEPTED_MEMORY) && + md->type == EFI_UNACCEPTED_MEMORY) + return true; + + return false; +} + +/* + * Returns true if we processed the EFI memmap, which we prefer over the E820 + * table if it is available. + */ +static bool +process_efi_entries(unsigned long minimum, unsigned long image_size) +{ + struct efi_info *e = &boot_params_ptr->efi_info; + bool efi_mirror_found = false; + struct mem_vector region; + efi_memory_desc_t *md; + unsigned long pmap; + char *signature; + u32 nr_desc; + int i; + + signature = (char *)&e->efi_loader_signature; + if (strncmp(signature, EFI32_LOADER_SIGNATURE, 4) && + strncmp(signature, EFI64_LOADER_SIGNATURE, 4)) + return false; + +#ifdef CONFIG_X86_32 + /* Can't handle data above 4GB at this time */ + if (e->efi_memmap_hi) { + warn("EFI memmap is above 4GB, can't be handled now on x86_32. 
EFI should be disabled.\n"); + return false; + } + pmap = e->efi_memmap; +#else + pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32)); +#endif + + nr_desc = e->efi_memmap_size / e->efi_memdesc_size; + for (i = 0; i < nr_desc; i++) { + md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i); + if (md->attribute & EFI_MEMORY_MORE_RELIABLE) { + efi_mirror_found = true; + break; + } + } + + for (i = 0; i < nr_desc; i++) { + md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i); + + if (!memory_type_is_free(md)) + continue; + + if (efi_soft_reserve_enabled() && + (md->attribute & EFI_MEMORY_SP)) + continue; + + if (efi_mirror_found && + !(md->attribute & EFI_MEMORY_MORE_RELIABLE)) + continue; + + region.start = md->phys_addr; + region.size = md->num_pages << EFI_PAGE_SHIFT; + if (process_mem_region(&region, minimum, image_size)) + break; + } + return true; +} +#else +static inline bool +process_efi_entries(unsigned long minimum, unsigned long image_size) +{ + return false; +} +#endif + +static void process_e820_entries(unsigned long minimum, + unsigned long image_size) +{ + int i; + struct mem_vector region; + struct boot_e820_entry *entry; + + /* Verify potential e820 positions, appending to slots list. */ + for (i = 0; i < boot_params_ptr->e820_entries; i++) { + entry = &boot_params_ptr->e820_table[i]; + /* Skip non-RAM entries. */ + if (entry->type != E820_TYPE_RAM) + continue; + region.start = entry->addr; + region.size = entry->size; + if (process_mem_region(&region, minimum, image_size)) + break; + } +} + +/* + * If KHO is active, only process its scratch areas to ensure we are not + * stepping onto preserved memory. + */ +static bool process_kho_entries(unsigned long minimum, unsigned long image_size) +{ + struct kho_scratch *kho_scratch; + struct setup_data *ptr; + struct kho_data *kho; + int i, nr_areas = 0; + + if (!IS_ENABLED(CONFIG_KEXEC_HANDOVER)) + return false; + + ptr = (struct setup_data *)(unsigned long)boot_params_ptr->hdr.setup_data; + while (ptr) { + if (ptr->type == SETUP_KEXEC_KHO) { + kho = (struct kho_data *)(unsigned long)ptr->data; + kho_scratch = (void *)(unsigned long)kho->scratch_addr; + nr_areas = kho->scratch_size / sizeof(*kho_scratch); + break; + } + + ptr = (struct setup_data *)(unsigned long)ptr->next; + } + + if (!nr_areas) + return false; + + for (i = 0; i < nr_areas; i++) { + struct kho_scratch *area = &kho_scratch[i]; + struct mem_vector region = { + .start = area->addr, + .size = area->size, + }; + + if (process_mem_region(&region, minimum, image_size)) + break; + } + + return true; +} + +static unsigned long find_random_phys_addr(unsigned long minimum, + unsigned long image_size) +{ + u64 phys_addr; + + /* Bail out early if it's impossible to succeed. */ + if (minimum + image_size > mem_limit) + return 0; + + /* Check if we had too many memmaps. */ + if (memmap_too_large) { + debug_putstr("Aborted memory entries scan (more than 4 memmap= args)!\n"); + return 0; + } + + /* + * During kexec handover only process KHO scratch areas that are known + * not to contain any data that must be preserved. + */ + if (!process_kho_entries(minimum, image_size) && + !process_efi_entries(minimum, image_size)) + process_e820_entries(minimum, image_size); + + phys_addr = slots_fetch_random(); + + /* Perform a final check to make sure the address is in range. 
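For illustration, a user-space sketch of the slot bookkeeping that find_random_phys_addr() relies on: store_slot_info() counts CONFIG_PHYSICAL_ALIGN-sized slots per free area and slots_fetch_random() maps one random index across all areas. The sizes and the fixed "random" index below are invented, not taken from the kernel.

/* Illustration only, not kernel code. */
#include <stdio.h>

#define SLOT_ALIGN	(2UL << 20)	/* stands in for CONFIG_PHYSICAL_ALIGN */

struct area { unsigned long addr; unsigned long num; };

int main(void)
{
	/* Two free areas holding 10 and 20 aligned slots respectively. */
	struct area areas[] = { { 0x04000000UL, 10 }, { 0x10000000UL, 20 } };
	unsigned long slot = 14;	/* pretend kaslr_get_random_long() % slot_max */
	unsigned int i;

	for (i = 0; i < 2; i++) {
		if (slot >= areas[i].num) {
			slot -= areas[i].num;
			continue;
		}
		/* 14 falls into the second area: 0x10000000 + 4 * 2M = 0x10800000 */
		printf("chosen load address: %#lx\n",
		       areas[i].addr + slot * SLOT_ALIGN);
		break;
	}
	return 0;
}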
*/ + if (phys_addr < minimum || phys_addr + image_size > mem_limit) { + warn("Invalid physical address chosen!\n"); + return 0; + } + + return (unsigned long)phys_addr; +} + +static unsigned long find_random_virt_addr(unsigned long minimum, + unsigned long image_size) +{ + unsigned long slots, random_addr; + + /* + * There are how many CONFIG_PHYSICAL_ALIGN-sized slots + * that can hold image_size within the range of minimum to + * KERNEL_IMAGE_SIZE? + */ + slots = 1 + (KERNEL_IMAGE_SIZE - minimum - image_size) / CONFIG_PHYSICAL_ALIGN; + + random_addr = kaslr_get_random_long("Virtual") % slots; + + return random_addr * CONFIG_PHYSICAL_ALIGN + minimum; +} + +/* + * Since this function examines addresses much more numerically, + * it takes the input and output pointers as 'unsigned long'. + */ +void choose_random_location(unsigned long input, + unsigned long input_size, + unsigned long *output, + unsigned long output_size, + unsigned long *virt_addr) +{ + unsigned long random_addr, min_addr; + + if (cmdline_find_option_bool("nokaslr")) { + warn("KASLR disabled: 'nokaslr' on cmdline."); + return; + } + + boot_params_ptr->hdr.loadflags |= KASLR_FLAG; + + if (IS_ENABLED(CONFIG_X86_32)) + mem_limit = KERNEL_IMAGE_SIZE; + else + mem_limit = MAXMEM; + + /* Record the various known unsafe memory ranges. */ + mem_avoid_init(input, input_size, *output); + + /* + * Low end of the randomization range should be the + * smaller of 512M or the initial kernel image + * location: + */ + min_addr = min(*output, 512UL << 20); + /* Make sure minimum is aligned. */ + min_addr = ALIGN(min_addr, CONFIG_PHYSICAL_ALIGN); + + /* Walk available memory entries to find a random address. */ + random_addr = find_random_phys_addr(min_addr, output_size); + if (!random_addr) { + warn("Physical KASLR disabled: no suitable memory region!"); + } else { + /* Update the new physical address location. */ + if (*output != random_addr) + *output = random_addr; + } + + + /* Pick random virtual address starting from LOAD_PHYSICAL_ADDR. */ + if (IS_ENABLED(CONFIG_X86_64)) + random_addr = find_random_virt_addr(LOAD_PHYSICAL_ADDR, output_size); + *virt_addr = random_addr; +} diff --git a/arch/x86/boot/compressed/kernel_info.S b/arch/x86/boot/compressed/kernel_info.S new file mode 100644 index 000000000000..f818ee8fba38 --- /dev/null +++ b/arch/x86/boot/compressed/kernel_info.S @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <asm/bootparam.h> + + .section ".rodata.kernel_info", "a" + + .global kernel_info + +kernel_info: + /* Header, Linux top (structure). */ + .ascii "LToP" + /* Size. */ + .long kernel_info_var_len_data - kernel_info + /* Size total. */ + .long kernel_info_end - kernel_info + + /* Maximal allowed type for setup_data and setup_indirect structs. */ + .long SETUP_TYPE_MAX + +kernel_info_var_len_data: + /* Empty for time being... */ +kernel_info_end: diff --git a/arch/x86/boot/compressed/mem.c b/arch/x86/boot/compressed/mem.c new file mode 100644 index 000000000000..0e9f84ab4bdc --- /dev/null +++ b/arch/x86/boot/compressed/mem.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "error.h" +#include "misc.h" +#include "tdx.h" +#include "sev.h" +#include <asm/shared/tdx.h> + +/* + * accept_memory() and process_unaccepted_memory() called from EFI stub which + * runs before decompressor and its early_tdx_detect(). + * + * Enumerate TDX directly from the early users. 
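As a worked example of the slot count in find_random_virt_addr() above (all values chosen purely for illustration): with minimum = 16 MiB, image_size = 64 MiB, KERNEL_IMAGE_SIZE = 1 GiB and CONFIG_PHYSICAL_ALIGN = 2 MiB, slots = 1 + (1024 - 16 - 64) / 2 = 473, and the returned address is minimum plus a random multiple of 2 MiB drawn from those 473 possibilities.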
+ */ +static bool early_is_tdx_guest(void) +{ + static bool once; + static bool is_tdx; + + if (!IS_ENABLED(CONFIG_INTEL_TDX_GUEST)) + return false; + + if (!once) { + u32 eax, sig[3]; + + cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, + &sig[0], &sig[2], &sig[1]); + is_tdx = !memcmp(TDX_IDENT, sig, sizeof(sig)); + once = true; + } + + return is_tdx; +} + +void arch_accept_memory(phys_addr_t start, phys_addr_t end) +{ + /* Platform-specific memory-acceptance call goes here */ + if (early_is_tdx_guest()) { + if (!tdx_accept_memory(start, end)) + panic("TDX: Failed to accept memory\n"); + } else if (early_is_sevsnp_guest()) { + snp_accept_memory(start, end); + } else { + error("Cannot accept memory: unknown platform\n"); + } +} + +bool init_unaccepted_memory(void) +{ + guid_t guid = LINUX_EFI_UNACCEPTED_MEM_TABLE_GUID; + struct efi_unaccepted_memory *table; + unsigned long cfg_table_pa; + unsigned int cfg_table_len; + enum efi_type et; + int ret; + + et = efi_get_type(boot_params_ptr); + if (et == EFI_TYPE_NONE) + return false; + + ret = efi_get_conf_table(boot_params_ptr, &cfg_table_pa, &cfg_table_len); + if (ret) { + warn("EFI config table not found."); + return false; + } + + table = (void *)efi_find_vendor_table(boot_params_ptr, cfg_table_pa, + cfg_table_len, guid); + if (!table) + return false; + + if (table->version != 1) + error("Unknown version of unaccepted memory table\n"); + + /* + * In many cases unaccepted_table is already set by EFI stub, but it + * has to be initialized again to cover cases when the table is not + * allocated by EFI stub or EFI stub copied the kernel image with + * efi_relocate_kernel() before the variable is set. + * + * It must be initialized before the first usage of accept_memory(). + */ + unaccepted_table = table; + + return true; +} diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S new file mode 100644 index 000000000000..32f7cc8a8625 --- /dev/null +++ b/arch/x86/boot/compressed/mem_encrypt.S @@ -0,0 +1,324 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2017 Advanced Micro Devices, Inc. 
+ * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + */ + +#include <linux/linkage.h> + +#include <asm/processor-flags.h> +#include <asm/msr.h> +#include <asm/asm-offsets.h> +#include <asm/segment.h> +#include <asm/trapnr.h> + + .text + .code32 +SYM_FUNC_START(get_sev_encryption_bit) + push %ebx + + movl $0x80000000, %eax /* CPUID to check the highest leaf */ + cpuid + cmpl $0x8000001f, %eax /* See if 0x8000001f is available */ + jb .Lno_sev + + /* + * Check for the SEV feature: + * CPUID Fn8000_001F[EAX] - Bit 1 + * CPUID Fn8000_001F[EBX] - Bits 5:0 + * Pagetable bit position used to indicate encryption + */ + movl $0x8000001f, %eax + cpuid + bt $1, %eax /* Check if SEV is available */ + jnc .Lno_sev + + movl $MSR_AMD64_SEV, %ecx /* Read the SEV MSR */ + rdmsr + bt $MSR_AMD64_SEV_ENABLED_BIT, %eax /* Check if SEV is active */ + jnc .Lno_sev + + movl %ebx, %eax + andl $0x3f, %eax /* Return the encryption bit location */ + jmp .Lsev_exit + +.Lno_sev: + xor %eax, %eax + +.Lsev_exit: + pop %ebx + RET +SYM_FUNC_END(get_sev_encryption_bit) + +/** + * sev_es_req_cpuid - Request a CPUID value from the Hypervisor using + * the GHCB MSR protocol + * + * @%eax: Register to request (0=EAX, 1=EBX, 2=ECX, 3=EDX) + * @%edx: CPUID Function + * + * Returns 0 in %eax on success, non-zero on failure + * %edx returns CPUID value on success + */ +SYM_CODE_START_LOCAL(sev_es_req_cpuid) + shll $30, %eax + orl $0x00000004, %eax + movl $MSR_AMD64_SEV_ES_GHCB, %ecx + wrmsr + rep; vmmcall # VMGEXIT + rdmsr + + /* Check response */ + movl %eax, %ecx + andl $0x3ffff000, %ecx # Bits [12-29] MBZ + jnz 2f + + /* Check return code */ + andl $0xfff, %eax + cmpl $5, %eax + jne 2f + + /* All good - return success */ + xorl %eax, %eax +1: + RET +2: + movl $-1, %eax + jmp 1b +SYM_CODE_END(sev_es_req_cpuid) + +SYM_CODE_START_LOCAL(startup32_vc_handler) + pushl %eax + pushl %ebx + pushl %ecx + pushl %edx + + /* Keep CPUID function in %ebx */ + movl %eax, %ebx + + /* Check if error-code == SVM_EXIT_CPUID */ + cmpl $0x72, 16(%esp) + jne .Lfail + + movl $0, %eax # Request CPUID[fn].EAX + movl %ebx, %edx # CPUID fn + call sev_es_req_cpuid # Call helper + testl %eax, %eax # Check return code + jnz .Lfail + movl %edx, 12(%esp) # Store result + + movl $1, %eax # Request CPUID[fn].EBX + movl %ebx, %edx # CPUID fn + call sev_es_req_cpuid # Call helper + testl %eax, %eax # Check return code + jnz .Lfail + movl %edx, 8(%esp) # Store result + + movl $2, %eax # Request CPUID[fn].ECX + movl %ebx, %edx # CPUID fn + call sev_es_req_cpuid # Call helper + testl %eax, %eax # Check return code + jnz .Lfail + movl %edx, 4(%esp) # Store result + + movl $3, %eax # Request CPUID[fn].EDX + movl %ebx, %edx # CPUID fn + call sev_es_req_cpuid # Call helper + testl %eax, %eax # Check return code + jnz .Lfail + movl %edx, 0(%esp) # Store result + + /* + * Sanity check CPUID results from the Hypervisor. See comment in + * do_vc_no_ghcb() for more details on why this is necessary. 
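For reference, the MSR value layout that sev_es_req_cpuid() above builds and parses can be written out in C roughly as follows. This is a sketch inferred from the assembly, not kernel code.

/* Illustration only. */
#include <stdint.h>

/* Compose the GHCB MSR value for a CPUID request: register index
 * (0=EAX..3=EDX) in bits 31:30, request code 0x004 in bits 11:0,
 * CPUID function in the high 32 bits. */
static uint64_t ghcb_cpuid_request(uint32_t reg_idx, uint32_t fn)
{
	return ((uint64_t)fn << 32) | ((uint64_t)reg_idx << 30) | 0x004;
}

/* The response carries code 0x005 in bits 11:0, bits 29:12 must be zero,
 * and the requested CPUID register value in the high 32 bits. */
static int ghcb_cpuid_response(uint64_t msr, uint32_t *value)
{
	if ((msr & 0xfff) != 0x005 || (msr & 0x3ffff000))
		return -1;
	*value = (uint32_t)(msr >> 32);
	return 0;
}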
+ */ + + /* Fail if SEV leaf not available in CPUID[0x80000000].EAX */ + cmpl $0x80000000, %ebx + jne .Lcheck_sev + cmpl $0x8000001f, 12(%esp) + jb .Lfail + jmp .Ldone + +.Lcheck_sev: + /* Fail if SEV bit not set in CPUID[0x8000001f].EAX[1] */ + cmpl $0x8000001f, %ebx + jne .Ldone + btl $1, 12(%esp) + jnc .Lfail + +.Ldone: + popl %edx + popl %ecx + popl %ebx + popl %eax + + /* Remove error code */ + addl $4, %esp + + /* Jump over CPUID instruction */ + addl $2, (%esp) + + iret +.Lfail: + /* Send terminate request to Hypervisor */ + movl $0x100, %eax + xorl %edx, %edx + movl $MSR_AMD64_SEV_ES_GHCB, %ecx + wrmsr + rep; vmmcall + + /* If request fails, go to hlt loop */ + hlt + jmp .Lfail +SYM_CODE_END(startup32_vc_handler) + +/* + * Write an IDT entry into boot32_idt + * + * Parameters: + * + * %eax: Handler address + * %edx: Vector number + * %ecx: IDT address + */ +SYM_FUNC_START_LOCAL(startup32_set_idt_entry) + /* IDT entry address to %ecx */ + leal (%ecx, %edx, 8), %ecx + + /* Build IDT entry, lower 4 bytes */ + movl %eax, %edx + andl $0x0000ffff, %edx # Target code segment offset [15:0] + orl $(__KERNEL32_CS << 16), %edx # Target code segment selector + + /* Store lower 4 bytes to IDT */ + movl %edx, (%ecx) + + /* Build IDT entry, upper 4 bytes */ + movl %eax, %edx + andl $0xffff0000, %edx # Target code segment offset [31:16] + orl $0x00008e00, %edx # Present, Type 32-bit Interrupt Gate + + /* Store upper 4 bytes to IDT */ + movl %edx, 4(%ecx) + + RET +SYM_FUNC_END(startup32_set_idt_entry) + +SYM_FUNC_START(startup32_load_idt) + push %ebp + push %ebx + + call 1f +1: pop %ebp + + leal (boot32_idt - 1b)(%ebp), %ebx + + /* #VC handler */ + leal (startup32_vc_handler - 1b)(%ebp), %eax + movl $X86_TRAP_VC, %edx + movl %ebx, %ecx + call startup32_set_idt_entry + + /* Load IDT */ + leal (boot32_idt_desc - 1b)(%ebp), %ecx + movl %ebx, 2(%ecx) + lidt (%ecx) + + pop %ebx + pop %ebp + RET +SYM_FUNC_END(startup32_load_idt) + +/* + * Check for the correct C-bit position when the startup_32 boot-path is used. + * + * The check makes use of the fact that all memory is encrypted when paging is + * disabled. The function creates 64 bits of random data using the RDRAND + * instruction. RDRAND is mandatory for SEV guests, so always available. If the + * hypervisor violates that the kernel will crash right here. + * + * The 64 bits of random data are stored to a memory location and at the same + * time kept in the %eax and %ebx registers. Since encryption is always active + * when paging is off the random data will be stored encrypted in main memory. + * + * Then paging is enabled. When the C-bit position is correct all memory is + * still mapped encrypted and comparing the register values with memory will + * succeed. An incorrect C-bit position will map all memory unencrypted, so that + * the compare will use the encrypted random data and fail. + */ +SYM_FUNC_START(startup32_check_sev_cbit) + pushl %ebx + pushl %ebp + + call 0f +0: popl %ebp + + /* Check for non-zero sev_status */ + movl (sev_status - 0b)(%ebp), %eax + testl %eax, %eax + jz 4f + + /* + * Get two 32-bit random values - Don't bail out if RDRAND fails + * because it is better to prevent forward progress if no random value + * can be gathered. 
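For illustration, the bit layout that startup32_set_idt_entry() above writes can be expressed in C like this. The selector value is left symbolic; this is a sketch, not kernel code.

/* Illustration only. */
#include <stdint.h>

struct gate32 { uint32_t lo, hi; };

/* Pack a 32-bit interrupt gate: handler offset split across both words,
 * code segment selector in lo[31:16], present/type bits 0x8e00 in hi. */
static struct gate32 pack_idt_gate(uint32_t handler, uint16_t cs_selector)
{
	struct gate32 g = {
		.lo = (handler & 0x0000ffff) | ((uint32_t)cs_selector << 16),
		.hi = (handler & 0xffff0000) | 0x00008e00,
	};
	return g;
}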
+ */ +1: rdrand %eax + jnc 1b +2: rdrand %ebx + jnc 2b + + /* Store to memory and keep it in the registers */ + leal (sev_check_data - 0b)(%ebp), %ebp + movl %eax, 0(%ebp) + movl %ebx, 4(%ebp) + + /* Enable paging to see if encryption is active */ + movl %cr0, %edx /* Backup %cr0 in %edx */ + movl $(X86_CR0_PG | X86_CR0_PE), %ecx /* Enable Paging and Protected mode */ + movl %ecx, %cr0 + + cmpl %eax, 0(%ebp) + jne 3f + cmpl %ebx, 4(%ebp) + jne 3f + + movl %edx, %cr0 /* Restore previous %cr0 */ + + jmp 4f + +3: /* Check failed - hlt the machine */ + hlt + jmp 3b + +4: + popl %ebp + popl %ebx + RET +SYM_FUNC_END(startup32_check_sev_cbit) + + .code64 + +#include "../../kernel/sev_verify_cbit.S" + + .data + + .balign 8 +SYM_DATA(sme_me_mask, .quad 0) +SYM_DATA(sev_status, .quad 0) +SYM_DATA(sev_check_data, .quad 0) + +SYM_DATA_START_LOCAL(boot32_idt) + .rept 32 + .quad 0 + .endr +SYM_DATA_END(boot32_idt) + +SYM_DATA_START_LOCAL(boot32_idt_desc) + .word . - boot32_idt - 1 + .long 0 +SYM_DATA_END(boot32_idt_desc) diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 0319c88290a5..0f41ca0e52c0 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -1,8 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 /* * misc.c * - * This is a collection of several routines from gzip-1.0.3 - * adapted for Linux. + * This is a collection of several routines used to extract the kernel + * which includes KASLR relocation, decompression, ELF parsing, and + * relocation processing. Additionally included are the screen and serial + * output functions and related debugging support functions. * * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994 * puts by Nick Holloway 1993, better puts by Martin Mares 1995 @@ -10,120 +13,52 @@ */ #include "misc.h" - -/* WARNING!! - * This code is compiled with -fPIC and it is relocated dynamically - * at run time, but no relocation processing is performed. - * This means that it is not safe to place pointers in static structures. - */ +#include "error.h" +#include "../string.h" +#include "../voffset.h" +#include <asm/bootparam_utils.h> /* - * Getting to provable safe in place decompression is hard. - * Worst case behaviours need to be analyzed. - * Background information: - * - * The file layout is: - * magic[2] - * method[1] - * flags[1] - * timestamp[4] - * extraflags[1] - * os[1] - * compressed data blocks[N] - * crc[4] orig_len[4] - * - * resulting in 18 bytes of non compressed data overhead. - * - * Files divided into blocks - * 1 bit (last block flag) - * 2 bits (block type) - * - * 1 block occurs every 32K -1 bytes or when there 50% compression - * has been achieved. The smallest block type encoding is always used. - * - * stored: - * 32 bits length in bytes. - * - * fixed: - * magic fixed tree. - * symbols. - * - * dynamic: - * dynamic tree encoding. - * symbols. - * - * - * The buffer for decompression in place is the length of the - * uncompressed data, plus a small amount extra to keep the algorithm safe. - * The compressed data is placed at the end of the buffer. The output - * pointer is placed at the start of the buffer and the input pointer - * is placed where the compressed data starts. Problems will occur - * when the output pointer overruns the input pointer. - * - * The output pointer can only overrun the input pointer if the input - * pointer is moving faster than the output pointer. A condition only - * triggered by data whose compressed form is larger than the uncompressed - * form. 
- * - * The worst case at the block level is a growth of the compressed data - * of 5 bytes per 32767 bytes. - * - * The worst case internal to a compressed block is very hard to figure. - * The worst case can at least be boundined by having one bit that represents - * 32764 bytes and then all of the rest of the bytes representing the very - * very last byte. - * - * All of which is enough to compute an amount of extra data that is required - * to be safe. To avoid problems at the block level allocating 5 extra bytes - * per 32767 bytes of data is sufficient. To avoind problems internal to a - * block adding an extra 32767 bytes (the worst case uncompressed block size) - * is sufficient, to ensure that in the worst case the decompressed data for - * block will stop the byte before the compressed data for a block begins. - * To avoid problems with the compressed data's meta information an extra 18 - * bytes are needed. Leading to the formula: - * - * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size. - * - * Adding 8 bytes per 32K is a bit excessive but much easier to calculate. - * Adding 32768 instead of 32767 just makes for round numbers. - * Adding the decompressor_size is necessary as it musht live after all - * of the data as well. Last I measured the decompressor is about 14K. - * 10K of actual data and 4K of bss. - * + * WARNING!! + * This code is compiled with -fPIC and it is relocated dynamically at + * run time, but no relocation processing is performed. This means that + * it is not safe to place pointers in static structures. */ -/* - * gzip declarations - */ +/* Macros used by the included decompressor code below. */ #define STATIC static +/* Define an externally visible malloc()/free(). */ +#define MALLOC_VISIBLE +#include <linux/decompress/mm.h> -#undef memset -#undef memcpy +/* + * Provide definitions of memzero and memmove as some of the decompressors will + * try to define their own functions if these are not defined as macros. + */ #define memzero(s, n) memset((s), 0, (n)) - - -static void error(char *m); +#ifndef memmove +#define memmove memmove +/* Functions used by the included decompressor code below. */ +void *memmove(void *dest, const void *src, size_t n); +#endif /* * This is set up by the setup-routine at boot-time */ -struct boot_params *real_mode; /* Pointer to real-mode data */ +struct boot_params *boot_params_ptr; -void *memset(void *s, int c, size_t n); -void *memcpy(void *dest, const void *src, size_t n); +struct port_io_ops pio_ops; -#ifdef CONFIG_X86_64 -#define memptr long -#else -#define memptr unsigned -#endif - -static memptr free_mem_ptr; -static memptr free_mem_end_ptr; +memptr free_mem_ptr; +memptr free_mem_end_ptr; +int spurious_nmi_count; static char *vidmem; static int vidport; -static int lines, cols; + +/* These might be accessed before .bss is cleared, so use .data instead. */ +static int lines __section(".data"); +static int cols __section(".data"); #ifdef CONFIG_KERNEL_GZIP #include "../../../../lib/decompress_inflate.c" @@ -149,11 +84,19 @@ static int lines, cols; #include "../../../../lib/decompress_unlz4.c" #endif +#ifdef CONFIG_KERNEL_ZSTD +#include "../../../../lib/decompress_unzstd.c" +#endif +/* + * NOTE: When adding a new decompressor, please update the analysis in + * ../header.S. 
+ */ + static void scroll(void) { int i; - memcpy(vidmem, vidmem + cols * 2, (lines - 1) * cols * 2); + memmove(vidmem, vidmem + cols * 2, (lines - 1) * cols * 2); for (i = (lines - 1) * cols * 2; i < lines * cols * 2; i += 2) vidmem[i] = ' '; } @@ -186,12 +129,11 @@ void __putstr(const char *s) } } - if (real_mode->screen_info.orig_video_mode == 0 && - lines == 0 && cols == 0) + if (lines == 0 || cols == 0) return; - x = real_mode->screen_info.orig_x; - y = real_mode->screen_info.orig_y; + x = boot_params_ptr->screen_info.orig_x; + y = boot_params_ptr->screen_info.orig_y; while ((c = *s++) != '\0') { if (c == '\n') { @@ -212,8 +154,8 @@ void __putstr(const char *s) } } - real_mode->screen_info.orig_x = x; - real_mode->screen_info.orig_y = y; + boot_params_ptr->screen_info.orig_x = x; + boot_params_ptr->screen_info.orig_y = y; pos = (x + cols * y) * 2; /* Update cursor position */ outb(14, vidport); @@ -222,56 +164,121 @@ void __putstr(const char *s) outb(0xff & (pos >> 1), vidport+1); } -void *memset(void *s, int c, size_t n) +static noinline void __putnum(unsigned long value, unsigned int base, + int mindig) { - int i; - char *ss = s; + char buf[8*sizeof(value)+1]; + char *p; + + p = buf + sizeof(buf); + *--p = '\0'; + + while (mindig-- > 0 || value) { + unsigned char digit = value % base; + digit += (digit >= 10) ? ('a'-10) : '0'; + *--p = digit; - for (i = 0; i < n; i++) - ss[i] = c; - return s; + value /= base; + } + + __putstr(p); } -#ifdef CONFIG_X86_32 -void *memcpy(void *dest, const void *src, size_t n) + +void __puthex(unsigned long value) { - int d0, d1, d2; - asm volatile( - "rep ; movsl\n\t" - "movl %4,%%ecx\n\t" - "rep ; movsb\n\t" - : "=&c" (d0), "=&D" (d1), "=&S" (d2) - : "0" (n >> 2), "g" (n & 3), "1" (dest), "2" (src) - : "memory"); - - return dest; + __putnum(value, 16, sizeof(value)*2); } -#else -void *memcpy(void *dest, const void *src, size_t n) + +void __putdec(unsigned long value) { - long d0, d1, d2; - asm volatile( - "rep ; movsq\n\t" - "movq %4,%%rcx\n\t" - "rep ; movsb\n\t" - : "=&c" (d0), "=&D" (d1), "=&S" (d2) - : "0" (n >> 3), "g" (n & 7), "1" (dest), "2" (src) - : "memory"); - - return dest; + __putnum(value, 10, 1); } -#endif -static void error(char *x) +#ifdef CONFIG_X86_NEED_RELOCS +static void handle_relocations(void *output, unsigned long output_len, + unsigned long virt_addr) { - error_putstr("\n\n"); - error_putstr(x); - error_putstr("\n\n -- System halted"); + int *reloc; + unsigned long delta, map, ptr; + unsigned long min_addr = (unsigned long)output; + unsigned long max_addr = min_addr + (VO___bss_start - VO__text); + + /* + * Calculate the delta between where vmlinux was linked to load + * and where it was actually loaded. + */ + delta = min_addr - LOAD_PHYSICAL_ADDR; + + /* + * The kernel contains a table of relocation addresses. Those + * addresses have the final load address of the kernel in virtual + * memory. We are currently working in the self map. So we need to + * create an adjustment for kernel memory addresses to the self map. + * This will involve subtracting out the base address of the kernel. + */ + map = delta - __START_KERNEL_map; + + /* + * 32-bit always performs relocations. 64-bit relocations are only + * needed if KASLR has chosen a different starting address offset + * from __START_KERNEL_map. + */ + if (IS_ENABLED(CONFIG_X86_64)) + delta = virt_addr - LOAD_PHYSICAL_ADDR; + + if (!delta) { + debug_putstr("No relocation needed... "); + return; + } + debug_putstr("Performing relocations... 
"); + + /* + * Process relocations: 32 bit relocations first then 64 bit after. + * Two sets of binary relocations are added to the end of the kernel + * before compression. Each relocation table entry is the kernel + * address of the location which needs to be updated stored as a + * 32-bit value which is sign extended to 64 bits. + * + * Format is: + * + * kernel bits... + * 0 - zero terminator for 64 bit relocations + * 64 bit relocation repeated + * 0 - zero terminator for 32 bit relocations + * 32 bit relocation repeated + * + * So we work backwards from the end of the decompressed image. + */ + for (reloc = output + output_len - sizeof(*reloc); *reloc; reloc--) { + long extended = *reloc; + extended += map; + + ptr = (unsigned long)extended; + if (ptr < min_addr || ptr > max_addr) + error("32-bit relocation outside of kernel!\n"); + + *(uint32_t *)ptr += delta; + } +#ifdef CONFIG_X86_64 + for (reloc--; *reloc; reloc--) { + long extended = *reloc; + extended += map; - while (1) - asm("hlt"); + ptr = (unsigned long)extended; + if (ptr < min_addr || ptr > max_addr) + error("64-bit relocation outside of kernel!\n"); + + *(uint64_t *)ptr += delta; + } +#endif } +#else +static inline void handle_relocations(void *output, unsigned long output_len, + unsigned long virt_addr) +{ } +#endif -static void parse_elf(void *output) +static size_t parse_elf(void *output) { #ifdef CONFIG_X86_64 Elf64_Ehdr ehdr; @@ -287,10 +294,8 @@ static void parse_elf(void *output) if (ehdr.e_ident[EI_MAG0] != ELFMAG0 || ehdr.e_ident[EI_MAG1] != ELFMAG1 || ehdr.e_ident[EI_MAG2] != ELFMAG2 || - ehdr.e_ident[EI_MAG3] != ELFMAG3) { + ehdr.e_ident[EI_MAG3] != ELFMAG3) error("Kernel is not a valid ELF file"); - return; - } debug_putstr("Parsing ELF... "); @@ -305,33 +310,118 @@ static void parse_elf(void *output) switch (phdr->p_type) { case PT_LOAD: +#ifdef CONFIG_X86_64 + if ((phdr->p_align % 0x200000) != 0) + error("Alignment of LOAD segment isn't multiple of 2MB"); +#endif #ifdef CONFIG_RELOCATABLE dest = output; dest += (phdr->p_paddr - LOAD_PHYSICAL_ADDR); #else dest = (void *)(phdr->p_paddr); #endif - memcpy(dest, - output + phdr->p_offset, - phdr->p_filesz); + memmove(dest, output + phdr->p_offset, phdr->p_filesz); break; default: /* Ignore other PT_* */ break; } } free(phdrs); + + return ehdr.e_entry - LOAD_PHYSICAL_ADDR; +} + +const unsigned long kernel_text_size = VO___start_rodata - VO__text; +const unsigned long kernel_inittext_offset = VO__sinittext - VO__text; +const unsigned long kernel_inittext_size = VO___inittext_end - VO__sinittext; +const unsigned long kernel_total_size = VO__end - VO__text; + +static u8 boot_heap[BOOT_HEAP_SIZE] __aligned(4); + +extern unsigned char input_data[]; +extern unsigned int input_len, output_len; + +unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr, + void (*error)(char *x)) +{ + unsigned long entry; + + if (!free_mem_ptr) { + free_mem_ptr = (unsigned long)boot_heap; + free_mem_end_ptr = (unsigned long)boot_heap + sizeof(boot_heap); + } + + if (__decompress(input_data, input_len, NULL, NULL, outbuf, output_len, + NULL, error) < 0) + return ULONG_MAX; + + entry = parse_elf(outbuf); + handle_relocations(outbuf, output_len, virt_addr); + + return entry; } -asmlinkage void decompress_kernel(void *rmode, memptr heap, - unsigned char *input_data, - unsigned long input_len, - unsigned char *output) +/* + * Set the memory encryption xloadflag based on the mem_encrypt= command line + * parameter, if provided. 
+ */ +static void parse_mem_encrypt(struct setup_header *hdr) +{ + int on = cmdline_find_option_bool("mem_encrypt=on"); + int off = cmdline_find_option_bool("mem_encrypt=off"); + + if (on > off) + hdr->xloadflags |= XLF_MEM_ENCRYPTION; +} + +static void early_sev_detect(void) +{ + /* + * Accessing video memory causes guest termination because + * the boot stage2 #VC handler of SEV-ES/SNP guests does not + * support MMIO handling and kexec -c adds screen_info to the + * boot parameters passed to the kexec kernel, which causes + * console output to be dumped to both video and serial. + */ + if (sev_status & MSR_AMD64_SEV_ES_ENABLED) + lines = cols = 0; +} + +/* + * The compressed kernel image (ZO), has been moved so that its position + * is against the end of the buffer used to hold the uncompressed kernel + * image (VO) and the execution environment (.bss, .brk), which makes sure + * there is room to do the in-place decompression. (See header.S for the + * calculations.) + * + * |-----compressed kernel image------| + * V V + * 0 extract_offset +INIT_SIZE + * |-----------|---------------|-------------------------|--------| + * | | | | + * VO__text startup_32 of ZO VO__end ZO__end + * ^ ^ + * |-------uncompressed kernel image---------| + * + */ +asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output) { - real_mode = rmode; + unsigned long virt_addr = LOAD_PHYSICAL_ADDR; + memptr heap = (memptr)boot_heap; + unsigned long needed_size; + size_t entry_offset; + + /* Retain x86 boot parameters pointer passed from startup_32/64. */ + boot_params_ptr = rmode; + + /* Clear flags intended for solely in-kernel use. */ + boot_params_ptr->hdr.loadflags &= ~KASLR_FLAG; - sanitize_boot_params(real_mode); + parse_mem_encrypt(&boot_params_ptr->hdr); - if (real_mode->screen_info.orig_video_mode == 7) { + sanitize_boot_params(boot_params_ptr); + + if (boot_params_ptr->screen_info.orig_video_mode == 7) { vidmem = (char *) 0xb0000; vidport = 0x3b4; } else { @@ -339,32 +429,109 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap, vidport = 0x3d4; } - lines = real_mode->screen_info.orig_video_lines; - cols = real_mode->screen_info.orig_video_cols; + lines = boot_params_ptr->screen_info.orig_video_lines; + cols = boot_params_ptr->screen_info.orig_video_cols; + + init_default_io_ops(); + + /* + * Detect TDX guest environment. + * + * It has to be done before console_init() in order to use + * paravirtualized port I/O operations if needed. + */ + early_tdx_detect(); + + early_sev_detect(); console_init(); - debug_putstr("early console in decompress_kernel\n"); + + /* + * Save RSDP address for later use. Have this after console_init() + * so that early debugging output from the RSDP parsing code can be + * collected. + */ + boot_params_ptr->acpi_rsdp_addr = get_rsdp_addr(); + + debug_putstr("early console in extract_kernel\n"); free_mem_ptr = heap; /* Heap */ free_mem_end_ptr = heap + BOOT_HEAP_SIZE; + /* + * The memory hole needed for the kernel is the larger of either + * the entire decompressed kernel plus relocation table, or the + * entire decompressed kernel plus .bss and .brk sections. + * + * On X86_64, the memory is mapped with PMD pages. Round the + * size up so that the full extent of PMD pages mapped is + * included in the check against the valid memory table + * entries. This ensures the full mapped area is usable RAM + * and doesn't include any reserved areas. 
+ */ + needed_size = max_t(unsigned long, output_len, kernel_total_size); +#ifdef CONFIG_X86_64 + needed_size = ALIGN(needed_size, MIN_KERNEL_ALIGN); +#endif + + /* Report initial kernel position details. */ + debug_putaddr(input_data); + debug_putaddr(input_len); + debug_putaddr(output); + debug_putaddr(output_len); + debug_putaddr(kernel_total_size); + debug_putaddr(needed_size); + +#ifdef CONFIG_X86_64 + /* Report address of 32-bit trampoline */ + debug_putaddr(trampoline_32bit); +#endif + + choose_random_location((unsigned long)input_data, input_len, + (unsigned long *)&output, + needed_size, + &virt_addr); + + /* Validate memory location choices. */ if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1)) - error("Destination address inappropriately aligned"); + error("Destination physical address inappropriately aligned"); + if (virt_addr & (MIN_KERNEL_ALIGN - 1)) + error("Destination virtual address inappropriately aligned"); #ifdef CONFIG_X86_64 if (heap > 0x3fffffffffffUL) error("Destination address too large"); + if (virt_addr + needed_size > KERNEL_IMAGE_SIZE) + error("Destination virtual address is beyond the kernel mapping area"); #else if (heap > ((-__PAGE_OFFSET-(128<<20)-1) & 0x7fffffff)) error("Destination address too large"); #endif #ifndef CONFIG_RELOCATABLE - if ((unsigned long)output != LOAD_PHYSICAL_ADDR) - error("Wrong destination address"); + if (virt_addr != LOAD_PHYSICAL_ADDR) + error("Destination virtual address changed when not relocatable"); #endif debug_putstr("\nDecompressing Linux... "); - decompress(input_data, input_len, NULL, NULL, output, NULL, error); - parse_elf(output); - debug_putstr("done.\nBooting the kernel.\n"); - return; + + if (init_unaccepted_memory()) { + debug_putstr("Accepting memory... "); + accept_memory(__pa(output), needed_size); + } + + entry_offset = decompress_kernel(output, virt_addr, error); + + debug_putstr("done.\nBooting the kernel (entry_offset: 0x"); + debug_puthex(entry_offset); + debug_putstr(").\n"); + + /* Disable exception handling before booting the kernel */ + cleanup_exception_handling(); + + if (spurious_nmi_count) { + error_putstr("Spurious early NMIs ignored: "); + error_putdec(spurious_nmi_count); + error_putstr("\n"); + } + + return output + entry_offset; } diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 674019d8e235..fd855e32c9b9 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -1,61 +1,258 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef BOOT_COMPRESSED_MISC_H #define BOOT_COMPRESSED_MISC_H /* - * we have to be careful, because no indirections are allowed here, and - * paravirt_ops is a kind of one. As it will only run in baremetal anyway, - * we just keep it from happening + * Special hack: we have to be careful, because no indirections are allowed here, + * and paravirt_ops is a kind of one. As it will only run in baremetal anyway, + * we just keep it from happening. (This list needs to be extended when new + * paravirt and debugging variants are added.) */ #undef CONFIG_PARAVIRT -#ifdef CONFIG_X86_32 -#define _ASM_X86_DESC_H 1 -#endif +#undef CONFIG_PARAVIRT_XXL +#undef CONFIG_PARAVIRT_SPINLOCKS +#undef CONFIG_KASAN +#undef CONFIG_KASAN_GENERIC + +#define __NO_FORTIFY + +/* cpu_feature_enabled() cannot be used this early */ +#define USE_EARLY_PGTABLE_L5 + +/* + * Boot stub deals with identity mappings, physical and virtual addresses are + * the same, so override these defines. 
+ * + * <asm/page.h> will not define them if they are already defined. + */ +#define __pa(x) ((unsigned long)(x)) +#define __va(x) ((void *)((unsigned long)(x))) #include <linux/linkage.h> #include <linux/screen_info.h> #include <linux/elf.h> -#include <linux/io.h> #include <asm/page.h> #include <asm/boot.h> #include <asm/bootparam.h> -#include <asm/bootparam_utils.h> +#include <asm/desc_defs.h> + +#include "tdx.h" + +#define BOOT_CTYPE_H +#include <linux/acpi.h> #define BOOT_BOOT_H #include "../ctype.h" +#include "../io.h" + +#include "efi.h" + +#ifdef CONFIG_X86_64 +#define memptr long +#else +#define memptr unsigned +#endif + +/* boot/compressed/vmlinux start and end markers */ +extern char _head[], _end[]; /* misc.c */ -extern struct boot_params *real_mode; /* Pointer to real-mode data */ +extern memptr free_mem_ptr; +extern memptr free_mem_end_ptr; +extern int spurious_nmi_count; +void *malloc(int size); +void free(void *where); void __putstr(const char *s); +void __puthex(unsigned long value); +void __putdec(unsigned long value); #define error_putstr(__x) __putstr(__x) +#define error_puthex(__x) __puthex(__x) +#define error_putdec(__x) __putdec(__x) #ifdef CONFIG_X86_VERBOSE_BOOTUP #define debug_putstr(__x) __putstr(__x) +#define debug_puthex(__x) __puthex(__x) +#define debug_putaddr(__x) { \ + debug_putstr(#__x ": 0x"); \ + debug_puthex((unsigned long)(__x)); \ + debug_putstr("\n"); \ + } #else static inline void debug_putstr(const char *s) { } +static inline void debug_puthex(unsigned long value) +{ } +#define debug_putaddr(x) /* */ #endif -#ifdef CONFIG_EARLY_PRINTK - /* cmdline.c */ int cmdline_find_option(const char *option, char *buffer, int bufsize); int cmdline_find_option_bool(const char *option); -/* early_serial_console.c */ -extern int early_serial_base; -void console_init(void); +struct mem_vector { + u64 start; + u64 size; +}; +#ifdef CONFIG_RANDOMIZE_BASE +/* kaslr.c */ +void choose_random_location(unsigned long input, + unsigned long input_size, + unsigned long *output, + unsigned long output_size, + unsigned long *virt_addr); #else +static inline void choose_random_location(unsigned long input, + unsigned long input_size, + unsigned long *output, + unsigned long output_size, + unsigned long *virt_addr) +{ +} +#endif + +/* cpuflags.c */ +bool has_cpuflag(int flag); +#ifdef CONFIG_X86_64 +extern int set_page_decrypted(unsigned long address); +extern int set_page_encrypted(unsigned long address); +extern int set_page_non_present(unsigned long address); +extern unsigned char _pgtable[]; +#endif + +#ifdef CONFIG_EARLY_PRINTK /* early_serial_console.c */ +extern int early_serial_base; +void console_init(void); +#else static const int early_serial_base; static inline void console_init(void) { } +#endif + +#ifdef CONFIG_AMD_MEM_ENCRYPT +struct es_em_ctxt; +struct insn; + +void sev_enable(struct boot_params *bp); +void snp_check_features(void); +void sev_es_shutdown_ghcb(void); +extern bool sev_es_check_ghcb_fault(unsigned long address); +void snp_set_page_private(unsigned long paddr); +void snp_set_page_shared(unsigned long paddr); +void sev_prep_identity_maps(unsigned long top_level_pgt); + +enum es_result vc_decode_insn(struct es_em_ctxt *ctxt); +bool insn_has_rep_prefix(struct insn *insn); +void sev_insn_decode_init(void); +bool early_setup_ghcb(void); +#else +static inline void snp_check_features(void) { } +static inline void sev_es_shutdown_ghcb(void) { } +static inline bool sev_es_check_ghcb_fault(unsigned long address) +{ + return false; +} +static inline void 
snp_set_page_private(unsigned long paddr) { } +static inline void snp_set_page_shared(unsigned long paddr) { } +static inline void sev_prep_identity_maps(unsigned long top_level_pgt) { } +#endif + +/* acpi.c */ +#ifdef CONFIG_ACPI +acpi_physical_address get_rsdp_addr(void); +#else +static inline acpi_physical_address get_rsdp_addr(void) { return 0; } +#endif +#if defined(CONFIG_RANDOMIZE_BASE) && defined(CONFIG_MEMORY_HOTREMOVE) && defined(CONFIG_ACPI) +extern struct mem_vector immovable_mem[MAX_NUMNODES*2]; +int count_immovable_mem_regions(void); +#else +static inline int count_immovable_mem_regions(void) { return 0; } #endif +/* ident_map_64.c */ +extern unsigned int __pgtable_l5_enabled, pgdir_shift, ptrs_per_p4d; +extern void kernel_add_identity_map(unsigned long start, unsigned long end); + +/* Used by PAGE_KERN* macros: */ +extern pteval_t __default_kernel_pte_mask; + +/* idt_64.c */ +extern gate_desc boot_idt[BOOT_IDT_ENTRIES]; +extern struct desc_ptr boot_idt_desc; + +#ifdef CONFIG_X86_64 +void cleanup_exception_handling(void); +#else +static inline void cleanup_exception_handling(void) { } #endif + +/* IDT Entry Points */ +void boot_page_fault(void); +void boot_nmi_trap(void); +void boot_stage1_vc(void); +void boot_stage2_vc(void); + +unsigned long sev_verify_cbit(unsigned long cr3); + +enum efi_type { + EFI_TYPE_64, + EFI_TYPE_32, + EFI_TYPE_NONE, +}; + +#ifdef CONFIG_EFI +/* helpers for early EFI config table access */ +enum efi_type efi_get_type(struct boot_params *bp); +unsigned long efi_get_system_table(struct boot_params *bp); +int efi_get_conf_table(struct boot_params *bp, unsigned long *cfg_tbl_pa, + unsigned int *cfg_tbl_len); +unsigned long efi_find_vendor_table(struct boot_params *bp, + unsigned long cfg_tbl_pa, + unsigned int cfg_tbl_len, + efi_guid_t guid); +#else +static inline enum efi_type efi_get_type(struct boot_params *bp) +{ + return EFI_TYPE_NONE; +} + +static inline unsigned long efi_get_system_table(struct boot_params *bp) +{ + return 0; +} + +static inline int efi_get_conf_table(struct boot_params *bp, + unsigned long *cfg_tbl_pa, + unsigned int *cfg_tbl_len) +{ + return -ENOENT; +} + +static inline unsigned long efi_find_vendor_table(struct boot_params *bp, + unsigned long cfg_tbl_pa, + unsigned int cfg_tbl_len, + efi_guid_t guid) +{ + return 0; +} +#endif /* CONFIG_EFI */ + +#ifdef CONFIG_UNACCEPTED_MEMORY +bool init_unaccepted_memory(void); +#else +static inline bool init_unaccepted_memory(void) { return false; } +#endif + +/* Defined in EFI stub */ +extern struct efi_unaccepted_memory *unaccepted_table; +void accept_memory(phys_addr_t start, unsigned long size); + +#endif /* BOOT_COMPRESSED_MISC_H */ diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c index 958a641483dd..52aa56cdbacc 100644 --- a/arch/x86/boot/compressed/mkpiggy.c +++ b/arch/x86/boot/compressed/mkpiggy.c @@ -1,28 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0-only /* ----------------------------------------------------------------------- * * * Copyright (C) 2009 Intel Corporation. All rights reserved. * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * * H. Peter Anvin <hpa@linux.intel.com> * - * ----------------------------------------------------------------------- */ - -/* - * Compute the desired load offset from a compressed program; outputs - * a small assembly wrapper with the appropriate symbols defined. + * ----------------------------------------------------------------------- + * + * Outputs a small assembly wrapper with the appropriate symbols defined. */ #include <stdlib.h> @@ -35,12 +20,12 @@ int main(int argc, char *argv[]) { uint32_t olen; long ilen; - unsigned long offs; - FILE *f; + FILE *f = NULL; + int retval = 1; if (argc < 2) { fprintf(stderr, "Usage: %s compressed_file\n", argv[0]); - return 1; + goto bail; } /* Get the information for the compressed kernel image first */ @@ -48,7 +33,7 @@ int main(int argc, char *argv[]) f = fopen(argv[1], "r"); if (!f) { perror(argv[1]); - return 1; + goto bail; } @@ -58,38 +43,32 @@ int main(int argc, char *argv[]) if (fread(&olen, sizeof(olen), 1, f) != 1) { perror(argv[1]); - return 1; + goto bail; } ilen = ftell(f); olen = get_unaligned_le32(&olen); - fclose(f); - - /* - * Now we have the input (compressed) and output (uncompressed) - * sizes, compute the necessary decompression offset... - */ - - offs = (olen > ilen) ? olen - ilen : 0; - offs += olen >> 12; /* Add 8 bytes for each 32K block */ - offs += 64*1024 + 128; /* Add 64K + 128 bytes slack */ - offs = (offs+4095) & ~4095; /* Round to a 4K boundary */ printf(".section \".rodata..compressed\",\"a\",@progbits\n"); printf(".globl z_input_len\n"); printf("z_input_len = %lu\n", ilen); printf(".globl z_output_len\n"); printf("z_output_len = %lu\n", (unsigned long)olen); - printf(".globl z_extract_offset\n"); - printf("z_extract_offset = 0x%lx\n", offs); - /* z_extract_offset_negative allows simplification of head_32.S */ - printf(".globl z_extract_offset_negative\n"); - printf("z_extract_offset_negative = -0x%lx\n", offs); printf(".globl input_data, input_data_end\n"); printf("input_data:\n"); printf(".incbin \"%s\"\n", argv[1]); printf("input_data_end:\n"); - return 0; + printf(".section \".rodata\",\"a\",@progbits\n"); + printf(".globl input_len\n"); + printf("input_len:\n\t.long %lu\n", ilen); + printf(".globl output_len\n"); + printf("output_len:\n\t.long %lu\n", (unsigned long)olen); + + retval = 0; +bail: + if (f) + fclose(f); + return retval; } diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c new file mode 100644 index 000000000000..0e89e197e112 --- /dev/null +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "misc.h" +#include <asm/bootparam.h> +#include <asm/bootparam_utils.h> +#include <asm/e820/types.h> +#include <asm/pgtable.h> +#include <asm/processor.h> +#include "../string.h" +#include "efi.h" + +#define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */ +#define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */ + +/* __pgtable_l5_enabled needs to be in .data to avoid being cleared along with .bss */ +unsigned int __section(".data") __pgtable_l5_enabled; +unsigned int __section(".data") pgdir_shift = 39; +unsigned int __section(".data") ptrs_per_p4d = 1; + +/* Buffer to preserve trampoline memory */ +static char trampoline_save[TRAMPOLINE_32BIT_SIZE]; + +/* + * Trampoline 
address will be printed by extract_kernel() for debugging + * purposes. + * + * Avoid putting the pointer into .bss as it will be cleared between + * configure_5level_paging() and extract_kernel(). + */ +unsigned long *trampoline_32bit __section(".data"); + +int cmdline_find_option_bool(const char *option); + +static unsigned long find_trampoline_placement(void) +{ + unsigned long bios_start = 0, ebda_start = 0; + struct boot_e820_entry *entry; + char *signature; + int i; + + /* + * Find a suitable spot for the trampoline. + * This code is based on reserve_bios_regions(). + */ + + /* + * EFI systems may not provide legacy ROM. The memory may not be mapped + * at all. + * + * Only look for values in the legacy ROM for non-EFI system. + */ + signature = (char *)&boot_params_ptr->efi_info.efi_loader_signature; + if (strncmp(signature, EFI32_LOADER_SIGNATURE, 4) && + strncmp(signature, EFI64_LOADER_SIGNATURE, 4)) { + ebda_start = *(unsigned short *)0x40e << 4; + bios_start = *(unsigned short *)0x413 << 10; + } + + if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX) + bios_start = BIOS_START_MAX; + + if (ebda_start > BIOS_START_MIN && ebda_start < bios_start) + bios_start = ebda_start; + + bios_start = round_down(bios_start, PAGE_SIZE); + + /* Find the first usable memory region under bios_start. */ + for (i = boot_params_ptr->e820_entries - 1; i >= 0; i--) { + unsigned long new = bios_start; + + entry = &boot_params_ptr->e820_table[i]; + + /* Skip all entries above bios_start. */ + if (bios_start <= entry->addr) + continue; + + /* Skip non-RAM entries. */ + if (entry->type != E820_TYPE_RAM) + continue; + + /* Adjust bios_start to the end of the entry if needed. */ + if (bios_start > entry->addr + entry->size) + new = entry->addr + entry->size; + + /* Keep bios_start page-aligned. */ + new = round_down(new, PAGE_SIZE); + + /* Skip the entry if it's too small. */ + if (new - TRAMPOLINE_32BIT_SIZE < entry->addr) + continue; + + /* Protect against underflow. */ + if (new - TRAMPOLINE_32BIT_SIZE > bios_start) + break; + + bios_start = new; + break; + } + + /* Place the trampoline just below the end of low memory */ + return bios_start - TRAMPOLINE_32BIT_SIZE; +} + +asmlinkage void configure_5level_paging(struct boot_params *bp, void *pgtable) +{ + void (*toggle_la57)(void *cr3); + bool l5_required = false; + + /* Initialize boot_params. Required for cmdline_find_option_bool(). */ + sanitize_boot_params(bp); + boot_params_ptr = bp; + + /* + * Check if LA57 is desired and supported. + * + * There are several parts to the check: + * - if user asked to disable 5-level paging: no5lvl in cmdline + * - if the machine supports 5-level paging: + * + CPUID leaf 7 is supported + * + the leaf has the feature bit set + */ + if (!cmdline_find_option_bool("no5lvl") && + native_cpuid_eax(0) >= 7 && (native_cpuid_ecx(7) & BIT(16))) { + l5_required = true; + + /* Initialize variables for 5-level paging */ + __pgtable_l5_enabled = 1; + pgdir_shift = 48; + ptrs_per_p4d = 512; + } + + /* + * The trampoline will not be used if the paging mode is already set to + * the desired one. 
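As a worked example of find_trampoline_placement() above (all numbers invented): if the BIOS data area reports 632 KiB of base memory in the word at 0x413, giving 0x9e000, and an EBDA base of 0x9d800 from the segment word at 0x40e, the EBDA value wins because it is lower yet still above the 128 KiB sanity floor; it is rounded down to the page boundary 0x9d000, possibly lowered further so a whole TRAMPOLINE_32BIT_SIZE fits inside a RAM entry of the e820 table, and the trampoline ends up TRAMPOLINE_32BIT_SIZE bytes below the final value.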
+ */ + if (l5_required == !!(native_read_cr4() & X86_CR4_LA57)) + return; + + trampoline_32bit = (unsigned long *)find_trampoline_placement(); + + /* Preserve trampoline memory */ + memcpy(trampoline_save, trampoline_32bit, TRAMPOLINE_32BIT_SIZE); + + /* Clear trampoline memory first */ + memset(trampoline_32bit, 0, TRAMPOLINE_32BIT_SIZE); + + /* Copy trampoline code in place */ + toggle_la57 = memcpy(trampoline_32bit + + TRAMPOLINE_32BIT_CODE_OFFSET / sizeof(unsigned long), + &trampoline_32bit_src, TRAMPOLINE_32BIT_CODE_SIZE); + + /* + * Avoid the need for a stack in the 32-bit trampoline code, by using + * LJMP rather than LRET to return back to long mode. LJMP takes an + * immediate absolute address, which needs to be adjusted based on the + * placement of the trampoline. + */ + *(u32 *)((u8 *)toggle_la57 + trampoline_ljmp_imm_offset) += + (unsigned long)toggle_la57; + + /* + * The code below prepares page table in trampoline memory. + * + * The new page table will be used by trampoline code for switching + * from 4- to 5-level paging or vice versa. + */ + + if (l5_required) { + /* + * For 4- to 5-level paging transition, set up current CR3 as + * the first and the only entry in a new top-level page table. + */ + *trampoline_32bit = native_read_cr3_pa() | _PAGE_TABLE_NOENC; + } else { + u64 *new_cr3; + pgd_t *pgdp; + + /* + * For 5- to 4-level paging transition, copy page table pointed + * by first entry in the current top-level page table as our + * new top-level page table. + * + * We cannot just point to the page table from trampoline as it + * may be above 4G. + */ + pgdp = (pgd_t *)native_read_cr3_pa(); + new_cr3 = (u64 *)(native_pgd_val(pgdp[0]) & PTE_PFN_MASK); + memcpy(trampoline_32bit, new_cr3, PAGE_SIZE); + } + + toggle_la57(trampoline_32bit); + + /* + * Move the top level page table out of trampoline memory. + */ + memcpy(pgtable, trampoline_32bit, PAGE_SIZE); + native_write_cr3((unsigned long)pgtable); + + /* Restore trampoline memory */ + memcpy(trampoline_32bit, trampoline_save, TRAMPOLINE_32BIT_SIZE); +} diff --git a/arch/x86/boot/compressed/sbat.S b/arch/x86/boot/compressed/sbat.S new file mode 100644 index 000000000000..838f70a997dd --- /dev/null +++ b/arch/x86/boot/compressed/sbat.S @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Embed SBAT data in the kernel. + */ + .pushsection ".sbat", "a", @progbits + .incbin CONFIG_EFI_SBAT_FILE + .popsection diff --git a/arch/x86/boot/compressed/sev-handle-vc.c b/arch/x86/boot/compressed/sev-handle-vc.c new file mode 100644 index 000000000000..030001b46554 --- /dev/null +++ b/arch/x86/boot/compressed/sev-handle-vc.c @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "misc.h" +#include "error.h" +#include "sev.h" + +#include <linux/kernel.h> +#include <linux/string.h> +#include <asm/insn.h> +#include <asm/pgtable_types.h> +#include <asm/ptrace.h> +#include <asm/sev.h> +#include <asm/trapnr.h> +#include <asm/trap_pf.h> +#include <asm/fpu/xcr.h> + +#define __BOOT_COMPRESSED +#undef __init +#define __init + +/* Basic instruction decoding support needed */ +#include "../../lib/inat.c" +#include "../../lib/insn.c" + +/* + * Copy a version of this function here - insn-eval.c can't be used in + * pre-decompression code. 
+ */ +bool insn_has_rep_prefix(struct insn *insn) +{ + insn_byte_t p; + + insn_get_prefixes(insn); + + for_each_insn_prefix(insn, p) { + if (p == 0xf2 || p == 0xf3) + return true; + } + + return false; +} + +enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) +{ + char buffer[MAX_INSN_SIZE]; + int ret; + + memcpy(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); + + ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64); + if (ret < 0) + return ES_DECODE_FAILED; + + return ES_OK; +} + +extern void sev_insn_decode_init(void) __alias(inat_init_tables); + +/* + * Only a dummy for insn_get_seg_base() - Early boot-code is 64bit only and + * doesn't use segments. + */ +static unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx) +{ + return 0UL; +} + +static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, + void *dst, char *buf, size_t size) +{ + memcpy(dst, buf, size); + + return ES_OK; +} + +static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, + void *src, char *buf, size_t size) +{ + memcpy(buf, src, size); + + return ES_OK; +} + +static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size) +{ + return ES_OK; +} + +static bool fault_in_kernel_space(unsigned long address) +{ + return false; +} + +#define sev_printk(fmt, ...) + +#include "../../coco/sev/vc-shared.c" + +void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code) +{ + struct es_em_ctxt ctxt; + enum es_result result; + + if (!boot_ghcb && !early_setup_ghcb()) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); + + vc_ghcb_invalidate(boot_ghcb); + result = vc_init_em_ctxt(&ctxt, regs, exit_code); + if (result != ES_OK) + goto finish; + + result = vc_check_opcode_bytes(&ctxt, exit_code); + if (result != ES_OK) + goto finish; + + switch (exit_code) { + case SVM_EXIT_RDTSC: + case SVM_EXIT_RDTSCP: + result = vc_handle_rdtsc(boot_ghcb, &ctxt, exit_code); + break; + case SVM_EXIT_IOIO: + result = vc_handle_ioio(boot_ghcb, &ctxt); + break; + case SVM_EXIT_CPUID: + result = vc_handle_cpuid(boot_ghcb, &ctxt); + break; + default: + result = ES_UNSUPPORTED; + break; + } + +finish: + if (result == ES_OK) + vc_finish_insn(&ctxt); + else if (result != ES_RETRY) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); +} diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c new file mode 100644 index 000000000000..c8c1464b3a56 --- /dev/null +++ b/arch/x86/boot/compressed/sev.c @@ -0,0 +1,512 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AMD Encrypted Register State Support + * + * Author: Joerg Roedel <jroedel@suse.de> + */ + +/* + * misc.h needs to be first because it knows how to include the other kernel + * headers in the pre-decompression code in a way that does not break + * compilation. 
+ */ +#include "misc.h" + +#include <asm/bootparam.h> +#include <asm/pgtable_types.h> +#include <asm/shared/msr.h> +#include <asm/sev.h> +#include <asm/trapnr.h> +#include <asm/trap_pf.h> +#include <asm/msr-index.h> +#include <asm/fpu/xcr.h> +#include <asm/ptrace.h> +#include <asm/svm.h> +#include <asm/cpuid/api.h> + +#include "error.h" +#include "sev.h" + +static struct ghcb boot_ghcb_page __aligned(PAGE_SIZE); +struct ghcb *boot_ghcb; + +#undef __init +#define __init + +#define __BOOT_COMPRESSED + +u8 snp_vmpl; +u16 ghcb_version; + +u64 boot_svsm_caa_pa; + +/* Include code for early handlers */ +#include "../../boot/startup/sev-shared.c" + +static bool sev_snp_enabled(void) +{ + return sev_status & MSR_AMD64_SEV_SNP_ENABLED; +} + +void snp_set_page_private(unsigned long paddr) +{ + struct psc_desc d = { + SNP_PAGE_STATE_PRIVATE, + (struct svsm_ca *)boot_svsm_caa_pa, + boot_svsm_caa_pa + }; + + if (!sev_snp_enabled()) + return; + + __page_state_change(paddr, paddr, &d); +} + +void snp_set_page_shared(unsigned long paddr) +{ + struct psc_desc d = { + SNP_PAGE_STATE_SHARED, + (struct svsm_ca *)boot_svsm_caa_pa, + boot_svsm_caa_pa + }; + + if (!sev_snp_enabled()) + return; + + __page_state_change(paddr, paddr, &d); +} + +bool early_setup_ghcb(void) +{ + if (set_page_decrypted((unsigned long)&boot_ghcb_page)) + return false; + + /* Page is now mapped decrypted, clear it */ + memset(&boot_ghcb_page, 0, sizeof(boot_ghcb_page)); + + boot_ghcb = &boot_ghcb_page; + + /* Initialize lookup tables for the instruction decoder */ + sev_insn_decode_init(); + + /* SNP guest requires the GHCB GPA must be registered */ + if (sev_snp_enabled()) + snp_register_ghcb_early(__pa(&boot_ghcb_page)); + + return true; +} + +void snp_accept_memory(phys_addr_t start, phys_addr_t end) +{ + struct psc_desc d = { + SNP_PAGE_STATE_PRIVATE, + (struct svsm_ca *)boot_svsm_caa_pa, + boot_svsm_caa_pa + }; + + for (phys_addr_t pa = start; pa < end; pa += PAGE_SIZE) + __page_state_change(pa, pa, &d); +} + +void sev_es_shutdown_ghcb(void) +{ + if (!boot_ghcb) + return; + + if (!sev_es_check_cpu_features()) + error("SEV-ES CPU Features missing."); + + /* + * This denotes whether to use the GHCB MSR protocol or the GHCB + * shared page to perform a GHCB request. Since the GHCB page is + * being changed to encrypted, it can't be used to perform GHCB + * requests. Clear the boot_ghcb variable so that the GHCB MSR + * protocol is used to change the GHCB page over to an encrypted + * page. + */ + boot_ghcb = NULL; + + /* + * GHCB Page must be flushed from the cache and mapped encrypted again. + * Otherwise the running kernel will see strange cache effects when + * trying to use that page. + */ + if (set_page_encrypted((unsigned long)&boot_ghcb_page)) + error("Can't map GHCB page encrypted"); + + /* + * GHCB page is mapped encrypted again and flushed from the cache. + * Mark it non-present now to catch bugs when #VC exceptions trigger + * after this point. 
+ */ + if (set_page_non_present((unsigned long)&boot_ghcb_page)) + error("Can't unmap GHCB page"); +} + +static void __noreturn sev_es_ghcb_terminate(struct ghcb *ghcb, unsigned int set, + unsigned int reason, u64 exit_info_2) +{ + u64 exit_info_1 = SVM_VMGEXIT_TERM_REASON(set, reason); + + vc_ghcb_invalidate(ghcb); + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_TERM_REQUEST); + ghcb_set_sw_exit_info_1(ghcb, exit_info_1); + ghcb_set_sw_exit_info_2(ghcb, exit_info_2); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + while (true) + asm volatile("hlt\n" : : : "memory"); +} + +bool sev_es_check_ghcb_fault(unsigned long address) +{ + /* Check whether the fault was on the GHCB page */ + return ((address & PAGE_MASK) == (unsigned long)&boot_ghcb_page); +} + +/* + * SNP_FEATURES_IMPL_REQ is the mask of SNP features that will need + * guest side implementation for proper functioning of the guest. If any + * of these features are enabled in the hypervisor but are lacking guest + * side implementation, the behavior of the guest will be undefined. The + * guest could fail in non-obvious way making it difficult to debug. + * + * As the behavior of reserved feature bits is unknown to be on the + * safe side add them to the required features mask. + */ +#define SNP_FEATURES_IMPL_REQ (MSR_AMD64_SNP_VTOM | \ + MSR_AMD64_SNP_REFLECT_VC | \ + MSR_AMD64_SNP_RESTRICTED_INJ | \ + MSR_AMD64_SNP_ALT_INJ | \ + MSR_AMD64_SNP_DEBUG_SWAP | \ + MSR_AMD64_SNP_VMPL_SSS | \ + MSR_AMD64_SNP_SECURE_TSC | \ + MSR_AMD64_SNP_VMGEXIT_PARAM | \ + MSR_AMD64_SNP_VMSA_REG_PROT | \ + MSR_AMD64_SNP_RESERVED_BIT13 | \ + MSR_AMD64_SNP_RESERVED_BIT15 | \ + MSR_AMD64_SNP_SECURE_AVIC | \ + MSR_AMD64_SNP_RESERVED_MASK) + +#ifdef CONFIG_AMD_SECURE_AVIC +#define SNP_FEATURE_SECURE_AVIC MSR_AMD64_SNP_SECURE_AVIC +#else +#define SNP_FEATURE_SECURE_AVIC 0 +#endif + +/* + * SNP_FEATURES_PRESENT is the mask of SNP features that are implemented + * by the guest kernel. As and when a new feature is implemented in the + * guest kernel, a corresponding bit should be added to the mask. + */ +#define SNP_FEATURES_PRESENT (MSR_AMD64_SNP_DEBUG_SWAP | \ + MSR_AMD64_SNP_SECURE_TSC | \ + SNP_FEATURE_SECURE_AVIC) + +u64 snp_get_unsupported_features(u64 status) +{ + if (!(status & MSR_AMD64_SEV_SNP_ENABLED)) + return 0; + + return status & SNP_FEATURES_IMPL_REQ & ~SNP_FEATURES_PRESENT; +} + +void snp_check_features(void) +{ + u64 unsupported; + + /* + * Terminate the boot if hypervisor has enabled any feature lacking + * guest side implementation. Pass on the unsupported features mask through + * EXIT_INFO_2 of the GHCB protocol so that those features can be reported + * as part of the guest boot failure. + */ + unsupported = snp_get_unsupported_features(sev_status); + if (unsupported) { + if (ghcb_version < 2 || (!boot_ghcb && !early_setup_ghcb())) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); + + sev_es_ghcb_terminate(boot_ghcb, SEV_TERM_SET_GEN, + GHCB_SNP_UNSUPPORTED, unsupported); + } +} + +/* Search for Confidential Computing blob in the EFI config table. 
*/ +static struct cc_blob_sev_info *find_cc_blob_efi(struct boot_params *bp) +{ + unsigned long cfg_table_pa; + unsigned int cfg_table_len; + int ret; + + ret = efi_get_conf_table(bp, &cfg_table_pa, &cfg_table_len); + if (ret) + return NULL; + + return (struct cc_blob_sev_info *)efi_find_vendor_table(bp, cfg_table_pa, + cfg_table_len, + EFI_CC_BLOB_GUID); +} + +/* + * Initial set up of SNP relies on information provided by the + * Confidential Computing blob, which can be passed to the boot kernel + * by firmware/bootloader in the following ways: + * + * - via an entry in the EFI config table + * - via a setup_data structure, as defined by the Linux Boot Protocol + * + * Scan for the blob in that order. + */ +static struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) +{ + struct cc_blob_sev_info *cc_info; + + cc_info = find_cc_blob_efi(bp); + if (cc_info) + goto found_cc_info; + + cc_info = find_cc_blob_setup_data(bp); + if (!cc_info) + return NULL; + +found_cc_info: + if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); + + return cc_info; +} + +/* + * Indicate SNP based on presence of SNP-specific CC blob. Subsequent checks + * will verify the SNP CPUID/MSR bits. + */ +static bool early_snp_init(struct boot_params *bp) +{ + struct cc_blob_sev_info *cc_info; + + if (!bp) + return false; + + cc_info = find_cc_blob(bp); + if (!cc_info) + return false; + + /* + * If a SNP-specific Confidential Computing blob is present, then + * firmware/bootloader have indicated SNP support. Verifying this + * involves CPUID checks which will be more reliable if the SNP + * CPUID table is used. See comments over snp_setup_cpuid_table() for + * more details. + */ + setup_cpuid_table(cc_info); + + /* + * Record the SVSM Calling Area (CA) address if the guest is not + * running at VMPL0. The CA will be used to communicate with the + * SVSM and request its services. + */ + svsm_setup_ca(cc_info, rip_rel_ptr(&boot_ghcb_page)); + + /* + * Pass run-time kernel a pointer to CC info via boot_params so EFI + * config table doesn't need to be searched again during early startup + * phase. + */ + bp->cc_blob_address = (u32)(unsigned long)cc_info; + + return true; +} + +/* + * sev_check_cpu_support - Check for SEV support in the CPU capabilities + * + * Returns < 0 if SEV is not supported, otherwise the position of the + * encryption bit in the page table descriptors. + */ +static int sev_check_cpu_support(void) +{ + unsigned int eax, ebx, ecx, edx; + + /* Check for the SME/SEV support leaf */ + eax = 0x80000000; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + if (eax < 0x8000001f) + return -ENODEV; + + /* + * Check for the SME/SEV feature: + * CPUID Fn8000_001F[EAX] + * - Bit 0 - Secure Memory Encryption support + * - Bit 1 - Secure Encrypted Virtualization support + * CPUID Fn8000_001F[EBX] + * - Bits 5:0 - Pagetable bit position used to indicate encryption + */ + eax = 0x8000001f; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + /* Check whether SEV is supported */ + if (!(eax & BIT(1))) + return -ENODEV; + + sev_snp_needs_sfw = !(ebx & BIT(31)); + + return ebx & 0x3f; +} + +void sev_enable(struct boot_params *bp) +{ + struct msr m; + int bitpos; + bool snp; + + /* + * bp->cc_blob_address should only be set by boot/compressed kernel. + * Initialize it to 0 to ensure that uninitialized values from + * buggy bootloaders aren't propagated. 
+ */ + if (bp) + bp->cc_blob_address = 0; + + /* + * Do an initial SEV capability check before early_snp_init() which + * loads the CPUID page and the same checks afterwards are done + * without the hypervisor and are trustworthy. + * + * If the HV fakes SEV support, the guest will crash'n'burn + * which is good enough. + */ + + if (sev_check_cpu_support() < 0) + return; + + /* + * Setup/preliminary detection of SNP. This will be sanity-checked + * against CPUID/MSR values later. + */ + snp = early_snp_init(bp); + + /* Now repeat the checks with the SNP CPUID table. */ + + bitpos = sev_check_cpu_support(); + if (bitpos < 0) { + if (snp) + error("SEV-SNP support indicated by CC blob, but not CPUID."); + return; + } + + /* Set the SME mask if this is an SEV guest. */ + raw_rdmsr(MSR_AMD64_SEV, &m); + sev_status = m.q; + if (!(sev_status & MSR_AMD64_SEV_ENABLED)) + return; + + /* Negotiate the GHCB protocol version. */ + if (sev_status & MSR_AMD64_SEV_ES_ENABLED) { + if (!sev_es_negotiate_protocol()) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_PROT_UNSUPPORTED); + } + + /* + * SNP is supported in v2 of the GHCB spec which mandates support for HV + * features. + */ + if (sev_status & MSR_AMD64_SEV_SNP_ENABLED) { + u64 hv_features; + + hv_features = get_hv_features(); + if (!(hv_features & GHCB_HV_FT_SNP)) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); + + /* + * Running at VMPL0 is required unless an SVSM is present and + * the hypervisor supports the required SVSM GHCB events. + */ + if (snp_vmpl && !(hv_features & GHCB_HV_FT_SNP_MULTI_VMPL)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_NOT_VMPL0); + } + + if (snp && !(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) + error("SEV-SNP supported indicated by CC blob, but not SEV status MSR."); + + sme_me_mask = BIT_ULL(bitpos); +} + +/* + * sev_get_status - Retrieve the SEV status mask + * + * Returns 0 if the CPU is not SEV capable, otherwise the value of the + * AMD64_SEV MSR. + */ +u64 sev_get_status(void) +{ + struct msr m; + + if (sev_check_cpu_support() < 0) + return 0; + + raw_rdmsr(MSR_AMD64_SEV, &m); + return m.q; +} + +void sev_prep_identity_maps(unsigned long top_level_pgt) +{ + /* + * The Confidential Computing blob is used very early in uncompressed + * kernel to find the in-memory CPUID table to handle CPUID + * instructions. Make sure an identity-mapping exists so it can be + * accessed after switchover. + */ + if (sev_snp_enabled()) { + unsigned long cc_info_pa = boot_params_ptr->cc_blob_address; + struct cc_blob_sev_info *cc_info; + + kernel_add_identity_map(cc_info_pa, cc_info_pa + sizeof(*cc_info)); + + cc_info = (struct cc_blob_sev_info *)cc_info_pa; + kernel_add_identity_map(cc_info->cpuid_phys, cc_info->cpuid_phys + cc_info->cpuid_len); + } + + sev_verify_cbit(top_level_pgt); +} + +bool early_is_sevsnp_guest(void) +{ + static bool sevsnp; + + if (sevsnp) + return true; + + if (!(sev_get_status() & MSR_AMD64_SEV_SNP_ENABLED)) + return false; + + sevsnp = true; + + if (!snp_vmpl) { + unsigned int eax, ebx, ecx, edx; + + /* + * CPUID Fn8000_001F_EAX[28] - SVSM support + */ + eax = 0x8000001f; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + if (eax & BIT(28)) { + struct msr m; + + /* Obtain the address of the calling area to use */ + raw_rdmsr(MSR_SVSM_CAA, &m); + boot_svsm_caa_pa = m.q; + + /* + * The real VMPL level cannot be discovered, but the + * memory acceptance routines make no use of that so + * any non-zero value suffices here. 
+ */ + snp_vmpl = U8_MAX; + } + } + return true; +} diff --git a/arch/x86/boot/compressed/sev.h b/arch/x86/boot/compressed/sev.h new file mode 100644 index 000000000000..22637b416b46 --- /dev/null +++ b/arch/x86/boot/compressed/sev.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * AMD SEV header for early boot related functions. + * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + */ + +#ifndef BOOT_COMPRESSED_SEV_H +#define BOOT_COMPRESSED_SEV_H + +#ifdef CONFIG_AMD_MEM_ENCRYPT + +#include <asm/shared/msr.h> + +void snp_accept_memory(phys_addr_t start, phys_addr_t end); +u64 sev_get_status(void); +bool early_is_sevsnp_guest(void); + +static inline u64 sev_es_rd_ghcb_msr(void) +{ + struct msr m; + + raw_rdmsr(MSR_AMD64_SEV_ES_GHCB, &m); + + return m.q; +} + +static inline void sev_es_wr_ghcb_msr(u64 val) +{ + struct msr m; + + m.q = val; + raw_wrmsr(MSR_AMD64_SEV_ES_GHCB, &m); +} + +#else + +static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { } +static inline u64 sev_get_status(void) { return 0; } +static inline bool early_is_sevsnp_guest(void) { return false; } + +#endif + +#endif diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c index ffb9c5c9d748..9af19d9614cb 100644 --- a/arch/x86/boot/compressed/string.c +++ b/arch/x86/boot/compressed/string.c @@ -1,11 +1,81 @@ -#include "misc.h" +// SPDX-License-Identifier: GPL-2.0 +/* + * This provides an optimized implementation of memcpy, and a simplified + * implementation of memset and memmove. These are used here because the + * standard kernel runtime versions are not yet available and we don't + * trust the gcc built-in implementations as they may do unexpected things + * (e.g. FPU ops) in the minimal decompression stub execution environment. + */ +#include "error.h" -int memcmp(const void *s1, const void *s2, size_t len) +#include "../string.c" + +#ifdef CONFIG_X86_32 +static void *____memcpy(void *dest, const void *src, size_t n) { - u8 diff; - asm("repe; cmpsb; setnz %0" - : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len)); - return diff; + int d0, d1, d2; + asm volatile( + "rep movsl\n\t" + "movl %4,%%ecx\n\t" + "rep movsb" + : "=&c" (d0), "=&D" (d1), "=&S" (d2) + : "0" (n >> 2), "g" (n & 3), "1" (dest), "2" (src) + : "memory"); + + return dest; } +#else +static void *____memcpy(void *dest, const void *src, size_t n) +{ + long d0, d1, d2; + asm volatile( + "rep movsq\n\t" + "movq %4,%%rcx\n\t" + "rep movsb" + : "=&c" (d0), "=&D" (d1), "=&S" (d2) + : "0" (n >> 3), "g" (n & 7), "1" (dest), "2" (src) + : "memory"); -#include "../string.c" + return dest; +} +#endif + +void *memset(void *s, int c, size_t n) +{ + int i; + char *ss = s; + + for (i = 0; i < n; i++) + ss[i] = c; + return s; +} + +void *memmove(void *dest, const void *src, size_t n) +{ + unsigned char *d = dest; + const unsigned char *s = src; + + if (d <= s || d - s >= n) + return ____memcpy(dest, src, n); + + while (n-- > 0) + d[n] = s[n]; + + return dest; +} + +/* Detect and warn about potential overlaps, but handle them with memmove. 
*/ +void *memcpy(void *dest, const void *src, size_t n) +{ + if (dest > src && dest - src < n) { + warn("Avoiding potentially unsafe overlapping memcpy()!"); + return memmove(dest, src, n); + } + return ____memcpy(dest, src, n); +} + +#ifdef CONFIG_KASAN +extern void *__memset(void *s, int c, size_t n) __alias(memset); +extern void *__memmove(void *dest, const void *src, size_t n) __alias(memmove); +extern void *__memcpy(void *dest, const void *src, size_t n) __alias(memcpy); +#endif diff --git a/arch/x86/boot/compressed/tdcall.S b/arch/x86/boot/compressed/tdcall.S new file mode 100644 index 000000000000..46d0495e0d3a --- /dev/null +++ b/arch/x86/boot/compressed/tdcall.S @@ -0,0 +1,3 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include "../../coco/tdx/tdcall.S" diff --git a/arch/x86/boot/compressed/tdx-shared.c b/arch/x86/boot/compressed/tdx-shared.c new file mode 100644 index 000000000000..5ac43762fe13 --- /dev/null +++ b/arch/x86/boot/compressed/tdx-shared.c @@ -0,0 +1,2 @@ +#include "error.h" +#include "../../coco/tdx/tdx-shared.c" diff --git a/arch/x86/boot/compressed/tdx.c b/arch/x86/boot/compressed/tdx.c new file mode 100644 index 000000000000..8451d6a1030c --- /dev/null +++ b/arch/x86/boot/compressed/tdx.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "../cpuflags.h" +#include "../string.h" +#include "../io.h" +#include "error.h" + +#include <vdso/limits.h> +#include <uapi/asm/vmx.h> + +#include <asm/shared/tdx.h> + +/* Called from __tdx_hypercall() for unrecoverable failure */ +void __tdx_hypercall_failed(void) +{ + error("TDVMCALL failed. TDX module bug?"); +} + +static inline unsigned int tdx_io_in(int size, u16 port) +{ + struct tdx_module_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION), + .r12 = size, + .r13 = 0, + .r14 = port, + }; + + if (__tdx_hypercall(&args)) + return UINT_MAX; + + return args.r11; +} + +static inline void tdx_io_out(int size, u16 port, u32 value) +{ + struct tdx_module_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION), + .r12 = size, + .r13 = 1, + .r14 = port, + .r15 = value, + }; + + __tdx_hypercall(&args); +} + +static inline u8 tdx_inb(u16 port) +{ + return tdx_io_in(1, port); +} + +static inline void tdx_outb(u8 value, u16 port) +{ + tdx_io_out(1, port, value); +} + +static inline void tdx_outw(u16 value, u16 port) +{ + tdx_io_out(2, port, value); +} + +void early_tdx_detect(void) +{ + u32 eax, sig[3]; + + cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, &sig[0], &sig[2], &sig[1]); + + if (memcmp(TDX_IDENT, sig, sizeof(sig))) + return; + + /* Use hypercalls instead of I/O instructions */ + pio_ops.f_inb = tdx_inb; + pio_ops.f_outb = tdx_outb; + pio_ops.f_outw = tdx_outw; +} diff --git a/arch/x86/boot/compressed/tdx.h b/arch/x86/boot/compressed/tdx.h new file mode 100644 index 000000000000..9055482cd35c --- /dev/null +++ b/arch/x86/boot/compressed/tdx.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef BOOT_COMPRESSED_TDX_H +#define BOOT_COMPRESSED_TDX_H + +#include <linux/types.h> + +#ifdef CONFIG_INTEL_TDX_GUEST +void early_tdx_detect(void); +#else +static inline void early_tdx_detect(void) { }; +#endif + +#endif /* BOOT_COMPRESSED_TDX_H */ diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S index 34d047c98284..587ce3e7c504 100644 --- a/arch/x86/boot/compressed/vmlinux.lds.S +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -1,6 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include 
<asm-generic/vmlinux.lds.h> -OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT) +OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT) #undef i386 @@ -33,6 +34,7 @@ SECTIONS _text = .; /* Text */ *(.text) *(.text.*) + *(.noinstr.text) _etext = . ; } .rodata : { @@ -41,16 +43,21 @@ SECTIONS *(.rodata.*) _erodata = . ; } - .got : { - _got = .; - KEEP(*(.got.plt)) - KEEP(*(.got)) - _egot = .; +#ifdef CONFIG_EFI_SBAT + .sbat : ALIGN(0x1000) { + _sbat = . ; + *(.sbat) + _esbat = ALIGN(0x1000); + . = _esbat; } - .data : { +#endif + .data : ALIGN(0x1000) { _data = . ; *(.data) *(.data.*) + + /* Add 4 bytes of extra space for the obsolete CRC-32 checksum */ + . = ALIGN(. + 4, 0x200); _edata = . ; } . = ALIGN(L1_CACHE_BYTES); @@ -70,5 +77,52 @@ SECTIONS _epgtable = . ; } #endif + . = ALIGN(PAGE_SIZE); /* keep ZO size page aligned */ _end = .; + + STABS_DEBUG + DWARF_DEBUG + ELF_DETAILS + + DISCARDS + /DISCARD/ : { + *(.dynamic) *(.dynsym) *(.dynstr) *(.dynbss) + *(.hash) *(.gnu.hash) + *(.note.*) + } + + .got.plt (INFO) : { + *(.got.plt) + } + ASSERT(SIZEOF(.got.plt) == 0 || +#ifdef CONFIG_X86_64 + SIZEOF(.got.plt) == 0x18, +#else + SIZEOF(.got.plt) == 0xc, +#endif + "Unexpected GOT/PLT entries detected!") + + /* + * Sections that should stay zero sized, which is safer to + * explicitly check instead of blindly discarding. + */ + .got : { + *(.got) + } + ASSERT(SIZEOF(.got) == 0, "Unexpected GOT entries detected!") + + .plt : { + *(.plt) *(.plt.*) + } + ASSERT(SIZEOF(.plt) == 0, "Unexpected run-time procedure linkages detected!") + + .rel.dyn : { + *(.rel.*) *(.rel_*) + } + ASSERT(SIZEOF(.rel.dyn) == 0, "Unexpected run-time relocations (.rel) detected!") + + .rela.dyn : { + *(.rela.*) *(.rela_*) + } + ASSERT(SIZEOF(.rela.dyn) == 0, "Unexpected run-time relocations (.rela) detected!") } |
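The snp_check_features() path in sev.c above gates boot on simple mask arithmetic: any feature bit the hypervisor reports in sev_status that is in the required-implementation mask but missing from the guest-implemented mask is fatal. A minimal stand-alone sketch of that check, using hypothetical bit positions in place of the real MSR_AMD64_SNP_* constants:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for the MSR_AMD64_SNP_* feature bits. */
#define FEAT_VTOM		(1ULL << 3)
#define FEAT_DEBUG_SWAP		(1ULL << 5)
#define FEAT_SECURE_TSC		(1ULL << 9)

/* Features needing guest-side support vs. features this guest implements. */
#define FEATURES_IMPL_REQ	(FEAT_VTOM | FEAT_DEBUG_SWAP | FEAT_SECURE_TSC)
#define FEATURES_PRESENT	(FEAT_DEBUG_SWAP | FEAT_SECURE_TSC)

/* Mirrors snp_get_unsupported_features(): required-but-unimplemented bits. */
static uint64_t unsupported(uint64_t status)
{
	return status & FEATURES_IMPL_REQ & ~FEATURES_PRESENT;
}

int main(void)
{
	/* Hypervisor enabled VTOM and DEBUG_SWAP; VTOM is not implemented here. */
	uint64_t status = FEAT_VTOM | FEAT_DEBUG_SWAP;

	printf("unsupported mask: %#llx\n", (unsigned long long)unsupported(status));
	return 0;
}

With these assumed values the result is bit 3 (the VTOM stand-in); in the real code a non-zero mask is handed to sev_es_ghcb_terminate() as exit_info_2 so the reason for the boot failure is visible to the host.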

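string.c's memcpy() above only needs to catch forward overlap (dest inside [src, src + n)), since that is the one case a forward rep-string copy would corrupt; it warns and falls back to memmove(). A small host-side sketch of the same predicate, assuming ordinary C object pointers rather than the decompressor's identity-mapped physical addresses:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Same predicate as the decompressor's memcpy(): forward overlap only. */
static bool forward_overlap(const void *dest, const void *src, size_t n)
{
	const char *d = dest;
	const char *s = src;

	return d > s && (size_t)(d - s) < n;
}

int main(void)
{
	char buf[16];

	/* Copying buf[0..7] to buf + 2 overlaps: would take the memmove() path. */
	printf("%d\n", forward_overlap(buf + 2, buf, 8));
	/* Disjoint ranges (or dest below src) keep the plain rep-move copy. */
	printf("%d\n", forward_overlap(buf, buf + 8, 8));
	return 0;
}

A copy with dest below src is safe even when the ranges overlap, because a forward copy reads each byte before it can be overwritten, which is why the check is one-sided.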