summaryrefslogtreecommitdiff
path: root/arch/x86/boot/compressed
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/boot/compressed')
-rw-r--r--arch/x86/boot/compressed/.gitignore1
-rw-r--r--arch/x86/boot/compressed/Makefile137
-rw-r--r--arch/x86/boot/compressed/acpi.c317
-rw-r--r--arch/x86/boot/compressed/cmdline.c11
-rw-r--r--arch/x86/boot/compressed/cpuflags.c9
-rw-r--r--arch/x86/boot/compressed/early_serial_console.c7
-rw-r--r--arch/x86/boot/compressed/eboot.c1278
-rw-r--r--arch/x86/boot/compressed/eboot.h71
-rw-r--r--arch/x86/boot/compressed/efi.c236
-rw-r--r--arch/x86/boot/compressed/efi.h127
-rw-r--r--arch/x86/boot/compressed/efi_stub_32.S86
-rw-r--r--arch/x86/boot/compressed/efi_stub_64.S1
-rw-r--r--arch/x86/boot/compressed/error.c43
-rw-r--r--arch/x86/boot/compressed/error.h11
-rw-r--r--arch/x86/boot/compressed/head_32.S207
-rw-r--r--arch/x86/boot/compressed/head_64.S406
-rw-r--r--arch/x86/boot/compressed/ident_map_64.c393
-rw-r--r--arch/x86/boot/compressed/idt_64.c92
-rw-r--r--arch/x86/boot/compressed/idt_handlers_64.S78
-rw-r--r--arch/x86/boot/compressed/kaslr.c908
-rw-r--r--arch/x86/boot/compressed/kernel_info.S22
-rw-r--r--arch/x86/boot/compressed/mem.c86
-rw-r--r--arch/x86/boot/compressed/mem_encrypt.S324
-rw-r--r--arch/x86/boot/compressed/misc.c503
-rw-r--r--arch/x86/boot/compressed/misc.h225
-rw-r--r--arch/x86/boot/compressed/mkpiggy.c61
-rw-r--r--arch/x86/boot/compressed/pgtable_64.c200
-rw-r--r--arch/x86/boot/compressed/sbat.S7
-rw-r--r--arch/x86/boot/compressed/sev-handle-vc.c136
-rw-r--r--arch/x86/boot/compressed/sev.c512
-rw-r--r--arch/x86/boot/compressed/sev.h44
-rw-r--r--arch/x86/boot/compressed/string.c84
-rw-r--r--arch/x86/boot/compressed/tdcall.S3
-rw-r--r--arch/x86/boot/compressed/tdx-shared.c2
-rw-r--r--arch/x86/boot/compressed/tdx.c77
-rw-r--r--arch/x86/boot/compressed/tdx.h13
-rw-r--r--arch/x86/boot/compressed/vmlinux.lds.S68
37 files changed, 4820 insertions, 1966 deletions
diff --git a/arch/x86/boot/compressed/.gitignore b/arch/x86/boot/compressed/.gitignore
index 4a46fab7162e..25805199a506 100644
--- a/arch/x86/boot/compressed/.gitignore
+++ b/arch/x86/boot/compressed/.gitignore
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
relocs
vmlinux.bin.all
vmlinux.relocs
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index dcd90df10ab4..68f9d7a1683b 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -1,54 +1,134 @@
+# SPDX-License-Identifier: GPL-2.0
#
# linux/arch/x86/boot/compressed/Makefile
#
# create a compressed vmlinux image from the original vmlinux
#
+# vmlinuz is:
+# decompression code (*.o)
+# asm globals (piggy.S), including:
+# vmlinux.bin.(gz|bz2|lzma|...)
+#
+# vmlinux.bin is:
+# vmlinux stripped of debugging and comments
+# vmlinux.bin.all is:
+# vmlinux.bin + vmlinux.relocs
+# vmlinux.bin.(gz|bz2|lzma|...) is:
+# (see scripts/Makefile.lib size_append)
+# compressed vmlinux.bin.all + u32 size of vmlinux.bin.all
targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \
- vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4
-
-KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2
-KBUILD_CFLAGS += -fno-strict-aliasing -fPIC
+ vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 vmlinux.bin.zst
+
+# CLANG_FLAGS must come before any cc-disable-warning or cc-option calls in
+# case of cross compiling, as it has the '--target=' flag, which is needed to
+# avoid errors with '-march=i386', and future flags may depend on the target to
+# be valid.
+KBUILD_CFLAGS := -m$(BITS) -O2 $(CLANG_FLAGS)
+KBUILD_CFLAGS += -std=gnu11 -fms-extensions
+KBUILD_CFLAGS += -fno-strict-aliasing -fPIE
+KBUILD_CFLAGS += -Wundef
KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
cflags-$(CONFIG_X86_32) := -march=i386
-cflags-$(CONFIG_X86_64) := -mcmodel=small
+cflags-$(CONFIG_X86_64) := -mcmodel=small -mno-red-zone
KBUILD_CFLAGS += $(cflags-y)
-KBUILD_CFLAGS += $(call cc-option,-ffreestanding)
-KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector)
+KBUILD_CFLAGS += -mno-mmx -mno-sse
+KBUILD_CFLAGS += -ffreestanding -fshort-wchar
+KBUILD_CFLAGS += -fno-stack-protector
+KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member)
+ifdef CONFIG_CC_IS_CLANG
+KBUILD_CFLAGS += -Wno-gnu
+KBUILD_CFLAGS += -Wno-microsoft-anon-tag
+endif
+KBUILD_CFLAGS += -Wno-pointer-sign
+KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
+KBUILD_CFLAGS += -D__DISABLE_EXPORTS
+# Disable relocation relaxation in case the link is not PIE.
+KBUILD_CFLAGS += $(call cc-option,-Wa$(comma)-mrelax-relocations=no)
+KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h
+
+# sev-decode-insn.c indirectly includes inat-table.c which is generated during
+# compilation and stored in $(objtree). Add the directory to the includes so
+# that the compiler finds it even with out-of-tree builds (make O=/some/path).
+CFLAGS_sev-handle-vc.o += -I$(objtree)/arch/x86/lib/
KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
-GCOV_PROFILE := n
-LDFLAGS := -m elf_$(UTS_MACHINE)
-LDFLAGS_vmlinux := -T
+KBUILD_LDFLAGS := -m elf_$(UTS_MACHINE)
+KBUILD_LDFLAGS += $(call ld-option,--no-ld-generated-unwind-info)
+# Compressed kernel should be built as PIE since it may be loaded at any
+# address by the bootloader.
+LDFLAGS_vmlinux := -pie $(call ld-option, --no-dynamic-linker)
+ifdef CONFIG_LD_ORPHAN_WARN
+LDFLAGS_vmlinux += --orphan-handling=$(CONFIG_LD_ORPHAN_WARN_LEVEL)
+endif
+LDFLAGS_vmlinux += -z noexecstack
+ifeq ($(CONFIG_LD_IS_BFD),y)
+LDFLAGS_vmlinux += $(call ld-option,--no-warn-rwx-segments)
+endif
+ifeq ($(CONFIG_EFI_STUB),y)
+# ensure that the static EFI stub library will be pulled in, even if it is
+# never referenced explicitly from the startup code
+LDFLAGS_vmlinux += -u efi_pe_entry
+endif
+LDFLAGS_vmlinux += -T
-hostprogs-y := mkpiggy
+hostprogs := mkpiggy
HOST_EXTRACFLAGS += -I$(srctree)/tools/include
-VMLINUX_OBJS = $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \
- $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o \
- $(obj)/piggy.o
+sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABbCDGRSTtVW] \(_text\|__start_rodata\|_sinittext\|__inittext_end\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p'
+
+quiet_cmd_voffset = VOFFSET $@
+ cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@
+
+targets += ../voffset.h
-$(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone
+$(obj)/../voffset.h: vmlinux FORCE
+ $(call if_changed,voffset)
-ifeq ($(CONFIG_EFI_STUB), y)
- VMLINUX_OBJS += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o
+$(obj)/misc.o: $(obj)/../voffset.h
+
+vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/kernel_info.o $(obj)/head_$(BITS).o \
+ $(obj)/misc.o $(obj)/string.o $(obj)/cmdline.o $(obj)/error.o \
+ $(obj)/piggy.o $(obj)/cpuflags.o
+
+vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o
+vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o
+ifdef CONFIG_X86_64
+ vmlinux-objs-y += $(obj)/ident_map_64.o
+ vmlinux-objs-y += $(obj)/idt_64.o $(obj)/idt_handlers_64.o
+ vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/mem_encrypt.o
+ vmlinux-objs-y += $(obj)/pgtable_64.o
+ vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev.o $(obj)/sev-handle-vc.o
endif
-$(obj)/vmlinux: $(VMLINUX_OBJS) FORCE
+vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o
+vmlinux-objs-$(CONFIG_INTEL_TDX_GUEST) += $(obj)/tdx.o $(obj)/tdcall.o $(obj)/tdx-shared.o
+vmlinux-objs-$(CONFIG_UNACCEPTED_MEMORY) += $(obj)/mem.o
+
+vmlinux-objs-$(CONFIG_EFI) += $(obj)/efi.o
+vmlinux-libs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a
+vmlinux-libs-$(CONFIG_X86_64) += $(objtree)/arch/x86/boot/startup/lib.a
+vmlinux-objs-$(CONFIG_EFI_SBAT) += $(obj)/sbat.o
+
+ifdef CONFIG_EFI_SBAT
+$(obj)/sbat.o: $(CONFIG_EFI_SBAT_FILE)
+endif
+
+$(obj)/vmlinux: $(vmlinux-objs-y) $(vmlinux-libs-y) FORCE
$(call if_changed,ld)
- @:
OBJCOPYFLAGS_vmlinux.bin := -R .comment -S
$(obj)/vmlinux.bin: vmlinux FORCE
$(call if_changed,objcopy)
-targets += $(patsubst $(obj)/%,%,$(VMLINUX_OBJS)) vmlinux.bin.all vmlinux.relocs
+targets += $(patsubst $(obj)/%,%,$(vmlinux-objs-y)) vmlinux.bin.all vmlinux.relocs
CMD_RELOCS = arch/x86/tools/relocs
quiet_cmd_relocs = RELOCS $@
cmd_relocs = $(CMD_RELOCS) $< > $@;$(CMD_RELOCS) --abs-relocs $<
-$(obj)/vmlinux.relocs: vmlinux FORCE
+
+$(obj)/vmlinux.relocs: vmlinux.unstripped FORCE
$(call if_changed,relocs)
vmlinux.bin.all-y := $(obj)/vmlinux.bin
@@ -57,15 +137,17 @@ vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs
$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE
$(call if_changed,gzip)
$(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
- $(call if_changed,bzip2)
+ $(call if_changed,bzip2_with_size)
$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE
- $(call if_changed,lzma)
+ $(call if_changed,lzma_with_size)
$(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE
- $(call if_changed,xzkern)
+ $(call if_changed,xzkern_with_size)
$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE
- $(call if_changed,lzo)
+ $(call if_changed,lzo_with_size)
$(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE
- $(call if_changed,lz4)
+ $(call if_changed,lz4_with_size)
+$(obj)/vmlinux.bin.zst: $(vmlinux.bin.all-y) FORCE
+ $(call if_changed,zstd22_with_size)
suffix-$(CONFIG_KERNEL_GZIP) := gz
suffix-$(CONFIG_KERNEL_BZIP2) := bz2
@@ -73,9 +155,10 @@ suffix-$(CONFIG_KERNEL_LZMA) := lzma
suffix-$(CONFIG_KERNEL_XZ) := xz
suffix-$(CONFIG_KERNEL_LZO) := lzo
suffix-$(CONFIG_KERNEL_LZ4) := lz4
+suffix-$(CONFIG_KERNEL_ZSTD) := zst
quiet_cmd_mkpiggy = MKPIGGY $@
- cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false )
+ cmd_mkpiggy = $(obj)/mkpiggy $< > $@
targets += piggy.S
$(obj)/piggy.S: $(obj)/vmlinux.bin.$(suffix-y) $(obj)/mkpiggy FORCE
diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c
new file mode 100644
index 000000000000..f196b1d1ddf8
--- /dev/null
+++ b/arch/x86/boot/compressed/acpi.c
@@ -0,0 +1,317 @@
+// SPDX-License-Identifier: GPL-2.0
+#define BOOT_CTYPE_H
+#include "misc.h"
+#include "error.h"
+#include "../string.h"
+#include "efi.h"
+
+#include <asm/bootparam.h>
+
+#include <linux/numa.h>
+
+/*
+ * Longest parameter of 'acpi=' is 'copy_dsdt', plus an extra '\0'
+ * for termination.
+ */
+#define MAX_ACPI_ARG_LENGTH 10
+
+/*
+ * Immovable memory regions representation. Max amount of memory regions is
+ * MAX_NUMNODES*2.
+ */
+struct mem_vector immovable_mem[MAX_NUMNODES*2];
+
+static acpi_physical_address
+__efi_get_rsdp_addr(unsigned long cfg_tbl_pa, unsigned int cfg_tbl_len)
+{
+#ifdef CONFIG_EFI
+ unsigned long rsdp_addr;
+ int ret;
+
+ /*
+ * Search EFI system tables for RSDP. Preferred is ACPI_20_TABLE_GUID to
+ * ACPI_TABLE_GUID because it has more features.
+ */
+ rsdp_addr = efi_find_vendor_table(boot_params_ptr, cfg_tbl_pa, cfg_tbl_len,
+ ACPI_20_TABLE_GUID);
+ if (rsdp_addr)
+ return (acpi_physical_address)rsdp_addr;
+
+ /* No ACPI_20_TABLE_GUID found, fallback to ACPI_TABLE_GUID. */
+ rsdp_addr = efi_find_vendor_table(boot_params_ptr, cfg_tbl_pa, cfg_tbl_len,
+ ACPI_TABLE_GUID);
+ if (rsdp_addr)
+ return (acpi_physical_address)rsdp_addr;
+
+ debug_putstr("Error getting RSDP address.\n");
+#endif
+ return 0;
+}
+
+static acpi_physical_address efi_get_rsdp_addr(void)
+{
+#ifdef CONFIG_EFI
+ unsigned long cfg_tbl_pa = 0;
+ unsigned int cfg_tbl_len;
+ unsigned long systab_pa;
+ unsigned int nr_tables;
+ enum efi_type et;
+ int ret;
+
+ et = efi_get_type(boot_params_ptr);
+ if (et == EFI_TYPE_NONE)
+ return 0;
+
+ systab_pa = efi_get_system_table(boot_params_ptr);
+ if (!systab_pa)
+ error("EFI support advertised, but unable to locate system table.");
+
+ ret = efi_get_conf_table(boot_params_ptr, &cfg_tbl_pa, &cfg_tbl_len);
+ if (ret || !cfg_tbl_pa)
+ error("EFI config table not found.");
+
+ return __efi_get_rsdp_addr(cfg_tbl_pa, cfg_tbl_len);
+#else
+ return 0;
+#endif
+}
+
+static u8 compute_checksum(u8 *buffer, u32 length)
+{
+ u8 *end = buffer + length;
+ u8 sum = 0;
+
+ while (buffer < end)
+ sum += *(buffer++);
+
+ return sum;
+}
+
+/* Search a block of memory for the RSDP signature. */
+static u8 *scan_mem_for_rsdp(u8 *start, u32 length)
+{
+ struct acpi_table_rsdp *rsdp;
+ u8 *address, *end;
+
+ end = start + length;
+
+ /* Search from given start address for the requested length */
+ for (address = start; address < end; address += ACPI_RSDP_SCAN_STEP) {
+ /*
+ * Both RSDP signature and checksum must be correct.
+ * Note: Sometimes there exists more than one RSDP in memory;
+ * the valid RSDP has a valid checksum, all others have an
+ * invalid checksum.
+ */
+ rsdp = (struct acpi_table_rsdp *)address;
+
+ /* BAD Signature */
+ if (!ACPI_VALIDATE_RSDP_SIG(rsdp->signature))
+ continue;
+
+ /* Check the standard checksum */
+ if (compute_checksum((u8 *)rsdp, ACPI_RSDP_CHECKSUM_LENGTH))
+ continue;
+
+ /* Check extended checksum if table version >= 2 */
+ if ((rsdp->revision >= 2) &&
+ (compute_checksum((u8 *)rsdp, ACPI_RSDP_XCHECKSUM_LENGTH)))
+ continue;
+
+ /* Signature and checksum valid, we have found a real RSDP */
+ return address;
+ }
+ return NULL;
+}
+
+/* Search RSDP address in EBDA. */
+static acpi_physical_address bios_get_rsdp_addr(void)
+{
+ unsigned long address;
+ u8 *rsdp;
+
+ /* Get the location of the Extended BIOS Data Area (EBDA) */
+ address = *(u16 *)ACPI_EBDA_PTR_LOCATION;
+ address <<= 4;
+
+ /*
+ * Search EBDA paragraphs (EBDA is required to be a minimum of
+ * 1K length)
+ */
+ if (address > 0x400) {
+ rsdp = scan_mem_for_rsdp((u8 *)address, ACPI_EBDA_WINDOW_SIZE);
+ if (rsdp)
+ return (acpi_physical_address)(unsigned long)rsdp;
+ }
+
+ /* Search upper memory: 16-byte boundaries in E0000h-FFFFFh */
+ rsdp = scan_mem_for_rsdp((u8 *) ACPI_HI_RSDP_WINDOW_BASE,
+ ACPI_HI_RSDP_WINDOW_SIZE);
+ if (rsdp)
+ return (acpi_physical_address)(unsigned long)rsdp;
+
+ return 0;
+}
+
+/* Return RSDP address on success, otherwise 0. */
+acpi_physical_address get_rsdp_addr(void)
+{
+ acpi_physical_address pa;
+
+ pa = boot_params_ptr->acpi_rsdp_addr;
+
+ if (!pa)
+ pa = efi_get_rsdp_addr();
+
+ if (!pa)
+ pa = bios_get_rsdp_addr();
+
+ return pa;
+}
+
+#if defined(CONFIG_RANDOMIZE_BASE) && defined(CONFIG_MEMORY_HOTREMOVE)
+/*
+ * Max length of 64-bit hex address string is 19, prefix "0x" + 16 hex
+ * digits, and '\0' for termination.
+ */
+#define MAX_ADDR_LEN 19
+
+static unsigned long get_cmdline_acpi_rsdp(void)
+{
+ unsigned long addr = 0;
+
+#ifdef CONFIG_KEXEC_CORE
+ char val[MAX_ADDR_LEN] = { };
+ int ret;
+
+ ret = cmdline_find_option("acpi_rsdp", val, MAX_ADDR_LEN);
+ if (ret < 0)
+ return 0;
+
+ if (boot_kstrtoul(val, 16, &addr))
+ return 0;
+#endif
+ return addr;
+}
+
+/* Compute SRAT address from RSDP. */
+static unsigned long get_acpi_srat_table(void)
+{
+ unsigned long root_table, acpi_table;
+ struct acpi_table_header *header;
+ struct acpi_table_rsdp *rsdp;
+ u32 num_entries, size, len;
+ char arg[10];
+ u8 *entry;
+
+ /*
+ * Check whether we were given an RSDP on the command line. We don't
+ * stash this in boot params because the kernel itself may have
+ * different ideas about whether to trust a command-line parameter.
+ */
+ rsdp = (struct acpi_table_rsdp *)get_cmdline_acpi_rsdp();
+ if (!rsdp)
+ rsdp = (struct acpi_table_rsdp *)(long)
+ boot_params_ptr->acpi_rsdp_addr;
+
+ if (!rsdp)
+ return 0;
+
+ /* Get ACPI root table from RSDP.*/
+ if (!(cmdline_find_option("acpi", arg, sizeof(arg)) == 4 &&
+ !strncmp(arg, "rsdt", 4)) &&
+ rsdp->xsdt_physical_address &&
+ rsdp->revision > 1) {
+ root_table = rsdp->xsdt_physical_address;
+ size = ACPI_XSDT_ENTRY_SIZE;
+ } else {
+ root_table = rsdp->rsdt_physical_address;
+ size = ACPI_RSDT_ENTRY_SIZE;
+ }
+
+ if (!root_table)
+ return 0;
+
+ header = (struct acpi_table_header *)root_table;
+ len = header->length;
+ if (len < sizeof(struct acpi_table_header) + size)
+ return 0;
+
+ num_entries = (len - sizeof(struct acpi_table_header)) / size;
+ entry = (u8 *)(root_table + sizeof(struct acpi_table_header));
+
+ while (num_entries--) {
+ if (size == ACPI_RSDT_ENTRY_SIZE)
+ acpi_table = *(u32 *)entry;
+ else
+ acpi_table = *(u64 *)entry;
+
+ if (acpi_table) {
+ header = (struct acpi_table_header *)acpi_table;
+
+ if (ACPI_COMPARE_NAMESEG(header->signature, ACPI_SIG_SRAT))
+ return acpi_table;
+ }
+ entry += size;
+ }
+ return 0;
+}
+
+/**
+ * count_immovable_mem_regions - Parse SRAT and cache the immovable
+ * memory regions into the immovable_mem array.
+ *
+ * Return the number of immovable memory regions on success, 0 on failure:
+ *
+ * - Too many immovable memory regions
+ * - ACPI off or no SRAT found
+ * - No immovable memory region found.
+ */
+int count_immovable_mem_regions(void)
+{
+ unsigned long table_addr, table_end, table;
+ struct acpi_subtable_header *sub_table;
+ struct acpi_table_header *table_header;
+ char arg[MAX_ACPI_ARG_LENGTH];
+ int num = 0;
+
+ if (cmdline_find_option("acpi", arg, sizeof(arg)) == 3 &&
+ !strncmp(arg, "off", 3))
+ return 0;
+
+ table_addr = get_acpi_srat_table();
+ if (!table_addr)
+ return 0;
+
+ table_header = (struct acpi_table_header *)table_addr;
+ table_end = table_addr + table_header->length;
+ table = table_addr + sizeof(struct acpi_table_srat);
+
+ while (table + sizeof(struct acpi_subtable_header) < table_end) {
+
+ sub_table = (struct acpi_subtable_header *)table;
+ if (!sub_table->length) {
+ debug_putstr("Invalid zero length SRAT subtable.\n");
+ return 0;
+ }
+
+ if (sub_table->type == ACPI_SRAT_TYPE_MEMORY_AFFINITY) {
+ struct acpi_srat_mem_affinity *ma;
+
+ ma = (struct acpi_srat_mem_affinity *)sub_table;
+ if (!(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && ma->length) {
+ immovable_mem[num].start = ma->base_address;
+ immovable_mem[num].size = ma->length;
+ num++;
+ }
+
+ if (num >= MAX_NUMNODES*2) {
+ debug_putstr("Too many immovable memory regions, aborting.\n");
+ return 0;
+ }
+ }
+ table += sub_table->length;
+ }
+ return num;
+}
+#endif /* CONFIG_RANDOMIZE_BASE && CONFIG_MEMORY_HOTREMOVE */
diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c
index bffd73b45b1f..e162d7f59cc5 100644
--- a/arch/x86/boot/compressed/cmdline.c
+++ b/arch/x86/boot/compressed/cmdline.c
@@ -1,6 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
#include "misc.h"
-#ifdef CONFIG_EARLY_PRINTK
+#include <asm/bootparam.h>
static unsigned long fs;
static inline void set_fs(unsigned long seg)
@@ -13,11 +14,11 @@ static inline char rdfs8(addr_t addr)
return *((char *)(fs + addr));
}
#include "../cmdline.c"
-static unsigned long get_cmd_line_ptr(void)
+unsigned long get_cmd_line_ptr(void)
{
- unsigned long cmd_line_ptr = real_mode->hdr.cmd_line_ptr;
+ unsigned long cmd_line_ptr = boot_params_ptr->hdr.cmd_line_ptr;
- cmd_line_ptr |= (u64)real_mode->ext_cmd_line_ptr << 32;
+ cmd_line_ptr |= (u64)boot_params_ptr->ext_cmd_line_ptr << 32;
return cmd_line_ptr;
}
@@ -29,5 +30,3 @@ int cmdline_find_option_bool(const char *option)
{
return __cmdline_find_option_bool(get_cmd_line_ptr(), option);
}
-
-#endif
diff --git a/arch/x86/boot/compressed/cpuflags.c b/arch/x86/boot/compressed/cpuflags.c
new file mode 100644
index 000000000000..0cc1323896d1
--- /dev/null
+++ b/arch/x86/boot/compressed/cpuflags.c
@@ -0,0 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "../cpuflags.c"
+
+bool has_cpuflag(int flag)
+{
+ get_cpuflags();
+
+ return test_bit(flag, cpu.flags);
+}
diff --git a/arch/x86/boot/compressed/early_serial_console.c b/arch/x86/boot/compressed/early_serial_console.c
index d3d003cb5481..70a8d1706d0f 100644
--- a/arch/x86/boot/compressed/early_serial_console.c
+++ b/arch/x86/boot/compressed/early_serial_console.c
@@ -1,9 +1,6 @@
#include "misc.h"
-#ifdef CONFIG_EARLY_PRINTK
-
-int early_serial_base;
+/* This might be accessed before .bss is cleared, so use .data instead. */
+int early_serial_base __section(".data");
#include "../early_serial_console.c"
-
-#endif
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
deleted file mode 100644
index d606463aa6d6..000000000000
--- a/arch/x86/boot/compressed/eboot.c
+++ /dev/null
@@ -1,1278 +0,0 @@
-/* -----------------------------------------------------------------------
- *
- * Copyright 2011 Intel Corporation; author Matt Fleming
- *
- * This file is part of the Linux kernel, and is made available under
- * the terms of the GNU General Public License version 2.
- *
- * ----------------------------------------------------------------------- */
-
-#include <linux/efi.h>
-#include <linux/pci.h>
-#include <asm/efi.h>
-#include <asm/setup.h>
-#include <asm/desc.h>
-
-#undef memcpy /* Use memcpy from misc.c */
-
-#include "eboot.h"
-
-static efi_system_table_t *sys_table;
-
-static void efi_char16_printk(efi_char16_t *str)
-{
- struct efi_simple_text_output_protocol *out;
-
- out = (struct efi_simple_text_output_protocol *)sys_table->con_out;
- efi_call_phys2(out->output_string, out, str);
-}
-
-static void efi_printk(char *str)
-{
- char *s8;
-
- for (s8 = str; *s8; s8++) {
- efi_char16_t ch[2] = { 0 };
-
- ch[0] = *s8;
- if (*s8 == '\n') {
- efi_char16_t nl[2] = { '\r', 0 };
- efi_char16_printk(nl);
- }
-
- efi_char16_printk(ch);
- }
-}
-
-static efi_status_t __get_map(efi_memory_desc_t **map, unsigned long *map_size,
- unsigned long *desc_size)
-{
- efi_memory_desc_t *m = NULL;
- efi_status_t status;
- unsigned long key;
- u32 desc_version;
-
- *map_size = sizeof(*m) * 32;
-again:
- /*
- * Add an additional efi_memory_desc_t because we're doing an
- * allocation which may be in a new descriptor region.
- */
- *map_size += sizeof(*m);
- status = efi_call_phys3(sys_table->boottime->allocate_pool,
- EFI_LOADER_DATA, *map_size, (void **)&m);
- if (status != EFI_SUCCESS)
- goto fail;
-
- status = efi_call_phys5(sys_table->boottime->get_memory_map, map_size,
- m, &key, desc_size, &desc_version);
- if (status == EFI_BUFFER_TOO_SMALL) {
- efi_call_phys1(sys_table->boottime->free_pool, m);
- goto again;
- }
-
- if (status != EFI_SUCCESS)
- efi_call_phys1(sys_table->boottime->free_pool, m);
-
-fail:
- *map = m;
- return status;
-}
-
-/*
- * Allocate at the highest possible address that is not above 'max'.
- */
-static efi_status_t high_alloc(unsigned long size, unsigned long align,
- unsigned long *addr, unsigned long max)
-{
- unsigned long map_size, desc_size;
- efi_memory_desc_t *map;
- efi_status_t status;
- unsigned long nr_pages;
- u64 max_addr = 0;
- int i;
-
- status = __get_map(&map, &map_size, &desc_size);
- if (status != EFI_SUCCESS)
- goto fail;
-
- nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
-again:
- for (i = 0; i < map_size / desc_size; i++) {
- efi_memory_desc_t *desc;
- unsigned long m = (unsigned long)map;
- u64 start, end;
-
- desc = (efi_memory_desc_t *)(m + (i * desc_size));
- if (desc->type != EFI_CONVENTIONAL_MEMORY)
- continue;
-
- if (desc->num_pages < nr_pages)
- continue;
-
- start = desc->phys_addr;
- end = start + desc->num_pages * (1UL << EFI_PAGE_SHIFT);
-
- if ((start + size) > end || (start + size) > max)
- continue;
-
- if (end - size > max)
- end = max;
-
- if (round_down(end - size, align) < start)
- continue;
-
- start = round_down(end - size, align);
-
- /*
- * Don't allocate at 0x0. It will confuse code that
- * checks pointers against NULL.
- */
- if (start == 0x0)
- continue;
-
- if (start > max_addr)
- max_addr = start;
- }
-
- if (!max_addr)
- status = EFI_NOT_FOUND;
- else {
- status = efi_call_phys4(sys_table->boottime->allocate_pages,
- EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA,
- nr_pages, &max_addr);
- if (status != EFI_SUCCESS) {
- max = max_addr;
- max_addr = 0;
- goto again;
- }
-
- *addr = max_addr;
- }
-
-free_pool:
- efi_call_phys1(sys_table->boottime->free_pool, map);
-
-fail:
- return status;
-}
-
-/*
- * Allocate at the lowest possible address.
- */
-static efi_status_t low_alloc(unsigned long size, unsigned long align,
- unsigned long *addr)
-{
- unsigned long map_size, desc_size;
- efi_memory_desc_t *map;
- efi_status_t status;
- unsigned long nr_pages;
- int i;
-
- status = __get_map(&map, &map_size, &desc_size);
- if (status != EFI_SUCCESS)
- goto fail;
-
- nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
- for (i = 0; i < map_size / desc_size; i++) {
- efi_memory_desc_t *desc;
- unsigned long m = (unsigned long)map;
- u64 start, end;
-
- desc = (efi_memory_desc_t *)(m + (i * desc_size));
-
- if (desc->type != EFI_CONVENTIONAL_MEMORY)
- continue;
-
- if (desc->num_pages < nr_pages)
- continue;
-
- start = desc->phys_addr;
- end = start + desc->num_pages * (1UL << EFI_PAGE_SHIFT);
-
- /*
- * Don't allocate at 0x0. It will confuse code that
- * checks pointers against NULL. Skip the first 8
- * bytes so we start at a nice even number.
- */
- if (start == 0x0)
- start += 8;
-
- start = round_up(start, align);
- if ((start + size) > end)
- continue;
-
- status = efi_call_phys4(sys_table->boottime->allocate_pages,
- EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA,
- nr_pages, &start);
- if (status == EFI_SUCCESS) {
- *addr = start;
- break;
- }
- }
-
- if (i == map_size / desc_size)
- status = EFI_NOT_FOUND;
-
-free_pool:
- efi_call_phys1(sys_table->boottime->free_pool, map);
-fail:
- return status;
-}
-
-static void low_free(unsigned long size, unsigned long addr)
-{
- unsigned long nr_pages;
-
- nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
- efi_call_phys2(sys_table->boottime->free_pages, addr, size);
-}
-
-static void find_bits(unsigned long mask, u8 *pos, u8 *size)
-{
- u8 first, len;
-
- first = 0;
- len = 0;
-
- if (mask) {
- while (!(mask & 0x1)) {
- mask = mask >> 1;
- first++;
- }
-
- while (mask & 0x1) {
- mask = mask >> 1;
- len++;
- }
- }
-
- *pos = first;
- *size = len;
-}
-
-static efi_status_t setup_efi_pci(struct boot_params *params)
-{
- efi_pci_io_protocol *pci;
- efi_status_t status;
- void **pci_handle;
- efi_guid_t pci_proto = EFI_PCI_IO_PROTOCOL_GUID;
- unsigned long nr_pci, size = 0;
- int i;
- struct setup_data *data;
-
- data = (struct setup_data *)(unsigned long)params->hdr.setup_data;
-
- while (data && data->next)
- data = (struct setup_data *)(unsigned long)data->next;
-
- status = efi_call_phys5(sys_table->boottime->locate_handle,
- EFI_LOCATE_BY_PROTOCOL, &pci_proto,
- NULL, &size, pci_handle);
-
- if (status == EFI_BUFFER_TOO_SMALL) {
- status = efi_call_phys3(sys_table->boottime->allocate_pool,
- EFI_LOADER_DATA, size, &pci_handle);
-
- if (status != EFI_SUCCESS)
- return status;
-
- status = efi_call_phys5(sys_table->boottime->locate_handle,
- EFI_LOCATE_BY_PROTOCOL, &pci_proto,
- NULL, &size, pci_handle);
- }
-
- if (status != EFI_SUCCESS)
- goto free_handle;
-
- nr_pci = size / sizeof(void *);
- for (i = 0; i < nr_pci; i++) {
- void *h = pci_handle[i];
- uint64_t attributes;
- struct pci_setup_rom *rom;
-
- status = efi_call_phys3(sys_table->boottime->handle_protocol,
- h, &pci_proto, &pci);
-
- if (status != EFI_SUCCESS)
- continue;
-
- if (!pci)
- continue;
-
-#ifdef CONFIG_X86_64
- status = efi_call_phys4(pci->attributes, pci,
- EfiPciIoAttributeOperationGet, 0,
- &attributes);
-#else
- status = efi_call_phys5(pci->attributes, pci,
- EfiPciIoAttributeOperationGet, 0, 0,
- &attributes);
-#endif
- if (status != EFI_SUCCESS)
- continue;
-
- if (!pci->romimage || !pci->romsize)
- continue;
-
- size = pci->romsize + sizeof(*rom);
-
- status = efi_call_phys3(sys_table->boottime->allocate_pool,
- EFI_LOADER_DATA, size, &rom);
-
- if (status != EFI_SUCCESS)
- continue;
-
- rom->data.type = SETUP_PCI;
- rom->data.len = size - sizeof(struct setup_data);
- rom->data.next = 0;
- rom->pcilen = pci->romsize;
-
- status = efi_call_phys5(pci->pci.read, pci,
- EfiPciIoWidthUint16, PCI_VENDOR_ID,
- 1, &(rom->vendor));
-
- if (status != EFI_SUCCESS)
- goto free_struct;
-
- status = efi_call_phys5(pci->pci.read, pci,
- EfiPciIoWidthUint16, PCI_DEVICE_ID,
- 1, &(rom->devid));
-
- if (status != EFI_SUCCESS)
- goto free_struct;
-
- status = efi_call_phys5(pci->get_location, pci,
- &(rom->segment), &(rom->bus),
- &(rom->device), &(rom->function));
-
- if (status != EFI_SUCCESS)
- goto free_struct;
-
- memcpy(rom->romdata, pci->romimage, pci->romsize);
-
- if (data)
- data->next = (unsigned long)rom;
- else
- params->hdr.setup_data = (unsigned long)rom;
-
- data = (struct setup_data *)rom;
-
- continue;
- free_struct:
- efi_call_phys1(sys_table->boottime->free_pool, rom);
- }
-
-free_handle:
- efi_call_phys1(sys_table->boottime->free_pool, pci_handle);
- return status;
-}
-
-/*
- * See if we have Graphics Output Protocol
- */
-static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto,
- unsigned long size)
-{
- struct efi_graphics_output_protocol *gop, *first_gop;
- struct efi_pixel_bitmask pixel_info;
- unsigned long nr_gops;
- efi_status_t status;
- void **gop_handle;
- u16 width, height;
- u32 fb_base, fb_size;
- u32 pixels_per_scan_line;
- int pixel_format;
- int i;
-
- status = efi_call_phys3(sys_table->boottime->allocate_pool,
- EFI_LOADER_DATA, size, &gop_handle);
- if (status != EFI_SUCCESS)
- return status;
-
- status = efi_call_phys5(sys_table->boottime->locate_handle,
- EFI_LOCATE_BY_PROTOCOL, proto,
- NULL, &size, gop_handle);
- if (status != EFI_SUCCESS)
- goto free_handle;
-
- first_gop = NULL;
-
- nr_gops = size / sizeof(void *);
- for (i = 0; i < nr_gops; i++) {
- struct efi_graphics_output_mode_info *info;
- efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID;
- bool conout_found = false;
- void *dummy;
- void *h = gop_handle[i];
-
- status = efi_call_phys3(sys_table->boottime->handle_protocol,
- h, proto, &gop);
- if (status != EFI_SUCCESS)
- continue;
-
- status = efi_call_phys3(sys_table->boottime->handle_protocol,
- h, &conout_proto, &dummy);
-
- if (status == EFI_SUCCESS)
- conout_found = true;
-
- status = efi_call_phys4(gop->query_mode, gop,
- gop->mode->mode, &size, &info);
- if (status == EFI_SUCCESS && (!first_gop || conout_found)) {
- /*
- * Systems that use the UEFI Console Splitter may
- * provide multiple GOP devices, not all of which are
- * backed by real hardware. The workaround is to search
- * for a GOP implementing the ConOut protocol, and if
- * one isn't found, to just fall back to the first GOP.
- */
- width = info->horizontal_resolution;
- height = info->vertical_resolution;
- fb_base = gop->mode->frame_buffer_base;
- fb_size = gop->mode->frame_buffer_size;
- pixel_format = info->pixel_format;
- pixel_info = info->pixel_information;
- pixels_per_scan_line = info->pixels_per_scan_line;
-
- /*
- * Once we've found a GOP supporting ConOut,
- * don't bother looking any further.
- */
- first_gop = gop;
- if (conout_found)
- break;
- }
- }
-
- /* Did we find any GOPs? */
- if (!first_gop)
- goto free_handle;
-
- /* EFI framebuffer */
- si->orig_video_isVGA = VIDEO_TYPE_EFI;
-
- si->lfb_width = width;
- si->lfb_height = height;
- si->lfb_base = fb_base;
- si->pages = 1;
-
- if (pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) {
- si->lfb_depth = 32;
- si->lfb_linelength = pixels_per_scan_line * 4;
- si->red_size = 8;
- si->red_pos = 0;
- si->green_size = 8;
- si->green_pos = 8;
- si->blue_size = 8;
- si->blue_pos = 16;
- si->rsvd_size = 8;
- si->rsvd_pos = 24;
- } else if (pixel_format == PIXEL_BGR_RESERVED_8BIT_PER_COLOR) {
- si->lfb_depth = 32;
- si->lfb_linelength = pixels_per_scan_line * 4;
- si->red_size = 8;
- si->red_pos = 16;
- si->green_size = 8;
- si->green_pos = 8;
- si->blue_size = 8;
- si->blue_pos = 0;
- si->rsvd_size = 8;
- si->rsvd_pos = 24;
- } else if (pixel_format == PIXEL_BIT_MASK) {
- find_bits(pixel_info.red_mask, &si->red_pos, &si->red_size);
- find_bits(pixel_info.green_mask, &si->green_pos,
- &si->green_size);
- find_bits(pixel_info.blue_mask, &si->blue_pos, &si->blue_size);
- find_bits(pixel_info.reserved_mask, &si->rsvd_pos,
- &si->rsvd_size);
- si->lfb_depth = si->red_size + si->green_size +
- si->blue_size + si->rsvd_size;
- si->lfb_linelength = (pixels_per_scan_line * si->lfb_depth) / 8;
- } else {
- si->lfb_depth = 4;
- si->lfb_linelength = si->lfb_width / 2;
- si->red_size = 0;
- si->red_pos = 0;
- si->green_size = 0;
- si->green_pos = 0;
- si->blue_size = 0;
- si->blue_pos = 0;
- si->rsvd_size = 0;
- si->rsvd_pos = 0;
- }
-
- si->lfb_size = si->lfb_linelength * si->lfb_height;
-
- si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS;
-
-free_handle:
- efi_call_phys1(sys_table->boottime->free_pool, gop_handle);
- return status;
-}
-
-/*
- * See if we have Universal Graphics Adapter (UGA) protocol
- */
-static efi_status_t setup_uga(struct screen_info *si, efi_guid_t *uga_proto,
- unsigned long size)
-{
- struct efi_uga_draw_protocol *uga, *first_uga;
- unsigned long nr_ugas;
- efi_status_t status;
- u32 width, height;
- void **uga_handle = NULL;
- int i;
-
- status = efi_call_phys3(sys_table->boottime->allocate_pool,
- EFI_LOADER_DATA, size, &uga_handle);
- if (status != EFI_SUCCESS)
- return status;
-
- status = efi_call_phys5(sys_table->boottime->locate_handle,
- EFI_LOCATE_BY_PROTOCOL, uga_proto,
- NULL, &size, uga_handle);
- if (status != EFI_SUCCESS)
- goto free_handle;
-
- first_uga = NULL;
-
- nr_ugas = size / sizeof(void *);
- for (i = 0; i < nr_ugas; i++) {
- efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID;
- void *handle = uga_handle[i];
- u32 w, h, depth, refresh;
- void *pciio;
-
- status = efi_call_phys3(sys_table->boottime->handle_protocol,
- handle, uga_proto, &uga);
- if (status != EFI_SUCCESS)
- continue;
-
- efi_call_phys3(sys_table->boottime->handle_protocol,
- handle, &pciio_proto, &pciio);
-
- status = efi_call_phys5(uga->get_mode, uga, &w, &h,
- &depth, &refresh);
- if (status == EFI_SUCCESS && (!first_uga || pciio)) {
- width = w;
- height = h;
-
- /*
- * Once we've found a UGA supporting PCIIO,
- * don't bother looking any further.
- */
- if (pciio)
- break;
-
- first_uga = uga;
- }
- }
-
- if (!first_uga)
- goto free_handle;
-
- /* EFI framebuffer */
- si->orig_video_isVGA = VIDEO_TYPE_EFI;
-
- si->lfb_depth = 32;
- si->lfb_width = width;
- si->lfb_height = height;
-
- si->red_size = 8;
- si->red_pos = 16;
- si->green_size = 8;
- si->green_pos = 8;
- si->blue_size = 8;
- si->blue_pos = 0;
- si->rsvd_size = 8;
- si->rsvd_pos = 24;
-
-
-free_handle:
- efi_call_phys1(sys_table->boottime->free_pool, uga_handle);
- return status;
-}
-
-void setup_graphics(struct boot_params *boot_params)
-{
- efi_guid_t graphics_proto = EFI_GRAPHICS_OUTPUT_PROTOCOL_GUID;
- struct screen_info *si;
- efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID;
- efi_status_t status;
- unsigned long size;
- void **gop_handle = NULL;
- void **uga_handle = NULL;
-
- si = &boot_params->screen_info;
- memset(si, 0, sizeof(*si));
-
- size = 0;
- status = efi_call_phys5(sys_table->boottime->locate_handle,
- EFI_LOCATE_BY_PROTOCOL, &graphics_proto,
- NULL, &size, gop_handle);
- if (status == EFI_BUFFER_TOO_SMALL)
- status = setup_gop(si, &graphics_proto, size);
-
- if (status != EFI_SUCCESS) {
- size = 0;
- status = efi_call_phys5(sys_table->boottime->locate_handle,
- EFI_LOCATE_BY_PROTOCOL, &uga_proto,
- NULL, &size, uga_handle);
- if (status == EFI_BUFFER_TOO_SMALL)
- setup_uga(si, &uga_proto, size);
- }
-}
-
-struct initrd {
- efi_file_handle_t *handle;
- u64 size;
-};
-
-/*
- * Check the cmdline for a LILO-style initrd= arguments.
- *
- * We only support loading an initrd from the same filesystem as the
- * kernel image.
- */
-static efi_status_t handle_ramdisks(efi_loaded_image_t *image,
- struct setup_header *hdr)
-{
- struct initrd *initrds;
- unsigned long initrd_addr;
- efi_guid_t fs_proto = EFI_FILE_SYSTEM_GUID;
- u64 initrd_total;
- efi_file_io_interface_t *io;
- efi_file_handle_t *fh;
- efi_status_t status;
- int nr_initrds;
- char *str;
- int i, j, k;
-
- initrd_addr = 0;
- initrd_total = 0;
-
- str = (char *)(unsigned long)hdr->cmd_line_ptr;
-
- j = 0; /* See close_handles */
-
- if (!str || !*str)
- return EFI_SUCCESS;
-
- for (nr_initrds = 0; *str; nr_initrds++) {
- str = strstr(str, "initrd=");
- if (!str)
- break;
-
- str += 7;
-
- /* Skip any leading slashes */
- while (*str == '/' || *str == '\\')
- str++;
-
- while (*str && *str != ' ' && *str != '\n')
- str++;
- }
-
- if (!nr_initrds)
- return EFI_SUCCESS;
-
- status = efi_call_phys3(sys_table->boottime->allocate_pool,
- EFI_LOADER_DATA,
- nr_initrds * sizeof(*initrds),
- &initrds);
- if (status != EFI_SUCCESS) {
- efi_printk("Failed to alloc mem for initrds\n");
- goto fail;
- }
-
- str = (char *)(unsigned long)hdr->cmd_line_ptr;
- for (i = 0; i < nr_initrds; i++) {
- struct initrd *initrd;
- efi_file_handle_t *h;
- efi_file_info_t *info;
- efi_char16_t filename_16[256];
- unsigned long info_sz;
- efi_guid_t info_guid = EFI_FILE_INFO_ID;
- efi_char16_t *p;
- u64 file_sz;
-
- str = strstr(str, "initrd=");
- if (!str)
- break;
-
- str += 7;
-
- initrd = &initrds[i];
- p = filename_16;
-
- /* Skip any leading slashes */
- while (*str == '/' || *str == '\\')
- str++;
-
- while (*str && *str != ' ' && *str != '\n') {
- if ((u8 *)p >= (u8 *)filename_16 + sizeof(filename_16))
- break;
-
- if (*str == '/') {
- *p++ = '\\';
- *str++;
- } else {
- *p++ = *str++;
- }
- }
-
- *p = '\0';
-
- /* Only open the volume once. */
- if (!i) {
- efi_boot_services_t *boottime;
-
- boottime = sys_table->boottime;
-
- status = efi_call_phys3(boottime->handle_protocol,
- image->device_handle, &fs_proto, &io);
- if (status != EFI_SUCCESS) {
- efi_printk("Failed to handle fs_proto\n");
- goto free_initrds;
- }
-
- status = efi_call_phys2(io->open_volume, io, &fh);
- if (status != EFI_SUCCESS) {
- efi_printk("Failed to open volume\n");
- goto free_initrds;
- }
- }
-
- status = efi_call_phys5(fh->open, fh, &h, filename_16,
- EFI_FILE_MODE_READ, (u64)0);
- if (status != EFI_SUCCESS) {
- efi_printk("Failed to open initrd file: ");
- efi_char16_printk(filename_16);
- efi_printk("\n");
- goto close_handles;
- }
-
- initrd->handle = h;
-
- info_sz = 0;
- status = efi_call_phys4(h->get_info, h, &info_guid,
- &info_sz, NULL);
- if (status != EFI_BUFFER_TOO_SMALL) {
- efi_printk("Failed to get initrd info size\n");
- goto close_handles;
- }
-
-grow:
- status = efi_call_phys3(sys_table->boottime->allocate_pool,
- EFI_LOADER_DATA, info_sz, &info);
- if (status != EFI_SUCCESS) {
- efi_printk("Failed to alloc mem for initrd info\n");
- goto close_handles;
- }
-
- status = efi_call_phys4(h->get_info, h, &info_guid,
- &info_sz, info);
- if (status == EFI_BUFFER_TOO_SMALL) {
- efi_call_phys1(sys_table->boottime->free_pool, info);
- goto grow;
- }
-
- file_sz = info->file_size;
- efi_call_phys1(sys_table->boottime->free_pool, info);
-
- if (status != EFI_SUCCESS) {
- efi_printk("Failed to get initrd info\n");
- goto close_handles;
- }
-
- initrd->size = file_sz;
- initrd_total += file_sz;
- }
-
- if (initrd_total) {
- unsigned long addr;
-
- /*
- * Multiple initrd's need to be at consecutive
- * addresses in memory, so allocate enough memory for
- * all the initrd's.
- */
- status = high_alloc(initrd_total, 0x1000,
- &initrd_addr, hdr->initrd_addr_max);
- if (status != EFI_SUCCESS) {
- efi_printk("Failed to alloc highmem for initrds\n");
- goto close_handles;
- }
-
- /* We've run out of free low memory. */
- if (initrd_addr > hdr->initrd_addr_max) {
- efi_printk("We've run out of free low memory\n");
- status = EFI_INVALID_PARAMETER;
- goto free_initrd_total;
- }
-
- addr = initrd_addr;
- for (j = 0; j < nr_initrds; j++) {
- u64 size;
-
- size = initrds[j].size;
- while (size) {
- u64 chunksize;
- if (size > EFI_READ_CHUNK_SIZE)
- chunksize = EFI_READ_CHUNK_SIZE;
- else
- chunksize = size;
- status = efi_call_phys3(fh->read,
- initrds[j].handle,
- &chunksize, addr);
- if (status != EFI_SUCCESS) {
- efi_printk("Failed to read initrd\n");
- goto free_initrd_total;
- }
- addr += chunksize;
- size -= chunksize;
- }
-
- efi_call_phys1(fh->close, initrds[j].handle);
- }
-
- }
-
- efi_call_phys1(sys_table->boottime->free_pool, initrds);
-
- hdr->ramdisk_image = initrd_addr;
- hdr->ramdisk_size = initrd_total;
-
- return status;
-
-free_initrd_total:
- low_free(initrd_total, initrd_addr);
-
-close_handles:
- for (k = j; k < i; k++)
- efi_call_phys1(fh->close, initrds[k].handle);
-free_initrds:
- efi_call_phys1(sys_table->boottime->free_pool, initrds);
-fail:
- hdr->ramdisk_image = 0;
- hdr->ramdisk_size = 0;
-
- return status;
-}
-
-/*
- * Because the x86 boot code expects to be passed a boot_params we
- * need to create one ourselves (usually the bootloader would create
- * one for us).
- */
-struct boot_params *make_boot_params(void *handle, efi_system_table_t *_table)
-{
- struct boot_params *boot_params;
- struct sys_desc_table *sdt;
- struct apm_bios_info *bi;
- struct setup_header *hdr;
- struct efi_info *efi;
- efi_loaded_image_t *image;
- void *options;
- u32 load_options_size;
- efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID;
- int options_size = 0;
- efi_status_t status;
- unsigned long cmdline;
- u16 *s2;
- u8 *s1;
- int i;
-
- sys_table = _table;
-
- /* Check if we were booted by the EFI firmware */
- if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
- return NULL;
-
- status = efi_call_phys3(sys_table->boottime->handle_protocol,
- handle, &proto, (void *)&image);
- if (status != EFI_SUCCESS) {
- efi_printk("Failed to get handle for LOADED_IMAGE_PROTOCOL\n");
- return NULL;
- }
-
- status = low_alloc(0x4000, 1, (unsigned long *)&boot_params);
- if (status != EFI_SUCCESS) {
- efi_printk("Failed to alloc lowmem for boot params\n");
- return NULL;
- }
-
- memset(boot_params, 0x0, 0x4000);
-
- hdr = &boot_params->hdr;
- efi = &boot_params->efi_info;
- bi = &boot_params->apm_bios_info;
- sdt = &boot_params->sys_desc_table;
-
- /* Copy the second sector to boot_params */
- memcpy(&hdr->jump, image->image_base + 512, 512);
-
- /*
- * Fill out some of the header fields ourselves because the
- * EFI firmware loader doesn't load the first sector.
- */
- hdr->root_flags = 1;
- hdr->vid_mode = 0xffff;
- hdr->boot_flag = 0xAA55;
-
- hdr->code32_start = (__u64)(unsigned long)image->image_base;
-
- hdr->type_of_loader = 0x21;
-
- /* Convert unicode cmdline to ascii */
- options = image->load_options;
- load_options_size = image->load_options_size / 2; /* ASCII */
- cmdline = 0;
- s2 = (u16 *)options;
-
- if (s2) {
- while (*s2 && *s2 != '\n' && options_size < load_options_size) {
- s2++;
- options_size++;
- }
-
- if (options_size) {
- if (options_size > hdr->cmdline_size)
- options_size = hdr->cmdline_size;
-
- options_size++; /* NUL termination */
-
- status = low_alloc(options_size, 1, &cmdline);
- if (status != EFI_SUCCESS) {
- efi_printk("Failed to alloc mem for cmdline\n");
- goto fail;
- }
-
- s1 = (u8 *)(unsigned long)cmdline;
- s2 = (u16 *)options;
-
- for (i = 0; i < options_size - 1; i++)
- *s1++ = *s2++;
-
- *s1 = '\0';
- }
- }
-
- hdr->cmd_line_ptr = cmdline;
-
- hdr->ramdisk_image = 0;
- hdr->ramdisk_size = 0;
-
- /* Clear APM BIOS info */
- memset(bi, 0, sizeof(*bi));
-
- memset(sdt, 0, sizeof(*sdt));
-
- status = handle_ramdisks(image, hdr);
- if (status != EFI_SUCCESS)
- goto fail2;
-
- return boot_params;
-fail2:
- if (options_size)
- low_free(options_size, hdr->cmd_line_ptr);
-fail:
- low_free(0x4000, (unsigned long)boot_params);
- return NULL;
-}
-
-static efi_status_t exit_boot(struct boot_params *boot_params,
- void *handle)
-{
- struct efi_info *efi = &boot_params->efi_info;
- struct e820entry *e820_map = &boot_params->e820_map[0];
- struct e820entry *prev = NULL;
- unsigned long size, key, desc_size, _size;
- efi_memory_desc_t *mem_map;
- efi_status_t status;
- __u32 desc_version;
- bool called_exit = false;
- u8 nr_entries;
- int i;
-
- size = sizeof(*mem_map) * 32;
-
-again:
- size += sizeof(*mem_map) * 2;
- _size = size;
- status = low_alloc(size, 1, (unsigned long *)&mem_map);
- if (status != EFI_SUCCESS)
- return status;
-
-get_map:
- status = efi_call_phys5(sys_table->boottime->get_memory_map, &size,
- mem_map, &key, &desc_size, &desc_version);
- if (status == EFI_BUFFER_TOO_SMALL) {
- low_free(_size, (unsigned long)mem_map);
- goto again;
- }
-
- if (status != EFI_SUCCESS)
- goto free_mem_map;
-
- memcpy(&efi->efi_loader_signature, EFI_LOADER_SIGNATURE, sizeof(__u32));
- efi->efi_systab = (unsigned long)sys_table;
- efi->efi_memdesc_size = desc_size;
- efi->efi_memdesc_version = desc_version;
- efi->efi_memmap = (unsigned long)mem_map;
- efi->efi_memmap_size = size;
-
-#ifdef CONFIG_X86_64
- efi->efi_systab_hi = (unsigned long)sys_table >> 32;
- efi->efi_memmap_hi = (unsigned long)mem_map >> 32;
-#endif
-
- /* Might as well exit boot services now */
- status = efi_call_phys2(sys_table->boottime->exit_boot_services,
- handle, key);
- if (status != EFI_SUCCESS) {
- /*
- * ExitBootServices() will fail if any of the event
- * handlers change the memory map. In which case, we
- * must be prepared to retry, but only once so that
- * we're guaranteed to exit on repeated failures instead
- * of spinning forever.
- */
- if (called_exit)
- goto free_mem_map;
-
- called_exit = true;
- goto get_map;
- }
-
- /* Historic? */
- boot_params->alt_mem_k = 32 * 1024;
-
- /*
- * Convert the EFI memory map to E820.
- */
- nr_entries = 0;
- for (i = 0; i < size / desc_size; i++) {
- efi_memory_desc_t *d;
- unsigned int e820_type = 0;
- unsigned long m = (unsigned long)mem_map;
-
- d = (efi_memory_desc_t *)(m + (i * desc_size));
- switch (d->type) {
- case EFI_RESERVED_TYPE:
- case EFI_RUNTIME_SERVICES_CODE:
- case EFI_RUNTIME_SERVICES_DATA:
- case EFI_MEMORY_MAPPED_IO:
- case EFI_MEMORY_MAPPED_IO_PORT_SPACE:
- case EFI_PAL_CODE:
- e820_type = E820_RESERVED;
- break;
-
- case EFI_UNUSABLE_MEMORY:
- e820_type = E820_UNUSABLE;
- break;
-
- case EFI_ACPI_RECLAIM_MEMORY:
- e820_type = E820_ACPI;
- break;
-
- case EFI_LOADER_CODE:
- case EFI_LOADER_DATA:
- case EFI_BOOT_SERVICES_CODE:
- case EFI_BOOT_SERVICES_DATA:
- case EFI_CONVENTIONAL_MEMORY:
- e820_type = E820_RAM;
- break;
-
- case EFI_ACPI_MEMORY_NVS:
- e820_type = E820_NVS;
- break;
-
- default:
- continue;
- }
-
- /* Merge adjacent mappings */
- if (prev && prev->type == e820_type &&
- (prev->addr + prev->size) == d->phys_addr)
- prev->size += d->num_pages << 12;
- else {
- e820_map->addr = d->phys_addr;
- e820_map->size = d->num_pages << 12;
- e820_map->type = e820_type;
- prev = e820_map++;
- nr_entries++;
- }
- }
-
- boot_params->e820_entries = nr_entries;
-
- return EFI_SUCCESS;
-
-free_mem_map:
- low_free(_size, (unsigned long)mem_map);
- return status;
-}
-
-static efi_status_t relocate_kernel(struct setup_header *hdr)
-{
- unsigned long start, nr_pages;
- efi_status_t status;
-
- /*
- * The EFI firmware loader could have placed the kernel image
- * anywhere in memory, but the kernel has various restrictions
- * on the max physical address it can run at. Attempt to move
- * the kernel to boot_params.pref_address, or as low as
- * possible.
- */
- start = hdr->pref_address;
- nr_pages = round_up(hdr->init_size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
-
- status = efi_call_phys4(sys_table->boottime->allocate_pages,
- EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA,
- nr_pages, &start);
- if (status != EFI_SUCCESS) {
- status = low_alloc(hdr->init_size, hdr->kernel_alignment,
- &start);
- if (status != EFI_SUCCESS)
- efi_printk("Failed to alloc mem for kernel\n");
- }
-
- if (status == EFI_SUCCESS)
- memcpy((void *)start, (void *)(unsigned long)hdr->code32_start,
- hdr->init_size);
-
- hdr->pref_address = hdr->code32_start;
- hdr->code32_start = (__u32)start;
-
- return status;
-}
-
-/*
- * On success we return a pointer to a boot_params structure, and NULL
- * on failure.
- */
-struct boot_params *efi_main(void *handle, efi_system_table_t *_table,
- struct boot_params *boot_params)
-{
- struct desc_ptr *gdt, *idt;
- efi_loaded_image_t *image;
- struct setup_header *hdr = &boot_params->hdr;
- efi_status_t status;
- struct desc_struct *desc;
-
- sys_table = _table;
-
- /* Check if we were booted by the EFI firmware */
- if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
- goto fail;
-
- setup_graphics(boot_params);
-
- setup_efi_pci(boot_params);
-
- status = efi_call_phys3(sys_table->boottime->allocate_pool,
- EFI_LOADER_DATA, sizeof(*gdt),
- (void **)&gdt);
- if (status != EFI_SUCCESS) {
- efi_printk("Failed to alloc mem for gdt structure\n");
- goto fail;
- }
-
- gdt->size = 0x800;
- status = low_alloc(gdt->size, 8, (unsigned long *)&gdt->address);
- if (status != EFI_SUCCESS) {
- efi_printk("Failed to alloc mem for gdt\n");
- goto fail;
- }
-
- status = efi_call_phys3(sys_table->boottime->allocate_pool,
- EFI_LOADER_DATA, sizeof(*idt),
- (void **)&idt);
- if (status != EFI_SUCCESS) {
- efi_printk("Failed to alloc mem for idt structure\n");
- goto fail;
- }
-
- idt->size = 0;
- idt->address = 0;
-
- /*
- * If the kernel isn't already loaded at the preferred load
- * address, relocate it.
- */
- if (hdr->pref_address != hdr->code32_start) {
- status = relocate_kernel(hdr);
-
- if (status != EFI_SUCCESS)
- goto fail;
- }
-
- status = exit_boot(boot_params, handle);
- if (status != EFI_SUCCESS)
- goto fail;
-
- memset((char *)gdt->address, 0x0, gdt->size);
- desc = (struct desc_struct *)gdt->address;
-
- /* The first GDT is a dummy and the second is unused. */
- desc += 2;
-
- desc->limit0 = 0xffff;
- desc->base0 = 0x0000;
- desc->base1 = 0x0000;
- desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ;
- desc->s = DESC_TYPE_CODE_DATA;
- desc->dpl = 0;
- desc->p = 1;
- desc->limit = 0xf;
- desc->avl = 0;
- desc->l = 0;
- desc->d = SEG_OP_SIZE_32BIT;
- desc->g = SEG_GRANULARITY_4KB;
- desc->base2 = 0x00;
-
- desc++;
- desc->limit0 = 0xffff;
- desc->base0 = 0x0000;
- desc->base1 = 0x0000;
- desc->type = SEG_TYPE_DATA | SEG_TYPE_READ_WRITE;
- desc->s = DESC_TYPE_CODE_DATA;
- desc->dpl = 0;
- desc->p = 1;
- desc->limit = 0xf;
- desc->avl = 0;
- desc->l = 0;
- desc->d = SEG_OP_SIZE_32BIT;
- desc->g = SEG_GRANULARITY_4KB;
- desc->base2 = 0x00;
-
-#ifdef CONFIG_X86_64
- /* Task segment value */
- desc++;
- desc->limit0 = 0x0000;
- desc->base0 = 0x0000;
- desc->base1 = 0x0000;
- desc->type = SEG_TYPE_TSS;
- desc->s = 0;
- desc->dpl = 0;
- desc->p = 1;
- desc->limit = 0x0;
- desc->avl = 0;
- desc->l = 0;
- desc->d = 0;
- desc->g = SEG_GRANULARITY_4KB;
- desc->base2 = 0x00;
-#endif /* CONFIG_X86_64 */
-
- asm volatile ("lidt %0" : : "m" (*idt));
- asm volatile ("lgdt %0" : : "m" (*gdt));
-
- asm volatile("cli");
-
- return boot_params;
-fail:
- return NULL;
-}
diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h
deleted file mode 100644
index e5b0a8f91c5f..000000000000
--- a/arch/x86/boot/compressed/eboot.h
+++ /dev/null
@@ -1,71 +0,0 @@
-#ifndef BOOT_COMPRESSED_EBOOT_H
-#define BOOT_COMPRESSED_EBOOT_H
-
-#define SEG_TYPE_DATA (0 << 3)
-#define SEG_TYPE_READ_WRITE (1 << 1)
-#define SEG_TYPE_CODE (1 << 3)
-#define SEG_TYPE_EXEC_READ (1 << 1)
-#define SEG_TYPE_TSS ((1 << 3) | (1 << 0))
-#define SEG_OP_SIZE_32BIT (1 << 0)
-#define SEG_GRANULARITY_4KB (1 << 0)
-
-#define DESC_TYPE_CODE_DATA (1 << 0)
-
-#define EFI_PAGE_SIZE (1UL << EFI_PAGE_SHIFT)
-#define EFI_READ_CHUNK_SIZE (1024 * 1024)
-
-#define EFI_CONSOLE_OUT_DEVICE_GUID \
- EFI_GUID(0xd3b36f2c, 0xd551, 0x11d4, 0x9a, 0x46, 0x0, 0x90, 0x27, \
- 0x3f, 0xc1, 0x4d)
-
-#define PIXEL_RGB_RESERVED_8BIT_PER_COLOR 0
-#define PIXEL_BGR_RESERVED_8BIT_PER_COLOR 1
-#define PIXEL_BIT_MASK 2
-#define PIXEL_BLT_ONLY 3
-#define PIXEL_FORMAT_MAX 4
-
-struct efi_pixel_bitmask {
- u32 red_mask;
- u32 green_mask;
- u32 blue_mask;
- u32 reserved_mask;
-};
-
-struct efi_graphics_output_mode_info {
- u32 version;
- u32 horizontal_resolution;
- u32 vertical_resolution;
- int pixel_format;
- struct efi_pixel_bitmask pixel_information;
- u32 pixels_per_scan_line;
-} __packed;
-
-struct efi_graphics_output_protocol_mode {
- u32 max_mode;
- u32 mode;
- unsigned long info;
- unsigned long size_of_info;
- u64 frame_buffer_base;
- unsigned long frame_buffer_size;
-} __packed;
-
-struct efi_graphics_output_protocol {
- void *query_mode;
- unsigned long set_mode;
- unsigned long blt;
- struct efi_graphics_output_protocol_mode *mode;
-};
-
-struct efi_uga_draw_protocol {
- void *get_mode;
- void *set_mode;
- void *blt;
-};
-
-struct efi_simple_text_output_protocol {
- void *reset;
- void *output_string;
- void *test_string;
-};
-
-#endif /* BOOT_COMPRESSED_EBOOT_H */
diff --git a/arch/x86/boot/compressed/efi.c b/arch/x86/boot/compressed/efi.c
new file mode 100644
index 000000000000..f2e50f9758e6
--- /dev/null
+++ b/arch/x86/boot/compressed/efi.c
@@ -0,0 +1,236 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Helpers for early access to EFI configuration table.
+ *
+ * Originally derived from arch/x86/boot/compressed/acpi.c
+ */
+
+#include "misc.h"
+
+#include <asm/bootparam.h>
+
+/**
+ * efi_get_type - Given a pointer to boot_params, determine the type of EFI environment.
+ *
+ * @bp: pointer to boot_params
+ *
+ * Return: EFI_TYPE_{32,64} for valid EFI environments, EFI_TYPE_NONE otherwise.
+ */
+enum efi_type efi_get_type(struct boot_params *bp)
+{
+ struct efi_info *ei;
+ enum efi_type et;
+ const char *sig;
+
+ ei = &bp->efi_info;
+ sig = (char *)&ei->efi_loader_signature;
+
+ if (!strncmp(sig, EFI64_LOADER_SIGNATURE, 4)) {
+ et = EFI_TYPE_64;
+ } else if (!strncmp(sig, EFI32_LOADER_SIGNATURE, 4)) {
+ et = EFI_TYPE_32;
+ } else {
+ debug_putstr("No EFI environment detected.\n");
+ et = EFI_TYPE_NONE;
+ }
+
+#ifndef CONFIG_X86_64
+ /*
+ * Existing callers like acpi.c treat this case as an indicator to
+ * fall-through to non-EFI, rather than an error, so maintain that
+ * functionality here as well.
+ */
+ if (ei->efi_systab_hi || ei->efi_memmap_hi) {
+ debug_putstr("EFI system table is located above 4GB and cannot be accessed.\n");
+ et = EFI_TYPE_NONE;
+ }
+#endif
+
+ return et;
+}
+
+/**
+ * efi_get_system_table - Given a pointer to boot_params, retrieve the physical address
+ * of the EFI system table.
+ *
+ * @bp: pointer to boot_params
+ *
+ * Return: EFI system table address on success. On error, return 0.
+ */
+unsigned long efi_get_system_table(struct boot_params *bp)
+{
+ unsigned long sys_tbl_pa;
+ struct efi_info *ei;
+ enum efi_type et;
+
+ /* Get systab from boot params. */
+ ei = &bp->efi_info;
+#ifdef CONFIG_X86_64
+ sys_tbl_pa = ei->efi_systab | ((__u64)ei->efi_systab_hi << 32);
+#else
+ sys_tbl_pa = ei->efi_systab;
+#endif
+ if (!sys_tbl_pa) {
+ debug_putstr("EFI system table not found.");
+ return 0;
+ }
+
+ return sys_tbl_pa;
+}
+
+/*
+ * EFI config table address changes to virtual address after boot, which may
+ * not be accessible for the kexec'd kernel. To address this, kexec provides
+ * the initial physical address via a struct setup_data entry, which is
+ * checked for here, along with some sanity checks.
+ */
+static struct efi_setup_data *get_kexec_setup_data(struct boot_params *bp,
+ enum efi_type et)
+{
+#ifdef CONFIG_X86_64
+ struct efi_setup_data *esd = NULL;
+ struct setup_data *data;
+ u64 pa_data;
+
+ pa_data = bp->hdr.setup_data;
+ while (pa_data) {
+ data = (struct setup_data *)pa_data;
+ if (data->type == SETUP_EFI) {
+ esd = (struct efi_setup_data *)(pa_data + sizeof(struct setup_data));
+ break;
+ }
+
+ pa_data = data->next;
+ }
+
+ /*
+ * Original ACPI code falls back to attempting normal EFI boot in these
+ * cases, so maintain existing behavior by indicating non-kexec
+ * environment to the caller, but print them for debugging.
+ */
+ if (esd && !esd->tables) {
+ debug_putstr("kexec EFI environment missing valid configuration table.\n");
+ return NULL;
+ }
+
+ return esd;
+#endif
+ return NULL;
+}
+
+/**
+ * efi_get_conf_table - Given a pointer to boot_params, locate and return the physical
+ * address of EFI configuration table.
+ *
+ * @bp: pointer to boot_params
+ * @cfg_tbl_pa: location to store physical address of config table
+ * @cfg_tbl_len: location to store number of config table entries
+ *
+ * Return: 0 on success. On error, return params are left unchanged.
+ */
+int efi_get_conf_table(struct boot_params *bp, unsigned long *cfg_tbl_pa,
+ unsigned int *cfg_tbl_len)
+{
+ unsigned long sys_tbl_pa;
+ enum efi_type et;
+ int ret;
+
+ if (!cfg_tbl_pa || !cfg_tbl_len)
+ return -EINVAL;
+
+ sys_tbl_pa = efi_get_system_table(bp);
+ if (!sys_tbl_pa)
+ return -EINVAL;
+
+ /* Handle EFI bitness properly */
+ et = efi_get_type(bp);
+ if (et == EFI_TYPE_64) {
+ efi_system_table_64_t *stbl = (efi_system_table_64_t *)sys_tbl_pa;
+ struct efi_setup_data *esd;
+
+ /* kexec provides an alternative EFI conf table, check for it. */
+ esd = get_kexec_setup_data(bp, et);
+
+ *cfg_tbl_pa = esd ? esd->tables : stbl->tables;
+ *cfg_tbl_len = stbl->nr_tables;
+ } else if (et == EFI_TYPE_32) {
+ efi_system_table_32_t *stbl = (efi_system_table_32_t *)sys_tbl_pa;
+
+ *cfg_tbl_pa = stbl->tables;
+ *cfg_tbl_len = stbl->nr_tables;
+ } else {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* Get vendor table address/guid from EFI config table at the given index */
+static int get_vendor_table(void *cfg_tbl, unsigned int idx,
+ unsigned long *vendor_tbl_pa,
+ efi_guid_t *vendor_tbl_guid,
+ enum efi_type et)
+{
+ if (et == EFI_TYPE_64) {
+ efi_config_table_64_t *tbl_entry = (efi_config_table_64_t *)cfg_tbl + idx;
+
+ if (!IS_ENABLED(CONFIG_X86_64) && tbl_entry->table >> 32) {
+ debug_putstr("Error: EFI config table entry located above 4GB.\n");
+ return -EINVAL;
+ }
+
+ *vendor_tbl_pa = tbl_entry->table;
+ *vendor_tbl_guid = tbl_entry->guid;
+
+ } else if (et == EFI_TYPE_32) {
+ efi_config_table_32_t *tbl_entry = (efi_config_table_32_t *)cfg_tbl + idx;
+
+ *vendor_tbl_pa = tbl_entry->table;
+ *vendor_tbl_guid = tbl_entry->guid;
+ } else {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/**
+ * efi_find_vendor_table - Given EFI config table, search it for the physical
+ * address of the vendor table associated with GUID.
+ *
+ * @bp: pointer to boot_params
+ * @cfg_tbl_pa: pointer to EFI configuration table
+ * @cfg_tbl_len: number of entries in EFI configuration table
+ * @guid: GUID of vendor table
+ *
+ * Return: vendor table address on success. On error, return 0.
+ */
+unsigned long efi_find_vendor_table(struct boot_params *bp,
+ unsigned long cfg_tbl_pa,
+ unsigned int cfg_tbl_len,
+ efi_guid_t guid)
+{
+ enum efi_type et;
+ unsigned int i;
+
+ et = efi_get_type(bp);
+ if (et == EFI_TYPE_NONE)
+ return 0;
+
+ for (i = 0; i < cfg_tbl_len; i++) {
+ unsigned long vendor_tbl_pa;
+ efi_guid_t vendor_tbl_guid;
+ int ret;
+
+ ret = get_vendor_table((void *)cfg_tbl_pa, i,
+ &vendor_tbl_pa,
+ &vendor_tbl_guid, et);
+ if (ret)
+ return 0;
+
+ if (!efi_guidcmp(guid, vendor_tbl_guid))
+ return vendor_tbl_pa;
+ }
+
+ return 0;
+}
diff --git a/arch/x86/boot/compressed/efi.h b/arch/x86/boot/compressed/efi.h
new file mode 100644
index 000000000000..b22300970f97
--- /dev/null
+++ b/arch/x86/boot/compressed/efi.h
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef BOOT_COMPRESSED_EFI_H
+#define BOOT_COMPRESSED_EFI_H
+
+#if defined(_LINUX_EFI_H) || defined(_ASM_X86_EFI_H)
+#error Please do not include kernel proper namespace headers
+#endif
+
+typedef guid_t efi_guid_t __aligned(__alignof__(u32));
+
+#define EFI_GUID(a, b, c, d...) (efi_guid_t){ { \
+ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \
+ (b) & 0xff, ((b) >> 8) & 0xff, \
+ (c) & 0xff, ((c) >> 8) & 0xff, d } }
+
+#define ACPI_TABLE_GUID EFI_GUID(0xeb9d2d30, 0x2d88, 0x11d3, 0x9a, 0x16, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d)
+#define ACPI_20_TABLE_GUID EFI_GUID(0x8868e871, 0xe4f1, 0x11d3, 0xbc, 0x22, 0x00, 0x80, 0xc7, 0x3c, 0x88, 0x81)
+#define EFI_CC_BLOB_GUID EFI_GUID(0x067b1f5f, 0xcf26, 0x44c5, 0x85, 0x54, 0x93, 0xd7, 0x77, 0x91, 0x2d, 0x42)
+#define LINUX_EFI_UNACCEPTED_MEM_TABLE_GUID EFI_GUID(0xd5d1de3c, 0x105c, 0x44f9, 0x9e, 0xa9, 0xbc, 0xef, 0x98, 0x12, 0x00, 0x31)
+
+#define EFI32_LOADER_SIGNATURE "EL32"
+#define EFI64_LOADER_SIGNATURE "EL64"
+
+/*
+ * Generic EFI table header
+ */
+typedef struct {
+ u64 signature;
+ u32 revision;
+ u32 headersize;
+ u32 crc32;
+ u32 reserved;
+} efi_table_hdr_t;
+
+#define EFI_CONVENTIONAL_MEMORY 7
+#define EFI_UNACCEPTED_MEMORY 15
+
+#define EFI_MEMORY_MORE_RELIABLE \
+ ((u64)0x0000000000010000ULL) /* higher reliability */
+#define EFI_MEMORY_SP ((u64)0x0000000000040000ULL) /* soft reserved */
+
+#define EFI_PAGE_SHIFT 12
+
+typedef struct {
+ u32 type;
+ u32 pad;
+ u64 phys_addr;
+ u64 virt_addr;
+ u64 num_pages;
+ u64 attribute;
+} efi_memory_desc_t;
+
+#define efi_early_memdesc_ptr(map, desc_size, n) \
+ (efi_memory_desc_t *)((void *)(map) + ((n) * (desc_size)))
+
+typedef struct {
+ efi_guid_t guid;
+ u64 table;
+} efi_config_table_64_t;
+
+typedef struct {
+ efi_guid_t guid;
+ u32 table;
+} efi_config_table_32_t;
+
+typedef struct {
+ efi_table_hdr_t hdr;
+ u64 fw_vendor; /* physical addr of CHAR16 vendor string */
+ u32 fw_revision;
+ u32 __pad1;
+ u64 con_in_handle;
+ u64 con_in;
+ u64 con_out_handle;
+ u64 con_out;
+ u64 stderr_handle;
+ u64 stderr;
+ u64 runtime;
+ u64 boottime;
+ u32 nr_tables;
+ u32 __pad2;
+ u64 tables;
+} efi_system_table_64_t;
+
+typedef struct {
+ efi_table_hdr_t hdr;
+ u32 fw_vendor; /* physical addr of CHAR16 vendor string */
+ u32 fw_revision;
+ u32 con_in_handle;
+ u32 con_in;
+ u32 con_out_handle;
+ u32 con_out;
+ u32 stderr_handle;
+ u32 stderr;
+ u32 runtime;
+ u32 boottime;
+ u32 nr_tables;
+ u32 tables;
+} efi_system_table_32_t;
+
+struct efi_unaccepted_memory {
+ u32 version;
+ u32 unit_size;
+ u64 phys_base;
+ u64 size;
+ unsigned long bitmap[];
+};
+
+static inline int efi_guidcmp (efi_guid_t left, efi_guid_t right)
+{
+ return memcmp(&left, &right, sizeof (efi_guid_t));
+}
+
+#ifdef CONFIG_EFI
+bool __pure __efi_soft_reserve_enabled(void);
+
+static inline bool __pure efi_soft_reserve_enabled(void)
+{
+ return IS_ENABLED(CONFIG_EFI_SOFT_RESERVE)
+ && __efi_soft_reserve_enabled();
+}
+#else
+static inline bool efi_soft_reserve_enabled(void)
+{
+ return false;
+}
+#endif /* CONFIG_EFI */
+#endif /* BOOT_COMPRESSED_EFI_H */
diff --git a/arch/x86/boot/compressed/efi_stub_32.S b/arch/x86/boot/compressed/efi_stub_32.S
deleted file mode 100644
index a53440e81d52..000000000000
--- a/arch/x86/boot/compressed/efi_stub_32.S
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * EFI call stub for IA32.
- *
- * This stub allows us to make EFI calls in physical mode with interrupts
- * turned off. Note that this implementation is different from the one in
- * arch/x86/platform/efi/efi_stub_32.S because we're _already_ in physical
- * mode at this point.
- */
-
-#include <linux/linkage.h>
-#include <asm/page_types.h>
-
-/*
- * efi_call_phys(void *, ...) is a function with variable parameters.
- * All the callers of this function assure that all the parameters are 4-bytes.
- */
-
-/*
- * In gcc calling convention, EBX, ESP, EBP, ESI and EDI are all callee save.
- * So we'd better save all of them at the beginning of this function and restore
- * at the end no matter how many we use, because we can not assure EFI runtime
- * service functions will comply with gcc calling convention, too.
- */
-
-.text
-ENTRY(efi_call_phys)
- /*
- * 0. The function can only be called in Linux kernel. So CS has been
- * set to 0x0010, DS and SS have been set to 0x0018. In EFI, I found
- * the values of these registers are the same. And, the corresponding
- * GDT entries are identical. So I will do nothing about segment reg
- * and GDT, but change GDT base register in prelog and epilog.
- */
-
- /*
- * 1. Because we haven't been relocated by this point we need to
- * use relative addressing.
- */
- call 1f
-1: popl %edx
- subl $1b, %edx
-
- /*
- * 2. Now on the top of stack is the return
- * address in the caller of efi_call_phys(), then parameter 1,
- * parameter 2, ..., param n. To make things easy, we save the return
- * address of efi_call_phys in a global variable.
- */
- popl %ecx
- movl %ecx, saved_return_addr(%edx)
- /* get the function pointer into ECX*/
- popl %ecx
- movl %ecx, efi_rt_function_ptr(%edx)
-
- /*
- * 3. Call the physical function.
- */
- call *%ecx
-
- /*
- * 4. Balance the stack. And because EAX contain the return value,
- * we'd better not clobber it. We need to calculate our address
- * again because %ecx and %edx are not preserved across EFI function
- * calls.
- */
- call 1f
-1: popl %edx
- subl $1b, %edx
-
- movl efi_rt_function_ptr(%edx), %ecx
- pushl %ecx
-
- /*
- * 10. Push the saved return address onto the stack and return.
- */
- movl saved_return_addr(%edx), %ecx
- pushl %ecx
- ret
-ENDPROC(efi_call_phys)
-.previous
-
-.data
-saved_return_addr:
- .long 0
-efi_rt_function_ptr:
- .long 0
diff --git a/arch/x86/boot/compressed/efi_stub_64.S b/arch/x86/boot/compressed/efi_stub_64.S
deleted file mode 100644
index cedc60de86eb..000000000000
--- a/arch/x86/boot/compressed/efi_stub_64.S
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../platform/efi/efi_stub_64.S"
diff --git a/arch/x86/boot/compressed/error.c b/arch/x86/boot/compressed/error.c
new file mode 100644
index 000000000000..19a8251de506
--- /dev/null
+++ b/arch/x86/boot/compressed/error.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Callers outside of misc.c need access to the error reporting routines,
+ * but the *_putstr() functions need to stay in misc.c because of how
+ * memcpy() and memmove() are defined for the compressed boot environment.
+ */
+#include "misc.h"
+#include "error.h"
+
+void warn(const char *m)
+{
+ error_putstr("\n\n");
+ error_putstr(m);
+ error_putstr("\n\n");
+}
+
+void error(char *m)
+{
+ warn(m);
+ error_putstr(" -- System halted");
+
+ while (1)
+ asm("hlt");
+}
+
+/* EFI libstub provides vsnprintf() */
+#ifdef CONFIG_EFI_STUB
+void panic(const char *fmt, ...)
+{
+ static char buf[1024];
+ va_list args;
+ int len;
+
+ va_start(args, fmt);
+ len = vsnprintf(buf, sizeof(buf), fmt, args);
+ va_end(args);
+
+ if (len && buf[len - 1] == '\n')
+ buf[len - 1] = '\0';
+
+ error(buf);
+}
+#endif
diff --git a/arch/x86/boot/compressed/error.h b/arch/x86/boot/compressed/error.h
new file mode 100644
index 000000000000..31f9e080d61a
--- /dev/null
+++ b/arch/x86/boot/compressed/error.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef BOOT_COMPRESSED_ERROR_H
+#define BOOT_COMPRESSED_ERROR_H
+
+#include <linux/compiler.h>
+
+void warn(const char *m);
+void error(char *m) __noreturn;
+void panic(const char *fmt, ...) __noreturn __cold;
+
+#endif /* BOOT_COMPRESSED_ERROR_H */
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 1e3184f6072f..1cfe9802a42f 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* linux/boot/head.S
*
@@ -29,67 +30,22 @@
#include <asm/page_types.h>
#include <asm/boot.h>
#include <asm/asm-offsets.h>
+#include <asm/bootparam.h>
- __HEAD
-ENTRY(startup_32)
-#ifdef CONFIG_EFI_STUB
- jmp preferred_addr
-
- /*
- * We don't need the return address, so set up the stack so
- * efi_main() can find its arguments.
- */
-ENTRY(efi_pe_entry)
- add $0x4, %esp
-
- call make_boot_params
- cmpl $0, %eax
- je 1f
- movl 0x4(%esp), %esi
- movl (%esp), %ecx
- pushl %eax
- pushl %esi
- pushl %ecx
- sub $0x4, %esp
-
-ENTRY(efi_stub_entry)
- add $0x4, %esp
- call efi_main
- cmpl $0, %eax
- movl %eax, %esi
- jne 2f
-1:
- /* EFI init failed, so hang. */
- hlt
- jmp 1b
-2:
- call 3f
-3:
- popl %eax
- subl $3b, %eax
- subl BP_pref_address(%esi), %eax
- add BP_code32_start(%esi), %eax
- leal preferred_addr(%eax), %eax
- jmp *%eax
+/*
+ * These symbols needed to be marked as .hidden to prevent the BFD linker from
+ * generating R_386_32 (rather than R_386_RELATIVE) relocations for them when
+ * the 32-bit compressed kernel is linked as PIE. This is no longer necessary,
+ * but it doesn't hurt to keep them .hidden.
+ */
+ .hidden _bss
+ .hidden _ebss
+ .hidden _end
-preferred_addr:
-#endif
+ __HEAD
+SYM_FUNC_START(startup_32)
cld
- /*
- * Test KEEP_SEGMENTS flag to see if the bootloader is asking
- * us to not reload segments
- */
- testb $(1<<6), BP_loadflags(%esi)
- jnz 1f
-
cli
- movl $__BOOT_DS, %eax
- movl %eax, %ds
- movl %eax, %es
- movl %eax, %fs
- movl %eax, %gs
- movl %eax, %ss
-1:
/*
* Calculate the delta between where we were compiled to run
@@ -101,31 +57,51 @@ preferred_addr:
*/
leal (BP_scratch+4)(%esi), %esp
call 1f
-1: popl %ebp
- subl $1b, %ebp
+1: popl %edx
+ addl $_GLOBAL_OFFSET_TABLE_+(.-1b), %edx
+
+ /* Load new GDT */
+ leal gdt@GOTOFF(%edx), %eax
+ movl %eax, 2(%eax)
+ lgdt (%eax)
+
+ /* Load segment registers with our descriptors */
+ movl $__BOOT_DS, %eax
+ movl %eax, %ds
+ movl %eax, %es
+ movl %eax, %fs
+ movl %eax, %gs
+ movl %eax, %ss
/*
- * %ebp contains the address we are loaded at by the boot loader and %ebx
- * contains the address where we should move the kernel image temporarily
- * for safe in-place decompression.
+ * %edx contains the address we are loaded at by the boot loader (plus the
+ * offset to the GOT). The below code calculates %ebx to be the address where
+ * we should move the kernel image temporarily for safe in-place decompression
+ * (again, plus the offset to the GOT).
+ *
+ * %ebp is calculated to be the address that the kernel will be decompressed to.
*/
#ifdef CONFIG_RELOCATABLE
- movl %ebp, %ebx
+ leal startup_32@GOTOFF(%edx), %ebx
movl BP_kernel_alignment(%esi), %eax
decl %eax
addl %eax, %ebx
notl %eax
andl %eax, %ebx
-#else
- movl $LOAD_PHYSICAL_ADDR, %ebx
+ cmpl $LOAD_PHYSICAL_ADDR, %ebx
+ jae 1f
#endif
+ movl $LOAD_PHYSICAL_ADDR, %ebx
+1:
+ movl %ebx, %ebp // Save the output address for later
/* Target address to relocate to for decompression */
- addl $z_extract_offset, %ebx
+ addl BP_init_size(%esi), %ebx
+ subl $_end@GOTOFF, %ebx
/* Set up the stack */
- leal boot_stack_end(%ebx), %esp
+ leal boot_stack_end@GOTOFF(%ebx), %esp
/* Zero EFLAGS */
pushl $0
@@ -136,8 +112,8 @@ preferred_addr:
* where decompression in place becomes safe.
*/
pushl %esi
- leal (_bss-4)(%ebp), %esi
- leal (_bss-4)(%ebx), %edi
+ leal (_bss@GOTOFF-4)(%edx), %esi
+ leal (_bss@GOTOFF-4)(%ebx), %edi
movl $(_bss - startup_32), %ecx
shrl $2, %ecx
std
@@ -145,93 +121,68 @@ preferred_addr:
cld
popl %esi
+ /*
+ * The GDT may get overwritten either during the copy we just did or
+ * during extract_kernel below. To avoid any issues, repoint the GDTR
+ * to the new copy of the GDT.
+ */
+ leal gdt@GOTOFF(%ebx), %eax
+ movl %eax, 2(%eax)
+ lgdt (%eax)
+
/*
* Jump to the relocated address.
*/
- leal relocated(%ebx), %eax
+ leal .Lrelocated@GOTOFF(%ebx), %eax
jmp *%eax
-ENDPROC(startup_32)
+SYM_FUNC_END(startup_32)
.text
-relocated:
+SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)
/*
* Clear BSS (stack is currently empty)
*/
xorl %eax, %eax
- leal _bss(%ebx), %edi
- leal _ebss(%ebx), %ecx
+ leal _bss@GOTOFF(%ebx), %edi
+ leal _ebss@GOTOFF(%ebx), %ecx
subl %edi, %ecx
shrl $2, %ecx
rep stosl
/*
- * Adjust our own GOT
- */
- leal _got(%ebx), %edx
- leal _egot(%ebx), %ecx
-1:
- cmpl %ecx, %edx
- jae 2f
- addl %ebx, (%edx)
- addl $4, %edx
- jmp 1b
-2:
-
-/*
- * Do the decompression, and jump to the new kernel..
- */
- leal z_extract_offset_negative(%ebx), %ebp
- /* push arguments for decompress_kernel: */
- pushl %ebp /* output address */
- pushl $z_input_len /* input_len */
- leal input_data(%ebx), %eax
- pushl %eax /* input_data */
- leal boot_heap(%ebx), %eax
- pushl %eax /* heap area */
- pushl %esi /* real mode pointer */
- call decompress_kernel
- addl $20, %esp
-
-#if CONFIG_RELOCATABLE
-/*
- * Find the address of the relocations.
- */
- leal z_output_len(%ebp), %edi
-
-/*
- * Calculate the delta between where vmlinux was compiled to run
- * and where it was actually loaded.
- */
- movl %ebp, %ebx
- subl $LOAD_PHYSICAL_ADDR, %ebx
- jz 2f /* Nothing to be done if loaded at compiled addr. */
-/*
- * Process relocations.
+ * Do the extraction, and jump to the new kernel..
*/
+ /* push arguments for extract_kernel: */
-1: subl $4, %edi
- movl (%edi), %ecx
- testl %ecx, %ecx
- jz 2f
- addl %ebx, -__PAGE_OFFSET(%ebx, %ecx)
- jmp 1b
-2:
-#endif
+ pushl %ebp /* output address */
+ pushl %esi /* real mode pointer */
+ call extract_kernel /* returns kernel entry point in %eax */
+ addl $24, %esp
/*
- * Jump to the decompressed kernel.
+ * Jump to the extracted kernel.
*/
xorl %ebx, %ebx
- jmp *%ebp
+ jmp *%eax
+SYM_FUNC_END(.Lrelocated)
+
+ .data
+ .balign 8
+SYM_DATA_START_LOCAL(gdt)
+ .word gdt_end - gdt - 1
+ .long 0
+ .word 0
+ .quad 0x0000000000000000 /* Reserved */
+ .quad 0x00cf9a000000ffff /* __KERNEL_CS */
+ .quad 0x00cf92000000ffff /* __KERNEL_DS */
+SYM_DATA_END_LABEL(gdt, SYM_L_LOCAL, gdt_end)
/*
* Stack and heap for uncompression
*/
.bss
.balign 4
-boot_heap:
- .fill BOOT_HEAP_SIZE, 1, 0
boot_stack:
.fill BOOT_STACK_SIZE, 1, 0
boot_stack_end:
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 06e71c2c16bf..d9dab940ff62 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* linux/boot/head.S
*
@@ -31,10 +32,54 @@
#include <asm/msr.h>
#include <asm/processor-flags.h>
#include <asm/asm-offsets.h>
+#include <asm/bootparam.h>
+#include <asm/desc_defs.h>
+#include <asm/trapnr.h>
+
+/*
+ * Fix alignment at 16 bytes. Following CONFIG_FUNCTION_ALIGNMENT will result
+ * in assembly errors due to trying to move .org backward due to the excessive
+ * alignment.
+ */
+#undef __ALIGN
+#define __ALIGN .balign 16, 0x90
+
+/*
+ * Locally defined symbols should be marked hidden:
+ */
+ .hidden _bss
+ .hidden _ebss
+ .hidden _end
__HEAD
+
+/*
+ * This macro gives the relative virtual address of X, i.e. the offset of X
+ * from startup_32. This is the same as the link-time virtual address of X,
+ * since startup_32 is at 0, but defining it this way tells the
+ * assembler/linker that we do not want the actual run-time address of X. This
+ * prevents the linker from trying to create unwanted run-time relocation
+ * entries for the reference when the compressed kernel is linked as PIE.
+ *
+ * A reference X(%reg) will result in the link-time VA of X being stored with
+ * the instruction, and a run-time R_X86_64_RELATIVE relocation entry that
+ * adds the 64-bit base address where the kernel is loaded.
+ *
+ * Replacing it with (X-startup_32)(%reg) results in the offset being stored,
+ * and no run-time relocation.
+ *
+ * The macro should be used as a displacement with a base register containing
+ * the run-time address of startup_32 [i.e. rva(X)(%reg)], or as an immediate
+ * [$ rva(X)].
+ *
+ * This macro can only be used from within the .head.text section, since the
+ * expression requires startup_32 to be in the same section as the code being
+ * assembled.
+ */
+#define rva(X) ((X) - startup_32)
+
.code32
-ENTRY(startup_32)
+SYM_FUNC_START(startup_32)
/*
* 32bit entry is 0 and it is ABI so immutable!
* If we come here directly from a bootloader,
@@ -42,19 +87,7 @@ ENTRY(startup_32)
* all need to be under the 4G limit.
*/
cld
- /*
- * Test KEEP_SEGMENTS flag to see if the bootloader is asking
- * us to not reload segments
- */
- testb $(1<<6), BP_loadflags(%esi)
- jnz 1f
-
cli
- movl $(__BOOT_DS), %eax
- movl %eax, %ds
- movl %eax, %es
- movl %eax, %ss
-1:
/*
* Calculate the delta between where we were compiled to run
@@ -67,16 +100,39 @@ ENTRY(startup_32)
leal (BP_scratch+4)(%esi), %esp
call 1f
1: popl %ebp
- subl $1b, %ebp
+ subl $ rva(1b), %ebp
+
+ /* Load new GDT with the 64bit segments using 32bit descriptor */
+ leal rva(gdt)(%ebp), %eax
+ movl %eax, 2(%eax)
+ lgdt (%eax)
+
+ /* Load segment registers with our descriptors */
+ movl $__BOOT_DS, %eax
+ movl %eax, %ds
+ movl %eax, %es
+ movl %eax, %fs
+ movl %eax, %gs
+ movl %eax, %ss
+
+ /* Setup a stack and load CS from current GDT */
+ leal rva(boot_stack_end)(%ebp), %esp
-/* setup a stack and make sure cpu supports long mode. */
- movl $boot_stack_end, %eax
- addl %ebp, %eax
- movl %eax, %esp
+ pushl $__KERNEL32_CS
+ leal rva(1f)(%ebp), %eax
+ pushl %eax
+ lretl
+1:
+
+ /* Setup Exception handling for SEV-ES */
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ call startup32_load_idt
+#endif
+ /* Make sure cpu supports long mode. */
call verify_cpu
testl %eax, %eax
- jnz no_longmode
+ jnz .Lno_longmode
/*
* Compute the delta between where we were compiled to run at
@@ -94,62 +150,88 @@ ENTRY(startup_32)
addl %eax, %ebx
notl %eax
andl %eax, %ebx
-#else
- movl $LOAD_PHYSICAL_ADDR, %ebx
+ cmpl $LOAD_PHYSICAL_ADDR, %ebx
+ jae 1f
#endif
+ movl $LOAD_PHYSICAL_ADDR, %ebx
+1:
/* Target address to relocate to for decompression */
- addl $z_extract_offset, %ebx
+ addl BP_init_size(%esi), %ebx
+ subl $ rva(_end), %ebx
/*
* Prepare for entering 64 bit mode
*/
- /* Load new GDT with the 64bit segments using 32bit descriptor */
- leal gdt(%ebp), %eax
- movl %eax, gdt+2(%ebp)
- lgdt gdt(%ebp)
-
/* Enable PAE mode */
- movl $(X86_CR4_PAE), %eax
+ movl %cr4, %eax
+ orl $X86_CR4_PAE, %eax
movl %eax, %cr4
/*
* Build early 4G boot pagetable
*/
+ /*
+ * If SEV is active then set the encryption mask in the page tables.
+ * This will ensure that when the kernel is copied and decompressed
+ * it will be done so encrypted.
+ */
+ xorl %edx, %edx
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ call get_sev_encryption_bit
+ xorl %edx, %edx
+ testl %eax, %eax
+ jz 1f
+ subl $32, %eax /* Encryption bit is always above bit 31 */
+ bts %eax, %edx /* Set encryption mask for page tables */
+ /*
+ * Set MSR_AMD64_SEV_ENABLED_BIT in sev_status so that
+ * startup32_check_sev_cbit() will do a check. sev_enable() will
+ * initialize sev_status with all the bits reported by
+ * MSR_AMD_SEV_STATUS later, but only MSR_AMD64_SEV_ENABLED_BIT
+ * needs to be set for now.
+ */
+ movl $1, rva(sev_status)(%ebp)
+1:
+#endif
+
/* Initialize Page tables to 0 */
- leal pgtable(%ebx), %edi
+ leal rva(pgtable)(%ebx), %edi
xorl %eax, %eax
- movl $((4096*6)/4), %ecx
+ movl $(BOOT_INIT_PGT_SIZE/4), %ecx
rep stosl
/* Build Level 4 */
- leal pgtable + 0(%ebx), %edi
+ leal rva(pgtable + 0)(%ebx), %edi
leal 0x1007 (%edi), %eax
movl %eax, 0(%edi)
+ addl %edx, 4(%edi)
/* Build Level 3 */
- leal pgtable + 0x1000(%ebx), %edi
+ leal rva(pgtable + 0x1000)(%ebx), %edi
leal 0x1007(%edi), %eax
movl $4, %ecx
1: movl %eax, 0x00(%edi)
+ addl %edx, 0x04(%edi)
addl $0x00001000, %eax
addl $8, %edi
decl %ecx
jnz 1b
/* Build Level 2 */
- leal pgtable + 0x2000(%ebx), %edi
+ leal rva(pgtable + 0x2000)(%ebx), %edi
movl $0x00000183, %eax
movl $2048, %ecx
1: movl %eax, 0(%edi)
+ addl %edx, 4(%edi)
addl $0x00200000, %eax
addl $8, %edi
decl %ecx
jnz 1b
/* Enable the boot page tables */
- leal pgtable(%ebx), %eax
+ leal rva(pgtable)(%ebx), %eax
movl %eax, %cr3
/* Enable Long mode in EFER (Extended Feature Enable Register) */
@@ -161,34 +243,39 @@ ENTRY(startup_32)
/* After gdt is loaded */
xorl %eax, %eax
lldt %ax
- movl $0x20, %eax
+ movl $__BOOT_TSS, %eax
ltr %ax
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ /* Check if the C-bit position is correct when SEV is active */
+ call startup32_check_sev_cbit
+#endif
+
/*
* Setup for the jump to 64bit mode
*
- * When the jump is performend we will be in long mode but
+ * When the jump is performed we will be in long mode but
* in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1
* (and in turn EFER.LMA = 1). To jump into 64bit mode we use
* the new gdt/idt that has __KERNEL_CS with CS.L = 1.
* We place all of the values on our mini stack so lret can
* used to perform that far jump.
*/
+ leal rva(startup_64)(%ebp), %eax
pushl $__KERNEL_CS
- leal startup_64(%ebp), %eax
pushl %eax
/* Enter paged protected Mode, activating Long Mode */
- movl $(X86_CR0_PG | X86_CR0_PE), %eax /* Enable Paging and Protected mode */
+ movl $CR0_STATE, %eax
movl %eax, %cr0
/* Jump from 32bit compatibility mode into 64bit mode. */
lret
-ENDPROC(startup_32)
+SYM_FUNC_END(startup_32)
.code64
.org 0x200
-ENTRY(startup_64)
+SYM_CODE_START(startup_64)
/*
* 64bit entry is 0x200 and it is ABI so immutable!
* We come here either from startup_32 or directly from a
@@ -199,46 +286,9 @@ ENTRY(startup_64)
* that maps our entire kernel(text+data+bss+brk), zero page
* and command line.
*/
-#ifdef CONFIG_EFI_STUB
- /*
- * The entry point for the PE/COFF executable is efi_pe_entry, so
- * only legacy boot loaders will execute this jmp.
- */
- jmp preferred_addr
-
-ENTRY(efi_pe_entry)
- mov %rcx, %rdi
- mov %rdx, %rsi
- pushq %rdi
- pushq %rsi
- call make_boot_params
- cmpq $0,%rax
- je 1f
- mov %rax, %rdx
- popq %rsi
- popq %rdi
-
-ENTRY(efi_stub_entry)
- call efi_main
- movq %rax,%rsi
- cmpq $0,%rax
- jne 2f
-1:
- /* EFI init failed, so hang. */
- hlt
- jmp 1b
-2:
- call 3f
-3:
- popq %rax
- subq $3b, %rax
- subq BP_pref_address(%rsi), %rax
- add BP_code32_start(%esi), %eax
- leaq preferred_addr(%rax), %rax
- jmp *%rax
-preferred_addr:
-#endif
+ cld
+ cli
/* Setup data segments. */
xorl %eax, %eax
@@ -269,15 +319,94 @@ preferred_addr:
addq %rax, %rbp
notq %rax
andq %rax, %rbp
-#else
- movq $LOAD_PHYSICAL_ADDR, %rbp
+ cmpq $LOAD_PHYSICAL_ADDR, %rbp
+ jae 1f
#endif
+ movq $LOAD_PHYSICAL_ADDR, %rbp
+1:
/* Target address to relocate to for decompression */
- leaq z_extract_offset(%rbp), %rbx
+ movl BP_init_size(%rsi), %ebx
+ subl $ rva(_end), %ebx
+ addq %rbp, %rbx
/* Set up the stack */
- leaq boot_stack_end(%rbx), %rsp
+ leaq rva(boot_stack_end)(%rbx), %rsp
+
+ /*
+ * At this point we are in long mode with 4-level paging enabled,
+ * but we might want to enable 5-level paging or vice versa.
+ *
+ * The problem is that we cannot do it directly. Setting or clearing
+ * CR4.LA57 in long mode would trigger #GP. So we need to switch off
+ * long mode and paging first.
+ *
+ * We also need a trampoline in lower memory to switch over from
+ * 4- to 5-level paging for cases when the bootloader puts the kernel
+ * above 4G, but didn't enable 5-level paging for us.
+ *
+ * The same trampoline can be used to switch from 5- to 4-level paging
+ * mode, like when starting 4-level paging kernel via kexec() when
+ * original kernel worked in 5-level paging mode.
+ *
+ * For the trampoline, we need the top page table to reside in lower
+ * memory as we don't have a way to load 64-bit values into CR3 in
+ * 32-bit mode.
+ */
+
+ /* Make sure we have GDT with 32-bit code segment */
+ leaq gdt64(%rip), %rax
+ addq %rax, 2(%rax)
+ lgdt (%rax)
+
+ /* Reload CS so IRET returns to a CS actually in the GDT */
+ pushq $__KERNEL_CS
+ leaq .Lon_kernel_cs(%rip), %rax
+ pushq %rax
+ lretq
+
+.Lon_kernel_cs:
+ /*
+ * RSI holds a pointer to a boot_params structure provided by the
+ * loader, and this needs to be preserved across C function calls. So
+ * move it into a callee saved register.
+ */
+ movq %rsi, %r15
+
+ call load_stage1_idt
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ /*
+ * Now that the stage1 interrupt handlers are set up, #VC exceptions from
+ * CPUID instructions can be properly handled for SEV-ES guests.
+ *
+ * For SEV-SNP, the CPUID table also needs to be set up in advance of any
+ * CPUID instructions being issued, so go ahead and do that now via
+ * sev_enable(), which will also handle the rest of the SEV-related
+ * detection/setup to ensure that has been done in advance of any dependent
+ * code. Pass the boot_params pointer as the first argument.
+ */
+ movq %r15, %rdi
+ call sev_enable
+#endif
+
+ /* Preserve only the CR4 bits that must be preserved, and clear the rest */
+ movq %cr4, %rax
+ andl $(X86_CR4_PAE | X86_CR4_MCE | X86_CR4_LA57), %eax
+ movq %rax, %cr4
+
+ /*
+ * configure_5level_paging() updates the number of paging levels using
+ * a trampoline in 32-bit addressable memory if the current number does
+ * not match the desired number.
+ *
+ * Pass the boot_params pointer as the first argument. The second
+ * argument is the relocated address of the page table to use instead
+ * of the page table in trampoline memory (if required).
+ */
+ movq %r15, %rdi
+ leaq rva(top_pgtable)(%rbx), %rsi
+ call configure_5level_paging
/* Zero EFLAGS */
pushq $0
@@ -287,24 +416,33 @@ preferred_addr:
* Copy the compressed kernel to the end of our buffer
* where decompression in place becomes safe.
*/
- pushq %rsi
leaq (_bss-8)(%rip), %rsi
- leaq (_bss-8)(%rbx), %rdi
- movq $_bss /* - $startup_32 */, %rcx
- shrq $3, %rcx
+ leaq rva(_bss-8)(%rbx), %rdi
+ movl $(_bss - startup_32), %ecx
+ shrl $3, %ecx
std
rep movsq
cld
- popq %rsi
+
+ /*
+ * The GDT may get overwritten either during the copy we just did or
+ * during extract_kernel below. To avoid any issues, repoint the GDTR
+ * to the new copy of the GDT.
+ */
+ leaq rva(gdt64)(%rbx), %rax
+ leaq rva(gdt)(%rbx), %rdx
+ movq %rdx, 2(%rax)
+ lgdt (%rax)
/*
* Jump to the relocated address.
*/
- leaq relocated(%rbx), %rax
+ leaq rva(.Lrelocated)(%rbx), %rax
jmp *%rax
+SYM_CODE_END(startup_64)
.text
-relocated:
+SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)
/*
* Clear BSS (stack is currently empty)
@@ -316,72 +454,86 @@ relocated:
shrq $3, %rcx
rep stosq
+ call load_stage2_idt
+
+ /* Pass boot_params to initialize_identity_maps() */
+ movq %r15, %rdi
+ call initialize_identity_maps
+
/*
- * Adjust our own GOT
- */
- leaq _got(%rip), %rdx
- leaq _egot(%rip), %rcx
-1:
- cmpq %rcx, %rdx
- jae 2f
- addq %rbx, (%rdx)
- addq $8, %rdx
- jmp 1b
-2:
-
-/*
- * Do the decompression, and jump to the new kernel..
+ * Do the extraction, and jump to the new kernel..
*/
- pushq %rsi /* Save the real mode argument */
- movq %rsi, %rdi /* real mode address */
- leaq boot_heap(%rip), %rsi /* malloc area for uncompression */
- leaq input_data(%rip), %rdx /* input_data */
- movl $z_input_len, %ecx /* input_len */
- movq %rbp, %r8 /* output target address */
- call decompress_kernel
- popq %rsi
+ /* pass struct boot_params pointer and output target address */
+ movq %r15, %rdi
+ movq %rbp, %rsi
+ call extract_kernel /* returns kernel entry point in %rax */
/*
* Jump to the decompressed kernel.
*/
- jmp *%rbp
+ movq %r15, %rsi
+ jmp *%rax
+SYM_FUNC_END(.Lrelocated)
.code32
-no_longmode:
- /* This isn't an x86-64 CPU so hang */
+SYM_FUNC_START_LOCAL_NOALIGN(.Lno_longmode)
+ /* This isn't an x86-64 CPU, so hang intentionally, we cannot continue */
1:
hlt
jmp 1b
+SYM_FUNC_END(.Lno_longmode)
+ .globl verify_cpu
#include "../../kernel/verify_cpu.S"
.data
-gdt:
- .word gdt_end - gdt
- .long gdt
+SYM_DATA_START_LOCAL(gdt64)
+ .word gdt_end - gdt - 1
+ .quad gdt - gdt64
+SYM_DATA_END(gdt64)
+ .balign 8
+SYM_DATA_START_LOCAL(gdt)
+ .word gdt_end - gdt - 1
+ .long 0
.word 0
- .quad 0x0000000000000000 /* NULL descriptor */
+ .quad 0x00cf9a000000ffff /* __KERNEL32_CS */
.quad 0x00af9a000000ffff /* __KERNEL_CS */
.quad 0x00cf92000000ffff /* __KERNEL_DS */
.quad 0x0080890000000000 /* TS descriptor */
.quad 0x0000000000000000 /* TS continued */
-gdt_end:
+SYM_DATA_END_LABEL(gdt, SYM_L_LOCAL, gdt_end)
+
+SYM_DATA_START(boot_idt_desc)
+ .word boot_idt_end - boot_idt - 1
+ .quad 0
+SYM_DATA_END(boot_idt_desc)
+ .balign 8
+SYM_DATA_START(boot_idt)
+ .rept BOOT_IDT_ENTRIES
+ .quad 0
+ .quad 0
+ .endr
+SYM_DATA_END_LABEL(boot_idt, SYM_L_GLOBAL, boot_idt_end)
/*
* Stack and heap for uncompression
*/
.bss
.balign 4
-boot_heap:
- .fill BOOT_HEAP_SIZE, 1, 0
-boot_stack:
+SYM_DATA_START_LOCAL(boot_stack)
.fill BOOT_STACK_SIZE, 1, 0
-boot_stack_end:
+ .balign 16
+SYM_DATA_END_LABEL(boot_stack, SYM_L_LOCAL, boot_stack_end)
/*
* Space for page tables (not in .bss so not zeroed)
*/
- .section ".pgtable","a",@nobits
+ .section ".pgtable","aw",@nobits
.balign 4096
-pgtable:
- .fill 6*4096, 1, 0
+SYM_DATA_LOCAL(pgtable, .fill BOOT_PGT_SIZE, 1, 0)
+
+/*
+ * The page table is going to be used instead of page table in the trampoline
+ * memory.
+ */
+SYM_DATA_LOCAL(top_pgtable, .fill PAGE_SIZE, 1, 0)
diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c
new file mode 100644
index 000000000000..dfb9c2deb77c
--- /dev/null
+++ b/arch/x86/boot/compressed/ident_map_64.c
@@ -0,0 +1,393 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This code is used on x86_64 to create page table identity mappings on
+ * demand by building up a new set of page tables (or appending to the
+ * existing ones), and then switching over to them when ready.
+ *
+ * Copyright (C) 2015-2016 Yinghai Lu
+ * Copyright (C) 2016 Kees Cook
+ */
+
+/* No MITIGATION_PAGE_TABLE_ISOLATION support needed either: */
+#undef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
+
+#include "error.h"
+#include "misc.h"
+
+/* These actually do the work of building the kernel identity maps. */
+#include <linux/pgtable.h>
+#include <asm/cmpxchg.h>
+#include <asm/trap_pf.h>
+#include <asm/trapnr.h>
+#include <asm/init.h>
+/* Use the static base for this part of the boot process */
+#undef __PAGE_OFFSET
+#define __PAGE_OFFSET __PAGE_OFFSET_BASE
+#include "../../mm/ident_map.c"
+
+#define _SETUP
+#include <asm/setup.h> /* For COMMAND_LINE_SIZE */
+#undef _SETUP
+
+extern unsigned long get_cmd_line_ptr(void);
+
+/* Used by PAGE_KERN* macros: */
+pteval_t __default_kernel_pte_mask __read_mostly = ~0;
+
+/* Used to track our page table allocation area. */
+struct alloc_pgt_data {
+ unsigned char *pgt_buf;
+ unsigned long pgt_buf_size;
+ unsigned long pgt_buf_offset;
+};
+
+/*
+ * Allocates space for a page table entry, using struct alloc_pgt_data
+ * above. Besides the local callers, this is used as the allocation
+ * callback in mapping_info below.
+ */
+static void *alloc_pgt_page(void *context)
+{
+ struct alloc_pgt_data *pages = (struct alloc_pgt_data *)context;
+ unsigned char *entry;
+
+ /* Validate there is space available for a new page. */
+ if (pages->pgt_buf_offset >= pages->pgt_buf_size) {
+ debug_putstr("out of pgt_buf in " __FILE__ "!?\n");
+ debug_putaddr(pages->pgt_buf_offset);
+ debug_putaddr(pages->pgt_buf_size);
+ return NULL;
+ }
+
+ /* Consumed more tables than expected? */
+ if (pages->pgt_buf_offset == BOOT_PGT_SIZE_WARN) {
+ debug_putstr("pgt_buf running low in " __FILE__ "\n");
+ debug_putstr("Need to raise BOOT_PGT_SIZE?\n");
+ debug_putaddr(pages->pgt_buf_offset);
+ debug_putaddr(pages->pgt_buf_size);
+ }
+
+ entry = pages->pgt_buf + pages->pgt_buf_offset;
+ pages->pgt_buf_offset += PAGE_SIZE;
+
+ return entry;
+}
+
+/* Used to track our allocated page tables. */
+static struct alloc_pgt_data pgt_data;
+
+/* The top level page table entry pointer. */
+static unsigned long top_level_pgt;
+
+phys_addr_t physical_mask = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
+
+/*
+ * Mapping information structure passed to kernel_ident_mapping_init().
+ * Due to relocation, pointers must be assigned at run time not build time.
+ */
+static struct x86_mapping_info mapping_info;
+
+/*
+ * Adds the specified range to the identity mappings.
+ */
+void kernel_add_identity_map(unsigned long start, unsigned long end)
+{
+ int ret;
+
+ /* Align boundary to 2M. */
+ start = round_down(start, PMD_SIZE);
+ end = round_up(end, PMD_SIZE);
+ if (start >= end)
+ return;
+
+ /* Build the mapping. */
+ ret = kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt, start, end);
+ if (ret)
+ error("Error: kernel_ident_mapping_init() failed\n");
+}
+
+/* Locates and clears a region for a new top level page table. */
+void initialize_identity_maps(void *rmode)
+{
+ unsigned long cmdline;
+ struct setup_data *sd;
+
+ /* Exclude the encryption mask from __PHYSICAL_MASK */
+ physical_mask &= ~sme_me_mask;
+
+ /* Init mapping_info with run-time function/buffer pointers. */
+ mapping_info.alloc_pgt_page = alloc_pgt_page;
+ mapping_info.context = &pgt_data;
+ mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask;
+ mapping_info.kernpg_flag = _KERNPG_TABLE;
+
+ /*
+ * It should be impossible for this not to already be true,
+ * but since calling this a second time would rewind the other
+ * counters, let's just make sure this is reset too.
+ */
+ pgt_data.pgt_buf_offset = 0;
+
+ /*
+ * If we came here via startup_32(), cr3 will be _pgtable already
+ * and we must append to the existing area instead of entirely
+ * overwriting it.
+ *
+ * With 5-level paging, we use '_pgtable' to allocate the p4d page table,
+ * the top-level page table is allocated separately.
+ *
+ * p4d_offset(top_level_pgt, 0) would cover both the 4- and 5-level
+ * cases. On 4-level paging it's equal to 'top_level_pgt'.
+ */
+ top_level_pgt = read_cr3_pa();
+ if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) {
+ pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
+ pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE;
+ memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
+ } else {
+ pgt_data.pgt_buf = _pgtable;
+ pgt_data.pgt_buf_size = BOOT_PGT_SIZE;
+ memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
+ top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data);
+ }
+
+ /*
+ * New page-table is set up - map the kernel image, boot_params and the
+ * command line. The uncompressed kernel requires boot_params and the
+ * command line to be mapped in the identity mapping. Map them
+ * explicitly here in case the compressed kernel does not touch them,
+ * or does not touch all the pages covering them.
+ */
+ kernel_add_identity_map((unsigned long)_head, (unsigned long)_end);
+ boot_params_ptr = rmode;
+ kernel_add_identity_map((unsigned long)boot_params_ptr,
+ (unsigned long)(boot_params_ptr + 1));
+ cmdline = get_cmd_line_ptr();
+ kernel_add_identity_map(cmdline, cmdline + COMMAND_LINE_SIZE);
+
+ /*
+ * Also map the setup_data entries passed via boot_params in case they
+ * need to be accessed by uncompressed kernel via the identity mapping.
+ */
+ sd = (struct setup_data *)boot_params_ptr->hdr.setup_data;
+ while (sd) {
+ unsigned long sd_addr = (unsigned long)sd;
+
+ kernel_add_identity_map(sd_addr, sd_addr + sizeof(*sd) + sd->len);
+ sd = (struct setup_data *)sd->next;
+ }
+
+ sev_prep_identity_maps(top_level_pgt);
+
+ /* Load the new page-table. */
+ write_cr3(top_level_pgt);
+
+ /*
+ * Now that the required page table mappings are established and a
+ * GHCB can be used, check for SNP guest/HV feature compatibility.
+ */
+ snp_check_features();
+}
+
+static pte_t *split_large_pmd(struct x86_mapping_info *info,
+ pmd_t *pmdp, unsigned long __address)
+{
+ unsigned long page_flags;
+ unsigned long address;
+ pte_t *pte;
+ pmd_t pmd;
+ int i;
+
+ pte = (pte_t *)info->alloc_pgt_page(info->context);
+ if (!pte)
+ return NULL;
+
+ address = __address & PMD_MASK;
+ /* No large page - clear PSE flag */
+ page_flags = info->page_flag & ~_PAGE_PSE;
+
+ /* Populate the PTEs */
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ set_pte(&pte[i], __pte(address | page_flags));
+ address += PAGE_SIZE;
+ }
+
+ /*
+ * Ideally we need to clear the large PMD first and do a TLB
+ * flush before we write the new PMD. But the 2M range of the
+ * PMD might contain the code we execute and/or the stack
+ * we are on, so we can't do that. But that should be safe here
+ * because we are going from large to small mappings and we are
+ * also the only user of the page-table, so there is no chance
+ * of a TLB multihit.
+ */
+ pmd = __pmd((unsigned long)pte | info->kernpg_flag);
+ set_pmd(pmdp, pmd);
+ /* Flush TLB to establish the new PMD */
+ write_cr3(top_level_pgt);
+
+ return pte + pte_index(__address);
+}
+
+static void clflush_page(unsigned long address)
+{
+ unsigned int flush_size;
+ char *cl, *start, *end;
+
+ /*
+ * Hardcode cl-size to 64 - CPUID can't be used here because that might
+ * cause another #VC exception and the GHCB is not ready to use yet.
+ */
+ flush_size = 64;
+ start = (char *)(address & PAGE_MASK);
+ end = start + PAGE_SIZE;
+
+ /*
+ * First make sure there are no pending writes on the cache-lines to
+ * flush.
+ */
+ asm volatile("mfence" : : : "memory");
+
+ for (cl = start; cl != end; cl += flush_size)
+ clflush(cl);
+}
+
+static int set_clr_page_flags(struct x86_mapping_info *info,
+ unsigned long address,
+ pteval_t set, pteval_t clr)
+{
+ pgd_t *pgdp = (pgd_t *)top_level_pgt;
+ p4d_t *p4dp;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep, pte;
+
+ /*
+ * First make sure there is a PMD mapping for 'address'.
+ * It should already exist, but keep things generic.
+ *
+ * To map the page just read from it and fault it in if there is no
+ * mapping yet. kernel_add_identity_map() can't be called here because
+ * that would unconditionally map the address on PMD level, destroying
+ * any PTE-level mappings that might already exist. Use assembly here
+ * so the access won't be optimized away.
+ */
+ asm volatile("mov %[address], %%r9"
+ :: [address] "g" (*(unsigned long *)address)
+ : "r9", "memory");
+
+ /*
+ * The page is mapped at least with PMD size - so skip checks and walk
+ * directly to the PMD.
+ */
+ p4dp = p4d_offset(pgdp, address);
+ pudp = pud_offset(p4dp, address);
+ pmdp = pmd_offset(pudp, address);
+
+ if (pmd_leaf(*pmdp))
+ ptep = split_large_pmd(info, pmdp, address);
+ else
+ ptep = pte_offset_kernel(pmdp, address);
+
+ if (!ptep)
+ return -ENOMEM;
+
+ /*
+ * Changing encryption attributes of a page requires to flush it from
+ * the caches.
+ */
+ if ((set | clr) & _PAGE_ENC) {
+ clflush_page(address);
+
+ /*
+ * If the encryption attribute is being cleared, change the page state
+ * to shared in the RMP table.
+ */
+ if (clr)
+ snp_set_page_shared(__pa(address & PAGE_MASK));
+ }
+
+ /* Update PTE */
+ pte = *ptep;
+ pte = pte_set_flags(pte, set);
+ pte = pte_clear_flags(pte, clr);
+ set_pte(ptep, pte);
+
+ /*
+ * If the encryption attribute is being set, then change the page state to
+ * private in the RMP entry. The page state change must be done after the PTE
+ * is updated.
+ */
+ if (set & _PAGE_ENC)
+ snp_set_page_private(__pa(address & PAGE_MASK));
+
+ /* Flush TLB after changing encryption attribute */
+ write_cr3(top_level_pgt);
+
+ return 0;
+}
+
+int set_page_decrypted(unsigned long address)
+{
+ return set_clr_page_flags(&mapping_info, address, 0, _PAGE_ENC);
+}
+
+int set_page_encrypted(unsigned long address)
+{
+ return set_clr_page_flags(&mapping_info, address, _PAGE_ENC, 0);
+}
+
+int set_page_non_present(unsigned long address)
+{
+ return set_clr_page_flags(&mapping_info, address, 0, _PAGE_PRESENT);
+}
+
+static void do_pf_error(const char *msg, unsigned long error_code,
+ unsigned long address, unsigned long ip)
+{
+ error_putstr(msg);
+
+ error_putstr("\nError Code: ");
+ error_puthex(error_code);
+ error_putstr("\nCR2: 0x");
+ error_puthex(address);
+ error_putstr("\nRIP relative to _head: 0x");
+ error_puthex(ip - (unsigned long)_head);
+ error_putstr("\n");
+
+ error("Stopping.\n");
+}
+
+void do_boot_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+ unsigned long address = native_read_cr2();
+ unsigned long end;
+ bool ghcb_fault;
+
+ ghcb_fault = sev_es_check_ghcb_fault(address);
+
+ address &= PMD_MASK;
+ end = address + PMD_SIZE;
+
+ /*
+ * Check for unexpected error codes. Unexpected are:
+ * - Faults on present pages
+ * - User faults
+ * - Reserved bits set
+ */
+ if (error_code & (X86_PF_PROT | X86_PF_USER | X86_PF_RSVD))
+ do_pf_error("Unexpected page-fault:", error_code, address, regs->ip);
+ else if (ghcb_fault)
+ do_pf_error("Page-fault on GHCB page:", error_code, address, regs->ip);
+
+ /*
+ * Error code is sane - now identity map the 2M region around
+ * the faulting address.
+ */
+ kernel_add_identity_map(address, end);
+}
+
+void do_boot_nmi_trap(struct pt_regs *regs, unsigned long error_code)
+{
+ spurious_nmi_count++;
+}
diff --git a/arch/x86/boot/compressed/idt_64.c b/arch/x86/boot/compressed/idt_64.c
new file mode 100644
index 000000000000..d100284bbef4
--- /dev/null
+++ b/arch/x86/boot/compressed/idt_64.c
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <asm/trap_pf.h>
+#include <asm/segment.h>
+#include <asm/trapnr.h>
+#include "misc.h"
+
+static void set_idt_entry(int vector, void (*handler)(void))
+{
+ unsigned long address = (unsigned long)handler;
+ gate_desc entry;
+
+ memset(&entry, 0, sizeof(entry));
+
+ entry.offset_low = (u16)(address & 0xffff);
+ entry.segment = __KERNEL_CS;
+ entry.bits.type = GATE_TRAP;
+ entry.bits.p = 1;
+ entry.offset_middle = (u16)((address >> 16) & 0xffff);
+ entry.offset_high = (u32)(address >> 32);
+
+ memcpy(&boot_idt[vector], &entry, sizeof(entry));
+}
+
+/* Have this here so we don't need to include <asm/desc.h> */
+static void load_boot_idt(const struct desc_ptr *dtr)
+{
+ asm volatile("lidt %0"::"m" (*dtr));
+}
+
+/* Setup IDT before kernel jumping to .Lrelocated */
+void load_stage1_idt(void)
+{
+ boot_idt_desc.address = (unsigned long)boot_idt;
+
+
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT))
+ set_idt_entry(X86_TRAP_VC, boot_stage1_vc);
+
+ load_boot_idt(&boot_idt_desc);
+}
+
+/*
+ * Setup IDT after kernel jumping to .Lrelocated.
+ *
+ * initialize_identity_maps() needs a #PF handler to be setup
+ * in order to be able to fault-in identity mapping ranges; see
+ * do_boot_page_fault().
+ *
+ * This #PF handler setup needs to happen in load_stage2_idt() where the
+ * IDT is loaded and there the #VC IDT entry gets setup too.
+ *
+ * In order to be able to handle #VCs, one needs a GHCB which
+ * gets setup with an already set up pagetable, which is done in
+ * initialize_identity_maps(). And there's the catch 22: the boot #VC
+ * handler do_boot_stage2_vc() needs to call early_setup_ghcb() itself
+ * (and, especially set_page_decrypted()) because the SEV-ES setup code
+ * cannot initialize a GHCB as there's no #PF handler yet...
+ */
+void load_stage2_idt(void)
+{
+ boot_idt_desc.address = (unsigned long)boot_idt;
+
+ set_idt_entry(X86_TRAP_PF, boot_page_fault);
+ set_idt_entry(X86_TRAP_NMI, boot_nmi_trap);
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ /*
+ * Clear the second stage #VC handler in case guest types
+ * needing #VC have not been detected.
+ */
+ if (sev_status & BIT(1))
+ set_idt_entry(X86_TRAP_VC, boot_stage2_vc);
+ else
+ set_idt_entry(X86_TRAP_VC, NULL);
+#endif
+
+ load_boot_idt(&boot_idt_desc);
+}
+
+void cleanup_exception_handling(void)
+{
+ /*
+ * Flush GHCB from cache and map it encrypted again when running as
+ * SEV-ES guest.
+ */
+ sev_es_shutdown_ghcb();
+
+ /* Set a null-idt, disabling #PF and #VC handling */
+ boot_idt_desc.size = 0;
+ boot_idt_desc.address = 0;
+ load_boot_idt(&boot_idt_desc);
+}
diff --git a/arch/x86/boot/compressed/idt_handlers_64.S b/arch/x86/boot/compressed/idt_handlers_64.S
new file mode 100644
index 000000000000..4d03c8562f63
--- /dev/null
+++ b/arch/x86/boot/compressed/idt_handlers_64.S
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Early IDT handler entry points
+ *
+ * Copyright (C) 2019 SUSE
+ *
+ * Author: Joerg Roedel <jroedel@suse.de>
+ */
+
+#include <asm/segment.h>
+
+/* For ORIG_RAX */
+#include "../../entry/calling.h"
+
+.macro EXCEPTION_HANDLER name function error_code=0
+SYM_FUNC_START(\name)
+
+ /* Build pt_regs */
+ .if \error_code == 0
+ pushq $0
+ .endif
+
+ pushq %rdi
+ pushq %rsi
+ pushq %rdx
+ pushq %rcx
+ pushq %rax
+ pushq %r8
+ pushq %r9
+ pushq %r10
+ pushq %r11
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ /* Call handler with pt_regs */
+ movq %rsp, %rdi
+ /* Error code is second parameter */
+ movq ORIG_RAX(%rsp), %rsi
+ call \function
+
+ /* Restore regs */
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbp
+ popq %rbx
+ popq %r11
+ popq %r10
+ popq %r9
+ popq %r8
+ popq %rax
+ popq %rcx
+ popq %rdx
+ popq %rsi
+ popq %rdi
+
+ /* Remove error code and return */
+ addq $8, %rsp
+
+ iretq
+SYM_FUNC_END(\name)
+ .endm
+
+ .text
+ .code64
+
+EXCEPTION_HANDLER boot_page_fault do_boot_page_fault error_code=1
+EXCEPTION_HANDLER boot_nmi_trap do_boot_nmi_trap error_code=0
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+EXCEPTION_HANDLER boot_stage1_vc do_vc_no_ghcb error_code=1
+EXCEPTION_HANDLER boot_stage2_vc do_boot_stage2_vc error_code=1
+#endif
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
new file mode 100644
index 000000000000..3b0948ad449f
--- /dev/null
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -0,0 +1,908 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * kaslr.c
+ *
+ * This contains the routines needed to generate a reasonable level of
+ * entropy to choose a randomized kernel base address offset in support
+ * of Kernel Address Space Layout Randomization (KASLR). Additionally
+ * handles walking the physical memory maps (and tracking memory regions
+ * to avoid) in order to select a physical memory location that can
+ * contain the entire properly aligned running kernel image.
+ *
+ */
+
+/*
+ * isspace() in linux/ctype.h is expected by next_args() to filter
+ * out "space/lf/tab". While boot/ctype.h conflicts with linux/ctype.h,
+ * since isdigit() is implemented in both of them. Hence disable it
+ * here.
+ */
+#define BOOT_CTYPE_H
+
+#include "misc.h"
+#include "error.h"
+#include "../string.h"
+#include "efi.h"
+
+#include <generated/compile.h>
+#include <generated/utsversion.h>
+#include <generated/utsrelease.h>
+
+#define _SETUP
+#include <asm/setup.h> /* For COMMAND_LINE_SIZE */
+#undef _SETUP
+
+extern unsigned long get_cmd_line_ptr(void);
+
+/* Simplified build-specific string for starting entropy. */
+static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@"
+ LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION;
+
+static unsigned long rotate_xor(unsigned long hash, const void *area,
+ size_t size)
+{
+ size_t i;
+ unsigned long *ptr = (unsigned long *)area;
+
+ for (i = 0; i < size / sizeof(hash); i++) {
+ /* Rotate by odd number of bits and XOR. */
+ hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7);
+ hash ^= ptr[i];
+ }
+
+ return hash;
+}
+
+/* Attempt to create a simple but unpredictable starting entropy. */
+static unsigned long get_boot_seed(void)
+{
+ unsigned long hash = 0;
+
+ hash = rotate_xor(hash, build_str, sizeof(build_str));
+ hash = rotate_xor(hash, boot_params_ptr, sizeof(*boot_params_ptr));
+
+ return hash;
+}
+
+#define KASLR_COMPRESSED_BOOT
+#include "../../lib/kaslr.c"
+
+
+/* Only supporting at most 4 unusable memmap regions with kaslr */
+#define MAX_MEMMAP_REGIONS 4
+
+static bool memmap_too_large;
+
+
+/*
+ * Store memory limit: MAXMEM on 64-bit and KERNEL_IMAGE_SIZE on 32-bit.
+ * It may be reduced by "mem=nn[KMG]" or "memmap=nn[KMG]" command line options.
+ */
+static u64 mem_limit;
+
+/* Number of immovable memory regions */
+static int num_immovable_mem;
+
+enum mem_avoid_index {
+ MEM_AVOID_ZO_RANGE = 0,
+ MEM_AVOID_INITRD,
+ MEM_AVOID_CMDLINE,
+ MEM_AVOID_BOOTPARAMS,
+ MEM_AVOID_MEMMAP_BEGIN,
+ MEM_AVOID_MEMMAP_END = MEM_AVOID_MEMMAP_BEGIN + MAX_MEMMAP_REGIONS - 1,
+ MEM_AVOID_MAX,
+};
+
+static struct mem_vector mem_avoid[MEM_AVOID_MAX];
+
+static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two)
+{
+ /* Item one is entirely before item two. */
+ if (one->start + one->size <= two->start)
+ return false;
+ /* Item one is entirely after item two. */
+ if (one->start >= two->start + two->size)
+ return false;
+ return true;
+}
+
+char *skip_spaces(const char *str)
+{
+ while (isspace(*str))
+ ++str;
+ return (char *)str;
+}
+#include "../../../../lib/ctype.c"
+#include "../../../../lib/cmdline.c"
+
+static int
+parse_memmap(char *p, u64 *start, u64 *size)
+{
+ char *oldp;
+
+ if (!p)
+ return -EINVAL;
+
+ /* We don't care about this option here */
+ if (!strncmp(p, "exactmap", 8))
+ return -EINVAL;
+
+ oldp = p;
+ *size = memparse(p, &p);
+ if (p == oldp)
+ return -EINVAL;
+
+ switch (*p) {
+ case '#':
+ case '$':
+ case '!':
+ *start = memparse(p + 1, &p);
+ return 0;
+ case '@':
+ /*
+ * memmap=nn@ss specifies usable region, should
+ * be skipped
+ */
+ *size = 0;
+ fallthrough;
+ default:
+ /*
+ * If w/o offset, only size specified, memmap=nn[KMG] has the
+ * same behaviour as mem=nn[KMG]. It limits the max address
+ * system can use. Region above the limit should be avoided.
+ */
+ *start = 0;
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+static void mem_avoid_memmap(char *str)
+{
+ static int i;
+
+ if (i >= MAX_MEMMAP_REGIONS)
+ return;
+
+ while (str && (i < MAX_MEMMAP_REGIONS)) {
+ int rc;
+ u64 start, size;
+ char *k = strchr(str, ',');
+
+ if (k)
+ *k++ = 0;
+
+ rc = parse_memmap(str, &start, &size);
+ if (rc < 0)
+ break;
+ str = k;
+
+ if (start == 0) {
+ /* Store the specified memory limit if size > 0 */
+ if (size > 0 && size < mem_limit)
+ mem_limit = size;
+
+ continue;
+ }
+
+ mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].start = start;
+ mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].size = size;
+ i++;
+ }
+
+ /* More than 4 memmaps, fail kaslr */
+ if ((i >= MAX_MEMMAP_REGIONS) && str)
+ memmap_too_large = true;
+}
+
+/* Store the number of 1GB huge pages which users specified: */
+static unsigned long max_gb_huge_pages;
+
+static void parse_gb_huge_pages(char *param, char *val)
+{
+ static bool gbpage_sz;
+ char *p;
+
+ if (!strcmp(param, "hugepagesz")) {
+ p = val;
+ if (memparse(p, &p) != PUD_SIZE) {
+ gbpage_sz = false;
+ return;
+ }
+
+ if (gbpage_sz)
+ warn("Repeatedly set hugeTLB page size of 1G!\n");
+ gbpage_sz = true;
+ return;
+ }
+
+ if (!strcmp(param, "hugepages") && gbpage_sz) {
+ p = val;
+ max_gb_huge_pages = simple_strtoull(p, &p, 0);
+ return;
+ }
+}
+
+static void handle_mem_options(void)
+{
+ char *args = (char *)get_cmd_line_ptr();
+ size_t len;
+ char *tmp_cmdline;
+ char *param, *val;
+ u64 mem_size;
+
+ if (!args)
+ return;
+
+ len = strnlen(args, COMMAND_LINE_SIZE-1);
+ tmp_cmdline = malloc(len + 1);
+ if (!tmp_cmdline)
+ error("Failed to allocate space for tmp_cmdline");
+
+ memcpy(tmp_cmdline, args, len);
+ tmp_cmdline[len] = 0;
+ args = tmp_cmdline;
+
+ /* Chew leading spaces */
+ args = skip_spaces(args);
+
+ while (*args) {
+ args = next_arg(args, &param, &val);
+ /* Stop at -- */
+ if (!val && strcmp(param, "--") == 0)
+ break;
+
+ if (!strcmp(param, "memmap")) {
+ mem_avoid_memmap(val);
+ } else if (IS_ENABLED(CONFIG_X86_64) && strstr(param, "hugepages")) {
+ parse_gb_huge_pages(param, val);
+ } else if (!strcmp(param, "mem")) {
+ char *p = val;
+
+ if (!strcmp(p, "nopentium"))
+ continue;
+ mem_size = memparse(p, &p);
+ if (mem_size == 0)
+ break;
+
+ if (mem_size < mem_limit)
+ mem_limit = mem_size;
+ }
+ }
+
+ free(tmp_cmdline);
+ return;
+}
+
+/*
+ * In theory, KASLR can put the kernel anywhere in the range of [16M, MAXMEM)
+ * on 64-bit, and [16M, KERNEL_IMAGE_SIZE) on 32-bit.
+ *
+ * The mem_avoid array is used to store the ranges that need to be avoided
+ * when KASLR searches for an appropriate random address. We must avoid any
+ * regions that are unsafe to overlap with during decompression, and other
+ * things like the initrd, cmdline and boot_params. This comment seeks to
+ * explain mem_avoid as clearly as possible since incorrect mem_avoid
+ * memory ranges lead to really hard to debug boot failures.
+ *
+ * The initrd, cmdline, and boot_params are trivial to identify for
+ * avoiding. They are MEM_AVOID_INITRD, MEM_AVOID_CMDLINE, and
+ * MEM_AVOID_BOOTPARAMS respectively below.
+ *
+ * What is not obvious how to avoid is the range of memory that is used
+ * during decompression (MEM_AVOID_ZO_RANGE below). This range must cover
+ * the compressed kernel (ZO) and its run space, which is used to extract
+ * the uncompressed kernel (VO) and relocs.
+ *
+ * ZO's full run size sits against the end of the decompression buffer, so
+ * we can calculate where text, data, bss, etc of ZO are positioned more
+ * easily.
+ *
+ * For additional background, the decompression calculations can be found
+ * in header.S, and the memory diagram is based on the one found in misc.c.
+ *
+ * The following conditions are already enforced by the image layouts and
+ * associated code:
+ * - input + input_size >= output + output_size
+ * - kernel_total_size <= init_size
+ * - kernel_total_size <= output_size (see Note below)
+ * - output + init_size >= output + output_size
+ *
+ * (Note that kernel_total_size and output_size have no fundamental
+ * relationship, but output_size is passed to choose_random_location
+ * as a maximum of the two. The diagram is showing a case where
+ * kernel_total_size is larger than output_size, but this case is
+ * handled by bumping output_size.)
+ *
+ * The above conditions can be illustrated by a diagram:
+ *
+ * 0 output input input+input_size output+init_size
+ * | | | | |
+ * | | | | |
+ * |-----|--------|--------|--------------|-----------|--|-------------|
+ * | | |
+ * | | |
+ * output+init_size-ZO_INIT_SIZE output+output_size output+kernel_total_size
+ *
+ * [output, output+init_size) is the entire memory range used for
+ * extracting the compressed image.
+ *
+ * [output, output+kernel_total_size) is the range needed for the
+ * uncompressed kernel (VO) and its run size (bss, brk, etc).
+ *
+ * [output, output+output_size) is VO plus relocs (i.e. the entire
+ * uncompressed payload contained by ZO). This is the area of the buffer
+ * written to during decompression.
+ *
+ * [output+init_size-ZO_INIT_SIZE, output+init_size) is the worst-case
+ * range of the copied ZO and decompression code. (i.e. the range
+ * covered backwards of size ZO_INIT_SIZE, starting from output+init_size.)
+ *
+ * [input, input+input_size) is the original copied compressed image (ZO)
+ * (i.e. it does not include its run size). This range must be avoided
+ * because it contains the data used for decompression.
+ *
+ * [input+input_size, output+init_size) is [_text, _end) for ZO. This
+ * range includes ZO's heap and stack, and must be avoided since it
+ * performs the decompression.
+ *
+ * Since the above two ranges need to be avoided and they are adjacent,
+ * they can be merged, resulting in: [input, output+init_size) which
+ * becomes the MEM_AVOID_ZO_RANGE below.
+ */
+static void mem_avoid_init(unsigned long input, unsigned long input_size,
+ unsigned long output)
+{
+ unsigned long init_size = boot_params_ptr->hdr.init_size;
+ u64 initrd_start, initrd_size;
+ unsigned long cmd_line, cmd_line_size;
+
+ /*
+ * Avoid the region that is unsafe to overlap during
+ * decompression.
+ */
+ mem_avoid[MEM_AVOID_ZO_RANGE].start = input;
+ mem_avoid[MEM_AVOID_ZO_RANGE].size = (output + init_size) - input;
+
+ /* Avoid initrd. */
+ initrd_start = (u64)boot_params_ptr->ext_ramdisk_image << 32;
+ initrd_start |= boot_params_ptr->hdr.ramdisk_image;
+ initrd_size = (u64)boot_params_ptr->ext_ramdisk_size << 32;
+ initrd_size |= boot_params_ptr->hdr.ramdisk_size;
+ mem_avoid[MEM_AVOID_INITRD].start = initrd_start;
+ mem_avoid[MEM_AVOID_INITRD].size = initrd_size;
+ /* No need to set mapping for initrd, it will be handled in VO. */
+
+ /* Avoid kernel command line. */
+ cmd_line = get_cmd_line_ptr();
+ /* Calculate size of cmd_line. */
+ if (cmd_line) {
+ cmd_line_size = strnlen((char *)cmd_line, COMMAND_LINE_SIZE-1) + 1;
+ mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line;
+ mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size;
+ }
+
+ /* Avoid boot parameters. */
+ mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params_ptr;
+ mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params_ptr);
+
+ /* We don't need to set a mapping for setup_data. */
+
+ /* Mark the memmap regions we need to avoid */
+ handle_mem_options();
+
+ /* Enumerate the immovable memory regions */
+ num_immovable_mem = count_immovable_mem_regions();
+}
+
+/*
+ * Does this memory vector overlap a known avoided area? If so, record the
+ * overlap region with the lowest address.
+ */
+static bool mem_avoid_overlap(struct mem_vector *img,
+ struct mem_vector *overlap)
+{
+ int i;
+ struct setup_data *ptr;
+ u64 earliest = img->start + img->size;
+ bool is_overlapping = false;
+
+ for (i = 0; i < MEM_AVOID_MAX; i++) {
+ if (mem_overlaps(img, &mem_avoid[i]) &&
+ mem_avoid[i].start < earliest) {
+ *overlap = mem_avoid[i];
+ earliest = overlap->start;
+ is_overlapping = true;
+ }
+ }
+
+ /* Avoid all entries in the setup_data linked list. */
+ ptr = (struct setup_data *)(unsigned long)boot_params_ptr->hdr.setup_data;
+ while (ptr) {
+ struct mem_vector avoid;
+
+ avoid.start = (unsigned long)ptr;
+ avoid.size = sizeof(*ptr) + ptr->len;
+
+ if (mem_overlaps(img, &avoid) && (avoid.start < earliest)) {
+ *overlap = avoid;
+ earliest = overlap->start;
+ is_overlapping = true;
+ }
+
+ if (ptr->type == SETUP_INDIRECT &&
+ ((struct setup_indirect *)ptr->data)->type != SETUP_INDIRECT) {
+ avoid.start = ((struct setup_indirect *)ptr->data)->addr;
+ avoid.size = ((struct setup_indirect *)ptr->data)->len;
+
+ if (mem_overlaps(img, &avoid) && (avoid.start < earliest)) {
+ *overlap = avoid;
+ earliest = overlap->start;
+ is_overlapping = true;
+ }
+ }
+
+ ptr = (struct setup_data *)(unsigned long)ptr->next;
+ }
+
+ return is_overlapping;
+}
+
+struct slot_area {
+ u64 addr;
+ unsigned long num;
+};
+
+#define MAX_SLOT_AREA 100
+
+static struct slot_area slot_areas[MAX_SLOT_AREA];
+static unsigned int slot_area_index;
+static unsigned long slot_max;
+
+static void store_slot_info(struct mem_vector *region, unsigned long image_size)
+{
+ struct slot_area slot_area;
+
+ if (slot_area_index == MAX_SLOT_AREA)
+ return;
+
+ slot_area.addr = region->start;
+ slot_area.num = 1 + (region->size - image_size) / CONFIG_PHYSICAL_ALIGN;
+
+ slot_areas[slot_area_index++] = slot_area;
+ slot_max += slot_area.num;
+}
+
+/*
+ * Skip as many 1GB huge pages as possible in the passed region
+ * according to the number which users specified:
+ */
+static void
+process_gb_huge_pages(struct mem_vector *region, unsigned long image_size)
+{
+ u64 pud_start, pud_end;
+ unsigned long gb_huge_pages;
+ struct mem_vector tmp;
+
+ if (!IS_ENABLED(CONFIG_X86_64) || !max_gb_huge_pages) {
+ store_slot_info(region, image_size);
+ return;
+ }
+
+ /* Are there any 1GB pages in the region? */
+ pud_start = ALIGN(region->start, PUD_SIZE);
+ pud_end = ALIGN_DOWN(region->start + region->size, PUD_SIZE);
+
+ /* No good 1GB huge pages found: */
+ if (pud_start >= pud_end) {
+ store_slot_info(region, image_size);
+ return;
+ }
+
+ /* Check if the head part of the region is usable. */
+ if (pud_start >= region->start + image_size) {
+ tmp.start = region->start;
+ tmp.size = pud_start - region->start;
+ store_slot_info(&tmp, image_size);
+ }
+
+ /* Skip the good 1GB pages. */
+ gb_huge_pages = (pud_end - pud_start) >> PUD_SHIFT;
+ if (gb_huge_pages > max_gb_huge_pages) {
+ pud_end = pud_start + (max_gb_huge_pages << PUD_SHIFT);
+ max_gb_huge_pages = 0;
+ } else {
+ max_gb_huge_pages -= gb_huge_pages;
+ }
+
+ /* Check if the tail part of the region is usable. */
+ if (region->start + region->size >= pud_end + image_size) {
+ tmp.start = pud_end;
+ tmp.size = region->start + region->size - pud_end;
+ store_slot_info(&tmp, image_size);
+ }
+}
+
+static u64 slots_fetch_random(void)
+{
+ unsigned long slot;
+ unsigned int i;
+
+ /* Handle case of no slots stored. */
+ if (slot_max == 0)
+ return 0;
+
+ slot = kaslr_get_random_long("Physical") % slot_max;
+
+ for (i = 0; i < slot_area_index; i++) {
+ if (slot >= slot_areas[i].num) {
+ slot -= slot_areas[i].num;
+ continue;
+ }
+ return slot_areas[i].addr + ((u64)slot * CONFIG_PHYSICAL_ALIGN);
+ }
+
+ if (i == slot_area_index)
+ debug_putstr("slots_fetch_random() failed!?\n");
+ return 0;
+}
+
+static void __process_mem_region(struct mem_vector *entry,
+ unsigned long minimum,
+ unsigned long image_size)
+{
+ struct mem_vector region, overlap;
+ u64 region_end;
+
+ /* Enforce minimum and memory limit. */
+ region.start = max_t(u64, entry->start, minimum);
+ region_end = min(entry->start + entry->size, mem_limit);
+
+ /* Give up if slot area array is full. */
+ while (slot_area_index < MAX_SLOT_AREA) {
+ /* Potentially raise address to meet alignment needs. */
+ region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);
+
+ /* Did we raise the address above the passed in memory entry? */
+ if (region.start > region_end)
+ return;
+
+ /* Reduce size by any delta from the original address. */
+ region.size = region_end - region.start;
+
+ /* Return if region can't contain decompressed kernel */
+ if (region.size < image_size)
+ return;
+
+ /* If nothing overlaps, store the region and return. */
+ if (!mem_avoid_overlap(&region, &overlap)) {
+ process_gb_huge_pages(&region, image_size);
+ return;
+ }
+
+ /* Store beginning of region if holds at least image_size. */
+ if (overlap.start >= region.start + image_size) {
+ region.size = overlap.start - region.start;
+ process_gb_huge_pages(&region, image_size);
+ }
+
+ /* Clip off the overlapping region and start over. */
+ region.start = overlap.start + overlap.size;
+ }
+}
+
+static bool process_mem_region(struct mem_vector *region,
+ unsigned long minimum,
+ unsigned long image_size)
+{
+ int i;
+ /*
+ * If no immovable memory found, or MEMORY_HOTREMOVE disabled,
+ * use @region directly.
+ */
+ if (!num_immovable_mem) {
+ __process_mem_region(region, minimum, image_size);
+
+ if (slot_area_index == MAX_SLOT_AREA) {
+ debug_putstr("Aborted e820/efi memmap scan (slot_areas full)!\n");
+ return true;
+ }
+ return false;
+ }
+
+#if defined(CONFIG_MEMORY_HOTREMOVE) && defined(CONFIG_ACPI)
+ /*
+ * If immovable memory found, filter the intersection between
+ * immovable memory and @region.
+ */
+ for (i = 0; i < num_immovable_mem; i++) {
+ u64 start, end, entry_end, region_end;
+ struct mem_vector entry;
+
+ if (!mem_overlaps(region, &immovable_mem[i]))
+ continue;
+
+ start = immovable_mem[i].start;
+ end = start + immovable_mem[i].size;
+ region_end = region->start + region->size;
+
+ entry.start = clamp(region->start, start, end);
+ entry_end = clamp(region_end, start, end);
+ entry.size = entry_end - entry.start;
+
+ __process_mem_region(&entry, minimum, image_size);
+
+ if (slot_area_index == MAX_SLOT_AREA) {
+ debug_putstr("Aborted e820/efi memmap scan when walking immovable regions(slot_areas full)!\n");
+ return true;
+ }
+ }
+#endif
+ return false;
+}
+
+#ifdef CONFIG_EFI
+
+/*
+ * Only EFI_CONVENTIONAL_MEMORY and EFI_UNACCEPTED_MEMORY (if supported) are
+ * guaranteed to be free.
+ *
+ * Pick free memory more conservatively than the EFI spec allows: according to
+ * the spec, EFI_BOOT_SERVICES_{CODE|DATA} are also free memory and thus
+ * available to place the kernel image into, but in practice there's firmware
+ * where using that memory leads to crashes. Buggy vendor EFI code registers
+ * for an event that triggers on SetVirtualAddressMap(). The handler assumes
+ * that EFI_BOOT_SERVICES_DATA memory has not been touched by loader yet, which
+ * is probably true for Windows.
+ *
+ * Preserve EFI_BOOT_SERVICES_* regions until after SetVirtualAddressMap().
+ */
+static inline bool memory_type_is_free(efi_memory_desc_t *md)
+{
+ if (md->type == EFI_CONVENTIONAL_MEMORY)
+ return true;
+
+ if (IS_ENABLED(CONFIG_UNACCEPTED_MEMORY) &&
+ md->type == EFI_UNACCEPTED_MEMORY)
+ return true;
+
+ return false;
+}
+
+/*
+ * Returns true if we processed the EFI memmap, which we prefer over the E820
+ * table if it is available.
+ */
+static bool
+process_efi_entries(unsigned long minimum, unsigned long image_size)
+{
+ struct efi_info *e = &boot_params_ptr->efi_info;
+ bool efi_mirror_found = false;
+ struct mem_vector region;
+ efi_memory_desc_t *md;
+ unsigned long pmap;
+ char *signature;
+ u32 nr_desc;
+ int i;
+
+ signature = (char *)&e->efi_loader_signature;
+ if (strncmp(signature, EFI32_LOADER_SIGNATURE, 4) &&
+ strncmp(signature, EFI64_LOADER_SIGNATURE, 4))
+ return false;
+
+#ifdef CONFIG_X86_32
+ /* Can't handle data above 4GB at this time */
+ if (e->efi_memmap_hi) {
+ warn("EFI memmap is above 4GB, can't be handled now on x86_32. EFI should be disabled.\n");
+ return false;
+ }
+ pmap = e->efi_memmap;
+#else
+ pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32));
+#endif
+
+ nr_desc = e->efi_memmap_size / e->efi_memdesc_size;
+ for (i = 0; i < nr_desc; i++) {
+ md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i);
+ if (md->attribute & EFI_MEMORY_MORE_RELIABLE) {
+ efi_mirror_found = true;
+ break;
+ }
+ }
+
+ for (i = 0; i < nr_desc; i++) {
+ md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i);
+
+ if (!memory_type_is_free(md))
+ continue;
+
+ if (efi_soft_reserve_enabled() &&
+ (md->attribute & EFI_MEMORY_SP))
+ continue;
+
+ if (efi_mirror_found &&
+ !(md->attribute & EFI_MEMORY_MORE_RELIABLE))
+ continue;
+
+ region.start = md->phys_addr;
+ region.size = md->num_pages << EFI_PAGE_SHIFT;
+ if (process_mem_region(&region, minimum, image_size))
+ break;
+ }
+ return true;
+}
+#else
+static inline bool
+process_efi_entries(unsigned long minimum, unsigned long image_size)
+{
+ return false;
+}
+#endif
+
+static void process_e820_entries(unsigned long minimum,
+ unsigned long image_size)
+{
+ int i;
+ struct mem_vector region;
+ struct boot_e820_entry *entry;
+
+ /* Verify potential e820 positions, appending to slots list. */
+ for (i = 0; i < boot_params_ptr->e820_entries; i++) {
+ entry = &boot_params_ptr->e820_table[i];
+ /* Skip non-RAM entries. */
+ if (entry->type != E820_TYPE_RAM)
+ continue;
+ region.start = entry->addr;
+ region.size = entry->size;
+ if (process_mem_region(&region, minimum, image_size))
+ break;
+ }
+}
+
+/*
+ * If KHO is active, only process its scratch areas to ensure we are not
+ * stepping onto preserved memory.
+ */
+static bool process_kho_entries(unsigned long minimum, unsigned long image_size)
+{
+ struct kho_scratch *kho_scratch;
+ struct setup_data *ptr;
+ struct kho_data *kho;
+ int i, nr_areas = 0;
+
+ if (!IS_ENABLED(CONFIG_KEXEC_HANDOVER))
+ return false;
+
+ ptr = (struct setup_data *)(unsigned long)boot_params_ptr->hdr.setup_data;
+ while (ptr) {
+ if (ptr->type == SETUP_KEXEC_KHO) {
+ kho = (struct kho_data *)(unsigned long)ptr->data;
+ kho_scratch = (void *)(unsigned long)kho->scratch_addr;
+ nr_areas = kho->scratch_size / sizeof(*kho_scratch);
+ break;
+ }
+
+ ptr = (struct setup_data *)(unsigned long)ptr->next;
+ }
+
+ if (!nr_areas)
+ return false;
+
+ for (i = 0; i < nr_areas; i++) {
+ struct kho_scratch *area = &kho_scratch[i];
+ struct mem_vector region = {
+ .start = area->addr,
+ .size = area->size,
+ };
+
+ if (process_mem_region(&region, minimum, image_size))
+ break;
+ }
+
+ return true;
+}
+
+static unsigned long find_random_phys_addr(unsigned long minimum,
+ unsigned long image_size)
+{
+ u64 phys_addr;
+
+ /* Bail out early if it's impossible to succeed. */
+ if (minimum + image_size > mem_limit)
+ return 0;
+
+ /* Check if we had too many memmaps. */
+ if (memmap_too_large) {
+ debug_putstr("Aborted memory entries scan (more than 4 memmap= args)!\n");
+ return 0;
+ }
+
+ /*
+ * During kexec handover only process KHO scratch areas that are known
+ * not to contain any data that must be preserved.
+ */
+ if (!process_kho_entries(minimum, image_size) &&
+ !process_efi_entries(minimum, image_size))
+ process_e820_entries(minimum, image_size);
+
+ phys_addr = slots_fetch_random();
+
+ /* Perform a final check to make sure the address is in range. */
+ if (phys_addr < minimum || phys_addr + image_size > mem_limit) {
+ warn("Invalid physical address chosen!\n");
+ return 0;
+ }
+
+ return (unsigned long)phys_addr;
+}
+
+static unsigned long find_random_virt_addr(unsigned long minimum,
+ unsigned long image_size)
+{
+ unsigned long slots, random_addr;
+
+ /*
+ * There are how many CONFIG_PHYSICAL_ALIGN-sized slots
+ * that can hold image_size within the range of minimum to
+ * KERNEL_IMAGE_SIZE?
+ */
+ slots = 1 + (KERNEL_IMAGE_SIZE - minimum - image_size) / CONFIG_PHYSICAL_ALIGN;
+
+ random_addr = kaslr_get_random_long("Virtual") % slots;
+
+ return random_addr * CONFIG_PHYSICAL_ALIGN + minimum;
+}
+
+/*
+ * Since this function examines addresses much more numerically,
+ * it takes the input and output pointers as 'unsigned long'.
+ */
+void choose_random_location(unsigned long input,
+ unsigned long input_size,
+ unsigned long *output,
+ unsigned long output_size,
+ unsigned long *virt_addr)
+{
+ unsigned long random_addr, min_addr;
+
+ if (cmdline_find_option_bool("nokaslr")) {
+ warn("KASLR disabled: 'nokaslr' on cmdline.");
+ return;
+ }
+
+ boot_params_ptr->hdr.loadflags |= KASLR_FLAG;
+
+ if (IS_ENABLED(CONFIG_X86_32))
+ mem_limit = KERNEL_IMAGE_SIZE;
+ else
+ mem_limit = MAXMEM;
+
+ /* Record the various known unsafe memory ranges. */
+ mem_avoid_init(input, input_size, *output);
+
+ /*
+ * Low end of the randomization range should be the
+ * smaller of 512M or the initial kernel image
+ * location:
+ */
+ min_addr = min(*output, 512UL << 20);
+ /* Make sure minimum is aligned. */
+ min_addr = ALIGN(min_addr, CONFIG_PHYSICAL_ALIGN);
+
+ /* Walk available memory entries to find a random address. */
+ random_addr = find_random_phys_addr(min_addr, output_size);
+ if (!random_addr) {
+ warn("Physical KASLR disabled: no suitable memory region!");
+ } else {
+ /* Update the new physical address location. */
+ if (*output != random_addr)
+ *output = random_addr;
+ }
+
+
+ /* Pick random virtual address starting from LOAD_PHYSICAL_ADDR. */
+ if (IS_ENABLED(CONFIG_X86_64))
+ random_addr = find_random_virt_addr(LOAD_PHYSICAL_ADDR, output_size);
+ *virt_addr = random_addr;
+}
diff --git a/arch/x86/boot/compressed/kernel_info.S b/arch/x86/boot/compressed/kernel_info.S
new file mode 100644
index 000000000000..f818ee8fba38
--- /dev/null
+++ b/arch/x86/boot/compressed/kernel_info.S
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <asm/bootparam.h>
+
+ .section ".rodata.kernel_info", "a"
+
+ .global kernel_info
+
+kernel_info:
+ /* Header, Linux top (structure). */
+ .ascii "LToP"
+ /* Size. */
+ .long kernel_info_var_len_data - kernel_info
+ /* Size total. */
+ .long kernel_info_end - kernel_info
+
+ /* Maximal allowed type for setup_data and setup_indirect structs. */
+ .long SETUP_TYPE_MAX
+
+kernel_info_var_len_data:
+ /* Empty for time being... */
+kernel_info_end:
diff --git a/arch/x86/boot/compressed/mem.c b/arch/x86/boot/compressed/mem.c
new file mode 100644
index 000000000000..0e9f84ab4bdc
--- /dev/null
+++ b/arch/x86/boot/compressed/mem.c
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "error.h"
+#include "misc.h"
+#include "tdx.h"
+#include "sev.h"
+#include <asm/shared/tdx.h>
+
+/*
+ * accept_memory() and process_unaccepted_memory() called from EFI stub which
+ * runs before decompressor and its early_tdx_detect().
+ *
+ * Enumerate TDX directly from the early users.
+ */
+static bool early_is_tdx_guest(void)
+{
+ static bool once;
+ static bool is_tdx;
+
+ if (!IS_ENABLED(CONFIG_INTEL_TDX_GUEST))
+ return false;
+
+ if (!once) {
+ u32 eax, sig[3];
+
+ cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax,
+ &sig[0], &sig[2], &sig[1]);
+ is_tdx = !memcmp(TDX_IDENT, sig, sizeof(sig));
+ once = true;
+ }
+
+ return is_tdx;
+}
+
+void arch_accept_memory(phys_addr_t start, phys_addr_t end)
+{
+ /* Platform-specific memory-acceptance call goes here */
+ if (early_is_tdx_guest()) {
+ if (!tdx_accept_memory(start, end))
+ panic("TDX: Failed to accept memory\n");
+ } else if (early_is_sevsnp_guest()) {
+ snp_accept_memory(start, end);
+ } else {
+ error("Cannot accept memory: unknown platform\n");
+ }
+}
+
+bool init_unaccepted_memory(void)
+{
+ guid_t guid = LINUX_EFI_UNACCEPTED_MEM_TABLE_GUID;
+ struct efi_unaccepted_memory *table;
+ unsigned long cfg_table_pa;
+ unsigned int cfg_table_len;
+ enum efi_type et;
+ int ret;
+
+ et = efi_get_type(boot_params_ptr);
+ if (et == EFI_TYPE_NONE)
+ return false;
+
+ ret = efi_get_conf_table(boot_params_ptr, &cfg_table_pa, &cfg_table_len);
+ if (ret) {
+ warn("EFI config table not found.");
+ return false;
+ }
+
+ table = (void *)efi_find_vendor_table(boot_params_ptr, cfg_table_pa,
+ cfg_table_len, guid);
+ if (!table)
+ return false;
+
+ if (table->version != 1)
+ error("Unknown version of unaccepted memory table\n");
+
+ /*
+ * In many cases unaccepted_table is already set by EFI stub, but it
+ * has to be initialized again to cover cases when the table is not
+ * allocated by EFI stub or EFI stub copied the kernel image with
+ * efi_relocate_kernel() before the variable is set.
+ *
+ * It must be initialized before the first usage of accept_memory().
+ */
+ unaccepted_table = table;
+
+ return true;
+}
diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S
new file mode 100644
index 000000000000..32f7cc8a8625
--- /dev/null
+++ b/arch/x86/boot/compressed/mem_encrypt.S
@@ -0,0 +1,324 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2017 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ */
+
+#include <linux/linkage.h>
+
+#include <asm/processor-flags.h>
+#include <asm/msr.h>
+#include <asm/asm-offsets.h>
+#include <asm/segment.h>
+#include <asm/trapnr.h>
+
+ .text
+ .code32
+SYM_FUNC_START(get_sev_encryption_bit)
+ push %ebx
+
+ movl $0x80000000, %eax /* CPUID to check the highest leaf */
+ cpuid
+ cmpl $0x8000001f, %eax /* See if 0x8000001f is available */
+ jb .Lno_sev
+
+ /*
+ * Check for the SEV feature:
+ * CPUID Fn8000_001F[EAX] - Bit 1
+ * CPUID Fn8000_001F[EBX] - Bits 5:0
+ * Pagetable bit position used to indicate encryption
+ */
+ movl $0x8000001f, %eax
+ cpuid
+ bt $1, %eax /* Check if SEV is available */
+ jnc .Lno_sev
+
+ movl $MSR_AMD64_SEV, %ecx /* Read the SEV MSR */
+ rdmsr
+ bt $MSR_AMD64_SEV_ENABLED_BIT, %eax /* Check if SEV is active */
+ jnc .Lno_sev
+
+ movl %ebx, %eax
+ andl $0x3f, %eax /* Return the encryption bit location */
+ jmp .Lsev_exit
+
+.Lno_sev:
+ xor %eax, %eax
+
+.Lsev_exit:
+ pop %ebx
+ RET
+SYM_FUNC_END(get_sev_encryption_bit)
+
+/**
+ * sev_es_req_cpuid - Request a CPUID value from the Hypervisor using
+ * the GHCB MSR protocol
+ *
+ * @%eax: Register to request (0=EAX, 1=EBX, 2=ECX, 3=EDX)
+ * @%edx: CPUID Function
+ *
+ * Returns 0 in %eax on success, non-zero on failure
+ * %edx returns CPUID value on success
+ */
+SYM_CODE_START_LOCAL(sev_es_req_cpuid)
+ shll $30, %eax
+ orl $0x00000004, %eax
+ movl $MSR_AMD64_SEV_ES_GHCB, %ecx
+ wrmsr
+ rep; vmmcall # VMGEXIT
+ rdmsr
+
+ /* Check response */
+ movl %eax, %ecx
+ andl $0x3ffff000, %ecx # Bits [12-29] MBZ
+ jnz 2f
+
+ /* Check return code */
+ andl $0xfff, %eax
+ cmpl $5, %eax
+ jne 2f
+
+ /* All good - return success */
+ xorl %eax, %eax
+1:
+ RET
+2:
+ movl $-1, %eax
+ jmp 1b
+SYM_CODE_END(sev_es_req_cpuid)
+
+SYM_CODE_START_LOCAL(startup32_vc_handler)
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+
+ /* Keep CPUID function in %ebx */
+ movl %eax, %ebx
+
+ /* Check if error-code == SVM_EXIT_CPUID */
+ cmpl $0x72, 16(%esp)
+ jne .Lfail
+
+ movl $0, %eax # Request CPUID[fn].EAX
+ movl %ebx, %edx # CPUID fn
+ call sev_es_req_cpuid # Call helper
+ testl %eax, %eax # Check return code
+ jnz .Lfail
+ movl %edx, 12(%esp) # Store result
+
+ movl $1, %eax # Request CPUID[fn].EBX
+ movl %ebx, %edx # CPUID fn
+ call sev_es_req_cpuid # Call helper
+ testl %eax, %eax # Check return code
+ jnz .Lfail
+ movl %edx, 8(%esp) # Store result
+
+ movl $2, %eax # Request CPUID[fn].ECX
+ movl %ebx, %edx # CPUID fn
+ call sev_es_req_cpuid # Call helper
+ testl %eax, %eax # Check return code
+ jnz .Lfail
+ movl %edx, 4(%esp) # Store result
+
+ movl $3, %eax # Request CPUID[fn].EDX
+ movl %ebx, %edx # CPUID fn
+ call sev_es_req_cpuid # Call helper
+ testl %eax, %eax # Check return code
+ jnz .Lfail
+ movl %edx, 0(%esp) # Store result
+
+ /*
+ * Sanity check CPUID results from the Hypervisor. See comment in
+ * do_vc_no_ghcb() for more details on why this is necessary.
+ */
+
+ /* Fail if SEV leaf not available in CPUID[0x80000000].EAX */
+ cmpl $0x80000000, %ebx
+ jne .Lcheck_sev
+ cmpl $0x8000001f, 12(%esp)
+ jb .Lfail
+ jmp .Ldone
+
+.Lcheck_sev:
+ /* Fail if SEV bit not set in CPUID[0x8000001f].EAX[1] */
+ cmpl $0x8000001f, %ebx
+ jne .Ldone
+ btl $1, 12(%esp)
+ jnc .Lfail
+
+.Ldone:
+ popl %edx
+ popl %ecx
+ popl %ebx
+ popl %eax
+
+ /* Remove error code */
+ addl $4, %esp
+
+ /* Jump over CPUID instruction */
+ addl $2, (%esp)
+
+ iret
+.Lfail:
+ /* Send terminate request to Hypervisor */
+ movl $0x100, %eax
+ xorl %edx, %edx
+ movl $MSR_AMD64_SEV_ES_GHCB, %ecx
+ wrmsr
+ rep; vmmcall
+
+ /* If request fails, go to hlt loop */
+ hlt
+ jmp .Lfail
+SYM_CODE_END(startup32_vc_handler)
+
+/*
+ * Write an IDT entry into boot32_idt
+ *
+ * Parameters:
+ *
+ * %eax: Handler address
+ * %edx: Vector number
+ * %ecx: IDT address
+ */
+SYM_FUNC_START_LOCAL(startup32_set_idt_entry)
+ /* IDT entry address to %ecx */
+ leal (%ecx, %edx, 8), %ecx
+
+ /* Build IDT entry, lower 4 bytes */
+ movl %eax, %edx
+ andl $0x0000ffff, %edx # Target code segment offset [15:0]
+ orl $(__KERNEL32_CS << 16), %edx # Target code segment selector
+
+ /* Store lower 4 bytes to IDT */
+ movl %edx, (%ecx)
+
+ /* Build IDT entry, upper 4 bytes */
+ movl %eax, %edx
+ andl $0xffff0000, %edx # Target code segment offset [31:16]
+ orl $0x00008e00, %edx # Present, Type 32-bit Interrupt Gate
+
+ /* Store upper 4 bytes to IDT */
+ movl %edx, 4(%ecx)
+
+ RET
+SYM_FUNC_END(startup32_set_idt_entry)
+
+SYM_FUNC_START(startup32_load_idt)
+ push %ebp
+ push %ebx
+
+ call 1f
+1: pop %ebp
+
+ leal (boot32_idt - 1b)(%ebp), %ebx
+
+ /* #VC handler */
+ leal (startup32_vc_handler - 1b)(%ebp), %eax
+ movl $X86_TRAP_VC, %edx
+ movl %ebx, %ecx
+ call startup32_set_idt_entry
+
+ /* Load IDT */
+ leal (boot32_idt_desc - 1b)(%ebp), %ecx
+ movl %ebx, 2(%ecx)
+ lidt (%ecx)
+
+ pop %ebx
+ pop %ebp
+ RET
+SYM_FUNC_END(startup32_load_idt)
+
+/*
+ * Check for the correct C-bit position when the startup_32 boot-path is used.
+ *
+ * The check makes use of the fact that all memory is encrypted when paging is
+ * disabled. The function creates 64 bits of random data using the RDRAND
+ * instruction. RDRAND is mandatory for SEV guests, so always available. If the
+ * hypervisor violates that the kernel will crash right here.
+ *
+ * The 64 bits of random data are stored to a memory location and at the same
+ * time kept in the %eax and %ebx registers. Since encryption is always active
+ * when paging is off the random data will be stored encrypted in main memory.
+ *
+ * Then paging is enabled. When the C-bit position is correct all memory is
+ * still mapped encrypted and comparing the register values with memory will
+ * succeed. An incorrect C-bit position will map all memory unencrypted, so that
+ * the compare will use the encrypted random data and fail.
+ */
+SYM_FUNC_START(startup32_check_sev_cbit)
+ pushl %ebx
+ pushl %ebp
+
+ call 0f
+0: popl %ebp
+
+ /* Check for non-zero sev_status */
+ movl (sev_status - 0b)(%ebp), %eax
+ testl %eax, %eax
+ jz 4f
+
+ /*
+ * Get two 32-bit random values - Don't bail out if RDRAND fails
+ * because it is better to prevent forward progress if no random value
+ * can be gathered.
+ */
+1: rdrand %eax
+ jnc 1b
+2: rdrand %ebx
+ jnc 2b
+
+ /* Store to memory and keep it in the registers */
+ leal (sev_check_data - 0b)(%ebp), %ebp
+ movl %eax, 0(%ebp)
+ movl %ebx, 4(%ebp)
+
+ /* Enable paging to see if encryption is active */
+ movl %cr0, %edx /* Backup %cr0 in %edx */
+ movl $(X86_CR0_PG | X86_CR0_PE), %ecx /* Enable Paging and Protected mode */
+ movl %ecx, %cr0
+
+ cmpl %eax, 0(%ebp)
+ jne 3f
+ cmpl %ebx, 4(%ebp)
+ jne 3f
+
+ movl %edx, %cr0 /* Restore previous %cr0 */
+
+ jmp 4f
+
+3: /* Check failed - hlt the machine */
+ hlt
+ jmp 3b
+
+4:
+ popl %ebp
+ popl %ebx
+ RET
+SYM_FUNC_END(startup32_check_sev_cbit)
+
+ .code64
+
+#include "../../kernel/sev_verify_cbit.S"
+
+ .data
+
+ .balign 8
+SYM_DATA(sme_me_mask, .quad 0)
+SYM_DATA(sev_status, .quad 0)
+SYM_DATA(sev_check_data, .quad 0)
+
+SYM_DATA_START_LOCAL(boot32_idt)
+ .rept 32
+ .quad 0
+ .endr
+SYM_DATA_END(boot32_idt)
+
+SYM_DATA_START_LOCAL(boot32_idt_desc)
+ .word . - boot32_idt - 1
+ .long 0
+SYM_DATA_END(boot32_idt_desc)
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 0319c88290a5..0f41ca0e52c0 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -1,8 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* misc.c
*
- * This is a collection of several routines from gzip-1.0.3
- * adapted for Linux.
+ * This is a collection of several routines used to extract the kernel
+ * which includes KASLR relocation, decompression, ELF parsing, and
+ * relocation processing. Additionally included are the screen and serial
+ * output functions and related debugging support functions.
*
* malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994
* puts by Nick Holloway 1993, better puts by Martin Mares 1995
@@ -10,120 +13,52 @@
*/
#include "misc.h"
-
-/* WARNING!!
- * This code is compiled with -fPIC and it is relocated dynamically
- * at run time, but no relocation processing is performed.
- * This means that it is not safe to place pointers in static structures.
- */
+#include "error.h"
+#include "../string.h"
+#include "../voffset.h"
+#include <asm/bootparam_utils.h>
/*
- * Getting to provable safe in place decompression is hard.
- * Worst case behaviours need to be analyzed.
- * Background information:
- *
- * The file layout is:
- * magic[2]
- * method[1]
- * flags[1]
- * timestamp[4]
- * extraflags[1]
- * os[1]
- * compressed data blocks[N]
- * crc[4] orig_len[4]
- *
- * resulting in 18 bytes of non compressed data overhead.
- *
- * Files divided into blocks
- * 1 bit (last block flag)
- * 2 bits (block type)
- *
- * 1 block occurs every 32K -1 bytes or when there 50% compression
- * has been achieved. The smallest block type encoding is always used.
- *
- * stored:
- * 32 bits length in bytes.
- *
- * fixed:
- * magic fixed tree.
- * symbols.
- *
- * dynamic:
- * dynamic tree encoding.
- * symbols.
- *
- *
- * The buffer for decompression in place is the length of the
- * uncompressed data, plus a small amount extra to keep the algorithm safe.
- * The compressed data is placed at the end of the buffer. The output
- * pointer is placed at the start of the buffer and the input pointer
- * is placed where the compressed data starts. Problems will occur
- * when the output pointer overruns the input pointer.
- *
- * The output pointer can only overrun the input pointer if the input
- * pointer is moving faster than the output pointer. A condition only
- * triggered by data whose compressed form is larger than the uncompressed
- * form.
- *
- * The worst case at the block level is a growth of the compressed data
- * of 5 bytes per 32767 bytes.
- *
- * The worst case internal to a compressed block is very hard to figure.
- * The worst case can at least be boundined by having one bit that represents
- * 32764 bytes and then all of the rest of the bytes representing the very
- * very last byte.
- *
- * All of which is enough to compute an amount of extra data that is required
- * to be safe. To avoid problems at the block level allocating 5 extra bytes
- * per 32767 bytes of data is sufficient. To avoind problems internal to a
- * block adding an extra 32767 bytes (the worst case uncompressed block size)
- * is sufficient, to ensure that in the worst case the decompressed data for
- * block will stop the byte before the compressed data for a block begins.
- * To avoid problems with the compressed data's meta information an extra 18
- * bytes are needed. Leading to the formula:
- *
- * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size.
- *
- * Adding 8 bytes per 32K is a bit excessive but much easier to calculate.
- * Adding 32768 instead of 32767 just makes for round numbers.
- * Adding the decompressor_size is necessary as it musht live after all
- * of the data as well. Last I measured the decompressor is about 14K.
- * 10K of actual data and 4K of bss.
- *
+ * WARNING!!
+ * This code is compiled with -fPIC and it is relocated dynamically at
+ * run time, but no relocation processing is performed. This means that
+ * it is not safe to place pointers in static structures.
*/
-/*
- * gzip declarations
- */
+/* Macros used by the included decompressor code below. */
#define STATIC static
+/* Define an externally visible malloc()/free(). */
+#define MALLOC_VISIBLE
+#include <linux/decompress/mm.h>
-#undef memset
-#undef memcpy
+/*
+ * Provide definitions of memzero and memmove as some of the decompressors will
+ * try to define their own functions if these are not defined as macros.
+ */
#define memzero(s, n) memset((s), 0, (n))
-
-
-static void error(char *m);
+#ifndef memmove
+#define memmove memmove
+/* Functions used by the included decompressor code below. */
+void *memmove(void *dest, const void *src, size_t n);
+#endif
/*
* This is set up by the setup-routine at boot-time
*/
-struct boot_params *real_mode; /* Pointer to real-mode data */
+struct boot_params *boot_params_ptr;
-void *memset(void *s, int c, size_t n);
-void *memcpy(void *dest, const void *src, size_t n);
+struct port_io_ops pio_ops;
-#ifdef CONFIG_X86_64
-#define memptr long
-#else
-#define memptr unsigned
-#endif
-
-static memptr free_mem_ptr;
-static memptr free_mem_end_ptr;
+memptr free_mem_ptr;
+memptr free_mem_end_ptr;
+int spurious_nmi_count;
static char *vidmem;
static int vidport;
-static int lines, cols;
+
+/* These might be accessed before .bss is cleared, so use .data instead. */
+static int lines __section(".data");
+static int cols __section(".data");
#ifdef CONFIG_KERNEL_GZIP
#include "../../../../lib/decompress_inflate.c"
@@ -149,11 +84,19 @@ static int lines, cols;
#include "../../../../lib/decompress_unlz4.c"
#endif
+#ifdef CONFIG_KERNEL_ZSTD
+#include "../../../../lib/decompress_unzstd.c"
+#endif
+/*
+ * NOTE: When adding a new decompressor, please update the analysis in
+ * ../header.S.
+ */
+
static void scroll(void)
{
int i;
- memcpy(vidmem, vidmem + cols * 2, (lines - 1) * cols * 2);
+ memmove(vidmem, vidmem + cols * 2, (lines - 1) * cols * 2);
for (i = (lines - 1) * cols * 2; i < lines * cols * 2; i += 2)
vidmem[i] = ' ';
}
@@ -186,12 +129,11 @@ void __putstr(const char *s)
}
}
- if (real_mode->screen_info.orig_video_mode == 0 &&
- lines == 0 && cols == 0)
+ if (lines == 0 || cols == 0)
return;
- x = real_mode->screen_info.orig_x;
- y = real_mode->screen_info.orig_y;
+ x = boot_params_ptr->screen_info.orig_x;
+ y = boot_params_ptr->screen_info.orig_y;
while ((c = *s++) != '\0') {
if (c == '\n') {
@@ -212,8 +154,8 @@ void __putstr(const char *s)
}
}
- real_mode->screen_info.orig_x = x;
- real_mode->screen_info.orig_y = y;
+ boot_params_ptr->screen_info.orig_x = x;
+ boot_params_ptr->screen_info.orig_y = y;
pos = (x + cols * y) * 2; /* Update cursor position */
outb(14, vidport);
@@ -222,56 +164,121 @@ void __putstr(const char *s)
outb(0xff & (pos >> 1), vidport+1);
}
-void *memset(void *s, int c, size_t n)
+static noinline void __putnum(unsigned long value, unsigned int base,
+ int mindig)
{
- int i;
- char *ss = s;
+ char buf[8*sizeof(value)+1];
+ char *p;
+
+ p = buf + sizeof(buf);
+ *--p = '\0';
+
+ while (mindig-- > 0 || value) {
+ unsigned char digit = value % base;
+ digit += (digit >= 10) ? ('a'-10) : '0';
+ *--p = digit;
- for (i = 0; i < n; i++)
- ss[i] = c;
- return s;
+ value /= base;
+ }
+
+ __putstr(p);
}
-#ifdef CONFIG_X86_32
-void *memcpy(void *dest, const void *src, size_t n)
+
+void __puthex(unsigned long value)
{
- int d0, d1, d2;
- asm volatile(
- "rep ; movsl\n\t"
- "movl %4,%%ecx\n\t"
- "rep ; movsb\n\t"
- : "=&c" (d0), "=&D" (d1), "=&S" (d2)
- : "0" (n >> 2), "g" (n & 3), "1" (dest), "2" (src)
- : "memory");
-
- return dest;
+ __putnum(value, 16, sizeof(value)*2);
}
-#else
-void *memcpy(void *dest, const void *src, size_t n)
+
+void __putdec(unsigned long value)
{
- long d0, d1, d2;
- asm volatile(
- "rep ; movsq\n\t"
- "movq %4,%%rcx\n\t"
- "rep ; movsb\n\t"
- : "=&c" (d0), "=&D" (d1), "=&S" (d2)
- : "0" (n >> 3), "g" (n & 7), "1" (dest), "2" (src)
- : "memory");
-
- return dest;
+ __putnum(value, 10, 1);
}
-#endif
-static void error(char *x)
+#ifdef CONFIG_X86_NEED_RELOCS
+static void handle_relocations(void *output, unsigned long output_len,
+ unsigned long virt_addr)
{
- error_putstr("\n\n");
- error_putstr(x);
- error_putstr("\n\n -- System halted");
+ int *reloc;
+ unsigned long delta, map, ptr;
+ unsigned long min_addr = (unsigned long)output;
+ unsigned long max_addr = min_addr + (VO___bss_start - VO__text);
+
+ /*
+ * Calculate the delta between where vmlinux was linked to load
+ * and where it was actually loaded.
+ */
+ delta = min_addr - LOAD_PHYSICAL_ADDR;
+
+ /*
+ * The kernel contains a table of relocation addresses. Those
+ * addresses have the final load address of the kernel in virtual
+ * memory. We are currently working in the self map. So we need to
+ * create an adjustment for kernel memory addresses to the self map.
+ * This will involve subtracting out the base address of the kernel.
+ */
+ map = delta - __START_KERNEL_map;
+
+ /*
+ * 32-bit always performs relocations. 64-bit relocations are only
+ * needed if KASLR has chosen a different starting address offset
+ * from __START_KERNEL_map.
+ */
+ if (IS_ENABLED(CONFIG_X86_64))
+ delta = virt_addr - LOAD_PHYSICAL_ADDR;
+
+ if (!delta) {
+ debug_putstr("No relocation needed... ");
+ return;
+ }
+ debug_putstr("Performing relocations... ");
+
+ /*
+ * Process relocations: 32 bit relocations first then 64 bit after.
+ * Two sets of binary relocations are added to the end of the kernel
+ * before compression. Each relocation table entry is the kernel
+ * address of the location which needs to be updated stored as a
+ * 32-bit value which is sign extended to 64 bits.
+ *
+ * Format is:
+ *
+ * kernel bits...
+ * 0 - zero terminator for 64 bit relocations
+ * 64 bit relocation repeated
+ * 0 - zero terminator for 32 bit relocations
+ * 32 bit relocation repeated
+ *
+ * So we work backwards from the end of the decompressed image.
+ */
+ for (reloc = output + output_len - sizeof(*reloc); *reloc; reloc--) {
+ long extended = *reloc;
+ extended += map;
+
+ ptr = (unsigned long)extended;
+ if (ptr < min_addr || ptr > max_addr)
+ error("32-bit relocation outside of kernel!\n");
+
+ *(uint32_t *)ptr += delta;
+ }
+#ifdef CONFIG_X86_64
+ for (reloc--; *reloc; reloc--) {
+ long extended = *reloc;
+ extended += map;
- while (1)
- asm("hlt");
+ ptr = (unsigned long)extended;
+ if (ptr < min_addr || ptr > max_addr)
+ error("64-bit relocation outside of kernel!\n");
+
+ *(uint64_t *)ptr += delta;
+ }
+#endif
}
+#else
+static inline void handle_relocations(void *output, unsigned long output_len,
+ unsigned long virt_addr)
+{ }
+#endif
-static void parse_elf(void *output)
+static size_t parse_elf(void *output)
{
#ifdef CONFIG_X86_64
Elf64_Ehdr ehdr;
@@ -287,10 +294,8 @@ static void parse_elf(void *output)
if (ehdr.e_ident[EI_MAG0] != ELFMAG0 ||
ehdr.e_ident[EI_MAG1] != ELFMAG1 ||
ehdr.e_ident[EI_MAG2] != ELFMAG2 ||
- ehdr.e_ident[EI_MAG3] != ELFMAG3) {
+ ehdr.e_ident[EI_MAG3] != ELFMAG3)
error("Kernel is not a valid ELF file");
- return;
- }
debug_putstr("Parsing ELF... ");
@@ -305,33 +310,118 @@ static void parse_elf(void *output)
switch (phdr->p_type) {
case PT_LOAD:
+#ifdef CONFIG_X86_64
+ if ((phdr->p_align % 0x200000) != 0)
+ error("Alignment of LOAD segment isn't multiple of 2MB");
+#endif
#ifdef CONFIG_RELOCATABLE
dest = output;
dest += (phdr->p_paddr - LOAD_PHYSICAL_ADDR);
#else
dest = (void *)(phdr->p_paddr);
#endif
- memcpy(dest,
- output + phdr->p_offset,
- phdr->p_filesz);
+ memmove(dest, output + phdr->p_offset, phdr->p_filesz);
break;
default: /* Ignore other PT_* */ break;
}
}
free(phdrs);
+
+ return ehdr.e_entry - LOAD_PHYSICAL_ADDR;
+}
+
+const unsigned long kernel_text_size = VO___start_rodata - VO__text;
+const unsigned long kernel_inittext_offset = VO__sinittext - VO__text;
+const unsigned long kernel_inittext_size = VO___inittext_end - VO__sinittext;
+const unsigned long kernel_total_size = VO__end - VO__text;
+
+static u8 boot_heap[BOOT_HEAP_SIZE] __aligned(4);
+
+extern unsigned char input_data[];
+extern unsigned int input_len, output_len;
+
+unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr,
+ void (*error)(char *x))
+{
+ unsigned long entry;
+
+ if (!free_mem_ptr) {
+ free_mem_ptr = (unsigned long)boot_heap;
+ free_mem_end_ptr = (unsigned long)boot_heap + sizeof(boot_heap);
+ }
+
+ if (__decompress(input_data, input_len, NULL, NULL, outbuf, output_len,
+ NULL, error) < 0)
+ return ULONG_MAX;
+
+ entry = parse_elf(outbuf);
+ handle_relocations(outbuf, output_len, virt_addr);
+
+ return entry;
}
-asmlinkage void decompress_kernel(void *rmode, memptr heap,
- unsigned char *input_data,
- unsigned long input_len,
- unsigned char *output)
+/*
+ * Set the memory encryption xloadflag based on the mem_encrypt= command line
+ * parameter, if provided.
+ */
+static void parse_mem_encrypt(struct setup_header *hdr)
+{
+ int on = cmdline_find_option_bool("mem_encrypt=on");
+ int off = cmdline_find_option_bool("mem_encrypt=off");
+
+ if (on > off)
+ hdr->xloadflags |= XLF_MEM_ENCRYPTION;
+}
+
+static void early_sev_detect(void)
+{
+ /*
+ * Accessing video memory causes guest termination because
+ * the boot stage2 #VC handler of SEV-ES/SNP guests does not
+ * support MMIO handling and kexec -c adds screen_info to the
+ * boot parameters passed to the kexec kernel, which causes
+ * console output to be dumped to both video and serial.
+ */
+ if (sev_status & MSR_AMD64_SEV_ES_ENABLED)
+ lines = cols = 0;
+}
+
+/*
+ * The compressed kernel image (ZO), has been moved so that its position
+ * is against the end of the buffer used to hold the uncompressed kernel
+ * image (VO) and the execution environment (.bss, .brk), which makes sure
+ * there is room to do the in-place decompression. (See header.S for the
+ * calculations.)
+ *
+ * |-----compressed kernel image------|
+ * V V
+ * 0 extract_offset +INIT_SIZE
+ * |-----------|---------------|-------------------------|--------|
+ * | | | |
+ * VO__text startup_32 of ZO VO__end ZO__end
+ * ^ ^
+ * |-------uncompressed kernel image---------|
+ *
+ */
+asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output)
{
- real_mode = rmode;
+ unsigned long virt_addr = LOAD_PHYSICAL_ADDR;
+ memptr heap = (memptr)boot_heap;
+ unsigned long needed_size;
+ size_t entry_offset;
+
+ /* Retain x86 boot parameters pointer passed from startup_32/64. */
+ boot_params_ptr = rmode;
+
+ /* Clear flags intended for solely in-kernel use. */
+ boot_params_ptr->hdr.loadflags &= ~KASLR_FLAG;
- sanitize_boot_params(real_mode);
+ parse_mem_encrypt(&boot_params_ptr->hdr);
- if (real_mode->screen_info.orig_video_mode == 7) {
+ sanitize_boot_params(boot_params_ptr);
+
+ if (boot_params_ptr->screen_info.orig_video_mode == 7) {
vidmem = (char *) 0xb0000;
vidport = 0x3b4;
} else {
@@ -339,32 +429,109 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
vidport = 0x3d4;
}
- lines = real_mode->screen_info.orig_video_lines;
- cols = real_mode->screen_info.orig_video_cols;
+ lines = boot_params_ptr->screen_info.orig_video_lines;
+ cols = boot_params_ptr->screen_info.orig_video_cols;
+
+ init_default_io_ops();
+
+ /*
+ * Detect TDX guest environment.
+ *
+ * It has to be done before console_init() in order to use
+ * paravirtualized port I/O operations if needed.
+ */
+ early_tdx_detect();
+
+ early_sev_detect();
console_init();
- debug_putstr("early console in decompress_kernel\n");
+
+ /*
+ * Save RSDP address for later use. Have this after console_init()
+ * so that early debugging output from the RSDP parsing code can be
+ * collected.
+ */
+ boot_params_ptr->acpi_rsdp_addr = get_rsdp_addr();
+
+ debug_putstr("early console in extract_kernel\n");
free_mem_ptr = heap; /* Heap */
free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
+ /*
+ * The memory hole needed for the kernel is the larger of either
+ * the entire decompressed kernel plus relocation table, or the
+ * entire decompressed kernel plus .bss and .brk sections.
+ *
+ * On X86_64, the memory is mapped with PMD pages. Round the
+ * size up so that the full extent of PMD pages mapped is
+ * included in the check against the valid memory table
+ * entries. This ensures the full mapped area is usable RAM
+ * and doesn't include any reserved areas.
+ */
+ needed_size = max_t(unsigned long, output_len, kernel_total_size);
+#ifdef CONFIG_X86_64
+ needed_size = ALIGN(needed_size, MIN_KERNEL_ALIGN);
+#endif
+
+ /* Report initial kernel position details. */
+ debug_putaddr(input_data);
+ debug_putaddr(input_len);
+ debug_putaddr(output);
+ debug_putaddr(output_len);
+ debug_putaddr(kernel_total_size);
+ debug_putaddr(needed_size);
+
+#ifdef CONFIG_X86_64
+ /* Report address of 32-bit trampoline */
+ debug_putaddr(trampoline_32bit);
+#endif
+
+ choose_random_location((unsigned long)input_data, input_len,
+ (unsigned long *)&output,
+ needed_size,
+ &virt_addr);
+
+ /* Validate memory location choices. */
if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1))
- error("Destination address inappropriately aligned");
+ error("Destination physical address inappropriately aligned");
+ if (virt_addr & (MIN_KERNEL_ALIGN - 1))
+ error("Destination virtual address inappropriately aligned");
#ifdef CONFIG_X86_64
if (heap > 0x3fffffffffffUL)
error("Destination address too large");
+ if (virt_addr + needed_size > KERNEL_IMAGE_SIZE)
+ error("Destination virtual address is beyond the kernel mapping area");
#else
if (heap > ((-__PAGE_OFFSET-(128<<20)-1) & 0x7fffffff))
error("Destination address too large");
#endif
#ifndef CONFIG_RELOCATABLE
- if ((unsigned long)output != LOAD_PHYSICAL_ADDR)
- error("Wrong destination address");
+ if (virt_addr != LOAD_PHYSICAL_ADDR)
+ error("Destination virtual address changed when not relocatable");
#endif
debug_putstr("\nDecompressing Linux... ");
- decompress(input_data, input_len, NULL, NULL, output, NULL, error);
- parse_elf(output);
- debug_putstr("done.\nBooting the kernel.\n");
- return;
+
+ if (init_unaccepted_memory()) {
+ debug_putstr("Accepting memory... ");
+ accept_memory(__pa(output), needed_size);
+ }
+
+ entry_offset = decompress_kernel(output, virt_addr, error);
+
+ debug_putstr("done.\nBooting the kernel (entry_offset: 0x");
+ debug_puthex(entry_offset);
+ debug_putstr(").\n");
+
+ /* Disable exception handling before booting the kernel */
+ cleanup_exception_handling();
+
+ if (spurious_nmi_count) {
+ error_putstr("Spurious early NMIs ignored: ");
+ error_putdec(spurious_nmi_count);
+ error_putstr("\n");
+ }
+
+ return output + entry_offset;
}
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 674019d8e235..fd855e32c9b9 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -1,61 +1,258 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BOOT_COMPRESSED_MISC_H
#define BOOT_COMPRESSED_MISC_H
/*
- * we have to be careful, because no indirections are allowed here, and
- * paravirt_ops is a kind of one. As it will only run in baremetal anyway,
- * we just keep it from happening
+ * Special hack: we have to be careful, because no indirections are allowed here,
+ * and paravirt_ops is a kind of one. As it will only run in baremetal anyway,
+ * we just keep it from happening. (This list needs to be extended when new
+ * paravirt and debugging variants are added.)
*/
#undef CONFIG_PARAVIRT
-#ifdef CONFIG_X86_32
-#define _ASM_X86_DESC_H 1
-#endif
+#undef CONFIG_PARAVIRT_XXL
+#undef CONFIG_PARAVIRT_SPINLOCKS
+#undef CONFIG_KASAN
+#undef CONFIG_KASAN_GENERIC
+
+#define __NO_FORTIFY
+
+/* cpu_feature_enabled() cannot be used this early */
+#define USE_EARLY_PGTABLE_L5
+
+/*
+ * Boot stub deals with identity mappings, physical and virtual addresses are
+ * the same, so override these defines.
+ *
+ * <asm/page.h> will not define them if they are already defined.
+ */
+#define __pa(x) ((unsigned long)(x))
+#define __va(x) ((void *)((unsigned long)(x)))
#include <linux/linkage.h>
#include <linux/screen_info.h>
#include <linux/elf.h>
-#include <linux/io.h>
#include <asm/page.h>
#include <asm/boot.h>
#include <asm/bootparam.h>
-#include <asm/bootparam_utils.h>
+#include <asm/desc_defs.h>
+
+#include "tdx.h"
+
+#define BOOT_CTYPE_H
+#include <linux/acpi.h>
#define BOOT_BOOT_H
#include "../ctype.h"
+#include "../io.h"
+
+#include "efi.h"
+
+#ifdef CONFIG_X86_64
+#define memptr long
+#else
+#define memptr unsigned
+#endif
+
+/* boot/compressed/vmlinux start and end markers */
+extern char _head[], _end[];
/* misc.c */
-extern struct boot_params *real_mode; /* Pointer to real-mode data */
+extern memptr free_mem_ptr;
+extern memptr free_mem_end_ptr;
+extern int spurious_nmi_count;
+void *malloc(int size);
+void free(void *where);
void __putstr(const char *s);
+void __puthex(unsigned long value);
+void __putdec(unsigned long value);
#define error_putstr(__x) __putstr(__x)
+#define error_puthex(__x) __puthex(__x)
+#define error_putdec(__x) __putdec(__x)
#ifdef CONFIG_X86_VERBOSE_BOOTUP
#define debug_putstr(__x) __putstr(__x)
+#define debug_puthex(__x) __puthex(__x)
+#define debug_putaddr(__x) { \
+ debug_putstr(#__x ": 0x"); \
+ debug_puthex((unsigned long)(__x)); \
+ debug_putstr("\n"); \
+ }
#else
static inline void debug_putstr(const char *s)
{ }
+static inline void debug_puthex(unsigned long value)
+{ }
+#define debug_putaddr(x) /* */
#endif
-#ifdef CONFIG_EARLY_PRINTK
-
/* cmdline.c */
int cmdline_find_option(const char *option, char *buffer, int bufsize);
int cmdline_find_option_bool(const char *option);
-/* early_serial_console.c */
-extern int early_serial_base;
-void console_init(void);
+struct mem_vector {
+ u64 start;
+ u64 size;
+};
+#ifdef CONFIG_RANDOMIZE_BASE
+/* kaslr.c */
+void choose_random_location(unsigned long input,
+ unsigned long input_size,
+ unsigned long *output,
+ unsigned long output_size,
+ unsigned long *virt_addr);
#else
+static inline void choose_random_location(unsigned long input,
+ unsigned long input_size,
+ unsigned long *output,
+ unsigned long output_size,
+ unsigned long *virt_addr)
+{
+}
+#endif
+
+/* cpuflags.c */
+bool has_cpuflag(int flag);
+#ifdef CONFIG_X86_64
+extern int set_page_decrypted(unsigned long address);
+extern int set_page_encrypted(unsigned long address);
+extern int set_page_non_present(unsigned long address);
+extern unsigned char _pgtable[];
+#endif
+
+#ifdef CONFIG_EARLY_PRINTK
/* early_serial_console.c */
+extern int early_serial_base;
+void console_init(void);
+#else
static const int early_serial_base;
static inline void console_init(void)
{ }
+#endif
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+struct es_em_ctxt;
+struct insn;
+
+void sev_enable(struct boot_params *bp);
+void snp_check_features(void);
+void sev_es_shutdown_ghcb(void);
+extern bool sev_es_check_ghcb_fault(unsigned long address);
+void snp_set_page_private(unsigned long paddr);
+void snp_set_page_shared(unsigned long paddr);
+void sev_prep_identity_maps(unsigned long top_level_pgt);
+
+enum es_result vc_decode_insn(struct es_em_ctxt *ctxt);
+bool insn_has_rep_prefix(struct insn *insn);
+void sev_insn_decode_init(void);
+bool early_setup_ghcb(void);
+#else
+static inline void snp_check_features(void) { }
+static inline void sev_es_shutdown_ghcb(void) { }
+static inline bool sev_es_check_ghcb_fault(unsigned long address)
+{
+ return false;
+}
+static inline void snp_set_page_private(unsigned long paddr) { }
+static inline void snp_set_page_shared(unsigned long paddr) { }
+static inline void sev_prep_identity_maps(unsigned long top_level_pgt) { }
+#endif
+
+/* acpi.c */
+#ifdef CONFIG_ACPI
+acpi_physical_address get_rsdp_addr(void);
+#else
+static inline acpi_physical_address get_rsdp_addr(void) { return 0; }
+#endif
+#if defined(CONFIG_RANDOMIZE_BASE) && defined(CONFIG_MEMORY_HOTREMOVE) && defined(CONFIG_ACPI)
+extern struct mem_vector immovable_mem[MAX_NUMNODES*2];
+int count_immovable_mem_regions(void);
+#else
+static inline int count_immovable_mem_regions(void) { return 0; }
#endif
+/* ident_map_64.c */
+extern unsigned int __pgtable_l5_enabled, pgdir_shift, ptrs_per_p4d;
+extern void kernel_add_identity_map(unsigned long start, unsigned long end);
+
+/* Used by PAGE_KERN* macros: */
+extern pteval_t __default_kernel_pte_mask;
+
+/* idt_64.c */
+extern gate_desc boot_idt[BOOT_IDT_ENTRIES];
+extern struct desc_ptr boot_idt_desc;
+
+#ifdef CONFIG_X86_64
+void cleanup_exception_handling(void);
+#else
+static inline void cleanup_exception_handling(void) { }
#endif
+
+/* IDT Entry Points */
+void boot_page_fault(void);
+void boot_nmi_trap(void);
+void boot_stage1_vc(void);
+void boot_stage2_vc(void);
+
+unsigned long sev_verify_cbit(unsigned long cr3);
+
+enum efi_type {
+ EFI_TYPE_64,
+ EFI_TYPE_32,
+ EFI_TYPE_NONE,
+};
+
+#ifdef CONFIG_EFI
+/* helpers for early EFI config table access */
+enum efi_type efi_get_type(struct boot_params *bp);
+unsigned long efi_get_system_table(struct boot_params *bp);
+int efi_get_conf_table(struct boot_params *bp, unsigned long *cfg_tbl_pa,
+ unsigned int *cfg_tbl_len);
+unsigned long efi_find_vendor_table(struct boot_params *bp,
+ unsigned long cfg_tbl_pa,
+ unsigned int cfg_tbl_len,
+ efi_guid_t guid);
+#else
+static inline enum efi_type efi_get_type(struct boot_params *bp)
+{
+ return EFI_TYPE_NONE;
+}
+
+static inline unsigned long efi_get_system_table(struct boot_params *bp)
+{
+ return 0;
+}
+
+static inline int efi_get_conf_table(struct boot_params *bp,
+ unsigned long *cfg_tbl_pa,
+ unsigned int *cfg_tbl_len)
+{
+ return -ENOENT;
+}
+
+static inline unsigned long efi_find_vendor_table(struct boot_params *bp,
+ unsigned long cfg_tbl_pa,
+ unsigned int cfg_tbl_len,
+ efi_guid_t guid)
+{
+ return 0;
+}
+#endif /* CONFIG_EFI */
+
+#ifdef CONFIG_UNACCEPTED_MEMORY
+bool init_unaccepted_memory(void);
+#else
+static inline bool init_unaccepted_memory(void) { return false; }
+#endif
+
+/* Defined in EFI stub */
+extern struct efi_unaccepted_memory *unaccepted_table;
+void accept_memory(phys_addr_t start, unsigned long size);
+
+#endif /* BOOT_COMPRESSED_MISC_H */
diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c
index 958a641483dd..52aa56cdbacc 100644
--- a/arch/x86/boot/compressed/mkpiggy.c
+++ b/arch/x86/boot/compressed/mkpiggy.c
@@ -1,28 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* ----------------------------------------------------------------------- *
*
* Copyright (C) 2009 Intel Corporation. All rights reserved.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version
- * 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- *
* H. Peter Anvin <hpa@linux.intel.com>
*
- * ----------------------------------------------------------------------- */
-
-/*
- * Compute the desired load offset from a compressed program; outputs
- * a small assembly wrapper with the appropriate symbols defined.
+ * -----------------------------------------------------------------------
+ *
+ * Outputs a small assembly wrapper with the appropriate symbols defined.
*/
#include <stdlib.h>
@@ -35,12 +20,12 @@ int main(int argc, char *argv[])
{
uint32_t olen;
long ilen;
- unsigned long offs;
- FILE *f;
+ FILE *f = NULL;
+ int retval = 1;
if (argc < 2) {
fprintf(stderr, "Usage: %s compressed_file\n", argv[0]);
- return 1;
+ goto bail;
}
/* Get the information for the compressed kernel image first */
@@ -48,7 +33,7 @@ int main(int argc, char *argv[])
f = fopen(argv[1], "r");
if (!f) {
perror(argv[1]);
- return 1;
+ goto bail;
}
@@ -58,38 +43,32 @@ int main(int argc, char *argv[])
if (fread(&olen, sizeof(olen), 1, f) != 1) {
perror(argv[1]);
- return 1;
+ goto bail;
}
ilen = ftell(f);
olen = get_unaligned_le32(&olen);
- fclose(f);
-
- /*
- * Now we have the input (compressed) and output (uncompressed)
- * sizes, compute the necessary decompression offset...
- */
-
- offs = (olen > ilen) ? olen - ilen : 0;
- offs += olen >> 12; /* Add 8 bytes for each 32K block */
- offs += 64*1024 + 128; /* Add 64K + 128 bytes slack */
- offs = (offs+4095) & ~4095; /* Round to a 4K boundary */
printf(".section \".rodata..compressed\",\"a\",@progbits\n");
printf(".globl z_input_len\n");
printf("z_input_len = %lu\n", ilen);
printf(".globl z_output_len\n");
printf("z_output_len = %lu\n", (unsigned long)olen);
- printf(".globl z_extract_offset\n");
- printf("z_extract_offset = 0x%lx\n", offs);
- /* z_extract_offset_negative allows simplification of head_32.S */
- printf(".globl z_extract_offset_negative\n");
- printf("z_extract_offset_negative = -0x%lx\n", offs);
printf(".globl input_data, input_data_end\n");
printf("input_data:\n");
printf(".incbin \"%s\"\n", argv[1]);
printf("input_data_end:\n");
- return 0;
+ printf(".section \".rodata\",\"a\",@progbits\n");
+ printf(".globl input_len\n");
+ printf("input_len:\n\t.long %lu\n", ilen);
+ printf(".globl output_len\n");
+ printf("output_len:\n\t.long %lu\n", (unsigned long)olen);
+
+ retval = 0;
+bail:
+ if (f)
+ fclose(f);
+ return retval;
}
diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c
new file mode 100644
index 000000000000..0e89e197e112
--- /dev/null
+++ b/arch/x86/boot/compressed/pgtable_64.c
@@ -0,0 +1,200 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "misc.h"
+#include <asm/bootparam.h>
+#include <asm/bootparam_utils.h>
+#include <asm/e820/types.h>
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include "../string.h"
+#include "efi.h"
+
+#define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */
+#define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */
+
+/* __pgtable_l5_enabled needs to be in .data to avoid being cleared along with .bss */
+unsigned int __section(".data") __pgtable_l5_enabled;
+unsigned int __section(".data") pgdir_shift = 39;
+unsigned int __section(".data") ptrs_per_p4d = 1;
+
+/* Buffer to preserve trampoline memory */
+static char trampoline_save[TRAMPOLINE_32BIT_SIZE];
+
+/*
+ * Trampoline address will be printed by extract_kernel() for debugging
+ * purposes.
+ *
+ * Avoid putting the pointer into .bss as it will be cleared between
+ * configure_5level_paging() and extract_kernel().
+ */
+unsigned long *trampoline_32bit __section(".data");
+
+int cmdline_find_option_bool(const char *option);
+
+static unsigned long find_trampoline_placement(void)
+{
+ unsigned long bios_start = 0, ebda_start = 0;
+ struct boot_e820_entry *entry;
+ char *signature;
+ int i;
+
+ /*
+ * Find a suitable spot for the trampoline.
+ * This code is based on reserve_bios_regions().
+ */
+
+ /*
+ * EFI systems may not provide legacy ROM. The memory may not be mapped
+ * at all.
+ *
+ * Only look for values in the legacy ROM for non-EFI system.
+ */
+ signature = (char *)&boot_params_ptr->efi_info.efi_loader_signature;
+ if (strncmp(signature, EFI32_LOADER_SIGNATURE, 4) &&
+ strncmp(signature, EFI64_LOADER_SIGNATURE, 4)) {
+ ebda_start = *(unsigned short *)0x40e << 4;
+ bios_start = *(unsigned short *)0x413 << 10;
+ }
+
+ if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX)
+ bios_start = BIOS_START_MAX;
+
+ if (ebda_start > BIOS_START_MIN && ebda_start < bios_start)
+ bios_start = ebda_start;
+
+ bios_start = round_down(bios_start, PAGE_SIZE);
+
+ /* Find the first usable memory region under bios_start. */
+ for (i = boot_params_ptr->e820_entries - 1; i >= 0; i--) {
+ unsigned long new = bios_start;
+
+ entry = &boot_params_ptr->e820_table[i];
+
+ /* Skip all entries above bios_start. */
+ if (bios_start <= entry->addr)
+ continue;
+
+ /* Skip non-RAM entries. */
+ if (entry->type != E820_TYPE_RAM)
+ continue;
+
+ /* Adjust bios_start to the end of the entry if needed. */
+ if (bios_start > entry->addr + entry->size)
+ new = entry->addr + entry->size;
+
+ /* Keep bios_start page-aligned. */
+ new = round_down(new, PAGE_SIZE);
+
+ /* Skip the entry if it's too small. */
+ if (new - TRAMPOLINE_32BIT_SIZE < entry->addr)
+ continue;
+
+ /* Protect against underflow. */
+ if (new - TRAMPOLINE_32BIT_SIZE > bios_start)
+ break;
+
+ bios_start = new;
+ break;
+ }
+
+ /* Place the trampoline just below the end of low memory */
+ return bios_start - TRAMPOLINE_32BIT_SIZE;
+}
+
+asmlinkage void configure_5level_paging(struct boot_params *bp, void *pgtable)
+{
+ void (*toggle_la57)(void *cr3);
+ bool l5_required = false;
+
+ /* Initialize boot_params. Required for cmdline_find_option_bool(). */
+ sanitize_boot_params(bp);
+ boot_params_ptr = bp;
+
+ /*
+ * Check if LA57 is desired and supported.
+ *
+ * There are several parts to the check:
+ * - if user asked to disable 5-level paging: no5lvl in cmdline
+ * - if the machine supports 5-level paging:
+ * + CPUID leaf 7 is supported
+ * + the leaf has the feature bit set
+ */
+ if (!cmdline_find_option_bool("no5lvl") &&
+ native_cpuid_eax(0) >= 7 && (native_cpuid_ecx(7) & BIT(16))) {
+ l5_required = true;
+
+ /* Initialize variables for 5-level paging */
+ __pgtable_l5_enabled = 1;
+ pgdir_shift = 48;
+ ptrs_per_p4d = 512;
+ }
+
+ /*
+ * The trampoline will not be used if the paging mode is already set to
+ * the desired one.
+ */
+ if (l5_required == !!(native_read_cr4() & X86_CR4_LA57))
+ return;
+
+ trampoline_32bit = (unsigned long *)find_trampoline_placement();
+
+ /* Preserve trampoline memory */
+ memcpy(trampoline_save, trampoline_32bit, TRAMPOLINE_32BIT_SIZE);
+
+ /* Clear trampoline memory first */
+ memset(trampoline_32bit, 0, TRAMPOLINE_32BIT_SIZE);
+
+ /* Copy trampoline code in place */
+ toggle_la57 = memcpy(trampoline_32bit +
+ TRAMPOLINE_32BIT_CODE_OFFSET / sizeof(unsigned long),
+ &trampoline_32bit_src, TRAMPOLINE_32BIT_CODE_SIZE);
+
+ /*
+ * Avoid the need for a stack in the 32-bit trampoline code, by using
+ * LJMP rather than LRET to return back to long mode. LJMP takes an
+ * immediate absolute address, which needs to be adjusted based on the
+ * placement of the trampoline.
+ */
+ *(u32 *)((u8 *)toggle_la57 + trampoline_ljmp_imm_offset) +=
+ (unsigned long)toggle_la57;
+
+ /*
+ * The code below prepares page table in trampoline memory.
+ *
+ * The new page table will be used by trampoline code for switching
+ * from 4- to 5-level paging or vice versa.
+ */
+
+ if (l5_required) {
+ /*
+ * For 4- to 5-level paging transition, set up current CR3 as
+ * the first and the only entry in a new top-level page table.
+ */
+ *trampoline_32bit = native_read_cr3_pa() | _PAGE_TABLE_NOENC;
+ } else {
+ u64 *new_cr3;
+ pgd_t *pgdp;
+
+ /*
+ * For 5- to 4-level paging transition, copy page table pointed
+ * by first entry in the current top-level page table as our
+ * new top-level page table.
+ *
+ * We cannot just point to the page table from trampoline as it
+ * may be above 4G.
+ */
+ pgdp = (pgd_t *)native_read_cr3_pa();
+ new_cr3 = (u64 *)(native_pgd_val(pgdp[0]) & PTE_PFN_MASK);
+ memcpy(trampoline_32bit, new_cr3, PAGE_SIZE);
+ }
+
+ toggle_la57(trampoline_32bit);
+
+ /*
+ * Move the top level page table out of trampoline memory.
+ */
+ memcpy(pgtable, trampoline_32bit, PAGE_SIZE);
+ native_write_cr3((unsigned long)pgtable);
+
+ /* Restore trampoline memory */
+ memcpy(trampoline_32bit, trampoline_save, TRAMPOLINE_32BIT_SIZE);
+}
diff --git a/arch/x86/boot/compressed/sbat.S b/arch/x86/boot/compressed/sbat.S
new file mode 100644
index 000000000000..838f70a997dd
--- /dev/null
+++ b/arch/x86/boot/compressed/sbat.S
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Embed SBAT data in the kernel.
+ */
+ .pushsection ".sbat", "a", @progbits
+ .incbin CONFIG_EFI_SBAT_FILE
+ .popsection
diff --git a/arch/x86/boot/compressed/sev-handle-vc.c b/arch/x86/boot/compressed/sev-handle-vc.c
new file mode 100644
index 000000000000..030001b46554
--- /dev/null
+++ b/arch/x86/boot/compressed/sev-handle-vc.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "misc.h"
+#include "error.h"
+#include "sev.h"
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <asm/insn.h>
+#include <asm/pgtable_types.h>
+#include <asm/ptrace.h>
+#include <asm/sev.h>
+#include <asm/trapnr.h>
+#include <asm/trap_pf.h>
+#include <asm/fpu/xcr.h>
+
+#define __BOOT_COMPRESSED
+#undef __init
+#define __init
+
+/* Basic instruction decoding support needed */
+#include "../../lib/inat.c"
+#include "../../lib/insn.c"
+
+/*
+ * Copy a version of this function here - insn-eval.c can't be used in
+ * pre-decompression code.
+ */
+bool insn_has_rep_prefix(struct insn *insn)
+{
+ insn_byte_t p;
+
+ insn_get_prefixes(insn);
+
+ for_each_insn_prefix(insn, p) {
+ if (p == 0xf2 || p == 0xf3)
+ return true;
+ }
+
+ return false;
+}
+
+enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
+{
+ char buffer[MAX_INSN_SIZE];
+ int ret;
+
+ memcpy(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
+
+ ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64);
+ if (ret < 0)
+ return ES_DECODE_FAILED;
+
+ return ES_OK;
+}
+
+extern void sev_insn_decode_init(void) __alias(inat_init_tables);
+
+/*
+ * Only a dummy for insn_get_seg_base() - Early boot-code is 64bit only and
+ * doesn't use segments.
+ */
+static unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx)
+{
+ return 0UL;
+}
+
+static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
+ void *dst, char *buf, size_t size)
+{
+ memcpy(dst, buf, size);
+
+ return ES_OK;
+}
+
+static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
+ void *src, char *buf, size_t size)
+{
+ memcpy(buf, src, size);
+
+ return ES_OK;
+}
+
+static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size)
+{
+ return ES_OK;
+}
+
+static bool fault_in_kernel_space(unsigned long address)
+{
+ return false;
+}
+
+#define sev_printk(fmt, ...)
+
+#include "../../coco/sev/vc-shared.c"
+
+void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code)
+{
+ struct es_em_ctxt ctxt;
+ enum es_result result;
+
+ if (!boot_ghcb && !early_setup_ghcb())
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
+
+ vc_ghcb_invalidate(boot_ghcb);
+ result = vc_init_em_ctxt(&ctxt, regs, exit_code);
+ if (result != ES_OK)
+ goto finish;
+
+ result = vc_check_opcode_bytes(&ctxt, exit_code);
+ if (result != ES_OK)
+ goto finish;
+
+ switch (exit_code) {
+ case SVM_EXIT_RDTSC:
+ case SVM_EXIT_RDTSCP:
+ result = vc_handle_rdtsc(boot_ghcb, &ctxt, exit_code);
+ break;
+ case SVM_EXIT_IOIO:
+ result = vc_handle_ioio(boot_ghcb, &ctxt);
+ break;
+ case SVM_EXIT_CPUID:
+ result = vc_handle_cpuid(boot_ghcb, &ctxt);
+ break;
+ default:
+ result = ES_UNSUPPORTED;
+ break;
+ }
+
+finish:
+ if (result == ES_OK)
+ vc_finish_insn(&ctxt);
+ else if (result != ES_RETRY)
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
+}
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
new file mode 100644
index 000000000000..c8c1464b3a56
--- /dev/null
+++ b/arch/x86/boot/compressed/sev.c
@@ -0,0 +1,512 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * AMD Encrypted Register State Support
+ *
+ * Author: Joerg Roedel <jroedel@suse.de>
+ */
+
+/*
+ * misc.h needs to be first because it knows how to include the other kernel
+ * headers in the pre-decompression code in a way that does not break
+ * compilation.
+ */
+#include "misc.h"
+
+#include <asm/bootparam.h>
+#include <asm/pgtable_types.h>
+#include <asm/shared/msr.h>
+#include <asm/sev.h>
+#include <asm/trapnr.h>
+#include <asm/trap_pf.h>
+#include <asm/msr-index.h>
+#include <asm/fpu/xcr.h>
+#include <asm/ptrace.h>
+#include <asm/svm.h>
+#include <asm/cpuid/api.h>
+
+#include "error.h"
+#include "sev.h"
+
+static struct ghcb boot_ghcb_page __aligned(PAGE_SIZE);
+struct ghcb *boot_ghcb;
+
+#undef __init
+#define __init
+
+#define __BOOT_COMPRESSED
+
+u8 snp_vmpl;
+u16 ghcb_version;
+
+u64 boot_svsm_caa_pa;
+
+/* Include code for early handlers */
+#include "../../boot/startup/sev-shared.c"
+
+static bool sev_snp_enabled(void)
+{
+ return sev_status & MSR_AMD64_SEV_SNP_ENABLED;
+}
+
+void snp_set_page_private(unsigned long paddr)
+{
+ struct psc_desc d = {
+ SNP_PAGE_STATE_PRIVATE,
+ (struct svsm_ca *)boot_svsm_caa_pa,
+ boot_svsm_caa_pa
+ };
+
+ if (!sev_snp_enabled())
+ return;
+
+ __page_state_change(paddr, paddr, &d);
+}
+
+void snp_set_page_shared(unsigned long paddr)
+{
+ struct psc_desc d = {
+ SNP_PAGE_STATE_SHARED,
+ (struct svsm_ca *)boot_svsm_caa_pa,
+ boot_svsm_caa_pa
+ };
+
+ if (!sev_snp_enabled())
+ return;
+
+ __page_state_change(paddr, paddr, &d);
+}
+
+bool early_setup_ghcb(void)
+{
+ if (set_page_decrypted((unsigned long)&boot_ghcb_page))
+ return false;
+
+ /* Page is now mapped decrypted, clear it */
+ memset(&boot_ghcb_page, 0, sizeof(boot_ghcb_page));
+
+ boot_ghcb = &boot_ghcb_page;
+
+ /* Initialize lookup tables for the instruction decoder */
+ sev_insn_decode_init();
+
+ /* SNP guest requires the GHCB GPA must be registered */
+ if (sev_snp_enabled())
+ snp_register_ghcb_early(__pa(&boot_ghcb_page));
+
+ return true;
+}
+
+void snp_accept_memory(phys_addr_t start, phys_addr_t end)
+{
+ struct psc_desc d = {
+ SNP_PAGE_STATE_PRIVATE,
+ (struct svsm_ca *)boot_svsm_caa_pa,
+ boot_svsm_caa_pa
+ };
+
+ for (phys_addr_t pa = start; pa < end; pa += PAGE_SIZE)
+ __page_state_change(pa, pa, &d);
+}
+
+void sev_es_shutdown_ghcb(void)
+{
+ if (!boot_ghcb)
+ return;
+
+ if (!sev_es_check_cpu_features())
+ error("SEV-ES CPU Features missing.");
+
+ /*
+ * This denotes whether to use the GHCB MSR protocol or the GHCB
+ * shared page to perform a GHCB request. Since the GHCB page is
+ * being changed to encrypted, it can't be used to perform GHCB
+ * requests. Clear the boot_ghcb variable so that the GHCB MSR
+ * protocol is used to change the GHCB page over to an encrypted
+ * page.
+ */
+ boot_ghcb = NULL;
+
+ /*
+ * GHCB Page must be flushed from the cache and mapped encrypted again.
+ * Otherwise the running kernel will see strange cache effects when
+ * trying to use that page.
+ */
+ if (set_page_encrypted((unsigned long)&boot_ghcb_page))
+ error("Can't map GHCB page encrypted");
+
+ /*
+ * GHCB page is mapped encrypted again and flushed from the cache.
+ * Mark it non-present now to catch bugs when #VC exceptions trigger
+ * after this point.
+ */
+ if (set_page_non_present((unsigned long)&boot_ghcb_page))
+ error("Can't unmap GHCB page");
+}
+
+static void __noreturn sev_es_ghcb_terminate(struct ghcb *ghcb, unsigned int set,
+ unsigned int reason, u64 exit_info_2)
+{
+ u64 exit_info_1 = SVM_VMGEXIT_TERM_REASON(set, reason);
+
+ vc_ghcb_invalidate(ghcb);
+ ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_TERM_REQUEST);
+ ghcb_set_sw_exit_info_1(ghcb, exit_info_1);
+ ghcb_set_sw_exit_info_2(ghcb, exit_info_2);
+
+ sev_es_wr_ghcb_msr(__pa(ghcb));
+ VMGEXIT();
+
+ while (true)
+ asm volatile("hlt\n" : : : "memory");
+}
+
+bool sev_es_check_ghcb_fault(unsigned long address)
+{
+ /* Check whether the fault was on the GHCB page */
+ return ((address & PAGE_MASK) == (unsigned long)&boot_ghcb_page);
+}
+
+/*
+ * SNP_FEATURES_IMPL_REQ is the mask of SNP features that will need
+ * guest side implementation for proper functioning of the guest. If any
+ * of these features are enabled in the hypervisor but are lacking guest
+ * side implementation, the behavior of the guest will be undefined. The
+ * guest could fail in non-obvious way making it difficult to debug.
+ *
+ * As the behavior of reserved feature bits is unknown to be on the
+ * safe side add them to the required features mask.
+ */
+#define SNP_FEATURES_IMPL_REQ (MSR_AMD64_SNP_VTOM | \
+ MSR_AMD64_SNP_REFLECT_VC | \
+ MSR_AMD64_SNP_RESTRICTED_INJ | \
+ MSR_AMD64_SNP_ALT_INJ | \
+ MSR_AMD64_SNP_DEBUG_SWAP | \
+ MSR_AMD64_SNP_VMPL_SSS | \
+ MSR_AMD64_SNP_SECURE_TSC | \
+ MSR_AMD64_SNP_VMGEXIT_PARAM | \
+ MSR_AMD64_SNP_VMSA_REG_PROT | \
+ MSR_AMD64_SNP_RESERVED_BIT13 | \
+ MSR_AMD64_SNP_RESERVED_BIT15 | \
+ MSR_AMD64_SNP_SECURE_AVIC | \
+ MSR_AMD64_SNP_RESERVED_MASK)
+
+#ifdef CONFIG_AMD_SECURE_AVIC
+#define SNP_FEATURE_SECURE_AVIC MSR_AMD64_SNP_SECURE_AVIC
+#else
+#define SNP_FEATURE_SECURE_AVIC 0
+#endif
+
+/*
+ * SNP_FEATURES_PRESENT is the mask of SNP features that are implemented
+ * by the guest kernel. As and when a new feature is implemented in the
+ * guest kernel, a corresponding bit should be added to the mask.
+ */
+#define SNP_FEATURES_PRESENT (MSR_AMD64_SNP_DEBUG_SWAP | \
+ MSR_AMD64_SNP_SECURE_TSC | \
+ SNP_FEATURE_SECURE_AVIC)
+
+u64 snp_get_unsupported_features(u64 status)
+{
+ if (!(status & MSR_AMD64_SEV_SNP_ENABLED))
+ return 0;
+
+ return status & SNP_FEATURES_IMPL_REQ & ~SNP_FEATURES_PRESENT;
+}
+
+void snp_check_features(void)
+{
+ u64 unsupported;
+
+ /*
+ * Terminate the boot if hypervisor has enabled any feature lacking
+ * guest side implementation. Pass on the unsupported features mask through
+ * EXIT_INFO_2 of the GHCB protocol so that those features can be reported
+ * as part of the guest boot failure.
+ */
+ unsupported = snp_get_unsupported_features(sev_status);
+ if (unsupported) {
+ if (ghcb_version < 2 || (!boot_ghcb && !early_setup_ghcb()))
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
+
+ sev_es_ghcb_terminate(boot_ghcb, SEV_TERM_SET_GEN,
+ GHCB_SNP_UNSUPPORTED, unsupported);
+ }
+}
+
+/* Search for Confidential Computing blob in the EFI config table. */
+static struct cc_blob_sev_info *find_cc_blob_efi(struct boot_params *bp)
+{
+ unsigned long cfg_table_pa;
+ unsigned int cfg_table_len;
+ int ret;
+
+ ret = efi_get_conf_table(bp, &cfg_table_pa, &cfg_table_len);
+ if (ret)
+ return NULL;
+
+ return (struct cc_blob_sev_info *)efi_find_vendor_table(bp, cfg_table_pa,
+ cfg_table_len,
+ EFI_CC_BLOB_GUID);
+}
+
+/*
+ * Initial set up of SNP relies on information provided by the
+ * Confidential Computing blob, which can be passed to the boot kernel
+ * by firmware/bootloader in the following ways:
+ *
+ * - via an entry in the EFI config table
+ * - via a setup_data structure, as defined by the Linux Boot Protocol
+ *
+ * Scan for the blob in that order.
+ */
+static struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp)
+{
+ struct cc_blob_sev_info *cc_info;
+
+ cc_info = find_cc_blob_efi(bp);
+ if (cc_info)
+ goto found_cc_info;
+
+ cc_info = find_cc_blob_setup_data(bp);
+ if (!cc_info)
+ return NULL;
+
+found_cc_info:
+ if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC)
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
+
+ return cc_info;
+}
+
+/*
+ * Indicate SNP based on presence of SNP-specific CC blob. Subsequent checks
+ * will verify the SNP CPUID/MSR bits.
+ */
+static bool early_snp_init(struct boot_params *bp)
+{
+ struct cc_blob_sev_info *cc_info;
+
+ if (!bp)
+ return false;
+
+ cc_info = find_cc_blob(bp);
+ if (!cc_info)
+ return false;
+
+ /*
+ * If a SNP-specific Confidential Computing blob is present, then
+ * firmware/bootloader have indicated SNP support. Verifying this
+ * involves CPUID checks which will be more reliable if the SNP
+ * CPUID table is used. See comments over snp_setup_cpuid_table() for
+ * more details.
+ */
+ setup_cpuid_table(cc_info);
+
+ /*
+ * Record the SVSM Calling Area (CA) address if the guest is not
+ * running at VMPL0. The CA will be used to communicate with the
+ * SVSM and request its services.
+ */
+ svsm_setup_ca(cc_info, rip_rel_ptr(&boot_ghcb_page));
+
+ /*
+ * Pass run-time kernel a pointer to CC info via boot_params so EFI
+ * config table doesn't need to be searched again during early startup
+ * phase.
+ */
+ bp->cc_blob_address = (u32)(unsigned long)cc_info;
+
+ return true;
+}
+
+/*
+ * sev_check_cpu_support - Check for SEV support in the CPU capabilities
+ *
+ * Returns < 0 if SEV is not supported, otherwise the position of the
+ * encryption bit in the page table descriptors.
+ */
+static int sev_check_cpu_support(void)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ /* Check for the SME/SEV support leaf */
+ eax = 0x80000000;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ if (eax < 0x8000001f)
+ return -ENODEV;
+
+ /*
+ * Check for the SME/SEV feature:
+ * CPUID Fn8000_001F[EAX]
+ * - Bit 0 - Secure Memory Encryption support
+ * - Bit 1 - Secure Encrypted Virtualization support
+ * CPUID Fn8000_001F[EBX]
+ * - Bits 5:0 - Pagetable bit position used to indicate encryption
+ */
+ eax = 0x8000001f;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ /* Check whether SEV is supported */
+ if (!(eax & BIT(1)))
+ return -ENODEV;
+
+ sev_snp_needs_sfw = !(ebx & BIT(31));
+
+ return ebx & 0x3f;
+}
+
+void sev_enable(struct boot_params *bp)
+{
+ struct msr m;
+ int bitpos;
+ bool snp;
+
+ /*
+ * bp->cc_blob_address should only be set by boot/compressed kernel.
+ * Initialize it to 0 to ensure that uninitialized values from
+ * buggy bootloaders aren't propagated.
+ */
+ if (bp)
+ bp->cc_blob_address = 0;
+
+ /*
+ * Do an initial SEV capability check before early_snp_init() which
+ * loads the CPUID page and the same checks afterwards are done
+ * without the hypervisor and are trustworthy.
+ *
+ * If the HV fakes SEV support, the guest will crash'n'burn
+ * which is good enough.
+ */
+
+ if (sev_check_cpu_support() < 0)
+ return;
+
+ /*
+ * Setup/preliminary detection of SNP. This will be sanity-checked
+ * against CPUID/MSR values later.
+ */
+ snp = early_snp_init(bp);
+
+ /* Now repeat the checks with the SNP CPUID table. */
+
+ bitpos = sev_check_cpu_support();
+ if (bitpos < 0) {
+ if (snp)
+ error("SEV-SNP support indicated by CC blob, but not CPUID.");
+ return;
+ }
+
+ /* Set the SME mask if this is an SEV guest. */
+ raw_rdmsr(MSR_AMD64_SEV, &m);
+ sev_status = m.q;
+ if (!(sev_status & MSR_AMD64_SEV_ENABLED))
+ return;
+
+ /* Negotiate the GHCB protocol version. */
+ if (sev_status & MSR_AMD64_SEV_ES_ENABLED) {
+ if (!sev_es_negotiate_protocol())
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_PROT_UNSUPPORTED);
+ }
+
+ /*
+ * SNP is supported in v2 of the GHCB spec which mandates support for HV
+ * features.
+ */
+ if (sev_status & MSR_AMD64_SEV_SNP_ENABLED) {
+ u64 hv_features;
+
+ hv_features = get_hv_features();
+ if (!(hv_features & GHCB_HV_FT_SNP))
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
+
+ /*
+ * Running at VMPL0 is required unless an SVSM is present and
+ * the hypervisor supports the required SVSM GHCB events.
+ */
+ if (snp_vmpl && !(hv_features & GHCB_HV_FT_SNP_MULTI_VMPL))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_NOT_VMPL0);
+ }
+
+ if (snp && !(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
+ error("SEV-SNP supported indicated by CC blob, but not SEV status MSR.");
+
+ sme_me_mask = BIT_ULL(bitpos);
+}
+
+/*
+ * sev_get_status - Retrieve the SEV status mask
+ *
+ * Returns 0 if the CPU is not SEV capable, otherwise the value of the
+ * AMD64_SEV MSR.
+ */
+u64 sev_get_status(void)
+{
+ struct msr m;
+
+ if (sev_check_cpu_support() < 0)
+ return 0;
+
+ raw_rdmsr(MSR_AMD64_SEV, &m);
+ return m.q;
+}
+
+void sev_prep_identity_maps(unsigned long top_level_pgt)
+{
+ /*
+ * The Confidential Computing blob is used very early in uncompressed
+ * kernel to find the in-memory CPUID table to handle CPUID
+ * instructions. Make sure an identity-mapping exists so it can be
+ * accessed after switchover.
+ */
+ if (sev_snp_enabled()) {
+ unsigned long cc_info_pa = boot_params_ptr->cc_blob_address;
+ struct cc_blob_sev_info *cc_info;
+
+ kernel_add_identity_map(cc_info_pa, cc_info_pa + sizeof(*cc_info));
+
+ cc_info = (struct cc_blob_sev_info *)cc_info_pa;
+ kernel_add_identity_map(cc_info->cpuid_phys, cc_info->cpuid_phys + cc_info->cpuid_len);
+ }
+
+ sev_verify_cbit(top_level_pgt);
+}
+
+bool early_is_sevsnp_guest(void)
+{
+ static bool sevsnp;
+
+ if (sevsnp)
+ return true;
+
+ if (!(sev_get_status() & MSR_AMD64_SEV_SNP_ENABLED))
+ return false;
+
+ sevsnp = true;
+
+ if (!snp_vmpl) {
+ unsigned int eax, ebx, ecx, edx;
+
+ /*
+ * CPUID Fn8000_001F_EAX[28] - SVSM support
+ */
+ eax = 0x8000001f;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ if (eax & BIT(28)) {
+ struct msr m;
+
+ /* Obtain the address of the calling area to use */
+ raw_rdmsr(MSR_SVSM_CAA, &m);
+ boot_svsm_caa_pa = m.q;
+
+ /*
+ * The real VMPL level cannot be discovered, but the
+ * memory acceptance routines make no use of that so
+ * any non-zero value suffices here.
+ */
+ snp_vmpl = U8_MAX;
+ }
+ }
+ return true;
+}
diff --git a/arch/x86/boot/compressed/sev.h b/arch/x86/boot/compressed/sev.h
new file mode 100644
index 000000000000..22637b416b46
--- /dev/null
+++ b/arch/x86/boot/compressed/sev.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * AMD SEV header for early boot related functions.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ */
+
+#ifndef BOOT_COMPRESSED_SEV_H
+#define BOOT_COMPRESSED_SEV_H
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+
+#include <asm/shared/msr.h>
+
+void snp_accept_memory(phys_addr_t start, phys_addr_t end);
+u64 sev_get_status(void);
+bool early_is_sevsnp_guest(void);
+
+static inline u64 sev_es_rd_ghcb_msr(void)
+{
+ struct msr m;
+
+ raw_rdmsr(MSR_AMD64_SEV_ES_GHCB, &m);
+
+ return m.q;
+}
+
+static inline void sev_es_wr_ghcb_msr(u64 val)
+{
+ struct msr m;
+
+ m.q = val;
+ raw_wrmsr(MSR_AMD64_SEV_ES_GHCB, &m);
+}
+
+#else
+
+static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { }
+static inline u64 sev_get_status(void) { return 0; }
+static inline bool early_is_sevsnp_guest(void) { return false; }
+
+#endif
+
+#endif
diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c
index ffb9c5c9d748..9af19d9614cb 100644
--- a/arch/x86/boot/compressed/string.c
+++ b/arch/x86/boot/compressed/string.c
@@ -1,11 +1,81 @@
-#include "misc.h"
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This provides an optimized implementation of memcpy, and a simplified
+ * implementation of memset and memmove. These are used here because the
+ * standard kernel runtime versions are not yet available and we don't
+ * trust the gcc built-in implementations as they may do unexpected things
+ * (e.g. FPU ops) in the minimal decompression stub execution environment.
+ */
+#include "error.h"
-int memcmp(const void *s1, const void *s2, size_t len)
+#include "../string.c"
+
+#ifdef CONFIG_X86_32
+static void *____memcpy(void *dest, const void *src, size_t n)
{
- u8 diff;
- asm("repe; cmpsb; setnz %0"
- : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len));
- return diff;
+ int d0, d1, d2;
+ asm volatile(
+ "rep movsl\n\t"
+ "movl %4,%%ecx\n\t"
+ "rep movsb"
+ : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+ : "0" (n >> 2), "g" (n & 3), "1" (dest), "2" (src)
+ : "memory");
+
+ return dest;
}
+#else
+static void *____memcpy(void *dest, const void *src, size_t n)
+{
+ long d0, d1, d2;
+ asm volatile(
+ "rep movsq\n\t"
+ "movq %4,%%rcx\n\t"
+ "rep movsb"
+ : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+ : "0" (n >> 3), "g" (n & 7), "1" (dest), "2" (src)
+ : "memory");
-#include "../string.c"
+ return dest;
+}
+#endif
+
+void *memset(void *s, int c, size_t n)
+{
+ int i;
+ char *ss = s;
+
+ for (i = 0; i < n; i++)
+ ss[i] = c;
+ return s;
+}
+
+void *memmove(void *dest, const void *src, size_t n)
+{
+ unsigned char *d = dest;
+ const unsigned char *s = src;
+
+ if (d <= s || d - s >= n)
+ return ____memcpy(dest, src, n);
+
+ while (n-- > 0)
+ d[n] = s[n];
+
+ return dest;
+}
+
+/* Detect and warn about potential overlaps, but handle them with memmove. */
+void *memcpy(void *dest, const void *src, size_t n)
+{
+ if (dest > src && dest - src < n) {
+ warn("Avoiding potentially unsafe overlapping memcpy()!");
+ return memmove(dest, src, n);
+ }
+ return ____memcpy(dest, src, n);
+}
+
+#ifdef CONFIG_KASAN
+extern void *__memset(void *s, int c, size_t n) __alias(memset);
+extern void *__memmove(void *dest, const void *src, size_t n) __alias(memmove);
+extern void *__memcpy(void *dest, const void *src, size_t n) __alias(memcpy);
+#endif
diff --git a/arch/x86/boot/compressed/tdcall.S b/arch/x86/boot/compressed/tdcall.S
new file mode 100644
index 000000000000..46d0495e0d3a
--- /dev/null
+++ b/arch/x86/boot/compressed/tdcall.S
@@ -0,0 +1,3 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include "../../coco/tdx/tdcall.S"
diff --git a/arch/x86/boot/compressed/tdx-shared.c b/arch/x86/boot/compressed/tdx-shared.c
new file mode 100644
index 000000000000..5ac43762fe13
--- /dev/null
+++ b/arch/x86/boot/compressed/tdx-shared.c
@@ -0,0 +1,2 @@
+#include "error.h"
+#include "../../coco/tdx/tdx-shared.c"
diff --git a/arch/x86/boot/compressed/tdx.c b/arch/x86/boot/compressed/tdx.c
new file mode 100644
index 000000000000..8451d6a1030c
--- /dev/null
+++ b/arch/x86/boot/compressed/tdx.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "../cpuflags.h"
+#include "../string.h"
+#include "../io.h"
+#include "error.h"
+
+#include <vdso/limits.h>
+#include <uapi/asm/vmx.h>
+
+#include <asm/shared/tdx.h>
+
+/* Called from __tdx_hypercall() for unrecoverable failure */
+void __tdx_hypercall_failed(void)
+{
+ error("TDVMCALL failed. TDX module bug?");
+}
+
+static inline unsigned int tdx_io_in(int size, u16 port)
+{
+ struct tdx_module_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION),
+ .r12 = size,
+ .r13 = 0,
+ .r14 = port,
+ };
+
+ if (__tdx_hypercall(&args))
+ return UINT_MAX;
+
+ return args.r11;
+}
+
+static inline void tdx_io_out(int size, u16 port, u32 value)
+{
+ struct tdx_module_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION),
+ .r12 = size,
+ .r13 = 1,
+ .r14 = port,
+ .r15 = value,
+ };
+
+ __tdx_hypercall(&args);
+}
+
+static inline u8 tdx_inb(u16 port)
+{
+ return tdx_io_in(1, port);
+}
+
+static inline void tdx_outb(u8 value, u16 port)
+{
+ tdx_io_out(1, port, value);
+}
+
+static inline void tdx_outw(u16 value, u16 port)
+{
+ tdx_io_out(2, port, value);
+}
+
+void early_tdx_detect(void)
+{
+ u32 eax, sig[3];
+
+ cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, &sig[0], &sig[2], &sig[1]);
+
+ if (memcmp(TDX_IDENT, sig, sizeof(sig)))
+ return;
+
+ /* Use hypercalls instead of I/O instructions */
+ pio_ops.f_inb = tdx_inb;
+ pio_ops.f_outb = tdx_outb;
+ pio_ops.f_outw = tdx_outw;
+}
diff --git a/arch/x86/boot/compressed/tdx.h b/arch/x86/boot/compressed/tdx.h
new file mode 100644
index 000000000000..9055482cd35c
--- /dev/null
+++ b/arch/x86/boot/compressed/tdx.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef BOOT_COMPRESSED_TDX_H
+#define BOOT_COMPRESSED_TDX_H
+
+#include <linux/types.h>
+
+#ifdef CONFIG_INTEL_TDX_GUEST
+void early_tdx_detect(void);
+#else
+static inline void early_tdx_detect(void) { };
+#endif
+
+#endif /* BOOT_COMPRESSED_TDX_H */
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index 34d047c98284..587ce3e7c504 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -1,6 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#include <asm-generic/vmlinux.lds.h>
-OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
+OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT)
#undef i386
@@ -33,6 +34,7 @@ SECTIONS
_text = .; /* Text */
*(.text)
*(.text.*)
+ *(.noinstr.text)
_etext = . ;
}
.rodata : {
@@ -41,16 +43,21 @@ SECTIONS
*(.rodata.*)
_erodata = . ;
}
- .got : {
- _got = .;
- KEEP(*(.got.plt))
- KEEP(*(.got))
- _egot = .;
+#ifdef CONFIG_EFI_SBAT
+ .sbat : ALIGN(0x1000) {
+ _sbat = . ;
+ *(.sbat)
+ _esbat = ALIGN(0x1000);
+ . = _esbat;
}
- .data : {
+#endif
+ .data : ALIGN(0x1000) {
_data = . ;
*(.data)
*(.data.*)
+
+ /* Add 4 bytes of extra space for the obsolete CRC-32 checksum */
+ . = ALIGN(. + 4, 0x200);
_edata = . ;
}
. = ALIGN(L1_CACHE_BYTES);
@@ -70,5 +77,52 @@ SECTIONS
_epgtable = . ;
}
#endif
+ . = ALIGN(PAGE_SIZE); /* keep ZO size page aligned */
_end = .;
+
+ STABS_DEBUG
+ DWARF_DEBUG
+ ELF_DETAILS
+
+ DISCARDS
+ /DISCARD/ : {
+ *(.dynamic) *(.dynsym) *(.dynstr) *(.dynbss)
+ *(.hash) *(.gnu.hash)
+ *(.note.*)
+ }
+
+ .got.plt (INFO) : {
+ *(.got.plt)
+ }
+ ASSERT(SIZEOF(.got.plt) == 0 ||
+#ifdef CONFIG_X86_64
+ SIZEOF(.got.plt) == 0x18,
+#else
+ SIZEOF(.got.plt) == 0xc,
+#endif
+ "Unexpected GOT/PLT entries detected!")
+
+ /*
+ * Sections that should stay zero sized, which is safer to
+ * explicitly check instead of blindly discarding.
+ */
+ .got : {
+ *(.got)
+ }
+ ASSERT(SIZEOF(.got) == 0, "Unexpected GOT entries detected!")
+
+ .plt : {
+ *(.plt) *(.plt.*)
+ }
+ ASSERT(SIZEOF(.plt) == 0, "Unexpected run-time procedure linkages detected!")
+
+ .rel.dyn : {
+ *(.rel.*) *(.rel_*)
+ }
+ ASSERT(SIZEOF(.rel.dyn) == 0, "Unexpected run-time relocations (.rel) detected!")
+
+ .rela.dyn : {
+ *(.rela.*) *(.rela_*)
+ }
+ ASSERT(SIZEOF(.rela.dyn) == 0, "Unexpected run-time relocations (.rela) detected!")
}