diff options
Diffstat (limited to 'arch/x86/kernel/setup.c')
| -rw-r--r-- | arch/x86/kernel/setup.c | 1237 |
1 files changed, 654 insertions, 583 deletions
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index e68709da8251..1b2edd07a3e1 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1,122 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 1995 Linus Torvalds * - * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 - * - * Memory region support - * David Parsons <orc@pell.chi.il.us>, July-August 1999 - * - * Added E820 sanitization routine (removes overlapping memory regions); - * Brian Moyle <bmoyle@mvista.com>, February 2001 - * - * Moved CPU detection code to cpu/${cpu}.c - * Patrick Mochel <mochel@osdl.org>, March 2002 - * - * Provisions for empty E820 memory regions (reported by certain BIOSes). - * Alex Achenbach <xela@slit.de>, December 2002. - * + * This file contains the setup_arch() code, which handles the architecture-dependent + * parts of early kernel initialization. */ - -/* - * This file handles the architecture-dependent parts of initialization - */ - -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/mmzone.h> -#include <linux/screen_info.h> -#include <linux/ioport.h> #include <linux/acpi.h> -#include <linux/sfi.h> -#include <linux/apm_bios.h> -#include <linux/initrd.h> -#include <linux/bootmem.h> -#include <linux/memblock.h> -#include <linux/seq_file.h> #include <linux/console.h> -#include <linux/root_dev.h> -#include <linux/highmem.h> -#include <linux/module.h> +#include <linux/cpu.h> +#include <linux/crash_dump.h> +#include <linux/dma-map-ops.h> #include <linux/efi.h> -#include <linux/init.h> -#include <linux/edd.h> +#include <linux/hugetlb.h> +#include <linux/ima.h> +#include <linux/init_ohci1394_dma.h> +#include <linux/initrd.h> #include <linux/iscsi_ibft.h> -#include <linux/nodemask.h> -#include <linux/kexec.h> -#include <linux/dmi.h> -#include <linux/pfn.h> +#include <linux/memblock.h> +#include <linux/panic_notifier.h> #include <linux/pci.h> -#include <asm/pci-direct.h> -#include <linux/init_ohci1394_dma.h> -#include <linux/kvm_para.h> -#include <linux/dma-contiguous.h> - -#include <linux/errno.h> -#include <linux/kernel.h> -#include <linux/stddef.h> -#include <linux/unistd.h> -#include <linux/ptrace.h> -#include <linux/user.h> -#include <linux/delay.h> - -#include <linux/kallsyms.h> -#include <linux/cpufreq.h> -#include <linux/dma-mapping.h> -#include <linux/ctype.h> -#include <linux/uaccess.h> - -#include <linux/percpu.h> -#include <linux/crash_dump.h> +#include <linux/random.h> +#include <linux/root_dev.h> +#include <linux/static_call.h> +#include <linux/swiotlb.h> #include <linux/tboot.h> -#include <linux/jiffies.h> +#include <linux/usb/xhci-dbgp.h> +#include <linux/vmalloc.h> -#include <video/edid.h> +#include <uapi/linux/mount.h> + +#include <xen/xen.h> -#include <asm/mtrr.h> #include <asm/apic.h> -#include <asm/realmode.h> -#include <asm/e820.h> -#include <asm/mpspec.h> -#include <asm/setup.h> -#include <asm/efi.h> -#include <asm/timer.h> -#include <asm/i8259.h> -#include <asm/sections.h> -#include <asm/io_apic.h> -#include <asm/ist.h> -#include <asm/setup_arch.h> #include <asm/bios_ebda.h> -#include <asm/cacheflush.h> -#include <asm/processor.h> #include <asm/bugs.h> - -#include <asm/vsyscall.h> +#include <asm/cacheinfo.h> +#include <asm/coco.h> #include <asm/cpu.h> -#include <asm/desc.h> -#include <asm/dma.h> -#include <asm/iommu.h> +#include <asm/efi.h> #include <asm/gart.h> -#include <asm/mmu_context.h> -#include <asm/proto.h> - -#include <asm/paravirt.h> #include <asm/hypervisor.h> -#include <asm/olpc_ofw.h> - -#include <asm/percpu.h> -#include <asm/topology.h> -#include <asm/apicdef.h> -#include <asm/amd_nb.h> +#include <asm/io_apic.h> +#include <asm/kasan.h> +#include <asm/kaslr.h> #include <asm/mce.h> -#include <asm/alternative.h> +#include <asm/memtype.h> +#include <asm/mtrr.h> +#include <asm/nmi.h> +#include <asm/numa.h> +#include <asm/olpc_ofw.h> +#include <asm/pci-direct.h> #include <asm/prom.h> +#include <asm/proto.h> +#include <asm/realmode.h> +#include <asm/thermal.h> +#include <asm/unwind.h> +#include <asm/vsyscall.h> /* - * max_low_pfn_mapped: highest direct mapped pfn under 4GB - * max_pfn_mapped: highest direct mapped pfn over 4GB + * max_low_pfn_mapped: highest directly mapped pfn < 4 GB + * max_pfn_mapped: highest directly mapped pfn > 4 GB * - * The direct mapping only covers E820_RAM regions, so the ranges and gaps are - * represented by pfn_mapped + * The direct mapping only covers E820_TYPE_RAM regions, so the ranges and gaps are + * represented by pfn_mapped[]. */ unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; @@ -126,65 +72,49 @@ RESERVE_BRK(dmi_alloc, 65536); #endif -static __initdata unsigned long _brk_start = (unsigned long)__brk_base; -unsigned long _brk_end = (unsigned long)__brk_base; - -#ifdef CONFIG_X86_64 -int default_cpu_present_to_apicid(int mps_cpu) -{ - return __default_cpu_present_to_apicid(mps_cpu); -} - -int default_check_phys_apicid_present(int phys_apicid) -{ - return __default_check_phys_apicid_present(phys_apicid); -} -#endif +unsigned long _brk_start = (unsigned long)__brk_base; +unsigned long _brk_end = (unsigned long)__brk_base; struct boot_params boot_params; /* - * Machine setup.. + * These are the four main kernel memory regions, we put them into + * the resource tree so that kdump tools and other debugging tools + * recover it: */ + +static struct resource rodata_resource = { + .name = "Kernel rodata", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM +}; + static struct resource data_resource = { .name = "Kernel data", .start = 0, .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM }; static struct resource code_resource = { .name = "Kernel code", .start = 0, .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM }; static struct resource bss_resource = { .name = "Kernel bss", .start = 0, .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM }; #ifdef CONFIG_X86_32 -/* cpu data as detected by the assembly code in head.S */ -struct cpuinfo_x86 new_cpu_data __cpuinitdata = { - .wp_works_ok = -1, -}; -/* common cpu data for all cpus */ -struct cpuinfo_x86 boot_cpu_data __read_mostly = { - .wp_works_ok = -1, -}; -EXPORT_SYMBOL(boot_cpu_data); - -unsigned int def_to_bigsmp; - -/* for MCA, but anyone else can use it if they want */ -unsigned int machine_id; -unsigned int machine_submodel_id; -unsigned int BIOS_revision; +/* CPU data as detected by the assembly code in head_32.S */ +struct cpuinfo_x86 new_cpu_data; struct apm_info apm_info; EXPORT_SYMBOL(apm_info); @@ -197,30 +127,96 @@ EXPORT_SYMBOL(ist_info); struct ist_info ist_info; #endif -#else -struct cpuinfo_x86 boot_cpu_data __read_mostly = { - .x86_phys_bits = MAX_PHYSMEM_BITS, -}; -EXPORT_SYMBOL(boot_cpu_data); #endif +struct cpuinfo_x86 boot_cpu_data __read_mostly; +EXPORT_SYMBOL(boot_cpu_data); +SYM_PIC_ALIAS(boot_cpu_data); #if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) -unsigned long mmu_cr4_features; +__visible unsigned long mmu_cr4_features __ro_after_init; #else -unsigned long mmu_cr4_features = X86_CR4_PAE; +__visible unsigned long mmu_cr4_features __ro_after_init = X86_CR4_PAE; +#endif + +#ifdef CONFIG_IMA +static phys_addr_t ima_kexec_buffer_phys; +static size_t ima_kexec_buffer_size; #endif /* Boot loader ID and version as integers, for the benefit of proc_dointvec */ int bootloader_type, bootloader_version; +static const struct ctl_table x86_sysctl_table[] = { + { + .procname = "unknown_nmi_panic", + .data = &unknown_nmi_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "panic_on_unrecovered_nmi", + .data = &panic_on_unrecovered_nmi, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "panic_on_io_nmi", + .data = &panic_on_io_nmi, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "bootloader_type", + .data = &bootloader_type, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + { + .procname = "bootloader_version", + .data = &bootloader_version, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + { + .procname = "io_delay_type", + .data = &io_delay_type, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#if defined(CONFIG_ACPI_SLEEP) + { + .procname = "acpi_video_flags", + .data = &acpi_realmode_flags, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, +#endif +}; + +static int __init init_x86_sysctl(void) +{ + register_sysctl_init("kernel", x86_sysctl_table); + return 0; +} +arch_initcall(init_x86_sysctl); + /* * Setup options */ struct screen_info screen_info; EXPORT_SYMBOL(screen_info); +#if defined(CONFIG_FIRMWARE_EDID) struct edid_info edid_info; EXPORT_SYMBOL_GPL(edid_info); +#endif extern int root_mountflags; @@ -232,7 +228,8 @@ unsigned long saved_video_mode; static char __initdata command_line[COMMAND_LINE_SIZE]; #ifdef CONFIG_CMDLINE_BOOL -static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE; +char builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE; +bool builtin_cmdline_added __ro_after_init; #endif #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) @@ -287,8 +284,8 @@ static void __init cleanup_highmap(void) static void __init reserve_brk(void) { if (_brk_end > _brk_start) - memblock_reserve(__pa_symbol(_brk_start), - _brk_end - _brk_start); + memblock_reserve_kern(__pa_symbol(_brk_start), + _brk_end - _brk_start); /* Mark brk area as locked down and no longer taking any new allocations */ @@ -303,6 +300,9 @@ static u64 __init get_ramdisk_image(void) ramdisk_image |= (u64)boot_params.ext_ramdisk_image << 32; + if (ramdisk_image == 0) + ramdisk_image = phys_initrd_start; + return ramdisk_image; } static u64 __init get_ramdisk_size(void) @@ -311,59 +311,40 @@ static u64 __init get_ramdisk_size(void) ramdisk_size |= (u64)boot_params.ext_ramdisk_size << 32; + if (ramdisk_size == 0) + ramdisk_size = phys_initrd_size; + return ramdisk_size; } -#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) static void __init relocate_initrd(void) { /* Assume only end is not page aligned */ u64 ramdisk_image = get_ramdisk_image(); u64 ramdisk_size = get_ramdisk_size(); u64 area_size = PAGE_ALIGN(ramdisk_size); - u64 ramdisk_here; - unsigned long slop, clen, mapaddr; - char *p, *q; + int ret = 0; /* We need to move the initrd down into directly mapped mem */ - ramdisk_here = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), - area_size, PAGE_SIZE); - - if (!ramdisk_here) + u64 relocated_ramdisk = memblock_phys_alloc_range(area_size, PAGE_SIZE, 0, + PFN_PHYS(max_pfn_mapped)); + if (!relocated_ramdisk) panic("Cannot find place for new RAMDISK of size %lld\n", - ramdisk_size); + ramdisk_size); - /* Note: this includes all the mem currently occupied by - the initrd, we rely on that fact to keep the data intact. */ - memblock_reserve(ramdisk_here, area_size); - initrd_start = ramdisk_here + PAGE_OFFSET; + initrd_start = relocated_ramdisk + PAGE_OFFSET; initrd_end = initrd_start + ramdisk_size; printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n", - ramdisk_here, ramdisk_here + ramdisk_size - 1); - - q = (char *)initrd_start; - - /* Copy the initrd */ - while (ramdisk_size) { - slop = ramdisk_image & ~PAGE_MASK; - clen = ramdisk_size; - if (clen > MAX_MAP_CHUNK-slop) - clen = MAX_MAP_CHUNK-slop; - mapaddr = ramdisk_image & PAGE_MASK; - p = early_memremap(mapaddr, clen+slop); - memcpy(q, p+slop, clen); - early_iounmap(p, clen+slop); - q += clen; - ramdisk_image += clen; - ramdisk_size -= clen; - } + relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1); + + ret = copy_from_early_mem((void *)initrd_start, ramdisk_image, ramdisk_size); + if (ret) + panic("Copy RAMDISK failed\n"); - ramdisk_image = get_ramdisk_image(); - ramdisk_size = get_ramdisk_size(); printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to" " [mem %#010llx-%#010llx]\n", ramdisk_image, ramdisk_image + ramdisk_size - 1, - ramdisk_here, ramdisk_here + ramdisk_size - 1); + relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1); } static void __init early_reserve_initrd(void) @@ -377,15 +358,15 @@ static void __init early_reserve_initrd(void) !ramdisk_image || !ramdisk_size) return; /* No initrd provided by bootloader */ - memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image); + memblock_reserve_kern(ramdisk_image, ramdisk_end - ramdisk_image); } + static void __init reserve_initrd(void) { /* Assume only end is not page aligned */ u64 ramdisk_image = get_ramdisk_image(); u64 ramdisk_size = get_ramdisk_size(); u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); - u64 mapped_size; if (!boot_params.hdr.type_of_loader || !ramdisk_image || !ramdisk_size) @@ -393,12 +374,6 @@ static void __init reserve_initrd(void) initrd_start = 0; - mapped_size = memblock_mem_size(max_pfn_mapped); - if (ramdisk_size >= (mapped_size>>1)) - panic("initrd too large to handle, " - "disabling initrd (%lld needed, %lld available)\n", - ramdisk_size, mapped_size>>1); - printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image, ramdisk_end - 1); @@ -412,8 +387,9 @@ static void __init reserve_initrd(void) relocate_initrd(); - memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); + memblock_phys_free(ramdisk_image, ramdisk_end - ramdisk_image); } + #else static void __init early_reserve_initrd(void) { @@ -423,215 +399,231 @@ static void __init reserve_initrd(void) } #endif /* CONFIG_BLK_DEV_INITRD */ +static void __init add_early_ima_buffer(u64 phys_addr) +{ +#ifdef CONFIG_IMA + struct ima_setup_data *data; + + data = early_memremap(phys_addr + sizeof(struct setup_data), sizeof(*data)); + if (!data) { + pr_warn("setup: failed to memremap ima_setup_data entry\n"); + return; + } + + if (data->size) { + memblock_reserve_kern(data->addr, data->size); + ima_kexec_buffer_phys = data->addr; + ima_kexec_buffer_size = data->size; + } + + early_memunmap(data, sizeof(*data)); +#else + pr_warn("Passed IMA kexec data, but CONFIG_IMA not set. Ignoring.\n"); +#endif +} + +#if defined(CONFIG_HAVE_IMA_KEXEC) && !defined(CONFIG_OF_FLATTREE) +int __init ima_free_kexec_buffer(void) +{ + if (!ima_kexec_buffer_size) + return -ENOENT; + + memblock_free_late(ima_kexec_buffer_phys, + ima_kexec_buffer_size); + + ima_kexec_buffer_phys = 0; + ima_kexec_buffer_size = 0; + + return 0; +} + +int __init ima_get_kexec_buffer(void **addr, size_t *size) +{ + if (!ima_kexec_buffer_size) + return -ENOENT; + + *addr = __va(ima_kexec_buffer_phys); + *size = ima_kexec_buffer_size; + + return 0; +} +#endif + +static void __init add_kho(u64 phys_addr, u32 data_len) +{ + struct kho_data *kho; + u64 addr = phys_addr + sizeof(struct setup_data); + u64 size = data_len - sizeof(struct setup_data); + + if (!IS_ENABLED(CONFIG_KEXEC_HANDOVER)) { + pr_warn("Passed KHO data, but CONFIG_KEXEC_HANDOVER not set. Ignoring.\n"); + return; + } + + kho = early_memremap(addr, size); + if (!kho) { + pr_warn("setup: failed to memremap kho data (0x%llx, 0x%llx)\n", + addr, size); + return; + } + + kho_populate(kho->fdt_addr, kho->fdt_size, kho->scratch_addr, kho->scratch_size); + + early_memunmap(kho, size); +} + static void __init parse_setup_data(void) { struct setup_data *data; - u64 pa_data; + u64 pa_data, pa_next; pa_data = boot_params.hdr.setup_data; while (pa_data) { - u32 data_len, map_len; + u32 data_len, data_type; - map_len = max(PAGE_SIZE - (pa_data & ~PAGE_MASK), - (u64)sizeof(struct setup_data)); - data = early_memremap(pa_data, map_len); + data = early_memremap(pa_data, sizeof(*data)); data_len = data->len + sizeof(struct setup_data); - if (data_len > map_len) { - early_iounmap(data, map_len); - data = early_memremap(pa_data, data_len); - map_len = data_len; - } + data_type = data->type; + pa_next = data->next; + early_memunmap(data, sizeof(*data)); - switch (data->type) { + switch (data_type) { case SETUP_E820_EXT: - parse_e820_ext(data); + e820__memory_setup_extended(pa_data, data_len); break; case SETUP_DTB: add_dtb(pa_data); break; + case SETUP_EFI: + parse_efi_setup(pa_data, data_len); + break; + case SETUP_IMA: + add_early_ima_buffer(pa_data); + break; + case SETUP_KEXEC_KHO: + add_kho(pa_data, data_len); + break; + case SETUP_RNG_SEED: + data = early_memremap(pa_data, data_len); + add_bootloader_randomness(data->data, data->len); + /* Zero seed for forward secrecy. */ + memzero_explicit(data->data, data->len); + /* Zero length in case we find ourselves back here by accident. */ + memzero_explicit(&data->len, sizeof(data->len)); + early_memunmap(data, data_len); + break; default: break; } - pa_data = data->next; - early_iounmap(data, map_len); + pa_data = pa_next; } } -static void __init e820_reserve_setup_data(void) +/* + * Translate the fields of 'struct boot_param' into global variables + * representing these parameters. + */ +static void __init parse_boot_params(void) { - struct setup_data *data; - u64 pa_data; - int found = 0; + ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); + screen_info = boot_params.screen_info; +#if defined(CONFIG_FIRMWARE_EDID) + edid_info = boot_params.edid_info; +#endif +#ifdef CONFIG_X86_32 + apm_info.bios = boot_params.apm_bios_info; + ist_info = boot_params.ist_info; +#endif + saved_video_mode = boot_params.hdr.vid_mode; + bootloader_type = boot_params.hdr.type_of_loader; + if ((bootloader_type >> 4) == 0xe) { + bootloader_type &= 0xf; + bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4; + } + bootloader_version = bootloader_type & 0xf; + bootloader_version |= boot_params.hdr.ext_loader_ver << 4; - pa_data = boot_params.hdr.setup_data; - while (pa_data) { - data = early_memremap(pa_data, sizeof(*data)); - e820_update_range(pa_data, sizeof(*data)+data->len, - E820_RAM, E820_RESERVED_KERN); - found = 1; - pa_data = data->next; - early_iounmap(data, sizeof(*data)); +#ifdef CONFIG_BLK_DEV_RAM + rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; +#endif +#ifdef CONFIG_EFI + if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, + EFI32_LOADER_SIGNATURE, 4)) { + set_bit(EFI_BOOT, &efi.flags); + } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, + EFI64_LOADER_SIGNATURE, 4)) { + set_bit(EFI_BOOT, &efi.flags); + set_bit(EFI_64BIT, &efi.flags); } - if (!found) - return; +#endif - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); - memcpy(&e820_saved, &e820, sizeof(struct e820map)); - printk(KERN_INFO "extended physical RAM map:\n"); - e820_print_map("reserve setup_data"); + if (!boot_params.hdr.root_flags) + root_mountflags &= ~MS_RDONLY; } static void __init memblock_x86_reserve_range_setup_data(void) { + struct setup_indirect *indirect; struct setup_data *data; - u64 pa_data; + u64 pa_data, pa_next; + u32 len; pa_data = boot_params.hdr.setup_data; while (pa_data) { data = early_memremap(pa_data, sizeof(*data)); - memblock_reserve(pa_data, sizeof(*data) + data->len); - pa_data = data->next; - early_iounmap(data, sizeof(*data)); - } -} - -/* - * --------- Crashkernel reservation ------------------------------ - */ - -#ifdef CONFIG_KEXEC + if (!data) { + pr_warn("setup: failed to memremap setup_data entry\n"); + return; + } -/* - * Keep the crash kernel below this limit. On 32 bits earlier kernels - * would limit the kernel to the low 512 MiB due to mapping restrictions. - * On 64bit, old kexec-tools need to under 896MiB. - */ -#ifdef CONFIG_X86_32 -# define CRASH_KERNEL_ADDR_LOW_MAX (512 << 20) -# define CRASH_KERNEL_ADDR_HIGH_MAX (512 << 20) -#else -# define CRASH_KERNEL_ADDR_LOW_MAX (896UL<<20) -# define CRASH_KERNEL_ADDR_HIGH_MAX MAXMEM -#endif + len = sizeof(*data); + pa_next = data->next; -static void __init reserve_crashkernel_low(void) -{ -#ifdef CONFIG_X86_64 - const unsigned long long alignment = 16<<20; /* 16M */ - unsigned long long low_base = 0, low_size = 0; - unsigned long total_low_mem; - unsigned long long base; - bool auto_set = false; - int ret; + memblock_reserve_kern(pa_data, sizeof(*data) + data->len); - total_low_mem = memblock_mem_size(1UL<<(32-PAGE_SHIFT)); - /* crashkernel=Y,low */ - ret = parse_crashkernel_low(boot_command_line, total_low_mem, - &low_size, &base); - if (ret != 0) { - /* - * two parts from lib/swiotlb.c: - * swiotlb size: user specified with swiotlb= or default. - * swiotlb overflow buffer: now is hardcoded to 32k. - * We round it to 8M for other buffers that - * may need to stay low too. - */ - low_size = swiotlb_size_or_default() + (8UL<<20); - auto_set = true; - } else { - /* passed with crashkernel=0,low ? */ - if (!low_size) - return; - } + if (data->type == SETUP_INDIRECT) { + len += data->len; + early_memunmap(data, sizeof(*data)); + data = early_memremap(pa_data, len); + if (!data) { + pr_warn("setup: failed to memremap indirect setup_data\n"); + return; + } - low_base = memblock_find_in_range(low_size, (1ULL<<32), - low_size, alignment); + indirect = (struct setup_indirect *)data->data; - if (!low_base) { - if (!auto_set) - pr_info("crashkernel low reservation failed - No suitable area found.\n"); + if (indirect->type != SETUP_INDIRECT) + memblock_reserve_kern(indirect->addr, indirect->len); + } - return; + pa_data = pa_next; + early_memunmap(data, len); } - - memblock_reserve(low_base, low_size); - pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n", - (unsigned long)(low_size >> 20), - (unsigned long)(low_base >> 20), - (unsigned long)(total_low_mem >> 20)); - crashk_low_res.start = low_base; - crashk_low_res.end = low_base + low_size - 1; - insert_resource(&iomem_resource, &crashk_low_res); -#endif } -static void __init reserve_crashkernel(void) +static void __init arch_reserve_crashkernel(void) { - const unsigned long long alignment = 16<<20; /* 16M */ - unsigned long long total_mem; - unsigned long long crash_size, crash_base; + unsigned long long crash_base, crash_size, low_size = 0, cma_size = 0; bool high = false; int ret; - total_mem = memblock_phys_mem_size(); - - /* crashkernel=XM */ - ret = parse_crashkernel(boot_command_line, total_mem, - &crash_size, &crash_base); - if (ret != 0 || crash_size <= 0) { - /* crashkernel=X,high */ - ret = parse_crashkernel_high(boot_command_line, total_mem, - &crash_size, &crash_base); - if (ret != 0 || crash_size <= 0) - return; - high = true; - } - - /* 0 means: find the address automatically */ - if (crash_base <= 0) { - /* - * kexec want bzImage is below CRASH_KERNEL_ADDR_MAX - */ - crash_base = memblock_find_in_range(alignment, - high ? CRASH_KERNEL_ADDR_HIGH_MAX : - CRASH_KERNEL_ADDR_LOW_MAX, - crash_size, alignment); - - if (!crash_base) { - pr_info("crashkernel reservation failed - No suitable area found.\n"); - return; - } + if (!IS_ENABLED(CONFIG_CRASH_RESERVE)) + return; - } else { - unsigned long long start; + ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), + &crash_size, &crash_base, + &low_size, &cma_size, &high); + if (ret) + return; - start = memblock_find_in_range(crash_base, - crash_base + crash_size, crash_size, 1<<20); - if (start != crash_base) { - pr_info("crashkernel reservation failed - memory is in use.\n"); - return; - } + if (xen_pv_domain()) { + pr_info("Ignoring crashkernel for a Xen PV domain\n"); + return; } - memblock_reserve(crash_base, crash_size); - - printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " - "for crashkernel (System RAM: %ldMB)\n", - (unsigned long)(crash_size >> 20), - (unsigned long)(crash_base >> 20), - (unsigned long)(total_mem >> 20)); - crashk_res.start = crash_base; - crashk_res.end = crash_base + crash_size - 1; - insert_resource(&iomem_resource, &crashk_res); - - if (crash_base >= (1ULL<<32)) - reserve_crashkernel_low(); -} -#else -static void __init reserve_crashkernel(void) -{ + reserve_crashkernel_generic(crash_size, crash_base, low_size, high); + reserve_crashkernel_cma(cma_size); } -#endif static struct resource standard_io_resources[] = { { .name = "dma1", .start = 0x00, .end = 0x1f, @@ -666,14 +658,21 @@ void __init reserve_standard_io_resources(void) } -static __init void reserve_ibft_region(void) +static void __init setup_kernel_resources(void) { - unsigned long addr, size = 0; - - addr = find_ibft_region(&size); + code_resource.start = __pa_symbol(_text); + code_resource.end = __pa_symbol(_etext)-1; + rodata_resource.start = __pa_symbol(__start_rodata); + rodata_resource.end = __pa_symbol(__end_rodata)-1; + data_resource.start = __pa_symbol(_sdata); + data_resource.end = __pa_symbol(_edata)-1; + bss_resource.start = __pa_symbol(__bss_start); + bss_resource.end = __pa_symbol(__bss_stop)-1; - if (size) - memblock_reserve(addr, size); + insert_resource(&iomem_resource, &code_resource); + insert_resource(&iomem_resource, &rodata_resource); + insert_resource(&iomem_resource, &data_resource); + insert_resource(&iomem_resource, &bss_resource); } static bool __init snb_gfx_workaround_needed(void) @@ -729,11 +728,16 @@ static void __init trim_snb_memory(void) printk(KERN_DEBUG "reserving inaccessible SNB gfx pages\n"); /* - * Reserve all memory below the 1 MB mark that has not - * already been reserved. + * SandyBridge integrated graphics devices have a bug that prevents + * them from accessing certain memory ranges, namely anything below + * 1M and in the pages listed in bad_pages[] above. + * + * To avoid these pages being ever accessed by SNB gfx devices reserve + * bad_pages that have not already been reserved at boot time. + * All memory below the 1 MB mark is anyway reserved later during + * setup_arch(), so there is no need to reserve it here. */ - memblock_reserve(0, 1<<20); - + for (i = 0; i < ARRAY_SIZE(bad_pages); i++) { if (memblock_reserve(bad_pages[i], PAGE_SIZE)) printk(KERN_WARNING "failed to reserve 0x%08lx\n", @@ -741,18 +745,6 @@ static void __init trim_snb_memory(void) } } -/* - * Here we put platform-specific memory range workarounds, i.e. - * memory known to be corrupt or otherwise in need to be reserved on - * specific platforms. - * - * If this gets used more widely it could use a real dispatch mechanism. - */ -static void __init trim_platform_memory_ranges(void) -{ - trim_snb_memory(); -} - static void __init trim_bios_range(void) { /* @@ -764,16 +756,16 @@ static void __init trim_bios_range(void) * since some BIOSes are known to corrupt low memory. See the * Kconfig help text for X86_RESERVE_LOW. */ - e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED); + e820__range_update(0, PAGE_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); /* - * special case: Some BIOSen report the PC BIOS - * area (640->1Mb) as ram even though it is not. + * special case: Some BIOSes report the PC BIOS + * area (640Kb -> 1Mb) as RAM even though it is not. * take them out. */ - e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1); + e820__range_remove(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_TYPE_RAM, 1); - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + e820__update_table(e820_table); } /* called before trim_bios_range() to spare extra sanitize */ @@ -783,49 +775,95 @@ static void __init e820_add_kernel_range(void) u64 size = __pa_symbol(_end) - start; /* - * Complain if .text .data and .bss are not marked as E820_RAM and + * Complain if .text .data and .bss are not marked as E820_TYPE_RAM and * attempt to fix it by adding the range. We may have a confused BIOS, * or the user may have used memmap=exactmap or memmap=xxM$yyM to * exclude kernel range. If we really are running on top non-RAM, * we will crash later anyways. */ - if (e820_all_mapped(start, start + size, E820_RAM)) + if (e820__mapped_all(start, start + size, E820_TYPE_RAM)) return; - pr_warn(".text .data .bss are not marked as E820_RAM!\n"); - e820_remove_range(start, size, E820_RAM, 0); - e820_add_region(start, size, E820_RAM); + pr_warn(".text .data .bss are not marked as E820_TYPE_RAM!\n"); + e820__range_remove(start, size, E820_TYPE_RAM, 0); + e820__range_add(start, size, E820_TYPE_RAM); } -static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; - -static int __init parse_reservelow(char *p) +static void __init early_reserve_memory(void) { - unsigned long long size; + /* + * Reserve the memory occupied by the kernel between _text and + * __end_of_kernel_reserve symbols. Any kernel sections after the + * __end_of_kernel_reserve symbol must be explicitly reserved with a + * separate memblock_reserve() or they will be discarded. + */ + memblock_reserve_kern(__pa_symbol(_text), + (unsigned long)__end_of_kernel_reserve - (unsigned long)_text); - if (!p) - return -EINVAL; + /* + * The first 4Kb of memory is a BIOS owned area, but generally it is + * not listed as such in the E820 table. + * + * Reserve the first 64K of memory since some BIOSes are known to + * corrupt low memory. After the real mode trampoline is allocated the + * rest of the memory below 640k is reserved. + * + * In addition, make sure page 0 is always reserved because on + * systems with L1TF its contents can be leaked to user processes. + */ + memblock_reserve(0, SZ_64K); - size = memparse(p, &p); + early_reserve_initrd(); - if (size < 4096) - size = 4096; + memblock_x86_reserve_range_setup_data(); - if (size > 640*1024) - size = 640*1024; + reserve_bios_regions(); + trim_snb_memory(); +} - reserve_low = size; +/* + * Dump out kernel offset information on panic. + */ +static int +dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) +{ + if (kaslr_enabled()) { + pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n", + kaslr_offset(), + __START_KERNEL, + __START_KERNEL_map, + MODULES_VADDR-1); + } else { + pr_emerg("Kernel Offset: disabled\n"); + } return 0; } -early_param("reservelow", parse_reservelow); +void x86_configure_nx(void) +{ + if (boot_cpu_has(X86_FEATURE_NX)) + __supported_pte_mask |= _PAGE_NX; + else + __supported_pte_mask &= ~_PAGE_NX; +} -static void __init trim_low_memory_range(void) +static void __init x86_report_nx(void) { - memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE)); + if (!boot_cpu_has(X86_FEATURE_NX)) { + printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " + "missing in CPU!\n"); + } else { +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) + printk(KERN_INFO "NX (Execute Disable) protection: active\n"); +#else + /* 32bit non-PAE kernel, NX cannot be used */ + printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " + "cannot be enabled: non-PAE kernel!\n"); +#endif + } } - + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -841,20 +879,8 @@ static void __init trim_low_memory_range(void) void __init setup_arch(char **cmdline_p) { - memblock_reserve(__pa_symbol(_text), - (unsigned long)__bss_stop - (unsigned long)_text); - - early_reserve_initrd(); - - /* - * At this point everything still needed from the boot loader - * or BIOS or kernel text should be early reserved or marked not - * RAM in e820. All other memory is free game. - */ - #ifdef CONFIG_X86_32 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); - visws_early_detect(); /* * copy kernel address range established so far and switch @@ -865,161 +891,135 @@ void __init setup_arch(char **cmdline_p) KERNEL_PGD_PTRS); load_cr3(swapper_pg_dir); + /* + * Note: Quark X1000 CPUs advertise PGE incorrectly and require + * a cr3 based tlb flush, so the following __flush_tlb_all() + * will not flush anything because the CPU quirk which clears + * X86_FEATURE_PGE has not been invoked yet. Though due to the + * load_cr3() above the TLB has been flushed already. The + * quirk is invoked before subsequent calls to __flush_tlb_all() + * so proper operation is guaranteed. + */ __flush_tlb_all(); #else printk(KERN_INFO "Command line: %s\n", boot_command_line); + boot_cpu_data.x86_phys_bits = MAX_PHYSMEM_BITS; #endif +#ifdef CONFIG_CMDLINE_BOOL +#ifdef CONFIG_CMDLINE_OVERRIDE + strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); +#else + if (builtin_cmdline[0]) { + /* append boot loader cmdline to builtin */ + strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE); + strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE); + strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); + } +#endif + builtin_cmdline_added = true; +#endif + + strscpy(command_line, boot_command_line, COMMAND_LINE_SIZE); + *cmdline_p = command_line; + /* * If we have OLPC OFW, we might end up relocating the fixmap due to * reserve_top(), so do this before touching the ioremap area. */ olpc_ofw_detect(); - early_trap_init(); + idt_setup_early_traps(); early_cpu_init(); + jump_label_init(); + static_call_init(); early_ioremap_init(); setup_olpc_ofw_pgd(); - ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); - screen_info = boot_params.screen_info; - edid_info = boot_params.edid_info; -#ifdef CONFIG_X86_32 - apm_info.bios = boot_params.apm_bios_info; - ist_info = boot_params.ist_info; - if (boot_params.sys_desc_table.length != 0) { - machine_id = boot_params.sys_desc_table.table[0]; - machine_submodel_id = boot_params.sys_desc_table.table[1]; - BIOS_revision = boot_params.sys_desc_table.table[2]; - } -#endif - saved_video_mode = boot_params.hdr.vid_mode; - bootloader_type = boot_params.hdr.type_of_loader; - if ((bootloader_type >> 4) == 0xe) { - bootloader_type &= 0xf; - bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4; - } - bootloader_version = bootloader_type & 0xf; - bootloader_version |= boot_params.hdr.ext_loader_ver << 4; - -#ifdef CONFIG_BLK_DEV_RAM - rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; - rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0); - rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0); -#endif -#ifdef CONFIG_EFI - if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, - "EL32", 4)) { - set_bit(EFI_BOOT, &x86_efi_facility); - } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, - "EL64", 4)) { - set_bit(EFI_BOOT, &x86_efi_facility); - set_bit(EFI_64BIT, &x86_efi_facility); - } - - if (efi_enabled(EFI_BOOT)) - efi_memblock_x86_reserve_range(); -#endif + parse_boot_params(); x86_init.oem.arch_setup(); + /* + * Do some memory reservations *before* memory is added to memblock, so + * memblock allocations won't overwrite it. + * + * After this point, everything still needed from the boot loader or + * firmware or kernel text should be early reserved or marked not RAM in + * e820. All other memory is free game. + * + * This call needs to happen before e820__memory_setup() which calls the + * xen_memory_setup() on Xen dom0 which relies on the fact that those + * early reservations have happened already. + */ + early_reserve_memory(); + iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; - setup_memory_map(); + e820__memory_setup(); parse_setup_data(); - /* update the e820_saved too */ - e820_reserve_setup_data(); copy_edd(); - if (!boot_params.hdr.root_flags) - root_mountflags &= ~MS_RDONLY; - init_mm.start_code = (unsigned long) _text; - init_mm.end_code = (unsigned long) _etext; - init_mm.end_data = (unsigned long) _edata; - init_mm.brk = _brk_end; - - code_resource.start = __pa_symbol(_text); - code_resource.end = __pa_symbol(_etext)-1; - data_resource.start = __pa_symbol(_etext); - data_resource.end = __pa_symbol(_edata)-1; - bss_resource.start = __pa_symbol(__bss_start); - bss_resource.end = __pa_symbol(__bss_stop)-1; - -#ifdef CONFIG_CMDLINE_BOOL -#ifdef CONFIG_CMDLINE_OVERRIDE - strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); -#else - if (builtin_cmdline[0]) { - /* append boot loader cmdline to builtin */ - strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE); - strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE); - strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); - } -#endif -#endif - - strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); - *cmdline_p = command_line; + setup_initial_init_mm(_text, _etext, _edata, (void *)_brk_end); /* * x86_configure_nx() is called before parse_early_param() to detect * whether hardware doesn't support NX (so that the early EHCI debug - * console setup can safely call set_fixmap()). It may then be called - * again from within noexec_setup() during parsing early parameters - * to honor the respective command line option. + * console setup can safely call set_fixmap()). */ x86_configure_nx(); parse_early_param(); + if (efi_enabled(EFI_BOOT)) + efi_memblock_x86_reserve_range(); + x86_report_nx(); - /* after early param, so could get panic from serial */ - memblock_x86_reserve_range_setup_data(); + apic_setup_apic_calls(); if (acpi_mps_check()) { #ifdef CONFIG_X86_LOCAL_APIC - disable_apic = 1; + apic_is_disabled = true; #endif setup_clear_cpu_cap(X86_FEATURE_APIC); } -#ifdef CONFIG_PCI - if (pci_early_dump_regs) - early_dump_pci_devices(); -#endif - - finish_e820_parsing(); + e820__finish_early_params(); if (efi_enabled(EFI_BOOT)) efi_init(); - dmi_scan_machine(); - dmi_set_dump_stack_arch_desc(); + reserve_ibft_region(); + x86_init.resources.dmi_setup(); /* * VMware detection requires dmi to be available, so this - * needs to be done after dmi_scan_machine, for the BP. + * needs to be done after dmi_setup(), for the boot CPU. + * For some guest types (Xen PV, SEV-SNP, TDX) it is required to be + * called before cache_bp_init() for setting up MTRR state. */ init_hypervisor_platform(); + tsc_early_init(); x86_init.resources.probe_roms(); - /* after parse_early_param, so could debug it */ - insert_resource(&iomem_resource, &code_resource); - insert_resource(&iomem_resource, &data_resource); - insert_resource(&iomem_resource, &bss_resource); + /* + * Add resources for kernel text and data to the iomem_resource. + * Do it after parse_early_param, so it can be debugged. + */ + setup_kernel_resources(); e820_add_kernel_range(); trim_bios_range(); #ifdef CONFIG_X86_32 if (ppro_with_ram_bug()) { - e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM, - E820_RESERVED); - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + e820__range_update(0x70000000ULL, 0x40000ULL, E820_TYPE_RAM, + E820_TYPE_RESERVED); + e820__update_table(e820_table); printk(KERN_INFO "fixed physical RAM map:\n"); - e820_print_map("bad_ppro"); + e820__print_table("bad_ppro"); } #else early_gart_iommu_check(); @@ -1029,12 +1029,20 @@ void __init setup_arch(char **cmdline_p) * partially used pages are not usable - thus * we are rounding upwards: */ - max_pfn = e820_end_of_ram_pfn(); + max_pfn = e820__end_of_ram_pfn(); /* update e820 for memory not covered by WB MTRRs */ - mtrr_bp_init(); + cache_bp_init(); if (mtrr_trim_uncached_memory(max_pfn)) - max_pfn = e820_end_of_ram_pfn(); + max_pfn = e820__end_of_ram_pfn(); + + max_possible_pfn = max_pfn; + + /* + * Define random base addresses for memory sections after max_pfn is + * defined and before each memory section base is used. + */ + kernel_randomize_memory(); #ifdef CONFIG_X86_32 /* max_low_pfn get updated here */ @@ -1045,43 +1053,46 @@ void __init setup_arch(char **cmdline_p) /* How many end-of-memory variables you have, grandma! */ /* need this before calling reserve_initrd */ if (max_pfn > (1UL<<(32 - PAGE_SHIFT))) - max_low_pfn = e820_end_of_low_ram_pfn(); + max_low_pfn = e820__end_of_low_ram_pfn(); else max_low_pfn = max_pfn; - - high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; #endif - /* - * Find and reserve possible boot-time SMP configuration: - */ - find_smp_config(); - - reserve_ibft_region(); + /* Find and reserve MPTABLE area */ + x86_init.mpparse.find_mptable(); early_alloc_pgt_buf(); /* - * Need to conclude brk, before memblock_x86_fill() - * it could use memblock_find_in_range, could overlap with - * brk area. + * Need to conclude brk, before e820__memblock_setup() + * it could use memblock_find_in_range, could overlap with + * brk area. */ reserve_brk(); cleanup_highmap(); - memblock.current_limit = ISA_END_ADDRESS; - memblock_x86_fill(); + e820__memblock_setup(); /* - * The EFI specification says that boot service code won't be called - * after ExitBootServices(). This is, in fact, a lie. + * Needs to run after memblock setup because it needs the physical + * memory size. */ - if (efi_enabled(EFI_MEMMAP)) - efi_reserve_boot_services(); + mem_encrypt_setup_arch(); + cc_random_init(); + + efi_find_mirror(); + efi_esrt_init(); + efi_mokvar_table_init(); + + /* + * The EFI specification says that boot service code won't be + * called after ExitBootServices(). This is, in fact, a lie. + */ + efi_reserve_boot_services(); /* preallocate 4k for mptable mpc */ - early_reserve_e820_mpc_new(); + e820__memblock_alloc_reserved_mpc_new(); #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION setup_bios_corruption_check(); @@ -1092,19 +1103,45 @@ void __init setup_arch(char **cmdline_p) (max_pfn_mapped<<PAGE_SHIFT) - 1); #endif - reserve_real_mode(); - - trim_platform_memory_ranges(); - trim_low_memory_range(); + /* + * Find free memory for the real mode trampoline and place it there. If + * there is not enough free memory under 1M, on EFI-enabled systems + * there will be additional attempt to reclaim the memory for the real + * mode trampoline at efi_free_boot_services(). + * + * Unconditionally reserve the entire first 1M of RAM because BIOSes + * are known to corrupt low memory and several hundred kilobytes are not + * worth complex detection what memory gets clobbered. Windows does the + * same thing for very similar reasons. + * + * Moreover, on machines with SandyBridge graphics or in setups that use + * crashkernel the entire 1M is reserved anyway. + * + * Note the host kernel TDX also requires the first 1MB being reserved. + */ + x86_platform.realmode_reserve(); init_mem_mapping(); - early_trap_pf_init(); + /* + * init_mem_mapping() relies on the early IDT page fault handling. + * Now either enable FRED or install the real page fault handler + * for 64-bit in the IDT. + */ + cpu_init_replace_early_idt(); - setup_real_mode(); + /* + * Update mmu_cr4_features (and, indirectly, trampoline_cr4_features) + * with the current CR4 value. This may not be necessary, but + * auditing all the early-boot CR4 manipulation would be needed to + * rule it out. + * + * Mask off features that don't work outside long mode (just + * PCIDE for now). + */ + mmu_cr4_features = __read_cr4() & ~X86_CR4_PCIDE; - memblock.current_limit = get_max_mapped(); - dma_contiguous_reserve(0); + memblock_set_current_limit(get_max_mapped()); /* * NOTE: On x86-32, only from this point on, fixmaps are ready for use. @@ -1117,116 +1154,131 @@ void __init setup_arch(char **cmdline_p) /* Allocate bigger log buffer */ setup_log_buf(1); - reserve_initrd(); + if (efi_enabled(EFI_BOOT)) { + switch (boot_params.secure_boot) { + case efi_secureboot_mode_disabled: + pr_info("Secure boot disabled\n"); + break; + case efi_secureboot_mode_enabled: + pr_info("Secure boot enabled\n"); + break; + default: + pr_info("Secure boot could not be determined\n"); + break; + } + } -#if defined(CONFIG_ACPI) && defined(CONFIG_BLK_DEV_INITRD) - acpi_initrd_override((void *)initrd_start, initrd_end - initrd_start); -#endif + reserve_initrd(); - reserve_crashkernel(); + acpi_table_upgrade(); + /* Look for ACPI tables and reserve memory occupied by them. */ + acpi_boot_table_init(); vsmp_init(); io_delay_init(); - /* - * Parse the ACPI tables for possible boot-time SMP configuration. - */ - acpi_boot_table_init(); + early_platform_quirks(); + /* Some platforms need the APIC registered for NUMA configuration */ early_acpi_boot_init(); + x86_init.mpparse.early_parse_smp_cfg(); + + x86_flattree_get_config(); initmem_init(); - memblock_find_dma_reserve(); + dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT); -#ifdef CONFIG_KVM_GUEST - kvmclock_init(); -#endif + if (boot_cpu_has(X86_FEATURE_GBPAGES)) { + hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT); + hugetlb_bootmem_alloc(); + } + + /* + * Reserve memory for crash kernel after SRAT is parsed so that it + * won't consume hotpluggable memory. + */ + arch_reserve_crashkernel(); + + if (!early_xdbc_setup_hardware()) + early_xdbc_register_console(); x86_init.paging.pagetable_init(); - if (boot_cpu_data.cpuid_level >= 0) { - /* A CPU has %cr4 if and only if it has CPUID */ - mmu_cr4_features = read_cr4(); - if (trampoline_cr4_features) - *trampoline_cr4_features = mmu_cr4_features; - } + kasan_init(); -#ifdef CONFIG_X86_32 - /* sync back kernel address range */ - clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, - swapper_pg_dir + KERNEL_PGD_BOUNDARY, - KERNEL_PGD_PTRS); -#endif + /* + * Sync back kernel address range. + * + * FIXME: Can the later sync in setup_cpu_entry_areas() replace + * this call? + */ + sync_initial_page_table(); tboot_probe(); -#ifdef CONFIG_X86_64 map_vsyscall(); -#endif - generic_apic_probe(); + x86_32_probe_apic(); early_quirks(); + topology_apply_cmdline_limits_early(); + /* - * Read APIC and some other early information from ACPI tables. + * Parse SMP configuration. Try ACPI first and then the platform + * specific parser. */ acpi_boot_init(); - sfi_init(); - x86_dtb_init(); + x86_init.mpparse.parse_smp_cfg(); - /* - * get boot-time SMP configuration: - */ - if (smp_found_config) - get_smp_config(); + /* Last opportunity to detect and map the local APIC */ + init_apic_mappings(); - prefill_possible_map(); + topology_init_possible_cpus(); init_cpu_to_node(); + init_gi_nodes(); - init_apic_mappings(); - if (x86_io_apic_ops.init) - x86_io_apic_ops.init(); + io_apic_init_mappings(); - kvm_guest_init(); + x86_init.hyper.guest_late_init(); - e820_reserve_resources(); - e820_mark_nosave_regions(max_low_pfn); + e820__reserve_resources(); + e820__register_nosave_regions(max_pfn); x86_init.resources.reserve_resources(); - e820_setup_gap(); + e820__setup_pci_gap(); #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) if (!efi_enabled(EFI_BOOT) || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY)) - conswitchp = &vga_con; -#elif defined(CONFIG_DUMMY_CONSOLE) - conswitchp = &dummy_con; + vgacon_register_screen(&screen_info); #endif #endif x86_init.oem.banner(); x86_init.timers.wallclock_init(); - mcheck_init(); + /* + * This needs to run before setup_local_APIC() which soft-disables the + * local APIC temporarily and that masks the thermal LVT interrupt, + * leading to softlockups on machines which have configured SMI + * interrupt delivery. + */ + therm_lvt_init(); - arch_init_ideal_nops(); + mcheck_init(); register_refined_jiffies(CLOCK_TICK_RATE); #ifdef CONFIG_EFI - /* Once setup is done above, unmap the EFI memory map on - * mismatched firmware/kernel archtectures since there is no - * support for runtime services. - */ - if (efi_enabled(EFI_BOOT) && !efi_is_native()) { - pr_info("efi: Setup done, disabling due to 32/64-bit mismatch\n"); - efi_unmap_memmap(); - } + if (efi_enabled(EFI_BOOT)) + efi_apply_memmap_quirks(); #endif + + unwind_init(); } #ifdef CONFIG_X86_32 @@ -1245,3 +1297,22 @@ void __init i386_reserve_resources(void) } #endif /* CONFIG_X86_32 */ + +static struct notifier_block kernel_offset_notifier = { + .notifier_call = dump_kernel_offset +}; + +static int __init register_kernel_offset_dumper(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, + &kernel_offset_notifier); + return 0; +} +__initcall(register_kernel_offset_dumper); + +#ifdef CONFIG_HOTPLUG_CPU +bool arch_cpu_is_hotpluggable(int cpu) +{ + return cpu > 0; +} +#endif /* CONFIG_HOTPLUG_CPU */ |
