Diffstat (limited to 'arch/x86/kernel/crash.c')
 arch/x86/kernel/crash.c | 417 +++++++++++++++++++++++++++++++-----------------
 1 file changed, 252 insertions(+), 165 deletions(-)
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index c8b07d8ea5a2..335fd2ee9766 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Architecture specific (i386/x86_64) functions for kexec based crash dumps.
*
@@ -23,7 +24,9 @@
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
+#include <linux/memblock.h>
+#include <asm/bootparam.h>
#include <asm/processor.h>
#include <asm/hardirq.h>
#include <asm/nmi.h>
@@ -35,9 +38,10 @@
#include <linux/kdebug.h>
#include <asm/cpu.h>
#include <asm/reboot.h>
-#include <asm/virtext.h>
#include <asm/intel_pt.h>
#include <asm/crash.h>
+#include <asm/cmdline.h>
+#include <asm/sev.h>
/* Used while preparing memory map entries for second kernel */
struct crash_memmap_data {
@@ -46,61 +50,19 @@ struct crash_memmap_data {
unsigned int type;
};
-/*
- * This is used to VMCLEAR all VMCSs loaded on the
- * processor. And when loading kvm_intel module, the
- * callback function pointer will be assigned.
- *
- * protected by rcu.
- */
-crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL;
-EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss);
-unsigned long crash_zero_bytes;
-
-static inline void cpu_crash_vmclear_loaded_vmcss(void)
-{
- crash_vmclear_fn *do_vmclear_operation = NULL;
-
- rcu_read_lock();
- do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss);
- if (do_vmclear_operation)
- do_vmclear_operation();
- rcu_read_unlock();
-}
-
#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
{
-#ifdef CONFIG_X86_32
- struct pt_regs fixed_regs;
-
- if (!user_mode(regs)) {
- crash_fixup_ss_esp(&fixed_regs, regs);
- regs = &fixed_regs;
- }
-#endif
crash_save_cpu(regs, cpu);
/*
- * VMCLEAR VMCSs loaded on all cpus if needed.
- */
- cpu_crash_vmclear_loaded_vmcss();
-
- /* Disable VMX or SVM if needed.
- *
- * We need to disable virtualization on all CPUs.
- * Having VMX or SVM enabled on any CPU may break rebooting
- * after the kdump kernel has finished its task.
- */
- cpu_emergency_vmxoff();
- cpu_emergency_svm_disable();
-
- /*
* Disable Intel PT to stop its logging
*/
cpu_emergency_stop_pt();
+ kdump_sev_callback();
+
disable_local_APIC();
}
@@ -149,17 +111,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
crash_smp_send_stop();
- /*
- * VMCLEAR VMCSs loaded on this cpu if needed.
- */
- cpu_crash_vmclear_loaded_vmcss();
-
- /* Booting kdump kernel with VMX or SVM enabled won't work,
- * because (among other limitations) we can't disable paging
- * with the virt flags.
- */
- cpu_emergency_vmxoff();
- cpu_emergency_svm_disable();
+ cpu_emergency_disable_virtualization();
/*
* Disable Intel PT to stop its logging
@@ -176,10 +128,22 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
#ifdef CONFIG_HPET_TIMER
hpet_disable();
#endif
- crash_save_cpu(regs, safe_smp_processor_id());
+
+ /*
+ * Non-crash kexec calls enc_kexec_begin() while scheduling is still
+ * active. This allows the callback to wait until all in-flight
+ * shared<->private conversions are complete. In a crash scenario,
+ * enc_kexec_begin() gets called after all but one CPU have been shut
+ * down and interrupts have been disabled. This allows the callback to
+ * detect a race with the conversion and report it.
+ */
+ x86_platform.guest.enc_kexec_begin();
+ x86_platform.guest.enc_kexec_finish();
+
+ crash_save_cpu(regs, smp_processor_id());
}
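
[Editor's note: an illustrative summary of the ordering contract of the new guest hooks called above, assuming the hooks behave as the in-tree comment describes; the real TDX/SEV callbacks differ in detail.]

	/*
	 * Non-crash kexec (scheduling still active):
	 *	x86_platform.guest.enc_kexec_begin();	// may wait until all
	 *						// in-flight shared<->private
	 *						// conversions complete
	 *	x86_platform.guest.enc_kexec_finish();
	 *
	 * Crash path (this file): called with all but one CPU shut down
	 * and interrupts disabled, so enc_kexec_begin() cannot wait; it
	 * can only detect a racing conversion and report it.
	 */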
-#ifdef CONFIG_KEXEC_FILE
+#if defined(CONFIG_KEXEC_FILE) || defined(CONFIG_CRASH_HOTPLUG)
static int get_nr_ram_ranges_callback(struct resource *res, void *arg)
{
unsigned int *nr_ranges = arg;
@@ -194,23 +158,30 @@ static struct crash_mem *fill_up_crash_elf_data(void)
unsigned int nr_ranges = 0;
struct crash_mem *cmem;
- walk_system_ram_res(0, -1, &nr_ranges,
- get_nr_ram_ranges_callback);
+ walk_system_ram_res(0, -1, &nr_ranges, get_nr_ram_ranges_callback);
if (!nr_ranges)
return NULL;
/*
- * Exclusion of crash region and/or crashk_low_res may cause
- * another range split. So add extra two slots here.
+ * Exclusion of crash region, crashk_low_res and/or crashk_cma_ranges
+ * may cause range splits. So add extra slots here.
+ *
+	 * Excluding the low 1M cannot cause another range split today: a
+	 * new region is only created when both the start and the end of
+	 * the excluded range fall strictly inside an existing region in
+	 * cmem (neither may equal that region's start or end). The
+	 * excluded range here is [0, 1M], so its start can never satisfy
+	 * that condition.
+	 *
+	 * But in case the low 1M exclusion ever changes (e.g. to
+	 * [start, 1M]), add an extra slot to be safe.
*/
- nr_ranges += 2;
- cmem = vzalloc(sizeof(struct crash_mem) +
- sizeof(struct crash_mem_range) * nr_ranges);
+ nr_ranges += 3 + crashk_cma_cnt;
+ cmem = vzalloc(struct_size(cmem, ranges, nr_ranges));
if (!cmem)
return NULL;
cmem->max_nr_ranges = nr_ranges;
- cmem->nr_ranges = 0;
return cmem;
}
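
[Editor's note: to make the split rule in the comment above concrete, here is a stand-alone user-space sketch. exclude_range() is a simplified illustration written for this note, not the kernel's crash_exclude_mem_range().]

	#include <stdio.h>

	struct range { unsigned long long start, end; };

	/* Remove [s, e] from r; write the 0, 1 or 2 surviving pieces to
	 * out[] and return how many. A return of 2 is a "range split". */
	static int exclude_range(struct range r, unsigned long long s,
				 unsigned long long e, struct range out[2])
	{
		int n = 0;

		if (e < r.start || s > r.end) {		/* no overlap at all */
			out[n++] = r;
		} else {
			if (s > r.start)		/* piece left of [s, e] */
				out[n++] = (struct range){ r.start, s - 1 };
			if (e < r.end)			/* piece right of [s, e] */
				out[n++] = (struct range){ e + 1, r.end };
		}
		return n;
	}

	int main(void)
	{
		struct range ram = { 0, 0x3fffffffULL }, out[2];

		/* Low 1M starts at ram.start, so nothing survives on the
		 * left: one trimmed range, no split. */
		int n = exclude_range(ram, 0, 0xfffff, out);
		printf("exclude low 1M -> %d range(s)\n", n);	/* 1 */

		/* A region strictly inside the remainder splits it in two. */
		n = exclude_range(out[0], 0x10000000, 0x1fffffff, out);
		printf("exclude middle -> %d range(s)\n", n);	/* 2 */
		return 0;
	}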
@@ -222,20 +193,32 @@ static struct crash_mem *fill_up_crash_elf_data(void)
static int elf_header_exclude_ranges(struct crash_mem *cmem)
{
int ret = 0;
+ int i;
+
+ /* Exclude the low 1M because it is always reserved */
+ ret = crash_exclude_mem_range(cmem, 0, SZ_1M - 1);
+ if (ret)
+ return ret;
/* Exclude crashkernel region */
ret = crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end);
if (ret)
return ret;
- if (crashk_low_res.end) {
+ if (crashk_low_res.end)
ret = crash_exclude_mem_range(cmem, crashk_low_res.start,
- crashk_low_res.end);
+ crashk_low_res.end);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < crashk_cma_cnt; ++i) {
+ ret = crash_exclude_mem_range(cmem, crashk_cma_ranges[i].start,
+ crashk_cma_ranges[i].end);
if (ret)
return ret;
}
- return ret;
+ return 0;
}
static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg)
@@ -250,20 +233,17 @@ static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg)
}
/* Prepare elf headers. Return addr and size */
-static int prepare_elf_headers(struct kimage *image, void **addr,
- unsigned long *sz)
+static int prepare_elf_headers(void **addr, unsigned long *sz,
+ unsigned long *nr_mem_ranges)
{
struct crash_mem *cmem;
- Elf64_Ehdr *ehdr;
- Elf64_Phdr *phdr;
- int ret, i;
+ int ret;
cmem = fill_up_crash_elf_data();
if (!cmem)
return -ENOMEM;
- ret = walk_system_ram_res(0, -1, cmem,
- prepare_elf64_ram_headers_callback);
+ ret = walk_system_ram_res(0, -1, cmem, prepare_elf64_ram_headers_callback);
if (ret)
goto out;
@@ -272,30 +252,19 @@ static int prepare_elf_headers(struct kimage *image, void **addr,
if (ret)
goto out;
+ /* Return the computed number of memory ranges, for hotplug usage */
+ *nr_mem_ranges = cmem->nr_ranges;
+
/* By default prepare 64bit headers */
- ret = crash_prepare_elf64_headers(cmem,
- IS_ENABLED(CONFIG_X86_64), addr, sz);
- if (ret)
- goto out;
+ ret = crash_prepare_elf64_headers(cmem, IS_ENABLED(CONFIG_X86_64), addr, sz);
- /*
- * If a range matches backup region, adjust offset to backup
- * segment.
- */
- ehdr = (Elf64_Ehdr *)*addr;
- phdr = (Elf64_Phdr *)(ehdr + 1);
- for (i = 0; i < ehdr->e_phnum; phdr++, i++)
- if (phdr->p_type == PT_LOAD &&
- phdr->p_paddr == image->arch.backup_src_start &&
- phdr->p_memsz == image->arch.backup_src_sz) {
- phdr->p_offset = image->arch.backup_load_addr;
- break;
- }
out:
vfree(cmem);
return ret;
}
+#endif
+#ifdef CONFIG_KEXEC_FILE
static int add_e820_entry(struct boot_params *params, struct e820_entry *entry)
{
unsigned int nr_e820_entries;
@@ -304,8 +273,7 @@ static int add_e820_entry(struct boot_params *params, struct e820_entry *entry)
if (nr_e820_entries >= E820_MAX_ENTRIES_ZEROPAGE)
return 1;
- memcpy(&params->e820_table[nr_e820_entries], entry,
- sizeof(struct e820_entry));
+ memcpy(&params->e820_table[nr_e820_entries], entry, sizeof(struct e820_entry));
params->e820_entries++;
return 0;
}
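
[Editor's note: a hypothetical call, just to show the helper's contract; the addr/size values are invented. The callers in this file mostly ignore the return value because the handful of entries they add always fit.]

	struct e820_entry ei = {
		.addr = 0x100000,	/* invented base: 1 MiB */
		.size = 0x200000,	/* invented size: 2 MiB */
		.type = E820_TYPE_RAM,
	};

	if (add_e820_entry(params, &ei))
		pr_warn("boot_params e820 table full, entry dropped\n");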
@@ -329,69 +297,92 @@ static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem,
unsigned long long mend)
{
unsigned long start, end;
- int ret = 0;
+ int ret;
cmem->ranges[0].start = mstart;
cmem->ranges[0].end = mend;
cmem->nr_ranges = 1;
- /* Exclude Backup region */
- start = image->arch.backup_load_addr;
- end = start + image->arch.backup_src_sz - 1;
+ /* Exclude elf header region */
+ start = image->elf_load_addr;
+ end = start + image->elf_headers_sz - 1;
ret = crash_exclude_mem_range(cmem, start, end);
+
if (ret)
return ret;
- /* Exclude elf header region */
- start = image->arch.elf_load_addr;
- end = start + image->arch.elf_headers_sz - 1;
- return crash_exclude_mem_range(cmem, start, end);
+ /* Exclude dm crypt keys region */
+ if (image->dm_crypt_keys_addr) {
+ start = image->dm_crypt_keys_addr;
+ end = start + image->dm_crypt_keys_sz - 1;
+ return crash_exclude_mem_range(cmem, start, end);
+ }
+
+ return ret;
}
/* Prepare memory map for crash dump kernel */
int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
{
+ unsigned int nr_ranges = 0;
int i, ret = 0;
unsigned long flags;
struct e820_entry ei;
struct crash_memmap_data cmd;
struct crash_mem *cmem;
- cmem = vzalloc(sizeof(struct crash_mem));
+ /*
+	 * In the current x86 code, the elf header is always allocated at
+	 * crashk_res.start. But since that is just an allocation detail
+	 * that may change, add an extra slot to avoid a potential
+	 * out-of-bounds access in the future.
+	 *
+	 * And using a random kexec_buf for passing dm crypt keys may
+	 * cause a range split too, so add another extra slot here.
+ */
+ nr_ranges = 3;
+ cmem = vzalloc(struct_size(cmem, ranges, nr_ranges));
if (!cmem)
return -ENOMEM;
+ cmem->max_nr_ranges = nr_ranges;
+
memset(&cmd, 0, sizeof(struct crash_memmap_data));
cmd.params = params;
- /* Add first 640K segment */
- ei.addr = image->arch.backup_src_start;
- ei.size = image->arch.backup_src_sz;
- ei.type = E820_TYPE_RAM;
- add_e820_entry(params, &ei);
+ /* Add the low 1M */
+ cmd.type = E820_TYPE_RAM;
+ flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+ walk_iomem_res_desc(IORES_DESC_NONE, flags, 0, (1<<20)-1, &cmd,
+ memmap_entry_callback);
/* Add ACPI tables */
cmd.type = E820_TYPE_ACPI;
flags = IORESOURCE_MEM | IORESOURCE_BUSY;
walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, &cmd,
- memmap_entry_callback);
+ memmap_entry_callback);
/* Add ACPI Non-volatile Storage */
cmd.type = E820_TYPE_NVS;
walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd,
- memmap_entry_callback);
+ memmap_entry_callback);
+
+ /* Add e820 reserved ranges */
+ cmd.type = E820_TYPE_RESERVED;
+ flags = IORESOURCE_MEM;
+ walk_iomem_res_desc(IORES_DESC_RESERVED, flags, 0, -1, &cmd,
+ memmap_entry_callback);
/* Add crashk_low_res region */
if (crashk_low_res.end) {
ei.addr = crashk_low_res.start;
- ei.size = crashk_low_res.end - crashk_low_res.start + 1;
+ ei.size = resource_size(&crashk_low_res);
ei.type = E820_TYPE_RAM;
add_e820_entry(params, &ei);
}
/* Exclude some ranges from crashk_res and add rest to memmap */
- ret = memmap_exclude_ranges(image, cmem, crashk_res.start,
- crashk_res.end);
+ ret = memmap_exclude_ranges(image, cmem, crashk_res.start, crashk_res.end);
if (ret)
goto out;
@@ -406,79 +397,175 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
add_e820_entry(params, &ei);
}
+ for (i = 0; i < crashk_cma_cnt; ++i) {
+ ei.addr = crashk_cma_ranges[i].start;
+ ei.size = crashk_cma_ranges[i].end -
+ crashk_cma_ranges[i].start + 1;
+ ei.type = E820_TYPE_RAM;
+ add_e820_entry(params, &ei);
+ }
+
out:
vfree(cmem);
return ret;
}
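
[Editor's note: why three slots suffice above, assuming (as the comment notes) that the elf header stays at crashk_res.start and only the dm-crypt key buffer can land strictly inside the region.]

	[crashk_res.start ........................ crashk_res.end]  1 range
	 exclude elf header  : begins at crashk_res.start, trims -> 1 range
	 exclude dm-crypt key: may sit strictly inside remainder -> 2 ranges
	 + 1 spare slot in case the elf header placement changes  = 3 slots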
-static int determine_backup_region(struct resource *res, void *arg)
-{
- struct kimage *image = arg;
-
- image->arch.backup_src_start = res->start;
- image->arch.backup_src_sz = resource_size(res);
-
- /* Expecting only one range for backup region */
- return 1;
-}
-
int crash_load_segments(struct kimage *image)
{
int ret;
+ unsigned long pnum = 0;
struct kexec_buf kbuf = { .image = image, .buf_min = 0,
.buf_max = ULONG_MAX, .top_down = false };
+ /* Prepare elf headers and add a segment */
+ ret = prepare_elf_headers(&kbuf.buffer, &kbuf.bufsz, &pnum);
+ if (ret)
+ return ret;
+
+ image->elf_headers = kbuf.buffer;
+ image->elf_headers_sz = kbuf.bufsz;
+ kbuf.memsz = kbuf.bufsz;
+
+#ifdef CONFIG_CRASH_HOTPLUG
/*
- * Determine and load a segment for backup area. First 640K RAM
- * region is backup source
+ * The elfcorehdr segment size accounts for VMCOREINFO, kernel_map,
+ * maximum CPUs and maximum memory ranges.
*/
+ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG))
+ pnum = 2 + CONFIG_NR_CPUS_DEFAULT + CONFIG_CRASH_MAX_MEMORY_RANGES;
+ else
+ pnum += 2 + CONFIG_NR_CPUS_DEFAULT;
- ret = walk_system_ram_res(KEXEC_BACKUP_SRC_START, KEXEC_BACKUP_SRC_END,
- image, determine_backup_region);
+ if (pnum < (unsigned long)PN_XNUM) {
+ kbuf.memsz = pnum * sizeof(Elf64_Phdr);
+ kbuf.memsz += sizeof(Elf64_Ehdr);
- /* Zero or postive return values are ok */
- if (ret < 0)
- return ret;
+ image->elfcorehdr_index = image->nr_segments;
- /* Add backup segment. */
- if (image->arch.backup_src_sz) {
- kbuf.buffer = &crash_zero_bytes;
- kbuf.bufsz = sizeof(crash_zero_bytes);
- kbuf.memsz = image->arch.backup_src_sz;
- kbuf.buf_align = PAGE_SIZE;
- /*
- * Ideally there is no source for backup segment. This is
- * copied in purgatory after crash. Just add a zero filled
- * segment for now to make sure checksum logic works fine.
- */
- ret = kexec_add_buffer(&kbuf);
- if (ret)
- return ret;
- image->arch.backup_load_addr = kbuf.mem;
- pr_debug("Loaded backup region at 0x%lx backup_start=0x%lx memsz=0x%lx\n",
- image->arch.backup_load_addr,
- image->arch.backup_src_start, kbuf.memsz);
+ /* Mark as usable to crash kernel, else crash kernel fails on boot */
+ image->elf_headers_sz = kbuf.memsz;
+ } else {
+ pr_err("number of Phdrs %lu exceeds max\n", pnum);
}
+#endif
- /* Prepare elf headers and add a segment */
- ret = prepare_elf_headers(image, &kbuf.buffer, &kbuf.bufsz);
- if (ret)
- return ret;
-
- image->arch.elf_headers = kbuf.buffer;
- image->arch.elf_headers_sz = kbuf.bufsz;
-
- kbuf.memsz = kbuf.bufsz;
kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
+ kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
ret = kexec_add_buffer(&kbuf);
- if (ret) {
- vfree((void *)image->arch.elf_headers);
+ if (ret)
return ret;
- }
- image->arch.elf_load_addr = kbuf.mem;
- pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
- image->arch.elf_load_addr, kbuf.bufsz, kbuf.bufsz);
+ image->elf_load_addr = kbuf.mem;
+ kexec_dprintk("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ image->elf_load_addr, kbuf.bufsz, kbuf.memsz);
return ret;
}
#endif /* CONFIG_KEXEC_FILE */
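
[Editor's note: putting numbers on the CONFIG_CRASH_HOTPLUG sizing above. This worked example assumes CONFIG_NR_CPUS_DEFAULT=8192 and CONFIG_CRASH_MAX_MEMORY_RANGES=8192, which are Kconfig-dependent and vary by configuration.]

	pnum       = 2 + 8192 + 8192 = 16386 phdrs	(< PN_XNUM = 0xffff,
							 so the pr_err()
							 branch is rare)
	kbuf.memsz = pnum * sizeof(Elf64_Phdr) + sizeof(Elf64_Ehdr)
	           = 16386 * 56 + 64 = 917680 bytes ~= 896 KiB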
+
+#ifdef CONFIG_CRASH_HOTPLUG
+
+#undef pr_fmt
+#define pr_fmt(fmt) "crash hp: " fmt
+
+int arch_crash_hotplug_support(struct kimage *image, unsigned long kexec_flags)
+{
+
+#ifdef CONFIG_KEXEC_FILE
+ if (image->file_mode)
+ return 1;
+#endif
+ /*
+ * Initially, crash hotplug support for kexec_load was added
+ * with the KEXEC_UPDATE_ELFCOREHDR flag. Later, this
+ * functionality was expanded to accommodate multiple kexec
+ * segment updates, leading to the introduction of the
+ * KEXEC_CRASH_HOTPLUG_SUPPORT kexec flag bit. Consequently,
+ * when the kexec tool sends either of these flags, it indicates
+ * that the required kexec segment (elfcorehdr) is excluded from
+ * the SHA calculation.
+ */
+ return (kexec_flags & KEXEC_UPDATE_ELFCOREHDR ||
+ kexec_flags & KEXEC_CRASH_HOTPLUG_SUPPORT);
+}
+
+unsigned int arch_crash_get_elfcorehdr_size(void)
+{
+ unsigned int sz;
+
+ /* kernel_map, VMCOREINFO and maximum CPUs */
+ sz = 2 + CONFIG_NR_CPUS_DEFAULT;
+ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG))
+ sz += CONFIG_CRASH_MAX_MEMORY_RANGES;
+ sz *= sizeof(Elf64_Phdr);
+ return sz;
+}
+
+/**
+ * arch_crash_handle_hotplug_event() - Handle hotplug elfcorehdr changes
+ * @image: a pointer to kexec_crash_image
+ * @arg: struct memory_notify handler for memory hotplug case and
+ * NULL for CPU hotplug case.
+ *
+ * Prepare the new elfcorehdr and replace the existing elfcorehdr.
+ */
+void arch_crash_handle_hotplug_event(struct kimage *image, void *arg)
+{
+ void *elfbuf = NULL, *old_elfcorehdr;
+ unsigned long nr_mem_ranges;
+ unsigned long mem, memsz;
+ unsigned long elfsz = 0;
+
+ /*
+ * As crash_prepare_elf64_headers() has already described all
+ * possible CPUs, there is no need to update the elfcorehdr
+ * for additional CPU changes.
+ */
+ if ((image->file_mode || image->elfcorehdr_updated) &&
+ ((image->hp_action == KEXEC_CRASH_HP_ADD_CPU) ||
+ (image->hp_action == KEXEC_CRASH_HP_REMOVE_CPU)))
+ return;
+
+ /*
+ * Create the new elfcorehdr reflecting the changes to CPU and/or
+ * memory resources.
+ */
+ if (prepare_elf_headers(&elfbuf, &elfsz, &nr_mem_ranges)) {
+ pr_err("unable to create new elfcorehdr");
+ goto out;
+ }
+
+ /*
+ * Obtain address and size of the elfcorehdr segment, and
+ * check it against the new elfcorehdr buffer.
+ */
+ mem = image->segment[image->elfcorehdr_index].mem;
+ memsz = image->segment[image->elfcorehdr_index].memsz;
+ if (elfsz > memsz) {
+ pr_err("update elfcorehdr elfsz %lu > memsz %lu",
+ elfsz, memsz);
+ goto out;
+ }
+
+ /*
+ * Copy new elfcorehdr over the old elfcorehdr at destination.
+ */
+ old_elfcorehdr = kmap_local_page(pfn_to_page(mem >> PAGE_SHIFT));
+ if (!old_elfcorehdr) {
+ pr_err("mapping elfcorehdr segment failed\n");
+ goto out;
+ }
+
+ /*
+ * Temporarily invalidate the crash image while the
+ * elfcorehdr is updated.
+ */
+ xchg(&kexec_crash_image, NULL);
+ memcpy_flushcache(old_elfcorehdr, elfbuf, elfsz);
+ xchg(&kexec_crash_image, image);
+ kunmap_local(old_elfcorehdr);
+ pr_debug("updated elfcorehdr\n");
+
+out:
+ vfree(elfbuf);
+}
+#endif
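
[Editor's note: the xchg() pair in arch_crash_handle_hotplug_event() is a small publish/unpublish protocol: a crashing CPU that sees a NULL kexec_crash_image skips the jump into the crash image, so a half-written elfcorehdr is never consumed. Below is a simplified sketch of the consuming side; illustrative only, the real crash_kexec() path also serializes on a lock.]

	void example_crash_consumer(struct pt_regs *regs)
	{
		/* NULL means no crash kernel loaded *or* an elfcorehdr
		 * update in progress; either way, do not jump into the
		 * image. */
		if (!kexec_crash_image)
			return;

		/* ...machine_kexec(kexec_crash_image) on a consistent
		 * image... */
	}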