Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--   arch/x86/kernel/cpu/amd.c            | 17
-rw-r--r--   arch/x86/kernel/crash.c              | 25
-rw-r--r--   arch/x86/kernel/kexec-bzimage64.c    | 47
-rw-r--r--   arch/x86/kernel/kvm.c                | 44
-rw-r--r--   arch/x86/kernel/machine_kexec_64.c   | 44
-rw-r--r--   arch/x86/kernel/process.c            | 24
-rw-r--r--   arch/x86/kernel/relocate_kernel_64.S | 36
7 files changed, 180 insertions, 57 deletions
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index a6f88ca1a6b4..5398db4dedb4 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -546,6 +546,23 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
         u64 msr;
 
         /*
+         * Mark that WBINVD is needed during kexec on processors that
+         * support SME. This provides support for performing a successful
+         * kexec when going from SME inactive to SME active (or vice-versa).
+         *
+         * The cache must be cleared so that if there are entries with the
+         * same physical address, both with and without the encryption bit,
+         * they don't race each other when flushed and potentially end up
+         * with the wrong entry being committed to memory.
+         *
+         * Test the CPUID bit directly because with mem_encrypt=off the
+         * BSP will clear the X86_FEATURE_SME bit and the APs will not
+         * see it set after that.
+         */
+        if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0)))
+                __this_cpu_write(cache_state_incoherent, true);
+
+        /*
          * BIOS support is required for SME and SEV.
          *   For SME: If BIOS has enabled SME then adjust x86_phys_bits by
          *            the SME physical address space reduction value.
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index c6b12bed173d..335fd2ee9766 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -165,14 +165,23 @@ static struct crash_mem *fill_up_crash_elf_data(void)
         /*
          * Exclusion of crash region, crashk_low_res and/or crashk_cma_ranges
          * may cause range splits. So add extra slots here.
+         *
+         * Exclusion of the low 1M does not cause another range split, because
+         * the excluded range is [0, 1M] and the condition for splitting a new
+         * region is that the start and end parameters are both within an
+         * existing region in cmem and not equal to an existing region's
+         * start or end. Obviously, the start of [0, 1M] cannot meet this
+         * condition.
+         *
+         * But in case the exclusion of the low 1M is changed in the future
+         * (e.g. [start, 1M]), add an extra slot.
          */
-        nr_ranges += 2 + crashk_cma_cnt;
+        nr_ranges += 3 + crashk_cma_cnt;
         cmem = vzalloc(struct_size(cmem, ranges, nr_ranges));
         if (!cmem)
                 return NULL;
 
         cmem->max_nr_ranges = nr_ranges;
-        cmem->nr_ranges = 0;
 
         return cmem;
 }
@@ -323,16 +332,20 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
         struct crash_mem *cmem;
 
         /*
-         * Using random kexec_buf for passing dm crypt keys may cause a range
-         * split. So use two slots here.
+         * In the current x86 code, the elfheader is always allocated at
+         * crashk_res.start. But this depends on the allocation position of
+         * the elfheader within crashk_res. To avoid potential out-of-bounds
+         * access in the future, add an extra slot.
+         *
+         * And using random kexec_buf for passing dm crypt keys may cause a
+         * range split too, so add another extra slot here.
          */
-        nr_ranges = 2;
+        nr_ranges = 3;
         cmem = vzalloc(struct_size(cmem, ranges, nr_ranges));
         if (!cmem)
                 return -ENOMEM;
 
         cmem->max_nr_ranges = nr_ranges;
-        cmem->nr_ranges = 0;
 
         memset(&cmd, 0, sizeof(struct crash_memmap_data));
         cmd.params = params;
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 24a41f0e0cf1..c3244ac680d1 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -16,6 +16,8 @@
 #include <linux/kexec.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
+#include <linux/libfdt.h>
+#include <linux/of_fdt.h>
 #include <linux/efi.h>
 #include <linux/random.h>
 
@@ -212,6 +214,28 @@ setup_efi_state(struct boot_params *params, unsigned long params_load_addr,
 }
 #endif /* CONFIG_EFI */
 
+#ifdef CONFIG_OF_FLATTREE
+static void setup_dtb(struct boot_params *params,
+                      unsigned long params_load_addr,
+                      unsigned int dtb_setup_data_offset)
+{
+        struct setup_data *sd = (void *)params + dtb_setup_data_offset;
+        unsigned long setup_data_phys, dtb_len;
+
+        dtb_len = fdt_totalsize(initial_boot_params);
+        sd->type = SETUP_DTB;
+        sd->len = dtb_len;
+
+        /* Carry over current boot DTB with setup_data */
+        memcpy(sd->data, initial_boot_params, dtb_len);
+
+        /* Add setup data */
+        setup_data_phys = params_load_addr + dtb_setup_data_offset;
+        sd->next = params->hdr.setup_data;
+        params->hdr.setup_data = setup_data_phys;
+}
+#endif /* CONFIG_OF_FLATTREE */
+
 static void
 setup_ima_state(const struct kimage *image, struct boot_params *params,
                 unsigned long params_load_addr,
@@ -336,6 +360,17 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params,
                         sizeof(struct efi_setup_data);
 #endif
 
+#ifdef CONFIG_OF_FLATTREE
+        if (image->force_dtb && initial_boot_params) {
+                setup_dtb(params, params_load_addr, setup_data_offset);
+                setup_data_offset += sizeof(struct setup_data) +
+                                     fdt_totalsize(initial_boot_params);
+        } else {
+                pr_debug("Not carrying over DTB, force_dtb = %d\n",
+                         image->force_dtb);
+        }
+#endif
+
         if (IS_ENABLED(CONFIG_IMA_KEXEC)) {
                 /* Setup IMA log buffer state */
                 setup_ima_state(image, params, params_load_addr,
@@ -529,6 +564,12 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
                       sizeof(struct setup_data) +
                       RNG_SEED_LENGTH;
 
+#ifdef CONFIG_OF_FLATTREE
+        if (image->force_dtb && initial_boot_params)
+                kbuf.bufsz += sizeof(struct setup_data) +
+                              fdt_totalsize(initial_boot_params);
+#endif
+
         if (IS_ENABLED(CONFIG_IMA_KEXEC))
                 kbuf.bufsz += sizeof(struct setup_data) +
                               sizeof(struct ima_setup_data);
@@ -537,7 +578,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
                 kbuf.bufsz += sizeof(struct setup_data) +
                               sizeof(struct kho_data);
 
-        params = kzalloc(kbuf.bufsz, GFP_KERNEL);
+        params = kvzalloc(kbuf.bufsz, GFP_KERNEL);
         if (!params)
                 return ERR_PTR(-ENOMEM);
         efi_map_offset = params_cmdline_sz;
@@ -647,7 +688,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
         return ldata;
 
 out_free_params:
-        kfree(params);
+        kvfree(params);
         return ERR_PTR(ret);
 }
 
@@ -659,7 +700,7 @@ static int bzImage64_cleanup(void *loader_data)
         if (!ldata)
                 return 0;
 
-        kfree(ldata->bootparams_buf);
+        kvfree(ldata->bootparams_buf);
         ldata->bootparams_buf = NULL;
 
         return 0;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 8ae750cde0c6..b67d7c59dca0 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -190,7 +190,7 @@ static void apf_task_wake_all(void)
         }
 }
 
-void kvm_async_pf_task_wake(u32 token)
+static void kvm_async_pf_task_wake(u32 token)
 {
         u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
         struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
@@ -241,7 +241,6 @@ again:
         /* A dummy token might be allocated and ultimately not used.  */
         kfree(dummy);
 }
-EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
 
 noinstr u32 kvm_read_and_reset_apf_flags(void)
 {
@@ -933,6 +932,19 @@ static void kvm_sev_hc_page_enc_status(unsigned long pfn, int npages, bool enc)
 
 static void __init kvm_init_platform(void)
 {
+        u64 tolud = PFN_PHYS(e820__end_of_low_ram_pfn());
+        /*
+         * Note, hardware requires variable MTRR ranges to be power-of-2 sized
+         * and naturally aligned. But when forcing guest MTRR state, Linux
+         * doesn't program the forced ranges into hardware. Don't bother doing
+         * the math to generate a technically-legal range.
+         */
+        struct mtrr_var_range pci_hole = {
+                .base_lo = tolud | X86_MEMTYPE_UC,
+                .mask_lo = (u32)(~(SZ_4G - tolud - 1)) | MTRR_PHYSMASK_V,
+                .mask_hi = (BIT_ULL(boot_cpu_data.x86_phys_bits) - 1) >> 32,
+        };
+
         if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) &&
             kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) {
                 unsigned long nr_pages;
@@ -982,8 +994,12 @@ static void __init kvm_init_platform(void)
         kvmclock_init();
         x86_platform.apic_post_init = kvm_apic_init;
 
-        /* Set WB as the default cache mode for SEV-SNP and TDX */
-        guest_force_mtrr_state(NULL, 0, MTRR_TYPE_WRBACK);
+        /*
+         * Set WB as the default cache mode for SEV-SNP and TDX, with a single
+         * UC range for the legacy PCI hole, e.g. so that devices that expect
+         * to get UC/WC mappings don't get surprised with WB.
+         */
+        guest_force_mtrr_state(&pci_hole, 1, MTRR_TYPE_WRBACK);
 }
 
 #if defined(CONFIG_AMD_MEM_ENCRYPT)
@@ -1073,16 +1089,6 @@ static void kvm_wait(u8 *ptr, u8 val)
 void __init kvm_spinlock_init(void)
 {
         /*
-         * In case host doesn't support KVM_FEATURE_PV_UNHALT there is still an
-         * advantage of keeping virt_spin_lock_key enabled: virt_spin_lock() is
-         * preferred over native qspinlock when vCPU is preempted.
-         */
-        if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) {
-                pr_info("PV spinlocks disabled, no host support\n");
-                return;
-        }
-
-        /*
          * Disable PV spinlocks and use native qspinlock when dedicated pCPUs
          * are available.
          */
@@ -1101,6 +1107,16 @@ void __init kvm_spinlock_init(void)
                 goto out;
         }
 
+        /*
+         * In case host doesn't support KVM_FEATURE_PV_UNHALT there is still an
+         * advantage of keeping virt_spin_lock_key enabled: virt_spin_lock() is
+         * preferred over native qspinlock when vCPU is preempted.
+         */
+        if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) {
+                pr_info("PV spinlocks disabled, no host support\n");
+                return;
+        }
+
         pr_info("PV spinlocks enabled\n");
 
         __pv_init_lock_hash();
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 697fb99406e6..15088d14904f 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -29,6 +29,7 @@
 #include <asm/set_memory.h>
 #include <asm/cpu.h>
 #include <asm/efi.h>
+#include <asm/processor.h>
 
 #ifdef CONFIG_ACPI
 /*
@@ -346,6 +347,22 @@ int machine_kexec_prepare(struct kimage *image)
         unsigned long reloc_end = (unsigned long)__relocate_kernel_end;
         int result;
 
+        /*
+         * Some early TDX-capable platforms have an erratum: a kernel
+         * partial write (a write transaction smaller than a cacheline
+         * arriving at the memory controller) to TDX private memory
+         * poisons that memory, and a subsequent read triggers a machine
+         * check.
+         *
+         * On those platforms the old kernel must reset TDX private
+         * memory before jumping to the new kernel, otherwise the new
+         * kernel may see unexpected machine checks. For simplicity,
+         * just fail kexec/kdump on those platforms.
+         */
+        if (boot_cpu_has_bug(X86_BUG_TDX_PW_MCE)) {
+                pr_info_once("Not allowed on platform with tdx_pw_mce bug\n");
+                return -EOPNOTSUPP;
+        }
+
         /* Setup the identity mapped 64bit page table */
         result = init_pgtable(image, __pa(control_page));
         if (result)
@@ -384,16 +401,10 @@ void __nocfi machine_kexec(struct kimage *image)
 {
         unsigned long reloc_start = (unsigned long)__relocate_kernel_start;
         relocate_kernel_fn *relocate_kernel_ptr;
-        unsigned int host_mem_enc_active;
+        unsigned int relocate_kernel_flags;
         int save_ftrace_enabled;
         void *control_page;
 
-        /*
-         * This must be done before load_segments() since if call depth tracking
-         * is used then GS must be valid to make any function calls.
-         */
-        host_mem_enc_active = cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT);
-
 #ifdef CONFIG_KEXEC_JUMP
         if (image->preserve_context)
                 save_processor_state();
@@ -427,6 +438,17 @@ void __nocfi machine_kexec(struct kimage *image)
          */
         relocate_kernel_ptr = control_page + (unsigned long)relocate_kernel - reloc_start;
 
+        relocate_kernel_flags = 0;
+        if (image->preserve_context)
+                relocate_kernel_flags |= RELOC_KERNEL_PRESERVE_CONTEXT;
+
+        /*
+         * This must be done before load_segments() since it resets
+         * GS to 0 and percpu data needs the correct GS to work.
+         */
+        if (this_cpu_read(cache_state_incoherent))
+                relocate_kernel_flags |= RELOC_KERNEL_CACHE_INCOHERENT;
+
         /*
          * The segment registers are funny things, they have both a
          * visible and an invisible part. Whenever the visible part is
@@ -436,6 +458,11 @@ void __nocfi machine_kexec(struct kimage *image)
          *
          * Take advantage of this here by force loading the segments,
          * before the GDT is zapped with an invalid value.
+         *
+         * load_segments() resets GS to 0. Don't make any function call
+         * after here since call depth tracking uses percpu variables to
+         * operate (relocate_kernel() is explicitly ignored by call depth
+         * tracking).
          */
         load_segments();
 
@@ -443,8 +470,7 @@ void __nocfi machine_kexec(struct kimage *image)
         image->start = relocate_kernel_ptr((unsigned long)image->head,
                                            virt_to_phys(control_page),
                                            image->start,
-                                           image->preserve_context,
-                                           host_mem_enc_active);
+                                           relocate_kernel_flags);
 
 #ifdef CONFIG_KEXEC_JUMP
         if (image->preserve_context)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e3a3987b0c4f..4c718f8adc59 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -89,6 +89,16 @@ DEFINE_PER_CPU(bool, __tss_limit_invalid);
 EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);
 
 /*
+ * The cache may be in an incoherent state and needs flushing during kexec.
+ * E.g., on SME/TDX platforms, dirty cacheline aliases with and without
+ * encryption bit(s) can coexist and the cache needs to be flushed before
+ * booting to the new kernel to avoid silent memory corruption due to
+ * dirty cachelines with different encryption properties being written
+ * back to memory.
+ */
+DEFINE_PER_CPU(bool, cache_state_incoherent);
+
+/*
  * this gets called so that we can store lazy state into memory and copy the
  * current task into the new thread.
  */
@@ -827,19 +837,7 @@ void __noreturn stop_this_cpu(void *dummy)
         disable_local_APIC();
         mcheck_cpu_clear(c);
 
-        /*
-         * Use wbinvd on processors that support SME. This provides support
-         * for performing a successful kexec when going from SME inactive
-         * to SME active (or vice-versa). The cache must be cleared so that
-         * if there are entries with the same physical address, both with and
-         * without the encryption bit, they don't race each other when flushed
-         * and potentially end up with the wrong entry being committed to
-         * memory.
-         *
-         * Test the CPUID bit directly because the machine might've cleared
-         * X86_FEATURE_SME due to cmdline options.
-         */
-        if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0)))
+        if (this_cpu_read(cache_state_incoherent))
                 wbinvd();
 
         /*
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index ea604f4d0b52..11e20bb13aca 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -66,8 +66,7 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
          * %rdi indirection_page
          * %rsi pa_control_page
          * %rdx start address
-         * %rcx preserve_context
-         * %r8  host_mem_enc_active
+         * %rcx flags: RELOC_KERNEL_*
          */
 
         /* Save the CPU context, used for jumping back */
@@ -111,7 +110,7 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
         /* save indirection list for jumping back */
         movq    %rdi, pa_backup_pages_map(%rip)
 
-        /* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */
+        /* Save the flags to %r11 as swap_pages clobbers %rcx. */
         movq    %rcx, %r11
 
         /* setup a new stack at the end of the physical control page */
@@ -129,9 +128,8 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
         /*
          * %rdi indirection page
          * %rdx start address
-         * %r8  host_mem_enc_active
          * %r9  page table page
-         * %r11 preserve_context
+         * %r11 flags: RELOC_KERNEL_*
          * %r13 original CR4 when relocate_kernel() was invoked
          */
 
@@ -200,14 +198,21 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
         movq    %r9, %cr3
 
         /*
+         * If the memory cache is in an incoherent state, e.g., due to
+         * memory encryption, do WBINVD to flush the cache.
+         *
          * If SME is active, there could be old encrypted cache line
          * entries that will conflict with the now unencrypted memory
          * used by kexec. Flush the caches before copying the kernel.
+         *
+         * Note, the SME setup code sets this flag whenever the platform
+         * supports SME, so the WBINVD is performed even if SME is not
+         * activated by the kernel; this does no harm.
          */
-        testq   %r8, %r8
-        jz      .Lsme_off
+        testb   $RELOC_KERNEL_CACHE_INCOHERENT, %r11b
+        jz      .Lnowbinvd
         wbinvd
-.Lsme_off:
+.Lnowbinvd:
 
         call    swap_pages
 
@@ -220,7 +225,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
         movq    %cr3, %rax
         movq    %rax, %cr3
 
-        testq   %r11, %r11      /* preserve_context */
+        testb   $RELOC_KERNEL_PRESERVE_CONTEXT, %r11b
         jnz     .Lrelocate
 
         /*
@@ -273,7 +278,13 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
         ANNOTATE_NOENDBR
         andq    $PAGE_MASK, %r8
         lea     PAGE_SIZE(%r8), %rsp
-        movl    $1, %r11d       /* Ensure preserve_context flag is set */
+        /*
+         * Ensure the RELOC_KERNEL_PRESERVE_CONTEXT flag is set so that
+         * swap_pages() can swap pages correctly. Note all other
+         * RELOC_KERNEL_* flags passed to relocate_kernel() are not
+         * restored.
+         */
+        movl    $RELOC_KERNEL_PRESERVE_CONTEXT, %r11d
         call    swap_pages
         movq    kexec_va_control_page(%rip), %rax
 0:      addq    $virtual_mapped - 0b, %rax
@@ -321,7 +332,7 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
         UNWIND_HINT_END_OF_STACK
         /*
          * %rdi indirection page
-         * %r11 preserve_context
+         * %r11 flags: RELOC_KERNEL_*
          */
         movq    %rdi, %rcx      /* Put the indirection_page in %rcx */
         xorl    %edi, %edi
@@ -357,7 +368,8 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
         movq    %rdi, %rdx      /* Save destination page to %rdx */
         movq    %rsi, %rax      /* Save source page to %rax */
 
-        testq   %r11, %r11      /* Only actually swap for ::preserve_context */
+        /* Only actually swap for ::preserve_context */
+        testb   $RELOC_KERNEL_PRESERVE_CONTEXT, %r11b
         jz      .Lnoswap
 
         /* copy source page to swap page */
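
Note on the relocate_kernel() interface change above: the former preserve_context and host_mem_enc_active parameters are collapsed into a single flags word that relocate_kernel_64.S tests bit by bit. The standalone C sketch below only illustrates that pack-and-test pattern; the RELOC_KERNEL_* bit values and the build_reloc_flags()/mock_relocate_kernel() helpers are assumptions for illustration, not the kernel's definitions (those live in asm/kexec.h and are not part of this diff).

/* Illustrative sketch only: bit values and helpers are assumed, not taken from asm/kexec.h. */
#include <stdbool.h>
#include <stdio.h>

#define RELOC_KERNEL_PRESERVE_CONTEXT  (1u << 0)  /* assumed value */
#define RELOC_KERNEL_CACHE_INCOHERENT  (1u << 1)  /* assumed value */

/* Caller side: pack independent conditions into one flags word, in the
 * same spirit as machine_kexec() before it calls relocate_kernel(). */
static unsigned int build_reloc_flags(bool preserve_context, bool cache_incoherent)
{
        unsigned int flags = 0;

        if (preserve_context)
                flags |= RELOC_KERNEL_PRESERVE_CONTEXT;
        if (cache_incoherent)
                flags |= RELOC_KERNEL_CACHE_INCOHERENT;
        return flags;
}

/* Callee side: test individual bits, mirroring the testb $RELOC_KERNEL_*
 * checks added to relocate_kernel_64.S. */
static void mock_relocate_kernel(unsigned int flags)
{
        if (flags & RELOC_KERNEL_CACHE_INCOHERENT)
                puts("would WBINVD before copying pages");
        if (flags & RELOC_KERNEL_PRESERVE_CONTEXT)
                puts("would swap pages back for a kexec jump");
}

int main(void)
{
        /* e.g. a kexec jump on a platform with a coherent cache */
        mock_relocate_kernel(build_reloc_flags(true, false));
        return 0;
}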