Diffstat (limited to 'arch/x86')
39 files changed, 489 insertions, 246 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 75f3de70df51..9d034a987c6e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1565,7 +1565,6 @@ config ARCH_SPARSEMEM_ENABLE def_bool y select SPARSEMEM_STATIC if X86_32 select SPARSEMEM_VMEMMAP_ENABLE if X86_64 - select SPARSEMEM_VMEMMAP if X86_64 config ARCH_SPARSEMEM_DEFAULT def_bool X86_64 || (NUMA && X86_32) @@ -1903,7 +1902,6 @@ config INTEL_TDX_HOST depends on X86_X2APIC select ARCH_KEEP_MEMBLOCK depends on CONTIG_ALLOC - depends on !KEXEC_CORE depends on X86_MCE help Intel Trust Domain Extensions (TDX) protects guest VMs from malicious diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S index 29c5c32c16c3..907bd233c6c1 100644 --- a/arch/x86/entry/entry_64_fred.S +++ b/arch/x86/entry/entry_64_fred.S @@ -16,7 +16,7 @@ .macro FRED_ENTER UNWIND_HINT_END_OF_STACK - ENDBR + ANNOTATE_NOENDBR PUSH_AND_CLEAR_REGS movq %rsp, %rdi /* %rdi -> pt_regs */ .endm diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index c9103a6fa06e..6e6c0a740837 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -124,7 +124,12 @@ bool emulate_vsyscall(unsigned long error_code, if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER) return false; - if (!(error_code & X86_PF_INSTR)) { + /* + * Assume that faults at regs->ip are because of an + * instruction fetch. Return early and avoid + * emulation for faults during data accesses: + */ + if (address != regs->ip) { /* Failed vsyscall read */ if (vsyscall_mode == EMULATE) return false; @@ -137,12 +142,18 @@ bool emulate_vsyscall(unsigned long error_code, } /* + * X86_PF_INSTR is only set when NX is supported. When + * available, use it to double-check that the emulation code + * is only being used for instruction fetches: + */ + if (cpu_feature_enabled(X86_FEATURE_NX)) + WARN_ON_ONCE(!(error_code & X86_PF_INSTR)); + + /* * No point in checking CS -- the only way to get here is a user mode * trap to a high address, which means that we're in 64-bit user code. 
*/ - WARN_ON_ONCE(address != regs->ip); - if (vsyscall_mode == NONE) { warn_bad_vsyscall(KERN_INFO, regs, "vsyscall attempted with vsyscall=none"); diff --git a/arch/x86/include/asm/floppy.h b/arch/x86/include/asm/floppy.h index 6ec3fc969ad5..e7a244051c62 100644 --- a/arch/x86/include/asm/floppy.h +++ b/arch/x86/include/asm/floppy.h @@ -10,6 +10,7 @@ #ifndef _ASM_X86_FLOPPY_H #define _ASM_X86_FLOPPY_H +#include <linux/sizes.h> #include <linux/vmalloc.h> /* @@ -22,10 +23,7 @@ */ #define _CROSS_64KB(a, s, vdma) \ (!(vdma) && \ - ((unsigned long)(a)/K_64 != ((unsigned long)(a) + (s) - 1) / K_64)) - -#define CROSS_64KB(a, s) _CROSS_64KB(a, s, use_virtual_dma & 1) - + ((unsigned long)(a) / SZ_64K != ((unsigned long)(a) + (s) - 1) / SZ_64K)) #define SW fd_routine[use_virtual_dma & 1] #define CSW fd_routine[can_use_virtual_dma & 1] @@ -206,7 +204,7 @@ static int vdma_dma_setup(char *addr, unsigned long size, int mode, int io) static int hard_dma_setup(char *addr, unsigned long size, int mode, int io) { #ifdef FLOPPY_SANITY_CHECK - if (CROSS_64KB(addr, size)) { + if (_CROSS_64KB(addr, size, use_virtual_dma & 1)) { printk("DMA crossing 64-K boundary %p-%p\n", addr, addr+size); return -1; } diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index f2ad77929d6e..5cfb27f26583 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -13,6 +13,15 @@ # define KEXEC_DEBUG_EXC_HANDLER_SIZE 6 /* PUSHI, PUSHI, 2-byte JMP */ #endif +#ifdef CONFIG_X86_64 + +#include <linux/bits.h> + +#define RELOC_KERNEL_PRESERVE_CONTEXT BIT(0) +#define RELOC_KERNEL_CACHE_INCOHERENT BIT(1) + +#endif + # define KEXEC_CONTROL_PAGE_SIZE 4096 # define KEXEC_CONTROL_CODE_MAX_SIZE 2048 @@ -121,8 +130,7 @@ typedef unsigned long relocate_kernel_fn(unsigned long indirection_page, unsigned long pa_control_page, unsigned long start_address, - unsigned int preserve_context, - unsigned int host_mem_enc_active); + unsigned int flags); #endif extern relocate_kernel_fn relocate_kernel; #define ARCH_HAS_KIMAGE_ARCH diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 18a5c3119e1a..62c3e4de3303 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -145,7 +145,7 @@ KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons); KVM_X86_OP_OPTIONAL(get_untagged_addr) KVM_X86_OP_OPTIONAL(alloc_apic_backing_page) KVM_X86_OP_OPTIONAL_RET0(gmem_prepare) -KVM_X86_OP_OPTIONAL_RET0(private_max_mapping_level) +KVM_X86_OP_OPTIONAL_RET0(gmem_max_mapping_level) KVM_X86_OP_OPTIONAL(gmem_invalidate) #undef KVM_X86_OP diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f19a76d3ca0e..c56cc54d682a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1922,7 +1922,7 @@ struct kvm_x86_ops { void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu); int (*gmem_prepare)(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); void (*gmem_invalidate)(kvm_pfn_t start, kvm_pfn_t end); - int (*private_max_mapping_level)(struct kvm *kvm, kvm_pfn_t pfn); + int (*gmem_max_mapping_level)(struct kvm *kvm, kvm_pfn_t pfn, bool is_private); }; struct kvm_x86_nested_ops { @@ -2276,10 +2276,8 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, int tdp_max_root_level, int tdp_huge_page_level); -#ifdef CONFIG_KVM_PRIVATE_MEM +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES #define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem) -#else -#define kvm_arch_has_private_mem(kvm) false 
#endif #define kvm_arch_has_readonly_mem(kvm) (!(kvm)->arch.has_protected_state) diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 57bc74e112f2..4a47c16e2df8 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -124,7 +124,6 @@ bool kvm_para_available(void); unsigned int kvm_arch_para_features(void); unsigned int kvm_arch_para_hints(void); void kvm_async_pf_task_wait_schedule(u32 token); -void kvm_async_pf_task_wake(u32 token); u32 kvm_read_and_reset_apf_flags(void); bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token); @@ -148,7 +147,6 @@ static inline void kvm_spinlock_init(void) #else /* CONFIG_KVM_GUEST */ #define kvm_async_pf_task_wait_schedule(T) do {} while(0) -#define kvm_async_pf_task_wake(T) do {} while(0) static inline bool kvm_para_available(void) { diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index bde58f6510ac..a24c7805acdb 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -731,6 +731,8 @@ void __noreturn stop_this_cpu(void *dummy); void microcode_check(struct cpuinfo_x86 *prev_info); void store_cpu_caps(struct cpuinfo_x86 *info); +DECLARE_PER_CPU(bool, cache_state_incoherent); + enum l1tf_mitigations { L1TF_MITIGATION_OFF, L1TF_MITIGATION_AUTO, diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 7ddef3a69866..6b338d7f01b7 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -102,10 +102,31 @@ u64 __seamcall_ret(u64 fn, struct tdx_module_args *args); u64 __seamcall_saved_ret(u64 fn, struct tdx_module_args *args); void tdx_init(void); +#include <linux/preempt.h> #include <asm/archrandom.h> +#include <asm/processor.h> typedef u64 (*sc_func_t)(u64 fn, struct tdx_module_args *args); +static __always_inline u64 __seamcall_dirty_cache(sc_func_t func, u64 fn, + struct tdx_module_args *args) +{ + lockdep_assert_preemption_disabled(); + + /* + * SEAMCALLs are made to the TDX module and can generate dirty + * cachelines of TDX private memory. Mark cache state incoherent + * so that the cache can be flushed during kexec. + * + * This needs to be done before actually making the SEAMCALL, + * because kexec-ing CPU could send NMI to stop remote CPUs, + * in which case even disabling IRQ won't help here. 
+ */ + this_cpu_write(cache_state_incoherent, true); + + return func(fn, args); +} + static __always_inline u64 sc_retry(sc_func_t func, u64 fn, struct tdx_module_args *args) { @@ -113,7 +134,9 @@ static __always_inline u64 sc_retry(sc_func_t func, u64 fn, u64 ret; do { - ret = func(fn, args); + preempt_disable(); + ret = __seamcall_dirty_cache(func, fn, args); + preempt_enable(); } while (ret == TDX_RND_NO_ENTROPY && --retry); return ret; @@ -131,6 +154,8 @@ int tdx_guest_keyid_alloc(void); u32 tdx_get_nr_guest_keyids(void); void tdx_guest_keyid_free(unsigned int keyid); +void tdx_quirk_reset_page(struct page *page); + struct tdx_td { /* TD root structure: */ struct page *tdr_page; @@ -146,6 +171,8 @@ struct tdx_td { struct tdx_vp { /* TDVP root page */ struct page *tdvpr_page; + /* precalculated page_to_phys(tdvpr_page) for use in noinstr code */ + phys_addr_t tdvpr_pa; /* TD vCPU control structure: */ struct page **tdcx_pages; @@ -203,5 +230,11 @@ static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; } static inline const struct tdx_sys_info *tdx_get_sysinfo(void) { return NULL; } #endif /* CONFIG_INTEL_TDX_HOST */ +#ifdef CONFIG_KEXEC_CORE +void tdx_cpu_flush_cache_for_kexec(void); +#else +static inline void tdx_cpu_flush_cache_for_kexec(void) { } +#endif + #endif /* !__ASSEMBLER__ */ #endif /* _ASM_X86_TDX_H */ diff --git a/arch/x86/include/asm/video.h b/arch/x86/include/asm/video.h index 0950c9535fae..08ec328203ef 100644 --- a/arch/x86/include/asm/video.h +++ b/arch/x86/include/asm/video.h @@ -13,8 +13,10 @@ pgprot_t pgprot_framebuffer(pgprot_t prot, unsigned long offset); #define pgprot_framebuffer pgprot_framebuffer +#ifdef CONFIG_VIDEO bool video_is_primary_device(struct device *dev); #define video_is_primary_device video_is_primary_device +#endif #include <asm-generic/video.h> diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index a6f88ca1a6b4..5398db4dedb4 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -546,6 +546,23 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) u64 msr; /* + * Mark using WBINVD is needed during kexec on processors that + * support SME. This provides support for performing a successful + * kexec when going from SME inactive to SME active (or vice-versa). + * + * The cache must be cleared so that if there are entries with the + * same physical address, both with and without the encryption bit, + * they don't race each other when flushed and potentially end up + * with the wrong entry being committed to memory. + * + * Test the CPUID bit directly because with mem_encrypt=off the + * BSP will clear the X86_FEATURE_SME bit and the APs will not + * see it set after that. + */ + if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0))) + __this_cpu_write(cache_state_incoherent, true); + + /* * BIOS support is required for SME and SEV. * For SME: If BIOS has enabled SME then adjust x86_phys_bits by * the SME physical address space reduction value. diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index c6b12bed173d..335fd2ee9766 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -165,14 +165,23 @@ static struct crash_mem *fill_up_crash_elf_data(void) /* * Exclusion of crash region, crashk_low_res and/or crashk_cma_ranges * may cause range splits. So add extra slots here. 
+ * + * Exclusion of the low 1M may not cause another range split, because the + * excluded range is [0, 1M] and the condition for splitting a new + * region is that the start, end parameters are both in a certain + * existing region in cmem and cannot be equal to existing region's + * start or end. Obviously, the start of [0, 1M] cannot meet this + * condition. + * + * But in case the low 1M range is changed in the future + * (e.g. to [start, 1M]), add an extra slot. */ - nr_ranges += 2 + crashk_cma_cnt; + nr_ranges += 3 + crashk_cma_cnt; cmem = vzalloc(struct_size(cmem, ranges, nr_ranges)); if (!cmem) return NULL; cmem->max_nr_ranges = nr_ranges; - cmem->nr_ranges = 0; return cmem; } @@ -323,16 +332,20 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) struct crash_mem *cmem; /* - * Using random kexec_buf for passing dm crypt keys may cause a range - * split. So use two slots here. + * In the current x86 architecture code, the elfheader is always + * allocated at crashk_res.start. But it depends on the allocation + * position of the elfheader in crashk_res. To avoid potentially going + * out of bounds in the future, add an extra slot. + * + * And using a random kexec_buf for passing dm crypt keys may cause a + * range split too, so add another extra slot here. */ - nr_ranges = 2; + nr_ranges = 3; cmem = vzalloc(struct_size(cmem, ranges, nr_ranges)); if (!cmem) return -ENOMEM; cmem->max_nr_ranges = nr_ranges; - cmem->nr_ranges = 0; memset(&cmd, 0, sizeof(struct crash_memmap_data)); cmd.params = params; diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 24a41f0e0cf1..c3244ac680d1 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -16,6 +16,8 @@ #include <linux/kexec.h> #include <linux/kernel.h> #include <linux/mm.h> +#include <linux/libfdt.h> +#include <linux/of_fdt.h> #include <linux/efi.h> #include <linux/random.h> @@ -212,6 +214,28 @@ setup_efi_state(struct boot_params *params, unsigned long params_load_addr, } #endif /* CONFIG_EFI */ +#ifdef CONFIG_OF_FLATTREE +static void setup_dtb(struct boot_params *params, + unsigned long params_load_addr, + unsigned int dtb_setup_data_offset) +{ + struct setup_data *sd = (void *)params + dtb_setup_data_offset; + unsigned long setup_data_phys, dtb_len; + + dtb_len = fdt_totalsize(initial_boot_params); + sd->type = SETUP_DTB; + sd->len = dtb_len; + + /* Carry over current boot DTB with setup_data */ + memcpy(sd->data, initial_boot_params, dtb_len); + + /* Add setup data */ + setup_data_phys = params_load_addr + dtb_setup_data_offset; + sd->next = params->hdr.setup_data; + params->hdr.setup_data = setup_data_phys; +} +#endif /* CONFIG_OF_FLATTREE */ + static void setup_ima_state(const struct kimage *image, struct boot_params *params, unsigned long params_load_addr, @@ -336,6 +360,17 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params, sizeof(struct efi_setup_data); #endif +#ifdef CONFIG_OF_FLATTREE + if (image->force_dtb && initial_boot_params) { + setup_dtb(params, params_load_addr, setup_data_offset); + setup_data_offset += sizeof(struct setup_data) + + fdt_totalsize(initial_boot_params); + } else { + pr_debug("Not carrying over DTB, force_dtb = %d\n", + image->force_dtb); + } +#endif + if (IS_ENABLED(CONFIG_IMA_KEXEC)) { /* Setup IMA log buffer state */ setup_ima_state(image, params, params_load_addr, @@ -529,6 +564,12 @@ static void *bzImage64_load(struct kimage *image, char *kernel, sizeof(struct setup_data) + RNG_SEED_LENGTH;
+#ifdef CONFIG_OF_FLATTREE + if (image->force_dtb && initial_boot_params) + kbuf.bufsz += sizeof(struct setup_data) + + fdt_totalsize(initial_boot_params); +#endif + if (IS_ENABLED(CONFIG_IMA_KEXEC)) kbuf.bufsz += sizeof(struct setup_data) + sizeof(struct ima_setup_data); @@ -537,7 +578,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel, kbuf.bufsz += sizeof(struct setup_data) + sizeof(struct kho_data); - params = kzalloc(kbuf.bufsz, GFP_KERNEL); + params = kvzalloc(kbuf.bufsz, GFP_KERNEL); if (!params) return ERR_PTR(-ENOMEM); efi_map_offset = params_cmdline_sz; @@ -647,7 +688,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel, return ldata; out_free_params: - kfree(params); + kvfree(params); return ERR_PTR(ret); } @@ -659,7 +700,7 @@ static int bzImage64_cleanup(void *loader_data) if (!ldata) return 0; - kfree(ldata->bootparams_buf); + kvfree(ldata->bootparams_buf); ldata->bootparams_buf = NULL; return 0; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 8ae750cde0c6..b67d7c59dca0 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -190,7 +190,7 @@ static void apf_task_wake_all(void) } } -void kvm_async_pf_task_wake(u32 token) +static void kvm_async_pf_task_wake(u32 token) { u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; @@ -241,7 +241,6 @@ again: /* A dummy token might be allocated and ultimately not used. */ kfree(dummy); } -EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake); noinstr u32 kvm_read_and_reset_apf_flags(void) { @@ -933,6 +932,19 @@ static void kvm_sev_hc_page_enc_status(unsigned long pfn, int npages, bool enc) static void __init kvm_init_platform(void) { + u64 tolud = PFN_PHYS(e820__end_of_low_ram_pfn()); + /* + * Note, hardware requires variable MTRR ranges to be power-of-2 sized + * and naturally aligned. But when forcing guest MTRR state, Linux + * doesn't program the forced ranges into hardware. Don't bother doing + * the math to generate a technically-legal range. + */ + struct mtrr_var_range pci_hole = { + .base_lo = tolud | X86_MEMTYPE_UC, + .mask_lo = (u32)(~(SZ_4G - tolud - 1)) | MTRR_PHYSMASK_V, + .mask_hi = (BIT_ULL(boot_cpu_data.x86_phys_bits) - 1) >> 32, + }; + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) && kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) { unsigned long nr_pages; @@ -982,8 +994,12 @@ static void __init kvm_init_platform(void) kvmclock_init(); x86_platform.apic_post_init = kvm_apic_init; - /* Set WB as the default cache mode for SEV-SNP and TDX */ - guest_force_mtrr_state(NULL, 0, MTRR_TYPE_WRBACK); + /* + * Set WB as the default cache mode for SEV-SNP and TDX, with a single + * UC range for the legacy PCI hole, e.g. so that devices that expect + * to get UC/WC mappings don't get surprised with WB. + */ + guest_force_mtrr_state(&pci_hole, 1, MTRR_TYPE_WRBACK); } #if defined(CONFIG_AMD_MEM_ENCRYPT) @@ -1073,16 +1089,6 @@ static void kvm_wait(u8 *ptr, u8 val) void __init kvm_spinlock_init(void) { /* - * In case host doesn't support KVM_FEATURE_PV_UNHALT there is still an - * advantage of keeping virt_spin_lock_key enabled: virt_spin_lock() is - * preferred over native qspinlock when vCPU is preempted. - */ - if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) { - pr_info("PV spinlocks disabled, no host support\n"); - return; - } - - /* * Disable PV spinlocks and use native qspinlock when dedicated pCPUs * are available. 
*/ @@ -1101,6 +1107,16 @@ void __init kvm_spinlock_init(void) goto out; } + /* + * In case host doesn't support KVM_FEATURE_PV_UNHALT there is still an + * advantage of keeping virt_spin_lock_key enabled: virt_spin_lock() is + * preferred over native qspinlock when vCPU is preempted. + */ + if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) { + pr_info("PV spinlocks disabled, no host support\n"); + return; + } + pr_info("PV spinlocks enabled\n"); __pv_init_lock_hash(); diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 697fb99406e6..15088d14904f 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -29,6 +29,7 @@ #include <asm/set_memory.h> #include <asm/cpu.h> #include <asm/efi.h> +#include <asm/processor.h> #ifdef CONFIG_ACPI /* @@ -346,6 +347,22 @@ int machine_kexec_prepare(struct kimage *image) unsigned long reloc_end = (unsigned long)__relocate_kernel_end; int result; + /* + * Some early TDX-capable platforms have an erratum. A kernel + * partial write (a write transaction of less than cacheline + * lands at memory controller) to TDX private memory poisons that + * memory, and a subsequent read triggers a machine check. + * + * On those platforms the old kernel must reset TDX private + * memory before jumping to the new kernel otherwise the new + * kernel may see unexpected machine check. For simplicity + * just fail kexec/kdump on those platforms. + */ + if (boot_cpu_has_bug(X86_BUG_TDX_PW_MCE)) { + pr_info_once("Not allowed on platform with tdx_pw_mce bug\n"); + return -EOPNOTSUPP; + } + /* Setup the identity mapped 64bit page table */ result = init_pgtable(image, __pa(control_page)); if (result) @@ -384,16 +401,10 @@ void __nocfi machine_kexec(struct kimage *image) { unsigned long reloc_start = (unsigned long)__relocate_kernel_start; relocate_kernel_fn *relocate_kernel_ptr; - unsigned int host_mem_enc_active; + unsigned int relocate_kernel_flags; int save_ftrace_enabled; void *control_page; - /* - * This must be done before load_segments() since if call depth tracking - * is used then GS must be valid to make any function calls. - */ - host_mem_enc_active = cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT); - #ifdef CONFIG_KEXEC_JUMP if (image->preserve_context) save_processor_state(); @@ -427,6 +438,17 @@ void __nocfi machine_kexec(struct kimage *image) */ relocate_kernel_ptr = control_page + (unsigned long)relocate_kernel - reloc_start; + relocate_kernel_flags = 0; + if (image->preserve_context) + relocate_kernel_flags |= RELOC_KERNEL_PRESERVE_CONTEXT; + + /* + * This must be done before load_segments() since it resets + * GS to 0 and percpu data needs the correct GS to work. + */ + if (this_cpu_read(cache_state_incoherent)) + relocate_kernel_flags |= RELOC_KERNEL_CACHE_INCOHERENT; + /* * The segment registers are funny things, they have both a * visible and an invisible part. Whenever the visible part is @@ -436,6 +458,11 @@ void __nocfi machine_kexec(struct kimage *image) * * Take advantage of this here by force loading the segments, * before the GDT is zapped with an invalid value. + * + * load_segments() resets GS to 0. Don't make any function call + * after here since call depth tracking uses percpu variables to + * operate (relocate_kernel() is explicitly ignored by call depth + * tracking). 
*/ load_segments(); @@ -443,8 +470,7 @@ void __nocfi machine_kexec(struct kimage *image) image->start = relocate_kernel_ptr((unsigned long)image->head, virt_to_phys(control_page), image->start, - image->preserve_context, - host_mem_enc_active); + relocate_kernel_flags); #ifdef CONFIG_KEXEC_JUMP if (image->preserve_context) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e3a3987b0c4f..4c718f8adc59 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -89,6 +89,16 @@ DEFINE_PER_CPU(bool, __tss_limit_invalid); EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid); /* + * The cache may be in an incoherent state and needs flushing during kexec. + * E.g., on SME/TDX platforms, dirty cacheline aliases with and without + * encryption bit(s) can coexist and the cache needs to be flushed before + * booting to the new kernel to avoid the silent memory corruption due to + * dirty cachelines with different encryption property being written back + * to the memory. + */ +DEFINE_PER_CPU(bool, cache_state_incoherent); + +/* * this gets called so that we can store lazy state into memory and copy the * current task into the new thread. */ @@ -827,19 +837,7 @@ void __noreturn stop_this_cpu(void *dummy) disable_local_APIC(); mcheck_cpu_clear(c); - /* - * Use wbinvd on processors that support SME. This provides support - * for performing a successful kexec when going from SME inactive - * to SME active (or vice-versa). The cache must be cleared so that - * if there are entries with the same physical address, both with and - * without the encryption bit, they don't race each other when flushed - * and potentially end up with the wrong entry being committed to - * memory. - * - * Test the CPUID bit directly because the machine might've cleared - * X86_FEATURE_SME due to cmdline options. - */ - if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0))) + if (this_cpu_read(cache_state_incoherent)) wbinvd(); /* diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index ea604f4d0b52..11e20bb13aca 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -66,8 +66,7 @@ SYM_CODE_START_NOALIGN(relocate_kernel) * %rdi indirection_page * %rsi pa_control_page * %rdx start address - * %rcx preserve_context - * %r8 host_mem_enc_active + * %rcx flags: RELOC_KERNEL_* */ /* Save the CPU context, used for jumping back */ @@ -111,7 +110,7 @@ SYM_CODE_START_NOALIGN(relocate_kernel) /* save indirection list for jumping back */ movq %rdi, pa_backup_pages_map(%rip) - /* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */ + /* Save the flags to %r11 as swap_pages clobbers %rcx. */ movq %rcx, %r11 /* setup a new stack at the end of the physical control page */ @@ -129,9 +128,8 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) /* * %rdi indirection page * %rdx start address - * %r8 host_mem_enc_active * %r9 page table page - * %r11 preserve_context + * %r11 flags: RELOC_KERNEL_* * %r13 original CR4 when relocate_kernel() was invoked */ @@ -200,14 +198,21 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) movq %r9, %cr3 /* + * If the memory cache is in incoherent state, e.g., due to + * memory encryption, do WBINVD to flush cache. + * * If SME is active, there could be old encrypted cache line * entries that will conflict with the now unencrypted memory * used by kexec. Flush the caches before copying the kernel. 
+ * + * Note the SME setup code sets this flag when the platform supports + * SME, so the WBINVD is performed even if SME is not activated + * by the kernel. This is harmless. */ - testq %r8, %r8 - jz .Lsme_off + testb $RELOC_KERNEL_CACHE_INCOHERENT, %r11b + jz .Lnowbinvd wbinvd -.Lsme_off: +.Lnowbinvd: call swap_pages @@ -220,7 +225,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) movq %cr3, %rax movq %rax, %cr3 - testq %r11, %r11 /* preserve_context */ + testb $RELOC_KERNEL_PRESERVE_CONTEXT, %r11b jnz .Lrelocate /* @@ -273,7 +278,13 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) ANNOTATE_NOENDBR andq $PAGE_MASK, %r8 lea PAGE_SIZE(%r8), %rsp - movl $1, %r11d /* Ensure preserve_context flag is set */ + /* + * Ensure the RELOC_KERNEL_PRESERVE_CONTEXT flag is set so that + * swap_pages() can swap pages correctly. Note all other + * RELOC_KERNEL_* flags passed to relocate_kernel() are not + * restored. + */ + movl $RELOC_KERNEL_PRESERVE_CONTEXT, %r11d call swap_pages movq kexec_va_control_page(%rip), %rax 0: addq $virtual_mapped - 0b, %rax @@ -321,7 +332,7 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) UNWIND_HINT_END_OF_STACK /* * %rdi indirection page - * %r11 preserve_context + * %r11 flags: RELOC_KERNEL_* */ movq %rdi, %rcx /* Put the indirection_page in %rcx */ xorl %edi, %edi @@ -357,7 +368,8 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) movq %rdi, %rdx /* Save destination page to %rdx */ movq %rsi, %rax /* Save source page to %rax */ - testq %r11, %r11 /* Only actually swap for ::preserve_context */ + /* Only actually swap for ::preserve_context */ + testb $RELOC_KERNEL_PRESERVE_CONTEXT, %r11b jz .Lnoswap /* copy source page to swap page */ diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 2c86673155c9..4e43923656d0 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -46,8 +46,8 @@ config KVM_X86 select HAVE_KVM_PM_NOTIFIER if PM select KVM_GENERIC_HARDWARE_ENABLING select KVM_GENERIC_PRE_FAULT_MEMORY - select KVM_GENERIC_PRIVATE_MEM if KVM_SW_PROTECTED_VM select KVM_WERROR if WERROR + select KVM_GUEST_MEMFD if X86_64 config KVM tristate "Kernel-based Virtual Machine (KVM) support" @@ -74,7 +74,7 @@ config KVM_WERROR # FRAME_WARN, i.e. KVM_WERROR=y with KASAN=y requires special tuning. # Building KVM with -Werror and KASAN is still doable via enabling # the kernel-wide WERROR=y. - depends on KVM && ((EXPERT && !KASAN) || WERROR) + depends on KVM_X86 && ((EXPERT && !KASAN) || WERROR) help Add -Werror to the build flags for KVM. @@ -83,7 +83,8 @@ config KVM_SW_PROTECTED_VM bool "Enable support for KVM software-protected VMs" depends on EXPERT - depends on KVM && X86_64 + depends on KVM_X86 && X86_64 + select KVM_GENERIC_MEMORY_ATTRIBUTES help Enable support for KVM software-protected VMs. Currently, software- protected VMs are purely a development and testing vehicle for @@ -95,8 +96,6 @@ config KVM_INTEL tristate "KVM for Intel (and compatible) processors support" depends on KVM && IA32_FEAT_CTL - select KVM_GENERIC_PRIVATE_MEM if INTEL_TDX_HOST - select KVM_GENERIC_MEMORY_ATTRIBUTES if INTEL_TDX_HOST help Provides support for KVM on processors equipped with Intel's VT extensions, a.k.a. Virtual Machine Extensions (VMX). @@ -135,6 +134,8 @@ config KVM_INTEL_TDX bool "Intel Trust Domain Extensions (TDX) support" default y depends on INTEL_TDX_HOST + select KVM_GENERIC_MEMORY_ATTRIBUTES + select HAVE_KVM_ARCH_GMEM_POPULATE help Provides support for launching Intel Trust Domain Extensions (TDX) confidential VMs on Intel processors.
@@ -157,9 +158,10 @@ config KVM_AMD_SEV depends on KVM_AMD && X86_64 depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m) select ARCH_HAS_CC_PLATFORM - select KVM_GENERIC_PRIVATE_MEM + select KVM_GENERIC_MEMORY_ATTRIBUTES select HAVE_KVM_ARCH_GMEM_PREPARE select HAVE_KVM_ARCH_GMEM_INVALIDATE + select HAVE_KVM_ARCH_GMEM_POPULATE help Provides support for launching encrypted VMs which use Secure Encrypted Virtualization (SEV), Secure Encrypted Virtualization with @@ -169,7 +171,7 @@ config KVM_AMD_SEV config KVM_IOAPIC bool "I/O APIC, PIC, and PIT emulation" default y - depends on KVM + depends on KVM_X86 help Provides support for KVM to emulate an I/O APIC, PIC, and PIT, i.e. for full in-kernel APIC emulation. @@ -179,7 +181,7 @@ config KVM_IOAPIC config KVM_SMM bool "System Management Mode emulation" default y - depends on KVM + depends on KVM_X86 help Provides support for KVM to emulate System Management Mode (SMM) in virtual machines. This can be used by the virtual machine @@ -189,7 +191,7 @@ config KVM_SMM config KVM_HYPERV bool "Support for Microsoft Hyper-V emulation" - depends on KVM + depends on KVM_X86 default y help Provides KVM support for emulating Microsoft Hyper-V. This allows KVM @@ -203,7 +205,7 @@ config KVM_HYPERV config KVM_XEN bool "Support for Xen hypercall interface" - depends on KVM + depends on KVM_X86 help Provides KVM support for the hosting Xen HVM guests and passing Xen hypercalls to userspace. @@ -213,7 +215,7 @@ config KVM_XEN config KVM_PROVE_MMU bool "Prove KVM MMU correctness" depends on DEBUG_KERNEL - depends on KVM + depends on KVM_X86 depends on EXPERT help Enables runtime assertions in KVM's MMU that are too costly to enable @@ -228,7 +230,7 @@ config KVM_EXTERNAL_WRITE_TRACKING config KVM_MAX_NR_VCPUS int "Maximum number of vCPUs per KVM guest" - depends on KVM + depends on KVM_X86 range 1024 4096 default 4096 if MAXSMP default 1024 diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 6e838cb6c9e1..56c80588efa0 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3285,12 +3285,72 @@ out: return level; } -static int __kvm_mmu_max_mapping_level(struct kvm *kvm, - const struct kvm_memory_slot *slot, - gfn_t gfn, int max_level, bool is_private) +static u8 kvm_max_level_for_order(int order) +{ + BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G); + + KVM_MMU_WARN_ON(order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G) && + order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) && + order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K)); + + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G)) + return PG_LEVEL_1G; + + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M)) + return PG_LEVEL_2M; + + return PG_LEVEL_4K; +} + +static u8 kvm_gmem_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault, + const struct kvm_memory_slot *slot, gfn_t gfn, + bool is_private) +{ + u8 max_level, coco_level; + kvm_pfn_t pfn; + + /* For faults, use the gmem information that was resolved earlier. */ + if (fault) { + pfn = fault->pfn; + max_level = fault->max_level; + } else { + /* TODO: Call into guest_memfd once hugepages are supported. */ + WARN_ONCE(1, "Get pfn+order from guest_memfd"); + pfn = KVM_PFN_ERR_FAULT; + max_level = PG_LEVEL_4K; + } + + if (max_level == PG_LEVEL_4K) + return max_level; + + /* + * CoCo may influence the max mapping level, e.g. due to RMP or S-EPT + * restrictions. A return of '0' means "no additional restrictions", to + * allow for using an optional "ret0" static call. 
+ */ + coco_level = kvm_x86_call(gmem_max_mapping_level)(kvm, pfn, is_private); + if (coco_level) + max_level = min(max_level, coco_level); + + return max_level; +} + +int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault, + const struct kvm_memory_slot *slot, gfn_t gfn) { struct kvm_lpage_info *linfo; - int host_level; + int host_level, max_level; + bool is_private; + + lockdep_assert_held(&kvm->mmu_lock); + + if (fault) { + max_level = fault->max_level; + is_private = fault->is_private; + } else { + max_level = PG_LEVEL_NUM; + is_private = kvm_mem_is_private(kvm, gfn); + } max_level = min(max_level, max_huge_page_level); for ( ; max_level > PG_LEVEL_4K; max_level--) { @@ -3299,25 +3359,17 @@ static int __kvm_mmu_max_mapping_level(struct kvm *kvm, break; } - if (is_private) - return max_level; - if (max_level == PG_LEVEL_4K) return PG_LEVEL_4K; - host_level = host_pfn_mapping_level(kvm, gfn, slot); + if (is_private || kvm_memslot_is_gmem_only(slot)) + host_level = kvm_gmem_max_mapping_level(kvm, fault, slot, gfn, + is_private); + else + host_level = host_pfn_mapping_level(kvm, gfn, slot); return min(host_level, max_level); } -int kvm_mmu_max_mapping_level(struct kvm *kvm, - const struct kvm_memory_slot *slot, gfn_t gfn) -{ - bool is_private = kvm_slot_can_be_private(slot) && - kvm_mem_is_private(kvm, gfn); - - return __kvm_mmu_max_mapping_level(kvm, slot, gfn, PG_LEVEL_NUM, is_private); -} - void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_memory_slot *slot = fault->slot; @@ -3338,9 +3390,8 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault * Enforce the iTLB multihit workaround after capturing the requested * level, which will be used to do precise, accurate accounting. 
*/ - fault->req_level = __kvm_mmu_max_mapping_level(vcpu->kvm, slot, - fault->gfn, fault->max_level, - fault->is_private); + fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, fault, + fault->slot, fault->gfn); if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed) return; @@ -4503,42 +4554,6 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) vcpu->stat.pf_fixed++; } -static inline u8 kvm_max_level_for_order(int order) -{ - BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G); - - KVM_MMU_WARN_ON(order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G) && - order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) && - order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K)); - - if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G)) - return PG_LEVEL_1G; - - if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M)) - return PG_LEVEL_2M; - - return PG_LEVEL_4K; -} - -static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, - u8 max_level, int gmem_order) -{ - u8 req_max_level; - - if (max_level == PG_LEVEL_4K) - return PG_LEVEL_4K; - - max_level = min(kvm_max_level_for_order(gmem_order), max_level); - if (max_level == PG_LEVEL_4K) - return PG_LEVEL_4K; - - req_max_level = kvm_x86_call(private_max_mapping_level)(kvm, pfn); - if (req_max_level) - max_level = min(max_level, req_max_level); - - return max_level; -} - static void kvm_mmu_finish_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, int r) { @@ -4546,12 +4561,12 @@ static void kvm_mmu_finish_page_fault(struct kvm_vcpu *vcpu, r == RET_PF_RETRY, fault->map_writable); } -static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu, - struct kvm_page_fault *fault) +static int kvm_mmu_faultin_pfn_gmem(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) { int max_order, r; - if (!kvm_slot_can_be_private(fault->slot)) { + if (!kvm_slot_has_gmem(fault->slot)) { kvm_mmu_prepare_memory_fault_exit(vcpu, fault); return -EFAULT; } @@ -4564,8 +4579,7 @@ static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu, } fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY); - fault->max_level = kvm_max_private_mapping_level(vcpu->kvm, fault->pfn, - fault->max_level, max_order); + fault->max_level = kvm_max_level_for_order(max_order); return RET_PF_CONTINUE; } @@ -4575,8 +4589,8 @@ static int __kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu, { unsigned int foll = fault->write ? FOLL_WRITE : 0; - if (fault->is_private) - return kvm_mmu_faultin_pfn_private(vcpu, fault); + if (fault->is_private || kvm_memslot_is_gmem_only(fault->slot)) + return kvm_mmu_faultin_pfn_gmem(vcpu, fault); foll |= FOLL_NOWAIT; fault->pfn = __kvm_faultin_pfn(fault->slot, fault->gfn, foll, @@ -7165,7 +7179,7 @@ restart: * mapping if the indirect sp has level = 1. 
*/ if (sp->role.direct && - sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn)) { + sp->role.level < kvm_mmu_max_mapping_level(kvm, NULL, slot, sp->gfn)) { kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); if (kvm_available_flush_remote_tlbs_range()) diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 65f3c89d7c5d..b776be783a2f 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -411,7 +411,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, return r; } -int kvm_mmu_max_mapping_level(struct kvm *kvm, +int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault, const struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 7f3d7229b2c1..740cb06accdb 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1813,7 +1813,7 @@ retry: if (iter.gfn < start || iter.gfn >= end) continue; - max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, iter.gfn); + max_mapping_level = kvm_mmu_max_mapping_level(kvm, NULL, slot, iter.gfn); if (max_mapping_level < iter.level) continue; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 0635bd71c10e..5bac4d20aec0 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2361,7 +2361,7 @@ static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp) mutex_lock(&kvm->slots_lock); memslot = gfn_to_memslot(kvm, params.gfn_start); - if (!kvm_slot_can_be_private(memslot)) { + if (!kvm_slot_has_gmem(memslot)) { ret = -EINVAL; goto out; } @@ -4715,7 +4715,7 @@ void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) } slot = gfn_to_memslot(kvm, gfn); - if (!kvm_slot_can_be_private(slot)) { + if (!kvm_slot_has_gmem(slot)) { pr_warn_ratelimited("SEV: Unexpected RMP fault, non-private slot for GPA 0x%llx\n", gpa); return; @@ -4943,7 +4943,7 @@ next_pfn: } } -int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private) { int level, rc; bool assigned; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 1bfebe40854f..3a9fe0a8b78c 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5179,7 +5179,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .gmem_prepare = sev_gmem_prepare, .gmem_invalidate = sev_gmem_invalidate, - .private_max_mapping_level = sev_private_max_mapping_level, + .gmem_max_mapping_level = sev_gmem_max_mapping_level, }; /* diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 58b9d168e0c8..70df7c6413cf 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -866,7 +866,7 @@ void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code); void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu); int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order); void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end); -int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn); +int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private); struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu); void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa); #else @@ -895,7 +895,7 @@ static inline int 
sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, in return 0; } static inline void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) {} -static inline int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +static inline int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private) { return 0; } diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index dbab1c15b0cd..bb5f182f6788 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -831,10 +831,11 @@ static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp) return tdx_vcpu_ioctl(vcpu, argp); } -static int vt_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +static int vt_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, + bool is_private) { if (is_td(kvm)) - return tdx_gmem_private_max_mapping_level(kvm, pfn); + return tdx_gmem_max_mapping_level(kvm, pfn, is_private); return 0; } @@ -1005,7 +1006,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .mem_enc_ioctl = vt_op_tdx_only(mem_enc_ioctl), .vcpu_mem_enc_ioctl = vt_op_tdx_only(vcpu_mem_enc_ioctl), - .private_max_mapping_level = vt_op_tdx_only(gmem_private_max_mapping_level) + .gmem_max_mapping_level = vt_op_tdx_only(gmem_max_mapping_level) }; struct kvm_x86_init_ops vt_init_ops __initdata = { diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 66744f5768c8..00f8bfd2330d 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -281,25 +281,6 @@ static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu) vcpu->cpu = -1; } -static void tdx_clear_page(struct page *page) -{ - const void *zero_page = (const void *) page_to_virt(ZERO_PAGE(0)); - void *dest = page_to_virt(page); - unsigned long i; - - /* - * The page could have been poisoned. MOVDIR64B also clears - * the poison bit so the kernel can safely use the page again. - */ - for (i = 0; i < PAGE_SIZE; i += 64) - movdir64b(dest + i, zero_page); - /* - * MOVDIR64B store uses WC buffer. Prevent following memory reads - * from seeing potentially poisoned cache. - */ - __mb(); -} - static void tdx_no_vcpus_enter_start(struct kvm *kvm) { struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); @@ -345,7 +326,7 @@ static int tdx_reclaim_page(struct page *page) r = __tdx_reclaim_page(page); if (!r) - tdx_clear_page(page); + tdx_quirk_reset_page(page); return r; } @@ -442,6 +423,16 @@ void tdx_disable_virtualization_cpu(void) tdx_flush_vp(&arg); } local_irq_restore(flags); + + /* + * Flush cache now if kexec is possible: this is necessary to avoid + * having dirty private memory cachelines when the new kernel boots, + * but WBINVD is a relatively expensive operation and doing it during + * kexec can exacerbate races in native_stop_other_cpus(). Do it + * now, since this is a safe moment and there is going to be no more + * TDX activity on this CPU from this point on. 
+ */ + tdx_cpu_flush_cache_for_kexec(); } #define TDX_SEAMCALL_RETRIES 10000 @@ -593,7 +584,7 @@ static void tdx_reclaim_td_control_pages(struct kvm *kvm) pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err); return; } - tdx_clear_page(kvm_tdx->td.tdr_page); + tdx_quirk_reset_page(kvm_tdx->td.tdr_page); __free_page(kvm_tdx->td.tdr_page); kvm_tdx->td.tdr_page = NULL; @@ -861,6 +852,7 @@ void tdx_vcpu_free(struct kvm_vcpu *vcpu) if (tdx->vp.tdvpr_page) { tdx_reclaim_control_page(tdx->vp.tdvpr_page); tdx->vp.tdvpr_page = 0; + tdx->vp.tdvpr_pa = 0; } tdx->state = VCPU_TD_STATE_UNINITIALIZED; @@ -1714,7 +1706,7 @@ static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn, pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err); return -EIO; } - tdx_clear_page(page); + tdx_quirk_reset_page(page); tdx_unpin(kvm, page); return 0; } @@ -2940,6 +2932,13 @@ static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx) return -ENOMEM; tdx->vp.tdvpr_page = page; + /* + * page_to_phys() does not work in 'noinstr' code, like guest + * entry via tdh_vp_enter(). Precalculate and store it instead + * of doing it at runtime later. + */ + tdx->vp.tdvpr_pa = page_to_phys(tdx->vp.tdvpr_page); + tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages), GFP_KERNEL); if (!tdx->vp.tdcx_pages) { @@ -3002,6 +3001,7 @@ free_tdvpr: if (tdx->vp.tdvpr_page) __free_page(tdx->vp.tdvpr_page); tdx->vp.tdvpr_page = 0; + tdx->vp.tdvpr_pa = 0; return ret; } @@ -3318,8 +3318,11 @@ int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) return ret; } -int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) +int tdx_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private) { + if (!is_private) + return 0; + return PG_LEVEL_4K; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index aa157fe5b7b3..0bdf9405969a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5785,6 +5785,13 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (kvm_test_request(KVM_REQ_EVENT, vcpu)) return 1; + /* + * Ensure that any updates to kvm->buses[] observed by the + * previous instruction (emulated or otherwise) are also + * visible to the instruction KVM is about to emulate. + */ + smp_rmb(); + if (!kvm_emulate_instruction(vcpu, 0)) return 0; diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index 2b3424f638db..4c70f56c57c8 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -153,7 +153,7 @@ int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp); void tdx_flush_tlb_current(struct kvm_vcpu *vcpu); void tdx_flush_tlb_all(struct kvm_vcpu *vcpu); void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); -int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn); +int tdx_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private); #endif #endif /* __KVM_X86_VMX_X86_OPS_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 706b6fd56d3c..f122906ed9f3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13530,6 +13530,16 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_arch_no_poll); +#ifdef CONFIG_KVM_GUEST_MEMFD +/* + * KVM doesn't yet support mmap() on guest_memfd for VMs with private memory + * (the private vs. shared tracking needs to be moved into guest_memfd). 
+ */ +bool kvm_arch_supports_gmem_mmap(struct kvm *kvm) +{ + return !kvm_arch_has_private_mem(kvm); +} + #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order) { @@ -13543,6 +13553,7 @@ void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) kvm_x86_call(gmem_invalidate)(start, end); } #endif +#endif int kvm_spec_ctrl_test_value(u64 value) { diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index bb57e93b4caf..8bf6ad4b9400 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -34,6 +34,7 @@ * We need to define the tracepoints somewhere, and tlb.c * is only compiled when SMP=y. */ +#define CREATE_TRACE_POINTS #include <trace/events/tlb.h> #include "mm_internal.h" diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index b9426fce5f3e..0e4270e20fad 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1031,7 +1031,7 @@ static void __meminit free_pagetable(struct page *page, int order) free_reserved_pages(page, nr_pages); #endif } else { - free_pages((unsigned long)page_address(page), order); + __free_pages(page, order); } } diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 0539efd0d216..998b6010d6d3 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -451,5 +451,5 @@ void __init kasan_init(void) __flush_tlb_all(); init_task.kasan_depth = 0; - pr_info("KernelAddressSanitizer initialized\n"); + kasan_init_generic(); } diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 5ed2109211da..82f3a987f7cf 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -80,7 +80,7 @@ unsigned long arch_mmap_rnd(void) } static unsigned long mmap_base(unsigned long rnd, unsigned long task_size, - struct rlimit *rlim_stack) + const struct rlimit *rlim_stack) { unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = stack_maxrandom_size(task_size) + stack_guard_gap; @@ -110,7 +110,7 @@ static unsigned long mmap_legacy_base(unsigned long rnd, */ static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base, unsigned long random_factor, unsigned long task_size, - struct rlimit *rlim_stack) + const struct rlimit *rlim_stack) { *legacy_base = mmap_legacy_base(random_factor, task_size); if (mmap_is_legacy()) @@ -119,12 +119,12 @@ static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base, *base = mmap_base(random_factor, task_size, rlim_stack); } -void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) +void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack) { if (mmap_is_legacy()) - clear_bit(MMF_TOPDOWN, &mm->flags); + mm_flags_clear(MMF_TOPDOWN, mm); else - set_bit(MMF_TOPDOWN, &mm->flags); + mm_flags_set(MMF_TOPDOWN, mm); arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base, arch_rnd(mmap64_rnd_bits), task_size_64bit(0), diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index c09284302dd3..b68200a0e0c6 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -126,7 +126,7 @@ __setup("debugpat", pat_debug_setup); static inline enum page_cache_mode get_page_memtype(struct page *pg) { - unsigned long pg_flags = pg->flags & _PGMT_MASK; + unsigned long pg_flags = pg->flags.f & _PGMT_MASK; if (pg_flags == _PGMT_WB) return _PAGE_CACHE_MODE_WB; @@ -161,10 +161,10 @@ static inline void set_page_memtype(struct page *pg, break; } - old_flags = READ_ONCE(pg->flags); + old_flags = READ_ONCE(pg->flags.f); do { new_flags = (old_flags & 
_PGMT_CLEAR_MASK) | memtype_flags; - } while (!try_cmpxchg(&pg->flags, &old_flags, new_flags)); + } while (!try_cmpxchg(&pg->flags.f, &old_flags, new_flags)); } #else static inline enum page_cache_mode get_page_memtype(struct page *pg) diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 8834c76f91c9..d2d54b8c4dbb 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -399,15 +399,6 @@ static void cpa_flush_all(unsigned long cache) on_each_cpu(__cpa_flush_all, (void *) cache, 1); } -static void __cpa_flush_tlb(void *data) -{ - struct cpa_data *cpa = data; - unsigned int i; - - for (i = 0; i < cpa->numpages; i++) - flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i))); -} - static int collapse_large_pages(unsigned long addr, struct list_head *pgtables); static void cpa_collapse_large_pages(struct cpa_data *cpa) @@ -444,6 +435,7 @@ static void cpa_collapse_large_pages(struct cpa_data *cpa) static void cpa_flush(struct cpa_data *cpa, int cache) { + unsigned long start, end; unsigned int i; BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); @@ -453,10 +445,12 @@ static void cpa_flush(struct cpa_data *cpa, int cache) goto collapse_large_pages; } - if (cpa->force_flush_all || cpa->numpages > tlb_single_page_flush_ceiling) - flush_tlb_all(); - else - on_each_cpu(__cpa_flush_tlb, cpa, 1); + start = fix_addr(__cpa_addr(cpa, 0)); + end = fix_addr(__cpa_addr(cpa, cpa->numpages)); + if (cpa->force_flush_all) + end = TLB_FLUSH_ALL; + + flush_tlb_kernel_range(start, end); if (!cache) goto collapse_large_pages; diff --git a/arch/x86/platform/efi/memmap.c b/arch/x86/platform/efi/memmap.c index 061b8ecc71a1..023697c88910 100644 --- a/arch/x86/platform/efi/memmap.c +++ b/arch/x86/platform/efi/memmap.c @@ -42,7 +42,7 @@ void __init __efi_memmap_free(u64 phys, unsigned long size, unsigned long flags) struct page *p = pfn_to_page(PHYS_PFN(phys)); unsigned int order = get_order(size); - free_pages((unsigned long) page_address(p), order); + __free_pages(p, order); } } diff --git a/arch/x86/video/video-common.c b/arch/x86/video/video-common.c index 81fc97a2a837..e0aeee99bc99 100644 --- a/arch/x86/video/video-common.c +++ b/arch/x86/video/video-common.c @@ -9,6 +9,7 @@ #include <linux/module.h> #include <linux/pci.h> +#include <linux/screen_info.h> #include <linux/vgaarb.h> #include <asm/video.h> @@ -27,6 +28,11 @@ EXPORT_SYMBOL(pgprot_framebuffer); bool video_is_primary_device(struct device *dev) { +#ifdef CONFIG_SCREEN_INFO + struct screen_info *si = &screen_info; + struct resource res[SCREEN_INFO_MAX_RESOURCES]; + ssize_t i, numres; +#endif struct pci_dev *pdev; if (!dev_is_pci(dev)) @@ -34,7 +40,24 @@ bool video_is_primary_device(struct device *dev) pdev = to_pci_dev(dev); - return (pdev == vga_default_device()); + if (!pci_is_display(pdev)) + return false; + + if (pdev == vga_default_device()) + return true; + +#ifdef CONFIG_SCREEN_INFO + numres = screen_info_resources(si, res, ARRAY_SIZE(res)); + for (i = 0; i < numres; ++i) { + if (!(res[i].flags & IORESOURCE_MEM)) + continue; + + if (pci_find_resource(pdev, &res[i])) + return true; + } +#endif + + return false; } EXPORT_SYMBOL(video_is_primary_device); diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index c7a9a087ccaf..eac403248462 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -633,15 +633,19 @@ err: } /* - * Convert TDX private pages back to normal by using MOVDIR64B to - * clear these pages. 
Note this function doesn't flush cache of - * these TDX private pages. The caller should make sure of that. + * Convert TDX private pages back to normal by using MOVDIR64B to clear these + * pages. Typically, any write to the page will convert it from TDX private back + * to normal kernel memory. Systems with the X86_BUG_TDX_PW_MCE erratum need to + * do the conversion explicitly via MOVDIR64B. */ -static void reset_tdx_pages(unsigned long base, unsigned long size) +static void tdx_quirk_reset_paddr(unsigned long base, unsigned long size) { const void *zero_page = (const void *)page_address(ZERO_PAGE(0)); unsigned long phys, end; + if (!boot_cpu_has_bug(X86_BUG_TDX_PW_MCE)) + return; + end = base + size; for (phys = base; phys < end; phys += 64) movdir64b(__va(phys), zero_page); @@ -654,17 +658,23 @@ static void reset_tdx_pages(unsigned long base, unsigned long size) mb(); } -static void tdmr_reset_pamt(struct tdmr_info *tdmr) +void tdx_quirk_reset_page(struct page *page) +{ + tdx_quirk_reset_paddr(page_to_phys(page), PAGE_SIZE); +} +EXPORT_SYMBOL_GPL(tdx_quirk_reset_page); + +static void tdmr_quirk_reset_pamt(struct tdmr_info *tdmr) { - tdmr_do_pamt_func(tdmr, reset_tdx_pages); + tdmr_do_pamt_func(tdmr, tdx_quirk_reset_paddr); } -static void tdmrs_reset_pamt_all(struct tdmr_info_list *tdmr_list) +static void tdmrs_quirk_reset_pamt_all(struct tdmr_info_list *tdmr_list) { int i; for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) - tdmr_reset_pamt(tdmr_entry(tdmr_list, i)); + tdmr_quirk_reset_pamt(tdmr_entry(tdmr_list, i)); } static unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list) @@ -1136,15 +1146,7 @@ err_reset_pamts: * to the kernel. */ wbinvd_on_all_cpus(); - /* - * According to the TDX hardware spec, if the platform - * doesn't have the "partial write machine check" - * erratum, any kernel read/write will never cause #MC - * in kernel space, thus it's OK to not convert PAMTs - * back to normal. But do the conversion anyway here - * as suggested by the TDX spec. - */ - tdmrs_reset_pamt_all(&tdx_tdmr_list); + tdmrs_quirk_reset_pamt_all(&tdx_tdmr_list); err_free_pamts: tdmrs_free_pamt_all(&tdx_tdmr_list); err_free_tdmrs: @@ -1266,7 +1268,7 @@ static bool paddr_is_tdx_private(unsigned long phys) return false; /* Get page type from the TDX module */ - sret = __seamcall_ret(TDH_PHYMEM_PAGE_RDMD, &args); + sret = __seamcall_dirty_cache(__seamcall_ret, TDH_PHYMEM_PAGE_RDMD, &args); /* * The SEAMCALL will not return success unless there is a @@ -1502,11 +1504,6 @@ static inline u64 tdx_tdr_pa(struct tdx_td *td) return page_to_phys(td->tdr_page); } -static inline u64 tdx_tdvpr_pa(struct tdx_vp *td) -{ - return page_to_phys(td->tdvpr_page); -} - /* * The TDX module exposes a CLFLUSH_BEFORE_ALLOC bit to specify whether * a CLFLUSH of pages is required before handing them to the TDX module. 
@@ -1518,11 +1515,11 @@ static void tdx_clflush_page(struct page *page) clflush_cache_range(page_to_virt(page), PAGE_SIZE); } -noinstr __flatten u64 tdh_vp_enter(struct tdx_vp *td, struct tdx_module_args *args) +noinstr u64 tdh_vp_enter(struct tdx_vp *td, struct tdx_module_args *args) { - args->rcx = tdx_tdvpr_pa(td); + args->rcx = td->tdvpr_pa; - return __seamcall_saved_ret(TDH_VP_ENTER, args); + return __seamcall_dirty_cache(__seamcall_saved_ret, TDH_VP_ENTER, args); } EXPORT_SYMBOL_GPL(tdh_vp_enter); @@ -1581,7 +1578,7 @@ u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page) { struct tdx_module_args args = { .rcx = page_to_phys(tdcx_page), - .rdx = tdx_tdvpr_pa(vp), + .rdx = vp->tdvpr_pa, }; tdx_clflush_page(tdcx_page); @@ -1650,7 +1647,7 @@ EXPORT_SYMBOL_GPL(tdh_mng_create); u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp) { struct tdx_module_args args = { - .rcx = tdx_tdvpr_pa(vp), + .rcx = vp->tdvpr_pa, .rdx = tdx_tdr_pa(td), }; @@ -1706,7 +1703,7 @@ EXPORT_SYMBOL_GPL(tdh_mr_finalize); u64 tdh_vp_flush(struct tdx_vp *vp) { struct tdx_module_args args = { - .rcx = tdx_tdvpr_pa(vp), + .rcx = vp->tdvpr_pa, }; return seamcall(TDH_VP_FLUSH, &args); @@ -1752,7 +1749,7 @@ EXPORT_SYMBOL_GPL(tdh_mng_init); u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data) { struct tdx_module_args args = { - .rcx = tdx_tdvpr_pa(vp), + .rcx = vp->tdvpr_pa, .rdx = field, }; u64 ret; @@ -1769,7 +1766,7 @@ EXPORT_SYMBOL_GPL(tdh_vp_rd); u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask) { struct tdx_module_args args = { - .rcx = tdx_tdvpr_pa(vp), + .rcx = vp->tdvpr_pa, .rdx = field, .r8 = data, .r9 = mask, @@ -1782,7 +1779,7 @@ EXPORT_SYMBOL_GPL(tdh_vp_wr); u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid) { struct tdx_module_args args = { - .rcx = tdx_tdvpr_pa(vp), + .rcx = vp->tdvpr_pa, .rdx = initial_rcx, .r8 = x2apicid, }; @@ -1870,3 +1867,22 @@ u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page) return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args); } EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_hkid); + +#ifdef CONFIG_KEXEC_CORE +void tdx_cpu_flush_cache_for_kexec(void) +{ + lockdep_assert_preemption_disabled(); + + if (!this_cpu_read(cache_state_incoherent)) + return; + + /* + * Private memory cachelines need to be clean at the time of + * kexec. Write them back now, as the caller promises that + * there should be no more SEAMCALLs on this CPU. + */ + wbinvd(); + this_cpu_write(cache_state_incoherent, false); +} +EXPORT_SYMBOL_GPL(tdx_cpu_flush_cache_for_kexec); +#endif |
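
As a brief illustration of the relocate_kernel() interface change above (the former preserve_context and host_mem_enc_active parameters are collapsed into a single RELOC_KERNEL_* flags word), here is a minimal standalone C sketch of how machine_kexec() now assembles that word. This is not kernel code: build_relocate_kernel_flags() and the two booleans are stand-ins for image->preserve_context and the per-CPU cache_state_incoherent flag; only the RELOC_KERNEL_* bit definitions mirror the patch.

/*
 * Standalone sketch, compile with any C compiler. The stubbed booleans
 * below are assumptions standing in for kernel state; only the
 * RELOC_KERNEL_* bits come from <asm/kexec.h> in the diff above.
 */
#include <stdbool.h>
#include <stdio.h>

#define BIT(n)                          (1u << (n))
#define RELOC_KERNEL_PRESERVE_CONTEXT   BIT(0)
#define RELOC_KERNEL_CACHE_INCOHERENT   BIT(1)

/* Stand-in for this_cpu_read(cache_state_incoherent), e.g. set by SME/TDX code. */
static bool cache_state_incoherent = true;

/* Stand-in for image->preserve_context (kexec jump). */
static bool preserve_context = false;

static unsigned int build_relocate_kernel_flags(void)
{
	unsigned int flags = 0;

	if (preserve_context)
		flags |= RELOC_KERNEL_PRESERVE_CONTEXT;

	/*
	 * Sampled before the segment registers are reset, mirroring the
	 * comment in machine_kexec(): after load_segments() no percpu
	 * access (and hence no function call) is allowed.
	 */
	if (cache_state_incoherent)
		flags |= RELOC_KERNEL_CACHE_INCOHERENT;

	return flags;
}

int main(void)
{
	unsigned int flags = build_relocate_kernel_flags();

	printf("relocate_kernel flags: %#x (wbinvd: %s, preserve context: %s)\n",
	       flags,
	       (flags & RELOC_KERNEL_CACHE_INCOHERENT) ? "yes" : "no",
	       (flags & RELOC_KERNEL_PRESERVE_CONTEXT) ? "yes" : "no");
	return 0;
}

In relocate_kernel_64.S this single word is then tested bit by bit (testb $RELOC_KERNEL_CACHE_INCOHERENT / $RELOC_KERNEL_PRESERVE_CONTEXT), which is why the old separate %rcx/%r8 arguments could be dropped.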