Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig                       |   2
-rw-r--r--  arch/x86/entry/entry_64_fred.S         |   2
-rw-r--r--  arch/x86/entry/vsyscall/vsyscall_64.c  |  17
-rw-r--r--  arch/x86/include/asm/floppy.h          |   8
-rw-r--r--  arch/x86/include/asm/kexec.h           |  12
-rw-r--r--  arch/x86/include/asm/kvm-x86-ops.h     |   2
-rw-r--r--  arch/x86/include/asm/kvm_host.h        |   6
-rw-r--r--  arch/x86/include/asm/kvm_para.h        |   2
-rw-r--r--  arch/x86/include/asm/processor.h       |   2
-rw-r--r--  arch/x86/include/asm/tdx.h             |  35
-rw-r--r--  arch/x86/include/asm/video.h           |   2
-rw-r--r--  arch/x86/kernel/cpu/amd.c              |  17
-rw-r--r--  arch/x86/kernel/crash.c                |  25
-rw-r--r--  arch/x86/kernel/kexec-bzimage64.c      |  47
-rw-r--r--  arch/x86/kernel/kvm.c                  |  44
-rw-r--r--  arch/x86/kernel/machine_kexec_64.c     |  44
-rw-r--r--  arch/x86/kernel/process.c              |  24
-rw-r--r--  arch/x86/kernel/relocate_kernel_64.S   |  36
-rw-r--r--  arch/x86/kvm/Kconfig                   |  26
-rw-r--r--  arch/x86/kvm/mmu/mmu.c                 | 142
-rw-r--r--  arch/x86/kvm/mmu/mmu_internal.h        |   2
-rw-r--r--  arch/x86/kvm/mmu/tdp_mmu.c             |   2
-rw-r--r--  arch/x86/kvm/svm/sev.c                 |   6
-rw-r--r--  arch/x86/kvm/svm/svm.c                 |   2
-rw-r--r--  arch/x86/kvm/svm/svm.h                 |   4
-rw-r--r--  arch/x86/kvm/vmx/main.c                |   7
-rw-r--r--  arch/x86/kvm/vmx/tdx.c                 |  49
-rw-r--r--  arch/x86/kvm/vmx/vmx.c                 |   7
-rw-r--r--  arch/x86/kvm/vmx/x86_ops.h             |   2
-rw-r--r--  arch/x86/kvm/x86.c                     |  11
-rw-r--r--  arch/x86/mm/init.c                     |   1
-rw-r--r--  arch/x86/mm/init_64.c                  |   2
-rw-r--r--  arch/x86/mm/kasan_init_64.c            |   2
-rw-r--r--  arch/x86/mm/mmap.c                     |  10
-rw-r--r--  arch/x86/mm/pat/memtype.c              |   6
-rw-r--r--  arch/x86/mm/pat/set_memory.c           |  20
-rw-r--r--  arch/x86/platform/efi/memmap.c         |   2
-rw-r--r--  arch/x86/video/video-common.c          |  25
-rw-r--r--  arch/x86/virt/vmx/tdx/tdx.c            |  80
 39 files changed, 489 insertions(+), 246 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 75f3de70df51..9d034a987c6e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1565,7 +1565,6 @@ config ARCH_SPARSEMEM_ENABLE
def_bool y
select SPARSEMEM_STATIC if X86_32
select SPARSEMEM_VMEMMAP_ENABLE if X86_64
- select SPARSEMEM_VMEMMAP if X86_64
config ARCH_SPARSEMEM_DEFAULT
def_bool X86_64 || (NUMA && X86_32)
@@ -1903,7 +1902,6 @@ config INTEL_TDX_HOST
depends on X86_X2APIC
select ARCH_KEEP_MEMBLOCK
depends on CONTIG_ALLOC
- depends on !KEXEC_CORE
depends on X86_MCE
help
Intel Trust Domain Extensions (TDX) protects guest VMs from malicious
diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S
index 29c5c32c16c3..907bd233c6c1 100644
--- a/arch/x86/entry/entry_64_fred.S
+++ b/arch/x86/entry/entry_64_fred.S
@@ -16,7 +16,7 @@
.macro FRED_ENTER
UNWIND_HINT_END_OF_STACK
- ENDBR
+ ANNOTATE_NOENDBR
PUSH_AND_CLEAR_REGS
movq %rsp, %rdi /* %rdi -> pt_regs */
.endm
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index c9103a6fa06e..6e6c0a740837 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -124,7 +124,12 @@ bool emulate_vsyscall(unsigned long error_code,
if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER)
return false;
- if (!(error_code & X86_PF_INSTR)) {
+ /*
+ * Assume that faults at regs->ip are because of an
+ * instruction fetch. Return early and avoid
+ * emulation for faults during data accesses:
+ */
+ if (address != regs->ip) {
/* Failed vsyscall read */
if (vsyscall_mode == EMULATE)
return false;
@@ -137,12 +142,18 @@ bool emulate_vsyscall(unsigned long error_code,
}
/*
+ * X86_PF_INSTR is only set when NX is supported. When
+ * available, use it to double-check that the emulation code
+ * is only being used for instruction fetches:
+ */
+ if (cpu_feature_enabled(X86_FEATURE_NX))
+ WARN_ON_ONCE(!(error_code & X86_PF_INSTR));
+
+ /*
* No point in checking CS -- the only way to get here is a user mode
* trap to a high address, which means that we're in 64-bit user code.
*/
- WARN_ON_ONCE(address != regs->ip);
-
if (vsyscall_mode == NONE) {
warn_bad_vsyscall(KERN_INFO, regs,
"vsyscall attempted with vsyscall=none");
diff --git a/arch/x86/include/asm/floppy.h b/arch/x86/include/asm/floppy.h
index 6ec3fc969ad5..e7a244051c62 100644
--- a/arch/x86/include/asm/floppy.h
+++ b/arch/x86/include/asm/floppy.h
@@ -10,6 +10,7 @@
#ifndef _ASM_X86_FLOPPY_H
#define _ASM_X86_FLOPPY_H
+#include <linux/sizes.h>
#include <linux/vmalloc.h>
/*
@@ -22,10 +23,7 @@
*/
#define _CROSS_64KB(a, s, vdma) \
(!(vdma) && \
- ((unsigned long)(a)/K_64 != ((unsigned long)(a) + (s) - 1) / K_64))
-
-#define CROSS_64KB(a, s) _CROSS_64KB(a, s, use_virtual_dma & 1)
-
+ ((unsigned long)(a) / SZ_64K != ((unsigned long)(a) + (s) - 1) / SZ_64K))
#define SW fd_routine[use_virtual_dma & 1]
#define CSW fd_routine[can_use_virtual_dma & 1]
@@ -206,7 +204,7 @@ static int vdma_dma_setup(char *addr, unsigned long size, int mode, int io)
static int hard_dma_setup(char *addr, unsigned long size, int mode, int io)
{
#ifdef FLOPPY_SANITY_CHECK
- if (CROSS_64KB(addr, size)) {
+ if (_CROSS_64KB(addr, size, use_virtual_dma & 1)) {
printk("DMA crossing 64-K boundary %p-%p\n", addr, addr+size);
return -1;
}
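
For reference, the boundary test that _CROSS_64KB() now performs with SZ_64K is easy to check in isolation; below is a minimal userspace sketch of the same arithmetic (illustrative only, not part of the patch):

        #include <stdbool.h>
        #include <stdint.h>
        #include <stdio.h>

        #define SZ_64K 0x10000UL

        /* True if [addr, addr + size) spans a 64 KiB DMA boundary (the !vdma case). */
        static bool crosses_64k(uintptr_t addr, unsigned long size)
        {
                return addr / SZ_64K != (addr + size - 1) / SZ_64K;
        }

        int main(void)
        {
                /* 0xfff0 + 0x20 ends at 0x1000f, so it crosses a 64 KiB line. */
                printf("%d\n", crosses_64k(0xfff0, 0x20));   /* prints 1 */
                printf("%d\n", crosses_64k(0x10000, 0x200)); /* prints 0 */
                return 0;
        }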
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index f2ad77929d6e..5cfb27f26583 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -13,6 +13,15 @@
# define KEXEC_DEBUG_EXC_HANDLER_SIZE 6 /* PUSHI, PUSHI, 2-byte JMP */
#endif
+#ifdef CONFIG_X86_64
+
+#include <linux/bits.h>
+
+#define RELOC_KERNEL_PRESERVE_CONTEXT BIT(0)
+#define RELOC_KERNEL_CACHE_INCOHERENT BIT(1)
+
+#endif
+
# define KEXEC_CONTROL_PAGE_SIZE 4096
# define KEXEC_CONTROL_CODE_MAX_SIZE 2048
@@ -121,8 +130,7 @@ typedef unsigned long
relocate_kernel_fn(unsigned long indirection_page,
unsigned long pa_control_page,
unsigned long start_address,
- unsigned int preserve_context,
- unsigned int host_mem_enc_active);
+ unsigned int flags);
#endif
extern relocate_kernel_fn relocate_kernel;
#define ARCH_HAS_KIMAGE_ARCH
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 18a5c3119e1a..62c3e4de3303 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -145,7 +145,7 @@ KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
KVM_X86_OP_OPTIONAL(get_untagged_addr)
KVM_X86_OP_OPTIONAL(alloc_apic_backing_page)
KVM_X86_OP_OPTIONAL_RET0(gmem_prepare)
-KVM_X86_OP_OPTIONAL_RET0(private_max_mapping_level)
+KVM_X86_OP_OPTIONAL_RET0(gmem_max_mapping_level)
KVM_X86_OP_OPTIONAL(gmem_invalidate)
#undef KVM_X86_OP
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f19a76d3ca0e..c56cc54d682a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1922,7 +1922,7 @@ struct kvm_x86_ops {
void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu);
int (*gmem_prepare)(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order);
void (*gmem_invalidate)(kvm_pfn_t start, kvm_pfn_t end);
- int (*private_max_mapping_level)(struct kvm *kvm, kvm_pfn_t pfn);
+ int (*gmem_max_mapping_level)(struct kvm *kvm, kvm_pfn_t pfn, bool is_private);
};
struct kvm_x86_nested_ops {
@@ -2276,10 +2276,8 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
int tdp_max_root_level, int tdp_huge_page_level);
-#ifdef CONFIG_KVM_PRIVATE_MEM
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem)
-#else
-#define kvm_arch_has_private_mem(kvm) false
#endif
#define kvm_arch_has_readonly_mem(kvm) (!(kvm)->arch.has_protected_state)
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 57bc74e112f2..4a47c16e2df8 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -124,7 +124,6 @@ bool kvm_para_available(void);
unsigned int kvm_arch_para_features(void);
unsigned int kvm_arch_para_hints(void);
void kvm_async_pf_task_wait_schedule(u32 token);
-void kvm_async_pf_task_wake(u32 token);
u32 kvm_read_and_reset_apf_flags(void);
bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token);
@@ -148,7 +147,6 @@ static inline void kvm_spinlock_init(void)
#else /* CONFIG_KVM_GUEST */
#define kvm_async_pf_task_wait_schedule(T) do {} while(0)
-#define kvm_async_pf_task_wake(T) do {} while(0)
static inline bool kvm_para_available(void)
{
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index bde58f6510ac..a24c7805acdb 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -731,6 +731,8 @@ void __noreturn stop_this_cpu(void *dummy);
void microcode_check(struct cpuinfo_x86 *prev_info);
void store_cpu_caps(struct cpuinfo_x86 *info);
+DECLARE_PER_CPU(bool, cache_state_incoherent);
+
enum l1tf_mitigations {
L1TF_MITIGATION_OFF,
L1TF_MITIGATION_AUTO,
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 7ddef3a69866..6b338d7f01b7 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -102,10 +102,31 @@ u64 __seamcall_ret(u64 fn, struct tdx_module_args *args);
u64 __seamcall_saved_ret(u64 fn, struct tdx_module_args *args);
void tdx_init(void);
+#include <linux/preempt.h>
#include <asm/archrandom.h>
+#include <asm/processor.h>
typedef u64 (*sc_func_t)(u64 fn, struct tdx_module_args *args);
+static __always_inline u64 __seamcall_dirty_cache(sc_func_t func, u64 fn,
+ struct tdx_module_args *args)
+{
+ lockdep_assert_preemption_disabled();
+
+ /*
+ * SEAMCALLs are made to the TDX module and can generate dirty
+ * cachelines of TDX private memory. Mark cache state incoherent
+ * so that the cache can be flushed during kexec.
+ *
+ * This needs to be done before actually making the SEAMCALL,
+ * because the kexec-ing CPU could send an NMI to stop remote CPUs,
+ * in which case even disabling IRQs won't help here.
+ */
+ this_cpu_write(cache_state_incoherent, true);
+
+ return func(fn, args);
+}
+
static __always_inline u64 sc_retry(sc_func_t func, u64 fn,
struct tdx_module_args *args)
{
@@ -113,7 +134,9 @@ static __always_inline u64 sc_retry(sc_func_t func, u64 fn,
u64 ret;
do {
- ret = func(fn, args);
+ preempt_disable();
+ ret = __seamcall_dirty_cache(func, fn, args);
+ preempt_enable();
} while (ret == TDX_RND_NO_ENTROPY && --retry);
return ret;
@@ -131,6 +154,8 @@ int tdx_guest_keyid_alloc(void);
u32 tdx_get_nr_guest_keyids(void);
void tdx_guest_keyid_free(unsigned int keyid);
+void tdx_quirk_reset_page(struct page *page);
+
struct tdx_td {
/* TD root structure: */
struct page *tdr_page;
@@ -146,6 +171,8 @@ struct tdx_td {
struct tdx_vp {
/* TDVP root page */
struct page *tdvpr_page;
+ /* precalculated page_to_phys(tdvpr_page) for use in noinstr code */
+ phys_addr_t tdvpr_pa;
/* TD vCPU control structure: */
struct page **tdcx_pages;
@@ -203,5 +230,11 @@ static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; }
static inline const struct tdx_sys_info *tdx_get_sysinfo(void) { return NULL; }
#endif /* CONFIG_INTEL_TDX_HOST */
+#ifdef CONFIG_KEXEC_CORE
+void tdx_cpu_flush_cache_for_kexec(void);
+#else
+static inline void tdx_cpu_flush_cache_for_kexec(void) { }
+#endif
+
#endif /* !__ASSEMBLER__ */
#endif /* _ASM_X86_TDX_H */
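
The ordering enforced by sc_retry() and __seamcall_dirty_cache() in the tdx.h hunks above can be mocked up outside the kernel. The sketch below stubs out the kernel primitives (the stubs, the retry count and the TDX_RND_NO_ENTROPY value are illustrative assumptions, not taken from the patch) and only demonstrates the control flow: raise the per-CPU incoherence flag before each SEAMCALL attempt, keep preemption disabled across both steps so the flag and the call target the same CPU, and retry while the module reports no entropy:

        #include <stdbool.h>
        #include <stdint.h>

        /* Stand-in value and primitives, for illustration only. */
        #define TDX_RND_NO_ENTROPY 0x1ULL

        /* Stand-in for the per-CPU cache_state_incoherent flag. */
        static _Thread_local bool cache_state_incoherent;
        static void preempt_disable(void) { }
        static void preempt_enable(void)  { }

        typedef uint64_t (*sc_func_t)(uint64_t fn, void *args);

        static uint64_t seamcall_dirty_cache(sc_func_t func, uint64_t fn, void *args)
        {
                /* Must be set before the SEAMCALL can dirty private cachelines. */
                cache_state_incoherent = true;
                return func(fn, args);
        }

        static uint64_t sc_retry(sc_func_t func, uint64_t fn, void *args)
        {
                int retry = 1000;       /* arbitrary for the sketch */
                uint64_t ret;

                do {
                        preempt_disable();
                        ret = seamcall_dirty_cache(func, fn, args);
                        preempt_enable();
                } while (ret == TDX_RND_NO_ENTROPY && --retry);

                return ret;
        }

        static uint64_t dummy_seamcall(uint64_t fn, void *args)
        {
                (void)fn; (void)args;
                return 0;       /* succeed on the first try */
        }

        int main(void)
        {
                return (int)sc_retry(dummy_seamcall, 0, (void *)0);
        }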
diff --git a/arch/x86/include/asm/video.h b/arch/x86/include/asm/video.h
index 0950c9535fae..08ec328203ef 100644
--- a/arch/x86/include/asm/video.h
+++ b/arch/x86/include/asm/video.h
@@ -13,8 +13,10 @@ pgprot_t pgprot_framebuffer(pgprot_t prot,
unsigned long offset);
#define pgprot_framebuffer pgprot_framebuffer
+#ifdef CONFIG_VIDEO
bool video_is_primary_device(struct device *dev);
#define video_is_primary_device video_is_primary_device
+#endif
#include <asm-generic/video.h>
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index a6f88ca1a6b4..5398db4dedb4 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -546,6 +546,23 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
u64 msr;
/*
+ * Mark that WBINVD is needed during kexec on processors that
+ * support SME. This provides support for performing a successful
+ * kexec when going from SME inactive to SME active (or vice-versa).
+ *
+ * The cache must be cleared so that if there are entries with the
+ * same physical address, both with and without the encryption bit,
+ * they don't race each other when flushed and potentially end up
+ * with the wrong entry being committed to memory.
+ *
+ * Test the CPUID bit directly because with mem_encrypt=off the
+ * BSP will clear the X86_FEATURE_SME bit and the APs will not
+ * see it set after that.
+ */
+ if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0)))
+ __this_cpu_write(cache_state_incoherent, true);
+
+ /*
* BIOS support is required for SME and SEV.
* For SME: If BIOS has enabled SME then adjust x86_phys_bits by
* the SME physical address space reduction value.
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index c6b12bed173d..335fd2ee9766 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -165,14 +165,23 @@ static struct crash_mem *fill_up_crash_elf_data(void)
/*
* Exclusion of crash region, crashk_low_res and/or crashk_cma_ranges
* may cause range splits. So add extra slots here.
+ *
+ * Exclusion of the low 1M may not cause another range split, because
+ * the excluded range is [0, 1M] and a new region is only split off
+ * when both the start and end parameters fall strictly inside an
+ * existing region in cmem, i.e. are not equal to that region's
+ * start or end. Obviously, the start of [0, 1M] cannot meet this
+ * condition.
+ *
+ * But lest the exclusion of the low 1M be changed in the future
+ * (e.g. to [start, 1M]), add an extra slot.
*/
- nr_ranges += 2 + crashk_cma_cnt;
+ nr_ranges += 3 + crashk_cma_cnt;
cmem = vzalloc(struct_size(cmem, ranges, nr_ranges));
if (!cmem)
return NULL;
cmem->max_nr_ranges = nr_ranges;
- cmem->nr_ranges = 0;
return cmem;
}
@@ -323,16 +332,20 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
struct crash_mem *cmem;
/*
- * Using random kexec_buf for passing dm crypt keys may cause a range
- * split. So use two slots here.
+ * In the current x86 architecture code, the elfheader is always
+ * allocated at crashk_res.start, but whether a split occurs depends on
+ * the allocation position of the elfheader within crashk_res. To avoid
+ * a potential out-of-bounds in the future, add an extra slot.
+ *
+ * And using random kexec_buf for passing dm crypt keys may cause a
+ * range split too, add another extra slot here.
*/
- nr_ranges = 2;
+ nr_ranges = 3;
cmem = vzalloc(struct_size(cmem, ranges, nr_ranges));
if (!cmem)
return -ENOMEM;
cmem->max_nr_ranges = nr_ranges;
- cmem->nr_ranges = 0;
memset(&cmd, 0, sizeof(struct crash_memmap_data));
cmd.params = params;
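
The splitting rule the two comments above rely on can be shown with a tiny model of excluding one range from another (simplified from what crash_exclude_mem_range() does; illustrative only): the region count only grows when both ends of the excluded range fall strictly inside an existing region, which an exclusion starting at 0 can never do.

        #include <stdio.h>

        struct range { unsigned long long start, end; };

        /* Exclude [s, e] from r; returns how many pieces remain (0, 1 or 2). */
        static int exclude(struct range r, unsigned long long s,
                           unsigned long long e, struct range out[2])
        {
                int n = 0;

                if (s > r.start)
                        out[n++] = (struct range){ r.start, s - 1 };
                if (e < r.end)
                        out[n++] = (struct range){ e + 1, r.end };
                return n;
        }

        int main(void)
        {
                struct range out[2];

                /* Excluding [0, 1M] from [0, 4G-1]: start coincides, no split. */
                printf("%d\n", exclude((struct range){ 0, 0xffffffffULL },
                                       0, 0x100000, out));          /* 1 */

                /* Excluding a mid-range chunk splits one region into two. */
                printf("%d\n", exclude((struct range){ 0, 0xffffffffULL },
                                       0x100000, 0x200000, out));   /* 2 */

                return 0;
        }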
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 24a41f0e0cf1..c3244ac680d1 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -16,6 +16,8 @@
#include <linux/kexec.h>
#include <linux/kernel.h>
#include <linux/mm.h>
+#include <linux/libfdt.h>
+#include <linux/of_fdt.h>
#include <linux/efi.h>
#include <linux/random.h>
@@ -212,6 +214,28 @@ setup_efi_state(struct boot_params *params, unsigned long params_load_addr,
}
#endif /* CONFIG_EFI */
+#ifdef CONFIG_OF_FLATTREE
+static void setup_dtb(struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int dtb_setup_data_offset)
+{
+ struct setup_data *sd = (void *)params + dtb_setup_data_offset;
+ unsigned long setup_data_phys, dtb_len;
+
+ dtb_len = fdt_totalsize(initial_boot_params);
+ sd->type = SETUP_DTB;
+ sd->len = dtb_len;
+
+ /* Carry over current boot DTB with setup_data */
+ memcpy(sd->data, initial_boot_params, dtb_len);
+
+ /* Add setup data */
+ setup_data_phys = params_load_addr + dtb_setup_data_offset;
+ sd->next = params->hdr.setup_data;
+ params->hdr.setup_data = setup_data_phys;
+}
+#endif /* CONFIG_OF_FLATTREE */
+
static void
setup_ima_state(const struct kimage *image, struct boot_params *params,
unsigned long params_load_addr,
@@ -336,6 +360,17 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params,
sizeof(struct efi_setup_data);
#endif
+#ifdef CONFIG_OF_FLATTREE
+ if (image->force_dtb && initial_boot_params) {
+ setup_dtb(params, params_load_addr, setup_data_offset);
+ setup_data_offset += sizeof(struct setup_data) +
+ fdt_totalsize(initial_boot_params);
+ } else {
+ pr_debug("Not carrying over DTB, force_dtb = %d\n",
+ image->force_dtb);
+ }
+#endif
+
if (IS_ENABLED(CONFIG_IMA_KEXEC)) {
/* Setup IMA log buffer state */
setup_ima_state(image, params, params_load_addr,
@@ -529,6 +564,12 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
sizeof(struct setup_data) +
RNG_SEED_LENGTH;
+#ifdef CONFIG_OF_FLATTREE
+ if (image->force_dtb && initial_boot_params)
+ kbuf.bufsz += sizeof(struct setup_data) +
+ fdt_totalsize(initial_boot_params);
+#endif
+
if (IS_ENABLED(CONFIG_IMA_KEXEC))
kbuf.bufsz += sizeof(struct setup_data) +
sizeof(struct ima_setup_data);
@@ -537,7 +578,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
kbuf.bufsz += sizeof(struct setup_data) +
sizeof(struct kho_data);
- params = kzalloc(kbuf.bufsz, GFP_KERNEL);
+ params = kvzalloc(kbuf.bufsz, GFP_KERNEL);
if (!params)
return ERR_PTR(-ENOMEM);
efi_map_offset = params_cmdline_sz;
@@ -647,7 +688,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
return ldata;
out_free_params:
- kfree(params);
+ kvfree(params);
return ERR_PTR(ret);
}
@@ -659,7 +700,7 @@ static int bzImage64_cleanup(void *loader_data)
if (!ldata)
return 0;
- kfree(ldata->bootparams_buf);
+ kvfree(ldata->bootparams_buf);
ldata->bootparams_buf = NULL;
return 0;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 8ae750cde0c6..b67d7c59dca0 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -190,7 +190,7 @@ static void apf_task_wake_all(void)
}
}
-void kvm_async_pf_task_wake(u32 token)
+static void kvm_async_pf_task_wake(u32 token)
{
u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
@@ -241,7 +241,6 @@ again:
/* A dummy token might be allocated and ultimately not used. */
kfree(dummy);
}
-EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
noinstr u32 kvm_read_and_reset_apf_flags(void)
{
@@ -933,6 +932,19 @@ static void kvm_sev_hc_page_enc_status(unsigned long pfn, int npages, bool enc)
static void __init kvm_init_platform(void)
{
+ u64 tolud = PFN_PHYS(e820__end_of_low_ram_pfn());
+ /*
+ * Note, hardware requires variable MTRR ranges to be power-of-2 sized
+ * and naturally aligned. But when forcing guest MTRR state, Linux
+ * doesn't program the forced ranges into hardware. Don't bother doing
+ * the math to generate a technically-legal range.
+ */
+ struct mtrr_var_range pci_hole = {
+ .base_lo = tolud | X86_MEMTYPE_UC,
+ .mask_lo = (u32)(~(SZ_4G - tolud - 1)) | MTRR_PHYSMASK_V,
+ .mask_hi = (BIT_ULL(boot_cpu_data.x86_phys_bits) - 1) >> 32,
+ };
+
if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) &&
kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) {
unsigned long nr_pages;
@@ -982,8 +994,12 @@ static void __init kvm_init_platform(void)
kvmclock_init();
x86_platform.apic_post_init = kvm_apic_init;
- /* Set WB as the default cache mode for SEV-SNP and TDX */
- guest_force_mtrr_state(NULL, 0, MTRR_TYPE_WRBACK);
+ /*
+ * Set WB as the default cache mode for SEV-SNP and TDX, with a single
+ * UC range for the legacy PCI hole, e.g. so that devices that expect
+ * to get UC/WC mappings don't get surprised with WB.
+ */
+ guest_force_mtrr_state(&pci_hole, 1, MTRR_TYPE_WRBACK);
}
#if defined(CONFIG_AMD_MEM_ENCRYPT)
@@ -1073,16 +1089,6 @@ static void kvm_wait(u8 *ptr, u8 val)
void __init kvm_spinlock_init(void)
{
/*
- * In case host doesn't support KVM_FEATURE_PV_UNHALT there is still an
- * advantage of keeping virt_spin_lock_key enabled: virt_spin_lock() is
- * preferred over native qspinlock when vCPU is preempted.
- */
- if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) {
- pr_info("PV spinlocks disabled, no host support\n");
- return;
- }
-
- /*
* Disable PV spinlocks and use native qspinlock when dedicated pCPUs
* are available.
*/
@@ -1101,6 +1107,16 @@ void __init kvm_spinlock_init(void)
goto out;
}
+ /*
+ * In case host doesn't support KVM_FEATURE_PV_UNHALT there is still an
+ * advantage of keeping virt_spin_lock_key enabled: virt_spin_lock() is
+ * preferred over native qspinlock when vCPU is preempted.
+ */
+ if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) {
+ pr_info("PV spinlocks disabled, no host support\n");
+ return;
+ }
+
pr_info("PV spinlocks enabled\n");
__pv_init_lock_hash();
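
The base/mask arithmetic behind the forced UC range above is easier to verify with concrete numbers. Assuming a hypothetical TOLUD of 3 GiB and 52 physical address bits (both values are illustrative, not taken from the patch), the sketch below reproduces the computation of the variable-range register fields, minus the X86_MEMTYPE_UC and MTRR_PHYSMASK_V bits that are OR'd in:

        #include <stdint.h>
        #include <stdio.h>

        #define SZ_4G 0x100000000ULL

        int main(void)
        {
                uint64_t tolud = 0xc0000000ULL; /* hypothetical end of low RAM: 3 GiB */
                int phys_bits = 52;             /* hypothetical x86_phys_bits */

                uint32_t base_lo = (uint32_t)tolud;                /* | X86_MEMTYPE_UC */
                uint32_t mask_lo = (uint32_t)~(SZ_4G - tolud - 1); /* | MTRR_PHYSMASK_V */
                uint32_t mask_hi = (uint32_t)(((1ULL << phys_bits) - 1) >> 32);

                /* Covers [3 GiB, 4 GiB): 0xc0000000 / 0xc0000000 / 0x000fffff */
                printf("base_lo=0x%08x mask_lo=0x%08x mask_hi=0x%08x\n",
                       base_lo, mask_lo, mask_hi);
                return 0;
        }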
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 697fb99406e6..15088d14904f 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -29,6 +29,7 @@
#include <asm/set_memory.h>
#include <asm/cpu.h>
#include <asm/efi.h>
+#include <asm/processor.h>
#ifdef CONFIG_ACPI
/*
@@ -346,6 +347,22 @@ int machine_kexec_prepare(struct kimage *image)
unsigned long reloc_end = (unsigned long)__relocate_kernel_end;
int result;
+ /*
+ * Some early TDX-capable platforms have an erratum. A kernel
+ * partial write (a write transaction of less than a cacheline that
+ * lands at the memory controller) to TDX private memory poisons that
+ * memory, and a subsequent read triggers a machine check.
+ *
+ * On those platforms the old kernel must reset TDX private
+ * memory before jumping to the new kernel, otherwise the new
+ * kernel may see unexpected machine checks. For simplicity,
+ * just fail kexec/kdump on those platforms.
+ */
+ if (boot_cpu_has_bug(X86_BUG_TDX_PW_MCE)) {
+ pr_info_once("Not allowed on platform with tdx_pw_mce bug\n");
+ return -EOPNOTSUPP;
+ }
+
/* Setup the identity mapped 64bit page table */
result = init_pgtable(image, __pa(control_page));
if (result)
@@ -384,16 +401,10 @@ void __nocfi machine_kexec(struct kimage *image)
{
unsigned long reloc_start = (unsigned long)__relocate_kernel_start;
relocate_kernel_fn *relocate_kernel_ptr;
- unsigned int host_mem_enc_active;
+ unsigned int relocate_kernel_flags;
int save_ftrace_enabled;
void *control_page;
- /*
- * This must be done before load_segments() since if call depth tracking
- * is used then GS must be valid to make any function calls.
- */
- host_mem_enc_active = cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT);
-
#ifdef CONFIG_KEXEC_JUMP
if (image->preserve_context)
save_processor_state();
@@ -427,6 +438,17 @@ void __nocfi machine_kexec(struct kimage *image)
*/
relocate_kernel_ptr = control_page + (unsigned long)relocate_kernel - reloc_start;
+ relocate_kernel_flags = 0;
+ if (image->preserve_context)
+ relocate_kernel_flags |= RELOC_KERNEL_PRESERVE_CONTEXT;
+
+ /*
+ * This must be done before load_segments() since it resets
+ * GS to 0 and percpu data needs the correct GS to work.
+ */
+ if (this_cpu_read(cache_state_incoherent))
+ relocate_kernel_flags |= RELOC_KERNEL_CACHE_INCOHERENT;
+
/*
* The segment registers are funny things, they have both a
* visible and an invisible part. Whenever the visible part is
@@ -436,6 +458,11 @@ void __nocfi machine_kexec(struct kimage *image)
*
* Take advantage of this here by force loading the segments,
* before the GDT is zapped with an invalid value.
+ *
+ * load_segments() resets GS to 0. Don't make any function call
+ * after here since call depth tracking uses percpu variables to
+ * operate (relocate_kernel() is explicitly ignored by call depth
+ * tracking).
*/
load_segments();
@@ -443,8 +470,7 @@ void __nocfi machine_kexec(struct kimage *image)
image->start = relocate_kernel_ptr((unsigned long)image->head,
virt_to_phys(control_page),
image->start,
- image->preserve_context,
- host_mem_enc_active);
+ relocate_kernel_flags);
#ifdef CONFIG_KEXEC_JUMP
if (image->preserve_context)
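
Collapsing the two scalar arguments into one flags word is the pattern worth noting here; a minimal sketch of how the caller now composes it (mirroring the hunk above, with the BIT() values taken from the kexec.h change earlier in this diff; illustrative only):

        #include <stdbool.h>

        #define BIT(n)                          (1U << (n))
        #define RELOC_KERNEL_PRESERVE_CONTEXT   BIT(0)
        #define RELOC_KERNEL_CACHE_INCOHERENT   BIT(1)

        /* Mirrors how machine_kexec() now builds the single 'flags' argument. */
        static unsigned int build_reloc_flags(bool preserve_context,
                                              bool cache_incoherent)
        {
                unsigned int flags = 0;

                if (preserve_context)
                        flags |= RELOC_KERNEL_PRESERVE_CONTEXT;
                /* Sampled via percpu data, hence before load_segments() resets GS. */
                if (cache_incoherent)
                        flags |= RELOC_KERNEL_CACHE_INCOHERENT;

                return flags;
        }

        int main(void)
        {
                return (int)build_reloc_flags(true, false); /* 0x1 */
        }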
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e3a3987b0c4f..4c718f8adc59 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -89,6 +89,16 @@ DEFINE_PER_CPU(bool, __tss_limit_invalid);
EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);
/*
+ * The cache may be in an incoherent state and needs flushing during kexec.
+ * E.g., on SME/TDX platforms, dirty cacheline aliases with and without
+ * encryption bit(s) can coexist and the cache needs to be flushed before
+ * booting to the new kernel to avoid the silent memory corruption due to
+ * dirty cachelines with different encryption property being written back
+ * to the memory.
+ */
+DEFINE_PER_CPU(bool, cache_state_incoherent);
+
+/*
* this gets called so that we can store lazy state into memory and copy the
* current task into the new thread.
*/
@@ -827,19 +837,7 @@ void __noreturn stop_this_cpu(void *dummy)
disable_local_APIC();
mcheck_cpu_clear(c);
- /*
- * Use wbinvd on processors that support SME. This provides support
- * for performing a successful kexec when going from SME inactive
- * to SME active (or vice-versa). The cache must be cleared so that
- * if there are entries with the same physical address, both with and
- * without the encryption bit, they don't race each other when flushed
- * and potentially end up with the wrong entry being committed to
- * memory.
- *
- * Test the CPUID bit directly because the machine might've cleared
- * X86_FEATURE_SME due to cmdline options.
- */
- if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0)))
+ if (this_cpu_read(cache_state_incoherent))
wbinvd();
/*
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index ea604f4d0b52..11e20bb13aca 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -66,8 +66,7 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
* %rdi indirection_page
* %rsi pa_control_page
* %rdx start address
- * %rcx preserve_context
- * %r8 host_mem_enc_active
+ * %rcx flags: RELOC_KERNEL_*
*/
/* Save the CPU context, used for jumping back */
@@ -111,7 +110,7 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
/* save indirection list for jumping back */
movq %rdi, pa_backup_pages_map(%rip)
- /* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */
+ /* Save the flags to %r11 as swap_pages clobbers %rcx. */
movq %rcx, %r11
/* setup a new stack at the end of the physical control page */
@@ -129,9 +128,8 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
/*
* %rdi indirection page
* %rdx start address
- * %r8 host_mem_enc_active
* %r9 page table page
- * %r11 preserve_context
+ * %r11 flags: RELOC_KERNEL_*
* %r13 original CR4 when relocate_kernel() was invoked
*/
@@ -200,14 +198,21 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
movq %r9, %cr3
/*
+ * If the memory cache is in an incoherent state, e.g., due to
+ * memory encryption, do WBINVD to flush cache.
+ *
* If SME is active, there could be old encrypted cache line
* entries that will conflict with the now unencrypted memory
* used by kexec. Flush the caches before copying the kernel.
+ *
+ * Note SME sets this flag to true when the platform supports
+ * SME, so the WBINVD is performed even when SME is not activated
+ * by the kernel. But this does no harm.
*/
- testq %r8, %r8
- jz .Lsme_off
+ testb $RELOC_KERNEL_CACHE_INCOHERENT, %r11b
+ jz .Lnowbinvd
wbinvd
-.Lsme_off:
+.Lnowbinvd:
call swap_pages
@@ -220,7 +225,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
movq %cr3, %rax
movq %rax, %cr3
- testq %r11, %r11 /* preserve_context */
+ testb $RELOC_KERNEL_PRESERVE_CONTEXT, %r11b
jnz .Lrelocate
/*
@@ -273,7 +278,13 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
ANNOTATE_NOENDBR
andq $PAGE_MASK, %r8
lea PAGE_SIZE(%r8), %rsp
- movl $1, %r11d /* Ensure preserve_context flag is set */
+ /*
+ * Ensure RELOC_KERNEL_PRESERVE_CONTEXT flag is set so that
+ * swap_pages() can swap pages correctly. Note all other
+ * RELOC_KERNEL_* flags passed to relocate_kernel() are not
+ * restored.
+ */
+ movl $RELOC_KERNEL_PRESERVE_CONTEXT, %r11d
call swap_pages
movq kexec_va_control_page(%rip), %rax
0: addq $virtual_mapped - 0b, %rax
@@ -321,7 +332,7 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
UNWIND_HINT_END_OF_STACK
/*
* %rdi indirection page
- * %r11 preserve_context
+ * %r11 flags: RELOC_KERNEL_*
*/
movq %rdi, %rcx /* Put the indirection_page in %rcx */
xorl %edi, %edi
@@ -357,7 +368,8 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
movq %rdi, %rdx /* Save destination page to %rdx */
movq %rsi, %rax /* Save source page to %rax */
- testq %r11, %r11 /* Only actually swap for ::preserve_context */
+ /* Only actually swap for ::preserve_context */
+ testb $RELOC_KERNEL_PRESERVE_CONTEXT, %r11b
jz .Lnoswap
/* copy source page to swap page */
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 2c86673155c9..4e43923656d0 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -46,8 +46,8 @@ config KVM_X86
select HAVE_KVM_PM_NOTIFIER if PM
select KVM_GENERIC_HARDWARE_ENABLING
select KVM_GENERIC_PRE_FAULT_MEMORY
- select KVM_GENERIC_PRIVATE_MEM if KVM_SW_PROTECTED_VM
select KVM_WERROR if WERROR
+ select KVM_GUEST_MEMFD if X86_64
config KVM
tristate "Kernel-based Virtual Machine (KVM) support"
@@ -74,7 +74,7 @@ config KVM_WERROR
# FRAME_WARN, i.e. KVM_WERROR=y with KASAN=y requires special tuning.
# Building KVM with -Werror and KASAN is still doable via enabling
# the kernel-wide WERROR=y.
- depends on KVM && ((EXPERT && !KASAN) || WERROR)
+ depends on KVM_X86 && ((EXPERT && !KASAN) || WERROR)
help
Add -Werror to the build flags for KVM.
@@ -83,7 +83,8 @@ config KVM_WERROR
config KVM_SW_PROTECTED_VM
bool "Enable support for KVM software-protected VMs"
depends on EXPERT
- depends on KVM && X86_64
+ depends on KVM_X86 && X86_64
+ select KVM_GENERIC_MEMORY_ATTRIBUTES
help
Enable support for KVM software-protected VMs. Currently, software-
protected VMs are purely a development and testing vehicle for
@@ -95,8 +96,6 @@ config KVM_SW_PROTECTED_VM
config KVM_INTEL
tristate "KVM for Intel (and compatible) processors support"
depends on KVM && IA32_FEAT_CTL
- select KVM_GENERIC_PRIVATE_MEM if INTEL_TDX_HOST
- select KVM_GENERIC_MEMORY_ATTRIBUTES if INTEL_TDX_HOST
help
Provides support for KVM on processors equipped with Intel's VT
extensions, a.k.a. Virtual Machine Extensions (VMX).
@@ -135,6 +134,8 @@ config KVM_INTEL_TDX
bool "Intel Trust Domain Extensions (TDX) support"
default y
depends on INTEL_TDX_HOST
+ select KVM_GENERIC_MEMORY_ATTRIBUTES
+ select HAVE_KVM_ARCH_GMEM_POPULATE
help
Provides support for launching Intel Trust Domain Extensions (TDX)
confidential VMs on Intel processors.
@@ -157,9 +158,10 @@ config KVM_AMD_SEV
depends on KVM_AMD && X86_64
depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m)
select ARCH_HAS_CC_PLATFORM
- select KVM_GENERIC_PRIVATE_MEM
+ select KVM_GENERIC_MEMORY_ATTRIBUTES
select HAVE_KVM_ARCH_GMEM_PREPARE
select HAVE_KVM_ARCH_GMEM_INVALIDATE
+ select HAVE_KVM_ARCH_GMEM_POPULATE
help
Provides support for launching encrypted VMs which use Secure
Encrypted Virtualization (SEV), Secure Encrypted Virtualization with
@@ -169,7 +171,7 @@ config KVM_AMD_SEV
config KVM_IOAPIC
bool "I/O APIC, PIC, and PIT emulation"
default y
- depends on KVM
+ depends on KVM_X86
help
Provides support for KVM to emulate an I/O APIC, PIC, and PIT, i.e.
for full in-kernel APIC emulation.
@@ -179,7 +181,7 @@ config KVM_IOAPIC
config KVM_SMM
bool "System Management Mode emulation"
default y
- depends on KVM
+ depends on KVM_X86
help
Provides support for KVM to emulate System Management Mode (SMM)
in virtual machines. This can be used by the virtual machine
@@ -189,7 +191,7 @@ config KVM_SMM
config KVM_HYPERV
bool "Support for Microsoft Hyper-V emulation"
- depends on KVM
+ depends on KVM_X86
default y
help
Provides KVM support for emulating Microsoft Hyper-V. This allows KVM
@@ -203,7 +205,7 @@ config KVM_HYPERV
config KVM_XEN
bool "Support for Xen hypercall interface"
- depends on KVM
+ depends on KVM_X86
help
Provides KVM support for the hosting Xen HVM guests and
passing Xen hypercalls to userspace.
@@ -213,7 +215,7 @@ config KVM_XEN
config KVM_PROVE_MMU
bool "Prove KVM MMU correctness"
depends on DEBUG_KERNEL
- depends on KVM
+ depends on KVM_X86
depends on EXPERT
help
Enables runtime assertions in KVM's MMU that are too costly to enable
@@ -228,7 +230,7 @@ config KVM_EXTERNAL_WRITE_TRACKING
config KVM_MAX_NR_VCPUS
int "Maximum number of vCPUs per KVM guest"
- depends on KVM
+ depends on KVM_X86
range 1024 4096
default 4096 if MAXSMP
default 1024
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 6e838cb6c9e1..56c80588efa0 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3285,12 +3285,72 @@ out:
return level;
}
-static int __kvm_mmu_max_mapping_level(struct kvm *kvm,
- const struct kvm_memory_slot *slot,
- gfn_t gfn, int max_level, bool is_private)
+static u8 kvm_max_level_for_order(int order)
+{
+ BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G);
+
+ KVM_MMU_WARN_ON(order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G) &&
+ order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) &&
+ order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K));
+
+ if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G))
+ return PG_LEVEL_1G;
+
+ if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
+ return PG_LEVEL_2M;
+
+ return PG_LEVEL_4K;
+}
+
+static u8 kvm_gmem_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault,
+ const struct kvm_memory_slot *slot, gfn_t gfn,
+ bool is_private)
+{
+ u8 max_level, coco_level;
+ kvm_pfn_t pfn;
+
+ /* For faults, use the gmem information that was resolved earlier. */
+ if (fault) {
+ pfn = fault->pfn;
+ max_level = fault->max_level;
+ } else {
+ /* TODO: Call into guest_memfd once hugepages are supported. */
+ WARN_ONCE(1, "Get pfn+order from guest_memfd");
+ pfn = KVM_PFN_ERR_FAULT;
+ max_level = PG_LEVEL_4K;
+ }
+
+ if (max_level == PG_LEVEL_4K)
+ return max_level;
+
+ /*
+ * CoCo may influence the max mapping level, e.g. due to RMP or S-EPT
+ * restrictions. A return of '0' means "no additional restrictions", to
+ * allow for using an optional "ret0" static call.
+ */
+ coco_level = kvm_x86_call(gmem_max_mapping_level)(kvm, pfn, is_private);
+ if (coco_level)
+ max_level = min(max_level, coco_level);
+
+ return max_level;
+}
+
+int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault,
+ const struct kvm_memory_slot *slot, gfn_t gfn)
{
struct kvm_lpage_info *linfo;
- int host_level;
+ int host_level, max_level;
+ bool is_private;
+
+ lockdep_assert_held(&kvm->mmu_lock);
+
+ if (fault) {
+ max_level = fault->max_level;
+ is_private = fault->is_private;
+ } else {
+ max_level = PG_LEVEL_NUM;
+ is_private = kvm_mem_is_private(kvm, gfn);
+ }
max_level = min(max_level, max_huge_page_level);
for ( ; max_level > PG_LEVEL_4K; max_level--) {
@@ -3299,25 +3359,17 @@ static int __kvm_mmu_max_mapping_level(struct kvm *kvm,
break;
}
- if (is_private)
- return max_level;
-
if (max_level == PG_LEVEL_4K)
return PG_LEVEL_4K;
- host_level = host_pfn_mapping_level(kvm, gfn, slot);
+ if (is_private || kvm_memslot_is_gmem_only(slot))
+ host_level = kvm_gmem_max_mapping_level(kvm, fault, slot, gfn,
+ is_private);
+ else
+ host_level = host_pfn_mapping_level(kvm, gfn, slot);
return min(host_level, max_level);
}
-int kvm_mmu_max_mapping_level(struct kvm *kvm,
- const struct kvm_memory_slot *slot, gfn_t gfn)
-{
- bool is_private = kvm_slot_can_be_private(slot) &&
- kvm_mem_is_private(kvm, gfn);
-
- return __kvm_mmu_max_mapping_level(kvm, slot, gfn, PG_LEVEL_NUM, is_private);
-}
-
void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
struct kvm_memory_slot *slot = fault->slot;
@@ -3338,9 +3390,8 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
* Enforce the iTLB multihit workaround after capturing the requested
* level, which will be used to do precise, accurate accounting.
*/
- fault->req_level = __kvm_mmu_max_mapping_level(vcpu->kvm, slot,
- fault->gfn, fault->max_level,
- fault->is_private);
+ fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, fault,
+ fault->slot, fault->gfn);
if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
return;
@@ -4503,42 +4554,6 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
vcpu->stat.pf_fixed++;
}
-static inline u8 kvm_max_level_for_order(int order)
-{
- BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G);
-
- KVM_MMU_WARN_ON(order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G) &&
- order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) &&
- order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K));
-
- if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G))
- return PG_LEVEL_1G;
-
- if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
- return PG_LEVEL_2M;
-
- return PG_LEVEL_4K;
-}
-
-static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
- u8 max_level, int gmem_order)
-{
- u8 req_max_level;
-
- if (max_level == PG_LEVEL_4K)
- return PG_LEVEL_4K;
-
- max_level = min(kvm_max_level_for_order(gmem_order), max_level);
- if (max_level == PG_LEVEL_4K)
- return PG_LEVEL_4K;
-
- req_max_level = kvm_x86_call(private_max_mapping_level)(kvm, pfn);
- if (req_max_level)
- max_level = min(max_level, req_max_level);
-
- return max_level;
-}
-
static void kvm_mmu_finish_page_fault(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault, int r)
{
@@ -4546,12 +4561,12 @@ static void kvm_mmu_finish_page_fault(struct kvm_vcpu *vcpu,
r == RET_PF_RETRY, fault->map_writable);
}
-static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu,
- struct kvm_page_fault *fault)
+static int kvm_mmu_faultin_pfn_gmem(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
{
int max_order, r;
- if (!kvm_slot_can_be_private(fault->slot)) {
+ if (!kvm_slot_has_gmem(fault->slot)) {
kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
return -EFAULT;
}
@@ -4564,8 +4579,7 @@ static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu,
}
fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY);
- fault->max_level = kvm_max_private_mapping_level(vcpu->kvm, fault->pfn,
- fault->max_level, max_order);
+ fault->max_level = kvm_max_level_for_order(max_order);
return RET_PF_CONTINUE;
}
@@ -4575,8 +4589,8 @@ static int __kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
{
unsigned int foll = fault->write ? FOLL_WRITE : 0;
- if (fault->is_private)
- return kvm_mmu_faultin_pfn_private(vcpu, fault);
+ if (fault->is_private || kvm_memslot_is_gmem_only(fault->slot))
+ return kvm_mmu_faultin_pfn_gmem(vcpu, fault);
foll |= FOLL_NOWAIT;
fault->pfn = __kvm_faultin_pfn(fault->slot, fault->gfn, foll,
@@ -7165,7 +7179,7 @@ restart:
* mapping if the indirect sp has level = 1.
*/
if (sp->role.direct &&
- sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn)) {
+ sp->role.level < kvm_mmu_max_mapping_level(kvm, NULL, slot, sp->gfn)) {
kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
if (kvm_available_flush_remote_tlbs_range())
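
For orientation, KVM_HPAGE_GFN_SHIFT() evaluates to 0, 9 and 18 for 4K, 2M and 1G pages (512 4K pages per 2M, 262144 per 1G), so the relocated kvm_max_level_for_order() boils down to a threshold check; a standalone sketch with those shifts written out (illustrative only):

        #include <stdio.h>

        enum pg_level { PG_LEVEL_4K = 1, PG_LEVEL_2M, PG_LEVEL_1G };

        /* GFN shifts: 4K -> 0, 2M -> 9, 1G -> 18. */
        static enum pg_level max_level_for_order(int order)
        {
                if (order >= 18)
                        return PG_LEVEL_1G;
                if (order >= 9)
                        return PG_LEVEL_2M;
                return PG_LEVEL_4K;
        }

        int main(void)
        {
                /*
                 * A guest_memfd folio of order 9 (2 MiB) allows at most a 2M
                 * mapping; the CoCo backend may only restrict this further
                 * (SEV-SNP via the RMP, TDX currently forcing PG_LEVEL_4K).
                 */
                printf("%d\n", max_level_for_order(9)); /* 2 == PG_LEVEL_2M */
                return 0;
        }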
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 65f3c89d7c5d..b776be783a2f 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -411,7 +411,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
return r;
}
-int kvm_mmu_max_mapping_level(struct kvm *kvm,
+int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_page_fault *fault,
const struct kvm_memory_slot *slot, gfn_t gfn);
void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 7f3d7229b2c1..740cb06accdb 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1813,7 +1813,7 @@ retry:
if (iter.gfn < start || iter.gfn >= end)
continue;
- max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, iter.gfn);
+ max_mapping_level = kvm_mmu_max_mapping_level(kvm, NULL, slot, iter.gfn);
if (max_mapping_level < iter.level)
continue;
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 0635bd71c10e..5bac4d20aec0 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -2361,7 +2361,7 @@ static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp)
mutex_lock(&kvm->slots_lock);
memslot = gfn_to_memslot(kvm, params.gfn_start);
- if (!kvm_slot_can_be_private(memslot)) {
+ if (!kvm_slot_has_gmem(memslot)) {
ret = -EINVAL;
goto out;
}
@@ -4715,7 +4715,7 @@ void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code)
}
slot = gfn_to_memslot(kvm, gfn);
- if (!kvm_slot_can_be_private(slot)) {
+ if (!kvm_slot_has_gmem(slot)) {
pr_warn_ratelimited("SEV: Unexpected RMP fault, non-private slot for GPA 0x%llx\n",
gpa);
return;
@@ -4943,7 +4943,7 @@ next_pfn:
}
}
-int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
+int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private)
{
int level, rc;
bool assigned;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 1bfebe40854f..3a9fe0a8b78c 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -5179,7 +5179,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.gmem_prepare = sev_gmem_prepare,
.gmem_invalidate = sev_gmem_invalidate,
- .private_max_mapping_level = sev_private_max_mapping_level,
+ .gmem_max_mapping_level = sev_gmem_max_mapping_level,
};
/*
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 58b9d168e0c8..70df7c6413cf 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -866,7 +866,7 @@ void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code);
void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu);
int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order);
void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end);
-int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn);
+int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private);
struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu);
void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa);
#else
@@ -895,7 +895,7 @@ static inline int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, in
return 0;
}
static inline void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) {}
-static inline int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
+static inline int sev_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private)
{
return 0;
}
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index dbab1c15b0cd..bb5f182f6788 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -831,10 +831,11 @@ static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
return tdx_vcpu_ioctl(vcpu, argp);
}
-static int vt_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
+static int vt_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
+ bool is_private)
{
if (is_td(kvm))
- return tdx_gmem_private_max_mapping_level(kvm, pfn);
+ return tdx_gmem_max_mapping_level(kvm, pfn, is_private);
return 0;
}
@@ -1005,7 +1006,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.mem_enc_ioctl = vt_op_tdx_only(mem_enc_ioctl),
.vcpu_mem_enc_ioctl = vt_op_tdx_only(vcpu_mem_enc_ioctl),
- .private_max_mapping_level = vt_op_tdx_only(gmem_private_max_mapping_level)
+ .gmem_max_mapping_level = vt_op_tdx_only(gmem_max_mapping_level)
};
struct kvm_x86_init_ops vt_init_ops __initdata = {
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 66744f5768c8..00f8bfd2330d 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -281,25 +281,6 @@ static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
vcpu->cpu = -1;
}
-static void tdx_clear_page(struct page *page)
-{
- const void *zero_page = (const void *) page_to_virt(ZERO_PAGE(0));
- void *dest = page_to_virt(page);
- unsigned long i;
-
- /*
- * The page could have been poisoned. MOVDIR64B also clears
- * the poison bit so the kernel can safely use the page again.
- */
- for (i = 0; i < PAGE_SIZE; i += 64)
- movdir64b(dest + i, zero_page);
- /*
- * MOVDIR64B store uses WC buffer. Prevent following memory reads
- * from seeing potentially poisoned cache.
- */
- __mb();
-}
-
static void tdx_no_vcpus_enter_start(struct kvm *kvm)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
@@ -345,7 +326,7 @@ static int tdx_reclaim_page(struct page *page)
r = __tdx_reclaim_page(page);
if (!r)
- tdx_clear_page(page);
+ tdx_quirk_reset_page(page);
return r;
}
@@ -442,6 +423,16 @@ void tdx_disable_virtualization_cpu(void)
tdx_flush_vp(&arg);
}
local_irq_restore(flags);
+
+ /*
+ * Flush cache now if kexec is possible: this is necessary to avoid
+ * having dirty private memory cachelines when the new kernel boots,
+ * but WBINVD is a relatively expensive operation and doing it during
+ * kexec can exacerbate races in native_stop_other_cpus(). Do it
+ * now, since this is a safe moment and there is going to be no more
+ * TDX activity on this CPU from this point on.
+ */
+ tdx_cpu_flush_cache_for_kexec();
}
#define TDX_SEAMCALL_RETRIES 10000
@@ -593,7 +584,7 @@ static void tdx_reclaim_td_control_pages(struct kvm *kvm)
pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
return;
}
- tdx_clear_page(kvm_tdx->td.tdr_page);
+ tdx_quirk_reset_page(kvm_tdx->td.tdr_page);
__free_page(kvm_tdx->td.tdr_page);
kvm_tdx->td.tdr_page = NULL;
@@ -861,6 +852,7 @@ void tdx_vcpu_free(struct kvm_vcpu *vcpu)
if (tdx->vp.tdvpr_page) {
tdx_reclaim_control_page(tdx->vp.tdvpr_page);
tdx->vp.tdvpr_page = 0;
+ tdx->vp.tdvpr_pa = 0;
}
tdx->state = VCPU_TD_STATE_UNINITIALIZED;
@@ -1714,7 +1706,7 @@ static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
return -EIO;
}
- tdx_clear_page(page);
+ tdx_quirk_reset_page(page);
tdx_unpin(kvm, page);
return 0;
}
@@ -2940,6 +2932,13 @@ static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
return -ENOMEM;
tdx->vp.tdvpr_page = page;
+ /*
+ * page_to_phys() does not work in 'noinstr' code, like guest
+ * entry via tdh_vp_enter(). Precalculate and store it instead
+ * of doing it at runtime later.
+ */
+ tdx->vp.tdvpr_pa = page_to_phys(tdx->vp.tdvpr_page);
+
tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages),
GFP_KERNEL);
if (!tdx->vp.tdcx_pages) {
@@ -3002,6 +3001,7 @@ free_tdvpr:
if (tdx->vp.tdvpr_page)
__free_page(tdx->vp.tdvpr_page);
tdx->vp.tdvpr_page = 0;
+ tdx->vp.tdvpr_pa = 0;
return ret;
}
@@ -3318,8 +3318,11 @@ int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
return ret;
}
-int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
+int tdx_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private)
{
+ if (!is_private)
+ return 0;
+
return PG_LEVEL_4K;
}
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index aa157fe5b7b3..0bdf9405969a 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5785,6 +5785,13 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
if (kvm_test_request(KVM_REQ_EVENT, vcpu))
return 1;
+ /*
+ * Ensure that any updates to kvm->buses[] observed by the
+ * previous instruction (emulated or otherwise) are also
+ * visible to the instruction KVM is about to emulate.
+ */
+ smp_rmb();
+
if (!kvm_emulate_instruction(vcpu, 0))
return 0;
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index 2b3424f638db..4c70f56c57c8 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -153,7 +153,7 @@ int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
void tdx_flush_tlb_current(struct kvm_vcpu *vcpu);
void tdx_flush_tlb_all(struct kvm_vcpu *vcpu);
void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
-int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn);
+int tdx_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private);
#endif
#endif /* __KVM_X86_VMX_X86_OPS_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 706b6fd56d3c..f122906ed9f3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -13530,6 +13530,16 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
+#ifdef CONFIG_KVM_GUEST_MEMFD
+/*
+ * KVM doesn't yet support mmap() on guest_memfd for VMs with private memory
+ * (the private vs. shared tracking needs to be moved into guest_memfd).
+ */
+bool kvm_arch_supports_gmem_mmap(struct kvm *kvm)
+{
+ return !kvm_arch_has_private_mem(kvm);
+}
+
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order)
{
@@ -13543,6 +13553,7 @@ void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
kvm_x86_call(gmem_invalidate)(start, end);
}
#endif
+#endif
int kvm_spec_ctrl_test_value(u64 value)
{
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index bb57e93b4caf..8bf6ad4b9400 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -34,6 +34,7 @@
* We need to define the tracepoints somewhere, and tlb.c
* is only compiled when SMP=y.
*/
+#define CREATE_TRACE_POINTS
#include <trace/events/tlb.h>
#include "mm_internal.h"
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index b9426fce5f3e..0e4270e20fad 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1031,7 +1031,7 @@ static void __meminit free_pagetable(struct page *page, int order)
free_reserved_pages(page, nr_pages);
#endif
} else {
- free_pages((unsigned long)page_address(page), order);
+ __free_pages(page, order);
}
}
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 0539efd0d216..998b6010d6d3 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -451,5 +451,5 @@ void __init kasan_init(void)
__flush_tlb_all();
init_task.kasan_depth = 0;
- pr_info("KernelAddressSanitizer initialized\n");
+ kasan_init_generic();
}
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 5ed2109211da..82f3a987f7cf 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -80,7 +80,7 @@ unsigned long arch_mmap_rnd(void)
}
static unsigned long mmap_base(unsigned long rnd, unsigned long task_size,
- struct rlimit *rlim_stack)
+ const struct rlimit *rlim_stack)
{
unsigned long gap = rlim_stack->rlim_cur;
unsigned long pad = stack_maxrandom_size(task_size) + stack_guard_gap;
@@ -110,7 +110,7 @@ static unsigned long mmap_legacy_base(unsigned long rnd,
*/
static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base,
unsigned long random_factor, unsigned long task_size,
- struct rlimit *rlim_stack)
+ const struct rlimit *rlim_stack)
{
*legacy_base = mmap_legacy_base(random_factor, task_size);
if (mmap_is_legacy())
@@ -119,12 +119,12 @@ static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base,
*base = mmap_base(random_factor, task_size, rlim_stack);
}
-void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
+void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack)
{
if (mmap_is_legacy())
- clear_bit(MMF_TOPDOWN, &mm->flags);
+ mm_flags_clear(MMF_TOPDOWN, mm);
else
- set_bit(MMF_TOPDOWN, &mm->flags);
+ mm_flags_set(MMF_TOPDOWN, mm);
arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base,
arch_rnd(mmap64_rnd_bits), task_size_64bit(0),
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index c09284302dd3..b68200a0e0c6 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -126,7 +126,7 @@ __setup("debugpat", pat_debug_setup);
static inline enum page_cache_mode get_page_memtype(struct page *pg)
{
- unsigned long pg_flags = pg->flags & _PGMT_MASK;
+ unsigned long pg_flags = pg->flags.f & _PGMT_MASK;
if (pg_flags == _PGMT_WB)
return _PAGE_CACHE_MODE_WB;
@@ -161,10 +161,10 @@ static inline void set_page_memtype(struct page *pg,
break;
}
- old_flags = READ_ONCE(pg->flags);
+ old_flags = READ_ONCE(pg->flags.f);
do {
new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags;
- } while (!try_cmpxchg(&pg->flags, &old_flags, new_flags));
+ } while (!try_cmpxchg(&pg->flags.f, &old_flags, new_flags));
}
#else
static inline enum page_cache_mode get_page_memtype(struct page *pg)
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 8834c76f91c9..d2d54b8c4dbb 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -399,15 +399,6 @@ static void cpa_flush_all(unsigned long cache)
on_each_cpu(__cpa_flush_all, (void *) cache, 1);
}
-static void __cpa_flush_tlb(void *data)
-{
- struct cpa_data *cpa = data;
- unsigned int i;
-
- for (i = 0; i < cpa->numpages; i++)
- flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i)));
-}
-
static int collapse_large_pages(unsigned long addr, struct list_head *pgtables);
static void cpa_collapse_large_pages(struct cpa_data *cpa)
@@ -444,6 +435,7 @@ static void cpa_collapse_large_pages(struct cpa_data *cpa)
static void cpa_flush(struct cpa_data *cpa, int cache)
{
+ unsigned long start, end;
unsigned int i;
BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
@@ -453,10 +445,12 @@ static void cpa_flush(struct cpa_data *cpa, int cache)
goto collapse_large_pages;
}
- if (cpa->force_flush_all || cpa->numpages > tlb_single_page_flush_ceiling)
- flush_tlb_all();
- else
- on_each_cpu(__cpa_flush_tlb, cpa, 1);
+ start = fix_addr(__cpa_addr(cpa, 0));
+ end = fix_addr(__cpa_addr(cpa, cpa->numpages));
+ if (cpa->force_flush_all)
+ end = TLB_FLUSH_ALL;
+
+ flush_tlb_kernel_range(start, end);
if (!cache)
goto collapse_large_pages;
diff --git a/arch/x86/platform/efi/memmap.c b/arch/x86/platform/efi/memmap.c
index 061b8ecc71a1..023697c88910 100644
--- a/arch/x86/platform/efi/memmap.c
+++ b/arch/x86/platform/efi/memmap.c
@@ -42,7 +42,7 @@ void __init __efi_memmap_free(u64 phys, unsigned long size, unsigned long flags)
struct page *p = pfn_to_page(PHYS_PFN(phys));
unsigned int order = get_order(size);
- free_pages((unsigned long) page_address(p), order);
+ __free_pages(p, order);
}
}
diff --git a/arch/x86/video/video-common.c b/arch/x86/video/video-common.c
index 81fc97a2a837..e0aeee99bc99 100644
--- a/arch/x86/video/video-common.c
+++ b/arch/x86/video/video-common.c
@@ -9,6 +9,7 @@
#include <linux/module.h>
#include <linux/pci.h>
+#include <linux/screen_info.h>
#include <linux/vgaarb.h>
#include <asm/video.h>
@@ -27,6 +28,11 @@ EXPORT_SYMBOL(pgprot_framebuffer);
bool video_is_primary_device(struct device *dev)
{
+#ifdef CONFIG_SCREEN_INFO
+ struct screen_info *si = &screen_info;
+ struct resource res[SCREEN_INFO_MAX_RESOURCES];
+ ssize_t i, numres;
+#endif
struct pci_dev *pdev;
if (!dev_is_pci(dev))
@@ -34,7 +40,24 @@ bool video_is_primary_device(struct device *dev)
pdev = to_pci_dev(dev);
- return (pdev == vga_default_device());
+ if (!pci_is_display(pdev))
+ return false;
+
+ if (pdev == vga_default_device())
+ return true;
+
+#ifdef CONFIG_SCREEN_INFO
+ numres = screen_info_resources(si, res, ARRAY_SIZE(res));
+ for (i = 0; i < numres; ++i) {
+ if (!(res[i].flags & IORESOURCE_MEM))
+ continue;
+
+ if (pci_find_resource(pdev, &res[i]))
+ return true;
+ }
+#endif
+
+ return false;
}
EXPORT_SYMBOL(video_is_primary_device);
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index c7a9a087ccaf..eac403248462 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -633,15 +633,19 @@ err:
}
/*
- * Convert TDX private pages back to normal by using MOVDIR64B to
- * clear these pages. Note this function doesn't flush cache of
- * these TDX private pages. The caller should make sure of that.
+ * Convert TDX private pages back to normal by using MOVDIR64B to clear these
+ * pages. Typically, any write to the page will convert it from TDX private back
+ * to normal kernel memory. Systems with the X86_BUG_TDX_PW_MCE erratum need to
+ * do the conversion explicitly via MOVDIR64B.
*/
-static void reset_tdx_pages(unsigned long base, unsigned long size)
+static void tdx_quirk_reset_paddr(unsigned long base, unsigned long size)
{
const void *zero_page = (const void *)page_address(ZERO_PAGE(0));
unsigned long phys, end;
+ if (!boot_cpu_has_bug(X86_BUG_TDX_PW_MCE))
+ return;
+
end = base + size;
for (phys = base; phys < end; phys += 64)
movdir64b(__va(phys), zero_page);
@@ -654,17 +658,23 @@ static void reset_tdx_pages(unsigned long base, unsigned long size)
mb();
}
-static void tdmr_reset_pamt(struct tdmr_info *tdmr)
+void tdx_quirk_reset_page(struct page *page)
+{
+ tdx_quirk_reset_paddr(page_to_phys(page), PAGE_SIZE);
+}
+EXPORT_SYMBOL_GPL(tdx_quirk_reset_page);
+
+static void tdmr_quirk_reset_pamt(struct tdmr_info *tdmr)
{
- tdmr_do_pamt_func(tdmr, reset_tdx_pages);
+ tdmr_do_pamt_func(tdmr, tdx_quirk_reset_paddr);
}
-static void tdmrs_reset_pamt_all(struct tdmr_info_list *tdmr_list)
+static void tdmrs_quirk_reset_pamt_all(struct tdmr_info_list *tdmr_list)
{
int i;
for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
- tdmr_reset_pamt(tdmr_entry(tdmr_list, i));
+ tdmr_quirk_reset_pamt(tdmr_entry(tdmr_list, i));
}
static unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list)
@@ -1136,15 +1146,7 @@ err_reset_pamts:
* to the kernel.
*/
wbinvd_on_all_cpus();
- /*
- * According to the TDX hardware spec, if the platform
- * doesn't have the "partial write machine check"
- * erratum, any kernel read/write will never cause #MC
- * in kernel space, thus it's OK to not convert PAMTs
- * back to normal. But do the conversion anyway here
- * as suggested by the TDX spec.
- */
- tdmrs_reset_pamt_all(&tdx_tdmr_list);
+ tdmrs_quirk_reset_pamt_all(&tdx_tdmr_list);
err_free_pamts:
tdmrs_free_pamt_all(&tdx_tdmr_list);
err_free_tdmrs:
@@ -1266,7 +1268,7 @@ static bool paddr_is_tdx_private(unsigned long phys)
return false;
/* Get page type from the TDX module */
- sret = __seamcall_ret(TDH_PHYMEM_PAGE_RDMD, &args);
+ sret = __seamcall_dirty_cache(__seamcall_ret, TDH_PHYMEM_PAGE_RDMD, &args);
/*
* The SEAMCALL will not return success unless there is a
@@ -1502,11 +1504,6 @@ static inline u64 tdx_tdr_pa(struct tdx_td *td)
return page_to_phys(td->tdr_page);
}
-static inline u64 tdx_tdvpr_pa(struct tdx_vp *td)
-{
- return page_to_phys(td->tdvpr_page);
-}
-
/*
* The TDX module exposes a CLFLUSH_BEFORE_ALLOC bit to specify whether
* a CLFLUSH of pages is required before handing them to the TDX module.
@@ -1518,11 +1515,11 @@ static void tdx_clflush_page(struct page *page)
clflush_cache_range(page_to_virt(page), PAGE_SIZE);
}
-noinstr __flatten u64 tdh_vp_enter(struct tdx_vp *td, struct tdx_module_args *args)
+noinstr u64 tdh_vp_enter(struct tdx_vp *td, struct tdx_module_args *args)
{
- args->rcx = tdx_tdvpr_pa(td);
+ args->rcx = td->tdvpr_pa;
- return __seamcall_saved_ret(TDH_VP_ENTER, args);
+ return __seamcall_dirty_cache(__seamcall_saved_ret, TDH_VP_ENTER, args);
}
EXPORT_SYMBOL_GPL(tdh_vp_enter);
@@ -1581,7 +1578,7 @@ u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page)
{
struct tdx_module_args args = {
.rcx = page_to_phys(tdcx_page),
- .rdx = tdx_tdvpr_pa(vp),
+ .rdx = vp->tdvpr_pa,
};
tdx_clflush_page(tdcx_page);
@@ -1650,7 +1647,7 @@ EXPORT_SYMBOL_GPL(tdh_mng_create);
u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp)
{
struct tdx_module_args args = {
- .rcx = tdx_tdvpr_pa(vp),
+ .rcx = vp->tdvpr_pa,
.rdx = tdx_tdr_pa(td),
};
@@ -1706,7 +1703,7 @@ EXPORT_SYMBOL_GPL(tdh_mr_finalize);
u64 tdh_vp_flush(struct tdx_vp *vp)
{
struct tdx_module_args args = {
- .rcx = tdx_tdvpr_pa(vp),
+ .rcx = vp->tdvpr_pa,
};
return seamcall(TDH_VP_FLUSH, &args);
@@ -1752,7 +1749,7 @@ EXPORT_SYMBOL_GPL(tdh_mng_init);
u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data)
{
struct tdx_module_args args = {
- .rcx = tdx_tdvpr_pa(vp),
+ .rcx = vp->tdvpr_pa,
.rdx = field,
};
u64 ret;
@@ -1769,7 +1766,7 @@ EXPORT_SYMBOL_GPL(tdh_vp_rd);
u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask)
{
struct tdx_module_args args = {
- .rcx = tdx_tdvpr_pa(vp),
+ .rcx = vp->tdvpr_pa,
.rdx = field,
.r8 = data,
.r9 = mask,
@@ -1782,7 +1779,7 @@ EXPORT_SYMBOL_GPL(tdh_vp_wr);
u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid)
{
struct tdx_module_args args = {
- .rcx = tdx_tdvpr_pa(vp),
+ .rcx = vp->tdvpr_pa,
.rdx = initial_rcx,
.r8 = x2apicid,
};
@@ -1870,3 +1867,22 @@ u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page)
return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
}
EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_hkid);
+
+#ifdef CONFIG_KEXEC_CORE
+void tdx_cpu_flush_cache_for_kexec(void)
+{
+ lockdep_assert_preemption_disabled();
+
+ if (!this_cpu_read(cache_state_incoherent))
+ return;
+
+ /*
+ * Private memory cachelines need to be clean at the time of
+ * kexec. Write them back now, as the caller promises that
+ * there should be no more SEAMCALLs on this CPU.
+ */
+ wbinvd();
+ this_cpu_write(cache_state_incoherent, false);
+}
+EXPORT_SYMBOL_GPL(tdx_cpu_flush_cache_for_kexec);
+#endif