diff options
author | Jiri Kosina <jkosina@suse.cz> | 2022-01-10 09:49:13 +0100 |
---|---|---|
committer | Jiri Kosina <jkosina@suse.cz> | 2022-01-10 09:49:13 +0100 |
commit | 8a2094d679d921d104d3296528d4fa419702ce1c (patch) | |
tree | 5feeb2f8b94eb3632109b778276ccd1007486770 /arch/x86 | |
parent | 3809fe479861194e310c23ed48b010c7c0f72d22 (diff) | |
parent | b60d3c803d7603432a08aeaf988aff53b3a5ec64 (diff) |
Merge branch 'for-5.17/core' into for-linus
- support for USI style pens (Tero Kristo, Mika Westerberg)
- quirk for devices that need inverted X/Y axes (Alistair Francis)
- small core code cleanups and deduplication (Benjamin Tissoires)
Diffstat (limited to 'arch/x86')
84 files changed, 1365 insertions, 1066 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index 30dec019756b..f384cb1a4f7a 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -25,3 +25,6 @@ obj-y += platform/ obj-y += net/ obj-$(CONFIG_KEXEC_FILE) += purgatory/ + +# for cleaning +subdir- += boot tools diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b1d4b481fcdd..7399327d1eff 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -63,7 +63,7 @@ config X86 select ARCH_CLOCKSOURCE_INIT select ARCH_CORRECT_STACKTRACE_ON_KRETPROBE select ARCH_ENABLE_HUGEPAGE_MIGRATION if X86_64 && HUGETLB_PAGE && MIGRATION - select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64 || (X86_32 && HIGHMEM) + select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64 select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG select ARCH_ENABLE_SPLIT_PMD_PTLOCK if (PGTABLE_LEVELS > 2) && (X86_64 || X86_PAE) select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE @@ -192,6 +192,8 @@ config X86 select HAVE_DYNAMIC_FTRACE_WITH_REGS select HAVE_DYNAMIC_FTRACE_WITH_ARGS if X86_64 select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + select HAVE_SAMPLE_FTRACE_DIRECT if X86_64 + select HAVE_SAMPLE_FTRACE_DIRECT_MULTI if X86_64 select HAVE_EBPF_JIT select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_EISA @@ -1627,7 +1629,7 @@ config ARCH_SELECT_MEMORY_MODEL config ARCH_MEMORY_PROBE bool "Enable sysfs memory/probe interface" - depends on X86_64 && MEMORY_HOTPLUG + depends on MEMORY_HOTPLUG help This option enables a sysfs memory/probe interface for testing. See Documentation/admin-guide/mm/memory-hotplug.rst for more information. @@ -2423,7 +2425,7 @@ endmenu config ARCH_HAS_ADD_PAGES def_bool y - depends on X86_64 && ARCH_ENABLE_MEMORY_HOTPLUG + depends on ARCH_ENABLE_MEMORY_HOTPLUG config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE def_bool y diff --git a/arch/x86/Makefile b/arch/x86/Makefile index aab70413ae7a..42243869216d 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -283,8 +283,6 @@ endif archclean: $(Q)rm -rf $(objtree)/arch/i386 $(Q)rm -rf $(objtree)/arch/x86_64 - $(Q)$(MAKE) $(clean)=$(boot) - $(Q)$(MAKE) $(clean)=arch/x86/tools define archhelp echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)' diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 1b40b9297083..fd2ee9408e91 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -226,7 +226,8 @@ bool emulate_vsyscall(unsigned long error_code, if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { warn_bad_vsyscall(KERN_DEBUG, regs, "seccomp tried to change syscall nr or ip"); - do_exit(SIGSYS); + force_exit_sig(SIGSYS); + return true; } regs->orig_ax = -1; if (tmp) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 603964408d2d..ec6444f2c9dc 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2211,7 +2211,6 @@ intel_pmu_snapshot_branch_stack(struct perf_branch_entry *entries, unsigned int /* must not have branches... */ local_irq_save(flags); __intel_pmu_disable_all(false); /* we don't care about BTS */ - __intel_pmu_pebs_disable_all(); __intel_pmu_lbr_disable(); /* ... until here */ return __intel_pmu_snapshot_branch_stack(entries, cnt, flags); @@ -2225,7 +2224,6 @@ intel_pmu_snapshot_arch_branch_stack(struct perf_branch_entry *entries, unsigned /* must not have branches... */ local_irq_save(flags); __intel_pmu_disable_all(false); /* we don't care about BTS */ - __intel_pmu_pebs_disable_all(); __intel_pmu_arch_lbr_disable(); /* ... until here */ return __intel_pmu_snapshot_branch_stack(entries, cnt, flags); @@ -3048,8 +3046,10 @@ intel_vlbr_constraints(struct perf_event *event) { struct event_constraint *c = &vlbr_constraint; - if (unlikely(constraint_match(c, event->hw.config))) + if (unlikely(constraint_match(c, event->hw.config))) { + event->hw.flags |= c->flags; return c; + } return NULL; } diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 6b72e9b55c69..8043213b75a5 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -265,6 +265,8 @@ void intel_pmu_lbr_reset(void) cpuc->last_task_ctx = NULL; cpuc->last_log_id = 0; + if (!static_cpu_has(X86_FEATURE_ARCH_LBR) && cpuc->lbr_select) + wrmsrl(MSR_LBR_SELECT, 0); } /* diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index c72e368dd164..f1ba6ab2e97e 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1187,7 +1187,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id * PCI slot and func to indicate the uncore box. */ if (id->driver_data & ~0xffff) { - struct pci_driver *pci_drv = pdev->driver; + struct pci_driver *pci_drv = to_pci_driver(pdev->dev.driver); pmu = uncore_pci_find_dev_pmu(pdev, pci_drv->id_table); if (pmu == NULL) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index eb2c6cea9d0d..3660f698fb2a 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -3608,6 +3608,9 @@ static int skx_cha_hw_config(struct intel_uncore_box *box, struct perf_event *ev struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; struct extra_reg *er; int idx = 0; + /* Any of the CHA events may be filtered by Thread/Core-ID.*/ + if (event->hw.config & SNBEP_CBO_PMON_CTL_TID_EN) + idx = SKX_CHA_MSR_PMON_BOX_FILTER_TID; for (er = skx_uncore_cha_extra_regs; er->msr; er++) { if (er->event != (event->hw.config & er->config_mask)) @@ -3675,6 +3678,7 @@ static struct event_constraint skx_uncore_iio_constraints[] = { UNCORE_EVENT_CONSTRAINT(0xc0, 0xc), UNCORE_EVENT_CONSTRAINT(0xc5, 0xc), UNCORE_EVENT_CONSTRAINT(0xd4, 0xc), + UNCORE_EVENT_CONSTRAINT(0xd5, 0xc), EVENT_CONSTRAINT_END }; @@ -4525,6 +4529,13 @@ static void snr_iio_cleanup_mapping(struct intel_uncore_type *type) pmu_iio_cleanup_mapping(type, &snr_iio_mapping_group); } +static struct event_constraint snr_uncore_iio_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x83, 0x3), + UNCORE_EVENT_CONSTRAINT(0xc0, 0xc), + UNCORE_EVENT_CONSTRAINT(0xd5, 0xc), + EVENT_CONSTRAINT_END +}; + static struct intel_uncore_type snr_uncore_iio = { .name = "iio", .num_counters = 4, @@ -4536,6 +4547,7 @@ static struct intel_uncore_type snr_uncore_iio = { .event_mask_ext = SNR_IIO_PMON_RAW_EVENT_MASK_EXT, .box_ctl = SNR_IIO_MSR_PMON_BOX_CTL, .msr_offset = SNR_IIO_MSR_OFFSET, + .constraints = snr_uncore_iio_constraints, .ops = &ivbep_uncore_msr_ops, .format_group = &snr_uncore_iio_format_group, .attr_update = snr_iio_attr_update, diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 24f4a06ac46a..96eb7db31c8e 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -177,6 +177,9 @@ void set_hv_tscchange_cb(void (*cb)(void)) return; } + if (!hv_vp_index) + return; + hv_reenlightenment_cb = cb; /* Make sure callback is registered before we write to MSRs */ @@ -383,20 +386,13 @@ static void __init hv_get_partition_id(void) */ void __init hyperv_init(void) { - u64 guest_id, required_msrs; + u64 guest_id; union hv_x64_msr_hypercall_contents hypercall_msr; int cpuhp; if (x86_hyper_type != X86_HYPER_MS_HYPERV) return; - /* Absolutely required MSRs */ - required_msrs = HV_MSR_HYPERCALL_AVAILABLE | - HV_MSR_VP_INDEX_AVAILABLE; - - if ((ms_hyperv.features & required_msrs) != required_msrs) - return; - if (hv_common_init()) return; diff --git a/arch/x86/include/asm/fpu/xcr.h b/arch/x86/include/asm/fpu/xcr.h index 79f95d3787e2..9656a5bc6fea 100644 --- a/arch/x86/include/asm/fpu/xcr.h +++ b/arch/x86/include/asm/fpu/xcr.h @@ -3,6 +3,7 @@ #define _ASM_X86_FPU_XCR_H #define XCR_XFEATURE_ENABLED_MASK 0x00000000 +#define XCR_XFEATURE_IN_USE_MASK 0x00000001 static inline u64 xgetbv(u32 index) { @@ -20,4 +21,15 @@ static inline void xsetbv(u32 index, u64 value) asm volatile("xsetbv" :: "a" (eax), "d" (edx), "c" (index)); } +/* + * Return a mask of xfeatures which are currently being tracked + * by the processor as being in the initial configuration. + * + * Callers should check X86_FEATURE_XGETBV1. + */ +static inline u64 xfeatures_in_use(void) +{ + return xgetbv(XCR_XFEATURE_IN_USE_MASK); +} + #endif /* _ASM_X86_FPU_XCR_H */ diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 0f8b90ab18c9..cd3dd170e23a 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -92,6 +92,13 @@ #define XFEATURE_MASK_FPSTATE (XFEATURE_MASK_USER_RESTORE | \ XFEATURE_MASK_SUPERVISOR_SUPPORTED) +/* + * Features in this mask have space allocated in the signal frame, but may not + * have that space initialized when the feature is in its init state. + */ +#define XFEATURE_MASK_SIGFRAME_INITOPT (XFEATURE_MASK_XTILE | \ + XFEATURE_MASK_USER_DYNAMIC) + extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern void __init update_regset_xstate_info(unsigned int size, diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 27158436f322..5a0bcf8b78d7 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -108,6 +108,8 @@ #define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / Gracemont */ #define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */ +#define INTEL_FAM6_RAPTOR_LAKE 0xB7 + /* "Small Core" Processors (Atom) */ #define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2acf37cc1991..6ac61f85e07b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -38,7 +38,6 @@ #define __KVM_HAVE_ARCH_VCPU_DEBUGFS #define KVM_MAX_VCPUS 1024 -#define KVM_SOFT_MAX_VCPUS 710 /* * In x86, the VCPU ID corresponds to the APIC ID, and APIC IDs @@ -364,6 +363,7 @@ union kvm_mmu_extended_role { unsigned int cr4_smap:1; unsigned int cr4_smep:1; unsigned int cr4_la57:1; + unsigned int efer_lma:1; }; }; @@ -725,6 +725,7 @@ struct kvm_vcpu_arch { int cpuid_nent; struct kvm_cpuid_entry2 *cpuid_entries; + u32 kvm_cpuid_base; u64 reserved_gpa_bits; int maxphyaddr; @@ -748,7 +749,7 @@ struct kvm_vcpu_arch { u8 preempted; u64 msr_val; u64 last_steal; - struct gfn_to_pfn_cache cache; + struct gfn_to_hva_cache cache; } st; u64 l1_tsc_offset; @@ -1034,6 +1035,7 @@ struct kvm_x86_msr_filter { #define APICV_INHIBIT_REASON_IRQWIN 3 #define APICV_INHIBIT_REASON_PIT_REINJ 4 #define APICV_INHIBIT_REASON_X2APIC 5 +#define APICV_INHIBIT_REASON_BLOCKIRQ 6 struct kvm_arch { unsigned long n_used_mmu_pages; @@ -1476,6 +1478,7 @@ struct kvm_x86_ops { int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd); + int (*vm_move_enc_context_from)(struct kvm *kvm, unsigned int source_fd); int (*get_msr_feature)(struct kvm_msr_entry *entry); diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 69299878b200..56935ebb1dfe 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -83,6 +83,18 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, return ret; } +static inline long kvm_sev_hypercall3(unsigned int nr, unsigned long p1, + unsigned long p2, unsigned long p3) +{ + long ret; + + asm volatile("vmmcall" + : "=a"(ret) + : "a"(nr), "b"(p1), "c"(p2), "d"(p3) + : "memory"); + return ret; +} + #ifdef CONFIG_KVM_GUEST void kvmclock_init(void); void kvmclock_disable(void); diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 2d4f5c17d79c..e2c6f433ed10 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -44,6 +44,8 @@ void __init sme_enable(struct boot_params *bp); int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size); int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size); +void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, + bool enc); void __init mem_encrypt_free_decrypted_mem(void); @@ -78,6 +80,8 @@ static inline int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; } static inline int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; } +static inline void __init +early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, bool enc) {} static inline void mem_encrypt_free_decrypted_mem(void) { } diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index cebec95a7124..21c4a694ca11 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -97,6 +97,12 @@ static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) PVOP_VCALL1(mmu.exit_mmap, mm); } +static inline void notify_page_enc_status_changed(unsigned long pfn, + int npages, bool enc) +{ + PVOP_VCALL3(mmu.notify_page_enc_status_changed, pfn, npages, enc); +} + #ifdef CONFIG_PARAVIRT_XXL static inline void load_sp0(unsigned long sp0) { diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index d9d6b0203ec4..a69012e1903f 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -168,6 +168,7 @@ struct pv_mmu_ops { /* Hook for intercepting the destruction of an mm_struct. */ void (*exit_mmap)(struct mm_struct *mm); + void (*notify_page_enc_status_changed)(unsigned long pfn, int npages, bool enc); #ifdef CONFIG_PARAVIRT_XXL struct paravirt_callee_save read_cr2; @@ -577,7 +578,9 @@ void paravirt_leave_lazy_mmu(void); void paravirt_flush_lazy_mmu(void); void _paravirt_nop(void); +void paravirt_BUG(void); u64 _paravirt_ident_64(u64); +unsigned long paravirt_ret0(void); #define paravirt_nop ((void *)_paravirt_nop) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 191878a65c61..355d38c0cf60 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -806,11 +806,14 @@ static inline u32 amd_get_nodes_per_socket(void) { return 0; } static inline u32 amd_get_highest_perf(void) { return 0; } #endif +#define for_each_possible_hypervisor_cpuid_base(function) \ + for (function = 0x40000000; function < 0x40010000; function += 0x100) + static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) { uint32_t base, eax, signature[3]; - for (base = 0x40000000; base < 0x40010000; base += 0x100) { + for_each_possible_hypervisor_cpuid_base(base) { cpuid(base, &eax, &signature[0], &signature[1], &signature[2]); if (!memcmp(sig, signature, 12) && diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index 43fa081a1adb..872617542bbc 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -83,6 +83,7 @@ int set_pages_rw(struct page *page, int numpages); int set_direct_map_invalid_noflush(struct page *page); int set_direct_map_default_noflush(struct page *page); bool kernel_page_present(struct page *page); +void notify_range_enc_status_changed(unsigned long vaddr, int npages, bool enc); extern int kernel_set_to_readonly; diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 08b0e90623ad..81a0211a372d 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -126,6 +126,7 @@ static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask) void cpu_disable_common(void); void native_smp_prepare_boot_cpu(void); +void smp_prepare_cpus_common(void); void native_smp_prepare_cpus(unsigned int max_cpus); void calculate_max_logical_packages(void); void native_smp_cpus_done(unsigned int max_cpus); diff --git a/arch/x86/include/asm/static_call.h b/arch/x86/include/asm/static_call.h index cbb67b6030f9..39ebe0511869 100644 --- a/arch/x86/include/asm/static_call.h +++ b/arch/x86/include/asm/static_call.h @@ -27,6 +27,7 @@ ".globl " STATIC_CALL_TRAMP_STR(name) " \n" \ STATIC_CALL_TRAMP_STR(name) ": \n" \ insns " \n" \ + ".byte 0x53, 0x43, 0x54 \n" \ ".type " STATIC_CALL_TRAMP_STR(name) ", @function \n" \ ".size " STATIC_CALL_TRAMP_STR(name) ", . - " STATIC_CALL_TRAMP_STR(name) " \n" \ ".popsection \n") diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index f7e2d82d24fb..5b85987a5e97 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -87,15 +87,6 @@ static inline void syscall_get_arguments(struct task_struct *task, memcpy(args, ®s->bx, 6 * sizeof(args[0])); } -static inline void syscall_set_arguments(struct task_struct *task, - struct pt_regs *regs, - unsigned int i, unsigned int n, - const unsigned long *args) -{ - BUG_ON(i + n > 6); - memcpy(®s->bx + i, args, n * sizeof(args[0])); -} - static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_I386; @@ -127,30 +118,6 @@ static inline void syscall_get_arguments(struct task_struct *task, } } -static inline void syscall_set_arguments(struct task_struct *task, - struct pt_regs *regs, - const unsigned long *args) -{ -# ifdef CONFIG_IA32_EMULATION - if (task->thread_info.status & TS_COMPAT) { - regs->bx = *args++; - regs->cx = *args++; - regs->dx = *args++; - regs->si = *args++; - regs->di = *args++; - regs->bp = *args; - } else -# endif - { - regs->di = *args++; - regs->si = *args++; - regs->dx = *args++; - regs->r10 = *args++; - regs->r8 = *args++; - regs->r9 = *args; - } -} - static inline int syscall_get_arch(struct task_struct *task) { /* x32 tasks should be considered AUDIT_ARCH_X86_64. */ diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 4a7ff8b0db20..0575f5863b7f 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -248,6 +248,7 @@ privcmd_call(unsigned int call, return res; } +#ifdef CONFIG_XEN_PV static inline int HYPERVISOR_set_trap_table(struct trap_info *table) { @@ -281,6 +282,107 @@ HYPERVISOR_callback_op(int cmd, void *arg) } static inline int +HYPERVISOR_set_debugreg(int reg, unsigned long value) +{ + return _hypercall2(int, set_debugreg, reg, value); +} + +static inline unsigned long +HYPERVISOR_get_debugreg(int reg) +{ + return _hypercall1(unsigned long, get_debugreg, reg); +} + +static inline int +HYPERVISOR_update_descriptor(u64 ma, u64 desc) +{ + return _hypercall2(int, update_descriptor, ma, desc); +} + +static inline int +HYPERVISOR_update_va_mapping(unsigned long va, pte_t new_val, + unsigned long flags) +{ + return _hypercall3(int, update_va_mapping, va, new_val.pte, flags); +} + +static inline int +HYPERVISOR_set_segment_base(int reg, unsigned long value) +{ + return _hypercall2(int, set_segment_base, reg, value); +} + +static inline void +MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) +{ + mcl->op = __HYPERVISOR_fpu_taskswitch; + mcl->args[0] = set; + + trace_xen_mc_entry(mcl, 1); +} + +static inline void +MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va, + pte_t new_val, unsigned long flags) +{ + mcl->op = __HYPERVISOR_update_va_mapping; + mcl->args[0] = va; + mcl->args[1] = new_val.pte; + mcl->args[2] = flags; + + trace_xen_mc_entry(mcl, 3); +} + +static inline void +MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr, + struct desc_struct desc) +{ + mcl->op = __HYPERVISOR_update_descriptor; + mcl->args[0] = maddr; + mcl->args[1] = *(unsigned long *)&desc; + + trace_xen_mc_entry(mcl, 2); +} + +static inline void +MULTI_mmu_update(struct multicall_entry *mcl, struct mmu_update *req, + int count, int *success_count, domid_t domid) +{ + mcl->op = __HYPERVISOR_mmu_update; + mcl->args[0] = (unsigned long)req; + mcl->args[1] = count; + mcl->args[2] = (unsigned long)success_count; + mcl->args[3] = domid; + + trace_xen_mc_entry(mcl, 4); +} + +static inline void +MULTI_mmuext_op(struct multicall_entry *mcl, struct mmuext_op *op, int count, + int *success_count, domid_t domid) +{ + mcl->op = __HYPERVISOR_mmuext_op; + mcl->args[0] = (unsigned long)op; + mcl->args[1] = count; + mcl->args[2] = (unsigned long)success_count; + mcl->args[3] = domid; + + trace_xen_mc_entry(mcl, 4); +} + +static inline void +MULTI_stack_switch(struct multicall_entry *mcl, + unsigned long ss, unsigned long esp) +{ + mcl->op = __HYPERVISOR_stack_switch; + mcl->args[0] = ss; + mcl->args[1] = esp; + + trace_xen_mc_entry(mcl, 2); +} +#endif + +static inline int HYPERVISOR_sched_op(int cmd, void *arg) { return _hypercall2(int, sched_op, cmd, arg); @@ -308,26 +410,6 @@ HYPERVISOR_platform_op(struct xen_platform_op *op) return _hypercall1(int, platform_op, op); } -static __always_inline int -HYPERVISOR_set_debugreg(int reg, unsigned long value) -{ - return _hypercall2(int, set_debugreg, reg, value); -} - -static __always_inline unsigned long -HYPERVISOR_get_debugreg(int reg) -{ - return _hypercall1(unsigned long, get_debugreg, reg); -} - -static inline int -HYPERVISOR_update_descriptor(u64 ma, u64 desc) -{ - if (sizeof(u64) == sizeof(long)) - return _hypercall2(int, update_descriptor, ma, desc); - return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32); -} - static inline long HYPERVISOR_memory_op(unsigned int cmd, void *arg) { @@ -341,18 +423,6 @@ HYPERVISOR_multicall(void *call_list, uint32_t nr_calls) } static inline int -HYPERVISOR_update_va_mapping(unsigned long va, pte_t new_val, - unsigned long flags) -{ - if (sizeof(new_val) == sizeof(long)) - return _hypercall3(int, update_va_mapping, va, - new_val.pte, flags); - else - return _hypercall4(int, update_va_mapping, va, - new_val.pte, new_val.pte >> 32, flags); -} - -static inline int HYPERVISOR_event_channel_op(int cmd, void *arg) { return _hypercall2(int, event_channel_op, cmd, arg); @@ -394,14 +464,6 @@ HYPERVISOR_vcpu_op(int cmd, int vcpuid, void *extra_args) return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args); } -#ifdef CONFIG_X86_64 -static inline int -HYPERVISOR_set_segment_base(int reg, unsigned long value) -{ - return _hypercall2(int, set_segment_base, reg, value); -} -#endif - static inline int HYPERVISOR_suspend(unsigned long start_info_mfn) { @@ -423,13 +485,6 @@ HYPERVISOR_hvm_op(int op, void *arg) } static inline int -HYPERVISOR_tmem_op( - struct tmem_op *op) -{ - return _hypercall1(int, tmem_op, op); -} - -static inline int HYPERVISOR_xenpmu_op(unsigned int op, void *arg) { return _hypercall2(int, xenpmu_op, op, arg); @@ -446,88 +501,4 @@ HYPERVISOR_dm_op( return ret; } -static inline void -MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) -{ - mcl->op = __HYPERVISOR_fpu_taskswitch; - mcl->args[0] = set; - - trace_xen_mc_entry(mcl, 1); -} - -static inline void -MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va, - pte_t new_val, unsigned long flags) -{ - mcl->op = __HYPERVISOR_update_va_mapping; - mcl->args[0] = va; - if (sizeof(new_val) == sizeof(long)) { - mcl->args[1] = new_val.pte; - mcl->args[2] = flags; - } else { - mcl->args[1] = new_val.pte; - mcl->args[2] = new_val.pte >> 32; - mcl->args[3] = flags; - } - - trace_xen_mc_entry(mcl, sizeof(new_val) == sizeof(long) ? 3 : 4); -} - -static inline void -MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr, - struct desc_struct desc) -{ - mcl->op = __HYPERVISOR_update_descriptor; - if (sizeof(maddr) == sizeof(long)) { - mcl->args[0] = maddr; - mcl->args[1] = *(unsigned long *)&desc; - } else { - u32 *p = (u32 *)&desc; - - mcl->args[0] = maddr; - mcl->args[1] = maddr >> 32; - mcl->args[2] = *p++; - mcl->args[3] = *p; - } - - trace_xen_mc_entry(mcl, sizeof(maddr) == sizeof(long) ? 2 : 4); -} - -static inline void -MULTI_mmu_update(struct multicall_entry *mcl, struct mmu_update *req, - int count, int *success_count, domid_t domid) -{ - mcl->op = __HYPERVISOR_mmu_update; - mcl->args[0] = (unsigned long)req; - mcl->args[1] = count; - mcl->args[2] = (unsigned long)success_count; - mcl->args[3] = domid; - - trace_xen_mc_entry(mcl, 4); -} - -static inline void -MULTI_mmuext_op(struct multicall_entry *mcl, struct mmuext_op *op, int count, - int *success_count, domid_t domid) -{ - mcl->op = __HYPERVISOR_mmuext_op; - mcl->args[0] = (unsigned long)op; - mcl->args[1] = count; - mcl->args[2] = (unsigned long)success_count; - mcl->args[3] = domid; - - trace_xen_mc_entry(mcl, 4); -} - -static inline void -MULTI_stack_switch(struct multicall_entry *mcl, - unsigned long ss, unsigned long esp) -{ - mcl->op = __HYPERVISOR_stack_switch; - mcl->args[0] = ss; - mcl->args[1] = esp; - - trace_xen_mc_entry(mcl, 2); -} - #endif /* _ASM_X86_XEN_HYPERCALL_H */ diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index ff4b52e37e60..4957f59deb40 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -62,4 +62,8 @@ void xen_arch_register_cpu(int num); void xen_arch_unregister_cpu(int num); #endif +#ifdef CONFIG_PVH +void __init xen_pvh_init(struct boot_params *boot_params); +#endif + #endif /* _ASM_X86_XEN_HYPERVISOR_H */ diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h index 4557f7cb0fa6..9015b888edd6 100644 --- a/arch/x86/include/asm/xen/pci.h +++ b/arch/x86/include/asm/xen/pci.h @@ -22,25 +22,6 @@ static inline int __init pci_xen_initial_domain(void) return -1; } #endif -#ifdef CONFIG_XEN_DOM0 -int xen_find_device_domain_owner(struct pci_dev *dev); -int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain); -int xen_unregister_device_domain_owner(struct pci_dev *dev); -#else -static inline int xen_find_device_domain_owner(struct pci_dev *dev) -{ - return -1; -} -static inline int xen_register_device_domain_owner(struct pci_dev *dev, - uint16_t domain) -{ - return -1; -} -static inline int xen_unregister_device_domain_owner(struct pci_dev *dev) -{ - return -1; -} -#endif #if defined(CONFIG_PCI_MSI) #if defined(CONFIG_PCI_XEN) diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 5146bbab84d4..6e64b27b2c1e 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -8,6 +8,7 @@ * should be used to determine that a VM is running under KVM. */ #define KVM_CPUID_SIGNATURE 0x40000000 +#define KVM_SIGNATURE "KVMKVMKVM\0\0\0" /* This CPUID returns two feature bitmaps in eax, edx. Before enabling * a particular paravirtualization, the appropriate feature bit should diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 10562885f5fc..af3ba08b684b 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -73,12 +73,23 @@ static int gart_mem_pfn_is_ram(unsigned long pfn) (pfn >= aperture_pfn_start + aperture_page_count)); } +#ifdef CONFIG_PROC_VMCORE +static bool gart_oldmem_pfn_is_ram(struct vmcore_cb *cb, unsigned long pfn) +{ + return !!gart_mem_pfn_is_ram(pfn); +} + +static struct vmcore_cb gart_vmcore_cb = { + .pfn_is_ram = gart_oldmem_pfn_is_ram, +}; +#endif + static void __init exclude_from_core(u64 aper_base, u32 aper_order) { aperture_pfn_start = aper_base >> PAGE_SHIFT; aperture_page_count = (32 * 1024 * 1024) << aper_order >> PAGE_SHIFT; #ifdef CONFIG_PROC_VMCORE - WARN_ON(register_oldmem_pfn_is_ram(&gart_mem_pfn_is_ram)); + register_vmcore_cb(&gart_vmcore_cb); #endif #ifdef CONFIG_PROC_KCORE WARN_ON(register_mem_pfn_is_ram(&gart_mem_pfn_is_ram)); diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index cb2fdd130aae..c881bcafba7d 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -76,6 +76,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_SGX1, X86_FEATURE_SGX }, { X86_FEATURE_SGX2, X86_FEATURE_SGX1 }, { X86_FEATURE_XFD, X86_FEATURE_XSAVES }, + { X86_FEATURE_XFD, X86_FEATURE_XGETBV1 }, { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, {} }; diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index acfd5d9f93c6..bb9a46a804bf 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -547,12 +547,13 @@ bool intel_filter_mce(struct mce *m) { struct cpuinfo_x86 *c = &boot_cpu_data; - /* MCE errata HSD131, HSM142, HSW131, BDM48, and HSM142 */ + /* MCE errata HSD131, HSM142, HSW131, BDM48, HSM142 and SKX37 */ if ((c->x86 == 6) && ((c->x86_model == INTEL_FAM6_HASWELL) || (c->x86_model == INTEL_FAM6_HASWELL_L) || (c->x86_model == INTEL_FAM6_BROADWELL) || - (c->x86_model == INTEL_FAM6_HASWELL_G)) && + (c->x86_model == INTEL_FAM6_HASWELL_G) || + (c->x86_model == INTEL_FAM6_SKYLAKE_X)) && (m->bank == 0) && ((m->status & 0xa0000000ffffffff) == 0x80000000000f0005)) return true; diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 4794b716ec79..ff55df60228f 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -163,12 +163,22 @@ static uint32_t __init ms_hyperv_platform(void) cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS, &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]); - if (eax >= HYPERV_CPUID_MIN && - eax <= HYPERV_CPUID_MAX && - !memcmp("Microsoft Hv", hyp_signature, 12)) - return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS; + if (eax < HYPERV_CPUID_MIN || eax > HYPERV_CPUID_MAX || + memcmp("Microsoft Hv", hyp_signature, 12)) + return 0; - return 0; + /* HYPERCALL and VP_INDEX MSRs are mandatory for all features. */ + eax = cpuid_eax(HYPERV_CPUID_FEATURES); + if (!(eax & HV_MSR_HYPERCALL_AVAILABLE)) { + pr_warn("x86/hyperv: HYPERCALL MSR not available.\n"); + return 0; + } + if (!(eax & HV_MSR_VP_INDEX_AVAILABLE)) { + pr_warn("x86/hyperv: VP_INDEX MSR not available.\n"); + return 0; + } + + return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS; } static unsigned char hv_get_nmi_reason(void) diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 63d3de02bbcc..8471a8b9b48e 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -28,8 +28,7 @@ static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq); static LIST_HEAD(sgx_active_page_list); static DEFINE_SPINLOCK(sgx_reclaimer_lock); -/* The free page list lock protected variables prepend the lock. */ -static unsigned long sgx_nr_free_pages; +static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0); /* Nodes with one or more EPC sections. */ static nodemask_t sgx_numa_mask; @@ -403,14 +402,15 @@ skip: spin_lock(&node->lock); list_add_tail(&epc_page->list, &node->free_page_list); - sgx_nr_free_pages++; spin_unlock(&node->lock); + atomic_long_inc(&sgx_nr_free_pages); } } static bool sgx_should_reclaim(unsigned long watermark) { - return sgx_nr_free_pages < watermark && !list_empty(&sgx_active_page_list); + return atomic_long_read(&sgx_nr_free_pages) < watermark && + !list_empty(&sgx_active_page_list); } static int ksgxd(void *p) @@ -471,9 +471,9 @@ static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid) page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list); list_del_init(&page->list); - sgx_nr_free_pages--; spin_unlock(&node->lock); + atomic_long_dec(&sgx_nr_free_pages); return page; } @@ -625,9 +625,9 @@ void sgx_free_epc_page(struct sgx_epc_page *page) spin_lock(&node->lock); list_add_tail(&page->list, &node->free_page_list); - sgx_nr_free_pages++; spin_unlock(&node->lock); + atomic_long_inc(&sgx_nr_free_pages); } static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c index d1d49e3d536b..3b58d8703094 100644 --- a/arch/x86/kernel/doublefault_32.c +++ b/arch/x86/kernel/doublefault_32.c @@ -77,9 +77,6 @@ asmlinkage noinstr void __noreturn doublefault_shim(void) * some way to reconstruct CR3. We could make a credible guess based * on cpu_tlbstate, but that would be racy and would not account for * PTI. - * - * Instead, don't bother. We can return through - * rewind_stack_do_exit() instead. */ panic("cannot return from double fault\n"); } diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index e18210dff88c..86ea7c0fa2f6 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -4,6 +4,7 @@ #include <asm/cpufeature.h> #include <asm/fpu/xstate.h> +#include <asm/fpu/xcr.h> #ifdef CONFIG_X86_64 DECLARE_PER_CPU(u64, xfd_state); @@ -199,6 +200,32 @@ static inline void os_xrstor_supervisor(struct fpstate *fpstate) } /* + * XSAVE itself always writes all requested xfeatures. Removing features + * from the request bitmap reduces the features which are written. + * Generate a mask of features which must be written to a sigframe. The + * unset features can be optimized away and not written. + * + * This optimization is user-visible. Only use for states where + * uninitialized sigframe contents are tolerable, like dynamic features. + * + * Users of buffers produced with this optimization must check XSTATE_BV + * to determine which features have been optimized out. + */ +static inline u64 xfeatures_need_sigframe_write(void) +{ + u64 xfeaures_to_write; + + /* In-use features must be written: */ + xfeaures_to_write = xfeatures_in_use(); + + /* Also write all non-optimizable sigframe features: */ + xfeaures_to_write |= XFEATURE_MASK_USER_SUPPORTED & + ~XFEATURE_MASK_SIGFRAME_INITOPT; + + return xfeaures_to_write; +} + +/* * Save xstate to user space xsave area. * * We don't use modified optimization because xrstor/xrstors might track @@ -220,10 +247,16 @@ static inline int xsave_to_user_sigframe(struct xregs_state __user *buf) */ struct fpstate *fpstate = current->thread.fpu.fpstate; u64 mask = fpstate->user_xfeatures; - u32 lmask = mask; - u32 hmask = mask >> 32; + u32 lmask; + u32 hmask; int err; + /* Optimize away writing unnecessary xfeatures: */ + if (fpu_state_size_dynamic()) + mask &= xfeatures_need_sigframe_write(); + + lmask = mask; + hmask = mask >> 32; xfd_validate_state(fpstate, mask, false); stac(); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 8863d1941f1b..59abbdad7729 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -28,6 +28,7 @@ #include <linux/swait.h> #include <linux/syscore_ops.h> #include <linux/cc_platform.h> +#include <linux/efi.h> #include <asm/timer.h> #include <asm/cpu.h> #include <asm/traps.h> @@ -41,6 +42,7 @@ #include <asm/ptrace.h> #include <asm/reboot.h> #include <asm/svm.h> +#include <asm/e820/api.h> DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled); @@ -434,6 +436,8 @@ static void kvm_guest_cpu_offline(bool shutdown) kvm_disable_steal_time(); if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) wrmsrl(MSR_KVM_PV_EOI_EN, 0); + if (kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) + wrmsrl(MSR_KVM_MIGRATION_CONTROL, 0); kvm_pv_disable_apf(); if (!shutdown) apf_task_wake_all(); @@ -548,6 +552,55 @@ static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector) __send_ipi_mask(local_mask, vector); } +static int __init setup_efi_kvm_sev_migration(void) +{ + efi_char16_t efi_sev_live_migration_enabled[] = L"SevLiveMigrationEnabled"; + efi_guid_t efi_variable_guid = AMD_SEV_MEM_ENCRYPT_GUID; + efi_status_t status; + unsigned long size; + bool enabled; + + if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) || + !kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) + return 0; + + if (!efi_enabled(EFI_BOOT)) + return 0; + + if (!efi_enabled(EFI_RUNTIME_SERVICES)) { + pr_info("%s : EFI runtime services are not enabled\n", __func__); + return 0; + } + + size = sizeof(enabled); + + /* Get variable contents into buffer */ + status = efi.get_variable(efi_sev_live_migration_enabled, + &efi_variable_guid, NULL, &size, &enabled); + + if (status == EFI_NOT_FOUND) { + pr_info("%s : EFI live migration variable not found\n", __func__); + return 0; + } + + if (status != EFI_SUCCESS) { + pr_info("%s : EFI variable retrieval failed\n", __func__); + return 0; + } + + if (enabled == 0) { + pr_info("%s: live migration disabled in EFI\n", __func__); + return 0; + } + + pr_info("%s : live migration enabled in EFI\n", __func__); + wrmsrl(MSR_KVM_MIGRATION_CONTROL, KVM_MIGRATION_READY); + + return 1; +} + +late_initcall(setup_efi_kvm_sev_migration); + /* * Set the IPI entry points */ @@ -756,7 +809,7 @@ static noinline uint32_t __kvm_cpuid_base(void) return 0; /* So we don't blow up on old processors */ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) - return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0); + return hypervisor_cpuid_base(KVM_SIGNATURE, 0); return 0; } @@ -806,8 +859,62 @@ static bool __init kvm_msi_ext_dest_id(void) return kvm_para_has_feature(KVM_FEATURE_MSI_EXT_DEST_ID); } +static void kvm_sev_hc_page_enc_status(unsigned long pfn, int npages, bool enc) +{ + kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, pfn << PAGE_SHIFT, npages, + KVM_MAP_GPA_RANGE_ENC_STAT(enc) | KVM_MAP_GPA_RANGE_PAGE_SZ_4K); +} + static void __init kvm_init_platform(void) { + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) && + kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) { + unsigned long nr_pages; + int i; + + pv_ops.mmu.notify_page_enc_status_changed = + kvm_sev_hc_page_enc_status; + + /* + * Reset the host's shared pages list related to kernel + * specific page encryption status settings before we load a + * new kernel by kexec. Reset the page encryption status + * during early boot intead of just before kexec to avoid SMP + * races during kvm_pv_guest_cpu_reboot(). + * NOTE: We cannot reset the complete shared pages list + * here as we need to retain the UEFI/OVMF firmware + * specific settings. + */ + + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; + + if (entry->type != E820_TYPE_RAM) + continue; + + nr_pages = DIV_ROUND_UP(entry->size, PAGE_SIZE); + + kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, entry->addr, + nr_pages, + KVM_MAP_GPA_RANGE_ENCRYPTED | KVM_MAP_GPA_RANGE_PAGE_SZ_4K); + } + + /* + * Ensure that _bss_decrypted section is marked as decrypted in the + * shared pages list. + */ + nr_pages = DIV_ROUND_UP(__end_bss_decrypted - __start_bss_decrypted, + PAGE_SIZE); + early_set_mem_enc_dec_hypercall((unsigned long)__start_bss_decrypted, + nr_pages, 0); + + /* + * If not booted using EFI, enable Live migration support. + */ + if (!efi_enabled(EFI_BOOT)) + wrmsrl(MSR_KVM_MIGRATION_CONTROL, + KVM_MIGRATION_READY); + } kvmclock_init(); x86_platform.apic_post_init = kvm_apic_init; } diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index ebc45360ffd4..7f7636aac620 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -46,6 +46,17 @@ asm (".pushsection .entry.text, \"ax\"\n" ".type _paravirt_nop, @function\n\t" ".popsection"); +/* stub always returning 0. */ +asm (".pushsection .entry.text, \"ax\"\n" + ".global paravirt_ret0\n" + "paravirt_ret0:\n\t" + "xor %" _ASM_AX ", %" _ASM_AX ";\n\t" + "ret\n\t" + ".size paravirt_ret0, . - paravirt_ret0\n\t" + ".type paravirt_ret0, @function\n\t" + ".popsection"); + + void __init default_banner(void) { printk(KERN_INFO "Booting paravirtualized kernel on %s\n", @@ -53,7 +64,7 @@ void __init default_banner(void) } /* Undefined instruction for dealing with missing ops pointers. */ -static void paravirt_BUG(void) +noinstr void paravirt_BUG(void) { BUG(); } @@ -326,6 +337,7 @@ struct paravirt_patch_template pv_ops = { (void (*)(struct mmu_gather *, void *))tlb_remove_page, .mmu.exit_mmap = paravirt_nop, + .mmu.notify_page_enc_status_changed = paravirt_nop, #ifdef CONFIG_PARAVIRT_XXL .mmu.read_cr2 = __PV_IS_CALLEE_SAVE(pv_native_read_cr2), diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c index 9e1def3744f2..36e84d904260 100644 --- a/arch/x86/kernel/probe_roms.c +++ b/arch/x86/kernel/probe_roms.c @@ -80,7 +80,7 @@ static struct resource video_rom_resource = { */ static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short device) { - struct pci_driver *drv = pdev->driver; + struct pci_driver *drv = to_pci_driver(pdev->dev.driver); const struct pci_device_id *id; if (pdev->vendor == vendor && pdev->device == device) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e9ee8b526319..04143a653a8a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -964,6 +964,9 @@ unsigned long __get_wchan(struct task_struct *p) struct unwind_state state; unsigned long addr = 0; + if (!try_get_task_stack(p)) + return 0; + for (unwind_start(&state, p, NULL, NULL); !unwind_done(&state); unwind_next_frame(&state)) { addr = unwind_get_return_address(&state); @@ -974,6 +977,8 @@ unsigned long __get_wchan(struct task_struct *p) break; } + put_task_stack(p); + return addr; } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 40ed44ead063..c410be738ae7 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -322,7 +322,7 @@ static void __init reserve_initrd(void) relocate_initrd(); - memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); + memblock_phys_free(ramdisk_image, ramdisk_end - ramdisk_image); } #else @@ -521,7 +521,7 @@ static void __init reserve_crashkernel(void) } if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) { - memblock_free(crash_base, crash_size); + memblock_phys_free(crash_base, crash_size); return; } @@ -742,6 +742,28 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) return 0; } +static char *prepare_command_line(void) +{ +#ifdef CONFIG_CMDLINE_BOOL +#ifdef CONFIG_CMDLINE_OVERRIDE + strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); +#else + if (builtin_cmdline[0]) { + /* append boot loader cmdline to builtin */ + strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE); + strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE); + strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); + } +#endif +#endif + + strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); + + parse_early_param(); + + return command_line; +} + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -831,6 +853,23 @@ void __init setup_arch(char **cmdline_p) x86_init.oem.arch_setup(); /* + * x86_configure_nx() is called before parse_early_param() (called by + * prepare_command_line()) to detect whether hardware doesn't support + * NX (so that the early EHCI debug console setup can safely call + * set_fixmap()). It may then be called again from within noexec_setup() + * during parsing early parameters to honor the respective command line + * option. + */ + x86_configure_nx(); + + /* + * This parses early params and it needs to run before + * early_reserve_memory() because latter relies on such settings + * supplied as early params. + */ + *cmdline_p = prepare_command_line(); + + /* * Do some memory reservations *before* memory is added to memblock, so * memblock allocations won't overwrite it. * @@ -863,33 +902,6 @@ void __init setup_arch(char **cmdline_p) bss_resource.start = __pa_symbol(__bss_start); bss_resource.end = __pa_symbol(__bss_stop)-1; -#ifdef CONFIG_CMDLINE_BOOL -#ifdef CONFIG_CMDLINE_OVERRIDE - strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); -#else - if (builtin_cmdline[0]) { - /* append boot loader cmdline to builtin */ - strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE); - strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE); - strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); - } -#endif -#endif - - strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); - *cmdline_p = command_line; - - /* - * x86_configure_nx() is called before parse_early_param() to detect - * whether hardware doesn't support NX (so that the early EHCI debug - * console setup can safely call set_fixmap()). It may then be called - * again from within noexec_setup() during parsing early parameters - * to honor the respective command line option. - */ - x86_configure_nx(); - - parse_early_param(); - #ifdef CONFIG_MEMORY_HOTPLUG /* * Memory used by the kernel cannot be hot-removed because Linux diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 5afd98559193..7b65275544b2 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -135,7 +135,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) static void __init pcpu_fc_free(void *ptr, size_t size) { - memblock_free_ptr(ptr, size); + memblock_free(ptr, size); } static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8241927addff..ac2909f0cab3 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1350,12 +1350,7 @@ static void __init smp_get_logical_apicid(void) cpu0_logical_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); } -/* - * Prepare for SMP bootup. - * @max_cpus: configured maximum number of CPUs, It is a legacy parameter - * for common interface support. - */ -void __init native_smp_prepare_cpus(unsigned int max_cpus) +void __init smp_prepare_cpus_common(void) { unsigned int i; @@ -1386,6 +1381,17 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) set_sched_topology(x86_topology); set_cpu_sibling_map(0); +} + +/* + * Prepare for SMP bootup. + * @max_cpus: configured maximum number of CPUs, It is a legacy parameter + * for common interface support. + */ +void __init native_smp_prepare_cpus(unsigned int max_cpus) +{ + smp_prepare_cpus_common(); + init_freq_invariance(false, false); smp_sanity_check(); diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index ea028e736831..9c407a33a774 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -56,10 +56,15 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, void text_poke_bp(insn, code, size, emulate); } -static void __static_call_validate(void *insn, bool tail) +static void __static_call_validate(void *insn, bool tail, bool tramp) { u8 opcode = *(u8 *)insn; + if (tramp && memcmp(insn+5, "SCT", 3)) { + pr_err("trampoline signature fail"); + BUG(); + } + if (tail) { if (opcode == JMP32_INSN_OPCODE || opcode == RET_INSN_OPCODE) @@ -74,7 +79,8 @@ static void __static_call_validate(void *insn, bool tail) /* * If we ever trigger this, our text is corrupt, we'll probably not live long. */ - WARN_ONCE(1, "unexpected static_call insn opcode 0x%x at %pS\n", opcode, insn); + pr_err("unexpected static_call insn opcode 0x%x at %pS\n", opcode, insn); + BUG(); } static inline enum insn_type __sc_insn(bool null, bool tail) @@ -97,12 +103,12 @@ void arch_static_call_transform(void *site, void *tramp, void *func, bool tail) mutex_lock(&text_mutex); if (tramp) { - __static_call_validate(tramp, true); + __static_call_validate(tramp, true, true); __static_call_transform(tramp, __sc_insn(!func, true), func); } if (IS_ENABLED(CONFIG_HAVE_STATIC_CALL_INLINE) && site) { - __static_call_validate(site, tail); + __static_call_validate(site, tail, false); __static_call_transform(site, __sc_insn(!func, tail), func); } diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index e6f7592790af..2de3c8c5eba9 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -175,7 +175,7 @@ static struct orc_entry *orc_find(unsigned long ip) } /* vmlinux .init slow lookup: */ - if (init_kernel_text(ip)) + if (is_kernel_inittext(ip)) return __orc_find(__start_orc_unwind_ip, __start_orc_unwind, __stop_orc_unwind_ip - __start_orc_unwind_ip, ip); diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index e5a7a10a0164..c21bcd668284 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -106,10 +106,8 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) */ local_irq_enable(); - if (!vm86 || !vm86->user_vm86) { - pr_alert("no user_vm86: BAD\n"); - do_exit(SIGSEGV); - } + BUG_ON(!vm86); + set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->veflags_mask); user = vm86->user_vm86; @@ -142,6 +140,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) user_access_end(); +exit_vm86: preempt_disable(); tsk->thread.sp0 = vm86->saved_sp0; tsk->thread.sysenter_cs = __KERNEL_CS; @@ -161,7 +160,8 @@ Efault_end: user_access_end(); Efault: pr_alert("could not access userspace vm86 info\n"); - do_exit(SIGSEGV); + force_exit_sig(SIGSEGV); + goto exit_vm86; } static int do_vm86_irq_handling(int subfunction, int irqnumber); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 2d70edb0f323..07e9215e911d 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -99,11 +99,45 @@ static int kvm_check_cpuid(struct kvm_cpuid_entry2 *entries, int nent) return 0; } -void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) +static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *best; + u32 function; + struct kvm_cpuid_entry2 *entry; + + vcpu->arch.kvm_cpuid_base = 0; + + for_each_possible_hypervisor_cpuid_base(function) { + entry = kvm_find_cpuid_entry(vcpu, function, 0); - best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0); + if (entry) { + u32 signature[3]; + + signature[0] = entry->ebx; + signature[1] = entry->ecx; + signature[2] = entry->edx; + + BUILD_BUG_ON(sizeof(signature) > sizeof(KVM_SIGNATURE)); + if (!memcmp(signature, KVM_SIGNATURE, sizeof(signature))) { + vcpu->arch.kvm_cpuid_base = function; + break; + } + } + } +} + +static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu) +{ + u32 base = vcpu->arch.kvm_cpuid_base; + + if (!base) + return NULL; + + return kvm_find_cpuid_entry(vcpu, base | KVM_CPUID_FEATURES, 0); +} + +void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best = kvm_find_kvm_cpuid_features(vcpu); /* * save the feature bitmap to avoid cpuid lookup for every PV @@ -142,7 +176,7 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) cpuid_entry_has(best, X86_FEATURE_XSAVEC))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0); + best = kvm_find_kvm_cpuid_features(vcpu); if (kvm_hlt_in_guest(vcpu->kvm) && best && (best->eax & (1 << KVM_FEATURE_PV_UNHALT))) best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); @@ -239,6 +273,26 @@ u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu) return rsvd_bits(cpuid_maxphyaddr(vcpu), 63); } +static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, + int nent) +{ + int r; + + r = kvm_check_cpuid(e2, nent); + if (r) + return r; + + kvfree(vcpu->arch.cpuid_entries); + vcpu->arch.cpuid_entries = e2; + vcpu->arch.cpuid_nent = nent; + + kvm_update_kvm_cpuid_base(vcpu); + kvm_update_cpuid_runtime(vcpu); + kvm_vcpu_after_set_cpuid(vcpu); + + return 0; +} + /* when an old userspace process fills a new kernel module */ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid *cpuid, @@ -275,18 +329,9 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, e2[i].padding[2] = 0; } - r = kvm_check_cpuid(e2, cpuid->nent); - if (r) { + r = kvm_set_cpuid(vcpu, e2, cpuid->nent); + if (r) kvfree(e2); - goto out_free_cpuid; - } - - kvfree(vcpu->arch.cpuid_entries); - vcpu->arch.cpuid_entries = e2; - vcpu->arch.cpuid_nent = cpuid->nent; - - kvm_update_cpuid_runtime(vcpu); - kvm_vcpu_after_set_cpuid(vcpu); out_free_cpuid: kvfree(e); @@ -310,20 +355,11 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, return PTR_ERR(e2); } - r = kvm_check_cpuid(e2, cpuid->nent); - if (r) { + r = kvm_set_cpuid(vcpu, e2, cpuid->nent); + if (r) kvfree(e2); - return r; - } - kvfree(vcpu->arch.cpuid_entries); - vcpu->arch.cpuid_entries = e2; - vcpu->arch.cpuid_nent = cpuid->nent; - - kvm_update_cpuid_runtime(vcpu); - kvm_vcpu_after_set_cpuid(vcpu); - - return 0; + return r; } int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, @@ -871,8 +907,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) } break; case KVM_CPUID_SIGNATURE: { - static const char signature[12] = "KVMKVMKVM\0\0"; - const u32 *sigptr = (const u32 *)signature; + const u32 *sigptr = (const u32 *)KVM_SIGNATURE; entry->eax = KVM_CPUID_FEATURES; entry->ebx = sigptr[0]; entry->ecx = sigptr[1]; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 4f15c0165c05..5e19e6e4c2ce 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1472,7 +1472,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) if (!(data & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) { hv_vcpu->hv_vapic = data; - if (kvm_lapic_enable_pv_eoi(vcpu, 0, 0)) + if (kvm_lapic_set_pv_eoi(vcpu, 0, 0)) return 1; break; } @@ -1490,7 +1490,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return 1; hv_vcpu->hv_vapic = data; kvm_vcpu_mark_page_dirty(vcpu, gfn); - if (kvm_lapic_enable_pv_eoi(vcpu, + if (kvm_lapic_set_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED, sizeof(struct hv_vp_assist_page))) return 1; @@ -2022,7 +2022,7 @@ static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) { bool longmode; - longmode = is_64_bit_mode(vcpu); + longmode = is_64_bit_hypercall(vcpu); if (longmode) kvm_rax_write(vcpu, result); else { @@ -2171,7 +2171,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) } #ifdef CONFIG_X86_64 - if (is_64_bit_mode(vcpu)) { + if (is_64_bit_hypercall(vcpu)) { hc.param = kvm_rcx_read(vcpu); hc.ingpa = kvm_rdx_read(vcpu); hc.outgpa = kvm_r8_read(vcpu); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index d6ac32f3f650..759952dd1222 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2856,25 +2856,30 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) return 0; } -int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len) +int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len) { u64 addr = data & ~KVM_MSR_ENABLED; struct gfn_to_hva_cache *ghc = &vcpu->arch.pv_eoi.data; unsigned long new_len; + int ret; if (!IS_ALIGNED(addr, 4)) return 1; - vcpu->arch.pv_eoi.msr_val = data; - if (!pv_eoi_enabled(vcpu)) - return 0; + if (data & KVM_MSR_ENABLED) { + if (addr == ghc->gpa && len <= ghc->len) + new_len = ghc->len; + else + new_len = len; - if (addr == ghc->gpa && len <= ghc->len) - new_len = ghc->len; - else - new_len = len; + ret = kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, addr, new_len); + if (ret) + return ret; + } + + vcpu->arch.pv_eoi.msr_val = data; - return kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, addr, new_len); + return 0; } int kvm_apic_accept_events(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index d7c25d0c1354..2b44e533fc8d 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -127,7 +127,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); -int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len); +int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len); void kvm_lapic_exit(void); #define VEC_POS(v) ((v) & (32 - 1)) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 323b5057d08f..3be9beea838d 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3191,17 +3191,17 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) new_spte |= PT_WRITABLE_MASK; /* - * Do not fix write-permission on the large spte. Since - * we only dirty the first page into the dirty-bitmap in + * Do not fix write-permission on the large spte when + * dirty logging is enabled. Since we only dirty the + * first page into the dirty-bitmap in * fast_pf_fix_direct_spte(), other pages are missed * if its slot has dirty logging enabled. * * Instead, we let the slow page fault path create a * normal spte to fix the access. - * - * See the comments in kvm_arch_commit_memory_region(). */ - if (sp->role.level > PG_LEVEL_4K) + if (sp->role.level > PG_LEVEL_4K && + kvm_slot_dirty_track_enabled(fault->slot)) break; } @@ -4682,6 +4682,7 @@ static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu, /* PKEY and LA57 are active iff long mode is active. */ ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs); ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs); + ext.efer_lma = ____is_efer_lma(regs); } ext.valid = 1; diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 7c5dd83e52de..a54c3491af42 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -897,7 +897,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, struct tdp_iter *iter) { - struct kvm_mmu_page *sp = sptep_to_sp(iter->sptep); + struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep)); u64 new_spte; int ret = RET_PF_FIXED; bool wrprot = false; diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 0772bad9165c..09873f6488f7 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -319,7 +319,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) } /* check if idx is a valid index to access PMU */ -int kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) +bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { return kvm_x86_ops.pmu_ops->is_valid_rdpmc_ecx(vcpu, idx); } diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 0e4f2b1fa9fb..59d6b76203d5 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -32,7 +32,7 @@ struct kvm_pmu_ops { struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu, unsigned int idx, u64 *mask); struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, u32 msr); - int (*is_valid_rdpmc_ecx)(struct kvm_vcpu *vcpu, unsigned int idx); + bool (*is_valid_rdpmc_ecx)(struct kvm_vcpu *vcpu, unsigned int idx); bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr); int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info); int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info); @@ -149,7 +149,7 @@ void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx); void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); -int kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx); +bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx); bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr); int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 8052d92069e0..affc0ea98d30 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -904,7 +904,8 @@ bool svm_check_apicv_inhibit_reasons(ulong bit) BIT(APICV_INHIBIT_REASON_NESTED) | BIT(APICV_INHIBIT_REASON_IRQWIN) | BIT(APICV_INHIBIT_REASON_PIT_REINJ) | - BIT(APICV_INHIBIT_REASON_X2APIC); + BIT(APICV_INHIBIT_REASON_X2APIC) | + BIT(APICV_INHIBIT_REASON_BLOCKIRQ); return supported & BIT(bit); } diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index fdf587f19c5f..871c426ec389 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -181,14 +181,13 @@ static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) return get_gp_pmc_amd(pmu, base + pmc_idx, PMU_TYPE_COUNTER); } -/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */ -static int amd_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) +static bool amd_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); idx &= ~(3u << 30); - return (idx >= pmu->nr_arch_gp_counters); + return idx < pmu->nr_arch_gp_counters; } /* idx is the ECX register of RDPMC instruction */ diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 1964b9a174be..21ac0a5de4e0 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -120,16 +120,26 @@ static bool __sev_recycle_asids(int min_asid, int max_asid) return true; } +static int sev_misc_cg_try_charge(struct kvm_sev_info *sev) +{ + enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV; + return misc_cg_try_charge(type, sev->misc_cg, 1); +} + +static void sev_misc_cg_uncharge(struct kvm_sev_info *sev) +{ + enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV; + misc_cg_uncharge(type, sev->misc_cg, 1); +} + static int sev_asid_new(struct kvm_sev_info *sev) { int asid, min_asid, max_asid, ret; bool retry = true; - enum misc_res_type type; - type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV; WARN_ON(sev->misc_cg); sev->misc_cg = get_current_misc_cg(); - ret = misc_cg_try_charge(type, sev->misc_cg, 1); + ret = sev_misc_cg_try_charge(sev); if (ret) { put_misc_cg(sev->misc_cg); sev->misc_cg = NULL; @@ -162,7 +172,7 @@ again: return asid; e_uncharge: - misc_cg_uncharge(type, sev->misc_cg, 1); + sev_misc_cg_uncharge(sev); put_misc_cg(sev->misc_cg); sev->misc_cg = NULL; return ret; @@ -179,7 +189,6 @@ static void sev_asid_free(struct kvm_sev_info *sev) { struct svm_cpu_data *sd; int cpu; - enum misc_res_type type; mutex_lock(&sev_bitmap_lock); @@ -192,8 +201,7 @@ static void sev_asid_free(struct kvm_sev_info *sev) mutex_unlock(&sev_bitmap_lock); - type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV; - misc_cg_uncharge(type, sev->misc_cg, 1); + sev_misc_cg_uncharge(sev); put_misc_cg(sev->misc_cg); sev->misc_cg = NULL; } @@ -229,7 +237,6 @@ static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - bool es_active = argp->id == KVM_SEV_ES_INIT; int asid, ret; if (kvm->created_vcpus) @@ -239,7 +246,8 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) if (unlikely(sev->active)) return ret; - sev->es_active = es_active; + sev->active = true; + sev->es_active = argp->id == KVM_SEV_ES_INIT; asid = sev_asid_new(sev); if (asid < 0) goto e_no_asid; @@ -249,8 +257,6 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) if (ret) goto e_free; - sev->active = true; - sev->asid = asid; INIT_LIST_HEAD(&sev->regions_list); return 0; @@ -260,6 +266,7 @@ e_free: sev->asid = 0; e_no_asid: sev->es_active = false; + sev->active = false; return ret; } @@ -590,7 +597,7 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) * traditional VMSA as it has been built so far (in prep * for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state. */ - memcpy(svm->vmsa, save, sizeof(*save)); + memcpy(svm->sev_es.vmsa, save, sizeof(*save)); return 0; } @@ -612,11 +619,11 @@ static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu, * the VMSA memory content (i.e it will write the same memory region * with the guest's key), so invalidate it first. */ - clflush_cache_range(svm->vmsa, PAGE_SIZE); + clflush_cache_range(svm->sev_es.vmsa, PAGE_SIZE); vmsa.reserved = 0; vmsa.handle = to_kvm_svm(kvm)->sev_info.handle; - vmsa.address = __sme_pa(svm->vmsa); + vmsa.address = __sme_pa(svm->sev_es.vmsa); vmsa.len = PAGE_SIZE; ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error); if (ret) @@ -1522,7 +1529,7 @@ static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error); } -static bool cmd_allowed_from_miror(u32 cmd_id) +static bool is_cmd_allowed_from_mirror(u32 cmd_id) { /* * Allow mirrors VM to call KVM_SEV_LAUNCH_UPDATE_VMSA to enable SEV-ES @@ -1536,6 +1543,201 @@ static bool cmd_allowed_from_miror(u32 cmd_id) return false; } +static int sev_lock_for_migration(struct kvm *kvm) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + + /* + * Bail if this VM is already involved in a migration to avoid deadlock + * between two VMs trying to migrate to/from each other. + */ + if (atomic_cmpxchg_acquire(&sev->migration_in_progress, 0, 1)) + return -EBUSY; + + mutex_lock(&kvm->lock); + + return 0; +} + +static void sev_unlock_after_migration(struct kvm *kvm) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + + mutex_unlock(&kvm->lock); + atomic_set_release(&sev->migration_in_progress, 0); +} + + +static int sev_lock_vcpus_for_migration(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + int i, j; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (mutex_lock_killable(&vcpu->mutex)) + goto out_unlock; + } + + return 0; + +out_unlock: + kvm_for_each_vcpu(j, vcpu, kvm) { + if (i == j) + break; + + mutex_unlock(&vcpu->mutex); + } + return -EINTR; +} + +static void sev_unlock_vcpus_for_migration(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + int i; + + kvm_for_each_vcpu(i, vcpu, kvm) { + mutex_unlock(&vcpu->mutex); + } +} + +static void sev_migrate_from(struct kvm_sev_info *dst, + struct kvm_sev_info *src) +{ + dst->active = true; + dst->asid = src->asid; + dst->handle = src->handle; + dst->pages_locked = src->pages_locked; + + src->asid = 0; + src->active = false; + src->handle = 0; + src->pages_locked = 0; + + INIT_LIST_HEAD(&dst->regions_list); + list_replace_init(&src->regions_list, &dst->regions_list); +} + +static int sev_es_migrate_from(struct kvm *dst, struct kvm *src) +{ + int i; + struct kvm_vcpu *dst_vcpu, *src_vcpu; + struct vcpu_svm *dst_svm, *src_svm; + + if (atomic_read(&src->online_vcpus) != atomic_read(&dst->online_vcpus)) + return -EINVAL; + + kvm_for_each_vcpu(i, src_vcpu, src) { + if (!src_vcpu->arch.guest_state_protected) + return -EINVAL; + } + + kvm_for_each_vcpu(i, src_vcpu, src) { + src_svm = to_svm(src_vcpu); + dst_vcpu = kvm_get_vcpu(dst, i); + dst_svm = to_svm(dst_vcpu); + + /* + * Transfer VMSA and GHCB state to the destination. Nullify and + * clear source fields as appropriate, the state now belongs to + * the destination. + */ + memcpy(&dst_svm->sev_es, &src_svm->sev_es, sizeof(src_svm->sev_es)); + dst_svm->vmcb->control.ghcb_gpa = src_svm->vmcb->control.ghcb_gpa; + dst_svm->vmcb->control.vmsa_pa = src_svm->vmcb->control.vmsa_pa; + dst_vcpu->arch.guest_state_protected = true; + + memset(&src_svm->sev_es, 0, sizeof(src_svm->sev_es)); + src_svm->vmcb->control.ghcb_gpa = INVALID_PAGE; + src_svm->vmcb->control.vmsa_pa = INVALID_PAGE; + src_vcpu->arch.guest_state_protected = false; + } + to_kvm_svm(src)->sev_info.es_active = false; + to_kvm_svm(dst)->sev_info.es_active = true; + + return 0; +} + +int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd) +{ + struct kvm_sev_info *dst_sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *src_sev, *cg_cleanup_sev; + struct file *source_kvm_file; + struct kvm *source_kvm; + bool charged = false; + int ret; + + ret = sev_lock_for_migration(kvm); + if (ret) + return ret; + + if (sev_guest(kvm)) { + ret = -EINVAL; + goto out_unlock; + } + + source_kvm_file = fget(source_fd); + if (!file_is_kvm(source_kvm_file)) { + ret = -EBADF; + goto out_fput; + } + + source_kvm = source_kvm_file->private_data; + ret = sev_lock_for_migration(source_kvm); + if (ret) + goto out_fput; + + if (!sev_guest(source_kvm)) { + ret = -EINVAL; + goto out_source; + } + + src_sev = &to_kvm_svm(source_kvm)->sev_info; + dst_sev->misc_cg = get_current_misc_cg(); + cg_cleanup_sev = dst_sev; + if (dst_sev->misc_cg != src_sev->misc_cg) { + ret = sev_misc_cg_try_charge(dst_sev); + if (ret) + goto out_dst_cgroup; + charged = true; + } + + ret = sev_lock_vcpus_for_migration(kvm); + if (ret) + goto out_dst_cgroup; + ret = sev_lock_vcpus_for_migration(source_kvm); + if (ret) + goto out_dst_vcpu; + + if (sev_es_guest(source_kvm)) { + ret = sev_es_migrate_from(kvm, source_kvm); + if (ret) + goto out_source_vcpu; + } + sev_migrate_from(dst_sev, src_sev); + kvm_vm_dead(source_kvm); + cg_cleanup_sev = src_sev; + ret = 0; + +out_source_vcpu: + sev_unlock_vcpus_for_migration(source_kvm); +out_dst_vcpu: + sev_unlock_vcpus_for_migration(kvm); +out_dst_cgroup: + /* Operates on the source on success, on the destination on failure. */ + if (charged) + sev_misc_cg_uncharge(cg_cleanup_sev); + put_misc_cg(cg_cleanup_sev->misc_cg); + cg_cleanup_sev->misc_cg = NULL; +out_source: + sev_unlock_after_migration(source_kvm); +out_fput: + if (source_kvm_file) + fput(source_kvm_file); +out_unlock: + sev_unlock_after_migration(kvm); + return ret; +} + int svm_mem_enc_op(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; @@ -1554,7 +1756,7 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp) /* Only the enc_context_owner handles some memory enc operations. */ if (is_mirroring_enc_context(kvm) && - !cmd_allowed_from_miror(sev_cmd.id)) { + !is_cmd_allowed_from_mirror(sev_cmd.id)) { r = -EINVAL; goto out; } @@ -1787,7 +1989,12 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd) mutex_unlock(&source_kvm->lock); mutex_lock(&kvm->lock); - if (sev_guest(kvm)) { + /* + * Disallow out-of-band SEV/SEV-ES init if the target is already an + * SEV guest, or if vCPUs have been created. KVM relies on vCPUs being + * created after SEV/SEV-ES initialization, e.g. to init intercepts. + */ + if (sev_guest(kvm) || kvm->created_vcpus) { ret = -EINVAL; goto e_mirror_unlock; } @@ -2038,16 +2245,16 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu) svm = to_svm(vcpu); if (vcpu->arch.guest_state_protected) - sev_flush_guest_memory(svm, svm->vmsa, PAGE_SIZE); - __free_page(virt_to_page(svm->vmsa)); + sev_flush_guest_memory(svm, svm->sev_es.vmsa, PAGE_SIZE); + __free_page(virt_to_page(svm->sev_es.vmsa)); - if (svm->ghcb_sa_free) - kfree(svm->ghcb_sa); + if (svm->sev_es.ghcb_sa_free) + kfree(svm->sev_es.ghcb_sa); } static void dump_ghcb(struct vcpu_svm *svm) { - struct ghcb *ghcb = svm->ghcb; + struct ghcb *ghcb = svm->sev_es.ghcb; unsigned int nbits; /* Re-use the dump_invalid_vmcb module parameter */ @@ -2073,7 +2280,7 @@ static void dump_ghcb(struct vcpu_svm *svm) static void sev_es_sync_to_ghcb(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; - struct ghcb *ghcb = svm->ghcb; + struct ghcb *ghcb = svm->sev_es.ghcb; /* * The GHCB protocol so far allows for the following data @@ -2093,7 +2300,7 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; struct kvm_vcpu *vcpu = &svm->vcpu; - struct ghcb *ghcb = svm->ghcb; + struct ghcb *ghcb = svm->sev_es.ghcb; u64 exit_code; /* @@ -2140,7 +2347,7 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) struct ghcb *ghcb; u64 exit_code = 0; - ghcb = svm->ghcb; + ghcb = svm->sev_es.ghcb; /* Only GHCB Usage code 0 is supported */ if (ghcb->ghcb_usage) @@ -2258,33 +2465,34 @@ vmgexit_err: void sev_es_unmap_ghcb(struct vcpu_svm *svm) { - if (!svm->ghcb) + if (!svm->sev_es.ghcb) return; - if (svm->ghcb_sa_free) { + if (svm->sev_es.ghcb_sa_free) { /* * The scratch area lives outside the GHCB, so there is a * buffer that, depending on the operation performed, may * need to be synced, then freed. */ - if (svm->ghcb_sa_sync) { + if (svm->sev_es.ghcb_sa_sync) { kvm_write_guest(svm->vcpu.kvm, - ghcb_get_sw_scratch(svm->ghcb), - svm->ghcb_sa, svm->ghcb_sa_len); - svm->ghcb_sa_sync = false; + ghcb_get_sw_scratch(svm->sev_es.ghcb), + svm->sev_es.ghcb_sa, + svm->sev_es.ghcb_sa_len); + svm->sev_es.ghcb_sa_sync = false; } - kfree(svm->ghcb_sa); - svm->ghcb_sa = NULL; - svm->ghcb_sa_free = false; + kfree(svm->sev_es.ghcb_sa); + svm->sev_es.ghcb_sa = NULL; + svm->sev_es.ghcb_sa_free = false; } - trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, svm->ghcb); + trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, svm->sev_es.ghcb); sev_es_sync_to_ghcb(svm); - kvm_vcpu_unmap(&svm->vcpu, &svm->ghcb_map, true); - svm->ghcb = NULL; + kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map, true); + svm->sev_es.ghcb = NULL; } void pre_sev_run(struct vcpu_svm *svm, int cpu) @@ -2314,7 +2522,7 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu) static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) { struct vmcb_control_area *control = &svm->vmcb->control; - struct ghcb *ghcb = svm->ghcb; + struct ghcb *ghcb = svm->sev_es.ghcb; u64 ghcb_scratch_beg, ghcb_scratch_end; u64 scratch_gpa_beg, scratch_gpa_end; void *scratch_va; @@ -2350,7 +2558,7 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) return false; } - scratch_va = (void *)svm->ghcb; + scratch_va = (void *)svm->sev_es.ghcb; scratch_va += (scratch_gpa_beg - control->ghcb_gpa); } else { /* @@ -2380,12 +2588,12 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) * the vCPU next time (i.e. a read was requested so the data * must be written back to the guest memory). */ - svm->ghcb_sa_sync = sync; - svm->ghcb_sa_free = true; + svm->sev_es.ghcb_sa_sync = sync; + svm->sev_es.ghcb_sa_free = true; } - svm->ghcb_sa = scratch_va; - svm->ghcb_sa_len = len; + svm->sev_es.ghcb_sa = scratch_va; + svm->sev_es.ghcb_sa_len = len; return true; } @@ -2504,15 +2712,15 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) return -EINVAL; } - if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->ghcb_map)) { + if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->sev_es.ghcb_map)) { /* Unable to map GHCB from guest */ vcpu_unimpl(vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n", ghcb_gpa); return -EINVAL; } - svm->ghcb = svm->ghcb_map.hva; - ghcb = svm->ghcb_map.hva; + svm->sev_es.ghcb = svm->sev_es.ghcb_map.hva; + ghcb = svm->sev_es.ghcb_map.hva; trace_kvm_vmgexit_enter(vcpu->vcpu_id, ghcb); @@ -2535,7 +2743,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) ret = kvm_sev_es_mmio_read(vcpu, control->exit_info_1, control->exit_info_2, - svm->ghcb_sa); + svm->sev_es.ghcb_sa); break; case SVM_VMGEXIT_MMIO_WRITE: if (!setup_vmgexit_scratch(svm, false, control->exit_info_2)) @@ -2544,7 +2752,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) ret = kvm_sev_es_mmio_write(vcpu, control->exit_info_1, control->exit_info_2, - svm->ghcb_sa); + svm->sev_es.ghcb_sa); break; case SVM_VMGEXIT_NMI_COMPLETE: ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_IRET); @@ -2604,7 +2812,8 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in) if (!setup_vmgexit_scratch(svm, in, bytes)) return -EINVAL; - return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->ghcb_sa, count, in); + return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->sev_es.ghcb_sa, + count, in); } void sev_es_init_vmcb(struct vcpu_svm *svm) @@ -2619,7 +2828,7 @@ void sev_es_init_vmcb(struct vcpu_svm *svm) * VMCB page. Do not include the encryption mask on the VMSA physical * address since hardware will access it using the guest key. */ - svm->vmcb->control.vmsa_pa = __pa(svm->vmsa); + svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa); /* Can't intercept CR register access, HV can't modify CR registers */ svm_clr_intercept(svm, INTERCEPT_CR0_READ); @@ -2691,8 +2900,8 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) struct vcpu_svm *svm = to_svm(vcpu); /* First SIPI: Use the values as initially set by the VMM */ - if (!svm->received_first_sipi) { - svm->received_first_sipi = true; + if (!svm->sev_es.received_first_sipi) { + svm->sev_es.received_first_sipi = true; return; } @@ -2701,8 +2910,8 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) * the guest will set the CS and RIP. Set SW_EXIT_INFO_2 to a * non-zero value. */ - if (!svm->ghcb) + if (!svm->sev_es.ghcb) return; - ghcb_set_sw_exit_info_2(svm->ghcb, 1); + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1); } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index b36ca4e476c2..5630c241d5f6 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1452,7 +1452,7 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) svm_switch_vmcb(svm, &svm->vmcb01); if (vmsa_page) - svm->vmsa = page_address(vmsa_page); + svm->sev_es.vmsa = page_address(vmsa_page); svm->guest_state_loaded = false; @@ -2835,11 +2835,11 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) { struct vcpu_svm *svm = to_svm(vcpu); - if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->ghcb)) + if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb)) return kvm_complete_insn_gp(vcpu, err); - ghcb_set_sw_exit_info_1(svm->ghcb, 1); - ghcb_set_sw_exit_info_2(svm->ghcb, + ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1); + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, X86_TRAP_GP | SVM_EVTINJ_TYPE_EXEPT | SVM_EVTINJ_VALID); @@ -3121,11 +3121,6 @@ static int invpcid_interception(struct kvm_vcpu *vcpu) type = svm->vmcb->control.exit_info_2; gva = svm->vmcb->control.exit_info_1; - if (type > 3) { - kvm_inject_gp(vcpu, 0); - return 1; - } - return kvm_handle_invpcid(vcpu, type, gva); } @@ -4701,6 +4696,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .mem_enc_unreg_region = svm_unregister_enc_region, .vm_copy_enc_context_from = svm_vm_copy_asid_from, + .vm_move_enc_context_from = svm_vm_migrate_from, .can_emulate_instruction = svm_can_emulate_instruction, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 5e9510d4574e..5faad3dc10e2 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -80,6 +80,7 @@ struct kvm_sev_info { u64 ap_jump_table; /* SEV-ES AP Jump Table address */ struct kvm *enc_context_owner; /* Owner of copied encryption context */ struct misc_cg *misc_cg; /* For misc cgroup accounting */ + atomic_t migration_in_progress; }; struct kvm_svm { @@ -123,6 +124,20 @@ struct svm_nested_state { bool initialized; }; +struct vcpu_sev_es_state { + /* SEV-ES support */ + struct vmcb_save_area *vmsa; + struct ghcb *ghcb; + struct kvm_host_map ghcb_map; + bool received_first_sipi; + + /* SEV-ES scratch area support */ + void *ghcb_sa; + u32 ghcb_sa_len; + bool ghcb_sa_sync; + bool ghcb_sa_free; +}; + struct vcpu_svm { struct kvm_vcpu vcpu; /* vmcb always points at current_vmcb->ptr, it's purely a shorthand. */ @@ -186,17 +201,7 @@ struct vcpu_svm { DECLARE_BITMAP(write, MAX_DIRECT_ACCESS_MSRS); } shadow_msr_intercept; - /* SEV-ES support */ - struct vmcb_save_area *vmsa; - struct ghcb *ghcb; - struct kvm_host_map ghcb_map; - bool received_first_sipi; - - /* SEV-ES scratch area support */ - void *ghcb_sa; - u32 ghcb_sa_len; - bool ghcb_sa_sync; - bool ghcb_sa_free; + struct vcpu_sev_es_state sev_es; bool guest_state_loaded; }; @@ -242,7 +247,7 @@ static __always_inline bool sev_es_guest(struct kvm *kvm) #ifdef CONFIG_KVM_AMD_SEV struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - return sev_guest(kvm) && sev->es_active; + return sev->es_active && !WARN_ON_ONCE(!sev->active); #else return false; #endif @@ -558,6 +563,7 @@ int svm_register_enc_region(struct kvm *kvm, int svm_unregister_enc_region(struct kvm *kvm, struct kvm_enc_region *range); int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd); +int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd); void pre_sev_run(struct vcpu_svm *svm, int cpu); void __init sev_set_cpu_caps(void); void __init sev_hardware_setup(void); diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index b4ee5e9f9e20..1e2f66951566 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -525,67 +525,19 @@ static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, } /* - * Check if MSR is intercepted for L01 MSR bitmap. + * For x2APIC MSRs, ignore the vmcs01 bitmap. L1 can enable x2APIC without L1 + * itself utilizing x2APIC. All MSRs were previously set to be intercepted, + * only the "disable intercept" case needs to be handled. */ -static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr) +static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1, + unsigned long *msr_bitmap_l0, + u32 msr, int type) { - unsigned long *msr_bitmap; - int f = sizeof(unsigned long); + if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr)) + vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr); - if (!cpu_has_vmx_msr_bitmap()) - return true; - - msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; - - if (msr <= 0x1fff) { - return !!test_bit(msr, msr_bitmap + 0x800 / f); - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - return !!test_bit(msr, msr_bitmap + 0xc00 / f); - } - - return true; -} - -/* - * If a msr is allowed by L0, we should check whether it is allowed by L1. - * The corresponding bit will be cleared unless both of L0 and L1 allow it. - */ -static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, - unsigned long *msr_bitmap_nested, - u32 msr, int type) -{ - int f = sizeof(unsigned long); - - /* - * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals - * have the write-low and read-high bitmap offsets the wrong way round. - * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. - */ - if (msr <= 0x1fff) { - if (type & MSR_TYPE_R && - !test_bit(msr, msr_bitmap_l1 + 0x000 / f)) - /* read-low */ - __clear_bit(msr, msr_bitmap_nested + 0x000 / f); - - if (type & MSR_TYPE_W && - !test_bit(msr, msr_bitmap_l1 + 0x800 / f)) - /* write-low */ - __clear_bit(msr, msr_bitmap_nested + 0x800 / f); - - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - if (type & MSR_TYPE_R && - !test_bit(msr, msr_bitmap_l1 + 0x400 / f)) - /* read-high */ - __clear_bit(msr, msr_bitmap_nested + 0x400 / f); - - if (type & MSR_TYPE_W && - !test_bit(msr, msr_bitmap_l1 + 0xc00 / f)) - /* write-high */ - __clear_bit(msr, msr_bitmap_nested + 0xc00 / f); - - } + if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr)) + vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr); } static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) @@ -600,6 +552,34 @@ static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) } } +#define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw) \ +static inline \ +void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx, \ + unsigned long *msr_bitmap_l1, \ + unsigned long *msr_bitmap_l0, u32 msr) \ +{ \ + if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) || \ + vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr)) \ + vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr); \ + else \ + vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr); \ +} +BUILD_NVMX_MSR_INTERCEPT_HELPER(read) +BUILD_NVMX_MSR_INTERCEPT_HELPER(write) + +static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx, + unsigned long *msr_bitmap_l1, + unsigned long *msr_bitmap_l0, + u32 msr, int types) +{ + if (types & MSR_TYPE_R) + nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1, + msr_bitmap_l0, msr); + if (types & MSR_TYPE_W) + nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1, + msr_bitmap_l0, msr); +} + /* * Merge L0's and L1's MSR bitmap, return false to indicate that * we do not use the hardware. @@ -607,10 +587,11 @@ static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { + struct vcpu_vmx *vmx = to_vmx(vcpu); int msr; unsigned long *msr_bitmap_l1; - unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap; - struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map; + unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap; + struct kvm_host_map *map = &vmx->nested.msr_bitmap_map; /* Nothing to do if the MSR bitmap is not in use. */ if (!cpu_has_vmx_msr_bitmap() || @@ -625,7 +606,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, /* * To keep the control flow simple, pay eight 8-byte writes (sixteen * 4-byte writes on 32-bit systems) up front to enable intercepts for - * the x2APIC MSR range and selectively disable them below. + * the x2APIC MSR range and selectively toggle those relevant to L2. */ enable_x2apic_msr_intercepts(msr_bitmap_l0); @@ -644,61 +625,44 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, } } - nested_vmx_disable_intercept_for_msr( + nested_vmx_disable_intercept_for_x2apic_msr( msr_bitmap_l1, msr_bitmap_l0, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_R | MSR_TYPE_W); if (nested_cpu_has_vid(vmcs12)) { - nested_vmx_disable_intercept_for_msr( + nested_vmx_disable_intercept_for_x2apic_msr( msr_bitmap_l1, msr_bitmap_l0, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); - nested_vmx_disable_intercept_for_msr( + nested_vmx_disable_intercept_for_x2apic_msr( msr_bitmap_l1, msr_bitmap_l0, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); } } - /* KVM unconditionally exposes the FS/GS base MSRs to L1. */ + /* + * Always check vmcs01's bitmap to honor userspace MSR filters and any + * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through. + */ #ifdef CONFIG_X86_64 - nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0, - MSR_FS_BASE, MSR_TYPE_RW); + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_FS_BASE, MSR_TYPE_RW); - nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0, - MSR_GS_BASE, MSR_TYPE_RW); + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_GS_BASE, MSR_TYPE_RW); - nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0, - MSR_KERNEL_GS_BASE, MSR_TYPE_RW); + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_KERNEL_GS_BASE, MSR_TYPE_RW); #endif + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); - /* - * Checking the L0->L1 bitmap is trying to verify two things: - * - * 1. L0 gave a permission to L1 to actually passthrough the MSR. This - * ensures that we do not accidentally generate an L02 MSR bitmap - * from the L12 MSR bitmap that is too permissive. - * 2. That L1 or L2s have actually used the MSR. This avoids - * unnecessarily merging of the bitmap if the MSR is unused. This - * works properly because we only update the L01 MSR bitmap lazily. - * So even if L0 should pass L1 these MSRs, the L01 bitmap is only - * updated to reflect this when L1 (or its L2s) actually write to - * the MSR. - */ - if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL)) - nested_vmx_disable_intercept_for_msr( - msr_bitmap_l1, msr_bitmap_l0, - MSR_IA32_SPEC_CTRL, - MSR_TYPE_R | MSR_TYPE_W); - - if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD)) - nested_vmx_disable_intercept_for_msr( - msr_bitmap_l1, msr_bitmap_l0, - MSR_IA32_PRED_CMD, - MSR_TYPE_W); + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_PRED_CMD, MSR_TYPE_W); - kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false); + kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false); return true; } @@ -706,33 +670,39 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - struct kvm_host_map map; - struct vmcs12 *shadow; + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; if (!nested_cpu_has_shadow_vmcs(vmcs12) || vmcs12->vmcs_link_pointer == INVALID_GPA) return; - shadow = get_shadow_vmcs12(vcpu); - - if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map)) + if (ghc->gpa != vmcs12->vmcs_link_pointer && + kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, + vmcs12->vmcs_link_pointer, VMCS12_SIZE)) return; - memcpy(shadow, map.hva, VMCS12_SIZE); - kvm_vcpu_unmap(vcpu, &map, false); + kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu), + VMCS12_SIZE); } static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); + struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; if (!nested_cpu_has_shadow_vmcs(vmcs12) || vmcs12->vmcs_link_pointer == INVALID_GPA) return; - kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer, - get_shadow_vmcs12(vcpu), VMCS12_SIZE); + if (ghc->gpa != vmcs12->vmcs_link_pointer && + kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, + vmcs12->vmcs_link_pointer, VMCS12_SIZE)) + return; + + kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu), + VMCS12_SIZE); } /* @@ -2866,6 +2836,17 @@ static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, return 0; } +static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ +#ifdef CONFIG_X86_64 + if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) != + !!(vcpu->arch.efer & EFER_LMA))) + return -EINVAL; +#endif + return 0; +} + static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { @@ -2890,18 +2871,16 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, return -EINVAL; #ifdef CONFIG_X86_64 - ia32e = !!(vcpu->arch.efer & EFER_LMA); + ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE); #else ia32e = false; #endif if (ia32e) { - if (CC(!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) || - CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) + if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) return -EINVAL; } else { - if (CC(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) || - CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || + if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || CC((vmcs12->host_rip) >> 32)) return -EINVAL; @@ -2946,9 +2925,9 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - int r = 0; - struct vmcs12 *shadow; - struct kvm_host_map map; + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; + struct vmcs_hdr hdr; if (vmcs12->vmcs_link_pointer == INVALID_GPA) return 0; @@ -2956,17 +2935,21 @@ static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) return -EINVAL; - if (CC(kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))) - return -EINVAL; + if (ghc->gpa != vmcs12->vmcs_link_pointer && + CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, + vmcs12->vmcs_link_pointer, VMCS12_SIZE))) + return -EINVAL; - shadow = map.hva; + if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, + offsetof(struct vmcs12, hdr), + sizeof(hdr)))) + return -EINVAL; - if (CC(shadow->hdr.revision_id != VMCS12_REVISION) || - CC(shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) - r = -EINVAL; + if (CC(hdr.revision_id != VMCS12_REVISION) || + CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) + return -EINVAL; - kvm_vcpu_unmap(vcpu, &map, false); - return r; + return 0; } /* @@ -3571,6 +3554,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (nested_vmx_check_controls(vcpu, vmcs12)) return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + if (nested_vmx_check_address_space_size(vcpu, vmcs12)) + return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); + if (nested_vmx_check_host_state(vcpu, vmcs12)) return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); @@ -5300,10 +5286,11 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) return 1; if (vmx->nested.current_vmptr != vmptr) { - struct kvm_host_map map; - struct vmcs12 *new_vmcs12; + struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache; + struct vmcs_hdr hdr; - if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) { + if (ghc->gpa != vmptr && + kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) { /* * Reads from an unbacked page return all 1s, * which means that the 32 bits located at the @@ -5314,12 +5301,16 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); } - new_vmcs12 = map.hva; + if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, + offsetof(struct vmcs12, hdr), + sizeof(hdr))) { + return nested_vmx_fail(vcpu, + VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); + } - if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || - (new_vmcs12->hdr.shadow_vmcs && + if (hdr.revision_id != VMCS12_REVISION || + (hdr.shadow_vmcs && !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { - kvm_vcpu_unmap(vcpu, &map, false); return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); } @@ -5330,8 +5321,11 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) * Load VMCS12 from guest memory since it is not already * cached. */ - memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE); - kvm_vcpu_unmap(vcpu, &map, false); + if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12, + VMCS12_SIZE)) { + return nested_vmx_fail(vcpu, + VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); + } set_current_vmptr(vmx, vmptr); } @@ -5379,7 +5373,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) struct { u64 eptp, gpa; } operand; - int i, r; + int i, r, gpr_index; if (!(vmx->nested.msrs.secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || @@ -5392,7 +5386,8 @@ static int handle_invept(struct kvm_vcpu *vcpu) return 1; vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf); + gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); + type = kvm_register_read(vcpu, gpr_index); types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; @@ -5459,7 +5454,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) u64 gla; } operand; u16 vpid02; - int r; + int r, gpr_index; if (!(vmx->nested.msrs.secondary_ctls_high & SECONDARY_EXEC_ENABLE_VPID) || @@ -5472,7 +5467,8 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) return 1; vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf); + gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); + type = kvm_register_read(vcpu, gpr_index); types = (vmx->nested.msrs.vpid_caps & VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index b8e0d21b7c8a..1b7456b2177b 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -118,16 +118,15 @@ static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) } } -/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */ -static int intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) +static bool intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); bool fixed = idx & (1u << 30); idx &= ~(3u << 30); - return (!fixed && idx >= pmu->nr_arch_gp_counters) || - (fixed && idx >= pmu->nr_arch_fixed_counters); + return fixed ? idx < pmu->nr_arch_fixed_counters + : idx < pmu->nr_arch_gp_counters; } static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 76861b66bbcf..ba66c171d951 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -769,24 +769,13 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu) /* * Check if MSR is intercepted for currently loaded MSR bitmap. */ -static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) +static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr) { - unsigned long *msr_bitmap; - int f = sizeof(unsigned long); - - if (!cpu_has_vmx_msr_bitmap()) + if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS)) return true; - msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap; - - if (msr <= 0x1fff) { - return !!test_bit(msr, msr_bitmap + 0x800 / f); - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - return !!test_bit(msr, msr_bitmap + 0xc00 / f); - } - - return true; + return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, + MSR_IA32_SPEC_CTRL); } static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, @@ -3697,46 +3686,6 @@ void free_vpid(int vpid) spin_unlock(&vmx_vpid_lock); } -static void vmx_clear_msr_bitmap_read(ulong *msr_bitmap, u32 msr) -{ - int f = sizeof(unsigned long); - - if (msr <= 0x1fff) - __clear_bit(msr, msr_bitmap + 0x000 / f); - else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) - __clear_bit(msr & 0x1fff, msr_bitmap + 0x400 / f); -} - -static void vmx_clear_msr_bitmap_write(ulong *msr_bitmap, u32 msr) -{ - int f = sizeof(unsigned long); - - if (msr <= 0x1fff) - __clear_bit(msr, msr_bitmap + 0x800 / f); - else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) - __clear_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f); -} - -static void vmx_set_msr_bitmap_read(ulong *msr_bitmap, u32 msr) -{ - int f = sizeof(unsigned long); - - if (msr <= 0x1fff) - __set_bit(msr, msr_bitmap + 0x000 / f); - else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) - __set_bit(msr & 0x1fff, msr_bitmap + 0x400 / f); -} - -static void vmx_set_msr_bitmap_write(ulong *msr_bitmap, u32 msr) -{ - int f = sizeof(unsigned long); - - if (msr <= 0x1fff) - __set_bit(msr, msr_bitmap + 0x800 / f); - else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) - __set_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f); -} - void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -5494,6 +5443,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) u64 pcid; u64 gla; } operand; + int gpr_index; if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { kvm_queue_exception(vcpu, UD_VECTOR); @@ -5501,12 +5451,8 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) } vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf); - - if (type > 3) { - kvm_inject_gp(vcpu, 0); - return 1; - } + gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); + type = kvm_register_read(vcpu, gpr_index); /* According to the Intel instruction reference, the memory operand * is read even if it isn't needed (e.g., for type==all) @@ -6749,7 +6695,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) * If the L02 MSR bitmap does not intercept the MSR, then we need to * save it. */ - if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) + if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))) vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0); @@ -7563,7 +7509,8 @@ static void hardware_unsetup(void) static bool vmx_check_apicv_inhibit_reasons(ulong bit) { ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | - BIT(APICV_INHIBIT_REASON_HYPERV); + BIT(APICV_INHIBIT_REASON_HYPERV) | + BIT(APICV_INHIBIT_REASON_BLOCKIRQ); return supported & BIT(bit); } diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index e7db42e3b0ce..4df2ac24ffc1 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -142,6 +142,16 @@ struct nested_vmx { struct vmcs12 *cached_shadow_vmcs12; /* + * GPA to HVA cache for accessing vmcs12->vmcs_link_pointer + */ + struct gfn_to_hva_cache shadow_vmcs12_cache; + + /* + * GPA to HVA cache for VMCS12 + */ + struct gfn_to_hva_cache vmcs12_cache; + + /* * Indicates if the shadow vmcs or enlightened vmcs must be updated * with the data held by struct vmcs12. */ @@ -400,6 +410,34 @@ static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu); +/* + * Note, early Intel manuals have the write-low and read-high bitmap offsets + * the wrong way round. The bitmaps control MSRs 0x00000000-0x00001fff and + * 0xc0000000-0xc0001fff. The former (low) uses bytes 0-0x3ff for reads and + * 0x800-0xbff for writes. The latter (high) uses 0x400-0x7ff for reads and + * 0xc00-0xfff for writes. MSRs not covered by either of the ranges always + * VM-Exit. + */ +#define __BUILD_VMX_MSR_BITMAP_HELPER(rtype, action, bitop, access, base) \ +static inline rtype vmx_##action##_msr_bitmap_##access(unsigned long *bitmap, \ + u32 msr) \ +{ \ + int f = sizeof(unsigned long); \ + \ + if (msr <= 0x1fff) \ + return bitop##_bit(msr, bitmap + base / f); \ + else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) \ + return bitop##_bit(msr & 0x1fff, bitmap + (base + 0x400) / f); \ + return (rtype)true; \ +} +#define BUILD_VMX_MSR_BITMAP_HELPERS(ret_type, action, bitop) \ + __BUILD_VMX_MSR_BITMAP_HELPER(ret_type, action, bitop, read, 0x0) \ + __BUILD_VMX_MSR_BITMAP_HELPER(ret_type, action, bitop, write, 0x800) + +BUILD_VMX_MSR_BITMAP_HELPERS(bool, test, test) +BUILD_VMX_MSR_BITMAP_HELPERS(void, clear, __clear) +BUILD_VMX_MSR_BITMAP_HELPERS(void, set, __set) + static inline u8 vmx_get_rvi(void) { return vmcs_read16(GUEST_INTR_STATUS) & 0xff; @@ -522,4 +560,9 @@ static inline bool vmx_guest_state_valid(struct kvm_vcpu *vcpu) void dump_vmcs(struct kvm_vcpu *vcpu); +static inline int vmx_get_instr_info_reg2(u32 vmx_instr_info) +{ + return (vmx_instr_info >> 28) & 0xf; +} + #endif /* __KVM_X86_VMX_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c1c4e2b05a63..5a403d92833f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3260,8 +3260,11 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) static void record_steal_time(struct kvm_vcpu *vcpu) { - struct kvm_host_map map; - struct kvm_steal_time *st; + struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache; + struct kvm_steal_time __user *st; + struct kvm_memslots *slots; + u64 steal; + u32 version; if (kvm_xen_msr_enabled(vcpu->kvm)) { kvm_xen_runstate_set_running(vcpu); @@ -3271,47 +3274,86 @@ static void record_steal_time(struct kvm_vcpu *vcpu) if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; - /* -EAGAIN is returned in atomic context so we can just return. */ - if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, - &map, &vcpu->arch.st.cache, false)) + if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm)) return; - st = map.hva + - offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); + slots = kvm_memslots(vcpu->kvm); + + if (unlikely(slots->generation != ghc->generation || + kvm_is_error_hva(ghc->hva) || !ghc->memslot)) { + gfn_t gfn = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS; + + /* We rely on the fact that it fits in a single page. */ + BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS); + + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gfn, sizeof(*st)) || + kvm_is_error_hva(ghc->hva) || !ghc->memslot) + return; + } + st = (struct kvm_steal_time __user *)ghc->hva; /* * Doing a TLB flush here, on the guest's behalf, can avoid * expensive IPIs. */ if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) { - u8 st_preempted = xchg(&st->preempted, 0); + u8 st_preempted = 0; + int err = -EFAULT; + + if (!user_access_begin(st, sizeof(*st))) + return; + + asm volatile("1: xchgb %0, %2\n" + "xor %1, %1\n" + "2:\n" + _ASM_EXTABLE_UA(1b, 2b) + : "+q" (st_preempted), + "+&r" (err), + "+m" (st->preempted)); + if (err) + goto out; + + user_access_end(); + + vcpu->arch.st.preempted = 0; trace_kvm_pv_tlb_flush(vcpu->vcpu_id, st_preempted & KVM_VCPU_FLUSH_TLB); if (st_preempted & KVM_VCPU_FLUSH_TLB) kvm_vcpu_flush_tlb_guest(vcpu); + + if (!user_access_begin(st, sizeof(*st))) + goto dirty; } else { - st->preempted = 0; - } + if (!user_access_begin(st, sizeof(*st))) + return; - vcpu->arch.st.preempted = 0; + unsafe_put_user(0, &st->preempted, out); + vcpu->arch.st.preempted = 0; + } - if (st->version & 1) - st->version += 1; /* first time write, random junk */ + unsafe_get_user(version, &st->version, out); + if (version & 1) + version += 1; /* first time write, random junk */ - st->version += 1; + version += 1; + unsafe_put_user(version, &st->version, out); smp_wmb(); - st->steal += current->sched_info.run_delay - + unsafe_get_user(steal, &st->steal, out); + steal += current->sched_info.run_delay - vcpu->arch.st.last_steal; vcpu->arch.st.last_steal = current->sched_info.run_delay; + unsafe_put_user(steal, &st->steal, out); - smp_wmb(); + version += 1; + unsafe_put_user(version, &st->version, out); - st->version += 1; - - kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false); + out: + user_access_end(); + dirty: + mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); } int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) @@ -3517,7 +3559,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) return 1; - if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8))) + if (kvm_lapic_set_pv_eoi(vcpu, data, sizeof(u8))) return 1; break; @@ -4137,7 +4179,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = !static_call(kvm_x86_cpu_has_accelerated_tpr)(); break; case KVM_CAP_NR_VCPUS: - r = KVM_SOFT_MAX_VCPUS; + r = min_t(unsigned int, num_online_cpus(), KVM_MAX_VCPUS); break; case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; @@ -4351,8 +4393,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) { - struct kvm_host_map map; - struct kvm_steal_time *st; + struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache; + struct kvm_steal_time __user *st; + struct kvm_memslots *slots; + static const u8 preempted = KVM_VCPU_PREEMPTED; if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; @@ -4360,16 +4404,23 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) if (vcpu->arch.st.preempted) return; - if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map, - &vcpu->arch.st.cache, true)) + /* This happens on process exit */ + if (unlikely(current->mm != vcpu->kvm->mm)) + return; + + slots = kvm_memslots(vcpu->kvm); + + if (unlikely(slots->generation != ghc->generation || + kvm_is_error_hva(ghc->hva) || !ghc->memslot)) return; - st = map.hva + - offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); + st = (struct kvm_steal_time __user *)ghc->hva; + BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted)); - st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED; + if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted))) + vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED; - kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true); + mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) @@ -5728,6 +5779,12 @@ split_irqchip_unlock: if (kvm_x86_ops.vm_copy_enc_context_from) r = kvm_x86_ops.vm_copy_enc_context_from(kvm, cap->args[0]); return r; + case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM: + r = -EINVAL; + if (kvm_x86_ops.vm_move_enc_context_from) + r = kvm_x86_ops.vm_move_enc_context_from( + kvm, cap->args[0]); + return r; case KVM_CAP_EXIT_HYPERCALL: if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) { r = -EINVAL; @@ -7328,7 +7385,9 @@ static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase) static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt, u32 pmc) { - return kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc); + if (kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc)) + return 0; + return -EINVAL; } static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, @@ -8789,7 +8848,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) trace_kvm_hypercall(nr, a0, a1, a2, a3); - op_64_bit = is_64_bit_mode(vcpu); + op_64_bit = is_64_bit_hypercall(vcpu); if (!op_64_bit) { nr &= 0xFFFFFFFF; a0 &= 0xFFFFFFFF; @@ -9488,12 +9547,16 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu) if (!kvm_apic_hw_enabled(vcpu->arch.apic)) return; - if (to_hv_vcpu(vcpu)) + if (to_hv_vcpu(vcpu)) { bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors, to_hv_synic(vcpu)->vec_bitmap, 256); + static_call(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap); + return; + } - static_call(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap); + static_call(kvm_x86_load_eoi_exitmap)( + vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors); } void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, @@ -9552,7 +9615,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_request_pending(vcpu)) { - if (kvm_check_request(KVM_REQ_VM_BUGGED, vcpu)) { + if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) { r = -EIO; goto out; } @@ -10564,6 +10627,24 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return ret; } +static void kvm_arch_vcpu_guestdbg_update_apicv_inhibit(struct kvm *kvm) +{ + bool inhibit = false; + struct kvm_vcpu *vcpu; + int i; + + down_write(&kvm->arch.apicv_update_lock); + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ) { + inhibit = true; + break; + } + } + __kvm_request_apicv_update(kvm, !inhibit, APICV_INHIBIT_REASON_BLOCKIRQ); + up_write(&kvm->arch.apicv_update_lock); +} + int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { @@ -10616,6 +10697,8 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, static_call(kvm_x86_update_exception_bitmap)(vcpu); + kvm_arch_vcpu_guestdbg_update_apicv_inhibit(vcpu->kvm); + r = 0; out: @@ -10859,11 +10942,8 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { - struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache; int idx; - kvm_release_pfn(cache->pfn, cache->dirty, cache); - kvmclock_reset(vcpu); static_call(kvm_x86_vcpu_free)(vcpu); @@ -12275,7 +12355,8 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) return kvm_skip_emulated_instruction(vcpu); default: - BUG(); /* We have already checked above that type <= 3 */ + kvm_inject_gp(vcpu, 0); + return 1; } } EXPORT_SYMBOL_GPL(kvm_handle_invpcid); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index ea264c4502e4..997669ae9caa 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -153,12 +153,24 @@ static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu) { int cs_db, cs_l; + WARN_ON_ONCE(vcpu->arch.guest_state_protected); + if (!is_long_mode(vcpu)) return false; static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); return cs_l; } +static inline bool is_64_bit_hypercall(struct kvm_vcpu *vcpu) +{ + /* + * If running with protected guest state, the CS register is not + * accessible. The hypercall register values will have had to been + * provided in 64-bit mode, so assume the guest is in 64-bit. + */ + return vcpu->arch.guest_state_protected || is_64_bit_mode(vcpu); +} + static inline bool x86_exception_has_error_code(unsigned int vector) { static u32 exception_has_error_code = BIT(DF_VECTOR) | BIT(TS_VECTOR) | diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 8f62baebd028..dff2bdf9507a 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -127,9 +127,9 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) state_entry_time = vx->runstate_entry_time; state_entry_time |= XEN_RUNSTATE_UPDATE; - BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->state_entry_time) != + BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) != sizeof(state_entry_time)); - BUILD_BUG_ON(sizeof(((struct compat_vcpu_runstate_info *)0)->state_entry_time) != + BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) != sizeof(state_entry_time)); if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache, @@ -144,9 +144,9 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) */ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != offsetof(struct compat_vcpu_runstate_info, state)); - BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->state) != + BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state) != sizeof(vx->current_runstate)); - BUILD_BUG_ON(sizeof(((struct compat_vcpu_runstate_info *)0)->state) != + BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) != sizeof(vx->current_runstate)); if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache, @@ -163,9 +163,9 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) offsetof(struct vcpu_runstate_info, time) - sizeof(u64)); BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) != offsetof(struct compat_vcpu_runstate_info, time) - sizeof(u64)); - BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->time) != - sizeof(((struct compat_vcpu_runstate_info *)0)->time)); - BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->time) != + BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) != + sizeof_field(struct compat_vcpu_runstate_info, time)); + BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) != sizeof(vx->runstate_times)); if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache, @@ -205,9 +205,9 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) != offsetof(struct compat_vcpu_info, evtchn_upcall_pending)); BUILD_BUG_ON(sizeof(rc) != - sizeof(((struct vcpu_info *)0)->evtchn_upcall_pending)); + sizeof_field(struct vcpu_info, evtchn_upcall_pending)); BUILD_BUG_ON(sizeof(rc) != - sizeof(((struct compat_vcpu_info *)0)->evtchn_upcall_pending)); + sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending)); /* * For efficiency, this mirrors the checks for using the valid @@ -299,7 +299,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) break; case KVM_XEN_ATTR_TYPE_SHARED_INFO: - data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_gfn); + data->u.shared_info.gfn = kvm->arch.xen.shinfo_gfn; r = 0; break; @@ -698,7 +698,7 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu) kvm_hv_hypercall_enabled(vcpu)) return kvm_hv_hypercall(vcpu); - longmode = is_64_bit_mode(vcpu); + longmode = is_64_bit_hypercall(vcpu); if (!longmode) { params[0] = (u32)kvm_rbx_read(vcpu); params[1] = (u32)kvm_rcx_read(vcpu); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 23a14d82e783..1895986842b9 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -618,7 +618,7 @@ static void __init memory_map_top_down(unsigned long map_start, */ addr = memblock_phys_alloc_range(PMD_SIZE, PMD_SIZE, map_start, map_end); - memblock_free(addr, PMD_SIZE); + memblock_phys_free(addr, PMD_SIZE); real_end = addr + PMD_SIZE; /* step_size need to be small so pgt_buf from BRK could cover it */ diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index bd90b8fe81e4..d4e2648a1dfb 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -238,11 +238,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base) } } -/* - * The <linux/kallsyms.h> already defines is_kernel_text, - * using '__' prefix not to get in conflict. - */ -static inline int __is_kernel_text(unsigned long addr) +static inline int is_x86_32_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end) return 1; @@ -333,8 +329,8 @@ repeat: addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1; - if (__is_kernel_text(addr) || - __is_kernel_text(addr2)) + if (is_x86_32_kernel_text(addr) || + is_x86_32_kernel_text(addr2)) prot = PAGE_KERNEL_LARGE_EXEC; pages_2m++; @@ -359,7 +355,7 @@ repeat: */ pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR); - if (__is_kernel_text(addr)) + if (is_x86_32_kernel_text(addr)) prot = PAGE_KERNEL_EXEC; pages_4k++; @@ -779,37 +775,6 @@ void __init mem_init(void) test_wp_bit(); } -#ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, - struct mhp_params *params) -{ - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long nr_pages = size >> PAGE_SHIFT; - int ret; - - /* - * The page tables were already mapped at boot so if the caller - * requests a different mapping type then we must change all the - * pages with __set_memory_prot(). - */ - if (params->pgprot.pgprot != PAGE_KERNEL.pgprot) { - ret = __set_memory_prot(start, nr_pages, params->pgprot); - if (ret) - return ret; - } - - return __add_pages(nid, start_pfn, nr_pages, params); -} - -void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) -{ - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long nr_pages = size >> PAGE_SHIFT; - - __remove_pages(start_pfn, nr_pages, altmap); -} -#endif - int kernel_set_to_readonly __read_mostly; static void mark_nxdata_nx(void) @@ -820,7 +785,7 @@ static void mark_nxdata_nx(void) */ unsigned long start = PFN_ALIGN(_etext); /* - * This comes from __is_kernel_text upper limit. Also HPAGE where used: + * This comes from is_x86_32_kernel_text upper limit. Also HPAGE where used: */ unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start; diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index ef885370719a..e7b9b464a82f 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -49,7 +49,7 @@ static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr, p = early_alloc(PMD_SIZE, nid, false); if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL)) return; - memblock_free_ptr(p, PMD_SIZE); + memblock_free(p, PMD_SIZE); } p = early_alloc(PAGE_SIZE, nid, true); @@ -85,7 +85,7 @@ static void __init kasan_populate_pud(pud_t *pud, unsigned long addr, p = early_alloc(PUD_SIZE, nid, false); if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL)) return; - memblock_free_ptr(p, PUD_SIZE); + memblock_free(p, PUD_SIZE); } p = early_alloc(PAGE_SIZE, nid, true); diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 23d54b810f08..35487305d8af 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -229,28 +229,75 @@ void __init sev_setup_arch(void) swiotlb_adjust_size(size); } -static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc) +static unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot) { - pgprot_t old_prot, new_prot; - unsigned long pfn, pa, size; - pte_t new_pte; + unsigned long pfn = 0; + pgprot_t prot; switch (level) { case PG_LEVEL_4K: pfn = pte_pfn(*kpte); - old_prot = pte_pgprot(*kpte); + prot = pte_pgprot(*kpte); break; case PG_LEVEL_2M: pfn = pmd_pfn(*(pmd_t *)kpte); - old_prot = pmd_pgprot(*(pmd_t *)kpte); + prot = pmd_pgprot(*(pmd_t *)kpte); break; case PG_LEVEL_1G: pfn = pud_pfn(*(pud_t *)kpte); - old_prot = pud_pgprot(*(pud_t *)kpte); + prot = pud_pgprot(*(pud_t *)kpte); break; default: - return; + WARN_ONCE(1, "Invalid level for kpte\n"); + return 0; + } + + if (ret_prot) + *ret_prot = prot; + + return pfn; +} + +void notify_range_enc_status_changed(unsigned long vaddr, int npages, bool enc) +{ +#ifdef CONFIG_PARAVIRT + unsigned long sz = npages << PAGE_SHIFT; + unsigned long vaddr_end = vaddr + sz; + + while (vaddr < vaddr_end) { + int psize, pmask, level; + unsigned long pfn; + pte_t *kpte; + + kpte = lookup_address(vaddr, &level); + if (!kpte || pte_none(*kpte)) { + WARN_ONCE(1, "kpte lookup for vaddr\n"); + return; + } + + pfn = pg_level_to_pfn(level, kpte, NULL); + if (!pfn) + continue; + + psize = page_level_size(level); + pmask = page_level_mask(level); + + notify_page_enc_status_changed(pfn, psize >> PAGE_SHIFT, enc); + + vaddr = (vaddr & pmask) + psize; } +#endif +} + +static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc) +{ + pgprot_t old_prot, new_prot; + unsigned long pfn, pa, size; + pte_t new_pte; + + pfn = pg_level_to_pfn(level, kpte, &old_prot); + if (!pfn) + return; new_prot = old_prot; if (enc) @@ -286,12 +333,13 @@ static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc) static int __init early_set_memory_enc_dec(unsigned long vaddr, unsigned long size, bool enc) { - unsigned long vaddr_end, vaddr_next; + unsigned long vaddr_end, vaddr_next, start; unsigned long psize, pmask; int split_page_size_mask; int level, ret; pte_t *kpte; + start = vaddr; vaddr_next = vaddr; vaddr_end = vaddr + size; @@ -346,6 +394,7 @@ static int __init early_set_memory_enc_dec(unsigned long vaddr, ret = 0; + notify_range_enc_status_changed(start, PAGE_ALIGN(size) >> PAGE_SHIFT, enc); out: __flush_tlb_all(); return ret; @@ -361,6 +410,11 @@ int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size) return early_set_memory_enc_dec(vaddr, size, true); } +void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, bool enc) +{ + notify_range_enc_status_changed(vaddr, npages, enc); +} + /* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */ bool force_dma_unencrypted(struct device *dev) { diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 1e9b93b088db..c6b1213086d6 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -355,7 +355,7 @@ void __init numa_reset_distance(void) /* numa_distance could be 1LU marking allocation failure, test cnt */ if (numa_distance_cnt) - memblock_free_ptr(numa_distance, size); + memblock_free(numa_distance, size); numa_distance_cnt = 0; numa_distance = NULL; /* enable table creation */ } diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index e801e30089c4..1a02b791d273 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -517,7 +517,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) } /* free the copied physical distance table */ - memblock_free_ptr(phys_dist, phys_size); + memblock_free(phys_dist, phys_size); return; no_emu: diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 934dc5b2df36..b4072115c8ef 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -2023,6 +2023,12 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc) */ cpa_flush(&cpa, 0); + /* + * Notify hypervisor that a given memory range is mapped encrypted + * or decrypted. + */ + notify_range_enc_status_changed(addr, numpages, enc); + return ret; } diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 3507f456fcd0..9e1e6b8d8876 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -632,7 +632,7 @@ static void set_dev_domain_options(struct pci_dev *pdev) pdev->hotplug_user_indicators = 1; } -int pcibios_add_device(struct pci_dev *dev) +int pcibios_device_add(struct pci_dev *dev) { struct pci_setup_rom *rom; struct irq_domain *msidom; diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 5debe4ac6f81..12da00558631 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -23,6 +23,7 @@ #include <xen/features.h> #include <xen/events.h> +#include <xen/pci.h> #include <asm/xen/pci.h> #include <asm/xen/cpuid.h> #include <asm/apic.h> @@ -585,78 +586,3 @@ int __init pci_xen_initial_domain(void) } #endif -#ifdef CONFIG_XEN_DOM0 - -struct xen_device_domain_owner { - domid_t domain; - struct pci_dev *dev; - struct list_head list; -}; - -static DEFINE_SPINLOCK(dev_domain_list_spinlock); -static struct list_head dev_domain_list = LIST_HEAD_INIT(dev_domain_list); - -static struct xen_device_domain_owner *find_device(struct pci_dev *dev) -{ - struct xen_device_domain_owner *owner; - - list_for_each_entry(owner, &dev_domain_list, list) { - if (owner->dev == dev) - return owner; - } - return NULL; -} - -int xen_find_device_domain_owner(struct pci_dev *dev) -{ - struct xen_device_domain_owner *owner; - int domain = -ENODEV; - - spin_lock(&dev_domain_list_spinlock); - owner = find_device(dev); - if (owner) - domain = owner->domain; - spin_unlock(&dev_domain_list_spinlock); - return domain; -} -EXPORT_SYMBOL_GPL(xen_find_device_domain_owner); - -int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain) -{ - struct xen_device_domain_owner *owner; - - owner = kzalloc(sizeof(struct xen_device_domain_owner), GFP_KERNEL); - if (!owner) - return -ENODEV; - - spin_lock(&dev_domain_list_spinlock); - if (find_device(dev)) { - spin_unlock(&dev_domain_list_spinlock); - kfree(owner); - return -EEXIST; - } - owner->domain = domain; - owner->dev = dev; - list_add_tail(&owner->list, &dev_domain_list); - spin_unlock(&dev_domain_list_spinlock); - return 0; -} -EXPORT_SYMBOL_GPL(xen_register_device_domain_owner); - -int xen_unregister_device_domain_owner(struct pci_dev *dev) -{ - struct xen_device_domain_owner *owner; - - spin_lock(&dev_domain_list_spinlock); - owner = find_device(dev); - if (!owner) { - spin_unlock(&dev_domain_list_spinlock); - return -ENODEV; - } - list_del(&owner->list); - spin_unlock(&dev_domain_list_spinlock); - kfree(owner); - return 0; -} -EXPORT_SYMBOL_GPL(xen_unregister_device_domain_owner); -#endif /* CONFIG_XEN_DOM0 */ diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 95d970359e17..30c6e986a6cd 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -31,25 +31,10 @@ EXPORT_SYMBOL_GPL(hypercall_page); * Pointer to the xen_vcpu_info structure or * &HYPERVISOR_shared_info->vcpu_info[cpu]. See xen_hvm_init_shared_info * and xen_vcpu_setup for details. By default it points to share_info->vcpu_info - * but if the hypervisor supports VCPUOP_register_vcpu_info then it can point - * to xen_vcpu_info. The pointer is used in __xen_evtchn_do_upcall to - * acknowledge pending events. - * Also more subtly it is used by the patched version of irq enable/disable - * e.g. xen_irq_enable_direct and xen_iret in PV mode. - * - * The desire to be able to do those mask/unmask operations as a single - * instruction by using the per-cpu offset held in %gs is the real reason - * vcpu info is in a per-cpu pointer and the original reason for this - * hypercall. - * + * but during boot it is switched to point to xen_vcpu_info. + * The pointer is used in __xen_evtchn_do_upcall to acknowledge pending events. */ DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); - -/* - * Per CPU pages used if hypervisor supports VCPUOP_register_vcpu_info - * hypercall. This can be used both in PV and PVHVM mode. The structure - * overrides the default per_cpu(xen_vcpu, cpu) value. - */ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); /* Linux <-> Xen vCPU id mapping */ @@ -84,21 +69,6 @@ EXPORT_SYMBOL(xen_start_flags); */ struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info; -/* - * Flag to determine whether vcpu info placement is available on all - * VCPUs. We assume it is to start with, and then set it to zero on - * the first failure. This is because it can succeed on some VCPUs - * and not others, since it can involve hypervisor memory allocation, - * or because the guest failed to guarantee all the appropriate - * constraints on all VCPUs (ie buffer can't cross a page boundary). - * - * Note that any particular CPU may be using a placed vcpu structure, - * but we can only optimise if the all are. - * - * 0: not available, 1: available - */ -int xen_have_vcpu_info_placement = 1; - static int xen_cpu_up_online(unsigned int cpu) { xen_init_lock_cpu(cpu); @@ -124,10 +94,8 @@ int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int), return rc >= 0 ? 0 : rc; } -static int xen_vcpu_setup_restore(int cpu) +static void xen_vcpu_setup_restore(int cpu) { - int rc = 0; - /* Any per_cpu(xen_vcpu) is stale, so reset it */ xen_vcpu_info_reset(cpu); @@ -136,11 +104,8 @@ static int xen_vcpu_setup_restore(int cpu) * be handled by hotplug. */ if (xen_pv_domain() || - (xen_hvm_domain() && cpu_online(cpu))) { - rc = xen_vcpu_setup(cpu); - } - - return rc; + (xen_hvm_domain() && cpu_online(cpu))) + xen_vcpu_setup(cpu); } /* @@ -150,7 +115,7 @@ static int xen_vcpu_setup_restore(int cpu) */ void xen_vcpu_restore(void) { - int cpu, rc; + int cpu; for_each_possible_cpu(cpu) { bool other_cpu = (cpu != smp_processor_id()); @@ -170,20 +135,9 @@ void xen_vcpu_restore(void) if (xen_pv_domain() || xen_feature(XENFEAT_hvm_safe_pvclock)) xen_setup_runstate_info(cpu); - rc = xen_vcpu_setup_restore(cpu); - if (rc) - pr_emerg_once("vcpu restore failed for cpu=%d err=%d. " - "System will hang.\n", cpu, rc); - /* - * In case xen_vcpu_setup_restore() fails, do not bring up the - * VCPU. This helps us avoid the resulting OOPS when the VCPU - * accesses pvclock_vcpu_time via xen_vcpu (which is NULL.) - * Note that this does not improve the situation much -- now the - * VM hangs instead of OOPSing -- with the VCPUs that did not - * fail, spinning in stop_machine(), waiting for the failed - * VCPUs to come up. - */ - if (other_cpu && is_up && (rc == 0) && + xen_vcpu_setup_restore(cpu); + + if (other_cpu && is_up && HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL)) BUG(); } @@ -200,7 +154,7 @@ void xen_vcpu_info_reset(int cpu) } } -int xen_vcpu_setup(int cpu) +void xen_vcpu_setup(int cpu) { struct vcpu_register_vcpu_info info; int err; @@ -221,44 +175,26 @@ int xen_vcpu_setup(int cpu) */ if (xen_hvm_domain()) { if (per_cpu(xen_vcpu, cpu) == &per_cpu(xen_vcpu_info, cpu)) - return 0; + return; } - if (xen_have_vcpu_info_placement) { - vcpup = &per_cpu(xen_vcpu_info, cpu); - info.mfn = arbitrary_virt_to_mfn(vcpup); - info.offset = offset_in_page(vcpup); - - /* - * Check to see if the hypervisor will put the vcpu_info - * structure where we want it, which allows direct access via - * a percpu-variable. - * N.B. This hypercall can _only_ be called once per CPU. - * Subsequent calls will error out with -EINVAL. This is due to - * the fact that hypervisor has no unregister variant and this - * hypercall does not allow to over-write info.mfn and - * info.offset. - */ - err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, - xen_vcpu_nr(cpu), &info); - - if (err) { - pr_warn_once("register_vcpu_info failed: cpu=%d err=%d\n", - cpu, err); - xen_have_vcpu_info_placement = 0; - } else { - /* - * This cpu is using the registered vcpu info, even if - * later ones fail to. - */ - per_cpu(xen_vcpu, cpu) = vcpup; - } - } + vcpup = &per_cpu(xen_vcpu_info, cpu); + info.mfn = arbitrary_virt_to_mfn(vcpup); + info.offset = offset_in_page(vcpup); - if (!xen_have_vcpu_info_placement) - xen_vcpu_info_reset(cpu); + /* + * N.B. This hypercall can _only_ be called once per CPU. + * Subsequent calls will error out with -EINVAL. This is due to + * the fact that hypervisor has no unregister variant and this + * hypercall does not allow to over-write info.mfn and + * info.offset. + */ + err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, xen_vcpu_nr(cpu), + &info); + if (err) + panic("register_vcpu_info failed: cpu=%d err=%d\n", cpu, err); - return ((per_cpu(xen_vcpu, cpu) == NULL) ? -ENODEV : 0); + per_cpu(xen_vcpu, cpu) = vcpup; } void __init xen_banner(void) diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index e68ea5f4ad1c..42300941ec29 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -163,9 +163,9 @@ static int xen_cpu_up_prepare_hvm(unsigned int cpu) per_cpu(xen_vcpu_id, cpu) = cpu_acpi_id(cpu); else per_cpu(xen_vcpu_id, cpu) = cpu; - rc = xen_vcpu_setup(cpu); - if (rc || !xen_have_vector_callback) - return rc; + xen_vcpu_setup(cpu); + if (!xen_have_vector_callback) + return 0; if (xen_feature(XENFEAT_hvm_safe_pvclock)) xen_setup_timer(cpu); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 4f63117f09bb..5004feb16783 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -27,7 +27,6 @@ #include <linux/export.h> #include <linux/mm.h> #include <linux/page-flags.h> -#include <linux/highmem.h> #include <linux/pci.h> #include <linux/gfp.h> #include <linux/edd.h> @@ -993,31 +992,13 @@ void __init xen_setup_vcpu_info_placement(void) for_each_possible_cpu(cpu) { /* Set up direct vCPU id mapping for PV guests. */ per_cpu(xen_vcpu_id, cpu) = cpu; - - /* - * xen_vcpu_setup(cpu) can fail -- in which case it - * falls back to the shared_info version for cpus - * where xen_vcpu_nr(cpu) < MAX_VIRT_CPUS. - * - * xen_cpu_up_prepare_pv() handles the rest by failing - * them in hotplug. - */ - (void) xen_vcpu_setup(cpu); + xen_vcpu_setup(cpu); } - /* - * xen_vcpu_setup managed to place the vcpu_info within the - * percpu area for all cpus, so make use of it. - */ - if (xen_have_vcpu_info_placement) { - pv_ops.irq.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); - pv_ops.irq.irq_disable = - __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); - pv_ops.irq.irq_enable = - __PV_IS_CALLEE_SAVE(xen_irq_enable_direct); - pv_ops.mmu.read_cr2 = - __PV_IS_CALLEE_SAVE(xen_read_cr2_direct); - } + pv_ops.irq.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); + pv_ops.irq.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); + pv_ops.irq.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct); + pv_ops.mmu.read_cr2 = __PV_IS_CALLEE_SAVE(xen_read_cr2_direct); } static const struct pv_info xen_info __initconst = { @@ -1247,12 +1228,6 @@ asmlinkage __visible void __init xen_start_kernel(void) __supported_pte_mask &= ~_PAGE_GLOBAL; __default_kernel_pte_mask &= ~_PAGE_GLOBAL; - /* - * Prevent page tables from being allocated in highmem, even - * if CONFIG_HIGHPTE is enabled. - */ - __userpte_alloc_gfp &= ~__GFP_HIGHMEM; - /* Get mfn list */ xen_build_dynamic_phys_to_machine(); diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 4fe387e520af..06c3c2fb4b06 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -24,60 +24,6 @@ noinstr void xen_force_evtchn_callback(void) (void)HYPERVISOR_xen_version(0, NULL); } -asmlinkage __visible noinstr unsigned long xen_save_fl(void) -{ - struct vcpu_info *vcpu; - unsigned long flags; - - vcpu = this_cpu_read(xen_vcpu); - - /* flag has opposite sense of mask */ - flags = !vcpu->evtchn_upcall_mask; - - /* convert to IF type flag - -0 -> 0x00000000 - -1 -> 0xffffffff - */ - return (-flags) & X86_EFLAGS_IF; -} -__PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl, ".noinstr.text"); - -asmlinkage __visible noinstr void xen_irq_disable(void) -{ - /* There's a one instruction preempt window here. We need to - make sure we're don't switch CPUs between getting the vcpu - pointer and updating the mask. */ - preempt_disable(); - this_cpu_read(xen_vcpu)->evtchn_upcall_mask = 1; - preempt_enable_no_resched(); -} -__PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable, ".noinstr.text"); - -asmlinkage __visible noinstr void xen_irq_enable(void) -{ - struct vcpu_info *vcpu; - - /* - * We may be preempted as soon as vcpu->evtchn_upcall_mask is - * cleared, so disable preemption to ensure we check for - * events on the VCPU we are still running on. - */ - preempt_disable(); - - vcpu = this_cpu_read(xen_vcpu); - vcpu->evtchn_upcall_mask = 0; - - /* Doesn't matter if we get preempted here, because any - pending event will get dealt with anyway. */ - - barrier(); /* unmask then check (avoid races) */ - if (unlikely(vcpu->evtchn_upcall_pending)) - xen_force_evtchn_callback(); - - preempt_enable(); -} -__PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable, ".noinstr.text"); - static void xen_safe_halt(void) { /* Blocking includes an implicit local_irq_enable(). */ @@ -96,10 +42,10 @@ static void xen_halt(void) static const typeof(pv_ops) xen_irq_ops __initconst = { .irq = { - - .save_fl = PV_CALLEE_SAVE(xen_save_fl), - .irq_disable = PV_CALLEE_SAVE(xen_irq_disable), - .irq_enable = PV_CALLEE_SAVE(xen_irq_enable), + /* Initial interrupt flag handling only called while interrupts off. */ + .save_fl = __PV_IS_CALLEE_SAVE(paravirt_ret0), + .irq_disable = __PV_IS_CALLEE_SAVE(paravirt_nop), + .irq_enable = __PV_IS_CALLEE_SAVE(paravirt_BUG), .safe_halt = xen_safe_halt, .halt = xen_halt, diff --git a/arch/x86/xen/mmu_hvm.c b/arch/x86/xen/mmu_hvm.c index 57409373750f..509bdee3ab90 100644 --- a/arch/x86/xen/mmu_hvm.c +++ b/arch/x86/xen/mmu_hvm.c @@ -9,39 +9,28 @@ #ifdef CONFIG_PROC_VMCORE /* - * This function is used in two contexts: - * - the kdump kernel has to check whether a pfn of the crashed kernel - * was a ballooned page. vmcore is using this function to decide - * whether to access a pfn of the crashed kernel. - * - the kexec kernel has to check whether a pfn was ballooned by the - * previous kernel. If the pfn is ballooned, handle it properly. - * Returns 0 if the pfn is not backed by a RAM page, the caller may + * The kdump kernel has to check whether a pfn of the crashed kernel + * was a ballooned page. vmcore is using this function to decide + * whether to access a pfn of the crashed kernel. + * Returns "false" if the pfn is not backed by a RAM page, the caller may * handle the pfn special in this case. */ -static int xen_oldmem_pfn_is_ram(unsigned long pfn) +static bool xen_vmcore_pfn_is_ram(struct vmcore_cb *cb, unsigned long pfn) { struct xen_hvm_get_mem_type a = { .domid = DOMID_SELF, .pfn = pfn, }; - int ram; - if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a)) - return -ENXIO; - - switch (a.mem_type) { - case HVMMEM_mmio_dm: - ram = 0; - break; - case HVMMEM_ram_rw: - case HVMMEM_ram_ro: - default: - ram = 1; - break; + if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a)) { + pr_warn_once("Unexpected HVMOP_get_mem_type failure\n"); + return true; } - - return ram; + return a.mem_type != HVMMEM_mmio_dm; } +static struct vmcore_cb xen_vmcore_cb = { + .pfn_is_ram = xen_vmcore_pfn_is_ram, +}; #endif static void xen_hvm_exit_mmap(struct mm_struct *mm) @@ -75,6 +64,6 @@ void __init xen_hvm_init_mmu_ops(void) if (is_pagetable_dying_supported()) pv_ops.mmu.exit_mmap = xen_hvm_exit_mmap; #ifdef CONFIG_PROC_VMCORE - WARN_ON(register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram)); + register_vmcore_cb(&xen_vmcore_cb); #endif } diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 1ce436eeda15..00354866921b 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -41,7 +41,6 @@ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 */ #include <linux/sched/mm.h> -#include <linux/highmem.h> #include <linux/debugfs.h> #include <linux/bug.h> #include <linux/vmalloc.h> @@ -86,8 +85,10 @@ #include "mmu.h" #include "debugfs.h" +#ifdef CONFIG_X86_VSYSCALL_EMULATION /* l3 pud for userspace vsyscall mapping */ static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; +#endif /* * Protects atomic reservation decrease/increase against concurrent increases. @@ -241,9 +242,11 @@ static void xen_set_pmd(pmd_t *ptr, pmd_t val) * Associate a virtual page frame with a given physical page frame * and protection flags for that frame. */ -void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) +void __init set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) { - set_pte_vaddr(vaddr, mfn_pte(mfn, flags)); + if (HYPERVISOR_update_va_mapping(vaddr, mfn_pte(mfn, flags), + UVMF_INVLPG)) + BUG(); } static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval) @@ -789,7 +792,9 @@ static void __init xen_mark_pinned(struct mm_struct *mm, struct page *page, static void __init xen_after_bootmem(void) { static_branch_enable(&xen_struct_pages_ready); +#ifdef CONFIG_X86_VSYSCALL_EMULATION SetPagePinned(virt_to_page(level3_user_vsyscall)); +#endif xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP); } @@ -1025,7 +1030,7 @@ static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size) for (; vaddr < vaddr_end; vaddr += PAGE_SIZE) make_lowmem_page_readwrite(vaddr); - memblock_free(paddr, size); + memblock_phys_free(paddr, size); } static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin) @@ -1151,7 +1156,7 @@ static void __init xen_pagetable_p2m_free(void) xen_cleanhighmap(addr, addr + size); size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); - memblock_free(__pa(addr), size); + memblock_free((void *)addr, size); } else { xen_cleanmfnmap(addr); } @@ -1192,6 +1197,13 @@ static void __init xen_pagetable_p2m_setup(void) static void __init xen_pagetable_init(void) { + /* + * The majority of further PTE writes is to pagetables already + * announced as such to Xen. Hence it is more efficient to use + * hypercalls for these updates. + */ + pv_ops.mmu.set_pte = __xen_set_pte; + paging_init(); xen_post_allocator_init(); @@ -1421,10 +1433,18 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) * * Many of these PTE updates are done on unpinned and writable pages * and doing a hypercall for these is unnecessary and expensive. At - * this point it is not possible to tell if a page is pinned or not, - * so always write the PTE directly and rely on Xen trapping and + * this point it is rarely possible to tell if a page is pinned, so + * mostly write the PTE directly and rely on Xen trapping and * emulating any updates as necessary. */ +static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) +{ + if (unlikely(is_early_ioremap_ptep(ptep))) + __xen_set_pte(ptep, pte); + else + native_set_pte(ptep, pte); +} + __visible pte_t xen_make_pte_init(pteval_t pte) { unsigned long pfn; @@ -1446,11 +1466,6 @@ __visible pte_t xen_make_pte_init(pteval_t pte) } PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_init); -static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) -{ - __xen_set_pte(ptep, pte); -} - /* Early in boot, while setting up the initial pagetable, assume everything is pinned. */ static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) @@ -1750,7 +1765,6 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) set_page_prot(init_top_pgt, PAGE_KERNEL_RO); set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); - set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); @@ -1767,6 +1781,13 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) /* Unpin Xen-provided one */ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); +#ifdef CONFIG_X86_VSYSCALL_EMULATION + /* Pin user vsyscall L3 */ + set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, + PFN_DOWN(__pa_symbol(level3_user_vsyscall))); +#endif + /* * At this stage there can be no user pgd, and no page structure to * attach it to, so make sure we just set kernel pgd. @@ -1956,7 +1977,7 @@ void __init xen_relocate_p2m(void) pfn_end = p2m_pfn_end; } - memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn)); + memblock_phys_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn)); while (pfn < pfn_end) { if (pfn == p2m_pfn) { pfn = p2m_pfn_end; @@ -1999,6 +2020,7 @@ static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss; static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) { pte_t pte; + unsigned long vaddr; phys >>= PAGE_SHIFT; @@ -2039,15 +2061,15 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) break; } - __native_set_fixmap(idx, pte); + vaddr = __fix_to_virt(idx); + if (HYPERVISOR_update_va_mapping(vaddr, pte, UVMF_INVLPG)) + BUG(); #ifdef CONFIG_X86_VSYSCALL_EMULATION /* Replicate changes to map the vsyscall page into the user pagetable vsyscall mapping. */ - if (idx == VSYSCALL_PAGE) { - unsigned long vaddr = __fix_to_virt(idx); + if (idx == VSYSCALL_PAGE) set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte); - } #endif } diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 5e6e236977c7..58db86f7b384 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -197,7 +197,7 @@ static void * __ref alloc_p2m_page(void) static void __ref free_p2m_page(void *p) { if (unlikely(!slab_is_available())) { - memblock_free((unsigned long)p, PAGE_SIZE); + memblock_free(p, PAGE_SIZE); return; } diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 8bfc10330107..af216feb63d9 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -153,7 +153,7 @@ static void __init xen_del_extra_mem(unsigned long start_pfn, break; } } - memblock_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns)); + memblock_phys_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns)); } /* @@ -306,10 +306,6 @@ static void __init xen_update_mem_tables(unsigned long pfn, unsigned long mfn) BUG(); } - /* Update kernel mapping, but not for highmem. */ - if (pfn >= PFN_UP(__pa(high_memory - 1))) - return; - if (HYPERVISOR_update_va_mapping((unsigned long)__va(pfn << PAGE_SHIFT), mfn_pte(mfn, PAGE_KERNEL), 0)) { WARN(1, "Failed to update kernel mapping for mfn=%ld pfn=%ld\n", @@ -429,13 +425,13 @@ static unsigned long __init xen_set_identity_and_remap_chunk( } /* - * If the PFNs are currently mapped, the VA mapping also needs - * to be updated to be 1:1. + * If the PFNs are currently mapped, their VA mappings need to be + * zapped. */ for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) (void)HYPERVISOR_update_va_mapping( (unsigned long)__va(pfn << PAGE_SHIFT), - mfn_pte(pfn, PAGE_KERNEL_IO), 0); + native_make_pte(0), 0); return remap_pfn; } @@ -719,7 +715,7 @@ static void __init xen_reserve_xen_mfnlist(void) return; xen_relocate_p2m(); - memblock_free(start, size); + memblock_phys_free(start, size); } /** @@ -885,7 +881,7 @@ char * __init xen_memory_setup(void) xen_phys_memcpy(new_area, start, size); pr_info("initrd moved from [mem %#010llx-%#010llx] to [mem %#010llx-%#010llx]\n", start, start + size, new_area, new_area + size); - memblock_free(start, size); + memblock_phys_free(start, size); boot_params.hdr.ramdisk_image = new_area; boot_params.ext_ramdisk_image = new_area >> 32; } diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index c1b2f764b29a..c3e1f9a7d43a 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -121,34 +121,10 @@ int xen_smp_intr_init(unsigned int cpu) void __init xen_smp_cpus_done(unsigned int max_cpus) { - int cpu, rc, count = 0; - if (xen_hvm_domain()) native_smp_cpus_done(max_cpus); else calculate_max_logical_packages(); - - if (xen_have_vcpu_info_placement) - return; - - for_each_online_cpu(cpu) { - if (xen_vcpu_nr(cpu) < MAX_VIRT_CPUS) - continue; - - rc = remove_cpu(cpu); - - if (rc == 0) { - /* - * Reset vcpu_info so this cpu cannot be onlined again. - */ - xen_vcpu_info_reset(cpu); - count++; - } else { - pr_warn("%s: failed to bring CPU %d down, error %d\n", - __func__, cpu, rc); - } - } - WARN(count, "%s: brought %d CPUs offline\n", __func__, count); } void xen_smp_send_reschedule(int cpu) @@ -268,20 +244,16 @@ void xen_send_IPI_allbutself(int vector) static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id) { - irq_enter(); generic_smp_call_function_interrupt(); inc_irq_stat(irq_call_count); - irq_exit(); return IRQ_HANDLED; } static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id) { - irq_enter(); generic_smp_call_function_single_interrupt(); inc_irq_stat(irq_call_count); - irq_exit(); return IRQ_HANDLED; } diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 7ed56c6075b0..6a8f3b53ab83 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -225,7 +225,6 @@ static void __init xen_pv_smp_prepare_boot_cpu(void) static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus) { unsigned cpu; - unsigned int i; if (skip_ioapic_setup) { char *m = (max_cpus == 0) ? @@ -238,16 +237,9 @@ static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus) } xen_init_lock_cpu(0); - smp_store_boot_cpu_info(); - cpu_data(0).x86_max_cores = 1; + smp_prepare_cpus_common(); - for_each_possible_cpu(i) { - zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); - zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); - zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL); - zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); - } - set_cpu_sibling_map(0); + cpu_data(0).x86_max_cores = 1; speculative_store_bypass_ht_init(); @@ -458,10 +450,8 @@ static void xen_pv_stop_other_cpus(int wait) static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id) { - irq_enter(); irq_work_run(); inc_irq_stat(apic_irq_work_irqs); - irq_exit(); return IRQ_HANDLED; } diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 9e27b86a0c31..6a64496edefb 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -45,13 +45,13 @@ SYM_CODE_START(startup_xen) /* Clear .bss */ xor %eax,%eax - mov $__bss_start, %_ASM_DI - mov $__bss_stop, %_ASM_CX - sub %_ASM_DI, %_ASM_CX - shr $__ASM_SEL(2, 3), %_ASM_CX - rep __ASM_SIZE(stos) + mov $__bss_start, %rdi + mov $__bss_stop, %rcx + sub %rdi, %rcx + shr $3, %rcx + rep stosq - mov %_ASM_SI, xen_start_info + mov %rsi, xen_start_info mov initial_stack(%rip), %rsp /* Set up %gs. diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 8bc8b72a205d..fd0fec6e92f4 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -76,9 +76,7 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id); bool xen_vcpu_stolen(int vcpu); -extern int xen_have_vcpu_info_placement; - -int xen_vcpu_setup(int cpu); +void xen_vcpu_setup(int cpu); void xen_vcpu_info_reset(int cpu); void xen_setup_vcpu_info_placement(void); |