diff options
Diffstat (limited to 'arch/x86')
41 files changed, 1112 insertions, 1173 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9b2d50a73a11..bada636d1065 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -23,11 +23,11 @@ config X86 select ARCH_CLOCKSOURCE_DATA select ARCH_DISCARD_MEMBLOCK select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI - select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_HAS_GIGANTIC_PAGE if X86_64 select ARCH_HAS_KCOV if X86_64 select ARCH_HAS_PMEM_API if X86_64 select ARCH_HAS_MMIO_FLUSH @@ -2757,19 +2757,6 @@ config PMC_ATOM def_bool y depends on PCI -config VMD - depends on PCI_MSI - tristate "Volume Management Device Driver" - default N - ---help--- - Adds support for the Intel Volume Management Device (VMD). VMD is a - secondary PCI host bridge that allows PCI Express root ports, - and devices attached to them, to be removed from the default - PCI domain and placed within the VMD domain. This provides - more bus resources than are otherwise possible with a - single domain. If you know your system provides one of these and - has devices attached to it, say Y; if you are not sure, say N. - source "net/Kconfig" source "drivers/Kconfig" diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 830ed391e7ef..2d449337a360 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -163,11 +163,12 @@ asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1) asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1) avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1) avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1) +avx512_instr :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,-DCONFIG_AS_AVX512=1) sha1_ni_instr :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA1_NI=1) sha256_ni_instr :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA256_NI=1) -KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(sha1_ni_instr) $(sha256_ni_instr) -KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(sha1_ni_instr) $(sha256_ni_instr) +KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr) +KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr) LDFLAGS := -m elf_$(UTS_MACHINE) diff --git a/arch/x86/configs/kvm_guest.config b/arch/x86/configs/kvm_guest.config deleted file mode 100644 index 9906505c998a..000000000000 --- a/arch/x86/configs/kvm_guest.config +++ /dev/null @@ -1,31 +0,0 @@ -CONFIG_NET=y -CONFIG_NET_CORE=y -CONFIG_NETDEVICES=y -CONFIG_BLOCK=y -CONFIG_BLK_DEV=y -CONFIG_NETWORK_FILESYSTEMS=y -CONFIG_INET=y -CONFIG_TTY=y -CONFIG_SERIAL_8250=y -CONFIG_SERIAL_8250_CONSOLE=y -CONFIG_IP_PNP=y -CONFIG_IP_PNP_DHCP=y -CONFIG_BINFMT_ELF=y -CONFIG_PCI=y -CONFIG_PCI_MSI=y -CONFIG_DEBUG_KERNEL=y -CONFIG_VIRTUALIZATION=y -CONFIG_HYPERVISOR_GUEST=y -CONFIG_PARAVIRT=y -CONFIG_KVM_GUEST=y -CONFIG_VIRTIO=y -CONFIG_VIRTIO_PCI=y -CONFIG_VIRTIO_BLK=y -CONFIG_VIRTIO_CONSOLE=y -CONFIG_VIRTIO_NET=y -CONFIG_9P_FS=y -CONFIG_NET_9P=y -CONFIG_NET_9P_VIRTIO=y -CONFIG_SCSI_LOWLEVEL=y -CONFIG_SCSI_VIRTIO=y -CONFIG_VIRTIO_INPUT=y diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index fee1d95902b5..c98ec2efd750 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1148,7 +1148,7 @@ END(error_entry) /* - * On entry, EBS is a "return to kernel mode" flag: + * On entry, EBX is a "return to kernel mode" flag: * 1: already in kernel mode, don't need SWAPGS * 0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode */ diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 94d54d0defa7..02223cb4bcfd 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -129,7 +129,7 @@ static notrace cycle_t vread_pvclock(int *mode) return 0; } - ret = __pvclock_read_cycles(pvti); + ret = __pvclock_read_cycles(pvti, rdtsc_ordered()); } while (pvclock_read_retry(pvti, version)); /* refer to vread_tsc() comment for rationale */ diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index e7de5c9a4fbd..16d3fa211962 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -50,8 +50,9 @@ extern int vector_used_by_percpu_irq(unsigned int vector); extern void init_ISA_irqs(void); #ifdef CONFIG_X86_LOCAL_APIC -void arch_trigger_all_cpu_backtrace(bool); -#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace +void arch_trigger_cpumask_backtrace(const struct cpumask *mask, + bool exclude_self); +#define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace #endif #endif /* _ASM_X86_IRQ_H */ diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index b77f5edb03b0..ac7692dcfa2e 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -4,6 +4,10 @@ #include <asm/processor-flags.h> #ifndef __ASSEMBLY__ + +/* Provide __cpuidle; we can't safely include <linux/cpu.h> */ +#define __cpuidle __attribute__((__section__(".cpuidle.text"))) + /* * Interrupt control: */ @@ -44,12 +48,12 @@ static inline void native_irq_enable(void) asm volatile("sti": : :"memory"); } -static inline void native_safe_halt(void) +static inline __cpuidle void native_safe_halt(void) { asm volatile("sti; hlt": : :"memory"); } -static inline void native_halt(void) +static inline __cpuidle void native_halt(void) { asm volatile("hlt": : :"memory"); } @@ -86,7 +90,7 @@ static inline notrace void arch_local_irq_enable(void) * Used in the idle loop; sti takes one instruction cycle * to complete: */ -static inline void arch_safe_halt(void) +static inline __cpuidle void arch_safe_halt(void) { native_safe_halt(); } @@ -95,7 +99,7 @@ static inline void arch_safe_halt(void) * Used when interrupts are already enabled or to * shutdown the processor: */ -static inline void halt(void) +static inline __cpuidle void halt(void) { native_halt(); } diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 33ae3a4d0159..4b20f7304b9c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -568,6 +568,7 @@ struct kvm_vcpu_arch { struct kvm_steal_time steal; } st; + u64 tsc_offset; u64 last_guest_tsc; u64 last_host_tsc; u64 tsc_offset_adjustment; @@ -701,6 +702,8 @@ struct kvm_hv { /* Hyper-v based guest crash (NT kernel bugcheck) parameters */ u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS]; u64 hv_crash_ctl; + + HV_REFERENCE_TSC_PAGE tsc_ref; }; struct kvm_arch { @@ -781,54 +784,56 @@ struct kvm_arch { bool disabled_lapic_found; /* Struct members for AVIC */ + u32 avic_vm_id; u32 ldr_mode; struct page *avic_logical_id_table_page; struct page *avic_physical_id_table_page; + struct hlist_node hnode; bool x2apic_format; bool x2apic_broadcast_quirk_disabled; }; struct kvm_vm_stat { - u32 mmu_shadow_zapped; - u32 mmu_pte_write; - u32 mmu_pte_updated; - u32 mmu_pde_zapped; - u32 mmu_flooded; - u32 mmu_recycled; - u32 mmu_cache_miss; - u32 mmu_unsync; - u32 remote_tlb_flush; - u32 lpages; + ulong mmu_shadow_zapped; + ulong mmu_pte_write; + ulong mmu_pte_updated; + ulong mmu_pde_zapped; + ulong mmu_flooded; + ulong mmu_recycled; + ulong mmu_cache_miss; + ulong mmu_unsync; + ulong remote_tlb_flush; + ulong lpages; }; struct kvm_vcpu_stat { - u32 pf_fixed; - u32 pf_guest; - u32 tlb_flush; - u32 invlpg; - - u32 exits; - u32 io_exits; - u32 mmio_exits; - u32 signal_exits; - u32 irq_window_exits; - u32 nmi_window_exits; - u32 halt_exits; - u32 halt_successful_poll; - u32 halt_attempted_poll; - u32 halt_poll_invalid; - u32 halt_wakeup; - u32 request_irq_exits; - u32 irq_exits; - u32 host_state_reload; - u32 efer_reload; - u32 fpu_reload; - u32 insn_emulation; - u32 insn_emulation_fail; - u32 hypercalls; - u32 irq_injections; - u32 nmi_injections; + u64 pf_fixed; + u64 pf_guest; + u64 tlb_flush; + u64 invlpg; + + u64 exits; + u64 io_exits; + u64 mmio_exits; + u64 signal_exits; + u64 irq_window_exits; + u64 nmi_window_exits; + u64 halt_exits; + u64 halt_successful_poll; + u64 halt_attempted_poll; + u64 halt_poll_invalid; + u64 halt_wakeup; + u64 request_irq_exits; + u64 irq_exits; + u64 host_state_reload; + u64 efer_reload; + u64 fpu_reload; + u64 insn_emulation; + u64 insn_emulation_fail; + u64 hypercalls; + u64 irq_injections; + u64 nmi_injections; }; struct x86_instruction_info; @@ -951,7 +956,6 @@ struct kvm_x86_ops { bool (*has_wbinvd_exit)(void); - u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu); void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc); diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 9ab7507ca1c2..1411dbed5e5e 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -23,6 +23,9 @@ struct pci_sysdata { #ifdef CONFIG_PCI_MSI_IRQ_DOMAIN void *fwnode; /* IRQ domain for MSI assignment */ #endif +#if IS_ENABLED(CONFIG_VMD) + bool vmd_domain; /* True if in Intel VMD domain */ +#endif }; extern int pci_routeirq; @@ -56,6 +59,17 @@ static inline void *_pci_root_bus_fwnode(struct pci_bus *bus) #define pci_root_bus_fwnode _pci_root_bus_fwnode #endif +static inline bool is_vmd(struct pci_bus *bus) +{ +#if IS_ENABLED(CONFIG_VMD) + struct pci_sysdata *sd = bus->sysdata; + + return sd->vmd_domain; +#else + return false; +#endif +} + /* Can be used to override the logic in pci_scan_bus for skipping already-configured bus numbers - to be used for buggy BIOSes or architectures with incomplete PCI setup by the loader */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index f1218f512f62..8b4de22d6429 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -439,8 +439,6 @@ extern pgprot_t pgprot_writethrough(pgprot_t prot); struct file; pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot); -int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, - unsigned long size, pgprot_t *vma_prot); /* Install a pte for a particular vaddr in kernel space. */ void set_pte_vaddr(unsigned long vaddr, pte_t pte); diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index d019f0cc80ec..3ad741b84072 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -87,9 +87,10 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) } static __always_inline -cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src) +cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, + u64 tsc) { - u64 delta = rdtsc_ordered() - src->tsc_timestamp; + u64 delta = tsc - src->tsc_timestamp; cycle_t offset = pvclock_scale_delta(delta, src->tsc_to_system_mul, src->tsc_shift); return src->system_time + offset; diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h index e6911caf5bbf..608a79d5a466 100644 --- a/arch/x86/include/asm/xen/events.h +++ b/arch/x86/include/asm/xen/events.h @@ -20,15 +20,4 @@ static inline int xen_irqs_disabled(struct pt_regs *regs) /* No need for a barrier -- XCHG is a barrier on x86. */ #define xchg_xen_ulong(ptr, val) xchg((ptr), (val)) -extern int xen_have_vector_callback; - -/* - * Events delivered via platform PCI interrupts are always - * routed to vcpu 0 and hence cannot be rebound. - */ -static inline bool xen_support_evtchn_rebind(void) -{ - return (!xen_hvm_domain() || xen_have_vector_callback); -} - #endif /* _ASM_X86_XEN_EVENTS_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 45257cf84370..4dd5d500eb60 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -83,6 +83,7 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-y += apic/ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o +obj-$(CONFIG_LIVEPATCH) += livepatch.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index bdfad642123f..af15f4444330 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -152,7 +152,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, } EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); -void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx) +void __cpuidle acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx) { unsigned int cpu = smp_processor_id(); struct cstate_entry *percpu_entry; diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index f29501e1a5c1..c73c9fb281e1 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -26,32 +26,32 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh) } #endif -#ifdef arch_trigger_all_cpu_backtrace +#ifdef arch_trigger_cpumask_backtrace static void nmi_raise_cpu_backtrace(cpumask_t *mask) { apic->send_IPI_mask(mask, NMI_VECTOR); } -void arch_trigger_all_cpu_backtrace(bool include_self) +void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self) { - nmi_trigger_all_cpu_backtrace(include_self, nmi_raise_cpu_backtrace); + nmi_trigger_cpumask_backtrace(mask, exclude_self, + nmi_raise_cpu_backtrace); } -static int -arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs) +static int nmi_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs) { if (nmi_cpu_backtrace(regs)) return NMI_HANDLED; return NMI_DONE; } -NOKPROBE_SYMBOL(arch_trigger_all_cpu_backtrace_handler); +NOKPROBE_SYMBOL(nmi_cpu_backtrace_handler); -static int __init register_trigger_all_cpu_backtrace(void) +static int __init register_nmi_cpu_backtrace_handler(void) { - register_nmi_handler(NMI_LOCAL, arch_trigger_all_cpu_backtrace_handler, + register_nmi_handler(NMI_LOCAL, nmi_cpu_backtrace_handler, 0, "arch_bt"); return 0; } -early_initcall(register_trigger_all_cpu_backtrace); +early_initcall(register_nmi_cpu_backtrace_handler); #endif diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c new file mode 100644 index 000000000000..e9d252d873aa --- /dev/null +++ b/arch/x86/kernel/livepatch.c @@ -0,0 +1,65 @@ +/* + * livepatch.c - x86-specific Kernel Live Patching Core + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/module.h> +#include <linux/kallsyms.h> +#include <linux/livepatch.h> +#include <asm/text-patching.h> + +/* Apply per-object alternatives. Based on x86 module_finalize() */ +void arch_klp_init_object_loaded(struct klp_patch *patch, + struct klp_object *obj) +{ + int cnt; + struct klp_modinfo *info; + Elf_Shdr *s, *alt = NULL, *para = NULL; + void *aseg, *pseg; + const char *objname; + char sec_objname[MODULE_NAME_LEN]; + char secname[KSYM_NAME_LEN]; + + info = patch->mod->klp_info; + objname = obj->name ? obj->name : "vmlinux"; + + /* See livepatch core code for BUILD_BUG_ON() explanation */ + BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 128); + + for (s = info->sechdrs; s < info->sechdrs + info->hdr.e_shnum; s++) { + /* Apply per-object .klp.arch sections */ + cnt = sscanf(info->secstrings + s->sh_name, + ".klp.arch.%55[^.].%127s", + sec_objname, secname); + if (cnt != 2) + continue; + if (strcmp(sec_objname, objname)) + continue; + if (!strcmp(".altinstructions", secname)) + alt = s; + if (!strcmp(".parainstructions", secname)) + para = s; + } + + if (alt) { + aseg = (void *) alt->sh_addr; + apply_alternatives(aseg, aseg + alt->sh_size); + } + + if (para) { + pseg = (void *) para->sh_addr; + apply_paravirt(pseg, pseg + para->sh_size); + } +} diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 4002b475171c..28cea7802ecb 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -302,7 +302,7 @@ void arch_cpu_idle(void) /* * We use this if we don't have any better idle routine.. */ -void default_idle(void) +void __cpuidle default_idle(void) { trace_cpu_idle_rcuidle(1, smp_processor_id()); safe_halt(); @@ -417,7 +417,7 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) * with interrupts enabled and no flags, which is backwards compatible with the * original MWAIT implementation. */ -static void mwait_idle(void) +static __cpuidle void mwait_idle(void) { if (!current_set_polling_and_test()) { trace_cpu_idle_rcuidle(1, smp_processor_id()); diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 3599404e3089..5b2cc889ce34 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -80,7 +80,7 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) do { version = pvclock_read_begin(src); - ret = __pvclock_read_cycles(src); + ret = __pvclock_read_cycles(src, rdtsc_ordered()); flags = src->flags; } while (pvclock_read_retry(src, version)); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 9297a002d8e5..dbf67f64d5ec 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -97,6 +97,7 @@ SECTIONS _stext = .; TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT ENTRY_TEXT diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 464fa477afbf..3bff20710471 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -13,7 +13,7 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ - hyperv.o page_track.o + hyperv.o page_track.o debugfs.o kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 3235e0fe7792..afa7bbb596cd 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -366,7 +366,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | - F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB); + F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | + F(AVX512BW) | F(AVX512VL); /* cpuid 0xD.1.eax */ const u32 kvm_cpuid_D_1_eax_x86_features = diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c new file mode 100644 index 000000000000..c19c7ede9bd6 --- /dev/null +++ b/arch/x86/kvm/debugfs.c @@ -0,0 +1,69 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * Copyright 2016 Red Hat, Inc. and/or its affiliates. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ +#include <linux/kvm_host.h> +#include <linux/debugfs.h> + +bool kvm_arch_has_vcpu_debugfs(void) +{ + return true; +} + +static int vcpu_get_tsc_offset(void *data, u64 *val) +{ + struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data; + *val = vcpu->arch.tsc_offset; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_offset_fops, vcpu_get_tsc_offset, NULL, "%lld\n"); + +static int vcpu_get_tsc_scaling_ratio(void *data, u64 *val) +{ + struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data; + *val = vcpu->arch.tsc_scaling_ratio; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_fops, vcpu_get_tsc_scaling_ratio, NULL, "%llu\n"); + +static int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val) +{ + *val = kvm_tsc_scaling_ratio_frac_bits; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_frac_fops, vcpu_get_tsc_scaling_frac_bits, NULL, "%llu\n"); + +int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) +{ + struct dentry *ret; + + ret = debugfs_create_file("tsc-offset", 0444, + vcpu->debugfs_dentry, + vcpu, &vcpu_tsc_offset_fops); + if (!ret) + return -ENOMEM; + + if (kvm_has_tsc_control) { + ret = debugfs_create_file("tsc-scaling-ratio", 0444, + vcpu->debugfs_dentry, + vcpu, &vcpu_tsc_scaling_fops); + if (!ret) + return -ENOMEM; + ret = debugfs_create_file("tsc-scaling-ratio-frac-bits", 0444, + vcpu->debugfs_dentry, + vcpu, &vcpu_tsc_scaling_frac_fops); + if (!ret) + return -ENOMEM; + + } + + return 0; +} diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 01bd7b7a6866..42b1c83741c8 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -386,7 +386,21 @@ static void synic_init(struct kvm_vcpu_hv_synic *synic) static u64 get_time_ref_counter(struct kvm *kvm) { - return div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100); + struct kvm_hv *hv = &kvm->arch.hyperv; + struct kvm_vcpu *vcpu; + u64 tsc; + + /* + * The guest has not set up the TSC page or the clock isn't + * stable, fall back to get_kvmclock_ns. + */ + if (!hv->tsc_ref.tsc_sequence) + return div_u64(get_kvmclock_ns(kvm), 100); + + vcpu = kvm_get_vcpu(kvm, 0); + tsc = kvm_read_l1_tsc(vcpu, rdtsc()); + return mul_u64_u64_shr(tsc, hv->tsc_ref.tsc_scale, 64) + + hv->tsc_ref.tsc_offset; } static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, @@ -756,6 +770,129 @@ static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu, return 0; } +/* + * The kvmclock and Hyper-V TSC page use similar formulas, and converting + * between them is possible: + * + * kvmclock formula: + * nsec = (ticks - tsc_timestamp) * tsc_to_system_mul * 2^(tsc_shift-32) + * + system_time + * + * Hyper-V formula: + * nsec/100 = ticks * scale / 2^64 + offset + * + * When tsc_timestamp = system_time = 0, offset is zero in the Hyper-V formula. + * By dividing the kvmclock formula by 100 and equating what's left we get: + * ticks * scale / 2^64 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100 + * scale / 2^64 = tsc_to_system_mul * 2^(tsc_shift-32) / 100 + * scale = tsc_to_system_mul * 2^(32+tsc_shift) / 100 + * + * Now expand the kvmclock formula and divide by 100: + * nsec = ticks * tsc_to_system_mul * 2^(tsc_shift-32) + * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) + * + system_time + * nsec/100 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100 + * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) / 100 + * + system_time / 100 + * + * Replace tsc_to_system_mul * 2^(tsc_shift-32) / 100 by scale / 2^64: + * nsec/100 = ticks * scale / 2^64 + * - tsc_timestamp * scale / 2^64 + * + system_time / 100 + * + * Equate with the Hyper-V formula so that ticks * scale / 2^64 cancels out: + * offset = system_time / 100 - tsc_timestamp * scale / 2^64 + * + * These two equivalencies are implemented in this function. + */ +static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock, + HV_REFERENCE_TSC_PAGE *tsc_ref) +{ + u64 max_mul; + + if (!(hv_clock->flags & PVCLOCK_TSC_STABLE_BIT)) + return false; + + /* + * check if scale would overflow, if so we use the time ref counter + * tsc_to_system_mul * 2^(tsc_shift+32) / 100 >= 2^64 + * tsc_to_system_mul / 100 >= 2^(32-tsc_shift) + * tsc_to_system_mul >= 100 * 2^(32-tsc_shift) + */ + max_mul = 100ull << (32 - hv_clock->tsc_shift); + if (hv_clock->tsc_to_system_mul >= max_mul) + return false; + + /* + * Otherwise compute the scale and offset according to the formulas + * derived above. + */ + tsc_ref->tsc_scale = + mul_u64_u32_div(1ULL << (32 + hv_clock->tsc_shift), + hv_clock->tsc_to_system_mul, + 100); + + tsc_ref->tsc_offset = hv_clock->system_time; + do_div(tsc_ref->tsc_offset, 100); + tsc_ref->tsc_offset -= + mul_u64_u64_shr(hv_clock->tsc_timestamp, tsc_ref->tsc_scale, 64); + return true; +} + +void kvm_hv_setup_tsc_page(struct kvm *kvm, + struct pvclock_vcpu_time_info *hv_clock) +{ + struct kvm_hv *hv = &kvm->arch.hyperv; + u32 tsc_seq; + u64 gfn; + + BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence)); + BUILD_BUG_ON(offsetof(HV_REFERENCE_TSC_PAGE, tsc_sequence) != 0); + + if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) + return; + + gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; + /* + * Because the TSC parameters only vary when there is a + * change in the master clock, do not bother with caching. + */ + if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn), + &tsc_seq, sizeof(tsc_seq)))) + return; + + /* + * While we're computing and writing the parameters, force the + * guest to use the time reference count MSR. + */ + hv->tsc_ref.tsc_sequence = 0; + if (kvm_write_guest(kvm, gfn_to_gpa(gfn), + &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) + return; + + if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref)) + return; + + /* Ensure sequence is zero before writing the rest of the struct. */ + smp_wmb(); + if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref))) + return; + + /* + * Now switch to the TSC page mechanism by writing the sequence. + */ + tsc_seq++; + if (tsc_seq == 0xFFFFFFFF || tsc_seq == 0) + tsc_seq = 1; + + /* Write the struct entirely before the non-zero sequence. */ + smp_wmb(); + + hv->tsc_ref.tsc_sequence = tsc_seq; + kvm_write_guest(kvm, gfn_to_gpa(gfn), + &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)); +} + static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { @@ -793,23 +930,11 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, mark_page_dirty(kvm, gfn); break; } - case HV_X64_MSR_REFERENCE_TSC: { - u64 gfn; - HV_REFERENCE_TSC_PAGE tsc_ref; - - memset(&tsc_ref, 0, sizeof(tsc_ref)); + case HV_X64_MSR_REFERENCE_TSC: hv->hv_tsc_page = data; - if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE)) - break; - gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; - if (kvm_write_guest( - kvm, - gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT, - &tsc_ref, sizeof(tsc_ref))) - return 1; - mark_page_dirty(kvm, gfn); + if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); break; - } case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: return kvm_hv_msr_set_crash_data(vcpu, msr - HV_X64_MSR_CRASH_P0, diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 60eccd4bd1d3..cd1119538add 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -84,4 +84,7 @@ static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu) void kvm_hv_process_stimers(struct kvm_vcpu *vcpu); +void kvm_hv_setup_tsc_page(struct kvm *kvm, + struct pvclock_vcpu_time_info *hv_clock); + #endif diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index b62c85229711..23b99f305382 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1761,9 +1761,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) if (value & MSR_IA32_APICBASE_ENABLE) { kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); static_key_slow_dec_deferred(&apic_hw_disabled); - } else + } else { static_key_slow_inc(&apic_hw_disabled.key); - recalculate_apic_map(vcpu->kvm); + recalculate_apic_map(vcpu->kvm); + } } if ((old_value ^ value) & X2APIC_ENABLE) { diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3d4cc8cc56a3..d9c7e986b4e4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1207,7 +1207,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) * * Return true if tlb need be flushed. */ -static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect) +static bool spte_write_protect(u64 *sptep, bool pt_protect) { u64 spte = *sptep; @@ -1233,12 +1233,12 @@ static bool __rmap_write_protect(struct kvm *kvm, bool flush = false; for_each_rmap_spte(rmap_head, &iter, sptep) - flush |= spte_write_protect(kvm, sptep, pt_protect); + flush |= spte_write_protect(sptep, pt_protect); return flush; } -static bool spte_clear_dirty(struct kvm *kvm, u64 *sptep) +static bool spte_clear_dirty(u64 *sptep) { u64 spte = *sptep; @@ -1256,12 +1256,12 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) bool flush = false; for_each_rmap_spte(rmap_head, &iter, sptep) - flush |= spte_clear_dirty(kvm, sptep); + flush |= spte_clear_dirty(sptep); return flush; } -static bool spte_set_dirty(struct kvm *kvm, u64 *sptep) +static bool spte_set_dirty(u64 *sptep) { u64 spte = *sptep; @@ -1279,7 +1279,7 @@ static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) bool flush = false; for_each_rmap_spte(rmap_head, &iter, sptep) - flush |= spte_set_dirty(kvm, sptep); + flush |= spte_set_dirty(sptep); return flush; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1e6b84b96ea6..f8157a36ab09 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -34,6 +34,8 @@ #include <linux/sched.h> #include <linux/trace_events.h> #include <linux/slab.h> +#include <linux/amd-iommu.h> +#include <linux/hashtable.h> #include <asm/apic.h> #include <asm/perf_event.h> @@ -41,6 +43,7 @@ #include <asm/desc.h> #include <asm/debugreg.h> #include <asm/kvm_para.h> +#include <asm/irq_remapping.h> #include <asm/virtext.h> #include "trace.h" @@ -96,6 +99,19 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); #define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0 #define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF +/* AVIC GATAG is encoded using VM and VCPU IDs */ +#define AVIC_VCPU_ID_BITS 8 +#define AVIC_VCPU_ID_MASK ((1 << AVIC_VCPU_ID_BITS) - 1) + +#define AVIC_VM_ID_BITS 24 +#define AVIC_VM_ID_NR (1 << AVIC_VM_ID_BITS) +#define AVIC_VM_ID_MASK ((1 << AVIC_VM_ID_BITS) - 1) + +#define AVIC_GATAG(x, y) (((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \ + (y & AVIC_VCPU_ID_MASK)) +#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK) +#define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK) + static bool erratum_383_found __read_mostly; static const u32 host_save_user_msrs[] = { @@ -185,6 +201,23 @@ struct vcpu_svm { struct page *avic_backing_page; u64 *avic_physical_id_cache; bool avic_is_running; + + /* + * Per-vcpu list of struct amd_svm_iommu_ir: + * This is used mainly to store interrupt remapping information used + * when update the vcpu affinity. This avoids the need to scan for + * IRTE and try to match ga_tag in the IOMMU driver. + */ + struct list_head ir_list; + spinlock_t ir_list_lock; +}; + +/* + * This is a wrapper of struct amd_iommu_ir_data. + */ +struct amd_svm_iommu_ir { + struct list_head node; /* Used by SVM for per-vcpu ir_list */ + void *data; /* Storing pointer to struct amd_ir_data */ }; #define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF) @@ -242,6 +275,10 @@ static int avic; module_param(avic, int, S_IRUGO); #endif +/* AVIC VM ID bit masks and lock */ +static DECLARE_BITMAP(avic_vm_id_bitmap, AVIC_VM_ID_NR); +static DEFINE_SPINLOCK(avic_vm_id_lock); + static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); static void svm_flush_tlb(struct kvm_vcpu *vcpu); static void svm_complete_interrupts(struct vcpu_svm *svm); @@ -928,6 +965,55 @@ static void svm_disable_lbrv(struct vcpu_svm *svm) set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0); } +/* Note: + * This hash table is used to map VM_ID to a struct kvm_arch, + * when handling AMD IOMMU GALOG notification to schedule in + * a particular vCPU. + */ +#define SVM_VM_DATA_HASH_BITS 8 +DECLARE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); +static spinlock_t svm_vm_data_hash_lock; + +/* Note: + * This function is called from IOMMU driver to notify + * SVM to schedule in a particular vCPU of a particular VM. + */ +static int avic_ga_log_notifier(u32 ga_tag) +{ + unsigned long flags; + struct kvm_arch *ka = NULL; + struct kvm_vcpu *vcpu = NULL; + u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag); + u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag); + + pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id); + + spin_lock_irqsave(&svm_vm_data_hash_lock, flags); + hash_for_each_possible(svm_vm_data_hash, ka, hnode, vm_id) { + struct kvm *kvm = container_of(ka, struct kvm, arch); + struct kvm_arch *vm_data = &kvm->arch; + + if (vm_data->avic_vm_id != vm_id) + continue; + vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); + break; + } + spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); + + if (!vcpu) + return 0; + + /* Note: + * At this point, the IOMMU should have already set the pending + * bit in the vAPIC backing page. So, we just need to schedule + * in the vcpu. + */ + if (vcpu->mode == OUTSIDE_GUEST_MODE) + kvm_vcpu_wake_up(vcpu); + + return 0; +} + static __init int svm_hardware_setup(void) { int cpu; @@ -986,10 +1072,15 @@ static __init int svm_hardware_setup(void) if (avic) { if (!npt_enabled || !boot_cpu_has(X86_FEATURE_AVIC) || - !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) + !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) { avic = false; - else + } else { pr_info("AVIC enabled\n"); + + hash_init(svm_vm_data_hash); + spin_lock_init(&svm_vm_data_hash_lock); + amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); + } } return 0; @@ -1028,13 +1119,6 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) seg->base = 0; } -static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - return svm->vmcb->control.tsc_offset; -} - static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1280,19 +1364,55 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) return 0; } +static inline int avic_get_next_vm_id(void) +{ + int id; + + spin_lock(&avic_vm_id_lock); + + /* AVIC VM ID is one-based. */ + id = find_next_zero_bit(avic_vm_id_bitmap, AVIC_VM_ID_NR, 1); + if (id <= AVIC_VM_ID_MASK) + __set_bit(id, avic_vm_id_bitmap); + else + id = -EAGAIN; + + spin_unlock(&avic_vm_id_lock); + return id; +} + +static inline int avic_free_vm_id(int id) +{ + if (id <= 0 || id > AVIC_VM_ID_MASK) + return -EINVAL; + + spin_lock(&avic_vm_id_lock); + __clear_bit(id, avic_vm_id_bitmap); + spin_unlock(&avic_vm_id_lock); + return 0; +} + static void avic_vm_destroy(struct kvm *kvm) { + unsigned long flags; struct kvm_arch *vm_data = &kvm->arch; + avic_free_vm_id(vm_data->avic_vm_id); + if (vm_data->avic_logical_id_table_page) __free_page(vm_data->avic_logical_id_table_page); if (vm_data->avic_physical_id_table_page) __free_page(vm_data->avic_physical_id_table_page); + + spin_lock_irqsave(&svm_vm_data_hash_lock, flags); + hash_del(&vm_data->hnode); + spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); } static int avic_vm_init(struct kvm *kvm) { - int err = -ENOMEM; + unsigned long flags; + int vm_id, err = -ENOMEM; struct kvm_arch *vm_data = &kvm->arch; struct page *p_page; struct page *l_page; @@ -1300,6 +1420,11 @@ static int avic_vm_init(struct kvm *kvm) if (!avic) return 0; + vm_id = avic_get_next_vm_id(); + if (vm_id < 0) + return vm_id; + vm_data->avic_vm_id = (u32)vm_id; + /* Allocating physical APIC ID table (4KB) */ p_page = alloc_page(GFP_KERNEL); if (!p_page) @@ -1316,6 +1441,10 @@ static int avic_vm_init(struct kvm *kvm) vm_data->avic_logical_id_table_page = l_page; clear_page(page_address(l_page)); + spin_lock_irqsave(&svm_vm_data_hash_lock, flags); + hash_add(svm_vm_data_hash, &vm_data->hnode, vm_data->avic_vm_id); + spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); + return 0; free_avic: @@ -1323,31 +1452,34 @@ free_avic: return err; } -/** - * This function is called during VCPU halt/unhalt. - */ -static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run) +static inline int +avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) { - u64 entry; - int h_physical_id = kvm_cpu_get_apicid(vcpu->cpu); + int ret = 0; + unsigned long flags; + struct amd_svm_iommu_ir *ir; struct vcpu_svm *svm = to_svm(vcpu); - if (!kvm_vcpu_apicv_active(vcpu)) - return; - - svm->avic_is_running = is_run; + if (!kvm_arch_has_assigned_device(vcpu->kvm)) + return 0; - /* ID = 0xff (broadcast), ID > 0xff (reserved) */ - if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT)) - return; + /* + * Here, we go through the per-vcpu ir_list to update all existing + * interrupt remapping table entry targeting this vcpu. + */ + spin_lock_irqsave(&svm->ir_list_lock, flags); - entry = READ_ONCE(*(svm->avic_physical_id_cache)); - WARN_ON(is_run == !!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)); + if (list_empty(&svm->ir_list)) + goto out; - entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; - if (is_run) - entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; - WRITE_ONCE(*(svm->avic_physical_id_cache), entry); + list_for_each_entry(ir, &svm->ir_list, node) { + ret = amd_iommu_update_ga(cpu, r, ir->data); + if (ret) + break; + } +out: + spin_unlock_irqrestore(&svm->ir_list_lock, flags); + return ret; } static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) @@ -1374,6 +1506,8 @@ static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; WRITE_ONCE(*(svm->avic_physical_id_cache), entry); + avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, + svm->avic_is_running); } static void avic_vcpu_put(struct kvm_vcpu *vcpu) @@ -1385,10 +1519,27 @@ static void avic_vcpu_put(struct kvm_vcpu *vcpu) return; entry = READ_ONCE(*(svm->avic_physical_id_cache)); + if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) + avic_update_iommu_vcpu_affinity(vcpu, -1, 0); + entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; WRITE_ONCE(*(svm->avic_physical_id_cache), entry); } +/** + * This function is called during VCPU halt/unhalt. + */ +static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->avic_is_running = is_run; + if (is_run) + avic_vcpu_load(vcpu, vcpu->cpu); + else + avic_vcpu_put(vcpu); +} + static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1450,6 +1601,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) err = avic_init_backing_page(&svm->vcpu); if (err) goto free_page4; + + INIT_LIST_HEAD(&svm->ir_list); + spin_lock_init(&svm->ir_list_lock); } /* We initialize this flag to true to make sure that the is_running @@ -4246,6 +4400,209 @@ static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) kvm_vcpu_wake_up(vcpu); } +static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) +{ + unsigned long flags; + struct amd_svm_iommu_ir *cur; + + spin_lock_irqsave(&svm->ir_list_lock, flags); + list_for_each_entry(cur, &svm->ir_list, node) { + if (cur->data != pi->ir_data) + continue; + list_del(&cur->node); + kfree(cur); + break; + } + spin_unlock_irqrestore(&svm->ir_list_lock, flags); +} + +static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) +{ + int ret = 0; + unsigned long flags; + struct amd_svm_iommu_ir *ir; + + /** + * In some cases, the existing irte is updaed and re-set, + * so we need to check here if it's already been * added + * to the ir_list. + */ + if (pi->ir_data && (pi->prev_ga_tag != 0)) { + struct kvm *kvm = svm->vcpu.kvm; + u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag); + struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); + struct vcpu_svm *prev_svm; + + if (!prev_vcpu) { + ret = -EINVAL; + goto out; + } + + prev_svm = to_svm(prev_vcpu); + svm_ir_list_del(prev_svm, pi); + } + + /** + * Allocating new amd_iommu_pi_data, which will get + * add to the per-vcpu ir_list. + */ + ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL); + if (!ir) { + ret = -ENOMEM; + goto out; + } + ir->data = pi->ir_data; + + spin_lock_irqsave(&svm->ir_list_lock, flags); + list_add(&ir->node, &svm->ir_list); + spin_unlock_irqrestore(&svm->ir_list_lock, flags); +out: + return ret; +} + +/** + * Note: + * The HW cannot support posting multicast/broadcast + * interrupts to a vCPU. So, we still use legacy interrupt + * remapping for these kind of interrupts. + * + * For lowest-priority interrupts, we only support + * those with single CPU as the destination, e.g. user + * configures the interrupts via /proc/irq or uses + * irqbalance to make the interrupts single-CPU. + */ +static int +get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, + struct vcpu_data *vcpu_info, struct vcpu_svm **svm) +{ + struct kvm_lapic_irq irq; + struct kvm_vcpu *vcpu = NULL; + + kvm_set_msi_irq(kvm, e, &irq); + + if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) { + pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n", + __func__, irq.vector); + return -1; + } + + pr_debug("SVM: %s: use GA mode for irq %u\n", __func__, + irq.vector); + *svm = to_svm(vcpu); + vcpu_info->pi_desc_addr = page_to_phys((*svm)->avic_backing_page); + vcpu_info->vector = irq.vector; + + return 0; +} + +/* + * svm_update_pi_irte - set IRTE for Posted-Interrupts + * + * @kvm: kvm + * @host_irq: host irq of the interrupt + * @guest_irq: gsi of the interrupt + * @set: set or unset PI + * returns 0 on success, < 0 on failure + */ +static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set) +{ + struct kvm_kernel_irq_routing_entry *e; + struct kvm_irq_routing_table *irq_rt; + int idx, ret = -EINVAL; + + if (!kvm_arch_has_assigned_device(kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return 0; + + pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n", + __func__, host_irq, guest_irq, set); + + idx = srcu_read_lock(&kvm->irq_srcu); + irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); + WARN_ON(guest_irq >= irq_rt->nr_rt_entries); + + hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { + struct vcpu_data vcpu_info; + struct vcpu_svm *svm = NULL; + + if (e->type != KVM_IRQ_ROUTING_MSI) + continue; + + /** + * Here, we setup with legacy mode in the following cases: + * 1. When cannot target interrupt to a specific vcpu. + * 2. Unsetting posted interrupt. + * 3. APIC virtialization is disabled for the vcpu. + */ + if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set && + kvm_vcpu_apicv_active(&svm->vcpu)) { + struct amd_iommu_pi_data pi; + + /* Try to enable guest_mode in IRTE */ + pi.base = page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK; + pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id, + svm->vcpu.vcpu_id); + pi.is_guest_mode = true; + pi.vcpu_data = &vcpu_info; + ret = irq_set_vcpu_affinity(host_irq, &pi); + + /** + * Here, we successfully setting up vcpu affinity in + * IOMMU guest mode. Now, we need to store the posted + * interrupt information in a per-vcpu ir_list so that + * we can reference to them directly when we update vcpu + * scheduling information in IOMMU irte. + */ + if (!ret && pi.is_guest_mode) + svm_ir_list_add(svm, &pi); + } else { + /* Use legacy mode in IRTE */ + struct amd_iommu_pi_data pi; + + /** + * Here, pi is used to: + * - Tell IOMMU to use legacy mode for this interrupt. + * - Retrieve ga_tag of prior interrupt remapping data. + */ + pi.is_guest_mode = false; + ret = irq_set_vcpu_affinity(host_irq, &pi); + + /** + * Check if the posted interrupt was previously + * setup with the guest_mode by checking if the ga_tag + * was cached. If so, we need to clean up the per-vcpu + * ir_list. + */ + if (!ret && pi.prev_ga_tag) { + int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); + struct kvm_vcpu *vcpu; + + vcpu = kvm_get_vcpu_by_id(kvm, id); + if (vcpu) + svm_ir_list_del(to_svm(vcpu), &pi); + } + } + + if (!ret && svm) { + trace_kvm_pi_irte_update(svm->vcpu.vcpu_id, + host_irq, e->gsi, + vcpu_info.vector, + vcpu_info.pi_desc_addr, set); + } + + if (ret < 0) { + pr_err("%s: failed to update PI IRTE\n", __func__); + goto out; + } + } + + ret = 0; +out: + srcu_read_unlock(&kvm->irq_srcu, idx); + return ret; +} + static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -5064,7 +5421,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .has_wbinvd_exit = svm_has_wbinvd_exit, - .read_tsc_offset = svm_read_tsc_offset, .write_tsc_offset = svm_write_tsc_offset, .adjust_tsc_offset_guest = svm_adjust_tsc_offset_guest, .read_l1_tsc = svm_read_l1_tsc, @@ -5078,6 +5434,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .pmu_ops = &amd_pmu_ops, .deliver_posted_interrupt = svm_deliver_avic_intr, + .update_pi_irte = svm_update_pi_irte, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 121fdf6e9ed0..cf1b16dbc98a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -927,6 +927,8 @@ static unsigned long *vmx_msr_bitmap_legacy; static unsigned long *vmx_msr_bitmap_longmode; static unsigned long *vmx_msr_bitmap_legacy_x2apic; static unsigned long *vmx_msr_bitmap_longmode_x2apic; +static unsigned long *vmx_msr_bitmap_legacy_x2apic_apicv_inactive; +static unsigned long *vmx_msr_bitmap_longmode_x2apic_apicv_inactive; static unsigned long *vmx_vmread_bitmap; static unsigned long *vmx_vmwrite_bitmap; @@ -939,6 +941,7 @@ static DEFINE_SPINLOCK(vmx_vpid_lock); static struct vmcs_config { int size; int order; + u32 basic_cap; u32 revision_id; u32 pin_based_exec_ctrl; u32 cpu_based_exec_ctrl; @@ -1215,6 +1218,11 @@ static inline bool cpu_has_vmx_ple(void) SECONDARY_EXEC_PAUSE_LOOP_EXITING; } +static inline bool cpu_has_vmx_basic_inout(void) +{ + return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT); +} + static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) { return flexpriority_enabled && lapic_in_kernel(vcpu); @@ -2518,10 +2526,17 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) else if (cpu_has_secondary_exec_ctrls() && (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) & SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { - if (is_long_mode(vcpu)) - msr_bitmap = vmx_msr_bitmap_longmode_x2apic; - else - msr_bitmap = vmx_msr_bitmap_legacy_x2apic; + if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) { + if (is_long_mode(vcpu)) + msr_bitmap = vmx_msr_bitmap_longmode_x2apic; + else + msr_bitmap = vmx_msr_bitmap_legacy_x2apic; + } else { + if (is_long_mode(vcpu)) + msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv_inactive; + else + msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv_inactive; + } } else { if (is_long_mode(vcpu)) msr_bitmap = vmx_msr_bitmap_longmode; @@ -2603,11 +2618,6 @@ static u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) return host_tsc + tsc_offset; } -static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu) -{ - return vmcs_read64(TSC_OFFSET); -} - /* * writes 'offset' into guest's timestamp counter offset register */ @@ -2877,6 +2887,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) *pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS | ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); + if (cpu_has_vmx_basic_inout()) + *pdata |= VMX_BASIC_INOUT; break; case MSR_IA32_VMX_TRUE_PINBASED_CTLS: case MSR_IA32_VMX_PINBASED_CTLS: @@ -3457,7 +3469,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) return -EIO; vmcs_conf->size = vmx_msr_high & 0x1fff; - vmcs_conf->order = get_order(vmcs_config.size); + vmcs_conf->order = get_order(vmcs_conf->size); + vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; vmcs_conf->revision_id = vmx_msr_low; vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; @@ -4678,28 +4691,49 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) msr, MSR_TYPE_R | MSR_TYPE_W); } -static void vmx_enable_intercept_msr_read_x2apic(u32 msr) +static void vmx_enable_intercept_msr_read_x2apic(u32 msr, bool apicv_active) { - __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, - msr, MSR_TYPE_R); - __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, - msr, MSR_TYPE_R); + if (apicv_active) { + __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, + msr, MSR_TYPE_R); + __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, + msr, MSR_TYPE_R); + } else { + __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, + msr, MSR_TYPE_R); + __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, + msr, MSR_TYPE_R); + } } -static void vmx_disable_intercept_msr_read_x2apic(u32 msr) +static void vmx_disable_intercept_msr_read_x2apic(u32 msr, bool apicv_active) { - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, - msr, MSR_TYPE_R); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, - msr, MSR_TYPE_R); + if (apicv_active) { + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, + msr, MSR_TYPE_R); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, + msr, MSR_TYPE_R); + } else { + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, + msr, MSR_TYPE_R); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, + msr, MSR_TYPE_R); + } } -static void vmx_disable_intercept_msr_write_x2apic(u32 msr) +static void vmx_disable_intercept_msr_write_x2apic(u32 msr, bool apicv_active) { - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, - msr, MSR_TYPE_W); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, - msr, MSR_TYPE_W); + if (apicv_active) { + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, + msr, MSR_TYPE_W); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, + msr, MSR_TYPE_W); + } else { + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, + msr, MSR_TYPE_W); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, + msr, MSR_TYPE_W); + } } static bool vmx_get_enable_apicv(void) @@ -5279,29 +5313,30 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (is_guest_mode(vcpu)) - return; + if (!is_guest_mode(vcpu)) { + if (!cpu_has_virtual_nmis()) { + /* + * Tracking the NMI-blocked state in software is built upon + * finding the next open IRQ window. This, in turn, depends on + * well-behaving guests: They have to keep IRQs disabled at + * least as long as the NMI handler runs. Otherwise we may + * cause NMI nesting, maybe breaking the guest. But as this is + * highly unlikely, we can live with the residual risk. + */ + vmx->soft_vnmi_blocked = 1; + vmx->vnmi_blocked_time = 0; + } - if (!cpu_has_virtual_nmis()) { - /* - * Tracking the NMI-blocked state in software is built upon - * finding the next open IRQ window. This, in turn, depends on - * well-behaving guests: They have to keep IRQs disabled at - * least as long as the NMI handler runs. Otherwise we may - * cause NMI nesting, maybe breaking the guest. But as this is - * highly unlikely, we can live with the residual risk. - */ - vmx->soft_vnmi_blocked = 1; - vmx->vnmi_blocked_time = 0; + ++vcpu->stat.nmi_injections; + vmx->nmi_known_unmasked = false; } - ++vcpu->stat.nmi_injections; - vmx->nmi_known_unmasked = false; if (vmx->rmode.vm86_active) { if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE) kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); } @@ -6109,7 +6144,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) exit_qualification = vmcs_readl(EXIT_QUALIFICATION); gla_validity = (exit_qualification >> 7) & 0x3; - if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) { + if (gla_validity == 0x2) { printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n", (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), @@ -6360,22 +6395,32 @@ static __init int hardware_setup(void) if (!vmx_msr_bitmap_legacy_x2apic) goto out2; + vmx_msr_bitmap_legacy_x2apic_apicv_inactive = + (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_legacy_x2apic_apicv_inactive) + goto out3; + vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_msr_bitmap_longmode) - goto out3; + goto out4; vmx_msr_bitmap_longmode_x2apic = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_msr_bitmap_longmode_x2apic) - goto out4; + goto out5; + + vmx_msr_bitmap_longmode_x2apic_apicv_inactive = + (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_longmode_x2apic_apicv_inactive) + goto out6; vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_vmread_bitmap) - goto out6; + goto out7; vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_vmwrite_bitmap) - goto out7; + goto out8; memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); @@ -6394,7 +6439,7 @@ static __init int hardware_setup(void) if (setup_vmcs_config(&vmcs_config) < 0) { r = -EIO; - goto out8; + goto out9; } if (boot_cpu_has(X86_FEATURE_NX)) @@ -6461,20 +6506,35 @@ static __init int hardware_setup(void) vmx_msr_bitmap_legacy, PAGE_SIZE); memcpy(vmx_msr_bitmap_longmode_x2apic, vmx_msr_bitmap_longmode, PAGE_SIZE); + memcpy(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, + vmx_msr_bitmap_legacy, PAGE_SIZE); + memcpy(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, + vmx_msr_bitmap_longmode, PAGE_SIZE); set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ + /* + * enable_apicv && kvm_vcpu_apicv_active() + */ for (msr = 0x800; msr <= 0x8ff; msr++) - vmx_disable_intercept_msr_read_x2apic(msr); + vmx_disable_intercept_msr_read_x2apic(msr, true); /* TMCCT */ - vmx_enable_intercept_msr_read_x2apic(0x839); + vmx_enable_intercept_msr_read_x2apic(0x839, true); /* TPR */ - vmx_disable_intercept_msr_write_x2apic(0x808); + vmx_disable_intercept_msr_write_x2apic(0x808, true); /* EOI */ - vmx_disable_intercept_msr_write_x2apic(0x80b); + vmx_disable_intercept_msr_write_x2apic(0x80b, true); /* SELF-IPI */ - vmx_disable_intercept_msr_write_x2apic(0x83f); + vmx_disable_intercept_msr_write_x2apic(0x83f, true); + + /* + * (enable_apicv && !kvm_vcpu_apicv_active()) || + * !enable_apicv + */ + /* TPR */ + vmx_disable_intercept_msr_read_x2apic(0x808, false); + vmx_disable_intercept_msr_write_x2apic(0x808, false); if (enable_ept) { kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, @@ -6521,14 +6581,18 @@ static __init int hardware_setup(void) return alloc_kvm_area(); -out8: +out9: free_page((unsigned long)vmx_vmwrite_bitmap); -out7: +out8: free_page((unsigned long)vmx_vmread_bitmap); +out7: + free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive); out6: free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); -out4: +out5: free_page((unsigned long)vmx_msr_bitmap_longmode); +out4: + free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive); out3: free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); out2: @@ -6544,7 +6608,9 @@ out: static __exit void hardware_unsetup(void) { free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); + free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive); free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); + free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive); free_page((unsigned long)vmx_msr_bitmap_legacy); free_page((unsigned long)vmx_msr_bitmap_longmode); free_page((unsigned long)vmx_io_bitmap_b); @@ -6726,7 +6792,7 @@ static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) { /* TODO: not to reset guest simply here. */ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); - pr_warn("kvm: nested vmx abort, indicator %d\n", indicator); + pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator); } static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) @@ -7013,7 +7079,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) vmx->nested.vmcs02_num = 0; hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); + HRTIMER_MODE_REL_PINNED); vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; vmx->nested.vmxon = true; @@ -8435,12 +8501,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) return; } - /* - * There is not point to enable virtualize x2apic without enable - * apicv - */ - if (!cpu_has_vmx_virtualize_x2apic_mode() || - !kvm_vcpu_apicv_active(vcpu)) + if (!cpu_has_vmx_virtualize_x2apic_mode()) return; if (!cpu_need_tpr_shadow(vcpu)) @@ -9598,7 +9659,7 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, maxphyaddr = cpuid_maxphyaddr(vcpu); if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr || (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) { - pr_warn_ratelimited( + pr_debug_ratelimited( "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)", addr_field, maxphyaddr, count, addr); return -EINVAL; @@ -9671,13 +9732,13 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) for (i = 0; i < count; i++) { if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), &e, sizeof(e))) { - pr_warn_ratelimited( + pr_debug_ratelimited( "%s cannot read MSR entry (%u, 0x%08llx)\n", __func__, i, gpa + i * sizeof(e)); goto fail; } if (nested_vmx_load_msr_check(vcpu, &e)) { - pr_warn_ratelimited( + pr_debug_ratelimited( "%s check failed (%u, 0x%x, 0x%x)\n", __func__, i, e.index, e.reserved); goto fail; @@ -9685,7 +9746,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) msr.index = e.index; msr.data = e.value; if (kvm_set_msr(vcpu, &msr)) { - pr_warn_ratelimited( + pr_debug_ratelimited( "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", __func__, i, e.index, e.value); goto fail; @@ -9706,13 +9767,13 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), &e, 2 * sizeof(u32))) { - pr_warn_ratelimited( + pr_debug_ratelimited( "%s cannot read MSR entry (%u, 0x%08llx)\n", __func__, i, gpa + i * sizeof(e)); return -EINVAL; } if (nested_vmx_store_msr_check(vcpu, &e)) { - pr_warn_ratelimited( + pr_debug_ratelimited( "%s check failed (%u, 0x%x, 0x%x)\n", __func__, i, e.index, e.reserved); return -EINVAL; @@ -9720,7 +9781,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) msr_info.host_initiated = false; msr_info.index = e.index; if (kvm_get_msr(vcpu, &msr_info)) { - pr_warn_ratelimited( + pr_debug_ratelimited( "%s cannot read MSR (%u, 0x%x)\n", __func__, i, e.index); return -EINVAL; @@ -9729,7 +9790,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) gpa + i * sizeof(e) + offsetof(struct vmx_msr_entry, value), &msr_info.data, sizeof(msr_info.data))) { - pr_warn_ratelimited( + pr_debug_ratelimited( "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", __func__, i, e.index, msr_info.data); return -EINVAL; @@ -10500,6 +10561,9 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); } + if (nested_cpu_has_ept(vmcs12)) + vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); + if (nested_cpu_has_vid(vmcs12)) vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); @@ -10793,7 +10857,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, * We are now running in L2, mmu_notifier will force to reload the * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1. */ - kvm_vcpu_reload_apic_access_page(vcpu); + kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); /* * Exiting from L2 to L1, we're now back to L1 which thinks it just @@ -11274,7 +11338,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, - .read_tsc_offset = vmx_read_tsc_offset, .write_tsc_offset = vmx_write_tsc_offset, .adjust_tsc_offset_guest = vmx_adjust_tsc_offset_guest, .read_l1_tsc = vmx_read_l1_tsc, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 699f8726539a..6c633de84dd7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1367,7 +1367,7 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) { - u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu); + u64 curr_offset = vcpu->arch.tsc_offset; vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset; } @@ -1413,6 +1413,12 @@ u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) } EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); +static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +{ + kvm_x86_ops->write_tsc_offset(vcpu, offset); + vcpu->arch.tsc_offset = offset; +} + void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) { struct kvm *kvm = vcpu->kvm; @@ -1425,7 +1431,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = kvm_compute_tsc_offset(vcpu, data); - ns = get_kernel_ns(); + ns = ktime_get_boot_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; if (vcpu->arch.virtual_tsc_khz) { @@ -1522,7 +1528,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated) update_ia32_tsc_adjust_msr(vcpu, offset); - kvm_x86_ops->write_tsc_offset(vcpu, offset); + kvm_vcpu_write_tsc_offset(vcpu, offset); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); spin_lock(&kvm->arch.pvclock_gtod_sync_lock); @@ -1716,6 +1722,88 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) #endif } +static u64 __get_kvmclock_ns(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, 0); + struct kvm_arch *ka = &kvm->arch; + s64 ns; + + if (vcpu->arch.hv_clock.flags & PVCLOCK_TSC_STABLE_BIT) { + u64 tsc = kvm_read_l1_tsc(vcpu, rdtsc()); + ns = __pvclock_read_cycles(&vcpu->arch.hv_clock, tsc); + } else { + ns = ktime_get_boot_ns() + ka->kvmclock_offset; + } + + return ns; +} + +u64 get_kvmclock_ns(struct kvm *kvm) +{ + unsigned long flags; + s64 ns; + + local_irq_save(flags); + ns = __get_kvmclock_ns(kvm); + local_irq_restore(flags); + + return ns; +} + +static void kvm_setup_pvclock_page(struct kvm_vcpu *v) +{ + struct kvm_vcpu_arch *vcpu = &v->arch; + struct pvclock_vcpu_time_info guest_hv_clock; + + if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time, + &guest_hv_clock, sizeof(guest_hv_clock)))) + return; + + /* This VCPU is paused, but it's legal for a guest to read another + * VCPU's kvmclock, so we really have to follow the specification where + * it says that version is odd if data is being modified, and even after + * it is consistent. + * + * Version field updates must be kept separate. This is because + * kvm_write_guest_cached might use a "rep movs" instruction, and + * writes within a string instruction are weakly ordered. So there + * are three writes overall. + * + * As a small optimization, only write the version field in the first + * and third write. The vcpu->pv_time cache is still valid, because the + * version field is the first in the struct. + */ + BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); + + vcpu->hv_clock.version = guest_hv_clock.version + 1; + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); + + smp_wmb(); + + /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ + vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED); + + if (vcpu->pvclock_set_guest_stopped_request) { + vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED; + vcpu->pvclock_set_guest_stopped_request = false; + } + + trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); + + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock)); + + smp_wmb(); + + vcpu->hv_clock.version++; + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); +} + static int kvm_guest_time_update(struct kvm_vcpu *v) { unsigned long flags, tgt_tsc_khz; @@ -1723,7 +1811,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) struct kvm_arch *ka = &v->kvm->arch; s64 kernel_ns; u64 tsc_timestamp, host_tsc; - struct pvclock_vcpu_time_info guest_hv_clock; u8 pvclock_flags; bool use_master_clock; @@ -1752,7 +1839,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) } if (!use_master_clock) { host_tsc = rdtsc(); - kernel_ns = get_kernel_ns(); + kernel_ns = ktime_get_boot_ns(); } tsc_timestamp = kvm_read_l1_tsc(v, host_tsc); @@ -1777,8 +1864,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) local_irq_restore(flags); - if (!vcpu->pv_time_enabled) - return 0; + /* With all the info we got, fill in the values */ if (kvm_has_tsc_control) tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz); @@ -1790,64 +1876,21 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->hw_tsc_khz = tgt_tsc_khz; } - /* With all the info we got, fill in the values */ vcpu->hv_clock.tsc_timestamp = tsc_timestamp; vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; vcpu->last_guest_tsc = tsc_timestamp; - if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time, - &guest_hv_clock, sizeof(guest_hv_clock)))) - return 0; - - /* This VCPU is paused, but it's legal for a guest to read another - * VCPU's kvmclock, so we really have to follow the specification where - * it says that version is odd if data is being modified, and even after - * it is consistent. - * - * Version field updates must be kept separate. This is because - * kvm_write_guest_cached might use a "rep movs" instruction, and - * writes within a string instruction are weakly ordered. So there - * are three writes overall. - * - * As a small optimization, only write the version field in the first - * and third write. The vcpu->pv_time cache is still valid, because the - * version field is the first in the struct. - */ - BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); - - vcpu->hv_clock.version = guest_hv_clock.version + 1; - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); - - smp_wmb(); - - /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ - pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED); - - if (vcpu->pvclock_set_guest_stopped_request) { - pvclock_flags |= PVCLOCK_GUEST_STOPPED; - vcpu->pvclock_set_guest_stopped_request = false; - } - /* If the host uses TSC clocksource, then it is stable */ + pvclock_flags = 0; if (use_master_clock) pvclock_flags |= PVCLOCK_TSC_STABLE_BIT; vcpu->hv_clock.flags = pvclock_flags; - trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); - - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock)); - - smp_wmb(); - - vcpu->hv_clock.version++; - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); + if (vcpu->pv_time_enabled) + kvm_setup_pvclock_page(v); + if (v == kvm_get_vcpu(v->kvm, 0)) + kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock); return 0; } @@ -2746,7 +2789,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (check_tsc_unstable()) { u64 offset = kvm_compute_tsc_offset(vcpu, vcpu->arch.last_guest_tsc); - kvm_x86_ops->write_tsc_offset(vcpu, offset); + kvm_vcpu_write_tsc_offset(vcpu, offset); vcpu->arch.tsc_catchup = 1; } if (kvm_lapic_hv_timer_in_use(vcpu) && @@ -4039,7 +4082,6 @@ long kvm_arch_vm_ioctl(struct file *filp, case KVM_SET_CLOCK: { struct kvm_clock_data user_ns; u64 now_ns; - s64 delta; r = -EFAULT; if (copy_from_user(&user_ns, argp, sizeof(user_ns))) @@ -4051,10 +4093,9 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; local_irq_disable(); - now_ns = get_kernel_ns(); - delta = user_ns.clock - now_ns; + now_ns = __get_kvmclock_ns(kvm); + kvm->arch.kvmclock_offset += user_ns.clock - now_ns; local_irq_enable(); - kvm->arch.kvmclock_offset = delta; kvm_gen_update_masterclock(kvm); break; } @@ -4062,10 +4103,8 @@ long kvm_arch_vm_ioctl(struct file *filp, struct kvm_clock_data user_ns; u64 now_ns; - local_irq_disable(); - now_ns = get_kernel_ns(); - user_ns.clock = kvm->arch.kvmclock_offset + now_ns; - local_irq_enable(); + now_ns = get_kvmclock_ns(kvm); + user_ns.clock = now_ns; user_ns.flags = 0; memset(&user_ns.pad, 0, sizeof(user_ns.pad)); @@ -6700,7 +6739,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_put_guest_xcr0(vcpu); - /* Interrupt is enabled by handle_external_intr() */ kvm_x86_ops->handle_external_intr(vcpu); ++vcpu->stat.exits; @@ -7530,7 +7568,7 @@ int kvm_arch_hardware_enable(void) * before any KVM threads can be running. Unfortunately, we can't * bring the TSCs fully up to date with real time, as we aren't yet far * enough into CPU bringup that we know how much real time has actually - * elapsed; our helper function, get_kernel_ns() will be using boot + * elapsed; our helper function, ktime_get_boot_ns() will be using boot * variables that haven't been updated yet. * * So we simply find the maximum observed TSC above, then record the @@ -7765,6 +7803,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) mutex_init(&kvm->arch.apic_map_lock); spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); + kvm->arch.kvmclock_offset = -ktime_get_boot_ns(); pvclock_update_vm_gtod_copy(kvm); INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index a82ca466b62e..e8ff3e4ce38a 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -148,11 +148,6 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu, return kvm_register_write(vcpu, reg, val); } -static inline u64 get_kernel_ns(void) -{ - return ktime_get_boot_ns(); -} - static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk) { return !(kvm->arch.disabled_quirks & quirk); @@ -164,6 +159,7 @@ void kvm_set_pending_timer(struct kvm_vcpu *vcpu); int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); +u64 get_kvmclock_ns(struct kvm *kvm); int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index 97062a635b77..5c6fc3577a49 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -23,8 +23,6 @@ obj-y += bus_numa.o obj-$(CONFIG_AMD_NB) += amd_bus.o obj-$(CONFIG_PCI_CNB20LE_QUIRK) += broadcom_bus.o -obj-$(CONFIG_VMD) += vmd.o - ifeq ($(CONFIG_PCI_DEBUG),y) EXTRA_CFLAGS += -DDEBUG endif diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 7b6a9d14c8c0..a4fdfa7dcc1b 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -677,6 +677,12 @@ static void set_dma_domain_ops(struct pci_dev *pdev) static void set_dma_domain_ops(struct pci_dev *pdev) {} #endif +static void set_dev_domain_options(struct pci_dev *pdev) +{ + if (is_vmd(pdev->bus)) + pdev->hotplug_user_indicators = 1; +} + int pcibios_add_device(struct pci_dev *dev) { struct setup_data *data; @@ -707,6 +713,7 @@ int pcibios_add_device(struct pci_dev *dev) iounmap(data); } set_dma_domain_ops(dev); + set_dev_domain_options(dev); return 0; } diff --git a/arch/x86/pci/vmd.c b/arch/x86/pci/vmd.c deleted file mode 100644 index 7948be342ee9..000000000000 --- a/arch/x86/pci/vmd.c +++ /dev/null @@ -1,771 +0,0 @@ -/* - * Volume Management Device driver - * Copyright (c) 2015, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - */ - -#include <linux/device.h> -#include <linux/interrupt.h> -#include <linux/irq.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/msi.h> -#include <linux/pci.h> -#include <linux/rculist.h> -#include <linux/rcupdate.h> - -#include <asm/irqdomain.h> -#include <asm/device.h> -#include <asm/msi.h> -#include <asm/msidef.h> - -#define VMD_CFGBAR 0 -#define VMD_MEMBAR1 2 -#define VMD_MEMBAR2 4 - -/* - * Lock for manipulating VMD IRQ lists. - */ -static DEFINE_RAW_SPINLOCK(list_lock); - -/** - * struct vmd_irq - private data to map driver IRQ to the VMD shared vector - * @node: list item for parent traversal. - * @rcu: RCU callback item for freeing. - * @irq: back pointer to parent. - * @enabled: true if driver enabled IRQ - * @virq: the virtual IRQ value provided to the requesting driver. - * - * Every MSI/MSI-X IRQ requested for a device in a VMD domain will be mapped to - * a VMD IRQ using this structure. - */ -struct vmd_irq { - struct list_head node; - struct rcu_head rcu; - struct vmd_irq_list *irq; - bool enabled; - unsigned int virq; -}; - -/** - * struct vmd_irq_list - list of driver requested IRQs mapping to a VMD vector - * @irq_list: the list of irq's the VMD one demuxes to. - * @vmd_vector: the h/w IRQ assigned to the VMD. - * @index: index into the VMD MSI-X table; used for message routing. - * @count: number of child IRQs assigned to this vector; used to track - * sharing. - */ -struct vmd_irq_list { - struct list_head irq_list; - struct vmd_dev *vmd; - unsigned int vmd_vector; - unsigned int index; - unsigned int count; -}; - -struct vmd_dev { - struct pci_dev *dev; - - spinlock_t cfg_lock; - char __iomem *cfgbar; - - int msix_count; - struct msix_entry *msix_entries; - struct vmd_irq_list *irqs; - - struct pci_sysdata sysdata; - struct resource resources[3]; - struct irq_domain *irq_domain; - struct pci_bus *bus; - -#ifdef CONFIG_X86_DEV_DMA_OPS - struct dma_map_ops dma_ops; - struct dma_domain dma_domain; -#endif -}; - -static inline struct vmd_dev *vmd_from_bus(struct pci_bus *bus) -{ - return container_of(bus->sysdata, struct vmd_dev, sysdata); -} - -/* - * Drivers managing a device in a VMD domain allocate their own IRQs as before, - * but the MSI entry for the hardware it's driving will be programmed with a - * destination ID for the VMD MSI-X table. The VMD muxes interrupts in its - * domain into one of its own, and the VMD driver de-muxes these for the - * handlers sharing that VMD IRQ. The vmd irq_domain provides the operations - * and irq_chip to set this up. - */ -static void vmd_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) -{ - struct vmd_irq *vmdirq = data->chip_data; - struct vmd_irq_list *irq = vmdirq->irq; - - msg->address_hi = MSI_ADDR_BASE_HI; - msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_DEST_ID(irq->index); - msg->data = 0; -} - -/* - * We rely on MSI_FLAG_USE_DEF_CHIP_OPS to set the IRQ mask/unmask ops. - */ -static void vmd_irq_enable(struct irq_data *data) -{ - struct vmd_irq *vmdirq = data->chip_data; - unsigned long flags; - - raw_spin_lock_irqsave(&list_lock, flags); - WARN_ON(vmdirq->enabled); - list_add_tail_rcu(&vmdirq->node, &vmdirq->irq->irq_list); - vmdirq->enabled = true; - raw_spin_unlock_irqrestore(&list_lock, flags); - - data->chip->irq_unmask(data); -} - -static void vmd_irq_disable(struct irq_data *data) -{ - struct vmd_irq *vmdirq = data->chip_data; - unsigned long flags; - - data->chip->irq_mask(data); - - raw_spin_lock_irqsave(&list_lock, flags); - if (vmdirq->enabled) { - list_del_rcu(&vmdirq->node); - vmdirq->enabled = false; - } - raw_spin_unlock_irqrestore(&list_lock, flags); -} - -/* - * XXX: Stubbed until we develop acceptable way to not create conflicts with - * other devices sharing the same vector. - */ -static int vmd_irq_set_affinity(struct irq_data *data, - const struct cpumask *dest, bool force) -{ - return -EINVAL; -} - -static struct irq_chip vmd_msi_controller = { - .name = "VMD-MSI", - .irq_enable = vmd_irq_enable, - .irq_disable = vmd_irq_disable, - .irq_compose_msi_msg = vmd_compose_msi_msg, - .irq_set_affinity = vmd_irq_set_affinity, -}; - -static irq_hw_number_t vmd_get_hwirq(struct msi_domain_info *info, - msi_alloc_info_t *arg) -{ - return 0; -} - -/* - * XXX: We can be even smarter selecting the best IRQ once we solve the - * affinity problem. - */ -static struct vmd_irq_list *vmd_next_irq(struct vmd_dev *vmd, struct msi_desc *desc) -{ - int i, best = 1; - unsigned long flags; - - if (!desc->msi_attrib.is_msix || vmd->msix_count == 1) - return &vmd->irqs[0]; - - raw_spin_lock_irqsave(&list_lock, flags); - for (i = 1; i < vmd->msix_count; i++) - if (vmd->irqs[i].count < vmd->irqs[best].count) - best = i; - vmd->irqs[best].count++; - raw_spin_unlock_irqrestore(&list_lock, flags); - - return &vmd->irqs[best]; -} - -static int vmd_msi_init(struct irq_domain *domain, struct msi_domain_info *info, - unsigned int virq, irq_hw_number_t hwirq, - msi_alloc_info_t *arg) -{ - struct msi_desc *desc = arg->desc; - struct vmd_dev *vmd = vmd_from_bus(msi_desc_to_pci_dev(desc)->bus); - struct vmd_irq *vmdirq = kzalloc(sizeof(*vmdirq), GFP_KERNEL); - - if (!vmdirq) - return -ENOMEM; - - INIT_LIST_HEAD(&vmdirq->node); - vmdirq->irq = vmd_next_irq(vmd, desc); - vmdirq->virq = virq; - - irq_domain_set_info(domain, virq, vmdirq->irq->vmd_vector, info->chip, - vmdirq, handle_untracked_irq, vmd, NULL); - return 0; -} - -static void vmd_msi_free(struct irq_domain *domain, - struct msi_domain_info *info, unsigned int virq) -{ - struct vmd_irq *vmdirq = irq_get_chip_data(virq); - unsigned long flags; - - /* XXX: Potential optimization to rebalance */ - raw_spin_lock_irqsave(&list_lock, flags); - vmdirq->irq->count--; - raw_spin_unlock_irqrestore(&list_lock, flags); - - kfree_rcu(vmdirq, rcu); -} - -static int vmd_msi_prepare(struct irq_domain *domain, struct device *dev, - int nvec, msi_alloc_info_t *arg) -{ - struct pci_dev *pdev = to_pci_dev(dev); - struct vmd_dev *vmd = vmd_from_bus(pdev->bus); - - if (nvec > vmd->msix_count) - return vmd->msix_count; - - memset(arg, 0, sizeof(*arg)); - return 0; -} - -static void vmd_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) -{ - arg->desc = desc; -} - -static struct msi_domain_ops vmd_msi_domain_ops = { - .get_hwirq = vmd_get_hwirq, - .msi_init = vmd_msi_init, - .msi_free = vmd_msi_free, - .msi_prepare = vmd_msi_prepare, - .set_desc = vmd_set_desc, -}; - -static struct msi_domain_info vmd_msi_domain_info = { - .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | - MSI_FLAG_PCI_MSIX, - .ops = &vmd_msi_domain_ops, - .chip = &vmd_msi_controller, -}; - -#ifdef CONFIG_X86_DEV_DMA_OPS -/* - * VMD replaces the requester ID with its own. DMA mappings for devices in a - * VMD domain need to be mapped for the VMD, not the device requiring - * the mapping. - */ -static struct device *to_vmd_dev(struct device *dev) -{ - struct pci_dev *pdev = to_pci_dev(dev); - struct vmd_dev *vmd = vmd_from_bus(pdev->bus); - - return &vmd->dev->dev; -} - -static struct dma_map_ops *vmd_dma_ops(struct device *dev) -{ - return get_dma_ops(to_vmd_dev(dev)); -} - -static void *vmd_alloc(struct device *dev, size_t size, dma_addr_t *addr, - gfp_t flag, unsigned long attrs) -{ - return vmd_dma_ops(dev)->alloc(to_vmd_dev(dev), size, addr, flag, - attrs); -} - -static void vmd_free(struct device *dev, size_t size, void *vaddr, - dma_addr_t addr, unsigned long attrs) -{ - return vmd_dma_ops(dev)->free(to_vmd_dev(dev), size, vaddr, addr, - attrs); -} - -static int vmd_mmap(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t addr, size_t size, - unsigned long attrs) -{ - return vmd_dma_ops(dev)->mmap(to_vmd_dev(dev), vma, cpu_addr, addr, - size, attrs); -} - -static int vmd_get_sgtable(struct device *dev, struct sg_table *sgt, - void *cpu_addr, dma_addr_t addr, size_t size, - unsigned long attrs) -{ - return vmd_dma_ops(dev)->get_sgtable(to_vmd_dev(dev), sgt, cpu_addr, - addr, size, attrs); -} - -static dma_addr_t vmd_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, - unsigned long attrs) -{ - return vmd_dma_ops(dev)->map_page(to_vmd_dev(dev), page, offset, size, - dir, attrs); -} - -static void vmd_unmap_page(struct device *dev, dma_addr_t addr, size_t size, - enum dma_data_direction dir, unsigned long attrs) -{ - vmd_dma_ops(dev)->unmap_page(to_vmd_dev(dev), addr, size, dir, attrs); -} - -static int vmd_map_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir, unsigned long attrs) -{ - return vmd_dma_ops(dev)->map_sg(to_vmd_dev(dev), sg, nents, dir, attrs); -} - -static void vmd_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir, unsigned long attrs) -{ - vmd_dma_ops(dev)->unmap_sg(to_vmd_dev(dev), sg, nents, dir, attrs); -} - -static void vmd_sync_single_for_cpu(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir) -{ - vmd_dma_ops(dev)->sync_single_for_cpu(to_vmd_dev(dev), addr, size, dir); -} - -static void vmd_sync_single_for_device(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir) -{ - vmd_dma_ops(dev)->sync_single_for_device(to_vmd_dev(dev), addr, size, - dir); -} - -static void vmd_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction dir) -{ - vmd_dma_ops(dev)->sync_sg_for_cpu(to_vmd_dev(dev), sg, nents, dir); -} - -static void vmd_sync_sg_for_device(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction dir) -{ - vmd_dma_ops(dev)->sync_sg_for_device(to_vmd_dev(dev), sg, nents, dir); -} - -static int vmd_mapping_error(struct device *dev, dma_addr_t addr) -{ - return vmd_dma_ops(dev)->mapping_error(to_vmd_dev(dev), addr); -} - -static int vmd_dma_supported(struct device *dev, u64 mask) -{ - return vmd_dma_ops(dev)->dma_supported(to_vmd_dev(dev), mask); -} - -#ifdef ARCH_HAS_DMA_GET_REQUIRED_MASK -static u64 vmd_get_required_mask(struct device *dev) -{ - return vmd_dma_ops(dev)->get_required_mask(to_vmd_dev(dev)); -} -#endif - -static void vmd_teardown_dma_ops(struct vmd_dev *vmd) -{ - struct dma_domain *domain = &vmd->dma_domain; - - if (get_dma_ops(&vmd->dev->dev)) - del_dma_domain(domain); -} - -#define ASSIGN_VMD_DMA_OPS(source, dest, fn) \ - do { \ - if (source->fn) \ - dest->fn = vmd_##fn; \ - } while (0) - -static void vmd_setup_dma_ops(struct vmd_dev *vmd) -{ - const struct dma_map_ops *source = get_dma_ops(&vmd->dev->dev); - struct dma_map_ops *dest = &vmd->dma_ops; - struct dma_domain *domain = &vmd->dma_domain; - - domain->domain_nr = vmd->sysdata.domain; - domain->dma_ops = dest; - - if (!source) - return; - ASSIGN_VMD_DMA_OPS(source, dest, alloc); - ASSIGN_VMD_DMA_OPS(source, dest, free); - ASSIGN_VMD_DMA_OPS(source, dest, mmap); - ASSIGN_VMD_DMA_OPS(source, dest, get_sgtable); - ASSIGN_VMD_DMA_OPS(source, dest, map_page); - ASSIGN_VMD_DMA_OPS(source, dest, unmap_page); - ASSIGN_VMD_DMA_OPS(source, dest, map_sg); - ASSIGN_VMD_DMA_OPS(source, dest, unmap_sg); - ASSIGN_VMD_DMA_OPS(source, dest, sync_single_for_cpu); - ASSIGN_VMD_DMA_OPS(source, dest, sync_single_for_device); - ASSIGN_VMD_DMA_OPS(source, dest, sync_sg_for_cpu); - ASSIGN_VMD_DMA_OPS(source, dest, sync_sg_for_device); - ASSIGN_VMD_DMA_OPS(source, dest, mapping_error); - ASSIGN_VMD_DMA_OPS(source, dest, dma_supported); -#ifdef ARCH_HAS_DMA_GET_REQUIRED_MASK - ASSIGN_VMD_DMA_OPS(source, dest, get_required_mask); -#endif - add_dma_domain(domain); -} -#undef ASSIGN_VMD_DMA_OPS -#else -static void vmd_teardown_dma_ops(struct vmd_dev *vmd) {} -static void vmd_setup_dma_ops(struct vmd_dev *vmd) {} -#endif - -static char __iomem *vmd_cfg_addr(struct vmd_dev *vmd, struct pci_bus *bus, - unsigned int devfn, int reg, int len) -{ - char __iomem *addr = vmd->cfgbar + - (bus->number << 20) + (devfn << 12) + reg; - - if ((addr - vmd->cfgbar) + len >= - resource_size(&vmd->dev->resource[VMD_CFGBAR])) - return NULL; - - return addr; -} - -/* - * CPU may deadlock if config space is not serialized on some versions of this - * hardware, so all config space access is done under a spinlock. - */ -static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg, - int len, u32 *value) -{ - struct vmd_dev *vmd = vmd_from_bus(bus); - char __iomem *addr = vmd_cfg_addr(vmd, bus, devfn, reg, len); - unsigned long flags; - int ret = 0; - - if (!addr) - return -EFAULT; - - spin_lock_irqsave(&vmd->cfg_lock, flags); - switch (len) { - case 1: - *value = readb(addr); - break; - case 2: - *value = readw(addr); - break; - case 4: - *value = readl(addr); - break; - default: - ret = -EINVAL; - break; - } - spin_unlock_irqrestore(&vmd->cfg_lock, flags); - return ret; -} - -/* - * VMD h/w converts non-posted config writes to posted memory writes. The - * read-back in this function forces the completion so it returns only after - * the config space was written, as expected. - */ -static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg, - int len, u32 value) -{ - struct vmd_dev *vmd = vmd_from_bus(bus); - char __iomem *addr = vmd_cfg_addr(vmd, bus, devfn, reg, len); - unsigned long flags; - int ret = 0; - - if (!addr) - return -EFAULT; - - spin_lock_irqsave(&vmd->cfg_lock, flags); - switch (len) { - case 1: - writeb(value, addr); - readb(addr); - break; - case 2: - writew(value, addr); - readw(addr); - break; - case 4: - writel(value, addr); - readl(addr); - break; - default: - ret = -EINVAL; - break; - } - spin_unlock_irqrestore(&vmd->cfg_lock, flags); - return ret; -} - -static struct pci_ops vmd_ops = { - .read = vmd_pci_read, - .write = vmd_pci_write, -}; - -static void vmd_attach_resources(struct vmd_dev *vmd) -{ - vmd->dev->resource[VMD_MEMBAR1].child = &vmd->resources[1]; - vmd->dev->resource[VMD_MEMBAR2].child = &vmd->resources[2]; -} - -static void vmd_detach_resources(struct vmd_dev *vmd) -{ - vmd->dev->resource[VMD_MEMBAR1].child = NULL; - vmd->dev->resource[VMD_MEMBAR2].child = NULL; -} - -/* - * VMD domains start at 0x1000 to not clash with ACPI _SEG domains. - */ -static int vmd_find_free_domain(void) -{ - int domain = 0xffff; - struct pci_bus *bus = NULL; - - while ((bus = pci_find_next_bus(bus)) != NULL) - domain = max_t(int, domain, pci_domain_nr(bus)); - return domain + 1; -} - -static int vmd_enable_domain(struct vmd_dev *vmd) -{ - struct pci_sysdata *sd = &vmd->sysdata; - struct resource *res; - u32 upper_bits; - unsigned long flags; - LIST_HEAD(resources); - - res = &vmd->dev->resource[VMD_CFGBAR]; - vmd->resources[0] = (struct resource) { - .name = "VMD CFGBAR", - .start = 0, - .end = (resource_size(res) >> 20) - 1, - .flags = IORESOURCE_BUS | IORESOURCE_PCI_FIXED, - }; - - /* - * If the window is below 4GB, clear IORESOURCE_MEM_64 so we can - * put 32-bit resources in the window. - * - * There's no hardware reason why a 64-bit window *couldn't* - * contain a 32-bit resource, but pbus_size_mem() computes the - * bridge window size assuming a 64-bit window will contain no - * 32-bit resources. __pci_assign_resource() enforces that - * artificial restriction to make sure everything will fit. - * - * The only way we could use a 64-bit non-prefechable MEMBAR is - * if its address is <4GB so that we can convert it to a 32-bit - * resource. To be visible to the host OS, all VMD endpoints must - * be initially configured by platform BIOS, which includes setting - * up these resources. We can assume the device is configured - * according to the platform needs. - */ - res = &vmd->dev->resource[VMD_MEMBAR1]; - upper_bits = upper_32_bits(res->end); - flags = res->flags & ~IORESOURCE_SIZEALIGN; - if (!upper_bits) - flags &= ~IORESOURCE_MEM_64; - vmd->resources[1] = (struct resource) { - .name = "VMD MEMBAR1", - .start = res->start, - .end = res->end, - .flags = flags, - .parent = res, - }; - - res = &vmd->dev->resource[VMD_MEMBAR2]; - upper_bits = upper_32_bits(res->end); - flags = res->flags & ~IORESOURCE_SIZEALIGN; - if (!upper_bits) - flags &= ~IORESOURCE_MEM_64; - vmd->resources[2] = (struct resource) { - .name = "VMD MEMBAR2", - .start = res->start + 0x2000, - .end = res->end, - .flags = flags, - .parent = res, - }; - - sd->domain = vmd_find_free_domain(); - if (sd->domain < 0) - return sd->domain; - - sd->node = pcibus_to_node(vmd->dev->bus); - - vmd->irq_domain = pci_msi_create_irq_domain(NULL, &vmd_msi_domain_info, - x86_vector_domain); - if (!vmd->irq_domain) - return -ENODEV; - - pci_add_resource(&resources, &vmd->resources[0]); - pci_add_resource(&resources, &vmd->resources[1]); - pci_add_resource(&resources, &vmd->resources[2]); - vmd->bus = pci_create_root_bus(&vmd->dev->dev, 0, &vmd_ops, sd, - &resources); - if (!vmd->bus) { - pci_free_resource_list(&resources); - irq_domain_remove(vmd->irq_domain); - return -ENODEV; - } - - vmd_attach_resources(vmd); - vmd_setup_dma_ops(vmd); - dev_set_msi_domain(&vmd->bus->dev, vmd->irq_domain); - pci_rescan_bus(vmd->bus); - - WARN(sysfs_create_link(&vmd->dev->dev.kobj, &vmd->bus->dev.kobj, - "domain"), "Can't create symlink to domain\n"); - return 0; -} - -static irqreturn_t vmd_irq(int irq, void *data) -{ - struct vmd_irq_list *irqs = data; - struct vmd_irq *vmdirq; - - rcu_read_lock(); - list_for_each_entry_rcu(vmdirq, &irqs->irq_list, node) - generic_handle_irq(vmdirq->virq); - rcu_read_unlock(); - - return IRQ_HANDLED; -} - -static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id) -{ - struct vmd_dev *vmd; - int i, err; - - if (resource_size(&dev->resource[VMD_CFGBAR]) < (1 << 20)) - return -ENOMEM; - - vmd = devm_kzalloc(&dev->dev, sizeof(*vmd), GFP_KERNEL); - if (!vmd) - return -ENOMEM; - - vmd->dev = dev; - err = pcim_enable_device(dev); - if (err < 0) - return err; - - vmd->cfgbar = pcim_iomap(dev, VMD_CFGBAR, 0); - if (!vmd->cfgbar) - return -ENOMEM; - - pci_set_master(dev); - if (dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(64)) && - dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(32))) - return -ENODEV; - - vmd->msix_count = pci_msix_vec_count(dev); - if (vmd->msix_count < 0) - return -ENODEV; - - vmd->irqs = devm_kcalloc(&dev->dev, vmd->msix_count, sizeof(*vmd->irqs), - GFP_KERNEL); - if (!vmd->irqs) - return -ENOMEM; - - vmd->msix_entries = devm_kcalloc(&dev->dev, vmd->msix_count, - sizeof(*vmd->msix_entries), - GFP_KERNEL); - if (!vmd->msix_entries) - return -ENOMEM; - for (i = 0; i < vmd->msix_count; i++) - vmd->msix_entries[i].entry = i; - - vmd->msix_count = pci_enable_msix_range(vmd->dev, vmd->msix_entries, 1, - vmd->msix_count); - if (vmd->msix_count < 0) - return vmd->msix_count; - - for (i = 0; i < vmd->msix_count; i++) { - INIT_LIST_HEAD(&vmd->irqs[i].irq_list); - vmd->irqs[i].vmd_vector = vmd->msix_entries[i].vector; - vmd->irqs[i].index = i; - - err = devm_request_irq(&dev->dev, vmd->irqs[i].vmd_vector, - vmd_irq, 0, "vmd", &vmd->irqs[i]); - if (err) - return err; - } - - spin_lock_init(&vmd->cfg_lock); - pci_set_drvdata(dev, vmd); - err = vmd_enable_domain(vmd); - if (err) - return err; - - dev_info(&vmd->dev->dev, "Bound to PCI domain %04x\n", - vmd->sysdata.domain); - return 0; -} - -static void vmd_remove(struct pci_dev *dev) -{ - struct vmd_dev *vmd = pci_get_drvdata(dev); - - vmd_detach_resources(vmd); - pci_set_drvdata(dev, NULL); - sysfs_remove_link(&vmd->dev->dev.kobj, "domain"); - pci_stop_root_bus(vmd->bus); - pci_remove_root_bus(vmd->bus); - vmd_teardown_dma_ops(vmd); - irq_domain_remove(vmd->irq_domain); -} - -#ifdef CONFIG_PM -static int vmd_suspend(struct device *dev) -{ - struct pci_dev *pdev = to_pci_dev(dev); - - pci_save_state(pdev); - return 0; -} - -static int vmd_resume(struct device *dev) -{ - struct pci_dev *pdev = to_pci_dev(dev); - - pci_restore_state(pdev); - return 0; -} -#endif -static SIMPLE_DEV_PM_OPS(vmd_dev_pm_ops, vmd_suspend, vmd_resume); - -static const struct pci_device_id vmd_ids[] = { - {PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x201d),}, - {0,} -}; -MODULE_DEVICE_TABLE(pci, vmd_ids); - -static struct pci_driver vmd_drv = { - .name = "vmd", - .id_table = vmd_ids, - .probe = vmd_probe, - .remove = vmd_remove, - .driver = { - .pm = &vmd_dev_pm_ops, - }, -}; -module_pci_driver(vmd_drv); - -MODULE_AUTHOR("Intel Corporation"); -MODULE_LICENSE("GPL v2"); -MODULE_VERSION("0.6"); diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 3a483cb5ac81..bedfab98077a 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -456,7 +456,7 @@ void __init xen_msi_init(void) int __init pci_xen_hvm_init(void) { - if (!xen_have_vector_callback || !xen_feature(XENFEAT_hvm_pirqs)) + if (!xen_feature(XENFEAT_hvm_pirqs)) return 0; #ifdef CONFIG_ACPI diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index f1d2182e071f..c0fdd57da7aa 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -137,8 +137,10 @@ struct shared_info xen_dummy_shared_info; void *xen_initial_gdt; RESERVE_BRK(shared_info_page_brk, PAGE_SIZE); -__read_mostly int xen_have_vector_callback; -EXPORT_SYMBOL_GPL(xen_have_vector_callback); + +static int xen_cpu_up_prepare(unsigned int cpu); +static int xen_cpu_up_online(unsigned int cpu); +static int xen_cpu_dead(unsigned int cpu); /* * Point at some empty memory to start with. We map the real shared_info @@ -1519,10 +1521,7 @@ static void __init xen_pvh_early_guest_init(void) if (!xen_feature(XENFEAT_auto_translated_physmap)) return; - if (!xen_feature(XENFEAT_hvm_callback_vector)) - return; - - xen_have_vector_callback = 1; + BUG_ON(!xen_feature(XENFEAT_hvm_callback_vector)); xen_pvh_early_cpu_init(0, false); xen_pvh_set_cr_flags(0); @@ -1538,6 +1537,24 @@ static void __init xen_dom0_set_legacy_features(void) x86_platform.legacy.rtc = 1; } +static int xen_cpuhp_setup(void) +{ + int rc; + + rc = cpuhp_setup_state_nocalls(CPUHP_XEN_PREPARE, + "XEN_HVM_GUEST_PREPARE", + xen_cpu_up_prepare, xen_cpu_dead); + if (rc >= 0) { + rc = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "XEN_HVM_GUEST_ONLINE", + xen_cpu_up_online, NULL); + if (rc < 0) + cpuhp_remove_state_nocalls(CPUHP_XEN_PREPARE); + } + + return rc >= 0 ? 0 : rc; +} + /* First C function to be called on Xen boot */ asmlinkage __visible void __init xen_start_kernel(void) { @@ -1639,6 +1656,8 @@ asmlinkage __visible void __init xen_start_kernel(void) possible map and a non-dummy shared_info. */ per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; + WARN_ON(xen_cpuhp_setup()); + local_irq_disable(); early_boot_irqs_disabled = true; @@ -1819,31 +1838,54 @@ static void __init init_hvm_pv_info(void) xen_domain_type = XEN_HVM_DOMAIN; } -static int xen_hvm_cpu_notify(struct notifier_block *self, unsigned long action, - void *hcpu) +static int xen_cpu_up_prepare(unsigned int cpu) { - int cpu = (long)hcpu; - switch (action) { - case CPU_UP_PREPARE: + int rc; + + if (xen_hvm_domain()) { + /* + * This can happen if CPU was offlined earlier and + * offlining timed out in common_cpu_die(). + */ + if (cpu_report_state(cpu) == CPU_DEAD_FROZEN) { + xen_smp_intr_free(cpu); + xen_uninit_lock_cpu(cpu); + } + if (cpu_acpi_id(cpu) != U32_MAX) per_cpu(xen_vcpu_id, cpu) = cpu_acpi_id(cpu); else per_cpu(xen_vcpu_id, cpu) = cpu; xen_vcpu_setup(cpu); - if (xen_have_vector_callback) { - if (xen_feature(XENFEAT_hvm_safe_pvclock)) - xen_setup_timer(cpu); - } - break; - default: - break; } - return NOTIFY_OK; + + if (xen_pv_domain() || xen_feature(XENFEAT_hvm_safe_pvclock)) + xen_setup_timer(cpu); + + rc = xen_smp_intr_init(cpu); + if (rc) { + WARN(1, "xen_smp_intr_init() for CPU %d failed: %d\n", + cpu, rc); + return rc; + } + return 0; } -static struct notifier_block xen_hvm_cpu_notifier = { - .notifier_call = xen_hvm_cpu_notify, -}; +static int xen_cpu_dead(unsigned int cpu) +{ + xen_smp_intr_free(cpu); + + if (xen_pv_domain() || xen_feature(XENFEAT_hvm_safe_pvclock)) + xen_teardown_timer(cpu); + + return 0; +} + +static int xen_cpu_up_online(unsigned int cpu) +{ + xen_init_lock_cpu(cpu); + return 0; +} #ifdef CONFIG_KEXEC_CORE static void xen_hvm_shutdown(void) @@ -1871,10 +1913,10 @@ static void __init xen_hvm_guest_init(void) xen_panic_handler_init(); - if (xen_feature(XENFEAT_hvm_callback_vector)) - xen_have_vector_callback = 1; + BUG_ON(!xen_feature(XENFEAT_hvm_callback_vector)); + xen_hvm_smp_init(); - register_cpu_notifier(&xen_hvm_cpu_notifier); + WARN_ON(xen_cpuhp_setup()); xen_unplug_emulated_devices(); x86_init.irqs.intr_init = xen_init_IRQ; xen_hvm_init_time_ops(); @@ -1910,7 +1952,7 @@ bool xen_hvm_need_lapic(void) return false; if (!xen_hvm_domain()) return false; - if (xen_feature(XENFEAT_hvm_pirqs) && xen_have_vector_callback) + if (xen_feature(XENFEAT_hvm_pirqs)) return false; return true; } diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c index de4144c24f1c..809b6c812654 100644 --- a/arch/x86/xen/grant-table.c +++ b/arch/x86/xen/grant-table.c @@ -89,7 +89,7 @@ void arch_gnttab_unmap(void *shared, unsigned long nr_gframes) static int arch_gnttab_valloc(struct gnttab_vm_area *area, unsigned nr_frames) { - area->ptes = kmalloc(sizeof(pte_t *) * nr_frames, GFP_KERNEL); + area->ptes = kmalloc_array(nr_frames, sizeof(*area->ptes), GFP_KERNEL); if (area->ptes == NULL) return -ENOMEM; diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c index d37a0c7f82cb..90d1b83cf35f 100644 --- a/arch/x86/xen/platform-pci-unplug.c +++ b/arch/x86/xen/platform-pci-unplug.c @@ -61,7 +61,7 @@ static int check_platform_magic(void) } break; default: - printk(KERN_WARNING "Xen Platform PCI: unknown I/O protocol version"); + printk(KERN_WARNING "Xen Platform PCI: unknown I/O protocol version\n"); return XEN_PLATFORM_ERR_PROTOCOL; } diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c index 32bdc2c90297..b9fc52556bcc 100644 --- a/arch/x86/xen/pmu.c +++ b/arch/x86/xen/pmu.c @@ -547,8 +547,11 @@ void xen_pmu_init(int cpu) return; fail: - pr_info_once("Could not initialize VPMU for cpu %d, error %d\n", - cpu, err); + if (err == -EOPNOTSUPP || err == -ENOSYS) + pr_info_once("VPMU disabled by hypervisor.\n"); + else + pr_info_once("Could not initialize VPMU for cpu %d, error %d\n", + cpu, err); free_pages((unsigned long)xenpmu_data, 0); } diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 0b4d04c8ab4d..9fa27ceeecfd 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -87,6 +87,12 @@ static void cpu_bringup(void) cpu_data(cpu).x86_max_cores = 1; set_cpu_sibling_map(cpu); + /* + * identify_cpu() may have set logical_pkg_id to -1 due + * to incorrect phys_proc_id. Let's re-comupte it. + */ + topology_update_package_map(apic->cpu_present_to_apicid(cpu), cpu); + xen_setup_cpu_clockevents(); notify_cpu_starting(cpu); @@ -115,7 +121,7 @@ asmlinkage __visible void cpu_bringup_and_idle(int cpu) cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } -static void xen_smp_intr_free(unsigned int cpu) +void xen_smp_intr_free(unsigned int cpu) { if (per_cpu(xen_resched_irq, cpu).irq >= 0) { unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu).irq, NULL); @@ -159,7 +165,7 @@ static void xen_smp_intr_free(unsigned int cpu) per_cpu(xen_pmu_irq, cpu).name = NULL; } }; -static int xen_smp_intr_init(unsigned int cpu) +int xen_smp_intr_init(unsigned int cpu) { int rc; char *resched_name, *callfunc_name, *debug_name, *pmu_name; @@ -475,8 +481,6 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle) common_cpu_up(cpu, idle); xen_setup_runstate_info(cpu); - xen_setup_timer(cpu); - xen_init_lock_cpu(cpu); /* * PV VCPUs are always successfully taken down (see 'while' loop @@ -495,10 +499,6 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle) xen_pmu_init(cpu); - rc = xen_smp_intr_init(cpu); - if (rc) - return rc; - rc = HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL); BUG_ON(rc); @@ -769,47 +769,10 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) xen_init_lock_cpu(0); } -static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle) -{ - int rc; - - /* - * This can happen if CPU was offlined earlier and - * offlining timed out in common_cpu_die(). - */ - if (cpu_report_state(cpu) == CPU_DEAD_FROZEN) { - xen_smp_intr_free(cpu); - xen_uninit_lock_cpu(cpu); - } - - /* - * xen_smp_intr_init() needs to run before native_cpu_up() - * so that IPI vectors are set up on the booting CPU before - * it is marked online in native_cpu_up(). - */ - rc = xen_smp_intr_init(cpu); - WARN_ON(rc); - if (!rc) - rc = native_cpu_up(cpu, tidle); - - /* - * We must initialize the slowpath CPU kicker _after_ the native - * path has executed. If we initialized it before none of the - * unlocker IPI kicks would reach the booting CPU as the booting - * CPU had not set itself 'online' in cpu_online_mask. That mask - * is checked when IPIs are sent (on HVM at least). - */ - xen_init_lock_cpu(cpu); - return rc; -} - void __init xen_hvm_smp_init(void) { - if (!xen_have_vector_callback) - return; smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus; smp_ops.smp_send_reschedule = xen_smp_send_reschedule; - smp_ops.cpu_up = xen_hvm_cpu_up; smp_ops.cpu_die = xen_cpu_die; smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi; smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi; diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h index 963d62a35c82..c5c16dc4f694 100644 --- a/arch/x86/xen/smp.h +++ b/arch/x86/xen/smp.h @@ -1,5 +1,6 @@ #ifndef _XEN_SMP_H +#ifdef CONFIG_SMP extern void xen_send_IPI_mask(const struct cpumask *mask, int vector); extern void xen_send_IPI_mask_allbutself(const struct cpumask *mask, @@ -8,6 +9,18 @@ extern void xen_send_IPI_allbutself(int vector); extern void xen_send_IPI_all(int vector); extern void xen_send_IPI_self(int vector); +extern int xen_smp_intr_init(unsigned int cpu); +extern void xen_smp_intr_free(unsigned int cpu); + +#else /* CONFIG_SMP */ + +static inline int xen_smp_intr_init(unsigned int cpu) +{ + return 0; +} +static inline void xen_smp_intr_free(unsigned int cpu) {} +#endif /* CONFIG_SMP */ + #ifdef CONFIG_XEN_PVH extern void xen_pvh_early_cpu_init(int cpu, bool entry); #else diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 67356d29d74d..33d8f6a7829d 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -432,11 +432,6 @@ static void xen_hvm_setup_cpu_clockevents(void) void __init xen_hvm_init_time_ops(void) { - /* vector callback is needed otherwise we cannot receive interrupts - * on cpu > 0 and at this point we don't know how many cpus are - * available */ - if (!xen_have_vector_callback) - return; if (!xen_feature(XENFEAT_hvm_safe_pvclock)) { printk(KERN_INFO "Xen doesn't support pvclock on HVM," "disable pv timer\n"); |