41 files changed, 1112 insertions, 1173 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 9b2d50a73a11..bada636d1065 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -23,11 +23,11 @@ config X86
 	select ARCH_CLOCKSOURCE_DATA
 	select ARCH_DISCARD_MEMBLOCK
 	select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
-	select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
 	select ARCH_HAS_DEVMEM_IS_ALLOWED
 	select ARCH_HAS_ELF_RANDOMIZE
 	select ARCH_HAS_FAST_MULTIPLIER
 	select ARCH_HAS_GCOV_PROFILE_ALL
+	select ARCH_HAS_GIGANTIC_PAGE		if X86_64
 	select ARCH_HAS_KCOV			if X86_64
 	select ARCH_HAS_PMEM_API		if X86_64
 	select ARCH_HAS_MMIO_FLUSH
@@ -2757,19 +2757,6 @@ config PMC_ATOM
 	def_bool y
         depends on PCI
 
-config VMD
-	depends on PCI_MSI
-	tristate "Volume Management Device Driver"
-	default N
-	---help---
-	  Adds support for the Intel Volume Management Device (VMD). VMD is a
-	  secondary PCI host bridge that allows PCI Express root ports,
-	  and devices attached to them, to be removed from the default
-	  PCI domain and placed within the VMD domain. This provides
-	  more bus resources than are otherwise possible with a
-	  single domain. If you know your system provides one of these and
-	  has devices attached to it, say Y; if you are not sure, say N.
-
 source "net/Kconfig"
 
 source "drivers/Kconfig"
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 830ed391e7ef..2d449337a360 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -163,11 +163,12 @@ asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1)
 asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1)
 avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
 avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
+avx512_instr :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,-DCONFIG_AS_AVX512=1)
 sha1_ni_instr :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA1_NI=1)
 sha256_ni_instr :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA256_NI=1)
 
-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(sha1_ni_instr) $(sha256_ni_instr)
-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(sha1_ni_instr) $(sha256_ni_instr)
+KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr)
+KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr)
 
 LDFLAGS := -m elf_$(UTS_MACHINE)
 
diff --git a/arch/x86/configs/kvm_guest.config b/arch/x86/configs/kvm_guest.config
deleted file mode 100644
index 9906505c998a..000000000000
--- a/arch/x86/configs/kvm_guest.config
+++ /dev/null
@@ -1,31 +0,0 @@
-CONFIG_NET=y
-CONFIG_NET_CORE=y
-CONFIG_NETDEVICES=y
-CONFIG_BLOCK=y
-CONFIG_BLK_DEV=y
-CONFIG_NETWORK_FILESYSTEMS=y
-CONFIG_INET=y
-CONFIG_TTY=y
-CONFIG_SERIAL_8250=y
-CONFIG_SERIAL_8250_CONSOLE=y
-CONFIG_IP_PNP=y
-CONFIG_IP_PNP_DHCP=y
-CONFIG_BINFMT_ELF=y
-CONFIG_PCI=y
-CONFIG_PCI_MSI=y
-CONFIG_DEBUG_KERNEL=y
-CONFIG_VIRTUALIZATION=y
-CONFIG_HYPERVISOR_GUEST=y
-CONFIG_PARAVIRT=y
-CONFIG_KVM_GUEST=y
-CONFIG_VIRTIO=y
-CONFIG_VIRTIO_PCI=y
-CONFIG_VIRTIO_BLK=y
-CONFIG_VIRTIO_CONSOLE=y
-CONFIG_VIRTIO_NET=y
-CONFIG_9P_FS=y
-CONFIG_NET_9P=y
-CONFIG_NET_9P_VIRTIO=y
-CONFIG_SCSI_LOWLEVEL=y
-CONFIG_SCSI_VIRTIO=y
-CONFIG_VIRTIO_INPUT=y
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index fee1d95902b5..c98ec2efd750 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1148,7 +1148,7 @@ END(error_entry)
 
 
 /*
- * On entry, EBS is a "return to kernel mode" flag:
+ * On entry, EBX is a "return to kernel mode" flag:
  *   1: already in kernel mode, don't need SWAPGS
  *   0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode
  */
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index 94d54d0defa7..02223cb4bcfd 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -129,7 +129,7 @@ static notrace cycle_t vread_pvclock(int *mode)
 			return 0;
 		}
 
-		ret = __pvclock_read_cycles(pvti);
+		ret = __pvclock_read_cycles(pvti, rdtsc_ordered());
 	} while (pvclock_read_retry(pvti, version));
 
 	/* refer to vread_tsc() comment for rationale */
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index e7de5c9a4fbd..16d3fa211962 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -50,8 +50,9 @@ extern int vector_used_by_percpu_irq(unsigned int vector);
 extern void init_ISA_irqs(void);
 
 #ifdef CONFIG_X86_LOCAL_APIC
-void arch_trigger_all_cpu_backtrace(bool);
-#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
+void arch_trigger_cpumask_backtrace(const struct cpumask *mask,
+				    bool exclude_self);
+#define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace
 #endif
 
 #endif /* _ASM_X86_IRQ_H */
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index b77f5edb03b0..ac7692dcfa2e 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -4,6 +4,10 @@
 #include <asm/processor-flags.h>
 
 #ifndef __ASSEMBLY__
+
+/* Provide __cpuidle; we can't safely include <linux/cpu.h> */
+#define __cpuidle __attribute__((__section__(".cpuidle.text")))
+
 /*
  * Interrupt control:
  */
@@ -44,12 +48,12 @@ static inline void native_irq_enable(void)
 	asm volatile("sti": : :"memory");
 }
 
-static inline void native_safe_halt(void)
+static inline __cpuidle void native_safe_halt(void)
 {
 	asm volatile("sti; hlt": : :"memory");
 }
 
-static inline void native_halt(void)
+static inline __cpuidle void native_halt(void)
 {
 	asm volatile("hlt": : :"memory");
 }
@@ -86,7 +90,7 @@ static inline notrace void arch_local_irq_enable(void)
  * Used in the idle loop; sti takes one instruction cycle
  * to complete:
  */
-static inline void arch_safe_halt(void)
+static inline __cpuidle void arch_safe_halt(void)
 {
 	native_safe_halt();
 }
@@ -95,7 +99,7 @@ static inline void arch_safe_halt(void)
  * Used when interrupts are already enabled or to
  * shutdown the processor:
  */
-static inline void halt(void)
+static inline __cpuidle void halt(void)
 {
 	native_halt();
 }
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 33ae3a4d0159..4b20f7304b9c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -568,6 +568,7 @@ struct kvm_vcpu_arch {
 		struct kvm_steal_time steal;
 	} st;
 
+	u64 tsc_offset;
 	u64 last_guest_tsc;
 	u64 last_host_tsc;
 	u64 tsc_offset_adjustment;
@@ -701,6 +702,8 @@ struct kvm_hv {
 	/* Hyper-v based guest crash (NT kernel bugcheck) parameters */
 	u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS];
 	u64 hv_crash_ctl;
+
+	HV_REFERENCE_TSC_PAGE tsc_ref;
 };
 
 struct kvm_arch {
@@ -781,54 +784,56 @@ struct kvm_arch {
 	bool disabled_lapic_found;
 
 	/* Struct members for AVIC */
+	u32 avic_vm_id;
 	u32 ldr_mode;
 	struct page *avic_logical_id_table_page;
 	struct page *avic_physical_id_table_page;
+	struct hlist_node hnode;
 
 	bool x2apic_format;
 	bool x2apic_broadcast_quirk_disabled;
 };
 
 struct kvm_vm_stat {
-	u32 mmu_shadow_zapped;
-	u32 mmu_pte_write;
-	u32 mmu_pte_updated;
-	u32 mmu_pde_zapped;
-	u32 mmu_flooded;
-	u32 mmu_recycled;
-	u32 mmu_cache_miss;
-	u32 mmu_unsync;
-	u32 remote_tlb_flush;
-	u32 lpages;
+	ulong mmu_shadow_zapped;
+	ulong mmu_pte_write;
+	ulong mmu_pte_updated;
+	ulong mmu_pde_zapped;
+	ulong mmu_flooded;
+	ulong mmu_recycled;
+	ulong mmu_cache_miss;
+	ulong mmu_unsync;
+	ulong remote_tlb_flush;
+	ulong lpages;
 };
 
 struct kvm_vcpu_stat {
-	u32 pf_fixed;
-	u32 pf_guest;
-	u32 tlb_flush;
-	u32 invlpg;
-
-	u32 exits;
-	u32 io_exits;
-	u32 mmio_exits;
-	u32 signal_exits;
-	u32 irq_window_exits;
-	u32 nmi_window_exits;
-	u32 halt_exits;
-	u32 halt_successful_poll;
-	u32 halt_attempted_poll;
-	u32 halt_poll_invalid;
-	u32 halt_wakeup;
-	u32 request_irq_exits;
-	u32 irq_exits;
-	u32 host_state_reload;
-	u32 efer_reload;
-	u32 fpu_reload;
-	u32 insn_emulation;
-	u32 insn_emulation_fail;
-	u32 hypercalls;
-	u32 irq_injections;
-	u32 nmi_injections;
+	u64 pf_fixed;
+	u64 pf_guest;
+	u64 tlb_flush;
+	u64 invlpg;
+
+	u64 exits;
+	u64 io_exits;
+	u64 mmio_exits;
+	u64 signal_exits;
+	u64 irq_window_exits;
+	u64 nmi_window_exits;
+	u64 halt_exits;
+	u64 halt_successful_poll;
+	u64 halt_attempted_poll;
+	u64 halt_poll_invalid;
+	u64 halt_wakeup;
+	u64 request_irq_exits;
+	u64 irq_exits;
+	u64 host_state_reload;
+	u64 efer_reload;
+	u64 fpu_reload;
+	u64 insn_emulation;
+	u64 insn_emulation_fail;
+	u64 hypercalls;
+	u64 irq_injections;
+	u64 nmi_injections;
 };
 
 struct x86_instruction_info;
@@ -951,7 +956,6 @@ struct kvm_x86_ops {
 
 	bool (*has_wbinvd_exit)(void);
 
-	u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu);
 	void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
 
 	u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc);
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 9ab7507ca1c2..1411dbed5e5e 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -23,6 +23,9 @@ struct pci_sysdata {
 #ifdef CONFIG_PCI_MSI_IRQ_DOMAIN
 	void		*fwnode;	/* IRQ domain for MSI assignment */
 #endif
+#if IS_ENABLED(CONFIG_VMD)
+	bool vmd_domain;		/* True if in Intel VMD domain */
+#endif
 };
 
 extern int pci_routeirq;
@@ -56,6 +59,17 @@ static inline void *_pci_root_bus_fwnode(struct pci_bus *bus)
 #define pci_root_bus_fwnode	_pci_root_bus_fwnode
 #endif
 
+static inline bool is_vmd(struct pci_bus *bus)
+{
+#if IS_ENABLED(CONFIG_VMD)
+	struct pci_sysdata *sd = bus->sysdata;
+
+	return sd->vmd_domain;
+#else
+	return false;
+#endif
+}
+
 /* Can be used to override the logic in pci_scan_bus for skipping
    already-configured bus numbers - to be used for buggy BIOSes
    or architectures with incomplete PCI setup by the loader */
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index f1218f512f62..8b4de22d6429 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -439,8 +439,6 @@ extern pgprot_t pgprot_writethrough(pgprot_t prot);
 struct file;
 pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
                               unsigned long size, pgprot_t vma_prot);
-int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
-                              unsigned long size, pgprot_t *vma_prot);
 
 /* Install a pte for a particular vaddr in kernel space. */
 void set_pte_vaddr(unsigned long vaddr, pte_t pte);
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index d019f0cc80ec..3ad741b84072 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -87,9 +87,10 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
 }
 
 static __always_inline
-cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src)
+cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
+			      u64 tsc)
 {
-	u64 delta = rdtsc_ordered() - src->tsc_timestamp;
+	u64 delta = tsc - src->tsc_timestamp;
 	cycle_t offset = pvclock_scale_delta(delta, src->tsc_to_system_mul,
 					     src->tsc_shift);
 	return src->system_time + offset;
diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h
index e6911caf5bbf..608a79d5a466 100644
--- a/arch/x86/include/asm/xen/events.h
+++ b/arch/x86/include/asm/xen/events.h
@@ -20,15 +20,4 @@ static inline int xen_irqs_disabled(struct pt_regs *regs)
 /* No need for a barrier -- XCHG is a barrier on x86. */
 #define xchg_xen_ulong(ptr, val) xchg((ptr), (val))
 
-extern int xen_have_vector_callback;
-
-/*
- * Events delivered via platform PCI interrupts are always
- * routed to vcpu 0 and hence cannot be rebound.
- */
-static inline bool xen_support_evtchn_rebind(void)
-{
-	return (!xen_hvm_domain() || xen_have_vector_callback);
-}
-
 #endif /* _ASM_X86_XEN_EVENTS_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 45257cf84370..4dd5d500eb60 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -83,6 +83,7 @@ obj-$(CONFIG_X86_MPPARSE)	+= mpparse.o
 obj-y				+= apic/
 obj-$(CONFIG_X86_REBOOTFIXUPS)	+= reboot_fixups_32.o
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
+obj-$(CONFIG_LIVEPATCH)	+= livepatch.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
 obj-$(CONFIG_FTRACE_SYSCALLS)	+= ftrace.o
 obj-$(CONFIG_X86_TSC)		+= trace_clock.o
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index bdfad642123f..af15f4444330 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -152,7 +152,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
 }
 EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
 
-void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
+void __cpuidle acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
 {
 	unsigned int cpu = smp_processor_id();
 	struct cstate_entry *percpu_entry;
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index f29501e1a5c1..c73c9fb281e1 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -26,32 +26,32 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh)
 }
 #endif
 
-#ifdef arch_trigger_all_cpu_backtrace
+#ifdef arch_trigger_cpumask_backtrace
 static void nmi_raise_cpu_backtrace(cpumask_t *mask)
 {
 	apic->send_IPI_mask(mask, NMI_VECTOR);
 }
 
-void arch_trigger_all_cpu_backtrace(bool include_self)
+void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self)
 {
-	nmi_trigger_all_cpu_backtrace(include_self, nmi_raise_cpu_backtrace);
+	nmi_trigger_cpumask_backtrace(mask, exclude_self,
+				      nmi_raise_cpu_backtrace);
 }
 
-static int
-arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
+static int nmi_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
 {
 	if (nmi_cpu_backtrace(regs))
 		return NMI_HANDLED;
 
 	return NMI_DONE;
 }
-NOKPROBE_SYMBOL(arch_trigger_all_cpu_backtrace_handler);
+NOKPROBE_SYMBOL(nmi_cpu_backtrace_handler);
 
-static int __init register_trigger_all_cpu_backtrace(void)
+static int __init register_nmi_cpu_backtrace_handler(void)
 {
-	register_nmi_handler(NMI_LOCAL, arch_trigger_all_cpu_backtrace_handler,
+	register_nmi_handler(NMI_LOCAL, nmi_cpu_backtrace_handler,
 				0, "arch_bt");
 	return 0;
 }
-early_initcall(register_trigger_all_cpu_backtrace);
+early_initcall(register_nmi_cpu_backtrace_handler);
 #endif
diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c
new file mode 100644
index 000000000000..e9d252d873aa
--- /dev/null
+++ b/arch/x86/kernel/livepatch.c
@@ -0,0 +1,65 @@
+/*
+ * livepatch.c - x86-specific Kernel Live Patching Core
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/livepatch.h>
+#include <asm/text-patching.h>
+
+/* Apply per-object alternatives. Based on x86 module_finalize() */
+void arch_klp_init_object_loaded(struct klp_patch *patch,
+				 struct klp_object *obj)
+{
+	int cnt;
+	struct klp_modinfo *info;
+	Elf_Shdr *s, *alt = NULL, *para = NULL;
+	void *aseg, *pseg;
+	const char *objname;
+	char sec_objname[MODULE_NAME_LEN];
+	char secname[KSYM_NAME_LEN];
+
+	info = patch->mod->klp_info;
+	objname = obj->name ? obj->name : "vmlinux";
+
+	/* See livepatch core code for BUILD_BUG_ON() explanation */
+	BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 128);
+
+	for (s = info->sechdrs; s < info->sechdrs + info->hdr.e_shnum; s++) {
+		/* Apply per-object .klp.arch sections */
+		cnt = sscanf(info->secstrings + s->sh_name,
+			     ".klp.arch.%55[^.].%127s",
+			     sec_objname, secname);
+		if (cnt != 2)
+			continue;
+		if (strcmp(sec_objname, objname))
+			continue;
+		if (!strcmp(".altinstructions", secname))
+			alt = s;
+		if (!strcmp(".parainstructions", secname))
+			para = s;
+	}
+
+	if (alt) {
+		aseg = (void *) alt->sh_addr;
+		apply_alternatives(aseg, aseg + alt->sh_size);
+	}
+
+	if (para) {
+		pseg = (void *) para->sh_addr;
+		apply_paravirt(pseg, pseg + para->sh_size);
+	}
+}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 4002b475171c..28cea7802ecb 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -302,7 +302,7 @@ void arch_cpu_idle(void)
 /*
  * We use this if we don't have any better idle routine..
  */
-void default_idle(void)
+void __cpuidle default_idle(void)
 {
 	trace_cpu_idle_rcuidle(1, smp_processor_id());
 	safe_halt();
@@ -417,7 +417,7 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
  * with interrupts enabled and no flags, which is backwards compatible with the
  * original MWAIT implementation.
  */
-static void mwait_idle(void)
+static __cpuidle void mwait_idle(void)
 {
 	if (!current_set_polling_and_test()) {
 		trace_cpu_idle_rcuidle(1, smp_processor_id());
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 3599404e3089..5b2cc889ce34 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -80,7 +80,7 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
 
 	do {
 		version = pvclock_read_begin(src);
-		ret = __pvclock_read_cycles(src);
+		ret = __pvclock_read_cycles(src, rdtsc_ordered());
 		flags = src->flags;
 	} while (pvclock_read_retry(src, version));
 
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 9297a002d8e5..dbf67f64d5ec 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -97,6 +97,7 @@ SECTIONS
 		_stext = .;
 		TEXT_TEXT
 		SCHED_TEXT
+		CPUIDLE_TEXT
 		LOCK_TEXT
 		KPROBES_TEXT
 		ENTRY_TEXT
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 464fa477afbf..3bff20710471 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -13,7 +13,7 @@ kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
 
 kvm-y			+= x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
 			   i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
-			   hyperv.o page_track.o
+			   hyperv.o page_track.o debugfs.o
 
 kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT)	+= assigned-dev.o iommu.o
 
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 3235e0fe7792..afa7bbb596cd 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -366,7 +366,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
 		F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
 		F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
-		F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB);
+		F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
+		F(AVX512BW) | F(AVX512VL);
 
 	/* cpuid 0xD.1.eax */
 	const u32 kvm_cpuid_D_1_eax_x86_features =
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
new file mode 100644
index 000000000000..c19c7ede9bd6
--- /dev/null
+++ b/arch/x86/kvm/debugfs.c
@@ -0,0 +1,69 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * Copyright 2016 Red Hat, Inc. and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+#include <linux/kvm_host.h>
+#include <linux/debugfs.h>
+
+bool kvm_arch_has_vcpu_debugfs(void)
+{
+	return true;
+}
+
+static int vcpu_get_tsc_offset(void *data, u64 *val)
+{
+	struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+	*val = vcpu->arch.tsc_offset;
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_offset_fops, vcpu_get_tsc_offset, NULL, "%lld\n");
+
+static int vcpu_get_tsc_scaling_ratio(void *data, u64 *val)
+{
+	struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+	*val = vcpu->arch.tsc_scaling_ratio;
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_fops, vcpu_get_tsc_scaling_ratio, NULL, "%llu\n");
+
+static int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val)
+{
+	*val = kvm_tsc_scaling_ratio_frac_bits;
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_frac_fops, vcpu_get_tsc_scaling_frac_bits, NULL, "%llu\n");
+
+int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+	struct dentry *ret;
+
+	ret = debugfs_create_file("tsc-offset", 0444,
+							vcpu->debugfs_dentry,
+							vcpu, &vcpu_tsc_offset_fops);
+	if (!ret)
+		return -ENOMEM;
+
+	if (kvm_has_tsc_control) {
+		ret = debugfs_create_file("tsc-scaling-ratio", 0444,
+							vcpu->debugfs_dentry,
+							vcpu, &vcpu_tsc_scaling_fops);
+		if (!ret)
+			return -ENOMEM;
+		ret = debugfs_create_file("tsc-scaling-ratio-frac-bits", 0444,
+							vcpu->debugfs_dentry,
+							vcpu, &vcpu_tsc_scaling_frac_fops);
+		if (!ret)
+			return -ENOMEM;
+
+	}
+
+	return 0;
+}
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 01bd7b7a6866..42b1c83741c8 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -386,7 +386,21 @@ static void synic_init(struct kvm_vcpu_hv_synic *synic)
 
 static u64 get_time_ref_counter(struct kvm *kvm)
 {
-	return div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100);
+	struct kvm_hv *hv = &kvm->arch.hyperv;
+	struct kvm_vcpu *vcpu;
+	u64 tsc;
+
+	/*
+	 * The guest has not set up the TSC page or the clock isn't
+	 * stable, fall back to get_kvmclock_ns.
+	 */
+	if (!hv->tsc_ref.tsc_sequence)
+		return div_u64(get_kvmclock_ns(kvm), 100);
+
+	vcpu = kvm_get_vcpu(kvm, 0);
+	tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+	return mul_u64_u64_shr(tsc, hv->tsc_ref.tsc_scale, 64)
+		+ hv->tsc_ref.tsc_offset;
 }
 
 static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
@@ -756,6 +770,129 @@ static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+/*
+ * The kvmclock and Hyper-V TSC page use similar formulas, and converting
+ * between them is possible:
+ *
+ * kvmclock formula:
+ *    nsec = (ticks - tsc_timestamp) * tsc_to_system_mul * 2^(tsc_shift-32)
+ *           + system_time
+ *
+ * Hyper-V formula:
+ *    nsec/100 = ticks * scale / 2^64 + offset
+ *
+ * When tsc_timestamp = system_time = 0, offset is zero in the Hyper-V formula.
+ * By dividing the kvmclock formula by 100 and equating what's left we get:
+ *    ticks * scale / 2^64 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *            scale / 2^64 =         tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *            scale        =         tsc_to_system_mul * 2^(32+tsc_shift) / 100
+ *
+ * Now expand the kvmclock formula and divide by 100:
+ *    nsec = ticks * tsc_to_system_mul * 2^(tsc_shift-32)
+ *           - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32)
+ *           + system_time
+ *    nsec/100 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *               - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *               + system_time / 100
+ *
+ * Replace tsc_to_system_mul * 2^(tsc_shift-32) / 100 by scale / 2^64:
+ *    nsec/100 = ticks * scale / 2^64
+ *               - tsc_timestamp * scale / 2^64
+ *               + system_time / 100
+ *
+ * Equate with the Hyper-V formula so that ticks * scale / 2^64 cancels out:
+ *    offset = system_time / 100 - tsc_timestamp * scale / 2^64
+ *
+ * These two equivalencies are implemented in this function.
+ */
+static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock,
+					HV_REFERENCE_TSC_PAGE *tsc_ref)
+{
+	u64 max_mul;
+
+	if (!(hv_clock->flags & PVCLOCK_TSC_STABLE_BIT))
+		return false;
+
+	/*
+	 * check if scale would overflow, if so we use the time ref counter
+	 *    tsc_to_system_mul * 2^(tsc_shift+32) / 100 >= 2^64
+	 *    tsc_to_system_mul / 100 >= 2^(32-tsc_shift)
+	 *    tsc_to_system_mul >= 100 * 2^(32-tsc_shift)
+	 */
+	max_mul = 100ull << (32 - hv_clock->tsc_shift);
+	if (hv_clock->tsc_to_system_mul >= max_mul)
+		return false;
+
+	/*
+	 * Otherwise compute the scale and offset according to the formulas
+	 * derived above.
+	 */
+	tsc_ref->tsc_scale =
+		mul_u64_u32_div(1ULL << (32 + hv_clock->tsc_shift),
+				hv_clock->tsc_to_system_mul,
+				100);
+
+	tsc_ref->tsc_offset = hv_clock->system_time;
+	do_div(tsc_ref->tsc_offset, 100);
+	tsc_ref->tsc_offset -=
+		mul_u64_u64_shr(hv_clock->tsc_timestamp, tsc_ref->tsc_scale, 64);
+	return true;
+}
+
+void kvm_hv_setup_tsc_page(struct kvm *kvm,
+			   struct pvclock_vcpu_time_info *hv_clock)
+{
+	struct kvm_hv *hv = &kvm->arch.hyperv;
+	u32 tsc_seq;
+	u64 gfn;
+
+	BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence));
+	BUILD_BUG_ON(offsetof(HV_REFERENCE_TSC_PAGE, tsc_sequence) != 0);
+
+	if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
+		return;
+
+	gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
+	/*
+	 * Because the TSC parameters only vary when there is a
+	 * change in the master clock, do not bother with caching.
+	 */
+	if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn),
+				    &tsc_seq, sizeof(tsc_seq))))
+		return;
+
+	/*
+	 * While we're computing and writing the parameters, force the
+	 * guest to use the time reference count MSR.
+	 */
+	hv->tsc_ref.tsc_sequence = 0;
+	if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
+			    &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
+		return;
+
+	if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref))
+		return;
+
+	/* Ensure sequence is zero before writing the rest of the struct.  */
+	smp_wmb();
+	if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref)))
+		return;
+
+	/*
+	 * Now switch to the TSC page mechanism by writing the sequence.
+	 */
+	tsc_seq++;
+	if (tsc_seq == 0xFFFFFFFF || tsc_seq == 0)
+		tsc_seq = 1;
+
+	/* Write the struct entirely before the non-zero sequence.  */
+	smp_wmb();
+
+	hv->tsc_ref.tsc_sequence = tsc_seq;
+	kvm_write_guest(kvm, gfn_to_gpa(gfn),
+			&hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence));
+}
+
 static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
 			     bool host)
 {
@@ -793,23 +930,11 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
 		mark_page_dirty(kvm, gfn);
 		break;
 	}
-	case HV_X64_MSR_REFERENCE_TSC: {
-		u64 gfn;
-		HV_REFERENCE_TSC_PAGE tsc_ref;
-
-		memset(&tsc_ref, 0, sizeof(tsc_ref));
+	case HV_X64_MSR_REFERENCE_TSC:
 		hv->hv_tsc_page = data;
-		if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE))
-			break;
-		gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
-		if (kvm_write_guest(
-				kvm,
-				gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT,
-				&tsc_ref, sizeof(tsc_ref)))
-			return 1;
-		mark_page_dirty(kvm, gfn);
+		if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)
+			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
 		break;
-	}
 	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
 		return kvm_hv_msr_set_crash_data(vcpu,
 						 msr - HV_X64_MSR_CRASH_P0,
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 60eccd4bd1d3..cd1119538add 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -84,4 +84,7 @@ static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu)
 
 void kvm_hv_process_stimers(struct kvm_vcpu *vcpu);
 
+void kvm_hv_setup_tsc_page(struct kvm *kvm,
+			   struct pvclock_vcpu_time_info *hv_clock);
+
 #endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index b62c85229711..23b99f305382 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1761,9 +1761,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
 		if (value & MSR_IA32_APICBASE_ENABLE) {
 			kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
 			static_key_slow_dec_deferred(&apic_hw_disabled);
-		} else
+		} else {
 			static_key_slow_inc(&apic_hw_disabled.key);
-		recalculate_apic_map(vcpu->kvm);
+			recalculate_apic_map(vcpu->kvm);
+		}
 	}
 
 	if ((old_value ^ value) & X2APIC_ENABLE) {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3d4cc8cc56a3..d9c7e986b4e4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1207,7 +1207,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
  *
  * Return true if tlb need be flushed.
  */
-static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect)
+static bool spte_write_protect(u64 *sptep, bool pt_protect)
 {
 	u64 spte = *sptep;
 
@@ -1233,12 +1233,12 @@ static bool __rmap_write_protect(struct kvm *kvm,
 	bool flush = false;
 
 	for_each_rmap_spte(rmap_head, &iter, sptep)
-		flush |= spte_write_protect(kvm, sptep, pt_protect);
+		flush |= spte_write_protect(sptep, pt_protect);
 
 	return flush;
 }
 
-static bool spte_clear_dirty(struct kvm *kvm, u64 *sptep)
+static bool spte_clear_dirty(u64 *sptep)
 {
 	u64 spte = *sptep;
 
@@ -1256,12 +1256,12 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
 	bool flush = false;
 
 	for_each_rmap_spte(rmap_head, &iter, sptep)
-		flush |= spte_clear_dirty(kvm, sptep);
+		flush |= spte_clear_dirty(sptep);
 
 	return flush;
 }
 
-static bool spte_set_dirty(struct kvm *kvm, u64 *sptep)
+static bool spte_set_dirty(u64 *sptep)
 {
 	u64 spte = *sptep;
 
@@ -1279,7 +1279,7 @@ static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
 	bool flush = false;
 
 	for_each_rmap_spte(rmap_head, &iter, sptep)
-		flush |= spte_set_dirty(kvm, sptep);
+		flush |= spte_set_dirty(sptep);
 
 	return flush;
 }
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1e6b84b96ea6..f8157a36ab09 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -34,6 +34,8 @@
 #include <linux/sched.h>
 #include <linux/trace_events.h>
 #include <linux/slab.h>
+#include <linux/amd-iommu.h>
+#include <linux/hashtable.h>
 
 #include <asm/apic.h>
 #include <asm/perf_event.h>
@@ -41,6 +43,7 @@
 #include <asm/desc.h>
 #include <asm/debugreg.h>
 #include <asm/kvm_para.h>
+#include <asm/irq_remapping.h>
 
 #include <asm/virtext.h>
 #include "trace.h"
@@ -96,6 +99,19 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
 #define AVIC_UNACCEL_ACCESS_OFFSET_MASK		0xFF0
 #define AVIC_UNACCEL_ACCESS_VECTOR_MASK		0xFFFFFFFF
 
+/* AVIC GATAG is encoded using VM and VCPU IDs */
+#define AVIC_VCPU_ID_BITS		8
+#define AVIC_VCPU_ID_MASK		((1 << AVIC_VCPU_ID_BITS) - 1)
+
+#define AVIC_VM_ID_BITS			24
+#define AVIC_VM_ID_NR			(1 << AVIC_VM_ID_BITS)
+#define AVIC_VM_ID_MASK			((1 << AVIC_VM_ID_BITS) - 1)
+
+#define AVIC_GATAG(x, y)		(((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \
+						(y & AVIC_VCPU_ID_MASK))
+#define AVIC_GATAG_TO_VMID(x)		((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK)
+#define AVIC_GATAG_TO_VCPUID(x)		(x & AVIC_VCPU_ID_MASK)
+
 static bool erratum_383_found __read_mostly;
 
 static const u32 host_save_user_msrs[] = {
@@ -185,6 +201,23 @@ struct vcpu_svm {
 	struct page *avic_backing_page;
 	u64 *avic_physical_id_cache;
 	bool avic_is_running;
+
+	/*
+	 * Per-vcpu list of struct amd_svm_iommu_ir:
+	 * This is used mainly to store interrupt remapping information used
+	 * when update the vcpu affinity. This avoids the need to scan for
+	 * IRTE and try to match ga_tag in the IOMMU driver.
+	 */
+	struct list_head ir_list;
+	spinlock_t ir_list_lock;
+};
+
+/*
+ * This is a wrapper of struct amd_iommu_ir_data.
+ */
+struct amd_svm_iommu_ir {
+	struct list_head node;	/* Used by SVM for per-vcpu ir_list */
+	void *data;		/* Storing pointer to struct amd_ir_data */
 };
 
 #define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK	(0xFF)
@@ -242,6 +275,10 @@ static int avic;
 module_param(avic, int, S_IRUGO);
 #endif
 
+/* AVIC VM ID bit masks and lock */
+static DECLARE_BITMAP(avic_vm_id_bitmap, AVIC_VM_ID_NR);
+static DEFINE_SPINLOCK(avic_vm_id_lock);
+
 static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
 static void svm_flush_tlb(struct kvm_vcpu *vcpu);
 static void svm_complete_interrupts(struct vcpu_svm *svm);
@@ -928,6 +965,55 @@ static void svm_disable_lbrv(struct vcpu_svm *svm)
 	set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
 }
 
+/* Note:
+ * This hash table is used to map VM_ID to a struct kvm_arch,
+ * when handling AMD IOMMU GALOG notification to schedule in
+ * a particular vCPU.
+ */
+#define SVM_VM_DATA_HASH_BITS	8
+DECLARE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
+static spinlock_t svm_vm_data_hash_lock;
+
+/* Note:
+ * This function is called from IOMMU driver to notify
+ * SVM to schedule in a particular vCPU of a particular VM.
+ */
+static int avic_ga_log_notifier(u32 ga_tag)
+{
+	unsigned long flags;
+	struct kvm_arch *ka = NULL;
+	struct kvm_vcpu *vcpu = NULL;
+	u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
+	u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
+
+	pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
+
+	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+	hash_for_each_possible(svm_vm_data_hash, ka, hnode, vm_id) {
+		struct kvm *kvm = container_of(ka, struct kvm, arch);
+		struct kvm_arch *vm_data = &kvm->arch;
+
+		if (vm_data->avic_vm_id != vm_id)
+			continue;
+		vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
+		break;
+	}
+	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+
+	if (!vcpu)
+		return 0;
+
+	/* Note:
+	 * At this point, the IOMMU should have already set the pending
+	 * bit in the vAPIC backing page. So, we just need to schedule
+	 * in the vcpu.
+	 */
+	if (vcpu->mode == OUTSIDE_GUEST_MODE)
+		kvm_vcpu_wake_up(vcpu);
+
+	return 0;
+}
+
 static __init int svm_hardware_setup(void)
 {
 	int cpu;
@@ -986,10 +1072,15 @@ static __init int svm_hardware_setup(void)
 	if (avic) {
 		if (!npt_enabled ||
 		    !boot_cpu_has(X86_FEATURE_AVIC) ||
-		    !IS_ENABLED(CONFIG_X86_LOCAL_APIC))
+		    !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) {
 			avic = false;
-		else
+		} else {
 			pr_info("AVIC enabled\n");
+
+			hash_init(svm_vm_data_hash);
+			spin_lock_init(&svm_vm_data_hash_lock);
+			amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
+		}
 	}
 
 	return 0;
@@ -1028,13 +1119,6 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
 	seg->base = 0;
 }
 
-static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-
-	return svm->vmcb->control.tsc_offset;
-}
-
 static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -1280,19 +1364,55 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+static inline int avic_get_next_vm_id(void)
+{
+	int id;
+
+	spin_lock(&avic_vm_id_lock);
+
+	/* AVIC VM ID is one-based. */
+	id = find_next_zero_bit(avic_vm_id_bitmap, AVIC_VM_ID_NR, 1);
+	if (id <= AVIC_VM_ID_MASK)
+		__set_bit(id, avic_vm_id_bitmap);
+	else
+		id = -EAGAIN;
+
+	spin_unlock(&avic_vm_id_lock);
+	return id;
+}
+
+static inline int avic_free_vm_id(int id)
+{
+	if (id <= 0 || id > AVIC_VM_ID_MASK)
+		return -EINVAL;
+
+	spin_lock(&avic_vm_id_lock);
+	__clear_bit(id, avic_vm_id_bitmap);
+	spin_unlock(&avic_vm_id_lock);
+	return 0;
+}
+
 static void avic_vm_destroy(struct kvm *kvm)
 {
+	unsigned long flags;
 	struct kvm_arch *vm_data = &kvm->arch;
 
+	avic_free_vm_id(vm_data->avic_vm_id);
+
 	if (vm_data->avic_logical_id_table_page)
 		__free_page(vm_data->avic_logical_id_table_page);
 	if (vm_data->avic_physical_id_table_page)
 		__free_page(vm_data->avic_physical_id_table_page);
+
+	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+	hash_del(&vm_data->hnode);
+	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
 }
 
 static int avic_vm_init(struct kvm *kvm)
 {
-	int err = -ENOMEM;
+	unsigned long flags;
+	int vm_id, err = -ENOMEM;
 	struct kvm_arch *vm_data = &kvm->arch;
 	struct page *p_page;
 	struct page *l_page;
@@ -1300,6 +1420,11 @@ static int avic_vm_init(struct kvm *kvm)
 	if (!avic)
 		return 0;
 
+	vm_id = avic_get_next_vm_id();
+	if (vm_id < 0)
+		return vm_id;
+	vm_data->avic_vm_id = (u32)vm_id;
+
 	/* Allocating physical APIC ID table (4KB) */
 	p_page = alloc_page(GFP_KERNEL);
 	if (!p_page)
@@ -1316,6 +1441,10 @@ static int avic_vm_init(struct kvm *kvm)
 	vm_data->avic_logical_id_table_page = l_page;
 	clear_page(page_address(l_page));
 
+	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+	hash_add(svm_vm_data_hash, &vm_data->hnode, vm_data->avic_vm_id);
+	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+
 	return 0;
 
 free_avic:
@@ -1323,31 +1452,34 @@ free_avic:
 	return err;
 }
 
-/**
- * This function is called during VCPU halt/unhalt.
- */
-static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
+static inline int
+avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
 {
-	u64 entry;
-	int h_physical_id = kvm_cpu_get_apicid(vcpu->cpu);
+	int ret = 0;
+	unsigned long flags;
+	struct amd_svm_iommu_ir *ir;
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	if (!kvm_vcpu_apicv_active(vcpu))
-		return;
-
-	svm->avic_is_running = is_run;
+	if (!kvm_arch_has_assigned_device(vcpu->kvm))
+		return 0;
 
-	/* ID = 0xff (broadcast), ID > 0xff (reserved) */
-	if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT))
-		return;
+	/*
+	 * Here, we go through the per-vcpu ir_list to update all existing
+	 * interrupt remapping table entry targeting this vcpu.
+	 */
+	spin_lock_irqsave(&svm->ir_list_lock, flags);
 
-	entry = READ_ONCE(*(svm->avic_physical_id_cache));
-	WARN_ON(is_run == !!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK));
+	if (list_empty(&svm->ir_list))
+		goto out;
 
-	entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
-	if (is_run)
-		entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
-	WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+	list_for_each_entry(ir, &svm->ir_list, node) {
+		ret = amd_iommu_update_ga(cpu, r, ir->data);
+		if (ret)
+			break;
+	}
+out:
+	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+	return ret;
 }
 
 static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -1374,6 +1506,8 @@ static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
 
 	WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+	avic_update_iommu_vcpu_affinity(vcpu, h_physical_id,
+					svm->avic_is_running);
 }
 
 static void avic_vcpu_put(struct kvm_vcpu *vcpu)
@@ -1385,10 +1519,27 @@ static void avic_vcpu_put(struct kvm_vcpu *vcpu)
 		return;
 
 	entry = READ_ONCE(*(svm->avic_physical_id_cache));
+	if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
+		avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
+
 	entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
 	WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
 }
 
+/**
+ * This function is called during VCPU halt/unhalt.
+ */
+static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	svm->avic_is_running = is_run;
+	if (is_run)
+		avic_vcpu_load(vcpu, vcpu->cpu);
+	else
+		avic_vcpu_put(vcpu);
+}
+
 static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -1450,6 +1601,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 		err = avic_init_backing_page(&svm->vcpu);
 		if (err)
 			goto free_page4;
+
+		INIT_LIST_HEAD(&svm->ir_list);
+		spin_lock_init(&svm->ir_list_lock);
 	}
 
 	/* We initialize this flag to true to make sure that the is_running
@@ -4246,6 +4400,209 @@ static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
 		kvm_vcpu_wake_up(vcpu);
 }
 
+static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+{
+	unsigned long flags;
+	struct amd_svm_iommu_ir *cur;
+
+	spin_lock_irqsave(&svm->ir_list_lock, flags);
+	list_for_each_entry(cur, &svm->ir_list, node) {
+		if (cur->data != pi->ir_data)
+			continue;
+		list_del(&cur->node);
+		kfree(cur);
+		break;
+	}
+	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+}
+
+static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+{
+	int ret = 0;
+	unsigned long flags;
+	struct amd_svm_iommu_ir *ir;
+
+	/**
+	 * In some cases, the existing irte is updaed and re-set,
+	 * so we need to check here if it's already been * added
+	 * to the ir_list.
+	 */
+	if (pi->ir_data && (pi->prev_ga_tag != 0)) {
+		struct kvm *kvm = svm->vcpu.kvm;
+		u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
+		struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
+		struct vcpu_svm *prev_svm;
+
+		if (!prev_vcpu) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		prev_svm = to_svm(prev_vcpu);
+		svm_ir_list_del(prev_svm, pi);
+	}
+
+	/**
+	 * Allocating new amd_iommu_pi_data, which will get
+	 * add to the per-vcpu ir_list.
+	 */
+	ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL);
+	if (!ir) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	ir->data = pi->ir_data;
+
+	spin_lock_irqsave(&svm->ir_list_lock, flags);
+	list_add(&ir->node, &svm->ir_list);
+	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+out:
+	return ret;
+}
+
+/**
+ * Note:
+ * The HW cannot support posting multicast/broadcast
+ * interrupts to a vCPU. So, we still use legacy interrupt
+ * remapping for these kind of interrupts.
+ *
+ * For lowest-priority interrupts, we only support
+ * those with single CPU as the destination, e.g. user
+ * configures the interrupts via /proc/irq or uses
+ * irqbalance to make the interrupts single-CPU.
+ */
+static int
+get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
+		 struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
+{
+	struct kvm_lapic_irq irq;
+	struct kvm_vcpu *vcpu = NULL;
+
+	kvm_set_msi_irq(kvm, e, &irq);
+
+	if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
+		pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
+			 __func__, irq.vector);
+		return -1;
+	}
+
+	pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
+		 irq.vector);
+	*svm = to_svm(vcpu);
+	vcpu_info->pi_desc_addr = page_to_phys((*svm)->avic_backing_page);
+	vcpu_info->vector = irq.vector;
+
+	return 0;
+}
+
+/*
+ * svm_update_pi_irte - set IRTE for Posted-Interrupts
+ *
+ * @kvm: kvm
+ * @host_irq: host irq of the interrupt
+ * @guest_irq: gsi of the interrupt
+ * @set: set or unset PI
+ * returns 0 on success, < 0 on failure
+ */
+static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
+			      uint32_t guest_irq, bool set)
+{
+	struct kvm_kernel_irq_routing_entry *e;
+	struct kvm_irq_routing_table *irq_rt;
+	int idx, ret = -EINVAL;
+
+	if (!kvm_arch_has_assigned_device(kvm) ||
+	    !irq_remapping_cap(IRQ_POSTING_CAP))
+		return 0;
+
+	pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
+		 __func__, host_irq, guest_irq, set);
+
+	idx = srcu_read_lock(&kvm->irq_srcu);
+	irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+	WARN_ON(guest_irq >= irq_rt->nr_rt_entries);
+
+	hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
+		struct vcpu_data vcpu_info;
+		struct vcpu_svm *svm = NULL;
+
+		if (e->type != KVM_IRQ_ROUTING_MSI)
+			continue;
+
+		/**
+		 * Here, we setup with legacy mode in the following cases:
+		 * 1. When cannot target interrupt to a specific vcpu.
+		 * 2. Unsetting posted interrupt.
+		 * 3. APIC virtialization is disabled for the vcpu.
+		 */
+		if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
+		    kvm_vcpu_apicv_active(&svm->vcpu)) {
+			struct amd_iommu_pi_data pi;
+
+			/* Try to enable guest_mode in IRTE */
+			pi.base = page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK;
+			pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id,
+						     svm->vcpu.vcpu_id);
+			pi.is_guest_mode = true;
+			pi.vcpu_data = &vcpu_info;
+			ret = irq_set_vcpu_affinity(host_irq, &pi);
+
+			/**
+			 * Here, we successfully setting up vcpu affinity in
+			 * IOMMU guest mode. Now, we need to store the posted
+			 * interrupt information in a per-vcpu ir_list so that
+			 * we can reference to them directly when we update vcpu
+			 * scheduling information in IOMMU irte.
+			 */
+			if (!ret && pi.is_guest_mode)
+				svm_ir_list_add(svm, &pi);
+		} else {
+			/* Use legacy mode in IRTE */
+			struct amd_iommu_pi_data pi;
+
+			/**
+			 * Here, pi is used to:
+			 * - Tell IOMMU to use legacy mode for this interrupt.
+			 * - Retrieve ga_tag of prior interrupt remapping data.
+			 */
+			pi.is_guest_mode = false;
+			ret = irq_set_vcpu_affinity(host_irq, &pi);
+
+			/**
+			 * Check if the posted interrupt was previously
+			 * setup with the guest_mode by checking if the ga_tag
+			 * was cached. If so, we need to clean up the per-vcpu
+			 * ir_list.
+			 */
+			if (!ret && pi.prev_ga_tag) {
+				int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
+				struct kvm_vcpu *vcpu;
+
+				vcpu = kvm_get_vcpu_by_id(kvm, id);
+				if (vcpu)
+					svm_ir_list_del(to_svm(vcpu), &pi);
+			}
+		}
+
+		if (!ret && svm) {
+			trace_kvm_pi_irte_update(svm->vcpu.vcpu_id,
+						 host_irq, e->gsi,
+						 vcpu_info.vector,
+						 vcpu_info.pi_desc_addr, set);
+		}
+
+		if (ret < 0) {
+			pr_err("%s: failed to update PI IRTE\n", __func__);
+			goto out;
+		}
+	}
+
+	ret = 0;
+out:
+	srcu_read_unlock(&kvm->irq_srcu, idx);
+	return ret;
+}
+
 static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -5064,7 +5421,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 
 	.has_wbinvd_exit = svm_has_wbinvd_exit,
 
-	.read_tsc_offset = svm_read_tsc_offset,
 	.write_tsc_offset = svm_write_tsc_offset,
 	.adjust_tsc_offset_guest = svm_adjust_tsc_offset_guest,
 	.read_l1_tsc = svm_read_l1_tsc,
@@ -5078,6 +5434,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 
 	.pmu_ops = &amd_pmu_ops,
 	.deliver_posted_interrupt = svm_deliver_avic_intr,
+	.update_pi_irte = svm_update_pi_irte,
 };
 
 static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 121fdf6e9ed0..cf1b16dbc98a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -927,6 +927,8 @@ static unsigned long *vmx_msr_bitmap_legacy;
 static unsigned long *vmx_msr_bitmap_longmode;
 static unsigned long *vmx_msr_bitmap_legacy_x2apic;
 static unsigned long *vmx_msr_bitmap_longmode_x2apic;
+static unsigned long *vmx_msr_bitmap_legacy_x2apic_apicv_inactive;
+static unsigned long *vmx_msr_bitmap_longmode_x2apic_apicv_inactive;
 static unsigned long *vmx_vmread_bitmap;
 static unsigned long *vmx_vmwrite_bitmap;
 
@@ -939,6 +941,7 @@ static DEFINE_SPINLOCK(vmx_vpid_lock);
 static struct vmcs_config {
 	int size;
 	int order;
+	u32 basic_cap;
 	u32 revision_id;
 	u32 pin_based_exec_ctrl;
 	u32 cpu_based_exec_ctrl;
@@ -1215,6 +1218,11 @@ static inline bool cpu_has_vmx_ple(void)
 		SECONDARY_EXEC_PAUSE_LOOP_EXITING;
 }
 
+static inline bool cpu_has_vmx_basic_inout(void)
+{
+	return	(((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT);
+}
+
 static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
 {
 	return flexpriority_enabled && lapic_in_kernel(vcpu);
@@ -2518,10 +2526,17 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
 	else if (cpu_has_secondary_exec_ctrls() &&
 		 (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
 		  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
-		if (is_long_mode(vcpu))
-			msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
-		else
-			msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
+		if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) {
+			if (is_long_mode(vcpu))
+				msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
+			else
+				msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
+		} else {
+			if (is_long_mode(vcpu))
+				msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv_inactive;
+			else
+				msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv_inactive;
+		}
 	} else {
 		if (is_long_mode(vcpu))
 			msr_bitmap = vmx_msr_bitmap_longmode;
@@ -2603,11 +2618,6 @@ static u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
 	return host_tsc + tsc_offset;
 }
 
-static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu)
-{
-	return vmcs_read64(TSC_OFFSET);
-}
-
 /*
  * writes 'offset' into guest's timestamp counter offset register
  */
@@ -2877,6 +2887,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 		*pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS |
 			   ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
 			   (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
+		if (cpu_has_vmx_basic_inout())
+			*pdata |= VMX_BASIC_INOUT;
 		break;
 	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
 	case MSR_IA32_VMX_PINBASED_CTLS:
@@ -3457,7 +3469,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 		return -EIO;
 
 	vmcs_conf->size = vmx_msr_high & 0x1fff;
-	vmcs_conf->order = get_order(vmcs_config.size);
+	vmcs_conf->order = get_order(vmcs_conf->size);
+	vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
 	vmcs_conf->revision_id = vmx_msr_low;
 
 	vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
@@ -4678,28 +4691,49 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
 						msr, MSR_TYPE_R | MSR_TYPE_W);
 }
 
-static void vmx_enable_intercept_msr_read_x2apic(u32 msr)
+static void vmx_enable_intercept_msr_read_x2apic(u32 msr, bool apicv_active)
 {
-	__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
-			msr, MSR_TYPE_R);
-	__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
-			msr, MSR_TYPE_R);
+	if (apicv_active) {
+		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+				msr, MSR_TYPE_R);
+		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+				msr, MSR_TYPE_R);
+	} else {
+		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+				msr, MSR_TYPE_R);
+		__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+				msr, MSR_TYPE_R);
+	}
 }
 
-static void vmx_disable_intercept_msr_read_x2apic(u32 msr)
+static void vmx_disable_intercept_msr_read_x2apic(u32 msr, bool apicv_active)
 {
-	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
-			msr, MSR_TYPE_R);
-	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
-			msr, MSR_TYPE_R);
+	if (apicv_active) {
+		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+				msr, MSR_TYPE_R);
+		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+				msr, MSR_TYPE_R);
+	} else {
+		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+				msr, MSR_TYPE_R);
+		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+				msr, MSR_TYPE_R);
+	}
 }
 
-static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
+static void vmx_disable_intercept_msr_write_x2apic(u32 msr, bool apicv_active)
 {
-	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
-			msr, MSR_TYPE_W);
-	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
-			msr, MSR_TYPE_W);
+	if (apicv_active) {
+		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+				msr, MSR_TYPE_W);
+		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+				msr, MSR_TYPE_W);
+	} else {
+		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+				msr, MSR_TYPE_W);
+		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+				msr, MSR_TYPE_W);
+	}
 }
 
 static bool vmx_get_enable_apicv(void)
@@ -5279,29 +5313,30 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-	if (is_guest_mode(vcpu))
-		return;
+	if (!is_guest_mode(vcpu)) {
+		if (!cpu_has_virtual_nmis()) {
+			/*
+			 * Tracking the NMI-blocked state in software is built upon
+			 * finding the next open IRQ window. This, in turn, depends on
+			 * well-behaving guests: They have to keep IRQs disabled at
+			 * least as long as the NMI handler runs. Otherwise we may
+			 * cause NMI nesting, maybe breaking the guest. But as this is
+			 * highly unlikely, we can live with the residual risk.
+			 */
+			vmx->soft_vnmi_blocked = 1;
+			vmx->vnmi_blocked_time = 0;
+		}
 
-	if (!cpu_has_virtual_nmis()) {
-		/*
-		 * Tracking the NMI-blocked state in software is built upon
-		 * finding the next open IRQ window. This, in turn, depends on
-		 * well-behaving guests: They have to keep IRQs disabled at
-		 * least as long as the NMI handler runs. Otherwise we may
-		 * cause NMI nesting, maybe breaking the guest. But as this is
-		 * highly unlikely, we can live with the residual risk.
-		 */
-		vmx->soft_vnmi_blocked = 1;
-		vmx->vnmi_blocked_time = 0;
+		++vcpu->stat.nmi_injections;
+		vmx->nmi_known_unmasked = false;
 	}
 
-	++vcpu->stat.nmi_injections;
-	vmx->nmi_known_unmasked = false;
 	if (vmx->rmode.vm86_active) {
 		if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
 			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 		return;
 	}
+
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
 			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
 }
@@ -6109,7 +6144,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 
 	gla_validity = (exit_qualification >> 7) & 0x3;
-	if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
+	if (gla_validity == 0x2) {
 		printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
 		printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
 			(long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
@@ -6360,22 +6395,32 @@ static __init int hardware_setup(void)
 	if (!vmx_msr_bitmap_legacy_x2apic)
 		goto out2;
 
+	vmx_msr_bitmap_legacy_x2apic_apicv_inactive =
+				(unsigned long *)__get_free_page(GFP_KERNEL);
+	if (!vmx_msr_bitmap_legacy_x2apic_apicv_inactive)
+		goto out3;
+
 	vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
 	if (!vmx_msr_bitmap_longmode)
-		goto out3;
+		goto out4;
 
 	vmx_msr_bitmap_longmode_x2apic =
 				(unsigned long *)__get_free_page(GFP_KERNEL);
 	if (!vmx_msr_bitmap_longmode_x2apic)
-		goto out4;
+		goto out5;
+
+	vmx_msr_bitmap_longmode_x2apic_apicv_inactive =
+				(unsigned long *)__get_free_page(GFP_KERNEL);
+	if (!vmx_msr_bitmap_longmode_x2apic_apicv_inactive)
+		goto out6;
 
 	vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
 	if (!vmx_vmread_bitmap)
-		goto out6;
+		goto out7;
 
 	vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
 	if (!vmx_vmwrite_bitmap)
-		goto out7;
+		goto out8;
 
 	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
 	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
@@ -6394,7 +6439,7 @@ static __init int hardware_setup(void)
 
 	if (setup_vmcs_config(&vmcs_config) < 0) {
 		r = -EIO;
-		goto out8;
+		goto out9;
 	}
 
 	if (boot_cpu_has(X86_FEATURE_NX))
@@ -6461,20 +6506,35 @@ static __init int hardware_setup(void)
 			vmx_msr_bitmap_legacy, PAGE_SIZE);
 	memcpy(vmx_msr_bitmap_longmode_x2apic,
 			vmx_msr_bitmap_longmode, PAGE_SIZE);
+	memcpy(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+			vmx_msr_bitmap_legacy, PAGE_SIZE);
+	memcpy(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+			vmx_msr_bitmap_longmode, PAGE_SIZE);
 
 	set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
 
+	/*
+	 * enable_apicv && kvm_vcpu_apicv_active()
+	 */
 	for (msr = 0x800; msr <= 0x8ff; msr++)
-		vmx_disable_intercept_msr_read_x2apic(msr);
+		vmx_disable_intercept_msr_read_x2apic(msr, true);
 
 	/* TMCCT */
-	vmx_enable_intercept_msr_read_x2apic(0x839);
+	vmx_enable_intercept_msr_read_x2apic(0x839, true);
 	/* TPR */
-	vmx_disable_intercept_msr_write_x2apic(0x808);
+	vmx_disable_intercept_msr_write_x2apic(0x808, true);
 	/* EOI */
-	vmx_disable_intercept_msr_write_x2apic(0x80b);
+	vmx_disable_intercept_msr_write_x2apic(0x80b, true);
 	/* SELF-IPI */
-	vmx_disable_intercept_msr_write_x2apic(0x83f);
+	vmx_disable_intercept_msr_write_x2apic(0x83f, true);
+
+	/*
+	 * (enable_apicv && !kvm_vcpu_apicv_active()) ||
+	 * 	!enable_apicv
+	 */
+	/* TPR */
+	vmx_disable_intercept_msr_read_x2apic(0x808, false);
+	vmx_disable_intercept_msr_write_x2apic(0x808, false);
 
 	if (enable_ept) {
 		kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
@@ -6521,14 +6581,18 @@ static __init int hardware_setup(void)
 
 	return alloc_kvm_area();
 
-out8:
+out9:
 	free_page((unsigned long)vmx_vmwrite_bitmap);
-out7:
+out8:
 	free_page((unsigned long)vmx_vmread_bitmap);
+out7:
+	free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive);
 out6:
 	free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
-out4:
+out5:
 	free_page((unsigned long)vmx_msr_bitmap_longmode);
+out4:
+	free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive);
 out3:
 	free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
 out2:
@@ -6544,7 +6608,9 @@ out:
 static __exit void hardware_unsetup(void)
 {
 	free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
+	free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive);
 	free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
+	free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive);
 	free_page((unsigned long)vmx_msr_bitmap_legacy);
 	free_page((unsigned long)vmx_msr_bitmap_longmode);
 	free_page((unsigned long)vmx_io_bitmap_b);
@@ -6726,7 +6792,7 @@ static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
 {
 	/* TODO: not to reset guest simply here. */
 	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
-	pr_warn("kvm: nested vmx abort, indicator %d\n", indicator);
+	pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
 }
 
 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
@@ -7013,7 +7079,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 	vmx->nested.vmcs02_num = 0;
 
 	hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
-		     HRTIMER_MODE_REL);
+		     HRTIMER_MODE_REL_PINNED);
 	vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
 
 	vmx->nested.vmxon = true;
@@ -8435,12 +8501,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
 		return;
 	}
 
-	/*
-	 * There is not point to enable virtualize x2apic without enable
-	 * apicv
-	 */
-	if (!cpu_has_vmx_virtualize_x2apic_mode() ||
-				!kvm_vcpu_apicv_active(vcpu))
+	if (!cpu_has_vmx_virtualize_x2apic_mode())
 		return;
 
 	if (!cpu_need_tpr_shadow(vcpu))
@@ -9598,7 +9659,7 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
 	maxphyaddr = cpuid_maxphyaddr(vcpu);
 	if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
 	    (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
-		pr_warn_ratelimited(
+		pr_debug_ratelimited(
 			"nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)",
 			addr_field, maxphyaddr, count, addr);
 		return -EINVAL;
@@ -9671,13 +9732,13 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
 	for (i = 0; i < count; i++) {
 		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
 					&e, sizeof(e))) {
-			pr_warn_ratelimited(
+			pr_debug_ratelimited(
 				"%s cannot read MSR entry (%u, 0x%08llx)\n",
 				__func__, i, gpa + i * sizeof(e));
 			goto fail;
 		}
 		if (nested_vmx_load_msr_check(vcpu, &e)) {
-			pr_warn_ratelimited(
+			pr_debug_ratelimited(
 				"%s check failed (%u, 0x%x, 0x%x)\n",
 				__func__, i, e.index, e.reserved);
 			goto fail;
@@ -9685,7 +9746,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
 		msr.index = e.index;
 		msr.data = e.value;
 		if (kvm_set_msr(vcpu, &msr)) {
-			pr_warn_ratelimited(
+			pr_debug_ratelimited(
 				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
 				__func__, i, e.index, e.value);
 			goto fail;
@@ -9706,13 +9767,13 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
 		if (kvm_vcpu_read_guest(vcpu,
 					gpa + i * sizeof(e),
 					&e, 2 * sizeof(u32))) {
-			pr_warn_ratelimited(
+			pr_debug_ratelimited(
 				"%s cannot read MSR entry (%u, 0x%08llx)\n",
 				__func__, i, gpa + i * sizeof(e));
 			return -EINVAL;
 		}
 		if (nested_vmx_store_msr_check(vcpu, &e)) {
-			pr_warn_ratelimited(
+			pr_debug_ratelimited(
 				"%s check failed (%u, 0x%x, 0x%x)\n",
 				__func__, i, e.index, e.reserved);
 			return -EINVAL;
@@ -9720,7 +9781,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
 		msr_info.host_initiated = false;
 		msr_info.index = e.index;
 		if (kvm_get_msr(vcpu, &msr_info)) {
-			pr_warn_ratelimited(
+			pr_debug_ratelimited(
 				"%s cannot read MSR (%u, 0x%x)\n",
 				__func__, i, e.index);
 			return -EINVAL;
@@ -9729,7 +9790,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
 					 gpa + i * sizeof(e) +
 					     offsetof(struct vmx_msr_entry, value),
 					 &msr_info.data, sizeof(msr_info.data))) {
-			pr_warn_ratelimited(
+			pr_debug_ratelimited(
 				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
 				__func__, i, e.index, msr_info.data);
 			return -EINVAL;
@@ -10500,6 +10561,9 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 		vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
 	}
 
+	if (nested_cpu_has_ept(vmcs12))
+		vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
+
 	if (nested_cpu_has_vid(vmcs12))
 		vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
 
@@ -10793,7 +10857,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 	 * We are now running in L2, mmu_notifier will force to reload the
 	 * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1.
 	 */
-	kvm_vcpu_reload_apic_access_page(vcpu);
+	kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
 
 	/*
 	 * Exiting from L2 to L1, we're now back to L1 which thinks it just
@@ -11274,7 +11338,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 
 	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
 
-	.read_tsc_offset = vmx_read_tsc_offset,
 	.write_tsc_offset = vmx_write_tsc_offset,
 	.adjust_tsc_offset_guest = vmx_adjust_tsc_offset_guest,
 	.read_l1_tsc = vmx_read_l1_tsc,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 699f8726539a..6c633de84dd7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1367,7 +1367,7 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
 
 static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
 {
-	u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu);
+	u64 curr_offset = vcpu->arch.tsc_offset;
 	vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
 }
 
@@ -1413,6 +1413,12 @@ u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
 }
 EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
 
+static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+{
+	kvm_x86_ops->write_tsc_offset(vcpu, offset);
+	vcpu->arch.tsc_offset = offset;
+}
+
 void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 {
 	struct kvm *kvm = vcpu->kvm;
@@ -1425,7 +1431,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 
 	raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
 	offset = kvm_compute_tsc_offset(vcpu, data);
-	ns = get_kernel_ns();
+	ns = ktime_get_boot_ns();
 	elapsed = ns - kvm->arch.last_tsc_nsec;
 
 	if (vcpu->arch.virtual_tsc_khz) {
@@ -1522,7 +1528,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 
 	if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated)
 		update_ia32_tsc_adjust_msr(vcpu, offset);
-	kvm_x86_ops->write_tsc_offset(vcpu, offset);
+	kvm_vcpu_write_tsc_offset(vcpu, offset);
 	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 
 	spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
@@ -1716,6 +1722,88 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
 #endif
 }
 
+static u64 __get_kvmclock_ns(struct kvm *kvm)
+{
+	struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, 0);
+	struct kvm_arch *ka = &kvm->arch;
+	s64 ns;
+
+	if (vcpu->arch.hv_clock.flags & PVCLOCK_TSC_STABLE_BIT) {
+		u64 tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+		ns = __pvclock_read_cycles(&vcpu->arch.hv_clock, tsc);
+	} else {
+		ns = ktime_get_boot_ns() + ka->kvmclock_offset;
+	}
+
+	return ns;
+}
+
+u64 get_kvmclock_ns(struct kvm *kvm)
+{
+	unsigned long flags;
+	s64 ns;
+
+	local_irq_save(flags);
+	ns = __get_kvmclock_ns(kvm);
+	local_irq_restore(flags);
+
+	return ns;
+}
+
+static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
+{
+	struct kvm_vcpu_arch *vcpu = &v->arch;
+	struct pvclock_vcpu_time_info guest_hv_clock;
+
+	if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
+		&guest_hv_clock, sizeof(guest_hv_clock))))
+		return;
+
+	/* This VCPU is paused, but it's legal for a guest to read another
+	 * VCPU's kvmclock, so we really have to follow the specification where
+	 * it says that version is odd if data is being modified, and even after
+	 * it is consistent.
+	 *
+	 * Version field updates must be kept separate.  This is because
+	 * kvm_write_guest_cached might use a "rep movs" instruction, and
+	 * writes within a string instruction are weakly ordered.  So there
+	 * are three writes overall.
+	 *
+	 * As a small optimization, only write the version field in the first
+	 * and third write.  The vcpu->pv_time cache is still valid, because the
+	 * version field is the first in the struct.
+	 */
+	BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
+
+	vcpu->hv_clock.version = guest_hv_clock.version + 1;
+	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+				&vcpu->hv_clock,
+				sizeof(vcpu->hv_clock.version));
+
+	smp_wmb();
+
+	/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
+	vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
+
+	if (vcpu->pvclock_set_guest_stopped_request) {
+		vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
+		vcpu->pvclock_set_guest_stopped_request = false;
+	}
+
+	trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
+
+	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+				&vcpu->hv_clock,
+				sizeof(vcpu->hv_clock));
+
+	smp_wmb();
+
+	vcpu->hv_clock.version++;
+	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+				&vcpu->hv_clock,
+				sizeof(vcpu->hv_clock.version));
+}
+
 static int kvm_guest_time_update(struct kvm_vcpu *v)
 {
 	unsigned long flags, tgt_tsc_khz;
@@ -1723,7 +1811,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	struct kvm_arch *ka = &v->kvm->arch;
 	s64 kernel_ns;
 	u64 tsc_timestamp, host_tsc;
-	struct pvclock_vcpu_time_info guest_hv_clock;
 	u8 pvclock_flags;
 	bool use_master_clock;
 
@@ -1752,7 +1839,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	}
 	if (!use_master_clock) {
 		host_tsc = rdtsc();
-		kernel_ns = get_kernel_ns();
+		kernel_ns = ktime_get_boot_ns();
 	}
 
 	tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
@@ -1777,8 +1864,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 
 	local_irq_restore(flags);
 
-	if (!vcpu->pv_time_enabled)
-		return 0;
+	/* With all the info we got, fill in the values */
 
 	if (kvm_has_tsc_control)
 		tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
@@ -1790,64 +1876,21 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 		vcpu->hw_tsc_khz = tgt_tsc_khz;
 	}
 
-	/* With all the info we got, fill in the values */
 	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
 	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
 	vcpu->last_guest_tsc = tsc_timestamp;
 
-	if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
-		&guest_hv_clock, sizeof(guest_hv_clock))))
-		return 0;
-
-	/* This VCPU is paused, but it's legal for a guest to read another
-	 * VCPU's kvmclock, so we really have to follow the specification where
-	 * it says that version is odd if data is being modified, and even after
-	 * it is consistent.
-	 *
-	 * Version field updates must be kept separate.  This is because
-	 * kvm_write_guest_cached might use a "rep movs" instruction, and
-	 * writes within a string instruction are weakly ordered.  So there
-	 * are three writes overall.
-	 *
-	 * As a small optimization, only write the version field in the first
-	 * and third write.  The vcpu->pv_time cache is still valid, because the
-	 * version field is the first in the struct.
-	 */
-	BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
-
-	vcpu->hv_clock.version = guest_hv_clock.version + 1;
-	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-				&vcpu->hv_clock,
-				sizeof(vcpu->hv_clock.version));
-
-	smp_wmb();
-
-	/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
-	pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
-
-	if (vcpu->pvclock_set_guest_stopped_request) {
-		pvclock_flags |= PVCLOCK_GUEST_STOPPED;
-		vcpu->pvclock_set_guest_stopped_request = false;
-	}
-
 	/* If the host uses TSC clocksource, then it is stable */
+	pvclock_flags = 0;
 	if (use_master_clock)
 		pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
 
 	vcpu->hv_clock.flags = pvclock_flags;
 
-	trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
-
-	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-				&vcpu->hv_clock,
-				sizeof(vcpu->hv_clock));
-
-	smp_wmb();
-
-	vcpu->hv_clock.version++;
-	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-				&vcpu->hv_clock,
-				sizeof(vcpu->hv_clock.version));
+	if (vcpu->pv_time_enabled)
+		kvm_setup_pvclock_page(v);
+	if (v == kvm_get_vcpu(v->kvm, 0))
+		kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
 	return 0;
 }
 
@@ -2746,7 +2789,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		if (check_tsc_unstable()) {
 			u64 offset = kvm_compute_tsc_offset(vcpu,
 						vcpu->arch.last_guest_tsc);
-			kvm_x86_ops->write_tsc_offset(vcpu, offset);
+			kvm_vcpu_write_tsc_offset(vcpu, offset);
 			vcpu->arch.tsc_catchup = 1;
 		}
 		if (kvm_lapic_hv_timer_in_use(vcpu) &&
@@ -4039,7 +4082,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	case KVM_SET_CLOCK: {
 		struct kvm_clock_data user_ns;
 		u64 now_ns;
-		s64 delta;
 
 		r = -EFAULT;
 		if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
@@ -4051,10 +4093,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
 
 		r = 0;
 		local_irq_disable();
-		now_ns = get_kernel_ns();
-		delta = user_ns.clock - now_ns;
+		now_ns = __get_kvmclock_ns(kvm);
+		kvm->arch.kvmclock_offset += user_ns.clock - now_ns;
 		local_irq_enable();
-		kvm->arch.kvmclock_offset = delta;
 		kvm_gen_update_masterclock(kvm);
 		break;
 	}
@@ -4062,10 +4103,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		struct kvm_clock_data user_ns;
 		u64 now_ns;
 
-		local_irq_disable();
-		now_ns = get_kernel_ns();
-		user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
-		local_irq_enable();
+		now_ns = get_kvmclock_ns(kvm);
+		user_ns.clock = now_ns;
 		user_ns.flags = 0;
 		memset(&user_ns.pad, 0, sizeof(user_ns.pad));
 
@@ -6700,7 +6739,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	kvm_put_guest_xcr0(vcpu);
 
-	/* Interrupt is enabled by handle_external_intr() */
 	kvm_x86_ops->handle_external_intr(vcpu);
 
 	++vcpu->stat.exits;
@@ -7530,7 +7568,7 @@ int kvm_arch_hardware_enable(void)
 	 * before any KVM threads can be running.  Unfortunately, we can't
 	 * bring the TSCs fully up to date with real time, as we aren't yet far
 	 * enough into CPU bringup that we know how much real time has actually
-	 * elapsed; our helper function, get_kernel_ns() will be using boot
+	 * elapsed; our helper function, ktime_get_boot_ns() will be using boot
 	 * variables that haven't been updated yet.
 	 *
 	 * So we simply find the maximum observed TSC above, then record the
@@ -7765,6 +7803,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	mutex_init(&kvm->arch.apic_map_lock);
 	spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
 
+	kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
 	pvclock_update_vm_gtod_copy(kvm);
 
 	INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index a82ca466b62e..e8ff3e4ce38a 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -148,11 +148,6 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu,
 	return kvm_register_write(vcpu, reg, val);
 }
 
-static inline u64 get_kernel_ns(void)
-{
-	return ktime_get_boot_ns();
-}
-
 static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
 {
 	return !(kvm->arch.disabled_quirks & quirk);
@@ -164,6 +159,7 @@ void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
 int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
 
 void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
+u64 get_kvmclock_ns(struct kvm *kvm);
 
 int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
 	gva_t addr, void *val, unsigned int bytes,
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index 97062a635b77..5c6fc3577a49 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -23,8 +23,6 @@ obj-y				+= bus_numa.o
 obj-$(CONFIG_AMD_NB)		+= amd_bus.o
 obj-$(CONFIG_PCI_CNB20LE_QUIRK)	+= broadcom_bus.o
 
-obj-$(CONFIG_VMD) += vmd.o
-
 ifeq ($(CONFIG_PCI_DEBUG),y)
 EXTRA_CFLAGS += -DDEBUG
 endif
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 7b6a9d14c8c0..a4fdfa7dcc1b 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -677,6 +677,12 @@ static void set_dma_domain_ops(struct pci_dev *pdev)
 static void set_dma_domain_ops(struct pci_dev *pdev) {}
 #endif
 
+static void set_dev_domain_options(struct pci_dev *pdev)
+{
+	if (is_vmd(pdev->bus))
+		pdev->hotplug_user_indicators = 1;
+}
+
 int pcibios_add_device(struct pci_dev *dev)
 {
 	struct setup_data *data;
@@ -707,6 +713,7 @@ int pcibios_add_device(struct pci_dev *dev)
 		iounmap(data);
 	}
 	set_dma_domain_ops(dev);
+	set_dev_domain_options(dev);
 	return 0;
 }
 
diff --git a/arch/x86/pci/vmd.c b/arch/x86/pci/vmd.c
deleted file mode 100644
index 7948be342ee9..000000000000
--- a/arch/x86/pci/vmd.c
+++ /dev/null
@@ -1,771 +0,0 @@
-/*
- * Volume Management Device driver
- * Copyright (c) 2015, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- */
-
-#include <linux/device.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/msi.h>
-#include <linux/pci.h>
-#include <linux/rculist.h>
-#include <linux/rcupdate.h>
-
-#include <asm/irqdomain.h>
-#include <asm/device.h>
-#include <asm/msi.h>
-#include <asm/msidef.h>
-
-#define VMD_CFGBAR	0
-#define VMD_MEMBAR1	2
-#define VMD_MEMBAR2	4
-
-/*
- * Lock for manipulating VMD IRQ lists.
- */
-static DEFINE_RAW_SPINLOCK(list_lock);
-
-/**
- * struct vmd_irq - private data to map driver IRQ to the VMD shared vector
- * @node:	list item for parent traversal.
- * @rcu:	RCU callback item for freeing.
- * @irq:	back pointer to parent.
- * @enabled:	true if driver enabled IRQ
- * @virq:	the virtual IRQ value provided to the requesting driver.
- *
- * Every MSI/MSI-X IRQ requested for a device in a VMD domain will be mapped to
- * a VMD IRQ using this structure.
- */
-struct vmd_irq {
-	struct list_head	node;
-	struct rcu_head		rcu;
-	struct vmd_irq_list	*irq;
-	bool			enabled;
-	unsigned int		virq;
-};
-
-/**
- * struct vmd_irq_list - list of driver requested IRQs mapping to a VMD vector
- * @irq_list:	the list of irq's the VMD one demuxes to.
- * @vmd_vector:	the h/w IRQ assigned to the VMD.
- * @index:	index into the VMD MSI-X table; used for message routing.
- * @count:	number of child IRQs assigned to this vector; used to track
- *		sharing.
- */
-struct vmd_irq_list {
-	struct list_head	irq_list;
-	struct vmd_dev		*vmd;
-	unsigned int		vmd_vector;
-	unsigned int		index;
-	unsigned int		count;
-};
-
-struct vmd_dev {
-	struct pci_dev		*dev;
-
-	spinlock_t		cfg_lock;
-	char __iomem		*cfgbar;
-
-	int msix_count;
-	struct msix_entry	*msix_entries;
-	struct vmd_irq_list	*irqs;
-
-	struct pci_sysdata	sysdata;
-	struct resource		resources[3];
-	struct irq_domain	*irq_domain;
-	struct pci_bus		*bus;
-
-#ifdef CONFIG_X86_DEV_DMA_OPS
-	struct dma_map_ops	dma_ops;
-	struct dma_domain	dma_domain;
-#endif
-};
-
-static inline struct vmd_dev *vmd_from_bus(struct pci_bus *bus)
-{
-	return container_of(bus->sysdata, struct vmd_dev, sysdata);
-}
-
-/*
- * Drivers managing a device in a VMD domain allocate their own IRQs as before,
- * but the MSI entry for the hardware it's driving will be programmed with a
- * destination ID for the VMD MSI-X table.  The VMD muxes interrupts in its
- * domain into one of its own, and the VMD driver de-muxes these for the
- * handlers sharing that VMD IRQ.  The vmd irq_domain provides the operations
- * and irq_chip to set this up.
- */
-static void vmd_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
-{
-	struct vmd_irq *vmdirq = data->chip_data;
-	struct vmd_irq_list *irq = vmdirq->irq;
-
-	msg->address_hi = MSI_ADDR_BASE_HI;
-	msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_DEST_ID(irq->index);
-	msg->data = 0;
-}
-
-/*
- * We rely on MSI_FLAG_USE_DEF_CHIP_OPS to set the IRQ mask/unmask ops.
- */
-static void vmd_irq_enable(struct irq_data *data)
-{
-	struct vmd_irq *vmdirq = data->chip_data;
-	unsigned long flags;
-
-	raw_spin_lock_irqsave(&list_lock, flags);
-	WARN_ON(vmdirq->enabled);
-	list_add_tail_rcu(&vmdirq->node, &vmdirq->irq->irq_list);
-	vmdirq->enabled = true;
-	raw_spin_unlock_irqrestore(&list_lock, flags);
-
-	data->chip->irq_unmask(data);
-}
-
-static void vmd_irq_disable(struct irq_data *data)
-{
-	struct vmd_irq *vmdirq = data->chip_data;
-	unsigned long flags;
-
-	data->chip->irq_mask(data);
-
-	raw_spin_lock_irqsave(&list_lock, flags);
-	if (vmdirq->enabled) {
-		list_del_rcu(&vmdirq->node);
-		vmdirq->enabled = false;
-	}
-	raw_spin_unlock_irqrestore(&list_lock, flags);
-}
-
-/*
- * XXX: Stubbed until we develop acceptable way to not create conflicts with
- * other devices sharing the same vector.
- */
-static int vmd_irq_set_affinity(struct irq_data *data,
-				const struct cpumask *dest, bool force)
-{
-	return -EINVAL;
-}
-
-static struct irq_chip vmd_msi_controller = {
-	.name			= "VMD-MSI",
-	.irq_enable		= vmd_irq_enable,
-	.irq_disable		= vmd_irq_disable,
-	.irq_compose_msi_msg	= vmd_compose_msi_msg,
-	.irq_set_affinity	= vmd_irq_set_affinity,
-};
-
-static irq_hw_number_t vmd_get_hwirq(struct msi_domain_info *info,
-				     msi_alloc_info_t *arg)
-{
-	return 0;
-}
-
-/*
- * XXX: We can be even smarter selecting the best IRQ once we solve the
- * affinity problem.
- */
-static struct vmd_irq_list *vmd_next_irq(struct vmd_dev *vmd, struct msi_desc *desc)
-{
-	int i, best = 1;
-	unsigned long flags;
-
-	if (!desc->msi_attrib.is_msix || vmd->msix_count == 1)
-		return &vmd->irqs[0];
-
-	raw_spin_lock_irqsave(&list_lock, flags);
-	for (i = 1; i < vmd->msix_count; i++)
-		if (vmd->irqs[i].count < vmd->irqs[best].count)
-			best = i;
-	vmd->irqs[best].count++;
-	raw_spin_unlock_irqrestore(&list_lock, flags);
-
-	return &vmd->irqs[best];
-}
-
-static int vmd_msi_init(struct irq_domain *domain, struct msi_domain_info *info,
-			unsigned int virq, irq_hw_number_t hwirq,
-			msi_alloc_info_t *arg)
-{
-	struct msi_desc *desc = arg->desc;
-	struct vmd_dev *vmd = vmd_from_bus(msi_desc_to_pci_dev(desc)->bus);
-	struct vmd_irq *vmdirq = kzalloc(sizeof(*vmdirq), GFP_KERNEL);
-
-	if (!vmdirq)
-		return -ENOMEM;
-
-	INIT_LIST_HEAD(&vmdirq->node);
-	vmdirq->irq = vmd_next_irq(vmd, desc);
-	vmdirq->virq = virq;
-
-	irq_domain_set_info(domain, virq, vmdirq->irq->vmd_vector, info->chip,
-			    vmdirq, handle_untracked_irq, vmd, NULL);
-	return 0;
-}
-
-static void vmd_msi_free(struct irq_domain *domain,
-			struct msi_domain_info *info, unsigned int virq)
-{
-	struct vmd_irq *vmdirq = irq_get_chip_data(virq);
-	unsigned long flags;
-
-	/* XXX: Potential optimization to rebalance */
-	raw_spin_lock_irqsave(&list_lock, flags);
-	vmdirq->irq->count--;
-	raw_spin_unlock_irqrestore(&list_lock, flags);
-
-	kfree_rcu(vmdirq, rcu);
-}
-
-static int vmd_msi_prepare(struct irq_domain *domain, struct device *dev,
-			   int nvec, msi_alloc_info_t *arg)
-{
-	struct pci_dev *pdev = to_pci_dev(dev);
-	struct vmd_dev *vmd = vmd_from_bus(pdev->bus);
-
-	if (nvec > vmd->msix_count)
-		return vmd->msix_count;
-
-	memset(arg, 0, sizeof(*arg));
-	return 0;
-}
-
-static void vmd_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc)
-{
-	arg->desc = desc;
-}
-
-static struct msi_domain_ops vmd_msi_domain_ops = {
-	.get_hwirq	= vmd_get_hwirq,
-	.msi_init	= vmd_msi_init,
-	.msi_free	= vmd_msi_free,
-	.msi_prepare	= vmd_msi_prepare,
-	.set_desc	= vmd_set_desc,
-};
-
-static struct msi_domain_info vmd_msi_domain_info = {
-	.flags		= MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
-			  MSI_FLAG_PCI_MSIX,
-	.ops		= &vmd_msi_domain_ops,
-	.chip		= &vmd_msi_controller,
-};
-
-#ifdef CONFIG_X86_DEV_DMA_OPS
-/*
- * VMD replaces the requester ID with its own.  DMA mappings for devices in a
- * VMD domain need to be mapped for the VMD, not the device requiring
- * the mapping.
- */
-static struct device *to_vmd_dev(struct device *dev)
-{
-	struct pci_dev *pdev = to_pci_dev(dev);
-	struct vmd_dev *vmd = vmd_from_bus(pdev->bus);
-
-	return &vmd->dev->dev;
-}
-
-static struct dma_map_ops *vmd_dma_ops(struct device *dev)
-{
-	return get_dma_ops(to_vmd_dev(dev));
-}
-
-static void *vmd_alloc(struct device *dev, size_t size, dma_addr_t *addr,
-		       gfp_t flag, unsigned long attrs)
-{
-	return vmd_dma_ops(dev)->alloc(to_vmd_dev(dev), size, addr, flag,
-				       attrs);
-}
-
-static void vmd_free(struct device *dev, size_t size, void *vaddr,
-		     dma_addr_t addr, unsigned long attrs)
-{
-	return vmd_dma_ops(dev)->free(to_vmd_dev(dev), size, vaddr, addr,
-				      attrs);
-}
-
-static int vmd_mmap(struct device *dev, struct vm_area_struct *vma,
-		    void *cpu_addr, dma_addr_t addr, size_t size,
-		    unsigned long attrs)
-{
-	return vmd_dma_ops(dev)->mmap(to_vmd_dev(dev), vma, cpu_addr, addr,
-				      size, attrs);
-}
-
-static int vmd_get_sgtable(struct device *dev, struct sg_table *sgt,
-			   void *cpu_addr, dma_addr_t addr, size_t size,
-			   unsigned long attrs)
-{
-	return vmd_dma_ops(dev)->get_sgtable(to_vmd_dev(dev), sgt, cpu_addr,
-					     addr, size, attrs);
-}
-
-static dma_addr_t vmd_map_page(struct device *dev, struct page *page,
-			       unsigned long offset, size_t size,
-			       enum dma_data_direction dir,
-			       unsigned long attrs)
-{
-	return vmd_dma_ops(dev)->map_page(to_vmd_dev(dev), page, offset, size,
-					  dir, attrs);
-}
-
-static void vmd_unmap_page(struct device *dev, dma_addr_t addr, size_t size,
-			   enum dma_data_direction dir, unsigned long attrs)
-{
-	vmd_dma_ops(dev)->unmap_page(to_vmd_dev(dev), addr, size, dir, attrs);
-}
-
-static int vmd_map_sg(struct device *dev, struct scatterlist *sg, int nents,
-		      enum dma_data_direction dir, unsigned long attrs)
-{
-	return vmd_dma_ops(dev)->map_sg(to_vmd_dev(dev), sg, nents, dir, attrs);
-}
-
-static void vmd_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
-			 enum dma_data_direction dir, unsigned long attrs)
-{
-	vmd_dma_ops(dev)->unmap_sg(to_vmd_dev(dev), sg, nents, dir, attrs);
-}
-
-static void vmd_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
-				    size_t size, enum dma_data_direction dir)
-{
-	vmd_dma_ops(dev)->sync_single_for_cpu(to_vmd_dev(dev), addr, size, dir);
-}
-
-static void vmd_sync_single_for_device(struct device *dev, dma_addr_t addr,
-				       size_t size, enum dma_data_direction dir)
-{
-	vmd_dma_ops(dev)->sync_single_for_device(to_vmd_dev(dev), addr, size,
-						 dir);
-}
-
-static void vmd_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
-				int nents, enum dma_data_direction dir)
-{
-	vmd_dma_ops(dev)->sync_sg_for_cpu(to_vmd_dev(dev), sg, nents, dir);
-}
-
-static void vmd_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
-				   int nents, enum dma_data_direction dir)
-{
-	vmd_dma_ops(dev)->sync_sg_for_device(to_vmd_dev(dev), sg, nents, dir);
-}
-
-static int vmd_mapping_error(struct device *dev, dma_addr_t addr)
-{
-	return vmd_dma_ops(dev)->mapping_error(to_vmd_dev(dev), addr);
-}
-
-static int vmd_dma_supported(struct device *dev, u64 mask)
-{
-	return vmd_dma_ops(dev)->dma_supported(to_vmd_dev(dev), mask);
-}
-
-#ifdef ARCH_HAS_DMA_GET_REQUIRED_MASK
-static u64 vmd_get_required_mask(struct device *dev)
-{
-	return vmd_dma_ops(dev)->get_required_mask(to_vmd_dev(dev));
-}
-#endif
-
-static void vmd_teardown_dma_ops(struct vmd_dev *vmd)
-{
-	struct dma_domain *domain = &vmd->dma_domain;
-
-	if (get_dma_ops(&vmd->dev->dev))
-		del_dma_domain(domain);
-}
-
-#define ASSIGN_VMD_DMA_OPS(source, dest, fn)	\
-	do {					\
-		if (source->fn)			\
-			dest->fn = vmd_##fn;	\
-	} while (0)
-
-static void vmd_setup_dma_ops(struct vmd_dev *vmd)
-{
-	const struct dma_map_ops *source = get_dma_ops(&vmd->dev->dev);
-	struct dma_map_ops *dest = &vmd->dma_ops;
-	struct dma_domain *domain = &vmd->dma_domain;
-
-	domain->domain_nr = vmd->sysdata.domain;
-	domain->dma_ops = dest;
-
-	if (!source)
-		return;
-	ASSIGN_VMD_DMA_OPS(source, dest, alloc);
-	ASSIGN_VMD_DMA_OPS(source, dest, free);
-	ASSIGN_VMD_DMA_OPS(source, dest, mmap);
-	ASSIGN_VMD_DMA_OPS(source, dest, get_sgtable);
-	ASSIGN_VMD_DMA_OPS(source, dest, map_page);
-	ASSIGN_VMD_DMA_OPS(source, dest, unmap_page);
-	ASSIGN_VMD_DMA_OPS(source, dest, map_sg);
-	ASSIGN_VMD_DMA_OPS(source, dest, unmap_sg);
-	ASSIGN_VMD_DMA_OPS(source, dest, sync_single_for_cpu);
-	ASSIGN_VMD_DMA_OPS(source, dest, sync_single_for_device);
-	ASSIGN_VMD_DMA_OPS(source, dest, sync_sg_for_cpu);
-	ASSIGN_VMD_DMA_OPS(source, dest, sync_sg_for_device);
-	ASSIGN_VMD_DMA_OPS(source, dest, mapping_error);
-	ASSIGN_VMD_DMA_OPS(source, dest, dma_supported);
-#ifdef ARCH_HAS_DMA_GET_REQUIRED_MASK
-	ASSIGN_VMD_DMA_OPS(source, dest, get_required_mask);
-#endif
-	add_dma_domain(domain);
-}
-#undef ASSIGN_VMD_DMA_OPS
-#else
-static void vmd_teardown_dma_ops(struct vmd_dev *vmd) {}
-static void vmd_setup_dma_ops(struct vmd_dev *vmd) {}
-#endif
-
-static char __iomem *vmd_cfg_addr(struct vmd_dev *vmd, struct pci_bus *bus,
-				  unsigned int devfn, int reg, int len)
-{
-	char __iomem *addr = vmd->cfgbar +
-			     (bus->number << 20) + (devfn << 12) + reg;
-
-	if ((addr - vmd->cfgbar) + len >=
-	    resource_size(&vmd->dev->resource[VMD_CFGBAR]))
-		return NULL;
-
-	return addr;
-}
-
-/*
- * CPU may deadlock if config space is not serialized on some versions of this
- * hardware, so all config space access is done under a spinlock.
- */
-static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg,
-			int len, u32 *value)
-{
-	struct vmd_dev *vmd = vmd_from_bus(bus);
-	char __iomem *addr = vmd_cfg_addr(vmd, bus, devfn, reg, len);
-	unsigned long flags;
-	int ret = 0;
-
-	if (!addr)
-		return -EFAULT;
-
-	spin_lock_irqsave(&vmd->cfg_lock, flags);
-	switch (len) {
-	case 1:
-		*value = readb(addr);
-		break;
-	case 2:
-		*value = readw(addr);
-		break;
-	case 4:
-		*value = readl(addr);
-		break;
-	default:
-		ret = -EINVAL;
-		break;
-	}
-	spin_unlock_irqrestore(&vmd->cfg_lock, flags);
-	return ret;
-}
-
-/*
- * VMD h/w converts non-posted config writes to posted memory writes. The
- * read-back in this function forces the completion so it returns only after
- * the config space was written, as expected.
- */
-static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg,
-			 int len, u32 value)
-{
-	struct vmd_dev *vmd = vmd_from_bus(bus);
-	char __iomem *addr = vmd_cfg_addr(vmd, bus, devfn, reg, len);
-	unsigned long flags;
-	int ret = 0;
-
-	if (!addr)
-		return -EFAULT;
-
-	spin_lock_irqsave(&vmd->cfg_lock, flags);
-	switch (len) {
-	case 1:
-		writeb(value, addr);
-		readb(addr);
-		break;
-	case 2:
-		writew(value, addr);
-		readw(addr);
-		break;
-	case 4:
-		writel(value, addr);
-		readl(addr);
-		break;
-	default:
-		ret = -EINVAL;
-		break;
-	}
-	spin_unlock_irqrestore(&vmd->cfg_lock, flags);
-	return ret;
-}
-
-static struct pci_ops vmd_ops = {
-	.read		= vmd_pci_read,
-	.write		= vmd_pci_write,
-};
-
-static void vmd_attach_resources(struct vmd_dev *vmd)
-{
-	vmd->dev->resource[VMD_MEMBAR1].child = &vmd->resources[1];
-	vmd->dev->resource[VMD_MEMBAR2].child = &vmd->resources[2];
-}
-
-static void vmd_detach_resources(struct vmd_dev *vmd)
-{
-	vmd->dev->resource[VMD_MEMBAR1].child = NULL;
-	vmd->dev->resource[VMD_MEMBAR2].child = NULL;
-}
-
-/*
- * VMD domains start at 0x1000 to not clash with ACPI _SEG domains.
- */
-static int vmd_find_free_domain(void)
-{
-	int domain = 0xffff;
-	struct pci_bus *bus = NULL;
-
-	while ((bus = pci_find_next_bus(bus)) != NULL)
-		domain = max_t(int, domain, pci_domain_nr(bus));
-	return domain + 1;
-}
-
-static int vmd_enable_domain(struct vmd_dev *vmd)
-{
-	struct pci_sysdata *sd = &vmd->sysdata;
-	struct resource *res;
-	u32 upper_bits;
-	unsigned long flags;
-	LIST_HEAD(resources);
-
-	res = &vmd->dev->resource[VMD_CFGBAR];
-	vmd->resources[0] = (struct resource) {
-		.name  = "VMD CFGBAR",
-		.start = 0,
-		.end   = (resource_size(res) >> 20) - 1,
-		.flags = IORESOURCE_BUS | IORESOURCE_PCI_FIXED,
-	};
-
-	/*
-	 * If the window is below 4GB, clear IORESOURCE_MEM_64 so we can
-	 * put 32-bit resources in the window.
-	 *
-	 * There's no hardware reason why a 64-bit window *couldn't*
-	 * contain a 32-bit resource, but pbus_size_mem() computes the
-	 * bridge window size assuming a 64-bit window will contain no
-	 * 32-bit resources.  __pci_assign_resource() enforces that
-	 * artificial restriction to make sure everything will fit.
-	 *
-	 * The only way we could use a 64-bit non-prefechable MEMBAR is
-	 * if its address is <4GB so that we can convert it to a 32-bit
-	 * resource.  To be visible to the host OS, all VMD endpoints must
-	 * be initially configured by platform BIOS, which includes setting
-	 * up these resources.  We can assume the device is configured
-	 * according to the platform needs.
-	 */
-	res = &vmd->dev->resource[VMD_MEMBAR1];
-	upper_bits = upper_32_bits(res->end);
-	flags = res->flags & ~IORESOURCE_SIZEALIGN;
-	if (!upper_bits)
-		flags &= ~IORESOURCE_MEM_64;
-	vmd->resources[1] = (struct resource) {
-		.name  = "VMD MEMBAR1",
-		.start = res->start,
-		.end   = res->end,
-		.flags = flags,
-		.parent = res,
-	};
-
-	res = &vmd->dev->resource[VMD_MEMBAR2];
-	upper_bits = upper_32_bits(res->end);
-	flags = res->flags & ~IORESOURCE_SIZEALIGN;
-	if (!upper_bits)
-		flags &= ~IORESOURCE_MEM_64;
-	vmd->resources[2] = (struct resource) {
-		.name  = "VMD MEMBAR2",
-		.start = res->start + 0x2000,
-		.end   = res->end,
-		.flags = flags,
-		.parent = res,
-	};
-
-	sd->domain = vmd_find_free_domain();
-	if (sd->domain < 0)
-		return sd->domain;
-
-	sd->node = pcibus_to_node(vmd->dev->bus);
-
-	vmd->irq_domain = pci_msi_create_irq_domain(NULL, &vmd_msi_domain_info,
-						    x86_vector_domain);
-	if (!vmd->irq_domain)
-		return -ENODEV;
-
-	pci_add_resource(&resources, &vmd->resources[0]);
-	pci_add_resource(&resources, &vmd->resources[1]);
-	pci_add_resource(&resources, &vmd->resources[2]);
-	vmd->bus = pci_create_root_bus(&vmd->dev->dev, 0, &vmd_ops, sd,
-				       &resources);
-	if (!vmd->bus) {
-		pci_free_resource_list(&resources);
-		irq_domain_remove(vmd->irq_domain);
-		return -ENODEV;
-	}
-
-	vmd_attach_resources(vmd);
-	vmd_setup_dma_ops(vmd);
-	dev_set_msi_domain(&vmd->bus->dev, vmd->irq_domain);
-	pci_rescan_bus(vmd->bus);
-
-	WARN(sysfs_create_link(&vmd->dev->dev.kobj, &vmd->bus->dev.kobj,
-			       "domain"), "Can't create symlink to domain\n");
-	return 0;
-}
-
-static irqreturn_t vmd_irq(int irq, void *data)
-{
-	struct vmd_irq_list *irqs = data;
-	struct vmd_irq *vmdirq;
-
-	rcu_read_lock();
-	list_for_each_entry_rcu(vmdirq, &irqs->irq_list, node)
-		generic_handle_irq(vmdirq->virq);
-	rcu_read_unlock();
-
-	return IRQ_HANDLED;
-}
-
-static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id)
-{
-	struct vmd_dev *vmd;
-	int i, err;
-
-	if (resource_size(&dev->resource[VMD_CFGBAR]) < (1 << 20))
-		return -ENOMEM;
-
-	vmd = devm_kzalloc(&dev->dev, sizeof(*vmd), GFP_KERNEL);
-	if (!vmd)
-		return -ENOMEM;
-
-	vmd->dev = dev;
-	err = pcim_enable_device(dev);
-	if (err < 0)
-		return err;
-
-	vmd->cfgbar = pcim_iomap(dev, VMD_CFGBAR, 0);
-	if (!vmd->cfgbar)
-		return -ENOMEM;
-
-	pci_set_master(dev);
-	if (dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(64)) &&
-	    dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(32)))
-		return -ENODEV;
-
-	vmd->msix_count = pci_msix_vec_count(dev);
-	if (vmd->msix_count < 0)
-		return -ENODEV;
-
-	vmd->irqs = devm_kcalloc(&dev->dev, vmd->msix_count, sizeof(*vmd->irqs),
-				 GFP_KERNEL);
-	if (!vmd->irqs)
-		return -ENOMEM;
-
-	vmd->msix_entries = devm_kcalloc(&dev->dev, vmd->msix_count,
-					 sizeof(*vmd->msix_entries),
-					 GFP_KERNEL);
-	if (!vmd->msix_entries)
-		return -ENOMEM;
-	for (i = 0; i < vmd->msix_count; i++)
-		vmd->msix_entries[i].entry = i;
-
-	vmd->msix_count = pci_enable_msix_range(vmd->dev, vmd->msix_entries, 1,
-						vmd->msix_count);
-	if (vmd->msix_count < 0)
-		return vmd->msix_count;
-
-	for (i = 0; i < vmd->msix_count; i++) {
-		INIT_LIST_HEAD(&vmd->irqs[i].irq_list);
-		vmd->irqs[i].vmd_vector = vmd->msix_entries[i].vector;
-		vmd->irqs[i].index = i;
-
-		err = devm_request_irq(&dev->dev, vmd->irqs[i].vmd_vector,
-				       vmd_irq, 0, "vmd", &vmd->irqs[i]);
-		if (err)
-			return err;
-	}
-
-	spin_lock_init(&vmd->cfg_lock);
-	pci_set_drvdata(dev, vmd);
-	err = vmd_enable_domain(vmd);
-	if (err)
-		return err;
-
-	dev_info(&vmd->dev->dev, "Bound to PCI domain %04x\n",
-		 vmd->sysdata.domain);
-	return 0;
-}
-
-static void vmd_remove(struct pci_dev *dev)
-{
-	struct vmd_dev *vmd = pci_get_drvdata(dev);
-
-	vmd_detach_resources(vmd);
-	pci_set_drvdata(dev, NULL);
-	sysfs_remove_link(&vmd->dev->dev.kobj, "domain");
-	pci_stop_root_bus(vmd->bus);
-	pci_remove_root_bus(vmd->bus);
-	vmd_teardown_dma_ops(vmd);
-	irq_domain_remove(vmd->irq_domain);
-}
-
-#ifdef CONFIG_PM
-static int vmd_suspend(struct device *dev)
-{
-	struct pci_dev *pdev = to_pci_dev(dev);
-
-	pci_save_state(pdev);
-	return 0;
-}
-
-static int vmd_resume(struct device *dev)
-{
-	struct pci_dev *pdev = to_pci_dev(dev);
-
-	pci_restore_state(pdev);
-	return 0;
-}
-#endif
-static SIMPLE_DEV_PM_OPS(vmd_dev_pm_ops, vmd_suspend, vmd_resume);
-
-static const struct pci_device_id vmd_ids[] = {
-	{PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x201d),},
-	{0,}
-};
-MODULE_DEVICE_TABLE(pci, vmd_ids);
-
-static struct pci_driver vmd_drv = {
-	.name		= "vmd",
-	.id_table	= vmd_ids,
-	.probe		= vmd_probe,
-	.remove		= vmd_remove,
-	.driver		= {
-		.pm	= &vmd_dev_pm_ops,
-	},
-};
-module_pci_driver(vmd_drv);
-
-MODULE_AUTHOR("Intel Corporation");
-MODULE_LICENSE("GPL v2");
-MODULE_VERSION("0.6");
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 3a483cb5ac81..bedfab98077a 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -456,7 +456,7 @@ void __init xen_msi_init(void)
 
 int __init pci_xen_hvm_init(void)
 {
-	if (!xen_have_vector_callback || !xen_feature(XENFEAT_hvm_pirqs))
+	if (!xen_feature(XENFEAT_hvm_pirqs))
 		return 0;
 
 #ifdef CONFIG_ACPI
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index f1d2182e071f..c0fdd57da7aa 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -137,8 +137,10 @@ struct shared_info xen_dummy_shared_info;
 void *xen_initial_gdt;
 
 RESERVE_BRK(shared_info_page_brk, PAGE_SIZE);
-__read_mostly int xen_have_vector_callback;
-EXPORT_SYMBOL_GPL(xen_have_vector_callback);
+
+static int xen_cpu_up_prepare(unsigned int cpu);
+static int xen_cpu_up_online(unsigned int cpu);
+static int xen_cpu_dead(unsigned int cpu);
 
 /*
  * Point at some empty memory to start with. We map the real shared_info
@@ -1519,10 +1521,7 @@ static void __init xen_pvh_early_guest_init(void)
 	if (!xen_feature(XENFEAT_auto_translated_physmap))
 		return;
 
-	if (!xen_feature(XENFEAT_hvm_callback_vector))
-		return;
-
-	xen_have_vector_callback = 1;
+	BUG_ON(!xen_feature(XENFEAT_hvm_callback_vector));
 
 	xen_pvh_early_cpu_init(0, false);
 	xen_pvh_set_cr_flags(0);
@@ -1538,6 +1537,24 @@ static void __init xen_dom0_set_legacy_features(void)
 	x86_platform.legacy.rtc = 1;
 }
 
+static int xen_cpuhp_setup(void)
+{
+	int rc;
+
+	rc = cpuhp_setup_state_nocalls(CPUHP_XEN_PREPARE,
+				       "XEN_HVM_GUEST_PREPARE",
+				       xen_cpu_up_prepare, xen_cpu_dead);
+	if (rc >= 0) {
+		rc = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+					       "XEN_HVM_GUEST_ONLINE",
+					       xen_cpu_up_online, NULL);
+		if (rc < 0)
+			cpuhp_remove_state_nocalls(CPUHP_XEN_PREPARE);
+	}
+
+	return rc >= 0 ? 0 : rc;
+}
+
 /* First C function to be called on Xen boot */
 asmlinkage __visible void __init xen_start_kernel(void)
 {
@@ -1639,6 +1656,8 @@ asmlinkage __visible void __init xen_start_kernel(void)
 	   possible map and a non-dummy shared_info. */
 	per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
 
+	WARN_ON(xen_cpuhp_setup());
+
 	local_irq_disable();
 	early_boot_irqs_disabled = true;
 
@@ -1819,31 +1838,54 @@ static void __init init_hvm_pv_info(void)
 	xen_domain_type = XEN_HVM_DOMAIN;
 }
 
-static int xen_hvm_cpu_notify(struct notifier_block *self, unsigned long action,
-			      void *hcpu)
+static int xen_cpu_up_prepare(unsigned int cpu)
 {
-	int cpu = (long)hcpu;
-	switch (action) {
-	case CPU_UP_PREPARE:
+	int rc;
+
+	if (xen_hvm_domain()) {
+		/*
+		 * This can happen if CPU was offlined earlier and
+		 * offlining timed out in common_cpu_die().
+		 */
+		if (cpu_report_state(cpu) == CPU_DEAD_FROZEN) {
+			xen_smp_intr_free(cpu);
+			xen_uninit_lock_cpu(cpu);
+		}
+
 		if (cpu_acpi_id(cpu) != U32_MAX)
 			per_cpu(xen_vcpu_id, cpu) = cpu_acpi_id(cpu);
 		else
 			per_cpu(xen_vcpu_id, cpu) = cpu;
 		xen_vcpu_setup(cpu);
-		if (xen_have_vector_callback) {
-			if (xen_feature(XENFEAT_hvm_safe_pvclock))
-				xen_setup_timer(cpu);
-		}
-		break;
-	default:
-		break;
 	}
-	return NOTIFY_OK;
+
+	if (xen_pv_domain() || xen_feature(XENFEAT_hvm_safe_pvclock))
+		xen_setup_timer(cpu);
+
+	rc = xen_smp_intr_init(cpu);
+	if (rc) {
+		WARN(1, "xen_smp_intr_init() for CPU %d failed: %d\n",
+		     cpu, rc);
+		return rc;
+	}
+	return 0;
 }
 
-static struct notifier_block xen_hvm_cpu_notifier = {
-	.notifier_call	= xen_hvm_cpu_notify,
-};
+static int xen_cpu_dead(unsigned int cpu)
+{
+	xen_smp_intr_free(cpu);
+
+	if (xen_pv_domain() || xen_feature(XENFEAT_hvm_safe_pvclock))
+		xen_teardown_timer(cpu);
+
+	return 0;
+}
+
+static int xen_cpu_up_online(unsigned int cpu)
+{
+	xen_init_lock_cpu(cpu);
+	return 0;
+}
 
 #ifdef CONFIG_KEXEC_CORE
 static void xen_hvm_shutdown(void)
@@ -1871,10 +1913,10 @@ static void __init xen_hvm_guest_init(void)
 
 	xen_panic_handler_init();
 
-	if (xen_feature(XENFEAT_hvm_callback_vector))
-		xen_have_vector_callback = 1;
+	BUG_ON(!xen_feature(XENFEAT_hvm_callback_vector));
+
 	xen_hvm_smp_init();
-	register_cpu_notifier(&xen_hvm_cpu_notifier);
+	WARN_ON(xen_cpuhp_setup());
 	xen_unplug_emulated_devices();
 	x86_init.irqs.intr_init = xen_init_IRQ;
 	xen_hvm_init_time_ops();
@@ -1910,7 +1952,7 @@ bool xen_hvm_need_lapic(void)
 		return false;
 	if (!xen_hvm_domain())
 		return false;
-	if (xen_feature(XENFEAT_hvm_pirqs) && xen_have_vector_callback)
+	if (xen_feature(XENFEAT_hvm_pirqs))
 		return false;
 	return true;
 }
diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
index de4144c24f1c..809b6c812654 100644
--- a/arch/x86/xen/grant-table.c
+++ b/arch/x86/xen/grant-table.c
@@ -89,7 +89,7 @@ void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
 
 static int arch_gnttab_valloc(struct gnttab_vm_area *area, unsigned nr_frames)
 {
-	area->ptes = kmalloc(sizeof(pte_t *) * nr_frames, GFP_KERNEL);
+	area->ptes = kmalloc_array(nr_frames, sizeof(*area->ptes), GFP_KERNEL);
 	if (area->ptes == NULL)
 		return -ENOMEM;
 
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
index d37a0c7f82cb..90d1b83cf35f 100644
--- a/arch/x86/xen/platform-pci-unplug.c
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -61,7 +61,7 @@ static int check_platform_magic(void)
 		}
 		break;
 	default:
-		printk(KERN_WARNING "Xen Platform PCI: unknown I/O protocol version");
+		printk(KERN_WARNING "Xen Platform PCI: unknown I/O protocol version\n");
 		return XEN_PLATFORM_ERR_PROTOCOL;
 	}
 
diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c
index 32bdc2c90297..b9fc52556bcc 100644
--- a/arch/x86/xen/pmu.c
+++ b/arch/x86/xen/pmu.c
@@ -547,8 +547,11 @@ void xen_pmu_init(int cpu)
 	return;
 
 fail:
-	pr_info_once("Could not initialize VPMU for cpu %d, error %d\n",
-		cpu, err);
+	if (err == -EOPNOTSUPP || err == -ENOSYS)
+		pr_info_once("VPMU disabled by hypervisor.\n");
+	else
+		pr_info_once("Could not initialize VPMU for cpu %d, error %d\n",
+			cpu, err);
 	free_pages((unsigned long)xenpmu_data, 0);
 }
 
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 0b4d04c8ab4d..9fa27ceeecfd 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -87,6 +87,12 @@ static void cpu_bringup(void)
 	cpu_data(cpu).x86_max_cores = 1;
 	set_cpu_sibling_map(cpu);
 
+	/*
+	 * identify_cpu() may have set logical_pkg_id to -1 due
+	 * to incorrect phys_proc_id. Let's re-comupte it.
+	 */
+	topology_update_package_map(apic->cpu_present_to_apicid(cpu), cpu);
+
 	xen_setup_cpu_clockevents();
 
 	notify_cpu_starting(cpu);
@@ -115,7 +121,7 @@ asmlinkage __visible void cpu_bringup_and_idle(int cpu)
 	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
 
-static void xen_smp_intr_free(unsigned int cpu)
+void xen_smp_intr_free(unsigned int cpu)
 {
 	if (per_cpu(xen_resched_irq, cpu).irq >= 0) {
 		unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu).irq, NULL);
@@ -159,7 +165,7 @@ static void xen_smp_intr_free(unsigned int cpu)
 		per_cpu(xen_pmu_irq, cpu).name = NULL;
 	}
 };
-static int xen_smp_intr_init(unsigned int cpu)
+int xen_smp_intr_init(unsigned int cpu)
 {
 	int rc;
 	char *resched_name, *callfunc_name, *debug_name, *pmu_name;
@@ -475,8 +481,6 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
 	common_cpu_up(cpu, idle);
 
 	xen_setup_runstate_info(cpu);
-	xen_setup_timer(cpu);
-	xen_init_lock_cpu(cpu);
 
 	/*
 	 * PV VCPUs are always successfully taken down (see 'while' loop
@@ -495,10 +499,6 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
 
 	xen_pmu_init(cpu);
 
-	rc = xen_smp_intr_init(cpu);
-	if (rc)
-		return rc;
-
 	rc = HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL);
 	BUG_ON(rc);
 
@@ -769,47 +769,10 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
 	xen_init_lock_cpu(0);
 }
 
-static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle)
-{
-	int rc;
-
-	/*
-	 * This can happen if CPU was offlined earlier and
-	 * offlining timed out in common_cpu_die().
-	 */
-	if (cpu_report_state(cpu) == CPU_DEAD_FROZEN) {
-		xen_smp_intr_free(cpu);
-		xen_uninit_lock_cpu(cpu);
-	}
-
-	/*
-	 * xen_smp_intr_init() needs to run before native_cpu_up()
-	 * so that IPI vectors are set up on the booting CPU before
-	 * it is marked online in native_cpu_up().
-	*/
-	rc = xen_smp_intr_init(cpu);
-	WARN_ON(rc);
-	if (!rc)
-		rc =  native_cpu_up(cpu, tidle);
-
-	/*
-	 * We must initialize the slowpath CPU kicker _after_ the native
-	 * path has executed. If we initialized it before none of the
-	 * unlocker IPI kicks would reach the booting CPU as the booting
-	 * CPU had not set itself 'online' in cpu_online_mask. That mask
-	 * is checked when IPIs are sent (on HVM at least).
-	 */
-	xen_init_lock_cpu(cpu);
-	return rc;
-}
-
 void __init xen_hvm_smp_init(void)
 {
-	if (!xen_have_vector_callback)
-		return;
 	smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
 	smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
-	smp_ops.cpu_up = xen_hvm_cpu_up;
 	smp_ops.cpu_die = xen_cpu_die;
 	smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
 	smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h
index 963d62a35c82..c5c16dc4f694 100644
--- a/arch/x86/xen/smp.h
+++ b/arch/x86/xen/smp.h
@@ -1,5 +1,6 @@
 #ifndef _XEN_SMP_H
 
+#ifdef CONFIG_SMP
 extern void xen_send_IPI_mask(const struct cpumask *mask,
 			      int vector);
 extern void xen_send_IPI_mask_allbutself(const struct cpumask *mask,
@@ -8,6 +9,18 @@ extern void xen_send_IPI_allbutself(int vector);
 extern void xen_send_IPI_all(int vector);
 extern void xen_send_IPI_self(int vector);
 
+extern int xen_smp_intr_init(unsigned int cpu);
+extern void xen_smp_intr_free(unsigned int cpu);
+
+#else /* CONFIG_SMP */
+
+static inline int xen_smp_intr_init(unsigned int cpu)
+{
+	return 0;
+}
+static inline void xen_smp_intr_free(unsigned int cpu) {}
+#endif /* CONFIG_SMP */
+
 #ifdef CONFIG_XEN_PVH
 extern void xen_pvh_early_cpu_init(int cpu, bool entry);
 #else
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 67356d29d74d..33d8f6a7829d 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -432,11 +432,6 @@ static void xen_hvm_setup_cpu_clockevents(void)
 
 void __init xen_hvm_init_time_ops(void)
 {
-	/* vector callback is needed otherwise we cannot receive interrupts
-	 * on cpu > 0 and at this point we don't know how many cpus are
-	 * available */
-	if (!xen_have_vector_callback)
-		return;
 	if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
 		printk(KERN_INFO "Xen doesn't support pvclock on HVM,"
 				"disable pv timer\n");