40 files changed, 6387 insertions, 19982 deletions
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index ea434ddc8499..267c7369c765 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -1,21 +1,43 @@
 # SPDX-License-Identifier: GPL-2.0
 # KVM common configuration items and defaults
 
-config HAVE_KVM
+config KVM_COMMON
        bool
+       select EVENTFD
+       select INTERVAL_TREE
+       select PREEMPT_NOTIFIERS
 
-config HAVE_KVM_IRQCHIP
+config HAVE_KVM_PFNCACHE
        bool
 
-config HAVE_KVM_IRQFD
+config HAVE_KVM_IRQCHIP
        bool
 
 config HAVE_KVM_IRQ_ROUTING
        bool
 
-config HAVE_KVM_EVENTFD
+config HAVE_KVM_DIRTY_RING
        bool
-       select EVENTFD
+
+# Only strongly ordered architectures can select this, as it doesn't
+# put any explicit constraint on userspace ordering. They can also
+# select the _ACQ_REL version.
+config HAVE_KVM_DIRTY_RING_TSO
+       bool
+       select HAVE_KVM_DIRTY_RING
+       depends on X86
+
+# Weakly ordered architectures can only select this, advertising
+# to userspace the additional ordering requirements.
+config HAVE_KVM_DIRTY_RING_ACQ_REL
+       bool
+       select HAVE_KVM_DIRTY_RING
+
+# Allow enabling both the dirty bitmap and dirty ring. Only architectures
+# that need to dirty memory outside of a vCPU context should select this.
+config NEED_KVM_DIRTY_RING_WITH_BITMAP
+	bool
+	depends on HAVE_KVM_DIRTY_RING
 
 config KVM_MMIO
        bool
@@ -30,13 +52,13 @@ config KVM_ASYNC_PF_SYNC
 config HAVE_KVM_MSI
        bool
 
-config HAVE_KVM_CPU_RELAX_INTERCEPT
+config HAVE_KVM_READONLY_MEM
        bool
 
-config KVM_VFIO
+config HAVE_KVM_CPU_RELAX_INTERCEPT
        bool
 
-config HAVE_KVM_ARCH_TLB_FLUSH_ALL
+config KVM_VFIO
        bool
 
 config HAVE_KVM_INVALID_WAKEUPS
@@ -45,15 +67,61 @@ config HAVE_KVM_INVALID_WAKEUPS
 config KVM_GENERIC_DIRTYLOG_READ_PROTECT
        bool
 
+config KVM_GENERIC_PRE_FAULT_MEMORY
+       bool
+
 config KVM_COMPAT
        def_bool y
-       depends on KVM && COMPAT && !(S390 || ARM64)
+       depends on KVM && COMPAT && !(S390 || ARM64 || RISCV)
 
 config HAVE_KVM_IRQ_BYPASS
+       tristate
+       select IRQ_BYPASS_MANAGER
+
+config HAVE_KVM_VCPU_RUN_PID_CHANGE
        bool
 
-config HAVE_KVM_VCPU_ASYNC_IOCTL
+config HAVE_KVM_NO_POLL
        bool
 
-config HAVE_KVM_VCPU_RUN_PID_CHANGE
+config VIRT_XFER_TO_GUEST_WORK
+       bool
+
+config HAVE_KVM_PM_NOTIFIER
+       bool
+
+config KVM_GENERIC_HARDWARE_ENABLING
+       bool
+
+config KVM_GENERIC_MMU_NOTIFIER
+       select MMU_NOTIFIER
+       bool
+
+config KVM_ELIDE_TLB_FLUSH_IF_YOUNG
+       depends on KVM_GENERIC_MMU_NOTIFIER
+       bool
+
+config KVM_MMU_LOCKLESS_AGING
+       depends on KVM_GENERIC_MMU_NOTIFIER
+       bool
+
+config KVM_GENERIC_MEMORY_ATTRIBUTES
+       depends on KVM_GENERIC_MMU_NOTIFIER
+       bool
+
+config KVM_GUEST_MEMFD
+       depends on KVM_GENERIC_MMU_NOTIFIER
+       select XARRAY_MULTI
+       bool
+
+config HAVE_KVM_ARCH_GMEM_PREPARE
+       bool
+       depends on KVM_GUEST_MEMFD
+
+config HAVE_KVM_ARCH_GMEM_INVALIDATE
+       bool
+       depends on KVM_GUEST_MEMFD
+
+config HAVE_KVM_ARCH_GMEM_POPULATE
        bool
+       depends on KVM_GUEST_MEMFD
diff --git a/virt/kvm/Makefile.kvm b/virt/kvm/Makefile.kvm
new file mode 100644
index 000000000000..d047d4cf58c9
--- /dev/null
+++ b/virt/kvm/Makefile.kvm
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for Kernel-based Virtual Machine module
+#
+
+KVM ?= ../../../virt/kvm
+
+kvm-y := $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/binary_stats.o
+kvm-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o
+kvm-$(CONFIG_KVM_MMIO) += $(KVM)/coalesced_mmio.o
+kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
+kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o
+kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o
+kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o
+kvm-$(CONFIG_KVM_GUEST_MEMFD) += $(KVM)/guest_memfd.o
diff --git a/virt/kvm/arm/aarch32.c b/virt/kvm/arm/aarch32.c
deleted file mode 100644
index 5abbe9b3c652..000000000000
--- a/virt/kvm/arm/aarch32.c
+++ /dev/null
@@ -1,239 +0,0 @@
-/*
- * (not much of an) Emulation layer for 32bit guests.
- *
- * Copyright (C) 2012,2013 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * based on arch/arm/kvm/emulate.c
- * Copyright (C) 2012 - Virtual Open Systems and Columbia University
- * Author: Christoffer Dall <c.dall@virtualopensystems.com>
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/kvm_host.h>
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_hyp.h>
-
-/*
- * stolen from arch/arm/kernel/opcodes.c
- *
- * condition code lookup table
- * index into the table is test code: EQ, NE, ... LT, GT, AL, NV
- *
- * bit position in short is condition code: NZCV
- */
-static const unsigned short cc_map[16] = {
-	0xF0F0,			/* EQ == Z set            */
-	0x0F0F,			/* NE                     */
-	0xCCCC,			/* CS == C set            */
-	0x3333,			/* CC                     */
-	0xFF00,			/* MI == N set            */
-	0x00FF,			/* PL                     */
-	0xAAAA,			/* VS == V set            */
-	0x5555,			/* VC                     */
-	0x0C0C,			/* HI == C set && Z clear */
-	0xF3F3,			/* LS == C clear || Z set */
-	0xAA55,			/* GE == (N==V)           */
-	0x55AA,			/* LT == (N!=V)           */
-	0x0A05,			/* GT == (!Z && (N==V))   */
-	0xF5FA,			/* LE == (Z || (N!=V))    */
-	0xFFFF,			/* AL always              */
-	0			/* NV                     */
-};
-
-/*
- * Check if a trapped instruction should have been executed or not.
- */
-bool __hyp_text kvm_condition_valid32(const struct kvm_vcpu *vcpu)
-{
-	unsigned long cpsr;
-	u32 cpsr_cond;
-	int cond;
-
-	/* Top two bits non-zero?  Unconditional. */
-	if (kvm_vcpu_get_hsr(vcpu) >> 30)
-		return true;
-
-	/* Is condition field valid? */
-	cond = kvm_vcpu_get_condition(vcpu);
-	if (cond == 0xE)
-		return true;
-
-	cpsr = *vcpu_cpsr(vcpu);
-
-	if (cond < 0) {
-		/* This can happen in Thumb mode: examine IT state. */
-		unsigned long it;
-
-		it = ((cpsr >> 8) & 0xFC) | ((cpsr >> 25) & 0x3);
-
-		/* it == 0 => unconditional. */
-		if (it == 0)
-			return true;
-
-		/* The cond for this insn works out as the top 4 bits. */
-		cond = (it >> 4);
-	}
-
-	cpsr_cond = cpsr >> 28;
-
-	if (!((cc_map[cond] >> cpsr_cond) & 1))
-		return false;
-
-	return true;
-}
-
-/**
- * adjust_itstate - adjust ITSTATE when emulating instructions in IT-block
- * @vcpu:	The VCPU pointer
- *
- * When exceptions occur while instructions are executed in Thumb IF-THEN
- * blocks, the ITSTATE field of the CPSR is not advanced (updated), so we have
- * to do this little bit of work manually. The fields map like this:
- *
- * IT[7:0] -> CPSR[26:25],CPSR[15:10]
- */
-static void __hyp_text kvm_adjust_itstate(struct kvm_vcpu *vcpu)
-{
-	unsigned long itbits, cond;
-	unsigned long cpsr = *vcpu_cpsr(vcpu);
-	bool is_arm = !(cpsr & PSR_AA32_T_BIT);
-
-	if (is_arm || !(cpsr & PSR_AA32_IT_MASK))
-		return;
-
-	cond = (cpsr & 0xe000) >> 13;
-	itbits = (cpsr & 0x1c00) >> (10 - 2);
-	itbits |= (cpsr & (0x3 << 25)) >> 25;
-
-	/* Perform ITAdvance (see page A2-52 in ARM DDI 0406C) */
-	if ((itbits & 0x7) == 0)
-		itbits = cond = 0;
-	else
-		itbits = (itbits << 1) & 0x1f;
-
-	cpsr &= ~PSR_AA32_IT_MASK;
-	cpsr |= cond << 13;
-	cpsr |= (itbits & 0x1c) << (10 - 2);
-	cpsr |= (itbits & 0x3) << 25;
-	*vcpu_cpsr(vcpu) = cpsr;
-}
-
-/**
- * kvm_skip_instr - skip a trapped instruction and proceed to the next
- * @vcpu: The vcpu pointer
- */
-void __hyp_text kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr)
-{
-	bool is_thumb;
-
-	is_thumb = !!(*vcpu_cpsr(vcpu) & PSR_AA32_T_BIT);
-	if (is_thumb && !is_wide_instr)
-		*vcpu_pc(vcpu) += 2;
-	else
-		*vcpu_pc(vcpu) += 4;
-	kvm_adjust_itstate(vcpu);
-}
-
-/*
- * Table taken from ARMv8 ARM DDI0487B-B, table G1-10.
- */
-static const u8 return_offsets[8][2] = {
-	[0] = { 0, 0 },		/* Reset, unused */
-	[1] = { 4, 2 },		/* Undefined */
-	[2] = { 0, 0 },		/* SVC, unused */
-	[3] = { 4, 4 },		/* Prefetch abort */
-	[4] = { 8, 8 },		/* Data abort */
-	[5] = { 0, 0 },		/* HVC, unused */
-	[6] = { 4, 4 },		/* IRQ, unused */
-	[7] = { 4, 4 },		/* FIQ, unused */
-};
-
-static void prepare_fault32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset)
-{
-	unsigned long cpsr;
-	unsigned long new_spsr_value = *vcpu_cpsr(vcpu);
-	bool is_thumb = (new_spsr_value & PSR_AA32_T_BIT);
-	u32 return_offset = return_offsets[vect_offset >> 2][is_thumb];
-	u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR);
-
-	cpsr = mode | PSR_AA32_I_BIT;
-
-	if (sctlr & (1 << 30))
-		cpsr |= PSR_AA32_T_BIT;
-	if (sctlr & (1 << 25))
-		cpsr |= PSR_AA32_E_BIT;
-
-	*vcpu_cpsr(vcpu) = cpsr;
-
-	/* Note: These now point to the banked copies */
-	vcpu_write_spsr(vcpu, new_spsr_value);
-	*vcpu_reg32(vcpu, 14) = *vcpu_pc(vcpu) + return_offset;
-
-	/* Branch to exception vector */
-	if (sctlr & (1 << 13))
-		vect_offset += 0xffff0000;
-	else /* always have security exceptions */
-		vect_offset += vcpu_cp15(vcpu, c12_VBAR);
-
-	*vcpu_pc(vcpu) = vect_offset;
-}
-
-void kvm_inject_undef32(struct kvm_vcpu *vcpu)
-{
-	prepare_fault32(vcpu, PSR_AA32_MODE_UND, 4);
-}
-
-/*
- * Modelled after TakeDataAbortException() and TakePrefetchAbortException
- * pseudocode.
- */
-static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt,
-			 unsigned long addr)
-{
-	u32 vect_offset;
-	u32 *far, *fsr;
-	bool is_lpae;
-
-	if (is_pabt) {
-		vect_offset = 12;
-		far = &vcpu_cp15(vcpu, c6_IFAR);
-		fsr = &vcpu_cp15(vcpu, c5_IFSR);
-	} else { /* !iabt */
-		vect_offset = 16;
-		far = &vcpu_cp15(vcpu, c6_DFAR);
-		fsr = &vcpu_cp15(vcpu, c5_DFSR);
-	}
-
-	prepare_fault32(vcpu, PSR_AA32_MODE_ABT | PSR_AA32_A_BIT, vect_offset);
-
-	*far = addr;
-
-	/* Give the guest an IMPLEMENTATION DEFINED exception */
-	is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31);
-	if (is_lpae)
-		*fsr = 1 << 9 | 0x34;
-	else
-		*fsr = 0x14;
-}
-
-void kvm_inject_dabt32(struct kvm_vcpu *vcpu, unsigned long addr)
-{
-	inject_abt32(vcpu, false, addr);
-}
-
-void kvm_inject_pabt32(struct kvm_vcpu *vcpu, unsigned long addr)
-{
-	inject_abt32(vcpu, true, addr);
-}
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
deleted file mode 100644
index b07ac4614e1c..000000000000
--- a/virt/kvm/arm/arch_timer.c
+++ /dev/null
@@ -1,943 +0,0 @@
-/*
- * Copyright (C) 2012 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#include <linux/cpu.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/uaccess.h>
-
-#include <clocksource/arm_arch_timer.h>
-#include <asm/arch_timer.h>
-#include <asm/kvm_hyp.h>
-
-#include <kvm/arm_vgic.h>
-#include <kvm/arm_arch_timer.h>
-
-#include "trace.h"
-
-static struct timecounter *timecounter;
-static unsigned int host_vtimer_irq;
-static u32 host_vtimer_irq_flags;
-
-static DEFINE_STATIC_KEY_FALSE(has_gic_active_state);
-
-static const struct kvm_irq_level default_ptimer_irq = {
-	.irq	= 30,
-	.level	= 1,
-};
-
-static const struct kvm_irq_level default_vtimer_irq = {
-	.irq	= 27,
-	.level	= 1,
-};
-
-static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx);
-static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
-				 struct arch_timer_context *timer_ctx);
-static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx);
-
-u64 kvm_phys_timer_read(void)
-{
-	return timecounter->cc->read(timecounter->cc);
-}
-
-static inline bool userspace_irqchip(struct kvm *kvm)
-{
-	return static_branch_unlikely(&userspace_irqchip_in_use) &&
-		unlikely(!irqchip_in_kernel(kvm));
-}
-
-static void soft_timer_start(struct hrtimer *hrt, u64 ns)
-{
-	hrtimer_start(hrt, ktime_add_ns(ktime_get(), ns),
-		      HRTIMER_MODE_ABS);
-}
-
-static void soft_timer_cancel(struct hrtimer *hrt)
-{
-	hrtimer_cancel(hrt);
-}
-
-static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
-{
-	struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id;
-	struct arch_timer_context *vtimer;
-
-	/*
-	 * We may see a timer interrupt after vcpu_put() has been called which
-	 * sets the CPU's vcpu pointer to NULL, because even though the timer
-	 * has been disabled in vtimer_save_state(), the hardware interrupt
-	 * signal may not have been retired from the interrupt controller yet.
-	 */
-	if (!vcpu)
-		return IRQ_HANDLED;
-
-	vtimer = vcpu_vtimer(vcpu);
-	if (kvm_timer_should_fire(vtimer))
-		kvm_timer_update_irq(vcpu, true, vtimer);
-
-	if (userspace_irqchip(vcpu->kvm) &&
-	    !static_branch_unlikely(&has_gic_active_state))
-		disable_percpu_irq(host_vtimer_irq);
-
-	return IRQ_HANDLED;
-}
-
-static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx)
-{
-	u64 cval, now;
-
-	cval = timer_ctx->cnt_cval;
-	now = kvm_phys_timer_read() - timer_ctx->cntvoff;
-
-	if (now < cval) {
-		u64 ns;
-
-		ns = cyclecounter_cyc2ns(timecounter->cc,
-					 cval - now,
-					 timecounter->mask,
-					 &timecounter->frac);
-		return ns;
-	}
-
-	return 0;
-}
-
-static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx)
-{
-	return !(timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_IT_MASK) &&
-		(timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_ENABLE);
-}
-
-/*
- * Returns the earliest expiration time in ns among guest timers.
- * Note that it will return 0 if none of timers can fire.
- */
-static u64 kvm_timer_earliest_exp(struct kvm_vcpu *vcpu)
-{
-	u64 min_virt = ULLONG_MAX, min_phys = ULLONG_MAX;
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
-
-	if (kvm_timer_irq_can_fire(vtimer))
-		min_virt = kvm_timer_compute_delta(vtimer);
-
-	if (kvm_timer_irq_can_fire(ptimer))
-		min_phys = kvm_timer_compute_delta(ptimer);
-
-	/* If none of timers can fire, then return 0 */
-	if ((min_virt == ULLONG_MAX) && (min_phys == ULLONG_MAX))
-		return 0;
-
-	return min(min_virt, min_phys);
-}
-
-static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt)
-{
-	struct arch_timer_cpu *timer;
-	struct kvm_vcpu *vcpu;
-	u64 ns;
-
-	timer = container_of(hrt, struct arch_timer_cpu, bg_timer);
-	vcpu = container_of(timer, struct kvm_vcpu, arch.timer_cpu);
-
-	/*
-	 * Check that the timer has really expired from the guest's
-	 * PoV (NTP on the host may have forced it to expire
-	 * early). If we should have slept longer, restart it.
-	 */
-	ns = kvm_timer_earliest_exp(vcpu);
-	if (unlikely(ns)) {
-		hrtimer_forward_now(hrt, ns_to_ktime(ns));
-		return HRTIMER_RESTART;
-	}
-
-	kvm_vcpu_wake_up(vcpu);
-	return HRTIMER_NORESTART;
-}
-
-static enum hrtimer_restart kvm_phys_timer_expire(struct hrtimer *hrt)
-{
-	struct arch_timer_context *ptimer;
-	struct arch_timer_cpu *timer;
-	struct kvm_vcpu *vcpu;
-	u64 ns;
-
-	timer = container_of(hrt, struct arch_timer_cpu, phys_timer);
-	vcpu = container_of(timer, struct kvm_vcpu, arch.timer_cpu);
-	ptimer = vcpu_ptimer(vcpu);
-
-	/*
-	 * Check that the timer has really expired from the guest's
-	 * PoV (NTP on the host may have forced it to expire
-	 * early). If not ready, schedule for a later time.
-	 */
-	ns = kvm_timer_compute_delta(ptimer);
-	if (unlikely(ns)) {
-		hrtimer_forward_now(hrt, ns_to_ktime(ns));
-		return HRTIMER_RESTART;
-	}
-
-	kvm_timer_update_irq(vcpu, true, ptimer);
-	return HRTIMER_NORESTART;
-}
-
-static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx)
-{
-	u64 cval, now;
-
-	if (timer_ctx->loaded) {
-		u32 cnt_ctl;
-
-		/* Only the virtual timer can be loaded so far */
-		cnt_ctl = read_sysreg_el0(cntv_ctl);
-		return  (cnt_ctl & ARCH_TIMER_CTRL_ENABLE) &&
-		        (cnt_ctl & ARCH_TIMER_CTRL_IT_STAT) &&
-		       !(cnt_ctl & ARCH_TIMER_CTRL_IT_MASK);
-	}
-
-	if (!kvm_timer_irq_can_fire(timer_ctx))
-		return false;
-
-	cval = timer_ctx->cnt_cval;
-	now = kvm_phys_timer_read() - timer_ctx->cntvoff;
-
-	return cval <= now;
-}
-
-bool kvm_timer_is_pending(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
-
-	if (kvm_timer_should_fire(vtimer))
-		return true;
-
-	return kvm_timer_should_fire(ptimer);
-}
-
-/*
- * Reflect the timer output level into the kvm_run structure
- */
-void kvm_timer_update_run(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
-	struct kvm_sync_regs *regs = &vcpu->run->s.regs;
-
-	/* Populate the device bitmap with the timer states */
-	regs->device_irq_level &= ~(KVM_ARM_DEV_EL1_VTIMER |
-				    KVM_ARM_DEV_EL1_PTIMER);
-	if (kvm_timer_should_fire(vtimer))
-		regs->device_irq_level |= KVM_ARM_DEV_EL1_VTIMER;
-	if (kvm_timer_should_fire(ptimer))
-		regs->device_irq_level |= KVM_ARM_DEV_EL1_PTIMER;
-}
-
-static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
-				 struct arch_timer_context *timer_ctx)
-{
-	int ret;
-
-	timer_ctx->irq.level = new_level;
-	trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_ctx->irq.irq,
-				   timer_ctx->irq.level);
-
-	if (!userspace_irqchip(vcpu->kvm)) {
-		ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
-					  timer_ctx->irq.irq,
-					  timer_ctx->irq.level,
-					  timer_ctx);
-		WARN_ON(ret);
-	}
-}
-
-/* Schedule the background timer for the emulated timer. */
-static void phys_timer_emulate(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
-	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
-
-	/*
-	 * If the timer can fire now, we don't need to have a soft timer
-	 * scheduled for the future.  If the timer cannot fire at all,
-	 * then we also don't need a soft timer.
-	 */
-	if (kvm_timer_should_fire(ptimer) || !kvm_timer_irq_can_fire(ptimer)) {
-		soft_timer_cancel(&timer->phys_timer);
-		return;
-	}
-
-	soft_timer_start(&timer->phys_timer, kvm_timer_compute_delta(ptimer));
-}
-
-/*
- * Check if there was a change in the timer state, so that we should either
- * raise or lower the line level to the GIC or schedule a background timer to
- * emulate the physical timer.
- */
-static void kvm_timer_update_state(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
-	bool level;
-
-	if (unlikely(!timer->enabled))
-		return;
-
-	/*
-	 * The vtimer virtual interrupt is a 'mapped' interrupt, meaning part
-	 * of its lifecycle is offloaded to the hardware, and we therefore may
-	 * not have lowered the irq.level value before having to signal a new
-	 * interrupt, but have to signal an interrupt every time the level is
-	 * asserted.
-	 */
-	level = kvm_timer_should_fire(vtimer);
-	kvm_timer_update_irq(vcpu, level, vtimer);
-
-	phys_timer_emulate(vcpu);
-
-	if (kvm_timer_should_fire(ptimer) != ptimer->irq.level)
-		kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer);
-}
-
-static void vtimer_save_state(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-	unsigned long flags;
-
-	local_irq_save(flags);
-
-	if (!vtimer->loaded)
-		goto out;
-
-	if (timer->enabled) {
-		vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl);
-		vtimer->cnt_cval = read_sysreg_el0(cntv_cval);
-	}
-
-	/* Disable the virtual timer */
-	write_sysreg_el0(0, cntv_ctl);
-	isb();
-
-	vtimer->loaded = false;
-out:
-	local_irq_restore(flags);
-}
-
-/*
- * Schedule the background timer before calling kvm_vcpu_block, so that this
- * thread is removed from its waitqueue and made runnable when there's a timer
- * interrupt to handle.
- */
-void kvm_timer_schedule(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
-
-	vtimer_save_state(vcpu);
-
-	/*
-	 * No need to schedule a background timer if any guest timer has
-	 * already expired, because kvm_vcpu_block will return before putting
-	 * the thread to sleep.
-	 */
-	if (kvm_timer_should_fire(vtimer) || kvm_timer_should_fire(ptimer))
-		return;
-
-	/*
-	 * If both timers are not capable of raising interrupts (disabled or
-	 * masked), then there's no more work for us to do.
-	 */
-	if (!kvm_timer_irq_can_fire(vtimer) && !kvm_timer_irq_can_fire(ptimer))
-		return;
-
-	/*
-	 * The guest timers have not yet expired, schedule a background timer.
-	 * Set the earliest expiration time among the guest timers.
-	 */
-	soft_timer_start(&timer->bg_timer, kvm_timer_earliest_exp(vcpu));
-}
-
-static void vtimer_restore_state(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-	unsigned long flags;
-
-	local_irq_save(flags);
-
-	if (vtimer->loaded)
-		goto out;
-
-	if (timer->enabled) {
-		write_sysreg_el0(vtimer->cnt_cval, cntv_cval);
-		isb();
-		write_sysreg_el0(vtimer->cnt_ctl, cntv_ctl);
-	}
-
-	vtimer->loaded = true;
-out:
-	local_irq_restore(flags);
-}
-
-void kvm_timer_unschedule(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
-
-	vtimer_restore_state(vcpu);
-
-	soft_timer_cancel(&timer->bg_timer);
-}
-
-static void set_cntvoff(u64 cntvoff)
-{
-	u32 low = lower_32_bits(cntvoff);
-	u32 high = upper_32_bits(cntvoff);
-
-	/*
-	 * Since kvm_call_hyp doesn't fully support the ARM PCS especially on
-	 * 32-bit systems, but rather passes register by register shifted one
-	 * place (we put the function address in r0/x0), we cannot simply pass
-	 * a 64-bit value as an argument, but have to split the value in two
-	 * 32-bit halves.
-	 */
-	kvm_call_hyp(__kvm_timer_set_cntvoff, low, high);
-}
-
-static inline void set_vtimer_irq_phys_active(struct kvm_vcpu *vcpu, bool active)
-{
-	int r;
-	r = irq_set_irqchip_state(host_vtimer_irq, IRQCHIP_STATE_ACTIVE, active);
-	WARN_ON(r);
-}
-
-static void kvm_timer_vcpu_load_gic(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-	bool phys_active;
-
-	if (irqchip_in_kernel(vcpu->kvm))
-		phys_active = kvm_vgic_map_is_active(vcpu, vtimer->irq.irq);
-	else
-		phys_active = vtimer->irq.level;
-	set_vtimer_irq_phys_active(vcpu, phys_active);
-}
-
-static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-
-	/*
-	 * When using a userspace irqchip with the architected timers and a
-	 * host interrupt controller that doesn't support an active state, we
-	 * must still prevent continuously exiting from the guest, and
-	 * therefore mask the physical interrupt by disabling it on the host
-	 * interrupt controller when the virtual level is high, such that the
-	 * guest can make forward progress.  Once we detect the output level
-	 * being de-asserted, we unmask the interrupt again so that we exit
-	 * from the guest when the timer fires.
-	 */
-	if (vtimer->irq.level)
-		disable_percpu_irq(host_vtimer_irq);
-	else
-		enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
-}
-
-void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
-
-	if (unlikely(!timer->enabled))
-		return;
-
-	if (static_branch_likely(&has_gic_active_state))
-		kvm_timer_vcpu_load_gic(vcpu);
-	else
-		kvm_timer_vcpu_load_nogic(vcpu);
-
-	set_cntvoff(vtimer->cntvoff);
-
-	vtimer_restore_state(vcpu);
-
-	/* Set the background timer for the physical timer emulation. */
-	phys_timer_emulate(vcpu);
-
-	/* If the timer fired while we weren't running, inject it now */
-	if (kvm_timer_should_fire(ptimer) != ptimer->irq.level)
-		kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer);
-}
-
-bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
-	struct kvm_sync_regs *sregs = &vcpu->run->s.regs;
-	bool vlevel, plevel;
-
-	if (likely(irqchip_in_kernel(vcpu->kvm)))
-		return false;
-
-	vlevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_VTIMER;
-	plevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_PTIMER;
-
-	return kvm_timer_should_fire(vtimer) != vlevel ||
-	       kvm_timer_should_fire(ptimer) != plevel;
-}
-
-void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
-
-	if (unlikely(!timer->enabled))
-		return;
-
-	vtimer_save_state(vcpu);
-
-	/*
-	 * Cancel the physical timer emulation, because the only case where we
-	 * need it after a vcpu_put is in the context of a sleeping VCPU, and
-	 * in that case we already factor in the deadline for the physical
-	 * timer when scheduling the bg_timer.
-	 *
-	 * In any case, we re-schedule the hrtimer for the physical timer when
-	 * coming back to the VCPU thread in kvm_timer_vcpu_load().
-	 */
-	soft_timer_cancel(&timer->phys_timer);
-
-	/*
-	 * The kernel may decide to run userspace after calling vcpu_put, so
-	 * we reset cntvoff to 0 to ensure a consistent read between user
-	 * accesses to the virtual counter and kernel access to the physical
-	 * counter of non-VHE case. For VHE, the virtual counter uses a fixed
-	 * virtual offset of zero, so no need to zero CNTVOFF_EL2 register.
-	 */
-	if (!has_vhe())
-		set_cntvoff(0);
-}
-
-/*
- * With a userspace irqchip we have to check if the guest de-asserted the
- * timer and if so, unmask the timer irq signal on the host interrupt
- * controller to ensure that we see future timer signals.
- */
-static void unmask_vtimer_irq_user(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-
-	if (!kvm_timer_should_fire(vtimer)) {
-		kvm_timer_update_irq(vcpu, false, vtimer);
-		if (static_branch_likely(&has_gic_active_state))
-			set_vtimer_irq_phys_active(vcpu, false);
-		else
-			enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
-	}
-}
-
-void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
-
-	if (unlikely(!timer->enabled))
-		return;
-
-	if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
-		unmask_vtimer_irq_user(vcpu);
-}
-
-int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
-
-	/*
-	 * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8
-	 * and to 0 for ARMv7.  We provide an implementation that always
-	 * resets the timer to be disabled and unmasked and is compliant with
-	 * the ARMv7 architecture.
-	 */
-	vtimer->cnt_ctl = 0;
-	ptimer->cnt_ctl = 0;
-	kvm_timer_update_state(vcpu);
-
-	if (timer->enabled && irqchip_in_kernel(vcpu->kvm))
-		kvm_vgic_reset_mapped_irq(vcpu, vtimer->irq.irq);
-
-	return 0;
-}
-
-/* Make the updates of cntvoff for all vtimer contexts atomic */
-static void update_vtimer_cntvoff(struct kvm_vcpu *vcpu, u64 cntvoff)
-{
-	int i;
-	struct kvm *kvm = vcpu->kvm;
-	struct kvm_vcpu *tmp;
-
-	mutex_lock(&kvm->lock);
-	kvm_for_each_vcpu(i, tmp, kvm)
-		vcpu_vtimer(tmp)->cntvoff = cntvoff;
-
-	/*
-	 * When called from the vcpu create path, the CPU being created is not
-	 * included in the loop above, so we just set it here as well.
-	 */
-	vcpu_vtimer(vcpu)->cntvoff = cntvoff;
-	mutex_unlock(&kvm->lock);
-}
-
-void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
-
-	/* Synchronize cntvoff across all vtimers of a VM. */
-	update_vtimer_cntvoff(vcpu, kvm_phys_timer_read());
-	vcpu_ptimer(vcpu)->cntvoff = 0;
-
-	hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
-	timer->bg_timer.function = kvm_bg_timer_expire;
-
-	hrtimer_init(&timer->phys_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
-	timer->phys_timer.function = kvm_phys_timer_expire;
-
-	vtimer->irq.irq = default_vtimer_irq.irq;
-	ptimer->irq.irq = default_ptimer_irq.irq;
-}
-
-static void kvm_timer_init_interrupt(void *info)
-{
-	enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
-}
-
-int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
-{
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
-
-	switch (regid) {
-	case KVM_REG_ARM_TIMER_CTL:
-		vtimer->cnt_ctl = value & ~ARCH_TIMER_CTRL_IT_STAT;
-		break;
-	case KVM_REG_ARM_TIMER_CNT:
-		update_vtimer_cntvoff(vcpu, kvm_phys_timer_read() - value);
-		break;
-	case KVM_REG_ARM_TIMER_CVAL:
-		vtimer->cnt_cval = value;
-		break;
-	case KVM_REG_ARM_PTIMER_CTL:
-		ptimer->cnt_ctl = value & ~ARCH_TIMER_CTRL_IT_STAT;
-		break;
-	case KVM_REG_ARM_PTIMER_CVAL:
-		ptimer->cnt_cval = value;
-		break;
-
-	default:
-		return -1;
-	}
-
-	kvm_timer_update_state(vcpu);
-	return 0;
-}
-
-static u64 read_timer_ctl(struct arch_timer_context *timer)
-{
-	/*
-	 * Set ISTATUS bit if it's expired.
-	 * Note that according to ARMv8 ARM Issue A.k, ISTATUS bit is
-	 * UNKNOWN when ENABLE bit is 0, so we chose to set ISTATUS bit
-	 * regardless of ENABLE bit for our implementation convenience.
-	 */
-	if (!kvm_timer_compute_delta(timer))
-		return timer->cnt_ctl | ARCH_TIMER_CTRL_IT_STAT;
-	else
-		return timer->cnt_ctl;
-}
-
-u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid)
-{
-	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-
-	switch (regid) {
-	case KVM_REG_ARM_TIMER_CTL:
-		return read_timer_ctl(vtimer);
-	case KVM_REG_ARM_TIMER_CNT:
-		return kvm_phys_timer_read() - vtimer->cntvoff;
-	case KVM_REG_ARM_TIMER_CVAL:
-		return vtimer->cnt_cval;
-	case KVM_REG_ARM_PTIMER_CTL:
-		return read_timer_ctl(ptimer);
-	case KVM_REG_ARM_PTIMER_CVAL:
-		return ptimer->cnt_cval;
-	case KVM_REG_ARM_PTIMER_CNT:
-		return kvm_phys_timer_read();
-	}
-	return (u64)-1;
-}
-
-static int kvm_timer_starting_cpu(unsigned int cpu)
-{
-	kvm_timer_init_interrupt(NULL);
-	return 0;
-}
-
-static int kvm_timer_dying_cpu(unsigned int cpu)
-{
-	disable_percpu_irq(host_vtimer_irq);
-	return 0;
-}
-
-int kvm_timer_hyp_init(bool has_gic)
-{
-	struct arch_timer_kvm_info *info;
-	int err;
-
-	info = arch_timer_get_kvm_info();
-	timecounter = &info->timecounter;
-
-	if (!timecounter->cc) {
-		kvm_err("kvm_arch_timer: uninitialized timecounter\n");
-		return -ENODEV;
-	}
-
-	if (info->virtual_irq <= 0) {
-		kvm_err("kvm_arch_timer: invalid virtual timer IRQ: %d\n",
-			info->virtual_irq);
-		return -ENODEV;
-	}
-	host_vtimer_irq = info->virtual_irq;
-
-	host_vtimer_irq_flags = irq_get_trigger_type(host_vtimer_irq);
-	if (host_vtimer_irq_flags != IRQF_TRIGGER_HIGH &&
-	    host_vtimer_irq_flags != IRQF_TRIGGER_LOW) {
-		kvm_err("Invalid trigger for IRQ%d, assuming level low\n",
-			host_vtimer_irq);
-		host_vtimer_irq_flags = IRQF_TRIGGER_LOW;
-	}
-
-	err = request_percpu_irq(host_vtimer_irq, kvm_arch_timer_handler,
-				 "kvm guest timer", kvm_get_running_vcpus());
-	if (err) {
-		kvm_err("kvm_arch_timer: can't request interrupt %d (%d)\n",
-			host_vtimer_irq, err);
-		return err;
-	}
-
-	if (has_gic) {
-		err = irq_set_vcpu_affinity(host_vtimer_irq,
-					    kvm_get_running_vcpus());
-		if (err) {
-			kvm_err("kvm_arch_timer: error setting vcpu affinity\n");
-			goto out_free_irq;
-		}
-
-		static_branch_enable(&has_gic_active_state);
-	}
-
-	kvm_debug("virtual timer IRQ%d\n", host_vtimer_irq);
-
-	cpuhp_setup_state(CPUHP_AP_KVM_ARM_TIMER_STARTING,
-			  "kvm/arm/timer:starting", kvm_timer_starting_cpu,
-			  kvm_timer_dying_cpu);
-	return 0;
-out_free_irq:
-	free_percpu_irq(host_vtimer_irq, kvm_get_running_vcpus());
-	return err;
-}
-
-void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
-
-	soft_timer_cancel(&timer->bg_timer);
-}
-
-static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu)
-{
-	int vtimer_irq, ptimer_irq;
-	int i, ret;
-
-	vtimer_irq = vcpu_vtimer(vcpu)->irq.irq;
-	ret = kvm_vgic_set_owner(vcpu, vtimer_irq, vcpu_vtimer(vcpu));
-	if (ret)
-		return false;
-
-	ptimer_irq = vcpu_ptimer(vcpu)->irq.irq;
-	ret = kvm_vgic_set_owner(vcpu, ptimer_irq, vcpu_ptimer(vcpu));
-	if (ret)
-		return false;
-
-	kvm_for_each_vcpu(i, vcpu, vcpu->kvm) {
-		if (vcpu_vtimer(vcpu)->irq.irq != vtimer_irq ||
-		    vcpu_ptimer(vcpu)->irq.irq != ptimer_irq)
-			return false;
-	}
-
-	return true;
-}
-
-bool kvm_arch_timer_get_input_level(int vintid)
-{
-	struct kvm_vcpu *vcpu = kvm_arm_get_running_vcpu();
-	struct arch_timer_context *timer;
-
-	if (vintid == vcpu_vtimer(vcpu)->irq.irq)
-		timer = vcpu_vtimer(vcpu);
-	else
-		BUG(); /* We only map the vtimer so far */
-
-	return kvm_timer_should_fire(timer);
-}
-
-int kvm_timer_enable(struct kvm_vcpu *vcpu)
-{
-	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-	int ret;
-
-	if (timer->enabled)
-		return 0;
-
-	/* Without a VGIC we do not map virtual IRQs to physical IRQs */
-	if (!irqchip_in_kernel(vcpu->kvm))
-		goto no_vgic;
-
-	if (!vgic_initialized(vcpu->kvm))
-		return -ENODEV;
-
-	if (!timer_irqs_are_valid(vcpu)) {
-		kvm_debug("incorrectly configured timer irqs\n");
-		return -EINVAL;
-	}
-
-	ret = kvm_vgic_map_phys_irq(vcpu, host_vtimer_irq, vtimer->irq.irq,
-				    kvm_arch_timer_get_input_level);
-	if (ret)
-		return ret;
-
-no_vgic:
-	timer->enabled = 1;
-	return 0;
-}
-
-/*
- * On VHE system, we only need to configure trap on physical timer and counter
- * accesses in EL0 and EL1 once, not for every world switch.
- * The host kernel runs at EL2 with HCR_EL2.TGE == 1,
- * and this makes those bits have no effect for the host kernel execution.
- */
-void kvm_timer_init_vhe(void)
-{
-	/* When HCR_EL2.E2H ==1, EL1PCEN and EL1PCTEN are shifted by 10 */
-	u32 cnthctl_shift = 10;
-	u64 val;
-
-	/*
-	 * Disallow physical timer access for the guest.
-	 * Physical counter access is allowed.
-	 */
-	val = read_sysreg(cnthctl_el2);
-	val &= ~(CNTHCTL_EL1PCEN << cnthctl_shift);
-	val |= (CNTHCTL_EL1PCTEN << cnthctl_shift);
-	write_sysreg(val, cnthctl_el2);
-}
-
-static void set_timer_irqs(struct kvm *kvm, int vtimer_irq, int ptimer_irq)
-{
-	struct kvm_vcpu *vcpu;
-	int i;
-
-	kvm_for_each_vcpu(i, vcpu, kvm) {
-		vcpu_vtimer(vcpu)->irq.irq = vtimer_irq;
-		vcpu_ptimer(vcpu)->irq.irq = ptimer_irq;
-	}
-}
-
-int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
-{
-	int __user *uaddr = (int __user *)(long)attr->addr;
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
-	int irq;
-
-	if (!irqchip_in_kernel(vcpu->kvm))
-		return -EINVAL;
-
-	if (get_user(irq, uaddr))
-		return -EFAULT;
-
-	if (!(irq_is_ppi(irq)))
-		return -EINVAL;
-
-	if (vcpu->arch.timer_cpu.enabled)
-		return -EBUSY;
-
-	switch (attr->attr) {
-	case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
-		set_timer_irqs(vcpu->kvm, irq, ptimer->irq.irq);
-		break;
-	case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
-		set_timer_irqs(vcpu->kvm, vtimer->irq.irq, irq);
-		break;
-	default:
-		return -ENXIO;
-	}
-
-	return 0;
-}
-
-int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
-{
-	int __user *uaddr = (int __user *)(long)attr->addr;
-	struct arch_timer_context *timer;
-	int irq;
-
-	switch (attr->attr) {
-	case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
-		timer = vcpu_vtimer(vcpu);
-		break;
-	case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
-		timer = vcpu_ptimer(vcpu);
-		break;
-	default:
-		return -ENXIO;
-	}
-
-	irq = timer->irq.irq;
-	return put_user(irq, uaddr);
-}
-
-int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
-{
-	switch (attr->attr) {
-	case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
-	case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
-		return 0;
-	}
-
-	return -ENXIO;
-}
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
deleted file mode 100644
index 9e350fd34504..000000000000
--- a/virt/kvm/arm/arm.c
+++ /dev/null
@@ -1,1711 +0,0 @@
-/*
- * Copyright (C) 2012 - Virtual Open Systems and Columbia University
- * Author: Christoffer Dall <c.dall@virtualopensystems.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
- */
-
-#include <linux/bug.h>
-#include <linux/cpu_pm.h>
-#include <linux/errno.h>
-#include <linux/err.h>
-#include <linux/kvm_host.h>
-#include <linux/list.h>
-#include <linux/module.h>
-#include <linux/vmalloc.h>
-#include <linux/fs.h>
-#include <linux/mman.h>
-#include <linux/sched.h>
-#include <linux/kvm.h>
-#include <linux/kvm_irqfd.h>
-#include <linux/irqbypass.h>
-#include <linux/sched/stat.h>
-#include <trace/events/kvm.h>
-#include <kvm/arm_pmu.h>
-#include <kvm/arm_psci.h>
-
-#define CREATE_TRACE_POINTS
-#include "trace.h"
-
-#include <linux/uaccess.h>
-#include <asm/ptrace.h>
-#include <asm/mman.h>
-#include <asm/tlbflush.h>
-#include <asm/cacheflush.h>
-#include <asm/cpufeature.h>
-#include <asm/virt.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_asm.h>
-#include <asm/kvm_mmu.h>
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_coproc.h>
-#include <asm/sections.h>
-
-#ifdef REQUIRES_VIRT
-__asm__(".arch_extension	virt");
-#endif
-
-DEFINE_PER_CPU(kvm_cpu_context_t, kvm_host_cpu_state);
-static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
-
-/* Per-CPU variable containing the currently running vcpu. */
-static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_arm_running_vcpu);
-
-/* The VMID used in the VTTBR */
-static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
-static u32 kvm_next_vmid;
-static unsigned int kvm_vmid_bits __read_mostly;
-static DEFINE_SPINLOCK(kvm_vmid_lock);
-
-static bool vgic_present;
-
-static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled);
-
-static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
-{
-	__this_cpu_write(kvm_arm_running_vcpu, vcpu);
-}
-
-DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
-
-/**
- * kvm_arm_get_running_vcpu - get the vcpu running on the current CPU.
- * Must be called from non-preemptible context
- */
-struct kvm_vcpu *kvm_arm_get_running_vcpu(void)
-{
-	return __this_cpu_read(kvm_arm_running_vcpu);
-}
-
-/**
- * kvm_arm_get_running_vcpus - get the per-CPU array of currently running vcpus.
- */
-struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
-{
-	return &kvm_arm_running_vcpu;
-}
-
-int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
-{
-	return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
-}
-
-int kvm_arch_hardware_setup(void)
-{
-	return 0;
-}
-
-void kvm_arch_check_processor_compat(void *rtn)
-{
-	*(int *)rtn = 0;
-}
-
-
-/**
- * kvm_arch_init_vm - initializes a VM data structure
- * @kvm:	pointer to the KVM struct
- */
-int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
-{
-	int ret, cpu;
-
-	ret = kvm_arm_setup_stage2(kvm, type);
-	if (ret)
-		return ret;
-
-	kvm->arch.last_vcpu_ran = alloc_percpu(typeof(*kvm->arch.last_vcpu_ran));
-	if (!kvm->arch.last_vcpu_ran)
-		return -ENOMEM;
-
-	for_each_possible_cpu(cpu)
-		*per_cpu_ptr(kvm->arch.last_vcpu_ran, cpu) = -1;
-
-	ret = kvm_alloc_stage2_pgd(kvm);
-	if (ret)
-		goto out_fail_alloc;
-
-	ret = create_hyp_mappings(kvm, kvm + 1, PAGE_HYP);
-	if (ret)
-		goto out_free_stage2_pgd;
-
-	kvm_vgic_early_init(kvm);
-
-	/* Mark the initial VMID generation invalid */
-	kvm->arch.vmid_gen = 0;
-
-	/* The maximum number of VCPUs is limited by the host's GIC model */
-	kvm->arch.max_vcpus = vgic_present ?
-				kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS;
-
-	return ret;
-out_free_stage2_pgd:
-	kvm_free_stage2_pgd(kvm);
-out_fail_alloc:
-	free_percpu(kvm->arch.last_vcpu_ran);
-	kvm->arch.last_vcpu_ran = NULL;
-	return ret;
-}
-
-bool kvm_arch_has_vcpu_debugfs(void)
-{
-	return false;
-}
-
-int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
-{
-	return 0;
-}
-
-vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
-{
-	return VM_FAULT_SIGBUS;
-}
-
-
-/**
- * kvm_arch_destroy_vm - destroy the VM data structure
- * @kvm:	pointer to the KVM struct
- */
-void kvm_arch_destroy_vm(struct kvm *kvm)
-{
-	int i;
-
-	kvm_vgic_destroy(kvm);
-
-	free_percpu(kvm->arch.last_vcpu_ran);
-	kvm->arch.last_vcpu_ran = NULL;
-
-	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-		if (kvm->vcpus[i]) {
-			kvm_arch_vcpu_free(kvm->vcpus[i]);
-			kvm->vcpus[i] = NULL;
-		}
-	}
-	atomic_set(&kvm->online_vcpus, 0);
-}
-
-int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
-{
-	int r;
-	switch (ext) {
-	case KVM_CAP_IRQCHIP:
-		r = vgic_present;
-		break;
-	case KVM_CAP_IOEVENTFD:
-	case KVM_CAP_DEVICE_CTRL:
-	case KVM_CAP_USER_MEMORY:
-	case KVM_CAP_SYNC_MMU:
-	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
-	case KVM_CAP_ONE_REG:
-	case KVM_CAP_ARM_PSCI:
-	case KVM_CAP_ARM_PSCI_0_2:
-	case KVM_CAP_READONLY_MEM:
-	case KVM_CAP_MP_STATE:
-	case KVM_CAP_IMMEDIATE_EXIT:
-	case KVM_CAP_VCPU_EVENTS:
-		r = 1;
-		break;
-	case KVM_CAP_ARM_SET_DEVICE_ADDR:
-		r = 1;
-		break;
-	case KVM_CAP_NR_VCPUS:
-		r = num_online_cpus();
-		break;
-	case KVM_CAP_MAX_VCPUS:
-		r = KVM_MAX_VCPUS;
-		break;
-	case KVM_CAP_NR_MEMSLOTS:
-		r = KVM_USER_MEM_SLOTS;
-		break;
-	case KVM_CAP_MSI_DEVID:
-		if (!kvm)
-			r = -EINVAL;
-		else
-			r = kvm->arch.vgic.msis_require_devid;
-		break;
-	case KVM_CAP_ARM_USER_IRQ:
-		/*
-		 * 1: EL1_VTIMER, EL1_PTIMER, and PMU.
-		 * (bump this number if adding more devices)
-		 */
-		r = 1;
-		break;
-	default:
-		r = kvm_arch_vm_ioctl_check_extension(kvm, ext);
-		break;
-	}
-	return r;
-}
-
-long kvm_arch_dev_ioctl(struct file *filp,
-			unsigned int ioctl, unsigned long arg)
-{
-	return -EINVAL;
-}
-
-struct kvm *kvm_arch_alloc_vm(void)
-{
-	if (!has_vhe())
-		return kzalloc(sizeof(struct kvm), GFP_KERNEL);
-
-	return vzalloc(sizeof(struct kvm));
-}
-
-void kvm_arch_free_vm(struct kvm *kvm)
-{
-	if (!has_vhe())
-		kfree(kvm);
-	else
-		vfree(kvm);
-}
-
-struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
-{
-	int err;
-	struct kvm_vcpu *vcpu;
-
-	if (irqchip_in_kernel(kvm) && vgic_initialized(kvm)) {
-		err = -EBUSY;
-		goto out;
-	}
-
-	if (id >= kvm->arch.max_vcpus) {
-		err = -EINVAL;
-		goto out;
-	}
-
-	vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
-	if (!vcpu) {
-		err = -ENOMEM;
-		goto out;
-	}
-
-	err = kvm_vcpu_init(vcpu, kvm, id);
-	if (err)
-		goto free_vcpu;
-
-	err = create_hyp_mappings(vcpu, vcpu + 1, PAGE_HYP);
-	if (err)
-		goto vcpu_uninit;
-
-	return vcpu;
-vcpu_uninit:
-	kvm_vcpu_uninit(vcpu);
-free_vcpu:
-	kmem_cache_free(kvm_vcpu_cache, vcpu);
-out:
-	return ERR_PTR(err);
-}
-
-void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
-{
-}
-
-void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
-{
-	if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm)))
-		static_branch_dec(&userspace_irqchip_in_use);
-
-	kvm_mmu_free_memory_caches(vcpu);
-	kvm_timer_vcpu_terminate(vcpu);
-	kvm_pmu_vcpu_destroy(vcpu);
-	kvm_vcpu_uninit(vcpu);
-	kmem_cache_free(kvm_vcpu_cache, vcpu);
-}
-
-void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
-{
-	kvm_arch_vcpu_free(vcpu);
-}
-
-int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
-{
-	return kvm_timer_is_pending(vcpu);
-}
-
-void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
-{
-	kvm_timer_schedule(vcpu);
-	kvm_vgic_v4_enable_doorbell(vcpu);
-}
-
-void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
-{
-	kvm_timer_unschedule(vcpu);
-	kvm_vgic_v4_disable_doorbell(vcpu);
-}
-
-int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
-{
-	/* Force users to call KVM_ARM_VCPU_INIT */
-	vcpu->arch.target = -1;
-	bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);
-
-	/* Set up the timer */
-	kvm_timer_vcpu_init(vcpu);
-
-	kvm_arm_reset_debug_ptr(vcpu);
-
-	return kvm_vgic_vcpu_init(vcpu);
-}
-
-void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
-{
-	int *last_ran;
-
-	last_ran = this_cpu_ptr(vcpu->kvm->arch.last_vcpu_ran);
-
-	/*
-	 * We might get preempted before the vCPU actually runs, but
-	 * over-invalidation doesn't affect correctness.
-	 */
-	if (*last_ran != vcpu->vcpu_id) {
-		kvm_call_hyp(__kvm_tlb_flush_local_vmid, vcpu);
-		*last_ran = vcpu->vcpu_id;
-	}
-
-	vcpu->cpu = cpu;
-	vcpu->arch.host_cpu_context = this_cpu_ptr(&kvm_host_cpu_state);
-
-	kvm_arm_set_running_vcpu(vcpu);
-	kvm_vgic_load(vcpu);
-	kvm_timer_vcpu_load(vcpu);
-	kvm_vcpu_load_sysregs(vcpu);
-	kvm_arch_vcpu_load_fp(vcpu);
-
-	if (single_task_running())
-		vcpu_clear_wfe_traps(vcpu);
-	else
-		vcpu_set_wfe_traps(vcpu);
-}
-
-void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
-{
-	kvm_arch_vcpu_put_fp(vcpu);
-	kvm_vcpu_put_sysregs(vcpu);
-	kvm_timer_vcpu_put(vcpu);
-	kvm_vgic_put(vcpu);
-
-	vcpu->cpu = -1;
-
-	kvm_arm_set_running_vcpu(NULL);
-}
-
-static void vcpu_power_off(struct kvm_vcpu *vcpu)
-{
-	vcpu->arch.power_off = true;
-	kvm_make_request(KVM_REQ_SLEEP, vcpu);
-	kvm_vcpu_kick(vcpu);
-}
-
-int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
-				    struct kvm_mp_state *mp_state)
-{
-	if (vcpu->arch.power_off)
-		mp_state->mp_state = KVM_MP_STATE_STOPPED;
-	else
-		mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
-
-	return 0;
-}
-
-int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
-				    struct kvm_mp_state *mp_state)
-{
-	int ret = 0;
-
-	switch (mp_state->mp_state) {
-	case KVM_MP_STATE_RUNNABLE:
-		vcpu->arch.power_off = false;
-		break;
-	case KVM_MP_STATE_STOPPED:
-		vcpu_power_off(vcpu);
-		break;
-	default:
-		ret = -EINVAL;
-	}
-
-	return ret;
-}
-
-/**
- * kvm_arch_vcpu_runnable - determine if the vcpu can be scheduled
- * @v:		The VCPU pointer
- *
- * If the guest CPU is not waiting for interrupts or an interrupt line is
- * asserted, the CPU is by definition runnable.
- */
-int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
-{
-	bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF);
-	return ((irq_lines || kvm_vgic_vcpu_pending_irq(v))
-		&& !v->arch.power_off && !v->arch.pause);
-}
-
-bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
-{
-	return vcpu_mode_priv(vcpu);
-}
-
-/* Just ensure a guest exit from a particular CPU */
-static void exit_vm_noop(void *info)
-{
-}
-
-void force_vm_exit(const cpumask_t *mask)
-{
-	preempt_disable();
-	smp_call_function_many(mask, exit_vm_noop, NULL, true);
-	preempt_enable();
-}
-
-/**
- * need_new_vmid_gen - check that the VMID is still valid
- * @kvm: The VM's VMID to check
- *
- * return true if there is a new generation of VMIDs being used
- *
- * The hardware supports only 256 values with the value zero reserved for the
- * host, so we check if an assigned value belongs to a previous generation,
- * which which requires us to assign a new value. If we're the first to use a
- * VMID for the new generation, we must flush necessary caches and TLBs on all
- * CPUs.
- */
-static bool need_new_vmid_gen(struct kvm *kvm)
-{
-	u64 current_vmid_gen = atomic64_read(&kvm_vmid_gen);
-	smp_rmb(); /* Orders read of kvm_vmid_gen and kvm->arch.vmid */
-	return unlikely(READ_ONCE(kvm->arch.vmid_gen) != current_vmid_gen);
-}
-
-/**
- * update_vttbr - Update the VTTBR with a valid VMID before the guest runs
- * @kvm	The guest that we are about to run
- *
- * Called from kvm_arch_vcpu_ioctl_run before entering the guest to ensure the
- * VM has a valid VMID, otherwise assigns a new one and flushes corresponding
- * caches and TLBs.
- */
-static void update_vttbr(struct kvm *kvm)
-{
-	phys_addr_t pgd_phys;
-	u64 vmid, cnp = kvm_cpu_has_cnp() ? VTTBR_CNP_BIT : 0;
-
-	if (!need_new_vmid_gen(kvm))
-		return;
-
-	spin_lock(&kvm_vmid_lock);
-
-	/*
-	 * We need to re-check the vmid_gen here to ensure that if another vcpu
-	 * already allocated a valid vmid for this vm, then this vcpu should
-	 * use the same vmid.
-	 */
-	if (!need_new_vmid_gen(kvm)) {
-		spin_unlock(&kvm_vmid_lock);
-		return;
-	}
-
-	/* First user of a new VMID generation? */
-	if (unlikely(kvm_next_vmid == 0)) {
-		atomic64_inc(&kvm_vmid_gen);
-		kvm_next_vmid = 1;
-
-		/*
-		 * On SMP we know no other CPUs can use this CPU's or each
-		 * other's VMID after force_vm_exit returns since the
-		 * kvm_vmid_lock blocks them from reentry to the guest.
-		 */
-		force_vm_exit(cpu_all_mask);
-		/*
-		 * Now broadcast TLB + ICACHE invalidation over the inner
-		 * shareable domain to make sure all data structures are
-		 * clean.
-		 */
-		kvm_call_hyp(__kvm_flush_vm_context);
-	}
-
-	kvm->arch.vmid = kvm_next_vmid;
-	kvm_next_vmid++;
-	kvm_next_vmid &= (1 << kvm_vmid_bits) - 1;
-
-	/* update vttbr to be used with the new vmid */
-	pgd_phys = virt_to_phys(kvm->arch.pgd);
-	BUG_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm));
-	vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK(kvm_vmid_bits);
-	kvm->arch.vttbr = kvm_phys_to_vttbr(pgd_phys) | vmid | cnp;
-
-	smp_wmb();
-	WRITE_ONCE(kvm->arch.vmid_gen, atomic64_read(&kvm_vmid_gen));
-
-	spin_unlock(&kvm_vmid_lock);
-}
-
-static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
-{
-	struct kvm *kvm = vcpu->kvm;
-	int ret = 0;
-
-	if (likely(vcpu->arch.has_run_once))
-		return 0;
-
-	vcpu->arch.has_run_once = true;
-
-	if (likely(irqchip_in_kernel(kvm))) {
-		/*
-		 * Map the VGIC hardware resources before running a vcpu the
-		 * first time on this VM.
-		 */
-		if (unlikely(!vgic_ready(kvm))) {
-			ret = kvm_vgic_map_resources(kvm);
-			if (ret)
-				return ret;
-		}
-	} else {
-		/*
-		 * Tell the rest of the code that there are userspace irqchip
-		 * VMs in the wild.
-		 */
-		static_branch_inc(&userspace_irqchip_in_use);
-	}
-
-	ret = kvm_timer_enable(vcpu);
-	if (ret)
-		return ret;
-
-	ret = kvm_arm_pmu_v3_enable(vcpu);
-
-	return ret;
-}
-
-bool kvm_arch_intc_initialized(struct kvm *kvm)
-{
-	return vgic_initialized(kvm);
-}
-
-void kvm_arm_halt_guest(struct kvm *kvm)
-{
-	int i;
-	struct kvm_vcpu *vcpu;
-
-	kvm_for_each_vcpu(i, vcpu, kvm)
-		vcpu->arch.pause = true;
-	kvm_make_all_cpus_request(kvm, KVM_REQ_SLEEP);
-}
-
-void kvm_arm_resume_guest(struct kvm *kvm)
-{
-	int i;
-	struct kvm_vcpu *vcpu;
-
-	kvm_for_each_vcpu(i, vcpu, kvm) {
-		vcpu->arch.pause = false;
-		swake_up_one(kvm_arch_vcpu_wq(vcpu));
-	}
-}
-
-static void vcpu_req_sleep(struct kvm_vcpu *vcpu)
-{
-	struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
-
-	swait_event_interruptible_exclusive(*wq, ((!vcpu->arch.power_off) &&
-				       (!vcpu->arch.pause)));
-
-	if (vcpu->arch.power_off || vcpu->arch.pause) {
-		/* Awaken to handle a signal, request we sleep again later. */
-		kvm_make_request(KVM_REQ_SLEEP, vcpu);
-	}
-}
-
-static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu)
-{
-	return vcpu->arch.target >= 0;
-}
-
-static void check_vcpu_requests(struct kvm_vcpu *vcpu)
-{
-	if (kvm_request_pending(vcpu)) {
-		if (kvm_check_request(KVM_REQ_SLEEP, vcpu))
-			vcpu_req_sleep(vcpu);
-
-		/*
-		 * Clear IRQ_PENDING requests that were made to guarantee
-		 * that a VCPU sees new virtual interrupts.
-		 */
-		kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu);
-	}
-}
-
-/**
- * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
- * @vcpu:	The VCPU pointer
- * @run:	The kvm_run structure pointer used for userspace state exchange
- *
- * This function is called through the VCPU_RUN ioctl called from user space. It
- * will execute VM code in a loop until the time slice for the process is used
- * or some emulation is needed from user space in which case the function will
- * return with return value 0 and with the kvm_run structure filled in with the
- * required data for the requested emulation.
- */
-int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
-{
-	int ret;
-
-	if (unlikely(!kvm_vcpu_initialized(vcpu)))
-		return -ENOEXEC;
-
-	ret = kvm_vcpu_first_run_init(vcpu);
-	if (ret)
-		return ret;
-
-	if (run->exit_reason == KVM_EXIT_MMIO) {
-		ret = kvm_handle_mmio_return(vcpu, vcpu->run);
-		if (ret)
-			return ret;
-	}
-
-	if (run->immediate_exit)
-		return -EINTR;
-
-	vcpu_load(vcpu);
-
-	kvm_sigset_activate(vcpu);
-
-	ret = 1;
-	run->exit_reason = KVM_EXIT_UNKNOWN;
-	while (ret > 0) {
-		/*
-		 * Check conditions before entering the guest
-		 */
-		cond_resched();
-
-		update_vttbr(vcpu->kvm);
-
-		check_vcpu_requests(vcpu);
-
-		/*
-		 * Preparing the interrupts to be injected also
-		 * involves poking the GIC, which must be done in a
-		 * non-preemptible context.
-		 */
-		preempt_disable();
-
-		kvm_pmu_flush_hwstate(vcpu);
-
-		local_irq_disable();
-
-		kvm_vgic_flush_hwstate(vcpu);
-
-		/*
-		 * Exit if we have a signal pending so that we can deliver the
-		 * signal to user space.
-		 */
-		if (signal_pending(current)) {
-			ret = -EINTR;
-			run->exit_reason = KVM_EXIT_INTR;
-		}
-
-		/*
-		 * If we're using a userspace irqchip, then check if we need
-		 * to tell a userspace irqchip about timer or PMU level
-		 * changes and if so, exit to userspace (the actual level
-		 * state gets updated in kvm_timer_update_run and
-		 * kvm_pmu_update_run below).
-		 */
-		if (static_branch_unlikely(&userspace_irqchip_in_use)) {
-			if (kvm_timer_should_notify_user(vcpu) ||
-			    kvm_pmu_should_notify_user(vcpu)) {
-				ret = -EINTR;
-				run->exit_reason = KVM_EXIT_INTR;
-			}
-		}
-
-		/*
-		 * Ensure we set mode to IN_GUEST_MODE after we disable
-		 * interrupts and before the final VCPU requests check.
-		 * See the comment in kvm_vcpu_exiting_guest_mode() and
-		 * Documentation/virtual/kvm/vcpu-requests.rst
-		 */
-		smp_store_mb(vcpu->mode, IN_GUEST_MODE);
-
-		if (ret <= 0 || need_new_vmid_gen(vcpu->kvm) ||
-		    kvm_request_pending(vcpu)) {
-			vcpu->mode = OUTSIDE_GUEST_MODE;
-			isb(); /* Ensure work in x_flush_hwstate is committed */
-			kvm_pmu_sync_hwstate(vcpu);
-			if (static_branch_unlikely(&userspace_irqchip_in_use))
-				kvm_timer_sync_hwstate(vcpu);
-			kvm_vgic_sync_hwstate(vcpu);
-			local_irq_enable();
-			preempt_enable();
-			continue;
-		}
-
-		kvm_arm_setup_debug(vcpu);
-
-		/**************************************************************
-		 * Enter the guest
-		 */
-		trace_kvm_entry(*vcpu_pc(vcpu));
-		guest_enter_irqoff();
-
-		if (has_vhe()) {
-			kvm_arm_vhe_guest_enter();
-			ret = kvm_vcpu_run_vhe(vcpu);
-			kvm_arm_vhe_guest_exit();
-		} else {
-			ret = kvm_call_hyp(__kvm_vcpu_run_nvhe, vcpu);
-		}
-
-		vcpu->mode = OUTSIDE_GUEST_MODE;
-		vcpu->stat.exits++;
-		/*
-		 * Back from guest
-		 *************************************************************/
-
-		kvm_arm_clear_debug(vcpu);
-
-		/*
-		 * We must sync the PMU state before the vgic state so
-		 * that the vgic can properly sample the updated state of the
-		 * interrupt line.
-		 */
-		kvm_pmu_sync_hwstate(vcpu);
-
-		/*
-		 * Sync the vgic state before syncing the timer state because
-		 * the timer code needs to know if the virtual timer
-		 * interrupts are active.
-		 */
-		kvm_vgic_sync_hwstate(vcpu);
-
-		/*
-		 * Sync the timer hardware state before enabling interrupts as
-		 * we don't want vtimer interrupts to race with syncing the
-		 * timer virtual interrupt state.
-		 */
-		if (static_branch_unlikely(&userspace_irqchip_in_use))
-			kvm_timer_sync_hwstate(vcpu);
-
-		kvm_arch_vcpu_ctxsync_fp(vcpu);
-
-		/*
-		 * We may have taken a host interrupt in HYP mode (ie
-		 * while executing the guest). This interrupt is still
-		 * pending, as we haven't serviced it yet!
-		 *
-		 * We're now back in SVC mode, with interrupts
-		 * disabled.  Enabling the interrupts now will have
-		 * the effect of taking the interrupt again, in SVC
-		 * mode this time.
-		 */
-		local_irq_enable();
-
-		/*
-		 * We do local_irq_enable() before calling guest_exit() so
-		 * that if a timer interrupt hits while running the guest we
-		 * account that tick as being spent in the guest.  We enable
-		 * preemption after calling guest_exit() so that if we get
-		 * preempted we make sure ticks after that is not counted as
-		 * guest time.
-		 */
-		guest_exit();
-		trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
-
-		/* Exit types that need handling before we can be preempted */
-		handle_exit_early(vcpu, run, ret);
-
-		preempt_enable();
-
-		ret = handle_exit(vcpu, run, ret);
-	}
-
-	/* Tell userspace about in-kernel device output levels */
-	if (unlikely(!irqchip_in_kernel(vcpu->kvm))) {
-		kvm_timer_update_run(vcpu);
-		kvm_pmu_update_run(vcpu);
-	}
-
-	kvm_sigset_deactivate(vcpu);
-
-	vcpu_put(vcpu);
-	return ret;
-}
-
-static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level)
-{
-	int bit_index;
-	bool set;
-	unsigned long *hcr;
-
-	if (number == KVM_ARM_IRQ_CPU_IRQ)
-		bit_index = __ffs(HCR_VI);
-	else /* KVM_ARM_IRQ_CPU_FIQ */
-		bit_index = __ffs(HCR_VF);
-
-	hcr = vcpu_hcr(vcpu);
-	if (level)
-		set = test_and_set_bit(bit_index, hcr);
-	else
-		set = test_and_clear_bit(bit_index, hcr);
-
-	/*
-	 * If we didn't change anything, no need to wake up or kick other CPUs
-	 */
-	if (set == level)
-		return 0;
-
-	/*
-	 * The vcpu irq_lines field was updated, wake up sleeping VCPUs and
-	 * trigger a world-switch round on the running physical CPU to set the
-	 * virtual IRQ/FIQ fields in the HCR appropriately.
-	 */
-	kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
-	kvm_vcpu_kick(vcpu);
-
-	return 0;
-}
-
-int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
-			  bool line_status)
-{
-	u32 irq = irq_level->irq;
-	unsigned int irq_type, vcpu_idx, irq_num;
-	int nrcpus = atomic_read(&kvm->online_vcpus);
-	struct kvm_vcpu *vcpu = NULL;
-	bool level = irq_level->level;
-
-	irq_type = (irq >> KVM_ARM_IRQ_TYPE_SHIFT) & KVM_ARM_IRQ_TYPE_MASK;
-	vcpu_idx = (irq >> KVM_ARM_IRQ_VCPU_SHIFT) & KVM_ARM_IRQ_VCPU_MASK;
-	irq_num = (irq >> KVM_ARM_IRQ_NUM_SHIFT) & KVM_ARM_IRQ_NUM_MASK;
-
-	trace_kvm_irq_line(irq_type, vcpu_idx, irq_num, irq_level->level);
-
-	switch (irq_type) {
-	case KVM_ARM_IRQ_TYPE_CPU:
-		if (irqchip_in_kernel(kvm))
-			return -ENXIO;
-
-		if (vcpu_idx >= nrcpus)
-			return -EINVAL;
-
-		vcpu = kvm_get_vcpu(kvm, vcpu_idx);
-		if (!vcpu)
-			return -EINVAL;
-
-		if (irq_num > KVM_ARM_IRQ_CPU_FIQ)
-			return -EINVAL;
-
-		return vcpu_interrupt_line(vcpu, irq_num, level);
-	case KVM_ARM_IRQ_TYPE_PPI:
-		if (!irqchip_in_kernel(kvm))
-			return -ENXIO;
-
-		if (vcpu_idx >= nrcpus)
-			return -EINVAL;
-
-		vcpu = kvm_get_vcpu(kvm, vcpu_idx);
-		if (!vcpu)
-			return -EINVAL;
-
-		if (irq_num < VGIC_NR_SGIS || irq_num >= VGIC_NR_PRIVATE_IRQS)
-			return -EINVAL;
-
-		return kvm_vgic_inject_irq(kvm, vcpu->vcpu_id, irq_num, level, NULL);
-	case KVM_ARM_IRQ_TYPE_SPI:
-		if (!irqchip_in_kernel(kvm))
-			return -ENXIO;
-
-		if (irq_num < VGIC_NR_PRIVATE_IRQS)
-			return -EINVAL;
-
-		return kvm_vgic_inject_irq(kvm, 0, irq_num, level, NULL);
-	}
-
-	return -EINVAL;
-}
-
-static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
-			       const struct kvm_vcpu_init *init)
-{
-	unsigned int i;
-	int phys_target = kvm_target_cpu();
-
-	if (init->target != phys_target)
-		return -EINVAL;
-
-	/*
-	 * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must
-	 * use the same target.
-	 */
-	if (vcpu->arch.target != -1 && vcpu->arch.target != init->target)
-		return -EINVAL;
-
-	/* -ENOENT for unknown features, -EINVAL for invalid combinations. */
-	for (i = 0; i < sizeof(init->features) * 8; i++) {
-		bool set = (init->features[i / 32] & (1 << (i % 32)));
-
-		if (set && i >= KVM_VCPU_MAX_FEATURES)
-			return -ENOENT;
-
-		/*
-		 * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must
-		 * use the same feature set.
-		 */
-		if (vcpu->arch.target != -1 && i < KVM_VCPU_MAX_FEATURES &&
-		    test_bit(i, vcpu->arch.features) != set)
-			return -EINVAL;
-
-		if (set)
-			set_bit(i, vcpu->arch.features);
-	}
-
-	vcpu->arch.target = phys_target;
-
-	/* Now we know what it is, we can reset it. */
-	return kvm_reset_vcpu(vcpu);
-}
-
-
-static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
-					 struct kvm_vcpu_init *init)
-{
-	int ret;
-
-	ret = kvm_vcpu_set_target(vcpu, init);
-	if (ret)
-		return ret;
-
-	/*
-	 * Ensure a rebooted VM will fault in RAM pages and detect if the
-	 * guest MMU is turned off and flush the caches as needed.
-	 */
-	if (vcpu->arch.has_run_once)
-		stage2_unmap_vm(vcpu->kvm);
-
-	vcpu_reset_hcr(vcpu);
-
-	/*
-	 * Handle the "start in power-off" case.
-	 */
-	if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features))
-		vcpu_power_off(vcpu);
-	else
-		vcpu->arch.power_off = false;
-
-	return 0;
-}
-
-static int kvm_arm_vcpu_set_attr(struct kvm_vcpu *vcpu,
-				 struct kvm_device_attr *attr)
-{
-	int ret = -ENXIO;
-
-	switch (attr->group) {
-	default:
-		ret = kvm_arm_vcpu_arch_set_attr(vcpu, attr);
-		break;
-	}
-
-	return ret;
-}
-
-static int kvm_arm_vcpu_get_attr(struct kvm_vcpu *vcpu,
-				 struct kvm_device_attr *attr)
-{
-	int ret = -ENXIO;
-
-	switch (attr->group) {
-	default:
-		ret = kvm_arm_vcpu_arch_get_attr(vcpu, attr);
-		break;
-	}
-
-	return ret;
-}
-
-static int kvm_arm_vcpu_has_attr(struct kvm_vcpu *vcpu,
-				 struct kvm_device_attr *attr)
-{
-	int ret = -ENXIO;
-
-	switch (attr->group) {
-	default:
-		ret = kvm_arm_vcpu_arch_has_attr(vcpu, attr);
-		break;
-	}
-
-	return ret;
-}
-
-static int kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
-				   struct kvm_vcpu_events *events)
-{
-	memset(events, 0, sizeof(*events));
-
-	return __kvm_arm_vcpu_get_events(vcpu, events);
-}
-
-static int kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
-				   struct kvm_vcpu_events *events)
-{
-	int i;
-
-	/* check whether the reserved field is zero */
-	for (i = 0; i < ARRAY_SIZE(events->reserved); i++)
-		if (events->reserved[i])
-			return -EINVAL;
-
-	/* check whether the pad field is zero */
-	for (i = 0; i < ARRAY_SIZE(events->exception.pad); i++)
-		if (events->exception.pad[i])
-			return -EINVAL;
-
-	return __kvm_arm_vcpu_set_events(vcpu, events);
-}
-
-long kvm_arch_vcpu_ioctl(struct file *filp,
-			 unsigned int ioctl, unsigned long arg)
-{
-	struct kvm_vcpu *vcpu = filp->private_data;
-	void __user *argp = (void __user *)arg;
-	struct kvm_device_attr attr;
-	long r;
-
-	switch (ioctl) {
-	case KVM_ARM_VCPU_INIT: {
-		struct kvm_vcpu_init init;
-
-		r = -EFAULT;
-		if (copy_from_user(&init, argp, sizeof(init)))
-			break;
-
-		r = kvm_arch_vcpu_ioctl_vcpu_init(vcpu, &init);
-		break;
-	}
-	case KVM_SET_ONE_REG:
-	case KVM_GET_ONE_REG: {
-		struct kvm_one_reg reg;
-
-		r = -ENOEXEC;
-		if (unlikely(!kvm_vcpu_initialized(vcpu)))
-			break;
-
-		r = -EFAULT;
-		if (copy_from_user(&reg, argp, sizeof(reg)))
-			break;
-
-		if (ioctl == KVM_SET_ONE_REG)
-			r = kvm_arm_set_reg(vcpu, &reg);
-		else
-			r = kvm_arm_get_reg(vcpu, &reg);
-		break;
-	}
-	case KVM_GET_REG_LIST: {
-		struct kvm_reg_list __user *user_list = argp;
-		struct kvm_reg_list reg_list;
-		unsigned n;
-
-		r = -ENOEXEC;
-		if (unlikely(!kvm_vcpu_initialized(vcpu)))
-			break;
-
-		r = -EFAULT;
-		if (copy_from_user(&reg_list, user_list, sizeof(reg_list)))
-			break;
-		n = reg_list.n;
-		reg_list.n = kvm_arm_num_regs(vcpu);
-		if (copy_to_user(user_list, &reg_list, sizeof(reg_list)))
-			break;
-		r = -E2BIG;
-		if (n < reg_list.n)
-			break;
-		r = kvm_arm_copy_reg_indices(vcpu, user_list->reg);
-		break;
-	}
-	case KVM_SET_DEVICE_ATTR: {
-		r = -EFAULT;
-		if (copy_from_user(&attr, argp, sizeof(attr)))
-			break;
-		r = kvm_arm_vcpu_set_attr(vcpu, &attr);
-		break;
-	}
-	case KVM_GET_DEVICE_ATTR: {
-		r = -EFAULT;
-		if (copy_from_user(&attr, argp, sizeof(attr)))
-			break;
-		r = kvm_arm_vcpu_get_attr(vcpu, &attr);
-		break;
-	}
-	case KVM_HAS_DEVICE_ATTR: {
-		r = -EFAULT;
-		if (copy_from_user(&attr, argp, sizeof(attr)))
-			break;
-		r = kvm_arm_vcpu_has_attr(vcpu, &attr);
-		break;
-	}
-	case KVM_GET_VCPU_EVENTS: {
-		struct kvm_vcpu_events events;
-
-		if (kvm_arm_vcpu_get_events(vcpu, &events))
-			return -EINVAL;
-
-		if (copy_to_user(argp, &events, sizeof(events)))
-			return -EFAULT;
-
-		return 0;
-	}
-	case KVM_SET_VCPU_EVENTS: {
-		struct kvm_vcpu_events events;
-
-		if (copy_from_user(&events, argp, sizeof(events)))
-			return -EFAULT;
-
-		return kvm_arm_vcpu_set_events(vcpu, &events);
-	}
-	default:
-		r = -EINVAL;
-	}
-
-	return r;
-}
-
-/**
- * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
- * @kvm: kvm instance
- * @log: slot id and address to which we copy the log
- *
- * Steps 1-4 below provide general overview of dirty page logging. See
- * kvm_get_dirty_log_protect() function description for additional details.
- *
- * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
- * always flush the TLB (step 4) even if previous step failed  and the dirty
- * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
- * does not preclude user space subsequent dirty log read. Flushing TLB ensures
- * writes will be marked dirty for next log read.
- *
- *   1. Take a snapshot of the bit and clear it if needed.
- *   2. Write protect the corresponding page.
- *   3. Copy the snapshot to the userspace.
- *   4. Flush TLB's if needed.
- */
-int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
-{
-	bool flush = false;
-	int r;
-
-	mutex_lock(&kvm->slots_lock);
-
-	r = kvm_get_dirty_log_protect(kvm, log, &flush);
-
-	if (flush)
-		kvm_flush_remote_tlbs(kvm);
-
-	mutex_unlock(&kvm->slots_lock);
-	return r;
-}
-
-int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log)
-{
-	bool flush = false;
-	int r;
-
-	mutex_lock(&kvm->slots_lock);
-
-	r = kvm_clear_dirty_log_protect(kvm, log, &flush);
-
-	if (flush)
-		kvm_flush_remote_tlbs(kvm);
-
-	mutex_unlock(&kvm->slots_lock);
-	return r;
-}
-
-static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
-					struct kvm_arm_device_addr *dev_addr)
-{
-	unsigned long dev_id, type;
-
-	dev_id = (dev_addr->id & KVM_ARM_DEVICE_ID_MASK) >>
-		KVM_ARM_DEVICE_ID_SHIFT;
-	type = (dev_addr->id & KVM_ARM_DEVICE_TYPE_MASK) >>
-		KVM_ARM_DEVICE_TYPE_SHIFT;
-
-	switch (dev_id) {
-	case KVM_ARM_DEVICE_VGIC_V2:
-		if (!vgic_present)
-			return -ENXIO;
-		return kvm_vgic_addr(kvm, type, &dev_addr->addr, true);
-	default:
-		return -ENODEV;
-	}
-}
-
-long kvm_arch_vm_ioctl(struct file *filp,
-		       unsigned int ioctl, unsigned long arg)
-{
-	struct kvm *kvm = filp->private_data;
-	void __user *argp = (void __user *)arg;
-
-	switch (ioctl) {
-	case KVM_CREATE_IRQCHIP: {
-		int ret;
-		if (!vgic_present)
-			return -ENXIO;
-		mutex_lock(&kvm->lock);
-		ret = kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
-		mutex_unlock(&kvm->lock);
-		return ret;
-	}
-	case KVM_ARM_SET_DEVICE_ADDR: {
-		struct kvm_arm_device_addr dev_addr;
-
-		if (copy_from_user(&dev_addr, argp, sizeof(dev_addr)))
-			return -EFAULT;
-		return kvm_vm_ioctl_set_device_addr(kvm, &dev_addr);
-	}
-	case KVM_ARM_PREFERRED_TARGET: {
-		int err;
-		struct kvm_vcpu_init init;
-
-		err = kvm_vcpu_preferred_target(&init);
-		if (err)
-			return err;
-
-		if (copy_to_user(argp, &init, sizeof(init)))
-			return -EFAULT;
-
-		return 0;
-	}
-	default:
-		return -EINVAL;
-	}
-}
-
-static void cpu_init_hyp_mode(void *dummy)
-{
-	phys_addr_t pgd_ptr;
-	unsigned long hyp_stack_ptr;
-	unsigned long stack_page;
-	unsigned long vector_ptr;
-
-	/* Switch from the HYP stub to our own HYP init vector */
-	__hyp_set_vectors(kvm_get_idmap_vector());
-
-	pgd_ptr = kvm_mmu_get_httbr();
-	stack_page = __this_cpu_read(kvm_arm_hyp_stack_page);
-	hyp_stack_ptr = stack_page + PAGE_SIZE;
-	vector_ptr = (unsigned long)kvm_get_hyp_vector();
-
-	__cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr);
-	__cpu_init_stage2();
-}
-
-static void cpu_hyp_reset(void)
-{
-	if (!is_kernel_in_hyp_mode())
-		__hyp_reset_vectors();
-}
-
-static void cpu_hyp_reinit(void)
-{
-	cpu_hyp_reset();
-
-	if (is_kernel_in_hyp_mode())
-		kvm_timer_init_vhe();
-	else
-		cpu_init_hyp_mode(NULL);
-
-	kvm_arm_init_debug();
-
-	if (vgic_present)
-		kvm_vgic_init_cpu_hardware();
-}
-
-static void _kvm_arch_hardware_enable(void *discard)
-{
-	if (!__this_cpu_read(kvm_arm_hardware_enabled)) {
-		cpu_hyp_reinit();
-		__this_cpu_write(kvm_arm_hardware_enabled, 1);
-	}
-}
-
-int kvm_arch_hardware_enable(void)
-{
-	_kvm_arch_hardware_enable(NULL);
-	return 0;
-}
-
-static void _kvm_arch_hardware_disable(void *discard)
-{
-	if (__this_cpu_read(kvm_arm_hardware_enabled)) {
-		cpu_hyp_reset();
-		__this_cpu_write(kvm_arm_hardware_enabled, 0);
-	}
-}
-
-void kvm_arch_hardware_disable(void)
-{
-	_kvm_arch_hardware_disable(NULL);
-}
-
-#ifdef CONFIG_CPU_PM
-static int hyp_init_cpu_pm_notifier(struct notifier_block *self,
-				    unsigned long cmd,
-				    void *v)
-{
-	/*
-	 * kvm_arm_hardware_enabled is left with its old value over
-	 * PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should
-	 * re-enable hyp.
-	 */
-	switch (cmd) {
-	case CPU_PM_ENTER:
-		if (__this_cpu_read(kvm_arm_hardware_enabled))
-			/*
-			 * don't update kvm_arm_hardware_enabled here
-			 * so that the hardware will be re-enabled
-			 * when we resume. See below.
-			 */
-			cpu_hyp_reset();
-
-		return NOTIFY_OK;
-	case CPU_PM_ENTER_FAILED:
-	case CPU_PM_EXIT:
-		if (__this_cpu_read(kvm_arm_hardware_enabled))
-			/* The hardware was enabled before suspend. */
-			cpu_hyp_reinit();
-
-		return NOTIFY_OK;
-
-	default:
-		return NOTIFY_DONE;
-	}
-}
-
-static struct notifier_block hyp_init_cpu_pm_nb = {
-	.notifier_call = hyp_init_cpu_pm_notifier,
-};
-
-static void __init hyp_cpu_pm_init(void)
-{
-	cpu_pm_register_notifier(&hyp_init_cpu_pm_nb);
-}
-static void __init hyp_cpu_pm_exit(void)
-{
-	cpu_pm_unregister_notifier(&hyp_init_cpu_pm_nb);
-}
-#else
-static inline void hyp_cpu_pm_init(void)
-{
-}
-static inline void hyp_cpu_pm_exit(void)
-{
-}
-#endif
-
-static int init_common_resources(void)
-{
-	/* set size of VMID supported by CPU */
-	kvm_vmid_bits = kvm_get_vmid_bits();
-	kvm_info("%d-bit VMID\n", kvm_vmid_bits);
-
-	kvm_set_ipa_limit();
-
-	return 0;
-}
-
-static int init_subsystems(void)
-{
-	int err = 0;
-
-	/*
-	 * Enable hardware so that subsystem initialisation can access EL2.
-	 */
-	on_each_cpu(_kvm_arch_hardware_enable, NULL, 1);
-
-	/*
-	 * Register CPU lower-power notifier
-	 */
-	hyp_cpu_pm_init();
-
-	/*
-	 * Init HYP view of VGIC
-	 */
-	err = kvm_vgic_hyp_init();
-	switch (err) {
-	case 0:
-		vgic_present = true;
-		break;
-	case -ENODEV:
-	case -ENXIO:
-		vgic_present = false;
-		err = 0;
-		break;
-	default:
-		goto out;
-	}
-
-	/*
-	 * Init HYP architected timer support
-	 */
-	err = kvm_timer_hyp_init(vgic_present);
-	if (err)
-		goto out;
-
-	kvm_perf_init();
-	kvm_coproc_table_init();
-
-out:
-	on_each_cpu(_kvm_arch_hardware_disable, NULL, 1);
-
-	return err;
-}
-
-static void teardown_hyp_mode(void)
-{
-	int cpu;
-
-	free_hyp_pgds();
-	for_each_possible_cpu(cpu)
-		free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
-	hyp_cpu_pm_exit();
-}
-
-/**
- * Inits Hyp-mode on all online CPUs
- */
-static int init_hyp_mode(void)
-{
-	int cpu;
-	int err = 0;
-
-	/*
-	 * Allocate Hyp PGD and setup Hyp identity mapping
-	 */
-	err = kvm_mmu_init();
-	if (err)
-		goto out_err;
-
-	/*
-	 * Allocate stack pages for Hypervisor-mode
-	 */
-	for_each_possible_cpu(cpu) {
-		unsigned long stack_page;
-
-		stack_page = __get_free_page(GFP_KERNEL);
-		if (!stack_page) {
-			err = -ENOMEM;
-			goto out_err;
-		}
-
-		per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page;
-	}
-
-	/*
-	 * Map the Hyp-code called directly from the host
-	 */
-	err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start),
-				  kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC);
-	if (err) {
-		kvm_err("Cannot map world-switch code\n");
-		goto out_err;
-	}
-
-	err = create_hyp_mappings(kvm_ksym_ref(__start_rodata),
-				  kvm_ksym_ref(__end_rodata), PAGE_HYP_RO);
-	if (err) {
-		kvm_err("Cannot map rodata section\n");
-		goto out_err;
-	}
-
-	err = create_hyp_mappings(kvm_ksym_ref(__bss_start),
-				  kvm_ksym_ref(__bss_stop), PAGE_HYP_RO);
-	if (err) {
-		kvm_err("Cannot map bss section\n");
-		goto out_err;
-	}
-
-	err = kvm_map_vectors();
-	if (err) {
-		kvm_err("Cannot map vectors\n");
-		goto out_err;
-	}
-
-	/*
-	 * Map the Hyp stack pages
-	 */
-	for_each_possible_cpu(cpu) {
-		char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu);
-		err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE,
-					  PAGE_HYP);
-
-		if (err) {
-			kvm_err("Cannot map hyp stack\n");
-			goto out_err;
-		}
-	}
-
-	for_each_possible_cpu(cpu) {
-		kvm_cpu_context_t *cpu_ctxt;
-
-		cpu_ctxt = per_cpu_ptr(&kvm_host_cpu_state, cpu);
-		err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1, PAGE_HYP);
-
-		if (err) {
-			kvm_err("Cannot map host CPU state: %d\n", err);
-			goto out_err;
-		}
-	}
-
-	err = hyp_map_aux_data();
-	if (err)
-		kvm_err("Cannot map host auxilary data: %d\n", err);
-
-	return 0;
-
-out_err:
-	teardown_hyp_mode();
-	kvm_err("error initializing Hyp mode: %d\n", err);
-	return err;
-}
-
-static void check_kvm_target_cpu(void *ret)
-{
-	*(int *)ret = kvm_target_cpu();
-}
-
-struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr)
-{
-	struct kvm_vcpu *vcpu;
-	int i;
-
-	mpidr &= MPIDR_HWID_BITMASK;
-	kvm_for_each_vcpu(i, vcpu, kvm) {
-		if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu))
-			return vcpu;
-	}
-	return NULL;
-}
-
-bool kvm_arch_has_irq_bypass(void)
-{
-	return true;
-}
-
-int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
-				      struct irq_bypass_producer *prod)
-{
-	struct kvm_kernel_irqfd *irqfd =
-		container_of(cons, struct kvm_kernel_irqfd, consumer);
-
-	return kvm_vgic_v4_set_forwarding(irqfd->kvm, prod->irq,
-					  &irqfd->irq_entry);
-}
-void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
-				      struct irq_bypass_producer *prod)
-{
-	struct kvm_kernel_irqfd *irqfd =
-		container_of(cons, struct kvm_kernel_irqfd, consumer);
-
-	kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq,
-				     &irqfd->irq_entry);
-}
-
-void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons)
-{
-	struct kvm_kernel_irqfd *irqfd =
-		container_of(cons, struct kvm_kernel_irqfd, consumer);
-
-	kvm_arm_halt_guest(irqfd->kvm);
-}
-
-void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons)
-{
-	struct kvm_kernel_irqfd *irqfd =
-		container_of(cons, struct kvm_kernel_irqfd, consumer);
-
-	kvm_arm_resume_guest(irqfd->kvm);
-}
-
-/**
- * Initialize Hyp-mode and memory mappings on all CPUs.
- */
-int kvm_arch_init(void *opaque)
-{
-	int err;
-	int ret, cpu;
-	bool in_hyp_mode;
-
-	if (!is_hyp_mode_available()) {
-		kvm_info("HYP mode not available\n");
-		return -ENODEV;
-	}
-
-	in_hyp_mode = is_kernel_in_hyp_mode();
-
-	if (!in_hyp_mode && kvm_arch_requires_vhe()) {
-		kvm_pr_unimpl("CPU unsupported in non-VHE mode, not initializing\n");
-		return -ENODEV;
-	}
-
-	for_each_online_cpu(cpu) {
-		smp_call_function_single(cpu, check_kvm_target_cpu, &ret, 1);
-		if (ret < 0) {
-			kvm_err("Error, CPU %d not supported!\n", cpu);
-			return -ENODEV;
-		}
-	}
-
-	err = init_common_resources();
-	if (err)
-		return err;
-
-	if (!in_hyp_mode) {
-		err = init_hyp_mode();
-		if (err)
-			goto out_err;
-	}
-
-	err = init_subsystems();
-	if (err)
-		goto out_hyp;
-
-	if (in_hyp_mode)
-		kvm_info("VHE mode initialized successfully\n");
-	else
-		kvm_info("Hyp mode initialized successfully\n");
-
-	return 0;
-
-out_hyp:
-	if (!in_hyp_mode)
-		teardown_hyp_mode();
-out_err:
-	return err;
-}
-
-/* NOP: Compiling as a module not supported */
-void kvm_arch_exit(void)
-{
-	kvm_perf_teardown();
-}
-
-static int arm_init(void)
-{
-	int rc = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
-	return rc;
-}
-
-module_init(arm_init);
diff --git a/virt/kvm/arm/hyp/timer-sr.c b/virt/kvm/arm/hyp/timer-sr.c
deleted file mode 100644
index 77754a62eb0c..000000000000
--- a/virt/kvm/arm/hyp/timer-sr.c
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (C) 2012-2015 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <clocksource/arm_arch_timer.h>
-#include <linux/compiler.h>
-#include <linux/kvm_host.h>
-
-#include <asm/kvm_hyp.h>
-
-void __hyp_text __kvm_timer_set_cntvoff(u32 cntvoff_low, u32 cntvoff_high)
-{
-	u64 cntvoff = (u64)cntvoff_high << 32 | cntvoff_low;
-	write_sysreg(cntvoff, cntvoff_el2);
-}
-
-/*
- * Should only be called on non-VHE systems.
- * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe().
- */
-void __hyp_text __timer_disable_traps(struct kvm_vcpu *vcpu)
-{
-	u64 val;
-
-	/* Allow physical timer/counter access for the host */
-	val = read_sysreg(cnthctl_el2);
-	val |= CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN;
-	write_sysreg(val, cnthctl_el2);
-}
-
-/*
- * Should only be called on non-VHE systems.
- * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe().
- */
-void __hyp_text __timer_enable_traps(struct kvm_vcpu *vcpu)
-{
-	u64 val;
-
-	/*
-	 * Disallow physical timer access for the guest
-	 * Physical counter access is allowed
-	 */
-	val = read_sysreg(cnthctl_el2);
-	val &= ~CNTHCTL_EL1PCEN;
-	val |= CNTHCTL_EL1PCTEN;
-	write_sysreg(val, cnthctl_el2);
-}
diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c
deleted file mode 100644
index 9652c453480f..000000000000
--- a/virt/kvm/arm/hyp/vgic-v3-sr.c
+++ /dev/null
@@ -1,1133 +0,0 @@
-/*
- * Copyright (C) 2012-2015 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/compiler.h>
-#include <linux/irqchip/arm-gic-v3.h>
-#include <linux/kvm_host.h>
-
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_hyp.h>
-#include <asm/kvm_mmu.h>
-
-#define vtr_to_max_lr_idx(v)		((v) & 0xf)
-#define vtr_to_nr_pre_bits(v)		((((u32)(v) >> 26) & 7) + 1)
-#define vtr_to_nr_apr_regs(v)		(1 << (vtr_to_nr_pre_bits(v) - 5))
-
-static u64 __hyp_text __gic_v3_get_lr(unsigned int lr)
-{
-	switch (lr & 0xf) {
-	case 0:
-		return read_gicreg(ICH_LR0_EL2);
-	case 1:
-		return read_gicreg(ICH_LR1_EL2);
-	case 2:
-		return read_gicreg(ICH_LR2_EL2);
-	case 3:
-		return read_gicreg(ICH_LR3_EL2);
-	case 4:
-		return read_gicreg(ICH_LR4_EL2);
-	case 5:
-		return read_gicreg(ICH_LR5_EL2);
-	case 6:
-		return read_gicreg(ICH_LR6_EL2);
-	case 7:
-		return read_gicreg(ICH_LR7_EL2);
-	case 8:
-		return read_gicreg(ICH_LR8_EL2);
-	case 9:
-		return read_gicreg(ICH_LR9_EL2);
-	case 10:
-		return read_gicreg(ICH_LR10_EL2);
-	case 11:
-		return read_gicreg(ICH_LR11_EL2);
-	case 12:
-		return read_gicreg(ICH_LR12_EL2);
-	case 13:
-		return read_gicreg(ICH_LR13_EL2);
-	case 14:
-		return read_gicreg(ICH_LR14_EL2);
-	case 15:
-		return read_gicreg(ICH_LR15_EL2);
-	}
-
-	unreachable();
-}
-
-static void __hyp_text __gic_v3_set_lr(u64 val, int lr)
-{
-	switch (lr & 0xf) {
-	case 0:
-		write_gicreg(val, ICH_LR0_EL2);
-		break;
-	case 1:
-		write_gicreg(val, ICH_LR1_EL2);
-		break;
-	case 2:
-		write_gicreg(val, ICH_LR2_EL2);
-		break;
-	case 3:
-		write_gicreg(val, ICH_LR3_EL2);
-		break;
-	case 4:
-		write_gicreg(val, ICH_LR4_EL2);
-		break;
-	case 5:
-		write_gicreg(val, ICH_LR5_EL2);
-		break;
-	case 6:
-		write_gicreg(val, ICH_LR6_EL2);
-		break;
-	case 7:
-		write_gicreg(val, ICH_LR7_EL2);
-		break;
-	case 8:
-		write_gicreg(val, ICH_LR8_EL2);
-		break;
-	case 9:
-		write_gicreg(val, ICH_LR9_EL2);
-		break;
-	case 10:
-		write_gicreg(val, ICH_LR10_EL2);
-		break;
-	case 11:
-		write_gicreg(val, ICH_LR11_EL2);
-		break;
-	case 12:
-		write_gicreg(val, ICH_LR12_EL2);
-		break;
-	case 13:
-		write_gicreg(val, ICH_LR13_EL2);
-		break;
-	case 14:
-		write_gicreg(val, ICH_LR14_EL2);
-		break;
-	case 15:
-		write_gicreg(val, ICH_LR15_EL2);
-		break;
-	}
-}
-
-static void __hyp_text __vgic_v3_write_ap0rn(u32 val, int n)
-{
-	switch (n) {
-	case 0:
-		write_gicreg(val, ICH_AP0R0_EL2);
-		break;
-	case 1:
-		write_gicreg(val, ICH_AP0R1_EL2);
-		break;
-	case 2:
-		write_gicreg(val, ICH_AP0R2_EL2);
-		break;
-	case 3:
-		write_gicreg(val, ICH_AP0R3_EL2);
-		break;
-	}
-}
-
-static void __hyp_text __vgic_v3_write_ap1rn(u32 val, int n)
-{
-	switch (n) {
-	case 0:
-		write_gicreg(val, ICH_AP1R0_EL2);
-		break;
-	case 1:
-		write_gicreg(val, ICH_AP1R1_EL2);
-		break;
-	case 2:
-		write_gicreg(val, ICH_AP1R2_EL2);
-		break;
-	case 3:
-		write_gicreg(val, ICH_AP1R3_EL2);
-		break;
-	}
-}
-
-static u32 __hyp_text __vgic_v3_read_ap0rn(int n)
-{
-	u32 val;
-
-	switch (n) {
-	case 0:
-		val = read_gicreg(ICH_AP0R0_EL2);
-		break;
-	case 1:
-		val = read_gicreg(ICH_AP0R1_EL2);
-		break;
-	case 2:
-		val = read_gicreg(ICH_AP0R2_EL2);
-		break;
-	case 3:
-		val = read_gicreg(ICH_AP0R3_EL2);
-		break;
-	default:
-		unreachable();
-	}
-
-	return val;
-}
-
-static u32 __hyp_text __vgic_v3_read_ap1rn(int n)
-{
-	u32 val;
-
-	switch (n) {
-	case 0:
-		val = read_gicreg(ICH_AP1R0_EL2);
-		break;
-	case 1:
-		val = read_gicreg(ICH_AP1R1_EL2);
-		break;
-	case 2:
-		val = read_gicreg(ICH_AP1R2_EL2);
-		break;
-	case 3:
-		val = read_gicreg(ICH_AP1R3_EL2);
-		break;
-	default:
-		unreachable();
-	}
-
-	return val;
-}
-
-void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu)
-{
-	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-	u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
-
-	/*
-	 * Make sure stores to the GIC via the memory mapped interface
-	 * are now visible to the system register interface when reading the
-	 * LRs, and when reading back the VMCR on non-VHE systems.
-	 */
-	if (used_lrs || !has_vhe()) {
-		if (!cpu_if->vgic_sre) {
-			dsb(sy);
-			isb();
-		}
-	}
-
-	if (used_lrs) {
-		int i;
-		u32 elrsr;
-
-		elrsr = read_gicreg(ICH_ELSR_EL2);
-
-		write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EN, ICH_HCR_EL2);
-
-		for (i = 0; i < used_lrs; i++) {
-			if (elrsr & (1 << i))
-				cpu_if->vgic_lr[i] &= ~ICH_LR_STATE;
-			else
-				cpu_if->vgic_lr[i] = __gic_v3_get_lr(i);
-
-			__gic_v3_set_lr(0, i);
-		}
-	}
-}
-
-void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu)
-{
-	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-	u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
-	int i;
-
-	if (used_lrs) {
-		write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
-
-		for (i = 0; i < used_lrs; i++)
-			__gic_v3_set_lr(cpu_if->vgic_lr[i], i);
-	}
-
-	/*
-	 * Ensure that writes to the LRs, and on non-VHE systems ensure that
-	 * the write to the VMCR in __vgic_v3_activate_traps(), will have
-	 * reached the (re)distributors. This ensure the guest will read the
-	 * correct values from the memory-mapped interface.
-	 */
-	if (used_lrs || !has_vhe()) {
-		if (!cpu_if->vgic_sre) {
-			isb();
-			dsb(sy);
-		}
-	}
-}
-
-void __hyp_text __vgic_v3_activate_traps(struct kvm_vcpu *vcpu)
-{
-	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-
-	/*
-	 * VFIQEn is RES1 if ICC_SRE_EL1.SRE is 1. This causes a
-	 * Group0 interrupt (as generated in GICv2 mode) to be
-	 * delivered as a FIQ to the guest, with potentially fatal
-	 * consequences. So we must make sure that ICC_SRE_EL1 has
-	 * been actually programmed with the value we want before
-	 * starting to mess with the rest of the GIC, and VMCR_EL2 in
-	 * particular.  This logic must be called before
-	 * __vgic_v3_restore_state().
-	 */
-	if (!cpu_if->vgic_sre) {
-		write_gicreg(0, ICC_SRE_EL1);
-		isb();
-		write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2);
-
-
-		if (has_vhe()) {
-			/*
-			 * Ensure that the write to the VMCR will have reached
-			 * the (re)distributors. This ensure the guest will
-			 * read the correct values from the memory-mapped
-			 * interface.
-			 */
-			isb();
-			dsb(sy);
-		}
-	}
-
-	/*
-	 * Prevent the guest from touching the GIC system registers if
-	 * SRE isn't enabled for GICv3 emulation.
-	 */
-	write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE,
-		     ICC_SRE_EL2);
-
-	/*
-	 * If we need to trap system registers, we must write
-	 * ICH_HCR_EL2 anyway, even if no interrupts are being
-	 * injected,
-	 */
-	if (static_branch_unlikely(&vgic_v3_cpuif_trap) ||
-	    cpu_if->its_vpe.its_vm)
-		write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
-}
-
-void __hyp_text __vgic_v3_deactivate_traps(struct kvm_vcpu *vcpu)
-{
-	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-	u64 val;
-
-	if (!cpu_if->vgic_sre) {
-		cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2);
-	}
-
-	val = read_gicreg(ICC_SRE_EL2);
-	write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2);
-
-	if (!cpu_if->vgic_sre) {
-		/* Make sure ENABLE is set at EL2 before setting SRE at EL1 */
-		isb();
-		write_gicreg(1, ICC_SRE_EL1);
-	}
-
-	/*
-	 * If we were trapping system registers, we enabled the VGIC even if
-	 * no interrupts were being injected, and we disable it again here.
-	 */
-	if (static_branch_unlikely(&vgic_v3_cpuif_trap) ||
-	    cpu_if->its_vpe.its_vm)
-		write_gicreg(0, ICH_HCR_EL2);
-}
-
-void __hyp_text __vgic_v3_save_aprs(struct kvm_vcpu *vcpu)
-{
-	struct vgic_v3_cpu_if *cpu_if;
-	u64 val;
-	u32 nr_pre_bits;
-
-	vcpu = kern_hyp_va(vcpu);
-	cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-
-	val = read_gicreg(ICH_VTR_EL2);
-	nr_pre_bits = vtr_to_nr_pre_bits(val);
-
-	switch (nr_pre_bits) {
-	case 7:
-		cpu_if->vgic_ap0r[3] = __vgic_v3_read_ap0rn(3);
-		cpu_if->vgic_ap0r[2] = __vgic_v3_read_ap0rn(2);
-	case 6:
-		cpu_if->vgic_ap0r[1] = __vgic_v3_read_ap0rn(1);
-	default:
-		cpu_if->vgic_ap0r[0] = __vgic_v3_read_ap0rn(0);
-	}
-
-	switch (nr_pre_bits) {
-	case 7:
-		cpu_if->vgic_ap1r[3] = __vgic_v3_read_ap1rn(3);
-		cpu_if->vgic_ap1r[2] = __vgic_v3_read_ap1rn(2);
-	case 6:
-		cpu_if->vgic_ap1r[1] = __vgic_v3_read_ap1rn(1);
-	default:
-		cpu_if->vgic_ap1r[0] = __vgic_v3_read_ap1rn(0);
-	}
-}
-
-void __hyp_text __vgic_v3_restore_aprs(struct kvm_vcpu *vcpu)
-{
-	struct vgic_v3_cpu_if *cpu_if;
-	u64 val;
-	u32 nr_pre_bits;
-
-	vcpu = kern_hyp_va(vcpu);
-	cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-
-	val = read_gicreg(ICH_VTR_EL2);
-	nr_pre_bits = vtr_to_nr_pre_bits(val);
-
-	switch (nr_pre_bits) {
-	case 7:
-		__vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[3], 3);
-		__vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[2], 2);
-	case 6:
-		__vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[1], 1);
-	default:
-		__vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[0], 0);
-	}
-
-	switch (nr_pre_bits) {
-	case 7:
-		__vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[3], 3);
-		__vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[2], 2);
-	case 6:
-		__vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[1], 1);
-	default:
-		__vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[0], 0);
-	}
-}
-
-void __hyp_text __vgic_v3_init_lrs(void)
-{
-	int max_lr_idx = vtr_to_max_lr_idx(read_gicreg(ICH_VTR_EL2));
-	int i;
-
-	for (i = 0; i <= max_lr_idx; i++)
-		__gic_v3_set_lr(0, i);
-}
-
-u64 __hyp_text __vgic_v3_get_ich_vtr_el2(void)
-{
-	return read_gicreg(ICH_VTR_EL2);
-}
-
-u64 __hyp_text __vgic_v3_read_vmcr(void)
-{
-	return read_gicreg(ICH_VMCR_EL2);
-}
-
-void __hyp_text __vgic_v3_write_vmcr(u32 vmcr)
-{
-	write_gicreg(vmcr, ICH_VMCR_EL2);
-}
-
-#ifdef CONFIG_ARM64
-
-static int __hyp_text __vgic_v3_bpr_min(void)
-{
-	/* See Pseudocode for VPriorityGroup */
-	return 8 - vtr_to_nr_pre_bits(read_gicreg(ICH_VTR_EL2));
-}
-
-static int __hyp_text __vgic_v3_get_group(struct kvm_vcpu *vcpu)
-{
-	u32 esr = kvm_vcpu_get_hsr(vcpu);
-	u8 crm = (esr & ESR_ELx_SYS64_ISS_CRM_MASK) >> ESR_ELx_SYS64_ISS_CRM_SHIFT;
-
-	return crm != 8;
-}
-
-#define GICv3_IDLE_PRIORITY	0xff
-
-static int __hyp_text __vgic_v3_highest_priority_lr(struct kvm_vcpu *vcpu,
-						    u32 vmcr,
-						    u64 *lr_val)
-{
-	unsigned int used_lrs = vcpu->arch.vgic_cpu.used_lrs;
-	u8 priority = GICv3_IDLE_PRIORITY;
-	int i, lr = -1;
-
-	for (i = 0; i < used_lrs; i++) {
-		u64 val = __gic_v3_get_lr(i);
-		u8 lr_prio = (val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT;
-
-		/* Not pending in the state? */
-		if ((val & ICH_LR_STATE) != ICH_LR_PENDING_BIT)
-			continue;
-
-		/* Group-0 interrupt, but Group-0 disabled? */
-		if (!(val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_ENG0_MASK))
-			continue;
-
-		/* Group-1 interrupt, but Group-1 disabled? */
-		if ((val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_ENG1_MASK))
-			continue;
-
-		/* Not the highest priority? */
-		if (lr_prio >= priority)
-			continue;
-
-		/* This is a candidate */
-		priority = lr_prio;
-		*lr_val = val;
-		lr = i;
-	}
-
-	if (lr == -1)
-		*lr_val = ICC_IAR1_EL1_SPURIOUS;
-
-	return lr;
-}
-
-static int __hyp_text __vgic_v3_find_active_lr(struct kvm_vcpu *vcpu,
-					       int intid, u64 *lr_val)
-{
-	unsigned int used_lrs = vcpu->arch.vgic_cpu.used_lrs;
-	int i;
-
-	for (i = 0; i < used_lrs; i++) {
-		u64 val = __gic_v3_get_lr(i);
-
-		if ((val & ICH_LR_VIRTUAL_ID_MASK) == intid &&
-		    (val & ICH_LR_ACTIVE_BIT)) {
-			*lr_val = val;
-			return i;
-		}
-	}
-
-	*lr_val = ICC_IAR1_EL1_SPURIOUS;
-	return -1;
-}
-
-static int __hyp_text __vgic_v3_get_highest_active_priority(void)
-{
-	u8 nr_apr_regs = vtr_to_nr_apr_regs(read_gicreg(ICH_VTR_EL2));
-	u32 hap = 0;
-	int i;
-
-	for (i = 0; i < nr_apr_regs; i++) {
-		u32 val;
-
-		/*
-		 * The ICH_AP0Rn_EL2 and ICH_AP1Rn_EL2 registers
-		 * contain the active priority levels for this VCPU
-		 * for the maximum number of supported priority
-		 * levels, and we return the full priority level only
-		 * if the BPR is programmed to its minimum, otherwise
-		 * we return a combination of the priority level and
-		 * subpriority, as determined by the setting of the
-		 * BPR, but without the full subpriority.
-		 */
-		val  = __vgic_v3_read_ap0rn(i);
-		val |= __vgic_v3_read_ap1rn(i);
-		if (!val) {
-			hap += 32;
-			continue;
-		}
-
-		return (hap + __ffs(val)) << __vgic_v3_bpr_min();
-	}
-
-	return GICv3_IDLE_PRIORITY;
-}
-
-static unsigned int __hyp_text __vgic_v3_get_bpr0(u32 vmcr)
-{
-	return (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT;
-}
-
-static unsigned int __hyp_text __vgic_v3_get_bpr1(u32 vmcr)
-{
-	unsigned int bpr;
-
-	if (vmcr & ICH_VMCR_CBPR_MASK) {
-		bpr = __vgic_v3_get_bpr0(vmcr);
-		if (bpr < 7)
-			bpr++;
-	} else {
-		bpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT;
-	}
-
-	return bpr;
-}
-
-/*
- * Convert a priority to a preemption level, taking the relevant BPR
- * into account by zeroing the sub-priority bits.
- */
-static u8 __hyp_text __vgic_v3_pri_to_pre(u8 pri, u32 vmcr, int grp)
-{
-	unsigned int bpr;
-
-	if (!grp)
-		bpr = __vgic_v3_get_bpr0(vmcr) + 1;
-	else
-		bpr = __vgic_v3_get_bpr1(vmcr);
-
-	return pri & (GENMASK(7, 0) << bpr);
-}
-
-/*
- * The priority value is independent of any of the BPR values, so we
- * normalize it using the minumal BPR value. This guarantees that no
- * matter what the guest does with its BPR, we can always set/get the
- * same value of a priority.
- */
-static void __hyp_text __vgic_v3_set_active_priority(u8 pri, u32 vmcr, int grp)
-{
-	u8 pre, ap;
-	u32 val;
-	int apr;
-
-	pre = __vgic_v3_pri_to_pre(pri, vmcr, grp);
-	ap = pre >> __vgic_v3_bpr_min();
-	apr = ap / 32;
-
-	if (!grp) {
-		val = __vgic_v3_read_ap0rn(apr);
-		__vgic_v3_write_ap0rn(val | BIT(ap % 32), apr);
-	} else {
-		val = __vgic_v3_read_ap1rn(apr);
-		__vgic_v3_write_ap1rn(val | BIT(ap % 32), apr);
-	}
-}
-
-static int __hyp_text __vgic_v3_clear_highest_active_priority(void)
-{
-	u8 nr_apr_regs = vtr_to_nr_apr_regs(read_gicreg(ICH_VTR_EL2));
-	u32 hap = 0;
-	int i;
-
-	for (i = 0; i < nr_apr_regs; i++) {
-		u32 ap0, ap1;
-		int c0, c1;
-
-		ap0 = __vgic_v3_read_ap0rn(i);
-		ap1 = __vgic_v3_read_ap1rn(i);
-		if (!ap0 && !ap1) {
-			hap += 32;
-			continue;
-		}
-
-		c0 = ap0 ? __ffs(ap0) : 32;
-		c1 = ap1 ? __ffs(ap1) : 32;
-
-		/* Always clear the LSB, which is the highest priority */
-		if (c0 < c1) {
-			ap0 &= ~BIT(c0);
-			__vgic_v3_write_ap0rn(ap0, i);
-			hap += c0;
-		} else {
-			ap1 &= ~BIT(c1);
-			__vgic_v3_write_ap1rn(ap1, i);
-			hap += c1;
-		}
-
-		/* Rescale to 8 bits of priority */
-		return hap << __vgic_v3_bpr_min();
-	}
-
-	return GICv3_IDLE_PRIORITY;
-}
-
-static void __hyp_text __vgic_v3_read_iar(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
-{
-	u64 lr_val;
-	u8 lr_prio, pmr;
-	int lr, grp;
-
-	grp = __vgic_v3_get_group(vcpu);
-
-	lr = __vgic_v3_highest_priority_lr(vcpu, vmcr, &lr_val);
-	if (lr < 0)
-		goto spurious;
-
-	if (grp != !!(lr_val & ICH_LR_GROUP))
-		goto spurious;
-
-	pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT;
-	lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT;
-	if (pmr <= lr_prio)
-		goto spurious;
-
-	if (__vgic_v3_get_highest_active_priority() <= __vgic_v3_pri_to_pre(lr_prio, vmcr, grp))
-		goto spurious;
-
-	lr_val &= ~ICH_LR_STATE;
-	/* No active state for LPIs */
-	if ((lr_val & ICH_LR_VIRTUAL_ID_MASK) <= VGIC_MAX_SPI)
-		lr_val |= ICH_LR_ACTIVE_BIT;
-	__gic_v3_set_lr(lr_val, lr);
-	__vgic_v3_set_active_priority(lr_prio, vmcr, grp);
-	vcpu_set_reg(vcpu, rt, lr_val & ICH_LR_VIRTUAL_ID_MASK);
-	return;
-
-spurious:
-	vcpu_set_reg(vcpu, rt, ICC_IAR1_EL1_SPURIOUS);
-}
-
-static void __hyp_text __vgic_v3_clear_active_lr(int lr, u64 lr_val)
-{
-	lr_val &= ~ICH_LR_ACTIVE_BIT;
-	if (lr_val & ICH_LR_HW) {
-		u32 pid;
-
-		pid = (lr_val & ICH_LR_PHYS_ID_MASK) >> ICH_LR_PHYS_ID_SHIFT;
-		gic_write_dir(pid);
-	}
-
-	__gic_v3_set_lr(lr_val, lr);
-}
-
-static void __hyp_text __vgic_v3_bump_eoicount(void)
-{
-	u32 hcr;
-
-	hcr = read_gicreg(ICH_HCR_EL2);
-	hcr += 1 << ICH_HCR_EOIcount_SHIFT;
-	write_gicreg(hcr, ICH_HCR_EL2);
-}
-
-static void __hyp_text __vgic_v3_write_dir(struct kvm_vcpu *vcpu,
-					   u32 vmcr, int rt)
-{
-	u32 vid = vcpu_get_reg(vcpu, rt);
-	u64 lr_val;
-	int lr;
-
-	/* EOImode == 0, nothing to be done here */
-	if (!(vmcr & ICH_VMCR_EOIM_MASK))
-		return;
-
-	/* No deactivate to be performed on an LPI */
-	if (vid >= VGIC_MIN_LPI)
-		return;
-
-	lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val);
-	if (lr == -1) {
-		__vgic_v3_bump_eoicount();
-		return;
-	}
-
-	__vgic_v3_clear_active_lr(lr, lr_val);
-}
-
-static void __hyp_text __vgic_v3_write_eoir(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
-{
-	u32 vid = vcpu_get_reg(vcpu, rt);
-	u64 lr_val;
-	u8 lr_prio, act_prio;
-	int lr, grp;
-
-	grp = __vgic_v3_get_group(vcpu);
-
-	/* Drop priority in any case */
-	act_prio = __vgic_v3_clear_highest_active_priority();
-
-	/* If EOIing an LPI, no deactivate to be performed */
-	if (vid >= VGIC_MIN_LPI)
-		return;
-
-	/* EOImode == 1, nothing to be done here */
-	if (vmcr & ICH_VMCR_EOIM_MASK)
-		return;
-
-	lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val);
-	if (lr == -1) {
-		__vgic_v3_bump_eoicount();
-		return;
-	}
-
-	lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT;
-
-	/* If priorities or group do not match, the guest has fscked-up. */
-	if (grp != !!(lr_val & ICH_LR_GROUP) ||
-	    __vgic_v3_pri_to_pre(lr_prio, vmcr, grp) != act_prio)
-		return;
-
-	/* Let's now perform the deactivation */
-	__vgic_v3_clear_active_lr(lr, lr_val);
-}
-
-static void __hyp_text __vgic_v3_read_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
-{
-	vcpu_set_reg(vcpu, rt, !!(vmcr & ICH_VMCR_ENG0_MASK));
-}
-
-static void __hyp_text __vgic_v3_read_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
-{
-	vcpu_set_reg(vcpu, rt, !!(vmcr & ICH_VMCR_ENG1_MASK));
-}
-
-static void __hyp_text __vgic_v3_write_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
-{
-	u64 val = vcpu_get_reg(vcpu, rt);
-
-	if (val & 1)
-		vmcr |= ICH_VMCR_ENG0_MASK;
-	else
-		vmcr &= ~ICH_VMCR_ENG0_MASK;
-
-	__vgic_v3_write_vmcr(vmcr);
-}
-
-static void __hyp_text __vgic_v3_write_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
-{
-	u64 val = vcpu_get_reg(vcpu, rt);
-
-	if (val & 1)
-		vmcr |= ICH_VMCR_ENG1_MASK;
-	else
-		vmcr &= ~ICH_VMCR_ENG1_MASK;
-
-	__vgic_v3_write_vmcr(vmcr);
-}
-
-static void __hyp_text __vgic_v3_read_bpr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
-{
-	vcpu_set_reg(vcpu, rt, __vgic_v3_get_bpr0(vmcr));
-}
-
-static void __hyp_text __vgic_v3_read_bpr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
-{
-	vcpu_set_reg(vcpu, rt, __vgic_v3_get_bpr1(vmcr));
-}
-
-static void __hyp_text __vgic_v3_write_bpr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
-{
-	u64 val = vcpu_get_reg(vcpu, rt);
-	u8 bpr_min = __vgic_v3_bpr_min() - 1;
-
-	/* Enforce BPR limiting */
-	if (val < bpr_min)
-		val = bpr_min;
-
-	val <<= ICH_VMCR_BPR0_SHIFT;
-	val &= ICH_VMCR_BPR0_MASK;
-	vmcr &= ~ICH_VMCR_BPR0_MASK;
-	vmcr |= val;
-
-	__vgic_v3_write_vmcr(vmcr);
-}
-
-static void __hyp_text __vgic_v3_write_bpr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
-{
-	u64 val = vcpu_get_reg(vcpu, rt);
-	u8 bpr_min = __vgic_v3_bpr_min();
-
-	if (vmcr & ICH_VMCR_CBPR_MASK)
-		return;
-
-	/* Enforce BPR limiting */
-	if (val < bpr_min)
-		val = bpr_min;
-
-	val <<= ICH_VMCR_BPR1_SHIFT;
-	val &= ICH_VMCR_BPR1_MASK;
-	vmcr &= ~ICH_VMCR_BPR1_MASK;
-	vmcr |= val;
-
-	__vgic_v3_write_vmcr(vmcr);
-}
-
-static void __hyp_text __vgic_v3_read_apxrn(struct kvm_vcpu *vcpu, int rt, int n)
-{
-	u32 val;
-
-	if (!__vgic_v3_get_group(vcpu))
-		val = __vgic_v3_read_ap0rn(n);
-	else
-		val = __vgic_v3_read_ap1rn(n);
-
-	vcpu_set_reg(vcpu, rt, val);
-}
-
-static void __hyp_text __vgic_v3_write_apxrn(struct kvm_vcpu *vcpu, int rt, int n)
-{
-	u32 val = vcpu_get_reg(vcpu, rt);
-
-	if (!__vgic_v3_get_group(vcpu))
-		__vgic_v3_write_ap0rn(val, n);
-	else
-		__vgic_v3_write_ap1rn(val, n);
-}
-
-static void __hyp_text __vgic_v3_read_apxr0(struct kvm_vcpu *vcpu,
-					    u32 vmcr, int rt)
-{
-	__vgic_v3_read_apxrn(vcpu, rt, 0);
-}
-
-static void __hyp_text __vgic_v3_read_apxr1(struct kvm_vcpu *vcpu,
-					    u32 vmcr, int rt)
-{
-	__vgic_v3_read_apxrn(vcpu, rt, 1);
-}
-
-static void __hyp_text __vgic_v3_read_apxr2(struct kvm_vcpu *vcpu,
-					    u32 vmcr, int rt)
-{
-	__vgic_v3_read_apxrn(vcpu, rt, 2);
-}
-
-static void __hyp_text __vgic_v3_read_apxr3(struct kvm_vcpu *vcpu,
-					    u32 vmcr, int rt)
-{
-	__vgic_v3_read_apxrn(vcpu, rt, 3);
-}
-
-static void __hyp_text __vgic_v3_write_apxr0(struct kvm_vcpu *vcpu,
-					     u32 vmcr, int rt)
-{
-	__vgic_v3_write_apxrn(vcpu, rt, 0);
-}
-
-static void __hyp_text __vgic_v3_write_apxr1(struct kvm_vcpu *vcpu,
-					     u32 vmcr, int rt)
-{
-	__vgic_v3_write_apxrn(vcpu, rt, 1);
-}
-
-static void __hyp_text __vgic_v3_write_apxr2(struct kvm_vcpu *vcpu,
-					     u32 vmcr, int rt)
-{
-	__vgic_v3_write_apxrn(vcpu, rt, 2);
-}
-
-static void __hyp_text __vgic_v3_write_apxr3(struct kvm_vcpu *vcpu,
-					     u32 vmcr, int rt)
-{
-	__vgic_v3_write_apxrn(vcpu, rt, 3);
-}
-
-static void __hyp_text __vgic_v3_read_hppir(struct kvm_vcpu *vcpu,
-					    u32 vmcr, int rt)
-{
-	u64 lr_val;
-	int lr, lr_grp, grp;
-
-	grp = __vgic_v3_get_group(vcpu);
-
-	lr = __vgic_v3_highest_priority_lr(vcpu, vmcr, &lr_val);
-	if (lr == -1)
-		goto spurious;
-
-	lr_grp = !!(lr_val & ICH_LR_GROUP);
-	if (lr_grp != grp)
-		lr_val = ICC_IAR1_EL1_SPURIOUS;
-
-spurious:
-	vcpu_set_reg(vcpu, rt, lr_val & ICH_LR_VIRTUAL_ID_MASK);
-}
-
-static void __hyp_text __vgic_v3_read_pmr(struct kvm_vcpu *vcpu,
-					  u32 vmcr, int rt)
-{
-	vmcr &= ICH_VMCR_PMR_MASK;
-	vmcr >>= ICH_VMCR_PMR_SHIFT;
-	vcpu_set_reg(vcpu, rt, vmcr);
-}
-
-static void __hyp_text __vgic_v3_write_pmr(struct kvm_vcpu *vcpu,
-					   u32 vmcr, int rt)
-{
-	u32 val = vcpu_get_reg(vcpu, rt);
-
-	val <<= ICH_VMCR_PMR_SHIFT;
-	val &= ICH_VMCR_PMR_MASK;
-	vmcr &= ~ICH_VMCR_PMR_MASK;
-	vmcr |= val;
-
-	write_gicreg(vmcr, ICH_VMCR_EL2);
-}
-
-static void __hyp_text __vgic_v3_read_rpr(struct kvm_vcpu *vcpu,
-					  u32 vmcr, int rt)
-{
-	u32 val = __vgic_v3_get_highest_active_priority();
-	vcpu_set_reg(vcpu, rt, val);
-}
-
-static void __hyp_text __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu,
-					   u32 vmcr, int rt)
-{
-	u32 vtr, val;
-
-	vtr = read_gicreg(ICH_VTR_EL2);
-	/* PRIbits */
-	val = ((vtr >> 29) & 7) << ICC_CTLR_EL1_PRI_BITS_SHIFT;
-	/* IDbits */
-	val |= ((vtr >> 23) & 7) << ICC_CTLR_EL1_ID_BITS_SHIFT;
-	/* SEIS */
-	val |= ((vtr >> 22) & 1) << ICC_CTLR_EL1_SEIS_SHIFT;
-	/* A3V */
-	val |= ((vtr >> 21) & 1) << ICC_CTLR_EL1_A3V_SHIFT;
-	/* EOImode */
-	val |= ((vmcr & ICH_VMCR_EOIM_MASK) >> ICH_VMCR_EOIM_SHIFT) << ICC_CTLR_EL1_EOImode_SHIFT;
-	/* CBPR */
-	val |= (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT;
-
-	vcpu_set_reg(vcpu, rt, val);
-}
-
-static void __hyp_text __vgic_v3_write_ctlr(struct kvm_vcpu *vcpu,
-					    u32 vmcr, int rt)
-{
-	u32 val = vcpu_get_reg(vcpu, rt);
-
-	if (val & ICC_CTLR_EL1_CBPR_MASK)
-		vmcr |= ICH_VMCR_CBPR_MASK;
-	else
-		vmcr &= ~ICH_VMCR_CBPR_MASK;
-
-	if (val & ICC_CTLR_EL1_EOImode_MASK)
-		vmcr |= ICH_VMCR_EOIM_MASK;
-	else
-		vmcr &= ~ICH_VMCR_EOIM_MASK;
-
-	write_gicreg(vmcr, ICH_VMCR_EL2);
-}
-
-int __hyp_text __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu)
-{
-	int rt;
-	u32 esr;
-	u32 vmcr;
-	void (*fn)(struct kvm_vcpu *, u32, int);
-	bool is_read;
-	u32 sysreg;
-
-	esr = kvm_vcpu_get_hsr(vcpu);
-	if (vcpu_mode_is_32bit(vcpu)) {
-		if (!kvm_condition_valid(vcpu)) {
-			__kvm_skip_instr(vcpu);
-			return 1;
-		}
-
-		sysreg = esr_cp15_to_sysreg(esr);
-	} else {
-		sysreg = esr_sys64_to_sysreg(esr);
-	}
-
-	is_read = (esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ;
-
-	switch (sysreg) {
-	case SYS_ICC_IAR0_EL1:
-	case SYS_ICC_IAR1_EL1:
-		if (unlikely(!is_read))
-			return 0;
-		fn = __vgic_v3_read_iar;
-		break;
-	case SYS_ICC_EOIR0_EL1:
-	case SYS_ICC_EOIR1_EL1:
-		if (unlikely(is_read))
-			return 0;
-		fn = __vgic_v3_write_eoir;
-		break;
-	case SYS_ICC_IGRPEN1_EL1:
-		if (is_read)
-			fn = __vgic_v3_read_igrpen1;
-		else
-			fn = __vgic_v3_write_igrpen1;
-		break;
-	case SYS_ICC_BPR1_EL1:
-		if (is_read)
-			fn = __vgic_v3_read_bpr1;
-		else
-			fn = __vgic_v3_write_bpr1;
-		break;
-	case SYS_ICC_AP0Rn_EL1(0):
-	case SYS_ICC_AP1Rn_EL1(0):
-		if (is_read)
-			fn = __vgic_v3_read_apxr0;
-		else
-			fn = __vgic_v3_write_apxr0;
-		break;
-	case SYS_ICC_AP0Rn_EL1(1):
-	case SYS_ICC_AP1Rn_EL1(1):
-		if (is_read)
-			fn = __vgic_v3_read_apxr1;
-		else
-			fn = __vgic_v3_write_apxr1;
-		break;
-	case SYS_ICC_AP0Rn_EL1(2):
-	case SYS_ICC_AP1Rn_EL1(2):
-		if (is_read)
-			fn = __vgic_v3_read_apxr2;
-		else
-			fn = __vgic_v3_write_apxr2;
-		break;
-	case SYS_ICC_AP0Rn_EL1(3):
-	case SYS_ICC_AP1Rn_EL1(3):
-		if (is_read)
-			fn = __vgic_v3_read_apxr3;
-		else
-			fn = __vgic_v3_write_apxr3;
-		break;
-	case SYS_ICC_HPPIR0_EL1:
-	case SYS_ICC_HPPIR1_EL1:
-		if (unlikely(!is_read))
-			return 0;
-		fn = __vgic_v3_read_hppir;
-		break;
-	case SYS_ICC_IGRPEN0_EL1:
-		if (is_read)
-			fn = __vgic_v3_read_igrpen0;
-		else
-			fn = __vgic_v3_write_igrpen0;
-		break;
-	case SYS_ICC_BPR0_EL1:
-		if (is_read)
-			fn = __vgic_v3_read_bpr0;
-		else
-			fn = __vgic_v3_write_bpr0;
-		break;
-	case SYS_ICC_DIR_EL1:
-		if (unlikely(is_read))
-			return 0;
-		fn = __vgic_v3_write_dir;
-		break;
-	case SYS_ICC_RPR_EL1:
-		if (unlikely(!is_read))
-			return 0;
-		fn = __vgic_v3_read_rpr;
-		break;
-	case SYS_ICC_CTLR_EL1:
-		if (is_read)
-			fn = __vgic_v3_read_ctlr;
-		else
-			fn = __vgic_v3_write_ctlr;
-		break;
-	case SYS_ICC_PMR_EL1:
-		if (is_read)
-			fn = __vgic_v3_read_pmr;
-		else
-			fn = __vgic_v3_write_pmr;
-		break;
-	default:
-		return 0;
-	}
-
-	vmcr = __vgic_v3_read_vmcr();
-	rt = kvm_vcpu_sys_get_rt(vcpu);
-	fn(vcpu, vmcr, rt);
-
-	__kvm_skip_instr(vcpu);
-
-	return 1;
-}
-
-#endif
diff --git a/virt/kvm/arm/mmio.c b/virt/kvm/arm/mmio.c
deleted file mode 100644
index 08443a15e6be..000000000000
--- a/virt/kvm/arm/mmio.c
+++ /dev/null
@@ -1,218 +0,0 @@
-/*
- * Copyright (C) 2012 - Virtual Open Systems and Columbia University
- * Author: Christoffer Dall <c.dall@virtualopensystems.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
- */
-
-#include <linux/kvm_host.h>
-#include <asm/kvm_mmio.h>
-#include <asm/kvm_emulate.h>
-#include <trace/events/kvm.h>
-
-#include "trace.h"
-
-void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data)
-{
-	void *datap = NULL;
-	union {
-		u8	byte;
-		u16	hword;
-		u32	word;
-		u64	dword;
-	} tmp;
-
-	switch (len) {
-	case 1:
-		tmp.byte	= data;
-		datap		= &tmp.byte;
-		break;
-	case 2:
-		tmp.hword	= data;
-		datap		= &tmp.hword;
-		break;
-	case 4:
-		tmp.word	= data;
-		datap		= &tmp.word;
-		break;
-	case 8:
-		tmp.dword	= data;
-		datap		= &tmp.dword;
-		break;
-	}
-
-	memcpy(buf, datap, len);
-}
-
-unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len)
-{
-	unsigned long data = 0;
-	union {
-		u16	hword;
-		u32	word;
-		u64	dword;
-	} tmp;
-
-	switch (len) {
-	case 1:
-		data = *(u8 *)buf;
-		break;
-	case 2:
-		memcpy(&tmp.hword, buf, len);
-		data = tmp.hword;
-		break;
-	case 4:
-		memcpy(&tmp.word, buf, len);
-		data = tmp.word;
-		break;
-	case 8:
-		memcpy(&tmp.dword, buf, len);
-		data = tmp.dword;
-		break;
-	}
-
-	return data;
-}
-
-/**
- * kvm_handle_mmio_return -- Handle MMIO loads after user space emulation
- *			     or in-kernel IO emulation
- *
- * @vcpu: The VCPU pointer
- * @run:  The VCPU run struct containing the mmio data
- */
-int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run)
-{
-	unsigned long data;
-	unsigned int len;
-	int mask;
-
-	if (!run->mmio.is_write) {
-		len = run->mmio.len;
-		if (len > sizeof(unsigned long))
-			return -EINVAL;
-
-		data = kvm_mmio_read_buf(run->mmio.data, len);
-
-		if (vcpu->arch.mmio_decode.sign_extend &&
-		    len < sizeof(unsigned long)) {
-			mask = 1U << ((len * 8) - 1);
-			data = (data ^ mask) - mask;
-		}
-
-		trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr,
-			       &data);
-		data = vcpu_data_host_to_guest(vcpu, data, len);
-		vcpu_set_reg(vcpu, vcpu->arch.mmio_decode.rt, data);
-	}
-
-	/*
-	 * The MMIO instruction is emulated and should not be re-executed
-	 * in the guest.
-	 */
-	kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
-
-	return 0;
-}
-
-static int decode_hsr(struct kvm_vcpu *vcpu, bool *is_write, int *len)
-{
-	unsigned long rt;
-	int access_size;
-	bool sign_extend;
-
-	if (kvm_vcpu_dabt_iss1tw(vcpu)) {
-		/* page table accesses IO mem: tell guest to fix its TTBR */
-		kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
-		return 1;
-	}
-
-	access_size = kvm_vcpu_dabt_get_as(vcpu);
-	if (unlikely(access_size < 0))
-		return access_size;
-
-	*is_write = kvm_vcpu_dabt_iswrite(vcpu);
-	sign_extend = kvm_vcpu_dabt_issext(vcpu);
-	rt = kvm_vcpu_dabt_get_rd(vcpu);
-
-	*len = access_size;
-	vcpu->arch.mmio_decode.sign_extend = sign_extend;
-	vcpu->arch.mmio_decode.rt = rt;
-
-	return 0;
-}
-
-int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
-		 phys_addr_t fault_ipa)
-{
-	unsigned long data;
-	unsigned long rt;
-	int ret;
-	bool is_write;
-	int len;
-	u8 data_buf[8];
-
-	/*
-	 * Prepare MMIO operation. First decode the syndrome data we get
-	 * from the CPU. Then try if some in-kernel emulation feels
-	 * responsible, otherwise let user space do its magic.
-	 */
-	if (kvm_vcpu_dabt_isvalid(vcpu)) {
-		ret = decode_hsr(vcpu, &is_write, &len);
-		if (ret)
-			return ret;
-	} else {
-		kvm_err("load/store instruction decoding not implemented\n");
-		return -ENOSYS;
-	}
-
-	rt = vcpu->arch.mmio_decode.rt;
-
-	if (is_write) {
-		data = vcpu_data_guest_to_host(vcpu, vcpu_get_reg(vcpu, rt),
-					       len);
-
-		trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, &data);
-		kvm_mmio_write_buf(data_buf, len, data);
-
-		ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, fault_ipa, len,
-				       data_buf);
-	} else {
-		trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, len,
-			       fault_ipa, NULL);
-
-		ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, fault_ipa, len,
-				      data_buf);
-	}
-
-	/* Now prepare kvm_run for the potential return to userland. */
-	run->mmio.is_write	= is_write;
-	run->mmio.phys_addr	= fault_ipa;
-	run->mmio.len		= len;
-
-	if (!ret) {
-		/* We handled the access successfully in the kernel. */
-		if (!is_write)
-			memcpy(run->mmio.data, data_buf, len);
-		vcpu->stat.mmio_exit_kernel++;
-		kvm_handle_mmio_return(vcpu, run);
-		return 1;
-	}
-
-	if (is_write)
-		memcpy(run->mmio.data, data_buf, len);
-	vcpu->stat.mmio_exit_user++;
-	run->exit_reason	= KVM_EXIT_MMIO;
-	return 0;
-}
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
deleted file mode 100644
index fbdf3ac2f001..000000000000
--- a/virt/kvm/arm/mmu.c
+++ /dev/null
@@ -1,2439 +0,0 @@
-/*
- * Copyright (C) 2012 - Virtual Open Systems and Columbia University
- * Author: Christoffer Dall <c.dall@virtualopensystems.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
- */
-
-#include <linux/mman.h>
-#include <linux/kvm_host.h>
-#include <linux/io.h>
-#include <linux/hugetlb.h>
-#include <linux/sched/signal.h>
-#include <trace/events/kvm.h>
-#include <asm/pgalloc.h>
-#include <asm/cacheflush.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_mmu.h>
-#include <asm/kvm_mmio.h>
-#include <asm/kvm_asm.h>
-#include <asm/kvm_emulate.h>
-#include <asm/virt.h>
-#include <asm/system_misc.h>
-
-#include "trace.h"
-
-static pgd_t *boot_hyp_pgd;
-static pgd_t *hyp_pgd;
-static pgd_t *merged_hyp_pgd;
-static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
-
-static unsigned long hyp_idmap_start;
-static unsigned long hyp_idmap_end;
-static phys_addr_t hyp_idmap_vector;
-
-static unsigned long io_map_base;
-
-#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
-
-#define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
-#define KVM_S2_FLAG_LOGGING_ACTIVE	(1UL << 1)
-
-static bool memslot_is_logging(struct kvm_memory_slot *memslot)
-{
-	return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
-}
-
-/**
- * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
- * @kvm:	pointer to kvm structure.
- *
- * Interface to HYP function to flush all VM TLB entries
- */
-void kvm_flush_remote_tlbs(struct kvm *kvm)
-{
-	kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
-}
-
-static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
-{
-	kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
-}
-
-/*
- * D-Cache management functions. They take the page table entries by
- * value, as they are flushing the cache using the kernel mapping (or
- * kmap on 32bit).
- */
-static void kvm_flush_dcache_pte(pte_t pte)
-{
-	__kvm_flush_dcache_pte(pte);
-}
-
-static void kvm_flush_dcache_pmd(pmd_t pmd)
-{
-	__kvm_flush_dcache_pmd(pmd);
-}
-
-static void kvm_flush_dcache_pud(pud_t pud)
-{
-	__kvm_flush_dcache_pud(pud);
-}
-
-static bool kvm_is_device_pfn(unsigned long pfn)
-{
-	return !pfn_valid(pfn);
-}
-
-/**
- * stage2_dissolve_pmd() - clear and flush huge PMD entry
- * @kvm:	pointer to kvm structure.
- * @addr:	IPA
- * @pmd:	pmd pointer for IPA
- *
- * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
- * pages in the range dirty.
- */
-static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
-{
-	if (!pmd_thp_or_huge(*pmd))
-		return;
-
-	pmd_clear(pmd);
-	kvm_tlb_flush_vmid_ipa(kvm, addr);
-	put_page(virt_to_page(pmd));
-}
-
-/**
- * stage2_dissolve_pud() - clear and flush huge PUD entry
- * @kvm:	pointer to kvm structure.
- * @addr:	IPA
- * @pud:	pud pointer for IPA
- *
- * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs. Marks all
- * pages in the range dirty.
- */
-static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp)
-{
-	if (!stage2_pud_huge(kvm, *pudp))
-		return;
-
-	stage2_pud_clear(kvm, pudp);
-	kvm_tlb_flush_vmid_ipa(kvm, addr);
-	put_page(virt_to_page(pudp));
-}
-
-static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
-				  int min, int max)
-{
-	void *page;
-
-	BUG_ON(max > KVM_NR_MEM_OBJS);
-	if (cache->nobjs >= min)
-		return 0;
-	while (cache->nobjs < max) {
-		page = (void *)__get_free_page(PGALLOC_GFP);
-		if (!page)
-			return -ENOMEM;
-		cache->objects[cache->nobjs++] = page;
-	}
-	return 0;
-}
-
-static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
-{
-	while (mc->nobjs)
-		free_page((unsigned long)mc->objects[--mc->nobjs]);
-}
-
-static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
-{
-	void *p;
-
-	BUG_ON(!mc || !mc->nobjs);
-	p = mc->objects[--mc->nobjs];
-	return p;
-}
-
-static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
-{
-	pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, pgd, 0UL);
-	stage2_pgd_clear(kvm, pgd);
-	kvm_tlb_flush_vmid_ipa(kvm, addr);
-	stage2_pud_free(kvm, pud_table);
-	put_page(virt_to_page(pgd));
-}
-
-static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
-{
-	pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0);
-	VM_BUG_ON(stage2_pud_huge(kvm, *pud));
-	stage2_pud_clear(kvm, pud);
-	kvm_tlb_flush_vmid_ipa(kvm, addr);
-	stage2_pmd_free(kvm, pmd_table);
-	put_page(virt_to_page(pud));
-}
-
-static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
-{
-	pte_t *pte_table = pte_offset_kernel(pmd, 0);
-	VM_BUG_ON(pmd_thp_or_huge(*pmd));
-	pmd_clear(pmd);
-	kvm_tlb_flush_vmid_ipa(kvm, addr);
-	pte_free_kernel(NULL, pte_table);
-	put_page(virt_to_page(pmd));
-}
-
-static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte)
-{
-	WRITE_ONCE(*ptep, new_pte);
-	dsb(ishst);
-}
-
-static inline void kvm_set_pmd(pmd_t *pmdp, pmd_t new_pmd)
-{
-	WRITE_ONCE(*pmdp, new_pmd);
-	dsb(ishst);
-}
-
-static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep)
-{
-	kvm_set_pmd(pmdp, kvm_mk_pmd(ptep));
-}
-
-static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp)
-{
-	WRITE_ONCE(*pudp, kvm_mk_pud(pmdp));
-	dsb(ishst);
-}
-
-static inline void kvm_pgd_populate(pgd_t *pgdp, pud_t *pudp)
-{
-	WRITE_ONCE(*pgdp, kvm_mk_pgd(pudp));
-	dsb(ishst);
-}
-
-/*
- * Unmapping vs dcache management:
- *
- * If a guest maps certain memory pages as uncached, all writes will
- * bypass the data cache and go directly to RAM.  However, the CPUs
- * can still speculate reads (not writes) and fill cache lines with
- * data.
- *
- * Those cache lines will be *clean* cache lines though, so a
- * clean+invalidate operation is equivalent to an invalidate
- * operation, because no cache lines are marked dirty.
- *
- * Those clean cache lines could be filled prior to an uncached write
- * by the guest, and the cache coherent IO subsystem would therefore
- * end up writing old data to disk.
- *
- * This is why right after unmapping a page/section and invalidating
- * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
- * the IO subsystem will never hit in the cache.
- *
- * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
- * we then fully enforce cacheability of RAM, no matter what the guest
- * does.
- */
-static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd,
-		       phys_addr_t addr, phys_addr_t end)
-{
-	phys_addr_t start_addr = addr;
-	pte_t *pte, *start_pte;
-
-	start_pte = pte = pte_offset_kernel(pmd, addr);
-	do {
-		if (!pte_none(*pte)) {
-			pte_t old_pte = *pte;
-
-			kvm_set_pte(pte, __pte(0));
-			kvm_tlb_flush_vmid_ipa(kvm, addr);
-
-			/* No need to invalidate the cache for device mappings */
-			if (!kvm_is_device_pfn(pte_pfn(old_pte)))
-				kvm_flush_dcache_pte(old_pte);
-
-			put_page(virt_to_page(pte));
-		}
-	} while (pte++, addr += PAGE_SIZE, addr != end);
-
-	if (stage2_pte_table_empty(kvm, start_pte))
-		clear_stage2_pmd_entry(kvm, pmd, start_addr);
-}
-
-static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud,
-		       phys_addr_t addr, phys_addr_t end)
-{
-	phys_addr_t next, start_addr = addr;
-	pmd_t *pmd, *start_pmd;
-
-	start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr);
-	do {
-		next = stage2_pmd_addr_end(kvm, addr, end);
-		if (!pmd_none(*pmd)) {
-			if (pmd_thp_or_huge(*pmd)) {
-				pmd_t old_pmd = *pmd;
-
-				pmd_clear(pmd);
-				kvm_tlb_flush_vmid_ipa(kvm, addr);
-
-				kvm_flush_dcache_pmd(old_pmd);
-
-				put_page(virt_to_page(pmd));
-			} else {
-				unmap_stage2_ptes(kvm, pmd, addr, next);
-			}
-		}
-	} while (pmd++, addr = next, addr != end);
-
-	if (stage2_pmd_table_empty(kvm, start_pmd))
-		clear_stage2_pud_entry(kvm, pud, start_addr);
-}
-
-static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd,
-		       phys_addr_t addr, phys_addr_t end)
-{
-	phys_addr_t next, start_addr = addr;
-	pud_t *pud, *start_pud;
-
-	start_pud = pud = stage2_pud_offset(kvm, pgd, addr);
-	do {
-		next = stage2_pud_addr_end(kvm, addr, end);
-		if (!stage2_pud_none(kvm, *pud)) {
-			if (stage2_pud_huge(kvm, *pud)) {
-				pud_t old_pud = *pud;
-
-				stage2_pud_clear(kvm, pud);
-				kvm_tlb_flush_vmid_ipa(kvm, addr);
-				kvm_flush_dcache_pud(old_pud);
-				put_page(virt_to_page(pud));
-			} else {
-				unmap_stage2_pmds(kvm, pud, addr, next);
-			}
-		}
-	} while (pud++, addr = next, addr != end);
-
-	if (stage2_pud_table_empty(kvm, start_pud))
-		clear_stage2_pgd_entry(kvm, pgd, start_addr);
-}
-
-/**
- * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
- * @kvm:   The VM pointer
- * @start: The intermediate physical base address of the range to unmap
- * @size:  The size of the area to unmap
- *
- * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
- * be called while holding mmu_lock (unless for freeing the stage2 pgd before
- * destroying the VM), otherwise another faulting VCPU may come in and mess
- * with things behind our backs.
- */
-static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
-{
-	pgd_t *pgd;
-	phys_addr_t addr = start, end = start + size;
-	phys_addr_t next;
-
-	assert_spin_locked(&kvm->mmu_lock);
-	WARN_ON(size & ~PAGE_MASK);
-
-	pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
-	do {
-		/*
-		 * Make sure the page table is still active, as another thread
-		 * could have possibly freed the page table, while we released
-		 * the lock.
-		 */
-		if (!READ_ONCE(kvm->arch.pgd))
-			break;
-		next = stage2_pgd_addr_end(kvm, addr, end);
-		if (!stage2_pgd_none(kvm, *pgd))
-			unmap_stage2_puds(kvm, pgd, addr, next);
-		/*
-		 * If the range is too large, release the kvm->mmu_lock
-		 * to prevent starvation and lockup detector warnings.
-		 */
-		if (next != end)
-			cond_resched_lock(&kvm->mmu_lock);
-	} while (pgd++, addr = next, addr != end);
-}
-
-static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd,
-			      phys_addr_t addr, phys_addr_t end)
-{
-	pte_t *pte;
-
-	pte = pte_offset_kernel(pmd, addr);
-	do {
-		if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte)))
-			kvm_flush_dcache_pte(*pte);
-	} while (pte++, addr += PAGE_SIZE, addr != end);
-}
-
-static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud,
-			      phys_addr_t addr, phys_addr_t end)
-{
-	pmd_t *pmd;
-	phys_addr_t next;
-
-	pmd = stage2_pmd_offset(kvm, pud, addr);
-	do {
-		next = stage2_pmd_addr_end(kvm, addr, end);
-		if (!pmd_none(*pmd)) {
-			if (pmd_thp_or_huge(*pmd))
-				kvm_flush_dcache_pmd(*pmd);
-			else
-				stage2_flush_ptes(kvm, pmd, addr, next);
-		}
-	} while (pmd++, addr = next, addr != end);
-}
-
-static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd,
-			      phys_addr_t addr, phys_addr_t end)
-{
-	pud_t *pud;
-	phys_addr_t next;
-
-	pud = stage2_pud_offset(kvm, pgd, addr);
-	do {
-		next = stage2_pud_addr_end(kvm, addr, end);
-		if (!stage2_pud_none(kvm, *pud)) {
-			if (stage2_pud_huge(kvm, *pud))
-				kvm_flush_dcache_pud(*pud);
-			else
-				stage2_flush_pmds(kvm, pud, addr, next);
-		}
-	} while (pud++, addr = next, addr != end);
-}
-
-static void stage2_flush_memslot(struct kvm *kvm,
-				 struct kvm_memory_slot *memslot)
-{
-	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
-	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
-	phys_addr_t next;
-	pgd_t *pgd;
-
-	pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
-	do {
-		next = stage2_pgd_addr_end(kvm, addr, end);
-		if (!stage2_pgd_none(kvm, *pgd))
-			stage2_flush_puds(kvm, pgd, addr, next);
-	} while (pgd++, addr = next, addr != end);
-}
-
-/**
- * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
- * @kvm: The struct kvm pointer
- *
- * Go through the stage 2 page tables and invalidate any cache lines
- * backing memory already mapped to the VM.
- */
-static void stage2_flush_vm(struct kvm *kvm)
-{
-	struct kvm_memslots *slots;
-	struct kvm_memory_slot *memslot;
-	int idx;
-
-	idx = srcu_read_lock(&kvm->srcu);
-	spin_lock(&kvm->mmu_lock);
-
-	slots = kvm_memslots(kvm);
-	kvm_for_each_memslot(memslot, slots)
-		stage2_flush_memslot(kvm, memslot);
-
-	spin_unlock(&kvm->mmu_lock);
-	srcu_read_unlock(&kvm->srcu, idx);
-}
-
-static void clear_hyp_pgd_entry(pgd_t *pgd)
-{
-	pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL);
-	pgd_clear(pgd);
-	pud_free(NULL, pud_table);
-	put_page(virt_to_page(pgd));
-}
-
-static void clear_hyp_pud_entry(pud_t *pud)
-{
-	pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
-	VM_BUG_ON(pud_huge(*pud));
-	pud_clear(pud);
-	pmd_free(NULL, pmd_table);
-	put_page(virt_to_page(pud));
-}
-
-static void clear_hyp_pmd_entry(pmd_t *pmd)
-{
-	pte_t *pte_table = pte_offset_kernel(pmd, 0);
-	VM_BUG_ON(pmd_thp_or_huge(*pmd));
-	pmd_clear(pmd);
-	pte_free_kernel(NULL, pte_table);
-	put_page(virt_to_page(pmd));
-}
-
-static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
-{
-	pte_t *pte, *start_pte;
-
-	start_pte = pte = pte_offset_kernel(pmd, addr);
-	do {
-		if (!pte_none(*pte)) {
-			kvm_set_pte(pte, __pte(0));
-			put_page(virt_to_page(pte));
-		}
-	} while (pte++, addr += PAGE_SIZE, addr != end);
-
-	if (hyp_pte_table_empty(start_pte))
-		clear_hyp_pmd_entry(pmd);
-}
-
-static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
-{
-	phys_addr_t next;
-	pmd_t *pmd, *start_pmd;
-
-	start_pmd = pmd = pmd_offset(pud, addr);
-	do {
-		next = pmd_addr_end(addr, end);
-		/* Hyp doesn't use huge pmds */
-		if (!pmd_none(*pmd))
-			unmap_hyp_ptes(pmd, addr, next);
-	} while (pmd++, addr = next, addr != end);
-
-	if (hyp_pmd_table_empty(start_pmd))
-		clear_hyp_pud_entry(pud);
-}
-
-static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
-{
-	phys_addr_t next;
-	pud_t *pud, *start_pud;
-
-	start_pud = pud = pud_offset(pgd, addr);
-	do {
-		next = pud_addr_end(addr, end);
-		/* Hyp doesn't use huge puds */
-		if (!pud_none(*pud))
-			unmap_hyp_pmds(pud, addr, next);
-	} while (pud++, addr = next, addr != end);
-
-	if (hyp_pud_table_empty(start_pud))
-		clear_hyp_pgd_entry(pgd);
-}
-
-static unsigned int kvm_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd)
-{
-	return (addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1);
-}
-
-static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd,
-			      phys_addr_t start, u64 size)
-{
-	pgd_t *pgd;
-	phys_addr_t addr = start, end = start + size;
-	phys_addr_t next;
-
-	/*
-	 * We don't unmap anything from HYP, except at the hyp tear down.
-	 * Hence, we don't have to invalidate the TLBs here.
-	 */
-	pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
-	do {
-		next = pgd_addr_end(addr, end);
-		if (!pgd_none(*pgd))
-			unmap_hyp_puds(pgd, addr, next);
-	} while (pgd++, addr = next, addr != end);
-}
-
-static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
-{
-	__unmap_hyp_range(pgdp, PTRS_PER_PGD, start, size);
-}
-
-static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size)
-{
-	__unmap_hyp_range(pgdp, __kvm_idmap_ptrs_per_pgd(), start, size);
-}
-
-/**
- * free_hyp_pgds - free Hyp-mode page tables
- *
- * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
- * therefore contains either mappings in the kernel memory area (above
- * PAGE_OFFSET), or device mappings in the idmap range.
- *
- * boot_hyp_pgd should only map the idmap range, and is only used in
- * the extended idmap case.
- */
-void free_hyp_pgds(void)
-{
-	pgd_t *id_pgd;
-
-	mutex_lock(&kvm_hyp_pgd_mutex);
-
-	id_pgd = boot_hyp_pgd ? boot_hyp_pgd : hyp_pgd;
-
-	if (id_pgd) {
-		/* In case we never called hyp_mmu_init() */
-		if (!io_map_base)
-			io_map_base = hyp_idmap_start;
-		unmap_hyp_idmap_range(id_pgd, io_map_base,
-				      hyp_idmap_start + PAGE_SIZE - io_map_base);
-	}
-
-	if (boot_hyp_pgd) {
-		free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
-		boot_hyp_pgd = NULL;
-	}
-
-	if (hyp_pgd) {
-		unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET),
-				(uintptr_t)high_memory - PAGE_OFFSET);
-
-		free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
-		hyp_pgd = NULL;
-	}
-	if (merged_hyp_pgd) {
-		clear_page(merged_hyp_pgd);
-		free_page((unsigned long)merged_hyp_pgd);
-		merged_hyp_pgd = NULL;
-	}
-
-	mutex_unlock(&kvm_hyp_pgd_mutex);
-}
-
-static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
-				    unsigned long end, unsigned long pfn,
-				    pgprot_t prot)
-{
-	pte_t *pte;
-	unsigned long addr;
-
-	addr = start;
-	do {
-		pte = pte_offset_kernel(pmd, addr);
-		kvm_set_pte(pte, kvm_pfn_pte(pfn, prot));
-		get_page(virt_to_page(pte));
-		pfn++;
-	} while (addr += PAGE_SIZE, addr != end);
-}
-
-static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
-				   unsigned long end, unsigned long pfn,
-				   pgprot_t prot)
-{
-	pmd_t *pmd;
-	pte_t *pte;
-	unsigned long addr, next;
-
-	addr = start;
-	do {
-		pmd = pmd_offset(pud, addr);
-
-		BUG_ON(pmd_sect(*pmd));
-
-		if (pmd_none(*pmd)) {
-			pte = pte_alloc_one_kernel(NULL);
-			if (!pte) {
-				kvm_err("Cannot allocate Hyp pte\n");
-				return -ENOMEM;
-			}
-			kvm_pmd_populate(pmd, pte);
-			get_page(virt_to_page(pmd));
-		}
-
-		next = pmd_addr_end(addr, end);
-
-		create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
-		pfn += (next - addr) >> PAGE_SHIFT;
-	} while (addr = next, addr != end);
-
-	return 0;
-}
-
-static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start,
-				   unsigned long end, unsigned long pfn,
-				   pgprot_t prot)
-{
-	pud_t *pud;
-	pmd_t *pmd;
-	unsigned long addr, next;
-	int ret;
-
-	addr = start;
-	do {
-		pud = pud_offset(pgd, addr);
-
-		if (pud_none_or_clear_bad(pud)) {
-			pmd = pmd_alloc_one(NULL, addr);
-			if (!pmd) {
-				kvm_err("Cannot allocate Hyp pmd\n");
-				return -ENOMEM;
-			}
-			kvm_pud_populate(pud, pmd);
-			get_page(virt_to_page(pud));
-		}
-
-		next = pud_addr_end(addr, end);
-		ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
-		if (ret)
-			return ret;
-		pfn += (next - addr) >> PAGE_SHIFT;
-	} while (addr = next, addr != end);
-
-	return 0;
-}
-
-static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd,
-				 unsigned long start, unsigned long end,
-				 unsigned long pfn, pgprot_t prot)
-{
-	pgd_t *pgd;
-	pud_t *pud;
-	unsigned long addr, next;
-	int err = 0;
-
-	mutex_lock(&kvm_hyp_pgd_mutex);
-	addr = start & PAGE_MASK;
-	end = PAGE_ALIGN(end);
-	do {
-		pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
-
-		if (pgd_none(*pgd)) {
-			pud = pud_alloc_one(NULL, addr);
-			if (!pud) {
-				kvm_err("Cannot allocate Hyp pud\n");
-				err = -ENOMEM;
-				goto out;
-			}
-			kvm_pgd_populate(pgd, pud);
-			get_page(virt_to_page(pgd));
-		}
-
-		next = pgd_addr_end(addr, end);
-		err = create_hyp_pud_mappings(pgd, addr, next, pfn, prot);
-		if (err)
-			goto out;
-		pfn += (next - addr) >> PAGE_SHIFT;
-	} while (addr = next, addr != end);
-out:
-	mutex_unlock(&kvm_hyp_pgd_mutex);
-	return err;
-}
-
-static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
-{
-	if (!is_vmalloc_addr(kaddr)) {
-		BUG_ON(!virt_addr_valid(kaddr));
-		return __pa(kaddr);
-	} else {
-		return page_to_phys(vmalloc_to_page(kaddr)) +
-		       offset_in_page(kaddr);
-	}
-}
-
-/**
- * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
- * @from:	The virtual kernel start address of the range
- * @to:		The virtual kernel end address of the range (exclusive)
- * @prot:	The protection to be applied to this range
- *
- * The same virtual address as the kernel virtual address is also used
- * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
- * physical pages.
- */
-int create_hyp_mappings(void *from, void *to, pgprot_t prot)
-{
-	phys_addr_t phys_addr;
-	unsigned long virt_addr;
-	unsigned long start = kern_hyp_va((unsigned long)from);
-	unsigned long end = kern_hyp_va((unsigned long)to);
-
-	if (is_kernel_in_hyp_mode())
-		return 0;
-
-	start = start & PAGE_MASK;
-	end = PAGE_ALIGN(end);
-
-	for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
-		int err;
-
-		phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
-		err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD,
-					    virt_addr, virt_addr + PAGE_SIZE,
-					    __phys_to_pfn(phys_addr),
-					    prot);
-		if (err)
-			return err;
-	}
-
-	return 0;
-}
-
-static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
-					unsigned long *haddr, pgprot_t prot)
-{
-	pgd_t *pgd = hyp_pgd;
-	unsigned long base;
-	int ret = 0;
-
-	mutex_lock(&kvm_hyp_pgd_mutex);
-
-	/*
-	 * This assumes that we we have enough space below the idmap
-	 * page to allocate our VAs. If not, the check below will
-	 * kick. A potential alternative would be to detect that
-	 * overflow and switch to an allocation above the idmap.
-	 *
-	 * The allocated size is always a multiple of PAGE_SIZE.
-	 */
-	size = PAGE_ALIGN(size + offset_in_page(phys_addr));
-	base = io_map_base - size;
-
-	/*
-	 * Verify that BIT(VA_BITS - 1) hasn't been flipped by
-	 * allocating the new area, as it would indicate we've
-	 * overflowed the idmap/IO address range.
-	 */
-	if ((base ^ io_map_base) & BIT(VA_BITS - 1))
-		ret = -ENOMEM;
-	else
-		io_map_base = base;
-
-	mutex_unlock(&kvm_hyp_pgd_mutex);
-
-	if (ret)
-		goto out;
-
-	if (__kvm_cpu_uses_extended_idmap())
-		pgd = boot_hyp_pgd;
-
-	ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
-				    base, base + size,
-				    __phys_to_pfn(phys_addr), prot);
-	if (ret)
-		goto out;
-
-	*haddr = base + offset_in_page(phys_addr);
-
-out:
-	return ret;
-}
-
-/**
- * create_hyp_io_mappings - Map IO into both kernel and HYP
- * @phys_addr:	The physical start address which gets mapped
- * @size:	Size of the region being mapped
- * @kaddr:	Kernel VA for this mapping
- * @haddr:	HYP VA for this mapping
- */
-int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
-			   void __iomem **kaddr,
-			   void __iomem **haddr)
-{
-	unsigned long addr;
-	int ret;
-
-	*kaddr = ioremap(phys_addr, size);
-	if (!*kaddr)
-		return -ENOMEM;
-
-	if (is_kernel_in_hyp_mode()) {
-		*haddr = *kaddr;
-		return 0;
-	}
-
-	ret = __create_hyp_private_mapping(phys_addr, size,
-					   &addr, PAGE_HYP_DEVICE);
-	if (ret) {
-		iounmap(*kaddr);
-		*kaddr = NULL;
-		*haddr = NULL;
-		return ret;
-	}
-
-	*haddr = (void __iomem *)addr;
-	return 0;
-}
-
-/**
- * create_hyp_exec_mappings - Map an executable range into HYP
- * @phys_addr:	The physical start address which gets mapped
- * @size:	Size of the region being mapped
- * @haddr:	HYP VA for this mapping
- */
-int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
-			     void **haddr)
-{
-	unsigned long addr;
-	int ret;
-
-	BUG_ON(is_kernel_in_hyp_mode());
-
-	ret = __create_hyp_private_mapping(phys_addr, size,
-					   &addr, PAGE_HYP_EXEC);
-	if (ret) {
-		*haddr = NULL;
-		return ret;
-	}
-
-	*haddr = (void *)addr;
-	return 0;
-}
-
-/**
- * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
- * @kvm:	The KVM struct pointer for the VM.
- *
- * Allocates only the stage-2 HW PGD level table(s) (can support either full
- * 40-bit input addresses or limited to 32-bit input addresses). Clears the
- * allocated pages.
- *
- * Note we don't need locking here as this is only called when the VM is
- * created, which can only be done once.
- */
-int kvm_alloc_stage2_pgd(struct kvm *kvm)
-{
-	pgd_t *pgd;
-
-	if (kvm->arch.pgd != NULL) {
-		kvm_err("kvm_arch already initialized?\n");
-		return -EINVAL;
-	}
-
-	/* Allocate the HW PGD, making sure that each page gets its own refcount */
-	pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO);
-	if (!pgd)
-		return -ENOMEM;
-
-	kvm->arch.pgd = pgd;
-	return 0;
-}
-
-static void stage2_unmap_memslot(struct kvm *kvm,
-				 struct kvm_memory_slot *memslot)
-{
-	hva_t hva = memslot->userspace_addr;
-	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
-	phys_addr_t size = PAGE_SIZE * memslot->npages;
-	hva_t reg_end = hva + size;
-
-	/*
-	 * A memory region could potentially cover multiple VMAs, and any holes
-	 * between them, so iterate over all of them to find out if we should
-	 * unmap any of them.
-	 *
-	 *     +--------------------------------------------+
-	 * +---------------+----------------+   +----------------+
-	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
-	 * +---------------+----------------+   +----------------+
-	 *     |               memory region                |
-	 *     +--------------------------------------------+
-	 */
-	do {
-		struct vm_area_struct *vma = find_vma(current->mm, hva);
-		hva_t vm_start, vm_end;
-
-		if (!vma || vma->vm_start >= reg_end)
-			break;
-
-		/*
-		 * Take the intersection of this VMA with the memory region
-		 */
-		vm_start = max(hva, vma->vm_start);
-		vm_end = min(reg_end, vma->vm_end);
-
-		if (!(vma->vm_flags & VM_PFNMAP)) {
-			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
-			unmap_stage2_range(kvm, gpa, vm_end - vm_start);
-		}
-		hva = vm_end;
-	} while (hva < reg_end);
-}
-
-/**
- * stage2_unmap_vm - Unmap Stage-2 RAM mappings
- * @kvm: The struct kvm pointer
- *
- * Go through the memregions and unmap any reguler RAM
- * backing memory already mapped to the VM.
- */
-void stage2_unmap_vm(struct kvm *kvm)
-{
-	struct kvm_memslots *slots;
-	struct kvm_memory_slot *memslot;
-	int idx;
-
-	idx = srcu_read_lock(&kvm->srcu);
-	down_read(&current->mm->mmap_sem);
-	spin_lock(&kvm->mmu_lock);
-
-	slots = kvm_memslots(kvm);
-	kvm_for_each_memslot(memslot, slots)
-		stage2_unmap_memslot(kvm, memslot);
-
-	spin_unlock(&kvm->mmu_lock);
-	up_read(&current->mm->mmap_sem);
-	srcu_read_unlock(&kvm->srcu, idx);
-}
-
-/**
- * kvm_free_stage2_pgd - free all stage-2 tables
- * @kvm:	The KVM struct pointer for the VM.
- *
- * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all
- * underlying level-2 and level-3 tables before freeing the actual level-1 table
- * and setting the struct pointer to NULL.
- */
-void kvm_free_stage2_pgd(struct kvm *kvm)
-{
-	void *pgd = NULL;
-
-	spin_lock(&kvm->mmu_lock);
-	if (kvm->arch.pgd) {
-		unmap_stage2_range(kvm, 0, kvm_phys_size(kvm));
-		pgd = READ_ONCE(kvm->arch.pgd);
-		kvm->arch.pgd = NULL;
-	}
-	spin_unlock(&kvm->mmu_lock);
-
-	/* Free the HW pgd, one page at a time */
-	if (pgd)
-		free_pages_exact(pgd, stage2_pgd_size(kvm));
-}
-
-static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
-			     phys_addr_t addr)
-{
-	pgd_t *pgd;
-	pud_t *pud;
-
-	pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
-	if (stage2_pgd_none(kvm, *pgd)) {
-		if (!cache)
-			return NULL;
-		pud = mmu_memory_cache_alloc(cache);
-		stage2_pgd_populate(kvm, pgd, pud);
-		get_page(virt_to_page(pgd));
-	}
-
-	return stage2_pud_offset(kvm, pgd, addr);
-}
-
-static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
-			     phys_addr_t addr)
-{
-	pud_t *pud;
-	pmd_t *pmd;
-
-	pud = stage2_get_pud(kvm, cache, addr);
-	if (!pud || stage2_pud_huge(kvm, *pud))
-		return NULL;
-
-	if (stage2_pud_none(kvm, *pud)) {
-		if (!cache)
-			return NULL;
-		pmd = mmu_memory_cache_alloc(cache);
-		stage2_pud_populate(kvm, pud, pmd);
-		get_page(virt_to_page(pud));
-	}
-
-	return stage2_pmd_offset(kvm, pud, addr);
-}
-
-static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
-			       *cache, phys_addr_t addr, const pmd_t *new_pmd)
-{
-	pmd_t *pmd, old_pmd;
-
-	pmd = stage2_get_pmd(kvm, cache, addr);
-	VM_BUG_ON(!pmd);
-
-	old_pmd = *pmd;
-	if (pmd_present(old_pmd)) {
-		/*
-		 * Multiple vcpus faulting on the same PMD entry, can
-		 * lead to them sequentially updating the PMD with the
-		 * same value. Following the break-before-make
-		 * (pmd_clear() followed by tlb_flush()) process can
-		 * hinder forward progress due to refaults generated
-		 * on missing translations.
-		 *
-		 * Skip updating the page table if the entry is
-		 * unchanged.
-		 */
-		if (pmd_val(old_pmd) == pmd_val(*new_pmd))
-			return 0;
-
-		/*
-		 * Mapping in huge pages should only happen through a
-		 * fault.  If a page is merged into a transparent huge
-		 * page, the individual subpages of that huge page
-		 * should be unmapped through MMU notifiers before we
-		 * get here.
-		 *
-		 * Merging of CompoundPages is not supported; they
-		 * should become splitting first, unmapped, merged,
-		 * and mapped back in on-demand.
-		 */
-		VM_BUG_ON(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
-
-		pmd_clear(pmd);
-		kvm_tlb_flush_vmid_ipa(kvm, addr);
-	} else {
-		get_page(virt_to_page(pmd));
-	}
-
-	kvm_set_pmd(pmd, *new_pmd);
-	return 0;
-}
-
-static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
-			       phys_addr_t addr, const pud_t *new_pudp)
-{
-	pud_t *pudp, old_pud;
-
-	pudp = stage2_get_pud(kvm, cache, addr);
-	VM_BUG_ON(!pudp);
-
-	old_pud = *pudp;
-
-	/*
-	 * A large number of vcpus faulting on the same stage 2 entry,
-	 * can lead to a refault due to the
-	 * stage2_pud_clear()/tlb_flush(). Skip updating the page
-	 * tables if there is no change.
-	 */
-	if (pud_val(old_pud) == pud_val(*new_pudp))
-		return 0;
-
-	if (stage2_pud_present(kvm, old_pud)) {
-		stage2_pud_clear(kvm, pudp);
-		kvm_tlb_flush_vmid_ipa(kvm, addr);
-	} else {
-		get_page(virt_to_page(pudp));
-	}
-
-	kvm_set_pud(pudp, *new_pudp);
-	return 0;
-}
-
-/*
- * stage2_get_leaf_entry - walk the stage2 VM page tables and return
- * true if a valid and present leaf-entry is found. A pointer to the
- * leaf-entry is returned in the appropriate level variable - pudpp,
- * pmdpp, ptepp.
- */
-static bool stage2_get_leaf_entry(struct kvm *kvm, phys_addr_t addr,
-				  pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp)
-{
-	pud_t *pudp;
-	pmd_t *pmdp;
-	pte_t *ptep;
-
-	*pudpp = NULL;
-	*pmdpp = NULL;
-	*ptepp = NULL;
-
-	pudp = stage2_get_pud(kvm, NULL, addr);
-	if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp))
-		return false;
-
-	if (stage2_pud_huge(kvm, *pudp)) {
-		*pudpp = pudp;
-		return true;
-	}
-
-	pmdp = stage2_pmd_offset(kvm, pudp, addr);
-	if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp))
-		return false;
-
-	if (pmd_thp_or_huge(*pmdp)) {
-		*pmdpp = pmdp;
-		return true;
-	}
-
-	ptep = pte_offset_kernel(pmdp, addr);
-	if (!ptep || pte_none(*ptep) || !pte_present(*ptep))
-		return false;
-
-	*ptepp = ptep;
-	return true;
-}
-
-static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr)
-{
-	pud_t *pudp;
-	pmd_t *pmdp;
-	pte_t *ptep;
-	bool found;
-
-	found = stage2_get_leaf_entry(kvm, addr, &pudp, &pmdp, &ptep);
-	if (!found)
-		return false;
-
-	if (pudp)
-		return kvm_s2pud_exec(pudp);
-	else if (pmdp)
-		return kvm_s2pmd_exec(pmdp);
-	else
-		return kvm_s2pte_exec(ptep);
-}
-
-static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
-			  phys_addr_t addr, const pte_t *new_pte,
-			  unsigned long flags)
-{
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte, old_pte;
-	bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
-	bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
-
-	VM_BUG_ON(logging_active && !cache);
-
-	/* Create stage-2 page table mapping - Levels 0 and 1 */
-	pud = stage2_get_pud(kvm, cache, addr);
-	if (!pud) {
-		/*
-		 * Ignore calls from kvm_set_spte_hva for unallocated
-		 * address ranges.
-		 */
-		return 0;
-	}
-
-	/*
-	 * While dirty page logging - dissolve huge PUD, then continue
-	 * on to allocate page.
-	 */
-	if (logging_active)
-		stage2_dissolve_pud(kvm, addr, pud);
-
-	if (stage2_pud_none(kvm, *pud)) {
-		if (!cache)
-			return 0; /* ignore calls from kvm_set_spte_hva */
-		pmd = mmu_memory_cache_alloc(cache);
-		stage2_pud_populate(kvm, pud, pmd);
-		get_page(virt_to_page(pud));
-	}
-
-	pmd = stage2_pmd_offset(kvm, pud, addr);
-	if (!pmd) {
-		/*
-		 * Ignore calls from kvm_set_spte_hva for unallocated
-		 * address ranges.
-		 */
-		return 0;
-	}
-
-	/*
-	 * While dirty page logging - dissolve huge PMD, then continue on to
-	 * allocate page.
-	 */
-	if (logging_active)
-		stage2_dissolve_pmd(kvm, addr, pmd);
-
-	/* Create stage-2 page mappings - Level 2 */
-	if (pmd_none(*pmd)) {
-		if (!cache)
-			return 0; /* ignore calls from kvm_set_spte_hva */
-		pte = mmu_memory_cache_alloc(cache);
-		kvm_pmd_populate(pmd, pte);
-		get_page(virt_to_page(pmd));
-	}
-
-	pte = pte_offset_kernel(pmd, addr);
-
-	if (iomap && pte_present(*pte))
-		return -EFAULT;
-
-	/* Create 2nd stage page table mapping - Level 3 */
-	old_pte = *pte;
-	if (pte_present(old_pte)) {
-		/* Skip page table update if there is no change */
-		if (pte_val(old_pte) == pte_val(*new_pte))
-			return 0;
-
-		kvm_set_pte(pte, __pte(0));
-		kvm_tlb_flush_vmid_ipa(kvm, addr);
-	} else {
-		get_page(virt_to_page(pte));
-	}
-
-	kvm_set_pte(pte, *new_pte);
-	return 0;
-}
-
-#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-static int stage2_ptep_test_and_clear_young(pte_t *pte)
-{
-	if (pte_young(*pte)) {
-		*pte = pte_mkold(*pte);
-		return 1;
-	}
-	return 0;
-}
-#else
-static int stage2_ptep_test_and_clear_young(pte_t *pte)
-{
-	return __ptep_test_and_clear_young(pte);
-}
-#endif
-
-static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
-{
-	return stage2_ptep_test_and_clear_young((pte_t *)pmd);
-}
-
-static int stage2_pudp_test_and_clear_young(pud_t *pud)
-{
-	return stage2_ptep_test_and_clear_young((pte_t *)pud);
-}
-
-/**
- * kvm_phys_addr_ioremap - map a device range to guest IPA
- *
- * @kvm:	The KVM pointer
- * @guest_ipa:	The IPA at which to insert the mapping
- * @pa:		The physical address of the device
- * @size:	The size of the mapping
- */
-int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
-			  phys_addr_t pa, unsigned long size, bool writable)
-{
-	phys_addr_t addr, end;
-	int ret = 0;
-	unsigned long pfn;
-	struct kvm_mmu_memory_cache cache = { 0, };
-
-	end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
-	pfn = __phys_to_pfn(pa);
-
-	for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
-		pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE);
-
-		if (writable)
-			pte = kvm_s2pte_mkwrite(pte);
-
-		ret = mmu_topup_memory_cache(&cache,
-					     kvm_mmu_cache_min_pages(kvm),
-					     KVM_NR_MEM_OBJS);
-		if (ret)
-			goto out;
-		spin_lock(&kvm->mmu_lock);
-		ret = stage2_set_pte(kvm, &cache, addr, &pte,
-						KVM_S2PTE_FLAG_IS_IOMAP);
-		spin_unlock(&kvm->mmu_lock);
-		if (ret)
-			goto out;
-
-		pfn++;
-	}
-
-out:
-	mmu_free_memory_cache(&cache);
-	return ret;
-}
-
-static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
-{
-	kvm_pfn_t pfn = *pfnp;
-	gfn_t gfn = *ipap >> PAGE_SHIFT;
-	struct page *page = pfn_to_page(pfn);
-
-	/*
-	 * PageTransCompoundMap() returns true for THP and
-	 * hugetlbfs. Make sure the adjustment is done only for THP
-	 * pages.
-	 */
-	if (!PageHuge(page) && PageTransCompoundMap(page)) {
-		unsigned long mask;
-		/*
-		 * The address we faulted on is backed by a transparent huge
-		 * page.  However, because we map the compound huge page and
-		 * not the individual tail page, we need to transfer the
-		 * refcount to the head page.  We have to be careful that the
-		 * THP doesn't start to split while we are adjusting the
-		 * refcounts.
-		 *
-		 * We are sure this doesn't happen, because mmu_notifier_retry
-		 * was successful and we are holding the mmu_lock, so if this
-		 * THP is trying to split, it will be blocked in the mmu
-		 * notifier before touching any of the pages, specifically
-		 * before being able to call __split_huge_page_refcount().
-		 *
-		 * We can therefore safely transfer the refcount from PG_tail
-		 * to PG_head and switch the pfn from a tail page to the head
-		 * page accordingly.
-		 */
-		mask = PTRS_PER_PMD - 1;
-		VM_BUG_ON((gfn & mask) != (pfn & mask));
-		if (pfn & mask) {
-			*ipap &= PMD_MASK;
-			kvm_release_pfn_clean(pfn);
-			pfn &= ~mask;
-			kvm_get_pfn(pfn);
-			*pfnp = pfn;
-		}
-
-		return true;
-	}
-
-	return false;
-}
-
-static bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
-{
-	if (kvm_vcpu_trap_is_iabt(vcpu))
-		return false;
-
-	return kvm_vcpu_dabt_iswrite(vcpu);
-}
-
-/**
- * stage2_wp_ptes - write protect PMD range
- * @pmd:	pointer to pmd entry
- * @addr:	range start address
- * @end:	range end address
- */
-static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
-{
-	pte_t *pte;
-
-	pte = pte_offset_kernel(pmd, addr);
-	do {
-		if (!pte_none(*pte)) {
-			if (!kvm_s2pte_readonly(pte))
-				kvm_set_s2pte_readonly(pte);
-		}
-	} while (pte++, addr += PAGE_SIZE, addr != end);
-}
-
-/**
- * stage2_wp_pmds - write protect PUD range
- * kvm:		kvm instance for the VM
- * @pud:	pointer to pud entry
- * @addr:	range start address
- * @end:	range end address
- */
-static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud,
-			   phys_addr_t addr, phys_addr_t end)
-{
-	pmd_t *pmd;
-	phys_addr_t next;
-
-	pmd = stage2_pmd_offset(kvm, pud, addr);
-
-	do {
-		next = stage2_pmd_addr_end(kvm, addr, end);
-		if (!pmd_none(*pmd)) {
-			if (pmd_thp_or_huge(*pmd)) {
-				if (!kvm_s2pmd_readonly(pmd))
-					kvm_set_s2pmd_readonly(pmd);
-			} else {
-				stage2_wp_ptes(pmd, addr, next);
-			}
-		}
-	} while (pmd++, addr = next, addr != end);
-}
-
-/**
-  * stage2_wp_puds - write protect PGD range
-  * @pgd:	pointer to pgd entry
-  * @addr:	range start address
-  * @end:	range end address
-  *
-  * Process PUD entries, for a huge PUD we cause a panic.
-  */
-static void  stage2_wp_puds(struct kvm *kvm, pgd_t *pgd,
-			    phys_addr_t addr, phys_addr_t end)
-{
-	pud_t *pud;
-	phys_addr_t next;
-
-	pud = stage2_pud_offset(kvm, pgd, addr);
-	do {
-		next = stage2_pud_addr_end(kvm, addr, end);
-		if (!stage2_pud_none(kvm, *pud)) {
-			if (stage2_pud_huge(kvm, *pud)) {
-				if (!kvm_s2pud_readonly(pud))
-					kvm_set_s2pud_readonly(pud);
-			} else {
-				stage2_wp_pmds(kvm, pud, addr, next);
-			}
-		}
-	} while (pud++, addr = next, addr != end);
-}
-
-/**
- * stage2_wp_range() - write protect stage2 memory region range
- * @kvm:	The KVM pointer
- * @addr:	Start address of range
- * @end:	End address of range
- */
-static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
-{
-	pgd_t *pgd;
-	phys_addr_t next;
-
-	pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
-	do {
-		/*
-		 * Release kvm_mmu_lock periodically if the memory region is
-		 * large. Otherwise, we may see kernel panics with
-		 * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR,
-		 * CONFIG_LOCKDEP. Additionally, holding the lock too long
-		 * will also starve other vCPUs. We have to also make sure
-		 * that the page tables are not freed while we released
-		 * the lock.
-		 */
-		cond_resched_lock(&kvm->mmu_lock);
-		if (!READ_ONCE(kvm->arch.pgd))
-			break;
-		next = stage2_pgd_addr_end(kvm, addr, end);
-		if (stage2_pgd_present(kvm, *pgd))
-			stage2_wp_puds(kvm, pgd, addr, next);
-	} while (pgd++, addr = next, addr != end);
-}
-
-/**
- * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
- * @kvm:	The KVM pointer
- * @slot:	The memory slot to write protect
- *
- * Called to start logging dirty pages after memory region
- * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
- * all present PUD, PMD and PTEs are write protected in the memory region.
- * Afterwards read of dirty page log can be called.
- *
- * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
- * serializing operations for VM memory regions.
- */
-void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
-{
-	struct kvm_memslots *slots = kvm_memslots(kvm);
-	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
-	phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
-	phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
-
-	spin_lock(&kvm->mmu_lock);
-	stage2_wp_range(kvm, start, end);
-	spin_unlock(&kvm->mmu_lock);
-	kvm_flush_remote_tlbs(kvm);
-}
-
-/**
- * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
- * @kvm:	The KVM pointer
- * @slot:	The memory slot associated with mask
- * @gfn_offset:	The gfn offset in memory slot
- * @mask:	The mask of dirty pages at offset 'gfn_offset' in this memory
- *		slot to be write protected
- *
- * Walks bits set in mask write protects the associated pte's. Caller must
- * acquire kvm_mmu_lock.
- */
-static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
-		struct kvm_memory_slot *slot,
-		gfn_t gfn_offset, unsigned long mask)
-{
-	phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
-	phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
-	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
-
-	stage2_wp_range(kvm, start, end);
-}
-
-/*
- * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
- * dirty pages.
- *
- * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
- * enable dirty logging for them.
- */
-void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
-		struct kvm_memory_slot *slot,
-		gfn_t gfn_offset, unsigned long mask)
-{
-	kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
-}
-
-static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
-{
-	__clean_dcache_guest_page(pfn, size);
-}
-
-static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
-{
-	__invalidate_icache_guest_page(pfn, size);
-}
-
-static void kvm_send_hwpoison_signal(unsigned long address,
-				     struct vm_area_struct *vma)
-{
-	short lsb;
-
-	if (is_vm_hugetlb_page(vma))
-		lsb = huge_page_shift(hstate_vma(vma));
-	else
-		lsb = PAGE_SHIFT;
-
-	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
-}
-
-static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot,
-					       unsigned long hva)
-{
-	gpa_t gpa_start, gpa_end;
-	hva_t uaddr_start, uaddr_end;
-	size_t size;
-
-	size = memslot->npages * PAGE_SIZE;
-
-	gpa_start = memslot->base_gfn << PAGE_SHIFT;
-	gpa_end = gpa_start + size;
-
-	uaddr_start = memslot->userspace_addr;
-	uaddr_end = uaddr_start + size;
-
-	/*
-	 * Pages belonging to memslots that don't have the same alignment
-	 * within a PMD for userspace and IPA cannot be mapped with stage-2
-	 * PMD entries, because we'll end up mapping the wrong pages.
-	 *
-	 * Consider a layout like the following:
-	 *
-	 *    memslot->userspace_addr:
-	 *    +-----+--------------------+--------------------+---+
-	 *    |abcde|fgh  Stage-1 PMD    |    Stage-1 PMD   tv|xyz|
-	 *    +-----+--------------------+--------------------+---+
-	 *
-	 *    memslot->base_gfn << PAGE_SIZE:
-	 *      +---+--------------------+--------------------+-----+
-	 *      |abc|def  Stage-2 PMD    |    Stage-2 PMD     |tvxyz|
-	 *      +---+--------------------+--------------------+-----+
-	 *
-	 * If we create those stage-2 PMDs, we'll end up with this incorrect
-	 * mapping:
-	 *   d -> f
-	 *   e -> g
-	 *   f -> h
-	 */
-	if ((gpa_start & ~S2_PMD_MASK) != (uaddr_start & ~S2_PMD_MASK))
-		return false;
-
-	/*
-	 * Next, let's make sure we're not trying to map anything not covered
-	 * by the memslot. This means we have to prohibit PMD size mappings
-	 * for the beginning and end of a non-PMD aligned and non-PMD sized
-	 * memory slot (illustrated by the head and tail parts of the
-	 * userspace view above containing pages 'abcde' and 'xyz',
-	 * respectively).
-	 *
-	 * Note that it doesn't matter if we do the check using the
-	 * userspace_addr or the base_gfn, as both are equally aligned (per
-	 * the check above) and equally sized.
-	 */
-	return (hva & S2_PMD_MASK) >= uaddr_start &&
-	       (hva & S2_PMD_MASK) + S2_PMD_SIZE <= uaddr_end;
-}
-
-static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
-			  struct kvm_memory_slot *memslot, unsigned long hva,
-			  unsigned long fault_status)
-{
-	int ret;
-	bool write_fault, writable, force_pte = false;
-	bool exec_fault, needs_exec;
-	unsigned long mmu_seq;
-	gfn_t gfn = fault_ipa >> PAGE_SHIFT;
-	struct kvm *kvm = vcpu->kvm;
-	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
-	struct vm_area_struct *vma;
-	kvm_pfn_t pfn;
-	pgprot_t mem_type = PAGE_S2;
-	bool logging_active = memslot_is_logging(memslot);
-	unsigned long vma_pagesize, flags = 0;
-
-	write_fault = kvm_is_write_fault(vcpu);
-	exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
-	VM_BUG_ON(write_fault && exec_fault);
-
-	if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
-		kvm_err("Unexpected L2 read permission error\n");
-		return -EFAULT;
-	}
-
-	if (!fault_supports_stage2_pmd_mappings(memslot, hva))
-		force_pte = true;
-
-	if (logging_active)
-		force_pte = true;
-
-	/* Let's check if we will get back a huge page backed by hugetlbfs */
-	down_read(&current->mm->mmap_sem);
-	vma = find_vma_intersection(current->mm, hva, hva + 1);
-	if (unlikely(!vma)) {
-		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
-		up_read(&current->mm->mmap_sem);
-		return -EFAULT;
-	}
-
-	vma_pagesize = vma_kernel_pagesize(vma);
-	/*
-	 * PUD level may not exist for a VM but PMD is guaranteed to
-	 * exist.
-	 */
-	if ((vma_pagesize == PMD_SIZE ||
-	     (vma_pagesize == PUD_SIZE && kvm_stage2_has_pud(kvm))) &&
-	    !force_pte) {
-		gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
-	}
-	up_read(&current->mm->mmap_sem);
-
-	/* We need minimum second+third level pages */
-	ret = mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm),
-				     KVM_NR_MEM_OBJS);
-	if (ret)
-		return ret;
-
-	mmu_seq = vcpu->kvm->mmu_notifier_seq;
-	/*
-	 * Ensure the read of mmu_notifier_seq happens before we call
-	 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
-	 * the page we just got a reference to gets unmapped before we have a
-	 * chance to grab the mmu_lock, which ensure that if the page gets
-	 * unmapped afterwards, the call to kvm_unmap_hva will take it away
-	 * from us again properly. This smp_rmb() interacts with the smp_wmb()
-	 * in kvm_mmu_notifier_invalidate_<page|range_end>.
-	 */
-	smp_rmb();
-
-	pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
-	if (pfn == KVM_PFN_ERR_HWPOISON) {
-		kvm_send_hwpoison_signal(hva, vma);
-		return 0;
-	}
-	if (is_error_noslot_pfn(pfn))
-		return -EFAULT;
-
-	if (kvm_is_device_pfn(pfn)) {
-		mem_type = PAGE_S2_DEVICE;
-		flags |= KVM_S2PTE_FLAG_IS_IOMAP;
-	} else if (logging_active) {
-		/*
-		 * Faults on pages in a memslot with logging enabled
-		 * should not be mapped with huge pages (it introduces churn
-		 * and performance degradation), so force a pte mapping.
-		 */
-		flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
-
-		/*
-		 * Only actually map the page as writable if this was a write
-		 * fault.
-		 */
-		if (!write_fault)
-			writable = false;
-	}
-
-	spin_lock(&kvm->mmu_lock);
-	if (mmu_notifier_retry(kvm, mmu_seq))
-		goto out_unlock;
-
-	if (vma_pagesize == PAGE_SIZE && !force_pte) {
-		/*
-		 * Only PMD_SIZE transparent hugepages(THP) are
-		 * currently supported. This code will need to be
-		 * updated to support other THP sizes.
-		 */
-		if (transparent_hugepage_adjust(&pfn, &fault_ipa))
-			vma_pagesize = PMD_SIZE;
-	}
-
-	if (writable)
-		kvm_set_pfn_dirty(pfn);
-
-	if (fault_status != FSC_PERM)
-		clean_dcache_guest_page(pfn, vma_pagesize);
-
-	if (exec_fault)
-		invalidate_icache_guest_page(pfn, vma_pagesize);
-
-	/*
-	 * If we took an execution fault we have made the
-	 * icache/dcache coherent above and should now let the s2
-	 * mapping be executable.
-	 *
-	 * Write faults (!exec_fault && FSC_PERM) are orthogonal to
-	 * execute permissions, and we preserve whatever we have.
-	 */
-	needs_exec = exec_fault ||
-		(fault_status == FSC_PERM && stage2_is_exec(kvm, fault_ipa));
-
-	if (vma_pagesize == PUD_SIZE) {
-		pud_t new_pud = kvm_pfn_pud(pfn, mem_type);
-
-		new_pud = kvm_pud_mkhuge(new_pud);
-		if (writable)
-			new_pud = kvm_s2pud_mkwrite(new_pud);
-
-		if (needs_exec)
-			new_pud = kvm_s2pud_mkexec(new_pud);
-
-		ret = stage2_set_pud_huge(kvm, memcache, fault_ipa, &new_pud);
-	} else if (vma_pagesize == PMD_SIZE) {
-		pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type);
-
-		new_pmd = kvm_pmd_mkhuge(new_pmd);
-
-		if (writable)
-			new_pmd = kvm_s2pmd_mkwrite(new_pmd);
-
-		if (needs_exec)
-			new_pmd = kvm_s2pmd_mkexec(new_pmd);
-
-		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
-	} else {
-		pte_t new_pte = kvm_pfn_pte(pfn, mem_type);
-
-		if (writable) {
-			new_pte = kvm_s2pte_mkwrite(new_pte);
-			mark_page_dirty(kvm, gfn);
-		}
-
-		if (needs_exec)
-			new_pte = kvm_s2pte_mkexec(new_pte);
-
-		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
-	}
-
-out_unlock:
-	spin_unlock(&kvm->mmu_lock);
-	kvm_set_pfn_accessed(pfn);
-	kvm_release_pfn_clean(pfn);
-	return ret;
-}
-
-/*
- * Resolve the access fault by making the page young again.
- * Note that because the faulting entry is guaranteed not to be
- * cached in the TLB, we don't need to invalidate anything.
- * Only the HW Access Flag updates are supported for Stage 2 (no DBM),
- * so there is no need for atomic (pte|pmd)_mkyoung operations.
- */
-static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
-{
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-	kvm_pfn_t pfn;
-	bool pfn_valid = false;
-
-	trace_kvm_access_fault(fault_ipa);
-
-	spin_lock(&vcpu->kvm->mmu_lock);
-
-	if (!stage2_get_leaf_entry(vcpu->kvm, fault_ipa, &pud, &pmd, &pte))
-		goto out;
-
-	if (pud) {		/* HugeTLB */
-		*pud = kvm_s2pud_mkyoung(*pud);
-		pfn = kvm_pud_pfn(*pud);
-		pfn_valid = true;
-	} else	if (pmd) {	/* THP, HugeTLB */
-		*pmd = pmd_mkyoung(*pmd);
-		pfn = pmd_pfn(*pmd);
-		pfn_valid = true;
-	} else {
-		*pte = pte_mkyoung(*pte);	/* Just a page... */
-		pfn = pte_pfn(*pte);
-		pfn_valid = true;
-	}
-
-out:
-	spin_unlock(&vcpu->kvm->mmu_lock);
-	if (pfn_valid)
-		kvm_set_pfn_accessed(pfn);
-}
-
-/**
- * kvm_handle_guest_abort - handles all 2nd stage aborts
- * @vcpu:	the VCPU pointer
- * @run:	the kvm_run structure
- *
- * Any abort that gets to the host is almost guaranteed to be caused by a
- * missing second stage translation table entry, which can mean that either the
- * guest simply needs more memory and we must allocate an appropriate page or it
- * can mean that the guest tried to access I/O memory, which is emulated by user
- * space. The distinction is based on the IPA causing the fault and whether this
- * memory region has been registered as standard RAM by user space.
- */
-int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
-{
-	unsigned long fault_status;
-	phys_addr_t fault_ipa;
-	struct kvm_memory_slot *memslot;
-	unsigned long hva;
-	bool is_iabt, write_fault, writable;
-	gfn_t gfn;
-	int ret, idx;
-
-	fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
-
-	fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
-	is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
-
-	/* Synchronous External Abort? */
-	if (kvm_vcpu_dabt_isextabt(vcpu)) {
-		/*
-		 * For RAS the host kernel may handle this abort.
-		 * There is no need to pass the error into the guest.
-		 */
-		if (!handle_guest_sea(fault_ipa, kvm_vcpu_get_hsr(vcpu)))
-			return 1;
-
-		if (unlikely(!is_iabt)) {
-			kvm_inject_vabt(vcpu);
-			return 1;
-		}
-	}
-
-	trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu),
-			      kvm_vcpu_get_hfar(vcpu), fault_ipa);
-
-	/* Check the stage-2 fault is trans. fault or write fault */
-	if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
-	    fault_status != FSC_ACCESS) {
-		kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
-			kvm_vcpu_trap_get_class(vcpu),
-			(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
-			(unsigned long)kvm_vcpu_get_hsr(vcpu));
-		return -EFAULT;
-	}
-
-	idx = srcu_read_lock(&vcpu->kvm->srcu);
-
-	gfn = fault_ipa >> PAGE_SHIFT;
-	memslot = gfn_to_memslot(vcpu->kvm, gfn);
-	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
-	write_fault = kvm_is_write_fault(vcpu);
-	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
-		if (is_iabt) {
-			/* Prefetch Abort on I/O address */
-			kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
-			ret = 1;
-			goto out_unlock;
-		}
-
-		/*
-		 * Check for a cache maintenance operation. Since we
-		 * ended-up here, we know it is outside of any memory
-		 * slot. But we can't find out if that is for a device,
-		 * or if the guest is just being stupid. The only thing
-		 * we know for sure is that this range cannot be cached.
-		 *
-		 * So let's assume that the guest is just being
-		 * cautious, and skip the instruction.
-		 */
-		if (kvm_vcpu_dabt_is_cm(vcpu)) {
-			kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
-			ret = 1;
-			goto out_unlock;
-		}
-
-		/*
-		 * The IPA is reported as [MAX:12], so we need to
-		 * complement it with the bottom 12 bits from the
-		 * faulting VA. This is always 12 bits, irrespective
-		 * of the page size.
-		 */
-		fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
-		ret = io_mem_abort(vcpu, run, fault_ipa);
-		goto out_unlock;
-	}
-
-	/* Userspace should not be able to register out-of-bounds IPAs */
-	VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));
-
-	if (fault_status == FSC_ACCESS) {
-		handle_access_fault(vcpu, fault_ipa);
-		ret = 1;
-		goto out_unlock;
-	}
-
-	ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
-	if (ret == 0)
-		ret = 1;
-out_unlock:
-	srcu_read_unlock(&vcpu->kvm->srcu, idx);
-	return ret;
-}
-
-static int handle_hva_to_gpa(struct kvm *kvm,
-			     unsigned long start,
-			     unsigned long end,
-			     int (*handler)(struct kvm *kvm,
-					    gpa_t gpa, u64 size,
-					    void *data),
-			     void *data)
-{
-	struct kvm_memslots *slots;
-	struct kvm_memory_slot *memslot;
-	int ret = 0;
-
-	slots = kvm_memslots(kvm);
-
-	/* we only care about the pages that the guest sees */
-	kvm_for_each_memslot(memslot, slots) {
-		unsigned long hva_start, hva_end;
-		gfn_t gpa;
-
-		hva_start = max(start, memslot->userspace_addr);
-		hva_end = min(end, memslot->userspace_addr +
-					(memslot->npages << PAGE_SHIFT));
-		if (hva_start >= hva_end)
-			continue;
-
-		gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT;
-		ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data);
-	}
-
-	return ret;
-}
-
-static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
-{
-	unmap_stage2_range(kvm, gpa, size);
-	return 0;
-}
-
-int kvm_unmap_hva_range(struct kvm *kvm,
-			unsigned long start, unsigned long end)
-{
-	if (!kvm->arch.pgd)
-		return 0;
-
-	trace_kvm_unmap_hva_range(start, end);
-	handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
-	return 0;
-}
-
-static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
-{
-	pte_t *pte = (pte_t *)data;
-
-	WARN_ON(size != PAGE_SIZE);
-	/*
-	 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
-	 * flag clear because MMU notifiers will have unmapped a huge PMD before
-	 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
-	 * therefore stage2_set_pte() never needs to clear out a huge PMD
-	 * through this calling path.
-	 */
-	stage2_set_pte(kvm, NULL, gpa, pte, 0);
-	return 0;
-}
-
-
-int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
-{
-	unsigned long end = hva + PAGE_SIZE;
-	kvm_pfn_t pfn = pte_pfn(pte);
-	pte_t stage2_pte;
-
-	if (!kvm->arch.pgd)
-		return 0;
-
-	trace_kvm_set_spte_hva(hva);
-
-	/*
-	 * We've moved a page around, probably through CoW, so let's treat it
-	 * just like a translation fault and clean the cache to the PoC.
-	 */
-	clean_dcache_guest_page(pfn, PAGE_SIZE);
-	stage2_pte = kvm_pfn_pte(pfn, PAGE_S2);
-	handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
-
-	return 0;
-}
-
-static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
-{
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-
-	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
-	if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte))
-		return 0;
-
-	if (pud)
-		return stage2_pudp_test_and_clear_young(pud);
-	else if (pmd)
-		return stage2_pmdp_test_and_clear_young(pmd);
-	else
-		return stage2_ptep_test_and_clear_young(pte);
-}
-
-static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
-{
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-
-	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
-	if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte))
-		return 0;
-
-	if (pud)
-		return kvm_s2pud_young(*pud);
-	else if (pmd)
-		return pmd_young(*pmd);
-	else
-		return pte_young(*pte);
-}
-
-int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
-{
-	if (!kvm->arch.pgd)
-		return 0;
-	trace_kvm_age_hva(start, end);
-	return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
-}
-
-int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
-{
-	if (!kvm->arch.pgd)
-		return 0;
-	trace_kvm_test_age_hva(hva);
-	return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL);
-}
-
-void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
-{
-	mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
-}
-
-phys_addr_t kvm_mmu_get_httbr(void)
-{
-	if (__kvm_cpu_uses_extended_idmap())
-		return virt_to_phys(merged_hyp_pgd);
-	else
-		return virt_to_phys(hyp_pgd);
-}
-
-phys_addr_t kvm_get_idmap_vector(void)
-{
-	return hyp_idmap_vector;
-}
-
-static int kvm_map_idmap_text(pgd_t *pgd)
-{
-	int err;
-
-	/* Create the idmap in the boot page tables */
-	err = 	__create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
-				      hyp_idmap_start, hyp_idmap_end,
-				      __phys_to_pfn(hyp_idmap_start),
-				      PAGE_HYP_EXEC);
-	if (err)
-		kvm_err("Failed to idmap %lx-%lx\n",
-			hyp_idmap_start, hyp_idmap_end);
-
-	return err;
-}
-
-int kvm_mmu_init(void)
-{
-	int err;
-
-	hyp_idmap_start = kvm_virt_to_phys(__hyp_idmap_text_start);
-	hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
-	hyp_idmap_end = kvm_virt_to_phys(__hyp_idmap_text_end);
-	hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
-	hyp_idmap_vector = kvm_virt_to_phys(__kvm_hyp_init);
-
-	/*
-	 * We rely on the linker script to ensure at build time that the HYP
-	 * init code does not cross a page boundary.
-	 */
-	BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
-
-	kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
-	kvm_debug("HYP VA range: %lx:%lx\n",
-		  kern_hyp_va(PAGE_OFFSET),
-		  kern_hyp_va((unsigned long)high_memory - 1));
-
-	if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
-	    hyp_idmap_start <  kern_hyp_va((unsigned long)high_memory - 1) &&
-	    hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
-		/*
-		 * The idmap page is intersecting with the VA space,
-		 * it is not safe to continue further.
-		 */
-		kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
-		err = -EINVAL;
-		goto out;
-	}
-
-	hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
-	if (!hyp_pgd) {
-		kvm_err("Hyp mode PGD not allocated\n");
-		err = -ENOMEM;
-		goto out;
-	}
-
-	if (__kvm_cpu_uses_extended_idmap()) {
-		boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
-							 hyp_pgd_order);
-		if (!boot_hyp_pgd) {
-			kvm_err("Hyp boot PGD not allocated\n");
-			err = -ENOMEM;
-			goto out;
-		}
-
-		err = kvm_map_idmap_text(boot_hyp_pgd);
-		if (err)
-			goto out;
-
-		merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
-		if (!merged_hyp_pgd) {
-			kvm_err("Failed to allocate extra HYP pgd\n");
-			goto out;
-		}
-		__kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd,
-				    hyp_idmap_start);
-	} else {
-		err = kvm_map_idmap_text(hyp_pgd);
-		if (err)
-			goto out;
-	}
-
-	io_map_base = hyp_idmap_start;
-	return 0;
-out:
-	free_hyp_pgds();
-	return err;
-}
-
-void kvm_arch_commit_memory_region(struct kvm *kvm,
-				   const struct kvm_userspace_memory_region *mem,
-				   const struct kvm_memory_slot *old,
-				   const struct kvm_memory_slot *new,
-				   enum kvm_mr_change change)
-{
-	/*
-	 * At this point memslot has been committed and there is an
-	 * allocated dirty_bitmap[], dirty pages will be be tracked while the
-	 * memory slot is write protected.
-	 */
-	if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES)
-		kvm_mmu_wp_memory_region(kvm, mem->slot);
-}
-
-int kvm_arch_prepare_memory_region(struct kvm *kvm,
-				   struct kvm_memory_slot *memslot,
-				   const struct kvm_userspace_memory_region *mem,
-				   enum kvm_mr_change change)
-{
-	hva_t hva = mem->userspace_addr;
-	hva_t reg_end = hva + mem->memory_size;
-	bool writable = !(mem->flags & KVM_MEM_READONLY);
-	int ret = 0;
-
-	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
-			change != KVM_MR_FLAGS_ONLY)
-		return 0;
-
-	/*
-	 * Prevent userspace from creating a memory region outside of the IPA
-	 * space addressable by the KVM guest IPA space.
-	 */
-	if (memslot->base_gfn + memslot->npages >=
-	    (kvm_phys_size(kvm) >> PAGE_SHIFT))
-		return -EFAULT;
-
-	down_read(&current->mm->mmap_sem);
-	/*
-	 * A memory region could potentially cover multiple VMAs, and any holes
-	 * between them, so iterate over all of them to find out if we can map
-	 * any of them right now.
-	 *
-	 *     +--------------------------------------------+
-	 * +---------------+----------------+   +----------------+
-	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
-	 * +---------------+----------------+   +----------------+
-	 *     |               memory region                |
-	 *     +--------------------------------------------+
-	 */
-	do {
-		struct vm_area_struct *vma = find_vma(current->mm, hva);
-		hva_t vm_start, vm_end;
-
-		if (!vma || vma->vm_start >= reg_end)
-			break;
-
-		/*
-		 * Mapping a read-only VMA is only allowed if the
-		 * memory region is configured as read-only.
-		 */
-		if (writable && !(vma->vm_flags & VM_WRITE)) {
-			ret = -EPERM;
-			break;
-		}
-
-		/*
-		 * Take the intersection of this VMA with the memory region
-		 */
-		vm_start = max(hva, vma->vm_start);
-		vm_end = min(reg_end, vma->vm_end);
-
-		if (vma->vm_flags & VM_PFNMAP) {
-			gpa_t gpa = mem->guest_phys_addr +
-				    (vm_start - mem->userspace_addr);
-			phys_addr_t pa;
-
-			pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
-			pa += vm_start - vma->vm_start;
-
-			/* IO region dirty page logging not allowed */
-			if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
-				ret = -EINVAL;
-				goto out;
-			}
-
-			ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
-						    vm_end - vm_start,
-						    writable);
-			if (ret)
-				break;
-		}
-		hva = vm_end;
-	} while (hva < reg_end);
-
-	if (change == KVM_MR_FLAGS_ONLY)
-		goto out;
-
-	spin_lock(&kvm->mmu_lock);
-	if (ret)
-		unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
-	else
-		stage2_flush_memslot(kvm, memslot);
-	spin_unlock(&kvm->mmu_lock);
-out:
-	up_read(&current->mm->mmap_sem);
-	return ret;
-}
-
-void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
-			   struct kvm_memory_slot *dont)
-{
-}
-
-int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
-			    unsigned long npages)
-{
-	return 0;
-}
-
-void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots)
-{
-}
-
-void kvm_arch_flush_shadow_all(struct kvm *kvm)
-{
-	kvm_free_stage2_pgd(kvm);
-}
-
-void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
-				   struct kvm_memory_slot *slot)
-{
-	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
-	phys_addr_t size = slot->npages << PAGE_SHIFT;
-
-	spin_lock(&kvm->mmu_lock);
-	unmap_stage2_range(kvm, gpa, size);
-	spin_unlock(&kvm->mmu_lock);
-}
-
-/*
- * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
- *
- * Main problems:
- * - S/W ops are local to a CPU (not broadcast)
- * - We have line migration behind our back (speculation)
- * - System caches don't support S/W at all (damn!)
- *
- * In the face of the above, the best we can do is to try and convert
- * S/W ops to VA ops. Because the guest is not allowed to infer the
- * S/W to PA mapping, it can only use S/W to nuke the whole cache,
- * which is a rather good thing for us.
- *
- * Also, it is only used when turning caches on/off ("The expected
- * usage of the cache maintenance instructions that operate by set/way
- * is associated with the cache maintenance instructions associated
- * with the powerdown and powerup of caches, if this is required by
- * the implementation.").
- *
- * We use the following policy:
- *
- * - If we trap a S/W operation, we enable VM trapping to detect
- *   caches being turned on/off, and do a full clean.
- *
- * - We flush the caches on both caches being turned on and off.
- *
- * - Once the caches are enabled, we stop trapping VM ops.
- */
-void kvm_set_way_flush(struct kvm_vcpu *vcpu)
-{
-	unsigned long hcr = *vcpu_hcr(vcpu);
-
-	/*
-	 * If this is the first time we do a S/W operation
-	 * (i.e. HCR_TVM not set) flush the whole memory, and set the
-	 * VM trapping.
-	 *
-	 * Otherwise, rely on the VM trapping to wait for the MMU +
-	 * Caches to be turned off. At that point, we'll be able to
-	 * clean the caches again.
-	 */
-	if (!(hcr & HCR_TVM)) {
-		trace_kvm_set_way_flush(*vcpu_pc(vcpu),
-					vcpu_has_cache_enabled(vcpu));
-		stage2_flush_vm(vcpu->kvm);
-		*vcpu_hcr(vcpu) = hcr | HCR_TVM;
-	}
-}
-
-void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
-{
-	bool now_enabled = vcpu_has_cache_enabled(vcpu);
-
-	/*
-	 * If switching the MMU+caches on, need to invalidate the caches.
-	 * If switching it off, need to clean the caches.
-	 * Clean + invalidate does the trick always.
-	 */
-	if (now_enabled != was_enabled)
-		stage2_flush_vm(vcpu->kvm);
-
-	/* Caches are now on, stop trapping VM ops (until a S/W op) */
-	if (now_enabled)
-		*vcpu_hcr(vcpu) &= ~HCR_TVM;
-
-	trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
-}
diff --git a/virt/kvm/arm/perf.c b/virt/kvm/arm/perf.c
deleted file mode 100644
index 1a3849da0b4b..000000000000
--- a/virt/kvm/arm/perf.c
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Based on the x86 implementation.
- *
- * Copyright (C) 2012 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/perf_event.h>
-#include <linux/kvm_host.h>
-
-#include <asm/kvm_emulate.h>
-
-static int kvm_is_in_guest(void)
-{
-        return kvm_arm_get_running_vcpu() != NULL;
-}
-
-static int kvm_is_user_mode(void)
-{
-	struct kvm_vcpu *vcpu;
-
-	vcpu = kvm_arm_get_running_vcpu();
-
-	if (vcpu)
-		return !vcpu_mode_priv(vcpu);
-
-	return 0;
-}
-
-static unsigned long kvm_get_guest_ip(void)
-{
-	struct kvm_vcpu *vcpu;
-
-	vcpu = kvm_arm_get_running_vcpu();
-
-	if (vcpu)
-		return *vcpu_pc(vcpu);
-
-	return 0;
-}
-
-static struct perf_guest_info_callbacks kvm_guest_cbs = {
-	.is_in_guest	= kvm_is_in_guest,
-	.is_user_mode	= kvm_is_user_mode,
-	.get_guest_ip	= kvm_get_guest_ip,
-};
-
-int kvm_perf_init(void)
-{
-	return perf_register_guest_info_callbacks(&kvm_guest_cbs);
-}
-
-int kvm_perf_teardown(void)
-{
-	return perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
-}
diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
deleted file mode 100644
index 1c5b76c46e26..000000000000
--- a/virt/kvm/arm/pmu.c
+++ /dev/null
@@ -1,606 +0,0 @@
-/*
- * Copyright (C) 2015 Linaro Ltd.
- * Author: Shannon Zhao <shannon.zhao@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/cpu.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/perf_event.h>
-#include <linux/uaccess.h>
-#include <asm/kvm_emulate.h>
-#include <kvm/arm_pmu.h>
-#include <kvm/arm_vgic.h>
-
-/**
- * kvm_pmu_get_counter_value - get PMU counter value
- * @vcpu: The vcpu pointer
- * @select_idx: The counter index
- */
-u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx)
-{
-	u64 counter, reg, enabled, running;
-	struct kvm_pmu *pmu = &vcpu->arch.pmu;
-	struct kvm_pmc *pmc = &pmu->pmc[select_idx];
-
-	reg = (select_idx == ARMV8_PMU_CYCLE_IDX)
-	      ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + select_idx;
-	counter = __vcpu_sys_reg(vcpu, reg);
-
-	/* The real counter value is equal to the value of counter register plus
-	 * the value perf event counts.
-	 */
-	if (pmc->perf_event)
-		counter += perf_event_read_value(pmc->perf_event, &enabled,
-						 &running);
-
-	return counter & pmc->bitmask;
-}
-
-/**
- * kvm_pmu_set_counter_value - set PMU counter value
- * @vcpu: The vcpu pointer
- * @select_idx: The counter index
- * @val: The counter value
- */
-void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val)
-{
-	u64 reg;
-
-	reg = (select_idx == ARMV8_PMU_CYCLE_IDX)
-	      ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + select_idx;
-	__vcpu_sys_reg(vcpu, reg) += (s64)val - kvm_pmu_get_counter_value(vcpu, select_idx);
-}
-
-/**
- * kvm_pmu_stop_counter - stop PMU counter
- * @pmc: The PMU counter pointer
- *
- * If this counter has been configured to monitor some event, release it here.
- */
-static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc)
-{
-	u64 counter, reg;
-
-	if (pmc->perf_event) {
-		counter = kvm_pmu_get_counter_value(vcpu, pmc->idx);
-		reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX)
-		       ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + pmc->idx;
-		__vcpu_sys_reg(vcpu, reg) = counter;
-		perf_event_disable(pmc->perf_event);
-		perf_event_release_kernel(pmc->perf_event);
-		pmc->perf_event = NULL;
-	}
-}
-
-/**
- * kvm_pmu_vcpu_reset - reset pmu state for cpu
- * @vcpu: The vcpu pointer
- *
- */
-void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu)
-{
-	int i;
-	struct kvm_pmu *pmu = &vcpu->arch.pmu;
-
-	for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) {
-		kvm_pmu_stop_counter(vcpu, &pmu->pmc[i]);
-		pmu->pmc[i].idx = i;
-		pmu->pmc[i].bitmask = 0xffffffffUL;
-	}
-}
-
-/**
- * kvm_pmu_vcpu_destroy - free perf event of PMU for cpu
- * @vcpu: The vcpu pointer
- *
- */
-void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu)
-{
-	int i;
-	struct kvm_pmu *pmu = &vcpu->arch.pmu;
-
-	for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) {
-		struct kvm_pmc *pmc = &pmu->pmc[i];
-
-		if (pmc->perf_event) {
-			perf_event_disable(pmc->perf_event);
-			perf_event_release_kernel(pmc->perf_event);
-			pmc->perf_event = NULL;
-		}
-	}
-}
-
-u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu)
-{
-	u64 val = __vcpu_sys_reg(vcpu, PMCR_EL0) >> ARMV8_PMU_PMCR_N_SHIFT;
-
-	val &= ARMV8_PMU_PMCR_N_MASK;
-	if (val == 0)
-		return BIT(ARMV8_PMU_CYCLE_IDX);
-	else
-		return GENMASK(val - 1, 0) | BIT(ARMV8_PMU_CYCLE_IDX);
-}
-
-/**
- * kvm_pmu_enable_counter - enable selected PMU counter
- * @vcpu: The vcpu pointer
- * @val: the value guest writes to PMCNTENSET register
- *
- * Call perf_event_enable to start counting the perf event
- */
-void kvm_pmu_enable_counter(struct kvm_vcpu *vcpu, u64 val)
-{
-	int i;
-	struct kvm_pmu *pmu = &vcpu->arch.pmu;
-	struct kvm_pmc *pmc;
-
-	if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) || !val)
-		return;
-
-	for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) {
-		if (!(val & BIT(i)))
-			continue;
-
-		pmc = &pmu->pmc[i];
-		if (pmc->perf_event) {
-			perf_event_enable(pmc->perf_event);
-			if (pmc->perf_event->state != PERF_EVENT_STATE_ACTIVE)
-				kvm_debug("fail to enable perf event\n");
-		}
-	}
-}
-
-/**
- * kvm_pmu_disable_counter - disable selected PMU counter
- * @vcpu: The vcpu pointer
- * @val: the value guest writes to PMCNTENCLR register
- *
- * Call perf_event_disable to stop counting the perf event
- */
-void kvm_pmu_disable_counter(struct kvm_vcpu *vcpu, u64 val)
-{
-	int i;
-	struct kvm_pmu *pmu = &vcpu->arch.pmu;
-	struct kvm_pmc *pmc;
-
-	if (!val)
-		return;
-
-	for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) {
-		if (!(val & BIT(i)))
-			continue;
-
-		pmc = &pmu->pmc[i];
-		if (pmc->perf_event)
-			perf_event_disable(pmc->perf_event);
-	}
-}
-
-static u64 kvm_pmu_overflow_status(struct kvm_vcpu *vcpu)
-{
-	u64 reg = 0;
-
-	if ((__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E)) {
-		reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0);
-		reg &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
-		reg &= __vcpu_sys_reg(vcpu, PMINTENSET_EL1);
-		reg &= kvm_pmu_valid_counter_mask(vcpu);
-	}
-
-	return reg;
-}
-
-static void kvm_pmu_update_state(struct kvm_vcpu *vcpu)
-{
-	struct kvm_pmu *pmu = &vcpu->arch.pmu;
-	bool overflow;
-
-	if (!kvm_arm_pmu_v3_ready(vcpu))
-		return;
-
-	overflow = !!kvm_pmu_overflow_status(vcpu);
-	if (pmu->irq_level == overflow)
-		return;
-
-	pmu->irq_level = overflow;
-
-	if (likely(irqchip_in_kernel(vcpu->kvm))) {
-		int ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
-					      pmu->irq_num, overflow, pmu);
-		WARN_ON(ret);
-	}
-}
-
-bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu)
-{
-	struct kvm_pmu *pmu = &vcpu->arch.pmu;
-	struct kvm_sync_regs *sregs = &vcpu->run->s.regs;
-	bool run_level = sregs->device_irq_level & KVM_ARM_DEV_PMU;
-
-	if (likely(irqchip_in_kernel(vcpu->kvm)))
-		return false;
-
-	return pmu->irq_level != run_level;
-}
-
-/*
- * Reflect the PMU overflow interrupt output level into the kvm_run structure
- */
-void kvm_pmu_update_run(struct kvm_vcpu *vcpu)
-{
-	struct kvm_sync_regs *regs = &vcpu->run->s.regs;
-
-	/* Populate the timer bitmap for user space */
-	regs->device_irq_level &= ~KVM_ARM_DEV_PMU;
-	if (vcpu->arch.pmu.irq_level)
-		regs->device_irq_level |= KVM_ARM_DEV_PMU;
-}
-
-/**
- * kvm_pmu_flush_hwstate - flush pmu state to cpu
- * @vcpu: The vcpu pointer
- *
- * Check if the PMU has overflowed while we were running in the host, and inject
- * an interrupt if that was the case.
- */
-void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu)
-{
-	kvm_pmu_update_state(vcpu);
-}
-
-/**
- * kvm_pmu_sync_hwstate - sync pmu state from cpu
- * @vcpu: The vcpu pointer
- *
- * Check if the PMU has overflowed while we were running in the guest, and
- * inject an interrupt if that was the case.
- */
-void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu)
-{
-	kvm_pmu_update_state(vcpu);
-}
-
-static inline struct kvm_vcpu *kvm_pmc_to_vcpu(struct kvm_pmc *pmc)
-{
-	struct kvm_pmu *pmu;
-	struct kvm_vcpu_arch *vcpu_arch;
-
-	pmc -= pmc->idx;
-	pmu = container_of(pmc, struct kvm_pmu, pmc[0]);
-	vcpu_arch = container_of(pmu, struct kvm_vcpu_arch, pmu);
-	return container_of(vcpu_arch, struct kvm_vcpu, arch);
-}
-
-/**
- * When the perf event overflows, set the overflow status and inform the vcpu.
- */
-static void kvm_pmu_perf_overflow(struct perf_event *perf_event,
-				  struct perf_sample_data *data,
-				  struct pt_regs *regs)
-{
-	struct kvm_pmc *pmc = perf_event->overflow_handler_context;
-	struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
-	int idx = pmc->idx;
-
-	__vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(idx);
-
-	if (kvm_pmu_overflow_status(vcpu)) {
-		kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
-		kvm_vcpu_kick(vcpu);
-	}
-}
-
-/**
- * kvm_pmu_software_increment - do software increment
- * @vcpu: The vcpu pointer
- * @val: the value guest writes to PMSWINC register
- */
-void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val)
-{
-	int i;
-	u64 type, enable, reg;
-
-	if (val == 0)
-		return;
-
-	enable = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
-	for (i = 0; i < ARMV8_PMU_CYCLE_IDX; i++) {
-		if (!(val & BIT(i)))
-			continue;
-		type = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i)
-		       & ARMV8_PMU_EVTYPE_EVENT;
-		if ((type == ARMV8_PMUV3_PERFCTR_SW_INCR)
-		    && (enable & BIT(i))) {
-			reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) + 1;
-			reg = lower_32_bits(reg);
-			__vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) = reg;
-			if (!reg)
-				__vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i);
-		}
-	}
-}
-
-/**
- * kvm_pmu_handle_pmcr - handle PMCR register
- * @vcpu: The vcpu pointer
- * @val: the value guest writes to PMCR register
- */
-void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val)
-{
-	struct kvm_pmu *pmu = &vcpu->arch.pmu;
-	struct kvm_pmc *pmc;
-	u64 mask;
-	int i;
-
-	mask = kvm_pmu_valid_counter_mask(vcpu);
-	if (val & ARMV8_PMU_PMCR_E) {
-		kvm_pmu_enable_counter(vcpu,
-		       __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask);
-	} else {
-		kvm_pmu_disable_counter(vcpu, mask);
-	}
-
-	if (val & ARMV8_PMU_PMCR_C)
-		kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0);
-
-	if (val & ARMV8_PMU_PMCR_P) {
-		for (i = 0; i < ARMV8_PMU_CYCLE_IDX; i++)
-			kvm_pmu_set_counter_value(vcpu, i, 0);
-	}
-
-	if (val & ARMV8_PMU_PMCR_LC) {
-		pmc = &pmu->pmc[ARMV8_PMU_CYCLE_IDX];
-		pmc->bitmask = 0xffffffffffffffffUL;
-	}
-}
-
-static bool kvm_pmu_counter_is_enabled(struct kvm_vcpu *vcpu, u64 select_idx)
-{
-	return (__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) &&
-	       (__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(select_idx));
-}
-
-/**
- * kvm_pmu_set_counter_event_type - set selected counter to monitor some event
- * @vcpu: The vcpu pointer
- * @data: The data guest writes to PMXEVTYPER_EL0
- * @select_idx: The number of selected counter
- *
- * When OS accesses PMXEVTYPER_EL0, that means it wants to set a PMC to count an
- * event with given hardware event number. Here we call perf_event API to
- * emulate this action and create a kernel perf event for it.
- */
-void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
-				    u64 select_idx)
-{
-	struct kvm_pmu *pmu = &vcpu->arch.pmu;
-	struct kvm_pmc *pmc = &pmu->pmc[select_idx];
-	struct perf_event *event;
-	struct perf_event_attr attr;
-	u64 eventsel, counter;
-
-	kvm_pmu_stop_counter(vcpu, pmc);
-	eventsel = data & ARMV8_PMU_EVTYPE_EVENT;
-
-	/* Software increment event does't need to be backed by a perf event */
-	if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR &&
-	    select_idx != ARMV8_PMU_CYCLE_IDX)
-		return;
-
-	memset(&attr, 0, sizeof(struct perf_event_attr));
-	attr.type = PERF_TYPE_RAW;
-	attr.size = sizeof(attr);
-	attr.pinned = 1;
-	attr.disabled = !kvm_pmu_counter_is_enabled(vcpu, select_idx);
-	attr.exclude_user = data & ARMV8_PMU_EXCLUDE_EL0 ? 1 : 0;
-	attr.exclude_kernel = data & ARMV8_PMU_EXCLUDE_EL1 ? 1 : 0;
-	attr.exclude_hv = 1; /* Don't count EL2 events */
-	attr.exclude_host = 1; /* Don't count host events */
-	attr.config = (select_idx == ARMV8_PMU_CYCLE_IDX) ?
-		ARMV8_PMUV3_PERFCTR_CPU_CYCLES : eventsel;
-
-	counter = kvm_pmu_get_counter_value(vcpu, select_idx);
-	/* The initial sample period (overflow count) of an event. */
-	attr.sample_period = (-counter) & pmc->bitmask;
-
-	event = perf_event_create_kernel_counter(&attr, -1, current,
-						 kvm_pmu_perf_overflow, pmc);
-	if (IS_ERR(event)) {
-		pr_err_once("kvm: pmu event creation failed %ld\n",
-			    PTR_ERR(event));
-		return;
-	}
-
-	pmc->perf_event = event;
-}
-
-bool kvm_arm_support_pmu_v3(void)
-{
-	/*
-	 * Check if HW_PERF_EVENTS are supported by checking the number of
-	 * hardware performance counters. This could ensure the presence of
-	 * a physical PMU and CONFIG_PERF_EVENT is selected.
-	 */
-	return (perf_num_counters() > 0);
-}
-
-int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
-{
-	if (!vcpu->arch.pmu.created)
-		return 0;
-
-	/*
-	 * A valid interrupt configuration for the PMU is either to have a
-	 * properly configured interrupt number and using an in-kernel
-	 * irqchip, or to not have an in-kernel GIC and not set an IRQ.
-	 */
-	if (irqchip_in_kernel(vcpu->kvm)) {
-		int irq = vcpu->arch.pmu.irq_num;
-		if (!kvm_arm_pmu_irq_initialized(vcpu))
-			return -EINVAL;
-
-		/*
-		 * If we are using an in-kernel vgic, at this point we know
-		 * the vgic will be initialized, so we can check the PMU irq
-		 * number against the dimensions of the vgic and make sure
-		 * it's valid.
-		 */
-		if (!irq_is_ppi(irq) && !vgic_valid_spi(vcpu->kvm, irq))
-			return -EINVAL;
-	} else if (kvm_arm_pmu_irq_initialized(vcpu)) {
-		   return -EINVAL;
-	}
-
-	kvm_pmu_vcpu_reset(vcpu);
-	vcpu->arch.pmu.ready = true;
-
-	return 0;
-}
-
-static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu)
-{
-	if (!kvm_arm_support_pmu_v3())
-		return -ENODEV;
-
-	if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
-		return -ENXIO;
-
-	if (vcpu->arch.pmu.created)
-		return -EBUSY;
-
-	if (irqchip_in_kernel(vcpu->kvm)) {
-		int ret;
-
-		/*
-		 * If using the PMU with an in-kernel virtual GIC
-		 * implementation, we require the GIC to be already
-		 * initialized when initializing the PMU.
-		 */
-		if (!vgic_initialized(vcpu->kvm))
-			return -ENODEV;
-
-		if (!kvm_arm_pmu_irq_initialized(vcpu))
-			return -ENXIO;
-
-		ret = kvm_vgic_set_owner(vcpu, vcpu->arch.pmu.irq_num,
-					 &vcpu->arch.pmu);
-		if (ret)
-			return ret;
-	}
-
-	vcpu->arch.pmu.created = true;
-	return 0;
-}
-
-/*
- * For one VM the interrupt type must be same for each vcpu.
- * As a PPI, the interrupt number is the same for all vcpus,
- * while as an SPI it must be a separate number per vcpu.
- */
-static bool pmu_irq_is_valid(struct kvm *kvm, int irq)
-{
-	int i;
-	struct kvm_vcpu *vcpu;
-
-	kvm_for_each_vcpu(i, vcpu, kvm) {
-		if (!kvm_arm_pmu_irq_initialized(vcpu))
-			continue;
-
-		if (irq_is_ppi(irq)) {
-			if (vcpu->arch.pmu.irq_num != irq)
-				return false;
-		} else {
-			if (vcpu->arch.pmu.irq_num == irq)
-				return false;
-		}
-	}
-
-	return true;
-}
-
-int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
-{
-	switch (attr->attr) {
-	case KVM_ARM_VCPU_PMU_V3_IRQ: {
-		int __user *uaddr = (int __user *)(long)attr->addr;
-		int irq;
-
-		if (!irqchip_in_kernel(vcpu->kvm))
-			return -EINVAL;
-
-		if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
-			return -ENODEV;
-
-		if (get_user(irq, uaddr))
-			return -EFAULT;
-
-		/* The PMU overflow interrupt can be a PPI or a valid SPI. */
-		if (!(irq_is_ppi(irq) || irq_is_spi(irq)))
-			return -EINVAL;
-
-		if (!pmu_irq_is_valid(vcpu->kvm, irq))
-			return -EINVAL;
-
-		if (kvm_arm_pmu_irq_initialized(vcpu))
-			return -EBUSY;
-
-		kvm_debug("Set kvm ARM PMU irq: %d\n", irq);
-		vcpu->arch.pmu.irq_num = irq;
-		return 0;
-	}
-	case KVM_ARM_VCPU_PMU_V3_INIT:
-		return kvm_arm_pmu_v3_init(vcpu);
-	}
-
-	return -ENXIO;
-}
-
-int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
-{
-	switch (attr->attr) {
-	case KVM_ARM_VCPU_PMU_V3_IRQ: {
-		int __user *uaddr = (int __user *)(long)attr->addr;
-		int irq;
-
-		if (!irqchip_in_kernel(vcpu->kvm))
-			return -EINVAL;
-
-		if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
-			return -ENODEV;
-
-		if (!kvm_arm_pmu_irq_initialized(vcpu))
-			return -ENXIO;
-
-		irq = vcpu->arch.pmu.irq_num;
-		return put_user(irq, uaddr);
-	}
-	}
-
-	return -ENXIO;
-}
-
-int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
-{
-	switch (attr->attr) {
-	case KVM_ARM_VCPU_PMU_V3_IRQ:
-	case KVM_ARM_VCPU_PMU_V3_INIT:
-		if (kvm_arm_support_pmu_v3() &&
-		    test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
-			return 0;
-	}
-
-	return -ENXIO;
-}
diff --git a/virt/kvm/arm/psci.c b/virt/kvm/arm/psci.c
deleted file mode 100644
index 9b73d3ad918a..000000000000
--- a/virt/kvm/arm/psci.c
+++ /dev/null
@@ -1,503 +0,0 @@
-/*
- * Copyright (C) 2012 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/arm-smccc.h>
-#include <linux/preempt.h>
-#include <linux/kvm_host.h>
-#include <linux/uaccess.h>
-#include <linux/wait.h>
-
-#include <asm/cputype.h>
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_host.h>
-
-#include <kvm/arm_psci.h>
-
-/*
- * This is an implementation of the Power State Coordination Interface
- * as described in ARM document number ARM DEN 0022A.
- */
-
-#define AFFINITY_MASK(level)	~((0x1UL << ((level) * MPIDR_LEVEL_BITS)) - 1)
-
-static u32 smccc_get_function(struct kvm_vcpu *vcpu)
-{
-	return vcpu_get_reg(vcpu, 0);
-}
-
-static unsigned long smccc_get_arg1(struct kvm_vcpu *vcpu)
-{
-	return vcpu_get_reg(vcpu, 1);
-}
-
-static unsigned long smccc_get_arg2(struct kvm_vcpu *vcpu)
-{
-	return vcpu_get_reg(vcpu, 2);
-}
-
-static unsigned long smccc_get_arg3(struct kvm_vcpu *vcpu)
-{
-	return vcpu_get_reg(vcpu, 3);
-}
-
-static void smccc_set_retval(struct kvm_vcpu *vcpu,
-			     unsigned long a0,
-			     unsigned long a1,
-			     unsigned long a2,
-			     unsigned long a3)
-{
-	vcpu_set_reg(vcpu, 0, a0);
-	vcpu_set_reg(vcpu, 1, a1);
-	vcpu_set_reg(vcpu, 2, a2);
-	vcpu_set_reg(vcpu, 3, a3);
-}
-
-static unsigned long psci_affinity_mask(unsigned long affinity_level)
-{
-	if (affinity_level <= 3)
-		return MPIDR_HWID_BITMASK & AFFINITY_MASK(affinity_level);
-
-	return 0;
-}
-
-static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu)
-{
-	/*
-	 * NOTE: For simplicity, we make VCPU suspend emulation to be
-	 * same-as WFI (Wait-for-interrupt) emulation.
-	 *
-	 * This means for KVM the wakeup events are interrupts and
-	 * this is consistent with intended use of StateID as described
-	 * in section 5.4.1 of PSCI v0.2 specification (ARM DEN 0022A).
-	 *
-	 * Further, we also treat power-down request to be same as
-	 * stand-by request as-per section 5.4.2 clause 3 of PSCI v0.2
-	 * specification (ARM DEN 0022A). This means all suspend states
-	 * for KVM will preserve the register state.
-	 */
-	kvm_vcpu_block(vcpu);
-	kvm_clear_request(KVM_REQ_UNHALT, vcpu);
-
-	return PSCI_RET_SUCCESS;
-}
-
-static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu)
-{
-	vcpu->arch.power_off = true;
-	kvm_make_request(KVM_REQ_SLEEP, vcpu);
-	kvm_vcpu_kick(vcpu);
-}
-
-static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
-{
-	struct kvm *kvm = source_vcpu->kvm;
-	struct kvm_vcpu *vcpu = NULL;
-	struct swait_queue_head *wq;
-	unsigned long cpu_id;
-	unsigned long context_id;
-	phys_addr_t target_pc;
-
-	cpu_id = smccc_get_arg1(source_vcpu) & MPIDR_HWID_BITMASK;
-	if (vcpu_mode_is_32bit(source_vcpu))
-		cpu_id &= ~((u32) 0);
-
-	vcpu = kvm_mpidr_to_vcpu(kvm, cpu_id);
-
-	/*
-	 * Make sure the caller requested a valid CPU and that the CPU is
-	 * turned off.
-	 */
-	if (!vcpu)
-		return PSCI_RET_INVALID_PARAMS;
-	if (!vcpu->arch.power_off) {
-		if (kvm_psci_version(source_vcpu, kvm) != KVM_ARM_PSCI_0_1)
-			return PSCI_RET_ALREADY_ON;
-		else
-			return PSCI_RET_INVALID_PARAMS;
-	}
-
-	target_pc = smccc_get_arg2(source_vcpu);
-	context_id = smccc_get_arg3(source_vcpu);
-
-	kvm_reset_vcpu(vcpu);
-
-	/* Gracefully handle Thumb2 entry point */
-	if (vcpu_mode_is_32bit(vcpu) && (target_pc & 1)) {
-		target_pc &= ~((phys_addr_t) 1);
-		vcpu_set_thumb(vcpu);
-	}
-
-	/* Propagate caller endianness */
-	if (kvm_vcpu_is_be(source_vcpu))
-		kvm_vcpu_set_be(vcpu);
-
-	*vcpu_pc(vcpu) = target_pc;
-	/*
-	 * NOTE: We always update r0 (or x0) because for PSCI v0.1
-	 * the general puspose registers are undefined upon CPU_ON.
-	 */
-	smccc_set_retval(vcpu, context_id, 0, 0, 0);
-	vcpu->arch.power_off = false;
-	smp_mb();		/* Make sure the above is visible */
-
-	wq = kvm_arch_vcpu_wq(vcpu);
-	swake_up_one(wq);
-
-	return PSCI_RET_SUCCESS;
-}
-
-static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu)
-{
-	int i, matching_cpus = 0;
-	unsigned long mpidr;
-	unsigned long target_affinity;
-	unsigned long target_affinity_mask;
-	unsigned long lowest_affinity_level;
-	struct kvm *kvm = vcpu->kvm;
-	struct kvm_vcpu *tmp;
-
-	target_affinity = smccc_get_arg1(vcpu);
-	lowest_affinity_level = smccc_get_arg2(vcpu);
-
-	/* Determine target affinity mask */
-	target_affinity_mask = psci_affinity_mask(lowest_affinity_level);
-	if (!target_affinity_mask)
-		return PSCI_RET_INVALID_PARAMS;
-
-	/* Ignore other bits of target affinity */
-	target_affinity &= target_affinity_mask;
-
-	/*
-	 * If one or more VCPU matching target affinity are running
-	 * then ON else OFF
-	 */
-	kvm_for_each_vcpu(i, tmp, kvm) {
-		mpidr = kvm_vcpu_get_mpidr_aff(tmp);
-		if ((mpidr & target_affinity_mask) == target_affinity) {
-			matching_cpus++;
-			if (!tmp->arch.power_off)
-				return PSCI_0_2_AFFINITY_LEVEL_ON;
-		}
-	}
-
-	if (!matching_cpus)
-		return PSCI_RET_INVALID_PARAMS;
-
-	return PSCI_0_2_AFFINITY_LEVEL_OFF;
-}
-
-static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type)
-{
-	int i;
-	struct kvm_vcpu *tmp;
-
-	/*
-	 * The KVM ABI specifies that a system event exit may call KVM_RUN
-	 * again and may perform shutdown/reboot at a later time that when the
-	 * actual request is made.  Since we are implementing PSCI and a
-	 * caller of PSCI reboot and shutdown expects that the system shuts
-	 * down or reboots immediately, let's make sure that VCPUs are not run
-	 * after this call is handled and before the VCPUs have been
-	 * re-initialized.
-	 */
-	kvm_for_each_vcpu(i, tmp, vcpu->kvm)
-		tmp->arch.power_off = true;
-	kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_SLEEP);
-
-	memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
-	vcpu->run->system_event.type = type;
-	vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
-}
-
-static void kvm_psci_system_off(struct kvm_vcpu *vcpu)
-{
-	kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_SHUTDOWN);
-}
-
-static void kvm_psci_system_reset(struct kvm_vcpu *vcpu)
-{
-	kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET);
-}
-
-static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
-{
-	struct kvm *kvm = vcpu->kvm;
-	u32 psci_fn = smccc_get_function(vcpu);
-	unsigned long val;
-	int ret = 1;
-
-	switch (psci_fn) {
-	case PSCI_0_2_FN_PSCI_VERSION:
-		/*
-		 * Bits[31:16] = Major Version = 0
-		 * Bits[15:0] = Minor Version = 2
-		 */
-		val = KVM_ARM_PSCI_0_2;
-		break;
-	case PSCI_0_2_FN_CPU_SUSPEND:
-	case PSCI_0_2_FN64_CPU_SUSPEND:
-		val = kvm_psci_vcpu_suspend(vcpu);
-		break;
-	case PSCI_0_2_FN_CPU_OFF:
-		kvm_psci_vcpu_off(vcpu);
-		val = PSCI_RET_SUCCESS;
-		break;
-	case PSCI_0_2_FN_CPU_ON:
-	case PSCI_0_2_FN64_CPU_ON:
-		mutex_lock(&kvm->lock);
-		val = kvm_psci_vcpu_on(vcpu);
-		mutex_unlock(&kvm->lock);
-		break;
-	case PSCI_0_2_FN_AFFINITY_INFO:
-	case PSCI_0_2_FN64_AFFINITY_INFO:
-		val = kvm_psci_vcpu_affinity_info(vcpu);
-		break;
-	case PSCI_0_2_FN_MIGRATE_INFO_TYPE:
-		/*
-		 * Trusted OS is MP hence does not require migration
-	         * or
-		 * Trusted OS is not present
-		 */
-		val = PSCI_0_2_TOS_MP;
-		break;
-	case PSCI_0_2_FN_SYSTEM_OFF:
-		kvm_psci_system_off(vcpu);
-		/*
-		 * We should'nt be going back to guest VCPU after
-		 * receiving SYSTEM_OFF request.
-		 *
-		 * If user space accidently/deliberately resumes
-		 * guest VCPU after SYSTEM_OFF request then guest
-		 * VCPU should see internal failure from PSCI return
-		 * value. To achieve this, we preload r0 (or x0) with
-		 * PSCI return value INTERNAL_FAILURE.
-		 */
-		val = PSCI_RET_INTERNAL_FAILURE;
-		ret = 0;
-		break;
-	case PSCI_0_2_FN_SYSTEM_RESET:
-		kvm_psci_system_reset(vcpu);
-		/*
-		 * Same reason as SYSTEM_OFF for preloading r0 (or x0)
-		 * with PSCI return value INTERNAL_FAILURE.
-		 */
-		val = PSCI_RET_INTERNAL_FAILURE;
-		ret = 0;
-		break;
-	default:
-		val = PSCI_RET_NOT_SUPPORTED;
-		break;
-	}
-
-	smccc_set_retval(vcpu, val, 0, 0, 0);
-	return ret;
-}
-
-static int kvm_psci_1_0_call(struct kvm_vcpu *vcpu)
-{
-	u32 psci_fn = smccc_get_function(vcpu);
-	u32 feature;
-	unsigned long val;
-	int ret = 1;
-
-	switch(psci_fn) {
-	case PSCI_0_2_FN_PSCI_VERSION:
-		val = KVM_ARM_PSCI_1_0;
-		break;
-	case PSCI_1_0_FN_PSCI_FEATURES:
-		feature = smccc_get_arg1(vcpu);
-		switch(feature) {
-		case PSCI_0_2_FN_PSCI_VERSION:
-		case PSCI_0_2_FN_CPU_SUSPEND:
-		case PSCI_0_2_FN64_CPU_SUSPEND:
-		case PSCI_0_2_FN_CPU_OFF:
-		case PSCI_0_2_FN_CPU_ON:
-		case PSCI_0_2_FN64_CPU_ON:
-		case PSCI_0_2_FN_AFFINITY_INFO:
-		case PSCI_0_2_FN64_AFFINITY_INFO:
-		case PSCI_0_2_FN_MIGRATE_INFO_TYPE:
-		case PSCI_0_2_FN_SYSTEM_OFF:
-		case PSCI_0_2_FN_SYSTEM_RESET:
-		case PSCI_1_0_FN_PSCI_FEATURES:
-		case ARM_SMCCC_VERSION_FUNC_ID:
-			val = 0;
-			break;
-		default:
-			val = PSCI_RET_NOT_SUPPORTED;
-			break;
-		}
-		break;
-	default:
-		return kvm_psci_0_2_call(vcpu);
-	}
-
-	smccc_set_retval(vcpu, val, 0, 0, 0);
-	return ret;
-}
-
-static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu)
-{
-	struct kvm *kvm = vcpu->kvm;
-	u32 psci_fn = smccc_get_function(vcpu);
-	unsigned long val;
-
-	switch (psci_fn) {
-	case KVM_PSCI_FN_CPU_OFF:
-		kvm_psci_vcpu_off(vcpu);
-		val = PSCI_RET_SUCCESS;
-		break;
-	case KVM_PSCI_FN_CPU_ON:
-		mutex_lock(&kvm->lock);
-		val = kvm_psci_vcpu_on(vcpu);
-		mutex_unlock(&kvm->lock);
-		break;
-	default:
-		val = PSCI_RET_NOT_SUPPORTED;
-		break;
-	}
-
-	smccc_set_retval(vcpu, val, 0, 0, 0);
-	return 1;
-}
-
-/**
- * kvm_psci_call - handle PSCI call if r0 value is in range
- * @vcpu: Pointer to the VCPU struct
- *
- * Handle PSCI calls from guests through traps from HVC instructions.
- * The calling convention is similar to SMC calls to the secure world
- * where the function number is placed in r0.
- *
- * This function returns: > 0 (success), 0 (success but exit to user
- * space), and < 0 (errors)
- *
- * Errors:
- * -EINVAL: Unrecognized PSCI function
- */
-static int kvm_psci_call(struct kvm_vcpu *vcpu)
-{
-	switch (kvm_psci_version(vcpu, vcpu->kvm)) {
-	case KVM_ARM_PSCI_1_0:
-		return kvm_psci_1_0_call(vcpu);
-	case KVM_ARM_PSCI_0_2:
-		return kvm_psci_0_2_call(vcpu);
-	case KVM_ARM_PSCI_0_1:
-		return kvm_psci_0_1_call(vcpu);
-	default:
-		return -EINVAL;
-	};
-}
-
-int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
-{
-	u32 func_id = smccc_get_function(vcpu);
-	u32 val = SMCCC_RET_NOT_SUPPORTED;
-	u32 feature;
-
-	switch (func_id) {
-	case ARM_SMCCC_VERSION_FUNC_ID:
-		val = ARM_SMCCC_VERSION_1_1;
-		break;
-	case ARM_SMCCC_ARCH_FEATURES_FUNC_ID:
-		feature = smccc_get_arg1(vcpu);
-		switch(feature) {
-		case ARM_SMCCC_ARCH_WORKAROUND_1:
-			if (kvm_arm_harden_branch_predictor())
-				val = SMCCC_RET_SUCCESS;
-			break;
-		case ARM_SMCCC_ARCH_WORKAROUND_2:
-			switch (kvm_arm_have_ssbd()) {
-			case KVM_SSBD_FORCE_DISABLE:
-			case KVM_SSBD_UNKNOWN:
-				break;
-			case KVM_SSBD_KERNEL:
-				val = SMCCC_RET_SUCCESS;
-				break;
-			case KVM_SSBD_FORCE_ENABLE:
-			case KVM_SSBD_MITIGATED:
-				val = SMCCC_RET_NOT_REQUIRED;
-				break;
-			}
-			break;
-		}
-		break;
-	default:
-		return kvm_psci_call(vcpu);
-	}
-
-	smccc_set_retval(vcpu, val, 0, 0, 0);
-	return 1;
-}
-
-int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu)
-{
-	return 1;		/* PSCI version */
-}
-
-int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
-{
-	if (put_user(KVM_REG_ARM_PSCI_VERSION, uindices))
-		return -EFAULT;
-
-	return 0;
-}
-
-int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
-{
-	if (reg->id == KVM_REG_ARM_PSCI_VERSION) {
-		void __user *uaddr = (void __user *)(long)reg->addr;
-		u64 val;
-
-		val = kvm_psci_version(vcpu, vcpu->kvm);
-		if (copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id)))
-			return -EFAULT;
-
-		return 0;
-	}
-
-	return -EINVAL;
-}
-
-int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
-{
-	if (reg->id == KVM_REG_ARM_PSCI_VERSION) {
-		void __user *uaddr = (void __user *)(long)reg->addr;
-		bool wants_02;
-		u64 val;
-
-		if (copy_from_user(&val, uaddr, KVM_REG_SIZE(reg->id)))
-			return -EFAULT;
-
-		wants_02 = test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features);
-
-		switch (val) {
-		case KVM_ARM_PSCI_0_1:
-			if (wants_02)
-				return -EINVAL;
-			vcpu->kvm->arch.psci_version = val;
-			return 0;
-		case KVM_ARM_PSCI_0_2:
-		case KVM_ARM_PSCI_1_0:
-			if (!wants_02)
-				return -EINVAL;
-			vcpu->kvm->arch.psci_version = val;
-			return 0;
-		}
-	}
-
-	return -EINVAL;
-}
diff --git a/virt/kvm/arm/trace.h b/virt/kvm/arm/trace.h
deleted file mode 100644
index 3828beab93f2..000000000000
--- a/virt/kvm/arm/trace.h
+++ /dev/null
@@ -1,273 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_KVM_H
-
-#include <linux/tracepoint.h>
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM kvm
-
-/*
- * Tracepoints for entry/exit to guest
- */
-TRACE_EVENT(kvm_entry,
-	TP_PROTO(unsigned long vcpu_pc),
-	TP_ARGS(vcpu_pc),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	vcpu_pc		)
-	),
-
-	TP_fast_assign(
-		__entry->vcpu_pc		= vcpu_pc;
-	),
-
-	TP_printk("PC: 0x%08lx", __entry->vcpu_pc)
-);
-
-TRACE_EVENT(kvm_exit,
-	TP_PROTO(int ret, unsigned int esr_ec, unsigned long vcpu_pc),
-	TP_ARGS(ret, esr_ec, vcpu_pc),
-
-	TP_STRUCT__entry(
-		__field(	int,		ret		)
-		__field(	unsigned int,	esr_ec		)
-		__field(	unsigned long,	vcpu_pc		)
-	),
-
-	TP_fast_assign(
-		__entry->ret			= ARM_EXCEPTION_CODE(ret);
-		__entry->esr_ec = ARM_EXCEPTION_IS_TRAP(ret) ? esr_ec : 0;
-		__entry->vcpu_pc		= vcpu_pc;
-	),
-
-	TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%08lx",
-		  __print_symbolic(__entry->ret, kvm_arm_exception_type),
-		  __entry->esr_ec,
-		  __print_symbolic(__entry->esr_ec, kvm_arm_exception_class),
-		  __entry->vcpu_pc)
-);
-
-TRACE_EVENT(kvm_guest_fault,
-	TP_PROTO(unsigned long vcpu_pc, unsigned long hsr,
-		 unsigned long hxfar,
-		 unsigned long long ipa),
-	TP_ARGS(vcpu_pc, hsr, hxfar, ipa),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	vcpu_pc		)
-		__field(	unsigned long,	hsr		)
-		__field(	unsigned long,	hxfar		)
-		__field(   unsigned long long,	ipa		)
-	),
-
-	TP_fast_assign(
-		__entry->vcpu_pc		= vcpu_pc;
-		__entry->hsr			= hsr;
-		__entry->hxfar			= hxfar;
-		__entry->ipa			= ipa;
-	),
-
-	TP_printk("ipa %#llx, hsr %#08lx, hxfar %#08lx, pc %#08lx",
-		  __entry->ipa, __entry->hsr,
-		  __entry->hxfar, __entry->vcpu_pc)
-);
-
-TRACE_EVENT(kvm_access_fault,
-	TP_PROTO(unsigned long ipa),
-	TP_ARGS(ipa),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	ipa		)
-	),
-
-	TP_fast_assign(
-		__entry->ipa		= ipa;
-	),
-
-	TP_printk("IPA: %lx", __entry->ipa)
-);
-
-TRACE_EVENT(kvm_irq_line,
-	TP_PROTO(unsigned int type, int vcpu_idx, int irq_num, int level),
-	TP_ARGS(type, vcpu_idx, irq_num, level),
-
-	TP_STRUCT__entry(
-		__field(	unsigned int,	type		)
-		__field(	int,		vcpu_idx	)
-		__field(	int,		irq_num		)
-		__field(	int,		level		)
-	),
-
-	TP_fast_assign(
-		__entry->type		= type;
-		__entry->vcpu_idx	= vcpu_idx;
-		__entry->irq_num	= irq_num;
-		__entry->level		= level;
-	),
-
-	TP_printk("Inject %s interrupt (%d), vcpu->idx: %d, num: %d, level: %d",
-		  (__entry->type == KVM_ARM_IRQ_TYPE_CPU) ? "CPU" :
-		  (__entry->type == KVM_ARM_IRQ_TYPE_PPI) ? "VGIC PPI" :
-		  (__entry->type == KVM_ARM_IRQ_TYPE_SPI) ? "VGIC SPI" : "UNKNOWN",
-		  __entry->type, __entry->vcpu_idx, __entry->irq_num, __entry->level)
-);
-
-TRACE_EVENT(kvm_mmio_emulate,
-	TP_PROTO(unsigned long vcpu_pc, unsigned long instr,
-		 unsigned long cpsr),
-	TP_ARGS(vcpu_pc, instr, cpsr),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	vcpu_pc		)
-		__field(	unsigned long,	instr		)
-		__field(	unsigned long,	cpsr		)
-	),
-
-	TP_fast_assign(
-		__entry->vcpu_pc		= vcpu_pc;
-		__entry->instr			= instr;
-		__entry->cpsr			= cpsr;
-	),
-
-	TP_printk("Emulate MMIO at: 0x%08lx (instr: %08lx, cpsr: %08lx)",
-		  __entry->vcpu_pc, __entry->instr, __entry->cpsr)
-);
-
-TRACE_EVENT(kvm_unmap_hva_range,
-	TP_PROTO(unsigned long start, unsigned long end),
-	TP_ARGS(start, end),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	start		)
-		__field(	unsigned long,	end		)
-	),
-
-	TP_fast_assign(
-		__entry->start		= start;
-		__entry->end		= end;
-	),
-
-	TP_printk("mmu notifier unmap range: %#08lx -- %#08lx",
-		  __entry->start, __entry->end)
-);
-
-TRACE_EVENT(kvm_set_spte_hva,
-	TP_PROTO(unsigned long hva),
-	TP_ARGS(hva),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	hva		)
-	),
-
-	TP_fast_assign(
-		__entry->hva		= hva;
-	),
-
-	TP_printk("mmu notifier set pte hva: %#08lx", __entry->hva)
-);
-
-TRACE_EVENT(kvm_age_hva,
-	TP_PROTO(unsigned long start, unsigned long end),
-	TP_ARGS(start, end),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	start		)
-		__field(	unsigned long,	end		)
-	),
-
-	TP_fast_assign(
-		__entry->start		= start;
-		__entry->end		= end;
-	),
-
-	TP_printk("mmu notifier age hva: %#08lx -- %#08lx",
-		  __entry->start, __entry->end)
-);
-
-TRACE_EVENT(kvm_test_age_hva,
-	TP_PROTO(unsigned long hva),
-	TP_ARGS(hva),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	hva		)
-	),
-
-	TP_fast_assign(
-		__entry->hva		= hva;
-	),
-
-	TP_printk("mmu notifier test age hva: %#08lx", __entry->hva)
-);
-
-TRACE_EVENT(kvm_set_way_flush,
-	    TP_PROTO(unsigned long vcpu_pc, bool cache),
-	    TP_ARGS(vcpu_pc, cache),
-
-	    TP_STRUCT__entry(
-		    __field(	unsigned long,	vcpu_pc		)
-		    __field(	bool,		cache		)
-	    ),
-
-	    TP_fast_assign(
-		    __entry->vcpu_pc		= vcpu_pc;
-		    __entry->cache		= cache;
-	    ),
-
-	    TP_printk("S/W flush at 0x%016lx (cache %s)",
-		      __entry->vcpu_pc, __entry->cache ? "on" : "off")
-);
-
-TRACE_EVENT(kvm_toggle_cache,
-	    TP_PROTO(unsigned long vcpu_pc, bool was, bool now),
-	    TP_ARGS(vcpu_pc, was, now),
-
-	    TP_STRUCT__entry(
-		    __field(	unsigned long,	vcpu_pc		)
-		    __field(	bool,		was		)
-		    __field(	bool,		now		)
-	    ),
-
-	    TP_fast_assign(
-		    __entry->vcpu_pc		= vcpu_pc;
-		    __entry->was		= was;
-		    __entry->now		= now;
-	    ),
-
-	    TP_printk("VM op at 0x%016lx (cache was %s, now %s)",
-		      __entry->vcpu_pc, __entry->was ? "on" : "off",
-		      __entry->now ? "on" : "off")
-);
-
-/*
- * Tracepoints for arch_timer
- */
-TRACE_EVENT(kvm_timer_update_irq,
-	TP_PROTO(unsigned long vcpu_id, __u32 irq, int level),
-	TP_ARGS(vcpu_id, irq, level),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	vcpu_id	)
-		__field(	__u32,		irq	)
-		__field(	int,		level	)
-	),
-
-	TP_fast_assign(
-		__entry->vcpu_id	= vcpu_id;
-		__entry->irq		= irq;
-		__entry->level		= level;
-	),
-
-	TP_printk("VCPU: %ld, IRQ %d, level %d",
-		  __entry->vcpu_id, __entry->irq, __entry->level)
-);
-
-#endif /* _TRACE_KVM_H */
-
-#undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH ../../../virt/kvm/arm
-#undef TRACE_INCLUDE_FILE
-#define TRACE_INCLUDE_FILE trace
-
-/* This part must be outside protection */
-#include <trace/define_trace.h>
diff --git a/virt/kvm/arm/vgic/trace.h b/virt/kvm/arm/vgic/trace.h
deleted file mode 100644
index 55fed77a9f73..000000000000
--- a/virt/kvm/arm/vgic/trace.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#if !defined(_TRACE_VGIC_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_VGIC_H
-
-#include <linux/tracepoint.h>
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM kvm
-
-TRACE_EVENT(vgic_update_irq_pending,
-	TP_PROTO(unsigned long vcpu_id, __u32 irq, bool level),
-	TP_ARGS(vcpu_id, irq, level),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	vcpu_id	)
-		__field(	__u32,		irq	)
-		__field(	bool,		level	)
-	),
-
-	TP_fast_assign(
-		__entry->vcpu_id	= vcpu_id;
-		__entry->irq		= irq;
-		__entry->level		= level;
-	),
-
-	TP_printk("VCPU: %ld, IRQ %d, level: %d",
-		  __entry->vcpu_id, __entry->irq, __entry->level)
-);
-
-#endif /* _TRACE_VGIC_H */
-
-#undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH ../../../virt/kvm/arm/vgic
-#undef TRACE_INCLUDE_FILE
-#define TRACE_INCLUDE_FILE trace
-
-/* This part must be outside protection */
-#include <trace/define_trace.h>
diff --git a/virt/kvm/arm/vgic/vgic-debug.c b/virt/kvm/arm/vgic/vgic-debug.c
deleted file mode 100644
index 07aa900bac56..000000000000
--- a/virt/kvm/arm/vgic/vgic-debug.c
+++ /dev/null
@@ -1,299 +0,0 @@
-/*
- * Copyright (C) 2016 Linaro
- * Author: Christoffer Dall <christoffer.dall@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/cpu.h>
-#include <linux/debugfs.h>
-#include <linux/interrupt.h>
-#include <linux/kvm_host.h>
-#include <linux/seq_file.h>
-#include <kvm/arm_vgic.h>
-#include <asm/kvm_mmu.h>
-#include "vgic.h"
-
-/*
- * Structure to control looping through the entire vgic state.  We start at
- * zero for each field and move upwards.  So, if dist_id is 0 we print the
- * distributor info.  When dist_id is 1, we have already printed it and move
- * on.
- *
- * When vcpu_id < nr_cpus we print the vcpu info until vcpu_id == nr_cpus and
- * so on.
- */
-struct vgic_state_iter {
-	int nr_cpus;
-	int nr_spis;
-	int nr_lpis;
-	int dist_id;
-	int vcpu_id;
-	int intid;
-	int lpi_idx;
-	u32 *lpi_array;
-};
-
-static void iter_next(struct vgic_state_iter *iter)
-{
-	if (iter->dist_id == 0) {
-		iter->dist_id++;
-		return;
-	}
-
-	iter->intid++;
-	if (iter->intid == VGIC_NR_PRIVATE_IRQS &&
-	    ++iter->vcpu_id < iter->nr_cpus)
-		iter->intid = 0;
-
-	if (iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS)) {
-		if (iter->lpi_idx < iter->nr_lpis)
-			iter->intid = iter->lpi_array[iter->lpi_idx];
-		iter->lpi_idx++;
-	}
-}
-
-static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter,
-		      loff_t pos)
-{
-	int nr_cpus = atomic_read(&kvm->online_vcpus);
-
-	memset(iter, 0, sizeof(*iter));
-
-	iter->nr_cpus = nr_cpus;
-	iter->nr_spis = kvm->arch.vgic.nr_spis;
-	if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
-		iter->nr_lpis = vgic_copy_lpi_list(kvm, NULL, &iter->lpi_array);
-		if (iter->nr_lpis < 0)
-			iter->nr_lpis = 0;
-	}
-
-	/* Fast forward to the right position if needed */
-	while (pos--)
-		iter_next(iter);
-}
-
-static bool end_of_vgic(struct vgic_state_iter *iter)
-{
-	return iter->dist_id > 0 &&
-		iter->vcpu_id == iter->nr_cpus &&
-		iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS) &&
-		iter->lpi_idx > iter->nr_lpis;
-}
-
-static void *vgic_debug_start(struct seq_file *s, loff_t *pos)
-{
-	struct kvm *kvm = (struct kvm *)s->private;
-	struct vgic_state_iter *iter;
-
-	mutex_lock(&kvm->lock);
-	iter = kvm->arch.vgic.iter;
-	if (iter) {
-		iter = ERR_PTR(-EBUSY);
-		goto out;
-	}
-
-	iter = kmalloc(sizeof(*iter), GFP_KERNEL);
-	if (!iter) {
-		iter = ERR_PTR(-ENOMEM);
-		goto out;
-	}
-
-	iter_init(kvm, iter, *pos);
-	kvm->arch.vgic.iter = iter;
-
-	if (end_of_vgic(iter))
-		iter = NULL;
-out:
-	mutex_unlock(&kvm->lock);
-	return iter;
-}
-
-static void *vgic_debug_next(struct seq_file *s, void *v, loff_t *pos)
-{
-	struct kvm *kvm = (struct kvm *)s->private;
-	struct vgic_state_iter *iter = kvm->arch.vgic.iter;
-
-	++*pos;
-	iter_next(iter);
-	if (end_of_vgic(iter))
-		iter = NULL;
-	return iter;
-}
-
-static void vgic_debug_stop(struct seq_file *s, void *v)
-{
-	struct kvm *kvm = (struct kvm *)s->private;
-	struct vgic_state_iter *iter;
-
-	/*
-	 * If the seq file wasn't properly opened, there's nothing to clearn
-	 * up.
-	 */
-	if (IS_ERR(v))
-		return;
-
-	mutex_lock(&kvm->lock);
-	iter = kvm->arch.vgic.iter;
-	kfree(iter->lpi_array);
-	kfree(iter);
-	kvm->arch.vgic.iter = NULL;
-	mutex_unlock(&kvm->lock);
-}
-
-static void print_dist_state(struct seq_file *s, struct vgic_dist *dist)
-{
-	bool v3 = dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3;
-
-	seq_printf(s, "Distributor\n");
-	seq_printf(s, "===========\n");
-	seq_printf(s, "vgic_model:\t%s\n", v3 ? "GICv3" : "GICv2");
-	seq_printf(s, "nr_spis:\t%d\n", dist->nr_spis);
-	if (v3)
-		seq_printf(s, "nr_lpis:\t%d\n", dist->lpi_list_count);
-	seq_printf(s, "enabled:\t%d\n", dist->enabled);
-	seq_printf(s, "\n");
-
-	seq_printf(s, "P=pending_latch, L=line_level, A=active\n");
-	seq_printf(s, "E=enabled, H=hw, C=config (level=1, edge=0)\n");
-	seq_printf(s, "G=group\n");
-}
-
-static void print_header(struct seq_file *s, struct vgic_irq *irq,
-			 struct kvm_vcpu *vcpu)
-{
-	int id = 0;
-	char *hdr = "SPI ";
-
-	if (vcpu) {
-		hdr = "VCPU";
-		id = vcpu->vcpu_id;
-	}
-
-	seq_printf(s, "\n");
-	seq_printf(s, "%s%2d TYP   ID TGT_ID PLAEHCG     HWID   TARGET SRC PRI VCPU_ID\n", hdr, id);
-	seq_printf(s, "----------------------------------------------------------------\n");
-}
-
-static void print_irq_state(struct seq_file *s, struct vgic_irq *irq,
-			    struct kvm_vcpu *vcpu)
-{
-	char *type;
-	if (irq->intid < VGIC_NR_SGIS)
-		type = "SGI";
-	else if (irq->intid < VGIC_NR_PRIVATE_IRQS)
-		type = "PPI";
-	else if (irq->intid < VGIC_MAX_SPI)
-		type = "SPI";
-	else
-		type = "LPI";
-
-	if (irq->intid ==0 || irq->intid == VGIC_NR_PRIVATE_IRQS)
-		print_header(s, irq, vcpu);
-
-	seq_printf(s, "       %s %4d "
-		      "    %2d "
-		      "%d%d%d%d%d%d%d "
-		      "%8d "
-		      "%8x "
-		      " %2x "
-		      "%3d "
-		      "     %2d "
-		      "\n",
-			type, irq->intid,
-			(irq->target_vcpu) ? irq->target_vcpu->vcpu_id : -1,
-			irq->pending_latch,
-			irq->line_level,
-			irq->active,
-			irq->enabled,
-			irq->hw,
-			irq->config == VGIC_CONFIG_LEVEL,
-			irq->group,
-			irq->hwintid,
-			irq->mpidr,
-			irq->source,
-			irq->priority,
-			(irq->vcpu) ? irq->vcpu->vcpu_id : -1);
-}
-
-static int vgic_debug_show(struct seq_file *s, void *v)
-{
-	struct kvm *kvm = (struct kvm *)s->private;
-	struct vgic_state_iter *iter = (struct vgic_state_iter *)v;
-	struct vgic_irq *irq;
-	struct kvm_vcpu *vcpu = NULL;
-	unsigned long flags;
-
-	if (iter->dist_id == 0) {
-		print_dist_state(s, &kvm->arch.vgic);
-		return 0;
-	}
-
-	if (!kvm->arch.vgic.initialized)
-		return 0;
-
-	if (iter->vcpu_id < iter->nr_cpus)
-		vcpu = kvm_get_vcpu(kvm, iter->vcpu_id);
-
-	irq = vgic_get_irq(kvm, vcpu, iter->intid);
-	if (!irq) {
-		seq_printf(s, "       LPI %4d freed\n", iter->intid);
-		return 0;
-	}
-
-	spin_lock_irqsave(&irq->irq_lock, flags);
-	print_irq_state(s, irq, vcpu);
-	spin_unlock_irqrestore(&irq->irq_lock, flags);
-
-	vgic_put_irq(kvm, irq);
-	return 0;
-}
-
-static const struct seq_operations vgic_debug_seq_ops = {
-	.start = vgic_debug_start,
-	.next  = vgic_debug_next,
-	.stop  = vgic_debug_stop,
-	.show  = vgic_debug_show
-};
-
-static int debug_open(struct inode *inode, struct file *file)
-{
-	int ret;
-	ret = seq_open(file, &vgic_debug_seq_ops);
-	if (!ret) {
-		struct seq_file *seq;
-		/* seq_open will have modified file->private_data */
-		seq = file->private_data;
-		seq->private = inode->i_private;
-	}
-
-	return ret;
-};
-
-static const struct file_operations vgic_debug_fops = {
-	.owner   = THIS_MODULE,
-	.open    = debug_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release
-};
-
-void vgic_debug_init(struct kvm *kvm)
-{
-	debugfs_create_file("vgic-state", 0444, kvm->debugfs_dentry, kvm,
-			    &vgic_debug_fops);
-}
-
-void vgic_debug_destroy(struct kvm *kvm)
-{
-}
diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c
deleted file mode 100644
index c0c0b88af1d5..000000000000
--- a/virt/kvm/arm/vgic/vgic-init.c
+++ /dev/null
@@ -1,543 +0,0 @@
-/*
- * Copyright (C) 2015, 2016 ARM Ltd.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/uaccess.h>
-#include <linux/interrupt.h>
-#include <linux/cpu.h>
-#include <linux/kvm_host.h>
-#include <kvm/arm_vgic.h>
-#include <asm/kvm_mmu.h>
-#include "vgic.h"
-
-/*
- * Initialization rules: there are multiple stages to the vgic
- * initialization, both for the distributor and the CPU interfaces.  The basic
- * idea is that even though the VGIC is not functional or not requested from
- * user space, the critical path of the run loop can still call VGIC functions
- * that just won't do anything, without them having to check additional
- * initialization flags to ensure they don't look at uninitialized data
- * structures.
- *
- * Distributor:
- *
- * - kvm_vgic_early_init(): initialization of static data that doesn't
- *   depend on any sizing information or emulation type. No allocation
- *   is allowed there.
- *
- * - vgic_init(): allocation and initialization of the generic data
- *   structures that depend on sizing information (number of CPUs,
- *   number of interrupts). Also initializes the vcpu specific data
- *   structures. Can be executed lazily for GICv2.
- *
- * CPU Interface:
- *
- * - kvm_vgic_vcpu_init(): initialization of static data that
- *   doesn't depend on any sizing information or emulation type. No
- *   allocation is allowed there.
- */
-
-/* EARLY INIT */
-
-/**
- * kvm_vgic_early_init() - Initialize static VGIC VCPU data structures
- * @kvm: The VM whose VGIC districutor should be initialized
- *
- * Only do initialization of static structures that don't require any
- * allocation or sizing information from userspace.  vgic_init() called
- * kvm_vgic_dist_init() which takes care of the rest.
- */
-void kvm_vgic_early_init(struct kvm *kvm)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-
-	INIT_LIST_HEAD(&dist->lpi_list_head);
-	spin_lock_init(&dist->lpi_list_lock);
-}
-
-/* CREATION */
-
-/**
- * kvm_vgic_create: triggered by the instantiation of the VGIC device by
- * user space, either through the legacy KVM_CREATE_IRQCHIP ioctl (v2 only)
- * or through the generic KVM_CREATE_DEVICE API ioctl.
- * irqchip_in_kernel() tells you if this function succeeded or not.
- * @kvm: kvm struct pointer
- * @type: KVM_DEV_TYPE_ARM_VGIC_V[23]
- */
-int kvm_vgic_create(struct kvm *kvm, u32 type)
-{
-	int i, vcpu_lock_idx = -1, ret;
-	struct kvm_vcpu *vcpu;
-
-	if (irqchip_in_kernel(kvm))
-		return -EEXIST;
-
-	/*
-	 * This function is also called by the KVM_CREATE_IRQCHIP handler,
-	 * which had no chance yet to check the availability of the GICv2
-	 * emulation. So check this here again. KVM_CREATE_DEVICE does
-	 * the proper checks already.
-	 */
-	if (type == KVM_DEV_TYPE_ARM_VGIC_V2 &&
-		!kvm_vgic_global_state.can_emulate_gicv2)
-		return -ENODEV;
-
-	/*
-	 * Any time a vcpu is run, vcpu_load is called which tries to grab the
-	 * vcpu->mutex.  By grabbing the vcpu->mutex of all VCPUs we ensure
-	 * that no other VCPUs are run while we create the vgic.
-	 */
-	ret = -EBUSY;
-	kvm_for_each_vcpu(i, vcpu, kvm) {
-		if (!mutex_trylock(&vcpu->mutex))
-			goto out_unlock;
-		vcpu_lock_idx = i;
-	}
-
-	kvm_for_each_vcpu(i, vcpu, kvm) {
-		if (vcpu->arch.has_run_once)
-			goto out_unlock;
-	}
-	ret = 0;
-
-	if (type == KVM_DEV_TYPE_ARM_VGIC_V2)
-		kvm->arch.max_vcpus = VGIC_V2_MAX_CPUS;
-	else
-		kvm->arch.max_vcpus = VGIC_V3_MAX_CPUS;
-
-	if (atomic_read(&kvm->online_vcpus) > kvm->arch.max_vcpus) {
-		ret = -E2BIG;
-		goto out_unlock;
-	}
-
-	kvm->arch.vgic.in_kernel = true;
-	kvm->arch.vgic.vgic_model = type;
-
-	kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
-
-	if (type == KVM_DEV_TYPE_ARM_VGIC_V2)
-		kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
-	else
-		INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions);
-
-out_unlock:
-	for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
-		vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx);
-		mutex_unlock(&vcpu->mutex);
-	}
-	return ret;
-}
-
-/* INIT/DESTROY */
-
-/**
- * kvm_vgic_dist_init: initialize the dist data structures
- * @kvm: kvm struct pointer
- * @nr_spis: number of spis, frozen by caller
- */
-static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0);
-	int i;
-
-	dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL);
-	if (!dist->spis)
-		return  -ENOMEM;
-
-	/*
-	 * In the following code we do not take the irq struct lock since
-	 * no other action on irq structs can happen while the VGIC is
-	 * not initialized yet:
-	 * If someone wants to inject an interrupt or does a MMIO access, we
-	 * require prior initialization in case of a virtual GICv3 or trigger
-	 * initialization when using a virtual GICv2.
-	 */
-	for (i = 0; i < nr_spis; i++) {
-		struct vgic_irq *irq = &dist->spis[i];
-
-		irq->intid = i + VGIC_NR_PRIVATE_IRQS;
-		INIT_LIST_HEAD(&irq->ap_list);
-		spin_lock_init(&irq->irq_lock);
-		irq->vcpu = NULL;
-		irq->target_vcpu = vcpu0;
-		kref_init(&irq->refcount);
-		if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) {
-			irq->targets = 0;
-			irq->group = 0;
-		} else {
-			irq->mpidr = 0;
-			irq->group = 1;
-		}
-	}
-	return 0;
-}
-
-/**
- * kvm_vgic_vcpu_init() - Initialize static VGIC VCPU data
- * structures and register VCPU-specific KVM iodevs
- *
- * @vcpu: pointer to the VCPU being created and initialized
- *
- * Only do initialization, but do not actually enable the
- * VGIC CPU interface
- */
-int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
-{
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-	int ret = 0;
-	int i;
-
-	vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF;
-	vgic_cpu->sgi_iodev.base_addr = VGIC_ADDR_UNDEF;
-
-	INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
-	spin_lock_init(&vgic_cpu->ap_list_lock);
-
-	/*
-	 * Enable and configure all SGIs to be edge-triggered and
-	 * configure all PPIs as level-triggered.
-	 */
-	for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) {
-		struct vgic_irq *irq = &vgic_cpu->private_irqs[i];
-
-		INIT_LIST_HEAD(&irq->ap_list);
-		spin_lock_init(&irq->irq_lock);
-		irq->intid = i;
-		irq->vcpu = NULL;
-		irq->target_vcpu = vcpu;
-		irq->targets = 1U << vcpu->vcpu_id;
-		kref_init(&irq->refcount);
-		if (vgic_irq_is_sgi(i)) {
-			/* SGIs */
-			irq->enabled = 1;
-			irq->config = VGIC_CONFIG_EDGE;
-		} else {
-			/* PPIs */
-			irq->config = VGIC_CONFIG_LEVEL;
-		}
-
-		/*
-		 * GICv3 can only be created via the KVM_DEVICE_CREATE API and
-		 * so we always know the emulation type at this point as it's
-		 * either explicitly configured as GICv3, or explicitly
-		 * configured as GICv2, or not configured yet which also
-		 * implies GICv2.
-		 */
-		if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
-			irq->group = 1;
-		else
-			irq->group = 0;
-	}
-
-	if (!irqchip_in_kernel(vcpu->kvm))
-		return 0;
-
-	/*
-	 * If we are creating a VCPU with a GICv3 we must also register the
-	 * KVM io device for the redistributor that belongs to this VCPU.
-	 */
-	if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
-		mutex_lock(&vcpu->kvm->lock);
-		ret = vgic_register_redist_iodev(vcpu);
-		mutex_unlock(&vcpu->kvm->lock);
-	}
-	return ret;
-}
-
-static void kvm_vgic_vcpu_enable(struct kvm_vcpu *vcpu)
-{
-	if (kvm_vgic_global_state.type == VGIC_V2)
-		vgic_v2_enable(vcpu);
-	else
-		vgic_v3_enable(vcpu);
-}
-
-/*
- * vgic_init: allocates and initializes dist and vcpu data structures
- * depending on two dimensioning parameters:
- * - the number of spis
- * - the number of vcpus
- * The function is generally called when nr_spis has been explicitly set
- * by the guest through the KVM DEVICE API. If not nr_spis is set to 256.
- * vgic_initialized() returns true when this function has succeeded.
- * Must be called with kvm->lock held!
- */
-int vgic_init(struct kvm *kvm)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	struct kvm_vcpu *vcpu;
-	int ret = 0, i;
-
-	if (vgic_initialized(kvm))
-		return 0;
-
-	/* Are we also in the middle of creating a VCPU? */
-	if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus))
-		return -EBUSY;
-
-	/* freeze the number of spis */
-	if (!dist->nr_spis)
-		dist->nr_spis = VGIC_NR_IRQS_LEGACY - VGIC_NR_PRIVATE_IRQS;
-
-	ret = kvm_vgic_dist_init(kvm, dist->nr_spis);
-	if (ret)
-		goto out;
-
-	if (vgic_has_its(kvm)) {
-		ret = vgic_v4_init(kvm);
-		if (ret)
-			goto out;
-	}
-
-	kvm_for_each_vcpu(i, vcpu, kvm)
-		kvm_vgic_vcpu_enable(vcpu);
-
-	ret = kvm_vgic_setup_default_irq_routing(kvm);
-	if (ret)
-		goto out;
-
-	vgic_debug_init(kvm);
-
-	dist->implementation_rev = 2;
-	dist->initialized = true;
-
-out:
-	return ret;
-}
-
-static void kvm_vgic_dist_destroy(struct kvm *kvm)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	struct vgic_redist_region *rdreg, *next;
-
-	dist->ready = false;
-	dist->initialized = false;
-
-	kfree(dist->spis);
-	dist->spis = NULL;
-	dist->nr_spis = 0;
-
-	if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
-		list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list) {
-			list_del(&rdreg->list);
-			kfree(rdreg);
-		}
-		INIT_LIST_HEAD(&dist->rd_regions);
-	}
-
-	if (vgic_supports_direct_msis(kvm))
-		vgic_v4_teardown(kvm);
-}
-
-void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
-{
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-
-	INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
-}
-
-/* To be called with kvm->lock held */
-static void __kvm_vgic_destroy(struct kvm *kvm)
-{
-	struct kvm_vcpu *vcpu;
-	int i;
-
-	vgic_debug_destroy(kvm);
-
-	kvm_vgic_dist_destroy(kvm);
-
-	kvm_for_each_vcpu(i, vcpu, kvm)
-		kvm_vgic_vcpu_destroy(vcpu);
-}
-
-void kvm_vgic_destroy(struct kvm *kvm)
-{
-	mutex_lock(&kvm->lock);
-	__kvm_vgic_destroy(kvm);
-	mutex_unlock(&kvm->lock);
-}
-
-/**
- * vgic_lazy_init: Lazy init is only allowed if the GIC exposed to the guest
- * is a GICv2. A GICv3 must be explicitly initialized by the guest using the
- * KVM_DEV_ARM_VGIC_GRP_CTRL KVM_DEVICE group.
- * @kvm: kvm struct pointer
- */
-int vgic_lazy_init(struct kvm *kvm)
-{
-	int ret = 0;
-
-	if (unlikely(!vgic_initialized(kvm))) {
-		/*
-		 * We only provide the automatic initialization of the VGIC
-		 * for the legacy case of a GICv2. Any other type must
-		 * be explicitly initialized once setup with the respective
-		 * KVM device call.
-		 */
-		if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2)
-			return -EBUSY;
-
-		mutex_lock(&kvm->lock);
-		ret = vgic_init(kvm);
-		mutex_unlock(&kvm->lock);
-	}
-
-	return ret;
-}
-
-/* RESOURCE MAPPING */
-
-/**
- * Map the MMIO regions depending on the VGIC model exposed to the guest
- * called on the first VCPU run.
- * Also map the virtual CPU interface into the VM.
- * v2/v3 derivatives call vgic_init if not already done.
- * vgic_ready() returns true if this function has succeeded.
- * @kvm: kvm struct pointer
- */
-int kvm_vgic_map_resources(struct kvm *kvm)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	int ret = 0;
-
-	mutex_lock(&kvm->lock);
-	if (!irqchip_in_kernel(kvm))
-		goto out;
-
-	if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2)
-		ret = vgic_v2_map_resources(kvm);
-	else
-		ret = vgic_v3_map_resources(kvm);
-
-	if (ret)
-		__kvm_vgic_destroy(kvm);
-
-out:
-	mutex_unlock(&kvm->lock);
-	return ret;
-}
-
-/* GENERIC PROBE */
-
-static int vgic_init_cpu_starting(unsigned int cpu)
-{
-	enable_percpu_irq(kvm_vgic_global_state.maint_irq, 0);
-	return 0;
-}
-
-
-static int vgic_init_cpu_dying(unsigned int cpu)
-{
-	disable_percpu_irq(kvm_vgic_global_state.maint_irq);
-	return 0;
-}
-
-static irqreturn_t vgic_maintenance_handler(int irq, void *data)
-{
-	/*
-	 * We cannot rely on the vgic maintenance interrupt to be
-	 * delivered synchronously. This means we can only use it to
-	 * exit the VM, and we perform the handling of EOIed
-	 * interrupts on the exit path (see vgic_fold_lr_state).
-	 */
-	return IRQ_HANDLED;
-}
-
-/**
- * kvm_vgic_init_cpu_hardware - initialize the GIC VE hardware
- *
- * For a specific CPU, initialize the GIC VE hardware.
- */
-void kvm_vgic_init_cpu_hardware(void)
-{
-	BUG_ON(preemptible());
-
-	/*
-	 * We want to make sure the list registers start out clear so that we
-	 * only have the program the used registers.
-	 */
-	if (kvm_vgic_global_state.type == VGIC_V2)
-		vgic_v2_init_lrs();
-	else
-		kvm_call_hyp(__vgic_v3_init_lrs);
-}
-
-/**
- * kvm_vgic_hyp_init: populates the kvm_vgic_global_state variable
- * according to the host GIC model. Accordingly calls either
- * vgic_v2/v3_probe which registers the KVM_DEVICE that can be
- * instantiated by a guest later on .
- */
-int kvm_vgic_hyp_init(void)
-{
-	const struct gic_kvm_info *gic_kvm_info;
-	int ret;
-
-	gic_kvm_info = gic_get_kvm_info();
-	if (!gic_kvm_info)
-		return -ENODEV;
-
-	if (!gic_kvm_info->maint_irq) {
-		kvm_err("No vgic maintenance irq\n");
-		return -ENXIO;
-	}
-
-	switch (gic_kvm_info->type) {
-	case GIC_V2:
-		ret = vgic_v2_probe(gic_kvm_info);
-		break;
-	case GIC_V3:
-		ret = vgic_v3_probe(gic_kvm_info);
-		if (!ret) {
-			static_branch_enable(&kvm_vgic_global_state.gicv3_cpuif);
-			kvm_info("GIC system register CPU interface enabled\n");
-		}
-		break;
-	default:
-		ret = -ENODEV;
-	};
-
-	if (ret)
-		return ret;
-
-	kvm_vgic_global_state.maint_irq = gic_kvm_info->maint_irq;
-	ret = request_percpu_irq(kvm_vgic_global_state.maint_irq,
-				 vgic_maintenance_handler,
-				 "vgic", kvm_get_running_vcpus());
-	if (ret) {
-		kvm_err("Cannot register interrupt %d\n",
-			kvm_vgic_global_state.maint_irq);
-		return ret;
-	}
-
-	ret = cpuhp_setup_state(CPUHP_AP_KVM_ARM_VGIC_INIT_STARTING,
-				"kvm/arm/vgic:starting",
-				vgic_init_cpu_starting, vgic_init_cpu_dying);
-	if (ret) {
-		kvm_err("Cannot register vgic CPU notifier\n");
-		goto out_free_irq;
-	}
-
-	kvm_info("vgic interrupt IRQ%d\n", kvm_vgic_global_state.maint_irq);
-	return 0;
-
-out_free_irq:
-	free_percpu_irq(kvm_vgic_global_state.maint_irq,
-			kvm_get_running_vcpus());
-	return ret;
-}
diff --git a/virt/kvm/arm/vgic/vgic-irqfd.c b/virt/kvm/arm/vgic/vgic-irqfd.c
deleted file mode 100644
index 99e026d2dade..000000000000
--- a/virt/kvm/arm/vgic/vgic-irqfd.c
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Copyright (C) 2015, 2016 ARM Ltd.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <trace/events/kvm.h>
-#include <kvm/arm_vgic.h>
-#include "vgic.h"
-
-/**
- * vgic_irqfd_set_irq: inject the IRQ corresponding to the
- * irqchip routing entry
- *
- * This is the entry point for irqfd IRQ injection
- */
-static int vgic_irqfd_set_irq(struct kvm_kernel_irq_routing_entry *e,
-			struct kvm *kvm, int irq_source_id,
-			int level, bool line_status)
-{
-	unsigned int spi_id = e->irqchip.pin + VGIC_NR_PRIVATE_IRQS;
-
-	if (!vgic_valid_spi(kvm, spi_id))
-		return -EINVAL;
-	return kvm_vgic_inject_irq(kvm, 0, spi_id, level, NULL);
-}
-
-/**
- * kvm_set_routing_entry: populate a kvm routing entry
- * from a user routing entry
- *
- * @kvm: the VM this entry is applied to
- * @e: kvm kernel routing entry handle
- * @ue: user api routing entry handle
- * return 0 on success, -EINVAL on errors.
- */
-int kvm_set_routing_entry(struct kvm *kvm,
-			  struct kvm_kernel_irq_routing_entry *e,
-			  const struct kvm_irq_routing_entry *ue)
-{
-	int r = -EINVAL;
-
-	switch (ue->type) {
-	case KVM_IRQ_ROUTING_IRQCHIP:
-		e->set = vgic_irqfd_set_irq;
-		e->irqchip.irqchip = ue->u.irqchip.irqchip;
-		e->irqchip.pin = ue->u.irqchip.pin;
-		if ((e->irqchip.pin >= KVM_IRQCHIP_NUM_PINS) ||
-		    (e->irqchip.irqchip >= KVM_NR_IRQCHIPS))
-			goto out;
-		break;
-	case KVM_IRQ_ROUTING_MSI:
-		e->set = kvm_set_msi;
-		e->msi.address_lo = ue->u.msi.address_lo;
-		e->msi.address_hi = ue->u.msi.address_hi;
-		e->msi.data = ue->u.msi.data;
-		e->msi.flags = ue->flags;
-		e->msi.devid = ue->u.msi.devid;
-		break;
-	default:
-		goto out;
-	}
-	r = 0;
-out:
-	return r;
-}
-
-/**
- * kvm_set_msi: inject the MSI corresponding to the
- * MSI routing entry
- *
- * This is the entry point for irqfd MSI injection
- * and userspace MSI injection.
- */
-int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
-		struct kvm *kvm, int irq_source_id,
-		int level, bool line_status)
-{
-	struct kvm_msi msi;
-
-	msi.address_lo = e->msi.address_lo;
-	msi.address_hi = e->msi.address_hi;
-	msi.data = e->msi.data;
-	msi.flags = e->msi.flags;
-	msi.devid = e->msi.devid;
-
-	if (!vgic_has_its(kvm))
-		return -ENODEV;
-
-	if (!level)
-		return -1;
-
-	return vgic_its_inject_msi(kvm, &msi);
-}
-
-int kvm_vgic_setup_default_irq_routing(struct kvm *kvm)
-{
-	struct kvm_irq_routing_entry *entries;
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	u32 nr = dist->nr_spis;
-	int i, ret;
-
-	entries = kcalloc(nr, sizeof(*entries), GFP_KERNEL);
-	if (!entries)
-		return -ENOMEM;
-
-	for (i = 0; i < nr; i++) {
-		entries[i].gsi = i;
-		entries[i].type = KVM_IRQ_ROUTING_IRQCHIP;
-		entries[i].u.irqchip.irqchip = 0;
-		entries[i].u.irqchip.pin = i;
-	}
-	ret = kvm_set_irq_routing(kvm, entries, nr, 0);
-	kfree(entries);
-	return ret;
-}
diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
deleted file mode 100644
index eb2a390a6c86..000000000000
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ /dev/null
@@ -1,2569 +0,0 @@
-/*
- * GICv3 ITS emulation
- *
- * Copyright (C) 2015,2016 ARM Ltd.
- * Author: Andre Przywara <andre.przywara@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/cpu.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/interrupt.h>
-#include <linux/list.h>
-#include <linux/uaccess.h>
-#include <linux/list_sort.h>
-
-#include <linux/irqchip/arm-gic-v3.h>
-
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_mmu.h>
-
-#include "vgic.h"
-#include "vgic-mmio.h"
-
-static int vgic_its_save_tables_v0(struct vgic_its *its);
-static int vgic_its_restore_tables_v0(struct vgic_its *its);
-static int vgic_its_commit_v0(struct vgic_its *its);
-static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq,
-			     struct kvm_vcpu *filter_vcpu, bool needs_inv);
-
-/*
- * Creates a new (reference to a) struct vgic_irq for a given LPI.
- * If this LPI is already mapped on another ITS, we increase its refcount
- * and return a pointer to the existing structure.
- * If this is a "new" LPI, we allocate and initialize a new struct vgic_irq.
- * This function returns a pointer to the _unlocked_ structure.
- */
-static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
-				     struct kvm_vcpu *vcpu)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	struct vgic_irq *irq = vgic_get_irq(kvm, NULL, intid), *oldirq;
-	unsigned long flags;
-	int ret;
-
-	/* In this case there is no put, since we keep the reference. */
-	if (irq)
-		return irq;
-
-	irq = kzalloc(sizeof(struct vgic_irq), GFP_KERNEL);
-	if (!irq)
-		return ERR_PTR(-ENOMEM);
-
-	INIT_LIST_HEAD(&irq->lpi_list);
-	INIT_LIST_HEAD(&irq->ap_list);
-	spin_lock_init(&irq->irq_lock);
-
-	irq->config = VGIC_CONFIG_EDGE;
-	kref_init(&irq->refcount);
-	irq->intid = intid;
-	irq->target_vcpu = vcpu;
-	irq->group = 1;
-
-	spin_lock_irqsave(&dist->lpi_list_lock, flags);
-
-	/*
-	 * There could be a race with another vgic_add_lpi(), so we need to
-	 * check that we don't add a second list entry with the same LPI.
-	 */
-	list_for_each_entry(oldirq, &dist->lpi_list_head, lpi_list) {
-		if (oldirq->intid != intid)
-			continue;
-
-		/* Someone was faster with adding this LPI, lets use that. */
-		kfree(irq);
-		irq = oldirq;
-
-		/*
-		 * This increases the refcount, the caller is expected to
-		 * call vgic_put_irq() on the returned pointer once it's
-		 * finished with the IRQ.
-		 */
-		vgic_get_irq_kref(irq);
-
-		goto out_unlock;
-	}
-
-	list_add_tail(&irq->lpi_list, &dist->lpi_list_head);
-	dist->lpi_list_count++;
-
-out_unlock:
-	spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
-
-	/*
-	 * We "cache" the configuration table entries in our struct vgic_irq's.
-	 * However we only have those structs for mapped IRQs, so we read in
-	 * the respective config data from memory here upon mapping the LPI.
-	 */
-	ret = update_lpi_config(kvm, irq, NULL, false);
-	if (ret)
-		return ERR_PTR(ret);
-
-	ret = vgic_v3_lpi_sync_pending_status(kvm, irq);
-	if (ret)
-		return ERR_PTR(ret);
-
-	return irq;
-}
-
-struct its_device {
-	struct list_head dev_list;
-
-	/* the head for the list of ITTEs */
-	struct list_head itt_head;
-	u32 num_eventid_bits;
-	gpa_t itt_addr;
-	u32 device_id;
-};
-
-#define COLLECTION_NOT_MAPPED ((u32)~0)
-
-struct its_collection {
-	struct list_head coll_list;
-
-	u32 collection_id;
-	u32 target_addr;
-};
-
-#define its_is_collection_mapped(coll) ((coll) && \
-				((coll)->target_addr != COLLECTION_NOT_MAPPED))
-
-struct its_ite {
-	struct list_head ite_list;
-
-	struct vgic_irq *irq;
-	struct its_collection *collection;
-	u32 event_id;
-};
-
-/**
- * struct vgic_its_abi - ITS abi ops and settings
- * @cte_esz: collection table entry size
- * @dte_esz: device table entry size
- * @ite_esz: interrupt translation table entry size
- * @save tables: save the ITS tables into guest RAM
- * @restore_tables: restore the ITS internal structs from tables
- *  stored in guest RAM
- * @commit: initialize the registers which expose the ABI settings,
- *  especially the entry sizes
- */
-struct vgic_its_abi {
-	int cte_esz;
-	int dte_esz;
-	int ite_esz;
-	int (*save_tables)(struct vgic_its *its);
-	int (*restore_tables)(struct vgic_its *its);
-	int (*commit)(struct vgic_its *its);
-};
-
-#define ABI_0_ESZ	8
-#define ESZ_MAX		ABI_0_ESZ
-
-static const struct vgic_its_abi its_table_abi_versions[] = {
-	[0] = {
-	 .cte_esz = ABI_0_ESZ,
-	 .dte_esz = ABI_0_ESZ,
-	 .ite_esz = ABI_0_ESZ,
-	 .save_tables = vgic_its_save_tables_v0,
-	 .restore_tables = vgic_its_restore_tables_v0,
-	 .commit = vgic_its_commit_v0,
-	},
-};
-
-#define NR_ITS_ABIS	ARRAY_SIZE(its_table_abi_versions)
-
-inline const struct vgic_its_abi *vgic_its_get_abi(struct vgic_its *its)
-{
-	return &its_table_abi_versions[its->abi_rev];
-}
-
-static int vgic_its_set_abi(struct vgic_its *its, u32 rev)
-{
-	const struct vgic_its_abi *abi;
-
-	its->abi_rev = rev;
-	abi = vgic_its_get_abi(its);
-	return abi->commit(its);
-}
-
-/*
- * Find and returns a device in the device table for an ITS.
- * Must be called with the its_lock mutex held.
- */
-static struct its_device *find_its_device(struct vgic_its *its, u32 device_id)
-{
-	struct its_device *device;
-
-	list_for_each_entry(device, &its->device_list, dev_list)
-		if (device_id == device->device_id)
-			return device;
-
-	return NULL;
-}
-
-/*
- * Find and returns an interrupt translation table entry (ITTE) for a given
- * Device ID/Event ID pair on an ITS.
- * Must be called with the its_lock mutex held.
- */
-static struct its_ite *find_ite(struct vgic_its *its, u32 device_id,
-				  u32 event_id)
-{
-	struct its_device *device;
-	struct its_ite *ite;
-
-	device = find_its_device(its, device_id);
-	if (device == NULL)
-		return NULL;
-
-	list_for_each_entry(ite, &device->itt_head, ite_list)
-		if (ite->event_id == event_id)
-			return ite;
-
-	return NULL;
-}
-
-/* To be used as an iterator this macro misses the enclosing parentheses */
-#define for_each_lpi_its(dev, ite, its) \
-	list_for_each_entry(dev, &(its)->device_list, dev_list) \
-		list_for_each_entry(ite, &(dev)->itt_head, ite_list)
-
-#define GIC_LPI_OFFSET 8192
-
-#define VITS_TYPER_IDBITS 16
-#define VITS_TYPER_DEVBITS 16
-#define VITS_DTE_MAX_DEVID_OFFSET	(BIT(14) - 1)
-#define VITS_ITE_MAX_EVENTID_OFFSET	(BIT(16) - 1)
-
-/*
- * Finds and returns a collection in the ITS collection table.
- * Must be called with the its_lock mutex held.
- */
-static struct its_collection *find_collection(struct vgic_its *its, int coll_id)
-{
-	struct its_collection *collection;
-
-	list_for_each_entry(collection, &its->collection_list, coll_list) {
-		if (coll_id == collection->collection_id)
-			return collection;
-	}
-
-	return NULL;
-}
-
-#define LPI_PROP_ENABLE_BIT(p)	((p) & LPI_PROP_ENABLED)
-#define LPI_PROP_PRIORITY(p)	((p) & 0xfc)
-
-/*
- * Reads the configuration data for a given LPI from guest memory and
- * updates the fields in struct vgic_irq.
- * If filter_vcpu is not NULL, applies only if the IRQ is targeting this
- * VCPU. Unconditionally applies if filter_vcpu is NULL.
- */
-static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq,
-			     struct kvm_vcpu *filter_vcpu, bool needs_inv)
-{
-	u64 propbase = GICR_PROPBASER_ADDRESS(kvm->arch.vgic.propbaser);
-	u8 prop;
-	int ret;
-	unsigned long flags;
-
-	ret = kvm_read_guest_lock(kvm, propbase + irq->intid - GIC_LPI_OFFSET,
-				  &prop, 1);
-
-	if (ret)
-		return ret;
-
-	spin_lock_irqsave(&irq->irq_lock, flags);
-
-	if (!filter_vcpu || filter_vcpu == irq->target_vcpu) {
-		irq->priority = LPI_PROP_PRIORITY(prop);
-		irq->enabled = LPI_PROP_ENABLE_BIT(prop);
-
-		if (!irq->hw) {
-			vgic_queue_irq_unlock(kvm, irq, flags);
-			return 0;
-		}
-	}
-
-	spin_unlock_irqrestore(&irq->irq_lock, flags);
-
-	if (irq->hw)
-		return its_prop_update_vlpi(irq->host_irq, prop, needs_inv);
-
-	return 0;
-}
-
-/*
- * Create a snapshot of the current LPIs targeting @vcpu, so that we can
- * enumerate those LPIs without holding any lock.
- * Returns their number and puts the kmalloc'ed array into intid_ptr.
- */
-int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	struct vgic_irq *irq;
-	unsigned long flags;
-	u32 *intids;
-	int irq_count, i = 0;
-
-	/*
-	 * There is an obvious race between allocating the array and LPIs
-	 * being mapped/unmapped. If we ended up here as a result of a
-	 * command, we're safe (locks are held, preventing another
-	 * command). If coming from another path (such as enabling LPIs),
-	 * we must be careful not to overrun the array.
-	 */
-	irq_count = READ_ONCE(dist->lpi_list_count);
-	intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL);
-	if (!intids)
-		return -ENOMEM;
-
-	spin_lock_irqsave(&dist->lpi_list_lock, flags);
-	list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
-		if (i == irq_count)
-			break;
-		/* We don't need to "get" the IRQ, as we hold the list lock. */
-		if (vcpu && irq->target_vcpu != vcpu)
-			continue;
-		intids[i++] = irq->intid;
-	}
-	spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
-
-	*intid_ptr = intids;
-	return i;
-}
-
-static int update_affinity(struct vgic_irq *irq, struct kvm_vcpu *vcpu)
-{
-	int ret = 0;
-	unsigned long flags;
-
-	spin_lock_irqsave(&irq->irq_lock, flags);
-	irq->target_vcpu = vcpu;
-	spin_unlock_irqrestore(&irq->irq_lock, flags);
-
-	if (irq->hw) {
-		struct its_vlpi_map map;
-
-		ret = its_get_vlpi(irq->host_irq, &map);
-		if (ret)
-			return ret;
-
-		map.vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
-
-		ret = its_map_vlpi(irq->host_irq, &map);
-	}
-
-	return ret;
-}
-
-/*
- * Promotes the ITS view of affinity of an ITTE (which redistributor this LPI
- * is targeting) to the VGIC's view, which deals with target VCPUs.
- * Needs to be called whenever either the collection for a LPIs has
- * changed or the collection itself got retargeted.
- */
-static void update_affinity_ite(struct kvm *kvm, struct its_ite *ite)
-{
-	struct kvm_vcpu *vcpu;
-
-	if (!its_is_collection_mapped(ite->collection))
-		return;
-
-	vcpu = kvm_get_vcpu(kvm, ite->collection->target_addr);
-	update_affinity(ite->irq, vcpu);
-}
-
-/*
- * Updates the target VCPU for every LPI targeting this collection.
- * Must be called with the its_lock mutex held.
- */
-static void update_affinity_collection(struct kvm *kvm, struct vgic_its *its,
-				       struct its_collection *coll)
-{
-	struct its_device *device;
-	struct its_ite *ite;
-
-	for_each_lpi_its(device, ite, its) {
-		if (!ite->collection || coll != ite->collection)
-			continue;
-
-		update_affinity_ite(kvm, ite);
-	}
-}
-
-static u32 max_lpis_propbaser(u64 propbaser)
-{
-	int nr_idbits = (propbaser & 0x1f) + 1;
-
-	return 1U << min(nr_idbits, INTERRUPT_ID_BITS_ITS);
-}
-
-/*
- * Sync the pending table pending bit of LPIs targeting @vcpu
- * with our own data structures. This relies on the LPI being
- * mapped before.
- */
-static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu)
-{
-	gpa_t pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser);
-	struct vgic_irq *irq;
-	int last_byte_offset = -1;
-	int ret = 0;
-	u32 *intids;
-	int nr_irqs, i;
-	unsigned long flags;
-	u8 pendmask;
-
-	nr_irqs = vgic_copy_lpi_list(vcpu->kvm, vcpu, &intids);
-	if (nr_irqs < 0)
-		return nr_irqs;
-
-	for (i = 0; i < nr_irqs; i++) {
-		int byte_offset, bit_nr;
-
-		byte_offset = intids[i] / BITS_PER_BYTE;
-		bit_nr = intids[i] % BITS_PER_BYTE;
-
-		/*
-		 * For contiguously allocated LPIs chances are we just read
-		 * this very same byte in the last iteration. Reuse that.
-		 */
-		if (byte_offset != last_byte_offset) {
-			ret = kvm_read_guest_lock(vcpu->kvm,
-						  pendbase + byte_offset,
-						  &pendmask, 1);
-			if (ret) {
-				kfree(intids);
-				return ret;
-			}
-			last_byte_offset = byte_offset;
-		}
-
-		irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]);
-		spin_lock_irqsave(&irq->irq_lock, flags);
-		irq->pending_latch = pendmask & (1U << bit_nr);
-		vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-
-	kfree(intids);
-
-	return ret;
-}
-
-static unsigned long vgic_mmio_read_its_typer(struct kvm *kvm,
-					      struct vgic_its *its,
-					      gpa_t addr, unsigned int len)
-{
-	const struct vgic_its_abi *abi = vgic_its_get_abi(its);
-	u64 reg = GITS_TYPER_PLPIS;
-
-	/*
-	 * We use linear CPU numbers for redistributor addressing,
-	 * so GITS_TYPER.PTA is 0.
-	 * Also we force all PROPBASER registers to be the same, so
-	 * CommonLPIAff is 0 as well.
-	 * To avoid memory waste in the guest, we keep the number of IDBits and
-	 * DevBits low - as least for the time being.
-	 */
-	reg |= GIC_ENCODE_SZ(VITS_TYPER_DEVBITS, 5) << GITS_TYPER_DEVBITS_SHIFT;
-	reg |= GIC_ENCODE_SZ(VITS_TYPER_IDBITS, 5) << GITS_TYPER_IDBITS_SHIFT;
-	reg |= GIC_ENCODE_SZ(abi->ite_esz, 4) << GITS_TYPER_ITT_ENTRY_SIZE_SHIFT;
-
-	return extract_bytes(reg, addr & 7, len);
-}
-
-static unsigned long vgic_mmio_read_its_iidr(struct kvm *kvm,
-					     struct vgic_its *its,
-					     gpa_t addr, unsigned int len)
-{
-	u32 val;
-
-	val = (its->abi_rev << GITS_IIDR_REV_SHIFT) & GITS_IIDR_REV_MASK;
-	val |= (PRODUCT_ID_KVM << GITS_IIDR_PRODUCTID_SHIFT) | IMPLEMENTER_ARM;
-	return val;
-}
-
-static int vgic_mmio_uaccess_write_its_iidr(struct kvm *kvm,
-					    struct vgic_its *its,
-					    gpa_t addr, unsigned int len,
-					    unsigned long val)
-{
-	u32 rev = GITS_IIDR_REV(val);
-
-	if (rev >= NR_ITS_ABIS)
-		return -EINVAL;
-	return vgic_its_set_abi(its, rev);
-}
-
-static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm,
-					       struct vgic_its *its,
-					       gpa_t addr, unsigned int len)
-{
-	switch (addr & 0xffff) {
-	case GITS_PIDR0:
-		return 0x92;	/* part number, bits[7:0] */
-	case GITS_PIDR1:
-		return 0xb4;	/* part number, bits[11:8] */
-	case GITS_PIDR2:
-		return GIC_PIDR2_ARCH_GICv3 | 0x0b;
-	case GITS_PIDR4:
-		return 0x40;	/* This is a 64K software visible page */
-	/* The following are the ID registers for (any) GIC. */
-	case GITS_CIDR0:
-		return 0x0d;
-	case GITS_CIDR1:
-		return 0xf0;
-	case GITS_CIDR2:
-		return 0x05;
-	case GITS_CIDR3:
-		return 0xb1;
-	}
-
-	return 0;
-}
-
-int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its,
-			 u32 devid, u32 eventid, struct vgic_irq **irq)
-{
-	struct kvm_vcpu *vcpu;
-	struct its_ite *ite;
-
-	if (!its->enabled)
-		return -EBUSY;
-
-	ite = find_ite(its, devid, eventid);
-	if (!ite || !its_is_collection_mapped(ite->collection))
-		return E_ITS_INT_UNMAPPED_INTERRUPT;
-
-	vcpu = kvm_get_vcpu(kvm, ite->collection->target_addr);
-	if (!vcpu)
-		return E_ITS_INT_UNMAPPED_INTERRUPT;
-
-	if (!vcpu->arch.vgic_cpu.lpis_enabled)
-		return -EBUSY;
-
-	*irq = ite->irq;
-	return 0;
-}
-
-struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi)
-{
-	u64 address;
-	struct kvm_io_device *kvm_io_dev;
-	struct vgic_io_device *iodev;
-
-	if (!vgic_has_its(kvm))
-		return ERR_PTR(-ENODEV);
-
-	if (!(msi->flags & KVM_MSI_VALID_DEVID))
-		return ERR_PTR(-EINVAL);
-
-	address = (u64)msi->address_hi << 32 | msi->address_lo;
-
-	kvm_io_dev = kvm_io_bus_get_dev(kvm, KVM_MMIO_BUS, address);
-	if (!kvm_io_dev)
-		return ERR_PTR(-EINVAL);
-
-	if (kvm_io_dev->ops != &kvm_io_gic_ops)
-		return ERR_PTR(-EINVAL);
-
-	iodev = container_of(kvm_io_dev, struct vgic_io_device, dev);
-	if (iodev->iodev_type != IODEV_ITS)
-		return ERR_PTR(-EINVAL);
-
-	return iodev->its;
-}
-
-/*
- * Find the target VCPU and the LPI number for a given devid/eventid pair
- * and make this IRQ pending, possibly injecting it.
- * Must be called with the its_lock mutex held.
- * Returns 0 on success, a positive error value for any ITS mapping
- * related errors and negative error values for generic errors.
- */
-static int vgic_its_trigger_msi(struct kvm *kvm, struct vgic_its *its,
-				u32 devid, u32 eventid)
-{
-	struct vgic_irq *irq = NULL;
-	unsigned long flags;
-	int err;
-
-	err = vgic_its_resolve_lpi(kvm, its, devid, eventid, &irq);
-	if (err)
-		return err;
-
-	if (irq->hw)
-		return irq_set_irqchip_state(irq->host_irq,
-					     IRQCHIP_STATE_PENDING, true);
-
-	spin_lock_irqsave(&irq->irq_lock, flags);
-	irq->pending_latch = true;
-	vgic_queue_irq_unlock(kvm, irq, flags);
-
-	return 0;
-}
-
-/*
- * Queries the KVM IO bus framework to get the ITS pointer from the given
- * doorbell address.
- * We then call vgic_its_trigger_msi() with the decoded data.
- * According to the KVM_SIGNAL_MSI API description returns 1 on success.
- */
-int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi)
-{
-	struct vgic_its *its;
-	int ret;
-
-	its = vgic_msi_to_its(kvm, msi);
-	if (IS_ERR(its))
-		return PTR_ERR(its);
-
-	mutex_lock(&its->its_lock);
-	ret = vgic_its_trigger_msi(kvm, its, msi->devid, msi->data);
-	mutex_unlock(&its->its_lock);
-
-	if (ret < 0)
-		return ret;
-
-	/*
-	 * KVM_SIGNAL_MSI demands a return value > 0 for success and 0
-	 * if the guest has blocked the MSI. So we map any LPI mapping
-	 * related error to that.
-	 */
-	if (ret)
-		return 0;
-	else
-		return 1;
-}
-
-/* Requires the its_lock to be held. */
-static void its_free_ite(struct kvm *kvm, struct its_ite *ite)
-{
-	list_del(&ite->ite_list);
-
-	/* This put matches the get in vgic_add_lpi. */
-	if (ite->irq) {
-		if (ite->irq->hw)
-			WARN_ON(its_unmap_vlpi(ite->irq->host_irq));
-
-		vgic_put_irq(kvm, ite->irq);
-	}
-
-	kfree(ite);
-}
-
-static u64 its_cmd_mask_field(u64 *its_cmd, int word, int shift, int size)
-{
-	return (le64_to_cpu(its_cmd[word]) >> shift) & (BIT_ULL(size) - 1);
-}
-
-#define its_cmd_get_command(cmd)	its_cmd_mask_field(cmd, 0,  0,  8)
-#define its_cmd_get_deviceid(cmd)	its_cmd_mask_field(cmd, 0, 32, 32)
-#define its_cmd_get_size(cmd)		(its_cmd_mask_field(cmd, 1,  0,  5) + 1)
-#define its_cmd_get_id(cmd)		its_cmd_mask_field(cmd, 1,  0, 32)
-#define its_cmd_get_physical_id(cmd)	its_cmd_mask_field(cmd, 1, 32, 32)
-#define its_cmd_get_collection(cmd)	its_cmd_mask_field(cmd, 2,  0, 16)
-#define its_cmd_get_ittaddr(cmd)	(its_cmd_mask_field(cmd, 2,  8, 44) << 8)
-#define its_cmd_get_target_addr(cmd)	its_cmd_mask_field(cmd, 2, 16, 32)
-#define its_cmd_get_validbit(cmd)	its_cmd_mask_field(cmd, 2, 63,  1)
-
-/*
- * The DISCARD command frees an Interrupt Translation Table Entry (ITTE).
- * Must be called with the its_lock mutex held.
- */
-static int vgic_its_cmd_handle_discard(struct kvm *kvm, struct vgic_its *its,
-				       u64 *its_cmd)
-{
-	u32 device_id = its_cmd_get_deviceid(its_cmd);
-	u32 event_id = its_cmd_get_id(its_cmd);
-	struct its_ite *ite;
-
-
-	ite = find_ite(its, device_id, event_id);
-	if (ite && ite->collection) {
-		/*
-		 * Though the spec talks about removing the pending state, we
-		 * don't bother here since we clear the ITTE anyway and the
-		 * pending state is a property of the ITTE struct.
-		 */
-		its_free_ite(kvm, ite);
-		return 0;
-	}
-
-	return E_ITS_DISCARD_UNMAPPED_INTERRUPT;
-}
-
-/*
- * The MOVI command moves an ITTE to a different collection.
- * Must be called with the its_lock mutex held.
- */
-static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its,
-				    u64 *its_cmd)
-{
-	u32 device_id = its_cmd_get_deviceid(its_cmd);
-	u32 event_id = its_cmd_get_id(its_cmd);
-	u32 coll_id = its_cmd_get_collection(its_cmd);
-	struct kvm_vcpu *vcpu;
-	struct its_ite *ite;
-	struct its_collection *collection;
-
-	ite = find_ite(its, device_id, event_id);
-	if (!ite)
-		return E_ITS_MOVI_UNMAPPED_INTERRUPT;
-
-	if (!its_is_collection_mapped(ite->collection))
-		return E_ITS_MOVI_UNMAPPED_COLLECTION;
-
-	collection = find_collection(its, coll_id);
-	if (!its_is_collection_mapped(collection))
-		return E_ITS_MOVI_UNMAPPED_COLLECTION;
-
-	ite->collection = collection;
-	vcpu = kvm_get_vcpu(kvm, collection->target_addr);
-
-	return update_affinity(ite->irq, vcpu);
-}
-
-/*
- * Check whether an ID can be stored into the corresponding guest table.
- * For a direct table this is pretty easy, but gets a bit nasty for
- * indirect tables. We check whether the resulting guest physical address
- * is actually valid (covered by a memslot and guest accessible).
- * For this we have to read the respective first level entry.
- */
-static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id,
-			      gpa_t *eaddr)
-{
-	int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
-	u64 indirect_ptr, type = GITS_BASER_TYPE(baser);
-	phys_addr_t base = GITS_BASER_ADDR_48_to_52(baser);
-	int esz = GITS_BASER_ENTRY_SIZE(baser);
-	int index;
-	gfn_t gfn;
-
-	switch (type) {
-	case GITS_BASER_TYPE_DEVICE:
-		if (id >= BIT_ULL(VITS_TYPER_DEVBITS))
-			return false;
-		break;
-	case GITS_BASER_TYPE_COLLECTION:
-		/* as GITS_TYPER.CIL == 0, ITS supports 16-bit collection ID */
-		if (id >= BIT_ULL(16))
-			return false;
-		break;
-	default:
-		return false;
-	}
-
-	if (!(baser & GITS_BASER_INDIRECT)) {
-		phys_addr_t addr;
-
-		if (id >= (l1_tbl_size / esz))
-			return false;
-
-		addr = base + id * esz;
-		gfn = addr >> PAGE_SHIFT;
-
-		if (eaddr)
-			*eaddr = addr;
-		return kvm_is_visible_gfn(its->dev->kvm, gfn);
-	}
-
-	/* calculate and check the index into the 1st level */
-	index = id / (SZ_64K / esz);
-	if (index >= (l1_tbl_size / sizeof(u64)))
-		return false;
-
-	/* Each 1st level entry is represented by a 64-bit value. */
-	if (kvm_read_guest_lock(its->dev->kvm,
-			   base + index * sizeof(indirect_ptr),
-			   &indirect_ptr, sizeof(indirect_ptr)))
-		return false;
-
-	indirect_ptr = le64_to_cpu(indirect_ptr);
-
-	/* check the valid bit of the first level entry */
-	if (!(indirect_ptr & BIT_ULL(63)))
-		return false;
-
-	/* Mask the guest physical address and calculate the frame number. */
-	indirect_ptr &= GENMASK_ULL(51, 16);
-
-	/* Find the address of the actual entry */
-	index = id % (SZ_64K / esz);
-	indirect_ptr += index * esz;
-	gfn = indirect_ptr >> PAGE_SHIFT;
-
-	if (eaddr)
-		*eaddr = indirect_ptr;
-	return kvm_is_visible_gfn(its->dev->kvm, gfn);
-}
-
-static int vgic_its_alloc_collection(struct vgic_its *its,
-				     struct its_collection **colp,
-				     u32 coll_id)
-{
-	struct its_collection *collection;
-
-	if (!vgic_its_check_id(its, its->baser_coll_table, coll_id, NULL))
-		return E_ITS_MAPC_COLLECTION_OOR;
-
-	collection = kzalloc(sizeof(*collection), GFP_KERNEL);
-	if (!collection)
-		return -ENOMEM;
-
-	collection->collection_id = coll_id;
-	collection->target_addr = COLLECTION_NOT_MAPPED;
-
-	list_add_tail(&collection->coll_list, &its->collection_list);
-	*colp = collection;
-
-	return 0;
-}
-
-static void vgic_its_free_collection(struct vgic_its *its, u32 coll_id)
-{
-	struct its_collection *collection;
-	struct its_device *device;
-	struct its_ite *ite;
-
-	/*
-	 * Clearing the mapping for that collection ID removes the
-	 * entry from the list. If there wasn't any before, we can
-	 * go home early.
-	 */
-	collection = find_collection(its, coll_id);
-	if (!collection)
-		return;
-
-	for_each_lpi_its(device, ite, its)
-		if (ite->collection &&
-		    ite->collection->collection_id == coll_id)
-			ite->collection = NULL;
-
-	list_del(&collection->coll_list);
-	kfree(collection);
-}
-
-/* Must be called with its_lock mutex held */
-static struct its_ite *vgic_its_alloc_ite(struct its_device *device,
-					  struct its_collection *collection,
-					  u32 event_id)
-{
-	struct its_ite *ite;
-
-	ite = kzalloc(sizeof(*ite), GFP_KERNEL);
-	if (!ite)
-		return ERR_PTR(-ENOMEM);
-
-	ite->event_id	= event_id;
-	ite->collection = collection;
-
-	list_add_tail(&ite->ite_list, &device->itt_head);
-	return ite;
-}
-
-/*
- * The MAPTI and MAPI commands map LPIs to ITTEs.
- * Must be called with its_lock mutex held.
- */
-static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its,
-				    u64 *its_cmd)
-{
-	u32 device_id = its_cmd_get_deviceid(its_cmd);
-	u32 event_id = its_cmd_get_id(its_cmd);
-	u32 coll_id = its_cmd_get_collection(its_cmd);
-	struct its_ite *ite;
-	struct kvm_vcpu *vcpu = NULL;
-	struct its_device *device;
-	struct its_collection *collection, *new_coll = NULL;
-	struct vgic_irq *irq;
-	int lpi_nr;
-
-	device = find_its_device(its, device_id);
-	if (!device)
-		return E_ITS_MAPTI_UNMAPPED_DEVICE;
-
-	if (event_id >= BIT_ULL(device->num_eventid_bits))
-		return E_ITS_MAPTI_ID_OOR;
-
-	if (its_cmd_get_command(its_cmd) == GITS_CMD_MAPTI)
-		lpi_nr = its_cmd_get_physical_id(its_cmd);
-	else
-		lpi_nr = event_id;
-	if (lpi_nr < GIC_LPI_OFFSET ||
-	    lpi_nr >= max_lpis_propbaser(kvm->arch.vgic.propbaser))
-		return E_ITS_MAPTI_PHYSICALID_OOR;
-
-	/* If there is an existing mapping, behavior is UNPREDICTABLE. */
-	if (find_ite(its, device_id, event_id))
-		return 0;
-
-	collection = find_collection(its, coll_id);
-	if (!collection) {
-		int ret = vgic_its_alloc_collection(its, &collection, coll_id);
-		if (ret)
-			return ret;
-		new_coll = collection;
-	}
-
-	ite = vgic_its_alloc_ite(device, collection, event_id);
-	if (IS_ERR(ite)) {
-		if (new_coll)
-			vgic_its_free_collection(its, coll_id);
-		return PTR_ERR(ite);
-	}
-
-	if (its_is_collection_mapped(collection))
-		vcpu = kvm_get_vcpu(kvm, collection->target_addr);
-
-	irq = vgic_add_lpi(kvm, lpi_nr, vcpu);
-	if (IS_ERR(irq)) {
-		if (new_coll)
-			vgic_its_free_collection(its, coll_id);
-		its_free_ite(kvm, ite);
-		return PTR_ERR(irq);
-	}
-	ite->irq = irq;
-
-	return 0;
-}
-
-/* Requires the its_lock to be held. */
-static void vgic_its_free_device(struct kvm *kvm, struct its_device *device)
-{
-	struct its_ite *ite, *temp;
-
-	/*
-	 * The spec says that unmapping a device with still valid
-	 * ITTEs associated is UNPREDICTABLE. We remove all ITTEs,
-	 * since we cannot leave the memory unreferenced.
-	 */
-	list_for_each_entry_safe(ite, temp, &device->itt_head, ite_list)
-		its_free_ite(kvm, ite);
-
-	list_del(&device->dev_list);
-	kfree(device);
-}
-
-/* its lock must be held */
-static void vgic_its_free_device_list(struct kvm *kvm, struct vgic_its *its)
-{
-	struct its_device *cur, *temp;
-
-	list_for_each_entry_safe(cur, temp, &its->device_list, dev_list)
-		vgic_its_free_device(kvm, cur);
-}
-
-/* its lock must be held */
-static void vgic_its_free_collection_list(struct kvm *kvm, struct vgic_its *its)
-{
-	struct its_collection *cur, *temp;
-
-	list_for_each_entry_safe(cur, temp, &its->collection_list, coll_list)
-		vgic_its_free_collection(its, cur->collection_id);
-}
-
-/* Must be called with its_lock mutex held */
-static struct its_device *vgic_its_alloc_device(struct vgic_its *its,
-						u32 device_id, gpa_t itt_addr,
-						u8 num_eventid_bits)
-{
-	struct its_device *device;
-
-	device = kzalloc(sizeof(*device), GFP_KERNEL);
-	if (!device)
-		return ERR_PTR(-ENOMEM);
-
-	device->device_id = device_id;
-	device->itt_addr = itt_addr;
-	device->num_eventid_bits = num_eventid_bits;
-	INIT_LIST_HEAD(&device->itt_head);
-
-	list_add_tail(&device->dev_list, &its->device_list);
-	return device;
-}
-
-/*
- * MAPD maps or unmaps a device ID to Interrupt Translation Tables (ITTs).
- * Must be called with the its_lock mutex held.
- */
-static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its,
-				    u64 *its_cmd)
-{
-	u32 device_id = its_cmd_get_deviceid(its_cmd);
-	bool valid = its_cmd_get_validbit(its_cmd);
-	u8 num_eventid_bits = its_cmd_get_size(its_cmd);
-	gpa_t itt_addr = its_cmd_get_ittaddr(its_cmd);
-	struct its_device *device;
-
-	if (!vgic_its_check_id(its, its->baser_device_table, device_id, NULL))
-		return E_ITS_MAPD_DEVICE_OOR;
-
-	if (valid && num_eventid_bits > VITS_TYPER_IDBITS)
-		return E_ITS_MAPD_ITTSIZE_OOR;
-
-	device = find_its_device(its, device_id);
-
-	/*
-	 * The spec says that calling MAPD on an already mapped device
-	 * invalidates all cached data for this device. We implement this
-	 * by removing the mapping and re-establishing it.
-	 */
-	if (device)
-		vgic_its_free_device(kvm, device);
-
-	/*
-	 * The spec does not say whether unmapping a not-mapped device
-	 * is an error, so we are done in any case.
-	 */
-	if (!valid)
-		return 0;
-
-	device = vgic_its_alloc_device(its, device_id, itt_addr,
-				       num_eventid_bits);
-
-	return PTR_ERR_OR_ZERO(device);
-}
-
-/*
- * The MAPC command maps collection IDs to redistributors.
- * Must be called with the its_lock mutex held.
- */
-static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its,
-				    u64 *its_cmd)
-{
-	u16 coll_id;
-	u32 target_addr;
-	struct its_collection *collection;
-	bool valid;
-
-	valid = its_cmd_get_validbit(its_cmd);
-	coll_id = its_cmd_get_collection(its_cmd);
-	target_addr = its_cmd_get_target_addr(its_cmd);
-
-	if (target_addr >= atomic_read(&kvm->online_vcpus))
-		return E_ITS_MAPC_PROCNUM_OOR;
-
-	if (!valid) {
-		vgic_its_free_collection(its, coll_id);
-	} else {
-		collection = find_collection(its, coll_id);
-
-		if (!collection) {
-			int ret;
-
-			ret = vgic_its_alloc_collection(its, &collection,
-							coll_id);
-			if (ret)
-				return ret;
-			collection->target_addr = target_addr;
-		} else {
-			collection->target_addr = target_addr;
-			update_affinity_collection(kvm, its, collection);
-		}
-	}
-
-	return 0;
-}
-
-/*
- * The CLEAR command removes the pending state for a particular LPI.
- * Must be called with the its_lock mutex held.
- */
-static int vgic_its_cmd_handle_clear(struct kvm *kvm, struct vgic_its *its,
-				     u64 *its_cmd)
-{
-	u32 device_id = its_cmd_get_deviceid(its_cmd);
-	u32 event_id = its_cmd_get_id(its_cmd);
-	struct its_ite *ite;
-
-
-	ite = find_ite(its, device_id, event_id);
-	if (!ite)
-		return E_ITS_CLEAR_UNMAPPED_INTERRUPT;
-
-	ite->irq->pending_latch = false;
-
-	if (ite->irq->hw)
-		return irq_set_irqchip_state(ite->irq->host_irq,
-					     IRQCHIP_STATE_PENDING, false);
-
-	return 0;
-}
-
-/*
- * The INV command syncs the configuration bits from the memory table.
- * Must be called with the its_lock mutex held.
- */
-static int vgic_its_cmd_handle_inv(struct kvm *kvm, struct vgic_its *its,
-				   u64 *its_cmd)
-{
-	u32 device_id = its_cmd_get_deviceid(its_cmd);
-	u32 event_id = its_cmd_get_id(its_cmd);
-	struct its_ite *ite;
-
-
-	ite = find_ite(its, device_id, event_id);
-	if (!ite)
-		return E_ITS_INV_UNMAPPED_INTERRUPT;
-
-	return update_lpi_config(kvm, ite->irq, NULL, true);
-}
-
-/*
- * The INVALL command requests flushing of all IRQ data in this collection.
- * Find the VCPU mapped to that collection, then iterate over the VM's list
- * of mapped LPIs and update the configuration for each IRQ which targets
- * the specified vcpu. The configuration will be read from the in-memory
- * configuration table.
- * Must be called with the its_lock mutex held.
- */
-static int vgic_its_cmd_handle_invall(struct kvm *kvm, struct vgic_its *its,
-				      u64 *its_cmd)
-{
-	u32 coll_id = its_cmd_get_collection(its_cmd);
-	struct its_collection *collection;
-	struct kvm_vcpu *vcpu;
-	struct vgic_irq *irq;
-	u32 *intids;
-	int irq_count, i;
-
-	collection = find_collection(its, coll_id);
-	if (!its_is_collection_mapped(collection))
-		return E_ITS_INVALL_UNMAPPED_COLLECTION;
-
-	vcpu = kvm_get_vcpu(kvm, collection->target_addr);
-
-	irq_count = vgic_copy_lpi_list(kvm, vcpu, &intids);
-	if (irq_count < 0)
-		return irq_count;
-
-	for (i = 0; i < irq_count; i++) {
-		irq = vgic_get_irq(kvm, NULL, intids[i]);
-		if (!irq)
-			continue;
-		update_lpi_config(kvm, irq, vcpu, false);
-		vgic_put_irq(kvm, irq);
-	}
-
-	kfree(intids);
-
-	if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.its_vm)
-		its_invall_vpe(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe);
-
-	return 0;
-}
-
-/*
- * The MOVALL command moves the pending state of all IRQs targeting one
- * redistributor to another. We don't hold the pending state in the VCPUs,
- * but in the IRQs instead, so there is really not much to do for us here.
- * However the spec says that no IRQ must target the old redistributor
- * afterwards, so we make sure that no LPI is using the associated target_vcpu.
- * This command affects all LPIs in the system that target that redistributor.
- */
-static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its,
-				      u64 *its_cmd)
-{
-	u32 target1_addr = its_cmd_get_target_addr(its_cmd);
-	u32 target2_addr = its_cmd_mask_field(its_cmd, 3, 16, 32);
-	struct kvm_vcpu *vcpu1, *vcpu2;
-	struct vgic_irq *irq;
-	u32 *intids;
-	int irq_count, i;
-
-	if (target1_addr >= atomic_read(&kvm->online_vcpus) ||
-	    target2_addr >= atomic_read(&kvm->online_vcpus))
-		return E_ITS_MOVALL_PROCNUM_OOR;
-
-	if (target1_addr == target2_addr)
-		return 0;
-
-	vcpu1 = kvm_get_vcpu(kvm, target1_addr);
-	vcpu2 = kvm_get_vcpu(kvm, target2_addr);
-
-	irq_count = vgic_copy_lpi_list(kvm, vcpu1, &intids);
-	if (irq_count < 0)
-		return irq_count;
-
-	for (i = 0; i < irq_count; i++) {
-		irq = vgic_get_irq(kvm, NULL, intids[i]);
-
-		update_affinity(irq, vcpu2);
-
-		vgic_put_irq(kvm, irq);
-	}
-
-	kfree(intids);
-	return 0;
-}
-
-/*
- * The INT command injects the LPI associated with that DevID/EvID pair.
- * Must be called with the its_lock mutex held.
- */
-static int vgic_its_cmd_handle_int(struct kvm *kvm, struct vgic_its *its,
-				   u64 *its_cmd)
-{
-	u32 msi_data = its_cmd_get_id(its_cmd);
-	u64 msi_devid = its_cmd_get_deviceid(its_cmd);
-
-	return vgic_its_trigger_msi(kvm, its, msi_devid, msi_data);
-}
-
-/*
- * This function is called with the its_cmd lock held, but the ITS data
- * structure lock dropped.
- */
-static int vgic_its_handle_command(struct kvm *kvm, struct vgic_its *its,
-				   u64 *its_cmd)
-{
-	int ret = -ENODEV;
-
-	mutex_lock(&its->its_lock);
-	switch (its_cmd_get_command(its_cmd)) {
-	case GITS_CMD_MAPD:
-		ret = vgic_its_cmd_handle_mapd(kvm, its, its_cmd);
-		break;
-	case GITS_CMD_MAPC:
-		ret = vgic_its_cmd_handle_mapc(kvm, its, its_cmd);
-		break;
-	case GITS_CMD_MAPI:
-		ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd);
-		break;
-	case GITS_CMD_MAPTI:
-		ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd);
-		break;
-	case GITS_CMD_MOVI:
-		ret = vgic_its_cmd_handle_movi(kvm, its, its_cmd);
-		break;
-	case GITS_CMD_DISCARD:
-		ret = vgic_its_cmd_handle_discard(kvm, its, its_cmd);
-		break;
-	case GITS_CMD_CLEAR:
-		ret = vgic_its_cmd_handle_clear(kvm, its, its_cmd);
-		break;
-	case GITS_CMD_MOVALL:
-		ret = vgic_its_cmd_handle_movall(kvm, its, its_cmd);
-		break;
-	case GITS_CMD_INT:
-		ret = vgic_its_cmd_handle_int(kvm, its, its_cmd);
-		break;
-	case GITS_CMD_INV:
-		ret = vgic_its_cmd_handle_inv(kvm, its, its_cmd);
-		break;
-	case GITS_CMD_INVALL:
-		ret = vgic_its_cmd_handle_invall(kvm, its, its_cmd);
-		break;
-	case GITS_CMD_SYNC:
-		/* we ignore this command: we are in sync all of the time */
-		ret = 0;
-		break;
-	}
-	mutex_unlock(&its->its_lock);
-
-	return ret;
-}
-
-static u64 vgic_sanitise_its_baser(u64 reg)
-{
-	reg = vgic_sanitise_field(reg, GITS_BASER_SHAREABILITY_MASK,
-				  GITS_BASER_SHAREABILITY_SHIFT,
-				  vgic_sanitise_shareability);
-	reg = vgic_sanitise_field(reg, GITS_BASER_INNER_CACHEABILITY_MASK,
-				  GITS_BASER_INNER_CACHEABILITY_SHIFT,
-				  vgic_sanitise_inner_cacheability);
-	reg = vgic_sanitise_field(reg, GITS_BASER_OUTER_CACHEABILITY_MASK,
-				  GITS_BASER_OUTER_CACHEABILITY_SHIFT,
-				  vgic_sanitise_outer_cacheability);
-
-	/* We support only one (ITS) page size: 64K */
-	reg = (reg & ~GITS_BASER_PAGE_SIZE_MASK) | GITS_BASER_PAGE_SIZE_64K;
-
-	return reg;
-}
-
-static u64 vgic_sanitise_its_cbaser(u64 reg)
-{
-	reg = vgic_sanitise_field(reg, GITS_CBASER_SHAREABILITY_MASK,
-				  GITS_CBASER_SHAREABILITY_SHIFT,
-				  vgic_sanitise_shareability);
-	reg = vgic_sanitise_field(reg, GITS_CBASER_INNER_CACHEABILITY_MASK,
-				  GITS_CBASER_INNER_CACHEABILITY_SHIFT,
-				  vgic_sanitise_inner_cacheability);
-	reg = vgic_sanitise_field(reg, GITS_CBASER_OUTER_CACHEABILITY_MASK,
-				  GITS_CBASER_OUTER_CACHEABILITY_SHIFT,
-				  vgic_sanitise_outer_cacheability);
-
-	/* Sanitise the physical address to be 64k aligned. */
-	reg &= ~GENMASK_ULL(15, 12);
-
-	return reg;
-}
-
-static unsigned long vgic_mmio_read_its_cbaser(struct kvm *kvm,
-					       struct vgic_its *its,
-					       gpa_t addr, unsigned int len)
-{
-	return extract_bytes(its->cbaser, addr & 7, len);
-}
-
-static void vgic_mmio_write_its_cbaser(struct kvm *kvm, struct vgic_its *its,
-				       gpa_t addr, unsigned int len,
-				       unsigned long val)
-{
-	/* When GITS_CTLR.Enable is 1, this register is RO. */
-	if (its->enabled)
-		return;
-
-	mutex_lock(&its->cmd_lock);
-	its->cbaser = update_64bit_reg(its->cbaser, addr & 7, len, val);
-	its->cbaser = vgic_sanitise_its_cbaser(its->cbaser);
-	its->creadr = 0;
-	/*
-	 * CWRITER is architecturally UNKNOWN on reset, but we need to reset
-	 * it to CREADR to make sure we start with an empty command buffer.
-	 */
-	its->cwriter = its->creadr;
-	mutex_unlock(&its->cmd_lock);
-}
-
-#define ITS_CMD_BUFFER_SIZE(baser)	((((baser) & 0xff) + 1) << 12)
-#define ITS_CMD_SIZE			32
-#define ITS_CMD_OFFSET(reg)		((reg) & GENMASK(19, 5))
-
-/* Must be called with the cmd_lock held. */
-static void vgic_its_process_commands(struct kvm *kvm, struct vgic_its *its)
-{
-	gpa_t cbaser;
-	u64 cmd_buf[4];
-
-	/* Commands are only processed when the ITS is enabled. */
-	if (!its->enabled)
-		return;
-
-	cbaser = GITS_CBASER_ADDRESS(its->cbaser);
-
-	while (its->cwriter != its->creadr) {
-		int ret = kvm_read_guest_lock(kvm, cbaser + its->creadr,
-					      cmd_buf, ITS_CMD_SIZE);
-		/*
-		 * If kvm_read_guest() fails, this could be due to the guest
-		 * programming a bogus value in CBASER or something else going
-		 * wrong from which we cannot easily recover.
-		 * According to section 6.3.2 in the GICv3 spec we can just
-		 * ignore that command then.
-		 */
-		if (!ret)
-			vgic_its_handle_command(kvm, its, cmd_buf);
-
-		its->creadr += ITS_CMD_SIZE;
-		if (its->creadr == ITS_CMD_BUFFER_SIZE(its->cbaser))
-			its->creadr = 0;
-	}
-}
-
-/*
- * By writing to CWRITER the guest announces new commands to be processed.
- * To avoid any races in the first place, we take the its_cmd lock, which
- * protects our ring buffer variables, so that there is only one user
- * per ITS handling commands at a given time.
- */
-static void vgic_mmio_write_its_cwriter(struct kvm *kvm, struct vgic_its *its,
-					gpa_t addr, unsigned int len,
-					unsigned long val)
-{
-	u64 reg;
-
-	if (!its)
-		return;
-
-	mutex_lock(&its->cmd_lock);
-
-	reg = update_64bit_reg(its->cwriter, addr & 7, len, val);
-	reg = ITS_CMD_OFFSET(reg);
-	if (reg >= ITS_CMD_BUFFER_SIZE(its->cbaser)) {
-		mutex_unlock(&its->cmd_lock);
-		return;
-	}
-	its->cwriter = reg;
-
-	vgic_its_process_commands(kvm, its);
-
-	mutex_unlock(&its->cmd_lock);
-}
-
-static unsigned long vgic_mmio_read_its_cwriter(struct kvm *kvm,
-						struct vgic_its *its,
-						gpa_t addr, unsigned int len)
-{
-	return extract_bytes(its->cwriter, addr & 0x7, len);
-}
-
-static unsigned long vgic_mmio_read_its_creadr(struct kvm *kvm,
-					       struct vgic_its *its,
-					       gpa_t addr, unsigned int len)
-{
-	return extract_bytes(its->creadr, addr & 0x7, len);
-}
-
-static int vgic_mmio_uaccess_write_its_creadr(struct kvm *kvm,
-					      struct vgic_its *its,
-					      gpa_t addr, unsigned int len,
-					      unsigned long val)
-{
-	u32 cmd_offset;
-	int ret = 0;
-
-	mutex_lock(&its->cmd_lock);
-
-	if (its->enabled) {
-		ret = -EBUSY;
-		goto out;
-	}
-
-	cmd_offset = ITS_CMD_OFFSET(val);
-	if (cmd_offset >= ITS_CMD_BUFFER_SIZE(its->cbaser)) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	its->creadr = cmd_offset;
-out:
-	mutex_unlock(&its->cmd_lock);
-	return ret;
-}
-
-#define BASER_INDEX(addr) (((addr) / sizeof(u64)) & 0x7)
-static unsigned long vgic_mmio_read_its_baser(struct kvm *kvm,
-					      struct vgic_its *its,
-					      gpa_t addr, unsigned int len)
-{
-	u64 reg;
-
-	switch (BASER_INDEX(addr)) {
-	case 0:
-		reg = its->baser_device_table;
-		break;
-	case 1:
-		reg = its->baser_coll_table;
-		break;
-	default:
-		reg = 0;
-		break;
-	}
-
-	return extract_bytes(reg, addr & 7, len);
-}
-
-#define GITS_BASER_RO_MASK	(GENMASK_ULL(52, 48) | GENMASK_ULL(58, 56))
-static void vgic_mmio_write_its_baser(struct kvm *kvm,
-				      struct vgic_its *its,
-				      gpa_t addr, unsigned int len,
-				      unsigned long val)
-{
-	const struct vgic_its_abi *abi = vgic_its_get_abi(its);
-	u64 entry_size, table_type;
-	u64 reg, *regptr, clearbits = 0;
-
-	/* When GITS_CTLR.Enable is 1, we ignore write accesses. */
-	if (its->enabled)
-		return;
-
-	switch (BASER_INDEX(addr)) {
-	case 0:
-		regptr = &its->baser_device_table;
-		entry_size = abi->dte_esz;
-		table_type = GITS_BASER_TYPE_DEVICE;
-		break;
-	case 1:
-		regptr = &its->baser_coll_table;
-		entry_size = abi->cte_esz;
-		table_type = GITS_BASER_TYPE_COLLECTION;
-		clearbits = GITS_BASER_INDIRECT;
-		break;
-	default:
-		return;
-	}
-
-	reg = update_64bit_reg(*regptr, addr & 7, len, val);
-	reg &= ~GITS_BASER_RO_MASK;
-	reg &= ~clearbits;
-
-	reg |= (entry_size - 1) << GITS_BASER_ENTRY_SIZE_SHIFT;
-	reg |= table_type << GITS_BASER_TYPE_SHIFT;
-	reg = vgic_sanitise_its_baser(reg);
-
-	*regptr = reg;
-
-	if (!(reg & GITS_BASER_VALID)) {
-		/* Take the its_lock to prevent a race with a save/restore */
-		mutex_lock(&its->its_lock);
-		switch (table_type) {
-		case GITS_BASER_TYPE_DEVICE:
-			vgic_its_free_device_list(kvm, its);
-			break;
-		case GITS_BASER_TYPE_COLLECTION:
-			vgic_its_free_collection_list(kvm, its);
-			break;
-		}
-		mutex_unlock(&its->its_lock);
-	}
-}
-
-static unsigned long vgic_mmio_read_its_ctlr(struct kvm *vcpu,
-					     struct vgic_its *its,
-					     gpa_t addr, unsigned int len)
-{
-	u32 reg = 0;
-
-	mutex_lock(&its->cmd_lock);
-	if (its->creadr == its->cwriter)
-		reg |= GITS_CTLR_QUIESCENT;
-	if (its->enabled)
-		reg |= GITS_CTLR_ENABLE;
-	mutex_unlock(&its->cmd_lock);
-
-	return reg;
-}
-
-static void vgic_mmio_write_its_ctlr(struct kvm *kvm, struct vgic_its *its,
-				     gpa_t addr, unsigned int len,
-				     unsigned long val)
-{
-	mutex_lock(&its->cmd_lock);
-
-	/*
-	 * It is UNPREDICTABLE to enable the ITS if any of the CBASER or
-	 * device/collection BASER are invalid
-	 */
-	if (!its->enabled && (val & GITS_CTLR_ENABLE) &&
-		(!(its->baser_device_table & GITS_BASER_VALID) ||
-		 !(its->baser_coll_table & GITS_BASER_VALID) ||
-		 !(its->cbaser & GITS_CBASER_VALID)))
-		goto out;
-
-	its->enabled = !!(val & GITS_CTLR_ENABLE);
-
-	/*
-	 * Try to process any pending commands. This function bails out early
-	 * if the ITS is disabled or no commands have been queued.
-	 */
-	vgic_its_process_commands(kvm, its);
-
-out:
-	mutex_unlock(&its->cmd_lock);
-}
-
-#define REGISTER_ITS_DESC(off, rd, wr, length, acc)		\
-{								\
-	.reg_offset = off,					\
-	.len = length,						\
-	.access_flags = acc,					\
-	.its_read = rd,						\
-	.its_write = wr,					\
-}
-
-#define REGISTER_ITS_DESC_UACCESS(off, rd, wr, uwr, length, acc)\
-{								\
-	.reg_offset = off,					\
-	.len = length,						\
-	.access_flags = acc,					\
-	.its_read = rd,						\
-	.its_write = wr,					\
-	.uaccess_its_write = uwr,				\
-}
-
-static void its_mmio_write_wi(struct kvm *kvm, struct vgic_its *its,
-			      gpa_t addr, unsigned int len, unsigned long val)
-{
-	/* Ignore */
-}
-
-static struct vgic_register_region its_registers[] = {
-	REGISTER_ITS_DESC(GITS_CTLR,
-		vgic_mmio_read_its_ctlr, vgic_mmio_write_its_ctlr, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_ITS_DESC_UACCESS(GITS_IIDR,
-		vgic_mmio_read_its_iidr, its_mmio_write_wi,
-		vgic_mmio_uaccess_write_its_iidr, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_ITS_DESC(GITS_TYPER,
-		vgic_mmio_read_its_typer, its_mmio_write_wi, 8,
-		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
-	REGISTER_ITS_DESC(GITS_CBASER,
-		vgic_mmio_read_its_cbaser, vgic_mmio_write_its_cbaser, 8,
-		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
-	REGISTER_ITS_DESC(GITS_CWRITER,
-		vgic_mmio_read_its_cwriter, vgic_mmio_write_its_cwriter, 8,
-		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
-	REGISTER_ITS_DESC_UACCESS(GITS_CREADR,
-		vgic_mmio_read_its_creadr, its_mmio_write_wi,
-		vgic_mmio_uaccess_write_its_creadr, 8,
-		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
-	REGISTER_ITS_DESC(GITS_BASER,
-		vgic_mmio_read_its_baser, vgic_mmio_write_its_baser, 0x40,
-		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
-	REGISTER_ITS_DESC(GITS_IDREGS_BASE,
-		vgic_mmio_read_its_idregs, its_mmio_write_wi, 0x30,
-		VGIC_ACCESS_32bit),
-};
-
-/* This is called on setting the LPI enable bit in the redistributor. */
-void vgic_enable_lpis(struct kvm_vcpu *vcpu)
-{
-	if (!(vcpu->arch.vgic_cpu.pendbaser & GICR_PENDBASER_PTZ))
-		its_sync_lpi_pending_table(vcpu);
-}
-
-static int vgic_register_its_iodev(struct kvm *kvm, struct vgic_its *its,
-				   u64 addr)
-{
-	struct vgic_io_device *iodev = &its->iodev;
-	int ret;
-
-	mutex_lock(&kvm->slots_lock);
-	if (!IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) {
-		ret = -EBUSY;
-		goto out;
-	}
-
-	its->vgic_its_base = addr;
-	iodev->regions = its_registers;
-	iodev->nr_regions = ARRAY_SIZE(its_registers);
-	kvm_iodevice_init(&iodev->dev, &kvm_io_gic_ops);
-
-	iodev->base_addr = its->vgic_its_base;
-	iodev->iodev_type = IODEV_ITS;
-	iodev->its = its;
-	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, iodev->base_addr,
-				      KVM_VGIC_V3_ITS_SIZE, &iodev->dev);
-out:
-	mutex_unlock(&kvm->slots_lock);
-
-	return ret;
-}
-
-#define INITIAL_BASER_VALUE						  \
-	(GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb)		| \
-	 GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, SameAsInner)		| \
-	 GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable)		| \
-	 GITS_BASER_PAGE_SIZE_64K)
-
-#define INITIAL_PROPBASER_VALUE						  \
-	(GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWb)		| \
-	 GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, SameAsInner)	| \
-	 GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable))
-
-static int vgic_its_create(struct kvm_device *dev, u32 type)
-{
-	struct vgic_its *its;
-
-	if (type != KVM_DEV_TYPE_ARM_VGIC_ITS)
-		return -ENODEV;
-
-	its = kzalloc(sizeof(struct vgic_its), GFP_KERNEL);
-	if (!its)
-		return -ENOMEM;
-
-	if (vgic_initialized(dev->kvm)) {
-		int ret = vgic_v4_init(dev->kvm);
-		if (ret < 0) {
-			kfree(its);
-			return ret;
-		}
-	}
-
-	mutex_init(&its->its_lock);
-	mutex_init(&its->cmd_lock);
-
-	its->vgic_its_base = VGIC_ADDR_UNDEF;
-
-	INIT_LIST_HEAD(&its->device_list);
-	INIT_LIST_HEAD(&its->collection_list);
-
-	dev->kvm->arch.vgic.msis_require_devid = true;
-	dev->kvm->arch.vgic.has_its = true;
-	its->enabled = false;
-	its->dev = dev;
-
-	its->baser_device_table = INITIAL_BASER_VALUE			|
-		((u64)GITS_BASER_TYPE_DEVICE << GITS_BASER_TYPE_SHIFT);
-	its->baser_coll_table = INITIAL_BASER_VALUE |
-		((u64)GITS_BASER_TYPE_COLLECTION << GITS_BASER_TYPE_SHIFT);
-	dev->kvm->arch.vgic.propbaser = INITIAL_PROPBASER_VALUE;
-
-	dev->private = its;
-
-	return vgic_its_set_abi(its, NR_ITS_ABIS - 1);
-}
-
-static void vgic_its_destroy(struct kvm_device *kvm_dev)
-{
-	struct kvm *kvm = kvm_dev->kvm;
-	struct vgic_its *its = kvm_dev->private;
-
-	mutex_lock(&its->its_lock);
-
-	vgic_its_free_device_list(kvm, its);
-	vgic_its_free_collection_list(kvm, its);
-
-	mutex_unlock(&its->its_lock);
-	kfree(its);
-}
-
-int vgic_its_has_attr_regs(struct kvm_device *dev,
-			   struct kvm_device_attr *attr)
-{
-	const struct vgic_register_region *region;
-	gpa_t offset = attr->attr;
-	int align;
-
-	align = (offset < GITS_TYPER) || (offset >= GITS_PIDR4) ? 0x3 : 0x7;
-
-	if (offset & align)
-		return -EINVAL;
-
-	region = vgic_find_mmio_region(its_registers,
-				       ARRAY_SIZE(its_registers),
-				       offset);
-	if (!region)
-		return -ENXIO;
-
-	return 0;
-}
-
-int vgic_its_attr_regs_access(struct kvm_device *dev,
-			      struct kvm_device_attr *attr,
-			      u64 *reg, bool is_write)
-{
-	const struct vgic_register_region *region;
-	struct vgic_its *its;
-	gpa_t addr, offset;
-	unsigned int len;
-	int align, ret = 0;
-
-	its = dev->private;
-	offset = attr->attr;
-
-	/*
-	 * Although the spec supports upper/lower 32-bit accesses to
-	 * 64-bit ITS registers, the userspace ABI requires 64-bit
-	 * accesses to all 64-bit wide registers. We therefore only
-	 * support 32-bit accesses to GITS_CTLR, GITS_IIDR and GITS ID
-	 * registers
-	 */
-	if ((offset < GITS_TYPER) || (offset >= GITS_PIDR4))
-		align = 0x3;
-	else
-		align = 0x7;
-
-	if (offset & align)
-		return -EINVAL;
-
-	mutex_lock(&dev->kvm->lock);
-
-	if (IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) {
-		ret = -ENXIO;
-		goto out;
-	}
-
-	region = vgic_find_mmio_region(its_registers,
-				       ARRAY_SIZE(its_registers),
-				       offset);
-	if (!region) {
-		ret = -ENXIO;
-		goto out;
-	}
-
-	if (!lock_all_vcpus(dev->kvm)) {
-		ret = -EBUSY;
-		goto out;
-	}
-
-	addr = its->vgic_its_base + offset;
-
-	len = region->access_flags & VGIC_ACCESS_64bit ? 8 : 4;
-
-	if (is_write) {
-		if (region->uaccess_its_write)
-			ret = region->uaccess_its_write(dev->kvm, its, addr,
-							len, *reg);
-		else
-			region->its_write(dev->kvm, its, addr, len, *reg);
-	} else {
-		*reg = region->its_read(dev->kvm, its, addr, len);
-	}
-	unlock_all_vcpus(dev->kvm);
-out:
-	mutex_unlock(&dev->kvm->lock);
-	return ret;
-}
-
-static u32 compute_next_devid_offset(struct list_head *h,
-				     struct its_device *dev)
-{
-	struct its_device *next;
-	u32 next_offset;
-
-	if (list_is_last(&dev->dev_list, h))
-		return 0;
-	next = list_next_entry(dev, dev_list);
-	next_offset = next->device_id - dev->device_id;
-
-	return min_t(u32, next_offset, VITS_DTE_MAX_DEVID_OFFSET);
-}
-
-static u32 compute_next_eventid_offset(struct list_head *h, struct its_ite *ite)
-{
-	struct its_ite *next;
-	u32 next_offset;
-
-	if (list_is_last(&ite->ite_list, h))
-		return 0;
-	next = list_next_entry(ite, ite_list);
-	next_offset = next->event_id - ite->event_id;
-
-	return min_t(u32, next_offset, VITS_ITE_MAX_EVENTID_OFFSET);
-}
-
-/**
- * entry_fn_t - Callback called on a table entry restore path
- * @its: its handle
- * @id: id of the entry
- * @entry: pointer to the entry
- * @opaque: pointer to an opaque data
- *
- * Return: < 0 on error, 0 if last element was identified, id offset to next
- * element otherwise
- */
-typedef int (*entry_fn_t)(struct vgic_its *its, u32 id, void *entry,
-			  void *opaque);
-
-/**
- * scan_its_table - Scan a contiguous table in guest RAM and applies a function
- * to each entry
- *
- * @its: its handle
- * @base: base gpa of the table
- * @size: size of the table in bytes
- * @esz: entry size in bytes
- * @start_id: the ID of the first entry in the table
- * (non zero for 2d level tables)
- * @fn: function to apply on each entry
- *
- * Return: < 0 on error, 0 if last element was identified, 1 otherwise
- * (the last element may not be found on second level tables)
- */
-static int scan_its_table(struct vgic_its *its, gpa_t base, int size, u32 esz,
-			  int start_id, entry_fn_t fn, void *opaque)
-{
-	struct kvm *kvm = its->dev->kvm;
-	unsigned long len = size;
-	int id = start_id;
-	gpa_t gpa = base;
-	char entry[ESZ_MAX];
-	int ret;
-
-	memset(entry, 0, esz);
-
-	while (len > 0) {
-		int next_offset;
-		size_t byte_offset;
-
-		ret = kvm_read_guest_lock(kvm, gpa, entry, esz);
-		if (ret)
-			return ret;
-
-		next_offset = fn(its, id, entry, opaque);
-		if (next_offset <= 0)
-			return next_offset;
-
-		byte_offset = next_offset * esz;
-		id += next_offset;
-		gpa += byte_offset;
-		len -= byte_offset;
-	}
-	return 1;
-}
-
-/**
- * vgic_its_save_ite - Save an interrupt translation entry at @gpa
- */
-static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev,
-			      struct its_ite *ite, gpa_t gpa, int ite_esz)
-{
-	struct kvm *kvm = its->dev->kvm;
-	u32 next_offset;
-	u64 val;
-
-	next_offset = compute_next_eventid_offset(&dev->itt_head, ite);
-	val = ((u64)next_offset << KVM_ITS_ITE_NEXT_SHIFT) |
-	       ((u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT) |
-		ite->collection->collection_id;
-	val = cpu_to_le64(val);
-	return kvm_write_guest(kvm, gpa, &val, ite_esz);
-}
-
-/**
- * vgic_its_restore_ite - restore an interrupt translation entry
- * @event_id: id used for indexing
- * @ptr: pointer to the ITE entry
- * @opaque: pointer to the its_device
- */
-static int vgic_its_restore_ite(struct vgic_its *its, u32 event_id,
-				void *ptr, void *opaque)
-{
-	struct its_device *dev = (struct its_device *)opaque;
-	struct its_collection *collection;
-	struct kvm *kvm = its->dev->kvm;
-	struct kvm_vcpu *vcpu = NULL;
-	u64 val;
-	u64 *p = (u64 *)ptr;
-	struct vgic_irq *irq;
-	u32 coll_id, lpi_id;
-	struct its_ite *ite;
-	u32 offset;
-
-	val = *p;
-
-	val = le64_to_cpu(val);
-
-	coll_id = val & KVM_ITS_ITE_ICID_MASK;
-	lpi_id = (val & KVM_ITS_ITE_PINTID_MASK) >> KVM_ITS_ITE_PINTID_SHIFT;
-
-	if (!lpi_id)
-		return 1; /* invalid entry, no choice but to scan next entry */
-
-	if (lpi_id < VGIC_MIN_LPI)
-		return -EINVAL;
-
-	offset = val >> KVM_ITS_ITE_NEXT_SHIFT;
-	if (event_id + offset >= BIT_ULL(dev->num_eventid_bits))
-		return -EINVAL;
-
-	collection = find_collection(its, coll_id);
-	if (!collection)
-		return -EINVAL;
-
-	ite = vgic_its_alloc_ite(dev, collection, event_id);
-	if (IS_ERR(ite))
-		return PTR_ERR(ite);
-
-	if (its_is_collection_mapped(collection))
-		vcpu = kvm_get_vcpu(kvm, collection->target_addr);
-
-	irq = vgic_add_lpi(kvm, lpi_id, vcpu);
-	if (IS_ERR(irq))
-		return PTR_ERR(irq);
-	ite->irq = irq;
-
-	return offset;
-}
-
-static int vgic_its_ite_cmp(void *priv, struct list_head *a,
-			    struct list_head *b)
-{
-	struct its_ite *itea = container_of(a, struct its_ite, ite_list);
-	struct its_ite *iteb = container_of(b, struct its_ite, ite_list);
-
-	if (itea->event_id < iteb->event_id)
-		return -1;
-	else
-		return 1;
-}
-
-static int vgic_its_save_itt(struct vgic_its *its, struct its_device *device)
-{
-	const struct vgic_its_abi *abi = vgic_its_get_abi(its);
-	gpa_t base = device->itt_addr;
-	struct its_ite *ite;
-	int ret;
-	int ite_esz = abi->ite_esz;
-
-	list_sort(NULL, &device->itt_head, vgic_its_ite_cmp);
-
-	list_for_each_entry(ite, &device->itt_head, ite_list) {
-		gpa_t gpa = base + ite->event_id * ite_esz;
-
-		/*
-		 * If an LPI carries the HW bit, this means that this
-		 * interrupt is controlled by GICv4, and we do not
-		 * have direct access to that state. Let's simply fail
-		 * the save operation...
-		 */
-		if (ite->irq->hw)
-			return -EACCES;
-
-		ret = vgic_its_save_ite(its, device, ite, gpa, ite_esz);
-		if (ret)
-			return ret;
-	}
-	return 0;
-}
-
-/**
- * vgic_its_restore_itt - restore the ITT of a device
- *
- * @its: its handle
- * @dev: device handle
- *
- * Return 0 on success, < 0 on error
- */
-static int vgic_its_restore_itt(struct vgic_its *its, struct its_device *dev)
-{
-	const struct vgic_its_abi *abi = vgic_its_get_abi(its);
-	gpa_t base = dev->itt_addr;
-	int ret;
-	int ite_esz = abi->ite_esz;
-	size_t max_size = BIT_ULL(dev->num_eventid_bits) * ite_esz;
-
-	ret = scan_its_table(its, base, max_size, ite_esz, 0,
-			     vgic_its_restore_ite, dev);
-
-	/* scan_its_table returns +1 if all ITEs are invalid */
-	if (ret > 0)
-		ret = 0;
-
-	return ret;
-}
-
-/**
- * vgic_its_save_dte - Save a device table entry at a given GPA
- *
- * @its: ITS handle
- * @dev: ITS device
- * @ptr: GPA
- */
-static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev,
-			     gpa_t ptr, int dte_esz)
-{
-	struct kvm *kvm = its->dev->kvm;
-	u64 val, itt_addr_field;
-	u32 next_offset;
-
-	itt_addr_field = dev->itt_addr >> 8;
-	next_offset = compute_next_devid_offset(&its->device_list, dev);
-	val = (1ULL << KVM_ITS_DTE_VALID_SHIFT |
-	       ((u64)next_offset << KVM_ITS_DTE_NEXT_SHIFT) |
-	       (itt_addr_field << KVM_ITS_DTE_ITTADDR_SHIFT) |
-		(dev->num_eventid_bits - 1));
-	val = cpu_to_le64(val);
-	return kvm_write_guest(kvm, ptr, &val, dte_esz);
-}
-
-/**
- * vgic_its_restore_dte - restore a device table entry
- *
- * @its: its handle
- * @id: device id the DTE corresponds to
- * @ptr: kernel VA where the 8 byte DTE is located
- * @opaque: unused
- *
- * Return: < 0 on error, 0 if the dte is the last one, id offset to the
- * next dte otherwise
- */
-static int vgic_its_restore_dte(struct vgic_its *its, u32 id,
-				void *ptr, void *opaque)
-{
-	struct its_device *dev;
-	gpa_t itt_addr;
-	u8 num_eventid_bits;
-	u64 entry = *(u64 *)ptr;
-	bool valid;
-	u32 offset;
-	int ret;
-
-	entry = le64_to_cpu(entry);
-
-	valid = entry >> KVM_ITS_DTE_VALID_SHIFT;
-	num_eventid_bits = (entry & KVM_ITS_DTE_SIZE_MASK) + 1;
-	itt_addr = ((entry & KVM_ITS_DTE_ITTADDR_MASK)
-			>> KVM_ITS_DTE_ITTADDR_SHIFT) << 8;
-
-	if (!valid)
-		return 1;
-
-	/* dte entry is valid */
-	offset = (entry & KVM_ITS_DTE_NEXT_MASK) >> KVM_ITS_DTE_NEXT_SHIFT;
-
-	dev = vgic_its_alloc_device(its, id, itt_addr, num_eventid_bits);
-	if (IS_ERR(dev))
-		return PTR_ERR(dev);
-
-	ret = vgic_its_restore_itt(its, dev);
-	if (ret) {
-		vgic_its_free_device(its->dev->kvm, dev);
-		return ret;
-	}
-
-	return offset;
-}
-
-static int vgic_its_device_cmp(void *priv, struct list_head *a,
-			       struct list_head *b)
-{
-	struct its_device *deva = container_of(a, struct its_device, dev_list);
-	struct its_device *devb = container_of(b, struct its_device, dev_list);
-
-	if (deva->device_id < devb->device_id)
-		return -1;
-	else
-		return 1;
-}
-
-/**
- * vgic_its_save_device_tables - Save the device table and all ITT
- * into guest RAM
- *
- * L1/L2 handling is hidden by vgic_its_check_id() helper which directly
- * returns the GPA of the device entry
- */
-static int vgic_its_save_device_tables(struct vgic_its *its)
-{
-	const struct vgic_its_abi *abi = vgic_its_get_abi(its);
-	u64 baser = its->baser_device_table;
-	struct its_device *dev;
-	int dte_esz = abi->dte_esz;
-
-	if (!(baser & GITS_BASER_VALID))
-		return 0;
-
-	list_sort(NULL, &its->device_list, vgic_its_device_cmp);
-
-	list_for_each_entry(dev, &its->device_list, dev_list) {
-		int ret;
-		gpa_t eaddr;
-
-		if (!vgic_its_check_id(its, baser,
-				       dev->device_id, &eaddr))
-			return -EINVAL;
-
-		ret = vgic_its_save_itt(its, dev);
-		if (ret)
-			return ret;
-
-		ret = vgic_its_save_dte(its, dev, eaddr, dte_esz);
-		if (ret)
-			return ret;
-	}
-	return 0;
-}
-
-/**
- * handle_l1_dte - callback used for L1 device table entries (2 stage case)
- *
- * @its: its handle
- * @id: index of the entry in the L1 table
- * @addr: kernel VA
- * @opaque: unused
- *
- * L1 table entries are scanned by steps of 1 entry
- * Return < 0 if error, 0 if last dte was found when scanning the L2
- * table, +1 otherwise (meaning next L1 entry must be scanned)
- */
-static int handle_l1_dte(struct vgic_its *its, u32 id, void *addr,
-			 void *opaque)
-{
-	const struct vgic_its_abi *abi = vgic_its_get_abi(its);
-	int l2_start_id = id * (SZ_64K / abi->dte_esz);
-	u64 entry = *(u64 *)addr;
-	int dte_esz = abi->dte_esz;
-	gpa_t gpa;
-	int ret;
-
-	entry = le64_to_cpu(entry);
-
-	if (!(entry & KVM_ITS_L1E_VALID_MASK))
-		return 1;
-
-	gpa = entry & KVM_ITS_L1E_ADDR_MASK;
-
-	ret = scan_its_table(its, gpa, SZ_64K, dte_esz,
-			     l2_start_id, vgic_its_restore_dte, NULL);
-
-	return ret;
-}
-
-/**
- * vgic_its_restore_device_tables - Restore the device table and all ITT
- * from guest RAM to internal data structs
- */
-static int vgic_its_restore_device_tables(struct vgic_its *its)
-{
-	const struct vgic_its_abi *abi = vgic_its_get_abi(its);
-	u64 baser = its->baser_device_table;
-	int l1_esz, ret;
-	int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
-	gpa_t l1_gpa;
-
-	if (!(baser & GITS_BASER_VALID))
-		return 0;
-
-	l1_gpa = GITS_BASER_ADDR_48_to_52(baser);
-
-	if (baser & GITS_BASER_INDIRECT) {
-		l1_esz = GITS_LVL1_ENTRY_SIZE;
-		ret = scan_its_table(its, l1_gpa, l1_tbl_size, l1_esz, 0,
-				     handle_l1_dte, NULL);
-	} else {
-		l1_esz = abi->dte_esz;
-		ret = scan_its_table(its, l1_gpa, l1_tbl_size, l1_esz, 0,
-				     vgic_its_restore_dte, NULL);
-	}
-
-	/* scan_its_table returns +1 if all entries are invalid */
-	if (ret > 0)
-		ret = 0;
-
-	return ret;
-}
-
-static int vgic_its_save_cte(struct vgic_its *its,
-			     struct its_collection *collection,
-			     gpa_t gpa, int esz)
-{
-	u64 val;
-
-	val = (1ULL << KVM_ITS_CTE_VALID_SHIFT |
-	       ((u64)collection->target_addr << KVM_ITS_CTE_RDBASE_SHIFT) |
-	       collection->collection_id);
-	val = cpu_to_le64(val);
-	return kvm_write_guest(its->dev->kvm, gpa, &val, esz);
-}
-
-static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz)
-{
-	struct its_collection *collection;
-	struct kvm *kvm = its->dev->kvm;
-	u32 target_addr, coll_id;
-	u64 val;
-	int ret;
-
-	BUG_ON(esz > sizeof(val));
-	ret = kvm_read_guest_lock(kvm, gpa, &val, esz);
-	if (ret)
-		return ret;
-	val = le64_to_cpu(val);
-	if (!(val & KVM_ITS_CTE_VALID_MASK))
-		return 0;
-
-	target_addr = (u32)(val >> KVM_ITS_CTE_RDBASE_SHIFT);
-	coll_id = val & KVM_ITS_CTE_ICID_MASK;
-
-	if (target_addr >= atomic_read(&kvm->online_vcpus))
-		return -EINVAL;
-
-	collection = find_collection(its, coll_id);
-	if (collection)
-		return -EEXIST;
-	ret = vgic_its_alloc_collection(its, &collection, coll_id);
-	if (ret)
-		return ret;
-	collection->target_addr = target_addr;
-	return 1;
-}
-
-/**
- * vgic_its_save_collection_table - Save the collection table into
- * guest RAM
- */
-static int vgic_its_save_collection_table(struct vgic_its *its)
-{
-	const struct vgic_its_abi *abi = vgic_its_get_abi(its);
-	u64 baser = its->baser_coll_table;
-	gpa_t gpa = GITS_BASER_ADDR_48_to_52(baser);
-	struct its_collection *collection;
-	u64 val;
-	size_t max_size, filled = 0;
-	int ret, cte_esz = abi->cte_esz;
-
-	if (!(baser & GITS_BASER_VALID))
-		return 0;
-
-	max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
-
-	list_for_each_entry(collection, &its->collection_list, coll_list) {
-		ret = vgic_its_save_cte(its, collection, gpa, cte_esz);
-		if (ret)
-			return ret;
-		gpa += cte_esz;
-		filled += cte_esz;
-	}
-
-	if (filled == max_size)
-		return 0;
-
-	/*
-	 * table is not fully filled, add a last dummy element
-	 * with valid bit unset
-	 */
-	val = 0;
-	BUG_ON(cte_esz > sizeof(val));
-	ret = kvm_write_guest(its->dev->kvm, gpa, &val, cte_esz);
-	return ret;
-}
-
-/**
- * vgic_its_restore_collection_table - reads the collection table
- * in guest memory and restores the ITS internal state. Requires the
- * BASER registers to be restored before.
- */
-static int vgic_its_restore_collection_table(struct vgic_its *its)
-{
-	const struct vgic_its_abi *abi = vgic_its_get_abi(its);
-	u64 baser = its->baser_coll_table;
-	int cte_esz = abi->cte_esz;
-	size_t max_size, read = 0;
-	gpa_t gpa;
-	int ret;
-
-	if (!(baser & GITS_BASER_VALID))
-		return 0;
-
-	gpa = GITS_BASER_ADDR_48_to_52(baser);
-
-	max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
-
-	while (read < max_size) {
-		ret = vgic_its_restore_cte(its, gpa, cte_esz);
-		if (ret <= 0)
-			break;
-		gpa += cte_esz;
-		read += cte_esz;
-	}
-
-	if (ret > 0)
-		return 0;
-
-	return ret;
-}
-
-/**
- * vgic_its_save_tables_v0 - Save the ITS tables into guest ARM
- * according to v0 ABI
- */
-static int vgic_its_save_tables_v0(struct vgic_its *its)
-{
-	int ret;
-
-	ret = vgic_its_save_device_tables(its);
-	if (ret)
-		return ret;
-
-	return vgic_its_save_collection_table(its);
-}
-
-/**
- * vgic_its_restore_tables_v0 - Restore the ITS tables from guest RAM
- * to internal data structs according to V0 ABI
- *
- */
-static int vgic_its_restore_tables_v0(struct vgic_its *its)
-{
-	int ret;
-
-	ret = vgic_its_restore_collection_table(its);
-	if (ret)
-		return ret;
-
-	return vgic_its_restore_device_tables(its);
-}
-
-static int vgic_its_commit_v0(struct vgic_its *its)
-{
-	const struct vgic_its_abi *abi;
-
-	abi = vgic_its_get_abi(its);
-	its->baser_coll_table &= ~GITS_BASER_ENTRY_SIZE_MASK;
-	its->baser_device_table &= ~GITS_BASER_ENTRY_SIZE_MASK;
-
-	its->baser_coll_table |= (GIC_ENCODE_SZ(abi->cte_esz, 5)
-					<< GITS_BASER_ENTRY_SIZE_SHIFT);
-
-	its->baser_device_table |= (GIC_ENCODE_SZ(abi->dte_esz, 5)
-					<< GITS_BASER_ENTRY_SIZE_SHIFT);
-	return 0;
-}
-
-static void vgic_its_reset(struct kvm *kvm, struct vgic_its *its)
-{
-	/* We need to keep the ABI specific field values */
-	its->baser_coll_table &= ~GITS_BASER_VALID;
-	its->baser_device_table &= ~GITS_BASER_VALID;
-	its->cbaser = 0;
-	its->creadr = 0;
-	its->cwriter = 0;
-	its->enabled = 0;
-	vgic_its_free_device_list(kvm, its);
-	vgic_its_free_collection_list(kvm, its);
-}
-
-static int vgic_its_has_attr(struct kvm_device *dev,
-			     struct kvm_device_attr *attr)
-{
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_ADDR:
-		switch (attr->attr) {
-		case KVM_VGIC_ITS_ADDR_TYPE:
-			return 0;
-		}
-		break;
-	case KVM_DEV_ARM_VGIC_GRP_CTRL:
-		switch (attr->attr) {
-		case KVM_DEV_ARM_VGIC_CTRL_INIT:
-			return 0;
-		case KVM_DEV_ARM_ITS_CTRL_RESET:
-			return 0;
-		case KVM_DEV_ARM_ITS_SAVE_TABLES:
-			return 0;
-		case KVM_DEV_ARM_ITS_RESTORE_TABLES:
-			return 0;
-		}
-		break;
-	case KVM_DEV_ARM_VGIC_GRP_ITS_REGS:
-		return vgic_its_has_attr_regs(dev, attr);
-	}
-	return -ENXIO;
-}
-
-static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr)
-{
-	const struct vgic_its_abi *abi = vgic_its_get_abi(its);
-	int ret = 0;
-
-	if (attr == KVM_DEV_ARM_VGIC_CTRL_INIT) /* Nothing to do */
-		return 0;
-
-	mutex_lock(&kvm->lock);
-	mutex_lock(&its->its_lock);
-
-	if (!lock_all_vcpus(kvm)) {
-		mutex_unlock(&its->its_lock);
-		mutex_unlock(&kvm->lock);
-		return -EBUSY;
-	}
-
-	switch (attr) {
-	case KVM_DEV_ARM_ITS_CTRL_RESET:
-		vgic_its_reset(kvm, its);
-		break;
-	case KVM_DEV_ARM_ITS_SAVE_TABLES:
-		ret = abi->save_tables(its);
-		break;
-	case KVM_DEV_ARM_ITS_RESTORE_TABLES:
-		ret = abi->restore_tables(its);
-		break;
-	}
-
-	unlock_all_vcpus(kvm);
-	mutex_unlock(&its->its_lock);
-	mutex_unlock(&kvm->lock);
-	return ret;
-}
-
-static int vgic_its_set_attr(struct kvm_device *dev,
-			     struct kvm_device_attr *attr)
-{
-	struct vgic_its *its = dev->private;
-	int ret;
-
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_ADDR: {
-		u64 __user *uaddr = (u64 __user *)(long)attr->addr;
-		unsigned long type = (unsigned long)attr->attr;
-		u64 addr;
-
-		if (type != KVM_VGIC_ITS_ADDR_TYPE)
-			return -ENODEV;
-
-		if (copy_from_user(&addr, uaddr, sizeof(addr)))
-			return -EFAULT;
-
-		ret = vgic_check_ioaddr(dev->kvm, &its->vgic_its_base,
-					addr, SZ_64K);
-		if (ret)
-			return ret;
-
-		return vgic_register_its_iodev(dev->kvm, its, addr);
-	}
-	case KVM_DEV_ARM_VGIC_GRP_CTRL:
-		return vgic_its_ctrl(dev->kvm, its, attr->attr);
-	case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: {
-		u64 __user *uaddr = (u64 __user *)(long)attr->addr;
-		u64 reg;
-
-		if (get_user(reg, uaddr))
-			return -EFAULT;
-
-		return vgic_its_attr_regs_access(dev, attr, &reg, true);
-	}
-	}
-	return -ENXIO;
-}
-
-static int vgic_its_get_attr(struct kvm_device *dev,
-			     struct kvm_device_attr *attr)
-{
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_ADDR: {
-		struct vgic_its *its = dev->private;
-		u64 addr = its->vgic_its_base;
-		u64 __user *uaddr = (u64 __user *)(long)attr->addr;
-		unsigned long type = (unsigned long)attr->attr;
-
-		if (type != KVM_VGIC_ITS_ADDR_TYPE)
-			return -ENODEV;
-
-		if (copy_to_user(uaddr, &addr, sizeof(addr)))
-			return -EFAULT;
-		break;
-	}
-	case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: {
-		u64 __user *uaddr = (u64 __user *)(long)attr->addr;
-		u64 reg;
-		int ret;
-
-		ret = vgic_its_attr_regs_access(dev, attr, &reg, false);
-		if (ret)
-			return ret;
-		return put_user(reg, uaddr);
-	}
-	default:
-		return -ENXIO;
-	}
-
-	return 0;
-}
-
-static struct kvm_device_ops kvm_arm_vgic_its_ops = {
-	.name = "kvm-arm-vgic-its",
-	.create = vgic_its_create,
-	.destroy = vgic_its_destroy,
-	.set_attr = vgic_its_set_attr,
-	.get_attr = vgic_its_get_attr,
-	.has_attr = vgic_its_has_attr,
-};
-
-int kvm_vgic_register_its_device(void)
-{
-	return kvm_register_device_ops(&kvm_arm_vgic_its_ops,
-				       KVM_DEV_TYPE_ARM_VGIC_ITS);
-}
diff --git a/virt/kvm/arm/vgic/vgic-kvm-device.c b/virt/kvm/arm/vgic/vgic-kvm-device.c
deleted file mode 100644
index 114dce9f4bf5..000000000000
--- a/virt/kvm/arm/vgic/vgic-kvm-device.c
+++ /dev/null
@@ -1,749 +0,0 @@
-/*
- * VGIC: KVM DEVICE API
- *
- * Copyright (C) 2015 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
-#include <linux/kvm_host.h>
-#include <kvm/arm_vgic.h>
-#include <linux/uaccess.h>
-#include <asm/kvm_mmu.h>
-#include <asm/cputype.h>
-#include "vgic.h"
-
-/* common helpers */
-
-int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
-		      phys_addr_t addr, phys_addr_t alignment)
-{
-	if (addr & ~kvm_phys_mask(kvm))
-		return -E2BIG;
-
-	if (!IS_ALIGNED(addr, alignment))
-		return -EINVAL;
-
-	if (!IS_VGIC_ADDR_UNDEF(*ioaddr))
-		return -EEXIST;
-
-	return 0;
-}
-
-static int vgic_check_type(struct kvm *kvm, int type_needed)
-{
-	if (kvm->arch.vgic.vgic_model != type_needed)
-		return -ENODEV;
-	else
-		return 0;
-}
-
-/**
- * kvm_vgic_addr - set or get vgic VM base addresses
- * @kvm:   pointer to the vm struct
- * @type:  the VGIC addr type, one of KVM_VGIC_V[23]_ADDR_TYPE_XXX
- * @addr:  pointer to address value
- * @write: if true set the address in the VM address space, if false read the
- *          address
- *
- * Set or get the vgic base addresses for the distributor and the virtual CPU
- * interface in the VM physical address space.  These addresses are properties
- * of the emulated core/SoC and therefore user space initially knows this
- * information.
- * Check them for sanity (alignment, double assignment). We can't check for
- * overlapping regions in case of a virtual GICv3 here, since we don't know
- * the number of VCPUs yet, so we defer this check to map_resources().
- */
-int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write)
-{
-	int r = 0;
-	struct vgic_dist *vgic = &kvm->arch.vgic;
-	phys_addr_t *addr_ptr, alignment;
-	u64 undef_value = VGIC_ADDR_UNDEF;
-
-	mutex_lock(&kvm->lock);
-	switch (type) {
-	case KVM_VGIC_V2_ADDR_TYPE_DIST:
-		r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
-		addr_ptr = &vgic->vgic_dist_base;
-		alignment = SZ_4K;
-		break;
-	case KVM_VGIC_V2_ADDR_TYPE_CPU:
-		r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
-		addr_ptr = &vgic->vgic_cpu_base;
-		alignment = SZ_4K;
-		break;
-	case KVM_VGIC_V3_ADDR_TYPE_DIST:
-		r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3);
-		addr_ptr = &vgic->vgic_dist_base;
-		alignment = SZ_64K;
-		break;
-	case KVM_VGIC_V3_ADDR_TYPE_REDIST: {
-		struct vgic_redist_region *rdreg;
-
-		r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3);
-		if (r)
-			break;
-		if (write) {
-			r = vgic_v3_set_redist_base(kvm, 0, *addr, 0);
-			goto out;
-		}
-		rdreg = list_first_entry(&vgic->rd_regions,
-					 struct vgic_redist_region, list);
-		if (!rdreg)
-			addr_ptr = &undef_value;
-		else
-			addr_ptr = &rdreg->base;
-		break;
-	}
-	case KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION:
-	{
-		struct vgic_redist_region *rdreg;
-		u8 index;
-
-		r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3);
-		if (r)
-			break;
-
-		index = *addr & KVM_VGIC_V3_RDIST_INDEX_MASK;
-
-		if (write) {
-			gpa_t base = *addr & KVM_VGIC_V3_RDIST_BASE_MASK;
-			u32 count = (*addr & KVM_VGIC_V3_RDIST_COUNT_MASK)
-					>> KVM_VGIC_V3_RDIST_COUNT_SHIFT;
-			u8 flags = (*addr & KVM_VGIC_V3_RDIST_FLAGS_MASK)
-					>> KVM_VGIC_V3_RDIST_FLAGS_SHIFT;
-
-			if (!count || flags)
-				r = -EINVAL;
-			else
-				r = vgic_v3_set_redist_base(kvm, index,
-							    base, count);
-			goto out;
-		}
-
-		rdreg = vgic_v3_rdist_region_from_index(kvm, index);
-		if (!rdreg) {
-			r = -ENOENT;
-			goto out;
-		}
-
-		*addr = index;
-		*addr |= rdreg->base;
-		*addr |= (u64)rdreg->count << KVM_VGIC_V3_RDIST_COUNT_SHIFT;
-		goto out;
-	}
-	default:
-		r = -ENODEV;
-	}
-
-	if (r)
-		goto out;
-
-	if (write) {
-		r = vgic_check_ioaddr(kvm, addr_ptr, *addr, alignment);
-		if (!r)
-			*addr_ptr = *addr;
-	} else {
-		*addr = *addr_ptr;
-	}
-
-out:
-	mutex_unlock(&kvm->lock);
-	return r;
-}
-
-static int vgic_set_common_attr(struct kvm_device *dev,
-				struct kvm_device_attr *attr)
-{
-	int r;
-
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_ADDR: {
-		u64 __user *uaddr = (u64 __user *)(long)attr->addr;
-		u64 addr;
-		unsigned long type = (unsigned long)attr->attr;
-
-		if (copy_from_user(&addr, uaddr, sizeof(addr)))
-			return -EFAULT;
-
-		r = kvm_vgic_addr(dev->kvm, type, &addr, true);
-		return (r == -ENODEV) ? -ENXIO : r;
-	}
-	case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
-		u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-		u32 val;
-		int ret = 0;
-
-		if (get_user(val, uaddr))
-			return -EFAULT;
-
-		/*
-		 * We require:
-		 * - at least 32 SPIs on top of the 16 SGIs and 16 PPIs
-		 * - at most 1024 interrupts
-		 * - a multiple of 32 interrupts
-		 */
-		if (val < (VGIC_NR_PRIVATE_IRQS + 32) ||
-		    val > VGIC_MAX_RESERVED ||
-		    (val & 31))
-			return -EINVAL;
-
-		mutex_lock(&dev->kvm->lock);
-
-		if (vgic_ready(dev->kvm) || dev->kvm->arch.vgic.nr_spis)
-			ret = -EBUSY;
-		else
-			dev->kvm->arch.vgic.nr_spis =
-				val - VGIC_NR_PRIVATE_IRQS;
-
-		mutex_unlock(&dev->kvm->lock);
-
-		return ret;
-	}
-	case KVM_DEV_ARM_VGIC_GRP_CTRL: {
-		switch (attr->attr) {
-		case KVM_DEV_ARM_VGIC_CTRL_INIT:
-			mutex_lock(&dev->kvm->lock);
-			r = vgic_init(dev->kvm);
-			mutex_unlock(&dev->kvm->lock);
-			return r;
-		}
-		break;
-	}
-	}
-
-	return -ENXIO;
-}
-
-static int vgic_get_common_attr(struct kvm_device *dev,
-				struct kvm_device_attr *attr)
-{
-	int r = -ENXIO;
-
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_ADDR: {
-		u64 __user *uaddr = (u64 __user *)(long)attr->addr;
-		u64 addr;
-		unsigned long type = (unsigned long)attr->attr;
-
-		r = kvm_vgic_addr(dev->kvm, type, &addr, false);
-		if (r)
-			return (r == -ENODEV) ? -ENXIO : r;
-
-		if (copy_to_user(uaddr, &addr, sizeof(addr)))
-			return -EFAULT;
-		break;
-	}
-	case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
-		u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-
-		r = put_user(dev->kvm->arch.vgic.nr_spis +
-			     VGIC_NR_PRIVATE_IRQS, uaddr);
-		break;
-	}
-	}
-
-	return r;
-}
-
-static int vgic_create(struct kvm_device *dev, u32 type)
-{
-	return kvm_vgic_create(dev->kvm, type);
-}
-
-static void vgic_destroy(struct kvm_device *dev)
-{
-	kfree(dev);
-}
-
-int kvm_register_vgic_device(unsigned long type)
-{
-	int ret = -ENODEV;
-
-	switch (type) {
-	case KVM_DEV_TYPE_ARM_VGIC_V2:
-		ret = kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
-					      KVM_DEV_TYPE_ARM_VGIC_V2);
-		break;
-	case KVM_DEV_TYPE_ARM_VGIC_V3:
-		ret = kvm_register_device_ops(&kvm_arm_vgic_v3_ops,
-					      KVM_DEV_TYPE_ARM_VGIC_V3);
-
-		if (ret)
-			break;
-		ret = kvm_vgic_register_its_device();
-		break;
-	}
-
-	return ret;
-}
-
-int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
-		       struct vgic_reg_attr *reg_attr)
-{
-	int cpuid;
-
-	cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >>
-		 KVM_DEV_ARM_VGIC_CPUID_SHIFT;
-
-	if (cpuid >= atomic_read(&dev->kvm->online_vcpus))
-		return -EINVAL;
-
-	reg_attr->vcpu = kvm_get_vcpu(dev->kvm, cpuid);
-	reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
-
-	return 0;
-}
-
-/* unlocks vcpus from @vcpu_lock_idx and smaller */
-static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx)
-{
-	struct kvm_vcpu *tmp_vcpu;
-
-	for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
-		tmp_vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx);
-		mutex_unlock(&tmp_vcpu->mutex);
-	}
-}
-
-void unlock_all_vcpus(struct kvm *kvm)
-{
-	unlock_vcpus(kvm, atomic_read(&kvm->online_vcpus) - 1);
-}
-
-/* Returns true if all vcpus were locked, false otherwise */
-bool lock_all_vcpus(struct kvm *kvm)
-{
-	struct kvm_vcpu *tmp_vcpu;
-	int c;
-
-	/*
-	 * Any time a vcpu is run, vcpu_load is called which tries to grab the
-	 * vcpu->mutex.  By grabbing the vcpu->mutex of all VCPUs we ensure
-	 * that no other VCPUs are run and fiddle with the vgic state while we
-	 * access it.
-	 */
-	kvm_for_each_vcpu(c, tmp_vcpu, kvm) {
-		if (!mutex_trylock(&tmp_vcpu->mutex)) {
-			unlock_vcpus(kvm, c - 1);
-			return false;
-		}
-	}
-
-	return true;
-}
-
-/**
- * vgic_v2_attr_regs_access - allows user space to access VGIC v2 state
- *
- * @dev:      kvm device handle
- * @attr:     kvm device attribute
- * @reg:      address the value is read or written
- * @is_write: true if userspace is writing a register
- */
-static int vgic_v2_attr_regs_access(struct kvm_device *dev,
-				    struct kvm_device_attr *attr,
-				    u32 *reg, bool is_write)
-{
-	struct vgic_reg_attr reg_attr;
-	gpa_t addr;
-	struct kvm_vcpu *vcpu;
-	int ret;
-
-	ret = vgic_v2_parse_attr(dev, attr, &reg_attr);
-	if (ret)
-		return ret;
-
-	vcpu = reg_attr.vcpu;
-	addr = reg_attr.addr;
-
-	mutex_lock(&dev->kvm->lock);
-
-	ret = vgic_init(dev->kvm);
-	if (ret)
-		goto out;
-
-	if (!lock_all_vcpus(dev->kvm)) {
-		ret = -EBUSY;
-		goto out;
-	}
-
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
-		ret = vgic_v2_cpuif_uaccess(vcpu, is_write, addr, reg);
-		break;
-	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-		ret = vgic_v2_dist_uaccess(vcpu, is_write, addr, reg);
-		break;
-	default:
-		ret = -EINVAL;
-		break;
-	}
-
-	unlock_all_vcpus(dev->kvm);
-out:
-	mutex_unlock(&dev->kvm->lock);
-	return ret;
-}
-
-static int vgic_v2_set_attr(struct kvm_device *dev,
-			    struct kvm_device_attr *attr)
-{
-	int ret;
-
-	ret = vgic_set_common_attr(dev, attr);
-	if (ret != -ENXIO)
-		return ret;
-
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
-		u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-		u32 reg;
-
-		if (get_user(reg, uaddr))
-			return -EFAULT;
-
-		return vgic_v2_attr_regs_access(dev, attr, &reg, true);
-	}
-	}
-
-	return -ENXIO;
-}
-
-static int vgic_v2_get_attr(struct kvm_device *dev,
-			    struct kvm_device_attr *attr)
-{
-	int ret;
-
-	ret = vgic_get_common_attr(dev, attr);
-	if (ret != -ENXIO)
-		return ret;
-
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
-		u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-		u32 reg = 0;
-
-		ret = vgic_v2_attr_regs_access(dev, attr, &reg, false);
-		if (ret)
-			return ret;
-		return put_user(reg, uaddr);
-	}
-	}
-
-	return -ENXIO;
-}
-
-static int vgic_v2_has_attr(struct kvm_device *dev,
-			    struct kvm_device_attr *attr)
-{
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_ADDR:
-		switch (attr->attr) {
-		case KVM_VGIC_V2_ADDR_TYPE_DIST:
-		case KVM_VGIC_V2_ADDR_TYPE_CPU:
-			return 0;
-		}
-		break;
-	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
-		return vgic_v2_has_attr_regs(dev, attr);
-	case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
-		return 0;
-	case KVM_DEV_ARM_VGIC_GRP_CTRL:
-		switch (attr->attr) {
-		case KVM_DEV_ARM_VGIC_CTRL_INIT:
-			return 0;
-		}
-	}
-	return -ENXIO;
-}
-
-struct kvm_device_ops kvm_arm_vgic_v2_ops = {
-	.name = "kvm-arm-vgic-v2",
-	.create = vgic_create,
-	.destroy = vgic_destroy,
-	.set_attr = vgic_v2_set_attr,
-	.get_attr = vgic_v2_get_attr,
-	.has_attr = vgic_v2_has_attr,
-};
-
-int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
-		       struct vgic_reg_attr *reg_attr)
-{
-	unsigned long vgic_mpidr, mpidr_reg;
-
-	/*
-	 * For KVM_DEV_ARM_VGIC_GRP_DIST_REGS group,
-	 * attr might not hold MPIDR. Hence assume vcpu0.
-	 */
-	if (attr->group != KVM_DEV_ARM_VGIC_GRP_DIST_REGS) {
-		vgic_mpidr = (attr->attr & KVM_DEV_ARM_VGIC_V3_MPIDR_MASK) >>
-			      KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT;
-
-		mpidr_reg = VGIC_TO_MPIDR(vgic_mpidr);
-		reg_attr->vcpu = kvm_mpidr_to_vcpu(dev->kvm, mpidr_reg);
-	} else {
-		reg_attr->vcpu = kvm_get_vcpu(dev->kvm, 0);
-	}
-
-	if (!reg_attr->vcpu)
-		return -EINVAL;
-
-	reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
-
-	return 0;
-}
-
-/*
- * vgic_v3_attr_regs_access - allows user space to access VGIC v3 state
- *
- * @dev:      kvm device handle
- * @attr:     kvm device attribute
- * @reg:      address the value is read or written
- * @is_write: true if userspace is writing a register
- */
-static int vgic_v3_attr_regs_access(struct kvm_device *dev,
-				    struct kvm_device_attr *attr,
-				    u64 *reg, bool is_write)
-{
-	struct vgic_reg_attr reg_attr;
-	gpa_t addr;
-	struct kvm_vcpu *vcpu;
-	int ret;
-	u32 tmp32;
-
-	ret = vgic_v3_parse_attr(dev, attr, &reg_attr);
-	if (ret)
-		return ret;
-
-	vcpu = reg_attr.vcpu;
-	addr = reg_attr.addr;
-
-	mutex_lock(&dev->kvm->lock);
-
-	if (unlikely(!vgic_initialized(dev->kvm))) {
-		ret = -EBUSY;
-		goto out;
-	}
-
-	if (!lock_all_vcpus(dev->kvm)) {
-		ret = -EBUSY;
-		goto out;
-	}
-
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-		if (is_write)
-			tmp32 = *reg;
-
-		ret = vgic_v3_dist_uaccess(vcpu, is_write, addr, &tmp32);
-		if (!is_write)
-			*reg = tmp32;
-		break;
-	case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:
-		if (is_write)
-			tmp32 = *reg;
-
-		ret = vgic_v3_redist_uaccess(vcpu, is_write, addr, &tmp32);
-		if (!is_write)
-			*reg = tmp32;
-		break;
-	case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: {
-		u64 regid;
-
-		regid = (attr->attr & KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK);
-		ret = vgic_v3_cpu_sysregs_uaccess(vcpu, is_write,
-						  regid, reg);
-		break;
-	}
-	case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: {
-		unsigned int info, intid;
-
-		info = (attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >>
-			KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT;
-		if (info == VGIC_LEVEL_INFO_LINE_LEVEL) {
-			intid = attr->attr &
-				KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK;
-			ret = vgic_v3_line_level_info_uaccess(vcpu, is_write,
-							      intid, reg);
-		} else {
-			ret = -EINVAL;
-		}
-		break;
-	}
-	default:
-		ret = -EINVAL;
-		break;
-	}
-
-	unlock_all_vcpus(dev->kvm);
-out:
-	mutex_unlock(&dev->kvm->lock);
-	return ret;
-}
-
-static int vgic_v3_set_attr(struct kvm_device *dev,
-			    struct kvm_device_attr *attr)
-{
-	int ret;
-
-	ret = vgic_set_common_attr(dev, attr);
-	if (ret != -ENXIO)
-		return ret;
-
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-	case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: {
-		u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-		u32 tmp32;
-		u64 reg;
-
-		if (get_user(tmp32, uaddr))
-			return -EFAULT;
-
-		reg = tmp32;
-		return vgic_v3_attr_regs_access(dev, attr, &reg, true);
-	}
-	case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: {
-		u64 __user *uaddr = (u64 __user *)(long)attr->addr;
-		u64 reg;
-
-		if (get_user(reg, uaddr))
-			return -EFAULT;
-
-		return vgic_v3_attr_regs_access(dev, attr, &reg, true);
-	}
-	case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: {
-		u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-		u64 reg;
-		u32 tmp32;
-
-		if (get_user(tmp32, uaddr))
-			return -EFAULT;
-
-		reg = tmp32;
-		return vgic_v3_attr_regs_access(dev, attr, &reg, true);
-	}
-	case KVM_DEV_ARM_VGIC_GRP_CTRL: {
-		int ret;
-
-		switch (attr->attr) {
-		case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES:
-			mutex_lock(&dev->kvm->lock);
-
-			if (!lock_all_vcpus(dev->kvm)) {
-				mutex_unlock(&dev->kvm->lock);
-				return -EBUSY;
-			}
-			ret = vgic_v3_save_pending_tables(dev->kvm);
-			unlock_all_vcpus(dev->kvm);
-			mutex_unlock(&dev->kvm->lock);
-			return ret;
-		}
-		break;
-	}
-	}
-	return -ENXIO;
-}
-
-static int vgic_v3_get_attr(struct kvm_device *dev,
-			    struct kvm_device_attr *attr)
-{
-	int ret;
-
-	ret = vgic_get_common_attr(dev, attr);
-	if (ret != -ENXIO)
-		return ret;
-
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-	case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: {
-		u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-		u64 reg;
-		u32 tmp32;
-
-		ret = vgic_v3_attr_regs_access(dev, attr, &reg, false);
-		if (ret)
-			return ret;
-		tmp32 = reg;
-		return put_user(tmp32, uaddr);
-	}
-	case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: {
-		u64 __user *uaddr = (u64 __user *)(long)attr->addr;
-		u64 reg;
-
-		ret = vgic_v3_attr_regs_access(dev, attr, &reg, false);
-		if (ret)
-			return ret;
-		return put_user(reg, uaddr);
-	}
-	case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: {
-		u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-		u64 reg;
-		u32 tmp32;
-
-		ret = vgic_v3_attr_regs_access(dev, attr, &reg, false);
-		if (ret)
-			return ret;
-		tmp32 = reg;
-		return put_user(tmp32, uaddr);
-	}
-	}
-	return -ENXIO;
-}
-
-static int vgic_v3_has_attr(struct kvm_device *dev,
-			    struct kvm_device_attr *attr)
-{
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_ADDR:
-		switch (attr->attr) {
-		case KVM_VGIC_V3_ADDR_TYPE_DIST:
-		case KVM_VGIC_V3_ADDR_TYPE_REDIST:
-		case KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION:
-			return 0;
-		}
-		break;
-	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-	case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:
-	case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS:
-		return vgic_v3_has_attr_regs(dev, attr);
-	case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
-		return 0;
-	case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: {
-		if (((attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >>
-		      KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT) ==
-		      VGIC_LEVEL_INFO_LINE_LEVEL)
-			return 0;
-		break;
-	}
-	case KVM_DEV_ARM_VGIC_GRP_CTRL:
-		switch (attr->attr) {
-		case KVM_DEV_ARM_VGIC_CTRL_INIT:
-			return 0;
-		case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES:
-			return 0;
-		}
-	}
-	return -ENXIO;
-}
-
-struct kvm_device_ops kvm_arm_vgic_v3_ops = {
-	.name = "kvm-arm-vgic-v3",
-	.create = vgic_create,
-	.destroy = vgic_destroy,
-	.set_attr = vgic_v3_set_attr,
-	.get_attr = vgic_v3_get_attr,
-	.has_attr = vgic_v3_has_attr,
-};
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c
deleted file mode 100644
index 738b65d2d0e7..000000000000
--- a/virt/kvm/arm/vgic/vgic-mmio-v2.c
+++ /dev/null
@@ -1,554 +0,0 @@
-/*
- * VGICv2 MMIO handling functions
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
-
-#include <linux/irqchip/arm-gic.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/nospec.h>
-
-#include <kvm/iodev.h>
-#include <kvm/arm_vgic.h>
-
-#include "vgic.h"
-#include "vgic-mmio.h"
-
-/*
- * The Revision field in the IIDR have the following meanings:
- *
- * Revision 1: Report GICv2 interrupts as group 0 instead of group 1
- * Revision 2: Interrupt groups are guest-configurable and signaled using
- * 	       their configured groups.
- */
-
-static unsigned long vgic_mmio_read_v2_misc(struct kvm_vcpu *vcpu,
-					    gpa_t addr, unsigned int len)
-{
-	struct vgic_dist *vgic = &vcpu->kvm->arch.vgic;
-	u32 value;
-
-	switch (addr & 0x0c) {
-	case GIC_DIST_CTRL:
-		value = vgic->enabled ? GICD_ENABLE : 0;
-		break;
-	case GIC_DIST_CTR:
-		value = vgic->nr_spis + VGIC_NR_PRIVATE_IRQS;
-		value = (value >> 5) - 1;
-		value |= (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5;
-		break;
-	case GIC_DIST_IIDR:
-		value = (PRODUCT_ID_KVM << GICD_IIDR_PRODUCT_ID_SHIFT) |
-			(vgic->implementation_rev << GICD_IIDR_REVISION_SHIFT) |
-			(IMPLEMENTER_ARM << GICD_IIDR_IMPLEMENTER_SHIFT);
-		break;
-	default:
-		return 0;
-	}
-
-	return value;
-}
-
-static void vgic_mmio_write_v2_misc(struct kvm_vcpu *vcpu,
-				    gpa_t addr, unsigned int len,
-				    unsigned long val)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-	bool was_enabled = dist->enabled;
-
-	switch (addr & 0x0c) {
-	case GIC_DIST_CTRL:
-		dist->enabled = val & GICD_ENABLE;
-		if (!was_enabled && dist->enabled)
-			vgic_kick_vcpus(vcpu->kvm);
-		break;
-	case GIC_DIST_CTR:
-	case GIC_DIST_IIDR:
-		/* Nothing to do */
-		return;
-	}
-}
-
-static int vgic_mmio_uaccess_write_v2_misc(struct kvm_vcpu *vcpu,
-					   gpa_t addr, unsigned int len,
-					   unsigned long val)
-{
-	switch (addr & 0x0c) {
-	case GIC_DIST_IIDR:
-		if (val != vgic_mmio_read_v2_misc(vcpu, addr, len))
-			return -EINVAL;
-
-		/*
-		 * If we observe a write to GICD_IIDR we know that userspace
-		 * has been updated and has had a chance to cope with older
-		 * kernels (VGICv2 IIDR.Revision == 0) incorrectly reporting
-		 * interrupts as group 1, and therefore we now allow groups to
-		 * be user writable.  Doing this by default would break
-		 * migration from old kernels to new kernels with legacy
-		 * userspace.
-		 */
-		vcpu->kvm->arch.vgic.v2_groups_user_writable = true;
-		return 0;
-	}
-
-	vgic_mmio_write_v2_misc(vcpu, addr, len, val);
-	return 0;
-}
-
-static int vgic_mmio_uaccess_write_v2_group(struct kvm_vcpu *vcpu,
-					    gpa_t addr, unsigned int len,
-					    unsigned long val)
-{
-	if (vcpu->kvm->arch.vgic.v2_groups_user_writable)
-		vgic_mmio_write_group(vcpu, addr, len, val);
-
-	return 0;
-}
-
-static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu,
-				 gpa_t addr, unsigned int len,
-				 unsigned long val)
-{
-	int nr_vcpus = atomic_read(&source_vcpu->kvm->online_vcpus);
-	int intid = val & 0xf;
-	int targets = (val >> 16) & 0xff;
-	int mode = (val >> 24) & 0x03;
-	int c;
-	struct kvm_vcpu *vcpu;
-	unsigned long flags;
-
-	switch (mode) {
-	case 0x0:		/* as specified by targets */
-		break;
-	case 0x1:
-		targets = (1U << nr_vcpus) - 1;			/* all, ... */
-		targets &= ~(1U << source_vcpu->vcpu_id);	/* but self */
-		break;
-	case 0x2:		/* this very vCPU only */
-		targets = (1U << source_vcpu->vcpu_id);
-		break;
-	case 0x3:		/* reserved */
-		return;
-	}
-
-	kvm_for_each_vcpu(c, vcpu, source_vcpu->kvm) {
-		struct vgic_irq *irq;
-
-		if (!(targets & (1U << c)))
-			continue;
-
-		irq = vgic_get_irq(source_vcpu->kvm, vcpu, intid);
-
-		spin_lock_irqsave(&irq->irq_lock, flags);
-		irq->pending_latch = true;
-		irq->source |= 1U << source_vcpu->vcpu_id;
-
-		vgic_queue_irq_unlock(source_vcpu->kvm, irq, flags);
-		vgic_put_irq(source_vcpu->kvm, irq);
-	}
-}
-
-static unsigned long vgic_mmio_read_target(struct kvm_vcpu *vcpu,
-					   gpa_t addr, unsigned int len)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
-	int i;
-	u64 val = 0;
-
-	for (i = 0; i < len; i++) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		val |= (u64)irq->targets << (i * 8);
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-
-	return val;
-}
-
-static void vgic_mmio_write_target(struct kvm_vcpu *vcpu,
-				   gpa_t addr, unsigned int len,
-				   unsigned long val)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
-	u8 cpu_mask = GENMASK(atomic_read(&vcpu->kvm->online_vcpus) - 1, 0);
-	int i;
-	unsigned long flags;
-
-	/* GICD_ITARGETSR[0-7] are read-only */
-	if (intid < VGIC_NR_PRIVATE_IRQS)
-		return;
-
-	for (i = 0; i < len; i++) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid + i);
-		int target;
-
-		spin_lock_irqsave(&irq->irq_lock, flags);
-
-		irq->targets = (val >> (i * 8)) & cpu_mask;
-		target = irq->targets ? __ffs(irq->targets) : 0;
-		irq->target_vcpu = kvm_get_vcpu(vcpu->kvm, target);
-
-		spin_unlock_irqrestore(&irq->irq_lock, flags);
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-}
-
-static unsigned long vgic_mmio_read_sgipend(struct kvm_vcpu *vcpu,
-					    gpa_t addr, unsigned int len)
-{
-	u32 intid = addr & 0x0f;
-	int i;
-	u64 val = 0;
-
-	for (i = 0; i < len; i++) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		val |= (u64)irq->source << (i * 8);
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-	return val;
-}
-
-static void vgic_mmio_write_sgipendc(struct kvm_vcpu *vcpu,
-				     gpa_t addr, unsigned int len,
-				     unsigned long val)
-{
-	u32 intid = addr & 0x0f;
-	int i;
-	unsigned long flags;
-
-	for (i = 0; i < len; i++) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		spin_lock_irqsave(&irq->irq_lock, flags);
-
-		irq->source &= ~((val >> (i * 8)) & 0xff);
-		if (!irq->source)
-			irq->pending_latch = false;
-
-		spin_unlock_irqrestore(&irq->irq_lock, flags);
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-}
-
-static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu,
-				     gpa_t addr, unsigned int len,
-				     unsigned long val)
-{
-	u32 intid = addr & 0x0f;
-	int i;
-	unsigned long flags;
-
-	for (i = 0; i < len; i++) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		spin_lock_irqsave(&irq->irq_lock, flags);
-
-		irq->source |= (val >> (i * 8)) & 0xff;
-
-		if (irq->source) {
-			irq->pending_latch = true;
-			vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
-		} else {
-			spin_unlock_irqrestore(&irq->irq_lock, flags);
-		}
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-}
-
-#define GICC_ARCH_VERSION_V2	0x2
-
-/* These are for userland accesses only, there is no guest-facing emulation. */
-static unsigned long vgic_mmio_read_vcpuif(struct kvm_vcpu *vcpu,
-					   gpa_t addr, unsigned int len)
-{
-	struct vgic_vmcr vmcr;
-	u32 val;
-
-	vgic_get_vmcr(vcpu, &vmcr);
-
-	switch (addr & 0xff) {
-	case GIC_CPU_CTRL:
-		val = vmcr.grpen0 << GIC_CPU_CTRL_EnableGrp0_SHIFT;
-		val |= vmcr.grpen1 << GIC_CPU_CTRL_EnableGrp1_SHIFT;
-		val |= vmcr.ackctl << GIC_CPU_CTRL_AckCtl_SHIFT;
-		val |= vmcr.fiqen << GIC_CPU_CTRL_FIQEn_SHIFT;
-		val |= vmcr.cbpr << GIC_CPU_CTRL_CBPR_SHIFT;
-		val |= vmcr.eoim << GIC_CPU_CTRL_EOImodeNS_SHIFT;
-
-		break;
-	case GIC_CPU_PRIMASK:
-		/*
-		 * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the
-		 * the PMR field as GICH_VMCR.VMPriMask rather than
-		 * GICC_PMR.Priority, so we expose the upper five bits of
-		 * priority mask to userspace using the lower bits in the
-		 * unsigned long.
-		 */
-		val = (vmcr.pmr & GICV_PMR_PRIORITY_MASK) >>
-			GICV_PMR_PRIORITY_SHIFT;
-		break;
-	case GIC_CPU_BINPOINT:
-		val = vmcr.bpr;
-		break;
-	case GIC_CPU_ALIAS_BINPOINT:
-		val = vmcr.abpr;
-		break;
-	case GIC_CPU_IDENT:
-		val = ((PRODUCT_ID_KVM << 20) |
-		       (GICC_ARCH_VERSION_V2 << 16) |
-		       IMPLEMENTER_ARM);
-		break;
-	default:
-		return 0;
-	}
-
-	return val;
-}
-
-static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu,
-				   gpa_t addr, unsigned int len,
-				   unsigned long val)
-{
-	struct vgic_vmcr vmcr;
-
-	vgic_get_vmcr(vcpu, &vmcr);
-
-	switch (addr & 0xff) {
-	case GIC_CPU_CTRL:
-		vmcr.grpen0 = !!(val & GIC_CPU_CTRL_EnableGrp0);
-		vmcr.grpen1 = !!(val & GIC_CPU_CTRL_EnableGrp1);
-		vmcr.ackctl = !!(val & GIC_CPU_CTRL_AckCtl);
-		vmcr.fiqen = !!(val & GIC_CPU_CTRL_FIQEn);
-		vmcr.cbpr = !!(val & GIC_CPU_CTRL_CBPR);
-		vmcr.eoim = !!(val & GIC_CPU_CTRL_EOImodeNS);
-
-		break;
-	case GIC_CPU_PRIMASK:
-		/*
-		 * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the
-		 * the PMR field as GICH_VMCR.VMPriMask rather than
-		 * GICC_PMR.Priority, so we expose the upper five bits of
-		 * priority mask to userspace using the lower bits in the
-		 * unsigned long.
-		 */
-		vmcr.pmr = (val << GICV_PMR_PRIORITY_SHIFT) &
-			GICV_PMR_PRIORITY_MASK;
-		break;
-	case GIC_CPU_BINPOINT:
-		vmcr.bpr = val;
-		break;
-	case GIC_CPU_ALIAS_BINPOINT:
-		vmcr.abpr = val;
-		break;
-	}
-
-	vgic_set_vmcr(vcpu, &vmcr);
-}
-
-static unsigned long vgic_mmio_read_apr(struct kvm_vcpu *vcpu,
-					gpa_t addr, unsigned int len)
-{
-	int n; /* which APRn is this */
-
-	n = (addr >> 2) & 0x3;
-
-	if (kvm_vgic_global_state.type == VGIC_V2) {
-		/* GICv2 hardware systems support max. 32 groups */
-		if (n != 0)
-			return 0;
-		return vcpu->arch.vgic_cpu.vgic_v2.vgic_apr;
-	} else {
-		struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3;
-
-		if (n > vgic_v3_max_apr_idx(vcpu))
-			return 0;
-
-		n = array_index_nospec(n, 4);
-
-		/* GICv3 only uses ICH_AP1Rn for memory mapped (GICv2) guests */
-		return vgicv3->vgic_ap1r[n];
-	}
-}
-
-static void vgic_mmio_write_apr(struct kvm_vcpu *vcpu,
-				gpa_t addr, unsigned int len,
-				unsigned long val)
-{
-	int n; /* which APRn is this */
-
-	n = (addr >> 2) & 0x3;
-
-	if (kvm_vgic_global_state.type == VGIC_V2) {
-		/* GICv2 hardware systems support max. 32 groups */
-		if (n != 0)
-			return;
-		vcpu->arch.vgic_cpu.vgic_v2.vgic_apr = val;
-	} else {
-		struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3;
-
-		if (n > vgic_v3_max_apr_idx(vcpu))
-			return;
-
-		n = array_index_nospec(n, 4);
-
-		/* GICv3 only uses ICH_AP1Rn for memory mapped (GICv2) guests */
-		vgicv3->vgic_ap1r[n] = val;
-	}
-}
-
-static const struct vgic_register_region vgic_v2_dist_registers[] = {
-	REGISTER_DESC_WITH_LENGTH_UACCESS(GIC_DIST_CTRL,
-		vgic_mmio_read_v2_misc, vgic_mmio_write_v2_misc,
-		NULL, vgic_mmio_uaccess_write_v2_misc,
-		12, VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_IGROUP,
-		vgic_mmio_read_group, vgic_mmio_write_group,
-		NULL, vgic_mmio_uaccess_write_v2_group, 1,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_SET,
-		vgic_mmio_read_enable, vgic_mmio_write_senable, NULL, NULL, 1,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_CLEAR,
-		vgic_mmio_read_enable, vgic_mmio_write_cenable, NULL, NULL, 1,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_SET,
-		vgic_mmio_read_pending, vgic_mmio_write_spending, NULL, NULL, 1,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_CLEAR,
-		vgic_mmio_read_pending, vgic_mmio_write_cpending, NULL, NULL, 1,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_SET,
-		vgic_mmio_read_active, vgic_mmio_write_sactive,
-		NULL, vgic_mmio_uaccess_write_sactive, 1,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_CLEAR,
-		vgic_mmio_read_active, vgic_mmio_write_cactive,
-		NULL, vgic_mmio_uaccess_write_cactive, 1,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PRI,
-		vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL,
-		8, VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_TARGET,
-		vgic_mmio_read_target, vgic_mmio_write_target, NULL, NULL, 8,
-		VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_CONFIG,
-		vgic_mmio_read_config, vgic_mmio_write_config, NULL, NULL, 2,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GIC_DIST_SOFTINT,
-		vgic_mmio_read_raz, vgic_mmio_write_sgir, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GIC_DIST_SGI_PENDING_CLEAR,
-		vgic_mmio_read_sgipend, vgic_mmio_write_sgipendc, 16,
-		VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
-	REGISTER_DESC_WITH_LENGTH(GIC_DIST_SGI_PENDING_SET,
-		vgic_mmio_read_sgipend, vgic_mmio_write_sgipends, 16,
-		VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
-};
-
-static const struct vgic_register_region vgic_v2_cpu_registers[] = {
-	REGISTER_DESC_WITH_LENGTH(GIC_CPU_CTRL,
-		vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GIC_CPU_PRIMASK,
-		vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GIC_CPU_BINPOINT,
-		vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GIC_CPU_ALIAS_BINPOINT,
-		vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GIC_CPU_ACTIVEPRIO,
-		vgic_mmio_read_apr, vgic_mmio_write_apr, 16,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GIC_CPU_IDENT,
-		vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
-		VGIC_ACCESS_32bit),
-};
-
-unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev)
-{
-	dev->regions = vgic_v2_dist_registers;
-	dev->nr_regions = ARRAY_SIZE(vgic_v2_dist_registers);
-
-	kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops);
-
-	return SZ_4K;
-}
-
-int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr)
-{
-	const struct vgic_register_region *region;
-	struct vgic_io_device iodev;
-	struct vgic_reg_attr reg_attr;
-	struct kvm_vcpu *vcpu;
-	gpa_t addr;
-	int ret;
-
-	ret = vgic_v2_parse_attr(dev, attr, &reg_attr);
-	if (ret)
-		return ret;
-
-	vcpu = reg_attr.vcpu;
-	addr = reg_attr.addr;
-
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-		iodev.regions = vgic_v2_dist_registers;
-		iodev.nr_regions = ARRAY_SIZE(vgic_v2_dist_registers);
-		iodev.base_addr = 0;
-		break;
-	case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
-		iodev.regions = vgic_v2_cpu_registers;
-		iodev.nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers);
-		iodev.base_addr = 0;
-		break;
-	default:
-		return -ENXIO;
-	}
-
-	/* We only support aligned 32-bit accesses. */
-	if (addr & 3)
-		return -ENXIO;
-
-	region = vgic_get_mmio_region(vcpu, &iodev, addr, sizeof(u32));
-	if (!region)
-		return -ENXIO;
-
-	return 0;
-}
-
-int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write,
-			  int offset, u32 *val)
-{
-	struct vgic_io_device dev = {
-		.regions = vgic_v2_cpu_registers,
-		.nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers),
-		.iodev_type = IODEV_CPUIF,
-	};
-
-	return vgic_uaccess(vcpu, &dev, is_write, offset, val);
-}
-
-int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
-			 int offset, u32 *val)
-{
-	struct vgic_io_device dev = {
-		.regions = vgic_v2_dist_registers,
-		.nr_regions = ARRAY_SIZE(vgic_v2_dist_registers),
-		.iodev_type = IODEV_DIST,
-	};
-
-	return vgic_uaccess(vcpu, &dev, is_write, offset, val);
-}
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c
deleted file mode 100644
index b3d1f0985117..000000000000
--- a/virt/kvm/arm/vgic/vgic-mmio-v3.c
+++ /dev/null
@@ -1,1022 +0,0 @@
-/*
- * VGICv3 MMIO handling functions
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
-
-#include <linux/irqchip/arm-gic-v3.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <kvm/iodev.h>
-#include <kvm/arm_vgic.h>
-
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_mmu.h>
-
-#include "vgic.h"
-#include "vgic-mmio.h"
-
-/* extract @num bytes at @offset bytes offset in data */
-unsigned long extract_bytes(u64 data, unsigned int offset,
-			    unsigned int num)
-{
-	return (data >> (offset * 8)) & GENMASK_ULL(num * 8 - 1, 0);
-}
-
-/* allows updates of any half of a 64-bit register (or the whole thing) */
-u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len,
-		     unsigned long val)
-{
-	int lower = (offset & 4) * 8;
-	int upper = lower + 8 * len - 1;
-
-	reg &= ~GENMASK_ULL(upper, lower);
-	val &= GENMASK_ULL(len * 8 - 1, 0);
-
-	return reg | ((u64)val << lower);
-}
-
-bool vgic_has_its(struct kvm *kvm)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-
-	if (dist->vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3)
-		return false;
-
-	return dist->has_its;
-}
-
-bool vgic_supports_direct_msis(struct kvm *kvm)
-{
-	return kvm_vgic_global_state.has_gicv4 && vgic_has_its(kvm);
-}
-
-/*
- * The Revision field in the IIDR have the following meanings:
- *
- * Revision 2: Interrupt groups are guest-configurable and signaled using
- * 	       their configured groups.
- */
-
-static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu,
-					    gpa_t addr, unsigned int len)
-{
-	struct vgic_dist *vgic = &vcpu->kvm->arch.vgic;
-	u32 value = 0;
-
-	switch (addr & 0x0c) {
-	case GICD_CTLR:
-		if (vgic->enabled)
-			value |= GICD_CTLR_ENABLE_SS_G1;
-		value |= GICD_CTLR_ARE_NS | GICD_CTLR_DS;
-		break;
-	case GICD_TYPER:
-		value = vgic->nr_spis + VGIC_NR_PRIVATE_IRQS;
-		value = (value >> 5) - 1;
-		if (vgic_has_its(vcpu->kvm)) {
-			value |= (INTERRUPT_ID_BITS_ITS - 1) << 19;
-			value |= GICD_TYPER_LPIS;
-		} else {
-			value |= (INTERRUPT_ID_BITS_SPIS - 1) << 19;
-		}
-		break;
-	case GICD_IIDR:
-		value = (PRODUCT_ID_KVM << GICD_IIDR_PRODUCT_ID_SHIFT) |
-			(vgic->implementation_rev << GICD_IIDR_REVISION_SHIFT) |
-			(IMPLEMENTER_ARM << GICD_IIDR_IMPLEMENTER_SHIFT);
-		break;
-	default:
-		return 0;
-	}
-
-	return value;
-}
-
-static void vgic_mmio_write_v3_misc(struct kvm_vcpu *vcpu,
-				    gpa_t addr, unsigned int len,
-				    unsigned long val)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-	bool was_enabled = dist->enabled;
-
-	switch (addr & 0x0c) {
-	case GICD_CTLR:
-		dist->enabled = val & GICD_CTLR_ENABLE_SS_G1;
-
-		if (!was_enabled && dist->enabled)
-			vgic_kick_vcpus(vcpu->kvm);
-		break;
-	case GICD_TYPER:
-	case GICD_IIDR:
-		return;
-	}
-}
-
-static int vgic_mmio_uaccess_write_v3_misc(struct kvm_vcpu *vcpu,
-					   gpa_t addr, unsigned int len,
-					   unsigned long val)
-{
-	switch (addr & 0x0c) {
-	case GICD_IIDR:
-		if (val != vgic_mmio_read_v3_misc(vcpu, addr, len))
-			return -EINVAL;
-	}
-
-	vgic_mmio_write_v3_misc(vcpu, addr, len, val);
-	return 0;
-}
-
-static unsigned long vgic_mmio_read_irouter(struct kvm_vcpu *vcpu,
-					    gpa_t addr, unsigned int len)
-{
-	int intid = VGIC_ADDR_TO_INTID(addr, 64);
-	struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid);
-	unsigned long ret = 0;
-
-	if (!irq)
-		return 0;
-
-	/* The upper word is RAZ for us. */
-	if (!(addr & 4))
-		ret = extract_bytes(READ_ONCE(irq->mpidr), addr & 7, len);
-
-	vgic_put_irq(vcpu->kvm, irq);
-	return ret;
-}
-
-static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu,
-				    gpa_t addr, unsigned int len,
-				    unsigned long val)
-{
-	int intid = VGIC_ADDR_TO_INTID(addr, 64);
-	struct vgic_irq *irq;
-	unsigned long flags;
-
-	/* The upper word is WI for us since we don't implement Aff3. */
-	if (addr & 4)
-		return;
-
-	irq = vgic_get_irq(vcpu->kvm, NULL, intid);
-
-	if (!irq)
-		return;
-
-	spin_lock_irqsave(&irq->irq_lock, flags);
-
-	/* We only care about and preserve Aff0, Aff1 and Aff2. */
-	irq->mpidr = val & GENMASK(23, 0);
-	irq->target_vcpu = kvm_mpidr_to_vcpu(vcpu->kvm, irq->mpidr);
-
-	spin_unlock_irqrestore(&irq->irq_lock, flags);
-	vgic_put_irq(vcpu->kvm, irq);
-}
-
-static unsigned long vgic_mmio_read_v3r_ctlr(struct kvm_vcpu *vcpu,
-					     gpa_t addr, unsigned int len)
-{
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-
-	return vgic_cpu->lpis_enabled ? GICR_CTLR_ENABLE_LPIS : 0;
-}
-
-
-static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu,
-				     gpa_t addr, unsigned int len,
-				     unsigned long val)
-{
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-	bool was_enabled = vgic_cpu->lpis_enabled;
-
-	if (!vgic_has_its(vcpu->kvm))
-		return;
-
-	vgic_cpu->lpis_enabled = val & GICR_CTLR_ENABLE_LPIS;
-
-	if (!was_enabled && vgic_cpu->lpis_enabled)
-		vgic_enable_lpis(vcpu);
-}
-
-static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu,
-					      gpa_t addr, unsigned int len)
-{
-	unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-	struct vgic_redist_region *rdreg = vgic_cpu->rdreg;
-	int target_vcpu_id = vcpu->vcpu_id;
-	gpa_t last_rdist_typer = rdreg->base + GICR_TYPER +
-			(rdreg->free_index - 1) * KVM_VGIC_V3_REDIST_SIZE;
-	u64 value;
-
-	value = (u64)(mpidr & GENMASK(23, 0)) << 32;
-	value |= ((target_vcpu_id & 0xffff) << 8);
-
-	if (addr == last_rdist_typer)
-		value |= GICR_TYPER_LAST;
-	if (vgic_has_its(vcpu->kvm))
-		value |= GICR_TYPER_PLPIS;
-
-	return extract_bytes(value, addr & 7, len);
-}
-
-static unsigned long vgic_mmio_read_v3r_iidr(struct kvm_vcpu *vcpu,
-					     gpa_t addr, unsigned int len)
-{
-	return (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
-}
-
-static unsigned long vgic_mmio_read_v3_idregs(struct kvm_vcpu *vcpu,
-					      gpa_t addr, unsigned int len)
-{
-	switch (addr & 0xffff) {
-	case GICD_PIDR2:
-		/* report a GICv3 compliant implementation */
-		return 0x3b;
-	}
-
-	return 0;
-}
-
-static unsigned long vgic_v3_uaccess_read_pending(struct kvm_vcpu *vcpu,
-						  gpa_t addr, unsigned int len)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-	u32 value = 0;
-	int i;
-
-	/*
-	 * pending state of interrupt is latched in pending_latch variable.
-	 * Userspace will save and restore pending state and line_level
-	 * separately.
-	 * Refer to Documentation/virtual/kvm/devices/arm-vgic-v3.txt
-	 * for handling of ISPENDR and ICPENDR.
-	 */
-	for (i = 0; i < len * 8; i++) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		if (irq->pending_latch)
-			value |= (1U << i);
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-
-	return value;
-}
-
-static int vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu,
-					 gpa_t addr, unsigned int len,
-					 unsigned long val)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-	int i;
-	unsigned long flags;
-
-	for (i = 0; i < len * 8; i++) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		spin_lock_irqsave(&irq->irq_lock, flags);
-		if (test_bit(i, &val)) {
-			/*
-			 * pending_latch is set irrespective of irq type
-			 * (level or edge) to avoid dependency that VM should
-			 * restore irq config before pending info.
-			 */
-			irq->pending_latch = true;
-			vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
-		} else {
-			irq->pending_latch = false;
-			spin_unlock_irqrestore(&irq->irq_lock, flags);
-		}
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-
-	return 0;
-}
-
-/* We want to avoid outer shareable. */
-u64 vgic_sanitise_shareability(u64 field)
-{
-	switch (field) {
-	case GIC_BASER_OuterShareable:
-		return GIC_BASER_InnerShareable;
-	default:
-		return field;
-	}
-}
-
-/* Avoid any inner non-cacheable mapping. */
-u64 vgic_sanitise_inner_cacheability(u64 field)
-{
-	switch (field) {
-	case GIC_BASER_CACHE_nCnB:
-	case GIC_BASER_CACHE_nC:
-		return GIC_BASER_CACHE_RaWb;
-	default:
-		return field;
-	}
-}
-
-/* Non-cacheable or same-as-inner are OK. */
-u64 vgic_sanitise_outer_cacheability(u64 field)
-{
-	switch (field) {
-	case GIC_BASER_CACHE_SameAsInner:
-	case GIC_BASER_CACHE_nC:
-		return field;
-	default:
-		return GIC_BASER_CACHE_nC;
-	}
-}
-
-u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift,
-			u64 (*sanitise_fn)(u64))
-{
-	u64 field = (reg & field_mask) >> field_shift;
-
-	field = sanitise_fn(field) << field_shift;
-	return (reg & ~field_mask) | field;
-}
-
-#define PROPBASER_RES0_MASK						\
-	(GENMASK_ULL(63, 59) | GENMASK_ULL(55, 52) | GENMASK_ULL(6, 5))
-#define PENDBASER_RES0_MASK						\
-	(BIT_ULL(63) | GENMASK_ULL(61, 59) | GENMASK_ULL(55, 52) |	\
-	 GENMASK_ULL(15, 12) | GENMASK_ULL(6, 0))
-
-static u64 vgic_sanitise_pendbaser(u64 reg)
-{
-	reg = vgic_sanitise_field(reg, GICR_PENDBASER_SHAREABILITY_MASK,
-				  GICR_PENDBASER_SHAREABILITY_SHIFT,
-				  vgic_sanitise_shareability);
-	reg = vgic_sanitise_field(reg, GICR_PENDBASER_INNER_CACHEABILITY_MASK,
-				  GICR_PENDBASER_INNER_CACHEABILITY_SHIFT,
-				  vgic_sanitise_inner_cacheability);
-	reg = vgic_sanitise_field(reg, GICR_PENDBASER_OUTER_CACHEABILITY_MASK,
-				  GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT,
-				  vgic_sanitise_outer_cacheability);
-
-	reg &= ~PENDBASER_RES0_MASK;
-
-	return reg;
-}
-
-static u64 vgic_sanitise_propbaser(u64 reg)
-{
-	reg = vgic_sanitise_field(reg, GICR_PROPBASER_SHAREABILITY_MASK,
-				  GICR_PROPBASER_SHAREABILITY_SHIFT,
-				  vgic_sanitise_shareability);
-	reg = vgic_sanitise_field(reg, GICR_PROPBASER_INNER_CACHEABILITY_MASK,
-				  GICR_PROPBASER_INNER_CACHEABILITY_SHIFT,
-				  vgic_sanitise_inner_cacheability);
-	reg = vgic_sanitise_field(reg, GICR_PROPBASER_OUTER_CACHEABILITY_MASK,
-				  GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT,
-				  vgic_sanitise_outer_cacheability);
-
-	reg &= ~PROPBASER_RES0_MASK;
-	return reg;
-}
-
-static unsigned long vgic_mmio_read_propbase(struct kvm_vcpu *vcpu,
-					     gpa_t addr, unsigned int len)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
-	return extract_bytes(dist->propbaser, addr & 7, len);
-}
-
-static void vgic_mmio_write_propbase(struct kvm_vcpu *vcpu,
-				     gpa_t addr, unsigned int len,
-				     unsigned long val)
-{
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-	u64 old_propbaser, propbaser;
-
-	/* Storing a value with LPIs already enabled is undefined */
-	if (vgic_cpu->lpis_enabled)
-		return;
-
-	do {
-		old_propbaser = READ_ONCE(dist->propbaser);
-		propbaser = old_propbaser;
-		propbaser = update_64bit_reg(propbaser, addr & 4, len, val);
-		propbaser = vgic_sanitise_propbaser(propbaser);
-	} while (cmpxchg64(&dist->propbaser, old_propbaser,
-			   propbaser) != old_propbaser);
-}
-
-static unsigned long vgic_mmio_read_pendbase(struct kvm_vcpu *vcpu,
-					     gpa_t addr, unsigned int len)
-{
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-
-	return extract_bytes(vgic_cpu->pendbaser, addr & 7, len);
-}
-
-static void vgic_mmio_write_pendbase(struct kvm_vcpu *vcpu,
-				     gpa_t addr, unsigned int len,
-				     unsigned long val)
-{
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-	u64 old_pendbaser, pendbaser;
-
-	/* Storing a value with LPIs already enabled is undefined */
-	if (vgic_cpu->lpis_enabled)
-		return;
-
-	do {
-		old_pendbaser = READ_ONCE(vgic_cpu->pendbaser);
-		pendbaser = old_pendbaser;
-		pendbaser = update_64bit_reg(pendbaser, addr & 4, len, val);
-		pendbaser = vgic_sanitise_pendbaser(pendbaser);
-	} while (cmpxchg64(&vgic_cpu->pendbaser, old_pendbaser,
-			   pendbaser) != old_pendbaser);
-}
-
-/*
- * The GICv3 per-IRQ registers are split to control PPIs and SGIs in the
- * redistributors, while SPIs are covered by registers in the distributor
- * block. Trying to set private IRQs in this block gets ignored.
- * We take some special care here to fix the calculation of the register
- * offset.
- */
-#define REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(off, rd, wr, ur, uw, bpi, acc) \
-	{								\
-		.reg_offset = off,					\
-		.bits_per_irq = bpi,					\
-		.len = (bpi * VGIC_NR_PRIVATE_IRQS) / 8,		\
-		.access_flags = acc,					\
-		.read = vgic_mmio_read_raz,				\
-		.write = vgic_mmio_write_wi,				\
-	}, {								\
-		.reg_offset = off + (bpi * VGIC_NR_PRIVATE_IRQS) / 8,	\
-		.bits_per_irq = bpi,					\
-		.len = (bpi * (1024 - VGIC_NR_PRIVATE_IRQS)) / 8,	\
-		.access_flags = acc,					\
-		.read = rd,						\
-		.write = wr,						\
-		.uaccess_read = ur,					\
-		.uaccess_write = uw,					\
-	}
-
-static const struct vgic_register_region vgic_v3_dist_registers[] = {
-	REGISTER_DESC_WITH_LENGTH_UACCESS(GICD_CTLR,
-		vgic_mmio_read_v3_misc, vgic_mmio_write_v3_misc,
-		NULL, vgic_mmio_uaccess_write_v3_misc,
-		16, VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GICD_STATUSR,
-		vgic_mmio_read_rao, vgic_mmio_write_wi, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGROUPR,
-		vgic_mmio_read_group, vgic_mmio_write_group, NULL, NULL, 1,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISENABLER,
-		vgic_mmio_read_enable, vgic_mmio_write_senable, NULL, NULL, 1,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICENABLER,
-		vgic_mmio_read_enable, vgic_mmio_write_cenable, NULL, NULL, 1,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISPENDR,
-		vgic_mmio_read_pending, vgic_mmio_write_spending,
-		vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 1,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICPENDR,
-		vgic_mmio_read_pending, vgic_mmio_write_cpending,
-		vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi, 1,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISACTIVER,
-		vgic_mmio_read_active, vgic_mmio_write_sactive,
-		NULL, vgic_mmio_uaccess_write_sactive, 1,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICACTIVER,
-		vgic_mmio_read_active, vgic_mmio_write_cactive,
-		NULL, vgic_mmio_uaccess_write_cactive,
-		1, VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IPRIORITYR,
-		vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL,
-		8, VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ITARGETSR,
-		vgic_mmio_read_raz, vgic_mmio_write_wi, NULL, NULL, 8,
-		VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICFGR,
-		vgic_mmio_read_config, vgic_mmio_write_config, NULL, NULL, 2,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGRPMODR,
-		vgic_mmio_read_raz, vgic_mmio_write_wi, NULL, NULL, 1,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IROUTER,
-		vgic_mmio_read_irouter, vgic_mmio_write_irouter, NULL, NULL, 64,
-		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GICD_IDREGS,
-		vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48,
-		VGIC_ACCESS_32bit),
-};
-
-static const struct vgic_register_region vgic_v3_rdbase_registers[] = {
-	REGISTER_DESC_WITH_LENGTH(GICR_CTLR,
-		vgic_mmio_read_v3r_ctlr, vgic_mmio_write_v3r_ctlr, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GICR_STATUSR,
-		vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GICR_IIDR,
-		vgic_mmio_read_v3r_iidr, vgic_mmio_write_wi, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GICR_TYPER,
-		vgic_mmio_read_v3r_typer, vgic_mmio_write_wi, 8,
-		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GICR_WAKER,
-		vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GICR_PROPBASER,
-		vgic_mmio_read_propbase, vgic_mmio_write_propbase, 8,
-		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GICR_PENDBASER,
-		vgic_mmio_read_pendbase, vgic_mmio_write_pendbase, 8,
-		VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GICR_IDREGS,
-		vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48,
-		VGIC_ACCESS_32bit),
-};
-
-static const struct vgic_register_region vgic_v3_sgibase_registers[] = {
-	REGISTER_DESC_WITH_LENGTH(GICR_IGROUPR0,
-		vgic_mmio_read_group, vgic_mmio_write_group, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GICR_ISENABLER0,
-		vgic_mmio_read_enable, vgic_mmio_write_senable, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GICR_ICENABLER0,
-		vgic_mmio_read_enable, vgic_mmio_write_cenable, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH_UACCESS(GICR_ISPENDR0,
-		vgic_mmio_read_pending, vgic_mmio_write_spending,
-		vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH_UACCESS(GICR_ICPENDR0,
-		vgic_mmio_read_pending, vgic_mmio_write_cpending,
-		vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH_UACCESS(GICR_ISACTIVER0,
-		vgic_mmio_read_active, vgic_mmio_write_sactive,
-		NULL, vgic_mmio_uaccess_write_sactive,
-		4, VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH_UACCESS(GICR_ICACTIVER0,
-		vgic_mmio_read_active, vgic_mmio_write_cactive,
-		NULL, vgic_mmio_uaccess_write_cactive,
-		4, VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GICR_IPRIORITYR0,
-		vgic_mmio_read_priority, vgic_mmio_write_priority, 32,
-		VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
-	REGISTER_DESC_WITH_LENGTH(GICR_ICFGR0,
-		vgic_mmio_read_config, vgic_mmio_write_config, 8,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GICR_IGRPMODR0,
-		vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
-		VGIC_ACCESS_32bit),
-	REGISTER_DESC_WITH_LENGTH(GICR_NSACR,
-		vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
-		VGIC_ACCESS_32bit),
-};
-
-unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev)
-{
-	dev->regions = vgic_v3_dist_registers;
-	dev->nr_regions = ARRAY_SIZE(vgic_v3_dist_registers);
-
-	kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops);
-
-	return SZ_64K;
-}
-
-/**
- * vgic_register_redist_iodev - register a single redist iodev
- * @vcpu:    The VCPU to which the redistributor belongs
- *
- * Register a KVM iodev for this VCPU's redistributor using the address
- * provided.
- *
- * Return 0 on success, -ERRNO otherwise.
- */
-int vgic_register_redist_iodev(struct kvm_vcpu *vcpu)
-{
-	struct kvm *kvm = vcpu->kvm;
-	struct vgic_dist *vgic = &kvm->arch.vgic;
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-	struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev;
-	struct vgic_io_device *sgi_dev = &vcpu->arch.vgic_cpu.sgi_iodev;
-	struct vgic_redist_region *rdreg;
-	gpa_t rd_base, sgi_base;
-	int ret;
-
-	if (!IS_VGIC_ADDR_UNDEF(vgic_cpu->rd_iodev.base_addr))
-		return 0;
-
-	/*
-	 * We may be creating VCPUs before having set the base address for the
-	 * redistributor region, in which case we will come back to this
-	 * function for all VCPUs when the base address is set.  Just return
-	 * without doing any work for now.
-	 */
-	rdreg = vgic_v3_rdist_free_slot(&vgic->rd_regions);
-	if (!rdreg)
-		return 0;
-
-	if (!vgic_v3_check_base(kvm))
-		return -EINVAL;
-
-	vgic_cpu->rdreg = rdreg;
-
-	rd_base = rdreg->base + rdreg->free_index * KVM_VGIC_V3_REDIST_SIZE;
-	sgi_base = rd_base + SZ_64K;
-
-	kvm_iodevice_init(&rd_dev->dev, &kvm_io_gic_ops);
-	rd_dev->base_addr = rd_base;
-	rd_dev->iodev_type = IODEV_REDIST;
-	rd_dev->regions = vgic_v3_rdbase_registers;
-	rd_dev->nr_regions = ARRAY_SIZE(vgic_v3_rdbase_registers);
-	rd_dev->redist_vcpu = vcpu;
-
-	mutex_lock(&kvm->slots_lock);
-	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, rd_base,
-				      SZ_64K, &rd_dev->dev);
-	mutex_unlock(&kvm->slots_lock);
-
-	if (ret)
-		return ret;
-
-	kvm_iodevice_init(&sgi_dev->dev, &kvm_io_gic_ops);
-	sgi_dev->base_addr = sgi_base;
-	sgi_dev->iodev_type = IODEV_REDIST;
-	sgi_dev->regions = vgic_v3_sgibase_registers;
-	sgi_dev->nr_regions = ARRAY_SIZE(vgic_v3_sgibase_registers);
-	sgi_dev->redist_vcpu = vcpu;
-
-	mutex_lock(&kvm->slots_lock);
-	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, sgi_base,
-				      SZ_64K, &sgi_dev->dev);
-	if (ret) {
-		kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
-					  &rd_dev->dev);
-		goto out;
-	}
-
-	rdreg->free_index++;
-out:
-	mutex_unlock(&kvm->slots_lock);
-	return ret;
-}
-
-static void vgic_unregister_redist_iodev(struct kvm_vcpu *vcpu)
-{
-	struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev;
-	struct vgic_io_device *sgi_dev = &vcpu->arch.vgic_cpu.sgi_iodev;
-
-	kvm_io_bus_unregister_dev(vcpu->kvm, KVM_MMIO_BUS, &rd_dev->dev);
-	kvm_io_bus_unregister_dev(vcpu->kvm, KVM_MMIO_BUS, &sgi_dev->dev);
-}
-
-static int vgic_register_all_redist_iodevs(struct kvm *kvm)
-{
-	struct kvm_vcpu *vcpu;
-	int c, ret = 0;
-
-	kvm_for_each_vcpu(c, vcpu, kvm) {
-		ret = vgic_register_redist_iodev(vcpu);
-		if (ret)
-			break;
-	}
-
-	if (ret) {
-		/* The current c failed, so we start with the previous one. */
-		mutex_lock(&kvm->slots_lock);
-		for (c--; c >= 0; c--) {
-			vcpu = kvm_get_vcpu(kvm, c);
-			vgic_unregister_redist_iodev(vcpu);
-		}
-		mutex_unlock(&kvm->slots_lock);
-	}
-
-	return ret;
-}
-
-/**
- * vgic_v3_insert_redist_region - Insert a new redistributor region
- *
- * Performs various checks before inserting the rdist region in the list.
- * Those tests depend on whether the size of the rdist region is known
- * (ie. count != 0). The list is sorted by rdist region index.
- *
- * @kvm: kvm handle
- * @index: redist region index
- * @base: base of the new rdist region
- * @count: number of redistributors the region is made of (0 in the old style
- * single region, whose size is induced from the number of vcpus)
- *
- * Return 0 on success, < 0 otherwise
- */
-static int vgic_v3_insert_redist_region(struct kvm *kvm, uint32_t index,
-					gpa_t base, uint32_t count)
-{
-	struct vgic_dist *d = &kvm->arch.vgic;
-	struct vgic_redist_region *rdreg;
-	struct list_head *rd_regions = &d->rd_regions;
-	size_t size = count * KVM_VGIC_V3_REDIST_SIZE;
-	int ret;
-
-	/* single rdist region already set ?*/
-	if (!count && !list_empty(rd_regions))
-		return -EINVAL;
-
-	/* cross the end of memory ? */
-	if (base + size < base)
-		return -EINVAL;
-
-	if (list_empty(rd_regions)) {
-		if (index != 0)
-			return -EINVAL;
-	} else {
-		rdreg = list_last_entry(rd_regions,
-					struct vgic_redist_region, list);
-		if (index != rdreg->index + 1)
-			return -EINVAL;
-
-		/* Cannot add an explicitly sized regions after legacy region */
-		if (!rdreg->count)
-			return -EINVAL;
-	}
-
-	/*
-	 * For legacy single-region redistributor regions (!count),
-	 * check that the redistributor region does not overlap with the
-	 * distributor's address space.
-	 */
-	if (!count && !IS_VGIC_ADDR_UNDEF(d->vgic_dist_base) &&
-		vgic_dist_overlap(kvm, base, size))
-		return -EINVAL;
-
-	/* collision with any other rdist region? */
-	if (vgic_v3_rdist_overlap(kvm, base, size))
-		return -EINVAL;
-
-	rdreg = kzalloc(sizeof(*rdreg), GFP_KERNEL);
-	if (!rdreg)
-		return -ENOMEM;
-
-	rdreg->base = VGIC_ADDR_UNDEF;
-
-	ret = vgic_check_ioaddr(kvm, &rdreg->base, base, SZ_64K);
-	if (ret)
-		goto free;
-
-	rdreg->base = base;
-	rdreg->count = count;
-	rdreg->free_index = 0;
-	rdreg->index = index;
-
-	list_add_tail(&rdreg->list, rd_regions);
-	return 0;
-free:
-	kfree(rdreg);
-	return ret;
-}
-
-int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count)
-{
-	int ret;
-
-	ret = vgic_v3_insert_redist_region(kvm, index, addr, count);
-	if (ret)
-		return ret;
-
-	/*
-	 * Register iodevs for each existing VCPU.  Adding more VCPUs
-	 * afterwards will register the iodevs when needed.
-	 */
-	ret = vgic_register_all_redist_iodevs(kvm);
-	if (ret)
-		return ret;
-
-	return 0;
-}
-
-int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr)
-{
-	const struct vgic_register_region *region;
-	struct vgic_io_device iodev;
-	struct vgic_reg_attr reg_attr;
-	struct kvm_vcpu *vcpu;
-	gpa_t addr;
-	int ret;
-
-	ret = vgic_v3_parse_attr(dev, attr, &reg_attr);
-	if (ret)
-		return ret;
-
-	vcpu = reg_attr.vcpu;
-	addr = reg_attr.addr;
-
-	switch (attr->group) {
-	case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
-		iodev.regions = vgic_v3_dist_registers;
-		iodev.nr_regions = ARRAY_SIZE(vgic_v3_dist_registers);
-		iodev.base_addr = 0;
-		break;
-	case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:{
-		iodev.regions = vgic_v3_rdbase_registers;
-		iodev.nr_regions = ARRAY_SIZE(vgic_v3_rdbase_registers);
-		iodev.base_addr = 0;
-		break;
-	}
-	case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: {
-		u64 reg, id;
-
-		id = (attr->attr & KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK);
-		return vgic_v3_has_cpu_sysregs_attr(vcpu, 0, id, &reg);
-	}
-	default:
-		return -ENXIO;
-	}
-
-	/* We only support aligned 32-bit accesses. */
-	if (addr & 3)
-		return -ENXIO;
-
-	region = vgic_get_mmio_region(vcpu, &iodev, addr, sizeof(u32));
-	if (!region)
-		return -ENXIO;
-
-	return 0;
-}
-/*
- * Compare a given affinity (level 1-3 and a level 0 mask, from the SGI
- * generation register ICC_SGI1R_EL1) with a given VCPU.
- * If the VCPU's MPIDR matches, return the level0 affinity, otherwise
- * return -1.
- */
-static int match_mpidr(u64 sgi_aff, u16 sgi_cpu_mask, struct kvm_vcpu *vcpu)
-{
-	unsigned long affinity;
-	int level0;
-
-	/*
-	 * Split the current VCPU's MPIDR into affinity level 0 and the
-	 * rest as this is what we have to compare against.
-	 */
-	affinity = kvm_vcpu_get_mpidr_aff(vcpu);
-	level0 = MPIDR_AFFINITY_LEVEL(affinity, 0);
-	affinity &= ~MPIDR_LEVEL_MASK;
-
-	/* bail out if the upper three levels don't match */
-	if (sgi_aff != affinity)
-		return -1;
-
-	/* Is this VCPU's bit set in the mask ? */
-	if (!(sgi_cpu_mask & BIT(level0)))
-		return -1;
-
-	return level0;
-}
-
-/*
- * The ICC_SGI* registers encode the affinity differently from the MPIDR,
- * so provide a wrapper to use the existing defines to isolate a certain
- * affinity level.
- */
-#define SGI_AFFINITY_LEVEL(reg, level) \
-	((((reg) & ICC_SGI1R_AFFINITY_## level ##_MASK) \
-	>> ICC_SGI1R_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level))
-
-/**
- * vgic_v3_dispatch_sgi - handle SGI requests from VCPUs
- * @vcpu: The VCPU requesting a SGI
- * @reg: The value written into ICC_{ASGI1,SGI0,SGI1}R by that VCPU
- * @allow_group1: Does the sysreg access allow generation of G1 SGIs
- *
- * With GICv3 (and ARE=1) CPUs trigger SGIs by writing to a system register.
- * This will trap in sys_regs.c and call this function.
- * This ICC_SGI1R_EL1 register contains the upper three affinity levels of the
- * target processors as well as a bitmask of 16 Aff0 CPUs.
- * If the interrupt routing mode bit is not set, we iterate over all VCPUs to
- * check for matching ones. If this bit is set, we signal all, but not the
- * calling VCPU.
- */
-void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1)
-{
-	struct kvm *kvm = vcpu->kvm;
-	struct kvm_vcpu *c_vcpu;
-	u16 target_cpus;
-	u64 mpidr;
-	int sgi, c;
-	int vcpu_id = vcpu->vcpu_id;
-	bool broadcast;
-	unsigned long flags;
-
-	sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT;
-	broadcast = reg & BIT_ULL(ICC_SGI1R_IRQ_ROUTING_MODE_BIT);
-	target_cpus = (reg & ICC_SGI1R_TARGET_LIST_MASK) >> ICC_SGI1R_TARGET_LIST_SHIFT;
-	mpidr = SGI_AFFINITY_LEVEL(reg, 3);
-	mpidr |= SGI_AFFINITY_LEVEL(reg, 2);
-	mpidr |= SGI_AFFINITY_LEVEL(reg, 1);
-
-	/*
-	 * We iterate over all VCPUs to find the MPIDRs matching the request.
-	 * If we have handled one CPU, we clear its bit to detect early
-	 * if we are already finished. This avoids iterating through all
-	 * VCPUs when most of the times we just signal a single VCPU.
-	 */
-	kvm_for_each_vcpu(c, c_vcpu, kvm) {
-		struct vgic_irq *irq;
-
-		/* Exit early if we have dealt with all requested CPUs */
-		if (!broadcast && target_cpus == 0)
-			break;
-
-		/* Don't signal the calling VCPU */
-		if (broadcast && c == vcpu_id)
-			continue;
-
-		if (!broadcast) {
-			int level0;
-
-			level0 = match_mpidr(mpidr, target_cpus, c_vcpu);
-			if (level0 == -1)
-				continue;
-
-			/* remove this matching VCPU from the mask */
-			target_cpus &= ~BIT(level0);
-		}
-
-		irq = vgic_get_irq(vcpu->kvm, c_vcpu, sgi);
-
-		spin_lock_irqsave(&irq->irq_lock, flags);
-
-		/*
-		 * An access targetting Group0 SGIs can only generate
-		 * those, while an access targetting Group1 SGIs can
-		 * generate interrupts of either group.
-		 */
-		if (!irq->group || allow_group1) {
-			irq->pending_latch = true;
-			vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
-		} else {
-			spin_unlock_irqrestore(&irq->irq_lock, flags);
-		}
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-}
-
-int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
-			 int offset, u32 *val)
-{
-	struct vgic_io_device dev = {
-		.regions = vgic_v3_dist_registers,
-		.nr_regions = ARRAY_SIZE(vgic_v3_dist_registers),
-	};
-
-	return vgic_uaccess(vcpu, &dev, is_write, offset, val);
-}
-
-int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
-			   int offset, u32 *val)
-{
-	struct vgic_io_device rd_dev = {
-		.regions = vgic_v3_rdbase_registers,
-		.nr_regions = ARRAY_SIZE(vgic_v3_rdbase_registers),
-	};
-
-	struct vgic_io_device sgi_dev = {
-		.regions = vgic_v3_sgibase_registers,
-		.nr_regions = ARRAY_SIZE(vgic_v3_sgibase_registers),
-	};
-
-	/* SGI_base is the next 64K frame after RD_base */
-	if (offset >= SZ_64K)
-		return vgic_uaccess(vcpu, &sgi_dev, is_write, offset - SZ_64K,
-				    val);
-	else
-		return vgic_uaccess(vcpu, &rd_dev, is_write, offset, val);
-}
-
-int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write,
-				    u32 intid, u64 *val)
-{
-	if (intid % 32)
-		return -EINVAL;
-
-	if (is_write)
-		vgic_write_irq_line_level_info(vcpu, intid, *val);
-	else
-		*val = vgic_read_irq_line_level_info(vcpu, intid);
-
-	return 0;
-}
diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c
deleted file mode 100644
index ceeda7e04a4d..000000000000
--- a/virt/kvm/arm/vgic/vgic-mmio.c
+++ /dev/null
@@ -1,893 +0,0 @@
-/*
- * VGIC MMIO handling functions
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
-
-#include <linux/bitops.h>
-#include <linux/bsearch.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <kvm/iodev.h>
-#include <kvm/arm_arch_timer.h>
-#include <kvm/arm_vgic.h>
-
-#include "vgic.h"
-#include "vgic-mmio.h"
-
-unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu,
-				 gpa_t addr, unsigned int len)
-{
-	return 0;
-}
-
-unsigned long vgic_mmio_read_rao(struct kvm_vcpu *vcpu,
-				 gpa_t addr, unsigned int len)
-{
-	return -1UL;
-}
-
-void vgic_mmio_write_wi(struct kvm_vcpu *vcpu, gpa_t addr,
-			unsigned int len, unsigned long val)
-{
-	/* Ignore */
-}
-
-int vgic_mmio_uaccess_write_wi(struct kvm_vcpu *vcpu, gpa_t addr,
-			       unsigned int len, unsigned long val)
-{
-	/* Ignore */
-	return 0;
-}
-
-unsigned long vgic_mmio_read_group(struct kvm_vcpu *vcpu,
-				   gpa_t addr, unsigned int len)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-	u32 value = 0;
-	int i;
-
-	/* Loop over all IRQs affected by this read */
-	for (i = 0; i < len * 8; i++) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		if (irq->group)
-			value |= BIT(i);
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-
-	return value;
-}
-
-void vgic_mmio_write_group(struct kvm_vcpu *vcpu, gpa_t addr,
-			   unsigned int len, unsigned long val)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-	int i;
-	unsigned long flags;
-
-	for (i = 0; i < len * 8; i++) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		spin_lock_irqsave(&irq->irq_lock, flags);
-		irq->group = !!(val & BIT(i));
-		vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-}
-
-/*
- * Read accesses to both GICD_ICENABLER and GICD_ISENABLER return the value
- * of the enabled bit, so there is only one function for both here.
- */
-unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu,
-				    gpa_t addr, unsigned int len)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-	u32 value = 0;
-	int i;
-
-	/* Loop over all IRQs affected by this read */
-	for (i = 0; i < len * 8; i++) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		if (irq->enabled)
-			value |= (1U << i);
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-
-	return value;
-}
-
-void vgic_mmio_write_senable(struct kvm_vcpu *vcpu,
-			     gpa_t addr, unsigned int len,
-			     unsigned long val)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-	int i;
-	unsigned long flags;
-
-	for_each_set_bit(i, &val, len * 8) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		spin_lock_irqsave(&irq->irq_lock, flags);
-		irq->enabled = true;
-		vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-}
-
-void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu,
-			     gpa_t addr, unsigned int len,
-			     unsigned long val)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-	int i;
-	unsigned long flags;
-
-	for_each_set_bit(i, &val, len * 8) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		spin_lock_irqsave(&irq->irq_lock, flags);
-
-		irq->enabled = false;
-
-		spin_unlock_irqrestore(&irq->irq_lock, flags);
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-}
-
-unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
-				     gpa_t addr, unsigned int len)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-	u32 value = 0;
-	int i;
-
-	/* Loop over all IRQs affected by this read */
-	for (i = 0; i < len * 8; i++) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-		unsigned long flags;
-
-		spin_lock_irqsave(&irq->irq_lock, flags);
-		if (irq_is_pending(irq))
-			value |= (1U << i);
-		spin_unlock_irqrestore(&irq->irq_lock, flags);
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-
-	return value;
-}
-
-/*
- * This function will return the VCPU that performed the MMIO access and
- * trapped from within the VM, and will return NULL if this is a userspace
- * access.
- *
- * We can disable preemption locally around accessing the per-CPU variable,
- * and use the resolved vcpu pointer after enabling preemption again, because
- * even if the current thread is migrated to another CPU, reading the per-CPU
- * value later will give us the same value as we update the per-CPU variable
- * in the preempt notifier handlers.
- */
-static struct kvm_vcpu *vgic_get_mmio_requester_vcpu(void)
-{
-	struct kvm_vcpu *vcpu;
-
-	preempt_disable();
-	vcpu = kvm_arm_get_running_vcpu();
-	preempt_enable();
-	return vcpu;
-}
-
-/* Must be called with irq->irq_lock held */
-static void vgic_hw_irq_spending(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
-				 bool is_uaccess)
-{
-	if (is_uaccess)
-		return;
-
-	irq->pending_latch = true;
-	vgic_irq_set_phys_active(irq, true);
-}
-
-void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
-			      gpa_t addr, unsigned int len,
-			      unsigned long val)
-{
-	bool is_uaccess = !vgic_get_mmio_requester_vcpu();
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-	int i;
-	unsigned long flags;
-
-	for_each_set_bit(i, &val, len * 8) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		spin_lock_irqsave(&irq->irq_lock, flags);
-		if (irq->hw)
-			vgic_hw_irq_spending(vcpu, irq, is_uaccess);
-		else
-			irq->pending_latch = true;
-		vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-}
-
-/* Must be called with irq->irq_lock held */
-static void vgic_hw_irq_cpending(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
-				 bool is_uaccess)
-{
-	if (is_uaccess)
-		return;
-
-	irq->pending_latch = false;
-
-	/*
-	 * We don't want the guest to effectively mask the physical
-	 * interrupt by doing a write to SPENDR followed by a write to
-	 * CPENDR for HW interrupts, so we clear the active state on
-	 * the physical side if the virtual interrupt is not active.
-	 * This may lead to taking an additional interrupt on the
-	 * host, but that should not be a problem as the worst that
-	 * can happen is an additional vgic injection.  We also clear
-	 * the pending state to maintain proper semantics for edge HW
-	 * interrupts.
-	 */
-	vgic_irq_set_phys_pending(irq, false);
-	if (!irq->active)
-		vgic_irq_set_phys_active(irq, false);
-}
-
-void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
-			      gpa_t addr, unsigned int len,
-			      unsigned long val)
-{
-	bool is_uaccess = !vgic_get_mmio_requester_vcpu();
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-	int i;
-	unsigned long flags;
-
-	for_each_set_bit(i, &val, len * 8) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		spin_lock_irqsave(&irq->irq_lock, flags);
-
-		if (irq->hw)
-			vgic_hw_irq_cpending(vcpu, irq, is_uaccess);
-		else
-			irq->pending_latch = false;
-
-		spin_unlock_irqrestore(&irq->irq_lock, flags);
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-}
-
-unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
-				    gpa_t addr, unsigned int len)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-	u32 value = 0;
-	int i;
-
-	/* Loop over all IRQs affected by this read */
-	for (i = 0; i < len * 8; i++) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		if (irq->active)
-			value |= (1U << i);
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-
-	return value;
-}
-
-/* Must be called with irq->irq_lock held */
-static void vgic_hw_irq_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
-				      bool active, bool is_uaccess)
-{
-	if (is_uaccess)
-		return;
-
-	irq->active = active;
-	vgic_irq_set_phys_active(irq, active);
-}
-
-static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
-				    bool active)
-{
-	unsigned long flags;
-	struct kvm_vcpu *requester_vcpu = vgic_get_mmio_requester_vcpu();
-
-	spin_lock_irqsave(&irq->irq_lock, flags);
-
-	if (irq->hw) {
-		vgic_hw_irq_change_active(vcpu, irq, active, !requester_vcpu);
-	} else {
-		u32 model = vcpu->kvm->arch.vgic.vgic_model;
-		u8 active_source;
-
-		irq->active = active;
-
-		/*
-		 * The GICv2 architecture indicates that the source CPUID for
-		 * an SGI should be provided during an EOI which implies that
-		 * the active state is stored somewhere, but at the same time
-		 * this state is not architecturally exposed anywhere and we
-		 * have no way of knowing the right source.
-		 *
-		 * This may lead to a VCPU not being able to receive
-		 * additional instances of a particular SGI after migration
-		 * for a GICv2 VM on some GIC implementations.  Oh well.
-		 */
-		active_source = (requester_vcpu) ? requester_vcpu->vcpu_id : 0;
-
-		if (model == KVM_DEV_TYPE_ARM_VGIC_V2 &&
-		    active && vgic_irq_is_sgi(irq->intid))
-			irq->active_source = active_source;
-	}
-
-	if (irq->active)
-		vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
-	else
-		spin_unlock_irqrestore(&irq->irq_lock, flags);
-}
-
-/*
- * If we are fiddling with an IRQ's active state, we have to make sure the IRQ
- * is not queued on some running VCPU's LRs, because then the change to the
- * active state can be overwritten when the VCPU's state is synced coming back
- * from the guest.
- *
- * For shared interrupts, we have to stop all the VCPUs because interrupts can
- * be migrated while we don't hold the IRQ locks and we don't want to be
- * chasing moving targets.
- *
- * For private interrupts we don't have to do anything because userspace
- * accesses to the VGIC state already require all VCPUs to be stopped, and
- * only the VCPU itself can modify its private interrupts active state, which
- * guarantees that the VCPU is not running.
- */
-static void vgic_change_active_prepare(struct kvm_vcpu *vcpu, u32 intid)
-{
-	if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 ||
-	    intid > VGIC_NR_PRIVATE_IRQS)
-		kvm_arm_halt_guest(vcpu->kvm);
-}
-
-/* See vgic_change_active_prepare */
-static void vgic_change_active_finish(struct kvm_vcpu *vcpu, u32 intid)
-{
-	if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 ||
-	    intid > VGIC_NR_PRIVATE_IRQS)
-		kvm_arm_resume_guest(vcpu->kvm);
-}
-
-static void __vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
-				      gpa_t addr, unsigned int len,
-				      unsigned long val)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-	int i;
-
-	for_each_set_bit(i, &val, len * 8) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-		vgic_mmio_change_active(vcpu, irq, false);
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-}
-
-void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
-			     gpa_t addr, unsigned int len,
-			     unsigned long val)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-
-	mutex_lock(&vcpu->kvm->lock);
-	vgic_change_active_prepare(vcpu, intid);
-
-	__vgic_mmio_write_cactive(vcpu, addr, len, val);
-
-	vgic_change_active_finish(vcpu, intid);
-	mutex_unlock(&vcpu->kvm->lock);
-}
-
-int vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu,
-				     gpa_t addr, unsigned int len,
-				     unsigned long val)
-{
-	__vgic_mmio_write_cactive(vcpu, addr, len, val);
-	return 0;
-}
-
-static void __vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
-				      gpa_t addr, unsigned int len,
-				      unsigned long val)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-	int i;
-
-	for_each_set_bit(i, &val, len * 8) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-		vgic_mmio_change_active(vcpu, irq, true);
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-}
-
-void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
-			     gpa_t addr, unsigned int len,
-			     unsigned long val)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
-
-	mutex_lock(&vcpu->kvm->lock);
-	vgic_change_active_prepare(vcpu, intid);
-
-	__vgic_mmio_write_sactive(vcpu, addr, len, val);
-
-	vgic_change_active_finish(vcpu, intid);
-	mutex_unlock(&vcpu->kvm->lock);
-}
-
-int vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu,
-				     gpa_t addr, unsigned int len,
-				     unsigned long val)
-{
-	__vgic_mmio_write_sactive(vcpu, addr, len, val);
-	return 0;
-}
-
-unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu,
-				      gpa_t addr, unsigned int len)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
-	int i;
-	u64 val = 0;
-
-	for (i = 0; i < len; i++) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		val |= (u64)irq->priority << (i * 8);
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-
-	return val;
-}
-
-/*
- * We currently don't handle changing the priority of an interrupt that
- * is already pending on a VCPU. If there is a need for this, we would
- * need to make this VCPU exit and re-evaluate the priorities, potentially
- * leading to this interrupt getting presented now to the guest (if it has
- * been masked by the priority mask before).
- */
-void vgic_mmio_write_priority(struct kvm_vcpu *vcpu,
-			      gpa_t addr, unsigned int len,
-			      unsigned long val)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
-	int i;
-	unsigned long flags;
-
-	for (i = 0; i < len; i++) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		spin_lock_irqsave(&irq->irq_lock, flags);
-		/* Narrow the priority range to what we actually support */
-		irq->priority = (val >> (i * 8)) & GENMASK(7, 8 - VGIC_PRI_BITS);
-		spin_unlock_irqrestore(&irq->irq_lock, flags);
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-}
-
-unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu,
-				    gpa_t addr, unsigned int len)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 2);
-	u32 value = 0;
-	int i;
-
-	for (i = 0; i < len * 4; i++) {
-		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		if (irq->config == VGIC_CONFIG_EDGE)
-			value |= (2U << (i * 2));
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-
-	return value;
-}
-
-void vgic_mmio_write_config(struct kvm_vcpu *vcpu,
-			    gpa_t addr, unsigned int len,
-			    unsigned long val)
-{
-	u32 intid = VGIC_ADDR_TO_INTID(addr, 2);
-	int i;
-	unsigned long flags;
-
-	for (i = 0; i < len * 4; i++) {
-		struct vgic_irq *irq;
-
-		/*
-		 * The configuration cannot be changed for SGIs in general,
-		 * for PPIs this is IMPLEMENTATION DEFINED. The arch timer
-		 * code relies on PPIs being level triggered, so we also
-		 * make them read-only here.
-		 */
-		if (intid + i < VGIC_NR_PRIVATE_IRQS)
-			continue;
-
-		irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-		spin_lock_irqsave(&irq->irq_lock, flags);
-
-		if (test_bit(i * 2 + 1, &val))
-			irq->config = VGIC_CONFIG_EDGE;
-		else
-			irq->config = VGIC_CONFIG_LEVEL;
-
-		spin_unlock_irqrestore(&irq->irq_lock, flags);
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-}
-
-u64 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid)
-{
-	int i;
-	u64 val = 0;
-	int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
-
-	for (i = 0; i < 32; i++) {
-		struct vgic_irq *irq;
-
-		if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs)
-			continue;
-
-		irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-		if (irq->config == VGIC_CONFIG_LEVEL && irq->line_level)
-			val |= (1U << i);
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-
-	return val;
-}
-
-void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid,
-				    const u64 val)
-{
-	int i;
-	int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
-	unsigned long flags;
-
-	for (i = 0; i < 32; i++) {
-		struct vgic_irq *irq;
-		bool new_level;
-
-		if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs)
-			continue;
-
-		irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
-
-		/*
-		 * Line level is set irrespective of irq type
-		 * (level or edge) to avoid dependency that VM should
-		 * restore irq config before line level.
-		 */
-		new_level = !!(val & (1U << i));
-		spin_lock_irqsave(&irq->irq_lock, flags);
-		irq->line_level = new_level;
-		if (new_level)
-			vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
-		else
-			spin_unlock_irqrestore(&irq->irq_lock, flags);
-
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-}
-
-static int match_region(const void *key, const void *elt)
-{
-	const unsigned int offset = (unsigned long)key;
-	const struct vgic_register_region *region = elt;
-
-	if (offset < region->reg_offset)
-		return -1;
-
-	if (offset >= region->reg_offset + region->len)
-		return 1;
-
-	return 0;
-}
-
-const struct vgic_register_region *
-vgic_find_mmio_region(const struct vgic_register_region *regions,
-		      int nr_regions, unsigned int offset)
-{
-	return bsearch((void *)(uintptr_t)offset, regions, nr_regions,
-		       sizeof(regions[0]), match_region);
-}
-
-void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
-{
-	if (kvm_vgic_global_state.type == VGIC_V2)
-		vgic_v2_set_vmcr(vcpu, vmcr);
-	else
-		vgic_v3_set_vmcr(vcpu, vmcr);
-}
-
-void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
-{
-	if (kvm_vgic_global_state.type == VGIC_V2)
-		vgic_v2_get_vmcr(vcpu, vmcr);
-	else
-		vgic_v3_get_vmcr(vcpu, vmcr);
-}
-
-/*
- * kvm_mmio_read_buf() returns a value in a format where it can be converted
- * to a byte array and be directly observed as the guest wanted it to appear
- * in memory if it had done the store itself, which is LE for the GIC, as the
- * guest knows the GIC is always LE.
- *
- * We convert this value to the CPUs native format to deal with it as a data
- * value.
- */
-unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len)
-{
-	unsigned long data = kvm_mmio_read_buf(val, len);
-
-	switch (len) {
-	case 1:
-		return data;
-	case 2:
-		return le16_to_cpu(data);
-	case 4:
-		return le32_to_cpu(data);
-	default:
-		return le64_to_cpu(data);
-	}
-}
-
-/*
- * kvm_mmio_write_buf() expects a value in a format such that if converted to
- * a byte array it is observed as the guest would see it if it could perform
- * the load directly.  Since the GIC is LE, and the guest knows this, the
- * guest expects a value in little endian format.
- *
- * We convert the data value from the CPUs native format to LE so that the
- * value is returned in the proper format.
- */
-void vgic_data_host_to_mmio_bus(void *buf, unsigned int len,
-				unsigned long data)
-{
-	switch (len) {
-	case 1:
-		break;
-	case 2:
-		data = cpu_to_le16(data);
-		break;
-	case 4:
-		data = cpu_to_le32(data);
-		break;
-	default:
-		data = cpu_to_le64(data);
-	}
-
-	kvm_mmio_write_buf(buf, len, data);
-}
-
-static
-struct vgic_io_device *kvm_to_vgic_iodev(const struct kvm_io_device *dev)
-{
-	return container_of(dev, struct vgic_io_device, dev);
-}
-
-static bool check_region(const struct kvm *kvm,
-			 const struct vgic_register_region *region,
-			 gpa_t addr, int len)
-{
-	int flags, nr_irqs = kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
-
-	switch (len) {
-	case sizeof(u8):
-		flags = VGIC_ACCESS_8bit;
-		break;
-	case sizeof(u32):
-		flags = VGIC_ACCESS_32bit;
-		break;
-	case sizeof(u64):
-		flags = VGIC_ACCESS_64bit;
-		break;
-	default:
-		return false;
-	}
-
-	if ((region->access_flags & flags) && IS_ALIGNED(addr, len)) {
-		if (!region->bits_per_irq)
-			return true;
-
-		/* Do we access a non-allocated IRQ? */
-		return VGIC_ADDR_TO_INTID(addr, region->bits_per_irq) < nr_irqs;
-	}
-
-	return false;
-}
-
-const struct vgic_register_region *
-vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
-		     gpa_t addr, int len)
-{
-	const struct vgic_register_region *region;
-
-	region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions,
-				       addr - iodev->base_addr);
-	if (!region || !check_region(vcpu->kvm, region, addr, len))
-		return NULL;
-
-	return region;
-}
-
-static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
-			     gpa_t addr, u32 *val)
-{
-	struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
-	const struct vgic_register_region *region;
-	struct kvm_vcpu *r_vcpu;
-
-	region = vgic_get_mmio_region(vcpu, iodev, addr, sizeof(u32));
-	if (!region) {
-		*val = 0;
-		return 0;
-	}
-
-	r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu;
-	if (region->uaccess_read)
-		*val = region->uaccess_read(r_vcpu, addr, sizeof(u32));
-	else
-		*val = region->read(r_vcpu, addr, sizeof(u32));
-
-	return 0;
-}
-
-static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
-			      gpa_t addr, const u32 *val)
-{
-	struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
-	const struct vgic_register_region *region;
-	struct kvm_vcpu *r_vcpu;
-
-	region = vgic_get_mmio_region(vcpu, iodev, addr, sizeof(u32));
-	if (!region)
-		return 0;
-
-	r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu;
-	if (region->uaccess_write)
-		return region->uaccess_write(r_vcpu, addr, sizeof(u32), *val);
-
-	region->write(r_vcpu, addr, sizeof(u32), *val);
-	return 0;
-}
-
-/*
- * Userland access to VGIC registers.
- */
-int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev,
-		 bool is_write, int offset, u32 *val)
-{
-	if (is_write)
-		return vgic_uaccess_write(vcpu, &dev->dev, offset, val);
-	else
-		return vgic_uaccess_read(vcpu, &dev->dev, offset, val);
-}
-
-static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
-			      gpa_t addr, int len, void *val)
-{
-	struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
-	const struct vgic_register_region *region;
-	unsigned long data = 0;
-
-	region = vgic_get_mmio_region(vcpu, iodev, addr, len);
-	if (!region) {
-		memset(val, 0, len);
-		return 0;
-	}
-
-	switch (iodev->iodev_type) {
-	case IODEV_CPUIF:
-		data = region->read(vcpu, addr, len);
-		break;
-	case IODEV_DIST:
-		data = region->read(vcpu, addr, len);
-		break;
-	case IODEV_REDIST:
-		data = region->read(iodev->redist_vcpu, addr, len);
-		break;
-	case IODEV_ITS:
-		data = region->its_read(vcpu->kvm, iodev->its, addr, len);
-		break;
-	}
-
-	vgic_data_host_to_mmio_bus(val, len, data);
-	return 0;
-}
-
-static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
-			       gpa_t addr, int len, const void *val)
-{
-	struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
-	const struct vgic_register_region *region;
-	unsigned long data = vgic_data_mmio_bus_to_host(val, len);
-
-	region = vgic_get_mmio_region(vcpu, iodev, addr, len);
-	if (!region)
-		return 0;
-
-	switch (iodev->iodev_type) {
-	case IODEV_CPUIF:
-		region->write(vcpu, addr, len, data);
-		break;
-	case IODEV_DIST:
-		region->write(vcpu, addr, len, data);
-		break;
-	case IODEV_REDIST:
-		region->write(iodev->redist_vcpu, addr, len, data);
-		break;
-	case IODEV_ITS:
-		region->its_write(vcpu->kvm, iodev->its, addr, len, data);
-		break;
-	}
-
-	return 0;
-}
-
-struct kvm_io_device_ops kvm_io_gic_ops = {
-	.read = dispatch_mmio_read,
-	.write = dispatch_mmio_write,
-};
-
-int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
-			     enum vgic_type type)
-{
-	struct vgic_io_device *io_device = &kvm->arch.vgic.dist_iodev;
-	int ret = 0;
-	unsigned int len;
-
-	switch (type) {
-	case VGIC_V2:
-		len = vgic_v2_init_dist_iodev(io_device);
-		break;
-	case VGIC_V3:
-		len = vgic_v3_init_dist_iodev(io_device);
-		break;
-	default:
-		BUG_ON(1);
-	}
-
-	io_device->base_addr = dist_base_address;
-	io_device->iodev_type = IODEV_DIST;
-	io_device->redist_vcpu = NULL;
-
-	mutex_lock(&kvm->slots_lock);
-	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, dist_base_address,
-				      len, &io_device->dev);
-	mutex_unlock(&kvm->slots_lock);
-
-	return ret;
-}
diff --git a/virt/kvm/arm/vgic/vgic-mmio.h b/virt/kvm/arm/vgic/vgic-mmio.h
deleted file mode 100644
index a07f90acdaec..000000000000
--- a/virt/kvm/arm/vgic/vgic-mmio.h
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * Copyright (C) 2015, 2016 ARM Ltd.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-#ifndef __KVM_ARM_VGIC_MMIO_H__
-#define __KVM_ARM_VGIC_MMIO_H__
-
-struct vgic_register_region {
-	unsigned int reg_offset;
-	unsigned int len;
-	unsigned int bits_per_irq;
-	unsigned int access_flags;
-	union {
-		unsigned long (*read)(struct kvm_vcpu *vcpu, gpa_t addr,
-				      unsigned int len);
-		unsigned long (*its_read)(struct kvm *kvm, struct vgic_its *its,
-					  gpa_t addr, unsigned int len);
-	};
-	union {
-		void (*write)(struct kvm_vcpu *vcpu, gpa_t addr,
-			      unsigned int len, unsigned long val);
-		void (*its_write)(struct kvm *kvm, struct vgic_its *its,
-				  gpa_t addr, unsigned int len,
-				  unsigned long val);
-	};
-	unsigned long (*uaccess_read)(struct kvm_vcpu *vcpu, gpa_t addr,
-				      unsigned int len);
-	union {
-		int (*uaccess_write)(struct kvm_vcpu *vcpu, gpa_t addr,
-				     unsigned int len, unsigned long val);
-		int (*uaccess_its_write)(struct kvm *kvm, struct vgic_its *its,
-					 gpa_t addr, unsigned int len,
-					 unsigned long val);
-	};
-};
-
-extern struct kvm_io_device_ops kvm_io_gic_ops;
-
-#define VGIC_ACCESS_8bit	1
-#define VGIC_ACCESS_32bit	2
-#define VGIC_ACCESS_64bit	4
-
-/*
- * Generate a mask that covers the number of bytes required to address
- * up to 1024 interrupts, each represented by <bits> bits. This assumes
- * that <bits> is a power of two.
- */
-#define VGIC_ADDR_IRQ_MASK(bits) (((bits) * 1024 / 8) - 1)
-
-/*
- * (addr & mask) gives us the _byte_ offset for the INT ID.
- * We multiply this by 8 the get the _bit_ offset, then divide this by
- * the number of bits to learn the actual INT ID.
- * But instead of a division (which requires a "long long div" implementation),
- * we shift by the binary logarithm of <bits>.
- * This assumes that <bits> is a power of two.
- */
-#define VGIC_ADDR_TO_INTID(addr, bits)  (((addr) & VGIC_ADDR_IRQ_MASK(bits)) * \
-					8 >> ilog2(bits))
-
-/*
- * Some VGIC registers store per-IRQ information, with a different number
- * of bits per IRQ. For those registers this macro is used.
- * The _WITH_LENGTH version instantiates registers with a fixed length
- * and is mutually exclusive with the _PER_IRQ version.
- */
-#define REGISTER_DESC_WITH_BITS_PER_IRQ(off, rd, wr, ur, uw, bpi, acc)	\
-	{								\
-		.reg_offset = off,					\
-		.bits_per_irq = bpi,					\
-		.len = bpi * 1024 / 8,					\
-		.access_flags = acc,					\
-		.read = rd,						\
-		.write = wr,						\
-		.uaccess_read = ur,					\
-		.uaccess_write = uw,					\
-	}
-
-#define REGISTER_DESC_WITH_LENGTH(off, rd, wr, length, acc)		\
-	{								\
-		.reg_offset = off,					\
-		.bits_per_irq = 0,					\
-		.len = length,						\
-		.access_flags = acc,					\
-		.read = rd,						\
-		.write = wr,						\
-	}
-
-#define REGISTER_DESC_WITH_LENGTH_UACCESS(off, rd, wr, urd, uwr, length, acc) \
-	{								\
-		.reg_offset = off,					\
-		.bits_per_irq = 0,					\
-		.len = length,						\
-		.access_flags = acc,					\
-		.read = rd,						\
-		.write = wr,						\
-		.uaccess_read = urd,					\
-		.uaccess_write = uwr,					\
-	}
-
-int kvm_vgic_register_mmio_region(struct kvm *kvm, struct kvm_vcpu *vcpu,
-				  struct vgic_register_region *reg_desc,
-				  struct vgic_io_device *region,
-				  int nr_irqs, bool offset_private);
-
-unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len);
-
-void vgic_data_host_to_mmio_bus(void *buf, unsigned int len,
-				unsigned long data);
-
-unsigned long extract_bytes(u64 data, unsigned int offset,
-			    unsigned int num);
-
-u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len,
-		     unsigned long val);
-
-unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu,
-				 gpa_t addr, unsigned int len);
-
-unsigned long vgic_mmio_read_rao(struct kvm_vcpu *vcpu,
-				 gpa_t addr, unsigned int len);
-
-void vgic_mmio_write_wi(struct kvm_vcpu *vcpu, gpa_t addr,
-			unsigned int len, unsigned long val);
-
-int vgic_mmio_uaccess_write_wi(struct kvm_vcpu *vcpu, gpa_t addr,
-			       unsigned int len, unsigned long val);
-
-unsigned long vgic_mmio_read_group(struct kvm_vcpu *vcpu, gpa_t addr,
-				   unsigned int len);
-
-void vgic_mmio_write_group(struct kvm_vcpu *vcpu, gpa_t addr,
-			   unsigned int len, unsigned long val);
-
-unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu,
-				    gpa_t addr, unsigned int len);
-
-void vgic_mmio_write_senable(struct kvm_vcpu *vcpu,
-			     gpa_t addr, unsigned int len,
-			     unsigned long val);
-
-void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu,
-			     gpa_t addr, unsigned int len,
-			     unsigned long val);
-
-unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
-				     gpa_t addr, unsigned int len);
-
-void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
-			      gpa_t addr, unsigned int len,
-			      unsigned long val);
-
-void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
-			      gpa_t addr, unsigned int len,
-			      unsigned long val);
-
-unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
-				    gpa_t addr, unsigned int len);
-
-void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
-			     gpa_t addr, unsigned int len,
-			     unsigned long val);
-
-void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
-			     gpa_t addr, unsigned int len,
-			     unsigned long val);
-
-int vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu,
-				    gpa_t addr, unsigned int len,
-				    unsigned long val);
-
-int vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu,
-				    gpa_t addr, unsigned int len,
-				    unsigned long val);
-
-unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu,
-				      gpa_t addr, unsigned int len);
-
-void vgic_mmio_write_priority(struct kvm_vcpu *vcpu,
-			      gpa_t addr, unsigned int len,
-			      unsigned long val);
-
-unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu,
-				    gpa_t addr, unsigned int len);
-
-void vgic_mmio_write_config(struct kvm_vcpu *vcpu,
-			    gpa_t addr, unsigned int len,
-			    unsigned long val);
-
-int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev,
-		 bool is_write, int offset, u32 *val);
-
-u64 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid);
-
-void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid,
-				    const u64 val);
-
-unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev);
-
-unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev);
-
-u64 vgic_sanitise_outer_cacheability(u64 reg);
-u64 vgic_sanitise_inner_cacheability(u64 reg);
-u64 vgic_sanitise_shareability(u64 reg);
-u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift,
-			u64 (*sanitise_fn)(u64));
-
-/* Find the proper register handler entry given a certain address offset */
-const struct vgic_register_region *
-vgic_find_mmio_region(const struct vgic_register_region *regions,
-		      int nr_regions, unsigned int offset);
-
-#endif
diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c
deleted file mode 100644
index 69b892abd7dc..000000000000
--- a/virt/kvm/arm/vgic/vgic-v2.c
+++ /dev/null
@@ -1,504 +0,0 @@
-/*
- * Copyright (C) 2015, 2016 ARM Ltd.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/irqchip/arm-gic.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <kvm/arm_vgic.h>
-#include <asm/kvm_mmu.h>
-
-#include "vgic.h"
-
-static inline void vgic_v2_write_lr(int lr, u32 val)
-{
-	void __iomem *base = kvm_vgic_global_state.vctrl_base;
-
-	writel_relaxed(val, base + GICH_LR0 + (lr * 4));
-}
-
-void vgic_v2_init_lrs(void)
-{
-	int i;
-
-	for (i = 0; i < kvm_vgic_global_state.nr_lr; i++)
-		vgic_v2_write_lr(i, 0);
-}
-
-void vgic_v2_set_underflow(struct kvm_vcpu *vcpu)
-{
-	struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2;
-
-	cpuif->vgic_hcr |= GICH_HCR_UIE;
-}
-
-static bool lr_signals_eoi_mi(u32 lr_val)
-{
-	return !(lr_val & GICH_LR_STATE) && (lr_val & GICH_LR_EOI) &&
-	       !(lr_val & GICH_LR_HW);
-}
-
-/*
- * transfer the content of the LRs back into the corresponding ap_list:
- * - active bit is transferred as is
- * - pending bit is
- *   - transferred as is in case of edge sensitive IRQs
- *   - set to the line-level (resample time) for level sensitive IRQs
- */
-void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
-{
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-	struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2;
-	int lr;
-
-	DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
-
-	cpuif->vgic_hcr &= ~GICH_HCR_UIE;
-
-	for (lr = 0; lr < vgic_cpu->used_lrs; lr++) {
-		u32 val = cpuif->vgic_lr[lr];
-		u32 cpuid, intid = val & GICH_LR_VIRTUALID;
-		struct vgic_irq *irq;
-
-		/* Extract the source vCPU id from the LR */
-		cpuid = val & GICH_LR_PHYSID_CPUID;
-		cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT;
-		cpuid &= 7;
-
-		/* Notify fds when the guest EOI'ed a level-triggered SPI */
-		if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid))
-			kvm_notify_acked_irq(vcpu->kvm, 0,
-					     intid - VGIC_NR_PRIVATE_IRQS);
-
-		irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
-
-		spin_lock(&irq->irq_lock);
-
-		/* Always preserve the active bit */
-		irq->active = !!(val & GICH_LR_ACTIVE_BIT);
-
-		if (irq->active && vgic_irq_is_sgi(intid))
-			irq->active_source = cpuid;
-
-		/* Edge is the only case where we preserve the pending bit */
-		if (irq->config == VGIC_CONFIG_EDGE &&
-		    (val & GICH_LR_PENDING_BIT)) {
-			irq->pending_latch = true;
-
-			if (vgic_irq_is_sgi(intid))
-				irq->source |= (1 << cpuid);
-		}
-
-		/*
-		 * Clear soft pending state when level irqs have been acked.
-		 */
-		if (irq->config == VGIC_CONFIG_LEVEL && !(val & GICH_LR_STATE))
-			irq->pending_latch = false;
-
-		/*
-		 * Level-triggered mapped IRQs are special because we only
-		 * observe rising edges as input to the VGIC.
-		 *
-		 * If the guest never acked the interrupt we have to sample
-		 * the physical line and set the line level, because the
-		 * device state could have changed or we simply need to
-		 * process the still pending interrupt later.
-		 *
-		 * If this causes us to lower the level, we have to also clear
-		 * the physical active state, since we will otherwise never be
-		 * told when the interrupt becomes asserted again.
-		 */
-		if (vgic_irq_is_mapped_level(irq) && (val & GICH_LR_PENDING_BIT)) {
-			irq->line_level = vgic_get_phys_line_level(irq);
-
-			if (!irq->line_level)
-				vgic_irq_set_phys_active(irq, false);
-		}
-
-		spin_unlock(&irq->irq_lock);
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-
-	vgic_cpu->used_lrs = 0;
-}
-
-/*
- * Populates the particular LR with the state of a given IRQ:
- * - for an edge sensitive IRQ the pending state is cleared in struct vgic_irq
- * - for a level sensitive IRQ the pending state value is unchanged;
- *   it is dictated directly by the input level
- *
- * If @irq describes an SGI with multiple sources, we choose the
- * lowest-numbered source VCPU and clear that bit in the source bitmap.
- *
- * The irq_lock must be held by the caller.
- */
-void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
-{
-	u32 val = irq->intid;
-	bool allow_pending = true;
-
-	if (irq->active) {
-		val |= GICH_LR_ACTIVE_BIT;
-		if (vgic_irq_is_sgi(irq->intid))
-			val |= irq->active_source << GICH_LR_PHYSID_CPUID_SHIFT;
-		if (vgic_irq_is_multi_sgi(irq)) {
-			allow_pending = false;
-			val |= GICH_LR_EOI;
-		}
-	}
-
-	if (irq->group)
-		val |= GICH_LR_GROUP1;
-
-	if (irq->hw) {
-		val |= GICH_LR_HW;
-		val |= irq->hwintid << GICH_LR_PHYSID_CPUID_SHIFT;
-		/*
-		 * Never set pending+active on a HW interrupt, as the
-		 * pending state is kept at the physical distributor
-		 * level.
-		 */
-		if (irq->active)
-			allow_pending = false;
-	} else {
-		if (irq->config == VGIC_CONFIG_LEVEL) {
-			val |= GICH_LR_EOI;
-
-			/*
-			 * Software resampling doesn't work very well
-			 * if we allow P+A, so let's not do that.
-			 */
-			if (irq->active)
-				allow_pending = false;
-		}
-	}
-
-	if (allow_pending && irq_is_pending(irq)) {
-		val |= GICH_LR_PENDING_BIT;
-
-		if (irq->config == VGIC_CONFIG_EDGE)
-			irq->pending_latch = false;
-
-		if (vgic_irq_is_sgi(irq->intid)) {
-			u32 src = ffs(irq->source);
-
-			BUG_ON(!src);
-			val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT;
-			irq->source &= ~(1 << (src - 1));
-			if (irq->source) {
-				irq->pending_latch = true;
-				val |= GICH_LR_EOI;
-			}
-		}
-	}
-
-	/*
-	 * Level-triggered mapped IRQs are special because we only observe
-	 * rising edges as input to the VGIC.  We therefore lower the line
-	 * level here, so that we can take new virtual IRQs.  See
-	 * vgic_v2_fold_lr_state for more info.
-	 */
-	if (vgic_irq_is_mapped_level(irq) && (val & GICH_LR_PENDING_BIT))
-		irq->line_level = false;
-
-	/* The GICv2 LR only holds five bits of priority. */
-	val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT;
-
-	vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = val;
-}
-
-void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr)
-{
-	vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = 0;
-}
-
-void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
-{
-	struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-	u32 vmcr;
-
-	vmcr = (vmcrp->grpen0 << GICH_VMCR_ENABLE_GRP0_SHIFT) &
-		GICH_VMCR_ENABLE_GRP0_MASK;
-	vmcr |= (vmcrp->grpen1 << GICH_VMCR_ENABLE_GRP1_SHIFT) &
-		GICH_VMCR_ENABLE_GRP1_MASK;
-	vmcr |= (vmcrp->ackctl << GICH_VMCR_ACK_CTL_SHIFT) &
-		GICH_VMCR_ACK_CTL_MASK;
-	vmcr |= (vmcrp->fiqen << GICH_VMCR_FIQ_EN_SHIFT) &
-		GICH_VMCR_FIQ_EN_MASK;
-	vmcr |= (vmcrp->cbpr << GICH_VMCR_CBPR_SHIFT) &
-		GICH_VMCR_CBPR_MASK;
-	vmcr |= (vmcrp->eoim << GICH_VMCR_EOI_MODE_SHIFT) &
-		GICH_VMCR_EOI_MODE_MASK;
-	vmcr |= (vmcrp->abpr << GICH_VMCR_ALIAS_BINPOINT_SHIFT) &
-		GICH_VMCR_ALIAS_BINPOINT_MASK;
-	vmcr |= (vmcrp->bpr << GICH_VMCR_BINPOINT_SHIFT) &
-		GICH_VMCR_BINPOINT_MASK;
-	vmcr |= ((vmcrp->pmr >> GICV_PMR_PRIORITY_SHIFT) <<
-		 GICH_VMCR_PRIMASK_SHIFT) & GICH_VMCR_PRIMASK_MASK;
-
-	cpu_if->vgic_vmcr = vmcr;
-}
-
-void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
-{
-	struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-	u32 vmcr;
-
-	vmcr = cpu_if->vgic_vmcr;
-
-	vmcrp->grpen0 = (vmcr & GICH_VMCR_ENABLE_GRP0_MASK) >>
-		GICH_VMCR_ENABLE_GRP0_SHIFT;
-	vmcrp->grpen1 = (vmcr & GICH_VMCR_ENABLE_GRP1_MASK) >>
-		GICH_VMCR_ENABLE_GRP1_SHIFT;
-	vmcrp->ackctl = (vmcr & GICH_VMCR_ACK_CTL_MASK) >>
-		GICH_VMCR_ACK_CTL_SHIFT;
-	vmcrp->fiqen = (vmcr & GICH_VMCR_FIQ_EN_MASK) >>
-		GICH_VMCR_FIQ_EN_SHIFT;
-	vmcrp->cbpr = (vmcr & GICH_VMCR_CBPR_MASK) >>
-		GICH_VMCR_CBPR_SHIFT;
-	vmcrp->eoim = (vmcr & GICH_VMCR_EOI_MODE_MASK) >>
-		GICH_VMCR_EOI_MODE_SHIFT;
-
-	vmcrp->abpr = (vmcr & GICH_VMCR_ALIAS_BINPOINT_MASK) >>
-			GICH_VMCR_ALIAS_BINPOINT_SHIFT;
-	vmcrp->bpr  = (vmcr & GICH_VMCR_BINPOINT_MASK) >>
-			GICH_VMCR_BINPOINT_SHIFT;
-	vmcrp->pmr  = ((vmcr & GICH_VMCR_PRIMASK_MASK) >>
-			GICH_VMCR_PRIMASK_SHIFT) << GICV_PMR_PRIORITY_SHIFT;
-}
-
-void vgic_v2_enable(struct kvm_vcpu *vcpu)
-{
-	/*
-	 * By forcing VMCR to zero, the GIC will restore the binary
-	 * points to their reset values. Anything else resets to zero
-	 * anyway.
-	 */
-	vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0;
-
-	/* Get the show on the road... */
-	vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN;
-}
-
-/* check for overlapping regions and for regions crossing the end of memory */
-static bool vgic_v2_check_base(gpa_t dist_base, gpa_t cpu_base)
-{
-	if (dist_base + KVM_VGIC_V2_DIST_SIZE < dist_base)
-		return false;
-	if (cpu_base + KVM_VGIC_V2_CPU_SIZE < cpu_base)
-		return false;
-
-	if (dist_base + KVM_VGIC_V2_DIST_SIZE <= cpu_base)
-		return true;
-	if (cpu_base + KVM_VGIC_V2_CPU_SIZE <= dist_base)
-		return true;
-
-	return false;
-}
-
-int vgic_v2_map_resources(struct kvm *kvm)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	int ret = 0;
-
-	if (vgic_ready(kvm))
-		goto out;
-
-	if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
-	    IS_VGIC_ADDR_UNDEF(dist->vgic_cpu_base)) {
-		kvm_err("Need to set vgic cpu and dist addresses first\n");
-		ret = -ENXIO;
-		goto out;
-	}
-
-	if (!vgic_v2_check_base(dist->vgic_dist_base, dist->vgic_cpu_base)) {
-		kvm_err("VGIC CPU and dist frames overlap\n");
-		ret = -EINVAL;
-		goto out;
-	}
-
-	/*
-	 * Initialize the vgic if this hasn't already been done on demand by
-	 * accessing the vgic state from userspace.
-	 */
-	ret = vgic_init(kvm);
-	if (ret) {
-		kvm_err("Unable to initialize VGIC dynamic data structures\n");
-		goto out;
-	}
-
-	ret = vgic_register_dist_iodev(kvm, dist->vgic_dist_base, VGIC_V2);
-	if (ret) {
-		kvm_err("Unable to register VGIC MMIO regions\n");
-		goto out;
-	}
-
-	if (!static_branch_unlikely(&vgic_v2_cpuif_trap)) {
-		ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base,
-					    kvm_vgic_global_state.vcpu_base,
-					    KVM_VGIC_V2_CPU_SIZE, true);
-		if (ret) {
-			kvm_err("Unable to remap VGIC CPU to VCPU\n");
-			goto out;
-		}
-	}
-
-	dist->ready = true;
-
-out:
-	return ret;
-}
-
-DEFINE_STATIC_KEY_FALSE(vgic_v2_cpuif_trap);
-
-/**
- * vgic_v2_probe - probe for a GICv2 compatible interrupt controller in DT
- * @node:	pointer to the DT node
- *
- * Returns 0 if a GICv2 has been found, returns an error code otherwise
- */
-int vgic_v2_probe(const struct gic_kvm_info *info)
-{
-	int ret;
-	u32 vtr;
-
-	if (!info->vctrl.start) {
-		kvm_err("GICH not present in the firmware table\n");
-		return -ENXIO;
-	}
-
-	if (!PAGE_ALIGNED(info->vcpu.start) ||
-	    !PAGE_ALIGNED(resource_size(&info->vcpu))) {
-		kvm_info("GICV region size/alignment is unsafe, using trapping (reduced performance)\n");
-
-		ret = create_hyp_io_mappings(info->vcpu.start,
-					     resource_size(&info->vcpu),
-					     &kvm_vgic_global_state.vcpu_base_va,
-					     &kvm_vgic_global_state.vcpu_hyp_va);
-		if (ret) {
-			kvm_err("Cannot map GICV into hyp\n");
-			goto out;
-		}
-
-		static_branch_enable(&vgic_v2_cpuif_trap);
-	}
-
-	ret = create_hyp_io_mappings(info->vctrl.start,
-				     resource_size(&info->vctrl),
-				     &kvm_vgic_global_state.vctrl_base,
-				     &kvm_vgic_global_state.vctrl_hyp);
-	if (ret) {
-		kvm_err("Cannot map VCTRL into hyp\n");
-		goto out;
-	}
-
-	vtr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VTR);
-	kvm_vgic_global_state.nr_lr = (vtr & 0x3f) + 1;
-
-	ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
-	if (ret) {
-		kvm_err("Cannot register GICv2 KVM device\n");
-		goto out;
-	}
-
-	kvm_vgic_global_state.can_emulate_gicv2 = true;
-	kvm_vgic_global_state.vcpu_base = info->vcpu.start;
-	kvm_vgic_global_state.type = VGIC_V2;
-	kvm_vgic_global_state.max_gic_vcpus = VGIC_V2_MAX_CPUS;
-
-	kvm_debug("vgic-v2@%llx\n", info->vctrl.start);
-
-	return 0;
-out:
-	if (kvm_vgic_global_state.vctrl_base)
-		iounmap(kvm_vgic_global_state.vctrl_base);
-	if (kvm_vgic_global_state.vcpu_base_va)
-		iounmap(kvm_vgic_global_state.vcpu_base_va);
-
-	return ret;
-}
-
-static void save_lrs(struct kvm_vcpu *vcpu, void __iomem *base)
-{
-	struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-	u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
-	u64 elrsr;
-	int i;
-
-	elrsr = readl_relaxed(base + GICH_ELRSR0);
-	if (unlikely(used_lrs > 32))
-		elrsr |= ((u64)readl_relaxed(base + GICH_ELRSR1)) << 32;
-
-	for (i = 0; i < used_lrs; i++) {
-		if (elrsr & (1UL << i))
-			cpu_if->vgic_lr[i] &= ~GICH_LR_STATE;
-		else
-			cpu_if->vgic_lr[i] = readl_relaxed(base + GICH_LR0 + (i * 4));
-
-		writel_relaxed(0, base + GICH_LR0 + (i * 4));
-	}
-}
-
-void vgic_v2_save_state(struct kvm_vcpu *vcpu)
-{
-	void __iomem *base = kvm_vgic_global_state.vctrl_base;
-	u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
-
-	if (!base)
-		return;
-
-	if (used_lrs) {
-		save_lrs(vcpu, base);
-		writel_relaxed(0, base + GICH_HCR);
-	}
-}
-
-void vgic_v2_restore_state(struct kvm_vcpu *vcpu)
-{
-	struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-	void __iomem *base = kvm_vgic_global_state.vctrl_base;
-	u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
-	int i;
-
-	if (!base)
-		return;
-
-	if (used_lrs) {
-		writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR);
-		for (i = 0; i < used_lrs; i++) {
-			writel_relaxed(cpu_if->vgic_lr[i],
-				       base + GICH_LR0 + (i * 4));
-		}
-	}
-}
-
-void vgic_v2_load(struct kvm_vcpu *vcpu)
-{
-	struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-
-	writel_relaxed(cpu_if->vgic_vmcr,
-		       kvm_vgic_global_state.vctrl_base + GICH_VMCR);
-	writel_relaxed(cpu_if->vgic_apr,
-		       kvm_vgic_global_state.vctrl_base + GICH_APR);
-}
-
-void vgic_v2_put(struct kvm_vcpu *vcpu)
-{
-	struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-
-	cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR);
-	cpu_if->vgic_apr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_APR);
-}
diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c
deleted file mode 100644
index 9c0dd234ebe8..000000000000
--- a/virt/kvm/arm/vgic/vgic-v3.c
+++ /dev/null
@@ -1,688 +0,0 @@
-/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/irqchip/arm-gic-v3.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <kvm/arm_vgic.h>
-#include <asm/kvm_hyp.h>
-#include <asm/kvm_mmu.h>
-#include <asm/kvm_asm.h>
-
-#include "vgic.h"
-
-static bool group0_trap;
-static bool group1_trap;
-static bool common_trap;
-static bool gicv4_enable;
-
-void vgic_v3_set_underflow(struct kvm_vcpu *vcpu)
-{
-	struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3;
-
-	cpuif->vgic_hcr |= ICH_HCR_UIE;
-}
-
-static bool lr_signals_eoi_mi(u64 lr_val)
-{
-	return !(lr_val & ICH_LR_STATE) && (lr_val & ICH_LR_EOI) &&
-	       !(lr_val & ICH_LR_HW);
-}
-
-void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
-{
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-	struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3;
-	u32 model = vcpu->kvm->arch.vgic.vgic_model;
-	int lr;
-
-	DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
-
-	cpuif->vgic_hcr &= ~ICH_HCR_UIE;
-
-	for (lr = 0; lr < vgic_cpu->used_lrs; lr++) {
-		u64 val = cpuif->vgic_lr[lr];
-		u32 intid, cpuid;
-		struct vgic_irq *irq;
-		bool is_v2_sgi = false;
-
-		cpuid = val & GICH_LR_PHYSID_CPUID;
-		cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT;
-
-		if (model == KVM_DEV_TYPE_ARM_VGIC_V3) {
-			intid = val & ICH_LR_VIRTUAL_ID_MASK;
-		} else {
-			intid = val & GICH_LR_VIRTUALID;
-			is_v2_sgi = vgic_irq_is_sgi(intid);
-		}
-
-		/* Notify fds when the guest EOI'ed a level-triggered IRQ */
-		if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid))
-			kvm_notify_acked_irq(vcpu->kvm, 0,
-					     intid - VGIC_NR_PRIVATE_IRQS);
-
-		irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
-		if (!irq)	/* An LPI could have been unmapped. */
-			continue;
-
-		spin_lock(&irq->irq_lock);
-
-		/* Always preserve the active bit */
-		irq->active = !!(val & ICH_LR_ACTIVE_BIT);
-
-		if (irq->active && is_v2_sgi)
-			irq->active_source = cpuid;
-
-		/* Edge is the only case where we preserve the pending bit */
-		if (irq->config == VGIC_CONFIG_EDGE &&
-		    (val & ICH_LR_PENDING_BIT)) {
-			irq->pending_latch = true;
-
-			if (is_v2_sgi)
-				irq->source |= (1 << cpuid);
-		}
-
-		/*
-		 * Clear soft pending state when level irqs have been acked.
-		 */
-		if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE))
-			irq->pending_latch = false;
-
-		/*
-		 * Level-triggered mapped IRQs are special because we only
-		 * observe rising edges as input to the VGIC.
-		 *
-		 * If the guest never acked the interrupt we have to sample
-		 * the physical line and set the line level, because the
-		 * device state could have changed or we simply need to
-		 * process the still pending interrupt later.
-		 *
-		 * If this causes us to lower the level, we have to also clear
-		 * the physical active state, since we will otherwise never be
-		 * told when the interrupt becomes asserted again.
-		 */
-		if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT)) {
-			irq->line_level = vgic_get_phys_line_level(irq);
-
-			if (!irq->line_level)
-				vgic_irq_set_phys_active(irq, false);
-		}
-
-		spin_unlock(&irq->irq_lock);
-		vgic_put_irq(vcpu->kvm, irq);
-	}
-
-	vgic_cpu->used_lrs = 0;
-}
-
-/* Requires the irq to be locked already */
-void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
-{
-	u32 model = vcpu->kvm->arch.vgic.vgic_model;
-	u64 val = irq->intid;
-	bool allow_pending = true, is_v2_sgi;
-
-	is_v2_sgi = (vgic_irq_is_sgi(irq->intid) &&
-		     model == KVM_DEV_TYPE_ARM_VGIC_V2);
-
-	if (irq->active) {
-		val |= ICH_LR_ACTIVE_BIT;
-		if (is_v2_sgi)
-			val |= irq->active_source << GICH_LR_PHYSID_CPUID_SHIFT;
-		if (vgic_irq_is_multi_sgi(irq)) {
-			allow_pending = false;
-			val |= ICH_LR_EOI;
-		}
-	}
-
-	if (irq->hw) {
-		val |= ICH_LR_HW;
-		val |= ((u64)irq->hwintid) << ICH_LR_PHYS_ID_SHIFT;
-		/*
-		 * Never set pending+active on a HW interrupt, as the
-		 * pending state is kept at the physical distributor
-		 * level.
-		 */
-		if (irq->active)
-			allow_pending = false;
-	} else {
-		if (irq->config == VGIC_CONFIG_LEVEL) {
-			val |= ICH_LR_EOI;
-
-			/*
-			 * Software resampling doesn't work very well
-			 * if we allow P+A, so let's not do that.
-			 */
-			if (irq->active)
-				allow_pending = false;
-		}
-	}
-
-	if (allow_pending && irq_is_pending(irq)) {
-		val |= ICH_LR_PENDING_BIT;
-
-		if (irq->config == VGIC_CONFIG_EDGE)
-			irq->pending_latch = false;
-
-		if (vgic_irq_is_sgi(irq->intid) &&
-		    model == KVM_DEV_TYPE_ARM_VGIC_V2) {
-			u32 src = ffs(irq->source);
-
-			BUG_ON(!src);
-			val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT;
-			irq->source &= ~(1 << (src - 1));
-			if (irq->source) {
-				irq->pending_latch = true;
-				val |= ICH_LR_EOI;
-			}
-		}
-	}
-
-	/*
-	 * Level-triggered mapped IRQs are special because we only observe
-	 * rising edges as input to the VGIC.  We therefore lower the line
-	 * level here, so that we can take new virtual IRQs.  See
-	 * vgic_v3_fold_lr_state for more info.
-	 */
-	if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT))
-		irq->line_level = false;
-
-	if (irq->group)
-		val |= ICH_LR_GROUP;
-
-	val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT;
-
-	vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val;
-}
-
-void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr)
-{
-	vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = 0;
-}
-
-void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
-{
-	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-	u32 model = vcpu->kvm->arch.vgic.vgic_model;
-	u32 vmcr;
-
-	if (model == KVM_DEV_TYPE_ARM_VGIC_V2) {
-		vmcr = (vmcrp->ackctl << ICH_VMCR_ACK_CTL_SHIFT) &
-			ICH_VMCR_ACK_CTL_MASK;
-		vmcr |= (vmcrp->fiqen << ICH_VMCR_FIQ_EN_SHIFT) &
-			ICH_VMCR_FIQ_EN_MASK;
-	} else {
-		/*
-		 * When emulating GICv3 on GICv3 with SRE=1 on the
-		 * VFIQEn bit is RES1 and the VAckCtl bit is RES0.
-		 */
-		vmcr = ICH_VMCR_FIQ_EN_MASK;
-	}
-
-	vmcr |= (vmcrp->cbpr << ICH_VMCR_CBPR_SHIFT) & ICH_VMCR_CBPR_MASK;
-	vmcr |= (vmcrp->eoim << ICH_VMCR_EOIM_SHIFT) & ICH_VMCR_EOIM_MASK;
-	vmcr |= (vmcrp->abpr << ICH_VMCR_BPR1_SHIFT) & ICH_VMCR_BPR1_MASK;
-	vmcr |= (vmcrp->bpr << ICH_VMCR_BPR0_SHIFT) & ICH_VMCR_BPR0_MASK;
-	vmcr |= (vmcrp->pmr << ICH_VMCR_PMR_SHIFT) & ICH_VMCR_PMR_MASK;
-	vmcr |= (vmcrp->grpen0 << ICH_VMCR_ENG0_SHIFT) & ICH_VMCR_ENG0_MASK;
-	vmcr |= (vmcrp->grpen1 << ICH_VMCR_ENG1_SHIFT) & ICH_VMCR_ENG1_MASK;
-
-	cpu_if->vgic_vmcr = vmcr;
-}
-
-void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
-{
-	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-	u32 model = vcpu->kvm->arch.vgic.vgic_model;
-	u32 vmcr;
-
-	vmcr = cpu_if->vgic_vmcr;
-
-	if (model == KVM_DEV_TYPE_ARM_VGIC_V2) {
-		vmcrp->ackctl = (vmcr & ICH_VMCR_ACK_CTL_MASK) >>
-			ICH_VMCR_ACK_CTL_SHIFT;
-		vmcrp->fiqen = (vmcr & ICH_VMCR_FIQ_EN_MASK) >>
-			ICH_VMCR_FIQ_EN_SHIFT;
-	} else {
-		/*
-		 * When emulating GICv3 on GICv3 with SRE=1 on the
-		 * VFIQEn bit is RES1 and the VAckCtl bit is RES0.
-		 */
-		vmcrp->fiqen = 1;
-		vmcrp->ackctl = 0;
-	}
-
-	vmcrp->cbpr = (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT;
-	vmcrp->eoim = (vmcr & ICH_VMCR_EOIM_MASK) >> ICH_VMCR_EOIM_SHIFT;
-	vmcrp->abpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT;
-	vmcrp->bpr  = (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT;
-	vmcrp->pmr  = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT;
-	vmcrp->grpen0 = (vmcr & ICH_VMCR_ENG0_MASK) >> ICH_VMCR_ENG0_SHIFT;
-	vmcrp->grpen1 = (vmcr & ICH_VMCR_ENG1_MASK) >> ICH_VMCR_ENG1_SHIFT;
-}
-
-#define INITIAL_PENDBASER_VALUE						  \
-	(GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWb)		| \
-	GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, SameAsInner)	| \
-	GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable))
-
-void vgic_v3_enable(struct kvm_vcpu *vcpu)
-{
-	struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3;
-
-	/*
-	 * By forcing VMCR to zero, the GIC will restore the binary
-	 * points to their reset values. Anything else resets to zero
-	 * anyway.
-	 */
-	vgic_v3->vgic_vmcr = 0;
-
-	/*
-	 * If we are emulating a GICv3, we do it in an non-GICv2-compatible
-	 * way, so we force SRE to 1 to demonstrate this to the guest.
-	 * Also, we don't support any form of IRQ/FIQ bypass.
-	 * This goes with the spec allowing the value to be RAO/WI.
-	 */
-	if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
-		vgic_v3->vgic_sre = (ICC_SRE_EL1_DIB |
-				     ICC_SRE_EL1_DFB |
-				     ICC_SRE_EL1_SRE);
-		vcpu->arch.vgic_cpu.pendbaser = INITIAL_PENDBASER_VALUE;
-	} else {
-		vgic_v3->vgic_sre = 0;
-	}
-
-	vcpu->arch.vgic_cpu.num_id_bits = (kvm_vgic_global_state.ich_vtr_el2 &
-					   ICH_VTR_ID_BITS_MASK) >>
-					   ICH_VTR_ID_BITS_SHIFT;
-	vcpu->arch.vgic_cpu.num_pri_bits = ((kvm_vgic_global_state.ich_vtr_el2 &
-					    ICH_VTR_PRI_BITS_MASK) >>
-					    ICH_VTR_PRI_BITS_SHIFT) + 1;
-
-	/* Get the show on the road... */
-	vgic_v3->vgic_hcr = ICH_HCR_EN;
-	if (group0_trap)
-		vgic_v3->vgic_hcr |= ICH_HCR_TALL0;
-	if (group1_trap)
-		vgic_v3->vgic_hcr |= ICH_HCR_TALL1;
-	if (common_trap)
-		vgic_v3->vgic_hcr |= ICH_HCR_TC;
-}
-
-int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq)
-{
-	struct kvm_vcpu *vcpu;
-	int byte_offset, bit_nr;
-	gpa_t pendbase, ptr;
-	bool status;
-	u8 val;
-	int ret;
-	unsigned long flags;
-
-retry:
-	vcpu = irq->target_vcpu;
-	if (!vcpu)
-		return 0;
-
-	pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser);
-
-	byte_offset = irq->intid / BITS_PER_BYTE;
-	bit_nr = irq->intid % BITS_PER_BYTE;
-	ptr = pendbase + byte_offset;
-
-	ret = kvm_read_guest_lock(kvm, ptr, &val, 1);
-	if (ret)
-		return ret;
-
-	status = val & (1 << bit_nr);
-
-	spin_lock_irqsave(&irq->irq_lock, flags);
-	if (irq->target_vcpu != vcpu) {
-		spin_unlock_irqrestore(&irq->irq_lock, flags);
-		goto retry;
-	}
-	irq->pending_latch = status;
-	vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
-
-	if (status) {
-		/* clear consumed data */
-		val &= ~(1 << bit_nr);
-		ret = kvm_write_guest(kvm, ptr, &val, 1);
-		if (ret)
-			return ret;
-	}
-	return 0;
-}
-
-/**
- * vgic_its_save_pending_tables - Save the pending tables into guest RAM
- * kvm lock and all vcpu lock must be held
- */
-int vgic_v3_save_pending_tables(struct kvm *kvm)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	int last_byte_offset = -1;
-	struct vgic_irq *irq;
-	int ret;
-	u8 val;
-
-	list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
-		int byte_offset, bit_nr;
-		struct kvm_vcpu *vcpu;
-		gpa_t pendbase, ptr;
-		bool stored;
-
-		vcpu = irq->target_vcpu;
-		if (!vcpu)
-			continue;
-
-		pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser);
-
-		byte_offset = irq->intid / BITS_PER_BYTE;
-		bit_nr = irq->intid % BITS_PER_BYTE;
-		ptr = pendbase + byte_offset;
-
-		if (byte_offset != last_byte_offset) {
-			ret = kvm_read_guest_lock(kvm, ptr, &val, 1);
-			if (ret)
-				return ret;
-			last_byte_offset = byte_offset;
-		}
-
-		stored = val & (1U << bit_nr);
-		if (stored == irq->pending_latch)
-			continue;
-
-		if (irq->pending_latch)
-			val |= 1 << bit_nr;
-		else
-			val &= ~(1 << bit_nr);
-
-		ret = kvm_write_guest(kvm, ptr, &val, 1);
-		if (ret)
-			return ret;
-	}
-	return 0;
-}
-
-/**
- * vgic_v3_rdist_overlap - check if a region overlaps with any
- * existing redistributor region
- *
- * @kvm: kvm handle
- * @base: base of the region
- * @size: size of region
- *
- * Return: true if there is an overlap
- */
-bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size)
-{
-	struct vgic_dist *d = &kvm->arch.vgic;
-	struct vgic_redist_region *rdreg;
-
-	list_for_each_entry(rdreg, &d->rd_regions, list) {
-		if ((base + size > rdreg->base) &&
-			(base < rdreg->base + vgic_v3_rd_region_size(kvm, rdreg)))
-			return true;
-	}
-	return false;
-}
-
-/*
- * Check for overlapping regions and for regions crossing the end of memory
- * for base addresses which have already been set.
- */
-bool vgic_v3_check_base(struct kvm *kvm)
-{
-	struct vgic_dist *d = &kvm->arch.vgic;
-	struct vgic_redist_region *rdreg;
-
-	if (!IS_VGIC_ADDR_UNDEF(d->vgic_dist_base) &&
-	    d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE < d->vgic_dist_base)
-		return false;
-
-	list_for_each_entry(rdreg, &d->rd_regions, list) {
-		if (rdreg->base + vgic_v3_rd_region_size(kvm, rdreg) <
-			rdreg->base)
-			return false;
-	}
-
-	if (IS_VGIC_ADDR_UNDEF(d->vgic_dist_base))
-		return true;
-
-	return !vgic_v3_rdist_overlap(kvm, d->vgic_dist_base,
-				      KVM_VGIC_V3_DIST_SIZE);
-}
-
-/**
- * vgic_v3_rdist_free_slot - Look up registered rdist regions and identify one
- * which has free space to put a new rdist region.
- *
- * @rd_regions: redistributor region list head
- *
- * A redistributor regions maps n redistributors, n = region size / (2 x 64kB).
- * Stride between redistributors is 0 and regions are filled in the index order.
- *
- * Return: the redist region handle, if any, that has space to map a new rdist
- * region.
- */
-struct vgic_redist_region *vgic_v3_rdist_free_slot(struct list_head *rd_regions)
-{
-	struct vgic_redist_region *rdreg;
-
-	list_for_each_entry(rdreg, rd_regions, list) {
-		if (!vgic_v3_redist_region_full(rdreg))
-			return rdreg;
-	}
-	return NULL;
-}
-
-struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm,
-							   u32 index)
-{
-	struct list_head *rd_regions = &kvm->arch.vgic.rd_regions;
-	struct vgic_redist_region *rdreg;
-
-	list_for_each_entry(rdreg, rd_regions, list) {
-		if (rdreg->index == index)
-			return rdreg;
-	}
-	return NULL;
-}
-
-
-int vgic_v3_map_resources(struct kvm *kvm)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	struct kvm_vcpu *vcpu;
-	int ret = 0;
-	int c;
-
-	if (vgic_ready(kvm))
-		goto out;
-
-	kvm_for_each_vcpu(c, vcpu, kvm) {
-		struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-
-		if (IS_VGIC_ADDR_UNDEF(vgic_cpu->rd_iodev.base_addr)) {
-			kvm_debug("vcpu %d redistributor base not set\n", c);
-			ret = -ENXIO;
-			goto out;
-		}
-	}
-
-	if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base)) {
-		kvm_err("Need to set vgic distributor addresses first\n");
-		ret = -ENXIO;
-		goto out;
-	}
-
-	if (!vgic_v3_check_base(kvm)) {
-		kvm_err("VGIC redist and dist frames overlap\n");
-		ret = -EINVAL;
-		goto out;
-	}
-
-	/*
-	 * For a VGICv3 we require the userland to explicitly initialize
-	 * the VGIC before we need to use it.
-	 */
-	if (!vgic_initialized(kvm)) {
-		ret = -EBUSY;
-		goto out;
-	}
-
-	ret = vgic_register_dist_iodev(kvm, dist->vgic_dist_base, VGIC_V3);
-	if (ret) {
-		kvm_err("Unable to register VGICv3 dist MMIO regions\n");
-		goto out;
-	}
-
-	dist->ready = true;
-
-out:
-	return ret;
-}
-
-DEFINE_STATIC_KEY_FALSE(vgic_v3_cpuif_trap);
-
-static int __init early_group0_trap_cfg(char *buf)
-{
-	return strtobool(buf, &group0_trap);
-}
-early_param("kvm-arm.vgic_v3_group0_trap", early_group0_trap_cfg);
-
-static int __init early_group1_trap_cfg(char *buf)
-{
-	return strtobool(buf, &group1_trap);
-}
-early_param("kvm-arm.vgic_v3_group1_trap", early_group1_trap_cfg);
-
-static int __init early_common_trap_cfg(char *buf)
-{
-	return strtobool(buf, &common_trap);
-}
-early_param("kvm-arm.vgic_v3_common_trap", early_common_trap_cfg);
-
-static int __init early_gicv4_enable(char *buf)
-{
-	return strtobool(buf, &gicv4_enable);
-}
-early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable);
-
-/**
- * vgic_v3_probe - probe for a GICv3 compatible interrupt controller in DT
- * @node:	pointer to the DT node
- *
- * Returns 0 if a GICv3 has been found, returns an error code otherwise
- */
-int vgic_v3_probe(const struct gic_kvm_info *info)
-{
-	u32 ich_vtr_el2 = kvm_call_hyp(__vgic_v3_get_ich_vtr_el2);
-	int ret;
-
-	/*
-	 * The ListRegs field is 5 bits, but there is a architectural
-	 * maximum of 16 list registers. Just ignore bit 4...
-	 */
-	kvm_vgic_global_state.nr_lr = (ich_vtr_el2 & 0xf) + 1;
-	kvm_vgic_global_state.can_emulate_gicv2 = false;
-	kvm_vgic_global_state.ich_vtr_el2 = ich_vtr_el2;
-
-	/* GICv4 support? */
-	if (info->has_v4) {
-		kvm_vgic_global_state.has_gicv4 = gicv4_enable;
-		kvm_info("GICv4 support %sabled\n",
-			 gicv4_enable ? "en" : "dis");
-	}
-
-	if (!info->vcpu.start) {
-		kvm_info("GICv3: no GICV resource entry\n");
-		kvm_vgic_global_state.vcpu_base = 0;
-	} else if (!PAGE_ALIGNED(info->vcpu.start)) {
-		pr_warn("GICV physical address 0x%llx not page aligned\n",
-			(unsigned long long)info->vcpu.start);
-		kvm_vgic_global_state.vcpu_base = 0;
-	} else {
-		kvm_vgic_global_state.vcpu_base = info->vcpu.start;
-		kvm_vgic_global_state.can_emulate_gicv2 = true;
-		ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
-		if (ret) {
-			kvm_err("Cannot register GICv2 KVM device.\n");
-			return ret;
-		}
-		kvm_info("vgic-v2@%llx\n", info->vcpu.start);
-	}
-	ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3);
-	if (ret) {
-		kvm_err("Cannot register GICv3 KVM device.\n");
-		kvm_unregister_device_ops(KVM_DEV_TYPE_ARM_VGIC_V2);
-		return ret;
-	}
-
-	if (kvm_vgic_global_state.vcpu_base == 0)
-		kvm_info("disabling GICv2 emulation\n");
-
-#ifdef CONFIG_ARM64
-	if (cpus_have_const_cap(ARM64_WORKAROUND_CAVIUM_30115)) {
-		group0_trap = true;
-		group1_trap = true;
-	}
-#endif
-
-	if (group0_trap || group1_trap || common_trap) {
-		kvm_info("GICv3 sysreg trapping enabled ([%s%s%s], reduced performance)\n",
-			 group0_trap ? "G0" : "",
-			 group1_trap ? "G1" : "",
-			 common_trap ? "C"  : "");
-		static_branch_enable(&vgic_v3_cpuif_trap);
-	}
-
-	kvm_vgic_global_state.vctrl_base = NULL;
-	kvm_vgic_global_state.type = VGIC_V3;
-	kvm_vgic_global_state.max_gic_vcpus = VGIC_V3_MAX_CPUS;
-
-	return 0;
-}
-
-void vgic_v3_load(struct kvm_vcpu *vcpu)
-{
-	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-
-	/*
-	 * If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen
-	 * is dependent on ICC_SRE_EL1.SRE, and we have to perform the
-	 * VMCR_EL2 save/restore in the world switch.
-	 */
-	if (likely(cpu_if->vgic_sre))
-		kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr);
-
-	kvm_call_hyp(__vgic_v3_restore_aprs, vcpu);
-
-	if (has_vhe())
-		__vgic_v3_activate_traps(vcpu);
-}
-
-void vgic_v3_put(struct kvm_vcpu *vcpu)
-{
-	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-
-	if (likely(cpu_if->vgic_sre))
-		cpu_if->vgic_vmcr = kvm_call_hyp(__vgic_v3_read_vmcr);
-
-	kvm_call_hyp(__vgic_v3_save_aprs, vcpu);
-
-	if (has_vhe())
-		__vgic_v3_deactivate_traps(vcpu);
-}
diff --git a/virt/kvm/arm/vgic/vgic-v4.c b/virt/kvm/arm/vgic/vgic-v4.c
deleted file mode 100644
index 1ed5f2286b8e..000000000000
--- a/virt/kvm/arm/vgic/vgic-v4.c
+++ /dev/null
@@ -1,366 +0,0 @@
-/*
- * Copyright (C) 2017 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/irqdomain.h>
-#include <linux/kvm_host.h>
-#include <linux/irqchip/arm-gic-v3.h>
-
-#include "vgic.h"
-
-/*
- * How KVM uses GICv4 (insert rude comments here):
- *
- * The vgic-v4 layer acts as a bridge between several entities:
- * - The GICv4 ITS representation offered by the ITS driver
- * - VFIO, which is in charge of the PCI endpoint
- * - The virtual ITS, which is the only thing the guest sees
- *
- * The configuration of VLPIs is triggered by a callback from VFIO,
- * instructing KVM that a PCI device has been configured to deliver
- * MSIs to a vITS.
- *
- * kvm_vgic_v4_set_forwarding() is thus called with the routing entry,
- * and this is used to find the corresponding vITS data structures
- * (ITS instance, device, event and irq) using a process that is
- * extremely similar to the injection of an MSI.
- *
- * At this stage, we can link the guest's view of an LPI (uniquely
- * identified by the routing entry) and the host irq, using the GICv4
- * driver mapping operation. Should the mapping succeed, we've then
- * successfully upgraded the guest's LPI to a VLPI. We can then start
- * with updating GICv4's view of the property table and generating an
- * INValidation in order to kickstart the delivery of this VLPI to the
- * guest directly, without software intervention. Well, almost.
- *
- * When the PCI endpoint is deconfigured, this operation is reversed
- * with VFIO calling kvm_vgic_v4_unset_forwarding().
- *
- * Once the VLPI has been mapped, it needs to follow any change the
- * guest performs on its LPI through the vITS. For that, a number of
- * command handlers have hooks to communicate these changes to the HW:
- * - Any invalidation triggers a call to its_prop_update_vlpi()
- * - The INT command results in a irq_set_irqchip_state(), which
- *   generates an INT on the corresponding VLPI.
- * - The CLEAR command results in a irq_set_irqchip_state(), which
- *   generates an CLEAR on the corresponding VLPI.
- * - DISCARD translates into an unmap, similar to a call to
- *   kvm_vgic_v4_unset_forwarding().
- * - MOVI is translated by an update of the existing mapping, changing
- *   the target vcpu, resulting in a VMOVI being generated.
- * - MOVALL is translated by a string of mapping updates (similar to
- *   the handling of MOVI). MOVALL is horrible.
- *
- * Note that a DISCARD/MAPTI sequence emitted from the guest without
- * reprogramming the PCI endpoint after MAPTI does not result in a
- * VLPI being mapped, as there is no callback from VFIO (the guest
- * will get the interrupt via the normal SW injection). Fixing this is
- * not trivial, and requires some horrible messing with the VFIO
- * internals. Not fun. Don't do that.
- *
- * Then there is the scheduling. Each time a vcpu is about to run on a
- * physical CPU, KVM must tell the corresponding redistributor about
- * it. And if we've migrated our vcpu from one CPU to another, we must
- * tell the ITS (so that the messages reach the right redistributor).
- * This is done in two steps: first issue a irq_set_affinity() on the
- * irq corresponding to the vcpu, then call its_schedule_vpe(). You
- * must be in a non-preemptible context. On exit, another call to
- * its_schedule_vpe() tells the redistributor that we're done with the
- * vcpu.
- *
- * Finally, the doorbell handling: Each vcpu is allocated an interrupt
- * which will fire each time a VLPI is made pending whilst the vcpu is
- * not running. Each time the vcpu gets blocked, the doorbell
- * interrupt gets enabled. When the vcpu is unblocked (for whatever
- * reason), the doorbell interrupt is disabled.
- */
-
-#define DB_IRQ_FLAGS	(IRQ_NOAUTOEN | IRQ_DISABLE_UNLAZY | IRQ_NO_BALANCING)
-
-static irqreturn_t vgic_v4_doorbell_handler(int irq, void *info)
-{
-	struct kvm_vcpu *vcpu = info;
-
-	vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last = true;
-	kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
-	kvm_vcpu_kick(vcpu);
-
-	return IRQ_HANDLED;
-}
-
-/**
- * vgic_v4_init - Initialize the GICv4 data structures
- * @kvm:	Pointer to the VM being initialized
- *
- * We may be called each time a vITS is created, or when the
- * vgic is initialized. This relies on kvm->lock to be
- * held. In both cases, the number of vcpus should now be
- * fixed.
- */
-int vgic_v4_init(struct kvm *kvm)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	struct kvm_vcpu *vcpu;
-	int i, nr_vcpus, ret;
-
-	if (!kvm_vgic_global_state.has_gicv4)
-		return 0; /* Nothing to see here... move along. */
-
-	if (dist->its_vm.vpes)
-		return 0;
-
-	nr_vcpus = atomic_read(&kvm->online_vcpus);
-
-	dist->its_vm.vpes = kcalloc(nr_vcpus, sizeof(*dist->its_vm.vpes),
-				    GFP_KERNEL);
-	if (!dist->its_vm.vpes)
-		return -ENOMEM;
-
-	dist->its_vm.nr_vpes = nr_vcpus;
-
-	kvm_for_each_vcpu(i, vcpu, kvm)
-		dist->its_vm.vpes[i] = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
-
-	ret = its_alloc_vcpu_irqs(&dist->its_vm);
-	if (ret < 0) {
-		kvm_err("VPE IRQ allocation failure\n");
-		kfree(dist->its_vm.vpes);
-		dist->its_vm.nr_vpes = 0;
-		dist->its_vm.vpes = NULL;
-		return ret;
-	}
-
-	kvm_for_each_vcpu(i, vcpu, kvm) {
-		int irq = dist->its_vm.vpes[i]->irq;
-
-		/*
-		 * Don't automatically enable the doorbell, as we're
-		 * flipping it back and forth when the vcpu gets
-		 * blocked. Also disable the lazy disabling, as the
-		 * doorbell could kick us out of the guest too
-		 * early...
-		 */
-		irq_set_status_flags(irq, DB_IRQ_FLAGS);
-		ret = request_irq(irq, vgic_v4_doorbell_handler,
-				  0, "vcpu", vcpu);
-		if (ret) {
-			kvm_err("failed to allocate vcpu IRQ%d\n", irq);
-			/*
-			 * Trick: adjust the number of vpes so we know
-			 * how many to nuke on teardown...
-			 */
-			dist->its_vm.nr_vpes = i;
-			break;
-		}
-	}
-
-	if (ret)
-		vgic_v4_teardown(kvm);
-
-	return ret;
-}
-
-/**
- * vgic_v4_teardown - Free the GICv4 data structures
- * @kvm:	Pointer to the VM being destroyed
- *
- * Relies on kvm->lock to be held.
- */
-void vgic_v4_teardown(struct kvm *kvm)
-{
-	struct its_vm *its_vm = &kvm->arch.vgic.its_vm;
-	int i;
-
-	if (!its_vm->vpes)
-		return;
-
-	for (i = 0; i < its_vm->nr_vpes; i++) {
-		struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, i);
-		int irq = its_vm->vpes[i]->irq;
-
-		irq_clear_status_flags(irq, DB_IRQ_FLAGS);
-		free_irq(irq, vcpu);
-	}
-
-	its_free_vcpu_irqs(its_vm);
-	kfree(its_vm->vpes);
-	its_vm->nr_vpes = 0;
-	its_vm->vpes = NULL;
-}
-
-int vgic_v4_sync_hwstate(struct kvm_vcpu *vcpu)
-{
-	if (!vgic_supports_direct_msis(vcpu->kvm))
-		return 0;
-
-	return its_schedule_vpe(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe, false);
-}
-
-int vgic_v4_flush_hwstate(struct kvm_vcpu *vcpu)
-{
-	int irq = vcpu->arch.vgic_cpu.vgic_v3.its_vpe.irq;
-	int err;
-
-	if (!vgic_supports_direct_msis(vcpu->kvm))
-		return 0;
-
-	/*
-	 * Before making the VPE resident, make sure the redistributor
-	 * corresponding to our current CPU expects us here. See the
-	 * doc in drivers/irqchip/irq-gic-v4.c to understand how this
-	 * turns into a VMOVP command at the ITS level.
-	 */
-	err = irq_set_affinity(irq, cpumask_of(smp_processor_id()));
-	if (err)
-		return err;
-
-	err = its_schedule_vpe(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe, true);
-	if (err)
-		return err;
-
-	/*
-	 * Now that the VPE is resident, let's get rid of a potential
-	 * doorbell interrupt that would still be pending.
-	 */
-	err = irq_set_irqchip_state(irq, IRQCHIP_STATE_PENDING, false);
-
-	return err;
-}
-
-static struct vgic_its *vgic_get_its(struct kvm *kvm,
-				     struct kvm_kernel_irq_routing_entry *irq_entry)
-{
-	struct kvm_msi msi  = (struct kvm_msi) {
-		.address_lo	= irq_entry->msi.address_lo,
-		.address_hi	= irq_entry->msi.address_hi,
-		.data		= irq_entry->msi.data,
-		.flags		= irq_entry->msi.flags,
-		.devid		= irq_entry->msi.devid,
-	};
-
-	return vgic_msi_to_its(kvm, &msi);
-}
-
-int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq,
-			       struct kvm_kernel_irq_routing_entry *irq_entry)
-{
-	struct vgic_its *its;
-	struct vgic_irq *irq;
-	struct its_vlpi_map map;
-	int ret;
-
-	if (!vgic_supports_direct_msis(kvm))
-		return 0;
-
-	/*
-	 * Get the ITS, and escape early on error (not a valid
-	 * doorbell for any of our vITSs).
-	 */
-	its = vgic_get_its(kvm, irq_entry);
-	if (IS_ERR(its))
-		return 0;
-
-	mutex_lock(&its->its_lock);
-
-	/* Perform then actual DevID/EventID -> LPI translation. */
-	ret = vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid,
-				   irq_entry->msi.data, &irq);
-	if (ret)
-		goto out;
-
-	/*
-	 * Emit the mapping request. If it fails, the ITS probably
-	 * isn't v4 compatible, so let's silently bail out. Holding
-	 * the ITS lock should ensure that nothing can modify the
-	 * target vcpu.
-	 */
-	map = (struct its_vlpi_map) {
-		.vm		= &kvm->arch.vgic.its_vm,
-		.vpe		= &irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe,
-		.vintid		= irq->intid,
-		.properties	= ((irq->priority & 0xfc) |
-				   (irq->enabled ? LPI_PROP_ENABLED : 0) |
-				   LPI_PROP_GROUP1),
-		.db_enabled	= true,
-	};
-
-	ret = its_map_vlpi(virq, &map);
-	if (ret)
-		goto out;
-
-	irq->hw		= true;
-	irq->host_irq	= virq;
-
-out:
-	mutex_unlock(&its->its_lock);
-	return ret;
-}
-
-int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int virq,
-				 struct kvm_kernel_irq_routing_entry *irq_entry)
-{
-	struct vgic_its *its;
-	struct vgic_irq *irq;
-	int ret;
-
-	if (!vgic_supports_direct_msis(kvm))
-		return 0;
-
-	/*
-	 * Get the ITS, and escape early on error (not a valid
-	 * doorbell for any of our vITSs).
-	 */
-	its = vgic_get_its(kvm, irq_entry);
-	if (IS_ERR(its))
-		return 0;
-
-	mutex_lock(&its->its_lock);
-
-	ret = vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid,
-				   irq_entry->msi.data, &irq);
-	if (ret)
-		goto out;
-
-	WARN_ON(!(irq->hw && irq->host_irq == virq));
-	if (irq->hw) {
-		irq->hw = false;
-		ret = its_unmap_vlpi(virq);
-	}
-
-out:
-	mutex_unlock(&its->its_lock);
-	return ret;
-}
-
-void kvm_vgic_v4_enable_doorbell(struct kvm_vcpu *vcpu)
-{
-	if (vgic_supports_direct_msis(vcpu->kvm)) {
-		int irq = vcpu->arch.vgic_cpu.vgic_v3.its_vpe.irq;
-		if (irq)
-			enable_irq(irq);
-	}
-}
-
-void kvm_vgic_v4_disable_doorbell(struct kvm_vcpu *vcpu)
-{
-	if (vgic_supports_direct_msis(vcpu->kvm)) {
-		int irq = vcpu->arch.vgic_cpu.vgic_v3.its_vpe.irq;
-		if (irq)
-			disable_irq(irq);
-	}
-}
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
deleted file mode 100644
index 870b1185173b..000000000000
--- a/virt/kvm/arm/vgic/vgic.c
+++ /dev/null
@@ -1,973 +0,0 @@
-/*
- * Copyright (C) 2015, 2016 ARM Ltd.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/list_sort.h>
-#include <linux/nospec.h>
-
-#include <asm/kvm_hyp.h>
-
-#include "vgic.h"
-
-#define CREATE_TRACE_POINTS
-#include "trace.h"
-
-struct vgic_global kvm_vgic_global_state __ro_after_init = {
-	.gicv3_cpuif = STATIC_KEY_FALSE_INIT,
-};
-
-/*
- * Locking order is always:
- * kvm->lock (mutex)
- *   its->cmd_lock (mutex)
- *     its->its_lock (mutex)
- *       vgic_cpu->ap_list_lock		must be taken with IRQs disabled
- *         kvm->lpi_list_lock		must be taken with IRQs disabled
- *           vgic_irq->irq_lock		must be taken with IRQs disabled
- *
- * As the ap_list_lock might be taken from the timer interrupt handler,
- * we have to disable IRQs before taking this lock and everything lower
- * than it.
- *
- * If you need to take multiple locks, always take the upper lock first,
- * then the lower ones, e.g. first take the its_lock, then the irq_lock.
- * If you are already holding a lock and need to take a higher one, you
- * have to drop the lower ranking lock first and re-aquire it after having
- * taken the upper one.
- *
- * When taking more than one ap_list_lock at the same time, always take the
- * lowest numbered VCPU's ap_list_lock first, so:
- *   vcpuX->vcpu_id < vcpuY->vcpu_id:
- *     spin_lock(vcpuX->arch.vgic_cpu.ap_list_lock);
- *     spin_lock(vcpuY->arch.vgic_cpu.ap_list_lock);
- *
- * Since the VGIC must support injecting virtual interrupts from ISRs, we have
- * to use the spin_lock_irqsave/spin_unlock_irqrestore versions of outer
- * spinlocks for any lock that may be taken while injecting an interrupt.
- */
-
-/*
- * Iterate over the VM's list of mapped LPIs to find the one with a
- * matching interrupt ID and return a reference to the IRQ structure.
- */
-static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	struct vgic_irq *irq = NULL;
-	unsigned long flags;
-
-	spin_lock_irqsave(&dist->lpi_list_lock, flags);
-
-	list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
-		if (irq->intid != intid)
-			continue;
-
-		/*
-		 * This increases the refcount, the caller is expected to
-		 * call vgic_put_irq() later once it's finished with the IRQ.
-		 */
-		vgic_get_irq_kref(irq);
-		goto out_unlock;
-	}
-	irq = NULL;
-
-out_unlock:
-	spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
-
-	return irq;
-}
-
-/*
- * This looks up the virtual interrupt ID to get the corresponding
- * struct vgic_irq. It also increases the refcount, so any caller is expected
- * to call vgic_put_irq() once it's finished with this IRQ.
- */
-struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
-			      u32 intid)
-{
-	/* SGIs and PPIs */
-	if (intid <= VGIC_MAX_PRIVATE) {
-		intid = array_index_nospec(intid, VGIC_MAX_PRIVATE + 1);
-		return &vcpu->arch.vgic_cpu.private_irqs[intid];
-	}
-
-	/* SPIs */
-	if (intid < (kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) {
-		intid = array_index_nospec(intid, kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS);
-		return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS];
-	}
-
-	/* LPIs */
-	if (intid >= VGIC_MIN_LPI)
-		return vgic_get_lpi(kvm, intid);
-
-	WARN(1, "Looking up struct vgic_irq for reserved INTID");
-	return NULL;
-}
-
-/*
- * We can't do anything in here, because we lack the kvm pointer to
- * lock and remove the item from the lpi_list. So we keep this function
- * empty and use the return value of kref_put() to trigger the freeing.
- */
-static void vgic_irq_release(struct kref *ref)
-{
-}
-
-void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	unsigned long flags;
-
-	if (irq->intid < VGIC_MIN_LPI)
-		return;
-
-	spin_lock_irqsave(&dist->lpi_list_lock, flags);
-	if (!kref_put(&irq->refcount, vgic_irq_release)) {
-		spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
-		return;
-	};
-
-	list_del(&irq->lpi_list);
-	dist->lpi_list_count--;
-	spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
-
-	kfree(irq);
-}
-
-void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending)
-{
-	WARN_ON(irq_set_irqchip_state(irq->host_irq,
-				      IRQCHIP_STATE_PENDING,
-				      pending));
-}
-
-bool vgic_get_phys_line_level(struct vgic_irq *irq)
-{
-	bool line_level;
-
-	BUG_ON(!irq->hw);
-
-	if (irq->get_input_level)
-		return irq->get_input_level(irq->intid);
-
-	WARN_ON(irq_get_irqchip_state(irq->host_irq,
-				      IRQCHIP_STATE_PENDING,
-				      &line_level));
-	return line_level;
-}
-
-/* Set/Clear the physical active state */
-void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active)
-{
-
-	BUG_ON(!irq->hw);
-	WARN_ON(irq_set_irqchip_state(irq->host_irq,
-				      IRQCHIP_STATE_ACTIVE,
-				      active));
-}
-
-/**
- * kvm_vgic_target_oracle - compute the target vcpu for an irq
- *
- * @irq:	The irq to route. Must be already locked.
- *
- * Based on the current state of the interrupt (enabled, pending,
- * active, vcpu and target_vcpu), compute the next vcpu this should be
- * given to. Return NULL if this shouldn't be injected at all.
- *
- * Requires the IRQ lock to be held.
- */
-static struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq)
-{
-	lockdep_assert_held(&irq->irq_lock);
-
-	/* If the interrupt is active, it must stay on the current vcpu */
-	if (irq->active)
-		return irq->vcpu ? : irq->target_vcpu;
-
-	/*
-	 * If the IRQ is not active but enabled and pending, we should direct
-	 * it to its configured target VCPU.
-	 * If the distributor is disabled, pending interrupts shouldn't be
-	 * forwarded.
-	 */
-	if (irq->enabled && irq_is_pending(irq)) {
-		if (unlikely(irq->target_vcpu &&
-			     !irq->target_vcpu->kvm->arch.vgic.enabled))
-			return NULL;
-
-		return irq->target_vcpu;
-	}
-
-	/* If neither active nor pending and enabled, then this IRQ should not
-	 * be queued to any VCPU.
-	 */
-	return NULL;
-}
-
-/*
- * The order of items in the ap_lists defines how we'll pack things in LRs as
- * well, the first items in the list being the first things populated in the
- * LRs.
- *
- * A hard rule is that active interrupts can never be pushed out of the LRs
- * (and therefore take priority) since we cannot reliably trap on deactivation
- * of IRQs and therefore they have to be present in the LRs.
- *
- * Otherwise things should be sorted by the priority field and the GIC
- * hardware support will take care of preemption of priority groups etc.
- *
- * Return negative if "a" sorts before "b", 0 to preserve order, and positive
- * to sort "b" before "a".
- */
-static int vgic_irq_cmp(void *priv, struct list_head *a, struct list_head *b)
-{
-	struct vgic_irq *irqa = container_of(a, struct vgic_irq, ap_list);
-	struct vgic_irq *irqb = container_of(b, struct vgic_irq, ap_list);
-	bool penda, pendb;
-	int ret;
-
-	spin_lock(&irqa->irq_lock);
-	spin_lock_nested(&irqb->irq_lock, SINGLE_DEPTH_NESTING);
-
-	if (irqa->active || irqb->active) {
-		ret = (int)irqb->active - (int)irqa->active;
-		goto out;
-	}
-
-	penda = irqa->enabled && irq_is_pending(irqa);
-	pendb = irqb->enabled && irq_is_pending(irqb);
-
-	if (!penda || !pendb) {
-		ret = (int)pendb - (int)penda;
-		goto out;
-	}
-
-	/* Both pending and enabled, sort by priority */
-	ret = irqa->priority - irqb->priority;
-out:
-	spin_unlock(&irqb->irq_lock);
-	spin_unlock(&irqa->irq_lock);
-	return ret;
-}
-
-/* Must be called with the ap_list_lock held */
-static void vgic_sort_ap_list(struct kvm_vcpu *vcpu)
-{
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-
-	lockdep_assert_held(&vgic_cpu->ap_list_lock);
-
-	list_sort(NULL, &vgic_cpu->ap_list_head, vgic_irq_cmp);
-}
-
-/*
- * Only valid injection if changing level for level-triggered IRQs or for a
- * rising edge, and in-kernel connected IRQ lines can only be controlled by
- * their owner.
- */
-static bool vgic_validate_injection(struct vgic_irq *irq, bool level, void *owner)
-{
-	if (irq->owner != owner)
-		return false;
-
-	switch (irq->config) {
-	case VGIC_CONFIG_LEVEL:
-		return irq->line_level != level;
-	case VGIC_CONFIG_EDGE:
-		return level;
-	}
-
-	return false;
-}
-
-/*
- * Check whether an IRQ needs to (and can) be queued to a VCPU's ap list.
- * Do the queuing if necessary, taking the right locks in the right order.
- * Returns true when the IRQ was queued, false otherwise.
- *
- * Needs to be entered with the IRQ lock already held, but will return
- * with all locks dropped.
- */
-bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
-			   unsigned long flags)
-{
-	struct kvm_vcpu *vcpu;
-
-	lockdep_assert_held(&irq->irq_lock);
-
-retry:
-	vcpu = vgic_target_oracle(irq);
-	if (irq->vcpu || !vcpu) {
-		/*
-		 * If this IRQ is already on a VCPU's ap_list, then it
-		 * cannot be moved or modified and there is no more work for
-		 * us to do.
-		 *
-		 * Otherwise, if the irq is not pending and enabled, it does
-		 * not need to be inserted into an ap_list and there is also
-		 * no more work for us to do.
-		 */
-		spin_unlock_irqrestore(&irq->irq_lock, flags);
-
-		/*
-		 * We have to kick the VCPU here, because we could be
-		 * queueing an edge-triggered interrupt for which we
-		 * get no EOI maintenance interrupt. In that case,
-		 * while the IRQ is already on the VCPU's AP list, the
-		 * VCPU could have EOI'ed the original interrupt and
-		 * won't see this one until it exits for some other
-		 * reason.
-		 */
-		if (vcpu) {
-			kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
-			kvm_vcpu_kick(vcpu);
-		}
-		return false;
-	}
-
-	/*
-	 * We must unlock the irq lock to take the ap_list_lock where
-	 * we are going to insert this new pending interrupt.
-	 */
-	spin_unlock_irqrestore(&irq->irq_lock, flags);
-
-	/* someone can do stuff here, which we re-check below */
-
-	spin_lock_irqsave(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
-	spin_lock(&irq->irq_lock);
-
-	/*
-	 * Did something change behind our backs?
-	 *
-	 * There are two cases:
-	 * 1) The irq lost its pending state or was disabled behind our
-	 *    backs and/or it was queued to another VCPU's ap_list.
-	 * 2) Someone changed the affinity on this irq behind our
-	 *    backs and we are now holding the wrong ap_list_lock.
-	 *
-	 * In both cases, drop the locks and retry.
-	 */
-
-	if (unlikely(irq->vcpu || vcpu != vgic_target_oracle(irq))) {
-		spin_unlock(&irq->irq_lock);
-		spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
-
-		spin_lock_irqsave(&irq->irq_lock, flags);
-		goto retry;
-	}
-
-	/*
-	 * Grab a reference to the irq to reflect the fact that it is
-	 * now in the ap_list.
-	 */
-	vgic_get_irq_kref(irq);
-	list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head);
-	irq->vcpu = vcpu;
-
-	spin_unlock(&irq->irq_lock);
-	spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
-
-	kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
-	kvm_vcpu_kick(vcpu);
-
-	return true;
-}
-
-/**
- * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic
- * @kvm:     The VM structure pointer
- * @cpuid:   The CPU for PPIs
- * @intid:   The INTID to inject a new state to.
- * @level:   Edge-triggered:  true:  to trigger the interrupt
- *			      false: to ignore the call
- *	     Level-sensitive  true:  raise the input signal
- *			      false: lower the input signal
- * @owner:   The opaque pointer to the owner of the IRQ being raised to verify
- *           that the caller is allowed to inject this IRQ.  Userspace
- *           injections will have owner == NULL.
- *
- * The VGIC is not concerned with devices being active-LOW or active-HIGH for
- * level-sensitive interrupts.  You can think of the level parameter as 1
- * being HIGH and 0 being LOW and all devices being active-HIGH.
- */
-int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
-			bool level, void *owner)
-{
-	struct kvm_vcpu *vcpu;
-	struct vgic_irq *irq;
-	unsigned long flags;
-	int ret;
-
-	trace_vgic_update_irq_pending(cpuid, intid, level);
-
-	ret = vgic_lazy_init(kvm);
-	if (ret)
-		return ret;
-
-	vcpu = kvm_get_vcpu(kvm, cpuid);
-	if (!vcpu && intid < VGIC_NR_PRIVATE_IRQS)
-		return -EINVAL;
-
-	irq = vgic_get_irq(kvm, vcpu, intid);
-	if (!irq)
-		return -EINVAL;
-
-	spin_lock_irqsave(&irq->irq_lock, flags);
-
-	if (!vgic_validate_injection(irq, level, owner)) {
-		/* Nothing to see here, move along... */
-		spin_unlock_irqrestore(&irq->irq_lock, flags);
-		vgic_put_irq(kvm, irq);
-		return 0;
-	}
-
-	if (irq->config == VGIC_CONFIG_LEVEL)
-		irq->line_level = level;
-	else
-		irq->pending_latch = true;
-
-	vgic_queue_irq_unlock(kvm, irq, flags);
-	vgic_put_irq(kvm, irq);
-
-	return 0;
-}
-
-/* @irq->irq_lock must be held */
-static int kvm_vgic_map_irq(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
-			    unsigned int host_irq,
-			    bool (*get_input_level)(int vindid))
-{
-	struct irq_desc *desc;
-	struct irq_data *data;
-
-	/*
-	 * Find the physical IRQ number corresponding to @host_irq
-	 */
-	desc = irq_to_desc(host_irq);
-	if (!desc) {
-		kvm_err("%s: no interrupt descriptor\n", __func__);
-		return -EINVAL;
-	}
-	data = irq_desc_get_irq_data(desc);
-	while (data->parent_data)
-		data = data->parent_data;
-
-	irq->hw = true;
-	irq->host_irq = host_irq;
-	irq->hwintid = data->hwirq;
-	irq->get_input_level = get_input_level;
-	return 0;
-}
-
-/* @irq->irq_lock must be held */
-static inline void kvm_vgic_unmap_irq(struct vgic_irq *irq)
-{
-	irq->hw = false;
-	irq->hwintid = 0;
-	irq->get_input_level = NULL;
-}
-
-int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq,
-			  u32 vintid, bool (*get_input_level)(int vindid))
-{
-	struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
-	unsigned long flags;
-	int ret;
-
-	BUG_ON(!irq);
-
-	spin_lock_irqsave(&irq->irq_lock, flags);
-	ret = kvm_vgic_map_irq(vcpu, irq, host_irq, get_input_level);
-	spin_unlock_irqrestore(&irq->irq_lock, flags);
-	vgic_put_irq(vcpu->kvm, irq);
-
-	return ret;
-}
-
-/**
- * kvm_vgic_reset_mapped_irq - Reset a mapped IRQ
- * @vcpu: The VCPU pointer
- * @vintid: The INTID of the interrupt
- *
- * Reset the active and pending states of a mapped interrupt.  Kernel
- * subsystems injecting mapped interrupts should reset their interrupt lines
- * when we are doing a reset of the VM.
- */
-void kvm_vgic_reset_mapped_irq(struct kvm_vcpu *vcpu, u32 vintid)
-{
-	struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
-	unsigned long flags;
-
-	if (!irq->hw)
-		goto out;
-
-	spin_lock_irqsave(&irq->irq_lock, flags);
-	irq->active = false;
-	irq->pending_latch = false;
-	irq->line_level = false;
-	spin_unlock_irqrestore(&irq->irq_lock, flags);
-out:
-	vgic_put_irq(vcpu->kvm, irq);
-}
-
-int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int vintid)
-{
-	struct vgic_irq *irq;
-	unsigned long flags;
-
-	if (!vgic_initialized(vcpu->kvm))
-		return -EAGAIN;
-
-	irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
-	BUG_ON(!irq);
-
-	spin_lock_irqsave(&irq->irq_lock, flags);
-	kvm_vgic_unmap_irq(irq);
-	spin_unlock_irqrestore(&irq->irq_lock, flags);
-	vgic_put_irq(vcpu->kvm, irq);
-
-	return 0;
-}
-
-/**
- * kvm_vgic_set_owner - Set the owner of an interrupt for a VM
- *
- * @vcpu:   Pointer to the VCPU (used for PPIs)
- * @intid:  The virtual INTID identifying the interrupt (PPI or SPI)
- * @owner:  Opaque pointer to the owner
- *
- * Returns 0 if intid is not already used by another in-kernel device and the
- * owner is set, otherwise returns an error code.
- */
-int kvm_vgic_set_owner(struct kvm_vcpu *vcpu, unsigned int intid, void *owner)
-{
-	struct vgic_irq *irq;
-	unsigned long flags;
-	int ret = 0;
-
-	if (!vgic_initialized(vcpu->kvm))
-		return -EAGAIN;
-
-	/* SGIs and LPIs cannot be wired up to any device */
-	if (!irq_is_ppi(intid) && !vgic_valid_spi(vcpu->kvm, intid))
-		return -EINVAL;
-
-	irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
-	spin_lock_irqsave(&irq->irq_lock, flags);
-	if (irq->owner && irq->owner != owner)
-		ret = -EEXIST;
-	else
-		irq->owner = owner;
-	spin_unlock_irqrestore(&irq->irq_lock, flags);
-
-	return ret;
-}
-
-/**
- * vgic_prune_ap_list - Remove non-relevant interrupts from the list
- *
- * @vcpu: The VCPU pointer
- *
- * Go over the list of "interesting" interrupts, and prune those that we
- * won't have to consider in the near future.
- */
-static void vgic_prune_ap_list(struct kvm_vcpu *vcpu)
-{
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-	struct vgic_irq *irq, *tmp;
-
-	DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
-
-retry:
-	spin_lock(&vgic_cpu->ap_list_lock);
-
-	list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) {
-		struct kvm_vcpu *target_vcpu, *vcpuA, *vcpuB;
-		bool target_vcpu_needs_kick = false;
-
-		spin_lock(&irq->irq_lock);
-
-		BUG_ON(vcpu != irq->vcpu);
-
-		target_vcpu = vgic_target_oracle(irq);
-
-		if (!target_vcpu) {
-			/*
-			 * We don't need to process this interrupt any
-			 * further, move it off the list.
-			 */
-			list_del(&irq->ap_list);
-			irq->vcpu = NULL;
-			spin_unlock(&irq->irq_lock);
-
-			/*
-			 * This vgic_put_irq call matches the
-			 * vgic_get_irq_kref in vgic_queue_irq_unlock,
-			 * where we added the LPI to the ap_list. As
-			 * we remove the irq from the list, we drop
-			 * also drop the refcount.
-			 */
-			vgic_put_irq(vcpu->kvm, irq);
-			continue;
-		}
-
-		if (target_vcpu == vcpu) {
-			/* We're on the right CPU */
-			spin_unlock(&irq->irq_lock);
-			continue;
-		}
-
-		/* This interrupt looks like it has to be migrated. */
-
-		spin_unlock(&irq->irq_lock);
-		spin_unlock(&vgic_cpu->ap_list_lock);
-
-		/*
-		 * Ensure locking order by always locking the smallest
-		 * ID first.
-		 */
-		if (vcpu->vcpu_id < target_vcpu->vcpu_id) {
-			vcpuA = vcpu;
-			vcpuB = target_vcpu;
-		} else {
-			vcpuA = target_vcpu;
-			vcpuB = vcpu;
-		}
-
-		spin_lock(&vcpuA->arch.vgic_cpu.ap_list_lock);
-		spin_lock_nested(&vcpuB->arch.vgic_cpu.ap_list_lock,
-				 SINGLE_DEPTH_NESTING);
-		spin_lock(&irq->irq_lock);
-
-		/*
-		 * If the affinity has been preserved, move the
-		 * interrupt around. Otherwise, it means things have
-		 * changed while the interrupt was unlocked, and we
-		 * need to replay this.
-		 *
-		 * In all cases, we cannot trust the list not to have
-		 * changed, so we restart from the beginning.
-		 */
-		if (target_vcpu == vgic_target_oracle(irq)) {
-			struct vgic_cpu *new_cpu = &target_vcpu->arch.vgic_cpu;
-
-			list_del(&irq->ap_list);
-			irq->vcpu = target_vcpu;
-			list_add_tail(&irq->ap_list, &new_cpu->ap_list_head);
-			target_vcpu_needs_kick = true;
-		}
-
-		spin_unlock(&irq->irq_lock);
-		spin_unlock(&vcpuB->arch.vgic_cpu.ap_list_lock);
-		spin_unlock(&vcpuA->arch.vgic_cpu.ap_list_lock);
-
-		if (target_vcpu_needs_kick) {
-			kvm_make_request(KVM_REQ_IRQ_PENDING, target_vcpu);
-			kvm_vcpu_kick(target_vcpu);
-		}
-
-		goto retry;
-	}
-
-	spin_unlock(&vgic_cpu->ap_list_lock);
-}
-
-static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu)
-{
-	if (kvm_vgic_global_state.type == VGIC_V2)
-		vgic_v2_fold_lr_state(vcpu);
-	else
-		vgic_v3_fold_lr_state(vcpu);
-}
-
-/* Requires the irq_lock to be held. */
-static inline void vgic_populate_lr(struct kvm_vcpu *vcpu,
-				    struct vgic_irq *irq, int lr)
-{
-	lockdep_assert_held(&irq->irq_lock);
-
-	if (kvm_vgic_global_state.type == VGIC_V2)
-		vgic_v2_populate_lr(vcpu, irq, lr);
-	else
-		vgic_v3_populate_lr(vcpu, irq, lr);
-}
-
-static inline void vgic_clear_lr(struct kvm_vcpu *vcpu, int lr)
-{
-	if (kvm_vgic_global_state.type == VGIC_V2)
-		vgic_v2_clear_lr(vcpu, lr);
-	else
-		vgic_v3_clear_lr(vcpu, lr);
-}
-
-static inline void vgic_set_underflow(struct kvm_vcpu *vcpu)
-{
-	if (kvm_vgic_global_state.type == VGIC_V2)
-		vgic_v2_set_underflow(vcpu);
-	else
-		vgic_v3_set_underflow(vcpu);
-}
-
-/* Requires the ap_list_lock to be held. */
-static int compute_ap_list_depth(struct kvm_vcpu *vcpu,
-				 bool *multi_sgi)
-{
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-	struct vgic_irq *irq;
-	int count = 0;
-
-	*multi_sgi = false;
-
-	lockdep_assert_held(&vgic_cpu->ap_list_lock);
-
-	list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
-		int w;
-
-		spin_lock(&irq->irq_lock);
-		/* GICv2 SGIs can count for more than one... */
-		w = vgic_irq_get_lr_count(irq);
-		spin_unlock(&irq->irq_lock);
-
-		count += w;
-		*multi_sgi |= (w > 1);
-	}
-	return count;
-}
-
-/* Requires the VCPU's ap_list_lock to be held. */
-static void vgic_flush_lr_state(struct kvm_vcpu *vcpu)
-{
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-	struct vgic_irq *irq;
-	int count;
-	bool multi_sgi;
-	u8 prio = 0xff;
-
-	lockdep_assert_held(&vgic_cpu->ap_list_lock);
-
-	count = compute_ap_list_depth(vcpu, &multi_sgi);
-	if (count > kvm_vgic_global_state.nr_lr || multi_sgi)
-		vgic_sort_ap_list(vcpu);
-
-	count = 0;
-
-	list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
-		spin_lock(&irq->irq_lock);
-
-		/*
-		 * If we have multi-SGIs in the pipeline, we need to
-		 * guarantee that they are all seen before any IRQ of
-		 * lower priority. In that case, we need to filter out
-		 * these interrupts by exiting early. This is easy as
-		 * the AP list has been sorted already.
-		 */
-		if (multi_sgi && irq->priority > prio) {
-			spin_unlock(&irq->irq_lock);
-			break;
-		}
-
-		if (likely(vgic_target_oracle(irq) == vcpu)) {
-			vgic_populate_lr(vcpu, irq, count++);
-
-			if (irq->source)
-				prio = irq->priority;
-		}
-
-		spin_unlock(&irq->irq_lock);
-
-		if (count == kvm_vgic_global_state.nr_lr) {
-			if (!list_is_last(&irq->ap_list,
-					  &vgic_cpu->ap_list_head))
-				vgic_set_underflow(vcpu);
-			break;
-		}
-	}
-
-	vcpu->arch.vgic_cpu.used_lrs = count;
-
-	/* Nuke remaining LRs */
-	for ( ; count < kvm_vgic_global_state.nr_lr; count++)
-		vgic_clear_lr(vcpu, count);
-}
-
-static inline bool can_access_vgic_from_kernel(void)
-{
-	/*
-	 * GICv2 can always be accessed from the kernel because it is
-	 * memory-mapped, and VHE systems can access GICv3 EL2 system
-	 * registers.
-	 */
-	return !static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif) || has_vhe();
-}
-
-static inline void vgic_save_state(struct kvm_vcpu *vcpu)
-{
-	if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
-		vgic_v2_save_state(vcpu);
-	else
-		__vgic_v3_save_state(vcpu);
-}
-
-/* Sync back the hardware VGIC state into our emulation after a guest's run. */
-void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
-{
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-
-	WARN_ON(vgic_v4_sync_hwstate(vcpu));
-
-	/* An empty ap_list_head implies used_lrs == 0 */
-	if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head))
-		return;
-
-	if (can_access_vgic_from_kernel())
-		vgic_save_state(vcpu);
-
-	if (vgic_cpu->used_lrs)
-		vgic_fold_lr_state(vcpu);
-	vgic_prune_ap_list(vcpu);
-}
-
-static inline void vgic_restore_state(struct kvm_vcpu *vcpu)
-{
-	if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
-		vgic_v2_restore_state(vcpu);
-	else
-		__vgic_v3_restore_state(vcpu);
-}
-
-/* Flush our emulation state into the GIC hardware before entering the guest. */
-void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
-{
-	WARN_ON(vgic_v4_flush_hwstate(vcpu));
-
-	/*
-	 * If there are no virtual interrupts active or pending for this
-	 * VCPU, then there is no work to do and we can bail out without
-	 * taking any lock.  There is a potential race with someone injecting
-	 * interrupts to the VCPU, but it is a benign race as the VCPU will
-	 * either observe the new interrupt before or after doing this check,
-	 * and introducing additional synchronization mechanism doesn't change
-	 * this.
-	 */
-	if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head))
-		return;
-
-	DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
-
-	spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock);
-	vgic_flush_lr_state(vcpu);
-	spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
-
-	if (can_access_vgic_from_kernel())
-		vgic_restore_state(vcpu);
-}
-
-void kvm_vgic_load(struct kvm_vcpu *vcpu)
-{
-	if (unlikely(!vgic_initialized(vcpu->kvm)))
-		return;
-
-	if (kvm_vgic_global_state.type == VGIC_V2)
-		vgic_v2_load(vcpu);
-	else
-		vgic_v3_load(vcpu);
-}
-
-void kvm_vgic_put(struct kvm_vcpu *vcpu)
-{
-	if (unlikely(!vgic_initialized(vcpu->kvm)))
-		return;
-
-	if (kvm_vgic_global_state.type == VGIC_V2)
-		vgic_v2_put(vcpu);
-	else
-		vgic_v3_put(vcpu);
-}
-
-int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
-{
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-	struct vgic_irq *irq;
-	bool pending = false;
-	unsigned long flags;
-	struct vgic_vmcr vmcr;
-
-	if (!vcpu->kvm->arch.vgic.enabled)
-		return false;
-
-	if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last)
-		return true;
-
-	vgic_get_vmcr(vcpu, &vmcr);
-
-	spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);
-
-	list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
-		spin_lock(&irq->irq_lock);
-		pending = irq_is_pending(irq) && irq->enabled &&
-			  !irq->active &&
-			  irq->priority < vmcr.pmr;
-		spin_unlock(&irq->irq_lock);
-
-		if (pending)
-			break;
-	}
-
-	spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
-
-	return pending;
-}
-
-void vgic_kick_vcpus(struct kvm *kvm)
-{
-	struct kvm_vcpu *vcpu;
-	int c;
-
-	/*
-	 * We've injected an interrupt, time to find out who deserves
-	 * a good kick...
-	 */
-	kvm_for_each_vcpu(c, vcpu, kvm) {
-		if (kvm_vgic_vcpu_pending_irq(vcpu)) {
-			kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
-			kvm_vcpu_kick(vcpu);
-		}
-	}
-}
-
-bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid)
-{
-	struct vgic_irq *irq;
-	bool map_is_active;
-	unsigned long flags;
-
-	if (!vgic_initialized(vcpu->kvm))
-		return false;
-
-	irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
-	spin_lock_irqsave(&irq->irq_lock, flags);
-	map_is_active = irq->hw && irq->active;
-	spin_unlock_irqrestore(&irq->irq_lock, flags);
-	vgic_put_irq(vcpu->kvm, irq);
-
-	return map_is_active;
-}
-
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
deleted file mode 100644
index a90024718ca4..000000000000
--- a/virt/kvm/arm/vgic/vgic.h
+++ /dev/null
@@ -1,325 +0,0 @@
-/*
- * Copyright (C) 2015, 2016 ARM Ltd.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-#ifndef __KVM_ARM_VGIC_NEW_H__
-#define __KVM_ARM_VGIC_NEW_H__
-
-#include <linux/irqchip/arm-gic-common.h>
-
-#define PRODUCT_ID_KVM		0x4b	/* ASCII code K */
-#define IMPLEMENTER_ARM		0x43b
-
-#define VGIC_ADDR_UNDEF		(-1)
-#define IS_VGIC_ADDR_UNDEF(_x)  ((_x) == VGIC_ADDR_UNDEF)
-
-#define INTERRUPT_ID_BITS_SPIS	10
-#define INTERRUPT_ID_BITS_ITS	16
-#define VGIC_PRI_BITS		5
-
-#define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS)
-
-#define VGIC_AFFINITY_0_SHIFT 0
-#define VGIC_AFFINITY_0_MASK (0xffUL << VGIC_AFFINITY_0_SHIFT)
-#define VGIC_AFFINITY_1_SHIFT 8
-#define VGIC_AFFINITY_1_MASK (0xffUL << VGIC_AFFINITY_1_SHIFT)
-#define VGIC_AFFINITY_2_SHIFT 16
-#define VGIC_AFFINITY_2_MASK (0xffUL << VGIC_AFFINITY_2_SHIFT)
-#define VGIC_AFFINITY_3_SHIFT 24
-#define VGIC_AFFINITY_3_MASK (0xffUL << VGIC_AFFINITY_3_SHIFT)
-
-#define VGIC_AFFINITY_LEVEL(reg, level) \
-	((((reg) & VGIC_AFFINITY_## level ##_MASK) \
-	>> VGIC_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level))
-
-/*
- * The Userspace encodes the affinity differently from the MPIDR,
- * Below macro converts vgic userspace format to MPIDR reg format.
- */
-#define VGIC_TO_MPIDR(val) (VGIC_AFFINITY_LEVEL(val, 0) | \
-			    VGIC_AFFINITY_LEVEL(val, 1) | \
-			    VGIC_AFFINITY_LEVEL(val, 2) | \
-			    VGIC_AFFINITY_LEVEL(val, 3))
-
-/*
- * As per Documentation/virtual/kvm/devices/arm-vgic-v3.txt,
- * below macros are defined for CPUREG encoding.
- */
-#define KVM_REG_ARM_VGIC_SYSREG_OP0_MASK   0x000000000000c000
-#define KVM_REG_ARM_VGIC_SYSREG_OP0_SHIFT  14
-#define KVM_REG_ARM_VGIC_SYSREG_OP1_MASK   0x0000000000003800
-#define KVM_REG_ARM_VGIC_SYSREG_OP1_SHIFT  11
-#define KVM_REG_ARM_VGIC_SYSREG_CRN_MASK   0x0000000000000780
-#define KVM_REG_ARM_VGIC_SYSREG_CRN_SHIFT  7
-#define KVM_REG_ARM_VGIC_SYSREG_CRM_MASK   0x0000000000000078
-#define KVM_REG_ARM_VGIC_SYSREG_CRM_SHIFT  3
-#define KVM_REG_ARM_VGIC_SYSREG_OP2_MASK   0x0000000000000007
-#define KVM_REG_ARM_VGIC_SYSREG_OP2_SHIFT  0
-
-#define KVM_DEV_ARM_VGIC_SYSREG_MASK (KVM_REG_ARM_VGIC_SYSREG_OP0_MASK | \
-				      KVM_REG_ARM_VGIC_SYSREG_OP1_MASK | \
-				      KVM_REG_ARM_VGIC_SYSREG_CRN_MASK | \
-				      KVM_REG_ARM_VGIC_SYSREG_CRM_MASK | \
-				      KVM_REG_ARM_VGIC_SYSREG_OP2_MASK)
-
-/*
- * As per Documentation/virtual/kvm/devices/arm-vgic-its.txt,
- * below macros are defined for ITS table entry encoding.
- */
-#define KVM_ITS_CTE_VALID_SHIFT		63
-#define KVM_ITS_CTE_VALID_MASK		BIT_ULL(63)
-#define KVM_ITS_CTE_RDBASE_SHIFT	16
-#define KVM_ITS_CTE_ICID_MASK		GENMASK_ULL(15, 0)
-#define KVM_ITS_ITE_NEXT_SHIFT		48
-#define KVM_ITS_ITE_PINTID_SHIFT	16
-#define KVM_ITS_ITE_PINTID_MASK		GENMASK_ULL(47, 16)
-#define KVM_ITS_ITE_ICID_MASK		GENMASK_ULL(15, 0)
-#define KVM_ITS_DTE_VALID_SHIFT		63
-#define KVM_ITS_DTE_VALID_MASK		BIT_ULL(63)
-#define KVM_ITS_DTE_NEXT_SHIFT		49
-#define KVM_ITS_DTE_NEXT_MASK		GENMASK_ULL(62, 49)
-#define KVM_ITS_DTE_ITTADDR_SHIFT	5
-#define KVM_ITS_DTE_ITTADDR_MASK	GENMASK_ULL(48, 5)
-#define KVM_ITS_DTE_SIZE_MASK		GENMASK_ULL(4, 0)
-#define KVM_ITS_L1E_VALID_MASK		BIT_ULL(63)
-/* we only support 64 kB translation table page size */
-#define KVM_ITS_L1E_ADDR_MASK		GENMASK_ULL(51, 16)
-
-#define KVM_VGIC_V3_RDIST_INDEX_MASK	GENMASK_ULL(11, 0)
-#define KVM_VGIC_V3_RDIST_FLAGS_MASK	GENMASK_ULL(15, 12)
-#define KVM_VGIC_V3_RDIST_FLAGS_SHIFT	12
-#define KVM_VGIC_V3_RDIST_BASE_MASK	GENMASK_ULL(51, 16)
-#define KVM_VGIC_V3_RDIST_COUNT_MASK	GENMASK_ULL(63, 52)
-#define KVM_VGIC_V3_RDIST_COUNT_SHIFT	52
-
-#ifdef CONFIG_DEBUG_SPINLOCK
-#define DEBUG_SPINLOCK_BUG_ON(p) BUG_ON(p)
-#else
-#define DEBUG_SPINLOCK_BUG_ON(p)
-#endif
-
-/* Requires the irq_lock to be held by the caller. */
-static inline bool irq_is_pending(struct vgic_irq *irq)
-{
-	if (irq->config == VGIC_CONFIG_EDGE)
-		return irq->pending_latch;
-	else
-		return irq->pending_latch || irq->line_level;
-}
-
-static inline bool vgic_irq_is_mapped_level(struct vgic_irq *irq)
-{
-	return irq->config == VGIC_CONFIG_LEVEL && irq->hw;
-}
-
-static inline int vgic_irq_get_lr_count(struct vgic_irq *irq)
-{
-	/* Account for the active state as an interrupt */
-	if (vgic_irq_is_sgi(irq->intid) && irq->source)
-		return hweight8(irq->source) + irq->active;
-
-	return irq_is_pending(irq) || irq->active;
-}
-
-static inline bool vgic_irq_is_multi_sgi(struct vgic_irq *irq)
-{
-	return vgic_irq_get_lr_count(irq) > 1;
-}
-
-/*
- * This struct provides an intermediate representation of the fields contained
- * in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC
- * state to userspace can generate either GICv2 or GICv3 CPU interface
- * registers regardless of the hardware backed GIC used.
- */
-struct vgic_vmcr {
-	u32	grpen0;
-	u32	grpen1;
-
-	u32	ackctl;
-	u32	fiqen;
-	u32	cbpr;
-	u32	eoim;
-
-	u32	abpr;
-	u32	bpr;
-	u32	pmr;  /* Priority mask field in the GICC_PMR and
-		       * ICC_PMR_EL1 priority field format */
-};
-
-struct vgic_reg_attr {
-	struct kvm_vcpu *vcpu;
-	gpa_t addr;
-};
-
-int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
-		       struct vgic_reg_attr *reg_attr);
-int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
-		       struct vgic_reg_attr *reg_attr);
-const struct vgic_register_region *
-vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
-		     gpa_t addr, int len);
-struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
-			      u32 intid);
-void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq);
-bool vgic_get_phys_line_level(struct vgic_irq *irq);
-void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending);
-void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active);
-bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
-			   unsigned long flags);
-void vgic_kick_vcpus(struct kvm *kvm);
-
-int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
-		      phys_addr_t addr, phys_addr_t alignment);
-
-void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu);
-void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
-void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr);
-void vgic_v2_set_underflow(struct kvm_vcpu *vcpu);
-void vgic_v2_set_npie(struct kvm_vcpu *vcpu);
-int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr);
-int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
-			 int offset, u32 *val);
-int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write,
-			  int offset, u32 *val);
-void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-void vgic_v2_enable(struct kvm_vcpu *vcpu);
-int vgic_v2_probe(const struct gic_kvm_info *info);
-int vgic_v2_map_resources(struct kvm *kvm);
-int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
-			     enum vgic_type);
-
-void vgic_v2_init_lrs(void);
-void vgic_v2_load(struct kvm_vcpu *vcpu);
-void vgic_v2_put(struct kvm_vcpu *vcpu);
-
-void vgic_v2_save_state(struct kvm_vcpu *vcpu);
-void vgic_v2_restore_state(struct kvm_vcpu *vcpu);
-
-static inline void vgic_get_irq_kref(struct vgic_irq *irq)
-{
-	if (irq->intid < VGIC_MIN_LPI)
-		return;
-
-	kref_get(&irq->refcount);
-}
-
-void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu);
-void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
-void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr);
-void vgic_v3_set_underflow(struct kvm_vcpu *vcpu);
-void vgic_v3_set_npie(struct kvm_vcpu *vcpu);
-void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-void vgic_v3_enable(struct kvm_vcpu *vcpu);
-int vgic_v3_probe(const struct gic_kvm_info *info);
-int vgic_v3_map_resources(struct kvm *kvm);
-int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq);
-int vgic_v3_save_pending_tables(struct kvm *kvm);
-int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count);
-int vgic_register_redist_iodev(struct kvm_vcpu *vcpu);
-bool vgic_v3_check_base(struct kvm *kvm);
-
-void vgic_v3_load(struct kvm_vcpu *vcpu);
-void vgic_v3_put(struct kvm_vcpu *vcpu);
-
-bool vgic_has_its(struct kvm *kvm);
-int kvm_vgic_register_its_device(void);
-void vgic_enable_lpis(struct kvm_vcpu *vcpu);
-int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi);
-int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr);
-int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
-			 int offset, u32 *val);
-int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
-			 int offset, u32 *val);
-int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, bool is_write,
-			 u64 id, u64 *val);
-int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, bool is_write, u64 id,
-				u64 *reg);
-int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write,
-				    u32 intid, u64 *val);
-int kvm_register_vgic_device(unsigned long type);
-void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-int vgic_lazy_init(struct kvm *kvm);
-int vgic_init(struct kvm *kvm);
-
-void vgic_debug_init(struct kvm *kvm);
-void vgic_debug_destroy(struct kvm *kvm);
-
-bool lock_all_vcpus(struct kvm *kvm);
-void unlock_all_vcpus(struct kvm *kvm);
-
-static inline int vgic_v3_max_apr_idx(struct kvm_vcpu *vcpu)
-{
-	struct vgic_cpu *cpu_if = &vcpu->arch.vgic_cpu;
-
-	/*
-	 * num_pri_bits are initialized with HW supported values.
-	 * We can rely safely on num_pri_bits even if VM has not
-	 * restored ICC_CTLR_EL1 before restoring APnR registers.
-	 */
-	switch (cpu_if->num_pri_bits) {
-	case 7: return 3;
-	case 6: return 1;
-	default: return 0;
-	}
-}
-
-static inline bool
-vgic_v3_redist_region_full(struct vgic_redist_region *region)
-{
-	if (!region->count)
-		return false;
-
-	return (region->free_index >= region->count);
-}
-
-struct vgic_redist_region *vgic_v3_rdist_free_slot(struct list_head *rdregs);
-
-static inline size_t
-vgic_v3_rd_region_size(struct kvm *kvm, struct vgic_redist_region *rdreg)
-{
-	if (!rdreg->count)
-		return atomic_read(&kvm->online_vcpus) * KVM_VGIC_V3_REDIST_SIZE;
-	else
-		return rdreg->count * KVM_VGIC_V3_REDIST_SIZE;
-}
-
-struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm,
-							   u32 index);
-
-bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size);
-
-static inline bool vgic_dist_overlap(struct kvm *kvm, gpa_t base, size_t size)
-{
-	struct vgic_dist *d = &kvm->arch.vgic;
-
-	return (base + size > d->vgic_dist_base) &&
-		(base < d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE);
-}
-
-int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr);
-int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its,
-			 u32 devid, u32 eventid, struct vgic_irq **irq);
-struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi);
-
-bool vgic_supports_direct_msis(struct kvm *kvm);
-int vgic_v4_init(struct kvm *kvm);
-void vgic_v4_teardown(struct kvm *kvm);
-int vgic_v4_sync_hwstate(struct kvm_vcpu *vcpu);
-int vgic_v4_flush_hwstate(struct kvm_vcpu *vcpu);
-
-#endif
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 110cbe3f74f8..b8aaa96b799b 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * kvm asynchronous fault support
  *
@@ -5,19 +6,6 @@
  *
  * Author:
  *      Gleb Natapov <gleb@redhat.com>
- *
- * This file is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
  */
 
 #include <linux/kvm_host.h>
@@ -29,21 +17,6 @@
 #include "async_pf.h"
 #include <trace/events/kvm.h>
 
-static inline void kvm_async_page_present_sync(struct kvm_vcpu *vcpu,
-					       struct kvm_async_pf *work)
-{
-#ifdef CONFIG_KVM_ASYNC_PF_SYNC
-	kvm_arch_async_page_present(vcpu, work);
-#endif
-}
-static inline void kvm_async_page_present_async(struct kvm_vcpu *vcpu,
-						struct kvm_async_pf *work)
-{
-#ifndef CONFIG_KVM_ASYNC_PF_SYNC
-	kvm_arch_async_page_present(vcpu, work);
-#endif
-}
-
 static struct kmem_cache *async_pf_cache;
 
 int kvm_async_pf_init(void)
@@ -73,50 +46,79 @@ static void async_pf_execute(struct work_struct *work)
 {
 	struct kvm_async_pf *apf =
 		container_of(work, struct kvm_async_pf, work);
-	struct mm_struct *mm = apf->mm;
 	struct kvm_vcpu *vcpu = apf->vcpu;
+	struct mm_struct *mm = vcpu->kvm->mm;
 	unsigned long addr = apf->addr;
-	gva_t gva = apf->gva;
+	gpa_t cr2_or_gpa = apf->cr2_or_gpa;
 	int locked = 1;
+	bool first;
 
 	might_sleep();
 
 	/*
-	 * This work is run asynchronously to the task which owns
-	 * mm and might be done in another context, so we must
-	 * access remotely.
+	 * Attempt to pin the VM's host address space, and simply skip gup() if
+	 * acquiring a pin fail, i.e. if the process is exiting.  Note, KVM
+	 * holds a reference to its associated mm_struct until the very end of
+	 * kvm_destroy_vm(), i.e. the struct itself won't be freed before this
+	 * work item is fully processed.
 	 */
-	down_read(&mm->mmap_sem);
-	get_user_pages_remote(NULL, mm, addr, 1, FOLL_WRITE, NULL, NULL,
-			&locked);
-	if (locked)
-		up_read(&mm->mmap_sem);
+	if (mmget_not_zero(mm)) {
+		mmap_read_lock(mm);
+		get_user_pages_remote(mm, addr, 1, FOLL_WRITE, NULL, &locked);
+		if (locked)
+			mmap_read_unlock(mm);
+		mmput(mm);
+	}
 
-	kvm_async_page_present_sync(vcpu, apf);
+	/*
+	 * Notify and kick the vCPU even if faulting in the page failed, e.g.
+	 * so that the vCPU can retry the fault synchronously.
+	 */
+	if (IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC))
+		kvm_arch_async_page_present(vcpu, apf);
 
 	spin_lock(&vcpu->async_pf.lock);
+	first = list_empty(&vcpu->async_pf.done);
 	list_add_tail(&apf->link, &vcpu->async_pf.done);
-	apf->vcpu = NULL;
 	spin_unlock(&vcpu->async_pf.lock);
 
 	/*
-	 * apf may be freed by kvm_check_async_pf_completion() after
-	 * this point
+	 * The apf struct may be freed by kvm_check_async_pf_completion() as
+	 * soon as the lock is dropped.  Nullify it to prevent improper usage.
 	 */
+	apf = NULL;
 
-	trace_kvm_async_pf_completed(addr, gva);
+	if (!IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC) && first)
+		kvm_arch_async_page_present_queued(vcpu);
 
-	if (swq_has_sleeper(&vcpu->wq))
-		swake_up_one(&vcpu->wq);
+	trace_kvm_async_pf_completed(addr, cr2_or_gpa);
 
-	mmput(mm);
-	kvm_put_kvm(vcpu->kvm);
+	__kvm_vcpu_wake_up(vcpu);
 }
 
-void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
+static void kvm_flush_and_free_async_pf_work(struct kvm_async_pf *work)
 {
-	spin_lock(&vcpu->async_pf.lock);
+	/*
+	 * The async #PF is "done", but KVM must wait for the work item itself,
+	 * i.e. async_pf_execute(), to run to completion.  If KVM is a module,
+	 * KVM must ensure *no* code owned by the KVM (the module) can be run
+	 * after the last call to module_put().  Note, flushing the work item
+	 * is always required when the item is taken off the completion queue.
+	 * E.g. even if the vCPU handles the item in the "normal" path, the VM
+	 * could be terminated before async_pf_execute() completes.
+	 *
+	 * Wake all events skip the queue and go straight done, i.e. don't
+	 * need to be flushed (but sanity check that the work wasn't queued).
+	 */
+	if (work->wakeup_all)
+		WARN_ON_ONCE(work->work.func);
+	else
+		flush_work(&work->work);
+	kmem_cache_free(async_pf_cache, work);
+}
 
+void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
+{
 	/* cancel outstanding work queue item */
 	while (!list_empty(&vcpu->async_pf.queue)) {
 		struct kvm_async_pf *work =
@@ -124,32 +126,24 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
 					 typeof(*work), queue);
 		list_del(&work->queue);
 
-		/*
-		 * We know it's present in vcpu->async_pf.done, do
-		 * nothing here.
-		 */
-		if (!work->vcpu)
-			continue;
-
-		spin_unlock(&vcpu->async_pf.lock);
 #ifdef CONFIG_KVM_ASYNC_PF_SYNC
 		flush_work(&work->work);
 #else
-		if (cancel_work_sync(&work->work)) {
-			mmput(work->mm);
-			kvm_put_kvm(vcpu->kvm); /* == work->vcpu->kvm */
+		if (cancel_work_sync(&work->work))
 			kmem_cache_free(async_pf_cache, work);
-		}
 #endif
-		spin_lock(&vcpu->async_pf.lock);
 	}
 
+	spin_lock(&vcpu->async_pf.lock);
 	while (!list_empty(&vcpu->async_pf.done)) {
 		struct kvm_async_pf *work =
 			list_first_entry(&vcpu->async_pf.done,
 					 typeof(*work), link);
 		list_del(&work->link);
-		kmem_cache_free(async_pf_cache, work);
+
+		spin_unlock(&vcpu->async_pf.lock);
+		kvm_flush_and_free_async_pf_work(work);
+		spin_lock(&vcpu->async_pf.lock);
 	}
 	spin_unlock(&vcpu->async_pf.lock);
 
@@ -161,7 +155,7 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
 	struct kvm_async_pf *work;
 
 	while (!list_empty_careful(&vcpu->async_pf.done) &&
-	      kvm_arch_can_inject_async_page_present(vcpu)) {
+	      kvm_arch_can_dequeue_async_page_present(vcpu)) {
 		spin_lock(&vcpu->async_pf.lock);
 		work = list_first_entry(&vcpu->async_pf.done, typeof(*work),
 					      link);
@@ -169,64 +163,60 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
 		spin_unlock(&vcpu->async_pf.lock);
 
 		kvm_arch_async_page_ready(vcpu, work);
-		kvm_async_page_present_async(vcpu, work);
+		if (!IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC))
+			kvm_arch_async_page_present(vcpu, work);
 
 		list_del(&work->queue);
 		vcpu->async_pf.queued--;
-		kmem_cache_free(async_pf_cache, work);
+		kvm_flush_and_free_async_pf_work(work);
 	}
 }
 
-int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva,
-		       struct kvm_arch_async_pf *arch)
+/*
+ * Try to schedule a job to handle page fault asynchronously. Returns 'true' on
+ * success, 'false' on failure (page fault has to be handled synchronously).
+ */
+bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+			unsigned long hva, struct kvm_arch_async_pf *arch)
 {
 	struct kvm_async_pf *work;
 
 	if (vcpu->async_pf.queued >= ASYNC_PF_PER_VCPU)
-		return 0;
+		return false;
 
-	/* setup delayed work */
+	/* Arch specific code should not do async PF in this case */
+	if (unlikely(kvm_is_error_hva(hva)))
+		return false;
 
 	/*
 	 * do alloc nowait since if we are going to sleep anyway we
 	 * may as well sleep faulting in page
 	 */
-	work = kmem_cache_zalloc(async_pf_cache, GFP_NOWAIT | __GFP_NOWARN);
+	work = kmem_cache_zalloc(async_pf_cache, GFP_NOWAIT);
 	if (!work)
-		return 0;
+		return false;
 
 	work->wakeup_all = false;
 	work->vcpu = vcpu;
-	work->gva = gva;
+	work->cr2_or_gpa = cr2_or_gpa;
 	work->addr = hva;
 	work->arch = *arch;
-	work->mm = current->mm;
-	mmget(work->mm);
-	kvm_get_kvm(work->vcpu->kvm);
-
-	/* this can't really happen otherwise gfn_to_pfn_async
-	   would succeed */
-	if (unlikely(kvm_is_error_hva(work->addr)))
-		goto retry_sync;
 
 	INIT_WORK(&work->work, async_pf_execute);
-	if (!schedule_work(&work->work))
-		goto retry_sync;
 
 	list_add_tail(&work->queue, &vcpu->async_pf.queue);
 	vcpu->async_pf.queued++;
-	kvm_arch_async_page_not_present(vcpu, work);
-	return 1;
-retry_sync:
-	kvm_put_kvm(work->vcpu->kvm);
-	mmput(work->mm);
-	kmem_cache_free(async_pf_cache, work);
-	return 0;
+	work->notpresent_injected = kvm_arch_async_page_not_present(vcpu, work);
+
+	schedule_work(&work->work);
+
+	return true;
 }
 
 int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu)
 {
 	struct kvm_async_pf *work;
+	bool first;
 
 	if (!list_empty_careful(&vcpu->async_pf.done))
 		return 0;
@@ -239,9 +229,13 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu)
 	INIT_LIST_HEAD(&work->queue); /* for list_del to work */
 
 	spin_lock(&vcpu->async_pf.lock);
+	first = list_empty(&vcpu->async_pf.done);
 	list_add_tail(&work->link, &vcpu->async_pf.done);
 	spin_unlock(&vcpu->async_pf.lock);
 
+	if (!IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC) && first)
+		kvm_arch_async_page_present_queued(vcpu);
+
 	vcpu->async_pf.queued++;
 	return 0;
 }
diff --git a/virt/kvm/async_pf.h b/virt/kvm/async_pf.h
index ec4cfa278f04..90d1a7d8c6de 100644
--- a/virt/kvm/async_pf.h
+++ b/virt/kvm/async_pf.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 /*
  * kvm asynchronous fault support
  *
@@ -5,19 +6,6 @@
  *
  * Author:
  *      Gleb Natapov <gleb@redhat.com>
- *
- * This file is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
  */
 
 #ifndef __KVM_ASYNC_PF_H__
diff --git a/virt/kvm/binary_stats.c b/virt/kvm/binary_stats.c
new file mode 100644
index 000000000000..eefca6c69f51
--- /dev/null
+++ b/virt/kvm/binary_stats.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KVM binary statistics interface implementation
+ *
+ * Copyright 2021 Google LLC
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/errno.h>
+#include <linux/uaccess.h>
+
+/**
+ * kvm_stats_read() - Common function to read from the binary statistics
+ * file descriptor.
+ *
+ * @id: identification string of the stats
+ * @header: stats header for a vm or a vcpu
+ * @desc: start address of an array of stats descriptors for a vm or a vcpu
+ * @stats: start address of stats data block for a vm or a vcpu
+ * @size_stats: the size of stats data block pointed by @stats
+ * @user_buffer: start address of userspace buffer
+ * @size: requested read size from userspace
+ * @offset: the start position from which the content will be read for the
+ *          corresponding vm or vcp file descriptor
+ *
+ * The file content of a vm/vcpu file descriptor is now defined as below:
+ * +-------------+
+ * |   Header    |
+ * +-------------+
+ * |  id string  |
+ * +-------------+
+ * | Descriptors |
+ * +-------------+
+ * | Stats Data  |
+ * +-------------+
+ * Although this function allows userspace to read any amount of data (as long
+ * as in the limit) from any position, the typical usage would follow below
+ * steps:
+ * 1. Read header from offset 0. Get the offset of descriptors and stats data
+ *    and some other necessary information. This is a one-time work for the
+ *    lifecycle of the corresponding vm/vcpu stats fd.
+ * 2. Read id string from its offset. This is a one-time work for the lifecycle
+ *    of the corresponding vm/vcpu stats fd.
+ * 3. Read descriptors from its offset and discover all the stats by parsing
+ *    descriptors. This is a one-time work for the lifecycle of the
+ *    corresponding vm/vcpu stats fd.
+ * 4. Periodically read stats data from its offset using pread.
+ *
+ * Return: the number of bytes that has been successfully read
+ */
+ssize_t kvm_stats_read(char *id, const struct kvm_stats_header *header,
+		       const struct _kvm_stats_desc *desc,
+		       void *stats, size_t size_stats,
+		       char __user *user_buffer, size_t size, loff_t *offset)
+{
+	ssize_t len;
+	ssize_t copylen;
+	ssize_t remain = size;
+	size_t size_desc;
+	size_t size_header;
+	void *src;
+	loff_t pos = *offset;
+	char __user *dest = user_buffer;
+
+	size_header = sizeof(*header);
+	size_desc = header->num_desc * sizeof(*desc);
+
+	len = KVM_STATS_NAME_SIZE + size_header + size_desc + size_stats - pos;
+	len = min(len, remain);
+	if (len <= 0)
+		return 0;
+	remain = len;
+
+	/*
+	 * Copy kvm stats header.
+	 * The header is the first block of content userspace usually read out.
+	 * The pos is 0 and the copylen and remain would be the size of header.
+	 * The copy of the header would be skipped if offset is larger than the
+	 * size of header. That usually happens when userspace reads stats
+	 * descriptors and stats data.
+	 */
+	copylen = size_header - pos;
+	copylen = min(copylen, remain);
+	if (copylen > 0) {
+		src = (void *)header + pos;
+		if (copy_to_user(dest, src, copylen))
+			return -EFAULT;
+		remain -= copylen;
+		pos += copylen;
+		dest += copylen;
+	}
+
+	/*
+	 * Copy kvm stats header id string.
+	 * The id string is unique for every vm/vcpu, which is stored in kvm
+	 * and kvm_vcpu structure.
+	 * The id string is part of the stat header from the perspective of
+	 * userspace, it is usually read out together with previous constant
+	 * header part and could be skipped for later descriptors and stats
+	 * data readings.
+	 */
+	copylen = header->id_offset + KVM_STATS_NAME_SIZE - pos;
+	copylen = min(copylen, remain);
+	if (copylen > 0) {
+		src = id + pos - header->id_offset;
+		if (copy_to_user(dest, src, copylen))
+			return -EFAULT;
+		remain -= copylen;
+		pos += copylen;
+		dest += copylen;
+	}
+
+	/*
+	 * Copy kvm stats descriptors.
+	 * The descriptors copy would be skipped in the typical case that
+	 * userspace periodically read stats data, since the pos would be
+	 * greater than the end address of descriptors
+	 * (header->header.desc_offset + size_desc) causing copylen <= 0.
+	 */
+	copylen = header->desc_offset + size_desc - pos;
+	copylen = min(copylen, remain);
+	if (copylen > 0) {
+		src = (void *)desc + pos - header->desc_offset;
+		if (copy_to_user(dest, src, copylen))
+			return -EFAULT;
+		remain -= copylen;
+		pos += copylen;
+		dest += copylen;
+	}
+
+	/* Copy kvm stats values */
+	copylen = header->data_offset + size_stats - pos;
+	copylen = min(copylen, remain);
+	if (copylen > 0) {
+		src = stats + pos - header->data_offset;
+		if (copy_to_user(dest, src, copylen))
+			return -EFAULT;
+		pos += copylen;
+	}
+
+	*offset = pos;
+	return len;
+}
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index 6855cce3e528..375d6285475e 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -40,52 +40,40 @@ static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev,
 	return 1;
 }
 
-static int coalesced_mmio_has_room(struct kvm_coalesced_mmio_dev *dev)
-{
-	struct kvm_coalesced_mmio_ring *ring;
-	unsigned avail;
-
-	/* Are we able to batch it ? */
-
-	/* last is the first free entry
-	 * check if we don't meet the first used entry
-	 * there is always one unused entry in the buffer
-	 */
-	ring = dev->kvm->coalesced_mmio_ring;
-	avail = (ring->first - ring->last - 1) % KVM_COALESCED_MMIO_MAX;
-	if (avail == 0) {
-		/* full */
-		return 0;
-	}
-
-	return 1;
-}
-
 static int coalesced_mmio_write(struct kvm_vcpu *vcpu,
 				struct kvm_io_device *this, gpa_t addr,
 				int len, const void *val)
 {
 	struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
 	struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring;
+	__u32 insert;
 
 	if (!coalesced_mmio_in_range(dev, addr, len))
 		return -EOPNOTSUPP;
 
 	spin_lock(&dev->kvm->ring_lock);
 
-	if (!coalesced_mmio_has_room(dev)) {
+	/*
+	 * last is the index of the entry to fill.  Verify userspace hasn't
+	 * set last to be out of range, and that there is room in the ring.
+	 * Leave one entry free in the ring so that userspace can differentiate
+	 * between an empty ring and a full ring.
+	 */
+	insert = READ_ONCE(ring->last);
+	if (insert >= KVM_COALESCED_MMIO_MAX ||
+	    (insert + 1) % KVM_COALESCED_MMIO_MAX == READ_ONCE(ring->first)) {
 		spin_unlock(&dev->kvm->ring_lock);
 		return -EOPNOTSUPP;
 	}
 
 	/* copy data in first free entry of the ring */
 
-	ring->coalesced_mmio[ring->last].phys_addr = addr;
-	ring->coalesced_mmio[ring->last].len = len;
-	memcpy(ring->coalesced_mmio[ring->last].data, val, len);
-	ring->coalesced_mmio[ring->last].pio = dev->zone.pio;
+	ring->coalesced_mmio[insert].phys_addr = addr;
+	ring->coalesced_mmio[insert].len = len;
+	memcpy(ring->coalesced_mmio[insert].data, val, len);
+	ring->coalesced_mmio[insert].pio = dev->zone.pio;
 	smp_wmb();
-	ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX;
+	ring->last = (insert + 1) % KVM_COALESCED_MMIO_MAX;
 	spin_unlock(&dev->kvm->ring_lock);
 	return 0;
 }
@@ -107,26 +95,22 @@ static const struct kvm_io_device_ops coalesced_mmio_ops = {
 int kvm_coalesced_mmio_init(struct kvm *kvm)
 {
 	struct page *page;
-	int ret;
 
-	ret = -ENOMEM;
-	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+	page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
 	if (!page)
-		goto out_err;
+		return -ENOMEM;
 
-	ret = 0;
 	kvm->coalesced_mmio_ring = page_address(page);
 
 	/*
 	 * We're using this spinlock to sync access to the coalesced ring.
-	 * The list doesn't need it's own lock since device registration and
+	 * The list doesn't need its own lock since device registration and
 	 * unregistration should only happen when kvm->slots_lock is held.
 	 */
 	spin_lock_init(&kvm->ring_lock);
 	INIT_LIST_HEAD(&kvm->coalesced_zones);
 
-out_err:
-	return ret;
+	return 0;
 }
 
 void kvm_coalesced_mmio_free(struct kvm *kvm)
@@ -144,7 +128,8 @@ int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
 	if (zone->pio != 1 && zone->pio != 0)
 		return -EINVAL;
 
-	dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL);
+	dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev),
+		      GFP_KERNEL_ACCOUNT);
 	if (!dev)
 		return -ENOMEM;
 
@@ -174,21 +159,33 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
 					   struct kvm_coalesced_mmio_zone *zone)
 {
 	struct kvm_coalesced_mmio_dev *dev, *tmp;
+	int r;
 
 	if (zone->pio != 1 && zone->pio != 0)
 		return -EINVAL;
 
 	mutex_lock(&kvm->slots_lock);
 
-	list_for_each_entry_safe(dev, tmp, &kvm->coalesced_zones, list)
+	list_for_each_entry_safe(dev, tmp, &kvm->coalesced_zones, list) {
 		if (zone->pio == dev->zone.pio &&
 		    coalesced_mmio_in_range(dev, zone->addr, zone->size)) {
-			kvm_io_bus_unregister_dev(kvm,
+			r = kvm_io_bus_unregister_dev(kvm,
 				zone->pio ? KVM_PIO_BUS : KVM_MMIO_BUS, &dev->dev);
-			kvm_iodevice_destructor(&dev->dev);
+			/*
+			 * On failure, unregister destroys all devices on the
+			 * bus, including the target device. There's no need
+			 * to restart the walk as there aren't any zones left.
+			 */
+			if (r)
+				break;
 		}
+	}
 
 	mutex_unlock(&kvm->slots_lock);
 
+	/*
+	 * Ignore the result of kvm_io_bus_unregister_dev(), from userspace's
+	 * perspective, the coalesced MMIO is most definitely unregistered.
+	 */
 	return 0;
 }
diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c
new file mode 100644
index 000000000000..02bc6b00d76c
--- /dev/null
+++ b/virt/kvm/dirty_ring.c
@@ -0,0 +1,271 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KVM dirty ring implementation
+ *
+ * Copyright 2019 Red Hat, Inc.
+ */
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/vmalloc.h>
+#include <linux/kvm_dirty_ring.h>
+#include <trace/events/kvm.h>
+#include "kvm_mm.h"
+
+int __weak kvm_cpu_dirty_log_size(struct kvm *kvm)
+{
+	return 0;
+}
+
+u32 kvm_dirty_ring_get_rsvd_entries(struct kvm *kvm)
+{
+	return KVM_DIRTY_RING_RSVD_ENTRIES + kvm_cpu_dirty_log_size(kvm);
+}
+
+bool kvm_use_dirty_bitmap(struct kvm *kvm)
+{
+	lockdep_assert_held(&kvm->slots_lock);
+
+	return !kvm->dirty_ring_size || kvm->dirty_ring_with_bitmap;
+}
+
+#ifndef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP
+bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm)
+{
+	return false;
+}
+#endif
+
+static u32 kvm_dirty_ring_used(struct kvm_dirty_ring *ring)
+{
+	return READ_ONCE(ring->dirty_index) - READ_ONCE(ring->reset_index);
+}
+
+static bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring)
+{
+	return kvm_dirty_ring_used(ring) >= ring->soft_limit;
+}
+
+static bool kvm_dirty_ring_full(struct kvm_dirty_ring *ring)
+{
+	return kvm_dirty_ring_used(ring) >= ring->size;
+}
+
+static void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask)
+{
+	struct kvm_memory_slot *memslot;
+	int as_id, id;
+
+	as_id = slot >> 16;
+	id = (u16)slot;
+
+	if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
+		return;
+
+	memslot = id_to_memslot(__kvm_memslots(kvm, as_id), id);
+
+	if (!memslot || (offset + __fls(mask)) >= memslot->npages)
+		return;
+
+	KVM_MMU_LOCK(kvm);
+	kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask);
+	KVM_MMU_UNLOCK(kvm);
+}
+
+int kvm_dirty_ring_alloc(struct kvm *kvm, struct kvm_dirty_ring *ring,
+			 int index, u32 size)
+{
+	ring->dirty_gfns = vzalloc(size);
+	if (!ring->dirty_gfns)
+		return -ENOMEM;
+
+	ring->size = size / sizeof(struct kvm_dirty_gfn);
+	ring->soft_limit = ring->size - kvm_dirty_ring_get_rsvd_entries(kvm);
+	ring->dirty_index = 0;
+	ring->reset_index = 0;
+	ring->index = index;
+
+	return 0;
+}
+
+static inline void kvm_dirty_gfn_set_invalid(struct kvm_dirty_gfn *gfn)
+{
+	smp_store_release(&gfn->flags, 0);
+}
+
+static inline void kvm_dirty_gfn_set_dirtied(struct kvm_dirty_gfn *gfn)
+{
+	gfn->flags = KVM_DIRTY_GFN_F_DIRTY;
+}
+
+static inline bool kvm_dirty_gfn_harvested(struct kvm_dirty_gfn *gfn)
+{
+	return smp_load_acquire(&gfn->flags) & KVM_DIRTY_GFN_F_RESET;
+}
+
+int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring,
+			 int *nr_entries_reset)
+{
+	/*
+	 * To minimize mmu_lock contention, batch resets for harvested entries
+	 * whose gfns are in the same slot, and are within N frame numbers of
+	 * each other, where N is the number of bits in an unsigned long.  For
+	 * simplicity, process the current set of entries when the next entry
+	 * can't be included in the batch.
+	 *
+	 * Track the current batch slot, the gfn offset into the slot for the
+	 * batch, and the bitmask of gfns that need to be reset (relative to
+	 * offset).  Note, the offset may be adjusted backwards, e.g. so that
+	 * a sequence of gfns X, X-1, ... X-N-1 can be batched.
+	 */
+	u32 cur_slot, next_slot;
+	u64 cur_offset, next_offset;
+	unsigned long mask = 0;
+	struct kvm_dirty_gfn *entry;
+
+	/*
+	 * Ensure concurrent calls to KVM_RESET_DIRTY_RINGS are serialized,
+	 * e.g. so that KVM fully resets all entries processed by a given call
+	 * before returning to userspace.  Holding slots_lock also protects
+	 * the various memslot accesses.
+	 */
+	lockdep_assert_held(&kvm->slots_lock);
+
+	while (likely((*nr_entries_reset) < INT_MAX)) {
+		if (signal_pending(current))
+			return -EINTR;
+
+		entry = &ring->dirty_gfns[ring->reset_index & (ring->size - 1)];
+
+		if (!kvm_dirty_gfn_harvested(entry))
+			break;
+
+		next_slot = READ_ONCE(entry->slot);
+		next_offset = READ_ONCE(entry->offset);
+
+		/* Update the flags to reflect that this GFN is reset */
+		kvm_dirty_gfn_set_invalid(entry);
+
+		ring->reset_index++;
+		(*nr_entries_reset)++;
+
+		if (mask) {
+			/*
+			 * While the size of each ring is fixed, it's possible
+			 * for the ring to be constantly re-dirtied/harvested
+			 * while the reset is in-progress (the hard limit exists
+			 * only to guard against the count becoming negative).
+			 */
+			cond_resched();
+
+			/*
+			 * Try to coalesce the reset operations when the guest
+			 * is scanning pages in the same slot.
+			 */
+			if (next_slot == cur_slot) {
+				s64 delta = next_offset - cur_offset;
+
+				if (delta >= 0 && delta < BITS_PER_LONG) {
+					mask |= 1ull << delta;
+					continue;
+				}
+
+				/* Backwards visit, careful about overflows! */
+				if (delta > -BITS_PER_LONG && delta < 0 &&
+				(mask << -delta >> -delta) == mask) {
+					cur_offset = next_offset;
+					mask = (mask << -delta) | 1;
+					continue;
+				}
+			}
+
+			/*
+			 * Reset the slot for all the harvested entries that
+			 * have been gathered, but not yet fully processed.
+			 */
+			kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
+		}
+
+		/*
+		 * The current slot was reset or this is the first harvested
+		 * entry, (re)initialize the metadata.
+		 */
+		cur_slot = next_slot;
+		cur_offset = next_offset;
+		mask = 1;
+	}
+
+	/*
+	 * Perform a final reset if there are harvested entries that haven't
+	 * been processed, which is guaranteed if at least one harvested was
+	 * found.  The loop only performs a reset when the "next" entry can't
+	 * be batched with the "current" entry(s), and that reset processes the
+	 * _current_ entry(s); i.e. the last harvested entry, a.k.a. next, will
+	 * always be left pending.
+	 */
+	if (mask)
+		kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
+
+	/*
+	 * The request KVM_REQ_DIRTY_RING_SOFT_FULL will be cleared
+	 * by the VCPU thread next time when it enters the guest.
+	 */
+
+	trace_kvm_dirty_ring_reset(ring);
+
+	return 0;
+}
+
+void kvm_dirty_ring_push(struct kvm_vcpu *vcpu, u32 slot, u64 offset)
+{
+	struct kvm_dirty_ring *ring = &vcpu->dirty_ring;
+	struct kvm_dirty_gfn *entry;
+
+	/* It should never get full */
+	WARN_ON_ONCE(kvm_dirty_ring_full(ring));
+
+	entry = &ring->dirty_gfns[ring->dirty_index & (ring->size - 1)];
+
+	entry->slot = slot;
+	entry->offset = offset;
+	/*
+	 * Make sure the data is filled in before we publish this to
+	 * the userspace program.  There's no paired kernel-side reader.
+	 */
+	smp_wmb();
+	kvm_dirty_gfn_set_dirtied(entry);
+	ring->dirty_index++;
+	trace_kvm_dirty_ring_push(ring, slot, offset);
+
+	if (kvm_dirty_ring_soft_full(ring))
+		kvm_make_request(KVM_REQ_DIRTY_RING_SOFT_FULL, vcpu);
+}
+
+bool kvm_dirty_ring_check_request(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * The VCPU isn't runnable when the dirty ring becomes soft full.
+	 * The KVM_REQ_DIRTY_RING_SOFT_FULL event is always set to prevent
+	 * the VCPU from running until the dirty pages are harvested and
+	 * the dirty ring is reset by userspace.
+	 */
+	if (kvm_check_request(KVM_REQ_DIRTY_RING_SOFT_FULL, vcpu) &&
+	    kvm_dirty_ring_soft_full(&vcpu->dirty_ring)) {
+		kvm_make_request(KVM_REQ_DIRTY_RING_SOFT_FULL, vcpu);
+		vcpu->run->exit_reason = KVM_EXIT_DIRTY_RING_FULL;
+		trace_kvm_dirty_ring_exit(vcpu);
+		return true;
+	}
+
+	return false;
+}
+
+struct page *kvm_dirty_ring_get_page(struct kvm_dirty_ring *ring, u32 offset)
+{
+	return vmalloc_to_page((void *)ring->dirty_gfns + offset * PAGE_SIZE);
+}
+
+void kvm_dirty_ring_free(struct kvm_dirty_ring *ring)
+{
+	vfree(ring->dirty_gfns);
+	ring->dirty_gfns = NULL;
+}
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index b20b751286fc..0e8b5277be3b 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * kvm eventfd support - use eventfd objects to signal various KVM events
  *
@@ -6,19 +7,6 @@
  *
  * Author:
  *	Gregory Haskins <ghaskins@novell.com>
- *
- * This file is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
  */
 
 #include <linux/kvm_host.h>
@@ -40,10 +28,16 @@
 
 #include <kvm/iodev.h>
 
-#ifdef CONFIG_HAVE_KVM_IRQFD
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
 
 static struct workqueue_struct *irqfd_cleanup_wq;
 
+bool __attribute__((weak))
+kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
+{
+	return true;
+}
+
 static void
 irqfd_inject(struct work_struct *work)
 {
@@ -61,6 +55,15 @@ irqfd_inject(struct work_struct *work)
 			    irqfd->gsi, 1, false);
 }
 
+static void irqfd_resampler_notify(struct kvm_kernel_irqfd_resampler *resampler)
+{
+	struct kvm_kernel_irqfd *irqfd;
+
+	list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link,
+				 srcu_read_lock_held(&resampler->kvm->irq_srcu))
+		eventfd_signal(irqfd->resamplefd);
+}
+
 /*
  * Since resampler irqfds share an IRQ source ID, we de-assert once
  * then notify all of the resampler irqfds using this GSI.  We can't
@@ -71,7 +74,6 @@ irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
 {
 	struct kvm_kernel_irqfd_resampler *resampler;
 	struct kvm *kvm;
-	struct kvm_kernel_irqfd *irqfd;
 	int idx;
 
 	resampler = container_of(kian,
@@ -82,10 +84,7 @@ irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
 		    resampler->notifier.gsi, 0, false);
 
 	idx = srcu_read_lock(&kvm->irq_srcu);
-
-	list_for_each_entry_rcu(irqfd, &resampler->list, resampler_link)
-		eventfd_signal(irqfd->resamplefd, 1);
-
+	irqfd_resampler_notify(resampler);
 	srcu_read_unlock(&kvm->irq_srcu, idx);
 }
 
@@ -98,14 +97,19 @@ irqfd_resampler_shutdown(struct kvm_kernel_irqfd *irqfd)
 	mutex_lock(&kvm->irqfds.resampler_lock);
 
 	list_del_rcu(&irqfd->resampler_link);
-	synchronize_srcu(&kvm->irq_srcu);
 
 	if (list_empty(&resampler->list)) {
-		list_del(&resampler->link);
+		list_del_rcu(&resampler->link);
 		kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
+		/*
+		 * synchronize_srcu_expedited(&kvm->irq_srcu) already called
+		 * in kvm_unregister_irq_ack_notifier().
+		 */
 		kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
 			    resampler->notifier.gsi, 0, false);
 		kfree(resampler);
+	} else {
+		synchronize_srcu_expedited(&kvm->irq_srcu);
 	}
 
 	mutex_unlock(&kvm->irqfds.resampler_lock);
@@ -122,8 +126,8 @@ irqfd_shutdown(struct work_struct *work)
 	struct kvm *kvm = irqfd->kvm;
 	u64 cnt;
 
-	/* Make sure irqfd has been initalized in assign path. */
-	synchronize_srcu(&kvm->irq_srcu);
+	/* Make sure irqfd has been initialized in assign path. */
+	synchronize_srcu_expedited(&kvm->irq_srcu);
 
 	/*
 	 * Synchronize with the wait-queue and unhook ourselves to prevent
@@ -145,7 +149,7 @@ irqfd_shutdown(struct work_struct *work)
 	/*
 	 * It is now safe to release the object's resources
 	 */
-#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
+#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
 	irq_bypass_unregister_consumer(&irqfd->consumer);
 #endif
 	eventfd_ctx_put(irqfd->eventfd);
@@ -197,8 +201,17 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
 	struct kvm *kvm = irqfd->kvm;
 	unsigned seq;
 	int idx;
+	int ret = 0;
 
 	if (flags & EPOLLIN) {
+		/*
+		 * WARNING: Do NOT take irqfds.lock in any path except EPOLLHUP,
+		 * as KVM holds irqfds.lock when registering the irqfd with the
+		 * eventfd.
+		 */
+		u64 cnt;
+		eventfd_ctx_do_read(irqfd->eventfd, &cnt);
+
 		idx = srcu_read_lock(&kvm->irq_srcu);
 		do {
 			seq = read_seqcount_begin(&irqfd->irq_entry_sc);
@@ -210,13 +223,19 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
 					      false) == -EWOULDBLOCK)
 			schedule_work(&irqfd->inject);
 		srcu_read_unlock(&kvm->irq_srcu, idx);
+		ret = 1;
 	}
 
 	if (flags & EPOLLHUP) {
 		/* The eventfd is closing, detach from KVM */
-		unsigned long flags;
+		unsigned long iflags;
 
-		spin_lock_irqsave(&kvm->irqfds.lock, flags);
+		/*
+		 * Taking irqfds.lock is safe here, as KVM holds a reference to
+		 * the eventfd when registering the irqfd, i.e. this path can't
+		 * be reached while kvm_irqfd_add() is running.
+		 */
+		spin_lock_irqsave(&kvm->irqfds.lock, iflags);
 
 		/*
 		 * We must check if someone deactivated the irqfd before
@@ -230,28 +249,20 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
 		if (irqfd_is_active(irqfd))
 			irqfd_deactivate(irqfd);
 
-		spin_unlock_irqrestore(&kvm->irqfds.lock, flags);
+		spin_unlock_irqrestore(&kvm->irqfds.lock, iflags);
 	}
 
-	return 0;
-}
-
-static void
-irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
-			poll_table *pt)
-{
-	struct kvm_kernel_irqfd *irqfd =
-		container_of(pt, struct kvm_kernel_irqfd, pt);
-	add_wait_queue(wqh, &irqfd->wait);
+	return ret;
 }
 
-/* Must be called under irqfds.lock */
 static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd)
 {
 	struct kvm_kernel_irq_routing_entry *e;
 	struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
 	int n_entries;
 
+	lockdep_assert_held(&kvm->irqfds.lock);
+
 	n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi);
 
 	write_seqcount_begin(&irqfd->irq_entry_sc);
@@ -265,7 +276,64 @@ static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd)
 	write_seqcount_end(&irqfd->irq_entry_sc);
 }
 
-#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
+struct kvm_irqfd_pt {
+	struct kvm_kernel_irqfd *irqfd;
+	struct kvm *kvm;
+	poll_table pt;
+	int ret;
+};
+
+static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh,
+			       poll_table *pt)
+{
+	struct kvm_irqfd_pt *p = container_of(pt, struct kvm_irqfd_pt, pt);
+	struct kvm_kernel_irqfd *irqfd = p->irqfd;
+	struct kvm *kvm = p->kvm;
+
+	/*
+	 * Note, irqfds.lock protects the irqfd's irq_entry, i.e. its routing,
+	 * and irqfds.items.  It does NOT protect registering with the eventfd.
+	 */
+	spin_lock_irq(&kvm->irqfds.lock);
+
+	/*
+	 * Initialize the routing information prior to adding the irqfd to the
+	 * eventfd's waitqueue, as irqfd_wakeup() can be invoked as soon as the
+	 * irqfd is registered.
+	 */
+	irqfd_update(kvm, irqfd);
+
+	/*
+	 * Add the irqfd as a priority waiter on the eventfd, with a custom
+	 * wake-up handler, so that KVM *and only KVM* is notified whenever the
+	 * underlying eventfd is signaled.
+	 */
+	init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
+
+	/*
+	 * Temporarily lie to lockdep about holding irqfds.lock to avoid a
+	 * false positive regarding potential deadlock with irqfd_wakeup()
+	 * (see irqfd_wakeup() for details).
+	 *
+	 * Adding to the wait queue will fail if there is already a priority
+	 * waiter, i.e. if the eventfd is associated with another irqfd (in any
+	 * VM).  Note, kvm_irqfd_deassign() waits for all in-flight shutdown
+	 * jobs to complete, i.e. ensures the irqfd has been removed from the
+	 * eventfd's waitqueue before returning to userspace.
+	 */
+	spin_release(&kvm->irqfds.lock.dep_map, _RET_IP_);
+	p->ret = add_wait_queue_priority_exclusive(wqh, &irqfd->wait);
+	spin_acquire(&kvm->irqfds.lock.dep_map, 0, 0, _RET_IP_);
+	if (p->ret)
+		goto out;
+
+	list_add_tail(&irqfd->list, &kvm->irqfds.items);
+
+out:
+	spin_unlock_irq(&kvm->irqfds.lock);
+}
+
+#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
 void __attribute__((weak)) kvm_arch_irq_bypass_stop(
 				struct irq_bypass_consumer *cons)
 {
@@ -276,20 +344,20 @@ void __attribute__((weak)) kvm_arch_irq_bypass_start(
 {
 }
 
-int  __attribute__((weak)) kvm_arch_update_irqfd_routing(
-				struct kvm *kvm, unsigned int host_irq,
-				uint32_t guest_irq, bool set)
+void __weak kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
+					  struct kvm_kernel_irq_routing_entry *old,
+					  struct kvm_kernel_irq_routing_entry *new)
 {
-	return 0;
+
 }
 #endif
 
 static int
 kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
 {
-	struct kvm_kernel_irqfd *irqfd, *tmp;
-	struct fd f;
+	struct kvm_kernel_irqfd *irqfd;
 	struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
+	struct kvm_irqfd_pt irqfd_pt;
 	int ret;
 	__poll_t events;
 	int idx;
@@ -297,7 +365,10 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
 	if (!kvm_arch_intc_initialized(kvm))
 		return -EAGAIN;
 
-	irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
+	if (!kvm_arch_irqfd_allowed(kvm, args))
+		return -EINVAL;
+
+	irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL_ACCOUNT);
 	if (!irqfd)
 		return -ENOMEM;
 
@@ -306,18 +377,18 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
 	INIT_LIST_HEAD(&irqfd->list);
 	INIT_WORK(&irqfd->inject, irqfd_inject);
 	INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
-	seqcount_init(&irqfd->irq_entry_sc);
+	seqcount_spinlock_init(&irqfd->irq_entry_sc, &kvm->irqfds.lock);
 
-	f = fdget(args->fd);
-	if (!f.file) {
+	CLASS(fd, f)(args->fd);
+	if (fd_empty(f)) {
 		ret = -EBADF;
 		goto out;
 	}
 
-	eventfd = eventfd_ctx_fileget(f.file);
+	eventfd = eventfd_ctx_fileget(fd_file(f));
 	if (IS_ERR(eventfd)) {
 		ret = PTR_ERR(eventfd);
-		goto fail;
+		goto out;
 	}
 
 	irqfd->eventfd = eventfd;
@@ -345,7 +416,8 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
 		}
 
 		if (!irqfd->resampler) {
-			resampler = kzalloc(sizeof(*resampler), GFP_KERNEL);
+			resampler = kzalloc(sizeof(*resampler),
+					    GFP_KERNEL_ACCOUNT);
 			if (!resampler) {
 				ret = -ENOMEM;
 				mutex_unlock(&kvm->irqfds.resampler_lock);
@@ -358,76 +430,67 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
 			resampler->notifier.irq_acked = irqfd_resampler_ack;
 			INIT_LIST_HEAD(&resampler->link);
 
-			list_add(&resampler->link, &kvm->irqfds.resampler_list);
+			list_add_rcu(&resampler->link, &kvm->irqfds.resampler_list);
 			kvm_register_irq_ack_notifier(kvm,
 						      &resampler->notifier);
 			irqfd->resampler = resampler;
 		}
 
 		list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list);
-		synchronize_srcu(&kvm->irq_srcu);
+		synchronize_srcu_expedited(&kvm->irq_srcu);
 
 		mutex_unlock(&kvm->irqfds.resampler_lock);
 	}
 
 	/*
-	 * Install our own custom wake-up handling so we are notified via
-	 * a callback whenever someone signals the underlying eventfd
+	 * Set the irqfd routing and add it to KVM's list before registering
+	 * the irqfd with the eventfd, so that the routing information is valid
+	 * and stays valid, e.g. if there are GSI routing changes, prior to
+	 * making the irqfd visible, i.e. before it might be signaled.
+	 *
+	 * Note, holding SRCU ensures a stable read of routing information, and
+	 * also prevents irqfd_shutdown() from freeing the irqfd before it's
+	 * fully initialized.
 	 */
-	init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
-	init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);
-
-	spin_lock_irq(&kvm->irqfds.lock);
-
-	ret = 0;
-	list_for_each_entry(tmp, &kvm->irqfds.items, list) {
-		if (irqfd->eventfd != tmp->eventfd)
-			continue;
-		/* This fd is used for another irq already. */
-		ret = -EBUSY;
-		spin_unlock_irq(&kvm->irqfds.lock);
-		goto fail;
-	}
-
 	idx = srcu_read_lock(&kvm->irq_srcu);
-	irqfd_update(kvm, irqfd);
-
-	list_add_tail(&irqfd->list, &kvm->irqfds.items);
-
-	spin_unlock_irq(&kvm->irqfds.lock);
 
 	/*
-	 * Check if there was an event already pending on the eventfd
-	 * before we registered, and trigger it as if we didn't miss it.
+	 * Register the irqfd with the eventfd by polling on the eventfd, and
+	 * simultaneously and the irqfd to KVM's list.  If there was en event
+	 * pending on the eventfd prior to registering, manually trigger IRQ
+	 * injection.
 	 */
-	events = vfs_poll(f.file, &irqfd->pt);
+	irqfd_pt.irqfd = irqfd;
+	irqfd_pt.kvm = kvm;
+	init_poll_funcptr(&irqfd_pt.pt, kvm_irqfd_register);
+
+	events = vfs_poll(fd_file(f), &irqfd_pt.pt);
+
+	ret = irqfd_pt.ret;
+	if (ret)
+		goto fail_poll;
 
 	if (events & EPOLLIN)
 		schedule_work(&irqfd->inject);
 
-#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
+#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
 	if (kvm_arch_has_irq_bypass()) {
-		irqfd->consumer.token = (void *)irqfd->eventfd;
 		irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer;
 		irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer;
 		irqfd->consumer.stop = kvm_arch_irq_bypass_stop;
 		irqfd->consumer.start = kvm_arch_irq_bypass_start;
-		ret = irq_bypass_register_consumer(&irqfd->consumer);
+		ret = irq_bypass_register_consumer(&irqfd->consumer, irqfd->eventfd);
 		if (ret)
-			pr_info("irq bypass consumer (token %p) registration fails: %d\n",
-				irqfd->consumer.token, ret);
+			pr_info("irq bypass consumer (eventfd %p) registration fails: %d\n",
+				irqfd->eventfd, ret);
 	}
 #endif
 
 	srcu_read_unlock(&kvm->irq_srcu, idx);
-
-	/*
-	 * do not drop the file until the irqfd is fully initialized, otherwise
-	 * we might race against the EPOLLHUP
-	 */
-	fdput(f);
 	return 0;
 
+fail_poll:
+	srcu_read_unlock(&kvm->irq_srcu, idx);
 fail:
 	if (irqfd->resampler)
 		irqfd_resampler_shutdown(irqfd);
@@ -438,8 +501,6 @@ fail:
 	if (eventfd && !IS_ERR(eventfd))
 		eventfd_ctx_put(eventfd);
 
-	fdput(f);
-
 out:
 	kfree(irqfd);
 	return ret;
@@ -453,8 +514,8 @@ bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
 	idx = srcu_read_lock(&kvm->irq_srcu);
 	gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
 	if (gsi != -1)
-		hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
-					 link)
+		hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list,
+					  link, srcu_read_lock_held(&kvm->irq_srcu))
 			if (kian->gsi == gsi) {
 				srcu_read_unlock(&kvm->irq_srcu, idx);
 				return true;
@@ -464,14 +525,14 @@ bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
 
 	return false;
 }
-EXPORT_SYMBOL_GPL(kvm_irq_has_notifier);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_irq_has_notifier);
 
 void kvm_notify_acked_gsi(struct kvm *kvm, int gsi)
 {
 	struct kvm_irq_ack_notifier *kian;
 
-	hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
-				 link)
+	hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list,
+				  link, srcu_read_lock_held(&kvm->irq_srcu))
 		if (kian->gsi == gsi)
 			kian->irq_acked(kian);
 }
@@ -504,24 +565,10 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
 	mutex_lock(&kvm->irq_lock);
 	hlist_del_init_rcu(&kian->link);
 	mutex_unlock(&kvm->irq_lock);
-	synchronize_srcu(&kvm->irq_srcu);
+	synchronize_srcu_expedited(&kvm->irq_srcu);
 	kvm_arch_post_irq_ack_notifier_list_update(kvm);
 }
-#endif
-
-void
-kvm_eventfd_init(struct kvm *kvm)
-{
-#ifdef CONFIG_HAVE_KVM_IRQFD
-	spin_lock_init(&kvm->irqfds.lock);
-	INIT_LIST_HEAD(&kvm->irqfds.items);
-	INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
-	mutex_init(&kvm->irqfds.resampler_lock);
-#endif
-	INIT_LIST_HEAD(&kvm->ioeventfds);
-}
 
-#ifdef CONFIG_HAVE_KVM_IRQFD
 /*
  * shutdown any irqfd's that match fd+gsi
  */
@@ -603,7 +650,7 @@ kvm_irqfd_release(struct kvm *kvm)
 
 /*
  * Take note of a change in irq routing.
- * Caller must invoke synchronize_srcu(&kvm->irq_srcu) afterwards.
+ * Caller must invoke synchronize_srcu_expedited(&kvm->irq_srcu) afterwards.
  */
 void kvm_irq_routing_update(struct kvm *kvm)
 {
@@ -612,21 +659,47 @@ void kvm_irq_routing_update(struct kvm *kvm)
 	spin_lock_irq(&kvm->irqfds.lock);
 
 	list_for_each_entry(irqfd, &kvm->irqfds.items, list) {
+#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
+		/* Under irqfds.lock, so can read irq_entry safely */
+		struct kvm_kernel_irq_routing_entry old = irqfd->irq_entry;
+#endif
+
 		irqfd_update(kvm, irqfd);
 
-#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
-		if (irqfd->producer) {
-			int ret = kvm_arch_update_irqfd_routing(
-					irqfd->kvm, irqfd->producer->irq,
-					irqfd->gsi, 1);
-			WARN_ON(ret);
-		}
+#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
+		if (irqfd->producer)
+			kvm_arch_update_irqfd_routing(irqfd, &old, &irqfd->irq_entry);
 #endif
 	}
 
 	spin_unlock_irq(&kvm->irqfds.lock);
 }
 
+bool kvm_notify_irqfd_resampler(struct kvm *kvm,
+				unsigned int irqchip,
+				unsigned int pin)
+{
+	struct kvm_kernel_irqfd_resampler *resampler;
+	int gsi, idx;
+
+	idx = srcu_read_lock(&kvm->irq_srcu);
+	gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
+	if (gsi != -1) {
+		list_for_each_entry_srcu(resampler,
+					 &kvm->irqfds.resampler_list, link,
+					 srcu_read_lock_held(&kvm->irq_srcu)) {
+			if (resampler->notifier.gsi == gsi) {
+				irqfd_resampler_notify(resampler);
+				srcu_read_unlock(&kvm->irq_srcu, idx);
+				return true;
+			}
+		}
+	}
+	srcu_read_unlock(&kvm->irq_srcu, idx);
+
+	return false;
+}
+
 /*
  * create a host-wide workqueue for issuing deferred shutdown requests
  * aggregated from all vm* instances. We need our own isolated
@@ -634,7 +707,7 @@ void kvm_irq_routing_update(struct kvm *kvm)
  */
 int kvm_irqfd_init(void)
 {
-	irqfd_cleanup_wq = alloc_workqueue("kvm-irqfd-cleanup", 0, 0);
+	irqfd_cleanup_wq = alloc_workqueue("kvm-irqfd-cleanup", WQ_PERCPU, 0);
 	if (!irqfd_cleanup_wq)
 		return -ENOMEM;
 
@@ -723,7 +796,7 @@ ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
 		return false;
 	}
 
-	return _val == p->datamatch ? true : false;
+	return _val == p->datamatch;
 }
 
 /* MMIO/PIO writes trigger an event if the addr/val match */
@@ -736,7 +809,7 @@ ioeventfd_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
 	if (!ioeventfd_in_range(p, addr, len, val))
 		return -EOPNOTSUPP;
 
-	eventfd_signal(p->eventfd, 1);
+	eventfd_signal(p->eventfd);
 	return 0;
 }
 
@@ -797,7 +870,7 @@ static int kvm_assign_ioeventfd_idx(struct kvm *kvm,
 	if (IS_ERR(eventfd))
 		return PTR_ERR(eventfd);
 
-	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	p = kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT);
 	if (!p) {
 		ret = -ENOMEM;
 		goto fail;
@@ -839,9 +912,9 @@ static int kvm_assign_ioeventfd_idx(struct kvm *kvm,
 
 unlock_fail:
 	mutex_unlock(&kvm->slots_lock);
+	kfree(p);
 
 fail:
-	kfree(p);
 	eventfd_ctx_put(eventfd);
 
 	return ret;
@@ -851,20 +924,21 @@ static int
 kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx,
 			   struct kvm_ioeventfd *args)
 {
-	struct _ioeventfd        *p, *tmp;
+	struct _ioeventfd        *p;
 	struct eventfd_ctx       *eventfd;
 	struct kvm_io_bus	 *bus;
 	int                       ret = -ENOENT;
+	bool                      wildcard;
 
 	eventfd = eventfd_ctx_fdget(args->fd);
 	if (IS_ERR(eventfd))
 		return PTR_ERR(eventfd);
 
-	mutex_lock(&kvm->slots_lock);
+	wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);
 
-	list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) {
-		bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);
+	mutex_lock(&kvm->slots_lock);
 
+	list_for_each_entry(p, &kvm->ioeventfds, list) {
 		if (p->bus_idx != bus_idx ||
 		    p->eventfd != eventfd  ||
 		    p->addr != args->addr  ||
@@ -879,7 +953,6 @@ kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx,
 		bus = kvm_get_bus(kvm, bus_idx);
 		if (bus)
 			bus->ioeventfd_count--;
-		ioeventfd_release(p);
 		ret = 0;
 		break;
 	}
@@ -962,3 +1035,15 @@ kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 
 	return kvm_assign_ioeventfd(kvm, args);
 }
+
+void
+kvm_eventfd_init(struct kvm *kvm)
+{
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
+	spin_lock_init(&kvm->irqfds.lock);
+	INIT_LIST_HEAD(&kvm->irqfds.items);
+	INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
+	mutex_init(&kvm->irqfds.resampler_lock);
+#endif
+	INIT_LIST_HEAD(&kvm->ioeventfds);
+}
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
new file mode 100644
index 000000000000..fdaea3422c30
--- /dev/null
+++ b/virt/kvm/guest_memfd.c
@@ -0,0 +1,1016 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/anon_inodes.h>
+#include <linux/backing-dev.h>
+#include <linux/falloc.h>
+#include <linux/fs.h>
+#include <linux/kvm_host.h>
+#include <linux/mempolicy.h>
+#include <linux/pseudo_fs.h>
+#include <linux/pagemap.h>
+
+#include "kvm_mm.h"
+
+static struct vfsmount *kvm_gmem_mnt;
+
+/*
+ * A guest_memfd instance can be associated multiple VMs, each with its own
+ * "view" of the underlying physical memory.
+ *
+ * The gmem's inode is effectively the raw underlying physical storage, and is
+ * used to track properties of the physical memory, while each gmem file is
+ * effectively a single VM's view of that storage, and is used to track assets
+ * specific to its associated VM, e.g. memslots=>gmem bindings.
+ */
+struct gmem_file {
+	struct kvm *kvm;
+	struct xarray bindings;
+	struct list_head entry;
+};
+
+struct gmem_inode {
+	struct shared_policy policy;
+	struct inode vfs_inode;
+
+	u64 flags;
+};
+
+static __always_inline struct gmem_inode *GMEM_I(struct inode *inode)
+{
+	return container_of(inode, struct gmem_inode, vfs_inode);
+}
+
+#define kvm_gmem_for_each_file(f, mapping) \
+	list_for_each_entry(f, &(mapping)->i_private_list, entry)
+
+/**
+ * folio_file_pfn - like folio_file_page, but return a pfn.
+ * @folio: The folio which contains this index.
+ * @index: The index we want to look up.
+ *
+ * Return: The pfn for this index.
+ */
+static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
+{
+	return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
+}
+
+static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+	return gfn - slot->base_gfn + slot->gmem.pgoff;
+}
+
+static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
+				    pgoff_t index, struct folio *folio)
+{
+#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
+	kvm_pfn_t pfn = folio_file_pfn(folio, index);
+	gfn_t gfn = slot->base_gfn + index - slot->gmem.pgoff;
+	int rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, folio_order(folio));
+	if (rc) {
+		pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n",
+				    index, gfn, pfn, rc);
+		return rc;
+	}
+#endif
+
+	return 0;
+}
+
+static inline void kvm_gmem_mark_prepared(struct folio *folio)
+{
+	folio_mark_uptodate(folio);
+}
+
+/*
+ * Process @folio, which contains @gfn, so that the guest can use it.
+ * The folio must be locked and the gfn must be contained in @slot.
+ * On successful return the guest sees a zero page so as to avoid
+ * leaking host data and the up-to-date flag is set.
+ */
+static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
+				  gfn_t gfn, struct folio *folio)
+{
+	unsigned long nr_pages, i;
+	pgoff_t index;
+	int r;
+
+	nr_pages = folio_nr_pages(folio);
+	for (i = 0; i < nr_pages; i++)
+		clear_highpage(folio_page(folio, i));
+
+	/*
+	 * Preparing huge folios should always be safe, since it should
+	 * be possible to split them later if needed.
+	 *
+	 * Right now the folio order is always going to be zero, but the
+	 * code is ready for huge folios.  The only assumption is that
+	 * the base pgoff of memslots is naturally aligned with the
+	 * requested page order, ensuring that huge folios can also use
+	 * huge page table entries for GPA->HPA mapping.
+	 *
+	 * The order will be passed when creating the guest_memfd, and
+	 * checked when creating memslots.
+	 */
+	WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, folio_nr_pages(folio)));
+	index = kvm_gmem_get_index(slot, gfn);
+	index = ALIGN_DOWN(index, folio_nr_pages(folio));
+	r = __kvm_gmem_prepare_folio(kvm, slot, index, folio);
+	if (!r)
+		kvm_gmem_mark_prepared(folio);
+
+	return r;
+}
+
+/*
+ * Returns a locked folio on success.  The caller is responsible for
+ * setting the up-to-date flag before the memory is mapped into the guest.
+ * There is no backing storage for the memory, so the folio will remain
+ * up-to-date until it's removed.
+ *
+ * Ignore accessed, referenced, and dirty flags.  The memory is
+ * unevictable and there is no storage to write back to.
+ */
+static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
+{
+	/* TODO: Support huge pages. */
+	struct mempolicy *policy;
+	struct folio *folio;
+
+	/*
+	 * Fast-path: See if folio is already present in mapping to avoid
+	 * policy_lookup.
+	 */
+	folio = __filemap_get_folio(inode->i_mapping, index,
+				    FGP_LOCK | FGP_ACCESSED, 0);
+	if (!IS_ERR(folio))
+		return folio;
+
+	policy = mpol_shared_policy_lookup(&GMEM_I(inode)->policy, index);
+	folio = __filemap_get_folio_mpol(inode->i_mapping, index,
+					 FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+					 mapping_gfp_mask(inode->i_mapping), policy);
+	mpol_cond_put(policy);
+
+	return folio;
+}
+
+static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *inode)
+{
+	if (GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED)
+		return KVM_FILTER_SHARED;
+
+	return KVM_FILTER_PRIVATE;
+}
+
+static void __kvm_gmem_invalidate_begin(struct gmem_file *f, pgoff_t start,
+					pgoff_t end,
+					enum kvm_gfn_range_filter attr_filter)
+{
+	bool flush = false, found_memslot = false;
+	struct kvm_memory_slot *slot;
+	struct kvm *kvm = f->kvm;
+	unsigned long index;
+
+	xa_for_each_range(&f->bindings, index, slot, start, end - 1) {
+		pgoff_t pgoff = slot->gmem.pgoff;
+
+		struct kvm_gfn_range gfn_range = {
+			.start = slot->base_gfn + max(pgoff, start) - pgoff,
+			.end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
+			.slot = slot,
+			.may_block = true,
+			.attr_filter = attr_filter,
+		};
+
+		if (!found_memslot) {
+			found_memslot = true;
+
+			KVM_MMU_LOCK(kvm);
+			kvm_mmu_invalidate_begin(kvm);
+		}
+
+		flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
+	}
+
+	if (flush)
+		kvm_flush_remote_tlbs(kvm);
+
+	if (found_memslot)
+		KVM_MMU_UNLOCK(kvm);
+}
+
+static void kvm_gmem_invalidate_begin(struct inode *inode, pgoff_t start,
+				      pgoff_t end)
+{
+	enum kvm_gfn_range_filter attr_filter;
+	struct gmem_file *f;
+
+	attr_filter = kvm_gmem_get_invalidate_filter(inode);
+
+	kvm_gmem_for_each_file(f, inode->i_mapping)
+		__kvm_gmem_invalidate_begin(f, start, end, attr_filter);
+}
+
+static void __kvm_gmem_invalidate_end(struct gmem_file *f, pgoff_t start,
+				      pgoff_t end)
+{
+	struct kvm *kvm = f->kvm;
+
+	if (xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) {
+		KVM_MMU_LOCK(kvm);
+		kvm_mmu_invalidate_end(kvm);
+		KVM_MMU_UNLOCK(kvm);
+	}
+}
+
+static void kvm_gmem_invalidate_end(struct inode *inode, pgoff_t start,
+				    pgoff_t end)
+{
+	struct gmem_file *f;
+
+	kvm_gmem_for_each_file(f, inode->i_mapping)
+		__kvm_gmem_invalidate_end(f, start, end);
+}
+
+static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+{
+	pgoff_t start = offset >> PAGE_SHIFT;
+	pgoff_t end = (offset + len) >> PAGE_SHIFT;
+
+	/*
+	 * Bindings must be stable across invalidation to ensure the start+end
+	 * are balanced.
+	 */
+	filemap_invalidate_lock(inode->i_mapping);
+
+	kvm_gmem_invalidate_begin(inode, start, end);
+
+	truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);
+
+	kvm_gmem_invalidate_end(inode, start, end);
+
+	filemap_invalidate_unlock(inode->i_mapping);
+
+	return 0;
+}
+
+static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
+{
+	struct address_space *mapping = inode->i_mapping;
+	pgoff_t start, index, end;
+	int r;
+
+	/* Dedicated guest is immutable by default. */
+	if (offset + len > i_size_read(inode))
+		return -EINVAL;
+
+	filemap_invalidate_lock_shared(mapping);
+
+	start = offset >> PAGE_SHIFT;
+	end = (offset + len) >> PAGE_SHIFT;
+
+	r = 0;
+	for (index = start; index < end; ) {
+		struct folio *folio;
+
+		if (signal_pending(current)) {
+			r = -EINTR;
+			break;
+		}
+
+		folio = kvm_gmem_get_folio(inode, index);
+		if (IS_ERR(folio)) {
+			r = PTR_ERR(folio);
+			break;
+		}
+
+		index = folio_next_index(folio);
+
+		folio_unlock(folio);
+		folio_put(folio);
+
+		/* 64-bit only, wrapping the index should be impossible. */
+		if (WARN_ON_ONCE(!index))
+			break;
+
+		cond_resched();
+	}
+
+	filemap_invalidate_unlock_shared(mapping);
+
+	return r;
+}
+
+static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
+			       loff_t len)
+{
+	int ret;
+
+	if (!(mode & FALLOC_FL_KEEP_SIZE))
+		return -EOPNOTSUPP;
+
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+		return -EOPNOTSUPP;
+
+	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
+		return -EINVAL;
+
+	if (mode & FALLOC_FL_PUNCH_HOLE)
+		ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
+	else
+		ret = kvm_gmem_allocate(file_inode(file), offset, len);
+
+	if (!ret)
+		file_modified(file);
+	return ret;
+}
+
+static int kvm_gmem_release(struct inode *inode, struct file *file)
+{
+	struct gmem_file *f = file->private_data;
+	struct kvm_memory_slot *slot;
+	struct kvm *kvm = f->kvm;
+	unsigned long index;
+
+	/*
+	 * Prevent concurrent attempts to *unbind* a memslot.  This is the last
+	 * reference to the file and thus no new bindings can be created, but
+	 * dereferencing the slot for existing bindings needs to be protected
+	 * against memslot updates, specifically so that unbind doesn't race
+	 * and free the memslot (kvm_gmem_get_file() will return NULL).
+	 *
+	 * Since .release is called only when the reference count is zero,
+	 * after which file_ref_get() and get_file_active() fail,
+	 * kvm_gmem_get_pfn() cannot be using the file concurrently.
+	 * file_ref_put() provides a full barrier, and get_file_active() the
+	 * matching acquire barrier.
+	 */
+	mutex_lock(&kvm->slots_lock);
+
+	filemap_invalidate_lock(inode->i_mapping);
+
+	xa_for_each(&f->bindings, index, slot)
+		WRITE_ONCE(slot->gmem.file, NULL);
+
+	/*
+	 * All in-flight operations are gone and new bindings can be created.
+	 * Zap all SPTEs pointed at by this file.  Do not free the backing
+	 * memory, as its lifetime is associated with the inode, not the file.
+	 */
+	__kvm_gmem_invalidate_begin(f, 0, -1ul,
+				    kvm_gmem_get_invalidate_filter(inode));
+	__kvm_gmem_invalidate_end(f, 0, -1ul);
+
+	list_del(&f->entry);
+
+	filemap_invalidate_unlock(inode->i_mapping);
+
+	mutex_unlock(&kvm->slots_lock);
+
+	xa_destroy(&f->bindings);
+	kfree(f);
+
+	kvm_put_kvm(kvm);
+
+	return 0;
+}
+
+static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
+{
+	/*
+	 * Do not return slot->gmem.file if it has already been closed;
+	 * there might be some time between the last fput() and when
+	 * kvm_gmem_release() clears slot->gmem.file.
+	 */
+	return get_file_active(&slot->gmem.file);
+}
+
+DEFINE_CLASS(gmem_get_file, struct file *, if (_T) fput(_T),
+	     kvm_gmem_get_file(slot), struct kvm_memory_slot *slot);
+
+static bool kvm_gmem_supports_mmap(struct inode *inode)
+{
+	return GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_MMAP;
+}
+
+static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
+{
+	struct inode *inode = file_inode(vmf->vma->vm_file);
+	struct folio *folio;
+	vm_fault_t ret = VM_FAULT_LOCKED;
+
+	if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
+		return VM_FAULT_SIGBUS;
+
+	if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED))
+		return VM_FAULT_SIGBUS;
+
+	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
+	if (IS_ERR(folio)) {
+		if (PTR_ERR(folio) == -EAGAIN)
+			return VM_FAULT_RETRY;
+
+		return vmf_error(PTR_ERR(folio));
+	}
+
+	if (WARN_ON_ONCE(folio_test_large(folio))) {
+		ret = VM_FAULT_SIGBUS;
+		goto out_folio;
+	}
+
+	if (!folio_test_uptodate(folio)) {
+		clear_highpage(folio_page(folio, 0));
+		kvm_gmem_mark_prepared(folio);
+	}
+
+	vmf->page = folio_file_page(folio, vmf->pgoff);
+
+out_folio:
+	if (ret != VM_FAULT_LOCKED) {
+		folio_unlock(folio);
+		folio_put(folio);
+	}
+
+	return ret;
+}
+
+#ifdef CONFIG_NUMA
+static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
+{
+	struct inode *inode = file_inode(vma->vm_file);
+
+	return mpol_set_shared_policy(&GMEM_I(inode)->policy, vma, mpol);
+}
+
+static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
+					     unsigned long addr, pgoff_t *pgoff)
+{
+	struct inode *inode = file_inode(vma->vm_file);
+
+	*pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
+
+	/*
+	 * Return the memory policy for this index, or NULL if none is set.
+	 *
+	 * Returning NULL, e.g. instead of the current task's memory policy, is
+	 * important for the .get_policy kernel ABI: it indicates that no
+	 * explicit policy has been set via mbind() for this memory. The caller
+	 * can then replace NULL with the default memory policy instead of the
+	 * current task's memory policy.
+	 */
+	return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, *pgoff);
+}
+#endif /* CONFIG_NUMA */
+
+static const struct vm_operations_struct kvm_gmem_vm_ops = {
+	.fault		= kvm_gmem_fault_user_mapping,
+#ifdef CONFIG_NUMA
+	.get_policy	= kvm_gmem_get_policy,
+	.set_policy	= kvm_gmem_set_policy,
+#endif
+};
+
+static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	if (!kvm_gmem_supports_mmap(file_inode(file)))
+		return -ENODEV;
+
+	if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
+	    (VM_SHARED | VM_MAYSHARE)) {
+		return -EINVAL;
+	}
+
+	vma->vm_ops = &kvm_gmem_vm_ops;
+
+	return 0;
+}
+
+static struct file_operations kvm_gmem_fops = {
+	.mmap		= kvm_gmem_mmap,
+	.open		= generic_file_open,
+	.release	= kvm_gmem_release,
+	.fallocate	= kvm_gmem_fallocate,
+};
+
+static int kvm_gmem_migrate_folio(struct address_space *mapping,
+				  struct folio *dst, struct folio *src,
+				  enum migrate_mode mode)
+{
+	WARN_ON_ONCE(1);
+	return -EINVAL;
+}
+
+static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
+{
+	pgoff_t start, end;
+
+	filemap_invalidate_lock_shared(mapping);
+
+	start = folio->index;
+	end = start + folio_nr_pages(folio);
+
+	kvm_gmem_invalidate_begin(mapping->host, start, end);
+
+	/*
+	 * Do not truncate the range, what action is taken in response to the
+	 * error is userspace's decision (assuming the architecture supports
+	 * gracefully handling memory errors).  If/when the guest attempts to
+	 * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
+	 * at which point KVM can either terminate the VM or propagate the
+	 * error to userspace.
+	 */
+
+	kvm_gmem_invalidate_end(mapping->host, start, end);
+
+	filemap_invalidate_unlock_shared(mapping);
+
+	return MF_DELAYED;
+}
+
+#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
+static void kvm_gmem_free_folio(struct folio *folio)
+{
+	struct page *page = folio_page(folio, 0);
+	kvm_pfn_t pfn = page_to_pfn(page);
+	int order = folio_order(folio);
+
+	kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
+}
+#endif
+
+static const struct address_space_operations kvm_gmem_aops = {
+	.dirty_folio = noop_dirty_folio,
+	.migrate_folio	= kvm_gmem_migrate_folio,
+	.error_remove_folio = kvm_gmem_error_folio,
+#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
+	.free_folio = kvm_gmem_free_folio,
+#endif
+};
+
+static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+			    struct iattr *attr)
+{
+	return -EINVAL;
+}
+static const struct inode_operations kvm_gmem_iops = {
+	.setattr	= kvm_gmem_setattr,
+};
+
+bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
+{
+	return true;
+}
+
+static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
+{
+	static const char *name = "[kvm-gmem]";
+	struct gmem_file *f;
+	struct inode *inode;
+	struct file *file;
+	int fd, err;
+
+	fd = get_unused_fd_flags(0);
+	if (fd < 0)
+		return fd;
+
+	f = kzalloc(sizeof(*f), GFP_KERNEL);
+	if (!f) {
+		err = -ENOMEM;
+		goto err_fd;
+	}
+
+	/* __fput() will take care of fops_put(). */
+	if (!fops_get(&kvm_gmem_fops)) {
+		err = -ENOENT;
+		goto err_gmem;
+	}
+
+	inode = anon_inode_make_secure_inode(kvm_gmem_mnt->mnt_sb, name, NULL);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto err_fops;
+	}
+
+	inode->i_op = &kvm_gmem_iops;
+	inode->i_mapping->a_ops = &kvm_gmem_aops;
+	inode->i_mode |= S_IFREG;
+	inode->i_size = size;
+	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
+	mapping_set_inaccessible(inode->i_mapping);
+	/* Unmovable mappings are supposed to be marked unevictable as well. */
+	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
+
+	GMEM_I(inode)->flags = flags;
+
+	file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops);
+	if (IS_ERR(file)) {
+		err = PTR_ERR(file);
+		goto err_inode;
+	}
+
+	file->f_flags |= O_LARGEFILE;
+	file->private_data = f;
+
+	kvm_get_kvm(kvm);
+	f->kvm = kvm;
+	xa_init(&f->bindings);
+	list_add(&f->entry, &inode->i_mapping->i_private_list);
+
+	fd_install(fd, file);
+	return fd;
+
+err_inode:
+	iput(inode);
+err_fops:
+	fops_put(&kvm_gmem_fops);
+err_gmem:
+	kfree(f);
+err_fd:
+	put_unused_fd(fd);
+	return err;
+}
+
+int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
+{
+	loff_t size = args->size;
+	u64 flags = args->flags;
+
+	if (flags & ~kvm_gmem_get_supported_flags(kvm))
+		return -EINVAL;
+
+	if (size <= 0 || !PAGE_ALIGNED(size))
+		return -EINVAL;
+
+	return __kvm_gmem_create(kvm, size, flags);
+}
+
+int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
+		  unsigned int fd, loff_t offset)
+{
+	loff_t size = slot->npages << PAGE_SHIFT;
+	unsigned long start, end;
+	struct gmem_file *f;
+	struct inode *inode;
+	struct file *file;
+	int r = -EINVAL;
+
+	BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));
+
+	file = fget(fd);
+	if (!file)
+		return -EBADF;
+
+	if (file->f_op != &kvm_gmem_fops)
+		goto err;
+
+	f = file->private_data;
+	if (f->kvm != kvm)
+		goto err;
+
+	inode = file_inode(file);
+
+	if (offset < 0 || !PAGE_ALIGNED(offset) ||
+	    offset + size > i_size_read(inode))
+		goto err;
+
+	filemap_invalidate_lock(inode->i_mapping);
+
+	start = offset >> PAGE_SHIFT;
+	end = start + slot->npages;
+
+	if (!xa_empty(&f->bindings) &&
+	    xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) {
+		filemap_invalidate_unlock(inode->i_mapping);
+		goto err;
+	}
+
+	/*
+	 * memslots of flag KVM_MEM_GUEST_MEMFD are immutable to change, so
+	 * kvm_gmem_bind() must occur on a new memslot.  Because the memslot
+	 * is not visible yet, kvm_gmem_get_pfn() is guaranteed to see the file.
+	 */
+	WRITE_ONCE(slot->gmem.file, file);
+	slot->gmem.pgoff = start;
+	if (kvm_gmem_supports_mmap(inode))
+		slot->flags |= KVM_MEMSLOT_GMEM_ONLY;
+
+	xa_store_range(&f->bindings, start, end - 1, slot, GFP_KERNEL);
+	filemap_invalidate_unlock(inode->i_mapping);
+
+	/*
+	 * Drop the reference to the file, even on success.  The file pins KVM,
+	 * not the other way 'round.  Active bindings are invalidated if the
+	 * file is closed before memslots are destroyed.
+	 */
+	r = 0;
+err:
+	fput(file);
+	return r;
+}
+
+static void __kvm_gmem_unbind(struct kvm_memory_slot *slot, struct gmem_file *f)
+{
+	unsigned long start = slot->gmem.pgoff;
+	unsigned long end = start + slot->npages;
+
+	xa_store_range(&f->bindings, start, end - 1, NULL, GFP_KERNEL);
+
+	/*
+	 * synchronize_srcu(&kvm->srcu) ensured that kvm_gmem_get_pfn()
+	 * cannot see this memslot.
+	 */
+	WRITE_ONCE(slot->gmem.file, NULL);
+}
+
+void kvm_gmem_unbind(struct kvm_memory_slot *slot)
+{
+	/*
+	 * Nothing to do if the underlying file was _already_ closed, as
+	 * kvm_gmem_release() invalidates and nullifies all bindings.
+	 */
+	if (!slot->gmem.file)
+		return;
+
+	CLASS(gmem_get_file, file)(slot);
+
+	/*
+	 * However, if the file is _being_ closed, then the bindings need to be
+	 * removed as kvm_gmem_release() might not run until after the memslot
+	 * is freed.  Note, modifying the bindings is safe even though the file
+	 * is dying as kvm_gmem_release() nullifies slot->gmem.file under
+	 * slots_lock, and only puts its reference to KVM after destroying all
+	 * bindings.  I.e. reaching this point means kvm_gmem_release() hasn't
+	 * yet destroyed the bindings or freed the gmem_file, and can't do so
+	 * until the caller drops slots_lock.
+	 */
+	if (!file) {
+		__kvm_gmem_unbind(slot, slot->gmem.file->private_data);
+		return;
+	}
+
+	filemap_invalidate_lock(file->f_mapping);
+	__kvm_gmem_unbind(slot, file->private_data);
+	filemap_invalidate_unlock(file->f_mapping);
+}
+
+/* Returns a locked folio on success.  */
+static struct folio *__kvm_gmem_get_pfn(struct file *file,
+					struct kvm_memory_slot *slot,
+					pgoff_t index, kvm_pfn_t *pfn,
+					bool *is_prepared, int *max_order)
+{
+	struct file *slot_file = READ_ONCE(slot->gmem.file);
+	struct gmem_file *f = file->private_data;
+	struct folio *folio;
+
+	if (file != slot_file) {
+		WARN_ON_ONCE(slot_file);
+		return ERR_PTR(-EFAULT);
+	}
+
+	if (xa_load(&f->bindings, index) != slot) {
+		WARN_ON_ONCE(xa_load(&f->bindings, index));
+		return ERR_PTR(-EIO);
+	}
+
+	folio = kvm_gmem_get_folio(file_inode(file), index);
+	if (IS_ERR(folio))
+		return folio;
+
+	if (folio_test_hwpoison(folio)) {
+		folio_unlock(folio);
+		folio_put(folio);
+		return ERR_PTR(-EHWPOISON);
+	}
+
+	*pfn = folio_file_pfn(folio, index);
+	if (max_order)
+		*max_order = 0;
+
+	*is_prepared = folio_test_uptodate(folio);
+	return folio;
+}
+
+int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
+		     gfn_t gfn, kvm_pfn_t *pfn, struct page **page,
+		     int *max_order)
+{
+	pgoff_t index = kvm_gmem_get_index(slot, gfn);
+	struct folio *folio;
+	bool is_prepared = false;
+	int r = 0;
+
+	CLASS(gmem_get_file, file)(slot);
+	if (!file)
+		return -EFAULT;
+
+	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
+	if (IS_ERR(folio))
+		return PTR_ERR(folio);
+
+	if (!is_prepared)
+		r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
+
+	folio_unlock(folio);
+
+	if (!r)
+		*page = folio_file_page(folio, index);
+	else
+		folio_put(folio);
+
+	return r;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
+
+#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE
+long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
+		       kvm_gmem_populate_cb post_populate, void *opaque)
+{
+	struct kvm_memory_slot *slot;
+	void __user *p;
+
+	int ret = 0, max_order;
+	long i;
+
+	lockdep_assert_held(&kvm->slots_lock);
+
+	if (WARN_ON_ONCE(npages <= 0))
+		return -EINVAL;
+
+	slot = gfn_to_memslot(kvm, start_gfn);
+	if (!kvm_slot_has_gmem(slot))
+		return -EINVAL;
+
+	CLASS(gmem_get_file, file)(slot);
+	if (!file)
+		return -EFAULT;
+
+	filemap_invalidate_lock(file->f_mapping);
+
+	npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages);
+	for (i = 0; i < npages; i += (1 << max_order)) {
+		struct folio *folio;
+		gfn_t gfn = start_gfn + i;
+		pgoff_t index = kvm_gmem_get_index(slot, gfn);
+		bool is_prepared = false;
+		kvm_pfn_t pfn;
+
+		if (signal_pending(current)) {
+			ret = -EINTR;
+			break;
+		}
+
+		folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, &max_order);
+		if (IS_ERR(folio)) {
+			ret = PTR_ERR(folio);
+			break;
+		}
+
+		if (is_prepared) {
+			folio_unlock(folio);
+			folio_put(folio);
+			ret = -EEXIST;
+			break;
+		}
+
+		folio_unlock(folio);
+		WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) ||
+			(npages - i) < (1 << max_order));
+
+		ret = -EINVAL;
+		while (!kvm_range_has_memory_attributes(kvm, gfn, gfn + (1 << max_order),
+							KVM_MEMORY_ATTRIBUTE_PRIVATE,
+							KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
+			if (!max_order)
+				goto put_folio_and_exit;
+			max_order--;
+		}
+
+		p = src ? src + i * PAGE_SIZE : NULL;
+		ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
+		if (!ret)
+			kvm_gmem_mark_prepared(folio);
+
+put_folio_and_exit:
+		folio_put(folio);
+		if (ret)
+			break;
+	}
+
+	filemap_invalidate_unlock(file->f_mapping);
+
+	return ret && !i ? ret : i;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate);
+#endif
+
+static struct kmem_cache *kvm_gmem_inode_cachep;
+
+static void kvm_gmem_init_inode_once(void *__gi)
+{
+	struct gmem_inode *gi = __gi;
+
+	/*
+	 * Note!  Don't initialize the inode with anything specific to the
+	 * guest_memfd instance, or that might be specific to how the inode is
+	 * used (from the VFS-layer's perspective).  This hook is called only
+	 * during the initial slab allocation, i.e. only fields/state that are
+	 * idempotent across _all_ use of the inode _object_ can be initialized
+	 * at this time!
+	 */
+	inode_init_once(&gi->vfs_inode);
+}
+
+static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
+{
+	struct gmem_inode *gi;
+
+	gi = alloc_inode_sb(sb, kvm_gmem_inode_cachep, GFP_KERNEL);
+	if (!gi)
+		return NULL;
+
+	mpol_shared_policy_init(&gi->policy, NULL);
+
+	gi->flags = 0;
+	return &gi->vfs_inode;
+}
+
+static void kvm_gmem_destroy_inode(struct inode *inode)
+{
+	mpol_free_shared_policy(&GMEM_I(inode)->policy);
+}
+
+static void kvm_gmem_free_inode(struct inode *inode)
+{
+	kmem_cache_free(kvm_gmem_inode_cachep, GMEM_I(inode));
+}
+
+static const struct super_operations kvm_gmem_super_operations = {
+	.statfs		= simple_statfs,
+	.alloc_inode	= kvm_gmem_alloc_inode,
+	.destroy_inode	= kvm_gmem_destroy_inode,
+	.free_inode	= kvm_gmem_free_inode,
+};
+
+static int kvm_gmem_init_fs_context(struct fs_context *fc)
+{
+	struct pseudo_fs_context *ctx;
+
+	if (!init_pseudo(fc, GUEST_MEMFD_MAGIC))
+		return -ENOMEM;
+
+	fc->s_iflags |= SB_I_NOEXEC;
+	fc->s_iflags |= SB_I_NODEV;
+	ctx = fc->fs_private;
+	ctx->ops = &kvm_gmem_super_operations;
+
+	return 0;
+}
+
+static struct file_system_type kvm_gmem_fs = {
+	.name		 = "guest_memfd",
+	.init_fs_context = kvm_gmem_init_fs_context,
+	.kill_sb	 = kill_anon_super,
+};
+
+static int kvm_gmem_init_mount(void)
+{
+	kvm_gmem_mnt = kern_mount(&kvm_gmem_fs);
+
+	if (IS_ERR(kvm_gmem_mnt))
+		return PTR_ERR(kvm_gmem_mnt);
+
+	kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC;
+	return 0;
+}
+
+int kvm_gmem_init(struct module *module)
+{
+	struct kmem_cache_args args = {
+		.align = 0,
+		.ctor = kvm_gmem_init_inode_once,
+	};
+	int ret;
+
+	kvm_gmem_fops.owner = module;
+	kvm_gmem_inode_cachep = kmem_cache_create("kvm_gmem_inode_cache",
+						  sizeof(struct gmem_inode),
+						  &args, SLAB_ACCOUNT);
+	if (!kvm_gmem_inode_cachep)
+		return -ENOMEM;
+
+	ret = kvm_gmem_init_mount();
+	if (ret) {
+		kmem_cache_destroy(kvm_gmem_inode_cachep);
+		return ret;
+	}
+	return 0;
+}
+
+void kvm_gmem_exit(void)
+{
+	kern_unmount(kvm_gmem_mnt);
+	kvm_gmem_mnt = NULL;
+	rcu_barrier();
+	kmem_cache_destroy(kvm_gmem_inode_cachep);
+}
diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
index b1286c4e0712..6ccabfd32287 100644
--- a/virt/kvm/irqchip.c
+++ b/virt/kvm/irqchip.c
@@ -1,22 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * irqchip.c: Common API for in kernel interrupt controllers
  * Copyright (c) 2007, Intel Corporation.
  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  * Copyright (c) 2013, Alexander Graf <agraf@suse.de>
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- *
  * This file is derived from virt/kvm/irq_comm.c.
  *
  * Authors:
@@ -29,7 +17,6 @@
 #include <linux/srcu.h>
 #include <linux/export.h>
 #include <trace/events/kvm.h>
-#include "irq.h"
 
 int kvm_irq_map_gsi(struct kvm *kvm,
 		    struct kvm_kernel_irq_routing_entry *entries, int gsi)
@@ -62,7 +49,7 @@ int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi)
 {
 	struct kvm_kernel_irq_routing_entry route;
 
-	if (!irqchip_in_kernel(kvm) || (msi->flags & ~KVM_MSI_VALID_DEVID))
+	if (!kvm_arch_irqchip_in_kernel(kvm) || (msi->flags & ~KVM_MSI_VALID_DEVID))
 		return -EINVAL;
 
 	route.msi.address_lo = msi->address_lo;
@@ -144,18 +131,19 @@ static int setup_routing_entry(struct kvm *kvm,
 {
 	struct kvm_kernel_irq_routing_entry *ei;
 	int r;
+	u32 gsi = array_index_nospec(ue->gsi, KVM_MAX_IRQ_ROUTES);
 
 	/*
 	 * Do not allow GSI to be mapped to the same irqchip more than once.
 	 * Allow only one to one mapping between GSI and non-irqchip routing.
 	 */
-	hlist_for_each_entry(ei, &rt->map[ue->gsi], link)
+	hlist_for_each_entry(ei, &rt->map[gsi], link)
 		if (ei->type != KVM_IRQ_ROUTING_IRQCHIP ||
 		    ue->type != KVM_IRQ_ROUTING_IRQCHIP ||
 		    ue->u.irqchip.irqchip == ei->irqchip.irqchip)
 			return -EINVAL;
 
-	e->gsi = ue->gsi;
+	e->gsi = gsi;
 	e->type = ue->type;
 	r = kvm_set_routing_entry(kvm, e, ue);
 	if (r)
@@ -195,9 +183,7 @@ int kvm_set_irq_routing(struct kvm *kvm,
 
 	nr_rt_entries += 1;
 
-	new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head)),
-		      GFP_KERNEL);
-
+	new = kzalloc(struct_size(new, map, nr_rt_entries), GFP_KERNEL_ACCOUNT);
 	if (!new)
 		return -ENOMEM;
 
@@ -208,7 +194,7 @@ int kvm_set_irq_routing(struct kvm *kvm,
 
 	for (i = 0; i < nr; ++i) {
 		r = -ENOMEM;
-		e = kzalloc(sizeof(*e), GFP_KERNEL);
+		e = kzalloc(sizeof(*e), GFP_KERNEL_ACCOUNT);
 		if (!e)
 			goto out;
 
@@ -236,8 +222,6 @@ int kvm_set_irq_routing(struct kvm *kvm,
 	kvm_arch_irq_routing_update(kvm);
 	mutex_unlock(&kvm->irq_lock);
 
-	kvm_arch_post_irq_routing_update(kvm);
-
 	synchronize_srcu_expedited(&kvm->irq_srcu);
 
 	new = old;
@@ -251,3 +235,27 @@ out:
 
 	return r;
 }
+
+/*
+ * Allocate empty IRQ routing by default so that additional setup isn't needed
+ * when userspace-driven IRQ routing is activated, and so that kvm->irq_routing
+ * is guaranteed to be non-NULL.
+ */
+int kvm_init_irq_routing(struct kvm *kvm)
+{
+	struct kvm_irq_routing_table *new;
+	int chip_size;
+
+	new = kzalloc(struct_size(new, map, 1), GFP_KERNEL_ACCOUNT);
+	if (!new)
+		return -ENOMEM;
+
+	new->nr_rt_entries = 1;
+
+	chip_size = sizeof(int) * KVM_NR_IRQCHIPS * KVM_IRQCHIP_NUM_PINS;
+	memset(new->chip, -1, chip_size);
+
+	RCU_INIT_POINTER(kvm->irq_routing, new);
+
+	return 0;
+}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5ecea812cb6a..5fcd401a5897 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1,8 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
+ * Kernel-based Virtual Machine (KVM) Hypervisor
  *
  * Copyright (C) 2006 Qumranet, Inc.
  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
@@ -10,10 +8,6 @@
  * Authors:
  *   Avi Kivity   <avi@qumranet.com>
  *   Yaniv Kamay  <yaniv@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- *
  */
 
 #include <kvm/iodev.h>
@@ -51,40 +45,62 @@
 #include <linux/slab.h>
 #include <linux/sort.h>
 #include <linux/bsearch.h>
+#include <linux/io.h>
+#include <linux/lockdep.h>
+#include <linux/kthread.h>
+#include <linux/suspend.h>
+#include <linux/rseq.h>
 
 #include <asm/processor.h>
-#include <asm/io.h>
 #include <asm/ioctl.h>
 #include <linux/uaccess.h>
-#include <asm/pgtable.h>
 
 #include "coalesced_mmio.h"
 #include "async_pf.h"
+#include "kvm_mm.h"
 #include "vfio.h"
 
+#include <trace/events/ipi.h>
+
 #define CREATE_TRACE_POINTS
 #include <trace/events/kvm.h>
 
+#include <linux/kvm_dirty_ring.h>
+
+
 /* Worst case buffer size needed for holding an integer. */
 #define ITOA_MAX_LEN 12
 
 MODULE_AUTHOR("Qumranet");
+MODULE_DESCRIPTION("Kernel-based Virtual Machine (KVM) Hypervisor");
 MODULE_LICENSE("GPL");
 
 /* Architectures should define their poll value according to the halt latency */
 unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
 module_param(halt_poll_ns, uint, 0644);
-EXPORT_SYMBOL_GPL(halt_poll_ns);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(halt_poll_ns);
 
 /* Default doubles per-vcpu halt_poll_ns. */
 unsigned int halt_poll_ns_grow = 2;
 module_param(halt_poll_ns_grow, uint, 0644);
-EXPORT_SYMBOL_GPL(halt_poll_ns_grow);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(halt_poll_ns_grow);
+
+/* The start value to grow halt_poll_ns from */
+unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
+module_param(halt_poll_ns_grow_start, uint, 0644);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(halt_poll_ns_grow_start);
 
-/* Default resets per-vcpu halt_poll_ns . */
-unsigned int halt_poll_ns_shrink;
+/* Default halves per-vcpu halt_poll_ns. */
+unsigned int halt_poll_ns_shrink = 2;
 module_param(halt_poll_ns_shrink, uint, 0644);
-EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(halt_poll_ns_shrink);
+
+/*
+ * Allow direct access (from KVM or the CPU) without MMU notifier protection
+ * to unpinned pages.
+ */
+static bool allow_unsafe_mappings;
+module_param(allow_unsafe_mappings, bool, 0444);
 
 /*
  * Ordering of locks:
@@ -92,24 +108,17 @@ EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
  *	kvm->lock --> kvm->slots_lock --> kvm->irq_lock
  */
 
-DEFINE_SPINLOCK(kvm_lock);
-static DEFINE_RAW_SPINLOCK(kvm_count_lock);
+DEFINE_MUTEX(kvm_lock);
 LIST_HEAD(vm_list);
 
-static cpumask_var_t cpus_hardware_enabled;
-static int kvm_usage_count;
-static atomic_t hardware_enable_failed;
-
-struct kmem_cache *kvm_vcpu_cache;
-EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
+static struct kmem_cache *kvm_vcpu_cache;
 
 static __read_mostly struct preempt_ops kvm_preempt_ops;
+static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);
 
-struct dentry *kvm_debugfs_dir;
-EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
+static struct dentry *kvm_debugfs_dir;
 
-static int kvm_debugfs_num_entries;
-static const struct file_operations *stat_fops_per_vm[];
+static const struct file_operations stat_fops_per_vm;
 
 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
 			   unsigned long arg);
@@ -118,40 +127,36 @@ static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
 				  unsigned long arg);
 #define KVM_COMPAT(c)	.compat_ioctl	= (c)
 #else
+/*
+ * For architectures that don't implement a compat infrastructure,
+ * adopt a double line of defense:
+ * - Prevent a compat task from opening /dev/kvm
+ * - If the open has been done by a 64bit task, and the KVM fd
+ *   passed to a compat task, let the ioctls fail.
+ */
 static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
 				unsigned long arg) { return -EINVAL; }
-#define KVM_COMPAT(c)	.compat_ioctl	= kvm_no_compat_ioctl
+
+static int kvm_no_compat_open(struct inode *inode, struct file *file)
+{
+	return is_compat_task() ? -ENODEV : 0;
+}
+#define KVM_COMPAT(c)	.compat_ioctl	= kvm_no_compat_ioctl,	\
+			.open		= kvm_no_compat_open
 #endif
-static int hardware_enable_all(void);
-static void hardware_disable_all(void);
 
 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
 
-static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn);
-
-__visible bool kvm_rebooting;
-EXPORT_SYMBOL_GPL(kvm_rebooting);
-
-static bool largepages_enabled = true;
-
 #define KVM_EVENT_CREATE_VM 0
 #define KVM_EVENT_DESTROY_VM 1
 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
 static unsigned long long kvm_createvm_count;
 static unsigned long long kvm_active_vms;
 
-__weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
-		unsigned long start, unsigned long end, bool blockable)
-{
-	return 0;
-}
+static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);
 
-bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
+__weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
 {
-	if (pfn_valid(pfn))
-		return PageReserved(pfn_to_page(pfn));
-
-	return true;
 }
 
 /*
@@ -160,20 +165,23 @@ bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
 void vcpu_load(struct kvm_vcpu *vcpu)
 {
 	int cpu = get_cpu();
+
+	__this_cpu_write(kvm_running_vcpu, vcpu);
 	preempt_notifier_register(&vcpu->preempt_notifier);
 	kvm_arch_vcpu_load(vcpu, cpu);
 	put_cpu();
 }
-EXPORT_SYMBOL_GPL(vcpu_load);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(vcpu_load);
 
 void vcpu_put(struct kvm_vcpu *vcpu)
 {
 	preempt_disable();
 	kvm_arch_vcpu_put(vcpu);
 	preempt_notifier_unregister(&vcpu->preempt_notifier);
+	__this_cpu_write(kvm_running_vcpu, NULL);
 	preempt_enable();
 }
-EXPORT_SYMBOL_GPL(vcpu_put);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(vcpu_put);
 
 /* TODO: merge with kvm_arch_vcpu_should_kick */
 static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
@@ -193,47 +201,68 @@ static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
 	return mode == IN_GUEST_MODE;
 }
 
-static void ack_flush(void *_completed)
+static void ack_kick(void *_completed)
 {
 }
 
-static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait)
+static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait)
 {
-	if (unlikely(!cpus))
-		cpus = cpu_online_mask;
-
 	if (cpumask_empty(cpus))
 		return false;
 
-	smp_call_function_many(cpus, ack_flush, NULL, wait);
+	smp_call_function_many(cpus, ack_kick, NULL, wait);
 	return true;
 }
 
+static void kvm_make_vcpu_request(struct kvm_vcpu *vcpu, unsigned int req,
+				  struct cpumask *tmp, int current_cpu)
+{
+	int cpu;
+
+	if (likely(!(req & KVM_REQUEST_NO_ACTION)))
+		__kvm_make_request(req, vcpu);
+
+	if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
+		return;
+
+	/*
+	 * Note, the vCPU could get migrated to a different pCPU at any point
+	 * after kvm_request_needs_ipi(), which could result in sending an IPI
+	 * to the previous pCPU.  But, that's OK because the purpose of the IPI
+	 * is to ensure the vCPU returns to OUTSIDE_GUEST_MODE, which is
+	 * satisfied if the vCPU migrates. Entering READING_SHADOW_PAGE_TABLES
+	 * after this point is also OK, as the requirement is only that KVM wait
+	 * for vCPUs that were reading SPTEs _before_ any changes were
+	 * finalized. See kvm_vcpu_kick() for more details on handling requests.
+	 */
+	if (kvm_request_needs_ipi(vcpu, req)) {
+		cpu = READ_ONCE(vcpu->cpu);
+		if (cpu != -1 && cpu != current_cpu)
+			__cpumask_set_cpu(cpu, tmp);
+	}
+}
+
 bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
-				 unsigned long *vcpu_bitmap, cpumask_var_t tmp)
+				 unsigned long *vcpu_bitmap)
 {
-	int i, cpu, me;
 	struct kvm_vcpu *vcpu;
+	struct cpumask *cpus;
+	int i, me;
 	bool called;
 
 	me = get_cpu();
 
-	kvm_for_each_vcpu(i, vcpu, kvm) {
-		if (vcpu_bitmap && !test_bit(i, vcpu_bitmap))
-			continue;
+	cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
+	cpumask_clear(cpus);
 
-		kvm_make_request(req, vcpu);
-		cpu = vcpu->cpu;
-
-		if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
+	for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) {
+		vcpu = kvm_get_vcpu(kvm, i);
+		if (!vcpu)
 			continue;
-
-		if (tmp != NULL && cpu != -1 && cpu != me &&
-		    kvm_request_needs_ipi(vcpu, req))
-			__cpumask_set_cpu(cpu, tmp);
+		kvm_make_vcpu_request(vcpu, req, cpus, me);
 	}
 
-	called = kvm_kick_many_cpus(tmp, !!(req & KVM_REQUEST_WAIT));
+	called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
 	put_cpu();
 
 	return called;
@@ -241,25 +270,30 @@ bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
 
 bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
 {
-	cpumask_var_t cpus;
+	struct kvm_vcpu *vcpu;
+	struct cpumask *cpus;
+	unsigned long i;
 	bool called;
+	int me;
+
+	me = get_cpu();
 
-	zalloc_cpumask_var(&cpus, GFP_ATOMIC);
+	cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
+	cpumask_clear(cpus);
 
-	called = kvm_make_vcpus_request_mask(kvm, req, NULL, cpus);
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		kvm_make_vcpu_request(vcpu, req, cpus, me);
+
+	called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
+	put_cpu();
 
-	free_cpumask_var(cpus);
 	return called;
 }
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_make_all_cpus_request);
 
-#ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
 void kvm_flush_remote_tlbs(struct kvm *kvm)
 {
-	/*
-	 * Read tlbs_dirty before setting KVM_REQ_TLB_FLUSH in
-	 * kvm_make_all_cpus_request.
-	 */
-	long dirty_count = smp_load_acquire(&kvm->tlbs_dirty);
+	++kvm->stat.generic.remote_tlb_flush_requests;
 
 	/*
 	 * We want to publish modifications to the page tables before reading
@@ -272,170 +306,540 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
 	 * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
 	 * barrier here.
 	 */
-	if (!kvm_arch_flush_remote_tlb(kvm)
+	if (!kvm_arch_flush_remote_tlbs(kvm)
 	    || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
-		++kvm->stat.remote_tlb_flush;
-	cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
+		++kvm->stat.generic.remote_tlb_flush;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_flush_remote_tlbs);
+
+void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
+{
+	if (!kvm_arch_flush_remote_tlbs_range(kvm, gfn, nr_pages))
+		return;
+
+	/*
+	 * Fall back to a flushing entire TLBs if the architecture range-based
+	 * TLB invalidation is unsupported or can't be performed for whatever
+	 * reason.
+	 */
+	kvm_flush_remote_tlbs(kvm);
+}
+
+void kvm_flush_remote_tlbs_memslot(struct kvm *kvm,
+				   const struct kvm_memory_slot *memslot)
+{
+	/*
+	 * All current use cases for flushing the TLBs for a specific memslot
+	 * are related to dirty logging, and many do the TLB flush out of
+	 * mmu_lock. The interaction between the various operations on memslot
+	 * must be serialized by slots_lock to ensure the TLB flush from one
+	 * operation is observed by any other operation on the same memslot.
+	 */
+	lockdep_assert_held(&kvm->slots_lock);
+	kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages);
 }
-EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
-#endif
 
-void kvm_reload_remote_mmus(struct kvm *kvm)
+static void kvm_flush_shadow_all(struct kvm *kvm)
 {
-	kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
+	kvm_arch_flush_shadow_all(kvm);
+	kvm_arch_guest_memory_reclaimed(kvm);
 }
 
-int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
+#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
+static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
+					       gfp_t gfp_flags)
 {
-	struct page *page;
-	int r;
+	void *page;
+
+	gfp_flags |= mc->gfp_zero;
+
+	if (mc->kmem_cache)
+		return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
+
+	page = (void *)__get_free_page(gfp_flags);
+	if (page && mc->init_value)
+		memset64(page, mc->init_value, PAGE_SIZE / sizeof(u64));
+	return page;
+}
+
+int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
+{
+	gfp_t gfp = mc->gfp_custom ? mc->gfp_custom : GFP_KERNEL_ACCOUNT;
+	void *obj;
+
+	if (mc->nobjs >= min)
+		return 0;
+
+	if (unlikely(!mc->objects)) {
+		if (WARN_ON_ONCE(!capacity))
+			return -EIO;
+
+		/*
+		 * Custom init values can be used only for page allocations,
+		 * and obviously conflict with __GFP_ZERO.
+		 */
+		if (WARN_ON_ONCE(mc->init_value && (mc->kmem_cache || mc->gfp_zero)))
+			return -EIO;
+
+		mc->objects = kvmalloc_array(capacity, sizeof(void *), gfp);
+		if (!mc->objects)
+			return -ENOMEM;
 
+		mc->capacity = capacity;
+	}
+
+	/* It is illegal to request a different capacity across topups. */
+	if (WARN_ON_ONCE(mc->capacity != capacity))
+		return -EIO;
+
+	while (mc->nobjs < mc->capacity) {
+		obj = mmu_memory_cache_alloc_obj(mc, gfp);
+		if (!obj)
+			return mc->nobjs >= min ? 0 : -ENOMEM;
+		mc->objects[mc->nobjs++] = obj;
+	}
+	return 0;
+}
+
+int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
+{
+	return __kvm_mmu_topup_memory_cache(mc, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE, min);
+}
+
+int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
+{
+	return mc->nobjs;
+}
+
+void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
+{
+	while (mc->nobjs) {
+		if (mc->kmem_cache)
+			kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
+		else
+			free_page((unsigned long)mc->objects[--mc->nobjs]);
+	}
+
+	kvfree(mc->objects);
+
+	mc->objects = NULL;
+	mc->capacity = 0;
+}
+
+void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
+{
+	void *p;
+
+	if (WARN_ON(!mc->nobjs))
+		p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
+	else
+		p = mc->objects[--mc->nobjs];
+	BUG_ON(!p);
+	return p;
+}
+#endif
+
+static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
+{
 	mutex_init(&vcpu->mutex);
 	vcpu->cpu = -1;
 	vcpu->kvm = kvm;
 	vcpu->vcpu_id = id;
 	vcpu->pid = NULL;
-	init_swait_queue_head(&vcpu->wq);
+	rwlock_init(&vcpu->pid_lock);
+#ifndef __KVM_HAVE_ARCH_WQP
+	rcuwait_init(&vcpu->wait);
+#endif
 	kvm_async_pf_vcpu_init(vcpu);
 
-	vcpu->pre_pcpu = -1;
-	INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
-
-	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-	if (!page) {
-		r = -ENOMEM;
-		goto fail;
-	}
-	vcpu->run = page_address(page);
-
 	kvm_vcpu_set_in_spin_loop(vcpu, false);
 	kvm_vcpu_set_dy_eligible(vcpu, false);
 	vcpu->preempted = false;
+	vcpu->ready = false;
+	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
+	vcpu->last_used_slot = NULL;
 
-	r = kvm_arch_vcpu_init(vcpu);
-	if (r < 0)
-		goto fail_free_run;
-	return 0;
-
-fail_free_run:
-	free_page((unsigned long)vcpu->run);
-fail:
-	return r;
+	/* Fill the stats id string for the vcpu */
+	snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
+		 task_pid_nr(current), id);
 }
-EXPORT_SYMBOL_GPL(kvm_vcpu_init);
 
-void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
+static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
+	kvm_arch_vcpu_destroy(vcpu);
+	kvm_dirty_ring_free(&vcpu->dirty_ring);
+
 	/*
-	 * no need for rcu_read_lock as VCPU_RUN is the only place that
-	 * will change the vcpu->pid pointer and on uninit all file
-	 * descriptors are already gone.
+	 * No need for rcu_read_lock as VCPU_RUN is the only place that changes
+	 * the vcpu->pid pointer, and at destruction time all file descriptors
+	 * are already gone.
 	 */
-	put_pid(rcu_dereference_protected(vcpu->pid, 1));
-	kvm_arch_vcpu_uninit(vcpu);
+	put_pid(vcpu->pid);
+
 	free_page((unsigned long)vcpu->run);
+	kmem_cache_free(kvm_vcpu_cache, vcpu);
+}
+
+void kvm_destroy_vcpus(struct kvm *kvm)
+{
+	unsigned long i;
+	struct kvm_vcpu *vcpu;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		kvm_vcpu_destroy(vcpu);
+		xa_erase(&kvm->vcpu_array, i);
+
+		/*
+		 * Assert that the vCPU isn't visible in any way, to ensure KVM
+		 * doesn't trigger a use-after-free if destroying vCPUs results
+		 * in VM-wide request, e.g. to flush remote TLBs when tearing
+		 * down MMUs, or to mark the VM dead if a KVM_BUG_ON() fires.
+		 */
+		WARN_ON_ONCE(xa_load(&kvm->vcpu_array, i) || kvm_get_vcpu(kvm, i));
+	}
+
+	atomic_set(&kvm->online_vcpus, 0);
 }
-EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_destroy_vcpus);
 
-#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
 {
 	return container_of(mn, struct kvm, mmu_notifier);
 }
 
-static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
-					struct mm_struct *mm,
-					unsigned long address,
-					pte_t pte)
+typedef bool (*gfn_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);
+
+typedef void (*on_lock_fn_t)(struct kvm *kvm);
+
+struct kvm_mmu_notifier_range {
+	/*
+	 * 64-bit addresses, as KVM notifiers can operate on host virtual
+	 * addresses (unsigned long) and guest physical addresses (64-bit).
+	 */
+	u64 start;
+	u64 end;
+	union kvm_mmu_notifier_arg arg;
+	gfn_handler_t handler;
+	on_lock_fn_t on_lock;
+	bool flush_on_ret;
+	bool may_block;
+	bool lockless;
+};
+
+/*
+ * The inner-most helper returns a tuple containing the return value from the
+ * arch- and action-specific handler, plus a flag indicating whether or not at
+ * least one memslot was found, i.e. if the handler found guest memory.
+ *
+ * Note, most notifiers are averse to booleans, so even though KVM tracks the
+ * return from arch code as a bool, outer helpers will cast it to an int. :-(
+ */
+typedef struct kvm_mmu_notifier_return {
+	bool ret;
+	bool found_memslot;
+} kvm_mn_ret_t;
+
+/*
+ * Use a dedicated stub instead of NULL to indicate that there is no callback
+ * function/handler.  The compiler technically can't guarantee that a real
+ * function will have a non-zero address, and so it will generate code to
+ * check for !NULL, whereas comparing against a stub will be elided at compile
+ * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
+ */
+static void kvm_null_fn(void)
 {
-	struct kvm *kvm = mmu_notifier_to_kvm(mn);
-	int idx;
+
+}
+#define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)
+
+/* Iterate over each memslot intersecting [start, last] (inclusive) range */
+#define kvm_for_each_memslot_in_hva_range(node, slots, start, last)	     \
+	for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \
+	     node;							     \
+	     node = interval_tree_iter_next(node, start, last))	     \
+
+static __always_inline kvm_mn_ret_t kvm_handle_hva_range(struct kvm *kvm,
+							 const struct kvm_mmu_notifier_range *range)
+{
+	struct kvm_mmu_notifier_return r = {
+		.ret = false,
+		.found_memslot = false,
+	};
+	struct kvm_gfn_range gfn_range;
+	struct kvm_memory_slot *slot;
+	struct kvm_memslots *slots;
+	int i, idx;
+
+	if (WARN_ON_ONCE(range->end <= range->start))
+		return r;
+
+	/* A null handler is allowed if and only if on_lock() is provided. */
+	if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
+			 IS_KVM_NULL_FN(range->handler)))
+		return r;
+
+	/* on_lock will never be called for lockless walks */
+	if (WARN_ON_ONCE(range->lockless && !IS_KVM_NULL_FN(range->on_lock)))
+		return r;
 
 	idx = srcu_read_lock(&kvm->srcu);
-	spin_lock(&kvm->mmu_lock);
-	kvm->mmu_notifier_seq++;
 
-	if (kvm_set_spte_hva(kvm, address, pte))
+	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
+		struct interval_tree_node *node;
+
+		slots = __kvm_memslots(kvm, i);
+		kvm_for_each_memslot_in_hva_range(node, slots,
+						  range->start, range->end - 1) {
+			unsigned long hva_start, hva_end;
+
+			slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]);
+			hva_start = max_t(unsigned long, range->start, slot->userspace_addr);
+			hva_end = min_t(unsigned long, range->end,
+					slot->userspace_addr + (slot->npages << PAGE_SHIFT));
+
+			/*
+			 * To optimize for the likely case where the address
+			 * range is covered by zero or one memslots, don't
+			 * bother making these conditional (to avoid writes on
+			 * the second or later invocation of the handler).
+			 */
+			gfn_range.arg = range->arg;
+			gfn_range.may_block = range->may_block;
+			/*
+			 * HVA-based notifications aren't relevant to private
+			 * mappings as they don't have a userspace mapping.
+			 */
+			gfn_range.attr_filter = KVM_FILTER_SHARED;
+
+			/*
+			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
+			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
+			 */
+			gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
+			gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
+			gfn_range.slot = slot;
+			gfn_range.lockless = range->lockless;
+
+			if (!r.found_memslot) {
+				r.found_memslot = true;
+				if (!range->lockless) {
+					KVM_MMU_LOCK(kvm);
+					if (!IS_KVM_NULL_FN(range->on_lock))
+						range->on_lock(kvm);
+
+					if (IS_KVM_NULL_FN(range->handler))
+						goto mmu_unlock;
+				}
+			}
+			r.ret |= range->handler(kvm, &gfn_range);
+		}
+	}
+
+	if (range->flush_on_ret && r.ret)
 		kvm_flush_remote_tlbs(kvm);
 
-	spin_unlock(&kvm->mmu_lock);
+mmu_unlock:
+	if (r.found_memslot && !range->lockless)
+		KVM_MMU_UNLOCK(kvm);
+
 	srcu_read_unlock(&kvm->srcu, idx);
+
+	return r;
 }
 
-static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
-					const struct mmu_notifier_range *range)
+static __always_inline int kvm_age_hva_range(struct mmu_notifier *mn,
+						unsigned long start,
+						unsigned long end,
+						gfn_handler_t handler,
+						bool flush_on_ret)
 {
 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
-	int need_tlb_flush = 0, idx;
-	int ret;
+	const struct kvm_mmu_notifier_range range = {
+		.start		= start,
+		.end		= end,
+		.handler	= handler,
+		.on_lock	= (void *)kvm_null_fn,
+		.flush_on_ret	= flush_on_ret,
+		.may_block	= false,
+		.lockless	= IS_ENABLED(CONFIG_KVM_MMU_LOCKLESS_AGING),
+	};
 
-	idx = srcu_read_lock(&kvm->srcu);
-	spin_lock(&kvm->mmu_lock);
+	return kvm_handle_hva_range(kvm, &range).ret;
+}
+
+static __always_inline int kvm_age_hva_range_no_flush(struct mmu_notifier *mn,
+						      unsigned long start,
+						      unsigned long end,
+						      gfn_handler_t handler)
+{
+	return kvm_age_hva_range(mn, start, end, handler, false);
+}
+
+void kvm_mmu_invalidate_begin(struct kvm *kvm)
+{
+	lockdep_assert_held_write(&kvm->mmu_lock);
 	/*
 	 * The count increase must become visible at unlock time as no
 	 * spte can be established without taking the mmu_lock and
 	 * count is also read inside the mmu_lock critical section.
 	 */
-	kvm->mmu_notifier_count++;
-	need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end);
-	need_tlb_flush |= kvm->tlbs_dirty;
-	/* we've to flush the tlb before the pages can be freed */
-	if (need_tlb_flush)
-		kvm_flush_remote_tlbs(kvm);
+	kvm->mmu_invalidate_in_progress++;
 
-	spin_unlock(&kvm->mmu_lock);
+	if (likely(kvm->mmu_invalidate_in_progress == 1)) {
+		kvm->mmu_invalidate_range_start = INVALID_GPA;
+		kvm->mmu_invalidate_range_end = INVALID_GPA;
+	}
+}
 
-	ret = kvm_arch_mmu_notifier_invalidate_range(kvm, range->start,
-					range->end, range->blockable);
+void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
+{
+	lockdep_assert_held_write(&kvm->mmu_lock);
 
-	srcu_read_unlock(&kvm->srcu, idx);
+	WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress);
 
-	return ret;
+	if (likely(kvm->mmu_invalidate_range_start == INVALID_GPA)) {
+		kvm->mmu_invalidate_range_start = start;
+		kvm->mmu_invalidate_range_end = end;
+	} else {
+		/*
+		 * Fully tracking multiple concurrent ranges has diminishing
+		 * returns. Keep things simple and just find the minimal range
+		 * which includes the current and new ranges. As there won't be
+		 * enough information to subtract a range after its invalidate
+		 * completes, any ranges invalidated concurrently will
+		 * accumulate and persist until all outstanding invalidates
+		 * complete.
+		 */
+		kvm->mmu_invalidate_range_start =
+			min(kvm->mmu_invalidate_range_start, start);
+		kvm->mmu_invalidate_range_end =
+			max(kvm->mmu_invalidate_range_end, end);
+	}
 }
 
-static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
+bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
+	return kvm_unmap_gfn_range(kvm, range);
+}
+
+static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 					const struct mmu_notifier_range *range)
 {
 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	const struct kvm_mmu_notifier_range hva_range = {
+		.start		= range->start,
+		.end		= range->end,
+		.handler	= kvm_mmu_unmap_gfn_range,
+		.on_lock	= kvm_mmu_invalidate_begin,
+		.flush_on_ret	= true,
+		.may_block	= mmu_notifier_range_blockable(range),
+	};
+
+	trace_kvm_unmap_hva_range(range->start, range->end);
+
+	/*
+	 * Prevent memslot modification between range_start() and range_end()
+	 * so that conditionally locking provides the same result in both
+	 * functions.  Without that guarantee, the mmu_invalidate_in_progress
+	 * adjustments will be imbalanced.
+	 *
+	 * Pairs with the decrement in range_end().
+	 */
+	spin_lock(&kvm->mn_invalidate_lock);
+	kvm->mn_active_invalidate_count++;
+	spin_unlock(&kvm->mn_invalidate_lock);
+
+	/*
+	 * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e.
+	 * before acquiring mmu_lock, to avoid holding mmu_lock while acquiring
+	 * each cache's lock.  There are relatively few caches in existence at
+	 * any given time, and the caches themselves can check for hva overlap,
+	 * i.e. don't need to rely on memslot overlap checks for performance.
+	 * Because this runs without holding mmu_lock, the pfn caches must use
+	 * mn_active_invalidate_count (see above) instead of
+	 * mmu_invalidate_in_progress.
+	 */
+	gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end);
+
+	/*
+	 * If one or more memslots were found and thus zapped, notify arch code
+	 * that guest memory has been reclaimed.  This needs to be done *after*
+	 * dropping mmu_lock, as x86's reclaim path is slooooow.
+	 */
+	if (kvm_handle_hva_range(kvm, &hva_range).found_memslot)
+		kvm_arch_guest_memory_reclaimed(kvm);
+
+	return 0;
+}
+
+void kvm_mmu_invalidate_end(struct kvm *kvm)
+{
+	lockdep_assert_held_write(&kvm->mmu_lock);
 
-	spin_lock(&kvm->mmu_lock);
 	/*
 	 * This sequence increase will notify the kvm page fault that
 	 * the page that is going to be mapped in the spte could have
 	 * been freed.
 	 */
-	kvm->mmu_notifier_seq++;
+	kvm->mmu_invalidate_seq++;
 	smp_wmb();
 	/*
 	 * The above sequence increase must be visible before the
 	 * below count decrease, which is ensured by the smp_wmb above
-	 * in conjunction with the smp_rmb in mmu_notifier_retry().
+	 * in conjunction with the smp_rmb in mmu_invalidate_retry().
 	 */
-	kvm->mmu_notifier_count--;
-	spin_unlock(&kvm->mmu_lock);
+	kvm->mmu_invalidate_in_progress--;
+	KVM_BUG_ON(kvm->mmu_invalidate_in_progress < 0, kvm);
 
-	BUG_ON(kvm->mmu_notifier_count < 0);
+	/*
+	 * Assert that at least one range was added between start() and end().
+	 * Not adding a range isn't fatal, but it is a KVM bug.
+	 */
+	WARN_ON_ONCE(kvm->mmu_invalidate_range_start == INVALID_GPA);
 }
 
-static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
-					      struct mm_struct *mm,
-					      unsigned long start,
-					      unsigned long end)
+static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
+					const struct mmu_notifier_range *range)
 {
 	struct kvm *kvm = mmu_notifier_to_kvm(mn);
-	int young, idx;
+	const struct kvm_mmu_notifier_range hva_range = {
+		.start		= range->start,
+		.end		= range->end,
+		.handler	= (void *)kvm_null_fn,
+		.on_lock	= kvm_mmu_invalidate_end,
+		.flush_on_ret	= false,
+		.may_block	= mmu_notifier_range_blockable(range),
+	};
+	bool wake;
 
-	idx = srcu_read_lock(&kvm->srcu);
-	spin_lock(&kvm->mmu_lock);
+	kvm_handle_hva_range(kvm, &hva_range);
 
-	young = kvm_age_hva(kvm, start, end);
-	if (young)
-		kvm_flush_remote_tlbs(kvm);
+	/* Pairs with the increment in range_start(). */
+	spin_lock(&kvm->mn_invalidate_lock);
+	if (!WARN_ON_ONCE(!kvm->mn_active_invalidate_count))
+		--kvm->mn_active_invalidate_count;
+	wake = !kvm->mn_active_invalidate_count;
+	spin_unlock(&kvm->mn_invalidate_lock);
 
-	spin_unlock(&kvm->mmu_lock);
-	srcu_read_unlock(&kvm->srcu, idx);
+	/*
+	 * There can only be one waiter, since the wait happens under
+	 * slots_lock.
+	 */
+	if (wake)
+		rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
+}
+
+static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
+					      struct mm_struct *mm,
+					      unsigned long start,
+					      unsigned long end)
+{
+	trace_kvm_age_hva(start, end);
 
-	return young;
+	return kvm_age_hva_range(mn, start, end, kvm_age_gfn,
+				 !IS_ENABLED(CONFIG_KVM_ELIDE_TLB_FLUSH_IF_YOUNG));
 }
 
 static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
@@ -443,11 +847,8 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
 					unsigned long start,
 					unsigned long end)
 {
-	struct kvm *kvm = mmu_notifier_to_kvm(mn);
-	int young, idx;
+	trace_kvm_age_hva(start, end);
 
-	idx = srcu_read_lock(&kvm->srcu);
-	spin_lock(&kvm->mmu_lock);
 	/*
 	 * Even though we do not flush TLB, this will still adversely
 	 * affect performance on pre-Haswell Intel EPT, where there is
@@ -461,27 +862,17 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
 	 * cadence. If we find this inaccurate, we might come up with a
 	 * more sophisticated heuristic later.
 	 */
-	young = kvm_age_hva(kvm, start, end);
-	spin_unlock(&kvm->mmu_lock);
-	srcu_read_unlock(&kvm->srcu, idx);
-
-	return young;
+	return kvm_age_hva_range_no_flush(mn, start, end, kvm_age_gfn);
 }
 
 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
 				       struct mm_struct *mm,
 				       unsigned long address)
 {
-	struct kvm *kvm = mmu_notifier_to_kvm(mn);
-	int young, idx;
-
-	idx = srcu_read_lock(&kvm->srcu);
-	spin_lock(&kvm->mmu_lock);
-	young = kvm_test_age_hva(kvm, address);
-	spin_unlock(&kvm->mmu_lock);
-	srcu_read_unlock(&kvm->srcu, idx);
+	trace_kvm_test_age_hva(address);
 
-	return young;
+	return kvm_age_hva_range_no_flush(mn, address, address + 1,
+					  kvm_test_age_gfn);
 }
 
 static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
@@ -491,7 +882,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
 	int idx;
 
 	idx = srcu_read_lock(&kvm->srcu);
-	kvm_arch_flush_shadow_all(kvm);
+	kvm_flush_shadow_all(kvm);
 	srcu_read_unlock(&kvm->srcu, idx);
 }
 
@@ -501,7 +892,6 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
 	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
 	.clear_young		= kvm_mmu_notifier_clear_young,
 	.test_young		= kvm_mmu_notifier_test_young,
-	.change_pte		= kvm_mmu_notifier_change_pte,
 	.release		= kvm_mmu_notifier_release,
 };
 
@@ -511,71 +901,108 @@ static int kvm_init_mmu_notifier(struct kvm *kvm)
 	return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
 }
 
-#else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
+#else  /* !CONFIG_KVM_GENERIC_MMU_NOTIFIER */
 
 static int kvm_init_mmu_notifier(struct kvm *kvm)
 {
 	return 0;
 }
 
-#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
+#endif /* CONFIG_KVM_GENERIC_MMU_NOTIFIER */
 
-static struct kvm_memslots *kvm_alloc_memslots(void)
+#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
+static int kvm_pm_notifier_call(struct notifier_block *bl,
+				unsigned long state,
+				void *unused)
 {
-	int i;
-	struct kvm_memslots *slots;
+	struct kvm *kvm = container_of(bl, struct kvm, pm_notifier);
 
-	slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
-	if (!slots)
-		return NULL;
+	return kvm_arch_pm_notifier(kvm, state);
+}
 
-	for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
-		slots->id_to_index[i] = slots->memslots[i].id = i;
+static void kvm_init_pm_notifier(struct kvm *kvm)
+{
+	kvm->pm_notifier.notifier_call = kvm_pm_notifier_call;
+	/* Suspend KVM before we suspend ftrace, RCU, etc. */
+	kvm->pm_notifier.priority = INT_MAX;
+	register_pm_notifier(&kvm->pm_notifier);
+}
 
-	return slots;
+static void kvm_destroy_pm_notifier(struct kvm *kvm)
+{
+	unregister_pm_notifier(&kvm->pm_notifier);
+}
+#else /* !CONFIG_HAVE_KVM_PM_NOTIFIER */
+static void kvm_init_pm_notifier(struct kvm *kvm)
+{
+}
+
+static void kvm_destroy_pm_notifier(struct kvm *kvm)
+{
 }
+#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
 
 static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
 {
 	if (!memslot->dirty_bitmap)
 		return;
 
-	kvfree(memslot->dirty_bitmap);
+	vfree(memslot->dirty_bitmap);
 	memslot->dirty_bitmap = NULL;
 }
 
-/*
- * Free any memory in @free but not in @dont.
- */
-static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
-			      struct kvm_memory_slot *dont)
+/* This does not remove the slot from struct kvm_memslots data structures */
+static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
 {
-	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
-		kvm_destroy_dirty_bitmap(free);
+	if (slot->flags & KVM_MEM_GUEST_MEMFD)
+		kvm_gmem_unbind(slot);
+
+	kvm_destroy_dirty_bitmap(slot);
 
-	kvm_arch_free_memslot(kvm, free, dont);
+	kvm_arch_free_memslot(kvm, slot);
 
-	free->npages = 0;
+	kfree(slot);
 }
 
 static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
 {
+	struct hlist_node *idnode;
 	struct kvm_memory_slot *memslot;
+	int bkt;
 
-	if (!slots)
+	/*
+	 * The same memslot objects live in both active and inactive sets,
+	 * arbitrarily free using index '1' so the second invocation of this
+	 * function isn't operating over a structure with dangling pointers
+	 * (even though this function isn't actually touching them).
+	 */
+	if (!slots->node_idx)
 		return;
 
-	kvm_for_each_memslot(memslot, slots)
-		kvm_free_memslot(kvm, memslot, NULL);
+	hash_for_each_safe(slots->id_hash, bkt, idnode, memslot, id_node[1])
+		kvm_free_memslot(kvm, memslot);
+}
 
-	kvfree(slots);
+static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc)
+{
+	switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) {
+	case KVM_STATS_TYPE_INSTANT:
+		return 0444;
+	case KVM_STATS_TYPE_CUMULATIVE:
+	case KVM_STATS_TYPE_PEAK:
+	default:
+		return 0644;
+	}
 }
 
+
 static void kvm_destroy_vm_debugfs(struct kvm *kvm)
 {
 	int i;
+	int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
+				      kvm_vcpu_stats_header.num_desc;
 
-	if (!kvm->debugfs_dentry)
+	if (IS_ERR(kvm->debugfs_dentry))
 		return;
 
 	debugfs_remove_recursive(kvm->debugfs_dentry);
@@ -587,119 +1014,236 @@ static void kvm_destroy_vm_debugfs(struct kvm *kvm)
 	}
 }
 
-static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
+static int kvm_create_vm_debugfs(struct kvm *kvm, const char *fdname)
 {
+	static DEFINE_MUTEX(kvm_debugfs_lock);
+	struct dentry *dent;
 	char dir_name[ITOA_MAX_LEN * 2];
 	struct kvm_stat_data *stat_data;
-	struct kvm_stats_debugfs_item *p;
+	const struct _kvm_stats_desc *pdesc;
+	int i, ret = -ENOMEM;
+	int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
+				      kvm_vcpu_stats_header.num_desc;
 
 	if (!debugfs_initialized())
 		return 0;
 
-	snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd);
-	kvm->debugfs_dentry = debugfs_create_dir(dir_name, kvm_debugfs_dir);
+	snprintf(dir_name, sizeof(dir_name), "%d-%s", task_pid_nr(current), fdname);
+	mutex_lock(&kvm_debugfs_lock);
+	dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
+	if (dent) {
+		pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
+		dput(dent);
+		mutex_unlock(&kvm_debugfs_lock);
+		return 0;
+	}
+	dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
+	mutex_unlock(&kvm_debugfs_lock);
+	if (IS_ERR(dent))
+		return 0;
 
+	kvm->debugfs_dentry = dent;
 	kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
 					 sizeof(*kvm->debugfs_stat_data),
-					 GFP_KERNEL);
+					 GFP_KERNEL_ACCOUNT);
 	if (!kvm->debugfs_stat_data)
-		return -ENOMEM;
+		goto out_err;
 
-	for (p = debugfs_entries; p->name; p++) {
-		stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL);
+	for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
+		pdesc = &kvm_vm_stats_desc[i];
+		stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
 		if (!stat_data)
-			return -ENOMEM;
+			goto out_err;
+
+		stat_data->kvm = kvm;
+		stat_data->desc = pdesc;
+		stat_data->kind = KVM_STAT_VM;
+		kvm->debugfs_stat_data[i] = stat_data;
+		debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
+				    kvm->debugfs_dentry, stat_data,
+				    &stat_fops_per_vm);
+	}
+
+	for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
+		pdesc = &kvm_vcpu_stats_desc[i];
+		stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
+		if (!stat_data)
+			goto out_err;
 
 		stat_data->kvm = kvm;
-		stat_data->offset = p->offset;
-		kvm->debugfs_stat_data[p - debugfs_entries] = stat_data;
-		debugfs_create_file(p->name, 0644, kvm->debugfs_dentry,
-				    stat_data, stat_fops_per_vm[p->kind]);
+		stat_data->desc = pdesc;
+		stat_data->kind = KVM_STAT_VCPU;
+		kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data;
+		debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
+				    kvm->debugfs_dentry, stat_data,
+				    &stat_fops_per_vm);
 	}
+
+	kvm_arch_create_vm_debugfs(kvm);
 	return 0;
+out_err:
+	kvm_destroy_vm_debugfs(kvm);
+	return ret;
 }
 
-static struct kvm *kvm_create_vm(unsigned long type)
+/*
+ * Called just after removing the VM from the vm_list, but before doing any
+ * other destruction.
+ */
+void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
+{
+}
+
+/*
+ * Called after per-vm debugfs created.  When called kvm->debugfs_dentry should
+ * be setup already, so we can create arch-specific debugfs entries under it.
+ * Cleanup should be automatic done in kvm_destroy_vm_debugfs() recursively, so
+ * a per-arch destroy interface is not needed.
+ */
+void __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
+{
+}
+
+/* Called only on cleanup and destruction paths when there are no users. */
+static inline struct kvm_io_bus *kvm_get_bus_for_destruction(struct kvm *kvm,
+							     enum kvm_bus idx)
+{
+	return rcu_dereference_protected(kvm->buses[idx],
+					 !refcount_read(&kvm->users_count));
+}
+
+static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
 {
-	int r, i;
 	struct kvm *kvm = kvm_arch_alloc_vm();
+	struct kvm_memslots *slots;
+	int r, i, j;
 
 	if (!kvm)
 		return ERR_PTR(-ENOMEM);
 
-	spin_lock_init(&kvm->mmu_lock);
+	KVM_MMU_LOCK_INIT(kvm);
 	mmgrab(current->mm);
 	kvm->mm = current->mm;
 	kvm_eventfd_init(kvm);
 	mutex_init(&kvm->lock);
 	mutex_init(&kvm->irq_lock);
 	mutex_init(&kvm->slots_lock);
-	refcount_set(&kvm->users_count, 1);
-	INIT_LIST_HEAD(&kvm->devices);
+	mutex_init(&kvm->slots_arch_lock);
+	spin_lock_init(&kvm->mn_invalidate_lock);
+	rcuwait_init(&kvm->mn_memslots_update_rcuwait);
+	xa_init(&kvm->vcpu_array);
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+	xa_init(&kvm->mem_attr_array);
+#endif
 
-	r = kvm_arch_init_vm(kvm, type);
-	if (r)
-		goto out_err_no_disable;
+	INIT_LIST_HEAD(&kvm->gpc_list);
+	spin_lock_init(&kvm->gpc_lock);
 
-	r = hardware_enable_all();
-	if (r)
-		goto out_err_no_disable;
-
-#ifdef CONFIG_HAVE_KVM_IRQFD
-	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
-#endif
+	INIT_LIST_HEAD(&kvm->devices);
+	kvm->max_vcpus = KVM_MAX_VCPUS;
 
 	BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
 
-	r = -ENOMEM;
-	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
-		struct kvm_memslots *slots = kvm_alloc_memslots();
-		if (!slots)
-			goto out_err_no_srcu;
-		/*
-		 * Generations must be different for each address space.
-		 * Init kvm generation close to the maximum to easily test the
-		 * code of handling generation number wrap-around.
-		 */
-		slots->generation = i * 2 - 150;
-		rcu_assign_pointer(kvm->memslots[i], slots);
-	}
+	/*
+	 * Force subsequent debugfs file creations to fail if the VM directory
+	 * is not created (by kvm_create_vm_debugfs()).
+	 */
+	kvm->debugfs_dentry = ERR_PTR(-ENOENT);
+
+	snprintf(kvm->stats_id, sizeof(kvm->stats_id), "kvm-%d",
+		 task_pid_nr(current));
 
+	r = -ENOMEM;
 	if (init_srcu_struct(&kvm->srcu))
 		goto out_err_no_srcu;
 	if (init_srcu_struct(&kvm->irq_srcu))
 		goto out_err_no_irq_srcu;
+
+	r = kvm_init_irq_routing(kvm);
+	if (r)
+		goto out_err_no_irq_routing;
+
+	refcount_set(&kvm->users_count, 1);
+
+	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
+		for (j = 0; j < 2; j++) {
+			slots = &kvm->__memslots[i][j];
+
+			atomic_long_set(&slots->last_used_slot, (unsigned long)NULL);
+			slots->hva_tree = RB_ROOT_CACHED;
+			slots->gfn_tree = RB_ROOT;
+			hash_init(slots->id_hash);
+			slots->node_idx = j;
+
+			/* Generations must be different for each address space. */
+			slots->generation = i;
+		}
+
+		rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]);
+	}
+
+	r = -ENOMEM;
 	for (i = 0; i < KVM_NR_BUSES; i++) {
 		rcu_assign_pointer(kvm->buses[i],
-			kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL));
+			kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
 		if (!kvm->buses[i])
-			goto out_err;
+			goto out_err_no_arch_destroy_vm;
 	}
 
+	r = kvm_arch_init_vm(kvm, type);
+	if (r)
+		goto out_err_no_arch_destroy_vm;
+
+	r = kvm_enable_virtualization();
+	if (r)
+		goto out_err_no_disable;
+
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
+	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
+#endif
+
 	r = kvm_init_mmu_notifier(kvm);
 	if (r)
-		goto out_err;
+		goto out_err_no_mmu_notifier;
+
+	r = kvm_coalesced_mmio_init(kvm);
+	if (r < 0)
+		goto out_no_coalesced_mmio;
+
+	r = kvm_create_vm_debugfs(kvm, fdname);
+	if (r)
+		goto out_err_no_debugfs;
 
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	list_add(&kvm->vm_list, &vm_list);
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 
 	preempt_notifier_inc();
+	kvm_init_pm_notifier(kvm);
 
 	return kvm;
 
-out_err:
+out_err_no_debugfs:
+	kvm_coalesced_mmio_free(kvm);
+out_no_coalesced_mmio:
+#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
+	if (kvm->mmu_notifier.ops)
+		mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
+#endif
+out_err_no_mmu_notifier:
+	kvm_disable_virtualization();
+out_err_no_disable:
+	kvm_arch_destroy_vm(kvm);
+out_err_no_arch_destroy_vm:
+	WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
+	for (i = 0; i < KVM_NR_BUSES; i++)
+		kfree(kvm_get_bus_for_destruction(kvm, i));
+	kvm_free_irq_routing(kvm);
+out_err_no_irq_routing:
 	cleanup_srcu_struct(&kvm->irq_srcu);
 out_err_no_irq_srcu:
 	cleanup_srcu_struct(&kvm->srcu);
 out_err_no_srcu:
-	hardware_disable_all();
-out_err_no_disable:
-	refcount_set(&kvm->users_count, 0);
-	for (i = 0; i < KVM_NR_BUSES; i++)
-		kfree(kvm_get_bus(kvm, i));
-	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
-		kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
 	kvm_arch_free_vm(kvm);
 	mmdrop(current->mm);
 	return ERR_PTR(r);
@@ -713,6 +1257,12 @@ static void kvm_destroy_devices(struct kvm *kvm)
 	 * We do not need to take the kvm->lock here, because nobody else
 	 * has a reference to the struct kvm at this point and therefore
 	 * cannot access the devices list anyhow.
+	 *
+	 * The device list is generally managed as an rculist, but list_del()
+	 * is used intentionally here. If a bug in KVM introduced a reader that
+	 * was not backed by a reference on the kvm struct, the hope is that
+	 * it'd consume the poisoned forward pointer instead of suffering a
+	 * use-after-free, even though this cannot be guaranteed.
 	 */
 	list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
 		list_del(&dev->vm_node);
@@ -725,35 +1275,60 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	int i;
 	struct mm_struct *mm = kvm->mm;
 
+	kvm_destroy_pm_notifier(kvm);
 	kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
 	kvm_destroy_vm_debugfs(kvm);
-	kvm_arch_sync_events(kvm);
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	list_del(&kvm->vm_list);
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
+	kvm_arch_pre_destroy_vm(kvm);
+
 	kvm_free_irq_routing(kvm);
 	for (i = 0; i < KVM_NR_BUSES; i++) {
-		struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
+		struct kvm_io_bus *bus = kvm_get_bus_for_destruction(kvm, i);
 
 		if (bus)
 			kvm_io_bus_destroy(bus);
 		kvm->buses[i] = NULL;
 	}
 	kvm_coalesced_mmio_free(kvm);
-#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
 	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
+	/*
+	 * At this point, pending calls to invalidate_range_start()
+	 * have completed but no more MMU notifiers will run, so
+	 * mn_active_invalidate_count may remain unbalanced.
+	 * No threads can be waiting in kvm_swap_active_memslots() as the
+	 * last reference on KVM has been dropped, but freeing
+	 * memslots would deadlock without this manual intervention.
+	 *
+	 * If the count isn't unbalanced, i.e. KVM did NOT unregister its MMU
+	 * notifier between a start() and end(), then there shouldn't be any
+	 * in-progress invalidations.
+	 */
+	WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
+	if (kvm->mn_active_invalidate_count)
+		kvm->mn_active_invalidate_count = 0;
+	else
+		WARN_ON(kvm->mmu_invalidate_in_progress);
 #else
-	kvm_arch_flush_shadow_all(kvm);
+	kvm_flush_shadow_all(kvm);
 #endif
 	kvm_arch_destroy_vm(kvm);
 	kvm_destroy_devices(kvm);
-	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
-		kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
+	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
+		kvm_free_memslots(kvm, &kvm->__memslots[i][0]);
+		kvm_free_memslots(kvm, &kvm->__memslots[i][1]);
+	}
 	cleanup_srcu_struct(&kvm->irq_srcu);
+	srcu_barrier(&kvm->srcu);
 	cleanup_srcu_struct(&kvm->srcu);
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+	xa_destroy(&kvm->mem_attr_array);
+#endif
 	kvm_arch_free_vm(kvm);
 	preempt_notifier_dec();
-	hardware_disable_all();
+	kvm_disable_virtualization();
 	mmdrop(mm);
 }
 
@@ -763,6 +1338,16 @@ void kvm_get_kvm(struct kvm *kvm)
 }
 EXPORT_SYMBOL_GPL(kvm_get_kvm);
 
+/*
+ * Make sure the vm is not during destruction, which is a safe version of
+ * kvm_get_kvm().  Return true if kvm referenced successfully, false otherwise.
+ */
+bool kvm_get_kvm_safe(struct kvm *kvm)
+{
+	return refcount_inc_not_zero(&kvm->users_count);
+}
+EXPORT_SYMBOL_GPL(kvm_get_kvm_safe);
+
 void kvm_put_kvm(struct kvm *kvm)
 {
 	if (refcount_dec_and_test(&kvm->users_count))
@@ -770,6 +1355,18 @@ void kvm_put_kvm(struct kvm *kvm)
 }
 EXPORT_SYMBOL_GPL(kvm_put_kvm);
 
+/*
+ * Used to put a reference that was taken on behalf of an object associated
+ * with a user-visible file descriptor, e.g. a vcpu or device, if installation
+ * of the new file descriptor fails and the reference cannot be transferred to
+ * its final owner.  In such cases, the caller is still actively using @kvm and
+ * will fail miserably if the refcount unexpectedly hits zero.
+ */
+void kvm_put_kvm_no_destroy(struct kvm *kvm)
+{
+	WARN_ON(refcount_dec_and_test(&kvm->users_count));
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_put_kvm_no_destroy);
 
 static int kvm_vm_release(struct inode *inode, struct file *filp)
 {
@@ -781,88 +1378,241 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+int kvm_trylock_all_vcpus(struct kvm *kvm)
+{
+	struct kvm_vcpu *vcpu;
+	unsigned long i, j;
+
+	lockdep_assert_held(&kvm->lock);
+
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		if (!mutex_trylock_nest_lock(&vcpu->mutex, &kvm->lock))
+			goto out_unlock;
+	return 0;
+
+out_unlock:
+	kvm_for_each_vcpu(j, vcpu, kvm) {
+		if (i == j)
+			break;
+		mutex_unlock(&vcpu->mutex);
+	}
+	return -EINTR;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_trylock_all_vcpus);
+
+int kvm_lock_all_vcpus(struct kvm *kvm)
+{
+	struct kvm_vcpu *vcpu;
+	unsigned long i, j;
+	int r;
+
+	lockdep_assert_held(&kvm->lock);
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		r = mutex_lock_killable_nest_lock(&vcpu->mutex, &kvm->lock);
+		if (r)
+			goto out_unlock;
+	}
+	return 0;
+
+out_unlock:
+	kvm_for_each_vcpu(j, vcpu, kvm) {
+		if (i == j)
+			break;
+		mutex_unlock(&vcpu->mutex);
+	}
+	return r;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lock_all_vcpus);
+
+void kvm_unlock_all_vcpus(struct kvm *kvm)
+{
+	struct kvm_vcpu *vcpu;
+	unsigned long i;
+
+	lockdep_assert_held(&kvm->lock);
+
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		mutex_unlock(&vcpu->mutex);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_unlock_all_vcpus);
+
 /*
  * Allocation size is twice as large as the actual dirty bitmap size.
- * See x86's kvm_vm_ioctl_get_dirty_log() why this is needed.
+ * See kvm_vm_ioctl_get_dirty_log() why this is needed.
  */
-static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
+static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
 {
-	unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
+	unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(memslot);
 
-	memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL);
+	memslot->dirty_bitmap = __vcalloc(2, dirty_bytes, GFP_KERNEL_ACCOUNT);
 	if (!memslot->dirty_bitmap)
 		return -ENOMEM;
 
 	return 0;
 }
 
+static struct kvm_memslots *kvm_get_inactive_memslots(struct kvm *kvm, int as_id)
+{
+	struct kvm_memslots *active = __kvm_memslots(kvm, as_id);
+	int node_idx_inactive = active->node_idx ^ 1;
+
+	return &kvm->__memslots[as_id][node_idx_inactive];
+}
+
 /*
- * Insert memslot and re-sort memslots based on their GFN,
- * so binary search could be used to lookup GFN.
- * Sorting algorithm takes advantage of having initially
- * sorted array and known changed memslot position.
+ * Helper to get the address space ID when one of memslot pointers may be NULL.
+ * This also serves as a sanity that at least one of the pointers is non-NULL,
+ * and that their address space IDs don't diverge.
  */
-static void update_memslots(struct kvm_memslots *slots,
-			    struct kvm_memory_slot *new,
-			    enum kvm_mr_change change)
+static int kvm_memslots_get_as_id(struct kvm_memory_slot *a,
+				  struct kvm_memory_slot *b)
 {
-	int id = new->id;
-	int i = slots->id_to_index[id];
-	struct kvm_memory_slot *mslots = slots->memslots;
+	if (WARN_ON_ONCE(!a && !b))
+		return 0;
 
-	WARN_ON(mslots[i].id != id);
-	switch (change) {
-	case KVM_MR_CREATE:
-		slots->used_slots++;
-		WARN_ON(mslots[i].npages || !new->npages);
-		break;
-	case KVM_MR_DELETE:
-		slots->used_slots--;
-		WARN_ON(new->npages || !mslots[i].npages);
-		break;
-	default:
-		break;
+	if (!a)
+		return b->as_id;
+	if (!b)
+		return a->as_id;
+
+	WARN_ON_ONCE(a->as_id != b->as_id);
+	return a->as_id;
+}
+
+static void kvm_insert_gfn_node(struct kvm_memslots *slots,
+				struct kvm_memory_slot *slot)
+{
+	struct rb_root *gfn_tree = &slots->gfn_tree;
+	struct rb_node **node, *parent;
+	int idx = slots->node_idx;
+
+	parent = NULL;
+	for (node = &gfn_tree->rb_node; *node; ) {
+		struct kvm_memory_slot *tmp;
+
+		tmp = container_of(*node, struct kvm_memory_slot, gfn_node[idx]);
+		parent = *node;
+		if (slot->base_gfn < tmp->base_gfn)
+			node = &(*node)->rb_left;
+		else if (slot->base_gfn > tmp->base_gfn)
+			node = &(*node)->rb_right;
+		else
+			BUG();
 	}
 
-	while (i < KVM_MEM_SLOTS_NUM - 1 &&
-	       new->base_gfn <= mslots[i + 1].base_gfn) {
-		if (!mslots[i + 1].npages)
-			break;
-		mslots[i] = mslots[i + 1];
-		slots->id_to_index[mslots[i].id] = i;
-		i++;
+	rb_link_node(&slot->gfn_node[idx], parent, node);
+	rb_insert_color(&slot->gfn_node[idx], gfn_tree);
+}
+
+static void kvm_erase_gfn_node(struct kvm_memslots *slots,
+			       struct kvm_memory_slot *slot)
+{
+	rb_erase(&slot->gfn_node[slots->node_idx], &slots->gfn_tree);
+}
+
+static void kvm_replace_gfn_node(struct kvm_memslots *slots,
+				 struct kvm_memory_slot *old,
+				 struct kvm_memory_slot *new)
+{
+	int idx = slots->node_idx;
+
+	WARN_ON_ONCE(old->base_gfn != new->base_gfn);
+
+	rb_replace_node(&old->gfn_node[idx], &new->gfn_node[idx],
+			&slots->gfn_tree);
+}
+
+/*
+ * Replace @old with @new in the inactive memslots.
+ *
+ * With NULL @old this simply adds @new.
+ * With NULL @new this simply removes @old.
+ *
+ * If @new is non-NULL its hva_node[slots_idx] range has to be set
+ * appropriately.
+ */
+static void kvm_replace_memslot(struct kvm *kvm,
+				struct kvm_memory_slot *old,
+				struct kvm_memory_slot *new)
+{
+	int as_id = kvm_memslots_get_as_id(old, new);
+	struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
+	int idx = slots->node_idx;
+
+	if (old) {
+		hash_del(&old->id_node[idx]);
+		interval_tree_remove(&old->hva_node[idx], &slots->hva_tree);
+
+		if ((long)old == atomic_long_read(&slots->last_used_slot))
+			atomic_long_set(&slots->last_used_slot, (long)new);
+
+		if (!new) {
+			kvm_erase_gfn_node(slots, old);
+			return;
+		}
 	}
 
 	/*
-	 * The ">=" is needed when creating a slot with base_gfn == 0,
-	 * so that it moves before all those with base_gfn == npages == 0.
-	 *
-	 * On the other hand, if new->npages is zero, the above loop has
-	 * already left i pointing to the beginning of the empty part of
-	 * mslots, and the ">=" would move the hole backwards in this
-	 * case---which is wrong.  So skip the loop when deleting a slot.
+	 * Initialize @new's hva range.  Do this even when replacing an @old
+	 * slot, kvm_copy_memslot() deliberately does not touch node data.
 	 */
-	if (new->npages) {
-		while (i > 0 &&
-		       new->base_gfn >= mslots[i - 1].base_gfn) {
-			mslots[i] = mslots[i - 1];
-			slots->id_to_index[mslots[i].id] = i;
-			i--;
-		}
-	} else
-		WARN_ON_ONCE(i != slots->used_slots);
+	new->hva_node[idx].start = new->userspace_addr;
+	new->hva_node[idx].last = new->userspace_addr +
+				  (new->npages << PAGE_SHIFT) - 1;
+
+	/*
+	 * (Re)Add the new memslot.  There is no O(1) interval_tree_replace(),
+	 * hva_node needs to be swapped with remove+insert even though hva can't
+	 * change when replacing an existing slot.
+	 */
+	hash_add(slots->id_hash, &new->id_node[idx], new->id);
+	interval_tree_insert(&new->hva_node[idx], &slots->hva_tree);
 
-	mslots[i] = *new;
-	slots->id_to_index[mslots[i].id] = i;
+	/*
+	 * If the memslot gfn is unchanged, rb_replace_node() can be used to
+	 * switch the node in the gfn tree instead of removing the old and
+	 * inserting the new as two separate operations. Replacement is a
+	 * single O(1) operation versus two O(log(n)) operations for
+	 * remove+insert.
+	 */
+	if (old && old->base_gfn == new->base_gfn) {
+		kvm_replace_gfn_node(slots, old, new);
+	} else {
+		if (old)
+			kvm_erase_gfn_node(slots, old);
+		kvm_insert_gfn_node(slots, new);
+	}
 }
 
-static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
+/*
+ * Flags that do not access any of the extra space of struct
+ * kvm_userspace_memory_region2.  KVM_SET_USER_MEMORY_REGION_V1_FLAGS
+ * only allows these.
+ */
+#define KVM_SET_USER_MEMORY_REGION_V1_FLAGS \
+	(KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY)
+
+static int check_memory_region_flags(struct kvm *kvm,
+				     const struct kvm_userspace_memory_region2 *mem)
 {
 	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
 
-#ifdef __KVM_HAVE_READONLY_MEM
-	valid_flags |= KVM_MEM_READONLY;
-#endif
+	if (IS_ENABLED(CONFIG_KVM_GUEST_MEMFD))
+		valid_flags |= KVM_MEM_GUEST_MEMFD;
+
+	/* Dirty logging private memory is not currently supported. */
+	if (mem->flags & KVM_MEM_GUEST_MEMFD)
+		valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
+
+	/*
+	 * GUEST_MEMFD is incompatible with read-only memslots, as writes to
+	 * read-only memslots have emulated MMIO, not page fault, semantics,
+	 * and KVM doesn't allow emulated MMIO for private memory.
+	 */
+	if (kvm_arch_has_readonly_mem(kvm) &&
+	    !(mem->flags & KVM_MEM_GUEST_MEMFD))
+		valid_flags |= KVM_MEM_READONLY;
 
 	if (mem->flags & ~valid_flags)
 		return -EINVAL;
@@ -870,271 +1620,596 @@ static int check_memory_region_flags(const struct kvm_userspace_memory_region *m
 	return 0;
 }
 
-static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
-		int as_id, struct kvm_memslots *slots)
+static void kvm_swap_active_memslots(struct kvm *kvm, int as_id)
 {
-	struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id);
+	struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
+
+	/* Grab the generation from the activate memslots. */
+	u64 gen = __kvm_memslots(kvm, as_id)->generation;
+
+	WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
+	slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
 
 	/*
-	 * Set the low bit in the generation, which disables SPTE caching
-	 * until the end of synchronize_srcu_expedited.
+	 * Do not store the new memslots while there are invalidations in
+	 * progress, otherwise the locking in invalidate_range_start and
+	 * invalidate_range_end will be unbalanced.
 	 */
-	WARN_ON(old_memslots->generation & 1);
-	slots->generation = old_memslots->generation + 1;
-
+	spin_lock(&kvm->mn_invalidate_lock);
+	prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
+	while (kvm->mn_active_invalidate_count) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		spin_unlock(&kvm->mn_invalidate_lock);
+		schedule();
+		spin_lock(&kvm->mn_invalidate_lock);
+	}
+	finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
 	rcu_assign_pointer(kvm->memslots[as_id], slots);
+	spin_unlock(&kvm->mn_invalidate_lock);
+
+	/*
+	 * Acquired in kvm_set_memslot. Must be released before synchronize
+	 * SRCU below in order to avoid deadlock with another thread
+	 * acquiring the slots_arch_lock in an srcu critical section.
+	 */
+	mutex_unlock(&kvm->slots_arch_lock);
+
 	synchronize_srcu_expedited(&kvm->srcu);
 
 	/*
-	 * Increment the new memslot generation a second time. This prevents
-	 * vm exits that race with memslot updates from caching a memslot
-	 * generation that will (potentially) be valid forever.
-	 *
+	 * Increment the new memslot generation a second time, dropping the
+	 * update in-progress flag and incrementing the generation based on
+	 * the number of address spaces.  This provides a unique and easily
+	 * identifiable generation number while the memslots are in flux.
+	 */
+	gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
+
+	/*
 	 * Generations must be unique even across address spaces.  We do not need
 	 * a global counter for that, instead the generation space is evenly split
 	 * across address spaces.  For example, with two address spaces, address
-	 * space 0 will use generations 0, 4, 8, ... while * address space 1 will
-	 * use generations 2, 6, 10, 14, ...
+	 * space 0 will use generations 0, 2, 4, ... while address space 1 will
+	 * use generations 1, 3, 5, ...
+	 */
+	gen += kvm_arch_nr_memslot_as_ids(kvm);
+
+	kvm_arch_memslots_updated(kvm, gen);
+
+	slots->generation = gen;
+}
+
+static int kvm_prepare_memory_region(struct kvm *kvm,
+				     const struct kvm_memory_slot *old,
+				     struct kvm_memory_slot *new,
+				     enum kvm_mr_change change)
+{
+	int r;
+
+	/*
+	 * If dirty logging is disabled, nullify the bitmap; the old bitmap
+	 * will be freed on "commit".  If logging is enabled in both old and
+	 * new, reuse the existing bitmap.  If logging is enabled only in the
+	 * new and KVM isn't using a ring buffer, allocate and initialize a
+	 * new bitmap.
 	 */
-	slots->generation += KVM_ADDRESS_SPACE_NUM * 2 - 1;
+	if (change != KVM_MR_DELETE) {
+		if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
+			new->dirty_bitmap = NULL;
+		else if (old && old->dirty_bitmap)
+			new->dirty_bitmap = old->dirty_bitmap;
+		else if (kvm_use_dirty_bitmap(kvm)) {
+			r = kvm_alloc_dirty_bitmap(new);
+			if (r)
+				return r;
+
+			if (kvm_dirty_log_manual_protect_and_init_set(kvm))
+				bitmap_set(new->dirty_bitmap, 0, new->npages);
+		}
+	}
+
+	r = kvm_arch_prepare_memory_region(kvm, old, new, change);
 
-	kvm_arch_memslots_updated(kvm, slots);
+	/* Free the bitmap on failure if it was allocated above. */
+	if (r && new && new->dirty_bitmap && (!old || !old->dirty_bitmap))
+		kvm_destroy_dirty_bitmap(new);
 
-	return old_memslots;
+	return r;
+}
+
+static void kvm_commit_memory_region(struct kvm *kvm,
+				     struct kvm_memory_slot *old,
+				     const struct kvm_memory_slot *new,
+				     enum kvm_mr_change change)
+{
+	int old_flags = old ? old->flags : 0;
+	int new_flags = new ? new->flags : 0;
+	/*
+	 * Update the total number of memslot pages before calling the arch
+	 * hook so that architectures can consume the result directly.
+	 */
+	if (change == KVM_MR_DELETE)
+		kvm->nr_memslot_pages -= old->npages;
+	else if (change == KVM_MR_CREATE)
+		kvm->nr_memslot_pages += new->npages;
+
+	if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES) {
+		int change = (new_flags & KVM_MEM_LOG_DIRTY_PAGES) ? 1 : -1;
+		atomic_set(&kvm->nr_memslots_dirty_logging,
+			   atomic_read(&kvm->nr_memslots_dirty_logging) + change);
+	}
+
+	kvm_arch_commit_memory_region(kvm, old, new, change);
+
+	switch (change) {
+	case KVM_MR_CREATE:
+		/* Nothing more to do. */
+		break;
+	case KVM_MR_DELETE:
+		/* Free the old memslot and all its metadata. */
+		kvm_free_memslot(kvm, old);
+		break;
+	case KVM_MR_MOVE:
+	case KVM_MR_FLAGS_ONLY:
+		/*
+		 * Free the dirty bitmap as needed; the below check encompasses
+		 * both the flags and whether a ring buffer is being used)
+		 */
+		if (old->dirty_bitmap && !new->dirty_bitmap)
+			kvm_destroy_dirty_bitmap(old);
+
+		/*
+		 * The final quirk.  Free the detached, old slot, but only its
+		 * memory, not any metadata.  Metadata, including arch specific
+		 * data, may be reused by @new.
+		 */
+		kfree(old);
+		break;
+	default:
+		BUG();
+	}
 }
 
 /*
- * Allocate some memory and give it an address in the guest physical address
- * space.
- *
- * Discontiguous memory is allowed, mostly for framebuffers.
+ * Activate @new, which must be installed in the inactive slots by the caller,
+ * by swapping the active slots and then propagating @new to @old once @old is
+ * unreachable and can be safely modified.
  *
- * Must be called holding kvm->slots_lock for write.
+ * With NULL @old this simply adds @new to @active (while swapping the sets).
+ * With NULL @new this simply removes @old from @active and frees it
+ * (while also swapping the sets).
  */
-int __kvm_set_memory_region(struct kvm *kvm,
-			    const struct kvm_userspace_memory_region *mem)
+static void kvm_activate_memslot(struct kvm *kvm,
+				 struct kvm_memory_slot *old,
+				 struct kvm_memory_slot *new)
+{
+	int as_id = kvm_memslots_get_as_id(old, new);
+
+	kvm_swap_active_memslots(kvm, as_id);
+
+	/* Propagate the new memslot to the now inactive memslots. */
+	kvm_replace_memslot(kvm, old, new);
+}
+
+static void kvm_copy_memslot(struct kvm_memory_slot *dest,
+			     const struct kvm_memory_slot *src)
 {
+	dest->base_gfn = src->base_gfn;
+	dest->npages = src->npages;
+	dest->dirty_bitmap = src->dirty_bitmap;
+	dest->arch = src->arch;
+	dest->userspace_addr = src->userspace_addr;
+	dest->flags = src->flags;
+	dest->id = src->id;
+	dest->as_id = src->as_id;
+}
+
+static void kvm_invalidate_memslot(struct kvm *kvm,
+				   struct kvm_memory_slot *old,
+				   struct kvm_memory_slot *invalid_slot)
+{
+	/*
+	 * Mark the current slot INVALID.  As with all memslot modifications,
+	 * this must be done on an unreachable slot to avoid modifying the
+	 * current slot in the active tree.
+	 */
+	kvm_copy_memslot(invalid_slot, old);
+	invalid_slot->flags |= KVM_MEMSLOT_INVALID;
+	kvm_replace_memslot(kvm, old, invalid_slot);
+
+	/*
+	 * Activate the slot that is now marked INVALID, but don't propagate
+	 * the slot to the now inactive slots. The slot is either going to be
+	 * deleted or recreated as a new slot.
+	 */
+	kvm_swap_active_memslots(kvm, old->as_id);
+
+	/*
+	 * From this point no new shadow pages pointing to a deleted, or moved,
+	 * memslot will be created.  Validation of sp->gfn happens in:
+	 *	- gfn_to_hva (kvm_read_guest, gfn_to_pfn)
+	 *	- kvm_is_visible_gfn (mmu_check_root)
+	 */
+	kvm_arch_flush_shadow_memslot(kvm, old);
+	kvm_arch_guest_memory_reclaimed(kvm);
+
+	/* Was released by kvm_swap_active_memslots(), reacquire. */
+	mutex_lock(&kvm->slots_arch_lock);
+
+	/*
+	 * Copy the arch-specific field of the newly-installed slot back to the
+	 * old slot as the arch data could have changed between releasing
+	 * slots_arch_lock in kvm_swap_active_memslots() and re-acquiring the lock
+	 * above.  Writers are required to retrieve memslots *after* acquiring
+	 * slots_arch_lock, thus the active slot's data is guaranteed to be fresh.
+	 */
+	old->arch = invalid_slot->arch;
+}
+
+static void kvm_create_memslot(struct kvm *kvm,
+			       struct kvm_memory_slot *new)
+{
+	/* Add the new memslot to the inactive set and activate. */
+	kvm_replace_memslot(kvm, NULL, new);
+	kvm_activate_memslot(kvm, NULL, new);
+}
+
+static void kvm_delete_memslot(struct kvm *kvm,
+			       struct kvm_memory_slot *old,
+			       struct kvm_memory_slot *invalid_slot)
+{
+	/*
+	 * Remove the old memslot (in the inactive memslots) by passing NULL as
+	 * the "new" slot, and for the invalid version in the active slots.
+	 */
+	kvm_replace_memslot(kvm, old, NULL);
+	kvm_activate_memslot(kvm, invalid_slot, NULL);
+}
+
+static void kvm_move_memslot(struct kvm *kvm,
+			     struct kvm_memory_slot *old,
+			     struct kvm_memory_slot *new,
+			     struct kvm_memory_slot *invalid_slot)
+{
+	/*
+	 * Replace the old memslot in the inactive slots, and then swap slots
+	 * and replace the current INVALID with the new as well.
+	 */
+	kvm_replace_memslot(kvm, old, new);
+	kvm_activate_memslot(kvm, invalid_slot, new);
+}
+
+static void kvm_update_flags_memslot(struct kvm *kvm,
+				     struct kvm_memory_slot *old,
+				     struct kvm_memory_slot *new)
+{
+	/*
+	 * Similar to the MOVE case, but the slot doesn't need to be zapped as
+	 * an intermediate step. Instead, the old memslot is simply replaced
+	 * with a new, updated copy in both memslot sets.
+	 */
+	kvm_replace_memslot(kvm, old, new);
+	kvm_activate_memslot(kvm, old, new);
+}
+
+static int kvm_set_memslot(struct kvm *kvm,
+			   struct kvm_memory_slot *old,
+			   struct kvm_memory_slot *new,
+			   enum kvm_mr_change change)
+{
+	struct kvm_memory_slot *invalid_slot;
 	int r;
-	gfn_t base_gfn;
+
+	/*
+	 * Released in kvm_swap_active_memslots().
+	 *
+	 * Must be held from before the current memslots are copied until after
+	 * the new memslots are installed with rcu_assign_pointer, then
+	 * released before the synchronize srcu in kvm_swap_active_memslots().
+	 *
+	 * When modifying memslots outside of the slots_lock, must be held
+	 * before reading the pointer to the current memslots until after all
+	 * changes to those memslots are complete.
+	 *
+	 * These rules ensure that installing new memslots does not lose
+	 * changes made to the previous memslots.
+	 */
+	mutex_lock(&kvm->slots_arch_lock);
+
+	/*
+	 * Invalidate the old slot if it's being deleted or moved.  This is
+	 * done prior to actually deleting/moving the memslot to allow vCPUs to
+	 * continue running by ensuring there are no mappings or shadow pages
+	 * for the memslot when it is deleted/moved.  Without pre-invalidation
+	 * (and without a lock), a window would exist between effecting the
+	 * delete/move and committing the changes in arch code where KVM or a
+	 * guest could access a non-existent memslot.
+	 *
+	 * Modifications are done on a temporary, unreachable slot.  The old
+	 * slot needs to be preserved in case a later step fails and the
+	 * invalidation needs to be reverted.
+	 */
+	if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
+		invalid_slot = kzalloc(sizeof(*invalid_slot), GFP_KERNEL_ACCOUNT);
+		if (!invalid_slot) {
+			mutex_unlock(&kvm->slots_arch_lock);
+			return -ENOMEM;
+		}
+		kvm_invalidate_memslot(kvm, old, invalid_slot);
+	}
+
+	r = kvm_prepare_memory_region(kvm, old, new, change);
+	if (r) {
+		/*
+		 * For DELETE/MOVE, revert the above INVALID change.  No
+		 * modifications required since the original slot was preserved
+		 * in the inactive slots.  Changing the active memslots also
+		 * release slots_arch_lock.
+		 */
+		if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
+			kvm_activate_memslot(kvm, invalid_slot, old);
+			kfree(invalid_slot);
+		} else {
+			mutex_unlock(&kvm->slots_arch_lock);
+		}
+		return r;
+	}
+
+	/*
+	 * For DELETE and MOVE, the working slot is now active as the INVALID
+	 * version of the old slot.  MOVE is particularly special as it reuses
+	 * the old slot and returns a copy of the old slot (in working_slot).
+	 * For CREATE, there is no old slot.  For DELETE and FLAGS_ONLY, the
+	 * old slot is detached but otherwise preserved.
+	 */
+	if (change == KVM_MR_CREATE)
+		kvm_create_memslot(kvm, new);
+	else if (change == KVM_MR_DELETE)
+		kvm_delete_memslot(kvm, old, invalid_slot);
+	else if (change == KVM_MR_MOVE)
+		kvm_move_memslot(kvm, old, new, invalid_slot);
+	else if (change == KVM_MR_FLAGS_ONLY)
+		kvm_update_flags_memslot(kvm, old, new);
+	else
+		BUG();
+
+	/* Free the temporary INVALID slot used for DELETE and MOVE. */
+	if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
+		kfree(invalid_slot);
+
+	/*
+	 * No need to refresh new->arch, changes after dropping slots_arch_lock
+	 * will directly hit the final, active memslot.  Architectures are
+	 * responsible for knowing that new->arch may be stale.
+	 */
+	kvm_commit_memory_region(kvm, old, new, change);
+
+	return 0;
+}
+
+static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
+				      gfn_t start, gfn_t end)
+{
+	struct kvm_memslot_iter iter;
+
+	kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
+		if (iter.slot->id != id)
+			return true;
+	}
+
+	return false;
+}
+
+static int kvm_set_memory_region(struct kvm *kvm,
+				 const struct kvm_userspace_memory_region2 *mem)
+{
+	struct kvm_memory_slot *old, *new;
+	struct kvm_memslots *slots;
+	enum kvm_mr_change change;
 	unsigned long npages;
-	struct kvm_memory_slot *slot;
-	struct kvm_memory_slot old, new;
-	struct kvm_memslots *slots = NULL, *old_memslots;
+	gfn_t base_gfn;
 	int as_id, id;
-	enum kvm_mr_change change;
+	int r;
+
+	lockdep_assert_held(&kvm->slots_lock);
 
-	r = check_memory_region_flags(mem);
+	r = check_memory_region_flags(kvm, mem);
 	if (r)
-		goto out;
+		return r;
 
-	r = -EINVAL;
 	as_id = mem->slot >> 16;
 	id = (u16)mem->slot;
 
 	/* General sanity checks */
-	if (mem->memory_size & (PAGE_SIZE - 1))
-		goto out;
+	if ((mem->memory_size & (PAGE_SIZE - 1)) ||
+	    (mem->memory_size != (unsigned long)mem->memory_size))
+		return -EINVAL;
 	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
-		goto out;
+		return -EINVAL;
 	/* We can read the guest memory with __xxx_user() later on. */
-	if ((id < KVM_USER_MEM_SLOTS) &&
-	    ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
+	if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
+	    (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
 	     !access_ok((void __user *)(unsigned long)mem->userspace_addr,
-			mem->memory_size)))
-		goto out;
-	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
-		goto out;
+			mem->memory_size))
+		return -EINVAL;
+	if (mem->flags & KVM_MEM_GUEST_MEMFD &&
+	    (mem->guest_memfd_offset & (PAGE_SIZE - 1) ||
+	     mem->guest_memfd_offset + mem->memory_size < mem->guest_memfd_offset))
+		return -EINVAL;
+	if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_MEM_SLOTS_NUM)
+		return -EINVAL;
 	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
-		goto out;
-
-	slot = id_to_memslot(__kvm_memslots(kvm, as_id), id);
-	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
-	npages = mem->memory_size >> PAGE_SHIFT;
-
-	if (npages > KVM_MEM_MAX_NR_PAGES)
-		goto out;
+		return -EINVAL;
 
-	new = old = *slot;
+	/*
+	 * The size of userspace-defined memory regions is restricted in order
+	 * to play nice with dirty bitmap operations, which are indexed with an
+	 * "unsigned int".  KVM's internal memory regions don't support dirty
+	 * logging, and so are exempt.
+	 */
+	if (id < KVM_USER_MEM_SLOTS &&
+	    (mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES)
+		return -EINVAL;
 
-	new.id = id;
-	new.base_gfn = base_gfn;
-	new.npages = npages;
-	new.flags = mem->flags;
+	slots = __kvm_memslots(kvm, as_id);
 
-	if (npages) {
-		if (!old.npages)
-			change = KVM_MR_CREATE;
-		else { /* Modify an existing slot. */
-			if ((mem->userspace_addr != old.userspace_addr) ||
-			    (npages != old.npages) ||
-			    ((new.flags ^ old.flags) & KVM_MEM_READONLY))
-				goto out;
+	/*
+	 * Note, the old memslot (and the pointer itself!) may be invalidated
+	 * and/or destroyed by kvm_set_memslot().
+	 */
+	old = id_to_memslot(slots, id);
 
-			if (base_gfn != old.base_gfn)
-				change = KVM_MR_MOVE;
-			else if (new.flags != old.flags)
-				change = KVM_MR_FLAGS_ONLY;
-			else { /* Nothing to change. */
-				r = 0;
-				goto out;
-			}
-		}
-	} else {
-		if (!old.npages)
-			goto out;
+	if (!mem->memory_size) {
+		if (!old || !old->npages)
+			return -EINVAL;
 
-		change = KVM_MR_DELETE;
-		new.base_gfn = 0;
-		new.flags = 0;
-	}
+		if (WARN_ON_ONCE(kvm->nr_memslot_pages < old->npages))
+			return -EIO;
 
-	if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
-		/* Check for overlaps */
-		r = -EEXIST;
-		kvm_for_each_memslot(slot, __kvm_memslots(kvm, as_id)) {
-			if (slot->id == id)
-				continue;
-			if (!((base_gfn + npages <= slot->base_gfn) ||
-			      (base_gfn >= slot->base_gfn + slot->npages)))
-				goto out;
-		}
+		return kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE);
 	}
 
-	/* Free page dirty bitmap if unneeded */
-	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
-		new.dirty_bitmap = NULL;
+	base_gfn = (mem->guest_phys_addr >> PAGE_SHIFT);
+	npages = (mem->memory_size >> PAGE_SHIFT);
 
-	r = -ENOMEM;
-	if (change == KVM_MR_CREATE) {
-		new.userspace_addr = mem->userspace_addr;
+	if (!old || !old->npages) {
+		change = KVM_MR_CREATE;
 
-		if (kvm_arch_create_memslot(kvm, &new, npages))
-			goto out_free;
-	}
+		/*
+		 * To simplify KVM internals, the total number of pages across
+		 * all memslots must fit in an unsigned long.
+		 */
+		if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
+			return -EINVAL;
+	} else { /* Modify an existing slot. */
+		/* Private memslots are immutable, they can only be deleted. */
+		if (mem->flags & KVM_MEM_GUEST_MEMFD)
+			return -EINVAL;
+		if ((mem->userspace_addr != old->userspace_addr) ||
+		    (npages != old->npages) ||
+		    ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
+			return -EINVAL;
 
-	/* Allocate page dirty bitmap if needed */
-	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
-		if (kvm_create_dirty_bitmap(&new) < 0)
-			goto out_free;
+		if (base_gfn != old->base_gfn)
+			change = KVM_MR_MOVE;
+		else if (mem->flags != old->flags)
+			change = KVM_MR_FLAGS_ONLY;
+		else /* Nothing to change. */
+			return 0;
 	}
 
-	slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
-	if (!slots)
-		goto out_free;
-	memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots));
-
-	if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) {
-		slot = id_to_memslot(slots, id);
-		slot->flags |= KVM_MEMSLOT_INVALID;
-
-		old_memslots = install_new_memslots(kvm, as_id, slots);
+	if ((change == KVM_MR_CREATE || change == KVM_MR_MOVE) &&
+	    kvm_check_memslot_overlap(slots, id, base_gfn, base_gfn + npages))
+		return -EEXIST;
 
-		/* From this point no new shadow pages pointing to a deleted,
-		 * or moved, memslot will be created.
-		 *
-		 * validation of sp->gfn happens in:
-		 *	- gfn_to_hva (kvm_read_guest, gfn_to_pfn)
-		 *	- kvm_is_visible_gfn (mmu_check_roots)
-		 */
-		kvm_arch_flush_shadow_memslot(kvm, slot);
+	/* Allocate a slot that will persist in the memslot. */
+	new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
+	if (!new)
+		return -ENOMEM;
 
-		/*
-		 * We can re-use the old_memslots from above, the only difference
-		 * from the currently installed memslots is the invalid flag.  This
-		 * will get overwritten by update_memslots anyway.
-		 */
-		slots = old_memslots;
+	new->as_id = as_id;
+	new->id = id;
+	new->base_gfn = base_gfn;
+	new->npages = npages;
+	new->flags = mem->flags;
+	new->userspace_addr = mem->userspace_addr;
+	if (mem->flags & KVM_MEM_GUEST_MEMFD) {
+		r = kvm_gmem_bind(kvm, new, mem->guest_memfd, mem->guest_memfd_offset);
+		if (r)
+			goto out;
 	}
 
-	r = kvm_arch_prepare_memory_region(kvm, &new, mem, change);
+	r = kvm_set_memslot(kvm, old, new, change);
 	if (r)
-		goto out_slots;
-
-	/* actual memory is freed via old in kvm_free_memslot below */
-	if (change == KVM_MR_DELETE) {
-		new.dirty_bitmap = NULL;
-		memset(&new.arch, 0, sizeof(new.arch));
-	}
-
-	update_memslots(slots, &new, change);
-	old_memslots = install_new_memslots(kvm, as_id, slots);
+		goto out_unbind;
 
-	kvm_arch_commit_memory_region(kvm, mem, &old, &new, change);
-
-	kvm_free_memslot(kvm, &old, &new);
-	kvfree(old_memslots);
 	return 0;
 
-out_slots:
-	kvfree(slots);
-out_free:
-	kvm_free_memslot(kvm, &new, &old);
+out_unbind:
+	if (mem->flags & KVM_MEM_GUEST_MEMFD)
+		kvm_gmem_unbind(new);
 out:
+	kfree(new);
 	return r;
 }
-EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
 
-int kvm_set_memory_region(struct kvm *kvm,
-			  const struct kvm_userspace_memory_region *mem)
+int kvm_set_internal_memslot(struct kvm *kvm,
+			     const struct kvm_userspace_memory_region2 *mem)
 {
-	int r;
+	if (WARN_ON_ONCE(mem->slot < KVM_USER_MEM_SLOTS))
+		return -EINVAL;
 
-	mutex_lock(&kvm->slots_lock);
-	r = __kvm_set_memory_region(kvm, mem);
-	mutex_unlock(&kvm->slots_lock);
-	return r;
+	if (WARN_ON_ONCE(mem->flags))
+		return -EINVAL;
+
+	return kvm_set_memory_region(kvm, mem);
 }
-EXPORT_SYMBOL_GPL(kvm_set_memory_region);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_internal_memslot);
 
 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
-					  struct kvm_userspace_memory_region *mem)
+					  struct kvm_userspace_memory_region2 *mem)
 {
 	if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
 		return -EINVAL;
 
+	guard(mutex)(&kvm->slots_lock);
 	return kvm_set_memory_region(kvm, mem);
 }
 
-int kvm_get_dirty_log(struct kvm *kvm,
-			struct kvm_dirty_log *log, int *is_dirty)
+#ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+/**
+ * kvm_get_dirty_log - get a snapshot of dirty pages
+ * @kvm:	pointer to kvm instance
+ * @log:	slot id and address to which we copy the log
+ * @is_dirty:	set to '1' if any dirty pages were found
+ * @memslot:	set to the associated memslot, always valid on success
+ */
+int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
+		      int *is_dirty, struct kvm_memory_slot **memslot)
 {
 	struct kvm_memslots *slots;
-	struct kvm_memory_slot *memslot;
 	int i, as_id, id;
 	unsigned long n;
 	unsigned long any = 0;
 
+	/* Dirty ring tracking may be exclusive to dirty log tracking */
+	if (!kvm_use_dirty_bitmap(kvm))
+		return -ENXIO;
+
+	*memslot = NULL;
+	*is_dirty = 0;
+
 	as_id = log->slot >> 16;
 	id = (u16)log->slot;
-	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
+	if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
 		return -EINVAL;
 
 	slots = __kvm_memslots(kvm, as_id);
-	memslot = id_to_memslot(slots, id);
-	if (!memslot->dirty_bitmap)
+	*memslot = id_to_memslot(slots, id);
+	if (!(*memslot) || !(*memslot)->dirty_bitmap)
 		return -ENOENT;
 
-	n = kvm_dirty_bitmap_bytes(memslot);
+	kvm_arch_sync_dirty_log(kvm, *memslot);
+
+	n = kvm_dirty_bitmap_bytes(*memslot);
 
 	for (i = 0; !any && i < n/sizeof(long); ++i)
-		any = memslot->dirty_bitmap[i];
+		any = (*memslot)->dirty_bitmap[i];
 
-	if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
+	if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
 		return -EFAULT;
 
 	if (any)
 		*is_dirty = 1;
 	return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_dirty_log);
 
-#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+#else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
 /**
- * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages
+ * kvm_get_dirty_log_protect - get a snapshot of dirty pages
  *	and reenable dirty page tracking for the corresponding pages.
  * @kvm:	pointer to kvm instance
  * @log:	slot id and address to which we copy the log
- * @is_dirty:	flag set if any page is dirty
  *
  * We need to keep it in mind that VCPU threads can write to the bitmap
  * concurrently. So, to avoid losing track of dirty pages we keep the
@@ -1151,8 +2226,7 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
  * exiting to userspace will be logged for the next call.
  *
  */
-int kvm_get_dirty_log_protect(struct kvm *kvm,
-			struct kvm_dirty_log *log, bool *flush)
+static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
 {
 	struct kvm_memslots *slots;
 	struct kvm_memory_slot *memslot;
@@ -1160,21 +2234,28 @@ int kvm_get_dirty_log_protect(struct kvm *kvm,
 	unsigned long n;
 	unsigned long *dirty_bitmap;
 	unsigned long *dirty_bitmap_buffer;
+	bool flush;
+
+	/* Dirty ring tracking may be exclusive to dirty log tracking */
+	if (!kvm_use_dirty_bitmap(kvm))
+		return -ENXIO;
 
 	as_id = log->slot >> 16;
 	id = (u16)log->slot;
-	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
+	if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
 		return -EINVAL;
 
 	slots = __kvm_memslots(kvm, as_id);
 	memslot = id_to_memslot(slots, id);
+	if (!memslot || !memslot->dirty_bitmap)
+		return -ENOENT;
 
 	dirty_bitmap = memslot->dirty_bitmap;
-	if (!dirty_bitmap)
-		return -ENOENT;
+
+	kvm_arch_sync_dirty_log(kvm, memslot);
 
 	n = kvm_dirty_bitmap_bytes(memslot);
-	*flush = false;
+	flush = false;
 	if (kvm->manual_dirty_log_protect) {
 		/*
 		 * Unlike kvm_get_dirty_log, we always return false in *flush,
@@ -1189,7 +2270,7 @@ int kvm_get_dirty_log_protect(struct kvm *kvm,
 		dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
 		memset(dirty_bitmap_buffer, 0, n);
 
-		spin_lock(&kvm->mmu_lock);
+		KVM_MMU_LOCK(kvm);
 		for (i = 0; i < n / sizeof(long); i++) {
 			unsigned long mask;
 			gfn_t offset;
@@ -1197,24 +2278,57 @@ int kvm_get_dirty_log_protect(struct kvm *kvm,
 			if (!dirty_bitmap[i])
 				continue;
 
-			*flush = true;
+			flush = true;
 			mask = xchg(&dirty_bitmap[i], 0);
 			dirty_bitmap_buffer[i] = mask;
 
-			if (mask) {
-				offset = i * BITS_PER_LONG;
-				kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
-									offset, mask);
-			}
+			offset = i * BITS_PER_LONG;
+			kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
+								offset, mask);
 		}
-		spin_unlock(&kvm->mmu_lock);
+		KVM_MMU_UNLOCK(kvm);
 	}
 
+	if (flush)
+		kvm_flush_remote_tlbs_memslot(kvm, memslot);
+
 	if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
 		return -EFAULT;
 	return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
+
+
+/**
+ * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
+ * @kvm: kvm instance
+ * @log: slot id and address to which we copy the log
+ *
+ * Steps 1-4 below provide general overview of dirty page logging. See
+ * kvm_get_dirty_log_protect() function description for additional details.
+ *
+ * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
+ * always flush the TLB (step 4) even if previous step failed  and the dirty
+ * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
+ * does not preclude user space subsequent dirty log read. Flushing TLB ensures
+ * writes will be marked dirty for next log read.
+ *
+ *   1. Take a snapshot of the bit and clear it if needed.
+ *   2. Write protect the corresponding page.
+ *   3. Copy the snapshot to the userspace.
+ *   4. Flush TLB's if needed.
+ */
+static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
+				      struct kvm_dirty_log *log)
+{
+	int r;
+
+	mutex_lock(&kvm->slots_lock);
+
+	r = kvm_get_dirty_log_protect(kvm, log);
+
+	mutex_unlock(&kvm->slots_lock);
+	return r;
+}
 
 /**
  * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
@@ -1222,8 +2336,8 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
  * @kvm:	pointer to kvm instance
  * @log:	slot id and address from which to fetch the bitmap of dirty pages
  */
-int kvm_clear_dirty_log_protect(struct kvm *kvm,
-				struct kvm_clear_dirty_log *log, bool *flush)
+static int kvm_clear_dirty_log_protect(struct kvm *kvm,
+				       struct kvm_clear_dirty_log *log)
 {
 	struct kvm_memslots *slots;
 	struct kvm_memory_slot *memslot;
@@ -1232,36 +2346,44 @@ int kvm_clear_dirty_log_protect(struct kvm *kvm,
 	unsigned long i, n;
 	unsigned long *dirty_bitmap;
 	unsigned long *dirty_bitmap_buffer;
+	bool flush;
+
+	/* Dirty ring tracking may be exclusive to dirty log tracking */
+	if (!kvm_use_dirty_bitmap(kvm))
+		return -ENXIO;
 
 	as_id = log->slot >> 16;
 	id = (u16)log->slot;
-	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
+	if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
 		return -EINVAL;
 
-	if ((log->first_page & 63) || (log->num_pages & 63))
+	if (log->first_page & 63)
 		return -EINVAL;
 
 	slots = __kvm_memslots(kvm, as_id);
 	memslot = id_to_memslot(slots, id);
+	if (!memslot || !memslot->dirty_bitmap)
+		return -ENOENT;
 
 	dirty_bitmap = memslot->dirty_bitmap;
-	if (!dirty_bitmap)
-		return -ENOENT;
 
-	n = kvm_dirty_bitmap_bytes(memslot);
+	n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;
 
 	if (log->first_page > memslot->npages ||
-	    log->num_pages > memslot->npages - log->first_page)
-			return -EINVAL;
+	    log->num_pages > memslot->npages - log->first_page ||
+	    (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
+	    return -EINVAL;
 
-	*flush = false;
+	kvm_arch_sync_dirty_log(kvm, memslot);
+
+	flush = false;
 	dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
 	if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
 		return -EFAULT;
 
-	spin_lock(&kvm->mmu_lock);
-	for (offset = log->first_page,
-	     i = offset / BITS_PER_LONG, n = log->num_pages / BITS_PER_LONG; n--;
+	KVM_MMU_LOCK(kvm);
+	for (offset = log->first_page, i = offset / BITS_PER_LONG,
+		 n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
 	     i++, offset += BITS_PER_LONG) {
 		unsigned long mask = *dirty_bitmap_buffer++;
 		atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
@@ -1277,64 +2399,308 @@ int kvm_clear_dirty_log_protect(struct kvm *kvm,
 		 * a problem if userspace sets them in log->dirty_bitmap.
 		*/
 		if (mask) {
-			*flush = true;
+			flush = true;
 			kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
 								offset, mask);
 		}
 	}
-	spin_unlock(&kvm->mmu_lock);
+	KVM_MMU_UNLOCK(kvm);
+
+	if (flush)
+		kvm_flush_remote_tlbs_memslot(kvm, memslot);
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_clear_dirty_log_protect);
-#endif
 
-bool kvm_largepages_enabled(void)
+static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
+					struct kvm_clear_dirty_log *log)
 {
-	return largepages_enabled;
+	int r;
+
+	mutex_lock(&kvm->slots_lock);
+
+	r = kvm_clear_dirty_log_protect(kvm, log);
+
+	mutex_unlock(&kvm->slots_lock);
+	return r;
 }
+#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
 
-void kvm_disable_largepages(void)
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+static u64 kvm_supported_mem_attributes(struct kvm *kvm)
 {
-	largepages_enabled = false;
+	if (!kvm || kvm_arch_has_private_mem(kvm))
+		return KVM_MEMORY_ATTRIBUTE_PRIVATE;
+
+	return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_disable_largepages);
+
+/*
+ * Returns true if _all_ gfns in the range [@start, @end) have attributes
+ * such that the bits in @mask match @attrs.
+ */
+bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
+				     unsigned long mask, unsigned long attrs)
+{
+	XA_STATE(xas, &kvm->mem_attr_array, start);
+	unsigned long index;
+	void *entry;
+
+	mask &= kvm_supported_mem_attributes(kvm);
+	if (attrs & ~mask)
+		return false;
+
+	if (end == start + 1)
+		return (kvm_get_memory_attributes(kvm, start) & mask) == attrs;
+
+	guard(rcu)();
+	if (!attrs)
+		return !xas_find(&xas, end - 1);
+
+	for (index = start; index < end; index++) {
+		do {
+			entry = xas_next(&xas);
+		} while (xas_retry(&xas, entry));
+
+		if (xas.xa_index != index ||
+		    (xa_to_value(entry) & mask) != attrs)
+			return false;
+	}
+
+	return true;
+}
+
+static __always_inline void kvm_handle_gfn_range(struct kvm *kvm,
+						 struct kvm_mmu_notifier_range *range)
+{
+	struct kvm_gfn_range gfn_range;
+	struct kvm_memory_slot *slot;
+	struct kvm_memslots *slots;
+	struct kvm_memslot_iter iter;
+	bool found_memslot = false;
+	bool ret = false;
+	int i;
+
+	gfn_range.arg = range->arg;
+	gfn_range.may_block = range->may_block;
+
+	/*
+	 * If/when KVM supports more attributes beyond private .vs shared, this
+	 * _could_ set KVM_FILTER_{SHARED,PRIVATE} appropriately if the entire target
+	 * range already has the desired private vs. shared state (it's unclear
+	 * if that is a net win).  For now, KVM reaches this point if and only
+	 * if the private flag is being toggled, i.e. all mappings are in play.
+	 */
+
+	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
+		slots = __kvm_memslots(kvm, i);
+
+		kvm_for_each_memslot_in_gfn_range(&iter, slots, range->start, range->end) {
+			slot = iter.slot;
+			gfn_range.slot = slot;
+
+			gfn_range.start = max(range->start, slot->base_gfn);
+			gfn_range.end = min(range->end, slot->base_gfn + slot->npages);
+			if (gfn_range.start >= gfn_range.end)
+				continue;
+
+			if (!found_memslot) {
+				found_memslot = true;
+				KVM_MMU_LOCK(kvm);
+				if (!IS_KVM_NULL_FN(range->on_lock))
+					range->on_lock(kvm);
+			}
+
+			ret |= range->handler(kvm, &gfn_range);
+		}
+	}
+
+	if (range->flush_on_ret && ret)
+		kvm_flush_remote_tlbs(kvm);
+
+	if (found_memslot)
+		KVM_MMU_UNLOCK(kvm);
+}
+
+static bool kvm_pre_set_memory_attributes(struct kvm *kvm,
+					  struct kvm_gfn_range *range)
+{
+	/*
+	 * Unconditionally add the range to the invalidation set, regardless of
+	 * whether or not the arch callback actually needs to zap SPTEs.  E.g.
+	 * if KVM supports RWX attributes in the future and the attributes are
+	 * going from R=>RW, zapping isn't strictly necessary.  Unconditionally
+	 * adding the range allows KVM to require that MMU invalidations add at
+	 * least one range between begin() and end(), e.g. allows KVM to detect
+	 * bugs where the add() is missed.  Relaxing the rule *might* be safe,
+	 * but it's not obvious that allowing new mappings while the attributes
+	 * are in flux is desirable or worth the complexity.
+	 */
+	kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
+
+	return kvm_arch_pre_set_memory_attributes(kvm, range);
+}
+
+/* Set @attributes for the gfn range [@start, @end). */
+static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
+				     unsigned long attributes)
+{
+	struct kvm_mmu_notifier_range pre_set_range = {
+		.start = start,
+		.end = end,
+		.arg.attributes = attributes,
+		.handler = kvm_pre_set_memory_attributes,
+		.on_lock = kvm_mmu_invalidate_begin,
+		.flush_on_ret = true,
+		.may_block = true,
+	};
+	struct kvm_mmu_notifier_range post_set_range = {
+		.start = start,
+		.end = end,
+		.arg.attributes = attributes,
+		.handler = kvm_arch_post_set_memory_attributes,
+		.on_lock = kvm_mmu_invalidate_end,
+		.may_block = true,
+	};
+	unsigned long i;
+	void *entry;
+	int r = 0;
+
+	entry = attributes ? xa_mk_value(attributes) : NULL;
+
+	trace_kvm_vm_set_mem_attributes(start, end, attributes);
+
+	mutex_lock(&kvm->slots_lock);
+
+	/* Nothing to do if the entire range has the desired attributes. */
+	if (kvm_range_has_memory_attributes(kvm, start, end, ~0, attributes))
+		goto out_unlock;
+
+	/*
+	 * Reserve memory ahead of time to avoid having to deal with failures
+	 * partway through setting the new attributes.
+	 */
+	for (i = start; i < end; i++) {
+		r = xa_reserve(&kvm->mem_attr_array, i, GFP_KERNEL_ACCOUNT);
+		if (r)
+			goto out_unlock;
+
+		cond_resched();
+	}
+
+	kvm_handle_gfn_range(kvm, &pre_set_range);
+
+	for (i = start; i < end; i++) {
+		r = xa_err(xa_store(&kvm->mem_attr_array, i, entry,
+				    GFP_KERNEL_ACCOUNT));
+		KVM_BUG_ON(r, kvm);
+		cond_resched();
+	}
+
+	kvm_handle_gfn_range(kvm, &post_set_range);
+
+out_unlock:
+	mutex_unlock(&kvm->slots_lock);
+
+	return r;
+}
+static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
+					   struct kvm_memory_attributes *attrs)
+{
+	gfn_t start, end;
+
+	/* flags is currently not used. */
+	if (attrs->flags)
+		return -EINVAL;
+	if (attrs->attributes & ~kvm_supported_mem_attributes(kvm))
+		return -EINVAL;
+	if (attrs->size == 0 || attrs->address + attrs->size < attrs->address)
+		return -EINVAL;
+	if (!PAGE_ALIGNED(attrs->address) || !PAGE_ALIGNED(attrs->size))
+		return -EINVAL;
+
+	start = attrs->address >> PAGE_SHIFT;
+	end = (attrs->address + attrs->size) >> PAGE_SHIFT;
+
+	/*
+	 * xarray tracks data using "unsigned long", and as a result so does
+	 * KVM.  For simplicity, supports generic attributes only on 64-bit
+	 * architectures.
+	 */
+	BUILD_BUG_ON(sizeof(attrs->attributes) != sizeof(unsigned long));
+
+	return kvm_vm_set_mem_attributes(kvm, start, end, attrs->attributes);
+}
+#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
 
 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
 {
 	return __gfn_to_memslot(kvm_memslots(kvm), gfn);
 }
-EXPORT_SYMBOL_GPL(gfn_to_memslot);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(gfn_to_memslot);
 
 struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
-	return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn);
+	struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
+	u64 gen = slots->generation;
+	struct kvm_memory_slot *slot;
+
+	/*
+	 * This also protects against using a memslot from a different address space,
+	 * since different address spaces have different generation numbers.
+	 */
+	if (unlikely(gen != vcpu->last_used_slot_gen)) {
+		vcpu->last_used_slot = NULL;
+		vcpu->last_used_slot_gen = gen;
+	}
+
+	slot = try_get_memslot(vcpu->last_used_slot, gfn);
+	if (slot)
+		return slot;
+
+	/*
+	 * Fall back to searching all memslots. We purposely use
+	 * search_memslots() instead of __gfn_to_memslot() to avoid
+	 * thrashing the VM-wide last_used_slot in kvm_memslots.
+	 */
+	slot = search_memslots(slots, gfn, false);
+	if (slot) {
+		vcpu->last_used_slot = slot;
+		return slot;
+	}
+
+	return NULL;
 }
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_gfn_to_memslot);
 
 bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
 {
 	struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
 
-	if (!memslot || memslot->id >= KVM_USER_MEM_SLOTS ||
-	      memslot->flags & KVM_MEMSLOT_INVALID)
-		return false;
+	return kvm_is_visible_memslot(memslot);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_is_visible_gfn);
 
-	return true;
+bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+
+	return kvm_is_visible_memslot(memslot);
 }
-EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_is_visible_gfn);
 
-unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn)
+unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
 	struct vm_area_struct *vma;
 	unsigned long addr, size;
 
 	size = PAGE_SIZE;
 
-	addr = gfn_to_hva(kvm, gfn);
+	addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
 	if (kvm_is_error_hva(addr))
 		return PAGE_SIZE;
 
-	down_read(&current->mm->mmap_sem);
+	mmap_read_lock(current->mm);
 	vma = find_vma(current->mm, addr);
 	if (!vma)
 		goto out;
@@ -1342,17 +2708,17 @@ unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn)
 	size = vma_kernel_pagesize(vma);
 
 out:
-	up_read(&current->mm->mmap_sem);
+	mmap_read_unlock(current->mm);
 
 	return size;
 }
 
-static bool memslot_is_readonly(struct kvm_memory_slot *slot)
+static bool memslot_is_readonly(const struct kvm_memory_slot *slot)
 {
 	return slot->flags & KVM_MEM_READONLY;
 }
 
-static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
+static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn,
 				       gfn_t *nr_pages, bool write)
 {
 	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
@@ -1378,19 +2744,19 @@ unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
 {
 	return gfn_to_hva_many(slot, gfn, NULL);
 }
-EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(gfn_to_hva_memslot);
 
 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 {
 	return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
 }
-EXPORT_SYMBOL_GPL(gfn_to_hva);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(gfn_to_hva);
 
 unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
 	return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
 }
-EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_gfn_to_hva);
 
 /*
  * Return the hva of a @gfn and the R/W attribute if possible.
@@ -1425,39 +2791,93 @@ unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *w
 	return gfn_to_hva_memslot_prot(slot, gfn, writable);
 }
 
-static inline int check_user_page_hwpoison(unsigned long addr)
+static bool kvm_is_ad_tracked_page(struct page *page)
+{
+	/*
+	 * Per page-flags.h, pages tagged PG_reserved "should in general not be
+	 * touched (e.g. set dirty) except by its owner".
+	 */
+	return !PageReserved(page);
+}
+
+static void kvm_set_page_dirty(struct page *page)
+{
+	if (kvm_is_ad_tracked_page(page))
+		SetPageDirty(page);
+}
+
+static void kvm_set_page_accessed(struct page *page)
+{
+	if (kvm_is_ad_tracked_page(page))
+		mark_page_accessed(page);
+}
+
+void kvm_release_page_clean(struct page *page)
+{
+	if (!page)
+		return;
+
+	kvm_set_page_accessed(page);
+	put_page(page);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_release_page_clean);
+
+void kvm_release_page_dirty(struct page *page)
+{
+	if (!page)
+		return;
+
+	kvm_set_page_dirty(page);
+	kvm_release_page_clean(page);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_release_page_dirty);
+
+static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page,
+				 struct follow_pfnmap_args *map, bool writable)
 {
-	int rc, flags = FOLL_HWPOISON | FOLL_WRITE;
+	kvm_pfn_t pfn;
+
+	WARN_ON_ONCE(!!page == !!map);
 
-	rc = get_user_pages(addr, 1, flags, NULL, NULL);
-	return rc == -EHWPOISON;
+	if (kfp->map_writable)
+		*kfp->map_writable = writable;
+
+	if (map)
+		pfn = map->pfn;
+	else
+		pfn = page_to_pfn(page);
+
+	*kfp->refcounted_page = page;
+
+	return pfn;
 }
 
 /*
  * The fast path to get the writable pfn which will be stored in @pfn,
- * true indicates success, otherwise false is returned.  It's also the
- * only part that runs if we can are in atomic context.
+ * true indicates success, otherwise false is returned.
  */
-static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
-			    bool *writable, kvm_pfn_t *pfn)
+static bool hva_to_pfn_fast(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn)
 {
-	struct page *page[1];
-	int npages;
+	struct page *page;
+	bool r;
 
 	/*
-	 * Fast pin a writable pfn only if it is a write fault request
-	 * or the caller allows to map a writable pfn for a read fault
-	 * request.
+	 * Try the fast-only path when the caller wants to pin/get the page for
+	 * writing.  If the caller only wants to read the page, KVM must go
+	 * down the full, slow path in order to avoid racing an operation that
+	 * breaks Copy-on-Write (CoW), e.g. so that KVM doesn't end up pointing
+	 * at the old, read-only page while mm/ points at a new, writable page.
 	 */
-	if (!(write_fault || writable))
+	if (!((kfp->flags & FOLL_WRITE) || kfp->map_writable))
 		return false;
 
-	npages = __get_user_pages_fast(addr, 1, 1, page);
-	if (npages == 1) {
-		*pfn = page_to_pfn(page[0]);
+	if (kfp->pin)
+		r = pin_user_pages_fast(kfp->hva, 1, FOLL_WRITE, &page) == 1;
+	else
+		r = get_user_page_fast_only(kfp->hva, FOLL_WRITE, &page);
 
-		if (writable)
-			*writable = true;
+	if (r) {
+		*pfn = kvm_resolve_pfn(kfp, page, NULL, true);
 		return true;
 	}
 
@@ -1468,38 +2888,48 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
  * The slow path to get the pfn of the specified host virtual address,
  * 1 indicates success, -errno is returned if error is detected.
  */
-static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
-			   bool *writable, kvm_pfn_t *pfn)
+static int hva_to_pfn_slow(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn)
 {
-	unsigned int flags = FOLL_HWPOISON;
-	struct page *page;
-	int npages = 0;
-
-	might_sleep();
-
-	if (writable)
-		*writable = write_fault;
-
-	if (write_fault)
-		flags |= FOLL_WRITE;
-	if (async)
-		flags |= FOLL_NOWAIT;
+	/*
+	 * When a VCPU accesses a page that is not mapped into the secondary
+	 * MMU, we lookup the page using GUP to map it, so the guest VCPU can
+	 * make progress. We always want to honor NUMA hinting faults in that
+	 * case, because GUP usage corresponds to memory accesses from the VCPU.
+	 * Otherwise, we'd not trigger NUMA hinting faults once a page is
+	 * mapped into the secondary MMU and gets accessed by a VCPU.
+	 *
+	 * Note that get_user_page_fast_only() and FOLL_WRITE for now
+	 * implicitly honor NUMA hinting faults and don't need this flag.
+	 */
+	unsigned int flags = FOLL_HWPOISON | FOLL_HONOR_NUMA_FAULT | kfp->flags;
+	struct page *page, *wpage;
+	int npages;
 
-	npages = get_user_pages_unlocked(addr, 1, &page, flags);
+	if (kfp->pin)
+		npages = pin_user_pages_unlocked(kfp->hva, 1, &page, flags);
+	else
+		npages = get_user_pages_unlocked(kfp->hva, 1, &page, flags);
 	if (npages != 1)
 		return npages;
 
-	/* map read fault as writable if possible */
-	if (unlikely(!write_fault) && writable) {
-		struct page *wpage;
+	/*
+	 * Pinning is mutually exclusive with opportunistically mapping a read
+	 * fault as writable, as KVM should never pin pages when mapping memory
+	 * into the guest (pinning is only for direct accesses from KVM).
+	 */
+	if (WARN_ON_ONCE(kfp->map_writable && kfp->pin))
+		goto out;
 
-		if (__get_user_pages_fast(addr, 1, 1, &wpage) == 1) {
-			*writable = true;
-			put_page(page);
-			page = wpage;
-		}
+	/* map read fault as writable if possible */
+	if (!(flags & FOLL_WRITE) && kfp->map_writable &&
+	    get_user_page_fast_only(kfp->hva, FOLL_WRITE, &wpage)) {
+		put_page(page);
+		page = wpage;
+		flags |= FOLL_WRITE;
 	}
-	*pfn = page_to_pfn(page);
+
+out:
+	*pfn = kvm_resolve_pfn(kfp, page, NULL, flags & FOLL_WRITE);
 	return npages;
 }
 
@@ -1515,21 +2945,28 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
 }
 
 static int hva_to_pfn_remapped(struct vm_area_struct *vma,
-			       unsigned long addr, bool *async,
-			       bool write_fault, bool *writable,
-			       kvm_pfn_t *p_pfn)
+			       struct kvm_follow_pfn *kfp, kvm_pfn_t *p_pfn)
 {
-	unsigned long pfn;
+	struct follow_pfnmap_args args = { .vma = vma, .address = kfp->hva };
+	bool write_fault = kfp->flags & FOLL_WRITE;
 	int r;
 
-	r = follow_pfn(vma, addr, &pfn);
+	/*
+	 * Remapped memory cannot be pinned in any meaningful sense.  Bail if
+	 * the caller wants to pin the page, i.e. access the page outside of
+	 * MMU notifier protection, and unsafe umappings are disallowed.
+	 */
+	if (kfp->pin && !allow_unsafe_mappings)
+		return -EINVAL;
+
+	r = follow_pfnmap_start(&args);
 	if (r) {
 		/*
 		 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
 		 * not call the fault handler, so do it here.
 		 */
 		bool unlocked = false;
-		r = fixup_user_fault(current, current->mm, addr,
+		r = fixup_user_fault(current->mm, kfp->hva,
 				     (write_fault ? FAULT_FLAG_WRITE : 0),
 				     &unlocked);
 		if (unlocked)
@@ -1537,169 +2974,110 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
 		if (r)
 			return r;
 
-		r = follow_pfn(vma, addr, &pfn);
+		r = follow_pfnmap_start(&args);
 		if (r)
 			return r;
-
 	}
 
-	if (writable)
-		*writable = true;
-
-	/*
-	 * Get a reference here because callers of *hva_to_pfn* and
-	 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
-	 * returned pfn.  This is only needed if the VMA has VM_MIXEDMAP
-	 * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will
-	 * simply do nothing for reserved pfns.
-	 *
-	 * Whoever called remap_pfn_range is also going to call e.g.
-	 * unmap_mapping_range before the underlying pages are freed,
-	 * causing a call to our MMU notifier.
-	 */ 
-	kvm_get_pfn(pfn);
+	if (write_fault && !args.writable) {
+		*p_pfn = KVM_PFN_ERR_RO_FAULT;
+		goto out;
+	}
 
-	*p_pfn = pfn;
-	return 0;
+	*p_pfn = kvm_resolve_pfn(kfp, NULL, &args, args.writable);
+out:
+	follow_pfnmap_end(&args);
+	return r;
 }
 
-/*
- * Pin guest page in memory and return its pfn.
- * @addr: host virtual address which maps memory to the guest
- * @atomic: whether this function can sleep
- * @async: whether this function need to wait IO complete if the
- *         host page is not in the memory
- * @write_fault: whether we should get a writable host page
- * @writable: whether it allows to map a writable host page for !@write_fault
- *
- * The function will map a writable host page for these two cases:
- * 1): @write_fault = true
- * 2): @write_fault = false && @writable, @writable will tell the caller
- *     whether the mapping is writable.
- */
-static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
-			bool write_fault, bool *writable)
+kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *kfp)
 {
 	struct vm_area_struct *vma;
-	kvm_pfn_t pfn = 0;
+	kvm_pfn_t pfn;
 	int npages, r;
 
-	/* we can do it either atomically or asynchronously, not both */
-	BUG_ON(atomic && async);
-
-	if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
-		return pfn;
+	might_sleep();
 
-	if (atomic)
+	if (WARN_ON_ONCE(!kfp->refcounted_page))
 		return KVM_PFN_ERR_FAULT;
 
-	npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
-	if (npages == 1)
+	if (hva_to_pfn_fast(kfp, &pfn))
 		return pfn;
 
-	down_read(&current->mm->mmap_sem);
-	if (npages == -EHWPOISON ||
-	      (!async && check_user_page_hwpoison(addr))) {
-		pfn = KVM_PFN_ERR_HWPOISON;
-		goto exit;
-	}
+	npages = hva_to_pfn_slow(kfp, &pfn);
+	if (npages == 1)
+		return pfn;
+	if (npages == -EINTR || npages == -EAGAIN)
+		return KVM_PFN_ERR_SIGPENDING;
+	if (npages == -EHWPOISON)
+		return KVM_PFN_ERR_HWPOISON;
 
+	mmap_read_lock(current->mm);
 retry:
-	vma = find_vma_intersection(current->mm, addr, addr + 1);
+	vma = vma_lookup(current->mm, kfp->hva);
 
 	if (vma == NULL)
 		pfn = KVM_PFN_ERR_FAULT;
 	else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
-		r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn);
+		r = hva_to_pfn_remapped(vma, kfp, &pfn);
 		if (r == -EAGAIN)
 			goto retry;
 		if (r < 0)
 			pfn = KVM_PFN_ERR_FAULT;
 	} else {
-		if (async && vma_is_valid(vma, write_fault))
-			*async = true;
-		pfn = KVM_PFN_ERR_FAULT;
+		if ((kfp->flags & FOLL_NOWAIT) &&
+		    vma_is_valid(vma, kfp->flags & FOLL_WRITE))
+			pfn = KVM_PFN_ERR_NEEDS_IO;
+		else
+			pfn = KVM_PFN_ERR_FAULT;
 	}
-exit:
-	up_read(&current->mm->mmap_sem);
+	mmap_read_unlock(current->mm);
 	return pfn;
 }
 
-kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
-			       bool atomic, bool *async, bool write_fault,
-			       bool *writable)
+static kvm_pfn_t kvm_follow_pfn(struct kvm_follow_pfn *kfp)
 {
-	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
+	kfp->hva = __gfn_to_hva_many(kfp->slot, kfp->gfn, NULL,
+				     kfp->flags & FOLL_WRITE);
 
-	if (addr == KVM_HVA_ERR_RO_BAD) {
-		if (writable)
-			*writable = false;
+	if (kfp->hva == KVM_HVA_ERR_RO_BAD)
 		return KVM_PFN_ERR_RO_FAULT;
-	}
 
-	if (kvm_is_error_hva(addr)) {
-		if (writable)
-			*writable = false;
+	if (kvm_is_error_hva(kfp->hva))
 		return KVM_PFN_NOSLOT;
-	}
 
-	/* Do not map writable pfn in the readonly memslot. */
-	if (writable && memslot_is_readonly(slot)) {
-		*writable = false;
-		writable = NULL;
+	if (memslot_is_readonly(kfp->slot) && kfp->map_writable) {
+		*kfp->map_writable = false;
+		kfp->map_writable = NULL;
 	}
 
-	return hva_to_pfn(addr, atomic, async, write_fault,
-			  writable);
-}
-EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
-
-kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
-		      bool *writable)
-{
-	return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
-				    write_fault, writable);
-}
-EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
-
-kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
-{
-	return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
+	return hva_to_pfn(kfp);
 }
-EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
 
-kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
+kvm_pfn_t __kvm_faultin_pfn(const struct kvm_memory_slot *slot, gfn_t gfn,
+			    unsigned int foll, bool *writable,
+			    struct page **refcounted_page)
 {
-	return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
-}
-EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
-
-kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
-{
-	return gfn_to_pfn_memslot_atomic(gfn_to_memslot(kvm, gfn), gfn);
-}
-EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
+	struct kvm_follow_pfn kfp = {
+		.slot = slot,
+		.gfn = gfn,
+		.flags = foll,
+		.map_writable = writable,
+		.refcounted_page = refcounted_page,
+	};
 
-kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
-	return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
-}
-EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
+	if (WARN_ON_ONCE(!writable || !refcounted_page))
+		return KVM_PFN_ERR_FAULT;
 
-kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
-{
-	return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
-}
-EXPORT_SYMBOL_GPL(gfn_to_pfn);
+	*writable = false;
+	*refcounted_page = NULL;
 
-kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
-	return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
+	return kvm_follow_pfn(&kfp);
 }
-EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_faultin_pfn);
 
-int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
-			    struct page **pages, int nr_pages)
+int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn,
+		       struct page **pages, int nr_pages)
 {
 	unsigned long addr;
 	gfn_t entry = 0;
@@ -1711,97 +3089,93 @@ int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
 	if (entry < nr_pages)
 		return 0;
 
-	return __get_user_pages_fast(addr, nr_pages, 1, pages);
-}
-EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
-
-static struct page *kvm_pfn_to_page(kvm_pfn_t pfn)
-{
-	if (is_error_noslot_pfn(pfn))
-		return KVM_ERR_PTR_BAD_PAGE;
-
-	if (kvm_is_reserved_pfn(pfn)) {
-		WARN_ON(1);
-		return KVM_ERR_PTR_BAD_PAGE;
-	}
-
-	return pfn_to_page(pfn);
+	return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
 }
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_prefetch_pages);
 
-struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
-{
-	kvm_pfn_t pfn;
-
-	pfn = gfn_to_pfn(kvm, gfn);
+/*
+ * Don't use this API unless you are absolutely, positively certain that KVM
+ * needs to get a struct page, e.g. to pin the page for firmware DMA.
+ *
+ * FIXME: Users of this API likely need to FOLL_PIN the page, not just elevate
+ *	  its refcount.
+ */
+struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn, bool write)
+{
+	struct page *refcounted_page = NULL;
+	struct kvm_follow_pfn kfp = {
+		.slot = gfn_to_memslot(kvm, gfn),
+		.gfn = gfn,
+		.flags = write ? FOLL_WRITE : 0,
+		.refcounted_page = &refcounted_page,
+	};
 
-	return kvm_pfn_to_page(pfn);
+	(void)kvm_follow_pfn(&kfp);
+	return refcounted_page;
 }
-EXPORT_SYMBOL_GPL(gfn_to_page);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(__gfn_to_page);
 
-struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
+int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map,
+		   bool writable)
 {
-	kvm_pfn_t pfn;
-
-	pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn);
+	struct kvm_follow_pfn kfp = {
+		.slot = gfn_to_memslot(vcpu->kvm, gfn),
+		.gfn = gfn,
+		.flags = writable ? FOLL_WRITE : 0,
+		.refcounted_page = &map->pinned_page,
+		.pin = true,
+	};
 
-	return kvm_pfn_to_page(pfn);
-}
-EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page);
+	map->pinned_page = NULL;
+	map->page = NULL;
+	map->hva = NULL;
+	map->gfn = gfn;
+	map->writable = writable;
 
-void kvm_release_page_clean(struct page *page)
-{
-	WARN_ON(is_error_page(page));
+	map->pfn = kvm_follow_pfn(&kfp);
+	if (is_error_noslot_pfn(map->pfn))
+		return -EINVAL;
 
-	kvm_release_pfn_clean(page_to_pfn(page));
-}
-EXPORT_SYMBOL_GPL(kvm_release_page_clean);
+	if (pfn_valid(map->pfn)) {
+		map->page = pfn_to_page(map->pfn);
+		map->hva = kmap(map->page);
+#ifdef CONFIG_HAS_IOMEM
+	} else {
+		map->hva = memremap(pfn_to_hpa(map->pfn), PAGE_SIZE, MEMREMAP_WB);
+#endif
+	}
 
-void kvm_release_pfn_clean(kvm_pfn_t pfn)
-{
-	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
-		put_page(pfn_to_page(pfn));
+	return map->hva ? 0 : -EFAULT;
 }
-EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_vcpu_map);
 
-void kvm_release_page_dirty(struct page *page)
+void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map)
 {
-	WARN_ON(is_error_page(page));
-
-	kvm_release_pfn_dirty(page_to_pfn(page));
-}
-EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
+	if (!map->hva)
+		return;
 
-void kvm_release_pfn_dirty(kvm_pfn_t pfn)
-{
-	kvm_set_pfn_dirty(pfn);
-	kvm_release_pfn_clean(pfn);
-}
-EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
+	if (map->page)
+		kunmap(map->page);
+#ifdef CONFIG_HAS_IOMEM
+	else
+		memunmap(map->hva);
+#endif
 
-void kvm_set_pfn_dirty(kvm_pfn_t pfn)
-{
-	if (!kvm_is_reserved_pfn(pfn)) {
-		struct page *page = pfn_to_page(pfn);
+	if (map->writable)
+		kvm_vcpu_mark_page_dirty(vcpu, map->gfn);
 
-		if (!PageReserved(page))
-			SetPageDirty(page);
+	if (map->pinned_page) {
+		if (map->writable)
+			kvm_set_page_dirty(map->pinned_page);
+		kvm_set_page_accessed(map->pinned_page);
+		unpin_user_page(map->pinned_page);
 	}
-}
-EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
 
-void kvm_set_pfn_accessed(kvm_pfn_t pfn)
-{
-	if (!kvm_is_reserved_pfn(pfn))
-		mark_page_accessed(pfn_to_page(pfn));
+	map->hva = NULL;
+	map->page = NULL;
+	map->pinned_page = NULL;
 }
-EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
-
-void kvm_get_pfn(kvm_pfn_t pfn)
-{
-	if (!kvm_is_reserved_pfn(pfn))
-		get_page(pfn_to_page(pfn));
-}
-EXPORT_SYMBOL_GPL(kvm_get_pfn);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_unmap);
 
 static int next_segment(unsigned long len, int offset)
 {
@@ -1811,12 +3185,16 @@ static int next_segment(unsigned long len, int offset)
 		return len;
 }
 
+/* Copy @len bytes from guest memory at '(@gfn * PAGE_SIZE) + @offset' to @data */
 static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
 				 void *data, int offset, int len)
 {
 	int r;
 	unsigned long addr;
 
+	if (WARN_ON_ONCE(offset + len > PAGE_SIZE))
+		return -EFAULT;
+
 	addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
 	if (kvm_is_error_hva(addr))
 		return -EFAULT;
@@ -1833,7 +3211,7 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
 
 	return __kvm_read_guest_page(slot, gfn, data, offset, len);
 }
-EXPORT_SYMBOL_GPL(kvm_read_guest_page);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_guest_page);
 
 int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
 			     int offset, int len)
@@ -1842,7 +3220,7 @@ int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
 
 	return __kvm_read_guest_page(slot, gfn, data, offset, len);
 }
-EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_read_guest_page);
 
 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
 {
@@ -1862,7 +3240,7 @@ int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
 	}
 	return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_read_guest);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_guest);
 
 int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
 {
@@ -1882,7 +3260,7 @@ int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned l
 	}
 	return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_read_guest);
 
 static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
 			           void *data, int offset, unsigned long len)
@@ -1890,6 +3268,9 @@ static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
 	int r;
 	unsigned long addr;
 
+	if (WARN_ON_ONCE(offset + len > PAGE_SIZE))
+		return -EFAULT;
+
 	addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
 	if (kvm_is_error_hva(addr))
 		return -EFAULT;
@@ -1901,17 +3282,6 @@ static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
 	return 0;
 }
 
-int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
-			  unsigned long len)
-{
-	gfn_t gfn = gpa >> PAGE_SHIFT;
-	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
-	int offset = offset_in_page(gpa);
-
-	return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
-}
-EXPORT_SYMBOL_GPL(kvm_read_guest_atomic);
-
 int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
 			       void *data, unsigned long len)
 {
@@ -1921,21 +3291,26 @@ int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
 
 	return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
 }
-EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_read_guest_atomic);
 
-static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn,
+/* Copy @len bytes from @data into guest memory at '(@gfn * PAGE_SIZE) + @offset' */
+static int __kvm_write_guest_page(struct kvm *kvm,
+				  struct kvm_memory_slot *memslot, gfn_t gfn,
 			          const void *data, int offset, int len)
 {
 	int r;
 	unsigned long addr;
 
+	if (WARN_ON_ONCE(offset + len > PAGE_SIZE))
+		return -EFAULT;
+
 	addr = gfn_to_hva_memslot(memslot, gfn);
 	if (kvm_is_error_hva(addr))
 		return -EFAULT;
 	r = __copy_to_user((void __user *)addr + offset, data, len);
 	if (r)
 		return -EFAULT;
-	mark_page_dirty_in_slot(memslot, gfn);
+	mark_page_dirty_in_slot(kvm, memslot, gfn);
 	return 0;
 }
 
@@ -1944,18 +3319,18 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
 {
 	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
 
-	return __kvm_write_guest_page(slot, gfn, data, offset, len);
+	return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len);
 }
-EXPORT_SYMBOL_GPL(kvm_write_guest_page);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_write_guest_page);
 
 int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
 			      const void *data, int offset, int len)
 {
 	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
 
-	return __kvm_write_guest_page(slot, gfn, data, offset, len);
+	return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len);
 }
-EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_write_guest_page);
 
 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
 		    unsigned long len)
@@ -1976,7 +3351,7 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
 	}
 	return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_write_guest);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_write_guest);
 
 int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
 		         unsigned long len)
@@ -1997,7 +3372,7 @@ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
 	}
 	return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_write_guest);
 
 static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
 				       struct gfn_to_hva_cache *ghc,
@@ -2008,33 +3383,36 @@ static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
 	gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
 	gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
 	gfn_t nr_pages_avail;
-	int r = start_gfn <= end_gfn ? 0 : -EINVAL;
 
-	ghc->gpa = gpa;
+	/* Update ghc->generation before performing any error checks. */
 	ghc->generation = slots->generation;
-	ghc->len = len;
-	ghc->hva = KVM_HVA_ERR_BAD;
+
+	if (start_gfn > end_gfn) {
+		ghc->hva = KVM_HVA_ERR_BAD;
+		return -EINVAL;
+	}
 
 	/*
 	 * If the requested region crosses two memslots, we still
 	 * verify that the entire region is valid here.
 	 */
-	while (!r && start_gfn <= end_gfn) {
+	for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
 		ghc->memslot = __gfn_to_memslot(slots, start_gfn);
 		ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
 					   &nr_pages_avail);
 		if (kvm_is_error_hva(ghc->hva))
-			r = -EFAULT;
-		start_gfn += nr_pages_avail;
+			return -EFAULT;
 	}
 
 	/* Use the slow path for cross page reads and writes. */
-	if (!r && nr_pages_needed == 1)
+	if (nr_pages_needed == 1)
 		ghc->hva += offset;
 	else
 		ghc->memslot = NULL;
 
-	return r;
+	ghc->gpa = gpa;
+	ghc->len = len;
+	return 0;
 }
 
 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
@@ -2043,7 +3421,7 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 	struct kvm_memslots *slots = kvm_memslots(kvm);
 	return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
 }
-EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gfn_to_hva_cache_init);
 
 int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 				  void *data, unsigned int offset,
@@ -2053,75 +3431,83 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 	int r;
 	gpa_t gpa = ghc->gpa + offset;
 
-	BUG_ON(len + offset > ghc->len);
-
-	if (slots->generation != ghc->generation)
-		__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
+	if (WARN_ON_ONCE(len + offset > ghc->len))
+		return -EINVAL;
 
-	if (unlikely(!ghc->memslot))
-		return kvm_write_guest(kvm, gpa, data, len);
+	if (slots->generation != ghc->generation) {
+		if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
+			return -EFAULT;
+	}
 
 	if (kvm_is_error_hva(ghc->hva))
 		return -EFAULT;
 
+	if (unlikely(!ghc->memslot))
+		return kvm_write_guest(kvm, gpa, data, len);
+
 	r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
 	if (r)
 		return -EFAULT;
-	mark_page_dirty_in_slot(ghc->memslot, gpa >> PAGE_SHIFT);
+	mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT);
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_write_guest_offset_cached);
 
 int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 			   void *data, unsigned long len)
 {
 	return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
 }
-EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_write_guest_cached);
 
-int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
-			   void *data, unsigned long len)
+int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+				 void *data, unsigned int offset,
+				 unsigned long len)
 {
 	struct kvm_memslots *slots = kvm_memslots(kvm);
 	int r;
+	gpa_t gpa = ghc->gpa + offset;
 
-	BUG_ON(len > ghc->len);
-
-	if (slots->generation != ghc->generation)
-		__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
+	if (WARN_ON_ONCE(len + offset > ghc->len))
+		return -EINVAL;
 
-	if (unlikely(!ghc->memslot))
-		return kvm_read_guest(kvm, ghc->gpa, data, len);
+	if (slots->generation != ghc->generation) {
+		if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
+			return -EFAULT;
+	}
 
 	if (kvm_is_error_hva(ghc->hva))
 		return -EFAULT;
 
-	r = __copy_from_user(data, (void __user *)ghc->hva, len);
+	if (unlikely(!ghc->memslot))
+		return kvm_read_guest(kvm, gpa, data, len);
+
+	r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
 	if (r)
 		return -EFAULT;
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_guest_offset_cached);
 
-int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
+int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
+			  void *data, unsigned long len)
 {
-	const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
-
-	return kvm_write_guest_page(kvm, gfn, zero_page, offset, len);
+	return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len);
 }
-EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_guest_cached);
 
 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
 {
+	const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
 	gfn_t gfn = gpa >> PAGE_SHIFT;
 	int seg;
 	int offset = offset_in_page(gpa);
 	int ret;
 
 	while ((seg = next_segment(len, offset)) != 0) {
-		ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
+		ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, seg);
 		if (ret < 0)
 			return ret;
 		offset = 0;
@@ -2130,35 +3516,50 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
 	}
 	return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_clear_guest);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_clear_guest);
 
-static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot,
-				    gfn_t gfn)
+void mark_page_dirty_in_slot(struct kvm *kvm,
+			     const struct kvm_memory_slot *memslot,
+		 	     gfn_t gfn)
 {
-	if (memslot && memslot->dirty_bitmap) {
+	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
+
+#ifdef CONFIG_HAVE_KVM_DIRTY_RING
+	if (WARN_ON_ONCE(vcpu && vcpu->kvm != kvm))
+		return;
+
+	WARN_ON_ONCE(!vcpu && !kvm_arch_allow_write_without_running_vcpu(kvm));
+#endif
+
+	if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
 		unsigned long rel_gfn = gfn - memslot->base_gfn;
+		u32 slot = (memslot->as_id << 16) | memslot->id;
 
-		set_bit_le(rel_gfn, memslot->dirty_bitmap);
+		if (kvm->dirty_ring_size && vcpu)
+			kvm_dirty_ring_push(vcpu, slot, rel_gfn);
+		else if (memslot->dirty_bitmap)
+			set_bit_le(rel_gfn, memslot->dirty_bitmap);
 	}
 }
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(mark_page_dirty_in_slot);
 
 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
 {
 	struct kvm_memory_slot *memslot;
 
 	memslot = gfn_to_memslot(kvm, gfn);
-	mark_page_dirty_in_slot(memslot, gfn);
+	mark_page_dirty_in_slot(kvm, memslot, gfn);
 }
-EXPORT_SYMBOL_GPL(mark_page_dirty);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(mark_page_dirty);
 
 void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
 	struct kvm_memory_slot *memslot;
 
 	memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
-	mark_page_dirty_in_slot(memslot, gfn);
+	mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn);
 }
-EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_mark_page_dirty);
 
 void kvm_sigset_activate(struct kvm_vcpu *vcpu)
 {
@@ -2185,34 +3586,38 @@ void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
 
 static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
 {
-	unsigned int old, val, grow;
+	unsigned int old, val, grow, grow_start;
 
 	old = val = vcpu->halt_poll_ns;
+	grow_start = READ_ONCE(halt_poll_ns_grow_start);
 	grow = READ_ONCE(halt_poll_ns_grow);
-	/* 10us base */
-	if (val == 0 && grow)
-		val = 10000;
-	else
-		val *= grow;
+	if (!grow)
+		goto out;
 
-	if (val > halt_poll_ns)
-		val = halt_poll_ns;
+	val *= grow;
+	if (val < grow_start)
+		val = grow_start;
 
 	vcpu->halt_poll_ns = val;
+out:
 	trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
 }
 
 static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
 {
-	unsigned int old, val, shrink;
+	unsigned int old, val, shrink, grow_start;
 
 	old = val = vcpu->halt_poll_ns;
 	shrink = READ_ONCE(halt_poll_ns_shrink);
+	grow_start = READ_ONCE(halt_poll_ns_grow_start);
 	if (shrink == 0)
 		val = 0;
 	else
 		val /= shrink;
 
+	if (val < grow_start)
+		val = 0;
+
 	vcpu->halt_poll_ns = val;
 	trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
 }
@@ -2222,14 +3627,14 @@ static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
 	int ret = -EINTR;
 	int idx = srcu_read_lock(&vcpu->kvm->srcu);
 
-	if (kvm_arch_vcpu_runnable(vcpu)) {
-		kvm_make_request(KVM_REQ_UNHALT, vcpu);
+	if (kvm_arch_vcpu_runnable(vcpu))
 		goto out;
-	}
 	if (kvm_cpu_has_pending_timer(vcpu))
 		goto out;
 	if (signal_pending(current))
 		goto out;
+	if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu))
+		goto out;
 
 	ret = 0;
 out:
@@ -2238,39 +3643,24 @@ out:
 }
 
 /*
- * The vCPU has executed a HLT instruction with in-kernel mode enabled.
+ * Block the vCPU until the vCPU is runnable, an event arrives, or a signal is
+ * pending.  This is mostly used when halting a vCPU, but may also be used
+ * directly for other vCPU non-runnable states, e.g. x86's Wait-For-SIPI.
  */
-void kvm_vcpu_block(struct kvm_vcpu *vcpu)
+bool kvm_vcpu_block(struct kvm_vcpu *vcpu)
 {
-	ktime_t start, cur;
-	DECLARE_SWAITQUEUE(wait);
+	struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
 	bool waited = false;
-	u64 block_ns;
 
-	start = cur = ktime_get();
-	if (vcpu->halt_poll_ns) {
-		ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
-
-		++vcpu->stat.halt_attempted_poll;
-		do {
-			/*
-			 * This sets KVM_REQ_UNHALT if an interrupt
-			 * arrives.
-			 */
-			if (kvm_vcpu_check_block(vcpu) < 0) {
-				++vcpu->stat.halt_successful_poll;
-				if (!vcpu_valid_wakeup(vcpu))
-					++vcpu->stat.halt_poll_invalid;
-				goto out;
-			}
-			cur = ktime_get();
-		} while (single_task_running() && ktime_before(cur, stop));
-	}
+	vcpu->stat.generic.blocking = 1;
 
+	preempt_disable();
 	kvm_arch_vcpu_blocking(vcpu);
+	prepare_to_rcuwait(wait);
+	preempt_enable();
 
 	for (;;) {
-		prepare_to_swait_exclusive(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
+		set_current_state(TASK_INTERRUPTIBLE);
 
 		if (kvm_vcpu_check_block(vcpu) < 0)
 			break;
@@ -2279,88 +3669,221 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 		schedule();
 	}
 
-	finish_swait(&vcpu->wq, &wait);
-	cur = ktime_get();
-
+	preempt_disable();
+	finish_rcuwait(wait);
 	kvm_arch_vcpu_unblocking(vcpu);
+	preempt_enable();
+
+	vcpu->stat.generic.blocking = 0;
+
+	return waited;
+}
+
+static inline void update_halt_poll_stats(struct kvm_vcpu *vcpu, ktime_t start,
+					  ktime_t end, bool success)
+{
+	struct kvm_vcpu_stat_generic *stats = &vcpu->stat.generic;
+	u64 poll_ns = ktime_to_ns(ktime_sub(end, start));
+
+	++vcpu->stat.generic.halt_attempted_poll;
+
+	if (success) {
+		++vcpu->stat.generic.halt_successful_poll;
+
+		if (!vcpu_valid_wakeup(vcpu))
+			++vcpu->stat.generic.halt_poll_invalid;
+
+		stats->halt_poll_success_ns += poll_ns;
+		KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_success_hist, poll_ns);
+	} else {
+		stats->halt_poll_fail_ns += poll_ns;
+		KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_fail_hist, poll_ns);
+	}
+}
+
+static unsigned int kvm_vcpu_max_halt_poll_ns(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+
+	if (kvm->override_halt_poll_ns) {
+		/*
+		 * Ensure kvm->max_halt_poll_ns is not read before
+		 * kvm->override_halt_poll_ns.
+		 *
+		 * Pairs with the smp_wmb() when enabling KVM_CAP_HALT_POLL.
+		 */
+		smp_rmb();
+		return READ_ONCE(kvm->max_halt_poll_ns);
+	}
+
+	return READ_ONCE(halt_poll_ns);
+}
+
+/*
+ * Emulate a vCPU halt condition, e.g. HLT on x86, WFI on arm, etc...  If halt
+ * polling is enabled, busy wait for a short time before blocking to avoid the
+ * expensive block+unblock sequence if a wake event arrives soon after the vCPU
+ * is halted.
+ */
+void kvm_vcpu_halt(struct kvm_vcpu *vcpu)
+{
+	unsigned int max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
+	bool halt_poll_allowed = !kvm_arch_no_poll(vcpu);
+	ktime_t start, cur, poll_end;
+	bool waited = false;
+	bool do_halt_poll;
+	u64 halt_ns;
+
+	if (vcpu->halt_poll_ns > max_halt_poll_ns)
+		vcpu->halt_poll_ns = max_halt_poll_ns;
+
+	do_halt_poll = halt_poll_allowed && vcpu->halt_poll_ns;
+
+	start = cur = poll_end = ktime_get();
+	if (do_halt_poll) {
+		ktime_t stop = ktime_add_ns(start, vcpu->halt_poll_ns);
+
+		do {
+			if (kvm_vcpu_check_block(vcpu) < 0)
+				goto out;
+			cpu_relax();
+			poll_end = cur = ktime_get();
+		} while (kvm_vcpu_can_poll(cur, stop));
+	}
+
+	waited = kvm_vcpu_block(vcpu);
+
+	cur = ktime_get();
+	if (waited) {
+		vcpu->stat.generic.halt_wait_ns +=
+			ktime_to_ns(cur) - ktime_to_ns(poll_end);
+		KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist,
+				ktime_to_ns(cur) - ktime_to_ns(poll_end));
+	}
 out:
-	block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
-
-	if (!vcpu_valid_wakeup(vcpu))
-		shrink_halt_poll_ns(vcpu);
-	else if (halt_poll_ns) {
-		if (block_ns <= vcpu->halt_poll_ns)
-			;
-		/* we had a long block, shrink polling */
-		else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
+	/* The total time the vCPU was "halted", including polling time. */
+	halt_ns = ktime_to_ns(cur) - ktime_to_ns(start);
+
+	/*
+	 * Note, halt-polling is considered successful so long as the vCPU was
+	 * never actually scheduled out, i.e. even if the wake event arrived
+	 * after of the halt-polling loop itself, but before the full wait.
+	 */
+	if (do_halt_poll)
+		update_halt_poll_stats(vcpu, start, poll_end, !waited);
+
+	if (halt_poll_allowed) {
+		/* Recompute the max halt poll time in case it changed. */
+		max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
+
+		if (!vcpu_valid_wakeup(vcpu)) {
 			shrink_halt_poll_ns(vcpu);
-		/* we had a short halt and our poll time is too small */
-		else if (vcpu->halt_poll_ns < halt_poll_ns &&
-			block_ns < halt_poll_ns)
-			grow_halt_poll_ns(vcpu);
-	} else
-		vcpu->halt_poll_ns = 0;
+		} else if (max_halt_poll_ns) {
+			if (halt_ns <= vcpu->halt_poll_ns)
+				;
+			/* we had a long block, shrink polling */
+			else if (vcpu->halt_poll_ns &&
+				 halt_ns > max_halt_poll_ns)
+				shrink_halt_poll_ns(vcpu);
+			/* we had a short halt and our poll time is too small */
+			else if (vcpu->halt_poll_ns < max_halt_poll_ns &&
+				 halt_ns < max_halt_poll_ns)
+				grow_halt_poll_ns(vcpu);
+		} else {
+			vcpu->halt_poll_ns = 0;
+		}
+	}
 
-	trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu));
-	kvm_arch_vcpu_block_finish(vcpu);
+	trace_kvm_vcpu_wakeup(halt_ns, waited, vcpu_valid_wakeup(vcpu));
 }
-EXPORT_SYMBOL_GPL(kvm_vcpu_block);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_halt);
 
 bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
 {
-	struct swait_queue_head *wqp;
-
-	wqp = kvm_arch_vcpu_wq(vcpu);
-	if (swq_has_sleeper(wqp)) {
-		swake_up_one(wqp);
-		++vcpu->stat.halt_wakeup;
+	if (__kvm_vcpu_wake_up(vcpu)) {
+		WRITE_ONCE(vcpu->ready, true);
+		++vcpu->stat.generic.halt_wakeup;
 		return true;
 	}
 
 	return false;
 }
-EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_wake_up);
 
 #ifndef CONFIG_S390
 /*
  * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
  */
-void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
+void __kvm_vcpu_kick(struct kvm_vcpu *vcpu, bool wait)
 {
-	int me;
-	int cpu = vcpu->cpu;
+	int me, cpu;
 
 	if (kvm_vcpu_wake_up(vcpu))
 		return;
 
 	me = get_cpu();
-	if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
-		if (kvm_arch_vcpu_should_kick(vcpu))
-			smp_send_reschedule(cpu);
+	/*
+	 * The only state change done outside the vcpu mutex is IN_GUEST_MODE
+	 * to EXITING_GUEST_MODE.  Therefore the moderately expensive "should
+	 * kick" check does not need atomic operations if kvm_vcpu_kick is used
+	 * within the vCPU thread itself.
+	 */
+	if (vcpu == __this_cpu_read(kvm_running_vcpu)) {
+		if (vcpu->mode == IN_GUEST_MODE)
+			WRITE_ONCE(vcpu->mode, EXITING_GUEST_MODE);
+		goto out;
+	}
+
+	/*
+	 * Note, the vCPU could get migrated to a different pCPU at any point
+	 * after kvm_arch_vcpu_should_kick(), which could result in sending an
+	 * IPI to the previous pCPU.  But, that's ok because the purpose of the
+	 * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the
+	 * vCPU also requires it to leave IN_GUEST_MODE.
+	 */
+	if (kvm_arch_vcpu_should_kick(vcpu)) {
+		cpu = READ_ONCE(vcpu->cpu);
+		if (cpu != me && (unsigned int)cpu < nr_cpu_ids && cpu_online(cpu)) {
+			/*
+			 * Use a reschedule IPI to kick the vCPU if the caller
+			 * doesn't need to wait for a response, as KVM allows
+			 * kicking vCPUs while IRQs are disabled, but using the
+			 * SMP function call framework with IRQs disabled can
+			 * deadlock due to taking cross-CPU locks.
+			 */
+			if (wait)
+				smp_call_function_single(cpu, ack_kick, NULL, wait);
+			else
+				smp_send_reschedule(cpu);
+		}
+	}
+out:
 	put_cpu();
 }
-EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_vcpu_kick);
 #endif /* !CONFIG_S390 */
 
 int kvm_vcpu_yield_to(struct kvm_vcpu *target)
 {
-	struct pid *pid;
 	struct task_struct *task = NULL;
-	int ret = 0;
+	int ret;
+
+	if (!read_trylock(&target->pid_lock))
+		return 0;
+
+	if (target->pid)
+		task = get_pid_task(target->pid, PIDTYPE_PID);
+
+	read_unlock(&target->pid_lock);
 
-	rcu_read_lock();
-	pid = rcu_dereference(target->pid);
-	if (pid)
-		task = get_pid_task(pid, PIDTYPE_PID);
-	rcu_read_unlock();
 	if (!task)
-		return ret;
+		return 0;
 	ret = yield_to(task, 1);
 	put_task_struct(task);
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_yield_to);
 
 /*
  * Helper that checks whether a VCPU is eligible for directed yield.
@@ -2368,7 +3891,7 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
  *
  *  (a) VCPU which has not done pl-exit or cpu relax intercepted recently
  *  (preempted lock holder), indicated by @in_spin_loop.
- *  Set at the beiginning and cleared at the end of interception/PLE handler.
+ *  Set at the beginning and cleared at the end of interception/PLE handler.
  *
  *  (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
  *  chance last time (mostly it has become eligible now since we have probably
@@ -2401,51 +3924,113 @@ static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
 #endif
 }
 
+/*
+ * Unlike kvm_arch_vcpu_runnable, this function is called outside
+ * a vcpu_load/vcpu_put pair.  However, for most architectures
+ * kvm_arch_vcpu_runnable does not require vcpu_load.
+ */
+bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
+{
+	return kvm_arch_vcpu_runnable(vcpu);
+}
+
+static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
+{
+	if (kvm_arch_dy_runnable(vcpu))
+		return true;
+
+#ifdef CONFIG_KVM_ASYNC_PF
+	if (!list_empty_careful(&vcpu->async_pf.done))
+		return true;
+#endif
+
+	return false;
+}
+
+/*
+ * By default, simply query the target vCPU's current mode when checking if a
+ * vCPU was preempted in kernel mode.  All architectures except x86 (or more
+ * specifical, except VMX) allow querying whether or not a vCPU is in kernel
+ * mode even if the vCPU is NOT loaded, i.e. using kvm_arch_vcpu_in_kernel()
+ * directly for cross-vCPU checks is functionally correct and accurate.
+ */
+bool __weak kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu)
+{
+	return kvm_arch_vcpu_in_kernel(vcpu);
+}
+
+bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+	return false;
+}
+
 void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
 {
+	int nr_vcpus, start, i, idx, yielded;
 	struct kvm *kvm = me->kvm;
 	struct kvm_vcpu *vcpu;
-	int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
-	int yielded = 0;
 	int try = 3;
-	int pass;
-	int i;
+
+	nr_vcpus = atomic_read(&kvm->online_vcpus);
+	if (nr_vcpus < 2)
+		return;
+
+	/* Pairs with the smp_wmb() in kvm_vm_ioctl_create_vcpu(). */
+	smp_rmb();
 
 	kvm_vcpu_set_in_spin_loop(me, true);
+
 	/*
-	 * We boost the priority of a VCPU that is runnable but not
-	 * currently running, because it got preempted by something
-	 * else and called schedule in __vcpu_run.  Hopefully that
-	 * VCPU is holding the lock that we need and will release it.
-	 * We approximate round-robin by starting at the last boosted VCPU.
+	 * The current vCPU ("me") is spinning in kernel mode, i.e. is likely
+	 * waiting for a resource to become available.  Attempt to yield to a
+	 * vCPU that is runnable, but not currently running, e.g. because the
+	 * vCPU was preempted by a higher priority task.  With luck, the vCPU
+	 * that was preempted is holding a lock or some other resource that the
+	 * current vCPU is waiting to acquire, and yielding to the other vCPU
+	 * will allow it to make forward progress and release the lock (or kick
+	 * the spinning vCPU, etc).
+	 *
+	 * Since KVM has no insight into what exactly the guest is doing,
+	 * approximate a round-robin selection by iterating over all vCPUs,
+	 * starting at the last boosted vCPU.  I.e. if N=kvm->last_boosted_vcpu,
+	 * iterate over vCPU[N+1]..vCPU[N-1], wrapping as needed.
+	 *
+	 * Note, this is inherently racy, e.g. if multiple vCPUs are spinning,
+	 * they may all try to yield to the same vCPU(s).  But as above, this
+	 * is all best effort due to KVM's lack of visibility into the guest.
 	 */
-	for (pass = 0; pass < 2 && !yielded && try; pass++) {
-		kvm_for_each_vcpu(i, vcpu, kvm) {
-			if (!pass && i <= last_boosted_vcpu) {
-				i = last_boosted_vcpu;
-				continue;
-			} else if (pass && i > last_boosted_vcpu)
-				break;
-			if (!READ_ONCE(vcpu->preempted))
-				continue;
-			if (vcpu == me)
-				continue;
-			if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu))
-				continue;
-			if (yield_to_kernel_mode && !kvm_arch_vcpu_in_kernel(vcpu))
-				continue;
-			if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
-				continue;
+	start = READ_ONCE(kvm->last_boosted_vcpu) + 1;
+	for (i = 0; i < nr_vcpus; i++) {
+		idx = (start + i) % nr_vcpus;
+		if (idx == me->vcpu_idx)
+			continue;
 
-			yielded = kvm_vcpu_yield_to(vcpu);
-			if (yielded > 0) {
-				kvm->last_boosted_vcpu = i;
-				break;
-			} else if (yielded < 0) {
-				try--;
-				if (!try)
-					break;
-			}
+		vcpu = xa_load(&kvm->vcpu_array, idx);
+		if (!READ_ONCE(vcpu->ready))
+			continue;
+		if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu))
+			continue;
+
+		/*
+		 * Treat the target vCPU as being in-kernel if it has a pending
+		 * interrupt, as the vCPU trying to yield may be spinning
+		 * waiting on IPI delivery, i.e. the target vCPU is in-kernel
+		 * for the purposes of directed yield.
+		 */
+		if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
+		    !kvm_arch_dy_has_pending_interrupt(vcpu) &&
+		    !kvm_arch_vcpu_preempted_in_kernel(vcpu))
+			continue;
+
+		if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
+			continue;
+
+		yielded = kvm_vcpu_yield_to(vcpu);
+		if (yielded > 0) {
+			WRITE_ONCE(kvm->last_boosted_vcpu, idx);
+			break;
+		} else if (yielded < 0 && !--try) {
+			break;
 		}
 	}
 	kvm_vcpu_set_in_spin_loop(me, false);
@@ -2453,7 +4038,18 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
 	/* Ensure vcpu is not eligible during next spinloop */
 	kvm_vcpu_set_dy_eligible(me, false);
 }
-EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_on_spin);
+
+static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
+{
+#ifdef CONFIG_HAVE_KVM_DIRTY_RING
+	return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
+	    (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
+	     kvm->dirty_ring_size / PAGE_SIZE);
+#else
+	return false;
+#endif
+}
 
 static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
 {
@@ -2470,6 +4066,10 @@ static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
 	else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
 		page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
 #endif
+	else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff))
+		page = kvm_dirty_ring_get_page(
+		    &vcpu->dirty_ring,
+		    vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
 	else
 		return kvm_arch_vcpu_fault(vcpu, vmf);
 	get_page(page);
@@ -2483,6 +4083,14 @@ static const struct vm_operations_struct kvm_vcpu_vm_ops = {
 
 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
 {
+	struct kvm_vcpu *vcpu = file->private_data;
+	unsigned long pages = vma_pages(vma);
+
+	if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
+	     kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
+	    ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED)))
+		return -EINVAL;
+
 	vma->vm_ops = &kvm_vcpu_vm_ops;
 	return 0;
 }
@@ -2491,7 +4099,6 @@ static int kvm_vcpu_release(struct inode *inode, struct file *filp)
 {
 	struct kvm_vcpu *vcpu = filp->private_data;
 
-	debugfs_remove_recursive(vcpu->debugfs_dentry);
 	kvm_put_kvm(vcpu->kvm);
 	return 0;
 }
@@ -2515,102 +4122,153 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu)
 	return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
 }
 
-static int kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
+static int vcpu_get_pid(void *data, u64 *val)
 {
-	char dir_name[ITOA_MAX_LEN * 2];
-	int ret;
+	struct kvm_vcpu *vcpu = data;
 
-	if (!kvm_arch_has_vcpu_debugfs())
-		return 0;
+	read_lock(&vcpu->pid_lock);
+	*val = pid_nr(vcpu->pid);
+	read_unlock(&vcpu->pid_lock);
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_get_pid_fops, vcpu_get_pid, NULL, "%llu\n");
+
+static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+	struct dentry *debugfs_dentry;
+	char dir_name[ITOA_MAX_LEN * 2];
 
 	if (!debugfs_initialized())
-		return 0;
+		return;
 
 	snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
-	vcpu->debugfs_dentry = debugfs_create_dir(dir_name,
-								vcpu->kvm->debugfs_dentry);
-	if (!vcpu->debugfs_dentry)
-		return -ENOMEM;
+	debugfs_dentry = debugfs_create_dir(dir_name,
+					    vcpu->kvm->debugfs_dentry);
+	debugfs_create_file("pid", 0444, debugfs_dentry, vcpu,
+			    &vcpu_get_pid_fops);
 
-	ret = kvm_arch_create_vcpu_debugfs(vcpu);
-	if (ret < 0) {
-		debugfs_remove_recursive(vcpu->debugfs_dentry);
-		return ret;
-	}
-
-	return 0;
+	kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
 }
+#endif
 
 /*
  * Creates some virtual cpus.  Good luck creating more than one.
  */
-static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
+static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
 {
 	int r;
 	struct kvm_vcpu *vcpu;
+	struct page *page;
 
-	if (id >= KVM_MAX_VCPU_ID)
+	/*
+	 * KVM tracks vCPU IDs as 'int', be kind to userspace and reject
+	 * too-large values instead of silently truncating.
+	 *
+	 * Ensure KVM_MAX_VCPU_IDS isn't pushed above INT_MAX without first
+	 * changing the storage type (at the very least, IDs should be tracked
+	 * as unsigned ints).
+	 */
+	BUILD_BUG_ON(KVM_MAX_VCPU_IDS > INT_MAX);
+	if (id >= KVM_MAX_VCPU_IDS)
 		return -EINVAL;
 
 	mutex_lock(&kvm->lock);
-	if (kvm->created_vcpus == KVM_MAX_VCPUS) {
+	if (kvm->created_vcpus >= kvm->max_vcpus) {
 		mutex_unlock(&kvm->lock);
 		return -EINVAL;
 	}
 
+	r = kvm_arch_vcpu_precreate(kvm, id);
+	if (r) {
+		mutex_unlock(&kvm->lock);
+		return r;
+	}
+
 	kvm->created_vcpus++;
 	mutex_unlock(&kvm->lock);
 
-	vcpu = kvm_arch_vcpu_create(kvm, id);
-	if (IS_ERR(vcpu)) {
-		r = PTR_ERR(vcpu);
+	vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
+	if (!vcpu) {
+		r = -ENOMEM;
 		goto vcpu_decrement;
 	}
 
-	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
+	BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
+	page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+	if (!page) {
+		r = -ENOMEM;
+		goto vcpu_free;
+	}
+	vcpu->run = page_address(page);
 
-	r = kvm_arch_vcpu_setup(vcpu);
-	if (r)
-		goto vcpu_destroy;
+	kvm_vcpu_init(vcpu, kvm, id);
 
-	r = kvm_create_vcpu_debugfs(vcpu);
+	r = kvm_arch_vcpu_create(vcpu);
 	if (r)
-		goto vcpu_destroy;
+		goto vcpu_free_run_page;
+
+	if (kvm->dirty_ring_size) {
+		r = kvm_dirty_ring_alloc(kvm, &vcpu->dirty_ring,
+					 id, kvm->dirty_ring_size);
+		if (r)
+			goto arch_vcpu_destroy;
+	}
 
 	mutex_lock(&kvm->lock);
+
 	if (kvm_get_vcpu_by_id(kvm, id)) {
 		r = -EEXIST;
 		goto unlock_vcpu_destroy;
 	}
 
-	BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);
+	vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
+	r = xa_insert(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, GFP_KERNEL_ACCOUNT);
+	WARN_ON_ONCE(r == -EBUSY);
+	if (r)
+		goto unlock_vcpu_destroy;
 
-	/* Now it's all set up, let userspace reach it */
+	/*
+	 * Now it's all set up, let userspace reach it.  Grab the vCPU's mutex
+	 * so that userspace can't invoke vCPU ioctl()s until the vCPU is fully
+	 * visible (per online_vcpus), e.g. so that KVM doesn't get tricked
+	 * into a NULL-pointer dereference because KVM thinks the _current_
+	 * vCPU doesn't exist.  As a bonus, taking vcpu->mutex ensures lockdep
+	 * knows it's taken *inside* kvm->lock.
+	 */
+	mutex_lock(&vcpu->mutex);
 	kvm_get_kvm(kvm);
 	r = create_vcpu_fd(vcpu);
-	if (r < 0) {
-		kvm_put_kvm(kvm);
-		goto unlock_vcpu_destroy;
-	}
-
-	kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
+	if (r < 0)
+		goto kvm_put_xa_erase;
 
 	/*
-	 * Pairs with smp_rmb() in kvm_get_vcpu.  Write kvm->vcpus
-	 * before kvm->online_vcpu's incremented value.
+	 * Pairs with smp_rmb() in kvm_get_vcpu.  Store the vcpu
+	 * pointer before kvm->online_vcpu's incremented value.
 	 */
 	smp_wmb();
 	atomic_inc(&kvm->online_vcpus);
+	mutex_unlock(&vcpu->mutex);
 
 	mutex_unlock(&kvm->lock);
 	kvm_arch_vcpu_postcreate(vcpu);
+	kvm_create_vcpu_debugfs(vcpu);
 	return r;
 
+kvm_put_xa_erase:
+	mutex_unlock(&vcpu->mutex);
+	kvm_put_kvm_no_destroy(kvm);
+	xa_erase(&kvm->vcpu_array, vcpu->vcpu_idx);
 unlock_vcpu_destroy:
 	mutex_unlock(&kvm->lock);
-	debugfs_remove_recursive(vcpu->debugfs_dentry);
-vcpu_destroy:
+	kvm_dirty_ring_free(&vcpu->dirty_ring);
+arch_vcpu_destroy:
 	kvm_arch_vcpu_destroy(vcpu);
+vcpu_free_run_page:
+	free_page((unsigned long)vcpu->run);
+vcpu_free:
+	kmem_cache_free(kvm_vcpu_cache, vcpu);
 vcpu_decrement:
 	mutex_lock(&kvm->lock);
 	kvm->created_vcpus--;
@@ -2629,6 +4287,129 @@ static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
 	return 0;
 }
 
+static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer,
+			      size_t size, loff_t *offset)
+{
+	struct kvm_vcpu *vcpu = file->private_data;
+
+	return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header,
+			&kvm_vcpu_stats_desc[0], &vcpu->stat,
+			sizeof(vcpu->stat), user_buffer, size, offset);
+}
+
+static int kvm_vcpu_stats_release(struct inode *inode, struct file *file)
+{
+	struct kvm_vcpu *vcpu = file->private_data;
+
+	kvm_put_kvm(vcpu->kvm);
+	return 0;
+}
+
+static const struct file_operations kvm_vcpu_stats_fops = {
+	.owner = THIS_MODULE,
+	.read = kvm_vcpu_stats_read,
+	.release = kvm_vcpu_stats_release,
+	.llseek = noop_llseek,
+};
+
+static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
+{
+	int fd;
+	struct file *file;
+	char name[15 + ITOA_MAX_LEN + 1];
+
+	snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id);
+
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0)
+		return fd;
+
+	file = anon_inode_getfile_fmode(name, &kvm_vcpu_stats_fops, vcpu,
+					O_RDONLY, FMODE_PREAD);
+	if (IS_ERR(file)) {
+		put_unused_fd(fd);
+		return PTR_ERR(file);
+	}
+
+	kvm_get_kvm(vcpu->kvm);
+	fd_install(fd, file);
+
+	return fd;
+}
+
+#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
+static int kvm_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
+				     struct kvm_pre_fault_memory *range)
+{
+	int idx;
+	long r;
+	u64 full_size;
+
+	if (range->flags)
+		return -EINVAL;
+
+	if (!PAGE_ALIGNED(range->gpa) ||
+	    !PAGE_ALIGNED(range->size) ||
+	    range->gpa + range->size <= range->gpa)
+		return -EINVAL;
+
+	vcpu_load(vcpu);
+	idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+	full_size = range->size;
+	do {
+		if (signal_pending(current)) {
+			r = -EINTR;
+			break;
+		}
+
+		r = kvm_arch_vcpu_pre_fault_memory(vcpu, range);
+		if (WARN_ON_ONCE(r == 0 || r == -EIO))
+			break;
+
+		if (r < 0)
+			break;
+
+		range->size -= r;
+		range->gpa += r;
+		cond_resched();
+	} while (range->size);
+
+	srcu_read_unlock(&vcpu->kvm->srcu, idx);
+	vcpu_put(vcpu);
+
+	/* Return success if at least one page was mapped successfully.  */
+	return full_size == range->size ? r : 0;
+}
+#endif
+
+static int kvm_wait_for_vcpu_online(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+
+	/*
+	 * In practice, this happy path will always be taken, as a well-behaved
+	 * VMM will never invoke a vCPU ioctl() before KVM_CREATE_VCPU returns.
+	 */
+	if (likely(vcpu->vcpu_idx < atomic_read(&kvm->online_vcpus)))
+		return 0;
+
+	/*
+	 * Acquire and release the vCPU's mutex to wait for vCPU creation to
+	 * complete (kvm_vm_ioctl_create_vcpu() holds the mutex until the vCPU
+	 * is fully online).
+	 */
+	if (mutex_lock_killable(&vcpu->mutex))
+		return -EINTR;
+
+	mutex_unlock(&vcpu->mutex);
+
+	if (WARN_ON_ONCE(!kvm_get_vcpu(kvm, vcpu->vcpu_idx)))
+		return -EIO;
+
+	return 0;
+}
+
 static long kvm_vcpu_ioctl(struct file *filp,
 			   unsigned int ioctl, unsigned long arg)
 {
@@ -2638,17 +4419,26 @@ static long kvm_vcpu_ioctl(struct file *filp,
 	struct kvm_fpu *fpu = NULL;
 	struct kvm_sregs *kvm_sregs = NULL;
 
-	if (vcpu->kvm->mm != current->mm)
+	if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
 		return -EIO;
 
 	if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
 		return -EINVAL;
 
 	/*
-	 * Some architectures have vcpu ioctls that are asynchronous to vcpu
-	 * execution; mutex_lock() would break them.
+	 * Wait for the vCPU to be online before handling the ioctl(), as KVM
+	 * assumes the vCPU is reachable via vcpu_array, i.e. may dereference
+	 * a NULL pointer if userspace invokes an ioctl() before KVM is ready.
 	 */
-	r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
+	r = kvm_wait_for_vcpu_online(vcpu);
+	if (r)
+		return r;
+
+	/*
+	 * Let arch code handle select vCPU ioctls without holding vcpu->mutex,
+	 * e.g. to support ioctls that can run asynchronous to vCPU execution.
+	 */
+	r = kvm_arch_vcpu_unlocked_ioctl(filp, ioctl, arg);
 	if (r != -ENOIOCTLCMD)
 		return r;
 
@@ -2660,7 +4450,14 @@ static long kvm_vcpu_ioctl(struct file *filp,
 		r = -EINVAL;
 		if (arg)
 			goto out;
-		oldpid = rcu_access_pointer(vcpu->pid);
+
+		/*
+		 * Note, vcpu->pid is primarily protected by vcpu->mutex. The
+		 * dedicated r/w lock allows other tasks, e.g. other vCPUs, to
+		 * read vcpu->pid while this vCPU is in KVM_RUN, e.g. to yield
+		 * directly to this vCPU
+		 */
+		oldpid = vcpu->pid;
 		if (unlikely(oldpid != task_pid(current))) {
 			/* The thread running this VCPU changed. */
 			struct pid *newpid;
@@ -2670,12 +4467,22 @@ static long kvm_vcpu_ioctl(struct file *filp,
 				break;
 
 			newpid = get_task_pid(current, PIDTYPE_PID);
-			rcu_assign_pointer(vcpu->pid, newpid);
-			if (oldpid)
-				synchronize_rcu();
+			write_lock(&vcpu->pid_lock);
+			vcpu->pid = newpid;
+			write_unlock(&vcpu->pid_lock);
+
 			put_pid(oldpid);
 		}
-		r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
+		vcpu->wants_to_run = !READ_ONCE(vcpu->run->immediate_exit__unsafe);
+		r = kvm_arch_vcpu_ioctl_run(vcpu);
+		vcpu->wants_to_run = false;
+
+		/*
+		 * FIXME: Remove this hack once all KVM architectures
+		 * support the generic TIF bits, i.e. a dedicated TIF_RSEQ.
+		 */
+		rseq_virt_userspace_exit();
+
 		trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
 		break;
 	}
@@ -2700,7 +4507,6 @@ out_free1:
 	case KVM_SET_REGS: {
 		struct kvm_regs *kvm_regs;
 
-		r = -ENOMEM;
 		kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
 		if (IS_ERR(kvm_regs)) {
 			r = PTR_ERR(kvm_regs);
@@ -2826,6 +4632,24 @@ out_free1:
 		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
 		break;
 	}
+	case KVM_GET_STATS_FD: {
+		r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
+		break;
+	}
+#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
+	case KVM_PRE_FAULT_MEMORY: {
+		struct kvm_pre_fault_memory range;
+
+		r = -EFAULT;
+		if (copy_from_user(&range, argp, sizeof(range)))
+			break;
+		r = kvm_vcpu_pre_fault_memory(vcpu, &range);
+		/* Pass back leftover range. */
+		if (copy_to_user(argp, &range, sizeof(range)))
+			r = -EFAULT;
+		break;
+	}
+#endif
 	default:
 		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
 	}
@@ -2844,7 +4668,7 @@ static long kvm_vcpu_compat_ioctl(struct file *filp,
 	void __user *argp = compat_ptr(arg);
 	int r;
 
-	if (vcpu->kvm->mm != current->mm)
+	if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
 		return -EIO;
 
 	switch (ioctl) {
@@ -2862,7 +4686,8 @@ static long kvm_vcpu_compat_ioctl(struct file *filp,
 			if (kvm_sigmask.len != sizeof(compat_sigset_t))
 				goto out;
 			r = -EFAULT;
-			if (get_compat_sigset(&sigset, (void *)sigmask_arg->sigset))
+			if (get_compat_sigset(&sigset,
+					      (compat_sigset_t __user *)sigmask_arg->sigset))
 				goto out;
 			r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
 		} else
@@ -2878,6 +4703,16 @@ out:
 }
 #endif
 
+static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct kvm_device *dev = filp->private_data;
+
+	if (dev->ops->mmap)
+		return dev->ops->mmap(dev, vma);
+
+	return -ENODEV;
+}
+
 static int kvm_device_ioctl_attr(struct kvm_device *dev,
 				 int (*accessor)(struct kvm_device *dev,
 						 struct kvm_device_attr *attr),
@@ -2899,6 +4734,9 @@ static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
 {
 	struct kvm_device *dev = filp->private_data;
 
+	if (dev->kvm->mm != current->mm || dev->kvm->vm_dead)
+		return -EIO;
+
 	switch (ioctl) {
 	case KVM_SET_DEVICE_ATTR:
 		return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
@@ -2919,14 +4757,23 @@ static int kvm_device_release(struct inode *inode, struct file *filp)
 	struct kvm_device *dev = filp->private_data;
 	struct kvm *kvm = dev->kvm;
 
+	if (dev->ops->release) {
+		mutex_lock(&kvm->lock);
+		list_del_rcu(&dev->vm_node);
+		synchronize_rcu();
+		dev->ops->release(dev);
+		mutex_unlock(&kvm->lock);
+	}
+
 	kvm_put_kvm(kvm);
 	return 0;
 }
 
-static const struct file_operations kvm_device_fops = {
+static struct file_operations kvm_device_fops = {
 	.unlocked_ioctl = kvm_device_ioctl,
 	.release = kvm_device_release,
 	KVM_COMPAT(kvm_device_ioctl),
+	.mmap = kvm_device_mmap,
 };
 
 struct kvm_device *kvm_device_from_filp(struct file *filp)
@@ -2937,14 +4784,14 @@ struct kvm_device *kvm_device_from_filp(struct file *filp)
 	return filp->private_data;
 }
 
-static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
+static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
 #ifdef CONFIG_KVM_MPIC
 	[KVM_DEV_TYPE_FSL_MPIC_20]	= &kvm_mpic_ops,
 	[KVM_DEV_TYPE_FSL_MPIC_42]	= &kvm_mpic_ops,
 #endif
 };
 
-int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type)
+int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
 {
 	if (type >= ARRAY_SIZE(kvm_device_ops_table))
 		return -ENOSPC;
@@ -2965,22 +4812,24 @@ void kvm_unregister_device_ops(u32 type)
 static int kvm_ioctl_create_device(struct kvm *kvm,
 				   struct kvm_create_device *cd)
 {
-	struct kvm_device_ops *ops = NULL;
+	const struct kvm_device_ops *ops;
 	struct kvm_device *dev;
 	bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
+	int type;
 	int ret;
 
 	if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
 		return -ENODEV;
 
-	ops = kvm_device_ops_table[cd->type];
+	type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
+	ops = kvm_device_ops_table[type];
 	if (ops == NULL)
 		return -ENODEV;
 
 	if (test)
 		return 0;
 
-	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
 	if (!dev)
 		return -ENOMEM;
 
@@ -2988,52 +4837,55 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
 	dev->kvm = kvm;
 
 	mutex_lock(&kvm->lock);
-	ret = ops->create(dev, cd->type);
+	ret = ops->create(dev, type);
 	if (ret < 0) {
 		mutex_unlock(&kvm->lock);
 		kfree(dev);
 		return ret;
 	}
-	list_add(&dev->vm_node, &kvm->devices);
+	list_add_rcu(&dev->vm_node, &kvm->devices);
 	mutex_unlock(&kvm->lock);
 
 	if (ops->init)
 		ops->init(dev);
 
+	kvm_get_kvm(kvm);
 	ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
 	if (ret < 0) {
+		kvm_put_kvm_no_destroy(kvm);
 		mutex_lock(&kvm->lock);
-		list_del(&dev->vm_node);
+		list_del_rcu(&dev->vm_node);
+		synchronize_rcu();
+		if (ops->release)
+			ops->release(dev);
 		mutex_unlock(&kvm->lock);
-		ops->destroy(dev);
+		if (ops->destroy)
+			ops->destroy(dev);
 		return ret;
 	}
 
-	kvm_get_kvm(kvm);
 	cd->fd = ret;
 	return 0;
 }
 
-static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
+static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 {
 	switch (arg) {
 	case KVM_CAP_USER_MEMORY:
+	case KVM_CAP_USER_MEMORY2:
 	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
 	case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
 	case KVM_CAP_INTERNAL_ERROR_DATA:
 #ifdef CONFIG_HAVE_KVM_MSI
 	case KVM_CAP_SIGNAL_MSI:
 #endif
-#ifdef CONFIG_HAVE_KVM_IRQFD
+#ifdef CONFIG_HAVE_KVM_IRQCHIP
 	case KVM_CAP_IRQFD:
-	case KVM_CAP_IRQFD_RESAMPLE:
 #endif
 	case KVM_CAP_IOEVENTFD_ANY_LENGTH:
 	case KVM_CAP_CHECK_EXTENSION_VM:
 	case KVM_CAP_ENABLE_CAP_VM:
-#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
-	case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT:
-#endif
+	case KVM_CAP_HALT_POLL:
 		return 1;
 #ifdef CONFIG_KVM_MMIO
 	case KVM_CAP_COALESCED_MMIO:
@@ -3041,44 +4893,264 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 	case KVM_CAP_COALESCED_PIO:
 		return 1;
 #endif
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+	case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
+		return KVM_DIRTY_LOG_MANUAL_CAPS;
+#endif
 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
 	case KVM_CAP_IRQ_ROUTING:
 		return KVM_MAX_IRQ_ROUTES;
 #endif
-#if KVM_ADDRESS_SPACE_NUM > 1
+#if KVM_MAX_NR_ADDRESS_SPACES > 1
 	case KVM_CAP_MULTI_ADDRESS_SPACE:
-		return KVM_ADDRESS_SPACE_NUM;
+		if (kvm)
+			return kvm_arch_nr_memslot_as_ids(kvm);
+		return KVM_MAX_NR_ADDRESS_SPACES;
+#endif
+	case KVM_CAP_NR_MEMSLOTS:
+		return KVM_USER_MEM_SLOTS;
+	case KVM_CAP_DIRTY_LOG_RING:
+#ifdef CONFIG_HAVE_KVM_DIRTY_RING_TSO
+		return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
+#else
+		return 0;
+#endif
+	case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
+#ifdef CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL
+		return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
+#else
+		return 0;
+#endif
+#ifdef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP
+	case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP:
+#endif
+	case KVM_CAP_BINARY_STATS_FD:
+	case KVM_CAP_SYSTEM_EVENT_DATA:
+	case KVM_CAP_DEVICE_CTRL:
+		return 1;
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+	case KVM_CAP_MEMORY_ATTRIBUTES:
+		return kvm_supported_mem_attributes(kvm);
+#endif
+#ifdef CONFIG_KVM_GUEST_MEMFD
+	case KVM_CAP_GUEST_MEMFD:
+		return 1;
+	case KVM_CAP_GUEST_MEMFD_FLAGS:
+		return kvm_gmem_get_supported_flags(kvm);
 #endif
-	case KVM_CAP_MAX_VCPU_ID:
-		return KVM_MAX_VCPU_ID;
 	default:
 		break;
 	}
 	return kvm_vm_ioctl_check_extension(kvm, arg);
 }
 
+static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
+{
+	int r;
+
+	if (!KVM_DIRTY_LOG_PAGE_OFFSET)
+		return -EINVAL;
+
+	/* the size should be power of 2 */
+	if (!size || (size & (size - 1)))
+		return -EINVAL;
+
+	/* Should be bigger to keep the reserved entries, or a page */
+	if (size < kvm_dirty_ring_get_rsvd_entries(kvm) *
+	    sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE)
+		return -EINVAL;
+
+	if (size > KVM_DIRTY_RING_MAX_ENTRIES *
+	    sizeof(struct kvm_dirty_gfn))
+		return -E2BIG;
+
+	/* We only allow it to set once */
+	if (kvm->dirty_ring_size)
+		return -EINVAL;
+
+	mutex_lock(&kvm->lock);
+
+	if (kvm->created_vcpus) {
+		/* We don't allow to change this value after vcpu created */
+		r = -EINVAL;
+	} else {
+		kvm->dirty_ring_size = size;
+		r = 0;
+	}
+
+	mutex_unlock(&kvm->lock);
+	return r;
+}
+
+static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
+{
+	unsigned long i;
+	struct kvm_vcpu *vcpu;
+	int cleared = 0, r;
+
+	if (!kvm->dirty_ring_size)
+		return -EINVAL;
+
+	mutex_lock(&kvm->slots_lock);
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		r = kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring, &cleared);
+		if (r)
+			break;
+	}
+
+	mutex_unlock(&kvm->slots_lock);
+
+	if (cleared)
+		kvm_flush_remote_tlbs(kvm);
+
+	return cleared;
+}
+
 int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
 						  struct kvm_enable_cap *cap)
 {
 	return -EINVAL;
 }
 
+bool kvm_are_all_memslots_empty(struct kvm *kvm)
+{
+	int i;
+
+	lockdep_assert_held(&kvm->slots_lock);
+
+	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
+		if (!kvm_memslots_empty(__kvm_memslots(kvm, i)))
+			return false;
+	}
+
+	return true;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_are_all_memslots_empty);
+
 static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
 					   struct kvm_enable_cap *cap)
 {
 	switch (cap->cap) {
 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
-	case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT:
-		if (cap->flags || (cap->args[0] & ~1))
+	case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
+		u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;
+
+		if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
+			allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;
+
+		if (cap->flags || (cap->args[0] & ~allowed_options))
 			return -EINVAL;
 		kvm->manual_dirty_log_protect = cap->args[0];
 		return 0;
+	}
 #endif
+	case KVM_CAP_HALT_POLL: {
+		if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
+			return -EINVAL;
+
+		kvm->max_halt_poll_ns = cap->args[0];
+
+		/*
+		 * Ensure kvm->override_halt_poll_ns does not become visible
+		 * before kvm->max_halt_poll_ns.
+		 *
+		 * Pairs with the smp_rmb() in kvm_vcpu_max_halt_poll_ns().
+		 */
+		smp_wmb();
+		kvm->override_halt_poll_ns = true;
+
+		return 0;
+	}
+	case KVM_CAP_DIRTY_LOG_RING:
+	case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
+		if (!kvm_vm_ioctl_check_extension_generic(kvm, cap->cap))
+			return -EINVAL;
+
+		return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
+	case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP: {
+		int r = -EINVAL;
+
+		if (!IS_ENABLED(CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP) ||
+		    !kvm->dirty_ring_size || cap->flags)
+			return r;
+
+		mutex_lock(&kvm->slots_lock);
+
+		/*
+		 * For simplicity, allow enabling ring+bitmap if and only if
+		 * there are no memslots, e.g. to ensure all memslots allocate
+		 * a bitmap after the capability is enabled.
+		 */
+		if (kvm_are_all_memslots_empty(kvm)) {
+			kvm->dirty_ring_with_bitmap = true;
+			r = 0;
+		}
+
+		mutex_unlock(&kvm->slots_lock);
+
+		return r;
+	}
 	default:
 		return kvm_vm_ioctl_enable_cap(kvm, cap);
 	}
 }
 
+static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer,
+			      size_t size, loff_t *offset)
+{
+	struct kvm *kvm = file->private_data;
+
+	return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header,
+				&kvm_vm_stats_desc[0], &kvm->stat,
+				sizeof(kvm->stat), user_buffer, size, offset);
+}
+
+static int kvm_vm_stats_release(struct inode *inode, struct file *file)
+{
+	struct kvm *kvm = file->private_data;
+
+	kvm_put_kvm(kvm);
+	return 0;
+}
+
+static const struct file_operations kvm_vm_stats_fops = {
+	.owner = THIS_MODULE,
+	.read = kvm_vm_stats_read,
+	.release = kvm_vm_stats_release,
+	.llseek = noop_llseek,
+};
+
+static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
+{
+	int fd;
+	struct file *file;
+
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0)
+		return fd;
+
+	file = anon_inode_getfile_fmode("kvm-vm-stats",
+			&kvm_vm_stats_fops, kvm, O_RDONLY, FMODE_PREAD);
+	if (IS_ERR(file)) {
+		put_unused_fd(fd);
+		return PTR_ERR(file);
+	}
+
+	kvm_get_kvm(kvm);
+	fd_install(fd, file);
+
+	return fd;
+}
+
+#define SANITY_CHECK_MEM_REGION_FIELD(field)					\
+do {										\
+	BUILD_BUG_ON(offsetof(struct kvm_userspace_memory_region, field) !=		\
+		     offsetof(struct kvm_userspace_memory_region2, field));	\
+	BUILD_BUG_ON(sizeof_field(struct kvm_userspace_memory_region, field) !=		\
+		     sizeof_field(struct kvm_userspace_memory_region2, field));	\
+} while (0)
+
 static long kvm_vm_ioctl(struct file *filp,
 			   unsigned int ioctl, unsigned long arg)
 {
@@ -3086,7 +5158,7 @@ static long kvm_vm_ioctl(struct file *filp,
 	void __user *argp = (void __user *)arg;
 	int r;
 
-	if (kvm->mm != current->mm)
+	if (kvm->mm != current->mm || kvm->vm_dead)
 		return -EIO;
 	switch (ioctl) {
 	case KVM_CREATE_VCPU:
@@ -3101,15 +5173,39 @@ static long kvm_vm_ioctl(struct file *filp,
 		r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
 		break;
 	}
+	case KVM_SET_USER_MEMORY_REGION2:
 	case KVM_SET_USER_MEMORY_REGION: {
-		struct kvm_userspace_memory_region kvm_userspace_mem;
+		struct kvm_userspace_memory_region2 mem;
+		unsigned long size;
+
+		if (ioctl == KVM_SET_USER_MEMORY_REGION) {
+			/*
+			 * Fields beyond struct kvm_userspace_memory_region shouldn't be
+			 * accessed, but avoid leaking kernel memory in case of a bug.
+			 */
+			memset(&mem, 0, sizeof(mem));
+			size = sizeof(struct kvm_userspace_memory_region);
+		} else {
+			size = sizeof(struct kvm_userspace_memory_region2);
+		}
+
+		/* Ensure the common parts of the two structs are identical. */
+		SANITY_CHECK_MEM_REGION_FIELD(slot);
+		SANITY_CHECK_MEM_REGION_FIELD(flags);
+		SANITY_CHECK_MEM_REGION_FIELD(guest_phys_addr);
+		SANITY_CHECK_MEM_REGION_FIELD(memory_size);
+		SANITY_CHECK_MEM_REGION_FIELD(userspace_addr);
 
 		r = -EFAULT;
-		if (copy_from_user(&kvm_userspace_mem, argp,
-						sizeof(kvm_userspace_mem)))
+		if (copy_from_user(&mem, argp, size))
+			goto out;
+
+		r = -EINVAL;
+		if (ioctl == KVM_SET_USER_MEMORY_REGION &&
+		    (mem.flags & ~KVM_SET_USER_MEMORY_REGION_V1_FLAGS))
 			goto out;
 
-		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
+		r = kvm_vm_ioctl_set_memory_region(kvm, &mem);
 		break;
 	}
 	case KVM_GET_DIRTY_LOG: {
@@ -3222,24 +5318,32 @@ static long kvm_vm_ioctl(struct file *filp,
 		if (routing.flags)
 			goto out;
 		if (routing.nr) {
-			r = -ENOMEM;
-			entries = vmalloc(array_size(sizeof(*entries),
-						     routing.nr));
-			if (!entries)
-				goto out;
-			r = -EFAULT;
 			urouting = argp;
-			if (copy_from_user(entries, urouting->entries,
-					   routing.nr * sizeof(*entries)))
-				goto out_free_irq_routing;
+			entries = vmemdup_array_user(urouting->entries,
+						     routing.nr, sizeof(*entries));
+			if (IS_ERR(entries)) {
+				r = PTR_ERR(entries);
+				goto out;
+			}
 		}
 		r = kvm_set_irq_routing(kvm, entries, routing.nr,
 					routing.flags);
-out_free_irq_routing:
-		vfree(entries);
+		kvfree(entries);
 		break;
 	}
 #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+	case KVM_SET_MEMORY_ATTRIBUTES: {
+		struct kvm_memory_attributes attrs;
+
+		r = -EFAULT;
+		if (copy_from_user(&attrs, argp, sizeof(attrs)))
+			goto out;
+
+		r = kvm_vm_ioctl_set_mem_attributes(kvm, &attrs);
+		break;
+	}
+#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
 	case KVM_CREATE_DEVICE: {
 		struct kvm_create_device cd;
 
@@ -3261,6 +5365,24 @@ out_free_irq_routing:
 	case KVM_CHECK_EXTENSION:
 		r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
 		break;
+	case KVM_RESET_DIRTY_RINGS:
+		r = kvm_vm_ioctl_reset_dirty_pages(kvm);
+		break;
+	case KVM_GET_STATS_FD:
+		r = kvm_vm_ioctl_get_stats_fd(kvm);
+		break;
+#ifdef CONFIG_KVM_GUEST_MEMFD
+	case KVM_CREATE_GUEST_MEMFD: {
+		struct kvm_create_guest_memfd guest_memfd;
+
+		r = -EFAULT;
+		if (copy_from_user(&guest_memfd, argp, sizeof(guest_memfd)))
+			goto out;
+
+		r = kvm_gmem_create(kvm, &guest_memfd);
+		break;
+	}
+#endif
 	default:
 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
 	}
@@ -3278,15 +5400,54 @@ struct compat_kvm_dirty_log {
 	};
 };
 
+struct compat_kvm_clear_dirty_log {
+	__u32 slot;
+	__u32 num_pages;
+	__u64 first_page;
+	union {
+		compat_uptr_t dirty_bitmap; /* one bit per page */
+		__u64 padding2;
+	};
+};
+
+long __weak kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
+				     unsigned long arg)
+{
+	return -ENOTTY;
+}
+
 static long kvm_vm_compat_ioctl(struct file *filp,
 			   unsigned int ioctl, unsigned long arg)
 {
 	struct kvm *kvm = filp->private_data;
 	int r;
 
-	if (kvm->mm != current->mm)
+	if (kvm->mm != current->mm || kvm->vm_dead)
 		return -EIO;
+
+	r = kvm_arch_vm_compat_ioctl(filp, ioctl, arg);
+	if (r != -ENOTTY)
+		return r;
+
 	switch (ioctl) {
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+	case KVM_CLEAR_DIRTY_LOG: {
+		struct compat_kvm_clear_dirty_log compat_log;
+		struct kvm_clear_dirty_log log;
+
+		if (copy_from_user(&compat_log, (void __user *)arg,
+				   sizeof(compat_log)))
+			return -EFAULT;
+		log.slot	 = compat_log.slot;
+		log.num_pages	 = compat_log.num_pages;
+		log.first_page	 = compat_log.first_page;
+		log.padding2	 = compat_log.padding2;
+		log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
+
+		r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
+		break;
+	}
+#endif
 	case KVM_GET_DIRTY_LOG: {
 		struct compat_kvm_dirty_log compat_log;
 		struct kvm_dirty_log log;
@@ -3316,27 +5477,33 @@ static struct file_operations kvm_vm_fops = {
 	KVM_COMPAT(kvm_vm_compat_ioctl),
 };
 
+bool file_is_kvm(struct file *file)
+{
+	return file && file->f_op == &kvm_vm_fops;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(file_is_kvm);
+
 static int kvm_dev_ioctl_create_vm(unsigned long type)
 {
-	int r;
+	char fdname[ITOA_MAX_LEN + 1];
+	int r, fd;
 	struct kvm *kvm;
 	struct file *file;
 
-	kvm = kvm_create_vm(type);
-	if (IS_ERR(kvm))
-		return PTR_ERR(kvm);
-#ifdef CONFIG_KVM_MMIO
-	r = kvm_coalesced_mmio_init(kvm);
-	if (r < 0)
-		goto put_kvm;
-#endif
-	r = get_unused_fd_flags(O_CLOEXEC);
-	if (r < 0)
-		goto put_kvm;
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0)
+		return fd;
+
+	snprintf(fdname, sizeof(fdname), "%d", fd);
+
+	kvm = kvm_create_vm(type, fdname);
+	if (IS_ERR(kvm)) {
+		r = PTR_ERR(kvm);
+		goto put_fd;
+	}
 
 	file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
 	if (IS_ERR(file)) {
-		put_unused_fd(r);
 		r = PTR_ERR(file);
 		goto put_kvm;
 	}
@@ -3347,25 +5514,22 @@ static int kvm_dev_ioctl_create_vm(unsigned long type)
 	 * cases it will be called by the final fput(file) and will take
 	 * care of doing kvm_put_kvm(kvm).
 	 */
-	if (kvm_create_vm_debugfs(kvm, r) < 0) {
-		put_unused_fd(r);
-		fput(file);
-		return -ENOMEM;
-	}
 	kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
 
-	fd_install(r, file);
-	return r;
+	fd_install(fd, file);
+	return fd;
 
 put_kvm:
 	kvm_put_kvm(kvm);
+put_fd:
+	put_unused_fd(fd);
 	return r;
 }
 
 static long kvm_dev_ioctl(struct file *filp,
 			  unsigned int ioctl, unsigned long arg)
 {
-	long r = -EINVAL;
+	int r = -EINVAL;
 
 	switch (ioctl) {
 	case KVM_GET_API_VERSION:
@@ -3390,11 +5554,6 @@ static long kvm_dev_ioctl(struct file *filp,
 		r += PAGE_SIZE;    /* coalesced mmio ring page */
 #endif
 		break;
-	case KVM_TRACE_ENABLE:
-	case KVM_TRACE_PAUSE:
-	case KVM_TRACE_DISABLE:
-		r = -EOPNOTSUPP;
-		break;
 	default:
 		return kvm_arch_dev_ioctl(filp, ioctl, arg);
 	}
@@ -3414,110 +5573,212 @@ static struct miscdevice kvm_dev = {
 	&kvm_chardev_ops,
 };
 
-static void hardware_enable_nolock(void *junk)
-{
-	int cpu = raw_smp_processor_id();
-	int r;
+#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
+bool enable_virt_at_load = true;
+module_param(enable_virt_at_load, bool, 0444);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_virt_at_load);
 
-	if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
-		return;
+__visible bool kvm_rebooting;
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_rebooting);
+
+static DEFINE_PER_CPU(bool, virtualization_enabled);
+static DEFINE_MUTEX(kvm_usage_lock);
+static int kvm_usage_count;
 
-	cpumask_set_cpu(cpu, cpus_hardware_enabled);
+__weak void kvm_arch_enable_virtualization(void)
+{
 
-	r = kvm_arch_hardware_enable();
+}
+
+__weak void kvm_arch_disable_virtualization(void)
+{
 
-	if (r) {
-		cpumask_clear_cpu(cpu, cpus_hardware_enabled);
-		atomic_inc(&hardware_enable_failed);
-		pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu);
-	}
 }
 
-static int kvm_starting_cpu(unsigned int cpu)
+static int kvm_enable_virtualization_cpu(void)
 {
-	raw_spin_lock(&kvm_count_lock);
-	if (kvm_usage_count)
-		hardware_enable_nolock(NULL);
-	raw_spin_unlock(&kvm_count_lock);
+	if (__this_cpu_read(virtualization_enabled))
+		return 0;
+
+	if (kvm_arch_enable_virtualization_cpu()) {
+		pr_info("kvm: enabling virtualization on CPU%d failed\n",
+			raw_smp_processor_id());
+		return -EIO;
+	}
+
+	__this_cpu_write(virtualization_enabled, true);
 	return 0;
 }
 
-static void hardware_disable_nolock(void *junk)
+static int kvm_online_cpu(unsigned int cpu)
 {
-	int cpu = raw_smp_processor_id();
+	/*
+	 * Abort the CPU online process if hardware virtualization cannot
+	 * be enabled. Otherwise running VMs would encounter unrecoverable
+	 * errors when scheduled to this CPU.
+	 */
+	return kvm_enable_virtualization_cpu();
+}
 
-	if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
+static void kvm_disable_virtualization_cpu(void *ign)
+{
+	if (!__this_cpu_read(virtualization_enabled))
 		return;
-	cpumask_clear_cpu(cpu, cpus_hardware_enabled);
-	kvm_arch_hardware_disable();
+
+	kvm_arch_disable_virtualization_cpu();
+
+	__this_cpu_write(virtualization_enabled, false);
 }
 
-static int kvm_dying_cpu(unsigned int cpu)
+static int kvm_offline_cpu(unsigned int cpu)
 {
-	raw_spin_lock(&kvm_count_lock);
-	if (kvm_usage_count)
-		hardware_disable_nolock(NULL);
-	raw_spin_unlock(&kvm_count_lock);
+	kvm_disable_virtualization_cpu(NULL);
 	return 0;
 }
 
-static void hardware_disable_all_nolock(void)
+static void kvm_shutdown(void *data)
 {
-	BUG_ON(!kvm_usage_count);
+	/*
+	 * Disable hardware virtualization and set kvm_rebooting to indicate
+	 * that KVM has asynchronously disabled hardware virtualization, i.e.
+	 * that relevant errors and exceptions aren't entirely unexpected.
+	 * Some flavors of hardware virtualization need to be disabled before
+	 * transferring control to firmware (to perform shutdown/reboot), e.g.
+	 * on x86, virtualization can block INIT interrupts, which are used by
+	 * firmware to pull APs back under firmware control.  Note, this path
+	 * is used for both shutdown and reboot scenarios, i.e. neither name is
+	 * 100% comprehensive.
+	 */
+	pr_info("kvm: exiting hardware virtualization\n");
+	kvm_rebooting = true;
+	on_each_cpu(kvm_disable_virtualization_cpu, NULL, 1);
+}
+
+static int kvm_suspend(void *data)
+{
+	/*
+	 * Secondary CPUs and CPU hotplug are disabled across the suspend/resume
+	 * callbacks, i.e. no need to acquire kvm_usage_lock to ensure the usage
+	 * count is stable.  Assert that kvm_usage_lock is not held to ensure
+	 * the system isn't suspended while KVM is enabling hardware.  Hardware
+	 * enabling can be preempted, but the task cannot be frozen until it has
+	 * dropped all locks (userspace tasks are frozen via a fake signal).
+	 */
+	lockdep_assert_not_held(&kvm_usage_lock);
+	lockdep_assert_irqs_disabled();
 
-	kvm_usage_count--;
-	if (!kvm_usage_count)
-		on_each_cpu(hardware_disable_nolock, NULL, 1);
+	kvm_disable_virtualization_cpu(NULL);
+	return 0;
 }
 
-static void hardware_disable_all(void)
+static void kvm_resume(void *data)
 {
-	raw_spin_lock(&kvm_count_lock);
-	hardware_disable_all_nolock();
-	raw_spin_unlock(&kvm_count_lock);
+	lockdep_assert_not_held(&kvm_usage_lock);
+	lockdep_assert_irqs_disabled();
+
+	WARN_ON_ONCE(kvm_enable_virtualization_cpu());
 }
 
-static int hardware_enable_all(void)
+static const struct syscore_ops kvm_syscore_ops = {
+	.suspend = kvm_suspend,
+	.resume = kvm_resume,
+	.shutdown = kvm_shutdown,
+};
+
+static struct syscore kvm_syscore = {
+	.ops = &kvm_syscore_ops,
+};
+
+int kvm_enable_virtualization(void)
 {
-	int r = 0;
+	int r;
 
-	raw_spin_lock(&kvm_count_lock);
+	guard(mutex)(&kvm_usage_lock);
 
-	kvm_usage_count++;
-	if (kvm_usage_count == 1) {
-		atomic_set(&hardware_enable_failed, 0);
-		on_each_cpu(hardware_enable_nolock, NULL, 1);
+	if (kvm_usage_count++)
+		return 0;
 
-		if (atomic_read(&hardware_enable_failed)) {
-			hardware_disable_all_nolock();
-			r = -EBUSY;
-		}
+	kvm_arch_enable_virtualization();
+
+	r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
+			      kvm_online_cpu, kvm_offline_cpu);
+	if (r)
+		goto err_cpuhp;
+
+	register_syscore(&kvm_syscore);
+
+	/*
+	 * Undo virtualization enabling and bail if the system is going down.
+	 * If userspace initiated a forced reboot, e.g. reboot -f, then it's
+	 * possible for an in-flight operation to enable virtualization after
+	 * syscore_shutdown() is called, i.e. without kvm_shutdown() being
+	 * invoked.  Note, this relies on system_state being set _before_
+	 * kvm_shutdown(), e.g. to ensure either kvm_shutdown() is invoked
+	 * or this CPU observes the impending shutdown.  Which is why KVM uses
+	 * a syscore ops hook instead of registering a dedicated reboot
+	 * notifier (the latter runs before system_state is updated).
+	 */
+	if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
+	    system_state == SYSTEM_RESTART) {
+		r = -EBUSY;
+		goto err_rebooting;
 	}
 
-	raw_spin_unlock(&kvm_count_lock);
+	return 0;
 
+err_rebooting:
+	unregister_syscore(&kvm_syscore);
+	cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
+err_cpuhp:
+	kvm_arch_disable_virtualization();
+	--kvm_usage_count;
 	return r;
 }
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_enable_virtualization);
 
-static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
-		      void *v)
+void kvm_disable_virtualization(void)
 {
-	/*
-	 * Some (well, at least mine) BIOSes hang on reboot if
-	 * in vmx root mode.
-	 *
-	 * And Intel TXT required VMX off for all cpu when system shutdown.
-	 */
-	pr_info("kvm: exiting hardware virtualization\n");
-	kvm_rebooting = true;
-	on_each_cpu(hardware_disable_nolock, NULL, 1);
-	return NOTIFY_OK;
+	guard(mutex)(&kvm_usage_lock);
+
+	if (--kvm_usage_count)
+		return;
+
+	unregister_syscore(&kvm_syscore);
+	cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
+	kvm_arch_disable_virtualization();
 }
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_disable_virtualization);
 
-static struct notifier_block kvm_reboot_notifier = {
-	.notifier_call = kvm_reboot,
-	.priority = 0,
-};
+static int kvm_init_virtualization(void)
+{
+	if (enable_virt_at_load)
+		return kvm_enable_virtualization();
+
+	return 0;
+}
+
+static void kvm_uninit_virtualization(void)
+{
+	if (enable_virt_at_load)
+		kvm_disable_virtualization();
+}
+#else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
+static int kvm_init_virtualization(void)
+{
+	return 0;
+}
+
+static void kvm_uninit_virtualization(void)
+{
+
+}
+#endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
+
+static void kvm_iodevice_destructor(struct kvm_io_device *dev)
+{
+	if (dev->ops->destructor)
+		dev->ops->destructor(dev);
+}
 
 static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
 {
@@ -3605,7 +5866,18 @@ static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
 	return -EOPNOTSUPP;
 }
 
-/* kvm_io_bus_write - called under kvm->slots_lock */
+static struct kvm_io_bus *kvm_get_bus_srcu(struct kvm *kvm, enum kvm_bus idx)
+{
+	/*
+	 * Ensure that any updates to kvm_buses[] observed by the previous vCPU
+	 * machine instruction are also visible to the vCPU machine instruction
+	 * that triggered this call.
+	 */
+	smp_mb__after_srcu_read_lock();
+
+	return srcu_dereference(kvm->buses[idx], &kvm->srcu);
+}
+
 int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
 		     int len, const void *val)
 {
@@ -3618,14 +5890,14 @@ int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
 		.len = len,
 	};
 
-	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
+	bus = kvm_get_bus_srcu(vcpu->kvm, bus_idx);
 	if (!bus)
 		return -ENOMEM;
 	r = __kvm_io_bus_write(vcpu, bus, &range, val);
 	return r < 0 ? r : 0;
 }
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_io_bus_write);
 
-/* kvm_io_bus_write_cookie - called under kvm->slots_lock */
 int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
 			    gpa_t addr, int len, const void *val, long cookie)
 {
@@ -3637,7 +5909,7 @@ int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
 		.len = len,
 	};
 
-	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
+	bus = kvm_get_bus_srcu(vcpu->kvm, bus_idx);
 	if (!bus)
 		return -ENOMEM;
 
@@ -3674,9 +5946,7 @@ static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
 
 	return -EOPNOTSUPP;
 }
-EXPORT_SYMBOL_GPL(kvm_io_bus_write);
 
-/* kvm_io_bus_read - called under kvm->slots_lock */
 int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
 		    int len, void *val)
 {
@@ -3689,15 +5959,21 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
 		.len = len,
 	};
 
-	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
+	bus = kvm_get_bus_srcu(vcpu->kvm, bus_idx);
 	if (!bus)
 		return -ENOMEM;
 	r = __kvm_io_bus_read(vcpu, bus, &range, val);
 	return r < 0 ? r : 0;
 }
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_io_bus_read);
+
+static void __free_bus(struct rcu_head *rcu)
+{
+	struct kvm_io_bus *bus = container_of(rcu, struct kvm_io_bus, rcu);
 
+	kfree(bus);
+}
 
-/* Caller must hold slots_lock. */
 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 			    int len, struct kvm_io_device *dev)
 {
@@ -3705,6 +5981,8 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 	struct kvm_io_bus *new_bus, *bus;
 	struct kvm_io_range range;
 
+	lockdep_assert_held(&kvm->slots_lock);
+
 	bus = kvm_get_bus(kvm, bus_idx);
 	if (!bus)
 		return -ENOMEM;
@@ -3713,8 +5991,8 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 	if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
 		return -ENOSPC;
 
-	new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count + 1) *
-			  sizeof(struct kvm_io_range)), GFP_KERNEL);
+	new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
+			  GFP_KERNEL_ACCOUNT);
 	if (!new_bus)
 		return -ENOMEM;
 
@@ -3734,48 +6012,57 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 	memcpy(new_bus->range + i + 1, bus->range + i,
 		(bus->dev_count - i) * sizeof(struct kvm_io_range));
 	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
-	synchronize_srcu_expedited(&kvm->srcu);
-	kfree(bus);
+	call_srcu(&kvm->srcu, &bus->rcu, __free_bus);
 
 	return 0;
 }
 
-/* Caller must hold slots_lock. */
-void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
-			       struct kvm_io_device *dev)
+int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+			      struct kvm_io_device *dev)
 {
 	int i;
 	struct kvm_io_bus *new_bus, *bus;
 
+	lockdep_assert_held(&kvm->slots_lock);
+
 	bus = kvm_get_bus(kvm, bus_idx);
 	if (!bus)
-		return;
+		return 0;
 
-	for (i = 0; i < bus->dev_count; i++)
+	for (i = 0; i < bus->dev_count; i++) {
 		if (bus->range[i].dev == dev) {
 			break;
 		}
+	}
 
 	if (i == bus->dev_count)
-		return;
+		return 0;
 
-	new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) *
-			  sizeof(struct kvm_io_range)), GFP_KERNEL);
-	if (!new_bus)  {
-		pr_err("kvm: failed to shrink bus, removing it completely\n");
-		goto broken;
+	new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
+			  GFP_KERNEL_ACCOUNT);
+	if (new_bus) {
+		memcpy(new_bus, bus, struct_size(bus, range, i));
+		new_bus->dev_count--;
+		memcpy(new_bus->range + i, bus->range + i + 1,
+				flex_array_size(new_bus, range, new_bus->dev_count - i));
 	}
 
-	memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
-	new_bus->dev_count--;
-	memcpy(new_bus->range + i, bus->range + i + 1,
-	       (new_bus->dev_count - i) * sizeof(struct kvm_io_range));
-
-broken:
 	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
 	synchronize_srcu_expedited(&kvm->srcu);
+
+	/*
+	 * If NULL bus is installed, destroy the old bus, including all the
+	 * attached devices. Otherwise, destroy the caller's device only.
+	 */
+	if (!new_bus) {
+		pr_err("kvm: failed to shrink bus, removing it completely\n");
+		kvm_io_bus_destroy(bus);
+		return -ENOMEM;
+	}
+
+	kvm_iodevice_destructor(dev);
 	kfree(bus);
-	return;
+	return 0;
 }
 
 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
@@ -3787,7 +6074,7 @@ struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 
 	srcu_idx = srcu_read_lock(&kvm->srcu);
 
-	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
+	bus = kvm_get_bus_srcu(kvm, bus_idx);
 	if (!bus)
 		goto out_unlock;
 
@@ -3802,35 +6089,35 @@ out_unlock:
 
 	return iodev;
 }
-EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_io_bus_get_dev);
 
 static int kvm_debugfs_open(struct inode *inode, struct file *file,
 			   int (*get)(void *, u64 *), int (*set)(void *, u64),
 			   const char *fmt)
 {
-	struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
-					  inode->i_private;
+	int ret;
+	struct kvm_stat_data *stat_data = inode->i_private;
 
-	/* The debugfs files are a reference to the kvm struct which
-	 * is still valid when kvm_destroy_vm is called.
-	 * To avoid the race between open and the removal of the debugfs
-	 * directory we test against the users count.
+	/*
+	 * The debugfs files are a reference to the kvm struct which
+        * is still valid when kvm_destroy_vm is called.  kvm_get_kvm_safe
+        * avoids the race between open and the removal of the debugfs directory.
 	 */
-	if (!refcount_inc_not_zero(&stat_data->kvm->users_count))
+	if (!kvm_get_kvm_safe(stat_data->kvm))
 		return -ENOENT;
 
-	if (simple_attr_open(inode, file, get, set, fmt)) {
+	ret = simple_attr_open(inode, file, get,
+			       kvm_stats_debugfs_mode(stat_data->desc) & 0222
+			       ? set : NULL, fmt);
+	if (ret)
 		kvm_put_kvm(stat_data->kvm);
-		return -ENOMEM;
-	}
 
-	return 0;
+	return ret;
 }
 
 static int kvm_debugfs_release(struct inode *inode, struct file *file)
 {
-	struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
-					  inode->i_private;
+	struct kvm_stat_data *stat_data = inode->i_private;
 
 	simple_attr_release(inode, file);
 	kvm_put_kvm(stat_data->kvm);
@@ -3838,108 +6125,113 @@ static int kvm_debugfs_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static int vm_stat_get_per_vm(void *data, u64 *val)
+static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
 {
-	struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
+	*val = *(u64 *)((void *)(&kvm->stat) + offset);
 
-	*val = *(ulong *)((void *)stat_data->kvm + stat_data->offset);
+	return 0;
+}
+
+static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
+{
+	*(u64 *)((void *)(&kvm->stat) + offset) = 0;
 
 	return 0;
 }
 
-static int vm_stat_clear_per_vm(void *data, u64 val)
+static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
 {
-	struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
+	unsigned long i;
+	struct kvm_vcpu *vcpu;
 
-	if (val)
-		return -EINVAL;
+	*val = 0;
 
-	*(ulong *)((void *)stat_data->kvm + stat_data->offset) = 0;
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		*val += *(u64 *)((void *)(&vcpu->stat) + offset);
 
 	return 0;
 }
 
-static int vm_stat_get_per_vm_open(struct inode *inode, struct file *file)
+static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
 {
-	__simple_attr_check_format("%llu\n", 0ull);
-	return kvm_debugfs_open(inode, file, vm_stat_get_per_vm,
-				vm_stat_clear_per_vm, "%llu\n");
-}
+	unsigned long i;
+	struct kvm_vcpu *vcpu;
 
-static const struct file_operations vm_stat_get_per_vm_fops = {
-	.owner   = THIS_MODULE,
-	.open    = vm_stat_get_per_vm_open,
-	.release = kvm_debugfs_release,
-	.read    = simple_attr_read,
-	.write   = simple_attr_write,
-	.llseek  = no_llseek,
-};
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		*(u64 *)((void *)(&vcpu->stat) + offset) = 0;
 
-static int vcpu_stat_get_per_vm(void *data, u64 *val)
-{
-	int i;
-	struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
-	struct kvm_vcpu *vcpu;
+	return 0;
+}
 
-	*val = 0;
+static int kvm_stat_data_get(void *data, u64 *val)
+{
+	int r = -EFAULT;
+	struct kvm_stat_data *stat_data = data;
 
-	kvm_for_each_vcpu(i, vcpu, stat_data->kvm)
-		*val += *(u64 *)((void *)vcpu + stat_data->offset);
+	switch (stat_data->kind) {
+	case KVM_STAT_VM:
+		r = kvm_get_stat_per_vm(stat_data->kvm,
+					stat_data->desc->desc.offset, val);
+		break;
+	case KVM_STAT_VCPU:
+		r = kvm_get_stat_per_vcpu(stat_data->kvm,
+					  stat_data->desc->desc.offset, val);
+		break;
+	}
 
-	return 0;
+	return r;
 }
 
-static int vcpu_stat_clear_per_vm(void *data, u64 val)
+static int kvm_stat_data_clear(void *data, u64 val)
 {
-	int i;
-	struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
-	struct kvm_vcpu *vcpu;
+	int r = -EFAULT;
+	struct kvm_stat_data *stat_data = data;
 
 	if (val)
 		return -EINVAL;
 
-	kvm_for_each_vcpu(i, vcpu, stat_data->kvm)
-		*(u64 *)((void *)vcpu + stat_data->offset) = 0;
+	switch (stat_data->kind) {
+	case KVM_STAT_VM:
+		r = kvm_clear_stat_per_vm(stat_data->kvm,
+					  stat_data->desc->desc.offset);
+		break;
+	case KVM_STAT_VCPU:
+		r = kvm_clear_stat_per_vcpu(stat_data->kvm,
+					    stat_data->desc->desc.offset);
+		break;
+	}
 
-	return 0;
+	return r;
 }
 
-static int vcpu_stat_get_per_vm_open(struct inode *inode, struct file *file)
+static int kvm_stat_data_open(struct inode *inode, struct file *file)
 {
 	__simple_attr_check_format("%llu\n", 0ull);
-	return kvm_debugfs_open(inode, file, vcpu_stat_get_per_vm,
-				 vcpu_stat_clear_per_vm, "%llu\n");
+	return kvm_debugfs_open(inode, file, kvm_stat_data_get,
+				kvm_stat_data_clear, "%llu\n");
 }
 
-static const struct file_operations vcpu_stat_get_per_vm_fops = {
-	.owner   = THIS_MODULE,
-	.open    = vcpu_stat_get_per_vm_open,
+static const struct file_operations stat_fops_per_vm = {
+	.owner = THIS_MODULE,
+	.open = kvm_stat_data_open,
 	.release = kvm_debugfs_release,
-	.read    = simple_attr_read,
-	.write   = simple_attr_write,
-	.llseek  = no_llseek,
-};
-
-static const struct file_operations *stat_fops_per_vm[] = {
-	[KVM_STAT_VCPU] = &vcpu_stat_get_per_vm_fops,
-	[KVM_STAT_VM]   = &vm_stat_get_per_vm_fops,
+	.read = simple_attr_read,
+	.write = simple_attr_write,
 };
 
 static int vm_stat_get(void *_offset, u64 *val)
 {
 	unsigned offset = (long)_offset;
 	struct kvm *kvm;
-	struct kvm_stat_data stat_tmp = {.offset = offset};
 	u64 tmp_val;
 
 	*val = 0;
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list) {
-		stat_tmp.kvm = kvm;
-		vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
+		kvm_get_stat_per_vm(kvm, offset, &tmp_val);
 		*val += tmp_val;
 	}
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 	return 0;
 }
 
@@ -3947,38 +6239,35 @@ static int vm_stat_clear(void *_offset, u64 val)
 {
 	unsigned offset = (long)_offset;
 	struct kvm *kvm;
-	struct kvm_stat_data stat_tmp = {.offset = offset};
 
 	if (val)
 		return -EINVAL;
 
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list) {
-		stat_tmp.kvm = kvm;
-		vm_stat_clear_per_vm((void *)&stat_tmp, 0);
+		kvm_clear_stat_per_vm(kvm, offset);
 	}
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 
 	return 0;
 }
 
 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n");
 
 static int vcpu_stat_get(void *_offset, u64 *val)
 {
 	unsigned offset = (long)_offset;
 	struct kvm *kvm;
-	struct kvm_stat_data stat_tmp = {.offset = offset};
 	u64 tmp_val;
 
 	*val = 0;
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list) {
-		stat_tmp.kvm = kvm;
-		vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
+		kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
 		*val += tmp_val;
 	}
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 	return 0;
 }
 
@@ -3986,28 +6275,22 @@ static int vcpu_stat_clear(void *_offset, u64 val)
 {
 	unsigned offset = (long)_offset;
 	struct kvm *kvm;
-	struct kvm_stat_data stat_tmp = {.offset = offset};
 
 	if (val)
 		return -EINVAL;
 
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list) {
-		stat_tmp.kvm = kvm;
-		vcpu_stat_clear_per_vm((void *)&stat_tmp, 0);
+		kvm_clear_stat_per_vcpu(kvm, offset);
 	}
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 
 	return 0;
 }
 
 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
 			"%llu\n");
-
-static const struct file_operations *stat_fops[] = {
-	[KVM_STAT_VCPU] = &vcpu_stat_fops,
-	[KVM_STAT_VM]   = &vm_stat_fops,
-};
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n");
 
 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
 {
@@ -4017,7 +6300,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
 	if (!kvm_dev.this_device || !kvm)
 		return;
 
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	if (type == KVM_EVENT_CREATE_VM) {
 		kvm_createvm_count++;
 		kvm_active_vms++;
@@ -4026,7 +6309,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
 	}
 	created = kvm_createvm_count;
 	active = kvm_active_vms;
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 
 	env = kzalloc(sizeof(*env), GFP_KERNEL);
 	if (!env)
@@ -4043,7 +6326,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
 	}
 	add_uevent_var(env, "PID=%d", kvm->userspace_pid);
 
-	if (kvm->debugfs_dentry) {
+	if (!IS_ERR(kvm->debugfs_dentry)) {
 		char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL);
 
 		if (p) {
@@ -4061,38 +6344,35 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
 
 static void kvm_init_debug(void)
 {
-	struct kvm_stats_debugfs_item *p;
+	const struct file_operations *fops;
+	const struct _kvm_stats_desc *pdesc;
+	int i;
 
 	kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
 
-	kvm_debugfs_num_entries = 0;
-	for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) {
-		debugfs_create_file(p->name, 0644, kvm_debugfs_dir,
-				    (void *)(long)p->offset,
-				    stat_fops[p->kind]);
+	for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
+		pdesc = &kvm_vm_stats_desc[i];
+		if (kvm_stats_debugfs_mode(pdesc) & 0222)
+			fops = &vm_stat_fops;
+		else
+			fops = &vm_stat_readonly_fops;
+		debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
+				kvm_debugfs_dir,
+				(void *)(long)pdesc->desc.offset, fops);
 	}
-}
-
-static int kvm_suspend(void)
-{
-	if (kvm_usage_count)
-		hardware_disable_nolock(NULL);
-	return 0;
-}
 
-static void kvm_resume(void)
-{
-	if (kvm_usage_count) {
-		WARN_ON(raw_spin_is_locked(&kvm_count_lock));
-		hardware_enable_nolock(NULL);
+	for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
+		pdesc = &kvm_vcpu_stats_desc[i];
+		if (kvm_stats_debugfs_mode(pdesc) & 0222)
+			fops = &vcpu_stat_fops;
+		else
+			fops = &vcpu_stat_readonly_fops;
+		debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
+				kvm_debugfs_dir,
+				(void *)(long)pdesc->desc.offset, fops);
 	}
 }
 
-static struct syscore_ops kvm_syscore_ops = {
-	.suspend = kvm_suspend,
-	.resume = kvm_resume,
-};
-
 static inline
 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
 {
@@ -4103,12 +6383,13 @@ static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
 {
 	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
 
-	if (vcpu->preempted)
-		vcpu->preempted = false;
-
-	kvm_arch_sched_in(vcpu, cpu);
+	WRITE_ONCE(vcpu->preempted, false);
+	WRITE_ONCE(vcpu->ready, false);
 
+	__this_cpu_write(kvm_running_vcpu, vcpu);
 	kvm_arch_vcpu_load(vcpu, cpu);
+
+	WRITE_ONCE(vcpu->scheduled_out, false);
 }
 
 static void kvm_sched_out(struct preempt_notifier *pn,
@@ -4116,54 +6397,93 @@ static void kvm_sched_out(struct preempt_notifier *pn,
 {
 	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
 
-	if (current->state == TASK_RUNNING)
-		vcpu->preempted = true;
+	WRITE_ONCE(vcpu->scheduled_out, true);
+
+	if (task_is_runnable(current) && vcpu->wants_to_run) {
+		WRITE_ONCE(vcpu->preempted, true);
+		WRITE_ONCE(vcpu->ready, true);
+	}
 	kvm_arch_vcpu_put(vcpu);
+	__this_cpu_write(kvm_running_vcpu, NULL);
 }
 
-int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
-		  struct module *module)
+/**
+ * kvm_get_running_vcpu - get the vcpu running on the current CPU.
+ *
+ * We can disable preemption locally around accessing the per-CPU variable,
+ * and use the resolved vcpu pointer after enabling preemption again,
+ * because even if the current thread is migrated to another CPU, reading
+ * the per-CPU value later will give us the same value as we update the
+ * per-CPU variable in the preempt notifier handlers.
+ */
+struct kvm_vcpu *kvm_get_running_vcpu(void)
 {
-	int r;
-	int cpu;
+	struct kvm_vcpu *vcpu;
 
-	r = kvm_arch_init(opaque);
-	if (r)
-		goto out_fail;
+	preempt_disable();
+	vcpu = __this_cpu_read(kvm_running_vcpu);
+	preempt_enable();
 
-	/*
-	 * kvm_arch_init makes sure there's at most one caller
-	 * for architectures that support multiple implementations,
-	 * like intel and amd on x86.
-	 * kvm_arch_init must be called before kvm_irqfd_init to avoid creating
-	 * conflicts in case kvm is already setup for another implementation.
-	 */
-	r = kvm_irqfd_init();
-	if (r)
-		goto out_irqfd;
+	return vcpu;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_running_vcpu);
 
-	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
-		r = -ENOMEM;
-		goto out_free_0;
-	}
+/**
+ * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
+ */
+struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
+{
+        return &kvm_running_vcpu;
+}
 
-	r = kvm_arch_hardware_setup();
-	if (r < 0)
-		goto out_free_0a;
+#ifdef CONFIG_GUEST_PERF_EVENTS
+static unsigned int kvm_guest_state(void)
+{
+	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
+	unsigned int state;
 
-	for_each_online_cpu(cpu) {
-		smp_call_function_single(cpu,
-				kvm_arch_check_processor_compat,
-				&r, 1);
-		if (r < 0)
-			goto out_free_1;
-	}
+	if (!kvm_arch_pmi_in_guest(vcpu))
+		return 0;
 
-	r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
-				      kvm_starting_cpu, kvm_dying_cpu);
-	if (r)
-		goto out_free_2;
-	register_reboot_notifier(&kvm_reboot_notifier);
+	state = PERF_GUEST_ACTIVE;
+	if (!kvm_arch_vcpu_in_kernel(vcpu))
+		state |= PERF_GUEST_USER;
+
+	return state;
+}
+
+static unsigned long kvm_guest_get_ip(void)
+{
+	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
+
+	/* Retrieving the IP must be guarded by a call to kvm_guest_state(). */
+	if (WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu)))
+		return 0;
+
+	return kvm_arch_vcpu_get_ip(vcpu);
+}
+
+static struct perf_guest_info_callbacks kvm_guest_cbs = {
+	.state			= kvm_guest_state,
+	.get_ip			= kvm_guest_get_ip,
+	.handle_intel_pt_intr	= NULL,
+};
+
+void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void))
+{
+	kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler;
+	perf_register_guest_info_callbacks(&kvm_guest_cbs);
+}
+void kvm_unregister_perf_callbacks(void)
+{
+	perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
+}
+#endif
+
+int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
+{
+	int r;
+	int cpu;
 
 	/* A kmem cache lets us meet the alignment requirements of fx_save. */
 	if (!vcpu_align)
@@ -4172,28 +6492,32 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 		kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
 					   SLAB_ACCOUNT,
 					   offsetof(struct kvm_vcpu, arch),
-					   sizeof_field(struct kvm_vcpu, arch),
+					   offsetofend(struct kvm_vcpu, stats_id)
+					   - offsetof(struct kvm_vcpu, arch),
 					   NULL);
-	if (!kvm_vcpu_cache) {
-		r = -ENOMEM;
-		goto out_free_3;
+	if (!kvm_vcpu_cache)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
+					    GFP_KERNEL, cpu_to_node(cpu))) {
+			r = -ENOMEM;
+			goto err_cpu_kick_mask;
+		}
 	}
 
+	r = kvm_irqfd_init();
+	if (r)
+		goto err_irqfd;
+
 	r = kvm_async_pf_init();
 	if (r)
-		goto out_free;
+		goto err_async_pf;
 
 	kvm_chardev_ops.owner = module;
 	kvm_vm_fops.owner = module;
 	kvm_vcpu_fops.owner = module;
-
-	r = misc_register(&kvm_dev);
-	if (r) {
-		pr_err("kvm: misc device register failed\n");
-		goto out_unreg;
-	}
-
-	register_syscore_ops(&kvm_syscore_ops);
+	kvm_device_fops.owner = module;
 
 	kvm_preempt_ops.sched_in = kvm_sched_in;
 	kvm_preempt_ops.sched_out = kvm_sched_out;
@@ -4201,45 +6525,68 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 	kvm_init_debug();
 
 	r = kvm_vfio_ops_init();
-	WARN_ON(r);
+	if (WARN_ON_ONCE(r))
+		goto err_vfio;
+
+	r = kvm_gmem_init(module);
+	if (r)
+		goto err_gmem;
+
+	r = kvm_init_virtualization();
+	if (r)
+		goto err_virt;
+
+	/*
+	 * Registration _must_ be the very last thing done, as this exposes
+	 * /dev/kvm to userspace, i.e. all infrastructure must be setup!
+	 */
+	r = misc_register(&kvm_dev);
+	if (r) {
+		pr_err("kvm: misc device register failed\n");
+		goto err_register;
+	}
 
 	return 0;
 
-out_unreg:
+err_register:
+	kvm_uninit_virtualization();
+err_virt:
+	kvm_gmem_exit();
+err_gmem:
+	kvm_vfio_ops_exit();
+err_vfio:
 	kvm_async_pf_deinit();
-out_free:
-	kmem_cache_destroy(kvm_vcpu_cache);
-out_free_3:
-	unregister_reboot_notifier(&kvm_reboot_notifier);
-	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
-out_free_2:
-out_free_1:
-	kvm_arch_hardware_unsetup();
-out_free_0a:
-	free_cpumask_var(cpus_hardware_enabled);
-out_free_0:
+err_async_pf:
 	kvm_irqfd_exit();
-out_irqfd:
-	kvm_arch_exit();
-out_fail:
+err_irqfd:
+err_cpu_kick_mask:
+	for_each_possible_cpu(cpu)
+		free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
+	kmem_cache_destroy(kvm_vcpu_cache);
 	return r;
 }
-EXPORT_SYMBOL_GPL(kvm_init);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init);
 
 void kvm_exit(void)
 {
-	debugfs_remove_recursive(kvm_debugfs_dir);
+	int cpu;
+
+	/*
+	 * Note, unregistering /dev/kvm doesn't strictly need to come first,
+	 * fops_get(), a.k.a. try_module_get(), prevents acquiring references
+	 * to KVM while the module is being stopped.
+	 */
 	misc_deregister(&kvm_dev);
+
+	kvm_uninit_virtualization();
+
+	debugfs_remove_recursive(kvm_debugfs_dir);
+	for_each_possible_cpu(cpu)
+		free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
 	kmem_cache_destroy(kvm_vcpu_cache);
+	kvm_gmem_exit();
+	kvm_vfio_ops_exit();
 	kvm_async_pf_deinit();
-	unregister_syscore_ops(&kvm_syscore_ops);
-	unregister_reboot_notifier(&kvm_reboot_notifier);
-	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
-	on_each_cpu(hardware_disable_nolock, NULL, 1);
-	kvm_arch_hardware_unsetup();
-	kvm_arch_exit();
 	kvm_irqfd_exit();
-	free_cpumask_var(cpus_hardware_enabled);
-	kvm_vfio_ops_exit();
 }
-EXPORT_SYMBOL_GPL(kvm_exit);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_exit);
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
new file mode 100644
index 000000000000..9fcc5d5b7f8d
--- /dev/null
+++ b/virt/kvm/kvm_mm.h
@@ -0,0 +1,97 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef __KVM_MM_H__
+#define __KVM_MM_H__ 1
+
+/*
+ * Architectures can choose whether to use an rwlock or spinlock
+ * for the mmu_lock.  These macros, for use in common code
+ * only, avoids using #ifdefs in places that must deal with
+ * multiple architectures.
+ */
+
+#ifdef KVM_HAVE_MMU_RWLOCK
+#define KVM_MMU_LOCK_INIT(kvm)		rwlock_init(&(kvm)->mmu_lock)
+#define KVM_MMU_LOCK(kvm)		write_lock(&(kvm)->mmu_lock)
+#define KVM_MMU_UNLOCK(kvm)		write_unlock(&(kvm)->mmu_lock)
+#else
+#define KVM_MMU_LOCK_INIT(kvm)		spin_lock_init(&(kvm)->mmu_lock)
+#define KVM_MMU_LOCK(kvm)		spin_lock(&(kvm)->mmu_lock)
+#define KVM_MMU_UNLOCK(kvm)		spin_unlock(&(kvm)->mmu_lock)
+#endif /* KVM_HAVE_MMU_RWLOCK */
+
+
+struct kvm_follow_pfn {
+	const struct kvm_memory_slot *slot;
+	const gfn_t gfn;
+
+	unsigned long hva;
+
+	/* FOLL_* flags modifying lookup behavior, e.g. FOLL_WRITE. */
+	unsigned int flags;
+
+	/*
+	 * Pin the page (effectively FOLL_PIN, which is an mm/ internal flag).
+	 * The page *must* be pinned if KVM will write to the page via a kernel
+	 * mapping, e.g. via kmap(), mremap(), etc.
+	 */
+	bool pin;
+
+	/*
+	 * If non-NULL, try to get a writable mapping even for a read fault.
+	 * Set to true if a writable mapping was obtained.
+	 */
+	bool *map_writable;
+
+	/*
+	 * Optional output.  Set to a valid "struct page" if the returned pfn
+	 * is for a refcounted or pinned struct page, NULL if the returned pfn
+	 * has no struct page or if the struct page is not being refcounted
+	 * (e.g. tail pages of non-compound higher order allocations from
+	 * IO/PFNMAP mappings).
+	 */
+	struct page **refcounted_page;
+};
+
+kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *kfp);
+
+#ifdef CONFIG_HAVE_KVM_PFNCACHE
+void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
+				       unsigned long start,
+				       unsigned long end);
+#else
+static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
+						     unsigned long start,
+						     unsigned long end)
+{
+}
+#endif /* HAVE_KVM_PFNCACHE */
+
+#ifdef CONFIG_KVM_GUEST_MEMFD
+int kvm_gmem_init(struct module *module);
+void kvm_gmem_exit(void);
+int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args);
+int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
+		  unsigned int fd, loff_t offset);
+void kvm_gmem_unbind(struct kvm_memory_slot *slot);
+#else
+static inline int kvm_gmem_init(struct module *module)
+{
+	return 0;
+}
+static inline void kvm_gmem_exit(void) {};
+static inline int kvm_gmem_bind(struct kvm *kvm,
+					 struct kvm_memory_slot *slot,
+					 unsigned int fd, loff_t offset)
+{
+	WARN_ON_ONCE(1);
+	return -EIO;
+}
+
+static inline void kvm_gmem_unbind(struct kvm_memory_slot *slot)
+{
+	WARN_ON_ONCE(1);
+}
+#endif /* CONFIG_KVM_GUEST_MEMFD */
+
+#endif /* __KVM_MM_H__ */
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
new file mode 100644
index 000000000000..728d2c1b488a
--- /dev/null
+++ b/virt/kvm/pfncache.c
@@ -0,0 +1,484 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables kernel and guest-mode vCPU access to guest physical
+ * memory with suitable invalidation mechanisms.
+ *
+ * Copyright © 2021 Amazon.com, Inc. or its affiliates.
+ *
+ * Authors:
+ *   David Woodhouse <dwmw2@infradead.org>
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+
+#include "kvm_mm.h"
+
+/*
+ * MMU notifier 'invalidate_range_start' hook.
+ */
+void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
+				       unsigned long end)
+{
+	struct gfn_to_pfn_cache *gpc;
+
+	spin_lock(&kvm->gpc_lock);
+	list_for_each_entry(gpc, &kvm->gpc_list, list) {
+		read_lock_irq(&gpc->lock);
+
+		/* Only a single page so no need to care about length */
+		if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) &&
+		    gpc->uhva >= start && gpc->uhva < end) {
+			read_unlock_irq(&gpc->lock);
+
+			/*
+			 * There is a small window here where the cache could
+			 * be modified, and invalidation would no longer be
+			 * necessary. Hence check again whether invalidation
+			 * is still necessary once the write lock has been
+			 * acquired.
+			 */
+
+			write_lock_irq(&gpc->lock);
+			if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) &&
+			    gpc->uhva >= start && gpc->uhva < end)
+				gpc->valid = false;
+			write_unlock_irq(&gpc->lock);
+			continue;
+		}
+
+		read_unlock_irq(&gpc->lock);
+	}
+	spin_unlock(&kvm->gpc_lock);
+}
+
+static bool kvm_gpc_is_valid_len(gpa_t gpa, unsigned long uhva,
+				 unsigned long len)
+{
+	unsigned long offset = kvm_is_error_gpa(gpa) ? offset_in_page(uhva) :
+						       offset_in_page(gpa);
+
+	/*
+	 * The cached access must fit within a single page. The 'len' argument
+	 * to activate() and refresh() exists only to enforce that.
+	 */
+	return offset + len <= PAGE_SIZE;
+}
+
+bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len)
+{
+	struct kvm_memslots *slots = kvm_memslots(gpc->kvm);
+
+	if (!gpc->active)
+		return false;
+
+	/*
+	 * If the page was cached from a memslot, make sure the memslots have
+	 * not been re-configured.
+	 */
+	if (!kvm_is_error_gpa(gpc->gpa) && gpc->generation != slots->generation)
+		return false;
+
+	if (kvm_is_error_hva(gpc->uhva))
+		return false;
+
+	if (!kvm_gpc_is_valid_len(gpc->gpa, gpc->uhva, len))
+		return false;
+
+	if (!gpc->valid)
+		return false;
+
+	return true;
+}
+
+static void *gpc_map(kvm_pfn_t pfn)
+{
+	if (pfn_valid(pfn))
+		return kmap(pfn_to_page(pfn));
+
+#ifdef CONFIG_HAS_IOMEM
+	return memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
+#else
+	return NULL;
+#endif
+}
+
+static void gpc_unmap(kvm_pfn_t pfn, void *khva)
+{
+	/* Unmap the old pfn/page if it was mapped before. */
+	if (is_error_noslot_pfn(pfn) || !khva)
+		return;
+
+	if (pfn_valid(pfn)) {
+		kunmap(pfn_to_page(pfn));
+		return;
+	}
+
+#ifdef CONFIG_HAS_IOMEM
+	memunmap(khva);
+#endif
+}
+
+static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_seq)
+{
+	/*
+	 * mn_active_invalidate_count acts for all intents and purposes
+	 * like mmu_invalidate_in_progress here; but the latter cannot
+	 * be used here because the invalidation of caches in the
+	 * mmu_notifier event occurs _before_ mmu_invalidate_in_progress
+	 * is elevated.
+	 *
+	 * Note, it does not matter that mn_active_invalidate_count
+	 * is not protected by gpc->lock.  It is guaranteed to
+	 * be elevated before the mmu_notifier acquires gpc->lock, and
+	 * isn't dropped until after mmu_invalidate_seq is updated.
+	 */
+	if (kvm->mn_active_invalidate_count)
+		return true;
+
+	/*
+	 * Ensure mn_active_invalidate_count is read before
+	 * mmu_invalidate_seq.  This pairs with the smp_wmb() in
+	 * mmu_notifier_invalidate_range_end() to guarantee either the
+	 * old (non-zero) value of mn_active_invalidate_count or the
+	 * new (incremented) value of mmu_invalidate_seq is observed.
+	 */
+	smp_rmb();
+	return kvm->mmu_invalidate_seq != mmu_seq;
+}
+
+static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
+{
+	/* Note, the new page offset may be different than the old! */
+	void *old_khva = (void *)PAGE_ALIGN_DOWN((uintptr_t)gpc->khva);
+	kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT;
+	void *new_khva = NULL;
+	unsigned long mmu_seq;
+	struct page *page;
+
+	struct kvm_follow_pfn kfp = {
+		.slot = gpc->memslot,
+		.gfn = gpa_to_gfn(gpc->gpa),
+		.flags = FOLL_WRITE,
+		.hva = gpc->uhva,
+		.refcounted_page = &page,
+	};
+
+	lockdep_assert_held(&gpc->refresh_lock);
+
+	lockdep_assert_held_write(&gpc->lock);
+
+	/*
+	 * Invalidate the cache prior to dropping gpc->lock, the gpa=>uhva
+	 * assets have already been updated and so a concurrent check() from a
+	 * different task may not fail the gpa/uhva/generation checks.
+	 */
+	gpc->valid = false;
+
+	do {
+		mmu_seq = gpc->kvm->mmu_invalidate_seq;
+		smp_rmb();
+
+		write_unlock_irq(&gpc->lock);
+
+		/*
+		 * If the previous iteration "failed" due to an mmu_notifier
+		 * event, release the pfn and unmap the kernel virtual address
+		 * from the previous attempt.  Unmapping might sleep, so this
+		 * needs to be done after dropping the lock.  Opportunistically
+		 * check for resched while the lock isn't held.
+		 */
+		if (new_pfn != KVM_PFN_ERR_FAULT) {
+			/*
+			 * Keep the mapping if the previous iteration reused
+			 * the existing mapping and didn't create a new one.
+			 */
+			if (new_khva != old_khva)
+				gpc_unmap(new_pfn, new_khva);
+
+			kvm_release_page_unused(page);
+
+			cond_resched();
+		}
+
+		new_pfn = hva_to_pfn(&kfp);
+		if (is_error_noslot_pfn(new_pfn))
+			goto out_error;
+
+		/*
+		 * Obtain a new kernel mapping if KVM itself will access the
+		 * pfn.  Note, kmap() and memremap() can both sleep, so this
+		 * too must be done outside of gpc->lock!
+		 */
+		if (new_pfn == gpc->pfn)
+			new_khva = old_khva;
+		else
+			new_khva = gpc_map(new_pfn);
+
+		if (!new_khva) {
+			kvm_release_page_unused(page);
+			goto out_error;
+		}
+
+		write_lock_irq(&gpc->lock);
+
+		/*
+		 * Other tasks must wait for _this_ refresh to complete before
+		 * attempting to refresh.
+		 */
+		WARN_ON_ONCE(gpc->valid);
+	} while (mmu_notifier_retry_cache(gpc->kvm, mmu_seq));
+
+	gpc->valid = true;
+	gpc->pfn = new_pfn;
+	gpc->khva = new_khva + offset_in_page(gpc->uhva);
+
+	/*
+	 * Put the reference to the _new_ page.  The page is now tracked by the
+	 * cache and can be safely migrated, swapped, etc... as the cache will
+	 * invalidate any mappings in response to relevant mmu_notifier events.
+	 */
+	kvm_release_page_clean(page);
+
+	return 0;
+
+out_error:
+	write_lock_irq(&gpc->lock);
+
+	return -EFAULT;
+}
+
+static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long uhva)
+{
+	unsigned long page_offset;
+	bool unmap_old = false;
+	unsigned long old_uhva;
+	kvm_pfn_t old_pfn;
+	bool hva_change = false;
+	void *old_khva;
+	int ret;
+
+	/* Either gpa or uhva must be valid, but not both */
+	if (WARN_ON_ONCE(kvm_is_error_gpa(gpa) == kvm_is_error_hva(uhva)))
+		return -EINVAL;
+
+	lockdep_assert_held(&gpc->refresh_lock);
+
+	write_lock_irq(&gpc->lock);
+
+	if (!gpc->active) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	old_pfn = gpc->pfn;
+	old_khva = (void *)PAGE_ALIGN_DOWN((uintptr_t)gpc->khva);
+	old_uhva = PAGE_ALIGN_DOWN(gpc->uhva);
+
+	if (kvm_is_error_gpa(gpa)) {
+		page_offset = offset_in_page(uhva);
+
+		gpc->gpa = INVALID_GPA;
+		gpc->memslot = NULL;
+		gpc->uhva = PAGE_ALIGN_DOWN(uhva);
+
+		if (gpc->uhva != old_uhva)
+			hva_change = true;
+	} else {
+		struct kvm_memslots *slots = kvm_memslots(gpc->kvm);
+
+		page_offset = offset_in_page(gpa);
+
+		if (gpc->gpa != gpa || gpc->generation != slots->generation ||
+		    kvm_is_error_hva(gpc->uhva)) {
+			gfn_t gfn = gpa_to_gfn(gpa);
+
+			gpc->gpa = gpa;
+			gpc->generation = slots->generation;
+			gpc->memslot = __gfn_to_memslot(slots, gfn);
+			gpc->uhva = gfn_to_hva_memslot(gpc->memslot, gfn);
+
+			if (kvm_is_error_hva(gpc->uhva)) {
+				ret = -EFAULT;
+				goto out;
+			}
+
+			/*
+			 * Even if the GPA and/or the memslot generation changed, the
+			 * HVA may still be the same.
+			 */
+			if (gpc->uhva != old_uhva)
+				hva_change = true;
+		} else {
+			gpc->uhva = old_uhva;
+		}
+	}
+
+	/* Note: the offset must be correct before calling hva_to_pfn_retry() */
+	gpc->uhva += page_offset;
+
+	/*
+	 * If the userspace HVA changed or the PFN was already invalid,
+	 * drop the lock and do the HVA to PFN lookup again.
+	 */
+	if (!gpc->valid || hva_change) {
+		ret = hva_to_pfn_retry(gpc);
+	} else {
+		/*
+		 * If the HVA→PFN mapping was already valid, don't unmap it.
+		 * But do update gpc->khva because the offset within the page
+		 * may have changed.
+		 */
+		gpc->khva = old_khva + page_offset;
+		ret = 0;
+		goto out_unlock;
+	}
+
+ out:
+	/*
+	 * Invalidate the cache and purge the pfn/khva if the refresh failed.
+	 * Some/all of the uhva, gpa, and memslot generation info may still be
+	 * valid, leave it as is.
+	 */
+	if (ret) {
+		gpc->valid = false;
+		gpc->pfn = KVM_PFN_ERR_FAULT;
+		gpc->khva = NULL;
+	}
+
+	/* Detect a pfn change before dropping the lock! */
+	unmap_old = (old_pfn != gpc->pfn);
+
+out_unlock:
+	write_unlock_irq(&gpc->lock);
+
+	if (unmap_old)
+		gpc_unmap(old_pfn, old_khva);
+
+	return ret;
+}
+
+int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, unsigned long len)
+{
+	unsigned long uhva;
+
+	guard(mutex)(&gpc->refresh_lock);
+
+	if (!kvm_gpc_is_valid_len(gpc->gpa, gpc->uhva, len))
+		return -EINVAL;
+
+	/*
+	 * If the GPA is valid then ignore the HVA, as a cache can be GPA-based
+	 * or HVA-based, not both.  For GPA-based caches, the HVA will be
+	 * recomputed during refresh if necessary.
+	 */
+	uhva = kvm_is_error_gpa(gpc->gpa) ? gpc->uhva : KVM_HVA_ERR_BAD;
+
+	return __kvm_gpc_refresh(gpc, gpc->gpa, uhva);
+}
+
+void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm)
+{
+	rwlock_init(&gpc->lock);
+	mutex_init(&gpc->refresh_lock);
+
+	gpc->kvm = kvm;
+	gpc->pfn = KVM_PFN_ERR_FAULT;
+	gpc->gpa = INVALID_GPA;
+	gpc->uhva = KVM_HVA_ERR_BAD;
+	gpc->active = gpc->valid = false;
+}
+
+static int __kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long uhva,
+			      unsigned long len)
+{
+	struct kvm *kvm = gpc->kvm;
+
+	if (!kvm_gpc_is_valid_len(gpa, uhva, len))
+		return -EINVAL;
+
+	guard(mutex)(&gpc->refresh_lock);
+
+	if (!gpc->active) {
+		if (KVM_BUG_ON(gpc->valid, kvm))
+			return -EIO;
+
+		spin_lock(&kvm->gpc_lock);
+		list_add(&gpc->list, &kvm->gpc_list);
+		spin_unlock(&kvm->gpc_lock);
+
+		/*
+		 * Activate the cache after adding it to the list, a concurrent
+		 * refresh must not establish a mapping until the cache is
+		 * reachable by mmu_notifier events.
+		 */
+		write_lock_irq(&gpc->lock);
+		gpc->active = true;
+		write_unlock_irq(&gpc->lock);
+	}
+	return __kvm_gpc_refresh(gpc, gpa, uhva);
+}
+
+int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len)
+{
+	/*
+	 * Explicitly disallow INVALID_GPA so that the magic value can be used
+	 * by KVM to differentiate between GPA-based and HVA-based caches.
+	 */
+	if (WARN_ON_ONCE(kvm_is_error_gpa(gpa)))
+		return -EINVAL;
+
+	return __kvm_gpc_activate(gpc, gpa, KVM_HVA_ERR_BAD, len);
+}
+
+int kvm_gpc_activate_hva(struct gfn_to_pfn_cache *gpc, unsigned long uhva, unsigned long len)
+{
+	if (!access_ok((void __user *)uhva, len))
+		return -EINVAL;
+
+	return __kvm_gpc_activate(gpc, INVALID_GPA, uhva, len);
+}
+
+void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc)
+{
+	struct kvm *kvm = gpc->kvm;
+	kvm_pfn_t old_pfn;
+	void *old_khva;
+
+	guard(mutex)(&gpc->refresh_lock);
+
+	if (gpc->active) {
+		/*
+		 * Deactivate the cache before removing it from the list, KVM
+		 * must stall mmu_notifier events until all users go away, i.e.
+		 * until gpc->lock is dropped and refresh is guaranteed to fail.
+		 */
+		write_lock_irq(&gpc->lock);
+		gpc->active = false;
+		gpc->valid = false;
+
+		/*
+		 * Leave the GPA => uHVA cache intact, it's protected by the
+		 * memslot generation.  The PFN lookup needs to be redone every
+		 * time as mmu_notifier protection is lost when the cache is
+		 * removed from the VM's gpc_list.
+		 */
+		old_khva = gpc->khva - offset_in_page(gpc->khva);
+		gpc->khva = NULL;
+
+		old_pfn = gpc->pfn;
+		gpc->pfn = KVM_PFN_ERR_FAULT;
+		write_unlock_irq(&gpc->lock);
+
+		spin_lock(&kvm->gpc_lock);
+		list_del(&gpc->list);
+		spin_unlock(&kvm->gpc_lock);
+
+		gpc_unmap(old_pfn, old_khva);
+	}
+}
diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
index d99850c462a1..be50514bbd11 100644
--- a/virt/kvm/vfio.c
+++ b/virt/kvm/vfio.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * VFIO-KVM bridge pseudo device
  *
  * Copyright (C) 2013 Red Hat, Inc.  All rights reserved.
  *     Author: Alex Williamson <alex.williamson@redhat.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 
 #include <linux/errno.h>
@@ -24,149 +21,110 @@
 #include <asm/kvm_ppc.h>
 #endif
 
-struct kvm_vfio_group {
+struct kvm_vfio_file {
 	struct list_head node;
-	struct vfio_group *vfio_group;
+	struct file *file;
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+	struct iommu_group *iommu_group;
+#endif
 };
 
 struct kvm_vfio {
-	struct list_head group_list;
+	struct list_head file_list;
 	struct mutex lock;
 	bool noncoherent;
 };
 
-static struct vfio_group *kvm_vfio_group_get_external_user(struct file *filep)
+static void kvm_vfio_file_set_kvm(struct file *file, struct kvm *kvm)
 {
-	struct vfio_group *vfio_group;
-	struct vfio_group *(*fn)(struct file *);
+	void (*fn)(struct file *file, struct kvm *kvm);
 
-	fn = symbol_get(vfio_group_get_external_user);
+	fn = symbol_get(vfio_file_set_kvm);
 	if (!fn)
-		return ERR_PTR(-EINVAL);
-
-	vfio_group = fn(filep);
+		return;
 
-	symbol_put(vfio_group_get_external_user);
+	fn(file, kvm);
 
-	return vfio_group;
+	symbol_put(vfio_file_set_kvm);
 }
 
-static bool kvm_vfio_external_group_match_file(struct vfio_group *group,
-					       struct file *filep)
+static bool kvm_vfio_file_enforced_coherent(struct file *file)
 {
-	bool ret, (*fn)(struct vfio_group *, struct file *);
+	bool (*fn)(struct file *file);
+	bool ret;
 
-	fn = symbol_get(vfio_external_group_match_file);
+	fn = symbol_get(vfio_file_enforced_coherent);
 	if (!fn)
 		return false;
 
-	ret = fn(group, filep);
+	ret = fn(file);
 
-	symbol_put(vfio_external_group_match_file);
+	symbol_put(vfio_file_enforced_coherent);
 
 	return ret;
 }
 
-static void kvm_vfio_group_put_external_user(struct vfio_group *vfio_group)
-{
-	void (*fn)(struct vfio_group *);
-
-	fn = symbol_get(vfio_group_put_external_user);
-	if (!fn)
-		return;
-
-	fn(vfio_group);
-
-	symbol_put(vfio_group_put_external_user);
-}
-
-static void kvm_vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
+static bool kvm_vfio_file_is_valid(struct file *file)
 {
-	void (*fn)(struct vfio_group *, struct kvm *);
+	bool (*fn)(struct file *file);
+	bool ret;
 
-	fn = symbol_get(vfio_group_set_kvm);
-	if (!fn)
-		return;
-
-	fn(group, kvm);
-
-	symbol_put(vfio_group_set_kvm);
-}
-
-static bool kvm_vfio_group_is_coherent(struct vfio_group *vfio_group)
-{
-	long (*fn)(struct vfio_group *, unsigned long);
-	long ret;
-
-	fn = symbol_get(vfio_external_check_extension);
+	fn = symbol_get(vfio_file_is_valid);
 	if (!fn)
 		return false;
 
-	ret = fn(vfio_group, VFIO_DMA_CC_IOMMU);
+	ret = fn(file);
 
-	symbol_put(vfio_external_check_extension);
+	symbol_put(vfio_file_is_valid);
 
-	return ret > 0;
+	return ret;
 }
 
 #ifdef CONFIG_SPAPR_TCE_IOMMU
-static int kvm_vfio_external_user_iommu_id(struct vfio_group *vfio_group)
+static struct iommu_group *kvm_vfio_file_iommu_group(struct file *file)
 {
-	int (*fn)(struct vfio_group *);
-	int ret = -EINVAL;
+	struct iommu_group *(*fn)(struct file *file);
+	struct iommu_group *ret;
 
-	fn = symbol_get(vfio_external_user_iommu_id);
+	fn = symbol_get(vfio_file_iommu_group);
 	if (!fn)
-		return ret;
+		return NULL;
 
-	ret = fn(vfio_group);
+	ret = fn(file);
 
-	symbol_put(vfio_external_user_iommu_id);
+	symbol_put(vfio_file_iommu_group);
 
 	return ret;
 }
 
-static struct iommu_group *kvm_vfio_group_get_iommu_group(
-		struct vfio_group *group)
-{
-	int group_id = kvm_vfio_external_user_iommu_id(group);
-
-	if (group_id < 0)
-		return NULL;
-
-	return iommu_group_get_by_id(group_id);
-}
-
 static void kvm_spapr_tce_release_vfio_group(struct kvm *kvm,
-		struct vfio_group *vfio_group)
+					     struct kvm_vfio_file *kvf)
 {
-	struct iommu_group *grp = kvm_vfio_group_get_iommu_group(vfio_group);
-
-	if (WARN_ON_ONCE(!grp))
+	if (WARN_ON_ONCE(!kvf->iommu_group))
 		return;
 
-	kvm_spapr_tce_release_iommu_group(kvm, grp);
-	iommu_group_put(grp);
+	kvm_spapr_tce_release_iommu_group(kvm, kvf->iommu_group);
+	iommu_group_put(kvf->iommu_group);
+	kvf->iommu_group = NULL;
 }
 #endif
 
 /*
- * Groups can use the same or different IOMMU domains.  If the same then
- * adding a new group may change the coherency of groups we've previously
- * been told about.  We don't want to care about any of that so we retest
- * each group and bail as soon as we find one that's noncoherent.  This
- * means we only ever [un]register_noncoherent_dma once for the whole device.
+ * Groups/devices can use the same or different IOMMU domains. If the same
+ * then adding a new group/device may change the coherency of groups/devices
+ * we've previously been told about. We don't want to care about any of
+ * that so we retest each group/device and bail as soon as we find one that's
+ * noncoherent.  This means we only ever [un]register_noncoherent_dma once
+ * for the whole device.
  */
 static void kvm_vfio_update_coherency(struct kvm_device *dev)
 {
 	struct kvm_vfio *kv = dev->private;
 	bool noncoherent = false;
-	struct kvm_vfio_group *kvg;
-
-	mutex_lock(&kv->lock);
+	struct kvm_vfio_file *kvf;
 
-	list_for_each_entry(kvg, &kv->group_list, node) {
-		if (!kvm_vfio_group_is_coherent(kvg->vfio_group)) {
+	list_for_each_entry(kvf, &kv->file_list, node) {
+		if (!kvm_vfio_file_enforced_coherent(kvf->file)) {
 			noncoherent = true;
 			break;
 		}
@@ -180,153 +138,152 @@ static void kvm_vfio_update_coherency(struct kvm_device *dev)
 		else
 			kvm_arch_unregister_noncoherent_dma(dev->kvm);
 	}
-
-	mutex_unlock(&kv->lock);
 }
 
-static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg)
+static int kvm_vfio_file_add(struct kvm_device *dev, unsigned int fd)
 {
 	struct kvm_vfio *kv = dev->private;
-	struct vfio_group *vfio_group;
-	struct kvm_vfio_group *kvg;
-	int32_t __user *argp = (int32_t __user *)(unsigned long)arg;
-	struct fd f;
-	int32_t fd;
-	int ret;
-
-	switch (attr) {
-	case KVM_DEV_VFIO_GROUP_ADD:
-		if (get_user(fd, argp))
-			return -EFAULT;
-
-		f = fdget(fd);
-		if (!f.file)
-			return -EBADF;
-
-		vfio_group = kvm_vfio_group_get_external_user(f.file);
-		fdput(f);
-
-		if (IS_ERR(vfio_group))
-			return PTR_ERR(vfio_group);
-
-		mutex_lock(&kv->lock);
+	struct kvm_vfio_file *kvf;
+	struct file *filp;
+	int ret = 0;
+
+	filp = fget(fd);
+	if (!filp)
+		return -EBADF;
+
+	/* Ensure the FD is a vfio FD. */
+	if (!kvm_vfio_file_is_valid(filp)) {
+		ret = -EINVAL;
+		goto out_fput;
+	}
 
-		list_for_each_entry(kvg, &kv->group_list, node) {
-			if (kvg->vfio_group == vfio_group) {
-				mutex_unlock(&kv->lock);
-				kvm_vfio_group_put_external_user(vfio_group);
-				return -EEXIST;
-			}
-		}
+	mutex_lock(&kv->lock);
 
-		kvg = kzalloc(sizeof(*kvg), GFP_KERNEL);
-		if (!kvg) {
-			mutex_unlock(&kv->lock);
-			kvm_vfio_group_put_external_user(vfio_group);
-			return -ENOMEM;
+	list_for_each_entry(kvf, &kv->file_list, node) {
+		if (kvf->file == filp) {
+			ret = -EEXIST;
+			goto out_unlock;
 		}
+	}
 
-		list_add_tail(&kvg->node, &kv->group_list);
-		kvg->vfio_group = vfio_group;
-
-		kvm_arch_start_assignment(dev->kvm);
-
-		mutex_unlock(&kv->lock);
+	kvf = kzalloc(sizeof(*kvf), GFP_KERNEL_ACCOUNT);
+	if (!kvf) {
+		ret = -ENOMEM;
+		goto out_unlock;
+	}
 
-		kvm_vfio_group_set_kvm(vfio_group, dev->kvm);
+	kvf->file = get_file(filp);
+	list_add_tail(&kvf->node, &kv->file_list);
 
-		kvm_vfio_update_coherency(dev);
+	kvm_vfio_file_set_kvm(kvf->file, dev->kvm);
+	kvm_vfio_update_coherency(dev);
 
-		return 0;
+out_unlock:
+	mutex_unlock(&kv->lock);
+out_fput:
+	fput(filp);
+	return ret;
+}
 
-	case KVM_DEV_VFIO_GROUP_DEL:
-		if (get_user(fd, argp))
-			return -EFAULT;
+static int kvm_vfio_file_del(struct kvm_device *dev, unsigned int fd)
+{
+	struct kvm_vfio *kv = dev->private;
+	struct kvm_vfio_file *kvf;
+	CLASS(fd, f)(fd);
+	int ret;
 
-		f = fdget(fd);
-		if (!f.file)
-			return -EBADF;
+	if (fd_empty(f))
+		return -EBADF;
 
-		ret = -ENOENT;
+	ret = -ENOENT;
 
-		mutex_lock(&kv->lock);
+	mutex_lock(&kv->lock);
 
-		list_for_each_entry(kvg, &kv->group_list, node) {
-			if (!kvm_vfio_external_group_match_file(kvg->vfio_group,
-								f.file))
-				continue;
+	list_for_each_entry(kvf, &kv->file_list, node) {
+		if (kvf->file != fd_file(f))
+			continue;
 
-			list_del(&kvg->node);
-			kvm_arch_end_assignment(dev->kvm);
+		list_del(&kvf->node);
 #ifdef CONFIG_SPAPR_TCE_IOMMU
-			kvm_spapr_tce_release_vfio_group(dev->kvm,
-							 kvg->vfio_group);
+		kvm_spapr_tce_release_vfio_group(dev->kvm, kvf);
 #endif
-			kvm_vfio_group_set_kvm(kvg->vfio_group, NULL);
-			kvm_vfio_group_put_external_user(kvg->vfio_group);
-			kfree(kvg);
-			ret = 0;
-			break;
-		}
+		kvm_vfio_file_set_kvm(kvf->file, NULL);
+		fput(kvf->file);
+		kfree(kvf);
+		ret = 0;
+		break;
+	}
 
-		mutex_unlock(&kv->lock);
+	kvm_vfio_update_coherency(dev);
 
-		fdput(f);
+	mutex_unlock(&kv->lock);
+	return ret;
+}
 
-		kvm_vfio_update_coherency(dev);
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+static int kvm_vfio_file_set_spapr_tce(struct kvm_device *dev,
+				       void __user *arg)
+{
+	struct kvm_vfio_spapr_tce param;
+	struct kvm_vfio *kv = dev->private;
+	struct kvm_vfio_file *kvf;
+	int ret;
 
-		return ret;
+	if (copy_from_user(&param, arg, sizeof(struct kvm_vfio_spapr_tce)))
+		return -EFAULT;
 
-#ifdef CONFIG_SPAPR_TCE_IOMMU
-	case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: {
-		struct kvm_vfio_spapr_tce param;
-		struct kvm_vfio *kv = dev->private;
-		struct vfio_group *vfio_group;
-		struct kvm_vfio_group *kvg;
-		struct fd f;
-		struct iommu_group *grp;
-
-		if (copy_from_user(&param, (void __user *)arg,
-				sizeof(struct kvm_vfio_spapr_tce)))
-			return -EFAULT;
+	CLASS(fd, f)(param.groupfd);
+	if (fd_empty(f))
+		return -EBADF;
 
-		f = fdget(param.groupfd);
-		if (!f.file)
-			return -EBADF;
+	ret = -ENOENT;
 
-		vfio_group = kvm_vfio_group_get_external_user(f.file);
-		fdput(f);
+	mutex_lock(&kv->lock);
 
-		if (IS_ERR(vfio_group))
-			return PTR_ERR(vfio_group);
+	list_for_each_entry(kvf, &kv->file_list, node) {
+		if (kvf->file != fd_file(f))
+			continue;
 
-		grp = kvm_vfio_group_get_iommu_group(vfio_group);
-		if (WARN_ON_ONCE(!grp)) {
-			kvm_vfio_group_put_external_user(vfio_group);
-			return -EIO;
+		if (!kvf->iommu_group) {
+			kvf->iommu_group = kvm_vfio_file_iommu_group(kvf->file);
+			if (WARN_ON_ONCE(!kvf->iommu_group)) {
+				ret = -EIO;
+				goto err_fdput;
+			}
 		}
 
-		ret = -ENOENT;
-
-		mutex_lock(&kv->lock);
+		ret = kvm_spapr_tce_attach_iommu_group(dev->kvm, param.tablefd,
+						       kvf->iommu_group);
+		break;
+	}
 
-		list_for_each_entry(kvg, &kv->group_list, node) {
-			if (kvg->vfio_group != vfio_group)
-				continue;
+err_fdput:
+	mutex_unlock(&kv->lock);
+	return ret;
+}
+#endif
 
-			ret = kvm_spapr_tce_attach_iommu_group(dev->kvm,
-					param.tablefd, grp);
-			break;
-		}
+static int kvm_vfio_set_file(struct kvm_device *dev, long attr,
+			     void __user *arg)
+{
+	int32_t __user *argp = arg;
+	int32_t fd;
 
-		mutex_unlock(&kv->lock);
+	switch (attr) {
+	case KVM_DEV_VFIO_FILE_ADD:
+		if (get_user(fd, argp))
+			return -EFAULT;
+		return kvm_vfio_file_add(dev, fd);
 
-		iommu_group_put(grp);
-		kvm_vfio_group_put_external_user(vfio_group);
+	case KVM_DEV_VFIO_FILE_DEL:
+		if (get_user(fd, argp))
+			return -EFAULT;
+		return kvm_vfio_file_del(dev, fd);
 
-		return ret;
-	}
-#endif /* CONFIG_SPAPR_TCE_IOMMU */
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+	case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE:
+		return kvm_vfio_file_set_spapr_tce(dev, arg);
+#endif
 	}
 
 	return -ENXIO;
@@ -336,8 +293,9 @@ static int kvm_vfio_set_attr(struct kvm_device *dev,
 			     struct kvm_device_attr *attr)
 {
 	switch (attr->group) {
-	case KVM_DEV_VFIO_GROUP:
-		return kvm_vfio_set_group(dev, attr->attr, attr->addr);
+	case KVM_DEV_VFIO_FILE:
+		return kvm_vfio_set_file(dev, attr->attr,
+					 u64_to_user_ptr(attr->addr));
 	}
 
 	return -ENXIO;
@@ -347,10 +305,10 @@ static int kvm_vfio_has_attr(struct kvm_device *dev,
 			     struct kvm_device_attr *attr)
 {
 	switch (attr->group) {
-	case KVM_DEV_VFIO_GROUP:
+	case KVM_DEV_VFIO_FILE:
 		switch (attr->attr) {
-		case KVM_DEV_VFIO_GROUP_ADD:
-		case KVM_DEV_VFIO_GROUP_DEL:
+		case KVM_DEV_VFIO_FILE_ADD:
+		case KVM_DEV_VFIO_FILE_DEL:
 #ifdef CONFIG_SPAPR_TCE_IOMMU
 		case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE:
 #endif
@@ -363,34 +321,33 @@ static int kvm_vfio_has_attr(struct kvm_device *dev,
 	return -ENXIO;
 }
 
-static void kvm_vfio_destroy(struct kvm_device *dev)
+static void kvm_vfio_release(struct kvm_device *dev)
 {
 	struct kvm_vfio *kv = dev->private;
-	struct kvm_vfio_group *kvg, *tmp;
+	struct kvm_vfio_file *kvf, *tmp;
 
-	list_for_each_entry_safe(kvg, tmp, &kv->group_list, node) {
+	list_for_each_entry_safe(kvf, tmp, &kv->file_list, node) {
 #ifdef CONFIG_SPAPR_TCE_IOMMU
-		kvm_spapr_tce_release_vfio_group(dev->kvm, kvg->vfio_group);
+		kvm_spapr_tce_release_vfio_group(dev->kvm, kvf);
 #endif
-		kvm_vfio_group_set_kvm(kvg->vfio_group, NULL);
-		kvm_vfio_group_put_external_user(kvg->vfio_group);
-		list_del(&kvg->node);
-		kfree(kvg);
-		kvm_arch_end_assignment(dev->kvm);
+		kvm_vfio_file_set_kvm(kvf->file, NULL);
+		fput(kvf->file);
+		list_del(&kvf->node);
+		kfree(kvf);
 	}
 
 	kvm_vfio_update_coherency(dev);
 
 	kfree(kv);
-	kfree(dev); /* alloc by kvm_ioctl_create_device, free by .destroy */
+	kfree(dev); /* alloc by kvm_ioctl_create_device, free by .release */
 }
 
 static int kvm_vfio_create(struct kvm_device *dev, u32 type);
 
-static struct kvm_device_ops kvm_vfio_ops = {
+static const struct kvm_device_ops kvm_vfio_ops = {
 	.name = "kvm-vfio",
 	.create = kvm_vfio_create,
-	.destroy = kvm_vfio_destroy,
+	.release = kvm_vfio_release,
 	.set_attr = kvm_vfio_set_attr,
 	.has_attr = kvm_vfio_has_attr,
 };
@@ -400,16 +357,18 @@ static int kvm_vfio_create(struct kvm_device *dev, u32 type)
 	struct kvm_device *tmp;
 	struct kvm_vfio *kv;
 
+	lockdep_assert_held(&dev->kvm->lock);
+
 	/* Only one VFIO "device" per VM */
 	list_for_each_entry(tmp, &dev->kvm->devices, vm_node)
 		if (tmp->ops == &kvm_vfio_ops)
 			return -EBUSY;
 
-	kv = kzalloc(sizeof(*kv), GFP_KERNEL);
+	kv = kzalloc(sizeof(*kv), GFP_KERNEL_ACCOUNT);
 	if (!kv)
 		return -ENOMEM;
 
-	INIT_LIST_HEAD(&kv->group_list);
+	INIT_LIST_HEAD(&kv->file_list);
 	mutex_init(&kv->lock);
 
 	dev->private = kv;