Diffstat (limited to 'drivers/hv')
30 files changed, 10141 insertions, 1587 deletions
diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig index 0747a8f1fcee..7937ac0cbd0f 100644 --- a/drivers/hv/Kconfig +++ b/drivers/hv/Kconfig @@ -3,31 +3,110 @@ menu "Microsoft Hyper-V guest support" config HYPERV - tristate "Microsoft Hyper-V client drivers" - depends on ACPI && ((X86 && X86_LOCAL_APIC && HYPERVISOR_GUEST) \ - || (ARM64 && !CPU_BIG_ENDIAN)) + bool "Microsoft Hyper-V core hypervisor support" + depends on (X86 && X86_LOCAL_APIC && HYPERVISOR_GUEST) \ + || (ARM64 && !CPU_BIG_ENDIAN) select PARAVIRT select X86_HV_CALLBACK_VECTOR if X86 - select VMAP_PFN + select OF_EARLY_FLATTREE if OF + select SYSFB if EFI && !HYPERV_VTL_MODE + select IRQ_MSI_LIB if X86 help Select this option to run Linux as a Hyper-V client operating system. +config HYPERV_VTL_MODE + bool "Enable Linux to boot in VTL context" + depends on (X86_64 && HAVE_STATIC_CALL) || ARM64 + depends on HYPERV + depends on SMP + default n + help + Virtual Secure Mode (VSM) is a set of hypervisor capabilities and + enlightenments offered to host and guest partitions which enables + the creation and management of new security boundaries within + operating system software. + + VSM achieves and maintains isolation through Virtual Trust Levels + (VTLs). Virtual Trust Levels are hierarchical, with higher levels + being more privileged than lower levels. VTL0 is the least privileged + level, and currently only other level supported is VTL2. + + Select this option to build a Linux kernel to run at a VTL other than + the normal VTL0, which currently is only VTL2. This option + initializes the kernel to run in VTL2, and adds the ability to boot + secondary CPUs directly into 64-bit context as required for VTLs other + than 0. A kernel built with this option must run at VTL2, and will + not run as a normal guest. + + If unsure, say N + config HYPERV_TIMER def_bool HYPERV && X86 config HYPERV_UTILS tristate "Microsoft Hyper-V Utilities driver" - depends on HYPERV && CONNECTOR && NLS + depends on HYPERV_VMBUS && CONNECTOR && NLS depends on PTP_1588_CLOCK_OPTIONAL help Select this option to enable the Hyper-V Utilities. config HYPERV_BALLOON tristate "Microsoft Hyper-V Balloon driver" - depends on HYPERV + depends on HYPERV_VMBUS select PAGE_REPORTING help Select this option to enable Hyper-V Balloon driver. +config HYPERV_VMBUS + tristate "Microsoft Hyper-V VMBus driver" + depends on HYPERV + default HYPERV + help + Select this option to enable Hyper-V Vmbus driver. + +config MSHV_ROOT + tristate "Microsoft Hyper-V root partition support" + depends on HYPERV && (X86_64 || ARM64) + depends on !HYPERV_VTL_MODE + # The hypervisor interface operates on 4k pages. Enforcing it here + # simplifies many assumptions in the root partition code. + # e.g. When withdrawing memory, the hypervisor gives back 4k pages in + # no particular order, making it impossible to reassemble larger pages + depends on PAGE_SIZE_4KB + select EVENTFD + select VIRT_XFER_TO_GUEST_WORK + select HMM_MIRROR + select MMU_NOTIFIER + default n + help + Select this option to enable support for booting and running as root + partition on Microsoft Hyper-V. + + If unsure, say N. + +config MSHV_VTL + tristate "Microsoft Hyper-V VTL driver" + depends on X86_64 && HYPERV_VTL_MODE + depends on HYPERV_VMBUS + # Mapping VTL0 memory to a userspace process in VTL2 is supported in OpenHCL. + # VTL2 for OpenHCL makes use of Huge Pages to improve performance on VMs, + # specially with large memory requirements. 
+ depends on TRANSPARENT_HUGEPAGE + # MTRRs are controlled by VTL0, and are not specific to individual VTLs. + # Therefore, do not attempt to access or modify MTRRs here. + depends on !MTRR + select CPUMASK_OFFSTACK + select VIRT_XFER_TO_GUEST_WORK + default n + help + Select this option to enable Hyper-V VTL driver support. + This driver provides interfaces for Virtual Machine Manager (VMM) running in VTL2 + userspace to create VTLs and partitions, setup and manage VTL0 memory and + allow userspace to make direct hypercalls. This also allows to map VTL0's address + space to a usermode process in VTL2 and supports getting new VMBus messages and channel + events in VTL2. + + If unsure, say N. + endmenu diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile index d76df5c8c2a9..a49f93c2d245 100644 --- a/drivers/hv/Makefile +++ b/drivers/hv/Makefile @@ -1,7 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_HYPERV) += hv_vmbus.o +obj-$(CONFIG_HYPERV_VMBUS) += hv_vmbus.o obj-$(CONFIG_HYPERV_UTILS) += hv_utils.o obj-$(CONFIG_HYPERV_BALLOON) += hv_balloon.o +obj-$(CONFIG_MSHV_ROOT) += mshv_root.o +obj-$(CONFIG_MSHV_VTL) += mshv_vtl.o CFLAGS_hv_trace.o = -I$(src) CFLAGS_hv_balloon.o = -I$(src) @@ -10,7 +12,14 @@ hv_vmbus-y := vmbus_drv.o \ hv.o connection.o channel.o \ channel_mgmt.o ring_buffer.o hv_trace.o hv_vmbus-$(CONFIG_HYPERV_TESTING) += hv_debugfs.o -hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_fcopy.o hv_utils_transport.o +hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_utils_transport.o +mshv_root-y := mshv_root_main.o mshv_synic.o mshv_eventfd.o mshv_irq.o \ + mshv_root_hv_call.o mshv_portid_table.o mshv_regions.o +mshv_vtl-y := mshv_vtl_main.o # Code that must be built-in -obj-$(subst m,y,$(CONFIG_HYPERV)) += hv_common.o +obj-$(CONFIG_HYPERV) += hv_common.o +obj-$(subst m,y,$(CONFIG_MSHV_ROOT)) += hv_proc.o +ifneq ($(CONFIG_MSHV_ROOT)$(CONFIG_MSHV_VTL),) + obj-y += mshv_common.o +endif diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 56f7e06c673e..6821f225248b 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -18,6 +18,7 @@ #include <linux/uio.h> #include <linux/interrupt.h> #include <linux/set_memory.h> +#include <linux/export.h> #include <asm/page.h> #include <asm/mshyperv.h> @@ -153,7 +154,9 @@ void vmbus_free_ring(struct vmbus_channel *channel) hv_ringbuffer_cleanup(&channel->inbound); if (channel->ringbuffer_page) { - __free_pages(channel->ringbuffer_page, + /* In a CoCo VM leak the memory if it didn't get re-encrypted */ + if (!channel->ringbuffer_gpadlhandle.decrypted) + __free_pages(channel->ringbuffer_page, get_order(channel->ringbuffer_pagecount << PAGE_SHIFT)); channel->ringbuffer_page = NULL; @@ -322,125 +325,104 @@ static int create_gpadl_header(enum hv_gpadl_type type, void *kbuffer, pagecount = hv_gpadl_size(type, size) >> HV_HYP_PAGE_SHIFT; - /* do we need a gpadl body msg */ pfnsize = MAX_SIZE_CHANNEL_MESSAGE - sizeof(struct vmbus_channel_gpadl_header) - sizeof(struct gpa_range); + pfncount = umin(pagecount, pfnsize / sizeof(u64)); + + msgsize = sizeof(struct vmbus_channel_msginfo) + + sizeof(struct vmbus_channel_gpadl_header) + + sizeof(struct gpa_range) + pfncount * sizeof(u64); + msgheader = kzalloc(msgsize, GFP_KERNEL); + if (!msgheader) + return -ENOMEM; + + INIT_LIST_HEAD(&msgheader->submsglist); + msgheader->msgsize = msgsize; + + gpadl_header = (struct vmbus_channel_gpadl_header *) + msgheader->msg; + gpadl_header->rangecount = 1; + gpadl_header->range_buflen = sizeof(struct gpa_range) + + pagecount * sizeof(u64); + 
gpadl_header->range[0].byte_offset = 0; + gpadl_header->range[0].byte_count = hv_gpadl_size(type, size); + for (i = 0; i < pfncount; i++) + gpadl_header->range[0].pfn_array[i] = hv_gpadl_hvpfn( + type, kbuffer, size, send_offset, i); + *msginfo = msgheader; + + pfnsum = pfncount; + pfnleft = pagecount - pfncount; + + /* how many pfns can we fit in a body message */ + pfnsize = MAX_SIZE_CHANNEL_MESSAGE - + sizeof(struct vmbus_channel_gpadl_body); pfncount = pfnsize / sizeof(u64); - if (pagecount > pfncount) { - /* we need a gpadl body */ - /* fill in the header */ + /* + * If pfnleft is zero, everything fits in the header and no body + * messages are needed + */ + while (pfnleft) { + pfncurr = umin(pfncount, pfnleft); msgsize = sizeof(struct vmbus_channel_msginfo) + - sizeof(struct vmbus_channel_gpadl_header) + - sizeof(struct gpa_range) + pfncount * sizeof(u64); - msgheader = kzalloc(msgsize, GFP_KERNEL); - if (!msgheader) - goto nomem; - - INIT_LIST_HEAD(&msgheader->submsglist); - msgheader->msgsize = msgsize; - - gpadl_header = (struct vmbus_channel_gpadl_header *) - msgheader->msg; - gpadl_header->rangecount = 1; - gpadl_header->range_buflen = sizeof(struct gpa_range) + - pagecount * sizeof(u64); - gpadl_header->range[0].byte_offset = 0; - gpadl_header->range[0].byte_count = hv_gpadl_size(type, size); - for (i = 0; i < pfncount; i++) - gpadl_header->range[0].pfn_array[i] = hv_gpadl_hvpfn( - type, kbuffer, size, send_offset, i); - *msginfo = msgheader; - - pfnsum = pfncount; - pfnleft = pagecount - pfncount; - - /* how many pfns can we fit */ - pfnsize = MAX_SIZE_CHANNEL_MESSAGE - - sizeof(struct vmbus_channel_gpadl_body); - pfncount = pfnsize / sizeof(u64); - - /* fill in the body */ - while (pfnleft) { - if (pfnleft > pfncount) - pfncurr = pfncount; - else - pfncurr = pfnleft; - - msgsize = sizeof(struct vmbus_channel_msginfo) + - sizeof(struct vmbus_channel_gpadl_body) + - pfncurr * sizeof(u64); - msgbody = kzalloc(msgsize, GFP_KERNEL); - - if (!msgbody) { - struct vmbus_channel_msginfo *pos = NULL; - struct vmbus_channel_msginfo *tmp = NULL; - /* - * Free up all the allocated messages. - */ - list_for_each_entry_safe(pos, tmp, - &msgheader->submsglist, - msglistentry) { - - list_del(&pos->msglistentry); - kfree(pos); - } - - goto nomem; - } - - msgbody->msgsize = msgsize; - gpadl_body = - (struct vmbus_channel_gpadl_body *)msgbody->msg; + sizeof(struct vmbus_channel_gpadl_body) + + pfncurr * sizeof(u64); + msgbody = kzalloc(msgsize, GFP_KERNEL); + if (!msgbody) { + struct vmbus_channel_msginfo *pos = NULL; + struct vmbus_channel_msginfo *tmp = NULL; /* - * Gpadl is u32 and we are using a pointer which could - * be 64-bit - * This is governed by the guest/host protocol and - * so the hypervisor guarantees that this is ok. + * Free up all the allocated messages. 
*/ - for (i = 0; i < pfncurr; i++) - gpadl_body->pfn[i] = hv_gpadl_hvpfn(type, - kbuffer, size, send_offset, pfnsum + i); - - /* add to msg header */ - list_add_tail(&msgbody->msglistentry, - &msgheader->submsglist); - pfnsum += pfncurr; - pfnleft -= pfncurr; + list_for_each_entry_safe(pos, tmp, + &msgheader->submsglist, + msglistentry) { + + list_del(&pos->msglistentry); + kfree(pos); + } + kfree(msgheader); + return -ENOMEM; } - } else { - /* everything fits in a header */ - msgsize = sizeof(struct vmbus_channel_msginfo) + - sizeof(struct vmbus_channel_gpadl_header) + - sizeof(struct gpa_range) + pagecount * sizeof(u64); - msgheader = kzalloc(msgsize, GFP_KERNEL); - if (msgheader == NULL) - goto nomem; - - INIT_LIST_HEAD(&msgheader->submsglist); - msgheader->msgsize = msgsize; - - gpadl_header = (struct vmbus_channel_gpadl_header *) - msgheader->msg; - gpadl_header->rangecount = 1; - gpadl_header->range_buflen = sizeof(struct gpa_range) + - pagecount * sizeof(u64); - gpadl_header->range[0].byte_offset = 0; - gpadl_header->range[0].byte_count = hv_gpadl_size(type, size); - for (i = 0; i < pagecount; i++) - gpadl_header->range[0].pfn_array[i] = hv_gpadl_hvpfn( - type, kbuffer, size, send_offset, i); - - *msginfo = msgheader; + + msgbody->msgsize = msgsize; + gpadl_body = (struct vmbus_channel_gpadl_body *)msgbody->msg; + + /* + * Gpadl is u32 and we are using a pointer which could + * be 64-bit + * This is governed by the guest/host protocol and + * so the hypervisor guarantees that this is ok. + */ + for (i = 0; i < pfncurr; i++) + gpadl_body->pfn[i] = hv_gpadl_hvpfn(type, + kbuffer, size, send_offset, pfnsum + i); + + /* add to msg header */ + list_add_tail(&msgbody->msglistentry, &msgheader->submsglist); + pfnsum += pfncurr; + pfnleft -= pfncurr; } return 0; -nomem: - kfree(msgheader); - kfree(msgbody); - return -ENOMEM; +} + +static void vmbus_free_channel_msginfo(struct vmbus_channel_msginfo *msginfo) +{ + struct vmbus_channel_msginfo *submsginfo, *tmp; + + if (!msginfo) + return; + + list_for_each_entry_safe(submsginfo, tmp, &msginfo->submsglist, + msglistentry) { + kfree(submsginfo); + } + + kfree(msginfo); } /* @@ -462,7 +444,7 @@ static int __vmbus_establish_gpadl(struct vmbus_channel *channel, struct vmbus_channel_gpadl_header *gpadlmsg; struct vmbus_channel_gpadl_body *gpadl_body; struct vmbus_channel_msginfo *msginfo = NULL; - struct vmbus_channel_msginfo *submsginfo, *tmp; + struct vmbus_channel_msginfo *submsginfo; struct list_head *curr; u32 next_gpadl_handle; unsigned long flags; @@ -472,18 +454,31 @@ static int __vmbus_establish_gpadl(struct vmbus_channel *channel, (atomic_inc_return(&vmbus_connection.next_gpadl_handle) - 1); ret = create_gpadl_header(type, kbuffer, size, send_offset, &msginfo); - if (ret) - return ret; - - ret = set_memory_decrypted((unsigned long)kbuffer, - PFN_UP(size)); if (ret) { - dev_warn(&channel->device_obj->device, - "Failed to set host visibility for new GPADL %d.\n", - ret); + gpadl->decrypted = false; return ret; } + gpadl->decrypted = !((channel->co_external_memory && type == HV_GPADL_BUFFER) || + (channel->co_ring_buffer && type == HV_GPADL_RING)); + if (gpadl->decrypted) { + /* + * The "decrypted" flag being true assumes that set_memory_decrypted() succeeds. + * But if it fails, the encryption state of the memory is unknown. In that case, + * leave "decrypted" as true to ensure the memory is leaked instead of going back + * on the free list. 
+ */ + ret = set_memory_decrypted((unsigned long)kbuffer, + PFN_UP(size)); + if (ret) { + dev_warn(&channel->device_obj->device, + "Failed to set host visibility for new GPADL %d.\n", + ret); + vmbus_free_channel_msginfo(msginfo); + return ret; + } + } + init_completion(&msginfo->waitevent); msginfo->waiting_channel = channel; @@ -556,16 +551,20 @@ cleanup: spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); list_del(&msginfo->msglistentry); spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); - list_for_each_entry_safe(submsginfo, tmp, &msginfo->submsglist, - msglistentry) { - kfree(submsginfo); - } - kfree(msginfo); + vmbus_free_channel_msginfo(msginfo); - if (ret) - set_memory_encrypted((unsigned long)kbuffer, - PFN_UP(size)); + if (ret) { + /* + * If set_memory_encrypted() fails, the decrypted flag is + * left as true so the memory is leaked instead of being + * put back on the free list. + */ + if (gpadl->decrypted) { + if (!set_memory_encrypted((unsigned long)kbuffer, PFN_UP(size))) + gpadl->decrypted = false; + } + } return ret; } @@ -591,7 +590,7 @@ EXPORT_SYMBOL_GPL(vmbus_establish_gpadl); * keeps track of the next available slot in the array. Initially, each * slot points to the next one (as in a Linked List). The last slot * does not point to anything, so its value is U64_MAX by default. - * @size The size of the array + * @size: The size of the array */ static u64 *request_arr_init(u32 size) { @@ -695,12 +694,13 @@ static int __vmbus_open(struct vmbus_channel *newchannel, goto error_clean_ring; err = hv_ringbuffer_init(&newchannel->outbound, - page, send_pages, 0); + page, send_pages, 0, newchannel->co_ring_buffer); if (err) goto error_free_gpadl; err = hv_ringbuffer_init(&newchannel->inbound, &page[send_pages], - recv_pages, newchannel->max_pkt_size); + recv_pages, newchannel->max_pkt_size, + newchannel->co_ring_buffer); if (err) goto error_free_gpadl; @@ -881,11 +881,16 @@ post_msg_err: kfree(info); - ret = set_memory_encrypted((unsigned long)gpadl->buffer, - PFN_UP(gpadl->size)); + if (gpadl->decrypted) + ret = set_memory_encrypted((unsigned long)gpadl->buffer, + PFN_UP(gpadl->size)); + else + ret = 0; if (ret) pr_warn("Fail to set mem host visibility in GPADL teardown %d.\n", ret); + gpadl->decrypted = ret; + return ret; } EXPORT_SYMBOL_GPL(vmbus_teardown_gpadl); @@ -941,7 +946,7 @@ static int vmbus_close_internal(struct vmbus_channel *channel) /* Send a closing message */ - msg = &channel->close_msg.msg; + msg = &channel->close_msg; msg->header.msgtype = CHANNELMSG_CLOSECHANNEL; msg->child_relid = channel->offermsg.child_relid; @@ -1094,68 +1099,10 @@ int vmbus_sendpacket(struct vmbus_channel *channel, void *buffer, EXPORT_SYMBOL(vmbus_sendpacket); /* - * vmbus_sendpacket_pagebuffer - Send a range of single-page buffer - * packets using a GPADL Direct packet type. This interface allows you - * to control notifying the host. This will be useful for sending - * batched data. Also the sender can control the send flags - * explicitly. 
- */ -int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel, - struct hv_page_buffer pagebuffers[], - u32 pagecount, void *buffer, u32 bufferlen, - u64 requestid) -{ - int i; - struct vmbus_channel_packet_page_buffer desc; - u32 descsize; - u32 packetlen; - u32 packetlen_aligned; - struct kvec bufferlist[3]; - u64 aligned_data = 0; - - if (pagecount > MAX_PAGE_BUFFER_COUNT) - return -EINVAL; - - /* - * Adjust the size down since vmbus_channel_packet_page_buffer is the - * largest size we support - */ - descsize = sizeof(struct vmbus_channel_packet_page_buffer) - - ((MAX_PAGE_BUFFER_COUNT - pagecount) * - sizeof(struct hv_page_buffer)); - packetlen = descsize + bufferlen; - packetlen_aligned = ALIGN(packetlen, sizeof(u64)); - - /* Setup the descriptor */ - desc.type = VM_PKT_DATA_USING_GPA_DIRECT; - desc.flags = VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED; - desc.dataoffset8 = descsize >> 3; /* in 8-bytes granularity */ - desc.length8 = (u16)(packetlen_aligned >> 3); - desc.transactionid = VMBUS_RQST_ERROR; /* will be updated in hv_ringbuffer_write() */ - desc.reserved = 0; - desc.rangecount = pagecount; - - for (i = 0; i < pagecount; i++) { - desc.range[i].len = pagebuffers[i].len; - desc.range[i].offset = pagebuffers[i].offset; - desc.range[i].pfn = pagebuffers[i].pfn; - } - - bufferlist[0].iov_base = &desc; - bufferlist[0].iov_len = descsize; - bufferlist[1].iov_base = buffer; - bufferlist[1].iov_len = bufferlen; - bufferlist[2].iov_base = &aligned_data; - bufferlist[2].iov_len = (packetlen_aligned - packetlen); - - return hv_ringbuffer_write(channel, bufferlist, 3, requestid, NULL); -} -EXPORT_SYMBOL_GPL(vmbus_sendpacket_pagebuffer); - -/* - * vmbus_sendpacket_multipagebuffer - Send a multi-page buffer packet + * vmbus_sendpacket_mpb_desc - Send one or more multi-page buffer packets * using a GPADL Direct packet type. - * The buffer includes the vmbus descriptor. + * The desc argument must include space for the VMBus descriptor. The + * rangecount field must already be set. 
*/ int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel, struct vmbus_packet_mpb_array *desc, @@ -1177,7 +1124,6 @@ int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel, desc->length8 = (u16)(packetlen_aligned >> 3); desc->transactionid = VMBUS_RQST_ERROR; /* will be updated in hv_ringbuffer_write() */ desc->reserved = 0; - desc->rangecount = 1; bufferlist[0].iov_base = desc; bufferlist[0].iov_len = desc_size; diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index cc23b90cae02..74fed2c073d4 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -20,6 +20,7 @@ #include <linux/delay.h> #include <linux/cpu.h> #include <linux/hyperv.h> +#include <linux/export.h> #include <asm/mshyperv.h> #include <linux/sched/isolation.h> @@ -67,7 +68,7 @@ const struct vmbus_device vmbus_devs[] = { { .dev_type = HV_PCIE, HV_PCIE_GUID, .perf_device = false, - .allowed_in_isolated = false, + .allowed_in_isolated = true, }, /* Synthetic Frame Buffer */ @@ -120,7 +121,9 @@ const struct vmbus_device vmbus_devs[] = { }, /* File copy */ - { .dev_type = HV_FCOPY, + /* fcopy always uses 16KB ring buffer size and is working well for last many years */ + { .pref_ring_size = 0x4000, + .dev_type = HV_FCOPY, HV_FCOPY_GUID, .perf_device = false, .allowed_in_isolated = false, @@ -140,12 +143,19 @@ const struct vmbus_device vmbus_devs[] = { .allowed_in_isolated = false, }, - /* Unknown GUID */ - { .dev_type = HV_UNKNOWN, + /* + * Unknown GUID + * 64 KB ring buffer + 4 KB header should be sufficient size for any Hyper-V device apart + * from HV_NIC and HV_SCSI. This case avoid the fallback for unknown devices to allocate + * much bigger (2 MB) of ring size. + */ + { .pref_ring_size = 0x11000, + .dev_type = HV_UNKNOWN, .perf_device = false, .allowed_in_isolated = false, }, }; +EXPORT_SYMBOL_GPL(vmbus_devs); static const struct { guid_t guid; @@ -829,11 +839,22 @@ static void vmbus_wait_for_unload(void) if (completion_done(&vmbus_connection.unload_event)) goto completed; - for_each_online_cpu(cpu) { + for_each_present_cpu(cpu) { struct hv_per_cpu_context *hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); - page_addr = hv_cpu->synic_message_page; + /* + * In a CoCo VM the hyp_synic_message_page is not allocated + * in hv_synic_alloc(). Instead it is set/cleared in + * hv_hyp_synic_enable_regs() and hv_hyp_synic_disable_regs() + * such that it is set only when the CPU is online. If + * not all present CPUs are online, the message page + * might be NULL, so skip such CPUs. + */ + page_addr = hv_cpu->hyp_synic_message_page; + if (!page_addr) + continue; + msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT; @@ -867,11 +888,14 @@ completed: * maybe-pending messages on all CPUs to be able to receive new * messages after we reconnect. */ - for_each_online_cpu(cpu) { + for_each_present_cpu(cpu) { struct hv_per_cpu_context *hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); - page_addr = hv_cpu->synic_message_page; + page_addr = hv_cpu->hyp_synic_message_page; + if (!page_addr) + continue; + msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT; msg->header.message_type = HVMSG_NONE; } @@ -921,16 +945,6 @@ void vmbus_initiate_unload(bool crash) vmbus_wait_for_unload(); } -static void check_ready_for_resume_event(void) -{ - /* - * If all the old primary channels have been fixed up, then it's safe - * to resume. 
- */ - if (atomic_dec_and_test(&vmbus_connection.nr_chan_fixup_on_resume)) - complete(&vmbus_connection.ready_for_resume_event); -} - static void vmbus_setup_channel_state(struct vmbus_channel *channel, struct vmbus_channel_offer_channel *offer) { @@ -1008,6 +1022,7 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) struct vmbus_channel_offer_channel *offer; struct vmbus_channel *oldchannel, *newchannel; size_t offer_sz; + bool co_ring_buffer, co_external_memory; offer = (struct vmbus_channel_offer_channel *)hdr; @@ -1020,6 +1035,22 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) return; } + co_ring_buffer = is_co_ring_buffer(offer); + co_external_memory = is_co_external_memory(offer); + if (!co_ring_buffer && co_external_memory) { + pr_err("Invalid offer relid=%d: the ring buffer isn't encrypted\n", + offer->child_relid); + return; + } + if (co_ring_buffer || co_external_memory) { + if (vmbus_proto_version < VERSION_WIN10_V6_0 || !vmbus_is_confidential()) { + pr_err("Invalid offer relid=%d: no support for confidential VMBus\n", + offer->child_relid); + atomic_dec(&vmbus_connection.offer_in_progress); + return; + } + } + oldchannel = find_primary_channel_by_offer(offer); if (oldchannel != NULL) { @@ -1086,8 +1117,6 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) /* Add the channel back to the array of channels. */ vmbus_channel_map_relid(oldchannel); - check_ready_for_resume_event(); - mutex_unlock(&vmbus_connection.channel_mutex); return; } @@ -1100,6 +1129,8 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) pr_err("Unable to allocate channel object\n"); return; } + newchannel->co_ring_buffer = co_ring_buffer; + newchannel->co_external_memory = co_external_memory; vmbus_setup_channel_state(newchannel, offer); @@ -1273,13 +1304,28 @@ EXPORT_SYMBOL_GPL(vmbus_hvsock_device_unregister); /* * vmbus_onoffers_delivered - - * This is invoked when all offers have been delivered. + * The CHANNELMSG_ALLOFFERS_DELIVERED message arrives after all + * boot-time offers are delivered. A boot-time offer is for the primary + * channel for any virtual hardware configured in the VM at the time it boots. + * Boot-time offers include offers for physical devices assigned to the VM + * via Hyper-V's Discrete Device Assignment (DDA) functionality that are + * handled as virtual PCI devices in Linux (e.g., NVMe devices and GPUs). + * Boot-time offers do not include offers for VMBus sub-channels. Because + * devices can be hot-added to the VM after it is booted, additional channel + * offers that aren't boot-time offers can be received at any time after the + * all-offers-delivered message. * - * Nothing to do here. + * SR-IOV NIC Virtual Functions (VFs) assigned to a VM are not considered + * to be assigned to the VM at boot-time, and offers for VFs may occur after + * the all-offers-delivered message. VFs are optional accelerators to the + * synthetic VMBus NIC and are effectively hot-added only after the VMBus + * NIC channel is opened (once it knows the guest can support it, via the + * sriov bit in the netvsc protocol). */ static void vmbus_onoffers_delivered( struct vmbus_channel_message_header *hdr) { + complete(&vmbus_connection.all_offers_delivered_event); } /* @@ -1555,7 +1601,8 @@ void vmbus_onmessage(struct vmbus_channel_message_header *hdr) } /* - * vmbus_request_offers - Send a request to get all our pending offers. 
+ * vmbus_request_offers - Send a request to get all our pending offers + * and wait for all boot-time offers to arrive. */ int vmbus_request_offers(void) { @@ -1573,6 +1620,10 @@ int vmbus_request_offers(void) msg->msgtype = CHANNELMSG_REQUESTOFFERS; + /* + * This REQUESTOFFERS message will result in the host sending an all + * offers delivered message after all the boot-time offers are sent. + */ ret = vmbus_post_msg(msg, sizeof(struct vmbus_channel_message_header), true); @@ -1584,6 +1635,29 @@ int vmbus_request_offers(void) goto cleanup; } + /* + * Wait for the host to send all boot-time offers. + * Keeping it as a best-effort mechanism, where a warning is + * printed if a timeout occurs, and execution is resumed. + */ + if (!wait_for_completion_timeout(&vmbus_connection.all_offers_delivered_event, + secs_to_jiffies(60))) { + pr_warn("timed out waiting for all boot-time offers to be delivered.\n"); + } + + /* + * Flush handling of offer messages (which may initiate work on + * other work queues). + */ + flush_workqueue(vmbus_connection.work_queue); + + /* + * Flush workqueue for processing the incoming offers. Subchannel + * offers and their processing can happen later, so there is no need to + * flush that workqueue here. + */ + flush_workqueue(vmbus_connection.handle_primary_chan_wq); + cleanup: kfree(msginfo); diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index 9dc27e5d367a..5d9cb5bf2d62 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -34,8 +34,8 @@ struct vmbus_connection vmbus_connection = { .ready_for_suspend_event = COMPLETION_INITIALIZER( vmbus_connection.ready_for_suspend_event), - .ready_for_resume_event = COMPLETION_INITIALIZER( - vmbus_connection.ready_for_resume_event), + .all_offers_delivered_event = COMPLETION_INITIALIZER( + vmbus_connection.all_offers_delivered_event), }; EXPORT_SYMBOL_GPL(vmbus_connection); @@ -51,6 +51,7 @@ EXPORT_SYMBOL_GPL(vmbus_proto_version); * Linux guests and are not listed. */ static __u32 vmbus_versions[] = { + VERSION_WIN10_V6_0, VERSION_WIN10_V5_3, VERSION_WIN10_V5_2, VERSION_WIN10_V5_1, @@ -65,7 +66,7 @@ static __u32 vmbus_versions[] = { * Maximal VMBus protocol version guests can negotiate. Useful to cap the * VMBus version for testing and debugging purpose. 
*/ -static uint max_version = VERSION_WIN10_V5_3; +static uint max_version = VERSION_WIN10_V6_0; module_param(max_version, uint, S_IRUGO); MODULE_PARM_DESC(max_version, @@ -98,14 +99,24 @@ int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version) */ if (version >= VERSION_WIN10_V5) { msg->msg_sint = VMBUS_MESSAGE_SINT; + msg->msg_vtl = ms_hyperv.vtl; vmbus_connection.msg_conn_id = VMBUS_MESSAGE_CONNECTION_ID_4; } else { msg->interrupt_page = virt_to_phys(vmbus_connection.int_page); vmbus_connection.msg_conn_id = VMBUS_MESSAGE_CONNECTION_ID; } - msg->monitor_page1 = vmbus_connection.monitor_pages_pa[0]; - msg->monitor_page2 = vmbus_connection.monitor_pages_pa[1]; + if (vmbus_is_confidential() && version >= VERSION_WIN10_V6_0) + msg->feature_flags = VMBUS_FEATURE_FLAG_CONFIDENTIAL_CHANNELS; + + /* + * shared_gpa_boundary is zero in non-SNP VMs, so it's safe to always + * bitwise OR it + */ + msg->monitor_page1 = virt_to_phys(vmbus_connection.monitor_pages[0]) | + ms_hyperv.shared_gpa_boundary; + msg->monitor_page2 = virt_to_phys(vmbus_connection.monitor_pages[1]) | + ms_hyperv.shared_gpa_boundary; msg->target_vcpu = hv_cpu_number_to_vp_number(VMBUS_CONNECT_CPU); @@ -200,11 +211,19 @@ int vmbus_connect(void) mutex_init(&vmbus_connection.channel_mutex); /* + * The following Hyper-V interrupt and monitor pages can be used by + * UIO for mapping to user-space, so they should always be allocated on + * system page boundaries. The system page size must be >= the Hyper-V + * page size. + */ + BUILD_BUG_ON(PAGE_SIZE < HV_HYP_PAGE_SIZE); + + /* * Setup the vmbus event connection for channel interrupt * abstraction stuff */ vmbus_connection.int_page = - (void *)hv_alloc_hyperv_zeroed_page(); + (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO); if (vmbus_connection.int_page == NULL) { ret = -ENOMEM; goto cleanup; @@ -219,73 +238,37 @@ int vmbus_connect(void) * Setup the monitor notification facility. The 1st page for * parent->child and the 2nd page for child->parent */ - vmbus_connection.monitor_pages[0] = (void *)hv_alloc_hyperv_zeroed_page(); - vmbus_connection.monitor_pages[1] = (void *)hv_alloc_hyperv_zeroed_page(); + vmbus_connection.monitor_pages[0] = (void *)__get_free_page(GFP_KERNEL); + vmbus_connection.monitor_pages[1] = (void *)__get_free_page(GFP_KERNEL); if ((vmbus_connection.monitor_pages[0] == NULL) || (vmbus_connection.monitor_pages[1] == NULL)) { ret = -ENOMEM; goto cleanup; } - vmbus_connection.monitor_pages_original[0] - = vmbus_connection.monitor_pages[0]; - vmbus_connection.monitor_pages_original[1] - = vmbus_connection.monitor_pages[1]; - vmbus_connection.monitor_pages_pa[0] - = virt_to_phys(vmbus_connection.monitor_pages[0]); - vmbus_connection.monitor_pages_pa[1] - = virt_to_phys(vmbus_connection.monitor_pages[1]); - - if (hv_is_isolation_supported()) { - ret = set_memory_decrypted((unsigned long) - vmbus_connection.monitor_pages[0], - 1); - ret |= set_memory_decrypted((unsigned long) - vmbus_connection.monitor_pages[1], - 1); - if (ret) - goto cleanup; - + ret = set_memory_decrypted((unsigned long) + vmbus_connection.monitor_pages[0], 1); + ret |= set_memory_decrypted((unsigned long) + vmbus_connection.monitor_pages[1], 1); + if (ret) { /* - * Isolation VM with AMD SNP needs to access monitor page via - * address space above shared gpa boundary. + * If set_memory_decrypted() fails, the encryption state + * of the memory is unknown. So leak the memory instead + * of risking returning decrypted memory to the free list. 
+ * For simplicity, always handle both pages the same. */ - if (hv_isolation_type_snp()) { - vmbus_connection.monitor_pages_pa[0] += - ms_hyperv.shared_gpa_boundary; - vmbus_connection.monitor_pages_pa[1] += - ms_hyperv.shared_gpa_boundary; - - vmbus_connection.monitor_pages[0] - = memremap(vmbus_connection.monitor_pages_pa[0], - HV_HYP_PAGE_SIZE, - MEMREMAP_WB); - if (!vmbus_connection.monitor_pages[0]) { - ret = -ENOMEM; - goto cleanup; - } - - vmbus_connection.monitor_pages[1] - = memremap(vmbus_connection.monitor_pages_pa[1], - HV_HYP_PAGE_SIZE, - MEMREMAP_WB); - if (!vmbus_connection.monitor_pages[1]) { - ret = -ENOMEM; - goto cleanup; - } - } - - /* - * Set memory host visibility hvcall smears memory - * and so zero monitor pages here. - */ - memset(vmbus_connection.monitor_pages[0], 0x00, - HV_HYP_PAGE_SIZE); - memset(vmbus_connection.monitor_pages[1], 0x00, - HV_HYP_PAGE_SIZE); - + vmbus_connection.monitor_pages[0] = NULL; + vmbus_connection.monitor_pages[1] = NULL; + goto cleanup; } + /* + * Set_memory_decrypted() will change the memory contents if + * decryption occurs, so zero monitor pages here. + */ + memset(vmbus_connection.monitor_pages[0], 0x00, HV_HYP_PAGE_SIZE); + memset(vmbus_connection.monitor_pages[1], 0x00, HV_HYP_PAGE_SIZE); + msginfo = kzalloc(sizeof(*msginfo) + sizeof(struct vmbus_channel_initiate_contact), GFP_KERNEL); @@ -372,35 +355,25 @@ void vmbus_disconnect(void) destroy_workqueue(vmbus_connection.work_queue); if (vmbus_connection.int_page) { - hv_free_hyperv_page((unsigned long)vmbus_connection.int_page); + free_page((unsigned long)vmbus_connection.int_page); vmbus_connection.int_page = NULL; } - if (hv_is_isolation_supported()) { - /* - * memunmap() checks input address is ioremap address or not - * inside. It doesn't unmap any thing in the non-SNP CVM and - * so not check CVM type here. 
- */ - memunmap(vmbus_connection.monitor_pages[0]); - memunmap(vmbus_connection.monitor_pages[1]); - - set_memory_encrypted((unsigned long) - vmbus_connection.monitor_pages_original[0], - 1); - set_memory_encrypted((unsigned long) - vmbus_connection.monitor_pages_original[1], - 1); + if (vmbus_connection.monitor_pages[0]) { + if (!set_memory_encrypted( + (unsigned long)vmbus_connection.monitor_pages[0], 1)) + free_page((unsigned long) + vmbus_connection.monitor_pages[0]); + vmbus_connection.monitor_pages[0] = NULL; } - hv_free_hyperv_page((unsigned long) - vmbus_connection.monitor_pages_original[0]); - hv_free_hyperv_page((unsigned long) - vmbus_connection.monitor_pages_original[1]); - vmbus_connection.monitor_pages_original[0] = - vmbus_connection.monitor_pages[0] = NULL; - vmbus_connection.monitor_pages_original[1] = + if (vmbus_connection.monitor_pages[1]) { + if (!set_memory_encrypted( + (unsigned long)vmbus_connection.monitor_pages[1], 1)) + free_page((unsigned long) + vmbus_connection.monitor_pages[1]); vmbus_connection.monitor_pages[1] = NULL; + } } /* @@ -409,6 +382,10 @@ void vmbus_disconnect(void) */ struct vmbus_channel *relid2channel(u32 relid) { + if (vmbus_connection.channels == NULL) { + pr_warn_once("relid2channel: relid=%d: No channels mapped!\n", relid); + return NULL; + } if (WARN_ON(relid >= MAX_CHANNEL_RELIDS)) return NULL; return READ_ONCE(vmbus_connection.channels[relid]); @@ -536,10 +513,20 @@ void vmbus_set_event(struct vmbus_channel *channel) ++channel->sig_events; - if (hv_isolation_type_snp()) - hv_ghcb_hypercall(HVCALL_SIGNAL_EVENT, &channel->sig_event, - NULL, sizeof(channel->sig_event)); - else - hv_do_fast_hypercall8(HVCALL_SIGNAL_EVENT, channel->sig_event); + if (ms_hyperv.paravisor_present) { + if (hv_isolation_type_snp()) + hv_ghcb_hypercall(HVCALL_SIGNAL_EVENT, &channel->sig_event, + NULL, sizeof(channel->sig_event)); + else if (hv_isolation_type_tdx()) + hv_tdx_hypercall(HVCALL_SIGNAL_EVENT | HV_HYPERCALL_FAST_BIT, + channel->sig_event, 0); + else + WARN_ON_ONCE(1); + } else { + u64 control = HVCALL_SIGNAL_EVENT; + + control |= hv_nested ? HV_HYPERCALL_NESTED : 0; + hv_do_fast_hypercall8(control, channel->sig_event); + } } EXPORT_SYMBOL_GPL(vmbus_set_event); diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index 4d6480d57546..c100f04b3581 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -18,12 +18,15 @@ #include <linux/clockchips.h> #include <linux/delay.h> #include <linux/interrupt.h> +#include <linux/export.h> #include <clocksource/hyperv_timer.h> #include <asm/mshyperv.h> +#include <linux/set_memory.h> #include "hyperv_vmbus.h" /* The one and only */ struct hv_context hv_context; +EXPORT_SYMBOL_FOR_MODULES(hv_context, "mshv_vtl"); /* * hv_init - Main initialization routine. @@ -39,86 +42,136 @@ int hv_init(void) } /* - * Functions for allocating and freeing memory with size and - * alignment HV_HYP_PAGE_SIZE. These functions are needed because - * the guest page size may not be the same as the Hyper-V page - * size. We depend upon kmalloc() aligning power-of-two size - * allocations to the allocation size boundary, so that the - * allocated memory appears to Hyper-V as a page of the size - * it expects. 
- */ - -void *hv_alloc_hyperv_page(void) -{ - BUILD_BUG_ON(PAGE_SIZE < HV_HYP_PAGE_SIZE); - - if (PAGE_SIZE == HV_HYP_PAGE_SIZE) - return (void *)__get_free_page(GFP_KERNEL); - else - return kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL); -} - -void *hv_alloc_hyperv_zeroed_page(void) -{ - if (PAGE_SIZE == HV_HYP_PAGE_SIZE) - return (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO); - else - return kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL); -} - -void hv_free_hyperv_page(unsigned long addr) -{ - if (PAGE_SIZE == HV_HYP_PAGE_SIZE) - free_page(addr); - else - kfree((void *)addr); -} - -/* * hv_post_message - Post a message using the hypervisor message IPC. * * This involves a hypercall. */ int hv_post_message(union hv_connection_id connection_id, - enum hv_message_type message_type, - void *payload, size_t payload_size) + enum hv_message_type message_type, + void *payload, size_t payload_size) { struct hv_input_post_message *aligned_msg; - struct hv_per_cpu_context *hv_cpu; + unsigned long flags; u64 status; if (payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT) return -EMSGSIZE; - hv_cpu = get_cpu_ptr(hv_context.cpu_context); - aligned_msg = hv_cpu->post_msg_page; + local_irq_save(flags); + + /* + * A TDX VM with the paravisor must use the decrypted post_msg_page: see + * the comment in struct hv_per_cpu_context. A SNP VM with the paravisor + * can use the encrypted hyperv_pcpu_input_arg because it copies the + * input into the GHCB page, which has been decrypted by the paravisor. + */ + if (hv_isolation_type_tdx() && ms_hyperv.paravisor_present) + aligned_msg = this_cpu_ptr(hv_context.cpu_context)->post_msg_page; + else + aligned_msg = *this_cpu_ptr(hyperv_pcpu_input_arg); + aligned_msg->connectionid = connection_id; aligned_msg->reserved = 0; aligned_msg->message_type = message_type; aligned_msg->payload_size = payload_size; memcpy((void *)aligned_msg->payload, payload, payload_size); - if (hv_isolation_type_snp()) - status = hv_ghcb_hypercall(HVCALL_POST_MESSAGE, - (void *)aligned_msg, NULL, - sizeof(*aligned_msg)); - else - status = hv_do_hypercall(HVCALL_POST_MESSAGE, - aligned_msg, NULL); + if (ms_hyperv.paravisor_present && !vmbus_is_confidential()) { + /* + * If the VMBus isn't confidential, use the CoCo-specific + * mechanism to communicate with the hypervisor. + */ + if (hv_isolation_type_tdx()) + status = hv_tdx_hypercall(HVCALL_POST_MESSAGE, + virt_to_phys(aligned_msg), 0); + else if (hv_isolation_type_snp()) + status = hv_ghcb_hypercall(HVCALL_POST_MESSAGE, + aligned_msg, NULL, + sizeof(*aligned_msg)); + else + status = HV_STATUS_INVALID_PARAMETER; + } else { + u64 control = HVCALL_POST_MESSAGE; - /* Preemption must remain disabled until after the hypercall - * so some other thread can't get scheduled onto this cpu and - * corrupt the per-cpu post_msg_page - */ - put_cpu_ptr(hv_cpu); + control |= hv_nested ? HV_HYPERCALL_NESTED : 0; + /* + * If there is no paravisor, this will go to the hypervisor. + * In the Confidential VMBus case, there is the paravisor + * to which this will trap. + */ + status = hv_do_hypercall(control, aligned_msg, NULL); + } + + local_irq_restore(flags); return hv_result(status); } +EXPORT_SYMBOL_FOR_MODULES(hv_post_message, "mshv_vtl"); + +static int hv_alloc_page(void **page, bool decrypt, const char *note) +{ + int ret = 0; + + /* + * After the page changes its encryption status, its contents might + * appear scrambled on some hardware. Thus `get_zeroed_page` would + * zero the page out in vain, so do that explicitly exactly once. 
+ * + * By default, the page is allocated encrypted in a CoCo VM. + */ + *page = (void *)__get_free_page(GFP_KERNEL); + if (!*page) + return -ENOMEM; + + if (decrypt) + ret = set_memory_decrypted((unsigned long)*page, 1); + if (ret) + goto failed; + + memset(*page, 0, PAGE_SIZE); + return 0; + +failed: + /* + * Report the failure but don't put the page back on the free list as + * its encryption status is unknown. + */ + pr_err("allocation failed for %s page, error %d, decrypted %d\n", + note, ret, decrypt); + *page = NULL; + return ret; +} + +static int hv_free_page(void **page, bool encrypt, const char *note) +{ + int ret = 0; + + if (!*page) + return 0; + + if (encrypt) + ret = set_memory_encrypted((unsigned long)*page, 1); + + /* + * In the case of the failure, the page is leaked. Something is wrong, + * prefer to lose the page with the unknown encryption status and stay afloat. + */ + if (ret) + pr_err("deallocation failed for %s page, error %d, encrypt %d\n", + note, ret, encrypt); + else + free_page((unsigned long)*page); + + *page = NULL; + + return ret; +} int hv_synic_alloc(void) { - int cpu; + int cpu, ret = -ENOMEM; struct hv_per_cpu_context *hv_cpu; + const bool decrypt = !vmbus_is_confidential(); /* * First, zero all per-cpu memory areas so hv_synic_free() can @@ -132,7 +185,7 @@ int hv_synic_alloc(void) hv_context.hv_numa_map = kcalloc(nr_node_ids, sizeof(struct cpumask), GFP_KERNEL); - if (hv_context.hv_numa_map == NULL) { + if (!hv_context.hv_numa_map) { pr_err("Unable to allocate NUMA map\n"); goto err; } @@ -141,204 +194,299 @@ int hv_synic_alloc(void) hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); tasklet_init(&hv_cpu->msg_dpc, - vmbus_on_msg_dpc, (unsigned long) hv_cpu); + vmbus_on_msg_dpc, (unsigned long)hv_cpu); + + if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) { + ret = hv_alloc_page(&hv_cpu->post_msg_page, + decrypt, "post msg"); + if (ret) + goto err; + } /* - * Synic message and event pages are allocated by paravisor. - * Skip these pages allocation here. + * If these SynIC pages are not allocated, SIEF and SIM pages + * are configured using what the root partition or the paravisor + * provides upon reading the SIEFP and SIMP registers. 
*/ - if (!hv_isolation_type_snp()) { - hv_cpu->synic_message_page = - (void *)get_zeroed_page(GFP_ATOMIC); - if (hv_cpu->synic_message_page == NULL) { - pr_err("Unable to allocate SYNIC message page\n"); + if (!ms_hyperv.paravisor_present && !hv_root_partition()) { + ret = hv_alloc_page(&hv_cpu->hyp_synic_message_page, + decrypt, "hypervisor SynIC msg"); + if (ret) goto err; - } - - hv_cpu->synic_event_page = - (void *)get_zeroed_page(GFP_ATOMIC); - if (hv_cpu->synic_event_page == NULL) { - pr_err("Unable to allocate SYNIC event page\n"); + ret = hv_alloc_page(&hv_cpu->hyp_synic_event_page, + decrypt, "hypervisor SynIC event"); + if (ret) goto err; - } } - hv_cpu->post_msg_page = (void *)get_zeroed_page(GFP_ATOMIC); - if (hv_cpu->post_msg_page == NULL) { - pr_err("Unable to allocate post msg page\n"); - goto err; + if (vmbus_is_confidential()) { + ret = hv_alloc_page(&hv_cpu->para_synic_message_page, + false, "paravisor SynIC msg"); + if (ret) + goto err; + ret = hv_alloc_page(&hv_cpu->para_synic_event_page, + false, "paravisor SynIC event"); + if (ret) + goto err; } } return 0; + err: /* * Any memory allocations that succeeded will be freed when * the caller cleans up by calling hv_synic_free() */ - return -ENOMEM; + return ret; } - void hv_synic_free(void) { int cpu; + const bool encrypt = !vmbus_is_confidential(); for_each_present_cpu(cpu) { - struct hv_per_cpu_context *hv_cpu - = per_cpu_ptr(hv_context.cpu_context, cpu); - - free_page((unsigned long)hv_cpu->synic_event_page); - free_page((unsigned long)hv_cpu->synic_message_page); - free_page((unsigned long)hv_cpu->post_msg_page); + struct hv_per_cpu_context *hv_cpu = + per_cpu_ptr(hv_context.cpu_context, cpu); + + if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) + hv_free_page(&hv_cpu->post_msg_page, + encrypt, "post msg"); + if (!ms_hyperv.paravisor_present && !hv_root_partition()) { + hv_free_page(&hv_cpu->hyp_synic_event_page, + encrypt, "hypervisor SynIC event"); + hv_free_page(&hv_cpu->hyp_synic_message_page, + encrypt, "hypervisor SynIC msg"); + } + if (vmbus_is_confidential()) { + hv_free_page(&hv_cpu->para_synic_event_page, + false, "paravisor SynIC event"); + hv_free_page(&hv_cpu->para_synic_message_page, + false, "paravisor SynIC msg"); + } } kfree(hv_context.hv_numa_map); } /* - * hv_synic_init - Initialize the Synthetic Interrupt Controller. - * - * If it is already initialized by another entity (ie x2v shim), we need to - * retrieve the initialized message and event pages. Otherwise, we create and - * initialize the message and event pages. + * hv_hyp_synic_enable_regs - Initialize the Synthetic Interrupt Controller + * with the hypervisor. */ -void hv_synic_enable_regs(unsigned int cpu) +void hv_hyp_synic_enable_regs(unsigned int cpu) { - struct hv_per_cpu_context *hv_cpu - = per_cpu_ptr(hv_context.cpu_context, cpu); + struct hv_per_cpu_context *hv_cpu = + per_cpu_ptr(hv_context.cpu_context, cpu); union hv_synic_simp simp; union hv_synic_siefp siefp; union hv_synic_sint shared_sint; - union hv_synic_scontrol sctrl; - /* Setup the Synic's message page */ - simp.as_uint64 = hv_get_register(HV_REGISTER_SIMP); + /* Setup the Synic's message page with the hypervisor. 
*/ + simp.as_uint64 = hv_get_msr(HV_MSR_SIMP); simp.simp_enabled = 1; - if (hv_isolation_type_snp()) { - hv_cpu->synic_message_page - = memremap(simp.base_simp_gpa << HV_HYP_PAGE_SHIFT, - HV_HYP_PAGE_SIZE, MEMREMAP_WB); - if (!hv_cpu->synic_message_page) - pr_err("Fail to map syinc message page.\n"); + if (ms_hyperv.paravisor_present || hv_root_partition()) { + /* Mask out vTOM bit. ioremap_cache() maps decrypted */ + u64 base = (simp.base_simp_gpa << HV_HYP_PAGE_SHIFT) & + ~ms_hyperv.shared_gpa_boundary; + hv_cpu->hyp_synic_message_page = + (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE); + if (!hv_cpu->hyp_synic_message_page) + pr_err("Fail to map synic message page.\n"); } else { - simp.base_simp_gpa = virt_to_phys(hv_cpu->synic_message_page) + simp.base_simp_gpa = virt_to_phys(hv_cpu->hyp_synic_message_page) >> HV_HYP_PAGE_SHIFT; } - hv_set_register(HV_REGISTER_SIMP, simp.as_uint64); + hv_set_msr(HV_MSR_SIMP, simp.as_uint64); - /* Setup the Synic's event page */ - siefp.as_uint64 = hv_get_register(HV_REGISTER_SIEFP); + /* Setup the Synic's event page with the hypervisor. */ + siefp.as_uint64 = hv_get_msr(HV_MSR_SIEFP); siefp.siefp_enabled = 1; - if (hv_isolation_type_snp()) { - hv_cpu->synic_event_page = - memremap(siefp.base_siefp_gpa << HV_HYP_PAGE_SHIFT, - HV_HYP_PAGE_SIZE, MEMREMAP_WB); - - if (!hv_cpu->synic_event_page) - pr_err("Fail to map syinc event page.\n"); + if (ms_hyperv.paravisor_present || hv_root_partition()) { + /* Mask out vTOM bit. ioremap_cache() maps decrypted */ + u64 base = (siefp.base_siefp_gpa << HV_HYP_PAGE_SHIFT) & + ~ms_hyperv.shared_gpa_boundary; + hv_cpu->hyp_synic_event_page = + (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE); + if (!hv_cpu->hyp_synic_event_page) + pr_err("Fail to map synic event page.\n"); } else { - siefp.base_siefp_gpa = virt_to_phys(hv_cpu->synic_event_page) + siefp.base_siefp_gpa = virt_to_phys(hv_cpu->hyp_synic_event_page) >> HV_HYP_PAGE_SHIFT; } - hv_set_register(HV_REGISTER_SIEFP, siefp.as_uint64); + hv_set_msr(HV_MSR_SIEFP, siefp.as_uint64); + hv_enable_coco_interrupt(cpu, vmbus_interrupt, true); /* Setup the shared SINT. */ if (vmbus_irq != -1) enable_percpu_irq(vmbus_irq, 0); - shared_sint.as_uint64 = hv_get_register(HV_REGISTER_SINT0 + - VMBUS_MESSAGE_SINT); + shared_sint.as_uint64 = hv_get_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT); shared_sint.vector = vmbus_interrupt; shared_sint.masked = false; + shared_sint.auto_eoi = hv_recommend_using_aeoi(); + hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64); +} - /* - * On architectures where Hyper-V doesn't support AEOI (e.g., ARM64), - * it doesn't provide a recommendation flag and AEOI must be disabled. - */ -#ifdef HV_DEPRECATING_AEOI_RECOMMENDED - shared_sint.auto_eoi = - !(ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED); -#else - shared_sint.auto_eoi = 0; -#endif - hv_set_register(HV_REGISTER_SINT0 + VMBUS_MESSAGE_SINT, - shared_sint.as_uint64); +static void hv_hyp_synic_enable_interrupts(void) +{ + union hv_synic_scontrol sctrl; /* Enable the global synic bit */ - sctrl.as_uint64 = hv_get_register(HV_REGISTER_SCONTROL); + sctrl.as_uint64 = hv_get_msr(HV_MSR_SCONTROL); sctrl.enable = 1; - hv_set_register(HV_REGISTER_SCONTROL, sctrl.as_uint64); + hv_set_msr(HV_MSR_SCONTROL, sctrl.as_uint64); +} + +static void hv_para_synic_enable_regs(unsigned int cpu) +{ + union hv_synic_simp simp; + union hv_synic_siefp siefp; + struct hv_per_cpu_context *hv_cpu + = per_cpu_ptr(hv_context.cpu_context, cpu); + + /* Setup the Synic's message page with the paravisor. 
*/ + simp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIMP); + simp.simp_enabled = 1; + simp.base_simp_gpa = virt_to_phys(hv_cpu->para_synic_message_page) + >> HV_HYP_PAGE_SHIFT; + hv_para_set_synic_register(HV_MSR_SIMP, simp.as_uint64); + + /* Setup the Synic's event page with the paravisor. */ + siefp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIEFP); + siefp.siefp_enabled = 1; + siefp.base_siefp_gpa = virt_to_phys(hv_cpu->para_synic_event_page) + >> HV_HYP_PAGE_SHIFT; + hv_para_set_synic_register(HV_MSR_SIEFP, siefp.as_uint64); +} + +static void hv_para_synic_enable_interrupts(void) +{ + union hv_synic_scontrol sctrl; + + /* Enable the global synic bit */ + sctrl.as_uint64 = hv_para_get_synic_register(HV_MSR_SCONTROL); + sctrl.enable = 1; + hv_para_set_synic_register(HV_MSR_SCONTROL, sctrl.as_uint64); } int hv_synic_init(unsigned int cpu) { - hv_synic_enable_regs(cpu); + if (vmbus_is_confidential()) + hv_para_synic_enable_regs(cpu); + + /* + * The SINT is set in hv_hyp_synic_enable_regs() by calling + * hv_set_msr(). hv_set_msr() in turn has special case code for the + * SINT MSRs that write to the hypervisor version of the MSR *and* + * the paravisor version of the MSR (but *without* the proxy bit when + * VMBus is confidential). + * + * Then enable interrupts via the paravisor if VMBus is confidential, + * and otherwise via the hypervisor. + */ + + hv_hyp_synic_enable_regs(cpu); + if (vmbus_is_confidential()) + hv_para_synic_enable_interrupts(); + else + hv_hyp_synic_enable_interrupts(); hv_stimer_legacy_init(cpu, VMBUS_MESSAGE_SINT); return 0; } -/* - * hv_synic_cleanup - Cleanup routine for hv_synic_init(). - */ -void hv_synic_disable_regs(unsigned int cpu) +void hv_hyp_synic_disable_regs(unsigned int cpu) { - struct hv_per_cpu_context *hv_cpu - = per_cpu_ptr(hv_context.cpu_context, cpu); + struct hv_per_cpu_context *hv_cpu = + per_cpu_ptr(hv_context.cpu_context, cpu); union hv_synic_sint shared_sint; union hv_synic_simp simp; union hv_synic_siefp siefp; - union hv_synic_scontrol sctrl; - shared_sint.as_uint64 = hv_get_register(HV_REGISTER_SINT0 + - VMBUS_MESSAGE_SINT); + shared_sint.as_uint64 = hv_get_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT); shared_sint.masked = 1; /* Need to correctly cleanup in the case of SMP!!! */ /* Disable the interrupt */ - hv_set_register(HV_REGISTER_SINT0 + VMBUS_MESSAGE_SINT, - shared_sint.as_uint64); + hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64); + hv_enable_coco_interrupt(cpu, vmbus_interrupt, false); - simp.as_uint64 = hv_get_register(HV_REGISTER_SIMP); + simp.as_uint64 = hv_get_msr(HV_MSR_SIMP); /* - * In Isolation VM, sim and sief pages are allocated by + * In Isolation VM, simp and sief pages are allocated by * paravisor. These pages also will be used by kdump * kernel. So just reset enable bit here and keep page * addresses. 
*/ simp.simp_enabled = 0; - if (hv_isolation_type_snp()) - memunmap(hv_cpu->synic_message_page); - else + if (ms_hyperv.paravisor_present || hv_root_partition()) { + if (hv_cpu->hyp_synic_message_page) { + iounmap(hv_cpu->hyp_synic_message_page); + hv_cpu->hyp_synic_message_page = NULL; + } + } else { simp.base_simp_gpa = 0; + } - hv_set_register(HV_REGISTER_SIMP, simp.as_uint64); + hv_set_msr(HV_MSR_SIMP, simp.as_uint64); - siefp.as_uint64 = hv_get_register(HV_REGISTER_SIEFP); + siefp.as_uint64 = hv_get_msr(HV_MSR_SIEFP); siefp.siefp_enabled = 0; - if (hv_isolation_type_snp()) - memunmap(hv_cpu->synic_event_page); - else + if (ms_hyperv.paravisor_present || hv_root_partition()) { + if (hv_cpu->hyp_synic_event_page) { + iounmap(hv_cpu->hyp_synic_event_page); + hv_cpu->hyp_synic_event_page = NULL; + } + } else { siefp.base_siefp_gpa = 0; + } + + hv_set_msr(HV_MSR_SIEFP, siefp.as_uint64); +} - hv_set_register(HV_REGISTER_SIEFP, siefp.as_uint64); +static void hv_hyp_synic_disable_interrupts(void) +{ + union hv_synic_scontrol sctrl; /* Disable the global synic bit */ - sctrl.as_uint64 = hv_get_register(HV_REGISTER_SCONTROL); + sctrl.as_uint64 = hv_get_msr(HV_MSR_SCONTROL); sctrl.enable = 0; - hv_set_register(HV_REGISTER_SCONTROL, sctrl.as_uint64); + hv_set_msr(HV_MSR_SCONTROL, sctrl.as_uint64); +} - if (vmbus_irq != -1) - disable_percpu_irq(vmbus_irq); +static void hv_para_synic_disable_regs(unsigned int cpu) +{ + union hv_synic_simp simp; + union hv_synic_siefp siefp; + + /* Disable SynIC's message page in the paravisor. */ + simp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIMP); + simp.simp_enabled = 0; + hv_para_set_synic_register(HV_MSR_SIMP, simp.as_uint64); + + /* Disable SynIC's event page in the paravisor. */ + siefp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIEFP); + siefp.siefp_enabled = 0; + hv_para_set_synic_register(HV_MSR_SIEFP, siefp.as_uint64); +} + +static void hv_para_synic_disable_interrupts(void) +{ + union hv_synic_scontrol sctrl; + + /* Disable the global synic bit */ + sctrl.as_uint64 = hv_para_get_synic_register(HV_MSR_SCONTROL); + sctrl.enable = 0; + hv_para_set_synic_register(HV_MSR_SCONTROL, sctrl.as_uint64); } #define HV_MAX_TRIES 3 @@ -351,16 +499,18 @@ void hv_synic_disable_regs(unsigned int cpu) * that the normal interrupt handling mechanism will find and process the channel interrupt * "very soon", and in the process clear the bit. 
*/ -static bool hv_synic_event_pending(void) +static bool __hv_synic_event_pending(union hv_synic_event_flags *event, int sint) { - struct hv_per_cpu_context *hv_cpu = this_cpu_ptr(hv_context.cpu_context); - union hv_synic_event_flags *event = - (union hv_synic_event_flags *)hv_cpu->synic_event_page + VMBUS_MESSAGE_SINT; - unsigned long *recv_int_page = event->flags; /* assumes VMBus version >= VERSION_WIN8 */ + unsigned long *recv_int_page; bool pending; u32 relid; int tries = 0; + if (!event) + return false; + + event += sint; + recv_int_page = event->flags; /* assumes VMBus version >= VERSION_WIN8 */ retry: pending = false; for_each_set_bit(relid, recv_int_page, HV_EVENT_FLAGS_COUNT) { @@ -377,10 +527,58 @@ retry: return pending; } +static bool hv_synic_event_pending(void) +{ + struct hv_per_cpu_context *hv_cpu = this_cpu_ptr(hv_context.cpu_context); + union hv_synic_event_flags *hyp_synic_event_page = hv_cpu->hyp_synic_event_page; + union hv_synic_event_flags *para_synic_event_page = hv_cpu->para_synic_event_page; + + return + __hv_synic_event_pending(hyp_synic_event_page, VMBUS_MESSAGE_SINT) || + __hv_synic_event_pending(para_synic_event_page, VMBUS_MESSAGE_SINT); +} + +static int hv_pick_new_cpu(struct vmbus_channel *channel) +{ + int ret = -EBUSY; + int start; + int cpu; + + lockdep_assert_cpus_held(); + lockdep_assert_held(&vmbus_connection.channel_mutex); + + /* + * We can't assume that the relevant interrupts will be sent before + * the cpu is offlined on older versions of hyperv. + */ + if (vmbus_proto_version < VERSION_WIN10_V5_3) + return -EBUSY; + + start = get_random_u32_below(nr_cpu_ids); + + for_each_cpu_wrap(cpu, cpu_online_mask, start) { + if (channel->target_cpu == cpu || + channel->target_cpu == VMBUS_CONNECT_CPU) + continue; + + ret = vmbus_channel_set_cpu(channel, cpu); + if (!ret) + break; + } + + if (ret) + ret = vmbus_channel_set_cpu(channel, VMBUS_CONNECT_CPU); + + return ret; +} + +/* + * hv_synic_cleanup - Cleanup routine for hv_synic_init(). + */ int hv_synic_cleanup(unsigned int cpu) { struct vmbus_channel *channel, *sc; - bool channel_found = false; + int ret = 0; if (vmbus_connection.conn_state != CONNECTED) goto always_cleanup; @@ -397,38 +595,34 @@ int hv_synic_cleanup(unsigned int cpu) /* * Search for channels which are bound to the CPU we're about to - * cleanup. In case we find one and vmbus is still connected, we - * fail; this will effectively prevent CPU offlining. - * - * TODO: Re-bind the channels to different CPUs. + * cleanup. */ mutex_lock(&vmbus_connection.channel_mutex); list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { if (channel->target_cpu == cpu) { - channel_found = true; - break; + ret = hv_pick_new_cpu(channel); + if (ret) { + mutex_unlock(&vmbus_connection.channel_mutex); + return ret; + } } list_for_each_entry(sc, &channel->sc_list, sc_list) { if (sc->target_cpu == cpu) { - channel_found = true; - break; + ret = hv_pick_new_cpu(sc); + if (ret) { + mutex_unlock(&vmbus_connection.channel_mutex); + return ret; + } } } - if (channel_found) - break; } mutex_unlock(&vmbus_connection.channel_mutex); - if (channel_found) - return -EBUSY; - /* - * channel_found == false means that any channels that were previously - * assigned to the CPU have been reassigned elsewhere with a call of - * vmbus_send_modifychannel(). Scan the event flags page looking for - * bits that are set and waiting with a timeout for vmbus_chan_sched() - * to process such bits. 
If bits are still set after this operation - * and VMBus is connected, fail the CPU offlining operation. + * Scan the event flags page looking for bits that are set and waiting + * with a timeout for vmbus_chan_sched() to process such bits. If bits + * are still set after this operation and VMBus is connected, fail the + * CPU offlining operation. */ if (vmbus_proto_version >= VERSION_WIN10_V4_1 && hv_synic_event_pending()) return -EBUSY; @@ -436,7 +630,27 @@ int hv_synic_cleanup(unsigned int cpu) always_cleanup: hv_stimer_legacy_cleanup(cpu); - hv_synic_disable_regs(cpu); + /* + * First, disable the event and message pages + * used for communicating with the host, and then + * disable the host interrupts if VMBus is not + * confidential. + */ + hv_hyp_synic_disable_regs(cpu); + if (!vmbus_is_confidential()) + hv_hyp_synic_disable_interrupts(); - return 0; + /* + * Perform the same steps for the Confidential VMBus. + * The sequencing provides the guarantee that no data + * may be posted for processing before disabling interrupts. + */ + if (vmbus_is_confidential()) { + hv_para_synic_disable_regs(cpu); + hv_para_synic_disable_interrupts(); + } + if (vmbus_irq != -1) + disable_percpu_irq(vmbus_irq); + + return ret; } diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index cbe43e2567a7..2b4080e51f97 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -8,6 +8,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/cleanup.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/mman.h> @@ -24,9 +25,10 @@ #include <linux/notifier.h> #include <linux/percpu_counter.h> #include <linux/page_reporting.h> +#include <linux/sizes.h> #include <linux/hyperv.h> -#include <asm/hyperv-tlfs.h> +#include <hyperv/hvhdk.h> #include <asm/mshyperv.h> @@ -40,8 +42,6 @@ * Begin protocol definitions. */ - - /* * Protocol versions. The low word is the minor version, the high word the major * version. @@ -70,8 +70,6 @@ enum { DYNMEM_PROTOCOL_VERSION_CURRENT = DYNMEM_PROTOCOL_VERSION_WIN10 }; - - /* * Message Types */ @@ -100,7 +98,6 @@ enum dm_message_type { DM_VERSION_1_MAX = 12 }; - /* * Structures defining the dynamic memory management * protocol. @@ -114,7 +111,6 @@ union dm_version { __u32 version; } __packed; - union dm_caps { struct { __u64 balloon:1; @@ -147,8 +143,6 @@ union dm_mem_page_range { __u64 page_range; } __packed; - - /* * The header for all dynamic memory messages: * @@ -173,7 +167,6 @@ struct dm_message { __u8 data[]; /* enclosed message */ } __packed; - /* * Specific message types supporting the dynamic memory protocol. */ @@ -270,7 +263,6 @@ struct dm_status { __u32 io_diff; } __packed; - /* * Message to ask the guest to allocate memory - balloon up message. * This message is sent from the host to the guest. The guest may not be @@ -285,14 +277,13 @@ struct dm_balloon { __u32 reservedz; } __packed; - /* * Balloon response message; this message is sent from the guest * to the host in response to the balloon message. * * reservedz: Reserved; must be set to zero. * more_pages: If FALSE, this is the last message of the transaction. - * if TRUE there will atleast one more message from the guest. + * if TRUE there will be at least one more message from the guest. * * range_count: The number of ranges in the range array. * @@ -313,7 +304,7 @@ struct dm_balloon_response { * to the guest to give guest more memory. * * more_pages: If FALSE, this is the last message of the transaction. - * if TRUE there will atleast one more message from the guest. 
+ * if TRUE there will be at least one more message from the guest. * * reservedz: Reserved; must be set to zero. * @@ -341,7 +332,6 @@ struct dm_unballoon_response { struct dm_header hdr; } __packed; - /* * Hot add request message. Message sent from the host to the guest. * @@ -389,7 +379,6 @@ enum dm_info_type { MAX_INFO_TYPE }; - /* * Header for the information message. */ @@ -424,11 +413,11 @@ struct dm_info_msg { * The range start_pfn : end_pfn specifies the range * that the host has asked us to hot add. The range * start_pfn : ha_end_pfn specifies the range that we have - * currently hot added. We hot add in multiples of 128M - * chunks; it is possible that we may not be able to bring - * online all the pages in the region. The range + * currently hot added. We hot add in chunks equal to the + * memory block size; it is possible that we may not be able + * to bring online all the pages in the region. The range * covered_start_pfn:covered_end_pfn defines the pages that can - * be brough online. + * be brought online. */ struct hv_hotadd_state { @@ -479,10 +468,10 @@ static unsigned long last_post_time; static int hv_hypercall_multi_failure; -module_param(hot_add, bool, (S_IRUGO | S_IWUSR)); +module_param(hot_add, bool, 0644); MODULE_PARM_DESC(hot_add, "If set attempt memory hot_add"); -module_param(pressure_report_delay, uint, (S_IRUGO | S_IWUSR)); +module_param(pressure_report_delay, uint, 0644); MODULE_PARM_DESC(pressure_report_delay, "Delay in secs in reporting pressure"); static atomic_t trans_id = ATOMIC_INIT(0); @@ -501,11 +490,13 @@ enum hv_dm_state { DM_INIT_ERROR }; - static __u8 recv_buffer[HV_HYP_PAGE_SIZE]; static __u8 balloon_up_send_buffer[HV_HYP_PAGE_SIZE]; + +static unsigned long ha_pages_in_chunk; +#define HA_BYTES_IN_CHUNK (ha_pages_in_chunk << PAGE_SHIFT) + #define PAGES_IN_2M (2 * 1024 * 1024 / PAGE_SIZE) -#define HA_CHUNK (128 * 1024 * 1024 / PAGE_SIZE) struct hv_dynmem_device { struct hv_device *dev; @@ -594,12 +585,12 @@ static inline bool has_pfn_is_backed(struct hv_hotadd_state *has, struct hv_hotadd_gap *gap; /* The page is not backed. */ - if ((pfn < has->covered_start_pfn) || (pfn >= has->covered_end_pfn)) + if (pfn < has->covered_start_pfn || pfn >= has->covered_end_pfn) return false; /* Check for gaps. */ list_for_each_entry(gap, &has->gap_list, list) { - if ((pfn >= gap->start_pfn) && (pfn < gap->end_pfn)) + if (pfn >= gap->start_pfn && pfn < gap->end_pfn) return false; } @@ -646,7 +637,7 @@ static int hv_memory_notifier(struct notifier_block *nb, unsigned long val, void *v) { struct memory_notify *mem = (struct memory_notify *)v; - unsigned long flags, pfn_count; + unsigned long pfn_count; switch (val) { case MEM_ONLINE: @@ -655,21 +646,22 @@ static int hv_memory_notifier(struct notifier_block *nb, unsigned long val, break; case MEM_OFFLINE: - spin_lock_irqsave(&dm_device.ha_lock, flags); - pfn_count = hv_page_offline_check(mem->start_pfn, - mem->nr_pages); - if (pfn_count <= dm_device.num_pages_onlined) { - dm_device.num_pages_onlined -= pfn_count; - } else { - /* - * We're offlining more pages than we managed to online. - * This is unexpected. In any case don't let - * num_pages_onlined wrap around zero. 
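has_pfn_is_backed(), touched above, reduces to two checks: the pfn must fall inside the covered range of the hot-add region and must not land inside any recorded gap. A compact sketch of the same logic with a plain array standing in for the kernel's gap list:

#include <stdbool.h>
#include <stdio.h>

struct gap { unsigned long start_pfn, end_pfn; };   /* [start, end) */

static bool pfn_is_backed(unsigned long pfn,
                          unsigned long covered_start, unsigned long covered_end,
                          const struct gap *gaps, int ngaps)
{
    if (pfn < covered_start || pfn >= covered_end)
        return false;                       /* not hot-added (yet) */

    for (int i = 0; i < ngaps; i++)         /* holes punched into the region */
        if (pfn >= gaps[i].start_pfn && pfn < gaps[i].end_pfn)
            return false;

    return true;
}

int main(void)
{
    const struct gap gaps[] = { { 0x1100, 0x1200 } };

    printf("%d %d %d\n",
           pfn_is_backed(0x1050, 0x1000, 0x2000, gaps, 1),   /* 1: backed        */
           pfn_is_backed(0x1150, 0x1000, 0x2000, gaps, 1),   /* 0: inside a gap  */
           pfn_is_backed(0x2100, 0x1000, 0x2000, gaps, 1));  /* 0: not covered   */
    return 0;
}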
- */ - WARN_ON_ONCE(1); - dm_device.num_pages_onlined = 0; + scoped_guard(spinlock_irqsave, &dm_device.ha_lock) { + pfn_count = hv_page_offline_check(mem->start_pfn, + mem->nr_pages); + if (pfn_count <= dm_device.num_pages_onlined) { + dm_device.num_pages_onlined -= pfn_count; + } else { + /* + * We're offlining more pages than we + * managed to online. This is + * unexpected. In any case don't let + * num_pages_onlined wrap around zero. + */ + WARN_ON_ONCE(1); + dm_device.num_pages_onlined = 0; + } } - spin_unlock_irqrestore(&dm_device.ha_lock, flags); break; case MEM_GOING_ONLINE: case MEM_GOING_OFFLINE: @@ -691,9 +683,8 @@ static void hv_page_online_one(struct hv_hotadd_state *has, struct page *pg) if (!PageOffline(pg)) __SetPageOffline(pg); return; - } - if (PageOffline(pg)) - __ClearPageOffline(pg); + } else if (!PageOffline(pg)) + return; /* This frame is currently backed; online the page. */ generic_online_page(pg, 0); @@ -721,30 +712,22 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, unsigned long start_pfn; unsigned long processed_pfn; unsigned long total_pfn = pfn_count; - unsigned long flags; - - for (i = 0; i < (size/HA_CHUNK); i++) { - start_pfn = start + (i * HA_CHUNK); - spin_lock_irqsave(&dm_device.ha_lock, flags); - has->ha_end_pfn += HA_CHUNK; + for (i = 0; i < (size/ha_pages_in_chunk); i++) { + start_pfn = start + (i * ha_pages_in_chunk); - if (total_pfn > HA_CHUNK) { - processed_pfn = HA_CHUNK; - total_pfn -= HA_CHUNK; - } else { - processed_pfn = total_pfn; - total_pfn = 0; + scoped_guard(spinlock_irqsave, &dm_device.ha_lock) { + has->ha_end_pfn += ha_pages_in_chunk; + processed_pfn = umin(total_pfn, ha_pages_in_chunk); + total_pfn -= processed_pfn; + has->covered_end_pfn += processed_pfn; } - has->covered_end_pfn += processed_pfn; - spin_unlock_irqrestore(&dm_device.ha_lock, flags); - reinit_completion(&dm_device.ol_waitevent); nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn)); ret = add_memory(nid, PFN_PHYS((start_pfn)), - (HA_CHUNK << PAGE_SHIFT), MHP_MERGE_RESOURCE); + HA_BYTES_IN_CHUNK, MHP_MERGE_RESOURCE); if (ret) { pr_err("hot_add memory failed error is %d\n", ret); @@ -758,10 +741,10 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, */ do_hot_add = false; } - spin_lock_irqsave(&dm_device.ha_lock, flags); - has->ha_end_pfn -= HA_CHUNK; - has->covered_end_pfn -= processed_pfn; - spin_unlock_irqrestore(&dm_device.ha_lock, flags); + scoped_guard(spinlock_irqsave, &dm_device.ha_lock) { + has->ha_end_pfn -= ha_pages_in_chunk; + has->covered_end_pfn -= processed_pfn; + } break; } @@ -773,7 +756,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, * adding succeeded, it is ok to proceed even if the memory was * not onlined in time. */ - wait_for_completion_timeout(&dm_device.ol_waitevent, 5 * HZ); + wait_for_completion_timeout(&dm_device.ol_waitevent, secs_to_jiffies(5)); post_status(&dm_device); } } @@ -781,31 +764,30 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, static void hv_online_page(struct page *pg, unsigned int order) { struct hv_hotadd_state *has; - unsigned long flags; unsigned long pfn = page_to_pfn(pg); - spin_lock_irqsave(&dm_device.ha_lock, flags); - list_for_each_entry(has, &dm_device.ha_region_list, list) { - /* The page belongs to a different HAS. */ - if ((pfn < has->start_pfn) || + scoped_guard(spinlock_irqsave, &dm_device.ha_lock) { + list_for_each_entry(has, &dm_device.ha_region_list, list) { + /* The page belongs to a different HAS. 
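The rewritten hv_mem_hot_add() loop above walks the region one chunk at a time and, per iteration, accounts for umin(total_pfn, ha_pages_in_chunk) requested pages, which is exactly what the removed if/else computed. The slicing arithmetic on its own (fixed example numbers, no locking and no add_memory() call):

#include <stdio.h>

static unsigned long umin(unsigned long a, unsigned long b)
{
    return a < b ? a : b;
}

int main(void)
{
    unsigned long ha_pages_in_chunk = 32768;      /* 128 MiB of 4 KiB pages            */
    unsigned long size = 3 * ha_pages_in_chunk;   /* pages in the region handed to us  */
    unsigned long total_pfn = 70000;              /* pages the host actually asked for */

    for (unsigned long i = 0; i < size / ha_pages_in_chunk; i++) {
        unsigned long processed_pfn = umin(total_pfn, ha_pages_in_chunk);

        total_pfn -= processed_pfn;
        printf("chunk %lu: add one full chunk, mark %lu pages covered, %lu still pending\n",
               i, processed_pfn, total_pfn);
    }
    return 0;
}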
*/ + if (pfn < has->start_pfn || (pfn + (1UL << order) > has->end_pfn)) - continue; + continue; - hv_bring_pgs_online(has, pfn, 1UL << order); - break; + hv_bring_pgs_online(has, pfn, 1UL << order); + return; + } } - spin_unlock_irqrestore(&dm_device.ha_lock, flags); + generic_online_page(pg, order); } static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt) { struct hv_hotadd_state *has; struct hv_hotadd_gap *gap; - unsigned long residual, new_inc; + unsigned long residual; int ret = 0; - unsigned long flags; - spin_lock_irqsave(&dm_device.ha_lock, flags); + guard(spinlock_irqsave)(&dm_device.ha_lock); list_for_each_entry(has, &dm_device.ha_region_list, list) { /* * If the pfn range we are dealing with is not in the current @@ -838,27 +820,20 @@ static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt) * our current limit; extend it. */ if ((start_pfn + pfn_cnt) > has->end_pfn) { + /* Extend the region by multiples of ha_pages_in_chunk */ residual = (start_pfn + pfn_cnt - has->end_pfn); - /* - * Extend the region by multiples of HA_CHUNK. - */ - new_inc = (residual / HA_CHUNK) * HA_CHUNK; - if (residual % HA_CHUNK) - new_inc += HA_CHUNK; - - has->end_pfn += new_inc; + has->end_pfn += ALIGN(residual, ha_pages_in_chunk); } ret = 1; break; } - spin_unlock_irqrestore(&dm_device.ha_lock, flags); return ret; } static unsigned long handle_pg_range(unsigned long pg_start, - unsigned long pg_count) + unsigned long pg_count) { unsigned long start_pfn = pg_start; unsigned long pfn_cnt = pg_count; @@ -869,7 +844,7 @@ static unsigned long handle_pg_range(unsigned long pg_start, unsigned long res = 0, flags; pr_debug("Hot adding %lu pages starting at pfn 0x%lx.\n", pg_count, - pg_start); + pg_start); spin_lock_irqsave(&dm_device.ha_lock, flags); list_for_each_entry(has, &dm_device.ha_region_list, list) { @@ -905,22 +880,19 @@ static unsigned long handle_pg_range(unsigned long pg_start, if (start_pfn > has->start_pfn && online_section_nr(pfn_to_section_nr(start_pfn))) hv_bring_pgs_online(has, start_pfn, pgs_ol); - } - if ((has->ha_end_pfn < has->end_pfn) && (pfn_cnt > 0)) { + if (has->ha_end_pfn < has->end_pfn && pfn_cnt > 0) { /* * We have some residual hot add range * that needs to be hot added; hot add * it now. Hot add a multiple of - * HA_CHUNK that fully covers the pages + * ha_pages_in_chunk that fully covers the pages * we have. 
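Several hunks in this file replace open-coded round-up arithmetic with ALIGN()/ALIGN_DOWN() on ha_pages_in_chunk, both in pfn_covered() and in handle_pg_range()/hot_add_req() below. A quick check that the two forms agree (local macro copies mirroring the kernel's, valid only for power-of-two alignments):

#include <stdio.h>

/* Local copies of the kernel macros; the alignment must be a power of two. */
#define ALIGN(x, a)      (((x) + (a) - 1) & ~((a) - 1))
#define ALIGN_DOWN(x, a) ((x) & ~((a) - 1))

int main(void)
{
    unsigned long ha_pages_in_chunk = 32768;      /* 128 MiB memory block, 4 KiB pages */
    unsigned long samples[] = { 1000, 32768, 50000 };

    for (int i = 0; i < 3; i++) {
        unsigned long x = samples[i];
        /* What the removed code computed with a divide and a remainder test. */
        unsigned long old_round_up = (x / ha_pages_in_chunk) * ha_pages_in_chunk +
                                     (x % ha_pages_in_chunk ? ha_pages_in_chunk : 0);

        printf("%5lu pages: ALIGN=%lu old=%lu ALIGN_DOWN=%lu\n",
               x, ALIGN(x, ha_pages_in_chunk), old_round_up,
               ALIGN_DOWN(x, ha_pages_in_chunk));
    }
    return 0;
}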
*/ size = (has->end_pfn - has->ha_end_pfn); if (pfn_cnt <= size) { - size = ((pfn_cnt / HA_CHUNK) * HA_CHUNK); - if (pfn_cnt % HA_CHUNK) - size += HA_CHUNK; + size = ALIGN(pfn_cnt, ha_pages_in_chunk); } else { pfn_cnt = size; } @@ -947,7 +919,6 @@ static unsigned long process_hot_add(unsigned long pg_start, { struct hv_hotadd_state *ha_region = NULL; int covered; - unsigned long flags; if (pfn_cnt == 0) return 0; @@ -979,9 +950,9 @@ static unsigned long process_hot_add(unsigned long pg_start, ha_region->covered_end_pfn = pg_start; ha_region->end_pfn = rg_start + rg_size; - spin_lock_irqsave(&dm_device.ha_lock, flags); - list_add_tail(&ha_region->list, &dm_device.ha_region_list); - spin_unlock_irqrestore(&dm_device.ha_lock, flags); + scoped_guard(spinlock_irqsave, &dm_device.ha_lock) { + list_add_tail(&ha_region->list, &dm_device.ha_region_list); + } } do_pg_range: @@ -1014,10 +985,7 @@ static void hot_add_req(struct work_struct *dummy) rg_start = dm->ha_wrk.ha_region_range.finfo.start_page; rg_sz = dm->ha_wrk.ha_region_range.finfo.page_cnt; - if ((rg_start == 0) && (!dm->host_specified_ha_region)) { - unsigned long region_size; - unsigned long region_start; - + if (rg_start == 0 && !dm->host_specified_ha_region) { /* * The host has not specified the hot-add region. * Based on the hot-add page range being specified, @@ -1025,19 +993,13 @@ static void hot_add_req(struct work_struct *dummy) * that need to be hot-added while ensuring the alignment * and size requirements of Linux as it relates to hot-add. */ - region_size = (pfn_cnt / HA_CHUNK) * HA_CHUNK; - if (pfn_cnt % HA_CHUNK) - region_size += HA_CHUNK; - - region_start = (pg_start / HA_CHUNK) * HA_CHUNK; - - rg_start = region_start; - rg_sz = region_size; + rg_start = ALIGN_DOWN(pg_start, ha_pages_in_chunk); + rg_sz = ALIGN(pfn_cnt, ha_pages_in_chunk); } if (do_hot_add) resp.page_count = process_hot_add(pg_start, pfn_cnt, - rg_start, rg_sz); + rg_start, rg_sz); dm->num_pages_added += resp.page_count; #endif @@ -1215,11 +1177,10 @@ static void post_status(struct hv_dynmem_device *dm) sizeof(struct dm_status), (unsigned long)NULL, VM_PKT_DATA_INBAND, 0); - } static void free_balloon_pages(struct hv_dynmem_device *dm, - union dm_mem_page_range *range_array) + union dm_mem_page_range *range_array) { int num_pages = range_array->finfo.page_cnt; __u64 start_frame = range_array->finfo.start_page; @@ -1231,12 +1192,11 @@ static void free_balloon_pages(struct hv_dynmem_device *dm, __ClearPageOffline(pg); __free_page(pg); dm->num_pages_ballooned--; + mod_node_page_state(page_pgdat(pg), NR_BALLOON_PAGES, -1); adjust_managed_page_count(pg, 1); } } - - static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm, unsigned int num_pages, struct dm_balloon_response *bl_resp, @@ -1262,6 +1222,7 @@ static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm, return i * alloc_unit; dm->num_pages_ballooned += alloc_unit; + mod_node_page_state(page_pgdat(pg), NR_BALLOON_PAGES, alloc_unit); /* * If we allocatted 2M pages; split them so we @@ -1282,7 +1243,6 @@ static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm, page_to_pfn(pg); bl_resp->range_array[i].finfo.page_cnt = alloc_unit; bl_resp->hdr.size += sizeof(union dm_mem_page_range); - } return i * alloc_unit; @@ -1336,7 +1296,7 @@ static void balloon_up(struct work_struct *dummy) if (num_ballooned == 0 || num_ballooned == num_pages) { pr_debug("Ballooned %u out of %u requested pages.\n", - num_pages, dm_device.balloon_wrk.num_pages); + num_pages, 
dm_device.balloon_wrk.num_pages); bl_resp->more_pages = 0; done = true; @@ -1370,16 +1330,15 @@ static void balloon_up(struct work_struct *dummy) for (i = 0; i < bl_resp->range_count; i++) free_balloon_pages(&dm_device, - &bl_resp->range_array[i]); + &bl_resp->range_array[i]); done = true; } } - } static void balloon_down(struct hv_dynmem_device *dm, - struct dm_unballoon_request *req) + struct dm_unballoon_request *req) { union dm_mem_page_range *range_array = req->range_array; int range_count = req->range_count; @@ -1393,7 +1352,7 @@ static void balloon_down(struct hv_dynmem_device *dm, } pr_debug("Freed %u ballooned pages.\n", - prev_pages_ballooned - dm->num_pages_ballooned); + prev_pages_ballooned - dm->num_pages_ballooned); if (req->more_pages == 1) return; @@ -1418,8 +1377,8 @@ static int dm_thread_func(void *dm_dev) struct hv_dynmem_device *dm = dm_dev; while (!kthread_should_stop()) { - wait_for_completion_interruptible_timeout( - &dm_device.config_event, 1*HZ); + wait_for_completion_interruptible_timeout(&dm_device.config_event, + secs_to_jiffies(1)); /* * The host expects us to post information on the memory * pressure every second. @@ -1443,9 +1402,8 @@ static int dm_thread_func(void *dm_dev) return 0; } - static void version_resp(struct hv_dynmem_device *dm, - struct dm_version_response *vresp) + struct dm_version_response *vresp) { struct dm_version_request version_req; int ret; @@ -1506,7 +1464,7 @@ version_error: } static void cap_resp(struct hv_dynmem_device *dm, - struct dm_capabilities_resp_msg *cap_resp) + struct dm_capabilities_resp_msg *cap_resp) { if (!cap_resp->is_accepted) { pr_err("Capabilities not accepted by host\n"); @@ -1539,7 +1497,7 @@ static void balloon_onchannelcallback(void *context) switch (dm_hdr->type) { case DM_VERSION_RESPONSE: version_resp(dm, - (struct dm_version_response *)dm_msg); + (struct dm_version_response *)dm_msg); break; case DM_CAPABILITIES_RESPONSE: @@ -1569,7 +1527,7 @@ static void balloon_onchannelcallback(void *context) dm->state = DM_BALLOON_DOWN; balloon_down(dm, - (struct dm_unballoon_request *)recv_buffer); + (struct dm_unballoon_request *)recv_buffer); break; case DM_MEM_HOT_ADD_REQUEST: @@ -1607,17 +1565,15 @@ static void balloon_onchannelcallback(void *context) default: pr_warn_ratelimited("Unhandled message: type: %d\n", dm_hdr->type); - } } - } #define HV_LARGE_REPORTING_ORDER 9 #define HV_LARGE_REPORTING_LEN (HV_HYP_PAGE_SIZE << \ HV_LARGE_REPORTING_ORDER) static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info, - struct scatterlist *sgl, unsigned int nents) + struct scatterlist *sgl, unsigned int nents) { unsigned long flags; struct hv_memory_hint *hint; @@ -1628,13 +1584,13 @@ static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info, WARN_ON_ONCE(nents > HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES); WARN_ON_ONCE(sgl->length < (HV_HYP_PAGE_SIZE << page_reporting_order)); local_irq_save(flags); - hint = *(struct hv_memory_hint **)this_cpu_ptr(hyperv_pcpu_input_arg); + hint = *this_cpu_ptr(hyperv_pcpu_input_arg); if (!hint) { local_irq_restore(flags); return -ENOSPC; } - hint->type = HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD; + hint->heat_type = HV_EXTMEM_HEAT_HINT_COLD_DISCARD; hint->reserved = 0; for_each_sg(sgl, sg, nents, i) { union hv_gpa_page_range *range; @@ -1652,7 +1608,7 @@ static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info, */ /* page reporting for pages 2MB or higher */ - if (order >= HV_LARGE_REPORTING_ORDER ) { + if (order >= HV_LARGE_REPORTING_ORDER) { 
range->page.largepage = 1; range->page_size = HV_GPA_PAGE_RANGE_PAGE_SIZE_2MB; range->base_large_pfn = page_to_hvpfn( @@ -1666,23 +1622,21 @@ static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info, range->page.additional_pages = (sg->length / HV_HYP_PAGE_SIZE) - 1; } - } status = hv_do_rep_hypercall(HV_EXT_CALL_MEMORY_HEAT_HINT, nents, 0, hint, NULL); local_irq_restore(flags); if (!hv_result_success(status)) { - pr_err("Cold memory discard hypercall failed with status %llx\n", - status); + status); if (hv_hypercall_multi_failure > 0) hv_hypercall_multi_failure++; if (hv_result(status) == HV_STATUS_INVALID_PARAMETER) { pr_err("Underlying Hyper-V does not support order less than 9. Hypercall failed\n"); pr_err("Defaulting to page_reporting_order %d\n", - pageblock_order); + pageblock_order); page_reporting_order = pageblock_order; hv_hypercall_multi_failure++; return -EINVAL; @@ -1716,7 +1670,7 @@ static void enable_page_reporting(void) pr_err("Failed to enable cold memory discard: %d\n", ret); } else { pr_info("Cold memory discard hint enabled with order %d\n", - page_reporting_order); + page_reporting_order); } } @@ -1799,7 +1753,7 @@ static int balloon_connect_vsp(struct hv_device *dev) if (ret) goto out; - t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ); + t = wait_for_completion_timeout(&dm_device.host_event, secs_to_jiffies(5)); if (t == 0) { ret = -ETIMEDOUT; goto out; @@ -1835,10 +1789,13 @@ static int balloon_connect_vsp(struct hv_device *dev) cap_msg.caps.cap_bits.hot_add = hot_add_enabled(); /* - * Specify our alignment requirements as it relates - * memory hot-add. Specify 128MB alignment. + * Specify our alignment requirements for memory hot-add. The value is + * the log base 2 of the number of megabytes in a chunk. For example, + * with 256 MiB chunks, the value is 8. The number of MiB in a chunk + * must be a power of 2. 
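The new comment just below spells out the encoding of hot_add_alignment: the log base 2 of the chunk size in MiB, so a 128 MiB chunk yields 7 and a 256 MiB chunk yields 8. A few values run through that formula (ilog2_ul() is a local helper standing in for the kernel's ilog2() for the power-of-two sizes used here):

#include <stdio.h>

/* Minimal ilog2 for the power-of-two values used here. */
static unsigned int ilog2_ul(unsigned long v)
{
    unsigned int r = 0;

    while (v >>= 1)
        r++;
    return r;
}

int main(void)
{
    const unsigned long sz_1m = 1UL << 20;
    unsigned long chunk_bytes[] = { 128UL << 20, 256UL << 20, 2048UL << 20 };

    for (int i = 0; i < 3; i++)
        printf("%4lu MiB chunk -> hot_add_alignment = %u\n",
               chunk_bytes[i] >> 20, ilog2_ul(chunk_bytes[i] / sz_1m));
    return 0;
}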
*/ - cap_msg.caps.cap_bits.hot_add_alignment = 7; + cap_msg.caps.cap_bits.hot_add_alignment = + ilog2(HA_BYTES_IN_CHUNK / SZ_1M); /* * Currently the host does not use these @@ -1854,7 +1811,7 @@ static int balloon_connect_vsp(struct hv_device *dev) if (ret) goto out; - t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ); + t = wait_for_completion_timeout(&dm_device.host_event, secs_to_jiffies(5)); if (t == 0) { ret = -ETIMEDOUT; goto out; @@ -1895,8 +1852,8 @@ static int hv_balloon_debug_show(struct seq_file *f, void *offset) char *sname; seq_printf(f, "%-22s: %u.%u\n", "host_version", - DYNMEM_MAJOR_VERSION(dm->version), - DYNMEM_MINOR_VERSION(dm->version)); + DYNMEM_MAJOR_VERSION(dm->version), + DYNMEM_MINOR_VERSION(dm->version)); seq_printf(f, "%-22s:", "capabilities"); if (ballooning_enabled()) @@ -1945,10 +1902,10 @@ static int hv_balloon_debug_show(struct seq_file *f, void *offset) seq_printf(f, "%-22s: %u\n", "pages_ballooned", dm->num_pages_ballooned); seq_printf(f, "%-22s: %lu\n", "total_pages_committed", - get_pages_committed(dm)); + get_pages_committed(dm)); seq_printf(f, "%-22s: %llu\n", "max_dynamic_page_count", - dm->max_dynamic_page_count); + dm->max_dynamic_page_count); return 0; } @@ -1958,12 +1915,12 @@ DEFINE_SHOW_ATTRIBUTE(hv_balloon_debug); static void hv_balloon_debugfs_init(struct hv_dynmem_device *b) { debugfs_create_file("hv-balloon", 0444, NULL, b, - &hv_balloon_debug_fops); + &hv_balloon_debug_fops); } static void hv_balloon_debugfs_exit(struct hv_dynmem_device *b) { - debugfs_remove(debugfs_lookup("hv-balloon", NULL)); + debugfs_lookup_and_remove("hv-balloon", NULL); } #else @@ -1988,8 +1945,23 @@ static int balloon_probe(struct hv_device *dev, hot_add = false; #ifdef CONFIG_MEMORY_HOTPLUG + /* + * Hot-add must operate in chunks that are of size equal to the + * memory block size because that's what the core add_memory() + * interface requires. The Hyper-V interface requires that the memory + * block size be a power of 2, which is guaranteed by the check in + * memory_dev_init(). + */ + ha_pages_in_chunk = memory_block_size_bytes() / PAGE_SIZE; do_hot_add = hot_add; #else + /* + * Without MEMORY_HOTPLUG, the guest returns a failure status for all + * hot add requests from Hyper-V, and the chunk size is used only to + * specify alignment to Hyper-V as required by the host/guest protocol. + * Somewhat arbitrarily, use 128 MiB. 
+ */ + ha_pages_in_chunk = SZ_128M / PAGE_SIZE; do_hot_add = false; #endif dm_device.dev = dev; @@ -2042,12 +2014,11 @@ connect_error: return ret; } -static int balloon_remove(struct hv_device *dev) +static void balloon_remove(struct hv_device *dev) { struct hv_dynmem_device *dm = hv_get_drvdata(dev); struct hv_hotadd_state *has, *tmp; struct hv_hotadd_gap *gap, *tmp_gap; - unsigned long flags; if (dm->num_pages_ballooned != 0) pr_warn("Ballooned pages: %d\n", dm->num_pages_ballooned); @@ -2073,7 +2044,7 @@ static int balloon_remove(struct hv_device *dev) #endif } - spin_lock_irqsave(&dm_device.ha_lock, flags); + guard(spinlock_irqsave)(&dm_device.ha_lock); list_for_each_entry_safe(has, tmp, &dm->ha_region_list, list) { list_for_each_entry_safe(gap, tmp_gap, &has->gap_list, list) { list_del(&gap->list); @@ -2082,9 +2053,6 @@ static int balloon_remove(struct hv_device *dev) list_del(&has->list); kfree(has); } - spin_unlock_irqrestore(&dm_device.ha_lock, flags); - - return 0; } static int balloon_suspend(struct hv_device *hv_dev) @@ -2105,7 +2073,6 @@ static int balloon_suspend(struct hv_device *hv_dev) tasklet_enable(&hv_dev->channel->callback_event); return 0; - } static int balloon_resume(struct hv_device *dev) @@ -2164,7 +2131,6 @@ static struct hv_driver balloon_drv = { static int __init init_balloon_drv(void) { - return vmbus_driver_register(&balloon_drv); } diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index ae68298c0dca..0a3ab7efed46 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -17,24 +17,37 @@ #include <linux/export.h> #include <linux/bitfield.h> #include <linux/cpumask.h> +#include <linux/sched/task_stack.h> #include <linux/panic_notifier.h> #include <linux/ptrace.h> +#include <linux/random.h> +#include <linux/efi.h> +#include <linux/kdebug.h> +#include <linux/kmsg_dump.h> +#include <linux/sizes.h> #include <linux/slab.h> #include <linux/dma-map-ops.h> -#include <asm/hyperv-tlfs.h> +#include <linux/set_memory.h> +#include <hyperv/hvhdk.h> #include <asm/mshyperv.h> +u64 hv_current_partition_id = HV_PARTITION_ID_SELF; +EXPORT_SYMBOL_GPL(hv_current_partition_id); + +enum hv_partition_type hv_curr_partition_type; +EXPORT_SYMBOL_GPL(hv_curr_partition_type); + /* - * hv_root_partition and ms_hyperv are defined here with other Hyper-V - * specific globals so they are shared across all architectures and are + * ms_hyperv and hv_nested are defined here with other + * Hyper-V specific globals so they are shared across all architectures and are * built only when CONFIG_HYPERV is defined. But on x86, * ms_hyperv_init_platform() is built even when CONFIG_HYPERV is not - * defined, and it uses these two variables. So mark them as __weak + * defined, and it uses these three variables. So mark them as __weak * here, allowing for an overriding definition in the module containing * ms_hyperv_init_platform(). */ -bool __weak hv_root_partition; -EXPORT_SYMBOL_GPL(hv_root_partition); +bool __weak hv_nested; +EXPORT_SYMBOL_GPL(hv_nested); struct ms_hyperv_info __weak ms_hyperv; EXPORT_SYMBOL_GPL(ms_hyperv); @@ -51,6 +64,20 @@ EXPORT_SYMBOL_GPL(hyperv_pcpu_input_arg); void * __percpu *hyperv_pcpu_output_arg; EXPORT_SYMBOL_GPL(hyperv_pcpu_output_arg); +static void hv_kmsg_dump_unregister(void); + +static struct ctl_table_header *hv_ctl_table_hdr; + +/* + * Per-cpu array holding the tail pointer for the SynIC event ring buffer + * for each SINT. 
+ * + * We cannot maintain this in mshv driver because the tail pointer should + * persist even if the mshv driver is unloaded. + */ +u8 * __percpu *hv_synic_eventring_tail; +EXPORT_SYMBOL_GPL(hv_synic_eventring_tail); + /* * Hyper-V specific initialization and shutdown code that is * common across all architectures. Called from architecture @@ -59,6 +86,12 @@ EXPORT_SYMBOL_GPL(hyperv_pcpu_output_arg); void __init hv_common_free(void) { + unregister_sysctl_table(hv_ctl_table_hdr); + hv_ctl_table_hdr = NULL; + + if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) + hv_kmsg_dump_unregister(); + kfree(hv_vp_index); hv_vp_index = NULL; @@ -67,11 +100,230 @@ void __init hv_common_free(void) free_percpu(hyperv_pcpu_input_arg); hyperv_pcpu_input_arg = NULL; + + free_percpu(hv_synic_eventring_tail); + hv_synic_eventring_tail = NULL; } +static void *hv_panic_page; + +/* + * Boolean to control whether to report panic messages over Hyper-V. + * + * It can be set via /proc/sys/kernel/hyperv_record_panic_msg + */ +static int sysctl_record_panic_msg = 1; + +/* + * sysctl option to allow the user to control whether kmsg data should be + * reported to Hyper-V on panic. + */ +static const struct ctl_table hv_ctl_table[] = { + { + .procname = "hyperv_record_panic_msg", + .data = &sysctl_record_panic_msg, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE + }, +}; + +static int hv_die_panic_notify_crash(struct notifier_block *self, + unsigned long val, void *args); + +static struct notifier_block hyperv_die_report_block = { + .notifier_call = hv_die_panic_notify_crash, +}; + +static struct notifier_block hyperv_panic_report_block = { + .notifier_call = hv_die_panic_notify_crash, +}; + +/* + * The following callback works both as die and panic notifier; its + * goal is to provide panic information to the hypervisor unless the + * kmsg dumper is used [see hv_kmsg_dump()], which provides more + * information but isn't always available. + * + * Notice that both the panic/die report notifiers are registered only + * if we have the capability HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE set. + */ +static int hv_die_panic_notify_crash(struct notifier_block *self, + unsigned long val, void *args) +{ + struct pt_regs *regs; + bool is_die; + + /* Don't notify Hyper-V unless we have a die oops event or panic. */ + if (self == &hyperv_panic_report_block) { + is_die = false; + regs = current_pt_regs(); + } else { /* die event */ + if (val != DIE_OOPS) + return NOTIFY_DONE; + + is_die = true; + regs = ((struct die_args *)args)->regs; + } + + /* + * Hyper-V should be notified only once about a panic/die. If we will + * be calling hv_kmsg_dump() later with kmsg data, don't do the + * notification here. + */ + if (!sysctl_record_panic_msg || !hv_panic_page) + hyperv_report_panic(regs, val, is_die); + + return NOTIFY_DONE; +} + +/* + * Callback from kmsg_dump. Grab as much as possible from the end of the kmsg + * buffer and call into Hyper-V to transfer the data. + */ +static void hv_kmsg_dump(struct kmsg_dumper *dumper, + struct kmsg_dump_detail *detail) +{ + struct kmsg_dump_iter iter; + size_t bytes_written; + + /* We are only interested in panics. */ + if (detail->reason != KMSG_DUMP_PANIC || !sysctl_record_panic_msg) + return; + + /* + * Write dump contents to the page. No need to synchronize; panic should + * be single-threaded. 
+ */ + kmsg_dump_rewind(&iter); + kmsg_dump_get_buffer(&iter, false, hv_panic_page, HV_HYP_PAGE_SIZE, + &bytes_written); + if (!bytes_written) + return; + /* + * P3 to contain the physical address of the panic page & P4 to + * contain the size of the panic data in that page. Rest of the + * registers are no-op when the NOTIFY_MSG flag is set. + */ + hv_set_msr(HV_MSR_CRASH_P0, 0); + hv_set_msr(HV_MSR_CRASH_P1, 0); + hv_set_msr(HV_MSR_CRASH_P2, 0); + hv_set_msr(HV_MSR_CRASH_P3, virt_to_phys(hv_panic_page)); + hv_set_msr(HV_MSR_CRASH_P4, bytes_written); + + /* + * Let Hyper-V know there is crash data available along with + * the panic message. + */ + hv_set_msr(HV_MSR_CRASH_CTL, + (HV_CRASH_CTL_CRASH_NOTIFY | + HV_CRASH_CTL_CRASH_NOTIFY_MSG)); +} + +static struct kmsg_dumper hv_kmsg_dumper = { + .dump = hv_kmsg_dump, +}; + +static void hv_kmsg_dump_unregister(void) +{ + kmsg_dump_unregister(&hv_kmsg_dumper); + unregister_die_notifier(&hyperv_die_report_block); + atomic_notifier_chain_unregister(&panic_notifier_list, + &hyperv_panic_report_block); + + kfree(hv_panic_page); + hv_panic_page = NULL; +} + +static void hv_kmsg_dump_register(void) +{ + int ret; + + hv_panic_page = kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL); + if (!hv_panic_page) { + pr_err("Hyper-V: panic message page memory allocation failed\n"); + return; + } + + ret = kmsg_dump_register(&hv_kmsg_dumper); + if (ret) { + pr_err("Hyper-V: kmsg dump register error 0x%x\n", ret); + kfree(hv_panic_page); + hv_panic_page = NULL; + } +} + +static inline bool hv_output_page_exists(void) +{ + return hv_parent_partition() || IS_ENABLED(CONFIG_HYPERV_VTL_MODE); +} + +void __init hv_get_partition_id(void) +{ + struct hv_output_get_partition_id *output; + unsigned long flags; + u64 status, pt_id; + + local_irq_save(flags); + output = *this_cpu_ptr(hyperv_pcpu_input_arg); + status = hv_do_hypercall(HVCALL_GET_PARTITION_ID, NULL, output); + pt_id = output->partition_id; + local_irq_restore(flags); + + if (hv_result_success(status)) + hv_current_partition_id = pt_id; + else + pr_err("Hyper-V: failed to get partition ID: %#x\n", + hv_result(status)); +} +#if IS_ENABLED(CONFIG_HYPERV_VTL_MODE) +u8 __init get_vtl(void) +{ + u64 control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_REGISTERS; + struct hv_input_get_vp_registers *input; + struct hv_output_get_vp_registers *output; + unsigned long flags; + u64 ret; + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + memset(input, 0, struct_size(input, names, 1)); + input->partition_id = HV_PARTITION_ID_SELF; + input->vp_index = HV_VP_INDEX_SELF; + input->input_vtl.as_uint8 = 0; + input->names[0] = HV_REGISTER_VSM_VP_STATUS; + + ret = hv_do_hypercall(control, input, output); + if (hv_result_success(ret)) { + ret = output->values[0].reg8 & HV_VTL_MASK; + } else { + pr_err("Failed to get VTL(error: %lld) exiting...\n", ret); + BUG(); + } + + local_irq_restore(flags); + return ret; +} +#endif + int __init hv_common_init(void) { int i; + union hv_hypervisor_version_info version; + + /* Get information about the Microsoft Hypervisor version */ + if (!hv_get_hypervisor_version(&version)) + pr_info("Hyper-V: Hypervisor Build %d.%d.%d.%d-%d-%d\n", + version.major_version, version.minor_version, + version.build_number, version.service_number, + version.service_pack, version.service_branch); + + if (hv_is_isolation_supported()) + sysctl_record_panic_msg = 0; /* * Hyper-V expects to get crash register data or kmsg when @@ -81,8 +333,33 @@ int 
__init hv_common_init(void) * kernel. */ if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) { + u64 hyperv_crash_ctl; + crash_kexec_post_notifiers = true; pr_info("Hyper-V: enabling crash_kexec_post_notifiers\n"); + + /* + * Panic message recording (sysctl_record_panic_msg) + * is enabled by default in non-isolated guests and + * disabled by default in isolated guests; the panic + * message recording won't be available in isolated + * guests should the following registration fail. + */ + hv_ctl_table_hdr = register_sysctl("kernel", hv_ctl_table); + if (!hv_ctl_table_hdr) + pr_err("Hyper-V: sysctl table register error"); + + /* + * Register for panic kmsg callback only if the right + * capability is supported by the hypervisor. + */ + hyperv_crash_ctl = hv_get_msr(HV_MSR_CRASH_CTL); + if (hyperv_crash_ctl & HV_CRASH_CTL_CRASH_NOTIFY_MSG) + hv_kmsg_dump_register(); + + register_die_notifier(&hyperv_die_report_block); + atomic_notifier_chain_register(&panic_notifier_list, + &hyperv_panic_report_block); } /* @@ -95,24 +372,95 @@ int __init hv_common_init(void) BUG_ON(!hyperv_pcpu_input_arg); /* Allocate the per-CPU state for output arg for root */ - if (hv_root_partition) { + if (hv_output_page_exists()) { hyperv_pcpu_output_arg = alloc_percpu(void *); BUG_ON(!hyperv_pcpu_output_arg); } - hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index), + if (hv_parent_partition()) { + hv_synic_eventring_tail = alloc_percpu(u8 *); + BUG_ON(!hv_synic_eventring_tail); + } + + hv_vp_index = kmalloc_array(nr_cpu_ids, sizeof(*hv_vp_index), GFP_KERNEL); if (!hv_vp_index) { hv_common_free(); return -ENOMEM; } - for (i = 0; i < num_possible_cpus(); i++) + for (i = 0; i < nr_cpu_ids; i++) hv_vp_index[i] = VP_INVAL; return 0; } +void __init ms_hyperv_late_init(void) +{ + struct acpi_table_header *header; + acpi_status status; + u8 *randomdata; + u32 length, i; + + /* + * Seed the Linux random number generator with entropy provided by + * the Hyper-V host in ACPI table OEM0. + */ + if (!IS_ENABLED(CONFIG_ACPI)) + return; + + status = acpi_get_table("OEM0", 0, &header); + if (ACPI_FAILURE(status) || !header) + return; + + /* + * Since the "OEM0" table name is for OEM specific usage, verify + * that what we're seeing purports to be from Microsoft. + */ + if (strncmp(header->oem_table_id, "MICROSFT", 8)) + goto error; + + /* + * Ensure the length is reasonable. Requiring at least 8 bytes and + * no more than 4K bytes is somewhat arbitrary and just protects + * against a malformed table. Hyper-V currently provides 64 bytes, + * but allow for a change in a later version. + */ + if (header->length < sizeof(*header) + 8 || + header->length > sizeof(*header) + SZ_4K) + goto error; + + length = header->length - sizeof(*header); + randomdata = (u8 *)(header + 1); + + pr_debug("Hyper-V: Seeding rng with %d random bytes from ACPI table OEM0\n", + length); + + add_bootloader_randomness(randomdata, length); + + /* + * To prevent the seed data from being visible in /sys/firmware/acpi, + * zero out the random data in the ACPI table and fixup the checksum. + * The zero'ing is done out of an abundance of caution in avoiding + * potential security risks to the rng. Similarly, reset the table + * length to just the header size so that a subsequent kexec doesn't + * try to use the zero'ed out random data. 
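The checksum fixup just below relies on the ACPI rule that all bytes of a table sum to zero modulo 256: adding each seed byte to the checksum as it is zeroed, and adjusting for the old and new length bytes, keeps the now header-only table valid. A small self-contained check of that arithmetic on a fake table (offsets follow the ACPI header layout; the contents are made up):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define HDR_LEN   36              /* size of an ACPI table header              */
#define DATA_LEN  64              /* Hyper-V currently provides 64 seed bytes  */

struct fake_table {
    uint8_t hdr[HDR_LEN];         /* byte 9 is the checksum, bytes 4..7 the length */
    uint8_t data[DATA_LEN];
};

static uint8_t sum(const uint8_t *p, uint32_t len)
{
    uint8_t s = 0;

    while (len--)
        s += *p++;
    return s;
}

int main(void)
{
    struct fake_table t;
    uint8_t *raw = (uint8_t *)&t;
    uint32_t len = HDR_LEN + DATA_LEN;
    uint8_t *checksum = &raw[9];
    uint8_t *length_bytes = &raw[4];   /* 4-byte length field (host order is fine here) */

    /* Build a fake OEM0-style table whose bytes sum to 0 (mod 256). */
    memset(&t, 0, sizeof(t));
    memcpy(length_bytes, &len, 4);
    for (int i = 0; i < DATA_LEN; i++)
        t.data[i] = (uint8_t)(i * 37 + 11);        /* pretend entropy */
    *checksum = (uint8_t)(0 - sum(raw, len));
    printf("before: sum over %u bytes = %u\n", len, sum(raw, len));

    /* Same fixup as ms_hyperv_late_init(): zero the seed, then patch the length. */
    for (int i = 0; i < DATA_LEN; i++) {
        *checksum += t.data[i];
        t.data[i] = 0;
    }
    for (int i = 0; i < 4; i++)
        *checksum += length_bytes[i];
    len = HDR_LEN;
    memcpy(length_bytes, &len, 4);
    for (int i = 0; i < 4; i++)
        *checksum -= length_bytes[i];

    printf("after : sum over %u bytes = %u\n", len, sum(raw, HDR_LEN));
    return 0;
}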
+ */ + for (i = 0; i < length; i++) { + header->checksum += randomdata[i]; + randomdata[i] = 0; + } + + for (i = 0; i < sizeof(header->length); i++) + header->checksum += ((u8 *)&header->length)[i]; + header->length = sizeof(*header); + for (i = 0; i < sizeof(header->length); i++) + header->checksum -= ((u8 *)&header->length)[i]; + +error: + acpi_put_table(header); +} + /* * Hyper-V specific initialization and die code for * individual CPUs that is common across all architectures. @@ -122,54 +470,100 @@ int __init hv_common_init(void) int hv_common_cpu_init(unsigned int cpu) { void **inputarg, **outputarg; + u8 **synic_eventring_tail; u64 msr_vp_index; gfp_t flags; - int pgcount = hv_root_partition ? 2 : 1; + const int pgcount = hv_output_page_exists() ? 2 : 1; + void *mem; + int ret = 0; /* hv_cpu_init() can be called with IRQs disabled from hv_resume() */ flags = irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL; inputarg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg); - *inputarg = kmalloc(pgcount * HV_HYP_PAGE_SIZE, flags); - if (!(*inputarg)) - return -ENOMEM; - if (hv_root_partition) { - outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg); - *outputarg = (char *)(*inputarg) + HV_HYP_PAGE_SIZE; + /* + * The per-cpu memory is already allocated if this CPU was previously + * online and then taken offline + */ + if (!*inputarg) { + mem = kmalloc_array(pgcount, HV_HYP_PAGE_SIZE, flags); + if (!mem) + return -ENOMEM; + + if (hv_output_page_exists()) { + outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg); + *outputarg = (char *)mem + HV_HYP_PAGE_SIZE; + } + + if (!ms_hyperv.paravisor_present && + (hv_isolation_type_snp() || hv_isolation_type_tdx())) { + ret = set_memory_decrypted((unsigned long)mem, pgcount); + if (ret) { + /* It may be unsafe to free 'mem' */ + return ret; + } + + memset(mem, 0x00, pgcount * HV_HYP_PAGE_SIZE); + } + + /* + * In a fully enlightened TDX/SNP VM with more than 64 VPs, if + * hyperv_pcpu_input_arg is not NULL, set_memory_decrypted() -> + * ... -> cpa_flush()-> ... -> __send_ipi_mask_ex() tries to + * use hyperv_pcpu_input_arg as the hypercall input page, which + * must be a decrypted page in such a VM, but the page is still + * encrypted before set_memory_decrypted() returns. Fix this by + * setting *inputarg after the above set_memory_decrypted(): if + * hyperv_pcpu_input_arg is NULL, __send_ipi_mask_ex() returns + * HV_STATUS_INVALID_PARAMETER immediately, and the function + * hv_send_ipi_mask() falls back to orig_apic.send_IPI_mask(), + * which may be slightly slower than the hypercall, but still + * works correctly in such a VM. 
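The long comment in hv_common_cpu_init() below is ultimately about publication order: *inputarg is assigned only after the page has been decrypted, so a concurrent hypercall path either sees NULL and takes its fallback, or sees a page that is fully usable. A stripped-down, single-threaded sketch of that publish-last pattern (prepare_page() and send_ipi() are illustrative stand-ins, not kernel functions):

#include <stdio.h>
#include <stdlib.h>

static void *hypercall_input_page;    /* stand-in for the per-cpu *inputarg pointer */

/* Hypothetical stand-in for kmalloc_array() followed by set_memory_decrypted(). */
static void *prepare_page(void)
{
    void *mem = calloc(1, 4096);
    /* ...decryption and zeroing happen here, before anyone can see the pointer... */
    return mem;
}

/* Hypothetical caller that, in the kernel, may run concurrently with CPU bring-up. */
static void send_ipi(void)
{
    if (!hypercall_input_page) {
        puts("input page not published yet: take the fallback path");
        return;
    }
    puts("input page published: safe to use the hypercall path");
}

int main(void)
{
    send_ipi();                         /* races before publication -> fallback */

    void *mem = prepare_page();         /* fully prepare the page first...      */
    hypercall_input_page = mem;         /* ...then publish the pointer last     */

    send_ipi();                         /* now the fast path is usable          */
    free(mem);
    return 0;
}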
+ */ + *inputarg = mem; } - msr_vp_index = hv_get_register(HV_REGISTER_VP_INDEX); + msr_vp_index = hv_get_msr(HV_MSR_VP_INDEX); hv_vp_index[cpu] = msr_vp_index; if (msr_vp_index > hv_max_vp_index) hv_max_vp_index = msr_vp_index; - return 0; + if (hv_parent_partition()) { + synic_eventring_tail = (u8 **)this_cpu_ptr(hv_synic_eventring_tail); + *synic_eventring_tail = kcalloc(HV_SYNIC_SINT_COUNT, + sizeof(u8), flags); + /* No need to unwind any of the above on failure here */ + if (unlikely(!*synic_eventring_tail)) + ret = -ENOMEM; + } + + return ret; } int hv_common_cpu_die(unsigned int cpu) { - unsigned long flags; - void **inputarg, **outputarg; - void *mem; - - local_irq_save(flags); - - inputarg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg); - mem = *inputarg; - *inputarg = NULL; + u8 **synic_eventring_tail; + /* + * The hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory + * is not freed when the CPU goes offline as the hyperv_pcpu_input_arg + * may be used by the Hyper-V vPCI driver in reassigning interrupts + * as part of the offlining process. The interrupt reassignment + * happens *after* the CPUHP_AP_HYPERV_ONLINE state has run and + * called this function. + * + * If a previously offlined CPU is brought back online again, the + * originally allocated memory is reused in hv_common_cpu_init(). + */ - if (hv_root_partition) { - outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg); - *outputarg = NULL; + if (hv_parent_partition()) { + synic_eventring_tail = this_cpu_ptr(hv_synic_eventring_tail); + kfree(*synic_eventring_tail); + *synic_eventring_tail = NULL; } - local_irq_restore(flags); - - kfree(mem); - return 0; } @@ -221,17 +615,13 @@ EXPORT_SYMBOL_GPL(hv_query_ext_cap); void hv_setup_dma_ops(struct device *dev, bool coherent) { - /* - * Hyper-V does not offer a vIOMMU in the guest - * VM, so pass 0/NULL for the IOMMU settings - */ - arch_setup_dma_ops(dev, 0, 0, NULL, coherent); + arch_setup_dma_ops(dev, coherent); } EXPORT_SYMBOL_GPL(hv_setup_dma_ops); bool hv_is_hibernation_supported(void) { - return !hv_root_partition && acpi_sleep_state_supported(ACPI_STATE_S4); + return !hv_root_partition() && acpi_sleep_state_supported(ACPI_STATE_S4); } EXPORT_SYMBOL_GPL(hv_is_hibernation_supported); @@ -243,7 +633,7 @@ EXPORT_SYMBOL_GPL(hv_is_hibernation_supported); */ static u64 __hv_read_ref_counter(void) { - return hv_get_register(HV_REGISTER_TIME_REF_COUNT); + return hv_get_msr(HV_MSR_TIME_REF_COUNT); } u64 (*hv_read_reference_counter)(void) = __hv_read_ref_counter; @@ -268,6 +658,12 @@ bool __weak hv_isolation_type_snp(void) } EXPORT_SYMBOL_GPL(hv_isolation_type_snp); +bool __weak hv_isolation_type_tdx(void) +{ + return false; +} +EXPORT_SYMBOL_GPL(hv_isolation_type_tdx); + void __weak hv_setup_vmbus_handler(void (*handler)(void)) { } @@ -278,6 +674,11 @@ void __weak hv_remove_vmbus_handler(void) } EXPORT_SYMBOL_GPL(hv_remove_vmbus_handler); +void __weak hv_setup_mshv_handler(void (*handler)(void)) +{ +} +EXPORT_SYMBOL_GPL(hv_setup_mshv_handler); + void __weak hv_setup_kexec_handler(void (*handler)(void)) { } @@ -309,13 +710,151 @@ u64 __weak hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_s } EXPORT_SYMBOL_GPL(hv_ghcb_hypercall); -void __weak *hv_map_memory(void *addr, unsigned long size) +u64 __weak hv_tdx_hypercall(u64 control, u64 param1, u64 param2) +{ + return HV_STATUS_INVALID_PARAMETER; +} +EXPORT_SYMBOL_GPL(hv_tdx_hypercall); + +void __weak hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool set) +{ +} 
+EXPORT_SYMBOL_GPL(hv_enable_coco_interrupt); + +void __weak hv_para_set_sint_proxy(bool enable) +{ +} +EXPORT_SYMBOL_GPL(hv_para_set_sint_proxy); + +u64 __weak hv_para_get_synic_register(unsigned int reg) +{ + return ~0ULL; +} +EXPORT_SYMBOL_GPL(hv_para_get_synic_register); + +void __weak hv_para_set_synic_register(unsigned int reg, u64 val) +{ +} +EXPORT_SYMBOL_GPL(hv_para_set_synic_register); + +void hv_identify_partition_type(void) +{ + /* Assume guest role */ + hv_curr_partition_type = HV_PARTITION_TYPE_GUEST; + /* + * Check partition creation and cpu management privileges + * + * Hyper-V should never specify running as root and as a Confidential + * VM. But to protect against a compromised/malicious Hyper-V trying + * to exploit root behavior to expose Confidential VM memory, ignore + * the root partition setting if also a Confidential VM. + */ + if ((ms_hyperv.priv_high & HV_CREATE_PARTITIONS) && + !(ms_hyperv.priv_high & HV_ISOLATION)) { + + if (!IS_ENABLED(CONFIG_MSHV_ROOT)) { + pr_crit("Hyper-V: CONFIG_MSHV_ROOT not enabled!\n"); + } else if (ms_hyperv.priv_high & HV_CPU_MANAGEMENT) { + pr_info("Hyper-V: running as root partition\n"); + hv_curr_partition_type = HV_PARTITION_TYPE_ROOT; + } else { + pr_info("Hyper-V: running as L1VH partition\n"); + hv_curr_partition_type = HV_PARTITION_TYPE_L1VH; + } + } +} + +struct hv_status_info { + char *string; + int errno; + u16 code; +}; + +/* + * Note on the errno mappings: + * A failed hypercall is usually only recoverable (or loggable) near + * the call site where the HV_STATUS_* code is known. So the errno + * it gets converted to is not too useful further up the stack. + * Provide a few mappings that could be useful, and revert to -EIO + * as a fallback. + */ +static const struct hv_status_info hv_status_infos[] = { +#define _STATUS_INFO(status, errno) { #status, (errno), (status) } + _STATUS_INFO(HV_STATUS_SUCCESS, 0), + _STATUS_INFO(HV_STATUS_INVALID_HYPERCALL_CODE, -EINVAL), + _STATUS_INFO(HV_STATUS_INVALID_HYPERCALL_INPUT, -EINVAL), + _STATUS_INFO(HV_STATUS_INVALID_ALIGNMENT, -EIO), + _STATUS_INFO(HV_STATUS_INVALID_PARAMETER, -EINVAL), + _STATUS_INFO(HV_STATUS_ACCESS_DENIED, -EIO), + _STATUS_INFO(HV_STATUS_INVALID_PARTITION_STATE, -EIO), + _STATUS_INFO(HV_STATUS_OPERATION_DENIED, -EIO), + _STATUS_INFO(HV_STATUS_UNKNOWN_PROPERTY, -EIO), + _STATUS_INFO(HV_STATUS_PROPERTY_VALUE_OUT_OF_RANGE, -EIO), + _STATUS_INFO(HV_STATUS_INSUFFICIENT_MEMORY, -ENOMEM), + _STATUS_INFO(HV_STATUS_INVALID_PARTITION_ID, -EINVAL), + _STATUS_INFO(HV_STATUS_INVALID_VP_INDEX, -EINVAL), + _STATUS_INFO(HV_STATUS_NOT_FOUND, -EIO), + _STATUS_INFO(HV_STATUS_INVALID_PORT_ID, -EINVAL), + _STATUS_INFO(HV_STATUS_INVALID_CONNECTION_ID, -EINVAL), + _STATUS_INFO(HV_STATUS_INSUFFICIENT_BUFFERS, -EIO), + _STATUS_INFO(HV_STATUS_NOT_ACKNOWLEDGED, -EIO), + _STATUS_INFO(HV_STATUS_INVALID_VP_STATE, -EIO), + _STATUS_INFO(HV_STATUS_NO_RESOURCES, -EIO), + _STATUS_INFO(HV_STATUS_PROCESSOR_FEATURE_NOT_SUPPORTED, -EIO), + _STATUS_INFO(HV_STATUS_INVALID_LP_INDEX, -EINVAL), + _STATUS_INFO(HV_STATUS_INVALID_REGISTER_VALUE, -EINVAL), + _STATUS_INFO(HV_STATUS_INVALID_LP_INDEX, -EIO), + _STATUS_INFO(HV_STATUS_INVALID_REGISTER_VALUE, -EIO), + _STATUS_INFO(HV_STATUS_OPERATION_FAILED, -EIO), + _STATUS_INFO(HV_STATUS_TIME_OUT, -EIO), + _STATUS_INFO(HV_STATUS_CALL_PENDING, -EIO), + _STATUS_INFO(HV_STATUS_VTL_ALREADY_ENABLED, -EIO), +#undef _STATUS_INFO +}; + +static inline const struct hv_status_info *find_hv_status_info(u64 hv_status) { + int i; + u16 code = hv_result(hv_status); + + for (i = 
0; i < ARRAY_SIZE(hv_status_infos); ++i) { + const struct hv_status_info *info = &hv_status_infos[i]; + + if (info->code == code) + return info; + } + return NULL; } -EXPORT_SYMBOL_GPL(hv_map_memory); -void __weak hv_unmap_memory(void *addr) +/* Convert a hypercall result into a linux-friendly error code. */ +int hv_result_to_errno(u64 status) +{ + const struct hv_status_info *info; + + /* hv_do_hypercall() may return U64_MAX, hypercalls aren't possible */ + if (unlikely(status == U64_MAX)) + return -EOPNOTSUPP; + + info = find_hv_status_info(status); + if (info) + return info->errno; + + return -EIO; +} +EXPORT_SYMBOL_GPL(hv_result_to_errno); + +const char *hv_result_to_string(u64 status) { + const struct hv_status_info *info; + + if (unlikely(status == U64_MAX)) + return "Hypercall page missing!"; + + info = find_hv_status_info(status); + if (info) + return info->string; + + return "Unknown"; } -EXPORT_SYMBOL_GPL(hv_unmap_memory); +EXPORT_SYMBOL_GPL(hv_result_to_string); diff --git a/drivers/hv/hv_fcopy.c b/drivers/hv/hv_fcopy.c deleted file mode 100644 index 922d83eb7ddf..000000000000 --- a/drivers/hv/hv_fcopy.c +++ /dev/null @@ -1,427 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * An implementation of file copy service. - * - * Copyright (C) 2014, Microsoft, Inc. - * - * Author : K. Y. Srinivasan <ksrinivasan@novell.com> - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/nls.h> -#include <linux/workqueue.h> -#include <linux/hyperv.h> -#include <linux/sched.h> -#include <asm/hyperv-tlfs.h> - -#include "hyperv_vmbus.h" -#include "hv_utils_transport.h" - -#define WIN8_SRV_MAJOR 1 -#define WIN8_SRV_MINOR 1 -#define WIN8_SRV_VERSION (WIN8_SRV_MAJOR << 16 | WIN8_SRV_MINOR) - -#define FCOPY_VER_COUNT 1 -static const int fcopy_versions[] = { - WIN8_SRV_VERSION -}; - -#define FW_VER_COUNT 1 -static const int fw_versions[] = { - UTIL_FW_VERSION -}; - -/* - * Global state maintained for transaction that is being processed. - * For a class of integration services, including the "file copy service", - * the specified protocol is a "request/response" protocol which means that - * there can only be single outstanding transaction from the host at any - * given point in time. We use this to simplify memory management in this - * driver - we cache and process only one message at a time. - * - * While the request/response protocol is guaranteed by the host, we further - * ensure this by serializing packet processing in this driver - we do not - * read additional packets from the VMBUs until the current packet is fully - * handled. - */ - -static struct { - int state; /* hvutil_device_state */ - int recv_len; /* number of bytes received. */ - struct hv_fcopy_hdr *fcopy_msg; /* current message */ - struct vmbus_channel *recv_channel; /* chn we got the request */ - u64 recv_req_id; /* request ID. */ -} fcopy_transaction; - -static void fcopy_respond_to_host(int error); -static void fcopy_send_data(struct work_struct *dummy); -static void fcopy_timeout_func(struct work_struct *dummy); -static DECLARE_DELAYED_WORK(fcopy_timeout_work, fcopy_timeout_func); -static DECLARE_WORK(fcopy_send_work, fcopy_send_data); -static const char fcopy_devname[] = "vmbus/hv_fcopy"; -static u8 *recv_buffer; -static struct hvutil_transport *hvt; -/* - * This state maintains the version number registered by the daemon. - */ -static int dm_reg_value; - -static void fcopy_poll_wrapper(void *channel) -{ - /* Transaction is finished, reset the state here to avoid races. 
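Stepping back to the hv_result_to_errno()/hv_result_to_string() helpers added to hv_common.c above: both are simple linear scans of the status table, with the U64_MAX case (no hypercall page) and unknown codes handled separately. A user-space sketch of the same lookup over a trimmed-down table (the numeric values are illustrative; the real codes come from hvhdk.h and the real errnos from errno.h):

#include <stdint.h>
#include <stdio.h>

struct status_info { const char *string; int errno_val; uint16_t code; };

/* Trimmed-down table in the spirit of hv_status_infos[]. */
static const struct status_info infos[] = {
    { "HV_STATUS_SUCCESS",              0,   0  },
    { "HV_STATUS_INVALID_PARAMETER",   -22,  5  },   /* -EINVAL */
    { "HV_STATUS_INSUFFICIENT_MEMORY", -12,  11 },   /* -ENOMEM */
};

static int result_to_errno(uint64_t status)
{
    uint16_t code = (uint16_t)status;      /* like hv_result(): the low 16 bits */

    if (status == UINT64_MAX)              /* hypercalls not possible at all */
        return -95;                        /* -EOPNOTSUPP */

    for (unsigned int i = 0; i < sizeof(infos) / sizeof(infos[0]); i++)
        if (infos[i].code == code)
            return infos[i].errno_val;

    return -5;                             /* -EIO fallback for unknown codes */
}

int main(void)
{
    printf("%d %d %d\n",
           result_to_errno(0), result_to_errno(5), result_to_errno(0x1234));
    return 0;
}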
*/ - fcopy_transaction.state = HVUTIL_READY; - tasklet_schedule(&((struct vmbus_channel *)channel)->callback_event); -} - -static void fcopy_timeout_func(struct work_struct *dummy) -{ - /* - * If the timer fires, the user-mode component has not responded; - * process the pending transaction. - */ - fcopy_respond_to_host(HV_E_FAIL); - hv_poll_channel(fcopy_transaction.recv_channel, fcopy_poll_wrapper); -} - -static void fcopy_register_done(void) -{ - pr_debug("FCP: userspace daemon registered\n"); - hv_poll_channel(fcopy_transaction.recv_channel, fcopy_poll_wrapper); -} - -static int fcopy_handle_handshake(u32 version) -{ - u32 our_ver = FCOPY_CURRENT_VERSION; - - switch (version) { - case FCOPY_VERSION_0: - /* Daemon doesn't expect us to reply */ - dm_reg_value = version; - break; - case FCOPY_VERSION_1: - /* Daemon expects us to reply with our own version */ - if (hvutil_transport_send(hvt, &our_ver, sizeof(our_ver), - fcopy_register_done)) - return -EFAULT; - dm_reg_value = version; - break; - default: - /* - * For now we will fail the registration. - * If and when we have multiple versions to - * deal with, we will be backward compatible. - * We will add this code when needed. - */ - return -EINVAL; - } - pr_debug("FCP: userspace daemon ver. %d connected\n", version); - return 0; -} - -static void fcopy_send_data(struct work_struct *dummy) -{ - struct hv_start_fcopy *smsg_out = NULL; - int operation = fcopy_transaction.fcopy_msg->operation; - struct hv_start_fcopy *smsg_in; - void *out_src; - int rc, out_len; - - /* - * The strings sent from the host are encoded in - * utf16; convert it to utf8 strings. - * The host assures us that the utf16 strings will not exceed - * the max lengths specified. We will however, reserve room - * for the string terminating character - in the utf16s_utf8s() - * function we limit the size of the buffer where the converted - * string is placed to W_MAX_PATH -1 to guarantee - * that the strings can be properly terminated! - */ - - switch (operation) { - case START_FILE_COPY: - out_len = sizeof(struct hv_start_fcopy); - smsg_out = kzalloc(sizeof(*smsg_out), GFP_KERNEL); - if (!smsg_out) - return; - - smsg_out->hdr.operation = operation; - smsg_in = (struct hv_start_fcopy *)fcopy_transaction.fcopy_msg; - - utf16s_to_utf8s((wchar_t *)smsg_in->file_name, W_MAX_PATH, - UTF16_LITTLE_ENDIAN, - (__u8 *)&smsg_out->file_name, W_MAX_PATH - 1); - - utf16s_to_utf8s((wchar_t *)smsg_in->path_name, W_MAX_PATH, - UTF16_LITTLE_ENDIAN, - (__u8 *)&smsg_out->path_name, W_MAX_PATH - 1); - - smsg_out->copy_flags = smsg_in->copy_flags; - smsg_out->file_size = smsg_in->file_size; - out_src = smsg_out; - break; - - case WRITE_TO_FILE: - out_src = fcopy_transaction.fcopy_msg; - out_len = sizeof(struct hv_do_fcopy); - break; - default: - out_src = fcopy_transaction.fcopy_msg; - out_len = fcopy_transaction.recv_len; - break; - } - - fcopy_transaction.state = HVUTIL_USERSPACE_REQ; - rc = hvutil_transport_send(hvt, out_src, out_len, NULL); - if (rc) { - pr_debug("FCP: failed to communicate to the daemon: %d\n", rc); - if (cancel_delayed_work_sync(&fcopy_timeout_work)) { - fcopy_respond_to_host(HV_E_FAIL); - fcopy_transaction.state = HVUTIL_READY; - } - } - kfree(smsg_out); -} - -/* - * Send a response back to the host. - */ - -static void -fcopy_respond_to_host(int error) -{ - struct icmsg_hdr *icmsghdr; - u32 buf_len; - struct vmbus_channel *channel; - u64 req_id; - - /* - * Copy the global state for completing the transaction. Note that - * only one transaction can be active at a time. 
This is guaranteed - * by the file copy protocol implemented by the host. Furthermore, - * the "transaction active" state we maintain ensures that there can - * only be one active transaction at a time. - */ - - buf_len = fcopy_transaction.recv_len; - channel = fcopy_transaction.recv_channel; - req_id = fcopy_transaction.recv_req_id; - - icmsghdr = (struct icmsg_hdr *) - &recv_buffer[sizeof(struct vmbuspipe_hdr)]; - - if (channel->onchannel_callback == NULL) - /* - * We have raced with util driver being unloaded; - * silently return. - */ - return; - - icmsghdr->status = error; - icmsghdr->icflags = ICMSGHDRFLAG_TRANSACTION | ICMSGHDRFLAG_RESPONSE; - vmbus_sendpacket(channel, recv_buffer, buf_len, req_id, - VM_PKT_DATA_INBAND, 0); -} - -void hv_fcopy_onchannelcallback(void *context) -{ - struct vmbus_channel *channel = context; - u32 recvlen; - u64 requestid; - struct hv_fcopy_hdr *fcopy_msg; - struct icmsg_hdr *icmsghdr; - int fcopy_srv_version; - - if (fcopy_transaction.state > HVUTIL_READY) - return; - - if (vmbus_recvpacket(channel, recv_buffer, HV_HYP_PAGE_SIZE * 2, &recvlen, &requestid)) { - pr_err_ratelimited("Fcopy request received. Could not read into recv buf\n"); - return; - } - - if (!recvlen) - return; - - /* Ensure recvlen is big enough to read header data */ - if (recvlen < ICMSG_HDR) { - pr_err_ratelimited("Fcopy request received. Packet length too small: %d\n", - recvlen); - return; - } - - icmsghdr = (struct icmsg_hdr *)&recv_buffer[ - sizeof(struct vmbuspipe_hdr)]; - - if (icmsghdr->icmsgtype == ICMSGTYPE_NEGOTIATE) { - if (vmbus_prep_negotiate_resp(icmsghdr, - recv_buffer, recvlen, - fw_versions, FW_VER_COUNT, - fcopy_versions, FCOPY_VER_COUNT, - NULL, &fcopy_srv_version)) { - - pr_info("FCopy IC version %d.%d\n", - fcopy_srv_version >> 16, - fcopy_srv_version & 0xFFFF); - } - } else if (icmsghdr->icmsgtype == ICMSGTYPE_FCOPY) { - /* Ensure recvlen is big enough to contain hv_fcopy_hdr */ - if (recvlen < ICMSG_HDR + sizeof(struct hv_fcopy_hdr)) { - pr_err_ratelimited("Invalid Fcopy hdr. Packet length too small: %u\n", - recvlen); - return; - } - fcopy_msg = (struct hv_fcopy_hdr *)&recv_buffer[ICMSG_HDR]; - - /* - * Stash away this global state for completing the - * transaction; note transactions are serialized. - */ - - fcopy_transaction.recv_len = recvlen; - fcopy_transaction.recv_req_id = requestid; - fcopy_transaction.fcopy_msg = fcopy_msg; - - if (fcopy_transaction.state < HVUTIL_READY) { - /* Userspace is not registered yet */ - fcopy_respond_to_host(HV_E_FAIL); - return; - } - fcopy_transaction.state = HVUTIL_HOSTMSG_RECEIVED; - - /* - * Send the information to the user-level daemon. - */ - schedule_work(&fcopy_send_work); - schedule_delayed_work(&fcopy_timeout_work, - HV_UTIL_TIMEOUT * HZ); - return; - } else { - pr_err_ratelimited("Fcopy request received. Invalid msg type: %d\n", - icmsghdr->icmsgtype); - return; - } - icmsghdr->icflags = ICMSGHDRFLAG_TRANSACTION | ICMSGHDRFLAG_RESPONSE; - vmbus_sendpacket(channel, recv_buffer, recvlen, requestid, - VM_PKT_DATA_INBAND, 0); -} - -/* Callback when data is received from userspace */ -static int fcopy_on_msg(void *msg, int len) -{ - int *val = (int *)msg; - - if (len != sizeof(int)) - return -EINVAL; - - if (fcopy_transaction.state == HVUTIL_DEVICE_INIT) - return fcopy_handle_handshake(*val); - - if (fcopy_transaction.state != HVUTIL_USERSPACE_REQ) - return -EINVAL; - - /* - * Complete the transaction by forwarding the result - * to the host. But first, cancel the timeout. 
- */ - if (cancel_delayed_work_sync(&fcopy_timeout_work)) { - fcopy_transaction.state = HVUTIL_USERSPACE_RECV; - fcopy_respond_to_host(*val); - hv_poll_channel(fcopy_transaction.recv_channel, - fcopy_poll_wrapper); - } - - return 0; -} - -static void fcopy_on_reset(void) -{ - /* - * The daemon has exited; reset the state. - */ - fcopy_transaction.state = HVUTIL_DEVICE_INIT; - - if (cancel_delayed_work_sync(&fcopy_timeout_work)) - fcopy_respond_to_host(HV_E_FAIL); -} - -int hv_fcopy_init(struct hv_util_service *srv) -{ - recv_buffer = srv->recv_buffer; - fcopy_transaction.recv_channel = srv->channel; - fcopy_transaction.recv_channel->max_pkt_size = HV_HYP_PAGE_SIZE * 2; - - /* - * When this driver loads, the user level daemon that - * processes the host requests may not yet be running. - * Defer processing channel callbacks until the daemon - * has registered. - */ - fcopy_transaction.state = HVUTIL_DEVICE_INIT; - - hvt = hvutil_transport_init(fcopy_devname, 0, 0, - fcopy_on_msg, fcopy_on_reset); - if (!hvt) - return -EFAULT; - - return 0; -} - -static void hv_fcopy_cancel_work(void) -{ - cancel_delayed_work_sync(&fcopy_timeout_work); - cancel_work_sync(&fcopy_send_work); -} - -int hv_fcopy_pre_suspend(void) -{ - struct vmbus_channel *channel = fcopy_transaction.recv_channel; - struct hv_fcopy_hdr *fcopy_msg; - - /* - * Fake a CANCEL_FCOPY message for the user space daemon in case the - * daemon is in the middle of copying some file. It doesn't matter if - * there is already a message pending to be delivered to the user - * space since we force fcopy_transaction.state to be HVUTIL_READY, so - * the user space daemon's write() will fail with EINVAL (see - * fcopy_on_msg()), and the daemon will reset the device by closing - * and re-opening it. - */ - fcopy_msg = kzalloc(sizeof(*fcopy_msg), GFP_KERNEL); - if (!fcopy_msg) - return -ENOMEM; - - tasklet_disable(&channel->callback_event); - - fcopy_msg->operation = CANCEL_FCOPY; - - hv_fcopy_cancel_work(); - - /* We don't care about the return value. */ - hvutil_transport_send(hvt, fcopy_msg, sizeof(*fcopy_msg), NULL); - - kfree(fcopy_msg); - - fcopy_transaction.state = HVUTIL_READY; - - /* tasklet_enable() will be called in hv_fcopy_pre_resume(). 
*/ - return 0; -} - -int hv_fcopy_pre_resume(void) -{ - struct vmbus_channel *channel = fcopy_transaction.recv_channel; - - tasklet_enable(&channel->callback_event); - - return 0; -} - -void hv_fcopy_deinit(void) -{ - fcopy_transaction.state = HVUTIL_DEVICE_DYING; - - hv_fcopy_cancel_work(); - - hvutil_transport_destroy(hvt); -} diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c index d35b60c06114..62795f6cbb00 100644 --- a/drivers/hv/hv_kvp.c +++ b/drivers/hv/hv_kvp.c @@ -27,7 +27,7 @@ #include <linux/connector.h> #include <linux/workqueue.h> #include <linux/hyperv.h> -#include <asm/hyperv-tlfs.h> +#include <hyperv/hvhdk.h> #include "hyperv_vmbus.h" #include "hv_utils_transport.h" @@ -655,7 +655,7 @@ void hv_kvp_onchannelcallback(void *context) if (host_negotiatied == NEGO_NOT_STARTED) { host_negotiatied = NEGO_IN_PROGRESS; schedule_delayed_work(&kvp_host_handshake_work, - HV_UTIL_NEGO_TIMEOUT * HZ); + secs_to_jiffies(HV_UTIL_NEGO_TIMEOUT)); } return; } @@ -724,7 +724,7 @@ void hv_kvp_onchannelcallback(void *context) */ schedule_work(&kvp_sendkey_work); schedule_delayed_work(&kvp_timeout_work, - HV_UTIL_TIMEOUT * HZ); + secs_to_jiffies(HV_UTIL_TIMEOUT)); return; @@ -767,6 +767,12 @@ hv_kvp_init(struct hv_util_service *srv) */ kvp_transaction.state = HVUTIL_DEVICE_INIT; + return 0; +} + +int +hv_kvp_init_transport(void) +{ hvt = hvutil_transport_init(kvp_devname, CN_KVP_IDX, CN_KVP_VAL, kvp_on_msg, kvp_on_reset); if (!hvt) diff --git a/drivers/hv/hv_proc.c b/drivers/hv/hv_proc.c new file mode 100644 index 000000000000..fbb4eb3901bb --- /dev/null +++ b/drivers/hv/hv_proc.c @@ -0,0 +1,196 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/types.h> +#include <linux/vmalloc.h> +#include <linux/mm.h> +#include <linux/clockchips.h> +#include <linux/slab.h> +#include <linux/cpuhotplug.h> +#include <linux/minmax.h> +#include <linux/export.h> +#include <asm/mshyperv.h> + +/* + * See struct hv_deposit_memory. The first u64 is partition ID, the rest + * are GPAs. + */ +#define HV_DEPOSIT_MAX (HV_HYP_PAGE_SIZE / sizeof(u64) - 1) + +/* Deposits exact number of pages. Must be called with interrupts enabled. 
*/ +int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages) +{ + struct page **pages, *page; + int *counts; + int num_allocations; + int i, j, page_count; + int order; + u64 status; + int ret; + u64 base_pfn; + struct hv_deposit_memory *input_page; + unsigned long flags; + + if (num_pages > HV_DEPOSIT_MAX) + return -E2BIG; + if (!num_pages) + return 0; + + /* One buffer for page pointers and counts */ + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + pages = page_address(page); + + counts = kcalloc(HV_DEPOSIT_MAX, sizeof(int), GFP_KERNEL); + if (!counts) { + free_page((unsigned long)pages); + return -ENOMEM; + } + + /* Allocate all the pages before disabling interrupts */ + i = 0; + + while (num_pages) { + /* Find highest order we can actually allocate */ + order = 31 - __builtin_clz(num_pages); + + while (1) { + pages[i] = alloc_pages_node(node, GFP_KERNEL, order); + if (pages[i]) + break; + if (!order) { + ret = -ENOMEM; + num_allocations = i; + goto err_free_allocations; + } + --order; + } + + split_page(pages[i], order); + counts[i] = 1 << order; + num_pages -= counts[i]; + i++; + } + num_allocations = i; + + local_irq_save(flags); + + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); + + input_page->partition_id = partition_id; + + /* Populate gpa_page_list - these will fit on the input page */ + for (i = 0, page_count = 0; i < num_allocations; ++i) { + base_pfn = page_to_pfn(pages[i]); + for (j = 0; j < counts[i]; ++j, ++page_count) + input_page->gpa_page_list[page_count] = base_pfn + j; + } + status = hv_do_rep_hypercall(HVCALL_DEPOSIT_MEMORY, + page_count, 0, input_page, NULL); + local_irq_restore(flags); + if (!hv_result_success(status)) { + hv_status_err(status, "\n"); + ret = hv_result_to_errno(status); + goto err_free_allocations; + } + + ret = 0; + goto free_buf; + +err_free_allocations: + for (i = 0; i < num_allocations; ++i) { + base_pfn = page_to_pfn(pages[i]); + for (j = 0; j < counts[i]; ++j) + __free_page(pfn_to_page(base_pfn + j)); + } + +free_buf: + free_page((unsigned long)pages); + kfree(counts); + return ret; +} +EXPORT_SYMBOL_GPL(hv_call_deposit_pages); + +int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id) +{ + struct hv_input_add_logical_processor *input; + struct hv_output_add_logical_processor *output; + u64 status; + unsigned long flags; + int ret = 0; + + /* + * When adding a logical processor, the hypervisor may return + * HV_STATUS_INSUFFICIENT_MEMORY. When that happens, we deposit more + * pages and retry. 
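+ * The retry loop below reissues HVCALL_ADD_LOGICAL_PROCESSOR, depositing
+ * one extra page into the current partition after each
+ * HV_STATUS_INSUFFICIENT_MEMORY result; any other status ends the loop.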
+ */ + do { + local_irq_save(flags); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + /* We don't do anything with the output right now */ + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + input->lp_index = lp_index; + input->apic_id = apic_id; + input->proximity_domain_info = hv_numa_node_to_pxm_info(node); + status = hv_do_hypercall(HVCALL_ADD_LOGICAL_PROCESSOR, + input, output); + local_irq_restore(flags); + + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + if (!hv_result_success(status)) { + hv_status_err(status, "cpu %u apic ID: %u\n", + lp_index, apic_id); + ret = hv_result_to_errno(status); + } + break; + } + ret = hv_call_deposit_pages(node, hv_current_partition_id, 1); + } while (!ret); + + return ret; +} + +int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags) +{ + struct hv_create_vp *input; + u64 status; + unsigned long irq_flags; + int ret = 0; + + /* Root VPs don't seem to need pages deposited */ + if (partition_id != hv_current_partition_id) { + /* The value 90 is empirically determined. It may change. */ + ret = hv_call_deposit_pages(node, partition_id, 90); + if (ret) + return ret; + } + + do { + local_irq_save(irq_flags); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + + input->partition_id = partition_id; + input->vp_index = vp_index; + input->flags = flags; + input->subnode_type = HV_SUBNODE_ANY; + input->proximity_domain_info = hv_numa_node_to_pxm_info(node); + status = hv_do_hypercall(HVCALL_CREATE_VP, input, NULL); + local_irq_restore(irq_flags); + + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + if (!hv_result_success(status)) { + hv_status_err(status, "vcpu: %u, lp: %u\n", + vp_index, flags); + ret = hv_result_to_errno(status); + } + break; + } + ret = hv_call_deposit_pages(node, partition_id, 1); + + } while (!ret); + + return ret; +} +EXPORT_SYMBOL_GPL(hv_call_create_vp); diff --git a/drivers/hv/hv_snapshot.c b/drivers/hv/hv_snapshot.c index 0d2184be1691..2e7f537d53cf 100644 --- a/drivers/hv/hv_snapshot.c +++ b/drivers/hv/hv_snapshot.c @@ -12,7 +12,7 @@ #include <linux/connector.h> #include <linux/workqueue.h> #include <linux/hyperv.h> -#include <asm/hyperv-tlfs.h> +#include <hyperv/hvhdk.h> #include "hyperv_vmbus.h" #include "hv_utils_transport.h" @@ -193,7 +193,8 @@ static void vss_send_op(void) vss_transaction.state = HVUTIL_USERSPACE_REQ; schedule_delayed_work(&vss_timeout_work, op == VSS_OP_FREEZE ? 
- VSS_FREEZE_TIMEOUT * HZ : HV_UTIL_TIMEOUT * HZ); + secs_to_jiffies(VSS_FREEZE_TIMEOUT) : + secs_to_jiffies(HV_UTIL_TIMEOUT)); rc = hvutil_transport_send(hvt, vss_msg, sizeof(*vss_msg), NULL); if (rc) { @@ -388,6 +389,12 @@ hv_vss_init(struct hv_util_service *srv) */ vss_transaction.state = HVUTIL_DEVICE_INIT; + return 0; +} + +int +hv_vss_init_transport(void) +{ hvt = hvutil_transport_init(vss_devname, CN_VSS_IDX, CN_VSS_VAL, vss_on_msg, vss_on_reset); if (!hvt) { diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c index d776074b49cb..7e9c8e169c66 100644 --- a/drivers/hv/hv_util.c +++ b/drivers/hv/hv_util.c @@ -141,6 +141,7 @@ static struct hv_util_service util_heartbeat = { static struct hv_util_service util_kvp = { .util_cb = hv_kvp_onchannelcallback, .util_init = hv_kvp_init, + .util_init_transport = hv_kvp_init_transport, .util_pre_suspend = hv_kvp_pre_suspend, .util_pre_resume = hv_kvp_pre_resume, .util_deinit = hv_kvp_deinit, @@ -149,19 +150,12 @@ static struct hv_util_service util_kvp = { static struct hv_util_service util_vss = { .util_cb = hv_vss_onchannelcallback, .util_init = hv_vss_init, + .util_init_transport = hv_vss_init_transport, .util_pre_suspend = hv_vss_pre_suspend, .util_pre_resume = hv_vss_pre_resume, .util_deinit = hv_vss_deinit, }; -static struct hv_util_service util_fcopy = { - .util_cb = hv_fcopy_onchannelcallback, - .util_init = hv_fcopy_init, - .util_pre_suspend = hv_fcopy_pre_suspend, - .util_pre_resume = hv_fcopy_pre_resume, - .util_deinit = hv_fcopy_deinit, -}; - static void perform_shutdown(struct work_struct *dummy) { orderly_poweroff(true); @@ -296,6 +290,11 @@ static struct { spinlock_t lock; } host_ts; +static bool timesync_implicit; + +module_param(timesync_implicit, bool, 0644); +MODULE_PARM_DESC(timesync_implicit, "If set treat SAMPLE as SYNC when clock is behind"); + static inline u64 reftime_to_ns(u64 reftime) { return (reftime - WLTIMEDELTA) * 100; @@ -345,6 +344,29 @@ static void hv_set_host_time(struct work_struct *work) } /* + * Due to a bug on Hyper-V hosts, the sync flag may not always be sent on resume. + * Force a sync if the guest is behind. + */ +static inline bool hv_implicit_sync(u64 host_time) +{ + struct timespec64 new_ts; + struct timespec64 threshold_ts; + + new_ts = ns_to_timespec64(reftime_to_ns(host_time)); + ktime_get_real_ts64(&threshold_ts); + + threshold_ts.tv_sec += 5; + + /* + * If guest behind the host by 5 or more seconds. + */ + if (timespec64_compare(&new_ts, &threshold_ts) >= 0) + return true; + + return false; +} + +/* * Synchronize time with host after reboot, restore, etc. * * ICTIMESYNCFLAG_SYNC flag bit indicates reboot, restore events of the VM. 
@@ -384,7 +406,8 @@ static inline void adj_guesttime(u64 hosttime, u64 reftime, u8 adj_flags) spin_unlock_irqrestore(&host_ts.lock, flags); /* Schedule work to do do_settimeofday64() */ - if (adj_flags & ICTIMESYNCFLAG_SYNC) + if ((adj_flags & ICTIMESYNCFLAG_SYNC) || + (timesync_implicit && hv_implicit_sync(host_ts.host_time))) schedule_work(&adj_time_work); } @@ -563,16 +586,14 @@ static int util_probe(struct hv_device *dev, (struct hv_util_service *)dev_id->driver_data; int ret; - srv->recv_buffer = kmalloc(HV_HYP_PAGE_SIZE * 4, GFP_KERNEL); + srv->recv_buffer = kmalloc_array(4, HV_HYP_PAGE_SIZE, GFP_KERNEL); if (!srv->recv_buffer) return -ENOMEM; srv->channel = dev->channel; if (srv->util_init) { ret = srv->util_init(srv); - if (ret) { - ret = -ENODEV; + if (ret) goto error1; - } } /* @@ -592,6 +613,13 @@ static int util_probe(struct hv_device *dev, if (ret) goto error; + if (srv->util_init_transport) { + ret = srv->util_init_transport(); + if (ret) { + vmbus_close(dev->channel); + goto error; + } + } return 0; error: @@ -602,7 +630,7 @@ error1: return ret; } -static int util_remove(struct hv_device *dev) +static void util_remove(struct hv_device *dev) { struct hv_util_service *srv = hv_get_drvdata(dev); @@ -610,8 +638,6 @@ static int util_remove(struct hv_device *dev) srv->util_deinit(); vmbus_close(dev->channel); kfree(srv->recv_buffer); - - return 0; } /* @@ -673,10 +699,6 @@ static const struct hv_vmbus_device_id id_table[] = { { HV_VSS_GUID, .driver_data = (unsigned long)&util_vss }, - /* File copy GUID */ - { HV_FCOPY_GUID, - .driver_data = (unsigned long)&util_fcopy - }, { }, }; diff --git a/drivers/hv/hv_utils_transport.c b/drivers/hv/hv_utils_transport.c index 832885198643..b3de35ff6334 100644 --- a/drivers/hv/hv_utils_transport.c +++ b/drivers/hv/hv_utils_transport.c @@ -129,8 +129,7 @@ static int hvt_op_open(struct inode *inode, struct file *file) * device gets released. */ hvt->mode = HVUTIL_TRANSPORT_CHARDEV; - } - else if (hvt->mode == HVUTIL_TRANSPORT_NETLINK) { + } else if (hvt->mode == HVUTIL_TRANSPORT_NETLINK) { /* * We're switching from netlink communication to using char * device. Issue the reset first. 
@@ -195,7 +194,7 @@ static void hvt_cn_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp) } spin_unlock(&hvt_list_lock); if (!hvt_found) { - pr_warn("hvt_cn_callback: spurious message received!\n"); + pr_warn("%s: spurious message received!\n", __func__); return; } @@ -210,7 +209,7 @@ static void hvt_cn_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp) if (hvt->mode == HVUTIL_TRANSPORT_NETLINK) hvt_found->on_msg(msg->data, msg->len); else - pr_warn("hvt_cn_callback: unexpected netlink message!\n"); + pr_warn("%s: unexpected netlink message!\n", __func__); mutex_unlock(&hvt->lock); } @@ -260,8 +259,9 @@ int hvutil_transport_send(struct hvutil_transport *hvt, void *msg, int len, hvt->outmsg_len = len; hvt->on_read = on_read_cb; wake_up_interruptible(&hvt->outmsg_q); - } else + } else { ret = -ENOMEM; + } out_unlock: mutex_unlock(&hvt->lock); return ret; diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index dc673edf053c..b2862e0a317a 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -15,10 +15,11 @@ #include <linux/list.h> #include <linux/bitops.h> #include <asm/sync_bitops.h> -#include <asm/hyperv-tlfs.h> +#include <asm/mshyperv.h> #include <linux/atomic.h> #include <linux/hyperv.h> #include <linux/interrupt.h> +#include <hyperv/hvhdk.h> #include "hv_trace.h" @@ -32,6 +33,7 @@ */ #define HV_UTIL_NEGO_TIMEOUT 55 +void vmbus_isr(void); /* Definitions for the monitored notification facility */ union hv_monitor_trigger_group { @@ -120,10 +122,35 @@ enum { * Per cpu state for channel handling */ struct hv_per_cpu_context { - void *synic_message_page; - void *synic_event_page; /* - * buffer to post messages to the host. + * SynIC pages for communicating with the host. + * + * These pages are accessible to the host partition and the hypervisor. + * They may be used for exchanging data with the host partition and the + * hypervisor even when they aren't trusted yet the guest partition + * must be prepared to handle the malicious behavior. + */ + void *hyp_synic_message_page; + void *hyp_synic_event_page; + /* + * SynIC pages for communicating with the paravisor. + * + * These pages may be accessed from within the guest partition only in + * CoCo VMs. Neither the host partition nor the hypervisor can access + * these pages in that case; they are used for exchanging data with the + * paravisor. + */ + void *para_synic_message_page; + void *para_synic_event_page; + + /* + * The page is only used in hv_post_message() for a TDX VM (with the + * paravisor) to post a messages to Hyper-V: when such a VM calls + * HVCALL_POST_MESSAGE, it can't use the hyperv_pcpu_input_arg (which + * is encrypted in such a VM) as the hypercall input page, because + * the input page for HVCALL_POST_MESSAGE must be decrypted in such a + * VM, so post_msg_page (which is decrypted in hv_synic_alloc()) is + * introduced for this purpose. See hyperv_init() for more comments. 
*/ void *post_msg_page; @@ -164,10 +191,10 @@ extern int hv_synic_alloc(void); extern void hv_synic_free(void); -extern void hv_synic_enable_regs(unsigned int cpu); +extern void hv_hyp_synic_enable_regs(unsigned int cpu); extern int hv_synic_init(unsigned int cpu); -extern void hv_synic_disable_regs(unsigned int cpu); +extern void hv_hyp_synic_disable_regs(unsigned int cpu); extern int hv_synic_cleanup(unsigned int cpu); /* Interface */ @@ -175,7 +202,8 @@ extern int hv_synic_cleanup(unsigned int cpu); void hv_ringbuffer_pre_init(struct vmbus_channel *channel); int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info, - struct page *pages, u32 pagecnt, u32 max_pkt_size); + struct page *pages, u32 pagecnt, u32 max_pkt_size, + bool confidential); void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info); @@ -241,8 +269,6 @@ struct vmbus_connection { * is child->parent notification */ struct hv_monitor_page *monitor_pages[2]; - void *monitor_pages_original[2]; - phys_addr_t monitor_pages_pa[2]; struct list_head chn_msg_list; spinlock_t channelmsg_lock; @@ -282,18 +308,10 @@ struct vmbus_connection { struct completion ready_for_suspend_event; /* - * The number of primary channels that should be "fixed up" - * upon resume: these channels are re-offered upon resume, and some - * fields of the channel offers (i.e. child_relid and connection_id) - * can change, so the old offermsg must be fixed up, before the resume - * callbacks of the VSC drivers start to further touch the channels. + * Completed once the host has offered all boot-time channels. + * Note that some channels may still be under process on a workqueue. */ - atomic_t nr_chan_fixup_on_resume; - /* - * vmbus_bus_resume() waits for "nr_chan_fixup_on_resume" to - * drop to zero. - */ - struct completion ready_for_resume_event; + struct completion all_offers_delivered_event; }; @@ -336,6 +354,51 @@ extern const struct vmbus_channel_message_table_entry /* General vmbus interface */ +bool vmbus_is_confidential(void); + +#if IS_ENABLED(CONFIG_HYPERV_VMBUS) +/* Free the message slot and signal end-of-message if required */ +static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) +{ + /* + * On crash we're reading some other CPU's message page and we need + * to be careful: this other CPU may already had cleared the header + * and the host may already had delivered some other message there. + * In case we blindly write msg->header.message_type we're going + * to lose it. We can still lose a message of the same type but + * we count on the fact that there can only be one + * CHANNELMSG_UNLOAD_RESPONSE and we don't care about other messages + * on crash. + */ + if (cmpxchg(&msg->header.message_type, old_msg_type, + HVMSG_NONE) != old_msg_type) + return; + + /* + * The cmxchg() above does an implicit memory barrier to + * ensure the write to MessageType (ie set to + * HVMSG_NONE) happens before we read the + * MessagePending and EOMing. 
Otherwise, the EOMing + * will not deliver any more messages since there is + * no empty slot + */ + if (msg->header.message_flags.msg_pending) { + /* + * This will cause message queue rescan to + * possibly deliver another msg from the + * hypervisor + */ + if (vmbus_is_confidential()) + hv_para_set_synic_register(HV_MSR_EOM, 0); + else + hv_set_msr(HV_MSR_EOM, 0); + } +} + +extern int vmbus_interrupt; +extern int vmbus_irq; +#endif /* CONFIG_HYPERV_VMBUS */ + struct hv_device *vmbus_device_create(const guid_t *type, const guid_t *instance, struct vmbus_channel *channel); @@ -365,22 +428,18 @@ void vmbus_on_event(unsigned long data); void vmbus_on_msg_dpc(unsigned long data); int hv_kvp_init(struct hv_util_service *srv); +int hv_kvp_init_transport(void); void hv_kvp_deinit(void); int hv_kvp_pre_suspend(void); int hv_kvp_pre_resume(void); void hv_kvp_onchannelcallback(void *context); int hv_vss_init(struct hv_util_service *srv); +int hv_vss_init_transport(void); void hv_vss_deinit(void); int hv_vss_pre_suspend(void); int hv_vss_pre_resume(void); void hv_vss_onchannelcallback(void *context); - -int hv_fcopy_init(struct hv_util_service *srv); -void hv_fcopy_deinit(void); -int hv_fcopy_pre_suspend(void); -int hv_fcopy_pre_resume(void); -void hv_fcopy_onchannelcallback(void *context); void vmbus_initiate_unload(bool crash); static inline void hv_poll_channel(struct vmbus_channel *channel, @@ -412,6 +471,11 @@ static inline bool hv_is_perf_channel(struct vmbus_channel *channel) return vmbus_devs[channel->device_id].perf_device; } +static inline size_t hv_dev_ring_size(struct vmbus_channel *channel) +{ + return vmbus_devs[channel->device_id].pref_ring_size; +} + static inline bool hv_is_allocated_cpu(unsigned int cpu) { struct vmbus_channel *channel, *sc; @@ -479,4 +543,10 @@ static inline int hv_debug_add_dev_dir(struct hv_device *dev) #endif /* CONFIG_HYPERV_TESTING */ +/* Create and remove sysfs entry for memory mapped ring buffers for a channel */ +int hv_create_ring_sysfs(struct vmbus_channel *channel, + int (*hv_mmap_ring_buffer)(struct vmbus_channel *channel, + struct vm_area_struct *vma)); +int hv_remove_ring_sysfs(struct vmbus_channel *channel); + #endif /* _HYPERV_VMBUS_H */ diff --git a/drivers/hv/mshv.h b/drivers/hv/mshv.h new file mode 100644 index 000000000000..d4813df92b9c --- /dev/null +++ b/drivers/hv/mshv.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2023, Microsoft Corporation. + */ + +#ifndef _MSHV_H_ +#define _MSHV_H_ + +#include <linux/stddef.h> +#include <linux/string.h> +#include <hyperv/hvhdk.h> + +#define mshv_field_nonzero(STRUCT, MEMBER) \ + memchr_inv(&((STRUCT).MEMBER), \ + 0, sizeof_field(typeof(STRUCT), MEMBER)) + +int hv_call_get_vp_registers(u32 vp_index, u64 partition_id, u16 count, + union hv_input_vtl input_vtl, + struct hv_register_assoc *registers); + +int hv_call_set_vp_registers(u32 vp_index, u64 partition_id, u16 count, + union hv_input_vtl input_vtl, + struct hv_register_assoc *registers); + +int hv_call_get_partition_property(u64 partition_id, u64 property_code, + u64 *property_value); + +#endif /* _MSHV_H */ diff --git a/drivers/hv/mshv_common.c b/drivers/hv/mshv_common.c new file mode 100644 index 000000000000..58027b23c206 --- /dev/null +++ b/drivers/hv/mshv_common.c @@ -0,0 +1,239 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024, Microsoft Corporation. + * + * This file contains functions that will be called from one or more modules. 
+ * If any of these modules are configured to build, this file is built and just + * statically linked in. + * + * Authors: Microsoft Linux virtualization team + */ + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <asm/mshyperv.h> +#include <linux/resume_user_mode.h> +#include <linux/export.h> +#include <linux/acpi.h> +#include <linux/notifier.h> +#include <linux/reboot.h> + +#include "mshv.h" + +#define HV_GET_REGISTER_BATCH_SIZE \ + (HV_HYP_PAGE_SIZE / sizeof(union hv_register_value)) +#define HV_SET_REGISTER_BATCH_SIZE \ + ((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_set_vp_registers)) \ + / sizeof(struct hv_register_assoc)) + +int hv_call_get_vp_registers(u32 vp_index, u64 partition_id, u16 count, + union hv_input_vtl input_vtl, + struct hv_register_assoc *registers) +{ + struct hv_input_get_vp_registers *input_page; + union hv_register_value *output_page; + u16 completed = 0; + unsigned long remaining = count; + int rep_count, i; + u64 status = HV_STATUS_SUCCESS; + unsigned long flags; + + local_irq_save(flags); + + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); + output_page = *this_cpu_ptr(hyperv_pcpu_output_arg); + + input_page->partition_id = partition_id; + input_page->vp_index = vp_index; + input_page->input_vtl.as_uint8 = input_vtl.as_uint8; + input_page->rsvd_z8 = 0; + input_page->rsvd_z16 = 0; + + while (remaining) { + rep_count = min(remaining, HV_GET_REGISTER_BATCH_SIZE); + for (i = 0; i < rep_count; ++i) + input_page->names[i] = registers[i].name; + + status = hv_do_rep_hypercall(HVCALL_GET_VP_REGISTERS, rep_count, + 0, input_page, output_page); + if (!hv_result_success(status)) + break; + + completed = hv_repcomp(status); + for (i = 0; i < completed; ++i) + registers[i].value = output_page[i]; + + registers += completed; + remaining -= completed; + } + local_irq_restore(flags); + + return hv_result_to_errno(status); +} +EXPORT_SYMBOL_GPL(hv_call_get_vp_registers); + +int hv_call_set_vp_registers(u32 vp_index, u64 partition_id, u16 count, + union hv_input_vtl input_vtl, + struct hv_register_assoc *registers) +{ + struct hv_input_set_vp_registers *input_page; + u16 completed = 0; + unsigned long remaining = count; + int rep_count; + u64 status = HV_STATUS_SUCCESS; + unsigned long flags; + + local_irq_save(flags); + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); + + input_page->partition_id = partition_id; + input_page->vp_index = vp_index; + input_page->input_vtl.as_uint8 = input_vtl.as_uint8; + input_page->rsvd_z8 = 0; + input_page->rsvd_z16 = 0; + + while (remaining) { + rep_count = min(remaining, HV_SET_REGISTER_BATCH_SIZE); + memcpy(input_page->elements, registers, + sizeof(struct hv_register_assoc) * rep_count); + + status = hv_do_rep_hypercall(HVCALL_SET_VP_REGISTERS, rep_count, + 0, input_page, NULL); + if (!hv_result_success(status)) + break; + + completed = hv_repcomp(status); + registers += completed; + remaining -= completed; + } + + local_irq_restore(flags); + + return hv_result_to_errno(status); +} +EXPORT_SYMBOL_GPL(hv_call_set_vp_registers); + +int hv_call_get_partition_property(u64 partition_id, + u64 property_code, + u64 *property_value) +{ + u64 status; + unsigned long flags; + struct hv_input_get_partition_property *input; + struct hv_output_get_partition_property *output; + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + memset(input, 0, sizeof(*input)); + input->partition_id = partition_id; + input->property_code = property_code; + status = 
hv_do_hypercall(HVCALL_GET_PARTITION_PROPERTY, input, output); + + if (!hv_result_success(status)) { + local_irq_restore(flags); + return hv_result_to_errno(status); + } + *property_value = output->property_value; + + local_irq_restore(flags); + + return 0; +} +EXPORT_SYMBOL_GPL(hv_call_get_partition_property); + +/* + * Corresponding sleep states have to be initialized in order for a subsequent + * HVCALL_ENTER_SLEEP_STATE call to succeed. Currently only S5 state as per + * ACPI 6.4 chapter 7.4.2 is relevant, while S1, S2 and S3 can be supported. + * + * In order to pass proper PM values to mshv, ACPI should be initialized and + * should support S5 sleep state when this method is invoked. + */ +static int hv_initialize_sleep_states(void) +{ + u64 status; + unsigned long flags; + struct hv_input_set_system_property *in; + acpi_status acpi_status; + u8 sleep_type_a, sleep_type_b; + + if (!acpi_sleep_state_supported(ACPI_STATE_S5)) { + pr_err("%s: S5 sleep state not supported.\n", __func__); + return -ENODEV; + } + + acpi_status = acpi_get_sleep_type_data(ACPI_STATE_S5, &sleep_type_a, + &sleep_type_b); + if (ACPI_FAILURE(acpi_status)) + return -ENODEV; + + local_irq_save(flags); + in = *this_cpu_ptr(hyperv_pcpu_input_arg); + memset(in, 0, sizeof(*in)); + + in->property_id = HV_SYSTEM_PROPERTY_SLEEP_STATE; + in->set_sleep_state_info.sleep_state = HV_SLEEP_STATE_S5; + in->set_sleep_state_info.pm1a_slp_typ = sleep_type_a; + in->set_sleep_state_info.pm1b_slp_typ = sleep_type_b; + + status = hv_do_hypercall(HVCALL_SET_SYSTEM_PROPERTY, in, NULL); + local_irq_restore(flags); + + if (!hv_result_success(status)) { + hv_status_err(status, "\n"); + return hv_result_to_errno(status); + } + + return 0; +} + +/* + * This notifier initializes sleep states in mshv hypervisor which will be + * used during power off. + */ +static int hv_reboot_notifier_handler(struct notifier_block *this, + unsigned long code, void *another) +{ + int ret = 0; + + if (code == SYS_HALT || code == SYS_POWER_OFF) + ret = hv_initialize_sleep_states(); + + return ret ? NOTIFY_DONE : NOTIFY_OK; +} + +static struct notifier_block hv_reboot_notifier = { + .notifier_call = hv_reboot_notifier_handler, +}; + +void hv_sleep_notifiers_register(void) +{ + int ret; + + ret = register_reboot_notifier(&hv_reboot_notifier); + if (ret) + pr_err("%s: cannot register reboot notifier %d\n", __func__, + ret); +} + +/* + * Power off the machine by entering S5 sleep state via Hyper-V hypercall. + * This call does not return if successful. + */ +void hv_machine_power_off(void) +{ + unsigned long flags; + struct hv_input_enter_sleep_state *in; + + local_irq_save(flags); + in = *this_cpu_ptr(hyperv_pcpu_input_arg); + in->sleep_state = HV_SLEEP_STATE_S5; + + (void)hv_do_hypercall(HVCALL_ENTER_SLEEP_STATE, in, NULL); + local_irq_restore(flags); + + /* should never reach here */ + BUG(); + +} diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c new file mode 100644 index 000000000000..d93a18f09c76 --- /dev/null +++ b/drivers/hv/mshv_eventfd.c @@ -0,0 +1,839 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * eventfd support for mshv + * + * Heavily inspired from KVM implementation of irqfd/ioeventfd. The basic + * framework code is taken from the kvm implementation. + * + * All credits to kvm developers. 
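+ *
+ * irqfd: deliver an interrupt into the guest partition when a host-side
+ * eventfd is signalled.
+ * ioeventfd: signal a host-side eventfd when the guest writes to a
+ * registered MMIO (doorbell) address.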
+ */ + +#include <linux/syscalls.h> +#include <linux/wait.h> +#include <linux/poll.h> +#include <linux/file.h> +#include <linux/list.h> +#include <linux/workqueue.h> +#include <linux/eventfd.h> + +#if IS_ENABLED(CONFIG_X86_64) +#include <asm/apic.h> +#endif +#include <asm/mshyperv.h> + +#include "mshv_eventfd.h" +#include "mshv.h" +#include "mshv_root.h" + +static struct workqueue_struct *irqfd_cleanup_wq; + +void mshv_register_irq_ack_notifier(struct mshv_partition *partition, + struct mshv_irq_ack_notifier *mian) +{ + mutex_lock(&partition->pt_irq_lock); + hlist_add_head_rcu(&mian->link, &partition->irq_ack_notifier_list); + mutex_unlock(&partition->pt_irq_lock); +} + +void mshv_unregister_irq_ack_notifier(struct mshv_partition *partition, + struct mshv_irq_ack_notifier *mian) +{ + mutex_lock(&partition->pt_irq_lock); + hlist_del_init_rcu(&mian->link); + mutex_unlock(&partition->pt_irq_lock); + synchronize_rcu(); +} + +bool mshv_notify_acked_gsi(struct mshv_partition *partition, int gsi) +{ + struct mshv_irq_ack_notifier *mian; + bool acked = false; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mian, &partition->irq_ack_notifier_list, + link) { + if (mian->irq_ack_gsi == gsi) { + mian->irq_acked(mian); + acked = true; + } + } + rcu_read_unlock(); + + return acked; +} + +#if IS_ENABLED(CONFIG_ARM64) +static inline bool hv_should_clear_interrupt(enum hv_interrupt_type type) +{ + return false; +} +#elif IS_ENABLED(CONFIG_X86_64) +static inline bool hv_should_clear_interrupt(enum hv_interrupt_type type) +{ + return type == HV_X64_INTERRUPT_TYPE_EXTINT; +} +#endif + +static void mshv_irqfd_resampler_ack(struct mshv_irq_ack_notifier *mian) +{ + struct mshv_irqfd_resampler *resampler; + struct mshv_partition *partition; + struct mshv_irqfd *irqfd; + int idx; + + resampler = container_of(mian, struct mshv_irqfd_resampler, + rsmplr_notifier); + partition = resampler->rsmplr_partn; + + idx = srcu_read_lock(&partition->pt_irq_srcu); + + hlist_for_each_entry_rcu(irqfd, &resampler->rsmplr_irqfd_list, + irqfd_resampler_hnode) { + if (hv_should_clear_interrupt(irqfd->irqfd_lapic_irq.lapic_control.interrupt_type)) + hv_call_clear_virtual_interrupt(partition->pt_id); + + eventfd_signal(irqfd->irqfd_resamplefd); + } + + srcu_read_unlock(&partition->pt_irq_srcu, idx); +} + +#if IS_ENABLED(CONFIG_X86_64) +static bool +mshv_vp_irq_vector_injected(union hv_vp_register_page_interrupt_vectors iv, + u32 vector) +{ + int i; + + for (i = 0; i < iv.vector_count; i++) { + if (iv.vector[i] == vector) + return true; + } + + return false; +} + +static int mshv_vp_irq_try_set_vector(struct mshv_vp *vp, u32 vector) +{ + union hv_vp_register_page_interrupt_vectors iv, new_iv; + + iv = vp->vp_register_page->interrupt_vectors; + new_iv = iv; + + if (mshv_vp_irq_vector_injected(iv, vector)) + return 0; + + if (iv.vector_count >= HV_VP_REGISTER_PAGE_MAX_VECTOR_COUNT) + return -ENOSPC; + + new_iv.vector[new_iv.vector_count++] = vector; + + if (cmpxchg(&vp->vp_register_page->interrupt_vectors.as_uint64, + iv.as_uint64, new_iv.as_uint64) != iv.as_uint64) + return -EAGAIN; + + return 0; +} + +static int mshv_vp_irq_set_vector(struct mshv_vp *vp, u32 vector) +{ + int ret; + + do { + ret = mshv_vp_irq_try_set_vector(vp, vector); + } while (ret == -EAGAIN && !need_resched()); + + return ret; +} + +/* + * Try to raise irq for guest via shared vector array. hyp does the actual + * inject of the interrupt. 
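+ * Returns 0 on success, or a negative error such as -EOPNOTSUPP when the
+ * fast path is unavailable; the caller then falls back to
+ * mshv_assert_irq_slow().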
+ */ +static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd) +{ + struct mshv_partition *partition = irqfd->irqfd_partn; + struct mshv_lapic_irq *irq = &irqfd->irqfd_lapic_irq; + struct mshv_vp *vp; + + if (!(ms_hyperv.ext_features & + HV_VP_DISPATCH_INTERRUPT_INJECTION_AVAILABLE)) + return -EOPNOTSUPP; + + if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT) + return -EOPNOTSUPP; + +#if IS_ENABLED(CONFIG_X86) + if (irq->lapic_control.logical_dest_mode) + return -EOPNOTSUPP; +#endif + + vp = partition->pt_vp_array[irq->lapic_apic_id]; + + if (!vp->vp_register_page) + return -EOPNOTSUPP; + + if (mshv_vp_irq_set_vector(vp, irq->lapic_vector)) + return -EINVAL; + + if (vp->run.flags.root_sched_dispatched && + vp->vp_register_page->interrupt_vectors.as_uint64) + return -EBUSY; + + wake_up(&vp->run.vp_suspend_queue); + + return 0; +} +#else /* CONFIG_X86_64 */ +static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd) +{ + return -EOPNOTSUPP; +} +#endif + +static void mshv_assert_irq_slow(struct mshv_irqfd *irqfd) +{ + struct mshv_partition *partition = irqfd->irqfd_partn; + struct mshv_lapic_irq *irq = &irqfd->irqfd_lapic_irq; + unsigned int seq; + int idx; + +#if IS_ENABLED(CONFIG_X86) + WARN_ON(irqfd->irqfd_resampler && + !irq->lapic_control.level_triggered); +#endif + + idx = srcu_read_lock(&partition->pt_irq_srcu); + if (irqfd->irqfd_girq_ent.guest_irq_num) { + if (!irqfd->irqfd_girq_ent.girq_entry_valid) { + srcu_read_unlock(&partition->pt_irq_srcu, idx); + return; + } + + do { + seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc); + } while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq)); + } + + hv_call_assert_virtual_interrupt(irqfd->irqfd_partn->pt_id, + irq->lapic_vector, irq->lapic_apic_id, + irq->lapic_control); + srcu_read_unlock(&partition->pt_irq_srcu, idx); +} + +static void mshv_irqfd_resampler_shutdown(struct mshv_irqfd *irqfd) +{ + struct mshv_irqfd_resampler *rp = irqfd->irqfd_resampler; + struct mshv_partition *pt = rp->rsmplr_partn; + + mutex_lock(&pt->irqfds_resampler_lock); + + hlist_del_rcu(&irqfd->irqfd_resampler_hnode); + synchronize_srcu(&pt->pt_irq_srcu); + + if (hlist_empty(&rp->rsmplr_irqfd_list)) { + hlist_del(&rp->rsmplr_hnode); + mshv_unregister_irq_ack_notifier(pt, &rp->rsmplr_notifier); + kfree(rp); + } + + mutex_unlock(&pt->irqfds_resampler_lock); +} + +/* + * Race-free decouple logic (ordering is critical) + */ +static void mshv_irqfd_shutdown(struct work_struct *work) +{ + struct mshv_irqfd *irqfd = + container_of(work, struct mshv_irqfd, irqfd_shutdown); + + /* + * Synchronize with the wait-queue and unhook ourselves to prevent + * further events. 
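+ * mshv_irqfd_deactivate() already removed the irqfd from the partition's
+ * list before queueing this work; here the wait-queue hook, the resampler
+ * linkage and the eventfd references are released.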
+ */ + remove_wait_queue(irqfd->irqfd_wqh, &irqfd->irqfd_wait); + + if (irqfd->irqfd_resampler) { + mshv_irqfd_resampler_shutdown(irqfd); + eventfd_ctx_put(irqfd->irqfd_resamplefd); + } + + /* + * It is now safe to release the object's resources + */ + eventfd_ctx_put(irqfd->irqfd_eventfd_ctx); + kfree(irqfd); +} + +/* assumes partition->pt_irqfds_lock is held */ +static bool mshv_irqfd_is_active(struct mshv_irqfd *irqfd) +{ + return !hlist_unhashed(&irqfd->irqfd_hnode); +} + +/* + * Mark the irqfd as inactive and schedule it for removal + * + * assumes partition->pt_irqfds_lock is held + */ +static void mshv_irqfd_deactivate(struct mshv_irqfd *irqfd) +{ + if (!mshv_irqfd_is_active(irqfd)) + return; + + hlist_del(&irqfd->irqfd_hnode); + + queue_work(irqfd_cleanup_wq, &irqfd->irqfd_shutdown); +} + +/* + * Called with wqh->lock held and interrupts disabled + */ +static int mshv_irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode, + int sync, void *key) +{ + struct mshv_irqfd *irqfd = container_of(wait, struct mshv_irqfd, + irqfd_wait); + unsigned long flags = (unsigned long)key; + int idx; + unsigned int seq; + struct mshv_partition *pt = irqfd->irqfd_partn; + int ret = 0; + + if (flags & POLLIN) { + u64 cnt; + + eventfd_ctx_do_read(irqfd->irqfd_eventfd_ctx, &cnt); + idx = srcu_read_lock(&pt->pt_irq_srcu); + do { + seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc); + } while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq)); + + /* An event has been signaled, raise an interrupt */ + ret = mshv_try_assert_irq_fast(irqfd); + if (ret) + mshv_assert_irq_slow(irqfd); + + srcu_read_unlock(&pt->pt_irq_srcu, idx); + + ret = 1; + } + + if (flags & POLLHUP) { + /* The eventfd is closing, detach from the partition */ + unsigned long flags; + + spin_lock_irqsave(&pt->pt_irqfds_lock, flags); + + /* + * We must check if someone deactivated the irqfd before + * we could acquire the pt_irqfds_lock since the item is + * deactivated from the mshv side before it is unhooked from + * the wait-queue. If it is already deactivated, we can + * simply return knowing the other side will cleanup for us. + * We cannot race against the irqfd going away since the + * other side is required to acquire wqh->lock, which we hold + */ + if (mshv_irqfd_is_active(irqfd)) + mshv_irqfd_deactivate(irqfd); + + spin_unlock_irqrestore(&pt->pt_irqfds_lock, flags); + } + + return ret; +} + +/* Must be called under pt_irqfds_lock */ +static void mshv_irqfd_update(struct mshv_partition *pt, + struct mshv_irqfd *irqfd) +{ + write_seqcount_begin(&irqfd->irqfd_irqe_sc); + irqfd->irqfd_girq_ent = mshv_ret_girq_entry(pt, + irqfd->irqfd_irqnum); + mshv_copy_girq_info(&irqfd->irqfd_girq_ent, &irqfd->irqfd_lapic_irq); + write_seqcount_end(&irqfd->irqfd_irqe_sc); +} + +void mshv_irqfd_routing_update(struct mshv_partition *pt) +{ + struct mshv_irqfd *irqfd; + + spin_lock_irq(&pt->pt_irqfds_lock); + hlist_for_each_entry(irqfd, &pt->pt_irqfds_list, irqfd_hnode) + mshv_irqfd_update(pt, irqfd); + spin_unlock_irq(&pt->pt_irqfds_lock); +} + +static void mshv_irqfd_queue_proc(struct file *file, wait_queue_head_t *wqh, + poll_table *polltbl) +{ + struct mshv_irqfd *irqfd = + container_of(polltbl, struct mshv_irqfd, irqfd_polltbl); + + irqfd->irqfd_wqh = wqh; + + /* + * TODO: Ensure there isn't already an exclusive, priority waiter, e.g. + * that the irqfd isn't already bound to another partition. Only the + * first exclusive waiter encountered will be notified, and + * add_wait_queue_priority() doesn't enforce exclusivity. 
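+ * For now the wait entry is only marked exclusive and added with
+ * priority; mshv_irqfd_assign() rejects binding the same eventfd twice
+ * within a partition (-EBUSY), but cross-partition reuse is not checked.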
+ */ + irqfd->irqfd_wait.flags |= WQ_FLAG_EXCLUSIVE; + add_wait_queue_priority(wqh, &irqfd->irqfd_wait); +} + +static int mshv_irqfd_assign(struct mshv_partition *pt, + struct mshv_user_irqfd *args) +{ + struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL; + struct mshv_irqfd *irqfd, *tmp; + unsigned int events; + int ret; + int idx; + + CLASS(fd, f)(args->fd); + + irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL); + if (!irqfd) + return -ENOMEM; + + irqfd->irqfd_partn = pt; + irqfd->irqfd_irqnum = args->gsi; + INIT_WORK(&irqfd->irqfd_shutdown, mshv_irqfd_shutdown); + seqcount_spinlock_init(&irqfd->irqfd_irqe_sc, &pt->pt_irqfds_lock); + + if (fd_empty(f)) { + ret = -EBADF; + goto out; + } + + eventfd = eventfd_ctx_fileget(fd_file(f)); + if (IS_ERR(eventfd)) { + ret = PTR_ERR(eventfd); + goto fail; + } + + irqfd->irqfd_eventfd_ctx = eventfd; + + if (args->flags & BIT(MSHV_IRQFD_BIT_RESAMPLE)) { + struct mshv_irqfd_resampler *rp; + + resamplefd = eventfd_ctx_fdget(args->resamplefd); + if (IS_ERR(resamplefd)) { + ret = PTR_ERR(resamplefd); + goto fail; + } + + irqfd->irqfd_resamplefd = resamplefd; + + mutex_lock(&pt->irqfds_resampler_lock); + + hlist_for_each_entry(rp, &pt->irqfds_resampler_list, + rsmplr_hnode) { + if (rp->rsmplr_notifier.irq_ack_gsi == + irqfd->irqfd_irqnum) { + irqfd->irqfd_resampler = rp; + break; + } + } + + if (!irqfd->irqfd_resampler) { + rp = kzalloc(sizeof(*rp), GFP_KERNEL_ACCOUNT); + if (!rp) { + ret = -ENOMEM; + mutex_unlock(&pt->irqfds_resampler_lock); + goto fail; + } + + rp->rsmplr_partn = pt; + INIT_HLIST_HEAD(&rp->rsmplr_irqfd_list); + rp->rsmplr_notifier.irq_ack_gsi = irqfd->irqfd_irqnum; + rp->rsmplr_notifier.irq_acked = + mshv_irqfd_resampler_ack; + + hlist_add_head(&rp->rsmplr_hnode, + &pt->irqfds_resampler_list); + mshv_register_irq_ack_notifier(pt, + &rp->rsmplr_notifier); + irqfd->irqfd_resampler = rp; + } + + hlist_add_head_rcu(&irqfd->irqfd_resampler_hnode, + &irqfd->irqfd_resampler->rsmplr_irqfd_list); + + mutex_unlock(&pt->irqfds_resampler_lock); + } + + /* + * Install our own custom wake-up handling so we are notified via + * a callback whenever someone signals the underlying eventfd + */ + init_waitqueue_func_entry(&irqfd->irqfd_wait, mshv_irqfd_wakeup); + init_poll_funcptr(&irqfd->irqfd_polltbl, mshv_irqfd_queue_proc); + + spin_lock_irq(&pt->pt_irqfds_lock); +#if IS_ENABLED(CONFIG_X86) + if (args->flags & BIT(MSHV_IRQFD_BIT_RESAMPLE) && + !irqfd->irqfd_lapic_irq.lapic_control.level_triggered) { + /* + * Resample Fd must be for level triggered interrupt + * Otherwise return with failure + */ + spin_unlock_irq(&pt->pt_irqfds_lock); + ret = -EINVAL; + goto fail; + } +#endif + ret = 0; + hlist_for_each_entry(tmp, &pt->pt_irqfds_list, irqfd_hnode) { + if (irqfd->irqfd_eventfd_ctx != tmp->irqfd_eventfd_ctx) + continue; + /* This fd is used for another irq already. */ + ret = -EBUSY; + spin_unlock_irq(&pt->pt_irqfds_lock); + goto fail; + } + + idx = srcu_read_lock(&pt->pt_irq_srcu); + mshv_irqfd_update(pt, irqfd); + hlist_add_head(&irqfd->irqfd_hnode, &pt->pt_irqfds_list); + spin_unlock_irq(&pt->pt_irqfds_lock); + + /* + * Check if there was an event already pending on the eventfd + * before we registered, and trigger it as if we didn't miss it. 
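+ * vfs_poll() hooks irqfd_wait into the eventfd's wait queue via
+ * mshv_irqfd_queue_proc() and also returns any events that are already
+ * pending.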
+ */ + events = vfs_poll(fd_file(f), &irqfd->irqfd_polltbl); + + if (events & POLLIN) + mshv_assert_irq_slow(irqfd); + + srcu_read_unlock(&pt->pt_irq_srcu, idx); + return 0; + +fail: + if (irqfd->irqfd_resampler) + mshv_irqfd_resampler_shutdown(irqfd); + + if (resamplefd && !IS_ERR(resamplefd)) + eventfd_ctx_put(resamplefd); + + if (eventfd && !IS_ERR(eventfd)) + eventfd_ctx_put(eventfd); + +out: + kfree(irqfd); + return ret; +} + +/* + * shutdown any irqfd's that match fd+gsi + */ +static int mshv_irqfd_deassign(struct mshv_partition *pt, + struct mshv_user_irqfd *args) +{ + struct mshv_irqfd *irqfd; + struct hlist_node *n; + struct eventfd_ctx *eventfd; + + eventfd = eventfd_ctx_fdget(args->fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + hlist_for_each_entry_safe(irqfd, n, &pt->pt_irqfds_list, + irqfd_hnode) { + if (irqfd->irqfd_eventfd_ctx == eventfd && + irqfd->irqfd_irqnum == args->gsi) + + mshv_irqfd_deactivate(irqfd); + } + + eventfd_ctx_put(eventfd); + + /* + * Block until we know all outstanding shutdown jobs have completed + * so that we guarantee there will not be any more interrupts on this + * gsi once this deassign function returns. + */ + flush_workqueue(irqfd_cleanup_wq); + + return 0; +} + +int mshv_set_unset_irqfd(struct mshv_partition *pt, + struct mshv_user_irqfd *args) +{ + if (args->flags & ~MSHV_IRQFD_FLAGS_MASK) + return -EINVAL; + + if (args->flags & BIT(MSHV_IRQFD_BIT_DEASSIGN)) + return mshv_irqfd_deassign(pt, args); + + return mshv_irqfd_assign(pt, args); +} + +/* + * This function is called as the mshv VM fd is being released. + * Shutdown all irqfds that still remain open + */ +static void mshv_irqfd_release(struct mshv_partition *pt) +{ + struct mshv_irqfd *irqfd; + struct hlist_node *n; + + spin_lock_irq(&pt->pt_irqfds_lock); + + hlist_for_each_entry_safe(irqfd, n, &pt->pt_irqfds_list, irqfd_hnode) + mshv_irqfd_deactivate(irqfd); + + spin_unlock_irq(&pt->pt_irqfds_lock); + + /* + * Block until we know all outstanding shutdown jobs have completed + * since we do not take a mshv_partition* reference. + */ + flush_workqueue(irqfd_cleanup_wq); +} + +int mshv_irqfd_wq_init(void) +{ + irqfd_cleanup_wq = alloc_workqueue("mshv-irqfd-cleanup", WQ_PERCPU, 0); + if (!irqfd_cleanup_wq) + return -ENOMEM; + + return 0; +} + +void mshv_irqfd_wq_cleanup(void) +{ + destroy_workqueue(irqfd_cleanup_wq); +} + +/* + * -------------------------------------------------------------------- + * ioeventfd: translate a MMIO memory write to an eventfd signal. + * + * userspace can register a MMIO address with an eventfd for receiving + * notification when the memory has been touched. 
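+ *
+ * Each ioeventfd is backed by a hypervisor doorbell: the guest physical
+ * address (and optional data match value) is registered with
+ * mshv_register_doorbell(), and ioeventfd_mmio_write() signals the
+ * eventfd when that doorbell fires.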
+ * -------------------------------------------------------------------- + */ + +static void ioeventfd_release(struct mshv_ioeventfd *p, u64 partition_id) +{ + if (p->iovntfd_doorbell_id > 0) + mshv_unregister_doorbell(partition_id, p->iovntfd_doorbell_id); + eventfd_ctx_put(p->iovntfd_eventfd); + kfree(p); +} + +/* MMIO writes trigger an event if the addr/val match */ +static void ioeventfd_mmio_write(int doorbell_id, void *data) +{ + struct mshv_partition *partition = (struct mshv_partition *)data; + struct mshv_ioeventfd *p; + + rcu_read_lock(); + hlist_for_each_entry_rcu(p, &partition->ioeventfds_list, iovntfd_hnode) + if (p->iovntfd_doorbell_id == doorbell_id) { + eventfd_signal(p->iovntfd_eventfd); + break; + } + + rcu_read_unlock(); +} + +static bool ioeventfd_check_collision(struct mshv_partition *pt, + struct mshv_ioeventfd *p) + __must_hold(&pt->mutex) +{ + struct mshv_ioeventfd *_p; + + hlist_for_each_entry(_p, &pt->ioeventfds_list, iovntfd_hnode) + if (_p->iovntfd_addr == p->iovntfd_addr && + _p->iovntfd_length == p->iovntfd_length && + (_p->iovntfd_wildcard || p->iovntfd_wildcard || + _p->iovntfd_datamatch == p->iovntfd_datamatch)) + return true; + + return false; +} + +static int mshv_assign_ioeventfd(struct mshv_partition *pt, + struct mshv_user_ioeventfd *args) + __must_hold(&pt->mutex) +{ + struct mshv_ioeventfd *p; + struct eventfd_ctx *eventfd; + u64 doorbell_flags = 0; + int ret; + + /* This mutex is currently protecting ioeventfd.items list */ + WARN_ON_ONCE(!mutex_is_locked(&pt->pt_mutex)); + + if (args->flags & BIT(MSHV_IOEVENTFD_BIT_PIO)) + return -EOPNOTSUPP; + + /* must be natural-word sized */ + switch (args->len) { + case 0: + doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_ANY; + break; + case 1: + doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_BYTE; + break; + case 2: + doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_WORD; + break; + case 4: + doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_DWORD; + break; + case 8: + doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_QWORD; + break; + default: + return -EINVAL; + } + + /* check for range overflow */ + if (args->addr + args->len < args->addr) + return -EINVAL; + + /* check for extra flags that we don't understand */ + if (args->flags & ~MSHV_IOEVENTFD_FLAGS_MASK) + return -EINVAL; + + eventfd = eventfd_ctx_fdget(args->fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) { + ret = -ENOMEM; + goto fail; + } + + p->iovntfd_addr = args->addr; + p->iovntfd_length = args->len; + p->iovntfd_eventfd = eventfd; + + /* The datamatch feature is optional, otherwise this is a wildcard */ + if (args->flags & BIT(MSHV_IOEVENTFD_BIT_DATAMATCH)) { + p->iovntfd_datamatch = args->datamatch; + } else { + p->iovntfd_wildcard = true; + doorbell_flags |= HV_DOORBELL_FLAG_TRIGGER_ANY_VALUE; + } + + if (ioeventfd_check_collision(pt, p)) { + ret = -EEXIST; + goto unlock_fail; + } + + ret = mshv_register_doorbell(pt->pt_id, ioeventfd_mmio_write, + (void *)pt, p->iovntfd_addr, + p->iovntfd_datamatch, doorbell_flags); + if (ret < 0) + goto unlock_fail; + + p->iovntfd_doorbell_id = ret; + + hlist_add_head_rcu(&p->iovntfd_hnode, &pt->ioeventfds_list); + + return 0; + +unlock_fail: + kfree(p); + +fail: + eventfd_ctx_put(eventfd); + + return ret; +} + +static int mshv_deassign_ioeventfd(struct mshv_partition *pt, + struct mshv_user_ioeventfd *args) + __must_hold(&pt->mutex) +{ + struct mshv_ioeventfd *p; + struct eventfd_ctx *eventfd; + struct hlist_node *n; + int ret = -ENOENT; + + /* This mutex 
is currently protecting ioeventfd.items list */ + WARN_ON_ONCE(!mutex_is_locked(&pt->pt_mutex)); + + eventfd = eventfd_ctx_fdget(args->fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + hlist_for_each_entry_safe(p, n, &pt->ioeventfds_list, iovntfd_hnode) { + bool wildcard = !(args->flags & BIT(MSHV_IOEVENTFD_BIT_DATAMATCH)); + + if (p->iovntfd_eventfd != eventfd || + p->iovntfd_addr != args->addr || + p->iovntfd_length != args->len || + p->iovntfd_wildcard != wildcard) + continue; + + if (!p->iovntfd_wildcard && + p->iovntfd_datamatch != args->datamatch) + continue; + + hlist_del_rcu(&p->iovntfd_hnode); + synchronize_rcu(); + ioeventfd_release(p, pt->pt_id); + ret = 0; + break; + } + + eventfd_ctx_put(eventfd); + + return ret; +} + +int mshv_set_unset_ioeventfd(struct mshv_partition *pt, + struct mshv_user_ioeventfd *args) + __must_hold(&pt->mutex) +{ + if ((args->flags & ~MSHV_IOEVENTFD_FLAGS_MASK) || + mshv_field_nonzero(*args, rsvd)) + return -EINVAL; + + /* PIO not yet implemented */ + if (args->flags & BIT(MSHV_IOEVENTFD_BIT_PIO)) + return -EOPNOTSUPP; + + if (args->flags & BIT(MSHV_IOEVENTFD_BIT_DEASSIGN)) + return mshv_deassign_ioeventfd(pt, args); + + return mshv_assign_ioeventfd(pt, args); +} + +void mshv_eventfd_init(struct mshv_partition *pt) +{ + spin_lock_init(&pt->pt_irqfds_lock); + INIT_HLIST_HEAD(&pt->pt_irqfds_list); + + INIT_HLIST_HEAD(&pt->irqfds_resampler_list); + mutex_init(&pt->irqfds_resampler_lock); + + INIT_HLIST_HEAD(&pt->ioeventfds_list); +} + +void mshv_eventfd_release(struct mshv_partition *pt) +{ + struct hlist_head items; + struct hlist_node *n; + struct mshv_ioeventfd *p; + + hlist_move_list(&pt->ioeventfds_list, &items); + synchronize_rcu(); + + hlist_for_each_entry_safe(p, n, &items, iovntfd_hnode) { + hlist_del(&p->iovntfd_hnode); + ioeventfd_release(p, pt->pt_id); + } + + mshv_irqfd_release(pt); +} diff --git a/drivers/hv/mshv_eventfd.h b/drivers/hv/mshv_eventfd.h new file mode 100644 index 000000000000..332e7670a344 --- /dev/null +++ b/drivers/hv/mshv_eventfd.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * irqfd: Allows an fd to be used to inject an interrupt to the guest. + * ioeventfd: Allow an fd to be used to receive a signal from the guest. + * All credit goes to kvm developers. + */ + +#ifndef __LINUX_MSHV_EVENTFD_H +#define __LINUX_MSHV_EVENTFD_H + +#include <linux/poll.h> + +#include "mshv.h" +#include "mshv_root.h" + +/* struct to contain list of irqfds sharing an irq. 
Updates are protected by + * partition.irqfds.resampler_lock + */ +struct mshv_irqfd_resampler { + struct mshv_partition *rsmplr_partn; + struct hlist_head rsmplr_irqfd_list; + struct mshv_irq_ack_notifier rsmplr_notifier; + struct hlist_node rsmplr_hnode; +}; + +struct mshv_irqfd { + struct mshv_partition *irqfd_partn; + struct eventfd_ctx *irqfd_eventfd_ctx; + struct mshv_guest_irq_ent irqfd_girq_ent; + seqcount_spinlock_t irqfd_irqe_sc; + u32 irqfd_irqnum; + struct mshv_lapic_irq irqfd_lapic_irq; + struct hlist_node irqfd_hnode; + poll_table irqfd_polltbl; + wait_queue_head_t *irqfd_wqh; + wait_queue_entry_t irqfd_wait; + struct work_struct irqfd_shutdown; + struct mshv_irqfd_resampler *irqfd_resampler; + struct eventfd_ctx *irqfd_resamplefd; + struct hlist_node irqfd_resampler_hnode; +}; + +void mshv_eventfd_init(struct mshv_partition *partition); +void mshv_eventfd_release(struct mshv_partition *partition); + +void mshv_register_irq_ack_notifier(struct mshv_partition *partition, + struct mshv_irq_ack_notifier *mian); +void mshv_unregister_irq_ack_notifier(struct mshv_partition *partition, + struct mshv_irq_ack_notifier *mian); +bool mshv_notify_acked_gsi(struct mshv_partition *partition, int gsi); + +int mshv_set_unset_irqfd(struct mshv_partition *partition, + struct mshv_user_irqfd *args); + +int mshv_irqfd_wq_init(void); +void mshv_irqfd_wq_cleanup(void); + +struct mshv_ioeventfd { + struct hlist_node iovntfd_hnode; + u64 iovntfd_addr; + int iovntfd_length; + struct eventfd_ctx *iovntfd_eventfd; + u64 iovntfd_datamatch; + int iovntfd_doorbell_id; + bool iovntfd_wildcard; +}; + +int mshv_set_unset_ioeventfd(struct mshv_partition *pt, + struct mshv_user_ioeventfd *args); + +#endif /* __LINUX_MSHV_EVENTFD_H */ diff --git a/drivers/hv/mshv_irq.c b/drivers/hv/mshv_irq.c new file mode 100644 index 000000000000..798e7e1ab06e --- /dev/null +++ b/drivers/hv/mshv_irq.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2023, Microsoft Corporation. + * + * Authors: Microsoft Linux virtualization team + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <asm/mshyperv.h> + +#include "mshv_eventfd.h" +#include "mshv.h" +#include "mshv_root.h" + +/* called from the ioctl code, user wants to update the guest irq table */ +int mshv_update_routing_table(struct mshv_partition *partition, + const struct mshv_user_irq_entry *ue, + unsigned int numents) +{ + struct mshv_girq_routing_table *new = NULL, *old; + u32 i, nr_rt_entries = 0; + int r = 0; + + if (numents == 0) + goto swap_routes; + + for (i = 0; i < numents; i++) { + if (ue[i].gsi >= MSHV_MAX_GUEST_IRQS) + return -EINVAL; + + if (ue[i].address_hi) + return -EINVAL; + + nr_rt_entries = max(nr_rt_entries, ue[i].gsi); + } + nr_rt_entries += 1; + + new = kzalloc(struct_size(new, mshv_girq_info_tbl, nr_rt_entries), + GFP_KERNEL_ACCOUNT); + if (!new) + return -ENOMEM; + + new->num_rt_entries = nr_rt_entries; + for (i = 0; i < numents; i++) { + struct mshv_guest_irq_ent *girq; + + girq = &new->mshv_girq_info_tbl[ue[i].gsi]; + + /* + * Allow only one to one mapping between GSI and MSI routing. 
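+ * Duplicate GSIs in the input table are rejected with -EINVAL below.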
+ */ + if (girq->guest_irq_num != 0) { + r = -EINVAL; + goto out; + } + + girq->guest_irq_num = ue[i].gsi; + girq->girq_addr_lo = ue[i].address_lo; + girq->girq_addr_hi = ue[i].address_hi; + girq->girq_irq_data = ue[i].data; + girq->girq_entry_valid = true; + } + +swap_routes: + mutex_lock(&partition->pt_irq_lock); + old = rcu_dereference_protected(partition->pt_girq_tbl, 1); + rcu_assign_pointer(partition->pt_girq_tbl, new); + mshv_irqfd_routing_update(partition); + mutex_unlock(&partition->pt_irq_lock); + + synchronize_srcu_expedited(&partition->pt_irq_srcu); + new = old; + +out: + kfree(new); + + return r; +} + +/* vm is going away, kfree the irq routing table */ +void mshv_free_routing_table(struct mshv_partition *partition) +{ + struct mshv_girq_routing_table *rt = + rcu_access_pointer(partition->pt_girq_tbl); + + kfree(rt); +} + +struct mshv_guest_irq_ent +mshv_ret_girq_entry(struct mshv_partition *partition, u32 irqnum) +{ + struct mshv_guest_irq_ent entry = { 0 }; + struct mshv_girq_routing_table *girq_tbl; + + girq_tbl = srcu_dereference_check(partition->pt_girq_tbl, + &partition->pt_irq_srcu, + lockdep_is_held(&partition->pt_irq_lock)); + if (!girq_tbl || irqnum >= girq_tbl->num_rt_entries) { + /* + * Premature register_irqfd, setting valid_entry = 0 + * would ignore this entry anyway + */ + entry.guest_irq_num = irqnum; + return entry; + } + + return girq_tbl->mshv_girq_info_tbl[irqnum]; +} + +void mshv_copy_girq_info(struct mshv_guest_irq_ent *ent, + struct mshv_lapic_irq *lirq) +{ + memset(lirq, 0, sizeof(*lirq)); + if (!ent || !ent->girq_entry_valid) + return; + + lirq->lapic_vector = ent->girq_irq_data & 0xFF; + lirq->lapic_apic_id = (ent->girq_addr_lo >> 12) & 0xFF; + lirq->lapic_control.interrupt_type = (ent->girq_irq_data & 0x700) >> 8; +#if IS_ENABLED(CONFIG_X86) + lirq->lapic_control.level_triggered = (ent->girq_irq_data >> 15) & 0x1; + lirq->lapic_control.logical_dest_mode = (ent->girq_addr_lo >> 2) & 0x1; +#elif IS_ENABLED(CONFIG_ARM64) + lirq->lapic_control.asserted = 1; +#endif +} diff --git a/drivers/hv/mshv_portid_table.c b/drivers/hv/mshv_portid_table.c new file mode 100644 index 000000000000..c349af1f0aaa --- /dev/null +++ b/drivers/hv/mshv_portid_table.c @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/idr.h> +#include <asm/mshyperv.h> + +#include "mshv.h" +#include "mshv_root.h" + +/* + * Ports and connections are hypervisor struct used for inter-partition + * communication. Port represents the source and connection represents + * the destination. Partitions are responsible for managing the port and + * connection ids. 
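+ *
+ * An IDR keyed by port id tracks the struct port_table_info for every
+ * port allocated here; lookups are RCU protected.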
+ * + */ + +#define PORTID_MIN 1 +#define PORTID_MAX INT_MAX + +static DEFINE_IDR(port_table_idr); + +void +mshv_port_table_fini(void) +{ + struct port_table_info *port_info; + unsigned long i, tmp; + + idr_lock(&port_table_idr); + if (!idr_is_empty(&port_table_idr)) { + idr_for_each_entry_ul(&port_table_idr, port_info, tmp, i) { + port_info = idr_remove(&port_table_idr, i); + kfree_rcu(port_info, portbl_rcu); + } + } + idr_unlock(&port_table_idr); +} + +int +mshv_portid_alloc(struct port_table_info *info) +{ + int ret = 0; + + idr_lock(&port_table_idr); + ret = idr_alloc(&port_table_idr, info, PORTID_MIN, + PORTID_MAX, GFP_KERNEL); + idr_unlock(&port_table_idr); + + return ret; +} + +void +mshv_portid_free(int port_id) +{ + struct port_table_info *info; + + idr_lock(&port_table_idr); + info = idr_remove(&port_table_idr, port_id); + WARN_ON(!info); + idr_unlock(&port_table_idr); + + synchronize_rcu(); + kfree(info); +} + +int +mshv_portid_lookup(int port_id, struct port_table_info *info) +{ + struct port_table_info *_info; + int ret = -ENOENT; + + rcu_read_lock(); + _info = idr_find(&port_table_idr, port_id); + rcu_read_unlock(); + + if (_info) { + *info = *_info; + ret = 0; + } + + return ret; +} diff --git a/drivers/hv/mshv_regions.c b/drivers/hv/mshv_regions.c new file mode 100644 index 000000000000..202b9d551e39 --- /dev/null +++ b/drivers/hv/mshv_regions.c @@ -0,0 +1,555 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2025, Microsoft Corporation. + * + * Memory region management for mshv_root module. + * + * Authors: Microsoft Linux virtualization team + */ + +#include <linux/hmm.h> +#include <linux/hyperv.h> +#include <linux/kref.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> + +#include <asm/mshyperv.h> + +#include "mshv_root.h" + +#define MSHV_MAP_FAULT_IN_PAGES PTRS_PER_PMD + +/** + * mshv_region_process_chunk - Processes a contiguous chunk of memory pages + * in a region. + * @region : Pointer to the memory region structure. + * @flags : Flags to pass to the handler. + * @page_offset: Offset into the region's pages array to start processing. + * @page_count : Number of pages to process. + * @handler : Callback function to handle the chunk. + * + * This function scans the region's pages starting from @page_offset, + * checking for contiguous present pages of the same size (normal or huge). + * It invokes @handler for the chunk of contiguous pages found. Returns the + * number of pages handled, or a negative error code if the first page is + * not present or the handler fails. + * + * Note: The @handler callback must be able to handle both normal and huge + * pages. + * + * Return: Number of pages handled, or negative error code. 
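+ * For example, on x86 a present 2M folio at @page_offset is handed to
+ * @handler as a single chunk of 512 contiguous 4K page structs.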
+ */ +static long mshv_region_process_chunk(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, u64 page_count, + int (*handler)(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, + u64 page_count)) +{ + u64 count, stride; + unsigned int page_order; + struct page *page; + int ret; + + page = region->pages[page_offset]; + if (!page) + return -EINVAL; + + page_order = folio_order(page_folio(page)); + /* The hypervisor only supports 4K and 2M page sizes */ + if (page_order && page_order != HPAGE_PMD_ORDER) + return -EINVAL; + + stride = 1 << page_order; + + /* Start at stride since the first page is validated */ + for (count = stride; count < page_count; count += stride) { + page = region->pages[page_offset + count]; + + /* Break if current page is not present */ + if (!page) + break; + + /* Break if page size changes */ + if (page_order != folio_order(page_folio(page))) + break; + } + + ret = handler(region, flags, page_offset, count); + if (ret) + return ret; + + return count; +} + +/** + * mshv_region_process_range - Processes a range of memory pages in a + * region. + * @region : Pointer to the memory region structure. + * @flags : Flags to pass to the handler. + * @page_offset: Offset into the region's pages array to start processing. + * @page_count : Number of pages to process. + * @handler : Callback function to handle each chunk of contiguous + * pages. + * + * Iterates over the specified range of pages in @region, skipping + * non-present pages. For each contiguous chunk of present pages, invokes + * @handler via mshv_region_process_chunk. + * + * Note: The @handler callback must be able to handle both normal and huge + * pages. + * + * Returns 0 on success, or a negative error code on failure. + */ +static int mshv_region_process_range(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, u64 page_count, + int (*handler)(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, + u64 page_count)) +{ + long ret; + + if (page_offset + page_count > region->nr_pages) + return -EINVAL; + + while (page_count) { + /* Skip non-present pages */ + if (!region->pages[page_offset]) { + page_offset++; + page_count--; + continue; + } + + ret = mshv_region_process_chunk(region, flags, + page_offset, + page_count, + handler); + if (ret < 0) + return ret; + + page_offset += ret; + page_count -= ret; + } + + return 0; +} + +struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages, + u64 uaddr, u32 flags) +{ + struct mshv_mem_region *region; + + region = vzalloc(sizeof(*region) + sizeof(struct page *) * nr_pages); + if (!region) + return ERR_PTR(-ENOMEM); + + region->nr_pages = nr_pages; + region->start_gfn = guest_pfn; + region->start_uaddr = uaddr; + region->hv_map_flags = HV_MAP_GPA_READABLE | HV_MAP_GPA_ADJUSTABLE; + if (flags & BIT(MSHV_SET_MEM_BIT_WRITABLE)) + region->hv_map_flags |= HV_MAP_GPA_WRITABLE; + if (flags & BIT(MSHV_SET_MEM_BIT_EXECUTABLE)) + region->hv_map_flags |= HV_MAP_GPA_EXECUTABLE; + + kref_init(®ion->refcount); + + return region; +} + +static int mshv_region_chunk_share(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, u64 page_count) +{ + struct page *page = region->pages[page_offset]; + + if (PageHuge(page) || PageTransCompound(page)) + flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE; + + return hv_call_modify_spa_host_access(region->partition->pt_id, + region->pages + page_offset, + page_count, + HV_MAP_GPA_READABLE | + HV_MAP_GPA_WRITABLE, + flags, true); +} + +int mshv_region_share(struct 
mshv_mem_region *region) +{ + u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_SHARED; + + return mshv_region_process_range(region, flags, + 0, region->nr_pages, + mshv_region_chunk_share); +} + +static int mshv_region_chunk_unshare(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, u64 page_count) +{ + struct page *page = region->pages[page_offset]; + + if (PageHuge(page) || PageTransCompound(page)) + flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE; + + return hv_call_modify_spa_host_access(region->partition->pt_id, + region->pages + page_offset, + page_count, 0, + flags, false); +} + +int mshv_region_unshare(struct mshv_mem_region *region) +{ + u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE; + + return mshv_region_process_range(region, flags, + 0, region->nr_pages, + mshv_region_chunk_unshare); +} + +static int mshv_region_chunk_remap(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, u64 page_count) +{ + struct page *page = region->pages[page_offset]; + + if (PageHuge(page) || PageTransCompound(page)) + flags |= HV_MAP_GPA_LARGE_PAGE; + + return hv_call_map_gpa_pages(region->partition->pt_id, + region->start_gfn + page_offset, + page_count, flags, + region->pages + page_offset); +} + +static int mshv_region_remap_pages(struct mshv_mem_region *region, + u32 map_flags, + u64 page_offset, u64 page_count) +{ + return mshv_region_process_range(region, map_flags, + page_offset, page_count, + mshv_region_chunk_remap); +} + +int mshv_region_map(struct mshv_mem_region *region) +{ + u32 map_flags = region->hv_map_flags; + + return mshv_region_remap_pages(region, map_flags, + 0, region->nr_pages); +} + +static void mshv_region_invalidate_pages(struct mshv_mem_region *region, + u64 page_offset, u64 page_count) +{ + if (region->type == MSHV_REGION_TYPE_MEM_PINNED) + unpin_user_pages(region->pages + page_offset, page_count); + + memset(region->pages + page_offset, 0, + page_count * sizeof(struct page *)); +} + +void mshv_region_invalidate(struct mshv_mem_region *region) +{ + mshv_region_invalidate_pages(region, 0, region->nr_pages); +} + +int mshv_region_pin(struct mshv_mem_region *region) +{ + u64 done_count, nr_pages; + struct page **pages; + __u64 userspace_addr; + int ret; + + for (done_count = 0; done_count < region->nr_pages; done_count += ret) { + pages = region->pages + done_count; + userspace_addr = region->start_uaddr + + done_count * HV_HYP_PAGE_SIZE; + nr_pages = min(region->nr_pages - done_count, + MSHV_PIN_PAGES_BATCH_SIZE); + + /* + * Pinning assuming 4k pages works for large pages too. + * All page structs within the large page are returned. + * + * Pin requests are batched because pin_user_pages_fast + * with the FOLL_LONGTERM flag does a large temporary + * allocation of contiguous memory. 
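+	 * MSHV_PIN_PAGES_BATCH_SIZE works out to 64K page structs,
+	 * i.e. at most 256 MiB is pinned per iteration.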
+ */ + ret = pin_user_pages_fast(userspace_addr, nr_pages, + FOLL_WRITE | FOLL_LONGTERM, + pages); + if (ret < 0) + goto release_pages; + } + + return 0; + +release_pages: + mshv_region_invalidate_pages(region, 0, done_count); + return ret; +} + +static int mshv_region_chunk_unmap(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, u64 page_count) +{ + struct page *page = region->pages[page_offset]; + + if (PageHuge(page) || PageTransCompound(page)) + flags |= HV_UNMAP_GPA_LARGE_PAGE; + + return hv_call_unmap_gpa_pages(region->partition->pt_id, + region->start_gfn + page_offset, + page_count, flags); +} + +static int mshv_region_unmap(struct mshv_mem_region *region) +{ + return mshv_region_process_range(region, 0, + 0, region->nr_pages, + mshv_region_chunk_unmap); +} + +static void mshv_region_destroy(struct kref *ref) +{ + struct mshv_mem_region *region = + container_of(ref, struct mshv_mem_region, refcount); + struct mshv_partition *partition = region->partition; + int ret; + + if (region->type == MSHV_REGION_TYPE_MEM_MOVABLE) + mshv_region_movable_fini(region); + + if (mshv_partition_encrypted(partition)) { + ret = mshv_region_share(region); + if (ret) { + pt_err(partition, + "Failed to regain access to memory, unpinning user pages will fail and crash the host error: %d\n", + ret); + return; + } + } + + mshv_region_unmap(region); + + mshv_region_invalidate(region); + + vfree(region); +} + +void mshv_region_put(struct mshv_mem_region *region) +{ + kref_put(®ion->refcount, mshv_region_destroy); +} + +int mshv_region_get(struct mshv_mem_region *region) +{ + return kref_get_unless_zero(®ion->refcount); +} + +/** + * mshv_region_hmm_fault_and_lock - Handle HMM faults and lock the memory region + * @region: Pointer to the memory region structure + * @range: Pointer to the HMM range structure + * + * This function performs the following steps: + * 1. Reads the notifier sequence for the HMM range. + * 2. Acquires a read lock on the memory map. + * 3. Handles HMM faults for the specified range. + * 4. Releases the read lock on the memory map. + * 5. If successful, locks the memory region mutex. + * 6. Verifies if the notifier sequence has changed during the operation. + * If it has, releases the mutex and returns -EBUSY to match with + * hmm_range_fault() return code for repeating. + * + * Return: 0 on success, a negative error code otherwise. + */ +static int mshv_region_hmm_fault_and_lock(struct mshv_mem_region *region, + struct hmm_range *range) +{ + int ret; + + range->notifier_seq = mmu_interval_read_begin(range->notifier); + mmap_read_lock(region->mni.mm); + ret = hmm_range_fault(range); + mmap_read_unlock(region->mni.mm); + if (ret) + return ret; + + mutex_lock(®ion->mutex); + + if (mmu_interval_read_retry(range->notifier, range->notifier_seq)) { + mutex_unlock(®ion->mutex); + cond_resched(); + return -EBUSY; + } + + return 0; +} + +/** + * mshv_region_range_fault - Handle memory range faults for a given region. + * @region: Pointer to the memory region structure. + * @page_offset: Offset of the page within the region. + * @page_count: Number of pages to handle. + * + * This function resolves memory faults for a specified range of pages + * within a memory region. It uses HMM (Heterogeneous Memory Management) + * to fault in the required pages and updates the region's page array. + * + * Return: 0 on success, negative error code on failure. 
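+ * The fault-and-lock step is retried for as long as it returns -EBUSY,
+ * i.e. whenever an mmu notifier invalidation raced with hmm_range_fault().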
+ */ +static int mshv_region_range_fault(struct mshv_mem_region *region, + u64 page_offset, u64 page_count) +{ + struct hmm_range range = { + .notifier = ®ion->mni, + .default_flags = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE, + }; + unsigned long *pfns; + int ret; + u64 i; + + pfns = kmalloc_array(page_count, sizeof(*pfns), GFP_KERNEL); + if (!pfns) + return -ENOMEM; + + range.hmm_pfns = pfns; + range.start = region->start_uaddr + page_offset * HV_HYP_PAGE_SIZE; + range.end = range.start + page_count * HV_HYP_PAGE_SIZE; + + do { + ret = mshv_region_hmm_fault_and_lock(region, &range); + } while (ret == -EBUSY); + + if (ret) + goto out; + + for (i = 0; i < page_count; i++) + region->pages[page_offset + i] = hmm_pfn_to_page(pfns[i]); + + ret = mshv_region_remap_pages(region, region->hv_map_flags, + page_offset, page_count); + + mutex_unlock(®ion->mutex); +out: + kfree(pfns); + return ret; +} + +bool mshv_region_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn) +{ + u64 page_offset, page_count; + int ret; + + /* Align the page offset to the nearest MSHV_MAP_FAULT_IN_PAGES. */ + page_offset = ALIGN_DOWN(gfn - region->start_gfn, + MSHV_MAP_FAULT_IN_PAGES); + + /* Map more pages than requested to reduce the number of faults. */ + page_count = min(region->nr_pages - page_offset, + MSHV_MAP_FAULT_IN_PAGES); + + ret = mshv_region_range_fault(region, page_offset, page_count); + + WARN_ONCE(ret, + "p%llu: GPA intercept failed: region %#llx-%#llx, gfn %#llx, page_offset %llu, page_count %llu\n", + region->partition->pt_id, region->start_uaddr, + region->start_uaddr + (region->nr_pages << HV_HYP_PAGE_SHIFT), + gfn, page_offset, page_count); + + return !ret; +} + +/** + * mshv_region_interval_invalidate - Invalidate a range of memory region + * @mni: Pointer to the mmu_interval_notifier structure + * @range: Pointer to the mmu_notifier_range structure + * @cur_seq: Current sequence number for the interval notifier + * + * This function invalidates a memory region by remapping its pages with + * no access permissions. It locks the region's mutex to ensure thread safety + * and updates the sequence number for the interval notifier. If the range + * is blockable, it uses a blocking lock; otherwise, it attempts a non-blocking + * lock and returns false if unsuccessful. + * + * NOTE: Failure to invalidate a region is a serious error, as the pages will + * be considered freed while they are still mapped by the hypervisor. + * Any attempt to access such pages will likely crash the system. + * + * Return: true if the region was successfully invalidated, false otherwise. 
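+ * The range is remapped with HV_MAP_GPA_NO_ACCESS rather than unmapped,
+ * so a later GPA fault can repopulate it via mshv_region_handle_gfn_fault().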
+ */ +static bool mshv_region_interval_invalidate(struct mmu_interval_notifier *mni, + const struct mmu_notifier_range *range, + unsigned long cur_seq) +{ + struct mshv_mem_region *region = container_of(mni, + struct mshv_mem_region, + mni); + u64 page_offset, page_count; + unsigned long mstart, mend; + int ret = -EPERM; + + if (mmu_notifier_range_blockable(range)) + mutex_lock(®ion->mutex); + else if (!mutex_trylock(®ion->mutex)) + goto out_fail; + + mmu_interval_set_seq(mni, cur_seq); + + mstart = max(range->start, region->start_uaddr); + mend = min(range->end, region->start_uaddr + + (region->nr_pages << HV_HYP_PAGE_SHIFT)); + + page_offset = HVPFN_DOWN(mstart - region->start_uaddr); + page_count = HVPFN_DOWN(mend - mstart); + + ret = mshv_region_remap_pages(region, HV_MAP_GPA_NO_ACCESS, + page_offset, page_count); + if (ret) + goto out_fail; + + mshv_region_invalidate_pages(region, page_offset, page_count); + + mutex_unlock(®ion->mutex); + + return true; + +out_fail: + WARN_ONCE(ret, + "Failed to invalidate region %#llx-%#llx (range %#lx-%#lx, event: %u, pages %#llx-%#llx, mm: %#llx): %d\n", + region->start_uaddr, + region->start_uaddr + (region->nr_pages << HV_HYP_PAGE_SHIFT), + range->start, range->end, range->event, + page_offset, page_offset + page_count - 1, (u64)range->mm, ret); + return false; +} + +static const struct mmu_interval_notifier_ops mshv_region_mni_ops = { + .invalidate = mshv_region_interval_invalidate, +}; + +void mshv_region_movable_fini(struct mshv_mem_region *region) +{ + mmu_interval_notifier_remove(®ion->mni); +} + +bool mshv_region_movable_init(struct mshv_mem_region *region) +{ + int ret; + + ret = mmu_interval_notifier_insert(®ion->mni, current->mm, + region->start_uaddr, + region->nr_pages << HV_HYP_PAGE_SHIFT, + &mshv_region_mni_ops); + if (ret) + return false; + + mutex_init(®ion->mutex); + + return true; +} diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h new file mode 100644 index 000000000000..3c1d88b36741 --- /dev/null +++ b/drivers/hv/mshv_root.h @@ -0,0 +1,336 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2023, Microsoft Corporation. + */ + +#ifndef _MSHV_ROOT_H_ +#define _MSHV_ROOT_H_ + +#include <linux/spinlock.h> +#include <linux/mutex.h> +#include <linux/semaphore.h> +#include <linux/sched.h> +#include <linux/srcu.h> +#include <linux/wait.h> +#include <linux/hashtable.h> +#include <linux/dev_printk.h> +#include <linux/build_bug.h> +#include <linux/mmu_notifier.h> +#include <uapi/linux/mshv.h> + +/* + * Hypervisor must be between these version numbers (inclusive) + * to guarantee compatibility + */ +#define MSHV_HV_MIN_VERSION (27744) +#define MSHV_HV_MAX_VERSION (27751) + +static_assert(HV_HYP_PAGE_SIZE == MSHV_HV_PAGE_SIZE); + +#define MSHV_MAX_VPS 256 + +#define MSHV_PARTITIONS_HASH_BITS 9 + +#define MSHV_PIN_PAGES_BATCH_SIZE (0x10000000ULL / HV_HYP_PAGE_SIZE) + +struct mshv_vp { + u32 vp_index; + struct mshv_partition *vp_partition; + struct mutex vp_mutex; + struct hv_vp_register_page *vp_register_page; + struct hv_message *vp_intercept_msg_page; + void *vp_ghcb_page; + struct hv_stats_page *vp_stats_pages[2]; + struct { + atomic64_t vp_signaled_count; + struct { + u64 intercept_suspend: 1; + u64 root_sched_blocked: 1; /* root scheduler only */ + u64 root_sched_dispatched: 1; /* root scheduler only */ + u64 reserved: 61; + } flags; + unsigned int kicked_by_hv; + wait_queue_head_t vp_suspend_queue; + } run; +}; + +#define vp_fmt(fmt) "p%lluvp%u: " fmt +#define vp_devprintk(level, v, fmt, ...) 
\ +do { \ + const struct mshv_vp *__vp = (v); \ + const struct mshv_partition *__pt = __vp->vp_partition; \ + dev_##level(__pt->pt_module_dev, vp_fmt(fmt), __pt->pt_id, \ + __vp->vp_index, ##__VA_ARGS__); \ +} while (0) +#define vp_emerg(v, fmt, ...) vp_devprintk(emerg, v, fmt, ##__VA_ARGS__) +#define vp_crit(v, fmt, ...) vp_devprintk(crit, v, fmt, ##__VA_ARGS__) +#define vp_alert(v, fmt, ...) vp_devprintk(alert, v, fmt, ##__VA_ARGS__) +#define vp_err(v, fmt, ...) vp_devprintk(err, v, fmt, ##__VA_ARGS__) +#define vp_warn(v, fmt, ...) vp_devprintk(warn, v, fmt, ##__VA_ARGS__) +#define vp_notice(v, fmt, ...) vp_devprintk(notice, v, fmt, ##__VA_ARGS__) +#define vp_info(v, fmt, ...) vp_devprintk(info, v, fmt, ##__VA_ARGS__) +#define vp_dbg(v, fmt, ...) vp_devprintk(dbg, v, fmt, ##__VA_ARGS__) + +enum mshv_region_type { + MSHV_REGION_TYPE_MEM_PINNED, + MSHV_REGION_TYPE_MEM_MOVABLE, + MSHV_REGION_TYPE_MMIO +}; + +struct mshv_mem_region { + struct hlist_node hnode; + struct kref refcount; + u64 nr_pages; + u64 start_gfn; + u64 start_uaddr; + u32 hv_map_flags; + struct mshv_partition *partition; + enum mshv_region_type type; + struct mmu_interval_notifier mni; + struct mutex mutex; /* protects region pages remapping */ + struct page *pages[]; +}; + +struct mshv_irq_ack_notifier { + struct hlist_node link; + unsigned int irq_ack_gsi; + void (*irq_acked)(struct mshv_irq_ack_notifier *mian); +}; + +struct mshv_partition { + struct device *pt_module_dev; + + struct hlist_node pt_hnode; + u64 pt_id; + refcount_t pt_ref_count; + struct mutex pt_mutex; + + spinlock_t pt_mem_regions_lock; + struct hlist_head pt_mem_regions; // not ordered + + u32 pt_vp_count; + struct mshv_vp *pt_vp_array[MSHV_MAX_VPS]; + + struct mutex pt_irq_lock; + struct srcu_struct pt_irq_srcu; + struct hlist_head irq_ack_notifier_list; + + struct hlist_head pt_devices; + + /* + * MSHV does not support more than one async hypercall in flight + * for a single partition. Thus, it is okay to define per partition + * async hypercall status. + */ + struct completion async_hypercall; + u64 async_hypercall_status; + + spinlock_t pt_irqfds_lock; + struct hlist_head pt_irqfds_list; + struct mutex irqfds_resampler_lock; + struct hlist_head irqfds_resampler_list; + + struct hlist_head ioeventfds_list; + + struct mshv_girq_routing_table __rcu *pt_girq_tbl; + u64 isolation_type; + bool import_completed; + bool pt_initialized; +}; + +#define pt_fmt(fmt) "p%llu: " fmt +#define pt_devprintk(level, p, fmt, ...) \ +do { \ + const struct mshv_partition *__pt = (p); \ + dev_##level(__pt->pt_module_dev, pt_fmt(fmt), __pt->pt_id, \ + ##__VA_ARGS__); \ +} while (0) +#define pt_emerg(p, fmt, ...) pt_devprintk(emerg, p, fmt, ##__VA_ARGS__) +#define pt_crit(p, fmt, ...) pt_devprintk(crit, p, fmt, ##__VA_ARGS__) +#define pt_alert(p, fmt, ...) pt_devprintk(alert, p, fmt, ##__VA_ARGS__) +#define pt_err(p, fmt, ...) pt_devprintk(err, p, fmt, ##__VA_ARGS__) +#define pt_warn(p, fmt, ...) pt_devprintk(warn, p, fmt, ##__VA_ARGS__) +#define pt_notice(p, fmt, ...) pt_devprintk(notice, p, fmt, ##__VA_ARGS__) +#define pt_info(p, fmt, ...) pt_devprintk(info, p, fmt, ##__VA_ARGS__) +#define pt_dbg(p, fmt, ...) 
pt_devprintk(dbg, p, fmt, ##__VA_ARGS__) + +struct mshv_lapic_irq { + u32 lapic_vector; + u64 lapic_apic_id; + union hv_interrupt_control lapic_control; +}; + +#define MSHV_MAX_GUEST_IRQS 4096 + +/* representation of one guest irq entry, either msi or legacy */ +struct mshv_guest_irq_ent { + u32 girq_entry_valid; /* vfio looks at this */ + u32 guest_irq_num; /* a unique number for each irq */ + u32 girq_addr_lo; /* guest irq msi address info */ + u32 girq_addr_hi; + u32 girq_irq_data; /* idt vector in some cases */ +}; + +struct mshv_girq_routing_table { + u32 num_rt_entries; + struct mshv_guest_irq_ent mshv_girq_info_tbl[]; +}; + +struct hv_synic_pages { + struct hv_message_page *hyp_synic_message_page; + struct hv_synic_event_flags_page *synic_event_flags_page; + struct hv_synic_event_ring_page *synic_event_ring_page; +}; + +struct mshv_root { + struct hv_synic_pages __percpu *synic_pages; + spinlock_t pt_ht_lock; + DECLARE_HASHTABLE(pt_htable, MSHV_PARTITIONS_HASH_BITS); + struct hv_partition_property_vmm_capabilities vmm_caps; +}; + +/* + * Callback for doorbell events. + * NOTE: This is called in interrupt context. Callback + * should defer slow and sleeping logic to later. + */ +typedef void (*doorbell_cb_t) (int doorbell_id, void *); + +/* + * port table information + */ +struct port_table_info { + struct rcu_head portbl_rcu; + enum hv_port_type hv_port_type; + union { + struct { + u64 reserved[2]; + } hv_port_message; + struct { + u64 reserved[2]; + } hv_port_event; + struct { + u64 reserved[2]; + } hv_port_monitor; + struct { + doorbell_cb_t doorbell_cb; + void *data; + } hv_port_doorbell; + }; +}; + +int mshv_update_routing_table(struct mshv_partition *partition, + const struct mshv_user_irq_entry *entries, + unsigned int numents); +void mshv_free_routing_table(struct mshv_partition *partition); + +struct mshv_guest_irq_ent mshv_ret_girq_entry(struct mshv_partition *partition, + u32 irq_num); + +void mshv_copy_girq_info(struct mshv_guest_irq_ent *src_irq, + struct mshv_lapic_irq *dest_irq); + +void mshv_irqfd_routing_update(struct mshv_partition *partition); + +void mshv_port_table_fini(void); +int mshv_portid_alloc(struct port_table_info *info); +int mshv_portid_lookup(int port_id, struct port_table_info *info); +void mshv_portid_free(int port_id); + +int mshv_register_doorbell(u64 partition_id, doorbell_cb_t doorbell_cb, + void *data, u64 gpa, u64 val, u64 flags); +void mshv_unregister_doorbell(u64 partition_id, int doorbell_portid); + +void mshv_isr(void); +int mshv_synic_init(unsigned int cpu); +int mshv_synic_cleanup(unsigned int cpu); + +static inline bool mshv_partition_encrypted(struct mshv_partition *partition) +{ + return partition->isolation_type == HV_PARTITION_ISOLATION_TYPE_SNP; +} + +struct mshv_partition *mshv_partition_get(struct mshv_partition *partition); +void mshv_partition_put(struct mshv_partition *partition); +struct mshv_partition *mshv_partition_find(u64 partition_id) __must_hold(RCU); + +/* hypercalls */ + +int hv_call_withdraw_memory(u64 count, int node, u64 partition_id); +int hv_call_create_partition(u64 flags, + struct hv_partition_creation_properties creation_properties, + union hv_partition_isolation_properties isolation_properties, + u64 *partition_id); +int hv_call_initialize_partition(u64 partition_id); +int hv_call_finalize_partition(u64 partition_id); +int hv_call_delete_partition(u64 partition_id); +int hv_call_map_mmio_pages(u64 partition_id, u64 gfn, u64 mmio_spa, u64 numpgs); +int hv_call_map_gpa_pages(u64 partition_id, u64 gpa_target, 
u64 page_count, + u32 flags, struct page **pages); +int hv_call_unmap_gpa_pages(u64 partition_id, u64 gpa_target, u64 page_count, + u32 flags); +int hv_call_delete_vp(u64 partition_id, u32 vp_index); +int hv_call_assert_virtual_interrupt(u64 partition_id, u32 vector, + u64 dest_addr, + union hv_interrupt_control control); +int hv_call_clear_virtual_interrupt(u64 partition_id); +int hv_call_get_gpa_access_states(u64 partition_id, u32 count, u64 gpa_base_pfn, + union hv_gpa_page_access_state_flags state_flags, + int *written_total, + union hv_gpa_page_access_state *states); +int hv_call_get_vp_state(u32 vp_index, u64 partition_id, + struct hv_vp_state_data state_data, + /* Choose between pages and ret_output */ + u64 page_count, struct page **pages, + union hv_output_get_vp_state *ret_output); +int hv_call_set_vp_state(u32 vp_index, u64 partition_id, + /* Choose between pages and bytes */ + struct hv_vp_state_data state_data, u64 page_count, + struct page **pages, u32 num_bytes, u8 *bytes); +int hv_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type, + union hv_input_vtl input_vtl, + struct page **state_page); +int hv_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type, + struct page *state_page, + union hv_input_vtl input_vtl); +int hv_call_create_port(u64 port_partition_id, union hv_port_id port_id, + u64 connection_partition_id, struct hv_port_info *port_info, + u8 port_vtl, u8 min_connection_vtl, int node); +int hv_call_delete_port(u64 port_partition_id, union hv_port_id port_id); +int hv_call_connect_port(u64 port_partition_id, union hv_port_id port_id, + u64 connection_partition_id, + union hv_connection_id connection_id, + struct hv_connection_info *connection_info, + u8 connection_vtl, int node); +int hv_call_disconnect_port(u64 connection_partition_id, + union hv_connection_id connection_id); +int hv_call_notify_port_ring_empty(u32 sint_index); +int hv_map_stats_page(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity, + void **addr); +int hv_unmap_stats_page(enum hv_stats_object_type type, void *page_addr, + const union hv_stats_object_identity *identity); +int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages, + u64 page_struct_count, u32 host_access, + u32 flags, u8 acquire); +int hv_call_get_partition_property_ex(u64 partition_id, u64 property_code, u64 arg, + void *property_value, size_t property_value_sz); + +extern struct mshv_root mshv_root; +extern enum hv_scheduler_type hv_scheduler_type; +extern u8 * __percpu *hv_synic_eventring_tail; + +struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages, + u64 uaddr, u32 flags); +int mshv_region_share(struct mshv_mem_region *region); +int mshv_region_unshare(struct mshv_mem_region *region); +int mshv_region_map(struct mshv_mem_region *region); +void mshv_region_invalidate(struct mshv_mem_region *region); +int mshv_region_pin(struct mshv_mem_region *region); +void mshv_region_put(struct mshv_mem_region *region); +int mshv_region_get(struct mshv_mem_region *region); +bool mshv_region_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn); +void mshv_region_movable_fini(struct mshv_mem_region *region); +bool mshv_region_movable_init(struct mshv_mem_region *region); + +#endif /* _MSHV_ROOT_H_ */ diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c new file mode 100644 index 000000000000..598eaff4ff29 --- /dev/null +++ b/drivers/hv/mshv_root_hv_call.c @@ -0,0 +1,1024 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 
2023, Microsoft Corporation. + * + * Hypercall helper functions used by the mshv_root module. + * + * Authors: Microsoft Linux virtualization team + */ + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/export.h> +#include <asm/mshyperv.h> + +#include "mshv_root.h" + +/* Determined empirically */ +#define HV_INIT_PARTITION_DEPOSIT_PAGES 208 +#define HV_MAP_GPA_DEPOSIT_PAGES 256 +#define HV_UMAP_GPA_PAGES 512 + +#define HV_PAGE_COUNT_2M_ALIGNED(pg_count) (!((pg_count) & (0x200 - 1))) + +#define HV_WITHDRAW_BATCH_SIZE (HV_HYP_PAGE_SIZE / sizeof(u64)) +#define HV_MAP_GPA_BATCH_SIZE \ + ((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_map_gpa_pages)) \ + / sizeof(u64)) +#define HV_GET_VP_STATE_BATCH_SIZE \ + ((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_get_vp_state)) \ + / sizeof(u64)) +#define HV_SET_VP_STATE_BATCH_SIZE \ + ((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_set_vp_state)) \ + / sizeof(u64)) +#define HV_GET_GPA_ACCESS_STATES_BATCH_SIZE \ + ((HV_HYP_PAGE_SIZE - sizeof(union hv_gpa_page_access_state)) \ + / sizeof(union hv_gpa_page_access_state)) +#define HV_MODIFY_SPARSE_SPA_PAGE_HOST_ACCESS_MAX_PAGE_COUNT \ + ((HV_HYP_PAGE_SIZE - \ + sizeof(struct hv_input_modify_sparse_spa_page_host_access)) / \ + sizeof(u64)) + +int hv_call_withdraw_memory(u64 count, int node, u64 partition_id) +{ + struct hv_input_withdraw_memory *input_page; + struct hv_output_withdraw_memory *output_page; + struct page *page; + u16 completed; + unsigned long remaining = count; + u64 status; + int i; + unsigned long flags; + + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + output_page = page_address(page); + + while (remaining) { + local_irq_save(flags); + + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); + + memset(input_page, 0, sizeof(*input_page)); + input_page->partition_id = partition_id; + status = hv_do_rep_hypercall(HVCALL_WITHDRAW_MEMORY, + min(remaining, HV_WITHDRAW_BATCH_SIZE), + 0, input_page, output_page); + + local_irq_restore(flags); + + completed = hv_repcomp(status); + + for (i = 0; i < completed; i++) + __free_page(pfn_to_page(output_page->gpa_page_list[i])); + + if (!hv_result_success(status)) { + if (hv_result(status) == HV_STATUS_NO_RESOURCES) + status = HV_STATUS_SUCCESS; + break; + } + + remaining -= completed; + } + free_page((unsigned long)output_page); + + return hv_result_to_errno(status); +} + +int hv_call_create_partition(u64 flags, + struct hv_partition_creation_properties creation_properties, + union hv_partition_isolation_properties isolation_properties, + u64 *partition_id) +{ + struct hv_input_create_partition *input; + struct hv_output_create_partition *output; + u64 status; + int ret; + unsigned long irq_flags; + + do { + local_irq_save(irq_flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + memset(input, 0, sizeof(*input)); + input->flags = flags; + input->compatibility_version = HV_COMPATIBILITY_21_H2; + + memcpy(&input->partition_creation_properties, &creation_properties, + sizeof(creation_properties)); + + memcpy(&input->isolation_properties, &isolation_properties, + sizeof(isolation_properties)); + + status = hv_do_hypercall(HVCALL_CREATE_PARTITION, + input, output); + + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + if (hv_result_success(status)) + *partition_id = output->partition_id; + local_irq_restore(irq_flags); + ret = hv_result_to_errno(status); + break; + } + local_irq_restore(irq_flags); + ret = hv_call_deposit_pages(NUMA_NO_NODE, + hv_current_partition_id, 1); 
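+		/* Loop again only while depositing more memory with the hypervisor succeeds. */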
+ } while (!ret); + + return ret; +} + +int hv_call_initialize_partition(u64 partition_id) +{ + struct hv_input_initialize_partition input; + u64 status; + int ret; + + input.partition_id = partition_id; + + ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, + HV_INIT_PARTITION_DEPOSIT_PAGES); + if (ret) + return ret; + + do { + status = hv_do_fast_hypercall8(HVCALL_INITIALIZE_PARTITION, + *(u64 *)&input); + + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + ret = hv_result_to_errno(status); + break; + } + ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, 1); + } while (!ret); + + return ret; +} + +int hv_call_finalize_partition(u64 partition_id) +{ + struct hv_input_finalize_partition input; + u64 status; + + input.partition_id = partition_id; + status = hv_do_fast_hypercall8(HVCALL_FINALIZE_PARTITION, + *(u64 *)&input); + + return hv_result_to_errno(status); +} + +int hv_call_delete_partition(u64 partition_id) +{ + struct hv_input_delete_partition input; + u64 status; + + input.partition_id = partition_id; + status = hv_do_fast_hypercall8(HVCALL_DELETE_PARTITION, *(u64 *)&input); + + return hv_result_to_errno(status); +} + +/* Ask the hypervisor to map guest ram pages or the guest mmio space */ +static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count, + u32 flags, struct page **pages, u64 mmio_spa) +{ + struct hv_input_map_gpa_pages *input_page; + u64 status, *pfnlist; + unsigned long irq_flags, large_shift = 0; + int ret = 0, done = 0; + u64 page_count = page_struct_count; + + if (page_count == 0 || (pages && mmio_spa)) + return -EINVAL; + + if (flags & HV_MAP_GPA_LARGE_PAGE) { + if (mmio_spa) + return -EINVAL; + + if (!HV_PAGE_COUNT_2M_ALIGNED(page_count)) + return -EINVAL; + + large_shift = HV_HYP_LARGE_PAGE_SHIFT - HV_HYP_PAGE_SHIFT; + page_count >>= large_shift; + } + + while (done < page_count) { + ulong i, completed, remain = page_count - done; + int rep_count = min(remain, HV_MAP_GPA_BATCH_SIZE); + + local_irq_save(irq_flags); + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); + + input_page->target_partition_id = partition_id; + input_page->target_gpa_base = gfn + (done << large_shift); + input_page->map_flags = flags; + pfnlist = input_page->source_gpa_page_list; + + for (i = 0; i < rep_count; i++) + if (flags & HV_MAP_GPA_NO_ACCESS) { + pfnlist[i] = 0; + } else if (pages) { + u64 index = (done + i) << large_shift; + + if (index >= page_struct_count) { + ret = -EINVAL; + break; + } + pfnlist[i] = page_to_pfn(pages[index]); + } else { + pfnlist[i] = mmio_spa + done + i; + } + if (ret) + break; + + status = hv_do_rep_hypercall(HVCALL_MAP_GPA_PAGES, rep_count, 0, + input_page, NULL); + local_irq_restore(irq_flags); + + completed = hv_repcomp(status); + + if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) { + ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, + HV_MAP_GPA_DEPOSIT_PAGES); + if (ret) + break; + + } else if (!hv_result_success(status)) { + ret = hv_result_to_errno(status); + break; + } + + done += completed; + } + + if (ret && done) { + u32 unmap_flags = 0; + + if (flags & HV_MAP_GPA_LARGE_PAGE) + unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE; + hv_call_unmap_gpa_pages(partition_id, gfn, done, unmap_flags); + } + + return ret; +} + +/* Ask the hypervisor to map guest ram pages */ +int hv_call_map_gpa_pages(u64 partition_id, u64 gpa_target, u64 page_count, + u32 flags, struct page **pages) +{ + return hv_do_map_gpa_hcall(partition_id, gpa_target, page_count, + flags, pages, 0); +} + +/* Ask the hypervisor to map guest 
mmio space */ +int hv_call_map_mmio_pages(u64 partition_id, u64 gfn, u64 mmio_spa, u64 numpgs) +{ + int i; + u32 flags = HV_MAP_GPA_READABLE | HV_MAP_GPA_WRITABLE | + HV_MAP_GPA_NOT_CACHED; + + for (i = 0; i < numpgs; i++) + if (page_is_ram(mmio_spa + i)) + return -EINVAL; + + return hv_do_map_gpa_hcall(partition_id, gfn, numpgs, flags, NULL, + mmio_spa); +} + +int hv_call_unmap_gpa_pages(u64 partition_id, u64 gfn, u64 page_count_4k, + u32 flags) +{ + struct hv_input_unmap_gpa_pages *input_page; + u64 status, page_count = page_count_4k; + unsigned long irq_flags, large_shift = 0; + int ret = 0, done = 0; + + if (page_count == 0) + return -EINVAL; + + if (flags & HV_UNMAP_GPA_LARGE_PAGE) { + if (!HV_PAGE_COUNT_2M_ALIGNED(page_count)) + return -EINVAL; + + large_shift = HV_HYP_LARGE_PAGE_SHIFT - HV_HYP_PAGE_SHIFT; + page_count >>= large_shift; + } + + while (done < page_count) { + ulong completed, remain = page_count - done; + int rep_count = min(remain, HV_UMAP_GPA_PAGES); + + local_irq_save(irq_flags); + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); + + input_page->target_partition_id = partition_id; + input_page->target_gpa_base = gfn + (done << large_shift); + input_page->unmap_flags = flags; + status = hv_do_rep_hypercall(HVCALL_UNMAP_GPA_PAGES, rep_count, + 0, input_page, NULL); + local_irq_restore(irq_flags); + + completed = hv_repcomp(status); + if (!hv_result_success(status)) { + ret = hv_result_to_errno(status); + break; + } + + done += completed; + } + + return ret; +} + +int hv_call_get_gpa_access_states(u64 partition_id, u32 count, u64 gpa_base_pfn, + union hv_gpa_page_access_state_flags state_flags, + int *written_total, + union hv_gpa_page_access_state *states) +{ + struct hv_input_get_gpa_pages_access_state *input_page; + union hv_gpa_page_access_state *output_page; + int completed = 0; + unsigned long remaining = count; + int rep_count, i; + u64 status = 0; + unsigned long flags; + + *written_total = 0; + while (remaining) { + local_irq_save(flags); + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); + output_page = *this_cpu_ptr(hyperv_pcpu_output_arg); + + input_page->partition_id = partition_id; + input_page->hv_gpa_page_number = gpa_base_pfn + *written_total; + input_page->flags = state_flags; + rep_count = min(remaining, HV_GET_GPA_ACCESS_STATES_BATCH_SIZE); + + status = hv_do_rep_hypercall(HVCALL_GET_GPA_PAGES_ACCESS_STATES, rep_count, + 0, input_page, output_page); + if (!hv_result_success(status)) { + local_irq_restore(flags); + break; + } + completed = hv_repcomp(status); + for (i = 0; i < completed; ++i) + states[i].as_uint8 = output_page[i].as_uint8; + + local_irq_restore(flags); + states += completed; + *written_total += completed; + remaining -= completed; + } + + return hv_result_to_errno(status); +} + +int hv_call_assert_virtual_interrupt(u64 partition_id, u32 vector, + u64 dest_addr, + union hv_interrupt_control control) +{ + struct hv_input_assert_virtual_interrupt *input; + unsigned long flags; + u64 status; + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + memset(input, 0, sizeof(*input)); + input->partition_id = partition_id; + input->vector = vector; + /* + * NOTE: dest_addr only needs to be provided while asserting an + * interrupt on x86 platform + */ +#if IS_ENABLED(CONFIG_X86) + input->dest_addr = dest_addr; +#endif + input->control = control; + status = hv_do_hypercall(HVCALL_ASSERT_VIRTUAL_INTERRUPT, input, NULL); + local_irq_restore(flags); + + return hv_result_to_errno(status); +} + +int hv_call_delete_vp(u64 
partition_id, u32 vp_index) +{ + union hv_input_delete_vp input = {}; + u64 status; + + input.partition_id = partition_id; + input.vp_index = vp_index; + + status = hv_do_fast_hypercall16(HVCALL_DELETE_VP, + input.as_uint64[0], input.as_uint64[1]); + + return hv_result_to_errno(status); +} +EXPORT_SYMBOL_GPL(hv_call_delete_vp); + +int hv_call_get_vp_state(u32 vp_index, u64 partition_id, + struct hv_vp_state_data state_data, + /* Choose between pages and ret_output */ + u64 page_count, struct page **pages, + union hv_output_get_vp_state *ret_output) +{ + struct hv_input_get_vp_state *input; + union hv_output_get_vp_state *output; + u64 status; + int i; + u64 control; + unsigned long flags; + int ret = 0; + + if (page_count > HV_GET_VP_STATE_BATCH_SIZE) + return -EINVAL; + + if (!page_count && !ret_output) + return -EINVAL; + + do { + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + memset(input, 0, sizeof(*input)); + memset(output, 0, sizeof(*output)); + + input->partition_id = partition_id; + input->vp_index = vp_index; + input->state_data = state_data; + for (i = 0; i < page_count; i++) + input->output_data_pfns[i] = page_to_pfn(pages[i]); + + control = (HVCALL_GET_VP_STATE) | + (page_count << HV_HYPERCALL_VARHEAD_OFFSET); + + status = hv_do_hypercall(control, input, output); + + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + if (hv_result_success(status) && ret_output) + memcpy(ret_output, output, sizeof(*output)); + + local_irq_restore(flags); + ret = hv_result_to_errno(status); + break; + } + local_irq_restore(flags); + + ret = hv_call_deposit_pages(NUMA_NO_NODE, + partition_id, 1); + } while (!ret); + + return ret; +} + +int hv_call_set_vp_state(u32 vp_index, u64 partition_id, + /* Choose between pages and bytes */ + struct hv_vp_state_data state_data, u64 page_count, + struct page **pages, u32 num_bytes, u8 *bytes) +{ + struct hv_input_set_vp_state *input; + u64 status; + int i; + u64 control; + unsigned long flags; + int ret = 0; + u16 varhead_sz; + + if (page_count > HV_SET_VP_STATE_BATCH_SIZE) + return -EINVAL; + if (sizeof(*input) + num_bytes > HV_HYP_PAGE_SIZE) + return -EINVAL; + + if (num_bytes) + /* round up to 8 and divide by 8 */ + varhead_sz = (num_bytes + 7) >> 3; + else if (page_count) + varhead_sz = page_count; + else + return -EINVAL; + + do { + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + memset(input, 0, sizeof(*input)); + + input->partition_id = partition_id; + input->vp_index = vp_index; + input->state_data = state_data; + if (num_bytes) { + memcpy((u8 *)input->data, bytes, num_bytes); + } else { + for (i = 0; i < page_count; i++) + input->data[i].pfns = page_to_pfn(pages[i]); + } + + control = (HVCALL_SET_VP_STATE) | + (varhead_sz << HV_HYPERCALL_VARHEAD_OFFSET); + + status = hv_do_hypercall(control, input, NULL); + + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + local_irq_restore(flags); + ret = hv_result_to_errno(status); + break; + } + local_irq_restore(flags); + + ret = hv_call_deposit_pages(NUMA_NO_NODE, + partition_id, 1); + } while (!ret); + + return ret; +} + +static int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type, + union hv_input_vtl input_vtl, + struct page **state_page) +{ + struct hv_input_map_vp_state_page *input; + struct hv_output_map_vp_state_page *output; + u64 status; + int ret; + unsigned long flags; + + do { + local_irq_save(flags); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = 
*this_cpu_ptr(hyperv_pcpu_output_arg); + + memset(input, 0, sizeof(*input)); + input->partition_id = partition_id; + input->vp_index = vp_index; + input->type = type; + input->input_vtl = input_vtl; + + if (*state_page) { + input->flags.map_location_provided = 1; + input->requested_map_location = + page_to_pfn(*state_page); + } + + status = hv_do_hypercall(HVCALL_MAP_VP_STATE_PAGE, input, + output); + + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + if (hv_result_success(status)) + *state_page = pfn_to_page(output->map_location); + local_irq_restore(flags); + ret = hv_result_to_errno(status); + break; + } + + local_irq_restore(flags); + + ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, 1); + } while (!ret); + + return ret; +} + +static bool mshv_use_overlay_gpfn(void) +{ + return hv_l1vh_partition() && + mshv_root.vmm_caps.vmm_can_provide_overlay_gpfn; +} + +int hv_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type, + union hv_input_vtl input_vtl, + struct page **state_page) +{ + int ret = 0; + struct page *allocated_page = NULL; + + if (mshv_use_overlay_gpfn()) { + allocated_page = alloc_page(GFP_KERNEL); + if (!allocated_page) + return -ENOMEM; + *state_page = allocated_page; + } else { + *state_page = NULL; + } + + ret = hv_call_map_vp_state_page(partition_id, vp_index, type, input_vtl, + state_page); + + if (ret && allocated_page) { + __free_page(allocated_page); + *state_page = NULL; + } + + return ret; +} + +static int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type, + union hv_input_vtl input_vtl) +{ + unsigned long flags; + u64 status; + struct hv_input_unmap_vp_state_page *input; + + local_irq_save(flags); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + + memset(input, 0, sizeof(*input)); + + input->partition_id = partition_id; + input->vp_index = vp_index; + input->type = type; + input->input_vtl = input_vtl; + + status = hv_do_hypercall(HVCALL_UNMAP_VP_STATE_PAGE, input, NULL); + + local_irq_restore(flags); + + return hv_result_to_errno(status); +} + +int hv_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type, + struct page *state_page, union hv_input_vtl input_vtl) +{ + int ret = hv_call_unmap_vp_state_page(partition_id, vp_index, type, input_vtl); + + if (mshv_use_overlay_gpfn() && state_page) + __free_page(state_page); + + return ret; +} + +int hv_call_get_partition_property_ex(u64 partition_id, u64 property_code, + u64 arg, void *property_value, + size_t property_value_sz) +{ + u64 status; + unsigned long flags; + struct hv_input_get_partition_property_ex *input; + struct hv_output_get_partition_property_ex *output; + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + memset(input, 0, sizeof(*input)); + input->partition_id = partition_id; + input->property_code = property_code; + input->arg = arg; + status = hv_do_hypercall(HVCALL_GET_PARTITION_PROPERTY_EX, input, output); + + if (!hv_result_success(status)) { + local_irq_restore(flags); + hv_status_debug(status, "\n"); + return hv_result_to_errno(status); + } + memcpy(property_value, &output->property_value, property_value_sz); + + local_irq_restore(flags); + + return 0; +} + +int +hv_call_clear_virtual_interrupt(u64 partition_id) +{ + int status; + + status = hv_do_fast_hypercall8(HVCALL_CLEAR_VIRTUAL_INTERRUPT, + partition_id); + + return hv_result_to_errno(status); +} + +int +hv_call_create_port(u64 port_partition_id, union hv_port_id port_id, + u64 connection_partition_id, + struct 
hv_port_info *port_info, + u8 port_vtl, u8 min_connection_vtl, int node) +{ + struct hv_input_create_port *input; + unsigned long flags; + int ret = 0; + int status; + + do { + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + memset(input, 0, sizeof(*input)); + + input->port_partition_id = port_partition_id; + input->port_id = port_id; + input->connection_partition_id = connection_partition_id; + input->port_info = *port_info; + input->port_vtl = port_vtl; + input->min_connection_vtl = min_connection_vtl; + input->proximity_domain_info = hv_numa_node_to_pxm_info(node); + status = hv_do_hypercall(HVCALL_CREATE_PORT, input, NULL); + local_irq_restore(flags); + if (hv_result_success(status)) + break; + + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + ret = hv_result_to_errno(status); + break; + } + ret = hv_call_deposit_pages(NUMA_NO_NODE, port_partition_id, 1); + + } while (!ret); + + return ret; +} + +int +hv_call_delete_port(u64 port_partition_id, union hv_port_id port_id) +{ + union hv_input_delete_port input = { 0 }; + int status; + + input.port_partition_id = port_partition_id; + input.port_id = port_id; + status = hv_do_fast_hypercall16(HVCALL_DELETE_PORT, + input.as_uint64[0], + input.as_uint64[1]); + + return hv_result_to_errno(status); +} + +int +hv_call_connect_port(u64 port_partition_id, union hv_port_id port_id, + u64 connection_partition_id, + union hv_connection_id connection_id, + struct hv_connection_info *connection_info, + u8 connection_vtl, int node) +{ + struct hv_input_connect_port *input; + unsigned long flags; + int ret = 0, status; + + do { + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + memset(input, 0, sizeof(*input)); + input->port_partition_id = port_partition_id; + input->port_id = port_id; + input->connection_partition_id = connection_partition_id; + input->connection_id = connection_id; + input->connection_info = *connection_info; + input->connection_vtl = connection_vtl; + input->proximity_domain_info = hv_numa_node_to_pxm_info(node); + status = hv_do_hypercall(HVCALL_CONNECT_PORT, input, NULL); + + local_irq_restore(flags); + if (hv_result_success(status)) + break; + + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + ret = hv_result_to_errno(status); + break; + } + ret = hv_call_deposit_pages(NUMA_NO_NODE, + connection_partition_id, 1); + } while (!ret); + + return ret; +} + +int +hv_call_disconnect_port(u64 connection_partition_id, + union hv_connection_id connection_id) +{ + union hv_input_disconnect_port input = { 0 }; + int status; + + input.connection_partition_id = connection_partition_id; + input.connection_id = connection_id; + input.is_doorbell = 1; + status = hv_do_fast_hypercall16(HVCALL_DISCONNECT_PORT, + input.as_uint64[0], + input.as_uint64[1]); + + return hv_result_to_errno(status); +} + +int +hv_call_notify_port_ring_empty(u32 sint_index) +{ + union hv_input_notify_port_ring_empty input = { 0 }; + int status; + + input.sint_index = sint_index; + status = hv_do_fast_hypercall8(HVCALL_NOTIFY_PORT_RING_EMPTY, + input.as_uint64); + + return hv_result_to_errno(status); +} + +static int hv_call_map_stats_page2(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity, + u64 map_location) +{ + unsigned long flags; + struct hv_input_map_stats_page2 *input; + u64 status; + int ret; + + if (!map_location || !mshv_use_overlay_gpfn()) + return -EINVAL; + + do { + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + + memset(input, 0, 
sizeof(*input)); + input->type = type; + input->identity = *identity; + input->map_location = map_location; + + status = hv_do_hypercall(HVCALL_MAP_STATS_PAGE2, input, NULL); + + local_irq_restore(flags); + + ret = hv_result_to_errno(status); + + if (!ret) + break; + + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + hv_status_debug(status, "\n"); + break; + } + + ret = hv_call_deposit_pages(NUMA_NO_NODE, + hv_current_partition_id, 1); + } while (!ret); + + return ret; +} + +static int hv_call_map_stats_page(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity, + void **addr) +{ + unsigned long flags; + struct hv_input_map_stats_page *input; + struct hv_output_map_stats_page *output; + u64 status, pfn; + int ret = 0; + + do { + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + memset(input, 0, sizeof(*input)); + input->type = type; + input->identity = *identity; + + status = hv_do_hypercall(HVCALL_MAP_STATS_PAGE, input, output); + pfn = output->map_location; + + local_irq_restore(flags); + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + ret = hv_result_to_errno(status); + if (hv_result_success(status)) + break; + return ret; + } + + ret = hv_call_deposit_pages(NUMA_NO_NODE, + hv_current_partition_id, 1); + if (ret) + return ret; + } while (!ret); + + *addr = page_address(pfn_to_page(pfn)); + + return ret; +} + +int hv_map_stats_page(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity, + void **addr) +{ + int ret; + struct page *allocated_page = NULL; + + if (!addr) + return -EINVAL; + + if (mshv_use_overlay_gpfn()) { + allocated_page = alloc_page(GFP_KERNEL); + if (!allocated_page) + return -ENOMEM; + + ret = hv_call_map_stats_page2(type, identity, + page_to_pfn(allocated_page)); + *addr = page_address(allocated_page); + } else { + ret = hv_call_map_stats_page(type, identity, addr); + } + + if (ret && allocated_page) { + __free_page(allocated_page); + *addr = NULL; + } + + return ret; +} + +static int hv_call_unmap_stats_page(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity) +{ + unsigned long flags; + struct hv_input_unmap_stats_page *input; + u64 status; + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + + memset(input, 0, sizeof(*input)); + input->type = type; + input->identity = *identity; + + status = hv_do_hypercall(HVCALL_UNMAP_STATS_PAGE, input, NULL); + local_irq_restore(flags); + + return hv_result_to_errno(status); +} + +int hv_unmap_stats_page(enum hv_stats_object_type type, void *page_addr, + const union hv_stats_object_identity *identity) +{ + int ret; + + ret = hv_call_unmap_stats_page(type, identity); + + if (mshv_use_overlay_gpfn() && page_addr) + __free_page(virt_to_page(page_addr)); + + return ret; +} + +int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages, + u64 page_struct_count, u32 host_access, + u32 flags, u8 acquire) +{ + struct hv_input_modify_sparse_spa_page_host_access *input_page; + u64 status; + int done = 0; + unsigned long irq_flags, large_shift = 0; + u64 page_count = page_struct_count; + u16 code = acquire ? 
HVCALL_ACQUIRE_SPARSE_SPA_PAGE_HOST_ACCESS : + HVCALL_RELEASE_SPARSE_SPA_PAGE_HOST_ACCESS; + + if (page_count == 0) + return -EINVAL; + + if (flags & HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE) { + if (!HV_PAGE_COUNT_2M_ALIGNED(page_count)) + return -EINVAL; + large_shift = HV_HYP_LARGE_PAGE_SHIFT - HV_HYP_PAGE_SHIFT; + page_count >>= large_shift; + } + + while (done < page_count) { + ulong i, completed, remain = page_count - done; + int rep_count = min(remain, + HV_MODIFY_SPARSE_SPA_PAGE_HOST_ACCESS_MAX_PAGE_COUNT); + + local_irq_save(irq_flags); + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); + + memset(input_page, 0, sizeof(*input_page)); + /* Only set the partition id if you are making the pages + * exclusive + */ + if (flags & HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE) + input_page->partition_id = partition_id; + input_page->flags = flags; + input_page->host_access = host_access; + + for (i = 0; i < rep_count; i++) { + u64 index = (done + i) << large_shift; + + if (index >= page_struct_count) + return -EINVAL; + + input_page->spa_page_list[i] = + page_to_pfn(pages[index]); + } + + status = hv_do_rep_hypercall(code, rep_count, 0, input_page, + NULL); + local_irq_restore(irq_flags); + + completed = hv_repcomp(status); + + if (!hv_result_success(status)) + return hv_result_to_errno(status); + + done += completed; + } + + return 0; +} diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c new file mode 100644 index 000000000000..1134a82c7881 --- /dev/null +++ b/drivers/hv/mshv_root_main.c @@ -0,0 +1,2342 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024, Microsoft Corporation. + * + * The main part of the mshv_root module, providing APIs to create + * and manage guest partitions. + * + * Authors: Microsoft Linux virtualization team + */ + +#include <linux/entry-virt.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/miscdevice.h> +#include <linux/slab.h> +#include <linux/file.h> +#include <linux/anon_inodes.h> +#include <linux/mm.h> +#include <linux/io.h> +#include <linux/cpuhotplug.h> +#include <linux/random.h> +#include <asm/mshyperv.h> +#include <linux/hyperv.h> +#include <linux/notifier.h> +#include <linux/reboot.h> +#include <linux/kexec.h> +#include <linux/page-flags.h> +#include <linux/crash_dump.h> +#include <linux/panic_notifier.h> +#include <linux/vmalloc.h> +#include <linux/rseq.h> + +#include "mshv_eventfd.h" +#include "mshv.h" +#include "mshv_root.h" + +MODULE_AUTHOR("Microsoft"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv"); + +/* TODO move this to another file when debugfs code is added */ +enum hv_stats_vp_counters { /* HV_THREAD_COUNTER */ +#if defined(CONFIG_X86) + VpRootDispatchThreadBlocked = 202, +#elif defined(CONFIG_ARM64) + VpRootDispatchThreadBlocked = 94, +#endif + VpStatsMaxCounter +}; + +struct hv_stats_page { + union { + u64 vp_cntrs[VpStatsMaxCounter]; /* VP counters */ + u8 data[HV_HYP_PAGE_SIZE]; + }; +} __packed; + +struct mshv_root mshv_root; + +enum hv_scheduler_type hv_scheduler_type; + +/* Once we implement the fast extended hypercall ABI they can go away. 
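+ * ("They" being the per-cpu root_scheduler_input/output pointers declared below.)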
*/ +static void * __percpu *root_scheduler_input; +static void * __percpu *root_scheduler_output; + +static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); +static int mshv_dev_open(struct inode *inode, struct file *filp); +static int mshv_dev_release(struct inode *inode, struct file *filp); +static int mshv_vp_release(struct inode *inode, struct file *filp); +static long mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); +static int mshv_partition_release(struct inode *inode, struct file *filp); +static long mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); +static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma); +static vm_fault_t mshv_vp_fault(struct vm_fault *vmf); +static int mshv_init_async_handler(struct mshv_partition *partition); +static void mshv_async_hvcall_handler(void *data, u64 *status); + +static const union hv_input_vtl input_vtl_zero; +static const union hv_input_vtl input_vtl_normal = { + .target_vtl = HV_NORMAL_VTL, + .use_target_vtl = 1, +}; + +static const struct vm_operations_struct mshv_vp_vm_ops = { + .fault = mshv_vp_fault, +}; + +static const struct file_operations mshv_vp_fops = { + .owner = THIS_MODULE, + .release = mshv_vp_release, + .unlocked_ioctl = mshv_vp_ioctl, + .llseek = noop_llseek, + .mmap = mshv_vp_mmap, +}; + +static const struct file_operations mshv_partition_fops = { + .owner = THIS_MODULE, + .release = mshv_partition_release, + .unlocked_ioctl = mshv_partition_ioctl, + .llseek = noop_llseek, +}; + +static const struct file_operations mshv_dev_fops = { + .owner = THIS_MODULE, + .open = mshv_dev_open, + .release = mshv_dev_release, + .unlocked_ioctl = mshv_dev_ioctl, + .llseek = noop_llseek, +}; + +static struct miscdevice mshv_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "mshv", + .fops = &mshv_dev_fops, + .mode = 0600, +}; + +/* + * Only allow hypercalls that have a u64 partition id as the first member of + * the input structure. + * These are sorted by value. + */ +static u16 mshv_passthru_hvcalls[] = { + HVCALL_GET_PARTITION_PROPERTY, + HVCALL_GET_PARTITION_PROPERTY_EX, + HVCALL_SET_PARTITION_PROPERTY, + HVCALL_INSTALL_INTERCEPT, + HVCALL_GET_VP_REGISTERS, + HVCALL_SET_VP_REGISTERS, + HVCALL_TRANSLATE_VIRTUAL_ADDRESS, + HVCALL_CLEAR_VIRTUAL_INTERRUPT, + HVCALL_REGISTER_INTERCEPT_RESULT, + HVCALL_ASSERT_VIRTUAL_INTERRUPT, + HVCALL_GET_GPA_PAGES_ACCESS_STATES, + HVCALL_SIGNAL_EVENT_DIRECT, + HVCALL_POST_MESSAGE_DIRECT, + HVCALL_GET_VP_CPUID_VALUES, +}; + +/* + * Only allow hypercalls that are safe to be called by the VMM with the host + * partition as target (i.e. HV_PARTITION_ID_SELF). Carefully audit that a + * hypercall cannot be misused by the VMM before adding it to this list. 
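+ * For now only read-only partition property queries are on that list.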
+ */ +static u16 mshv_self_passthru_hvcalls[] = { + HVCALL_GET_PARTITION_PROPERTY, + HVCALL_GET_PARTITION_PROPERTY_EX, +}; + +static bool mshv_hvcall_is_async(u16 code) +{ + switch (code) { + case HVCALL_SET_PARTITION_PROPERTY: + return true; + default: + break; + } + return false; +} + +static bool mshv_passthru_hvcall_allowed(u16 code, u64 pt_id) +{ + int i; + int n = ARRAY_SIZE(mshv_passthru_hvcalls); + u16 *allowed_hvcalls = mshv_passthru_hvcalls; + + if (pt_id == HV_PARTITION_ID_SELF) { + n = ARRAY_SIZE(mshv_self_passthru_hvcalls); + allowed_hvcalls = mshv_self_passthru_hvcalls; + } + + for (i = 0; i < n; ++i) + if (allowed_hvcalls[i] == code) + return true; + + return false; +} + +static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition, + bool partition_locked, + void __user *user_args) +{ + u64 status; + int ret = 0; + bool is_async; + struct mshv_root_hvcall args; + struct page *page; + unsigned int pages_order; + void *input_pg = NULL; + void *output_pg = NULL; + u16 reps_completed; + u64 pt_id = partition ? partition->pt_id : HV_PARTITION_ID_SELF; + + if (copy_from_user(&args, user_args, sizeof(args))) + return -EFAULT; + + if (args.status || !args.in_ptr || args.in_sz < sizeof(u64) || + mshv_field_nonzero(args, rsvd) || args.in_sz > HV_HYP_PAGE_SIZE) + return -EINVAL; + + if (args.out_ptr && (!args.out_sz || args.out_sz > HV_HYP_PAGE_SIZE)) + return -EINVAL; + + if (!mshv_passthru_hvcall_allowed(args.code, pt_id)) + return -EINVAL; + + is_async = mshv_hvcall_is_async(args.code); + if (is_async) { + /* async hypercalls can only be called from partition fd */ + if (!partition || !partition_locked) + return -EINVAL; + ret = mshv_init_async_handler(partition); + if (ret) + return ret; + } + + pages_order = args.out_ptr ? 1 : 0; + page = alloc_pages(GFP_KERNEL, pages_order); + if (!page) + return -ENOMEM; + input_pg = page_address(page); + + if (args.out_ptr) + output_pg = (char *)input_pg + PAGE_SIZE; + else + output_pg = NULL; + + if (copy_from_user(input_pg, (void __user *)args.in_ptr, + args.in_sz)) { + ret = -EFAULT; + goto free_pages_out; + } + + /* + * NOTE: This only works because all the allowed hypercalls' input + * structs begin with a u64 partition_id field. + */ + *(u64 *)input_pg = pt_id; + + reps_completed = 0; + do { + if (args.reps) { + status = hv_do_rep_hypercall_ex(args.code, args.reps, + 0, reps_completed, + input_pg, output_pg); + reps_completed = hv_repcomp(status); + } else { + status = hv_do_hypercall(args.code, input_pg, output_pg); + } + + if (hv_result(status) == HV_STATUS_CALL_PENDING) { + if (is_async) { + mshv_async_hvcall_handler(partition, &status); + } else { /* Paranoia check. This shouldn't happen! 
*/ + ret = -EBADFD; + goto free_pages_out; + } + } + + if (hv_result_success(status)) + break; + + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) + ret = hv_result_to_errno(status); + else + ret = hv_call_deposit_pages(NUMA_NO_NODE, + pt_id, 1); + } while (!ret); + + args.status = hv_result(status); + args.reps = reps_completed; + if (copy_to_user(user_args, &args, sizeof(args))) + ret = -EFAULT; + + if (!ret && output_pg && + copy_to_user((void __user *)args.out_ptr, output_pg, args.out_sz)) + ret = -EFAULT; + +free_pages_out: + free_pages((unsigned long)input_pg, pages_order); + + return ret; +} + +static inline bool is_ghcb_mapping_available(void) +{ +#if IS_ENABLED(CONFIG_X86_64) + return ms_hyperv.ext_features & HV_VP_GHCB_ROOT_MAPPING_AVAILABLE; +#else + return 0; +#endif +} + +static int mshv_get_vp_registers(u32 vp_index, u64 partition_id, u16 count, + struct hv_register_assoc *registers) +{ + return hv_call_get_vp_registers(vp_index, partition_id, + count, input_vtl_zero, registers); +} + +static int mshv_set_vp_registers(u32 vp_index, u64 partition_id, u16 count, + struct hv_register_assoc *registers) +{ + return hv_call_set_vp_registers(vp_index, partition_id, + count, input_vtl_zero, registers); +} + +/* + * Explicit guest vCPU suspend is asynchronous by nature (as it is requested by + * dom0 vCPU for guest vCPU) and thus it can race with "intercept" suspend, + * done by the hypervisor. + * "Intercept" suspend leads to asynchronous message delivery to dom0 which + * should be awaited to keep the VP loop consistent (i.e. no message pending + * upon VP resume). + * VP intercept suspend can't be done when the VP is explicitly suspended + * already, and thus can be only two possible race scenarios: + * 1. implicit suspend bit set -> explicit suspend bit set -> message sent + * 2. implicit suspend bit set -> message sent -> explicit suspend bit set + * Checking for implicit suspend bit set after explicit suspend request has + * succeeded in either case allows us to reliably identify, if there is a + * message to receive and deliver to VMM. + */ +static int +mshv_suspend_vp(const struct mshv_vp *vp, bool *message_in_flight) +{ + struct hv_register_assoc explicit_suspend = { + .name = HV_REGISTER_EXPLICIT_SUSPEND + }; + struct hv_register_assoc intercept_suspend = { + .name = HV_REGISTER_INTERCEPT_SUSPEND + }; + union hv_explicit_suspend_register *es = + &explicit_suspend.value.explicit_suspend; + union hv_intercept_suspend_register *is = + &intercept_suspend.value.intercept_suspend; + int ret; + + es->suspended = 1; + + ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id, + 1, &explicit_suspend); + if (ret) { + vp_err(vp, "Failed to explicitly suspend vCPU\n"); + return ret; + } + + ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id, + 1, &intercept_suspend); + if (ret) { + vp_err(vp, "Failed to get intercept suspend state\n"); + return ret; + } + + *message_in_flight = is->suspended; + + return 0; +} + +/* + * This function is used when VPs are scheduled by the hypervisor's + * scheduler. + * + * Caller has to make sure the registers contain cleared + * HV_REGISTER_INTERCEPT_SUSPEND and HV_REGISTER_EXPLICIT_SUSPEND registers + * exactly in this order (the hypervisor clears them sequentially) to avoid + * potential invalid clearing a newly arrived HV_REGISTER_INTERCEPT_SUSPEND + * after VP is released from HV_REGISTER_EXPLICIT_SUSPEND in case of the + * opposite order. 
+ */ +static long mshv_run_vp_with_hyp_scheduler(struct mshv_vp *vp) +{ + long ret; + struct hv_register_assoc suspend_regs[2] = { + { .name = HV_REGISTER_INTERCEPT_SUSPEND }, + { .name = HV_REGISTER_EXPLICIT_SUSPEND } + }; + size_t count = ARRAY_SIZE(suspend_regs); + + /* Resume VP execution */ + ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id, + count, suspend_regs); + if (ret) { + vp_err(vp, "Failed to resume vp execution. %lx\n", ret); + return ret; + } + + ret = wait_event_interruptible(vp->run.vp_suspend_queue, + vp->run.kicked_by_hv == 1); + if (ret) { + bool message_in_flight; + + /* + * Otherwise the waiting was interrupted by a signal: suspend + * the vCPU explicitly and copy message in flight (if any). + */ + ret = mshv_suspend_vp(vp, &message_in_flight); + if (ret) + return ret; + + /* Return if no message in flight */ + if (!message_in_flight) + return -EINTR; + + /* Wait for the message in flight. */ + wait_event(vp->run.vp_suspend_queue, vp->run.kicked_by_hv == 1); + } + + /* + * Reset the flag to make the wait_event call above work + * next time. + */ + vp->run.kicked_by_hv = 0; + + return 0; +} + +static int +mshv_vp_dispatch(struct mshv_vp *vp, u32 flags, + struct hv_output_dispatch_vp *res) +{ + struct hv_input_dispatch_vp *input; + struct hv_output_dispatch_vp *output; + u64 status; + + preempt_disable(); + input = *this_cpu_ptr(root_scheduler_input); + output = *this_cpu_ptr(root_scheduler_output); + + memset(input, 0, sizeof(*input)); + memset(output, 0, sizeof(*output)); + + input->partition_id = vp->vp_partition->pt_id; + input->vp_index = vp->vp_index; + input->time_slice = 0; /* Run forever until something happens */ + input->spec_ctrl = 0; /* TODO: set sensible flags */ + input->flags = flags; + + vp->run.flags.root_sched_dispatched = 1; + status = hv_do_hypercall(HVCALL_DISPATCH_VP, input, output); + vp->run.flags.root_sched_dispatched = 0; + + *res = *output; + preempt_enable(); + + if (!hv_result_success(status)) + vp_err(vp, "%s: status %s\n", __func__, + hv_result_to_string(status)); + + return hv_result_to_errno(status); +} + +static int +mshv_vp_clear_explicit_suspend(struct mshv_vp *vp) +{ + struct hv_register_assoc explicit_suspend = { + .name = HV_REGISTER_EXPLICIT_SUSPEND, + .value.explicit_suspend.suspended = 0, + }; + int ret; + + ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id, + 1, &explicit_suspend); + + if (ret) + vp_err(vp, "Failed to unsuspend\n"); + + return ret; +} + +#if IS_ENABLED(CONFIG_X86_64) +static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp) +{ + if (!vp->vp_register_page) + return 0; + return vp->vp_register_page->interrupt_vectors.as_uint64; +} +#else +static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp) +{ + return 0; +} +#endif + +static bool mshv_vp_dispatch_thread_blocked(struct mshv_vp *vp) +{ + struct hv_stats_page **stats = vp->vp_stats_pages; + u64 *self_vp_cntrs = stats[HV_STATS_AREA_SELF]->vp_cntrs; + u64 *parent_vp_cntrs = stats[HV_STATS_AREA_PARENT]->vp_cntrs; + + if (self_vp_cntrs[VpRootDispatchThreadBlocked]) + return self_vp_cntrs[VpRootDispatchThreadBlocked]; + return parent_vp_cntrs[VpRootDispatchThreadBlocked]; +} + +static int +mshv_vp_wait_for_hv_kick(struct mshv_vp *vp) +{ + int ret; + + ret = wait_event_interruptible(vp->run.vp_suspend_queue, + (vp->run.kicked_by_hv == 1 && + !mshv_vp_dispatch_thread_blocked(vp)) || + mshv_vp_interrupt_pending(vp)); + if (ret) + return -EINTR; + + vp->run.flags.root_sched_blocked = 0; + vp->run.kicked_by_hv = 0; + + return 0; +} + 
+/* Must be called with interrupts enabled */ +static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp) +{ + long ret; + + if (vp->run.flags.root_sched_blocked) { + /* + * Dispatch state of this VP is blocked. Need to wait + * for the hypervisor to clear the blocked state before + * dispatching it. + */ + ret = mshv_vp_wait_for_hv_kick(vp); + if (ret) + return ret; + } + + do { + u32 flags = 0; + struct hv_output_dispatch_vp output; + + if (__xfer_to_guest_mode_work_pending()) { + ret = xfer_to_guest_mode_handle_work(); + if (ret) + break; + } + + if (vp->run.flags.intercept_suspend) + flags |= HV_DISPATCH_VP_FLAG_CLEAR_INTERCEPT_SUSPEND; + + if (mshv_vp_interrupt_pending(vp)) + flags |= HV_DISPATCH_VP_FLAG_SCAN_INTERRUPT_INJECTION; + + ret = mshv_vp_dispatch(vp, flags, &output); + if (ret) + break; + + vp->run.flags.intercept_suspend = 0; + + if (output.dispatch_state == HV_VP_DISPATCH_STATE_BLOCKED) { + if (output.dispatch_event == + HV_VP_DISPATCH_EVENT_SUSPEND) { + /* + * TODO: remove the warning once VP canceling + * is supported + */ + WARN_ONCE(atomic64_read(&vp->run.vp_signaled_count), + "%s: vp#%d: unexpected explicit suspend\n", + __func__, vp->vp_index); + /* + * Need to clear explicit suspend before + * dispatching. + * Explicit suspend is either: + * - set right after the first VP dispatch or + * - set explicitly via hypercall + * Since the latter case is not yet supported, + * simply clear it here. + */ + ret = mshv_vp_clear_explicit_suspend(vp); + if (ret) + break; + + ret = mshv_vp_wait_for_hv_kick(vp); + if (ret) + break; + } else { + vp->run.flags.root_sched_blocked = 1; + ret = mshv_vp_wait_for_hv_kick(vp); + if (ret) + break; + } + } else { + /* HV_VP_DISPATCH_STATE_READY */ + if (output.dispatch_event == + HV_VP_DISPATCH_EVENT_INTERCEPT) + vp->run.flags.intercept_suspend = 1; + } + } while (!vp->run.flags.intercept_suspend); + + rseq_virt_userspace_exit(); + + return ret; +} + +static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ, + "sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ"); + +static struct mshv_mem_region * +mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn) +{ + struct mshv_mem_region *region; + + hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) { + if (gfn >= region->start_gfn && + gfn < region->start_gfn + region->nr_pages) + return region; + } + + return NULL; +} + +#ifdef CONFIG_X86_64 +static struct mshv_mem_region * +mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn) +{ + struct mshv_mem_region *region; + + spin_lock(&p->pt_mem_regions_lock); + region = mshv_partition_region_by_gfn(p, gfn); + if (!region || !mshv_region_get(region)) { + spin_unlock(&p->pt_mem_regions_lock); + return NULL; + } + spin_unlock(&p->pt_mem_regions_lock); + + return region; +} + +/** + * mshv_handle_gpa_intercept - Handle GPA (Guest Physical Address) intercepts. + * @vp: Pointer to the virtual processor structure. + * + * This function processes GPA intercepts by identifying the memory region + * corresponding to the intercepted GPA, aligning the page offset, and + * mapping the required pages. It ensures that the region is valid and + * handles faults efficiently by mapping multiple pages at once. + * + * Return: true if the intercept was handled successfully, false otherwise. 
+ */ +static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) +{ + struct mshv_partition *p = vp->vp_partition; + struct mshv_mem_region *region; + struct hv_x64_memory_intercept_message *msg; + bool ret; + u64 gfn; + + msg = (struct hv_x64_memory_intercept_message *) + vp->vp_intercept_msg_page->u.payload; + + gfn = HVPFN_DOWN(msg->guest_physical_address); + + region = mshv_partition_region_by_gfn_get(p, gfn); + if (!region) + return false; + + /* Only movable memory ranges are supported for GPA intercepts */ + if (region->type == MSHV_REGION_TYPE_MEM_MOVABLE) + ret = mshv_region_handle_gfn_fault(region, gfn); + else + ret = false; + + mshv_region_put(region); + + return ret; +} +#else /* CONFIG_X86_64 */ +static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) { return false; } +#endif /* CONFIG_X86_64 */ + +static bool mshv_vp_handle_intercept(struct mshv_vp *vp) +{ + switch (vp->vp_intercept_msg_page->header.message_type) { + case HVMSG_GPA_INTERCEPT: + return mshv_handle_gpa_intercept(vp); + } + return false; +} + +static long mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void __user *ret_msg) +{ + long rc; + + do { + if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) + rc = mshv_run_vp_with_root_scheduler(vp); + else + rc = mshv_run_vp_with_hyp_scheduler(vp); + } while (rc == 0 && mshv_vp_handle_intercept(vp)); + + if (rc) + return rc; + + if (copy_to_user(ret_msg, vp->vp_intercept_msg_page, + sizeof(struct hv_message))) + rc = -EFAULT; + + return rc; +} + +static int +mshv_vp_ioctl_get_set_state_pfn(struct mshv_vp *vp, + struct hv_vp_state_data state_data, + unsigned long user_pfn, size_t page_count, + bool is_set) +{ + int completed, ret = 0; + unsigned long check; + struct page **pages; + + if (page_count > INT_MAX) + return -EINVAL; + /* + * Check the arithmetic for wraparound/overflow. 
+ * The last page address in the buffer is: + * (user_pfn + (page_count - 1)) * PAGE_SIZE + */ + if (check_add_overflow(user_pfn, (page_count - 1), &check)) + return -EOVERFLOW; + if (check_mul_overflow(check, PAGE_SIZE, &check)) + return -EOVERFLOW; + + /* Pin user pages so hypervisor can copy directly to them */ + pages = kcalloc(page_count, sizeof(struct page *), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + for (completed = 0; completed < page_count; completed += ret) { + unsigned long user_addr = (user_pfn + completed) * PAGE_SIZE; + int remaining = page_count - completed; + + ret = pin_user_pages_fast(user_addr, remaining, FOLL_WRITE, + &pages[completed]); + if (ret < 0) { + vp_err(vp, "%s: Failed to pin user pages error %i\n", + __func__, ret); + goto unpin_pages; + } + } + + if (is_set) + ret = hv_call_set_vp_state(vp->vp_index, + vp->vp_partition->pt_id, + state_data, page_count, pages, + 0, NULL); + else + ret = hv_call_get_vp_state(vp->vp_index, + vp->vp_partition->pt_id, + state_data, page_count, pages, + NULL); + +unpin_pages: + unpin_user_pages(pages, completed); + kfree(pages); + return ret; +} + +static long +mshv_vp_ioctl_get_set_state(struct mshv_vp *vp, + struct mshv_get_set_vp_state __user *user_args, + bool is_set) +{ + struct mshv_get_set_vp_state args; + long ret = 0; + union hv_output_get_vp_state vp_state; + u32 data_sz; + struct hv_vp_state_data state_data = {}; + + if (copy_from_user(&args, user_args, sizeof(args))) + return -EFAULT; + + if (args.type >= MSHV_VP_STATE_COUNT || mshv_field_nonzero(args, rsvd) || + !args.buf_sz || !PAGE_ALIGNED(args.buf_sz) || + !PAGE_ALIGNED(args.buf_ptr)) + return -EINVAL; + + if (!access_ok((void __user *)args.buf_ptr, args.buf_sz)) + return -EFAULT; + + switch (args.type) { + case MSHV_VP_STATE_LAPIC: + state_data.type = HV_GET_SET_VP_STATE_LAPIC_STATE; + data_sz = HV_HYP_PAGE_SIZE; + break; + case MSHV_VP_STATE_XSAVE: + { + u64 data_sz_64; + + ret = hv_call_get_partition_property(vp->vp_partition->pt_id, + HV_PARTITION_PROPERTY_XSAVE_STATES, + &state_data.xsave.states.as_uint64); + if (ret) + return ret; + + ret = hv_call_get_partition_property(vp->vp_partition->pt_id, + HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE, + &data_sz_64); + if (ret) + return ret; + + data_sz = (u32)data_sz_64; + state_data.xsave.flags = 0; + /* Always request legacy states */ + state_data.xsave.states.legacy_x87 = 1; + state_data.xsave.states.legacy_sse = 1; + state_data.type = HV_GET_SET_VP_STATE_XSAVE; + break; + } + case MSHV_VP_STATE_SIMP: + state_data.type = HV_GET_SET_VP_STATE_SIM_PAGE; + data_sz = HV_HYP_PAGE_SIZE; + break; + case MSHV_VP_STATE_SIEFP: + state_data.type = HV_GET_SET_VP_STATE_SIEF_PAGE; + data_sz = HV_HYP_PAGE_SIZE; + break; + case MSHV_VP_STATE_SYNTHETIC_TIMERS: + state_data.type = HV_GET_SET_VP_STATE_SYNTHETIC_TIMERS; + data_sz = sizeof(vp_state.synthetic_timers_state); + break; + default: + return -EINVAL; + } + + if (copy_to_user(&user_args->buf_sz, &data_sz, sizeof(user_args->buf_sz))) + return -EFAULT; + + if (data_sz > args.buf_sz) + return -EINVAL; + + /* If the data is transmitted via pfns, delegate to helper */ + if (state_data.type & HV_GET_SET_VP_STATE_TYPE_PFN) { + unsigned long user_pfn = PFN_DOWN(args.buf_ptr); + size_t page_count = PFN_DOWN(args.buf_sz); + + return mshv_vp_ioctl_get_set_state_pfn(vp, state_data, user_pfn, + page_count, is_set); + } + + /* Paranoia check - this shouldn't happen! 
*/ + if (data_sz > sizeof(vp_state)) { + vp_err(vp, "Invalid vp state data size!\n"); + return -EINVAL; + } + + if (is_set) { + if (copy_from_user(&vp_state, (__user void *)args.buf_ptr, data_sz)) + return -EFAULT; + + return hv_call_set_vp_state(vp->vp_index, + vp->vp_partition->pt_id, + state_data, 0, NULL, + sizeof(vp_state), (u8 *)&vp_state); + } + + ret = hv_call_get_vp_state(vp->vp_index, vp->vp_partition->pt_id, + state_data, 0, NULL, &vp_state); + if (ret) + return ret; + + if (copy_to_user((void __user *)args.buf_ptr, &vp_state, data_sz)) + return -EFAULT; + + return 0; +} + +static long +mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) +{ + struct mshv_vp *vp = filp->private_data; + long r = -ENOTTY; + + if (mutex_lock_killable(&vp->vp_mutex)) + return -EINTR; + + switch (ioctl) { + case MSHV_RUN_VP: + r = mshv_vp_ioctl_run_vp(vp, (void __user *)arg); + break; + case MSHV_GET_VP_STATE: + r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, false); + break; + case MSHV_SET_VP_STATE: + r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, true); + break; + case MSHV_ROOT_HVCALL: + r = mshv_ioctl_passthru_hvcall(vp->vp_partition, false, + (void __user *)arg); + break; + default: + vp_warn(vp, "Invalid ioctl: %#x\n", ioctl); + break; + } + mutex_unlock(&vp->vp_mutex); + + return r; +} + +static vm_fault_t mshv_vp_fault(struct vm_fault *vmf) +{ + struct mshv_vp *vp = vmf->vma->vm_file->private_data; + + switch (vmf->vma->vm_pgoff) { + case MSHV_VP_MMAP_OFFSET_REGISTERS: + vmf->page = virt_to_page(vp->vp_register_page); + break; + case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE: + vmf->page = virt_to_page(vp->vp_intercept_msg_page); + break; + case MSHV_VP_MMAP_OFFSET_GHCB: + vmf->page = virt_to_page(vp->vp_ghcb_page); + break; + default: + return VM_FAULT_SIGBUS; + } + + get_page(vmf->page); + + return 0; +} + +static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct mshv_vp *vp = file->private_data; + + switch (vma->vm_pgoff) { + case MSHV_VP_MMAP_OFFSET_REGISTERS: + if (!vp->vp_register_page) + return -ENODEV; + break; + case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE: + if (!vp->vp_intercept_msg_page) + return -ENODEV; + break; + case MSHV_VP_MMAP_OFFSET_GHCB: + if (!vp->vp_ghcb_page) + return -ENODEV; + break; + default: + return -EINVAL; + } + + vma->vm_ops = &mshv_vp_vm_ops; + return 0; +} + +static int +mshv_vp_release(struct inode *inode, struct file *filp) +{ + struct mshv_vp *vp = filp->private_data; + + /* Rest of VP cleanup happens in destroy_partition() */ + mshv_partition_put(vp->vp_partition); + return 0; +} + +static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index, + void *stats_pages[]) +{ + union hv_stats_object_identity identity = { + .vp.partition_id = partition_id, + .vp.vp_index = vp_index, + }; + + identity.vp.stats_area_type = HV_STATS_AREA_SELF; + hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity); + + identity.vp.stats_area_type = HV_STATS_AREA_PARENT; + hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity); +} + +static int mshv_vp_stats_map(u64 partition_id, u32 vp_index, + void *stats_pages[]) +{ + union hv_stats_object_identity identity = { + .vp.partition_id = partition_id, + .vp.vp_index = vp_index, + }; + int err; + + identity.vp.stats_area_type = HV_STATS_AREA_SELF; + err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity, + &stats_pages[HV_STATS_AREA_SELF]); + if (err) + return err; + + identity.vp.stats_area_type = HV_STATS_AREA_PARENT; + err = hv_map_stats_page(HV_STATS_OBJECT_VP, 
&identity, + &stats_pages[HV_STATS_AREA_PARENT]); + if (err) + goto unmap_self; + + return 0; + +unmap_self: + identity.vp.stats_area_type = HV_STATS_AREA_SELF; + hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity); + return err; +} + +static long +mshv_partition_ioctl_create_vp(struct mshv_partition *partition, + void __user *arg) +{ + struct mshv_create_vp args; + struct mshv_vp *vp; + struct page *intercept_msg_page, *register_page, *ghcb_page; + void *stats_pages[2]; + long ret; + + if (copy_from_user(&args, arg, sizeof(args))) + return -EFAULT; + + if (args.vp_index >= MSHV_MAX_VPS) + return -EINVAL; + + if (partition->pt_vp_array[args.vp_index]) + return -EEXIST; + + ret = hv_call_create_vp(NUMA_NO_NODE, partition->pt_id, args.vp_index, + 0 /* Only valid for root partition VPs */); + if (ret) + return ret; + + ret = hv_map_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, + input_vtl_zero, &intercept_msg_page); + if (ret) + goto destroy_vp; + + if (!mshv_partition_encrypted(partition)) { + ret = hv_map_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_REGISTERS, + input_vtl_zero, ®ister_page); + if (ret) + goto unmap_intercept_message_page; + } + + if (mshv_partition_encrypted(partition) && + is_ghcb_mapping_available()) { + ret = hv_map_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_GHCB, + input_vtl_normal, &ghcb_page); + if (ret) + goto unmap_register_page; + } + + /* + * This mapping of the stats page is for detecting if dispatch thread + * is blocked - only relevant for root scheduler + */ + if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) { + ret = mshv_vp_stats_map(partition->pt_id, args.vp_index, + stats_pages); + if (ret) + goto unmap_ghcb_page; + } + + vp = kzalloc(sizeof(*vp), GFP_KERNEL); + if (!vp) + goto unmap_stats_pages; + + vp->vp_partition = mshv_partition_get(partition); + if (!vp->vp_partition) { + ret = -EBADF; + goto free_vp; + } + + mutex_init(&vp->vp_mutex); + init_waitqueue_head(&vp->run.vp_suspend_queue); + atomic64_set(&vp->run.vp_signaled_count, 0); + + vp->vp_index = args.vp_index; + vp->vp_intercept_msg_page = page_to_virt(intercept_msg_page); + if (!mshv_partition_encrypted(partition)) + vp->vp_register_page = page_to_virt(register_page); + + if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) + vp->vp_ghcb_page = page_to_virt(ghcb_page); + + if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) + memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages)); + + /* + * Keep anon_inode_getfd last: it installs fd in the file struct and + * thus makes the state accessible in user space. 
+ */ + ret = anon_inode_getfd("mshv_vp", &mshv_vp_fops, vp, + O_RDWR | O_CLOEXEC); + if (ret < 0) + goto put_partition; + + /* already exclusive with the partition mutex for all ioctls */ + partition->pt_vp_count++; + partition->pt_vp_array[args.vp_index] = vp; + + return ret; + +put_partition: + mshv_partition_put(partition); +free_vp: + kfree(vp); +unmap_stats_pages: + if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) + mshv_vp_stats_unmap(partition->pt_id, args.vp_index, stats_pages); +unmap_ghcb_page: + if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) + hv_unmap_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_GHCB, ghcb_page, + input_vtl_normal); +unmap_register_page: + if (!mshv_partition_encrypted(partition)) + hv_unmap_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_REGISTERS, + register_page, input_vtl_zero); +unmap_intercept_message_page: + hv_unmap_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, + intercept_msg_page, input_vtl_zero); +destroy_vp: + hv_call_delete_vp(partition->pt_id, args.vp_index); + return ret; +} + +static int mshv_init_async_handler(struct mshv_partition *partition) +{ + if (completion_done(&partition->async_hypercall)) { + pt_err(partition, + "Cannot issue async hypercall while another one in progress!\n"); + return -EPERM; + } + + reinit_completion(&partition->async_hypercall); + return 0; +} + +static void mshv_async_hvcall_handler(void *data, u64 *status) +{ + struct mshv_partition *partition = data; + + wait_for_completion(&partition->async_hypercall); + pt_dbg(partition, "Async hypercall completed!\n"); + + *status = partition->async_hypercall_status; +} + +/* + * NB: caller checks and makes sure mem->size is page aligned + * Returns: 0 with regionpp updated on success, or -errno + */ +static int mshv_partition_create_region(struct mshv_partition *partition, + struct mshv_user_mem_region *mem, + struct mshv_mem_region **regionpp, + bool is_mmio) +{ + struct mshv_mem_region *rg; + u64 nr_pages = HVPFN_DOWN(mem->size); + + /* Reject overlapping regions */ + spin_lock(&partition->pt_mem_regions_lock); + hlist_for_each_entry(rg, &partition->pt_mem_regions, hnode) { + if (mem->guest_pfn + nr_pages <= rg->start_gfn || + rg->start_gfn + rg->nr_pages <= mem->guest_pfn) + continue; + spin_unlock(&partition->pt_mem_regions_lock); + return -EEXIST; + } + spin_unlock(&partition->pt_mem_regions_lock); + + rg = mshv_region_create(mem->guest_pfn, nr_pages, + mem->userspace_addr, mem->flags); + if (IS_ERR(rg)) + return PTR_ERR(rg); + + if (is_mmio) + rg->type = MSHV_REGION_TYPE_MMIO; + else if (mshv_partition_encrypted(partition) || + !mshv_region_movable_init(rg)) + rg->type = MSHV_REGION_TYPE_MEM_PINNED; + else + rg->type = MSHV_REGION_TYPE_MEM_MOVABLE; + + rg->partition = partition; + + *regionpp = rg; + + return 0; +} + +/** + * mshv_prepare_pinned_region - Pin and map memory regions + * @region: Pointer to the memory region structure + * + * This function processes memory regions that are explicitly marked as pinned. + * Pinned regions are preallocated, mapped upfront, and do not rely on fault-based + * population. The function ensures the region is properly populated, handles + * encryption requirements for SNP partitions if applicable, maps the region, + * and performs necessary sharing or eviction operations based on the mapping + * result. + * + * Return: 0 on success, negative error code on failure. 
+ */ +static int mshv_prepare_pinned_region(struct mshv_mem_region *region) +{ + struct mshv_partition *partition = region->partition; + int ret; + + ret = mshv_region_pin(region); + if (ret) { + pt_err(partition, "Failed to pin memory region: %d\n", + ret); + goto err_out; + } + + /* + * For an SNP partition it is a requirement that for every memory region + * that we are going to map for this partition we should make sure that + * host access to that region is released. This is ensured by doing an + * additional hypercall which will update the SLAT to release host + * access to guest memory regions. + */ + if (mshv_partition_encrypted(partition)) { + ret = mshv_region_unshare(region); + if (ret) { + pt_err(partition, + "Failed to unshare memory region (guest_pfn: %llu): %d\n", + region->start_gfn, ret); + goto invalidate_region; + } + } + + ret = mshv_region_map(region); + if (ret && mshv_partition_encrypted(partition)) { + int shrc; + + shrc = mshv_region_share(region); + if (!shrc) + goto invalidate_region; + + pt_err(partition, + "Failed to share memory region (guest_pfn: %llu): %d\n", + region->start_gfn, shrc); + /* + * Don't unpin if marking shared failed because pages are no + * longer mapped in the host, ie root, anymore. + */ + goto err_out; + } + + return 0; + +invalidate_region: + mshv_region_invalidate(region); +err_out: + return ret; +} + +/* + * This maps two things: guest RAM and for pci passthru mmio space. + * + * mmio: + * - vfio overloads vm_pgoff to store the mmio start pfn/spa. + * - Two things need to happen for mapping mmio range: + * 1. mapped in the uaddr so VMM can access it. + * 2. mapped in the hwpt (gfn <-> mmio phys addr) so guest can access it. + * + * This function takes care of the second. The first one is managed by vfio, + * and hence is taken care of via vfio_pci_mmap_fault(). + */ +static long +mshv_map_user_memory(struct mshv_partition *partition, + struct mshv_user_mem_region mem) +{ + struct mshv_mem_region *region; + struct vm_area_struct *vma; + bool is_mmio; + ulong mmio_pfn; + long ret; + + if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP) || + !access_ok((const void *)mem.userspace_addr, mem.size)) + return -EINVAL; + + mmap_read_lock(current->mm); + vma = vma_lookup(current->mm, mem.userspace_addr); + is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0; + mmio_pfn = is_mmio ? vma->vm_pgoff : 0; + mmap_read_unlock(current->mm); + + if (!vma) + return -EINVAL; + + ret = mshv_partition_create_region(partition, &mem, ®ion, + is_mmio); + if (ret) + return ret; + + switch (region->type) { + case MSHV_REGION_TYPE_MEM_PINNED: + ret = mshv_prepare_pinned_region(region); + break; + case MSHV_REGION_TYPE_MEM_MOVABLE: + /* + * For movable memory regions, remap with no access to let + * the hypervisor track dirty pages, enabling pre-copy live + * migration. 
+ */ + ret = hv_call_map_gpa_pages(partition->pt_id, + region->start_gfn, + region->nr_pages, + HV_MAP_GPA_NO_ACCESS, NULL); + break; + case MSHV_REGION_TYPE_MMIO: + ret = hv_call_map_mmio_pages(partition->pt_id, + region->start_gfn, + mmio_pfn, + region->nr_pages); + break; + } + + if (ret) + goto errout; + + spin_lock(&partition->pt_mem_regions_lock); + hlist_add_head(®ion->hnode, &partition->pt_mem_regions); + spin_unlock(&partition->pt_mem_regions_lock); + + return 0; + +errout: + vfree(region); + return ret; +} + +/* Called for unmapping both the guest ram and the mmio space */ +static long +mshv_unmap_user_memory(struct mshv_partition *partition, + struct mshv_user_mem_region mem) +{ + struct mshv_mem_region *region; + + if (!(mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP))) + return -EINVAL; + + spin_lock(&partition->pt_mem_regions_lock); + + region = mshv_partition_region_by_gfn(partition, mem.guest_pfn); + if (!region) { + spin_unlock(&partition->pt_mem_regions_lock); + return -ENOENT; + } + + /* Paranoia check */ + if (region->start_uaddr != mem.userspace_addr || + region->start_gfn != mem.guest_pfn || + region->nr_pages != HVPFN_DOWN(mem.size)) { + spin_unlock(&partition->pt_mem_regions_lock); + return -EINVAL; + } + + hlist_del(®ion->hnode); + + spin_unlock(&partition->pt_mem_regions_lock); + + mshv_region_put(region); + + return 0; +} + +static long +mshv_partition_ioctl_set_memory(struct mshv_partition *partition, + struct mshv_user_mem_region __user *user_mem) +{ + struct mshv_user_mem_region mem; + + if (copy_from_user(&mem, user_mem, sizeof(mem))) + return -EFAULT; + + if (!mem.size || + !PAGE_ALIGNED(mem.size) || + !PAGE_ALIGNED(mem.userspace_addr) || + (mem.flags & ~MSHV_SET_MEM_FLAGS_MASK) || + mshv_field_nonzero(mem, rsvd)) + return -EINVAL; + + if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP)) + return mshv_unmap_user_memory(partition, mem); + + return mshv_map_user_memory(partition, mem); +} + +static long +mshv_partition_ioctl_ioeventfd(struct mshv_partition *partition, + void __user *user_args) +{ + struct mshv_user_ioeventfd args; + + if (copy_from_user(&args, user_args, sizeof(args))) + return -EFAULT; + + return mshv_set_unset_ioeventfd(partition, &args); +} + +static long +mshv_partition_ioctl_irqfd(struct mshv_partition *partition, + void __user *user_args) +{ + struct mshv_user_irqfd args; + + if (copy_from_user(&args, user_args, sizeof(args))) + return -EFAULT; + + return mshv_set_unset_irqfd(partition, &args); +} + +static long +mshv_partition_ioctl_get_gpap_access_bitmap(struct mshv_partition *partition, + void __user *user_args) +{ + struct mshv_gpap_access_bitmap args; + union hv_gpa_page_access_state *states; + long ret, i; + union hv_gpa_page_access_state_flags hv_flags = {}; + u8 hv_type_mask; + ulong bitmap_buf_sz, states_buf_sz; + int written = 0; + + if (copy_from_user(&args, user_args, sizeof(args))) + return -EFAULT; + + if (args.access_type >= MSHV_GPAP_ACCESS_TYPE_COUNT || + args.access_op >= MSHV_GPAP_ACCESS_OP_COUNT || + mshv_field_nonzero(args, rsvd) || !args.page_count || + !args.bitmap_ptr) + return -EINVAL; + + if (check_mul_overflow(args.page_count, sizeof(*states), &states_buf_sz)) + return -E2BIG; + + /* Num bytes needed to store bitmap; one bit per page rounded up */ + bitmap_buf_sz = DIV_ROUND_UP(args.page_count, 8); + + /* Sanity check */ + if (bitmap_buf_sz > states_buf_sz) + return -EBADFD; + + switch (args.access_type) { + case MSHV_GPAP_ACCESS_TYPE_ACCESSED: + hv_type_mask = 1; + if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) { + 
hv_flags.clear_accessed = 1; + /* not accessed implies not dirty */ + hv_flags.clear_dirty = 1; + } else { /* MSHV_GPAP_ACCESS_OP_SET */ + hv_flags.set_accessed = 1; + } + break; + case MSHV_GPAP_ACCESS_TYPE_DIRTY: + hv_type_mask = 2; + if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) { + hv_flags.clear_dirty = 1; + } else { /* MSHV_GPAP_ACCESS_OP_SET */ + hv_flags.set_dirty = 1; + /* dirty implies accessed */ + hv_flags.set_accessed = 1; + } + break; + } + + states = vzalloc(states_buf_sz); + if (!states) + return -ENOMEM; + + ret = hv_call_get_gpa_access_states(partition->pt_id, args.page_count, + args.gpap_base, hv_flags, &written, + states); + if (ret) + goto free_return; + + /* + * Overwrite states buffer with bitmap - the bits in hv_type_mask + * correspond to bitfields in hv_gpa_page_access_state + */ + for (i = 0; i < written; ++i) + __assign_bit(i, (ulong *)states, + states[i].as_uint8 & hv_type_mask); + + /* zero the unused bits in the last byte(s) of the returned bitmap */ + for (i = written; i < bitmap_buf_sz * 8; ++i) + __clear_bit(i, (ulong *)states); + + if (copy_to_user((void __user *)args.bitmap_ptr, states, bitmap_buf_sz)) + ret = -EFAULT; + +free_return: + vfree(states); + return ret; +} + +static long +mshv_partition_ioctl_set_msi_routing(struct mshv_partition *partition, + void __user *user_args) +{ + struct mshv_user_irq_entry *entries = NULL; + struct mshv_user_irq_table args; + long ret; + + if (copy_from_user(&args, user_args, sizeof(args))) + return -EFAULT; + + if (args.nr > MSHV_MAX_GUEST_IRQS || + mshv_field_nonzero(args, rsvd)) + return -EINVAL; + + if (args.nr) { + struct mshv_user_irq_table __user *urouting = user_args; + + entries = vmemdup_user(urouting->entries, + array_size(sizeof(*entries), + args.nr)); + if (IS_ERR(entries)) + return PTR_ERR(entries); + } + ret = mshv_update_routing_table(partition, entries, args.nr); + kvfree(entries); + + return ret; +} + +static long +mshv_partition_ioctl_initialize(struct mshv_partition *partition) +{ + long ret; + + if (partition->pt_initialized) + return 0; + + ret = hv_call_initialize_partition(partition->pt_id); + if (ret) + goto withdraw_mem; + + partition->pt_initialized = true; + + return 0; + +withdraw_mem: + hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id); + + return ret; +} + +static long +mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) +{ + struct mshv_partition *partition = filp->private_data; + long ret; + void __user *uarg = (void __user *)arg; + + if (mutex_lock_killable(&partition->pt_mutex)) + return -EINTR; + + switch (ioctl) { + case MSHV_INITIALIZE_PARTITION: + ret = mshv_partition_ioctl_initialize(partition); + break; + case MSHV_SET_GUEST_MEMORY: + ret = mshv_partition_ioctl_set_memory(partition, uarg); + break; + case MSHV_CREATE_VP: + ret = mshv_partition_ioctl_create_vp(partition, uarg); + break; + case MSHV_IRQFD: + ret = mshv_partition_ioctl_irqfd(partition, uarg); + break; + case MSHV_IOEVENTFD: + ret = mshv_partition_ioctl_ioeventfd(partition, uarg); + break; + case MSHV_SET_MSI_ROUTING: + ret = mshv_partition_ioctl_set_msi_routing(partition, uarg); + break; + case MSHV_GET_GPAP_ACCESS_BITMAP: + ret = mshv_partition_ioctl_get_gpap_access_bitmap(partition, + uarg); + break; + case MSHV_ROOT_HVCALL: + ret = mshv_ioctl_passthru_hvcall(partition, true, uarg); + break; + default: + ret = -ENOTTY; + } + + mutex_unlock(&partition->pt_mutex); + return ret; +} + +static int +disable_vp_dispatch(struct mshv_vp *vp) +{ + int ret; + struct 
hv_register_assoc dispatch_suspend = { + .name = HV_REGISTER_DISPATCH_SUSPEND, + .value.dispatch_suspend.suspended = 1, + }; + + ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id, + 1, &dispatch_suspend); + if (ret) + vp_err(vp, "failed to suspend\n"); + + return ret; +} + +static int +get_vp_signaled_count(struct mshv_vp *vp, u64 *count) +{ + int ret; + struct hv_register_assoc root_signal_count = { + .name = HV_REGISTER_VP_ROOT_SIGNAL_COUNT, + }; + + ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id, + 1, &root_signal_count); + + if (ret) { + vp_err(vp, "Failed to get root signal count"); + *count = 0; + return ret; + } + + *count = root_signal_count.value.reg64; + + return ret; +} + +static void +drain_vp_signals(struct mshv_vp *vp) +{ + u64 hv_signal_count; + u64 vp_signal_count; + + get_vp_signaled_count(vp, &hv_signal_count); + + vp_signal_count = atomic64_read(&vp->run.vp_signaled_count); + + /* + * There should be at most 1 outstanding notification, but be extra + * careful anyway. + */ + while (hv_signal_count != vp_signal_count) { + WARN_ON(hv_signal_count - vp_signal_count != 1); + + if (wait_event_interruptible(vp->run.vp_suspend_queue, + vp->run.kicked_by_hv == 1)) + break; + vp->run.kicked_by_hv = 0; + vp_signal_count = atomic64_read(&vp->run.vp_signaled_count); + } +} + +static void drain_all_vps(const struct mshv_partition *partition) +{ + int i; + struct mshv_vp *vp; + + /* + * VPs are reachable from ISR. It is safe to not take the partition + * lock because nobody else can enter this function and drop the + * partition from the list. + */ + for (i = 0; i < MSHV_MAX_VPS; i++) { + vp = partition->pt_vp_array[i]; + if (!vp) + continue; + /* + * Disable dispatching of the VP in the hypervisor. After this + * the hypervisor guarantees it won't generate any signals for + * the VP and the hypervisor's VP signal count won't change. + */ + disable_vp_dispatch(vp); + drain_vp_signals(vp); + } +} + +static void +remove_partition(struct mshv_partition *partition) +{ + spin_lock(&mshv_root.pt_ht_lock); + hlist_del_rcu(&partition->pt_hnode); + spin_unlock(&mshv_root.pt_ht_lock); + + synchronize_rcu(); +} + +/* + * Tear down a partition and remove it from the list. + * Partition's refcount must be 0 + */ +static void destroy_partition(struct mshv_partition *partition) +{ + struct mshv_vp *vp; + struct mshv_mem_region *region; + struct hlist_node *n; + int i; + + if (refcount_read(&partition->pt_ref_count)) { + pt_err(partition, + "Attempt to destroy partition but refcount > 0\n"); + return; + } + + if (partition->pt_initialized) { + /* + * We only need to drain signals for root scheduler. This should be + * done before removing the partition from the partition list. 
+ */ + if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) + drain_all_vps(partition); + + /* Remove vps */ + for (i = 0; i < MSHV_MAX_VPS; ++i) { + vp = partition->pt_vp_array[i]; + if (!vp) + continue; + + if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) + mshv_vp_stats_unmap(partition->pt_id, vp->vp_index, + (void **)vp->vp_stats_pages); + + if (vp->vp_register_page) { + (void)hv_unmap_vp_state_page(partition->pt_id, + vp->vp_index, + HV_VP_STATE_PAGE_REGISTERS, + virt_to_page(vp->vp_register_page), + input_vtl_zero); + vp->vp_register_page = NULL; + } + + (void)hv_unmap_vp_state_page(partition->pt_id, + vp->vp_index, + HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, + virt_to_page(vp->vp_intercept_msg_page), + input_vtl_zero); + vp->vp_intercept_msg_page = NULL; + + if (vp->vp_ghcb_page) { + (void)hv_unmap_vp_state_page(partition->pt_id, + vp->vp_index, + HV_VP_STATE_PAGE_GHCB, + virt_to_page(vp->vp_ghcb_page), + input_vtl_normal); + vp->vp_ghcb_page = NULL; + } + + kfree(vp); + + partition->pt_vp_array[i] = NULL; + } + + /* Deallocates and unmaps everything including vcpus, GPA mappings etc */ + hv_call_finalize_partition(partition->pt_id); + + partition->pt_initialized = false; + } + + remove_partition(partition); + + hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions, + hnode) { + hlist_del(®ion->hnode); + mshv_region_put(region); + } + + /* Withdraw and free all pages we deposited */ + hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id); + hv_call_delete_partition(partition->pt_id); + + mshv_free_routing_table(partition); + kfree(partition); +} + +struct +mshv_partition *mshv_partition_get(struct mshv_partition *partition) +{ + if (refcount_inc_not_zero(&partition->pt_ref_count)) + return partition; + return NULL; +} + +struct +mshv_partition *mshv_partition_find(u64 partition_id) + __must_hold(RCU) +{ + struct mshv_partition *p; + + hash_for_each_possible_rcu(mshv_root.pt_htable, p, pt_hnode, + partition_id) + if (p->pt_id == partition_id) + return p; + + return NULL; +} + +void +mshv_partition_put(struct mshv_partition *partition) +{ + if (refcount_dec_and_test(&partition->pt_ref_count)) + destroy_partition(partition); +} + +static int +mshv_partition_release(struct inode *inode, struct file *filp) +{ + struct mshv_partition *partition = filp->private_data; + + mshv_eventfd_release(partition); + + cleanup_srcu_struct(&partition->pt_irq_srcu); + + mshv_partition_put(partition); + + return 0; +} + +static int +add_partition(struct mshv_partition *partition) +{ + spin_lock(&mshv_root.pt_ht_lock); + + hash_add_rcu(mshv_root.pt_htable, &partition->pt_hnode, + partition->pt_id); + + spin_unlock(&mshv_root.pt_ht_lock); + + return 0; +} + +static_assert(MSHV_NUM_CPU_FEATURES_BANKS == + HV_PARTITION_PROCESSOR_FEATURES_BANKS); + +static long mshv_ioctl_process_pt_flags(void __user *user_arg, u64 *pt_flags, + struct hv_partition_creation_properties *cr_props, + union hv_partition_isolation_properties *isol_props) +{ + int i; + struct mshv_create_partition_v2 args; + union hv_partition_processor_features *disabled_procs; + union hv_partition_processor_xsave_features *disabled_xsave; + + /* First, copy v1 struct in case user is on previous versions */ + if (copy_from_user(&args, user_arg, + sizeof(struct mshv_create_partition))) + return -EFAULT; + + if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) || + args.pt_isolation >= MSHV_PT_ISOLATION_COUNT) + return -EINVAL; + + disabled_procs = &cr_props->disabled_processor_features; + disabled_xsave = 
&cr_props->disabled_processor_xsave_features; + + /* Check if user provided newer struct with feature fields */ + if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES)) { + if (copy_from_user(&args, user_arg, sizeof(args))) + return -EFAULT; + + /* Re-validate v1 fields after second copy_from_user() */ + if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) || + args.pt_isolation >= MSHV_PT_ISOLATION_COUNT) + return -EINVAL; + + if (args.pt_num_cpu_fbanks != MSHV_NUM_CPU_FEATURES_BANKS || + mshv_field_nonzero(args, pt_rsvd) || + mshv_field_nonzero(args, pt_rsvd1)) + return -EINVAL; + + /* + * Note this assumes MSHV_NUM_CPU_FEATURES_BANKS will never + * change and equals HV_PARTITION_PROCESSOR_FEATURES_BANKS + * (i.e. 2). + * + * Further banks (index >= 2) will be modifiable as 'early' + * properties via the set partition property hypercall. + */ + for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++) + disabled_procs->as_uint64[i] = args.pt_cpu_fbanks[i]; + +#if IS_ENABLED(CONFIG_X86_64) + disabled_xsave->as_uint64 = args.pt_disabled_xsave; +#else + /* + * In practice this field is ignored on arm64, but safer to + * zero it in case it is ever used. + */ + disabled_xsave->as_uint64 = 0; + + if (mshv_field_nonzero(args, pt_rsvd2)) + return -EINVAL; +#endif + } else { + /* + * v1 behavior: try to enable everything. The hypervisor will + * disable features that are not supported. The banks can be + * queried via the get partition property hypercall. + */ + for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++) + disabled_procs->as_uint64[i] = 0; + + disabled_xsave->as_uint64 = 0; + } + + /* Only support EXO partitions */ + *pt_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION | + HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED; + + if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_LAPIC)) + *pt_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED; + if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_X2APIC)) + *pt_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE; + if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_GPA_SUPER_PAGES)) + *pt_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED; + + isol_props->as_uint64 = 0; + + switch (args.pt_isolation) { + case MSHV_PT_ISOLATION_NONE: + isol_props->isolation_type = HV_PARTITION_ISOLATION_TYPE_NONE; + break; + } + + return 0; +} + +static long +mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev) +{ + u64 creation_flags; + struct hv_partition_creation_properties creation_properties; + union hv_partition_isolation_properties isolation_properties; + struct mshv_partition *partition; + long ret; + + ret = mshv_ioctl_process_pt_flags(user_arg, &creation_flags, + &creation_properties, + &isolation_properties); + if (ret) + return ret; + + partition = kzalloc(sizeof(*partition), GFP_KERNEL); + if (!partition) + return -ENOMEM; + + partition->pt_module_dev = module_dev; + partition->isolation_type = isolation_properties.isolation_type; + + refcount_set(&partition->pt_ref_count, 1); + + mutex_init(&partition->pt_mutex); + + mutex_init(&partition->pt_irq_lock); + + init_completion(&partition->async_hypercall); + + INIT_HLIST_HEAD(&partition->irq_ack_notifier_list); + + INIT_HLIST_HEAD(&partition->pt_devices); + + spin_lock_init(&partition->pt_mem_regions_lock); + INIT_HLIST_HEAD(&partition->pt_mem_regions); + + mshv_eventfd_init(partition); + + ret = init_srcu_struct(&partition->pt_irq_srcu); + if (ret) + goto free_partition; + + ret = hv_call_create_partition(creation_flags, + creation_properties, + isolation_properties, + 
&partition->pt_id); + if (ret) + goto cleanup_irq_srcu; + + ret = add_partition(partition); + if (ret) + goto delete_partition; + + ret = mshv_init_async_handler(partition); + if (!ret) { + ret = FD_ADD(O_CLOEXEC, anon_inode_getfile("mshv_partition", + &mshv_partition_fops, + partition, O_RDWR)); + if (ret >= 0) + return ret; + } + remove_partition(partition); +delete_partition: + hv_call_delete_partition(partition->pt_id); +cleanup_irq_srcu: + cleanup_srcu_struct(&partition->pt_irq_srcu); +free_partition: + kfree(partition); + + return ret; +} + +static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + struct miscdevice *misc = filp->private_data; + + switch (ioctl) { + case MSHV_CREATE_PARTITION: + return mshv_ioctl_create_partition((void __user *)arg, + misc->this_device); + case MSHV_ROOT_HVCALL: + return mshv_ioctl_passthru_hvcall(NULL, false, + (void __user *)arg); + } + + return -ENOTTY; +} + +static int +mshv_dev_open(struct inode *inode, struct file *filp) +{ + return 0; +} + +static int +mshv_dev_release(struct inode *inode, struct file *filp) +{ + return 0; +} + +static int mshv_cpuhp_online; +static int mshv_root_sched_online; + +static const char *scheduler_type_to_string(enum hv_scheduler_type type) +{ + switch (type) { + case HV_SCHEDULER_TYPE_LP: + return "classic scheduler without SMT"; + case HV_SCHEDULER_TYPE_LP_SMT: + return "classic scheduler with SMT"; + case HV_SCHEDULER_TYPE_CORE_SMT: + return "core scheduler"; + case HV_SCHEDULER_TYPE_ROOT: + return "root scheduler"; + default: + return "unknown scheduler"; + }; +} + +/* TODO move this to hv_common.c when needed outside */ +static int __init hv_retrieve_scheduler_type(enum hv_scheduler_type *out) +{ + struct hv_input_get_system_property *input; + struct hv_output_get_system_property *output; + unsigned long flags; + u64 status; + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + memset(input, 0, sizeof(*input)); + memset(output, 0, sizeof(*output)); + input->property_id = HV_SYSTEM_PROPERTY_SCHEDULER_TYPE; + + status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output); + if (!hv_result_success(status)) { + local_irq_restore(flags); + pr_err("%s: %s\n", __func__, hv_result_to_string(status)); + return hv_result_to_errno(status); + } + + *out = output->scheduler_type; + local_irq_restore(flags); + + return 0; +} + +/* Retrieve and stash the supported scheduler type */ +static int __init mshv_retrieve_scheduler_type(struct device *dev) +{ + int ret = 0; + + if (hv_l1vh_partition()) + hv_scheduler_type = HV_SCHEDULER_TYPE_CORE_SMT; + else + ret = hv_retrieve_scheduler_type(&hv_scheduler_type); + + if (ret) + return ret; + + dev_info(dev, "Hypervisor using %s\n", + scheduler_type_to_string(hv_scheduler_type)); + + switch (hv_scheduler_type) { + case HV_SCHEDULER_TYPE_CORE_SMT: + case HV_SCHEDULER_TYPE_LP_SMT: + case HV_SCHEDULER_TYPE_ROOT: + case HV_SCHEDULER_TYPE_LP: + /* Supported scheduler, nothing to do */ + break; + default: + dev_err(dev, "unsupported scheduler 0x%x, bailing.\n", + hv_scheduler_type); + return -EOPNOTSUPP; + } + + return 0; +} + +static int mshv_root_scheduler_init(unsigned int cpu) +{ + void **inputarg, **outputarg, *p; + + inputarg = (void **)this_cpu_ptr(root_scheduler_input); + outputarg = (void **)this_cpu_ptr(root_scheduler_output); + + /* Allocate two consecutive pages. One for input, one for output. 
*/ + p = kmalloc(2 * HV_HYP_PAGE_SIZE, GFP_KERNEL); + if (!p) + return -ENOMEM; + + *inputarg = p; + *outputarg = (char *)p + HV_HYP_PAGE_SIZE; + + return 0; +} + +static int mshv_root_scheduler_cleanup(unsigned int cpu) +{ + void *p, **inputarg, **outputarg; + + inputarg = (void **)this_cpu_ptr(root_scheduler_input); + outputarg = (void **)this_cpu_ptr(root_scheduler_output); + + p = *inputarg; + + *inputarg = NULL; + *outputarg = NULL; + + kfree(p); + + return 0; +} + +/* Must be called after retrieving the scheduler type */ +static int +root_scheduler_init(struct device *dev) +{ + int ret; + + if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT) + return 0; + + root_scheduler_input = alloc_percpu(void *); + root_scheduler_output = alloc_percpu(void *); + + if (!root_scheduler_input || !root_scheduler_output) { + dev_err(dev, "Failed to allocate root scheduler buffers\n"); + ret = -ENOMEM; + goto out; + } + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_root_sched", + mshv_root_scheduler_init, + mshv_root_scheduler_cleanup); + + if (ret < 0) { + dev_err(dev, "Failed to setup root scheduler state: %i\n", ret); + goto out; + } + + mshv_root_sched_online = ret; + + return 0; + +out: + free_percpu(root_scheduler_input); + free_percpu(root_scheduler_output); + return ret; +} + +static void +root_scheduler_deinit(void) +{ + if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT) + return; + + cpuhp_remove_state(mshv_root_sched_online); + free_percpu(root_scheduler_input); + free_percpu(root_scheduler_output); +} + +static int mshv_reboot_notify(struct notifier_block *nb, + unsigned long code, void *unused) +{ + cpuhp_remove_state(mshv_cpuhp_online); + return 0; +} + +struct notifier_block mshv_reboot_nb = { + .notifier_call = mshv_reboot_notify, +}; + +static void mshv_root_partition_exit(void) +{ + unregister_reboot_notifier(&mshv_reboot_nb); + root_scheduler_deinit(); +} + +static int __init mshv_root_partition_init(struct device *dev) +{ + int err; + + err = root_scheduler_init(dev); + if (err) + return err; + + err = register_reboot_notifier(&mshv_reboot_nb); + if (err) + goto root_sched_deinit; + + return 0; + +root_sched_deinit: + root_scheduler_deinit(); + return err; +} + +static void mshv_init_vmm_caps(struct device *dev) +{ + /* + * This can only fail here if HVCALL_GET_PARTITION_PROPERTY_EX or + * HV_PARTITION_PROPERTY_VMM_CAPABILITIES are not supported. In that + * case it's valid to proceed as if all vmm_caps are disabled (zero). 
+ */ + if (hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF, + HV_PARTITION_PROPERTY_VMM_CAPABILITIES, + 0, &mshv_root.vmm_caps, + sizeof(mshv_root.vmm_caps))) + dev_warn(dev, "Unable to get VMM capabilities\n"); + + dev_dbg(dev, "vmm_caps = %#llx\n", mshv_root.vmm_caps.as_uint64[0]); +} + +static int __init mshv_parent_partition_init(void) +{ + int ret; + struct device *dev; + union hv_hypervisor_version_info version_info; + + if (!hv_parent_partition() || is_kdump_kernel()) + return -ENODEV; + + if (hv_get_hypervisor_version(&version_info)) + return -ENODEV; + + ret = misc_register(&mshv_dev); + if (ret) + return ret; + + dev = mshv_dev.this_device; + + if (version_info.build_number < MSHV_HV_MIN_VERSION || + version_info.build_number > MSHV_HV_MAX_VERSION) { + dev_err(dev, "Running on unvalidated Hyper-V version\n"); + dev_err(dev, "Versions: current: %u min: %u max: %u\n", + version_info.build_number, MSHV_HV_MIN_VERSION, + MSHV_HV_MAX_VERSION); + } + + mshv_root.synic_pages = alloc_percpu(struct hv_synic_pages); + if (!mshv_root.synic_pages) { + dev_err(dev, "Failed to allocate percpu synic page\n"); + ret = -ENOMEM; + goto device_deregister; + } + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic", + mshv_synic_init, + mshv_synic_cleanup); + if (ret < 0) { + dev_err(dev, "Failed to setup cpu hotplug state: %i\n", ret); + goto free_synic_pages; + } + + mshv_cpuhp_online = ret; + + ret = mshv_retrieve_scheduler_type(dev); + if (ret) + goto remove_cpu_state; + + if (hv_root_partition()) + ret = mshv_root_partition_init(dev); + if (ret) + goto remove_cpu_state; + + mshv_init_vmm_caps(dev); + + ret = mshv_irqfd_wq_init(); + if (ret) + goto exit_partition; + + spin_lock_init(&mshv_root.pt_ht_lock); + hash_init(mshv_root.pt_htable); + + hv_setup_mshv_handler(mshv_isr); + + return 0; + +exit_partition: + if (hv_root_partition()) + mshv_root_partition_exit(); +remove_cpu_state: + cpuhp_remove_state(mshv_cpuhp_online); +free_synic_pages: + free_percpu(mshv_root.synic_pages); +device_deregister: + misc_deregister(&mshv_dev); + return ret; +} + +static void __exit mshv_parent_partition_exit(void) +{ + hv_setup_mshv_handler(NULL); + mshv_port_table_fini(); + misc_deregister(&mshv_dev); + mshv_irqfd_wq_cleanup(); + if (hv_root_partition()) + mshv_root_partition_exit(); + cpuhp_remove_state(mshv_cpuhp_online); + free_percpu(mshv_root.synic_pages); +} + +module_init(mshv_parent_partition_init); +module_exit(mshv_parent_partition_exit); diff --git a/drivers/hv/mshv_synic.c b/drivers/hv/mshv_synic.c new file mode 100644 index 000000000000..f8b0337cdc82 --- /dev/null +++ b/drivers/hv/mshv_synic.c @@ -0,0 +1,665 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2023, Microsoft Corporation. + * + * mshv_root module's main interrupt handler and associated functionality. 
+ * + * Authors: Microsoft Linux virtualization team + */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/io.h> +#include <linux/random.h> +#include <asm/mshyperv.h> + +#include "mshv_eventfd.h" +#include "mshv.h" + +static u32 synic_event_ring_get_queued_port(u32 sint_index) +{ + struct hv_synic_event_ring_page **event_ring_page; + volatile struct hv_synic_event_ring *ring; + struct hv_synic_pages *spages; + u8 **synic_eventring_tail; + u32 message; + u8 tail; + + spages = this_cpu_ptr(mshv_root.synic_pages); + event_ring_page = &spages->synic_event_ring_page; + synic_eventring_tail = (u8 **)this_cpu_ptr(hv_synic_eventring_tail); + + if (unlikely(!*synic_eventring_tail)) { + pr_debug("Missing synic event ring tail!\n"); + return 0; + } + tail = (*synic_eventring_tail)[sint_index]; + + if (unlikely(!*event_ring_page)) { + pr_debug("Missing synic event ring page!\n"); + return 0; + } + + ring = &(*event_ring_page)->sint_event_ring[sint_index]; + + /* + * Get the message. + */ + message = ring->data[tail]; + + if (!message) { + if (ring->ring_full) { + /* + * Ring is marked full, but we would have consumed all + * the messages. Notify the hypervisor that ring is now + * empty and check again. + */ + ring->ring_full = 0; + hv_call_notify_port_ring_empty(sint_index); + message = ring->data[tail]; + } + + if (!message) { + ring->signal_masked = 0; + /* + * Unmask the signal and sync with hypervisor + * before one last check for any message. + */ + mb(); + message = ring->data[tail]; + + /* + * Ok, lets bail out. + */ + if (!message) + return 0; + } + + ring->signal_masked = 1; + } + + /* + * Clear the message in the ring buffer. + */ + ring->data[tail] = 0; + + if (++tail == HV_SYNIC_EVENT_RING_MESSAGE_COUNT) + tail = 0; + + (*synic_eventring_tail)[sint_index] = tail; + + return message; +} + +static bool +mshv_doorbell_isr(struct hv_message *msg) +{ + struct hv_notification_message_payload *notification; + u32 port; + + if (msg->header.message_type != HVMSG_SYNIC_SINT_INTERCEPT) + return false; + + notification = (struct hv_notification_message_payload *)msg->u.payload; + if (notification->sint_index != HV_SYNIC_DOORBELL_SINT_INDEX) + return false; + + while ((port = synic_event_ring_get_queued_port(HV_SYNIC_DOORBELL_SINT_INDEX))) { + struct port_table_info ptinfo = { 0 }; + + if (mshv_portid_lookup(port, &ptinfo)) { + pr_debug("Failed to get port info from port_table!\n"); + continue; + } + + if (ptinfo.hv_port_type != HV_PORT_TYPE_DOORBELL) { + pr_debug("Not a doorbell port!, port: %d, port_type: %d\n", + port, ptinfo.hv_port_type); + continue; + } + + /* Invoke the callback */ + ptinfo.hv_port_doorbell.doorbell_cb(port, + ptinfo.hv_port_doorbell.data); + } + + return true; +} + +static bool mshv_async_call_completion_isr(struct hv_message *msg) +{ + bool handled = false; + struct hv_async_completion_message_payload *async_msg; + struct mshv_partition *partition; + u64 partition_id; + + if (msg->header.message_type != HVMSG_ASYNC_CALL_COMPLETION) + goto out; + + async_msg = + (struct hv_async_completion_message_payload *)msg->u.payload; + + partition_id = async_msg->partition_id; + + /* + * Hold this lock for the rest of the isr, because the partition could + * be released anytime. + * e.g. the MSHV_RUN_VP thread could wake on another cpu; it could + * release the partition unless we hold this! 
+ */ + rcu_read_lock(); + + partition = mshv_partition_find(partition_id); + + if (unlikely(!partition)) { + pr_debug("failed to find partition %llu\n", partition_id); + goto unlock_out; + } + + partition->async_hypercall_status = async_msg->status; + complete(&partition->async_hypercall); + + handled = true; + +unlock_out: + rcu_read_unlock(); +out: + return handled; +} + +static void kick_vp(struct mshv_vp *vp) +{ + atomic64_inc(&vp->run.vp_signaled_count); + vp->run.kicked_by_hv = 1; + wake_up(&vp->run.vp_suspend_queue); +} + +static void +handle_bitset_message(const struct hv_vp_signal_bitset_scheduler_message *msg) +{ + int bank_idx, vps_signaled = 0, bank_mask_size; + struct mshv_partition *partition; + const struct hv_vpset *vpset; + const u64 *bank_contents; + u64 partition_id = msg->partition_id; + + if (msg->vp_bitset.bitset.format != HV_GENERIC_SET_SPARSE_4K) { + pr_debug("scheduler message format is not HV_GENERIC_SET_SPARSE_4K"); + return; + } + + if (msg->vp_count == 0) { + pr_debug("scheduler message with no VP specified"); + return; + } + + rcu_read_lock(); + + partition = mshv_partition_find(partition_id); + if (unlikely(!partition)) { + pr_debug("failed to find partition %llu\n", partition_id); + goto unlock_out; + } + + vpset = &msg->vp_bitset.bitset; + + bank_idx = -1; + bank_contents = vpset->bank_contents; + bank_mask_size = sizeof(vpset->valid_bank_mask) * BITS_PER_BYTE; + + while (true) { + int vp_bank_idx = -1; + int vp_bank_size = sizeof(*bank_contents) * BITS_PER_BYTE; + int vp_index; + + bank_idx = find_next_bit((unsigned long *)&vpset->valid_bank_mask, + bank_mask_size, bank_idx + 1); + if (bank_idx == bank_mask_size) + break; + + while (true) { + struct mshv_vp *vp; + + vp_bank_idx = find_next_bit((unsigned long *)bank_contents, + vp_bank_size, vp_bank_idx + 1); + if (vp_bank_idx == vp_bank_size) + break; + + vp_index = (bank_idx * vp_bank_size) + vp_bank_idx; + + /* This shouldn't happen, but just in case. */ + if (unlikely(vp_index >= MSHV_MAX_VPS)) { + pr_debug("VP index %u out of bounds\n", + vp_index); + goto unlock_out; + } + + vp = partition->pt_vp_array[vp_index]; + if (unlikely(!vp)) { + pr_debug("failed to find VP %u\n", vp_index); + goto unlock_out; + } + + kick_vp(vp); + vps_signaled++; + } + + bank_contents++; + } + +unlock_out: + rcu_read_unlock(); + + if (vps_signaled != msg->vp_count) + pr_debug("asked to signal %u VPs but only did %u\n", + msg->vp_count, vps_signaled); +} + +static void +handle_pair_message(const struct hv_vp_signal_pair_scheduler_message *msg) +{ + struct mshv_partition *partition = NULL; + struct mshv_vp *vp; + int idx; + + rcu_read_lock(); + + for (idx = 0; idx < msg->vp_count; idx++) { + u64 partition_id = msg->partition_ids[idx]; + u32 vp_index = msg->vp_indexes[idx]; + + if (idx == 0 || partition->pt_id != partition_id) { + partition = mshv_partition_find(partition_id); + if (unlikely(!partition)) { + pr_debug("failed to find partition %llu\n", + partition_id); + break; + } + } + + /* This shouldn't happen, but just in case. 
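+	 * vp_index comes straight from the hypervisor message, so bound it
+	 * before indexing pt_vp_array.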
*/ + if (unlikely(vp_index >= MSHV_MAX_VPS)) { + pr_debug("VP index %u out of bounds\n", vp_index); + break; + } + + vp = partition->pt_vp_array[vp_index]; + if (!vp) { + pr_debug("failed to find VP %u\n", vp_index); + break; + } + + kick_vp(vp); + } + + rcu_read_unlock(); +} + +static bool +mshv_scheduler_isr(struct hv_message *msg) +{ + if (msg->header.message_type != HVMSG_SCHEDULER_VP_SIGNAL_BITSET && + msg->header.message_type != HVMSG_SCHEDULER_VP_SIGNAL_PAIR) + return false; + + if (msg->header.message_type == HVMSG_SCHEDULER_VP_SIGNAL_BITSET) + handle_bitset_message((struct hv_vp_signal_bitset_scheduler_message *) + msg->u.payload); + else + handle_pair_message((struct hv_vp_signal_pair_scheduler_message *) + msg->u.payload); + + return true; +} + +static bool +mshv_intercept_isr(struct hv_message *msg) +{ + struct mshv_partition *partition; + bool handled = false; + struct mshv_vp *vp; + u64 partition_id; + u32 vp_index; + + partition_id = msg->header.sender; + + rcu_read_lock(); + + partition = mshv_partition_find(partition_id); + if (unlikely(!partition)) { + pr_debug("failed to find partition %llu\n", + partition_id); + goto unlock_out; + } + + if (msg->header.message_type == HVMSG_X64_APIC_EOI) { + /* + * Check if this gsi is registered in the + * ack_notifier list and invoke the callback + * if registered. + */ + + /* + * If there is a notifier, the ack callback is supposed + * to handle the VMEXIT. So we need not pass this message + * to vcpu thread. + */ + struct hv_x64_apic_eoi_message *eoi_msg = + (struct hv_x64_apic_eoi_message *)&msg->u.payload[0]; + + if (mshv_notify_acked_gsi(partition, eoi_msg->interrupt_vector)) { + handled = true; + goto unlock_out; + } + } + + /* + * We should get an opaque intercept message here for all intercept + * messages, since we're using the mapped VP intercept message page. + * + * The intercept message will have been placed in intercept message + * page at this point. + * + * Make sure the message type matches our expectation. + */ + if (msg->header.message_type != HVMSG_OPAQUE_INTERCEPT) { + pr_debug("wrong message type %d", msg->header.message_type); + goto unlock_out; + } + + /* + * Since we directly index the vp, and it has to exist for us to be here + * (because the vp is only deleted when the partition is), no additional + * locking is needed here + */ + vp_index = + ((struct hv_opaque_intercept_message *)msg->u.payload)->vp_index; + vp = partition->pt_vp_array[vp_index]; + if (unlikely(!vp)) { + pr_debug("failed to find VP %u\n", vp_index); + goto unlock_out; + } + + kick_vp(vp); + + handled = true; + +unlock_out: + rcu_read_unlock(); + + return handled; +} + +void mshv_isr(void) +{ + struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages); + struct hv_message_page **msg_page = &spages->hyp_synic_message_page; + struct hv_message *msg; + bool handled; + + if (unlikely(!(*msg_page))) { + pr_debug("Missing synic page!\n"); + return; + } + + msg = &((*msg_page)->sint_message[HV_SYNIC_INTERCEPTION_SINT_INDEX]); + + /* + * If the type isn't set, there isn't really a message; + * it may be some other hyperv interrupt + */ + if (msg->header.message_type == HVMSG_NONE) + return; + + handled = mshv_doorbell_isr(msg); + + if (!handled) + handled = mshv_scheduler_isr(msg); + + if (!handled) + handled = mshv_async_call_completion_isr(msg); + + if (!handled) + handled = mshv_intercept_isr(msg); + + if (handled) { + /* + * Acknowledge message with hypervisor if another message is + * pending. 
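+		 * Writing HVMSG_NONE frees the slot; the EOM write below (when
+		 * msg_pending is set) asks the hypervisor to deliver the next
+		 * queued message for this SINT.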
+ */ + msg->header.message_type = HVMSG_NONE; + /* + * Ensure the write is complete so the hypervisor will deliver + * the next message if available. + */ + mb(); + if (msg->header.message_flags.msg_pending) + hv_set_non_nested_msr(HV_MSR_EOM, 0); + +#ifdef HYPERVISOR_CALLBACK_VECTOR + add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR); +#endif + } else { + pr_warn_once("%s: unknown message type 0x%x\n", __func__, + msg->header.message_type); + } +} + +int mshv_synic_init(unsigned int cpu) +{ + union hv_synic_simp simp; + union hv_synic_siefp siefp; + union hv_synic_sirbp sirbp; +#ifdef HYPERVISOR_CALLBACK_VECTOR + union hv_synic_sint sint; +#endif + union hv_synic_scontrol sctrl; + struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages); + struct hv_message_page **msg_page = &spages->hyp_synic_message_page; + struct hv_synic_event_flags_page **event_flags_page = + &spages->synic_event_flags_page; + struct hv_synic_event_ring_page **event_ring_page = + &spages->synic_event_ring_page; + + /* Setup the Synic's message page */ + simp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIMP); + simp.simp_enabled = true; + *msg_page = memremap(simp.base_simp_gpa << HV_HYP_PAGE_SHIFT, + HV_HYP_PAGE_SIZE, + MEMREMAP_WB); + + if (!(*msg_page)) + return -EFAULT; + + hv_set_non_nested_msr(HV_MSR_SIMP, simp.as_uint64); + + /* Setup the Synic's event flags page */ + siefp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIEFP); + siefp.siefp_enabled = true; + *event_flags_page = memremap(siefp.base_siefp_gpa << PAGE_SHIFT, + PAGE_SIZE, MEMREMAP_WB); + + if (!(*event_flags_page)) + goto cleanup; + + hv_set_non_nested_msr(HV_MSR_SIEFP, siefp.as_uint64); + + /* Setup the Synic's event ring page */ + sirbp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIRBP); + sirbp.sirbp_enabled = true; + *event_ring_page = memremap(sirbp.base_sirbp_gpa << PAGE_SHIFT, + PAGE_SIZE, MEMREMAP_WB); + + if (!(*event_ring_page)) + goto cleanup; + + hv_set_non_nested_msr(HV_MSR_SIRBP, sirbp.as_uint64); + +#ifdef HYPERVISOR_CALLBACK_VECTOR + /* Enable intercepts */ + sint.as_uint64 = 0; + sint.vector = HYPERVISOR_CALLBACK_VECTOR; + sint.masked = false; + sint.auto_eoi = hv_recommend_using_aeoi(); + hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX, + sint.as_uint64); + + /* Doorbell SINT */ + sint.as_uint64 = 0; + sint.vector = HYPERVISOR_CALLBACK_VECTOR; + sint.masked = false; + sint.as_intercept = 1; + sint.auto_eoi = hv_recommend_using_aeoi(); + hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX, + sint.as_uint64); +#endif + + /* Enable global synic bit */ + sctrl.as_uint64 = hv_get_non_nested_msr(HV_MSR_SCONTROL); + sctrl.enable = 1; + hv_set_non_nested_msr(HV_MSR_SCONTROL, sctrl.as_uint64); + + return 0; + +cleanup: + if (*event_ring_page) { + sirbp.sirbp_enabled = false; + hv_set_non_nested_msr(HV_MSR_SIRBP, sirbp.as_uint64); + memunmap(*event_ring_page); + } + if (*event_flags_page) { + siefp.siefp_enabled = false; + hv_set_non_nested_msr(HV_MSR_SIEFP, siefp.as_uint64); + memunmap(*event_flags_page); + } + if (*msg_page) { + simp.simp_enabled = false; + hv_set_non_nested_msr(HV_MSR_SIMP, simp.as_uint64); + memunmap(*msg_page); + } + + return -EFAULT; +} + +int mshv_synic_cleanup(unsigned int cpu) +{ + union hv_synic_sint sint; + union hv_synic_simp simp; + union hv_synic_siefp siefp; + union hv_synic_sirbp sirbp; + union hv_synic_scontrol sctrl; + struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages); + struct hv_message_page **msg_page = &spages->hyp_synic_message_page; + struct 
hv_synic_event_flags_page **event_flags_page = + &spages->synic_event_flags_page; + struct hv_synic_event_ring_page **event_ring_page = + &spages->synic_event_ring_page; + + /* Disable the interrupt */ + sint.as_uint64 = hv_get_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX); + sint.masked = true; + hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX, + sint.as_uint64); + + /* Disable Doorbell SINT */ + sint.as_uint64 = hv_get_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX); + sint.masked = true; + hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX, + sint.as_uint64); + + /* Disable Synic's event ring page */ + sirbp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIRBP); + sirbp.sirbp_enabled = false; + hv_set_non_nested_msr(HV_MSR_SIRBP, sirbp.as_uint64); + memunmap(*event_ring_page); + + /* Disable Synic's event flags page */ + siefp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIEFP); + siefp.siefp_enabled = false; + hv_set_non_nested_msr(HV_MSR_SIEFP, siefp.as_uint64); + memunmap(*event_flags_page); + + /* Disable Synic's message page */ + simp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIMP); + simp.simp_enabled = false; + hv_set_non_nested_msr(HV_MSR_SIMP, simp.as_uint64); + memunmap(*msg_page); + + /* Disable global synic bit */ + sctrl.as_uint64 = hv_get_non_nested_msr(HV_MSR_SCONTROL); + sctrl.enable = 0; + hv_set_non_nested_msr(HV_MSR_SCONTROL, sctrl.as_uint64); + + return 0; +} + +int +mshv_register_doorbell(u64 partition_id, doorbell_cb_t doorbell_cb, void *data, + u64 gpa, u64 val, u64 flags) +{ + struct hv_connection_info connection_info = { 0 }; + union hv_connection_id connection_id = { 0 }; + struct port_table_info *port_table_info; + struct hv_port_info port_info = { 0 }; + union hv_port_id port_id = { 0 }; + int ret; + + port_table_info = kmalloc(sizeof(*port_table_info), GFP_KERNEL); + if (!port_table_info) + return -ENOMEM; + + port_table_info->hv_port_type = HV_PORT_TYPE_DOORBELL; + port_table_info->hv_port_doorbell.doorbell_cb = doorbell_cb; + port_table_info->hv_port_doorbell.data = data; + ret = mshv_portid_alloc(port_table_info); + if (ret < 0) { + kfree(port_table_info); + return ret; + } + + port_id.u.id = ret; + port_info.port_type = HV_PORT_TYPE_DOORBELL; + port_info.doorbell_port_info.target_sint = HV_SYNIC_DOORBELL_SINT_INDEX; + port_info.doorbell_port_info.target_vp = HV_ANY_VP; + ret = hv_call_create_port(hv_current_partition_id, port_id, partition_id, + &port_info, + 0, 0, NUMA_NO_NODE); + + if (ret < 0) { + mshv_portid_free(port_id.u.id); + return ret; + } + + connection_id.u.id = port_id.u.id; + connection_info.port_type = HV_PORT_TYPE_DOORBELL; + connection_info.doorbell_connection_info.gpa = gpa; + connection_info.doorbell_connection_info.trigger_value = val; + connection_info.doorbell_connection_info.flags = flags; + + ret = hv_call_connect_port(hv_current_partition_id, port_id, partition_id, + connection_id, &connection_info, 0, NUMA_NO_NODE); + if (ret < 0) { + hv_call_delete_port(hv_current_partition_id, port_id); + mshv_portid_free(port_id.u.id); + return ret; + } + + // lets use the port_id as the doorbell_id + return port_id.u.id; +} + +void +mshv_unregister_doorbell(u64 partition_id, int doorbell_portid) +{ + union hv_port_id port_id = { 0 }; + union hv_connection_id connection_id = { 0 }; + + connection_id.u.id = doorbell_portid; + hv_call_disconnect_port(partition_id, connection_id); + + port_id.u.id = doorbell_portid; + hv_call_delete_port(hv_current_partition_id, port_id); + + 
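+	/* Finally release the port ID allocated by mshv_register_doorbell(). */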
mshv_portid_free(doorbell_portid); +} diff --git a/drivers/hv/mshv_vtl.h b/drivers/hv/mshv_vtl.h new file mode 100644 index 000000000000..a6eea52f7aa2 --- /dev/null +++ b/drivers/hv/mshv_vtl.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _MSHV_VTL_H +#define _MSHV_VTL_H + +#include <linux/mshv.h> +#include <linux/types.h> + +struct mshv_vtl_run { + u32 cancel; + u32 vtl_ret_action_size; + u32 pad[2]; + char exit_message[MSHV_MAX_RUN_MSG_SIZE]; + union { + struct mshv_vtl_cpu_context cpu_context; + + /* + * Reserving room for the cpu context to grow and to maintain compatibility + * with user mode. + */ + char reserved[1024]; + }; + char vtl_ret_actions[MSHV_MAX_RUN_MSG_SIZE]; +}; + +#endif /* _MSHV_VTL_H */ diff --git a/drivers/hv/mshv_vtl_main.c b/drivers/hv/mshv_vtl_main.c new file mode 100644 index 000000000000..2cebe9de5a5a --- /dev/null +++ b/drivers/hv/mshv_vtl_main.c @@ -0,0 +1,1392 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2023, Microsoft Corporation. + * + * Author: + * Roman Kisel <romank@linux.microsoft.com> + * Saurabh Sengar <ssengar@linux.microsoft.com> + * Naman Jain <namjain@linux.microsoft.com> + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/miscdevice.h> +#include <linux/anon_inodes.h> +#include <linux/cpuhotplug.h> +#include <linux/count_zeros.h> +#include <linux/entry-virt.h> +#include <linux/eventfd.h> +#include <linux/poll.h> +#include <linux/file.h> +#include <linux/vmalloc.h> +#include <asm/debugreg.h> +#include <asm/mshyperv.h> +#include <trace/events/ipi.h> +#include <uapi/asm/mtrr.h> +#include <uapi/linux/mshv.h> +#include <hyperv/hvhdk.h> + +#include "../../kernel/fpu/legacy.h" +#include "mshv.h" +#include "mshv_vtl.h" +#include "hyperv_vmbus.h" + +MODULE_AUTHOR("Microsoft"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Microsoft Hyper-V VTL Driver"); + +#define MSHV_ENTRY_REASON_LOWER_VTL_CALL 0x1 +#define MSHV_ENTRY_REASON_INTERRUPT 0x2 +#define MSHV_ENTRY_REASON_INTERCEPT 0x3 + +#define MSHV_REAL_OFF_SHIFT 16 +#define MSHV_PG_OFF_CPU_MASK (BIT_ULL(MSHV_REAL_OFF_SHIFT) - 1) +#define MSHV_RUN_PAGE_OFFSET 0 +#define MSHV_REG_PAGE_OFFSET 1 +#define VTL2_VMBUS_SINT_INDEX 7 + +static struct device *mem_dev; + +static struct tasklet_struct msg_dpc; +static wait_queue_head_t fd_wait_queue; +static bool has_message; +static struct eventfd_ctx *flag_eventfds[HV_EVENT_FLAGS_COUNT]; +static DEFINE_MUTEX(flag_lock); +static bool __read_mostly mshv_has_reg_page; + +/* hvcall code is of type u16, allocate a bitmap of size (1 << 16) to accommodate it */ +#define MAX_BITMAP_SIZE ((U16_MAX + 1) / 8) + +struct mshv_vtl_hvcall_fd { + u8 allow_bitmap[MAX_BITMAP_SIZE]; + bool allow_map_initialized; + /* + * Used to protect hvcall setup in IOCTLs + */ + struct mutex init_mutex; + struct miscdevice *dev; +}; + +struct mshv_vtl_poll_file { + struct file *file; + wait_queue_entry_t wait; + wait_queue_head_t *wqh; + poll_table pt; + int cpu; +}; + +struct mshv_vtl { + struct device *module_dev; + u64 id; +}; + +struct mshv_vtl_per_cpu { + struct mshv_vtl_run *run; + struct page *reg_page; +}; + +/* SYNIC_OVERLAY_PAGE_MSR - internal, identical to hv_synic_simp */ +union hv_synic_overlay_page_msr { + u64 as_uint64; + struct { + u64 enabled: 1; + u64 reserved: 11; + u64 pfn: 52; + } __packed; +}; + +static struct mutex mshv_vtl_poll_file_lock; +static union hv_register_vsm_page_offsets mshv_vsm_page_offsets; +static union hv_register_vsm_capabilities mshv_vsm_capabilities; + +static 
DEFINE_PER_CPU(struct mshv_vtl_poll_file, mshv_vtl_poll_file); +static DEFINE_PER_CPU(unsigned long long, num_vtl0_transitions); +static DEFINE_PER_CPU(struct mshv_vtl_per_cpu, mshv_vtl_per_cpu); + +static const union hv_input_vtl input_vtl_zero; +static const union hv_input_vtl input_vtl_normal = { + .use_target_vtl = 1, +}; + +static const struct file_operations mshv_vtl_fops; + +static long +mshv_ioctl_create_vtl(void __user *user_arg, struct device *module_dev) +{ + struct mshv_vtl *vtl; + struct file *file; + int fd; + + vtl = kzalloc(sizeof(*vtl), GFP_KERNEL); + if (!vtl) + return -ENOMEM; + + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) { + kfree(vtl); + return fd; + } + file = anon_inode_getfile("mshv_vtl", &mshv_vtl_fops, + vtl, O_RDWR); + if (IS_ERR(file)) { + kfree(vtl); + return PTR_ERR(file); + } + vtl->module_dev = module_dev; + fd_install(fd, file); + + return fd; +} + +static long +mshv_ioctl_check_extension(void __user *user_arg) +{ + u32 arg; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; + + switch (arg) { + case MSHV_CAP_CORE_API_STABLE: + return 0; + case MSHV_CAP_REGISTER_PAGE: + return mshv_has_reg_page; + case MSHV_CAP_VTL_RETURN_ACTION: + return mshv_vsm_capabilities.return_action_available; + case MSHV_CAP_DR6_SHARED: + return mshv_vsm_capabilities.dr6_shared; + } + + return -EOPNOTSUPP; +} + +static long +mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) +{ + struct miscdevice *misc = filp->private_data; + + switch (ioctl) { + case MSHV_CHECK_EXTENSION: + return mshv_ioctl_check_extension((void __user *)arg); + case MSHV_CREATE_VTL: + return mshv_ioctl_create_vtl((void __user *)arg, misc->this_device); + } + + return -ENOTTY; +} + +static const struct file_operations mshv_dev_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = mshv_dev_ioctl, + .llseek = noop_llseek, +}; + +static struct miscdevice mshv_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "mshv", + .fops = &mshv_dev_fops, + .mode = 0600, +}; + +static struct mshv_vtl_run *mshv_vtl_this_run(void) +{ + return *this_cpu_ptr(&mshv_vtl_per_cpu.run); +} + +static struct mshv_vtl_run *mshv_vtl_cpu_run(int cpu) +{ + return *per_cpu_ptr(&mshv_vtl_per_cpu.run, cpu); +} + +static struct page *mshv_vtl_cpu_reg_page(int cpu) +{ + return *per_cpu_ptr(&mshv_vtl_per_cpu.reg_page, cpu); +} + +static void mshv_vtl_configure_reg_page(struct mshv_vtl_per_cpu *per_cpu) +{ + struct hv_register_assoc reg_assoc = {}; + union hv_synic_overlay_page_msr overlay = {}; + struct page *reg_page; + + reg_page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL); + if (!reg_page) { + WARN(1, "failed to allocate register page\n"); + return; + } + + overlay.enabled = 1; + overlay.pfn = page_to_hvpfn(reg_page); + reg_assoc.name = HV_X64_REGISTER_REG_PAGE; + reg_assoc.value.reg64 = overlay.as_uint64; + + if (hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF, + 1, input_vtl_zero, ®_assoc)) { + WARN(1, "failed to setup register page\n"); + __free_page(reg_page); + return; + } + + per_cpu->reg_page = reg_page; + mshv_has_reg_page = true; +} + +static void mshv_vtl_synic_enable_regs(unsigned int cpu) +{ + union hv_synic_sint sint; + + sint.as_uint64 = 0; + sint.vector = HYPERVISOR_CALLBACK_VECTOR; + sint.masked = false; + sint.auto_eoi = hv_recommend_using_aeoi(); + + /* Enable intercepts */ + if (!mshv_vsm_capabilities.intercept_page_available) + hv_set_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX, + sint.as_uint64); + + /* VTL2 Host VSP SINT is (un)masked 
when the user mode requests that */ +} + +static int mshv_vtl_get_vsm_regs(void) +{ + struct hv_register_assoc registers[2]; + int ret, count = 2; + + registers[0].name = HV_REGISTER_VSM_CODE_PAGE_OFFSETS; + registers[1].name = HV_REGISTER_VSM_CAPABILITIES; + + ret = hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF, + count, input_vtl_zero, registers); + if (ret) + return ret; + + mshv_vsm_page_offsets.as_uint64 = registers[0].value.reg64; + mshv_vsm_capabilities.as_uint64 = registers[1].value.reg64; + + return ret; +} + +static int mshv_vtl_configure_vsm_partition(struct device *dev) +{ + union hv_register_vsm_partition_config config; + struct hv_register_assoc reg_assoc; + + config.as_uint64 = 0; + config.default_vtl_protection_mask = HV_MAP_GPA_PERMISSIONS_MASK; + config.enable_vtl_protection = 1; + config.zero_memory_on_reset = 1; + config.intercept_vp_startup = 1; + config.intercept_cpuid_unimplemented = 1; + + if (mshv_vsm_capabilities.intercept_page_available) { + dev_dbg(dev, "using intercept page\n"); + config.intercept_page = 1; + } + + reg_assoc.name = HV_REGISTER_VSM_PARTITION_CONFIG; + reg_assoc.value.reg64 = config.as_uint64; + + return hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF, + 1, input_vtl_zero, ®_assoc); +} + +static void mshv_vtl_vmbus_isr(void) +{ + struct hv_per_cpu_context *per_cpu; + struct hv_message *msg; + u32 message_type; + union hv_synic_event_flags *event_flags; + struct eventfd_ctx *eventfd; + u16 i; + + per_cpu = this_cpu_ptr(hv_context.cpu_context); + if (smp_processor_id() == 0) { + msg = (struct hv_message *)per_cpu->hyp_synic_message_page + VTL2_VMBUS_SINT_INDEX; + message_type = READ_ONCE(msg->header.message_type); + if (message_type != HVMSG_NONE) + tasklet_schedule(&msg_dpc); + } + + event_flags = (union hv_synic_event_flags *)per_cpu->hyp_synic_event_page + + VTL2_VMBUS_SINT_INDEX; + for_each_set_bit(i, event_flags->flags, HV_EVENT_FLAGS_COUNT) { + if (!sync_test_and_clear_bit(i, event_flags->flags)) + continue; + rcu_read_lock(); + eventfd = READ_ONCE(flag_eventfds[i]); + if (eventfd) + eventfd_signal(eventfd); + rcu_read_unlock(); + } + + vmbus_isr(); +} + +static int mshv_vtl_alloc_context(unsigned int cpu) +{ + struct mshv_vtl_per_cpu *per_cpu = this_cpu_ptr(&mshv_vtl_per_cpu); + + per_cpu->run = (struct mshv_vtl_run *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + if (!per_cpu->run) + return -ENOMEM; + + if (mshv_vsm_capabilities.intercept_page_available) + mshv_vtl_configure_reg_page(per_cpu); + + mshv_vtl_synic_enable_regs(cpu); + + return 0; +} + +static int mshv_vtl_cpuhp_online; + +static int hv_vtl_setup_synic(void) +{ + int ret; + + /* Use our isr to first filter out packets destined for userspace */ + hv_setup_vmbus_handler(mshv_vtl_vmbus_isr); + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "hyperv/vtl:online", + mshv_vtl_alloc_context, NULL); + if (ret < 0) { + hv_setup_vmbus_handler(vmbus_isr); + return ret; + } + + mshv_vtl_cpuhp_online = ret; + + return 0; +} + +static void hv_vtl_remove_synic(void) +{ + cpuhp_remove_state(mshv_vtl_cpuhp_online); + hv_setup_vmbus_handler(vmbus_isr); +} + +static int vtl_get_vp_register(struct hv_register_assoc *reg) +{ + return hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF, + 1, input_vtl_normal, reg); +} + +static int vtl_set_vp_register(struct hv_register_assoc *reg) +{ + return hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF, + 1, input_vtl_normal, reg); +} + +static int mshv_vtl_ioctl_add_vtl0_mem(struct mshv_vtl *vtl, void 
__user *arg) +{ + struct mshv_vtl_ram_disposition vtl0_mem; + struct dev_pagemap *pgmap; + void *addr; + + if (copy_from_user(&vtl0_mem, arg, sizeof(vtl0_mem))) + return -EFAULT; + /* vtl0_mem.last_pfn is excluded in the pagemap range for VTL0 as per design */ + if (vtl0_mem.last_pfn <= vtl0_mem.start_pfn) { + dev_err(vtl->module_dev, "range start pfn (%llx) > end pfn (%llx)\n", + vtl0_mem.start_pfn, vtl0_mem.last_pfn); + return -EFAULT; + } + + pgmap = kzalloc(sizeof(*pgmap), GFP_KERNEL); + if (!pgmap) + return -ENOMEM; + + pgmap->ranges[0].start = PFN_PHYS(vtl0_mem.start_pfn); + pgmap->ranges[0].end = PFN_PHYS(vtl0_mem.last_pfn) - 1; + pgmap->nr_range = 1; + pgmap->type = MEMORY_DEVICE_GENERIC; + + /* + * Determine the highest page order that can be used for the given memory range. + * This works best when the range is aligned; i.e. both the start and the length. + */ + pgmap->vmemmap_shift = count_trailing_zeros(vtl0_mem.start_pfn | vtl0_mem.last_pfn); + dev_dbg(vtl->module_dev, + "Add VTL0 memory: start: 0x%llx, end_pfn: 0x%llx, page order: %lu\n", + vtl0_mem.start_pfn, vtl0_mem.last_pfn, pgmap->vmemmap_shift); + + addr = devm_memremap_pages(mem_dev, pgmap); + if (IS_ERR(addr)) { + dev_err(vtl->module_dev, "devm_memremap_pages error: %ld\n", PTR_ERR(addr)); + kfree(pgmap); + return -EFAULT; + } + + /* Don't free pgmap, since it has to stick around until the memory + * is unmapped, which will never happen as there is no scenario + * where VTL0 can be released/shutdown without bringing down VTL2. + */ + return 0; +} + +static void mshv_vtl_cancel(int cpu) +{ + int here = get_cpu(); + + if (here != cpu) { + if (!xchg_relaxed(&mshv_vtl_cpu_run(cpu)->cancel, 1)) + smp_send_reschedule(cpu); + } else { + WRITE_ONCE(mshv_vtl_this_run()->cancel, 1); + } + put_cpu(); +} + +static int mshv_vtl_poll_file_wake(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) +{ + struct mshv_vtl_poll_file *poll_file = container_of(wait, struct mshv_vtl_poll_file, wait); + + mshv_vtl_cancel(poll_file->cpu); + + return 0; +} + +static void mshv_vtl_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, poll_table *pt) +{ + struct mshv_vtl_poll_file *poll_file = container_of(pt, struct mshv_vtl_poll_file, pt); + + WARN_ON(poll_file->wqh); + poll_file->wqh = wqh; + add_wait_queue(wqh, &poll_file->wait); +} + +static int mshv_vtl_ioctl_set_poll_file(struct mshv_vtl_set_poll_file __user *user_input) +{ + struct file *file, *old_file; + struct mshv_vtl_poll_file *poll_file; + struct mshv_vtl_set_poll_file input; + + if (copy_from_user(&input, user_input, sizeof(input))) + return -EFAULT; + + if (input.cpu >= num_possible_cpus() || !cpu_online(input.cpu)) + return -EINVAL; + /* + * CPU Hotplug is not supported in VTL2 in OpenHCL, where this kernel driver exists. + * CPU is expected to remain online after above cpu_online() check. 
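+	 * Once registered, the file is polled so that mshv_vtl_poll_file_wake()
+	 * can call mshv_vtl_cancel() and kick that CPU out of the lower VTL
+	 * whenever the file becomes ready.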
+ */ + + file = NULL; + file = fget(input.fd); + if (!file) + return -EBADFD; + + poll_file = per_cpu_ptr(&mshv_vtl_poll_file, READ_ONCE(input.cpu)); + if (!poll_file) + return -EINVAL; + + mutex_lock(&mshv_vtl_poll_file_lock); + + if (poll_file->wqh) + remove_wait_queue(poll_file->wqh, &poll_file->wait); + poll_file->wqh = NULL; + + old_file = poll_file->file; + poll_file->file = file; + poll_file->cpu = input.cpu; + + if (file) { + init_waitqueue_func_entry(&poll_file->wait, mshv_vtl_poll_file_wake); + init_poll_funcptr(&poll_file->pt, mshv_vtl_ptable_queue_proc); + vfs_poll(file, &poll_file->pt); + } + + mutex_unlock(&mshv_vtl_poll_file_lock); + + if (old_file) + fput(old_file); + + return 0; +} + +/* Static table mapping register names to their corresponding actions */ +static const struct { + enum hv_register_name reg_name; + int debug_reg_num; /* -1 if not a debug register */ + u32 msr_addr; /* 0 if not an MSR */ +} reg_table[] = { + /* Debug registers */ + {HV_X64_REGISTER_DR0, 0, 0}, + {HV_X64_REGISTER_DR1, 1, 0}, + {HV_X64_REGISTER_DR2, 2, 0}, + {HV_X64_REGISTER_DR3, 3, 0}, + {HV_X64_REGISTER_DR6, 6, 0}, + /* MTRR MSRs */ + {HV_X64_REGISTER_MSR_MTRR_CAP, -1, MSR_MTRRcap}, + {HV_X64_REGISTER_MSR_MTRR_DEF_TYPE, -1, MSR_MTRRdefType}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE0, -1, MTRRphysBase_MSR(0)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE1, -1, MTRRphysBase_MSR(1)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE2, -1, MTRRphysBase_MSR(2)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE3, -1, MTRRphysBase_MSR(3)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE4, -1, MTRRphysBase_MSR(4)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE5, -1, MTRRphysBase_MSR(5)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE6, -1, MTRRphysBase_MSR(6)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE7, -1, MTRRphysBase_MSR(7)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE8, -1, MTRRphysBase_MSR(8)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE9, -1, MTRRphysBase_MSR(9)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEA, -1, MTRRphysBase_MSR(0xa)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEB, -1, MTRRphysBase_MSR(0xb)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEC, -1, MTRRphysBase_MSR(0xc)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASED, -1, MTRRphysBase_MSR(0xd)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEE, -1, MTRRphysBase_MSR(0xe)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEF, -1, MTRRphysBase_MSR(0xf)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK0, -1, MTRRphysMask_MSR(0)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK1, -1, MTRRphysMask_MSR(1)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK2, -1, MTRRphysMask_MSR(2)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK3, -1, MTRRphysMask_MSR(3)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK4, -1, MTRRphysMask_MSR(4)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK5, -1, MTRRphysMask_MSR(5)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK6, -1, MTRRphysMask_MSR(6)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK7, -1, MTRRphysMask_MSR(7)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK8, -1, MTRRphysMask_MSR(8)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK9, -1, MTRRphysMask_MSR(9)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKA, -1, MTRRphysMask_MSR(0xa)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKB, -1, MTRRphysMask_MSR(0xb)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKC, -1, MTRRphysMask_MSR(0xc)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKD, -1, MTRRphysMask_MSR(0xd)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKE, -1, MTRRphysMask_MSR(0xe)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKF, -1, MTRRphysMask_MSR(0xf)}, + {HV_X64_REGISTER_MSR_MTRR_FIX64K00000, -1, MSR_MTRRfix64K_00000}, + {HV_X64_REGISTER_MSR_MTRR_FIX16K80000, -1, MSR_MTRRfix16K_80000}, + 
{HV_X64_REGISTER_MSR_MTRR_FIX16KA0000, -1, MSR_MTRRfix16K_A0000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KC0000, -1, MSR_MTRRfix4K_C0000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KC8000, -1, MSR_MTRRfix4K_C8000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KD0000, -1, MSR_MTRRfix4K_D0000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KD8000, -1, MSR_MTRRfix4K_D8000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KE0000, -1, MSR_MTRRfix4K_E0000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KE8000, -1, MSR_MTRRfix4K_E8000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KF0000, -1, MSR_MTRRfix4K_F0000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KF8000, -1, MSR_MTRRfix4K_F8000}, +}; + +static int mshv_vtl_get_set_reg(struct hv_register_assoc *regs, bool set) +{ + u64 *reg64; + enum hv_register_name gpr_name; + int i; + + gpr_name = regs->name; + reg64 = ®s->value.reg64; + + /* Search for the register in the table */ + for (i = 0; i < ARRAY_SIZE(reg_table); i++) { + if (reg_table[i].reg_name != gpr_name) + continue; + if (reg_table[i].debug_reg_num != -1) { + /* Handle debug registers */ + if (gpr_name == HV_X64_REGISTER_DR6 && + !mshv_vsm_capabilities.dr6_shared) + goto hypercall; + if (set) + native_set_debugreg(reg_table[i].debug_reg_num, *reg64); + else + *reg64 = native_get_debugreg(reg_table[i].debug_reg_num); + } else { + /* Handle MSRs */ + if (set) + wrmsrl(reg_table[i].msr_addr, *reg64); + else + rdmsrl(reg_table[i].msr_addr, *reg64); + } + return 0; + } + +hypercall: + return 1; +} + +static void mshv_vtl_return(struct mshv_vtl_cpu_context *vtl0) +{ + struct hv_vp_assist_page *hvp; + + hvp = hv_vp_assist_page[smp_processor_id()]; + + /* + * Process signal event direct set in the run page, if any. + */ + if (mshv_vsm_capabilities.return_action_available) { + u32 offset = READ_ONCE(mshv_vtl_this_run()->vtl_ret_action_size); + + WRITE_ONCE(mshv_vtl_this_run()->vtl_ret_action_size, 0); + + /* + * Hypervisor will take care of clearing out the actions + * set in the assist page. 
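+		 * The pending actions are copied from the per-CPU run page into
+		 * the VP assist page, truncated to the space available there.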
+ */ + memcpy(hvp->vtl_ret_actions, + mshv_vtl_this_run()->vtl_ret_actions, + min_t(u32, offset, sizeof(hvp->vtl_ret_actions))); + } + + mshv_vtl_return_call(vtl0); +} + +static bool mshv_vtl_process_intercept(void) +{ + struct hv_per_cpu_context *mshv_cpu; + void *synic_message_page; + struct hv_message *msg; + u32 message_type; + + mshv_cpu = this_cpu_ptr(hv_context.cpu_context); + synic_message_page = mshv_cpu->hyp_synic_message_page; + if (unlikely(!synic_message_page)) + return true; + + msg = (struct hv_message *)synic_message_page + HV_SYNIC_INTERCEPTION_SINT_INDEX; + message_type = READ_ONCE(msg->header.message_type); + if (message_type == HVMSG_NONE) + return true; + + memcpy(mshv_vtl_this_run()->exit_message, msg, sizeof(*msg)); + vmbus_signal_eom(msg, message_type); + + return false; +} + +static int mshv_vtl_ioctl_return_to_lower_vtl(void) +{ + preempt_disable(); + for (;;) { + unsigned long irq_flags; + struct hv_vp_assist_page *hvp; + int ret; + + if (__xfer_to_guest_mode_work_pending()) { + preempt_enable(); + ret = xfer_to_guest_mode_handle_work(); + if (ret) + return ret; + preempt_disable(); + } + + local_irq_save(irq_flags); + if (READ_ONCE(mshv_vtl_this_run()->cancel)) { + local_irq_restore(irq_flags); + preempt_enable(); + return -EINTR; + } + + mshv_vtl_return(&mshv_vtl_this_run()->cpu_context); + local_irq_restore(irq_flags); + + hvp = hv_vp_assist_page[smp_processor_id()]; + this_cpu_inc(num_vtl0_transitions); + switch (hvp->vtl_entry_reason) { + case MSHV_ENTRY_REASON_INTERRUPT: + if (!mshv_vsm_capabilities.intercept_page_available && + likely(!mshv_vtl_process_intercept())) + goto done; + break; + + case MSHV_ENTRY_REASON_INTERCEPT: + WARN_ON(!mshv_vsm_capabilities.intercept_page_available); + memcpy(mshv_vtl_this_run()->exit_message, hvp->intercept_message, + sizeof(hvp->intercept_message)); + goto done; + + default: + panic("unknown entry reason: %d", hvp->vtl_entry_reason); + } + } + +done: + preempt_enable(); + + return 0; +} + +static long +mshv_vtl_ioctl_get_regs(void __user *user_args) +{ + struct mshv_vp_registers args; + struct hv_register_assoc reg; + long ret; + + if (copy_from_user(&args, user_args, sizeof(args))) + return -EFAULT; + + /* This IOCTL supports processing only one register at a time. */ + if (args.count != 1) + return -EINVAL; + + if (copy_from_user(®, (void __user *)args.regs_ptr, + sizeof(reg))) + return -EFAULT; + + ret = mshv_vtl_get_set_reg(®, false); + if (!ret) + goto copy_args; /* No need of hypercall */ + ret = vtl_get_vp_register(®); + if (ret) + return ret; + +copy_args: + if (copy_to_user((void __user *)args.regs_ptr, ®, sizeof(reg))) + ret = -EFAULT; + + return ret; +} + +static long +mshv_vtl_ioctl_set_regs(void __user *user_args) +{ + struct mshv_vp_registers args; + struct hv_register_assoc reg; + long ret; + + if (copy_from_user(&args, user_args, sizeof(args))) + return -EFAULT; + + /* This IOCTL supports processing only one register at a time. 
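+	 * Callers that need several registers are expected to issue the ioctl
+	 * once per register.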
*/ + if (args.count != 1) + return -EINVAL; + + if (copy_from_user(®, (void __user *)args.regs_ptr, sizeof(reg))) + return -EFAULT; + + ret = mshv_vtl_get_set_reg(®, true); + if (!ret) + return ret; /* No need of hypercall */ + ret = vtl_set_vp_register(®); + + return ret; +} + +static long +mshv_vtl_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) +{ + long ret; + struct mshv_vtl *vtl = filp->private_data; + + switch (ioctl) { + case MSHV_SET_POLL_FILE: + ret = mshv_vtl_ioctl_set_poll_file((struct mshv_vtl_set_poll_file __user *)arg); + break; + case MSHV_GET_VP_REGISTERS: + ret = mshv_vtl_ioctl_get_regs((void __user *)arg); + break; + case MSHV_SET_VP_REGISTERS: + ret = mshv_vtl_ioctl_set_regs((void __user *)arg); + break; + case MSHV_RETURN_TO_LOWER_VTL: + ret = mshv_vtl_ioctl_return_to_lower_vtl(); + break; + case MSHV_ADD_VTL0_MEMORY: + ret = mshv_vtl_ioctl_add_vtl0_mem(vtl, (void __user *)arg); + break; + default: + dev_err(vtl->module_dev, "invalid vtl ioctl: %#x\n", ioctl); + ret = -ENOTTY; + } + + return ret; +} + +static vm_fault_t mshv_vtl_fault(struct vm_fault *vmf) +{ + struct page *page; + int cpu = vmf->pgoff & MSHV_PG_OFF_CPU_MASK; + int real_off = vmf->pgoff >> MSHV_REAL_OFF_SHIFT; + + if (!cpu_online(cpu)) + return VM_FAULT_SIGBUS; + /* + * CPU Hotplug is not supported in VTL2 in OpenHCL, where this kernel driver exists. + * CPU is expected to remain online after above cpu_online() check. + */ + + if (real_off == MSHV_RUN_PAGE_OFFSET) { + page = virt_to_page(mshv_vtl_cpu_run(cpu)); + } else if (real_off == MSHV_REG_PAGE_OFFSET) { + if (!mshv_has_reg_page) + return VM_FAULT_SIGBUS; + page = mshv_vtl_cpu_reg_page(cpu); + } else { + return VM_FAULT_NOPAGE; + } + + get_page(page); + vmf->page = page; + + return 0; +} + +static const struct vm_operations_struct mshv_vtl_vm_ops = { + .fault = mshv_vtl_fault, +}; + +static int mshv_vtl_mmap(struct file *filp, struct vm_area_struct *vma) +{ + vma->vm_ops = &mshv_vtl_vm_ops; + + return 0; +} + +static int mshv_vtl_release(struct inode *inode, struct file *filp) +{ + struct mshv_vtl *vtl = filp->private_data; + + kfree(vtl); + + return 0; +} + +static const struct file_operations mshv_vtl_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = mshv_vtl_ioctl, + .release = mshv_vtl_release, + .mmap = mshv_vtl_mmap, +}; + +static void mshv_vtl_synic_mask_vmbus_sint(const u8 *mask) +{ + union hv_synic_sint sint; + + sint.as_uint64 = 0; + sint.vector = HYPERVISOR_CALLBACK_VECTOR; + sint.masked = (*mask != 0); + sint.auto_eoi = hv_recommend_using_aeoi(); + + hv_set_msr(HV_MSR_SINT0 + VTL2_VMBUS_SINT_INDEX, + sint.as_uint64); + + if (!sint.masked) + pr_debug("%s: Unmasking VTL2 VMBUS SINT on VP %d\n", __func__, smp_processor_id()); + else + pr_debug("%s: Masking VTL2 VMBUS SINT on VP %d\n", __func__, smp_processor_id()); +} + +static void mshv_vtl_read_remote(void *buffer) +{ + struct hv_per_cpu_context *mshv_cpu = this_cpu_ptr(hv_context.cpu_context); + struct hv_message *msg = (struct hv_message *)mshv_cpu->hyp_synic_message_page + + VTL2_VMBUS_SINT_INDEX; + u32 message_type = READ_ONCE(msg->header.message_type); + + WRITE_ONCE(has_message, false); + if (message_type == HVMSG_NONE) + return; + + memcpy(buffer, msg, sizeof(*msg)); + vmbus_signal_eom(msg, message_type); +} + +static bool vtl_synic_mask_vmbus_sint_masked = true; + +static ssize_t mshv_vtl_sint_read(struct file *filp, char __user *arg, size_t size, loff_t *offset) +{ + struct hv_message msg = {}; + int ret; + + if (size < sizeof(msg)) + return -EINVAL; + + for 
(;;) { + smp_call_function_single(VMBUS_CONNECT_CPU, mshv_vtl_read_remote, &msg, true); + if (msg.header.message_type != HVMSG_NONE) + break; + + if (READ_ONCE(vtl_synic_mask_vmbus_sint_masked)) + return 0; /* EOF */ + + if (filp->f_flags & O_NONBLOCK) + return -EAGAIN; + + ret = wait_event_interruptible(fd_wait_queue, + READ_ONCE(has_message) || + READ_ONCE(vtl_synic_mask_vmbus_sint_masked)); + if (ret) + return ret; + } + + if (copy_to_user(arg, &msg, sizeof(msg))) + return -EFAULT; + + return sizeof(msg); +} + +static __poll_t mshv_vtl_sint_poll(struct file *filp, poll_table *wait) +{ + __poll_t mask = 0; + + poll_wait(filp, &fd_wait_queue, wait); + if (READ_ONCE(has_message) || READ_ONCE(vtl_synic_mask_vmbus_sint_masked)) + mask |= EPOLLIN | EPOLLRDNORM; + + return mask; +} + +static void mshv_vtl_sint_on_msg_dpc(unsigned long data) +{ + WRITE_ONCE(has_message, true); + wake_up_interruptible_poll(&fd_wait_queue, EPOLLIN); +} + +static int mshv_vtl_sint_ioctl_post_msg(struct mshv_vtl_sint_post_msg __user *arg) +{ + struct mshv_vtl_sint_post_msg message; + u8 payload[HV_MESSAGE_PAYLOAD_BYTE_COUNT]; + + if (copy_from_user(&message, arg, sizeof(message))) + return -EFAULT; + if (message.payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT) + return -EINVAL; + if (copy_from_user(payload, (void __user *)message.payload_ptr, + message.payload_size)) + return -EFAULT; + + return hv_post_message((union hv_connection_id)message.connection_id, + message.message_type, (void *)payload, + message.payload_size); +} + +static int mshv_vtl_sint_ioctl_signal_event(struct mshv_vtl_signal_event __user *arg) +{ + u64 input, status; + struct mshv_vtl_signal_event signal_event; + + if (copy_from_user(&signal_event, arg, sizeof(signal_event))) + return -EFAULT; + + input = signal_event.connection_id | ((u64)signal_event.flag << 32); + + status = hv_do_fast_hypercall8(HVCALL_SIGNAL_EVENT, input); + + return hv_result_to_errno(status); +} + +static int mshv_vtl_sint_ioctl_set_eventfd(struct mshv_vtl_set_eventfd __user *arg) +{ + struct mshv_vtl_set_eventfd set_eventfd; + struct eventfd_ctx *eventfd, *old_eventfd; + + if (copy_from_user(&set_eventfd, arg, sizeof(set_eventfd))) + return -EFAULT; + if (set_eventfd.flag >= HV_EVENT_FLAGS_COUNT) + return -EINVAL; + + eventfd = NULL; + if (set_eventfd.fd >= 0) { + eventfd = eventfd_ctx_fdget(set_eventfd.fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + } + + guard(mutex)(&flag_lock); + old_eventfd = READ_ONCE(flag_eventfds[set_eventfd.flag]); + WRITE_ONCE(flag_eventfds[set_eventfd.flag], eventfd); + + if (old_eventfd) { + synchronize_rcu(); + eventfd_ctx_put(old_eventfd); + } + + return 0; +} + +static int mshv_vtl_sint_ioctl_pause_msg_stream(struct mshv_sint_mask __user *arg) +{ + static DEFINE_MUTEX(vtl2_vmbus_sint_mask_mutex); + struct mshv_sint_mask mask; + + if (copy_from_user(&mask, arg, sizeof(mask))) + return -EFAULT; + guard(mutex)(&vtl2_vmbus_sint_mask_mutex); + on_each_cpu((smp_call_func_t)mshv_vtl_synic_mask_vmbus_sint, &mask.mask, 1); + WRITE_ONCE(vtl_synic_mask_vmbus_sint_masked, mask.mask != 0); + if (mask.mask) + wake_up_interruptible_poll(&fd_wait_queue, EPOLLIN); + + return 0; +} + +static long mshv_vtl_sint_ioctl(struct file *f, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case MSHV_SINT_POST_MESSAGE: + return mshv_vtl_sint_ioctl_post_msg((struct mshv_vtl_sint_post_msg __user *)arg); + case MSHV_SINT_SIGNAL_EVENT: + return mshv_vtl_sint_ioctl_signal_event((struct mshv_vtl_signal_event __user *)arg); + case MSHV_SINT_SET_EVENTFD: + 
return mshv_vtl_sint_ioctl_set_eventfd((struct mshv_vtl_set_eventfd __user *)arg); + case MSHV_SINT_PAUSE_MESSAGE_STREAM: + return mshv_vtl_sint_ioctl_pause_msg_stream((struct mshv_sint_mask __user *)arg); + default: + return -ENOIOCTLCMD; + } +} + +static const struct file_operations mshv_vtl_sint_ops = { + .owner = THIS_MODULE, + .read = mshv_vtl_sint_read, + .poll = mshv_vtl_sint_poll, + .unlocked_ioctl = mshv_vtl_sint_ioctl, +}; + +static struct miscdevice mshv_vtl_sint_dev = { + .name = "mshv_sint", + .fops = &mshv_vtl_sint_ops, + .mode = 0600, + .minor = MISC_DYNAMIC_MINOR, +}; + +static int mshv_vtl_hvcall_dev_open(struct inode *node, struct file *f) +{ + struct miscdevice *dev = f->private_data; + struct mshv_vtl_hvcall_fd *fd; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + fd = vzalloc(sizeof(*fd)); + if (!fd) + return -ENOMEM; + fd->dev = dev; + f->private_data = fd; + mutex_init(&fd->init_mutex); + + return 0; +} + +static int mshv_vtl_hvcall_dev_release(struct inode *node, struct file *f) +{ + struct mshv_vtl_hvcall_fd *fd; + + fd = f->private_data; + if (fd) { + vfree(fd); + f->private_data = NULL; + } + + return 0; +} + +static int mshv_vtl_hvcall_do_setup(struct mshv_vtl_hvcall_fd *fd, + struct mshv_vtl_hvcall_setup __user *hvcall_setup_user) +{ + struct mshv_vtl_hvcall_setup hvcall_setup; + + guard(mutex)(&fd->init_mutex); + + if (fd->allow_map_initialized) { + dev_err(fd->dev->this_device, + "Hypercall allow map has already been set, pid %d\n", + current->pid); + return -EINVAL; + } + + if (copy_from_user(&hvcall_setup, hvcall_setup_user, + sizeof(struct mshv_vtl_hvcall_setup))) { + return -EFAULT; + } + if (hvcall_setup.bitmap_array_size > ARRAY_SIZE(fd->allow_bitmap)) + return -EINVAL; + + if (copy_from_user(&fd->allow_bitmap, + (void __user *)hvcall_setup.allow_bitmap_ptr, + hvcall_setup.bitmap_array_size)) { + return -EFAULT; + } + + dev_info(fd->dev->this_device, "Hypercall allow map has been set, pid %d\n", + current->pid); + fd->allow_map_initialized = true; + return 0; +} + +static bool mshv_vtl_hvcall_is_allowed(struct mshv_vtl_hvcall_fd *fd, u16 call_code) +{ + return test_bit(call_code, (unsigned long *)fd->allow_bitmap); +} + +static int mshv_vtl_hvcall_call(struct mshv_vtl_hvcall_fd *fd, + struct mshv_vtl_hvcall __user *hvcall_user) +{ + struct mshv_vtl_hvcall hvcall; + void *in, *out; + int ret; + + if (copy_from_user(&hvcall, hvcall_user, sizeof(struct mshv_vtl_hvcall))) + return -EFAULT; + if (hvcall.input_size > HV_HYP_PAGE_SIZE) + return -EINVAL; + if (hvcall.output_size > HV_HYP_PAGE_SIZE) + return -EINVAL; + + /* + * By default, all hypercalls are not allowed. + * The user mode code has to set up the allow bitmap once. + */ + + if (!mshv_vtl_hvcall_is_allowed(fd, hvcall.control & 0xFFFF)) { + dev_err(fd->dev->this_device, + "Hypercall with control data %#llx isn't allowed\n", + hvcall.control); + return -EPERM; + } + + /* + * This may create a problem for Confidential VM (CVM) usecase where we need to use + * Hyper-V driver allocated per-cpu input and output pages (hyperv_pcpu_input_arg and + * hyperv_pcpu_output_arg) for making a hypervisor call. + * + * TODO: Take care of this when CVM support is added. 
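+	 * For now a freshly allocated page is used for each direction; the
+	 * sizes are capped to HV_HYP_PAGE_SIZE above, so neither buffer can
+	 * cross a page boundary as the hypercall ABI requires.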
+ */ + in = (void *)__get_free_page(GFP_KERNEL); + out = (void *)__get_free_page(GFP_KERNEL); + + if (copy_from_user(in, (void __user *)hvcall.input_ptr, hvcall.input_size)) { + ret = -EFAULT; + goto free_pages; + } + + hvcall.status = hv_do_hypercall(hvcall.control, in, out); + + if (copy_to_user((void __user *)hvcall.output_ptr, out, hvcall.output_size)) { + ret = -EFAULT; + goto free_pages; + } + ret = put_user(hvcall.status, &hvcall_user->status); +free_pages: + free_page((unsigned long)in); + free_page((unsigned long)out); + + return ret; +} + +static long mshv_vtl_hvcall_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) +{ + struct mshv_vtl_hvcall_fd *fd = f->private_data; + + switch (cmd) { + case MSHV_HVCALL_SETUP: + return mshv_vtl_hvcall_do_setup(fd, (struct mshv_vtl_hvcall_setup __user *)arg); + case MSHV_HVCALL: + return mshv_vtl_hvcall_call(fd, (struct mshv_vtl_hvcall __user *)arg); + default: + break; + } + + return -ENOIOCTLCMD; +} + +static const struct file_operations mshv_vtl_hvcall_dev_file_ops = { + .owner = THIS_MODULE, + .open = mshv_vtl_hvcall_dev_open, + .release = mshv_vtl_hvcall_dev_release, + .unlocked_ioctl = mshv_vtl_hvcall_dev_ioctl, +}; + +static struct miscdevice mshv_vtl_hvcall_dev = { + .name = "mshv_hvcall", + .nodename = "mshv_hvcall", + .fops = &mshv_vtl_hvcall_dev_file_ops, + .mode = 0600, + .minor = MISC_DYNAMIC_MINOR, +}; + +static int mshv_vtl_low_open(struct inode *inodep, struct file *filp) +{ + pid_t pid = task_pid_vnr(current); + uid_t uid = current_uid().val; + int ret = 0; + + pr_debug("%s: Opening VTL low, task group %d, uid %d\n", __func__, pid, uid); + + if (capable(CAP_SYS_ADMIN)) { + filp->private_data = inodep; + } else { + pr_err("%s: VTL low open failed: CAP_SYS_ADMIN required. task group %d, uid %d", + __func__, pid, uid); + ret = -EPERM; + } + + return ret; +} + +static bool can_fault(struct vm_fault *vmf, unsigned long size, unsigned long *pfn) +{ + unsigned long mask = size - 1; + unsigned long start = vmf->address & ~mask; + unsigned long end = start + size; + bool is_valid; + + is_valid = (vmf->address & mask) == ((vmf->pgoff << PAGE_SHIFT) & mask) && + start >= vmf->vma->vm_start && + end <= vmf->vma->vm_end; + + if (is_valid) + *pfn = vmf->pgoff & ~(mask >> PAGE_SHIFT); + + return is_valid; +} + +static vm_fault_t mshv_vtl_low_huge_fault(struct vm_fault *vmf, unsigned int order) +{ + unsigned long pfn = vmf->pgoff; + vm_fault_t ret = VM_FAULT_FALLBACK; + + switch (order) { + case 0: + return vmf_insert_mixed(vmf->vma, vmf->address, pfn); + + case PMD_ORDER: + if (can_fault(vmf, PMD_SIZE, &pfn)) + ret = vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE); + return ret; + + case PUD_ORDER: + if (can_fault(vmf, PUD_SIZE, &pfn)) + ret = vmf_insert_pfn_pud(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE); + return ret; + + default: + return VM_FAULT_SIGBUS; + } +} + +static vm_fault_t mshv_vtl_low_fault(struct vm_fault *vmf) +{ + return mshv_vtl_low_huge_fault(vmf, 0); +} + +static const struct vm_operations_struct mshv_vtl_low_vm_ops = { + .fault = mshv_vtl_low_fault, + .huge_fault = mshv_vtl_low_huge_fault, +}; + +static int mshv_vtl_low_mmap(struct file *filp, struct vm_area_struct *vma) +{ + vma->vm_ops = &mshv_vtl_low_vm_ops; + vm_flags_set(vma, VM_HUGEPAGE | VM_MIXEDMAP); + + return 0; +} + +static const struct file_operations mshv_vtl_low_file_ops = { + .owner = THIS_MODULE, + .open = mshv_vtl_low_open, + .mmap = mshv_vtl_low_mmap, +}; + +static struct miscdevice mshv_vtl_low = { + .name = "mshv_vtl_low", + 
.nodename = "mshv_vtl_low", + .fops = &mshv_vtl_low_file_ops, + .mode = 0600, + .minor = MISC_DYNAMIC_MINOR, +}; + +static int __init mshv_vtl_init(void) +{ + int ret; + struct device *dev = mshv_dev.this_device; + + /* + * This creates /dev/mshv which provides functionality to create VTLs and partitions. + */ + ret = misc_register(&mshv_dev); + if (ret) { + dev_err(dev, "mshv device register failed: %d\n", ret); + goto free_dev; + } + + tasklet_init(&msg_dpc, mshv_vtl_sint_on_msg_dpc, 0); + init_waitqueue_head(&fd_wait_queue); + + if (mshv_vtl_get_vsm_regs()) { + dev_emerg(dev, "Unable to get VSM capabilities !!\n"); + ret = -ENODEV; + goto free_dev; + } + if (mshv_vtl_configure_vsm_partition(dev)) { + dev_emerg(dev, "VSM configuration failed !!\n"); + ret = -ENODEV; + goto free_dev; + } + + mshv_vtl_return_call_init(mshv_vsm_page_offsets.vtl_return_offset); + ret = hv_vtl_setup_synic(); + if (ret) + goto free_dev; + + /* + * mshv_sint device adds VMBus relay ioctl support. + * This provides a channel for VTL0 to communicate with VTL2. + */ + ret = misc_register(&mshv_vtl_sint_dev); + if (ret) + goto free_synic; + + /* + * mshv_hvcall device adds interface to enable userspace for direct hypercalls support. + */ + ret = misc_register(&mshv_vtl_hvcall_dev); + if (ret) + goto free_sint; + + /* + * mshv_vtl_low device is used to map VTL0 address space to a user-mode process in VTL2. + * It implements mmap() to allow a user-mode process in VTL2 to map to the address of VTL0. + */ + ret = misc_register(&mshv_vtl_low); + if (ret) + goto free_hvcall; + + /* + * "mshv vtl mem dev" device is later used to setup VTL0 memory. + */ + mem_dev = kzalloc(sizeof(*mem_dev), GFP_KERNEL); + if (!mem_dev) { + ret = -ENOMEM; + goto free_low; + } + + mutex_init(&mshv_vtl_poll_file_lock); + + device_initialize(mem_dev); + dev_set_name(mem_dev, "mshv vtl mem dev"); + ret = device_add(mem_dev); + if (ret) { + dev_err(dev, "mshv vtl mem dev add: %d\n", ret); + goto free_mem; + } + + return 0; + +free_mem: + kfree(mem_dev); +free_low: + misc_deregister(&mshv_vtl_low); +free_hvcall: + misc_deregister(&mshv_vtl_hvcall_dev); +free_sint: + misc_deregister(&mshv_vtl_sint_dev); +free_synic: + hv_vtl_remove_synic(); +free_dev: + misc_deregister(&mshv_dev); + + return ret; +} + +static void __exit mshv_vtl_exit(void) +{ + device_del(mem_dev); + kfree(mem_dev); + misc_deregister(&mshv_vtl_low); + misc_deregister(&mshv_vtl_hvcall_dev); + misc_deregister(&mshv_vtl_sint_dev); + hv_vtl_remove_synic(); + misc_deregister(&mshv_dev); +} + +module_init(mshv_vtl_init); +module_exit(mshv_vtl_exit); diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c index c6692fd5ab15..3c421a7f78c0 100644 --- a/drivers/hv/ring_buffer.c +++ b/drivers/hv/ring_buffer.c @@ -18,6 +18,7 @@ #include <linux/slab.h> #include <linux/prefetch.h> #include <linux/io.h> +#include <linux/export.h> #include <asm/mshyperv.h> #include "hyperv_vmbus.h" @@ -183,11 +184,10 @@ void hv_ringbuffer_pre_init(struct vmbus_channel *channel) /* Initialize the ring buffer. */ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info, - struct page *pages, u32 page_cnt, u32 max_pkt_size) + struct page *pages, u32 page_cnt, u32 max_pkt_size, + bool confidential) { struct page **pages_wraparound; - unsigned long *pfns_wraparound; - u64 pfn; int i; BUILD_BUG_ON((sizeof(struct hv_ring_buffer) != PAGE_SIZE)); @@ -196,50 +196,30 @@ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info, * First page holds struct hv_ring_buffer, do wraparound mapping for * the rest. 
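 * The wraparound mapping places the data pages in the virtual address
 * space twice, back to back, so a packet that wraps past the end of the
 * ring can still be accessed through one contiguous virtual range.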
*/ - if (hv_isolation_type_snp()) { - pfn = page_to_pfn(pages) + - PFN_DOWN(ms_hyperv.shared_gpa_boundary); + pages_wraparound = kcalloc(page_cnt * 2 - 1, + sizeof(struct page *), + GFP_KERNEL); + if (!pages_wraparound) + return -ENOMEM; - pfns_wraparound = kcalloc(page_cnt * 2 - 1, - sizeof(unsigned long), GFP_KERNEL); - if (!pfns_wraparound) - return -ENOMEM; - - pfns_wraparound[0] = pfn; - for (i = 0; i < 2 * (page_cnt - 1); i++) - pfns_wraparound[i + 1] = pfn + i % (page_cnt - 1) + 1; - - ring_info->ring_buffer = (struct hv_ring_buffer *) - vmap_pfn(pfns_wraparound, page_cnt * 2 - 1, - PAGE_KERNEL); - kfree(pfns_wraparound); - - if (!ring_info->ring_buffer) - return -ENOMEM; - - /* Zero ring buffer after setting memory host visibility. */ - memset(ring_info->ring_buffer, 0x00, PAGE_SIZE * page_cnt); - } else { - pages_wraparound = kcalloc(page_cnt * 2 - 1, - sizeof(struct page *), - GFP_KERNEL); - if (!pages_wraparound) - return -ENOMEM; - - pages_wraparound[0] = pages; - for (i = 0; i < 2 * (page_cnt - 1); i++) - pages_wraparound[i + 1] = - &pages[i % (page_cnt - 1) + 1]; + pages_wraparound[0] = pages; + for (i = 0; i < 2 * (page_cnt - 1); i++) + pages_wraparound[i + 1] = + &pages[i % (page_cnt - 1) + 1]; - ring_info->ring_buffer = (struct hv_ring_buffer *) - vmap(pages_wraparound, page_cnt * 2 - 1, VM_MAP, - PAGE_KERNEL); + ring_info->ring_buffer = (struct hv_ring_buffer *) + vmap(pages_wraparound, page_cnt * 2 - 1, VM_MAP, + confidential ? PAGE_KERNEL : pgprot_decrypted(PAGE_KERNEL)); - kfree(pages_wraparound); - if (!ring_info->ring_buffer) - return -ENOMEM; - } + kfree(pages_wraparound); + if (!ring_info->ring_buffer) + return -ENOMEM; + /* + * Ensure the header page is zero'ed since + * encryption status may have changed. + */ + memset(ring_info->ring_buffer, 0, HV_HYP_PAGE_SIZE); ring_info->ring_buffer->read_index = ring_info->ring_buffer->write_index = 0; diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 3146710d4ac6..a53af6fe81a6 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -12,6 +12,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/device.h> +#include <linux/platform_device.h> #include <linux/interrupt.h> #include <linux/sysctl.h> #include <linux/slab.h> @@ -19,6 +20,7 @@ #include <linux/completion.h> #include <linux/hyperv.h> #include <linux/kernel_stat.h> +#include <linux/of_address.h> #include <linux/clockchips.h> #include <linux/cpu.h> #include <linux/sched/isolation.h> @@ -28,13 +30,13 @@ #include <linux/panic_notifier.h> #include <linux/ptrace.h> #include <linux/screen_info.h> -#include <linux/kdebug.h> #include <linux/efi.h> #include <linux/random.h> #include <linux/kernel.h> #include <linux/syscore_ops.h> #include <linux/dma-map-ops.h> #include <linux/pci.h> +#include <linux/export.h> #include <clocksource/hyperv_timer.h> #include <asm/mshyperv.h> #include "hyperv_vmbus.h" @@ -44,12 +46,11 @@ struct vmbus_dynid { struct hv_vmbus_device_id id; }; -static struct acpi_device *hv_acpi_dev; +/* VMBus Root Device */ +static struct device *vmbus_root_device; static int hyperv_cpuhp_online; -static void *hv_panic_page; - static long __percpu *vmbus_evt; /* Values parsed from ACPI DSDT */ @@ -57,16 +58,16 @@ int vmbus_irq; int vmbus_interrupt; /* - * Boolean to control whether to report panic messages over Hyper-V. - * - * It can be set via /proc/sys/kernel/hyperv_record_panic_msg + * If the Confidential VMBus is used, the data on the "wire" is not + * visible to either the host or the hypervisor. 
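+ * In that case the ring buffers are left encrypted: see hv_ringbuffer_init(),
+ * which maps a confidential ring buffer with PAGE_KERNEL instead of
+ * pgprot_decrypted(PAGE_KERNEL).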
*/ -static int sysctl_record_panic_msg = 1; +static bool is_confidential; -static int hyperv_report_reg(void) +bool vmbus_is_confidential(void) { - return !sysctl_record_panic_msg || !hv_panic_page; + return is_confidential; } +EXPORT_SYMBOL_GPL(vmbus_is_confidential); /* * The panic notifier below is responsible solely for unloading the @@ -88,62 +89,20 @@ static struct notifier_block hyperv_panic_vmbus_unload_block = { .priority = INT_MIN + 1, /* almost the latest one to execute */ }; -static int hv_die_panic_notify_crash(struct notifier_block *self, - unsigned long val, void *args); - -static struct notifier_block hyperv_die_report_block = { - .notifier_call = hv_die_panic_notify_crash, -}; -static struct notifier_block hyperv_panic_report_block = { - .notifier_call = hv_die_panic_notify_crash, -}; - -/* - * The following callback works both as die and panic notifier; its - * goal is to provide panic information to the hypervisor unless the - * kmsg dumper is used [see hv_kmsg_dump()], which provides more - * information but isn't always available. - * - * Notice that both the panic/die report notifiers are registered only - * if we have the capability HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE set. - */ -static int hv_die_panic_notify_crash(struct notifier_block *self, - unsigned long val, void *args) -{ - struct pt_regs *regs; - bool is_die; - - /* Don't notify Hyper-V unless we have a die oops event or panic. */ - if (self == &hyperv_panic_report_block) { - is_die = false; - regs = current_pt_regs(); - } else { /* die event */ - if (val != DIE_OOPS) - return NOTIFY_DONE; - - is_die = true; - regs = ((struct die_args *)args)->regs; - } - - /* - * Hyper-V should be notified only once about a panic/die. If we will - * be calling hv_kmsg_dump() later with kmsg data, don't do the - * notification here. 
- */ - if (hyperv_report_reg()) - hyperv_report_panic(regs, val, is_die); - - return NOTIFY_DONE; -} - static const char *fb_mmio_name = "fb_range"; static struct resource *fb_mmio; static struct resource *hyperv_mmio; static DEFINE_MUTEX(hyperv_mmio_lock); +struct device *hv_get_vmbus_root_device(void) +{ + return vmbus_root_device; +} +EXPORT_SYMBOL_GPL(hv_get_vmbus_root_device); + static int vmbus_exists(void) { - if (hv_acpi_dev == NULL) + if (vmbus_root_device == NULL) return -ENODEV; return 0; @@ -192,7 +151,7 @@ static ssize_t id_show(struct device *dev, struct device_attribute *dev_attr, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", hv_dev->channel->offermsg.child_relid); + return sysfs_emit(buf, "%d\n", hv_dev->channel->offermsg.child_relid); } static DEVICE_ATTR_RO(id); @@ -203,7 +162,7 @@ static ssize_t state_show(struct device *dev, struct device_attribute *dev_attr, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", hv_dev->channel->state); + return sysfs_emit(buf, "%d\n", hv_dev->channel->state); } static DEVICE_ATTR_RO(state); @@ -214,7 +173,7 @@ static ssize_t monitor_id_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", hv_dev->channel->offermsg.monitorid); + return sysfs_emit(buf, "%d\n", hv_dev->channel->offermsg.monitorid); } static DEVICE_ATTR_RO(monitor_id); @@ -225,8 +184,8 @@ static ssize_t class_id_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "{%pUl}\n", - &hv_dev->channel->offermsg.offer.if_type); + return sysfs_emit(buf, "{%pUl}\n", + &hv_dev->channel->offermsg.offer.if_type); } static DEVICE_ATTR_RO(class_id); @@ -237,8 +196,8 @@ static ssize_t device_id_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "{%pUl}\n", - &hv_dev->channel->offermsg.offer.if_instance); + return sysfs_emit(buf, "{%pUl}\n", + &hv_dev->channel->offermsg.offer.if_instance); } static DEVICE_ATTR_RO(device_id); @@ -247,7 +206,7 @@ static ssize_t modalias_show(struct device *dev, { struct hv_device *hv_dev = device_to_hv_device(dev); - return sprintf(buf, "vmbus:%*phN\n", UUID_SIZE, &hv_dev->dev_type); + return sysfs_emit(buf, "vmbus:%*phN\n", UUID_SIZE, &hv_dev->dev_type); } static DEVICE_ATTR_RO(modalias); @@ -260,7 +219,7 @@ static ssize_t numa_node_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", cpu_to_node(hv_dev->channel->target_cpu)); + return sysfs_emit(buf, "%d\n", cpu_to_node(hv_dev->channel->target_cpu)); } static DEVICE_ATTR_RO(numa_node); #endif @@ -273,9 +232,8 @@ static ssize_t server_monitor_pending_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_pending(hv_dev->channel, - vmbus_connection.monitor_pages[0])); + return sysfs_emit(buf, "%d\n", channel_pending(hv_dev->channel, + vmbus_connection.monitor_pages[0])); } static DEVICE_ATTR_RO(server_monitor_pending); @@ -287,9 +245,8 @@ static ssize_t client_monitor_pending_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_pending(hv_dev->channel, - vmbus_connection.monitor_pages[1])); + return sysfs_emit(buf, "%d\n", channel_pending(hv_dev->channel, + vmbus_connection.monitor_pages[1])); } static DEVICE_ATTR_RO(client_monitor_pending); @@ -301,9 +258,8 @@ static ssize_t server_monitor_latency_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_latency(hv_dev->channel, - 
vmbus_connection.monitor_pages[0])); + return sysfs_emit(buf, "%d\n", channel_latency(hv_dev->channel, + vmbus_connection.monitor_pages[0])); } static DEVICE_ATTR_RO(server_monitor_latency); @@ -315,9 +271,8 @@ static ssize_t client_monitor_latency_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_latency(hv_dev->channel, - vmbus_connection.monitor_pages[1])); + return sysfs_emit(buf, "%d\n", channel_latency(hv_dev->channel, + vmbus_connection.monitor_pages[1])); } static DEVICE_ATTR_RO(client_monitor_latency); @@ -329,9 +284,8 @@ static ssize_t server_monitor_conn_id_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_conn_id(hv_dev->channel, - vmbus_connection.monitor_pages[0])); + return sysfs_emit(buf, "%d\n", channel_conn_id(hv_dev->channel, + vmbus_connection.monitor_pages[0])); } static DEVICE_ATTR_RO(server_monitor_conn_id); @@ -343,9 +297,8 @@ static ssize_t client_monitor_conn_id_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_conn_id(hv_dev->channel, - vmbus_connection.monitor_pages[1])); + return sysfs_emit(buf, "%d\n", channel_conn_id(hv_dev->channel, + vmbus_connection.monitor_pages[1])); } static DEVICE_ATTR_RO(client_monitor_conn_id); @@ -364,7 +317,7 @@ static ssize_t out_intr_mask_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", outbound.current_interrupt_mask); + return sysfs_emit(buf, "%d\n", outbound.current_interrupt_mask); } static DEVICE_ATTR_RO(out_intr_mask); @@ -382,7 +335,7 @@ static ssize_t out_read_index_show(struct device *dev, &outbound); if (ret < 0) return ret; - return sprintf(buf, "%d\n", outbound.current_read_index); + return sysfs_emit(buf, "%u\n", outbound.current_read_index); } static DEVICE_ATTR_RO(out_read_index); @@ -401,7 +354,7 @@ static ssize_t out_write_index_show(struct device *dev, &outbound); if (ret < 0) return ret; - return sprintf(buf, "%d\n", outbound.current_write_index); + return sysfs_emit(buf, "%u\n", outbound.current_write_index); } static DEVICE_ATTR_RO(out_write_index); @@ -420,7 +373,7 @@ static ssize_t out_read_bytes_avail_show(struct device *dev, &outbound); if (ret < 0) return ret; - return sprintf(buf, "%d\n", outbound.bytes_avail_toread); + return sysfs_emit(buf, "%d\n", outbound.bytes_avail_toread); } static DEVICE_ATTR_RO(out_read_bytes_avail); @@ -439,7 +392,7 @@ static ssize_t out_write_bytes_avail_show(struct device *dev, &outbound); if (ret < 0) return ret; - return sprintf(buf, "%d\n", outbound.bytes_avail_towrite); + return sysfs_emit(buf, "%d\n", outbound.bytes_avail_towrite); } static DEVICE_ATTR_RO(out_write_bytes_avail); @@ -457,7 +410,7 @@ static ssize_t in_intr_mask_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", inbound.current_interrupt_mask); + return sysfs_emit(buf, "%d\n", inbound.current_interrupt_mask); } static DEVICE_ATTR_RO(in_intr_mask); @@ -475,7 +428,7 @@ static ssize_t in_read_index_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", inbound.current_read_index); + return sysfs_emit(buf, "%d\n", inbound.current_read_index); } static DEVICE_ATTR_RO(in_read_index); @@ -493,7 +446,7 @@ static ssize_t in_write_index_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", inbound.current_write_index); + return sysfs_emit(buf, "%d\n", inbound.current_write_index); } static DEVICE_ATTR_RO(in_write_index); @@ -512,7 +465,7 @@ static 
ssize_t in_read_bytes_avail_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", inbound.bytes_avail_toread); + return sysfs_emit(buf, "%d\n", inbound.bytes_avail_toread); } static DEVICE_ATTR_RO(in_read_bytes_avail); @@ -531,7 +484,7 @@ static ssize_t in_write_bytes_avail_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", inbound.bytes_avail_towrite); + return sysfs_emit(buf, "%d\n", inbound.bytes_avail_towrite); } static DEVICE_ATTR_RO(in_write_bytes_avail); @@ -541,7 +494,7 @@ static ssize_t channel_vp_mapping_show(struct device *dev, { struct hv_device *hv_dev = device_to_hv_device(dev); struct vmbus_channel *channel = hv_dev->channel, *cur_sc; - int buf_size = PAGE_SIZE, n_written, tot_written; + int n_written; struct list_head *cur; if (!channel) @@ -549,25 +502,21 @@ static ssize_t channel_vp_mapping_show(struct device *dev, mutex_lock(&vmbus_connection.channel_mutex); - tot_written = snprintf(buf, buf_size, "%u:%u\n", - channel->offermsg.child_relid, channel->target_cpu); + n_written = sysfs_emit(buf, "%u:%u\n", + channel->offermsg.child_relid, + channel->target_cpu); list_for_each(cur, &channel->sc_list) { - if (tot_written >= buf_size - 1) - break; cur_sc = list_entry(cur, struct vmbus_channel, sc_list); - n_written = scnprintf(buf + tot_written, - buf_size - tot_written, - "%u:%u\n", - cur_sc->offermsg.child_relid, - cur_sc->target_cpu); - tot_written += n_written; + n_written += sysfs_emit_at(buf, n_written, "%u:%u\n", + cur_sc->offermsg.child_relid, + cur_sc->target_cpu); } mutex_unlock(&vmbus_connection.channel_mutex); - return tot_written; + return n_written; } static DEVICE_ATTR_RO(channel_vp_mapping); @@ -577,7 +526,7 @@ static ssize_t vendor_show(struct device *dev, { struct hv_device *hv_dev = device_to_hv_device(dev); - return sprintf(buf, "0x%x\n", hv_dev->vendor_id); + return sysfs_emit(buf, "0x%x\n", hv_dev->vendor_id); } static DEVICE_ATTR_RO(vendor); @@ -587,7 +536,7 @@ static ssize_t device_show(struct device *dev, { struct hv_device *hv_dev = device_to_hv_device(dev); - return sprintf(buf, "0x%x\n", hv_dev->device_id); + return sysfs_emit(buf, "0x%x\n", hv_dev->device_id); } static DEVICE_ATTR_RO(device); @@ -612,7 +561,7 @@ static ssize_t driver_override_show(struct device *dev, ssize_t len; device_lock(dev); - len = snprintf(buf, PAGE_SIZE, "%s\n", hv_dev->driver_override); + len = sysfs_emit(buf, "%s\n", hv_dev->driver_override); device_unlock(dev); return len; @@ -684,7 +633,7 @@ static const struct attribute_group vmbus_dev_group = { __ATTRIBUTE_GROUPS(vmbus_dev); /* Set up the attribute for /sys/bus/vmbus/hibernation */ -static ssize_t hibernation_show(struct bus_type *bus, char *buf) +static ssize_t hibernation_show(const struct bus_type *bus, char *buf) { return sprintf(buf, "%d\n", !!hv_is_hibernation_supported()); } @@ -711,9 +660,9 @@ __ATTRIBUTE_GROUPS(vmbus_bus); * representation of the device guid (each byte of the guid will be * represented with two hex characters. */ -static int vmbus_uevent(struct device *device, struct kobj_uevent_env *env) +static int vmbus_uevent(const struct device *device, struct kobj_uevent_env *env) { - struct hv_device *dev = device_to_hv_device(device); + const struct hv_device *dev = device_to_hv_device(device); const char *format = "MODALIAS=vmbus:%*phN"; return add_uevent_var(env, format, UUID_SIZE, &dev->dev_type); @@ -756,7 +705,7 @@ static const struct hv_vmbus_device_id vmbus_device_null; * Return a matching hv_vmbus_device_id pointer. 
* If there is no match, return NULL. */ -static const struct hv_vmbus_device_id *hv_vmbus_get_id(struct hv_driver *drv, +static const struct hv_vmbus_device_id *hv_vmbus_get_id(const struct hv_driver *drv, struct hv_device *dev) { const guid_t *guid = &dev->dev_type; @@ -767,7 +716,7 @@ static const struct hv_vmbus_device_id *hv_vmbus_get_id(struct hv_driver *drv, return NULL; /* Look at the dynamic ids first, before the static ones */ - id = hv_vmbus_dynid_match(drv, guid); + id = hv_vmbus_dynid_match((struct hv_driver *)drv, guid); if (!id) id = hv_vmbus_dev_match(drv->id_table, guid); @@ -778,7 +727,30 @@ static const struct hv_vmbus_device_id *hv_vmbus_get_id(struct hv_driver *drv, return id; } -/* vmbus_add_dynid - add a new device ID to this driver and re-probe devices */ +/* vmbus_add_dynid - add a new device ID to this driver and re-probe devices + * + * This function can race with vmbus_device_register(). This function is + * typically running on a user thread in response to writing to the "new_id" + * sysfs entry for a driver. vmbus_device_register() is running on a + * workqueue thread in response to the Hyper-V host offering a device to the + * guest. This function calls driver_attach(), which looks for an existing + * device matching the new id, and attaches the driver to which the new id + * has been assigned. vmbus_device_register() calls device_register(), which + * looks for a driver that matches the device being registered. If both + * operations are running simultaneously, the device driver probe function runs + * on whichever thread establishes the linkage between the driver and device. + * + * In most cases, it doesn't matter which thread runs the driver probe + * function. But if vmbus_device_register() does not find a matching driver, + * it proceeds to create the "channels" subdirectory and numbered per-channel + * subdirectory in sysfs. While that multi-step creation is in progress, this + * function could run the driver probe function. If the probe function checks + * for, or operates on, entries in the "channels" subdirectory, including by + * calling hv_create_ring_sysfs(), the operation may or may not succeed + * depending on the race. The race can't create a kernel failure in VMBus + * or device subsystem code, but probe functions in VMBus drivers doing such + * operations must be prepared for the failure case. + */ static int vmbus_add_dynid(struct hv_driver *drv, guid_t *guid) { struct vmbus_dynid *dynid; @@ -880,9 +852,9 @@ ATTRIBUTE_GROUPS(vmbus_drv); /* * vmbus_match - Attempt to match the specified device to the specified driver */ -static int vmbus_match(struct device *device, struct device_driver *driver) +static int vmbus_match(struct device *device, const struct device_driver *driver) { - struct hv_driver *drv = drv_to_hv_drv(driver); + const struct hv_driver *drv = drv_to_hv_drv(driver); struct hv_device *hv_dev = device_to_hv_device(device); /* The hv_sock driver handles all hv_sock offers. */ @@ -932,7 +904,7 @@ static int vmbus_dma_configure(struct device *child_device) * On x86/x64 coherence is assumed and these calls have no effect. 
*/ hv_setup_dma_ops(child_device, - device_get_dma_attr(&hv_acpi_dev->dev) == DEV_DMA_COHERENT); + device_get_dma_attr(vmbus_root_device) == DEV_DMA_COHERENT); return 0; } @@ -1049,7 +1021,7 @@ static const struct dev_pm_ops vmbus_pm = { }; /* The one and only one */ -static struct bus_type hv_bus = { +static const struct bus_type hv_bus = { .name = "vmbus", .match = vmbus_match, .shutdown = vmbus_shutdown, @@ -1086,12 +1058,9 @@ static void vmbus_onmessage_work(struct work_struct *work) kfree(ctx); } -void vmbus_on_msg_dpc(unsigned long data) +static void __vmbus_on_msg_dpc(void *message_page_addr) { - struct hv_per_cpu_context *hv_cpu = (void *)data; - void *page_addr = hv_cpu->synic_message_page; - struct hv_message msg_copy, *msg = (struct hv_message *)page_addr + - VMBUS_MESSAGE_SINT; + struct hv_message msg_copy, *msg; struct vmbus_channel_message_header *hdr; enum vmbus_channel_message_type msgtype; const struct vmbus_channel_message_table_entry *entry; @@ -1099,6 +1068,10 @@ void vmbus_on_msg_dpc(unsigned long data) __u8 payload_size; u32 message_type; + if (!message_page_addr) + return; + msg = (struct hv_message *)message_page_addr + VMBUS_MESSAGE_SINT; + /* * 'enum vmbus_channel_message_type' is supposed to always be 'u32' as * it is being used in 'struct vmbus_channel_message_header' definition @@ -1224,6 +1197,14 @@ msg_handled: vmbus_signal_eom(msg, message_type); } +void vmbus_on_msg_dpc(unsigned long data) +{ + struct hv_per_cpu_context *hv_cpu = (void *)data; + + __vmbus_on_msg_dpc(hv_cpu->hyp_synic_message_page); + __vmbus_on_msg_dpc(hv_cpu->para_synic_message_page); +} + #ifdef CONFIG_PM_SLEEP /* * Fake RESCIND_CHANNEL messages to clean up hv_sock channels by force for @@ -1262,21 +1243,19 @@ static void vmbus_force_channel_rescinded(struct vmbus_channel *channel) #endif /* CONFIG_PM_SLEEP */ /* - * Schedule all channels with events pending + * Schedule all channels with events pending. + * The event page can be directly checked to get the id of + * the channel that has the interrupt pending. */ -static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu) +static void vmbus_chan_sched(void *event_page_addr) { unsigned long *recv_int_page; u32 maxbits, relid; + union hv_synic_event_flags *event; - /* - * The event page can be directly checked to get the id of - * the channel that has the interrupt pending. - */ - void *page_addr = hv_cpu->synic_event_page; - union hv_synic_event_flags *event - = (union hv_synic_event_flags *)page_addr + - VMBUS_MESSAGE_SINT; + if (!event_page_addr) + return; + event = (union hv_synic_event_flags *)event_page_addr + VMBUS_MESSAGE_SINT; maxbits = HV_EVENT_FLAGS_COUNT; recv_int_page = event->flags; @@ -1284,6 +1263,11 @@ static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu) if (unlikely(!recv_int_page)) return; + /* + * Suggested-by: Michael Kelley <mhklinux@outlook.com> + * One possible optimization would be to keep track of the largest relID that's in use, + * and only scan up to that relID. 
+ */ for_each_set_bit(relid, recv_int_page, maxbits) { void (*callback_fn)(void *context); struct vmbus_channel *channel; @@ -1347,29 +1331,39 @@ sched_unlock_rcu: } } -static void vmbus_isr(void) +static void vmbus_message_sched(struct hv_per_cpu_context *hv_cpu, void *message_page_addr) { - struct hv_per_cpu_context *hv_cpu - = this_cpu_ptr(hv_context.cpu_context); - void *page_addr; struct hv_message *msg; - vmbus_chan_sched(hv_cpu); - - page_addr = hv_cpu->synic_message_page; - msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT; + if (!message_page_addr) + return; + msg = (struct hv_message *)message_page_addr + VMBUS_MESSAGE_SINT; /* Check if there are actual msgs to be processed */ if (msg->header.message_type != HVMSG_NONE) { if (msg->header.message_type == HVMSG_TIMER_EXPIRED) { hv_stimer0_isr(); vmbus_signal_eom(msg, HVMSG_TIMER_EXPIRED); - } else + } else { tasklet_schedule(&hv_cpu->msg_dpc); + } } +} + +void vmbus_isr(void) +{ + struct hv_per_cpu_context *hv_cpu + = this_cpu_ptr(hv_context.cpu_context); + + vmbus_chan_sched(hv_cpu->hyp_synic_event_page); + vmbus_chan_sched(hv_cpu->para_synic_event_page); + + vmbus_message_sched(hv_cpu, hv_cpu->hyp_synic_message_page); + vmbus_message_sched(hv_cpu, hv_cpu->para_synic_message_page); add_interrupt_randomness(vmbus_interrupt); } +EXPORT_SYMBOL_FOR_MODULES(vmbus_isr, "mshv_vtl"); static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id) { @@ -1377,97 +1371,65 @@ static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id) return IRQ_HANDLED; } -/* - * Callback from kmsg_dump. Grab as much as possible from the end of the kmsg - * buffer and call into Hyper-V to transfer the data. - */ -static void hv_kmsg_dump(struct kmsg_dumper *dumper, - enum kmsg_dump_reason reason) +static void vmbus_percpu_work(struct work_struct *work) { - struct kmsg_dump_iter iter; - size_t bytes_written; + unsigned int cpu = smp_processor_id(); - /* We are only interested in panics. */ - if ((reason != KMSG_DUMP_PANIC) || (!sysctl_record_panic_msg)) - return; - - /* - * Write dump contents to the page. No need to synchronize; panic should - * be single-threaded. - */ - kmsg_dump_rewind(&iter); - kmsg_dump_get_buffer(&iter, false, hv_panic_page, HV_HYP_PAGE_SIZE, - &bytes_written); - if (!bytes_written) - return; - /* - * P3 to contain the physical address of the panic page & P4 to - * contain the size of the panic data in that page. Rest of the - * registers are no-op when the NOTIFY_MSG flag is set. - */ - hv_set_register(HV_REGISTER_CRASH_P0, 0); - hv_set_register(HV_REGISTER_CRASH_P1, 0); - hv_set_register(HV_REGISTER_CRASH_P2, 0); - hv_set_register(HV_REGISTER_CRASH_P3, virt_to_phys(hv_panic_page)); - hv_set_register(HV_REGISTER_CRASH_P4, bytes_written); - - /* - * Let Hyper-V know there is crash data available along with - * the panic message. 
- */ - hv_set_register(HV_REGISTER_CRASH_CTL, - (HV_CRASH_CTL_CRASH_NOTIFY | HV_CRASH_CTL_CRASH_NOTIFY_MSG)); + hv_synic_init(cpu); } -static struct kmsg_dumper hv_kmsg_dumper = { - .dump = hv_kmsg_dump, -}; - -static void hv_kmsg_dump_register(void) +static int vmbus_alloc_synic_and_connect(void) { - int ret; + int ret, cpu; + struct work_struct __percpu *works; + int hyperv_cpuhp_online; - hv_panic_page = hv_alloc_hyperv_zeroed_page(); - if (!hv_panic_page) { - pr_err("Hyper-V: panic message page memory allocation failed\n"); - return; + ret = hv_synic_alloc(); + if (ret < 0) + goto err_alloc; + + works = alloc_percpu(struct work_struct); + if (!works) { + ret = -ENOMEM; + goto err_alloc; } - ret = kmsg_dump_register(&hv_kmsg_dumper); - if (ret) { - pr_err("Hyper-V: kmsg dump register error 0x%x\n", ret); - hv_free_hyperv_page((unsigned long)hv_panic_page); - hv_panic_page = NULL; + /* + * Initialize the per-cpu interrupt state and stimer state. + * Then connect to the host. + */ + cpus_read_lock(); + for_each_online_cpu(cpu) { + struct work_struct *work = per_cpu_ptr(works, cpu); + + INIT_WORK(work, vmbus_percpu_work); + schedule_work_on(cpu, work); } -} -static struct ctl_table_header *hv_ctl_table_hdr; + for_each_online_cpu(cpu) + flush_work(per_cpu_ptr(works, cpu)); -/* - * sysctl option to allow the user to control whether kmsg data should be - * reported to Hyper-V on panic. - */ -static struct ctl_table hv_ctl_table[] = { - { - .procname = "hyperv_record_panic_msg", - .data = &sysctl_record_panic_msg, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE - }, - {} -}; + /* Register the callbacks for possible CPU online/offline'ing */ + ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online", + hv_synic_init, hv_synic_cleanup); + cpus_read_unlock(); + free_percpu(works); + if (ret < 0) + goto err_alloc; + hyperv_cpuhp_online = ret; -static struct ctl_table hv_root_table[] = { - { - .procname = "kernel", - .mode = 0555, - .child = hv_ctl_table - }, - {} -}; + ret = vmbus_connect(); + if (ret) + goto err_connect; + return 0; + +err_connect: + cpuhp_remove_state(hyperv_cpuhp_online); + return -ENODEV; +err_alloc: + hv_synic_free(); + return -ENOMEM; +} /* * vmbus_bus_init -Main vmbus driver initialization routine. @@ -1514,56 +1476,18 @@ static int vmbus_bus_init(void) } } - ret = hv_synic_alloc(); - if (ret) - goto err_alloc; - /* - * Initialize the per-cpu interrupt state and stimer state. - * Then connect to the host. + * Cache the value as getting it involves a VM exit on x86(_64), and + * doing that on each VP while initializing SynIC's wastes time. 
*/ - ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online", - hv_synic_init, hv_synic_cleanup); - if (ret < 0) - goto err_cpuhp; - hyperv_cpuhp_online = ret; - - ret = vmbus_connect(); + is_confidential = ms_hyperv.confidential_vmbus_available; + if (is_confidential) + pr_info("Establishing connection to the confidential VMBus\n"); + hv_para_set_sint_proxy(!is_confidential); + ret = vmbus_alloc_synic_and_connect(); if (ret) goto err_connect; - if (hv_is_isolation_supported()) - sysctl_record_panic_msg = 0; - - /* - * Only register if the crash MSRs are available - */ - if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) { - u64 hyperv_crash_ctl; - /* - * Panic message recording (sysctl_record_panic_msg) - * is enabled by default in non-isolated guests and - * disabled by default in isolated guests; the panic - * message recording won't be available in isolated - * guests should the following registration fail. - */ - hv_ctl_table_hdr = register_sysctl_table(hv_root_table); - if (!hv_ctl_table_hdr) - pr_err("Hyper-V: sysctl table register error"); - - /* - * Register for panic kmsg callback only if the right - * capability is supported by the hypervisor. - */ - hyperv_crash_ctl = hv_get_register(HV_REGISTER_CRASH_CTL); - if (hyperv_crash_ctl & HV_CRASH_CTL_CRASH_NOTIFY_MSG) - hv_kmsg_dump_register(); - - register_die_notifier(&hyperv_die_report_block); - atomic_notifier_chain_register(&panic_notifier_list, - &hyperv_panic_report_block); - } - /* * Always register the vmbus unload panic notifier because we * need to shut the VMbus channel connection on panic. @@ -1576,10 +1500,6 @@ static int vmbus_bus_init(void) return 0; err_connect: - cpuhp_remove_state(hyperv_cpuhp_online); -err_cpuhp: - hv_synic_free(); -err_alloc: if (vmbus_irq == -1) { hv_remove_vmbus_handler(); } else { @@ -1588,8 +1508,6 @@ err_alloc: } err_setup: bus_unregister(&hv_bus); - unregister_sysctl_table(hv_ctl_table_hdr); - hv_ctl_table_hdr = NULL; return ret; } @@ -1781,16 +1699,16 @@ static ssize_t target_cpu_show(struct vmbus_channel *channel, char *buf) { return sprintf(buf, "%u\n", channel->target_cpu); } -static ssize_t target_cpu_store(struct vmbus_channel *channel, - const char *buf, size_t count) + +int vmbus_channel_set_cpu(struct vmbus_channel *channel, u32 target_cpu) { - u32 target_cpu, origin_cpu; - ssize_t ret = count; + u32 origin_cpu; + int ret = 0; - if (vmbus_proto_version < VERSION_WIN10_V4_1) - return -EIO; + lockdep_assert_cpus_held(); + lockdep_assert_held(&vmbus_connection.channel_mutex); - if (sscanf(buf, "%uu", &target_cpu) != 1) + if (vmbus_proto_version < VERSION_WIN10_V4_1) return -EIO; /* Validate target_cpu for the cpumask_test_cpu() operation below. */ @@ -1800,22 +1718,17 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel, if (!cpumask_test_cpu(target_cpu, housekeeping_cpumask(HK_TYPE_MANAGED_IRQ))) return -EINVAL; - /* No CPUs should come up or down during this. */ - cpus_read_lock(); - - if (!cpu_online(target_cpu)) { - cpus_read_unlock(); + if (!cpu_online(target_cpu)) return -EINVAL; - } /* - * Synchronizes target_cpu_store() and channel closure: + * Synchronizes vmbus_channel_set_cpu() and channel closure: * * { Initially: state = CHANNEL_OPENED } * * CPU1 CPU2 * - * [target_cpu_store()] [vmbus_disconnect_ring()] + * [vmbus_channel_set_cpu()] [vmbus_disconnect_ring()] * * LOCK channel_mutex LOCK channel_mutex * LOAD r1 = state LOAD r2 = state @@ -1830,7 +1743,6 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel, * Note. 
The host processes the channel messages "sequentially", in * the order in which they are received on a per-partition basis. */ - mutex_lock(&vmbus_connection.channel_mutex); /* * Hyper-V will ignore MODIFYCHANNEL messages for "non-open" channels; @@ -1838,17 +1750,17 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel, */ if (channel->state != CHANNEL_OPENED_STATE) { ret = -EIO; - goto cpu_store_unlock; + goto end; } origin_cpu = channel->target_cpu; if (target_cpu == origin_cpu) - goto cpu_store_unlock; + goto end; if (vmbus_send_modifychannel(channel, hv_cpu_number_to_vp_number(target_cpu))) { ret = -EIO; - goto cpu_store_unlock; + goto end; } /* @@ -1878,10 +1790,26 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel, origin_cpu, target_cpu); } -cpu_store_unlock: +end: + return ret; +} + +static ssize_t target_cpu_store(struct vmbus_channel *channel, + const char *buf, size_t count) +{ + u32 target_cpu; + ssize_t ret; + + if (sscanf(buf, "%u", &target_cpu) != 1) + return -EIO; + + cpus_read_lock(); + mutex_lock(&vmbus_connection.channel_mutex); + ret = vmbus_channel_set_cpu(channel, target_cpu); mutex_unlock(&vmbus_connection.channel_mutex); cpus_read_unlock(); - return ret; + + return ret ?: count; } static VMBUS_CHAN_ATTR(cpu, 0644, target_cpu_show, target_cpu_store); @@ -1962,6 +1890,26 @@ static ssize_t subchannel_id_show(struct vmbus_channel *channel, } static VMBUS_CHAN_ATTR_RO(subchannel_id); +static int hv_mmap_ring_buffer_wrapper(struct file *filp, struct kobject *kobj, + const struct bin_attribute *attr, + struct vm_area_struct *vma) +{ + struct vmbus_channel *channel = container_of(kobj, struct vmbus_channel, kobj); + + /* + * hv_(create|remove)_ring_sysfs implementation ensures that mmap_ring_buffer + * is not NULL. + */ + return channel->mmap_ring_buffer(channel, vma); +} + +static struct bin_attribute chan_attr_ring_buffer = { + .attr = { + .name = "ring", + .mode = 0600, + }, + .mmap = hv_mmap_ring_buffer_wrapper, +}; static struct attribute *vmbus_chan_attrs[] = { &chan_attr_out_mask.attr, &chan_attr_in_mask.attr, @@ -1981,6 +1929,11 @@ static struct attribute *vmbus_chan_attrs[] = { NULL }; +static const struct bin_attribute *vmbus_chan_bin_attrs[] = { + &chan_attr_ring_buffer, + NULL +}; + /* * Channel-level attribute_group callback function. Returns the permission for * each attribute, and returns 0 if an attribute is not visible. 
@@ -2001,16 +1954,99 @@ static umode_t vmbus_chan_attr_is_visible(struct kobject *kobj, return attr->mode; } -static struct attribute_group vmbus_chan_group = { +static umode_t vmbus_chan_bin_attr_is_visible(struct kobject *kobj, + const struct bin_attribute *attr, int idx) +{ + const struct vmbus_channel *channel = + container_of(kobj, struct vmbus_channel, kobj); + + /* Hide ring attribute if channel's ring_sysfs_visible is set to false */ + if (attr == &chan_attr_ring_buffer && !channel->ring_sysfs_visible) + return 0; + + return attr->attr.mode; +} + +static size_t vmbus_chan_bin_size(struct kobject *kobj, + const struct bin_attribute *bin_attr, int a) +{ + const struct vmbus_channel *channel = + container_of(kobj, struct vmbus_channel, kobj); + + return channel->ringbuffer_pagecount << PAGE_SHIFT; +} + +static const struct attribute_group vmbus_chan_group = { .attrs = vmbus_chan_attrs, - .is_visible = vmbus_chan_attr_is_visible + .bin_attrs = vmbus_chan_bin_attrs, + .is_visible = vmbus_chan_attr_is_visible, + .is_bin_visible = vmbus_chan_bin_attr_is_visible, + .bin_size = vmbus_chan_bin_size, }; -static struct kobj_type vmbus_chan_ktype = { +static const struct kobj_type vmbus_chan_ktype = { .sysfs_ops = &vmbus_chan_sysfs_ops, .release = vmbus_chan_release, }; +/** + * hv_create_ring_sysfs() - create "ring" sysfs entry corresponding to ring buffers for a channel. + * @channel: Pointer to vmbus_channel structure + * @hv_mmap_ring_buffer: function pointer for initializing the function to be called on mmap of + * channel's "ring" sysfs node, which is for the ring buffer of that channel. + * Function pointer is of below type: + * int (*hv_mmap_ring_buffer)(struct vmbus_channel *channel, + * struct vm_area_struct *vma)) + * This has a pointer to the channel and a pointer to vm_area_struct, + * used for mmap, as arguments. + * + * Sysfs node for ring buffer of a channel is created along with other fields, however its + * visibility is disabled by default. Sysfs creation needs to be controlled when the use-case + * is running. + * For example, HV_NIC device is used either by uio_hv_generic or hv_netvsc at any given point of + * time, and "ring" sysfs is needed only when uio_hv_generic is bound to that device. To avoid + * exposing the ring buffer by default, this function is responsible to enable visibility of + * ring for userspace to use. + * Note: Race conditions can happen with userspace and it is not encouraged to create new + * use-cases for this. This was added to maintain backward compatibility, while solving + * one of the race conditions in uio_hv_generic while creating sysfs. See comments with + * vmbus_add_dynid() and vmbus_device_register(). + * + * Returns 0 on success or error code on failure. + */ +int hv_create_ring_sysfs(struct vmbus_channel *channel, + int (*hv_mmap_ring_buffer)(struct vmbus_channel *channel, + struct vm_area_struct *vma)) +{ + struct kobject *kobj = &channel->kobj; + + channel->mmap_ring_buffer = hv_mmap_ring_buffer; + channel->ring_sysfs_visible = true; + + return sysfs_update_group(kobj, &vmbus_chan_group); +} +EXPORT_SYMBOL_GPL(hv_create_ring_sysfs); + +/** + * hv_remove_ring_sysfs() - remove ring sysfs entry corresponding to ring buffers for a channel. + * @channel: Pointer to vmbus_channel structure + * + * Hide "ring" sysfs for a channel by changing its is_visible attribute and updating sysfs group. + * + * Returns 0 on success or error code on failure. 
+ */ +int hv_remove_ring_sysfs(struct vmbus_channel *channel) +{ + struct kobject *kobj = &channel->kobj; + int ret; + + channel->ring_sysfs_visible = false; + ret = sysfs_update_group(kobj, &vmbus_chan_group); + channel->mmap_ring_buffer = NULL; + return ret; +} +EXPORT_SYMBOL_GPL(hv_remove_ring_sysfs); + /* * vmbus_add_channel_kobj - setup a sub-directory under device/channels */ @@ -2090,7 +2126,7 @@ int vmbus_device_register(struct hv_device *child_device_obj) &child_device_obj->channel->offermsg.offer.if_instance); child_device_obj->device.bus = &hv_bus; - child_device_obj->device.parent = &hv_acpi_dev->dev; + child_device_obj->device.parent = vmbus_root_device; child_device_obj->device.release = vmbus_device_release; child_device_obj->device.dma_parms = &child_device_obj->dma_parms; @@ -2108,6 +2144,20 @@ int vmbus_device_register(struct hv_device *child_device_obj) return ret; } + /* + * If device_register() found a driver to assign to the device, the + * driver's probe function has already run at this point. If that + * probe function accesses or operates on the "channels" subdirectory + * in sysfs, those operations will have failed because the "channels" + * subdirectory doesn't exist until the code below runs. Or if the + * probe function creates a /dev entry, a user space program could + * find and open the /dev entry, and then create a race by accessing + * the "channels" subdirectory while the creation steps are in progress + * here. The race can't result in a kernel failure, but the user space + * program may get an error in accessing "channels" or its + * subdirectories. See also comments with vmbus_add_dynid() about a + * related race condition. + */ child_device_obj->channels_kset = kset_create_and_add("channels", NULL, kobj); if (!child_device_obj->channels_kset) { @@ -2118,7 +2168,7 @@ int vmbus_device_register(struct hv_device *child_device_obj) ret = vmbus_add_channel_kobj(child_device_obj, child_device_obj->channel); if (ret) { - pr_err("Unable to register primary channeln"); + pr_err("Unable to register primary channel\n"); goto err_kset_unregister; } hv_debug_add_dev_dir(child_device_obj); @@ -2150,13 +2200,13 @@ void vmbus_device_unregister(struct hv_device *device_obj) */ device_unregister(&device_obj->device); } +EXPORT_SYMBOL_GPL(vmbus_device_unregister); - +#ifdef CONFIG_ACPI /* * VMBUS is an acpi enumerated device. Get the information we * need from DSDT. 
*/ -#define VTPM_BASE_ADDRESS 0xfed40000 static acpi_status vmbus_walk_resources(struct acpi_resource *res, void *ctx) { resource_size_t start = 0; @@ -2261,8 +2311,9 @@ static acpi_status vmbus_walk_resources(struct acpi_resource *res, void *ctx) return AE_OK; } +#endif -static void vmbus_acpi_remove(struct acpi_device *device) +static void vmbus_mmio_remove(void) { struct resource *cur_res; struct resource *next_res; @@ -2281,15 +2332,17 @@ static void vmbus_acpi_remove(struct acpi_device *device) } } -static void vmbus_reserve_fb(void) +static void __maybe_unused vmbus_reserve_fb(void) { resource_size_t start = 0, size; struct pci_dev *pdev; if (efi_enabled(EFI_BOOT)) { /* Gen2 VM: get FB base from EFI framebuffer */ - start = screen_info.lfb_base; - size = max_t(__u32, screen_info.lfb_size, 0x800000); + if (IS_ENABLED(CONFIG_SYSFB)) { + start = screen_info.lfb_base; + size = max_t(__u32, screen_info.lfb_size, 0x800000); + } } else { /* Gen1 VM: get FB base from PCI */ pdev = pci_get_device(PCI_VENDOR_ID_MICROSOFT, @@ -2429,25 +2482,40 @@ void vmbus_free_mmio(resource_size_t start, resource_size_t size) struct resource *iter; mutex_lock(&hyperv_mmio_lock); + + /* + * If all bytes of the MMIO range to be released are within the + * special case fb_mmio shadow region, skip releasing the shadow + * region since no corresponding __request_region() was done + * in vmbus_allocate_mmio(). + */ + if (fb_mmio && start >= fb_mmio->start && + (start + size - 1 <= fb_mmio->end)) + goto skip_shadow_release; + for (iter = hyperv_mmio; iter; iter = iter->sibling) { if ((iter->start >= start + size) || (iter->end <= start)) continue; __release_region(iter, start, size); } + +skip_shadow_release: release_mem_region(start, size); mutex_unlock(&hyperv_mmio_lock); } EXPORT_SYMBOL_GPL(vmbus_free_mmio); -static int vmbus_acpi_add(struct acpi_device *device) +#ifdef CONFIG_ACPI +static int vmbus_acpi_add(struct platform_device *pdev) { acpi_status result; int ret_val = -ENODEV; struct acpi_device *ancestor; + struct acpi_device *device = ACPI_COMPANION(&pdev->dev); - hv_acpi_dev = device; + vmbus_root_device = &device->dev; /* * Older versions of Hyper-V for ARM64 fail to include the _CCA @@ -2473,7 +2541,8 @@ static int vmbus_acpi_add(struct acpi_device *device) * Some ancestor of the vmbus acpi device (Gen1 or Gen2 * firmware) is the VMOD that has the mmio ranges. Get that. */ - for (ancestor = acpi_dev_parent(device); ancestor; + for (ancestor = acpi_dev_parent(device); + ancestor && ancestor->handle != ACPI_ROOT_OBJECT; ancestor = acpi_dev_parent(ancestor)) { result = acpi_walk_resources(ancestor->handle, METHOD_NAME__CRS, vmbus_walk_resources, NULL); @@ -2489,9 +2558,94 @@ static int vmbus_acpi_add(struct acpi_device *device) acpi_walk_err: if (ret_val) - vmbus_acpi_remove(device); + vmbus_mmio_remove(); return ret_val; } +#else +static int vmbus_acpi_add(struct platform_device *pdev) +{ + return 0; +} +#endif +#ifndef HYPERVISOR_CALLBACK_VECTOR +static int vmbus_set_irq(struct platform_device *pdev) +{ + struct irq_data *data; + int irq; + irq_hw_number_t hwirq; + + irq = platform_get_irq(pdev, 0); + /* platform_get_irq() may not return 0. 
*/ + if (irq < 0) + return irq; + + data = irq_get_irq_data(irq); + if (!data) { + pr_err("No interrupt data for VMBus virq %d\n", irq); + return -ENODEV; + } + hwirq = irqd_to_hwirq(data); + + vmbus_irq = irq; + vmbus_interrupt = hwirq; + pr_debug("VMBus virq %d, hwirq %d\n", vmbus_irq, vmbus_interrupt); + + return 0; +} +#endif + +static int vmbus_device_add(struct platform_device *pdev) +{ + struct resource **cur_res = &hyperv_mmio; + struct of_range range; + struct of_range_parser parser; + struct device_node *np = pdev->dev.of_node; + int ret; + + vmbus_root_device = &pdev->dev; + + ret = of_range_parser_init(&parser, np); + if (ret) + return ret; + +#ifndef HYPERVISOR_CALLBACK_VECTOR + ret = vmbus_set_irq(pdev); + if (ret) + return ret; +#endif + for_each_of_range(&parser, &range) { + struct resource *res; + + res = kzalloc(sizeof(*res), GFP_KERNEL); + if (!res) { + vmbus_mmio_remove(); + return -ENOMEM; + } + + res->name = "hyperv mmio"; + res->flags = range.flags; + res->start = range.cpu_addr; + res->end = range.cpu_addr + range.size; + + *cur_res = res; + cur_res = &res->sibling; + } + + return ret; +} + +static int vmbus_platform_driver_probe(struct platform_device *pdev) +{ + if (acpi_disabled) + return vmbus_device_add(pdev); + else + return vmbus_acpi_add(pdev); +} + +static void vmbus_platform_driver_remove(struct platform_device *pdev) +{ + vmbus_mmio_remove(); +} #ifdef CONFIG_PM_SLEEP static int vmbus_bus_suspend(struct device *dev) @@ -2537,11 +2691,6 @@ static int vmbus_bus_suspend(struct device *dev) if (atomic_read(&vmbus_connection.nr_chan_close_on_suspend) > 0) wait_for_completion(&vmbus_connection.ready_for_suspend_event); - if (atomic_read(&vmbus_connection.nr_chan_fixup_on_resume) != 0) { - pr_err("Can not suspend due to a previous failed resuming\n"); - return -EBUSY; - } - mutex_lock(&vmbus_connection.channel_mutex); list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { @@ -2566,22 +2715,18 @@ static int vmbus_bus_suspend(struct device *dev) pr_err("Sub-channel not deleted!\n"); WARN_ON_ONCE(1); } - - atomic_inc(&vmbus_connection.nr_chan_fixup_on_resume); } mutex_unlock(&vmbus_connection.channel_mutex); vmbus_initiate_unload(false); - /* Reset the event for the next resume. */ - reinit_completion(&vmbus_connection.ready_for_resume_event); - return 0; } static int vmbus_bus_resume(struct device *dev) { + struct vmbus_channel *channel; struct vmbus_channel_msginfo *msginfo; size_t msgsize; int ret; @@ -2612,13 +2757,23 @@ static int vmbus_bus_resume(struct device *dev) if (ret != 0) return ret; - WARN_ON(atomic_read(&vmbus_connection.nr_chan_fixup_on_resume) == 0); - vmbus_request_offers(); - if (wait_for_completion_timeout( - &vmbus_connection.ready_for_resume_event, 10 * HZ) == 0) - pr_err("Some vmbus device is missing after suspending?\n"); + mutex_lock(&vmbus_connection.channel_mutex); + list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { + if (channel->offermsg.child_relid != INVALID_RELID) + continue; + + /* hvsock channels are not expected to be present. */ + if (is_hvsock_channel(channel)) + continue; + + pr_err("channel %pUl/%pUl not present after resume.\n", + &channel->offermsg.offer.if_type, + &channel->offermsg.offer.if_instance); + /* ToDo: Cleanup these channels here */ + } + mutex_unlock(&vmbus_connection.channel_mutex); /* Reset the event for the next suspend. 
*/ reinit_completion(&vmbus_connection.ready_for_suspend_event); @@ -2630,7 +2785,17 @@ static int vmbus_bus_resume(struct device *dev) #define vmbus_bus_resume NULL #endif /* CONFIG_PM_SLEEP */ -static const struct acpi_device_id vmbus_acpi_device_ids[] = { +static const __maybe_unused struct of_device_id vmbus_of_match[] = { + { + .compatible = "microsoft,vmbus", + }, + { + /* sentinel */ + }, +}; +MODULE_DEVICE_TABLE(of, vmbus_of_match); + +static const __maybe_unused struct acpi_device_id vmbus_acpi_device_ids[] = { {"VMBUS", 0}, {"VMBus", 0}, {"", 0}, @@ -2658,15 +2823,16 @@ static const struct dev_pm_ops vmbus_bus_pm = { .restore_noirq = vmbus_bus_resume }; -static struct acpi_driver vmbus_acpi_driver = { - .name = "vmbus", - .ids = vmbus_acpi_device_ids, - .ops = { - .add = vmbus_acpi_add, - .remove = vmbus_acpi_remove, - }, - .drv.pm = &vmbus_bus_pm, - .drv.probe_type = PROBE_FORCE_SYNCHRONOUS, +static struct platform_driver vmbus_platform_driver = { + .probe = vmbus_platform_driver_probe, + .remove = vmbus_platform_driver_remove, + .driver = { + .name = "vmbus", + .acpi_match_table = ACPI_PTR(vmbus_acpi_device_ids), + .of_match_table = of_match_ptr(vmbus_of_match), + .pm = &vmbus_bus_pm, + .probe_type = PROBE_FORCE_SYNCHRONOUS, + } }; static void hv_kexec_handler(void) @@ -2690,10 +2856,10 @@ static void hv_crash_handler(struct pt_regs *regs) */ cpu = smp_processor_id(); hv_stimer_cleanup(cpu); - hv_synic_disable_regs(cpu); + hv_hyp_synic_disable_regs(cpu); }; -static int hv_synic_suspend(void) +static int hv_synic_suspend(void *data) { /* * When we reach here, all the non-boot CPUs have been offlined. @@ -2715,14 +2881,14 @@ static int hv_synic_suspend(void) * interrupts-disabled context. */ - hv_synic_disable_regs(0); + hv_hyp_synic_disable_regs(0); return 0; } -static void hv_synic_resume(void) +static void hv_synic_resume(void *data) { - hv_synic_enable_regs(0); + hv_hyp_synic_enable_regs(0); /* * Note: we don't need to call hv_stimer_init(0), because the timer @@ -2732,11 +2898,15 @@ static void hv_synic_resume(void) } /* The callbacks run only on CPU0, with irqs_disabled. */ -static struct syscore_ops hv_synic_syscore_ops = { +static const struct syscore_ops hv_synic_syscore_ops = { .suspend = hv_synic_suspend, .resume = hv_synic_resume, }; +static struct syscore hv_synic_syscore = { + .ops = &hv_synic_syscore_ops, +}; + static int __init hv_acpi_init(void) { int ret; @@ -2744,18 +2914,17 @@ static int __init hv_acpi_init(void) if (!hv_is_hyperv_initialized()) return -ENODEV; - if (hv_root_partition) + if (hv_root_partition() && !hv_nested) return 0; /* * Get ACPI resources first. 
*/ - ret = acpi_bus_register_driver(&vmbus_acpi_driver); - + ret = platform_driver_register(&vmbus_platform_driver); if (ret) return ret; - if (!hv_acpi_dev) { + if (!vmbus_root_device) { ret = -ENODEV; goto cleanup; } @@ -2780,13 +2949,13 @@ static int __init hv_acpi_init(void) hv_setup_kexec_handler(hv_kexec_handler); hv_setup_crash_handler(hv_crash_handler); - register_syscore_ops(&hv_synic_syscore_ops); + register_syscore(&hv_synic_syscore); return 0; cleanup: - acpi_bus_unregister_driver(&vmbus_acpi_driver); - hv_acpi_dev = NULL; + platform_driver_unregister(&vmbus_platform_driver); + vmbus_root_device = NULL; return ret; } @@ -2794,7 +2963,7 @@ static void __exit vmbus_exit(void) { int cpu; - unregister_syscore_ops(&hv_synic_syscore_ops); + unregister_syscore(&hv_synic_syscore); hv_remove_kexec_handler(); hv_remove_crash_handler(); @@ -2818,13 +2987,6 @@ static void __exit vmbus_exit(void) vmbus_free_channels(); kfree(vmbus_connection.channels); - if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) { - kmsg_dump_unregister(&hv_kmsg_dumper); - unregister_die_notifier(&hyperv_die_report_block); - atomic_notifier_chain_unregister(&panic_notifier_list, - &hyperv_panic_report_block); - } - /* * The vmbus panic notifier is always registered, hence we should * also unconditionally unregister it here as well. @@ -2832,14 +2994,11 @@ static void __exit vmbus_exit(void) atomic_notifier_chain_unregister(&panic_notifier_list, &hyperv_panic_vmbus_unload_block); - free_page((unsigned long)hv_panic_page); - unregister_sysctl_table(hv_ctl_table_hdr); - hv_ctl_table_hdr = NULL; bus_unregister(&hv_bus); cpuhp_remove_state(hyperv_cpuhp_online); hv_synic_free(); - acpi_bus_unregister_driver(&vmbus_acpi_driver); + platform_driver_unregister(&vmbus_platform_driver); } |
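Note on the ring-buffer change above: the setup code now always maps the same set of pages twice, back to back, so a read or write that crosses the end of the ring continues seamlessly at the beginning without being split, and the mapping is created decrypted unless the channel is confidential. The same wraparound idea can be sketched in ordinary userspace C with memfd_create() and two MAP_FIXED mappings of the same pages; everything below (names, the one-page size, the demo string) is purely illustrative and is not part of the driver.

/* Userspace sketch of the "map the buffer twice" wraparound trick that the
 * ring-buffer code uses via vmap(): two consecutive virtual ranges alias the
 * same pages, so writes that cross the end of the buffer wrap transparently.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t size = 4096;			/* one page of ring data for the demo */
	int fd = memfd_create("ring-demo", 0);	/* glibc >= 2.27 */

	if (fd < 0 || ftruncate(fd, size) < 0)
		return 1;

	/* Reserve 2 * size of address space, then map the same pages twice,
	 * back to back, just as vmap() does with the doubled page array. */
	char *base = mmap(NULL, 2 * size, PROT_NONE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (base == MAP_FAILED)
		return 1;
	if (mmap(base, size, PROT_READ | PROT_WRITE,
		 MAP_SHARED | MAP_FIXED, fd, 0) == MAP_FAILED ||
	    mmap(base + size, size, PROT_READ | PROT_WRITE,
		 MAP_SHARED | MAP_FIXED, fd, 0) == MAP_FAILED)
		return 1;

	/* A write starting 4 bytes before the end wraps transparently. */
	memcpy(base + size - 4, "wraparound", sizeof("wraparound"));

	/* The overflowing bytes alias the start of the buffer. */
	printf("start of buffer: \"%s\"\n", base);	/* prints "around" */
	return 0;
}

The benefit, in the driver as in this sketch, is that ring readers and writers never have to handle a split copy at the wrap point.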
