summaryrefslogtreecommitdiff
path: root/drivers/hv
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/hv')
-rw-r--r--drivers/hv/Kconfig29
-rw-r--r--drivers/hv/Makefile9
-rw-r--r--drivers/hv/channel.c75
-rw-r--r--drivers/hv/channel_mgmt.c27
-rw-r--r--drivers/hv/connection.c6
-rw-r--r--drivers/hv/hv.c377
-rw-r--r--drivers/hv/hv_common.c27
-rw-r--r--drivers/hv/hv_util.c2
-rw-r--r--drivers/hv/hyperv_vmbus.h76
-rw-r--r--drivers/hv/mshv_common.c99
-rw-r--r--drivers/hv/mshv_eventfd.c8
-rw-r--r--drivers/hv/mshv_irq.c4
-rw-r--r--drivers/hv/mshv_regions.c555
-rw-r--r--drivers/hv/mshv_root.h57
-rw-r--r--drivers/hv/mshv_root_hv_call.c196
-rw-r--r--drivers/hv/mshv_root_main.c778
-rw-r--r--drivers/hv/mshv_synic.c6
-rw-r--r--drivers/hv/mshv_vtl.h25
-rw-r--r--drivers/hv/mshv_vtl_main.c1392
-rw-r--r--drivers/hv/ring_buffer.c5
-rw-r--r--drivers/hv/vmbus_drv.c202
21 files changed, 3324 insertions, 631 deletions
diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
index 0b8c391a0342..7937ac0cbd0f 100644
--- a/drivers/hv/Kconfig
+++ b/drivers/hv/Kconfig
@@ -17,7 +17,8 @@ config HYPERV
config HYPERV_VTL_MODE
bool "Enable Linux to boot in VTL context"
- depends on (X86_64 || ARM64) && HYPERV
+ depends on (X86_64 && HAVE_STATIC_CALL) || ARM64
+ depends on HYPERV
depends on SMP
default n
help
@@ -75,6 +76,8 @@ config MSHV_ROOT
depends on PAGE_SIZE_4KB
select EVENTFD
select VIRT_XFER_TO_GUEST_WORK
+ select HMM_MIRROR
+ select MMU_NOTIFIER
default n
help
Select this option to enable support for booting and running as root
@@ -82,4 +85,28 @@ config MSHV_ROOT
If unsure, say N.
+config MSHV_VTL
+ tristate "Microsoft Hyper-V VTL driver"
+ depends on X86_64 && HYPERV_VTL_MODE
+ depends on HYPERV_VMBUS
+ # Mapping VTL0 memory to a userspace process in VTL2 is supported in OpenHCL.
+ # VTL2 for OpenHCL makes use of Huge Pages to improve performance on VMs,
+ # specially with large memory requirements.
+ depends on TRANSPARENT_HUGEPAGE
+ # MTRRs are controlled by VTL0, and are not specific to individual VTLs.
+ # Therefore, do not attempt to access or modify MTRRs here.
+ depends on !MTRR
+ select CPUMASK_OFFSTACK
+ select VIRT_XFER_TO_GUEST_WORK
+ default n
+ help
+ Select this option to enable Hyper-V VTL driver support.
+ This driver provides interfaces for Virtual Machine Manager (VMM) running in VTL2
+ userspace to create VTLs and partitions, setup and manage VTL0 memory and
+ allow userspace to make direct hypercalls. This also allows to map VTL0's address
+ space to a usermode process in VTL2 and supports getting new VMBus messages and channel
+ events in VTL2.
+
+ If unsure, say N.
+
endmenu
diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile
index 1a1677bf4dac..a49f93c2d245 100644
--- a/drivers/hv/Makefile
+++ b/drivers/hv/Makefile
@@ -3,6 +3,7 @@ obj-$(CONFIG_HYPERV_VMBUS) += hv_vmbus.o
obj-$(CONFIG_HYPERV_UTILS) += hv_utils.o
obj-$(CONFIG_HYPERV_BALLOON) += hv_balloon.o
obj-$(CONFIG_MSHV_ROOT) += mshv_root.o
+obj-$(CONFIG_MSHV_VTL) += mshv_vtl.o
CFLAGS_hv_trace.o = -I$(src)
CFLAGS_hv_balloon.o = -I$(src)
@@ -13,8 +14,12 @@ hv_vmbus-y := vmbus_drv.o \
hv_vmbus-$(CONFIG_HYPERV_TESTING) += hv_debugfs.o
hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_utils_transport.o
mshv_root-y := mshv_root_main.o mshv_synic.o mshv_eventfd.o mshv_irq.o \
- mshv_root_hv_call.o mshv_portid_table.o
+ mshv_root_hv_call.o mshv_portid_table.o mshv_regions.o
+mshv_vtl-y := mshv_vtl_main.o
# Code that must be built-in
obj-$(CONFIG_HYPERV) += hv_common.o
-obj-$(subst m,y,$(CONFIG_MSHV_ROOT)) += hv_proc.o mshv_common.o
+obj-$(subst m,y,$(CONFIG_MSHV_ROOT)) += hv_proc.o
+ifneq ($(CONFIG_MSHV_ROOT)$(CONFIG_MSHV_VTL),)
+ obj-y += mshv_common.o
+endif
diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index 162d6aeece7b..6821f225248b 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -410,6 +410,21 @@ static int create_gpadl_header(enum hv_gpadl_type type, void *kbuffer,
return 0;
}
+static void vmbus_free_channel_msginfo(struct vmbus_channel_msginfo *msginfo)
+{
+ struct vmbus_channel_msginfo *submsginfo, *tmp;
+
+ if (!msginfo)
+ return;
+
+ list_for_each_entry_safe(submsginfo, tmp, &msginfo->submsglist,
+ msglistentry) {
+ kfree(submsginfo);
+ }
+
+ kfree(msginfo);
+}
+
/*
* __vmbus_establish_gpadl - Establish a GPADL for a buffer or ringbuffer
*
@@ -429,7 +444,7 @@ static int __vmbus_establish_gpadl(struct vmbus_channel *channel,
struct vmbus_channel_gpadl_header *gpadlmsg;
struct vmbus_channel_gpadl_body *gpadl_body;
struct vmbus_channel_msginfo *msginfo = NULL;
- struct vmbus_channel_msginfo *submsginfo, *tmp;
+ struct vmbus_channel_msginfo *submsginfo;
struct list_head *curr;
u32 next_gpadl_handle;
unsigned long flags;
@@ -444,20 +459,24 @@ static int __vmbus_establish_gpadl(struct vmbus_channel *channel,
return ret;
}
- /*
- * Set the "decrypted" flag to true for the set_memory_decrypted()
- * success case. In the failure case, the encryption state of the
- * memory is unknown. Leave "decrypted" as true to ensure the
- * memory will be leaked instead of going back on the free list.
- */
- gpadl->decrypted = true;
- ret = set_memory_decrypted((unsigned long)kbuffer,
- PFN_UP(size));
- if (ret) {
- dev_warn(&channel->device_obj->device,
- "Failed to set host visibility for new GPADL %d.\n",
- ret);
- return ret;
+ gpadl->decrypted = !((channel->co_external_memory && type == HV_GPADL_BUFFER) ||
+ (channel->co_ring_buffer && type == HV_GPADL_RING));
+ if (gpadl->decrypted) {
+ /*
+ * The "decrypted" flag being true assumes that set_memory_decrypted() succeeds.
+ * But if it fails, the encryption state of the memory is unknown. In that case,
+ * leave "decrypted" as true to ensure the memory is leaked instead of going back
+ * on the free list.
+ */
+ ret = set_memory_decrypted((unsigned long)kbuffer,
+ PFN_UP(size));
+ if (ret) {
+ dev_warn(&channel->device_obj->device,
+ "Failed to set host visibility for new GPADL %d.\n",
+ ret);
+ vmbus_free_channel_msginfo(msginfo);
+ return ret;
+ }
}
init_completion(&msginfo->waitevent);
@@ -532,12 +551,8 @@ cleanup:
spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
list_del(&msginfo->msglistentry);
spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
- list_for_each_entry_safe(submsginfo, tmp, &msginfo->submsglist,
- msglistentry) {
- kfree(submsginfo);
- }
- kfree(msginfo);
+ vmbus_free_channel_msginfo(msginfo);
if (ret) {
/*
@@ -545,8 +560,10 @@ cleanup:
* left as true so the memory is leaked instead of being
* put back on the free list.
*/
- if (!set_memory_encrypted((unsigned long)kbuffer, PFN_UP(size)))
- gpadl->decrypted = false;
+ if (gpadl->decrypted) {
+ if (!set_memory_encrypted((unsigned long)kbuffer, PFN_UP(size)))
+ gpadl->decrypted = false;
+ }
}
return ret;
@@ -573,7 +590,7 @@ EXPORT_SYMBOL_GPL(vmbus_establish_gpadl);
* keeps track of the next available slot in the array. Initially, each
* slot points to the next one (as in a Linked List). The last slot
* does not point to anything, so its value is U64_MAX by default.
- * @size The size of the array
+ * @size: The size of the array
*/
static u64 *request_arr_init(u32 size)
{
@@ -677,12 +694,13 @@ static int __vmbus_open(struct vmbus_channel *newchannel,
goto error_clean_ring;
err = hv_ringbuffer_init(&newchannel->outbound,
- page, send_pages, 0);
+ page, send_pages, 0, newchannel->co_ring_buffer);
if (err)
goto error_free_gpadl;
err = hv_ringbuffer_init(&newchannel->inbound, &page[send_pages],
- recv_pages, newchannel->max_pkt_size);
+ recv_pages, newchannel->max_pkt_size,
+ newchannel->co_ring_buffer);
if (err)
goto error_free_gpadl;
@@ -863,8 +881,11 @@ post_msg_err:
kfree(info);
- ret = set_memory_encrypted((unsigned long)gpadl->buffer,
- PFN_UP(gpadl->size));
+ if (gpadl->decrypted)
+ ret = set_memory_encrypted((unsigned long)gpadl->buffer,
+ PFN_UP(gpadl->size));
+ else
+ ret = 0;
if (ret)
pr_warn("Fail to set mem host visibility in GPADL teardown %d.\n", ret);
diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 65dd299e2944..74fed2c073d4 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -844,14 +844,14 @@ static void vmbus_wait_for_unload(void)
= per_cpu_ptr(hv_context.cpu_context, cpu);
/*
- * In a CoCo VM the synic_message_page is not allocated
+ * In a CoCo VM the hyp_synic_message_page is not allocated
* in hv_synic_alloc(). Instead it is set/cleared in
- * hv_synic_enable_regs() and hv_synic_disable_regs()
+ * hv_hyp_synic_enable_regs() and hv_hyp_synic_disable_regs()
* such that it is set only when the CPU is online. If
* not all present CPUs are online, the message page
* might be NULL, so skip such CPUs.
*/
- page_addr = hv_cpu->synic_message_page;
+ page_addr = hv_cpu->hyp_synic_message_page;
if (!page_addr)
continue;
@@ -892,7 +892,7 @@ completed:
struct hv_per_cpu_context *hv_cpu
= per_cpu_ptr(hv_context.cpu_context, cpu);
- page_addr = hv_cpu->synic_message_page;
+ page_addr = hv_cpu->hyp_synic_message_page;
if (!page_addr)
continue;
@@ -1022,6 +1022,7 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr)
struct vmbus_channel_offer_channel *offer;
struct vmbus_channel *oldchannel, *newchannel;
size_t offer_sz;
+ bool co_ring_buffer, co_external_memory;
offer = (struct vmbus_channel_offer_channel *)hdr;
@@ -1034,6 +1035,22 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr)
return;
}
+ co_ring_buffer = is_co_ring_buffer(offer);
+ co_external_memory = is_co_external_memory(offer);
+ if (!co_ring_buffer && co_external_memory) {
+ pr_err("Invalid offer relid=%d: the ring buffer isn't encrypted\n",
+ offer->child_relid);
+ return;
+ }
+ if (co_ring_buffer || co_external_memory) {
+ if (vmbus_proto_version < VERSION_WIN10_V6_0 || !vmbus_is_confidential()) {
+ pr_err("Invalid offer relid=%d: no support for confidential VMBus\n",
+ offer->child_relid);
+ atomic_dec(&vmbus_connection.offer_in_progress);
+ return;
+ }
+ }
+
oldchannel = find_primary_channel_by_offer(offer);
if (oldchannel != NULL) {
@@ -1112,6 +1129,8 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr)
pr_err("Unable to allocate channel object\n");
return;
}
+ newchannel->co_ring_buffer = co_ring_buffer;
+ newchannel->co_external_memory = co_external_memory;
vmbus_setup_channel_state(newchannel, offer);
diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index 1fe3573ae52a..5d9cb5bf2d62 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -51,6 +51,7 @@ EXPORT_SYMBOL_GPL(vmbus_proto_version);
* Linux guests and are not listed.
*/
static __u32 vmbus_versions[] = {
+ VERSION_WIN10_V6_0,
VERSION_WIN10_V5_3,
VERSION_WIN10_V5_2,
VERSION_WIN10_V5_1,
@@ -65,7 +66,7 @@ static __u32 vmbus_versions[] = {
* Maximal VMBus protocol version guests can negotiate. Useful to cap the
* VMBus version for testing and debugging purpose.
*/
-static uint max_version = VERSION_WIN10_V5_3;
+static uint max_version = VERSION_WIN10_V6_0;
module_param(max_version, uint, S_IRUGO);
MODULE_PARM_DESC(max_version,
@@ -105,6 +106,9 @@ int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version)
vmbus_connection.msg_conn_id = VMBUS_MESSAGE_CONNECTION_ID;
}
+ if (vmbus_is_confidential() && version >= VERSION_WIN10_V6_0)
+ msg->feature_flags = VMBUS_FEATURE_FLAG_CONFIDENTIAL_CHANNELS;
+
/*
* shared_gpa_boundary is zero in non-SNP VMs, so it's safe to always
* bitwise OR it
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index b14c5f9e0ef2..c100f04b3581 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -18,6 +18,7 @@
#include <linux/clockchips.h>
#include <linux/delay.h>
#include <linux/interrupt.h>
+#include <linux/export.h>
#include <clocksource/hyperv_timer.h>
#include <asm/mshyperv.h>
#include <linux/set_memory.h>
@@ -25,6 +26,7 @@
/* The one and only */
struct hv_context hv_context;
+EXPORT_SYMBOL_FOR_MODULES(hv_context, "mshv_vtl");
/*
* hv_init - Main initialization routine.
@@ -74,7 +76,11 @@ int hv_post_message(union hv_connection_id connection_id,
aligned_msg->payload_size = payload_size;
memcpy((void *)aligned_msg->payload, payload, payload_size);
- if (ms_hyperv.paravisor_present) {
+ if (ms_hyperv.paravisor_present && !vmbus_is_confidential()) {
+ /*
+ * If the VMBus isn't confidential, use the CoCo-specific
+ * mechanism to communicate with the hypervisor.
+ */
if (hv_isolation_type_tdx())
status = hv_tdx_hypercall(HVCALL_POST_MESSAGE,
virt_to_phys(aligned_msg), 0);
@@ -88,6 +94,11 @@ int hv_post_message(union hv_connection_id connection_id,
u64 control = HVCALL_POST_MESSAGE;
control |= hv_nested ? HV_HYPERCALL_NESTED : 0;
+ /*
+ * If there is no paravisor, this will go to the hypervisor.
+ * In the Confidential VMBus case, there is the paravisor
+ * to which this will trap.
+ */
status = hv_do_hypercall(control, aligned_msg, NULL);
}
@@ -95,11 +106,72 @@ int hv_post_message(union hv_connection_id connection_id,
return hv_result(status);
}
+EXPORT_SYMBOL_FOR_MODULES(hv_post_message, "mshv_vtl");
+
+static int hv_alloc_page(void **page, bool decrypt, const char *note)
+{
+ int ret = 0;
+
+ /*
+ * After the page changes its encryption status, its contents might
+ * appear scrambled on some hardware. Thus `get_zeroed_page` would
+ * zero the page out in vain, so do that explicitly exactly once.
+ *
+ * By default, the page is allocated encrypted in a CoCo VM.
+ */
+ *page = (void *)__get_free_page(GFP_KERNEL);
+ if (!*page)
+ return -ENOMEM;
+
+ if (decrypt)
+ ret = set_memory_decrypted((unsigned long)*page, 1);
+ if (ret)
+ goto failed;
+
+ memset(*page, 0, PAGE_SIZE);
+ return 0;
+
+failed:
+ /*
+ * Report the failure but don't put the page back on the free list as
+ * its encryption status is unknown.
+ */
+ pr_err("allocation failed for %s page, error %d, decrypted %d\n",
+ note, ret, decrypt);
+ *page = NULL;
+ return ret;
+}
+
+static int hv_free_page(void **page, bool encrypt, const char *note)
+{
+ int ret = 0;
+
+ if (!*page)
+ return 0;
+
+ if (encrypt)
+ ret = set_memory_encrypted((unsigned long)*page, 1);
+
+ /*
+ * In the case of the failure, the page is leaked. Something is wrong,
+ * prefer to lose the page with the unknown encryption status and stay afloat.
+ */
+ if (ret)
+ pr_err("deallocation failed for %s page, error %d, encrypt %d\n",
+ note, ret, encrypt);
+ else
+ free_page((unsigned long)*page);
+
+ *page = NULL;
+
+ return ret;
+}
int hv_synic_alloc(void)
{
int cpu, ret = -ENOMEM;
struct hv_per_cpu_context *hv_cpu;
+ const bool decrypt = !vmbus_is_confidential();
/*
* First, zero all per-cpu memory areas so hv_synic_free() can
@@ -125,73 +197,37 @@ int hv_synic_alloc(void)
vmbus_on_msg_dpc, (unsigned long)hv_cpu);
if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) {
- hv_cpu->post_msg_page = (void *)get_zeroed_page(GFP_ATOMIC);
- if (!hv_cpu->post_msg_page) {
- pr_err("Unable to allocate post msg page\n");
+ ret = hv_alloc_page(&hv_cpu->post_msg_page,
+ decrypt, "post msg");
+ if (ret)
goto err;
- }
-
- ret = set_memory_decrypted((unsigned long)hv_cpu->post_msg_page, 1);
- if (ret) {
- pr_err("Failed to decrypt post msg page: %d\n", ret);
- /* Just leak the page, as it's unsafe to free the page. */
- hv_cpu->post_msg_page = NULL;
- goto err;
- }
-
- memset(hv_cpu->post_msg_page, 0, PAGE_SIZE);
}
/*
- * Synic message and event pages are allocated by paravisor.
- * Skip these pages allocation here.
+ * If these SynIC pages are not allocated, SIEF and SIM pages
+ * are configured using what the root partition or the paravisor
+ * provides upon reading the SIEFP and SIMP registers.
*/
if (!ms_hyperv.paravisor_present && !hv_root_partition()) {
- hv_cpu->synic_message_page =
- (void *)get_zeroed_page(GFP_ATOMIC);
- if (!hv_cpu->synic_message_page) {
- pr_err("Unable to allocate SYNIC message page\n");
+ ret = hv_alloc_page(&hv_cpu->hyp_synic_message_page,
+ decrypt, "hypervisor SynIC msg");
+ if (ret)
goto err;
- }
-
- hv_cpu->synic_event_page =
- (void *)get_zeroed_page(GFP_ATOMIC);
- if (!hv_cpu->synic_event_page) {
- pr_err("Unable to allocate SYNIC event page\n");
-
- free_page((unsigned long)hv_cpu->synic_message_page);
- hv_cpu->synic_message_page = NULL;
+ ret = hv_alloc_page(&hv_cpu->hyp_synic_event_page,
+ decrypt, "hypervisor SynIC event");
+ if (ret)
goto err;
- }
}
- if (!ms_hyperv.paravisor_present &&
- (hv_isolation_type_snp() || hv_isolation_type_tdx())) {
- ret = set_memory_decrypted((unsigned long)
- hv_cpu->synic_message_page, 1);
- if (ret) {
- pr_err("Failed to decrypt SYNIC msg page: %d\n", ret);
- hv_cpu->synic_message_page = NULL;
-
- /*
- * Free the event page here so that hv_synic_free()
- * won't later try to re-encrypt it.
- */
- free_page((unsigned long)hv_cpu->synic_event_page);
- hv_cpu->synic_event_page = NULL;
+ if (vmbus_is_confidential()) {
+ ret = hv_alloc_page(&hv_cpu->para_synic_message_page,
+ false, "paravisor SynIC msg");
+ if (ret)
goto err;
- }
-
- ret = set_memory_decrypted((unsigned long)
- hv_cpu->synic_event_page, 1);
- if (ret) {
- pr_err("Failed to decrypt SYNIC event page: %d\n", ret);
- hv_cpu->synic_event_page = NULL;
+ ret = hv_alloc_page(&hv_cpu->para_synic_event_page,
+ false, "paravisor SynIC event");
+ if (ret)
goto err;
- }
-
- memset(hv_cpu->synic_message_page, 0, PAGE_SIZE);
- memset(hv_cpu->synic_event_page, 0, PAGE_SIZE);
}
}
@@ -207,70 +243,46 @@ err:
void hv_synic_free(void)
{
- int cpu, ret;
+ int cpu;
+ const bool encrypt = !vmbus_is_confidential();
for_each_present_cpu(cpu) {
struct hv_per_cpu_context *hv_cpu =
per_cpu_ptr(hv_context.cpu_context, cpu);
- /* It's better to leak the page if the encryption fails. */
- if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) {
- if (hv_cpu->post_msg_page) {
- ret = set_memory_encrypted((unsigned long)
- hv_cpu->post_msg_page, 1);
- if (ret) {
- pr_err("Failed to encrypt post msg page: %d\n", ret);
- hv_cpu->post_msg_page = NULL;
- }
- }
+ if (ms_hyperv.paravisor_present && hv_isolation_type_tdx())
+ hv_free_page(&hv_cpu->post_msg_page,
+ encrypt, "post msg");
+ if (!ms_hyperv.paravisor_present && !hv_root_partition()) {
+ hv_free_page(&hv_cpu->hyp_synic_event_page,
+ encrypt, "hypervisor SynIC event");
+ hv_free_page(&hv_cpu->hyp_synic_message_page,
+ encrypt, "hypervisor SynIC msg");
}
-
- if (!ms_hyperv.paravisor_present &&
- (hv_isolation_type_snp() || hv_isolation_type_tdx())) {
- if (hv_cpu->synic_message_page) {
- ret = set_memory_encrypted((unsigned long)
- hv_cpu->synic_message_page, 1);
- if (ret) {
- pr_err("Failed to encrypt SYNIC msg page: %d\n", ret);
- hv_cpu->synic_message_page = NULL;
- }
- }
-
- if (hv_cpu->synic_event_page) {
- ret = set_memory_encrypted((unsigned long)
- hv_cpu->synic_event_page, 1);
- if (ret) {
- pr_err("Failed to encrypt SYNIC event page: %d\n", ret);
- hv_cpu->synic_event_page = NULL;
- }
- }
+ if (vmbus_is_confidential()) {
+ hv_free_page(&hv_cpu->para_synic_event_page,
+ false, "paravisor SynIC event");
+ hv_free_page(&hv_cpu->para_synic_message_page,
+ false, "paravisor SynIC msg");
}
-
- free_page((unsigned long)hv_cpu->post_msg_page);
- free_page((unsigned long)hv_cpu->synic_event_page);
- free_page((unsigned long)hv_cpu->synic_message_page);
}
kfree(hv_context.hv_numa_map);
}
/*
- * hv_synic_init - Initialize the Synthetic Interrupt Controller.
- *
- * If it is already initialized by another entity (ie x2v shim), we need to
- * retrieve the initialized message and event pages. Otherwise, we create and
- * initialize the message and event pages.
+ * hv_hyp_synic_enable_regs - Initialize the Synthetic Interrupt Controller
+ * with the hypervisor.
*/
-void hv_synic_enable_regs(unsigned int cpu)
+void hv_hyp_synic_enable_regs(unsigned int cpu)
{
struct hv_per_cpu_context *hv_cpu =
per_cpu_ptr(hv_context.cpu_context, cpu);
union hv_synic_simp simp;
union hv_synic_siefp siefp;
union hv_synic_sint shared_sint;
- union hv_synic_scontrol sctrl;
- /* Setup the Synic's message page */
+ /* Setup the Synic's message page with the hypervisor. */
simp.as_uint64 = hv_get_msr(HV_MSR_SIMP);
simp.simp_enabled = 1;
@@ -278,18 +290,18 @@ void hv_synic_enable_regs(unsigned int cpu)
/* Mask out vTOM bit. ioremap_cache() maps decrypted */
u64 base = (simp.base_simp_gpa << HV_HYP_PAGE_SHIFT) &
~ms_hyperv.shared_gpa_boundary;
- hv_cpu->synic_message_page =
+ hv_cpu->hyp_synic_message_page =
(void *)ioremap_cache(base, HV_HYP_PAGE_SIZE);
- if (!hv_cpu->synic_message_page)
+ if (!hv_cpu->hyp_synic_message_page)
pr_err("Fail to map synic message page.\n");
} else {
- simp.base_simp_gpa = virt_to_phys(hv_cpu->synic_message_page)
+ simp.base_simp_gpa = virt_to_phys(hv_cpu->hyp_synic_message_page)
>> HV_HYP_PAGE_SHIFT;
}
hv_set_msr(HV_MSR_SIMP, simp.as_uint64);
- /* Setup the Synic's event page */
+ /* Setup the Synic's event page with the hypervisor. */
siefp.as_uint64 = hv_get_msr(HV_MSR_SIEFP);
siefp.siefp_enabled = 1;
@@ -297,16 +309,17 @@ void hv_synic_enable_regs(unsigned int cpu)
/* Mask out vTOM bit. ioremap_cache() maps decrypted */
u64 base = (siefp.base_siefp_gpa << HV_HYP_PAGE_SHIFT) &
~ms_hyperv.shared_gpa_boundary;
- hv_cpu->synic_event_page =
+ hv_cpu->hyp_synic_event_page =
(void *)ioremap_cache(base, HV_HYP_PAGE_SIZE);
- if (!hv_cpu->synic_event_page)
+ if (!hv_cpu->hyp_synic_event_page)
pr_err("Fail to map synic event page.\n");
} else {
- siefp.base_siefp_gpa = virt_to_phys(hv_cpu->synic_event_page)
+ siefp.base_siefp_gpa = virt_to_phys(hv_cpu->hyp_synic_event_page)
>> HV_HYP_PAGE_SHIFT;
}
hv_set_msr(HV_MSR_SIEFP, siefp.as_uint64);
+ hv_enable_coco_interrupt(cpu, vmbus_interrupt, true);
/* Setup the shared SINT. */
if (vmbus_irq != -1)
@@ -317,6 +330,11 @@ void hv_synic_enable_regs(unsigned int cpu)
shared_sint.masked = false;
shared_sint.auto_eoi = hv_recommend_using_aeoi();
hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
+}
+
+static void hv_hyp_synic_enable_interrupts(void)
+{
+ union hv_synic_scontrol sctrl;
/* Enable the global synic bit */
sctrl.as_uint64 = hv_get_msr(HV_MSR_SCONTROL);
@@ -325,23 +343,72 @@ void hv_synic_enable_regs(unsigned int cpu)
hv_set_msr(HV_MSR_SCONTROL, sctrl.as_uint64);
}
+static void hv_para_synic_enable_regs(unsigned int cpu)
+{
+ union hv_synic_simp simp;
+ union hv_synic_siefp siefp;
+ struct hv_per_cpu_context *hv_cpu
+ = per_cpu_ptr(hv_context.cpu_context, cpu);
+
+ /* Setup the Synic's message page with the paravisor. */
+ simp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIMP);
+ simp.simp_enabled = 1;
+ simp.base_simp_gpa = virt_to_phys(hv_cpu->para_synic_message_page)
+ >> HV_HYP_PAGE_SHIFT;
+ hv_para_set_synic_register(HV_MSR_SIMP, simp.as_uint64);
+
+ /* Setup the Synic's event page with the paravisor. */
+ siefp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIEFP);
+ siefp.siefp_enabled = 1;
+ siefp.base_siefp_gpa = virt_to_phys(hv_cpu->para_synic_event_page)
+ >> HV_HYP_PAGE_SHIFT;
+ hv_para_set_synic_register(HV_MSR_SIEFP, siefp.as_uint64);
+}
+
+static void hv_para_synic_enable_interrupts(void)
+{
+ union hv_synic_scontrol sctrl;
+
+ /* Enable the global synic bit */
+ sctrl.as_uint64 = hv_para_get_synic_register(HV_MSR_SCONTROL);
+ sctrl.enable = 1;
+ hv_para_set_synic_register(HV_MSR_SCONTROL, sctrl.as_uint64);
+}
+
int hv_synic_init(unsigned int cpu)
{
- hv_synic_enable_regs(cpu);
+ if (vmbus_is_confidential())
+ hv_para_synic_enable_regs(cpu);
+
+ /*
+ * The SINT is set in hv_hyp_synic_enable_regs() by calling
+ * hv_set_msr(). hv_set_msr() in turn has special case code for the
+ * SINT MSRs that write to the hypervisor version of the MSR *and*
+ * the paravisor version of the MSR (but *without* the proxy bit when
+ * VMBus is confidential).
+ *
+ * Then enable interrupts via the paravisor if VMBus is confidential,
+ * and otherwise via the hypervisor.
+ */
+
+ hv_hyp_synic_enable_regs(cpu);
+ if (vmbus_is_confidential())
+ hv_para_synic_enable_interrupts();
+ else
+ hv_hyp_synic_enable_interrupts();
hv_stimer_legacy_init(cpu, VMBUS_MESSAGE_SINT);
return 0;
}
-void hv_synic_disable_regs(unsigned int cpu)
+void hv_hyp_synic_disable_regs(unsigned int cpu)
{
struct hv_per_cpu_context *hv_cpu =
per_cpu_ptr(hv_context.cpu_context, cpu);
union hv_synic_sint shared_sint;
union hv_synic_simp simp;
union hv_synic_siefp siefp;
- union hv_synic_scontrol sctrl;
shared_sint.as_uint64 = hv_get_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT);
@@ -350,18 +417,21 @@ void hv_synic_disable_regs(unsigned int cpu)
/* Need to correctly cleanup in the case of SMP!!! */
/* Disable the interrupt */
hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
+ hv_enable_coco_interrupt(cpu, vmbus_interrupt, false);
simp.as_uint64 = hv_get_msr(HV_MSR_SIMP);
/*
- * In Isolation VM, sim and sief pages are allocated by
+ * In Isolation VM, simp and sief pages are allocated by
* paravisor. These pages also will be used by kdump
* kernel. So just reset enable bit here and keep page
* addresses.
*/
simp.simp_enabled = 0;
if (ms_hyperv.paravisor_present || hv_root_partition()) {
- iounmap(hv_cpu->synic_message_page);
- hv_cpu->synic_message_page = NULL;
+ if (hv_cpu->hyp_synic_message_page) {
+ iounmap(hv_cpu->hyp_synic_message_page);
+ hv_cpu->hyp_synic_message_page = NULL;
+ }
} else {
simp.base_simp_gpa = 0;
}
@@ -372,21 +442,51 @@ void hv_synic_disable_regs(unsigned int cpu)
siefp.siefp_enabled = 0;
if (ms_hyperv.paravisor_present || hv_root_partition()) {
- iounmap(hv_cpu->synic_event_page);
- hv_cpu->synic_event_page = NULL;
+ if (hv_cpu->hyp_synic_event_page) {
+ iounmap(hv_cpu->hyp_synic_event_page);
+ hv_cpu->hyp_synic_event_page = NULL;
+ }
} else {
siefp.base_siefp_gpa = 0;
}
hv_set_msr(HV_MSR_SIEFP, siefp.as_uint64);
+}
+
+static void hv_hyp_synic_disable_interrupts(void)
+{
+ union hv_synic_scontrol sctrl;
/* Disable the global synic bit */
sctrl.as_uint64 = hv_get_msr(HV_MSR_SCONTROL);
sctrl.enable = 0;
hv_set_msr(HV_MSR_SCONTROL, sctrl.as_uint64);
+}
- if (vmbus_irq != -1)
- disable_percpu_irq(vmbus_irq);
+static void hv_para_synic_disable_regs(unsigned int cpu)
+{
+ union hv_synic_simp simp;
+ union hv_synic_siefp siefp;
+
+ /* Disable SynIC's message page in the paravisor. */
+ simp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIMP);
+ simp.simp_enabled = 0;
+ hv_para_set_synic_register(HV_MSR_SIMP, simp.as_uint64);
+
+ /* Disable SynIC's event page in the paravisor. */
+ siefp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIEFP);
+ siefp.siefp_enabled = 0;
+ hv_para_set_synic_register(HV_MSR_SIEFP, siefp.as_uint64);
+}
+
+static void hv_para_synic_disable_interrupts(void)
+{
+ union hv_synic_scontrol sctrl;
+
+ /* Disable the global synic bit */
+ sctrl.as_uint64 = hv_para_get_synic_register(HV_MSR_SCONTROL);
+ sctrl.enable = 0;
+ hv_para_set_synic_register(HV_MSR_SCONTROL, sctrl.as_uint64);
}
#define HV_MAX_TRIES 3
@@ -399,16 +499,18 @@ void hv_synic_disable_regs(unsigned int cpu)
* that the normal interrupt handling mechanism will find and process the channel interrupt
* "very soon", and in the process clear the bit.
*/
-static bool hv_synic_event_pending(void)
+static bool __hv_synic_event_pending(union hv_synic_event_flags *event, int sint)
{
- struct hv_per_cpu_context *hv_cpu = this_cpu_ptr(hv_context.cpu_context);
- union hv_synic_event_flags *event =
- (union hv_synic_event_flags *)hv_cpu->synic_event_page + VMBUS_MESSAGE_SINT;
- unsigned long *recv_int_page = event->flags; /* assumes VMBus version >= VERSION_WIN8 */
+ unsigned long *recv_int_page;
bool pending;
u32 relid;
int tries = 0;
+ if (!event)
+ return false;
+
+ event += sint;
+ recv_int_page = event->flags; /* assumes VMBus version >= VERSION_WIN8 */
retry:
pending = false;
for_each_set_bit(relid, recv_int_page, HV_EVENT_FLAGS_COUNT) {
@@ -425,6 +527,17 @@ retry:
return pending;
}
+static bool hv_synic_event_pending(void)
+{
+ struct hv_per_cpu_context *hv_cpu = this_cpu_ptr(hv_context.cpu_context);
+ union hv_synic_event_flags *hyp_synic_event_page = hv_cpu->hyp_synic_event_page;
+ union hv_synic_event_flags *para_synic_event_page = hv_cpu->para_synic_event_page;
+
+ return
+ __hv_synic_event_pending(hyp_synic_event_page, VMBUS_MESSAGE_SINT) ||
+ __hv_synic_event_pending(para_synic_event_page, VMBUS_MESSAGE_SINT);
+}
+
static int hv_pick_new_cpu(struct vmbus_channel *channel)
{
int ret = -EBUSY;
@@ -517,7 +630,27 @@ int hv_synic_cleanup(unsigned int cpu)
always_cleanup:
hv_stimer_legacy_cleanup(cpu);
- hv_synic_disable_regs(cpu);
+ /*
+ * First, disable the event and message pages
+ * used for communicating with the host, and then
+ * disable the host interrupts if VMBus is not
+ * confidential.
+ */
+ hv_hyp_synic_disable_regs(cpu);
+ if (!vmbus_is_confidential())
+ hv_hyp_synic_disable_interrupts();
+
+ /*
+ * Perform the same steps for the Confidential VMBus.
+ * The sequencing provides the guarantee that no data
+ * may be posted for processing before disabling interrupts.
+ */
+ if (vmbus_is_confidential()) {
+ hv_para_synic_disable_regs(cpu);
+ hv_para_synic_disable_interrupts();
+ }
+ if (vmbus_irq != -1)
+ disable_percpu_irq(vmbus_irq);
return ret;
}
diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c
index e109a620c83f..0a3ab7efed46 100644
--- a/drivers/hv/hv_common.c
+++ b/drivers/hv/hv_common.c
@@ -315,9 +315,9 @@ int __init hv_common_init(void)
int i;
union hv_hypervisor_version_info version;
- /* Get information about the Hyper-V host version */
+ /* Get information about the Microsoft Hypervisor version */
if (!hv_get_hypervisor_version(&version))
- pr_info("Hyper-V: Host Build %d.%d.%d.%d-%d-%d\n",
+ pr_info("Hyper-V: Hypervisor Build %d.%d.%d.%d-%d-%d\n",
version.major_version, version.minor_version,
version.build_number, version.service_number,
version.service_pack, version.service_branch);
@@ -487,7 +487,7 @@ int hv_common_cpu_init(unsigned int cpu)
* online and then taken offline
*/
if (!*inputarg) {
- mem = kmalloc(pgcount * HV_HYP_PAGE_SIZE, flags);
+ mem = kmalloc_array(pgcount, HV_HYP_PAGE_SIZE, flags);
if (!mem)
return -ENOMEM;
@@ -716,6 +716,27 @@ u64 __weak hv_tdx_hypercall(u64 control, u64 param1, u64 param2)
}
EXPORT_SYMBOL_GPL(hv_tdx_hypercall);
+void __weak hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool set)
+{
+}
+EXPORT_SYMBOL_GPL(hv_enable_coco_interrupt);
+
+void __weak hv_para_set_sint_proxy(bool enable)
+{
+}
+EXPORT_SYMBOL_GPL(hv_para_set_sint_proxy);
+
+u64 __weak hv_para_get_synic_register(unsigned int reg)
+{
+ return ~0ULL;
+}
+EXPORT_SYMBOL_GPL(hv_para_get_synic_register);
+
+void __weak hv_para_set_synic_register(unsigned int reg, u64 val)
+{
+}
+EXPORT_SYMBOL_GPL(hv_para_set_synic_register);
+
void hv_identify_partition_type(void)
{
/* Assume guest role */
diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c
index 36ee89c0358b..7e9c8e169c66 100644
--- a/drivers/hv/hv_util.c
+++ b/drivers/hv/hv_util.c
@@ -586,7 +586,7 @@ static int util_probe(struct hv_device *dev,
(struct hv_util_service *)dev_id->driver_data;
int ret;
- srv->recv_buffer = kmalloc(HV_HYP_PAGE_SIZE * 4, GFP_KERNEL);
+ srv->recv_buffer = kmalloc_array(4, HV_HYP_PAGE_SIZE, GFP_KERNEL);
if (!srv->recv_buffer)
return -ENOMEM;
srv->channel = dev->channel;
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 0b450e53161e..b2862e0a317a 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -15,6 +15,7 @@
#include <linux/list.h>
#include <linux/bitops.h>
#include <asm/sync_bitops.h>
+#include <asm/mshyperv.h>
#include <linux/atomic.h>
#include <linux/hyperv.h>
#include <linux/interrupt.h>
@@ -32,6 +33,7 @@
*/
#define HV_UTIL_NEGO_TIMEOUT 55
+void vmbus_isr(void);
/* Definitions for the monitored notification facility */
union hv_monitor_trigger_group {
@@ -120,8 +122,26 @@ enum {
* Per cpu state for channel handling
*/
struct hv_per_cpu_context {
- void *synic_message_page;
- void *synic_event_page;
+ /*
+ * SynIC pages for communicating with the host.
+ *
+ * These pages are accessible to the host partition and the hypervisor.
+ * They may be used for exchanging data with the host partition and the
+ * hypervisor even when they aren't trusted yet the guest partition
+ * must be prepared to handle the malicious behavior.
+ */
+ void *hyp_synic_message_page;
+ void *hyp_synic_event_page;
+ /*
+ * SynIC pages for communicating with the paravisor.
+ *
+ * These pages may be accessed from within the guest partition only in
+ * CoCo VMs. Neither the host partition nor the hypervisor can access
+ * these pages in that case; they are used for exchanging data with the
+ * paravisor.
+ */
+ void *para_synic_message_page;
+ void *para_synic_event_page;
/*
* The page is only used in hv_post_message() for a TDX VM (with the
@@ -171,10 +191,10 @@ extern int hv_synic_alloc(void);
extern void hv_synic_free(void);
-extern void hv_synic_enable_regs(unsigned int cpu);
+extern void hv_hyp_synic_enable_regs(unsigned int cpu);
extern int hv_synic_init(unsigned int cpu);
-extern void hv_synic_disable_regs(unsigned int cpu);
+extern void hv_hyp_synic_disable_regs(unsigned int cpu);
extern int hv_synic_cleanup(unsigned int cpu);
/* Interface */
@@ -182,7 +202,8 @@ extern int hv_synic_cleanup(unsigned int cpu);
void hv_ringbuffer_pre_init(struct vmbus_channel *channel);
int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
- struct page *pages, u32 pagecnt, u32 max_pkt_size);
+ struct page *pages, u32 pagecnt, u32 max_pkt_size,
+ bool confidential);
void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info);
@@ -333,6 +354,51 @@ extern const struct vmbus_channel_message_table_entry
/* General vmbus interface */
+bool vmbus_is_confidential(void);
+
+#if IS_ENABLED(CONFIG_HYPERV_VMBUS)
+/* Free the message slot and signal end-of-message if required */
+static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type)
+{
+ /*
+ * On crash we're reading some other CPU's message page and we need
+ * to be careful: this other CPU may already had cleared the header
+ * and the host may already had delivered some other message there.
+ * In case we blindly write msg->header.message_type we're going
+ * to lose it. We can still lose a message of the same type but
+ * we count on the fact that there can only be one
+ * CHANNELMSG_UNLOAD_RESPONSE and we don't care about other messages
+ * on crash.
+ */
+ if (cmpxchg(&msg->header.message_type, old_msg_type,
+ HVMSG_NONE) != old_msg_type)
+ return;
+
+ /*
+ * The cmxchg() above does an implicit memory barrier to
+ * ensure the write to MessageType (ie set to
+ * HVMSG_NONE) happens before we read the
+ * MessagePending and EOMing. Otherwise, the EOMing
+ * will not deliver any more messages since there is
+ * no empty slot
+ */
+ if (msg->header.message_flags.msg_pending) {
+ /*
+ * This will cause message queue rescan to
+ * possibly deliver another msg from the
+ * hypervisor
+ */
+ if (vmbus_is_confidential())
+ hv_para_set_synic_register(HV_MSR_EOM, 0);
+ else
+ hv_set_msr(HV_MSR_EOM, 0);
+ }
+}
+
+extern int vmbus_interrupt;
+extern int vmbus_irq;
+#endif /* CONFIG_HYPERV_VMBUS */
+
struct hv_device *vmbus_device_create(const guid_t *type,
const guid_t *instance,
struct vmbus_channel *channel);
diff --git a/drivers/hv/mshv_common.c b/drivers/hv/mshv_common.c
index aa2be51979fd..58027b23c206 100644
--- a/drivers/hv/mshv_common.c
+++ b/drivers/hv/mshv_common.c
@@ -14,6 +14,9 @@
#include <asm/mshyperv.h>
#include <linux/resume_user_mode.h>
#include <linux/export.h>
+#include <linux/acpi.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
#include "mshv.h"
@@ -138,3 +141,99 @@ int hv_call_get_partition_property(u64 partition_id,
return 0;
}
EXPORT_SYMBOL_GPL(hv_call_get_partition_property);
+
+/*
+ * Corresponding sleep states have to be initialized in order for a subsequent
+ * HVCALL_ENTER_SLEEP_STATE call to succeed. Currently only S5 state as per
+ * ACPI 6.4 chapter 7.4.2 is relevant, while S1, S2 and S3 can be supported.
+ *
+ * In order to pass proper PM values to mshv, ACPI should be initialized and
+ * should support S5 sleep state when this method is invoked.
+ */
+static int hv_initialize_sleep_states(void)
+{
+ u64 status;
+ unsigned long flags;
+ struct hv_input_set_system_property *in;
+ acpi_status acpi_status;
+ u8 sleep_type_a, sleep_type_b;
+
+ if (!acpi_sleep_state_supported(ACPI_STATE_S5)) {
+ pr_err("%s: S5 sleep state not supported.\n", __func__);
+ return -ENODEV;
+ }
+
+ acpi_status = acpi_get_sleep_type_data(ACPI_STATE_S5, &sleep_type_a,
+ &sleep_type_b);
+ if (ACPI_FAILURE(acpi_status))
+ return -ENODEV;
+
+ local_irq_save(flags);
+ in = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ memset(in, 0, sizeof(*in));
+
+ in->property_id = HV_SYSTEM_PROPERTY_SLEEP_STATE;
+ in->set_sleep_state_info.sleep_state = HV_SLEEP_STATE_S5;
+ in->set_sleep_state_info.pm1a_slp_typ = sleep_type_a;
+ in->set_sleep_state_info.pm1b_slp_typ = sleep_type_b;
+
+ status = hv_do_hypercall(HVCALL_SET_SYSTEM_PROPERTY, in, NULL);
+ local_irq_restore(flags);
+
+ if (!hv_result_success(status)) {
+ hv_status_err(status, "\n");
+ return hv_result_to_errno(status);
+ }
+
+ return 0;
+}
+
+/*
+ * This notifier initializes sleep states in mshv hypervisor which will be
+ * used during power off.
+ */
+static int hv_reboot_notifier_handler(struct notifier_block *this,
+ unsigned long code, void *another)
+{
+ int ret = 0;
+
+ if (code == SYS_HALT || code == SYS_POWER_OFF)
+ ret = hv_initialize_sleep_states();
+
+ return ret ? NOTIFY_DONE : NOTIFY_OK;
+}
+
+static struct notifier_block hv_reboot_notifier = {
+ .notifier_call = hv_reboot_notifier_handler,
+};
+
+void hv_sleep_notifiers_register(void)
+{
+ int ret;
+
+ ret = register_reboot_notifier(&hv_reboot_notifier);
+ if (ret)
+ pr_err("%s: cannot register reboot notifier %d\n", __func__,
+ ret);
+}
+
+/*
+ * Power off the machine by entering S5 sleep state via Hyper-V hypercall.
+ * This call does not return if successful.
+ */
+void hv_machine_power_off(void)
+{
+ unsigned long flags;
+ struct hv_input_enter_sleep_state *in;
+
+ local_irq_save(flags);
+ in = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ in->sleep_state = HV_SLEEP_STATE_S5;
+
+ (void)hv_do_hypercall(HVCALL_ENTER_SLEEP_STATE, in, NULL);
+ local_irq_restore(flags);
+
+ /* should never reach here */
+ BUG();
+
+}
diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c
index 806674722868..d93a18f09c76 100644
--- a/drivers/hv/mshv_eventfd.c
+++ b/drivers/hv/mshv_eventfd.c
@@ -163,8 +163,10 @@ static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd)
if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
return -EOPNOTSUPP;
+#if IS_ENABLED(CONFIG_X86)
if (irq->lapic_control.logical_dest_mode)
return -EOPNOTSUPP;
+#endif
vp = partition->pt_vp_array[irq->lapic_apic_id];
@@ -196,8 +198,10 @@ static void mshv_assert_irq_slow(struct mshv_irqfd *irqfd)
unsigned int seq;
int idx;
+#if IS_ENABLED(CONFIG_X86)
WARN_ON(irqfd->irqfd_resampler &&
!irq->lapic_control.level_triggered);
+#endif
idx = srcu_read_lock(&partition->pt_irq_srcu);
if (irqfd->irqfd_girq_ent.guest_irq_num) {
@@ -469,6 +473,7 @@ static int mshv_irqfd_assign(struct mshv_partition *pt,
init_poll_funcptr(&irqfd->irqfd_polltbl, mshv_irqfd_queue_proc);
spin_lock_irq(&pt->pt_irqfds_lock);
+#if IS_ENABLED(CONFIG_X86)
if (args->flags & BIT(MSHV_IRQFD_BIT_RESAMPLE) &&
!irqfd->irqfd_lapic_irq.lapic_control.level_triggered) {
/*
@@ -479,6 +484,7 @@ static int mshv_irqfd_assign(struct mshv_partition *pt,
ret = -EINVAL;
goto fail;
}
+#endif
ret = 0;
hlist_for_each_entry(tmp, &pt->pt_irqfds_list, irqfd_hnode) {
if (irqfd->irqfd_eventfd_ctx != tmp->irqfd_eventfd_ctx)
@@ -592,7 +598,7 @@ static void mshv_irqfd_release(struct mshv_partition *pt)
int mshv_irqfd_wq_init(void)
{
- irqfd_cleanup_wq = alloc_workqueue("mshv-irqfd-cleanup", 0, 0);
+ irqfd_cleanup_wq = alloc_workqueue("mshv-irqfd-cleanup", WQ_PERCPU, 0);
if (!irqfd_cleanup_wq)
return -ENOMEM;
diff --git a/drivers/hv/mshv_irq.c b/drivers/hv/mshv_irq.c
index d0fb9ef734f4..798e7e1ab06e 100644
--- a/drivers/hv/mshv_irq.c
+++ b/drivers/hv/mshv_irq.c
@@ -119,6 +119,10 @@ void mshv_copy_girq_info(struct mshv_guest_irq_ent *ent,
lirq->lapic_vector = ent->girq_irq_data & 0xFF;
lirq->lapic_apic_id = (ent->girq_addr_lo >> 12) & 0xFF;
lirq->lapic_control.interrupt_type = (ent->girq_irq_data & 0x700) >> 8;
+#if IS_ENABLED(CONFIG_X86)
lirq->lapic_control.level_triggered = (ent->girq_irq_data >> 15) & 0x1;
lirq->lapic_control.logical_dest_mode = (ent->girq_addr_lo >> 2) & 0x1;
+#elif IS_ENABLED(CONFIG_ARM64)
+ lirq->lapic_control.asserted = 1;
+#endif
}
diff --git a/drivers/hv/mshv_regions.c b/drivers/hv/mshv_regions.c
new file mode 100644
index 000000000000..202b9d551e39
--- /dev/null
+++ b/drivers/hv/mshv_regions.c
@@ -0,0 +1,555 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2025, Microsoft Corporation.
+ *
+ * Memory region management for mshv_root module.
+ *
+ * Authors: Microsoft Linux virtualization team
+ */
+
+#include <linux/hmm.h>
+#include <linux/hyperv.h>
+#include <linux/kref.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+
+#include <asm/mshyperv.h>
+
+#include "mshv_root.h"
+
+#define MSHV_MAP_FAULT_IN_PAGES PTRS_PER_PMD
+
+/**
+ * mshv_region_process_chunk - Processes a contiguous chunk of memory pages
+ * in a region.
+ * @region : Pointer to the memory region structure.
+ * @flags : Flags to pass to the handler.
+ * @page_offset: Offset into the region's pages array to start processing.
+ * @page_count : Number of pages to process.
+ * @handler : Callback function to handle the chunk.
+ *
+ * This function scans the region's pages starting from @page_offset,
+ * checking for contiguous present pages of the same size (normal or huge).
+ * It invokes @handler for the chunk of contiguous pages found. Returns the
+ * number of pages handled, or a negative error code if the first page is
+ * not present or the handler fails.
+ *
+ * Note: The @handler callback must be able to handle both normal and huge
+ * pages.
+ *
+ * Return: Number of pages handled, or negative error code.
+ */
+static long mshv_region_process_chunk(struct mshv_mem_region *region,
+ u32 flags,
+ u64 page_offset, u64 page_count,
+ int (*handler)(struct mshv_mem_region *region,
+ u32 flags,
+ u64 page_offset,
+ u64 page_count))
+{
+ u64 count, stride;
+ unsigned int page_order;
+ struct page *page;
+ int ret;
+
+ page = region->pages[page_offset];
+ if (!page)
+ return -EINVAL;
+
+ page_order = folio_order(page_folio(page));
+ /* The hypervisor only supports 4K and 2M page sizes */
+ if (page_order && page_order != HPAGE_PMD_ORDER)
+ return -EINVAL;
+
+ stride = 1 << page_order;
+
+ /* Start at stride since the first page is validated */
+ for (count = stride; count < page_count; count += stride) {
+ page = region->pages[page_offset + count];
+
+ /* Break if current page is not present */
+ if (!page)
+ break;
+
+ /* Break if page size changes */
+ if (page_order != folio_order(page_folio(page)))
+ break;
+ }
+
+ ret = handler(region, flags, page_offset, count);
+ if (ret)
+ return ret;
+
+ return count;
+}
+
+/**
+ * mshv_region_process_range - Processes a range of memory pages in a
+ * region.
+ * @region : Pointer to the memory region structure.
+ * @flags : Flags to pass to the handler.
+ * @page_offset: Offset into the region's pages array to start processing.
+ * @page_count : Number of pages to process.
+ * @handler : Callback function to handle each chunk of contiguous
+ * pages.
+ *
+ * Iterates over the specified range of pages in @region, skipping
+ * non-present pages. For each contiguous chunk of present pages, invokes
+ * @handler via mshv_region_process_chunk.
+ *
+ * Note: The @handler callback must be able to handle both normal and huge
+ * pages.
+ *
+ * Returns 0 on success, or a negative error code on failure.
+ */
+static int mshv_region_process_range(struct mshv_mem_region *region,
+ u32 flags,
+ u64 page_offset, u64 page_count,
+ int (*handler)(struct mshv_mem_region *region,
+ u32 flags,
+ u64 page_offset,
+ u64 page_count))
+{
+ long ret;
+
+ if (page_offset + page_count > region->nr_pages)
+ return -EINVAL;
+
+ while (page_count) {
+ /* Skip non-present pages */
+ if (!region->pages[page_offset]) {
+ page_offset++;
+ page_count--;
+ continue;
+ }
+
+ ret = mshv_region_process_chunk(region, flags,
+ page_offset,
+ page_count,
+ handler);
+ if (ret < 0)
+ return ret;
+
+ page_offset += ret;
+ page_count -= ret;
+ }
+
+ return 0;
+}
+
+struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages,
+ u64 uaddr, u32 flags)
+{
+ struct mshv_mem_region *region;
+
+ region = vzalloc(sizeof(*region) + sizeof(struct page *) * nr_pages);
+ if (!region)
+ return ERR_PTR(-ENOMEM);
+
+ region->nr_pages = nr_pages;
+ region->start_gfn = guest_pfn;
+ region->start_uaddr = uaddr;
+ region->hv_map_flags = HV_MAP_GPA_READABLE | HV_MAP_GPA_ADJUSTABLE;
+ if (flags & BIT(MSHV_SET_MEM_BIT_WRITABLE))
+ region->hv_map_flags |= HV_MAP_GPA_WRITABLE;
+ if (flags & BIT(MSHV_SET_MEM_BIT_EXECUTABLE))
+ region->hv_map_flags |= HV_MAP_GPA_EXECUTABLE;
+
+ kref_init(&region->refcount);
+
+ return region;
+}
+
+static int mshv_region_chunk_share(struct mshv_mem_region *region,
+ u32 flags,
+ u64 page_offset, u64 page_count)
+{
+ struct page *page = region->pages[page_offset];
+
+ if (PageHuge(page) || PageTransCompound(page))
+ flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE;
+
+ return hv_call_modify_spa_host_access(region->partition->pt_id,
+ region->pages + page_offset,
+ page_count,
+ HV_MAP_GPA_READABLE |
+ HV_MAP_GPA_WRITABLE,
+ flags, true);
+}
+
+int mshv_region_share(struct mshv_mem_region *region)
+{
+ u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_SHARED;
+
+ return mshv_region_process_range(region, flags,
+ 0, region->nr_pages,
+ mshv_region_chunk_share);
+}
+
+static int mshv_region_chunk_unshare(struct mshv_mem_region *region,
+ u32 flags,
+ u64 page_offset, u64 page_count)
+{
+ struct page *page = region->pages[page_offset];
+
+ if (PageHuge(page) || PageTransCompound(page))
+ flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE;
+
+ return hv_call_modify_spa_host_access(region->partition->pt_id,
+ region->pages + page_offset,
+ page_count, 0,
+ flags, false);
+}
+
+int mshv_region_unshare(struct mshv_mem_region *region)
+{
+ u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE;
+
+ return mshv_region_process_range(region, flags,
+ 0, region->nr_pages,
+ mshv_region_chunk_unshare);
+}
+
+static int mshv_region_chunk_remap(struct mshv_mem_region *region,
+ u32 flags,
+ u64 page_offset, u64 page_count)
+{
+ struct page *page = region->pages[page_offset];
+
+ if (PageHuge(page) || PageTransCompound(page))
+ flags |= HV_MAP_GPA_LARGE_PAGE;
+
+ return hv_call_map_gpa_pages(region->partition->pt_id,
+ region->start_gfn + page_offset,
+ page_count, flags,
+ region->pages + page_offset);
+}
+
+static int mshv_region_remap_pages(struct mshv_mem_region *region,
+ u32 map_flags,
+ u64 page_offset, u64 page_count)
+{
+ return mshv_region_process_range(region, map_flags,
+ page_offset, page_count,
+ mshv_region_chunk_remap);
+}
+
+int mshv_region_map(struct mshv_mem_region *region)
+{
+ u32 map_flags = region->hv_map_flags;
+
+ return mshv_region_remap_pages(region, map_flags,
+ 0, region->nr_pages);
+}
+
+static void mshv_region_invalidate_pages(struct mshv_mem_region *region,
+ u64 page_offset, u64 page_count)
+{
+ if (region->type == MSHV_REGION_TYPE_MEM_PINNED)
+ unpin_user_pages(region->pages + page_offset, page_count);
+
+ memset(region->pages + page_offset, 0,
+ page_count * sizeof(struct page *));
+}
+
+void mshv_region_invalidate(struct mshv_mem_region *region)
+{
+ mshv_region_invalidate_pages(region, 0, region->nr_pages);
+}
+
+int mshv_region_pin(struct mshv_mem_region *region)
+{
+ u64 done_count, nr_pages;
+ struct page **pages;
+ __u64 userspace_addr;
+ int ret;
+
+ for (done_count = 0; done_count < region->nr_pages; done_count += ret) {
+ pages = region->pages + done_count;
+ userspace_addr = region->start_uaddr +
+ done_count * HV_HYP_PAGE_SIZE;
+ nr_pages = min(region->nr_pages - done_count,
+ MSHV_PIN_PAGES_BATCH_SIZE);
+
+ /*
+ * Pinning assuming 4k pages works for large pages too.
+ * All page structs within the large page are returned.
+ *
+ * Pin requests are batched because pin_user_pages_fast
+ * with the FOLL_LONGTERM flag does a large temporary
+ * allocation of contiguous memory.
+ */
+ ret = pin_user_pages_fast(userspace_addr, nr_pages,
+ FOLL_WRITE | FOLL_LONGTERM,
+ pages);
+ if (ret < 0)
+ goto release_pages;
+ }
+
+ return 0;
+
+release_pages:
+ mshv_region_invalidate_pages(region, 0, done_count);
+ return ret;
+}
+
+static int mshv_region_chunk_unmap(struct mshv_mem_region *region,
+ u32 flags,
+ u64 page_offset, u64 page_count)
+{
+ struct page *page = region->pages[page_offset];
+
+ if (PageHuge(page) || PageTransCompound(page))
+ flags |= HV_UNMAP_GPA_LARGE_PAGE;
+
+ return hv_call_unmap_gpa_pages(region->partition->pt_id,
+ region->start_gfn + page_offset,
+ page_count, flags);
+}
+
+static int mshv_region_unmap(struct mshv_mem_region *region)
+{
+ return mshv_region_process_range(region, 0,
+ 0, region->nr_pages,
+ mshv_region_chunk_unmap);
+}
+
+static void mshv_region_destroy(struct kref *ref)
+{
+ struct mshv_mem_region *region =
+ container_of(ref, struct mshv_mem_region, refcount);
+ struct mshv_partition *partition = region->partition;
+ int ret;
+
+ if (region->type == MSHV_REGION_TYPE_MEM_MOVABLE)
+ mshv_region_movable_fini(region);
+
+ if (mshv_partition_encrypted(partition)) {
+ ret = mshv_region_share(region);
+ if (ret) {
+ pt_err(partition,
+ "Failed to regain access to memory, unpinning user pages will fail and crash the host error: %d\n",
+ ret);
+ return;
+ }
+ }
+
+ mshv_region_unmap(region);
+
+ mshv_region_invalidate(region);
+
+ vfree(region);
+}
+
+void mshv_region_put(struct mshv_mem_region *region)
+{
+ kref_put(&region->refcount, mshv_region_destroy);
+}
+
+int mshv_region_get(struct mshv_mem_region *region)
+{
+ return kref_get_unless_zero(&region->refcount);
+}
+
+/**
+ * mshv_region_hmm_fault_and_lock - Handle HMM faults and lock the memory region
+ * @region: Pointer to the memory region structure
+ * @range: Pointer to the HMM range structure
+ *
+ * This function performs the following steps:
+ * 1. Reads the notifier sequence for the HMM range.
+ * 2. Acquires a read lock on the memory map.
+ * 3. Handles HMM faults for the specified range.
+ * 4. Releases the read lock on the memory map.
+ * 5. If successful, locks the memory region mutex.
+ * 6. Verifies if the notifier sequence has changed during the operation.
+ * If it has, releases the mutex and returns -EBUSY to match with
+ * hmm_range_fault() return code for repeating.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+static int mshv_region_hmm_fault_and_lock(struct mshv_mem_region *region,
+ struct hmm_range *range)
+{
+ int ret;
+
+ range->notifier_seq = mmu_interval_read_begin(range->notifier);
+ mmap_read_lock(region->mni.mm);
+ ret = hmm_range_fault(range);
+ mmap_read_unlock(region->mni.mm);
+ if (ret)
+ return ret;
+
+ mutex_lock(&region->mutex);
+
+ if (mmu_interval_read_retry(range->notifier, range->notifier_seq)) {
+ mutex_unlock(&region->mutex);
+ cond_resched();
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+/**
+ * mshv_region_range_fault - Handle memory range faults for a given region.
+ * @region: Pointer to the memory region structure.
+ * @page_offset: Offset of the page within the region.
+ * @page_count: Number of pages to handle.
+ *
+ * This function resolves memory faults for a specified range of pages
+ * within a memory region. It uses HMM (Heterogeneous Memory Management)
+ * to fault in the required pages and updates the region's page array.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+static int mshv_region_range_fault(struct mshv_mem_region *region,
+ u64 page_offset, u64 page_count)
+{
+ struct hmm_range range = {
+ .notifier = &region->mni,
+ .default_flags = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE,
+ };
+ unsigned long *pfns;
+ int ret;
+ u64 i;
+
+ pfns = kmalloc_array(page_count, sizeof(*pfns), GFP_KERNEL);
+ if (!pfns)
+ return -ENOMEM;
+
+ range.hmm_pfns = pfns;
+ range.start = region->start_uaddr + page_offset * HV_HYP_PAGE_SIZE;
+ range.end = range.start + page_count * HV_HYP_PAGE_SIZE;
+
+ do {
+ ret = mshv_region_hmm_fault_and_lock(region, &range);
+ } while (ret == -EBUSY);
+
+ if (ret)
+ goto out;
+
+ for (i = 0; i < page_count; i++)
+ region->pages[page_offset + i] = hmm_pfn_to_page(pfns[i]);
+
+ ret = mshv_region_remap_pages(region, region->hv_map_flags,
+ page_offset, page_count);
+
+ mutex_unlock(&region->mutex);
+out:
+ kfree(pfns);
+ return ret;
+}
+
+bool mshv_region_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn)
+{
+ u64 page_offset, page_count;
+ int ret;
+
+ /* Align the page offset to the nearest MSHV_MAP_FAULT_IN_PAGES. */
+ page_offset = ALIGN_DOWN(gfn - region->start_gfn,
+ MSHV_MAP_FAULT_IN_PAGES);
+
+ /* Map more pages than requested to reduce the number of faults. */
+ page_count = min(region->nr_pages - page_offset,
+ MSHV_MAP_FAULT_IN_PAGES);
+
+ ret = mshv_region_range_fault(region, page_offset, page_count);
+
+ WARN_ONCE(ret,
+ "p%llu: GPA intercept failed: region %#llx-%#llx, gfn %#llx, page_offset %llu, page_count %llu\n",
+ region->partition->pt_id, region->start_uaddr,
+ region->start_uaddr + (region->nr_pages << HV_HYP_PAGE_SHIFT),
+ gfn, page_offset, page_count);
+
+ return !ret;
+}
+
+/**
+ * mshv_region_interval_invalidate - Invalidate a range of memory region
+ * @mni: Pointer to the mmu_interval_notifier structure
+ * @range: Pointer to the mmu_notifier_range structure
+ * @cur_seq: Current sequence number for the interval notifier
+ *
+ * This function invalidates a memory region by remapping its pages with
+ * no access permissions. It locks the region's mutex to ensure thread safety
+ * and updates the sequence number for the interval notifier. If the range
+ * is blockable, it uses a blocking lock; otherwise, it attempts a non-blocking
+ * lock and returns false if unsuccessful.
+ *
+ * NOTE: Failure to invalidate a region is a serious error, as the pages will
+ * be considered freed while they are still mapped by the hypervisor.
+ * Any attempt to access such pages will likely crash the system.
+ *
+ * Return: true if the region was successfully invalidated, false otherwise.
+ */
+static bool mshv_region_interval_invalidate(struct mmu_interval_notifier *mni,
+ const struct mmu_notifier_range *range,
+ unsigned long cur_seq)
+{
+ struct mshv_mem_region *region = container_of(mni,
+ struct mshv_mem_region,
+ mni);
+ u64 page_offset, page_count;
+ unsigned long mstart, mend;
+ int ret = -EPERM;
+
+ if (mmu_notifier_range_blockable(range))
+ mutex_lock(&region->mutex);
+ else if (!mutex_trylock(&region->mutex))
+ goto out_fail;
+
+ mmu_interval_set_seq(mni, cur_seq);
+
+ mstart = max(range->start, region->start_uaddr);
+ mend = min(range->end, region->start_uaddr +
+ (region->nr_pages << HV_HYP_PAGE_SHIFT));
+
+ page_offset = HVPFN_DOWN(mstart - region->start_uaddr);
+ page_count = HVPFN_DOWN(mend - mstart);
+
+ ret = mshv_region_remap_pages(region, HV_MAP_GPA_NO_ACCESS,
+ page_offset, page_count);
+ if (ret)
+ goto out_fail;
+
+ mshv_region_invalidate_pages(region, page_offset, page_count);
+
+ mutex_unlock(&region->mutex);
+
+ return true;
+
+out_fail:
+ WARN_ONCE(ret,
+ "Failed to invalidate region %#llx-%#llx (range %#lx-%#lx, event: %u, pages %#llx-%#llx, mm: %#llx): %d\n",
+ region->start_uaddr,
+ region->start_uaddr + (region->nr_pages << HV_HYP_PAGE_SHIFT),
+ range->start, range->end, range->event,
+ page_offset, page_offset + page_count - 1, (u64)range->mm, ret);
+ return false;
+}
+
+static const struct mmu_interval_notifier_ops mshv_region_mni_ops = {
+ .invalidate = mshv_region_interval_invalidate,
+};
+
+void mshv_region_movable_fini(struct mshv_mem_region *region)
+{
+ mmu_interval_notifier_remove(&region->mni);
+}
+
+bool mshv_region_movable_init(struct mshv_mem_region *region)
+{
+ int ret;
+
+ ret = mmu_interval_notifier_insert(&region->mni, current->mm,
+ region->start_uaddr,
+ region->nr_pages << HV_HYP_PAGE_SHIFT,
+ &mshv_region_mni_ops);
+ if (ret)
+ return false;
+
+ mutex_init(&region->mutex);
+
+ return true;
+}
diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h
index e3931b0f1269..3c1d88b36741 100644
--- a/drivers/hv/mshv_root.h
+++ b/drivers/hv/mshv_root.h
@@ -15,6 +15,7 @@
#include <linux/hashtable.h>
#include <linux/dev_printk.h>
#include <linux/build_bug.h>
+#include <linux/mmu_notifier.h>
#include <uapi/linux/mshv.h>
/*
@@ -70,18 +71,23 @@ do { \
#define vp_info(v, fmt, ...) vp_devprintk(info, v, fmt, ##__VA_ARGS__)
#define vp_dbg(v, fmt, ...) vp_devprintk(dbg, v, fmt, ##__VA_ARGS__)
+enum mshv_region_type {
+ MSHV_REGION_TYPE_MEM_PINNED,
+ MSHV_REGION_TYPE_MEM_MOVABLE,
+ MSHV_REGION_TYPE_MMIO
+};
+
struct mshv_mem_region {
struct hlist_node hnode;
+ struct kref refcount;
u64 nr_pages;
u64 start_gfn;
u64 start_uaddr;
u32 hv_map_flags;
- struct {
- u64 large_pages: 1; /* 2MiB */
- u64 range_pinned: 1;
- u64 reserved: 62;
- } flags;
struct mshv_partition *partition;
+ enum mshv_region_type type;
+ struct mmu_interval_notifier mni;
+ struct mutex mutex; /* protects region pages remapping */
struct page *pages[];
};
@@ -98,6 +104,8 @@ struct mshv_partition {
u64 pt_id;
refcount_t pt_ref_count;
struct mutex pt_mutex;
+
+ spinlock_t pt_mem_regions_lock;
struct hlist_head pt_mem_regions; // not ordered
u32 pt_vp_count;
@@ -169,7 +177,7 @@ struct mshv_girq_routing_table {
};
struct hv_synic_pages {
- struct hv_message_page *synic_message_page;
+ struct hv_message_page *hyp_synic_message_page;
struct hv_synic_event_flags_page *synic_event_flags_page;
struct hv_synic_event_ring_page *synic_event_ring_page;
};
@@ -178,6 +186,7 @@ struct mshv_root {
struct hv_synic_pages __percpu *synic_pages;
spinlock_t pt_ht_lock;
DECLARE_HASHTABLE(pt_htable, MSHV_PARTITIONS_HASH_BITS);
+ struct hv_partition_property_vmm_capabilities vmm_caps;
};
/*
@@ -278,11 +287,12 @@ int hv_call_set_vp_state(u32 vp_index, u64 partition_id,
/* Choose between pages and bytes */
struct hv_vp_state_data state_data, u64 page_count,
struct page **pages, u32 num_bytes, u8 *bytes);
-int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
- union hv_input_vtl input_vtl,
- struct page **state_page);
-int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
- union hv_input_vtl input_vtl);
+int hv_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
+ union hv_input_vtl input_vtl,
+ struct page **state_page);
+int hv_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
+ struct page *state_page,
+ union hv_input_vtl input_vtl);
int hv_call_create_port(u64 port_partition_id, union hv_port_id port_id,
u64 connection_partition_id, struct hv_port_info *port_info,
u8 port_vtl, u8 min_connection_vtl, int node);
@@ -295,17 +305,32 @@ int hv_call_connect_port(u64 port_partition_id, union hv_port_id port_id,
int hv_call_disconnect_port(u64 connection_partition_id,
union hv_connection_id connection_id);
int hv_call_notify_port_ring_empty(u32 sint_index);
-int hv_call_map_stat_page(enum hv_stats_object_type type,
- const union hv_stats_object_identity *identity,
- void **addr);
-int hv_call_unmap_stat_page(enum hv_stats_object_type type,
- const union hv_stats_object_identity *identity);
+int hv_map_stats_page(enum hv_stats_object_type type,
+ const union hv_stats_object_identity *identity,
+ void **addr);
+int hv_unmap_stats_page(enum hv_stats_object_type type, void *page_addr,
+ const union hv_stats_object_identity *identity);
int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages,
u64 page_struct_count, u32 host_access,
u32 flags, u8 acquire);
+int hv_call_get_partition_property_ex(u64 partition_id, u64 property_code, u64 arg,
+ void *property_value, size_t property_value_sz);
extern struct mshv_root mshv_root;
extern enum hv_scheduler_type hv_scheduler_type;
extern u8 * __percpu *hv_synic_eventring_tail;
+struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages,
+ u64 uaddr, u32 flags);
+int mshv_region_share(struct mshv_mem_region *region);
+int mshv_region_unshare(struct mshv_mem_region *region);
+int mshv_region_map(struct mshv_mem_region *region);
+void mshv_region_invalidate(struct mshv_mem_region *region);
+int mshv_region_pin(struct mshv_mem_region *region);
+void mshv_region_put(struct mshv_mem_region *region);
+int mshv_region_get(struct mshv_mem_region *region);
+bool mshv_region_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn);
+void mshv_region_movable_fini(struct mshv_mem_region *region);
+bool mshv_region_movable_init(struct mshv_mem_region *region);
+
#endif /* _MSHV_ROOT_H_ */
diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c
index c9c274f29c3c..598eaff4ff29 100644
--- a/drivers/hv/mshv_root_hv_call.c
+++ b/drivers/hv/mshv_root_hv_call.c
@@ -388,7 +388,13 @@ int hv_call_assert_virtual_interrupt(u64 partition_id, u32 vector,
memset(input, 0, sizeof(*input));
input->partition_id = partition_id;
input->vector = vector;
+ /*
+ * NOTE: dest_addr only needs to be provided while asserting an
+ * interrupt on x86 platform
+ */
+#if IS_ENABLED(CONFIG_X86)
input->dest_addr = dest_addr;
+#endif
input->control = control;
status = hv_do_hypercall(HVCALL_ASSERT_VIRTUAL_INTERRUPT, input, NULL);
local_irq_restore(flags);
@@ -526,9 +532,9 @@ int hv_call_set_vp_state(u32 vp_index, u64 partition_id,
return ret;
}
-int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
- union hv_input_vtl input_vtl,
- struct page **state_page)
+static int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
+ union hv_input_vtl input_vtl,
+ struct page **state_page)
{
struct hv_input_map_vp_state_page *input;
struct hv_output_map_vp_state_page *output;
@@ -542,12 +548,20 @@ int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+ memset(input, 0, sizeof(*input));
input->partition_id = partition_id;
input->vp_index = vp_index;
input->type = type;
input->input_vtl = input_vtl;
- status = hv_do_hypercall(HVCALL_MAP_VP_STATE_PAGE, input, output);
+ if (*state_page) {
+ input->flags.map_location_provided = 1;
+ input->requested_map_location =
+ page_to_pfn(*state_page);
+ }
+
+ status = hv_do_hypercall(HVCALL_MAP_VP_STATE_PAGE, input,
+ output);
if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
if (hv_result_success(status))
@@ -565,8 +579,41 @@ int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
return ret;
}
-int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
- union hv_input_vtl input_vtl)
+static bool mshv_use_overlay_gpfn(void)
+{
+ return hv_l1vh_partition() &&
+ mshv_root.vmm_caps.vmm_can_provide_overlay_gpfn;
+}
+
+int hv_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
+ union hv_input_vtl input_vtl,
+ struct page **state_page)
+{
+ int ret = 0;
+ struct page *allocated_page = NULL;
+
+ if (mshv_use_overlay_gpfn()) {
+ allocated_page = alloc_page(GFP_KERNEL);
+ if (!allocated_page)
+ return -ENOMEM;
+ *state_page = allocated_page;
+ } else {
+ *state_page = NULL;
+ }
+
+ ret = hv_call_map_vp_state_page(partition_id, vp_index, type, input_vtl,
+ state_page);
+
+ if (ret && allocated_page) {
+ __free_page(allocated_page);
+ *state_page = NULL;
+ }
+
+ return ret;
+}
+
+static int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
+ union hv_input_vtl input_vtl)
{
unsigned long flags;
u64 status;
@@ -590,6 +637,48 @@ int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
return hv_result_to_errno(status);
}
+int hv_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
+ struct page *state_page, union hv_input_vtl input_vtl)
+{
+ int ret = hv_call_unmap_vp_state_page(partition_id, vp_index, type, input_vtl);
+
+ if (mshv_use_overlay_gpfn() && state_page)
+ __free_page(state_page);
+
+ return ret;
+}
+
+int hv_call_get_partition_property_ex(u64 partition_id, u64 property_code,
+ u64 arg, void *property_value,
+ size_t property_value_sz)
+{
+ u64 status;
+ unsigned long flags;
+ struct hv_input_get_partition_property_ex *input;
+ struct hv_output_get_partition_property_ex *output;
+
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+ memset(input, 0, sizeof(*input));
+ input->partition_id = partition_id;
+ input->property_code = property_code;
+ input->arg = arg;
+ status = hv_do_hypercall(HVCALL_GET_PARTITION_PROPERTY_EX, input, output);
+
+ if (!hv_result_success(status)) {
+ local_irq_restore(flags);
+ hv_status_debug(status, "\n");
+ return hv_result_to_errno(status);
+ }
+ memcpy(property_value, &output->property_value, property_value_sz);
+
+ local_irq_restore(flags);
+
+ return 0;
+}
+
int
hv_call_clear_virtual_interrupt(u64 partition_id)
{
@@ -724,9 +813,51 @@ hv_call_notify_port_ring_empty(u32 sint_index)
return hv_result_to_errno(status);
}
-int hv_call_map_stat_page(enum hv_stats_object_type type,
- const union hv_stats_object_identity *identity,
- void **addr)
+static int hv_call_map_stats_page2(enum hv_stats_object_type type,
+ const union hv_stats_object_identity *identity,
+ u64 map_location)
+{
+ unsigned long flags;
+ struct hv_input_map_stats_page2 *input;
+ u64 status;
+ int ret;
+
+ if (!map_location || !mshv_use_overlay_gpfn())
+ return -EINVAL;
+
+ do {
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ memset(input, 0, sizeof(*input));
+ input->type = type;
+ input->identity = *identity;
+ input->map_location = map_location;
+
+ status = hv_do_hypercall(HVCALL_MAP_STATS_PAGE2, input, NULL);
+
+ local_irq_restore(flags);
+
+ ret = hv_result_to_errno(status);
+
+ if (!ret)
+ break;
+
+ if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
+ hv_status_debug(status, "\n");
+ break;
+ }
+
+ ret = hv_call_deposit_pages(NUMA_NO_NODE,
+ hv_current_partition_id, 1);
+ } while (!ret);
+
+ return ret;
+}
+
+static int hv_call_map_stats_page(enum hv_stats_object_type type,
+ const union hv_stats_object_identity *identity,
+ void **addr)
{
unsigned long flags;
struct hv_input_map_stats_page *input;
@@ -765,8 +896,38 @@ int hv_call_map_stat_page(enum hv_stats_object_type type,
return ret;
}
-int hv_call_unmap_stat_page(enum hv_stats_object_type type,
- const union hv_stats_object_identity *identity)
+int hv_map_stats_page(enum hv_stats_object_type type,
+ const union hv_stats_object_identity *identity,
+ void **addr)
+{
+ int ret;
+ struct page *allocated_page = NULL;
+
+ if (!addr)
+ return -EINVAL;
+
+ if (mshv_use_overlay_gpfn()) {
+ allocated_page = alloc_page(GFP_KERNEL);
+ if (!allocated_page)
+ return -ENOMEM;
+
+ ret = hv_call_map_stats_page2(type, identity,
+ page_to_pfn(allocated_page));
+ *addr = page_address(allocated_page);
+ } else {
+ ret = hv_call_map_stats_page(type, identity, addr);
+ }
+
+ if (ret && allocated_page) {
+ __free_page(allocated_page);
+ *addr = NULL;
+ }
+
+ return ret;
+}
+
+static int hv_call_unmap_stats_page(enum hv_stats_object_type type,
+ const union hv_stats_object_identity *identity)
{
unsigned long flags;
struct hv_input_unmap_stats_page *input;
@@ -785,6 +946,19 @@ int hv_call_unmap_stat_page(enum hv_stats_object_type type,
return hv_result_to_errno(status);
}
+int hv_unmap_stats_page(enum hv_stats_object_type type, void *page_addr,
+ const union hv_stats_object_identity *identity)
+{
+ int ret;
+
+ ret = hv_call_unmap_stats_page(type, identity);
+
+ if (mshv_use_overlay_gpfn() && page_addr)
+ __free_page(virt_to_page(page_addr));
+
+ return ret;
+}
+
int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages,
u64 page_struct_count, u32 host_access,
u32 flags, u8 acquire)
diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index e3b2bd417c46..1134a82c7881 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -29,6 +29,7 @@
#include <linux/crash_dump.h>
#include <linux/panic_notifier.h>
#include <linux/vmalloc.h>
+#include <linux/rseq.h>
#include "mshv_eventfd.h"
#include "mshv.h"
@@ -41,7 +42,7 @@ MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv");
/* TODO move this to another file when debugfs code is added */
enum hv_stats_vp_counters { /* HV_THREAD_COUNTER */
#if defined(CONFIG_X86)
- VpRootDispatchThreadBlocked = 201,
+ VpRootDispatchThreadBlocked = 202,
#elif defined(CONFIG_ARM64)
VpRootDispatchThreadBlocked = 94,
#endif
@@ -122,6 +123,7 @@ static struct miscdevice mshv_dev = {
*/
static u16 mshv_passthru_hvcalls[] = {
HVCALL_GET_PARTITION_PROPERTY,
+ HVCALL_GET_PARTITION_PROPERTY_EX,
HVCALL_SET_PARTITION_PROPERTY,
HVCALL_INSTALL_INTERCEPT,
HVCALL_GET_VP_REGISTERS,
@@ -136,6 +138,16 @@ static u16 mshv_passthru_hvcalls[] = {
HVCALL_GET_VP_CPUID_VALUES,
};
+/*
+ * Only allow hypercalls that are safe to be called by the VMM with the host
+ * partition as target (i.e. HV_PARTITION_ID_SELF). Carefully audit that a
+ * hypercall cannot be misused by the VMM before adding it to this list.
+ */
+static u16 mshv_self_passthru_hvcalls[] = {
+ HVCALL_GET_PARTITION_PROPERTY,
+ HVCALL_GET_PARTITION_PROPERTY_EX,
+};
+
static bool mshv_hvcall_is_async(u16 code)
{
switch (code) {
@@ -147,18 +159,38 @@ static bool mshv_hvcall_is_async(u16 code)
return false;
}
+static bool mshv_passthru_hvcall_allowed(u16 code, u64 pt_id)
+{
+ int i;
+ int n = ARRAY_SIZE(mshv_passthru_hvcalls);
+ u16 *allowed_hvcalls = mshv_passthru_hvcalls;
+
+ if (pt_id == HV_PARTITION_ID_SELF) {
+ n = ARRAY_SIZE(mshv_self_passthru_hvcalls);
+ allowed_hvcalls = mshv_self_passthru_hvcalls;
+ }
+
+ for (i = 0; i < n; ++i)
+ if (allowed_hvcalls[i] == code)
+ return true;
+
+ return false;
+}
+
static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition,
bool partition_locked,
void __user *user_args)
{
u64 status;
- int ret = 0, i;
+ int ret = 0;
bool is_async;
struct mshv_root_hvcall args;
struct page *page;
unsigned int pages_order;
void *input_pg = NULL;
void *output_pg = NULL;
+ u16 reps_completed;
+ u64 pt_id = partition ? partition->pt_id : HV_PARTITION_ID_SELF;
if (copy_from_user(&args, user_args, sizeof(args)))
return -EFAULT;
@@ -170,17 +202,13 @@ static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition,
if (args.out_ptr && (!args.out_sz || args.out_sz > HV_HYP_PAGE_SIZE))
return -EINVAL;
- for (i = 0; i < ARRAY_SIZE(mshv_passthru_hvcalls); ++i)
- if (args.code == mshv_passthru_hvcalls[i])
- break;
-
- if (i >= ARRAY_SIZE(mshv_passthru_hvcalls))
+ if (!mshv_passthru_hvcall_allowed(args.code, pt_id))
return -EINVAL;
is_async = mshv_hvcall_is_async(args.code);
if (is_async) {
/* async hypercalls can only be called from partition fd */
- if (!partition_locked)
+ if (!partition || !partition_locked)
return -EINVAL;
ret = mshv_init_async_handler(partition);
if (ret)
@@ -208,43 +236,44 @@ static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition,
* NOTE: This only works because all the allowed hypercalls' input
* structs begin with a u64 partition_id field.
*/
- *(u64 *)input_pg = partition->pt_id;
+ *(u64 *)input_pg = pt_id;
- if (args.reps)
- status = hv_do_rep_hypercall(args.code, args.reps, 0,
- input_pg, output_pg);
- else
- status = hv_do_hypercall(args.code, input_pg, output_pg);
-
- if (hv_result(status) == HV_STATUS_CALL_PENDING) {
- if (is_async) {
- mshv_async_hvcall_handler(partition, &status);
- } else { /* Paranoia check. This shouldn't happen! */
- ret = -EBADFD;
- goto free_pages_out;
+ reps_completed = 0;
+ do {
+ if (args.reps) {
+ status = hv_do_rep_hypercall_ex(args.code, args.reps,
+ 0, reps_completed,
+ input_pg, output_pg);
+ reps_completed = hv_repcomp(status);
+ } else {
+ status = hv_do_hypercall(args.code, input_pg, output_pg);
}
- }
- if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) {
- ret = hv_call_deposit_pages(NUMA_NO_NODE, partition->pt_id, 1);
- if (!ret)
- ret = -EAGAIN;
- } else if (!hv_result_success(status)) {
- ret = hv_result_to_errno(status);
- }
+ if (hv_result(status) == HV_STATUS_CALL_PENDING) {
+ if (is_async) {
+ mshv_async_hvcall_handler(partition, &status);
+ } else { /* Paranoia check. This shouldn't happen! */
+ ret = -EBADFD;
+ goto free_pages_out;
+ }
+ }
+
+ if (hv_result_success(status))
+ break;
+
+ if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY)
+ ret = hv_result_to_errno(status);
+ else
+ ret = hv_call_deposit_pages(NUMA_NO_NODE,
+ pt_id, 1);
+ } while (!ret);
- /*
- * Always return the status and output data regardless of result.
- * The VMM may need it to determine how to proceed. E.g. the status may
- * contain the number of reps completed if a rep hypercall partially
- * succeeded.
- */
args.status = hv_result(status);
- args.reps = args.reps ? hv_repcomp(status) : 0;
+ args.reps = reps_completed;
if (copy_to_user(user_args, &args, sizeof(args)))
ret = -EFAULT;
- if (output_pg &&
+ if (!ret && output_pg &&
copy_to_user((void __user *)args.out_ptr, output_pg, args.out_sz))
ret = -EFAULT;
@@ -560,20 +589,106 @@ static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp)
}
} while (!vp->run.flags.intercept_suspend);
+ rseq_virt_userspace_exit();
+
return ret;
}
static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ,
"sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ");
+static struct mshv_mem_region *
+mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
+{
+ struct mshv_mem_region *region;
+
+ hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) {
+ if (gfn >= region->start_gfn &&
+ gfn < region->start_gfn + region->nr_pages)
+ return region;
+ }
+
+ return NULL;
+}
+
+#ifdef CONFIG_X86_64
+static struct mshv_mem_region *
+mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
+{
+ struct mshv_mem_region *region;
+
+ spin_lock(&p->pt_mem_regions_lock);
+ region = mshv_partition_region_by_gfn(p, gfn);
+ if (!region || !mshv_region_get(region)) {
+ spin_unlock(&p->pt_mem_regions_lock);
+ return NULL;
+ }
+ spin_unlock(&p->pt_mem_regions_lock);
+
+ return region;
+}
+
+/**
+ * mshv_handle_gpa_intercept - Handle GPA (Guest Physical Address) intercepts.
+ * @vp: Pointer to the virtual processor structure.
+ *
+ * This function processes GPA intercepts by identifying the memory region
+ * corresponding to the intercepted GPA, aligning the page offset, and
+ * mapping the required pages. It ensures that the region is valid and
+ * handles faults efficiently by mapping multiple pages at once.
+ *
+ * Return: true if the intercept was handled successfully, false otherwise.
+ */
+static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
+{
+ struct mshv_partition *p = vp->vp_partition;
+ struct mshv_mem_region *region;
+ struct hv_x64_memory_intercept_message *msg;
+ bool ret;
+ u64 gfn;
+
+ msg = (struct hv_x64_memory_intercept_message *)
+ vp->vp_intercept_msg_page->u.payload;
+
+ gfn = HVPFN_DOWN(msg->guest_physical_address);
+
+ region = mshv_partition_region_by_gfn_get(p, gfn);
+ if (!region)
+ return false;
+
+ /* Only movable memory ranges are supported for GPA intercepts */
+ if (region->type == MSHV_REGION_TYPE_MEM_MOVABLE)
+ ret = mshv_region_handle_gfn_fault(region, gfn);
+ else
+ ret = false;
+
+ mshv_region_put(region);
+
+ return ret;
+}
+#else /* CONFIG_X86_64 */
+static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) { return false; }
+#endif /* CONFIG_X86_64 */
+
+static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
+{
+ switch (vp->vp_intercept_msg_page->header.message_type) {
+ case HVMSG_GPA_INTERCEPT:
+ return mshv_handle_gpa_intercept(vp);
+ }
+ return false;
+}
+
static long mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void __user *ret_msg)
{
long rc;
- if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
- rc = mshv_run_vp_with_root_scheduler(vp);
- else
- rc = mshv_run_vp_with_hyp_scheduler(vp);
+ do {
+ if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
+ rc = mshv_run_vp_with_root_scheduler(vp);
+ else
+ rc = mshv_run_vp_with_hyp_scheduler(vp);
+ } while (rc == 0 && mshv_vp_handle_intercept(vp));
if (rc)
return rc;
@@ -841,7 +956,8 @@ mshv_vp_release(struct inode *inode, struct file *filp)
return 0;
}
-static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index)
+static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index,
+ void *stats_pages[])
{
union hv_stats_object_identity identity = {
.vp.partition_id = partition_id,
@@ -849,10 +965,10 @@ static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index)
};
identity.vp.stats_area_type = HV_STATS_AREA_SELF;
- hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity);
+ hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity);
identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
- hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity);
+ hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity);
}
static int mshv_vp_stats_map(u64 partition_id, u32 vp_index,
@@ -865,14 +981,14 @@ static int mshv_vp_stats_map(u64 partition_id, u32 vp_index,
int err;
identity.vp.stats_area_type = HV_STATS_AREA_SELF;
- err = hv_call_map_stat_page(HV_STATS_OBJECT_VP, &identity,
- &stats_pages[HV_STATS_AREA_SELF]);
+ err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
+ &stats_pages[HV_STATS_AREA_SELF]);
if (err)
return err;
identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
- err = hv_call_map_stat_page(HV_STATS_OBJECT_VP, &identity,
- &stats_pages[HV_STATS_AREA_PARENT]);
+ err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
+ &stats_pages[HV_STATS_AREA_PARENT]);
if (err)
goto unmap_self;
@@ -880,7 +996,7 @@ static int mshv_vp_stats_map(u64 partition_id, u32 vp_index,
unmap_self:
identity.vp.stats_area_type = HV_STATS_AREA_SELF;
- hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity);
+ hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity);
return err;
}
@@ -890,7 +1006,7 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
{
struct mshv_create_vp args;
struct mshv_vp *vp;
- struct page *intercept_message_page, *register_page, *ghcb_page;
+ struct page *intercept_msg_page, *register_page, *ghcb_page;
void *stats_pages[2];
long ret;
@@ -908,33 +1024,34 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
if (ret)
return ret;
- ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index,
- HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
- input_vtl_zero,
- &intercept_message_page);
+ ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
+ HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
+ input_vtl_zero, &intercept_msg_page);
if (ret)
goto destroy_vp;
if (!mshv_partition_encrypted(partition)) {
- ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index,
- HV_VP_STATE_PAGE_REGISTERS,
- input_vtl_zero,
- &register_page);
+ ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
+ HV_VP_STATE_PAGE_REGISTERS,
+ input_vtl_zero, &register_page);
if (ret)
goto unmap_intercept_message_page;
}
if (mshv_partition_encrypted(partition) &&
is_ghcb_mapping_available()) {
- ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index,
- HV_VP_STATE_PAGE_GHCB,
- input_vtl_normal,
- &ghcb_page);
+ ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
+ HV_VP_STATE_PAGE_GHCB,
+ input_vtl_normal, &ghcb_page);
if (ret)
goto unmap_register_page;
}
- if (hv_parent_partition()) {
+ /*
+ * This mapping of the stats page is for detecting if dispatch thread
+ * is blocked - only relevant for root scheduler
+ */
+ if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) {
ret = mshv_vp_stats_map(partition->pt_id, args.vp_index,
stats_pages);
if (ret)
@@ -956,14 +1073,14 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
atomic64_set(&vp->run.vp_signaled_count, 0);
vp->vp_index = args.vp_index;
- vp->vp_intercept_msg_page = page_to_virt(intercept_message_page);
+ vp->vp_intercept_msg_page = page_to_virt(intercept_msg_page);
if (!mshv_partition_encrypted(partition))
vp->vp_register_page = page_to_virt(register_page);
if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
vp->vp_ghcb_page = page_to_virt(ghcb_page);
- if (hv_parent_partition())
+ if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages));
/*
@@ -986,24 +1103,22 @@ put_partition:
free_vp:
kfree(vp);
unmap_stats_pages:
- if (hv_parent_partition())
- mshv_vp_stats_unmap(partition->pt_id, args.vp_index);
+ if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
+ mshv_vp_stats_unmap(partition->pt_id, args.vp_index, stats_pages);
unmap_ghcb_page:
- if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) {
- hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index,
- HV_VP_STATE_PAGE_GHCB,
- input_vtl_normal);
- }
+ if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
+ hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
+ HV_VP_STATE_PAGE_GHCB, ghcb_page,
+ input_vtl_normal);
unmap_register_page:
- if (!mshv_partition_encrypted(partition)) {
- hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index,
- HV_VP_STATE_PAGE_REGISTERS,
- input_vtl_zero);
- }
+ if (!mshv_partition_encrypted(partition))
+ hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
+ HV_VP_STATE_PAGE_REGISTERS,
+ register_page, input_vtl_zero);
unmap_intercept_message_page:
- hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index,
- HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
- input_vtl_zero);
+ hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
+ HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
+ intercept_msg_page, input_vtl_zero);
destroy_vp:
hv_call_delete_vp(partition->pt_id, args.vp_index);
return ret;
@@ -1031,162 +1146,6 @@ static void mshv_async_hvcall_handler(void *data, u64 *status)
*status = partition->async_hypercall_status;
}
-static int
-mshv_partition_region_share(struct mshv_mem_region *region)
-{
- u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_SHARED;
-
- if (region->flags.large_pages)
- flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE;
-
- return hv_call_modify_spa_host_access(region->partition->pt_id,
- region->pages, region->nr_pages,
- HV_MAP_GPA_READABLE | HV_MAP_GPA_WRITABLE,
- flags, true);
-}
-
-static int
-mshv_partition_region_unshare(struct mshv_mem_region *region)
-{
- u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE;
-
- if (region->flags.large_pages)
- flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE;
-
- return hv_call_modify_spa_host_access(region->partition->pt_id,
- region->pages, region->nr_pages,
- 0,
- flags, false);
-}
-
-static int
-mshv_region_remap_pages(struct mshv_mem_region *region, u32 map_flags,
- u64 page_offset, u64 page_count)
-{
- if (page_offset + page_count > region->nr_pages)
- return -EINVAL;
-
- if (region->flags.large_pages)
- map_flags |= HV_MAP_GPA_LARGE_PAGE;
-
- /* ask the hypervisor to map guest ram */
- return hv_call_map_gpa_pages(region->partition->pt_id,
- region->start_gfn + page_offset,
- page_count, map_flags,
- region->pages + page_offset);
-}
-
-static int
-mshv_region_map(struct mshv_mem_region *region)
-{
- u32 map_flags = region->hv_map_flags;
-
- return mshv_region_remap_pages(region, map_flags,
- 0, region->nr_pages);
-}
-
-static void
-mshv_region_evict_pages(struct mshv_mem_region *region,
- u64 page_offset, u64 page_count)
-{
- if (region->flags.range_pinned)
- unpin_user_pages(region->pages + page_offset, page_count);
-
- memset(region->pages + page_offset, 0,
- page_count * sizeof(struct page *));
-}
-
-static void
-mshv_region_evict(struct mshv_mem_region *region)
-{
- mshv_region_evict_pages(region, 0, region->nr_pages);
-}
-
-static int
-mshv_region_populate_pages(struct mshv_mem_region *region,
- u64 page_offset, u64 page_count)
-{
- u64 done_count, nr_pages;
- struct page **pages;
- __u64 userspace_addr;
- int ret;
-
- if (page_offset + page_count > region->nr_pages)
- return -EINVAL;
-
- for (done_count = 0; done_count < page_count; done_count += ret) {
- pages = region->pages + page_offset + done_count;
- userspace_addr = region->start_uaddr +
- (page_offset + done_count) *
- HV_HYP_PAGE_SIZE;
- nr_pages = min(page_count - done_count,
- MSHV_PIN_PAGES_BATCH_SIZE);
-
- /*
- * Pinning assuming 4k pages works for large pages too.
- * All page structs within the large page are returned.
- *
- * Pin requests are batched because pin_user_pages_fast
- * with the FOLL_LONGTERM flag does a large temporary
- * allocation of contiguous memory.
- */
- if (region->flags.range_pinned)
- ret = pin_user_pages_fast(userspace_addr,
- nr_pages,
- FOLL_WRITE | FOLL_LONGTERM,
- pages);
- else
- ret = -EOPNOTSUPP;
-
- if (ret < 0)
- goto release_pages;
- }
-
- if (PageHuge(region->pages[page_offset]))
- region->flags.large_pages = true;
-
- return 0;
-
-release_pages:
- mshv_region_evict_pages(region, page_offset, done_count);
- return ret;
-}
-
-static int
-mshv_region_populate(struct mshv_mem_region *region)
-{
- return mshv_region_populate_pages(region, 0, region->nr_pages);
-}
-
-static struct mshv_mem_region *
-mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
-{
- struct mshv_mem_region *region;
-
- hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) {
- if (gfn >= region->start_gfn &&
- gfn < region->start_gfn + region->nr_pages)
- return region;
- }
-
- return NULL;
-}
-
-static struct mshv_mem_region *
-mshv_partition_region_by_uaddr(struct mshv_partition *partition, u64 uaddr)
-{
- struct mshv_mem_region *region;
-
- hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) {
- if (uaddr >= region->start_uaddr &&
- uaddr < region->start_uaddr +
- (region->nr_pages << HV_HYP_PAGE_SHIFT))
- return region;
- }
-
- return NULL;
-}
-
/*
* NB: caller checks and makes sure mem->size is page aligned
* Returns: 0 with regionpp updated on success, or -errno
@@ -1196,53 +1155,61 @@ static int mshv_partition_create_region(struct mshv_partition *partition,
struct mshv_mem_region **regionpp,
bool is_mmio)
{
- struct mshv_mem_region *region;
+ struct mshv_mem_region *rg;
u64 nr_pages = HVPFN_DOWN(mem->size);
/* Reject overlapping regions */
- if (mshv_partition_region_by_gfn(partition, mem->guest_pfn) ||
- mshv_partition_region_by_gfn(partition, mem->guest_pfn + nr_pages - 1) ||
- mshv_partition_region_by_uaddr(partition, mem->userspace_addr) ||
- mshv_partition_region_by_uaddr(partition, mem->userspace_addr + mem->size - 1))
+ spin_lock(&partition->pt_mem_regions_lock);
+ hlist_for_each_entry(rg, &partition->pt_mem_regions, hnode) {
+ if (mem->guest_pfn + nr_pages <= rg->start_gfn ||
+ rg->start_gfn + rg->nr_pages <= mem->guest_pfn)
+ continue;
+ spin_unlock(&partition->pt_mem_regions_lock);
return -EEXIST;
+ }
+ spin_unlock(&partition->pt_mem_regions_lock);
- region = vzalloc(sizeof(*region) + sizeof(struct page *) * nr_pages);
- if (!region)
- return -ENOMEM;
-
- region->nr_pages = nr_pages;
- region->start_gfn = mem->guest_pfn;
- region->start_uaddr = mem->userspace_addr;
- region->hv_map_flags = HV_MAP_GPA_READABLE | HV_MAP_GPA_ADJUSTABLE;
- if (mem->flags & BIT(MSHV_SET_MEM_BIT_WRITABLE))
- region->hv_map_flags |= HV_MAP_GPA_WRITABLE;
- if (mem->flags & BIT(MSHV_SET_MEM_BIT_EXECUTABLE))
- region->hv_map_flags |= HV_MAP_GPA_EXECUTABLE;
+ rg = mshv_region_create(mem->guest_pfn, nr_pages,
+ mem->userspace_addr, mem->flags);
+ if (IS_ERR(rg))
+ return PTR_ERR(rg);
- /* Note: large_pages flag populated when we pin the pages */
- if (!is_mmio)
- region->flags.range_pinned = true;
+ if (is_mmio)
+ rg->type = MSHV_REGION_TYPE_MMIO;
+ else if (mshv_partition_encrypted(partition) ||
+ !mshv_region_movable_init(rg))
+ rg->type = MSHV_REGION_TYPE_MEM_PINNED;
+ else
+ rg->type = MSHV_REGION_TYPE_MEM_MOVABLE;
- region->partition = partition;
+ rg->partition = partition;
- *regionpp = region;
+ *regionpp = rg;
return 0;
}
-/*
- * Map guest ram. if snp, make sure to release that from the host first
- * Side Effects: In case of failure, pages are unpinned when feasible.
+/**
+ * mshv_prepare_pinned_region - Pin and map memory regions
+ * @region: Pointer to the memory region structure
+ *
+ * This function processes memory regions that are explicitly marked as pinned.
+ * Pinned regions are preallocated, mapped upfront, and do not rely on fault-based
+ * population. The function ensures the region is properly populated, handles
+ * encryption requirements for SNP partitions if applicable, maps the region,
+ * and performs necessary sharing or eviction operations based on the mapping
+ * result.
+ *
+ * Return: 0 on success, negative error code on failure.
*/
-static int
-mshv_partition_mem_region_map(struct mshv_mem_region *region)
+static int mshv_prepare_pinned_region(struct mshv_mem_region *region)
{
struct mshv_partition *partition = region->partition;
int ret;
- ret = mshv_region_populate(region);
+ ret = mshv_region_pin(region);
if (ret) {
- pt_err(partition, "Failed to populate memory region: %d\n",
+ pt_err(partition, "Failed to pin memory region: %d\n",
ret);
goto err_out;
}
@@ -1255,12 +1222,12 @@ mshv_partition_mem_region_map(struct mshv_mem_region *region)
* access to guest memory regions.
*/
if (mshv_partition_encrypted(partition)) {
- ret = mshv_partition_region_unshare(region);
+ ret = mshv_region_unshare(region);
if (ret) {
pt_err(partition,
"Failed to unshare memory region (guest_pfn: %llu): %d\n",
region->start_gfn, ret);
- goto evict_region;
+ goto invalidate_region;
}
}
@@ -1268,9 +1235,9 @@ mshv_partition_mem_region_map(struct mshv_mem_region *region)
if (ret && mshv_partition_encrypted(partition)) {
int shrc;
- shrc = mshv_partition_region_share(region);
+ shrc = mshv_region_share(region);
if (!shrc)
- goto evict_region;
+ goto invalidate_region;
pt_err(partition,
"Failed to share memory region (guest_pfn: %llu): %d\n",
@@ -1284,8 +1251,8 @@ mshv_partition_mem_region_map(struct mshv_mem_region *region)
return 0;
-evict_region:
- mshv_region_evict(region);
+invalidate_region:
+ mshv_region_invalidate(region);
err_out:
return ret;
}
@@ -1330,17 +1297,35 @@ mshv_map_user_memory(struct mshv_partition *partition,
if (ret)
return ret;
- if (is_mmio)
- ret = hv_call_map_mmio_pages(partition->pt_id, mem.guest_pfn,
- mmio_pfn, HVPFN_DOWN(mem.size));
- else
- ret = mshv_partition_mem_region_map(region);
+ switch (region->type) {
+ case MSHV_REGION_TYPE_MEM_PINNED:
+ ret = mshv_prepare_pinned_region(region);
+ break;
+ case MSHV_REGION_TYPE_MEM_MOVABLE:
+ /*
+ * For movable memory regions, remap with no access to let
+ * the hypervisor track dirty pages, enabling pre-copy live
+ * migration.
+ */
+ ret = hv_call_map_gpa_pages(partition->pt_id,
+ region->start_gfn,
+ region->nr_pages,
+ HV_MAP_GPA_NO_ACCESS, NULL);
+ break;
+ case MSHV_REGION_TYPE_MMIO:
+ ret = hv_call_map_mmio_pages(partition->pt_id,
+ region->start_gfn,
+ mmio_pfn,
+ region->nr_pages);
+ break;
+ }
if (ret)
goto errout;
- /* Install the new region */
+ spin_lock(&partition->pt_mem_regions_lock);
hlist_add_head(&region->hnode, &partition->pt_mem_regions);
+ spin_unlock(&partition->pt_mem_regions_lock);
return 0;
@@ -1355,33 +1340,32 @@ mshv_unmap_user_memory(struct mshv_partition *partition,
struct mshv_user_mem_region mem)
{
struct mshv_mem_region *region;
- u32 unmap_flags = 0;
if (!(mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP)))
return -EINVAL;
+ spin_lock(&partition->pt_mem_regions_lock);
+
region = mshv_partition_region_by_gfn(partition, mem.guest_pfn);
- if (!region)
- return -EINVAL;
+ if (!region) {
+ spin_unlock(&partition->pt_mem_regions_lock);
+ return -ENOENT;
+ }
/* Paranoia check */
if (region->start_uaddr != mem.userspace_addr ||
region->start_gfn != mem.guest_pfn ||
- region->nr_pages != HVPFN_DOWN(mem.size))
+ region->nr_pages != HVPFN_DOWN(mem.size)) {
+ spin_unlock(&partition->pt_mem_regions_lock);
return -EINVAL;
+ }
hlist_del(&region->hnode);
- if (region->flags.large_pages)
- unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE;
+ spin_unlock(&partition->pt_mem_regions_lock);
- /* ignore unmap failures and continue as process may be exiting */
- hv_call_unmap_gpa_pages(partition->pt_id, region->start_gfn,
- region->nr_pages, unmap_flags);
+ mshv_region_put(region);
- mshv_region_evict(region);
-
- vfree(region);
return 0;
}
@@ -1717,8 +1701,8 @@ static void destroy_partition(struct mshv_partition *partition)
{
struct mshv_vp *vp;
struct mshv_mem_region *region;
- int i, ret;
struct hlist_node *n;
+ int i;
if (refcount_read(&partition->pt_ref_count)) {
pt_err(partition,
@@ -1740,28 +1724,32 @@ static void destroy_partition(struct mshv_partition *partition)
if (!vp)
continue;
- if (hv_parent_partition())
- mshv_vp_stats_unmap(partition->pt_id, vp->vp_index);
+ if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
+ mshv_vp_stats_unmap(partition->pt_id, vp->vp_index,
+ (void **)vp->vp_stats_pages);
if (vp->vp_register_page) {
- (void)hv_call_unmap_vp_state_page(partition->pt_id,
- vp->vp_index,
- HV_VP_STATE_PAGE_REGISTERS,
- input_vtl_zero);
+ (void)hv_unmap_vp_state_page(partition->pt_id,
+ vp->vp_index,
+ HV_VP_STATE_PAGE_REGISTERS,
+ virt_to_page(vp->vp_register_page),
+ input_vtl_zero);
vp->vp_register_page = NULL;
}
- (void)hv_call_unmap_vp_state_page(partition->pt_id,
- vp->vp_index,
- HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
- input_vtl_zero);
+ (void)hv_unmap_vp_state_page(partition->pt_id,
+ vp->vp_index,
+ HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
+ virt_to_page(vp->vp_intercept_msg_page),
+ input_vtl_zero);
vp->vp_intercept_msg_page = NULL;
if (vp->vp_ghcb_page) {
- (void)hv_call_unmap_vp_state_page(partition->pt_id,
- vp->vp_index,
- HV_VP_STATE_PAGE_GHCB,
- input_vtl_normal);
+ (void)hv_unmap_vp_state_page(partition->pt_id,
+ vp->vp_index,
+ HV_VP_STATE_PAGE_GHCB,
+ virt_to_page(vp->vp_ghcb_page),
+ input_vtl_normal);
vp->vp_ghcb_page = NULL;
}
@@ -1778,24 +1766,10 @@ static void destroy_partition(struct mshv_partition *partition)
remove_partition(partition);
- /* Remove regions, regain access to the memory and unpin the pages */
hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions,
hnode) {
hlist_del(&region->hnode);
-
- if (mshv_partition_encrypted(partition)) {
- ret = mshv_partition_region_share(region);
- if (ret) {
- pt_err(partition,
- "Failed to regain access to memory, unpinning user pages will fail and crash the host error: %d\n",
- ret);
- return;
- }
- }
-
- mshv_region_evict(region);
-
- vfree(region);
+ mshv_region_put(region);
}
/* Withdraw and free all pages we deposited */
@@ -1862,43 +1836,117 @@ add_partition(struct mshv_partition *partition)
return 0;
}
-static long
-mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev)
+static_assert(MSHV_NUM_CPU_FEATURES_BANKS ==
+ HV_PARTITION_PROCESSOR_FEATURES_BANKS);
+
+static long mshv_ioctl_process_pt_flags(void __user *user_arg, u64 *pt_flags,
+ struct hv_partition_creation_properties *cr_props,
+ union hv_partition_isolation_properties *isol_props)
{
- struct mshv_create_partition args;
- u64 creation_flags;
- struct hv_partition_creation_properties creation_properties = {};
- union hv_partition_isolation_properties isolation_properties = {};
- struct mshv_partition *partition;
- struct file *file;
- int fd;
- long ret;
+ int i;
+ struct mshv_create_partition_v2 args;
+ union hv_partition_processor_features *disabled_procs;
+ union hv_partition_processor_xsave_features *disabled_xsave;
- if (copy_from_user(&args, user_arg, sizeof(args)))
+ /* First, copy v1 struct in case user is on previous versions */
+ if (copy_from_user(&args, user_arg,
+ sizeof(struct mshv_create_partition)))
return -EFAULT;
if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) ||
args.pt_isolation >= MSHV_PT_ISOLATION_COUNT)
return -EINVAL;
+ disabled_procs = &cr_props->disabled_processor_features;
+ disabled_xsave = &cr_props->disabled_processor_xsave_features;
+
+ /* Check if user provided newer struct with feature fields */
+ if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES)) {
+ if (copy_from_user(&args, user_arg, sizeof(args)))
+ return -EFAULT;
+
+ /* Re-validate v1 fields after second copy_from_user() */
+ if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) ||
+ args.pt_isolation >= MSHV_PT_ISOLATION_COUNT)
+ return -EINVAL;
+
+ if (args.pt_num_cpu_fbanks != MSHV_NUM_CPU_FEATURES_BANKS ||
+ mshv_field_nonzero(args, pt_rsvd) ||
+ mshv_field_nonzero(args, pt_rsvd1))
+ return -EINVAL;
+
+ /*
+ * Note this assumes MSHV_NUM_CPU_FEATURES_BANKS will never
+ * change and equals HV_PARTITION_PROCESSOR_FEATURES_BANKS
+ * (i.e. 2).
+ *
+ * Further banks (index >= 2) will be modifiable as 'early'
+ * properties via the set partition property hypercall.
+ */
+ for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++)
+ disabled_procs->as_uint64[i] = args.pt_cpu_fbanks[i];
+
+#if IS_ENABLED(CONFIG_X86_64)
+ disabled_xsave->as_uint64 = args.pt_disabled_xsave;
+#else
+ /*
+ * In practice this field is ignored on arm64, but safer to
+ * zero it in case it is ever used.
+ */
+ disabled_xsave->as_uint64 = 0;
+
+ if (mshv_field_nonzero(args, pt_rsvd2))
+ return -EINVAL;
+#endif
+ } else {
+ /*
+ * v1 behavior: try to enable everything. The hypervisor will
+ * disable features that are not supported. The banks can be
+ * queried via the get partition property hypercall.
+ */
+ for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++)
+ disabled_procs->as_uint64[i] = 0;
+
+ disabled_xsave->as_uint64 = 0;
+ }
+
/* Only support EXO partitions */
- creation_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION |
- HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED;
+ *pt_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION |
+ HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED;
- if (args.pt_flags & BIT(MSHV_PT_BIT_LAPIC))
- creation_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED;
- if (args.pt_flags & BIT(MSHV_PT_BIT_X2APIC))
- creation_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE;
- if (args.pt_flags & BIT(MSHV_PT_BIT_GPA_SUPER_PAGES))
- creation_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED;
+ if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_LAPIC))
+ *pt_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED;
+ if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_X2APIC))
+ *pt_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE;
+ if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_GPA_SUPER_PAGES))
+ *pt_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED;
+
+ isol_props->as_uint64 = 0;
switch (args.pt_isolation) {
case MSHV_PT_ISOLATION_NONE:
- isolation_properties.isolation_type =
- HV_PARTITION_ISOLATION_TYPE_NONE;
+ isol_props->isolation_type = HV_PARTITION_ISOLATION_TYPE_NONE;
break;
}
+ return 0;
+}
+
+static long
+mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev)
+{
+ u64 creation_flags;
+ struct hv_partition_creation_properties creation_properties;
+ union hv_partition_isolation_properties isolation_properties;
+ struct mshv_partition *partition;
+ long ret;
+
+ ret = mshv_ioctl_process_pt_flags(user_arg, &creation_flags,
+ &creation_properties,
+ &isolation_properties);
+ if (ret)
+ return ret;
+
partition = kzalloc(sizeof(*partition), GFP_KERNEL);
if (!partition)
return -ENOMEM;
@@ -1918,6 +1966,7 @@ mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev)
INIT_HLIST_HEAD(&partition->pt_devices);
+ spin_lock_init(&partition->pt_mem_regions_lock);
INIT_HLIST_HEAD(&partition->pt_mem_regions);
mshv_eventfd_init(partition);
@@ -1938,29 +1987,13 @@ mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev)
goto delete_partition;
ret = mshv_init_async_handler(partition);
- if (ret)
- goto remove_partition;
-
- fd = get_unused_fd_flags(O_CLOEXEC);
- if (fd < 0) {
- ret = fd;
- goto remove_partition;
- }
-
- file = anon_inode_getfile("mshv_partition", &mshv_partition_fops,
- partition, O_RDWR);
- if (IS_ERR(file)) {
- ret = PTR_ERR(file);
- goto put_fd;
+ if (!ret) {
+ ret = FD_ADD(O_CLOEXEC, anon_inode_getfile("mshv_partition",
+ &mshv_partition_fops,
+ partition, O_RDWR));
+ if (ret >= 0)
+ return ret;
}
-
- fd_install(fd, file);
-
- return fd;
-
-put_fd:
- put_unused_fd(fd);
-remove_partition:
remove_partition(partition);
delete_partition:
hv_call_delete_partition(partition->pt_id);
@@ -1981,6 +2014,9 @@ static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl,
case MSHV_CREATE_PARTITION:
return mshv_ioctl_create_partition((void __user *)arg,
misc->this_device);
+ case MSHV_ROOT_HVCALL:
+ return mshv_ioctl_passthru_hvcall(NULL, false,
+ (void __user *)arg);
}
return -ENOTTY;
@@ -2197,6 +2233,22 @@ root_sched_deinit:
return err;
}
+static void mshv_init_vmm_caps(struct device *dev)
+{
+ /*
+ * This can only fail here if HVCALL_GET_PARTITION_PROPERTY_EX or
+ * HV_PARTITION_PROPERTY_VMM_CAPABILITIES are not supported. In that
+ * case it's valid to proceed as if all vmm_caps are disabled (zero).
+ */
+ if (hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF,
+ HV_PARTITION_PROPERTY_VMM_CAPABILITIES,
+ 0, &mshv_root.vmm_caps,
+ sizeof(mshv_root.vmm_caps)))
+ dev_warn(dev, "Unable to get VMM capabilities\n");
+
+ dev_dbg(dev, "vmm_caps = %#llx\n", mshv_root.vmm_caps.as_uint64[0]);
+}
+
static int __init mshv_parent_partition_init(void)
{
int ret;
@@ -2249,6 +2301,8 @@ static int __init mshv_parent_partition_init(void)
if (ret)
goto remove_cpu_state;
+ mshv_init_vmm_caps(dev);
+
ret = mshv_irqfd_wq_init();
if (ret)
goto exit_partition;
diff --git a/drivers/hv/mshv_synic.c b/drivers/hv/mshv_synic.c
index e6b6381b7c36..f8b0337cdc82 100644
--- a/drivers/hv/mshv_synic.c
+++ b/drivers/hv/mshv_synic.c
@@ -394,7 +394,7 @@ unlock_out:
void mshv_isr(void)
{
struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages);
- struct hv_message_page **msg_page = &spages->synic_message_page;
+ struct hv_message_page **msg_page = &spages->hyp_synic_message_page;
struct hv_message *msg;
bool handled;
@@ -456,7 +456,7 @@ int mshv_synic_init(unsigned int cpu)
#endif
union hv_synic_scontrol sctrl;
struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages);
- struct hv_message_page **msg_page = &spages->synic_message_page;
+ struct hv_message_page **msg_page = &spages->hyp_synic_message_page;
struct hv_synic_event_flags_page **event_flags_page =
&spages->synic_event_flags_page;
struct hv_synic_event_ring_page **event_ring_page =
@@ -550,7 +550,7 @@ int mshv_synic_cleanup(unsigned int cpu)
union hv_synic_sirbp sirbp;
union hv_synic_scontrol sctrl;
struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages);
- struct hv_message_page **msg_page = &spages->synic_message_page;
+ struct hv_message_page **msg_page = &spages->hyp_synic_message_page;
struct hv_synic_event_flags_page **event_flags_page =
&spages->synic_event_flags_page;
struct hv_synic_event_ring_page **event_ring_page =
diff --git a/drivers/hv/mshv_vtl.h b/drivers/hv/mshv_vtl.h
new file mode 100644
index 000000000000..a6eea52f7aa2
--- /dev/null
+++ b/drivers/hv/mshv_vtl.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _MSHV_VTL_H
+#define _MSHV_VTL_H
+
+#include <linux/mshv.h>
+#include <linux/types.h>
+
+struct mshv_vtl_run {
+ u32 cancel;
+ u32 vtl_ret_action_size;
+ u32 pad[2];
+ char exit_message[MSHV_MAX_RUN_MSG_SIZE];
+ union {
+ struct mshv_vtl_cpu_context cpu_context;
+
+ /*
+ * Reserving room for the cpu context to grow and to maintain compatibility
+ * with user mode.
+ */
+ char reserved[1024];
+ };
+ char vtl_ret_actions[MSHV_MAX_RUN_MSG_SIZE];
+};
+
+#endif /* _MSHV_VTL_H */
diff --git a/drivers/hv/mshv_vtl_main.c b/drivers/hv/mshv_vtl_main.c
new file mode 100644
index 000000000000..2cebe9de5a5a
--- /dev/null
+++ b/drivers/hv/mshv_vtl_main.c
@@ -0,0 +1,1392 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2023, Microsoft Corporation.
+ *
+ * Author:
+ * Roman Kisel <romank@linux.microsoft.com>
+ * Saurabh Sengar <ssengar@linux.microsoft.com>
+ * Naman Jain <namjain@linux.microsoft.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/miscdevice.h>
+#include <linux/anon_inodes.h>
+#include <linux/cpuhotplug.h>
+#include <linux/count_zeros.h>
+#include <linux/entry-virt.h>
+#include <linux/eventfd.h>
+#include <linux/poll.h>
+#include <linux/file.h>
+#include <linux/vmalloc.h>
+#include <asm/debugreg.h>
+#include <asm/mshyperv.h>
+#include <trace/events/ipi.h>
+#include <uapi/asm/mtrr.h>
+#include <uapi/linux/mshv.h>
+#include <hyperv/hvhdk.h>
+
+#include "../../kernel/fpu/legacy.h"
+#include "mshv.h"
+#include "mshv_vtl.h"
+#include "hyperv_vmbus.h"
+
+MODULE_AUTHOR("Microsoft");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Microsoft Hyper-V VTL Driver");
+
+#define MSHV_ENTRY_REASON_LOWER_VTL_CALL 0x1
+#define MSHV_ENTRY_REASON_INTERRUPT 0x2
+#define MSHV_ENTRY_REASON_INTERCEPT 0x3
+
+#define MSHV_REAL_OFF_SHIFT 16
+#define MSHV_PG_OFF_CPU_MASK (BIT_ULL(MSHV_REAL_OFF_SHIFT) - 1)
+#define MSHV_RUN_PAGE_OFFSET 0
+#define MSHV_REG_PAGE_OFFSET 1
+#define VTL2_VMBUS_SINT_INDEX 7
+
+static struct device *mem_dev;
+
+static struct tasklet_struct msg_dpc;
+static wait_queue_head_t fd_wait_queue;
+static bool has_message;
+static struct eventfd_ctx *flag_eventfds[HV_EVENT_FLAGS_COUNT];
+static DEFINE_MUTEX(flag_lock);
+static bool __read_mostly mshv_has_reg_page;
+
+/* hvcall code is of type u16, allocate a bitmap of size (1 << 16) to accommodate it */
+#define MAX_BITMAP_SIZE ((U16_MAX + 1) / 8)
+
+struct mshv_vtl_hvcall_fd {
+ u8 allow_bitmap[MAX_BITMAP_SIZE];
+ bool allow_map_initialized;
+ /*
+ * Used to protect hvcall setup in IOCTLs
+ */
+ struct mutex init_mutex;
+ struct miscdevice *dev;
+};
+
+struct mshv_vtl_poll_file {
+ struct file *file;
+ wait_queue_entry_t wait;
+ wait_queue_head_t *wqh;
+ poll_table pt;
+ int cpu;
+};
+
+struct mshv_vtl {
+ struct device *module_dev;
+ u64 id;
+};
+
+struct mshv_vtl_per_cpu {
+ struct mshv_vtl_run *run;
+ struct page *reg_page;
+};
+
+/* SYNIC_OVERLAY_PAGE_MSR - internal, identical to hv_synic_simp */
+union hv_synic_overlay_page_msr {
+ u64 as_uint64;
+ struct {
+ u64 enabled: 1;
+ u64 reserved: 11;
+ u64 pfn: 52;
+ } __packed;
+};
+
+static struct mutex mshv_vtl_poll_file_lock;
+static union hv_register_vsm_page_offsets mshv_vsm_page_offsets;
+static union hv_register_vsm_capabilities mshv_vsm_capabilities;
+
+static DEFINE_PER_CPU(struct mshv_vtl_poll_file, mshv_vtl_poll_file);
+static DEFINE_PER_CPU(unsigned long long, num_vtl0_transitions);
+static DEFINE_PER_CPU(struct mshv_vtl_per_cpu, mshv_vtl_per_cpu);
+
+static const union hv_input_vtl input_vtl_zero;
+static const union hv_input_vtl input_vtl_normal = {
+ .use_target_vtl = 1,
+};
+
+static const struct file_operations mshv_vtl_fops;
+
+static long
+mshv_ioctl_create_vtl(void __user *user_arg, struct device *module_dev)
+{
+ struct mshv_vtl *vtl;
+ struct file *file;
+ int fd;
+
+ vtl = kzalloc(sizeof(*vtl), GFP_KERNEL);
+ if (!vtl)
+ return -ENOMEM;
+
+ fd = get_unused_fd_flags(O_CLOEXEC);
+ if (fd < 0) {
+ kfree(vtl);
+ return fd;
+ }
+ file = anon_inode_getfile("mshv_vtl", &mshv_vtl_fops,
+ vtl, O_RDWR);
+ if (IS_ERR(file)) {
+ kfree(vtl);
+ return PTR_ERR(file);
+ }
+ vtl->module_dev = module_dev;
+ fd_install(fd, file);
+
+ return fd;
+}
+
+static long
+mshv_ioctl_check_extension(void __user *user_arg)
+{
+ u32 arg;
+
+ if (copy_from_user(&arg, user_arg, sizeof(arg)))
+ return -EFAULT;
+
+ switch (arg) {
+ case MSHV_CAP_CORE_API_STABLE:
+ return 0;
+ case MSHV_CAP_REGISTER_PAGE:
+ return mshv_has_reg_page;
+ case MSHV_CAP_VTL_RETURN_ACTION:
+ return mshv_vsm_capabilities.return_action_available;
+ case MSHV_CAP_DR6_SHARED:
+ return mshv_vsm_capabilities.dr6_shared;
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static long
+mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
+{
+ struct miscdevice *misc = filp->private_data;
+
+ switch (ioctl) {
+ case MSHV_CHECK_EXTENSION:
+ return mshv_ioctl_check_extension((void __user *)arg);
+ case MSHV_CREATE_VTL:
+ return mshv_ioctl_create_vtl((void __user *)arg, misc->this_device);
+ }
+
+ return -ENOTTY;
+}
+
+static const struct file_operations mshv_dev_fops = {
+ .owner = THIS_MODULE,
+ .unlocked_ioctl = mshv_dev_ioctl,
+ .llseek = noop_llseek,
+};
+
+static struct miscdevice mshv_dev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "mshv",
+ .fops = &mshv_dev_fops,
+ .mode = 0600,
+};
+
+static struct mshv_vtl_run *mshv_vtl_this_run(void)
+{
+ return *this_cpu_ptr(&mshv_vtl_per_cpu.run);
+}
+
+static struct mshv_vtl_run *mshv_vtl_cpu_run(int cpu)
+{
+ return *per_cpu_ptr(&mshv_vtl_per_cpu.run, cpu);
+}
+
+static struct page *mshv_vtl_cpu_reg_page(int cpu)
+{
+ return *per_cpu_ptr(&mshv_vtl_per_cpu.reg_page, cpu);
+}
+
+static void mshv_vtl_configure_reg_page(struct mshv_vtl_per_cpu *per_cpu)
+{
+ struct hv_register_assoc reg_assoc = {};
+ union hv_synic_overlay_page_msr overlay = {};
+ struct page *reg_page;
+
+ reg_page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL);
+ if (!reg_page) {
+ WARN(1, "failed to allocate register page\n");
+ return;
+ }
+
+ overlay.enabled = 1;
+ overlay.pfn = page_to_hvpfn(reg_page);
+ reg_assoc.name = HV_X64_REGISTER_REG_PAGE;
+ reg_assoc.value.reg64 = overlay.as_uint64;
+
+ if (hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
+ 1, input_vtl_zero, &reg_assoc)) {
+ WARN(1, "failed to setup register page\n");
+ __free_page(reg_page);
+ return;
+ }
+
+ per_cpu->reg_page = reg_page;
+ mshv_has_reg_page = true;
+}
+
+static void mshv_vtl_synic_enable_regs(unsigned int cpu)
+{
+ union hv_synic_sint sint;
+
+ sint.as_uint64 = 0;
+ sint.vector = HYPERVISOR_CALLBACK_VECTOR;
+ sint.masked = false;
+ sint.auto_eoi = hv_recommend_using_aeoi();
+
+ /* Enable intercepts */
+ if (!mshv_vsm_capabilities.intercept_page_available)
+ hv_set_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX,
+ sint.as_uint64);
+
+ /* VTL2 Host VSP SINT is (un)masked when the user mode requests that */
+}
+
+static int mshv_vtl_get_vsm_regs(void)
+{
+ struct hv_register_assoc registers[2];
+ int ret, count = 2;
+
+ registers[0].name = HV_REGISTER_VSM_CODE_PAGE_OFFSETS;
+ registers[1].name = HV_REGISTER_VSM_CAPABILITIES;
+
+ ret = hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
+ count, input_vtl_zero, registers);
+ if (ret)
+ return ret;
+
+ mshv_vsm_page_offsets.as_uint64 = registers[0].value.reg64;
+ mshv_vsm_capabilities.as_uint64 = registers[1].value.reg64;
+
+ return ret;
+}
+
+static int mshv_vtl_configure_vsm_partition(struct device *dev)
+{
+ union hv_register_vsm_partition_config config;
+ struct hv_register_assoc reg_assoc;
+
+ config.as_uint64 = 0;
+ config.default_vtl_protection_mask = HV_MAP_GPA_PERMISSIONS_MASK;
+ config.enable_vtl_protection = 1;
+ config.zero_memory_on_reset = 1;
+ config.intercept_vp_startup = 1;
+ config.intercept_cpuid_unimplemented = 1;
+
+ if (mshv_vsm_capabilities.intercept_page_available) {
+ dev_dbg(dev, "using intercept page\n");
+ config.intercept_page = 1;
+ }
+
+ reg_assoc.name = HV_REGISTER_VSM_PARTITION_CONFIG;
+ reg_assoc.value.reg64 = config.as_uint64;
+
+ return hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
+ 1, input_vtl_zero, &reg_assoc);
+}
+
+static void mshv_vtl_vmbus_isr(void)
+{
+ struct hv_per_cpu_context *per_cpu;
+ struct hv_message *msg;
+ u32 message_type;
+ union hv_synic_event_flags *event_flags;
+ struct eventfd_ctx *eventfd;
+ u16 i;
+
+ per_cpu = this_cpu_ptr(hv_context.cpu_context);
+ if (smp_processor_id() == 0) {
+ msg = (struct hv_message *)per_cpu->hyp_synic_message_page + VTL2_VMBUS_SINT_INDEX;
+ message_type = READ_ONCE(msg->header.message_type);
+ if (message_type != HVMSG_NONE)
+ tasklet_schedule(&msg_dpc);
+ }
+
+ event_flags = (union hv_synic_event_flags *)per_cpu->hyp_synic_event_page +
+ VTL2_VMBUS_SINT_INDEX;
+ for_each_set_bit(i, event_flags->flags, HV_EVENT_FLAGS_COUNT) {
+ if (!sync_test_and_clear_bit(i, event_flags->flags))
+ continue;
+ rcu_read_lock();
+ eventfd = READ_ONCE(flag_eventfds[i]);
+ if (eventfd)
+ eventfd_signal(eventfd);
+ rcu_read_unlock();
+ }
+
+ vmbus_isr();
+}
+
+static int mshv_vtl_alloc_context(unsigned int cpu)
+{
+ struct mshv_vtl_per_cpu *per_cpu = this_cpu_ptr(&mshv_vtl_per_cpu);
+
+ per_cpu->run = (struct mshv_vtl_run *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+ if (!per_cpu->run)
+ return -ENOMEM;
+
+ if (mshv_vsm_capabilities.intercept_page_available)
+ mshv_vtl_configure_reg_page(per_cpu);
+
+ mshv_vtl_synic_enable_regs(cpu);
+
+ return 0;
+}
+
+static int mshv_vtl_cpuhp_online;
+
+static int hv_vtl_setup_synic(void)
+{
+ int ret;
+
+ /* Use our isr to first filter out packets destined for userspace */
+ hv_setup_vmbus_handler(mshv_vtl_vmbus_isr);
+
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "hyperv/vtl:online",
+ mshv_vtl_alloc_context, NULL);
+ if (ret < 0) {
+ hv_setup_vmbus_handler(vmbus_isr);
+ return ret;
+ }
+
+ mshv_vtl_cpuhp_online = ret;
+
+ return 0;
+}
+
+static void hv_vtl_remove_synic(void)
+{
+ cpuhp_remove_state(mshv_vtl_cpuhp_online);
+ hv_setup_vmbus_handler(vmbus_isr);
+}
+
+static int vtl_get_vp_register(struct hv_register_assoc *reg)
+{
+ return hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
+ 1, input_vtl_normal, reg);
+}
+
+static int vtl_set_vp_register(struct hv_register_assoc *reg)
+{
+ return hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
+ 1, input_vtl_normal, reg);
+}
+
+static int mshv_vtl_ioctl_add_vtl0_mem(struct mshv_vtl *vtl, void __user *arg)
+{
+ struct mshv_vtl_ram_disposition vtl0_mem;
+ struct dev_pagemap *pgmap;
+ void *addr;
+
+ if (copy_from_user(&vtl0_mem, arg, sizeof(vtl0_mem)))
+ return -EFAULT;
+ /* vtl0_mem.last_pfn is excluded in the pagemap range for VTL0 as per design */
+ if (vtl0_mem.last_pfn <= vtl0_mem.start_pfn) {
+ dev_err(vtl->module_dev, "range start pfn (%llx) > end pfn (%llx)\n",
+ vtl0_mem.start_pfn, vtl0_mem.last_pfn);
+ return -EFAULT;
+ }
+
+ pgmap = kzalloc(sizeof(*pgmap), GFP_KERNEL);
+ if (!pgmap)
+ return -ENOMEM;
+
+ pgmap->ranges[0].start = PFN_PHYS(vtl0_mem.start_pfn);
+ pgmap->ranges[0].end = PFN_PHYS(vtl0_mem.last_pfn) - 1;
+ pgmap->nr_range = 1;
+ pgmap->type = MEMORY_DEVICE_GENERIC;
+
+ /*
+ * Determine the highest page order that can be used for the given memory range.
+ * This works best when the range is aligned; i.e. both the start and the length.
+ */
+ pgmap->vmemmap_shift = count_trailing_zeros(vtl0_mem.start_pfn | vtl0_mem.last_pfn);
+ dev_dbg(vtl->module_dev,
+ "Add VTL0 memory: start: 0x%llx, end_pfn: 0x%llx, page order: %lu\n",
+ vtl0_mem.start_pfn, vtl0_mem.last_pfn, pgmap->vmemmap_shift);
+
+ addr = devm_memremap_pages(mem_dev, pgmap);
+ if (IS_ERR(addr)) {
+ dev_err(vtl->module_dev, "devm_memremap_pages error: %ld\n", PTR_ERR(addr));
+ kfree(pgmap);
+ return -EFAULT;
+ }
+
+ /* Don't free pgmap, since it has to stick around until the memory
+ * is unmapped, which will never happen as there is no scenario
+ * where VTL0 can be released/shutdown without bringing down VTL2.
+ */
+ return 0;
+}
+
+static void mshv_vtl_cancel(int cpu)
+{
+ int here = get_cpu();
+
+ if (here != cpu) {
+ if (!xchg_relaxed(&mshv_vtl_cpu_run(cpu)->cancel, 1))
+ smp_send_reschedule(cpu);
+ } else {
+ WRITE_ONCE(mshv_vtl_this_run()->cancel, 1);
+ }
+ put_cpu();
+}
+
+static int mshv_vtl_poll_file_wake(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key)
+{
+ struct mshv_vtl_poll_file *poll_file = container_of(wait, struct mshv_vtl_poll_file, wait);
+
+ mshv_vtl_cancel(poll_file->cpu);
+
+ return 0;
+}
+
+static void mshv_vtl_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, poll_table *pt)
+{
+ struct mshv_vtl_poll_file *poll_file = container_of(pt, struct mshv_vtl_poll_file, pt);
+
+ WARN_ON(poll_file->wqh);
+ poll_file->wqh = wqh;
+ add_wait_queue(wqh, &poll_file->wait);
+}
+
+static int mshv_vtl_ioctl_set_poll_file(struct mshv_vtl_set_poll_file __user *user_input)
+{
+ struct file *file, *old_file;
+ struct mshv_vtl_poll_file *poll_file;
+ struct mshv_vtl_set_poll_file input;
+
+ if (copy_from_user(&input, user_input, sizeof(input)))
+ return -EFAULT;
+
+ if (input.cpu >= num_possible_cpus() || !cpu_online(input.cpu))
+ return -EINVAL;
+ /*
+ * CPU Hotplug is not supported in VTL2 in OpenHCL, where this kernel driver exists.
+ * CPU is expected to remain online after above cpu_online() check.
+ */
+
+ file = NULL;
+ file = fget(input.fd);
+ if (!file)
+ return -EBADFD;
+
+ poll_file = per_cpu_ptr(&mshv_vtl_poll_file, READ_ONCE(input.cpu));
+ if (!poll_file)
+ return -EINVAL;
+
+ mutex_lock(&mshv_vtl_poll_file_lock);
+
+ if (poll_file->wqh)
+ remove_wait_queue(poll_file->wqh, &poll_file->wait);
+ poll_file->wqh = NULL;
+
+ old_file = poll_file->file;
+ poll_file->file = file;
+ poll_file->cpu = input.cpu;
+
+ if (file) {
+ init_waitqueue_func_entry(&poll_file->wait, mshv_vtl_poll_file_wake);
+ init_poll_funcptr(&poll_file->pt, mshv_vtl_ptable_queue_proc);
+ vfs_poll(file, &poll_file->pt);
+ }
+
+ mutex_unlock(&mshv_vtl_poll_file_lock);
+
+ if (old_file)
+ fput(old_file);
+
+ return 0;
+}
+
+/* Static table mapping register names to their corresponding actions */
+static const struct {
+ enum hv_register_name reg_name;
+ int debug_reg_num; /* -1 if not a debug register */
+ u32 msr_addr; /* 0 if not an MSR */
+} reg_table[] = {
+ /* Debug registers */
+ {HV_X64_REGISTER_DR0, 0, 0},
+ {HV_X64_REGISTER_DR1, 1, 0},
+ {HV_X64_REGISTER_DR2, 2, 0},
+ {HV_X64_REGISTER_DR3, 3, 0},
+ {HV_X64_REGISTER_DR6, 6, 0},
+ /* MTRR MSRs */
+ {HV_X64_REGISTER_MSR_MTRR_CAP, -1, MSR_MTRRcap},
+ {HV_X64_REGISTER_MSR_MTRR_DEF_TYPE, -1, MSR_MTRRdefType},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE0, -1, MTRRphysBase_MSR(0)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE1, -1, MTRRphysBase_MSR(1)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE2, -1, MTRRphysBase_MSR(2)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE3, -1, MTRRphysBase_MSR(3)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE4, -1, MTRRphysBase_MSR(4)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE5, -1, MTRRphysBase_MSR(5)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE6, -1, MTRRphysBase_MSR(6)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE7, -1, MTRRphysBase_MSR(7)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE8, -1, MTRRphysBase_MSR(8)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE9, -1, MTRRphysBase_MSR(9)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEA, -1, MTRRphysBase_MSR(0xa)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEB, -1, MTRRphysBase_MSR(0xb)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEC, -1, MTRRphysBase_MSR(0xc)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASED, -1, MTRRphysBase_MSR(0xd)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEE, -1, MTRRphysBase_MSR(0xe)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEF, -1, MTRRphysBase_MSR(0xf)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK0, -1, MTRRphysMask_MSR(0)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK1, -1, MTRRphysMask_MSR(1)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK2, -1, MTRRphysMask_MSR(2)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK3, -1, MTRRphysMask_MSR(3)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK4, -1, MTRRphysMask_MSR(4)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK5, -1, MTRRphysMask_MSR(5)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK6, -1, MTRRphysMask_MSR(6)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK7, -1, MTRRphysMask_MSR(7)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK8, -1, MTRRphysMask_MSR(8)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK9, -1, MTRRphysMask_MSR(9)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKA, -1, MTRRphysMask_MSR(0xa)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKB, -1, MTRRphysMask_MSR(0xb)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKC, -1, MTRRphysMask_MSR(0xc)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKD, -1, MTRRphysMask_MSR(0xd)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKE, -1, MTRRphysMask_MSR(0xe)},
+ {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKF, -1, MTRRphysMask_MSR(0xf)},
+ {HV_X64_REGISTER_MSR_MTRR_FIX64K00000, -1, MSR_MTRRfix64K_00000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX16K80000, -1, MSR_MTRRfix16K_80000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX16KA0000, -1, MSR_MTRRfix16K_A0000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX4KC0000, -1, MSR_MTRRfix4K_C0000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX4KC8000, -1, MSR_MTRRfix4K_C8000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX4KD0000, -1, MSR_MTRRfix4K_D0000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX4KD8000, -1, MSR_MTRRfix4K_D8000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX4KE0000, -1, MSR_MTRRfix4K_E0000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX4KE8000, -1, MSR_MTRRfix4K_E8000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX4KF0000, -1, MSR_MTRRfix4K_F0000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX4KF8000, -1, MSR_MTRRfix4K_F8000},
+};
+
+static int mshv_vtl_get_set_reg(struct hv_register_assoc *regs, bool set)
+{
+ u64 *reg64;
+ enum hv_register_name gpr_name;
+ int i;
+
+ gpr_name = regs->name;
+ reg64 = &regs->value.reg64;
+
+ /* Search for the register in the table */
+ for (i = 0; i < ARRAY_SIZE(reg_table); i++) {
+ if (reg_table[i].reg_name != gpr_name)
+ continue;
+ if (reg_table[i].debug_reg_num != -1) {
+ /* Handle debug registers */
+ if (gpr_name == HV_X64_REGISTER_DR6 &&
+ !mshv_vsm_capabilities.dr6_shared)
+ goto hypercall;
+ if (set)
+ native_set_debugreg(reg_table[i].debug_reg_num, *reg64);
+ else
+ *reg64 = native_get_debugreg(reg_table[i].debug_reg_num);
+ } else {
+ /* Handle MSRs */
+ if (set)
+ wrmsrl(reg_table[i].msr_addr, *reg64);
+ else
+ rdmsrl(reg_table[i].msr_addr, *reg64);
+ }
+ return 0;
+ }
+
+hypercall:
+ return 1;
+}
+
+static void mshv_vtl_return(struct mshv_vtl_cpu_context *vtl0)
+{
+ struct hv_vp_assist_page *hvp;
+
+ hvp = hv_vp_assist_page[smp_processor_id()];
+
+ /*
+ * Process signal event direct set in the run page, if any.
+ */
+ if (mshv_vsm_capabilities.return_action_available) {
+ u32 offset = READ_ONCE(mshv_vtl_this_run()->vtl_ret_action_size);
+
+ WRITE_ONCE(mshv_vtl_this_run()->vtl_ret_action_size, 0);
+
+ /*
+ * Hypervisor will take care of clearing out the actions
+ * set in the assist page.
+ */
+ memcpy(hvp->vtl_ret_actions,
+ mshv_vtl_this_run()->vtl_ret_actions,
+ min_t(u32, offset, sizeof(hvp->vtl_ret_actions)));
+ }
+
+ mshv_vtl_return_call(vtl0);
+}
+
+static bool mshv_vtl_process_intercept(void)
+{
+ struct hv_per_cpu_context *mshv_cpu;
+ void *synic_message_page;
+ struct hv_message *msg;
+ u32 message_type;
+
+ mshv_cpu = this_cpu_ptr(hv_context.cpu_context);
+ synic_message_page = mshv_cpu->hyp_synic_message_page;
+ if (unlikely(!synic_message_page))
+ return true;
+
+ msg = (struct hv_message *)synic_message_page + HV_SYNIC_INTERCEPTION_SINT_INDEX;
+ message_type = READ_ONCE(msg->header.message_type);
+ if (message_type == HVMSG_NONE)
+ return true;
+
+ memcpy(mshv_vtl_this_run()->exit_message, msg, sizeof(*msg));
+ vmbus_signal_eom(msg, message_type);
+
+ return false;
+}
+
+static int mshv_vtl_ioctl_return_to_lower_vtl(void)
+{
+ preempt_disable();
+ for (;;) {
+ unsigned long irq_flags;
+ struct hv_vp_assist_page *hvp;
+ int ret;
+
+ if (__xfer_to_guest_mode_work_pending()) {
+ preempt_enable();
+ ret = xfer_to_guest_mode_handle_work();
+ if (ret)
+ return ret;
+ preempt_disable();
+ }
+
+ local_irq_save(irq_flags);
+ if (READ_ONCE(mshv_vtl_this_run()->cancel)) {
+ local_irq_restore(irq_flags);
+ preempt_enable();
+ return -EINTR;
+ }
+
+ mshv_vtl_return(&mshv_vtl_this_run()->cpu_context);
+ local_irq_restore(irq_flags);
+
+ hvp = hv_vp_assist_page[smp_processor_id()];
+ this_cpu_inc(num_vtl0_transitions);
+ switch (hvp->vtl_entry_reason) {
+ case MSHV_ENTRY_REASON_INTERRUPT:
+ if (!mshv_vsm_capabilities.intercept_page_available &&
+ likely(!mshv_vtl_process_intercept()))
+ goto done;
+ break;
+
+ case MSHV_ENTRY_REASON_INTERCEPT:
+ WARN_ON(!mshv_vsm_capabilities.intercept_page_available);
+ memcpy(mshv_vtl_this_run()->exit_message, hvp->intercept_message,
+ sizeof(hvp->intercept_message));
+ goto done;
+
+ default:
+ panic("unknown entry reason: %d", hvp->vtl_entry_reason);
+ }
+ }
+
+done:
+ preempt_enable();
+
+ return 0;
+}
+
+static long
+mshv_vtl_ioctl_get_regs(void __user *user_args)
+{
+ struct mshv_vp_registers args;
+ struct hv_register_assoc reg;
+ long ret;
+
+ if (copy_from_user(&args, user_args, sizeof(args)))
+ return -EFAULT;
+
+ /* This IOCTL supports processing only one register at a time. */
+ if (args.count != 1)
+ return -EINVAL;
+
+ if (copy_from_user(&reg, (void __user *)args.regs_ptr,
+ sizeof(reg)))
+ return -EFAULT;
+
+ ret = mshv_vtl_get_set_reg(&reg, false);
+ if (!ret)
+ goto copy_args; /* No need of hypercall */
+ ret = vtl_get_vp_register(&reg);
+ if (ret)
+ return ret;
+
+copy_args:
+ if (copy_to_user((void __user *)args.regs_ptr, &reg, sizeof(reg)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static long
+mshv_vtl_ioctl_set_regs(void __user *user_args)
+{
+ struct mshv_vp_registers args;
+ struct hv_register_assoc reg;
+ long ret;
+
+ if (copy_from_user(&args, user_args, sizeof(args)))
+ return -EFAULT;
+
+ /* This IOCTL supports processing only one register at a time. */
+ if (args.count != 1)
+ return -EINVAL;
+
+ if (copy_from_user(&reg, (void __user *)args.regs_ptr, sizeof(reg)))
+ return -EFAULT;
+
+ ret = mshv_vtl_get_set_reg(&reg, true);
+ if (!ret)
+ return ret; /* No need of hypercall */
+ ret = vtl_set_vp_register(&reg);
+
+ return ret;
+}
+
+static long
+mshv_vtl_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
+{
+ long ret;
+ struct mshv_vtl *vtl = filp->private_data;
+
+ switch (ioctl) {
+ case MSHV_SET_POLL_FILE:
+ ret = mshv_vtl_ioctl_set_poll_file((struct mshv_vtl_set_poll_file __user *)arg);
+ break;
+ case MSHV_GET_VP_REGISTERS:
+ ret = mshv_vtl_ioctl_get_regs((void __user *)arg);
+ break;
+ case MSHV_SET_VP_REGISTERS:
+ ret = mshv_vtl_ioctl_set_regs((void __user *)arg);
+ break;
+ case MSHV_RETURN_TO_LOWER_VTL:
+ ret = mshv_vtl_ioctl_return_to_lower_vtl();
+ break;
+ case MSHV_ADD_VTL0_MEMORY:
+ ret = mshv_vtl_ioctl_add_vtl0_mem(vtl, (void __user *)arg);
+ break;
+ default:
+ dev_err(vtl->module_dev, "invalid vtl ioctl: %#x\n", ioctl);
+ ret = -ENOTTY;
+ }
+
+ return ret;
+}
+
+static vm_fault_t mshv_vtl_fault(struct vm_fault *vmf)
+{
+ struct page *page;
+ int cpu = vmf->pgoff & MSHV_PG_OFF_CPU_MASK;
+ int real_off = vmf->pgoff >> MSHV_REAL_OFF_SHIFT;
+
+ if (!cpu_online(cpu))
+ return VM_FAULT_SIGBUS;
+ /*
+ * CPU Hotplug is not supported in VTL2 in OpenHCL, where this kernel driver exists.
+ * CPU is expected to remain online after above cpu_online() check.
+ */
+
+ if (real_off == MSHV_RUN_PAGE_OFFSET) {
+ page = virt_to_page(mshv_vtl_cpu_run(cpu));
+ } else if (real_off == MSHV_REG_PAGE_OFFSET) {
+ if (!mshv_has_reg_page)
+ return VM_FAULT_SIGBUS;
+ page = mshv_vtl_cpu_reg_page(cpu);
+ } else {
+ return VM_FAULT_NOPAGE;
+ }
+
+ get_page(page);
+ vmf->page = page;
+
+ return 0;
+}
+
+static const struct vm_operations_struct mshv_vtl_vm_ops = {
+ .fault = mshv_vtl_fault,
+};
+
+static int mshv_vtl_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ vma->vm_ops = &mshv_vtl_vm_ops;
+
+ return 0;
+}
+
+static int mshv_vtl_release(struct inode *inode, struct file *filp)
+{
+ struct mshv_vtl *vtl = filp->private_data;
+
+ kfree(vtl);
+
+ return 0;
+}
+
+static const struct file_operations mshv_vtl_fops = {
+ .owner = THIS_MODULE,
+ .unlocked_ioctl = mshv_vtl_ioctl,
+ .release = mshv_vtl_release,
+ .mmap = mshv_vtl_mmap,
+};
+
+static void mshv_vtl_synic_mask_vmbus_sint(const u8 *mask)
+{
+ union hv_synic_sint sint;
+
+ sint.as_uint64 = 0;
+ sint.vector = HYPERVISOR_CALLBACK_VECTOR;
+ sint.masked = (*mask != 0);
+ sint.auto_eoi = hv_recommend_using_aeoi();
+
+ hv_set_msr(HV_MSR_SINT0 + VTL2_VMBUS_SINT_INDEX,
+ sint.as_uint64);
+
+ if (!sint.masked)
+ pr_debug("%s: Unmasking VTL2 VMBUS SINT on VP %d\n", __func__, smp_processor_id());
+ else
+ pr_debug("%s: Masking VTL2 VMBUS SINT on VP %d\n", __func__, smp_processor_id());
+}
+
+static void mshv_vtl_read_remote(void *buffer)
+{
+ struct hv_per_cpu_context *mshv_cpu = this_cpu_ptr(hv_context.cpu_context);
+ struct hv_message *msg = (struct hv_message *)mshv_cpu->hyp_synic_message_page +
+ VTL2_VMBUS_SINT_INDEX;
+ u32 message_type = READ_ONCE(msg->header.message_type);
+
+ WRITE_ONCE(has_message, false);
+ if (message_type == HVMSG_NONE)
+ return;
+
+ memcpy(buffer, msg, sizeof(*msg));
+ vmbus_signal_eom(msg, message_type);
+}
+
+static bool vtl_synic_mask_vmbus_sint_masked = true;
+
+static ssize_t mshv_vtl_sint_read(struct file *filp, char __user *arg, size_t size, loff_t *offset)
+{
+ struct hv_message msg = {};
+ int ret;
+
+ if (size < sizeof(msg))
+ return -EINVAL;
+
+ for (;;) {
+ smp_call_function_single(VMBUS_CONNECT_CPU, mshv_vtl_read_remote, &msg, true);
+ if (msg.header.message_type != HVMSG_NONE)
+ break;
+
+ if (READ_ONCE(vtl_synic_mask_vmbus_sint_masked))
+ return 0; /* EOF */
+
+ if (filp->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+
+ ret = wait_event_interruptible(fd_wait_queue,
+ READ_ONCE(has_message) ||
+ READ_ONCE(vtl_synic_mask_vmbus_sint_masked));
+ if (ret)
+ return ret;
+ }
+
+ if (copy_to_user(arg, &msg, sizeof(msg)))
+ return -EFAULT;
+
+ return sizeof(msg);
+}
+
+static __poll_t mshv_vtl_sint_poll(struct file *filp, poll_table *wait)
+{
+ __poll_t mask = 0;
+
+ poll_wait(filp, &fd_wait_queue, wait);
+ if (READ_ONCE(has_message) || READ_ONCE(vtl_synic_mask_vmbus_sint_masked))
+ mask |= EPOLLIN | EPOLLRDNORM;
+
+ return mask;
+}
+
+static void mshv_vtl_sint_on_msg_dpc(unsigned long data)
+{
+ WRITE_ONCE(has_message, true);
+ wake_up_interruptible_poll(&fd_wait_queue, EPOLLIN);
+}
+
+static int mshv_vtl_sint_ioctl_post_msg(struct mshv_vtl_sint_post_msg __user *arg)
+{
+ struct mshv_vtl_sint_post_msg message;
+ u8 payload[HV_MESSAGE_PAYLOAD_BYTE_COUNT];
+
+ if (copy_from_user(&message, arg, sizeof(message)))
+ return -EFAULT;
+ if (message.payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT)
+ return -EINVAL;
+ if (copy_from_user(payload, (void __user *)message.payload_ptr,
+ message.payload_size))
+ return -EFAULT;
+
+ return hv_post_message((union hv_connection_id)message.connection_id,
+ message.message_type, (void *)payload,
+ message.payload_size);
+}
+
+static int mshv_vtl_sint_ioctl_signal_event(struct mshv_vtl_signal_event __user *arg)
+{
+ u64 input, status;
+ struct mshv_vtl_signal_event signal_event;
+
+ if (copy_from_user(&signal_event, arg, sizeof(signal_event)))
+ return -EFAULT;
+
+ input = signal_event.connection_id | ((u64)signal_event.flag << 32);
+
+ status = hv_do_fast_hypercall8(HVCALL_SIGNAL_EVENT, input);
+
+ return hv_result_to_errno(status);
+}
+
+static int mshv_vtl_sint_ioctl_set_eventfd(struct mshv_vtl_set_eventfd __user *arg)
+{
+ struct mshv_vtl_set_eventfd set_eventfd;
+ struct eventfd_ctx *eventfd, *old_eventfd;
+
+ if (copy_from_user(&set_eventfd, arg, sizeof(set_eventfd)))
+ return -EFAULT;
+ if (set_eventfd.flag >= HV_EVENT_FLAGS_COUNT)
+ return -EINVAL;
+
+ eventfd = NULL;
+ if (set_eventfd.fd >= 0) {
+ eventfd = eventfd_ctx_fdget(set_eventfd.fd);
+ if (IS_ERR(eventfd))
+ return PTR_ERR(eventfd);
+ }
+
+ guard(mutex)(&flag_lock);
+ old_eventfd = READ_ONCE(flag_eventfds[set_eventfd.flag]);
+ WRITE_ONCE(flag_eventfds[set_eventfd.flag], eventfd);
+
+ if (old_eventfd) {
+ synchronize_rcu();
+ eventfd_ctx_put(old_eventfd);
+ }
+
+ return 0;
+}
+
+static int mshv_vtl_sint_ioctl_pause_msg_stream(struct mshv_sint_mask __user *arg)
+{
+ static DEFINE_MUTEX(vtl2_vmbus_sint_mask_mutex);
+ struct mshv_sint_mask mask;
+
+ if (copy_from_user(&mask, arg, sizeof(mask)))
+ return -EFAULT;
+ guard(mutex)(&vtl2_vmbus_sint_mask_mutex);
+ on_each_cpu((smp_call_func_t)mshv_vtl_synic_mask_vmbus_sint, &mask.mask, 1);
+ WRITE_ONCE(vtl_synic_mask_vmbus_sint_masked, mask.mask != 0);
+ if (mask.mask)
+ wake_up_interruptible_poll(&fd_wait_queue, EPOLLIN);
+
+ return 0;
+}
+
+static long mshv_vtl_sint_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case MSHV_SINT_POST_MESSAGE:
+ return mshv_vtl_sint_ioctl_post_msg((struct mshv_vtl_sint_post_msg __user *)arg);
+ case MSHV_SINT_SIGNAL_EVENT:
+ return mshv_vtl_sint_ioctl_signal_event((struct mshv_vtl_signal_event __user *)arg);
+ case MSHV_SINT_SET_EVENTFD:
+ return mshv_vtl_sint_ioctl_set_eventfd((struct mshv_vtl_set_eventfd __user *)arg);
+ case MSHV_SINT_PAUSE_MESSAGE_STREAM:
+ return mshv_vtl_sint_ioctl_pause_msg_stream((struct mshv_sint_mask __user *)arg);
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+
+static const struct file_operations mshv_vtl_sint_ops = {
+ .owner = THIS_MODULE,
+ .read = mshv_vtl_sint_read,
+ .poll = mshv_vtl_sint_poll,
+ .unlocked_ioctl = mshv_vtl_sint_ioctl,
+};
+
+static struct miscdevice mshv_vtl_sint_dev = {
+ .name = "mshv_sint",
+ .fops = &mshv_vtl_sint_ops,
+ .mode = 0600,
+ .minor = MISC_DYNAMIC_MINOR,
+};
+
+static int mshv_vtl_hvcall_dev_open(struct inode *node, struct file *f)
+{
+ struct miscdevice *dev = f->private_data;
+ struct mshv_vtl_hvcall_fd *fd;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ fd = vzalloc(sizeof(*fd));
+ if (!fd)
+ return -ENOMEM;
+ fd->dev = dev;
+ f->private_data = fd;
+ mutex_init(&fd->init_mutex);
+
+ return 0;
+}
+
+static int mshv_vtl_hvcall_dev_release(struct inode *node, struct file *f)
+{
+ struct mshv_vtl_hvcall_fd *fd;
+
+ fd = f->private_data;
+ if (fd) {
+ vfree(fd);
+ f->private_data = NULL;
+ }
+
+ return 0;
+}
+
+static int mshv_vtl_hvcall_do_setup(struct mshv_vtl_hvcall_fd *fd,
+ struct mshv_vtl_hvcall_setup __user *hvcall_setup_user)
+{
+ struct mshv_vtl_hvcall_setup hvcall_setup;
+
+ guard(mutex)(&fd->init_mutex);
+
+ if (fd->allow_map_initialized) {
+ dev_err(fd->dev->this_device,
+ "Hypercall allow map has already been set, pid %d\n",
+ current->pid);
+ return -EINVAL;
+ }
+
+ if (copy_from_user(&hvcall_setup, hvcall_setup_user,
+ sizeof(struct mshv_vtl_hvcall_setup))) {
+ return -EFAULT;
+ }
+ if (hvcall_setup.bitmap_array_size > ARRAY_SIZE(fd->allow_bitmap))
+ return -EINVAL;
+
+ if (copy_from_user(&fd->allow_bitmap,
+ (void __user *)hvcall_setup.allow_bitmap_ptr,
+ hvcall_setup.bitmap_array_size)) {
+ return -EFAULT;
+ }
+
+ dev_info(fd->dev->this_device, "Hypercall allow map has been set, pid %d\n",
+ current->pid);
+ fd->allow_map_initialized = true;
+ return 0;
+}
+
+static bool mshv_vtl_hvcall_is_allowed(struct mshv_vtl_hvcall_fd *fd, u16 call_code)
+{
+ return test_bit(call_code, (unsigned long *)fd->allow_bitmap);
+}
+
+static int mshv_vtl_hvcall_call(struct mshv_vtl_hvcall_fd *fd,
+ struct mshv_vtl_hvcall __user *hvcall_user)
+{
+ struct mshv_vtl_hvcall hvcall;
+ void *in, *out;
+ int ret;
+
+ if (copy_from_user(&hvcall, hvcall_user, sizeof(struct mshv_vtl_hvcall)))
+ return -EFAULT;
+ if (hvcall.input_size > HV_HYP_PAGE_SIZE)
+ return -EINVAL;
+ if (hvcall.output_size > HV_HYP_PAGE_SIZE)
+ return -EINVAL;
+
+ /*
+ * By default, all hypercalls are not allowed.
+ * The user mode code has to set up the allow bitmap once.
+ */
+
+ if (!mshv_vtl_hvcall_is_allowed(fd, hvcall.control & 0xFFFF)) {
+ dev_err(fd->dev->this_device,
+ "Hypercall with control data %#llx isn't allowed\n",
+ hvcall.control);
+ return -EPERM;
+ }
+
+ /*
+ * This may create a problem for Confidential VM (CVM) usecase where we need to use
+ * Hyper-V driver allocated per-cpu input and output pages (hyperv_pcpu_input_arg and
+ * hyperv_pcpu_output_arg) for making a hypervisor call.
+ *
+ * TODO: Take care of this when CVM support is added.
+ */
+ in = (void *)__get_free_page(GFP_KERNEL);
+ out = (void *)__get_free_page(GFP_KERNEL);
+
+ if (copy_from_user(in, (void __user *)hvcall.input_ptr, hvcall.input_size)) {
+ ret = -EFAULT;
+ goto free_pages;
+ }
+
+ hvcall.status = hv_do_hypercall(hvcall.control, in, out);
+
+ if (copy_to_user((void __user *)hvcall.output_ptr, out, hvcall.output_size)) {
+ ret = -EFAULT;
+ goto free_pages;
+ }
+ ret = put_user(hvcall.status, &hvcall_user->status);
+free_pages:
+ free_page((unsigned long)in);
+ free_page((unsigned long)out);
+
+ return ret;
+}
+
+static long mshv_vtl_hvcall_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
+{
+ struct mshv_vtl_hvcall_fd *fd = f->private_data;
+
+ switch (cmd) {
+ case MSHV_HVCALL_SETUP:
+ return mshv_vtl_hvcall_do_setup(fd, (struct mshv_vtl_hvcall_setup __user *)arg);
+ case MSHV_HVCALL:
+ return mshv_vtl_hvcall_call(fd, (struct mshv_vtl_hvcall __user *)arg);
+ default:
+ break;
+ }
+
+ return -ENOIOCTLCMD;
+}
+
+static const struct file_operations mshv_vtl_hvcall_dev_file_ops = {
+ .owner = THIS_MODULE,
+ .open = mshv_vtl_hvcall_dev_open,
+ .release = mshv_vtl_hvcall_dev_release,
+ .unlocked_ioctl = mshv_vtl_hvcall_dev_ioctl,
+};
+
+static struct miscdevice mshv_vtl_hvcall_dev = {
+ .name = "mshv_hvcall",
+ .nodename = "mshv_hvcall",
+ .fops = &mshv_vtl_hvcall_dev_file_ops,
+ .mode = 0600,
+ .minor = MISC_DYNAMIC_MINOR,
+};
+
+static int mshv_vtl_low_open(struct inode *inodep, struct file *filp)
+{
+ pid_t pid = task_pid_vnr(current);
+ uid_t uid = current_uid().val;
+ int ret = 0;
+
+ pr_debug("%s: Opening VTL low, task group %d, uid %d\n", __func__, pid, uid);
+
+ if (capable(CAP_SYS_ADMIN)) {
+ filp->private_data = inodep;
+ } else {
+ pr_err("%s: VTL low open failed: CAP_SYS_ADMIN required. task group %d, uid %d",
+ __func__, pid, uid);
+ ret = -EPERM;
+ }
+
+ return ret;
+}
+
+static bool can_fault(struct vm_fault *vmf, unsigned long size, unsigned long *pfn)
+{
+ unsigned long mask = size - 1;
+ unsigned long start = vmf->address & ~mask;
+ unsigned long end = start + size;
+ bool is_valid;
+
+ is_valid = (vmf->address & mask) == ((vmf->pgoff << PAGE_SHIFT) & mask) &&
+ start >= vmf->vma->vm_start &&
+ end <= vmf->vma->vm_end;
+
+ if (is_valid)
+ *pfn = vmf->pgoff & ~(mask >> PAGE_SHIFT);
+
+ return is_valid;
+}
+
+static vm_fault_t mshv_vtl_low_huge_fault(struct vm_fault *vmf, unsigned int order)
+{
+ unsigned long pfn = vmf->pgoff;
+ vm_fault_t ret = VM_FAULT_FALLBACK;
+
+ switch (order) {
+ case 0:
+ return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
+
+ case PMD_ORDER:
+ if (can_fault(vmf, PMD_SIZE, &pfn))
+ ret = vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
+ return ret;
+
+ case PUD_ORDER:
+ if (can_fault(vmf, PUD_SIZE, &pfn))
+ ret = vmf_insert_pfn_pud(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
+ return ret;
+
+ default:
+ return VM_FAULT_SIGBUS;
+ }
+}
+
+static vm_fault_t mshv_vtl_low_fault(struct vm_fault *vmf)
+{
+ return mshv_vtl_low_huge_fault(vmf, 0);
+}
+
+static const struct vm_operations_struct mshv_vtl_low_vm_ops = {
+ .fault = mshv_vtl_low_fault,
+ .huge_fault = mshv_vtl_low_huge_fault,
+};
+
+static int mshv_vtl_low_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ vma->vm_ops = &mshv_vtl_low_vm_ops;
+ vm_flags_set(vma, VM_HUGEPAGE | VM_MIXEDMAP);
+
+ return 0;
+}
+
+static const struct file_operations mshv_vtl_low_file_ops = {
+ .owner = THIS_MODULE,
+ .open = mshv_vtl_low_open,
+ .mmap = mshv_vtl_low_mmap,
+};
+
+static struct miscdevice mshv_vtl_low = {
+ .name = "mshv_vtl_low",
+ .nodename = "mshv_vtl_low",
+ .fops = &mshv_vtl_low_file_ops,
+ .mode = 0600,
+ .minor = MISC_DYNAMIC_MINOR,
+};
+
+static int __init mshv_vtl_init(void)
+{
+ int ret;
+ struct device *dev = mshv_dev.this_device;
+
+ /*
+ * This creates /dev/mshv which provides functionality to create VTLs and partitions.
+ */
+ ret = misc_register(&mshv_dev);
+ if (ret) {
+ dev_err(dev, "mshv device register failed: %d\n", ret);
+ goto free_dev;
+ }
+
+ tasklet_init(&msg_dpc, mshv_vtl_sint_on_msg_dpc, 0);
+ init_waitqueue_head(&fd_wait_queue);
+
+ if (mshv_vtl_get_vsm_regs()) {
+ dev_emerg(dev, "Unable to get VSM capabilities !!\n");
+ ret = -ENODEV;
+ goto free_dev;
+ }
+ if (mshv_vtl_configure_vsm_partition(dev)) {
+ dev_emerg(dev, "VSM configuration failed !!\n");
+ ret = -ENODEV;
+ goto free_dev;
+ }
+
+ mshv_vtl_return_call_init(mshv_vsm_page_offsets.vtl_return_offset);
+ ret = hv_vtl_setup_synic();
+ if (ret)
+ goto free_dev;
+
+ /*
+ * mshv_sint device adds VMBus relay ioctl support.
+ * This provides a channel for VTL0 to communicate with VTL2.
+ */
+ ret = misc_register(&mshv_vtl_sint_dev);
+ if (ret)
+ goto free_synic;
+
+ /*
+ * mshv_hvcall device adds interface to enable userspace for direct hypercalls support.
+ */
+ ret = misc_register(&mshv_vtl_hvcall_dev);
+ if (ret)
+ goto free_sint;
+
+ /*
+ * mshv_vtl_low device is used to map VTL0 address space to a user-mode process in VTL2.
+ * It implements mmap() to allow a user-mode process in VTL2 to map to the address of VTL0.
+ */
+ ret = misc_register(&mshv_vtl_low);
+ if (ret)
+ goto free_hvcall;
+
+ /*
+ * "mshv vtl mem dev" device is later used to setup VTL0 memory.
+ */
+ mem_dev = kzalloc(sizeof(*mem_dev), GFP_KERNEL);
+ if (!mem_dev) {
+ ret = -ENOMEM;
+ goto free_low;
+ }
+
+ mutex_init(&mshv_vtl_poll_file_lock);
+
+ device_initialize(mem_dev);
+ dev_set_name(mem_dev, "mshv vtl mem dev");
+ ret = device_add(mem_dev);
+ if (ret) {
+ dev_err(dev, "mshv vtl mem dev add: %d\n", ret);
+ goto free_mem;
+ }
+
+ return 0;
+
+free_mem:
+ kfree(mem_dev);
+free_low:
+ misc_deregister(&mshv_vtl_low);
+free_hvcall:
+ misc_deregister(&mshv_vtl_hvcall_dev);
+free_sint:
+ misc_deregister(&mshv_vtl_sint_dev);
+free_synic:
+ hv_vtl_remove_synic();
+free_dev:
+ misc_deregister(&mshv_dev);
+
+ return ret;
+}
+
+static void __exit mshv_vtl_exit(void)
+{
+ device_del(mem_dev);
+ kfree(mem_dev);
+ misc_deregister(&mshv_vtl_low);
+ misc_deregister(&mshv_vtl_hvcall_dev);
+ misc_deregister(&mshv_vtl_sint_dev);
+ hv_vtl_remove_synic();
+ misc_deregister(&mshv_dev);
+}
+
+module_init(mshv_vtl_init);
+module_exit(mshv_vtl_exit);
diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c
index 23ce1fb70de1..3c421a7f78c0 100644
--- a/drivers/hv/ring_buffer.c
+++ b/drivers/hv/ring_buffer.c
@@ -184,7 +184,8 @@ void hv_ringbuffer_pre_init(struct vmbus_channel *channel)
/* Initialize the ring buffer. */
int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
- struct page *pages, u32 page_cnt, u32 max_pkt_size)
+ struct page *pages, u32 page_cnt, u32 max_pkt_size,
+ bool confidential)
{
struct page **pages_wraparound;
int i;
@@ -208,7 +209,7 @@ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
ring_info->ring_buffer = (struct hv_ring_buffer *)
vmap(pages_wraparound, page_cnt * 2 - 1, VM_MAP,
- pgprot_decrypted(PAGE_KERNEL));
+ confidential ? PAGE_KERNEL : pgprot_decrypted(PAGE_KERNEL));
kfree(pages_wraparound);
if (!ring_info->ring_buffer)
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 69591dc7bad2..a53af6fe81a6 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -36,6 +36,7 @@
#include <linux/syscore_ops.h>
#include <linux/dma-map-ops.h>
#include <linux/pci.h>
+#include <linux/export.h>
#include <clocksource/hyperv_timer.h>
#include <asm/mshyperv.h>
#include "hyperv_vmbus.h"
@@ -57,6 +58,18 @@ int vmbus_irq;
int vmbus_interrupt;
/*
+ * If the Confidential VMBus is used, the data on the "wire" is not
+ * visible to either the host or the hypervisor.
+ */
+static bool is_confidential;
+
+bool vmbus_is_confidential(void)
+{
+ return is_confidential;
+}
+EXPORT_SYMBOL_GPL(vmbus_is_confidential);
+
+/*
* The panic notifier below is responsible solely for unloading the
* vmbus connection, which is necessary in a panic event.
*
@@ -1045,12 +1058,9 @@ static void vmbus_onmessage_work(struct work_struct *work)
kfree(ctx);
}
-void vmbus_on_msg_dpc(unsigned long data)
+static void __vmbus_on_msg_dpc(void *message_page_addr)
{
- struct hv_per_cpu_context *hv_cpu = (void *)data;
- void *page_addr = hv_cpu->synic_message_page;
- struct hv_message msg_copy, *msg = (struct hv_message *)page_addr +
- VMBUS_MESSAGE_SINT;
+ struct hv_message msg_copy, *msg;
struct vmbus_channel_message_header *hdr;
enum vmbus_channel_message_type msgtype;
const struct vmbus_channel_message_table_entry *entry;
@@ -1058,6 +1068,10 @@ void vmbus_on_msg_dpc(unsigned long data)
__u8 payload_size;
u32 message_type;
+ if (!message_page_addr)
+ return;
+ msg = (struct hv_message *)message_page_addr + VMBUS_MESSAGE_SINT;
+
/*
* 'enum vmbus_channel_message_type' is supposed to always be 'u32' as
* it is being used in 'struct vmbus_channel_message_header' definition
@@ -1183,6 +1197,14 @@ msg_handled:
vmbus_signal_eom(msg, message_type);
}
+void vmbus_on_msg_dpc(unsigned long data)
+{
+ struct hv_per_cpu_context *hv_cpu = (void *)data;
+
+ __vmbus_on_msg_dpc(hv_cpu->hyp_synic_message_page);
+ __vmbus_on_msg_dpc(hv_cpu->para_synic_message_page);
+}
+
#ifdef CONFIG_PM_SLEEP
/*
* Fake RESCIND_CHANNEL messages to clean up hv_sock channels by force for
@@ -1221,21 +1243,19 @@ static void vmbus_force_channel_rescinded(struct vmbus_channel *channel)
#endif /* CONFIG_PM_SLEEP */
/*
- * Schedule all channels with events pending
+ * Schedule all channels with events pending.
+ * The event page can be directly checked to get the id of
+ * the channel that has the interrupt pending.
*/
-static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu)
+static void vmbus_chan_sched(void *event_page_addr)
{
unsigned long *recv_int_page;
u32 maxbits, relid;
+ union hv_synic_event_flags *event;
- /*
- * The event page can be directly checked to get the id of
- * the channel that has the interrupt pending.
- */
- void *page_addr = hv_cpu->synic_event_page;
- union hv_synic_event_flags *event
- = (union hv_synic_event_flags *)page_addr +
- VMBUS_MESSAGE_SINT;
+ if (!event_page_addr)
+ return;
+ event = (union hv_synic_event_flags *)event_page_addr + VMBUS_MESSAGE_SINT;
maxbits = HV_EVENT_FLAGS_COUNT;
recv_int_page = event->flags;
@@ -1243,6 +1263,11 @@ static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu)
if (unlikely(!recv_int_page))
return;
+ /*
+ * Suggested-by: Michael Kelley <mhklinux@outlook.com>
+ * One possible optimization would be to keep track of the largest relID that's in use,
+ * and only scan up to that relID.
+ */
for_each_set_bit(relid, recv_int_page, maxbits) {
void (*callback_fn)(void *context);
struct vmbus_channel *channel;
@@ -1306,29 +1331,39 @@ sched_unlock_rcu:
}
}
-static void vmbus_isr(void)
+static void vmbus_message_sched(struct hv_per_cpu_context *hv_cpu, void *message_page_addr)
{
- struct hv_per_cpu_context *hv_cpu
- = this_cpu_ptr(hv_context.cpu_context);
- void *page_addr;
struct hv_message *msg;
- vmbus_chan_sched(hv_cpu);
-
- page_addr = hv_cpu->synic_message_page;
- msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT;
+ if (!message_page_addr)
+ return;
+ msg = (struct hv_message *)message_page_addr + VMBUS_MESSAGE_SINT;
/* Check if there are actual msgs to be processed */
if (msg->header.message_type != HVMSG_NONE) {
if (msg->header.message_type == HVMSG_TIMER_EXPIRED) {
hv_stimer0_isr();
vmbus_signal_eom(msg, HVMSG_TIMER_EXPIRED);
- } else
+ } else {
tasklet_schedule(&hv_cpu->msg_dpc);
+ }
}
+}
+
+void vmbus_isr(void)
+{
+ struct hv_per_cpu_context *hv_cpu
+ = this_cpu_ptr(hv_context.cpu_context);
+
+ vmbus_chan_sched(hv_cpu->hyp_synic_event_page);
+ vmbus_chan_sched(hv_cpu->para_synic_event_page);
+
+ vmbus_message_sched(hv_cpu, hv_cpu->hyp_synic_message_page);
+ vmbus_message_sched(hv_cpu, hv_cpu->para_synic_message_page);
add_interrupt_randomness(vmbus_interrupt);
}
+EXPORT_SYMBOL_FOR_MODULES(vmbus_isr, "mshv_vtl");
static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id)
{
@@ -1343,6 +1378,59 @@ static void vmbus_percpu_work(struct work_struct *work)
hv_synic_init(cpu);
}
+static int vmbus_alloc_synic_and_connect(void)
+{
+ int ret, cpu;
+ struct work_struct __percpu *works;
+ int hyperv_cpuhp_online;
+
+ ret = hv_synic_alloc();
+ if (ret < 0)
+ goto err_alloc;
+
+ works = alloc_percpu(struct work_struct);
+ if (!works) {
+ ret = -ENOMEM;
+ goto err_alloc;
+ }
+
+ /*
+ * Initialize the per-cpu interrupt state and stimer state.
+ * Then connect to the host.
+ */
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ struct work_struct *work = per_cpu_ptr(works, cpu);
+
+ INIT_WORK(work, vmbus_percpu_work);
+ schedule_work_on(cpu, work);
+ }
+
+ for_each_online_cpu(cpu)
+ flush_work(per_cpu_ptr(works, cpu));
+
+ /* Register the callbacks for possible CPU online/offline'ing */
+ ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online",
+ hv_synic_init, hv_synic_cleanup);
+ cpus_read_unlock();
+ free_percpu(works);
+ if (ret < 0)
+ goto err_alloc;
+ hyperv_cpuhp_online = ret;
+
+ ret = vmbus_connect();
+ if (ret)
+ goto err_connect;
+ return 0;
+
+err_connect:
+ cpuhp_remove_state(hyperv_cpuhp_online);
+ return -ENODEV;
+err_alloc:
+ hv_synic_free();
+ return -ENOMEM;
+}
+
/*
* vmbus_bus_init -Main vmbus driver initialization routine.
*
@@ -1353,8 +1441,7 @@ static void vmbus_percpu_work(struct work_struct *work)
*/
static int vmbus_bus_init(void)
{
- int ret, cpu;
- struct work_struct __percpu *works;
+ int ret;
ret = hv_init();
if (ret != 0) {
@@ -1389,41 +1476,15 @@ static int vmbus_bus_init(void)
}
}
- ret = hv_synic_alloc();
- if (ret)
- goto err_alloc;
-
- works = alloc_percpu(struct work_struct);
- if (!works) {
- ret = -ENOMEM;
- goto err_alloc;
- }
-
/*
- * Initialize the per-cpu interrupt state and stimer state.
- * Then connect to the host.
+ * Cache the value as getting it involves a VM exit on x86(_64), and
+ * doing that on each VP while initializing SynIC's wastes time.
*/
- cpus_read_lock();
- for_each_online_cpu(cpu) {
- struct work_struct *work = per_cpu_ptr(works, cpu);
-
- INIT_WORK(work, vmbus_percpu_work);
- schedule_work_on(cpu, work);
- }
-
- for_each_online_cpu(cpu)
- flush_work(per_cpu_ptr(works, cpu));
-
- /* Register the callbacks for possible CPU online/offline'ing */
- ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online",
- hv_synic_init, hv_synic_cleanup);
- cpus_read_unlock();
- free_percpu(works);
- if (ret < 0)
- goto err_alloc;
- hyperv_cpuhp_online = ret;
-
- ret = vmbus_connect();
+ is_confidential = ms_hyperv.confidential_vmbus_available;
+ if (is_confidential)
+ pr_info("Establishing connection to the confidential VMBus\n");
+ hv_para_set_sint_proxy(!is_confidential);
+ ret = vmbus_alloc_synic_and_connect();
if (ret)
goto err_connect;
@@ -1439,9 +1500,6 @@ static int vmbus_bus_init(void)
return 0;
err_connect:
- cpuhp_remove_state(hyperv_cpuhp_online);
-err_alloc:
- hv_synic_free();
if (vmbus_irq == -1) {
hv_remove_vmbus_handler();
} else {
@@ -2798,10 +2856,10 @@ static void hv_crash_handler(struct pt_regs *regs)
*/
cpu = smp_processor_id();
hv_stimer_cleanup(cpu);
- hv_synic_disable_regs(cpu);
+ hv_hyp_synic_disable_regs(cpu);
};
-static int hv_synic_suspend(void)
+static int hv_synic_suspend(void *data)
{
/*
* When we reach here, all the non-boot CPUs have been offlined.
@@ -2823,14 +2881,14 @@ static int hv_synic_suspend(void)
* interrupts-disabled context.
*/
- hv_synic_disable_regs(0);
+ hv_hyp_synic_disable_regs(0);
return 0;
}
-static void hv_synic_resume(void)
+static void hv_synic_resume(void *data)
{
- hv_synic_enable_regs(0);
+ hv_hyp_synic_enable_regs(0);
/*
* Note: we don't need to call hv_stimer_init(0), because the timer
@@ -2840,11 +2898,15 @@ static void hv_synic_resume(void)
}
/* The callbacks run only on CPU0, with irqs_disabled. */
-static struct syscore_ops hv_synic_syscore_ops = {
+static const struct syscore_ops hv_synic_syscore_ops = {
.suspend = hv_synic_suspend,
.resume = hv_synic_resume,
};
+static struct syscore hv_synic_syscore = {
+ .ops = &hv_synic_syscore_ops,
+};
+
static int __init hv_acpi_init(void)
{
int ret;
@@ -2887,7 +2949,7 @@ static int __init hv_acpi_init(void)
hv_setup_kexec_handler(hv_kexec_handler);
hv_setup_crash_handler(hv_crash_handler);
- register_syscore_ops(&hv_synic_syscore_ops);
+ register_syscore(&hv_synic_syscore);
return 0;
@@ -2901,7 +2963,7 @@ static void __exit vmbus_exit(void)
{
int cpu;
- unregister_syscore_ops(&hv_synic_syscore_ops);
+ unregister_syscore(&hv_synic_syscore);
hv_remove_kexec_handler();
hv_remove_crash_handler();