Diffstat (limited to 'drivers/hv')
-rw-r--r--  drivers/hv/Kconfig             |   17
-rw-r--r--  drivers/hv/Makefile            |    6
-rw-r--r--  drivers/hv/channel.c           |   65
-rw-r--r--  drivers/hv/channel_mgmt.c      |   76
-rw-r--r--  drivers/hv/connection.c        |    4
-rw-r--r--  drivers/hv/hv.c                |  137
-rw-r--r--  drivers/hv/hv_balloon.c        |  216
-rw-r--r--  drivers/hv/hv_common.c         |  227
-rw-r--r--  drivers/hv/hv_fcopy.c          |  427
-rw-r--r--  drivers/hv/hv_kvp.c            |   12
-rw-r--r--  drivers/hv/hv_proc.c           |  195
-rw-r--r--  drivers/hv/hv_snapshot.c       |   11
-rw-r--r--  drivers/hv/hv_util.c           |   25
-rw-r--r--  drivers/hv/hyperv_vmbus.h      |   35
-rw-r--r--  drivers/hv/mshv.h              |   30
-rw-r--r--  drivers/hv/mshv_common.c       |  161
-rw-r--r--  drivers/hv/mshv_eventfd.c      |  833
-rw-r--r--  drivers/hv/mshv_eventfd.h      |   71
-rw-r--r--  drivers/hv/mshv_irq.c          |  124
-rw-r--r--  drivers/hv/mshv_portid_table.c |   83
-rw-r--r--  drivers/hv/mshv_root.h         |  311
-rw-r--r--  drivers/hv/mshv_root_hv_call.c |  849
-rw-r--r--  drivers/hv/mshv_root_main.c    | 2307
-rw-r--r--  drivers/hv/mshv_synic.c        |  665
-rw-r--r--  drivers/hv/vmbus_drv.c         |  256
25 files changed, 6344 insertions, 799 deletions
diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
index 862c47b191af..6c1416167bd2 100644
--- a/drivers/hv/Kconfig
+++ b/drivers/hv/Kconfig
@@ -55,4 +55,21 @@ config HYPERV_BALLOON
help
Select this option to enable Hyper-V Balloon driver.
+config MSHV_ROOT
+ tristate "Microsoft Hyper-V root partition support"
+ depends on HYPERV && (X86_64 || ARM64)
+ depends on !HYPERV_VTL_MODE
+ # The hypervisor interface operates on 4k pages. Enforcing it here
+ # simplifies many assumptions in the root partition code.
+ # e.g. When withdrawing memory, the hypervisor gives back 4k pages in
+ # no particular order, making it impossible to reassemble larger pages
+ depends on PAGE_SIZE_4KB
+ select EVENTFD
+ default n
+ help
+ Select this option to enable support for booting and running as root
+ partition on Microsoft Hyper-V.
+
+ If unsure, say N.
+
endmenu
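The PAGE_SIZE_4KB dependency above is what lets the root-partition code equate guest pages with the hypervisor's 4 KiB ABI pages. A minimal sketch of how code might assert that assumption at build time; the helper name is made up for illustration and is not part of this patch:

    #include <linux/build_bug.h>
    #include <linux/mm.h>
    #include <asm/mshyperv.h>

    /*
     * Illustrative compile-time check of the assumption the Kconfig enforces:
     * with PAGE_SIZE_4KB, a guest page and a hypervisor ABI page are the
     * same size, so PFNs can be handed to the hypervisor one-for-one.
     */
    static inline void example_assert_page_sizes(void)
    {
    	BUILD_BUG_ON(PAGE_SIZE != HV_HYP_PAGE_SIZE);
    }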
diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile
index d76df5c8c2a9..976189c725dc 100644
--- a/drivers/hv/Makefile
+++ b/drivers/hv/Makefile
@@ -2,6 +2,7 @@
obj-$(CONFIG_HYPERV) += hv_vmbus.o
obj-$(CONFIG_HYPERV_UTILS) += hv_utils.o
obj-$(CONFIG_HYPERV_BALLOON) += hv_balloon.o
+obj-$(CONFIG_MSHV_ROOT) += mshv_root.o
CFLAGS_hv_trace.o = -I$(src)
CFLAGS_hv_balloon.o = -I$(src)
@@ -10,7 +11,10 @@ hv_vmbus-y := vmbus_drv.o \
hv.o connection.o channel.o \
channel_mgmt.o ring_buffer.o hv_trace.o
hv_vmbus-$(CONFIG_HYPERV_TESTING) += hv_debugfs.o
-hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_fcopy.o hv_utils_transport.o
+hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_utils_transport.o
+mshv_root-y := mshv_root_main.o mshv_synic.o mshv_eventfd.o mshv_irq.o \
+ mshv_root_hv_call.o mshv_portid_table.o
# Code that must be built-in
obj-$(subst m,y,$(CONFIG_HYPERV)) += hv_common.o
+obj-$(subst m,y,$(CONFIG_MSHV_ROOT)) += hv_proc.o mshv_common.o
diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index fb8cd8469328..35f26fa1ffe7 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -1077,68 +1077,10 @@ int vmbus_sendpacket(struct vmbus_channel *channel, void *buffer,
EXPORT_SYMBOL(vmbus_sendpacket);
/*
- * vmbus_sendpacket_pagebuffer - Send a range of single-page buffer
- * packets using a GPADL Direct packet type. This interface allows you
- * to control notifying the host. This will be useful for sending
- * batched data. Also the sender can control the send flags
- * explicitly.
- */
-int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel,
- struct hv_page_buffer pagebuffers[],
- u32 pagecount, void *buffer, u32 bufferlen,
- u64 requestid)
-{
- int i;
- struct vmbus_channel_packet_page_buffer desc;
- u32 descsize;
- u32 packetlen;
- u32 packetlen_aligned;
- struct kvec bufferlist[3];
- u64 aligned_data = 0;
-
- if (pagecount > MAX_PAGE_BUFFER_COUNT)
- return -EINVAL;
-
- /*
- * Adjust the size down since vmbus_channel_packet_page_buffer is the
- * largest size we support
- */
- descsize = sizeof(struct vmbus_channel_packet_page_buffer) -
- ((MAX_PAGE_BUFFER_COUNT - pagecount) *
- sizeof(struct hv_page_buffer));
- packetlen = descsize + bufferlen;
- packetlen_aligned = ALIGN(packetlen, sizeof(u64));
-
- /* Setup the descriptor */
- desc.type = VM_PKT_DATA_USING_GPA_DIRECT;
- desc.flags = VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED;
- desc.dataoffset8 = descsize >> 3; /* in 8-bytes granularity */
- desc.length8 = (u16)(packetlen_aligned >> 3);
- desc.transactionid = VMBUS_RQST_ERROR; /* will be updated in hv_ringbuffer_write() */
- desc.reserved = 0;
- desc.rangecount = pagecount;
-
- for (i = 0; i < pagecount; i++) {
- desc.range[i].len = pagebuffers[i].len;
- desc.range[i].offset = pagebuffers[i].offset;
- desc.range[i].pfn = pagebuffers[i].pfn;
- }
-
- bufferlist[0].iov_base = &desc;
- bufferlist[0].iov_len = descsize;
- bufferlist[1].iov_base = buffer;
- bufferlist[1].iov_len = bufferlen;
- bufferlist[2].iov_base = &aligned_data;
- bufferlist[2].iov_len = (packetlen_aligned - packetlen);
-
- return hv_ringbuffer_write(channel, bufferlist, 3, requestid, NULL);
-}
-EXPORT_SYMBOL_GPL(vmbus_sendpacket_pagebuffer);
-
-/*
- * vmbus_sendpacket_multipagebuffer - Send a multi-page buffer packet
+ * vmbus_sendpacket_mpb_desc - Send one or more multi-page buffer packets
* using a GPADL Direct packet type.
- * The buffer includes the vmbus descriptor.
+ * The desc argument must include space for the VMBus descriptor. The
+ * rangecount field must already be set.
*/
int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel,
struct vmbus_packet_mpb_array *desc,
@@ -1160,7 +1102,6 @@ int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel,
desc->length8 = (u16)(packetlen_aligned >> 3);
desc->transactionid = VMBUS_RQST_ERROR; /* will be updated in hv_ringbuffer_write() */
desc->reserved = 0;
- desc->rangecount = 1;
bufferlist[0].iov_base = desc;
bufferlist[0].iov_len = desc_size;
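With desc->rangecount = 1 removed from the core helper, every caller of vmbus_sendpacket_mpb_desc() must fill in the range count itself. A hedged sketch of a hypothetical caller sending a single page; the function name and single-range layout are illustrative, not taken from this patch:

    #include <linux/hyperv.h>
    #include <linux/slab.h>

    /* Illustrative only: send one page via a GPADL-direct (MPB) packet. */
    static int example_send_one_page(struct vmbus_channel *channel,
    				 struct page *page, u64 requestid)
    {
    	u32 desc_size = sizeof(struct vmbus_packet_mpb_array) + sizeof(u64);
    	struct vmbus_packet_mpb_array *desc;
    	int ret;

    	desc = kzalloc(desc_size, GFP_ATOMIC);
    	if (!desc)
    		return -ENOMEM;

    	desc->rangecount = 1;		/* now set by the caller, not the core */
    	desc->range.offset = 0;
    	desc->range.len = HV_HYP_PAGE_SIZE;
    	desc->range.pfn_array[0] = page_to_hvpfn(page);

    	ret = vmbus_sendpacket_mpb_desc(channel, desc, desc_size,
    					NULL, 0, requestid);
    	kfree(desc);
    	return ret;
    }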
diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 2f4d09ce027a..6e084c207414 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -120,7 +120,9 @@ const struct vmbus_device vmbus_devs[] = {
},
/* File copy */
- { .dev_type = HV_FCOPY,
+ /* fcopy always uses a 16 KB ring buffer, which has worked well for many years */
+ { .pref_ring_size = 0x4000,
+ .dev_type = HV_FCOPY,
HV_FCOPY_GUID,
.perf_device = false,
.allowed_in_isolated = false,
@@ -140,12 +142,19 @@ const struct vmbus_device vmbus_devs[] = {
.allowed_in_isolated = false,
},
- /* Unknown GUID */
- { .dev_type = HV_UNKNOWN,
+ /*
+ * Unknown GUID
+ * A 64 KB ring buffer plus a 4 KB header should be a sufficient size for any Hyper-V
+ * device apart from HV_NIC and HV_SCSI. This avoids the fallback that would allocate a
+ * much bigger (2 MB) ring for unknown devices.
+ */
+ { .pref_ring_size = 0x11000,
+ .dev_type = HV_UNKNOWN,
.perf_device = false,
.allowed_in_isolated = false,
},
};
+EXPORT_SYMBOL_GPL(vmbus_devs);
static const struct {
guid_t guid;
@@ -935,16 +944,6 @@ void vmbus_initiate_unload(bool crash)
vmbus_wait_for_unload();
}
-static void check_ready_for_resume_event(void)
-{
- /*
- * If all the old primary channels have been fixed up, then it's safe
- * to resume.
- */
- if (atomic_dec_and_test(&vmbus_connection.nr_chan_fixup_on_resume))
- complete(&vmbus_connection.ready_for_resume_event);
-}
-
static void vmbus_setup_channel_state(struct vmbus_channel *channel,
struct vmbus_channel_offer_channel *offer)
{
@@ -1100,8 +1099,6 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr)
/* Add the channel back to the array of channels. */
vmbus_channel_map_relid(oldchannel);
- check_ready_for_resume_event();
-
mutex_unlock(&vmbus_connection.channel_mutex);
return;
}
@@ -1287,13 +1284,28 @@ EXPORT_SYMBOL_GPL(vmbus_hvsock_device_unregister);
/*
* vmbus_onoffers_delivered -
- * This is invoked when all offers have been delivered.
+ * The CHANNELMSG_ALLOFFERS_DELIVERED message arrives after all
+ * boot-time offers are delivered. A boot-time offer is for the primary
+ * channel for any virtual hardware configured in the VM at the time it boots.
+ * Boot-time offers include offers for physical devices assigned to the VM
+ * via Hyper-V's Discrete Device Assignment (DDA) functionality that are
+ * handled as virtual PCI devices in Linux (e.g., NVMe devices and GPUs).
+ * Boot-time offers do not include offers for VMBus sub-channels. Because
+ * devices can be hot-added to the VM after it is booted, additional channel
+ * offers that aren't boot-time offers can be received at any time after the
+ * all-offers-delivered message.
*
- * Nothing to do here.
+ * SR-IOV NIC Virtual Functions (VFs) assigned to a VM are not considered
+ * to be assigned to the VM at boot-time, and offers for VFs may occur after
+ * the all-offers-delivered message. VFs are optional accelerators to the
+ * synthetic VMBus NIC and are effectively hot-added only after the VMBus
+ * NIC channel is opened (once it knows the guest can support it, via the
+ * sriov bit in the netvsc protocol).
*/
static void vmbus_onoffers_delivered(
struct vmbus_channel_message_header *hdr)
{
+ complete(&vmbus_connection.all_offers_delivered_event);
}
/*
@@ -1569,7 +1581,8 @@ void vmbus_onmessage(struct vmbus_channel_message_header *hdr)
}
/*
- * vmbus_request_offers - Send a request to get all our pending offers.
+ * vmbus_request_offers - Send a request to get all our pending offers
+ * and wait for all boot-time offers to arrive.
*/
int vmbus_request_offers(void)
{
@@ -1587,6 +1600,10 @@ int vmbus_request_offers(void)
msg->msgtype = CHANNELMSG_REQUESTOFFERS;
+ /*
+ * This REQUESTOFFERS message will result in the host sending an all
+ * offers delivered message after all the boot-time offers are sent.
+ */
ret = vmbus_post_msg(msg, sizeof(struct vmbus_channel_message_header),
true);
@@ -1598,6 +1615,29 @@ int vmbus_request_offers(void)
goto cleanup;
}
+ /*
+ * Wait for the host to send all boot-time offers.
+ * This is a best-effort mechanism: if a timeout occurs, a warning is
+ * printed and execution resumes.
+ */
+ if (!wait_for_completion_timeout(&vmbus_connection.all_offers_delivered_event,
+ secs_to_jiffies(60))) {
+ pr_warn("timed out waiting for all boot-time offers to be delivered.\n");
+ }
+
+ /*
+ * Flush handling of offer messages (which may initiate work on
+ * other work queues).
+ */
+ flush_workqueue(vmbus_connection.work_queue);
+
+ /*
+ * Flush workqueue for processing the incoming offers. Subchannel
+ * offers and their processing can happen later, so there is no need to
+ * flush that workqueue here.
+ */
+ flush_workqueue(vmbus_connection.handle_primary_chan_wq);
+
cleanup:
kfree(msginfo);
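The wait added above is the usual completion-with-timeout pattern: the offer-message handler signals the completion, and vmbus_request_offers() waits on it with a 60-second best-effort timeout. A stripped-down, generic sketch of the same pattern; the names are illustrative, not from this patch:

    #include <linux/completion.h>
    #include <linux/jiffies.h>
    #include <linux/printk.h>

    static DECLARE_COMPLETION(example_all_offers_delivered_event);

    /* Message-handling context: host says all boot-time offers were sent. */
    static void example_on_all_offers_delivered(void)
    {
    	complete(&example_all_offers_delivered_event);
    }

    /* Request path: wait up to 60 seconds, then continue best-effort. */
    static void example_wait_for_boot_time_offers(void)
    {
    	if (!wait_for_completion_timeout(&example_all_offers_delivered_event,
    					 secs_to_jiffies(60)))
    		pr_warn("timed out waiting for boot-time offers\n");
    }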
diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index f001ae880e1d..8351360bba16 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -34,8 +34,8 @@ struct vmbus_connection vmbus_connection = {
.ready_for_suspend_event = COMPLETION_INITIALIZER(
vmbus_connection.ready_for_suspend_event),
- .ready_for_resume_event = COMPLETION_INITIALIZER(
- vmbus_connection.ready_for_resume_event),
+ .all_offers_delivered_event = COMPLETION_INITIALIZER(
+ vmbus_connection.all_offers_delivered_event),
};
EXPORT_SYMBOL_GPL(vmbus_connection);
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index a8ad728354cb..308c8f279df8 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -45,8 +45,8 @@ int hv_init(void)
* This involves a hypercall.
*/
int hv_post_message(union hv_connection_id connection_id,
- enum hv_message_type message_type,
- void *payload, size_t payload_size)
+ enum hv_message_type message_type,
+ void *payload, size_t payload_size)
{
struct hv_input_post_message *aligned_msg;
unsigned long flags;
@@ -86,7 +86,7 @@ int hv_post_message(union hv_connection_id connection_id,
status = HV_STATUS_INVALID_PARAMETER;
} else {
status = hv_do_hypercall(HVCALL_POST_MESSAGE,
- aligned_msg, NULL);
+ aligned_msg, NULL);
}
local_irq_restore(flags);
@@ -111,7 +111,7 @@ int hv_synic_alloc(void)
hv_context.hv_numa_map = kcalloc(nr_node_ids, sizeof(struct cpumask),
GFP_KERNEL);
- if (hv_context.hv_numa_map == NULL) {
+ if (!hv_context.hv_numa_map) {
pr_err("Unable to allocate NUMA map\n");
goto err;
}
@@ -120,11 +120,11 @@ int hv_synic_alloc(void)
hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu);
tasklet_init(&hv_cpu->msg_dpc,
- vmbus_on_msg_dpc, (unsigned long) hv_cpu);
+ vmbus_on_msg_dpc, (unsigned long)hv_cpu);
if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) {
hv_cpu->post_msg_page = (void *)get_zeroed_page(GFP_ATOMIC);
- if (hv_cpu->post_msg_page == NULL) {
+ if (!hv_cpu->post_msg_page) {
pr_err("Unable to allocate post msg page\n");
goto err;
}
@@ -144,17 +144,17 @@ int hv_synic_alloc(void)
* Synic message and event pages are allocated by paravisor.
* Skip these pages allocation here.
*/
- if (!ms_hyperv.paravisor_present && !hv_root_partition) {
+ if (!ms_hyperv.paravisor_present && !hv_root_partition()) {
hv_cpu->synic_message_page =
(void *)get_zeroed_page(GFP_ATOMIC);
- if (hv_cpu->synic_message_page == NULL) {
+ if (!hv_cpu->synic_message_page) {
pr_err("Unable to allocate SYNIC message page\n");
goto err;
}
hv_cpu->synic_event_page =
(void *)get_zeroed_page(GFP_ATOMIC);
- if (hv_cpu->synic_event_page == NULL) {
+ if (!hv_cpu->synic_event_page) {
pr_err("Unable to allocate SYNIC event page\n");
free_page((unsigned long)hv_cpu->synic_message_page);
@@ -203,14 +203,13 @@ err:
return ret;
}
-
void hv_synic_free(void)
{
int cpu, ret;
for_each_present_cpu(cpu) {
- struct hv_per_cpu_context *hv_cpu
- = per_cpu_ptr(hv_context.cpu_context, cpu);
+ struct hv_per_cpu_context *hv_cpu =
+ per_cpu_ptr(hv_context.cpu_context, cpu);
/* It's better to leak the page if the encryption fails. */
if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) {
@@ -262,8 +261,8 @@ void hv_synic_free(void)
*/
void hv_synic_enable_regs(unsigned int cpu)
{
- struct hv_per_cpu_context *hv_cpu
- = per_cpu_ptr(hv_context.cpu_context, cpu);
+ struct hv_per_cpu_context *hv_cpu =
+ per_cpu_ptr(hv_context.cpu_context, cpu);
union hv_synic_simp simp;
union hv_synic_siefp siefp;
union hv_synic_sint shared_sint;
@@ -273,12 +272,12 @@ void hv_synic_enable_regs(unsigned int cpu)
simp.as_uint64 = hv_get_msr(HV_MSR_SIMP);
simp.simp_enabled = 1;
- if (ms_hyperv.paravisor_present || hv_root_partition) {
+ if (ms_hyperv.paravisor_present || hv_root_partition()) {
/* Mask out vTOM bit. ioremap_cache() maps decrypted */
u64 base = (simp.base_simp_gpa << HV_HYP_PAGE_SHIFT) &
~ms_hyperv.shared_gpa_boundary;
- hv_cpu->synic_message_page
- = (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE);
+ hv_cpu->synic_message_page =
+ (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE);
if (!hv_cpu->synic_message_page)
pr_err("Fail to map synic message page.\n");
} else {
@@ -292,12 +291,12 @@ void hv_synic_enable_regs(unsigned int cpu)
siefp.as_uint64 = hv_get_msr(HV_MSR_SIEFP);
siefp.siefp_enabled = 1;
- if (ms_hyperv.paravisor_present || hv_root_partition) {
+ if (ms_hyperv.paravisor_present || hv_root_partition()) {
/* Mask out vTOM bit. ioremap_cache() maps decrypted */
u64 base = (siefp.base_siefp_gpa << HV_HYP_PAGE_SHIFT) &
~ms_hyperv.shared_gpa_boundary;
- hv_cpu->synic_event_page
- = (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE);
+ hv_cpu->synic_event_page =
+ (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE);
if (!hv_cpu->synic_event_page)
pr_err("Fail to map synic event page.\n");
} else {
@@ -314,17 +313,7 @@ void hv_synic_enable_regs(unsigned int cpu)
shared_sint.vector = vmbus_interrupt;
shared_sint.masked = false;
-
- /*
- * On architectures where Hyper-V doesn't support AEOI (e.g., ARM64),
- * it doesn't provide a recommendation flag and AEOI must be disabled.
- */
-#ifdef HV_DEPRECATING_AEOI_RECOMMENDED
- shared_sint.auto_eoi =
- !(ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED);
-#else
- shared_sint.auto_eoi = 0;
-#endif
+ shared_sint.auto_eoi = hv_recommend_using_aeoi();
hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
/* Enable the global synic bit */
@@ -343,13 +332,10 @@ int hv_synic_init(unsigned int cpu)
return 0;
}
-/*
- * hv_synic_cleanup - Cleanup routine for hv_synic_init().
- */
void hv_synic_disable_regs(unsigned int cpu)
{
- struct hv_per_cpu_context *hv_cpu
- = per_cpu_ptr(hv_context.cpu_context, cpu);
+ struct hv_per_cpu_context *hv_cpu =
+ per_cpu_ptr(hv_context.cpu_context, cpu);
union hv_synic_sint shared_sint;
union hv_synic_simp simp;
union hv_synic_siefp siefp;
@@ -371,7 +357,7 @@ void hv_synic_disable_regs(unsigned int cpu)
* addresses.
*/
simp.simp_enabled = 0;
- if (ms_hyperv.paravisor_present || hv_root_partition) {
+ if (ms_hyperv.paravisor_present || hv_root_partition()) {
iounmap(hv_cpu->synic_message_page);
hv_cpu->synic_message_page = NULL;
} else {
@@ -383,7 +369,7 @@ void hv_synic_disable_regs(unsigned int cpu)
siefp.as_uint64 = hv_get_msr(HV_MSR_SIEFP);
siefp.siefp_enabled = 0;
- if (ms_hyperv.paravisor_present || hv_root_partition) {
+ if (ms_hyperv.paravisor_present || hv_root_partition()) {
iounmap(hv_cpu->synic_event_page);
hv_cpu->synic_event_page = NULL;
} else {
@@ -437,10 +423,47 @@ retry:
return pending;
}
+static int hv_pick_new_cpu(struct vmbus_channel *channel)
+{
+ int ret = -EBUSY;
+ int start;
+ int cpu;
+
+ lockdep_assert_cpus_held();
+ lockdep_assert_held(&vmbus_connection.channel_mutex);
+
+ /*
+ * We can't assume that the relevant interrupts will be sent before
+ * the cpu is offlined on older versions of hyperv.
+ */
+ if (vmbus_proto_version < VERSION_WIN10_V5_3)
+ return -EBUSY;
+
+ start = get_random_u32_below(nr_cpu_ids);
+
+ for_each_cpu_wrap(cpu, cpu_online_mask, start) {
+ if (channel->target_cpu == cpu ||
+ channel->target_cpu == VMBUS_CONNECT_CPU)
+ continue;
+
+ ret = vmbus_channel_set_cpu(channel, cpu);
+ if (!ret)
+ break;
+ }
+
+ if (ret)
+ ret = vmbus_channel_set_cpu(channel, VMBUS_CONNECT_CPU);
+
+ return ret;
+}
+
+/*
+ * hv_synic_cleanup - Cleanup routine for hv_synic_init().
+ */
int hv_synic_cleanup(unsigned int cpu)
{
struct vmbus_channel *channel, *sc;
- bool channel_found = false;
+ int ret = 0;
if (vmbus_connection.conn_state != CONNECTED)
goto always_cleanup;
@@ -457,38 +480,34 @@ int hv_synic_cleanup(unsigned int cpu)
/*
* Search for channels which are bound to the CPU we're about to
- * cleanup. In case we find one and vmbus is still connected, we
- * fail; this will effectively prevent CPU offlining.
- *
- * TODO: Re-bind the channels to different CPUs.
+ * cleanup.
*/
mutex_lock(&vmbus_connection.channel_mutex);
list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
if (channel->target_cpu == cpu) {
- channel_found = true;
- break;
+ ret = hv_pick_new_cpu(channel);
+ if (ret) {
+ mutex_unlock(&vmbus_connection.channel_mutex);
+ return ret;
+ }
}
list_for_each_entry(sc, &channel->sc_list, sc_list) {
if (sc->target_cpu == cpu) {
- channel_found = true;
- break;
+ ret = hv_pick_new_cpu(sc);
+ if (ret) {
+ mutex_unlock(&vmbus_connection.channel_mutex);
+ return ret;
+ }
}
}
- if (channel_found)
- break;
}
mutex_unlock(&vmbus_connection.channel_mutex);
- if (channel_found)
- return -EBUSY;
-
/*
- * channel_found == false means that any channels that were previously
- * assigned to the CPU have been reassigned elsewhere with a call of
- * vmbus_send_modifychannel(). Scan the event flags page looking for
- * bits that are set and waiting with a timeout for vmbus_chan_sched()
- * to process such bits. If bits are still set after this operation
- * and VMBus is connected, fail the CPU offlining operation.
+ * Scan the event flags page looking for bits that are set and waiting
+ * with a timeout for vmbus_chan_sched() to process such bits. If bits
+ * are still set after this operation and VMBus is connected, fail the
+ * CPU offlining operation.
*/
if (vmbus_proto_version >= VERSION_WIN10_V4_1 && hv_synic_event_pending())
return -EBUSY;
@@ -498,5 +517,5 @@ always_cleanup:
hv_synic_disable_regs(cpu);
- return 0;
+ return ret;
}
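hv_pick_new_cpu() spreads re-bound channels across processors by starting its search at a random online CPU and wrapping around the mask, rather than always piling channels onto the lowest-numbered CPU. A generic sketch of that iteration pattern; the helper and callback are hypothetical:

    #include <linux/cpumask.h>
    #include <linux/random.h>

    /* Try 'fn' on each online CPU, starting at a random point to spread load. */
    static int example_try_on_some_online_cpu(int (*fn)(int cpu))
    {
    	int start = get_random_u32_below(nr_cpu_ids);
    	int cpu, ret = -EBUSY;

    	for_each_cpu_wrap(cpu, cpu_online_mask, start) {
    		ret = fn(cpu);
    		if (!ret)
    			break;		/* first CPU that accepts the work wins */
    	}
    	return ret;
    }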
diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index e000fa3b9f97..2b4080e51f97 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -25,9 +25,10 @@
#include <linux/notifier.h>
#include <linux/percpu_counter.h>
#include <linux/page_reporting.h>
+#include <linux/sizes.h>
#include <linux/hyperv.h>
-#include <asm/hyperv-tlfs.h>
+#include <hyperv/hvhdk.h>
#include <asm/mshyperv.h>
@@ -41,8 +42,6 @@
* Begin protocol definitions.
*/
-
-
/*
* Protocol versions. The low word is the minor version, the high word the major
* version.
@@ -71,8 +70,6 @@ enum {
DYNMEM_PROTOCOL_VERSION_CURRENT = DYNMEM_PROTOCOL_VERSION_WIN10
};
-
-
/*
* Message Types
*/
@@ -101,7 +98,6 @@ enum dm_message_type {
DM_VERSION_1_MAX = 12
};
-
/*
* Structures defining the dynamic memory management
* protocol.
@@ -115,7 +111,6 @@ union dm_version {
__u32 version;
} __packed;
-
union dm_caps {
struct {
__u64 balloon:1;
@@ -148,8 +143,6 @@ union dm_mem_page_range {
__u64 page_range;
} __packed;
-
-
/*
* The header for all dynamic memory messages:
*
@@ -174,7 +167,6 @@ struct dm_message {
__u8 data[]; /* enclosed message */
} __packed;
-
/*
* Specific message types supporting the dynamic memory protocol.
*/
@@ -271,7 +263,6 @@ struct dm_status {
__u32 io_diff;
} __packed;
-
/*
* Message to ask the guest to allocate memory - balloon up message.
* This message is sent from the host to the guest. The guest may not be
@@ -286,14 +277,13 @@ struct dm_balloon {
__u32 reservedz;
} __packed;
-
/*
* Balloon response message; this message is sent from the guest
* to the host in response to the balloon message.
*
* reservedz: Reserved; must be set to zero.
* more_pages: If FALSE, this is the last message of the transaction.
- * if TRUE there will atleast one more message from the guest.
+ * if TRUE there will be at least one more message from the guest.
*
* range_count: The number of ranges in the range array.
*
@@ -314,7 +304,7 @@ struct dm_balloon_response {
* to the guest to give guest more memory.
*
* more_pages: If FALSE, this is the last message of the transaction.
- * if TRUE there will atleast one more message from the guest.
+ * if TRUE there will be at least one more message from the guest.
*
* reservedz: Reserved; must be set to zero.
*
@@ -342,7 +332,6 @@ struct dm_unballoon_response {
struct dm_header hdr;
} __packed;
-
/*
* Hot add request message. Message sent from the host to the guest.
*
@@ -390,7 +379,6 @@ enum dm_info_type {
MAX_INFO_TYPE
};
-
/*
* Header for the information message.
*/
@@ -425,11 +413,11 @@ struct dm_info_msg {
* The range start_pfn : end_pfn specifies the range
* that the host has asked us to hot add. The range
* start_pfn : ha_end_pfn specifies the range that we have
- * currently hot added. We hot add in multiples of 128M
- * chunks; it is possible that we may not be able to bring
- * online all the pages in the region. The range
+ * currently hot added. We hot add in chunks equal to the
+ * memory block size; it is possible that we may not be able
+ * to bring online all the pages in the region. The range
* covered_start_pfn:covered_end_pfn defines the pages that can
- * be brough online.
+ * be brought online.
*/
struct hv_hotadd_state {
@@ -480,10 +468,10 @@ static unsigned long last_post_time;
static int hv_hypercall_multi_failure;
-module_param(hot_add, bool, (S_IRUGO | S_IWUSR));
+module_param(hot_add, bool, 0644);
MODULE_PARM_DESC(hot_add, "If set attempt memory hot_add");
-module_param(pressure_report_delay, uint, (S_IRUGO | S_IWUSR));
+module_param(pressure_report_delay, uint, 0644);
MODULE_PARM_DESC(pressure_report_delay, "Delay in secs in reporting pressure");
static atomic_t trans_id = ATOMIC_INIT(0);
@@ -502,11 +490,13 @@ enum hv_dm_state {
DM_INIT_ERROR
};
-
static __u8 recv_buffer[HV_HYP_PAGE_SIZE];
static __u8 balloon_up_send_buffer[HV_HYP_PAGE_SIZE];
+
+static unsigned long ha_pages_in_chunk;
+#define HA_BYTES_IN_CHUNK (ha_pages_in_chunk << PAGE_SHIFT)
+
#define PAGES_IN_2M (2 * 1024 * 1024 / PAGE_SIZE)
-#define HA_CHUNK (128 * 1024 * 1024 / PAGE_SIZE)
struct hv_dynmem_device {
struct hv_device *dev;
@@ -595,12 +585,12 @@ static inline bool has_pfn_is_backed(struct hv_hotadd_state *has,
struct hv_hotadd_gap *gap;
/* The page is not backed. */
- if ((pfn < has->covered_start_pfn) || (pfn >= has->covered_end_pfn))
+ if (pfn < has->covered_start_pfn || pfn >= has->covered_end_pfn)
return false;
/* Check for gaps. */
list_for_each_entry(gap, &has->gap_list, list) {
- if ((pfn >= gap->start_pfn) && (pfn < gap->end_pfn))
+ if (pfn >= gap->start_pfn && pfn < gap->end_pfn)
return false;
}
@@ -693,9 +683,8 @@ static void hv_page_online_one(struct hv_hotadd_state *has, struct page *pg)
if (!PageOffline(pg))
__SetPageOffline(pg);
return;
- }
- if (PageOffline(pg))
- __ClearPageOffline(pg);
+ } else if (!PageOffline(pg))
+ return;
/* This frame is currently backed; online the page. */
generic_online_page(pg, 0);
@@ -724,28 +713,21 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size,
unsigned long processed_pfn;
unsigned long total_pfn = pfn_count;
- for (i = 0; i < (size/HA_CHUNK); i++) {
- start_pfn = start + (i * HA_CHUNK);
+ for (i = 0; i < (size/ha_pages_in_chunk); i++) {
+ start_pfn = start + (i * ha_pages_in_chunk);
scoped_guard(spinlock_irqsave, &dm_device.ha_lock) {
- has->ha_end_pfn += HA_CHUNK;
-
- if (total_pfn > HA_CHUNK) {
- processed_pfn = HA_CHUNK;
- total_pfn -= HA_CHUNK;
- } else {
- processed_pfn = total_pfn;
- total_pfn = 0;
- }
-
- has->covered_end_pfn += processed_pfn;
+ has->ha_end_pfn += ha_pages_in_chunk;
+ processed_pfn = umin(total_pfn, ha_pages_in_chunk);
+ total_pfn -= processed_pfn;
+ has->covered_end_pfn += processed_pfn;
}
reinit_completion(&dm_device.ol_waitevent);
nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn));
ret = add_memory(nid, PFN_PHYS((start_pfn)),
- (HA_CHUNK << PAGE_SHIFT), MHP_MERGE_RESOURCE);
+ HA_BYTES_IN_CHUNK, MHP_MERGE_RESOURCE);
if (ret) {
pr_err("hot_add memory failed error is %d\n", ret);
@@ -760,7 +742,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size,
do_hot_add = false;
}
scoped_guard(spinlock_irqsave, &dm_device.ha_lock) {
- has->ha_end_pfn -= HA_CHUNK;
+ has->ha_end_pfn -= ha_pages_in_chunk;
has->covered_end_pfn -= processed_pfn;
}
break;
@@ -774,7 +756,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size,
* adding succeeded, it is ok to proceed even if the memory was
* not onlined in time.
*/
- wait_for_completion_timeout(&dm_device.ol_waitevent, 5 * HZ);
+ wait_for_completion_timeout(&dm_device.ol_waitevent, secs_to_jiffies(5));
post_status(&dm_device);
}
}
@@ -784,23 +766,25 @@ static void hv_online_page(struct page *pg, unsigned int order)
struct hv_hotadd_state *has;
unsigned long pfn = page_to_pfn(pg);
- guard(spinlock_irqsave)(&dm_device.ha_lock);
- list_for_each_entry(has, &dm_device.ha_region_list, list) {
- /* The page belongs to a different HAS. */
- if ((pfn < has->start_pfn) ||
+ scoped_guard(spinlock_irqsave, &dm_device.ha_lock) {
+ list_for_each_entry(has, &dm_device.ha_region_list, list) {
+ /* The page belongs to a different HAS. */
+ if (pfn < has->start_pfn ||
(pfn + (1UL << order) > has->end_pfn))
- continue;
+ continue;
- hv_bring_pgs_online(has, pfn, 1UL << order);
- break;
+ hv_bring_pgs_online(has, pfn, 1UL << order);
+ return;
+ }
}
+ generic_online_page(pg, order);
}
static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt)
{
struct hv_hotadd_state *has;
struct hv_hotadd_gap *gap;
- unsigned long residual, new_inc;
+ unsigned long residual;
int ret = 0;
guard(spinlock_irqsave)(&dm_device.ha_lock);
@@ -836,15 +820,9 @@ static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt)
* our current limit; extend it.
*/
if ((start_pfn + pfn_cnt) > has->end_pfn) {
+ /* Extend the region by multiples of ha_pages_in_chunk */
residual = (start_pfn + pfn_cnt - has->end_pfn);
- /*
- * Extend the region by multiples of HA_CHUNK.
- */
- new_inc = (residual / HA_CHUNK) * HA_CHUNK;
- if (residual % HA_CHUNK)
- new_inc += HA_CHUNK;
-
- has->end_pfn += new_inc;
+ has->end_pfn += ALIGN(residual, ha_pages_in_chunk);
}
ret = 1;
@@ -855,7 +833,7 @@ static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt)
}
static unsigned long handle_pg_range(unsigned long pg_start,
- unsigned long pg_count)
+ unsigned long pg_count)
{
unsigned long start_pfn = pg_start;
unsigned long pfn_cnt = pg_count;
@@ -866,7 +844,7 @@ static unsigned long handle_pg_range(unsigned long pg_start,
unsigned long res = 0, flags;
pr_debug("Hot adding %lu pages starting at pfn 0x%lx.\n", pg_count,
- pg_start);
+ pg_start);
spin_lock_irqsave(&dm_device.ha_lock, flags);
list_for_each_entry(has, &dm_device.ha_region_list, list) {
@@ -902,22 +880,19 @@ static unsigned long handle_pg_range(unsigned long pg_start,
if (start_pfn > has->start_pfn &&
online_section_nr(pfn_to_section_nr(start_pfn)))
hv_bring_pgs_online(has, start_pfn, pgs_ol);
-
}
- if ((has->ha_end_pfn < has->end_pfn) && (pfn_cnt > 0)) {
+ if (has->ha_end_pfn < has->end_pfn && pfn_cnt > 0) {
/*
* We have some residual hot add range
* that needs to be hot added; hot add
* it now. Hot add a multiple of
- * HA_CHUNK that fully covers the pages
+ * ha_pages_in_chunk that fully covers the pages
* we have.
*/
size = (has->end_pfn - has->ha_end_pfn);
if (pfn_cnt <= size) {
- size = ((pfn_cnt / HA_CHUNK) * HA_CHUNK);
- if (pfn_cnt % HA_CHUNK)
- size += HA_CHUNK;
+ size = ALIGN(pfn_cnt, ha_pages_in_chunk);
} else {
pfn_cnt = size;
}
@@ -1010,10 +985,7 @@ static void hot_add_req(struct work_struct *dummy)
rg_start = dm->ha_wrk.ha_region_range.finfo.start_page;
rg_sz = dm->ha_wrk.ha_region_range.finfo.page_cnt;
- if ((rg_start == 0) && (!dm->host_specified_ha_region)) {
- unsigned long region_size;
- unsigned long region_start;
-
+ if (rg_start == 0 && !dm->host_specified_ha_region) {
/*
* The host has not specified the hot-add region.
* Based on the hot-add page range being specified,
@@ -1021,19 +993,13 @@ static void hot_add_req(struct work_struct *dummy)
* that need to be hot-added while ensuring the alignment
* and size requirements of Linux as it relates to hot-add.
*/
- region_size = (pfn_cnt / HA_CHUNK) * HA_CHUNK;
- if (pfn_cnt % HA_CHUNK)
- region_size += HA_CHUNK;
-
- region_start = (pg_start / HA_CHUNK) * HA_CHUNK;
-
- rg_start = region_start;
- rg_sz = region_size;
+ rg_start = ALIGN_DOWN(pg_start, ha_pages_in_chunk);
+ rg_sz = ALIGN(pfn_cnt, ha_pages_in_chunk);
}
if (do_hot_add)
resp.page_count = process_hot_add(pg_start, pfn_cnt,
- rg_start, rg_sz);
+ rg_start, rg_sz);
dm->num_pages_added += resp.page_count;
#endif
@@ -1211,11 +1177,10 @@ static void post_status(struct hv_dynmem_device *dm)
sizeof(struct dm_status),
(unsigned long)NULL,
VM_PKT_DATA_INBAND, 0);
-
}
static void free_balloon_pages(struct hv_dynmem_device *dm,
- union dm_mem_page_range *range_array)
+ union dm_mem_page_range *range_array)
{
int num_pages = range_array->finfo.page_cnt;
__u64 start_frame = range_array->finfo.start_page;
@@ -1227,12 +1192,11 @@ static void free_balloon_pages(struct hv_dynmem_device *dm,
__ClearPageOffline(pg);
__free_page(pg);
dm->num_pages_ballooned--;
+ mod_node_page_state(page_pgdat(pg), NR_BALLOON_PAGES, -1);
adjust_managed_page_count(pg, 1);
}
}
-
-
static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm,
unsigned int num_pages,
struct dm_balloon_response *bl_resp,
@@ -1258,6 +1222,7 @@ static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm,
return i * alloc_unit;
dm->num_pages_ballooned += alloc_unit;
+ mod_node_page_state(page_pgdat(pg), NR_BALLOON_PAGES, alloc_unit);
/*
* If we allocatted 2M pages; split them so we
@@ -1278,7 +1243,6 @@ static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm,
page_to_pfn(pg);
bl_resp->range_array[i].finfo.page_cnt = alloc_unit;
bl_resp->hdr.size += sizeof(union dm_mem_page_range);
-
}
return i * alloc_unit;
@@ -1332,7 +1296,7 @@ static void balloon_up(struct work_struct *dummy)
if (num_ballooned == 0 || num_ballooned == num_pages) {
pr_debug("Ballooned %u out of %u requested pages.\n",
- num_pages, dm_device.balloon_wrk.num_pages);
+ num_pages, dm_device.balloon_wrk.num_pages);
bl_resp->more_pages = 0;
done = true;
@@ -1366,16 +1330,15 @@ static void balloon_up(struct work_struct *dummy)
for (i = 0; i < bl_resp->range_count; i++)
free_balloon_pages(&dm_device,
- &bl_resp->range_array[i]);
+ &bl_resp->range_array[i]);
done = true;
}
}
-
}
static void balloon_down(struct hv_dynmem_device *dm,
- struct dm_unballoon_request *req)
+ struct dm_unballoon_request *req)
{
union dm_mem_page_range *range_array = req->range_array;
int range_count = req->range_count;
@@ -1389,7 +1352,7 @@ static void balloon_down(struct hv_dynmem_device *dm,
}
pr_debug("Freed %u ballooned pages.\n",
- prev_pages_ballooned - dm->num_pages_ballooned);
+ prev_pages_ballooned - dm->num_pages_ballooned);
if (req->more_pages == 1)
return;
@@ -1414,8 +1377,8 @@ static int dm_thread_func(void *dm_dev)
struct hv_dynmem_device *dm = dm_dev;
while (!kthread_should_stop()) {
- wait_for_completion_interruptible_timeout(
- &dm_device.config_event, 1*HZ);
+ wait_for_completion_interruptible_timeout(&dm_device.config_event,
+ secs_to_jiffies(1));
/*
* The host expects us to post information on the memory
* pressure every second.
@@ -1439,9 +1402,8 @@ static int dm_thread_func(void *dm_dev)
return 0;
}
-
static void version_resp(struct hv_dynmem_device *dm,
- struct dm_version_response *vresp)
+ struct dm_version_response *vresp)
{
struct dm_version_request version_req;
int ret;
@@ -1502,7 +1464,7 @@ version_error:
}
static void cap_resp(struct hv_dynmem_device *dm,
- struct dm_capabilities_resp_msg *cap_resp)
+ struct dm_capabilities_resp_msg *cap_resp)
{
if (!cap_resp->is_accepted) {
pr_err("Capabilities not accepted by host\n");
@@ -1535,7 +1497,7 @@ static void balloon_onchannelcallback(void *context)
switch (dm_hdr->type) {
case DM_VERSION_RESPONSE:
version_resp(dm,
- (struct dm_version_response *)dm_msg);
+ (struct dm_version_response *)dm_msg);
break;
case DM_CAPABILITIES_RESPONSE:
@@ -1565,7 +1527,7 @@ static void balloon_onchannelcallback(void *context)
dm->state = DM_BALLOON_DOWN;
balloon_down(dm,
- (struct dm_unballoon_request *)recv_buffer);
+ (struct dm_unballoon_request *)recv_buffer);
break;
case DM_MEM_HOT_ADD_REQUEST:
@@ -1603,17 +1565,15 @@ static void balloon_onchannelcallback(void *context)
default:
pr_warn_ratelimited("Unhandled message: type: %d\n", dm_hdr->type);
-
}
}
-
}
#define HV_LARGE_REPORTING_ORDER 9
#define HV_LARGE_REPORTING_LEN (HV_HYP_PAGE_SIZE << \
HV_LARGE_REPORTING_ORDER)
static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info,
- struct scatterlist *sgl, unsigned int nents)
+ struct scatterlist *sgl, unsigned int nents)
{
unsigned long flags;
struct hv_memory_hint *hint;
@@ -1630,7 +1590,7 @@ static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info,
return -ENOSPC;
}
- hint->type = HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD;
+ hint->heat_type = HV_EXTMEM_HEAT_HINT_COLD_DISCARD;
hint->reserved = 0;
for_each_sg(sgl, sg, nents, i) {
union hv_gpa_page_range *range;
@@ -1648,7 +1608,7 @@ static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info,
*/
/* page reporting for pages 2MB or higher */
- if (order >= HV_LARGE_REPORTING_ORDER ) {
+ if (order >= HV_LARGE_REPORTING_ORDER) {
range->page.largepage = 1;
range->page_size = HV_GPA_PAGE_RANGE_PAGE_SIZE_2MB;
range->base_large_pfn = page_to_hvpfn(
@@ -1662,23 +1622,21 @@ static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info,
range->page.additional_pages =
(sg->length / HV_HYP_PAGE_SIZE) - 1;
}
-
}
status = hv_do_rep_hypercall(HV_EXT_CALL_MEMORY_HEAT_HINT, nents, 0,
hint, NULL);
local_irq_restore(flags);
if (!hv_result_success(status)) {
-
pr_err("Cold memory discard hypercall failed with status %llx\n",
- status);
+ status);
if (hv_hypercall_multi_failure > 0)
hv_hypercall_multi_failure++;
if (hv_result(status) == HV_STATUS_INVALID_PARAMETER) {
pr_err("Underlying Hyper-V does not support order less than 9. Hypercall failed\n");
pr_err("Defaulting to page_reporting_order %d\n",
- pageblock_order);
+ pageblock_order);
page_reporting_order = pageblock_order;
hv_hypercall_multi_failure++;
return -EINVAL;
@@ -1712,7 +1670,7 @@ static void enable_page_reporting(void)
pr_err("Failed to enable cold memory discard: %d\n", ret);
} else {
pr_info("Cold memory discard hint enabled with order %d\n",
- page_reporting_order);
+ page_reporting_order);
}
}
@@ -1795,7 +1753,7 @@ static int balloon_connect_vsp(struct hv_device *dev)
if (ret)
goto out;
- t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ);
+ t = wait_for_completion_timeout(&dm_device.host_event, secs_to_jiffies(5));
if (t == 0) {
ret = -ETIMEDOUT;
goto out;
@@ -1831,10 +1789,13 @@ static int balloon_connect_vsp(struct hv_device *dev)
cap_msg.caps.cap_bits.hot_add = hot_add_enabled();
/*
- * Specify our alignment requirements as it relates
- * memory hot-add. Specify 128MB alignment.
+ * Specify our alignment requirements for memory hot-add. The value is
+ * the log base 2 of the number of megabytes in a chunk. For example,
+ * with 256 MiB chunks, the value is 8. The number of MiB in a chunk
+ * must be a power of 2.
*/
- cap_msg.caps.cap_bits.hot_add_alignment = 7;
+ cap_msg.caps.cap_bits.hot_add_alignment =
+ ilog2(HA_BYTES_IN_CHUNK / SZ_1M);
/*
* Currently the host does not use these
@@ -1850,7 +1811,7 @@ static int balloon_connect_vsp(struct hv_device *dev)
if (ret)
goto out;
- t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ);
+ t = wait_for_completion_timeout(&dm_device.host_event, secs_to_jiffies(5));
if (t == 0) {
ret = -ETIMEDOUT;
goto out;
@@ -1891,8 +1852,8 @@ static int hv_balloon_debug_show(struct seq_file *f, void *offset)
char *sname;
seq_printf(f, "%-22s: %u.%u\n", "host_version",
- DYNMEM_MAJOR_VERSION(dm->version),
- DYNMEM_MINOR_VERSION(dm->version));
+ DYNMEM_MAJOR_VERSION(dm->version),
+ DYNMEM_MINOR_VERSION(dm->version));
seq_printf(f, "%-22s:", "capabilities");
if (ballooning_enabled())
@@ -1941,10 +1902,10 @@ static int hv_balloon_debug_show(struct seq_file *f, void *offset)
seq_printf(f, "%-22s: %u\n", "pages_ballooned", dm->num_pages_ballooned);
seq_printf(f, "%-22s: %lu\n", "total_pages_committed",
- get_pages_committed(dm));
+ get_pages_committed(dm));
seq_printf(f, "%-22s: %llu\n", "max_dynamic_page_count",
- dm->max_dynamic_page_count);
+ dm->max_dynamic_page_count);
return 0;
}
@@ -1954,7 +1915,7 @@ DEFINE_SHOW_ATTRIBUTE(hv_balloon_debug);
static void hv_balloon_debugfs_init(struct hv_dynmem_device *b)
{
debugfs_create_file("hv-balloon", 0444, NULL, b,
- &hv_balloon_debug_fops);
+ &hv_balloon_debug_fops);
}
static void hv_balloon_debugfs_exit(struct hv_dynmem_device *b)
@@ -1984,8 +1945,23 @@ static int balloon_probe(struct hv_device *dev,
hot_add = false;
#ifdef CONFIG_MEMORY_HOTPLUG
+ /*
+ * Hot-add must operate in chunks that are of size equal to the
+ * memory block size because that's what the core add_memory()
+ * interface requires. The Hyper-V interface requires that the memory
+ * block size be a power of 2, which is guaranteed by the check in
+ * memory_dev_init().
+ */
+ ha_pages_in_chunk = memory_block_size_bytes() / PAGE_SIZE;
do_hot_add = hot_add;
#else
+ /*
+ * Without MEMORY_HOTPLUG, the guest returns a failure status for all
+ * hot add requests from Hyper-V, and the chunk size is used only to
+ * specify alignment to Hyper-V as required by the host/guest protocol.
+ * Somewhat arbitrarily, use 128 MiB.
+ */
+ ha_pages_in_chunk = SZ_128M / PAGE_SIZE;
do_hot_add = false;
#endif
dm_device.dev = dev;
@@ -2097,7 +2073,6 @@ static int balloon_suspend(struct hv_device *hv_dev)
tasklet_enable(&hv_dev->channel->callback_event);
return 0;
-
}
static int balloon_resume(struct hv_device *dev)
@@ -2156,7 +2131,6 @@ static struct hv_driver balloon_drv = {
static int __init init_balloon_drv(void)
{
-
return vmbus_driver_register(&balloon_drv);
}
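Two small pieces of arithmetic recur in the balloon changes above: hot_add_alignment is the log base 2 of the chunk size in MiB, and residual page counts are rounded up to whole chunks with ALIGN(). A minimal sketch of both, using example values only (128 MiB gives 7, 256 MiB gives 8):

    #include <linux/align.h>
    #include <linux/log2.h>
    #include <linux/sizes.h>
    #include <linux/types.h>

    /* Encoding sent to the host: log2 of the chunk size expressed in MiB. */
    static inline u32 example_hot_add_alignment(unsigned long chunk_bytes)
    {
    	return ilog2(chunk_bytes / SZ_1M);	/* SZ_128M -> 7, SZ_256M -> 8 */
    }

    /* Round a residual pfn count up to whole chunks, as pfn_covered() now does. */
    static inline unsigned long example_round_to_chunks(unsigned long residual_pfns,
    						     unsigned long pages_in_chunk)
    {
    	return ALIGN(residual_pfns, pages_in_chunk);
    }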
diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c
index dde3f9b6871a..59792e00cecf 100644
--- a/drivers/hv/hv_common.c
+++ b/drivers/hv/hv_common.c
@@ -28,11 +28,17 @@
#include <linux/slab.h>
#include <linux/dma-map-ops.h>
#include <linux/set_memory.h>
-#include <asm/hyperv-tlfs.h>
+#include <hyperv/hvhdk.h>
#include <asm/mshyperv.h>
+u64 hv_current_partition_id = HV_PARTITION_ID_SELF;
+EXPORT_SYMBOL_GPL(hv_current_partition_id);
+
+enum hv_partition_type hv_curr_partition_type;
+EXPORT_SYMBOL_GPL(hv_curr_partition_type);
+
/*
- * hv_root_partition, ms_hyperv and hv_nested are defined here with other
+ * ms_hyperv and hv_nested are defined here with other
* Hyper-V specific globals so they are shared across all architectures and are
* built only when CONFIG_HYPERV is defined. But on x86,
* ms_hyperv_init_platform() is built even when CONFIG_HYPERV is not
@@ -40,9 +46,6 @@
* here, allowing for an overriding definition in the module containing
* ms_hyperv_init_platform().
*/
-bool __weak hv_root_partition;
-EXPORT_SYMBOL_GPL(hv_root_partition);
-
bool __weak hv_nested;
EXPORT_SYMBOL_GPL(hv_nested);
@@ -66,6 +69,16 @@ static void hv_kmsg_dump_unregister(void);
static struct ctl_table_header *hv_ctl_table_hdr;
/*
+ * Per-cpu array holding the tail pointer for the SynIC event ring buffer
+ * for each SINT.
+ *
+ * We cannot maintain this in mshv driver because the tail pointer should
+ * persist even if the mshv driver is unloaded.
+ */
+u8 * __percpu *hv_synic_eventring_tail;
+EXPORT_SYMBOL_GPL(hv_synic_eventring_tail);
+
+/*
* Hyper-V specific initialization and shutdown code that is
* common across all architectures. Called from architecture
* specific initialization functions.
@@ -87,6 +100,9 @@ void __init hv_common_free(void)
free_percpu(hyperv_pcpu_input_arg);
hyperv_pcpu_input_arg = NULL;
+
+ free_percpu(hv_synic_eventring_tail);
+ hv_synic_eventring_tail = NULL;
}
/*
@@ -141,7 +157,7 @@ static int sysctl_record_panic_msg = 1;
* sysctl option to allow the user to control whether kmsg data should be
* reported to Hyper-V on panic.
*/
-static struct ctl_table hv_ctl_table[] = {
+static const struct ctl_table hv_ctl_table[] = {
{
.procname = "hyperv_record_panic_msg",
.data = &sysctl_record_panic_msg,
@@ -207,13 +223,13 @@ static int hv_die_panic_notify_crash(struct notifier_block *self,
* buffer and call into Hyper-V to transfer the data.
*/
static void hv_kmsg_dump(struct kmsg_dumper *dumper,
- enum kmsg_dump_reason reason)
+ struct kmsg_dump_detail *detail)
{
struct kmsg_dump_iter iter;
size_t bytes_written;
/* We are only interested in panics. */
- if (reason != KMSG_DUMP_PANIC || !sysctl_record_panic_msg)
+ if (detail->reason != KMSG_DUMP_PANIC || !sysctl_record_panic_msg)
return;
/*
@@ -278,6 +294,30 @@ static void hv_kmsg_dump_register(void)
}
}
+static inline bool hv_output_page_exists(void)
+{
+ return hv_root_partition() || IS_ENABLED(CONFIG_HYPERV_VTL_MODE);
+}
+
+void __init hv_get_partition_id(void)
+{
+ struct hv_output_get_partition_id *output;
+ unsigned long flags;
+ u64 status, pt_id;
+
+ local_irq_save(flags);
+ output = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ status = hv_do_hypercall(HVCALL_GET_PARTITION_ID, NULL, output);
+ pt_id = output->partition_id;
+ local_irq_restore(flags);
+
+ if (hv_result_success(status))
+ hv_current_partition_id = pt_id;
+ else
+ pr_err("Hyper-V: failed to get partition ID: %#x\n",
+ hv_result(status));
+}
+
int __init hv_common_init(void)
{
int i;
@@ -340,19 +380,24 @@ int __init hv_common_init(void)
BUG_ON(!hyperv_pcpu_input_arg);
/* Allocate the per-CPU state for output arg for root */
- if (hv_root_partition) {
+ if (hv_output_page_exists()) {
hyperv_pcpu_output_arg = alloc_percpu(void *);
BUG_ON(!hyperv_pcpu_output_arg);
}
- hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index),
+ if (hv_root_partition()) {
+ hv_synic_eventring_tail = alloc_percpu(u8 *);
+ BUG_ON(!hv_synic_eventring_tail);
+ }
+
+ hv_vp_index = kmalloc_array(nr_cpu_ids, sizeof(*hv_vp_index),
GFP_KERNEL);
if (!hv_vp_index) {
hv_common_free();
return -ENOMEM;
}
- for (i = 0; i < num_possible_cpus(); i++)
+ for (i = 0; i < nr_cpu_ids; i++)
hv_vp_index[i] = VP_INVAL;
return 0;
@@ -433,11 +478,12 @@ error:
int hv_common_cpu_init(unsigned int cpu)
{
void **inputarg, **outputarg;
+ u8 **synic_eventring_tail;
u64 msr_vp_index;
gfp_t flags;
- int pgcount = hv_root_partition ? 2 : 1;
+ const int pgcount = hv_output_page_exists() ? 2 : 1;
void *mem;
- int ret;
+ int ret = 0;
/* hv_cpu_init() can be called with IRQs disabled from hv_resume() */
flags = irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL;
@@ -445,15 +491,15 @@ int hv_common_cpu_init(unsigned int cpu)
inputarg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
/*
- * hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory is already
- * allocated if this CPU was previously online and then taken offline
+ * The per-cpu memory is already allocated if this CPU was previously
+ * online and then taken offline
*/
if (!*inputarg) {
mem = kmalloc(pgcount * HV_HYP_PAGE_SIZE, flags);
if (!mem)
return -ENOMEM;
- if (hv_root_partition) {
+ if (hv_output_page_exists()) {
outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg);
*outputarg = (char *)mem + HV_HYP_PAGE_SIZE;
}
@@ -493,11 +539,21 @@ int hv_common_cpu_init(unsigned int cpu)
if (msr_vp_index > hv_max_vp_index)
hv_max_vp_index = msr_vp_index;
- return 0;
+ if (hv_root_partition()) {
+ synic_eventring_tail = (u8 **)this_cpu_ptr(hv_synic_eventring_tail);
+ *synic_eventring_tail = kcalloc(HV_SYNIC_SINT_COUNT,
+ sizeof(u8), flags);
+ /* No need to unwind any of the above on failure here */
+ if (unlikely(!*synic_eventring_tail))
+ ret = -ENOMEM;
+ }
+
+ return ret;
}
int hv_common_cpu_die(unsigned int cpu)
{
+ u8 **synic_eventring_tail;
/*
* The hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory
* is not freed when the CPU goes offline as the hyperv_pcpu_input_arg
@@ -510,6 +566,12 @@ int hv_common_cpu_die(unsigned int cpu)
* originally allocated memory is reused in hv_common_cpu_init().
*/
+ if (hv_root_partition()) {
+ synic_eventring_tail = this_cpu_ptr(hv_synic_eventring_tail);
+ kfree(*synic_eventring_tail);
+ *synic_eventring_tail = NULL;
+ }
+
return 0;
}
@@ -561,17 +623,13 @@ EXPORT_SYMBOL_GPL(hv_query_ext_cap);
void hv_setup_dma_ops(struct device *dev, bool coherent)
{
- /*
- * Hyper-V does not offer a vIOMMU in the guest
- * VM, so pass 0/NULL for the IOMMU settings
- */
- arch_setup_dma_ops(dev, 0, 0, coherent);
+ arch_setup_dma_ops(dev, coherent);
}
EXPORT_SYMBOL_GPL(hv_setup_dma_ops);
bool hv_is_hibernation_supported(void)
{
- return !hv_root_partition && acpi_sleep_state_supported(ACPI_STATE_S4);
+ return !hv_root_partition() && acpi_sleep_state_supported(ACPI_STATE_S4);
}
EXPORT_SYMBOL_GPL(hv_is_hibernation_supported);
@@ -624,6 +682,11 @@ void __weak hv_remove_vmbus_handler(void)
}
EXPORT_SYMBOL_GPL(hv_remove_vmbus_handler);
+void __weak hv_setup_mshv_handler(void (*handler)(void))
+{
+}
+EXPORT_SYMBOL_GPL(hv_setup_mshv_handler);
+
void __weak hv_setup_kexec_handler(void (*handler)(void))
{
}
@@ -660,3 +723,121 @@ u64 __weak hv_tdx_hypercall(u64 control, u64 param1, u64 param2)
return HV_STATUS_INVALID_PARAMETER;
}
EXPORT_SYMBOL_GPL(hv_tdx_hypercall);
+
+void hv_identify_partition_type(void)
+{
+ /* Assume guest role */
+ hv_curr_partition_type = HV_PARTITION_TYPE_GUEST;
+ /*
+ * Check partition creation and cpu management privileges
+ *
+ * Hyper-V should never specify running as root and as a Confidential
+ * VM. But to protect against a compromised/malicious Hyper-V trying
+ * to exploit root behavior to expose Confidential VM memory, ignore
+ * the root partition setting if also a Confidential VM.
+ */
+ if ((ms_hyperv.priv_high & HV_CREATE_PARTITIONS) &&
+ (ms_hyperv.priv_high & HV_CPU_MANAGEMENT) &&
+ !(ms_hyperv.priv_high & HV_ISOLATION)) {
+ pr_info("Hyper-V: running as root partition\n");
+ if (IS_ENABLED(CONFIG_MSHV_ROOT))
+ hv_curr_partition_type = HV_PARTITION_TYPE_ROOT;
+ else
+ pr_crit("Hyper-V: CONFIG_MSHV_ROOT not enabled!\n");
+ }
+}
+
+struct hv_status_info {
+ char *string;
+ int errno;
+ u16 code;
+};
+
+/*
+ * Note on the errno mappings:
+ * A failed hypercall is usually only recoverable (or loggable) near
+ * the call site where the HV_STATUS_* code is known. So the errno
+ * it gets converted to is not too useful further up the stack.
+ * Provide a few mappings that could be useful, and revert to -EIO
+ * as a fallback.
+ */
+static const struct hv_status_info hv_status_infos[] = {
+#define _STATUS_INFO(status, errno) { #status, (errno), (status) }
+ _STATUS_INFO(HV_STATUS_SUCCESS, 0),
+ _STATUS_INFO(HV_STATUS_INVALID_HYPERCALL_CODE, -EINVAL),
+ _STATUS_INFO(HV_STATUS_INVALID_HYPERCALL_INPUT, -EINVAL),
+ _STATUS_INFO(HV_STATUS_INVALID_ALIGNMENT, -EIO),
+ _STATUS_INFO(HV_STATUS_INVALID_PARAMETER, -EINVAL),
+ _STATUS_INFO(HV_STATUS_ACCESS_DENIED, -EIO),
+ _STATUS_INFO(HV_STATUS_INVALID_PARTITION_STATE, -EIO),
+ _STATUS_INFO(HV_STATUS_OPERATION_DENIED, -EIO),
+ _STATUS_INFO(HV_STATUS_UNKNOWN_PROPERTY, -EIO),
+ _STATUS_INFO(HV_STATUS_PROPERTY_VALUE_OUT_OF_RANGE, -EIO),
+ _STATUS_INFO(HV_STATUS_INSUFFICIENT_MEMORY, -ENOMEM),
+ _STATUS_INFO(HV_STATUS_INVALID_PARTITION_ID, -EINVAL),
+ _STATUS_INFO(HV_STATUS_INVALID_VP_INDEX, -EINVAL),
+ _STATUS_INFO(HV_STATUS_NOT_FOUND, -EIO),
+ _STATUS_INFO(HV_STATUS_INVALID_PORT_ID, -EINVAL),
+ _STATUS_INFO(HV_STATUS_INVALID_CONNECTION_ID, -EINVAL),
+ _STATUS_INFO(HV_STATUS_INSUFFICIENT_BUFFERS, -EIO),
+ _STATUS_INFO(HV_STATUS_NOT_ACKNOWLEDGED, -EIO),
+ _STATUS_INFO(HV_STATUS_INVALID_VP_STATE, -EIO),
+ _STATUS_INFO(HV_STATUS_NO_RESOURCES, -EIO),
+ _STATUS_INFO(HV_STATUS_PROCESSOR_FEATURE_NOT_SUPPORTED, -EIO),
+ _STATUS_INFO(HV_STATUS_INVALID_LP_INDEX, -EINVAL),
+ _STATUS_INFO(HV_STATUS_INVALID_REGISTER_VALUE, -EINVAL),
+ _STATUS_INFO(HV_STATUS_OPERATION_FAILED, -EIO),
+ _STATUS_INFO(HV_STATUS_TIME_OUT, -EIO),
+ _STATUS_INFO(HV_STATUS_CALL_PENDING, -EIO),
+ _STATUS_INFO(HV_STATUS_VTL_ALREADY_ENABLED, -EIO),
+#undef _STATUS_INFO
+};
+
+static inline const struct hv_status_info *find_hv_status_info(u64 hv_status)
+{
+ int i;
+ u16 code = hv_result(hv_status);
+
+ for (i = 0; i < ARRAY_SIZE(hv_status_infos); ++i) {
+ const struct hv_status_info *info = &hv_status_infos[i];
+
+ if (info->code == code)
+ return info;
+ }
+
+ return NULL;
+}
+
+/* Convert a hypercall result into a linux-friendly error code. */
+int hv_result_to_errno(u64 status)
+{
+ const struct hv_status_info *info;
+
+ /* hv_do_hypercall() may return U64_MAX, hypercalls aren't possible */
+ if (unlikely(status == U64_MAX))
+ return -EOPNOTSUPP;
+
+ info = find_hv_status_info(status);
+ if (info)
+ return info->errno;
+
+ return -EIO;
+}
+EXPORT_SYMBOL_GPL(hv_result_to_errno);
+
+const char *hv_result_to_string(u64 status)
+{
+ const struct hv_status_info *info;
+
+ if (unlikely(status == U64_MAX))
+ return "Hypercall page missing!";
+
+ info = find_hv_status_info(status);
+ if (info)
+ return info->string;
+
+ return "Unknown";
+}
+EXPORT_SYMBOL_GPL(hv_result_to_string);
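A hedged sketch of how a call site might consume the new status helpers: log the symbolic status name and hand a Linux errno up the stack. The wrapper below is illustrative and not part of this patch:

    #include <linux/printk.h>
    #include <asm/mshyperv.h>

    /* Illustrative wrapper around a hypercall that wants errno semantics. */
    static int example_do_hypercall(u64 control, void *input, void *output)
    {
    	u64 status = hv_do_hypercall(control, input, output);

    	if (!hv_result_success(status)) {
    		pr_err("hypercall %llu failed: %s\n",
    		       control, hv_result_to_string(status));
    		return hv_result_to_errno(status);
    	}

    	return 0;
    }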
diff --git a/drivers/hv/hv_fcopy.c b/drivers/hv/hv_fcopy.c
deleted file mode 100644
index 922d83eb7ddf..000000000000
--- a/drivers/hv/hv_fcopy.c
+++ /dev/null
@@ -1,427 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * An implementation of file copy service.
- *
- * Copyright (C) 2014, Microsoft, Inc.
- *
- * Author : K. Y. Srinivasan <ksrinivasan@novell.com>
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/nls.h>
-#include <linux/workqueue.h>
-#include <linux/hyperv.h>
-#include <linux/sched.h>
-#include <asm/hyperv-tlfs.h>
-
-#include "hyperv_vmbus.h"
-#include "hv_utils_transport.h"
-
-#define WIN8_SRV_MAJOR 1
-#define WIN8_SRV_MINOR 1
-#define WIN8_SRV_VERSION (WIN8_SRV_MAJOR << 16 | WIN8_SRV_MINOR)
-
-#define FCOPY_VER_COUNT 1
-static const int fcopy_versions[] = {
- WIN8_SRV_VERSION
-};
-
-#define FW_VER_COUNT 1
-static const int fw_versions[] = {
- UTIL_FW_VERSION
-};
-
-/*
- * Global state maintained for transaction that is being processed.
- * For a class of integration services, including the "file copy service",
- * the specified protocol is a "request/response" protocol which means that
- * there can only be single outstanding transaction from the host at any
- * given point in time. We use this to simplify memory management in this
- * driver - we cache and process only one message at a time.
- *
- * While the request/response protocol is guaranteed by the host, we further
- * ensure this by serializing packet processing in this driver - we do not
- * read additional packets from the VMBUs until the current packet is fully
- * handled.
- */
-
-static struct {
- int state; /* hvutil_device_state */
- int recv_len; /* number of bytes received. */
- struct hv_fcopy_hdr *fcopy_msg; /* current message */
- struct vmbus_channel *recv_channel; /* chn we got the request */
- u64 recv_req_id; /* request ID. */
-} fcopy_transaction;
-
-static void fcopy_respond_to_host(int error);
-static void fcopy_send_data(struct work_struct *dummy);
-static void fcopy_timeout_func(struct work_struct *dummy);
-static DECLARE_DELAYED_WORK(fcopy_timeout_work, fcopy_timeout_func);
-static DECLARE_WORK(fcopy_send_work, fcopy_send_data);
-static const char fcopy_devname[] = "vmbus/hv_fcopy";
-static u8 *recv_buffer;
-static struct hvutil_transport *hvt;
-/*
- * This state maintains the version number registered by the daemon.
- */
-static int dm_reg_value;
-
-static void fcopy_poll_wrapper(void *channel)
-{
- /* Transaction is finished, reset the state here to avoid races. */
- fcopy_transaction.state = HVUTIL_READY;
- tasklet_schedule(&((struct vmbus_channel *)channel)->callback_event);
-}
-
-static void fcopy_timeout_func(struct work_struct *dummy)
-{
- /*
- * If the timer fires, the user-mode component has not responded;
- * process the pending transaction.
- */
- fcopy_respond_to_host(HV_E_FAIL);
- hv_poll_channel(fcopy_transaction.recv_channel, fcopy_poll_wrapper);
-}
-
-static void fcopy_register_done(void)
-{
- pr_debug("FCP: userspace daemon registered\n");
- hv_poll_channel(fcopy_transaction.recv_channel, fcopy_poll_wrapper);
-}
-
-static int fcopy_handle_handshake(u32 version)
-{
- u32 our_ver = FCOPY_CURRENT_VERSION;
-
- switch (version) {
- case FCOPY_VERSION_0:
- /* Daemon doesn't expect us to reply */
- dm_reg_value = version;
- break;
- case FCOPY_VERSION_1:
- /* Daemon expects us to reply with our own version */
- if (hvutil_transport_send(hvt, &our_ver, sizeof(our_ver),
- fcopy_register_done))
- return -EFAULT;
- dm_reg_value = version;
- break;
- default:
- /*
- * For now we will fail the registration.
- * If and when we have multiple versions to
- * deal with, we will be backward compatible.
- * We will add this code when needed.
- */
- return -EINVAL;
- }
- pr_debug("FCP: userspace daemon ver. %d connected\n", version);
- return 0;
-}
-
-static void fcopy_send_data(struct work_struct *dummy)
-{
- struct hv_start_fcopy *smsg_out = NULL;
- int operation = fcopy_transaction.fcopy_msg->operation;
- struct hv_start_fcopy *smsg_in;
- void *out_src;
- int rc, out_len;
-
- /*
- * The strings sent from the host are encoded in
- * utf16; convert it to utf8 strings.
- * The host assures us that the utf16 strings will not exceed
- * the max lengths specified. We will however, reserve room
- * for the string terminating character - in the utf16s_utf8s()
- * function we limit the size of the buffer where the converted
- * string is placed to W_MAX_PATH -1 to guarantee
- * that the strings can be properly terminated!
- */
-
- switch (operation) {
- case START_FILE_COPY:
- out_len = sizeof(struct hv_start_fcopy);
- smsg_out = kzalloc(sizeof(*smsg_out), GFP_KERNEL);
- if (!smsg_out)
- return;
-
- smsg_out->hdr.operation = operation;
- smsg_in = (struct hv_start_fcopy *)fcopy_transaction.fcopy_msg;
-
- utf16s_to_utf8s((wchar_t *)smsg_in->file_name, W_MAX_PATH,
- UTF16_LITTLE_ENDIAN,
- (__u8 *)&smsg_out->file_name, W_MAX_PATH - 1);
-
- utf16s_to_utf8s((wchar_t *)smsg_in->path_name, W_MAX_PATH,
- UTF16_LITTLE_ENDIAN,
- (__u8 *)&smsg_out->path_name, W_MAX_PATH - 1);
-
- smsg_out->copy_flags = smsg_in->copy_flags;
- smsg_out->file_size = smsg_in->file_size;
- out_src = smsg_out;
- break;
-
- case WRITE_TO_FILE:
- out_src = fcopy_transaction.fcopy_msg;
- out_len = sizeof(struct hv_do_fcopy);
- break;
- default:
- out_src = fcopy_transaction.fcopy_msg;
- out_len = fcopy_transaction.recv_len;
- break;
- }
-
- fcopy_transaction.state = HVUTIL_USERSPACE_REQ;
- rc = hvutil_transport_send(hvt, out_src, out_len, NULL);
- if (rc) {
- pr_debug("FCP: failed to communicate to the daemon: %d\n", rc);
- if (cancel_delayed_work_sync(&fcopy_timeout_work)) {
- fcopy_respond_to_host(HV_E_FAIL);
- fcopy_transaction.state = HVUTIL_READY;
- }
- }
- kfree(smsg_out);
-}
-
-/*
- * Send a response back to the host.
- */
-
-static void
-fcopy_respond_to_host(int error)
-{
- struct icmsg_hdr *icmsghdr;
- u32 buf_len;
- struct vmbus_channel *channel;
- u64 req_id;
-
- /*
- * Copy the global state for completing the transaction. Note that
- * only one transaction can be active at a time. This is guaranteed
- * by the file copy protocol implemented by the host. Furthermore,
- * the "transaction active" state we maintain ensures that there can
- * only be one active transaction at a time.
- */
-
- buf_len = fcopy_transaction.recv_len;
- channel = fcopy_transaction.recv_channel;
- req_id = fcopy_transaction.recv_req_id;
-
- icmsghdr = (struct icmsg_hdr *)
- &recv_buffer[sizeof(struct vmbuspipe_hdr)];
-
- if (channel->onchannel_callback == NULL)
- /*
- * We have raced with util driver being unloaded;
- * silently return.
- */
- return;
-
- icmsghdr->status = error;
- icmsghdr->icflags = ICMSGHDRFLAG_TRANSACTION | ICMSGHDRFLAG_RESPONSE;
- vmbus_sendpacket(channel, recv_buffer, buf_len, req_id,
- VM_PKT_DATA_INBAND, 0);
-}
-
-void hv_fcopy_onchannelcallback(void *context)
-{
- struct vmbus_channel *channel = context;
- u32 recvlen;
- u64 requestid;
- struct hv_fcopy_hdr *fcopy_msg;
- struct icmsg_hdr *icmsghdr;
- int fcopy_srv_version;
-
- if (fcopy_transaction.state > HVUTIL_READY)
- return;
-
- if (vmbus_recvpacket(channel, recv_buffer, HV_HYP_PAGE_SIZE * 2, &recvlen, &requestid)) {
- pr_err_ratelimited("Fcopy request received. Could not read into recv buf\n");
- return;
- }
-
- if (!recvlen)
- return;
-
- /* Ensure recvlen is big enough to read header data */
- if (recvlen < ICMSG_HDR) {
- pr_err_ratelimited("Fcopy request received. Packet length too small: %d\n",
- recvlen);
- return;
- }
-
- icmsghdr = (struct icmsg_hdr *)&recv_buffer[
- sizeof(struct vmbuspipe_hdr)];
-
- if (icmsghdr->icmsgtype == ICMSGTYPE_NEGOTIATE) {
- if (vmbus_prep_negotiate_resp(icmsghdr,
- recv_buffer, recvlen,
- fw_versions, FW_VER_COUNT,
- fcopy_versions, FCOPY_VER_COUNT,
- NULL, &fcopy_srv_version)) {
-
- pr_info("FCopy IC version %d.%d\n",
- fcopy_srv_version >> 16,
- fcopy_srv_version & 0xFFFF);
- }
- } else if (icmsghdr->icmsgtype == ICMSGTYPE_FCOPY) {
- /* Ensure recvlen is big enough to contain hv_fcopy_hdr */
- if (recvlen < ICMSG_HDR + sizeof(struct hv_fcopy_hdr)) {
- pr_err_ratelimited("Invalid Fcopy hdr. Packet length too small: %u\n",
- recvlen);
- return;
- }
- fcopy_msg = (struct hv_fcopy_hdr *)&recv_buffer[ICMSG_HDR];
-
- /*
- * Stash away this global state for completing the
- * transaction; note transactions are serialized.
- */
-
- fcopy_transaction.recv_len = recvlen;
- fcopy_transaction.recv_req_id = requestid;
- fcopy_transaction.fcopy_msg = fcopy_msg;
-
- if (fcopy_transaction.state < HVUTIL_READY) {
- /* Userspace is not registered yet */
- fcopy_respond_to_host(HV_E_FAIL);
- return;
- }
- fcopy_transaction.state = HVUTIL_HOSTMSG_RECEIVED;
-
- /*
- * Send the information to the user-level daemon.
- */
- schedule_work(&fcopy_send_work);
- schedule_delayed_work(&fcopy_timeout_work,
- HV_UTIL_TIMEOUT * HZ);
- return;
- } else {
- pr_err_ratelimited("Fcopy request received. Invalid msg type: %d\n",
- icmsghdr->icmsgtype);
- return;
- }
- icmsghdr->icflags = ICMSGHDRFLAG_TRANSACTION | ICMSGHDRFLAG_RESPONSE;
- vmbus_sendpacket(channel, recv_buffer, recvlen, requestid,
- VM_PKT_DATA_INBAND, 0);
-}
-
-/* Callback when data is received from userspace */
-static int fcopy_on_msg(void *msg, int len)
-{
- int *val = (int *)msg;
-
- if (len != sizeof(int))
- return -EINVAL;
-
- if (fcopy_transaction.state == HVUTIL_DEVICE_INIT)
- return fcopy_handle_handshake(*val);
-
- if (fcopy_transaction.state != HVUTIL_USERSPACE_REQ)
- return -EINVAL;
-
- /*
- * Complete the transaction by forwarding the result
- * to the host. But first, cancel the timeout.
- */
- if (cancel_delayed_work_sync(&fcopy_timeout_work)) {
- fcopy_transaction.state = HVUTIL_USERSPACE_RECV;
- fcopy_respond_to_host(*val);
- hv_poll_channel(fcopy_transaction.recv_channel,
- fcopy_poll_wrapper);
- }
-
- return 0;
-}
-
-static void fcopy_on_reset(void)
-{
- /*
- * The daemon has exited; reset the state.
- */
- fcopy_transaction.state = HVUTIL_DEVICE_INIT;
-
- if (cancel_delayed_work_sync(&fcopy_timeout_work))
- fcopy_respond_to_host(HV_E_FAIL);
-}
-
-int hv_fcopy_init(struct hv_util_service *srv)
-{
- recv_buffer = srv->recv_buffer;
- fcopy_transaction.recv_channel = srv->channel;
- fcopy_transaction.recv_channel->max_pkt_size = HV_HYP_PAGE_SIZE * 2;
-
- /*
- * When this driver loads, the user level daemon that
- * processes the host requests may not yet be running.
- * Defer processing channel callbacks until the daemon
- * has registered.
- */
- fcopy_transaction.state = HVUTIL_DEVICE_INIT;
-
- hvt = hvutil_transport_init(fcopy_devname, 0, 0,
- fcopy_on_msg, fcopy_on_reset);
- if (!hvt)
- return -EFAULT;
-
- return 0;
-}
-
-static void hv_fcopy_cancel_work(void)
-{
- cancel_delayed_work_sync(&fcopy_timeout_work);
- cancel_work_sync(&fcopy_send_work);
-}
-
-int hv_fcopy_pre_suspend(void)
-{
- struct vmbus_channel *channel = fcopy_transaction.recv_channel;
- struct hv_fcopy_hdr *fcopy_msg;
-
- /*
- * Fake a CANCEL_FCOPY message for the user space daemon in case the
- * daemon is in the middle of copying some file. It doesn't matter if
- * there is already a message pending to be delivered to the user
- * space since we force fcopy_transaction.state to be HVUTIL_READY, so
- * the user space daemon's write() will fail with EINVAL (see
- * fcopy_on_msg()), and the daemon will reset the device by closing
- * and re-opening it.
- */
- fcopy_msg = kzalloc(sizeof(*fcopy_msg), GFP_KERNEL);
- if (!fcopy_msg)
- return -ENOMEM;
-
- tasklet_disable(&channel->callback_event);
-
- fcopy_msg->operation = CANCEL_FCOPY;
-
- hv_fcopy_cancel_work();
-
- /* We don't care about the return value. */
- hvutil_transport_send(hvt, fcopy_msg, sizeof(*fcopy_msg), NULL);
-
- kfree(fcopy_msg);
-
- fcopy_transaction.state = HVUTIL_READY;
-
- /* tasklet_enable() will be called in hv_fcopy_pre_resume(). */
- return 0;
-}
-
-int hv_fcopy_pre_resume(void)
-{
- struct vmbus_channel *channel = fcopy_transaction.recv_channel;
-
- tasklet_enable(&channel->callback_event);
-
- return 0;
-}
-
-void hv_fcopy_deinit(void)
-{
- fcopy_transaction.state = HVUTIL_DEVICE_DYING;
-
- hv_fcopy_cancel_work();
-
- hvutil_transport_destroy(hvt);
-}
diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c
index d35b60c06114..62795f6cbb00 100644
--- a/drivers/hv/hv_kvp.c
+++ b/drivers/hv/hv_kvp.c
@@ -27,7 +27,7 @@
#include <linux/connector.h>
#include <linux/workqueue.h>
#include <linux/hyperv.h>
-#include <asm/hyperv-tlfs.h>
+#include <hyperv/hvhdk.h>
#include "hyperv_vmbus.h"
#include "hv_utils_transport.h"
@@ -655,7 +655,7 @@ void hv_kvp_onchannelcallback(void *context)
if (host_negotiatied == NEGO_NOT_STARTED) {
host_negotiatied = NEGO_IN_PROGRESS;
schedule_delayed_work(&kvp_host_handshake_work,
- HV_UTIL_NEGO_TIMEOUT * HZ);
+ secs_to_jiffies(HV_UTIL_NEGO_TIMEOUT));
}
return;
}
@@ -724,7 +724,7 @@ void hv_kvp_onchannelcallback(void *context)
*/
schedule_work(&kvp_sendkey_work);
schedule_delayed_work(&kvp_timeout_work,
- HV_UTIL_TIMEOUT * HZ);
+ secs_to_jiffies(HV_UTIL_TIMEOUT));
return;
@@ -767,6 +767,12 @@ hv_kvp_init(struct hv_util_service *srv)
*/
kvp_transaction.state = HVUTIL_DEVICE_INIT;
+ return 0;
+}
+
+int
+hv_kvp_init_transport(void)
+{
hvt = hvutil_transport_init(kvp_devname, CN_KVP_IDX, CN_KVP_VAL,
kvp_on_msg, kvp_on_reset);
if (!hvt)
diff --git a/drivers/hv/hv_proc.c b/drivers/hv/hv_proc.c
new file mode 100644
index 000000000000..7d7ecb6f6137
--- /dev/null
+++ b/drivers/hv/hv_proc.c
@@ -0,0 +1,195 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/clockchips.h>
+#include <linux/slab.h>
+#include <linux/cpuhotplug.h>
+#include <linux/minmax.h>
+#include <asm/mshyperv.h>
+
+/*
+ * See struct hv_deposit_memory. The first u64 is partition ID, the rest
+ * are GPAs.
+ */
+#define HV_DEPOSIT_MAX (HV_HYP_PAGE_SIZE / sizeof(u64) - 1)
+
+/* Deposits exact number of pages. Must be called with interrupts enabled. */
+int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages)
+{
+ struct page **pages, *page;
+ int *counts;
+ int num_allocations;
+ int i, j, page_count;
+ int order;
+ u64 status;
+ int ret;
+ u64 base_pfn;
+ struct hv_deposit_memory *input_page;
+ unsigned long flags;
+
+ if (num_pages > HV_DEPOSIT_MAX)
+ return -E2BIG;
+ if (!num_pages)
+ return 0;
+
+ /* One buffer for page pointers and counts */
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+ pages = page_address(page);
+
+ counts = kcalloc(HV_DEPOSIT_MAX, sizeof(int), GFP_KERNEL);
+ if (!counts) {
+ free_page((unsigned long)pages);
+ return -ENOMEM;
+ }
+
+ /* Allocate all the pages before disabling interrupts */
+ i = 0;
+
+ while (num_pages) {
+ /* Find highest order we can actually allocate */
+ order = 31 - __builtin_clz(num_pages);
+
+ while (1) {
+ pages[i] = alloc_pages_node(node, GFP_KERNEL, order);
+ if (pages[i])
+ break;
+ if (!order) {
+ ret = -ENOMEM;
+ num_allocations = i;
+ goto err_free_allocations;
+ }
+ --order;
+ }
+
+ split_page(pages[i], order);
+ counts[i] = 1 << order;
+ num_pages -= counts[i];
+ i++;
+ }
+ num_allocations = i;
+
+ local_irq_save(flags);
+
+ input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ input_page->partition_id = partition_id;
+
+ /* Populate gpa_page_list - these will fit on the input page */
+ for (i = 0, page_count = 0; i < num_allocations; ++i) {
+ base_pfn = page_to_pfn(pages[i]);
+ for (j = 0; j < counts[i]; ++j, ++page_count)
+ input_page->gpa_page_list[page_count] = base_pfn + j;
+ }
+ status = hv_do_rep_hypercall(HVCALL_DEPOSIT_MEMORY,
+ page_count, 0, input_page, NULL);
+ local_irq_restore(flags);
+ if (!hv_result_success(status)) {
+ hv_status_err(status, "\n");
+ ret = hv_result_to_errno(status);
+ goto err_free_allocations;
+ }
+
+ ret = 0;
+ goto free_buf;
+
+err_free_allocations:
+ for (i = 0; i < num_allocations; ++i) {
+ base_pfn = page_to_pfn(pages[i]);
+ for (j = 0; j < counts[i]; ++j)
+ __free_page(pfn_to_page(base_pfn + j));
+ }
+
+free_buf:
+ free_page((unsigned long)pages);
+ kfree(counts);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(hv_call_deposit_pages);
+
+int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id)
+{
+ struct hv_input_add_logical_processor *input;
+ struct hv_output_add_logical_processor *output;
+ u64 status;
+ unsigned long flags;
+ int ret = 0;
+
+ /*
+ * When adding a logical processor, the hypervisor may return
+ * HV_STATUS_INSUFFICIENT_MEMORY. When that happens, we deposit more
+ * pages and retry.
+ */
+ do {
+ local_irq_save(flags);
+
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ /* We don't do anything with the output right now */
+ output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+ input->lp_index = lp_index;
+ input->apic_id = apic_id;
+ input->proximity_domain_info = hv_numa_node_to_pxm_info(node);
+ status = hv_do_hypercall(HVCALL_ADD_LOGICAL_PROCESSOR,
+ input, output);
+ local_irq_restore(flags);
+
+ if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
+ if (!hv_result_success(status)) {
+ hv_status_err(status, "cpu %u apic ID: %u\n",
+ lp_index, apic_id);
+ ret = hv_result_to_errno(status);
+ }
+ break;
+ }
+ ret = hv_call_deposit_pages(node, hv_current_partition_id, 1);
+ } while (!ret);
+
+ return ret;
+}
+
+int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags)
+{
+ struct hv_create_vp *input;
+ u64 status;
+ unsigned long irq_flags;
+ int ret = 0;
+
+ /* Root VPs don't seem to need pages deposited */
+ if (partition_id != hv_current_partition_id) {
+ /* The value 90 is empirically determined. It may change. */
+ ret = hv_call_deposit_pages(node, partition_id, 90);
+ if (ret)
+ return ret;
+ }
+
+ do {
+ local_irq_save(irq_flags);
+
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ input->partition_id = partition_id;
+ input->vp_index = vp_index;
+ input->flags = flags;
+ input->subnode_type = HV_SUBNODE_ANY;
+ input->proximity_domain_info = hv_numa_node_to_pxm_info(node);
+ status = hv_do_hypercall(HVCALL_CREATE_VP, input, NULL);
+ local_irq_restore(irq_flags);
+
+ if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
+ if (!hv_result_success(status)) {
+ hv_status_err(status, "vcpu: %u, lp: %u\n",
+ vp_index, flags);
+ ret = hv_result_to_errno(status);
+ }
+ break;
+ }
+ ret = hv_call_deposit_pages(node, partition_id, 1);
+
+ } while (!ret);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(hv_call_create_vp);
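Illustration (not part of the diff): a minimal caller of hv_call_create_vp() for a child partition, sketched against the prototypes above. The partition id, VP count and NUMA node are placeholders; the deposit-and-retry on HV_STATUS_INSUFFICIENT_MEMORY is already handled inside hv_call_create_vp().

#include <linux/numa.h>
#include <asm/mshyperv.h>

static int example_create_child_vps(u64 child_partition_id, u32 nr_vps)
{
	u32 vp_index;
	int ret;

	for (vp_index = 0; vp_index < nr_vps; vp_index++) {
		/* NUMA_NO_NODE is a placeholder; a real caller picks the VP's home node */
		ret = hv_call_create_vp(NUMA_NO_NODE, child_partition_id,
					vp_index, 0 /* flags */);
		if (ret)
			return ret;
	}

	return 0;
}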
diff --git a/drivers/hv/hv_snapshot.c b/drivers/hv/hv_snapshot.c
index 0d2184be1691..2e7f537d53cf 100644
--- a/drivers/hv/hv_snapshot.c
+++ b/drivers/hv/hv_snapshot.c
@@ -12,7 +12,7 @@
#include <linux/connector.h>
#include <linux/workqueue.h>
#include <linux/hyperv.h>
-#include <asm/hyperv-tlfs.h>
+#include <hyperv/hvhdk.h>
#include "hyperv_vmbus.h"
#include "hv_utils_transport.h"
@@ -193,7 +193,8 @@ static void vss_send_op(void)
vss_transaction.state = HVUTIL_USERSPACE_REQ;
schedule_delayed_work(&vss_timeout_work, op == VSS_OP_FREEZE ?
- VSS_FREEZE_TIMEOUT * HZ : HV_UTIL_TIMEOUT * HZ);
+ secs_to_jiffies(VSS_FREEZE_TIMEOUT) :
+ secs_to_jiffies(HV_UTIL_TIMEOUT));
rc = hvutil_transport_send(hvt, vss_msg, sizeof(*vss_msg), NULL);
if (rc) {
@@ -388,6 +389,12 @@ hv_vss_init(struct hv_util_service *srv)
*/
vss_transaction.state = HVUTIL_DEVICE_INIT;
+ return 0;
+}
+
+int
+hv_vss_init_transport(void)
+{
hvt = hvutil_transport_init(vss_devname, CN_VSS_IDX, CN_VSS_VAL,
vss_on_msg, vss_on_reset);
if (!hvt) {
diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c
index 9c97c4065fe7..36ee89c0358b 100644
--- a/drivers/hv/hv_util.c
+++ b/drivers/hv/hv_util.c
@@ -141,6 +141,7 @@ static struct hv_util_service util_heartbeat = {
static struct hv_util_service util_kvp = {
.util_cb = hv_kvp_onchannelcallback,
.util_init = hv_kvp_init,
+ .util_init_transport = hv_kvp_init_transport,
.util_pre_suspend = hv_kvp_pre_suspend,
.util_pre_resume = hv_kvp_pre_resume,
.util_deinit = hv_kvp_deinit,
@@ -149,19 +150,12 @@ static struct hv_util_service util_kvp = {
static struct hv_util_service util_vss = {
.util_cb = hv_vss_onchannelcallback,
.util_init = hv_vss_init,
+ .util_init_transport = hv_vss_init_transport,
.util_pre_suspend = hv_vss_pre_suspend,
.util_pre_resume = hv_vss_pre_resume,
.util_deinit = hv_vss_deinit,
};
-static struct hv_util_service util_fcopy = {
- .util_cb = hv_fcopy_onchannelcallback,
- .util_init = hv_fcopy_init,
- .util_pre_suspend = hv_fcopy_pre_suspend,
- .util_pre_resume = hv_fcopy_pre_resume,
- .util_deinit = hv_fcopy_deinit,
-};
-
static void perform_shutdown(struct work_struct *dummy)
{
orderly_poweroff(true);
@@ -598,10 +592,8 @@ static int util_probe(struct hv_device *dev,
srv->channel = dev->channel;
if (srv->util_init) {
ret = srv->util_init(srv);
- if (ret) {
- ret = -ENODEV;
+ if (ret)
goto error1;
- }
}
/*
@@ -621,6 +613,13 @@ static int util_probe(struct hv_device *dev,
if (ret)
goto error;
+ if (srv->util_init_transport) {
+ ret = srv->util_init_transport();
+ if (ret) {
+ vmbus_close(dev->channel);
+ goto error;
+ }
+ }
return 0;
error:
@@ -700,10 +699,6 @@ static const struct hv_vmbus_device_id id_table[] = {
{ HV_VSS_GUID,
.driver_data = (unsigned long)&util_vss
},
- /* File copy GUID */
- { HV_FCOPY_GUID,
- .driver_data = (unsigned long)&util_fcopy
- },
{ },
};
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index f6b1e710f805..0b450e53161e 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -15,10 +15,10 @@
#include <linux/list.h>
#include <linux/bitops.h>
#include <asm/sync_bitops.h>
-#include <asm/hyperv-tlfs.h>
#include <linux/atomic.h>
#include <linux/hyperv.h>
#include <linux/interrupt.h>
+#include <hyperv/hvhdk.h>
#include "hv_trace.h"
@@ -287,18 +287,10 @@ struct vmbus_connection {
struct completion ready_for_suspend_event;
/*
- * The number of primary channels that should be "fixed up"
- * upon resume: these channels are re-offered upon resume, and some
- * fields of the channel offers (i.e. child_relid and connection_id)
- * can change, so the old offermsg must be fixed up, before the resume
- * callbacks of the VSC drivers start to further touch the channels.
+ * Completed once the host has offered all boot-time channels.
+	 * Note that some channel offers may still be pending on a workqueue.
*/
- atomic_t nr_chan_fixup_on_resume;
- /*
- * vmbus_bus_resume() waits for "nr_chan_fixup_on_resume" to
- * drop to zero.
- */
- struct completion ready_for_resume_event;
+ struct completion all_offers_delivered_event;
};
@@ -370,22 +362,18 @@ void vmbus_on_event(unsigned long data);
void vmbus_on_msg_dpc(unsigned long data);
int hv_kvp_init(struct hv_util_service *srv);
+int hv_kvp_init_transport(void);
void hv_kvp_deinit(void);
int hv_kvp_pre_suspend(void);
int hv_kvp_pre_resume(void);
void hv_kvp_onchannelcallback(void *context);
int hv_vss_init(struct hv_util_service *srv);
+int hv_vss_init_transport(void);
void hv_vss_deinit(void);
int hv_vss_pre_suspend(void);
int hv_vss_pre_resume(void);
void hv_vss_onchannelcallback(void *context);
-
-int hv_fcopy_init(struct hv_util_service *srv);
-void hv_fcopy_deinit(void);
-int hv_fcopy_pre_suspend(void);
-int hv_fcopy_pre_resume(void);
-void hv_fcopy_onchannelcallback(void *context);
void vmbus_initiate_unload(bool crash);
static inline void hv_poll_channel(struct vmbus_channel *channel,
@@ -417,6 +405,11 @@ static inline bool hv_is_perf_channel(struct vmbus_channel *channel)
return vmbus_devs[channel->device_id].perf_device;
}
+static inline size_t hv_dev_ring_size(struct vmbus_channel *channel)
+{
+ return vmbus_devs[channel->device_id].pref_ring_size;
+}
+
static inline bool hv_is_allocated_cpu(unsigned int cpu)
{
struct vmbus_channel *channel, *sc;
@@ -484,4 +477,10 @@ static inline int hv_debug_add_dev_dir(struct hv_device *dev)
#endif /* CONFIG_HYPERV_TESTING */
+/* Create and remove sysfs entry for memory mapped ring buffers for a channel */
+int hv_create_ring_sysfs(struct vmbus_channel *channel,
+ int (*hv_mmap_ring_buffer)(struct vmbus_channel *channel,
+ struct vm_area_struct *vma));
+int hv_remove_ring_sysfs(struct vmbus_channel *channel);
+
#endif /* _HYPERV_VMBUS_H */
diff --git a/drivers/hv/mshv.h b/drivers/hv/mshv.h
new file mode 100644
index 000000000000..0340a67acd0a
--- /dev/null
+++ b/drivers/hv/mshv.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2023, Microsoft Corporation.
+ */
+
+#ifndef _MSHV_H_
+#define _MSHV_H_
+
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <hyperv/hvhdk.h>
+
+#define mshv_field_nonzero(STRUCT, MEMBER) \
+ memchr_inv(&((STRUCT).MEMBER), \
+ 0, sizeof_field(typeof(STRUCT), MEMBER))
+
+int hv_call_get_vp_registers(u32 vp_index, u64 partition_id, u16 count,
+ union hv_input_vtl input_vtl,
+ struct hv_register_assoc *registers);
+
+int hv_call_set_vp_registers(u32 vp_index, u64 partition_id, u16 count,
+ union hv_input_vtl input_vtl,
+ struct hv_register_assoc *registers);
+
+int hv_call_get_partition_property(u64 partition_id, u64 property_code,
+ u64 *property_value);
+
+int mshv_do_pre_guest_mode_work(ulong th_flags);
+
+#endif /* _MSHV_H */
diff --git a/drivers/hv/mshv_common.c b/drivers/hv/mshv_common.c
new file mode 100644
index 000000000000..2575e6d7a71f
--- /dev/null
+++ b/drivers/hv/mshv_common.c
@@ -0,0 +1,161 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024, Microsoft Corporation.
+ *
+ * This file contains functions that will be called from one or more modules.
+ * If any of these modules are configured to build, this file is built and just
+ * statically linked in.
+ *
+ * Authors: Microsoft Linux virtualization team
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <asm/mshyperv.h>
+#include <linux/resume_user_mode.h>
+
+#include "mshv.h"
+
+#define HV_GET_REGISTER_BATCH_SIZE \
+ (HV_HYP_PAGE_SIZE / sizeof(union hv_register_value))
+#define HV_SET_REGISTER_BATCH_SIZE \
+ ((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_set_vp_registers)) \
+ / sizeof(struct hv_register_assoc))
+
+int hv_call_get_vp_registers(u32 vp_index, u64 partition_id, u16 count,
+ union hv_input_vtl input_vtl,
+ struct hv_register_assoc *registers)
+{
+ struct hv_input_get_vp_registers *input_page;
+ union hv_register_value *output_page;
+ u16 completed = 0;
+ unsigned long remaining = count;
+ int rep_count, i;
+ u64 status = HV_STATUS_SUCCESS;
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ output_page = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+ input_page->partition_id = partition_id;
+ input_page->vp_index = vp_index;
+ input_page->input_vtl.as_uint8 = input_vtl.as_uint8;
+ input_page->rsvd_z8 = 0;
+ input_page->rsvd_z16 = 0;
+
+ while (remaining) {
+ rep_count = min(remaining, HV_GET_REGISTER_BATCH_SIZE);
+ for (i = 0; i < rep_count; ++i)
+ input_page->names[i] = registers[i].name;
+
+ status = hv_do_rep_hypercall(HVCALL_GET_VP_REGISTERS, rep_count,
+ 0, input_page, output_page);
+ if (!hv_result_success(status))
+ break;
+
+ completed = hv_repcomp(status);
+ for (i = 0; i < completed; ++i)
+ registers[i].value = output_page[i];
+
+ registers += completed;
+ remaining -= completed;
+ }
+ local_irq_restore(flags);
+
+ return hv_result_to_errno(status);
+}
+EXPORT_SYMBOL_GPL(hv_call_get_vp_registers);
+
+int hv_call_set_vp_registers(u32 vp_index, u64 partition_id, u16 count,
+ union hv_input_vtl input_vtl,
+ struct hv_register_assoc *registers)
+{
+ struct hv_input_set_vp_registers *input_page;
+ u16 completed = 0;
+ unsigned long remaining = count;
+ int rep_count;
+ u64 status = HV_STATUS_SUCCESS;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ input_page->partition_id = partition_id;
+ input_page->vp_index = vp_index;
+ input_page->input_vtl.as_uint8 = input_vtl.as_uint8;
+ input_page->rsvd_z8 = 0;
+ input_page->rsvd_z16 = 0;
+
+ while (remaining) {
+ rep_count = min(remaining, HV_SET_REGISTER_BATCH_SIZE);
+ memcpy(input_page->elements, registers,
+ sizeof(struct hv_register_assoc) * rep_count);
+
+ status = hv_do_rep_hypercall(HVCALL_SET_VP_REGISTERS, rep_count,
+ 0, input_page, NULL);
+ if (!hv_result_success(status))
+ break;
+
+ completed = hv_repcomp(status);
+ registers += completed;
+ remaining -= completed;
+ }
+
+ local_irq_restore(flags);
+
+ return hv_result_to_errno(status);
+}
+EXPORT_SYMBOL_GPL(hv_call_set_vp_registers);
+
+int hv_call_get_partition_property(u64 partition_id,
+ u64 property_code,
+ u64 *property_value)
+{
+ u64 status;
+ unsigned long flags;
+ struct hv_input_get_partition_property *input;
+ struct hv_output_get_partition_property *output;
+
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+ memset(input, 0, sizeof(*input));
+ input->partition_id = partition_id;
+ input->property_code = property_code;
+ status = hv_do_hypercall(HVCALL_GET_PARTITION_PROPERTY, input, output);
+
+ if (!hv_result_success(status)) {
+ local_irq_restore(flags);
+ return hv_result_to_errno(status);
+ }
+ *property_value = output->property_value;
+
+ local_irq_restore(flags);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(hv_call_get_partition_property);
+
+/*
+ * Handle any pre-processing before going into the guest mode on this cpu, most
+ * notably call schedule(). Must be invoked with both preemption and
+ * interrupts enabled.
+ *
+ * Returns: 0 on success, -errno on error.
+ */
+int mshv_do_pre_guest_mode_work(ulong th_flags)
+{
+ if (th_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
+ return -EINTR;
+
+ if (th_flags & _TIF_NEED_RESCHED)
+ schedule();
+
+ if (th_flags & _TIF_NOTIFY_RESUME)
+ resume_user_mode_work(NULL);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mshv_do_pre_guest_mode_work);
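Illustration (not part of the diff): reading one VP register with the batched helper above, a sketch assuming the caller passes in a register-name constant from the hvhdk headers and that a zeroed hv_input_vtl targets the default VTL.

#include <hyperv/hvhdk.h>
#include "mshv.h"

static int example_read_one_vp_register(u64 partition_id, u32 vp_index,
					u32 reg_name, u64 *value)
{
	struct hv_register_assoc reg = { .name = reg_name };
	union hv_input_vtl vtl = {};	/* zero: default VTL (assumption) */
	int ret;

	ret = hv_call_get_vp_registers(vp_index, partition_id, 1, vtl, &reg);
	if (ret)
		return ret;

	*value = reg.value.reg64;
	return 0;
}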
diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c
new file mode 100644
index 000000000000..8dd22be2ca0b
--- /dev/null
+++ b/drivers/hv/mshv_eventfd.c
@@ -0,0 +1,833 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * eventfd support for mshv
+ *
+ * Heavily inspired by the KVM implementation of irqfd/ioeventfd. The basic
+ * framework code is taken from the KVM implementation.
+ *
+ * All credit to the KVM developers.
+ */
+
+#include <linux/syscalls.h>
+#include <linux/wait.h>
+#include <linux/poll.h>
+#include <linux/file.h>
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/eventfd.h>
+
+#if IS_ENABLED(CONFIG_X86_64)
+#include <asm/apic.h>
+#endif
+#include <asm/mshyperv.h>
+
+#include "mshv_eventfd.h"
+#include "mshv.h"
+#include "mshv_root.h"
+
+static struct workqueue_struct *irqfd_cleanup_wq;
+
+void mshv_register_irq_ack_notifier(struct mshv_partition *partition,
+ struct mshv_irq_ack_notifier *mian)
+{
+ mutex_lock(&partition->pt_irq_lock);
+ hlist_add_head_rcu(&mian->link, &partition->irq_ack_notifier_list);
+ mutex_unlock(&partition->pt_irq_lock);
+}
+
+void mshv_unregister_irq_ack_notifier(struct mshv_partition *partition,
+ struct mshv_irq_ack_notifier *mian)
+{
+ mutex_lock(&partition->pt_irq_lock);
+ hlist_del_init_rcu(&mian->link);
+ mutex_unlock(&partition->pt_irq_lock);
+ synchronize_rcu();
+}
+
+bool mshv_notify_acked_gsi(struct mshv_partition *partition, int gsi)
+{
+ struct mshv_irq_ack_notifier *mian;
+ bool acked = false;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(mian, &partition->irq_ack_notifier_list,
+ link) {
+ if (mian->irq_ack_gsi == gsi) {
+ mian->irq_acked(mian);
+ acked = true;
+ }
+ }
+ rcu_read_unlock();
+
+ return acked;
+}
+
+#if IS_ENABLED(CONFIG_ARM64)
+static inline bool hv_should_clear_interrupt(enum hv_interrupt_type type)
+{
+ return false;
+}
+#elif IS_ENABLED(CONFIG_X86_64)
+static inline bool hv_should_clear_interrupt(enum hv_interrupt_type type)
+{
+ return type == HV_X64_INTERRUPT_TYPE_EXTINT;
+}
+#endif
+
+static void mshv_irqfd_resampler_ack(struct mshv_irq_ack_notifier *mian)
+{
+ struct mshv_irqfd_resampler *resampler;
+ struct mshv_partition *partition;
+ struct mshv_irqfd *irqfd;
+ int idx;
+
+ resampler = container_of(mian, struct mshv_irqfd_resampler,
+ rsmplr_notifier);
+ partition = resampler->rsmplr_partn;
+
+ idx = srcu_read_lock(&partition->pt_irq_srcu);
+
+ hlist_for_each_entry_rcu(irqfd, &resampler->rsmplr_irqfd_list,
+ irqfd_resampler_hnode) {
+ if (hv_should_clear_interrupt(irqfd->irqfd_lapic_irq.lapic_control.interrupt_type))
+ hv_call_clear_virtual_interrupt(partition->pt_id);
+
+ eventfd_signal(irqfd->irqfd_resamplefd);
+ }
+
+ srcu_read_unlock(&partition->pt_irq_srcu, idx);
+}
+
+#if IS_ENABLED(CONFIG_X86_64)
+static bool
+mshv_vp_irq_vector_injected(union hv_vp_register_page_interrupt_vectors iv,
+ u32 vector)
+{
+ int i;
+
+ for (i = 0; i < iv.vector_count; i++) {
+ if (iv.vector[i] == vector)
+ return true;
+ }
+
+ return false;
+}
+
+static int mshv_vp_irq_try_set_vector(struct mshv_vp *vp, u32 vector)
+{
+ union hv_vp_register_page_interrupt_vectors iv, new_iv;
+
+ iv = vp->vp_register_page->interrupt_vectors;
+ new_iv = iv;
+
+ if (mshv_vp_irq_vector_injected(iv, vector))
+ return 0;
+
+ if (iv.vector_count >= HV_VP_REGISTER_PAGE_MAX_VECTOR_COUNT)
+ return -ENOSPC;
+
+ new_iv.vector[new_iv.vector_count++] = vector;
+
+ if (cmpxchg(&vp->vp_register_page->interrupt_vectors.as_uint64,
+ iv.as_uint64, new_iv.as_uint64) != iv.as_uint64)
+ return -EAGAIN;
+
+ return 0;
+}
+
+static int mshv_vp_irq_set_vector(struct mshv_vp *vp, u32 vector)
+{
+ int ret;
+
+ do {
+ ret = mshv_vp_irq_try_set_vector(vp, vector);
+ } while (ret == -EAGAIN && !need_resched());
+
+ return ret;
+}
+
+/*
+ * Try to raise an irq for the guest via the shared vector array. The
+ * hypervisor does the actual injection of the interrupt.
+ */
+static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd)
+{
+ struct mshv_partition *partition = irqfd->irqfd_partn;
+ struct mshv_lapic_irq *irq = &irqfd->irqfd_lapic_irq;
+ struct mshv_vp *vp;
+
+ if (!(ms_hyperv.ext_features &
+ HV_VP_DISPATCH_INTERRUPT_INJECTION_AVAILABLE))
+ return -EOPNOTSUPP;
+
+ if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
+ return -EOPNOTSUPP;
+
+ if (irq->lapic_control.logical_dest_mode)
+ return -EOPNOTSUPP;
+
+ vp = partition->pt_vp_array[irq->lapic_apic_id];
+
+ if (!vp->vp_register_page)
+ return -EOPNOTSUPP;
+
+ if (mshv_vp_irq_set_vector(vp, irq->lapic_vector))
+ return -EINVAL;
+
+ if (vp->run.flags.root_sched_dispatched &&
+ vp->vp_register_page->interrupt_vectors.as_uint64)
+ return -EBUSY;
+
+ wake_up(&vp->run.vp_suspend_queue);
+
+ return 0;
+}
+#else /* CONFIG_X86_64 */
+static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
+static void mshv_assert_irq_slow(struct mshv_irqfd *irqfd)
+{
+ struct mshv_partition *partition = irqfd->irqfd_partn;
+ struct mshv_lapic_irq *irq = &irqfd->irqfd_lapic_irq;
+ unsigned int seq;
+ int idx;
+
+ WARN_ON(irqfd->irqfd_resampler &&
+ !irq->lapic_control.level_triggered);
+
+ idx = srcu_read_lock(&partition->pt_irq_srcu);
+ if (irqfd->irqfd_girq_ent.guest_irq_num) {
+ if (!irqfd->irqfd_girq_ent.girq_entry_valid) {
+ srcu_read_unlock(&partition->pt_irq_srcu, idx);
+ return;
+ }
+
+ do {
+ seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc);
+ } while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq));
+ }
+
+ hv_call_assert_virtual_interrupt(irqfd->irqfd_partn->pt_id,
+ irq->lapic_vector, irq->lapic_apic_id,
+ irq->lapic_control);
+ srcu_read_unlock(&partition->pt_irq_srcu, idx);
+}
+
+static void mshv_irqfd_resampler_shutdown(struct mshv_irqfd *irqfd)
+{
+ struct mshv_irqfd_resampler *rp = irqfd->irqfd_resampler;
+ struct mshv_partition *pt = rp->rsmplr_partn;
+
+ mutex_lock(&pt->irqfds_resampler_lock);
+
+ hlist_del_rcu(&irqfd->irqfd_resampler_hnode);
+ synchronize_srcu(&pt->pt_irq_srcu);
+
+ if (hlist_empty(&rp->rsmplr_irqfd_list)) {
+ hlist_del(&rp->rsmplr_hnode);
+ mshv_unregister_irq_ack_notifier(pt, &rp->rsmplr_notifier);
+ kfree(rp);
+ }
+
+ mutex_unlock(&pt->irqfds_resampler_lock);
+}
+
+/*
+ * Race-free decouple logic (ordering is critical)
+ */
+static void mshv_irqfd_shutdown(struct work_struct *work)
+{
+ struct mshv_irqfd *irqfd =
+ container_of(work, struct mshv_irqfd, irqfd_shutdown);
+
+ /*
+ * Synchronize with the wait-queue and unhook ourselves to prevent
+ * further events.
+ */
+ remove_wait_queue(irqfd->irqfd_wqh, &irqfd->irqfd_wait);
+
+ if (irqfd->irqfd_resampler) {
+ mshv_irqfd_resampler_shutdown(irqfd);
+ eventfd_ctx_put(irqfd->irqfd_resamplefd);
+ }
+
+ /*
+ * It is now safe to release the object's resources
+ */
+ eventfd_ctx_put(irqfd->irqfd_eventfd_ctx);
+ kfree(irqfd);
+}
+
+/* assumes partition->pt_irqfds_lock is held */
+static bool mshv_irqfd_is_active(struct mshv_irqfd *irqfd)
+{
+ return !hlist_unhashed(&irqfd->irqfd_hnode);
+}
+
+/*
+ * Mark the irqfd as inactive and schedule it for removal
+ *
+ * assumes partition->pt_irqfds_lock is held
+ */
+static void mshv_irqfd_deactivate(struct mshv_irqfd *irqfd)
+{
+ if (!mshv_irqfd_is_active(irqfd))
+ return;
+
+ hlist_del(&irqfd->irqfd_hnode);
+
+ queue_work(irqfd_cleanup_wq, &irqfd->irqfd_shutdown);
+}
+
+/*
+ * Called with wqh->lock held and interrupts disabled
+ */
+static int mshv_irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode,
+ int sync, void *key)
+{
+ struct mshv_irqfd *irqfd = container_of(wait, struct mshv_irqfd,
+ irqfd_wait);
+ unsigned long flags = (unsigned long)key;
+ int idx;
+ unsigned int seq;
+ struct mshv_partition *pt = irqfd->irqfd_partn;
+ int ret = 0;
+
+ if (flags & POLLIN) {
+ u64 cnt;
+
+ eventfd_ctx_do_read(irqfd->irqfd_eventfd_ctx, &cnt);
+ idx = srcu_read_lock(&pt->pt_irq_srcu);
+ do {
+ seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc);
+ } while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq));
+
+ /* An event has been signaled, raise an interrupt */
+ ret = mshv_try_assert_irq_fast(irqfd);
+ if (ret)
+ mshv_assert_irq_slow(irqfd);
+
+ srcu_read_unlock(&pt->pt_irq_srcu, idx);
+
+ ret = 1;
+ }
+
+ if (flags & POLLHUP) {
+ /* The eventfd is closing, detach from the partition */
+ unsigned long flags;
+
+ spin_lock_irqsave(&pt->pt_irqfds_lock, flags);
+
+ /*
+ * We must check if someone deactivated the irqfd before
+ * we could acquire the pt_irqfds_lock since the item is
+ * deactivated from the mshv side before it is unhooked from
+ * the wait-queue. If it is already deactivated, we can
+		 * simply return knowing the other side will clean up for us.
+		 * We cannot race against the irqfd going away since the
+		 * other side is required to acquire wqh->lock, which we hold.
+ */
+ if (mshv_irqfd_is_active(irqfd))
+ mshv_irqfd_deactivate(irqfd);
+
+ spin_unlock_irqrestore(&pt->pt_irqfds_lock, flags);
+ }
+
+ return ret;
+}
+
+/* Must be called under pt_irqfds_lock */
+static void mshv_irqfd_update(struct mshv_partition *pt,
+ struct mshv_irqfd *irqfd)
+{
+ write_seqcount_begin(&irqfd->irqfd_irqe_sc);
+ irqfd->irqfd_girq_ent = mshv_ret_girq_entry(pt,
+ irqfd->irqfd_irqnum);
+ mshv_copy_girq_info(&irqfd->irqfd_girq_ent, &irqfd->irqfd_lapic_irq);
+ write_seqcount_end(&irqfd->irqfd_irqe_sc);
+}
+
+void mshv_irqfd_routing_update(struct mshv_partition *pt)
+{
+ struct mshv_irqfd *irqfd;
+
+ spin_lock_irq(&pt->pt_irqfds_lock);
+ hlist_for_each_entry(irqfd, &pt->pt_irqfds_list, irqfd_hnode)
+ mshv_irqfd_update(pt, irqfd);
+ spin_unlock_irq(&pt->pt_irqfds_lock);
+}
+
+static void mshv_irqfd_queue_proc(struct file *file, wait_queue_head_t *wqh,
+ poll_table *polltbl)
+{
+ struct mshv_irqfd *irqfd =
+ container_of(polltbl, struct mshv_irqfd, irqfd_polltbl);
+
+ irqfd->irqfd_wqh = wqh;
+ add_wait_queue_priority(wqh, &irqfd->irqfd_wait);
+}
+
+static int mshv_irqfd_assign(struct mshv_partition *pt,
+ struct mshv_user_irqfd *args)
+{
+ struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
+ struct mshv_irqfd *irqfd, *tmp;
+ unsigned int events;
+ struct fd f;
+ int ret;
+ int idx;
+
+ irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
+ if (!irqfd)
+ return -ENOMEM;
+
+ irqfd->irqfd_partn = pt;
+ irqfd->irqfd_irqnum = args->gsi;
+ INIT_WORK(&irqfd->irqfd_shutdown, mshv_irqfd_shutdown);
+ seqcount_spinlock_init(&irqfd->irqfd_irqe_sc, &pt->pt_irqfds_lock);
+
+ f = fdget(args->fd);
+ if (!fd_file(f)) {
+ ret = -EBADF;
+ goto out;
+ }
+
+ eventfd = eventfd_ctx_fileget(fd_file(f));
+ if (IS_ERR(eventfd)) {
+ ret = PTR_ERR(eventfd);
+ goto fail;
+ }
+
+ irqfd->irqfd_eventfd_ctx = eventfd;
+
+ if (args->flags & BIT(MSHV_IRQFD_BIT_RESAMPLE)) {
+ struct mshv_irqfd_resampler *rp;
+
+ resamplefd = eventfd_ctx_fdget(args->resamplefd);
+ if (IS_ERR(resamplefd)) {
+ ret = PTR_ERR(resamplefd);
+ goto fail;
+ }
+
+ irqfd->irqfd_resamplefd = resamplefd;
+
+ mutex_lock(&pt->irqfds_resampler_lock);
+
+ hlist_for_each_entry(rp, &pt->irqfds_resampler_list,
+ rsmplr_hnode) {
+ if (rp->rsmplr_notifier.irq_ack_gsi ==
+ irqfd->irqfd_irqnum) {
+ irqfd->irqfd_resampler = rp;
+ break;
+ }
+ }
+
+ if (!irqfd->irqfd_resampler) {
+ rp = kzalloc(sizeof(*rp), GFP_KERNEL_ACCOUNT);
+ if (!rp) {
+ ret = -ENOMEM;
+ mutex_unlock(&pt->irqfds_resampler_lock);
+ goto fail;
+ }
+
+ rp->rsmplr_partn = pt;
+ INIT_HLIST_HEAD(&rp->rsmplr_irqfd_list);
+ rp->rsmplr_notifier.irq_ack_gsi = irqfd->irqfd_irqnum;
+ rp->rsmplr_notifier.irq_acked =
+ mshv_irqfd_resampler_ack;
+
+ hlist_add_head(&rp->rsmplr_hnode,
+ &pt->irqfds_resampler_list);
+ mshv_register_irq_ack_notifier(pt,
+ &rp->rsmplr_notifier);
+ irqfd->irqfd_resampler = rp;
+ }
+
+ hlist_add_head_rcu(&irqfd->irqfd_resampler_hnode,
+ &irqfd->irqfd_resampler->rsmplr_irqfd_list);
+
+ mutex_unlock(&pt->irqfds_resampler_lock);
+ }
+
+ /*
+ * Install our own custom wake-up handling so we are notified via
+ * a callback whenever someone signals the underlying eventfd
+ */
+ init_waitqueue_func_entry(&irqfd->irqfd_wait, mshv_irqfd_wakeup);
+ init_poll_funcptr(&irqfd->irqfd_polltbl, mshv_irqfd_queue_proc);
+
+ spin_lock_irq(&pt->pt_irqfds_lock);
+ if (args->flags & BIT(MSHV_IRQFD_BIT_RESAMPLE) &&
+ !irqfd->irqfd_lapic_irq.lapic_control.level_triggered) {
+ /*
+		 * A resample fd is only valid for a level-triggered
+		 * interrupt; otherwise fail the request.
+ */
+ spin_unlock_irq(&pt->pt_irqfds_lock);
+ ret = -EINVAL;
+ goto fail;
+ }
+ ret = 0;
+ hlist_for_each_entry(tmp, &pt->pt_irqfds_list, irqfd_hnode) {
+ if (irqfd->irqfd_eventfd_ctx != tmp->irqfd_eventfd_ctx)
+ continue;
+ /* This fd is used for another irq already. */
+ ret = -EBUSY;
+ spin_unlock_irq(&pt->pt_irqfds_lock);
+ goto fail;
+ }
+
+ idx = srcu_read_lock(&pt->pt_irq_srcu);
+ mshv_irqfd_update(pt, irqfd);
+ hlist_add_head(&irqfd->irqfd_hnode, &pt->pt_irqfds_list);
+ spin_unlock_irq(&pt->pt_irqfds_lock);
+
+ /*
+ * Check if there was an event already pending on the eventfd
+ * before we registered, and trigger it as if we didn't miss it.
+ */
+ events = vfs_poll(fd_file(f), &irqfd->irqfd_polltbl);
+
+ if (events & POLLIN)
+ mshv_assert_irq_slow(irqfd);
+
+ srcu_read_unlock(&pt->pt_irq_srcu, idx);
+ /*
+ * do not drop the file until the irqfd is fully initialized, otherwise
+ * we might race against the POLLHUP
+ */
+ fdput(f);
+
+ return 0;
+
+fail:
+ if (irqfd->irqfd_resampler)
+ mshv_irqfd_resampler_shutdown(irqfd);
+
+ if (resamplefd && !IS_ERR(resamplefd))
+ eventfd_ctx_put(resamplefd);
+
+ if (eventfd && !IS_ERR(eventfd))
+ eventfd_ctx_put(eventfd);
+
+ fdput(f);
+
+out:
+ kfree(irqfd);
+ return ret;
+}
+
+/*
+ * shutdown any irqfd's that match fd+gsi
+ */
+static int mshv_irqfd_deassign(struct mshv_partition *pt,
+ struct mshv_user_irqfd *args)
+{
+ struct mshv_irqfd *irqfd;
+ struct hlist_node *n;
+ struct eventfd_ctx *eventfd;
+
+ eventfd = eventfd_ctx_fdget(args->fd);
+ if (IS_ERR(eventfd))
+ return PTR_ERR(eventfd);
+
+ hlist_for_each_entry_safe(irqfd, n, &pt->pt_irqfds_list,
+ irqfd_hnode) {
+ if (irqfd->irqfd_eventfd_ctx == eventfd &&
+ irqfd->irqfd_irqnum == args->gsi)
+
+ mshv_irqfd_deactivate(irqfd);
+ }
+
+ eventfd_ctx_put(eventfd);
+
+ /*
+ * Block until we know all outstanding shutdown jobs have completed
+ * so that we guarantee there will not be any more interrupts on this
+ * gsi once this deassign function returns.
+ */
+ flush_workqueue(irqfd_cleanup_wq);
+
+ return 0;
+}
+
+int mshv_set_unset_irqfd(struct mshv_partition *pt,
+ struct mshv_user_irqfd *args)
+{
+ if (args->flags & ~MSHV_IRQFD_FLAGS_MASK)
+ return -EINVAL;
+
+ if (args->flags & BIT(MSHV_IRQFD_BIT_DEASSIGN))
+ return mshv_irqfd_deassign(pt, args);
+
+ return mshv_irqfd_assign(pt, args);
+}
+
+/*
+ * This function is called as the mshv VM fd is being released.
+ * Shutdown all irqfds that still remain open
+ */
+static void mshv_irqfd_release(struct mshv_partition *pt)
+{
+ struct mshv_irqfd *irqfd;
+ struct hlist_node *n;
+
+ spin_lock_irq(&pt->pt_irqfds_lock);
+
+ hlist_for_each_entry_safe(irqfd, n, &pt->pt_irqfds_list, irqfd_hnode)
+ mshv_irqfd_deactivate(irqfd);
+
+ spin_unlock_irq(&pt->pt_irqfds_lock);
+
+ /*
+ * Block until we know all outstanding shutdown jobs have completed
+ * since we do not take a mshv_partition* reference.
+ */
+ flush_workqueue(irqfd_cleanup_wq);
+}
+
+int mshv_irqfd_wq_init(void)
+{
+ irqfd_cleanup_wq = alloc_workqueue("mshv-irqfd-cleanup", 0, 0);
+ if (!irqfd_cleanup_wq)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void mshv_irqfd_wq_cleanup(void)
+{
+ destroy_workqueue(irqfd_cleanup_wq);
+}
+
+/*
+ * --------------------------------------------------------------------
+ * ioeventfd: translate a MMIO memory write to an eventfd signal.
+ *
+ * userspace can register a MMIO address with an eventfd for receiving
+ * notification when the memory has been touched.
+ * --------------------------------------------------------------------
+ */
+
+static void ioeventfd_release(struct mshv_ioeventfd *p, u64 partition_id)
+{
+ if (p->iovntfd_doorbell_id > 0)
+ mshv_unregister_doorbell(partition_id, p->iovntfd_doorbell_id);
+ eventfd_ctx_put(p->iovntfd_eventfd);
+ kfree(p);
+}
+
+/* MMIO writes trigger an event if the addr/val match */
+static void ioeventfd_mmio_write(int doorbell_id, void *data)
+{
+ struct mshv_partition *partition = (struct mshv_partition *)data;
+ struct mshv_ioeventfd *p;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(p, &partition->ioeventfds_list, iovntfd_hnode)
+ if (p->iovntfd_doorbell_id == doorbell_id) {
+ eventfd_signal(p->iovntfd_eventfd);
+ break;
+ }
+
+ rcu_read_unlock();
+}
+
+static bool ioeventfd_check_collision(struct mshv_partition *pt,
+ struct mshv_ioeventfd *p)
+ __must_hold(&pt->mutex)
+{
+ struct mshv_ioeventfd *_p;
+
+ hlist_for_each_entry(_p, &pt->ioeventfds_list, iovntfd_hnode)
+ if (_p->iovntfd_addr == p->iovntfd_addr &&
+ _p->iovntfd_length == p->iovntfd_length &&
+ (_p->iovntfd_wildcard || p->iovntfd_wildcard ||
+ _p->iovntfd_datamatch == p->iovntfd_datamatch))
+ return true;
+
+ return false;
+}
+
+static int mshv_assign_ioeventfd(struct mshv_partition *pt,
+ struct mshv_user_ioeventfd *args)
+ __must_hold(&pt->mutex)
+{
+ struct mshv_ioeventfd *p;
+ struct eventfd_ctx *eventfd;
+ u64 doorbell_flags = 0;
+ int ret;
+
+ /* This mutex is currently protecting ioeventfd.items list */
+ WARN_ON_ONCE(!mutex_is_locked(&pt->pt_mutex));
+
+ if (args->flags & BIT(MSHV_IOEVENTFD_BIT_PIO))
+ return -EOPNOTSUPP;
+
+ /* must be natural-word sized */
+ switch (args->len) {
+ case 0:
+ doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_ANY;
+ break;
+ case 1:
+ doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_BYTE;
+ break;
+ case 2:
+ doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_WORD;
+ break;
+ case 4:
+ doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_DWORD;
+ break;
+ case 8:
+ doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_QWORD;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* check for range overflow */
+ if (args->addr + args->len < args->addr)
+ return -EINVAL;
+
+ /* check for extra flags that we don't understand */
+ if (args->flags & ~MSHV_IOEVENTFD_FLAGS_MASK)
+ return -EINVAL;
+
+ eventfd = eventfd_ctx_fdget(args->fd);
+ if (IS_ERR(eventfd))
+ return PTR_ERR(eventfd);
+
+ p = kzalloc(sizeof(*p), GFP_KERNEL);
+ if (!p) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ p->iovntfd_addr = args->addr;
+ p->iovntfd_length = args->len;
+ p->iovntfd_eventfd = eventfd;
+
+ /* The datamatch feature is optional, otherwise this is a wildcard */
+ if (args->flags & BIT(MSHV_IOEVENTFD_BIT_DATAMATCH)) {
+ p->iovntfd_datamatch = args->datamatch;
+ } else {
+ p->iovntfd_wildcard = true;
+ doorbell_flags |= HV_DOORBELL_FLAG_TRIGGER_ANY_VALUE;
+ }
+
+ if (ioeventfd_check_collision(pt, p)) {
+ ret = -EEXIST;
+ goto unlock_fail;
+ }
+
+ ret = mshv_register_doorbell(pt->pt_id, ioeventfd_mmio_write,
+ (void *)pt, p->iovntfd_addr,
+ p->iovntfd_datamatch, doorbell_flags);
+ if (ret < 0)
+ goto unlock_fail;
+
+ p->iovntfd_doorbell_id = ret;
+
+ hlist_add_head_rcu(&p->iovntfd_hnode, &pt->ioeventfds_list);
+
+ return 0;
+
+unlock_fail:
+ kfree(p);
+
+fail:
+ eventfd_ctx_put(eventfd);
+
+ return ret;
+}
+
+static int mshv_deassign_ioeventfd(struct mshv_partition *pt,
+ struct mshv_user_ioeventfd *args)
+ __must_hold(&pt->mutex)
+{
+ struct mshv_ioeventfd *p;
+ struct eventfd_ctx *eventfd;
+ struct hlist_node *n;
+ int ret = -ENOENT;
+
+ /* This mutex is currently protecting ioeventfd.items list */
+ WARN_ON_ONCE(!mutex_is_locked(&pt->pt_mutex));
+
+ eventfd = eventfd_ctx_fdget(args->fd);
+ if (IS_ERR(eventfd))
+ return PTR_ERR(eventfd);
+
+ hlist_for_each_entry_safe(p, n, &pt->ioeventfds_list, iovntfd_hnode) {
+ bool wildcard = !(args->flags & BIT(MSHV_IOEVENTFD_BIT_DATAMATCH));
+
+ if (p->iovntfd_eventfd != eventfd ||
+ p->iovntfd_addr != args->addr ||
+ p->iovntfd_length != args->len ||
+ p->iovntfd_wildcard != wildcard)
+ continue;
+
+ if (!p->iovntfd_wildcard &&
+ p->iovntfd_datamatch != args->datamatch)
+ continue;
+
+ hlist_del_rcu(&p->iovntfd_hnode);
+ synchronize_rcu();
+ ioeventfd_release(p, pt->pt_id);
+ ret = 0;
+ break;
+ }
+
+ eventfd_ctx_put(eventfd);
+
+ return ret;
+}
+
+int mshv_set_unset_ioeventfd(struct mshv_partition *pt,
+ struct mshv_user_ioeventfd *args)
+ __must_hold(&pt->mutex)
+{
+ if ((args->flags & ~MSHV_IOEVENTFD_FLAGS_MASK) ||
+ mshv_field_nonzero(*args, rsvd))
+ return -EINVAL;
+
+ /* PIO not yet implemented */
+ if (args->flags & BIT(MSHV_IOEVENTFD_BIT_PIO))
+ return -EOPNOTSUPP;
+
+ if (args->flags & BIT(MSHV_IOEVENTFD_BIT_DEASSIGN))
+ return mshv_deassign_ioeventfd(pt, args);
+
+ return mshv_assign_ioeventfd(pt, args);
+}
+
+void mshv_eventfd_init(struct mshv_partition *pt)
+{
+ spin_lock_init(&pt->pt_irqfds_lock);
+ INIT_HLIST_HEAD(&pt->pt_irqfds_list);
+
+ INIT_HLIST_HEAD(&pt->irqfds_resampler_list);
+ mutex_init(&pt->irqfds_resampler_lock);
+
+ INIT_HLIST_HEAD(&pt->ioeventfds_list);
+}
+
+void mshv_eventfd_release(struct mshv_partition *pt)
+{
+ struct hlist_head items;
+ struct hlist_node *n;
+ struct mshv_ioeventfd *p;
+
+ hlist_move_list(&pt->ioeventfds_list, &items);
+ synchronize_rcu();
+
+ hlist_for_each_entry_safe(p, n, &items, iovntfd_hnode) {
+ hlist_del(&p->iovntfd_hnode);
+ ioeventfd_release(p, pt->pt_id);
+ }
+
+ mshv_irqfd_release(pt);
+}
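Illustration (not part of the diff): how a partition ioctl handler might attach and later detach an irqfd with mshv_set_unset_irqfd(). The eventfd descriptor and GSI are placeholders supplied by userspace; the field names are those of struct mshv_user_irqfd used above.

static int example_attach_irqfd(struct mshv_partition *pt, int evtfd, u32 gsi)
{
	struct mshv_user_irqfd args = {
		.fd = evtfd,
		.gsi = gsi,
		.flags = 0,	/* no deassign, no resampler */
	};

	return mshv_set_unset_irqfd(pt, &args);
}

static int example_detach_irqfd(struct mshv_partition *pt, int evtfd, u32 gsi)
{
	struct mshv_user_irqfd args = {
		.fd = evtfd,
		.gsi = gsi,
		.flags = BIT(MSHV_IRQFD_BIT_DEASSIGN),
	};

	return mshv_set_unset_irqfd(pt, &args);
}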
diff --git a/drivers/hv/mshv_eventfd.h b/drivers/hv/mshv_eventfd.h
new file mode 100644
index 000000000000..332e7670a344
--- /dev/null
+++ b/drivers/hv/mshv_eventfd.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * irqfd: Allows an fd to be used to inject an interrupt into the guest.
+ * ioeventfd: Allows an fd to be used to receive a signal from the guest.
+ * All credit goes to kvm developers.
+ */
+
+#ifndef __LINUX_MSHV_EVENTFD_H
+#define __LINUX_MSHV_EVENTFD_H
+
+#include <linux/poll.h>
+
+#include "mshv.h"
+#include "mshv_root.h"
+
+/* Struct containing the list of irqfds sharing an irq. Updates are protected
+ * by mshv_partition.irqfds_resampler_lock.
+ */
+struct mshv_irqfd_resampler {
+ struct mshv_partition *rsmplr_partn;
+ struct hlist_head rsmplr_irqfd_list;
+ struct mshv_irq_ack_notifier rsmplr_notifier;
+ struct hlist_node rsmplr_hnode;
+};
+
+struct mshv_irqfd {
+ struct mshv_partition *irqfd_partn;
+ struct eventfd_ctx *irqfd_eventfd_ctx;
+ struct mshv_guest_irq_ent irqfd_girq_ent;
+ seqcount_spinlock_t irqfd_irqe_sc;
+ u32 irqfd_irqnum;
+ struct mshv_lapic_irq irqfd_lapic_irq;
+ struct hlist_node irqfd_hnode;
+ poll_table irqfd_polltbl;
+ wait_queue_head_t *irqfd_wqh;
+ wait_queue_entry_t irqfd_wait;
+ struct work_struct irqfd_shutdown;
+ struct mshv_irqfd_resampler *irqfd_resampler;
+ struct eventfd_ctx *irqfd_resamplefd;
+ struct hlist_node irqfd_resampler_hnode;
+};
+
+void mshv_eventfd_init(struct mshv_partition *partition);
+void mshv_eventfd_release(struct mshv_partition *partition);
+
+void mshv_register_irq_ack_notifier(struct mshv_partition *partition,
+ struct mshv_irq_ack_notifier *mian);
+void mshv_unregister_irq_ack_notifier(struct mshv_partition *partition,
+ struct mshv_irq_ack_notifier *mian);
+bool mshv_notify_acked_gsi(struct mshv_partition *partition, int gsi);
+
+int mshv_set_unset_irqfd(struct mshv_partition *partition,
+ struct mshv_user_irqfd *args);
+
+int mshv_irqfd_wq_init(void);
+void mshv_irqfd_wq_cleanup(void);
+
+struct mshv_ioeventfd {
+ struct hlist_node iovntfd_hnode;
+ u64 iovntfd_addr;
+ int iovntfd_length;
+ struct eventfd_ctx *iovntfd_eventfd;
+ u64 iovntfd_datamatch;
+ int iovntfd_doorbell_id;
+ bool iovntfd_wildcard;
+};
+
+int mshv_set_unset_ioeventfd(struct mshv_partition *pt,
+ struct mshv_user_ioeventfd *args);
+
+#endif /* __LINUX_MSHV_EVENTFD_H */
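Illustration (not part of the diff): registering a 4-byte datamatch ioeventfd with mshv_set_unset_ioeventfd(); per the __must_hold annotation in mshv_eventfd.c, the caller holds the partition mutex. The guest-physical address, value and eventfd are placeholders.

static int example_attach_ioeventfd(struct mshv_partition *pt, int evtfd,
				    u64 gpa, u64 value)
{
	struct mshv_user_ioeventfd args = {
		.fd = evtfd,
		.addr = gpa,
		.len = 4,
		.datamatch = value,
		.flags = BIT(MSHV_IOEVENTFD_BIT_DATAMATCH),
	};

	return mshv_set_unset_ioeventfd(pt, &args);	/* caller holds pt_mutex */
}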
diff --git a/drivers/hv/mshv_irq.c b/drivers/hv/mshv_irq.c
new file mode 100644
index 000000000000..d0fb9ef734f4
--- /dev/null
+++ b/drivers/hv/mshv_irq.c
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2023, Microsoft Corporation.
+ *
+ * Authors: Microsoft Linux virtualization team
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/mshyperv.h>
+
+#include "mshv_eventfd.h"
+#include "mshv.h"
+#include "mshv_root.h"
+
+/* called from the ioctl code, user wants to update the guest irq table */
+int mshv_update_routing_table(struct mshv_partition *partition,
+ const struct mshv_user_irq_entry *ue,
+ unsigned int numents)
+{
+ struct mshv_girq_routing_table *new = NULL, *old;
+ u32 i, nr_rt_entries = 0;
+ int r = 0;
+
+ if (numents == 0)
+ goto swap_routes;
+
+ for (i = 0; i < numents; i++) {
+ if (ue[i].gsi >= MSHV_MAX_GUEST_IRQS)
+ return -EINVAL;
+
+ if (ue[i].address_hi)
+ return -EINVAL;
+
+ nr_rt_entries = max(nr_rt_entries, ue[i].gsi);
+ }
+ nr_rt_entries += 1;
+
+ new = kzalloc(struct_size(new, mshv_girq_info_tbl, nr_rt_entries),
+ GFP_KERNEL_ACCOUNT);
+ if (!new)
+ return -ENOMEM;
+
+ new->num_rt_entries = nr_rt_entries;
+ for (i = 0; i < numents; i++) {
+ struct mshv_guest_irq_ent *girq;
+
+ girq = &new->mshv_girq_info_tbl[ue[i].gsi];
+
+ /*
+		 * Allow only a one-to-one mapping between GSI and MSI routing.
+ */
+ if (girq->guest_irq_num != 0) {
+ r = -EINVAL;
+ goto out;
+ }
+
+ girq->guest_irq_num = ue[i].gsi;
+ girq->girq_addr_lo = ue[i].address_lo;
+ girq->girq_addr_hi = ue[i].address_hi;
+ girq->girq_irq_data = ue[i].data;
+ girq->girq_entry_valid = true;
+ }
+
+swap_routes:
+ mutex_lock(&partition->pt_irq_lock);
+ old = rcu_dereference_protected(partition->pt_girq_tbl, 1);
+ rcu_assign_pointer(partition->pt_girq_tbl, new);
+ mshv_irqfd_routing_update(partition);
+ mutex_unlock(&partition->pt_irq_lock);
+
+ synchronize_srcu_expedited(&partition->pt_irq_srcu);
+ new = old;
+
+out:
+ kfree(new);
+
+ return r;
+}
+
+/* vm is going away, kfree the irq routing table */
+void mshv_free_routing_table(struct mshv_partition *partition)
+{
+ struct mshv_girq_routing_table *rt =
+ rcu_access_pointer(partition->pt_girq_tbl);
+
+ kfree(rt);
+}
+
+struct mshv_guest_irq_ent
+mshv_ret_girq_entry(struct mshv_partition *partition, u32 irqnum)
+{
+ struct mshv_guest_irq_ent entry = { 0 };
+ struct mshv_girq_routing_table *girq_tbl;
+
+ girq_tbl = srcu_dereference_check(partition->pt_girq_tbl,
+ &partition->pt_irq_srcu,
+ lockdep_is_held(&partition->pt_irq_lock));
+ if (!girq_tbl || irqnum >= girq_tbl->num_rt_entries) {
+ /*
+		 * The irqfd was registered before any routing entry exists;
+		 * girq_entry_valid stays 0, so this entry is ignored anyway.
+ */
+ entry.guest_irq_num = irqnum;
+ return entry;
+ }
+
+ return girq_tbl->mshv_girq_info_tbl[irqnum];
+}
+
+void mshv_copy_girq_info(struct mshv_guest_irq_ent *ent,
+ struct mshv_lapic_irq *lirq)
+{
+ memset(lirq, 0, sizeof(*lirq));
+ if (!ent || !ent->girq_entry_valid)
+ return;
+
+ lirq->lapic_vector = ent->girq_irq_data & 0xFF;
+ lirq->lapic_apic_id = (ent->girq_addr_lo >> 12) & 0xFF;
+ lirq->lapic_control.interrupt_type = (ent->girq_irq_data & 0x700) >> 8;
+ lirq->lapic_control.level_triggered = (ent->girq_irq_data >> 15) & 0x1;
+ lirq->lapic_control.logical_dest_mode = (ent->girq_addr_lo >> 2) & 0x1;
+}
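Worked example (illustration, not part of the diff) of the bit extraction done by mshv_copy_girq_info() above, using a typical x86 MSI address/data pair:

/*
 * girq_addr_lo = 0xFEE01000, girq_irq_data = 0x0000C021
 *
 *   lapic_apic_id     = (0xFEE01000 >> 12) & 0xFF = 0x01
 *   logical_dest_mode = (0xFEE01000 >>  2) & 0x1  = 0	(physical)
 *   lapic_vector      =  0xC021            & 0xFF = 0x21
 *   interrupt_type    = (0xC021 & 0x700)   >> 8   = 0	(fixed)
 *   level_triggered   = (0xC021 >> 15)     & 0x1  = 1
 */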
diff --git a/drivers/hv/mshv_portid_table.c b/drivers/hv/mshv_portid_table.c
new file mode 100644
index 000000000000..c349af1f0aaa
--- /dev/null
+++ b/drivers/hv/mshv_portid_table.c
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <asm/mshyperv.h>
+
+#include "mshv.h"
+#include "mshv_root.h"
+
+/*
+ * Ports and connections are hypervisor constructs used for inter-partition
+ * communication. A port represents the source and a connection represents
+ * the destination. Partitions are responsible for managing the port and
+ * connection ids.
+ *
+ */
+
+#define PORTID_MIN 1
+#define PORTID_MAX INT_MAX
+
+static DEFINE_IDR(port_table_idr);
+
+void
+mshv_port_table_fini(void)
+{
+ struct port_table_info *port_info;
+ unsigned long i, tmp;
+
+ idr_lock(&port_table_idr);
+ if (!idr_is_empty(&port_table_idr)) {
+ idr_for_each_entry_ul(&port_table_idr, port_info, tmp, i) {
+ port_info = idr_remove(&port_table_idr, i);
+ kfree_rcu(port_info, portbl_rcu);
+ }
+ }
+ idr_unlock(&port_table_idr);
+}
+
+int
+mshv_portid_alloc(struct port_table_info *info)
+{
+ int ret = 0;
+
+ idr_lock(&port_table_idr);
+ ret = idr_alloc(&port_table_idr, info, PORTID_MIN,
+ PORTID_MAX, GFP_KERNEL);
+ idr_unlock(&port_table_idr);
+
+ return ret;
+}
+
+void
+mshv_portid_free(int port_id)
+{
+ struct port_table_info *info;
+
+ idr_lock(&port_table_idr);
+ info = idr_remove(&port_table_idr, port_id);
+ WARN_ON(!info);
+ idr_unlock(&port_table_idr);
+
+ synchronize_rcu();
+ kfree(info);
+}
+
+int
+mshv_portid_lookup(int port_id, struct port_table_info *info)
+{
+ struct port_table_info *_info;
+ int ret = -ENOENT;
+
+ rcu_read_lock();
+ _info = idr_find(&port_table_idr, port_id);
+ rcu_read_unlock();
+
+ if (_info) {
+ *info = *_info;
+ ret = 0;
+ }
+
+ return ret;
+}
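Illustration (not part of the diff): the typical lifetime of a doorbell entry in the port id table, a sketch assuming the includes of this file plus an HV_PORT_TYPE_DOORBELL value from the hvhdk headers; the callback and cookie are placeholders. On teardown, mshv_portid_free(port_id) removes the entry and frees it after an RCU grace period.

static int example_register_doorbell_port(doorbell_cb_t cb, void *data)
{
	struct port_table_info *info;
	int port_id;

	info = kzalloc(sizeof(*info), GFP_KERNEL);
	if (!info)
		return -ENOMEM;

	info->hv_port_type = HV_PORT_TYPE_DOORBELL;	/* assumed enum value */
	info->hv_port_doorbell.doorbell_cb = cb;
	info->hv_port_doorbell.data = data;

	port_id = mshv_portid_alloc(info);	/* >= PORTID_MIN, or -errno on failure */
	if (port_id < 0)
		kfree(info);

	return port_id;
}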
diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h
new file mode 100644
index 000000000000..e3931b0f1269
--- /dev/null
+++ b/drivers/hv/mshv_root.h
@@ -0,0 +1,311 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2023, Microsoft Corporation.
+ */
+
+#ifndef _MSHV_ROOT_H_
+#define _MSHV_ROOT_H_
+
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/semaphore.h>
+#include <linux/sched.h>
+#include <linux/srcu.h>
+#include <linux/wait.h>
+#include <linux/hashtable.h>
+#include <linux/dev_printk.h>
+#include <linux/build_bug.h>
+#include <uapi/linux/mshv.h>
+
+/*
+ * Hypervisor must be between these version numbers (inclusive)
+ * to guarantee compatibility
+ */
+#define MSHV_HV_MIN_VERSION (27744)
+#define MSHV_HV_MAX_VERSION (27751)
+
+static_assert(HV_HYP_PAGE_SIZE == MSHV_HV_PAGE_SIZE);
+
+#define MSHV_MAX_VPS 256
+
+#define MSHV_PARTITIONS_HASH_BITS 9
+
+#define MSHV_PIN_PAGES_BATCH_SIZE (0x10000000ULL / HV_HYP_PAGE_SIZE)
+
+struct mshv_vp {
+ u32 vp_index;
+ struct mshv_partition *vp_partition;
+ struct mutex vp_mutex;
+ struct hv_vp_register_page *vp_register_page;
+ struct hv_message *vp_intercept_msg_page;
+ void *vp_ghcb_page;
+ struct hv_stats_page *vp_stats_pages[2];
+ struct {
+ atomic64_t vp_signaled_count;
+ struct {
+ u64 intercept_suspend: 1;
+ u64 root_sched_blocked: 1; /* root scheduler only */
+ u64 root_sched_dispatched: 1; /* root scheduler only */
+ u64 reserved: 61;
+ } flags;
+ unsigned int kicked_by_hv;
+ wait_queue_head_t vp_suspend_queue;
+ } run;
+};
+
+#define vp_fmt(fmt) "p%lluvp%u: " fmt
+#define vp_devprintk(level, v, fmt, ...) \
+do { \
+ const struct mshv_vp *__vp = (v); \
+ const struct mshv_partition *__pt = __vp->vp_partition; \
+ dev_##level(__pt->pt_module_dev, vp_fmt(fmt), __pt->pt_id, \
+ __vp->vp_index, ##__VA_ARGS__); \
+} while (0)
+#define vp_emerg(v, fmt, ...) vp_devprintk(emerg, v, fmt, ##__VA_ARGS__)
+#define vp_crit(v, fmt, ...) vp_devprintk(crit, v, fmt, ##__VA_ARGS__)
+#define vp_alert(v, fmt, ...) vp_devprintk(alert, v, fmt, ##__VA_ARGS__)
+#define vp_err(v, fmt, ...) vp_devprintk(err, v, fmt, ##__VA_ARGS__)
+#define vp_warn(v, fmt, ...) vp_devprintk(warn, v, fmt, ##__VA_ARGS__)
+#define vp_notice(v, fmt, ...) vp_devprintk(notice, v, fmt, ##__VA_ARGS__)
+#define vp_info(v, fmt, ...) vp_devprintk(info, v, fmt, ##__VA_ARGS__)
+#define vp_dbg(v, fmt, ...) vp_devprintk(dbg, v, fmt, ##__VA_ARGS__)
+
+struct mshv_mem_region {
+ struct hlist_node hnode;
+ u64 nr_pages;
+ u64 start_gfn;
+ u64 start_uaddr;
+ u32 hv_map_flags;
+ struct {
+ u64 large_pages: 1; /* 2MiB */
+ u64 range_pinned: 1;
+ u64 reserved: 62;
+ } flags;
+ struct mshv_partition *partition;
+ struct page *pages[];
+};
+
+struct mshv_irq_ack_notifier {
+ struct hlist_node link;
+ unsigned int irq_ack_gsi;
+ void (*irq_acked)(struct mshv_irq_ack_notifier *mian);
+};
+
+struct mshv_partition {
+ struct device *pt_module_dev;
+
+ struct hlist_node pt_hnode;
+ u64 pt_id;
+ refcount_t pt_ref_count;
+ struct mutex pt_mutex;
+ struct hlist_head pt_mem_regions; // not ordered
+
+ u32 pt_vp_count;
+ struct mshv_vp *pt_vp_array[MSHV_MAX_VPS];
+
+ struct mutex pt_irq_lock;
+ struct srcu_struct pt_irq_srcu;
+ struct hlist_head irq_ack_notifier_list;
+
+ struct hlist_head pt_devices;
+
+ /*
+ * MSHV does not support more than one async hypercall in flight
+ * for a single partition. Thus, it is okay to define per partition
+ * async hypercall status.
+ */
+ struct completion async_hypercall;
+ u64 async_hypercall_status;
+
+ spinlock_t pt_irqfds_lock;
+ struct hlist_head pt_irqfds_list;
+ struct mutex irqfds_resampler_lock;
+ struct hlist_head irqfds_resampler_list;
+
+ struct hlist_head ioeventfds_list;
+
+ struct mshv_girq_routing_table __rcu *pt_girq_tbl;
+ u64 isolation_type;
+ bool import_completed;
+ bool pt_initialized;
+};
+
+#define pt_fmt(fmt) "p%llu: " fmt
+#define pt_devprintk(level, p, fmt, ...) \
+do { \
+ const struct mshv_partition *__pt = (p); \
+ dev_##level(__pt->pt_module_dev, pt_fmt(fmt), __pt->pt_id, \
+ ##__VA_ARGS__); \
+} while (0)
+#define pt_emerg(p, fmt, ...) pt_devprintk(emerg, p, fmt, ##__VA_ARGS__)
+#define pt_crit(p, fmt, ...) pt_devprintk(crit, p, fmt, ##__VA_ARGS__)
+#define pt_alert(p, fmt, ...) pt_devprintk(alert, p, fmt, ##__VA_ARGS__)
+#define pt_err(p, fmt, ...) pt_devprintk(err, p, fmt, ##__VA_ARGS__)
+#define pt_warn(p, fmt, ...) pt_devprintk(warn, p, fmt, ##__VA_ARGS__)
+#define pt_notice(p, fmt, ...) pt_devprintk(notice, p, fmt, ##__VA_ARGS__)
+#define pt_info(p, fmt, ...) pt_devprintk(info, p, fmt, ##__VA_ARGS__)
+#define pt_dbg(p, fmt, ...) pt_devprintk(dbg, p, fmt, ##__VA_ARGS__)
+
+struct mshv_lapic_irq {
+ u32 lapic_vector;
+ u64 lapic_apic_id;
+ union hv_interrupt_control lapic_control;
+};
+
+#define MSHV_MAX_GUEST_IRQS 4096
+
+/* representation of one guest irq entry, either MSI or legacy */
+struct mshv_guest_irq_ent {
+ u32 girq_entry_valid; /* vfio looks at this */
+ u32 guest_irq_num; /* a unique number for each irq */
+ u32 girq_addr_lo; /* guest irq msi address info */
+ u32 girq_addr_hi;
+ u32 girq_irq_data; /* idt vector in some cases */
+};
+
+struct mshv_girq_routing_table {
+ u32 num_rt_entries;
+ struct mshv_guest_irq_ent mshv_girq_info_tbl[];
+};
+
+struct hv_synic_pages {
+ struct hv_message_page *synic_message_page;
+ struct hv_synic_event_flags_page *synic_event_flags_page;
+ struct hv_synic_event_ring_page *synic_event_ring_page;
+};
+
+struct mshv_root {
+ struct hv_synic_pages __percpu *synic_pages;
+ spinlock_t pt_ht_lock;
+ DECLARE_HASHTABLE(pt_htable, MSHV_PARTITIONS_HASH_BITS);
+};
+
+/*
+ * Callback for doorbell events.
+ * NOTE: This is called in interrupt context. Callback
+ * should defer slow and sleeping logic to later.
+ */
+typedef void (*doorbell_cb_t) (int doorbell_id, void *);
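+
+/*
+ * Illustrative sketch, not part of the interface: since the callback runs in
+ * interrupt context, it should only record the event and defer real work,
+ * e.g. to a workqueue. my_doorbell_cb, my_ctx, ctx, pt_id, gpa, val and
+ * flags below are hypothetical; registration uses mshv_register_doorbell()
+ * declared later in this header.
+ *
+ *   static void my_doorbell_cb(int doorbell_id, void *data)
+ *   {
+ *           struct my_ctx *ctx = data;
+ *
+ *           schedule_work(&ctx->work);
+ *   }
+ *
+ *   port_id = mshv_register_doorbell(pt_id, my_doorbell_cb, ctx,
+ *                                    gpa, val, flags);
+ */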
+
+/*
+ * port table information
+ */
+struct port_table_info {
+ struct rcu_head portbl_rcu;
+ enum hv_port_type hv_port_type;
+ union {
+ struct {
+ u64 reserved[2];
+ } hv_port_message;
+ struct {
+ u64 reserved[2];
+ } hv_port_event;
+ struct {
+ u64 reserved[2];
+ } hv_port_monitor;
+ struct {
+ doorbell_cb_t doorbell_cb;
+ void *data;
+ } hv_port_doorbell;
+ };
+};
+
+int mshv_update_routing_table(struct mshv_partition *partition,
+ const struct mshv_user_irq_entry *entries,
+ unsigned int numents);
+void mshv_free_routing_table(struct mshv_partition *partition);
+
+struct mshv_guest_irq_ent mshv_ret_girq_entry(struct mshv_partition *partition,
+ u32 irq_num);
+
+void mshv_copy_girq_info(struct mshv_guest_irq_ent *src_irq,
+ struct mshv_lapic_irq *dest_irq);
+
+void mshv_irqfd_routing_update(struct mshv_partition *partition);
+
+void mshv_port_table_fini(void);
+int mshv_portid_alloc(struct port_table_info *info);
+int mshv_portid_lookup(int port_id, struct port_table_info *info);
+void mshv_portid_free(int port_id);
+
+int mshv_register_doorbell(u64 partition_id, doorbell_cb_t doorbell_cb,
+ void *data, u64 gpa, u64 val, u64 flags);
+void mshv_unregister_doorbell(u64 partition_id, int doorbell_portid);
+
+void mshv_isr(void);
+int mshv_synic_init(unsigned int cpu);
+int mshv_synic_cleanup(unsigned int cpu);
+
+static inline bool mshv_partition_encrypted(struct mshv_partition *partition)
+{
+ return partition->isolation_type == HV_PARTITION_ISOLATION_TYPE_SNP;
+}
+
+struct mshv_partition *mshv_partition_get(struct mshv_partition *partition);
+void mshv_partition_put(struct mshv_partition *partition);
+struct mshv_partition *mshv_partition_find(u64 partition_id) __must_hold(RCU);
+
+/* hypercalls */
+
+int hv_call_withdraw_memory(u64 count, int node, u64 partition_id);
+int hv_call_create_partition(u64 flags,
+ struct hv_partition_creation_properties creation_properties,
+ union hv_partition_isolation_properties isolation_properties,
+ u64 *partition_id);
+int hv_call_initialize_partition(u64 partition_id);
+int hv_call_finalize_partition(u64 partition_id);
+int hv_call_delete_partition(u64 partition_id);
+int hv_call_map_mmio_pages(u64 partition_id, u64 gfn, u64 mmio_spa, u64 numpgs);
+int hv_call_map_gpa_pages(u64 partition_id, u64 gpa_target, u64 page_count,
+ u32 flags, struct page **pages);
+int hv_call_unmap_gpa_pages(u64 partition_id, u64 gpa_target, u64 page_count,
+ u32 flags);
+int hv_call_delete_vp(u64 partition_id, u32 vp_index);
+int hv_call_assert_virtual_interrupt(u64 partition_id, u32 vector,
+ u64 dest_addr,
+ union hv_interrupt_control control);
+int hv_call_clear_virtual_interrupt(u64 partition_id);
+int hv_call_get_gpa_access_states(u64 partition_id, u32 count, u64 gpa_base_pfn,
+ union hv_gpa_page_access_state_flags state_flags,
+ int *written_total,
+ union hv_gpa_page_access_state *states);
+int hv_call_get_vp_state(u32 vp_index, u64 partition_id,
+ struct hv_vp_state_data state_data,
+ /* Choose between pages and ret_output */
+ u64 page_count, struct page **pages,
+ union hv_output_get_vp_state *ret_output);
+int hv_call_set_vp_state(u32 vp_index, u64 partition_id,
+ /* Choose between pages and bytes */
+ struct hv_vp_state_data state_data, u64 page_count,
+ struct page **pages, u32 num_bytes, u8 *bytes);
+int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
+ union hv_input_vtl input_vtl,
+ struct page **state_page);
+int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
+ union hv_input_vtl input_vtl);
+int hv_call_create_port(u64 port_partition_id, union hv_port_id port_id,
+ u64 connection_partition_id, struct hv_port_info *port_info,
+ u8 port_vtl, u8 min_connection_vtl, int node);
+int hv_call_delete_port(u64 port_partition_id, union hv_port_id port_id);
+int hv_call_connect_port(u64 port_partition_id, union hv_port_id port_id,
+ u64 connection_partition_id,
+ union hv_connection_id connection_id,
+ struct hv_connection_info *connection_info,
+ u8 connection_vtl, int node);
+int hv_call_disconnect_port(u64 connection_partition_id,
+ union hv_connection_id connection_id);
+int hv_call_notify_port_ring_empty(u32 sint_index);
+int hv_call_map_stat_page(enum hv_stats_object_type type,
+ const union hv_stats_object_identity *identity,
+ void **addr);
+int hv_call_unmap_stat_page(enum hv_stats_object_type type,
+ const union hv_stats_object_identity *identity);
+int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages,
+ u64 page_struct_count, u32 host_access,
+ u32 flags, u8 acquire);
+
+extern struct mshv_root mshv_root;
+extern enum hv_scheduler_type hv_scheduler_type;
+extern u8 * __percpu *hv_synic_eventring_tail;
+
+#endif /* _MSHV_ROOT_H_ */
diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c
new file mode 100644
index 000000000000..a222a16107f6
--- /dev/null
+++ b/drivers/hv/mshv_root_hv_call.c
@@ -0,0 +1,849 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2023, Microsoft Corporation.
+ *
+ * Hypercall helper functions used by the mshv_root module.
+ *
+ * Authors: Microsoft Linux virtualization team
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <asm/mshyperv.h>
+
+#include "mshv_root.h"
+
+/* Determined empirically */
+#define HV_INIT_PARTITION_DEPOSIT_PAGES 208
+#define HV_MAP_GPA_DEPOSIT_PAGES 256
+#define HV_UMAP_GPA_PAGES 512
+
+#define HV_PAGE_COUNT_2M_ALIGNED(pg_count) (!((pg_count) & (0x200 - 1)))
+
+#define HV_WITHDRAW_BATCH_SIZE (HV_HYP_PAGE_SIZE / sizeof(u64))
+#define HV_MAP_GPA_BATCH_SIZE \
+ ((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_map_gpa_pages)) \
+ / sizeof(u64))
+#define HV_GET_VP_STATE_BATCH_SIZE \
+ ((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_get_vp_state)) \
+ / sizeof(u64))
+#define HV_SET_VP_STATE_BATCH_SIZE \
+ ((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_set_vp_state)) \
+ / sizeof(u64))
+#define HV_GET_GPA_ACCESS_STATES_BATCH_SIZE \
+ ((HV_HYP_PAGE_SIZE - sizeof(union hv_gpa_page_access_state)) \
+ / sizeof(union hv_gpa_page_access_state))
+#define HV_MODIFY_SPARSE_SPA_PAGE_HOST_ACCESS_MAX_PAGE_COUNT \
+ ((HV_HYP_PAGE_SIZE - \
+ sizeof(struct hv_input_modify_sparse_spa_page_host_access)) / \
+ sizeof(u64))
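+
+/*
+ * Worked example, for illustration only: with HV_HYP_PAGE_SIZE == 4096 and
+ * assuming a 24-byte fixed header for struct hv_input_map_gpa_pages, a
+ * single HVCALL_MAP_GPA_PAGES call carries (4096 - 24) / 8 = 509 PFNs, so
+ * HV_MAP_GPA_BATCH_SIZE limits each rep batch to what fits in one hypercall
+ * input page. The other batch-size macros above follow the same arithmetic.
+ */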
+
+int hv_call_withdraw_memory(u64 count, int node, u64 partition_id)
+{
+ struct hv_input_withdraw_memory *input_page;
+ struct hv_output_withdraw_memory *output_page;
+ struct page *page;
+ u16 completed;
+ unsigned long remaining = count;
+ u64 status;
+ int i;
+ unsigned long flags;
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+ output_page = page_address(page);
+
+ while (remaining) {
+ local_irq_save(flags);
+
+ input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ memset(input_page, 0, sizeof(*input_page));
+ input_page->partition_id = partition_id;
+ status = hv_do_rep_hypercall(HVCALL_WITHDRAW_MEMORY,
+ min(remaining, HV_WITHDRAW_BATCH_SIZE),
+ 0, input_page, output_page);
+
+ local_irq_restore(flags);
+
+ completed = hv_repcomp(status);
+
+ for (i = 0; i < completed; i++)
+ __free_page(pfn_to_page(output_page->gpa_page_list[i]));
+
+ if (!hv_result_success(status)) {
+ if (hv_result(status) == HV_STATUS_NO_RESOURCES)
+ status = HV_STATUS_SUCCESS;
+ break;
+ }
+
+ remaining -= completed;
+ }
+ free_page((unsigned long)output_page);
+
+ return hv_result_to_errno(status);
+}
+
+int hv_call_create_partition(u64 flags,
+ struct hv_partition_creation_properties creation_properties,
+ union hv_partition_isolation_properties isolation_properties,
+ u64 *partition_id)
+{
+ struct hv_input_create_partition *input;
+ struct hv_output_create_partition *output;
+ u64 status;
+ int ret;
+ unsigned long irq_flags;
+
+ do {
+ local_irq_save(irq_flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+ memset(input, 0, sizeof(*input));
+ input->flags = flags;
+ input->compatibility_version = HV_COMPATIBILITY_21_H2;
+
+ memcpy(&input->partition_creation_properties, &creation_properties,
+ sizeof(creation_properties));
+
+ memcpy(&input->isolation_properties, &isolation_properties,
+ sizeof(isolation_properties));
+
+ status = hv_do_hypercall(HVCALL_CREATE_PARTITION,
+ input, output);
+
+ if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
+ if (hv_result_success(status))
+ *partition_id = output->partition_id;
+ local_irq_restore(irq_flags);
+ ret = hv_result_to_errno(status);
+ break;
+ }
+ local_irq_restore(irq_flags);
+ ret = hv_call_deposit_pages(NUMA_NO_NODE,
+ hv_current_partition_id, 1);
+ } while (!ret);
+
+ return ret;
+}
+
+int hv_call_initialize_partition(u64 partition_id)
+{
+ struct hv_input_initialize_partition input;
+ u64 status;
+ int ret;
+
+ input.partition_id = partition_id;
+
+ ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id,
+ HV_INIT_PARTITION_DEPOSIT_PAGES);
+ if (ret)
+ return ret;
+
+ do {
+ status = hv_do_fast_hypercall8(HVCALL_INITIALIZE_PARTITION,
+ *(u64 *)&input);
+
+ if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
+ ret = hv_result_to_errno(status);
+ break;
+ }
+ ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, 1);
+ } while (!ret);
+
+ return ret;
+}
+
+int hv_call_finalize_partition(u64 partition_id)
+{
+ struct hv_input_finalize_partition input;
+ u64 status;
+
+ input.partition_id = partition_id;
+ status = hv_do_fast_hypercall8(HVCALL_FINALIZE_PARTITION,
+ *(u64 *)&input);
+
+ return hv_result_to_errno(status);
+}
+
+int hv_call_delete_partition(u64 partition_id)
+{
+ struct hv_input_delete_partition input;
+ u64 status;
+
+ input.partition_id = partition_id;
+ status = hv_do_fast_hypercall8(HVCALL_DELETE_PARTITION, *(u64 *)&input);
+
+ return hv_result_to_errno(status);
+}
+
+/* Ask the hypervisor to map guest ram pages or the guest mmio space */
+static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
+ u32 flags, struct page **pages, u64 mmio_spa)
+{
+ struct hv_input_map_gpa_pages *input_page;
+ u64 status, *pfnlist;
+ unsigned long irq_flags, large_shift = 0;
+ int ret = 0, done = 0;
+ u64 page_count = page_struct_count;
+
+ if (page_count == 0 || (pages && mmio_spa))
+ return -EINVAL;
+
+ if (flags & HV_MAP_GPA_LARGE_PAGE) {
+ if (mmio_spa)
+ return -EINVAL;
+
+ if (!HV_PAGE_COUNT_2M_ALIGNED(page_count))
+ return -EINVAL;
+
+ large_shift = HV_HYP_LARGE_PAGE_SHIFT - HV_HYP_PAGE_SHIFT;
+ page_count >>= large_shift;
+ }
+
+ while (done < page_count) {
+ ulong i, completed, remain = page_count - done;
+ int rep_count = min(remain, HV_MAP_GPA_BATCH_SIZE);
+
+ local_irq_save(irq_flags);
+ input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ input_page->target_partition_id = partition_id;
+ input_page->target_gpa_base = gfn + (done << large_shift);
+ input_page->map_flags = flags;
+ pfnlist = input_page->source_gpa_page_list;
+
+ for (i = 0; i < rep_count; i++)
+ if (flags & HV_MAP_GPA_NO_ACCESS) {
+ pfnlist[i] = 0;
+ } else if (pages) {
+ u64 index = (done + i) << large_shift;
+
+ if (index >= page_struct_count) {
+ ret = -EINVAL;
+ break;
+ }
+ pfnlist[i] = page_to_pfn(pages[index]);
+ } else {
+ pfnlist[i] = mmio_spa + done + i;
+ }
+ if (ret)
+ break;
+
+ status = hv_do_rep_hypercall(HVCALL_MAP_GPA_PAGES, rep_count, 0,
+ input_page, NULL);
+ local_irq_restore(irq_flags);
+
+ completed = hv_repcomp(status);
+
+ if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) {
+ ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id,
+ HV_MAP_GPA_DEPOSIT_PAGES);
+ if (ret)
+ break;
+
+ } else if (!hv_result_success(status)) {
+ ret = hv_result_to_errno(status);
+ break;
+ }
+
+ done += completed;
+ }
+
+ if (ret && done) {
+ u32 unmap_flags = 0;
+
+ if (flags & HV_MAP_GPA_LARGE_PAGE)
+ unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE;
+ hv_call_unmap_gpa_pages(partition_id, gfn, done, unmap_flags);
+ }
+
+ return ret;
+}
+
+/* Ask the hypervisor to map guest ram pages */
+int hv_call_map_gpa_pages(u64 partition_id, u64 gpa_target, u64 page_count,
+ u32 flags, struct page **pages)
+{
+ return hv_do_map_gpa_hcall(partition_id, gpa_target, page_count,
+ flags, pages, 0);
+}
+
+/* Ask the hypervisor to map guest mmio space */
+int hv_call_map_mmio_pages(u64 partition_id, u64 gfn, u64 mmio_spa, u64 numpgs)
+{
+ int i;
+ u32 flags = HV_MAP_GPA_READABLE | HV_MAP_GPA_WRITABLE |
+ HV_MAP_GPA_NOT_CACHED;
+
+ for (i = 0; i < numpgs; i++)
+ if (page_is_ram(mmio_spa + i))
+ return -EINVAL;
+
+ return hv_do_map_gpa_hcall(partition_id, gfn, numpgs, flags, NULL,
+ mmio_spa);
+}
+
+int hv_call_unmap_gpa_pages(u64 partition_id, u64 gfn, u64 page_count_4k,
+ u32 flags)
+{
+ struct hv_input_unmap_gpa_pages *input_page;
+ u64 status, page_count = page_count_4k;
+ unsigned long irq_flags, large_shift = 0;
+ int ret = 0, done = 0;
+
+ if (page_count == 0)
+ return -EINVAL;
+
+ if (flags & HV_UNMAP_GPA_LARGE_PAGE) {
+ if (!HV_PAGE_COUNT_2M_ALIGNED(page_count))
+ return -EINVAL;
+
+ large_shift = HV_HYP_LARGE_PAGE_SHIFT - HV_HYP_PAGE_SHIFT;
+ page_count >>= large_shift;
+ }
+
+ while (done < page_count) {
+ ulong completed, remain = page_count - done;
+ int rep_count = min(remain, HV_UMAP_GPA_PAGES);
+
+ local_irq_save(irq_flags);
+ input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ input_page->target_partition_id = partition_id;
+ input_page->target_gpa_base = gfn + (done << large_shift);
+ input_page->unmap_flags = flags;
+ status = hv_do_rep_hypercall(HVCALL_UNMAP_GPA_PAGES, rep_count,
+ 0, input_page, NULL);
+ local_irq_restore(irq_flags);
+
+ completed = hv_repcomp(status);
+ if (!hv_result_success(status)) {
+ ret = hv_result_to_errno(status);
+ break;
+ }
+
+ done += completed;
+ }
+
+ return ret;
+}
+
+int hv_call_get_gpa_access_states(u64 partition_id, u32 count, u64 gpa_base_pfn,
+ union hv_gpa_page_access_state_flags state_flags,
+ int *written_total,
+ union hv_gpa_page_access_state *states)
+{
+ struct hv_input_get_gpa_pages_access_state *input_page;
+ union hv_gpa_page_access_state *output_page;
+ int completed = 0;
+ unsigned long remaining = count;
+ int rep_count, i;
+ u64 status = 0;
+ unsigned long flags;
+
+ *written_total = 0;
+ while (remaining) {
+ local_irq_save(flags);
+ input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ output_page = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+ input_page->partition_id = partition_id;
+ input_page->hv_gpa_page_number = gpa_base_pfn + *written_total;
+ input_page->flags = state_flags;
+ rep_count = min(remaining, HV_GET_GPA_ACCESS_STATES_BATCH_SIZE);
+
+ status = hv_do_rep_hypercall(HVCALL_GET_GPA_PAGES_ACCESS_STATES, rep_count,
+ 0, input_page, output_page);
+ if (!hv_result_success(status)) {
+ local_irq_restore(flags);
+ break;
+ }
+ completed = hv_repcomp(status);
+ for (i = 0; i < completed; ++i)
+ states[i].as_uint8 = output_page[i].as_uint8;
+
+ local_irq_restore(flags);
+ states += completed;
+ *written_total += completed;
+ remaining -= completed;
+ }
+
+ return hv_result_to_errno(status);
+}
+
+int hv_call_assert_virtual_interrupt(u64 partition_id, u32 vector,
+ u64 dest_addr,
+ union hv_interrupt_control control)
+{
+ struct hv_input_assert_virtual_interrupt *input;
+ unsigned long flags;
+ u64 status;
+
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ memset(input, 0, sizeof(*input));
+ input->partition_id = partition_id;
+ input->vector = vector;
+ input->dest_addr = dest_addr;
+ input->control = control;
+ status = hv_do_hypercall(HVCALL_ASSERT_VIRTUAL_INTERRUPT, input, NULL);
+ local_irq_restore(flags);
+
+ return hv_result_to_errno(status);
+}
+
+int hv_call_delete_vp(u64 partition_id, u32 vp_index)
+{
+ union hv_input_delete_vp input = {};
+ u64 status;
+
+ input.partition_id = partition_id;
+ input.vp_index = vp_index;
+
+ status = hv_do_fast_hypercall16(HVCALL_DELETE_VP,
+ input.as_uint64[0], input.as_uint64[1]);
+
+ return hv_result_to_errno(status);
+}
+EXPORT_SYMBOL_GPL(hv_call_delete_vp);
+
+int hv_call_get_vp_state(u32 vp_index, u64 partition_id,
+ struct hv_vp_state_data state_data,
+ /* Choose between pages and ret_output */
+ u64 page_count, struct page **pages,
+ union hv_output_get_vp_state *ret_output)
+{
+ struct hv_input_get_vp_state *input;
+ union hv_output_get_vp_state *output;
+ u64 status;
+ int i;
+ u64 control;
+ unsigned long flags;
+ int ret = 0;
+
+ if (page_count > HV_GET_VP_STATE_BATCH_SIZE)
+ return -EINVAL;
+
+ if (!page_count && !ret_output)
+ return -EINVAL;
+
+ do {
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+ memset(input, 0, sizeof(*input));
+ memset(output, 0, sizeof(*output));
+
+ input->partition_id = partition_id;
+ input->vp_index = vp_index;
+ input->state_data = state_data;
+ for (i = 0; i < page_count; i++)
+ input->output_data_pfns[i] = page_to_pfn(pages[i]);
+
+ control = (HVCALL_GET_VP_STATE) |
+ (page_count << HV_HYPERCALL_VARHEAD_OFFSET);
+
+ status = hv_do_hypercall(control, input, output);
+
+ if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
+ if (hv_result_success(status) && ret_output)
+ memcpy(ret_output, output, sizeof(*output));
+
+ local_irq_restore(flags);
+ ret = hv_result_to_errno(status);
+ break;
+ }
+ local_irq_restore(flags);
+
+ ret = hv_call_deposit_pages(NUMA_NO_NODE,
+ partition_id, 1);
+ } while (!ret);
+
+ return ret;
+}
+
+int hv_call_set_vp_state(u32 vp_index, u64 partition_id,
+ /* Choose between pages and bytes */
+ struct hv_vp_state_data state_data, u64 page_count,
+ struct page **pages, u32 num_bytes, u8 *bytes)
+{
+ struct hv_input_set_vp_state *input;
+ u64 status;
+ int i;
+ u64 control;
+ unsigned long flags;
+ int ret = 0;
+ u16 varhead_sz;
+
+ if (page_count > HV_SET_VP_STATE_BATCH_SIZE)
+ return -EINVAL;
+ if (sizeof(*input) + num_bytes > HV_HYP_PAGE_SIZE)
+ return -EINVAL;
+
+ if (num_bytes)
+ /* round up to a multiple of 8, then divide by 8 */
+ varhead_sz = (num_bytes + 7) >> 3;
+ else if (page_count)
+ varhead_sz = page_count;
+ else
+ return -EINVAL;
+
+ do {
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ memset(input, 0, sizeof(*input));
+
+ input->partition_id = partition_id;
+ input->vp_index = vp_index;
+ input->state_data = state_data;
+ if (num_bytes) {
+ memcpy((u8 *)input->data, bytes, num_bytes);
+ } else {
+ for (i = 0; i < page_count; i++)
+ input->data[i].pfns = page_to_pfn(pages[i]);
+ }
+
+ control = (HVCALL_SET_VP_STATE) |
+ (varhead_sz << HV_HYPERCALL_VARHEAD_OFFSET);
+
+ status = hv_do_hypercall(control, input, NULL);
+
+ if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
+ local_irq_restore(flags);
+ ret = hv_result_to_errno(status);
+ break;
+ }
+ local_irq_restore(flags);
+
+ ret = hv_call_deposit_pages(NUMA_NO_NODE,
+ partition_id, 1);
+ } while (!ret);
+
+ return ret;
+}
+
+int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
+ union hv_input_vtl input_vtl,
+ struct page **state_page)
+{
+ struct hv_input_map_vp_state_page *input;
+ struct hv_output_map_vp_state_page *output;
+ u64 status;
+ int ret;
+ unsigned long flags;
+
+ do {
+ local_irq_save(flags);
+
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+ input->partition_id = partition_id;
+ input->vp_index = vp_index;
+ input->type = type;
+ input->input_vtl = input_vtl;
+
+ status = hv_do_hypercall(HVCALL_MAP_VP_STATE_PAGE, input, output);
+
+ if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
+ if (hv_result_success(status))
+ *state_page = pfn_to_page(output->map_location);
+ local_irq_restore(flags);
+ ret = hv_result_to_errno(status);
+ break;
+ }
+
+ local_irq_restore(flags);
+
+ ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, 1);
+ } while (!ret);
+
+ return ret;
+}
+
+int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
+ union hv_input_vtl input_vtl)
+{
+ unsigned long flags;
+ u64 status;
+ struct hv_input_unmap_vp_state_page *input;
+
+ local_irq_save(flags);
+
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ memset(input, 0, sizeof(*input));
+
+ input->partition_id = partition_id;
+ input->vp_index = vp_index;
+ input->type = type;
+ input->input_vtl = input_vtl;
+
+ status = hv_do_hypercall(HVCALL_UNMAP_VP_STATE_PAGE, input, NULL);
+
+ local_irq_restore(flags);
+
+ return hv_result_to_errno(status);
+}
+
+int
+hv_call_clear_virtual_interrupt(u64 partition_id)
+{
+ int status;
+
+ status = hv_do_fast_hypercall8(HVCALL_CLEAR_VIRTUAL_INTERRUPT,
+ partition_id);
+
+ return hv_result_to_errno(status);
+}
+
+int
+hv_call_create_port(u64 port_partition_id, union hv_port_id port_id,
+ u64 connection_partition_id,
+ struct hv_port_info *port_info,
+ u8 port_vtl, u8 min_connection_vtl, int node)
+{
+ struct hv_input_create_port *input;
+ unsigned long flags;
+ int ret = 0;
+ int status;
+
+ do {
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ memset(input, 0, sizeof(*input));
+
+ input->port_partition_id = port_partition_id;
+ input->port_id = port_id;
+ input->connection_partition_id = connection_partition_id;
+ input->port_info = *port_info;
+ input->port_vtl = port_vtl;
+ input->min_connection_vtl = min_connection_vtl;
+ input->proximity_domain_info = hv_numa_node_to_pxm_info(node);
+ status = hv_do_hypercall(HVCALL_CREATE_PORT, input, NULL);
+ local_irq_restore(flags);
+ if (hv_result_success(status))
+ break;
+
+ if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
+ ret = hv_result_to_errno(status);
+ break;
+ }
+ ret = hv_call_deposit_pages(NUMA_NO_NODE, port_partition_id, 1);
+
+ } while (!ret);
+
+ return ret;
+}
+
+int
+hv_call_delete_port(u64 port_partition_id, union hv_port_id port_id)
+{
+ union hv_input_delete_port input = { 0 };
+ int status;
+
+ input.port_partition_id = port_partition_id;
+ input.port_id = port_id;
+ status = hv_do_fast_hypercall16(HVCALL_DELETE_PORT,
+ input.as_uint64[0],
+ input.as_uint64[1]);
+
+ return hv_result_to_errno(status);
+}
+
+int
+hv_call_connect_port(u64 port_partition_id, union hv_port_id port_id,
+ u64 connection_partition_id,
+ union hv_connection_id connection_id,
+ struct hv_connection_info *connection_info,
+ u8 connection_vtl, int node)
+{
+ struct hv_input_connect_port *input;
+ unsigned long flags;
+ int ret = 0, status;
+
+ do {
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ memset(input, 0, sizeof(*input));
+ input->port_partition_id = port_partition_id;
+ input->port_id = port_id;
+ input->connection_partition_id = connection_partition_id;
+ input->connection_id = connection_id;
+ input->connection_info = *connection_info;
+ input->connection_vtl = connection_vtl;
+ input->proximity_domain_info = hv_numa_node_to_pxm_info(node);
+ status = hv_do_hypercall(HVCALL_CONNECT_PORT, input, NULL);
+
+ local_irq_restore(flags);
+ if (hv_result_success(status))
+ break;
+
+ if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
+ ret = hv_result_to_errno(status);
+ break;
+ }
+ ret = hv_call_deposit_pages(NUMA_NO_NODE,
+ connection_partition_id, 1);
+ } while (!ret);
+
+ return ret;
+}
+
+int
+hv_call_disconnect_port(u64 connection_partition_id,
+ union hv_connection_id connection_id)
+{
+ union hv_input_disconnect_port input = { 0 };
+ int status;
+
+ input.connection_partition_id = connection_partition_id;
+ input.connection_id = connection_id;
+ input.is_doorbell = 1;
+ status = hv_do_fast_hypercall16(HVCALL_DISCONNECT_PORT,
+ input.as_uint64[0],
+ input.as_uint64[1]);
+
+ return hv_result_to_errno(status);
+}
+
+int
+hv_call_notify_port_ring_empty(u32 sint_index)
+{
+ union hv_input_notify_port_ring_empty input = { 0 };
+ int status;
+
+ input.sint_index = sint_index;
+ status = hv_do_fast_hypercall8(HVCALL_NOTIFY_PORT_RING_EMPTY,
+ input.as_uint64);
+
+ return hv_result_to_errno(status);
+}
+
+int hv_call_map_stat_page(enum hv_stats_object_type type,
+ const union hv_stats_object_identity *identity,
+ void **addr)
+{
+ unsigned long flags;
+ struct hv_input_map_stats_page *input;
+ struct hv_output_map_stats_page *output;
+ u64 status, pfn;
+ int ret = 0;
+
+ do {
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+ memset(input, 0, sizeof(*input));
+ input->type = type;
+ input->identity = *identity;
+
+ status = hv_do_hypercall(HVCALL_MAP_STATS_PAGE, input, output);
+ pfn = output->map_location;
+
+ local_irq_restore(flags);
+ if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
+ ret = hv_result_to_errno(status);
+ if (hv_result_success(status))
+ break;
+ return ret;
+ }
+
+ ret = hv_call_deposit_pages(NUMA_NO_NODE,
+ hv_current_partition_id, 1);
+ if (ret)
+ return ret;
+ } while (!ret);
+
+ *addr = page_address(pfn_to_page(pfn));
+
+ return ret;
+}
+
+int hv_call_unmap_stat_page(enum hv_stats_object_type type,
+ const union hv_stats_object_identity *identity)
+{
+ unsigned long flags;
+ struct hv_input_unmap_stats_page *input;
+ u64 status;
+
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ memset(input, 0, sizeof(*input));
+ input->type = type;
+ input->identity = *identity;
+
+ status = hv_do_hypercall(HVCALL_UNMAP_STATS_PAGE, input, NULL);
+ local_irq_restore(flags);
+
+ return hv_result_to_errno(status);
+}
+
+int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages,
+ u64 page_struct_count, u32 host_access,
+ u32 flags, u8 acquire)
+{
+ struct hv_input_modify_sparse_spa_page_host_access *input_page;
+ u64 status;
+ int done = 0;
+ unsigned long irq_flags, large_shift = 0;
+ u64 page_count = page_struct_count;
+ u16 code = acquire ? HVCALL_ACQUIRE_SPARSE_SPA_PAGE_HOST_ACCESS :
+ HVCALL_RELEASE_SPARSE_SPA_PAGE_HOST_ACCESS;
+
+ if (page_count == 0)
+ return -EINVAL;
+
+ if (flags & HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE) {
+ if (!HV_PAGE_COUNT_2M_ALIGNED(page_count))
+ return -EINVAL;
+ large_shift = HV_HYP_LARGE_PAGE_SHIFT - HV_HYP_PAGE_SHIFT;
+ page_count >>= large_shift;
+ }
+
+ while (done < page_count) {
+ ulong i, completed, remain = page_count - done;
+ int rep_count = min(remain,
+ HV_MODIFY_SPARSE_SPA_PAGE_HOST_ACCESS_MAX_PAGE_COUNT);
+
+ local_irq_save(irq_flags);
+ input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ memset(input_page, 0, sizeof(*input_page));
+ /*
+ * Only set the partition id when making the pages exclusive.
+ */
+ if (flags & HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE)
+ input_page->partition_id = partition_id;
+ input_page->flags = flags;
+ input_page->host_access = host_access;
+
+ for (i = 0; i < rep_count; i++) {
+ u64 index = (done + i) << large_shift;
+
+ if (index >= page_struct_count)
+ return -EINVAL;
+
+ input_page->spa_page_list[i] =
+ page_to_pfn(pages[index]);
+ }
+
+ status = hv_do_rep_hypercall(code, rep_count, 0, input_page,
+ NULL);
+ local_irq_restore(irq_flags);
+
+ completed = hv_repcomp(status);
+
+ if (!hv_result_success(status))
+ return hv_result_to_errno(status);
+
+ done += completed;
+ }
+
+ return 0;
+}
diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
new file mode 100644
index 000000000000..72df774e410a
--- /dev/null
+++ b/drivers/hv/mshv_root_main.c
@@ -0,0 +1,2307 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024, Microsoft Corporation.
+ *
+ * The main part of the mshv_root module, providing APIs to create
+ * and manage guest partitions.
+ *
+ * Authors: Microsoft Linux virtualization team
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/anon_inodes.h>
+#include <linux/mm.h>
+#include <linux/io.h>
+#include <linux/cpuhotplug.h>
+#include <linux/random.h>
+#include <asm/mshyperv.h>
+#include <linux/hyperv.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/kexec.h>
+#include <linux/page-flags.h>
+#include <linux/crash_dump.h>
+#include <linux/panic_notifier.h>
+#include <linux/vmalloc.h>
+
+#include "mshv_eventfd.h"
+#include "mshv.h"
+#include "mshv_root.h"
+
+MODULE_AUTHOR("Microsoft");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv");
+
+/* TODO move this to mshyperv.h when needed outside driver */
+static inline bool hv_parent_partition(void)
+{
+ return hv_root_partition();
+}
+
+/* TODO move this to another file when debugfs code is added */
+enum hv_stats_vp_counters { /* HV_THREAD_COUNTER */
+#if defined(CONFIG_X86)
+ VpRootDispatchThreadBlocked = 201,
+#elif defined(CONFIG_ARM64)
+ VpRootDispatchThreadBlocked = 94,
+#endif
+ VpStatsMaxCounter
+};
+
+struct hv_stats_page {
+ union {
+ u64 vp_cntrs[VpStatsMaxCounter]; /* VP counters */
+ u8 data[HV_HYP_PAGE_SIZE];
+ };
+} __packed;
+
+struct mshv_root mshv_root;
+
+enum hv_scheduler_type hv_scheduler_type;
+
+/*
+ * These per-cpu buffers can go away once the fast extended hypercall ABI
+ * is implemented.
+ */
+static void * __percpu *root_scheduler_input;
+static void * __percpu *root_scheduler_output;
+
+static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
+static int mshv_dev_open(struct inode *inode, struct file *filp);
+static int mshv_dev_release(struct inode *inode, struct file *filp);
+static int mshv_vp_release(struct inode *inode, struct file *filp);
+static long mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
+static int mshv_partition_release(struct inode *inode, struct file *filp);
+static long mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
+static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma);
+static vm_fault_t mshv_vp_fault(struct vm_fault *vmf);
+static int mshv_init_async_handler(struct mshv_partition *partition);
+static void mshv_async_hvcall_handler(void *data, u64 *status);
+
+static const union hv_input_vtl input_vtl_zero;
+static const union hv_input_vtl input_vtl_normal = {
+ .target_vtl = HV_NORMAL_VTL,
+ .use_target_vtl = 1,
+};
+
+static const struct vm_operations_struct mshv_vp_vm_ops = {
+ .fault = mshv_vp_fault,
+};
+
+static const struct file_operations mshv_vp_fops = {
+ .owner = THIS_MODULE,
+ .release = mshv_vp_release,
+ .unlocked_ioctl = mshv_vp_ioctl,
+ .llseek = noop_llseek,
+ .mmap = mshv_vp_mmap,
+};
+
+static const struct file_operations mshv_partition_fops = {
+ .owner = THIS_MODULE,
+ .release = mshv_partition_release,
+ .unlocked_ioctl = mshv_partition_ioctl,
+ .llseek = noop_llseek,
+};
+
+static const struct file_operations mshv_dev_fops = {
+ .owner = THIS_MODULE,
+ .open = mshv_dev_open,
+ .release = mshv_dev_release,
+ .unlocked_ioctl = mshv_dev_ioctl,
+ .llseek = noop_llseek,
+};
+
+static struct miscdevice mshv_dev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "mshv",
+ .fops = &mshv_dev_fops,
+ .mode = 0600,
+};
+
+/*
+ * Only allow hypercalls that have a u64 partition id as the first member of
+ * the input structure.
+ * These are sorted by value.
+ */
+static u16 mshv_passthru_hvcalls[] = {
+ HVCALL_GET_PARTITION_PROPERTY,
+ HVCALL_SET_PARTITION_PROPERTY,
+ HVCALL_INSTALL_INTERCEPT,
+ HVCALL_GET_VP_REGISTERS,
+ HVCALL_SET_VP_REGISTERS,
+ HVCALL_TRANSLATE_VIRTUAL_ADDRESS,
+ HVCALL_CLEAR_VIRTUAL_INTERRUPT,
+ HVCALL_REGISTER_INTERCEPT_RESULT,
+ HVCALL_ASSERT_VIRTUAL_INTERRUPT,
+ HVCALL_GET_GPA_PAGES_ACCESS_STATES,
+ HVCALL_SIGNAL_EVENT_DIRECT,
+ HVCALL_POST_MESSAGE_DIRECT,
+ HVCALL_GET_VP_CPUID_VALUES,
+};
+
+static bool mshv_hvcall_is_async(u16 code)
+{
+ switch (code) {
+ case HVCALL_SET_PARTITION_PROPERTY:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+
+static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition,
+ bool partition_locked,
+ void __user *user_args)
+{
+ u64 status;
+ int ret = 0, i;
+ bool is_async;
+ struct mshv_root_hvcall args;
+ struct page *page;
+ unsigned int pages_order;
+ void *input_pg = NULL;
+ void *output_pg = NULL;
+
+ if (copy_from_user(&args, user_args, sizeof(args)))
+ return -EFAULT;
+
+ if (args.status || !args.in_ptr || args.in_sz < sizeof(u64) ||
+ mshv_field_nonzero(args, rsvd) || args.in_sz > HV_HYP_PAGE_SIZE)
+ return -EINVAL;
+
+ if (args.out_ptr && (!args.out_sz || args.out_sz > HV_HYP_PAGE_SIZE))
+ return -EINVAL;
+
+ for (i = 0; i < ARRAY_SIZE(mshv_passthru_hvcalls); ++i)
+ if (args.code == mshv_passthru_hvcalls[i])
+ break;
+
+ if (i >= ARRAY_SIZE(mshv_passthru_hvcalls))
+ return -EINVAL;
+
+ is_async = mshv_hvcall_is_async(args.code);
+ if (is_async) {
+ /* async hypercalls can only be called from partition fd */
+ if (!partition_locked)
+ return -EINVAL;
+ ret = mshv_init_async_handler(partition);
+ if (ret)
+ return ret;
+ }
+
+ pages_order = args.out_ptr ? 1 : 0;
+ page = alloc_pages(GFP_KERNEL, pages_order);
+ if (!page)
+ return -ENOMEM;
+ input_pg = page_address(page);
+
+ if (args.out_ptr)
+ output_pg = (char *)input_pg + PAGE_SIZE;
+ else
+ output_pg = NULL;
+
+ if (copy_from_user(input_pg, (void __user *)args.in_ptr,
+ args.in_sz)) {
+ ret = -EFAULT;
+ goto free_pages_out;
+ }
+
+ /*
+ * NOTE: This only works because all the allowed hypercalls' input
+ * structs begin with a u64 partition_id field.
+ */
+ *(u64 *)input_pg = partition->pt_id;
+
+ if (args.reps)
+ status = hv_do_rep_hypercall(args.code, args.reps, 0,
+ input_pg, output_pg);
+ else
+ status = hv_do_hypercall(args.code, input_pg, output_pg);
+
+ if (hv_result(status) == HV_STATUS_CALL_PENDING) {
+ if (is_async) {
+ mshv_async_hvcall_handler(partition, &status);
+ } else { /* Paranoia check. This shouldn't happen! */
+ ret = -EBADFD;
+ goto free_pages_out;
+ }
+ }
+
+ if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) {
+ ret = hv_call_deposit_pages(NUMA_NO_NODE, partition->pt_id, 1);
+ if (!ret)
+ ret = -EAGAIN;
+ } else if (!hv_result_success(status)) {
+ ret = hv_result_to_errno(status);
+ }
+
+ /*
+ * Always return the status and output data regardless of result.
+ * The VMM may need it to determine how to proceed. E.g. the status may
+ * contain the number of reps completed if a rep hypercall partially
+ * succeeded.
+ */
+ args.status = hv_result(status);
+ args.reps = args.reps ? hv_repcomp(status) : 0;
+ if (copy_to_user(user_args, &args, sizeof(args)))
+ ret = -EFAULT;
+
+ if (output_pg &&
+ copy_to_user((void __user *)args.out_ptr, output_pg, args.out_sz))
+ ret = -EFAULT;
+
+free_pages_out:
+ free_pages((unsigned long)input_pg, pages_order);
+
+ return ret;
+}
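+
+/*
+ * Illustrative userspace sketch of the passthrough interface above. The
+ * field names match the struct mshv_root_hvcall usage in
+ * mshv_ioctl_passthru_hvcall(); part_fd, prop_in and prop_out are
+ * hypothetical:
+ *
+ *   struct mshv_root_hvcall args = {
+ *           .code    = HVCALL_GET_PARTITION_PROPERTY,
+ *           .in_sz   = sizeof(prop_in),
+ *           .in_ptr  = (__u64)&prop_in,
+ *           .out_sz  = sizeof(prop_out),
+ *           .out_ptr = (__u64)&prop_out,
+ *   };
+ *   int err = ioctl(part_fd, MSHV_ROOT_HVCALL, &args);
+ *
+ * The handler overwrites the first u64 of the input with the partition id,
+ * so the caller cannot target a different partition.
+ */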
+
+static inline bool is_ghcb_mapping_available(void)
+{
+#if IS_ENABLED(CONFIG_X86_64)
+ return ms_hyperv.ext_features & HV_VP_GHCB_ROOT_MAPPING_AVAILABLE;
+#else
+ return false;
+#endif
+}
+
+static int mshv_get_vp_registers(u32 vp_index, u64 partition_id, u16 count,
+ struct hv_register_assoc *registers)
+{
+ return hv_call_get_vp_registers(vp_index, partition_id,
+ count, input_vtl_zero, registers);
+}
+
+static int mshv_set_vp_registers(u32 vp_index, u64 partition_id, u16 count,
+ struct hv_register_assoc *registers)
+{
+ return hv_call_set_vp_registers(vp_index, partition_id,
+ count, input_vtl_zero, registers);
+}
+
+/*
+ * Explicit guest vCPU suspend is asynchronous by nature (as it is requested by
+ * dom0 vCPU for guest vCPU) and thus it can race with "intercept" suspend,
+ * done by the hypervisor.
+ * "Intercept" suspend leads to asynchronous message delivery to dom0 which
+ * should be awaited to keep the VP loop consistent (i.e. no message pending
+ * upon VP resume).
+ * Intercept suspend can't happen while the VP is already explicitly
+ * suspended, so only two race scenarios are possible:
+ *   1. intercept suspend bit set -> explicit suspend bit set -> message sent
+ *   2. intercept suspend bit set -> message sent -> explicit suspend bit set
+ * Checking the intercept suspend bit after the explicit suspend request has
+ * succeeded covers both cases and reliably tells whether there is a message
+ * to receive and deliver to the VMM.
+ */
+static int
+mshv_suspend_vp(const struct mshv_vp *vp, bool *message_in_flight)
+{
+ struct hv_register_assoc explicit_suspend = {
+ .name = HV_REGISTER_EXPLICIT_SUSPEND
+ };
+ struct hv_register_assoc intercept_suspend = {
+ .name = HV_REGISTER_INTERCEPT_SUSPEND
+ };
+ union hv_explicit_suspend_register *es =
+ &explicit_suspend.value.explicit_suspend;
+ union hv_intercept_suspend_register *is =
+ &intercept_suspend.value.intercept_suspend;
+ int ret;
+
+ es->suspended = 1;
+
+ ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
+ 1, &explicit_suspend);
+ if (ret) {
+ vp_err(vp, "Failed to explicitly suspend vCPU\n");
+ return ret;
+ }
+
+ ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
+ 1, &intercept_suspend);
+ if (ret) {
+ vp_err(vp, "Failed to get intercept suspend state\n");
+ return ret;
+ }
+
+ *message_in_flight = is->suspended;
+
+ return 0;
+}
+
+/*
+ * This function is used when VPs are scheduled by the hypervisor's
+ * scheduler.
+ *
+ * The caller must pass the registers with HV_REGISTER_INTERCEPT_SUSPEND and
+ * HV_REGISTER_EXPLICIT_SUSPEND cleared, exactly in this order (the hypervisor
+ * clears them sequentially). With the opposite order, a newly arrived
+ * HV_REGISTER_INTERCEPT_SUSPEND could be cleared incorrectly right after the
+ * VP is released from HV_REGISTER_EXPLICIT_SUSPEND.
+ */
+static long mshv_run_vp_with_hyp_scheduler(struct mshv_vp *vp)
+{
+ long ret;
+ struct hv_register_assoc suspend_regs[2] = {
+ { .name = HV_REGISTER_INTERCEPT_SUSPEND },
+ { .name = HV_REGISTER_EXPLICIT_SUSPEND }
+ };
+ size_t count = ARRAY_SIZE(suspend_regs);
+
+ /* Resume VP execution */
+ ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
+ count, suspend_regs);
+ if (ret) {
+ vp_err(vp, "Failed to resume vp execution. %lx\n", ret);
+ return ret;
+ }
+
+ ret = wait_event_interruptible(vp->run.vp_suspend_queue,
+ vp->run.kicked_by_hv == 1);
+ if (ret) {
+ bool message_in_flight;
+
+ /*
+ * Otherwise the waiting was interrupted by a signal: suspend
+ * the vCPU explicitly and copy message in flight (if any).
+ */
+ ret = mshv_suspend_vp(vp, &message_in_flight);
+ if (ret)
+ return ret;
+
+ /* Return if no message in flight */
+ if (!message_in_flight)
+ return -EINTR;
+
+ /* Wait for the message in flight. */
+ wait_event(vp->run.vp_suspend_queue, vp->run.kicked_by_hv == 1);
+ }
+
+ /*
+ * Reset the flag to make the wait_event call above work
+ * next time.
+ */
+ vp->run.kicked_by_hv = 0;
+
+ return 0;
+}
+
+static int
+mshv_vp_dispatch(struct mshv_vp *vp, u32 flags,
+ struct hv_output_dispatch_vp *res)
+{
+ struct hv_input_dispatch_vp *input;
+ struct hv_output_dispatch_vp *output;
+ u64 status;
+
+ preempt_disable();
+ input = *this_cpu_ptr(root_scheduler_input);
+ output = *this_cpu_ptr(root_scheduler_output);
+
+ memset(input, 0, sizeof(*input));
+ memset(output, 0, sizeof(*output));
+
+ input->partition_id = vp->vp_partition->pt_id;
+ input->vp_index = vp->vp_index;
+ input->time_slice = 0; /* Run forever until something happens */
+ input->spec_ctrl = 0; /* TODO: set sensible flags */
+ input->flags = flags;
+
+ vp->run.flags.root_sched_dispatched = 1;
+ status = hv_do_hypercall(HVCALL_DISPATCH_VP, input, output);
+ vp->run.flags.root_sched_dispatched = 0;
+
+ *res = *output;
+ preempt_enable();
+
+ if (!hv_result_success(status))
+ vp_err(vp, "%s: status %s\n", __func__,
+ hv_result_to_string(status));
+
+ return hv_result_to_errno(status);
+}
+
+static int
+mshv_vp_clear_explicit_suspend(struct mshv_vp *vp)
+{
+ struct hv_register_assoc explicit_suspend = {
+ .name = HV_REGISTER_EXPLICIT_SUSPEND,
+ .value.explicit_suspend.suspended = 0,
+ };
+ int ret;
+
+ ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
+ 1, &explicit_suspend);
+
+ if (ret)
+ vp_err(vp, "Failed to unsuspend\n");
+
+ return ret;
+}
+
+#if IS_ENABLED(CONFIG_X86_64)
+static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp)
+{
+ if (!vp->vp_register_page)
+ return 0;
+ return vp->vp_register_page->interrupt_vectors.as_uint64;
+}
+#else
+static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp)
+{
+ return 0;
+}
+#endif
+
+static bool mshv_vp_dispatch_thread_blocked(struct mshv_vp *vp)
+{
+ struct hv_stats_page **stats = vp->vp_stats_pages;
+ u64 *self_vp_cntrs = stats[HV_STATS_AREA_SELF]->vp_cntrs;
+ u64 *parent_vp_cntrs = stats[HV_STATS_AREA_PARENT]->vp_cntrs;
+
+ if (self_vp_cntrs[VpRootDispatchThreadBlocked])
+ return self_vp_cntrs[VpRootDispatchThreadBlocked];
+ return parent_vp_cntrs[VpRootDispatchThreadBlocked];
+}
+
+static int
+mshv_vp_wait_for_hv_kick(struct mshv_vp *vp)
+{
+ int ret;
+
+ ret = wait_event_interruptible(vp->run.vp_suspend_queue,
+ (vp->run.kicked_by_hv == 1 &&
+ !mshv_vp_dispatch_thread_blocked(vp)) ||
+ mshv_vp_interrupt_pending(vp));
+ if (ret)
+ return -EINTR;
+
+ vp->run.flags.root_sched_blocked = 0;
+ vp->run.kicked_by_hv = 0;
+
+ return 0;
+}
+
+static int mshv_pre_guest_mode_work(struct mshv_vp *vp)
+{
+ const ulong work_flags = _TIF_NOTIFY_SIGNAL | _TIF_SIGPENDING |
+ _TIF_NEED_RESCHED | _TIF_NOTIFY_RESUME;
+ ulong th_flags;
+
+ th_flags = read_thread_flags();
+ while (th_flags & work_flags) {
+ int ret;
+
+ /* nb: following will call schedule */
+ ret = mshv_do_pre_guest_mode_work(th_flags);
+
+ if (ret)
+ return ret;
+
+ th_flags = read_thread_flags();
+ }
+
+ return 0;
+}
+
+/* Must be called with interrupts enabled */
+static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp)
+{
+ long ret;
+
+ if (vp->run.flags.root_sched_blocked) {
+ /*
+ * Dispatch state of this VP is blocked. Need to wait
+ * for the hypervisor to clear the blocked state before
+ * dispatching it.
+ */
+ ret = mshv_vp_wait_for_hv_kick(vp);
+ if (ret)
+ return ret;
+ }
+
+ do {
+ u32 flags = 0;
+ struct hv_output_dispatch_vp output;
+
+ ret = mshv_pre_guest_mode_work(vp);
+ if (ret)
+ break;
+
+ if (vp->run.flags.intercept_suspend)
+ flags |= HV_DISPATCH_VP_FLAG_CLEAR_INTERCEPT_SUSPEND;
+
+ if (mshv_vp_interrupt_pending(vp))
+ flags |= HV_DISPATCH_VP_FLAG_SCAN_INTERRUPT_INJECTION;
+
+ ret = mshv_vp_dispatch(vp, flags, &output);
+ if (ret)
+ break;
+
+ vp->run.flags.intercept_suspend = 0;
+
+ if (output.dispatch_state == HV_VP_DISPATCH_STATE_BLOCKED) {
+ if (output.dispatch_event ==
+ HV_VP_DISPATCH_EVENT_SUSPEND) {
+ /*
+ * TODO: remove the warning once VP canceling
+ * is supported
+ */
+ WARN_ONCE(atomic64_read(&vp->run.vp_signaled_count),
+ "%s: vp#%d: unexpected explicit suspend\n",
+ __func__, vp->vp_index);
+ /*
+ * Need to clear explicit suspend before
+ * dispatching.
+ * Explicit suspend is either:
+ * - set right after the first VP dispatch or
+ * - set explicitly via hypercall
+ * Since the latter case is not yet supported,
+ * simply clear it here.
+ */
+ ret = mshv_vp_clear_explicit_suspend(vp);
+ if (ret)
+ break;
+
+ ret = mshv_vp_wait_for_hv_kick(vp);
+ if (ret)
+ break;
+ } else {
+ vp->run.flags.root_sched_blocked = 1;
+ ret = mshv_vp_wait_for_hv_kick(vp);
+ if (ret)
+ break;
+ }
+ } else {
+ /* HV_VP_DISPATCH_STATE_READY */
+ if (output.dispatch_event ==
+ HV_VP_DISPATCH_EVENT_INTERCEPT)
+ vp->run.flags.intercept_suspend = 1;
+ }
+ } while (!vp->run.flags.intercept_suspend);
+
+ return ret;
+}
+
+static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ,
+ "sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ");
+
+static long mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void __user *ret_msg)
+{
+ long rc;
+
+ if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
+ rc = mshv_run_vp_with_root_scheduler(vp);
+ else
+ rc = mshv_run_vp_with_hyp_scheduler(vp);
+
+ if (rc)
+ return rc;
+
+ if (copy_to_user(ret_msg, vp->vp_intercept_msg_page,
+ sizeof(struct hv_message)))
+ rc = -EFAULT;
+
+ return rc;
+}
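+
+/*
+ * Illustrative VMM-side sketch; vp_fd and handle_intercept() are
+ * hypothetical. Each MSHV_RUN_VP call resumes the vCPU and, on return,
+ * copies out one struct hv_message describing why the vCPU stopped:
+ *
+ *   struct hv_message msg;
+ *
+ *   for (;;) {
+ *           if (ioctl(vp_fd, MSHV_RUN_VP, &msg) < 0)
+ *                   break;
+ *           handle_intercept(&msg);
+ *   }
+ */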
+
+static int
+mshv_vp_ioctl_get_set_state_pfn(struct mshv_vp *vp,
+ struct hv_vp_state_data state_data,
+ unsigned long user_pfn, size_t page_count,
+ bool is_set)
+{
+ int completed, ret = 0;
+ unsigned long check;
+ struct page **pages;
+
+ if (page_count > INT_MAX)
+ return -EINVAL;
+ /*
+ * Check the arithmetic for wraparound/overflow.
+ * The last page address in the buffer is:
+ * (user_pfn + (page_count - 1)) * PAGE_SIZE
+ */
+ if (check_add_overflow(user_pfn, (page_count - 1), &check))
+ return -EOVERFLOW;
+ if (check_mul_overflow(check, PAGE_SIZE, &check))
+ return -EOVERFLOW;
+
+ /* Pin user pages so hypervisor can copy directly to them */
+ pages = kcalloc(page_count, sizeof(struct page *), GFP_KERNEL);
+ if (!pages)
+ return -ENOMEM;
+
+ for (completed = 0; completed < page_count; completed += ret) {
+ unsigned long user_addr = (user_pfn + completed) * PAGE_SIZE;
+ int remaining = page_count - completed;
+
+ ret = pin_user_pages_fast(user_addr, remaining, FOLL_WRITE,
+ &pages[completed]);
+ if (ret < 0) {
+ vp_err(vp, "%s: Failed to pin user pages error %i\n",
+ __func__, ret);
+ goto unpin_pages;
+ }
+ }
+
+ if (is_set)
+ ret = hv_call_set_vp_state(vp->vp_index,
+ vp->vp_partition->pt_id,
+ state_data, page_count, pages,
+ 0, NULL);
+ else
+ ret = hv_call_get_vp_state(vp->vp_index,
+ vp->vp_partition->pt_id,
+ state_data, page_count, pages,
+ NULL);
+
+unpin_pages:
+ unpin_user_pages(pages, completed);
+ kfree(pages);
+ return ret;
+}
+
+static long
+mshv_vp_ioctl_get_set_state(struct mshv_vp *vp,
+ struct mshv_get_set_vp_state __user *user_args,
+ bool is_set)
+{
+ struct mshv_get_set_vp_state args;
+ long ret = 0;
+ union hv_output_get_vp_state vp_state;
+ u32 data_sz;
+ struct hv_vp_state_data state_data = {};
+
+ if (copy_from_user(&args, user_args, sizeof(args)))
+ return -EFAULT;
+
+ if (args.type >= MSHV_VP_STATE_COUNT || mshv_field_nonzero(args, rsvd) ||
+ !args.buf_sz || !PAGE_ALIGNED(args.buf_sz) ||
+ !PAGE_ALIGNED(args.buf_ptr))
+ return -EINVAL;
+
+ if (!access_ok((void __user *)args.buf_ptr, args.buf_sz))
+ return -EFAULT;
+
+ switch (args.type) {
+ case MSHV_VP_STATE_LAPIC:
+ state_data.type = HV_GET_SET_VP_STATE_LAPIC_STATE;
+ data_sz = HV_HYP_PAGE_SIZE;
+ break;
+ case MSHV_VP_STATE_XSAVE:
+ {
+ u64 data_sz_64;
+
+ ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
+ HV_PARTITION_PROPERTY_XSAVE_STATES,
+ &state_data.xsave.states.as_uint64);
+ if (ret)
+ return ret;
+
+ ret = hv_call_get_partition_property(vp->vp_partition->pt_id,
+ HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE,
+ &data_sz_64);
+ if (ret)
+ return ret;
+
+ data_sz = (u32)data_sz_64;
+ state_data.xsave.flags = 0;
+ /* Always request legacy states */
+ state_data.xsave.states.legacy_x87 = 1;
+ state_data.xsave.states.legacy_sse = 1;
+ state_data.type = HV_GET_SET_VP_STATE_XSAVE;
+ break;
+ }
+ case MSHV_VP_STATE_SIMP:
+ state_data.type = HV_GET_SET_VP_STATE_SIM_PAGE;
+ data_sz = HV_HYP_PAGE_SIZE;
+ break;
+ case MSHV_VP_STATE_SIEFP:
+ state_data.type = HV_GET_SET_VP_STATE_SIEF_PAGE;
+ data_sz = HV_HYP_PAGE_SIZE;
+ break;
+ case MSHV_VP_STATE_SYNTHETIC_TIMERS:
+ state_data.type = HV_GET_SET_VP_STATE_SYNTHETIC_TIMERS;
+ data_sz = sizeof(vp_state.synthetic_timers_state);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (copy_to_user(&user_args->buf_sz, &data_sz, sizeof(user_args->buf_sz)))
+ return -EFAULT;
+
+ if (data_sz > args.buf_sz)
+ return -EINVAL;
+
+ /* If the data is transmitted via pfns, delegate to helper */
+ if (state_data.type & HV_GET_SET_VP_STATE_TYPE_PFN) {
+ unsigned long user_pfn = PFN_DOWN(args.buf_ptr);
+ size_t page_count = PFN_DOWN(args.buf_sz);
+
+ return mshv_vp_ioctl_get_set_state_pfn(vp, state_data, user_pfn,
+ page_count, is_set);
+ }
+
+ /* Paranoia check - this shouldn't happen! */
+ if (data_sz > sizeof(vp_state)) {
+ vp_err(vp, "Invalid vp state data size!\n");
+ return -EINVAL;
+ }
+
+ if (is_set) {
+ if (copy_from_user(&vp_state, (__user void *)args.buf_ptr, data_sz))
+ return -EFAULT;
+
+ return hv_call_set_vp_state(vp->vp_index,
+ vp->vp_partition->pt_id,
+ state_data, 0, NULL,
+ sizeof(vp_state), (u8 *)&vp_state);
+ }
+
+ ret = hv_call_get_vp_state(vp->vp_index, vp->vp_partition->pt_id,
+ state_data, 0, NULL, &vp_state);
+ if (ret)
+ return ret;
+
+ if (copy_to_user((void __user *)args.buf_ptr, &vp_state, data_sz))
+ return -EFAULT;
+
+ return 0;
+}
+
+static long
+mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
+{
+ struct mshv_vp *vp = filp->private_data;
+ long r = -ENOTTY;
+
+ if (mutex_lock_killable(&vp->vp_mutex))
+ return -EINTR;
+
+ switch (ioctl) {
+ case MSHV_RUN_VP:
+ r = mshv_vp_ioctl_run_vp(vp, (void __user *)arg);
+ break;
+ case MSHV_GET_VP_STATE:
+ r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, false);
+ break;
+ case MSHV_SET_VP_STATE:
+ r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, true);
+ break;
+ case MSHV_ROOT_HVCALL:
+ r = mshv_ioctl_passthru_hvcall(vp->vp_partition, false,
+ (void __user *)arg);
+ break;
+ default:
+ vp_warn(vp, "Invalid ioctl: %#x\n", ioctl);
+ break;
+ }
+ mutex_unlock(&vp->vp_mutex);
+
+ return r;
+}
+
+static vm_fault_t mshv_vp_fault(struct vm_fault *vmf)
+{
+ struct mshv_vp *vp = vmf->vma->vm_file->private_data;
+
+ switch (vmf->vma->vm_pgoff) {
+ case MSHV_VP_MMAP_OFFSET_REGISTERS:
+ vmf->page = virt_to_page(vp->vp_register_page);
+ break;
+ case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE:
+ vmf->page = virt_to_page(vp->vp_intercept_msg_page);
+ break;
+ case MSHV_VP_MMAP_OFFSET_GHCB:
+ vmf->page = virt_to_page(vp->vp_ghcb_page);
+ break;
+ default:
+ return VM_FAULT_SIGBUS;
+ }
+
+ get_page(vmf->page);
+
+ return 0;
+}
+
+static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct mshv_vp *vp = file->private_data;
+
+ switch (vma->vm_pgoff) {
+ case MSHV_VP_MMAP_OFFSET_REGISTERS:
+ if (!vp->vp_register_page)
+ return -ENODEV;
+ break;
+ case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE:
+ if (!vp->vp_intercept_msg_page)
+ return -ENODEV;
+ break;
+ case MSHV_VP_MMAP_OFFSET_GHCB:
+ if (!vp->vp_ghcb_page)
+ return -ENODEV;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ vma->vm_ops = &mshv_vp_vm_ops;
+ return 0;
+}
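+
+/*
+ * Illustrative userspace sketch (vp_fd is hypothetical): the VP state pages
+ * are exposed at fixed page offsets, so with 4 KiB pages the intercept
+ * message page can be mapped with:
+ *
+ *   msg = mmap(NULL, 4096, PROT_READ, MAP_SHARED, vp_fd,
+ *              MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE << 12);
+ */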
+
+static int
+mshv_vp_release(struct inode *inode, struct file *filp)
+{
+ struct mshv_vp *vp = filp->private_data;
+
+ /* Rest of VP cleanup happens in destroy_partition() */
+ mshv_partition_put(vp->vp_partition);
+ return 0;
+}
+
+static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index)
+{
+ union hv_stats_object_identity identity = {
+ .vp.partition_id = partition_id,
+ .vp.vp_index = vp_index,
+ };
+
+ identity.vp.stats_area_type = HV_STATS_AREA_SELF;
+ hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity);
+
+ identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
+ hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity);
+}
+
+static int mshv_vp_stats_map(u64 partition_id, u32 vp_index,
+ void *stats_pages[])
+{
+ union hv_stats_object_identity identity = {
+ .vp.partition_id = partition_id,
+ .vp.vp_index = vp_index,
+ };
+ int err;
+
+ identity.vp.stats_area_type = HV_STATS_AREA_SELF;
+ err = hv_call_map_stat_page(HV_STATS_OBJECT_VP, &identity,
+ &stats_pages[HV_STATS_AREA_SELF]);
+ if (err)
+ return err;
+
+ identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
+ err = hv_call_map_stat_page(HV_STATS_OBJECT_VP, &identity,
+ &stats_pages[HV_STATS_AREA_PARENT]);
+ if (err)
+ goto unmap_self;
+
+ return 0;
+
+unmap_self:
+ identity.vp.stats_area_type = HV_STATS_AREA_SELF;
+ hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity);
+ return err;
+}
+
+static long
+mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
+ void __user *arg)
+{
+ struct mshv_create_vp args;
+ struct mshv_vp *vp;
+ struct page *intercept_message_page, *register_page, *ghcb_page;
+ void *stats_pages[2];
+ long ret;
+
+ if (copy_from_user(&args, arg, sizeof(args)))
+ return -EFAULT;
+
+ if (args.vp_index >= MSHV_MAX_VPS)
+ return -EINVAL;
+
+ if (partition->pt_vp_array[args.vp_index])
+ return -EEXIST;
+
+ ret = hv_call_create_vp(NUMA_NO_NODE, partition->pt_id, args.vp_index,
+ 0 /* Only valid for root partition VPs */);
+ if (ret)
+ return ret;
+
+ ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index,
+ HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
+ input_vtl_zero,
+ &intercept_message_page);
+ if (ret)
+ goto destroy_vp;
+
+ if (!mshv_partition_encrypted(partition)) {
+ ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index,
+ HV_VP_STATE_PAGE_REGISTERS,
+ input_vtl_zero,
+ &register_page);
+ if (ret)
+ goto unmap_intercept_message_page;
+ }
+
+ if (mshv_partition_encrypted(partition) &&
+ is_ghcb_mapping_available()) {
+ ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index,
+ HV_VP_STATE_PAGE_GHCB,
+ input_vtl_normal,
+ &ghcb_page);
+ if (ret)
+ goto unmap_register_page;
+ }
+
+ if (hv_parent_partition()) {
+ ret = mshv_vp_stats_map(partition->pt_id, args.vp_index,
+ stats_pages);
+ if (ret)
+ goto unmap_ghcb_page;
+ }
+
+ vp = kzalloc(sizeof(*vp), GFP_KERNEL);
+ if (!vp) {
+ ret = -ENOMEM;
+ goto unmap_stats_pages;
+ }
+
+ vp->vp_partition = mshv_partition_get(partition);
+ if (!vp->vp_partition) {
+ ret = -EBADF;
+ goto free_vp;
+ }
+
+ mutex_init(&vp->vp_mutex);
+ init_waitqueue_head(&vp->run.vp_suspend_queue);
+ atomic64_set(&vp->run.vp_signaled_count, 0);
+
+ vp->vp_index = args.vp_index;
+ vp->vp_intercept_msg_page = page_to_virt(intercept_message_page);
+ if (!mshv_partition_encrypted(partition))
+ vp->vp_register_page = page_to_virt(register_page);
+
+ if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
+ vp->vp_ghcb_page = page_to_virt(ghcb_page);
+
+ if (hv_parent_partition())
+ memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages));
+
+ /*
+ * Keep anon_inode_getfd last: it installs the new file in the fd table
+ * and thus makes the vp state accessible from user space.
+ */
+ ret = anon_inode_getfd("mshv_vp", &mshv_vp_fops, vp,
+ O_RDWR | O_CLOEXEC);
+ if (ret < 0)
+ goto put_partition;
+
+ /* Already serialized by the partition mutex held across partition ioctls */
+ partition->pt_vp_count++;
+ partition->pt_vp_array[args.vp_index] = vp;
+
+ return ret;
+
+put_partition:
+ mshv_partition_put(partition);
+free_vp:
+ kfree(vp);
+unmap_stats_pages:
+ if (hv_parent_partition())
+ mshv_vp_stats_unmap(partition->pt_id, args.vp_index);
+unmap_ghcb_page:
+ if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) {
+ hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index,
+ HV_VP_STATE_PAGE_GHCB,
+ input_vtl_normal);
+ }
+unmap_register_page:
+ if (!mshv_partition_encrypted(partition)) {
+ hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index,
+ HV_VP_STATE_PAGE_REGISTERS,
+ input_vtl_zero);
+ }
+unmap_intercept_message_page:
+ hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index,
+ HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
+ input_vtl_zero);
+destroy_vp:
+ hv_call_delete_vp(partition->pt_id, args.vp_index);
+ return ret;
+}
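+
+/*
+ * Illustrative userspace sketch; MSHV_CREATE_VP is assumed to be the uapi
+ * ioctl routed to this handler and part_fd is hypothetical:
+ *
+ *   struct mshv_create_vp args = { .vp_index = 0 };
+ *   int vp_fd = ioctl(part_fd, MSHV_CREATE_VP, &args);
+ *
+ * On success the returned fd refers to the new vCPU and accepts the
+ * MSHV_RUN_VP, MSHV_GET_VP_STATE and MSHV_SET_VP_STATE ioctls above.
+ */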
+
+static int mshv_init_async_handler(struct mshv_partition *partition)
+{
+ if (completion_done(&partition->async_hypercall)) {
+ pt_err(partition,
+ "Cannot issue async hypercall while another one in progress!\n");
+ return -EPERM;
+ }
+
+ reinit_completion(&partition->async_hypercall);
+ return 0;
+}
+
+static void mshv_async_hvcall_handler(void *data, u64 *status)
+{
+ struct mshv_partition *partition = data;
+
+ wait_for_completion(&partition->async_hypercall);
+ pt_dbg(partition, "Async hypercall completed!\n");
+
+ *status = partition->async_hypercall_status;
+}
+
+static int
+mshv_partition_region_share(struct mshv_mem_region *region)
+{
+ u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_SHARED;
+
+ if (region->flags.large_pages)
+ flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE;
+
+ return hv_call_modify_spa_host_access(region->partition->pt_id,
+ region->pages, region->nr_pages,
+ HV_MAP_GPA_READABLE | HV_MAP_GPA_WRITABLE,
+ flags, true);
+}
+
+static int
+mshv_partition_region_unshare(struct mshv_mem_region *region)
+{
+ u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE;
+
+ if (region->flags.large_pages)
+ flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE;
+
+ return hv_call_modify_spa_host_access(region->partition->pt_id,
+ region->pages, region->nr_pages,
+ 0,
+ flags, false);
+}
+
+static int
+mshv_region_remap_pages(struct mshv_mem_region *region, u32 map_flags,
+ u64 page_offset, u64 page_count)
+{
+ if (page_offset + page_count > region->nr_pages)
+ return -EINVAL;
+
+ if (region->flags.large_pages)
+ map_flags |= HV_MAP_GPA_LARGE_PAGE;
+
+ /* ask the hypervisor to map guest ram */
+ return hv_call_map_gpa_pages(region->partition->pt_id,
+ region->start_gfn + page_offset,
+ page_count, map_flags,
+ region->pages + page_offset);
+}
+
+static int
+mshv_region_map(struct mshv_mem_region *region)
+{
+ u32 map_flags = region->hv_map_flags;
+
+ return mshv_region_remap_pages(region, map_flags,
+ 0, region->nr_pages);
+}
+
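+/* Unpin (if pinned) and clear the cached page pointers for part of a region */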
+static void
+mshv_region_evict_pages(struct mshv_mem_region *region,
+ u64 page_offset, u64 page_count)
+{
+ if (region->flags.range_pinned)
+ unpin_user_pages(region->pages + page_offset, page_count);
+
+ memset(region->pages + page_offset, 0,
+ page_count * sizeof(struct page *));
+}
+
+static void
+mshv_region_evict(struct mshv_mem_region *region)
+{
+ mshv_region_evict_pages(region, 0, region->nr_pages);
+}
+
+static int
+mshv_region_populate_pages(struct mshv_mem_region *region,
+ u64 page_offset, u64 page_count)
+{
+ u64 done_count, nr_pages;
+ struct page **pages;
+ __u64 userspace_addr;
+ int ret;
+
+ if (page_offset + page_count > region->nr_pages)
+ return -EINVAL;
+
+ for (done_count = 0; done_count < page_count; done_count += ret) {
+ pages = region->pages + page_offset + done_count;
+ userspace_addr = region->start_uaddr +
+ (page_offset + done_count) *
+ HV_HYP_PAGE_SIZE;
+ nr_pages = min(page_count - done_count,
+ MSHV_PIN_PAGES_BATCH_SIZE);
+
+ /*
+		 * Pinning with 4k page granularity works for large pages too;
+		 * all page structs within the large page are returned.
+ *
+ * Pin requests are batched because pin_user_pages_fast
+ * with the FOLL_LONGTERM flag does a large temporary
+ * allocation of contiguous memory.
+ */
+ if (region->flags.range_pinned)
+ ret = pin_user_pages_fast(userspace_addr,
+ nr_pages,
+ FOLL_WRITE | FOLL_LONGTERM,
+ pages);
+ else
+ ret = -EOPNOTSUPP;
+
+ if (ret < 0)
+ goto release_pages;
+ }
+
+ if (PageHuge(region->pages[page_offset]))
+ region->flags.large_pages = true;
+
+ return 0;
+
+release_pages:
+ mshv_region_evict_pages(region, page_offset, done_count);
+ return ret;
+}
+
+static int
+mshv_region_populate(struct mshv_mem_region *region)
+{
+ return mshv_region_populate_pages(region, 0, region->nr_pages);
+}
+
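+/* Find the region containing the given guest frame number, if any */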
+static struct mshv_mem_region *
+mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
+{
+ struct mshv_mem_region *region;
+
+ hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) {
+ if (gfn >= region->start_gfn &&
+ gfn < region->start_gfn + region->nr_pages)
+ return region;
+ }
+
+ return NULL;
+}
+
+static struct mshv_mem_region *
+mshv_partition_region_by_uaddr(struct mshv_partition *partition, u64 uaddr)
+{
+ struct mshv_mem_region *region;
+
+ hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) {
+ if (uaddr >= region->start_uaddr &&
+ uaddr < region->start_uaddr +
+ (region->nr_pages << HV_HYP_PAGE_SHIFT))
+ return region;
+ }
+
+ return NULL;
+}
+
+/*
+ * NB: caller checks and makes sure mem->size is page aligned
+ * Returns: 0 with regionpp updated on success, or -errno
+ */
+static int mshv_partition_create_region(struct mshv_partition *partition,
+ struct mshv_user_mem_region *mem,
+ struct mshv_mem_region **regionpp,
+ bool is_mmio)
+{
+ struct mshv_mem_region *region;
+ u64 nr_pages = HVPFN_DOWN(mem->size);
+
+ /* Reject overlapping regions */
+ if (mshv_partition_region_by_gfn(partition, mem->guest_pfn) ||
+ mshv_partition_region_by_gfn(partition, mem->guest_pfn + nr_pages - 1) ||
+ mshv_partition_region_by_uaddr(partition, mem->userspace_addr) ||
+ mshv_partition_region_by_uaddr(partition, mem->userspace_addr + mem->size - 1))
+ return -EEXIST;
+
+ region = vzalloc(sizeof(*region) + sizeof(struct page *) * nr_pages);
+ if (!region)
+ return -ENOMEM;
+
+ region->nr_pages = nr_pages;
+ region->start_gfn = mem->guest_pfn;
+ region->start_uaddr = mem->userspace_addr;
+ region->hv_map_flags = HV_MAP_GPA_READABLE | HV_MAP_GPA_ADJUSTABLE;
+ if (mem->flags & BIT(MSHV_SET_MEM_BIT_WRITABLE))
+ region->hv_map_flags |= HV_MAP_GPA_WRITABLE;
+ if (mem->flags & BIT(MSHV_SET_MEM_BIT_EXECUTABLE))
+ region->hv_map_flags |= HV_MAP_GPA_EXECUTABLE;
+
+ /* Note: large_pages flag populated when we pin the pages */
+ if (!is_mmio)
+ region->flags.range_pinned = true;
+
+ region->partition = partition;
+
+ *regionpp = region;
+
+ return 0;
+}
+
+/*
+ * Map guest RAM. For SNP partitions, host access to the memory must be
+ * released first.
+ * Side effects: on failure, pages are unpinned when feasible.
+ */
+static int
+mshv_partition_mem_region_map(struct mshv_mem_region *region)
+{
+ struct mshv_partition *partition = region->partition;
+ int ret;
+
+ ret = mshv_region_populate(region);
+ if (ret) {
+ pt_err(partition, "Failed to populate memory region: %d\n",
+ ret);
+ goto err_out;
+ }
+
+ /*
+	 * For an SNP partition, host access to every memory region mapped into
+	 * the partition must be released first. This is done with an additional
+	 * hypercall that updates the SLAT to revoke host access to the guest
+	 * memory region.
+ */
+ if (mshv_partition_encrypted(partition)) {
+ ret = mshv_partition_region_unshare(region);
+ if (ret) {
+ pt_err(partition,
+ "Failed to unshare memory region (guest_pfn: %llu): %d\n",
+ region->start_gfn, ret);
+ goto evict_region;
+ }
+ }
+
+ ret = mshv_region_map(region);
+ if (ret && mshv_partition_encrypted(partition)) {
+ int shrc;
+
+ shrc = mshv_partition_region_share(region);
+ if (!shrc)
+ goto evict_region;
+
+ pt_err(partition,
+ "Failed to share memory region (guest_pfn: %llu): %d\n",
+ region->start_gfn, shrc);
+ /*
+		 * Don't unpin if marking the pages shared failed: they are no
+		 * longer mapped in the host (i.e. the root partition).
+ */
+ goto err_out;
+ }
+
+ return 0;
+
+evict_region:
+ mshv_region_evict(region);
+err_out:
+ return ret;
+}
+
+/*
+ * This maps two things: guest RAM and, for PCI passthrough, MMIO space.
+ *
+ * mmio:
+ *   - vfio overloads vm_pgoff to store the mmio start pfn/spa.
+ *   - Two things need to happen to map an mmio range:
+ *	 1. mapped at the uaddr so the VMM can access it.
+ *	 2. mapped in the hwpt (gfn <-> mmio phys addr) so the guest can access it.
+ *
+ * This function takes care of the second; the first is managed by vfio and
+ * hence handled via vfio_pci_mmap_fault().
+ */
+static long
+mshv_map_user_memory(struct mshv_partition *partition,
+ struct mshv_user_mem_region mem)
+{
+ struct mshv_mem_region *region;
+ struct vm_area_struct *vma;
+ bool is_mmio;
+ ulong mmio_pfn;
+ long ret;
+
+ if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP) ||
+ !access_ok((const void *)mem.userspace_addr, mem.size))
+ return -EINVAL;
+
+ mmap_read_lock(current->mm);
+ vma = vma_lookup(current->mm, mem.userspace_addr);
+ is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
+ mmio_pfn = is_mmio ? vma->vm_pgoff : 0;
+ mmap_read_unlock(current->mm);
+
+ if (!vma)
+ return -EINVAL;
+
+ ret = mshv_partition_create_region(partition, &mem, &region,
+ is_mmio);
+ if (ret)
+ return ret;
+
+ if (is_mmio)
+ ret = hv_call_map_mmio_pages(partition->pt_id, mem.guest_pfn,
+ mmio_pfn, HVPFN_DOWN(mem.size));
+ else
+ ret = mshv_partition_mem_region_map(region);
+
+ if (ret)
+ goto errout;
+
+ /* Install the new region */
+ hlist_add_head(&region->hnode, &partition->pt_mem_regions);
+
+ return 0;
+
+errout:
+ vfree(region);
+ return ret;
+}
+
+/* Called for unmapping both the guest ram and the mmio space */
+static long
+mshv_unmap_user_memory(struct mshv_partition *partition,
+ struct mshv_user_mem_region mem)
+{
+ struct mshv_mem_region *region;
+ u32 unmap_flags = 0;
+
+ if (!(mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP)))
+ return -EINVAL;
+
+ region = mshv_partition_region_by_gfn(partition, mem.guest_pfn);
+ if (!region)
+ return -EINVAL;
+
+ /* Paranoia check */
+ if (region->start_uaddr != mem.userspace_addr ||
+ region->start_gfn != mem.guest_pfn ||
+ region->nr_pages != HVPFN_DOWN(mem.size))
+ return -EINVAL;
+
+ hlist_del(&region->hnode);
+
+ if (region->flags.large_pages)
+ unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE;
+
+	/* Ignore unmap failures and continue, as the process may be exiting */
+ hv_call_unmap_gpa_pages(partition->pt_id, region->start_gfn,
+ region->nr_pages, unmap_flags);
+
+ mshv_region_evict(region);
+
+ vfree(region);
+ return 0;
+}
+
+static long
+mshv_partition_ioctl_set_memory(struct mshv_partition *partition,
+ struct mshv_user_mem_region __user *user_mem)
+{
+ struct mshv_user_mem_region mem;
+
+ if (copy_from_user(&mem, user_mem, sizeof(mem)))
+ return -EFAULT;
+
+ if (!mem.size ||
+ !PAGE_ALIGNED(mem.size) ||
+ !PAGE_ALIGNED(mem.userspace_addr) ||
+ (mem.flags & ~MSHV_SET_MEM_FLAGS_MASK) ||
+ mshv_field_nonzero(mem, rsvd))
+ return -EINVAL;
+
+ if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP))
+ return mshv_unmap_user_memory(partition, mem);
+
+ return mshv_map_user_memory(partition, mem);
+}
+
+static long
+mshv_partition_ioctl_ioeventfd(struct mshv_partition *partition,
+ void __user *user_args)
+{
+ struct mshv_user_ioeventfd args;
+
+ if (copy_from_user(&args, user_args, sizeof(args)))
+ return -EFAULT;
+
+ return mshv_set_unset_ioeventfd(partition, &args);
+}
+
+static long
+mshv_partition_ioctl_irqfd(struct mshv_partition *partition,
+ void __user *user_args)
+{
+ struct mshv_user_irqfd args;
+
+ if (copy_from_user(&args, user_args, sizeof(args)))
+ return -EFAULT;
+
+ return mshv_set_unset_irqfd(partition, &args);
+}
+
+static long
+mshv_partition_ioctl_get_gpap_access_bitmap(struct mshv_partition *partition,
+ void __user *user_args)
+{
+ struct mshv_gpap_access_bitmap args;
+ union hv_gpa_page_access_state *states;
+ long ret, i;
+ union hv_gpa_page_access_state_flags hv_flags = {};
+ u8 hv_type_mask;
+ ulong bitmap_buf_sz, states_buf_sz;
+ int written = 0;
+
+ if (copy_from_user(&args, user_args, sizeof(args)))
+ return -EFAULT;
+
+ if (args.access_type >= MSHV_GPAP_ACCESS_TYPE_COUNT ||
+ args.access_op >= MSHV_GPAP_ACCESS_OP_COUNT ||
+ mshv_field_nonzero(args, rsvd) || !args.page_count ||
+ !args.bitmap_ptr)
+ return -EINVAL;
+
+ if (check_mul_overflow(args.page_count, sizeof(*states), &states_buf_sz))
+ return -E2BIG;
+
+ /* Num bytes needed to store bitmap; one bit per page rounded up */
+ bitmap_buf_sz = DIV_ROUND_UP(args.page_count, 8);
+
+ /* Sanity check */
+ if (bitmap_buf_sz > states_buf_sz)
+ return -EBADFD;
+
+ switch (args.access_type) {
+ case MSHV_GPAP_ACCESS_TYPE_ACCESSED:
+ hv_type_mask = 1;
+ if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) {
+ hv_flags.clear_accessed = 1;
+ /* not accessed implies not dirty */
+ hv_flags.clear_dirty = 1;
+ } else { /* MSHV_GPAP_ACCESS_OP_SET */
+ hv_flags.set_accessed = 1;
+ }
+ break;
+ case MSHV_GPAP_ACCESS_TYPE_DIRTY:
+ hv_type_mask = 2;
+ if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) {
+ hv_flags.clear_dirty = 1;
+ } else { /* MSHV_GPAP_ACCESS_OP_SET */
+ hv_flags.set_dirty = 1;
+ /* dirty implies accessed */
+ hv_flags.set_accessed = 1;
+ }
+ break;
+ }
+
+ states = vzalloc(states_buf_sz);
+ if (!states)
+ return -ENOMEM;
+
+ ret = hv_call_get_gpa_access_states(partition->pt_id, args.page_count,
+ args.gpap_base, hv_flags, &written,
+ states);
+ if (ret)
+ goto free_return;
+
+ /*
+ * Overwrite states buffer with bitmap - the bits in hv_type_mask
+ * correspond to bitfields in hv_gpa_page_access_state
+ */
+ for (i = 0; i < written; ++i)
+ __assign_bit(i, (ulong *)states,
+ states[i].as_uint8 & hv_type_mask);
+
+ /* zero the unused bits in the last byte(s) of the returned bitmap */
+ for (i = written; i < bitmap_buf_sz * 8; ++i)
+ __clear_bit(i, (ulong *)states);
+
+ if (copy_to_user((void __user *)args.bitmap_ptr, states, bitmap_buf_sz))
+ ret = -EFAULT;
+
+free_return:
+ vfree(states);
+ return ret;
+}
+
+static long
+mshv_partition_ioctl_set_msi_routing(struct mshv_partition *partition,
+ void __user *user_args)
+{
+ struct mshv_user_irq_entry *entries = NULL;
+ struct mshv_user_irq_table args;
+ long ret;
+
+ if (copy_from_user(&args, user_args, sizeof(args)))
+ return -EFAULT;
+
+ if (args.nr > MSHV_MAX_GUEST_IRQS ||
+ mshv_field_nonzero(args, rsvd))
+ return -EINVAL;
+
+ if (args.nr) {
+ struct mshv_user_irq_table __user *urouting = user_args;
+
+ entries = vmemdup_user(urouting->entries,
+ array_size(sizeof(*entries),
+ args.nr));
+ if (IS_ERR(entries))
+ return PTR_ERR(entries);
+ }
+ ret = mshv_update_routing_table(partition, entries, args.nr);
+ kvfree(entries);
+
+ return ret;
+}
+
+static long
+mshv_partition_ioctl_initialize(struct mshv_partition *partition)
+{
+ long ret;
+
+ if (partition->pt_initialized)
+ return 0;
+
+ ret = hv_call_initialize_partition(partition->pt_id);
+ if (ret)
+ goto withdraw_mem;
+
+ partition->pt_initialized = true;
+
+ return 0;
+
+withdraw_mem:
+ hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);
+
+ return ret;
+}
+
+static long
+mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
+{
+ struct mshv_partition *partition = filp->private_data;
+ long ret;
+ void __user *uarg = (void __user *)arg;
+
+ if (mutex_lock_killable(&partition->pt_mutex))
+ return -EINTR;
+
+ switch (ioctl) {
+ case MSHV_INITIALIZE_PARTITION:
+ ret = mshv_partition_ioctl_initialize(partition);
+ break;
+ case MSHV_SET_GUEST_MEMORY:
+ ret = mshv_partition_ioctl_set_memory(partition, uarg);
+ break;
+ case MSHV_CREATE_VP:
+ ret = mshv_partition_ioctl_create_vp(partition, uarg);
+ break;
+ case MSHV_IRQFD:
+ ret = mshv_partition_ioctl_irqfd(partition, uarg);
+ break;
+ case MSHV_IOEVENTFD:
+ ret = mshv_partition_ioctl_ioeventfd(partition, uarg);
+ break;
+ case MSHV_SET_MSI_ROUTING:
+ ret = mshv_partition_ioctl_set_msi_routing(partition, uarg);
+ break;
+ case MSHV_GET_GPAP_ACCESS_BITMAP:
+ ret = mshv_partition_ioctl_get_gpap_access_bitmap(partition,
+ uarg);
+ break;
+ case MSHV_ROOT_HVCALL:
+ ret = mshv_ioctl_passthru_hvcall(partition, true, uarg);
+ break;
+ default:
+ ret = -ENOTTY;
+ }
+
+ mutex_unlock(&partition->pt_mutex);
+ return ret;
+}
+
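+/* Suspend dispatching of the VP by setting its dispatch-suspend register */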
+static int
+disable_vp_dispatch(struct mshv_vp *vp)
+{
+ int ret;
+ struct hv_register_assoc dispatch_suspend = {
+ .name = HV_REGISTER_DISPATCH_SUSPEND,
+ .value.dispatch_suspend.suspended = 1,
+ };
+
+ ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
+ 1, &dispatch_suspend);
+ if (ret)
+ vp_err(vp, "failed to suspend\n");
+
+ return ret;
+}
+
+static int
+get_vp_signaled_count(struct mshv_vp *vp, u64 *count)
+{
+ int ret;
+ struct hv_register_assoc root_signal_count = {
+ .name = HV_REGISTER_VP_ROOT_SIGNAL_COUNT,
+ };
+
+ ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id,
+ 1, &root_signal_count);
+
+ if (ret) {
+ vp_err(vp, "Failed to get root signal count");
+ *count = 0;
+ return ret;
+ }
+
+ *count = root_signal_count.value.reg64;
+
+ return ret;
+}
+
+static void
+drain_vp_signals(struct mshv_vp *vp)
+{
+ u64 hv_signal_count;
+ u64 vp_signal_count;
+
+ get_vp_signaled_count(vp, &hv_signal_count);
+
+ vp_signal_count = atomic64_read(&vp->run.vp_signaled_count);
+
+ /*
+ * There should be at most 1 outstanding notification, but be extra
+ * careful anyway.
+ */
+ while (hv_signal_count != vp_signal_count) {
+ WARN_ON(hv_signal_count - vp_signal_count != 1);
+
+ if (wait_event_interruptible(vp->run.vp_suspend_queue,
+ vp->run.kicked_by_hv == 1))
+ break;
+ vp->run.kicked_by_hv = 0;
+ vp_signal_count = atomic64_read(&vp->run.vp_signaled_count);
+ }
+}
+
+static void drain_all_vps(const struct mshv_partition *partition)
+{
+ int i;
+ struct mshv_vp *vp;
+
+ /*
+ * VPs are reachable from ISR. It is safe to not take the partition
+ * lock because nobody else can enter this function and drop the
+ * partition from the list.
+ */
+ for (i = 0; i < MSHV_MAX_VPS; i++) {
+ vp = partition->pt_vp_array[i];
+ if (!vp)
+ continue;
+ /*
+ * Disable dispatching of the VP in the hypervisor. After this
+ * the hypervisor guarantees it won't generate any signals for
+ * the VP and the hypervisor's VP signal count won't change.
+ */
+ disable_vp_dispatch(vp);
+ drain_vp_signals(vp);
+ }
+}
+
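+/* Unlink the partition from the global hash table and wait out RCU readers */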
+static void
+remove_partition(struct mshv_partition *partition)
+{
+ spin_lock(&mshv_root.pt_ht_lock);
+ hlist_del_rcu(&partition->pt_hnode);
+ spin_unlock(&mshv_root.pt_ht_lock);
+
+ synchronize_rcu();
+}
+
+/*
+ * Tear down a partition and remove it from the list.
+ * Partition's refcount must be 0
+ */
+static void destroy_partition(struct mshv_partition *partition)
+{
+ struct mshv_vp *vp;
+ struct mshv_mem_region *region;
+ int i, ret;
+ struct hlist_node *n;
+
+ if (refcount_read(&partition->pt_ref_count)) {
+ pt_err(partition,
+ "Attempt to destroy partition but refcount > 0\n");
+ return;
+ }
+
+ if (partition->pt_initialized) {
+ /*
+		 * We only need to drain signals for the root scheduler. This should be
+ * done before removing the partition from the partition list.
+ */
+ if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
+ drain_all_vps(partition);
+
+ /* Remove vps */
+ for (i = 0; i < MSHV_MAX_VPS; ++i) {
+ vp = partition->pt_vp_array[i];
+ if (!vp)
+ continue;
+
+ if (hv_parent_partition())
+ mshv_vp_stats_unmap(partition->pt_id, vp->vp_index);
+
+ if (vp->vp_register_page) {
+ (void)hv_call_unmap_vp_state_page(partition->pt_id,
+ vp->vp_index,
+ HV_VP_STATE_PAGE_REGISTERS,
+ input_vtl_zero);
+ vp->vp_register_page = NULL;
+ }
+
+ (void)hv_call_unmap_vp_state_page(partition->pt_id,
+ vp->vp_index,
+ HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
+ input_vtl_zero);
+ vp->vp_intercept_msg_page = NULL;
+
+ if (vp->vp_ghcb_page) {
+ (void)hv_call_unmap_vp_state_page(partition->pt_id,
+ vp->vp_index,
+ HV_VP_STATE_PAGE_GHCB,
+ input_vtl_normal);
+ vp->vp_ghcb_page = NULL;
+ }
+
+ kfree(vp);
+
+ partition->pt_vp_array[i] = NULL;
+ }
+
+ /* Deallocates and unmaps everything including vcpus, GPA mappings etc */
+ hv_call_finalize_partition(partition->pt_id);
+
+ partition->pt_initialized = false;
+ }
+
+ remove_partition(partition);
+
+ /* Remove regions, regain access to the memory and unpin the pages */
+ hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions,
+ hnode) {
+ hlist_del(&region->hnode);
+
+ if (mshv_partition_encrypted(partition)) {
+ ret = mshv_partition_region_share(region);
+ if (ret) {
+ pt_err(partition,
+				       "Failed to regain access to memory, unpinning user pages will fail and crash the host, error: %d\n",
+ ret);
+ return;
+ }
+ }
+
+ mshv_region_evict(region);
+
+ vfree(region);
+ }
+
+ /* Withdraw and free all pages we deposited */
+ hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);
+ hv_call_delete_partition(partition->pt_id);
+
+ mshv_free_routing_table(partition);
+ kfree(partition);
+}
+
+struct
+mshv_partition *mshv_partition_get(struct mshv_partition *partition)
+{
+ if (refcount_inc_not_zero(&partition->pt_ref_count))
+ return partition;
+ return NULL;
+}
+
+struct
+mshv_partition *mshv_partition_find(u64 partition_id)
+ __must_hold(RCU)
+{
+ struct mshv_partition *p;
+
+ hash_for_each_possible_rcu(mshv_root.pt_htable, p, pt_hnode,
+ partition_id)
+ if (p->pt_id == partition_id)
+ return p;
+
+ return NULL;
+}
+
+void
+mshv_partition_put(struct mshv_partition *partition)
+{
+ if (refcount_dec_and_test(&partition->pt_ref_count))
+ destroy_partition(partition);
+}
+
+static int
+mshv_partition_release(struct inode *inode, struct file *filp)
+{
+ struct mshv_partition *partition = filp->private_data;
+
+ mshv_eventfd_release(partition);
+
+ cleanup_srcu_struct(&partition->pt_irq_srcu);
+
+ mshv_partition_put(partition);
+
+ return 0;
+}
+
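+/* Add the partition to the global hash table, keyed by partition id */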
+static int
+add_partition(struct mshv_partition *partition)
+{
+ spin_lock(&mshv_root.pt_ht_lock);
+
+ hash_add_rcu(mshv_root.pt_htable, &partition->pt_hnode,
+ partition->pt_id);
+
+ spin_unlock(&mshv_root.pt_ht_lock);
+
+ return 0;
+}
+
+static long
+mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev)
+{
+ struct mshv_create_partition args;
+ u64 creation_flags;
+ struct hv_partition_creation_properties creation_properties = {};
+ union hv_partition_isolation_properties isolation_properties = {};
+ struct mshv_partition *partition;
+ struct file *file;
+ int fd;
+ long ret;
+
+ if (copy_from_user(&args, user_arg, sizeof(args)))
+ return -EFAULT;
+
+ if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) ||
+ args.pt_isolation >= MSHV_PT_ISOLATION_COUNT)
+ return -EINVAL;
+
+ /* Only support EXO partitions */
+ creation_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION |
+ HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED;
+
+ if (args.pt_flags & BIT(MSHV_PT_BIT_LAPIC))
+ creation_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED;
+ if (args.pt_flags & BIT(MSHV_PT_BIT_X2APIC))
+ creation_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE;
+ if (args.pt_flags & BIT(MSHV_PT_BIT_GPA_SUPER_PAGES))
+ creation_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED;
+
+ switch (args.pt_isolation) {
+ case MSHV_PT_ISOLATION_NONE:
+ isolation_properties.isolation_type =
+ HV_PARTITION_ISOLATION_TYPE_NONE;
+ break;
+ }
+
+ partition = kzalloc(sizeof(*partition), GFP_KERNEL);
+ if (!partition)
+ return -ENOMEM;
+
+ partition->pt_module_dev = module_dev;
+ partition->isolation_type = isolation_properties.isolation_type;
+
+ refcount_set(&partition->pt_ref_count, 1);
+
+ mutex_init(&partition->pt_mutex);
+
+ mutex_init(&partition->pt_irq_lock);
+
+ init_completion(&partition->async_hypercall);
+
+ INIT_HLIST_HEAD(&partition->irq_ack_notifier_list);
+
+ INIT_HLIST_HEAD(&partition->pt_devices);
+
+ INIT_HLIST_HEAD(&partition->pt_mem_regions);
+
+ mshv_eventfd_init(partition);
+
+ ret = init_srcu_struct(&partition->pt_irq_srcu);
+ if (ret)
+ goto free_partition;
+
+ ret = hv_call_create_partition(creation_flags,
+ creation_properties,
+ isolation_properties,
+ &partition->pt_id);
+ if (ret)
+ goto cleanup_irq_srcu;
+
+ ret = add_partition(partition);
+ if (ret)
+ goto delete_partition;
+
+ ret = mshv_init_async_handler(partition);
+ if (ret)
+ goto remove_partition;
+
+ fd = get_unused_fd_flags(O_CLOEXEC);
+ if (fd < 0) {
+ ret = fd;
+ goto remove_partition;
+ }
+
+ file = anon_inode_getfile("mshv_partition", &mshv_partition_fops,
+ partition, O_RDWR);
+ if (IS_ERR(file)) {
+ ret = PTR_ERR(file);
+ goto put_fd;
+ }
+
+ fd_install(fd, file);
+
+ return fd;
+
+put_fd:
+ put_unused_fd(fd);
+remove_partition:
+ remove_partition(partition);
+delete_partition:
+ hv_call_delete_partition(partition->pt_id);
+cleanup_irq_srcu:
+ cleanup_srcu_struct(&partition->pt_irq_srcu);
+free_partition:
+ kfree(partition);
+
+ return ret;
+}
+
+static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg)
+{
+ struct miscdevice *misc = filp->private_data;
+
+ switch (ioctl) {
+ case MSHV_CREATE_PARTITION:
+ return mshv_ioctl_create_partition((void __user *)arg,
+ misc->this_device);
+ }
+
+ return -ENOTTY;
+}
+
+static int
+mshv_dev_open(struct inode *inode, struct file *filp)
+{
+ return 0;
+}
+
+static int
+mshv_dev_release(struct inode *inode, struct file *filp)
+{
+ return 0;
+}
+
+static int mshv_cpuhp_online;
+static int mshv_root_sched_online;
+
+static const char *scheduler_type_to_string(enum hv_scheduler_type type)
+{
+ switch (type) {
+ case HV_SCHEDULER_TYPE_LP:
+ return "classic scheduler without SMT";
+ case HV_SCHEDULER_TYPE_LP_SMT:
+ return "classic scheduler with SMT";
+ case HV_SCHEDULER_TYPE_CORE_SMT:
+ return "core scheduler";
+ case HV_SCHEDULER_TYPE_ROOT:
+ return "root scheduler";
+ default:
+ return "unknown scheduler";
+ };
+}
+
+/* TODO move this to hv_common.c when needed outside */
+static int __init hv_retrieve_scheduler_type(enum hv_scheduler_type *out)
+{
+ struct hv_input_get_system_property *input;
+ struct hv_output_get_system_property *output;
+ unsigned long flags;
+ u64 status;
+
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+ memset(input, 0, sizeof(*input));
+ memset(output, 0, sizeof(*output));
+ input->property_id = HV_SYSTEM_PROPERTY_SCHEDULER_TYPE;
+
+ status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output);
+ if (!hv_result_success(status)) {
+ local_irq_restore(flags);
+ pr_err("%s: %s\n", __func__, hv_result_to_string(status));
+ return hv_result_to_errno(status);
+ }
+
+ *out = output->scheduler_type;
+ local_irq_restore(flags);
+
+ return 0;
+}
+
+/* Retrieve and stash the supported scheduler type */
+static int __init mshv_retrieve_scheduler_type(struct device *dev)
+{
+ int ret;
+
+ ret = hv_retrieve_scheduler_type(&hv_scheduler_type);
+ if (ret)
+ return ret;
+
+ dev_info(dev, "Hypervisor using %s\n",
+ scheduler_type_to_string(hv_scheduler_type));
+
+ switch (hv_scheduler_type) {
+ case HV_SCHEDULER_TYPE_CORE_SMT:
+ case HV_SCHEDULER_TYPE_LP_SMT:
+ case HV_SCHEDULER_TYPE_ROOT:
+ case HV_SCHEDULER_TYPE_LP:
+ /* Supported scheduler, nothing to do */
+ break;
+ default:
+ dev_err(dev, "unsupported scheduler 0x%x, bailing.\n",
+ hv_scheduler_type);
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static int mshv_root_scheduler_init(unsigned int cpu)
+{
+ void **inputarg, **outputarg, *p;
+
+ inputarg = (void **)this_cpu_ptr(root_scheduler_input);
+ outputarg = (void **)this_cpu_ptr(root_scheduler_output);
+
+ /* Allocate two consecutive pages. One for input, one for output. */
+ p = kmalloc(2 * HV_HYP_PAGE_SIZE, GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ *inputarg = p;
+ *outputarg = (char *)p + HV_HYP_PAGE_SIZE;
+
+ return 0;
+}
+
+static int mshv_root_scheduler_cleanup(unsigned int cpu)
+{
+ void *p, **inputarg, **outputarg;
+
+ inputarg = (void **)this_cpu_ptr(root_scheduler_input);
+ outputarg = (void **)this_cpu_ptr(root_scheduler_output);
+
+ p = *inputarg;
+
+ *inputarg = NULL;
+ *outputarg = NULL;
+
+ kfree(p);
+
+ return 0;
+}
+
+/* Must be called after retrieving the scheduler type */
+static int
+root_scheduler_init(struct device *dev)
+{
+ int ret;
+
+ if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
+ return 0;
+
+ root_scheduler_input = alloc_percpu(void *);
+ root_scheduler_output = alloc_percpu(void *);
+
+ if (!root_scheduler_input || !root_scheduler_output) {
+ dev_err(dev, "Failed to allocate root scheduler buffers\n");
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_root_sched",
+ mshv_root_scheduler_init,
+ mshv_root_scheduler_cleanup);
+
+ if (ret < 0) {
+ dev_err(dev, "Failed to setup root scheduler state: %i\n", ret);
+ goto out;
+ }
+
+ mshv_root_sched_online = ret;
+
+ return 0;
+
+out:
+ free_percpu(root_scheduler_input);
+ free_percpu(root_scheduler_output);
+ return ret;
+}
+
+static void
+root_scheduler_deinit(void)
+{
+ if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT)
+ return;
+
+ cpuhp_remove_state(mshv_root_sched_online);
+ free_percpu(root_scheduler_input);
+ free_percpu(root_scheduler_output);
+}
+
+static int mshv_reboot_notify(struct notifier_block *nb,
+ unsigned long code, void *unused)
+{
+ cpuhp_remove_state(mshv_cpuhp_online);
+ return 0;
+}
+
+struct notifier_block mshv_reboot_nb = {
+ .notifier_call = mshv_reboot_notify,
+};
+
+static void mshv_root_partition_exit(void)
+{
+ unregister_reboot_notifier(&mshv_reboot_nb);
+ root_scheduler_deinit();
+}
+
+static int __init mshv_root_partition_init(struct device *dev)
+{
+ int err;
+
+ if (mshv_retrieve_scheduler_type(dev))
+ return -ENODEV;
+
+ err = root_scheduler_init(dev);
+ if (err)
+ return err;
+
+ err = register_reboot_notifier(&mshv_reboot_nb);
+ if (err)
+ goto root_sched_deinit;
+
+ return 0;
+
+root_sched_deinit:
+ root_scheduler_deinit();
+ return err;
+}
+
+static int __init mshv_parent_partition_init(void)
+{
+ int ret;
+ struct device *dev;
+ union hv_hypervisor_version_info version_info;
+
+ if (!hv_root_partition() || is_kdump_kernel())
+ return -ENODEV;
+
+ if (hv_get_hypervisor_version(&version_info))
+ return -ENODEV;
+
+ ret = misc_register(&mshv_dev);
+ if (ret)
+ return ret;
+
+ dev = mshv_dev.this_device;
+
+ if (version_info.build_number < MSHV_HV_MIN_VERSION ||
+ version_info.build_number > MSHV_HV_MAX_VERSION) {
+ dev_err(dev, "Running on unvalidated Hyper-V version\n");
+ dev_err(dev, "Versions: current: %u min: %u max: %u\n",
+ version_info.build_number, MSHV_HV_MIN_VERSION,
+ MSHV_HV_MAX_VERSION);
+ }
+
+ mshv_root.synic_pages = alloc_percpu(struct hv_synic_pages);
+ if (!mshv_root.synic_pages) {
+ dev_err(dev, "Failed to allocate percpu synic page\n");
+ ret = -ENOMEM;
+ goto device_deregister;
+ }
+
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic",
+ mshv_synic_init,
+ mshv_synic_cleanup);
+ if (ret < 0) {
+ dev_err(dev, "Failed to setup cpu hotplug state: %i\n", ret);
+ goto free_synic_pages;
+ }
+
+ mshv_cpuhp_online = ret;
+
+ ret = mshv_root_partition_init(dev);
+ if (ret)
+ goto remove_cpu_state;
+
+ ret = mshv_irqfd_wq_init();
+ if (ret)
+ goto exit_partition;
+
+ spin_lock_init(&mshv_root.pt_ht_lock);
+ hash_init(mshv_root.pt_htable);
+
+ hv_setup_mshv_handler(mshv_isr);
+
+ return 0;
+
+exit_partition:
+ if (hv_root_partition())
+ mshv_root_partition_exit();
+remove_cpu_state:
+ cpuhp_remove_state(mshv_cpuhp_online);
+free_synic_pages:
+ free_percpu(mshv_root.synic_pages);
+device_deregister:
+ misc_deregister(&mshv_dev);
+ return ret;
+}
+
+static void __exit mshv_parent_partition_exit(void)
+{
+ hv_setup_mshv_handler(NULL);
+ mshv_port_table_fini();
+ misc_deregister(&mshv_dev);
+ mshv_irqfd_wq_cleanup();
+ if (hv_root_partition())
+ mshv_root_partition_exit();
+ cpuhp_remove_state(mshv_cpuhp_online);
+ free_percpu(mshv_root.synic_pages);
+}
+
+module_init(mshv_parent_partition_init);
+module_exit(mshv_parent_partition_exit);
diff --git a/drivers/hv/mshv_synic.c b/drivers/hv/mshv_synic.c
new file mode 100644
index 000000000000..e6b6381b7c36
--- /dev/null
+++ b/drivers/hv/mshv_synic.c
@@ -0,0 +1,665 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2023, Microsoft Corporation.
+ *
+ * mshv_root module's main interrupt handler and associated functionality.
+ *
+ * Authors: Microsoft Linux virtualization team
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/io.h>
+#include <linux/random.h>
+#include <asm/mshyperv.h>
+
+#include "mshv_eventfd.h"
+#include "mshv.h"
+
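+/*
+ * Pop the next queued port id from this CPU's SynIC event ring for the given
+ * SINT. Returns 0 if the ring is empty.
+ */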
+static u32 synic_event_ring_get_queued_port(u32 sint_index)
+{
+ struct hv_synic_event_ring_page **event_ring_page;
+ volatile struct hv_synic_event_ring *ring;
+ struct hv_synic_pages *spages;
+ u8 **synic_eventring_tail;
+ u32 message;
+ u8 tail;
+
+ spages = this_cpu_ptr(mshv_root.synic_pages);
+ event_ring_page = &spages->synic_event_ring_page;
+ synic_eventring_tail = (u8 **)this_cpu_ptr(hv_synic_eventring_tail);
+
+ if (unlikely(!*synic_eventring_tail)) {
+ pr_debug("Missing synic event ring tail!\n");
+ return 0;
+ }
+ tail = (*synic_eventring_tail)[sint_index];
+
+ if (unlikely(!*event_ring_page)) {
+ pr_debug("Missing synic event ring page!\n");
+ return 0;
+ }
+
+ ring = &(*event_ring_page)->sint_event_ring[sint_index];
+
+ /*
+ * Get the message.
+ */
+ message = ring->data[tail];
+
+ if (!message) {
+ if (ring->ring_full) {
+ /*
+			 * Ring is marked full, but we have consumed all the
+			 * messages. Notify the hypervisor that the ring is now
+			 * empty and check again.
+ */
+ ring->ring_full = 0;
+ hv_call_notify_port_ring_empty(sint_index);
+ message = ring->data[tail];
+ }
+
+ if (!message) {
+ ring->signal_masked = 0;
+ /*
+ * Unmask the signal and sync with hypervisor
+ * before one last check for any message.
+ */
+ mb();
+ message = ring->data[tail];
+
+ /*
+			 * OK, let's bail out.
+ */
+ if (!message)
+ return 0;
+ }
+
+ ring->signal_masked = 1;
+ }
+
+ /*
+ * Clear the message in the ring buffer.
+ */
+ ring->data[tail] = 0;
+
+ if (++tail == HV_SYNIC_EVENT_RING_MESSAGE_COUNT)
+ tail = 0;
+
+ (*synic_eventring_tail)[sint_index] = tail;
+
+ return message;
+}
+
+static bool
+mshv_doorbell_isr(struct hv_message *msg)
+{
+ struct hv_notification_message_payload *notification;
+ u32 port;
+
+ if (msg->header.message_type != HVMSG_SYNIC_SINT_INTERCEPT)
+ return false;
+
+ notification = (struct hv_notification_message_payload *)msg->u.payload;
+ if (notification->sint_index != HV_SYNIC_DOORBELL_SINT_INDEX)
+ return false;
+
+ while ((port = synic_event_ring_get_queued_port(HV_SYNIC_DOORBELL_SINT_INDEX))) {
+ struct port_table_info ptinfo = { 0 };
+
+ if (mshv_portid_lookup(port, &ptinfo)) {
+ pr_debug("Failed to get port info from port_table!\n");
+ continue;
+ }
+
+ if (ptinfo.hv_port_type != HV_PORT_TYPE_DOORBELL) {
+			pr_debug("Not a doorbell port! port: %d, port_type: %d\n",
+ port, ptinfo.hv_port_type);
+ continue;
+ }
+
+ /* Invoke the callback */
+ ptinfo.hv_port_doorbell.doorbell_cb(port,
+ ptinfo.hv_port_doorbell.data);
+ }
+
+ return true;
+}
+
+static bool mshv_async_call_completion_isr(struct hv_message *msg)
+{
+ bool handled = false;
+ struct hv_async_completion_message_payload *async_msg;
+ struct mshv_partition *partition;
+ u64 partition_id;
+
+ if (msg->header.message_type != HVMSG_ASYNC_CALL_COMPLETION)
+ goto out;
+
+ async_msg =
+ (struct hv_async_completion_message_payload *)msg->u.payload;
+
+ partition_id = async_msg->partition_id;
+
+ /*
+ * Hold this lock for the rest of the isr, because the partition could
+ * be released anytime.
+ * e.g. the MSHV_RUN_VP thread could wake on another cpu; it could
+ * release the partition unless we hold this!
+ */
+ rcu_read_lock();
+
+ partition = mshv_partition_find(partition_id);
+
+ if (unlikely(!partition)) {
+ pr_debug("failed to find partition %llu\n", partition_id);
+ goto unlock_out;
+ }
+
+ partition->async_hypercall_status = async_msg->status;
+ complete(&partition->async_hypercall);
+
+ handled = true;
+
+unlock_out:
+ rcu_read_unlock();
+out:
+ return handled;
+}
+
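+/* Record the signal and wake any thread waiting on the VP's suspend queue */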
+static void kick_vp(struct mshv_vp *vp)
+{
+ atomic64_inc(&vp->run.vp_signaled_count);
+ vp->run.kicked_by_hv = 1;
+ wake_up(&vp->run.vp_suspend_queue);
+}
+
+static void
+handle_bitset_message(const struct hv_vp_signal_bitset_scheduler_message *msg)
+{
+ int bank_idx, vps_signaled = 0, bank_mask_size;
+ struct mshv_partition *partition;
+ const struct hv_vpset *vpset;
+ const u64 *bank_contents;
+ u64 partition_id = msg->partition_id;
+
+ if (msg->vp_bitset.bitset.format != HV_GENERIC_SET_SPARSE_4K) {
+ pr_debug("scheduler message format is not HV_GENERIC_SET_SPARSE_4K");
+ return;
+ }
+
+ if (msg->vp_count == 0) {
+ pr_debug("scheduler message with no VP specified");
+ return;
+ }
+
+ rcu_read_lock();
+
+ partition = mshv_partition_find(partition_id);
+ if (unlikely(!partition)) {
+ pr_debug("failed to find partition %llu\n", partition_id);
+ goto unlock_out;
+ }
+
+ vpset = &msg->vp_bitset.bitset;
+
+ bank_idx = -1;
+ bank_contents = vpset->bank_contents;
+ bank_mask_size = sizeof(vpset->valid_bank_mask) * BITS_PER_BYTE;
+
+ while (true) {
+ int vp_bank_idx = -1;
+ int vp_bank_size = sizeof(*bank_contents) * BITS_PER_BYTE;
+ int vp_index;
+
+ bank_idx = find_next_bit((unsigned long *)&vpset->valid_bank_mask,
+ bank_mask_size, bank_idx + 1);
+ if (bank_idx == bank_mask_size)
+ break;
+
+ while (true) {
+ struct mshv_vp *vp;
+
+ vp_bank_idx = find_next_bit((unsigned long *)bank_contents,
+ vp_bank_size, vp_bank_idx + 1);
+ if (vp_bank_idx == vp_bank_size)
+ break;
+
+ vp_index = (bank_idx * vp_bank_size) + vp_bank_idx;
+
+ /* This shouldn't happen, but just in case. */
+ if (unlikely(vp_index >= MSHV_MAX_VPS)) {
+ pr_debug("VP index %u out of bounds\n",
+ vp_index);
+ goto unlock_out;
+ }
+
+ vp = partition->pt_vp_array[vp_index];
+ if (unlikely(!vp)) {
+ pr_debug("failed to find VP %u\n", vp_index);
+ goto unlock_out;
+ }
+
+ kick_vp(vp);
+ vps_signaled++;
+ }
+
+ bank_contents++;
+ }
+
+unlock_out:
+ rcu_read_unlock();
+
+ if (vps_signaled != msg->vp_count)
+ pr_debug("asked to signal %u VPs but only did %u\n",
+ msg->vp_count, vps_signaled);
+}
+
+static void
+handle_pair_message(const struct hv_vp_signal_pair_scheduler_message *msg)
+{
+ struct mshv_partition *partition = NULL;
+ struct mshv_vp *vp;
+ int idx;
+
+ rcu_read_lock();
+
+ for (idx = 0; idx < msg->vp_count; idx++) {
+ u64 partition_id = msg->partition_ids[idx];
+ u32 vp_index = msg->vp_indexes[idx];
+
+ if (idx == 0 || partition->pt_id != partition_id) {
+ partition = mshv_partition_find(partition_id);
+ if (unlikely(!partition)) {
+ pr_debug("failed to find partition %llu\n",
+ partition_id);
+ break;
+ }
+ }
+
+ /* This shouldn't happen, but just in case. */
+ if (unlikely(vp_index >= MSHV_MAX_VPS)) {
+ pr_debug("VP index %u out of bounds\n", vp_index);
+ break;
+ }
+
+ vp = partition->pt_vp_array[vp_index];
+ if (!vp) {
+ pr_debug("failed to find VP %u\n", vp_index);
+ break;
+ }
+
+ kick_vp(vp);
+ }
+
+ rcu_read_unlock();
+}
+
+static bool
+mshv_scheduler_isr(struct hv_message *msg)
+{
+ if (msg->header.message_type != HVMSG_SCHEDULER_VP_SIGNAL_BITSET &&
+ msg->header.message_type != HVMSG_SCHEDULER_VP_SIGNAL_PAIR)
+ return false;
+
+ if (msg->header.message_type == HVMSG_SCHEDULER_VP_SIGNAL_BITSET)
+ handle_bitset_message((struct hv_vp_signal_bitset_scheduler_message *)
+ msg->u.payload);
+ else
+ handle_pair_message((struct hv_vp_signal_pair_scheduler_message *)
+ msg->u.payload);
+
+ return true;
+}
+
+static bool
+mshv_intercept_isr(struct hv_message *msg)
+{
+ struct mshv_partition *partition;
+ bool handled = false;
+ struct mshv_vp *vp;
+ u64 partition_id;
+ u32 vp_index;
+
+ partition_id = msg->header.sender;
+
+ rcu_read_lock();
+
+ partition = mshv_partition_find(partition_id);
+ if (unlikely(!partition)) {
+ pr_debug("failed to find partition %llu\n",
+ partition_id);
+ goto unlock_out;
+ }
+
+ if (msg->header.message_type == HVMSG_X64_APIC_EOI) {
+ /*
+ * Check if this gsi is registered in the
+ * ack_notifier list and invoke the callback
+ * if registered.
+ */
+
+ /*
+ * If there is a notifier, the ack callback is supposed
+		 * to handle the VMEXIT, so we need not pass this message
+		 * to the vcpu thread.
+ */
+ struct hv_x64_apic_eoi_message *eoi_msg =
+ (struct hv_x64_apic_eoi_message *)&msg->u.payload[0];
+
+ if (mshv_notify_acked_gsi(partition, eoi_msg->interrupt_vector)) {
+ handled = true;
+ goto unlock_out;
+ }
+ }
+
+ /*
+ * We should get an opaque intercept message here for all intercept
+ * messages, since we're using the mapped VP intercept message page.
+ *
+ * The intercept message will have been placed in intercept message
+ * page at this point.
+ *
+ * Make sure the message type matches our expectation.
+ */
+ if (msg->header.message_type != HVMSG_OPAQUE_INTERCEPT) {
+ pr_debug("wrong message type %d", msg->header.message_type);
+ goto unlock_out;
+ }
+
+ /*
+ * Since we directly index the vp, and it has to exist for us to be here
+ * (because the vp is only deleted when the partition is), no additional
+ * locking is needed here
+ */
+ vp_index =
+ ((struct hv_opaque_intercept_message *)msg->u.payload)->vp_index;
+ vp = partition->pt_vp_array[vp_index];
+ if (unlikely(!vp)) {
+ pr_debug("failed to find VP %u\n", vp_index);
+ goto unlock_out;
+ }
+
+ kick_vp(vp);
+
+ handled = true;
+
+unlock_out:
+ rcu_read_unlock();
+
+ return handled;
+}
+
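+/*
+ * Top-level SynIC interrupt handler: dispatch the pending message to the
+ * doorbell, scheduler, async-completion or intercept handlers.
+ */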
+void mshv_isr(void)
+{
+ struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages);
+ struct hv_message_page **msg_page = &spages->synic_message_page;
+ struct hv_message *msg;
+ bool handled;
+
+ if (unlikely(!(*msg_page))) {
+ pr_debug("Missing synic page!\n");
+ return;
+ }
+
+ msg = &((*msg_page)->sint_message[HV_SYNIC_INTERCEPTION_SINT_INDEX]);
+
+ /*
+ * If the type isn't set, there isn't really a message;
+	 * it may be some other Hyper-V interrupt.
+ */
+ if (msg->header.message_type == HVMSG_NONE)
+ return;
+
+ handled = mshv_doorbell_isr(msg);
+
+ if (!handled)
+ handled = mshv_scheduler_isr(msg);
+
+ if (!handled)
+ handled = mshv_async_call_completion_isr(msg);
+
+ if (!handled)
+ handled = mshv_intercept_isr(msg);
+
+ if (handled) {
+ /*
+ * Acknowledge message with hypervisor if another message is
+ * pending.
+ */
+ msg->header.message_type = HVMSG_NONE;
+ /*
+ * Ensure the write is complete so the hypervisor will deliver
+ * the next message if available.
+ */
+ mb();
+ if (msg->header.message_flags.msg_pending)
+ hv_set_non_nested_msr(HV_MSR_EOM, 0);
+
+#ifdef HYPERVISOR_CALLBACK_VECTOR
+ add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR);
+#endif
+ } else {
+ pr_warn_once("%s: unknown message type 0x%x\n", __func__,
+ msg->header.message_type);
+ }
+}
+
+int mshv_synic_init(unsigned int cpu)
+{
+ union hv_synic_simp simp;
+ union hv_synic_siefp siefp;
+ union hv_synic_sirbp sirbp;
+#ifdef HYPERVISOR_CALLBACK_VECTOR
+ union hv_synic_sint sint;
+#endif
+ union hv_synic_scontrol sctrl;
+ struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages);
+ struct hv_message_page **msg_page = &spages->synic_message_page;
+ struct hv_synic_event_flags_page **event_flags_page =
+ &spages->synic_event_flags_page;
+ struct hv_synic_event_ring_page **event_ring_page =
+ &spages->synic_event_ring_page;
+
+ /* Setup the Synic's message page */
+ simp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIMP);
+ simp.simp_enabled = true;
+ *msg_page = memremap(simp.base_simp_gpa << HV_HYP_PAGE_SHIFT,
+ HV_HYP_PAGE_SIZE,
+ MEMREMAP_WB);
+
+ if (!(*msg_page))
+ return -EFAULT;
+
+ hv_set_non_nested_msr(HV_MSR_SIMP, simp.as_uint64);
+
+ /* Setup the Synic's event flags page */
+ siefp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIEFP);
+ siefp.siefp_enabled = true;
+ *event_flags_page = memremap(siefp.base_siefp_gpa << PAGE_SHIFT,
+ PAGE_SIZE, MEMREMAP_WB);
+
+ if (!(*event_flags_page))
+ goto cleanup;
+
+ hv_set_non_nested_msr(HV_MSR_SIEFP, siefp.as_uint64);
+
+ /* Setup the Synic's event ring page */
+ sirbp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIRBP);
+ sirbp.sirbp_enabled = true;
+ *event_ring_page = memremap(sirbp.base_sirbp_gpa << PAGE_SHIFT,
+ PAGE_SIZE, MEMREMAP_WB);
+
+ if (!(*event_ring_page))
+ goto cleanup;
+
+ hv_set_non_nested_msr(HV_MSR_SIRBP, sirbp.as_uint64);
+
+#ifdef HYPERVISOR_CALLBACK_VECTOR
+ /* Enable intercepts */
+ sint.as_uint64 = 0;
+ sint.vector = HYPERVISOR_CALLBACK_VECTOR;
+ sint.masked = false;
+ sint.auto_eoi = hv_recommend_using_aeoi();
+ hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX,
+ sint.as_uint64);
+
+ /* Doorbell SINT */
+ sint.as_uint64 = 0;
+ sint.vector = HYPERVISOR_CALLBACK_VECTOR;
+ sint.masked = false;
+ sint.as_intercept = 1;
+ sint.auto_eoi = hv_recommend_using_aeoi();
+ hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX,
+ sint.as_uint64);
+#endif
+
+ /* Enable global synic bit */
+ sctrl.as_uint64 = hv_get_non_nested_msr(HV_MSR_SCONTROL);
+ sctrl.enable = 1;
+ hv_set_non_nested_msr(HV_MSR_SCONTROL, sctrl.as_uint64);
+
+ return 0;
+
+cleanup:
+ if (*event_ring_page) {
+ sirbp.sirbp_enabled = false;
+ hv_set_non_nested_msr(HV_MSR_SIRBP, sirbp.as_uint64);
+ memunmap(*event_ring_page);
+ }
+ if (*event_flags_page) {
+ siefp.siefp_enabled = false;
+ hv_set_non_nested_msr(HV_MSR_SIEFP, siefp.as_uint64);
+ memunmap(*event_flags_page);
+ }
+ if (*msg_page) {
+ simp.simp_enabled = false;
+ hv_set_non_nested_msr(HV_MSR_SIMP, simp.as_uint64);
+ memunmap(*msg_page);
+ }
+
+ return -EFAULT;
+}
+
+int mshv_synic_cleanup(unsigned int cpu)
+{
+ union hv_synic_sint sint;
+ union hv_synic_simp simp;
+ union hv_synic_siefp siefp;
+ union hv_synic_sirbp sirbp;
+ union hv_synic_scontrol sctrl;
+ struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages);
+ struct hv_message_page **msg_page = &spages->synic_message_page;
+ struct hv_synic_event_flags_page **event_flags_page =
+ &spages->synic_event_flags_page;
+ struct hv_synic_event_ring_page **event_ring_page =
+ &spages->synic_event_ring_page;
+
+ /* Disable the interrupt */
+ sint.as_uint64 = hv_get_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX);
+ sint.masked = true;
+ hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX,
+ sint.as_uint64);
+
+ /* Disable Doorbell SINT */
+ sint.as_uint64 = hv_get_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX);
+ sint.masked = true;
+ hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX,
+ sint.as_uint64);
+
+ /* Disable Synic's event ring page */
+ sirbp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIRBP);
+ sirbp.sirbp_enabled = false;
+ hv_set_non_nested_msr(HV_MSR_SIRBP, sirbp.as_uint64);
+ memunmap(*event_ring_page);
+
+ /* Disable Synic's event flags page */
+ siefp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIEFP);
+ siefp.siefp_enabled = false;
+ hv_set_non_nested_msr(HV_MSR_SIEFP, siefp.as_uint64);
+ memunmap(*event_flags_page);
+
+ /* Disable Synic's message page */
+ simp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIMP);
+ simp.simp_enabled = false;
+ hv_set_non_nested_msr(HV_MSR_SIMP, simp.as_uint64);
+ memunmap(*msg_page);
+
+ /* Disable global synic bit */
+ sctrl.as_uint64 = hv_get_non_nested_msr(HV_MSR_SCONTROL);
+ sctrl.enable = 0;
+ hv_set_non_nested_msr(HV_MSR_SCONTROL, sctrl.as_uint64);
+
+ return 0;
+}
+
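+/*
+ * Allocate a port table entry, then create and connect a doorbell port so
+ * @doorbell_cb is invoked when the guest writes @val to @gpa (subject to
+ * @flags). Returns the port id, which doubles as the doorbell id, or -errno.
+ */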
+int
+mshv_register_doorbell(u64 partition_id, doorbell_cb_t doorbell_cb, void *data,
+ u64 gpa, u64 val, u64 flags)
+{
+ struct hv_connection_info connection_info = { 0 };
+ union hv_connection_id connection_id = { 0 };
+ struct port_table_info *port_table_info;
+ struct hv_port_info port_info = { 0 };
+ union hv_port_id port_id = { 0 };
+ int ret;
+
+ port_table_info = kmalloc(sizeof(*port_table_info), GFP_KERNEL);
+ if (!port_table_info)
+ return -ENOMEM;
+
+ port_table_info->hv_port_type = HV_PORT_TYPE_DOORBELL;
+ port_table_info->hv_port_doorbell.doorbell_cb = doorbell_cb;
+ port_table_info->hv_port_doorbell.data = data;
+ ret = mshv_portid_alloc(port_table_info);
+ if (ret < 0) {
+ kfree(port_table_info);
+ return ret;
+ }
+
+ port_id.u.id = ret;
+ port_info.port_type = HV_PORT_TYPE_DOORBELL;
+ port_info.doorbell_port_info.target_sint = HV_SYNIC_DOORBELL_SINT_INDEX;
+ port_info.doorbell_port_info.target_vp = HV_ANY_VP;
+ ret = hv_call_create_port(hv_current_partition_id, port_id, partition_id,
+ &port_info,
+ 0, 0, NUMA_NO_NODE);
+
+ if (ret < 0) {
+ mshv_portid_free(port_id.u.id);
+ return ret;
+ }
+
+ connection_id.u.id = port_id.u.id;
+ connection_info.port_type = HV_PORT_TYPE_DOORBELL;
+ connection_info.doorbell_connection_info.gpa = gpa;
+ connection_info.doorbell_connection_info.trigger_value = val;
+ connection_info.doorbell_connection_info.flags = flags;
+
+ ret = hv_call_connect_port(hv_current_partition_id, port_id, partition_id,
+ connection_id, &connection_info, 0, NUMA_NO_NODE);
+ if (ret < 0) {
+ hv_call_delete_port(hv_current_partition_id, port_id);
+ mshv_portid_free(port_id.u.id);
+ return ret;
+ }
+
+	/* Use the port_id as the doorbell_id */
+ return port_id.u.id;
+}
+
+void
+mshv_unregister_doorbell(u64 partition_id, int doorbell_portid)
+{
+ union hv_port_id port_id = { 0 };
+ union hv_connection_id connection_id = { 0 };
+
+ connection_id.u.id = doorbell_portid;
+ hv_call_disconnect_port(partition_id, connection_id);
+
+ port_id.u.id = doorbell_portid;
+ hv_call_delete_port(hv_current_partition_id, port_id);
+
+ mshv_portid_free(doorbell_portid);
+}
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 12a707ab73f8..d74adb5bba44 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -685,7 +685,7 @@ static const struct hv_vmbus_device_id vmbus_device_null;
* Return a matching hv_vmbus_device_id pointer.
* If there is no match, return NULL.
*/
-static const struct hv_vmbus_device_id *hv_vmbus_get_id(struct hv_driver *drv,
+static const struct hv_vmbus_device_id *hv_vmbus_get_id(const struct hv_driver *drv,
struct hv_device *dev)
{
const guid_t *guid = &dev->dev_type;
@@ -696,7 +696,7 @@ static const struct hv_vmbus_device_id *hv_vmbus_get_id(struct hv_driver *drv,
return NULL;
/* Look at the dynamic ids first, before the static ones */
- id = hv_vmbus_dynid_match(drv, guid);
+ id = hv_vmbus_dynid_match((struct hv_driver *)drv, guid);
if (!id)
id = hv_vmbus_dev_match(drv->id_table, guid);
@@ -809,9 +809,9 @@ ATTRIBUTE_GROUPS(vmbus_drv);
/*
* vmbus_match - Attempt to match the specified device to the specified driver
*/
-static int vmbus_match(struct device *device, struct device_driver *driver)
+static int vmbus_match(struct device *device, const struct device_driver *driver)
{
- struct hv_driver *drv = drv_to_hv_drv(driver);
+ const struct hv_driver *drv = drv_to_hv_drv(driver);
struct hv_device *hv_dev = device_to_hv_device(device);
/* The hv_sock driver handles all hv_sock offers. */
@@ -1306,6 +1306,13 @@ static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id)
return IRQ_HANDLED;
}
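+/* Work item to initialize the SynIC on the CPU it is scheduled on */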
+static void vmbus_percpu_work(struct work_struct *work)
+{
+ unsigned int cpu = smp_processor_id();
+
+ hv_synic_init(cpu);
+}
+
/*
* vmbus_bus_init -Main vmbus driver initialization routine.
*
@@ -1316,7 +1323,8 @@ static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id)
*/
static int vmbus_bus_init(void)
{
- int ret;
+ int ret, cpu;
+ struct work_struct __percpu *works;
ret = hv_init();
if (ret != 0) {
@@ -1355,12 +1363,32 @@ static int vmbus_bus_init(void)
if (ret)
goto err_alloc;
+ works = alloc_percpu(struct work_struct);
+ if (!works) {
+ ret = -ENOMEM;
+ goto err_alloc;
+ }
+
/*
* Initialize the per-cpu interrupt state and stimer state.
* Then connect to the host.
*/
- ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online",
- hv_synic_init, hv_synic_cleanup);
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ struct work_struct *work = per_cpu_ptr(works, cpu);
+
+ INIT_WORK(work, vmbus_percpu_work);
+ schedule_work_on(cpu, work);
+ }
+
+ for_each_online_cpu(cpu)
+ flush_work(per_cpu_ptr(works, cpu));
+
+ /* Register the callbacks for possible CPU online/offline'ing */
+ ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online",
+ hv_synic_init, hv_synic_cleanup);
+ cpus_read_unlock();
+ free_percpu(works);
if (ret < 0)
goto err_alloc;
hyperv_cpuhp_online = ret;
@@ -1583,16 +1611,16 @@ static ssize_t target_cpu_show(struct vmbus_channel *channel, char *buf)
{
return sprintf(buf, "%u\n", channel->target_cpu);
}
-static ssize_t target_cpu_store(struct vmbus_channel *channel,
- const char *buf, size_t count)
+
+int vmbus_channel_set_cpu(struct vmbus_channel *channel, u32 target_cpu)
{
- u32 target_cpu, origin_cpu;
- ssize_t ret = count;
+ u32 origin_cpu;
+ int ret = 0;
- if (vmbus_proto_version < VERSION_WIN10_V4_1)
- return -EIO;
+ lockdep_assert_cpus_held();
+ lockdep_assert_held(&vmbus_connection.channel_mutex);
- if (sscanf(buf, "%uu", &target_cpu) != 1)
+ if (vmbus_proto_version < VERSION_WIN10_V4_1)
return -EIO;
/* Validate target_cpu for the cpumask_test_cpu() operation below. */
@@ -1602,22 +1630,17 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel,
if (!cpumask_test_cpu(target_cpu, housekeeping_cpumask(HK_TYPE_MANAGED_IRQ)))
return -EINVAL;
- /* No CPUs should come up or down during this. */
- cpus_read_lock();
-
- if (!cpu_online(target_cpu)) {
- cpus_read_unlock();
+ if (!cpu_online(target_cpu))
return -EINVAL;
- }
/*
- * Synchronizes target_cpu_store() and channel closure:
+ * Synchronizes vmbus_channel_set_cpu() and channel closure:
*
* { Initially: state = CHANNEL_OPENED }
*
* CPU1 CPU2
*
- * [target_cpu_store()] [vmbus_disconnect_ring()]
+ * [vmbus_channel_set_cpu()] [vmbus_disconnect_ring()]
*
* LOCK channel_mutex LOCK channel_mutex
* LOAD r1 = state LOAD r2 = state
@@ -1632,7 +1655,6 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel,
* Note. The host processes the channel messages "sequentially", in
* the order in which they are received on a per-partition basis.
*/
- mutex_lock(&vmbus_connection.channel_mutex);
/*
* Hyper-V will ignore MODIFYCHANNEL messages for "non-open" channels;
@@ -1640,17 +1662,17 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel,
*/
if (channel->state != CHANNEL_OPENED_STATE) {
ret = -EIO;
- goto cpu_store_unlock;
+ goto end;
}
origin_cpu = channel->target_cpu;
if (target_cpu == origin_cpu)
- goto cpu_store_unlock;
+ goto end;
if (vmbus_send_modifychannel(channel,
hv_cpu_number_to_vp_number(target_cpu))) {
ret = -EIO;
- goto cpu_store_unlock;
+ goto end;
}
/*
@@ -1680,10 +1702,26 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel,
origin_cpu, target_cpu);
}
-cpu_store_unlock:
+end:
+ return ret;
+}
+
+static ssize_t target_cpu_store(struct vmbus_channel *channel,
+ const char *buf, size_t count)
+{
+ u32 target_cpu;
+ ssize_t ret;
+
+ if (sscanf(buf, "%uu", &target_cpu) != 1)
+ return -EIO;
+
+ cpus_read_lock();
+ mutex_lock(&vmbus_connection.channel_mutex);
+ ret = vmbus_channel_set_cpu(channel, target_cpu);
mutex_unlock(&vmbus_connection.channel_mutex);
cpus_read_unlock();
- return ret;
+
+ return ret ?: count;
}
static VMBUS_CHAN_ATTR(cpu, 0644, target_cpu_show, target_cpu_store);
@@ -1764,6 +1802,26 @@ static ssize_t subchannel_id_show(struct vmbus_channel *channel,
}
static VMBUS_CHAN_ATTR_RO(subchannel_id);
+static int hv_mmap_ring_buffer_wrapper(struct file *filp, struct kobject *kobj,
+ const struct bin_attribute *attr,
+ struct vm_area_struct *vma)
+{
+ struct vmbus_channel *channel = container_of(kobj, struct vmbus_channel, kobj);
+
+ /*
+ * hv_(create|remove)_ring_sysfs implementation ensures that mmap_ring_buffer
+ * is not NULL.
+ */
+ return channel->mmap_ring_buffer(channel, vma);
+}
+
+static struct bin_attribute chan_attr_ring_buffer = {
+ .attr = {
+ .name = "ring",
+ .mode = 0600,
+ },
+ .mmap = hv_mmap_ring_buffer_wrapper,
+};
static struct attribute *vmbus_chan_attrs[] = {
&chan_attr_out_mask.attr,
&chan_attr_in_mask.attr,
@@ -1783,6 +1841,11 @@ static struct attribute *vmbus_chan_attrs[] = {
NULL
};
+static const struct bin_attribute *vmbus_chan_bin_attrs[] = {
+ &chan_attr_ring_buffer,
+ NULL
+};
+
/*
* Channel-level attribute_group callback function. Returns the permission for
* each attribute, and returns 0 if an attribute is not visible.
@@ -1803,16 +1866,98 @@ static umode_t vmbus_chan_attr_is_visible(struct kobject *kobj,
return attr->mode;
}
-static struct attribute_group vmbus_chan_group = {
+static umode_t vmbus_chan_bin_attr_is_visible(struct kobject *kobj,
+ const struct bin_attribute *attr, int idx)
+{
+ const struct vmbus_channel *channel =
+ container_of(kobj, struct vmbus_channel, kobj);
+
+ /* Hide ring attribute if channel's ring_sysfs_visible is set to false */
+ if (attr == &chan_attr_ring_buffer && !channel->ring_sysfs_visible)
+ return 0;
+
+ return attr->attr.mode;
+}
+
+static size_t vmbus_chan_bin_size(struct kobject *kobj,
+ const struct bin_attribute *bin_attr, int a)
+{
+ const struct vmbus_channel *channel =
+ container_of(kobj, struct vmbus_channel, kobj);
+
+ return channel->ringbuffer_pagecount << PAGE_SHIFT;
+}
+
+static const struct attribute_group vmbus_chan_group = {
.attrs = vmbus_chan_attrs,
- .is_visible = vmbus_chan_attr_is_visible
+ .bin_attrs = vmbus_chan_bin_attrs,
+ .is_visible = vmbus_chan_attr_is_visible,
+ .is_bin_visible = vmbus_chan_bin_attr_is_visible,
+ .bin_size = vmbus_chan_bin_size,
};
-static struct kobj_type vmbus_chan_ktype = {
+static const struct kobj_type vmbus_chan_ktype = {
.sysfs_ops = &vmbus_chan_sysfs_ops,
.release = vmbus_chan_release,
};
+/**
+ * hv_create_ring_sysfs() - create "ring" sysfs entry corresponding to ring buffers for a channel.
+ * @channel: Pointer to vmbus_channel structure
+ * @hv_mmap_ring_buffer: function pointer to be called on mmap of the channel's "ring" sysfs
+ *                       node, which represents the ring buffer of that channel.
+ * Function pointer is of below type:
+ * int (*hv_mmap_ring_buffer)(struct vmbus_channel *channel,
+ * struct vm_area_struct *vma))
+ * This has a pointer to the channel and a pointer to vm_area_struct,
+ * used for mmap, as arguments.
+ *
+ * The sysfs node for a channel's ring buffer is created along with the other fields, but its
+ * visibility is disabled by default. Sysfs visibility needs to be enabled only while the
+ * use-case is running.
+ * For example, an HV_NIC device is used by either uio_hv_generic or hv_netvsc at any given
+ * point in time, and the "ring" sysfs node is needed only when uio_hv_generic is bound to that
+ * device. To avoid exposing the ring buffer by default, this function is responsible for
+ * enabling visibility of the ring for userspace to use.
+ * Note: Race conditions with userspace are possible, and creating new use-cases for this is
+ * discouraged. It was added to maintain backward compatibility while solving one of the race
+ * conditions in uio_hv_generic during sysfs creation.
+ *
+ * Returns 0 on success or error code on failure.
+ */
+int hv_create_ring_sysfs(struct vmbus_channel *channel,
+ int (*hv_mmap_ring_buffer)(struct vmbus_channel *channel,
+ struct vm_area_struct *vma))
+{
+ struct kobject *kobj = &channel->kobj;
+
+ channel->mmap_ring_buffer = hv_mmap_ring_buffer;
+ channel->ring_sysfs_visible = true;
+
+ return sysfs_update_group(kobj, &vmbus_chan_group);
+}
+EXPORT_SYMBOL_GPL(hv_create_ring_sysfs);
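+
+/*
+ * Illustrative usage sketch, not part of this interface: the helper name
+ * my_ring_mmap and the callback body below are assumptions, not code from
+ * this patch. A driver that takes ownership of the channel (e.g.
+ * uio_hv_generic) would supply its own mmap callback and then enable the
+ * "ring" node:
+ *
+ *	static int my_ring_mmap(struct vmbus_channel *chan,
+ *				struct vm_area_struct *vma)
+ *	{
+ *		void *ring = page_address(chan->ringbuffer_page);
+ *
+ *		return vm_iomap_memory(vma, virt_to_phys(ring),
+ *				       chan->ringbuffer_pagecount << PAGE_SHIFT);
+ *	}
+ *
+ *	ret = hv_create_ring_sysfs(channel, my_ring_mmap);
+ *	if (ret)
+ *		return ret;
+ */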
+
+/**
+ * hv_remove_ring_sysfs() - remove the "ring" sysfs entry for a channel's ring buffer.
+ * @channel: Pointer to vmbus_channel structure
+ *
+ * Hide the channel's "ring" sysfs node by clearing its ring_sysfs_visible flag and updating the sysfs group.
+ *
+ * Returns 0 on success or error code on failure.
+ */
+int hv_remove_ring_sysfs(struct vmbus_channel *channel)
+{
+ struct kobject *kobj = &channel->kobj;
+ int ret;
+
+ channel->ring_sysfs_visible = false;
+ ret = sysfs_update_group(kobj, &vmbus_chan_group);
+ channel->mmap_ring_buffer = NULL;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(hv_remove_ring_sysfs);
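+
+/*
+ * Illustrative teardown sketch (an assumption mirroring the registration
+ * example above): the same driver hides the node again before releasing the
+ * channel, typically in its remove/unbind path:
+ *
+ *	ret = hv_remove_ring_sysfs(channel);
+ *	if (ret)
+ *		pr_warn("failed to hide ring sysfs node: %d\n", ret);
+ */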
+
/*
* vmbus_add_channel_kobj - setup a sub-directory under device/channels
*/
@@ -1952,6 +2097,7 @@ void vmbus_device_unregister(struct hv_device *device_obj)
*/
device_unregister(&device_obj->device);
}
+EXPORT_SYMBOL_GPL(vmbus_device_unregister);
#ifdef CONFIG_ACPI
/*
@@ -2233,12 +2379,25 @@ void vmbus_free_mmio(resource_size_t start, resource_size_t size)
struct resource *iter;
mutex_lock(&hyperv_mmio_lock);
+
+ /*
+ * If all bytes of the MMIO range to be released are within the
+ * special case fb_mmio shadow region, skip releasing the shadow
+ * region since no corresponding __request_region() was done
+ * in vmbus_allocate_mmio().
+ */
+ if (fb_mmio && start >= fb_mmio->start &&
+ (start + size - 1 <= fb_mmio->end))
+ goto skip_shadow_release;
+
for (iter = hyperv_mmio; iter; iter = iter->sibling) {
if ((iter->start >= start + size) || (iter->end <= start))
continue;
__release_region(iter, start, size);
}
+
+skip_shadow_release:
release_mem_region(start, size);
mutex_unlock(&hyperv_mmio_lock);
@@ -2398,11 +2557,6 @@ static int vmbus_bus_suspend(struct device *dev)
if (atomic_read(&vmbus_connection.nr_chan_close_on_suspend) > 0)
wait_for_completion(&vmbus_connection.ready_for_suspend_event);
- if (atomic_read(&vmbus_connection.nr_chan_fixup_on_resume) != 0) {
- pr_err("Can not suspend due to a previous failed resuming\n");
- return -EBUSY;
- }
-
mutex_lock(&vmbus_connection.channel_mutex);
list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
@@ -2427,22 +2581,18 @@ static int vmbus_bus_suspend(struct device *dev)
pr_err("Sub-channel not deleted!\n");
WARN_ON_ONCE(1);
}
-
- atomic_inc(&vmbus_connection.nr_chan_fixup_on_resume);
}
mutex_unlock(&vmbus_connection.channel_mutex);
vmbus_initiate_unload(false);
- /* Reset the event for the next resume. */
- reinit_completion(&vmbus_connection.ready_for_resume_event);
-
return 0;
}
static int vmbus_bus_resume(struct device *dev)
{
+ struct vmbus_channel *channel;
struct vmbus_channel_msginfo *msginfo;
size_t msgsize;
int ret;
@@ -2473,13 +2623,23 @@ static int vmbus_bus_resume(struct device *dev)
if (ret != 0)
return ret;
- WARN_ON(atomic_read(&vmbus_connection.nr_chan_fixup_on_resume) == 0);
-
vmbus_request_offers();
- if (wait_for_completion_timeout(
- &vmbus_connection.ready_for_resume_event, 10 * HZ) == 0)
- pr_err("Some vmbus device is missing after suspending?\n");
+ mutex_lock(&vmbus_connection.channel_mutex);
+ list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
+ if (channel->offermsg.child_relid != INVALID_RELID)
+ continue;
+
+ /* hvsock channels are not expected to be present. */
+ if (is_hvsock_channel(channel))
+ continue;
+
+ pr_err("channel %pUl/%pUl not present after resume.\n",
+ &channel->offermsg.offer.if_type,
+ &channel->offermsg.offer.if_instance);
+ /* ToDo: Cleanup these channels here */
+ }
+ mutex_unlock(&vmbus_connection.channel_mutex);
/* Reset the event for the next suspend. */
reinit_completion(&vmbus_connection.ready_for_suspend_event);
@@ -2531,7 +2691,7 @@ static const struct dev_pm_ops vmbus_bus_pm = {
static struct platform_driver vmbus_platform_driver = {
.probe = vmbus_platform_driver_probe,
- .remove_new = vmbus_platform_driver_remove,
+ .remove = vmbus_platform_driver_remove,
.driver = {
.name = "vmbus",
.acpi_match_table = ACPI_PTR(vmbus_acpi_device_ids),
@@ -2616,7 +2776,7 @@ static int __init hv_acpi_init(void)
if (!hv_is_hyperv_initialized())
return -ENODEV;
- if (hv_root_partition && !hv_nested)
+ if (hv_root_partition() && !hv_nested)
return 0;
/*