Diffstat (limited to 'drivers/hv')
30 files changed, 10141 insertions, 1587 deletions
diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig index 0747a8f1fcee..7937ac0cbd0f 100644 --- a/drivers/hv/Kconfig +++ b/drivers/hv/Kconfig @@ -3,31 +3,110 @@ menu "Microsoft Hyper-V guest support" config HYPERV - tristate "Microsoft Hyper-V client drivers" - depends on ACPI && ((X86 && X86_LOCAL_APIC && HYPERVISOR_GUEST) \ - || (ARM64 && !CPU_BIG_ENDIAN)) + bool "Microsoft Hyper-V core hypervisor support" + depends on (X86 && X86_LOCAL_APIC && HYPERVISOR_GUEST) \ + || (ARM64 && !CPU_BIG_ENDIAN) select PARAVIRT select X86_HV_CALLBACK_VECTOR if X86 - select VMAP_PFN + select OF_EARLY_FLATTREE if OF + select SYSFB if EFI && !HYPERV_VTL_MODE + select IRQ_MSI_LIB if X86 help Select this option to run Linux as a Hyper-V client operating system. +config HYPERV_VTL_MODE + bool "Enable Linux to boot in VTL context" + depends on (X86_64 && HAVE_STATIC_CALL) || ARM64 + depends on HYPERV + depends on SMP + default n + help + Virtual Secure Mode (VSM) is a set of hypervisor capabilities and + enlightenments offered to host and guest partitions which enables + the creation and management of new security boundaries within + operating system software. + + VSM achieves and maintains isolation through Virtual Trust Levels + (VTLs). Virtual Trust Levels are hierarchical, with higher levels + being more privileged than lower levels. VTL0 is the least privileged + level, and currently only other level supported is VTL2. + + Select this option to build a Linux kernel to run at a VTL other than + the normal VTL0, which currently is only VTL2. This option + initializes the kernel to run in VTL2, and adds the ability to boot + secondary CPUs directly into 64-bit context as required for VTLs other + than 0. A kernel built with this option must run at VTL2, and will + not run as a normal guest. + + If unsure, say N + config HYPERV_TIMER def_bool HYPERV && X86 config HYPERV_UTILS tristate "Microsoft Hyper-V Utilities driver" - depends on HYPERV && CONNECTOR && NLS + depends on HYPERV_VMBUS && CONNECTOR && NLS depends on PTP_1588_CLOCK_OPTIONAL help Select this option to enable the Hyper-V Utilities. config HYPERV_BALLOON tristate "Microsoft Hyper-V Balloon driver" - depends on HYPERV + depends on HYPERV_VMBUS select PAGE_REPORTING help Select this option to enable Hyper-V Balloon driver. +config HYPERV_VMBUS + tristate "Microsoft Hyper-V VMBus driver" + depends on HYPERV + default HYPERV + help + Select this option to enable Hyper-V Vmbus driver. + +config MSHV_ROOT + tristate "Microsoft Hyper-V root partition support" + depends on HYPERV && (X86_64 || ARM64) + depends on !HYPERV_VTL_MODE + # The hypervisor interface operates on 4k pages. Enforcing it here + # simplifies many assumptions in the root partition code. + # e.g. When withdrawing memory, the hypervisor gives back 4k pages in + # no particular order, making it impossible to reassemble larger pages + depends on PAGE_SIZE_4KB + select EVENTFD + select VIRT_XFER_TO_GUEST_WORK + select HMM_MIRROR + select MMU_NOTIFIER + default n + help + Select this option to enable support for booting and running as root + partition on Microsoft Hyper-V. + + If unsure, say N. + +config MSHV_VTL + tristate "Microsoft Hyper-V VTL driver" + depends on X86_64 && HYPERV_VTL_MODE + depends on HYPERV_VMBUS + # Mapping VTL0 memory to a userspace process in VTL2 is supported in OpenHCL. + # VTL2 for OpenHCL makes use of Huge Pages to improve performance on VMs, + # specially with large memory requirements. 
+ depends on TRANSPARENT_HUGEPAGE + # MTRRs are controlled by VTL0, and are not specific to individual VTLs. + # Therefore, do not attempt to access or modify MTRRs here. + depends on !MTRR + select CPUMASK_OFFSTACK + select VIRT_XFER_TO_GUEST_WORK + default n + help + Select this option to enable Hyper-V VTL driver support. + This driver provides interfaces for Virtual Machine Manager (VMM) running in VTL2 + userspace to create VTLs and partitions, setup and manage VTL0 memory and + allow userspace to make direct hypercalls. This also allows to map VTL0's address + space to a usermode process in VTL2 and supports getting new VMBus messages and channel + events in VTL2. + + If unsure, say N. + endmenu diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile index d76df5c8c2a9..a49f93c2d245 100644 --- a/drivers/hv/Makefile +++ b/drivers/hv/Makefile @@ -1,7 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_HYPERV) += hv_vmbus.o +obj-$(CONFIG_HYPERV_VMBUS) += hv_vmbus.o obj-$(CONFIG_HYPERV_UTILS) += hv_utils.o obj-$(CONFIG_HYPERV_BALLOON) += hv_balloon.o +obj-$(CONFIG_MSHV_ROOT) += mshv_root.o +obj-$(CONFIG_MSHV_VTL) += mshv_vtl.o CFLAGS_hv_trace.o = -I$(src) CFLAGS_hv_balloon.o = -I$(src) @@ -10,7 +12,14 @@ hv_vmbus-y := vmbus_drv.o \ hv.o connection.o channel.o \ channel_mgmt.o ring_buffer.o hv_trace.o hv_vmbus-$(CONFIG_HYPERV_TESTING) += hv_debugfs.o -hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_fcopy.o hv_utils_transport.o +hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_utils_transport.o +mshv_root-y := mshv_root_main.o mshv_synic.o mshv_eventfd.o mshv_irq.o \ + mshv_root_hv_call.o mshv_portid_table.o mshv_regions.o +mshv_vtl-y := mshv_vtl_main.o # Code that must be built-in -obj-$(subst m,y,$(CONFIG_HYPERV)) += hv_common.o +obj-$(CONFIG_HYPERV) += hv_common.o +obj-$(subst m,y,$(CONFIG_MSHV_ROOT)) += hv_proc.o +ifneq ($(CONFIG_MSHV_ROOT)$(CONFIG_MSHV_VTL),) + obj-y += mshv_common.o +endif diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 56f7e06c673e..6821f225248b 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -18,6 +18,7 @@ #include <linux/uio.h> #include <linux/interrupt.h> #include <linux/set_memory.h> +#include <linux/export.h> #include <asm/page.h> #include <asm/mshyperv.h> @@ -153,7 +154,9 @@ void vmbus_free_ring(struct vmbus_channel *channel) hv_ringbuffer_cleanup(&channel->inbound); if (channel->ringbuffer_page) { - __free_pages(channel->ringbuffer_page, + /* In a CoCo VM leak the memory if it didn't get re-encrypted */ + if (!channel->ringbuffer_gpadlhandle.decrypted) + __free_pages(channel->ringbuffer_page, get_order(channel->ringbuffer_pagecount << PAGE_SHIFT)); channel->ringbuffer_page = NULL; @@ -322,125 +325,104 @@ static int create_gpadl_header(enum hv_gpadl_type type, void *kbuffer, pagecount = hv_gpadl_size(type, size) >> HV_HYP_PAGE_SHIFT; - /* do we need a gpadl body msg */ pfnsize = MAX_SIZE_CHANNEL_MESSAGE - sizeof(struct vmbus_channel_gpadl_header) - sizeof(struct gpa_range); + pfncount = umin(pagecount, pfnsize / sizeof(u64)); + + msgsize = sizeof(struct vmbus_channel_msginfo) + + sizeof(struct vmbus_channel_gpadl_header) + + sizeof(struct gpa_range) + pfncount * sizeof(u64); + msgheader = kzalloc(msgsize, GFP_KERNEL); + if (!msgheader) + return -ENOMEM; + + INIT_LIST_HEAD(&msgheader->submsglist); + msgheader->msgsize = msgsize; + + gpadl_header = (struct vmbus_channel_gpadl_header *) + msgheader->msg; + gpadl_header->rangecount = 1; + gpadl_header->range_buflen = sizeof(struct gpa_range) + + pagecount * sizeof(u64); + 
gpadl_header->range[0].byte_offset = 0; + gpadl_header->range[0].byte_count = hv_gpadl_size(type, size); + for (i = 0; i < pfncount; i++) + gpadl_header->range[0].pfn_array[i] = hv_gpadl_hvpfn( + type, kbuffer, size, send_offset, i); + *msginfo = msgheader; + + pfnsum = pfncount; + pfnleft = pagecount - pfncount; + + /* how many pfns can we fit in a body message */ + pfnsize = MAX_SIZE_CHANNEL_MESSAGE - + sizeof(struct vmbus_channel_gpadl_body); pfncount = pfnsize / sizeof(u64); - if (pagecount > pfncount) { - /* we need a gpadl body */ - /* fill in the header */ + /* + * If pfnleft is zero, everything fits in the header and no body + * messages are needed + */ + while (pfnleft) { + pfncurr = umin(pfncount, pfnleft); msgsize = sizeof(struct vmbus_channel_msginfo) + - sizeof(struct vmbus_channel_gpadl_header) + - sizeof(struct gpa_range) + pfncount * sizeof(u64); - msgheader = kzalloc(msgsize, GFP_KERNEL); - if (!msgheader) - goto nomem; - - INIT_LIST_HEAD(&msgheader->submsglist); - msgheader->msgsize = msgsize; - - gpadl_header = (struct vmbus_channel_gpadl_header *) - msgheader->msg; - gpadl_header->rangecount = 1; - gpadl_header->range_buflen = sizeof(struct gpa_range) + - pagecount * sizeof(u64); - gpadl_header->range[0].byte_offset = 0; - gpadl_header->range[0].byte_count = hv_gpadl_size(type, size); - for (i = 0; i < pfncount; i++) - gpadl_header->range[0].pfn_array[i] = hv_gpadl_hvpfn( - type, kbuffer, size, send_offset, i); - *msginfo = msgheader; - - pfnsum = pfncount; - pfnleft = pagecount - pfncount; - - /* how many pfns can we fit */ - pfnsize = MAX_SIZE_CHANNEL_MESSAGE - - sizeof(struct vmbus_channel_gpadl_body); - pfncount = pfnsize / sizeof(u64); - - /* fill in the body */ - while (pfnleft) { - if (pfnleft > pfncount) - pfncurr = pfncount; - else - pfncurr = pfnleft; - - msgsize = sizeof(struct vmbus_channel_msginfo) + - sizeof(struct vmbus_channel_gpadl_body) + - pfncurr * sizeof(u64); - msgbody = kzalloc(msgsize, GFP_KERNEL); - - if (!msgbody) { - struct vmbus_channel_msginfo *pos = NULL; - struct vmbus_channel_msginfo *tmp = NULL; - /* - * Free up all the allocated messages. - */ - list_for_each_entry_safe(pos, tmp, - &msgheader->submsglist, - msglistentry) { - - list_del(&pos->msglistentry); - kfree(pos); - } - - goto nomem; - } - - msgbody->msgsize = msgsize; - gpadl_body = - (struct vmbus_channel_gpadl_body *)msgbody->msg; + sizeof(struct vmbus_channel_gpadl_body) + + pfncurr * sizeof(u64); + msgbody = kzalloc(msgsize, GFP_KERNEL); + if (!msgbody) { + struct vmbus_channel_msginfo *pos = NULL; + struct vmbus_channel_msginfo *tmp = NULL; /* - * Gpadl is u32 and we are using a pointer which could - * be 64-bit - * This is governed by the guest/host protocol and - * so the hypervisor guarantees that this is ok. + * Free up all the allocated messages. 
*/ - for (i = 0; i < pfncurr; i++) - gpadl_body->pfn[i] = hv_gpadl_hvpfn(type, - kbuffer, size, send_offset, pfnsum + i); - - /* add to msg header */ - list_add_tail(&msgbody->msglistentry, - &msgheader->submsglist); - pfnsum += pfncurr; - pfnleft -= pfncurr; + list_for_each_entry_safe(pos, tmp, + &msgheader->submsglist, + msglistentry) { + + list_del(&pos->msglistentry); + kfree(pos); + } + kfree(msgheader); + return -ENOMEM; } - } else { - /* everything fits in a header */ - msgsize = sizeof(struct vmbus_channel_msginfo) + - sizeof(struct vmbus_channel_gpadl_header) + - sizeof(struct gpa_range) + pagecount * sizeof(u64); - msgheader = kzalloc(msgsize, GFP_KERNEL); - if (msgheader == NULL) - goto nomem; - - INIT_LIST_HEAD(&msgheader->submsglist); - msgheader->msgsize = msgsize; - - gpadl_header = (struct vmbus_channel_gpadl_header *) - msgheader->msg; - gpadl_header->rangecount = 1; - gpadl_header->range_buflen = sizeof(struct gpa_range) + - pagecount * sizeof(u64); - gpadl_header->range[0].byte_offset = 0; - gpadl_header->range[0].byte_count = hv_gpadl_size(type, size); - for (i = 0; i < pagecount; i++) - gpadl_header->range[0].pfn_array[i] = hv_gpadl_hvpfn( - type, kbuffer, size, send_offset, i); - - *msginfo = msgheader; + + msgbody->msgsize = msgsize; + gpadl_body = (struct vmbus_channel_gpadl_body *)msgbody->msg; + + /* + * Gpadl is u32 and we are using a pointer which could + * be 64-bit + * This is governed by the guest/host protocol and + * so the hypervisor guarantees that this is ok. + */ + for (i = 0; i < pfncurr; i++) + gpadl_body->pfn[i] = hv_gpadl_hvpfn(type, + kbuffer, size, send_offset, pfnsum + i); + + /* add to msg header */ + list_add_tail(&msgbody->msglistentry, &msgheader->submsglist); + pfnsum += pfncurr; + pfnleft -= pfncurr; } return 0; -nomem: - kfree(msgheader); - kfree(msgbody); - return -ENOMEM; +} + +static void vmbus_free_channel_msginfo(struct vmbus_channel_msginfo *msginfo) +{ + struct vmbus_channel_msginfo *submsginfo, *tmp; + + if (!msginfo) + return; + + list_for_each_entry_safe(submsginfo, tmp, &msginfo->submsglist, + msglistentry) { + kfree(submsginfo); + } + + kfree(msginfo); } /* @@ -462,7 +444,7 @@ static int __vmbus_establish_gpadl(struct vmbus_channel *channel, struct vmbus_channel_gpadl_header *gpadlmsg; struct vmbus_channel_gpadl_body *gpadl_body; struct vmbus_channel_msginfo *msginfo = NULL; - struct vmbus_channel_msginfo *submsginfo, *tmp; + struct vmbus_channel_msginfo *submsginfo; struct list_head *curr; u32 next_gpadl_handle; unsigned long flags; @@ -472,18 +454,31 @@ static int __vmbus_establish_gpadl(struct vmbus_channel *channel, (atomic_inc_return(&vmbus_connection.next_gpadl_handle) - 1); ret = create_gpadl_header(type, kbuffer, size, send_offset, &msginfo); - if (ret) - return ret; - - ret = set_memory_decrypted((unsigned long)kbuffer, - PFN_UP(size)); if (ret) { - dev_warn(&channel->device_obj->device, - "Failed to set host visibility for new GPADL %d.\n", - ret); + gpadl->decrypted = false; return ret; } + gpadl->decrypted = !((channel->co_external_memory && type == HV_GPADL_BUFFER) || + (channel->co_ring_buffer && type == HV_GPADL_RING)); + if (gpadl->decrypted) { + /* + * The "decrypted" flag being true assumes that set_memory_decrypted() succeeds. + * But if it fails, the encryption state of the memory is unknown. In that case, + * leave "decrypted" as true to ensure the memory is leaked instead of going back + * on the free list. 
+ */ + ret = set_memory_decrypted((unsigned long)kbuffer, + PFN_UP(size)); + if (ret) { + dev_warn(&channel->device_obj->device, + "Failed to set host visibility for new GPADL %d.\n", + ret); + vmbus_free_channel_msginfo(msginfo); + return ret; + } + } + init_completion(&msginfo->waitevent); msginfo->waiting_channel = channel; @@ -556,16 +551,20 @@ cleanup: spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); list_del(&msginfo->msglistentry); spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); - list_for_each_entry_safe(submsginfo, tmp, &msginfo->submsglist, - msglistentry) { - kfree(submsginfo); - } - kfree(msginfo); + vmbus_free_channel_msginfo(msginfo); - if (ret) - set_memory_encrypted((unsigned long)kbuffer, - PFN_UP(size)); + if (ret) { + /* + * If set_memory_encrypted() fails, the decrypted flag is + * left as true so the memory is leaked instead of being + * put back on the free list. + */ + if (gpadl->decrypted) { + if (!set_memory_encrypted((unsigned long)kbuffer, PFN_UP(size))) + gpadl->decrypted = false; + } + } return ret; } @@ -591,7 +590,7 @@ EXPORT_SYMBOL_GPL(vmbus_establish_gpadl); * keeps track of the next available slot in the array. Initially, each * slot points to the next one (as in a Linked List). The last slot * does not point to anything, so its value is U64_MAX by default. - * @size The size of the array + * @size: The size of the array */ static u64 *request_arr_init(u32 size) { @@ -695,12 +694,13 @@ static int __vmbus_open(struct vmbus_channel *newchannel, goto error_clean_ring; err = hv_ringbuffer_init(&newchannel->outbound, - page, send_pages, 0); + page, send_pages, 0, newchannel->co_ring_buffer); if (err) goto error_free_gpadl; err = hv_ringbuffer_init(&newchannel->inbound, &page[send_pages], - recv_pages, newchannel->max_pkt_size); + recv_pages, newchannel->max_pkt_size, + newchannel->co_ring_buffer); if (err) goto error_free_gpadl; @@ -881,11 +881,16 @@ post_msg_err: kfree(info); - ret = set_memory_encrypted((unsigned long)gpadl->buffer, - PFN_UP(gpadl->size)); + if (gpadl->decrypted) + ret = set_memory_encrypted((unsigned long)gpadl->buffer, + PFN_UP(gpadl->size)); + else + ret = 0; if (ret) pr_warn("Fail to set mem host visibility in GPADL teardown %d.\n", ret); + gpadl->decrypted = ret; + return ret; } EXPORT_SYMBOL_GPL(vmbus_teardown_gpadl); @@ -941,7 +946,7 @@ static int vmbus_close_internal(struct vmbus_channel *channel) /* Send a closing message */ - msg = &channel->close_msg.msg; + msg = &channel->close_msg; msg->header.msgtype = CHANNELMSG_CLOSECHANNEL; msg->child_relid = channel->offermsg.child_relid; @@ -1094,68 +1099,10 @@ int vmbus_sendpacket(struct vmbus_channel *channel, void *buffer, EXPORT_SYMBOL(vmbus_sendpacket); /* - * vmbus_sendpacket_pagebuffer - Send a range of single-page buffer - * packets using a GPADL Direct packet type. This interface allows you - * to control notifying the host. This will be useful for sending - * batched data. Also the sender can control the send flags - * explicitly. 
- */ -int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel, - struct hv_page_buffer pagebuffers[], - u32 pagecount, void *buffer, u32 bufferlen, - u64 requestid) -{ - int i; - struct vmbus_channel_packet_page_buffer desc; - u32 descsize; - u32 packetlen; - u32 packetlen_aligned; - struct kvec bufferlist[3]; - u64 aligned_data = 0; - - if (pagecount > MAX_PAGE_BUFFER_COUNT) - return -EINVAL; - - /* - * Adjust the size down since vmbus_channel_packet_page_buffer is the - * largest size we support - */ - descsize = sizeof(struct vmbus_channel_packet_page_buffer) - - ((MAX_PAGE_BUFFER_COUNT - pagecount) * - sizeof(struct hv_page_buffer)); - packetlen = descsize + bufferlen; - packetlen_aligned = ALIGN(packetlen, sizeof(u64)); - - /* Setup the descriptor */ - desc.type = VM_PKT_DATA_USING_GPA_DIRECT; - desc.flags = VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED; - desc.dataoffset8 = descsize >> 3; /* in 8-bytes granularity */ - desc.length8 = (u16)(packetlen_aligned >> 3); - desc.transactionid = VMBUS_RQST_ERROR; /* will be updated in hv_ringbuffer_write() */ - desc.reserved = 0; - desc.rangecount = pagecount; - - for (i = 0; i < pagecount; i++) { - desc.range[i].len = pagebuffers[i].len; - desc.range[i].offset = pagebuffers[i].offset; - desc.range[i].pfn = pagebuffers[i].pfn; - } - - bufferlist[0].iov_base = &desc; - bufferlist[0].iov_len = descsize; - bufferlist[1].iov_base = buffer; - bufferlist[1].iov_len = bufferlen; - bufferlist[2].iov_base = &aligned_data; - bufferlist[2].iov_len = (packetlen_aligned - packetlen); - - return hv_ringbuffer_write(channel, bufferlist, 3, requestid, NULL); -} -EXPORT_SYMBOL_GPL(vmbus_sendpacket_pagebuffer); - -/* - * vmbus_sendpacket_multipagebuffer - Send a multi-page buffer packet + * vmbus_sendpacket_mpb_desc - Send one or more multi-page buffer packets * using a GPADL Direct packet type. - * The buffer includes the vmbus descriptor. + * The desc argument must include space for the VMBus descriptor. The + * rangecount field must already be set. 
*/ int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel, struct vmbus_packet_mpb_array *desc, @@ -1177,7 +1124,6 @@ int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel, desc->length8 = (u16)(packetlen_aligned >> 3); desc->transactionid = VMBUS_RQST_ERROR; /* will be updated in hv_ringbuffer_write() */ desc->reserved = 0; - desc->rangecount = 1; bufferlist[0].iov_base = desc; bufferlist[0].iov_len = desc_size; diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index cc23b90cae02..74fed2c073d4 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -20,6 +20,7 @@ #include <linux/delay.h> #include <linux/cpu.h> #include <linux/hyperv.h> +#include <linux/export.h> #include <asm/mshyperv.h> #include <linux/sched/isolation.h> @@ -67,7 +68,7 @@ const struct vmbus_device vmbus_devs[] = { { .dev_type = HV_PCIE, HV_PCIE_GUID, .perf_device = false, - .allowed_in_isolated = false, + .allowed_in_isolated = true, }, /* Synthetic Frame Buffer */ @@ -120,7 +121,9 @@ const struct vmbus_device vmbus_devs[] = { }, /* File copy */ - { .dev_type = HV_FCOPY, + /* fcopy always uses 16KB ring buffer size and is working well for last many years */ + { .pref_ring_size = 0x4000, + .dev_type = HV_FCOPY, HV_FCOPY_GUID, .perf_device = false, .allowed_in_isolated = false, @@ -140,12 +143,19 @@ const struct vmbus_device vmbus_devs[] = { .allowed_in_isolated = false, }, - /* Unknown GUID */ - { .dev_type = HV_UNKNOWN, + /* + * Unknown GUID + * 64 KB ring buffer + 4 KB header should be sufficient size for any Hyper-V device apart + * from HV_NIC and HV_SCSI. This case avoid the fallback for unknown devices to allocate + * much bigger (2 MB) of ring size. + */ + { .pref_ring_size = 0x11000, + .dev_type = HV_UNKNOWN, .perf_device = false, .allowed_in_isolated = false, }, }; +EXPORT_SYMBOL_GPL(vmbus_devs); static const struct { guid_t guid; @@ -829,11 +839,22 @@ static void vmbus_wait_for_unload(void) if (completion_done(&vmbus_connection.unload_event)) goto completed; - for_each_online_cpu(cpu) { + for_each_present_cpu(cpu) { struct hv_per_cpu_context *hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); - page_addr = hv_cpu->synic_message_page; + /* + * In a CoCo VM the hyp_synic_message_page is not allocated + * in hv_synic_alloc(). Instead it is set/cleared in + * hv_hyp_synic_enable_regs() and hv_hyp_synic_disable_regs() + * such that it is set only when the CPU is online. If + * not all present CPUs are online, the message page + * might be NULL, so skip such CPUs. + */ + page_addr = hv_cpu->hyp_synic_message_page; + if (!page_addr) + continue; + msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT; @@ -867,11 +888,14 @@ completed: * maybe-pending messages on all CPUs to be able to receive new * messages after we reconnect. */ - for_each_online_cpu(cpu) { + for_each_present_cpu(cpu) { struct hv_per_cpu_context *hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); - page_addr = hv_cpu->synic_message_page; + page_addr = hv_cpu->hyp_synic_message_page; + if (!page_addr) + continue; + msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT; msg->header.message_type = HVMSG_NONE; } @@ -921,16 +945,6 @@ void vmbus_initiate_unload(bool crash) vmbus_wait_for_unload(); } -static void check_ready_for_resume_event(void) -{ - /* - * If all the old primary channels have been fixed up, then it's safe - * to resume. 
- */ - if (atomic_dec_and_test(&vmbus_connection.nr_chan_fixup_on_resume)) - complete(&vmbus_connection.ready_for_resume_event); -} - static void vmbus_setup_channel_state(struct vmbus_channel *channel, struct vmbus_channel_offer_channel *offer) { @@ -1008,6 +1022,7 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) struct vmbus_channel_offer_channel *offer; struct vmbus_channel *oldchannel, *newchannel; size_t offer_sz; + bool co_ring_buffer, co_external_memory; offer = (struct vmbus_channel_offer_channel *)hdr; @@ -1020,6 +1035,22 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) return; } + co_ring_buffer = is_co_ring_buffer(offer); + co_external_memory = is_co_external_memory(offer); + if (!co_ring_buffer && co_external_memory) { + pr_err("Invalid offer relid=%d: the ring buffer isn't encrypted\n", + offer->child_relid); + return; + } + if (co_ring_buffer || co_external_memory) { + if (vmbus_proto_version < VERSION_WIN10_V6_0 || !vmbus_is_confidential()) { + pr_err("Invalid offer relid=%d: no support for confidential VMBus\n", + offer->child_relid); + atomic_dec(&vmbus_connection.offer_in_progress); + return; + } + } + oldchannel = find_primary_channel_by_offer(offer); if (oldchannel != NULL) { @@ -1086,8 +1117,6 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) /* Add the channel back to the array of channels. */ vmbus_channel_map_relid(oldchannel); - check_ready_for_resume_event(); - mutex_unlock(&vmbus_connection.channel_mutex); return; } @@ -1100,6 +1129,8 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) pr_err("Unable to allocate channel object\n"); return; } + newchannel->co_ring_buffer = co_ring_buffer; + newchannel->co_external_memory = co_external_memory; vmbus_setup_channel_state(newchannel, offer); @@ -1273,13 +1304,28 @@ EXPORT_SYMBOL_GPL(vmbus_hvsock_device_unregister); /* * vmbus_onoffers_delivered - - * This is invoked when all offers have been delivered. + * The CHANNELMSG_ALLOFFERS_DELIVERED message arrives after all + * boot-time offers are delivered. A boot-time offer is for the primary + * channel for any virtual hardware configured in the VM at the time it boots. + * Boot-time offers include offers for physical devices assigned to the VM + * via Hyper-V's Discrete Device Assignment (DDA) functionality that are + * handled as virtual PCI devices in Linux (e.g., NVMe devices and GPUs). + * Boot-time offers do not include offers for VMBus sub-channels. Because + * devices can be hot-added to the VM after it is booted, additional channel + * offers that aren't boot-time offers can be received at any time after the + * all-offers-delivered message. * - * Nothing to do here. + * SR-IOV NIC Virtual Functions (VFs) assigned to a VM are not considered + * to be assigned to the VM at boot-time, and offers for VFs may occur after + * the all-offers-delivered message. VFs are optional accelerators to the + * synthetic VMBus NIC and are effectively hot-added only after the VMBus + * NIC channel is opened (once it knows the guest can support it, via the + * sriov bit in the netvsc protocol). */ static void vmbus_onoffers_delivered( struct vmbus_channel_message_header *hdr) { + complete(&vmbus_connection.all_offers_delivered_event); } /* @@ -1555,7 +1601,8 @@ void vmbus_onmessage(struct vmbus_channel_message_header *hdr) } /* - * vmbus_request_offers - Send a request to get all our pending offers. 
+ * vmbus_request_offers - Send a request to get all our pending offers + * and wait for all boot-time offers to arrive. */ int vmbus_request_offers(void) { @@ -1573,6 +1620,10 @@ int vmbus_request_offers(void) msg->msgtype = CHANNELMSG_REQUESTOFFERS; + /* + * This REQUESTOFFERS message will result in the host sending an all + * offers delivered message after all the boot-time offers are sent. + */ ret = vmbus_post_msg(msg, sizeof(struct vmbus_channel_message_header), true); @@ -1584,6 +1635,29 @@ int vmbus_request_offers(void) goto cleanup; } + /* + * Wait for the host to send all boot-time offers. + * Keeping it as a best-effort mechanism, where a warning is + * printed if a timeout occurs, and execution is resumed. + */ + if (!wait_for_completion_timeout(&vmbus_connection.all_offers_delivered_event, + secs_to_jiffies(60))) { + pr_warn("timed out waiting for all boot-time offers to be delivered.\n"); + } + + /* + * Flush handling of offer messages (which may initiate work on + * other work queues). + */ + flush_workqueue(vmbus_connection.work_queue); + + /* + * Flush workqueue for processing the incoming offers. Subchannel + * offers and their processing can happen later, so there is no need to + * flush that workqueue here. + */ + flush_workqueue(vmbus_connection.handle_primary_chan_wq); + cleanup: kfree(msginfo); diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index 9dc27e5d367a..5d9cb5bf2d62 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -34,8 +34,8 @@ struct vmbus_connection vmbus_connection = { .ready_for_suspend_event = COMPLETION_INITIALIZER( vmbus_connection.ready_for_suspend_event), - .ready_for_resume_event = COMPLETION_INITIALIZER( - vmbus_connection.ready_for_resume_event), + .all_offers_delivered_event = COMPLETION_INITIALIZER( + vmbus_connection.all_offers_delivered_event), }; EXPORT_SYMBOL_GPL(vmbus_connection); @@ -51,6 +51,7 @@ EXPORT_SYMBOL_GPL(vmbus_proto_version); * Linux guests and are not listed. */ static __u32 vmbus_versions[] = { + VERSION_WIN10_V6_0, VERSION_WIN10_V5_3, VERSION_WIN10_V5_2, VERSION_WIN10_V5_1, @@ -65,7 +66,7 @@ static __u32 vmbus_versions[] = { * Maximal VMBus protocol version guests can negotiate. Useful to cap the * VMBus version for testing and debugging purpose. 
*/ -static uint max_version = VERSION_WIN10_V5_3; +static uint max_version = VERSION_WIN10_V6_0; module_param(max_version, uint, S_IRUGO); MODULE_PARM_DESC(max_version, @@ -98,14 +99,24 @@ int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version) */ if (version >= VERSION_WIN10_V5) { msg->msg_sint = VMBUS_MESSAGE_SINT; + msg->msg_vtl = ms_hyperv.vtl; vmbus_connection.msg_conn_id = VMBUS_MESSAGE_CONNECTION_ID_4; } else { msg->interrupt_page = virt_to_phys(vmbus_connection.int_page); vmbus_connection.msg_conn_id = VMBUS_MESSAGE_CONNECTION_ID; } - msg->monitor_page1 = vmbus_connection.monitor_pages_pa[0]; - msg->monitor_page2 = vmbus_connection.monitor_pages_pa[1]; + if (vmbus_is_confidential() && version >= VERSION_WIN10_V6_0) + msg->feature_flags = VMBUS_FEATURE_FLAG_CONFIDENTIAL_CHANNELS; + + /* + * shared_gpa_boundary is zero in non-SNP VMs, so it's safe to always + * bitwise OR it + */ + msg->monitor_page1 = virt_to_phys(vmbus_connection.monitor_pages[0]) | + ms_hyperv.shared_gpa_boundary; + msg->monitor_page2 = virt_to_phys(vmbus_connection.monitor_pages[1]) | + ms_hyperv.shared_gpa_boundary; msg->target_vcpu = hv_cpu_number_to_vp_number(VMBUS_CONNECT_CPU); @@ -200,11 +211,19 @@ int vmbus_connect(void) mutex_init(&vmbus_connection.channel_mutex); /* + * The following Hyper-V interrupt and monitor pages can be used by + * UIO for mapping to user-space, so they should always be allocated on + * system page boundaries. The system page size must be >= the Hyper-V + * page size. + */ + BUILD_BUG_ON(PAGE_SIZE < HV_HYP_PAGE_SIZE); + + /* * Setup the vmbus event connection for channel interrupt * abstraction stuff */ vmbus_connection.int_page = - (void *)hv_alloc_hyperv_zeroed_page(); + (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO); if (vmbus_connection.int_page == NULL) { ret = -ENOMEM; goto cleanup; @@ -219,73 +238,37 @@ int vmbus_connect(void) * Setup the monitor notification facility. The 1st page for * parent->child and the 2nd page for child->parent */ - vmbus_connection.monitor_pages[0] = (void *)hv_alloc_hyperv_zeroed_page(); - vmbus_connection.monitor_pages[1] = (void *)hv_alloc_hyperv_zeroed_page(); + vmbus_connection.monitor_pages[0] = (void *)__get_free_page(GFP_KERNEL); + vmbus_connection.monitor_pages[1] = (void *)__get_free_page(GFP_KERNEL); if ((vmbus_connection.monitor_pages[0] == NULL) || (vmbus_connection.monitor_pages[1] == NULL)) { ret = -ENOMEM; goto cleanup; } - vmbus_connection.monitor_pages_original[0] - = vmbus_connection.monitor_pages[0]; - vmbus_connection.monitor_pages_original[1] - = vmbus_connection.monitor_pages[1]; - vmbus_connection.monitor_pages_pa[0] - = virt_to_phys(vmbus_connection.monitor_pages[0]); - vmbus_connection.monitor_pages_pa[1] - = virt_to_phys(vmbus_connection.monitor_pages[1]); - - if (hv_is_isolation_supported()) { - ret = set_memory_decrypted((unsigned long) - vmbus_connection.monitor_pages[0], - 1); - ret |= set_memory_decrypted((unsigned long) - vmbus_connection.monitor_pages[1], - 1); - if (ret) - goto cleanup; - + ret = set_memory_decrypted((unsigned long) + vmbus_connection.monitor_pages[0], 1); + ret |= set_memory_decrypted((unsigned long) + vmbus_connection.monitor_pages[1], 1); + if (ret) { /* - * Isolation VM with AMD SNP needs to access monitor page via - * address space above shared gpa boundary. + * If set_memory_decrypted() fails, the encryption state + * of the memory is unknown. So leak the memory instead + * of risking returning decrypted memory to the free list. 
+ * For simplicity, always handle both pages the same. */ - if (hv_isolation_type_snp()) { - vmbus_connection.monitor_pages_pa[0] += - ms_hyperv.shared_gpa_boundary; - vmbus_connection.monitor_pages_pa[1] += - ms_hyperv.shared_gpa_boundary; - - vmbus_connection.monitor_pages[0] - = memremap(vmbus_connection.monitor_pages_pa[0], - HV_HYP_PAGE_SIZE, - MEMREMAP_WB); - if (!vmbus_connection.monitor_pages[0]) { - ret = -ENOMEM; - goto cleanup; - } - - vmbus_connection.monitor_pages[1] - = memremap(vmbus_connection.monitor_pages_pa[1], - HV_HYP_PAGE_SIZE, - MEMREMAP_WB); - if (!vmbus_connection.monitor_pages[1]) { - ret = -ENOMEM; - goto cleanup; - } - } - - /* - * Set memory host visibility hvcall smears memory - * and so zero monitor pages here. - */ - memset(vmbus_connection.monitor_pages[0], 0x00, - HV_HYP_PAGE_SIZE); - memset(vmbus_connection.monitor_pages[1], 0x00, - HV_HYP_PAGE_SIZE); - + vmbus_connection.monitor_pages[0] = NULL; + vmbus_connection.monitor_pages[1] = NULL; + goto cleanup; } + /* + * Set_memory_decrypted() will change the memory contents if + * decryption occurs, so zero monitor pages here. + */ + memset(vmbus_connection.monitor_pages[0], 0x00, HV_HYP_PAGE_SIZE); + memset(vmbus_connection.monitor_pages[1], 0x00, HV_HYP_PAGE_SIZE); + msginfo = kzalloc(sizeof(*msginfo) + sizeof(struct vmbus_channel_initiate_contact), GFP_KERNEL); @@ -372,35 +355,25 @@ void vmbus_disconnect(void) destroy_workqueue(vmbus_connection.work_queue); if (vmbus_connection.int_page) { - hv_free_hyperv_page((unsigned long)vmbus_connection.int_page); + free_page((unsigned long)vmbus_connection.int_page); vmbus_connection.int_page = NULL; } - if (hv_is_isolation_supported()) { - /* - * memunmap() checks input address is ioremap address or not - * inside. It doesn't unmap any thing in the non-SNP CVM and - * so not check CVM type here. 
- */ - memunmap(vmbus_connection.monitor_pages[0]); - memunmap(vmbus_connection.monitor_pages[1]); - - set_memory_encrypted((unsigned long) - vmbus_connection.monitor_pages_original[0], - 1); - set_memory_encrypted((unsigned long) - vmbus_connection.monitor_pages_original[1], - 1); + if (vmbus_connection.monitor_pages[0]) { + if (!set_memory_encrypted( + (unsigned long)vmbus_connection.monitor_pages[0], 1)) + free_page((unsigned long) + vmbus_connection.monitor_pages[0]); + vmbus_connection.monitor_pages[0] = NULL; } - hv_free_hyperv_page((unsigned long) - vmbus_connection.monitor_pages_original[0]); - hv_free_hyperv_page((unsigned long) - vmbus_connection.monitor_pages_original[1]); - vmbus_connection.monitor_pages_original[0] = - vmbus_connection.monitor_pages[0] = NULL; - vmbus_connection.monitor_pages_original[1] = + if (vmbus_connection.monitor_pages[1]) { + if (!set_memory_encrypted( + (unsigned long)vmbus_connection.monitor_pages[1], 1)) + free_page((unsigned long) + vmbus_connection.monitor_pages[1]); vmbus_connection.monitor_pages[1] = NULL; + } } /* @@ -409,6 +382,10 @@ void vmbus_disconnect(void) */ struct vmbus_channel *relid2channel(u32 relid) { + if (vmbus_connection.channels == NULL) { + pr_warn_once("relid2channel: relid=%d: No channels mapped!\n", relid); + return NULL; + } if (WARN_ON(relid >= MAX_CHANNEL_RELIDS)) return NULL; return READ_ONCE(vmbus_connection.channels[relid]); @@ -536,10 +513,20 @@ void vmbus_set_event(struct vmbus_channel *channel) ++channel->sig_events; - if (hv_isolation_type_snp()) - hv_ghcb_hypercall(HVCALL_SIGNAL_EVENT, &channel->sig_event, - NULL, sizeof(channel->sig_event)); - else - hv_do_fast_hypercall8(HVCALL_SIGNAL_EVENT, channel->sig_event); + if (ms_hyperv.paravisor_present) { + if (hv_isolation_type_snp()) + hv_ghcb_hypercall(HVCALL_SIGNAL_EVENT, &channel->sig_event, + NULL, sizeof(channel->sig_event)); + else if (hv_isolation_type_tdx()) + hv_tdx_hypercall(HVCALL_SIGNAL_EVENT | HV_HYPERCALL_FAST_BIT, + channel->sig_event, 0); + else + WARN_ON_ONCE(1); + } else { + u64 control = HVCALL_SIGNAL_EVENT; + + control |= hv_nested ? HV_HYPERCALL_NESTED : 0; + hv_do_fast_hypercall8(control, channel->sig_event); + } } EXPORT_SYMBOL_GPL(vmbus_set_event); diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index 4d6480d57546..c100f04b3581 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -18,12 +18,15 @@ #include <linux/clockchips.h> #include <linux/delay.h> #include <linux/interrupt.h> +#include <linux/export.h> #include <clocksource/hyperv_timer.h> #include <asm/mshyperv.h> +#include <linux/set_memory.h> #include "hyperv_vmbus.h" /* The one and only */ struct hv_context hv_context; +EXPORT_SYMBOL_FOR_MODULES(hv_context, "mshv_vtl"); /* * hv_init - Main initialization routine. @@ -39,86 +42,136 @@ int hv_init(void) } /* - * Functions for allocating and freeing memory with size and - * alignment HV_HYP_PAGE_SIZE. These functions are needed because - * the guest page size may not be the same as the Hyper-V page - * size. We depend upon kmalloc() aligning power-of-two size - * allocations to the allocation size boundary, so that the - * allocated memory appears to Hyper-V as a page of the size - * it expects. 
- */ - -void *hv_alloc_hyperv_page(void) -{ - BUILD_BUG_ON(PAGE_SIZE < HV_HYP_PAGE_SIZE); - - if (PAGE_SIZE == HV_HYP_PAGE_SIZE) - return (void *)__get_free_page(GFP_KERNEL); - else - return kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL); -} - -void *hv_alloc_hyperv_zeroed_page(void) -{ - if (PAGE_SIZE == HV_HYP_PAGE_SIZE) - return (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO); - else - return kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL); -} - -void hv_free_hyperv_page(unsigned long addr) -{ - if (PAGE_SIZE == HV_HYP_PAGE_SIZE) - free_page(addr); - else - kfree((void *)addr); -} - -/* * hv_post_message - Post a message using the hypervisor message IPC. * * This involves a hypercall. */ int hv_post_message(union hv_connection_id connection_id, - enum hv_message_type message_type, - void *payload, size_t payload_size) + enum hv_message_type message_type, + void *payload, size_t payload_size) { struct hv_input_post_message *aligned_msg; - struct hv_per_cpu_context *hv_cpu; + unsigned long flags; u64 status; if (payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT) return -EMSGSIZE; - hv_cpu = get_cpu_ptr(hv_context.cpu_context); - aligned_msg = hv_cpu->post_msg_page; + local_irq_save(flags); + + /* + * A TDX VM with the paravisor must use the decrypted post_msg_page: see + * the comment in struct hv_per_cpu_context. A SNP VM with the paravisor + * can use the encrypted hyperv_pcpu_input_arg because it copies the + * input into the GHCB page, which has been decrypted by the paravisor. + */ + if (hv_isolation_type_tdx() && ms_hyperv.paravisor_present) + aligned_msg = this_cpu_ptr(hv_context.cpu_context)->post_msg_page; + else + aligned_msg = *this_cpu_ptr(hyperv_pcpu_input_arg); + aligned_msg->connectionid = connection_id; aligned_msg->reserved = 0; aligned_msg->message_type = message_type; aligned_msg->payload_size = payload_size; memcpy((void *)aligned_msg->payload, payload, payload_size); - if (hv_isolation_type_snp()) - status = hv_ghcb_hypercall(HVCALL_POST_MESSAGE, - (void *)aligned_msg, NULL, - sizeof(*aligned_msg)); - else - status = hv_do_hypercall(HVCALL_POST_MESSAGE, - aligned_msg, NULL); + if (ms_hyperv.paravisor_present && !vmbus_is_confidential()) { + /* + * If the VMBus isn't confidential, use the CoCo-specific + * mechanism to communicate with the hypervisor. + */ + if (hv_isolation_type_tdx()) + status = hv_tdx_hypercall(HVCALL_POST_MESSAGE, + virt_to_phys(aligned_msg), 0); + else if (hv_isolation_type_snp()) + status = hv_ghcb_hypercall(HVCALL_POST_MESSAGE, + aligned_msg, NULL, + sizeof(*aligned_msg)); + else + status = HV_STATUS_INVALID_PARAMETER; + } else { + u64 control = HVCALL_POST_MESSAGE; - /* Preemption must remain disabled until after the hypercall - * so some other thread can't get scheduled onto this cpu and - * corrupt the per-cpu post_msg_page - */ - put_cpu_ptr(hv_cpu); + control |= hv_nested ? HV_HYPERCALL_NESTED : 0; + /* + * If there is no paravisor, this will go to the hypervisor. + * In the Confidential VMBus case, there is the paravisor + * to which this will trap. + */ + status = hv_do_hypercall(control, aligned_msg, NULL); + } + + local_irq_restore(flags); return hv_result(status); } +EXPORT_SYMBOL_FOR_MODULES(hv_post_message, "mshv_vtl"); + +static int hv_alloc_page(void **page, bool decrypt, const char *note) +{ + int ret = 0; + + /* + * After the page changes its encryption status, its contents might + * appear scrambled on some hardware. Thus `get_zeroed_page` would + * zero the page out in vain, so do that explicitly exactly once. 
+ * + * By default, the page is allocated encrypted in a CoCo VM. + */ + *page = (void *)__get_free_page(GFP_KERNEL); + if (!*page) + return -ENOMEM; + + if (decrypt) + ret = set_memory_decrypted((unsigned long)*page, 1); + if (ret) + goto failed; + + memset(*page, 0, PAGE_SIZE); + return 0; + +failed: + /* + * Report the failure but don't put the page back on the free list as + * its encryption status is unknown. + */ + pr_err("allocation failed for %s page, error %d, decrypted %d\n", + note, ret, decrypt); + *page = NULL; + return ret; +} + +static int hv_free_page(void **page, bool encrypt, const char *note) +{ + int ret = 0; + + if (!*page) + return 0; + + if (encrypt) + ret = set_memory_encrypted((unsigned long)*page, 1); + + /* + * In the case of the failure, the page is leaked. Something is wrong, + * prefer to lose the page with the unknown encryption status and stay afloat. + */ + if (ret) + pr_err("deallocation failed for %s page, error %d, encrypt %d\n", + note, ret, encrypt); + else + free_page((unsigned long)*page); + + *page = NULL; + + return ret; +} int hv_synic_alloc(void) { - int cpu; + int cpu, ret = -ENOMEM; struct hv_per_cpu_context *hv_cpu; + const bool decrypt = !vmbus_is_confidential(); /* * First, zero all per-cpu memory areas so hv_synic_free() can @@ -132,7 +185,7 @@ int hv_synic_alloc(void) hv_context.hv_numa_map = kcalloc(nr_node_ids, sizeof(struct cpumask), GFP_KERNEL); - if (hv_context.hv_numa_map == NULL) { + if (!hv_context.hv_numa_map) { pr_err("Unable to allocate NUMA map\n"); goto err; } @@ -141,204 +194,299 @@ int hv_synic_alloc(void) hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); tasklet_init(&hv_cpu->msg_dpc, - vmbus_on_msg_dpc, (unsigned long) hv_cpu); + vmbus_on_msg_dpc, (unsigned long)hv_cpu); + + if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) { + ret = hv_alloc_page(&hv_cpu->post_msg_page, + decrypt, "post msg"); + if (ret) + goto err; + } /* - * Synic message and event pages are allocated by paravisor. - * Skip these pages allocation here. + * If these SynIC pages are not allocated, SIEF and SIM pages + * are configured using what the root partition or the paravisor + * provides upon reading the SIEFP and SIMP registers. 
*/ - if (!hv_isolation_type_snp()) { - hv_cpu->synic_message_page = - (void *)get_zeroed_page(GFP_ATOMIC); - if (hv_cpu->synic_message_page == NULL) { - pr_err("Unable to allocate SYNIC message page\n"); + if (!ms_hyperv.paravisor_present && !hv_root_partition()) { + ret = hv_alloc_page(&hv_cpu->hyp_synic_message_page, + decrypt, "hypervisor SynIC msg"); + if (ret) goto err; - } - - hv_cpu->synic_event_page = - (void *)get_zeroed_page(GFP_ATOMIC); - if (hv_cpu->synic_event_page == NULL) { - pr_err("Unable to allocate SYNIC event page\n"); + ret = hv_alloc_page(&hv_cpu->hyp_synic_event_page, + decrypt, "hypervisor SynIC event"); + if (ret) goto err; - } } - hv_cpu->post_msg_page = (void *)get_zeroed_page(GFP_ATOMIC); - if (hv_cpu->post_msg_page == NULL) { - pr_err("Unable to allocate post msg page\n"); - goto err; + if (vmbus_is_confidential()) { + ret = hv_alloc_page(&hv_cpu->para_synic_message_page, + false, "paravisor SynIC msg"); + if (ret) + goto err; + ret = hv_alloc_page(&hv_cpu->para_synic_event_page, + false, "paravisor SynIC event"); + if (ret) + goto err; } } return 0; + err: /* * Any memory allocations that succeeded will be freed when * the caller cleans up by calling hv_synic_free() */ - return -ENOMEM; + return ret; } - void hv_synic_free(void) { int cpu; + const bool encrypt = !vmbus_is_confidential(); for_each_present_cpu(cpu) { - struct hv_per_cpu_context *hv_cpu - = per_cpu_ptr(hv_context.cpu_context, cpu); - - free_page((unsigned long)hv_cpu->synic_event_page); - free_page((unsigned long)hv_cpu->synic_message_page); - free_page((unsigned long)hv_cpu->post_msg_page); + struct hv_per_cpu_context *hv_cpu = + per_cpu_ptr(hv_context.cpu_context, cpu); + + if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) + hv_free_page(&hv_cpu->post_msg_page, + encrypt, "post msg"); + if (!ms_hyperv.paravisor_present && !hv_root_partition()) { + hv_free_page(&hv_cpu->hyp_synic_event_page, + encrypt, "hypervisor SynIC event"); + hv_free_page(&hv_cpu->hyp_synic_message_page, + encrypt, "hypervisor SynIC msg"); + } + if (vmbus_is_confidential()) { + hv_free_page(&hv_cpu->para_synic_event_page, + false, "paravisor SynIC event"); + hv_free_page(&hv_cpu->para_synic_message_page, + false, "paravisor SynIC msg"); + } } kfree(hv_context.hv_numa_map); } /* - * hv_synic_init - Initialize the Synthetic Interrupt Controller. - * - * If it is already initialized by another entity (ie x2v shim), we need to - * retrieve the initialized message and event pages. Otherwise, we create and - * initialize the message and event pages. + * hv_hyp_synic_enable_regs - Initialize the Synthetic Interrupt Controller + * with the hypervisor. */ -void hv_synic_enable_regs(unsigned int cpu) +void hv_hyp_synic_enable_regs(unsigned int cpu) { - struct hv_per_cpu_context *hv_cpu - = per_cpu_ptr(hv_context.cpu_context, cpu); + struct hv_per_cpu_context *hv_cpu = + per_cpu_ptr(hv_context.cpu_context, cpu); union hv_synic_simp simp; union hv_synic_siefp siefp; union hv_synic_sint shared_sint; - union hv_synic_scontrol sctrl; - /* Setup the Synic's message page */ - simp.as_uint64 = hv_get_register(HV_REGISTER_SIMP); + /* Setup the Synic's message page with the hypervisor. 
*/ + simp.as_uint64 = hv_get_msr(HV_MSR_SIMP); simp.simp_enabled = 1; - if (hv_isolation_type_snp()) { - hv_cpu->synic_message_page - = memremap(simp.base_simp_gpa << HV_HYP_PAGE_SHIFT, - HV_HYP_PAGE_SIZE, MEMREMAP_WB); - if (!hv_cpu->synic_message_page) - pr_err("Fail to map syinc message page.\n"); + if (ms_hyperv.paravisor_present || hv_root_partition()) { + /* Mask out vTOM bit. ioremap_cache() maps decrypted */ + u64 base = (simp.base_simp_gpa << HV_HYP_PAGE_SHIFT) & + ~ms_hyperv.shared_gpa_boundary; + hv_cpu->hyp_synic_message_page = + (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE); + if (!hv_cpu->hyp_synic_message_page) + pr_err("Fail to map synic message page.\n"); } else { - simp.base_simp_gpa = virt_to_phys(hv_cpu->synic_message_page) + simp.base_simp_gpa = virt_to_phys(hv_cpu->hyp_synic_message_page) >> HV_HYP_PAGE_SHIFT; } - hv_set_register(HV_REGISTER_SIMP, simp.as_uint64); + hv_set_msr(HV_MSR_SIMP, simp.as_uint64); - /* Setup the Synic's event page */ - siefp.as_uint64 = hv_get_register(HV_REGISTER_SIEFP); + /* Setup the Synic's event page with the hypervisor. */ + siefp.as_uint64 = hv_get_msr(HV_MSR_SIEFP); siefp.siefp_enabled = 1; - if (hv_isolation_type_snp()) { - hv_cpu->synic_event_page = - memremap(siefp.base_siefp_gpa << HV_HYP_PAGE_SHIFT, - HV_HYP_PAGE_SIZE, MEMREMAP_WB); - - if (!hv_cpu->synic_event_page) - pr_err("Fail to map syinc event page.\n"); + if (ms_hyperv.paravisor_present || hv_root_partition()) { + /* Mask out vTOM bit. ioremap_cache() maps decrypted */ + u64 base = (siefp.base_siefp_gpa << HV_HYP_PAGE_SHIFT) & + ~ms_hyperv.shared_gpa_boundary; + hv_cpu->hyp_synic_event_page = + (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE); + if (!hv_cpu->hyp_synic_event_page) + pr_err("Fail to map synic event page.\n"); } else { - siefp.base_siefp_gpa = virt_to_phys(hv_cpu->synic_event_page) + siefp.base_siefp_gpa = virt_to_phys(hv_cpu->hyp_synic_event_page) >> HV_HYP_PAGE_SHIFT; } - hv_set_register(HV_REGISTER_SIEFP, siefp.as_uint64); + hv_set_msr(HV_MSR_SIEFP, siefp.as_uint64); + hv_enable_coco_interrupt(cpu, vmbus_interrupt, true); /* Setup the shared SINT. */ if (vmbus_irq != -1) enable_percpu_irq(vmbus_irq, 0); - shared_sint.as_uint64 = hv_get_register(HV_REGISTER_SINT0 + - VMBUS_MESSAGE_SINT); + shared_sint.as_uint64 = hv_get_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT); shared_sint.vector = vmbus_interrupt; shared_sint.masked = false; + shared_sint.auto_eoi = hv_recommend_using_aeoi(); + hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64); +} - /* - * On architectures where Hyper-V doesn't support AEOI (e.g., ARM64), - * it doesn't provide a recommendation flag and AEOI must be disabled. - */ -#ifdef HV_DEPRECATING_AEOI_RECOMMENDED - shared_sint.auto_eoi = - !(ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED); -#else - shared_sint.auto_eoi = 0; -#endif - hv_set_register(HV_REGISTER_SINT0 + VMBUS_MESSAGE_SINT, - shared_sint.as_uint64); +static void hv_hyp_synic_enable_interrupts(void) +{ + union hv_synic_scontrol sctrl; /* Enable the global synic bit */ - sctrl.as_uint64 = hv_get_register(HV_REGISTER_SCONTROL); + sctrl.as_uint64 = hv_get_msr(HV_MSR_SCONTROL); sctrl.enable = 1; - hv_set_register(HV_REGISTER_SCONTROL, sctrl.as_uint64); + hv_set_msr(HV_MSR_SCONTROL, sctrl.as_uint64); +} + +static void hv_para_synic_enable_regs(unsigned int cpu) +{ + union hv_synic_simp simp; + union hv_synic_siefp siefp; + struct hv_per_cpu_context *hv_cpu + = per_cpu_ptr(hv_context.cpu_context, cpu); + + /* Setup the Synic's message page with the paravisor. 
*/ + simp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIMP); + simp.simp_enabled = 1; + simp.base_simp_gpa = virt_to_phys(hv_cpu->para_synic_message_page) + >> HV_HYP_PAGE_SHIFT; + hv_para_set_synic_register(HV_MSR_SIMP, simp.as_uint64); + + /* Setup the Synic's event page with the paravisor. */ + siefp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIEFP); + siefp.siefp_enabled = 1; + siefp.base_siefp_gpa = virt_to_phys(hv_cpu->para_synic_event_page) + >> HV_HYP_PAGE_SHIFT; + hv_para_set_synic_register(HV_MSR_SIEFP, siefp.as_uint64); +} + +static void hv_para_synic_enable_interrupts(void) +{ + union hv_synic_scontrol sctrl; + + /* Enable the global synic bit */ + sctrl.as_uint64 = hv_para_get_synic_register(HV_MSR_SCONTROL); + sctrl.enable = 1; + hv_para_set_synic_register(HV_MSR_SCONTROL, sctrl.as_uint64); } int hv_synic_init(unsigned int cpu) { - hv_synic_enable_regs(cpu); + if (vmbus_is_confidential()) + hv_para_synic_enable_regs(cpu); + + /* + * The SINT is set in hv_hyp_synic_enable_regs() by calling + * hv_set_msr(). hv_set_msr() in turn has special case code for the + * SINT MSRs that write to the hypervisor version of the MSR *and* + * the paravisor version of the MSR (but *without* the proxy bit when + * VMBus is confidential). + * + * Then enable interrupts via the paravisor if VMBus is confidential, + * and otherwise via the hypervisor. + */ + + hv_hyp_synic_enable_regs(cpu); + if (vmbus_is_confidential()) + hv_para_synic_enable_interrupts(); + else + hv_hyp_synic_enable_interrupts(); hv_stimer_legacy_init(cpu, VMBUS_MESSAGE_SINT); return 0; } -/* - * hv_synic_cleanup - Cleanup routine for hv_synic_init(). - */ -void hv_synic_disable_regs(unsigned int cpu) +void hv_hyp_synic_disable_regs(unsigned int cpu) { - struct hv_per_cpu_context *hv_cpu - = per_cpu_ptr(hv_context.cpu_context, cpu); + struct hv_per_cpu_context *hv_cpu = + per_cpu_ptr(hv_context.cpu_context, cpu); union hv_synic_sint shared_sint; union hv_synic_simp simp; union hv_synic_siefp siefp; - union hv_synic_scontrol sctrl; - shared_sint.as_uint64 = hv_get_register(HV_REGISTER_SINT0 + - VMBUS_MESSAGE_SINT); + shared_sint.as_uint64 = hv_get_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT); shared_sint.masked = 1; /* Need to correctly cleanup in the case of SMP!!! */ /* Disable the interrupt */ - hv_set_register(HV_REGISTER_SINT0 + VMBUS_MESSAGE_SINT, - shared_sint.as_uint64); + hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64); + hv_enable_coco_interrupt(cpu, vmbus_interrupt, false); - simp.as_uint64 = hv_get_register(HV_REGISTER_SIMP); + simp.as_uint64 = hv_get_msr(HV_MSR_SIMP); /* - * In Isolation VM, sim and sief pages are allocated by + * In Isolation VM, simp and sief pages are allocated by * paravisor. These pages also will be used by kdump * kernel. So just reset enable bit here and keep page * addresses. 
*/ simp.simp_enabled = 0; - if (hv_isolation_type_snp()) - memunmap(hv_cpu->synic_message_page); - else + if (ms_hyperv.paravisor_present || hv_root_partition()) { + if (hv_cpu->hyp_synic_message_page) { + iounmap(hv_cpu->hyp_synic_message_page); + hv_cpu->hyp_synic_message_page = NULL; + } + } else { simp.base_simp_gpa = 0; + } - hv_set_register(HV_REGISTER_SIMP, simp.as_uint64); + hv_set_msr(HV_MSR_SIMP, simp.as_uint64); - siefp.as_uint64 = hv_get_register(HV_REGISTER_SIEFP); + siefp.as_uint64 = hv_get_msr(HV_MSR_SIEFP); siefp.siefp_enabled = 0; - if (hv_isolation_type_snp()) - memunmap(hv_cpu->synic_event_page); - else + if (ms_hyperv.paravisor_present || hv_root_partition()) { + if (hv_cpu->hyp_synic_event_page) { + iounmap(hv_cpu->hyp_synic_event_page); + hv_cpu->hyp_synic_event_page = NULL; + } + } else { siefp.base_siefp_gpa = 0; + } + + hv_set_msr(HV_MSR_SIEFP, siefp.as_uint64); +} - hv_set_register(HV_REGISTER_SIEFP, siefp.as_uint64); +static void hv_hyp_synic_disable_interrupts(void) +{ + union hv_synic_scontrol sctrl; /* Disable the global synic bit */ - sctrl.as_uint64 = hv_get_register(HV_REGISTER_SCONTROL); + sctrl.as_uint64 = hv_get_msr(HV_MSR_SCONTROL); sctrl.enable = 0; - hv_set_register(HV_REGISTER_SCONTROL, sctrl.as_uint64); + hv_set_msr(HV_MSR_SCONTROL, sctrl.as_uint64); +} - if (vmbus_irq != -1) - disable_percpu_irq(vmbus_irq); +static void hv_para_synic_disable_regs(unsigned int cpu) +{ + union hv_synic_simp simp; + union hv_synic_siefp siefp; + + /* Disable SynIC's message page in the paravisor. */ + simp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIMP); + simp.simp_enabled = 0; + hv_para_set_synic_register(HV_MSR_SIMP, simp.as_uint64); + + /* Disable SynIC's event page in the paravisor. */ + siefp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIEFP); + siefp.siefp_enabled = 0; + hv_para_set_synic_register(HV_MSR_SIEFP, siefp.as_uint64); +} + +static void hv_para_synic_disable_interrupts(void) +{ + union hv_synic_scontrol sctrl; + + /* Disable the global synic bit */ + sctrl.as_uint64 = hv_para_get_synic_register(HV_MSR_SCONTROL); + sctrl.enable = 0; + hv_para_set_synic_register(HV_MSR_SCONTROL, sctrl.as_uint64); } #define HV_MAX_TRIES 3 @@ -351,16 +499,18 @@ void hv_synic_disable_regs(unsigned int cpu) * that the normal interrupt handling mechanism will find and process the channel interrupt * "very soon", and in the process clear the bit. 
*/ -static bool hv_synic_event_pending(void) +static bool __hv_synic_event_pending(union hv_synic_event_flags *event, int sint) { - struct hv_per_cpu_context *hv_cpu = this_cpu_ptr(hv_context.cpu_context); - union hv_synic_event_flags *event = - (union hv_synic_event_flags *)hv_cpu->synic_event_page + VMBUS_MESSAGE_SINT; - unsigned long *recv_int_page = event->flags; /* assumes VMBus version >= VERSION_WIN8 */ + unsigned long *recv_int_page; bool pending; u32 relid; int tries = 0; + if (!event) + return false; + + event += sint; + recv_int_page = event->flags; /* assumes VMBus version >= VERSION_WIN8 */ retry: pending = false; for_each_set_bit(relid, recv_int_page, HV_EVENT_FLAGS_COUNT) { @@ -377,10 +527,58 @@ retry: return pending; } +static bool hv_synic_event_pending(void) +{ + struct hv_per_cpu_context *hv_cpu = this_cpu_ptr(hv_context.cpu_context); + union hv_synic_event_flags *hyp_synic_event_page = hv_cpu->hyp_synic_event_page; + union hv_synic_event_flags *para_synic_event_page = hv_cpu->para_synic_event_page; + + return + __hv_synic_event_pending(hyp_synic_event_page, VMBUS_MESSAGE_SINT) || + __hv_synic_event_pending(para_synic_event_page, VMBUS_MESSAGE_SINT); +} + +static int hv_pick_new_cpu(struct vmbus_channel *channel) +{ + int ret = -EBUSY; + int start; + int cpu; + + lockdep_assert_cpus_held(); + lockdep_assert_held(&vmbus_connection.channel_mutex); + + /* + * We can't assume that the relevant interrupts will be sent before + * the cpu is offlined on older versions of hyperv. + */ + if (vmbus_proto_version < VERSION_WIN10_V5_3) + return -EBUSY; + + start = get_random_u32_below(nr_cpu_ids); + + for_each_cpu_wrap(cpu, cpu_online_mask, start) { + if (channel->target_cpu == cpu || + channel->target_cpu == VMBUS_CONNECT_CPU) + continue; + + ret = vmbus_channel_set_cpu(channel, cpu); + if (!ret) + break; + } + + if (ret) + ret = vmbus_channel_set_cpu(channel, VMBUS_CONNECT_CPU); + + return ret; +} + +/* + * hv_synic_cleanup - Cleanup routine for hv_synic_init(). + */ int hv_synic_cleanup(unsigned int cpu) { struct vmbus_channel *channel, *sc; - bool channel_found = false; + int ret = 0; if (vmbus_connection.conn_state != CONNECTED) goto always_cleanup; @@ -397,38 +595,34 @@ int hv_synic_cleanup(unsigned int cpu) /* * Search for channels which are bound to the CPU we're about to - * cleanup. In case we find one and vmbus is still connected, we - * fail; this will effectively prevent CPU offlining. - * - * TODO: Re-bind the channels to different CPUs. + * cleanup. */ mutex_lock(&vmbus_connection.channel_mutex); list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { if (channel->target_cpu == cpu) { - channel_found = true; - break; + ret = hv_pick_new_cpu(channel); + if (ret) { + mutex_unlock(&vmbus_connection.channel_mutex); + return ret; + } } list_for_each_entry(sc, &channel->sc_list, sc_list) { if (sc->target_cpu == cpu) { - channel_found = true; - break; + ret = hv_pick_new_cpu(sc); + if (ret) { + mutex_unlock(&vmbus_connection.channel_mutex); + return ret; + } } } - if (channel_found) - break; } mutex_unlock(&vmbus_connection.channel_mutex); - if (channel_found) - return -EBUSY; - /* - * channel_found == false means that any channels that were previously - * assigned to the CPU have been reassigned elsewhere with a call of - * vmbus_send_modifychannel(). Scan the event flags page looking for - * bits that are set and waiting with a timeout for vmbus_chan_sched() - * to process such bits. 
If bits are still set after this operation - * and VMBus is connected, fail the CPU offlining operation. + * Scan the event flags page looking for bits that are set and waiting + * with a timeout for vmbus_chan_sched() to process such bits. If bits + * are still set after this operation and VMBus is connected, fail the + * CPU offlining operation. */ if (vmbus_proto_version >= VERSION_WIN10_V4_1 && hv_synic_event_pending()) return -EBUSY; @@ -436,7 +630,27 @@ int hv_synic_cleanup(unsigned int cpu) always_cleanup: hv_stimer_legacy_cleanup(cpu); - hv_synic_disable_regs(cpu); + /* + * First, disable the event and message pages + * used for communicating with the host, and then + * disable the host interrupts if VMBus is not + * confidential. + */ + hv_hyp_synic_disable_regs(cpu); + if (!vmbus_is_confidential()) + hv_hyp_synic_disable_interrupts(); - return 0; + /* + * Perform the same steps for the Confidential VMBus. + * The sequencing provides the guarantee that no data + * may be posted for processing before disabling interrupts. + */ + if (vmbus_is_confidential()) { + hv_para_synic_disable_regs(cpu); + hv_para_synic_disable_interrupts(); + } + if (vmbus_irq != -1) + disable_percpu_irq(vmbus_irq); + + return ret; } diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index cbe43e2567a7..2b4080e51f97 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -8,6 +8,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/cleanup.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/mman.h> @@ -24,9 +25,10 @@ #include <linux/notifier.h> #include <linux/percpu_counter.h> #include <linux/page_reporting.h> +#include <linux/sizes.h> #include <linux/hyperv.h> -#include <asm/hyperv-tlfs.h> +#include <hyperv/hvhdk.h> #include <asm/mshyperv.h> @@ -40,8 +42,6 @@ * Begin protocol definitions. */ - - /* * Protocol versions. The low word is the minor version, the high word the major * version. @@ -70,8 +70,6 @@ enum { DYNMEM_PROTOCOL_VERSION_CURRENT = DYNMEM_PROTOCOL_VERSION_WIN10 }; - - /* * Message Types */ @@ -100,7 +98,6 @@ enum dm_message_type { DM_VERSION_1_MAX = 12 }; - /* * Structures defining the dynamic memory management * protocol. @@ -114,7 +111,6 @@ union dm_version { __u32 version; } __packed; - union dm_caps { struct { __u64 balloon:1; @@ -147,8 +143,6 @@ union dm_mem_page_range { __u64 page_range; } __packed; - - /* * The header for all dynamic memory messages: * @@ -173,7 +167,6 @@ struct dm_message { __u8 data[]; /* enclosed message */ } __packed; - /* * Specific message types supporting the dynamic memory protocol. */ @@ -270,7 +263,6 @@ struct dm_status { __u32 io_diff; } __packed; - /* * Message to ask the guest to allocate memory - balloon up message. * This message is sent from the host to the guest. The guest may not be @@ -285,14 +277,13 @@ struct dm_balloon { __u32 reservedz; } __packed; - /* * Balloon response message; this message is sent from the guest * to the host in response to the balloon message. * * reservedz: Reserved; must be set to zero. * more_pages: If FALSE, this is the last message of the transaction. - * if TRUE there will atleast one more message from the guest. + * if TRUE there will be at least one more message from the guest. * * range_count: The number of ranges in the range array. * @@ -313,7 +304,7 @@ struct dm_balloon_response { * to the guest to give guest more memory. * * more_pages: If FALSE, this is the last message of the transaction. - * if TRUE there will atleast one more message from the guest. 
+ * if TRUE there will be at least one more message from the guest. * * reservedz: Reserved; must be set to zero. * @@ -341,7 +332,6 @@ struct dm_unballoon_response { struct dm_header hdr; } __packed; - /* * Hot add request message. Message sent from the host to the guest. * @@ -389,7 +379,6 @@ enum dm_info_type { MAX_INFO_TYPE }; - /* * Header for the information message. */ @@ -424,11 +413,11 @@ struct dm_info_msg { * The range start_pfn : end_pfn specifies the range * that the host has asked us to hot add. The range * start_pfn : ha_end_pfn specifies the range that we have - * currently hot added. We hot add in multiples of 128M - * chunks; it is possible that we may not be able to bring - * online all the pages in the region. The range + * currently hot added. We hot add in chunks equal to the + * memory block size; it is possible that we may not be able + * to bring online all the pages in the region. The range * covered_start_pfn:covered_end_pfn defines the pages that can - * be brough online. + * be brought online. */ struct hv_hotadd_state { @@ -479,10 +468,10 @@ static unsigned long last_post_time; static int hv_hypercall_multi_failure; -module_param(hot_add, bool, (S_IRUGO | S_IWUSR)); +module_param(hot_add, bool, 0644); MODULE_PARM_DESC(hot_add, "If set attempt memory hot_add"); -module_param(pressure_report_delay, uint, (S_IRUGO | S_IWUSR)); +module_param(pressure_report_delay, uint, 0644); MODULE_PARM_DESC(pressure_report_delay, "Delay in secs in reporting pressure"); static atomic_t trans_id = ATOMIC_INIT(0); @@ -501,11 +490,13 @@ enum hv_dm_state { DM_INIT_ERROR }; - static __u8 recv_buffer[HV_HYP_PAGE_SIZE]; static __u8 balloon_up_send_buffer[HV_HYP_PAGE_SIZE]; + +static unsigned long ha_pages_in_chunk; +#define HA_BYTES_IN_CHUNK (ha_pages_in_chunk << PAGE_SHIFT) + #define PAGES_IN_2M (2 * 1024 * 1024 / PAGE_SIZE) -#define HA_CHUNK (128 * 1024 * 1024 / PAGE_SIZE) struct hv_dynmem_device { struct hv_device *dev; @@ -594,12 +585,12 @@ static inline bool has_pfn_is_backed(struct hv_hotadd_state *has, struct hv_hotadd_gap *gap; /* The page is not backed. */ - if ((pfn < has->covered_start_pfn) || (pfn >= has->covered_end_pfn)) + if (pfn < has->covered_start_pfn || pfn >= has->covered_end_pfn) return false; /* Check for gaps. */ list_for_each_entry(gap, &has->gap_list, list) { - if ((pfn >= gap->start_pfn) && (pfn < gap->end_pfn)) + if (pfn >= gap->start_pfn && pfn < gap->end_pfn) return false; } @@ -646,7 +637,7 @@ static int hv_memory_notifier(struct notifier_block *nb, unsigned long val, void *v) { struct memory_notify *mem = (struct memory_notify *)v; - unsigned long flags, pfn_count; + unsigned long pfn_count; switch (val) { case MEM_ONLINE: @@ -655,21 +646,22 @@ static int hv_memory_notifier(struct notifier_block *nb, unsigned long val, break; case MEM_OFFLINE: - spin_lock_irqsave(&dm_device.ha_lock, flags); - pfn_count = hv_page_offline_check(mem->start_pfn, - mem->nr_pages); - if (pfn_count <= dm_device.num_pages_onlined) { - dm_device.num_pages_onlined -= pfn_count; - } else { - /* - * We're offlining more pages than we managed to online. - * This is unexpected. In any case don't let - * num_pages_onlined wrap around zero. 
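has_pfn_is_backed(), touched above, reduces to two checks: the pfn must fall inside the covered range of the hot-add region and must not land inside any recorded gap. A compact sketch of the same logic with a plain array standing in for the kernel's gap list:

#include <stdbool.h>
#include <stdio.h>

struct gap { unsigned long start_pfn, end_pfn; };   /* [start, end) */

static bool pfn_is_backed(unsigned long pfn,
                          unsigned long covered_start, unsigned long covered_end,
                          const struct gap *gaps, int ngaps)
{
    if (pfn < covered_start || pfn >= covered_end)
        return false;                       /* not hot-added (yet) */

    for (int i = 0; i < ngaps; i++)         /* holes punched into the region */
        if (pfn >= gaps[i].start_pfn && pfn < gaps[i].end_pfn)
            return false;

    return true;
}

int main(void)
{
    const struct gap gaps[] = { { 0x1100, 0x1200 } };

    printf("%d %d %d\n",
           pfn_is_backed(0x1050, 0x1000, 0x2000, gaps, 1),   /* 1: backed        */
           pfn_is_backed(0x1150, 0x1000, 0x2000, gaps, 1),   /* 0: inside a gap  */
           pfn_is_backed(0x2100, 0x1000, 0x2000, gaps, 1));  /* 0: not covered   */
    return 0;
}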
- */ - WARN_ON_ONCE(1); - dm_device.num_pages_onlined = 0; + scoped_guard(spinlock_irqsave, &dm_device.ha_lock) { + pfn_count = hv_page_offline_check(mem->start_pfn, + mem->nr_pages); + if (pfn_count <= dm_device.num_pages_onlined) { + dm_device.num_pages_onlined -= pfn_count; + } else { + /* + * We're offlining more pages than we + * managed to online. This is + * unexpected. In any case don't let + * num_pages_onlined wrap around zero. + */ + WARN_ON_ONCE(1); + dm_device.num_pages_onlined = 0; + } } - spin_unlock_irqrestore(&dm_device.ha_lock, flags); break; case MEM_GOING_ONLINE: case MEM_GOING_OFFLINE: @@ -691,9 +683,8 @@ static void hv_page_online_one(struct hv_hotadd_state *has, struct page *pg) if (!PageOffline(pg)) __SetPageOffline(pg); return; - } - if (PageOffline(pg)) - __ClearPageOffline(pg); + } else if (!PageOffline(pg)) + return; /* This frame is currently backed; online the page. */ generic_online_page(pg, 0); @@ -721,30 +712,22 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, unsigned long start_pfn; unsigned long processed_pfn; unsigned long total_pfn = pfn_count; - unsigned long flags; - - for (i = 0; i < (size/HA_CHUNK); i++) { - start_pfn = start + (i * HA_CHUNK); - spin_lock_irqsave(&dm_device.ha_lock, flags); - has->ha_end_pfn += HA_CHUNK; + for (i = 0; i < (size/ha_pages_in_chunk); i++) { + start_pfn = start + (i * ha_pages_in_chunk); - if (total_pfn > HA_CHUNK) { - processed_pfn = HA_CHUNK; - total_pfn -= HA_CHUNK; - } else { - processed_pfn = total_pfn; - total_pfn = 0; + scoped_guard(spinlock_irqsave, &dm_device.ha_lock) { + has->ha_end_pfn += ha_pages_in_chunk; + processed_pfn = umin(total_pfn, ha_pages_in_chunk); + total_pfn -= processed_pfn; + has->covered_end_pfn += processed_pfn; } - has->covered_end_pfn += processed_pfn; - spin_unlock_irqrestore(&dm_device.ha_lock, flags); - reinit_completion(&dm_device.ol_waitevent); nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn)); ret = add_memory(nid, PFN_PHYS((start_pfn)), - (HA_CHUNK << PAGE_SHIFT), MHP_MERGE_RESOURCE); + HA_BYTES_IN_CHUNK, MHP_MERGE_RESOURCE); if (ret) { pr_err("hot_add memory failed error is %d\n", ret); @@ -758,10 +741,10 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, */ do_hot_add = false; } - spin_lock_irqsave(&dm_device.ha_lock, flags); - has->ha_end_pfn -= HA_CHUNK; - has->covered_end_pfn -= processed_pfn; - spin_unlock_irqrestore(&dm_device.ha_lock, flags); + scoped_guard(spinlock_irqsave, &dm_device.ha_lock) { + has->ha_end_pfn -= ha_pages_in_chunk; + has->covered_end_pfn -= processed_pfn; + } break; } @@ -773,7 +756,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, * adding succeeded, it is ok to proceed even if the memory was * not onlined in time. */ - wait_for_completion_timeout(&dm_device.ol_waitevent, 5 * HZ); + wait_for_completion_timeout(&dm_device.ol_waitevent, secs_to_jiffies(5)); post_status(&dm_device); } } @@ -781,31 +764,30 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, static void hv_online_page(struct page *pg, unsigned int order) { struct hv_hotadd_state *has; - unsigned long flags; unsigned long pfn = page_to_pfn(pg); - spin_lock_irqsave(&dm_device.ha_lock, flags); - list_for_each_entry(has, &dm_device.ha_region_list, list) { - /* The page belongs to a different HAS. */ - if ((pfn < has->start_pfn) || + scoped_guard(spinlock_irqsave, &dm_device.ha_lock) { + list_for_each_entry(has, &dm_device.ha_region_list, list) { + /* The page belongs to a different HAS. 
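The rewritten hv_mem_hot_add() loop above walks the region one chunk at a time and, per iteration, accounts for umin(total_pfn, ha_pages_in_chunk) requested pages, which is exactly what the removed if/else computed. The slicing arithmetic on its own (fixed example numbers, no locking and no add_memory() call):

#include <stdio.h>

static unsigned long umin(unsigned long a, unsigned long b)
{
    return a < b ? a : b;
}

int main(void)
{
    unsigned long ha_pages_in_chunk = 32768;      /* 128 MiB of 4 KiB pages            */
    unsigned long size = 3 * ha_pages_in_chunk;   /* pages in the region handed to us  */
    unsigned long total_pfn = 70000;              /* pages the host actually asked for */

    for (unsigned long i = 0; i < size / ha_pages_in_chunk; i++) {
        unsigned long processed_pfn = umin(total_pfn, ha_pages_in_chunk);

        total_pfn -= processed_pfn;
        printf("chunk %lu: add one full chunk, mark %lu pages covered, %lu still pending\n",
               i, processed_pfn, total_pfn);
    }
    return 0;
}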
*/ + if (pfn < has->start_pfn || (pfn + (1UL << order) > has->end_pfn)) - continue; + continue; - hv_bring_pgs_online(has, pfn, 1UL << order); - break; + hv_bring_pgs_online(has, pfn, 1UL << order); + return; + } } - spin_unlock_irqrestore(&dm_device.ha_lock, flags); + generic_online_page(pg, order); } static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt) { struct hv_hotadd_state *has; struct hv_hotadd_gap *gap; - unsigned long residual, new_inc; + unsigned long residual; int ret = 0; - unsigned long flags; - spin_lock_irqsave(&dm_device.ha_lock, flags); + guard(spinlock_irqsave)(&dm_device.ha_lock); list_for_each_entry(has, &dm_device.ha_region_list, list) { /* * If the pfn range we are dealing with is not in the current @@ -838,27 +820,20 @@ static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt) * our current limit; extend it. */ if ((start_pfn + pfn_cnt) > has->end_pfn) { + /* Extend the region by multiples of ha_pages_in_chunk */ residual = (start_pfn + pfn_cnt - has->end_pfn); - /* - * Extend the region by multiples of HA_CHUNK. - */ - new_inc = (residual / HA_CHUNK) * HA_CHUNK; - if (residual % HA_CHUNK) - new_inc += HA_CHUNK; - - has->end_pfn += new_inc; + has->end_pfn += ALIGN(residual, ha_pages_in_chunk); } ret = 1; break; } - spin_unlock_irqrestore(&dm_device.ha_lock, flags); return ret; } static unsigned long handle_pg_range(unsigned long pg_start, - unsigned long pg_count) + unsigned long pg_count) { unsigned long start_pfn = pg_start; unsigned long pfn_cnt = pg_count; @@ -869,7 +844,7 @@ static unsigned long handle_pg_range(unsigned long pg_start, unsigned long res = 0, flags; pr_debug("Hot adding %lu pages starting at pfn 0x%lx.\n", pg_count, - pg_start); + pg_start); spin_lock_irqsave(&dm_device.ha_lock, flags); list_for_each_entry(has, &dm_device.ha_region_list, list) { @@ -905,22 +880,19 @@ static unsigned long handle_pg_range(unsigned long pg_start, if (start_pfn > has->start_pfn && online_section_nr(pfn_to_section_nr(start_pfn))) hv_bring_pgs_online(has, start_pfn, pgs_ol); - } - if ((has->ha_end_pfn < has->end_pfn) && (pfn_cnt > 0)) { + if (has->ha_end_pfn < has->end_pfn && pfn_cnt > 0) { /* * We have some residual hot add range * that needs to be hot added; hot add * it now. Hot add a multiple of - * HA_CHUNK that fully covers the pages + * ha_pages_in_chunk that fully covers the pages * we have. 
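Several hunks in this file replace open-coded round-up arithmetic with ALIGN()/ALIGN_DOWN() on ha_pages_in_chunk, both in pfn_covered() and in handle_pg_range()/hot_add_req() below. A quick check that the two forms agree (local macro copies mirroring the kernel's, valid only for power-of-two alignments):

#include <stdio.h>

/* Local copies of the kernel macros; the alignment must be a power of two. */
#define ALIGN(x, a)      (((x) + (a) - 1) & ~((a) - 1))
#define ALIGN_DOWN(x, a) ((x) & ~((a) - 1))

int main(void)
{
    unsigned long ha_pages_in_chunk = 32768;      /* 128 MiB memory block, 4 KiB pages */
    unsigned long samples[] = { 1000, 32768, 50000 };

    for (int i = 0; i < 3; i++) {
        unsigned long x = samples[i];
        /* What the removed code computed with a divide and a remainder test. */
        unsigned long old_round_up = (x / ha_pages_in_chunk) * ha_pages_in_chunk +
                                     (x % ha_pages_in_chunk ? ha_pages_in_chunk : 0);

        printf("%5lu pages: ALIGN=%lu old=%lu ALIGN_DOWN=%lu\n",
               x, ALIGN(x, ha_pages_in_chunk), old_round_up,
               ALIGN_DOWN(x, ha_pages_in_chunk));
    }
    return 0;
}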
*/ size = (has->end_pfn - has->ha_end_pfn); if (pfn_cnt <= size) { - size = ((pfn_cnt / HA_CHUNK) * HA_CHUNK); - if (pfn_cnt % HA_CHUNK) - size += HA_CHUNK; + size = ALIGN(pfn_cnt, ha_pages_in_chunk); } else { pfn_cnt = size; } @@ -947,7 +919,6 @@ static unsigned long process_hot_add(unsigned long pg_start, { struct hv_hotadd_state *ha_region = NULL; int covered; - unsigned long flags; if (pfn_cnt == 0) return 0; @@ -979,9 +950,9 @@ static unsigned long process_hot_add(unsigned long pg_start, ha_region->covered_end_pfn = pg_start; ha_region->end_pfn = rg_start + rg_size; - spin_lock_irqsave(&dm_device.ha_lock, flags); - list_add_tail(&ha_region->list, &dm_device.ha_region_list); - spin_unlock_irqrestore(&dm_device.ha_lock, flags); + scoped_guard(spinlock_irqsave, &dm_device.ha_lock) { + list_add_tail(&ha_region->list, &dm_device.ha_region_list); + } } do_pg_range: @@ -1014,10 +985,7 @@ static void hot_add_req(struct work_struct *dummy) rg_start = dm->ha_wrk.ha_region_range.finfo.start_page; rg_sz = dm->ha_wrk.ha_region_range.finfo.page_cnt; - if ((rg_start == 0) && (!dm->host_specified_ha_region)) { - unsigned long region_size; - unsigned long region_start; - + if (rg_start == 0 && !dm->host_specified_ha_region) { /* * The host has not specified the hot-add region. * Based on the hot-add page range being specified, @@ -1025,19 +993,13 @@ static void hot_add_req(struct work_struct *dummy) * that need to be hot-added while ensuring the alignment * and size requirements of Linux as it relates to hot-add. */ - region_size = (pfn_cnt / HA_CHUNK) * HA_CHUNK; - if (pfn_cnt % HA_CHUNK) - region_size += HA_CHUNK; - - region_start = (pg_start / HA_CHUNK) * HA_CHUNK; - - rg_start = region_start; - rg_sz = region_size; + rg_start = ALIGN_DOWN(pg_start, ha_pages_in_chunk); + rg_sz = ALIGN(pfn_cnt, ha_pages_in_chunk); } if (do_hot_add) resp.page_count = process_hot_add(pg_start, pfn_cnt, - rg_start, rg_sz); + rg_start, rg_sz); dm->num_pages_added += resp.page_count; #endif @@ -1215,11 +1177,10 @@ static void post_status(struct hv_dynmem_device *dm) sizeof(struct dm_status), (unsigned long)NULL, VM_PKT_DATA_INBAND, 0); - } static void free_balloon_pages(struct hv_dynmem_device *dm, - union dm_mem_page_range *range_array) + union dm_mem_page_range *range_array) { int num_pages = range_array->finfo.page_cnt; __u64 start_frame = range_array->finfo.start_page; @@ -1231,12 +1192,11 @@ static void free_balloon_pages(struct hv_dynmem_device *dm, __ClearPageOffline(pg); __free_page(pg); dm->num_pages_ballooned--; + mod_node_page_state(page_pgdat(pg), NR_BALLOON_PAGES, -1); adjust_managed_page_count(pg, 1); } } - - static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm, unsigned int num_pages, struct dm_balloon_response *bl_resp, @@ -1262,6 +1222,7 @@ static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm, return i * alloc_unit; dm->num_pages_ballooned += alloc_unit; + mod_node_page_state(page_pgdat(pg), NR_BALLOON_PAGES, alloc_unit); /* * If we allocatted 2M pages; split them so we @@ -1282,7 +1243,6 @@ static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm, page_to_pfn(pg); bl_resp->range_array[i].finfo.page_cnt = alloc_unit; bl_resp->hdr.size += sizeof(union dm_mem_page_range); - } return i * alloc_unit; @@ -1336,7 +1296,7 @@ static void balloon_up(struct work_struct *dummy) if (num_ballooned == 0 || num_ballooned == num_pages) { pr_debug("Ballooned %u out of %u requested pages.\n", - num_pages, dm_device.balloon_wrk.num_pages); + num_pages, 
dm_device.balloon_wrk.num_pages); bl_resp->more_pages = 0; done = true; @@ -1370,16 +1330,15 @@ static void balloon_up(struct work_struct *dummy) for (i = 0; i < bl_resp->range_count; i++) free_balloon_pages(&dm_device, - &bl_resp->range_array[i]); + &bl_resp->range_array[i]); done = true; } } - } static void balloon_down(struct hv_dynmem_device *dm, - struct dm_unballoon_request *req) + struct dm_unballoon_request *req) { union dm_mem_page_range *range_array = req->range_array; int range_count = req->range_count; @@ -1393,7 +1352,7 @@ static void balloon_down(struct hv_dynmem_device *dm, } pr_debug("Freed %u ballooned pages.\n", - prev_pages_ballooned - dm->num_pages_ballooned); + prev_pages_ballooned - dm->num_pages_ballooned); if (req->more_pages == 1) return; @@ -1418,8 +1377,8 @@ static int dm_thread_func(void *dm_dev) struct hv_dynmem_device *dm = dm_dev; while (!kthread_should_stop()) { - wait_for_completion_interruptible_timeout( - &dm_device.config_event, 1*HZ); + wait_for_completion_interruptible_timeout(&dm_device.config_event, + secs_to_jiffies(1)); /* * The host expects us to post information on the memory * pressure every second. @@ -1443,9 +1402,8 @@ static int dm_thread_func(void *dm_dev) return 0; } - static void version_resp(struct hv_dynmem_device *dm, - struct dm_version_response *vresp) + struct dm_version_response *vresp) { struct dm_version_request version_req; int ret; @@ -1506,7 +1464,7 @@ version_error: } static void cap_resp(struct hv_dynmem_device *dm, - struct dm_capabilities_resp_msg *cap_resp) + struct dm_capabilities_resp_msg *cap_resp) { if (!cap_resp->is_accepted) { pr_err("Capabilities not accepted by host\n"); @@ -1539,7 +1497,7 @@ static void balloon_onchannelcallback(void *context) switch (dm_hdr->type) { case DM_VERSION_RESPONSE: version_resp(dm, - (struct dm_version_response *)dm_msg); + (struct dm_version_response *)dm_msg); break; case DM_CAPABILITIES_RESPONSE: @@ -1569,7 +1527,7 @@ static void balloon_onchannelcallback(void *context) dm->state = DM_BALLOON_DOWN; balloon_down(dm, - (struct dm_unballoon_request *)recv_buffer); + (struct dm_unballoon_request *)recv_buffer); break; case DM_MEM_HOT_ADD_REQUEST: @@ -1607,17 +1565,15 @@ static void balloon_onchannelcallback(void *context) default: pr_warn_ratelimited("Unhandled message: type: %d\n", dm_hdr->type); - } } - } #define HV_LARGE_REPORTING_ORDER 9 #define HV_LARGE_REPORTING_LEN (HV_HYP_PAGE_SIZE << \ HV_LARGE_REPORTING_ORDER) static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info, - struct scatterlist *sgl, unsigned int nents) + struct scatterlist *sgl, unsigned int nents) { unsigned long flags; struct hv_memory_hint *hint; @@ -1628,13 +1584,13 @@ static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info, WARN_ON_ONCE(nents > HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES); WARN_ON_ONCE(sgl->length < (HV_HYP_PAGE_SIZE << page_reporting_order)); local_irq_save(flags); - hint = *(struct hv_memory_hint **)this_cpu_ptr(hyperv_pcpu_input_arg); + hint = *this_cpu_ptr(hyperv_pcpu_input_arg); if (!hint) { local_irq_restore(flags); return -ENOSPC; } - hint->type = HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD; + hint->heat_type = HV_EXTMEM_HEAT_HINT_COLD_DISCARD; hint->reserved = 0; for_each_sg(sgl, sg, nents, i) { union hv_gpa_page_range *range; @@ -1652,7 +1608,7 @@ static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info, */ /* page reporting for pages 2MB or higher */ - if (order >= HV_LARGE_REPORTING_ORDER ) { + if (order >= HV_LARGE_REPORTING_ORDER) { 
range->page.largepage = 1; range->page_size = HV_GPA_PAGE_RANGE_PAGE_SIZE_2MB; range->base_large_pfn = page_to_hvpfn( @@ -1666,23 +1622,21 @@ static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info, range->page.additional_pages = (sg->length / HV_HYP_PAGE_SIZE) - 1; } - } status = hv_do_rep_hypercall(HV_EXT_CALL_MEMORY_HEAT_HINT, nents, 0, hint, NULL); local_irq_restore(flags); if (!hv_result_success(status)) { - pr_err("Cold memory discard hypercall failed with status %llx\n", - status); + status); if (hv_hypercall_multi_failure > 0) hv_hypercall_multi_failure++; if (hv_result(status) == HV_STATUS_INVALID_PARAMETER) { pr_err("Underlying Hyper-V does not support order less than 9. Hypercall failed\n"); pr_err("Defaulting to page_reporting_order %d\n", - pageblock_order); + pageblock_order); page_reporting_order = pageblock_order; hv_hypercall_multi_failure++; return -EINVAL; @@ -1716,7 +1670,7 @@ static void enable_page_reporting(void) pr_err("Failed to enable cold memory discard: %d\n", ret); } else { pr_info("Cold memory discard hint enabled with order %d\n", - page_reporting_order); + page_reporting_order); } } @@ -1799,7 +1753,7 @@ static int balloon_connect_vsp(struct hv_device *dev) if (ret) goto out; - t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ); + t = wait_for_completion_timeout(&dm_device.host_event, secs_to_jiffies(5)); if (t == 0) { ret = -ETIMEDOUT; goto out; @@ -1835,10 +1789,13 @@ static int balloon_connect_vsp(struct hv_device *dev) cap_msg.caps.cap_bits.hot_add = hot_add_enabled(); /* - * Specify our alignment requirements as it relates - * memory hot-add. Specify 128MB alignment. + * Specify our alignment requirements for memory hot-add. The value is + * the log base 2 of the number of megabytes in a chunk. For example, + * with 256 MiB chunks, the value is 8. The number of MiB in a chunk + * must be a power of 2. 
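The new comment just below spells out the encoding of hot_add_alignment: the log base 2 of the chunk size in MiB, so a 128 MiB chunk yields 7 and a 256 MiB chunk yields 8. A few values run through that formula (ilog2_ul() is a local helper standing in for the kernel's ilog2() for the power-of-two sizes used here):

#include <stdio.h>

/* Minimal ilog2 for the power-of-two values used here. */
static unsigned int ilog2_ul(unsigned long v)
{
    unsigned int r = 0;

    while (v >>= 1)
        r++;
    return r;
}

int main(void)
{
    const unsigned long sz_1m = 1UL << 20;
    unsigned long chunk_bytes[] = { 128UL << 20, 256UL << 20, 2048UL << 20 };

    for (int i = 0; i < 3; i++)
        printf("%4lu MiB chunk -> hot_add_alignment = %u\n",
               chunk_bytes[i] >> 20, ilog2_ul(chunk_bytes[i] / sz_1m));
    return 0;
}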
*/ - cap_msg.caps.cap_bits.hot_add_alignment = 7; + cap_msg.caps.cap_bits.hot_add_alignment = + ilog2(HA_BYTES_IN_CHUNK / SZ_1M); /* * Currently the host does not use these @@ -1854,7 +1811,7 @@ static int balloon_connect_vsp(struct hv_device *dev) if (ret) goto out; - t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ); + t = wait_for_completion_timeout(&dm_device.host_event, secs_to_jiffies(5)); if (t == 0) { ret = -ETIMEDOUT; goto out; @@ -1895,8 +1852,8 @@ static int hv_balloon_debug_show(struct seq_file *f, void *offset) char *sname; seq_printf(f, "%-22s: %u.%u\n", "host_version", - DYNMEM_MAJOR_VERSION(dm->version), - DYNMEM_MINOR_VERSION(dm->version)); + DYNMEM_MAJOR_VERSION(dm->version), + DYNMEM_MINOR_VERSION(dm->version)); seq_printf(f, "%-22s:", "capabilities"); if (ballooning_enabled()) @@ -1945,10 +1902,10 @@ static int hv_balloon_debug_show(struct seq_file *f, void *offset) seq_printf(f, "%-22s: %u\n", "pages_ballooned", dm->num_pages_ballooned); seq_printf(f, "%-22s: %lu\n", "total_pages_committed", - get_pages_committed(dm)); + get_pages_committed(dm)); seq_printf(f, "%-22s: %llu\n", "max_dynamic_page_count", - dm->max_dynamic_page_count); + dm->max_dynamic_page_count); return 0; } @@ -1958,12 +1915,12 @@ DEFINE_SHOW_ATTRIBUTE(hv_balloon_debug); static void hv_balloon_debugfs_init(struct hv_dynmem_device *b) { debugfs_create_file("hv-balloon", 0444, NULL, b, - &hv_balloon_debug_fops); + &hv_balloon_debug_fops); } static void hv_balloon_debugfs_exit(struct hv_dynmem_device *b) { - debugfs_remove(debugfs_lookup("hv-balloon", NULL)); + debugfs_lookup_and_remove("hv-balloon", NULL); } #else @@ -1988,8 +1945,23 @@ static int balloon_probe(struct hv_device *dev, hot_add = false; #ifdef CONFIG_MEMORY_HOTPLUG + /* + * Hot-add must operate in chunks that are of size equal to the + * memory block size because that's what the core add_memory() + * interface requires. The Hyper-V interface requires that the memory + * block size be a power of 2, which is guaranteed by the check in + * memory_dev_init(). + */ + ha_pages_in_chunk = memory_block_size_bytes() / PAGE_SIZE; do_hot_add = hot_add; #else + /* + * Without MEMORY_HOTPLUG, the guest returns a failure status for all + * hot add requests from Hyper-V, and the chunk size is used only to + * specify alignment to Hyper-V as required by the host/guest protocol. + * Somewhat arbitrarily, use 128 MiB. 
+ */ + ha_pages_in_chunk = SZ_128M / PAGE_SIZE; do_hot_add = false; #endif dm_device.dev = dev; @@ -2042,12 +2014,11 @@ connect_error: return ret; } -static int balloon_remove(struct hv_device *dev) +static void balloon_remove(struct hv_device *dev) { struct hv_dynmem_device *dm = hv_get_drvdata(dev); struct hv_hotadd_state *has, *tmp; struct hv_hotadd_gap *gap, *tmp_gap; - unsigned long flags; if (dm->num_pages_ballooned != 0) pr_warn("Ballooned pages: %d\n", dm->num_pages_ballooned); @@ -2073,7 +2044,7 @@ static int balloon_remove(struct hv_device *dev) #endif } - spin_lock_irqsave(&dm_device.ha_lock, flags); + guard(spinlock_irqsave)(&dm_device.ha_lock); list_for_each_entry_safe(has, tmp, &dm->ha_region_list, list) { list_for_each_entry_safe(gap, tmp_gap, &has->gap_list, list) { list_del(&gap->list); @@ -2082,9 +2053,6 @@ static int balloon_remove(struct hv_device *dev) list_del(&has->list); kfree(has); } - spin_unlock_irqrestore(&dm_device.ha_lock, flags); - - return 0; } static int balloon_suspend(struct hv_device *hv_dev) @@ -2105,7 +2073,6 @@ static int balloon_suspend(struct hv_device *hv_dev) tasklet_enable(&hv_dev->channel->callback_event); return 0; - } static int balloon_resume(struct hv_device *dev) @@ -2164,7 +2131,6 @@ static struct hv_driver balloon_drv = { static int __init init_balloon_drv(void) { - return vmbus_driver_register(&balloon_drv); } diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index ae68298c0dca..0a3ab7efed46 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -17,24 +17,37 @@ #include <linux/export.h> #include <linux/bitfield.h> #include <linux/cpumask.h> +#include <linux/sched/task_stack.h> #include <linux/panic_notifier.h> #include <linux/ptrace.h> +#include <linux/random.h> +#include <linux/efi.h> +#include <linux/kdebug.h> +#include <linux/kmsg_dump.h> +#include <linux/sizes.h> #include <linux/slab.h> #include <linux/dma-map-ops.h> -#include <asm/hyperv-tlfs.h> +#include <linux/set_memory.h> +#include <hyperv/hvhdk.h> #include <asm/mshyperv.h> +u64 hv_current_partition_id = HV_PARTITION_ID_SELF; +EXPORT_SYMBOL_GPL(hv_current_partition_id); + +enum hv_partition_type hv_curr_partition_type; +EXPORT_SYMBOL_GPL(hv_curr_partition_type); + /* - * hv_root_partition and ms_hyperv are defined here with other Hyper-V - * specific globals so they are shared across all architectures and are + * ms_hyperv and hv_nested are defined here with other + * Hyper-V specific globals so they are shared across all architectures and are * built only when CONFIG_HYPERV is defined. But on x86, * ms_hyperv_init_platform() is built even when CONFIG_HYPERV is not - * defined, and it uses these two variables. So mark them as __weak + * defined, and it uses these three variables. So mark them as __weak * here, allowing for an overriding definition in the module containing * ms_hyperv_init_platform(). */ -bool __weak hv_root_partition; -EXPORT_SYMBOL_GPL(hv_root_partition); +bool __weak hv_nested; +EXPORT_SYMBOL_GPL(hv_nested); struct ms_hyperv_info __weak ms_hyperv; EXPORT_SYMBOL_GPL(ms_hyperv); @@ -51,6 +64,20 @@ EXPORT_SYMBOL_GPL(hyperv_pcpu_input_arg); void * __percpu *hyperv_pcpu_output_arg; EXPORT_SYMBOL_GPL(hyperv_pcpu_output_arg); +static void hv_kmsg_dump_unregister(void); + +static struct ctl_table_header *hv_ctl_table_hdr; + +/* + * Per-cpu array holding the tail pointer for the SynIC event ring buffer + * for each SINT. 
+ * + * We cannot maintain this in mshv driver because the tail pointer should + * persist even if the mshv driver is unloaded. + */ +u8 * __percpu *hv_synic_eventring_tail; +EXPORT_SYMBOL_GPL(hv_synic_eventring_tail); + /* * Hyper-V specific initialization and shutdown code that is * common across all architectures. Called from architecture @@ -59,6 +86,12 @@ EXPORT_SYMBOL_GPL(hyperv_pcpu_output_arg); void __init hv_common_free(void) { + unregister_sysctl_table(hv_ctl_table_hdr); + hv_ctl_table_hdr = NULL; + + if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) + hv_kmsg_dump_unregister(); + kfree(hv_vp_index); hv_vp_index = NULL; @@ -67,11 +100,230 @@ void __init hv_common_free(void) free_percpu(hyperv_pcpu_input_arg); hyperv_pcpu_input_arg = NULL; + + free_percpu(hv_synic_eventring_tail); + hv_synic_eventring_tail = NULL; } +static void *hv_panic_page; + +/* + * Boolean to control whether to report panic messages over Hyper-V. + * + * It can be set via /proc/sys/kernel/hyperv_record_panic_msg + */ +static int sysctl_record_panic_msg = 1; + +/* + * sysctl option to allow the user to control whether kmsg data should be + * reported to Hyper-V on panic. + */ +static const struct ctl_table hv_ctl_table[] = { + { + .procname = "hyperv_record_panic_msg", + .data = &sysctl_record_panic_msg, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE + }, +}; + +static int hv_die_panic_notify_crash(struct notifier_block *self, + unsigned long val, void *args); + +static struct notifier_block hyperv_die_report_block = { + .notifier_call = hv_die_panic_notify_crash, +}; + +static struct notifier_block hyperv_panic_report_block = { + .notifier_call = hv_die_panic_notify_crash, +}; + +/* + * The following callback works both as die and panic notifier; its + * goal is to provide panic information to the hypervisor unless the + * kmsg dumper is used [see hv_kmsg_dump()], which provides more + * information but isn't always available. + * + * Notice that both the panic/die report notifiers are registered only + * if we have the capability HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE set. + */ +static int hv_die_panic_notify_crash(struct notifier_block *self, + unsigned long val, void *args) +{ + struct pt_regs *regs; + bool is_die; + + /* Don't notify Hyper-V unless we have a die oops event or panic. */ + if (self == &hyperv_panic_report_block) { + is_die = false; + regs = current_pt_regs(); + } else { /* die event */ + if (val != DIE_OOPS) + return NOTIFY_DONE; + + is_die = true; + regs = ((struct die_args *)args)->regs; + } + + /* + * Hyper-V should be notified only once about a panic/die. If we will + * be calling hv_kmsg_dump() later with kmsg data, don't do the + * notification here. + */ + if (!sysctl_record_panic_msg || !hv_panic_page) + hyperv_report_panic(regs, val, is_die); + + return NOTIFY_DONE; +} + +/* + * Callback from kmsg_dump. Grab as much as possible from the end of the kmsg + * buffer and call into Hyper-V to transfer the data. + */ +static void hv_kmsg_dump(struct kmsg_dumper *dumper, + struct kmsg_dump_detail *detail) +{ + struct kmsg_dump_iter iter; + size_t bytes_written; + + /* We are only interested in panics. */ + if (detail->reason != KMSG_DUMP_PANIC || !sysctl_record_panic_msg) + return; + + /* + * Write dump contents to the page. No need to synchronize; panic should + * be single-threaded. 
+ */ + kmsg_dump_rewind(&iter); + kmsg_dump_get_buffer(&iter, false, hv_panic_page, HV_HYP_PAGE_SIZE, + &bytes_written); + if (!bytes_written) + return; + /* + * P3 to contain the physical address of the panic page & P4 to + * contain the size of the panic data in that page. Rest of the + * registers are no-op when the NOTIFY_MSG flag is set. + */ + hv_set_msr(HV_MSR_CRASH_P0, 0); + hv_set_msr(HV_MSR_CRASH_P1, 0); + hv_set_msr(HV_MSR_CRASH_P2, 0); + hv_set_msr(HV_MSR_CRASH_P3, virt_to_phys(hv_panic_page)); + hv_set_msr(HV_MSR_CRASH_P4, bytes_written); + + /* + * Let Hyper-V know there is crash data available along with + * the panic message. + */ + hv_set_msr(HV_MSR_CRASH_CTL, + (HV_CRASH_CTL_CRASH_NOTIFY | + HV_CRASH_CTL_CRASH_NOTIFY_MSG)); +} + +static struct kmsg_dumper hv_kmsg_dumper = { + .dump = hv_kmsg_dump, +}; + +static void hv_kmsg_dump_unregister(void) +{ + kmsg_dump_unregister(&hv_kmsg_dumper); + unregister_die_notifier(&hyperv_die_report_block); + atomic_notifier_chain_unregister(&panic_notifier_list, + &hyperv_panic_report_block); + + kfree(hv_panic_page); + hv_panic_page = NULL; +} + +static void hv_kmsg_dump_register(void) +{ + int ret; + + hv_panic_page = kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL); + if (!hv_panic_page) { + pr_err("Hyper-V: panic message page memory allocation failed\n"); + return; + } + + ret = kmsg_dump_register(&hv_kmsg_dumper); + if (ret) { + pr_err("Hyper-V: kmsg dump register error 0x%x\n", ret); + kfree(hv_panic_page); + hv_panic_page = NULL; + } +} + +static inline bool hv_output_page_exists(void) +{ + return hv_parent_partition() || IS_ENABLED(CONFIG_HYPERV_VTL_MODE); +} + +void __init hv_get_partition_id(void) +{ + struct hv_output_get_partition_id *output; + unsigned long flags; + u64 status, pt_id; + + local_irq_save(flags); + output = *this_cpu_ptr(hyperv_pcpu_input_arg); + status = hv_do_hypercall(HVCALL_GET_PARTITION_ID, NULL, output); + pt_id = output->partition_id; + local_irq_restore(flags); + + if (hv_result_success(status)) + hv_current_partition_id = pt_id; + else + pr_err("Hyper-V: failed to get partition ID: %#x\n", + hv_result(status)); +} +#if IS_ENABLED(CONFIG_HYPERV_VTL_MODE) +u8 __init get_vtl(void) +{ + u64 control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_REGISTERS; + struct hv_input_get_vp_registers *input; + struct hv_output_get_vp_registers *output; + unsigned long flags; + u64 ret; + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + memset(input, 0, struct_size(input, names, 1)); + input->partition_id = HV_PARTITION_ID_SELF; + input->vp_index = HV_VP_INDEX_SELF; + input->input_vtl.as_uint8 = 0; + input->names[0] = HV_REGISTER_VSM_VP_STATUS; + + ret = hv_do_hypercall(control, input, output); + if (hv_result_success(ret)) { + ret = output->values[0].reg8 & HV_VTL_MASK; + } else { + pr_err("Failed to get VTL(error: %lld) exiting...\n", ret); + BUG(); + } + + local_irq_restore(flags); + return ret; +} +#endif + int __init hv_common_init(void) { int i; + union hv_hypervisor_version_info version; + + /* Get information about the Microsoft Hypervisor version */ + if (!hv_get_hypervisor_version(&version)) + pr_info("Hyper-V: Hypervisor Build %d.%d.%d.%d-%d-%d\n", + version.major_version, version.minor_version, + version.build_number, version.service_number, + version.service_pack, version.service_branch); + + if (hv_is_isolation_supported()) + sysctl_record_panic_msg = 0; /* * Hyper-V expects to get crash register data or kmsg when @@ -81,8 +333,33 @@ int 
__init hv_common_init(void) * kernel. */ if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) { + u64 hyperv_crash_ctl; + crash_kexec_post_notifiers = true; pr_info("Hyper-V: enabling crash_kexec_post_notifiers\n"); + + /* + * Panic message recording (sysctl_record_panic_msg) + * is enabled by default in non-isolated guests and + * disabled by default in isolated guests; the panic + * message recording won't be available in isolated + * guests should the following registration fail. + */ + hv_ctl_table_hdr = register_sysctl("kernel", hv_ctl_table); + if (!hv_ctl_table_hdr) + pr_err("Hyper-V: sysctl table register error"); + + /* + * Register for panic kmsg callback only if the right + * capability is supported by the hypervisor. + */ + hyperv_crash_ctl = hv_get_msr(HV_MSR_CRASH_CTL); + if (hyperv_crash_ctl & HV_CRASH_CTL_CRASH_NOTIFY_MSG) + hv_kmsg_dump_register(); + + register_die_notifier(&hyperv_die_report_block); + atomic_notifier_chain_register(&panic_notifier_list, + &hyperv_panic_report_block); } /* @@ -95,24 +372,95 @@ int __init hv_common_init(void) BUG_ON(!hyperv_pcpu_input_arg); /* Allocate the per-CPU state for output arg for root */ - if (hv_root_partition) { + if (hv_output_page_exists()) { hyperv_pcpu_output_arg = alloc_percpu(void *); BUG_ON(!hyperv_pcpu_output_arg); } - hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index), + if (hv_parent_partition()) { + hv_synic_eventring_tail = alloc_percpu(u8 *); + BUG_ON(!hv_synic_eventring_tail); + } + + hv_vp_index = kmalloc_array(nr_cpu_ids, sizeof(*hv_vp_index), GFP_KERNEL); if (!hv_vp_index) { hv_common_free(); return -ENOMEM; } - for (i = 0; i < num_possible_cpus(); i++) + for (i = 0; i < nr_cpu_ids; i++) hv_vp_index[i] = VP_INVAL; return 0; } +void __init ms_hyperv_late_init(void) +{ + struct acpi_table_header *header; + acpi_status status; + u8 *randomdata; + u32 length, i; + + /* + * Seed the Linux random number generator with entropy provided by + * the Hyper-V host in ACPI table OEM0. + */ + if (!IS_ENABLED(CONFIG_ACPI)) + return; + + status = acpi_get_table("OEM0", 0, &header); + if (ACPI_FAILURE(status) || !header) + return; + + /* + * Since the "OEM0" table name is for OEM specific usage, verify + * that what we're seeing purports to be from Microsoft. + */ + if (strncmp(header->oem_table_id, "MICROSFT", 8)) + goto error; + + /* + * Ensure the length is reasonable. Requiring at least 8 bytes and + * no more than 4K bytes is somewhat arbitrary and just protects + * against a malformed table. Hyper-V currently provides 64 bytes, + * but allow for a change in a later version. + */ + if (header->length < sizeof(*header) + 8 || + header->length > sizeof(*header) + SZ_4K) + goto error; + + length = header->length - sizeof(*header); + randomdata = (u8 *)(header + 1); + + pr_debug("Hyper-V: Seeding rng with %d random bytes from ACPI table OEM0\n", + length); + + add_bootloader_randomness(randomdata, length); + + /* + * To prevent the seed data from being visible in /sys/firmware/acpi, + * zero out the random data in the ACPI table and fixup the checksum. + * The zero'ing is done out of an abundance of caution in avoiding + * potential security risks to the rng. Similarly, reset the table + * length to just the header size so that a subsequent kexec doesn't + * try to use the zero'ed out random data. 
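The checksum fixup just below relies on the ACPI rule that all bytes of a table sum to zero modulo 256: adding each seed byte to the checksum as it is zeroed, and adjusting for the old and new length bytes, keeps the now header-only table valid. A small self-contained check of that arithmetic on a fake table (offsets follow the ACPI header layout; the contents are made up):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define HDR_LEN   36              /* size of an ACPI table header              */
#define DATA_LEN  64              /* Hyper-V currently provides 64 seed bytes  */

struct fake_table {
    uint8_t hdr[HDR_LEN];         /* byte 9 is the checksum, bytes 4..7 the length */
    uint8_t data[DATA_LEN];
};

static uint8_t sum(const uint8_t *p, uint32_t len)
{
    uint8_t s = 0;

    while (len--)
        s += *p++;
    return s;
}

int main(void)
{
    struct fake_table t;
    uint8_t *raw = (uint8_t *)&t;
    uint32_t len = HDR_LEN + DATA_LEN;
    uint8_t *checksum = &raw[9];
    uint8_t *length_bytes = &raw[4];   /* 4-byte length field (host order is fine here) */

    /* Build a fake OEM0-style table whose bytes sum to 0 (mod 256). */
    memset(&t, 0, sizeof(t));
    memcpy(length_bytes, &len, 4);
    for (int i = 0; i < DATA_LEN; i++)
        t.data[i] = (uint8_t)(i * 37 + 11);        /* pretend entropy */
    *checksum = (uint8_t)(0 - sum(raw, len));
    printf("before: sum over %u bytes = %u\n", len, sum(raw, len));

    /* Same fixup as ms_hyperv_late_init(): zero the seed, then patch the length. */
    for (int i = 0; i < DATA_LEN; i++) {
        *checksum += t.data[i];
        t.data[i] = 0;
    }
    for (int i = 0; i < 4; i++)
        *checksum += length_bytes[i];
    len = HDR_LEN;
    memcpy(length_bytes, &len, 4);
    for (int i = 0; i < 4; i++)
        *checksum -= length_bytes[i];

    printf("after : sum over %u bytes = %u\n", len, sum(raw, HDR_LEN));
    return 0;
}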
+ */ + for (i = 0; i < length; i++) { + header->checksum += randomdata[i]; + randomdata[i] = 0; + } + + for (i = 0; i < sizeof(header->length); i++) + header->checksum += ((u8 *)&header->length)[i]; + header->length = sizeof(*header); + for (i = 0; i < sizeof(header->length); i++) + header->checksum -= ((u8 *)&header->length)[i]; + +error: + acpi_put_table(header); +} + /* * Hyper-V specific initialization and die code for * individual CPUs that is common across all architectures. @@ -122,54 +470,100 @@ int __init hv_common_init(void) int hv_common_cpu_init(unsigned int cpu) { void **inputarg, **outputarg; + u8 **synic_eventring_tail; u64 msr_vp_index; gfp_t flags; - int pgcount = hv_root_partition ? 2 : 1; + const int pgcount = hv_output_page_exists() ? 2 : 1; + void *mem; + int ret = 0; /* hv_cpu_init() can be called with IRQs disabled from hv_resume() */ flags = irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL; inputarg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg); - *inputarg = kmalloc(pgcount * HV_HYP_PAGE_SIZE, flags); - if (!(*inputarg)) - return -ENOMEM; - if (hv_root_partition) { - outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg); - *outputarg = (char *)(*inputarg) + HV_HYP_PAGE_SIZE; + /* + * The per-cpu memory is already allocated if this CPU was previously + * online and then taken offline + */ + if (!*inputarg) { + mem = kmalloc_array(pgcount, HV_HYP_PAGE_SIZE, flags); + if (!mem) + return -ENOMEM; + + if (hv_output_page_exists()) { + outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg); + *outputarg = (char *)mem + HV_HYP_PAGE_SIZE; + } + + if (!ms_hyperv.paravisor_present && + (hv_isolation_type_snp() || hv_isolation_type_tdx())) { + ret = set_memory_decrypted((unsigned long)mem, pgcount); + if (ret) { + /* It may be unsafe to free 'mem' */ + return ret; + } + + memset(mem, 0x00, pgcount * HV_HYP_PAGE_SIZE); + } + + /* + * In a fully enlightened TDX/SNP VM with more than 64 VPs, if + * hyperv_pcpu_input_arg is not NULL, set_memory_decrypted() -> + * ... -> cpa_flush()-> ... -> __send_ipi_mask_ex() tries to + * use hyperv_pcpu_input_arg as the hypercall input page, which + * must be a decrypted page in such a VM, but the page is still + * encrypted before set_memory_decrypted() returns. Fix this by + * setting *inputarg after the above set_memory_decrypted(): if + * hyperv_pcpu_input_arg is NULL, __send_ipi_mask_ex() returns + * HV_STATUS_INVALID_PARAMETER immediately, and the function + * hv_send_ipi_mask() falls back to orig_apic.send_IPI_mask(), + * which may be slightly slower than the hypercall, but still + * works correctly in such a VM. 
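The long comment in hv_common_cpu_init() below is ultimately about publication order: *inputarg is assigned only after the page has been decrypted, so a concurrent hypercall path either sees NULL and takes its fallback, or sees a page that is fully usable. A stripped-down, single-threaded sketch of that publish-last pattern (prepare_page() and send_ipi() are illustrative stand-ins, not kernel functions):

#include <stdio.h>
#include <stdlib.h>

static void *hypercall_input_page;    /* stand-in for the per-cpu *inputarg pointer */

/* Hypothetical stand-in for kmalloc_array() followed by set_memory_decrypted(). */
static void *prepare_page(void)
{
    void *mem = calloc(1, 4096);
    /* ...decryption and zeroing happen here, before anyone can see the pointer... */
    return mem;
}

/* Hypothetical caller that, in the kernel, may run concurrently with CPU bring-up. */
static void send_ipi(void)
{
    if (!hypercall_input_page) {
        puts("input page not published yet: take the fallback path");
        return;
    }
    puts("input page published: safe to use the hypercall path");
}

int main(void)
{
    send_ipi();                         /* races before publication -> fallback */

    void *mem = prepare_page();         /* fully prepare the page first...      */
    hypercall_input_page = mem;         /* ...then publish the pointer last     */

    send_ipi();                         /* now the fast path is usable          */
    free(mem);
    return 0;
}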
+ */ + *inputarg = mem; } - msr_vp_index = hv_get_register(HV_REGISTER_VP_INDEX); + msr_vp_index = hv_get_msr(HV_MSR_VP_INDEX); hv_vp_index[cpu] = msr_vp_index; if (msr_vp_index > hv_max_vp_index) hv_max_vp_index = msr_vp_index; - return 0; + if (hv_parent_partition()) { + synic_eventring_tail = (u8 **)this_cpu_ptr(hv_synic_eventring_tail); + *synic_eventring_tail = kcalloc(HV_SYNIC_SINT_COUNT, + sizeof(u8), flags); + /* No need to unwind any of the above on failure here */ + if (unlikely(!*synic_eventring_tail)) + ret = -ENOMEM; + } + + return ret; } int hv_common_cpu_die(unsigned int cpu) { - unsigned long flags; - void **inputarg, **outputarg; - void *mem; - - local_irq_save(flags); - - inputarg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg); - mem = *inputarg; - *inputarg = NULL; + u8 **synic_eventring_tail; + /* + * The hyperv_pcpu_input_arg and hyperv_pcpu_output_arg memory + * is not freed when the CPU goes offline as the hyperv_pcpu_input_arg + * may be used by the Hyper-V vPCI driver in reassigning interrupts + * as part of the offlining process. The interrupt reassignment + * happens *after* the CPUHP_AP_HYPERV_ONLINE state has run and + * called this function. + * + * If a previously offlined CPU is brought back online again, the + * originally allocated memory is reused in hv_common_cpu_init(). + */ - if (hv_root_partition) { - outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg); - *outputarg = NULL; + if (hv_parent_partition()) { + synic_eventring_tail = this_cpu_ptr(hv_synic_eventring_tail); + kfree(*synic_eventring_tail); + *synic_eventring_tail = NULL; } - local_irq_restore(flags); - - kfree(mem); - return 0; } @@ -221,17 +615,13 @@ EXPORT_SYMBOL_GPL(hv_query_ext_cap); void hv_setup_dma_ops(struct device *dev, bool coherent) { - /* - * Hyper-V does not offer a vIOMMU in the guest - * VM, so pass 0/NULL for the IOMMU settings - */ - arch_setup_dma_ops(dev, 0, 0, NULL, coherent); + arch_setup_dma_ops(dev, coherent); } EXPORT_SYMBOL_GPL(hv_setup_dma_ops); bool hv_is_hibernation_supported(void) { - return !hv_root_partition && acpi_sleep_state_supported(ACPI_STATE_S4); + return !hv_root_partition() && acpi_sleep_state_supported(ACPI_STATE_S4); } EXPORT_SYMBOL_GPL(hv_is_hibernation_supported); @@ -243,7 +633,7 @@ EXPORT_SYMBOL_GPL(hv_is_hibernation_supported); */ static u64 __hv_read_ref_counter(void) { - return hv_get_register(HV_REGISTER_TIME_REF_COUNT); + return hv_get_msr(HV_MSR_TIME_REF_COUNT); } u64 (*hv_read_reference_counter)(void) = __hv_read_ref_counter; @@ -268,6 +658,12 @@ bool __weak hv_isolation_type_snp(void) } EXPORT_SYMBOL_GPL(hv_isolation_type_snp); +bool __weak hv_isolation_type_tdx(void) +{ + return false; +} +EXPORT_SYMBOL_GPL(hv_isolation_type_tdx); + void __weak hv_setup_vmbus_handler(void (*handler)(void)) { } @@ -278,6 +674,11 @@ void __weak hv_remove_vmbus_handler(void) } EXPORT_SYMBOL_GPL(hv_remove_vmbus_handler); +void __weak hv_setup_mshv_handler(void (*handler)(void)) +{ +} +EXPORT_SYMBOL_GPL(hv_setup_mshv_handler); + void __weak hv_setup_kexec_handler(void (*handler)(void)) { } @@ -309,13 +710,151 @@ u64 __weak hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_s } EXPORT_SYMBOL_GPL(hv_ghcb_hypercall); -void __weak *hv_map_memory(void *addr, unsigned long size) +u64 __weak hv_tdx_hypercall(u64 control, u64 param1, u64 param2) +{ + return HV_STATUS_INVALID_PARAMETER; +} +EXPORT_SYMBOL_GPL(hv_tdx_hypercall); + +void __weak hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool set) +{ +} 
+EXPORT_SYMBOL_GPL(hv_enable_coco_interrupt); + +void __weak hv_para_set_sint_proxy(bool enable) +{ +} +EXPORT_SYMBOL_GPL(hv_para_set_sint_proxy); + +u64 __weak hv_para_get_synic_register(unsigned int reg) +{ + return ~0ULL; +} +EXPORT_SYMBOL_GPL(hv_para_get_synic_register); + +void __weak hv_para_set_synic_register(unsigned int reg, u64 val) +{ +} +EXPORT_SYMBOL_GPL(hv_para_set_synic_register); + +void hv_identify_partition_type(void) +{ + /* Assume guest role */ + hv_curr_partition_type = HV_PARTITION_TYPE_GUEST; + /* + * Check partition creation and cpu management privileges + * + * Hyper-V should never specify running as root and as a Confidential + * VM. But to protect against a compromised/malicious Hyper-V trying + * to exploit root behavior to expose Confidential VM memory, ignore + * the root partition setting if also a Confidential VM. + */ + if ((ms_hyperv.priv_high & HV_CREATE_PARTITIONS) && + !(ms_hyperv.priv_high & HV_ISOLATION)) { + + if (!IS_ENABLED(CONFIG_MSHV_ROOT)) { + pr_crit("Hyper-V: CONFIG_MSHV_ROOT not enabled!\n"); + } else if (ms_hyperv.priv_high & HV_CPU_MANAGEMENT) { + pr_info("Hyper-V: running as root partition\n"); + hv_curr_partition_type = HV_PARTITION_TYPE_ROOT; + } else { + pr_info("Hyper-V: running as L1VH partition\n"); + hv_curr_partition_type = HV_PARTITION_TYPE_L1VH; + } + } +} + +struct hv_status_info { + char *string; + int errno; + u16 code; +}; + +/* + * Note on the errno mappings: + * A failed hypercall is usually only recoverable (or loggable) near + * the call site where the HV_STATUS_* code is known. So the errno + * it gets converted to is not too useful further up the stack. + * Provide a few mappings that could be useful, and revert to -EIO + * as a fallback. + */ +static const struct hv_status_info hv_status_infos[] = { +#define _STATUS_INFO(status, errno) { #status, (errno), (status) } + _STATUS_INFO(HV_STATUS_SUCCESS, 0), + _STATUS_INFO(HV_STATUS_INVALID_HYPERCALL_CODE, -EINVAL), + _STATUS_INFO(HV_STATUS_INVALID_HYPERCALL_INPUT, -EINVAL), + _STATUS_INFO(HV_STATUS_INVALID_ALIGNMENT, -EIO), + _STATUS_INFO(HV_STATUS_INVALID_PARAMETER, -EINVAL), + _STATUS_INFO(HV_STATUS_ACCESS_DENIED, -EIO), + _STATUS_INFO(HV_STATUS_INVALID_PARTITION_STATE, -EIO), + _STATUS_INFO(HV_STATUS_OPERATION_DENIED, -EIO), + _STATUS_INFO(HV_STATUS_UNKNOWN_PROPERTY, -EIO), + _STATUS_INFO(HV_STATUS_PROPERTY_VALUE_OUT_OF_RANGE, -EIO), + _STATUS_INFO(HV_STATUS_INSUFFICIENT_MEMORY, -ENOMEM), + _STATUS_INFO(HV_STATUS_INVALID_PARTITION_ID, -EINVAL), + _STATUS_INFO(HV_STATUS_INVALID_VP_INDEX, -EINVAL), + _STATUS_INFO(HV_STATUS_NOT_FOUND, -EIO), + _STATUS_INFO(HV_STATUS_INVALID_PORT_ID, -EINVAL), + _STATUS_INFO(HV_STATUS_INVALID_CONNECTION_ID, -EINVAL), + _STATUS_INFO(HV_STATUS_INSUFFICIENT_BUFFERS, -EIO), + _STATUS_INFO(HV_STATUS_NOT_ACKNOWLEDGED, -EIO), + _STATUS_INFO(HV_STATUS_INVALID_VP_STATE, -EIO), + _STATUS_INFO(HV_STATUS_NO_RESOURCES, -EIO), + _STATUS_INFO(HV_STATUS_PROCESSOR_FEATURE_NOT_SUPPORTED, -EIO), + _STATUS_INFO(HV_STATUS_INVALID_LP_INDEX, -EINVAL), + _STATUS_INFO(HV_STATUS_INVALID_REGISTER_VALUE, -EINVAL), + _STATUS_INFO(HV_STATUS_INVALID_LP_INDEX, -EIO), + _STATUS_INFO(HV_STATUS_INVALID_REGISTER_VALUE, -EIO), + _STATUS_INFO(HV_STATUS_OPERATION_FAILED, -EIO), + _STATUS_INFO(HV_STATUS_TIME_OUT, -EIO), + _STATUS_INFO(HV_STATUS_CALL_PENDING, -EIO), + _STATUS_INFO(HV_STATUS_VTL_ALREADY_ENABLED, -EIO), +#undef _STATUS_INFO +}; + +static inline const struct hv_status_info *find_hv_status_info(u64 hv_status) { + int i; + u16 code = hv_result(hv_status); + + for (i = 
0; i < ARRAY_SIZE(hv_status_infos); ++i) { + const struct hv_status_info *info = &hv_status_infos[i]; + + if (info->code == code) + return info; + } + return NULL; } -EXPORT_SYMBOL_GPL(hv_map_memory); -void __weak hv_unmap_memory(void *addr) +/* Convert a hypercall result into a linux-friendly error code. */ +int hv_result_to_errno(u64 status) +{ + const struct hv_status_info *info; + + /* hv_do_hypercall() may return U64_MAX, hypercalls aren't possible */ + if (unlikely(status == U64_MAX)) + return -EOPNOTSUPP; + + info = find_hv_status_info(status); + if (info) + return info->errno; + + return -EIO; +} +EXPORT_SYMBOL_GPL(hv_result_to_errno); + +const char *hv_result_to_string(u64 status) { + const struct hv_status_info *info; + + if (unlikely(status == U64_MAX)) + return "Hypercall page missing!"; + + info = find_hv_status_info(status); + if (info) + return info->string; + + return "Unknown"; } -EXPORT_SYMBOL_GPL(hv_unmap_memory); +EXPORT_SYMBOL_GPL(hv_result_to_string); diff --git a/drivers/hv/hv_fcopy.c b/drivers/hv/hv_fcopy.c deleted file mode 100644 index 922d83eb7ddf..000000000000 --- a/drivers/hv/hv_fcopy.c +++ /dev/null @@ -1,427 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * An implementation of file copy service. - * - * Copyright (C) 2014, Microsoft, Inc. - * - * Author : K. Y. Srinivasan <ksrinivasan@novell.com> - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/nls.h> -#include <linux/workqueue.h> -#include <linux/hyperv.h> -#include <linux/sched.h> -#include <asm/hyperv-tlfs.h> - -#include "hyperv_vmbus.h" -#include "hv_utils_transport.h" - -#define WIN8_SRV_MAJOR 1 -#define WIN8_SRV_MINOR 1 -#define WIN8_SRV_VERSION (WIN8_SRV_MAJOR << 16 | WIN8_SRV_MINOR) - -#define FCOPY_VER_COUNT 1 -static const int fcopy_versions[] = { - WIN8_SRV_VERSION -}; - -#define FW_VER_COUNT 1 -static const int fw_versions[] = { - UTIL_FW_VERSION -}; - -/* - * Global state maintained for transaction that is being processed. - * For a class of integration services, including the "file copy service", - * the specified protocol is a "request/response" protocol which means that - * there can only be single outstanding transaction from the host at any - * given point in time. We use this to simplify memory management in this - * driver - we cache and process only one message at a time. - * - * While the request/response protocol is guaranteed by the host, we further - * ensure this by serializing packet processing in this driver - we do not - * read additional packets from the VMBUs until the current packet is fully - * handled. - */ - -static struct { - int state; /* hvutil_device_state */ - int recv_len; /* number of bytes received. */ - struct hv_fcopy_hdr *fcopy_msg; /* current message */ - struct vmbus_channel *recv_channel; /* chn we got the request */ - u64 recv_req_id; /* request ID. */ -} fcopy_transaction; - -static void fcopy_respond_to_host(int error); -static void fcopy_send_data(struct work_struct *dummy); -static void fcopy_timeout_func(struct work_struct *dummy); -static DECLARE_DELAYED_WORK(fcopy_timeout_work, fcopy_timeout_func); -static DECLARE_WORK(fcopy_send_work, fcopy_send_data); -static const char fcopy_devname[] = "vmbus/hv_fcopy"; -static u8 *recv_buffer; -static struct hvutil_transport *hvt; -/* - * This state maintains the version number registered by the daemon. - */ -static int dm_reg_value; - -static void fcopy_poll_wrapper(void *channel) -{ - /* Transaction is finished, reset the state here to avoid races. 
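Stepping back to the hv_result_to_errno()/hv_result_to_string() helpers added to hv_common.c above: both are simple linear scans of the status table, with the U64_MAX case (no hypercall page) and unknown codes handled separately. A user-space sketch of the same lookup over a trimmed-down table (the numeric values are illustrative; the real codes come from hvhdk.h and the real errnos from errno.h):

#include <stdint.h>
#include <stdio.h>

struct status_info { const char *string; int errno_val; uint16_t code; };

/* Trimmed-down table in the spirit of hv_status_infos[]. */
static const struct status_info infos[] = {
    { "HV_STATUS_SUCCESS",              0,   0  },
    { "HV_STATUS_INVALID_PARAMETER",   -22,  5  },   /* -EINVAL */
    { "HV_STATUS_INSUFFICIENT_MEMORY", -12,  11 },   /* -ENOMEM */
};

static int result_to_errno(uint64_t status)
{
    uint16_t code = (uint16_t)status;      /* like hv_result(): the low 16 bits */

    if (status == UINT64_MAX)              /* hypercalls not possible at all */
        return -95;                        /* -EOPNOTSUPP */

    for (unsigned int i = 0; i < sizeof(infos) / sizeof(infos[0]); i++)
        if (infos[i].code == code)
            return infos[i].errno_val;

    return -5;                             /* -EIO fallback for unknown codes */
}

int main(void)
{
    printf("%d %d %d\n",
           result_to_errno(0), result_to_errno(5), result_to_errno(0x1234));
    return 0;
}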
*/ - fcopy_transaction.state = HVUTIL_READY; - tasklet_schedule(&((struct vmbus_channel *)channel)->callback_event); -} - -static void fcopy_timeout_func(struct work_struct *dummy) -{ - /* - * If the timer fires, the user-mode component has not responded; - * process the pending transaction. - */ - fcopy_respond_to_host(HV_E_FAIL); - hv_poll_channel(fcopy_transaction.recv_channel, fcopy_poll_wrapper); -} - -static void fcopy_register_done(void) -{ - pr_debug("FCP: userspace daemon registered\n"); - hv_poll_channel(fcopy_transaction.recv_channel, fcopy_poll_wrapper); -} - -static int fcopy_handle_handshake(u32 version) -{ - u32 our_ver = FCOPY_CURRENT_VERSION; - - switch (version) { - case FCOPY_VERSION_0: - /* Daemon doesn't expect us to reply */ - dm_reg_value = version; - break; - case FCOPY_VERSION_1: - /* Daemon expects us to reply with our own version */ - if (hvutil_transport_send(hvt, &our_ver, sizeof(our_ver), - fcopy_register_done)) - return -EFAULT; - dm_reg_value = version; - break; - default: - /* - * For now we will fail the registration. - * If and when we have multiple versions to - * deal with, we will be backward compatible. - * We will add this code when needed. - */ - return -EINVAL; - } - pr_debug("FCP: userspace daemon ver. %d connected\n", version); - return 0; -} - -static void fcopy_send_data(struct work_struct *dummy) -{ - struct hv_start_fcopy *smsg_out = NULL; - int operation = fcopy_transaction.fcopy_msg->operation; - struct hv_start_fcopy *smsg_in; - void *out_src; - int rc, out_len; - - /* - * The strings sent from the host are encoded in - * utf16; convert it to utf8 strings. - * The host assures us that the utf16 strings will not exceed - * the max lengths specified. We will however, reserve room - * for the string terminating character - in the utf16s_utf8s() - * function we limit the size of the buffer where the converted - * string is placed to W_MAX_PATH -1 to guarantee - * that the strings can be properly terminated! - */ - - switch (operation) { - case START_FILE_COPY: - out_len = sizeof(struct hv_start_fcopy); - smsg_out = kzalloc(sizeof(*smsg_out), GFP_KERNEL); - if (!smsg_out) - return; - - smsg_out->hdr.operation = operation; - smsg_in = (struct hv_start_fcopy *)fcopy_transaction.fcopy_msg; - - utf16s_to_utf8s((wchar_t *)smsg_in->file_name, W_MAX_PATH, - UTF16_LITTLE_ENDIAN, - (__u8 *)&smsg_out->file_name, W_MAX_PATH - 1); - - utf16s_to_utf8s((wchar_t *)smsg_in->path_name, W_MAX_PATH, - UTF16_LITTLE_ENDIAN, - (__u8 *)&smsg_out->path_name, W_MAX_PATH - 1); - - smsg_out->copy_flags = smsg_in->copy_flags; - smsg_out->file_size = smsg_in->file_size; - out_src = smsg_out; - break; - - case WRITE_TO_FILE: - out_src = fcopy_transaction.fcopy_msg; - out_len = sizeof(struct hv_do_fcopy); - break; - default: - out_src = fcopy_transaction.fcopy_msg; - out_len = fcopy_transaction.recv_len; - break; - } - - fcopy_transaction.state = HVUTIL_USERSPACE_REQ; - rc = hvutil_transport_send(hvt, out_src, out_len, NULL); - if (rc) { - pr_debug("FCP: failed to communicate to the daemon: %d\n", rc); - if (cancel_delayed_work_sync(&fcopy_timeout_work)) { - fcopy_respond_to_host(HV_E_FAIL); - fcopy_transaction.state = HVUTIL_READY; - } - } - kfree(smsg_out); -} - -/* - * Send a response back to the host. - */ - -static void -fcopy_respond_to_host(int error) -{ - struct icmsg_hdr *icmsghdr; - u32 buf_len; - struct vmbus_channel *channel; - u64 req_id; - - /* - * Copy the global state for completing the transaction. Note that - * only one transaction can be active at a time. 
This is guaranteed - * by the file copy protocol implemented by the host. Furthermore, - * the "transaction active" state we maintain ensures that there can - * only be one active transaction at a time. - */ - - buf_len = fcopy_transaction.recv_len; - channel = fcopy_transaction.recv_channel; - req_id = fcopy_transaction.recv_req_id; - - icmsghdr = (struct icmsg_hdr *) - &recv_buffer[sizeof(struct vmbuspipe_hdr)]; - - if (channel->onchannel_callback == NULL) - /* - * We have raced with util driver being unloaded; - * silently return. - */ - return; - - icmsghdr->status = error; - icmsghdr->icflags = ICMSGHDRFLAG_TRANSACTION | ICMSGHDRFLAG_RESPONSE; - vmbus_sendpacket(channel, recv_buffer, buf_len, req_id, - VM_PKT_DATA_INBAND, 0); -} - -void hv_fcopy_onchannelcallback(void *context) -{ - struct vmbus_channel *channel = context; - u32 recvlen; - u64 requestid; - struct hv_fcopy_hdr *fcopy_msg; - struct icmsg_hdr *icmsghdr; - int fcopy_srv_version; - - if (fcopy_transaction.state > HVUTIL_READY) - return; - - if (vmbus_recvpacket(channel, recv_buffer, HV_HYP_PAGE_SIZE * 2, &recvlen, &requestid)) { - pr_err_ratelimited("Fcopy request received. Could not read into recv buf\n"); - return; - } - - if (!recvlen) - return; - - /* Ensure recvlen is big enough to read header data */ - if (recvlen < ICMSG_HDR) { - pr_err_ratelimited("Fcopy request received. Packet length too small: %d\n", - recvlen); - return; - } - - icmsghdr = (struct icmsg_hdr *)&recv_buffer[ - sizeof(struct vmbuspipe_hdr)]; - - if (icmsghdr->icmsgtype == ICMSGTYPE_NEGOTIATE) { - if (vmbus_prep_negotiate_resp(icmsghdr, - recv_buffer, recvlen, - fw_versions, FW_VER_COUNT, - fcopy_versions, FCOPY_VER_COUNT, - NULL, &fcopy_srv_version)) { - - pr_info("FCopy IC version %d.%d\n", - fcopy_srv_version >> 16, - fcopy_srv_version & 0xFFFF); - } - } else if (icmsghdr->icmsgtype == ICMSGTYPE_FCOPY) { - /* Ensure recvlen is big enough to contain hv_fcopy_hdr */ - if (recvlen < ICMSG_HDR + sizeof(struct hv_fcopy_hdr)) { - pr_err_ratelimited("Invalid Fcopy hdr. Packet length too small: %u\n", - recvlen); - return; - } - fcopy_msg = (struct hv_fcopy_hdr *)&recv_buffer[ICMSG_HDR]; - - /* - * Stash away this global state for completing the - * transaction; note transactions are serialized. - */ - - fcopy_transaction.recv_len = recvlen; - fcopy_transaction.recv_req_id = requestid; - fcopy_transaction.fcopy_msg = fcopy_msg; - - if (fcopy_transaction.state < HVUTIL_READY) { - /* Userspace is not registered yet */ - fcopy_respond_to_host(HV_E_FAIL); - return; - } - fcopy_transaction.state = HVUTIL_HOSTMSG_RECEIVED; - - /* - * Send the information to the user-level daemon. - */ - schedule_work(&fcopy_send_work); - schedule_delayed_work(&fcopy_timeout_work, - HV_UTIL_TIMEOUT * HZ); - return; - } else { - pr_err_ratelimited("Fcopy request received. Invalid msg type: %d\n", - icmsghdr->icmsgtype); - return; - } - icmsghdr->icflags = ICMSGHDRFLAG_TRANSACTION | ICMSGHDRFLAG_RESPONSE; - vmbus_sendpacket(channel, recv_buffer, recvlen, requestid, - VM_PKT_DATA_INBAND, 0); -} - -/* Callback when data is received from userspace */ -static int fcopy_on_msg(void *msg, int len) -{ - int *val = (int *)msg; - - if (len != sizeof(int)) - return -EINVAL; - - if (fcopy_transaction.state == HVUTIL_DEVICE_INIT) - return fcopy_handle_handshake(*val); - - if (fcopy_transaction.state != HVUTIL_USERSPACE_REQ) - return -EINVAL; - - /* - * Complete the transaction by forwarding the result - * to the host. But first, cancel the timeout. 
- */ - if (cancel_delayed_work_sync(&fcopy_timeout_work)) { - fcopy_transaction.state = HVUTIL_USERSPACE_RECV; - fcopy_respond_to_host(*val); - hv_poll_channel(fcopy_transaction.recv_channel, - fcopy_poll_wrapper); - } - - return 0; -} - -static void fcopy_on_reset(void) -{ - /* - * The daemon has exited; reset the state. - */ - fcopy_transaction.state = HVUTIL_DEVICE_INIT; - - if (cancel_delayed_work_sync(&fcopy_timeout_work)) - fcopy_respond_to_host(HV_E_FAIL); -} - -int hv_fcopy_init(struct hv_util_service *srv) -{ - recv_buffer = srv->recv_buffer; - fcopy_transaction.recv_channel = srv->channel; - fcopy_transaction.recv_channel->max_pkt_size = HV_HYP_PAGE_SIZE * 2; - - /* - * When this driver loads, the user level daemon that - * processes the host requests may not yet be running. - * Defer processing channel callbacks until the daemon - * has registered. - */ - fcopy_transaction.state = HVUTIL_DEVICE_INIT; - - hvt = hvutil_transport_init(fcopy_devname, 0, 0, - fcopy_on_msg, fcopy_on_reset); - if (!hvt) - return -EFAULT; - - return 0; -} - -static void hv_fcopy_cancel_work(void) -{ - cancel_delayed_work_sync(&fcopy_timeout_work); - cancel_work_sync(&fcopy_send_work); -} - -int hv_fcopy_pre_suspend(void) -{ - struct vmbus_channel *channel = fcopy_transaction.recv_channel; - struct hv_fcopy_hdr *fcopy_msg; - - /* - * Fake a CANCEL_FCOPY message for the user space daemon in case the - * daemon is in the middle of copying some file. It doesn't matter if - * there is already a message pending to be delivered to the user - * space since we force fcopy_transaction.state to be HVUTIL_READY, so - * the user space daemon's write() will fail with EINVAL (see - * fcopy_on_msg()), and the daemon will reset the device by closing - * and re-opening it. - */ - fcopy_msg = kzalloc(sizeof(*fcopy_msg), GFP_KERNEL); - if (!fcopy_msg) - return -ENOMEM; - - tasklet_disable(&channel->callback_event); - - fcopy_msg->operation = CANCEL_FCOPY; - - hv_fcopy_cancel_work(); - - /* We don't care about the return value. */ - hvutil_transport_send(hvt, fcopy_msg, sizeof(*fcopy_msg), NULL); - - kfree(fcopy_msg); - - fcopy_transaction.state = HVUTIL_READY; - - /* tasklet_enable() will be called in hv_fcopy_pre_resume(). 
*/ - return 0; -} - -int hv_fcopy_pre_resume(void) -{ - struct vmbus_channel *channel = fcopy_transaction.recv_channel; - - tasklet_enable(&channel->callback_event); - - return 0; -} - -void hv_fcopy_deinit(void) -{ - fcopy_transaction.state = HVUTIL_DEVICE_DYING; - - hv_fcopy_cancel_work(); - - hvutil_transport_destroy(hvt); -} diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c index d35b60c06114..62795f6cbb00 100644 --- a/drivers/hv/hv_kvp.c +++ b/drivers/hv/hv_kvp.c @@ -27,7 +27,7 @@ #include <linux/connector.h> #include <linux/workqueue.h> #include <linux/hyperv.h> -#include <asm/hyperv-tlfs.h> +#include <hyperv/hvhdk.h> #include "hyperv_vmbus.h" #include "hv_utils_transport.h" @@ -655,7 +655,7 @@ void hv_kvp_onchannelcallback(void *context) if (host_negotiatied == NEGO_NOT_STARTED) { host_negotiatied = NEGO_IN_PROGRESS; schedule_delayed_work(&kvp_host_handshake_work, - HV_UTIL_NEGO_TIMEOUT * HZ); + secs_to_jiffies(HV_UTIL_NEGO_TIMEOUT)); } return; } @@ -724,7 +724,7 @@ void hv_kvp_onchannelcallback(void *context) */ schedule_work(&kvp_sendkey_work); schedule_delayed_work(&kvp_timeout_work, - HV_UTIL_TIMEOUT * HZ); + secs_to_jiffies(HV_UTIL_TIMEOUT)); return; @@ -767,6 +767,12 @@ hv_kvp_init(struct hv_util_service *srv) */ kvp_transaction.state = HVUTIL_DEVICE_INIT; + return 0; +} + +int +hv_kvp_init_transport(void) +{ hvt = hvutil_transport_init(kvp_devname, CN_KVP_IDX, CN_KVP_VAL, kvp_on_msg, kvp_on_reset); if (!hvt) diff --git a/drivers/hv/hv_proc.c b/drivers/hv/hv_proc.c new file mode 100644 index 000000000000..fbb4eb3901bb --- /dev/null +++ b/drivers/hv/hv_proc.c @@ -0,0 +1,196 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/types.h> +#include <linux/vmalloc.h> +#include <linux/mm.h> +#include <linux/clockchips.h> +#include <linux/slab.h> +#include <linux/cpuhotplug.h> +#include <linux/minmax.h> +#include <linux/export.h> +#include <asm/mshyperv.h> + +/* + * See struct hv_deposit_memory. The first u64 is partition ID, the rest + * are GPAs. + */ +#define HV_DEPOSIT_MAX (HV_HYP_PAGE_SIZE / sizeof(u64) - 1) + +/* Deposits exact number of pages. Must be called with interrupts enabled. 
*/ +int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages) +{ + struct page **pages, *page; + int *counts; + int num_allocations; + int i, j, page_count; + int order; + u64 status; + int ret; + u64 base_pfn; + struct hv_deposit_memory *input_page; + unsigned long flags; + + if (num_pages > HV_DEPOSIT_MAX) + return -E2BIG; + if (!num_pages) + return 0; + + /* One buffer for page pointers and counts */ + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + pages = page_address(page); + + counts = kcalloc(HV_DEPOSIT_MAX, sizeof(int), GFP_KERNEL); + if (!counts) { + free_page((unsigned long)pages); + return -ENOMEM; + } + + /* Allocate all the pages before disabling interrupts */ + i = 0; + + while (num_pages) { + /* Find highest order we can actually allocate */ + order = 31 - __builtin_clz(num_pages); + + while (1) { + pages[i] = alloc_pages_node(node, GFP_KERNEL, order); + if (pages[i]) + break; + if (!order) { + ret = -ENOMEM; + num_allocations = i; + goto err_free_allocations; + } + --order; + } + + split_page(pages[i], order); + counts[i] = 1 << order; + num_pages -= counts[i]; + i++; + } + num_allocations = i; + + local_irq_save(flags); + + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); + + input_page->partition_id = partition_id; + + /* Populate gpa_page_list - these will fit on the input page */ + for (i = 0, page_count = 0; i < num_allocations; ++i) { + base_pfn = page_to_pfn(pages[i]); + for (j = 0; j < counts[i]; ++j, ++page_count) + input_page->gpa_page_list[page_count] = base_pfn + j; + } + status = hv_do_rep_hypercall(HVCALL_DEPOSIT_MEMORY, + page_count, 0, input_page, NULL); + local_irq_restore(flags); + if (!hv_result_success(status)) { + hv_status_err(status, "\n"); + ret = hv_result_to_errno(status); + goto err_free_allocations; + } + + ret = 0; + goto free_buf; + +err_free_allocations: + for (i = 0; i < num_allocations; ++i) { + base_pfn = page_to_pfn(pages[i]); + for (j = 0; j < counts[i]; ++j) + __free_page(pfn_to_page(base_pfn + j)); + } + +free_buf: + free_page((unsigned long)pages); + kfree(counts); + return ret; +} +EXPORT_SYMBOL_GPL(hv_call_deposit_pages); + +int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id) +{ + struct hv_input_add_logical_processor *input; + struct hv_output_add_logical_processor *output; + u64 status; + unsigned long flags; + int ret = 0; + + /* + * When adding a logical processor, the hypervisor may return + * HV_STATUS_INSUFFICIENT_MEMORY. When that happens, we deposit more + * pages and retry. 
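+ * The retry loop below reissues HVCALL_ADD_LOGICAL_PROCESSOR, depositing
+ * one extra page into the current partition after each
+ * HV_STATUS_INSUFFICIENT_MEMORY result; any other status ends the loop.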
+ */ + do { + local_irq_save(flags); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + /* We don't do anything with the output right now */ + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + input->lp_index = lp_index; + input->apic_id = apic_id; + input->proximity_domain_info = hv_numa_node_to_pxm_info(node); + status = hv_do_hypercall(HVCALL_ADD_LOGICAL_PROCESSOR, + input, output); + local_irq_restore(flags); + + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + if (!hv_result_success(status)) { + hv_status_err(status, "cpu %u apic ID: %u\n", + lp_index, apic_id); + ret = hv_result_to_errno(status); + } + break; + } + ret = hv_call_deposit_pages(node, hv_current_partition_id, 1); + } while (!ret); + + return ret; +} + +int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags) +{ + struct hv_create_vp *input; + u64 status; + unsigned long irq_flags; + int ret = 0; + + /* Root VPs don't seem to need pages deposited */ + if (partition_id != hv_current_partition_id) { + /* The value 90 is empirically determined. It may change. */ + ret = hv_call_deposit_pages(node, partition_id, 90); + if (ret) + return ret; + } + + do { + local_irq_save(irq_flags); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + + input->partition_id = partition_id; + input->vp_index = vp_index; + input->flags = flags; + input->subnode_type = HV_SUBNODE_ANY; + input->proximity_domain_info = hv_numa_node_to_pxm_info(node); + status = hv_do_hypercall(HVCALL_CREATE_VP, input, NULL); + local_irq_restore(irq_flags); + + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + if (!hv_result_success(status)) { + hv_status_err(status, "vcpu: %u, lp: %u\n", + vp_index, flags); + ret = hv_result_to_errno(status); + } + break; + } + ret = hv_call_deposit_pages(node, partition_id, 1); + + } while (!ret); + + return ret; +} +EXPORT_SYMBOL_GPL(hv_call_create_vp); diff --git a/drivers/hv/hv_snapshot.c b/drivers/hv/hv_snapshot.c index 0d2184be1691..2e7f537d53cf 100644 --- a/drivers/hv/hv_snapshot.c +++ b/drivers/hv/hv_snapshot.c @@ -12,7 +12,7 @@ #include <linux/connector.h> #include <linux/workqueue.h> #include <linux/hyperv.h> -#include <asm/hyperv-tlfs.h> +#include <hyperv/hvhdk.h> #include "hyperv_vmbus.h" #include "hv_utils_transport.h" @@ -193,7 +193,8 @@ static void vss_send_op(void) vss_transaction.state = HVUTIL_USERSPACE_REQ; schedule_delayed_work(&vss_timeout_work, op == VSS_OP_FREEZE ? 
- VSS_FREEZE_TIMEOUT * HZ : HV_UTIL_TIMEOUT * HZ); + secs_to_jiffies(VSS_FREEZE_TIMEOUT) : + secs_to_jiffies(HV_UTIL_TIMEOUT)); rc = hvutil_transport_send(hvt, vss_msg, sizeof(*vss_msg), NULL); if (rc) { @@ -388,6 +389,12 @@ hv_vss_init(struct hv_util_service *srv) */ vss_transaction.state = HVUTIL_DEVICE_INIT; + return 0; +} + +int +hv_vss_init_transport(void) +{ hvt = hvutil_transport_init(vss_devname, CN_VSS_IDX, CN_VSS_VAL, vss_on_msg, vss_on_reset); if (!hvt) { diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c index d776074b49cb..7e9c8e169c66 100644 --- a/drivers/hv/hv_util.c +++ b/drivers/hv/hv_util.c @@ -141,6 +141,7 @@ static struct hv_util_service util_heartbeat = { static struct hv_util_service util_kvp = { .util_cb = hv_kvp_onchannelcallback, .util_init = hv_kvp_init, + .util_init_transport = hv_kvp_init_transport, .util_pre_suspend = hv_kvp_pre_suspend, .util_pre_resume = hv_kvp_pre_resume, .util_deinit = hv_kvp_deinit, @@ -149,19 +150,12 @@ static struct hv_util_service util_kvp = { static struct hv_util_service util_vss = { .util_cb = hv_vss_onchannelcallback, .util_init = hv_vss_init, + .util_init_transport = hv_vss_init_transport, .util_pre_suspend = hv_vss_pre_suspend, .util_pre_resume = hv_vss_pre_resume, .util_deinit = hv_vss_deinit, }; -static struct hv_util_service util_fcopy = { - .util_cb = hv_fcopy_onchannelcallback, - .util_init = hv_fcopy_init, - .util_pre_suspend = hv_fcopy_pre_suspend, - .util_pre_resume = hv_fcopy_pre_resume, - .util_deinit = hv_fcopy_deinit, -}; - static void perform_shutdown(struct work_struct *dummy) { orderly_poweroff(true); @@ -296,6 +290,11 @@ static struct { spinlock_t lock; } host_ts; +static bool timesync_implicit; + +module_param(timesync_implicit, bool, 0644); +MODULE_PARM_DESC(timesync_implicit, "If set treat SAMPLE as SYNC when clock is behind"); + static inline u64 reftime_to_ns(u64 reftime) { return (reftime - WLTIMEDELTA) * 100; @@ -345,6 +344,29 @@ static void hv_set_host_time(struct work_struct *work) } /* + * Due to a bug on Hyper-V hosts, the sync flag may not always be sent on resume. + * Force a sync if the guest is behind. + */ +static inline bool hv_implicit_sync(u64 host_time) +{ + struct timespec64 new_ts; + struct timespec64 threshold_ts; + + new_ts = ns_to_timespec64(reftime_to_ns(host_time)); + ktime_get_real_ts64(&threshold_ts); + + threshold_ts.tv_sec += 5; + + /* + * If guest behind the host by 5 or more seconds. + */ + if (timespec64_compare(&new_ts, &threshold_ts) >= 0) + return true; + + return false; +} + +/* * Synchronize time with host after reboot, restore, etc. * * ICTIMESYNCFLAG_SYNC flag bit indicates reboot, restore events of the VM. 
@@ -384,7 +406,8 @@ static inline void adj_guesttime(u64 hosttime, u64 reftime, u8 adj_flags) spin_unlock_irqrestore(&host_ts.lock, flags); /* Schedule work to do do_settimeofday64() */ - if (adj_flags & ICTIMESYNCFLAG_SYNC) + if ((adj_flags & ICTIMESYNCFLAG_SYNC) || + (timesync_implicit && hv_implicit_sync(host_ts.host_time))) schedule_work(&adj_time_work); } @@ -563,16 +586,14 @@ static int util_probe(struct hv_device *dev, (struct hv_util_service *)dev_id->driver_data; int ret; - srv->recv_buffer = kmalloc(HV_HYP_PAGE_SIZE * 4, GFP_KERNEL); + srv->recv_buffer = kmalloc_array(4, HV_HYP_PAGE_SIZE, GFP_KERNEL); if (!srv->recv_buffer) return -ENOMEM; srv->channel = dev->channel; if (srv->util_init) { ret = srv->util_init(srv); - if (ret) { - ret = -ENODEV; + if (ret) goto error1; - } } /* @@ -592,6 +613,13 @@ static int util_probe(struct hv_device *dev, if (ret) goto error; + if (srv->util_init_transport) { + ret = srv->util_init_transport(); + if (ret) { + vmbus_close(dev->channel); + goto error; + } + } return 0; error: @@ -602,7 +630,7 @@ error1: return ret; } -static int util_remove(struct hv_device *dev) +static void util_remove(struct hv_device *dev) { struct hv_util_service *srv = hv_get_drvdata(dev); @@ -610,8 +638,6 @@ static int util_remove(struct hv_device *dev) srv->util_deinit(); vmbus_close(dev->channel); kfree(srv->recv_buffer); - - return 0; } /* @@ -673,10 +699,6 @@ static const struct hv_vmbus_device_id id_table[] = { { HV_VSS_GUID, .driver_data = (unsigned long)&util_vss }, - /* File copy GUID */ - { HV_FCOPY_GUID, - .driver_data = (unsigned long)&util_fcopy - }, { }, }; diff --git a/drivers/hv/hv_utils_transport.c b/drivers/hv/hv_utils_transport.c index 832885198643..b3de35ff6334 100644 --- a/drivers/hv/hv_utils_transport.c +++ b/drivers/hv/hv_utils_transport.c @@ -129,8 +129,7 @@ static int hvt_op_open(struct inode *inode, struct file *file) * device gets released. */ hvt->mode = HVUTIL_TRANSPORT_CHARDEV; - } - else if (hvt->mode == HVUTIL_TRANSPORT_NETLINK) { + } else if (hvt->mode == HVUTIL_TRANSPORT_NETLINK) { /* * We're switching from netlink communication to using char * device. Issue the reset first. 
@@ -195,7 +194,7 @@ static void hvt_cn_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp) } spin_unlock(&hvt_list_lock); if (!hvt_found) { - pr_warn("hvt_cn_callback: spurious message received!\n"); + pr_warn("%s: spurious message received!\n", __func__); return; } @@ -210,7 +209,7 @@ static void hvt_cn_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp) if (hvt->mode == HVUTIL_TRANSPORT_NETLINK) hvt_found->on_msg(msg->data, msg->len); else - pr_warn("hvt_cn_callback: unexpected netlink message!\n"); + pr_warn("%s: unexpected netlink message!\n", __func__); mutex_unlock(&hvt->lock); } @@ -260,8 +259,9 @@ int hvutil_transport_send(struct hvutil_transport *hvt, void *msg, int len, hvt->outmsg_len = len; hvt->on_read = on_read_cb; wake_up_interruptible(&hvt->outmsg_q); - } else + } else { ret = -ENOMEM; + } out_unlock: mutex_unlock(&hvt->lock); return ret; diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index dc673edf053c..b2862e0a317a 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -15,10 +15,11 @@ #include <linux/list.h> #include <linux/bitops.h> #include <asm/sync_bitops.h> -#include <asm/hyperv-tlfs.h> +#include <asm/mshyperv.h> #include <linux/atomic.h> #include <linux/hyperv.h> #include <linux/interrupt.h> +#include <hyperv/hvhdk.h> #include "hv_trace.h" @@ -32,6 +33,7 @@ */ #define HV_UTIL_NEGO_TIMEOUT 55 +void vmbus_isr(void); /* Definitions for the monitored notification facility */ union hv_monitor_trigger_group { @@ -120,10 +122,35 @@ enum { * Per cpu state for channel handling */ struct hv_per_cpu_context { - void *synic_message_page; - void *synic_event_page; /* - * buffer to post messages to the host. + * SynIC pages for communicating with the host. + * + * These pages are accessible to the host partition and the hypervisor. + * They may be used for exchanging data with the host partition and the + * hypervisor even when they aren't trusted yet the guest partition + * must be prepared to handle the malicious behavior. + */ + void *hyp_synic_message_page; + void *hyp_synic_event_page; + /* + * SynIC pages for communicating with the paravisor. + * + * These pages may be accessed from within the guest partition only in + * CoCo VMs. Neither the host partition nor the hypervisor can access + * these pages in that case; they are used for exchanging data with the + * paravisor. + */ + void *para_synic_message_page; + void *para_synic_event_page; + + /* + * The page is only used in hv_post_message() for a TDX VM (with the + * paravisor) to post a messages to Hyper-V: when such a VM calls + * HVCALL_POST_MESSAGE, it can't use the hyperv_pcpu_input_arg (which + * is encrypted in such a VM) as the hypercall input page, because + * the input page for HVCALL_POST_MESSAGE must be decrypted in such a + * VM, so post_msg_page (which is decrypted in hv_synic_alloc()) is + * introduced for this purpose. See hyperv_init() for more comments. 
*/ void *post_msg_page; @@ -164,10 +191,10 @@ extern int hv_synic_alloc(void); extern void hv_synic_free(void); -extern void hv_synic_enable_regs(unsigned int cpu); +extern void hv_hyp_synic_enable_regs(unsigned int cpu); extern int hv_synic_init(unsigned int cpu); -extern void hv_synic_disable_regs(unsigned int cpu); +extern void hv_hyp_synic_disable_regs(unsigned int cpu); extern int hv_synic_cleanup(unsigned int cpu); /* Interface */ @@ -175,7 +202,8 @@ extern int hv_synic_cleanup(unsigned int cpu); void hv_ringbuffer_pre_init(struct vmbus_channel *channel); int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info, - struct page *pages, u32 pagecnt, u32 max_pkt_size); + struct page *pages, u32 pagecnt, u32 max_pkt_size, + bool confidential); void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info); @@ -241,8 +269,6 @@ struct vmbus_connection { * is child->parent notification */ struct hv_monitor_page *monitor_pages[2]; - void *monitor_pages_original[2]; - phys_addr_t monitor_pages_pa[2]; struct list_head chn_msg_list; spinlock_t channelmsg_lock; @@ -282,18 +308,10 @@ struct vmbus_connection { struct completion ready_for_suspend_event; /* - * The number of primary channels that should be "fixed up" - * upon resume: these channels are re-offered upon resume, and some - * fields of the channel offers (i.e. child_relid and connection_id) - * can change, so the old offermsg must be fixed up, before the resume - * callbacks of the VSC drivers start to further touch the channels. + * Completed once the host has offered all boot-time channels. + * Note that some channels may still be under process on a workqueue. */ - atomic_t nr_chan_fixup_on_resume; - /* - * vmbus_bus_resume() waits for "nr_chan_fixup_on_resume" to - * drop to zero. - */ - struct completion ready_for_resume_event; + struct completion all_offers_delivered_event; }; @@ -336,6 +354,51 @@ extern const struct vmbus_channel_message_table_entry /* General vmbus interface */ +bool vmbus_is_confidential(void); + +#if IS_ENABLED(CONFIG_HYPERV_VMBUS) +/* Free the message slot and signal end-of-message if required */ +static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) +{ + /* + * On crash we're reading some other CPU's message page and we need + * to be careful: this other CPU may already had cleared the header + * and the host may already had delivered some other message there. + * In case we blindly write msg->header.message_type we're going + * to lose it. We can still lose a message of the same type but + * we count on the fact that there can only be one + * CHANNELMSG_UNLOAD_RESPONSE and we don't care about other messages + * on crash. + */ + if (cmpxchg(&msg->header.message_type, old_msg_type, + HVMSG_NONE) != old_msg_type) + return; + + /* + * The cmxchg() above does an implicit memory barrier to + * ensure the write to MessageType (ie set to + * HVMSG_NONE) happens before we read the + * MessagePending and EOMing. 
Otherwise, the EOMing + * will not deliver any more messages since there is + * no empty slot + */ + if (msg->header.message_flags.msg_pending) { + /* + * This will cause message queue rescan to + * possibly deliver another msg from the + * hypervisor + */ + if (vmbus_is_confidential()) + hv_para_set_synic_register(HV_MSR_EOM, 0); + else + hv_set_msr(HV_MSR_EOM, 0); + } +} + +extern int vmbus_interrupt; +extern int vmbus_irq; +#endif /* CONFIG_HYPERV_VMBUS */ + struct hv_device *vmbus_device_create(const guid_t *type, const guid_t *instance, struct vmbus_channel *channel); @@ -365,22 +428,18 @@ void vmbus_on_event(unsigned long data); void vmbus_on_msg_dpc(unsigned long data); int hv_kvp_init(struct hv_util_service *srv); +int hv_kvp_init_transport(void); void hv_kvp_deinit(void); int hv_kvp_pre_suspend(void); int hv_kvp_pre_resume(void); void hv_kvp_onchannelcallback(void *context); int hv_vss_init(struct hv_util_service *srv); +int hv_vss_init_transport(void); void hv_vss_deinit(void); int hv_vss_pre_suspend(void); int hv_vss_pre_resume(void); void hv_vss_onchannelcallback(void *context); - -int hv_fcopy_init(struct hv_util_service *srv); -void hv_fcopy_deinit(void); -int hv_fcopy_pre_suspend(void); -int hv_fcopy_pre_resume(void); -void hv_fcopy_onchannelcallback(void *context); void vmbus_initiate_unload(bool crash); static inline void hv_poll_channel(struct vmbus_channel *channel, @@ -412,6 +471,11 @@ static inline bool hv_is_perf_channel(struct vmbus_channel *channel) return vmbus_devs[channel->device_id].perf_device; } +static inline size_t hv_dev_ring_size(struct vmbus_channel *channel) +{ + return vmbus_devs[channel->device_id].pref_ring_size; +} + static inline bool hv_is_allocated_cpu(unsigned int cpu) { struct vmbus_channel *channel, *sc; @@ -479,4 +543,10 @@ static inline int hv_debug_add_dev_dir(struct hv_device *dev) #endif /* CONFIG_HYPERV_TESTING */ +/* Create and remove sysfs entry for memory mapped ring buffers for a channel */ +int hv_create_ring_sysfs(struct vmbus_channel *channel, + int (*hv_mmap_ring_buffer)(struct vmbus_channel *channel, + struct vm_area_struct *vma)); +int hv_remove_ring_sysfs(struct vmbus_channel *channel); + #endif /* _HYPERV_VMBUS_H */ diff --git a/drivers/hv/mshv.h b/drivers/hv/mshv.h new file mode 100644 index 000000000000..d4813df92b9c --- /dev/null +++ b/drivers/hv/mshv.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2023, Microsoft Corporation. + */ + +#ifndef _MSHV_H_ +#define _MSHV_H_ + +#include <linux/stddef.h> +#include <linux/string.h> +#include <hyperv/hvhdk.h> + +#define mshv_field_nonzero(STRUCT, MEMBER) \ + memchr_inv(&((STRUCT).MEMBER), \ + 0, sizeof_field(typeof(STRUCT), MEMBER)) + +int hv_call_get_vp_registers(u32 vp_index, u64 partition_id, u16 count, + union hv_input_vtl input_vtl, + struct hv_register_assoc *registers); + +int hv_call_set_vp_registers(u32 vp_index, u64 partition_id, u16 count, + union hv_input_vtl input_vtl, + struct hv_register_assoc *registers); + +int hv_call_get_partition_property(u64 partition_id, u64 property_code, + u64 *property_value); + +#endif /* _MSHV_H */ diff --git a/drivers/hv/mshv_common.c b/drivers/hv/mshv_common.c new file mode 100644 index 000000000000..58027b23c206 --- /dev/null +++ b/drivers/hv/mshv_common.c @@ -0,0 +1,239 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024, Microsoft Corporation. + * + * This file contains functions that will be called from one or more modules. 
+ * If any of these modules are configured to build, this file is built and just + * statically linked in. + * + * Authors: Microsoft Linux virtualization team + */ + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <asm/mshyperv.h> +#include <linux/resume_user_mode.h> +#include <linux/export.h> +#include <linux/acpi.h> +#include <linux/notifier.h> +#include <linux/reboot.h> + +#include "mshv.h" + +#define HV_GET_REGISTER_BATCH_SIZE \ + (HV_HYP_PAGE_SIZE / sizeof(union hv_register_value)) +#define HV_SET_REGISTER_BATCH_SIZE \ + ((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_set_vp_registers)) \ + / sizeof(struct hv_register_assoc)) + +int hv_call_get_vp_registers(u32 vp_index, u64 partition_id, u16 count, + union hv_input_vtl input_vtl, + struct hv_register_assoc *registers) +{ + struct hv_input_get_vp_registers *input_page; + union hv_register_value *output_page; + u16 completed = 0; + unsigned long remaining = count; + int rep_count, i; + u64 status = HV_STATUS_SUCCESS; + unsigned long flags; + + local_irq_save(flags); + + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); + output_page = *this_cpu_ptr(hyperv_pcpu_output_arg); + + input_page->partition_id = partition_id; + input_page->vp_index = vp_index; + input_page->input_vtl.as_uint8 = input_vtl.as_uint8; + input_page->rsvd_z8 = 0; + input_page->rsvd_z16 = 0; + + while (remaining) { + rep_count = min(remaining, HV_GET_REGISTER_BATCH_SIZE); + for (i = 0; i < rep_count; ++i) + input_page->names[i] = registers[i].name; + + status = hv_do_rep_hypercall(HVCALL_GET_VP_REGISTERS, rep_count, + 0, input_page, output_page); + if (!hv_result_success(status)) + break; + + completed = hv_repcomp(status); + for (i = 0; i < completed; ++i) + registers[i].value = output_page[i]; + + registers += completed; + remaining -= completed; + } + local_irq_restore(flags); + + return hv_result_to_errno(status); +} +EXPORT_SYMBOL_GPL(hv_call_get_vp_registers); + +int hv_call_set_vp_registers(u32 vp_index, u64 partition_id, u16 count, + union hv_input_vtl input_vtl, + struct hv_register_assoc *registers) +{ + struct hv_input_set_vp_registers *input_page; + u16 completed = 0; + unsigned long remaining = count; + int rep_count; + u64 status = HV_STATUS_SUCCESS; + unsigned long flags; + + local_irq_save(flags); + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); + + input_page->partition_id = partition_id; + input_page->vp_index = vp_index; + input_page->input_vtl.as_uint8 = input_vtl.as_uint8; + input_page->rsvd_z8 = 0; + input_page->rsvd_z16 = 0; + + while (remaining) { + rep_count = min(remaining, HV_SET_REGISTER_BATCH_SIZE); + memcpy(input_page->elements, registers, + sizeof(struct hv_register_assoc) * rep_count); + + status = hv_do_rep_hypercall(HVCALL_SET_VP_REGISTERS, rep_count, + 0, input_page, NULL); + if (!hv_result_success(status)) + break; + + completed = hv_repcomp(status); + registers += completed; + remaining -= completed; + } + + local_irq_restore(flags); + + return hv_result_to_errno(status); +} +EXPORT_SYMBOL_GPL(hv_call_set_vp_registers); + +int hv_call_get_partition_property(u64 partition_id, + u64 property_code, + u64 *property_value) +{ + u64 status; + unsigned long flags; + struct hv_input_get_partition_property *input; + struct hv_output_get_partition_property *output; + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + memset(input, 0, sizeof(*input)); + input->partition_id = partition_id; + input->property_code = property_code; + status = 
hv_do_hypercall(HVCALL_GET_PARTITION_PROPERTY, input, output); + + if (!hv_result_success(status)) { + local_irq_restore(flags); + return hv_result_to_errno(status); + } + *property_value = output->property_value; + + local_irq_restore(flags); + + return 0; +} +EXPORT_SYMBOL_GPL(hv_call_get_partition_property); + +/* + * Corresponding sleep states have to be initialized in order for a subsequent + * HVCALL_ENTER_SLEEP_STATE call to succeed. Currently only S5 state as per + * ACPI 6.4 chapter 7.4.2 is relevant, while S1, S2 and S3 can be supported. + * + * In order to pass proper PM values to mshv, ACPI should be initialized and + * should support S5 sleep state when this method is invoked. + */ +static int hv_initialize_sleep_states(void) +{ + u64 status; + unsigned long flags; + struct hv_input_set_system_property *in; + acpi_status acpi_status; + u8 sleep_type_a, sleep_type_b; + + if (!acpi_sleep_state_supported(ACPI_STATE_S5)) { + pr_err("%s: S5 sleep state not supported.\n", __func__); + return -ENODEV; + } + + acpi_status = acpi_get_sleep_type_data(ACPI_STATE_S5, &sleep_type_a, + &sleep_type_b); + if (ACPI_FAILURE(acpi_status)) + return -ENODEV; + + local_irq_save(flags); + in = *this_cpu_ptr(hyperv_pcpu_input_arg); + memset(in, 0, sizeof(*in)); + + in->property_id = HV_SYSTEM_PROPERTY_SLEEP_STATE; + in->set_sleep_state_info.sleep_state = HV_SLEEP_STATE_S5; + in->set_sleep_state_info.pm1a_slp_typ = sleep_type_a; + in->set_sleep_state_info.pm1b_slp_typ = sleep_type_b; + + status = hv_do_hypercall(HVCALL_SET_SYSTEM_PROPERTY, in, NULL); + local_irq_restore(flags); + + if (!hv_result_success(status)) { + hv_status_err(status, "\n"); + return hv_result_to_errno(status); + } + + return 0; +} + +/* + * This notifier initializes sleep states in mshv hypervisor which will be + * used during power off. + */ +static int hv_reboot_notifier_handler(struct notifier_block *this, + unsigned long code, void *another) +{ + int ret = 0; + + if (code == SYS_HALT || code == SYS_POWER_OFF) + ret = hv_initialize_sleep_states(); + + return ret ? NOTIFY_DONE : NOTIFY_OK; +} + +static struct notifier_block hv_reboot_notifier = { + .notifier_call = hv_reboot_notifier_handler, +}; + +void hv_sleep_notifiers_register(void) +{ + int ret; + + ret = register_reboot_notifier(&hv_reboot_notifier); + if (ret) + pr_err("%s: cannot register reboot notifier %d\n", __func__, + ret); +} + +/* + * Power off the machine by entering S5 sleep state via Hyper-V hypercall. + * This call does not return if successful. + */ +void hv_machine_power_off(void) +{ + unsigned long flags; + struct hv_input_enter_sleep_state *in; + + local_irq_save(flags); + in = *this_cpu_ptr(hyperv_pcpu_input_arg); + in->sleep_state = HV_SLEEP_STATE_S5; + + (void)hv_do_hypercall(HVCALL_ENTER_SLEEP_STATE, in, NULL); + local_irq_restore(flags); + + /* should never reach here */ + BUG(); + +} diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c new file mode 100644 index 000000000000..d93a18f09c76 --- /dev/null +++ b/drivers/hv/mshv_eventfd.c @@ -0,0 +1,839 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * eventfd support for mshv + * + * Heavily inspired from KVM implementation of irqfd/ioeventfd. The basic + * framework code is taken from the kvm implementation. + * + * All credits to kvm developers. 
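+ *
+ * irqfd: deliver an interrupt into the guest partition when a host-side
+ * eventfd is signalled.
+ * ioeventfd: signal a host-side eventfd when the guest writes to a
+ * registered MMIO (doorbell) address.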
+ */ + +#include <linux/syscalls.h> +#include <linux/wait.h> +#include <linux/poll.h> +#include <linux/file.h> +#include <linux/list.h> +#include <linux/workqueue.h> +#include <linux/eventfd.h> + +#if IS_ENABLED(CONFIG_X86_64) +#include <asm/apic.h> +#endif +#include <asm/mshyperv.h> + +#include "mshv_eventfd.h" +#include "mshv.h" +#include "mshv_root.h" + +static struct workqueue_struct *irqfd_cleanup_wq; + +void mshv_register_irq_ack_notifier(struct mshv_partition *partition, + struct mshv_irq_ack_notifier *mian) +{ + mutex_lock(&partition->pt_irq_lock); + hlist_add_head_rcu(&mian->link, &partition->irq_ack_notifier_list); + mutex_unlock(&partition->pt_irq_lock); +} + +void mshv_unregister_irq_ack_notifier(struct mshv_partition *partition, + struct mshv_irq_ack_notifier *mian) +{ + mutex_lock(&partition->pt_irq_lock); + hlist_del_init_rcu(&mian->link); + mutex_unlock(&partition->pt_irq_lock); + synchronize_rcu(); +} + +bool mshv_notify_acked_gsi(struct mshv_partition *partition, int gsi) +{ + struct mshv_irq_ack_notifier *mian; + bool acked = false; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mian, &partition->irq_ack_notifier_list, + link) { + if (mian->irq_ack_gsi == gsi) { + mian->irq_acked(mian); + acked = true; + } + } + rcu_read_unlock(); + + return acked; +} + +#if IS_ENABLED(CONFIG_ARM64) +static inline bool hv_should_clear_interrupt(enum hv_interrupt_type type) +{ + return false; +} +#elif IS_ENABLED(CONFIG_X86_64) +static inline bool hv_should_clear_interrupt(enum hv_interrupt_type type) +{ + return type == HV_X64_INTERRUPT_TYPE_EXTINT; +} +#endif + +static void mshv_irqfd_resampler_ack(struct mshv_irq_ack_notifier *mian) +{ + struct mshv_irqfd_resampler *resampler; + struct mshv_partition *partition; + struct mshv_irqfd *irqfd; + int idx; + + resampler = container_of(mian, struct mshv_irqfd_resampler, + rsmplr_notifier); + partition = resampler->rsmplr_partn; + + idx = srcu_read_lock(&partition->pt_irq_srcu); + + hlist_for_each_entry_rcu(irqfd, &resampler->rsmplr_irqfd_list, + irqfd_resampler_hnode) { + if (hv_should_clear_interrupt(irqfd->irqfd_lapic_irq.lapic_control.interrupt_type)) + hv_call_clear_virtual_interrupt(partition->pt_id); + + eventfd_signal(irqfd->irqfd_resamplefd); + } + + srcu_read_unlock(&partition->pt_irq_srcu, idx); +} + +#if IS_ENABLED(CONFIG_X86_64) +static bool +mshv_vp_irq_vector_injected(union hv_vp_register_page_interrupt_vectors iv, + u32 vector) +{ + int i; + + for (i = 0; i < iv.vector_count; i++) { + if (iv.vector[i] == vector) + return true; + } + + return false; +} + +static int mshv_vp_irq_try_set_vector(struct mshv_vp *vp, u32 vector) +{ + union hv_vp_register_page_interrupt_vectors iv, new_iv; + + iv = vp->vp_register_page->interrupt_vectors; + new_iv = iv; + + if (mshv_vp_irq_vector_injected(iv, vector)) + return 0; + + if (iv.vector_count >= HV_VP_REGISTER_PAGE_MAX_VECTOR_COUNT) + return -ENOSPC; + + new_iv.vector[new_iv.vector_count++] = vector; + + if (cmpxchg(&vp->vp_register_page->interrupt_vectors.as_uint64, + iv.as_uint64, new_iv.as_uint64) != iv.as_uint64) + return -EAGAIN; + + return 0; +} + +static int mshv_vp_irq_set_vector(struct mshv_vp *vp, u32 vector) +{ + int ret; + + do { + ret = mshv_vp_irq_try_set_vector(vp, vector); + } while (ret == -EAGAIN && !need_resched()); + + return ret; +} + +/* + * Try to raise irq for guest via shared vector array. hyp does the actual + * inject of the interrupt. 
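+ * Returns 0 on success, or a negative error such as -EOPNOTSUPP when the
+ * fast path is unavailable; the caller then falls back to
+ * mshv_assert_irq_slow().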
+ */ +static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd) +{ + struct mshv_partition *partition = irqfd->irqfd_partn; + struct mshv_lapic_irq *irq = &irqfd->irqfd_lapic_irq; + struct mshv_vp *vp; + + if (!(ms_hyperv.ext_features & + HV_VP_DISPATCH_INTERRUPT_INJECTION_AVAILABLE)) + return -EOPNOTSUPP; + + if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT) + return -EOPNOTSUPP; + +#if IS_ENABLED(CONFIG_X86) + if (irq->lapic_control.logical_dest_mode) + return -EOPNOTSUPP; +#endif + + vp = partition->pt_vp_array[irq->lapic_apic_id]; + + if (!vp->vp_register_page) + return -EOPNOTSUPP; + + if (mshv_vp_irq_set_vector(vp, irq->lapic_vector)) + return -EINVAL; + + if (vp->run.flags.root_sched_dispatched && + vp->vp_register_page->interrupt_vectors.as_uint64) + return -EBUSY; + + wake_up(&vp->run.vp_suspend_queue); + + return 0; +} +#else /* CONFIG_X86_64 */ +static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd) +{ + return -EOPNOTSUPP; +} +#endif + +static void mshv_assert_irq_slow(struct mshv_irqfd *irqfd) +{ + struct mshv_partition *partition = irqfd->irqfd_partn; + struct mshv_lapic_irq *irq = &irqfd->irqfd_lapic_irq; + unsigned int seq; + int idx; + +#if IS_ENABLED(CONFIG_X86) + WARN_ON(irqfd->irqfd_resampler && + !irq->lapic_control.level_triggered); +#endif + + idx = srcu_read_lock(&partition->pt_irq_srcu); + if (irqfd->irqfd_girq_ent.guest_irq_num) { + if (!irqfd->irqfd_girq_ent.girq_entry_valid) { + srcu_read_unlock(&partition->pt_irq_srcu, idx); + return; + } + + do { + seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc); + } while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq)); + } + + hv_call_assert_virtual_interrupt(irqfd->irqfd_partn->pt_id, + irq->lapic_vector, irq->lapic_apic_id, + irq->lapic_control); + srcu_read_unlock(&partition->pt_irq_srcu, idx); +} + +static void mshv_irqfd_resampler_shutdown(struct mshv_irqfd *irqfd) +{ + struct mshv_irqfd_resampler *rp = irqfd->irqfd_resampler; + struct mshv_partition *pt = rp->rsmplr_partn; + + mutex_lock(&pt->irqfds_resampler_lock); + + hlist_del_rcu(&irqfd->irqfd_resampler_hnode); + synchronize_srcu(&pt->pt_irq_srcu); + + if (hlist_empty(&rp->rsmplr_irqfd_list)) { + hlist_del(&rp->rsmplr_hnode); + mshv_unregister_irq_ack_notifier(pt, &rp->rsmplr_notifier); + kfree(rp); + } + + mutex_unlock(&pt->irqfds_resampler_lock); +} + +/* + * Race-free decouple logic (ordering is critical) + */ +static void mshv_irqfd_shutdown(struct work_struct *work) +{ + struct mshv_irqfd *irqfd = + container_of(work, struct mshv_irqfd, irqfd_shutdown); + + /* + * Synchronize with the wait-queue and unhook ourselves to prevent + * further events. 
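+ * mshv_irqfd_deactivate() already removed the irqfd from the partition's
+ * list before queueing this work; here the wait-queue hook, the resampler
+ * linkage and the eventfd references are released.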
+ */ + remove_wait_queue(irqfd->irqfd_wqh, &irqfd->irqfd_wait); + + if (irqfd->irqfd_resampler) { + mshv_irqfd_resampler_shutdown(irqfd); + eventfd_ctx_put(irqfd->irqfd_resamplefd); + } + + /* + * It is now safe to release the object's resources + */ + eventfd_ctx_put(irqfd->irqfd_eventfd_ctx); + kfree(irqfd); +} + +/* assumes partition->pt_irqfds_lock is held */ +static bool mshv_irqfd_is_active(struct mshv_irqfd *irqfd) +{ + return !hlist_unhashed(&irqfd->irqfd_hnode); +} + +/* + * Mark the irqfd as inactive and schedule it for removal + * + * assumes partition->pt_irqfds_lock is held + */ +static void mshv_irqfd_deactivate(struct mshv_irqfd *irqfd) +{ + if (!mshv_irqfd_is_active(irqfd)) + return; + + hlist_del(&irqfd->irqfd_hnode); + + queue_work(irqfd_cleanup_wq, &irqfd->irqfd_shutdown); +} + +/* + * Called with wqh->lock held and interrupts disabled + */ +static int mshv_irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode, + int sync, void *key) +{ + struct mshv_irqfd *irqfd = container_of(wait, struct mshv_irqfd, + irqfd_wait); + unsigned long flags = (unsigned long)key; + int idx; + unsigned int seq; + struct mshv_partition *pt = irqfd->irqfd_partn; + int ret = 0; + + if (flags & POLLIN) { + u64 cnt; + + eventfd_ctx_do_read(irqfd->irqfd_eventfd_ctx, &cnt); + idx = srcu_read_lock(&pt->pt_irq_srcu); + do { + seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc); + } while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq)); + + /* An event has been signaled, raise an interrupt */ + ret = mshv_try_assert_irq_fast(irqfd); + if (ret) + mshv_assert_irq_slow(irqfd); + + srcu_read_unlock(&pt->pt_irq_srcu, idx); + + ret = 1; + } + + if (flags & POLLHUP) { + /* The eventfd is closing, detach from the partition */ + unsigned long flags; + + spin_lock_irqsave(&pt->pt_irqfds_lock, flags); + + /* + * We must check if someone deactivated the irqfd before + * we could acquire the pt_irqfds_lock since the item is + * deactivated from the mshv side before it is unhooked from + * the wait-queue. If it is already deactivated, we can + * simply return knowing the other side will cleanup for us. + * We cannot race against the irqfd going away since the + * other side is required to acquire wqh->lock, which we hold + */ + if (mshv_irqfd_is_active(irqfd)) + mshv_irqfd_deactivate(irqfd); + + spin_unlock_irqrestore(&pt->pt_irqfds_lock, flags); + } + + return ret; +} + +/* Must be called under pt_irqfds_lock */ +static void mshv_irqfd_update(struct mshv_partition *pt, + struct mshv_irqfd *irqfd) +{ + write_seqcount_begin(&irqfd->irqfd_irqe_sc); + irqfd->irqfd_girq_ent = mshv_ret_girq_entry(pt, + irqfd->irqfd_irqnum); + mshv_copy_girq_info(&irqfd->irqfd_girq_ent, &irqfd->irqfd_lapic_irq); + write_seqcount_end(&irqfd->irqfd_irqe_sc); +} + +void mshv_irqfd_routing_update(struct mshv_partition *pt) +{ + struct mshv_irqfd *irqfd; + + spin_lock_irq(&pt->pt_irqfds_lock); + hlist_for_each_entry(irqfd, &pt->pt_irqfds_list, irqfd_hnode) + mshv_irqfd_update(pt, irqfd); + spin_unlock_irq(&pt->pt_irqfds_lock); +} + +static void mshv_irqfd_queue_proc(struct file *file, wait_queue_head_t *wqh, + poll_table *polltbl) +{ + struct mshv_irqfd *irqfd = + container_of(polltbl, struct mshv_irqfd, irqfd_polltbl); + + irqfd->irqfd_wqh = wqh; + + /* + * TODO: Ensure there isn't already an exclusive, priority waiter, e.g. + * that the irqfd isn't already bound to another partition. Only the + * first exclusive waiter encountered will be notified, and + * add_wait_queue_priority() doesn't enforce exclusivity. 
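+ * For now the wait entry is only marked exclusive and added with
+ * priority; mshv_irqfd_assign() rejects binding the same eventfd twice
+ * within a partition (-EBUSY), but cross-partition reuse is not checked.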
+ */ + irqfd->irqfd_wait.flags |= WQ_FLAG_EXCLUSIVE; + add_wait_queue_priority(wqh, &irqfd->irqfd_wait); +} + +static int mshv_irqfd_assign(struct mshv_partition *pt, + struct mshv_user_irqfd *args) +{ + struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL; + struct mshv_irqfd *irqfd, *tmp; + unsigned int events; + int ret; + int idx; + + CLASS(fd, f)(args->fd); + + irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL); + if (!irqfd) + return -ENOMEM; + + irqfd->irqfd_partn = pt; + irqfd->irqfd_irqnum = args->gsi; + INIT_WORK(&irqfd->irqfd_shutdown, mshv_irqfd_shutdown); + seqcount_spinlock_init(&irqfd->irqfd_irqe_sc, &pt->pt_irqfds_lock); + + if (fd_empty(f)) { + ret = -EBADF; + goto out; + } + + eventfd = eventfd_ctx_fileget(fd_file(f)); + if (IS_ERR(eventfd)) { + ret = PTR_ERR(eventfd); + goto fail; + } + + irqfd->irqfd_eventfd_ctx = eventfd; + + if (args->flags & BIT(MSHV_IRQFD_BIT_RESAMPLE)) { + struct mshv_irqfd_resampler *rp; + + resamplefd = eventfd_ctx_fdget(args->resamplefd); + if (IS_ERR(resamplefd)) { + ret = PTR_ERR(resamplefd); + goto fail; + } + + irqfd->irqfd_resamplefd = resamplefd; + + mutex_lock(&pt->irqfds_resampler_lock); + + hlist_for_each_entry(rp, &pt->irqfds_resampler_list, + rsmplr_hnode) { + if (rp->rsmplr_notifier.irq_ack_gsi == + irqfd->irqfd_irqnum) { + irqfd->irqfd_resampler = rp; + break; + } + } + + if (!irqfd->irqfd_resampler) { + rp = kzalloc(sizeof(*rp), GFP_KERNEL_ACCOUNT); + if (!rp) { + ret = -ENOMEM; + mutex_unlock(&pt->irqfds_resampler_lock); + goto fail; + } + + rp->rsmplr_partn = pt; + INIT_HLIST_HEAD(&rp->rsmplr_irqfd_list); + rp->rsmplr_notifier.irq_ack_gsi = irqfd->irqfd_irqnum; + rp->rsmplr_notifier.irq_acked = + mshv_irqfd_resampler_ack; + + hlist_add_head(&rp->rsmplr_hnode, + &pt->irqfds_resampler_list); + mshv_register_irq_ack_notifier(pt, + &rp->rsmplr_notifier); + irqfd->irqfd_resampler = rp; + } + + hlist_add_head_rcu(&irqfd->irqfd_resampler_hnode, + &irqfd->irqfd_resampler->rsmplr_irqfd_list); + + mutex_unlock(&pt->irqfds_resampler_lock); + } + + /* + * Install our own custom wake-up handling so we are notified via + * a callback whenever someone signals the underlying eventfd + */ + init_waitqueue_func_entry(&irqfd->irqfd_wait, mshv_irqfd_wakeup); + init_poll_funcptr(&irqfd->irqfd_polltbl, mshv_irqfd_queue_proc); + + spin_lock_irq(&pt->pt_irqfds_lock); +#if IS_ENABLED(CONFIG_X86) + if (args->flags & BIT(MSHV_IRQFD_BIT_RESAMPLE) && + !irqfd->irqfd_lapic_irq.lapic_control.level_triggered) { + /* + * Resample Fd must be for level triggered interrupt + * Otherwise return with failure + */ + spin_unlock_irq(&pt->pt_irqfds_lock); + ret = -EINVAL; + goto fail; + } +#endif + ret = 0; + hlist_for_each_entry(tmp, &pt->pt_irqfds_list, irqfd_hnode) { + if (irqfd->irqfd_eventfd_ctx != tmp->irqfd_eventfd_ctx) + continue; + /* This fd is used for another irq already. */ + ret = -EBUSY; + spin_unlock_irq(&pt->pt_irqfds_lock); + goto fail; + } + + idx = srcu_read_lock(&pt->pt_irq_srcu); + mshv_irqfd_update(pt, irqfd); + hlist_add_head(&irqfd->irqfd_hnode, &pt->pt_irqfds_list); + spin_unlock_irq(&pt->pt_irqfds_lock); + + /* + * Check if there was an event already pending on the eventfd + * before we registered, and trigger it as if we didn't miss it. 
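+ * vfs_poll() hooks irqfd_wait into the eventfd's wait queue via
+ * mshv_irqfd_queue_proc() and also returns any events that are already
+ * pending.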
+ */ + events = vfs_poll(fd_file(f), &irqfd->irqfd_polltbl); + + if (events & POLLIN) + mshv_assert_irq_slow(irqfd); + + srcu_read_unlock(&pt->pt_irq_srcu, idx); + return 0; + +fail: + if (irqfd->irqfd_resampler) + mshv_irqfd_resampler_shutdown(irqfd); + + if (resamplefd && !IS_ERR(resamplefd)) + eventfd_ctx_put(resamplefd); + + if (eventfd && !IS_ERR(eventfd)) + eventfd_ctx_put(eventfd); + +out: + kfree(irqfd); + return ret; +} + +/* + * shutdown any irqfd's that match fd+gsi + */ +static int mshv_irqfd_deassign(struct mshv_partition *pt, + struct mshv_user_irqfd *args) +{ + struct mshv_irqfd *irqfd; + struct hlist_node *n; + struct eventfd_ctx *eventfd; + + eventfd = eventfd_ctx_fdget(args->fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + hlist_for_each_entry_safe(irqfd, n, &pt->pt_irqfds_list, + irqfd_hnode) { + if (irqfd->irqfd_eventfd_ctx == eventfd && + irqfd->irqfd_irqnum == args->gsi) + + mshv_irqfd_deactivate(irqfd); + } + + eventfd_ctx_put(eventfd); + + /* + * Block until we know all outstanding shutdown jobs have completed + * so that we guarantee there will not be any more interrupts on this + * gsi once this deassign function returns. + */ + flush_workqueue(irqfd_cleanup_wq); + + return 0; +} + +int mshv_set_unset_irqfd(struct mshv_partition *pt, + struct mshv_user_irqfd *args) +{ + if (args->flags & ~MSHV_IRQFD_FLAGS_MASK) + return -EINVAL; + + if (args->flags & BIT(MSHV_IRQFD_BIT_DEASSIGN)) + return mshv_irqfd_deassign(pt, args); + + return mshv_irqfd_assign(pt, args); +} + +/* + * This function is called as the mshv VM fd is being released. + * Shutdown all irqfds that still remain open + */ +static void mshv_irqfd_release(struct mshv_partition *pt) +{ + struct mshv_irqfd *irqfd; + struct hlist_node *n; + + spin_lock_irq(&pt->pt_irqfds_lock); + + hlist_for_each_entry_safe(irqfd, n, &pt->pt_irqfds_list, irqfd_hnode) + mshv_irqfd_deactivate(irqfd); + + spin_unlock_irq(&pt->pt_irqfds_lock); + + /* + * Block until we know all outstanding shutdown jobs have completed + * since we do not take a mshv_partition* reference. + */ + flush_workqueue(irqfd_cleanup_wq); +} + +int mshv_irqfd_wq_init(void) +{ + irqfd_cleanup_wq = alloc_workqueue("mshv-irqfd-cleanup", WQ_PERCPU, 0); + if (!irqfd_cleanup_wq) + return -ENOMEM; + + return 0; +} + +void mshv_irqfd_wq_cleanup(void) +{ + destroy_workqueue(irqfd_cleanup_wq); +} + +/* + * -------------------------------------------------------------------- + * ioeventfd: translate a MMIO memory write to an eventfd signal. + * + * userspace can register a MMIO address with an eventfd for receiving + * notification when the memory has been touched. 
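+ *
+ * Each ioeventfd is backed by a hypervisor doorbell: the guest physical
+ * address (and optional data match value) is registered with
+ * mshv_register_doorbell(), and ioeventfd_mmio_write() signals the
+ * eventfd when that doorbell fires.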
+ * -------------------------------------------------------------------- + */ + +static void ioeventfd_release(struct mshv_ioeventfd *p, u64 partition_id) +{ + if (p->iovntfd_doorbell_id > 0) + mshv_unregister_doorbell(partition_id, p->iovntfd_doorbell_id); + eventfd_ctx_put(p->iovntfd_eventfd); + kfree(p); +} + +/* MMIO writes trigger an event if the addr/val match */ +static void ioeventfd_mmio_write(int doorbell_id, void *data) +{ + struct mshv_partition *partition = (struct mshv_partition *)data; + struct mshv_ioeventfd *p; + + rcu_read_lock(); + hlist_for_each_entry_rcu(p, &partition->ioeventfds_list, iovntfd_hnode) + if (p->iovntfd_doorbell_id == doorbell_id) { + eventfd_signal(p->iovntfd_eventfd); + break; + } + + rcu_read_unlock(); +} + +static bool ioeventfd_check_collision(struct mshv_partition *pt, + struct mshv_ioeventfd *p) + __must_hold(&pt->mutex) +{ + struct mshv_ioeventfd *_p; + + hlist_for_each_entry(_p, &pt->ioeventfds_list, iovntfd_hnode) + if (_p->iovntfd_addr == p->iovntfd_addr && + _p->iovntfd_length == p->iovntfd_length && + (_p->iovntfd_wildcard || p->iovntfd_wildcard || + _p->iovntfd_datamatch == p->iovntfd_datamatch)) + return true; + + return false; +} + +static int mshv_assign_ioeventfd(struct mshv_partition *pt, + struct mshv_user_ioeventfd *args) + __must_hold(&pt->mutex) +{ + struct mshv_ioeventfd *p; + struct eventfd_ctx *eventfd; + u64 doorbell_flags = 0; + int ret; + + /* This mutex is currently protecting ioeventfd.items list */ + WARN_ON_ONCE(!mutex_is_locked(&pt->pt_mutex)); + + if (args->flags & BIT(MSHV_IOEVENTFD_BIT_PIO)) + return -EOPNOTSUPP; + + /* must be natural-word sized */ + switch (args->len) { + case 0: + doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_ANY; + break; + case 1: + doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_BYTE; + break; + case 2: + doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_WORD; + break; + case 4: + doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_DWORD; + break; + case 8: + doorbell_flags = HV_DOORBELL_FLAG_TRIGGER_SIZE_QWORD; + break; + default: + return -EINVAL; + } + + /* check for range overflow */ + if (args->addr + args->len < args->addr) + return -EINVAL; + + /* check for extra flags that we don't understand */ + if (args->flags & ~MSHV_IOEVENTFD_FLAGS_MASK) + return -EINVAL; + + eventfd = eventfd_ctx_fdget(args->fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) { + ret = -ENOMEM; + goto fail; + } + + p->iovntfd_addr = args->addr; + p->iovntfd_length = args->len; + p->iovntfd_eventfd = eventfd; + + /* The datamatch feature is optional, otherwise this is a wildcard */ + if (args->flags & BIT(MSHV_IOEVENTFD_BIT_DATAMATCH)) { + p->iovntfd_datamatch = args->datamatch; + } else { + p->iovntfd_wildcard = true; + doorbell_flags |= HV_DOORBELL_FLAG_TRIGGER_ANY_VALUE; + } + + if (ioeventfd_check_collision(pt, p)) { + ret = -EEXIST; + goto unlock_fail; + } + + ret = mshv_register_doorbell(pt->pt_id, ioeventfd_mmio_write, + (void *)pt, p->iovntfd_addr, + p->iovntfd_datamatch, doorbell_flags); + if (ret < 0) + goto unlock_fail; + + p->iovntfd_doorbell_id = ret; + + hlist_add_head_rcu(&p->iovntfd_hnode, &pt->ioeventfds_list); + + return 0; + +unlock_fail: + kfree(p); + +fail: + eventfd_ctx_put(eventfd); + + return ret; +} + +static int mshv_deassign_ioeventfd(struct mshv_partition *pt, + struct mshv_user_ioeventfd *args) + __must_hold(&pt->mutex) +{ + struct mshv_ioeventfd *p; + struct eventfd_ctx *eventfd; + struct hlist_node *n; + int ret = -ENOENT; + + /* This mutex 
is currently protecting ioeventfd.items list */ + WARN_ON_ONCE(!mutex_is_locked(&pt->pt_mutex)); + + eventfd = eventfd_ctx_fdget(args->fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + + hlist_for_each_entry_safe(p, n, &pt->ioeventfds_list, iovntfd_hnode) { + bool wildcard = !(args->flags & BIT(MSHV_IOEVENTFD_BIT_DATAMATCH)); + + if (p->iovntfd_eventfd != eventfd || + p->iovntfd_addr != args->addr || + p->iovntfd_length != args->len || + p->iovntfd_wildcard != wildcard) + continue; + + if (!p->iovntfd_wildcard && + p->iovntfd_datamatch != args->datamatch) + continue; + + hlist_del_rcu(&p->iovntfd_hnode); + synchronize_rcu(); + ioeventfd_release(p, pt->pt_id); + ret = 0; + break; + } + + eventfd_ctx_put(eventfd); + + return ret; +} + +int mshv_set_unset_ioeventfd(struct mshv_partition *pt, + struct mshv_user_ioeventfd *args) + __must_hold(&pt->mutex) +{ + if ((args->flags & ~MSHV_IOEVENTFD_FLAGS_MASK) || + mshv_field_nonzero(*args, rsvd)) + return -EINVAL; + + /* PIO not yet implemented */ + if (args->flags & BIT(MSHV_IOEVENTFD_BIT_PIO)) + return -EOPNOTSUPP; + + if (args->flags & BIT(MSHV_IOEVENTFD_BIT_DEASSIGN)) + return mshv_deassign_ioeventfd(pt, args); + + return mshv_assign_ioeventfd(pt, args); +} + +void mshv_eventfd_init(struct mshv_partition *pt) +{ + spin_lock_init(&pt->pt_irqfds_lock); + INIT_HLIST_HEAD(&pt->pt_irqfds_list); + + INIT_HLIST_HEAD(&pt->irqfds_resampler_list); + mutex_init(&pt->irqfds_resampler_lock); + + INIT_HLIST_HEAD(&pt->ioeventfds_list); +} + +void mshv_eventfd_release(struct mshv_partition *pt) +{ + struct hlist_head items; + struct hlist_node *n; + struct mshv_ioeventfd *p; + + hlist_move_list(&pt->ioeventfds_list, &items); + synchronize_rcu(); + + hlist_for_each_entry_safe(p, n, &items, iovntfd_hnode) { + hlist_del(&p->iovntfd_hnode); + ioeventfd_release(p, pt->pt_id); + } + + mshv_irqfd_release(pt); +} diff --git a/drivers/hv/mshv_eventfd.h b/drivers/hv/mshv_eventfd.h new file mode 100644 index 000000000000..332e7670a344 --- /dev/null +++ b/drivers/hv/mshv_eventfd.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * irqfd: Allows an fd to be used to inject an interrupt to the guest. + * ioeventfd: Allow an fd to be used to receive a signal from the guest. + * All credit goes to kvm developers. + */ + +#ifndef __LINUX_MSHV_EVENTFD_H +#define __LINUX_MSHV_EVENTFD_H + +#include <linux/poll.h> + +#include "mshv.h" +#include "mshv_root.h" + +/* struct to contain list of irqfds sharing an irq. 
Updates are protected by + * partition.irqfds.resampler_lock + */ +struct mshv_irqfd_resampler { + struct mshv_partition *rsmplr_partn; + struct hlist_head rsmplr_irqfd_list; + struct mshv_irq_ack_notifier rsmplr_notifier; + struct hlist_node rsmplr_hnode; +}; + +struct mshv_irqfd { + struct mshv_partition *irqfd_partn; + struct eventfd_ctx *irqfd_eventfd_ctx; + struct mshv_guest_irq_ent irqfd_girq_ent; + seqcount_spinlock_t irqfd_irqe_sc; + u32 irqfd_irqnum; + struct mshv_lapic_irq irqfd_lapic_irq; + struct hlist_node irqfd_hnode; + poll_table irqfd_polltbl; + wait_queue_head_t *irqfd_wqh; + wait_queue_entry_t irqfd_wait; + struct work_struct irqfd_shutdown; + struct mshv_irqfd_resampler *irqfd_resampler; + struct eventfd_ctx *irqfd_resamplefd; + struct hlist_node irqfd_resampler_hnode; +}; + +void mshv_eventfd_init(struct mshv_partition *partition); +void mshv_eventfd_release(struct mshv_partition *partition); + +void mshv_register_irq_ack_notifier(struct mshv_partition *partition, + struct mshv_irq_ack_notifier *mian); +void mshv_unregister_irq_ack_notifier(struct mshv_partition *partition, + struct mshv_irq_ack_notifier *mian); +bool mshv_notify_acked_gsi(struct mshv_partition *partition, int gsi); + +int mshv_set_unset_irqfd(struct mshv_partition *partition, + struct mshv_user_irqfd *args); + +int mshv_irqfd_wq_init(void); +void mshv_irqfd_wq_cleanup(void); + +struct mshv_ioeventfd { + struct hlist_node iovntfd_hnode; + u64 iovntfd_addr; + int iovntfd_length; + struct eventfd_ctx *iovntfd_eventfd; + u64 iovntfd_datamatch; + int iovntfd_doorbell_id; + bool iovntfd_wildcard; +}; + +int mshv_set_unset_ioeventfd(struct mshv_partition *pt, + struct mshv_user_ioeventfd *args); + +#endif /* __LINUX_MSHV_EVENTFD_H */ diff --git a/drivers/hv/mshv_irq.c b/drivers/hv/mshv_irq.c new file mode 100644 index 000000000000..798e7e1ab06e --- /dev/null +++ b/drivers/hv/mshv_irq.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2023, Microsoft Corporation. + * + * Authors: Microsoft Linux virtualization team + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <asm/mshyperv.h> + +#include "mshv_eventfd.h" +#include "mshv.h" +#include "mshv_root.h" + +/* called from the ioctl code, user wants to update the guest irq table */ +int mshv_update_routing_table(struct mshv_partition *partition, + const struct mshv_user_irq_entry *ue, + unsigned int numents) +{ + struct mshv_girq_routing_table *new = NULL, *old; + u32 i, nr_rt_entries = 0; + int r = 0; + + if (numents == 0) + goto swap_routes; + + for (i = 0; i < numents; i++) { + if (ue[i].gsi >= MSHV_MAX_GUEST_IRQS) + return -EINVAL; + + if (ue[i].address_hi) + return -EINVAL; + + nr_rt_entries = max(nr_rt_entries, ue[i].gsi); + } + nr_rt_entries += 1; + + new = kzalloc(struct_size(new, mshv_girq_info_tbl, nr_rt_entries), + GFP_KERNEL_ACCOUNT); + if (!new) + return -ENOMEM; + + new->num_rt_entries = nr_rt_entries; + for (i = 0; i < numents; i++) { + struct mshv_guest_irq_ent *girq; + + girq = &new->mshv_girq_info_tbl[ue[i].gsi]; + + /* + * Allow only one to one mapping between GSI and MSI routing. 
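+ * Duplicate GSIs in the input table are rejected with -EINVAL below.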
+ */ + if (girq->guest_irq_num != 0) { + r = -EINVAL; + goto out; + } + + girq->guest_irq_num = ue[i].gsi; + girq->girq_addr_lo = ue[i].address_lo; + girq->girq_addr_hi = ue[i].address_hi; + girq->girq_irq_data = ue[i].data; + girq->girq_entry_valid = true; + } + +swap_routes: + mutex_lock(&partition->pt_irq_lock); + old = rcu_dereference_protected(partition->pt_girq_tbl, 1); + rcu_assign_pointer(partition->pt_girq_tbl, new); + mshv_irqfd_routing_update(partition); + mutex_unlock(&partition->pt_irq_lock); + + synchronize_srcu_expedited(&partition->pt_irq_srcu); + new = old; + +out: + kfree(new); + + return r; +} + +/* vm is going away, kfree the irq routing table */ +void mshv_free_routing_table(struct mshv_partition *partition) +{ + struct mshv_girq_routing_table *rt = + rcu_access_pointer(partition->pt_girq_tbl); + + kfree(rt); +} + +struct mshv_guest_irq_ent +mshv_ret_girq_entry(struct mshv_partition *partition, u32 irqnum) +{ + struct mshv_guest_irq_ent entry = { 0 }; + struct mshv_girq_routing_table *girq_tbl; + + girq_tbl = srcu_dereference_check(partition->pt_girq_tbl, + &partition->pt_irq_srcu, + lockdep_is_held(&partition->pt_irq_lock)); + if (!girq_tbl || irqnum >= girq_tbl->num_rt_entries) { + /* + * Premature register_irqfd, setting valid_entry = 0 + * would ignore this entry anyway + */ + entry.guest_irq_num = irqnum; + return entry; + } + + return girq_tbl->mshv_girq_info_tbl[irqnum]; +} + +void mshv_copy_girq_info(struct mshv_guest_irq_ent *ent, + struct mshv_lapic_irq *lirq) +{ + memset(lirq, 0, sizeof(*lirq)); + if (!ent || !ent->girq_entry_valid) + return; + + lirq->lapic_vector = ent->girq_irq_data & 0xFF; + lirq->lapic_apic_id = (ent->girq_addr_lo >> 12) & 0xFF; + lirq->lapic_control.interrupt_type = (ent->girq_irq_data & 0x700) >> 8; +#if IS_ENABLED(CONFIG_X86) + lirq->lapic_control.level_triggered = (ent->girq_irq_data >> 15) & 0x1; + lirq->lapic_control.logical_dest_mode = (ent->girq_addr_lo >> 2) & 0x1; +#elif IS_ENABLED(CONFIG_ARM64) + lirq->lapic_control.asserted = 1; +#endif +} diff --git a/drivers/hv/mshv_portid_table.c b/drivers/hv/mshv_portid_table.c new file mode 100644 index 000000000000..c349af1f0aaa --- /dev/null +++ b/drivers/hv/mshv_portid_table.c @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/idr.h> +#include <asm/mshyperv.h> + +#include "mshv.h" +#include "mshv_root.h" + +/* + * Ports and connections are hypervisor struct used for inter-partition + * communication. Port represents the source and connection represents + * the destination. Partitions are responsible for managing the port and + * connection ids. 
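+ *
+ * An IDR keyed by port id tracks the struct port_table_info for every
+ * port allocated here; lookups are RCU protected.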
+ * + */ + +#define PORTID_MIN 1 +#define PORTID_MAX INT_MAX + +static DEFINE_IDR(port_table_idr); + +void +mshv_port_table_fini(void) +{ + struct port_table_info *port_info; + unsigned long i, tmp; + + idr_lock(&port_table_idr); + if (!idr_is_empty(&port_table_idr)) { + idr_for_each_entry_ul(&port_table_idr, port_info, tmp, i) { + port_info = idr_remove(&port_table_idr, i); + kfree_rcu(port_info, portbl_rcu); + } + } + idr_unlock(&port_table_idr); +} + +int +mshv_portid_alloc(struct port_table_info *info) +{ + int ret = 0; + + idr_lock(&port_table_idr); + ret = idr_alloc(&port_table_idr, info, PORTID_MIN, + PORTID_MAX, GFP_KERNEL); + idr_unlock(&port_table_idr); + + return ret; +} + +void +mshv_portid_free(int port_id) +{ + struct port_table_info *info; + + idr_lock(&port_table_idr); + info = idr_remove(&port_table_idr, port_id); + WARN_ON(!info); + idr_unlock(&port_table_idr); + + synchronize_rcu(); + kfree(info); +} + +int +mshv_portid_lookup(int port_id, struct port_table_info *info) +{ + struct port_table_info *_info; + int ret = -ENOENT; + + rcu_read_lock(); + _info = idr_find(&port_table_idr, port_id); + rcu_read_unlock(); + + if (_info) { + *info = *_info; + ret = 0; + } + + return ret; +} diff --git a/drivers/hv/mshv_regions.c b/drivers/hv/mshv_regions.c new file mode 100644 index 000000000000..202b9d551e39 --- /dev/null +++ b/drivers/hv/mshv_regions.c @@ -0,0 +1,555 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2025, Microsoft Corporation. + * + * Memory region management for mshv_root module. + * + * Authors: Microsoft Linux virtualization team + */ + +#include <linux/hmm.h> +#include <linux/hyperv.h> +#include <linux/kref.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> + +#include <asm/mshyperv.h> + +#include "mshv_root.h" + +#define MSHV_MAP_FAULT_IN_PAGES PTRS_PER_PMD + +/** + * mshv_region_process_chunk - Processes a contiguous chunk of memory pages + * in a region. + * @region : Pointer to the memory region structure. + * @flags : Flags to pass to the handler. + * @page_offset: Offset into the region's pages array to start processing. + * @page_count : Number of pages to process. + * @handler : Callback function to handle the chunk. + * + * This function scans the region's pages starting from @page_offset, + * checking for contiguous present pages of the same size (normal or huge). + * It invokes @handler for the chunk of contiguous pages found. Returns the + * number of pages handled, or a negative error code if the first page is + * not present or the handler fails. + * + * Note: The @handler callback must be able to handle both normal and huge + * pages. + * + * Return: Number of pages handled, or negative error code. 
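+ * For example, on x86 a present 2M folio at @page_offset is handed to
+ * @handler as a single chunk of 512 contiguous 4K page structs.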
+ */ +static long mshv_region_process_chunk(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, u64 page_count, + int (*handler)(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, + u64 page_count)) +{ + u64 count, stride; + unsigned int page_order; + struct page *page; + int ret; + + page = region->pages[page_offset]; + if (!page) + return -EINVAL; + + page_order = folio_order(page_folio(page)); + /* The hypervisor only supports 4K and 2M page sizes */ + if (page_order && page_order != HPAGE_PMD_ORDER) + return -EINVAL; + + stride = 1 << page_order; + + /* Start at stride since the first page is validated */ + for (count = stride; count < page_count; count += stride) { + page = region->pages[page_offset + count]; + + /* Break if current page is not present */ + if (!page) + break; + + /* Break if page size changes */ + if (page_order != folio_order(page_folio(page))) + break; + } + + ret = handler(region, flags, page_offset, count); + if (ret) + return ret; + + return count; +} + +/** + * mshv_region_process_range - Processes a range of memory pages in a + * region. + * @region : Pointer to the memory region structure. + * @flags : Flags to pass to the handler. + * @page_offset: Offset into the region's pages array to start processing. + * @page_count : Number of pages to process. + * @handler : Callback function to handle each chunk of contiguous + * pages. + * + * Iterates over the specified range of pages in @region, skipping + * non-present pages. For each contiguous chunk of present pages, invokes + * @handler via mshv_region_process_chunk. + * + * Note: The @handler callback must be able to handle both normal and huge + * pages. + * + * Returns 0 on success, or a negative error code on failure. + */ +static int mshv_region_process_range(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, u64 page_count, + int (*handler)(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, + u64 page_count)) +{ + long ret; + + if (page_offset + page_count > region->nr_pages) + return -EINVAL; + + while (page_count) { + /* Skip non-present pages */ + if (!region->pages[page_offset]) { + page_offset++; + page_count--; + continue; + } + + ret = mshv_region_process_chunk(region, flags, + page_offset, + page_count, + handler); + if (ret < 0) + return ret; + + page_offset += ret; + page_count -= ret; + } + + return 0; +} + +struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages, + u64 uaddr, u32 flags) +{ + struct mshv_mem_region *region; + + region = vzalloc(sizeof(*region) + sizeof(struct page *) * nr_pages); + if (!region) + return ERR_PTR(-ENOMEM); + + region->nr_pages = nr_pages; + region->start_gfn = guest_pfn; + region->start_uaddr = uaddr; + region->hv_map_flags = HV_MAP_GPA_READABLE | HV_MAP_GPA_ADJUSTABLE; + if (flags & BIT(MSHV_SET_MEM_BIT_WRITABLE)) + region->hv_map_flags |= HV_MAP_GPA_WRITABLE; + if (flags & BIT(MSHV_SET_MEM_BIT_EXECUTABLE)) + region->hv_map_flags |= HV_MAP_GPA_EXECUTABLE; + + kref_init(®ion->refcount); + + return region; +} + +static int mshv_region_chunk_share(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, u64 page_count) +{ + struct page *page = region->pages[page_offset]; + + if (PageHuge(page) || PageTransCompound(page)) + flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE; + + return hv_call_modify_spa_host_access(region->partition->pt_id, + region->pages + page_offset, + page_count, + HV_MAP_GPA_READABLE | + HV_MAP_GPA_WRITABLE, + flags, true); +} + +int mshv_region_share(struct 
mshv_mem_region *region) +{ + u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_SHARED; + + return mshv_region_process_range(region, flags, + 0, region->nr_pages, + mshv_region_chunk_share); +} + +static int mshv_region_chunk_unshare(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, u64 page_count) +{ + struct page *page = region->pages[page_offset]; + + if (PageHuge(page) || PageTransCompound(page)) + flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE; + + return hv_call_modify_spa_host_access(region->partition->pt_id, + region->pages + page_offset, + page_count, 0, + flags, false); +} + +int mshv_region_unshare(struct mshv_mem_region *region) +{ + u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE; + + return mshv_region_process_range(region, flags, + 0, region->nr_pages, + mshv_region_chunk_unshare); +} + +static int mshv_region_chunk_remap(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, u64 page_count) +{ + struct page *page = region->pages[page_offset]; + + if (PageHuge(page) || PageTransCompound(page)) + flags |= HV_MAP_GPA_LARGE_PAGE; + + return hv_call_map_gpa_pages(region->partition->pt_id, + region->start_gfn + page_offset, + page_count, flags, + region->pages + page_offset); +} + +static int mshv_region_remap_pages(struct mshv_mem_region *region, + u32 map_flags, + u64 page_offset, u64 page_count) +{ + return mshv_region_process_range(region, map_flags, + page_offset, page_count, + mshv_region_chunk_remap); +} + +int mshv_region_map(struct mshv_mem_region *region) +{ + u32 map_flags = region->hv_map_flags; + + return mshv_region_remap_pages(region, map_flags, + 0, region->nr_pages); +} + +static void mshv_region_invalidate_pages(struct mshv_mem_region *region, + u64 page_offset, u64 page_count) +{ + if (region->type == MSHV_REGION_TYPE_MEM_PINNED) + unpin_user_pages(region->pages + page_offset, page_count); + + memset(region->pages + page_offset, 0, + page_count * sizeof(struct page *)); +} + +void mshv_region_invalidate(struct mshv_mem_region *region) +{ + mshv_region_invalidate_pages(region, 0, region->nr_pages); +} + +int mshv_region_pin(struct mshv_mem_region *region) +{ + u64 done_count, nr_pages; + struct page **pages; + __u64 userspace_addr; + int ret; + + for (done_count = 0; done_count < region->nr_pages; done_count += ret) { + pages = region->pages + done_count; + userspace_addr = region->start_uaddr + + done_count * HV_HYP_PAGE_SIZE; + nr_pages = min(region->nr_pages - done_count, + MSHV_PIN_PAGES_BATCH_SIZE); + + /* + * Pinning assuming 4k pages works for large pages too. + * All page structs within the large page are returned. + * + * Pin requests are batched because pin_user_pages_fast + * with the FOLL_LONGTERM flag does a large temporary + * allocation of contiguous memory. 
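+	 * MSHV_PIN_PAGES_BATCH_SIZE works out to 64K page structs,
+	 * i.e. at most 256 MiB is pinned per iteration.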
+ */ + ret = pin_user_pages_fast(userspace_addr, nr_pages, + FOLL_WRITE | FOLL_LONGTERM, + pages); + if (ret < 0) + goto release_pages; + } + + return 0; + +release_pages: + mshv_region_invalidate_pages(region, 0, done_count); + return ret; +} + +static int mshv_region_chunk_unmap(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, u64 page_count) +{ + struct page *page = region->pages[page_offset]; + + if (PageHuge(page) || PageTransCompound(page)) + flags |= HV_UNMAP_GPA_LARGE_PAGE; + + return hv_call_unmap_gpa_pages(region->partition->pt_id, + region->start_gfn + page_offset, + page_count, flags); +} + +static int mshv_region_unmap(struct mshv_mem_region *region) +{ + return mshv_region_process_range(region, 0, + 0, region->nr_pages, + mshv_region_chunk_unmap); +} + +static void mshv_region_destroy(struct kref *ref) +{ + struct mshv_mem_region *region = + container_of(ref, struct mshv_mem_region, refcount); + struct mshv_partition *partition = region->partition; + int ret; + + if (region->type == MSHV_REGION_TYPE_MEM_MOVABLE) + mshv_region_movable_fini(region); + + if (mshv_partition_encrypted(partition)) { + ret = mshv_region_share(region); + if (ret) { + pt_err(partition, + "Failed to regain access to memory, unpinning user pages will fail and crash the host error: %d\n", + ret); + return; + } + } + + mshv_region_unmap(region); + + mshv_region_invalidate(region); + + vfree(region); +} + +void mshv_region_put(struct mshv_mem_region *region) +{ + kref_put(®ion->refcount, mshv_region_destroy); +} + +int mshv_region_get(struct mshv_mem_region *region) +{ + return kref_get_unless_zero(®ion->refcount); +} + +/** + * mshv_region_hmm_fault_and_lock - Handle HMM faults and lock the memory region + * @region: Pointer to the memory region structure + * @range: Pointer to the HMM range structure + * + * This function performs the following steps: + * 1. Reads the notifier sequence for the HMM range. + * 2. Acquires a read lock on the memory map. + * 3. Handles HMM faults for the specified range. + * 4. Releases the read lock on the memory map. + * 5. If successful, locks the memory region mutex. + * 6. Verifies if the notifier sequence has changed during the operation. + * If it has, releases the mutex and returns -EBUSY to match with + * hmm_range_fault() return code for repeating. + * + * Return: 0 on success, a negative error code otherwise. + */ +static int mshv_region_hmm_fault_and_lock(struct mshv_mem_region *region, + struct hmm_range *range) +{ + int ret; + + range->notifier_seq = mmu_interval_read_begin(range->notifier); + mmap_read_lock(region->mni.mm); + ret = hmm_range_fault(range); + mmap_read_unlock(region->mni.mm); + if (ret) + return ret; + + mutex_lock(®ion->mutex); + + if (mmu_interval_read_retry(range->notifier, range->notifier_seq)) { + mutex_unlock(®ion->mutex); + cond_resched(); + return -EBUSY; + } + + return 0; +} + +/** + * mshv_region_range_fault - Handle memory range faults for a given region. + * @region: Pointer to the memory region structure. + * @page_offset: Offset of the page within the region. + * @page_count: Number of pages to handle. + * + * This function resolves memory faults for a specified range of pages + * within a memory region. It uses HMM (Heterogeneous Memory Management) + * to fault in the required pages and updates the region's page array. + * + * Return: 0 on success, negative error code on failure. 
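+ * The fault-and-lock step is retried for as long as it returns -EBUSY,
+ * i.e. whenever an mmu notifier invalidation raced with hmm_range_fault().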
+ */ +static int mshv_region_range_fault(struct mshv_mem_region *region, + u64 page_offset, u64 page_count) +{ + struct hmm_range range = { + .notifier = ®ion->mni, + .default_flags = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE, + }; + unsigned long *pfns; + int ret; + u64 i; + + pfns = kmalloc_array(page_count, sizeof(*pfns), GFP_KERNEL); + if (!pfns) + return -ENOMEM; + + range.hmm_pfns = pfns; + range.start = region->start_uaddr + page_offset * HV_HYP_PAGE_SIZE; + range.end = range.start + page_count * HV_HYP_PAGE_SIZE; + + do { + ret = mshv_region_hmm_fault_and_lock(region, &range); + } while (ret == -EBUSY); + + if (ret) + goto out; + + for (i = 0; i < page_count; i++) + region->pages[page_offset + i] = hmm_pfn_to_page(pfns[i]); + + ret = mshv_region_remap_pages(region, region->hv_map_flags, + page_offset, page_count); + + mutex_unlock(®ion->mutex); +out: + kfree(pfns); + return ret; +} + +bool mshv_region_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn) +{ + u64 page_offset, page_count; + int ret; + + /* Align the page offset to the nearest MSHV_MAP_FAULT_IN_PAGES. */ + page_offset = ALIGN_DOWN(gfn - region->start_gfn, + MSHV_MAP_FAULT_IN_PAGES); + + /* Map more pages than requested to reduce the number of faults. */ + page_count = min(region->nr_pages - page_offset, + MSHV_MAP_FAULT_IN_PAGES); + + ret = mshv_region_range_fault(region, page_offset, page_count); + + WARN_ONCE(ret, + "p%llu: GPA intercept failed: region %#llx-%#llx, gfn %#llx, page_offset %llu, page_count %llu\n", + region->partition->pt_id, region->start_uaddr, + region->start_uaddr + (region->nr_pages << HV_HYP_PAGE_SHIFT), + gfn, page_offset, page_count); + + return !ret; +} + +/** + * mshv_region_interval_invalidate - Invalidate a range of memory region + * @mni: Pointer to the mmu_interval_notifier structure + * @range: Pointer to the mmu_notifier_range structure + * @cur_seq: Current sequence number for the interval notifier + * + * This function invalidates a memory region by remapping its pages with + * no access permissions. It locks the region's mutex to ensure thread safety + * and updates the sequence number for the interval notifier. If the range + * is blockable, it uses a blocking lock; otherwise, it attempts a non-blocking + * lock and returns false if unsuccessful. + * + * NOTE: Failure to invalidate a region is a serious error, as the pages will + * be considered freed while they are still mapped by the hypervisor. + * Any attempt to access such pages will likely crash the system. + * + * Return: true if the region was successfully invalidated, false otherwise. 
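+ * The range is remapped with HV_MAP_GPA_NO_ACCESS rather than unmapped,
+ * so a later GPA fault can repopulate it via mshv_region_handle_gfn_fault().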
+ */ +static bool mshv_region_interval_invalidate(struct mmu_interval_notifier *mni, + const struct mmu_notifier_range *range, + unsigned long cur_seq) +{ + struct mshv_mem_region *region = container_of(mni, + struct mshv_mem_region, + mni); + u64 page_offset, page_count; + unsigned long mstart, mend; + int ret = -EPERM; + + if (mmu_notifier_range_blockable(range)) + mutex_lock(®ion->mutex); + else if (!mutex_trylock(®ion->mutex)) + goto out_fail; + + mmu_interval_set_seq(mni, cur_seq); + + mstart = max(range->start, region->start_uaddr); + mend = min(range->end, region->start_uaddr + + (region->nr_pages << HV_HYP_PAGE_SHIFT)); + + page_offset = HVPFN_DOWN(mstart - region->start_uaddr); + page_count = HVPFN_DOWN(mend - mstart); + + ret = mshv_region_remap_pages(region, HV_MAP_GPA_NO_ACCESS, + page_offset, page_count); + if (ret) + goto out_fail; + + mshv_region_invalidate_pages(region, page_offset, page_count); + + mutex_unlock(®ion->mutex); + + return true; + +out_fail: + WARN_ONCE(ret, + "Failed to invalidate region %#llx-%#llx (range %#lx-%#lx, event: %u, pages %#llx-%#llx, mm: %#llx): %d\n", + region->start_uaddr, + region->start_uaddr + (region->nr_pages << HV_HYP_PAGE_SHIFT), + range->start, range->end, range->event, + page_offset, page_offset + page_count - 1, (u64)range->mm, ret); + return false; +} + +static const struct mmu_interval_notifier_ops mshv_region_mni_ops = { + .invalidate = mshv_region_interval_invalidate, +}; + +void mshv_region_movable_fini(struct mshv_mem_region *region) +{ + mmu_interval_notifier_remove(®ion->mni); +} + +bool mshv_region_movable_init(struct mshv_mem_region *region) +{ + int ret; + + ret = mmu_interval_notifier_insert(®ion->mni, current->mm, + region->start_uaddr, + region->nr_pages << HV_HYP_PAGE_SHIFT, + &mshv_region_mni_ops); + if (ret) + return false; + + mutex_init(®ion->mutex); + + return true; +} diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h new file mode 100644 index 000000000000..3c1d88b36741 --- /dev/null +++ b/drivers/hv/mshv_root.h @@ -0,0 +1,336 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2023, Microsoft Corporation. + */ + +#ifndef _MSHV_ROOT_H_ +#define _MSHV_ROOT_H_ + +#include <linux/spinlock.h> +#include <linux/mutex.h> +#include <linux/semaphore.h> +#include <linux/sched.h> +#include <linux/srcu.h> +#include <linux/wait.h> +#include <linux/hashtable.h> +#include <linux/dev_printk.h> +#include <linux/build_bug.h> +#include <linux/mmu_notifier.h> +#include <uapi/linux/mshv.h> + +/* + * Hypervisor must be between these version numbers (inclusive) + * to guarantee compatibility + */ +#define MSHV_HV_MIN_VERSION (27744) +#define MSHV_HV_MAX_VERSION (27751) + +static_assert(HV_HYP_PAGE_SIZE == MSHV_HV_PAGE_SIZE); + +#define MSHV_MAX_VPS 256 + +#define MSHV_PARTITIONS_HASH_BITS 9 + +#define MSHV_PIN_PAGES_BATCH_SIZE (0x10000000ULL / HV_HYP_PAGE_SIZE) + +struct mshv_vp { + u32 vp_index; + struct mshv_partition *vp_partition; + struct mutex vp_mutex; + struct hv_vp_register_page *vp_register_page; + struct hv_message *vp_intercept_msg_page; + void *vp_ghcb_page; + struct hv_stats_page *vp_stats_pages[2]; + struct { + atomic64_t vp_signaled_count; + struct { + u64 intercept_suspend: 1; + u64 root_sched_blocked: 1; /* root scheduler only */ + u64 root_sched_dispatched: 1; /* root scheduler only */ + u64 reserved: 61; + } flags; + unsigned int kicked_by_hv; + wait_queue_head_t vp_suspend_queue; + } run; +}; + +#define vp_fmt(fmt) "p%lluvp%u: " fmt +#define vp_devprintk(level, v, fmt, ...) 
\ +do { \ + const struct mshv_vp *__vp = (v); \ + const struct mshv_partition *__pt = __vp->vp_partition; \ + dev_##level(__pt->pt_module_dev, vp_fmt(fmt), __pt->pt_id, \ + __vp->vp_index, ##__VA_ARGS__); \ +} while (0) +#define vp_emerg(v, fmt, ...) vp_devprintk(emerg, v, fmt, ##__VA_ARGS__) +#define vp_crit(v, fmt, ...) vp_devprintk(crit, v, fmt, ##__VA_ARGS__) +#define vp_alert(v, fmt, ...) vp_devprintk(alert, v, fmt, ##__VA_ARGS__) +#define vp_err(v, fmt, ...) vp_devprintk(err, v, fmt, ##__VA_ARGS__) +#define vp_warn(v, fmt, ...) vp_devprintk(warn, v, fmt, ##__VA_ARGS__) +#define vp_notice(v, fmt, ...) vp_devprintk(notice, v, fmt, ##__VA_ARGS__) +#define vp_info(v, fmt, ...) vp_devprintk(info, v, fmt, ##__VA_ARGS__) +#define vp_dbg(v, fmt, ...) vp_devprintk(dbg, v, fmt, ##__VA_ARGS__) + +enum mshv_region_type { + MSHV_REGION_TYPE_MEM_PINNED, + MSHV_REGION_TYPE_MEM_MOVABLE, + MSHV_REGION_TYPE_MMIO +}; + +struct mshv_mem_region { + struct hlist_node hnode; + struct kref refcount; + u64 nr_pages; + u64 start_gfn; + u64 start_uaddr; + u32 hv_map_flags; + struct mshv_partition *partition; + enum mshv_region_type type; + struct mmu_interval_notifier mni; + struct mutex mutex; /* protects region pages remapping */ + struct page *pages[]; +}; + +struct mshv_irq_ack_notifier { + struct hlist_node link; + unsigned int irq_ack_gsi; + void (*irq_acked)(struct mshv_irq_ack_notifier *mian); +}; + +struct mshv_partition { + struct device *pt_module_dev; + + struct hlist_node pt_hnode; + u64 pt_id; + refcount_t pt_ref_count; + struct mutex pt_mutex; + + spinlock_t pt_mem_regions_lock; + struct hlist_head pt_mem_regions; // not ordered + + u32 pt_vp_count; + struct mshv_vp *pt_vp_array[MSHV_MAX_VPS]; + + struct mutex pt_irq_lock; + struct srcu_struct pt_irq_srcu; + struct hlist_head irq_ack_notifier_list; + + struct hlist_head pt_devices; + + /* + * MSHV does not support more than one async hypercall in flight + * for a single partition. Thus, it is okay to define per partition + * async hypercall status. + */ + struct completion async_hypercall; + u64 async_hypercall_status; + + spinlock_t pt_irqfds_lock; + struct hlist_head pt_irqfds_list; + struct mutex irqfds_resampler_lock; + struct hlist_head irqfds_resampler_list; + + struct hlist_head ioeventfds_list; + + struct mshv_girq_routing_table __rcu *pt_girq_tbl; + u64 isolation_type; + bool import_completed; + bool pt_initialized; +}; + +#define pt_fmt(fmt) "p%llu: " fmt +#define pt_devprintk(level, p, fmt, ...) \ +do { \ + const struct mshv_partition *__pt = (p); \ + dev_##level(__pt->pt_module_dev, pt_fmt(fmt), __pt->pt_id, \ + ##__VA_ARGS__); \ +} while (0) +#define pt_emerg(p, fmt, ...) pt_devprintk(emerg, p, fmt, ##__VA_ARGS__) +#define pt_crit(p, fmt, ...) pt_devprintk(crit, p, fmt, ##__VA_ARGS__) +#define pt_alert(p, fmt, ...) pt_devprintk(alert, p, fmt, ##__VA_ARGS__) +#define pt_err(p, fmt, ...) pt_devprintk(err, p, fmt, ##__VA_ARGS__) +#define pt_warn(p, fmt, ...) pt_devprintk(warn, p, fmt, ##__VA_ARGS__) +#define pt_notice(p, fmt, ...) pt_devprintk(notice, p, fmt, ##__VA_ARGS__) +#define pt_info(p, fmt, ...) pt_devprintk(info, p, fmt, ##__VA_ARGS__) +#define pt_dbg(p, fmt, ...) 
pt_devprintk(dbg, p, fmt, ##__VA_ARGS__) + +struct mshv_lapic_irq { + u32 lapic_vector; + u64 lapic_apic_id; + union hv_interrupt_control lapic_control; +}; + +#define MSHV_MAX_GUEST_IRQS 4096 + +/* representation of one guest irq entry, either msi or legacy */ +struct mshv_guest_irq_ent { + u32 girq_entry_valid; /* vfio looks at this */ + u32 guest_irq_num; /* a unique number for each irq */ + u32 girq_addr_lo; /* guest irq msi address info */ + u32 girq_addr_hi; + u32 girq_irq_data; /* idt vector in some cases */ +}; + +struct mshv_girq_routing_table { + u32 num_rt_entries; + struct mshv_guest_irq_ent mshv_girq_info_tbl[]; +}; + +struct hv_synic_pages { + struct hv_message_page *hyp_synic_message_page; + struct hv_synic_event_flags_page *synic_event_flags_page; + struct hv_synic_event_ring_page *synic_event_ring_page; +}; + +struct mshv_root { + struct hv_synic_pages __percpu *synic_pages; + spinlock_t pt_ht_lock; + DECLARE_HASHTABLE(pt_htable, MSHV_PARTITIONS_HASH_BITS); + struct hv_partition_property_vmm_capabilities vmm_caps; +}; + +/* + * Callback for doorbell events. + * NOTE: This is called in interrupt context. Callback + * should defer slow and sleeping logic to later. + */ +typedef void (*doorbell_cb_t) (int doorbell_id, void *); + +/* + * port table information + */ +struct port_table_info { + struct rcu_head portbl_rcu; + enum hv_port_type hv_port_type; + union { + struct { + u64 reserved[2]; + } hv_port_message; + struct { + u64 reserved[2]; + } hv_port_event; + struct { + u64 reserved[2]; + } hv_port_monitor; + struct { + doorbell_cb_t doorbell_cb; + void *data; + } hv_port_doorbell; + }; +}; + +int mshv_update_routing_table(struct mshv_partition *partition, + const struct mshv_user_irq_entry *entries, + unsigned int numents); +void mshv_free_routing_table(struct mshv_partition *partition); + +struct mshv_guest_irq_ent mshv_ret_girq_entry(struct mshv_partition *partition, + u32 irq_num); + +void mshv_copy_girq_info(struct mshv_guest_irq_ent *src_irq, + struct mshv_lapic_irq *dest_irq); + +void mshv_irqfd_routing_update(struct mshv_partition *partition); + +void mshv_port_table_fini(void); +int mshv_portid_alloc(struct port_table_info *info); +int mshv_portid_lookup(int port_id, struct port_table_info *info); +void mshv_portid_free(int port_id); + +int mshv_register_doorbell(u64 partition_id, doorbell_cb_t doorbell_cb, + void *data, u64 gpa, u64 val, u64 flags); +void mshv_unregister_doorbell(u64 partition_id, int doorbell_portid); + +void mshv_isr(void); +int mshv_synic_init(unsigned int cpu); +int mshv_synic_cleanup(unsigned int cpu); + +static inline bool mshv_partition_encrypted(struct mshv_partition *partition) +{ + return partition->isolation_type == HV_PARTITION_ISOLATION_TYPE_SNP; +} + +struct mshv_partition *mshv_partition_get(struct mshv_partition *partition); +void mshv_partition_put(struct mshv_partition *partition); +struct mshv_partition *mshv_partition_find(u64 partition_id) __must_hold(RCU); + +/* hypercalls */ + +int hv_call_withdraw_memory(u64 count, int node, u64 partition_id); +int hv_call_create_partition(u64 flags, + struct hv_partition_creation_properties creation_properties, + union hv_partition_isolation_properties isolation_properties, + u64 *partition_id); +int hv_call_initialize_partition(u64 partition_id); +int hv_call_finalize_partition(u64 partition_id); +int hv_call_delete_partition(u64 partition_id); +int hv_call_map_mmio_pages(u64 partition_id, u64 gfn, u64 mmio_spa, u64 numpgs); +int hv_call_map_gpa_pages(u64 partition_id, u64 gpa_target, 
u64 page_count, + u32 flags, struct page **pages); +int hv_call_unmap_gpa_pages(u64 partition_id, u64 gpa_target, u64 page_count, + u32 flags); +int hv_call_delete_vp(u64 partition_id, u32 vp_index); +int hv_call_assert_virtual_interrupt(u64 partition_id, u32 vector, + u64 dest_addr, + union hv_interrupt_control control); +int hv_call_clear_virtual_interrupt(u64 partition_id); +int hv_call_get_gpa_access_states(u64 partition_id, u32 count, u64 gpa_base_pfn, + union hv_gpa_page_access_state_flags state_flags, + int *written_total, + union hv_gpa_page_access_state *states); +int hv_call_get_vp_state(u32 vp_index, u64 partition_id, + struct hv_vp_state_data state_data, + /* Choose between pages and ret_output */ + u64 page_count, struct page **pages, + union hv_output_get_vp_state *ret_output); +int hv_call_set_vp_state(u32 vp_index, u64 partition_id, + /* Choose between pages and bytes */ + struct hv_vp_state_data state_data, u64 page_count, + struct page **pages, u32 num_bytes, u8 *bytes); +int hv_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type, + union hv_input_vtl input_vtl, + struct page **state_page); +int hv_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type, + struct page *state_page, + union hv_input_vtl input_vtl); +int hv_call_create_port(u64 port_partition_id, union hv_port_id port_id, + u64 connection_partition_id, struct hv_port_info *port_info, + u8 port_vtl, u8 min_connection_vtl, int node); +int hv_call_delete_port(u64 port_partition_id, union hv_port_id port_id); +int hv_call_connect_port(u64 port_partition_id, union hv_port_id port_id, + u64 connection_partition_id, + union hv_connection_id connection_id, + struct hv_connection_info *connection_info, + u8 connection_vtl, int node); +int hv_call_disconnect_port(u64 connection_partition_id, + union hv_connection_id connection_id); +int hv_call_notify_port_ring_empty(u32 sint_index); +int hv_map_stats_page(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity, + void **addr); +int hv_unmap_stats_page(enum hv_stats_object_type type, void *page_addr, + const union hv_stats_object_identity *identity); +int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages, + u64 page_struct_count, u32 host_access, + u32 flags, u8 acquire); +int hv_call_get_partition_property_ex(u64 partition_id, u64 property_code, u64 arg, + void *property_value, size_t property_value_sz); + +extern struct mshv_root mshv_root; +extern enum hv_scheduler_type hv_scheduler_type; +extern u8 * __percpu *hv_synic_eventring_tail; + +struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages, + u64 uaddr, u32 flags); +int mshv_region_share(struct mshv_mem_region *region); +int mshv_region_unshare(struct mshv_mem_region *region); +int mshv_region_map(struct mshv_mem_region *region); +void mshv_region_invalidate(struct mshv_mem_region *region); +int mshv_region_pin(struct mshv_mem_region *region); +void mshv_region_put(struct mshv_mem_region *region); +int mshv_region_get(struct mshv_mem_region *region); +bool mshv_region_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn); +void mshv_region_movable_fini(struct mshv_mem_region *region); +bool mshv_region_movable_init(struct mshv_mem_region *region); + +#endif /* _MSHV_ROOT_H_ */ diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c new file mode 100644 index 000000000000..598eaff4ff29 --- /dev/null +++ b/drivers/hv/mshv_root_hv_call.c @@ -0,0 +1,1024 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 
2023, Microsoft Corporation. + * + * Hypercall helper functions used by the mshv_root module. + * + * Authors: Microsoft Linux virtualization team + */ + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/export.h> +#include <asm/mshyperv.h> + +#include "mshv_root.h" + +/* Determined empirically */ +#define HV_INIT_PARTITION_DEPOSIT_PAGES 208 +#define HV_MAP_GPA_DEPOSIT_PAGES 256 +#define HV_UMAP_GPA_PAGES 512 + +#define HV_PAGE_COUNT_2M_ALIGNED(pg_count) (!((pg_count) & (0x200 - 1))) + +#define HV_WITHDRAW_BATCH_SIZE (HV_HYP_PAGE_SIZE / sizeof(u64)) +#define HV_MAP_GPA_BATCH_SIZE \ + ((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_map_gpa_pages)) \ + / sizeof(u64)) +#define HV_GET_VP_STATE_BATCH_SIZE \ + ((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_get_vp_state)) \ + / sizeof(u64)) +#define HV_SET_VP_STATE_BATCH_SIZE \ + ((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_set_vp_state)) \ + / sizeof(u64)) +#define HV_GET_GPA_ACCESS_STATES_BATCH_SIZE \ + ((HV_HYP_PAGE_SIZE - sizeof(union hv_gpa_page_access_state)) \ + / sizeof(union hv_gpa_page_access_state)) +#define HV_MODIFY_SPARSE_SPA_PAGE_HOST_ACCESS_MAX_PAGE_COUNT \ + ((HV_HYP_PAGE_SIZE - \ + sizeof(struct hv_input_modify_sparse_spa_page_host_access)) / \ + sizeof(u64)) + +int hv_call_withdraw_memory(u64 count, int node, u64 partition_id) +{ + struct hv_input_withdraw_memory *input_page; + struct hv_output_withdraw_memory *output_page; + struct page *page; + u16 completed; + unsigned long remaining = count; + u64 status; + int i; + unsigned long flags; + + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + output_page = page_address(page); + + while (remaining) { + local_irq_save(flags); + + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); + + memset(input_page, 0, sizeof(*input_page)); + input_page->partition_id = partition_id; + status = hv_do_rep_hypercall(HVCALL_WITHDRAW_MEMORY, + min(remaining, HV_WITHDRAW_BATCH_SIZE), + 0, input_page, output_page); + + local_irq_restore(flags); + + completed = hv_repcomp(status); + + for (i = 0; i < completed; i++) + __free_page(pfn_to_page(output_page->gpa_page_list[i])); + + if (!hv_result_success(status)) { + if (hv_result(status) == HV_STATUS_NO_RESOURCES) + status = HV_STATUS_SUCCESS; + break; + } + + remaining -= completed; + } + free_page((unsigned long)output_page); + + return hv_result_to_errno(status); +} + +int hv_call_create_partition(u64 flags, + struct hv_partition_creation_properties creation_properties, + union hv_partition_isolation_properties isolation_properties, + u64 *partition_id) +{ + struct hv_input_create_partition *input; + struct hv_output_create_partition *output; + u64 status; + int ret; + unsigned long irq_flags; + + do { + local_irq_save(irq_flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + memset(input, 0, sizeof(*input)); + input->flags = flags; + input->compatibility_version = HV_COMPATIBILITY_21_H2; + + memcpy(&input->partition_creation_properties, &creation_properties, + sizeof(creation_properties)); + + memcpy(&input->isolation_properties, &isolation_properties, + sizeof(isolation_properties)); + + status = hv_do_hypercall(HVCALL_CREATE_PARTITION, + input, output); + + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + if (hv_result_success(status)) + *partition_id = output->partition_id; + local_irq_restore(irq_flags); + ret = hv_result_to_errno(status); + break; + } + local_irq_restore(irq_flags); + ret = hv_call_deposit_pages(NUMA_NO_NODE, + hv_current_partition_id, 1); 
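+		/* Loop again only while depositing more memory with the hypervisor succeeds. */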
+ } while (!ret); + + return ret; +} + +int hv_call_initialize_partition(u64 partition_id) +{ + struct hv_input_initialize_partition input; + u64 status; + int ret; + + input.partition_id = partition_id; + + ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, + HV_INIT_PARTITION_DEPOSIT_PAGES); + if (ret) + return ret; + + do { + status = hv_do_fast_hypercall8(HVCALL_INITIALIZE_PARTITION, + *(u64 *)&input); + + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + ret = hv_result_to_errno(status); + break; + } + ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, 1); + } while (!ret); + + return ret; +} + +int hv_call_finalize_partition(u64 partition_id) +{ + struct hv_input_finalize_partition input; + u64 status; + + input.partition_id = partition_id; + status = hv_do_fast_hypercall8(HVCALL_FINALIZE_PARTITION, + *(u64 *)&input); + + return hv_result_to_errno(status); +} + +int hv_call_delete_partition(u64 partition_id) +{ + struct hv_input_delete_partition input; + u64 status; + + input.partition_id = partition_id; + status = hv_do_fast_hypercall8(HVCALL_DELETE_PARTITION, *(u64 *)&input); + + return hv_result_to_errno(status); +} + +/* Ask the hypervisor to map guest ram pages or the guest mmio space */ +static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count, + u32 flags, struct page **pages, u64 mmio_spa) +{ + struct hv_input_map_gpa_pages *input_page; + u64 status, *pfnlist; + unsigned long irq_flags, large_shift = 0; + int ret = 0, done = 0; + u64 page_count = page_struct_count; + + if (page_count == 0 || (pages && mmio_spa)) + return -EINVAL; + + if (flags & HV_MAP_GPA_LARGE_PAGE) { + if (mmio_spa) + return -EINVAL; + + if (!HV_PAGE_COUNT_2M_ALIGNED(page_count)) + return -EINVAL; + + large_shift = HV_HYP_LARGE_PAGE_SHIFT - HV_HYP_PAGE_SHIFT; + page_count >>= large_shift; + } + + while (done < page_count) { + ulong i, completed, remain = page_count - done; + int rep_count = min(remain, HV_MAP_GPA_BATCH_SIZE); + + local_irq_save(irq_flags); + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); + + input_page->target_partition_id = partition_id; + input_page->target_gpa_base = gfn + (done << large_shift); + input_page->map_flags = flags; + pfnlist = input_page->source_gpa_page_list; + + for (i = 0; i < rep_count; i++) + if (flags & HV_MAP_GPA_NO_ACCESS) { + pfnlist[i] = 0; + } else if (pages) { + u64 index = (done + i) << large_shift; + + if (index >= page_struct_count) { + ret = -EINVAL; + break; + } + pfnlist[i] = page_to_pfn(pages[index]); + } else { + pfnlist[i] = mmio_spa + done + i; + } + if (ret) + break; + + status = hv_do_rep_hypercall(HVCALL_MAP_GPA_PAGES, rep_count, 0, + input_page, NULL); + local_irq_restore(irq_flags); + + completed = hv_repcomp(status); + + if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) { + ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, + HV_MAP_GPA_DEPOSIT_PAGES); + if (ret) + break; + + } else if (!hv_result_success(status)) { + ret = hv_result_to_errno(status); + break; + } + + done += completed; + } + + if (ret && done) { + u32 unmap_flags = 0; + + if (flags & HV_MAP_GPA_LARGE_PAGE) + unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE; + hv_call_unmap_gpa_pages(partition_id, gfn, done, unmap_flags); + } + + return ret; +} + +/* Ask the hypervisor to map guest ram pages */ +int hv_call_map_gpa_pages(u64 partition_id, u64 gpa_target, u64 page_count, + u32 flags, struct page **pages) +{ + return hv_do_map_gpa_hcall(partition_id, gpa_target, page_count, + flags, pages, 0); +} + +/* Ask the hypervisor to map guest 
mmio space */ +int hv_call_map_mmio_pages(u64 partition_id, u64 gfn, u64 mmio_spa, u64 numpgs) +{ + int i; + u32 flags = HV_MAP_GPA_READABLE | HV_MAP_GPA_WRITABLE | + HV_MAP_GPA_NOT_CACHED; + + for (i = 0; i < numpgs; i++) + if (page_is_ram(mmio_spa + i)) + return -EINVAL; + + return hv_do_map_gpa_hcall(partition_id, gfn, numpgs, flags, NULL, + mmio_spa); +} + +int hv_call_unmap_gpa_pages(u64 partition_id, u64 gfn, u64 page_count_4k, + u32 flags) +{ + struct hv_input_unmap_gpa_pages *input_page; + u64 status, page_count = page_count_4k; + unsigned long irq_flags, large_shift = 0; + int ret = 0, done = 0; + + if (page_count == 0) + return -EINVAL; + + if (flags & HV_UNMAP_GPA_LARGE_PAGE) { + if (!HV_PAGE_COUNT_2M_ALIGNED(page_count)) + return -EINVAL; + + large_shift = HV_HYP_LARGE_PAGE_SHIFT - HV_HYP_PAGE_SHIFT; + page_count >>= large_shift; + } + + while (done < page_count) { + ulong completed, remain = page_count - done; + int rep_count = min(remain, HV_UMAP_GPA_PAGES); + + local_irq_save(irq_flags); + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); + + input_page->target_partition_id = partition_id; + input_page->target_gpa_base = gfn + (done << large_shift); + input_page->unmap_flags = flags; + status = hv_do_rep_hypercall(HVCALL_UNMAP_GPA_PAGES, rep_count, + 0, input_page, NULL); + local_irq_restore(irq_flags); + + completed = hv_repcomp(status); + if (!hv_result_success(status)) { + ret = hv_result_to_errno(status); + break; + } + + done += completed; + } + + return ret; +} + +int hv_call_get_gpa_access_states(u64 partition_id, u32 count, u64 gpa_base_pfn, + union hv_gpa_page_access_state_flags state_flags, + int *written_total, + union hv_gpa_page_access_state *states) +{ + struct hv_input_get_gpa_pages_access_state *input_page; + union hv_gpa_page_access_state *output_page; + int completed = 0; + unsigned long remaining = count; + int rep_count, i; + u64 status = 0; + unsigned long flags; + + *written_total = 0; + while (remaining) { + local_irq_save(flags); + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); + output_page = *this_cpu_ptr(hyperv_pcpu_output_arg); + + input_page->partition_id = partition_id; + input_page->hv_gpa_page_number = gpa_base_pfn + *written_total; + input_page->flags = state_flags; + rep_count = min(remaining, HV_GET_GPA_ACCESS_STATES_BATCH_SIZE); + + status = hv_do_rep_hypercall(HVCALL_GET_GPA_PAGES_ACCESS_STATES, rep_count, + 0, input_page, output_page); + if (!hv_result_success(status)) { + local_irq_restore(flags); + break; + } + completed = hv_repcomp(status); + for (i = 0; i < completed; ++i) + states[i].as_uint8 = output_page[i].as_uint8; + + local_irq_restore(flags); + states += completed; + *written_total += completed; + remaining -= completed; + } + + return hv_result_to_errno(status); +} + +int hv_call_assert_virtual_interrupt(u64 partition_id, u32 vector, + u64 dest_addr, + union hv_interrupt_control control) +{ + struct hv_input_assert_virtual_interrupt *input; + unsigned long flags; + u64 status; + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + memset(input, 0, sizeof(*input)); + input->partition_id = partition_id; + input->vector = vector; + /* + * NOTE: dest_addr only needs to be provided while asserting an + * interrupt on x86 platform + */ +#if IS_ENABLED(CONFIG_X86) + input->dest_addr = dest_addr; +#endif + input->control = control; + status = hv_do_hypercall(HVCALL_ASSERT_VIRTUAL_INTERRUPT, input, NULL); + local_irq_restore(flags); + + return hv_result_to_errno(status); +} + +int hv_call_delete_vp(u64 
partition_id, u32 vp_index) +{ + union hv_input_delete_vp input = {}; + u64 status; + + input.partition_id = partition_id; + input.vp_index = vp_index; + + status = hv_do_fast_hypercall16(HVCALL_DELETE_VP, + input.as_uint64[0], input.as_uint64[1]); + + return hv_result_to_errno(status); +} +EXPORT_SYMBOL_GPL(hv_call_delete_vp); + +int hv_call_get_vp_state(u32 vp_index, u64 partition_id, + struct hv_vp_state_data state_data, + /* Choose between pages and ret_output */ + u64 page_count, struct page **pages, + union hv_output_get_vp_state *ret_output) +{ + struct hv_input_get_vp_state *input; + union hv_output_get_vp_state *output; + u64 status; + int i; + u64 control; + unsigned long flags; + int ret = 0; + + if (page_count > HV_GET_VP_STATE_BATCH_SIZE) + return -EINVAL; + + if (!page_count && !ret_output) + return -EINVAL; + + do { + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + memset(input, 0, sizeof(*input)); + memset(output, 0, sizeof(*output)); + + input->partition_id = partition_id; + input->vp_index = vp_index; + input->state_data = state_data; + for (i = 0; i < page_count; i++) + input->output_data_pfns[i] = page_to_pfn(pages[i]); + + control = (HVCALL_GET_VP_STATE) | + (page_count << HV_HYPERCALL_VARHEAD_OFFSET); + + status = hv_do_hypercall(control, input, output); + + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + if (hv_result_success(status) && ret_output) + memcpy(ret_output, output, sizeof(*output)); + + local_irq_restore(flags); + ret = hv_result_to_errno(status); + break; + } + local_irq_restore(flags); + + ret = hv_call_deposit_pages(NUMA_NO_NODE, + partition_id, 1); + } while (!ret); + + return ret; +} + +int hv_call_set_vp_state(u32 vp_index, u64 partition_id, + /* Choose between pages and bytes */ + struct hv_vp_state_data state_data, u64 page_count, + struct page **pages, u32 num_bytes, u8 *bytes) +{ + struct hv_input_set_vp_state *input; + u64 status; + int i; + u64 control; + unsigned long flags; + int ret = 0; + u16 varhead_sz; + + if (page_count > HV_SET_VP_STATE_BATCH_SIZE) + return -EINVAL; + if (sizeof(*input) + num_bytes > HV_HYP_PAGE_SIZE) + return -EINVAL; + + if (num_bytes) + /* round up to 8 and divide by 8 */ + varhead_sz = (num_bytes + 7) >> 3; + else if (page_count) + varhead_sz = page_count; + else + return -EINVAL; + + do { + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + memset(input, 0, sizeof(*input)); + + input->partition_id = partition_id; + input->vp_index = vp_index; + input->state_data = state_data; + if (num_bytes) { + memcpy((u8 *)input->data, bytes, num_bytes); + } else { + for (i = 0; i < page_count; i++) + input->data[i].pfns = page_to_pfn(pages[i]); + } + + control = (HVCALL_SET_VP_STATE) | + (varhead_sz << HV_HYPERCALL_VARHEAD_OFFSET); + + status = hv_do_hypercall(control, input, NULL); + + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + local_irq_restore(flags); + ret = hv_result_to_errno(status); + break; + } + local_irq_restore(flags); + + ret = hv_call_deposit_pages(NUMA_NO_NODE, + partition_id, 1); + } while (!ret); + + return ret; +} + +static int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type, + union hv_input_vtl input_vtl, + struct page **state_page) +{ + struct hv_input_map_vp_state_page *input; + struct hv_output_map_vp_state_page *output; + u64 status; + int ret; + unsigned long flags; + + do { + local_irq_save(flags); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = 
*this_cpu_ptr(hyperv_pcpu_output_arg); + + memset(input, 0, sizeof(*input)); + input->partition_id = partition_id; + input->vp_index = vp_index; + input->type = type; + input->input_vtl = input_vtl; + + if (*state_page) { + input->flags.map_location_provided = 1; + input->requested_map_location = + page_to_pfn(*state_page); + } + + status = hv_do_hypercall(HVCALL_MAP_VP_STATE_PAGE, input, + output); + + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + if (hv_result_success(status)) + *state_page = pfn_to_page(output->map_location); + local_irq_restore(flags); + ret = hv_result_to_errno(status); + break; + } + + local_irq_restore(flags); + + ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, 1); + } while (!ret); + + return ret; +} + +static bool mshv_use_overlay_gpfn(void) +{ + return hv_l1vh_partition() && + mshv_root.vmm_caps.vmm_can_provide_overlay_gpfn; +} + +int hv_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type, + union hv_input_vtl input_vtl, + struct page **state_page) +{ + int ret = 0; + struct page *allocated_page = NULL; + + if (mshv_use_overlay_gpfn()) { + allocated_page = alloc_page(GFP_KERNEL); + if (!allocated_page) + return -ENOMEM; + *state_page = allocated_page; + } else { + *state_page = NULL; + } + + ret = hv_call_map_vp_state_page(partition_id, vp_index, type, input_vtl, + state_page); + + if (ret && allocated_page) { + __free_page(allocated_page); + *state_page = NULL; + } + + return ret; +} + +static int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type, + union hv_input_vtl input_vtl) +{ + unsigned long flags; + u64 status; + struct hv_input_unmap_vp_state_page *input; + + local_irq_save(flags); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + + memset(input, 0, sizeof(*input)); + + input->partition_id = partition_id; + input->vp_index = vp_index; + input->type = type; + input->input_vtl = input_vtl; + + status = hv_do_hypercall(HVCALL_UNMAP_VP_STATE_PAGE, input, NULL); + + local_irq_restore(flags); + + return hv_result_to_errno(status); +} + +int hv_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type, + struct page *state_page, union hv_input_vtl input_vtl) +{ + int ret = hv_call_unmap_vp_state_page(partition_id, vp_index, type, input_vtl); + + if (mshv_use_overlay_gpfn() && state_page) + __free_page(state_page); + + return ret; +} + +int hv_call_get_partition_property_ex(u64 partition_id, u64 property_code, + u64 arg, void *property_value, + size_t property_value_sz) +{ + u64 status; + unsigned long flags; + struct hv_input_get_partition_property_ex *input; + struct hv_output_get_partition_property_ex *output; + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + memset(input, 0, sizeof(*input)); + input->partition_id = partition_id; + input->property_code = property_code; + input->arg = arg; + status = hv_do_hypercall(HVCALL_GET_PARTITION_PROPERTY_EX, input, output); + + if (!hv_result_success(status)) { + local_irq_restore(flags); + hv_status_debug(status, "\n"); + return hv_result_to_errno(status); + } + memcpy(property_value, &output->property_value, property_value_sz); + + local_irq_restore(flags); + + return 0; +} + +int +hv_call_clear_virtual_interrupt(u64 partition_id) +{ + int status; + + status = hv_do_fast_hypercall8(HVCALL_CLEAR_VIRTUAL_INTERRUPT, + partition_id); + + return hv_result_to_errno(status); +} + +int +hv_call_create_port(u64 port_partition_id, union hv_port_id port_id, + u64 connection_partition_id, + struct 
hv_port_info *port_info, + u8 port_vtl, u8 min_connection_vtl, int node) +{ + struct hv_input_create_port *input; + unsigned long flags; + int ret = 0; + int status; + + do { + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + memset(input, 0, sizeof(*input)); + + input->port_partition_id = port_partition_id; + input->port_id = port_id; + input->connection_partition_id = connection_partition_id; + input->port_info = *port_info; + input->port_vtl = port_vtl; + input->min_connection_vtl = min_connection_vtl; + input->proximity_domain_info = hv_numa_node_to_pxm_info(node); + status = hv_do_hypercall(HVCALL_CREATE_PORT, input, NULL); + local_irq_restore(flags); + if (hv_result_success(status)) + break; + + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + ret = hv_result_to_errno(status); + break; + } + ret = hv_call_deposit_pages(NUMA_NO_NODE, port_partition_id, 1); + + } while (!ret); + + return ret; +} + +int +hv_call_delete_port(u64 port_partition_id, union hv_port_id port_id) +{ + union hv_input_delete_port input = { 0 }; + int status; + + input.port_partition_id = port_partition_id; + input.port_id = port_id; + status = hv_do_fast_hypercall16(HVCALL_DELETE_PORT, + input.as_uint64[0], + input.as_uint64[1]); + + return hv_result_to_errno(status); +} + +int +hv_call_connect_port(u64 port_partition_id, union hv_port_id port_id, + u64 connection_partition_id, + union hv_connection_id connection_id, + struct hv_connection_info *connection_info, + u8 connection_vtl, int node) +{ + struct hv_input_connect_port *input; + unsigned long flags; + int ret = 0, status; + + do { + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + memset(input, 0, sizeof(*input)); + input->port_partition_id = port_partition_id; + input->port_id = port_id; + input->connection_partition_id = connection_partition_id; + input->connection_id = connection_id; + input->connection_info = *connection_info; + input->connection_vtl = connection_vtl; + input->proximity_domain_info = hv_numa_node_to_pxm_info(node); + status = hv_do_hypercall(HVCALL_CONNECT_PORT, input, NULL); + + local_irq_restore(flags); + if (hv_result_success(status)) + break; + + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + ret = hv_result_to_errno(status); + break; + } + ret = hv_call_deposit_pages(NUMA_NO_NODE, + connection_partition_id, 1); + } while (!ret); + + return ret; +} + +int +hv_call_disconnect_port(u64 connection_partition_id, + union hv_connection_id connection_id) +{ + union hv_input_disconnect_port input = { 0 }; + int status; + + input.connection_partition_id = connection_partition_id; + input.connection_id = connection_id; + input.is_doorbell = 1; + status = hv_do_fast_hypercall16(HVCALL_DISCONNECT_PORT, + input.as_uint64[0], + input.as_uint64[1]); + + return hv_result_to_errno(status); +} + +int +hv_call_notify_port_ring_empty(u32 sint_index) +{ + union hv_input_notify_port_ring_empty input = { 0 }; + int status; + + input.sint_index = sint_index; + status = hv_do_fast_hypercall8(HVCALL_NOTIFY_PORT_RING_EMPTY, + input.as_uint64); + + return hv_result_to_errno(status); +} + +static int hv_call_map_stats_page2(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity, + u64 map_location) +{ + unsigned long flags; + struct hv_input_map_stats_page2 *input; + u64 status; + int ret; + + if (!map_location || !mshv_use_overlay_gpfn()) + return -EINVAL; + + do { + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + + memset(input, 0, 
sizeof(*input)); + input->type = type; + input->identity = *identity; + input->map_location = map_location; + + status = hv_do_hypercall(HVCALL_MAP_STATS_PAGE2, input, NULL); + + local_irq_restore(flags); + + ret = hv_result_to_errno(status); + + if (!ret) + break; + + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + hv_status_debug(status, "\n"); + break; + } + + ret = hv_call_deposit_pages(NUMA_NO_NODE, + hv_current_partition_id, 1); + } while (!ret); + + return ret; +} + +static int hv_call_map_stats_page(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity, + void **addr) +{ + unsigned long flags; + struct hv_input_map_stats_page *input; + struct hv_output_map_stats_page *output; + u64 status, pfn; + int ret = 0; + + do { + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + memset(input, 0, sizeof(*input)); + input->type = type; + input->identity = *identity; + + status = hv_do_hypercall(HVCALL_MAP_STATS_PAGE, input, output); + pfn = output->map_location; + + local_irq_restore(flags); + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + ret = hv_result_to_errno(status); + if (hv_result_success(status)) + break; + return ret; + } + + ret = hv_call_deposit_pages(NUMA_NO_NODE, + hv_current_partition_id, 1); + if (ret) + return ret; + } while (!ret); + + *addr = page_address(pfn_to_page(pfn)); + + return ret; +} + +int hv_map_stats_page(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity, + void **addr) +{ + int ret; + struct page *allocated_page = NULL; + + if (!addr) + return -EINVAL; + + if (mshv_use_overlay_gpfn()) { + allocated_page = alloc_page(GFP_KERNEL); + if (!allocated_page) + return -ENOMEM; + + ret = hv_call_map_stats_page2(type, identity, + page_to_pfn(allocated_page)); + *addr = page_address(allocated_page); + } else { + ret = hv_call_map_stats_page(type, identity, addr); + } + + if (ret && allocated_page) { + __free_page(allocated_page); + *addr = NULL; + } + + return ret; +} + +static int hv_call_unmap_stats_page(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity) +{ + unsigned long flags; + struct hv_input_unmap_stats_page *input; + u64 status; + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + + memset(input, 0, sizeof(*input)); + input->type = type; + input->identity = *identity; + + status = hv_do_hypercall(HVCALL_UNMAP_STATS_PAGE, input, NULL); + local_irq_restore(flags); + + return hv_result_to_errno(status); +} + +int hv_unmap_stats_page(enum hv_stats_object_type type, void *page_addr, + const union hv_stats_object_identity *identity) +{ + int ret; + + ret = hv_call_unmap_stats_page(type, identity); + + if (mshv_use_overlay_gpfn() && page_addr) + __free_page(virt_to_page(page_addr)); + + return ret; +} + +int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages, + u64 page_struct_count, u32 host_access, + u32 flags, u8 acquire) +{ + struct hv_input_modify_sparse_spa_page_host_access *input_page; + u64 status; + int done = 0; + unsigned long irq_flags, large_shift = 0; + u64 page_count = page_struct_count; + u16 code = acquire ? 
HVCALL_ACQUIRE_SPARSE_SPA_PAGE_HOST_ACCESS : + HVCALL_RELEASE_SPARSE_SPA_PAGE_HOST_ACCESS; + + if (page_count == 0) + return -EINVAL; + + if (flags & HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE) { + if (!HV_PAGE_COUNT_2M_ALIGNED(page_count)) + return -EINVAL; + large_shift = HV_HYP_LARGE_PAGE_SHIFT - HV_HYP_PAGE_SHIFT; + page_count >>= large_shift; + } + + while (done < page_count) { + ulong i, completed, remain = page_count - done; + int rep_count = min(remain, + HV_MODIFY_SPARSE_SPA_PAGE_HOST_ACCESS_MAX_PAGE_COUNT); + + local_irq_save(irq_flags); + input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); + + memset(input_page, 0, sizeof(*input_page)); + /* Only set the partition id if you are making the pages + * exclusive + */ + if (flags & HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE) + input_page->partition_id = partition_id; + input_page->flags = flags; + input_page->host_access = host_access; + + for (i = 0; i < rep_count; i++) { + u64 index = (done + i) << large_shift; + + if (index >= page_struct_count) + return -EINVAL; + + input_page->spa_page_list[i] = + page_to_pfn(pages[index]); + } + + status = hv_do_rep_hypercall(code, rep_count, 0, input_page, + NULL); + local_irq_restore(irq_flags); + + completed = hv_repcomp(status); + + if (!hv_result_success(status)) + return hv_result_to_errno(status); + + done += completed; + } + + return 0; +} diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c new file mode 100644 index 000000000000..1134a82c7881 --- /dev/null +++ b/drivers/hv/mshv_root_main.c @@ -0,0 +1,2342 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024, Microsoft Corporation. + * + * The main part of the mshv_root module, providing APIs to create + * and manage guest partitions. + * + * Authors: Microsoft Linux virtualization team + */ + +#include <linux/entry-virt.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/miscdevice.h> +#include <linux/slab.h> +#include <linux/file.h> +#include <linux/anon_inodes.h> +#include <linux/mm.h> +#include <linux/io.h> +#include <linux/cpuhotplug.h> +#include <linux/random.h> +#include <asm/mshyperv.h> +#include <linux/hyperv.h> +#include <linux/notifier.h> +#include <linux/reboot.h> +#include <linux/kexec.h> +#include <linux/page-flags.h> +#include <linux/crash_dump.h> +#include <linux/panic_notifier.h> +#include <linux/vmalloc.h> +#include <linux/rseq.h> + +#include "mshv_eventfd.h" +#include "mshv.h" +#include "mshv_root.h" + +MODULE_AUTHOR("Microsoft"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv"); + +/* TODO move this to another file when debugfs code is added */ +enum hv_stats_vp_counters { /* HV_THREAD_COUNTER */ +#if defined(CONFIG_X86) + VpRootDispatchThreadBlocked = 202, +#elif defined(CONFIG_ARM64) + VpRootDispatchThreadBlocked = 94, +#endif + VpStatsMaxCounter +}; + +struct hv_stats_page { + union { + u64 vp_cntrs[VpStatsMaxCounter]; /* VP counters */ + u8 data[HV_HYP_PAGE_SIZE]; + }; +} __packed; + +struct mshv_root mshv_root; + +enum hv_scheduler_type hv_scheduler_type; + +/* Once we implement the fast extended hypercall ABI they can go away. 
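+ * ("They" being the per-cpu root_scheduler_input/output pointers declared below.)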
*/ +static void * __percpu *root_scheduler_input; +static void * __percpu *root_scheduler_output; + +static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); +static int mshv_dev_open(struct inode *inode, struct file *filp); +static int mshv_dev_release(struct inode *inode, struct file *filp); +static int mshv_vp_release(struct inode *inode, struct file *filp); +static long mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); +static int mshv_partition_release(struct inode *inode, struct file *filp); +static long mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); +static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma); +static vm_fault_t mshv_vp_fault(struct vm_fault *vmf); +static int mshv_init_async_handler(struct mshv_partition *partition); +static void mshv_async_hvcall_handler(void *data, u64 *status); + +static const union hv_input_vtl input_vtl_zero; +static const union hv_input_vtl input_vtl_normal = { + .target_vtl = HV_NORMAL_VTL, + .use_target_vtl = 1, +}; + +static const struct vm_operations_struct mshv_vp_vm_ops = { + .fault = mshv_vp_fault, +}; + +static const struct file_operations mshv_vp_fops = { + .owner = THIS_MODULE, + .release = mshv_vp_release, + .unlocked_ioctl = mshv_vp_ioctl, + .llseek = noop_llseek, + .mmap = mshv_vp_mmap, +}; + +static const struct file_operations mshv_partition_fops = { + .owner = THIS_MODULE, + .release = mshv_partition_release, + .unlocked_ioctl = mshv_partition_ioctl, + .llseek = noop_llseek, +}; + +static const struct file_operations mshv_dev_fops = { + .owner = THIS_MODULE, + .open = mshv_dev_open, + .release = mshv_dev_release, + .unlocked_ioctl = mshv_dev_ioctl, + .llseek = noop_llseek, +}; + +static struct miscdevice mshv_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "mshv", + .fops = &mshv_dev_fops, + .mode = 0600, +}; + +/* + * Only allow hypercalls that have a u64 partition id as the first member of + * the input structure. + * These are sorted by value. + */ +static u16 mshv_passthru_hvcalls[] = { + HVCALL_GET_PARTITION_PROPERTY, + HVCALL_GET_PARTITION_PROPERTY_EX, + HVCALL_SET_PARTITION_PROPERTY, + HVCALL_INSTALL_INTERCEPT, + HVCALL_GET_VP_REGISTERS, + HVCALL_SET_VP_REGISTERS, + HVCALL_TRANSLATE_VIRTUAL_ADDRESS, + HVCALL_CLEAR_VIRTUAL_INTERRUPT, + HVCALL_REGISTER_INTERCEPT_RESULT, + HVCALL_ASSERT_VIRTUAL_INTERRUPT, + HVCALL_GET_GPA_PAGES_ACCESS_STATES, + HVCALL_SIGNAL_EVENT_DIRECT, + HVCALL_POST_MESSAGE_DIRECT, + HVCALL_GET_VP_CPUID_VALUES, +}; + +/* + * Only allow hypercalls that are safe to be called by the VMM with the host + * partition as target (i.e. HV_PARTITION_ID_SELF). Carefully audit that a + * hypercall cannot be misused by the VMM before adding it to this list. 
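+ * For now only read-only partition property queries are on that list.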
+ */ +static u16 mshv_self_passthru_hvcalls[] = { + HVCALL_GET_PARTITION_PROPERTY, + HVCALL_GET_PARTITION_PROPERTY_EX, +}; + +static bool mshv_hvcall_is_async(u16 code) +{ + switch (code) { + case HVCALL_SET_PARTITION_PROPERTY: + return true; + default: + break; + } + return false; +} + +static bool mshv_passthru_hvcall_allowed(u16 code, u64 pt_id) +{ + int i; + int n = ARRAY_SIZE(mshv_passthru_hvcalls); + u16 *allowed_hvcalls = mshv_passthru_hvcalls; + + if (pt_id == HV_PARTITION_ID_SELF) { + n = ARRAY_SIZE(mshv_self_passthru_hvcalls); + allowed_hvcalls = mshv_self_passthru_hvcalls; + } + + for (i = 0; i < n; ++i) + if (allowed_hvcalls[i] == code) + return true; + + return false; +} + +static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition, + bool partition_locked, + void __user *user_args) +{ + u64 status; + int ret = 0; + bool is_async; + struct mshv_root_hvcall args; + struct page *page; + unsigned int pages_order; + void *input_pg = NULL; + void *output_pg = NULL; + u16 reps_completed; + u64 pt_id = partition ? partition->pt_id : HV_PARTITION_ID_SELF; + + if (copy_from_user(&args, user_args, sizeof(args))) + return -EFAULT; + + if (args.status || !args.in_ptr || args.in_sz < sizeof(u64) || + mshv_field_nonzero(args, rsvd) || args.in_sz > HV_HYP_PAGE_SIZE) + return -EINVAL; + + if (args.out_ptr && (!args.out_sz || args.out_sz > HV_HYP_PAGE_SIZE)) + return -EINVAL; + + if (!mshv_passthru_hvcall_allowed(args.code, pt_id)) + return -EINVAL; + + is_async = mshv_hvcall_is_async(args.code); + if (is_async) { + /* async hypercalls can only be called from partition fd */ + if (!partition || !partition_locked) + return -EINVAL; + ret = mshv_init_async_handler(partition); + if (ret) + return ret; + } + + pages_order = args.out_ptr ? 1 : 0; + page = alloc_pages(GFP_KERNEL, pages_order); + if (!page) + return -ENOMEM; + input_pg = page_address(page); + + if (args.out_ptr) + output_pg = (char *)input_pg + PAGE_SIZE; + else + output_pg = NULL; + + if (copy_from_user(input_pg, (void __user *)args.in_ptr, + args.in_sz)) { + ret = -EFAULT; + goto free_pages_out; + } + + /* + * NOTE: This only works because all the allowed hypercalls' input + * structs begin with a u64 partition_id field. + */ + *(u64 *)input_pg = pt_id; + + reps_completed = 0; + do { + if (args.reps) { + status = hv_do_rep_hypercall_ex(args.code, args.reps, + 0, reps_completed, + input_pg, output_pg); + reps_completed = hv_repcomp(status); + } else { + status = hv_do_hypercall(args.code, input_pg, output_pg); + } + + if (hv_result(status) == HV_STATUS_CALL_PENDING) { + if (is_async) { + mshv_async_hvcall_handler(partition, &status); + } else { /* Paranoia check. This shouldn't happen! 
*/ + ret = -EBADFD; + goto free_pages_out; + } + } + + if (hv_result_success(status)) + break; + + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) + ret = hv_result_to_errno(status); + else + ret = hv_call_deposit_pages(NUMA_NO_NODE, + pt_id, 1); + } while (!ret); + + args.status = hv_result(status); + args.reps = reps_completed; + if (copy_to_user(user_args, &args, sizeof(args))) + ret = -EFAULT; + + if (!ret && output_pg && + copy_to_user((void __user *)args.out_ptr, output_pg, args.out_sz)) + ret = -EFAULT; + +free_pages_out: + free_pages((unsigned long)input_pg, pages_order); + + return ret; +} + +static inline bool is_ghcb_mapping_available(void) +{ +#if IS_ENABLED(CONFIG_X86_64) + return ms_hyperv.ext_features & HV_VP_GHCB_ROOT_MAPPING_AVAILABLE; +#else + return 0; +#endif +} + +static int mshv_get_vp_registers(u32 vp_index, u64 partition_id, u16 count, + struct hv_register_assoc *registers) +{ + return hv_call_get_vp_registers(vp_index, partition_id, + count, input_vtl_zero, registers); +} + +static int mshv_set_vp_registers(u32 vp_index, u64 partition_id, u16 count, + struct hv_register_assoc *registers) +{ + return hv_call_set_vp_registers(vp_index, partition_id, + count, input_vtl_zero, registers); +} + +/* + * Explicit guest vCPU suspend is asynchronous by nature (as it is requested by + * dom0 vCPU for guest vCPU) and thus it can race with "intercept" suspend, + * done by the hypervisor. + * "Intercept" suspend leads to asynchronous message delivery to dom0 which + * should be awaited to keep the VP loop consistent (i.e. no message pending + * upon VP resume). + * VP intercept suspend can't be done when the VP is explicitly suspended + * already, and thus can be only two possible race scenarios: + * 1. implicit suspend bit set -> explicit suspend bit set -> message sent + * 2. implicit suspend bit set -> message sent -> explicit suspend bit set + * Checking for implicit suspend bit set after explicit suspend request has + * succeeded in either case allows us to reliably identify, if there is a + * message to receive and deliver to VMM. + */ +static int +mshv_suspend_vp(const struct mshv_vp *vp, bool *message_in_flight) +{ + struct hv_register_assoc explicit_suspend = { + .name = HV_REGISTER_EXPLICIT_SUSPEND + }; + struct hv_register_assoc intercept_suspend = { + .name = HV_REGISTER_INTERCEPT_SUSPEND + }; + union hv_explicit_suspend_register *es = + &explicit_suspend.value.explicit_suspend; + union hv_intercept_suspend_register *is = + &intercept_suspend.value.intercept_suspend; + int ret; + + es->suspended = 1; + + ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id, + 1, &explicit_suspend); + if (ret) { + vp_err(vp, "Failed to explicitly suspend vCPU\n"); + return ret; + } + + ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id, + 1, &intercept_suspend); + if (ret) { + vp_err(vp, "Failed to get intercept suspend state\n"); + return ret; + } + + *message_in_flight = is->suspended; + + return 0; +} + +/* + * This function is used when VPs are scheduled by the hypervisor's + * scheduler. + * + * Caller has to make sure the registers contain cleared + * HV_REGISTER_INTERCEPT_SUSPEND and HV_REGISTER_EXPLICIT_SUSPEND registers + * exactly in this order (the hypervisor clears them sequentially) to avoid + * potential invalid clearing a newly arrived HV_REGISTER_INTERCEPT_SUSPEND + * after VP is released from HV_REGISTER_EXPLICIT_SUSPEND in case of the + * opposite order. 
+ */ +static long mshv_run_vp_with_hyp_scheduler(struct mshv_vp *vp) +{ + long ret; + struct hv_register_assoc suspend_regs[2] = { + { .name = HV_REGISTER_INTERCEPT_SUSPEND }, + { .name = HV_REGISTER_EXPLICIT_SUSPEND } + }; + size_t count = ARRAY_SIZE(suspend_regs); + + /* Resume VP execution */ + ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id, + count, suspend_regs); + if (ret) { + vp_err(vp, "Failed to resume vp execution. %lx\n", ret); + return ret; + } + + ret = wait_event_interruptible(vp->run.vp_suspend_queue, + vp->run.kicked_by_hv == 1); + if (ret) { + bool message_in_flight; + + /* + * Otherwise the waiting was interrupted by a signal: suspend + * the vCPU explicitly and copy message in flight (if any). + */ + ret = mshv_suspend_vp(vp, &message_in_flight); + if (ret) + return ret; + + /* Return if no message in flight */ + if (!message_in_flight) + return -EINTR; + + /* Wait for the message in flight. */ + wait_event(vp->run.vp_suspend_queue, vp->run.kicked_by_hv == 1); + } + + /* + * Reset the flag to make the wait_event call above work + * next time. + */ + vp->run.kicked_by_hv = 0; + + return 0; +} + +static int +mshv_vp_dispatch(struct mshv_vp *vp, u32 flags, + struct hv_output_dispatch_vp *res) +{ + struct hv_input_dispatch_vp *input; + struct hv_output_dispatch_vp *output; + u64 status; + + preempt_disable(); + input = *this_cpu_ptr(root_scheduler_input); + output = *this_cpu_ptr(root_scheduler_output); + + memset(input, 0, sizeof(*input)); + memset(output, 0, sizeof(*output)); + + input->partition_id = vp->vp_partition->pt_id; + input->vp_index = vp->vp_index; + input->time_slice = 0; /* Run forever until something happens */ + input->spec_ctrl = 0; /* TODO: set sensible flags */ + input->flags = flags; + + vp->run.flags.root_sched_dispatched = 1; + status = hv_do_hypercall(HVCALL_DISPATCH_VP, input, output); + vp->run.flags.root_sched_dispatched = 0; + + *res = *output; + preempt_enable(); + + if (!hv_result_success(status)) + vp_err(vp, "%s: status %s\n", __func__, + hv_result_to_string(status)); + + return hv_result_to_errno(status); +} + +static int +mshv_vp_clear_explicit_suspend(struct mshv_vp *vp) +{ + struct hv_register_assoc explicit_suspend = { + .name = HV_REGISTER_EXPLICIT_SUSPEND, + .value.explicit_suspend.suspended = 0, + }; + int ret; + + ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id, + 1, &explicit_suspend); + + if (ret) + vp_err(vp, "Failed to unsuspend\n"); + + return ret; +} + +#if IS_ENABLED(CONFIG_X86_64) +static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp) +{ + if (!vp->vp_register_page) + return 0; + return vp->vp_register_page->interrupt_vectors.as_uint64; +} +#else +static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp) +{ + return 0; +} +#endif + +static bool mshv_vp_dispatch_thread_blocked(struct mshv_vp *vp) +{ + struct hv_stats_page **stats = vp->vp_stats_pages; + u64 *self_vp_cntrs = stats[HV_STATS_AREA_SELF]->vp_cntrs; + u64 *parent_vp_cntrs = stats[HV_STATS_AREA_PARENT]->vp_cntrs; + + if (self_vp_cntrs[VpRootDispatchThreadBlocked]) + return self_vp_cntrs[VpRootDispatchThreadBlocked]; + return parent_vp_cntrs[VpRootDispatchThreadBlocked]; +} + +static int +mshv_vp_wait_for_hv_kick(struct mshv_vp *vp) +{ + int ret; + + ret = wait_event_interruptible(vp->run.vp_suspend_queue, + (vp->run.kicked_by_hv == 1 && + !mshv_vp_dispatch_thread_blocked(vp)) || + mshv_vp_interrupt_pending(vp)); + if (ret) + return -EINTR; + + vp->run.flags.root_sched_blocked = 0; + vp->run.kicked_by_hv = 0; + + return 0; +} + 
+/* Must be called with interrupts enabled */ +static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp) +{ + long ret; + + if (vp->run.flags.root_sched_blocked) { + /* + * Dispatch state of this VP is blocked. Need to wait + * for the hypervisor to clear the blocked state before + * dispatching it. + */ + ret = mshv_vp_wait_for_hv_kick(vp); + if (ret) + return ret; + } + + do { + u32 flags = 0; + struct hv_output_dispatch_vp output; + + if (__xfer_to_guest_mode_work_pending()) { + ret = xfer_to_guest_mode_handle_work(); + if (ret) + break; + } + + if (vp->run.flags.intercept_suspend) + flags |= HV_DISPATCH_VP_FLAG_CLEAR_INTERCEPT_SUSPEND; + + if (mshv_vp_interrupt_pending(vp)) + flags |= HV_DISPATCH_VP_FLAG_SCAN_INTERRUPT_INJECTION; + + ret = mshv_vp_dispatch(vp, flags, &output); + if (ret) + break; + + vp->run.flags.intercept_suspend = 0; + + if (output.dispatch_state == HV_VP_DISPATCH_STATE_BLOCKED) { + if (output.dispatch_event == + HV_VP_DISPATCH_EVENT_SUSPEND) { + /* + * TODO: remove the warning once VP canceling + * is supported + */ + WARN_ONCE(atomic64_read(&vp->run.vp_signaled_count), + "%s: vp#%d: unexpected explicit suspend\n", + __func__, vp->vp_index); + /* + * Need to clear explicit suspend before + * dispatching. + * Explicit suspend is either: + * - set right after the first VP dispatch or + * - set explicitly via hypercall + * Since the latter case is not yet supported, + * simply clear it here. + */ + ret = mshv_vp_clear_explicit_suspend(vp); + if (ret) + break; + + ret = mshv_vp_wait_for_hv_kick(vp); + if (ret) + break; + } else { + vp->run.flags.root_sched_blocked = 1; + ret = mshv_vp_wait_for_hv_kick(vp); + if (ret) + break; + } + } else { + /* HV_VP_DISPATCH_STATE_READY */ + if (output.dispatch_event == + HV_VP_DISPATCH_EVENT_INTERCEPT) + vp->run.flags.intercept_suspend = 1; + } + } while (!vp->run.flags.intercept_suspend); + + rseq_virt_userspace_exit(); + + return ret; +} + +static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ, + "sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ"); + +static struct mshv_mem_region * +mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn) +{ + struct mshv_mem_region *region; + + hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) { + if (gfn >= region->start_gfn && + gfn < region->start_gfn + region->nr_pages) + return region; + } + + return NULL; +} + +#ifdef CONFIG_X86_64 +static struct mshv_mem_region * +mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn) +{ + struct mshv_mem_region *region; + + spin_lock(&p->pt_mem_regions_lock); + region = mshv_partition_region_by_gfn(p, gfn); + if (!region || !mshv_region_get(region)) { + spin_unlock(&p->pt_mem_regions_lock); + return NULL; + } + spin_unlock(&p->pt_mem_regions_lock); + + return region; +} + +/** + * mshv_handle_gpa_intercept - Handle GPA (Guest Physical Address) intercepts. + * @vp: Pointer to the virtual processor structure. + * + * This function processes GPA intercepts by identifying the memory region + * corresponding to the intercepted GPA, aligning the page offset, and + * mapping the required pages. It ensures that the region is valid and + * handles faults efficiently by mapping multiple pages at once. + * + * Return: true if the intercept was handled successfully, false otherwise. 
+ */ +static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) +{ + struct mshv_partition *p = vp->vp_partition; + struct mshv_mem_region *region; + struct hv_x64_memory_intercept_message *msg; + bool ret; + u64 gfn; + + msg = (struct hv_x64_memory_intercept_message *) + vp->vp_intercept_msg_page->u.payload; + + gfn = HVPFN_DOWN(msg->guest_physical_address); + + region = mshv_partition_region_by_gfn_get(p, gfn); + if (!region) + return false; + + /* Only movable memory ranges are supported for GPA intercepts */ + if (region->type == MSHV_REGION_TYPE_MEM_MOVABLE) + ret = mshv_region_handle_gfn_fault(region, gfn); + else + ret = false; + + mshv_region_put(region); + + return ret; +} +#else /* CONFIG_X86_64 */ +static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) { return false; } +#endif /* CONFIG_X86_64 */ + +static bool mshv_vp_handle_intercept(struct mshv_vp *vp) +{ + switch (vp->vp_intercept_msg_page->header.message_type) { + case HVMSG_GPA_INTERCEPT: + return mshv_handle_gpa_intercept(vp); + } + return false; +} + +static long mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void __user *ret_msg) +{ + long rc; + + do { + if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) + rc = mshv_run_vp_with_root_scheduler(vp); + else + rc = mshv_run_vp_with_hyp_scheduler(vp); + } while (rc == 0 && mshv_vp_handle_intercept(vp)); + + if (rc) + return rc; + + if (copy_to_user(ret_msg, vp->vp_intercept_msg_page, + sizeof(struct hv_message))) + rc = -EFAULT; + + return rc; +} + +static int +mshv_vp_ioctl_get_set_state_pfn(struct mshv_vp *vp, + struct hv_vp_state_data state_data, + unsigned long user_pfn, size_t page_count, + bool is_set) +{ + int completed, ret = 0; + unsigned long check; + struct page **pages; + + if (page_count > INT_MAX) + return -EINVAL; + /* + * Check the arithmetic for wraparound/overflow. 
+ * The last page address in the buffer is: + * (user_pfn + (page_count - 1)) * PAGE_SIZE + */ + if (check_add_overflow(user_pfn, (page_count - 1), &check)) + return -EOVERFLOW; + if (check_mul_overflow(check, PAGE_SIZE, &check)) + return -EOVERFLOW; + + /* Pin user pages so hypervisor can copy directly to them */ + pages = kcalloc(page_count, sizeof(struct page *), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + for (completed = 0; completed < page_count; completed += ret) { + unsigned long user_addr = (user_pfn + completed) * PAGE_SIZE; + int remaining = page_count - completed; + + ret = pin_user_pages_fast(user_addr, remaining, FOLL_WRITE, + &pages[completed]); + if (ret < 0) { + vp_err(vp, "%s: Failed to pin user pages error %i\n", + __func__, ret); + goto unpin_pages; + } + } + + if (is_set) + ret = hv_call_set_vp_state(vp->vp_index, + vp->vp_partition->pt_id, + state_data, page_count, pages, + 0, NULL); + else + ret = hv_call_get_vp_state(vp->vp_index, + vp->vp_partition->pt_id, + state_data, page_count, pages, + NULL); + +unpin_pages: + unpin_user_pages(pages, completed); + kfree(pages); + return ret; +} + +static long +mshv_vp_ioctl_get_set_state(struct mshv_vp *vp, + struct mshv_get_set_vp_state __user *user_args, + bool is_set) +{ + struct mshv_get_set_vp_state args; + long ret = 0; + union hv_output_get_vp_state vp_state; + u32 data_sz; + struct hv_vp_state_data state_data = {}; + + if (copy_from_user(&args, user_args, sizeof(args))) + return -EFAULT; + + if (args.type >= MSHV_VP_STATE_COUNT || mshv_field_nonzero(args, rsvd) || + !args.buf_sz || !PAGE_ALIGNED(args.buf_sz) || + !PAGE_ALIGNED(args.buf_ptr)) + return -EINVAL; + + if (!access_ok((void __user *)args.buf_ptr, args.buf_sz)) + return -EFAULT; + + switch (args.type) { + case MSHV_VP_STATE_LAPIC: + state_data.type = HV_GET_SET_VP_STATE_LAPIC_STATE; + data_sz = HV_HYP_PAGE_SIZE; + break; + case MSHV_VP_STATE_XSAVE: + { + u64 data_sz_64; + + ret = hv_call_get_partition_property(vp->vp_partition->pt_id, + HV_PARTITION_PROPERTY_XSAVE_STATES, + &state_data.xsave.states.as_uint64); + if (ret) + return ret; + + ret = hv_call_get_partition_property(vp->vp_partition->pt_id, + HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE, + &data_sz_64); + if (ret) + return ret; + + data_sz = (u32)data_sz_64; + state_data.xsave.flags = 0; + /* Always request legacy states */ + state_data.xsave.states.legacy_x87 = 1; + state_data.xsave.states.legacy_sse = 1; + state_data.type = HV_GET_SET_VP_STATE_XSAVE; + break; + } + case MSHV_VP_STATE_SIMP: + state_data.type = HV_GET_SET_VP_STATE_SIM_PAGE; + data_sz = HV_HYP_PAGE_SIZE; + break; + case MSHV_VP_STATE_SIEFP: + state_data.type = HV_GET_SET_VP_STATE_SIEF_PAGE; + data_sz = HV_HYP_PAGE_SIZE; + break; + case MSHV_VP_STATE_SYNTHETIC_TIMERS: + state_data.type = HV_GET_SET_VP_STATE_SYNTHETIC_TIMERS; + data_sz = sizeof(vp_state.synthetic_timers_state); + break; + default: + return -EINVAL; + } + + if (copy_to_user(&user_args->buf_sz, &data_sz, sizeof(user_args->buf_sz))) + return -EFAULT; + + if (data_sz > args.buf_sz) + return -EINVAL; + + /* If the data is transmitted via pfns, delegate to helper */ + if (state_data.type & HV_GET_SET_VP_STATE_TYPE_PFN) { + unsigned long user_pfn = PFN_DOWN(args.buf_ptr); + size_t page_count = PFN_DOWN(args.buf_sz); + + return mshv_vp_ioctl_get_set_state_pfn(vp, state_data, user_pfn, + page_count, is_set); + } + + /* Paranoia check - this shouldn't happen! 
*/ + if (data_sz > sizeof(vp_state)) { + vp_err(vp, "Invalid vp state data size!\n"); + return -EINVAL; + } + + if (is_set) { + if (copy_from_user(&vp_state, (__user void *)args.buf_ptr, data_sz)) + return -EFAULT; + + return hv_call_set_vp_state(vp->vp_index, + vp->vp_partition->pt_id, + state_data, 0, NULL, + sizeof(vp_state), (u8 *)&vp_state); + } + + ret = hv_call_get_vp_state(vp->vp_index, vp->vp_partition->pt_id, + state_data, 0, NULL, &vp_state); + if (ret) + return ret; + + if (copy_to_user((void __user *)args.buf_ptr, &vp_state, data_sz)) + return -EFAULT; + + return 0; +} + +static long +mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) +{ + struct mshv_vp *vp = filp->private_data; + long r = -ENOTTY; + + if (mutex_lock_killable(&vp->vp_mutex)) + return -EINTR; + + switch (ioctl) { + case MSHV_RUN_VP: + r = mshv_vp_ioctl_run_vp(vp, (void __user *)arg); + break; + case MSHV_GET_VP_STATE: + r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, false); + break; + case MSHV_SET_VP_STATE: + r = mshv_vp_ioctl_get_set_state(vp, (void __user *)arg, true); + break; + case MSHV_ROOT_HVCALL: + r = mshv_ioctl_passthru_hvcall(vp->vp_partition, false, + (void __user *)arg); + break; + default: + vp_warn(vp, "Invalid ioctl: %#x\n", ioctl); + break; + } + mutex_unlock(&vp->vp_mutex); + + return r; +} + +static vm_fault_t mshv_vp_fault(struct vm_fault *vmf) +{ + struct mshv_vp *vp = vmf->vma->vm_file->private_data; + + switch (vmf->vma->vm_pgoff) { + case MSHV_VP_MMAP_OFFSET_REGISTERS: + vmf->page = virt_to_page(vp->vp_register_page); + break; + case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE: + vmf->page = virt_to_page(vp->vp_intercept_msg_page); + break; + case MSHV_VP_MMAP_OFFSET_GHCB: + vmf->page = virt_to_page(vp->vp_ghcb_page); + break; + default: + return VM_FAULT_SIGBUS; + } + + get_page(vmf->page); + + return 0; +} + +static int mshv_vp_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct mshv_vp *vp = file->private_data; + + switch (vma->vm_pgoff) { + case MSHV_VP_MMAP_OFFSET_REGISTERS: + if (!vp->vp_register_page) + return -ENODEV; + break; + case MSHV_VP_MMAP_OFFSET_INTERCEPT_MESSAGE: + if (!vp->vp_intercept_msg_page) + return -ENODEV; + break; + case MSHV_VP_MMAP_OFFSET_GHCB: + if (!vp->vp_ghcb_page) + return -ENODEV; + break; + default: + return -EINVAL; + } + + vma->vm_ops = &mshv_vp_vm_ops; + return 0; +} + +static int +mshv_vp_release(struct inode *inode, struct file *filp) +{ + struct mshv_vp *vp = filp->private_data; + + /* Rest of VP cleanup happens in destroy_partition() */ + mshv_partition_put(vp->vp_partition); + return 0; +} + +static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index, + void *stats_pages[]) +{ + union hv_stats_object_identity identity = { + .vp.partition_id = partition_id, + .vp.vp_index = vp_index, + }; + + identity.vp.stats_area_type = HV_STATS_AREA_SELF; + hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity); + + identity.vp.stats_area_type = HV_STATS_AREA_PARENT; + hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity); +} + +static int mshv_vp_stats_map(u64 partition_id, u32 vp_index, + void *stats_pages[]) +{ + union hv_stats_object_identity identity = { + .vp.partition_id = partition_id, + .vp.vp_index = vp_index, + }; + int err; + + identity.vp.stats_area_type = HV_STATS_AREA_SELF; + err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity, + &stats_pages[HV_STATS_AREA_SELF]); + if (err) + return err; + + identity.vp.stats_area_type = HV_STATS_AREA_PARENT; + err = hv_map_stats_page(HV_STATS_OBJECT_VP, 
&identity, + &stats_pages[HV_STATS_AREA_PARENT]); + if (err) + goto unmap_self; + + return 0; + +unmap_self: + identity.vp.stats_area_type = HV_STATS_AREA_SELF; + hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity); + return err; +} + +static long +mshv_partition_ioctl_create_vp(struct mshv_partition *partition, + void __user *arg) +{ + struct mshv_create_vp args; + struct mshv_vp *vp; + struct page *intercept_msg_page, *register_page, *ghcb_page; + void *stats_pages[2]; + long ret; + + if (copy_from_user(&args, arg, sizeof(args))) + return -EFAULT; + + if (args.vp_index >= MSHV_MAX_VPS) + return -EINVAL; + + if (partition->pt_vp_array[args.vp_index]) + return -EEXIST; + + ret = hv_call_create_vp(NUMA_NO_NODE, partition->pt_id, args.vp_index, + 0 /* Only valid for root partition VPs */); + if (ret) + return ret; + + ret = hv_map_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, + input_vtl_zero, &intercept_msg_page); + if (ret) + goto destroy_vp; + + if (!mshv_partition_encrypted(partition)) { + ret = hv_map_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_REGISTERS, + input_vtl_zero, ®ister_page); + if (ret) + goto unmap_intercept_message_page; + } + + if (mshv_partition_encrypted(partition) && + is_ghcb_mapping_available()) { + ret = hv_map_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_GHCB, + input_vtl_normal, &ghcb_page); + if (ret) + goto unmap_register_page; + } + + /* + * This mapping of the stats page is for detecting if dispatch thread + * is blocked - only relevant for root scheduler + */ + if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) { + ret = mshv_vp_stats_map(partition->pt_id, args.vp_index, + stats_pages); + if (ret) + goto unmap_ghcb_page; + } + + vp = kzalloc(sizeof(*vp), GFP_KERNEL); + if (!vp) + goto unmap_stats_pages; + + vp->vp_partition = mshv_partition_get(partition); + if (!vp->vp_partition) { + ret = -EBADF; + goto free_vp; + } + + mutex_init(&vp->vp_mutex); + init_waitqueue_head(&vp->run.vp_suspend_queue); + atomic64_set(&vp->run.vp_signaled_count, 0); + + vp->vp_index = args.vp_index; + vp->vp_intercept_msg_page = page_to_virt(intercept_msg_page); + if (!mshv_partition_encrypted(partition)) + vp->vp_register_page = page_to_virt(register_page); + + if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) + vp->vp_ghcb_page = page_to_virt(ghcb_page); + + if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) + memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages)); + + /* + * Keep anon_inode_getfd last: it installs fd in the file struct and + * thus makes the state accessible in user space. 
+ */ + ret = anon_inode_getfd("mshv_vp", &mshv_vp_fops, vp, + O_RDWR | O_CLOEXEC); + if (ret < 0) + goto put_partition; + + /* already exclusive with the partition mutex for all ioctls */ + partition->pt_vp_count++; + partition->pt_vp_array[args.vp_index] = vp; + + return ret; + +put_partition: + mshv_partition_put(partition); +free_vp: + kfree(vp); +unmap_stats_pages: + if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) + mshv_vp_stats_unmap(partition->pt_id, args.vp_index, stats_pages); +unmap_ghcb_page: + if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) + hv_unmap_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_GHCB, ghcb_page, + input_vtl_normal); +unmap_register_page: + if (!mshv_partition_encrypted(partition)) + hv_unmap_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_REGISTERS, + register_page, input_vtl_zero); +unmap_intercept_message_page: + hv_unmap_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, + intercept_msg_page, input_vtl_zero); +destroy_vp: + hv_call_delete_vp(partition->pt_id, args.vp_index); + return ret; +} + +static int mshv_init_async_handler(struct mshv_partition *partition) +{ + if (completion_done(&partition->async_hypercall)) { + pt_err(partition, + "Cannot issue async hypercall while another one in progress!\n"); + return -EPERM; + } + + reinit_completion(&partition->async_hypercall); + return 0; +} + +static void mshv_async_hvcall_handler(void *data, u64 *status) +{ + struct mshv_partition *partition = data; + + wait_for_completion(&partition->async_hypercall); + pt_dbg(partition, "Async hypercall completed!\n"); + + *status = partition->async_hypercall_status; +} + +/* + * NB: caller checks and makes sure mem->size is page aligned + * Returns: 0 with regionpp updated on success, or -errno + */ +static int mshv_partition_create_region(struct mshv_partition *partition, + struct mshv_user_mem_region *mem, + struct mshv_mem_region **regionpp, + bool is_mmio) +{ + struct mshv_mem_region *rg; + u64 nr_pages = HVPFN_DOWN(mem->size); + + /* Reject overlapping regions */ + spin_lock(&partition->pt_mem_regions_lock); + hlist_for_each_entry(rg, &partition->pt_mem_regions, hnode) { + if (mem->guest_pfn + nr_pages <= rg->start_gfn || + rg->start_gfn + rg->nr_pages <= mem->guest_pfn) + continue; + spin_unlock(&partition->pt_mem_regions_lock); + return -EEXIST; + } + spin_unlock(&partition->pt_mem_regions_lock); + + rg = mshv_region_create(mem->guest_pfn, nr_pages, + mem->userspace_addr, mem->flags); + if (IS_ERR(rg)) + return PTR_ERR(rg); + + if (is_mmio) + rg->type = MSHV_REGION_TYPE_MMIO; + else if (mshv_partition_encrypted(partition) || + !mshv_region_movable_init(rg)) + rg->type = MSHV_REGION_TYPE_MEM_PINNED; + else + rg->type = MSHV_REGION_TYPE_MEM_MOVABLE; + + rg->partition = partition; + + *regionpp = rg; + + return 0; +} + +/** + * mshv_prepare_pinned_region - Pin and map memory regions + * @region: Pointer to the memory region structure + * + * This function processes memory regions that are explicitly marked as pinned. + * Pinned regions are preallocated, mapped upfront, and do not rely on fault-based + * population. The function ensures the region is properly populated, handles + * encryption requirements for SNP partitions if applicable, maps the region, + * and performs necessary sharing or eviction operations based on the mapping + * result. + * + * Return: 0 on success, negative error code on failure. 
+ */ +static int mshv_prepare_pinned_region(struct mshv_mem_region *region) +{ + struct mshv_partition *partition = region->partition; + int ret; + + ret = mshv_region_pin(region); + if (ret) { + pt_err(partition, "Failed to pin memory region: %d\n", + ret); + goto err_out; + } + + /* + * For an SNP partition it is a requirement that for every memory region + * that we are going to map for this partition we should make sure that + * host access to that region is released. This is ensured by doing an + * additional hypercall which will update the SLAT to release host + * access to guest memory regions. + */ + if (mshv_partition_encrypted(partition)) { + ret = mshv_region_unshare(region); + if (ret) { + pt_err(partition, + "Failed to unshare memory region (guest_pfn: %llu): %d\n", + region->start_gfn, ret); + goto invalidate_region; + } + } + + ret = mshv_region_map(region); + if (ret && mshv_partition_encrypted(partition)) { + int shrc; + + shrc = mshv_region_share(region); + if (!shrc) + goto invalidate_region; + + pt_err(partition, + "Failed to share memory region (guest_pfn: %llu): %d\n", + region->start_gfn, shrc); + /* + * Don't unpin if marking shared failed because pages are no + * longer mapped in the host, ie root, anymore. + */ + goto err_out; + } + + return 0; + +invalidate_region: + mshv_region_invalidate(region); +err_out: + return ret; +} + +/* + * This maps two things: guest RAM and for pci passthru mmio space. + * + * mmio: + * - vfio overloads vm_pgoff to store the mmio start pfn/spa. + * - Two things need to happen for mapping mmio range: + * 1. mapped in the uaddr so VMM can access it. + * 2. mapped in the hwpt (gfn <-> mmio phys addr) so guest can access it. + * + * This function takes care of the second. The first one is managed by vfio, + * and hence is taken care of via vfio_pci_mmap_fault(). + */ +static long +mshv_map_user_memory(struct mshv_partition *partition, + struct mshv_user_mem_region mem) +{ + struct mshv_mem_region *region; + struct vm_area_struct *vma; + bool is_mmio; + ulong mmio_pfn; + long ret; + + if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP) || + !access_ok((const void *)mem.userspace_addr, mem.size)) + return -EINVAL; + + mmap_read_lock(current->mm); + vma = vma_lookup(current->mm, mem.userspace_addr); + is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0; + mmio_pfn = is_mmio ? vma->vm_pgoff : 0; + mmap_read_unlock(current->mm); + + if (!vma) + return -EINVAL; + + ret = mshv_partition_create_region(partition, &mem, ®ion, + is_mmio); + if (ret) + return ret; + + switch (region->type) { + case MSHV_REGION_TYPE_MEM_PINNED: + ret = mshv_prepare_pinned_region(region); + break; + case MSHV_REGION_TYPE_MEM_MOVABLE: + /* + * For movable memory regions, remap with no access to let + * the hypervisor track dirty pages, enabling pre-copy live + * migration. 
+ */ + ret = hv_call_map_gpa_pages(partition->pt_id, + region->start_gfn, + region->nr_pages, + HV_MAP_GPA_NO_ACCESS, NULL); + break; + case MSHV_REGION_TYPE_MMIO: + ret = hv_call_map_mmio_pages(partition->pt_id, + region->start_gfn, + mmio_pfn, + region->nr_pages); + break; + } + + if (ret) + goto errout; + + spin_lock(&partition->pt_mem_regions_lock); + hlist_add_head(®ion->hnode, &partition->pt_mem_regions); + spin_unlock(&partition->pt_mem_regions_lock); + + return 0; + +errout: + vfree(region); + return ret; +} + +/* Called for unmapping both the guest ram and the mmio space */ +static long +mshv_unmap_user_memory(struct mshv_partition *partition, + struct mshv_user_mem_region mem) +{ + struct mshv_mem_region *region; + + if (!(mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP))) + return -EINVAL; + + spin_lock(&partition->pt_mem_regions_lock); + + region = mshv_partition_region_by_gfn(partition, mem.guest_pfn); + if (!region) { + spin_unlock(&partition->pt_mem_regions_lock); + return -ENOENT; + } + + /* Paranoia check */ + if (region->start_uaddr != mem.userspace_addr || + region->start_gfn != mem.guest_pfn || + region->nr_pages != HVPFN_DOWN(mem.size)) { + spin_unlock(&partition->pt_mem_regions_lock); + return -EINVAL; + } + + hlist_del(®ion->hnode); + + spin_unlock(&partition->pt_mem_regions_lock); + + mshv_region_put(region); + + return 0; +} + +static long +mshv_partition_ioctl_set_memory(struct mshv_partition *partition, + struct mshv_user_mem_region __user *user_mem) +{ + struct mshv_user_mem_region mem; + + if (copy_from_user(&mem, user_mem, sizeof(mem))) + return -EFAULT; + + if (!mem.size || + !PAGE_ALIGNED(mem.size) || + !PAGE_ALIGNED(mem.userspace_addr) || + (mem.flags & ~MSHV_SET_MEM_FLAGS_MASK) || + mshv_field_nonzero(mem, rsvd)) + return -EINVAL; + + if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP)) + return mshv_unmap_user_memory(partition, mem); + + return mshv_map_user_memory(partition, mem); +} + +static long +mshv_partition_ioctl_ioeventfd(struct mshv_partition *partition, + void __user *user_args) +{ + struct mshv_user_ioeventfd args; + + if (copy_from_user(&args, user_args, sizeof(args))) + return -EFAULT; + + return mshv_set_unset_ioeventfd(partition, &args); +} + +static long +mshv_partition_ioctl_irqfd(struct mshv_partition *partition, + void __user *user_args) +{ + struct mshv_user_irqfd args; + + if (copy_from_user(&args, user_args, sizeof(args))) + return -EFAULT; + + return mshv_set_unset_irqfd(partition, &args); +} + +static long +mshv_partition_ioctl_get_gpap_access_bitmap(struct mshv_partition *partition, + void __user *user_args) +{ + struct mshv_gpap_access_bitmap args; + union hv_gpa_page_access_state *states; + long ret, i; + union hv_gpa_page_access_state_flags hv_flags = {}; + u8 hv_type_mask; + ulong bitmap_buf_sz, states_buf_sz; + int written = 0; + + if (copy_from_user(&args, user_args, sizeof(args))) + return -EFAULT; + + if (args.access_type >= MSHV_GPAP_ACCESS_TYPE_COUNT || + args.access_op >= MSHV_GPAP_ACCESS_OP_COUNT || + mshv_field_nonzero(args, rsvd) || !args.page_count || + !args.bitmap_ptr) + return -EINVAL; + + if (check_mul_overflow(args.page_count, sizeof(*states), &states_buf_sz)) + return -E2BIG; + + /* Num bytes needed to store bitmap; one bit per page rounded up */ + bitmap_buf_sz = DIV_ROUND_UP(args.page_count, 8); + + /* Sanity check */ + if (bitmap_buf_sz > states_buf_sz) + return -EBADFD; + + switch (args.access_type) { + case MSHV_GPAP_ACCESS_TYPE_ACCESSED: + hv_type_mask = 1; + if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) { + 
hv_flags.clear_accessed = 1; + /* not accessed implies not dirty */ + hv_flags.clear_dirty = 1; + } else { /* MSHV_GPAP_ACCESS_OP_SET */ + hv_flags.set_accessed = 1; + } + break; + case MSHV_GPAP_ACCESS_TYPE_DIRTY: + hv_type_mask = 2; + if (args.access_op == MSHV_GPAP_ACCESS_OP_CLEAR) { + hv_flags.clear_dirty = 1; + } else { /* MSHV_GPAP_ACCESS_OP_SET */ + hv_flags.set_dirty = 1; + /* dirty implies accessed */ + hv_flags.set_accessed = 1; + } + break; + } + + states = vzalloc(states_buf_sz); + if (!states) + return -ENOMEM; + + ret = hv_call_get_gpa_access_states(partition->pt_id, args.page_count, + args.gpap_base, hv_flags, &written, + states); + if (ret) + goto free_return; + + /* + * Overwrite states buffer with bitmap - the bits in hv_type_mask + * correspond to bitfields in hv_gpa_page_access_state + */ + for (i = 0; i < written; ++i) + __assign_bit(i, (ulong *)states, + states[i].as_uint8 & hv_type_mask); + + /* zero the unused bits in the last byte(s) of the returned bitmap */ + for (i = written; i < bitmap_buf_sz * 8; ++i) + __clear_bit(i, (ulong *)states); + + if (copy_to_user((void __user *)args.bitmap_ptr, states, bitmap_buf_sz)) + ret = -EFAULT; + +free_return: + vfree(states); + return ret; +} + +static long +mshv_partition_ioctl_set_msi_routing(struct mshv_partition *partition, + void __user *user_args) +{ + struct mshv_user_irq_entry *entries = NULL; + struct mshv_user_irq_table args; + long ret; + + if (copy_from_user(&args, user_args, sizeof(args))) + return -EFAULT; + + if (args.nr > MSHV_MAX_GUEST_IRQS || + mshv_field_nonzero(args, rsvd)) + return -EINVAL; + + if (args.nr) { + struct mshv_user_irq_table __user *urouting = user_args; + + entries = vmemdup_user(urouting->entries, + array_size(sizeof(*entries), + args.nr)); + if (IS_ERR(entries)) + return PTR_ERR(entries); + } + ret = mshv_update_routing_table(partition, entries, args.nr); + kvfree(entries); + + return ret; +} + +static long +mshv_partition_ioctl_initialize(struct mshv_partition *partition) +{ + long ret; + + if (partition->pt_initialized) + return 0; + + ret = hv_call_initialize_partition(partition->pt_id); + if (ret) + goto withdraw_mem; + + partition->pt_initialized = true; + + return 0; + +withdraw_mem: + hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id); + + return ret; +} + +static long +mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) +{ + struct mshv_partition *partition = filp->private_data; + long ret; + void __user *uarg = (void __user *)arg; + + if (mutex_lock_killable(&partition->pt_mutex)) + return -EINTR; + + switch (ioctl) { + case MSHV_INITIALIZE_PARTITION: + ret = mshv_partition_ioctl_initialize(partition); + break; + case MSHV_SET_GUEST_MEMORY: + ret = mshv_partition_ioctl_set_memory(partition, uarg); + break; + case MSHV_CREATE_VP: + ret = mshv_partition_ioctl_create_vp(partition, uarg); + break; + case MSHV_IRQFD: + ret = mshv_partition_ioctl_irqfd(partition, uarg); + break; + case MSHV_IOEVENTFD: + ret = mshv_partition_ioctl_ioeventfd(partition, uarg); + break; + case MSHV_SET_MSI_ROUTING: + ret = mshv_partition_ioctl_set_msi_routing(partition, uarg); + break; + case MSHV_GET_GPAP_ACCESS_BITMAP: + ret = mshv_partition_ioctl_get_gpap_access_bitmap(partition, + uarg); + break; + case MSHV_ROOT_HVCALL: + ret = mshv_ioctl_passthru_hvcall(partition, true, uarg); + break; + default: + ret = -ENOTTY; + } + + mutex_unlock(&partition->pt_mutex); + return ret; +} + +static int +disable_vp_dispatch(struct mshv_vp *vp) +{ + int ret; + struct 
hv_register_assoc dispatch_suspend = { + .name = HV_REGISTER_DISPATCH_SUSPEND, + .value.dispatch_suspend.suspended = 1, + }; + + ret = mshv_set_vp_registers(vp->vp_index, vp->vp_partition->pt_id, + 1, &dispatch_suspend); + if (ret) + vp_err(vp, "failed to suspend\n"); + + return ret; +} + +static int +get_vp_signaled_count(struct mshv_vp *vp, u64 *count) +{ + int ret; + struct hv_register_assoc root_signal_count = { + .name = HV_REGISTER_VP_ROOT_SIGNAL_COUNT, + }; + + ret = mshv_get_vp_registers(vp->vp_index, vp->vp_partition->pt_id, + 1, &root_signal_count); + + if (ret) { + vp_err(vp, "Failed to get root signal count"); + *count = 0; + return ret; + } + + *count = root_signal_count.value.reg64; + + return ret; +} + +static void +drain_vp_signals(struct mshv_vp *vp) +{ + u64 hv_signal_count; + u64 vp_signal_count; + + get_vp_signaled_count(vp, &hv_signal_count); + + vp_signal_count = atomic64_read(&vp->run.vp_signaled_count); + + /* + * There should be at most 1 outstanding notification, but be extra + * careful anyway. + */ + while (hv_signal_count != vp_signal_count) { + WARN_ON(hv_signal_count - vp_signal_count != 1); + + if (wait_event_interruptible(vp->run.vp_suspend_queue, + vp->run.kicked_by_hv == 1)) + break; + vp->run.kicked_by_hv = 0; + vp_signal_count = atomic64_read(&vp->run.vp_signaled_count); + } +} + +static void drain_all_vps(const struct mshv_partition *partition) +{ + int i; + struct mshv_vp *vp; + + /* + * VPs are reachable from ISR. It is safe to not take the partition + * lock because nobody else can enter this function and drop the + * partition from the list. + */ + for (i = 0; i < MSHV_MAX_VPS; i++) { + vp = partition->pt_vp_array[i]; + if (!vp) + continue; + /* + * Disable dispatching of the VP in the hypervisor. After this + * the hypervisor guarantees it won't generate any signals for + * the VP and the hypervisor's VP signal count won't change. + */ + disable_vp_dispatch(vp); + drain_vp_signals(vp); + } +} + +static void +remove_partition(struct mshv_partition *partition) +{ + spin_lock(&mshv_root.pt_ht_lock); + hlist_del_rcu(&partition->pt_hnode); + spin_unlock(&mshv_root.pt_ht_lock); + + synchronize_rcu(); +} + +/* + * Tear down a partition and remove it from the list. + * Partition's refcount must be 0 + */ +static void destroy_partition(struct mshv_partition *partition) +{ + struct mshv_vp *vp; + struct mshv_mem_region *region; + struct hlist_node *n; + int i; + + if (refcount_read(&partition->pt_ref_count)) { + pt_err(partition, + "Attempt to destroy partition but refcount > 0\n"); + return; + } + + if (partition->pt_initialized) { + /* + * We only need to drain signals for root scheduler. This should be + * done before removing the partition from the partition list. 
+ */ + if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) + drain_all_vps(partition); + + /* Remove vps */ + for (i = 0; i < MSHV_MAX_VPS; ++i) { + vp = partition->pt_vp_array[i]; + if (!vp) + continue; + + if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) + mshv_vp_stats_unmap(partition->pt_id, vp->vp_index, + (void **)vp->vp_stats_pages); + + if (vp->vp_register_page) { + (void)hv_unmap_vp_state_page(partition->pt_id, + vp->vp_index, + HV_VP_STATE_PAGE_REGISTERS, + virt_to_page(vp->vp_register_page), + input_vtl_zero); + vp->vp_register_page = NULL; + } + + (void)hv_unmap_vp_state_page(partition->pt_id, + vp->vp_index, + HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, + virt_to_page(vp->vp_intercept_msg_page), + input_vtl_zero); + vp->vp_intercept_msg_page = NULL; + + if (vp->vp_ghcb_page) { + (void)hv_unmap_vp_state_page(partition->pt_id, + vp->vp_index, + HV_VP_STATE_PAGE_GHCB, + virt_to_page(vp->vp_ghcb_page), + input_vtl_normal); + vp->vp_ghcb_page = NULL; + } + + kfree(vp); + + partition->pt_vp_array[i] = NULL; + } + + /* Deallocates and unmaps everything including vcpus, GPA mappings etc */ + hv_call_finalize_partition(partition->pt_id); + + partition->pt_initialized = false; + } + + remove_partition(partition); + + hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions, + hnode) { + hlist_del(®ion->hnode); + mshv_region_put(region); + } + + /* Withdraw and free all pages we deposited */ + hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id); + hv_call_delete_partition(partition->pt_id); + + mshv_free_routing_table(partition); + kfree(partition); +} + +struct +mshv_partition *mshv_partition_get(struct mshv_partition *partition) +{ + if (refcount_inc_not_zero(&partition->pt_ref_count)) + return partition; + return NULL; +} + +struct +mshv_partition *mshv_partition_find(u64 partition_id) + __must_hold(RCU) +{ + struct mshv_partition *p; + + hash_for_each_possible_rcu(mshv_root.pt_htable, p, pt_hnode, + partition_id) + if (p->pt_id == partition_id) + return p; + + return NULL; +} + +void +mshv_partition_put(struct mshv_partition *partition) +{ + if (refcount_dec_and_test(&partition->pt_ref_count)) + destroy_partition(partition); +} + +static int +mshv_partition_release(struct inode *inode, struct file *filp) +{ + struct mshv_partition *partition = filp->private_data; + + mshv_eventfd_release(partition); + + cleanup_srcu_struct(&partition->pt_irq_srcu); + + mshv_partition_put(partition); + + return 0; +} + +static int +add_partition(struct mshv_partition *partition) +{ + spin_lock(&mshv_root.pt_ht_lock); + + hash_add_rcu(mshv_root.pt_htable, &partition->pt_hnode, + partition->pt_id); + + spin_unlock(&mshv_root.pt_ht_lock); + + return 0; +} + +static_assert(MSHV_NUM_CPU_FEATURES_BANKS == + HV_PARTITION_PROCESSOR_FEATURES_BANKS); + +static long mshv_ioctl_process_pt_flags(void __user *user_arg, u64 *pt_flags, + struct hv_partition_creation_properties *cr_props, + union hv_partition_isolation_properties *isol_props) +{ + int i; + struct mshv_create_partition_v2 args; + union hv_partition_processor_features *disabled_procs; + union hv_partition_processor_xsave_features *disabled_xsave; + + /* First, copy v1 struct in case user is on previous versions */ + if (copy_from_user(&args, user_arg, + sizeof(struct mshv_create_partition))) + return -EFAULT; + + if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) || + args.pt_isolation >= MSHV_PT_ISOLATION_COUNT) + return -EINVAL; + + disabled_procs = &cr_props->disabled_processor_features; + disabled_xsave = 
&cr_props->disabled_processor_xsave_features; + + /* Check if user provided newer struct with feature fields */ + if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES)) { + if (copy_from_user(&args, user_arg, sizeof(args))) + return -EFAULT; + + /* Re-validate v1 fields after second copy_from_user() */ + if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) || + args.pt_isolation >= MSHV_PT_ISOLATION_COUNT) + return -EINVAL; + + if (args.pt_num_cpu_fbanks != MSHV_NUM_CPU_FEATURES_BANKS || + mshv_field_nonzero(args, pt_rsvd) || + mshv_field_nonzero(args, pt_rsvd1)) + return -EINVAL; + + /* + * Note this assumes MSHV_NUM_CPU_FEATURES_BANKS will never + * change and equals HV_PARTITION_PROCESSOR_FEATURES_BANKS + * (i.e. 2). + * + * Further banks (index >= 2) will be modifiable as 'early' + * properties via the set partition property hypercall. + */ + for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++) + disabled_procs->as_uint64[i] = args.pt_cpu_fbanks[i]; + +#if IS_ENABLED(CONFIG_X86_64) + disabled_xsave->as_uint64 = args.pt_disabled_xsave; +#else + /* + * In practice this field is ignored on arm64, but safer to + * zero it in case it is ever used. + */ + disabled_xsave->as_uint64 = 0; + + if (mshv_field_nonzero(args, pt_rsvd2)) + return -EINVAL; +#endif + } else { + /* + * v1 behavior: try to enable everything. The hypervisor will + * disable features that are not supported. The banks can be + * queried via the get partition property hypercall. + */ + for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++) + disabled_procs->as_uint64[i] = 0; + + disabled_xsave->as_uint64 = 0; + } + + /* Only support EXO partitions */ + *pt_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION | + HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED; + + if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_LAPIC)) + *pt_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED; + if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_X2APIC)) + *pt_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE; + if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_GPA_SUPER_PAGES)) + *pt_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED; + + isol_props->as_uint64 = 0; + + switch (args.pt_isolation) { + case MSHV_PT_ISOLATION_NONE: + isol_props->isolation_type = HV_PARTITION_ISOLATION_TYPE_NONE; + break; + } + + return 0; +} + +static long +mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev) +{ + u64 creation_flags; + struct hv_partition_creation_properties creation_properties; + union hv_partition_isolation_properties isolation_properties; + struct mshv_partition *partition; + long ret; + + ret = mshv_ioctl_process_pt_flags(user_arg, &creation_flags, + &creation_properties, + &isolation_properties); + if (ret) + return ret; + + partition = kzalloc(sizeof(*partition), GFP_KERNEL); + if (!partition) + return -ENOMEM; + + partition->pt_module_dev = module_dev; + partition->isolation_type = isolation_properties.isolation_type; + + refcount_set(&partition->pt_ref_count, 1); + + mutex_init(&partition->pt_mutex); + + mutex_init(&partition->pt_irq_lock); + + init_completion(&partition->async_hypercall); + + INIT_HLIST_HEAD(&partition->irq_ack_notifier_list); + + INIT_HLIST_HEAD(&partition->pt_devices); + + spin_lock_init(&partition->pt_mem_regions_lock); + INIT_HLIST_HEAD(&partition->pt_mem_regions); + + mshv_eventfd_init(partition); + + ret = init_srcu_struct(&partition->pt_irq_srcu); + if (ret) + goto free_partition; + + ret = hv_call_create_partition(creation_flags, + creation_properties, + isolation_properties, + 
&partition->pt_id); + if (ret) + goto cleanup_irq_srcu; + + ret = add_partition(partition); + if (ret) + goto delete_partition; + + ret = mshv_init_async_handler(partition); + if (!ret) { + ret = FD_ADD(O_CLOEXEC, anon_inode_getfile("mshv_partition", + &mshv_partition_fops, + partition, O_RDWR)); + if (ret >= 0) + return ret; + } + remove_partition(partition); +delete_partition: + hv_call_delete_partition(partition->pt_id); +cleanup_irq_srcu: + cleanup_srcu_struct(&partition->pt_irq_srcu); +free_partition: + kfree(partition); + + return ret; +} + +static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + struct miscdevice *misc = filp->private_data; + + switch (ioctl) { + case MSHV_CREATE_PARTITION: + return mshv_ioctl_create_partition((void __user *)arg, + misc->this_device); + case MSHV_ROOT_HVCALL: + return mshv_ioctl_passthru_hvcall(NULL, false, + (void __user *)arg); + } + + return -ENOTTY; +} + +static int +mshv_dev_open(struct inode *inode, struct file *filp) +{ + return 0; +} + +static int +mshv_dev_release(struct inode *inode, struct file *filp) +{ + return 0; +} + +static int mshv_cpuhp_online; +static int mshv_root_sched_online; + +static const char *scheduler_type_to_string(enum hv_scheduler_type type) +{ + switch (type) { + case HV_SCHEDULER_TYPE_LP: + return "classic scheduler without SMT"; + case HV_SCHEDULER_TYPE_LP_SMT: + return "classic scheduler with SMT"; + case HV_SCHEDULER_TYPE_CORE_SMT: + return "core scheduler"; + case HV_SCHEDULER_TYPE_ROOT: + return "root scheduler"; + default: + return "unknown scheduler"; + }; +} + +/* TODO move this to hv_common.c when needed outside */ +static int __init hv_retrieve_scheduler_type(enum hv_scheduler_type *out) +{ + struct hv_input_get_system_property *input; + struct hv_output_get_system_property *output; + unsigned long flags; + u64 status; + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + memset(input, 0, sizeof(*input)); + memset(output, 0, sizeof(*output)); + input->property_id = HV_SYSTEM_PROPERTY_SCHEDULER_TYPE; + + status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output); + if (!hv_result_success(status)) { + local_irq_restore(flags); + pr_err("%s: %s\n", __func__, hv_result_to_string(status)); + return hv_result_to_errno(status); + } + + *out = output->scheduler_type; + local_irq_restore(flags); + + return 0; +} + +/* Retrieve and stash the supported scheduler type */ +static int __init mshv_retrieve_scheduler_type(struct device *dev) +{ + int ret = 0; + + if (hv_l1vh_partition()) + hv_scheduler_type = HV_SCHEDULER_TYPE_CORE_SMT; + else + ret = hv_retrieve_scheduler_type(&hv_scheduler_type); + + if (ret) + return ret; + + dev_info(dev, "Hypervisor using %s\n", + scheduler_type_to_string(hv_scheduler_type)); + + switch (hv_scheduler_type) { + case HV_SCHEDULER_TYPE_CORE_SMT: + case HV_SCHEDULER_TYPE_LP_SMT: + case HV_SCHEDULER_TYPE_ROOT: + case HV_SCHEDULER_TYPE_LP: + /* Supported scheduler, nothing to do */ + break; + default: + dev_err(dev, "unsupported scheduler 0x%x, bailing.\n", + hv_scheduler_type); + return -EOPNOTSUPP; + } + + return 0; +} + +static int mshv_root_scheduler_init(unsigned int cpu) +{ + void **inputarg, **outputarg, *p; + + inputarg = (void **)this_cpu_ptr(root_scheduler_input); + outputarg = (void **)this_cpu_ptr(root_scheduler_output); + + /* Allocate two consecutive pages. One for input, one for output. 
*/ + p = kmalloc(2 * HV_HYP_PAGE_SIZE, GFP_KERNEL); + if (!p) + return -ENOMEM; + + *inputarg = p; + *outputarg = (char *)p + HV_HYP_PAGE_SIZE; + + return 0; +} + +static int mshv_root_scheduler_cleanup(unsigned int cpu) +{ + void *p, **inputarg, **outputarg; + + inputarg = (void **)this_cpu_ptr(root_scheduler_input); + outputarg = (void **)this_cpu_ptr(root_scheduler_output); + + p = *inputarg; + + *inputarg = NULL; + *outputarg = NULL; + + kfree(p); + + return 0; +} + +/* Must be called after retrieving the scheduler type */ +static int +root_scheduler_init(struct device *dev) +{ + int ret; + + if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT) + return 0; + + root_scheduler_input = alloc_percpu(void *); + root_scheduler_output = alloc_percpu(void *); + + if (!root_scheduler_input || !root_scheduler_output) { + dev_err(dev, "Failed to allocate root scheduler buffers\n"); + ret = -ENOMEM; + goto out; + } + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_root_sched", + mshv_root_scheduler_init, + mshv_root_scheduler_cleanup); + + if (ret < 0) { + dev_err(dev, "Failed to setup root scheduler state: %i\n", ret); + goto out; + } + + mshv_root_sched_online = ret; + + return 0; + +out: + free_percpu(root_scheduler_input); + free_percpu(root_scheduler_output); + return ret; +} + +static void +root_scheduler_deinit(void) +{ + if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT) + return; + + cpuhp_remove_state(mshv_root_sched_online); + free_percpu(root_scheduler_input); + free_percpu(root_scheduler_output); +} + +static int mshv_reboot_notify(struct notifier_block *nb, + unsigned long code, void *unused) +{ + cpuhp_remove_state(mshv_cpuhp_online); + return 0; +} + +struct notifier_block mshv_reboot_nb = { + .notifier_call = mshv_reboot_notify, +}; + +static void mshv_root_partition_exit(void) +{ + unregister_reboot_notifier(&mshv_reboot_nb); + root_scheduler_deinit(); +} + +static int __init mshv_root_partition_init(struct device *dev) +{ + int err; + + err = root_scheduler_init(dev); + if (err) + return err; + + err = register_reboot_notifier(&mshv_reboot_nb); + if (err) + goto root_sched_deinit; + + return 0; + +root_sched_deinit: + root_scheduler_deinit(); + return err; +} + +static void mshv_init_vmm_caps(struct device *dev) +{ + /* + * This can only fail here if HVCALL_GET_PARTITION_PROPERTY_EX or + * HV_PARTITION_PROPERTY_VMM_CAPABILITIES are not supported. In that + * case it's valid to proceed as if all vmm_caps are disabled (zero). 
+ */ + if (hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF, + HV_PARTITION_PROPERTY_VMM_CAPABILITIES, + 0, &mshv_root.vmm_caps, + sizeof(mshv_root.vmm_caps))) + dev_warn(dev, "Unable to get VMM capabilities\n"); + + dev_dbg(dev, "vmm_caps = %#llx\n", mshv_root.vmm_caps.as_uint64[0]); +} + +static int __init mshv_parent_partition_init(void) +{ + int ret; + struct device *dev; + union hv_hypervisor_version_info version_info; + + if (!hv_parent_partition() || is_kdump_kernel()) + return -ENODEV; + + if (hv_get_hypervisor_version(&version_info)) + return -ENODEV; + + ret = misc_register(&mshv_dev); + if (ret) + return ret; + + dev = mshv_dev.this_device; + + if (version_info.build_number < MSHV_HV_MIN_VERSION || + version_info.build_number > MSHV_HV_MAX_VERSION) { + dev_err(dev, "Running on unvalidated Hyper-V version\n"); + dev_err(dev, "Versions: current: %u min: %u max: %u\n", + version_info.build_number, MSHV_HV_MIN_VERSION, + MSHV_HV_MAX_VERSION); + } + + mshv_root.synic_pages = alloc_percpu(struct hv_synic_pages); + if (!mshv_root.synic_pages) { + dev_err(dev, "Failed to allocate percpu synic page\n"); + ret = -ENOMEM; + goto device_deregister; + } + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic", + mshv_synic_init, + mshv_synic_cleanup); + if (ret < 0) { + dev_err(dev, "Failed to setup cpu hotplug state: %i\n", ret); + goto free_synic_pages; + } + + mshv_cpuhp_online = ret; + + ret = mshv_retrieve_scheduler_type(dev); + if (ret) + goto remove_cpu_state; + + if (hv_root_partition()) + ret = mshv_root_partition_init(dev); + if (ret) + goto remove_cpu_state; + + mshv_init_vmm_caps(dev); + + ret = mshv_irqfd_wq_init(); + if (ret) + goto exit_partition; + + spin_lock_init(&mshv_root.pt_ht_lock); + hash_init(mshv_root.pt_htable); + + hv_setup_mshv_handler(mshv_isr); + + return 0; + +exit_partition: + if (hv_root_partition()) + mshv_root_partition_exit(); +remove_cpu_state: + cpuhp_remove_state(mshv_cpuhp_online); +free_synic_pages: + free_percpu(mshv_root.synic_pages); +device_deregister: + misc_deregister(&mshv_dev); + return ret; +} + +static void __exit mshv_parent_partition_exit(void) +{ + hv_setup_mshv_handler(NULL); + mshv_port_table_fini(); + misc_deregister(&mshv_dev); + mshv_irqfd_wq_cleanup(); + if (hv_root_partition()) + mshv_root_partition_exit(); + cpuhp_remove_state(mshv_cpuhp_online); + free_percpu(mshv_root.synic_pages); +} + +module_init(mshv_parent_partition_init); +module_exit(mshv_parent_partition_exit); diff --git a/drivers/hv/mshv_synic.c b/drivers/hv/mshv_synic.c new file mode 100644 index 000000000000..f8b0337cdc82 --- /dev/null +++ b/drivers/hv/mshv_synic.c @@ -0,0 +1,665 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2023, Microsoft Corporation. + * + * mshv_root module's main interrupt handler and associated functionality. 
+ * + * Authors: Microsoft Linux virtualization team + */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/io.h> +#include <linux/random.h> +#include <asm/mshyperv.h> + +#include "mshv_eventfd.h" +#include "mshv.h" + +static u32 synic_event_ring_get_queued_port(u32 sint_index) +{ + struct hv_synic_event_ring_page **event_ring_page; + volatile struct hv_synic_event_ring *ring; + struct hv_synic_pages *spages; + u8 **synic_eventring_tail; + u32 message; + u8 tail; + + spages = this_cpu_ptr(mshv_root.synic_pages); + event_ring_page = &spages->synic_event_ring_page; + synic_eventring_tail = (u8 **)this_cpu_ptr(hv_synic_eventring_tail); + + if (unlikely(!*synic_eventring_tail)) { + pr_debug("Missing synic event ring tail!\n"); + return 0; + } + tail = (*synic_eventring_tail)[sint_index]; + + if (unlikely(!*event_ring_page)) { + pr_debug("Missing synic event ring page!\n"); + return 0; + } + + ring = &(*event_ring_page)->sint_event_ring[sint_index]; + + /* + * Get the message. + */ + message = ring->data[tail]; + + if (!message) { + if (ring->ring_full) { + /* + * Ring is marked full, but we would have consumed all + * the messages. Notify the hypervisor that ring is now + * empty and check again. + */ + ring->ring_full = 0; + hv_call_notify_port_ring_empty(sint_index); + message = ring->data[tail]; + } + + if (!message) { + ring->signal_masked = 0; + /* + * Unmask the signal and sync with hypervisor + * before one last check for any message. + */ + mb(); + message = ring->data[tail]; + + /* + * Ok, lets bail out. + */ + if (!message) + return 0; + } + + ring->signal_masked = 1; + } + + /* + * Clear the message in the ring buffer. + */ + ring->data[tail] = 0; + + if (++tail == HV_SYNIC_EVENT_RING_MESSAGE_COUNT) + tail = 0; + + (*synic_eventring_tail)[sint_index] = tail; + + return message; +} + +static bool +mshv_doorbell_isr(struct hv_message *msg) +{ + struct hv_notification_message_payload *notification; + u32 port; + + if (msg->header.message_type != HVMSG_SYNIC_SINT_INTERCEPT) + return false; + + notification = (struct hv_notification_message_payload *)msg->u.payload; + if (notification->sint_index != HV_SYNIC_DOORBELL_SINT_INDEX) + return false; + + while ((port = synic_event_ring_get_queued_port(HV_SYNIC_DOORBELL_SINT_INDEX))) { + struct port_table_info ptinfo = { 0 }; + + if (mshv_portid_lookup(port, &ptinfo)) { + pr_debug("Failed to get port info from port_table!\n"); + continue; + } + + if (ptinfo.hv_port_type != HV_PORT_TYPE_DOORBELL) { + pr_debug("Not a doorbell port!, port: %d, port_type: %d\n", + port, ptinfo.hv_port_type); + continue; + } + + /* Invoke the callback */ + ptinfo.hv_port_doorbell.doorbell_cb(port, + ptinfo.hv_port_doorbell.data); + } + + return true; +} + +static bool mshv_async_call_completion_isr(struct hv_message *msg) +{ + bool handled = false; + struct hv_async_completion_message_payload *async_msg; + struct mshv_partition *partition; + u64 partition_id; + + if (msg->header.message_type != HVMSG_ASYNC_CALL_COMPLETION) + goto out; + + async_msg = + (struct hv_async_completion_message_payload *)msg->u.payload; + + partition_id = async_msg->partition_id; + + /* + * Hold this lock for the rest of the isr, because the partition could + * be released anytime. + * e.g. the MSHV_RUN_VP thread could wake on another cpu; it could + * release the partition unless we hold this! 
+ */ + rcu_read_lock(); + + partition = mshv_partition_find(partition_id); + + if (unlikely(!partition)) { + pr_debug("failed to find partition %llu\n", partition_id); + goto unlock_out; + } + + partition->async_hypercall_status = async_msg->status; + complete(&partition->async_hypercall); + + handled = true; + +unlock_out: + rcu_read_unlock(); +out: + return handled; +} + +static void kick_vp(struct mshv_vp *vp) +{ + atomic64_inc(&vp->run.vp_signaled_count); + vp->run.kicked_by_hv = 1; + wake_up(&vp->run.vp_suspend_queue); +} + +static void +handle_bitset_message(const struct hv_vp_signal_bitset_scheduler_message *msg) +{ + int bank_idx, vps_signaled = 0, bank_mask_size; + struct mshv_partition *partition; + const struct hv_vpset *vpset; + const u64 *bank_contents; + u64 partition_id = msg->partition_id; + + if (msg->vp_bitset.bitset.format != HV_GENERIC_SET_SPARSE_4K) { + pr_debug("scheduler message format is not HV_GENERIC_SET_SPARSE_4K"); + return; + } + + if (msg->vp_count == 0) { + pr_debug("scheduler message with no VP specified"); + return; + } + + rcu_read_lock(); + + partition = mshv_partition_find(partition_id); + if (unlikely(!partition)) { + pr_debug("failed to find partition %llu\n", partition_id); + goto unlock_out; + } + + vpset = &msg->vp_bitset.bitset; + + bank_idx = -1; + bank_contents = vpset->bank_contents; + bank_mask_size = sizeof(vpset->valid_bank_mask) * BITS_PER_BYTE; + + while (true) { + int vp_bank_idx = -1; + int vp_bank_size = sizeof(*bank_contents) * BITS_PER_BYTE; + int vp_index; + + bank_idx = find_next_bit((unsigned long *)&vpset->valid_bank_mask, + bank_mask_size, bank_idx + 1); + if (bank_idx == bank_mask_size) + break; + + while (true) { + struct mshv_vp *vp; + + vp_bank_idx = find_next_bit((unsigned long *)bank_contents, + vp_bank_size, vp_bank_idx + 1); + if (vp_bank_idx == vp_bank_size) + break; + + vp_index = (bank_idx * vp_bank_size) + vp_bank_idx; + + /* This shouldn't happen, but just in case. */ + if (unlikely(vp_index >= MSHV_MAX_VPS)) { + pr_debug("VP index %u out of bounds\n", + vp_index); + goto unlock_out; + } + + vp = partition->pt_vp_array[vp_index]; + if (unlikely(!vp)) { + pr_debug("failed to find VP %u\n", vp_index); + goto unlock_out; + } + + kick_vp(vp); + vps_signaled++; + } + + bank_contents++; + } + +unlock_out: + rcu_read_unlock(); + + if (vps_signaled != msg->vp_count) + pr_debug("asked to signal %u VPs but only did %u\n", + msg->vp_count, vps_signaled); +} + +static void +handle_pair_message(const struct hv_vp_signal_pair_scheduler_message *msg) +{ + struct mshv_partition *partition = NULL; + struct mshv_vp *vp; + int idx; + + rcu_read_lock(); + + for (idx = 0; idx < msg->vp_count; idx++) { + u64 partition_id = msg->partition_ids[idx]; + u32 vp_index = msg->vp_indexes[idx]; + + if (idx == 0 || partition->pt_id != partition_id) { + partition = mshv_partition_find(partition_id); + if (unlikely(!partition)) { + pr_debug("failed to find partition %llu\n", + partition_id); + break; + } + } + + /* This shouldn't happen, but just in case. 
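+	 * vp_index comes straight from the hypervisor message, so bound it
+	 * before indexing pt_vp_array.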
*/ + if (unlikely(vp_index >= MSHV_MAX_VPS)) { + pr_debug("VP index %u out of bounds\n", vp_index); + break; + } + + vp = partition->pt_vp_array[vp_index]; + if (!vp) { + pr_debug("failed to find VP %u\n", vp_index); + break; + } + + kick_vp(vp); + } + + rcu_read_unlock(); +} + +static bool +mshv_scheduler_isr(struct hv_message *msg) +{ + if (msg->header.message_type != HVMSG_SCHEDULER_VP_SIGNAL_BITSET && + msg->header.message_type != HVMSG_SCHEDULER_VP_SIGNAL_PAIR) + return false; + + if (msg->header.message_type == HVMSG_SCHEDULER_VP_SIGNAL_BITSET) + handle_bitset_message((struct hv_vp_signal_bitset_scheduler_message *) + msg->u.payload); + else + handle_pair_message((struct hv_vp_signal_pair_scheduler_message *) + msg->u.payload); + + return true; +} + +static bool +mshv_intercept_isr(struct hv_message *msg) +{ + struct mshv_partition *partition; + bool handled = false; + struct mshv_vp *vp; + u64 partition_id; + u32 vp_index; + + partition_id = msg->header.sender; + + rcu_read_lock(); + + partition = mshv_partition_find(partition_id); + if (unlikely(!partition)) { + pr_debug("failed to find partition %llu\n", + partition_id); + goto unlock_out; + } + + if (msg->header.message_type == HVMSG_X64_APIC_EOI) { + /* + * Check if this gsi is registered in the + * ack_notifier list and invoke the callback + * if registered. + */ + + /* + * If there is a notifier, the ack callback is supposed + * to handle the VMEXIT. So we need not pass this message + * to vcpu thread. + */ + struct hv_x64_apic_eoi_message *eoi_msg = + (struct hv_x64_apic_eoi_message *)&msg->u.payload[0]; + + if (mshv_notify_acked_gsi(partition, eoi_msg->interrupt_vector)) { + handled = true; + goto unlock_out; + } + } + + /* + * We should get an opaque intercept message here for all intercept + * messages, since we're using the mapped VP intercept message page. + * + * The intercept message will have been placed in intercept message + * page at this point. + * + * Make sure the message type matches our expectation. + */ + if (msg->header.message_type != HVMSG_OPAQUE_INTERCEPT) { + pr_debug("wrong message type %d", msg->header.message_type); + goto unlock_out; + } + + /* + * Since we directly index the vp, and it has to exist for us to be here + * (because the vp is only deleted when the partition is), no additional + * locking is needed here + */ + vp_index = + ((struct hv_opaque_intercept_message *)msg->u.payload)->vp_index; + vp = partition->pt_vp_array[vp_index]; + if (unlikely(!vp)) { + pr_debug("failed to find VP %u\n", vp_index); + goto unlock_out; + } + + kick_vp(vp); + + handled = true; + +unlock_out: + rcu_read_unlock(); + + return handled; +} + +void mshv_isr(void) +{ + struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages); + struct hv_message_page **msg_page = &spages->hyp_synic_message_page; + struct hv_message *msg; + bool handled; + + if (unlikely(!(*msg_page))) { + pr_debug("Missing synic page!\n"); + return; + } + + msg = &((*msg_page)->sint_message[HV_SYNIC_INTERCEPTION_SINT_INDEX]); + + /* + * If the type isn't set, there isn't really a message; + * it may be some other hyperv interrupt + */ + if (msg->header.message_type == HVMSG_NONE) + return; + + handled = mshv_doorbell_isr(msg); + + if (!handled) + handled = mshv_scheduler_isr(msg); + + if (!handled) + handled = mshv_async_call_completion_isr(msg); + + if (!handled) + handled = mshv_intercept_isr(msg); + + if (handled) { + /* + * Acknowledge message with hypervisor if another message is + * pending. 
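+		 * Writing HVMSG_NONE frees the slot; the EOM write below (when
+		 * msg_pending is set) asks the hypervisor to deliver the next
+		 * queued message for this SINT.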
+ */ + msg->header.message_type = HVMSG_NONE; + /* + * Ensure the write is complete so the hypervisor will deliver + * the next message if available. + */ + mb(); + if (msg->header.message_flags.msg_pending) + hv_set_non_nested_msr(HV_MSR_EOM, 0); + +#ifdef HYPERVISOR_CALLBACK_VECTOR + add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR); +#endif + } else { + pr_warn_once("%s: unknown message type 0x%x\n", __func__, + msg->header.message_type); + } +} + +int mshv_synic_init(unsigned int cpu) +{ + union hv_synic_simp simp; + union hv_synic_siefp siefp; + union hv_synic_sirbp sirbp; +#ifdef HYPERVISOR_CALLBACK_VECTOR + union hv_synic_sint sint; +#endif + union hv_synic_scontrol sctrl; + struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages); + struct hv_message_page **msg_page = &spages->hyp_synic_message_page; + struct hv_synic_event_flags_page **event_flags_page = + &spages->synic_event_flags_page; + struct hv_synic_event_ring_page **event_ring_page = + &spages->synic_event_ring_page; + + /* Setup the Synic's message page */ + simp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIMP); + simp.simp_enabled = true; + *msg_page = memremap(simp.base_simp_gpa << HV_HYP_PAGE_SHIFT, + HV_HYP_PAGE_SIZE, + MEMREMAP_WB); + + if (!(*msg_page)) + return -EFAULT; + + hv_set_non_nested_msr(HV_MSR_SIMP, simp.as_uint64); + + /* Setup the Synic's event flags page */ + siefp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIEFP); + siefp.siefp_enabled = true; + *event_flags_page = memremap(siefp.base_siefp_gpa << PAGE_SHIFT, + PAGE_SIZE, MEMREMAP_WB); + + if (!(*event_flags_page)) + goto cleanup; + + hv_set_non_nested_msr(HV_MSR_SIEFP, siefp.as_uint64); + + /* Setup the Synic's event ring page */ + sirbp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIRBP); + sirbp.sirbp_enabled = true; + *event_ring_page = memremap(sirbp.base_sirbp_gpa << PAGE_SHIFT, + PAGE_SIZE, MEMREMAP_WB); + + if (!(*event_ring_page)) + goto cleanup; + + hv_set_non_nested_msr(HV_MSR_SIRBP, sirbp.as_uint64); + +#ifdef HYPERVISOR_CALLBACK_VECTOR + /* Enable intercepts */ + sint.as_uint64 = 0; + sint.vector = HYPERVISOR_CALLBACK_VECTOR; + sint.masked = false; + sint.auto_eoi = hv_recommend_using_aeoi(); + hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX, + sint.as_uint64); + + /* Doorbell SINT */ + sint.as_uint64 = 0; + sint.vector = HYPERVISOR_CALLBACK_VECTOR; + sint.masked = false; + sint.as_intercept = 1; + sint.auto_eoi = hv_recommend_using_aeoi(); + hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX, + sint.as_uint64); +#endif + + /* Enable global synic bit */ + sctrl.as_uint64 = hv_get_non_nested_msr(HV_MSR_SCONTROL); + sctrl.enable = 1; + hv_set_non_nested_msr(HV_MSR_SCONTROL, sctrl.as_uint64); + + return 0; + +cleanup: + if (*event_ring_page) { + sirbp.sirbp_enabled = false; + hv_set_non_nested_msr(HV_MSR_SIRBP, sirbp.as_uint64); + memunmap(*event_ring_page); + } + if (*event_flags_page) { + siefp.siefp_enabled = false; + hv_set_non_nested_msr(HV_MSR_SIEFP, siefp.as_uint64); + memunmap(*event_flags_page); + } + if (*msg_page) { + simp.simp_enabled = false; + hv_set_non_nested_msr(HV_MSR_SIMP, simp.as_uint64); + memunmap(*msg_page); + } + + return -EFAULT; +} + +int mshv_synic_cleanup(unsigned int cpu) +{ + union hv_synic_sint sint; + union hv_synic_simp simp; + union hv_synic_siefp siefp; + union hv_synic_sirbp sirbp; + union hv_synic_scontrol sctrl; + struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages); + struct hv_message_page **msg_page = &spages->hyp_synic_message_page; + struct 
hv_synic_event_flags_page **event_flags_page = + &spages->synic_event_flags_page; + struct hv_synic_event_ring_page **event_ring_page = + &spages->synic_event_ring_page; + + /* Disable the interrupt */ + sint.as_uint64 = hv_get_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX); + sint.masked = true; + hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX, + sint.as_uint64); + + /* Disable Doorbell SINT */ + sint.as_uint64 = hv_get_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX); + sint.masked = true; + hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX, + sint.as_uint64); + + /* Disable Synic's event ring page */ + sirbp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIRBP); + sirbp.sirbp_enabled = false; + hv_set_non_nested_msr(HV_MSR_SIRBP, sirbp.as_uint64); + memunmap(*event_ring_page); + + /* Disable Synic's event flags page */ + siefp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIEFP); + siefp.siefp_enabled = false; + hv_set_non_nested_msr(HV_MSR_SIEFP, siefp.as_uint64); + memunmap(*event_flags_page); + + /* Disable Synic's message page */ + simp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIMP); + simp.simp_enabled = false; + hv_set_non_nested_msr(HV_MSR_SIMP, simp.as_uint64); + memunmap(*msg_page); + + /* Disable global synic bit */ + sctrl.as_uint64 = hv_get_non_nested_msr(HV_MSR_SCONTROL); + sctrl.enable = 0; + hv_set_non_nested_msr(HV_MSR_SCONTROL, sctrl.as_uint64); + + return 0; +} + +int +mshv_register_doorbell(u64 partition_id, doorbell_cb_t doorbell_cb, void *data, + u64 gpa, u64 val, u64 flags) +{ + struct hv_connection_info connection_info = { 0 }; + union hv_connection_id connection_id = { 0 }; + struct port_table_info *port_table_info; + struct hv_port_info port_info = { 0 }; + union hv_port_id port_id = { 0 }; + int ret; + + port_table_info = kmalloc(sizeof(*port_table_info), GFP_KERNEL); + if (!port_table_info) + return -ENOMEM; + + port_table_info->hv_port_type = HV_PORT_TYPE_DOORBELL; + port_table_info->hv_port_doorbell.doorbell_cb = doorbell_cb; + port_table_info->hv_port_doorbell.data = data; + ret = mshv_portid_alloc(port_table_info); + if (ret < 0) { + kfree(port_table_info); + return ret; + } + + port_id.u.id = ret; + port_info.port_type = HV_PORT_TYPE_DOORBELL; + port_info.doorbell_port_info.target_sint = HV_SYNIC_DOORBELL_SINT_INDEX; + port_info.doorbell_port_info.target_vp = HV_ANY_VP; + ret = hv_call_create_port(hv_current_partition_id, port_id, partition_id, + &port_info, + 0, 0, NUMA_NO_NODE); + + if (ret < 0) { + mshv_portid_free(port_id.u.id); + return ret; + } + + connection_id.u.id = port_id.u.id; + connection_info.port_type = HV_PORT_TYPE_DOORBELL; + connection_info.doorbell_connection_info.gpa = gpa; + connection_info.doorbell_connection_info.trigger_value = val; + connection_info.doorbell_connection_info.flags = flags; + + ret = hv_call_connect_port(hv_current_partition_id, port_id, partition_id, + connection_id, &connection_info, 0, NUMA_NO_NODE); + if (ret < 0) { + hv_call_delete_port(hv_current_partition_id, port_id); + mshv_portid_free(port_id.u.id); + return ret; + } + + // lets use the port_id as the doorbell_id + return port_id.u.id; +} + +void +mshv_unregister_doorbell(u64 partition_id, int doorbell_portid) +{ + union hv_port_id port_id = { 0 }; + union hv_connection_id connection_id = { 0 }; + + connection_id.u.id = doorbell_portid; + hv_call_disconnect_port(partition_id, connection_id); + + port_id.u.id = doorbell_portid; + hv_call_delete_port(hv_current_partition_id, port_id); + + 
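+	/* Finally release the port ID allocated by mshv_register_doorbell(). */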
mshv_portid_free(doorbell_portid); +} diff --git a/drivers/hv/mshv_vtl.h b/drivers/hv/mshv_vtl.h new file mode 100644 index 000000000000..a6eea52f7aa2 --- /dev/null +++ b/drivers/hv/mshv_vtl.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _MSHV_VTL_H +#define _MSHV_VTL_H + +#include <linux/mshv.h> +#include <linux/types.h> + +struct mshv_vtl_run { + u32 cancel; + u32 vtl_ret_action_size; + u32 pad[2]; + char exit_message[MSHV_MAX_RUN_MSG_SIZE]; + union { + struct mshv_vtl_cpu_context cpu_context; + + /* + * Reserving room for the cpu context to grow and to maintain compatibility + * with user mode. + */ + char reserved[1024]; + }; + char vtl_ret_actions[MSHV_MAX_RUN_MSG_SIZE]; +}; + +#endif /* _MSHV_VTL_H */ diff --git a/drivers/hv/mshv_vtl_main.c b/drivers/hv/mshv_vtl_main.c new file mode 100644 index 000000000000..2cebe9de5a5a --- /dev/null +++ b/drivers/hv/mshv_vtl_main.c @@ -0,0 +1,1392 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2023, Microsoft Corporation. + * + * Author: + * Roman Kisel <romank@linux.microsoft.com> + * Saurabh Sengar <ssengar@linux.microsoft.com> + * Naman Jain <namjain@linux.microsoft.com> + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/miscdevice.h> +#include <linux/anon_inodes.h> +#include <linux/cpuhotplug.h> +#include <linux/count_zeros.h> +#include <linux/entry-virt.h> +#include <linux/eventfd.h> +#include <linux/poll.h> +#include <linux/file.h> +#include <linux/vmalloc.h> +#include <asm/debugreg.h> +#include <asm/mshyperv.h> +#include <trace/events/ipi.h> +#include <uapi/asm/mtrr.h> +#include <uapi/linux/mshv.h> +#include <hyperv/hvhdk.h> + +#include "../../kernel/fpu/legacy.h" +#include "mshv.h" +#include "mshv_vtl.h" +#include "hyperv_vmbus.h" + +MODULE_AUTHOR("Microsoft"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Microsoft Hyper-V VTL Driver"); + +#define MSHV_ENTRY_REASON_LOWER_VTL_CALL 0x1 +#define MSHV_ENTRY_REASON_INTERRUPT 0x2 +#define MSHV_ENTRY_REASON_INTERCEPT 0x3 + +#define MSHV_REAL_OFF_SHIFT 16 +#define MSHV_PG_OFF_CPU_MASK (BIT_ULL(MSHV_REAL_OFF_SHIFT) - 1) +#define MSHV_RUN_PAGE_OFFSET 0 +#define MSHV_REG_PAGE_OFFSET 1 +#define VTL2_VMBUS_SINT_INDEX 7 + +static struct device *mem_dev; + +static struct tasklet_struct msg_dpc; +static wait_queue_head_t fd_wait_queue; +static bool has_message; +static struct eventfd_ctx *flag_eventfds[HV_EVENT_FLAGS_COUNT]; +static DEFINE_MUTEX(flag_lock); +static bool __read_mostly mshv_has_reg_page; + +/* hvcall code is of type u16, allocate a bitmap of size (1 << 16) to accommodate it */ +#define MAX_BITMAP_SIZE ((U16_MAX + 1) / 8) + +struct mshv_vtl_hvcall_fd { + u8 allow_bitmap[MAX_BITMAP_SIZE]; + bool allow_map_initialized; + /* + * Used to protect hvcall setup in IOCTLs + */ + struct mutex init_mutex; + struct miscdevice *dev; +}; + +struct mshv_vtl_poll_file { + struct file *file; + wait_queue_entry_t wait; + wait_queue_head_t *wqh; + poll_table pt; + int cpu; +}; + +struct mshv_vtl { + struct device *module_dev; + u64 id; +}; + +struct mshv_vtl_per_cpu { + struct mshv_vtl_run *run; + struct page *reg_page; +}; + +/* SYNIC_OVERLAY_PAGE_MSR - internal, identical to hv_synic_simp */ +union hv_synic_overlay_page_msr { + u64 as_uint64; + struct { + u64 enabled: 1; + u64 reserved: 11; + u64 pfn: 52; + } __packed; +}; + +static struct mutex mshv_vtl_poll_file_lock; +static union hv_register_vsm_page_offsets mshv_vsm_page_offsets; +static union hv_register_vsm_capabilities mshv_vsm_capabilities; + +static 
DEFINE_PER_CPU(struct mshv_vtl_poll_file, mshv_vtl_poll_file); +static DEFINE_PER_CPU(unsigned long long, num_vtl0_transitions); +static DEFINE_PER_CPU(struct mshv_vtl_per_cpu, mshv_vtl_per_cpu); + +static const union hv_input_vtl input_vtl_zero; +static const union hv_input_vtl input_vtl_normal = { + .use_target_vtl = 1, +}; + +static const struct file_operations mshv_vtl_fops; + +static long +mshv_ioctl_create_vtl(void __user *user_arg, struct device *module_dev) +{ + struct mshv_vtl *vtl; + struct file *file; + int fd; + + vtl = kzalloc(sizeof(*vtl), GFP_KERNEL); + if (!vtl) + return -ENOMEM; + + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) { + kfree(vtl); + return fd; + } + file = anon_inode_getfile("mshv_vtl", &mshv_vtl_fops, + vtl, O_RDWR); + if (IS_ERR(file)) { + kfree(vtl); + return PTR_ERR(file); + } + vtl->module_dev = module_dev; + fd_install(fd, file); + + return fd; +} + +static long +mshv_ioctl_check_extension(void __user *user_arg) +{ + u32 arg; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; + + switch (arg) { + case MSHV_CAP_CORE_API_STABLE: + return 0; + case MSHV_CAP_REGISTER_PAGE: + return mshv_has_reg_page; + case MSHV_CAP_VTL_RETURN_ACTION: + return mshv_vsm_capabilities.return_action_available; + case MSHV_CAP_DR6_SHARED: + return mshv_vsm_capabilities.dr6_shared; + } + + return -EOPNOTSUPP; +} + +static long +mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) +{ + struct miscdevice *misc = filp->private_data; + + switch (ioctl) { + case MSHV_CHECK_EXTENSION: + return mshv_ioctl_check_extension((void __user *)arg); + case MSHV_CREATE_VTL: + return mshv_ioctl_create_vtl((void __user *)arg, misc->this_device); + } + + return -ENOTTY; +} + +static const struct file_operations mshv_dev_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = mshv_dev_ioctl, + .llseek = noop_llseek, +}; + +static struct miscdevice mshv_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "mshv", + .fops = &mshv_dev_fops, + .mode = 0600, +}; + +static struct mshv_vtl_run *mshv_vtl_this_run(void) +{ + return *this_cpu_ptr(&mshv_vtl_per_cpu.run); +} + +static struct mshv_vtl_run *mshv_vtl_cpu_run(int cpu) +{ + return *per_cpu_ptr(&mshv_vtl_per_cpu.run, cpu); +} + +static struct page *mshv_vtl_cpu_reg_page(int cpu) +{ + return *per_cpu_ptr(&mshv_vtl_per_cpu.reg_page, cpu); +} + +static void mshv_vtl_configure_reg_page(struct mshv_vtl_per_cpu *per_cpu) +{ + struct hv_register_assoc reg_assoc = {}; + union hv_synic_overlay_page_msr overlay = {}; + struct page *reg_page; + + reg_page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL); + if (!reg_page) { + WARN(1, "failed to allocate register page\n"); + return; + } + + overlay.enabled = 1; + overlay.pfn = page_to_hvpfn(reg_page); + reg_assoc.name = HV_X64_REGISTER_REG_PAGE; + reg_assoc.value.reg64 = overlay.as_uint64; + + if (hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF, + 1, input_vtl_zero, ®_assoc)) { + WARN(1, "failed to setup register page\n"); + __free_page(reg_page); + return; + } + + per_cpu->reg_page = reg_page; + mshv_has_reg_page = true; +} + +static void mshv_vtl_synic_enable_regs(unsigned int cpu) +{ + union hv_synic_sint sint; + + sint.as_uint64 = 0; + sint.vector = HYPERVISOR_CALLBACK_VECTOR; + sint.masked = false; + sint.auto_eoi = hv_recommend_using_aeoi(); + + /* Enable intercepts */ + if (!mshv_vsm_capabilities.intercept_page_available) + hv_set_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX, + sint.as_uint64); + + /* VTL2 Host VSP SINT is (un)masked 
when the user mode requests that */ +} + +static int mshv_vtl_get_vsm_regs(void) +{ + struct hv_register_assoc registers[2]; + int ret, count = 2; + + registers[0].name = HV_REGISTER_VSM_CODE_PAGE_OFFSETS; + registers[1].name = HV_REGISTER_VSM_CAPABILITIES; + + ret = hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF, + count, input_vtl_zero, registers); + if (ret) + return ret; + + mshv_vsm_page_offsets.as_uint64 = registers[0].value.reg64; + mshv_vsm_capabilities.as_uint64 = registers[1].value.reg64; + + return ret; +} + +static int mshv_vtl_configure_vsm_partition(struct device *dev) +{ + union hv_register_vsm_partition_config config; + struct hv_register_assoc reg_assoc; + + config.as_uint64 = 0; + config.default_vtl_protection_mask = HV_MAP_GPA_PERMISSIONS_MASK; + config.enable_vtl_protection = 1; + config.zero_memory_on_reset = 1; + config.intercept_vp_startup = 1; + config.intercept_cpuid_unimplemented = 1; + + if (mshv_vsm_capabilities.intercept_page_available) { + dev_dbg(dev, "using intercept page\n"); + config.intercept_page = 1; + } + + reg_assoc.name = HV_REGISTER_VSM_PARTITION_CONFIG; + reg_assoc.value.reg64 = config.as_uint64; + + return hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF, + 1, input_vtl_zero, ®_assoc); +} + +static void mshv_vtl_vmbus_isr(void) +{ + struct hv_per_cpu_context *per_cpu; + struct hv_message *msg; + u32 message_type; + union hv_synic_event_flags *event_flags; + struct eventfd_ctx *eventfd; + u16 i; + + per_cpu = this_cpu_ptr(hv_context.cpu_context); + if (smp_processor_id() == 0) { + msg = (struct hv_message *)per_cpu->hyp_synic_message_page + VTL2_VMBUS_SINT_INDEX; + message_type = READ_ONCE(msg->header.message_type); + if (message_type != HVMSG_NONE) + tasklet_schedule(&msg_dpc); + } + + event_flags = (union hv_synic_event_flags *)per_cpu->hyp_synic_event_page + + VTL2_VMBUS_SINT_INDEX; + for_each_set_bit(i, event_flags->flags, HV_EVENT_FLAGS_COUNT) { + if (!sync_test_and_clear_bit(i, event_flags->flags)) + continue; + rcu_read_lock(); + eventfd = READ_ONCE(flag_eventfds[i]); + if (eventfd) + eventfd_signal(eventfd); + rcu_read_unlock(); + } + + vmbus_isr(); +} + +static int mshv_vtl_alloc_context(unsigned int cpu) +{ + struct mshv_vtl_per_cpu *per_cpu = this_cpu_ptr(&mshv_vtl_per_cpu); + + per_cpu->run = (struct mshv_vtl_run *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + if (!per_cpu->run) + return -ENOMEM; + + if (mshv_vsm_capabilities.intercept_page_available) + mshv_vtl_configure_reg_page(per_cpu); + + mshv_vtl_synic_enable_regs(cpu); + + return 0; +} + +static int mshv_vtl_cpuhp_online; + +static int hv_vtl_setup_synic(void) +{ + int ret; + + /* Use our isr to first filter out packets destined for userspace */ + hv_setup_vmbus_handler(mshv_vtl_vmbus_isr); + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "hyperv/vtl:online", + mshv_vtl_alloc_context, NULL); + if (ret < 0) { + hv_setup_vmbus_handler(vmbus_isr); + return ret; + } + + mshv_vtl_cpuhp_online = ret; + + return 0; +} + +static void hv_vtl_remove_synic(void) +{ + cpuhp_remove_state(mshv_vtl_cpuhp_online); + hv_setup_vmbus_handler(vmbus_isr); +} + +static int vtl_get_vp_register(struct hv_register_assoc *reg) +{ + return hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF, + 1, input_vtl_normal, reg); +} + +static int vtl_set_vp_register(struct hv_register_assoc *reg) +{ + return hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF, + 1, input_vtl_normal, reg); +} + +static int mshv_vtl_ioctl_add_vtl0_mem(struct mshv_vtl *vtl, void 
__user *arg) +{ + struct mshv_vtl_ram_disposition vtl0_mem; + struct dev_pagemap *pgmap; + void *addr; + + if (copy_from_user(&vtl0_mem, arg, sizeof(vtl0_mem))) + return -EFAULT; + /* vtl0_mem.last_pfn is excluded in the pagemap range for VTL0 as per design */ + if (vtl0_mem.last_pfn <= vtl0_mem.start_pfn) { + dev_err(vtl->module_dev, "range start pfn (%llx) > end pfn (%llx)\n", + vtl0_mem.start_pfn, vtl0_mem.last_pfn); + return -EFAULT; + } + + pgmap = kzalloc(sizeof(*pgmap), GFP_KERNEL); + if (!pgmap) + return -ENOMEM; + + pgmap->ranges[0].start = PFN_PHYS(vtl0_mem.start_pfn); + pgmap->ranges[0].end = PFN_PHYS(vtl0_mem.last_pfn) - 1; + pgmap->nr_range = 1; + pgmap->type = MEMORY_DEVICE_GENERIC; + + /* + * Determine the highest page order that can be used for the given memory range. + * This works best when the range is aligned; i.e. both the start and the length. + */ + pgmap->vmemmap_shift = count_trailing_zeros(vtl0_mem.start_pfn | vtl0_mem.last_pfn); + dev_dbg(vtl->module_dev, + "Add VTL0 memory: start: 0x%llx, end_pfn: 0x%llx, page order: %lu\n", + vtl0_mem.start_pfn, vtl0_mem.last_pfn, pgmap->vmemmap_shift); + + addr = devm_memremap_pages(mem_dev, pgmap); + if (IS_ERR(addr)) { + dev_err(vtl->module_dev, "devm_memremap_pages error: %ld\n", PTR_ERR(addr)); + kfree(pgmap); + return -EFAULT; + } + + /* Don't free pgmap, since it has to stick around until the memory + * is unmapped, which will never happen as there is no scenario + * where VTL0 can be released/shutdown without bringing down VTL2. + */ + return 0; +} + +static void mshv_vtl_cancel(int cpu) +{ + int here = get_cpu(); + + if (here != cpu) { + if (!xchg_relaxed(&mshv_vtl_cpu_run(cpu)->cancel, 1)) + smp_send_reschedule(cpu); + } else { + WRITE_ONCE(mshv_vtl_this_run()->cancel, 1); + } + put_cpu(); +} + +static int mshv_vtl_poll_file_wake(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) +{ + struct mshv_vtl_poll_file *poll_file = container_of(wait, struct mshv_vtl_poll_file, wait); + + mshv_vtl_cancel(poll_file->cpu); + + return 0; +} + +static void mshv_vtl_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, poll_table *pt) +{ + struct mshv_vtl_poll_file *poll_file = container_of(pt, struct mshv_vtl_poll_file, pt); + + WARN_ON(poll_file->wqh); + poll_file->wqh = wqh; + add_wait_queue(wqh, &poll_file->wait); +} + +static int mshv_vtl_ioctl_set_poll_file(struct mshv_vtl_set_poll_file __user *user_input) +{ + struct file *file, *old_file; + struct mshv_vtl_poll_file *poll_file; + struct mshv_vtl_set_poll_file input; + + if (copy_from_user(&input, user_input, sizeof(input))) + return -EFAULT; + + if (input.cpu >= num_possible_cpus() || !cpu_online(input.cpu)) + return -EINVAL; + /* + * CPU Hotplug is not supported in VTL2 in OpenHCL, where this kernel driver exists. + * CPU is expected to remain online after above cpu_online() check. 
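+	 * Once registered, the file is polled so that mshv_vtl_poll_file_wake()
+	 * can call mshv_vtl_cancel() and kick that CPU out of the lower VTL
+	 * whenever the file becomes ready.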
+ */ + + file = NULL; + file = fget(input.fd); + if (!file) + return -EBADFD; + + poll_file = per_cpu_ptr(&mshv_vtl_poll_file, READ_ONCE(input.cpu)); + if (!poll_file) + return -EINVAL; + + mutex_lock(&mshv_vtl_poll_file_lock); + + if (poll_file->wqh) + remove_wait_queue(poll_file->wqh, &poll_file->wait); + poll_file->wqh = NULL; + + old_file = poll_file->file; + poll_file->file = file; + poll_file->cpu = input.cpu; + + if (file) { + init_waitqueue_func_entry(&poll_file->wait, mshv_vtl_poll_file_wake); + init_poll_funcptr(&poll_file->pt, mshv_vtl_ptable_queue_proc); + vfs_poll(file, &poll_file->pt); + } + + mutex_unlock(&mshv_vtl_poll_file_lock); + + if (old_file) + fput(old_file); + + return 0; +} + +/* Static table mapping register names to their corresponding actions */ +static const struct { + enum hv_register_name reg_name; + int debug_reg_num; /* -1 if not a debug register */ + u32 msr_addr; /* 0 if not an MSR */ +} reg_table[] = { + /* Debug registers */ + {HV_X64_REGISTER_DR0, 0, 0}, + {HV_X64_REGISTER_DR1, 1, 0}, + {HV_X64_REGISTER_DR2, 2, 0}, + {HV_X64_REGISTER_DR3, 3, 0}, + {HV_X64_REGISTER_DR6, 6, 0}, + /* MTRR MSRs */ + {HV_X64_REGISTER_MSR_MTRR_CAP, -1, MSR_MTRRcap}, + {HV_X64_REGISTER_MSR_MTRR_DEF_TYPE, -1, MSR_MTRRdefType}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE0, -1, MTRRphysBase_MSR(0)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE1, -1, MTRRphysBase_MSR(1)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE2, -1, MTRRphysBase_MSR(2)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE3, -1, MTRRphysBase_MSR(3)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE4, -1, MTRRphysBase_MSR(4)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE5, -1, MTRRphysBase_MSR(5)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE6, -1, MTRRphysBase_MSR(6)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE7, -1, MTRRphysBase_MSR(7)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE8, -1, MTRRphysBase_MSR(8)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE9, -1, MTRRphysBase_MSR(9)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEA, -1, MTRRphysBase_MSR(0xa)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEB, -1, MTRRphysBase_MSR(0xb)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEC, -1, MTRRphysBase_MSR(0xc)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASED, -1, MTRRphysBase_MSR(0xd)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEE, -1, MTRRphysBase_MSR(0xe)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEF, -1, MTRRphysBase_MSR(0xf)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK0, -1, MTRRphysMask_MSR(0)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK1, -1, MTRRphysMask_MSR(1)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK2, -1, MTRRphysMask_MSR(2)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK3, -1, MTRRphysMask_MSR(3)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK4, -1, MTRRphysMask_MSR(4)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK5, -1, MTRRphysMask_MSR(5)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK6, -1, MTRRphysMask_MSR(6)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK7, -1, MTRRphysMask_MSR(7)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK8, -1, MTRRphysMask_MSR(8)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK9, -1, MTRRphysMask_MSR(9)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKA, -1, MTRRphysMask_MSR(0xa)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKB, -1, MTRRphysMask_MSR(0xb)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKC, -1, MTRRphysMask_MSR(0xc)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKD, -1, MTRRphysMask_MSR(0xd)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKE, -1, MTRRphysMask_MSR(0xe)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKF, -1, MTRRphysMask_MSR(0xf)}, + {HV_X64_REGISTER_MSR_MTRR_FIX64K00000, -1, MSR_MTRRfix64K_00000}, + {HV_X64_REGISTER_MSR_MTRR_FIX16K80000, -1, MSR_MTRRfix16K_80000}, + 
{HV_X64_REGISTER_MSR_MTRR_FIX16KA0000, -1, MSR_MTRRfix16K_A0000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KC0000, -1, MSR_MTRRfix4K_C0000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KC8000, -1, MSR_MTRRfix4K_C8000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KD0000, -1, MSR_MTRRfix4K_D0000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KD8000, -1, MSR_MTRRfix4K_D8000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KE0000, -1, MSR_MTRRfix4K_E0000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KE8000, -1, MSR_MTRRfix4K_E8000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KF0000, -1, MSR_MTRRfix4K_F0000}, + {HV_X64_REGISTER_MSR_MTRR_FIX4KF8000, -1, MSR_MTRRfix4K_F8000}, +}; + +static int mshv_vtl_get_set_reg(struct hv_register_assoc *regs, bool set) +{ + u64 *reg64; + enum hv_register_name gpr_name; + int i; + + gpr_name = regs->name; + reg64 = ®s->value.reg64; + + /* Search for the register in the table */ + for (i = 0; i < ARRAY_SIZE(reg_table); i++) { + if (reg_table[i].reg_name != gpr_name) + continue; + if (reg_table[i].debug_reg_num != -1) { + /* Handle debug registers */ + if (gpr_name == HV_X64_REGISTER_DR6 && + !mshv_vsm_capabilities.dr6_shared) + goto hypercall; + if (set) + native_set_debugreg(reg_table[i].debug_reg_num, *reg64); + else + *reg64 = native_get_debugreg(reg_table[i].debug_reg_num); + } else { + /* Handle MSRs */ + if (set) + wrmsrl(reg_table[i].msr_addr, *reg64); + else + rdmsrl(reg_table[i].msr_addr, *reg64); + } + return 0; + } + +hypercall: + return 1; +} + +static void mshv_vtl_return(struct mshv_vtl_cpu_context *vtl0) +{ + struct hv_vp_assist_page *hvp; + + hvp = hv_vp_assist_page[smp_processor_id()]; + + /* + * Process signal event direct set in the run page, if any. + */ + if (mshv_vsm_capabilities.return_action_available) { + u32 offset = READ_ONCE(mshv_vtl_this_run()->vtl_ret_action_size); + + WRITE_ONCE(mshv_vtl_this_run()->vtl_ret_action_size, 0); + + /* + * Hypervisor will take care of clearing out the actions + * set in the assist page. 
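+		 * The pending actions are copied from the per-CPU run page into
+		 * the VP assist page, truncated to the space available there.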
+ */ + memcpy(hvp->vtl_ret_actions, + mshv_vtl_this_run()->vtl_ret_actions, + min_t(u32, offset, sizeof(hvp->vtl_ret_actions))); + } + + mshv_vtl_return_call(vtl0); +} + +static bool mshv_vtl_process_intercept(void) +{ + struct hv_per_cpu_context *mshv_cpu; + void *synic_message_page; + struct hv_message *msg; + u32 message_type; + + mshv_cpu = this_cpu_ptr(hv_context.cpu_context); + synic_message_page = mshv_cpu->hyp_synic_message_page; + if (unlikely(!synic_message_page)) + return true; + + msg = (struct hv_message *)synic_message_page + HV_SYNIC_INTERCEPTION_SINT_INDEX; + message_type = READ_ONCE(msg->header.message_type); + if (message_type == HVMSG_NONE) + return true; + + memcpy(mshv_vtl_this_run()->exit_message, msg, sizeof(*msg)); + vmbus_signal_eom(msg, message_type); + + return false; +} + +static int mshv_vtl_ioctl_return_to_lower_vtl(void) +{ + preempt_disable(); + for (;;) { + unsigned long irq_flags; + struct hv_vp_assist_page *hvp; + int ret; + + if (__xfer_to_guest_mode_work_pending()) { + preempt_enable(); + ret = xfer_to_guest_mode_handle_work(); + if (ret) + return ret; + preempt_disable(); + } + + local_irq_save(irq_flags); + if (READ_ONCE(mshv_vtl_this_run()->cancel)) { + local_irq_restore(irq_flags); + preempt_enable(); + return -EINTR; + } + + mshv_vtl_return(&mshv_vtl_this_run()->cpu_context); + local_irq_restore(irq_flags); + + hvp = hv_vp_assist_page[smp_processor_id()]; + this_cpu_inc(num_vtl0_transitions); + switch (hvp->vtl_entry_reason) { + case MSHV_ENTRY_REASON_INTERRUPT: + if (!mshv_vsm_capabilities.intercept_page_available && + likely(!mshv_vtl_process_intercept())) + goto done; + break; + + case MSHV_ENTRY_REASON_INTERCEPT: + WARN_ON(!mshv_vsm_capabilities.intercept_page_available); + memcpy(mshv_vtl_this_run()->exit_message, hvp->intercept_message, + sizeof(hvp->intercept_message)); + goto done; + + default: + panic("unknown entry reason: %d", hvp->vtl_entry_reason); + } + } + +done: + preempt_enable(); + + return 0; +} + +static long +mshv_vtl_ioctl_get_regs(void __user *user_args) +{ + struct mshv_vp_registers args; + struct hv_register_assoc reg; + long ret; + + if (copy_from_user(&args, user_args, sizeof(args))) + return -EFAULT; + + /* This IOCTL supports processing only one register at a time. */ + if (args.count != 1) + return -EINVAL; + + if (copy_from_user(®, (void __user *)args.regs_ptr, + sizeof(reg))) + return -EFAULT; + + ret = mshv_vtl_get_set_reg(®, false); + if (!ret) + goto copy_args; /* No need of hypercall */ + ret = vtl_get_vp_register(®); + if (ret) + return ret; + +copy_args: + if (copy_to_user((void __user *)args.regs_ptr, ®, sizeof(reg))) + ret = -EFAULT; + + return ret; +} + +static long +mshv_vtl_ioctl_set_regs(void __user *user_args) +{ + struct mshv_vp_registers args; + struct hv_register_assoc reg; + long ret; + + if (copy_from_user(&args, user_args, sizeof(args))) + return -EFAULT; + + /* This IOCTL supports processing only one register at a time. 
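+	 * Callers that need several registers are expected to issue the ioctl
+	 * once per register.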
*/ + if (args.count != 1) + return -EINVAL; + + if (copy_from_user(®, (void __user *)args.regs_ptr, sizeof(reg))) + return -EFAULT; + + ret = mshv_vtl_get_set_reg(®, true); + if (!ret) + return ret; /* No need of hypercall */ + ret = vtl_set_vp_register(®); + + return ret; +} + +static long +mshv_vtl_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) +{ + long ret; + struct mshv_vtl *vtl = filp->private_data; + + switch (ioctl) { + case MSHV_SET_POLL_FILE: + ret = mshv_vtl_ioctl_set_poll_file((struct mshv_vtl_set_poll_file __user *)arg); + break; + case MSHV_GET_VP_REGISTERS: + ret = mshv_vtl_ioctl_get_regs((void __user *)arg); + break; + case MSHV_SET_VP_REGISTERS: + ret = mshv_vtl_ioctl_set_regs((void __user *)arg); + break; + case MSHV_RETURN_TO_LOWER_VTL: + ret = mshv_vtl_ioctl_return_to_lower_vtl(); + break; + case MSHV_ADD_VTL0_MEMORY: + ret = mshv_vtl_ioctl_add_vtl0_mem(vtl, (void __user *)arg); + break; + default: + dev_err(vtl->module_dev, "invalid vtl ioctl: %#x\n", ioctl); + ret = -ENOTTY; + } + + return ret; +} + +static vm_fault_t mshv_vtl_fault(struct vm_fault *vmf) +{ + struct page *page; + int cpu = vmf->pgoff & MSHV_PG_OFF_CPU_MASK; + int real_off = vmf->pgoff >> MSHV_REAL_OFF_SHIFT; + + if (!cpu_online(cpu)) + return VM_FAULT_SIGBUS; + /* + * CPU Hotplug is not supported in VTL2 in OpenHCL, where this kernel driver exists. + * CPU is expected to remain online after above cpu_online() check. + */ + + if (real_off == MSHV_RUN_PAGE_OFFSET) { + page = virt_to_page(mshv_vtl_cpu_run(cpu)); + } else if (real_off == MSHV_REG_PAGE_OFFSET) { + if (!mshv_has_reg_page) + return VM_FAULT_SIGBUS; + page = mshv_vtl_cpu_reg_page(cpu); + } else { + return VM_FAULT_NOPAGE; + } + + get_page(page); + vmf->page = page; + + return 0; +} + +static const struct vm_operations_struct mshv_vtl_vm_ops = { + .fault = mshv_vtl_fault, +}; + +static int mshv_vtl_mmap(struct file *filp, struct vm_area_struct *vma) +{ + vma->vm_ops = &mshv_vtl_vm_ops; + + return 0; +} + +static int mshv_vtl_release(struct inode *inode, struct file *filp) +{ + struct mshv_vtl *vtl = filp->private_data; + + kfree(vtl); + + return 0; +} + +static const struct file_operations mshv_vtl_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = mshv_vtl_ioctl, + .release = mshv_vtl_release, + .mmap = mshv_vtl_mmap, +}; + +static void mshv_vtl_synic_mask_vmbus_sint(const u8 *mask) +{ + union hv_synic_sint sint; + + sint.as_uint64 = 0; + sint.vector = HYPERVISOR_CALLBACK_VECTOR; + sint.masked = (*mask != 0); + sint.auto_eoi = hv_recommend_using_aeoi(); + + hv_set_msr(HV_MSR_SINT0 + VTL2_VMBUS_SINT_INDEX, + sint.as_uint64); + + if (!sint.masked) + pr_debug("%s: Unmasking VTL2 VMBUS SINT on VP %d\n", __func__, smp_processor_id()); + else + pr_debug("%s: Masking VTL2 VMBUS SINT on VP %d\n", __func__, smp_processor_id()); +} + +static void mshv_vtl_read_remote(void *buffer) +{ + struct hv_per_cpu_context *mshv_cpu = this_cpu_ptr(hv_context.cpu_context); + struct hv_message *msg = (struct hv_message *)mshv_cpu->hyp_synic_message_page + + VTL2_VMBUS_SINT_INDEX; + u32 message_type = READ_ONCE(msg->header.message_type); + + WRITE_ONCE(has_message, false); + if (message_type == HVMSG_NONE) + return; + + memcpy(buffer, msg, sizeof(*msg)); + vmbus_signal_eom(msg, message_type); +} + +static bool vtl_synic_mask_vmbus_sint_masked = true; + +static ssize_t mshv_vtl_sint_read(struct file *filp, char __user *arg, size_t size, loff_t *offset) +{ + struct hv_message msg = {}; + int ret; + + if (size < sizeof(msg)) + return -EINVAL; + + for 
(;;) { + smp_call_function_single(VMBUS_CONNECT_CPU, mshv_vtl_read_remote, &msg, true); + if (msg.header.message_type != HVMSG_NONE) + break; + + if (READ_ONCE(vtl_synic_mask_vmbus_sint_masked)) + return 0; /* EOF */ + + if (filp->f_flags & O_NONBLOCK) + return -EAGAIN; + + ret = wait_event_interruptible(fd_wait_queue, + READ_ONCE(has_message) || + READ_ONCE(vtl_synic_mask_vmbus_sint_masked)); + if (ret) + return ret; + } + + if (copy_to_user(arg, &msg, sizeof(msg))) + return -EFAULT; + + return sizeof(msg); +} + +static __poll_t mshv_vtl_sint_poll(struct file *filp, poll_table *wait) +{ + __poll_t mask = 0; + + poll_wait(filp, &fd_wait_queue, wait); + if (READ_ONCE(has_message) || READ_ONCE(vtl_synic_mask_vmbus_sint_masked)) + mask |= EPOLLIN | EPOLLRDNORM; + + return mask; +} + +static void mshv_vtl_sint_on_msg_dpc(unsigned long data) +{ + WRITE_ONCE(has_message, true); + wake_up_interruptible_poll(&fd_wait_queue, EPOLLIN); +} + +static int mshv_vtl_sint_ioctl_post_msg(struct mshv_vtl_sint_post_msg __user *arg) +{ + struct mshv_vtl_sint_post_msg message; + u8 payload[HV_MESSAGE_PAYLOAD_BYTE_COUNT]; + + if (copy_from_user(&message, arg, sizeof(message))) + return -EFAULT; + if (message.payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT) + return -EINVAL; + if (copy_from_user(payload, (void __user *)message.payload_ptr, + message.payload_size)) + return -EFAULT; + + return hv_post_message((union hv_connection_id)message.connection_id, + message.message_type, (void *)payload, + message.payload_size); +} + +static int mshv_vtl_sint_ioctl_signal_event(struct mshv_vtl_signal_event __user *arg) +{ + u64 input, status; + struct mshv_vtl_signal_event signal_event; + + if (copy_from_user(&signal_event, arg, sizeof(signal_event))) + return -EFAULT; + + input = signal_event.connection_id | ((u64)signal_event.flag << 32); + + status = hv_do_fast_hypercall8(HVCALL_SIGNAL_EVENT, input); + + return hv_result_to_errno(status); +} + +static int mshv_vtl_sint_ioctl_set_eventfd(struct mshv_vtl_set_eventfd __user *arg) +{ + struct mshv_vtl_set_eventfd set_eventfd; + struct eventfd_ctx *eventfd, *old_eventfd; + + if (copy_from_user(&set_eventfd, arg, sizeof(set_eventfd))) + return -EFAULT; + if (set_eventfd.flag >= HV_EVENT_FLAGS_COUNT) + return -EINVAL; + + eventfd = NULL; + if (set_eventfd.fd >= 0) { + eventfd = eventfd_ctx_fdget(set_eventfd.fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + } + + guard(mutex)(&flag_lock); + old_eventfd = READ_ONCE(flag_eventfds[set_eventfd.flag]); + WRITE_ONCE(flag_eventfds[set_eventfd.flag], eventfd); + + if (old_eventfd) { + synchronize_rcu(); + eventfd_ctx_put(old_eventfd); + } + + return 0; +} + +static int mshv_vtl_sint_ioctl_pause_msg_stream(struct mshv_sint_mask __user *arg) +{ + static DEFINE_MUTEX(vtl2_vmbus_sint_mask_mutex); + struct mshv_sint_mask mask; + + if (copy_from_user(&mask, arg, sizeof(mask))) + return -EFAULT; + guard(mutex)(&vtl2_vmbus_sint_mask_mutex); + on_each_cpu((smp_call_func_t)mshv_vtl_synic_mask_vmbus_sint, &mask.mask, 1); + WRITE_ONCE(vtl_synic_mask_vmbus_sint_masked, mask.mask != 0); + if (mask.mask) + wake_up_interruptible_poll(&fd_wait_queue, EPOLLIN); + + return 0; +} + +static long mshv_vtl_sint_ioctl(struct file *f, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case MSHV_SINT_POST_MESSAGE: + return mshv_vtl_sint_ioctl_post_msg((struct mshv_vtl_sint_post_msg __user *)arg); + case MSHV_SINT_SIGNAL_EVENT: + return mshv_vtl_sint_ioctl_signal_event((struct mshv_vtl_signal_event __user *)arg); + case MSHV_SINT_SET_EVENTFD: + 
return mshv_vtl_sint_ioctl_set_eventfd((struct mshv_vtl_set_eventfd __user *)arg); + case MSHV_SINT_PAUSE_MESSAGE_STREAM: + return mshv_vtl_sint_ioctl_pause_msg_stream((struct mshv_sint_mask __user *)arg); + default: + return -ENOIOCTLCMD; + } +} + +static const struct file_operations mshv_vtl_sint_ops = { + .owner = THIS_MODULE, + .read = mshv_vtl_sint_read, + .poll = mshv_vtl_sint_poll, + .unlocked_ioctl = mshv_vtl_sint_ioctl, +}; + +static struct miscdevice mshv_vtl_sint_dev = { + .name = "mshv_sint", + .fops = &mshv_vtl_sint_ops, + .mode = 0600, + .minor = MISC_DYNAMIC_MINOR, +}; + +static int mshv_vtl_hvcall_dev_open(struct inode *node, struct file *f) +{ + struct miscdevice *dev = f->private_data; + struct mshv_vtl_hvcall_fd *fd; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + fd = vzalloc(sizeof(*fd)); + if (!fd) + return -ENOMEM; + fd->dev = dev; + f->private_data = fd; + mutex_init(&fd->init_mutex); + + return 0; +} + +static int mshv_vtl_hvcall_dev_release(struct inode *node, struct file *f) +{ + struct mshv_vtl_hvcall_fd *fd; + + fd = f->private_data; + if (fd) { + vfree(fd); + f->private_data = NULL; + } + + return 0; +} + +static int mshv_vtl_hvcall_do_setup(struct mshv_vtl_hvcall_fd *fd, + struct mshv_vtl_hvcall_setup __user *hvcall_setup_user) +{ + struct mshv_vtl_hvcall_setup hvcall_setup; + + guard(mutex)(&fd->init_mutex); + + if (fd->allow_map_initialized) { + dev_err(fd->dev->this_device, + "Hypercall allow map has already been set, pid %d\n", + current->pid); + return -EINVAL; + } + + if (copy_from_user(&hvcall_setup, hvcall_setup_user, + sizeof(struct mshv_vtl_hvcall_setup))) { + return -EFAULT; + } + if (hvcall_setup.bitmap_array_size > ARRAY_SIZE(fd->allow_bitmap)) + return -EINVAL; + + if (copy_from_user(&fd->allow_bitmap, + (void __user *)hvcall_setup.allow_bitmap_ptr, + hvcall_setup.bitmap_array_size)) { + return -EFAULT; + } + + dev_info(fd->dev->this_device, "Hypercall allow map has been set, pid %d\n", + current->pid); + fd->allow_map_initialized = true; + return 0; +} + +static bool mshv_vtl_hvcall_is_allowed(struct mshv_vtl_hvcall_fd *fd, u16 call_code) +{ + return test_bit(call_code, (unsigned long *)fd->allow_bitmap); +} + +static int mshv_vtl_hvcall_call(struct mshv_vtl_hvcall_fd *fd, + struct mshv_vtl_hvcall __user *hvcall_user) +{ + struct mshv_vtl_hvcall hvcall; + void *in, *out; + int ret; + + if (copy_from_user(&hvcall, hvcall_user, sizeof(struct mshv_vtl_hvcall))) + return -EFAULT; + if (hvcall.input_size > HV_HYP_PAGE_SIZE) + return -EINVAL; + if (hvcall.output_size > HV_HYP_PAGE_SIZE) + return -EINVAL; + + /* + * By default, all hypercalls are not allowed. + * The user mode code has to set up the allow bitmap once. + */ + + if (!mshv_vtl_hvcall_is_allowed(fd, hvcall.control & 0xFFFF)) { + dev_err(fd->dev->this_device, + "Hypercall with control data %#llx isn't allowed\n", + hvcall.control); + return -EPERM; + } + + /* + * This may create a problem for Confidential VM (CVM) usecase where we need to use + * Hyper-V driver allocated per-cpu input and output pages (hyperv_pcpu_input_arg and + * hyperv_pcpu_output_arg) for making a hypervisor call. + * + * TODO: Take care of this when CVM support is added. 
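+	 * For now a freshly allocated page is used for each direction; the
+	 * sizes are capped to HV_HYP_PAGE_SIZE above, so neither buffer can
+	 * cross a page boundary as the hypercall ABI requires.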
+ */ + in = (void *)__get_free_page(GFP_KERNEL); + out = (void *)__get_free_page(GFP_KERNEL); + + if (copy_from_user(in, (void __user *)hvcall.input_ptr, hvcall.input_size)) { + ret = -EFAULT; + goto free_pages; + } + + hvcall.status = hv_do_hypercall(hvcall.control, in, out); + + if (copy_to_user((void __user *)hvcall.output_ptr, out, hvcall.output_size)) { + ret = -EFAULT; + goto free_pages; + } + ret = put_user(hvcall.status, &hvcall_user->status); +free_pages: + free_page((unsigned long)in); + free_page((unsigned long)out); + + return ret; +} + +static long mshv_vtl_hvcall_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) +{ + struct mshv_vtl_hvcall_fd *fd = f->private_data; + + switch (cmd) { + case MSHV_HVCALL_SETUP: + return mshv_vtl_hvcall_do_setup(fd, (struct mshv_vtl_hvcall_setup __user *)arg); + case MSHV_HVCALL: + return mshv_vtl_hvcall_call(fd, (struct mshv_vtl_hvcall __user *)arg); + default: + break; + } + + return -ENOIOCTLCMD; +} + +static const struct file_operations mshv_vtl_hvcall_dev_file_ops = { + .owner = THIS_MODULE, + .open = mshv_vtl_hvcall_dev_open, + .release = mshv_vtl_hvcall_dev_release, + .unlocked_ioctl = mshv_vtl_hvcall_dev_ioctl, +}; + +static struct miscdevice mshv_vtl_hvcall_dev = { + .name = "mshv_hvcall", + .nodename = "mshv_hvcall", + .fops = &mshv_vtl_hvcall_dev_file_ops, + .mode = 0600, + .minor = MISC_DYNAMIC_MINOR, +}; + +static int mshv_vtl_low_open(struct inode *inodep, struct file *filp) +{ + pid_t pid = task_pid_vnr(current); + uid_t uid = current_uid().val; + int ret = 0; + + pr_debug("%s: Opening VTL low, task group %d, uid %d\n", __func__, pid, uid); + + if (capable(CAP_SYS_ADMIN)) { + filp->private_data = inodep; + } else { + pr_err("%s: VTL low open failed: CAP_SYS_ADMIN required. task group %d, uid %d", + __func__, pid, uid); + ret = -EPERM; + } + + return ret; +} + +static bool can_fault(struct vm_fault *vmf, unsigned long size, unsigned long *pfn) +{ + unsigned long mask = size - 1; + unsigned long start = vmf->address & ~mask; + unsigned long end = start + size; + bool is_valid; + + is_valid = (vmf->address & mask) == ((vmf->pgoff << PAGE_SHIFT) & mask) && + start >= vmf->vma->vm_start && + end <= vmf->vma->vm_end; + + if (is_valid) + *pfn = vmf->pgoff & ~(mask >> PAGE_SHIFT); + + return is_valid; +} + +static vm_fault_t mshv_vtl_low_huge_fault(struct vm_fault *vmf, unsigned int order) +{ + unsigned long pfn = vmf->pgoff; + vm_fault_t ret = VM_FAULT_FALLBACK; + + switch (order) { + case 0: + return vmf_insert_mixed(vmf->vma, vmf->address, pfn); + + case PMD_ORDER: + if (can_fault(vmf, PMD_SIZE, &pfn)) + ret = vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE); + return ret; + + case PUD_ORDER: + if (can_fault(vmf, PUD_SIZE, &pfn)) + ret = vmf_insert_pfn_pud(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE); + return ret; + + default: + return VM_FAULT_SIGBUS; + } +} + +static vm_fault_t mshv_vtl_low_fault(struct vm_fault *vmf) +{ + return mshv_vtl_low_huge_fault(vmf, 0); +} + +static const struct vm_operations_struct mshv_vtl_low_vm_ops = { + .fault = mshv_vtl_low_fault, + .huge_fault = mshv_vtl_low_huge_fault, +}; + +static int mshv_vtl_low_mmap(struct file *filp, struct vm_area_struct *vma) +{ + vma->vm_ops = &mshv_vtl_low_vm_ops; + vm_flags_set(vma, VM_HUGEPAGE | VM_MIXEDMAP); + + return 0; +} + +static const struct file_operations mshv_vtl_low_file_ops = { + .owner = THIS_MODULE, + .open = mshv_vtl_low_open, + .mmap = mshv_vtl_low_mmap, +}; + +static struct miscdevice mshv_vtl_low = { + .name = "mshv_vtl_low", + 
.nodename = "mshv_vtl_low", + .fops = &mshv_vtl_low_file_ops, + .mode = 0600, + .minor = MISC_DYNAMIC_MINOR, +}; + +static int __init mshv_vtl_init(void) +{ + int ret; + struct device *dev = mshv_dev.this_device; + + /* + * This creates /dev/mshv which provides functionality to create VTLs and partitions. + */ + ret = misc_register(&mshv_dev); + if (ret) { + dev_err(dev, "mshv device register failed: %d\n", ret); + goto free_dev; + } + + tasklet_init(&msg_dpc, mshv_vtl_sint_on_msg_dpc, 0); + init_waitqueue_head(&fd_wait_queue); + + if (mshv_vtl_get_vsm_regs()) { + dev_emerg(dev, "Unable to get VSM capabilities !!\n"); + ret = -ENODEV; + goto free_dev; + } + if (mshv_vtl_configure_vsm_partition(dev)) { + dev_emerg(dev, "VSM configuration failed !!\n"); + ret = -ENODEV; + goto free_dev; + } + + mshv_vtl_return_call_init(mshv_vsm_page_offsets.vtl_return_offset); + ret = hv_vtl_setup_synic(); + if (ret) + goto free_dev; + + /* + * mshv_sint device adds VMBus relay ioctl support. + * This provides a channel for VTL0 to communicate with VTL2. + */ + ret = misc_register(&mshv_vtl_sint_dev); + if (ret) + goto free_synic; + + /* + * mshv_hvcall device adds interface to enable userspace for direct hypercalls support. + */ + ret = misc_register(&mshv_vtl_hvcall_dev); + if (ret) + goto free_sint; + + /* + * mshv_vtl_low device is used to map VTL0 address space to a user-mode process in VTL2. + * It implements mmap() to allow a user-mode process in VTL2 to map to the address of VTL0. + */ + ret = misc_register(&mshv_vtl_low); + if (ret) + goto free_hvcall; + + /* + * "mshv vtl mem dev" device is later used to setup VTL0 memory. + */ + mem_dev = kzalloc(sizeof(*mem_dev), GFP_KERNEL); + if (!mem_dev) { + ret = -ENOMEM; + goto free_low; + } + + mutex_init(&mshv_vtl_poll_file_lock); + + device_initialize(mem_dev); + dev_set_name(mem_dev, "mshv vtl mem dev"); + ret = device_add(mem_dev); + if (ret) { + dev_err(dev, "mshv vtl mem dev add: %d\n", ret); + goto free_mem; + } + + return 0; + +free_mem: + kfree(mem_dev); +free_low: + misc_deregister(&mshv_vtl_low); +free_hvcall: + misc_deregister(&mshv_vtl_hvcall_dev); +free_sint: + misc_deregister(&mshv_vtl_sint_dev); +free_synic: + hv_vtl_remove_synic(); +free_dev: + misc_deregister(&mshv_dev); + + return ret; +} + +static void __exit mshv_vtl_exit(void) +{ + device_del(mem_dev); + kfree(mem_dev); + misc_deregister(&mshv_vtl_low); + misc_deregister(&mshv_vtl_hvcall_dev); + misc_deregister(&mshv_vtl_sint_dev); + hv_vtl_remove_synic(); + misc_deregister(&mshv_dev); +} + +module_init(mshv_vtl_init); +module_exit(mshv_vtl_exit); diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c index c6692fd5ab15..3c421a7f78c0 100644 --- a/drivers/hv/ring_buffer.c +++ b/drivers/hv/ring_buffer.c @@ -18,6 +18,7 @@ #include <linux/slab.h> #include <linux/prefetch.h> #include <linux/io.h> +#include <linux/export.h> #include <asm/mshyperv.h> #include "hyperv_vmbus.h" @@ -183,11 +184,10 @@ void hv_ringbuffer_pre_init(struct vmbus_channel *channel) /* Initialize the ring buffer. */ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info, - struct page *pages, u32 page_cnt, u32 max_pkt_size) + struct page *pages, u32 page_cnt, u32 max_pkt_size, + bool confidential) { struct page **pages_wraparound; - unsigned long *pfns_wraparound; - u64 pfn; int i; BUILD_BUG_ON((sizeof(struct hv_ring_buffer) != PAGE_SIZE)); @@ -196,50 +196,30 @@ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info, * First page holds struct hv_ring_buffer, do wraparound mapping for * the rest. 
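 * The wraparound mapping places the data pages in the virtual address
 * space twice, back to back, so a packet that wraps past the end of the
 * ring can still be accessed through one contiguous virtual range.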
*/ - if (hv_isolation_type_snp()) { - pfn = page_to_pfn(pages) + - PFN_DOWN(ms_hyperv.shared_gpa_boundary); + pages_wraparound = kcalloc(page_cnt * 2 - 1, + sizeof(struct page *), + GFP_KERNEL); + if (!pages_wraparound) + return -ENOMEM; - pfns_wraparound = kcalloc(page_cnt * 2 - 1, - sizeof(unsigned long), GFP_KERNEL); - if (!pfns_wraparound) - return -ENOMEM; - - pfns_wraparound[0] = pfn; - for (i = 0; i < 2 * (page_cnt - 1); i++) - pfns_wraparound[i + 1] = pfn + i % (page_cnt - 1) + 1; - - ring_info->ring_buffer = (struct hv_ring_buffer *) - vmap_pfn(pfns_wraparound, page_cnt * 2 - 1, - PAGE_KERNEL); - kfree(pfns_wraparound); - - if (!ring_info->ring_buffer) - return -ENOMEM; - - /* Zero ring buffer after setting memory host visibility. */ - memset(ring_info->ring_buffer, 0x00, PAGE_SIZE * page_cnt); - } else { - pages_wraparound = kcalloc(page_cnt * 2 - 1, - sizeof(struct page *), - GFP_KERNEL); - if (!pages_wraparound) - return -ENOMEM; - - pages_wraparound[0] = pages; - for (i = 0; i < 2 * (page_cnt - 1); i++) - pages_wraparound[i + 1] = - &pages[i % (page_cnt - 1) + 1]; + pages_wraparound[0] = pages; + for (i = 0; i < 2 * (page_cnt - 1); i++) + pages_wraparound[i + 1] = + &pages[i % (page_cnt - 1) + 1]; - ring_info->ring_buffer = (struct hv_ring_buffer *) - vmap(pages_wraparound, page_cnt * 2 - 1, VM_MAP, - PAGE_KERNEL); + ring_info->ring_buffer = (struct hv_ring_buffer *) + vmap(pages_wraparound, page_cnt * 2 - 1, VM_MAP, + confidential ? PAGE_KERNEL : pgprot_decrypted(PAGE_KERNEL)); - kfree(pages_wraparound); - if (!ring_info->ring_buffer) - return -ENOMEM; - } + kfree(pages_wraparound); + if (!ring_info->ring_buffer) + return -ENOMEM; + /* + * Ensure the header page is zero'ed since + * encryption status may have changed. + */ + memset(ring_info->ring_buffer, 0, HV_HYP_PAGE_SIZE); ring_info->ring_buffer->read_index = ring_info->ring_buffer->write_index = 0; diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 3146710d4ac6..a53af6fe81a6 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -12,6 +12,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/device.h> +#include <linux/platform_device.h> #include <linux/interrupt.h> #include <linux/sysctl.h> #include <linux/slab.h> @@ -19,6 +20,7 @@ #include <linux/completion.h> #include <linux/hyperv.h> #include <linux/kernel_stat.h> +#include <linux/of_address.h> #include <linux/clockchips.h> #include <linux/cpu.h> #include <linux/sched/isolation.h> @@ -28,13 +30,13 @@ #include <linux/panic_notifier.h> #include <linux/ptrace.h> #include <linux/screen_info.h> -#include <linux/kdebug.h> #include <linux/efi.h> #include <linux/random.h> #include <linux/kernel.h> #include <linux/syscore_ops.h> #include <linux/dma-map-ops.h> #include <linux/pci.h> +#include <linux/export.h> #include <clocksource/hyperv_timer.h> #include <asm/mshyperv.h> #include "hyperv_vmbus.h" @@ -44,12 +46,11 @@ struct vmbus_dynid { struct hv_vmbus_device_id id; }; -static struct acpi_device *hv_acpi_dev; +/* VMBus Root Device */ +static struct device *vmbus_root_device; static int hyperv_cpuhp_online; -static void *hv_panic_page; - static long __percpu *vmbus_evt; /* Values parsed from ACPI DSDT */ @@ -57,16 +58,16 @@ int vmbus_irq; int vmbus_interrupt; /* - * Boolean to control whether to report panic messages over Hyper-V. - * - * It can be set via /proc/sys/kernel/hyperv_record_panic_msg + * If the Confidential VMBus is used, the data on the "wire" is not + * visible to either the host or the hypervisor. 
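+ * In that case the ring buffers are left encrypted: see hv_ringbuffer_init(),
+ * which maps a confidential ring buffer with PAGE_KERNEL instead of
+ * pgprot_decrypted(PAGE_KERNEL).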
*/ -static int sysctl_record_panic_msg = 1; +static bool is_confidential; -static int hyperv_report_reg(void) +bool vmbus_is_confidential(void) { - return !sysctl_record_panic_msg || !hv_panic_page; + return is_confidential; } +EXPORT_SYMBOL_GPL(vmbus_is_confidential); /* * The panic notifier below is responsible solely for unloading the @@ -88,62 +89,20 @@ static struct notifier_block hyperv_panic_vmbus_unload_block = { .priority = INT_MIN + 1, /* almost the latest one to execute */ }; -static int hv_die_panic_notify_crash(struct notifier_block *self, - unsigned long val, void *args); - -static struct notifier_block hyperv_die_report_block = { - .notifier_call = hv_die_panic_notify_crash, -}; -static struct notifier_block hyperv_panic_report_block = { - .notifier_call = hv_die_panic_notify_crash, -}; - -/* - * The following callback works both as die and panic notifier; its - * goal is to provide panic information to the hypervisor unless the - * kmsg dumper is used [see hv_kmsg_dump()], which provides more - * information but isn't always available. - * - * Notice that both the panic/die report notifiers are registered only - * if we have the capability HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE set. - */ -static int hv_die_panic_notify_crash(struct notifier_block *self, - unsigned long val, void *args) -{ - struct pt_regs *regs; - bool is_die; - - /* Don't notify Hyper-V unless we have a die oops event or panic. */ - if (self == &hyperv_panic_report_block) { - is_die = false; - regs = current_pt_regs(); - } else { /* die event */ - if (val != DIE_OOPS) - return NOTIFY_DONE; - - is_die = true; - regs = ((struct die_args *)args)->regs; - } - - /* - * Hyper-V should be notified only once about a panic/die. If we will - * be calling hv_kmsg_dump() later with kmsg data, don't do the - * notification here. 
- */ - if (hyperv_report_reg()) - hyperv_report_panic(regs, val, is_die); - - return NOTIFY_DONE; -} - static const char *fb_mmio_name = "fb_range"; static struct resource *fb_mmio; static struct resource *hyperv_mmio; static DEFINE_MUTEX(hyperv_mmio_lock); +struct device *hv_get_vmbus_root_device(void) +{ + return vmbus_root_device; +} +EXPORT_SYMBOL_GPL(hv_get_vmbus_root_device); + static int vmbus_exists(void) { - if (hv_acpi_dev == NULL) + if (vmbus_root_device == NULL) return -ENODEV; return 0; @@ -192,7 +151,7 @@ static ssize_t id_show(struct device *dev, struct device_attribute *dev_attr, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", hv_dev->channel->offermsg.child_relid); + return sysfs_emit(buf, "%d\n", hv_dev->channel->offermsg.child_relid); } static DEVICE_ATTR_RO(id); @@ -203,7 +162,7 @@ static ssize_t state_show(struct device *dev, struct device_attribute *dev_attr, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", hv_dev->channel->state); + return sysfs_emit(buf, "%d\n", hv_dev->channel->state); } static DEVICE_ATTR_RO(state); @@ -214,7 +173,7 @@ static ssize_t monitor_id_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", hv_dev->channel->offermsg.monitorid); + return sysfs_emit(buf, "%d\n", hv_dev->channel->offermsg.monitorid); } static DEVICE_ATTR_RO(monitor_id); @@ -225,8 +184,8 @@ static ssize_t class_id_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "{%pUl}\n", - &hv_dev->channel->offermsg.offer.if_type); + return sysfs_emit(buf, "{%pUl}\n", + &hv_dev->channel->offermsg.offer.if_type); } static DEVICE_ATTR_RO(class_id); @@ -237,8 +196,8 @@ static ssize_t device_id_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "{%pUl}\n", - &hv_dev->channel->offermsg.offer.if_instance); + return sysfs_emit(buf, "{%pUl}\n", + &hv_dev->channel->offermsg.offer.if_instance); } static DEVICE_ATTR_RO(device_id); @@ -247,7 +206,7 @@ static ssize_t modalias_show(struct device *dev, { struct hv_device *hv_dev = device_to_hv_device(dev); - return sprintf(buf, "vmbus:%*phN\n", UUID_SIZE, &hv_dev->dev_type); + return sysfs_emit(buf, "vmbus:%*phN\n", UUID_SIZE, &hv_dev->dev_type); } static DEVICE_ATTR_RO(modalias); @@ -260,7 +219,7 @@ static ssize_t numa_node_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", cpu_to_node(hv_dev->channel->target_cpu)); + return sysfs_emit(buf, "%d\n", cpu_to_node(hv_dev->channel->target_cpu)); } static DEVICE_ATTR_RO(numa_node); #endif @@ -273,9 +232,8 @@ static ssize_t server_monitor_pending_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_pending(hv_dev->channel, - vmbus_connection.monitor_pages[0])); + return sysfs_emit(buf, "%d\n", channel_pending(hv_dev->channel, + vmbus_connection.monitor_pages[0])); } static DEVICE_ATTR_RO(server_monitor_pending); @@ -287,9 +245,8 @@ static ssize_t client_monitor_pending_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_pending(hv_dev->channel, - vmbus_connection.monitor_pages[1])); + return sysfs_emit(buf, "%d\n", channel_pending(hv_dev->channel, + vmbus_connection.monitor_pages[1])); } static DEVICE_ATTR_RO(client_monitor_pending); @@ -301,9 +258,8 @@ static ssize_t server_monitor_latency_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_latency(hv_dev->channel, - 
vmbus_connection.monitor_pages[0])); + return sysfs_emit(buf, "%d\n", channel_latency(hv_dev->channel, + vmbus_connection.monitor_pages[0])); } static DEVICE_ATTR_RO(server_monitor_latency); @@ -315,9 +271,8 @@ static ssize_t client_monitor_latency_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_latency(hv_dev->channel, - vmbus_connection.monitor_pages[1])); + return sysfs_emit(buf, "%d\n", channel_latency(hv_dev->channel, + vmbus_connection.monitor_pages[1])); } static DEVICE_ATTR_RO(client_monitor_latency); @@ -329,9 +284,8 @@ static ssize_t server_monitor_conn_id_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_conn_id(hv_dev->channel, - vmbus_connection.monitor_pages[0])); + return sysfs_emit(buf, "%d\n", channel_conn_id(hv_dev->channel, + vmbus_connection.monitor_pages[0])); } static DEVICE_ATTR_RO(server_monitor_conn_id); @@ -343,9 +297,8 @@ static ssize_t client_monitor_conn_id_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_conn_id(hv_dev->channel, - vmbus_connection.monitor_pages[1])); + return sysfs_emit(buf, "%d\n", channel_conn_id(hv_dev->channel, + vmbus_connection.monitor_pages[1])); } static DEVICE_ATTR_RO(client_monitor_conn_id); @@ -364,7 +317,7 @@ static ssize_t out_intr_mask_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", outbound.current_interrupt_mask); + return sysfs_emit(buf, "%d\n", outbound.current_interrupt_mask); } static DEVICE_ATTR_RO(out_intr_mask); @@ -382,7 +335,7 @@ static ssize_t out_read_index_show(struct device *dev, &outbound); if (ret < 0) return ret; - return sprintf(buf, "%d\n", outbound.current_read_index); + return sysfs_emit(buf, "%u\n", outbound.current_read_index); } static DEVICE_ATTR_RO(out_read_index); @@ -401,7 +354,7 @@ static ssize_t out_write_index_show(struct device *dev, &outbound); if (ret < 0) return ret; - return sprintf(buf, "%d\n", outbound.current_write_index); + return sysfs_emit(buf, "%u\n", outbound.current_write_index); } static DEVICE_ATTR_RO(out_write_index); @@ -420,7 +373,7 @@ static ssize_t out_read_bytes_avail_show(struct device *dev, &outbound); if (ret < 0) return ret; - return sprintf(buf, "%d\n", outbound.bytes_avail_toread); + return sysfs_emit(buf, "%d\n", outbound.bytes_avail_toread); } static DEVICE_ATTR_RO(out_read_bytes_avail); @@ -439,7 +392,7 @@ static ssize_t out_write_bytes_avail_show(struct device *dev, &outbound); if (ret < 0) return ret; - return sprintf(buf, "%d\n", outbound.bytes_avail_towrite); + return sysfs_emit(buf, "%d\n", outbound.bytes_avail_towrite); } static DEVICE_ATTR_RO(out_write_bytes_avail); @@ -457,7 +410,7 @@ static ssize_t in_intr_mask_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", inbound.current_interrupt_mask); + return sysfs_emit(buf, "%d\n", inbound.current_interrupt_mask); } static DEVICE_ATTR_RO(in_intr_mask); @@ -475,7 +428,7 @@ static ssize_t in_read_index_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", inbound.current_read_index); + return sysfs_emit(buf, "%d\n", inbound.current_read_index); } static DEVICE_ATTR_RO(in_read_index); @@ -493,7 +446,7 @@ static ssize_t in_write_index_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", inbound.current_write_index); + return sysfs_emit(buf, "%d\n", inbound.current_write_index); } static DEVICE_ATTR_RO(in_write_index); @@ -512,7 +465,7 @@ static 
ssize_t in_read_bytes_avail_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", inbound.bytes_avail_toread); + return sysfs_emit(buf, "%d\n", inbound.bytes_avail_toread); } static DEVICE_ATTR_RO(in_read_bytes_avail); @@ -531,7 +484,7 @@ static ssize_t in_write_bytes_avail_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", inbound.bytes_avail_towrite); + return sysfs_emit(buf, "%d\n", inbound.bytes_avail_towrite); } static DEVICE_ATTR_RO(in_write_bytes_avail); @@ -541,7 +494,7 @@ static ssize_t channel_vp_mapping_show(struct device *dev, { struct hv_device *hv_dev = device_to_hv_device(dev); struct vmbus_channel *channel = hv_dev->channel, *cur_sc; - int buf_size = PAGE_SIZE, n_written, tot_written; + int n_written; struct list_head *cur; if (!channel) @@ -549,25 +502,21 @@ static ssize_t channel_vp_mapping_show(struct device *dev, mutex_lock(&vmbus_connection.channel_mutex); - tot_written = snprintf(buf, buf_size, "%u:%u\n", - channel->offermsg.child_relid, channel->target_cpu); + n_written = sysfs_emit(buf, "%u:%u\n", + channel->offermsg.child_relid, + channel->target_cpu); list_for_each(cur, &channel->sc_list) { - if (tot_written >= buf_size - 1) - break; cur_sc = list_entry(cur, struct vmbus_channel, sc_list); - n_written = scnprintf(buf + tot_written, - buf_size - tot_written, - "%u:%u\n", - cur_sc->offermsg.child_relid, - cur_sc->target_cpu); - tot_written += n_written; + n_written += sysfs_emit_at(buf, n_written, "%u:%u\n", + cur_sc->offermsg.child_relid, + cur_sc->target_cpu); } mutex_unlock(&vmbus_connection.channel_mutex); - return tot_written; + return n_written; } static DEVICE_ATTR_RO(channel_vp_mapping); @@ -577,7 +526,7 @@ static ssize_t vendor_show(struct device *dev, { struct hv_device *hv_dev = device_to_hv_device(dev); - return sprintf(buf, "0x%x\n", hv_dev->vendor_id); + return sysfs_emit(buf, "0x%x\n", hv_dev->vendor_id); } static DEVICE_ATTR_RO(vendor); @@ -587,7 +536,7 @@ static ssize_t device_show(struct device *dev, { struct hv_device *hv_dev = device_to_hv_device(dev); - return sprintf(buf, "0x%x\n", hv_dev->device_id); + return sysfs_emit(buf, "0x%x\n", hv_dev->device_id); } static DEVICE_ATTR_RO(device); @@ -612,7 +561,7 @@ static ssize_t driver_override_show(struct device *dev, ssize_t len; device_lock(dev); - len = snprintf(buf, PAGE_SIZE, "%s\n", hv_dev->driver_override); + len = sysfs_emit(buf, "%s\n", hv_dev->driver_override); device_unlock(dev); return len; @@ -684,7 +633,7 @@ static const struct attribute_group vmbus_dev_group = { __ATTRIBUTE_GROUPS(vmbus_dev); /* Set up the attribute for /sys/bus/vmbus/hibernation */ -static ssize_t hibernation_show(struct bus_type *bus, char *buf) +static ssize_t hibernation_show(const struct bus_type *bus, char *buf) { return sprintf(buf, "%d\n", !!hv_is_hibernation_supported()); } @@ -711,9 +660,9 @@ __ATTRIBUTE_GROUPS(vmbus_bus); * representation of the device guid (each byte of the guid will be * represented with two hex characters. */ -static int vmbus_uevent(struct device *device, struct kobj_uevent_env *env) +static int vmbus_uevent(const struct device *device, struct kobj_uevent_env *env) { - struct hv_device *dev = device_to_hv_device(device); + const struct hv_device *dev = device_to_hv_device(device); const char *format = "MODALIAS=vmbus:%*phN"; return add_uevent_var(env, format, UUID_SIZE, &dev->dev_type); @@ -756,7 +705,7 @@ static const struct hv_vmbus_device_id vmbus_device_null; * Return a matching hv_vmbus_device_id pointer. 
* If there is no match, return NULL. */ -static const struct hv_vmbus_device_id *hv_vmbus_get_id(struct hv_driver *drv, +static const struct hv_vmbus_device_id *hv_vmbus_get_id(const struct hv_driver *drv, struct hv_device *dev) { const guid_t *guid = &dev->dev_type; @@ -767,7 +716,7 @@ static const struct hv_vmbus_device_id *hv_vmbus_get_id(struct hv_driver *drv, return NULL; /* Look at the dynamic ids first, before the static ones */ - id = hv_vmbus_dynid_match(drv, guid); + id = hv_vmbus_dynid_match((struct hv_driver *)drv, guid); if (!id) id = hv_vmbus_dev_match(drv->id_table, guid); @@ -778,7 +727,30 @@ static const struct hv_vmbus_device_id *hv_vmbus_get_id(struct hv_driver *drv, return id; } -/* vmbus_add_dynid - add a new device ID to this driver and re-probe devices */ +/* vmbus_add_dynid - add a new device ID to this driver and re-probe devices + * + * This function can race with vmbus_device_register(). This function is + * typically running on a user thread in response to writing to the "new_id" + * sysfs entry for a driver. vmbus_device_register() is running on a + * workqueue thread in response to the Hyper-V host offering a device to the + * guest. This function calls driver_attach(), which looks for an existing + * device matching the new id, and attaches the driver to which the new id + * has been assigned. vmbus_device_register() calls device_register(), which + * looks for a driver that matches the device being registered. If both + * operations are running simultaneously, the device driver probe function runs + * on whichever thread establishes the linkage between the driver and device. + * + * In most cases, it doesn't matter which thread runs the driver probe + * function. But if vmbus_device_register() does not find a matching driver, + * it proceeds to create the "channels" subdirectory and numbered per-channel + * subdirectory in sysfs. While that multi-step creation is in progress, this + * function could run the driver probe function. If the probe function checks + * for, or operates on, entries in the "channels" subdirectory, including by + * calling hv_create_ring_sysfs(), the operation may or may not succeed + * depending on the race. The race can't create a kernel failure in VMBus + * or device subsystem code, but probe functions in VMBus drivers doing such + * operations must be prepared for the failure case. + */ static int vmbus_add_dynid(struct hv_driver *drv, guid_t *guid) { struct vmbus_dynid *dynid; @@ -880,9 +852,9 @@ ATTRIBUTE_GROUPS(vmbus_drv); /* * vmbus_match - Attempt to match the specified device to the specified driver */ -static int vmbus_match(struct device *device, struct device_driver *driver) +static int vmbus_match(struct device *device, const struct device_driver *driver) { - struct hv_driver *drv = drv_to_hv_drv(driver); + const struct hv_driver *drv = drv_to_hv_drv(driver); struct hv_device *hv_dev = device_to_hv_device(device); /* The hv_sock driver handles all hv_sock offers. */ @@ -932,7 +904,7 @@ static int vmbus_dma_configure(struct device *child_device) * On x86/x64 coherence is assumed and these calls have no effect. 
*/ hv_setup_dma_ops(child_device, - device_get_dma_attr(&hv_acpi_dev->dev) == DEV_DMA_COHERENT); + device_get_dma_attr(vmbus_root_device) == DEV_DMA_COHERENT); return 0; } @@ -1049,7 +1021,7 @@ static const struct dev_pm_ops vmbus_pm = { }; /* The one and only one */ -static struct bus_type hv_bus = { +static const struct bus_type hv_bus = { .name = "vmbus", .match = vmbus_match, .shutdown = vmbus_shutdown, @@ -1086,12 +1058,9 @@ static void vmbus_onmessage_work(struct work_struct *work) kfree(ctx); } -void vmbus_on_msg_dpc(unsigned long data) +static void __vmbus_on_msg_dpc(void *message_page_addr) { - struct hv_per_cpu_context *hv_cpu = (void *)data; - void *page_addr = hv_cpu->synic_message_page; - struct hv_message msg_copy, *msg = (struct hv_message *)page_addr + - VMBUS_MESSAGE_SINT; + struct hv_message msg_copy, *msg; struct vmbus_channel_message_header *hdr; enum vmbus_channel_message_type msgtype; const struct vmbus_channel_message_table_entry *entry; @@ -1099,6 +1068,10 @@ void vmbus_on_msg_dpc(unsigned long data) __u8 payload_size; u32 message_type; + if (!message_page_addr) + return; + msg = (struct hv_message *)message_page_addr + VMBUS_MESSAGE_SINT; + /* * 'enum vmbus_channel_message_type' is supposed to always be 'u32' as * it is being used in 'struct vmbus_channel_message_header' definition @@ -1224,6 +1197,14 @@ msg_handled: vmbus_signal_eom(msg, message_type); } +void vmbus_on_msg_dpc(unsigned long data) +{ + struct hv_per_cpu_context *hv_cpu = (void *)data; + + __vmbus_on_msg_dpc(hv_cpu->hyp_synic_message_page); + __vmbus_on_msg_dpc(hv_cpu->para_synic_message_page); +} + #ifdef CONFIG_PM_SLEEP /* * Fake RESCIND_CHANNEL messages to clean up hv_sock channels by force for @@ -1262,21 +1243,19 @@ static void vmbus_force_channel_rescinded(struct vmbus_channel *channel) #endif /* CONFIG_PM_SLEEP */ /* - * Schedule all channels with events pending + * Schedule all channels with events pending. + * The event page can be directly checked to get the id of + * the channel that has the interrupt pending. */ -static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu) +static void vmbus_chan_sched(void *event_page_addr) { unsigned long *recv_int_page; u32 maxbits, relid; + union hv_synic_event_flags *event; - /* - * The event page can be directly checked to get the id of - * the channel that has the interrupt pending. - */ - void *page_addr = hv_cpu->synic_event_page; - union hv_synic_event_flags *event - = (union hv_synic_event_flags *)page_addr + - VMBUS_MESSAGE_SINT; + if (!event_page_addr) + return; + event = (union hv_synic_event_flags *)event_page_addr + VMBUS_MESSAGE_SINT; maxbits = HV_EVENT_FLAGS_COUNT; recv_int_page = event->flags; @@ -1284,6 +1263,11 @@ static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu) if (unlikely(!recv_int_page)) return; + /* + * Suggested-by: Michael Kelley <mhklinux@outlook.com> + * One possible optimization would be to keep track of the largest relID that's in use, + * and only scan up to that relID. 
+ */ for_each_set_bit(relid, recv_int_page, maxbits) { void (*callback_fn)(void *context); struct vmbus_channel *channel; @@ -1347,29 +1331,39 @@ sched_unlock_rcu: } } -static void vmbus_isr(void) +static void vmbus_message_sched(struct hv_per_cpu_context *hv_cpu, void *message_page_addr) { - struct hv_per_cpu_context *hv_cpu - = this_cpu_ptr(hv_context.cpu_context); - void *page_addr; struct hv_message *msg; - vmbus_chan_sched(hv_cpu); - - page_addr = hv_cpu->synic_message_page; - msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT; + if (!message_page_addr) + return; + msg = (struct hv_message *)message_page_addr + VMBUS_MESSAGE_SINT; /* Check if there are actual msgs to be processed */ if (msg->header.message_type != HVMSG_NONE) { if (msg->header.message_type == HVMSG_TIMER_EXPIRED) { hv_stimer0_isr(); vmbus_signal_eom(msg, HVMSG_TIMER_EXPIRED); - } else + } else { tasklet_schedule(&hv_cpu->msg_dpc); + } } +} + +void vmbus_isr(void) +{ + struct hv_per_cpu_context *hv_cpu + = this_cpu_ptr(hv_context.cpu_context); + + vmbus_chan_sched(hv_cpu->hyp_synic_event_page); + vmbus_chan_sched(hv_cpu->para_synic_event_page); + + vmbus_message_sched(hv_cpu, hv_cpu->hyp_synic_message_page); + vmbus_message_sched(hv_cpu, hv_cpu->para_synic_message_page); add_interrupt_randomness(vmbus_interrupt); } +EXPORT_SYMBOL_FOR_MODULES(vmbus_isr, "mshv_vtl"); static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id) { @@ -1377,97 +1371,65 @@ static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id) return IRQ_HANDLED; } -/* - * Callback from kmsg_dump. Grab as much as possible from the end of the kmsg - * buffer and call into Hyper-V to transfer the data. - */ -static void hv_kmsg_dump(struct kmsg_dumper *dumper, - enum kmsg_dump_reason reason) +static void vmbus_percpu_work(struct work_struct *work) { - struct kmsg_dump_iter iter; - size_t bytes_written; + unsigned int cpu = smp_processor_id(); - /* We are only interested in panics. */ - if ((reason != KMSG_DUMP_PANIC) || (!sysctl_record_panic_msg)) - return; - - /* - * Write dump contents to the page. No need to synchronize; panic should - * be single-threaded. - */ - kmsg_dump_rewind(&iter); - kmsg_dump_get_buffer(&iter, false, hv_panic_page, HV_HYP_PAGE_SIZE, - &bytes_written); - if (!bytes_written) - return; - /* - * P3 to contain the physical address of the panic page & P4 to - * contain the size of the panic data in that page. Rest of the - * registers are no-op when the NOTIFY_MSG flag is set. - */ - hv_set_register(HV_REGISTER_CRASH_P0, 0); - hv_set_register(HV_REGISTER_CRASH_P1, 0); - hv_set_register(HV_REGISTER_CRASH_P2, 0); - hv_set_register(HV_REGISTER_CRASH_P3, virt_to_phys(hv_panic_page)); - hv_set_register(HV_REGISTER_CRASH_P4, bytes_written); - - /* - * Let Hyper-V know there is crash data available along with - * the panic message. 
- */ - hv_set_register(HV_REGISTER_CRASH_CTL, - (HV_CRASH_CTL_CRASH_NOTIFY | HV_CRASH_CTL_CRASH_NOTIFY_MSG)); + hv_synic_init(cpu); } -static struct kmsg_dumper hv_kmsg_dumper = { - .dump = hv_kmsg_dump, -}; - -static void hv_kmsg_dump_register(void) +static int vmbus_alloc_synic_and_connect(void) { - int ret; + int ret, cpu; + struct work_struct __percpu *works; + int hyperv_cpuhp_online; - hv_panic_page = hv_alloc_hyperv_zeroed_page(); - if (!hv_panic_page) { - pr_err("Hyper-V: panic message page memory allocation failed\n"); - return; + ret = hv_synic_alloc(); + if (ret < 0) + goto err_alloc; + + works = alloc_percpu(struct work_struct); + if (!works) { + ret = -ENOMEM; + goto err_alloc; } - ret = kmsg_dump_register(&hv_kmsg_dumper); - if (ret) { - pr_err("Hyper-V: kmsg dump register error 0x%x\n", ret); - hv_free_hyperv_page((unsigned long)hv_panic_page); - hv_panic_page = NULL; + /* + * Initialize the per-cpu interrupt state and stimer state. + * Then connect to the host. + */ + cpus_read_lock(); + for_each_online_cpu(cpu) { + struct work_struct *work = per_cpu_ptr(works, cpu); + + INIT_WORK(work, vmbus_percpu_work); + schedule_work_on(cpu, work); } -} -static struct ctl_table_header *hv_ctl_table_hdr; + for_each_online_cpu(cpu) + flush_work(per_cpu_ptr(works, cpu)); -/* - * sysctl option to allow the user to control whether kmsg data should be - * reported to Hyper-V on panic. - */ -static struct ctl_table hv_ctl_table[] = { - { - .procname = "hyperv_record_panic_msg", - .data = &sysctl_record_panic_msg, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE - }, - {} -}; + /* Register the callbacks for possible CPU online/offline'ing */ + ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online", + hv_synic_init, hv_synic_cleanup); + cpus_read_unlock(); + free_percpu(works); + if (ret < 0) + goto err_alloc; + hyperv_cpuhp_online = ret; -static struct ctl_table hv_root_table[] = { - { - .procname = "kernel", - .mode = 0555, - .child = hv_ctl_table - }, - {} -}; + ret = vmbus_connect(); + if (ret) + goto err_connect; + return 0; + +err_connect: + cpuhp_remove_state(hyperv_cpuhp_online); + return -ENODEV; +err_alloc: + hv_synic_free(); + return -ENOMEM; +} /* * vmbus_bus_init -Main vmbus driver initialization routine. @@ -1514,56 +1476,18 @@ static int vmbus_bus_init(void) } } - ret = hv_synic_alloc(); - if (ret) - goto err_alloc; - /* - * Initialize the per-cpu interrupt state and stimer state. - * Then connect to the host. + * Cache the value as getting it involves a VM exit on x86(_64), and + * doing that on each VP while initializing SynIC's wastes time. 
*/ - ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online", - hv_synic_init, hv_synic_cleanup); - if (ret < 0) - goto err_cpuhp; - hyperv_cpuhp_online = ret; - - ret = vmbus_connect(); + is_confidential = ms_hyperv.confidential_vmbus_available; + if (is_confidential) + pr_info("Establishing connection to the confidential VMBus\n"); + hv_para_set_sint_proxy(!is_confidential); + ret = vmbus_alloc_synic_and_connect(); if (ret) goto err_connect; - if (hv_is_isolation_supported()) - sysctl_record_panic_msg = 0; - - /* - * Only register if the crash MSRs are available - */ - if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) { - u64 hyperv_crash_ctl; - /* - * Panic message recording (sysctl_record_panic_msg) - * is enabled by default in non-isolated guests and - * disabled by default in isolated guests; the panic - * message recording won't be available in isolated - * guests should the following registration fail. - */ - hv_ctl_table_hdr = register_sysctl_table(hv_root_table); - if (!hv_ctl_table_hdr) - pr_err("Hyper-V: sysctl table register error"); - - /* - * Register for panic kmsg callback only if the right - * capability is supported by the hypervisor. - */ - hyperv_crash_ctl = hv_get_register(HV_REGISTER_CRASH_CTL); - if (hyperv_crash_ctl & HV_CRASH_CTL_CRASH_NOTIFY_MSG) - hv_kmsg_dump_register(); - - register_die_notifier(&hyperv_die_report_block); - atomic_notifier_chain_register(&panic_notifier_list, - &hyperv_panic_report_block); - } - /* * Always register the vmbus unload panic notifier because we * need to shut the VMbus channel connection on panic. @@ -1576,10 +1500,6 @@ static int vmbus_bus_init(void) return 0; err_connect: - cpuhp_remove_state(hyperv_cpuhp_online); -err_cpuhp: - hv_synic_free(); -err_alloc: if (vmbus_irq == -1) { hv_remove_vmbus_handler(); } else { @@ -1588,8 +1508,6 @@ err_alloc: } err_setup: bus_unregister(&hv_bus); - unregister_sysctl_table(hv_ctl_table_hdr); - hv_ctl_table_hdr = NULL; return ret; } @@ -1781,16 +1699,16 @@ static ssize_t target_cpu_show(struct vmbus_channel *channel, char *buf) { return sprintf(buf, "%u\n", channel->target_cpu); } -static ssize_t target_cpu_store(struct vmbus_channel *channel, - const char *buf, size_t count) + +int vmbus_channel_set_cpu(struct vmbus_channel *channel, u32 target_cpu) { - u32 target_cpu, origin_cpu; - ssize_t ret = count; + u32 origin_cpu; + int ret = 0; - if (vmbus_proto_version < VERSION_WIN10_V4_1) - return -EIO; + lockdep_assert_cpus_held(); + lockdep_assert_held(&vmbus_connection.channel_mutex); - if (sscanf(buf, "%uu", &target_cpu) != 1) + if (vmbus_proto_version < VERSION_WIN10_V4_1) return -EIO; /* Validate target_cpu for the cpumask_test_cpu() operation below. */ @@ -1800,22 +1718,17 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel, if (!cpumask_test_cpu(target_cpu, housekeeping_cpumask(HK_TYPE_MANAGED_IRQ))) return -EINVAL; - /* No CPUs should come up or down during this. */ - cpus_read_lock(); - - if (!cpu_online(target_cpu)) { - cpus_read_unlock(); + if (!cpu_online(target_cpu)) return -EINVAL; - } /* - * Synchronizes target_cpu_store() and channel closure: + * Synchronizes vmbus_channel_set_cpu() and channel closure: * * { Initially: state = CHANNEL_OPENED } * * CPU1 CPU2 * - * [target_cpu_store()] [vmbus_disconnect_ring()] + * [vmbus_channel_set_cpu()] [vmbus_disconnect_ring()] * * LOCK channel_mutex LOCK channel_mutex * LOAD r1 = state LOAD r2 = state @@ -1830,7 +1743,6 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel, * Note. 
The host processes the channel messages "sequentially", in * the order in which they are received on a per-partition basis. */ - mutex_lock(&vmbus_connection.channel_mutex); /* * Hyper-V will ignore MODIFYCHANNEL messages for "non-open" channels; @@ -1838,17 +1750,17 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel, */ if (channel->state != CHANNEL_OPENED_STATE) { ret = -EIO; - goto cpu_store_unlock; + goto end; } origin_cpu = channel->target_cpu; if (target_cpu == origin_cpu) - goto cpu_store_unlock; + goto end; if (vmbus_send_modifychannel(channel, hv_cpu_number_to_vp_number(target_cpu))) { ret = -EIO; - goto cpu_store_unlock; + goto end; } /* @@ -1878,10 +1790,26 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel, origin_cpu, target_cpu); } -cpu_store_unlock: +end: + return ret; +} + +static ssize_t target_cpu_store(struct vmbus_channel *channel, + const char *buf, size_t count) +{ + u32 target_cpu; + ssize_t ret; + + if (sscanf(buf, "%u", &target_cpu) != 1) + return -EIO; + + cpus_read_lock(); + mutex_lock(&vmbus_connection.channel_mutex); + ret = vmbus_channel_set_cpu(channel, target_cpu); mutex_unlock(&vmbus_connection.channel_mutex); cpus_read_unlock(); - return ret; + + return ret ?: count; } static VMBUS_CHAN_ATTR(cpu, 0644, target_cpu_show, target_cpu_store); @@ -1962,6 +1890,26 @@ static ssize_t subchannel_id_show(struct vmbus_channel *channel, } static VMBUS_CHAN_ATTR_RO(subchannel_id); +static int hv_mmap_ring_buffer_wrapper(struct file *filp, struct kobject *kobj, + const struct bin_attribute *attr, + struct vm_area_struct *vma) +{ + struct vmbus_channel *channel = container_of(kobj, struct vmbus_channel, kobj); + + /* + * hv_(create|remove)_ring_sysfs implementation ensures that mmap_ring_buffer + * is not NULL. + */ + return channel->mmap_ring_buffer(channel, vma); +} + +static struct bin_attribute chan_attr_ring_buffer = { + .attr = { + .name = "ring", + .mode = 0600, + }, + .mmap = hv_mmap_ring_buffer_wrapper, +}; static struct attribute *vmbus_chan_attrs[] = { &chan_attr_out_mask.attr, &chan_attr_in_mask.attr, @@ -1981,6 +1929,11 @@ static struct attribute *vmbus_chan_attrs[] = { NULL }; +static const struct bin_attribute *vmbus_chan_bin_attrs[] = { + &chan_attr_ring_buffer, + NULL +}; + /* * Channel-level attribute_group callback function. Returns the permission for * each attribute, and returns 0 if an attribute is not visible. 
@@ -2001,16 +1954,99 @@ static umode_t vmbus_chan_attr_is_visible(struct kobject *kobj, return attr->mode; } -static struct attribute_group vmbus_chan_group = { +static umode_t vmbus_chan_bin_attr_is_visible(struct kobject *kobj, + const struct bin_attribute *attr, int idx) +{ + const struct vmbus_channel *channel = + container_of(kobj, struct vmbus_channel, kobj); + + /* Hide ring attribute if channel's ring_sysfs_visible is set to false */ + if (attr == &chan_attr_ring_buffer && !channel->ring_sysfs_visible) + return 0; + + return attr->attr.mode; +} + +static size_t vmbus_chan_bin_size(struct kobject *kobj, + const struct bin_attribute *bin_attr, int a) +{ + const struct vmbus_channel *channel = + container_of(kobj, struct vmbus_channel, kobj); + + return channel->ringbuffer_pagecount << PAGE_SHIFT; +} + +static const struct attribute_group vmbus_chan_group = { .attrs = vmbus_chan_attrs, - .is_visible = vmbus_chan_attr_is_visible + .bin_attrs = vmbus_chan_bin_attrs, + .is_visible = vmbus_chan_attr_is_visible, + .is_bin_visible = vmbus_chan_bin_attr_is_visible, + .bin_size = vmbus_chan_bin_size, }; -static struct kobj_type vmbus_chan_ktype = { +static const struct kobj_type vmbus_chan_ktype = { .sysfs_ops = &vmbus_chan_sysfs_ops, .release = vmbus_chan_release, }; +/** + * hv_create_ring_sysfs() - create "ring" sysfs entry corresponding to ring buffers for a channel. + * @channel: Pointer to vmbus_channel structure + * @hv_mmap_ring_buffer: function pointer for initializing the function to be called on mmap of + * channel's "ring" sysfs node, which is for the ring buffer of that channel. + * Function pointer is of below type: + * int (*hv_mmap_ring_buffer)(struct vmbus_channel *channel, + * struct vm_area_struct *vma)) + * This has a pointer to the channel and a pointer to vm_area_struct, + * used for mmap, as arguments. + * + * Sysfs node for ring buffer of a channel is created along with other fields, however its + * visibility is disabled by default. Sysfs creation needs to be controlled when the use-case + * is running. + * For example, HV_NIC device is used either by uio_hv_generic or hv_netvsc at any given point of + * time, and "ring" sysfs is needed only when uio_hv_generic is bound to that device. To avoid + * exposing the ring buffer by default, this function is responsible to enable visibility of + * ring for userspace to use. + * Note: Race conditions can happen with userspace and it is not encouraged to create new + * use-cases for this. This was added to maintain backward compatibility, while solving + * one of the race conditions in uio_hv_generic while creating sysfs. See comments with + * vmbus_add_dynid() and vmbus_device_register(). + * + * Returns 0 on success or error code on failure. + */ +int hv_create_ring_sysfs(struct vmbus_channel *channel, + int (*hv_mmap_ring_buffer)(struct vmbus_channel *channel, + struct vm_area_struct *vma)) +{ + struct kobject *kobj = &channel->kobj; + + channel->mmap_ring_buffer = hv_mmap_ring_buffer; + channel->ring_sysfs_visible = true; + + return sysfs_update_group(kobj, &vmbus_chan_group); +} +EXPORT_SYMBOL_GPL(hv_create_ring_sysfs); + +/** + * hv_remove_ring_sysfs() - remove ring sysfs entry corresponding to ring buffers for a channel. + * @channel: Pointer to vmbus_channel structure + * + * Hide "ring" sysfs for a channel by changing its is_visible attribute and updating sysfs group. + * + * Returns 0 on success or error code on failure. 
+ */ +int hv_remove_ring_sysfs(struct vmbus_channel *channel) +{ + struct kobject *kobj = &channel->kobj; + int ret; + + channel->ring_sysfs_visible = false; + ret = sysfs_update_group(kobj, &vmbus_chan_group); + channel->mmap_ring_buffer = NULL; + return ret; +} +EXPORT_SYMBOL_GPL(hv_remove_ring_sysfs); + /* * vmbus_add_channel_kobj - setup a sub-directory under device/channels */ @@ -2090,7 +2126,7 @@ int vmbus_device_register(struct hv_device *child_device_obj) &child_device_obj->channel->offermsg.offer.if_instance); child_device_obj->device.bus = &hv_bus; - child_device_obj->device.parent = &hv_acpi_dev->dev; + child_device_obj->device.parent = vmbus_root_device; child_device_obj->device.release = vmbus_device_release; child_device_obj->device.dma_parms = &child_device_obj->dma_parms; @@ -2108,6 +2144,20 @@ int vmbus_device_register(struct hv_device *child_device_obj) return ret; } + /* + * If device_register() found a driver to assign to the device, the + * driver's probe function has already run at this point. If that + * probe function accesses or operates on the "channels" subdirectory + * in sysfs, those operations will have failed because the "channels" + * subdirectory doesn't exist until the code below runs. Or if the + * probe function creates a /dev entry, a user space program could + * find and open the /dev entry, and then create a race by accessing + * the "channels" subdirectory while the creation steps are in progress + * here. The race can't result in a kernel failure, but the user space + * program may get an error in accessing "channels" or its + * subdirectories. See also comments with vmbus_add_dynid() about a + * related race condition. + */ child_device_obj->channels_kset = kset_create_and_add("channels", NULL, kobj); if (!child_device_obj->channels_kset) { @@ -2118,7 +2168,7 @@ int vmbus_device_register(struct hv_device *child_device_obj) ret = vmbus_add_channel_kobj(child_device_obj, child_device_obj->channel); if (ret) { - pr_err("Unable to register primary channeln"); + pr_err("Unable to register primary channel\n"); goto err_kset_unregister; } hv_debug_add_dev_dir(child_device_obj); @@ -2150,13 +2200,13 @@ void vmbus_device_unregister(struct hv_device *device_obj) */ device_unregister(&device_obj->device); } +EXPORT_SYMBOL_GPL(vmbus_device_unregister); - +#ifdef CONFIG_ACPI /* * VMBUS is an acpi enumerated device. Get the information we * need from DSDT. 
*/ -#define VTPM_BASE_ADDRESS 0xfed40000 static acpi_status vmbus_walk_resources(struct acpi_resource *res, void *ctx) { resource_size_t start = 0; @@ -2261,8 +2311,9 @@ static acpi_status vmbus_walk_resources(struct acpi_resource *res, void *ctx) return AE_OK; } +#endif -static void vmbus_acpi_remove(struct acpi_device *device) +static void vmbus_mmio_remove(void) { struct resource *cur_res; struct resource *next_res; @@ -2281,15 +2332,17 @@ static void vmbus_acpi_remove(struct acpi_device *device) } } -static void vmbus_reserve_fb(void) +static void __maybe_unused vmbus_reserve_fb(void) { resource_size_t start = 0, size; struct pci_dev *pdev; if (efi_enabled(EFI_BOOT)) { /* Gen2 VM: get FB base from EFI framebuffer */ - start = screen_info.lfb_base; - size = max_t(__u32, screen_info.lfb_size, 0x800000); + if (IS_ENABLED(CONFIG_SYSFB)) { + start = screen_info.lfb_base; + size = max_t(__u32, screen_info.lfb_size, 0x800000); + } } else { /* Gen1 VM: get FB base from PCI */ pdev = pci_get_device(PCI_VENDOR_ID_MICROSOFT, @@ -2429,25 +2482,40 @@ void vmbus_free_mmio(resource_size_t start, resource_size_t size) struct resource *iter; mutex_lock(&hyperv_mmio_lock); + + /* + * If all bytes of the MMIO range to be released are within the + * special case fb_mmio shadow region, skip releasing the shadow + * region since no corresponding __request_region() was done + * in vmbus_allocate_mmio(). + */ + if (fb_mmio && start >= fb_mmio->start && + (start + size - 1 <= fb_mmio->end)) + goto skip_shadow_release; + for (iter = hyperv_mmio; iter; iter = iter->sibling) { if ((iter->start >= start + size) || (iter->end <= start)) continue; __release_region(iter, start, size); } + +skip_shadow_release: release_mem_region(start, size); mutex_unlock(&hyperv_mmio_lock); } EXPORT_SYMBOL_GPL(vmbus_free_mmio); -static int vmbus_acpi_add(struct acpi_device *device) +#ifdef CONFIG_ACPI +static int vmbus_acpi_add(struct platform_device *pdev) { acpi_status result; int ret_val = -ENODEV; struct acpi_device *ancestor; + struct acpi_device *device = ACPI_COMPANION(&pdev->dev); - hv_acpi_dev = device; + vmbus_root_device = &device->dev; /* * Older versions of Hyper-V for ARM64 fail to include the _CCA @@ -2473,7 +2541,8 @@ static int vmbus_acpi_add(struct acpi_device *device) * Some ancestor of the vmbus acpi device (Gen1 or Gen2 * firmware) is the VMOD that has the mmio ranges. Get that. */ - for (ancestor = acpi_dev_parent(device); ancestor; + for (ancestor = acpi_dev_parent(device); + ancestor && ancestor->handle != ACPI_ROOT_OBJECT; ancestor = acpi_dev_parent(ancestor)) { result = acpi_walk_resources(ancestor->handle, METHOD_NAME__CRS, vmbus_walk_resources, NULL); @@ -2489,9 +2558,94 @@ static int vmbus_acpi_add(struct acpi_device *device) acpi_walk_err: if (ret_val) - vmbus_acpi_remove(device); + vmbus_mmio_remove(); return ret_val; } +#else +static int vmbus_acpi_add(struct platform_device *pdev) +{ + return 0; +} +#endif +#ifndef HYPERVISOR_CALLBACK_VECTOR +static int vmbus_set_irq(struct platform_device *pdev) +{ + struct irq_data *data; + int irq; + irq_hw_number_t hwirq; + + irq = platform_get_irq(pdev, 0); + /* platform_get_irq() may not return 0. 
*/ + if (irq < 0) + return irq; + + data = irq_get_irq_data(irq); + if (!data) { + pr_err("No interrupt data for VMBus virq %d\n", irq); + return -ENODEV; + } + hwirq = irqd_to_hwirq(data); + + vmbus_irq = irq; + vmbus_interrupt = hwirq; + pr_debug("VMBus virq %d, hwirq %d\n", vmbus_irq, vmbus_interrupt); + + return 0; +} +#endif + +static int vmbus_device_add(struct platform_device *pdev) +{ + struct resource **cur_res = &hyperv_mmio; + struct of_range range; + struct of_range_parser parser; + struct device_node *np = pdev->dev.of_node; + int ret; + + vmbus_root_device = &pdev->dev; + + ret = of_range_parser_init(&parser, np); + if (ret) + return ret; + +#ifndef HYPERVISOR_CALLBACK_VECTOR + ret = vmbus_set_irq(pdev); + if (ret) + return ret; +#endif + for_each_of_range(&parser, &range) { + struct resource *res; + + res = kzalloc(sizeof(*res), GFP_KERNEL); + if (!res) { + vmbus_mmio_remove(); + return -ENOMEM; + } + + res->name = "hyperv mmio"; + res->flags = range.flags; + res->start = range.cpu_addr; + res->end = range.cpu_addr + range.size; + + *cur_res = res; + cur_res = &res->sibling; + } + + return ret; +} + +static int vmbus_platform_driver_probe(struct platform_device *pdev) +{ + if (acpi_disabled) + return vmbus_device_add(pdev); + else + return vmbus_acpi_add(pdev); +} + +static void vmbus_platform_driver_remove(struct platform_device *pdev) +{ + vmbus_mmio_remove(); +} #ifdef CONFIG_PM_SLEEP static int vmbus_bus_suspend(struct device *dev) @@ -2537,11 +2691,6 @@ static int vmbus_bus_suspend(struct device *dev) if (atomic_read(&vmbus_connection.nr_chan_close_on_suspend) > 0) wait_for_completion(&vmbus_connection.ready_for_suspend_event); - if (atomic_read(&vmbus_connection.nr_chan_fixup_on_resume) != 0) { - pr_err("Can not suspend due to a previous failed resuming\n"); - return -EBUSY; - } - mutex_lock(&vmbus_connection.channel_mutex); list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { @@ -2566,22 +2715,18 @@ static int vmbus_bus_suspend(struct device *dev) pr_err("Sub-channel not deleted!\n"); WARN_ON_ONCE(1); } - - atomic_inc(&vmbus_connection.nr_chan_fixup_on_resume); } mutex_unlock(&vmbus_connection.channel_mutex); vmbus_initiate_unload(false); - /* Reset the event for the next resume. */ - reinit_completion(&vmbus_connection.ready_for_resume_event); - return 0; } static int vmbus_bus_resume(struct device *dev) { + struct vmbus_channel *channel; struct vmbus_channel_msginfo *msginfo; size_t msgsize; int ret; @@ -2612,13 +2757,23 @@ static int vmbus_bus_resume(struct device *dev) if (ret != 0) return ret; - WARN_ON(atomic_read(&vmbus_connection.nr_chan_fixup_on_resume) == 0); - vmbus_request_offers(); - if (wait_for_completion_timeout( - &vmbus_connection.ready_for_resume_event, 10 * HZ) == 0) - pr_err("Some vmbus device is missing after suspending?\n"); + mutex_lock(&vmbus_connection.channel_mutex); + list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { + if (channel->offermsg.child_relid != INVALID_RELID) + continue; + + /* hvsock channels are not expected to be present. */ + if (is_hvsock_channel(channel)) + continue; + + pr_err("channel %pUl/%pUl not present after resume.\n", + &channel->offermsg.offer.if_type, + &channel->offermsg.offer.if_instance); + /* ToDo: Cleanup these channels here */ + } + mutex_unlock(&vmbus_connection.channel_mutex); /* Reset the event for the next suspend. 
*/ reinit_completion(&vmbus_connection.ready_for_suspend_event); @@ -2630,7 +2785,17 @@ static int vmbus_bus_resume(struct device *dev) #define vmbus_bus_resume NULL #endif /* CONFIG_PM_SLEEP */ -static const struct acpi_device_id vmbus_acpi_device_ids[] = { +static const __maybe_unused struct of_device_id vmbus_of_match[] = { + { + .compatible = "microsoft,vmbus", + }, + { + /* sentinel */ + }, +}; +MODULE_DEVICE_TABLE(of, vmbus_of_match); + +static const __maybe_unused struct acpi_device_id vmbus_acpi_device_ids[] = { {"VMBUS", 0}, {"VMBus", 0}, {"", 0}, @@ -2658,15 +2823,16 @@ static const struct dev_pm_ops vmbus_bus_pm = { .restore_noirq = vmbus_bus_resume }; -static struct acpi_driver vmbus_acpi_driver = { - .name = "vmbus", - .ids = vmbus_acpi_device_ids, - .ops = { - .add = vmbus_acpi_add, - .remove = vmbus_acpi_remove, - }, - .drv.pm = &vmbus_bus_pm, - .drv.probe_type = PROBE_FORCE_SYNCHRONOUS, +static struct platform_driver vmbus_platform_driver = { + .probe = vmbus_platform_driver_probe, + .remove = vmbus_platform_driver_remove, + .driver = { + .name = "vmbus", + .acpi_match_table = ACPI_PTR(vmbus_acpi_device_ids), + .of_match_table = of_match_ptr(vmbus_of_match), + .pm = &vmbus_bus_pm, + .probe_type = PROBE_FORCE_SYNCHRONOUS, + } }; static void hv_kexec_handler(void) @@ -2690,10 +2856,10 @@ static void hv_crash_handler(struct pt_regs *regs) */ cpu = smp_processor_id(); hv_stimer_cleanup(cpu); - hv_synic_disable_regs(cpu); + hv_hyp_synic_disable_regs(cpu); }; -static int hv_synic_suspend(void) +static int hv_synic_suspend(void *data) { /* * When we reach here, all the non-boot CPUs have been offlined. @@ -2715,14 +2881,14 @@ static int hv_synic_suspend(void) * interrupts-disabled context. */ - hv_synic_disable_regs(0); + hv_hyp_synic_disable_regs(0); return 0; } -static void hv_synic_resume(void) +static void hv_synic_resume(void *data) { - hv_synic_enable_regs(0); + hv_hyp_synic_enable_regs(0); /* * Note: we don't need to call hv_stimer_init(0), because the timer @@ -2732,11 +2898,15 @@ static void hv_synic_resume(void) } /* The callbacks run only on CPU0, with irqs_disabled. */ -static struct syscore_ops hv_synic_syscore_ops = { +static const struct syscore_ops hv_synic_syscore_ops = { .suspend = hv_synic_suspend, .resume = hv_synic_resume, }; +static struct syscore hv_synic_syscore = { + .ops = &hv_synic_syscore_ops, +}; + static int __init hv_acpi_init(void) { int ret; @@ -2744,18 +2914,17 @@ static int __init hv_acpi_init(void) if (!hv_is_hyperv_initialized()) return -ENODEV; - if (hv_root_partition) + if (hv_root_partition() && !hv_nested) return 0; /* * Get ACPI resources first. 
*/ - ret = acpi_bus_register_driver(&vmbus_acpi_driver); - + ret = platform_driver_register(&vmbus_platform_driver); if (ret) return ret; - if (!hv_acpi_dev) { + if (!vmbus_root_device) { ret = -ENODEV; goto cleanup; } @@ -2780,13 +2949,13 @@ static int __init hv_acpi_init(void) hv_setup_kexec_handler(hv_kexec_handler); hv_setup_crash_handler(hv_crash_handler); - register_syscore_ops(&hv_synic_syscore_ops); + register_syscore(&hv_synic_syscore); return 0; cleanup: - acpi_bus_unregister_driver(&vmbus_acpi_driver); - hv_acpi_dev = NULL; + platform_driver_unregister(&vmbus_platform_driver); + vmbus_root_device = NULL; return ret; } @@ -2794,7 +2963,7 @@ static void __exit vmbus_exit(void) { int cpu; - unregister_syscore_ops(&hv_synic_syscore_ops); + unregister_syscore(&hv_synic_syscore); hv_remove_kexec_handler(); hv_remove_crash_handler(); @@ -2818,13 +2987,6 @@ static void __exit vmbus_exit(void) vmbus_free_channels(); kfree(vmbus_connection.channels); - if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) { - kmsg_dump_unregister(&hv_kmsg_dumper); - unregister_die_notifier(&hyperv_die_report_block); - atomic_notifier_chain_unregister(&panic_notifier_list, - &hyperv_panic_report_block); - } - /* * The vmbus panic notifier is always registered, hence we should * also unconditionally unregister it here as well. @@ -2832,14 +2994,11 @@ static void __exit vmbus_exit(void) atomic_notifier_chain_unregister(&panic_notifier_list, &hyperv_panic_vmbus_unload_block); - free_page((unsigned long)hv_panic_page); - unregister_sysctl_table(hv_ctl_table_hdr); - hv_ctl_table_hdr = NULL; bus_unregister(&hv_bus); cpuhp_remove_state(hyperv_cpuhp_online); hv_synic_free(); - acpi_bus_unregister_driver(&vmbus_acpi_driver); + platform_driver_unregister(&vmbus_platform_driver); } |
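Note on the ring-buffer change above: the setup code now always maps the same set of pages twice, back to back, so a read or write that crosses the end of the ring continues seamlessly at the beginning without being split, and the mapping is created decrypted unless the channel is confidential. The same wraparound idea can be sketched in ordinary userspace C with memfd_create() and two MAP_FIXED mappings of the same pages; everything below (names, the one-page size, the demo string) is purely illustrative and is not part of the driver.

/* Userspace sketch of the "map the buffer twice" wraparound trick that the
 * ring-buffer code uses via vmap(): two consecutive virtual ranges alias the
 * same pages, so writes that cross the end of the buffer wrap transparently.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t size = 4096;			/* one page of ring data for the demo */
	int fd = memfd_create("ring-demo", 0);	/* glibc >= 2.27 */

	if (fd < 0 || ftruncate(fd, size) < 0)
		return 1;

	/* Reserve 2 * size of address space, then map the same pages twice,
	 * back to back, just as vmap() does with the doubled page array. */
	char *base = mmap(NULL, 2 * size, PROT_NONE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (base == MAP_FAILED)
		return 1;
	if (mmap(base, size, PROT_READ | PROT_WRITE,
		 MAP_SHARED | MAP_FIXED, fd, 0) == MAP_FAILED ||
	    mmap(base + size, size, PROT_READ | PROT_WRITE,
		 MAP_SHARED | MAP_FIXED, fd, 0) == MAP_FAILED)
		return 1;

	/* A write starting 4 bytes before the end wraps transparently. */
	memcpy(base + size - 4, "wraparound", sizeof("wraparound"));

	/* The overflowing bytes alias the start of the buffer. */
	printf("start of buffer: \"%s\"\n", base);	/* prints "around" */
	return 0;
}

The benefit, in the driver as in this sketch, is that ring readers and writers never have to handle a split copy at the wrap point.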
