Diffstat (limited to 'drivers/hv')
-rw-r--r-- | drivers/hv/Kconfig | 1
-rw-r--r-- | drivers/hv/Makefile | 2
-rw-r--r-- | drivers/hv/channel.c | 29
-rw-r--r-- | drivers/hv/channel_mgmt.c | 76
-rw-r--r-- | drivers/hv/connection.c | 33
-rw-r--r-- | drivers/hv/hv.c | 79
-rw-r--r-- | drivers/hv/hv_balloon.c | 214
-rw-r--r-- | drivers/hv/hv_common.c | 128
-rw-r--r-- | drivers/hv/hv_fcopy.c | 427
-rw-r--r-- | drivers/hv/hv_kvp.c | 12
-rw-r--r-- | drivers/hv/hv_snapshot.c | 11
-rw-r--r-- | drivers/hv/hv_util.c | 25
-rw-r--r-- | drivers/hv/hyperv_vmbus.h | 29
-rw-r--r-- | drivers/hv/vmbus_drv.c | 188 |
14 files changed, 490 insertions(+), 764 deletions(-)
diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig index 00242107d62e..862c47b191af 100644 --- a/drivers/hv/Kconfig +++ b/drivers/hv/Kconfig @@ -16,6 +16,7 @@ config HYPERV config HYPERV_VTL_MODE bool "Enable Linux to boot in VTL context" depends on X86_64 && HYPERV + depends on SMP default n help Virtual Secure Mode (VSM) is a set of hypervisor capabilities and diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile index d76df5c8c2a9..b992c0ed182b 100644 --- a/drivers/hv/Makefile +++ b/drivers/hv/Makefile @@ -10,7 +10,7 @@ hv_vmbus-y := vmbus_drv.o \ hv.o connection.o channel.o \ channel_mgmt.o ring_buffer.o hv_trace.o hv_vmbus-$(CONFIG_HYPERV_TESTING) += hv_debugfs.o -hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_fcopy.o hv_utils_transport.o +hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_utils_transport.o # Code that must be built-in obj-$(subst m,y,$(CONFIG_HYPERV)) += hv_common.o diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index adbf674355b2..fb8cd8469328 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -153,7 +153,9 @@ void vmbus_free_ring(struct vmbus_channel *channel) hv_ringbuffer_cleanup(&channel->inbound); if (channel->ringbuffer_page) { - __free_pages(channel->ringbuffer_page, + /* In a CoCo VM leak the memory if it didn't get re-encrypted */ + if (!channel->ringbuffer_gpadlhandle.decrypted) + __free_pages(channel->ringbuffer_page, get_order(channel->ringbuffer_pagecount << PAGE_SHIFT)); channel->ringbuffer_page = NULL; @@ -436,9 +438,18 @@ static int __vmbus_establish_gpadl(struct vmbus_channel *channel, (atomic_inc_return(&vmbus_connection.next_gpadl_handle) - 1); ret = create_gpadl_header(type, kbuffer, size, send_offset, &msginfo); - if (ret) + if (ret) { + gpadl->decrypted = false; return ret; + } + /* + * Set the "decrypted" flag to true for the set_memory_decrypted() + * success case. In the failure case, the encryption state of the + * memory is unknown. Leave "decrypted" as true to ensure the + * memory will be leaked instead of going back on the free list. + */ + gpadl->decrypted = true; ret = set_memory_decrypted((unsigned long)kbuffer, PFN_UP(size)); if (ret) { @@ -527,9 +538,15 @@ cleanup: kfree(msginfo); - if (ret) - set_memory_encrypted((unsigned long)kbuffer, - PFN_UP(size)); + if (ret) { + /* + * If set_memory_encrypted() fails, the decrypted flag is + * left as true so the memory is leaked instead of being + * put back on the free list. + */ + if (!set_memory_encrypted((unsigned long)kbuffer, PFN_UP(size))) + gpadl->decrypted = false; + } return ret; } @@ -850,6 +867,8 @@ post_msg_err: if (ret) pr_warn("Fail to set mem host visibility in GPADL teardown %d.\n", ret); + gpadl->decrypted = ret; + return ret; } EXPORT_SYMBOL_GPL(vmbus_teardown_gpadl); diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 2f4d09ce027a..6e084c207414 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -120,7 +120,9 @@ const struct vmbus_device vmbus_devs[] = { }, /* File copy */ - { .dev_type = HV_FCOPY, + /* fcopy always uses 16KB ring buffer size and is working well for last many years */ + { .pref_ring_size = 0x4000, + .dev_type = HV_FCOPY, HV_FCOPY_GUID, .perf_device = false, .allowed_in_isolated = false, @@ -140,12 +142,19 @@ const struct vmbus_device vmbus_devs[] = { .allowed_in_isolated = false, }, - /* Unknown GUID */ - { .dev_type = HV_UNKNOWN, + /* + * Unknown GUID + * 64 KB ring buffer + 4 KB header should be sufficient size for any Hyper-V device apart + * from HV_NIC and HV_SCSI. 
This case avoid the fallback for unknown devices to allocate + * much bigger (2 MB) of ring size. + */ + { .pref_ring_size = 0x11000, + .dev_type = HV_UNKNOWN, .perf_device = false, .allowed_in_isolated = false, }, }; +EXPORT_SYMBOL_GPL(vmbus_devs); static const struct { guid_t guid; @@ -935,16 +944,6 @@ void vmbus_initiate_unload(bool crash) vmbus_wait_for_unload(); } -static void check_ready_for_resume_event(void) -{ - /* - * If all the old primary channels have been fixed up, then it's safe - * to resume. - */ - if (atomic_dec_and_test(&vmbus_connection.nr_chan_fixup_on_resume)) - complete(&vmbus_connection.ready_for_resume_event); -} - static void vmbus_setup_channel_state(struct vmbus_channel *channel, struct vmbus_channel_offer_channel *offer) { @@ -1100,8 +1099,6 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) /* Add the channel back to the array of channels. */ vmbus_channel_map_relid(oldchannel); - check_ready_for_resume_event(); - mutex_unlock(&vmbus_connection.channel_mutex); return; } @@ -1287,13 +1284,28 @@ EXPORT_SYMBOL_GPL(vmbus_hvsock_device_unregister); /* * vmbus_onoffers_delivered - - * This is invoked when all offers have been delivered. + * The CHANNELMSG_ALLOFFERS_DELIVERED message arrives after all + * boot-time offers are delivered. A boot-time offer is for the primary + * channel for any virtual hardware configured in the VM at the time it boots. + * Boot-time offers include offers for physical devices assigned to the VM + * via Hyper-V's Discrete Device Assignment (DDA) functionality that are + * handled as virtual PCI devices in Linux (e.g., NVMe devices and GPUs). + * Boot-time offers do not include offers for VMBus sub-channels. Because + * devices can be hot-added to the VM after it is booted, additional channel + * offers that aren't boot-time offers can be received at any time after the + * all-offers-delivered message. * - * Nothing to do here. + * SR-IOV NIC Virtual Functions (VFs) assigned to a VM are not considered + * to be assigned to the VM at boot-time, and offers for VFs may occur after + * the all-offers-delivered message. VFs are optional accelerators to the + * synthetic VMBus NIC and are effectively hot-added only after the VMBus + * NIC channel is opened (once it knows the guest can support it, via the + * sriov bit in the netvsc protocol). */ static void vmbus_onoffers_delivered( struct vmbus_channel_message_header *hdr) { + complete(&vmbus_connection.all_offers_delivered_event); } /* @@ -1569,7 +1581,8 @@ void vmbus_onmessage(struct vmbus_channel_message_header *hdr) } /* - * vmbus_request_offers - Send a request to get all our pending offers. + * vmbus_request_offers - Send a request to get all our pending offers + * and wait for all boot-time offers to arrive. */ int vmbus_request_offers(void) { @@ -1587,6 +1600,10 @@ int vmbus_request_offers(void) msg->msgtype = CHANNELMSG_REQUESTOFFERS; + /* + * This REQUESTOFFERS message will result in the host sending an all + * offers delivered message after all the boot-time offers are sent. + */ ret = vmbus_post_msg(msg, sizeof(struct vmbus_channel_message_header), true); @@ -1598,6 +1615,29 @@ int vmbus_request_offers(void) goto cleanup; } + /* + * Wait for the host to send all boot-time offers. + * Keeping it as a best-effort mechanism, where a warning is + * printed if a timeout occurs, and execution is resumed. 
+ */ + if (!wait_for_completion_timeout(&vmbus_connection.all_offers_delivered_event, + secs_to_jiffies(60))) { + pr_warn("timed out waiting for all boot-time offers to be delivered.\n"); + } + + /* + * Flush handling of offer messages (which may initiate work on + * other work queues). + */ + flush_workqueue(vmbus_connection.work_queue); + + /* + * Flush workqueue for processing the incoming offers. Subchannel + * offers and their processing can happen later, so there is no need to + * flush that workqueue here. + */ + flush_workqueue(vmbus_connection.handle_primary_chan_wq); + cleanup: kfree(msginfo); diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index 3cabeeabb1ca..8351360bba16 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -34,8 +34,8 @@ struct vmbus_connection vmbus_connection = { .ready_for_suspend_event = COMPLETION_INITIALIZER( vmbus_connection.ready_for_suspend_event), - .ready_for_resume_event = COMPLETION_INITIALIZER( - vmbus_connection.ready_for_resume_event), + .all_offers_delivered_event = COMPLETION_INITIALIZER( + vmbus_connection.all_offers_delivered_event), }; EXPORT_SYMBOL_GPL(vmbus_connection); @@ -237,8 +237,17 @@ int vmbus_connect(void) vmbus_connection.monitor_pages[0], 1); ret |= set_memory_decrypted((unsigned long) vmbus_connection.monitor_pages[1], 1); - if (ret) + if (ret) { + /* + * If set_memory_decrypted() fails, the encryption state + * of the memory is unknown. So leak the memory instead + * of risking returning decrypted memory to the free list. + * For simplicity, always handle both pages the same. + */ + vmbus_connection.monitor_pages[0] = NULL; + vmbus_connection.monitor_pages[1] = NULL; goto cleanup; + } /* * Set_memory_decrypted() will change the memory contents if @@ -337,13 +346,19 @@ void vmbus_disconnect(void) vmbus_connection.int_page = NULL; } - set_memory_encrypted((unsigned long)vmbus_connection.monitor_pages[0], 1); - set_memory_encrypted((unsigned long)vmbus_connection.monitor_pages[1], 1); + if (vmbus_connection.monitor_pages[0]) { + if (!set_memory_encrypted( + (unsigned long)vmbus_connection.monitor_pages[0], 1)) + hv_free_hyperv_page(vmbus_connection.monitor_pages[0]); + vmbus_connection.monitor_pages[0] = NULL; + } - hv_free_hyperv_page(vmbus_connection.monitor_pages[0]); - hv_free_hyperv_page(vmbus_connection.monitor_pages[1]); - vmbus_connection.monitor_pages[0] = NULL; - vmbus_connection.monitor_pages[1] = NULL; + if (vmbus_connection.monitor_pages[1]) { + if (!set_memory_encrypted( + (unsigned long)vmbus_connection.monitor_pages[1], 1)) + hv_free_hyperv_page(vmbus_connection.monitor_pages[1]); + vmbus_connection.monitor_pages[1] = NULL; + } } /* diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index 51e5018ac9b2..36d9ba097ff5 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -45,8 +45,8 @@ int hv_init(void) * This involves a hypercall. 
*/ int hv_post_message(union hv_connection_id connection_id, - enum hv_message_type message_type, - void *payload, size_t payload_size) + enum hv_message_type message_type, + void *payload, size_t payload_size) { struct hv_input_post_message *aligned_msg; unsigned long flags; @@ -86,7 +86,7 @@ int hv_post_message(union hv_connection_id connection_id, status = HV_STATUS_INVALID_PARAMETER; } else { status = hv_do_hypercall(HVCALL_POST_MESSAGE, - aligned_msg, NULL); + aligned_msg, NULL); } local_irq_restore(flags); @@ -111,7 +111,7 @@ int hv_synic_alloc(void) hv_context.hv_numa_map = kcalloc(nr_node_ids, sizeof(struct cpumask), GFP_KERNEL); - if (hv_context.hv_numa_map == NULL) { + if (!hv_context.hv_numa_map) { pr_err("Unable to allocate NUMA map\n"); goto err; } @@ -120,11 +120,11 @@ int hv_synic_alloc(void) hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); tasklet_init(&hv_cpu->msg_dpc, - vmbus_on_msg_dpc, (unsigned long) hv_cpu); + vmbus_on_msg_dpc, (unsigned long)hv_cpu); if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) { hv_cpu->post_msg_page = (void *)get_zeroed_page(GFP_ATOMIC); - if (hv_cpu->post_msg_page == NULL) { + if (!hv_cpu->post_msg_page) { pr_err("Unable to allocate post msg page\n"); goto err; } @@ -147,14 +147,14 @@ int hv_synic_alloc(void) if (!ms_hyperv.paravisor_present && !hv_root_partition) { hv_cpu->synic_message_page = (void *)get_zeroed_page(GFP_ATOMIC); - if (hv_cpu->synic_message_page == NULL) { + if (!hv_cpu->synic_message_page) { pr_err("Unable to allocate SYNIC message page\n"); goto err; } hv_cpu->synic_event_page = (void *)get_zeroed_page(GFP_ATOMIC); - if (hv_cpu->synic_event_page == NULL) { + if (!hv_cpu->synic_event_page) { pr_err("Unable to allocate SYNIC event page\n"); free_page((unsigned long)hv_cpu->synic_message_page); @@ -203,14 +203,13 @@ err: return ret; } - void hv_synic_free(void) { int cpu, ret; for_each_present_cpu(cpu) { - struct hv_per_cpu_context *hv_cpu - = per_cpu_ptr(hv_context.cpu_context, cpu); + struct hv_per_cpu_context *hv_cpu = + per_cpu_ptr(hv_context.cpu_context, cpu); /* It's better to leak the page if the encryption fails. */ if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) { @@ -262,23 +261,23 @@ void hv_synic_free(void) */ void hv_synic_enable_regs(unsigned int cpu) { - struct hv_per_cpu_context *hv_cpu - = per_cpu_ptr(hv_context.cpu_context, cpu); + struct hv_per_cpu_context *hv_cpu = + per_cpu_ptr(hv_context.cpu_context, cpu); union hv_synic_simp simp; union hv_synic_siefp siefp; union hv_synic_sint shared_sint; union hv_synic_scontrol sctrl; /* Setup the Synic's message page */ - simp.as_uint64 = hv_get_register(HV_REGISTER_SIMP); + simp.as_uint64 = hv_get_msr(HV_MSR_SIMP); simp.simp_enabled = 1; if (ms_hyperv.paravisor_present || hv_root_partition) { /* Mask out vTOM bit. 
ioremap_cache() maps decrypted */ u64 base = (simp.base_simp_gpa << HV_HYP_PAGE_SHIFT) & ~ms_hyperv.shared_gpa_boundary; - hv_cpu->synic_message_page - = (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE); + hv_cpu->synic_message_page = + (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE); if (!hv_cpu->synic_message_page) pr_err("Fail to map synic message page.\n"); } else { @@ -286,18 +285,18 @@ void hv_synic_enable_regs(unsigned int cpu) >> HV_HYP_PAGE_SHIFT; } - hv_set_register(HV_REGISTER_SIMP, simp.as_uint64); + hv_set_msr(HV_MSR_SIMP, simp.as_uint64); /* Setup the Synic's event page */ - siefp.as_uint64 = hv_get_register(HV_REGISTER_SIEFP); + siefp.as_uint64 = hv_get_msr(HV_MSR_SIEFP); siefp.siefp_enabled = 1; if (ms_hyperv.paravisor_present || hv_root_partition) { /* Mask out vTOM bit. ioremap_cache() maps decrypted */ u64 base = (siefp.base_siefp_gpa << HV_HYP_PAGE_SHIFT) & ~ms_hyperv.shared_gpa_boundary; - hv_cpu->synic_event_page - = (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE); + hv_cpu->synic_event_page = + (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE); if (!hv_cpu->synic_event_page) pr_err("Fail to map synic event page.\n"); } else { @@ -305,13 +304,12 @@ void hv_synic_enable_regs(unsigned int cpu) >> HV_HYP_PAGE_SHIFT; } - hv_set_register(HV_REGISTER_SIEFP, siefp.as_uint64); + hv_set_msr(HV_MSR_SIEFP, siefp.as_uint64); /* Setup the shared SINT. */ if (vmbus_irq != -1) enable_percpu_irq(vmbus_irq, 0); - shared_sint.as_uint64 = hv_get_register(HV_REGISTER_SINT0 + - VMBUS_MESSAGE_SINT); + shared_sint.as_uint64 = hv_get_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT); shared_sint.vector = vmbus_interrupt; shared_sint.masked = false; @@ -326,14 +324,13 @@ void hv_synic_enable_regs(unsigned int cpu) #else shared_sint.auto_eoi = 0; #endif - hv_set_register(HV_REGISTER_SINT0 + VMBUS_MESSAGE_SINT, - shared_sint.as_uint64); + hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64); /* Enable the global synic bit */ - sctrl.as_uint64 = hv_get_register(HV_REGISTER_SCONTROL); + sctrl.as_uint64 = hv_get_msr(HV_MSR_SCONTROL); sctrl.enable = 1; - hv_set_register(HV_REGISTER_SCONTROL, sctrl.as_uint64); + hv_set_msr(HV_MSR_SCONTROL, sctrl.as_uint64); } int hv_synic_init(unsigned int cpu) @@ -345,29 +342,24 @@ int hv_synic_init(unsigned int cpu) return 0; } -/* - * hv_synic_cleanup - Cleanup routine for hv_synic_init(). - */ void hv_synic_disable_regs(unsigned int cpu) { - struct hv_per_cpu_context *hv_cpu - = per_cpu_ptr(hv_context.cpu_context, cpu); + struct hv_per_cpu_context *hv_cpu = + per_cpu_ptr(hv_context.cpu_context, cpu); union hv_synic_sint shared_sint; union hv_synic_simp simp; union hv_synic_siefp siefp; union hv_synic_scontrol sctrl; - shared_sint.as_uint64 = hv_get_register(HV_REGISTER_SINT0 + - VMBUS_MESSAGE_SINT); + shared_sint.as_uint64 = hv_get_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT); shared_sint.masked = 1; /* Need to correctly cleanup in the case of SMP!!! */ /* Disable the interrupt */ - hv_set_register(HV_REGISTER_SINT0 + VMBUS_MESSAGE_SINT, - shared_sint.as_uint64); + hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64); - simp.as_uint64 = hv_get_register(HV_REGISTER_SIMP); + simp.as_uint64 = hv_get_msr(HV_MSR_SIMP); /* * In Isolation VM, sim and sief pages are allocated by * paravisor. 
These pages also will be used by kdump @@ -382,9 +374,9 @@ void hv_synic_disable_regs(unsigned int cpu) simp.base_simp_gpa = 0; } - hv_set_register(HV_REGISTER_SIMP, simp.as_uint64); + hv_set_msr(HV_MSR_SIMP, simp.as_uint64); - siefp.as_uint64 = hv_get_register(HV_REGISTER_SIEFP); + siefp.as_uint64 = hv_get_msr(HV_MSR_SIEFP); siefp.siefp_enabled = 0; if (ms_hyperv.paravisor_present || hv_root_partition) { @@ -394,12 +386,12 @@ void hv_synic_disable_regs(unsigned int cpu) siefp.base_siefp_gpa = 0; } - hv_set_register(HV_REGISTER_SIEFP, siefp.as_uint64); + hv_set_msr(HV_MSR_SIEFP, siefp.as_uint64); /* Disable the global synic bit */ - sctrl.as_uint64 = hv_get_register(HV_REGISTER_SCONTROL); + sctrl.as_uint64 = hv_get_msr(HV_MSR_SCONTROL); sctrl.enable = 0; - hv_set_register(HV_REGISTER_SCONTROL, sctrl.as_uint64); + hv_set_msr(HV_MSR_SCONTROL, sctrl.as_uint64); if (vmbus_irq != -1) disable_percpu_irq(vmbus_irq); @@ -441,6 +433,9 @@ retry: return pending; } +/* + * hv_synic_cleanup - Cleanup routine for hv_synic_init(). + */ int hv_synic_cleanup(unsigned int cpu) { struct vmbus_channel *channel, *sc; diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index e000fa3b9f97..fec2f18679e3 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -25,9 +25,10 @@ #include <linux/notifier.h> #include <linux/percpu_counter.h> #include <linux/page_reporting.h> +#include <linux/sizes.h> #include <linux/hyperv.h> -#include <asm/hyperv-tlfs.h> +#include <hyperv/hvhdk.h> #include <asm/mshyperv.h> @@ -41,8 +42,6 @@ * Begin protocol definitions. */ - - /* * Protocol versions. The low word is the minor version, the high word the major * version. @@ -71,8 +70,6 @@ enum { DYNMEM_PROTOCOL_VERSION_CURRENT = DYNMEM_PROTOCOL_VERSION_WIN10 }; - - /* * Message Types */ @@ -101,7 +98,6 @@ enum dm_message_type { DM_VERSION_1_MAX = 12 }; - /* * Structures defining the dynamic memory management * protocol. @@ -115,7 +111,6 @@ union dm_version { __u32 version; } __packed; - union dm_caps { struct { __u64 balloon:1; @@ -148,8 +143,6 @@ union dm_mem_page_range { __u64 page_range; } __packed; - - /* * The header for all dynamic memory messages: * @@ -174,7 +167,6 @@ struct dm_message { __u8 data[]; /* enclosed message */ } __packed; - /* * Specific message types supporting the dynamic memory protocol. */ @@ -271,7 +263,6 @@ struct dm_status { __u32 io_diff; } __packed; - /* * Message to ask the guest to allocate memory - balloon up message. * This message is sent from the host to the guest. The guest may not be @@ -286,14 +277,13 @@ struct dm_balloon { __u32 reservedz; } __packed; - /* * Balloon response message; this message is sent from the guest * to the host in response to the balloon message. * * reservedz: Reserved; must be set to zero. * more_pages: If FALSE, this is the last message of the transaction. - * if TRUE there will atleast one more message from the guest. + * if TRUE there will be at least one more message from the guest. * * range_count: The number of ranges in the range array. * @@ -314,7 +304,7 @@ struct dm_balloon_response { * to the guest to give guest more memory. * * more_pages: If FALSE, this is the last message of the transaction. - * if TRUE there will atleast one more message from the guest. + * if TRUE there will be at least one more message from the guest. * * reservedz: Reserved; must be set to zero. * @@ -342,7 +332,6 @@ struct dm_unballoon_response { struct dm_header hdr; } __packed; - /* * Hot add request message. Message sent from the host to the guest. 
* @@ -390,7 +379,6 @@ enum dm_info_type { MAX_INFO_TYPE }; - /* * Header for the information message. */ @@ -425,11 +413,11 @@ struct dm_info_msg { * The range start_pfn : end_pfn specifies the range * that the host has asked us to hot add. The range * start_pfn : ha_end_pfn specifies the range that we have - * currently hot added. We hot add in multiples of 128M - * chunks; it is possible that we may not be able to bring - * online all the pages in the region. The range + * currently hot added. We hot add in chunks equal to the + * memory block size; it is possible that we may not be able + * to bring online all the pages in the region. The range * covered_start_pfn:covered_end_pfn defines the pages that can - * be brough online. + * be brought online. */ struct hv_hotadd_state { @@ -480,10 +468,10 @@ static unsigned long last_post_time; static int hv_hypercall_multi_failure; -module_param(hot_add, bool, (S_IRUGO | S_IWUSR)); +module_param(hot_add, bool, 0644); MODULE_PARM_DESC(hot_add, "If set attempt memory hot_add"); -module_param(pressure_report_delay, uint, (S_IRUGO | S_IWUSR)); +module_param(pressure_report_delay, uint, 0644); MODULE_PARM_DESC(pressure_report_delay, "Delay in secs in reporting pressure"); static atomic_t trans_id = ATOMIC_INIT(0); @@ -502,11 +490,13 @@ enum hv_dm_state { DM_INIT_ERROR }; - static __u8 recv_buffer[HV_HYP_PAGE_SIZE]; static __u8 balloon_up_send_buffer[HV_HYP_PAGE_SIZE]; + +static unsigned long ha_pages_in_chunk; +#define HA_BYTES_IN_CHUNK (ha_pages_in_chunk << PAGE_SHIFT) + #define PAGES_IN_2M (2 * 1024 * 1024 / PAGE_SIZE) -#define HA_CHUNK (128 * 1024 * 1024 / PAGE_SIZE) struct hv_dynmem_device { struct hv_device *dev; @@ -595,12 +585,12 @@ static inline bool has_pfn_is_backed(struct hv_hotadd_state *has, struct hv_hotadd_gap *gap; /* The page is not backed. */ - if ((pfn < has->covered_start_pfn) || (pfn >= has->covered_end_pfn)) + if (pfn < has->covered_start_pfn || pfn >= has->covered_end_pfn) return false; /* Check for gaps. */ list_for_each_entry(gap, &has->gap_list, list) { - if ((pfn >= gap->start_pfn) && (pfn < gap->end_pfn)) + if (pfn >= gap->start_pfn && pfn < gap->end_pfn) return false; } @@ -693,9 +683,8 @@ static void hv_page_online_one(struct hv_hotadd_state *has, struct page *pg) if (!PageOffline(pg)) __SetPageOffline(pg); return; - } - if (PageOffline(pg)) - __ClearPageOffline(pg); + } else if (!PageOffline(pg)) + return; /* This frame is currently backed; online the page. 
*/ generic_online_page(pg, 0); @@ -724,28 +713,21 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, unsigned long processed_pfn; unsigned long total_pfn = pfn_count; - for (i = 0; i < (size/HA_CHUNK); i++) { - start_pfn = start + (i * HA_CHUNK); + for (i = 0; i < (size/ha_pages_in_chunk); i++) { + start_pfn = start + (i * ha_pages_in_chunk); scoped_guard(spinlock_irqsave, &dm_device.ha_lock) { - has->ha_end_pfn += HA_CHUNK; - - if (total_pfn > HA_CHUNK) { - processed_pfn = HA_CHUNK; - total_pfn -= HA_CHUNK; - } else { - processed_pfn = total_pfn; - total_pfn = 0; - } - - has->covered_end_pfn += processed_pfn; + has->ha_end_pfn += ha_pages_in_chunk; + processed_pfn = umin(total_pfn, ha_pages_in_chunk); + total_pfn -= processed_pfn; + has->covered_end_pfn += processed_pfn; } reinit_completion(&dm_device.ol_waitevent); nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn)); ret = add_memory(nid, PFN_PHYS((start_pfn)), - (HA_CHUNK << PAGE_SHIFT), MHP_MERGE_RESOURCE); + HA_BYTES_IN_CHUNK, MHP_MERGE_RESOURCE); if (ret) { pr_err("hot_add memory failed error is %d\n", ret); @@ -760,7 +742,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, do_hot_add = false; } scoped_guard(spinlock_irqsave, &dm_device.ha_lock) { - has->ha_end_pfn -= HA_CHUNK; + has->ha_end_pfn -= ha_pages_in_chunk; has->covered_end_pfn -= processed_pfn; } break; @@ -774,7 +756,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, * adding succeeded, it is ok to proceed even if the memory was * not onlined in time. */ - wait_for_completion_timeout(&dm_device.ol_waitevent, 5 * HZ); + wait_for_completion_timeout(&dm_device.ol_waitevent, secs_to_jiffies(5)); post_status(&dm_device); } } @@ -784,23 +766,25 @@ static void hv_online_page(struct page *pg, unsigned int order) struct hv_hotadd_state *has; unsigned long pfn = page_to_pfn(pg); - guard(spinlock_irqsave)(&dm_device.ha_lock); - list_for_each_entry(has, &dm_device.ha_region_list, list) { - /* The page belongs to a different HAS. */ - if ((pfn < has->start_pfn) || + scoped_guard(spinlock_irqsave, &dm_device.ha_lock) { + list_for_each_entry(has, &dm_device.ha_region_list, list) { + /* The page belongs to a different HAS. */ + if (pfn < has->start_pfn || (pfn + (1UL << order) > has->end_pfn)) - continue; + continue; - hv_bring_pgs_online(has, pfn, 1UL << order); - break; + hv_bring_pgs_online(has, pfn, 1UL << order); + return; + } } + generic_online_page(pg, order); } static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt) { struct hv_hotadd_state *has; struct hv_hotadd_gap *gap; - unsigned long residual, new_inc; + unsigned long residual; int ret = 0; guard(spinlock_irqsave)(&dm_device.ha_lock); @@ -836,15 +820,9 @@ static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt) * our current limit; extend it. */ if ((start_pfn + pfn_cnt) > has->end_pfn) { + /* Extend the region by multiples of ha_pages_in_chunk */ residual = (start_pfn + pfn_cnt - has->end_pfn); - /* - * Extend the region by multiples of HA_CHUNK. 
- */ - new_inc = (residual / HA_CHUNK) * HA_CHUNK; - if (residual % HA_CHUNK) - new_inc += HA_CHUNK; - - has->end_pfn += new_inc; + has->end_pfn += ALIGN(residual, ha_pages_in_chunk); } ret = 1; @@ -855,7 +833,7 @@ static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt) } static unsigned long handle_pg_range(unsigned long pg_start, - unsigned long pg_count) + unsigned long pg_count) { unsigned long start_pfn = pg_start; unsigned long pfn_cnt = pg_count; @@ -866,7 +844,7 @@ static unsigned long handle_pg_range(unsigned long pg_start, unsigned long res = 0, flags; pr_debug("Hot adding %lu pages starting at pfn 0x%lx.\n", pg_count, - pg_start); + pg_start); spin_lock_irqsave(&dm_device.ha_lock, flags); list_for_each_entry(has, &dm_device.ha_region_list, list) { @@ -902,22 +880,19 @@ static unsigned long handle_pg_range(unsigned long pg_start, if (start_pfn > has->start_pfn && online_section_nr(pfn_to_section_nr(start_pfn))) hv_bring_pgs_online(has, start_pfn, pgs_ol); - } - if ((has->ha_end_pfn < has->end_pfn) && (pfn_cnt > 0)) { + if (has->ha_end_pfn < has->end_pfn && pfn_cnt > 0) { /* * We have some residual hot add range * that needs to be hot added; hot add * it now. Hot add a multiple of - * HA_CHUNK that fully covers the pages + * ha_pages_in_chunk that fully covers the pages * we have. */ size = (has->end_pfn - has->ha_end_pfn); if (pfn_cnt <= size) { - size = ((pfn_cnt / HA_CHUNK) * HA_CHUNK); - if (pfn_cnt % HA_CHUNK) - size += HA_CHUNK; + size = ALIGN(pfn_cnt, ha_pages_in_chunk); } else { pfn_cnt = size; } @@ -1010,10 +985,7 @@ static void hot_add_req(struct work_struct *dummy) rg_start = dm->ha_wrk.ha_region_range.finfo.start_page; rg_sz = dm->ha_wrk.ha_region_range.finfo.page_cnt; - if ((rg_start == 0) && (!dm->host_specified_ha_region)) { - unsigned long region_size; - unsigned long region_start; - + if (rg_start == 0 && !dm->host_specified_ha_region) { /* * The host has not specified the hot-add region. * Based on the hot-add page range being specified, @@ -1021,19 +993,13 @@ static void hot_add_req(struct work_struct *dummy) * that need to be hot-added while ensuring the alignment * and size requirements of Linux as it relates to hot-add. 
*/ - region_size = (pfn_cnt / HA_CHUNK) * HA_CHUNK; - if (pfn_cnt % HA_CHUNK) - region_size += HA_CHUNK; - - region_start = (pg_start / HA_CHUNK) * HA_CHUNK; - - rg_start = region_start; - rg_sz = region_size; + rg_start = ALIGN_DOWN(pg_start, ha_pages_in_chunk); + rg_sz = ALIGN(pfn_cnt, ha_pages_in_chunk); } if (do_hot_add) resp.page_count = process_hot_add(pg_start, pfn_cnt, - rg_start, rg_sz); + rg_start, rg_sz); dm->num_pages_added += resp.page_count; #endif @@ -1211,11 +1177,10 @@ static void post_status(struct hv_dynmem_device *dm) sizeof(struct dm_status), (unsigned long)NULL, VM_PKT_DATA_INBAND, 0); - } static void free_balloon_pages(struct hv_dynmem_device *dm, - union dm_mem_page_range *range_array) + union dm_mem_page_range *range_array) { int num_pages = range_array->finfo.page_cnt; __u64 start_frame = range_array->finfo.start_page; @@ -1231,8 +1196,6 @@ static void free_balloon_pages(struct hv_dynmem_device *dm, } } - - static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm, unsigned int num_pages, struct dm_balloon_response *bl_resp, @@ -1278,7 +1241,6 @@ static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm, page_to_pfn(pg); bl_resp->range_array[i].finfo.page_cnt = alloc_unit; bl_resp->hdr.size += sizeof(union dm_mem_page_range); - } return i * alloc_unit; @@ -1332,7 +1294,7 @@ static void balloon_up(struct work_struct *dummy) if (num_ballooned == 0 || num_ballooned == num_pages) { pr_debug("Ballooned %u out of %u requested pages.\n", - num_pages, dm_device.balloon_wrk.num_pages); + num_pages, dm_device.balloon_wrk.num_pages); bl_resp->more_pages = 0; done = true; @@ -1366,16 +1328,15 @@ static void balloon_up(struct work_struct *dummy) for (i = 0; i < bl_resp->range_count; i++) free_balloon_pages(&dm_device, - &bl_resp->range_array[i]); + &bl_resp->range_array[i]); done = true; } } - } static void balloon_down(struct hv_dynmem_device *dm, - struct dm_unballoon_request *req) + struct dm_unballoon_request *req) { union dm_mem_page_range *range_array = req->range_array; int range_count = req->range_count; @@ -1389,7 +1350,7 @@ static void balloon_down(struct hv_dynmem_device *dm, } pr_debug("Freed %u ballooned pages.\n", - prev_pages_ballooned - dm->num_pages_ballooned); + prev_pages_ballooned - dm->num_pages_ballooned); if (req->more_pages == 1) return; @@ -1414,8 +1375,8 @@ static int dm_thread_func(void *dm_dev) struct hv_dynmem_device *dm = dm_dev; while (!kthread_should_stop()) { - wait_for_completion_interruptible_timeout( - &dm_device.config_event, 1*HZ); + wait_for_completion_interruptible_timeout(&dm_device.config_event, + secs_to_jiffies(1)); /* * The host expects us to post information on the memory * pressure every second. 
@@ -1439,9 +1400,8 @@ static int dm_thread_func(void *dm_dev) return 0; } - static void version_resp(struct hv_dynmem_device *dm, - struct dm_version_response *vresp) + struct dm_version_response *vresp) { struct dm_version_request version_req; int ret; @@ -1502,7 +1462,7 @@ version_error: } static void cap_resp(struct hv_dynmem_device *dm, - struct dm_capabilities_resp_msg *cap_resp) + struct dm_capabilities_resp_msg *cap_resp) { if (!cap_resp->is_accepted) { pr_err("Capabilities not accepted by host\n"); @@ -1535,7 +1495,7 @@ static void balloon_onchannelcallback(void *context) switch (dm_hdr->type) { case DM_VERSION_RESPONSE: version_resp(dm, - (struct dm_version_response *)dm_msg); + (struct dm_version_response *)dm_msg); break; case DM_CAPABILITIES_RESPONSE: @@ -1565,7 +1525,7 @@ static void balloon_onchannelcallback(void *context) dm->state = DM_BALLOON_DOWN; balloon_down(dm, - (struct dm_unballoon_request *)recv_buffer); + (struct dm_unballoon_request *)recv_buffer); break; case DM_MEM_HOT_ADD_REQUEST: @@ -1603,17 +1563,15 @@ static void balloon_onchannelcallback(void *context) default: pr_warn_ratelimited("Unhandled message: type: %d\n", dm_hdr->type); - } } - } #define HV_LARGE_REPORTING_ORDER 9 #define HV_LARGE_REPORTING_LEN (HV_HYP_PAGE_SIZE << \ HV_LARGE_REPORTING_ORDER) static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info, - struct scatterlist *sgl, unsigned int nents) + struct scatterlist *sgl, unsigned int nents) { unsigned long flags; struct hv_memory_hint *hint; @@ -1630,7 +1588,7 @@ static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info, return -ENOSPC; } - hint->type = HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD; + hint->heat_type = HV_EXTMEM_HEAT_HINT_COLD_DISCARD; hint->reserved = 0; for_each_sg(sgl, sg, nents, i) { union hv_gpa_page_range *range; @@ -1648,7 +1606,7 @@ static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info, */ /* page reporting for pages 2MB or higher */ - if (order >= HV_LARGE_REPORTING_ORDER ) { + if (order >= HV_LARGE_REPORTING_ORDER) { range->page.largepage = 1; range->page_size = HV_GPA_PAGE_RANGE_PAGE_SIZE_2MB; range->base_large_pfn = page_to_hvpfn( @@ -1662,23 +1620,21 @@ static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info, range->page.additional_pages = (sg->length / HV_HYP_PAGE_SIZE) - 1; } - } status = hv_do_rep_hypercall(HV_EXT_CALL_MEMORY_HEAT_HINT, nents, 0, hint, NULL); local_irq_restore(flags); if (!hv_result_success(status)) { - pr_err("Cold memory discard hypercall failed with status %llx\n", - status); + status); if (hv_hypercall_multi_failure > 0) hv_hypercall_multi_failure++; if (hv_result(status) == HV_STATUS_INVALID_PARAMETER) { pr_err("Underlying Hyper-V does not support order less than 9. 
Hypercall failed\n"); pr_err("Defaulting to page_reporting_order %d\n", - pageblock_order); + pageblock_order); page_reporting_order = pageblock_order; hv_hypercall_multi_failure++; return -EINVAL; @@ -1712,7 +1668,7 @@ static void enable_page_reporting(void) pr_err("Failed to enable cold memory discard: %d\n", ret); } else { pr_info("Cold memory discard hint enabled with order %d\n", - page_reporting_order); + page_reporting_order); } } @@ -1795,7 +1751,7 @@ static int balloon_connect_vsp(struct hv_device *dev) if (ret) goto out; - t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ); + t = wait_for_completion_timeout(&dm_device.host_event, secs_to_jiffies(5)); if (t == 0) { ret = -ETIMEDOUT; goto out; @@ -1831,10 +1787,13 @@ static int balloon_connect_vsp(struct hv_device *dev) cap_msg.caps.cap_bits.hot_add = hot_add_enabled(); /* - * Specify our alignment requirements as it relates - * memory hot-add. Specify 128MB alignment. + * Specify our alignment requirements for memory hot-add. The value is + * the log base 2 of the number of megabytes in a chunk. For example, + * with 256 MiB chunks, the value is 8. The number of MiB in a chunk + * must be a power of 2. */ - cap_msg.caps.cap_bits.hot_add_alignment = 7; + cap_msg.caps.cap_bits.hot_add_alignment = + ilog2(HA_BYTES_IN_CHUNK / SZ_1M); /* * Currently the host does not use these @@ -1850,7 +1809,7 @@ static int balloon_connect_vsp(struct hv_device *dev) if (ret) goto out; - t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ); + t = wait_for_completion_timeout(&dm_device.host_event, secs_to_jiffies(5)); if (t == 0) { ret = -ETIMEDOUT; goto out; @@ -1891,8 +1850,8 @@ static int hv_balloon_debug_show(struct seq_file *f, void *offset) char *sname; seq_printf(f, "%-22s: %u.%u\n", "host_version", - DYNMEM_MAJOR_VERSION(dm->version), - DYNMEM_MINOR_VERSION(dm->version)); + DYNMEM_MAJOR_VERSION(dm->version), + DYNMEM_MINOR_VERSION(dm->version)); seq_printf(f, "%-22s:", "capabilities"); if (ballooning_enabled()) @@ -1941,10 +1900,10 @@ static int hv_balloon_debug_show(struct seq_file *f, void *offset) seq_printf(f, "%-22s: %u\n", "pages_ballooned", dm->num_pages_ballooned); seq_printf(f, "%-22s: %lu\n", "total_pages_committed", - get_pages_committed(dm)); + get_pages_committed(dm)); seq_printf(f, "%-22s: %llu\n", "max_dynamic_page_count", - dm->max_dynamic_page_count); + dm->max_dynamic_page_count); return 0; } @@ -1954,7 +1913,7 @@ DEFINE_SHOW_ATTRIBUTE(hv_balloon_debug); static void hv_balloon_debugfs_init(struct hv_dynmem_device *b) { debugfs_create_file("hv-balloon", 0444, NULL, b, - &hv_balloon_debug_fops); + &hv_balloon_debug_fops); } static void hv_balloon_debugfs_exit(struct hv_dynmem_device *b) @@ -1984,8 +1943,23 @@ static int balloon_probe(struct hv_device *dev, hot_add = false; #ifdef CONFIG_MEMORY_HOTPLUG + /* + * Hot-add must operate in chunks that are of size equal to the + * memory block size because that's what the core add_memory() + * interface requires. The Hyper-V interface requires that the memory + * block size be a power of 2, which is guaranteed by the check in + * memory_dev_init(). + */ + ha_pages_in_chunk = memory_block_size_bytes() / PAGE_SIZE; do_hot_add = hot_add; #else + /* + * Without MEMORY_HOTPLUG, the guest returns a failure status for all + * hot add requests from Hyper-V, and the chunk size is used only to + * specify alignment to Hyper-V as required by the host/guest protocol. + * Somewhat arbitrarily, use 128 MiB. 
+ */ + ha_pages_in_chunk = SZ_128M / PAGE_SIZE; do_hot_add = false; #endif dm_device.dev = dev; @@ -2097,7 +2071,6 @@ static int balloon_suspend(struct hv_device *hv_dev) tasklet_enable(&hv_dev->channel->callback_event); return 0; - } static int balloon_resume(struct hv_device *dev) @@ -2156,7 +2129,6 @@ static struct hv_driver balloon_drv = { static int __init init_balloon_drv(void) { - return vmbus_driver_register(&balloon_drv); } diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index 0285a74363b3..f2e6f55d6ca6 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -20,12 +20,15 @@ #include <linux/sched/task_stack.h> #include <linux/panic_notifier.h> #include <linux/ptrace.h> +#include <linux/random.h> +#include <linux/efi.h> #include <linux/kdebug.h> #include <linux/kmsg_dump.h> +#include <linux/sizes.h> #include <linux/slab.h> #include <linux/dma-map-ops.h> #include <linux/set_memory.h> -#include <asm/hyperv-tlfs.h> +#include <hyperv/hvhdk.h> #include <asm/mshyperv.h> /* @@ -138,7 +141,7 @@ static int sysctl_record_panic_msg = 1; * sysctl option to allow the user to control whether kmsg data should be * reported to Hyper-V on panic. */ -static struct ctl_table hv_ctl_table[] = { +static const struct ctl_table hv_ctl_table[] = { { .procname = "hyperv_record_panic_msg", .data = &sysctl_record_panic_msg, @@ -204,13 +207,13 @@ static int hv_die_panic_notify_crash(struct notifier_block *self, * buffer and call into Hyper-V to transfer the data. */ static void hv_kmsg_dump(struct kmsg_dumper *dumper, - enum kmsg_dump_reason reason) + struct kmsg_dump_detail *detail) { struct kmsg_dump_iter iter; size_t bytes_written; /* We are only interested in panics. */ - if (reason != KMSG_DUMP_PANIC || !sysctl_record_panic_msg) + if (detail->reason != KMSG_DUMP_PANIC || !sysctl_record_panic_msg) return; /* @@ -227,19 +230,19 @@ static void hv_kmsg_dump(struct kmsg_dumper *dumper, * contain the size of the panic data in that page. Rest of the * registers are no-op when the NOTIFY_MSG flag is set. */ - hv_set_register(HV_REGISTER_CRASH_P0, 0); - hv_set_register(HV_REGISTER_CRASH_P1, 0); - hv_set_register(HV_REGISTER_CRASH_P2, 0); - hv_set_register(HV_REGISTER_CRASH_P3, virt_to_phys(hv_panic_page)); - hv_set_register(HV_REGISTER_CRASH_P4, bytes_written); + hv_set_msr(HV_MSR_CRASH_P0, 0); + hv_set_msr(HV_MSR_CRASH_P1, 0); + hv_set_msr(HV_MSR_CRASH_P2, 0); + hv_set_msr(HV_MSR_CRASH_P3, virt_to_phys(hv_panic_page)); + hv_set_msr(HV_MSR_CRASH_P4, bytes_written); /* * Let Hyper-V know there is crash data available along with * the panic message. 
*/ - hv_set_register(HV_REGISTER_CRASH_CTL, - (HV_CRASH_CTL_CRASH_NOTIFY | - HV_CRASH_CTL_CRASH_NOTIFY_MSG)); + hv_set_msr(HV_MSR_CRASH_CTL, + (HV_CRASH_CTL_CRASH_NOTIFY | + HV_CRASH_CTL_CRASH_NOTIFY_MSG)); } static struct kmsg_dumper hv_kmsg_dumper = { @@ -275,9 +278,22 @@ static void hv_kmsg_dump_register(void) } } +static inline bool hv_output_page_exists(void) +{ + return hv_root_partition || IS_ENABLED(CONFIG_HYPERV_VTL_MODE); +} + int __init hv_common_init(void) { int i; + union hv_hypervisor_version_info version; + + /* Get information about the Hyper-V host version */ + if (!hv_get_hypervisor_version(&version)) + pr_info("Hyper-V: Host Build %d.%d.%d.%d-%d-%d\n", + version.major_version, version.minor_version, + version.build_number, version.service_number, + version.service_pack, version.service_branch); if (hv_is_isolation_supported()) sysctl_record_panic_msg = 0; @@ -310,7 +326,7 @@ int __init hv_common_init(void) * Register for panic kmsg callback only if the right * capability is supported by the hypervisor. */ - hyperv_crash_ctl = hv_get_register(HV_REGISTER_CRASH_CTL); + hyperv_crash_ctl = hv_get_msr(HV_MSR_CRASH_CTL); if (hyperv_crash_ctl & HV_CRASH_CTL_CRASH_NOTIFY_MSG) hv_kmsg_dump_register(); @@ -329,24 +345,90 @@ int __init hv_common_init(void) BUG_ON(!hyperv_pcpu_input_arg); /* Allocate the per-CPU state for output arg for root */ - if (hv_root_partition) { + if (hv_output_page_exists()) { hyperv_pcpu_output_arg = alloc_percpu(void *); BUG_ON(!hyperv_pcpu_output_arg); } - hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index), + hv_vp_index = kmalloc_array(nr_cpu_ids, sizeof(*hv_vp_index), GFP_KERNEL); if (!hv_vp_index) { hv_common_free(); return -ENOMEM; } - for (i = 0; i < num_possible_cpus(); i++) + for (i = 0; i < nr_cpu_ids; i++) hv_vp_index[i] = VP_INVAL; return 0; } +void __init ms_hyperv_late_init(void) +{ + struct acpi_table_header *header; + acpi_status status; + u8 *randomdata; + u32 length, i; + + /* + * Seed the Linux random number generator with entropy provided by + * the Hyper-V host in ACPI table OEM0. + */ + if (!IS_ENABLED(CONFIG_ACPI)) + return; + + status = acpi_get_table("OEM0", 0, &header); + if (ACPI_FAILURE(status) || !header) + return; + + /* + * Since the "OEM0" table name is for OEM specific usage, verify + * that what we're seeing purports to be from Microsoft. + */ + if (strncmp(header->oem_table_id, "MICROSFT", 8)) + goto error; + + /* + * Ensure the length is reasonable. Requiring at least 8 bytes and + * no more than 4K bytes is somewhat arbitrary and just protects + * against a malformed table. Hyper-V currently provides 64 bytes, + * but allow for a change in a later version. + */ + if (header->length < sizeof(*header) + 8 || + header->length > sizeof(*header) + SZ_4K) + goto error; + + length = header->length - sizeof(*header); + randomdata = (u8 *)(header + 1); + + pr_debug("Hyper-V: Seeding rng with %d random bytes from ACPI table OEM0\n", + length); + + add_bootloader_randomness(randomdata, length); + + /* + * To prevent the seed data from being visible in /sys/firmware/acpi, + * zero out the random data in the ACPI table and fixup the checksum. + * The zero'ing is done out of an abundance of caution in avoiding + * potential security risks to the rng. Similarly, reset the table + * length to just the header size so that a subsequent kexec doesn't + * try to use the zero'ed out random data. 
+ */ + for (i = 0; i < length; i++) { + header->checksum += randomdata[i]; + randomdata[i] = 0; + } + + for (i = 0; i < sizeof(header->length); i++) + header->checksum += ((u8 *)&header->length)[i]; + header->length = sizeof(*header); + for (i = 0; i < sizeof(header->length); i++) + header->checksum -= ((u8 *)&header->length)[i]; + +error: + acpi_put_table(header); +} + /* * Hyper-V specific initialization and die code for * individual CPUs that is common across all architectures. @@ -358,7 +440,7 @@ int hv_common_cpu_init(unsigned int cpu) void **inputarg, **outputarg; u64 msr_vp_index; gfp_t flags; - int pgcount = hv_root_partition ? 2 : 1; + const int pgcount = hv_output_page_exists() ? 2 : 1; void *mem; int ret; @@ -376,7 +458,7 @@ int hv_common_cpu_init(unsigned int cpu) if (!mem) return -ENOMEM; - if (hv_root_partition) { + if (hv_output_page_exists()) { outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg); *outputarg = (char *)mem + HV_HYP_PAGE_SIZE; } @@ -409,7 +491,7 @@ int hv_common_cpu_init(unsigned int cpu) *inputarg = mem; } - msr_vp_index = hv_get_register(HV_REGISTER_VP_INDEX); + msr_vp_index = hv_get_msr(HV_MSR_VP_INDEX); hv_vp_index[cpu] = msr_vp_index; @@ -484,11 +566,7 @@ EXPORT_SYMBOL_GPL(hv_query_ext_cap); void hv_setup_dma_ops(struct device *dev, bool coherent) { - /* - * Hyper-V does not offer a vIOMMU in the guest - * VM, so pass 0/NULL for the IOMMU settings - */ - arch_setup_dma_ops(dev, 0, 0, coherent); + arch_setup_dma_ops(dev, coherent); } EXPORT_SYMBOL_GPL(hv_setup_dma_ops); @@ -506,7 +584,7 @@ EXPORT_SYMBOL_GPL(hv_is_hibernation_supported); */ static u64 __hv_read_ref_counter(void) { - return hv_get_register(HV_REGISTER_TIME_REF_COUNT); + return hv_get_msr(HV_MSR_TIME_REF_COUNT); } u64 (*hv_read_reference_counter)(void) = __hv_read_ref_counter; diff --git a/drivers/hv/hv_fcopy.c b/drivers/hv/hv_fcopy.c deleted file mode 100644 index 922d83eb7ddf..000000000000 --- a/drivers/hv/hv_fcopy.c +++ /dev/null @@ -1,427 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * An implementation of file copy service. - * - * Copyright (C) 2014, Microsoft, Inc. - * - * Author : K. Y. Srinivasan <ksrinivasan@novell.com> - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/nls.h> -#include <linux/workqueue.h> -#include <linux/hyperv.h> -#include <linux/sched.h> -#include <asm/hyperv-tlfs.h> - -#include "hyperv_vmbus.h" -#include "hv_utils_transport.h" - -#define WIN8_SRV_MAJOR 1 -#define WIN8_SRV_MINOR 1 -#define WIN8_SRV_VERSION (WIN8_SRV_MAJOR << 16 | WIN8_SRV_MINOR) - -#define FCOPY_VER_COUNT 1 -static const int fcopy_versions[] = { - WIN8_SRV_VERSION -}; - -#define FW_VER_COUNT 1 -static const int fw_versions[] = { - UTIL_FW_VERSION -}; - -/* - * Global state maintained for transaction that is being processed. - * For a class of integration services, including the "file copy service", - * the specified protocol is a "request/response" protocol which means that - * there can only be single outstanding transaction from the host at any - * given point in time. We use this to simplify memory management in this - * driver - we cache and process only one message at a time. - * - * While the request/response protocol is guaranteed by the host, we further - * ensure this by serializing packet processing in this driver - we do not - * read additional packets from the VMBUs until the current packet is fully - * handled. - */ - -static struct { - int state; /* hvutil_device_state */ - int recv_len; /* number of bytes received. 
*/ - struct hv_fcopy_hdr *fcopy_msg; /* current message */ - struct vmbus_channel *recv_channel; /* chn we got the request */ - u64 recv_req_id; /* request ID. */ -} fcopy_transaction; - -static void fcopy_respond_to_host(int error); -static void fcopy_send_data(struct work_struct *dummy); -static void fcopy_timeout_func(struct work_struct *dummy); -static DECLARE_DELAYED_WORK(fcopy_timeout_work, fcopy_timeout_func); -static DECLARE_WORK(fcopy_send_work, fcopy_send_data); -static const char fcopy_devname[] = "vmbus/hv_fcopy"; -static u8 *recv_buffer; -static struct hvutil_transport *hvt; -/* - * This state maintains the version number registered by the daemon. - */ -static int dm_reg_value; - -static void fcopy_poll_wrapper(void *channel) -{ - /* Transaction is finished, reset the state here to avoid races. */ - fcopy_transaction.state = HVUTIL_READY; - tasklet_schedule(&((struct vmbus_channel *)channel)->callback_event); -} - -static void fcopy_timeout_func(struct work_struct *dummy) -{ - /* - * If the timer fires, the user-mode component has not responded; - * process the pending transaction. - */ - fcopy_respond_to_host(HV_E_FAIL); - hv_poll_channel(fcopy_transaction.recv_channel, fcopy_poll_wrapper); -} - -static void fcopy_register_done(void) -{ - pr_debug("FCP: userspace daemon registered\n"); - hv_poll_channel(fcopy_transaction.recv_channel, fcopy_poll_wrapper); -} - -static int fcopy_handle_handshake(u32 version) -{ - u32 our_ver = FCOPY_CURRENT_VERSION; - - switch (version) { - case FCOPY_VERSION_0: - /* Daemon doesn't expect us to reply */ - dm_reg_value = version; - break; - case FCOPY_VERSION_1: - /* Daemon expects us to reply with our own version */ - if (hvutil_transport_send(hvt, &our_ver, sizeof(our_ver), - fcopy_register_done)) - return -EFAULT; - dm_reg_value = version; - break; - default: - /* - * For now we will fail the registration. - * If and when we have multiple versions to - * deal with, we will be backward compatible. - * We will add this code when needed. - */ - return -EINVAL; - } - pr_debug("FCP: userspace daemon ver. %d connected\n", version); - return 0; -} - -static void fcopy_send_data(struct work_struct *dummy) -{ - struct hv_start_fcopy *smsg_out = NULL; - int operation = fcopy_transaction.fcopy_msg->operation; - struct hv_start_fcopy *smsg_in; - void *out_src; - int rc, out_len; - - /* - * The strings sent from the host are encoded in - * utf16; convert it to utf8 strings. - * The host assures us that the utf16 strings will not exceed - * the max lengths specified. We will however, reserve room - * for the string terminating character - in the utf16s_utf8s() - * function we limit the size of the buffer where the converted - * string is placed to W_MAX_PATH -1 to guarantee - * that the strings can be properly terminated! 
- */ - - switch (operation) { - case START_FILE_COPY: - out_len = sizeof(struct hv_start_fcopy); - smsg_out = kzalloc(sizeof(*smsg_out), GFP_KERNEL); - if (!smsg_out) - return; - - smsg_out->hdr.operation = operation; - smsg_in = (struct hv_start_fcopy *)fcopy_transaction.fcopy_msg; - - utf16s_to_utf8s((wchar_t *)smsg_in->file_name, W_MAX_PATH, - UTF16_LITTLE_ENDIAN, - (__u8 *)&smsg_out->file_name, W_MAX_PATH - 1); - - utf16s_to_utf8s((wchar_t *)smsg_in->path_name, W_MAX_PATH, - UTF16_LITTLE_ENDIAN, - (__u8 *)&smsg_out->path_name, W_MAX_PATH - 1); - - smsg_out->copy_flags = smsg_in->copy_flags; - smsg_out->file_size = smsg_in->file_size; - out_src = smsg_out; - break; - - case WRITE_TO_FILE: - out_src = fcopy_transaction.fcopy_msg; - out_len = sizeof(struct hv_do_fcopy); - break; - default: - out_src = fcopy_transaction.fcopy_msg; - out_len = fcopy_transaction.recv_len; - break; - } - - fcopy_transaction.state = HVUTIL_USERSPACE_REQ; - rc = hvutil_transport_send(hvt, out_src, out_len, NULL); - if (rc) { - pr_debug("FCP: failed to communicate to the daemon: %d\n", rc); - if (cancel_delayed_work_sync(&fcopy_timeout_work)) { - fcopy_respond_to_host(HV_E_FAIL); - fcopy_transaction.state = HVUTIL_READY; - } - } - kfree(smsg_out); -} - -/* - * Send a response back to the host. - */ - -static void -fcopy_respond_to_host(int error) -{ - struct icmsg_hdr *icmsghdr; - u32 buf_len; - struct vmbus_channel *channel; - u64 req_id; - - /* - * Copy the global state for completing the transaction. Note that - * only one transaction can be active at a time. This is guaranteed - * by the file copy protocol implemented by the host. Furthermore, - * the "transaction active" state we maintain ensures that there can - * only be one active transaction at a time. - */ - - buf_len = fcopy_transaction.recv_len; - channel = fcopy_transaction.recv_channel; - req_id = fcopy_transaction.recv_req_id; - - icmsghdr = (struct icmsg_hdr *) - &recv_buffer[sizeof(struct vmbuspipe_hdr)]; - - if (channel->onchannel_callback == NULL) - /* - * We have raced with util driver being unloaded; - * silently return. - */ - return; - - icmsghdr->status = error; - icmsghdr->icflags = ICMSGHDRFLAG_TRANSACTION | ICMSGHDRFLAG_RESPONSE; - vmbus_sendpacket(channel, recv_buffer, buf_len, req_id, - VM_PKT_DATA_INBAND, 0); -} - -void hv_fcopy_onchannelcallback(void *context) -{ - struct vmbus_channel *channel = context; - u32 recvlen; - u64 requestid; - struct hv_fcopy_hdr *fcopy_msg; - struct icmsg_hdr *icmsghdr; - int fcopy_srv_version; - - if (fcopy_transaction.state > HVUTIL_READY) - return; - - if (vmbus_recvpacket(channel, recv_buffer, HV_HYP_PAGE_SIZE * 2, &recvlen, &requestid)) { - pr_err_ratelimited("Fcopy request received. Could not read into recv buf\n"); - return; - } - - if (!recvlen) - return; - - /* Ensure recvlen is big enough to read header data */ - if (recvlen < ICMSG_HDR) { - pr_err_ratelimited("Fcopy request received. 
Packet length too small: %d\n", - recvlen); - return; - } - - icmsghdr = (struct icmsg_hdr *)&recv_buffer[ - sizeof(struct vmbuspipe_hdr)]; - - if (icmsghdr->icmsgtype == ICMSGTYPE_NEGOTIATE) { - if (vmbus_prep_negotiate_resp(icmsghdr, - recv_buffer, recvlen, - fw_versions, FW_VER_COUNT, - fcopy_versions, FCOPY_VER_COUNT, - NULL, &fcopy_srv_version)) { - - pr_info("FCopy IC version %d.%d\n", - fcopy_srv_version >> 16, - fcopy_srv_version & 0xFFFF); - } - } else if (icmsghdr->icmsgtype == ICMSGTYPE_FCOPY) { - /* Ensure recvlen is big enough to contain hv_fcopy_hdr */ - if (recvlen < ICMSG_HDR + sizeof(struct hv_fcopy_hdr)) { - pr_err_ratelimited("Invalid Fcopy hdr. Packet length too small: %u\n", - recvlen); - return; - } - fcopy_msg = (struct hv_fcopy_hdr *)&recv_buffer[ICMSG_HDR]; - - /* - * Stash away this global state for completing the - * transaction; note transactions are serialized. - */ - - fcopy_transaction.recv_len = recvlen; - fcopy_transaction.recv_req_id = requestid; - fcopy_transaction.fcopy_msg = fcopy_msg; - - if (fcopy_transaction.state < HVUTIL_READY) { - /* Userspace is not registered yet */ - fcopy_respond_to_host(HV_E_FAIL); - return; - } - fcopy_transaction.state = HVUTIL_HOSTMSG_RECEIVED; - - /* - * Send the information to the user-level daemon. - */ - schedule_work(&fcopy_send_work); - schedule_delayed_work(&fcopy_timeout_work, - HV_UTIL_TIMEOUT * HZ); - return; - } else { - pr_err_ratelimited("Fcopy request received. Invalid msg type: %d\n", - icmsghdr->icmsgtype); - return; - } - icmsghdr->icflags = ICMSGHDRFLAG_TRANSACTION | ICMSGHDRFLAG_RESPONSE; - vmbus_sendpacket(channel, recv_buffer, recvlen, requestid, - VM_PKT_DATA_INBAND, 0); -} - -/* Callback when data is received from userspace */ -static int fcopy_on_msg(void *msg, int len) -{ - int *val = (int *)msg; - - if (len != sizeof(int)) - return -EINVAL; - - if (fcopy_transaction.state == HVUTIL_DEVICE_INIT) - return fcopy_handle_handshake(*val); - - if (fcopy_transaction.state != HVUTIL_USERSPACE_REQ) - return -EINVAL; - - /* - * Complete the transaction by forwarding the result - * to the host. But first, cancel the timeout. - */ - if (cancel_delayed_work_sync(&fcopy_timeout_work)) { - fcopy_transaction.state = HVUTIL_USERSPACE_RECV; - fcopy_respond_to_host(*val); - hv_poll_channel(fcopy_transaction.recv_channel, - fcopy_poll_wrapper); - } - - return 0; -} - -static void fcopy_on_reset(void) -{ - /* - * The daemon has exited; reset the state. - */ - fcopy_transaction.state = HVUTIL_DEVICE_INIT; - - if (cancel_delayed_work_sync(&fcopy_timeout_work)) - fcopy_respond_to_host(HV_E_FAIL); -} - -int hv_fcopy_init(struct hv_util_service *srv) -{ - recv_buffer = srv->recv_buffer; - fcopy_transaction.recv_channel = srv->channel; - fcopy_transaction.recv_channel->max_pkt_size = HV_HYP_PAGE_SIZE * 2; - - /* - * When this driver loads, the user level daemon that - * processes the host requests may not yet be running. - * Defer processing channel callbacks until the daemon - * has registered. 
- */ - fcopy_transaction.state = HVUTIL_DEVICE_INIT; - - hvt = hvutil_transport_init(fcopy_devname, 0, 0, - fcopy_on_msg, fcopy_on_reset); - if (!hvt) - return -EFAULT; - - return 0; -} - -static void hv_fcopy_cancel_work(void) -{ - cancel_delayed_work_sync(&fcopy_timeout_work); - cancel_work_sync(&fcopy_send_work); -} - -int hv_fcopy_pre_suspend(void) -{ - struct vmbus_channel *channel = fcopy_transaction.recv_channel; - struct hv_fcopy_hdr *fcopy_msg; - - /* - * Fake a CANCEL_FCOPY message for the user space daemon in case the - * daemon is in the middle of copying some file. It doesn't matter if - * there is already a message pending to be delivered to the user - * space since we force fcopy_transaction.state to be HVUTIL_READY, so - * the user space daemon's write() will fail with EINVAL (see - * fcopy_on_msg()), and the daemon will reset the device by closing - * and re-opening it. - */ - fcopy_msg = kzalloc(sizeof(*fcopy_msg), GFP_KERNEL); - if (!fcopy_msg) - return -ENOMEM; - - tasklet_disable(&channel->callback_event); - - fcopy_msg->operation = CANCEL_FCOPY; - - hv_fcopy_cancel_work(); - - /* We don't care about the return value. */ - hvutil_transport_send(hvt, fcopy_msg, sizeof(*fcopy_msg), NULL); - - kfree(fcopy_msg); - - fcopy_transaction.state = HVUTIL_READY; - - /* tasklet_enable() will be called in hv_fcopy_pre_resume(). */ - return 0; -} - -int hv_fcopy_pre_resume(void) -{ - struct vmbus_channel *channel = fcopy_transaction.recv_channel; - - tasklet_enable(&channel->callback_event); - - return 0; -} - -void hv_fcopy_deinit(void) -{ - fcopy_transaction.state = HVUTIL_DEVICE_DYING; - - hv_fcopy_cancel_work(); - - hvutil_transport_destroy(hvt); -} diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c index d35b60c06114..62795f6cbb00 100644 --- a/drivers/hv/hv_kvp.c +++ b/drivers/hv/hv_kvp.c @@ -27,7 +27,7 @@ #include <linux/connector.h> #include <linux/workqueue.h> #include <linux/hyperv.h> -#include <asm/hyperv-tlfs.h> +#include <hyperv/hvhdk.h> #include "hyperv_vmbus.h" #include "hv_utils_transport.h" @@ -655,7 +655,7 @@ void hv_kvp_onchannelcallback(void *context) if (host_negotiatied == NEGO_NOT_STARTED) { host_negotiatied = NEGO_IN_PROGRESS; schedule_delayed_work(&kvp_host_handshake_work, - HV_UTIL_NEGO_TIMEOUT * HZ); + secs_to_jiffies(HV_UTIL_NEGO_TIMEOUT)); } return; } @@ -724,7 +724,7 @@ void hv_kvp_onchannelcallback(void *context) */ schedule_work(&kvp_sendkey_work); schedule_delayed_work(&kvp_timeout_work, - HV_UTIL_TIMEOUT * HZ); + secs_to_jiffies(HV_UTIL_TIMEOUT)); return; @@ -767,6 +767,12 @@ hv_kvp_init(struct hv_util_service *srv) */ kvp_transaction.state = HVUTIL_DEVICE_INIT; + return 0; +} + +int +hv_kvp_init_transport(void) +{ hvt = hvutil_transport_init(kvp_devname, CN_KVP_IDX, CN_KVP_VAL, kvp_on_msg, kvp_on_reset); if (!hvt) diff --git a/drivers/hv/hv_snapshot.c b/drivers/hv/hv_snapshot.c index 0d2184be1691..2e7f537d53cf 100644 --- a/drivers/hv/hv_snapshot.c +++ b/drivers/hv/hv_snapshot.c @@ -12,7 +12,7 @@ #include <linux/connector.h> #include <linux/workqueue.h> #include <linux/hyperv.h> -#include <asm/hyperv-tlfs.h> +#include <hyperv/hvhdk.h> #include "hyperv_vmbus.h" #include "hv_utils_transport.h" @@ -193,7 +193,8 @@ static void vss_send_op(void) vss_transaction.state = HVUTIL_USERSPACE_REQ; schedule_delayed_work(&vss_timeout_work, op == VSS_OP_FREEZE ? 
- VSS_FREEZE_TIMEOUT * HZ : HV_UTIL_TIMEOUT * HZ); + secs_to_jiffies(VSS_FREEZE_TIMEOUT) : + secs_to_jiffies(HV_UTIL_TIMEOUT)); rc = hvutil_transport_send(hvt, vss_msg, sizeof(*vss_msg), NULL); if (rc) { @@ -388,6 +389,12 @@ hv_vss_init(struct hv_util_service *srv) */ vss_transaction.state = HVUTIL_DEVICE_INIT; + return 0; +} + +int +hv_vss_init_transport(void) +{ hvt = hvutil_transport_init(vss_devname, CN_VSS_IDX, CN_VSS_VAL, vss_on_msg, vss_on_reset); if (!hvt) { diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c index 9c97c4065fe7..36ee89c0358b 100644 --- a/drivers/hv/hv_util.c +++ b/drivers/hv/hv_util.c @@ -141,6 +141,7 @@ static struct hv_util_service util_heartbeat = { static struct hv_util_service util_kvp = { .util_cb = hv_kvp_onchannelcallback, .util_init = hv_kvp_init, + .util_init_transport = hv_kvp_init_transport, .util_pre_suspend = hv_kvp_pre_suspend, .util_pre_resume = hv_kvp_pre_resume, .util_deinit = hv_kvp_deinit, @@ -149,19 +150,12 @@ static struct hv_util_service util_kvp = { static struct hv_util_service util_vss = { .util_cb = hv_vss_onchannelcallback, .util_init = hv_vss_init, + .util_init_transport = hv_vss_init_transport, .util_pre_suspend = hv_vss_pre_suspend, .util_pre_resume = hv_vss_pre_resume, .util_deinit = hv_vss_deinit, }; -static struct hv_util_service util_fcopy = { - .util_cb = hv_fcopy_onchannelcallback, - .util_init = hv_fcopy_init, - .util_pre_suspend = hv_fcopy_pre_suspend, - .util_pre_resume = hv_fcopy_pre_resume, - .util_deinit = hv_fcopy_deinit, -}; - static void perform_shutdown(struct work_struct *dummy) { orderly_poweroff(true); @@ -598,10 +592,8 @@ static int util_probe(struct hv_device *dev, srv->channel = dev->channel; if (srv->util_init) { ret = srv->util_init(srv); - if (ret) { - ret = -ENODEV; + if (ret) goto error1; - } } /* @@ -621,6 +613,13 @@ static int util_probe(struct hv_device *dev, if (ret) goto error; + if (srv->util_init_transport) { + ret = srv->util_init_transport(); + if (ret) { + vmbus_close(dev->channel); + goto error; + } + } return 0; error: @@ -700,10 +699,6 @@ static const struct hv_vmbus_device_id id_table[] = { { HV_VSS_GUID, .driver_data = (unsigned long)&util_vss }, - /* File copy GUID */ - { HV_FCOPY_GUID, - .driver_data = (unsigned long)&util_fcopy - }, { }, }; diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index f6b1e710f805..29780f3a7478 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -15,10 +15,10 @@ #include <linux/list.h> #include <linux/bitops.h> #include <asm/sync_bitops.h> -#include <asm/hyperv-tlfs.h> #include <linux/atomic.h> #include <linux/hyperv.h> #include <linux/interrupt.h> +#include <hyperv/hvhdk.h> #include "hv_trace.h" @@ -287,18 +287,10 @@ struct vmbus_connection { struct completion ready_for_suspend_event; /* - * The number of primary channels that should be "fixed up" - * upon resume: these channels are re-offered upon resume, and some - * fields of the channel offers (i.e. child_relid and connection_id) - * can change, so the old offermsg must be fixed up, before the resume - * callbacks of the VSC drivers start to further touch the channels. + * Completed once the host has offered all boot-time channels. + * Note that some channels may still be under process on a workqueue. */ - atomic_t nr_chan_fixup_on_resume; - /* - * vmbus_bus_resume() waits for "nr_chan_fixup_on_resume" to - * drop to zero. 
- */ - struct completion ready_for_resume_event; + struct completion all_offers_delivered_event; }; @@ -370,22 +362,18 @@ void vmbus_on_event(unsigned long data); void vmbus_on_msg_dpc(unsigned long data); int hv_kvp_init(struct hv_util_service *srv); +int hv_kvp_init_transport(void); void hv_kvp_deinit(void); int hv_kvp_pre_suspend(void); int hv_kvp_pre_resume(void); void hv_kvp_onchannelcallback(void *context); int hv_vss_init(struct hv_util_service *srv); +int hv_vss_init_transport(void); void hv_vss_deinit(void); int hv_vss_pre_suspend(void); int hv_vss_pre_resume(void); void hv_vss_onchannelcallback(void *context); - -int hv_fcopy_init(struct hv_util_service *srv); -void hv_fcopy_deinit(void); -int hv_fcopy_pre_suspend(void); -int hv_fcopy_pre_resume(void); -void hv_fcopy_onchannelcallback(void *context); void vmbus_initiate_unload(bool crash); static inline void hv_poll_channel(struct vmbus_channel *channel, @@ -417,6 +405,11 @@ static inline bool hv_is_perf_channel(struct vmbus_channel *channel) return vmbus_devs[channel->device_id].perf_device; } +static inline size_t hv_dev_ring_size(struct vmbus_channel *channel) +{ + return vmbus_devs[channel->device_id].pref_ring_size; +} + static inline bool hv_is_allocated_cpu(unsigned int cpu) { struct vmbus_channel *channel, *sc; diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 7f7965f3d187..6e55a1a2613d 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -131,7 +131,7 @@ static ssize_t id_show(struct device *dev, struct device_attribute *dev_attr, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", hv_dev->channel->offermsg.child_relid); + return sysfs_emit(buf, "%d\n", hv_dev->channel->offermsg.child_relid); } static DEVICE_ATTR_RO(id); @@ -142,7 +142,7 @@ static ssize_t state_show(struct device *dev, struct device_attribute *dev_attr, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", hv_dev->channel->state); + return sysfs_emit(buf, "%d\n", hv_dev->channel->state); } static DEVICE_ATTR_RO(state); @@ -153,7 +153,7 @@ static ssize_t monitor_id_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", hv_dev->channel->offermsg.monitorid); + return sysfs_emit(buf, "%d\n", hv_dev->channel->offermsg.monitorid); } static DEVICE_ATTR_RO(monitor_id); @@ -164,8 +164,8 @@ static ssize_t class_id_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "{%pUl}\n", - &hv_dev->channel->offermsg.offer.if_type); + return sysfs_emit(buf, "{%pUl}\n", + &hv_dev->channel->offermsg.offer.if_type); } static DEVICE_ATTR_RO(class_id); @@ -176,8 +176,8 @@ static ssize_t device_id_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "{%pUl}\n", - &hv_dev->channel->offermsg.offer.if_instance); + return sysfs_emit(buf, "{%pUl}\n", + &hv_dev->channel->offermsg.offer.if_instance); } static DEVICE_ATTR_RO(device_id); @@ -186,7 +186,7 @@ static ssize_t modalias_show(struct device *dev, { struct hv_device *hv_dev = device_to_hv_device(dev); - return sprintf(buf, "vmbus:%*phN\n", UUID_SIZE, &hv_dev->dev_type); + return sysfs_emit(buf, "vmbus:%*phN\n", UUID_SIZE, &hv_dev->dev_type); } static DEVICE_ATTR_RO(modalias); @@ -199,7 +199,7 @@ static ssize_t numa_node_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", cpu_to_node(hv_dev->channel->target_cpu)); + return sysfs_emit(buf, "%d\n", cpu_to_node(hv_dev->channel->target_cpu)); } static DEVICE_ATTR_RO(numa_node); #endif 
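Editor's note: the sprintf()-to-sysfs_emit() conversions in the vmbus_drv.c hunks above all follow the same shape. Below is a minimal sketch of that pattern, not part of the patch; the attribute name "example" and the value printed are placeholders.

/*
 * Editor's sketch, not from the patch: sprintf() -> sysfs_emit() in a
 * device attribute show() callback. "example" is a hypothetical attribute.
 */
#include <linux/device.h>
#include <linux/sysfs.h>

static ssize_t example_show(struct device *dev,
			    struct device_attribute *attr, char *buf)
{
	/*
	 * sysfs_emit() knows that "buf" is a PAGE_SIZE sysfs buffer and
	 * bounds the write accordingly, so it is preferred over raw
	 * sprintf() in show() handlers.
	 */
	return sysfs_emit(buf, "%d\n", 42);
}
static DEVICE_ATTR_RO(example);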
@@ -212,9 +212,8 @@ static ssize_t server_monitor_pending_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_pending(hv_dev->channel, - vmbus_connection.monitor_pages[0])); + return sysfs_emit(buf, "%d\n", channel_pending(hv_dev->channel, + vmbus_connection.monitor_pages[0])); } static DEVICE_ATTR_RO(server_monitor_pending); @@ -226,9 +225,8 @@ static ssize_t client_monitor_pending_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_pending(hv_dev->channel, - vmbus_connection.monitor_pages[1])); + return sysfs_emit(buf, "%d\n", channel_pending(hv_dev->channel, + vmbus_connection.monitor_pages[1])); } static DEVICE_ATTR_RO(client_monitor_pending); @@ -240,9 +238,8 @@ static ssize_t server_monitor_latency_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_latency(hv_dev->channel, - vmbus_connection.monitor_pages[0])); + return sysfs_emit(buf, "%d\n", channel_latency(hv_dev->channel, + vmbus_connection.monitor_pages[0])); } static DEVICE_ATTR_RO(server_monitor_latency); @@ -254,9 +251,8 @@ static ssize_t client_monitor_latency_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_latency(hv_dev->channel, - vmbus_connection.monitor_pages[1])); + return sysfs_emit(buf, "%d\n", channel_latency(hv_dev->channel, + vmbus_connection.monitor_pages[1])); } static DEVICE_ATTR_RO(client_monitor_latency); @@ -268,9 +264,8 @@ static ssize_t server_monitor_conn_id_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_conn_id(hv_dev->channel, - vmbus_connection.monitor_pages[0])); + return sysfs_emit(buf, "%d\n", channel_conn_id(hv_dev->channel, + vmbus_connection.monitor_pages[0])); } static DEVICE_ATTR_RO(server_monitor_conn_id); @@ -282,9 +277,8 @@ static ssize_t client_monitor_conn_id_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; - return sprintf(buf, "%d\n", - channel_conn_id(hv_dev->channel, - vmbus_connection.monitor_pages[1])); + return sysfs_emit(buf, "%d\n", channel_conn_id(hv_dev->channel, + vmbus_connection.monitor_pages[1])); } static DEVICE_ATTR_RO(client_monitor_conn_id); @@ -303,7 +297,7 @@ static ssize_t out_intr_mask_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", outbound.current_interrupt_mask); + return sysfs_emit(buf, "%d\n", outbound.current_interrupt_mask); } static DEVICE_ATTR_RO(out_intr_mask); @@ -321,7 +315,7 @@ static ssize_t out_read_index_show(struct device *dev, &outbound); if (ret < 0) return ret; - return sprintf(buf, "%d\n", outbound.current_read_index); + return sysfs_emit(buf, "%d\n", outbound.current_read_index); } static DEVICE_ATTR_RO(out_read_index); @@ -340,7 +334,7 @@ static ssize_t out_write_index_show(struct device *dev, &outbound); if (ret < 0) return ret; - return sprintf(buf, "%d\n", outbound.current_write_index); + return sysfs_emit(buf, "%d\n", outbound.current_write_index); } static DEVICE_ATTR_RO(out_write_index); @@ -359,7 +353,7 @@ static ssize_t out_read_bytes_avail_show(struct device *dev, &outbound); if (ret < 0) return ret; - return sprintf(buf, "%d\n", outbound.bytes_avail_toread); + return sysfs_emit(buf, "%d\n", outbound.bytes_avail_toread); } static DEVICE_ATTR_RO(out_read_bytes_avail); @@ -378,7 +372,7 @@ static ssize_t out_write_bytes_avail_show(struct device *dev, &outbound); if (ret < 0) return ret; - return sprintf(buf, "%d\n", 
outbound.bytes_avail_towrite); + return sysfs_emit(buf, "%d\n", outbound.bytes_avail_towrite); } static DEVICE_ATTR_RO(out_write_bytes_avail); @@ -396,7 +390,7 @@ static ssize_t in_intr_mask_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", inbound.current_interrupt_mask); + return sysfs_emit(buf, "%d\n", inbound.current_interrupt_mask); } static DEVICE_ATTR_RO(in_intr_mask); @@ -414,7 +408,7 @@ static ssize_t in_read_index_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", inbound.current_read_index); + return sysfs_emit(buf, "%d\n", inbound.current_read_index); } static DEVICE_ATTR_RO(in_read_index); @@ -432,7 +426,7 @@ static ssize_t in_write_index_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", inbound.current_write_index); + return sysfs_emit(buf, "%d\n", inbound.current_write_index); } static DEVICE_ATTR_RO(in_write_index); @@ -451,7 +445,7 @@ static ssize_t in_read_bytes_avail_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", inbound.bytes_avail_toread); + return sysfs_emit(buf, "%d\n", inbound.bytes_avail_toread); } static DEVICE_ATTR_RO(in_read_bytes_avail); @@ -470,7 +464,7 @@ static ssize_t in_write_bytes_avail_show(struct device *dev, if (ret < 0) return ret; - return sprintf(buf, "%d\n", inbound.bytes_avail_towrite); + return sysfs_emit(buf, "%d\n", inbound.bytes_avail_towrite); } static DEVICE_ATTR_RO(in_write_bytes_avail); @@ -480,7 +474,7 @@ static ssize_t channel_vp_mapping_show(struct device *dev, { struct hv_device *hv_dev = device_to_hv_device(dev); struct vmbus_channel *channel = hv_dev->channel, *cur_sc; - int buf_size = PAGE_SIZE, n_written, tot_written; + int n_written; struct list_head *cur; if (!channel) @@ -488,25 +482,21 @@ static ssize_t channel_vp_mapping_show(struct device *dev, mutex_lock(&vmbus_connection.channel_mutex); - tot_written = snprintf(buf, buf_size, "%u:%u\n", - channel->offermsg.child_relid, channel->target_cpu); + n_written = sysfs_emit(buf, "%u:%u\n", + channel->offermsg.child_relid, + channel->target_cpu); list_for_each(cur, &channel->sc_list) { - if (tot_written >= buf_size - 1) - break; cur_sc = list_entry(cur, struct vmbus_channel, sc_list); - n_written = scnprintf(buf + tot_written, - buf_size - tot_written, - "%u:%u\n", - cur_sc->offermsg.child_relid, - cur_sc->target_cpu); - tot_written += n_written; + n_written += sysfs_emit_at(buf, n_written, "%u:%u\n", + cur_sc->offermsg.child_relid, + cur_sc->target_cpu); } mutex_unlock(&vmbus_connection.channel_mutex); - return tot_written; + return n_written; } static DEVICE_ATTR_RO(channel_vp_mapping); @@ -516,7 +506,7 @@ static ssize_t vendor_show(struct device *dev, { struct hv_device *hv_dev = device_to_hv_device(dev); - return sprintf(buf, "0x%x\n", hv_dev->vendor_id); + return sysfs_emit(buf, "0x%x\n", hv_dev->vendor_id); } static DEVICE_ATTR_RO(vendor); @@ -526,7 +516,7 @@ static ssize_t device_show(struct device *dev, { struct hv_device *hv_dev = device_to_hv_device(dev); - return sprintf(buf, "0x%x\n", hv_dev->device_id); + return sysfs_emit(buf, "0x%x\n", hv_dev->device_id); } static DEVICE_ATTR_RO(device); @@ -551,7 +541,7 @@ static ssize_t driver_override_show(struct device *dev, ssize_t len; device_lock(dev); - len = snprintf(buf, PAGE_SIZE, "%s\n", hv_dev->driver_override); + len = sysfs_emit(buf, "%s\n", hv_dev->driver_override); device_unlock(dev); return len; @@ -695,7 +685,7 @@ static const struct hv_vmbus_device_id vmbus_device_null; * Return a matching 
hv_vmbus_device_id pointer. * If there is no match, return NULL. */ -static const struct hv_vmbus_device_id *hv_vmbus_get_id(struct hv_driver *drv, +static const struct hv_vmbus_device_id *hv_vmbus_get_id(const struct hv_driver *drv, struct hv_device *dev) { const guid_t *guid = &dev->dev_type; @@ -706,7 +696,7 @@ static const struct hv_vmbus_device_id *hv_vmbus_get_id(struct hv_driver *drv, return NULL; /* Look at the dynamic ids first, before the static ones */ - id = hv_vmbus_dynid_match(drv, guid); + id = hv_vmbus_dynid_match((struct hv_driver *)drv, guid); if (!id) id = hv_vmbus_dev_match(drv->id_table, guid); @@ -819,9 +809,9 @@ ATTRIBUTE_GROUPS(vmbus_drv); /* * vmbus_match - Attempt to match the specified device to the specified driver */ -static int vmbus_match(struct device *device, struct device_driver *driver) +static int vmbus_match(struct device *device, const struct device_driver *driver) { - struct hv_driver *drv = drv_to_hv_drv(driver); + const struct hv_driver *drv = drv_to_hv_drv(driver); struct hv_device *hv_dev = device_to_hv_device(device); /* The hv_sock driver handles all hv_sock offers. */ @@ -1316,6 +1306,13 @@ static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id) return IRQ_HANDLED; } +static void vmbus_percpu_work(struct work_struct *work) +{ + unsigned int cpu = smp_processor_id(); + + hv_synic_init(cpu); +} + /* * vmbus_bus_init -Main vmbus driver initialization routine. * @@ -1326,7 +1323,8 @@ static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id) */ static int vmbus_bus_init(void) { - int ret; + int ret, cpu; + struct work_struct __percpu *works; ret = hv_init(); if (ret != 0) { @@ -1365,12 +1363,32 @@ static int vmbus_bus_init(void) if (ret) goto err_alloc; + works = alloc_percpu(struct work_struct); + if (!works) { + ret = -ENOMEM; + goto err_alloc; + } + /* * Initialize the per-cpu interrupt state and stimer state. * Then connect to the host. 
*/ - ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online", - hv_synic_init, hv_synic_cleanup); + cpus_read_lock(); + for_each_online_cpu(cpu) { + struct work_struct *work = per_cpu_ptr(works, cpu); + + INIT_WORK(work, vmbus_percpu_work); + schedule_work_on(cpu, work); + } + + for_each_online_cpu(cpu) + flush_work(per_cpu_ptr(works, cpu)); + + /* Register the callbacks for possible CPU online/offline'ing */ + ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online", + hv_synic_init, hv_synic_cleanup); + cpus_read_unlock(); + free_percpu(works); if (ret < 0) goto err_alloc; hyperv_cpuhp_online = ret; @@ -1813,12 +1831,12 @@ static umode_t vmbus_chan_attr_is_visible(struct kobject *kobj, return attr->mode; } -static struct attribute_group vmbus_chan_group = { +static const struct attribute_group vmbus_chan_group = { .attrs = vmbus_chan_attrs, .is_visible = vmbus_chan_attr_is_visible }; -static struct kobj_type vmbus_chan_ktype = { +static const struct kobj_type vmbus_chan_ktype = { .sysfs_ops = &vmbus_chan_sysfs_ops, .release = vmbus_chan_release, }; @@ -1962,6 +1980,7 @@ void vmbus_device_unregister(struct hv_device *device_obj) */ device_unregister(&device_obj->device); } +EXPORT_SYMBOL_GPL(vmbus_device_unregister); #ifdef CONFIG_ACPI /* @@ -2243,12 +2262,25 @@ void vmbus_free_mmio(resource_size_t start, resource_size_t size) struct resource *iter; mutex_lock(&hyperv_mmio_lock); + + /* + * If all bytes of the MMIO range to be released are within the + * special case fb_mmio shadow region, skip releasing the shadow + * region since no corresponding __request_region() was done + * in vmbus_allocate_mmio(). + */ + if (fb_mmio && start >= fb_mmio->start && + (start + size - 1 <= fb_mmio->end)) + goto skip_shadow_release; + for (iter = hyperv_mmio; iter; iter = iter->sibling) { if ((iter->start >= start + size) || (iter->end <= start)) continue; __release_region(iter, start, size); } + +skip_shadow_release: release_mem_region(start, size); mutex_unlock(&hyperv_mmio_lock); @@ -2359,10 +2391,9 @@ static int vmbus_platform_driver_probe(struct platform_device *pdev) return vmbus_acpi_add(pdev); } -static int vmbus_platform_driver_remove(struct platform_device *pdev) +static void vmbus_platform_driver_remove(struct platform_device *pdev) { vmbus_mmio_remove(); - return 0; } #ifdef CONFIG_PM_SLEEP @@ -2409,11 +2440,6 @@ static int vmbus_bus_suspend(struct device *dev) if (atomic_read(&vmbus_connection.nr_chan_close_on_suspend) > 0) wait_for_completion(&vmbus_connection.ready_for_suspend_event); - if (atomic_read(&vmbus_connection.nr_chan_fixup_on_resume) != 0) { - pr_err("Can not suspend due to a previous failed resuming\n"); - return -EBUSY; - } - mutex_lock(&vmbus_connection.channel_mutex); list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { @@ -2438,22 +2464,18 @@ static int vmbus_bus_suspend(struct device *dev) pr_err("Sub-channel not deleted!\n"); WARN_ON_ONCE(1); } - - atomic_inc(&vmbus_connection.nr_chan_fixup_on_resume); } mutex_unlock(&vmbus_connection.channel_mutex); vmbus_initiate_unload(false); - /* Reset the event for the next resume. 
*/ - reinit_completion(&vmbus_connection.ready_for_resume_event); - return 0; } static int vmbus_bus_resume(struct device *dev) { + struct vmbus_channel *channel; struct vmbus_channel_msginfo *msginfo; size_t msgsize; int ret; @@ -2484,13 +2506,23 @@ static int vmbus_bus_resume(struct device *dev) if (ret != 0) return ret; - WARN_ON(atomic_read(&vmbus_connection.nr_chan_fixup_on_resume) == 0); - vmbus_request_offers(); - if (wait_for_completion_timeout( - &vmbus_connection.ready_for_resume_event, 10 * HZ) == 0) - pr_err("Some vmbus device is missing after suspending?\n"); + mutex_lock(&vmbus_connection.channel_mutex); + list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { + if (channel->offermsg.child_relid != INVALID_RELID) + continue; + + /* hvsock channels are not expected to be present. */ + if (is_hvsock_channel(channel)) + continue; + + pr_err("channel %pUl/%pUl not present after resume.\n", + &channel->offermsg.offer.if_type, + &channel->offermsg.offer.if_instance); + /* ToDo: Cleanup these channels here */ + } + mutex_unlock(&vmbus_connection.channel_mutex); /* Reset the event for the next suspend. */ reinit_completion(&vmbus_connection.ready_for_suspend_event); |
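Editor's note: for attributes that print a variable number of lines, such as channel_vp_mapping above, the patch replaces manual snprintf()/scnprintf() offset bookkeeping with sysfs_emit_at(). A hedged sketch of that accumulation pattern follows; the values[] array is a stand-in for the real sub-channel walk and is not part of the patch.

/*
 * Editor's sketch, not from the patch: building a multi-line sysfs
 * attribute with sysfs_emit_at(). values[] is a placeholder.
 */
#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/sysfs.h>

static ssize_t mapping_show(struct device *dev,
			    struct device_attribute *attr, char *buf)
{
	static const unsigned int values[] = { 1, 2, 3 };
	int i, n = 0;

	/* sysfs_emit_at() appends at offset "n" and never overruns the page. */
	for (i = 0; i < ARRAY_SIZE(values); i++)
		n += sysfs_emit_at(buf, n, "%u:%u\n", values[i], i);

	return n;
}
static DEVICE_ATTR_RO(mapping);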
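Editor's note: the vmbus_bus_init() hunk replaces the plain cpuhp_setup_state() call with a two-step bring-up: SynIC init is pushed to every online CPU through per-CPU work items, and only then are the hotplug callbacks registered with the _nocalls variant so they are not invoked a second time. The sketch below shows that general pattern; the callback names (my_cpu_init/my_cpu_dead) and the cpuhp state string are made up for illustration.

/*
 * Editor's sketch, not from the patch: run an init function on every
 * online CPU via per-CPU works, then register hotplug callbacks for
 * future CPUs only. my_cpu_init()/my_cpu_dead() are placeholders.
 */
#include <linux/cpu.h>
#include <linux/cpuhotplug.h>
#include <linux/percpu.h>
#include <linux/smp.h>
#include <linux/workqueue.h>

static int my_cpu_init(unsigned int cpu) { return 0; }	/* placeholder */
static int my_cpu_dead(unsigned int cpu) { return 0; }	/* placeholder */

static void my_percpu_work(struct work_struct *work)
{
	/* Runs on the CPU the work was scheduled on. */
	my_cpu_init(smp_processor_id());
}

static int bring_up_all_cpus(void)
{
	struct work_struct __percpu *works;
	int cpu, ret;

	works = alloc_percpu(struct work_struct);
	if (!works)
		return -ENOMEM;

	cpus_read_lock();
	for_each_online_cpu(cpu) {
		struct work_struct *work = per_cpu_ptr(works, cpu);

		INIT_WORK(work, my_percpu_work);
		schedule_work_on(cpu, work);
	}

	for_each_online_cpu(cpu)
		flush_work(per_cpu_ptr(works, cpu));

	/*
	 * _nocalls_cpuslocked: only register the callbacks for later CPU
	 * online/offline events; the CPUs that are online now were already
	 * initialized by the works above, and the online mask cannot change
	 * while cpus_read_lock() is held.
	 */
	ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN,
						   "example/demo:online",
						   my_cpu_init, my_cpu_dead);
	cpus_read_unlock();
	free_percpu(works);

	return ret < 0 ? ret : 0;
}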