From 2b01281273738bf2d6551da48d65db2df3f28998 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:08:45 +0000 Subject: KVM: Register /dev/kvm as the _very_ last thing during initialization Register /dev/kvm, i.e. expose KVM to userspace, only after all other setup has completed. Once /dev/kvm is exposed, userspace can start invoking KVM ioctls, creating VMs, etc... If userspace creates a VM before KVM is done with its configuration, bad things may happen, e.g. KVM will fail to properly migrate vCPU state if a VM is created before KVM has registered preemption notifiers. Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 13e88297f999..28a1a02f5228 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5988,12 +5988,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, kvm_chardev_ops.owner = module; - r = misc_register(&kvm_dev); - if (r) { - pr_err("kvm: misc device register failed\n"); - goto out_unreg; - } - register_syscore_ops(&kvm_syscore_ops); kvm_preempt_ops.sched_in = kvm_sched_in; @@ -6002,11 +5996,24 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, kvm_init_debug(); r = kvm_vfio_ops_init(); - WARN_ON(r); + if (WARN_ON_ONCE(r)) + goto err_vfio; + + /* + * Registration _must_ be the very last thing done, as this exposes + * /dev/kvm to userspace, i.e. all infrastructure must be setup! + */ + r = misc_register(&kvm_dev); + if (r) { + pr_err("kvm: misc device register failed\n"); + goto err_register; + } return 0; -out_unreg: +err_register: + kvm_vfio_ops_exit(); +err_vfio: kvm_async_pf_deinit(); out_free_4: for_each_possible_cpu(cpu) @@ -6032,8 +6039,14 @@ void kvm_exit(void) { int cpu; - debugfs_remove_recursive(kvm_debugfs_dir); + /* + * Note, unregistering /dev/kvm doesn't strictly need to come first, + * fops_get(), a.k.a. try_module_get(), prevents acquiring references + * to KVM while the module is being stopped. + */ misc_deregister(&kvm_dev); + + debugfs_remove_recursive(kvm_debugfs_dir); for_each_possible_cpu(cpu) free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); kmem_cache_destroy(kvm_vcpu_cache); -- cgit From 5910ccf03de4fd6d321cf30cab7a06f1436a2053 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:08:46 +0000 Subject: KVM: Initialize IRQ FD after arch hardware setup Move initialization of KVM's IRQ FD workqueue below arch hardware setup as a step towards consolidating arch "init" and "hardware setup", and eventually towards dropping the hooks entirely. There is no dependency on the workqueue being created before hardware setup, the workqueue is used only when destroying VMs, i.e. only needs to be created before /dev/kvm is exposed to userspace. Move the destruction of the workqueue before the arch hooks to maintain symmetry, and so that arch code can move away from the hooks without having to worry about ordering changes. Reword the comment about kvm_irqfd_init() needing to come after kvm_arch_init() to call out that kvm_arch_init() must come before common KVM does _anything_, as x86 very subtly relies on that behavior to deal with multiple calls to kvm_init(), e.g. if userspace attempts to load kvm_amd.ko and kvm_intel.ko. Tag the code with a FIXME, as x86's subtle requirement is gross, and invoking an arch callback as the very first action in a helper that is called only from arch code is silly. Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 28a1a02f5228..6793a7e3525a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5921,24 +5921,19 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, int r; int cpu; - r = kvm_arch_init(opaque); - if (r) - goto out_fail; - /* - * kvm_arch_init makes sure there's at most one caller - * for architectures that support multiple implementations, - * like intel and amd on x86. - * kvm_arch_init must be called before kvm_irqfd_init to avoid creating - * conflicts in case kvm is already setup for another implementation. + * FIXME: Get rid of kvm_arch_init(), vendor code should call arch code + * directly. Note, kvm_arch_init() _must_ be called before anything + * else as x86 relies on checks buried in kvm_arch_init() to guard + * against multiple calls to kvm_init(). */ - r = kvm_irqfd_init(); + r = kvm_arch_init(opaque); if (r) - goto out_irqfd; + return r; if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { r = -ENOMEM; - goto out_free_0; + goto err_hw_enabled; } r = kvm_arch_hardware_setup(opaque); @@ -5982,9 +5977,13 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, } } + r = kvm_irqfd_init(); + if (r) + goto err_irqfd; + r = kvm_async_pf_init(); if (r) - goto out_free_4; + goto err_async_pf; kvm_chardev_ops.owner = module; @@ -6015,6 +6014,9 @@ err_register: kvm_vfio_ops_exit(); err_vfio: kvm_async_pf_deinit(); +err_async_pf: + kvm_irqfd_exit(); +err_irqfd: out_free_4: for_each_possible_cpu(cpu) free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); @@ -6026,11 +6028,8 @@ out_free_2: kvm_arch_hardware_unsetup(); out_free_1: free_cpumask_var(cpus_hardware_enabled); -out_free_0: - kvm_irqfd_exit(); -out_irqfd: +err_hw_enabled: kvm_arch_exit(); -out_fail: return r; } EXPORT_SYMBOL_GPL(kvm_init); @@ -6055,9 +6054,9 @@ void kvm_exit(void) unregister_reboot_notifier(&kvm_reboot_notifier); cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); on_each_cpu(hardware_disable_nolock, NULL, 1); + kvm_irqfd_exit(); kvm_arch_hardware_unsetup(); kvm_arch_exit(); - kvm_irqfd_exit(); free_cpumask_var(cpus_hardware_enabled); kvm_vfio_ops_exit(); } -- cgit From c9650228efbad65c1b9681198f3502de08a42944 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:08:47 +0000 Subject: KVM: Allocate cpus_hardware_enabled after arch hardware setup Allocate cpus_hardware_enabled after arch hardware setup so that arch "init" and "hardware setup" are called back-to-back and thus can be combined in a future patch. cpus_hardware_enabled is never used before kvm_create_vm(), i.e. doesn't have a dependency with hardware setup and only needs to be allocated before /dev/kvm is exposed to userspace. Free the object before the arch hooks are invoked to maintain symmetry, and so that arch code can move away from the hooks without having to worry about ordering changes. Signed-off-by: Sean Christopherson Reviewed-by: Yuan Yao Message-Id: <20221130230934.1014142-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6793a7e3525a..63983588c93b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5931,15 +5931,15 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, if (r) return r; + r = kvm_arch_hardware_setup(opaque); + if (r < 0) + goto err_hw_setup; + if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { r = -ENOMEM; goto err_hw_enabled; } - r = kvm_arch_hardware_setup(opaque); - if (r < 0) - goto out_free_1; - c.ret = &r; c.opaque = opaque; for_each_online_cpu(cpu) { @@ -6025,10 +6025,10 @@ out_free_3: unregister_reboot_notifier(&kvm_reboot_notifier); cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); out_free_2: - kvm_arch_hardware_unsetup(); -out_free_1: free_cpumask_var(cpus_hardware_enabled); err_hw_enabled: + kvm_arch_hardware_unsetup(); +err_hw_setup: kvm_arch_exit(); return r; } @@ -6055,9 +6055,9 @@ void kvm_exit(void) cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); on_each_cpu(hardware_disable_nolock, NULL, 1); kvm_irqfd_exit(); + free_cpumask_var(cpus_hardware_enabled); kvm_arch_hardware_unsetup(); kvm_arch_exit(); - free_cpumask_var(cpus_hardware_enabled); kvm_vfio_ops_exit(); } EXPORT_SYMBOL_GPL(kvm_exit); -- cgit From 73b8dc04132c1f8d4f9250a39f1fc9878ba2ee13 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:08:48 +0000 Subject: KVM: Teardown VFIO ops earlier in kvm_exit() Move the call to kvm_vfio_ops_exit() further up kvm_exit() to try and bring some amount of symmetry to the setup order in kvm_init(), and more importantly so that the arch hooks are invoked dead last by kvm_exit(). This will allow arch code to move away from the arch hooks without any change in ordering between arch code and common code in kvm_exit(). That kvm_vfio_ops_exit() is called last appears to be 100% arbitrary. It was bolted on after the fact by commit 571ee1b68598 ("kvm: vfio: fix unregister kvm_device_ops of vfio"). The nullified kvm_device_ops_table is also local to kvm_main.c and is used only when there are active VMs, so unless arch code is doing something truly bizarre, nullifying the table earlier in kvm_exit() is little more than a nop. Signed-off-by: Sean Christopherson Reviewed-by: Cornelia Huck Reviewed-by: Eric Farman Message-Id: <20221130230934.1014142-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 63983588c93b..70264378da41 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -6049,6 +6049,7 @@ void kvm_exit(void) for_each_possible_cpu(cpu) free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); kmem_cache_destroy(kvm_vcpu_cache); + kvm_vfio_ops_exit(); kvm_async_pf_deinit(); unregister_syscore_ops(&kvm_syscore_ops); unregister_reboot_notifier(&kvm_reboot_notifier); @@ -6058,7 +6059,6 @@ void kvm_exit(void) free_cpumask_var(cpus_hardware_enabled); kvm_arch_hardware_unsetup(); kvm_arch_exit(); - kvm_vfio_ops_exit(); } EXPORT_SYMBOL_GPL(kvm_exit); -- cgit From 63a1bd8ad1ac9e4e8bfcd5914c8899606e04898d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:08:53 +0000 Subject: KVM: Drop arch hardware (un)setup hooks Drop kvm_arch_hardware_setup() and kvm_arch_hardware_unsetup() now that all implementations are nops. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Eric Farman # s390 Acked-by: Anup Patel Message-Id: <20221130230934.1014142-10-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 70264378da41..8393347fd35f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5931,10 +5931,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, if (r) return r; - r = kvm_arch_hardware_setup(opaque); - if (r < 0) - goto err_hw_setup; - if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { r = -ENOMEM; goto err_hw_enabled; @@ -6027,8 +6023,6 @@ out_free_3: out_free_2: free_cpumask_var(cpus_hardware_enabled); err_hw_enabled: - kvm_arch_hardware_unsetup(); -err_hw_setup: kvm_arch_exit(); return r; } @@ -6057,7 +6051,6 @@ void kvm_exit(void) on_each_cpu(hardware_disable_nolock, NULL, 1); kvm_irqfd_exit(); free_cpumask_var(cpus_hardware_enabled); - kvm_arch_hardware_unsetup(); kvm_arch_exit(); } EXPORT_SYMBOL_GPL(kvm_exit); -- cgit From a578a0a9e3526483ad1904fac019d95e7089fb34 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:09:13 +0000 Subject: KVM: Drop kvm_arch_{init,exit}() hooks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop kvm_arch_init() and kvm_arch_exit() now that all implementations are nops. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Eric Farman # s390 Reviewed-by: Philippe Mathieu-Daudé Acked-by: Anup Patel Message-Id: <20221130230934.1014142-30-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8393347fd35f..597ff734b312 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5921,20 +5921,8 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, int r; int cpu; - /* - * FIXME: Get rid of kvm_arch_init(), vendor code should call arch code - * directly. Note, kvm_arch_init() _must_ be called before anything - * else as x86 relies on checks buried in kvm_arch_init() to guard - * against multiple calls to kvm_init(). - */ - r = kvm_arch_init(opaque); - if (r) - return r; - - if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { - r = -ENOMEM; - goto err_hw_enabled; - } + if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) + return -ENOMEM; c.ret = &r; c.opaque = opaque; @@ -6022,8 +6010,6 @@ out_free_3: cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); out_free_2: free_cpumask_var(cpus_hardware_enabled); -err_hw_enabled: - kvm_arch_exit(); return r; } EXPORT_SYMBOL_GPL(kvm_init); @@ -6051,7 +6037,6 @@ void kvm_exit(void) on_each_cpu(hardware_disable_nolock, NULL, 1); kvm_irqfd_exit(); free_cpumask_var(cpus_hardware_enabled); - kvm_arch_exit(); } EXPORT_SYMBOL_GPL(kvm_exit); -- cgit From 81a1cf9f89a6b71e71bfd7d43837ce9235e70b38 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:09:16 +0000 Subject: KVM: Drop kvm_arch_check_processor_compat() hook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop kvm_arch_check_processor_compat() and its support code now that all architecture implementations are nops. Signed-off-by: Sean Christopherson Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Eric Farman # s390 Acked-by: Anup Patel Reviewed-by: Kai Huang Message-Id: <20221130230934.1014142-33-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 597ff734b312..e13b369cfc1b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5902,36 +5902,14 @@ void kvm_unregister_perf_callbacks(void) } #endif -struct kvm_cpu_compat_check { - void *opaque; - int *ret; -}; - -static void check_processor_compat(void *data) +int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) { - struct kvm_cpu_compat_check *c = data; - - *c->ret = kvm_arch_check_processor_compat(c->opaque); -} - -int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, - struct module *module) -{ - struct kvm_cpu_compat_check c; int r; int cpu; if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) return -ENOMEM; - c.ret = &r; - c.opaque = opaque; - for_each_online_cpu(cpu) { - smp_call_function_single(cpu, check_processor_compat, &c, 1); - if (r < 0) - goto out_free_2; - } - r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting", kvm_starting_cpu, kvm_dying_cpu); if (r) -- cgit From aaf12a7b4323eb7d94677bcefc286ff6b772ed1c Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Wed, 30 Nov 2022 23:09:25 +0000 Subject: KVM: Rename and move CPUHP_AP_KVM_STARTING to ONLINE section The CPU STARTING section doesn't allow callbacks to fail. Move KVM's hotplug callback to ONLINE section so that it can abort onlining a CPU in certain cases to avoid potentially breaking VMs running on existing CPUs. For example, when KVM fails to enable hardware virtualization on the hotplugged CPU. Place KVM's hotplug state before CPUHP_AP_SCHED_WAIT_EMPTY as it ensures when offlining a CPU, all user tasks and non-pinned kernel tasks have left the CPU, i.e. there cannot be a vCPU task around. So, it is safe for KVM's CPU offline callback to disable hardware virtualization at that point. Likewise, KVM's online callback can enable hardware virtualization before any vCPU task gets a chance to run on hotplugged CPUs. Drop kvm_x86_check_processor_compatibility()'s WARN that IRQs are disabled, as the ONLINE section runs with IRQs disabled. The WARN wasn't intended to be a requirement, e.g. disabling preemption is sufficient, the IRQ thing was purely an aggressive sanity check since the helper was only ever invoked via SMP function call. Rename KVM's CPU hotplug callbacks accordingly. Suggested-by: Thomas Gleixner Signed-off-by: Chao Gao Signed-off-by: Isaku Yamahata Reviewed-by: Yuan Yao [sean: drop WARN that IRQs are disabled] Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-42-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e13b369cfc1b..ee1005cb99e1 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5114,13 +5114,27 @@ static void hardware_enable_nolock(void *junk) } } -static int kvm_starting_cpu(unsigned int cpu) +static int kvm_online_cpu(unsigned int cpu) { + int ret = 0; + + /* + * Abort the CPU online process if hardware virtualization cannot + * be enabled. Otherwise running VMs would encounter unrecoverable + * errors when scheduled to this CPU. + */ raw_spin_lock(&kvm_count_lock); - if (kvm_usage_count) + if (kvm_usage_count) { + WARN_ON_ONCE(atomic_read(&hardware_enable_failed)); + hardware_enable_nolock(NULL); + if (atomic_read(&hardware_enable_failed)) { + atomic_set(&hardware_enable_failed, 0); + ret = -EIO; + } + } raw_spin_unlock(&kvm_count_lock); - return 0; + return ret; } static void hardware_disable_nolock(void *junk) @@ -5133,7 +5147,7 @@ static void hardware_disable_nolock(void *junk) kvm_arch_hardware_disable(); } -static int kvm_dying_cpu(unsigned int cpu) +static int kvm_offline_cpu(unsigned int cpu) { raw_spin_lock(&kvm_count_lock); if (kvm_usage_count) @@ -5910,8 +5924,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) return -ENOMEM; - r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting", - kvm_starting_cpu, kvm_dying_cpu); + r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online", + kvm_online_cpu, kvm_offline_cpu); if (r) goto out_free_2; register_reboot_notifier(&kvm_reboot_notifier); @@ -5985,7 +5999,7 @@ out_free_4: kmem_cache_destroy(kvm_vcpu_cache); out_free_3: unregister_reboot_notifier(&kvm_reboot_notifier); - cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); + cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE); out_free_2: free_cpumask_var(cpus_hardware_enabled); return r; @@ -6011,7 +6025,7 @@ void kvm_exit(void) kvm_async_pf_deinit(); unregister_syscore_ops(&kvm_syscore_ops); unregister_reboot_notifier(&kvm_reboot_notifier); - cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); + cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE); on_each_cpu(hardware_disable_nolock, NULL, 1); kvm_irqfd_exit(); free_cpumask_var(cpus_hardware_enabled); -- cgit From e4aa7f88af1a123863530af4a238ce64ec8fad5a Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Wed, 30 Nov 2022 23:09:26 +0000 Subject: KVM: Disable CPU hotplug during hardware enabling/disabling Disable CPU hotplug when enabling/disabling hardware to prevent the corner case where if the following sequence occurs: 1. A hotplugged CPU marks itself online in cpu_online_mask 2. The hotplugged CPU enables interrupt before invoking KVM's ONLINE callback 3 hardware_{en,dis}able_all() is invoked on another CPU the hotplugged CPU will be included in on_each_cpu() and thus get sent through hardware_{en,dis}able_nolock() before kvm_online_cpu() is called. start_secondary { ... set_cpu_online(smp_processor_id(), true); <- 1 ... local_irq_enable(); <- 2 ... cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); <- 3 } KVM currently fudges around this race by keeping track of which CPUs have done hardware enabling (see commit 1b6c016818a5 "KVM: Keep track of which cpus have virtualization enabled"), but that's an inefficient, convoluted, and hacky solution. Signed-off-by: Chao Gao [sean: split to separate patch, write changelog] Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-43-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ee1005cb99e1..2d6203ecab88 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5167,15 +5167,26 @@ static void hardware_disable_all_nolock(void) static void hardware_disable_all(void) { + cpus_read_lock(); raw_spin_lock(&kvm_count_lock); hardware_disable_all_nolock(); raw_spin_unlock(&kvm_count_lock); + cpus_read_unlock(); } static int hardware_enable_all(void) { int r = 0; + /* + * When onlining a CPU, cpu_online_mask is set before kvm_online_cpu() + * is called, and so on_each_cpu() between them includes the CPU that + * is being onlined. As a result, hardware_enable_nolock() may get + * invoked before kvm_online_cpu(), which also enables hardware if the + * usage count is non-zero. Disable CPU hotplug to avoid attempting to + * enable hardware multiple times. + */ + cpus_read_lock(); raw_spin_lock(&kvm_count_lock); kvm_usage_count++; @@ -5190,6 +5201,7 @@ static int hardware_enable_all(void) } raw_spin_unlock(&kvm_count_lock); + cpus_read_unlock(); return r; } -- cgit From 2c106f29004d2a6dbdc375cd9c0c8f981f475f47 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:09:27 +0000 Subject: KVM: Ensure CPU is stable during low level hardware enable/disable Use the non-raw smp_processor_id() in the low hardware enable/disable helpers as KVM absolutely relies on the CPU being stable, e.g. KVM would end up with incorrect state if the task were migrated between accessing cpus_hardware_enabled and actually enabling/disabling hardware. Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-44-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2d6203ecab88..0c3b9d6fa1b1 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5097,7 +5097,7 @@ static struct miscdevice kvm_dev = { static void hardware_enable_nolock(void *junk) { - int cpu = raw_smp_processor_id(); + int cpu = smp_processor_id(); int r; if (cpumask_test_cpu(cpu, cpus_hardware_enabled)) @@ -5139,7 +5139,7 @@ static int kvm_online_cpu(unsigned int cpu) static void hardware_disable_nolock(void *junk) { - int cpu = raw_smp_processor_id(); + int cpu = smp_processor_id(); if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) return; -- cgit From 0bf50497f03b3d892c470c7d1a10a3e9c3c95821 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 30 Nov 2022 23:09:28 +0000 Subject: KVM: Drop kvm_count_lock and instead protect kvm_usage_count with kvm_lock Drop kvm_count_lock and instead protect kvm_usage_count with kvm_lock now that KVM hooks CPU hotplug during the ONLINE phase, which can sleep. Previously, KVM hooked the STARTING phase, which is not allowed to sleep and thus could not take kvm_lock (a mutex). This effectively allows the task that's initiating hardware enabling/disabling to preempted and/or migrated. Note, the Documentation/virt/kvm/locking.rst statement that kvm_count_lock is "raw" because hardware enabling/disabling needs to be atomic with respect to migration is wrong on multiple fronts. First, while regular spinlocks can be preempted, the task holding the lock cannot be migrated. Second, preventing migration is not required. on_each_cpu() disables preemption, which ensures that cpus_hardware_enabled correctly reflects hardware state. The task may be preempted/migrated between bumping kvm_usage_count and invoking on_each_cpu(), but that's perfectly ok as kvm_usage_count is still protected, e.g. other tasks that call hardware_enable_all() will be blocked until the preempted/migrated owner exits its critical section. KVM does have lockless accesses to kvm_usage_count in the suspend/resume flows, but those are safe because all tasks must be frozen prior to suspending CPUs, and a task cannot be frozen while it holds one or more locks (userspace tasks are frozen via a fake signal). Preemption doesn't need to be explicitly disabled in the hotplug path. The hotplug thread is pinned to the CPU that's being hotplugged, and KVM only cares about having a stable CPU, i.e. to ensure hardware is enabled on the correct CPU. Lockep, i.e. check_preemption_disabled(), plays nice with this state too, as is_percpu_thread() is true for the hotplug thread. Signed-off-by: Isaku Yamahata Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-45-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0c3b9d6fa1b1..f6caee790dc8 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -100,7 +100,6 @@ EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); */ DEFINE_MUTEX(kvm_lock); -static DEFINE_RAW_SPINLOCK(kvm_count_lock); LIST_HEAD(vm_list); static cpumask_var_t cpus_hardware_enabled; @@ -5123,17 +5122,18 @@ static int kvm_online_cpu(unsigned int cpu) * be enabled. Otherwise running VMs would encounter unrecoverable * errors when scheduled to this CPU. */ - raw_spin_lock(&kvm_count_lock); + mutex_lock(&kvm_lock); if (kvm_usage_count) { WARN_ON_ONCE(atomic_read(&hardware_enable_failed)); hardware_enable_nolock(NULL); + if (atomic_read(&hardware_enable_failed)) { atomic_set(&hardware_enable_failed, 0); ret = -EIO; } } - raw_spin_unlock(&kvm_count_lock); + mutex_unlock(&kvm_lock); return ret; } @@ -5149,10 +5149,10 @@ static void hardware_disable_nolock(void *junk) static int kvm_offline_cpu(unsigned int cpu) { - raw_spin_lock(&kvm_count_lock); + mutex_lock(&kvm_lock); if (kvm_usage_count) hardware_disable_nolock(NULL); - raw_spin_unlock(&kvm_count_lock); + mutex_unlock(&kvm_lock); return 0; } @@ -5168,9 +5168,9 @@ static void hardware_disable_all_nolock(void) static void hardware_disable_all(void) { cpus_read_lock(); - raw_spin_lock(&kvm_count_lock); + mutex_lock(&kvm_lock); hardware_disable_all_nolock(); - raw_spin_unlock(&kvm_count_lock); + mutex_unlock(&kvm_lock); cpus_read_unlock(); } @@ -5187,7 +5187,7 @@ static int hardware_enable_all(void) * enable hardware multiple times. */ cpus_read_lock(); - raw_spin_lock(&kvm_count_lock); + mutex_lock(&kvm_lock); kvm_usage_count++; if (kvm_usage_count == 1) { @@ -5200,7 +5200,7 @@ static int hardware_enable_all(void) } } - raw_spin_unlock(&kvm_count_lock); + mutex_unlock(&kvm_lock); cpus_read_unlock(); return r; @@ -5806,6 +5806,17 @@ static void kvm_init_debug(void) static int kvm_suspend(void) { + /* + * Secondary CPUs and CPU hotplug are disabled across the suspend/resume + * callbacks, i.e. no need to acquire kvm_lock to ensure the usage count + * is stable. Assert that kvm_lock is not held to ensure the system + * isn't suspended while KVM is enabling hardware. Hardware enabling + * can be preempted, but the task cannot be frozen until it has dropped + * all locks (userspace tasks are frozen via a fake signal). + */ + lockdep_assert_not_held(&kvm_lock); + lockdep_assert_irqs_disabled(); + if (kvm_usage_count) hardware_disable_nolock(NULL); return 0; @@ -5813,10 +5824,11 @@ static int kvm_suspend(void) static void kvm_resume(void) { - if (kvm_usage_count) { - lockdep_assert_not_held(&kvm_count_lock); + lockdep_assert_not_held(&kvm_lock); + lockdep_assert_irqs_disabled(); + + if (kvm_usage_count) hardware_enable_nolock(NULL); - } } static struct syscore_ops kvm_syscore_ops = { -- cgit From 667a83bf6a30b5660d02712dce9fb07e99abde38 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 30 Nov 2022 23:09:29 +0000 Subject: KVM: Remove on_each_cpu(hardware_disable_nolock) in kvm_exit() Drop the superfluous invocation of hardware_disable_nolock() during kvm_exit(), as it's nothing more than a glorified nop. KVM automatically disables hardware on all CPUs when the last VM is destroyed, and kvm_exit() cannot be called until the last VM goes away as the calling module is pinned by an elevated refcount of the fops associated with /dev/kvm. This holds true even on x86, where the caller of kvm_exit() is not kvm.ko, but is instead a dependent module, kvm_amd.ko or kvm_intel.ko, as kvm_chardev_ops.owner is set to the module that calls kvm_init(), not hardcoded to the base kvm.ko module. Signed-off-by: Isaku Yamahata [sean: rework changelog] Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-46-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 1 - 1 file changed, 1 deletion(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index f6caee790dc8..d1ed34856412 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -6050,7 +6050,6 @@ void kvm_exit(void) unregister_syscore_ops(&kvm_syscore_ops); unregister_reboot_notifier(&kvm_reboot_notifier); cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE); - on_each_cpu(hardware_disable_nolock, NULL, 1); kvm_irqfd_exit(); free_cpumask_var(cpus_hardware_enabled); } -- cgit From 37d258818562bbb5baf487489848640abceafbd9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:09:30 +0000 Subject: KVM: Use a per-CPU variable to track which CPUs have enabled virtualization Use a per-CPU variable instead of a shared bitmap to track which CPUs have successfully enabled virtualization hardware. Using a per-CPU bool avoids the need for an additional allocation, and arguably yields easier to read code. Using a bitmap would be advantageous if KVM used it to avoid generating IPIs to CPUs that failed to enable hardware, but that's an extreme edge case and not worth optimizing, and the low level helpers would still want to keep their individual checks as attempting to enable virtualization hardware when it's already enabled can be problematic, e.g. Intel's VMXON will fault. Opportunistically change the order in hardware_enable_nolock() to set the flag if and only if hardware enabling is successful, instead of speculatively setting the flag and then clearing it on failure. Add a comment explaining that the check in hardware_disable_nolock() isn't simply paranoia. Waaay back when, commit 1b6c016818a5 ("KVM: Keep track of which cpus have virtualization enabled"), added the logic as a guards against CPU hotplug racing with hardware enable/disable. Now that KVM has eliminated the race by taking cpu_hotplug_lock for read (via cpus_read_lock()) when enabling or disabling hardware, at first glance it appears that the check is now superfluous, i.e. it's tempting to remove the per-CPU flag entirely... Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-47-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 41 ++++++++++++++++++----------------------- 1 file changed, 18 insertions(+), 23 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d1ed34856412..bcfc0bc15153 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -102,7 +102,7 @@ EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); DEFINE_MUTEX(kvm_lock); LIST_HEAD(vm_list); -static cpumask_var_t cpus_hardware_enabled; +static DEFINE_PER_CPU(bool, hardware_enabled); static int kvm_usage_count; static atomic_t hardware_enable_failed; @@ -5096,21 +5096,17 @@ static struct miscdevice kvm_dev = { static void hardware_enable_nolock(void *junk) { - int cpu = smp_processor_id(); - int r; - - if (cpumask_test_cpu(cpu, cpus_hardware_enabled)) + if (__this_cpu_read(hardware_enabled)) return; - cpumask_set_cpu(cpu, cpus_hardware_enabled); - - r = kvm_arch_hardware_enable(); - - if (r) { - cpumask_clear_cpu(cpu, cpus_hardware_enabled); + if (kvm_arch_hardware_enable()) { atomic_inc(&hardware_enable_failed); - pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu); + pr_info("kvm: enabling virtualization on CPU%d failed\n", + raw_smp_processor_id()); + return; } + + __this_cpu_write(hardware_enabled, true); } static int kvm_online_cpu(unsigned int cpu) @@ -5139,12 +5135,16 @@ static int kvm_online_cpu(unsigned int cpu) static void hardware_disable_nolock(void *junk) { - int cpu = smp_processor_id(); - - if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) + /* + * Note, hardware_disable_all_nolock() tells all online CPUs to disable + * hardware, not just CPUs that successfully enabled hardware! + */ + if (!__this_cpu_read(hardware_enabled)) return; - cpumask_clear_cpu(cpu, cpus_hardware_enabled); + kvm_arch_hardware_disable(); + + __this_cpu_write(hardware_enabled, false); } static int kvm_offline_cpu(unsigned int cpu) @@ -5945,13 +5945,11 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) int r; int cpu; - if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) - return -ENOMEM; - r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online", kvm_online_cpu, kvm_offline_cpu); if (r) - goto out_free_2; + return r; + register_reboot_notifier(&kvm_reboot_notifier); /* A kmem cache lets us meet the alignment requirements of fx_save. */ @@ -6024,8 +6022,6 @@ out_free_4: out_free_3: unregister_reboot_notifier(&kvm_reboot_notifier); cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE); -out_free_2: - free_cpumask_var(cpus_hardware_enabled); return r; } EXPORT_SYMBOL_GPL(kvm_init); @@ -6051,7 +6047,6 @@ void kvm_exit(void) unregister_reboot_notifier(&kvm_reboot_notifier); cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE); kvm_irqfd_exit(); - free_cpumask_var(cpus_hardware_enabled); } EXPORT_SYMBOL_GPL(kvm_exit); -- cgit From e6fb7d6eb4217badb018169ec2871da8e92f6555 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Wed, 30 Nov 2022 23:09:31 +0000 Subject: KVM: Make hardware_enable_failed a local variable in the "enable all" path Rework detecting hardware enabling errors to use a local variable in the "enable all" path to track whether or not enabling was successful across all CPUs. Using a global variable complicates paths that enable hardware only on the current CPU, e.g. kvm_resume() and kvm_online_cpu(). Opportunistically add a WARN if hardware enabling fails during kvm_resume(), KVM is all kinds of hosed if CPU0 fails to enable hardware. The WARN is largely futile in the current code, as KVM BUG()s on spurious faults on VMX instructions, e.g. attempting to run a vCPU on CPU if hardware enabling fails will explode. ------------[ cut here ]------------ kernel BUG at arch/x86/kvm/x86.c:508! invalid opcode: 0000 [#1] SMP CPU: 3 PID: 1009 Comm: CPU 4/KVM Not tainted 6.1.0-rc1+ #11 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:kvm_spurious_fault+0xa/0x10 Call Trace: vmx_vcpu_load_vmcs+0x192/0x230 [kvm_intel] vmx_vcpu_load+0x16/0x60 [kvm_intel] kvm_arch_vcpu_load+0x32/0x1f0 vcpu_load+0x2f/0x40 kvm_arch_vcpu_ioctl_run+0x19/0x9d0 kvm_vcpu_ioctl+0x271/0x660 __x64_sys_ioctl+0x80/0xb0 do_syscall_64+0x2b/0x50 entry_SYSCALL_64_after_hwframe+0x46/0xb0 But, the WARN may provide a breadcrumb to understand what went awry, and someday KVM may fix one or both of those bugs, e.g. by finding a way to eat spurious faults no matter the context (easier said than done due to side effects of certain operations, e.g. Intel's VMCLEAR). Signed-off-by: Isaku Yamahata [sean: rebase, WARN on failure in kvm_resume()] Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-48-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index bcfc0bc15153..2d26a898ac6c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -104,7 +104,6 @@ LIST_HEAD(vm_list); static DEFINE_PER_CPU(bool, hardware_enabled); static int kvm_usage_count; -static atomic_t hardware_enable_failed; static struct kmem_cache *kvm_vcpu_cache; @@ -5094,19 +5093,25 @@ static struct miscdevice kvm_dev = { &kvm_chardev_ops, }; -static void hardware_enable_nolock(void *junk) +static int __hardware_enable_nolock(void) { if (__this_cpu_read(hardware_enabled)) - return; + return 0; if (kvm_arch_hardware_enable()) { - atomic_inc(&hardware_enable_failed); pr_info("kvm: enabling virtualization on CPU%d failed\n", raw_smp_processor_id()); - return; + return -EIO; } __this_cpu_write(hardware_enabled, true); + return 0; +} + +static void hardware_enable_nolock(void *failed) +{ + if (__hardware_enable_nolock()) + atomic_inc(failed); } static int kvm_online_cpu(unsigned int cpu) @@ -5119,16 +5124,8 @@ static int kvm_online_cpu(unsigned int cpu) * errors when scheduled to this CPU. */ mutex_lock(&kvm_lock); - if (kvm_usage_count) { - WARN_ON_ONCE(atomic_read(&hardware_enable_failed)); - - hardware_enable_nolock(NULL); - - if (atomic_read(&hardware_enable_failed)) { - atomic_set(&hardware_enable_failed, 0); - ret = -EIO; - } - } + if (kvm_usage_count) + ret = __hardware_enable_nolock(); mutex_unlock(&kvm_lock); return ret; } @@ -5176,6 +5173,7 @@ static void hardware_disable_all(void) static int hardware_enable_all(void) { + atomic_t failed = ATOMIC_INIT(0); int r = 0; /* @@ -5191,10 +5189,9 @@ static int hardware_enable_all(void) kvm_usage_count++; if (kvm_usage_count == 1) { - atomic_set(&hardware_enable_failed, 0); - on_each_cpu(hardware_enable_nolock, NULL, 1); + on_each_cpu(hardware_enable_nolock, &failed, 1); - if (atomic_read(&hardware_enable_failed)) { + if (atomic_read(&failed)) { hardware_disable_all_nolock(); r = -EBUSY; } @@ -5828,7 +5825,7 @@ static void kvm_resume(void) lockdep_assert_irqs_disabled(); if (kvm_usage_count) - hardware_enable_nolock(NULL); + WARN_ON_ONCE(__hardware_enable_nolock()); } static struct syscore_ops kvm_syscore_ops = { -- cgit From 35774a9f94db08df78e1c08d2b097666deec4e76 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:09:32 +0000 Subject: KVM: Register syscore (suspend/resume) ops early in kvm_init() Register the suspend/resume notifier hooks at the same time KVM registers its reboot notifier so that all the code in kvm_init() that deals with enabling/disabling hardware is bundled together. Opportunstically move KVM's implementations to reside near the reboot notifier code for the same reason. Bunching the code together will allow architectures to opt out of KVM's generic hardware enable/disable logic with minimal #ifdeffery. Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-49-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 68 ++++++++++++++++++++++++++--------------------------- 1 file changed, 34 insertions(+), 34 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2d26a898ac6c..6c0bd9a8e27e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5223,6 +5223,38 @@ static struct notifier_block kvm_reboot_notifier = { .priority = 0, }; +static int kvm_suspend(void) +{ + /* + * Secondary CPUs and CPU hotplug are disabled across the suspend/resume + * callbacks, i.e. no need to acquire kvm_lock to ensure the usage count + * is stable. Assert that kvm_lock is not held to ensure the system + * isn't suspended while KVM is enabling hardware. Hardware enabling + * can be preempted, but the task cannot be frozen until it has dropped + * all locks (userspace tasks are frozen via a fake signal). + */ + lockdep_assert_not_held(&kvm_lock); + lockdep_assert_irqs_disabled(); + + if (kvm_usage_count) + hardware_disable_nolock(NULL); + return 0; +} + +static void kvm_resume(void) +{ + lockdep_assert_not_held(&kvm_lock); + lockdep_assert_irqs_disabled(); + + if (kvm_usage_count) + WARN_ON_ONCE(__hardware_enable_nolock()); +} + +static struct syscore_ops kvm_syscore_ops = { + .suspend = kvm_suspend, + .resume = kvm_resume, +}; + static void kvm_io_bus_destroy(struct kvm_io_bus *bus) { int i; @@ -5801,38 +5833,6 @@ static void kvm_init_debug(void) } } -static int kvm_suspend(void) -{ - /* - * Secondary CPUs and CPU hotplug are disabled across the suspend/resume - * callbacks, i.e. no need to acquire kvm_lock to ensure the usage count - * is stable. Assert that kvm_lock is not held to ensure the system - * isn't suspended while KVM is enabling hardware. Hardware enabling - * can be preempted, but the task cannot be frozen until it has dropped - * all locks (userspace tasks are frozen via a fake signal). - */ - lockdep_assert_not_held(&kvm_lock); - lockdep_assert_irqs_disabled(); - - if (kvm_usage_count) - hardware_disable_nolock(NULL); - return 0; -} - -static void kvm_resume(void) -{ - lockdep_assert_not_held(&kvm_lock); - lockdep_assert_irqs_disabled(); - - if (kvm_usage_count) - WARN_ON_ONCE(__hardware_enable_nolock()); -} - -static struct syscore_ops kvm_syscore_ops = { - .suspend = kvm_suspend, - .resume = kvm_resume, -}; - static inline struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) { @@ -5948,6 +5948,7 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) return r; register_reboot_notifier(&kvm_reboot_notifier); + register_syscore_ops(&kvm_syscore_ops); /* A kmem cache lets us meet the alignment requirements of fx_save. */ if (!vcpu_align) @@ -5982,8 +5983,6 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) kvm_chardev_ops.owner = module; - register_syscore_ops(&kvm_syscore_ops); - kvm_preempt_ops.sched_in = kvm_sched_in; kvm_preempt_ops.sched_out = kvm_sched_out; @@ -6017,6 +6016,7 @@ out_free_4: free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); kmem_cache_destroy(kvm_vcpu_cache); out_free_3: + unregister_syscore_ops(&kvm_syscore_ops); unregister_reboot_notifier(&kvm_reboot_notifier); cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE); return r; -- cgit From 441f7bfa99fe2b8a7e504aa72047e20579e88a5d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:09:33 +0000 Subject: KVM: Opt out of generic hardware enabling on s390 and PPC Allow architectures to opt out of the generic hardware enabling logic, and opt out on both s390 and PPC, which don't need to manually enable virtualization as it's always on (when available). In addition to letting s390 and PPC drop a bit of dead code, this will hopefully also allow ARM to clean up its related code, e.g. ARM has its own per-CPU flag to track which CPUs have enable hardware due to the need to keep hardware enabled indefinitely when pKVM is enabled. Signed-off-by: Sean Christopherson Acked-by: Anup Patel Message-Id: <20221130230934.1014142-50-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/Kconfig | 3 +++ virt/kvm/kvm_main.c | 30 ++++++++++++++++++++++++------ 2 files changed, 27 insertions(+), 6 deletions(-) (limited to 'virt') diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 9fb1ff6f19e5..b74916de5183 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -92,3 +92,6 @@ config KVM_XFER_TO_GUEST_WORK config HAVE_KVM_PM_NOTIFIER bool + +config KVM_GENERIC_HARDWARE_ENABLING + bool diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6c0bd9a8e27e..3d0678579c5e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -102,9 +102,6 @@ EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); DEFINE_MUTEX(kvm_lock); LIST_HEAD(vm_list); -static DEFINE_PER_CPU(bool, hardware_enabled); -static int kvm_usage_count; - static struct kmem_cache *kvm_vcpu_cache; static __read_mostly struct preempt_ops kvm_preempt_ops; @@ -146,9 +143,6 @@ static void hardware_disable_all(void); static void kvm_io_bus_destroy(struct kvm_io_bus *bus); -__visible bool kvm_rebooting; -EXPORT_SYMBOL_GPL(kvm_rebooting); - #define KVM_EVENT_CREATE_VM 0 #define KVM_EVENT_DESTROY_VM 1 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm); @@ -5093,6 +5087,13 @@ static struct miscdevice kvm_dev = { &kvm_chardev_ops, }; +#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING +__visible bool kvm_rebooting; +EXPORT_SYMBOL_GPL(kvm_rebooting); + +static DEFINE_PER_CPU(bool, hardware_enabled); +static int kvm_usage_count; + static int __hardware_enable_nolock(void) { if (__this_cpu_read(hardware_enabled)) @@ -5254,6 +5255,17 @@ static struct syscore_ops kvm_syscore_ops = { .suspend = kvm_suspend, .resume = kvm_resume, }; +#else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */ +static int hardware_enable_all(void) +{ + return 0; +} + +static void hardware_disable_all(void) +{ + +} +#endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */ static void kvm_io_bus_destroy(struct kvm_io_bus *bus) { @@ -5942,6 +5954,7 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) int r; int cpu; +#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online", kvm_online_cpu, kvm_offline_cpu); if (r) @@ -5949,6 +5962,7 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) register_reboot_notifier(&kvm_reboot_notifier); register_syscore_ops(&kvm_syscore_ops); +#endif /* A kmem cache lets us meet the alignment requirements of fx_save. */ if (!vcpu_align) @@ -6016,9 +6030,11 @@ out_free_4: free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); kmem_cache_destroy(kvm_vcpu_cache); out_free_3: +#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING unregister_syscore_ops(&kvm_syscore_ops); unregister_reboot_notifier(&kvm_reboot_notifier); cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE); +#endif return r; } EXPORT_SYMBOL_GPL(kvm_init); @@ -6040,9 +6056,11 @@ void kvm_exit(void) kmem_cache_destroy(kvm_vcpu_cache); kvm_vfio_ops_exit(); kvm_async_pf_deinit(); +#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING unregister_syscore_ops(&kvm_syscore_ops); unregister_reboot_notifier(&kvm_reboot_notifier); cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE); +#endif kvm_irqfd_exit(); } EXPORT_SYMBOL_GPL(kvm_exit); -- cgit From 9f1a4c004869d3c8061f286fec4d8096dd099b84 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Nov 2022 23:09:34 +0000 Subject: KVM: Clean up error labels in kvm_init() Convert the last two "out" lables to "err" labels now that the dust has settled, i.e. now that there are no more planned changes to the order of things in kvm_init(). Use "err" instead of "out" as it's easier to describe what failed than it is to describe what needs to be unwound, e.g. if allocating a per-CPU kick mask fails, KVM needs to free any masks that were allocated, and of course needs to unwind previous operations. Reported-by: Chao Gao Signed-off-by: Sean Christopherson Message-Id: <20221130230934.1014142-51-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 3d0678579c5e..0f17b8786792 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5976,14 +5976,14 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) NULL); if (!kvm_vcpu_cache) { r = -ENOMEM; - goto out_free_3; + goto err_vcpu_cache; } for_each_possible_cpu(cpu) { if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu), GFP_KERNEL, cpu_to_node(cpu))) { r = -ENOMEM; - goto out_free_4; + goto err_cpu_kick_mask; } } @@ -6025,11 +6025,11 @@ err_vfio: err_async_pf: kvm_irqfd_exit(); err_irqfd: -out_free_4: +err_cpu_kick_mask: for_each_possible_cpu(cpu) free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); kmem_cache_destroy(kvm_vcpu_cache); -out_free_3: +err_vcpu_cache: #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING unregister_syscore_ops(&kvm_syscore_ops); unregister_reboot_notifier(&kvm_reboot_notifier); -- cgit