From 61c08aa9606d4e48a8a50639c956448a720174c3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:48 -0800 Subject: KVM: VMX: Compare only a single byte for VMCS' "launched" in vCPU-run The vCPU-run asm blob does a manual comparison of a VMCS' launched status to execute the correct VM-Enter instruction, i.e. VMLAUNCH vs. VMRESUME. The launched flag is a bool, which is a typedef of _Bool. C99 does not define an exact size for _Bool, stating only that it must be large enough to hold '0' and '1'. Most, if not all, compilers use a single byte for _Bool, including gcc[1]. Originally, 'launched' was of type 'int' and so the asm blob used 'cmpl' to check the launch status. When 'launched' was moved to be stored on a per-VMCS basis, struct vcpu_vmx's "temporary" __launched flag was added in order to avoid having to pass the current VMCS into the asm blob. The new '__launched' was defined as a 'bool' and not an 'int', but the 'cmp' instruction was not updated. This has not caused any known problems, likely due to compilers aligning variables to 4-byte or 8-byte boundaries and KVM zeroing out struct vcpu_vmx during allocation. I.e. vCPU-run accesses "junk" data, it just happens to always be zero and so doesn't affect the result. [1] https://gcc.gnu.org/ml/gcc-patches/2000-10/msg01127.html Fixes: d462b8192368 ("KVM: VMX: Keep list of loaded VMCSs, instead of vcpus") Cc: Reviewed-by: Jim Mattson Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 95d618045001..fdb6305cc971 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6402,7 +6402,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "mov %%" _ASM_AX", %%cr2 \n\t" "3: \n\t" /* Check if vmlaunch or vmresume is needed */ - "cmpl $0, %c[launched](%%" _ASM_CX ") \n\t" + "cmpb $0, %c[launched](%%" _ASM_CX ") \n\t" /* Load guest registers. Don't clobber flags. */ "mov %c[rax](%%" _ASM_CX "), %%" _ASM_AX " \n\t" "mov %c[rbx](%%" _ASM_CX "), %%" _ASM_BX " \n\t" -- cgit From 1ce072cbfd8dba46f117804850398e0b3040a541 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:49 -0800 Subject: KVM: nVMX: Check a single byte for VMCS "launched" in nested early checks Nested early checks does a manual comparison of a VMCS' launched status in its asm blob to execute the correct VM-Enter instruction, i.e. VMLAUNCH vs. VMRESUME. The launched flag is a bool, which is a typedef of _Bool. C99 does not define an exact size for _Bool, stating only that it must be large enough to hold '0' and '1'. Most, if not all, compilers use a single byte for _Bool, including gcc[1]. The use of 'cmpl' instead of 'cmpb' was not deliberate, but rather the result of a copy-paste as the asm blob was directly derived from the asm blob for vCPU-run. This has not caused any known problems, likely due to compilers aligning variables to 4-byte or 8-byte boundaries and KVM zeroing out struct vcpu_vmx during allocation. I.e. vCPU-run accesses "junk" data, it just happens to always be zero and so doesn't affect the result.
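As a quick illustration of the premise above, a minimal user-space sketch (not from either patch; the struct layout is invented for the example): gcc and clang implement _Bool as a single byte, so a one-byte compare sees only the flag, while a four-byte load also picks up whatever happens to sit next to it in memory.

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    struct fake_vmcs {
            bool launched;          /* a single byte with gcc/clang */
            char neighbors[3];      /* stand-in for adjacent, unrelated data */
    };

    int main(void)
    {
            struct fake_vmcs v;
            unsigned int dword;

            memset(&v, 0xff, sizeof(v));    /* adjacent bytes hold non-zero "junk" */
            v.launched = false;

            memcpy(&dword, &v, sizeof(dword));
            printf("sizeof(bool) = %zu\n", sizeof(bool));          /* 1 */
            printf("cmpb's view  = %u\n", (unsigned)v.launched);   /* 0 */
            printf("cmpl's view  = 0x%x\n", dword);                /* non-zero junk */
            return 0;
    }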
[1] https://gcc.gnu.org/ml/gcc-patches/2000-10/msg01127.html Fixes: 52017608da33 ("KVM: nVMX: add option to perform early consistency checks via H/W") Cc: Reviewed-by: Jim Mattson Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index d8ea4ebd79e7..50c9e04910ea 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2761,7 +2761,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ /* Check if vmlaunch or vmresume is needed */ - "cmpl $0, %c[launched](%% " _ASM_CX")\n\t" + "cmpb $0, %c[launched](%% " _ASM_CX")\n\t" "call vmx_vmenter\n\t" -- cgit From 0e0ab73c9a0243736bcd779b30b717e23ba9a56d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:50 -0800 Subject: KVM: VMX: Zero out *all* general purpose registers after VM-Exit ...except RSP, which is restored by hardware as part of VM-Exit. Paolo theorized that restoring registers from the stack after a VM-Exit in lieu of zeroing them could lead to speculative execution with the guest's values, e.g. if the stack accesses miss the L1 cache[1]. Zeroing XORs are dirt cheap, so just be ultra-paranoid. Note that the scratch register (currently RCX) used to save/restore the guest state is also zeroed as its host-defined value is loaded via the stack, just with a MOV instead of a POP. [1] https://patchwork.kernel.org/patch/10771539/#22441255 Fixes: 0cb5b30698fd ("kvm: vmx: Scrub hardware GPRs at VM-exit") Cc: Cc: Jim Mattson Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index fdb6305cc971..10fee67a6dcd 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6452,10 +6452,15 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "mov %%r13, %c[r13](%%" _ASM_CX ") \n\t" "mov %%r14, %c[r14](%%" _ASM_CX ") \n\t" "mov %%r15, %c[r15](%%" _ASM_CX ") \n\t" + /* - * Clear host registers marked as clobbered to prevent - * speculative use. - */ + * Clear all general purpose registers (except RSP, which is loaded by + * the CPU during VM-Exit) to prevent speculative use of the guest's + * values, even those that are saved/loaded via the stack. In theory, + * an L1 cache miss when restoring registers could lead to speculative + * execution with the guest's values. Zeroing XORs are dirt cheap, + * i.e. the extra paranoia is essentially free. 
+ */ "xor %%r8d, %%r8d \n\t" "xor %%r9d, %%r9d \n\t" "xor %%r10d, %%r10d \n\t" @@ -6470,8 +6475,11 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "xor %%eax, %%eax \n\t" "xor %%ebx, %%ebx \n\t" + "xor %%ecx, %%ecx \n\t" + "xor %%edx, %%edx \n\t" "xor %%esi, %%esi \n\t" "xor %%edi, %%edi \n\t" + "xor %%ebp, %%ebp \n\t" "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" : ASM_CALL_CONSTRAINT : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp), -- cgit From 831a3011294d9aa284afae9194619893d454d708 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:51 -0800 Subject: KVM: VMX: Modify only RSP when creating a placeholder for guest's RCX In the vCPU-run asm blob, the guest's RCX is temporarily saved onto the stack after VM-Exit as the exit flow must first load a register with a pointer to the vCPU's save area in order to save the guest's registers. RCX is arbitrarily designated as the scratch register. Since the stack usage is to (1)save host, (2)save guest, (3)load host and (4)load guest, the code can't conform to the stack's natural FIFO semantics, i.e. it can't simply do PUSH/POP. Regardless of whether it is done for the host's value or guest's value, at some point the code needs to access the stack using a non-traditional method, e.g. MOV instead of POP. vCPU-run opts to create a placeholder on the stack for guest's RCX (by adjusting RSP) and saves RCX to its place immediately after VM-Exit (via MOV). In other words, the purpose of the first 'PUSH RCX' at the start of the vCPU-run asm blob is to adjust RSP down, i.e. there's no need to actually access memory. Use 'SUB $wordsize, RSP' instead of 'PUSH RCX' to make it more obvious that the intent is simply to create a gap on the stack for the guest's RCX. Reviewed-by: Jim Mattson Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 10fee67a6dcd..cb0dd17c31c6 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6378,7 +6378,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) asm( /* Store host registers */ "push %%" _ASM_DX "; push %%" _ASM_BP ";" - "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */ + "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* placeholder for guest RCX */ "push %%" _ASM_CX " \n\t" "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ "cmp %%" _ASM_SP ", %c[host_rsp](%%" _ASM_CX ") \n\t" -- cgit From f3689e3f17f064fd4cd5f0cb01ae2395c94f39d9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:52 -0800 Subject: KVM: VMX: Save RSI to an unused output in the vCPU-run asm blob RSI is clobbered by the vCPU-run asm blob, but it's not marked as such, probably because GCC doesn't let you mark inputs as clobbered. "Save" RSI to a dummy output so that GCC recognizes it as being clobbered. 
Fixes: 773e8a0425c9 ("x86/kvm: use Enlightened VMCS when running on Hyper-V") Reviewed-by: Jim Mattson Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index cb0dd17c31c6..3ae51a09f420 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6481,7 +6481,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "xor %%edi, %%edi \n\t" "xor %%ebp, %%ebp \n\t" "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" - : ASM_CALL_CONSTRAINT + : ASM_CALL_CONSTRAINT, "=S"((int){0}) : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp), [launched]"i"(offsetof(struct vcpu_vmx, __launched)), [fail]"i"(offsetof(struct vcpu_vmx, fail)), -- cgit From ccf447434ee624b64a1c215d88453f1921aaa88a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:53 -0800 Subject: KVM: VMX: Manually load RDX in vCPU-run asm blob Load RDX with the VMCS.HOST_RSP field encoding on-demand instead of delegating to the compiler via an input constraint. In addition to saving one whole MOV instruction, this allows RDX to be properly clobbered (in a future patch) instead of being saved/loaded to/from the stack. Despite nested_vmx_check_vmentry_hw() having similar code, leave it alone, for now. In that case, RDX is unconditionally used and isn't clobbered, i.e. sending in HOST_RSP as an input is simpler. Note that because HOST_RSP is an enum and not a define, it must be redefined as an immediate instead of using __stringify(HOST_RSP). The naming "conflict" between host_rsp and HOST_RSP is slightly confusing, but the former will be removed in a future patch, at which point HOST_RSP is absolutely what is desired. 
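For context on the __stringify() limitation noted above, a small user-space sketch (the kernel macro is re-implemented here, and the 0x6c14 encoding is an assumption used only for illustration): stringification happens in the preprocessor, so a #define expands to its value before being stringized, whereas an enumerator is just an identifier to cpp and stringizes to its own name, which the assembler cannot resolve.

    #include <stdio.h>

    #define __stringify_1(x...)     #x
    #define __stringify(x...)       __stringify_1(x)

    #define HOST_RSP_AS_DEFINE      0x6c14          /* hypothetical #define variant */
    enum vmcs_field { HOST_RSP = 0x6c14 };          /* an enum, as in the real headers */

    int main(void)
    {
            puts(__stringify(HOST_RSP_AS_DEFINE));  /* "0x6c14"   - usable inside an asm string */
            puts(__stringify(HOST_RSP));            /* "HOST_RSP" - an unknown symbol to the assembler */
            return 0;
    }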
Reviewed-by: Jim Mattson Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3ae51a09f420..b935ba3306aa 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6390,6 +6390,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t" "jmp 1f \n\t" "2: \n\t" + "mov $%c[HOST_RSP], %%" _ASM_DX " \n\t" __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" "1: \n\t" "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ @@ -6482,10 +6483,11 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "xor %%ebp, %%ebp \n\t" "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" : ASM_CALL_CONSTRAINT, "=S"((int){0}) - : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp), + : "c"(vmx), "S"(evmcs_rsp), [launched]"i"(offsetof(struct vcpu_vmx, __launched)), [fail]"i"(offsetof(struct vcpu_vmx, fail)), [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), + [HOST_RSP]"i"(HOST_RSP), [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), -- cgit From 6f7c6d23b71af2e65056e777eeb2f1e6e0195f14 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:54 -0800 Subject: KVM: VMX: Let the compiler save/load RDX during vCPU-run Per commit c20363006af6 ("KVM: VMX: Let gcc to choose which registers to save (x86_64)"), the only reason RDX is saved/loaded to/from the stack is because it was specified as an input, i.e. couldn't be marked as clobbered (ignoring the fact that "saving" it to a dummy output would indirectly mark it as clobbered). Now that RDX is no longer an input, clobber it. 
Reviewed-by: Jim Mattson Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b935ba3306aa..e9d81cd8421f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6377,7 +6377,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) asm( /* Store host registers */ - "push %%" _ASM_DX "; push %%" _ASM_BP ";" + "push %%" _ASM_BP " \n\t" "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* placeholder for guest RCX */ "push %%" _ASM_CX " \n\t" "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ @@ -6481,7 +6481,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "xor %%esi, %%esi \n\t" "xor %%edi, %%edi \n\t" "xor %%ebp, %%ebp \n\t" - "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" + "pop %%" _ASM_BP " \n\t" : ASM_CALL_CONSTRAINT, "=S"((int){0}) : "c"(vmx), "S"(evmcs_rsp), [launched]"i"(offsetof(struct vcpu_vmx, __launched)), @@ -6509,10 +6509,10 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) [wordsize]"i"(sizeof(ulong)) : "cc", "memory" #ifdef CONFIG_X86_64 - , "rax", "rbx", "rdi" + , "rax", "rbx", "rdx", "rdi" , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" #else - , "eax", "ebx", "edi" + , "eax", "ebx", "edx", "edi" #endif ); } -- cgit From 9ce0a07a6f49822238fd4357c02e0dba060a43cc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:55 -0800 Subject: KVM: nVMX: Remove a rogue "rax" clobber from nested_vmx_check_vmentry_hw() RAX is not touched by nested_vmx_check_vmentry_hw(), directly or indirectly (e.g. vmx_vmenter()). Remove it from the clobber list. Fixes: 52017608da33 ("KVM: nVMX: add option to perform early consistency checks via H/W") Reviewed-by: Jim Mattson Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 50c9e04910ea..bb85e3a05424 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2773,7 +2773,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) [fail]"i"(offsetof(struct vcpu_vmx, fail)), [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), [wordsize]"i"(sizeof(ulong)) - : "rax", "cc", "memory" + : "cc", "memory" ); preempt_enable(); -- cgit From 98ff2acc91d836277d34558fbbba7678e04281ed Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:56 -0800 Subject: KVM: nVMX: Drop STACK_FRAME_NON_STANDARD from nested_vmx_check_vmentry_hw() ...as it doesn't technically actually do anything non-standard with the stack even though it modifies RSP in a weird way. E.g. RSP is loaded with VMCS.HOST_RSP if the VM-Enter gets far enough to trigger VM-Exit, but it's simply reloaded with the current value. 
Reviewed-by: Jim Mattson Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index bb85e3a05424..e40df725b952 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2809,8 +2809,6 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) return 0; } -STACK_FRAME_NON_STANDARD(nested_vmx_check_vmentry_hw); - static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12); -- cgit From 6c1e7e5b40f23b9e754a47852924115febba35df Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:57 -0800 Subject: KVM: nVMX: Explicitly reference the scratch reg in nested early checks Using %1 to reference RCX, i.e. the 'vmx' pointer, is obtuse and fragile, e.g. it results in cryptic and infuriating compile errors if the output constraints are touched by anything more than a gentle breeze. Reviewed-by: Jim Mattson Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e40df725b952..a562ecabc118 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2757,7 +2757,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) /* Set HOST_RSP */ "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" - "mov %%" _ASM_SP ", %c[host_rsp](%1)\n\t" + "mov %%" _ASM_SP ", %c[host_rsp](%% " _ASM_CX ")\n\t" "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ /* Check if vmlaunch or vmresume is needed */ -- cgit From f1727b4954772a778df0b73a93c4b646fd3c21f6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:58 -0800 Subject: KVM: nVMX: Capture VM-Fail to a local var in nested_vmx_check_vmentry_hw() Unlike the primary vCPU-run flow, the nested early checks code doesn't actually want to propagate VM-Fail back to 'vmx'. Yay copy+paste. In addition to eliminating the need to clear vmx->fail before returning, using a local boolean also drops a reference to 'vmx' in the asm blob. Dropping the reference to 'vmx' will save a register in the long run as future patches will shift all pointer references from 'vmx' to 'vmx->loaded_vmcs'. Fixes: 52017608da33 ("KVM: nVMX: add option to perform early consistency checks via H/W") Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index a562ecabc118..bfacf9029466 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2718,6 +2718,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long cr3, cr4; + bool vm_fail; if (!nested_early_check) return 0; @@ -2763,14 +2764,18 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) /* Check if vmlaunch or vmresume is needed */ "cmpb $0, %c[launched](%% " _ASM_CX")\n\t" + /* + * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set + * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail + * Valid.
vmx_vmenter() directly "returns" RFLAGS, and so the + * results of VM-Enter is captured via SETBE to vm_fail. + */ "call vmx_vmenter\n\t" - /* Set vmx->fail accordingly */ - "setbe %c[fail](%% " _ASM_CX")\n\t" + "setbe %[fail]\n\t" - : ASM_CALL_CONSTRAINT + : ASM_CALL_CONSTRAINT, [fail]"=qm"(vm_fail) : "c"(vmx), "d"((unsigned long)HOST_RSP), [launched]"i"(offsetof(struct vcpu_vmx, __launched)), - [fail]"i"(offsetof(struct vcpu_vmx, fail)), [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), [wordsize]"i"(sizeof(ulong)) : "cc", "memory" @@ -2783,10 +2788,9 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) if (vmx->msr_autoload.guest.nr) vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); - if (vmx->fail) { + if (vm_fail) { WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != VMXERR_ENTRY_INVALID_CONTROL_FIELD); - vmx->fail = 0; return 1; } -- cgit From bbc0b8239257d91cd43fe219de5552dee8bb9123 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:40:59 -0800 Subject: KVM: nVMX: Capture VM-Fail via CC_{SET,OUT} in nested early checks ...to take advantage of __GCC_ASM_FLAG_OUTPUTS__ when possible. Reviewed-by: Jim Mattson Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index bfacf9029466..67f8fdf568eb 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2768,12 +2768,12 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail * Valid. vmx_vmenter() directly "returns" RFLAGS, and so the - * results of VM-Enter is captured via SETBE to vm_fail. + * results of VM-Enter is captured via CC_{SET,OUT} to vm_fail. */ "call vmx_vmenter\n\t" - "setbe %[fail]\n\t" - : ASM_CALL_CONSTRAINT, [fail]"=qm"(vm_fail) + CC_SET(be) + : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail) : "c"(vmx), "d"((unsigned long)HOST_RSP), [launched]"i"(offsetof(struct vcpu_vmx, __launched)), [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), [wordsize]"i"(sizeof(ulong)) : "cc", "memory" -- cgit From 74dfa2784e961fae66143b811f45105b43c63046 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:00 -0800 Subject: KVM: nVMX: Reference vmx->loaded_vmcs->launched directly Temporarily propagating vmx->loaded_vmcs->launched to vmx->__launched is not functionally necessary, but rather was done historically to avoid passing both 'vmx' and 'loaded_vmcs' to the vCPU-run asm blob. Nested early checks inherited this behavior by virtue of copy+paste. A future patch will move HOST_RSP caching to be per-VMCS, i.e. store 'host_rsp' in loaded VMCS. Now that the reference to 'vmx->fail' is also gone from nested early checks, referencing 'loaded_vmcs' directly means we can drop the 'vmx' reference when introducing per-VMCS RSP caching. And it means __launched can be dropped from struct vcpu_vmx if/when vCPU-run receives similar treatment. Note the use of a named register constraint for 'loaded_vmcs'. Using RCX to hold 'vmx' was inherited from vCPU-run. In the vCPU-run case, the scratch register needs to be explicitly defined as it is crushed when loading guest state, i.e. deferring to the compiler would corrupt the pointer. Since nested early checks never loads guest state, it's a-ok to let the compiler pick any register.
Naming the constraint avoids the fragility of referencing constraints via %1, %2, etc.., which breaks horribly when modifying constraints, and generally makes the asm blob more readable. Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 67f8fdf568eb..47299e10522f 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2752,8 +2752,6 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) vmx->loaded_vmcs->host_state.cr4 = cr4; } - vmx->__launched = vmx->loaded_vmcs->launched; - asm( /* Set HOST_RSP */ "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ @@ -2762,7 +2760,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ /* Check if vmlaunch or vmresume is needed */ - "cmpb $0, %c[launched](%% " _ASM_CX")\n\t" + "cmpb $0, %c[launched](%[loaded_vmcs])\n\t" /* * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set @@ -2775,7 +2773,8 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) CC_SET(be) : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail) : "c"(vmx), "d"((unsigned long)HOST_RSP), - [launched]"i"(offsetof(struct vcpu_vmx, __launched)), + [loaded_vmcs]"r"(vmx->loaded_vmcs), + [launched]"i"(offsetof(struct loaded_vmcs, launched)), [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), [wordsize]"i"(sizeof(ulong)) : "cc", "memory" -- cgit From fbda0fd31a6d683637f848ba17956048dd0c7e48 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:01 -0800 Subject: KVM: nVMX: Let the compiler select the reg for holding HOST_RSP ...and provide an explicit name for the constraint. Naming the input constraint makes the code self-documenting and also avoids the fragility of numerically referring to constraints, e.g. %4 breaks badly whenever the constraints are modified. Explicitly using RDX was inherited from vCPU-run, i.e. completely arbitrary. Even vCPU-run doesn't truly need to explicitly use RDX, but doing so is more robust as vCPU-run needs tight control over its register usage. Note that while the naming "conflict" between host_rsp and HOST_RSP is slightly confusing, the former will be renamed slightly in a future patch, at which point HOST_RSP is absolutely what is desired. 
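To illustrate the difference (a standalone x86 sketch, not from the patch; the operand names are invented): with positional references, adding or removing any constraint silently renumbers every later operand, whereas named operands keep working and document themselves.

    #include <stdio.h>

    static unsigned long scaled_sum(unsigned long base, unsigned long idx)
    {
            unsigned long ret;

            /* The fragile positional spelling would be "lea (%1,%2,4), %0". */
            asm("lea (%[base],%[idx],4), %[out]"
                : [out] "=r" (ret)
                : [base] "r" (base), [idx] "r" (idx));
            return ret;
    }

    int main(void)
    {
            printf("%lu\n", scaled_sum(2, 10));     /* 42 */
            return 0;
    }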
Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 47299e10522f..23a2c1b91389 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2753,9 +2753,8 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) } asm( - /* Set HOST_RSP */ "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ - __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" + __ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t" "mov %%" _ASM_SP ", %c[host_rsp](%% " _ASM_CX ")\n\t" "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ @@ -2772,7 +2771,8 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) CC_SET(be) : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail) - : "c"(vmx), "d"((unsigned long)HOST_RSP), + : "c"(vmx), + [HOST_RSP]"r"((unsigned long)HOST_RSP), [loaded_vmcs]"r"(vmx->loaded_vmcs), [launched]"i"(offsetof(struct loaded_vmcs, launched)), [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), -- cgit From 5a8781607e677eda60b20e0a4c91d2a5f12f9244 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:02 -0800 Subject: KVM: nVMX: Cache host_rsp on a per-VMCS basis Currently, host_rsp is cached on a per-vCPU basis, i.e. it's stored in struct vcpu_vmx. In non-nested usage the caching is for all intents and purposes 100% effective, e.g. only the first VMLAUNCH needs to synchronize VMCS.HOST_RSP since the call stack to vmx_vcpu_run() is identical each and every time. But when running a nested guest, KVM must invalidate the cache when switching the current VMCS as it can't guarantee the new VMCS has the same HOST_RSP as the previous VMCS. In other words, the cache loses almost all of its efficacy when running a nested VM. Move host_rsp to struct vmcs_host_state, which is per-VMCS, so that it is cached on a per-VMCS basis and restores its 100% hit rate when nested VMs are in play. Note that the host_rsp cache for vmcs02 essentially "breaks" when nested early checks are enabled as nested_vmx_check_vmentry_hw() will see a different RSP at the time of its VM-Enter. While it's possible to avoid even that VMCS.HOST_RSP synchronization, e.g. by employing a dedicated VM-Exit stack, there is little motivation for doing so as the overhead of two VMWRITEs (~55 cycles) is dwarfed by the overhead of the extra VMX transition (600+ cycles) and is a proverbial drop in the ocean relative to the total cost of a nested transition (10s of thousands of cycles). Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 24 ++++++------------------ arch/x86/kvm/vmx/vmcs.h | 1 + arch/x86/kvm/vmx/vmx.c | 13 ++++++------- arch/x86/kvm/vmx/vmx.h | 1 - 4 files changed, 13 insertions(+), 26 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 23a2c1b91389..0e67649e39ce 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1979,17 +1979,6 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) prepare_vmcs02_early_full(vmx, vmcs12); - /* - * HOST_RSP is normally set correctly in vmx_vcpu_run() just before - * entry, but only if the current (host) sp changed from the value - * we wrote last (vmx->host_rsp).
This cache is no longer relevant - * if we switch vmcs, and rather than hold a separate cache per vmcs, - * here we just force the write to happen on entry. host_rsp will - * also be written unconditionally by nested_vmx_check_vmentry_hw() - * if we are doing early consistency checks via hardware. - */ - vmx->host_rsp = 0; - /* * PIN CONTROLS */ @@ -2754,8 +2743,11 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) asm( "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ + "cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t" + "je 1f \n\t" __ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t" - "mov %%" _ASM_SP ", %c[host_rsp](%% " _ASM_CX ")\n\t" + "mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t" + "1: \n\t" "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ /* Check if vmlaunch or vmresume is needed */ @@ -2771,11 +2763,10 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) CC_SET(be) : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail) - : "c"(vmx), - [HOST_RSP]"r"((unsigned long)HOST_RSP), + : [HOST_RSP]"r"((unsigned long)HOST_RSP), [loaded_vmcs]"r"(vmx->loaded_vmcs), [launched]"i"(offsetof(struct loaded_vmcs, launched)), - [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), + [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)), [wordsize]"i"(sizeof(ulong)) : "cc", "memory" ); @@ -3912,9 +3903,6 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmx_flush_tlb(vcpu, true); } - /* This is needed for same reason as it was needed in prepare_vmcs02 */ - vmx->host_rsp = 0; - /* Unpin physical memory we referred to in vmcs02 */ if (vmx->nested.apic_access_page) { kvm_release_page_dirty(vmx->nested.apic_access_page); diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 6def3ba88e3b..cb6079f8a227 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -34,6 +34,7 @@ struct vmcs_host_state { unsigned long cr4; /* May not match real cr4 */ unsigned long gs_base; unsigned long fs_base; + unsigned long rsp; u16 fs_sel, gs_sel, ldt_sel; #ifdef CONFIG_X86_64 diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e9d81cd8421f..12bb61e7aca6 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6381,9 +6381,9 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* placeholder for guest RCX */ "push %%" _ASM_CX " \n\t" "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ - "cmp %%" _ASM_SP ", %c[host_rsp](%%" _ASM_CX ") \n\t" + "cmp %%" _ASM_SP ", (%%" _ASM_DI ") \n\t" "je 1f \n\t" - "mov %%" _ASM_SP ", %c[host_rsp](%%" _ASM_CX ") \n\t" + "mov %%" _ASM_SP ", (%%" _ASM_DI ") \n\t" /* Avoid VMWRITE when Enlightened VMCS is in use */ "test %%" _ASM_SI ", %%" _ASM_SI " \n\t" "jz 2f \n\t" @@ -6482,11 +6482,10 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "xor %%edi, %%edi \n\t" "xor %%ebp, %%ebp \n\t" "pop %%" _ASM_BP " \n\t" - : ASM_CALL_CONSTRAINT, "=S"((int){0}) - : "c"(vmx), "S"(evmcs_rsp), + : ASM_CALL_CONSTRAINT, "=D"((int){0}), "=S"((int){0}) + : "c"(vmx), "D"(&vmx->loaded_vmcs->host_state.rsp), "S"(evmcs_rsp), [launched]"i"(offsetof(struct vcpu_vmx, __launched)), [fail]"i"(offsetof(struct vcpu_vmx, fail)), - [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), [HOST_RSP]"i"(HOST_RSP), [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), @@ -6509,10 +6508,10 
@@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) [wordsize]"i"(sizeof(ulong)) : "cc", "memory" #ifdef CONFIG_X86_64 - , "rax", "rbx", "rdx", "rdi" + , "rax", "rbx", "rdx" , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" #else - , "eax", "ebx", "edx", "edi" + , "eax", "ebx", "edx" #endif ); } diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 99328954c2fc..8e203b725928 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -175,7 +175,6 @@ struct nested_vmx { struct vcpu_vmx { struct kvm_vcpu vcpu; - unsigned long host_rsp; u8 fail; u8 msr_bitmap_mode; u32 exit_intr_info; -- cgit From 47e97c099bbcb3211b22456679991095c0578da2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:03 -0800 Subject: KVM: VMX: Load/save guest CR2 via C code in __vmx_vcpu_run() ...to eliminate its parameter and struct vcpu_vmx offset definition from the assembly blob. Accessing CR2 from C versus assembly doesn't change the likelihood of taking a page fault (and modifying CR2) while it's loaded with the guest's value, so long as we don't do anything silly between accessing CR2 and VM-Enter/VM-Exit. Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 12bb61e7aca6..5e43999ece1d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6375,6 +6375,9 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) if (static_branch_unlikely(&vmx_l1d_should_flush)) vmx_l1d_flush(vcpu); + if (vcpu->arch.cr2 != read_cr2()) + write_cr2(vcpu->arch.cr2); + asm( /* Store host registers */ "push %%" _ASM_BP " \n\t" @@ -6395,13 +6398,6 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "1: \n\t" "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ - /* Reload cr2 if changed */ - "mov %c[cr2](%%" _ASM_CX "), %%" _ASM_AX " \n\t" - "mov %%cr2, %%" _ASM_DX " \n\t" - "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t" - "je 3f \n\t" - "mov %%" _ASM_AX", %%cr2 \n\t" - "3: \n\t" /* Check if vmlaunch or vmresume is needed */ "cmpb $0, %c[launched](%%" _ASM_CX ") \n\t" /* Load guest registers. Don't clobber flags. 
*/ @@ -6471,9 +6467,6 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "xor %%r14d, %%r14d \n\t" "xor %%r15d, %%r15d \n\t" #endif - "mov %%cr2, %%" _ASM_AX " \n\t" - "mov %%" _ASM_AX ", %c[cr2](%%" _ASM_CX ") \n\t" - "xor %%eax, %%eax \n\t" "xor %%ebx, %%ebx \n\t" "xor %%ecx, %%ecx \n\t" @@ -6504,7 +6497,6 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), #endif - [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)), [wordsize]"i"(sizeof(ulong)) : "cc", "memory" @@ -6514,6 +6506,8 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) , "eax", "ebx", "edx" #endif ); + + vcpu->arch.cr2 = read_cr2(); } STACK_FRAME_NON_STANDARD(__vmx_vcpu_run); -- cgit From c09b03eb7f964370149577fdba425623b7f5b0b5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:04 -0800 Subject: KVM: VMX: Update VMCS.HOST_RSP via helper C function Providing a helper function to update HOST_RSP is visibly easier to read, and more importantly (for the future) eliminates two arguments to the VM-Enter assembly blob. Reducing the number of arguments to the asm blob is for all intents and purposes a prerequisite to moving the code to a proper assembly routine. It's not truly mandatory, but it greatly simplifies the future code, and the cost of the extra CALL+RET is negligible in the grand scheme. Note that although _ASM_ARG[1-3] can be used in the inline asm itself, the input/output constraints need to be manually defined. gcc will actually compile with _ASM_ARG[1-3] specified as constraints, but what it actually ends up doing with the bogus constraint is unknown. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 51 +++++++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 25 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5e43999ece1d..4b8a94fedb76 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6363,15 +6363,18 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) vmx->loaded_vmcs->hv_timer_armed = false; } -static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) +void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) { - unsigned long evmcs_rsp; + if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) { + vmx->loaded_vmcs->host_state.rsp = host_rsp; + vmcs_writel(HOST_RSP, host_rsp); + } +} +static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) +{ vmx->__launched = vmx->loaded_vmcs->launched; - evmcs_rsp = static_branch_unlikely(&enable_evmcs) ?
- (unsigned long)&current_evmcs->host_rsp : 0; - if (static_branch_unlikely(&vmx_l1d_should_flush)) vmx_l1d_flush(vcpu); @@ -6382,21 +6385,14 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) /* Store host registers */ "push %%" _ASM_BP " \n\t" "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* placeholder for guest RCX */ - "push %%" _ASM_CX " \n\t" - "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ - "cmp %%" _ASM_SP ", (%%" _ASM_DI ") \n\t" - "je 1f \n\t" - "mov %%" _ASM_SP ", (%%" _ASM_DI ") \n\t" - /* Avoid VMWRITE when Enlightened VMCS is in use */ - "test %%" _ASM_SI ", %%" _ASM_SI " \n\t" - "jz 2f \n\t" - "mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t" - "jmp 1f \n\t" - "2: \n\t" - "mov $%c[HOST_RSP], %%" _ASM_DX " \n\t" - __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" - "1: \n\t" - "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ + "push %%" _ASM_ARG1 " \n\t" + + /* Adjust RSP to account for the CALL to vmx_vmenter(). */ + "lea -%c[wordsize](%%" _ASM_SP "), %%" _ASM_ARG2 " \n\t" + "call vmx_update_host_rsp \n\t" + + /* Load the vcpu_vmx pointer to RCX. */ + "mov (%%" _ASM_SP "), %%" _ASM_CX " \n\t" /* Check if vmlaunch or vmresume is needed */ "cmpb $0, %c[launched](%%" _ASM_CX ") \n\t" @@ -6475,11 +6471,16 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "xor %%edi, %%edi \n\t" "xor %%ebp, %%ebp \n\t" "pop %%" _ASM_BP " \n\t" - : ASM_CALL_CONSTRAINT, "=D"((int){0}), "=S"((int){0}) - : "c"(vmx), "D"(&vmx->loaded_vmcs->host_state.rsp), "S"(evmcs_rsp), + : ASM_CALL_CONSTRAINT, +#ifdef CONFIG_X86_64 + "=D"((int){0}) + : "D"(vmx), +#else + "=a"((int){0}) + : "a"(vmx), +#endif [launched]"i"(offsetof(struct vcpu_vmx, __launched)), [fail]"i"(offsetof(struct vcpu_vmx, fail)), - [HOST_RSP]"i"(HOST_RSP), [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), @@ -6500,10 +6501,10 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) [wordsize]"i"(sizeof(ulong)) : "cc", "memory" #ifdef CONFIG_X86_64 - , "rax", "rbx", "rdx" + , "rax", "rbx", "rcx", "rdx", "rsi" , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" #else - , "eax", "ebx", "edx" + , "ebx", "ecx", "edx", "edi", "esi" #endif ); -- cgit From c9afc58cc368623aaa34a62e321eea2a59240f6f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:05 -0800 Subject: KVM: VMX: Pass "launched" directly to the vCPU-run asm blob ...and remove struct vcpu_vmx's temporary __launched variable. Eliminating __launched is a bonus; the real motivation is to get to the point where the only reference to struct vcpu_vmx in the asm code is to vcpu.arch.regs, which will simplify moving the blob to a proper asm file. Note that this also means the approach is deliberately different than what is used in nested_vmx_check_vmentry_hw(). Use BL as it is a callee-save register in both 32-bit and 64-bit ABIs, i.e. it can't be modified by vmx_update_host_rsp(), to avoid having to temporarily save/restore the launched flag.
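A user-space model of the BL trick described above (a sketch only; the constants standing in for the VMLAUNCH/VMRESUME paths are arbitrary): the flag is pinned to RBX via the "b" constraint, and because RBX/EBX is callee-saved in both ABIs, whatever is staged in BL before a CALL (vmx_update_host_rsp() in the real code) is still there for the CMPB afterwards.

    #include <stdbool.h>
    #include <stdio.h>

    static int pick_vmenter(bool launched)
    {
            int ret;

            asm("cmpb $0, %%bl\n\t"
                "jne 1f\n\t"
                "mov $10, %[out]\n\t"       /* stands in for the VMLAUNCH path */
                "jmp 2f\n"
                "1:\tmov $20, %[out]\n"     /* stands in for the VMRESUME path */
                "2:"
                : [out] "=&r" (ret)
                : "b" (launched)
                : "cc");
            return ret;
    }

    int main(void)
    {
            printf("%d %d\n", pick_vmenter(false), pick_vmenter(true));     /* 10 20 */
            return 0;
    }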
Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 13 ++++++------- arch/x86/kvm/vmx/vmx.h | 2 +- 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4b8a94fedb76..996a13ea86cc 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6373,8 +6373,6 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) { - vmx->__launched = vmx->loaded_vmcs->launched; - if (static_branch_unlikely(&vmx_l1d_should_flush)) vmx_l1d_flush(vcpu); @@ -6395,7 +6393,8 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "mov (%%" _ASM_SP "), %%" _ASM_CX " \n\t" /* Check if vmlaunch or vmresume is needed */ - "cmpb $0, %c[launched](%%" _ASM_CX ") \n\t" + "cmpb $0, %%bl \n\t" + /* Load guest registers. Don't clobber flags. */ "mov %c[rax](%%" _ASM_CX "), %%" _ASM_AX " \n\t" "mov %c[rbx](%%" _ASM_CX "), %%" _ASM_BX " \n\t" @@ -6471,7 +6470,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "xor %%edi, %%edi \n\t" "xor %%ebp, %%ebp \n\t" "pop %%" _ASM_BP " \n\t" - : ASM_CALL_CONSTRAINT, + : ASM_CALL_CONSTRAINT, "=b"((int){0}), #ifdef CONFIG_X86_64 "=D"((int){0}) : "D"(vmx), @@ -6479,7 +6478,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "=a"((int){0}) : "a"(vmx), #endif - [launched]"i"(offsetof(struct vcpu_vmx, __launched)), + "b"(vmx->loaded_vmcs->launched), [fail]"i"(offsetof(struct vcpu_vmx, fail)), [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), @@ -6501,10 +6500,10 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) [wordsize]"i"(sizeof(ulong)) : "cc", "memory" #ifdef CONFIG_X86_64 - , "rax", "rbx", "rcx", "rdx", "rsi" + , "rax", "rcx", "rdx", "rsi" , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" #else - , "ebx", "ecx", "edx", "edi", "esi" + , "ecx", "edx", "edi", "esi" #endif ); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 8e203b725928..6ee6a492efaf 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -208,7 +208,7 @@ struct vcpu_vmx { struct loaded_vmcs vmcs01; struct loaded_vmcs *loaded_vmcs; struct loaded_vmcs *loaded_cpu_state; - bool __launched; /* temporary, used in vmx_vcpu_run */ + struct msr_autoload { struct vmx_msrs guest; struct vmx_msrs host; -- cgit From 217aaff53c25f03b1d2fc23eff9dc2bae34f690e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:06 -0800 Subject: KVM: VMX: Invert the ordering of saving guest/host scratch reg at VM-Enter Switching the ordering allows for an out-of-line path for VM-Fail that elides saving guest state but still shares the register clearing with the VM-Exit path. 
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 996a13ea86cc..79b42197ed7e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6382,7 +6382,6 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) asm( /* Store host registers */ "push %%" _ASM_BP " \n\t" - "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* placeholder for guest RCX */ "push %%" _ASM_ARG1 " \n\t" /* Adjust RSP to account for the CALL to vmx_vmenter(). */ @@ -6418,11 +6417,11 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) /* Enter guest mode */ "call vmx_vmenter\n\t" - /* Save guest's RCX to the stack placeholder (see above) */ - "mov %%" _ASM_CX ", %c[wordsize](%%" _ASM_SP ") \n\t" + /* Temporarily save guest's RCX. */ + "push %%" _ASM_CX " \n\t" - /* Load host's RCX, i.e. the vmx_vcpu pointer */ - "pop %%" _ASM_CX " \n\t" + /* Reload the vcpu_vmx pointer to RCX. */ + "mov %c[wordsize](%%" _ASM_SP "), %%" _ASM_CX " \n\t" /* Set vmx->fail based on EFLAGS.{CF,ZF} */ "setbe %c[fail](%%" _ASM_CX ")\n\t" @@ -6469,6 +6468,9 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "xor %%esi, %%esi \n\t" "xor %%edi, %%edi \n\t" "xor %%ebp, %%ebp \n\t" + + /* "POP" the vcpu_vmx pointer. */ + "add $%c[wordsize], %%" _ASM_SP " \n\t" "pop %%" _ASM_BP " \n\t" : ASM_CALL_CONSTRAINT, "=b"((int){0}), #ifdef CONFIG_X86_64 -- cgit From f78d0971b7bd5bf4373a1fac27f176af5d5594ed Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:07 -0800 Subject: KVM: VMX: Don't save guest registers after VM-Fail A failed VM-Enter (obviously) didn't succeed, meaning the CPU never executed an instruction in guest mode and so can't have changed the general purpose registers. In addition to saving some instructions in the VM-Fail case, this also provides a separate path entirely and thus an opportunity to propagate the fail condition to vmx->fail via register without introducing undue pain. Using a register, as opposed to directly referencing vmx->fail, eliminates the need to pass the offset of 'fail', which will simplify moving the code to proper assembly in future patches. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 79b42197ed7e..1dcd3f157a70 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6416,6 +6416,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) /* Enter guest mode */ "call vmx_vmenter\n\t" + "jbe 2f \n\t" /* Temporarily save guest's RCX. */ "push %%" _ASM_CX " \n\t" @@ -6423,9 +6424,6 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) /* Reload the vcpu_vmx pointer to RCX.
*/ "mov %c[wordsize](%%" _ASM_SP "), %%" _ASM_CX " \n\t" - /* Set vmx->fail based on EFLAGS.{CF,ZF} */ - "setbe %c[fail](%%" _ASM_CX ")\n\t" - /* Save all guest registers, including RCX from the stack */ "mov %%" _ASM_AX ", %c[rax](%%" _ASM_CX ") \n\t" "mov %%" _ASM_BX ", %c[rbx](%%" _ASM_CX ") \n\t" @@ -6443,15 +6441,22 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "mov %%r13, %c[r13](%%" _ASM_CX ") \n\t" "mov %%r14, %c[r14](%%" _ASM_CX ") \n\t" "mov %%r15, %c[r15](%%" _ASM_CX ") \n\t" +#endif + + /* Clear EBX to indicate VM-Exit (as opposed to VM-Fail). */ + "xor %%ebx, %%ebx \n\t" /* - * Clear all general purpose registers (except RSP, which is loaded by - * the CPU during VM-Exit) to prevent speculative use of the guest's - * values, even those that are saved/loaded via the stack. In theory, - * an L1 cache miss when restoring registers could lead to speculative - * execution with the guest's values. Zeroing XORs are dirt cheap, - * i.e. the extra paranoia is essentially free. + * Clear all general purpose registers except RSP and RBX to prevent + * speculative use of the guest's values, even those that are reloaded + * via the stack. In theory, an L1 cache miss when restoring registers + * could lead to speculative execution with the guest's values. + * Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially + * free. RSP and RBX are exempt as RSP is restored by hardware during + * VM-Exit and RBX is explicitly loaded with 0 or 1 to "return" VM-Fail. */ + "1: \n\t" +#ifdef CONFIG_X86_64 "xor %%r8d, %%r8d \n\t" "xor %%r9d, %%r9d \n\t" "xor %%r10d, %%r10d \n\t" @@ -6462,7 +6467,6 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "xor %%r15d, %%r15d \n\t" #endif "xor %%eax, %%eax \n\t" - "xor %%ebx, %%ebx \n\t" "xor %%ecx, %%ecx \n\t" "xor %%edx, %%edx \n\t" "xor %%esi, %%esi \n\t" @@ -6472,7 +6476,15 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) /* "POP" the vcpu_vmx pointer. */ "add $%c[wordsize], %%" _ASM_SP " \n\t" "pop %%" _ASM_BP " \n\t" - : ASM_CALL_CONSTRAINT, "=b"((int){0}), + "jmp 3f \n\t" + + /* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. */ + "2: \n\t" + "mov $1, %%ebx \n\t" + "jmp 1b \n\t" + "3: \n\t" + + : ASM_CALL_CONSTRAINT, "=b"(vmx->fail), #ifdef CONFIG_X86_64 "=D"((int){0}) : "D"(vmx), @@ -6481,7 +6493,6 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) : "a"(vmx), #endif "b"(vmx->loaded_vmcs->launched), - [fail]"i"(offsetof(struct vcpu_vmx, fail)), [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), -- cgit From d55892049171872a8f1aba542aa0691efe3da52d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:08 -0800 Subject: KVM: VMX: Use vcpu->arch.regs directly when saving/loading guest state ...now that all other references to struct vcpu_vmx have been removed. Note that 'vmx' still needs to be passed into the asm blob in _ASM_ARG1 as it is consumed by vmx_update_host_rsp(). And similar to that code, use _ASM_ARG2 in the assembly code to prepare for moving to proper asm, while explicitly referencing the exact registers in the clobber list for clarity in the short term and to avoid additional precompiler games. 
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 53 +++++++++++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 24 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1dcd3f157a70..cd92568bed26 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6382,13 +6382,18 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) asm( /* Store host registers */ "push %%" _ASM_BP " \n\t" - "push %%" _ASM_ARG1 " \n\t" + + /* + * Save @regs, _ASM_ARG2 may be modified by vmx_update_host_rsp() and + * @regs is needed after VM-Exit to save the guest's register values. + */ + "push %%" _ASM_ARG2 " \n\t" /* Adjust RSP to account for the CALL to vmx_vmenter(). */ "lea -%c[wordsize](%%" _ASM_SP "), %%" _ASM_ARG2 " \n\t" "call vmx_update_host_rsp \n\t" - /* Load the vcpu_vmx pointer to RCX. */ + /* Load RCX with @regs. */ "mov (%%" _ASM_SP "), %%" _ASM_CX " \n\t" /* Check if vmlaunch or vmresume is needed */ @@ -6421,7 +6426,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) /* Temporarily save guest's RCX. */ "push %%" _ASM_CX " \n\t" - /* Reload the vcpu_vmx pointer to RCX. */ + /* Reload RCX with @regs. */ "mov %c[wordsize](%%" _ASM_SP "), %%" _ASM_CX " \n\t" /* Save all guest registers, including RCX from the stack */ @@ -6486,37 +6491,37 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) : ASM_CALL_CONSTRAINT, "=b"(vmx->fail), #ifdef CONFIG_X86_64 - "=D"((int){0}) - : "D"(vmx), + "=D"((int){0}), "=S"((int){0}) + : "D"(vmx), "S"(&vcpu->arch.regs), #else - "=a"((int){0}) - : "a"(vmx), + "=a"((int){0}), "=d"((int){0}) + : "a"(vmx), "d"(&vcpu->arch.regs), #endif "b"(vmx->loaded_vmcs->launched), - [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), - [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), - [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), - [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])), - [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])), - [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])), - [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])), + [rax]"i"(VCPU_REGS_RAX * sizeof(ulong)), + [rbx]"i"(VCPU_REGS_RBX * sizeof(ulong)), + [rcx]"i"(VCPU_REGS_RCX * sizeof(ulong)), + [rdx]"i"(VCPU_REGS_RDX * sizeof(ulong)), + [rsi]"i"(VCPU_REGS_RSI * sizeof(ulong)), + [rdi]"i"(VCPU_REGS_RDI * sizeof(ulong)), + [rbp]"i"(VCPU_REGS_RBP * sizeof(ulong)), #ifdef CONFIG_X86_64 - [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])), - [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])), - [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])), - [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])), - [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])), - [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])), - [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), - [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), + [r8]"i"(VCPU_REGS_R8 * sizeof(ulong)), + [r9]"i"(VCPU_REGS_R9 * sizeof(ulong)), + [r10]"i"(VCPU_REGS_R10 * sizeof(ulong)), + [r11]"i"(VCPU_REGS_R11 * sizeof(ulong)), + [r12]"i"(VCPU_REGS_R12 * sizeof(ulong)), + [r13]"i"(VCPU_REGS_R13 * sizeof(ulong)), + [r14]"i"(VCPU_REGS_R14 * sizeof(ulong)), + [r15]"i"(VCPU_REGS_R15 * sizeof(ulong)), #endif [wordsize]"i"(sizeof(ulong)) 
: "cc", "memory" #ifdef CONFIG_X86_64 - , "rax", "rcx", "rdx", "rsi" + , "rax", "rcx", "rdx" , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" #else - , "ecx", "edx", "edi", "esi" + , "ecx", "edi", "esi" #endif ); -- cgit From c14f9dd50b01b55834a757dd50af35b8e168512d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:10 -0800 Subject: KVM: VMX: Use #defines in place of immediates in VM-Enter inline asm ...to prepare for moving the inline asm to a proper asm sub-routine. Eliminating the immediates allows a nearly verbatim move, e.g. quotes, newlines, tabs and __stringify need to be dropped, but other than those cosmetic changes the only function change will be to replace the final "jmp" with a "ret". Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 113 ++++++++++++++++++++++++++----------------------- 1 file changed, 61 insertions(+), 52 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index cd92568bed26..12fb342218ad 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6371,6 +6371,33 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) } } +#ifdef CONFIG_X86_64 +#define WORD_SIZE 8 +#else +#define WORD_SIZE 4 +#endif + +#define _WORD_SIZE __stringify(WORD_SIZE) + +#define VCPU_RAX __stringify(__VCPU_REGS_RAX * WORD_SIZE) +#define VCPU_RCX __stringify(__VCPU_REGS_RCX * WORD_SIZE) +#define VCPU_RDX __stringify(__VCPU_REGS_RDX * WORD_SIZE) +#define VCPU_RBX __stringify(__VCPU_REGS_RBX * WORD_SIZE) +/* Intentionally omit %RSP as it's context switched by hardware */ +#define VCPU_RBP __stringify(__VCPU_REGS_RBP * WORD_SIZE) +#define VCPU_RSI __stringify(__VCPU_REGS_RSI * WORD_SIZE) +#define VCPU_RDI __stringify(__VCPU_REGS_RDI * WORD_SIZE) +#ifdef CONFIG_X86_64 +#define VCPU_R8 __stringify(__VCPU_REGS_R8 * WORD_SIZE) +#define VCPU_R9 __stringify(__VCPU_REGS_R9 * WORD_SIZE) +#define VCPU_R10 __stringify(__VCPU_REGS_R10 * WORD_SIZE) +#define VCPU_R11 __stringify(__VCPU_REGS_R11 * WORD_SIZE) +#define VCPU_R12 __stringify(__VCPU_REGS_R12 * WORD_SIZE) +#define VCPU_R13 __stringify(__VCPU_REGS_R13 * WORD_SIZE) +#define VCPU_R14 __stringify(__VCPU_REGS_R14 * WORD_SIZE) +#define VCPU_R15 __stringify(__VCPU_REGS_R15 * WORD_SIZE) +#endif + static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) { if (static_branch_unlikely(&vmx_l1d_should_flush)) @@ -6390,7 +6417,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "push %%" _ASM_ARG2 " \n\t" /* Adjust RSP to account for the CALL to vmx_vmenter(). */ - "lea -%c[wordsize](%%" _ASM_SP "), %%" _ASM_ARG2 " \n\t" + "lea -" _WORD_SIZE "(%%" _ASM_SP "), %%" _ASM_ARG2 " \n\t" "call vmx_update_host_rsp \n\t" /* Load RCX with @regs. */ @@ -6400,24 +6427,24 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "cmpb $0, %%bl \n\t" /* Load guest registers. Don't clobber flags. 
*/ - "mov %c[rax](%%" _ASM_CX "), %%" _ASM_AX " \n\t" - "mov %c[rbx](%%" _ASM_CX "), %%" _ASM_BX " \n\t" - "mov %c[rdx](%%" _ASM_CX "), %%" _ASM_DX " \n\t" - "mov %c[rsi](%%" _ASM_CX "), %%" _ASM_SI " \n\t" - "mov %c[rdi](%%" _ASM_CX "), %%" _ASM_DI " \n\t" - "mov %c[rbp](%%" _ASM_CX "), %%" _ASM_BP " \n\t" + "mov " VCPU_RAX "(%%" _ASM_CX "), %%" _ASM_AX " \n\t" + "mov " VCPU_RBX "(%%" _ASM_CX "), %%" _ASM_BX " \n\t" + "mov " VCPU_RDX "(%%" _ASM_CX "), %%" _ASM_DX " \n\t" + "mov " VCPU_RSI "(%%" _ASM_CX "), %%" _ASM_SI " \n\t" + "mov " VCPU_RDI "(%%" _ASM_CX "), %%" _ASM_DI " \n\t" + "mov " VCPU_RBP "(%%" _ASM_CX "), %%" _ASM_BP " \n\t" #ifdef CONFIG_X86_64 - "mov %c[r8](%%" _ASM_CX "), %%r8 \n\t" - "mov %c[r9](%%" _ASM_CX "), %%r9 \n\t" - "mov %c[r10](%%" _ASM_CX "), %%r10 \n\t" - "mov %c[r11](%%" _ASM_CX "), %%r11 \n\t" - "mov %c[r12](%%" _ASM_CX "), %%r12 \n\t" - "mov %c[r13](%%" _ASM_CX "), %%r13 \n\t" - "mov %c[r14](%%" _ASM_CX "), %%r14 \n\t" - "mov %c[r15](%%" _ASM_CX "), %%r15 \n\t" + "mov " VCPU_R8 "(%%" _ASM_CX "), %%r8 \n\t" + "mov " VCPU_R9 "(%%" _ASM_CX "), %%r9 \n\t" + "mov " VCPU_R10 "(%%" _ASM_CX "), %%r10 \n\t" + "mov " VCPU_R11 "(%%" _ASM_CX "), %%r11 \n\t" + "mov " VCPU_R12 "(%%" _ASM_CX "), %%r12 \n\t" + "mov " VCPU_R13 "(%%" _ASM_CX "), %%r13 \n\t" + "mov " VCPU_R14 "(%%" _ASM_CX "), %%r14 \n\t" + "mov " VCPU_R15 "(%%" _ASM_CX "), %%r15 \n\t" #endif /* Load guest RCX. This kills the vmx_vcpu pointer! */ - "mov %c[rcx](%%" _ASM_CX "), %%" _ASM_CX " \n\t" + "mov " VCPU_RCX"(%%" _ASM_CX "), %%" _ASM_CX " \n\t" /* Enter guest mode */ "call vmx_vmenter\n\t" @@ -6427,25 +6454,25 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "push %%" _ASM_CX " \n\t" /* Reload RCX with @regs. */ - "mov %c[wordsize](%%" _ASM_SP "), %%" _ASM_CX " \n\t" + "mov " _WORD_SIZE "(%%" _ASM_SP "), %%" _ASM_CX " \n\t" /* Save all guest registers, including RCX from the stack */ - "mov %%" _ASM_AX ", %c[rax](%%" _ASM_CX ") \n\t" - "mov %%" _ASM_BX ", %c[rbx](%%" _ASM_CX ") \n\t" - __ASM_SIZE(pop) " %c[rcx](%%" _ASM_CX ") \n\t" - "mov %%" _ASM_DX ", %c[rdx](%%" _ASM_CX ") \n\t" - "mov %%" _ASM_SI ", %c[rsi](%%" _ASM_CX ") \n\t" - "mov %%" _ASM_DI ", %c[rdi](%%" _ASM_CX ") \n\t" - "mov %%" _ASM_BP ", %c[rbp](%%" _ASM_CX ") \n\t" + "mov %%" _ASM_AX ", " VCPU_RAX "(%%" _ASM_CX ") \n\t" + "mov %%" _ASM_BX ", " VCPU_RBX "(%%" _ASM_CX ") \n\t" + __ASM_SIZE(pop) " " VCPU_RCX "(%%" _ASM_CX ") \n\t" + "mov %%" _ASM_DX ", " VCPU_RDX "(%%" _ASM_CX ") \n\t" + "mov %%" _ASM_SI ", " VCPU_RSI "(%%" _ASM_CX ") \n\t" + "mov %%" _ASM_DI ", " VCPU_RDI "(%%" _ASM_CX ") \n\t" + "mov %%" _ASM_BP ", " VCPU_RBP "(%%" _ASM_CX ") \n\t" #ifdef CONFIG_X86_64 - "mov %%r8, %c[r8](%%" _ASM_CX ") \n\t" - "mov %%r9, %c[r9](%%" _ASM_CX ") \n\t" - "mov %%r10, %c[r10](%%" _ASM_CX ") \n\t" - "mov %%r11, %c[r11](%%" _ASM_CX ") \n\t" - "mov %%r12, %c[r12](%%" _ASM_CX ") \n\t" - "mov %%r13, %c[r13](%%" _ASM_CX ") \n\t" - "mov %%r14, %c[r14](%%" _ASM_CX ") \n\t" - "mov %%r15, %c[r15](%%" _ASM_CX ") \n\t" + "mov %%r8, " VCPU_R8 "(%%" _ASM_CX ") \n\t" + "mov %%r9, " VCPU_R9 "(%%" _ASM_CX ") \n\t" + "mov %%r10, " VCPU_R10 "(%%" _ASM_CX ") \n\t" + "mov %%r11, " VCPU_R11 "(%%" _ASM_CX ") \n\t" + "mov %%r12, " VCPU_R12 "(%%" _ASM_CX ") \n\t" + "mov %%r13, " VCPU_R13 "(%%" _ASM_CX ") \n\t" + "mov %%r14, " VCPU_R14 "(%%" _ASM_CX ") \n\t" + "mov %%r15, " VCPU_R15 "(%%" _ASM_CX ") \n\t" #endif /* Clear EBX to indicate VM-Exit (as opposed to VM-Fail). 
*/ @@ -6479,7 +6506,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "xor %%ebp, %%ebp \n\t" /* "POP" the vcpu_vmx pointer. */ - "add $%c[wordsize], %%" _ASM_SP " \n\t" + "add $" _WORD_SIZE ", %%" _ASM_SP " \n\t" "pop %%" _ASM_BP " \n\t" "jmp 3f \n\t" @@ -6497,25 +6524,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) "=a"((int){0}), "=d"((int){0}) : "a"(vmx), "d"(&vcpu->arch.regs), #endif - "b"(vmx->loaded_vmcs->launched), - [rax]"i"(VCPU_REGS_RAX * sizeof(ulong)), - [rbx]"i"(VCPU_REGS_RBX * sizeof(ulong)), - [rcx]"i"(VCPU_REGS_RCX * sizeof(ulong)), - [rdx]"i"(VCPU_REGS_RDX * sizeof(ulong)), - [rsi]"i"(VCPU_REGS_RSI * sizeof(ulong)), - [rdi]"i"(VCPU_REGS_RDI * sizeof(ulong)), - [rbp]"i"(VCPU_REGS_RBP * sizeof(ulong)), -#ifdef CONFIG_X86_64 - [r8]"i"(VCPU_REGS_R8 * sizeof(ulong)), - [r9]"i"(VCPU_REGS_R9 * sizeof(ulong)), - [r10]"i"(VCPU_REGS_R10 * sizeof(ulong)), - [r11]"i"(VCPU_REGS_R11 * sizeof(ulong)), - [r12]"i"(VCPU_REGS_R12 * sizeof(ulong)), - [r13]"i"(VCPU_REGS_R13 * sizeof(ulong)), - [r14]"i"(VCPU_REGS_R14 * sizeof(ulong)), - [r15]"i"(VCPU_REGS_R15 * sizeof(ulong)), -#endif - [wordsize]"i"(sizeof(ulong)) + "b"(vmx->loaded_vmcs->launched) : "cc", "memory" #ifdef CONFIG_X86_64 , "rax", "rcx", "rdx" -- cgit From 63c73aa07fcabc090661a586f7ae5200a0fc5cb4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:11 -0800 Subject: KVM: VMX: Create a stack frame in vCPU-run ...in preparation for moving to a proper assembly sub-routnine. vCPU-run isn't a leaf function since it calls vmx_update_host_rsp() and vmx_vmenter(). And since we need to save/restore RBP anyways, unconditionally creating the frame costs a single MOV, i.e. don't bother keying off CONFIG_FRAME_POINTER or using FRAME_BEGIN, etc... Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 12fb342218ad..52fea89732db 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6407,8 +6407,8 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) write_cr2(vcpu->arch.cr2); asm( - /* Store host registers */ "push %%" _ASM_BP " \n\t" + "mov %%" _ASM_SP ", %%" _ASM_BP " \n\t" /* * Save @regs, _ASM_ARG2 may be modified by vmx_update_host_rsp() and -- cgit From 5e0781df18990497f4cd96a959ca583d83b6d1e0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:12 -0800 Subject: KVM: VMX: Move vCPU-run code to a proper assembly routine As evidenced by the myriad patches leading up to this moment, using an inline asm blob for vCPU-run is nothing short of horrific. It's also been called "unholy", "an abomination" and likely a whole host of other names that would violate the Code of Conduct if recorded here and now. The code is relocated nearly verbatim, e.g. quotes, newlines, tabs and __stringify need to be dropped, but other than those cosmetic changes the only functional changees are to add the "call" and replace the final "jmp" with a "ret". Note that STACK_FRAME_NON_STANDARD is also dropped from __vmx_vcpu_run(). 
Suggested-by: Andi Kleen Suggested-by: Josh Poimboeuf Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmenter.S | 144 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/vmx.c | 138 +------------------------------------------ 2 files changed, 145 insertions(+), 137 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index bcef2c7e9bc4..e64617f3b196 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -1,6 +1,30 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include #include +#include +#include + +#define WORD_SIZE (BITS_PER_LONG / 8) + +#define VCPU_RAX __VCPU_REGS_RAX * WORD_SIZE +#define VCPU_RCX __VCPU_REGS_RCX * WORD_SIZE +#define VCPU_RDX __VCPU_REGS_RDX * WORD_SIZE +#define VCPU_RBX __VCPU_REGS_RBX * WORD_SIZE +/* Intentionally omit RSP as it's context switched by hardware */ +#define VCPU_RBP __VCPU_REGS_RBP * WORD_SIZE +#define VCPU_RSI __VCPU_REGS_RSI * WORD_SIZE +#define VCPU_RDI __VCPU_REGS_RDI * WORD_SIZE + +#ifdef CONFIG_X86_64 +#define VCPU_R8 __VCPU_REGS_R8 * WORD_SIZE +#define VCPU_R9 __VCPU_REGS_R9 * WORD_SIZE +#define VCPU_R10 __VCPU_REGS_R10 * WORD_SIZE +#define VCPU_R11 __VCPU_REGS_R11 * WORD_SIZE +#define VCPU_R12 __VCPU_REGS_R12 * WORD_SIZE +#define VCPU_R13 __VCPU_REGS_R13 * WORD_SIZE +#define VCPU_R14 __VCPU_REGS_R14 * WORD_SIZE +#define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE +#endif .text @@ -55,3 +79,123 @@ ENDPROC(vmx_vmenter) ENTRY(vmx_vmexit) ret ENDPROC(vmx_vmexit) + +/** + * ____vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode + * @vmx: struct vcpu_vmx * + * @regs: unsigned long * (to guest registers) + * %RBX: VMCS launched status (non-zero indicates already launched) + * + * Returns: + * %RBX is 0 on VM-Exit, 1 on VM-Fail + */ +ENTRY(____vmx_vcpu_run) + push %_ASM_BP + mov %_ASM_SP, %_ASM_BP + + /* + * Save @regs, _ASM_ARG2 may be modified by vmx_update_host_rsp() and + * @regs is needed after VM-Exit to save the guest's register values. + */ + push %_ASM_ARG2 + + /* Adjust RSP to account for the CALL to vmx_vmenter(). */ + lea -WORD_SIZE(%_ASM_SP), %_ASM_ARG2 + call vmx_update_host_rsp + + /* Load @regs to RCX. */ + mov (%_ASM_SP), %_ASM_CX + + /* Check if vmlaunch or vmresume is needed */ + cmpb $0, %bl + + /* Load guest registers. Don't clobber flags. */ + mov VCPU_RAX(%_ASM_CX), %_ASM_AX + mov VCPU_RBX(%_ASM_CX), %_ASM_BX + mov VCPU_RDX(%_ASM_CX), %_ASM_DX + mov VCPU_RSI(%_ASM_CX), %_ASM_SI + mov VCPU_RDI(%_ASM_CX), %_ASM_DI + mov VCPU_RBP(%_ASM_CX), %_ASM_BP +#ifdef CONFIG_X86_64 + mov VCPU_R8 (%_ASM_CX), %r8 + mov VCPU_R9 (%_ASM_CX), %r9 + mov VCPU_R10(%_ASM_CX), %r10 + mov VCPU_R11(%_ASM_CX), %r11 + mov VCPU_R12(%_ASM_CX), %r12 + mov VCPU_R13(%_ASM_CX), %r13 + mov VCPU_R14(%_ASM_CX), %r14 + mov VCPU_R15(%_ASM_CX), %r15 +#endif + /* Load guest RCX. This kills the vmx_vcpu pointer! */ + mov VCPU_RCX(%_ASM_CX), %_ASM_CX + + /* Enter guest mode */ + call vmx_vmenter + + /* Jump on VM-Fail. */ + jbe 2f + + /* Temporarily save guest's RCX. */ + push %_ASM_CX + + /* Reload @regs to RCX. 
*/ + mov WORD_SIZE(%_ASM_SP), %_ASM_CX + + /* Save all guest registers, including RCX from the stack */ + mov %_ASM_AX, VCPU_RAX(%_ASM_CX) + mov %_ASM_BX, VCPU_RBX(%_ASM_CX) + __ASM_SIZE(pop) VCPU_RCX(%_ASM_CX) + mov %_ASM_DX, VCPU_RDX(%_ASM_CX) + mov %_ASM_SI, VCPU_RSI(%_ASM_CX) + mov %_ASM_DI, VCPU_RDI(%_ASM_CX) + mov %_ASM_BP, VCPU_RBP(%_ASM_CX) +#ifdef CONFIG_X86_64 + mov %r8, VCPU_R8 (%_ASM_CX) + mov %r9, VCPU_R9 (%_ASM_CX) + mov %r10, VCPU_R10(%_ASM_CX) + mov %r11, VCPU_R11(%_ASM_CX) + mov %r12, VCPU_R12(%_ASM_CX) + mov %r13, VCPU_R13(%_ASM_CX) + mov %r14, VCPU_R14(%_ASM_CX) + mov %r15, VCPU_R15(%_ASM_CX) +#endif + + /* Clear EBX to indicate VM-Exit (as opposed to VM-Fail). */ + xor %ebx, %ebx + + /* + * Clear all general purpose registers except RSP and RBX to prevent + * speculative use of the guest's values, even those that are reloaded + * via the stack. In theory, an L1 cache miss when restoring registers + * could lead to speculative execution with the guest's values. + * Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially + * free. RSP and RBX are exempt as RSP is restored by hardware during + * VM-Exit and RBX is explicitly loaded with 0 or 1 to "return" VM-Fail. + */ +1: +#ifdef CONFIG_X86_64 + xor %r8d, %r8d + xor %r9d, %r9d + xor %r10d, %r10d + xor %r11d, %r11d + xor %r12d, %r12d + xor %r13d, %r13d + xor %r14d, %r14d + xor %r15d, %r15d +#endif + xor %eax, %eax + xor %ecx, %ecx + xor %edx, %edx + xor %esi, %esi + xor %edi, %edi + xor %ebp, %ebp + + /* "POP" @regs. */ + add $WORD_SIZE, %_ASM_SP + pop %_ASM_BP + ret + + /* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. */ +2: mov $1, %ebx + jmp 1b +ENDPROC(____vmx_vcpu_run) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 52fea89732db..0e6e2dd6265e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6371,33 +6371,6 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) } } -#ifdef CONFIG_X86_64 -#define WORD_SIZE 8 -#else -#define WORD_SIZE 4 -#endif - -#define _WORD_SIZE __stringify(WORD_SIZE) - -#define VCPU_RAX __stringify(__VCPU_REGS_RAX * WORD_SIZE) -#define VCPU_RCX __stringify(__VCPU_REGS_RCX * WORD_SIZE) -#define VCPU_RDX __stringify(__VCPU_REGS_RDX * WORD_SIZE) -#define VCPU_RBX __stringify(__VCPU_REGS_RBX * WORD_SIZE) -/* Intentionally omit %RSP as it's context switched by hardware */ -#define VCPU_RBP __stringify(__VCPU_REGS_RBP * WORD_SIZE) -#define VCPU_RSI __stringify(__VCPU_REGS_RSI * WORD_SIZE) -#define VCPU_RDI __stringify(__VCPU_REGS_RDI * WORD_SIZE) -#ifdef CONFIG_X86_64 -#define VCPU_R8 __stringify(__VCPU_REGS_R8 * WORD_SIZE) -#define VCPU_R9 __stringify(__VCPU_REGS_R9 * WORD_SIZE) -#define VCPU_R10 __stringify(__VCPU_REGS_R10 * WORD_SIZE) -#define VCPU_R11 __stringify(__VCPU_REGS_R11 * WORD_SIZE) -#define VCPU_R12 __stringify(__VCPU_REGS_R12 * WORD_SIZE) -#define VCPU_R13 __stringify(__VCPU_REGS_R13 * WORD_SIZE) -#define VCPU_R14 __stringify(__VCPU_REGS_R14 * WORD_SIZE) -#define VCPU_R15 __stringify(__VCPU_REGS_R15 * WORD_SIZE) -#endif - static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) { if (static_branch_unlikely(&vmx_l1d_should_flush)) @@ -6407,115 +6380,7 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) write_cr2(vcpu->arch.cr2); asm( - "push %%" _ASM_BP " \n\t" - "mov %%" _ASM_SP ", %%" _ASM_BP " \n\t" - - /* - * Save @regs, _ASM_ARG2 may be modified by vmx_update_host_rsp() and - * @regs is needed after VM-Exit to save the guest's register values. 
- */ - "push %%" _ASM_ARG2 " \n\t" - - /* Adjust RSP to account for the CALL to vmx_vmenter(). */ - "lea -" _WORD_SIZE "(%%" _ASM_SP "), %%" _ASM_ARG2 " \n\t" - "call vmx_update_host_rsp \n\t" - - /* Load RCX with @regs. */ - "mov (%%" _ASM_SP "), %%" _ASM_CX " \n\t" - - /* Check if vmlaunch or vmresume is needed */ - "cmpb $0, %%bl \n\t" - - /* Load guest registers. Don't clobber flags. */ - "mov " VCPU_RAX "(%%" _ASM_CX "), %%" _ASM_AX " \n\t" - "mov " VCPU_RBX "(%%" _ASM_CX "), %%" _ASM_BX " \n\t" - "mov " VCPU_RDX "(%%" _ASM_CX "), %%" _ASM_DX " \n\t" - "mov " VCPU_RSI "(%%" _ASM_CX "), %%" _ASM_SI " \n\t" - "mov " VCPU_RDI "(%%" _ASM_CX "), %%" _ASM_DI " \n\t" - "mov " VCPU_RBP "(%%" _ASM_CX "), %%" _ASM_BP " \n\t" -#ifdef CONFIG_X86_64 - "mov " VCPU_R8 "(%%" _ASM_CX "), %%r8 \n\t" - "mov " VCPU_R9 "(%%" _ASM_CX "), %%r9 \n\t" - "mov " VCPU_R10 "(%%" _ASM_CX "), %%r10 \n\t" - "mov " VCPU_R11 "(%%" _ASM_CX "), %%r11 \n\t" - "mov " VCPU_R12 "(%%" _ASM_CX "), %%r12 \n\t" - "mov " VCPU_R13 "(%%" _ASM_CX "), %%r13 \n\t" - "mov " VCPU_R14 "(%%" _ASM_CX "), %%r14 \n\t" - "mov " VCPU_R15 "(%%" _ASM_CX "), %%r15 \n\t" -#endif - /* Load guest RCX. This kills the vmx_vcpu pointer! */ - "mov " VCPU_RCX"(%%" _ASM_CX "), %%" _ASM_CX " \n\t" - - /* Enter guest mode */ - "call vmx_vmenter\n\t" - "jbe 2f \n\t" - - /* Temporarily save guest's RCX. */ - "push %%" _ASM_CX " \n\t" - - /* Reload RCX with @regs. */ - "mov " _WORD_SIZE "(%%" _ASM_SP "), %%" _ASM_CX " \n\t" - - /* Save all guest registers, including RCX from the stack */ - "mov %%" _ASM_AX ", " VCPU_RAX "(%%" _ASM_CX ") \n\t" - "mov %%" _ASM_BX ", " VCPU_RBX "(%%" _ASM_CX ") \n\t" - __ASM_SIZE(pop) " " VCPU_RCX "(%%" _ASM_CX ") \n\t" - "mov %%" _ASM_DX ", " VCPU_RDX "(%%" _ASM_CX ") \n\t" - "mov %%" _ASM_SI ", " VCPU_RSI "(%%" _ASM_CX ") \n\t" - "mov %%" _ASM_DI ", " VCPU_RDI "(%%" _ASM_CX ") \n\t" - "mov %%" _ASM_BP ", " VCPU_RBP "(%%" _ASM_CX ") \n\t" -#ifdef CONFIG_X86_64 - "mov %%r8, " VCPU_R8 "(%%" _ASM_CX ") \n\t" - "mov %%r9, " VCPU_R9 "(%%" _ASM_CX ") \n\t" - "mov %%r10, " VCPU_R10 "(%%" _ASM_CX ") \n\t" - "mov %%r11, " VCPU_R11 "(%%" _ASM_CX ") \n\t" - "mov %%r12, " VCPU_R12 "(%%" _ASM_CX ") \n\t" - "mov %%r13, " VCPU_R13 "(%%" _ASM_CX ") \n\t" - "mov %%r14, " VCPU_R14 "(%%" _ASM_CX ") \n\t" - "mov %%r15, " VCPU_R15 "(%%" _ASM_CX ") \n\t" -#endif - - /* Clear EBX to indicate VM-Exit (as opposed to VM-Fail). */ - "xor %%ebx, %%ebx \n\t" - - /* - * Clear all general purpose registers except RSP and RBX to prevent - * speculative use of the guest's values, even those that are reloaded - * via the stack. In theory, an L1 cache miss when restoring registers - * could lead to speculative execution with the guest's values. - * Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially - * free. RSP and RBX are exempt as RSP is restored by hardware during - * VM-Exit and RBX is explicitly loaded with 0 or 1 to "return" VM-Fail. - */ - "1: \n\t" -#ifdef CONFIG_X86_64 - "xor %%r8d, %%r8d \n\t" - "xor %%r9d, %%r9d \n\t" - "xor %%r10d, %%r10d \n\t" - "xor %%r11d, %%r11d \n\t" - "xor %%r12d, %%r12d \n\t" - "xor %%r13d, %%r13d \n\t" - "xor %%r14d, %%r14d \n\t" - "xor %%r15d, %%r15d \n\t" -#endif - "xor %%eax, %%eax \n\t" - "xor %%ecx, %%ecx \n\t" - "xor %%edx, %%edx \n\t" - "xor %%esi, %%esi \n\t" - "xor %%edi, %%edi \n\t" - "xor %%ebp, %%ebp \n\t" - - /* "POP" the vcpu_vmx pointer. */ - "add $" _WORD_SIZE ", %%" _ASM_SP " \n\t" - "pop %%" _ASM_BP " \n\t" - "jmp 3f \n\t" - - /* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. 
*/ - "2: \n\t" - "mov $1, %%ebx \n\t" - "jmp 1b \n\t" - "3: \n\t" - + "call ____vmx_vcpu_run \n\t" : ASM_CALL_CONSTRAINT, "=b"(vmx->fail), #ifdef CONFIG_X86_64 "=D"((int){0}), "=S"((int){0}) @@ -6536,7 +6401,6 @@ static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) vcpu->arch.cr2 = read_cr2(); } -STACK_FRAME_NON_STANDARD(__vmx_vcpu_run); static void vmx_vcpu_run(struct kvm_vcpu *vcpu) { -- cgit From c823dd5c0f3fafa595ed51cc72c6e006efc20ad3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:13 -0800 Subject: KVM: VMX: Fold __vmx_vcpu_run() back into vmx_vcpu_run() ...now that the code is no longer tagged with STACK_FRAME_NON_STANDARD. Arguably, providing __vmx_vcpu_run() to break up vmx_vcpu_run() is valuable on its own, but the previous split was purposely made as small as possible to limit the effects STACK_FRAME_NON_STANDARD. In other words, the current split is now completely arbitrary and likely not the most logical. This also allows renaming ____vmx_vcpu_run() to __vmx_vcpu_run() in a future patch. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 59 +++++++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 32 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 0e6e2dd6265e..e61bb8da9767 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6371,37 +6371,6 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) } } -static void __vmx_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) -{ - if (static_branch_unlikely(&vmx_l1d_should_flush)) - vmx_l1d_flush(vcpu); - - if (vcpu->arch.cr2 != read_cr2()) - write_cr2(vcpu->arch.cr2); - - asm( - "call ____vmx_vcpu_run \n\t" - : ASM_CALL_CONSTRAINT, "=b"(vmx->fail), -#ifdef CONFIG_X86_64 - "=D"((int){0}), "=S"((int){0}) - : "D"(vmx), "S"(&vcpu->arch.regs), -#else - "=a"((int){0}), "=d"((int){0}) - : "a"(vmx), "d"(&vcpu->arch.regs), -#endif - "b"(vmx->loaded_vmcs->launched) - : "cc", "memory" -#ifdef CONFIG_X86_64 - , "rax", "rcx", "rdx" - , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" -#else - , "ecx", "edi", "esi" -#endif - ); - - vcpu->arch.cr2 = read_cr2(); -} - static void vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6469,7 +6438,33 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) */ x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0); - __vmx_vcpu_run(vcpu, vmx); + if (static_branch_unlikely(&vmx_l1d_should_flush)) + vmx_l1d_flush(vcpu); + + if (vcpu->arch.cr2 != read_cr2()) + write_cr2(vcpu->arch.cr2); + + asm( + "call ____vmx_vcpu_run \n\t" + : ASM_CALL_CONSTRAINT, "=b"(vmx->fail), +#ifdef CONFIG_X86_64 + "=D"((int){0}), "=S"((int){0}) + : "D"(vmx), "S"(&vcpu->arch.regs), +#else + "=a"((int){0}), "=d"((int){0}) + : "a"(vmx), "d"(&vcpu->arch.regs), +#endif + "b"(vmx->loaded_vmcs->launched) + : "cc", "memory" +#ifdef CONFIG_X86_64 + , "rax", "rcx", "rdx" + , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" +#else + , "ecx", "edi", "esi" +#endif + ); + + vcpu->arch.cr2 = read_cr2(); /* * We do not use IBRS in the kernel. If this vCPU has used the -- cgit From ee2fc635ef714b1d46da3616bbec972067f0d187 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:14 -0800 Subject: KVM: VMX: Rename ____vmx_vcpu_run() to __vmx_vcpu_run() ...now that the name is no longer usurped by a defunct helper function. 
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmenter.S | 6 +++--- arch/x86/kvm/vmx/vmx.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index e64617f3b196..fa6e03b36348 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -81,7 +81,7 @@ ENTRY(vmx_vmexit) ENDPROC(vmx_vmexit) /** - * ____vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode + * __vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode * @vmx: struct vcpu_vmx * * @regs: unsigned long * (to guest registers) * %RBX: VMCS launched status (non-zero indicates already launched) @@ -89,7 +89,7 @@ ENDPROC(vmx_vmexit) * Returns: * %RBX is 0 on VM-Exit, 1 on VM-Fail */ -ENTRY(____vmx_vcpu_run) +ENTRY(__vmx_vcpu_run) push %_ASM_BP mov %_ASM_SP, %_ASM_BP @@ -198,4 +198,4 @@ ENTRY(____vmx_vcpu_run) /* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. */ 2: mov $1, %ebx jmp 1b -ENDPROC(____vmx_vcpu_run) +ENDPROC(__vmx_vcpu_run) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e61bb8da9767..8bc8f09eaf0c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6445,7 +6445,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) write_cr2(vcpu->arch.cr2); asm( - "call ____vmx_vcpu_run \n\t" + "call __vmx_vcpu_run \n\t" : ASM_CALL_CONSTRAINT, "=b"(vmx->fail), #ifdef CONFIG_X86_64 "=D"((int){0}), "=S"((int){0}) -- cgit From a62fd5a76c99dd96c74c6638408961b7ff3c71c4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:15 -0800 Subject: KVM: VMX: Use RAX as the scratch register during vCPU-run ...to prepare for making the sub-routine callable from C code. That means returning the result in RAX. Since RAX will be used to return the result, use it as the scratch register as well to make the code readable and to document that the scratch register is more or less arbitrary. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmenter.S | 76 +++++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 38 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index fa6e03b36348..7d8b09abcdec 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -103,31 +103,31 @@ ENTRY(__vmx_vcpu_run) lea -WORD_SIZE(%_ASM_SP), %_ASM_ARG2 call vmx_update_host_rsp - /* Load @regs to RCX. */ - mov (%_ASM_SP), %_ASM_CX + /* Load @regs to RAX. */ + mov (%_ASM_SP), %_ASM_AX /* Check if vmlaunch or vmresume is needed */ cmpb $0, %bl /* Load guest registers. Don't clobber flags. 
*/ - mov VCPU_RAX(%_ASM_CX), %_ASM_AX - mov VCPU_RBX(%_ASM_CX), %_ASM_BX - mov VCPU_RDX(%_ASM_CX), %_ASM_DX - mov VCPU_RSI(%_ASM_CX), %_ASM_SI - mov VCPU_RDI(%_ASM_CX), %_ASM_DI - mov VCPU_RBP(%_ASM_CX), %_ASM_BP + mov VCPU_RBX(%_ASM_AX), %_ASM_BX + mov VCPU_RCX(%_ASM_AX), %_ASM_CX + mov VCPU_RDX(%_ASM_AX), %_ASM_DX + mov VCPU_RSI(%_ASM_AX), %_ASM_SI + mov VCPU_RDI(%_ASM_AX), %_ASM_DI + mov VCPU_RBP(%_ASM_AX), %_ASM_BP #ifdef CONFIG_X86_64 - mov VCPU_R8 (%_ASM_CX), %r8 - mov VCPU_R9 (%_ASM_CX), %r9 - mov VCPU_R10(%_ASM_CX), %r10 - mov VCPU_R11(%_ASM_CX), %r11 - mov VCPU_R12(%_ASM_CX), %r12 - mov VCPU_R13(%_ASM_CX), %r13 - mov VCPU_R14(%_ASM_CX), %r14 - mov VCPU_R15(%_ASM_CX), %r15 + mov VCPU_R8 (%_ASM_AX), %r8 + mov VCPU_R9 (%_ASM_AX), %r9 + mov VCPU_R10(%_ASM_AX), %r10 + mov VCPU_R11(%_ASM_AX), %r11 + mov VCPU_R12(%_ASM_AX), %r12 + mov VCPU_R13(%_ASM_AX), %r13 + mov VCPU_R14(%_ASM_AX), %r14 + mov VCPU_R15(%_ASM_AX), %r15 #endif - /* Load guest RCX. This kills the vmx_vcpu pointer! */ - mov VCPU_RCX(%_ASM_CX), %_ASM_CX + /* Load guest RAX. This kills the vmx_vcpu pointer! */ + mov VCPU_RAX(%_ASM_AX), %_ASM_AX /* Enter guest mode */ call vmx_vmenter @@ -135,29 +135,29 @@ ENTRY(__vmx_vcpu_run) /* Jump on VM-Fail. */ jbe 2f - /* Temporarily save guest's RCX. */ - push %_ASM_CX + /* Temporarily save guest's RAX. */ + push %_ASM_AX - /* Reload @regs to RCX. */ - mov WORD_SIZE(%_ASM_SP), %_ASM_CX + /* Reload @regs to RAX. */ + mov WORD_SIZE(%_ASM_SP), %_ASM_AX - /* Save all guest registers, including RCX from the stack */ - mov %_ASM_AX, VCPU_RAX(%_ASM_CX) - mov %_ASM_BX, VCPU_RBX(%_ASM_CX) - __ASM_SIZE(pop) VCPU_RCX(%_ASM_CX) - mov %_ASM_DX, VCPU_RDX(%_ASM_CX) - mov %_ASM_SI, VCPU_RSI(%_ASM_CX) - mov %_ASM_DI, VCPU_RDI(%_ASM_CX) - mov %_ASM_BP, VCPU_RBP(%_ASM_CX) + /* Save all guest registers, including RAX from the stack */ + __ASM_SIZE(pop) VCPU_RAX(%_ASM_AX) + mov %_ASM_BX, VCPU_RBX(%_ASM_AX) + mov %_ASM_CX, VCPU_RCX(%_ASM_AX) + mov %_ASM_DX, VCPU_RDX(%_ASM_AX) + mov %_ASM_SI, VCPU_RSI(%_ASM_AX) + mov %_ASM_DI, VCPU_RDI(%_ASM_AX) + mov %_ASM_BP, VCPU_RBP(%_ASM_AX) #ifdef CONFIG_X86_64 - mov %r8, VCPU_R8 (%_ASM_CX) - mov %r9, VCPU_R9 (%_ASM_CX) - mov %r10, VCPU_R10(%_ASM_CX) - mov %r11, VCPU_R11(%_ASM_CX) - mov %r12, VCPU_R12(%_ASM_CX) - mov %r13, VCPU_R13(%_ASM_CX) - mov %r14, VCPU_R14(%_ASM_CX) - mov %r15, VCPU_R15(%_ASM_CX) + mov %r8, VCPU_R8 (%_ASM_AX) + mov %r9, VCPU_R9 (%_ASM_AX) + mov %r10, VCPU_R10(%_ASM_AX) + mov %r11, VCPU_R11(%_ASM_AX) + mov %r12, VCPU_R12(%_ASM_AX) + mov %r13, VCPU_R13(%_ASM_AX) + mov %r14, VCPU_R14(%_ASM_AX) + mov %r15, VCPU_R15(%_ASM_AX) #endif /* Clear EBX to indicate VM-Exit (as opposed to VM-Fail). */ -- cgit From 77df549559dbe7f265ab19bd444d6acb3a718b4d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:16 -0800 Subject: KVM: VMX: Pass @launched to the vCPU-run asm via standard ABI regs ...to prepare for making the sub-routine callable from C code. 
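For orientation, the C-visible interface this series is converging on (added verbatim by a later patch in the series) is a routine with a standard calling convention, invoked directly from vmx_vcpu_run(); a sketch of that end state:

        bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);

        /* ...eventually invoked from vmx_vcpu_run() roughly as: */
        vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
                                   vmx->loaded_vmcs->launched);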
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmenter.S | 5 ++++- arch/x86/kvm/vmx/vmx.c | 13 ++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 7d8b09abcdec..a3d9a8e062f9 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -84,7 +84,7 @@ ENDPROC(vmx_vmexit) * __vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode * @vmx: struct vcpu_vmx * * @regs: unsigned long * (to guest registers) - * %RBX: VMCS launched status (non-zero indicates already launched) + * @launched: %true if the VMCS has been launched * * Returns: * %RBX is 0 on VM-Exit, 1 on VM-Fail @@ -99,6 +99,9 @@ ENTRY(__vmx_vcpu_run) */ push %_ASM_ARG2 + /* Copy @launched to BL, _ASM_ARG3 is volatile. */ + mov %_ASM_ARG3B, %bl + /* Adjust RSP to account for the CALL to vmx_vmenter(). */ lea -WORD_SIZE(%_ASM_SP), %_ASM_ARG2 call vmx_update_host_rsp diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 8bc8f09eaf0c..1b73a82444a2 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6448,19 +6448,18 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) "call __vmx_vcpu_run \n\t" : ASM_CALL_CONSTRAINT, "=b"(vmx->fail), #ifdef CONFIG_X86_64 - "=D"((int){0}), "=S"((int){0}) - : "D"(vmx), "S"(&vcpu->arch.regs), + "=D"((int){0}), "=S"((int){0}), "=d"((int){0}) + : "D"(vmx), "S"(&vcpu->arch.regs), "d"(vmx->loaded_vmcs->launched) #else - "=a"((int){0}), "=d"((int){0}) - : "a"(vmx), "d"(&vcpu->arch.regs), + "=a"((int){0}), "=d"((int){0}), "=c"((int){0}) + : "a"(vmx), "d"(&vcpu->arch.regs), "c"(vmx->loaded_vmcs->launched) #endif - "b"(vmx->loaded_vmcs->launched) : "cc", "memory" #ifdef CONFIG_X86_64 - , "rax", "rcx", "rdx" + , "rax", "rcx" , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" #else - , "ecx", "edi", "esi" + , "edi", "esi" #endif ); -- cgit From e75c3c3a0487da878cbfa7f125dcd080a8606eaf Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:17 -0800 Subject: KVM: VMX: Return VM-Fail from vCPU-run assembly via standard ABI reg ...to prepare for making the assembly sub-routine callable from C code. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmenter.S | 16 ++++++++-------- arch/x86/kvm/vmx/vmx.c | 8 ++++---- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index a3d9a8e062f9..e06a3f33311e 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -87,7 +87,7 @@ ENDPROC(vmx_vmexit) * @launched: %true if the VMCS has been launched * * Returns: - * %RBX is 0 on VM-Exit, 1 on VM-Fail + * 0 on VM-Exit, 1 on VM-Fail */ ENTRY(__vmx_vcpu_run) push %_ASM_BP @@ -163,17 +163,17 @@ ENTRY(__vmx_vcpu_run) mov %r15, VCPU_R15(%_ASM_AX) #endif - /* Clear EBX to indicate VM-Exit (as opposed to VM-Fail). */ - xor %ebx, %ebx + /* Clear RAX to indicate VM-Exit (as opposed to VM-Fail). */ + xor %eax, %eax /* - * Clear all general purpose registers except RSP and RBX to prevent + * Clear all general purpose registers except RSP and RAX to prevent * speculative use of the guest's values, even those that are reloaded * via the stack. In theory, an L1 cache miss when restoring registers * could lead to speculative execution with the guest's values. * Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially - * free. 
RSP and RBX are exempt as RSP is restored by hardware during - * VM-Exit and RBX is explicitly loaded with 0 or 1 to "return" VM-Fail. + * free. RSP and RAX are exempt as RSP is restored by hardware during + * VM-Exit and RAX is explicitly loaded with 0 or 1 to return VM-Fail. */ 1: #ifdef CONFIG_X86_64 @@ -186,7 +186,7 @@ ENTRY(__vmx_vcpu_run) xor %r14d, %r14d xor %r15d, %r15d #endif - xor %eax, %eax + xor %ebx, %ebx xor %ecx, %ecx xor %edx, %edx xor %esi, %esi @@ -199,6 +199,6 @@ ENTRY(__vmx_vcpu_run) ret /* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. */ -2: mov $1, %ebx +2: mov $1, %eax jmp 1b ENDPROC(__vmx_vcpu_run) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1b73a82444a2..9a1d27e77684 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6446,20 +6446,20 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) asm( "call __vmx_vcpu_run \n\t" - : ASM_CALL_CONSTRAINT, "=b"(vmx->fail), + : ASM_CALL_CONSTRAINT, "=a"(vmx->fail), #ifdef CONFIG_X86_64 "=D"((int){0}), "=S"((int){0}), "=d"((int){0}) : "D"(vmx), "S"(&vcpu->arch.regs), "d"(vmx->loaded_vmcs->launched) #else - "=a"((int){0}), "=d"((int){0}), "=c"((int){0}) + "=d"((int){0}), "=c"((int){0}) : "a"(vmx), "d"(&vcpu->arch.regs), "c"(vmx->loaded_vmcs->launched) #endif : "cc", "memory" #ifdef CONFIG_X86_64 - , "rax", "rcx" + , "rbx", "rcx" , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" #else - , "edi", "esi" + , "ebx", "edi", "esi" #endif ); -- cgit From 3b895ef48615382db03adcf125e0db8437b9acbe Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:18 -0800 Subject: KVM: VMX: Preserve callee-save registers in vCPU-run asm sub-routine ...to make it callable from C code. Note that because KVM chooses to be ultra paranoid about guest register values, all callee-save registers are still cleared after VM-Exit even though the host's values are now reloaded from the stack. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmenter.S | 21 +++++++++++++++++++++ arch/x86/kvm/vmx/vmx.c | 5 +---- 2 files changed, 22 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index e06a3f33311e..d325f1d6110b 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -92,6 +92,16 @@ ENDPROC(vmx_vmexit) ENTRY(__vmx_vcpu_run) push %_ASM_BP mov %_ASM_SP, %_ASM_BP +#ifdef CONFIG_X86_64 + push %r15 + push %r14 + push %r13 + push %r12 +#else + push %edi + push %esi +#endif + push %_ASM_BX /* * Save @regs, _ASM_ARG2 may be modified by vmx_update_host_rsp() and @@ -195,6 +205,17 @@ ENTRY(__vmx_vcpu_run) /* "POP" @regs. 
*/ add $WORD_SIZE, %_ASM_SP + pop %_ASM_BX + +#ifdef CONFIG_X86_64 + pop %r12 + pop %r13 + pop %r14 + pop %r15 +#else + pop %esi + pop %edi +#endif pop %_ASM_BP ret diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 9a1d27e77684..43723d0007be 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6456,10 +6456,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) #endif : "cc", "memory" #ifdef CONFIG_X86_64 - , "rbx", "rcx" - , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" -#else - , "ebx", "edi", "esi" + , "rcx", "r8", "r9", "r10", "r11" #endif ); -- cgit From fc2ba5a27a1aaa16b664e64f85e0e1307d2bde3a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:19 -0800 Subject: KVM: VMX: Call vCPU-run asm sub-routine from C and remove clobbering ...now that the sub-routine follows standard calling conventions. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 43723d0007be..c39f1c38b878 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6371,6 +6371,8 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) } } +bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched); + static void vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6444,21 +6446,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->arch.cr2 != read_cr2()) write_cr2(vcpu->arch.cr2); - asm( - "call __vmx_vcpu_run \n\t" - : ASM_CALL_CONSTRAINT, "=a"(vmx->fail), -#ifdef CONFIG_X86_64 - "=D"((int){0}), "=S"((int){0}), "=d"((int){0}) - : "D"(vmx), "S"(&vcpu->arch.regs), "d"(vmx->loaded_vmcs->launched) -#else - "=d"((int){0}), "=c"((int){0}) - : "a"(vmx), "d"(&vcpu->arch.regs), "c"(vmx->loaded_vmcs->launched) -#endif - : "cc", "memory" -#ifdef CONFIG_X86_64 - , "rcx", "r8", "r9", "r10", "r11" -#endif - ); + vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, + vmx->loaded_vmcs->launched); vcpu->arch.cr2 = read_cr2(); -- cgit From 4f44c4eec5b72020c6d99e9e4f3d0505874c98c9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 25 Jan 2019 07:41:20 -0800 Subject: KVM: VMX: Reorder clearing of registers in the vCPU-run assembly flow Move the clearing of the common registers (not 64-bit-only) to the start of the flow that clears registers holding guest state. This is purely a cosmetic change so that the label doesn't point at a blank line and a #define. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmenter.S | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index d325f1d6110b..7b272738c576 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -185,7 +185,12 @@ ENTRY(__vmx_vcpu_run) * free. RSP and RAX are exempt as RSP is restored by hardware during * VM-Exit and RAX is explicitly loaded with 0 or 1 to return VM-Fail. 
*/ -1: +1: xor %ebx, %ebx + xor %ecx, %ecx + xor %edx, %edx + xor %esi, %esi + xor %edi, %edi + xor %ebp, %ebp #ifdef CONFIG_X86_64 xor %r8d, %r8d xor %r9d, %r9d @@ -196,12 +201,6 @@ ENTRY(__vmx_vcpu_run) xor %r14d, %r14d xor %r15d, %r15d #endif - xor %ebx, %ebx - xor %ecx, %ecx - xor %edx, %edx - xor %esi, %esi - xor %edi, %edi - xor %ebp, %ebp /* "POP" @regs. */ add $WORD_SIZE, %_ASM_SP -- cgit From 98d90582be2e08246a70af31e09950ecb8876252 Mon Sep 17 00:00:00 2001 From: "Suthikulpanit, Suravee" Date: Tue, 29 Jan 2019 08:08:42 +0000 Subject: svm: Fix AVIC DFR and LDR handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Current SVM AVIC driver makes two incorrect assumptions: 1. APIC LDR register cannot be zero 2. APIC DFR for all vCPUs must be the same LDR=0 means the local APIC does not support logical destination mode. Therefore, the driver should mark any previously assigned logical APIC ID table entry as invalid, and return success. Also, DFR is specific to a particular local APIC, and can be different among all vCPUs (as observed on Windows 10). These incorrect assumptions cause Windows 10 and FreeBSD VMs to fail to boot with AVIC enabled. So, instead of flush the whole logical APIC ID table, handle DFR and LDR for each vCPU independently. Fixes: 18f40c53e10f ('svm: Add VMEXIT handlers for AVIC') Cc: Radim Krčmář Cc: Paolo Bonzini Reported-by: Julian Stecklina Signed-off-by: Suravee Suthikulpanit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 64 +++++++++++++++++++++++++----------------------------- 1 file changed, 30 insertions(+), 34 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f13a3a24d360..01b66bbcdd7a 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -145,7 +145,6 @@ struct kvm_svm { /* Struct members for AVIC */ u32 avic_vm_id; - u32 ldr_mode; struct page *avic_logical_id_table_page; struct page *avic_physical_id_table_page; struct hlist_node hnode; @@ -236,6 +235,7 @@ struct vcpu_svm { bool nrips_enabled : 1; u32 ldr_reg; + u32 dfr_reg; struct page *avic_backing_page; u64 *avic_physical_id_cache; bool avic_is_running; @@ -2106,6 +2106,7 @@ static int avic_init_vcpu(struct vcpu_svm *svm) INIT_LIST_HEAD(&svm->ir_list); spin_lock_init(&svm->ir_list_lock); + svm->dfr_reg = APIC_DFR_FLAT; return ret; } @@ -4565,8 +4566,7 @@ static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat) return &logical_apic_id_table[index]; } -static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr, - bool valid) +static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr) { bool flat; u32 *entry, new_entry; @@ -4579,31 +4579,39 @@ static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr, new_entry = READ_ONCE(*entry); new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK); - if (valid) - new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK; - else - new_entry &= ~AVIC_LOGICAL_ID_ENTRY_VALID_MASK; + new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK; WRITE_ONCE(*entry, new_entry); return 0; } +static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + bool flat = svm->dfr_reg == APIC_DFR_FLAT; + u32 *entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat); + + if (entry) + WRITE_ONCE(*entry, (u32) ~AVIC_LOGICAL_ID_ENTRY_VALID_MASK); +} + static int avic_handle_ldr_update(struct kvm_vcpu 
*vcpu) { - int ret; + int ret = 0; struct vcpu_svm *svm = to_svm(vcpu); u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR); - if (!ldr) - return 1; + if (ldr == svm->ldr_reg) + return 0; - ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr, true); - if (ret && svm->ldr_reg) { - avic_ldr_write(vcpu, 0, svm->ldr_reg, false); - svm->ldr_reg = 0; - } else { + avic_invalidate_logical_id_entry(vcpu); + + if (ldr) + ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr); + + if (!ret) svm->ldr_reg = ldr; - } + return ret; } @@ -4637,27 +4645,16 @@ static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu) return 0; } -static int avic_handle_dfr_update(struct kvm_vcpu *vcpu) +static void avic_handle_dfr_update(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR); - u32 mod = (dfr >> 28) & 0xf; - - /* - * We assume that all local APICs are using the same type. - * If this changes, we need to flush the AVIC logical - * APID id table. - */ - if (kvm_svm->ldr_mode == mod) - return 0; - clear_page(page_address(kvm_svm->avic_logical_id_table_page)); - kvm_svm->ldr_mode = mod; + if (svm->dfr_reg == dfr) + return; - if (svm->ldr_reg) - avic_handle_ldr_update(vcpu); - return 0; + avic_invalidate_logical_id_entry(vcpu); + svm->dfr_reg = dfr; } static int avic_unaccel_trap_write(struct vcpu_svm *svm) @@ -6163,8 +6160,7 @@ static inline void avic_post_state_restore(struct kvm_vcpu *vcpu) { if (avic_handle_apic_id_update(vcpu) != 0) return; - if (avic_handle_dfr_update(vcpu) != 0) - return; + avic_handle_dfr_update(vcpu); avic_handle_ldr_update(vcpu); } -- cgit From f7589cca50ef8970dda56cdced8d96d05aa0a777 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 30 Jan 2019 17:18:59 +0100 Subject: KVM: x86: cull apicv code when userspace irqchip is requested Currently apicv_active can be true even if in-kernel LAPIC emulation is disabled. Avoid this by properly initializing it in kvm_arch_vcpu_init, and then do not do anything to deactivate APICv when it is actually not used (Currently APICv is only deactivated by SynIC code that in turn is only reachable when in-kernel LAPIC is in use. However, it is cleaner if kvm_vcpu_deactivate_apicv avoids relying on this. 
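"Userspace irqchip" here means the VMM never creates an in-kernel irqchip, so lapic_in_kernel() is false and APICv must never be marked active. A minimal sketch of the two configurations from the VMM side, assuming the usual /dev/kvm ioctl flow (not taken from this patch):

        #include <fcntl.h>
        #include <sys/ioctl.h>
        #include <linux/kvm.h>

        int main(void)
        {
                int kvm = open("/dev/kvm", O_RDWR);
                int vm  = ioctl(kvm, KVM_CREATE_VM, 0);

                /* In-kernel irqchip + LAPIC: APICv can be active. */
                ioctl(vm, KVM_CREATE_IRQCHIP, 0);

                /*
                 * Userspace irqchip: skip KVM_CREATE_IRQCHIP entirely (and do
                 * not enable KVM_CAP_SPLIT_IRQCHIP); the LAPIC is then emulated
                 * by the VMM and vcpu->arch.apicv_active must remain false.
                 */
                return 0;
        }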
Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e67ecf25e690..a1fb99f21317 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7055,6 +7055,13 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu) { + if (!lapic_in_kernel(vcpu)) { + WARN_ON_ONCE(vcpu->arch.apicv_active); + return; + } + if (!vcpu->arch.apicv_active) + return; + vcpu->arch.apicv_active = false; kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu); } @@ -9005,7 +9012,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) struct page *page; int r; - vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu); vcpu->arch.emulate_ctxt.ops = &emulate_ops; if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -9026,6 +9032,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) goto fail_free_pio_data; if (irqchip_in_kernel(vcpu->kvm)) { + vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu); r = kvm_create_lapic(vcpu); if (r < 0) goto fail_mmu_destroy; -- cgit From c57cd3c89ecf2812976f53e494580a396f93efd2 Mon Sep 17 00:00:00 2001 From: "Suthikulpanit, Suravee" Date: Tue, 29 Jan 2019 08:09:46 +0000 Subject: svm: Fix improper check when deactivate AVIC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function svm_refresh_apicv_exec_ctrl() always returning prematurely as kvm_vcpu_apicv_active() always return false when calling from the function arch/x86/kvm/x86.c:kvm_vcpu_deactivate_apicv(). This is because the apicv_active is set to false just before calling refresh_apicv_exec_ctrl(). Also, we need to mark VMCB_AVIC bit as dirty instead of VMCB_INTR. So, fix svm_refresh_apicv_exec_ctrl() to properly deactivate AVIC. Fixes: 67034bb9dd5e ('KVM: SVM: Add irqchip_split() checks before enabling AVIC') Cc: Radim Krčmář Cc: Paolo Bonzini Signed-off-by: Suravee Suthikulpanit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 01b66bbcdd7a..74ceda470eae 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5122,11 +5122,11 @@ static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; - if (!kvm_vcpu_apicv_active(&svm->vcpu)) - return; - - vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; - mark_dirty(vmcb, VMCB_INTR); + if (kvm_vcpu_apicv_active(vcpu)) + vmcb->control.int_ctl |= AVIC_ENABLE_MASK; + else + vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; + mark_dirty(vmcb, VMCB_AVIC); } static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) -- cgit From 946c522b603f281195af1df91837a1d4d1eb3bc9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Jan 2019 14:39:23 -0800 Subject: KVM: nVMX: Sign extend displacements of VMX instr's mem operands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The VMCS.EXIT_QUALIFCATION field reports the displacements of memory operands for various instructions, including VMX instructions, as a naturally sized unsigned value, but masks the value by the addr size, e.g. given a ModRM encoded as -0x28(%ebp), the -0x28 displacement is reported as 0xffffffd8 for a 32-bit address size. 
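A stand-alone sketch of that arithmetic, using a local stand-in for the kernel's sign_extend64() helper and the hypothetical EBP value quoted below:

        #include <stdint.h>
        #include <stdio.h>

        /* Local stand-in for the kernel's sign_extend64(value, index). */
        static int64_t sign_extend64(uint64_t value, int index)
        {
                int shift = 63 - index;
                return (int64_t)(value << shift) >> shift;
        }

        int main(void)
        {
                uint64_t off = 0xffffffd8;      /* -0x28, 32-bit address size */
                uint64_t ebp = 0x1000;

                /* Treated as unsigned: bogus effective address 0x100000fd8. */
                printf("unsigned:      %#llx\n", (unsigned long long)(ebp + off));

                /* Sign extended from bit 31: the intended 0xfd8, i.e. 0x1000 - 0x28. */
                printf("sign extended: %#llx\n",
                       (unsigned long long)(ebp + (uint64_t)sign_extend64(off, 31)));
                return 0;
        }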
Despite some weird wording regarding sign extension, the SDM explicitly states that bits beyond the instructions address size are undefined: In all cases, bits of this field beyond the instruction’s address size are undefined. Failure to sign extend the displacement results in KVM incorrectly treating a negative displacement as a large positive displacement when the address size of the VMX instruction is smaller than KVM's native size, e.g. a 32-bit address size on a 64-bit KVM. The very original decoding, added by commit 064aea774768 ("KVM: nVMX: Decoding memory operands of VMX instructions"), sort of modeled sign extension by truncating the final virtual/linear address for a 32-bit address size. I.e. it messed up the effective address but made it work by adjusting the final address. When segmentation checks were added, the truncation logic was kept as-is and no sign extension logic was introduced. In other words, it kept calculating the wrong effective address while mostly generating the correct virtual/linear address. As the effective address is what's used in the segment limit checks, this results in KVM incorreclty injecting #GP/#SS faults due to non-existent segment violations when a nested VMM uses negative displacements with an address size smaller than KVM's native address size. Using the -0x28(%ebp) example, an EBP value of 0x1000 will result in KVM using 0x100000fd8 as the effective address when checking for a segment limit violation. This causes a 100% failure rate when running a 32-bit KVM build as L1 on top of a 64-bit KVM L0. Fixes: f9eb4af67c9d ("KVM: nVMX: VMX instructions: add checks for #GP/#SS exceptions") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 0e67649e39ce..d531f4c91a34 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4020,6 +4020,10 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, /* Addr = segment_base + offset */ /* offset = base + [index * scale] + displacement */ off = exit_qualification; /* holds the displacement */ + if (addr_size == 1) + off = (gva_t)sign_extend64(off, 31); + else if (addr_size == 0) + off = (gva_t)sign_extend64(off, 15); if (base_is_valid) off += kvm_register_read(vcpu, base_reg); if (index_is_valid) -- cgit From 8570f9e881e3fde98801bb3a47eef84dd934d405 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Jan 2019 14:39:24 -0800 Subject: KVM: nVMX: Apply addr size mask to effective address for VMX instructions The address size of an instruction affects the effective address, not the virtual/linear address. The final address may still be truncated, e.g. to 32-bits outside of long mode, but that happens irrespective of the address size, e.g. a 32-bit address size can yield a 64-bit virtual address when using FS/GS with a non-zero base. 
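A stand-alone sketch of that distinction, using hypothetical values (32-bit address size in 64-bit mode with a non-zero FS base):

        #include <stdint.h>
        #include <stdio.h>

        int main(void)
        {
                uint64_t fs_base = 0x7f0000000000ULL;   /* hypothetical FS base */
                uint32_t ea = 0xffffff00u + 0x200u;     /* wraps within 32 bits to 0x100 */
                uint64_t la = fs_base + ea;             /* 64-bit linear address */

                printf("effective: %#x  linear: %#llx\n", ea, (unsigned long long)la);
                return 0;
        }

The effective address is truncated by the 32-bit address size, while the resulting linear address is still a full 64-bit value because of the segment base.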
Fixes: 064aea774768 ("KVM: nVMX: Decoding memory operands of VMX instructions") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index d531f4c91a34..34081cc8cdeb 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4029,20 +4029,41 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, if (index_is_valid) off += kvm_register_read(vcpu, index_reg)< Date: Wed, 23 Jan 2019 14:39:25 -0800 Subject: KVM: nVMX: Ignore limit checks on VMX instructions using flat segments Regarding segments with a limit==0xffffffff, the SDM officially states: When the effective limit is FFFFFFFFH (4 GBytes), these accesses may or may not cause the indicated exceptions. Behavior is implementation-specific and may vary from one execution to another. In practice, all CPUs that support VMX ignore limit checks for "flat segments", i.e. an expand-up data or code segment with base=0 and limit=0xffffffff. This is subtly different than wrapping the effective address calculation based on the address size, as the flat segment behavior also applies to accesses that would wrap the 4g boundary, e.g. a 4-byte access starting at 0xffffffff will access linear addresses 0xffffffff, 0x0, 0x1 and 0x2. Fixes: f9eb4af67c9d ("KVM: nVMX: VMX instructions: add checks for #GP/#SS exceptions") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 34081cc8cdeb..0050c179c5d3 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4087,10 +4087,16 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. */ exn = (s.unusable != 0); - /* Protected mode: #GP(0)/#SS(0) if the memory - * operand is outside the segment limit. + + /* + * Protected mode: #GP(0)/#SS(0) if the memory operand is + * outside the segment limit. All CPUs that support VMX ignore + * limit checks for flat segments, i.e. segments with base==0, + * limit==0xffffffff and of type expand-up data or code. */ - exn = exn || (off + sizeof(u64) > s.limit); + if (!(s.base == 0 && s.limit == 0xffffffff && + ((s.type & 8) || !(s.type & 4)))) + exn = exn || (off + sizeof(u64) > s.limit); } if (exn) { kvm_queue_exception_e(vcpu, -- cgit From e0dfacbfe91ad7947f0c44829c0358ac2e17d3c6 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 30 Jan 2019 17:25:38 +0100 Subject: KVM: nVMX: remove useless is_protmode check VMX is only accessible in protected mode, remove a confusing check that causes the conditional to lack a final "else" branch. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 0050c179c5d3..be9b2c8fd32e 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4056,7 +4056,7 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, * destination for long mode! 
*/ exn = is_noncanonical_address(*ret, vcpu); - } else if (is_protmode(vcpu)) { + } else { /* * When not in long mode, the virtual/linear address is * unconditionally truncated to 32 bits regardless of the -- cgit From 8acc0993e3f9a04a407ff1507dd329455f340121 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Tue, 15 Jan 2019 17:28:40 +1300 Subject: kvm, x86, mmu: Use kernel generic dynamic physical address mask AMD's SME/SEV is no longer the only case which reduces supported physical address bits, since Intel introduced Multi-key Total Memory Encryption (MKTME), which repurposes high bits of physical address as keyID, thus effectively shrinks supported physical address bits. To cover both cases (and potential similar future features), kernel MM introduced generic dynamaic physical address mask instead of hard-coded __PHYSICAL_MASK in 'commit 94d49eb30e854 ("x86/mm: Decouple dynamic __PHYSICAL_MASK from AMD SME")'. KVM should use that too. Change PT64_BASE_ADDR_MASK to use kernel dynamic physical address mask when it is enabled, instead of sme_clr. PT64_DIR_BASE_ADDR_MASK is also deleted since it is not used at all. Signed-off-by: Kai Huang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index da9c42349b1f..45eb988aa411 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -109,9 +109,11 @@ module_param(dbg, bool, 0644); (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) -#define PT64_BASE_ADDR_MASK __sme_clr((((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))) -#define PT64_DIR_BASE_ADDR_MASK \ - (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) +#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK +#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1)) +#else +#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) +#endif #define PT64_LVL_ADDR_MASK(level) \ (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ * PT64_LEVEL_BITS))) - 1)) -- cgit From 74f2370bb64f5c1c418f115e338f20f11b75f954 Mon Sep 17 00:00:00 2001 From: Liu Jingqi Date: Tue, 6 Nov 2018 13:55:27 +0800 Subject: KVM: x86: expose MOVDIRI CPU feature into VM. MOVDIRI moves doubleword or quadword from register to memory through direct store which is implemented by using write combining (WC) for writing data directly into memory without caching the data. Availability of the MOVDIRI instruction is indicated by the presence of the CPUID feature flag MOVDIRI(CPUID.0x07.0x0:ECX[bit 27]). This patch exposes the movdiri feature to the guest. 
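For reference, a guest can confirm the bit is visible with a plain CPUID query; a minimal stand-alone check is sketched below (bit 27 for MOVDIRI, bit 28 for MOVDIR64B, which the following patch exposes):

        #include <cpuid.h>
        #include <stdio.h>

        int main(void)
        {
                unsigned int eax, ebx, ecx, edx;

                if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
                        return 1;

                printf("MOVDIRI:   %s\n", (ecx & (1u << 27)) ? "yes" : "no");
                printf("MOVDIR64B: %s\n", (ecx & (1u << 28)) ? "yes" : "no");
                return 0;
        }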
The release document is referenced at the link below: https://software.intel.com/sites/default/files/managed/c5/15/\ architecture-instruction-set-extensions-programming-reference.pdf Signed-off-by: Liu Jingqi Cc: Xu Tao Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index bbffa6c54697..8045434093f8 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -404,7 +404,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | - F(CLDEMOTE); + F(CLDEMOTE) | F(MOVDIRI); /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = -- cgit From c029b5deb0b5d7e5090317b835f21c5d93999db7 Mon Sep 17 00:00:00 2001 From: Liu Jingqi Date: Tue, 6 Nov 2018 13:55:28 +0800 Subject: KVM: x86: expose MOVDIR64B CPU feature into VM. MOVDIR64B moves 64 bytes as a direct store with 64-byte write atomicity. Direct store is implemented by using write combining (WC) for writing data directly into memory without caching the data. Availability of the MOVDIR64B instruction is indicated by the presence of the CPUID feature flag MOVDIR64B (CPUID.0x07.0x0:ECX[bit 28]). This patch exposes the movdir64b feature to the guest. The release document is referenced at the link below: https://software.intel.com/sites/default/files/managed/c5/15/\ architecture-instruction-set-extensions-programming-reference.pdf Signed-off-by: Liu Jingqi Cc: Xu Tao Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 8045434093f8..e4644ba9c3da 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -404,7 +404,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | - F(CLDEMOTE) | F(MOVDIRI); + F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B); /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = -- cgit From 81b016676e1c8f58027bd4d2b1d8a981776b36fe Mon Sep 17 00:00:00 2001 From: Luwei Kang Date: Thu, 31 Jan 2019 16:52:02 +0800 Subject: KVM: x86: Sync the pending Posted-Interrupts Some Posted-Interrupts from passthrough devices may be lost or overwritten when the vCPU is in the runnable state. The SN (Suppress Notification) bit of the PID (Posted Interrupt Descriptor) will be set when the vCPU is preempted (vCPU in KVM_MP_STATE_RUNNABLE state but not running on a physical CPU). If a posted interrupt arrives at this time, the IRQ remapping facility will set the corresponding bit in the PIR (Posted Interrupt Requests) without setting ON (Outstanding Notification). So this interrupt can't be synced to the APIC virtualization register and will not be handled by the guest because ON is zero. Signed-off-by: Luwei Kang [Eliminate the pi_clear_sn fast path. 
- Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 26 +++++++++++--------------- arch/x86/kvm/vmx/vmx.h | 12 ++++++------ arch/x86/kvm/x86.c | 2 +- 3 files changed, 18 insertions(+), 22 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c39f1c38b878..c81dc632464a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1193,21 +1193,6 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) return; - /* - * First handle the simple case where no cmpxchg is necessary; just - * allow posting non-urgent interrupts. - * - * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change - * PI.NDST: pi_post_block will do it for us and the wakeup_handler - * expects the VCPU to be on the blocked_vcpu_list that matches - * PI.NDST. - */ - if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || - vcpu->cpu == cpu) { - pi_clear_sn(pi_desc); - return; - } - /* The full case. */ do { old.control = new.control = pi_desc->control; @@ -1222,6 +1207,17 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) new.sn = 0; } while (cmpxchg64(&pi_desc->control, old.control, new.control) != old.control); + + /* + * Clear SN before reading the bitmap; this ensures that any + * interrupt that comes after the bitmap is read sets ON. The + * VT-d firmware * writes the bitmap and reads SN atomically (5.2.3 + * in the spec), so it doesn't really have a memory barrier that + * pairs with this. However, we cannot do that and we need one. + */ + smp_mb__after_atomic(); + if (!bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS)) + pi_set_on(pi_desc); } /* diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 6ee6a492efaf..ec23b4d65fb7 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -336,16 +336,16 @@ static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); } -static inline void pi_clear_sn(struct pi_desc *pi_desc) +static inline void pi_set_sn(struct pi_desc *pi_desc) { - return clear_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); + set_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); } -static inline void pi_set_sn(struct pi_desc *pi_desc) +static inline void pi_set_on(struct pi_desc *pi_desc) { - return set_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); + set_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); } static inline void pi_clear_on(struct pi_desc *pi_desc) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a1fb99f21317..96f87d356c79 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7808,7 +7808,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * 1) We should set ->mode before checking ->requests. Please see * the comment in kvm_vcpu_exiting_guest_mode(). * - * 2) For APICv, we should set ->mode before checking PIR.ON. This + * 2) For APICv, we should set ->mode before checking PID.ON. This * pairs with the memory barrier implicit in pi_test_and_set_on * (see vmx_deliver_posted_interrupt). * -- cgit From b4b65b5642d6edd50e35a159f20e793dc478d686 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 29 Jan 2019 19:12:35 +0100 Subject: KVM: x86: cleanup freeing of nested state Ensure that the VCPU free path goes through vmx_leave_nested and thus nested_vmx_vmexit, so that the cancellation of the timer does not have to be in free_nested. 
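A sketch of the resulting teardown call chain, as implied by the diff below (not an excerpt of it):

        /*
         *   vmx_free_vcpu()
         *     nested_vmx_free_vcpu()
         *       vmx_leave_nested()      -> nested_vmx_vmexit() if L2 is active,
         *                                  which now cancels the preemption timer
         *       free_nested()           -> no hrtimer_cancel() needed here
         */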
In addition, because some paths through nested_vmx_vmexit do not go through sync_vmcs12, the cancellation of the timer is moved there. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 12 ++++++------ arch/x86/kvm/vmx/vmx.c | 1 - 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index be9b2c8fd32e..523d30e935e1 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -211,7 +211,6 @@ static void free_nested(struct kvm_vcpu *vcpu) if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) return; - hrtimer_cancel(&vmx->nested.preemption_timer); vmx->nested.vmxon = false; vmx->nested.smm.vmxon = false; free_vpid(vmx->nested.vpid02); @@ -274,6 +273,7 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) { vcpu_load(vcpu); + vmx_leave_nested(vcpu); vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01); free_nested(vcpu); vcpu_put(vcpu); @@ -3438,13 +3438,10 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) else vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; - if (nested_cpu_has_preemption_timer(vmcs12)) { - if (vmcs12->vm_exit_controls & - VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) + if (nested_cpu_has_preemption_timer(vmcs12) && + vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) vmcs12->vmx_preemption_timer_value = vmx_get_preemption_timer_value(vcpu); - hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); - } /* * In some cases (usually, nested EPT), L2 is allowed to change its @@ -3852,6 +3849,9 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, leave_guest_mode(vcpu); + if (nested_cpu_has_preemption_timer(vmcs12)) + hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) vcpu->arch.tsc_offset -= vmcs12->tsc_offset; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c81dc632464a..de6a9e7c6471 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6546,7 +6546,6 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) if (enable_pml) vmx_destroy_pml_buffer(vmx); free_vpid(vmx->vpid); - leave_guest_mode(vcpu); nested_vmx_free_vcpu(vcpu); free_loaded_vmcs(vmx->loaded_vmcs); kfree(vmx->guest_msrs); -- cgit From d92935979adba274b1099e67b7f713f6d8413121 Mon Sep 17 00:00:00 2001 From: Yu Zhang Date: Thu, 31 Jan 2019 11:26:39 +0800 Subject: kvm: vmx: Fix typos in vmentry/vmexit control setting Previously, 'commit f99e3daf94ff ("KVM: x86: Add Intel PT virtualization work mode")' work mode' offered framework to support Intel PT virtualization. However, the patch has some typos in vmx_vmentry_ctrl() and vmx_vmexit_ctrl(), e.g. used wrong flags and wrong variable, which will cause the VM entry failure later. 
Fixes: 'commit f99e3daf94ff ("KVM: x86: Add Intel PT virtualization work mode")' Signed-off-by: Yu Zhang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index ec23b4d65fb7..d7d1048d221b 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -444,7 +444,8 @@ static inline u32 vmx_vmentry_ctrl(void) { u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; if (pt_mode == PT_MODE_SYSTEM) - vmentry_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | VM_EXIT_CLEAR_IA32_RTIT_CTL); + vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | + VM_ENTRY_LOAD_IA32_RTIT_CTL); /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ return vmentry_ctrl & ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER); @@ -454,9 +455,10 @@ static inline u32 vmx_vmexit_ctrl(void) { u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; if (pt_mode == PT_MODE_SYSTEM) - vmexit_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | VM_ENTRY_LOAD_IA32_RTIT_CTL); + vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | + VM_EXIT_CLEAR_IA32_RTIT_CTL); /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ - return vmcs_config.vmexit_ctrl & + return vmexit_ctrl & ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); } -- cgit From 359a6c3ddc5184122989d5152a4b975c1c262d33 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 29 Jan 2019 19:14:46 +0100 Subject: KVM: nVMX: do not start the preemption timer hrtimer unnecessarily The preemption timer can be started even if there is a vmentry failure during or after loading guest state. That is pointless, move the call after all conditions have been checked. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 523d30e935e1..11431bb4c7ed 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2278,10 +2278,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } vmx_set_rflags(vcpu, vmcs12->guest_rflags); - vmx->nested.preemption_timer_expired = false; - if (nested_cpu_has_preemption_timer(vmcs12)) - vmx_start_preemption_timer(vcpu); - /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the * bitwise-or of what L1 wants to trap for L2, and what we want to * trap. Note that CR0.TS also needs updating - we do this later. @@ -3018,6 +3014,15 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) if (unlikely(evaluate_pending_interrupts)) kvm_make_request(KVM_REQ_EVENT, vcpu); + /* + * Do not start the preemption timer hrtimer until after we know + * we are successful, so that only nested_vmx_vmexit needs to cancel + * the timer. + */ + vmx->nested.preemption_timer_expired = false; + if (nested_cpu_has_preemption_timer(vmcs12)) + vmx_start_preemption_timer(vcpu); + /* * Note no nested_vmx_succeed or nested_vmx_fail here. At this point * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet -- cgit From 254272ce6505948ecc0b4bf5cd0aa61cdd815994 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 11 Feb 2019 11:02:50 -0800 Subject: kvm: x86: Add memcg accounting to KVM allocations There are many KVM kernel memory allocations which are tied to the life of the VM process and should be charged to the VM process's cgroup. 
If the allocations aren't tied to the process, the OOM killer will not know that killing the process will free the associated kernel memory. Add __GFP_ACCOUNT flags to many of the allocations which are not yet being charged to the VM process's cgroup. Tested: Ran all kvm-unit-tests on a 64 bit Haswell machine, the patch introduced no new failures. Ran a kernel memory accounting test which creates a VM to touch memory and then checks that the kernel memory allocated for the process is within certain bounds. With this patch we account for much more of the vmalloc and slab memory allocated for the VM. There remain a few allocations which should be charged to the VM's cgroup but are not. In x86, they include: vcpu->arch.pio_data There allocations are unaccounted in this patch because they are mapped to userspace, and accounting them to a cgroup causes problems. This should be addressed in a future patch. Signed-off-by: Ben Gardon Reviewed-by: Shakeel Butt Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 2 +- arch/x86/kvm/i8254.c | 2 +- arch/x86/kvm/i8259.c | 2 +- arch/x86/kvm/ioapic.c | 2 +- arch/x86/kvm/lapic.c | 7 ++++--- arch/x86/kvm/mmu.c | 6 +++--- arch/x86/kvm/page_track.c | 2 +- arch/x86/kvm/x86.c | 16 +++++++++------- 8 files changed, 21 insertions(+), 18 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 89d20ed1d2e8..27c43525a05f 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1729,7 +1729,7 @@ static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd) mutex_lock(&hv->hv_lock); ret = idr_alloc(&hv->conn_to_evt, eventfd, conn_id, conn_id + 1, - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); mutex_unlock(&hv->hv_lock); if (ret >= 0) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index af192895b1fc..4a6dc54cc12b 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -653,7 +653,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) pid_t pid_nr; int ret; - pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL); + pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL_ACCOUNT); if (!pit) return NULL; diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index bdcd4139eca9..8b38bb4868a6 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -583,7 +583,7 @@ int kvm_pic_init(struct kvm *kvm) struct kvm_pic *s; int ret; - s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); + s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL_ACCOUNT); if (!s) return -ENOMEM; spin_lock_init(&s->lock); diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 4e822ad363f3..1add1bc881e2 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -622,7 +622,7 @@ int kvm_ioapic_init(struct kvm *kvm) struct kvm_ioapic *ioapic; int ret; - ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL); + ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL_ACCOUNT); if (!ioapic) return -ENOMEM; spin_lock_init(&ioapic->lock); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4b6c2da7265c..991fdf7fc17f 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -181,7 +181,8 @@ static void recalculate_apic_map(struct kvm *kvm) max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic)); new = kvzalloc(sizeof(struct kvm_apic_map) + - sizeof(struct kvm_lapic *) * ((u64)max_id + 1), GFP_KERNEL); + sizeof(struct kvm_lapic *) * ((u64)max_id + 1), + GFP_KERNEL_ACCOUNT); if (!new) goto out; @@ -2259,13 +2260,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) ASSERT(vcpu != NULL); apic_debug("apic_init 
%d\n", vcpu->vcpu_id); - apic = kzalloc(sizeof(*apic), GFP_KERNEL); + apic = kzalloc(sizeof(*apic), GFP_KERNEL_ACCOUNT); if (!apic) goto nomem; vcpu->arch.apic = apic; - apic->regs = (void *)get_zeroed_page(GFP_KERNEL); + apic->regs = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!apic->regs) { printk(KERN_ERR "malloc apic regs error for vcpu %x\n", vcpu->vcpu_id); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 45eb988aa411..415d0e62cb3e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -961,7 +961,7 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, if (cache->nobjs >= min) return 0; while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - obj = kmem_cache_zalloc(base_cache, GFP_KERNEL); + obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT); if (!obj) return cache->nobjs >= min ? 0 : -ENOMEM; cache->objects[cache->nobjs++] = obj; @@ -3702,7 +3702,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) u64 *lm_root; - lm_root = (void*)get_zeroed_page(GFP_KERNEL); + lm_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (lm_root == NULL) return 1; @@ -5499,7 +5499,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) * Therefore we need to allocate shadow page tables in the first * 4GB of memory, which happens to fit the DMA32 zone. */ - page = alloc_page(GFP_KERNEL | __GFP_DMA32); + page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32); if (!page) return -ENOMEM; diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c index 3052a59a3065..fd04d462fdae 100644 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/page_track.c @@ -42,7 +42,7 @@ int kvm_page_track_create_memslot(struct kvm_memory_slot *slot, for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) { slot->arch.gfn_track[i] = kvcalloc(npages, sizeof(*slot->arch.gfn_track[i]), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!slot->arch.gfn_track[i]) goto track_free; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 96f87d356c79..3de586f89730 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3879,7 +3879,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; if (!lapic_in_kernel(vcpu)) goto out; - u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); + u.lapic = kzalloc(sizeof(struct kvm_lapic_state), + GFP_KERNEL_ACCOUNT); r = -ENOMEM; if (!u.lapic) @@ -4066,7 +4067,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_GET_XSAVE: { - u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); + u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT); r = -ENOMEM; if (!u.xsave) break; @@ -4090,7 +4091,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_GET_XCRS: { - u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); + u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT); r = -ENOMEM; if (!u.xcrs) break; @@ -9040,14 +9041,15 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) static_key_slow_inc(&kvm_no_apic_vcpu); vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!vcpu->arch.mce_banks) { r = -ENOMEM; goto fail_free_lapic; } vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; - if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) { + if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, + GFP_KERNEL_ACCOUNT)) { r = -ENOMEM; goto fail_free_mce_banks; } @@ -9306,13 +9308,13 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, slot->arch.rmap[i] = kvcalloc(lpages, sizeof(*slot->arch.rmap[i]), - GFP_KERNEL); + 
GFP_KERNEL_ACCOUNT); if (!slot->arch.rmap[i]) goto out_free; if (i == 0) continue; - linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL); + linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT); if (!linfo) goto out_free; -- cgit From 1ec696470c860e2b1b95e178e368e180ad2b040f Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 11 Feb 2019 11:02:51 -0800 Subject: kvm: svm: Add memcg accounting to KVM allocations There are many KVM kernel memory allocations which are tied to the life of the VM process and should be charged to the VM process's cgroup. If the allocations aren't tied to the process, the OOM killer will not know that killing the process will free the associated kernel memory. Add __GFP_ACCOUNT flags to many of the allocations which are not yet being charged to the VM process's cgroup. Tested: Ran all kvm-unit-tests on a 64 bit Haswell machine, the patch introduced no new failures. Ran a kernel memory accounting test which creates a VM to touch memory and then checks that the kernel memory allocated for the process is within certain bounds. With this patch we account for much more of the vmalloc and slab memory allocated for the VM. Signed-off-by: Ben Gardon Reviewed-by: Shakeel Butt Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 46 +++++++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 21 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 74ceda470eae..b5b128a0a051 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1795,9 +1795,10 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, /* Avoid using vmalloc for smaller buffers. */ size = npages * sizeof(struct page *); if (size > PAGE_SIZE) - pages = vmalloc(size); + pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO, + PAGE_KERNEL); else - pages = kmalloc(size, GFP_KERNEL); + pages = kmalloc(size, GFP_KERNEL_ACCOUNT); if (!pages) return NULL; @@ -1865,7 +1866,9 @@ static void __unregister_enc_region_locked(struct kvm *kvm, static struct kvm *svm_vm_alloc(void) { - struct kvm_svm *kvm_svm = vzalloc(sizeof(struct kvm_svm)); + struct kvm_svm *kvm_svm = __vmalloc(sizeof(struct kvm_svm), + GFP_KERNEL_ACCOUNT | __GFP_ZERO, + PAGE_KERNEL); return &kvm_svm->kvm; } @@ -1940,7 +1943,7 @@ static int avic_vm_init(struct kvm *kvm) return 0; /* Allocating physical APIC ID table (4KB) */ - p_page = alloc_page(GFP_KERNEL); + p_page = alloc_page(GFP_KERNEL_ACCOUNT); if (!p_page) goto free_avic; @@ -1948,7 +1951,7 @@ static int avic_vm_init(struct kvm *kvm) clear_page(page_address(p_page)); /* Allocating logical APIC ID table (4KB) */ - l_page = alloc_page(GFP_KERNEL); + l_page = alloc_page(GFP_KERNEL_ACCOUNT); if (!l_page) goto free_avic; @@ -2120,13 +2123,14 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) struct page *nested_msrpm_pages; int err; - svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); if (!svm) { err = -ENOMEM; goto out; } - svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL); + svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, + GFP_KERNEL_ACCOUNT); if (!svm->vcpu.arch.guest_fpu) { printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); err = -ENOMEM; @@ -2138,19 +2142,19 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) goto free_svm; err = -ENOMEM; - page = alloc_page(GFP_KERNEL); + page = alloc_page(GFP_KERNEL_ACCOUNT); if (!page) goto uninit; - 
msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); + msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); if (!msrpm_pages) goto free_page1; - nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); + nested_msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER); if (!nested_msrpm_pages) goto free_page2; - hsave_page = alloc_page(GFP_KERNEL); + hsave_page = alloc_page(GFP_KERNEL_ACCOUNT); if (!hsave_page) goto free_page3; @@ -5192,7 +5196,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) * Allocating new amd_iommu_pi_data, which will get * add to the per-vcpu ir_list. */ - ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL); + ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT); if (!ir) { ret = -ENOMEM; goto out; @@ -6307,7 +6311,7 @@ static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error) if (ret) return ret; - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); if (!data) return -ENOMEM; @@ -6357,7 +6361,7 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) return -EFAULT; - start = kzalloc(sizeof(*start), GFP_KERNEL); + start = kzalloc(sizeof(*start), GFP_KERNEL_ACCOUNT); if (!start) return -ENOMEM; @@ -6454,7 +6458,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) return -EFAULT; - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); if (!data) return -ENOMEM; @@ -6531,7 +6535,7 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) if (copy_from_user(¶ms, measure, sizeof(params))) return -EFAULT; - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); if (!data) return -ENOMEM; @@ -6593,7 +6597,7 @@ static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -ENOTTY; - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); if (!data) return -ENOMEM; @@ -6614,7 +6618,7 @@ static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!sev_guest(kvm)) return -ENOTTY; - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); if (!data) return -ENOMEM; @@ -6642,7 +6646,7 @@ static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src, struct sev_data_dbg *data; int ret; - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); if (!data) return -ENOMEM; @@ -6897,7 +6901,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) } ret = -ENOMEM; - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); if (!data) goto e_unpin_memory; @@ -7003,7 +7007,7 @@ static int svm_register_enc_region(struct kvm *kvm, if (range->addr > ULONG_MAX || range->size > ULONG_MAX) return -EINVAL; - region = kzalloc(sizeof(*region), GFP_KERNEL); + region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT); if (!region) return -ENOMEM; -- cgit From 4183683918efc3549b5ebddde4ed5edfdac45c17 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 11 Feb 2019 11:02:52 -0800 Subject: kvm: vmx: Add memcg accounting to KVM allocations There are many KVM kernel memory allocations which are tied to the life of the VM process and 
should be charged to the VM process's cgroup. If the allocations aren't tied to the process, the OOM killer will not know that killing the process will free the associated kernel memory. Add __GFP_ACCOUNT flags to many of the allocations which are not yet being charged to the VM process's cgroup. Tested: Ran all kvm-unit-tests on a 64 bit Haswell machine, the patch introduced no new failures. Ran a kernel memory accounting test which creates a VM to touch memory and then checks that the kernel memory allocated for the process is within certain bounds. With this patch we account for much more of the vmalloc and slab memory allocated for the VM. Signed-off-by: Ben Gardon Reviewed-by: Shakeel Butt Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 8 ++++++-- arch/x86/kvm/vmx/vmx.c | 27 ++++++++++++++++++--------- arch/x86/kvm/vmx/vmx.h | 5 +++-- 3 files changed, 27 insertions(+), 13 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 11431bb4c7ed..a4b1b5a50489 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4166,11 +4166,11 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) if (r < 0) goto out_vmcs02; - vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL); + vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); if (!vmx->nested.cached_vmcs12) goto out_cached_vmcs12; - vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL); + vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); if (!vmx->nested.cached_shadow_vmcs12) goto out_cached_shadow_vmcs12; @@ -5715,6 +5715,10 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) enable_shadow_vmcs = 0; if (enable_shadow_vmcs) { for (i = 0; i < VMX_BITMAP_NR; i++) { + /* + * The vmx_bitmap is not tied to a VM and so should + * not be charged to a memcg. + */ vmx_bitmap[i] = (unsigned long *) __get_free_page(GFP_KERNEL); if (!vmx_bitmap[i]) { diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index de6a9e7c6471..4950bb20e06a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -246,6 +246,10 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages && !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) { + /* + * This allocation for vmx_l1d_flush_pages is not tied to a VM + * lifetime and so should not be charged to a memcg. 
+ */ page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER); if (!page) return -ENOMEM; @@ -2386,13 +2390,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, return 0; } -struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu) +struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) { int node = cpu_to_node(cpu); struct page *pages; struct vmcs *vmcs; - pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); + pages = __alloc_pages_node(node, flags, vmcs_config.order); if (!pages) return NULL; vmcs = page_address(pages); @@ -2439,7 +2443,8 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) loaded_vmcs_init(loaded_vmcs); if (cpu_has_vmx_msr_bitmap()) { - loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); + loaded_vmcs->msr_bitmap = (unsigned long *) + __get_free_page(GFP_KERNEL_ACCOUNT); if (!loaded_vmcs->msr_bitmap) goto out_vmcs; memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); @@ -2480,7 +2485,7 @@ static __init int alloc_kvm_area(void) for_each_possible_cpu(cpu) { struct vmcs *vmcs; - vmcs = alloc_vmcs_cpu(false, cpu); + vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL); if (!vmcs) { free_kvm_area(); return -ENOMEM; @@ -6530,7 +6535,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) static struct kvm *vmx_vm_alloc(void) { - struct kvm_vmx *kvm_vmx = vzalloc(sizeof(struct kvm_vmx)); + struct kvm_vmx *kvm_vmx = __vmalloc(sizeof(struct kvm_vmx), + GFP_KERNEL_ACCOUNT | __GFP_ZERO, + PAGE_KERNEL); return &kvm_vmx->kvm; } @@ -6557,14 +6564,16 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) { int err; - struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + struct vcpu_vmx *vmx; unsigned long *msr_bitmap; int cpu; + vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); if (!vmx) return ERR_PTR(-ENOMEM); - vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL); + vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, + GFP_KERNEL_ACCOUNT); if (!vmx->vcpu.arch.guest_fpu) { printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); err = -ENOMEM; @@ -6586,12 +6595,12 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) * for the guest, etc. 
*/ if (enable_pml) { - vmx->pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO); + vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!vmx->pml_pg) goto uninit_vcpu; } - vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); + vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT); BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0]) > PAGE_SIZE); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index d7d1048d221b..1554cb45b393 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -479,7 +479,7 @@ static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) return &(to_vmx(vcpu)->pi_desc); } -struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu); +struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags); void free_vmcs(struct vmcs *vmcs); int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs); void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs); @@ -488,7 +488,8 @@ void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs); static inline struct vmcs *alloc_vmcs(bool shadow) { - return alloc_vmcs_cpu(shadow, raw_smp_processor_id()); + return alloc_vmcs_cpu(shadow, raw_smp_processor_id(), + GFP_KERNEL_ACCOUNT); } u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); -- cgit From 152482580a1b0accb60676063a1ac57b2d12daf6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 12:54:17 -0800 Subject: KVM: Call kvm_arch_memslots_updated() before updating memslots kvm_arch_memslots_updated() is at this point in time an x86-specific hook for handling MMIO generation wraparound. x86 stashes 19 bits of the memslots generation number in its MMIO sptes in order to avoid full page fault walks for repeat faults on emulated MMIO addresses. Because only 19 bits are used, wrapping the MMIO generation number is possible, if unlikely. kvm_arch_memslots_updated() alerts x86 that the generation has changed so that it can invalidate all MMIO sptes in case the effective MMIO generation has wrapped so as to avoid using a stale spte, e.g. a (very) old spte that was created with generation==0. Given that the purpose of kvm_arch_memslots_updated() is to prevent consuming stale entries, it needs to be called before the new generation is propagated to memslots. Invalidating the MMIO sptes after updating memslots means that there is a window where a vCPU could dereference the new memslots generation, e.g. 0, and incorrectly reuse an old MMIO spte that was created with (pre-wrap) generation==0. Fixes: e59dbe09f8e6 ("KVM: Introduce kvm_arch_memslots_updated()") Cc: Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 4 ++-- arch/x86/kvm/x86.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 415d0e62cb3e..a53a0e7ad9e6 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5893,13 +5893,13 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); } -void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots) +void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) { /* * The very rare case: if the generation-number is round, * zap all shadow pages. 
*/ - if (unlikely((slots->generation & MMIO_GEN_MASK) == 0)) { + if (unlikely((gen & MMIO_GEN_MASK) == 0)) { kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n"); kvm_mmu_invalidate_zap_all_pages(kvm); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3de586f89730..03d26ffb29cd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9357,13 +9357,13 @@ out_free: return -ENOMEM; } -void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) +void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) { /* * memslots->generation has been incremented. * mmio generation may have reached its maximum value. */ - kvm_mmu_invalidate_mmio_sptes(kvm, slots); + kvm_mmu_invalidate_mmio_sptes(kvm, gen); } int kvm_arch_prepare_memory_region(struct kvm *kvm, -- cgit From e1359e2beb8b0a1188abc997273acbaedc8ee791 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:12 -0800 Subject: KVM: x86/mmu: Detect MMIO generation wrap in any address space The check to detect a wrap of the MMIO generation explicitly looks for a generation number of zero. Now that unique memslots generation numbers are assigned to each address space, only address space 0 will get a generation number of exactly zero when wrapping. E.g. when address space 1 goes from 0x7fffe to 0x80002, the MMIO generation number will wrap to 0x2. Adjust the MMIO generation to strip the address space modifier prior to checking for a wrap. Fixes: 4bd518f1598d ("KVM: use separate generations for each address space") Cc: Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a53a0e7ad9e6..c2f2c9de63ed 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5895,11 +5895,28 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) { + gen &= MMIO_GEN_MASK; + + /* + * Shift to eliminate the "update in-progress" flag, which isn't + * included in the spte's generation number. + */ + gen >>= 1; + + /* + * Generation numbers are incremented in multiples of the number of + * address spaces in order to provide unique generations across all + * address spaces. Strip what is effectively the address space + * modifier prior to checking for a wrap of the MMIO generation so + * that a wrap in any address space is detected. + */ + gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1); + /* - * The very rare case: if the generation-number is round, + * The very rare case: if the MMIO generation number has wrapped, * zap all shadow pages. */ - if (unlikely((gen & MMIO_GEN_MASK) == 0)) { + if (unlikely(gen == 0)) { kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n"); kvm_mmu_invalidate_zap_all_pages(kvm); } -- cgit From ddfd1730fd829743e41213e32ccc8b4aa6dc8325 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:13 -0800 Subject: KVM: x86/mmu: Do not cache MMIO accesses while memslots are in flux When installing new memslots, KVM sets bit 0 of the generation number to indicate that an update is in-progress. Until the update is complete, there are no guarantees as to whether a vCPU will see the old or the new memslots. Explicity prevent caching MMIO accesses so as to avoid using an access cached from the old memslots after the new memslots have been installed. 
Note that it is unclear whether or not disabling caching during the update window is strictly necessary as there is no definitive documentation as to what ordering guarantees KVM provides with respect to updating memslots. That being said, the MMIO spte code does not allow reusing sptes created while an update is in-progress, and the associated documentation explicitly states: We do not want to use an MMIO sptes created with an odd generation number, ... If KVM is unlucky and creates an MMIO spte while the low bit is 1, the next access to the spte will always be a cache miss. At the very least, disabling the per-vCPU MMIO cache during updates will make its behavior consistent with the MMIO spte behavior and documentation. Fixes: 56f17dd3fbc4 ("kvm: x86: fix stale mmio cache bug") Cc: Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 224cd0a47568..20ede17202bf 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -181,6 +181,11 @@ static inline bool emul_is_noncanonical_address(u64 la, static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, unsigned access) { + u64 gen = kvm_memslots(vcpu->kvm)->generation; + + if (unlikely(gen & 1)) + return; + /* * If this is a shadow nested page table, the "GVA" is * actually a nGPA. @@ -188,7 +193,7 @@ static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu, vcpu->arch.mmio_gva = mmu_is_nested(vcpu) ? 0 : gva & PAGE_MASK; vcpu->arch.access = access; vcpu->arch.mmio_gfn = gfn; - vcpu->arch.mmio_gen = kvm_memslots(vcpu->kvm)->generation; + vcpu->arch.mmio_gen = gen; } static inline bool vcpu_match_mmio_gen(struct kvm_vcpu *vcpu) -- cgit From 361209e054a2c9f34da090ee1ee4c1e8bfe76a64 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:14 -0800 Subject: KVM: Explicitly define the "memslot update in-progress" bit KVM uses bit 0 of the memslots generation as an "update in-progress" flag, which is used by x86 to prevent caching MMIO access while the memslots are changing. Although the intended behavior is flag-like, e.g. MMIO sptes intentionally drop the in-progress bit so as to avoid caching data from in-flux memslots, the implementation oftentimes treats the bit as part of the generation number itself, e.g. incrementing the generation increments twice, once to set the flag and once to clear it. Prior to commit 4bd518f1598d ("KVM: use separate generations for each address space"), incorporating the "update in-progress" bit into the generation number largely made sense, e.g. "real" generations are even, "bogus" generations are odd, most code doesn't need to be aware of the bit, etc... Now that unique memslots generation numbers are assigned to each address space, stealthing the in-progress status into the generation number results in a wide variety of subtle code, e.g. kvm_create_vm() jumps over bit 0 when initializing the memslots generation without any hint as to why. Explicitly define the flag and convert as much code as possible (which isn't much) to actually treat it like a flag. This paves the way for eventually using a different bit for "update in-progress" so that it can be a flag in truth instead of a awkward extension to the generation number. 
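A minimal userspace model of the pattern described here, assuming bit 0 as the in-progress flag purely for illustration (the real flag is the KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS define introduced by this patch), might look as follows. It is not KVM code; it only shows "never cache while the flag is set, and only hit on an exact generation match":

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define GEN_IN_PROGRESS	((uint64_t)1)

struct mmio_cache {
	uint64_t gen;
	uint64_t gfn;
	bool valid;
};

static void cache_store(struct mmio_cache *c, uint64_t cur_gen, uint64_t gfn)
{
	/* Never cache while a memslot update is in flight. */
	if (cur_gen & GEN_IN_PROGRESS)
		return;

	c->gen = cur_gen;
	c->gfn = gfn;
	c->valid = true;
}

static bool cache_hit(const struct mmio_cache *c, uint64_t cur_gen, uint64_t gfn)
{
	return c->valid && c->gen == cur_gen && c->gfn == gfn;
}

int main(void)
{
	struct mmio_cache c = { 0 };
	uint64_t gen = 2;			/* even: no update in flight */

	cache_store(&c, gen, 0x1000);
	printf("hit before update: %d\n", cache_hit(&c, gen, 0x1000));

	gen |= GEN_IN_PROGRESS;			/* update begins             */
	cache_store(&c, gen, 0x2000);		/* silently skipped          */
	printf("hit during update: %d\n", cache_hit(&c, gen, 0x2000));

	gen = 4;				/* update done, new gen      */
	printf("stale hit after update: %d\n", cache_hit(&c, gen, 0x1000));
	return 0;
}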
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 20ede17202bf..28406aa1136d 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -183,7 +183,7 @@ static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu, { u64 gen = kvm_memslots(vcpu->kvm)->generation; - if (unlikely(gen & 1)) + if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS)) return; /* -- cgit From 5192f9b976f9687569a90602b8a6c053da4498f6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:15 -0800 Subject: KVM: x86: Use a u64 when passing the MMIO gen around KVM currently uses an 'unsigned int' for the MMIO generation number despite it being derived from the 64-bit memslots generation and being propagated to (potentially) 64-bit sptes. There is no hidden agenda behind using an 'unsigned int', it's done simply because the MMIO generation will never set bits above bit 19. Passing a u64 will allow the "update in-progress" flag to be relocated from bit 0 to bit 63 and removes the need to cast the generation back to a u64 when propagating it to a spte. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c2f2c9de63ed..54bb706e40e8 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -348,20 +348,20 @@ static inline bool is_access_track_spte(u64 spte) #define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 2) #define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1) -static u64 generation_mmio_spte_mask(unsigned int gen) +static u64 generation_mmio_spte_mask(u64 gen) { u64 mask; WARN_ON(gen & ~MMIO_GEN_MASK); mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT; - mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT; + mask |= (gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT; return mask; } -static unsigned int get_mmio_spte_generation(u64 spte) +static u64 get_mmio_spte_generation(u64 spte) { - unsigned int gen; + u64 gen; spte &= ~shadow_mmio_mask; @@ -370,7 +370,7 @@ static unsigned int get_mmio_spte_generation(u64 spte) return gen; } -static unsigned int kvm_current_mmio_generation(struct kvm_vcpu *vcpu) +static u64 kvm_current_mmio_generation(struct kvm_vcpu *vcpu) { return kvm_vcpu_memslots(vcpu)->generation & MMIO_GEN_MASK; } @@ -378,7 +378,7 @@ static unsigned int kvm_current_mmio_generation(struct kvm_vcpu *vcpu) static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, unsigned access) { - unsigned int gen = kvm_current_mmio_generation(vcpu); + u64 gen = kvm_current_mmio_generation(vcpu); u64 mask = generation_mmio_spte_mask(gen); u64 gpa = gfn << PAGE_SHIFT; @@ -426,7 +426,7 @@ static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) { - unsigned int kvm_gen, spte_gen; + u64 kvm_gen, spte_gen; kvm_gen = kvm_current_mmio_generation(vcpu); spte_gen = get_mmio_spte_generation(spte); -- cgit From cae7ed3c2cb06680400adab632a243c5e5f42637 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:16 -0800 Subject: KVM: x86: Refactor the MMIO SPTE generation handling The code to propagate the memslots generation number into MMIO sptes is a bit convoluted. 
The "what" is relatively straightfoward, e.g. the comment explaining which bits go where is quite readable, but the "how" requires a lot of staring to understand what is happening. For example, 'MMIO_GEN_LOW_SHIFT' is actually used to calculate the high bits of the spte, while 'MMIO_SPTE_GEN_LOW_SHIFT' is used to calculate the low bits. Refactor the code to: - use #defines whose values align with the bits defined in the comment - use consistent code for both the high and low mask - explicitly highlight the handling of bit 0 (update in-progress flag) - explicitly call out that the defines are for MMIO sptes (to avoid confusion with the per-vCPU MMIO cache, which uses the full memslots generation) In addition to making the code a little less magical, this paves the way for moving the update in-progress flag to bit 63 without having to simultaneously rewrite all of the MMIO spte code. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 76 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 43 insertions(+), 33 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 54bb706e40e8..364b2a737d94 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -332,30 +332,41 @@ static inline bool is_access_track_spte(u64 spte) } /* - * the low bit of the generation number is always presumed to be zero. - * This disables mmio caching during memslot updates. The concept is - * similar to a seqcount but instead of retrying the access we just punt - * and ignore the cache. + * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of + * the memslots generation and is derived as follows: * - * spte bits 3-11 are used as bits 1-9 of the generation number, - * the bits 52-61 are used as bits 10-19 of the generation number. + * Bits 1-9 of the memslot generation are propagated to spte bits 3-11 + * Bits 10-19 of the memslot generation are propagated to spte bits 52-61 + * + * The MMIO generation starts at bit 1 of the memslots generation in order to + * skip over bit 0, the KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag. Including + * the flag would require stealing a bit from the "real" generation number and + * thus effectively halve the maximum number of MMIO generations that can be + * handled before encountering a wrap (which requires a full MMU zap). The + * flag is instead explicitly queried when checking for MMIO spte cache hits. 
*/ -#define MMIO_SPTE_GEN_LOW_SHIFT 2 -#define MMIO_SPTE_GEN_HIGH_SHIFT 52 - -#define MMIO_GEN_SHIFT 20 -#define MMIO_GEN_LOW_SHIFT 10 -#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 2) -#define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1) - +#define MMIO_SPTE_GEN_MASK GENMASK_ULL(19, 1) +#define MMIO_SPTE_GEN_SHIFT 1 + +#define MMIO_SPTE_GEN_LOW_START 3 +#define MMIO_SPTE_GEN_LOW_END 11 +#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \ + MMIO_SPTE_GEN_LOW_START) + +#define MMIO_SPTE_GEN_HIGH_START 52 +#define MMIO_SPTE_GEN_HIGH_END 61 +#define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \ + MMIO_SPTE_GEN_HIGH_START) static u64 generation_mmio_spte_mask(u64 gen) { u64 mask; - WARN_ON(gen & ~MMIO_GEN_MASK); + WARN_ON(gen & ~MMIO_SPTE_GEN_MASK); - mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT; - mask |= (gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT; + gen >>= MMIO_SPTE_GEN_SHIFT; + + mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK; + mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK; return mask; } @@ -365,20 +376,15 @@ static u64 get_mmio_spte_generation(u64 spte) spte &= ~shadow_mmio_mask; - gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK; - gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT; - return gen; -} - -static u64 kvm_current_mmio_generation(struct kvm_vcpu *vcpu) -{ - return kvm_vcpu_memslots(vcpu)->generation & MMIO_GEN_MASK; + gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START; + gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START; + return gen << MMIO_SPTE_GEN_SHIFT; } static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, unsigned access) { - u64 gen = kvm_current_mmio_generation(vcpu); + u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK; u64 mask = generation_mmio_spte_mask(gen); u64 gpa = gfn << PAGE_SHIFT; @@ -409,7 +415,7 @@ static gfn_t get_mmio_spte_gfn(u64 spte) static unsigned get_mmio_spte_access(u64 spte) { - u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask; + u64 mask = generation_mmio_spte_mask(MMIO_SPTE_GEN_MASK) | shadow_mmio_mask; return (spte & ~mask) & ~PAGE_MASK; } @@ -426,9 +432,13 @@ static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) { - u64 kvm_gen, spte_gen; + u64 kvm_gen, spte_gen, gen; + + gen = kvm_vcpu_memslots(vcpu)->generation; + if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS)) + return false; - kvm_gen = kvm_current_mmio_generation(vcpu); + kvm_gen = gen & MMIO_SPTE_GEN_MASK; spte_gen = get_mmio_spte_generation(spte); trace_check_mmio_spte(spte, kvm_gen, spte_gen); @@ -5895,13 +5905,13 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) { - gen &= MMIO_GEN_MASK; + gen &= MMIO_SPTE_GEN_MASK; /* - * Shift to eliminate the "update in-progress" flag, which isn't - * included in the spte's generation number. + * Shift to adjust for the "update in-progress" flag, which isn't + * included in the MMIO generation number. */ - gen >>= 1; + gen >>= MMIO_SPTE_GEN_SHIFT; /* * Generation numbers are incremented in multiples of the number of -- cgit From 164bf7e56c5a73f2f819c39ba7e0f20e0f97dc7b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:18 -0800 Subject: KVM: Move the memslot update in-progress flag to bit 63 ...now that KVM won't explode by moving it out of bit 0. 
Using bit 63 eliminates the need to jump over bit 0, e.g. when calculating a new memslots generation or when propagating the memslots generation to an MMIO spte. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 364b2a737d94..bcf62e1e1ff7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -335,18 +335,17 @@ static inline bool is_access_track_spte(u64 spte) * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of * the memslots generation and is derived as follows: * - * Bits 1-9 of the memslot generation are propagated to spte bits 3-11 - * Bits 10-19 of the memslot generation are propagated to spte bits 52-61 + * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11 + * Bits 9-18 of the MMIO generation are propagated to spte bits 52-61 * - * The MMIO generation starts at bit 1 of the memslots generation in order to - * skip over bit 0, the KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag. Including - * the flag would require stealing a bit from the "real" generation number and - * thus effectively halve the maximum number of MMIO generations that can be - * handled before encountering a wrap (which requires a full MMU zap). The - * flag is instead explicitly queried when checking for MMIO spte cache hits. + * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in + * the MMIO generation number, as doing so would require stealing a bit from + * the "real" generation number and thus effectively halve the maximum number + * of MMIO generations that can be handled before encountering a wrap (which + * requires a full MMU zap). The flag is instead explicitly queried when + * checking for MMIO spte cache hits. */ -#define MMIO_SPTE_GEN_MASK GENMASK_ULL(19, 1) -#define MMIO_SPTE_GEN_SHIFT 1 +#define MMIO_SPTE_GEN_MASK GENMASK_ULL(18, 0) #define MMIO_SPTE_GEN_LOW_START 3 #define MMIO_SPTE_GEN_LOW_END 11 @@ -363,8 +362,6 @@ static u64 generation_mmio_spte_mask(u64 gen) WARN_ON(gen & ~MMIO_SPTE_GEN_MASK); - gen >>= MMIO_SPTE_GEN_SHIFT; - mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK; mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK; return mask; @@ -378,7 +375,7 @@ static u64 get_mmio_spte_generation(u64 spte) gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START; gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START; - return gen << MMIO_SPTE_GEN_SHIFT; + return gen; } static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, @@ -5905,13 +5902,9 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) { - gen &= MMIO_SPTE_GEN_MASK; + WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); - /* - * Shift to adjust for the "update in-progress" flag, which isn't - * included in the MMIO generation number. - */ - gen >>= MMIO_SPTE_GEN_SHIFT; + gen &= MMIO_SPTE_GEN_MASK; /* * Generation numbers are incremented in multiples of the number of -- cgit From 85875a133ea3aca5e4ea423c7cb991ad53f4866e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:19 -0800 Subject: KVM: x86/mmu: Move slot_level_*() helper functions up a few lines ...so that kvm_mmu_invalidate_zap_pages_in_memslot() can utilize the helpers in future patches. 
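As a reference for the generation layout documented in the preceding refactor (MMIO generation bits 0-8 into spte bits 3-11, bits 9-18 into spte bits 52-61), here is a small userspace model of the pack/unpack round trip. The GENMASK stand-in and constant names are local to this sketch; it is not the kernel implementation itself:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define GENMASK_U64(h, l) \
	(((~(uint64_t)0) >> (63 - (h))) & ((~(uint64_t)0) << (l)))

#define SPTE_GEN_MASK		GENMASK_U64(18, 0)	/* 19-bit MMIO generation */

#define SPTE_GEN_LOW_START	3
#define SPTE_GEN_LOW_MASK	GENMASK_U64(11, 3)
#define SPTE_GEN_HIGH_START	52
#define SPTE_GEN_HIGH_MASK	GENMASK_U64(61, 52)

static uint64_t pack_gen(uint64_t gen)
{
	uint64_t spte = 0;

	gen &= SPTE_GEN_MASK;
	spte |= (gen << SPTE_GEN_LOW_START) & SPTE_GEN_LOW_MASK;	   /* gen[0:8]  -> spte[3:11]  */
	spte |= ((gen >> 9) << SPTE_GEN_HIGH_START) & SPTE_GEN_HIGH_MASK;  /* gen[9:18] -> spte[52:61] */
	return spte;
}

static uint64_t unpack_gen(uint64_t spte)
{
	uint64_t gen;

	gen  = (spte & SPTE_GEN_LOW_MASK) >> SPTE_GEN_LOW_START;
	gen |= ((spte & SPTE_GEN_HIGH_MASK) >> SPTE_GEN_HIGH_START) << 9;
	return gen;
}

int main(void)
{
	uint64_t gen = 0x5a5a5 & SPTE_GEN_MASK;	/* arbitrary 19-bit value */
	uint64_t spte = pack_gen(gen);

	printf("gen %#" PRIx64 " -> spte gen bits %#" PRIx64 " -> gen %#" PRIx64 "\n",
	       gen, spte, unpack_gen(spte));
	return 0;
}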
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 139 +++++++++++++++++++++++++++-------------------------- 1 file changed, 70 insertions(+), 69 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index bcf62e1e1ff7..5a8af2bcd891 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5487,6 +5487,76 @@ void kvm_disable_tdp(void) } EXPORT_SYMBOL_GPL(kvm_disable_tdp); + +/* The return value indicates if tlb flush on all vcpus is needed. */ +typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head); + +/* The caller should hold mmu-lock before calling this function. */ +static __always_inline bool +slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, int start_level, int end_level, + gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb) +{ + struct slot_rmap_walk_iterator iterator; + bool flush = false; + + for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn, + end_gfn, &iterator) { + if (iterator.rmap) + flush |= fn(kvm, iterator.rmap); + + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + if (flush && lock_flush_tlb) { + kvm_flush_remote_tlbs(kvm); + flush = false; + } + cond_resched_lock(&kvm->mmu_lock); + } + } + + if (flush && lock_flush_tlb) { + kvm_flush_remote_tlbs(kvm); + flush = false; + } + + return flush; +} + +static __always_inline bool +slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, int start_level, int end_level, + bool lock_flush_tlb) +{ + return slot_handle_level_range(kvm, memslot, fn, start_level, + end_level, memslot->base_gfn, + memslot->base_gfn + memslot->npages - 1, + lock_flush_tlb); +} + +static __always_inline bool +slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, bool lock_flush_tlb) +{ + return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, + PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); +} + +static __always_inline bool +slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, bool lock_flush_tlb) +{ + return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1, + PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); +} + +static __always_inline bool +slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, bool lock_flush_tlb) +{ + return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, + PT_PAGE_TABLE_LEVEL, lock_flush_tlb); +} + static void free_mmu_pages(struct kvm_vcpu *vcpu) { free_page((unsigned long)vcpu->arch.mmu->pae_root); @@ -5561,75 +5631,6 @@ void kvm_mmu_uninit_vm(struct kvm *kvm) kvm_page_track_unregister_notifier(kvm, node); } -/* The return value indicates if tlb flush on all vcpus is needed. */ -typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head); - -/* The caller should hold mmu-lock before calling this function. 
*/ -static __always_inline bool -slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, - slot_level_handler fn, int start_level, int end_level, - gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb) -{ - struct slot_rmap_walk_iterator iterator; - bool flush = false; - - for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn, - end_gfn, &iterator) { - if (iterator.rmap) - flush |= fn(kvm, iterator.rmap); - - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { - if (flush && lock_flush_tlb) { - kvm_flush_remote_tlbs(kvm); - flush = false; - } - cond_resched_lock(&kvm->mmu_lock); - } - } - - if (flush && lock_flush_tlb) { - kvm_flush_remote_tlbs(kvm); - flush = false; - } - - return flush; -} - -static __always_inline bool -slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, - slot_level_handler fn, int start_level, int end_level, - bool lock_flush_tlb) -{ - return slot_handle_level_range(kvm, memslot, fn, start_level, - end_level, memslot->base_gfn, - memslot->base_gfn + memslot->npages - 1, - lock_flush_tlb); -} - -static __always_inline bool -slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot, - slot_level_handler fn, bool lock_flush_tlb) -{ - return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, - PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); -} - -static __always_inline bool -slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot, - slot_level_handler fn, bool lock_flush_tlb) -{ - return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1, - PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); -} - -static __always_inline bool -slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, - slot_level_handler fn, bool lock_flush_tlb) -{ - return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, - PT_PAGE_TABLE_LEVEL, lock_flush_tlb); -} - void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { struct kvm_memslots *slots; -- cgit From a21136345cb6f1a5b7f576701b6a454da5b6e606 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:20 -0800 Subject: KVM: x86/mmu: Split remote_flush+zap case out of kvm_mmu_flush_or_zap() ...and into a separate helper, kvm_mmu_remote_flush_or_zap(), that does not require a vcpu so that the code can be (re)used by kvm_mmu_invalidate_zap_pages_in_memslot(). 
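The shape of this split can be sketched with stand-in types: a kvm-only core that either commits queued zaps (which also flushes) or issues a remote flush, plus a vcpu-level wrapper that keeps the local-flush fallback. This is an illustrative model only, not KVM's actual structures or functions:

#include <stdbool.h>
#include <stdio.h>

struct toy_kvm {
	int pending_zaps;	/* stands in for the invalid_list  */
	int remote_flushes;	/* counts remote TLB flushes       */
};

/* vcpu-independent core: returns true if a remote flush (or zap) happened. */
static bool remote_flush_or_zap(struct toy_kvm *kvm, bool remote_flush)
{
	if (!remote_flush && kvm->pending_zaps == 0)
		return false;

	if (kvm->pending_zaps) {
		kvm->pending_zaps = 0;		/* committing zaps flushes too */
		kvm->remote_flushes++;
	} else {
		kvm->remote_flushes++;		/* plain remote TLB flush      */
	}
	return true;
}

/* vcpu-level wrapper keeps the local-flush fallback. */
static void flush_or_zap(struct toy_kvm *kvm, bool *local_flush_req,
			 bool remote_flush, bool local_flush)
{
	if (remote_flush_or_zap(kvm, remote_flush))
		return;
	if (local_flush)
		*local_flush_req = true;
}

int main(void)
{
	struct toy_kvm kvm = { .pending_zaps = 2 };
	bool local = false;

	flush_or_zap(&kvm, &local, false, true);
	printf("remote flushes: %d, local requested: %d\n",
	       kvm.remote_flushes, local);
	return 0;
}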
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5a8af2bcd891..1cce120f06ae 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2240,18 +2240,28 @@ static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return true; } +static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm, + struct list_head *invalid_list, + bool remote_flush) +{ + if (!remote_flush && !list_empty(invalid_list)) + return false; + + if (!list_empty(invalid_list)) + kvm_mmu_commit_zap_page(kvm, invalid_list); + else + kvm_flush_remote_tlbs(kvm); + return true; +} + static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu, struct list_head *invalid_list, bool remote_flush, bool local_flush) { - if (!list_empty(invalid_list)) { - kvm_mmu_commit_zap_page(vcpu->kvm, invalid_list); + if (kvm_mmu_remote_flush_or_zap(vcpu->kvm, invalid_list, remote_flush)) return; - } - if (remote_flush) - kvm_flush_remote_tlbs(vcpu->kvm); - else if (local_flush) + if (local_flush) kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } -- cgit From 4e103134b862314dc2f2f18f2fb0ab972adc3f5f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:21 -0800 Subject: KVM: x86/mmu: Zap only the relevant pages when removing a memslot Modify kvm_mmu_invalidate_zap_pages_in_memslot(), a.k.a. the x86 MMU's handler for kvm_arch_flush_shadow_memslot(), to zap only the pages/PTEs that actually belong to the memslot being removed. This improves performance, especially why the deleted memslot has only a few shadow entries, or even no entries. E.g. a microbenchmark to access regular memory while concurrently reading PCI ROM to trigger memslot deletion showed a 5% improvement in throughput. 
Cc: Xiao Guangrong Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1cce120f06ae..b81e2cad0237 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5622,7 +5622,38 @@ static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, struct kvm_page_track_notifier_node *node) { - kvm_mmu_invalidate_zap_all_pages(kvm); + struct kvm_mmu_page *sp; + LIST_HEAD(invalid_list); + unsigned long i; + bool flush; + gfn_t gfn; + + spin_lock(&kvm->mmu_lock); + + if (list_empty(&kvm->arch.active_mmu_pages)) + goto out_unlock; + + flush = slot_handle_all_level(kvm, slot, kvm_zap_rmapp, false); + + for (i = 0; i < slot->npages; i++) { + gfn = slot->base_gfn + i; + + for_each_valid_sp(kvm, sp, gfn) { + if (sp->gfn != gfn) + continue; + + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + } + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); + flush = false; + cond_resched_lock(&kvm->mmu_lock); + } + } + kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); + +out_unlock: + spin_unlock(&kvm->mmu_lock); } void kvm_mmu_init_vm(struct kvm *kvm) -- cgit From 4771450c345dc5e3e3417d82aff62e0d88e7eee6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:23 -0800 Subject: Revert "KVM: MMU: drop kvm_mmu_zap_mmio_sptes" Revert back to a dedicated (and slower) mechanism for handling the scenario where all MMIO shadow PTEs need to be zapped due to overflowing the MMIO generation number. The MMIO generation scenario is almost literally a one-in-a-million occurrence, i.e. is not a performance sensitive scenario. Restoring kvm_mmu_zap_mmio_sptes() leaves VM teardown as the only user of kvm_mmu_invalidate_zap_all_pages() and paves the way for removing the fast invalidate mechanism altogether. This reverts commit a8eca9dcc656a405a28ffba43f3d86a1ff0eb331. 
Cc: Xiao Guangrong Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b81e2cad0237..d80c1558b23c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -391,6 +391,8 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, mask |= (gpa & shadow_nonpresent_or_rsvd_mask) << shadow_nonpresent_or_rsvd_mask_len; + page_header(__pa(sptep))->mmio_cached = true; + trace_mark_mmio_spte(sptep, gfn, access, gen); mmu_spte_set(sptep, mask); } @@ -5942,6 +5944,24 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); } +static void kvm_mmu_zap_mmio_sptes(struct kvm *kvm) +{ + struct kvm_mmu_page *sp, *node; + LIST_HEAD(invalid_list); + + spin_lock(&kvm->mmu_lock); +restart: + list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { + if (!sp->mmio_cached) + continue; + if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) + goto restart; + } + + kvm_mmu_commit_zap_page(kvm, &invalid_list); + spin_unlock(&kvm->mmu_lock); +} + void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) { WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); @@ -5963,7 +5983,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) */ if (unlikely(gen == 0)) { kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n"); - kvm_mmu_invalidate_zap_all_pages(kvm); + kvm_mmu_zap_mmio_sptes(kvm); } } -- cgit From 571c5af06e303b4a69016193fd6b5afbc96eac40 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:24 -0800 Subject: KVM: x86/mmu: Voluntarily reschedule as needed when zapping MMIO sptes Call cond_resched_lock() when zapping MMIO to reschedule if needed or to release and reacquire mmu_lock in case of contention. There is no need to flush or zap when temporarily dropping mmu_lock as zapping MMIO sptes is done when holding the memslots lock and with the "update in-progress" bit set in the memslots generation, which disables MMIO spte caching. The walk does need to be restarted if mmu_lock is dropped as the active pages list may be modified. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d80c1558b23c..2190679eda39 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5954,7 +5954,8 @@ restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { if (!sp->mmio_cached) continue; - if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) + if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list) || + cond_resched_lock(&kvm->mmu_lock)) goto restart; } -- cgit From 5ff0568374ed2e585376a3832857ade5daccd381 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:25 -0800 Subject: KVM: x86/mmu: Remove is_obsolete() call Unwinding usage of is_obsolete() is a step towards removing x86's fast invalidate mechanism, i.e. this is one part of a revert all patches from the series that introduced the mechanism[1]. This is a partial revert of commit 05988d728dcd ("KVM: MMU: reduce KVM_REQ_MMU_RELOAD when root page is zapped"). 
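The restart pattern used by kvm_mmu_zap_mmio_sptes() above can be sketched in plain C as follows; arrays and flags stand in for the kernel's active_mmu_pages list, mmu_lock and cond_resched_lock(), so this is illustrative only:

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy model of the "zap, then restart the walk" pattern. */
    struct item { bool mmio_cached; bool zapped; };

    int main(void)
    {
        struct item list[4] = { { true, false }, { false, false },
                                { true, false }, { true, false } };

    restart:
        for (int i = 0; i < 4; i++) {
            if (!list[i].mmio_cached || list[i].zapped)
                continue;
            list[i].zapped = true;          /* kvm_mmu_prepare_zap_page() */
            printf("zapped entry %d\n", i);
            /*
             * Zapping an entry (or dropping mmu_lock in cond_resched_lock())
             * can invalidate the iterator, so the walk restarts from the
             * head; already-zapped entries are skipped, so it terminates.
             */
            goto restart;
        }
        return 0;
    }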
[1] https://lkml.kernel.org/r/1369960590-14138-1-git-send-email-xiaoguangrong@linux.vnet.ibm.com Cc: Xiao Guangrong Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 2190679eda39..6cbffc775220 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2713,11 +2713,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, } else { list_move(&sp->link, &kvm->arch.active_mmu_pages); - /* - * The obsolete pages can not be used on any vcpus. - * See the comments in kvm_mmu_invalidate_zap_all_pages(). - */ - if (!sp->role.invalid && !is_obsolete_sp(kvm, sp)) + if (!sp->role.invalid) kvm_reload_remote_mmus(kvm); } -- cgit From 52d5dedc79bdcbac2976159a172069618cf31be5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:26 -0800 Subject: Revert "KVM: MMU: reclaim the zapped-obsolete page first" Unwinding optimizations related to obsolete pages is a step towards removing x86 KVM's fast invalidate mechanism, i.e. this is one part of a revert all patches from the series that introduced the mechanism[1]. This reverts commit 365c886860c4ba670d245e762b23987c912c129a. [1] https://lkml.kernel.org/r/1369960590-14138-1-git-send-email-xiaoguangrong@linux.vnet.ibm.com Cc: Xiao Guangrong Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 21 ++++----------------- arch/x86/kvm/x86.c | 1 - 2 files changed, 4 insertions(+), 18 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 6cbffc775220..255b0212fc5b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5858,6 +5858,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); static void kvm_zap_obsolete_pages(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; + LIST_HEAD(invalid_list); int batch = 0; restart: @@ -5890,8 +5891,7 @@ restart: goto restart; } - ret = kvm_mmu_prepare_zap_page(kvm, sp, - &kvm->arch.zapped_obsolete_pages); + ret = kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); batch += ret; if (ret) @@ -5902,7 +5902,7 @@ restart: * Should flush tlb before free page tables since lockless-walking * may use the pages. */ - kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages); + kvm_mmu_commit_zap_page(kvm, &invalid_list); } /* @@ -5935,11 +5935,6 @@ void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm) spin_unlock(&kvm->mmu_lock); } -static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) -{ - return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); -} - static void kvm_mmu_zap_mmio_sptes(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; @@ -6011,24 +6006,16 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) * want to shrink a VM that only started to populate its MMU * anyway. 
*/ - if (!kvm->arch.n_used_mmu_pages && - !kvm_has_zapped_obsolete_pages(kvm)) + if (!kvm->arch.n_used_mmu_pages) continue; idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); - if (kvm_has_zapped_obsolete_pages(kvm)) { - kvm_mmu_commit_zap_page(kvm, - &kvm->arch.zapped_obsolete_pages); - goto unlock; - } - if (prepare_zap_oldest_mmu_page(kvm, &invalid_list)) freed++; kvm_mmu_commit_zap_page(kvm, &invalid_list); -unlock: spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 03d26ffb29cd..78fb13f190a3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9113,7 +9113,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); - INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); atomic_set(&kvm->arch.noncoherent_dma_count, 0); -- cgit From 210f494261e1e84ad1f15877baa1c615afe3b342 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:27 -0800 Subject: Revert "KVM: MMU: collapse TLB flushes when zap all pages" Unwinding optimizations related to obsolete pages is a step towards removing x86 KVM's fast invalidate mechanism, i.e. this is one part of a revert all patches from the series that introduced the mechanism[1]. This reverts commit f34d251d66ba263c077ed9d2bbd1874339a4c887. [1] https://lkml.kernel.org/r/1369960590-14138-1-git-send-email-xiaoguangrong@linux.vnet.ibm.com Cc: Xiao Guangrong Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 31 +++---------------------------- 1 file changed, 3 insertions(+), 28 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 255b0212fc5b..e733262027ed 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2211,14 +2211,6 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list); -/* - * NOTE: we should pay more attention on the zapped-obsolete page - * (is_obsolete_sp(sp) && sp->role.invalid) when you do hash list walk - * since it has been deleted from active_mmu_pages but still can be found - * at hast list. - * - * for_each_valid_sp() has skipped that kind of pages. - */ #define for_each_valid_sp(_kvm, _sp, _gfn) \ hlist_for_each_entry(_sp, \ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ @@ -5881,13 +5873,11 @@ restart: if (sp->role.invalid) continue; - /* - * Need not flush tlb since we only zap the sp with invalid - * generation number. - */ if (batch >= BATCH_ZAP_PAGES && - cond_resched_lock(&kvm->mmu_lock)) { + (need_resched() || spin_needbreak(&kvm->mmu_lock))) { batch = 0; + kvm_mmu_commit_zap_page(kvm, &invalid_list); + cond_resched_lock(&kvm->mmu_lock); goto restart; } @@ -5898,10 +5888,6 @@ restart: goto restart; } - /* - * Should flush tlb before free page tables since lockless-walking - * may use the pages. - */ kvm_mmu_commit_zap_page(kvm, &invalid_list); } @@ -5920,17 +5906,6 @@ void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm) trace_kvm_mmu_invalidate_zap_all_pages(kvm); kvm->arch.mmu_valid_gen++; - /* - * Notify all vcpus to reload its shadow page table - * and flush TLB. Then all vcpus will switch to new - * shadow page table with the new mmu_valid_gen. 
- * - * Note: we should do this under the protection of - * mmu-lock, otherwise, vcpu would purge shadow page - * but miss tlb flush. - */ - kvm_reload_remote_mmus(kvm); - kvm_zap_obsolete_pages(kvm); spin_unlock(&kvm->mmu_lock); } -- cgit From 43d2b14b105fb00b8864c7b0ee7043cc1cc4a969 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:28 -0800 Subject: Revert "KVM: MMU: zap pages in batch" Unwinding optimizations related to obsolete pages is a step towards removing x86 KVM's fast invalidate mechanism, i.e. this is one part of a revert all patches from the series that introduced the mechanism[1]. This reverts commit e7d11c7a894986a13817c1c001e1e7668c5c4eb4. [1] https://lkml.kernel.org/r/1369960590-14138-1-git-send-email-xiaoguangrong@linux.vnet.ibm.com Cc: Xiao Guangrong Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e733262027ed..cb9fd69d2632 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5846,18 +5846,14 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm, } EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); -#define BATCH_ZAP_PAGES 10 static void kvm_zap_obsolete_pages(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; LIST_HEAD(invalid_list); - int batch = 0; restart: list_for_each_entry_safe_reverse(sp, node, &kvm->arch.active_mmu_pages, link) { - int ret; - /* * No obsolete page exists before new created page since * active_mmu_pages is the FIFO list. @@ -5866,6 +5862,28 @@ restart: break; /* + * Do not repeatedly zap a root page to avoid unnecessary + * KVM_REQ_MMU_RELOAD, otherwise we may not be able to + * progress: + * vcpu 0 vcpu 1 + * call vcpu_enter_guest(): + * 1): handle KVM_REQ_MMU_RELOAD + * and require mmu-lock to + * load mmu + * repeat: + * 1): zap root page and + * send KVM_REQ_MMU_RELOAD + * + * 2): if (cond_resched_lock(mmu-lock)) + * + * 2): hold mmu-lock and load mmu + * + * 3): see KVM_REQ_MMU_RELOAD bit + * on vcpu->requests is set + * then return 1 to call + * vcpu_enter_guest() again. + * goto repeat; + * * Since we are reversely walking the list and the invalid * list will be moved to the head, skip the invalid page * can help us to avoid the infinity list walking. @@ -5873,18 +5891,13 @@ restart: if (sp->role.invalid) continue; - if (batch >= BATCH_ZAP_PAGES && - (need_resched() || spin_needbreak(&kvm->mmu_lock))) { - batch = 0; + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { kvm_mmu_commit_zap_page(kvm, &invalid_list); cond_resched_lock(&kvm->mmu_lock); goto restart; } - ret = kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - batch += ret; - - if (ret) + if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) goto restart; } -- cgit From 42560fb1f3c6c7f730897b7fa7a478bc37e0be50 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:29 -0800 Subject: Revert "KVM: MMU: add tracepoint for kvm_mmu_invalidate_all_pages" ...as part of removing x86 KVM's fast invalidate mechanism, i.e. this is one part of a revert all patches from the series that introduced the mechanism[1]. This reverts commit 35006126f024f68727c67001b9cb703c38f69268. 
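The FIFO reasoning in the comment shown above, i.e. why the reverse walk over active_mmu_pages may stop at the first non-obsolete page, can be sketched as follows (made-up generation values and a plain array instead of the kernel's list primitives):

    #include <stdio.h>

    /* New pages are added at the list head, so walking from the tail visits
     * the oldest pages first; the first non-obsolete page ends the walk. */
    struct page { unsigned long mmu_valid_gen; };

    int main(void)
    {
        unsigned long current_gen = 2;
        /* Index 0 is the list head (newest), index 4 the tail (oldest). */
        struct page list[5] = { {2}, {2}, {1}, {1}, {1} };

        for (int i = 4; i >= 0; i--) {
            if (list[i].mmu_valid_gen == current_gen)
                break;                  /* everything newer is valid too */
            printf("zap obsolete page at slot %d\n", i);
        }
        return 0;
    }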
[1] https://lkml.kernel.org/r/1369960590-14138-1-git-send-email-xiaoguangrong@linux.vnet.ibm.com Cc: Xiao Guangrong Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 1 - arch/x86/kvm/mmutrace.h | 21 --------------------- 2 files changed, 22 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index cb9fd69d2632..df4025e6f1e1 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5916,7 +5916,6 @@ restart: void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm) { spin_lock(&kvm->mmu_lock); - trace_kvm_mmu_invalidate_zap_all_pages(kvm); kvm->arch.mmu_valid_gen++; kvm_zap_obsolete_pages(kvm); diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index c73bf4e4988c..cac88910b310 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -282,27 +282,6 @@ TRACE_EVENT( ) ); -TRACE_EVENT( - kvm_mmu_invalidate_zap_all_pages, - TP_PROTO(struct kvm *kvm), - TP_ARGS(kvm), - - TP_STRUCT__entry( - __field(unsigned long, mmu_valid_gen) - __field(unsigned int, mmu_used_pages) - ), - - TP_fast_assign( - __entry->mmu_valid_gen = kvm->arch.mmu_valid_gen; - __entry->mmu_used_pages = kvm->arch.n_used_mmu_pages; - ), - - TP_printk("kvm-mmu-valid-gen %lx used_pages %x", - __entry->mmu_valid_gen, __entry->mmu_used_pages - ) -); - - TRACE_EVENT( check_mmio_spte, TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen), -- cgit From b59c4830ca185ba0e9f9e046fb1cd10a4a92627a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:30 -0800 Subject: Revert "KVM: MMU: show mmu_valid_gen in shadow page related tracepoints" ...as part of removing x86 KVM's fast invalidate mechanism, i.e. this is one part of a revert all patches from the series that introduced the mechanism[1]. This reverts commit 2248b023219251908aedda0621251cffc548f258. [1] https://lkml.kernel.org/r/1369960590-14138-1-git-send-email-xiaoguangrong@linux.vnet.ibm.com Cc: Xiao Guangrong Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmutrace.h | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index cac88910b310..9f6c855a0043 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -8,18 +8,16 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM kvmmmu -#define KVM_MMU_PAGE_FIELDS \ - __field(unsigned long, mmu_valid_gen) \ - __field(__u64, gfn) \ - __field(__u32, role) \ - __field(__u32, root_count) \ +#define KVM_MMU_PAGE_FIELDS \ + __field(__u64, gfn) \ + __field(__u32, role) \ + __field(__u32, root_count) \ __field(bool, unsync) -#define KVM_MMU_PAGE_ASSIGN(sp) \ - __entry->mmu_valid_gen = sp->mmu_valid_gen; \ - __entry->gfn = sp->gfn; \ - __entry->role = sp->role.word; \ - __entry->root_count = sp->root_count; \ +#define KVM_MMU_PAGE_ASSIGN(sp) \ + __entry->gfn = sp->gfn; \ + __entry->role = sp->role.word; \ + __entry->root_count = sp->root_count; \ __entry->unsync = sp->unsync; #define KVM_MMU_PAGE_PRINTK() ({ \ @@ -31,9 +29,8 @@ \ role.word = __entry->role; \ \ - trace_seq_printf(p, "sp gen %lx gfn %llx l%u%s q%u%s %s%s" \ + trace_seq_printf(p, "sp gfn %llx l%u%s q%u%s %s%s" \ " %snxe %sad root %u %s%c", \ - __entry->mmu_valid_gen, \ __entry->gfn, role.level, \ role.cr4_pae ? 
" pae" : "", \ role.quadrant, \ -- cgit From 7390de1e99a70895721165d0ccd4a6e16482960a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:31 -0800 Subject: Revert "KVM: x86: use the fast way to invalidate all pages" Revert to a slow kvm_mmu_zap_all() for kvm_arch_flush_shadow_all(). Flushing all shadow entries is only done during VM teardown, i.e. kvm_arch_flush_shadow_all() is only called when the associated MM struct is being released or when the VM instance is being freed. Although the performance of teardown itself isn't critical, KVM should still voluntarily schedule to play nice with the rest of the kernel; but that can be done without the fast invalidate mechanism in a future patch. This reverts commit 6ca18b6950f8dee29361722f28f69847724b276f. Cc: Xiao Guangrong Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 15 +++++++++++++++ arch/x86/kvm/x86.c | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index df4025e6f1e1..5cdeb88850f8 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5846,6 +5846,21 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm, } EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); +void kvm_mmu_zap_all(struct kvm *kvm) +{ + struct kvm_mmu_page *sp, *node; + LIST_HEAD(invalid_list); + + spin_lock(&kvm->mmu_lock); +restart: + list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) + if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) + goto restart; + + kvm_mmu_commit_zap_page(kvm, &invalid_list); + spin_unlock(&kvm->mmu_lock); +} + static void kvm_zap_obsolete_pages(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 78fb13f190a3..65e4559eef2f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9470,7 +9470,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, void kvm_arch_flush_shadow_all(struct kvm *kvm) { - kvm_mmu_invalidate_zap_all_pages(kvm); + kvm_mmu_zap_all(kvm); } void kvm_arch_flush_shadow_memslot(struct kvm *kvm, -- cgit From 8a674adc11cd4cc59e51eaea6f0cc4b3d5710411 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:32 -0800 Subject: KVM: x86/mmu: skip over invalid root pages when zapping all sptes ...to guarantee forward progress. When zapped, root pages are marked invalid and moved to the head of the active pages list until they are explicitly freed. Theoretically, having unzappable root pages at the head of the list could prevent kvm_mmu_zap_all() from making forward progress were a future patch to add a loop restart after processing a page, e.g. to drop mmu_lock on contention. Although kvm_mmu_prepare_zap_page() can theoretically take action on invalid pages, e.g. to zap unsync children, functionally it's not necessary (root pages will be re-zapped when freed) and practically speaking the odds of e.g. @unsync or @unsync_children becoming %true while zapping all pages is basically nil. 
Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5cdeb88850f8..c79ad7f31fdb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5853,9 +5853,12 @@ void kvm_mmu_zap_all(struct kvm *kvm) spin_lock(&kvm->mmu_lock); restart: - list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) + list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { + if (sp->role.invalid && sp->root_count) + continue; if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) goto restart; + } kvm_mmu_commit_zap_page(kvm, &invalid_list); spin_unlock(&kvm->mmu_lock); -- cgit From 5d6317ca4e61a3fa7528f832cd945c42fde8e67f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:33 -0800 Subject: KVM: x86/mmu: Voluntarily reschedule as needed when zapping all sptes Call cond_resched_lock() when zapping all sptes to reschedule if needed or to release and reacquire mmu_lock in case of contention. There is no need to flush or zap when temporarily dropping mmu_lock as zapping all sptes is done only when the owning userspace VMM has exited or when the VM is being destroyed, i.e. there is no interplay with memslots or MMIO generations to worry about. Be paranoid and restart the walk if mmu_lock is dropped to avoid any potential issues with consuming a stale iterator. The overhead in doing so is negligible as at worst there will be a few root shadow pages at the head of the list, i.e. the iterator is essentially the head of the list already. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c79ad7f31fdb..fa153d771f47 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5856,7 +5856,8 @@ restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { if (sp->role.invalid && sp->root_count) continue; - if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) + if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list) || + cond_resched_lock(&kvm->mmu_lock)) goto restart; } -- cgit From ea145aacf4ae8485cf179a4d0dc502e9f75044f4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:34 -0800 Subject: Revert "KVM: MMU: fast invalidate all pages" Remove x86 KVM's fast invalidate mechanism, i.e. revert all patches from the original series[1], now that all users of the fast invalidate mechanism are gone. This reverts commit 5304b8d37c2a5ebca48330f5e7868d240eafbed1. 
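For readers unfamiliar with the mechanism being reverted, the core of the generation-based "fast invalidate" idea removed in the diff below can be sketched as follows (field names mirror mmu_valid_gen from the removed code; user-space stand-ins, not kernel code):

    #include <stdbool.h>
    #include <stdio.h>

    /* Pages record the generation they were created in; bumping the VM-wide
     * generation makes every existing page "obsolete" in O(1), deferring the
     * actual zapping to a later lock-break-friendly walk. */
    struct vm { unsigned long mmu_valid_gen; };
    struct sp { unsigned long mmu_valid_gen; };

    static bool is_obsolete(struct vm *vm, struct sp *sp)
    {
        return sp->mmu_valid_gen != vm->mmu_valid_gen;
    }

    int main(void)
    {
        struct vm vm = { 0 };
        struct sp page = { vm.mmu_valid_gen };

        vm.mmu_valid_gen++;     /* "fast invalidate": everything is now stale */
        printf("page obsolete: %d\n", is_obsolete(&vm, &page));
        return 0;
    }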
[1] https://lkml.kernel.org/r/1369960590-14138-1-git-send-email-xiaoguangrong@linux.vnet.ibm.com Cc: Xiao Guangrong Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 98 +----------------------------------------------------- arch/x86/kvm/mmu.h | 1 - 2 files changed, 1 insertion(+), 98 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index fa153d771f47..6d602d4c3ca4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2060,12 +2060,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct if (!direct) sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); - - /* - * The active_mmu_pages list is the FIFO list, do not move the - * page until it is zapped. kvm_zap_obsolete_pages depends on - * this feature. See the comments in kvm_zap_obsolete_pages(). - */ list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); kvm_mod_used_mmu_pages(vcpu->kvm, +1); return sp; @@ -2214,7 +2208,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, #define for_each_valid_sp(_kvm, _sp, _gfn) \ hlist_for_each_entry(_sp, \ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ - if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) { \ + if ((_sp)->role.invalid) { \ } else #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \ @@ -2266,11 +2260,6 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { } static void mmu_audit_disable(void) { } #endif -static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); -} - static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { @@ -2495,7 +2484,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (level > PT_PAGE_TABLE_LEVEL && need_sync) flush |= kvm_sync_pages(vcpu, gfn, &invalid_list); } - sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; clear_page(sp->spt); trace_kvm_mmu_get_page(sp, true); @@ -4206,14 +4194,6 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, return false; if (cached_root_available(vcpu, new_cr3, new_role)) { - /* - * It is possible that the cached previous root page is - * obsolete because of a change in the MMU - * generation number. However, that is accompanied by - * KVM_REQ_MMU_RELOAD, which will free the root that we - * have set here and allocate a new one. - */ - kvm_make_request(KVM_REQ_LOAD_CR3, vcpu); if (!skip_tlb_flush) { kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); @@ -5865,82 +5845,6 @@ restart: spin_unlock(&kvm->mmu_lock); } -static void kvm_zap_obsolete_pages(struct kvm *kvm) -{ - struct kvm_mmu_page *sp, *node; - LIST_HEAD(invalid_list); - -restart: - list_for_each_entry_safe_reverse(sp, node, - &kvm->arch.active_mmu_pages, link) { - /* - * No obsolete page exists before new created page since - * active_mmu_pages is the FIFO list. 
- */ - if (!is_obsolete_sp(kvm, sp)) - break; - - /* - * Do not repeatedly zap a root page to avoid unnecessary - * KVM_REQ_MMU_RELOAD, otherwise we may not be able to - * progress: - * vcpu 0 vcpu 1 - * call vcpu_enter_guest(): - * 1): handle KVM_REQ_MMU_RELOAD - * and require mmu-lock to - * load mmu - * repeat: - * 1): zap root page and - * send KVM_REQ_MMU_RELOAD - * - * 2): if (cond_resched_lock(mmu-lock)) - * - * 2): hold mmu-lock and load mmu - * - * 3): see KVM_REQ_MMU_RELOAD bit - * on vcpu->requests is set - * then return 1 to call - * vcpu_enter_guest() again. - * goto repeat; - * - * Since we are reversely walking the list and the invalid - * list will be moved to the head, skip the invalid page - * can help us to avoid the infinity list walking. - */ - if (sp->role.invalid) - continue; - - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { - kvm_mmu_commit_zap_page(kvm, &invalid_list); - cond_resched_lock(&kvm->mmu_lock); - goto restart; - } - - if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) - goto restart; - } - - kvm_mmu_commit_zap_page(kvm, &invalid_list); -} - -/* - * Fast invalidate all shadow pages and use lock-break technique - * to zap obsolete pages. - * - * It's required when memslot is being deleted or VM is being - * destroyed, in these cases, we should ensure that KVM MMU does - * not use any resource of the being-deleted slot or all slots - * after calling the function. - */ -void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm) -{ - spin_lock(&kvm->mmu_lock); - kvm->arch.mmu_valid_gen++; - - kvm_zap_obsolete_pages(kvm); - spin_unlock(&kvm->mmu_lock); -} - static void kvm_mmu_zap_mmio_sptes(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index c7b333147c4a..bbdc60f2fae8 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -203,7 +203,6 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return -(u32)fault & errcode; } -void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm); void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); -- cgit From 83cdb56864bcb1466b454f17fff47348ca7925a2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:35 -0800 Subject: KVM: x86/mmu: Differentiate between nr zapped and list unstable The return value of kvm_mmu_prepare_zap_page() has evolved to become overloaded to convey two separate pieces of information. 1) was at least one page zapped and 2) has the list of MMU pages become unstable. In its original incarnation (as kvm_mmu_zap_page()), there was no return value at all. Commit 0738541396be ("KVM: MMU: awareness of new kvm_mmu_zap_page behaviour") added a return value in preparation for commit 4731d4c7a077 ("KVM: MMU: out of sync shadow core"). Although the return value was of type 'int', it was actually used as a boolean to indicate whether or not active_mmu_pages may have become unstable due to zapping children. Walking a list with list_for_each_entry_safe() only protects against deleting/moving the current entry, i.e. zapping a child page would break iteration due to modifying any number of entries. Later, commit 60c8aec6e2c9 ("KVM: MMU: use page array in unsync walk") modified mmu_zap_unsync_children() to return an approximation of the number of children zapped. This was not intentional; it was simply a side effect of how the code was written.
The unintended side effect was then morphed into an actual feature by commit 77662e0028c7 ("KVM: MMU: fix kvm_mmu_zap_page() and its calling path"), which modified kvm_mmu_change_mmu_pages() to use the number of zapped pages when determining the number of MMU pages in use by the VM. Finally, commit 54a4f0239f2e ("KVM: MMU: make kvm_mmu_zap_page() return the number of pages it actually freed") added the initial page to the return value to make its behavior more consistent with what most users would expect. Incorporating the initial parent page in the return value of kvm_mmu_zap_page() breaks the original usage of restarting a list walk on a non-zero return value to handle a potentially unstable list, i.e. walks will unnecessarily restart when any page is zapped. Fix this by restoring the original behavior of kvm_mmu_zap_page(), i.e. return a boolean to indicate that the list may be unstable and move the number of zapped children to a dedicated parameter. Since the majority of callers to kvm_mmu_prepare_zap_page() don't care about either return value, preserve the current definition of kvm_mmu_prepare_zap_page() by making it a wrapper of a new helper, __kvm_mmu_prepare_zap_page(). This avoids having to update every call site and also provides cleaner code for functions that only care about the number of pages zapped. Fixes: 54a4f0239f2e ("KVM: MMU: make kvm_mmu_zap_page() return the number of pages it actually freed") Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 6d602d4c3ca4..4b93fcdf0839 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2200,8 +2200,8 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) --kvm->stat.mmu_unsync; } -static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, - struct list_head *invalid_list); +static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, + struct list_head *invalid_list); static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list); @@ -2669,17 +2669,22 @@ static int mmu_zap_unsync_children(struct kvm *kvm, return zapped; } -static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, - struct list_head *invalid_list) +static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, + struct kvm_mmu_page *sp, + struct list_head *invalid_list, + int *nr_zapped) { - int ret; + bool list_unstable; trace_kvm_mmu_prepare_zap_page(sp); ++kvm->stat.mmu_shadow_zapped; - ret = mmu_zap_unsync_children(kvm, sp, invalid_list); + *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list); kvm_mmu_page_unlink_children(kvm, sp); kvm_mmu_unlink_parents(kvm, sp); + /* Zapping children means active_mmu_pages has become unstable.
*/ + list_unstable = *nr_zapped; + if (!sp->role.invalid && !sp->role.direct) unaccount_shadowed(kvm, sp); @@ -2687,7 +2692,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, kvm_unlink_unsync_page(kvm, sp); if (!sp->root_count) { /* Count self */ - ret++; + (*nr_zapped)++; list_move(&sp->link, invalid_list); kvm_mod_used_mmu_pages(kvm, -1); } else { @@ -2698,7 +2703,16 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, } sp->role.invalid = 1; - return ret; + return list_unstable; +} + +static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, + struct list_head *invalid_list) +{ + int nr_zapped; + + __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped); + return nr_zapped; } static void kvm_mmu_commit_zap_page(struct kvm *kvm, @@ -5830,13 +5844,14 @@ void kvm_mmu_zap_all(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; LIST_HEAD(invalid_list); + int ign; spin_lock(&kvm->mmu_lock); restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { if (sp->role.invalid && sp->root_count) continue; - if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list) || + if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign) || cond_resched_lock(&kvm->mmu_lock)) goto restart; } @@ -5849,13 +5864,14 @@ static void kvm_mmu_zap_mmio_sptes(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; LIST_HEAD(invalid_list); + int ign; spin_lock(&kvm->mmu_lock); restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { if (!sp->mmio_cached) continue; - if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list) || + if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign) || cond_resched_lock(&kvm->mmu_lock)) goto restart; } -- cgit From 24efe61f696c7b44a66c7d6a41c181b31aa338fc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:36 -0800 Subject: KVM: x86/mmu: WARN if zapping a MMIO spte results in zapping children Paolo expressed a concern that kvm_mmu_zap_mmio_sptes() could have a quadratic runtime[1], i.e. restarting the spte walk while zapping only MMIO sptes could result in re-walking large portions of the list over and over due to the non-MMIO sptes encountered before the restart not being removed. At the time, the concern was legitimate as the walk was restarted when any spte was zapped. But that is no longer the case as the walk is now restarted iff one or more children have been zapped, which is necessary because zapping children makes the active_mmu_pages list unstable. Furthermore, it should be impossible for an MMIO spte to have children, i.e. zapping an MMIO spte should never result in zapping children. In other words, kvm_mmu_zap_mmio_sptes() should never restart its walk, and so should always execute in linear time. WARN if this assertion fails. Although it should never be needed, leave the restart logic in place. In normal operation, the cost is at worst an extra CMP+Jcc, and if for some reason the list does become unstable, not restarting would likely crash KVM, or worse, the kernel. 
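The split described above, a bool meaning "the list may have become unstable" plus an out-parameter carrying the count, has roughly the following shape. This is a simplified stand-alone sketch; unlike the real code, it always counts the page itself rather than only when no root references remain:

    #include <stdbool.h>
    #include <stdio.h>

    /* Same shape as __kvm_mmu_prepare_zap_page()/kvm_mmu_prepare_zap_page(). */
    static bool __prepare_zap(int children, int *nr_zapped)
    {
        bool list_unstable;

        *nr_zapped = children;      /* zapping children destabilizes the list */
        list_unstable = *nr_zapped;
        (*nr_zapped)++;             /* count the page itself */
        return list_unstable;
    }

    /* Wrapper for callers that only want "did we zap anything?". */
    static bool prepare_zap(int children)
    {
        int nr_zapped;

        __prepare_zap(children, &nr_zapped);
        return nr_zapped;
    }

    int main(void)
    {
        int nr;

        printf("list unstable=%d\n", __prepare_zap(0, &nr)); /* 0: keep walking */
        printf("zapped anything=%d\n", prepare_zap(0));      /* 1: the page itself */
        return 0;
    }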
[1] https://patchwork.kernel.org/patch/10756589/#22452085 Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 4b93fcdf0839..81618070367b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5871,8 +5871,11 @@ restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { if (!sp->mmio_cached) continue; - if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign) || - cond_resched_lock(&kvm->mmu_lock)) + if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) { + WARN_ON_ONCE(1); + goto restart; + } + if (cond_resched_lock(&kvm->mmu_lock)) goto restart; } -- cgit From 8ab3c471eef20925bf64c6d4fa46e88cdb4e86d5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Feb 2019 13:01:37 -0800 Subject: KVM: x86/mmu: Consolidate kvm_mmu_zap_all() and kvm_mmu_zap_mmio_sptes() ...via a new helper, __kvm_mmu_zap_all(). An alternative to passing a 'bool mmio_only' would be to pass a callback function to filter the shadow page, i.e. to make __kvm_mmu_zap_all() generic and reusable, but zapping all shadow pages is a last resort, i.e. making the helper less extensible is a feature of sorts. And the explicit MMIO parameter makes it easy to preserve the WARN_ON_ONCE() if a restart is triggered when zapping MMIO sptes. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 81618070367b..8d43b7c0f56f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5840,7 +5840,7 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm, } EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); -void kvm_mmu_zap_all(struct kvm *kvm) +static void __kvm_mmu_zap_all(struct kvm *kvm, bool mmio_only) { struct kvm_mmu_page *sp, *node; LIST_HEAD(invalid_list); @@ -5849,30 +5849,12 @@ void kvm_mmu_zap_all(struct kvm *kvm) spin_lock(&kvm->mmu_lock); restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { - if (sp->role.invalid && sp->root_count) + if (mmio_only && !sp->mmio_cached) continue; - if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign) || - cond_resched_lock(&kvm->mmu_lock)) - goto restart; - } - - kvm_mmu_commit_zap_page(kvm, &invalid_list); - spin_unlock(&kvm->mmu_lock); -} - -static void kvm_mmu_zap_mmio_sptes(struct kvm *kvm) -{ - struct kvm_mmu_page *sp, *node; - LIST_HEAD(invalid_list); - int ign; - - spin_lock(&kvm->mmu_lock); -restart: - list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { - if (!sp->mmio_cached) + if (sp->role.invalid && sp->root_count) continue; if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) { - WARN_ON_ONCE(1); + WARN_ON_ONCE(mmio_only); goto restart; } if (cond_resched_lock(&kvm->mmu_lock)) @@ -5883,6 +5865,11 @@ restart: spin_unlock(&kvm->mmu_lock); } +void kvm_mmu_zap_all(struct kvm *kvm) +{ + return __kvm_mmu_zap_all(kvm, false); +} + void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) { WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); @@ -5904,7 +5891,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) */ if (unlikely(gen == 0)) { kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n"); - kvm_mmu_zap_mmio_sptes(kvm); + __kvm_mmu_zap_all(kvm, true); } } -- cgit 
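A condensed model of the consolidated walk above, showing how the mmio_only filter slots into the common loop (stand-in types, no mmu_lock or rescheduling; illustrative only):

    #include <stdbool.h>
    #include <stdio.h>

    struct sp { bool mmio_cached; bool invalid; int root_count; bool zapped; };

    /* One walk serves both "zap everything" and "zap only MMIO sptes",
     * mirroring __kvm_mmu_zap_all(kvm, mmio_only). */
    static void zap_all(struct sp *list, int n, bool mmio_only)
    {
        for (int i = 0; i < n; i++) {
            if (mmio_only && !list[i].mmio_cached)
                continue;
            if (list[i].invalid && list[i].root_count)
                continue;
            list[i].zapped = true;
            /*
             * The real walk restarts if zapping made the list unstable and
             * WARNs if that happens with mmio_only set, since an MMIO spte
             * should never have children; this flat model never restarts.
             */
        }
    }

    int main(void)
    {
        struct sp pages[3] = { { true }, { false }, { true } };

        zap_all(pages, 3, true);
        for (int i = 0; i < 3; i++)
            printf("page %d zapped=%d\n", i, pages[i].zapped);
        return 0;
    }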
From 92da008fa21034c369cdb8ca2b629fe5c196826b Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Tue, 12 Mar 2019 11:45:58 -0700 Subject: Revert "KVM/MMU: Flush tlb directly in the kvm_zap_gfn_range()" This reverts commit 71883a62fcd6c70639fa12cda733378b4d997409. The above commit contains an optimization to kvm_zap_gfn_range which uses gfn-limited TLB flushes, if enabled. If using these limited flushes, kvm_zap_gfn_range passes lock_flush_tlb=false to slot_handle_level_range which creates a race when the function unlocks to call cond_resched. See an example of this race below: CPU 0 CPU 1 CPU 3 // zap_direct_gfn_range mmu_lock() // *ptep == pte_1 *ptep = 0 if (lock_flush_tlb) flush_tlbs() mmu_unlock() // In invalidate range // MMU notifier mmu_lock() if (pte != 0) *ptep = 0 flush = true if (flush) flush_remote_tlbs() mmu_unlock() return // Host MM reallocates // page previously // backing guest memory. // Guest accesses // invalid page // through pte_1 // in its TLB!! Tested: Ran all kvm-unit-tests on a Intel Haswell machine with and without this patch. The patch introduced no new failures. Signed-off-by: Ben Gardon Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8d43b7c0f56f..4cda5ee48845 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5660,13 +5660,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; - bool flush_tlb = true; - bool flush = false; int i; - if (kvm_available_flush_tlb_with_range()) - flush_tlb = false; - spin_lock(&kvm->mmu_lock); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { slots = __kvm_memslots(kvm, i); @@ -5678,17 +5673,12 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) if (start >= end) continue; - flush |= slot_handle_level_range(kvm, memslot, - kvm_zap_rmapp, PT_PAGE_TABLE_LEVEL, - PT_MAX_HUGEPAGE_LEVEL, start, - end - 1, flush_tlb); + slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, + PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL, + start, end - 1, true); } } - if (flush) - kvm_flush_remote_tlbs_with_address(kvm, gfn_start, - gfn_end - gfn_start + 1); - spin_unlock(&kvm->mmu_lock); } -- cgit From 4a605bc08e98381d8df61c30a4acb2eac15eb7da Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 15 Mar 2019 19:23:45 +0100 Subject: kvm: vmx: fix formatting of a comment Eliminate a gratuitous conflict with 5.0. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kvm') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4950bb20e06a..886930db77c5 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1213,13 +1213,13 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) new.control) != old.control); /* - * Clear SN before reading the bitmap; this ensures that any - * interrupt that comes after the bitmap is read sets ON. The - * VT-d firmware * writes the bitmap and reads SN atomically (5.2.3 - * in the spec), so it doesn't really have a memory barrier that - * pairs with this. However, we cannot do that and we need one. + * Clear SN before reading the bitmap. 
The VT-d firmware + * writes the bitmap and reads SN atomically (5.2.3 in the + * spec), so it doesn't really have a memory barrier that + * pairs with this, but we cannot do that and we need one. */ smp_mb__after_atomic(); + if (!bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS)) pi_set_on(pi_desc); } -- cgit
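The ordering that the reworded comment describes can be sketched with C11 atomics as below. The types and the single-word "PIR" are stand-ins for the real posted-interrupt descriptor, and the fence models smp_mb__after_atomic(); without it, the read of the pending bits could be ordered before the SN clear, and a vector posted in that window would neither show up in the read nor set ON.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct pi_desc_model {
        atomic_bool  sn;     /* suppress notification */
        atomic_ulong pir;    /* one word standing in for the 256-bit PIR */
        atomic_bool  on;     /* outstanding notification */
    };

    static void vcpu_unblock(struct pi_desc_model *pi)
    {
        atomic_store(&pi->sn, false);               /* clear SN */
        atomic_thread_fence(memory_order_seq_cst);  /* smp_mb__after_atomic() */
        if (atomic_load(&pi->pir))                  /* bitmap_empty() check */
            atomic_store(&pi->on, true);            /* pi_set_on() */
    }

    int main(void)
    {
        struct pi_desc_model pi = { true, 1UL, false };

        vcpu_unblock(&pi);
        printf("ON=%d\n", (int)atomic_load(&pi.on));
        return 0;
    }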