32 files changed, 796 insertions, 437 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 9fa2bf8c3f6f..695544420ff2 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -959,7 +959,8 @@ documentation when it pops into existence). 4.37 KVM_ENABLE_CAP Capability: KVM_CAP_ENABLE_CAP, KVM_CAP_ENABLE_CAP_VM -Architectures: ppc, s390 +Architectures: x86 (only KVM_CAP_ENABLE_CAP_VM), + mips (only KVM_CAP_ENABLE_CAP), ppc, s390 Type: vcpu ioctl, vm ioctl (with KVM_CAP_ENABLE_CAP_VM) Parameters: struct kvm_enable_cap (in) Returns: 0 on success; -1 on error diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index d9631ecddd56..e41cb11f71b2 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -553,13 +553,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) * Enter the guest */ trace_kvm_entry(*vcpu_pc(vcpu)); - kvm_guest_enter(); + __kvm_guest_enter(); vcpu->mode = IN_GUEST_MODE; ret = kvm_call_hyp(__kvm_vcpu_run, vcpu); vcpu->mode = OUTSIDE_GUEST_MODE; - kvm_guest_exit(); + __kvm_guest_exit(); trace_kvm_exit(kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); /* * We may have taken a host interrupt in HYP mode (ie diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index bb68e8d520e8..a8e660a44474 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -393,7 +393,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) kvm_mips_deliver_interrupts(vcpu, kvm_read_c0_guest_cause(vcpu->arch.cop0)); - kvm_guest_enter(); + __kvm_guest_enter(); /* Disable hardware page table walking while in guest */ htw_stop(); @@ -403,7 +403,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) /* Re-enable HTW before enabling interrupts */ htw_start(); - kvm_guest_exit(); + __kvm_guest_exit(); local_irq_enable(); if (vcpu->sigset_active) @@ -982,7 +982,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) /* If nothing is dirty, don't bother messing with page tables. 
*/ if (is_dirty) { - memslot = &kvm->memslots->memslots[log->slot]; + memslot = id_to_memslot(kvm->memslots, log->slot); ga = memslot->base_gfn << PAGE_SHIFT; ga_end = ga + (memslot->npages << PAGE_SHIFT); diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 6c1316a15a27..3872ab31c80a 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1004,10 +1004,10 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, break; } - local_irq_enable(); - trace_kvm_exit(exit_nr, vcpu); - kvm_guest_exit(); + __kvm_guest_exit(); + + local_irq_enable(); run->exit_reason = KVM_EXIT_UNKNOWN; run->ready_for_interrupt_injection = 1; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index ac3ddf115f3d..8cd1f80fdc70 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -115,7 +115,7 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) continue; } - kvm_guest_enter(); + __kvm_guest_enter(); return 1; } diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index d01fc588b5c3..444c412307cb 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -80,6 +80,7 @@ struct sca_block { #define CPUSTAT_MCDS 0x00000100 #define CPUSTAT_SM 0x00000080 #define CPUSTAT_IBS 0x00000040 +#define CPUSTAT_GED2 0x00000010 #define CPUSTAT_G 0x00000008 #define CPUSTAT_GED 0x00000004 #define CPUSTAT_J 0x00000002 @@ -95,7 +96,8 @@ struct kvm_s390_sie_block { #define PROG_IN_SIE (1<<0) __u32 prog0c; /* 0x000c */ __u8 reserved10[16]; /* 0x0010 */ -#define PROG_BLOCK_SIE 0x00000001 +#define PROG_BLOCK_SIE (1<<0) +#define PROG_REQUEST (1<<1) atomic_t prog20; /* 0x0020 */ __u8 reserved24[4]; /* 0x0024 */ __u64 cputm; /* 0x0028 */ diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 99b44acbfcc7..3238893c9d4f 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -1005,7 +1005,7 @@ ENTRY(sie64a) .Lsie_gmap: lg %r14,__SF_EMPTY(%r15) # get control block pointer oi __SIE_PROG0C+3(%r14),1 # we are going into SIE now - tm __SIE_PROG20+3(%r14),1 # last exit... + tm __SIE_PROG20+3(%r14),3 # last exit... 
jnz .Lsie_done LPP __SF_EMPTY(%r15) # set guest id sie 0(%r14) diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 9e3779e3e496..7365e8a46032 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -241,21 +241,6 @@ static int handle_prog(struct kvm_vcpu *vcpu) return kvm_s390_inject_prog_irq(vcpu, &pgm_info); } -static int handle_instruction_and_prog(struct kvm_vcpu *vcpu) -{ - int rc, rc2; - - vcpu->stat.exit_instr_and_program++; - rc = handle_instruction(vcpu); - rc2 = handle_prog(vcpu); - - if (rc == -EOPNOTSUPP) - vcpu->arch.sie_block->icptcode = 0x04; - if (rc) - return rc; - return rc2; -} - /** * handle_external_interrupt - used for external interruption interceptions * @@ -355,7 +340,6 @@ static const intercept_handler_t intercept_funcs[] = { [0x00 >> 2] = handle_noop, [0x04 >> 2] = handle_instruction, [0x08 >> 2] = handle_prog, - [0x0C >> 2] = handle_instruction_and_prog, [0x10 >> 2] = handle_noop, [0x14 >> 2] = handle_external_interrupt, [0x18 >> 2] = handle_noop, diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 9de47265ef73..322ef9cfdc80 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -134,6 +134,8 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu) active_mask = pending_local_irqs(vcpu); active_mask |= pending_floating_irqs(vcpu); + if (!active_mask) + return 0; if (psw_extint_disabled(vcpu)) active_mask &= ~IRQ_PEND_EXT_MASK; @@ -941,12 +943,9 @@ int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu) if (cpu_timer_irq_pending(vcpu)) set_bit(IRQ_PEND_EXT_CPU_TIMER, &li->pending_irqs); - do { - irqs = deliverable_irqs(vcpu); + while ((irqs = deliverable_irqs(vcpu)) && !rc) { /* bits are in the order of interrupt priority */ irq_type = find_first_bit(&irqs, IRQ_PEND_COUNT); - if (irq_type == IRQ_PEND_COUNT) - break; if (is_ioirq(irq_type)) { rc = __deliver_io(vcpu, irq_type); } else { @@ -958,9 +957,7 @@ int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu) } rc = func(vcpu); } - if (rc) - break; - } while (!rc); + } set_intercept_indicators(vcpu); @@ -1061,7 +1058,7 @@ static int __inject_extcall(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) if (sclp_has_sigpif()) return __inject_extcall_sigpif(vcpu, src_id); - if (!test_and_set_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs)) + if (test_and_set_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs)) return -EBUSY; *extcall = irq->u.extcall; atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags); @@ -1340,12 +1337,54 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) return 0; } -static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) +/* + * Find a destination VCPU for a floating irq and kick it. 
+ */ +static void __floating_irq_kick(struct kvm *kvm, u64 type) { + struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int; struct kvm_s390_local_interrupt *li; + struct kvm_vcpu *dst_vcpu; + int sigcpu, online_vcpus, nr_tries = 0; + + online_vcpus = atomic_read(&kvm->online_vcpus); + if (!online_vcpus) + return; + + /* find idle VCPUs first, then round robin */ + sigcpu = find_first_bit(fi->idle_mask, online_vcpus); + if (sigcpu == online_vcpus) { + do { + sigcpu = fi->next_rr_cpu; + fi->next_rr_cpu = (fi->next_rr_cpu + 1) % online_vcpus; + /* avoid endless loops if all vcpus are stopped */ + if (nr_tries++ >= online_vcpus) + return; + } while (is_vcpu_stopped(kvm_get_vcpu(kvm, sigcpu))); + } + dst_vcpu = kvm_get_vcpu(kvm, sigcpu); + + /* make the VCPU drop out of the SIE, or wake it up if sleeping */ + li = &dst_vcpu->arch.local_int; + spin_lock(&li->lock); + switch (type) { + case KVM_S390_MCHK: + atomic_set_mask(CPUSTAT_STOP_INT, li->cpuflags); + break; + case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: + atomic_set_mask(CPUSTAT_IO_INT, li->cpuflags); + break; + default: + atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags); + break; + } + spin_unlock(&li->lock); + kvm_s390_vcpu_wakeup(dst_vcpu); +} + +static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) +{ struct kvm_s390_float_interrupt *fi; - struct kvm_vcpu *dst_vcpu = NULL; - int sigcpu; u64 type = READ_ONCE(inti->type); int rc; @@ -1373,32 +1412,8 @@ static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) if (rc) return rc; - sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS); - if (sigcpu == KVM_MAX_VCPUS) { - do { - sigcpu = fi->next_rr_cpu++; - if (sigcpu == KVM_MAX_VCPUS) - sigcpu = fi->next_rr_cpu = 0; - } while (kvm_get_vcpu(kvm, sigcpu) == NULL); - } - dst_vcpu = kvm_get_vcpu(kvm, sigcpu); - li = &dst_vcpu->arch.local_int; - spin_lock(&li->lock); - switch (type) { - case KVM_S390_MCHK: - atomic_set_mask(CPUSTAT_STOP_INT, li->cpuflags); - break; - case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: - atomic_set_mask(CPUSTAT_IO_INT, li->cpuflags); - break; - default: - atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags); - break; - } - spin_unlock(&li->lock); - kvm_s390_vcpu_wakeup(kvm_get_vcpu(kvm, sigcpu)); + __floating_irq_kick(kvm, type); return 0; - } int kvm_s390_inject_vm(struct kvm *kvm, diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 8cd8e7b288c5..d461f8a15c07 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -110,7 +110,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { /* upper facilities limit for kvm */ unsigned long kvm_s390_fac_list_mask[] = { 0xffe6fffbfcfdfc40UL, - 0x005c800000000000UL, + 0x005e800000000000UL, }; unsigned long kvm_s390_fac_list_mask_size(void) @@ -454,10 +454,10 @@ static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr) mutex_lock(&kvm->lock); kvm->arch.epoch = gtod - host_tod; - kvm_for_each_vcpu(vcpu_idx, cur_vcpu, kvm) { + kvm_s390_vcpu_block_all(kvm); + kvm_for_each_vcpu(vcpu_idx, cur_vcpu, kvm) cur_vcpu->arch.sie_block->epoch = kvm->arch.epoch; - exit_sie(cur_vcpu); - } + kvm_s390_vcpu_unblock_all(kvm); mutex_unlock(&kvm->lock); return 0; } @@ -1311,8 +1311,13 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH | CPUSTAT_SM | - CPUSTAT_STOPPED | - CPUSTAT_GED); + CPUSTAT_STOPPED); + + if (test_kvm_facility(vcpu->kvm, 78)) + atomic_set_mask(CPUSTAT_GED2, &vcpu->arch.sie_block->cpuflags); + else if 
(test_kvm_facility(vcpu->kvm, 8)) + atomic_set_mask(CPUSTAT_GED, &vcpu->arch.sie_block->cpuflags); + kvm_s390_vcpu_setup_model(vcpu); vcpu->arch.sie_block->ecb = 6; @@ -1409,16 +1414,26 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) return kvm_s390_vcpu_has_irq(vcpu, 0); } -void s390_vcpu_block(struct kvm_vcpu *vcpu) +void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu) { atomic_set_mask(PROG_BLOCK_SIE, &vcpu->arch.sie_block->prog20); } -void s390_vcpu_unblock(struct kvm_vcpu *vcpu) +void kvm_s390_vcpu_unblock(struct kvm_vcpu *vcpu) { atomic_clear_mask(PROG_BLOCK_SIE, &vcpu->arch.sie_block->prog20); } +static void kvm_s390_vcpu_request(struct kvm_vcpu *vcpu) +{ + atomic_set_mask(PROG_REQUEST, &vcpu->arch.sie_block->prog20); +} + +static void kvm_s390_vcpu_request_handled(struct kvm_vcpu *vcpu) +{ + atomic_clear_mask(PROG_REQUEST, &vcpu->arch.sie_block->prog20); +} + /* * Kick a guest cpu out of SIE and wait until SIE is not running. * If the CPU is not running (e.g. waiting as idle) the function will @@ -1430,10 +1445,11 @@ void exit_sie(struct kvm_vcpu *vcpu) cpu_relax(); } -/* Kick a guest cpu out of SIE and prevent SIE-reentry */ -void exit_sie_sync(struct kvm_vcpu *vcpu) +/* Kick a guest cpu out of SIE to process a request synchronously */ +void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu) { - s390_vcpu_block(vcpu); + kvm_make_request(req, vcpu); + kvm_s390_vcpu_request(vcpu); exit_sie(vcpu); } @@ -1447,8 +1463,7 @@ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address) /* match against both prefix pages */ if (kvm_s390_get_prefix(vcpu) == (address & ~0x1000UL)) { VCPU_EVENT(vcpu, 2, "gmap notifier for %lx", address); - kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); - exit_sie_sync(vcpu); + kvm_s390_sync_request(KVM_REQ_MMU_RELOAD, vcpu); } } } @@ -1720,8 +1735,10 @@ static bool ibs_enabled(struct kvm_vcpu *vcpu) static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu) { + if (!vcpu->requests) + return 0; retry: - s390_vcpu_unblock(vcpu); + kvm_s390_vcpu_request_handled(vcpu); /* * We use MMU_RELOAD just to re-arm the ipte notifier for the * guest prefix page. gmap_ipte_notify will wait on the ptl lock. @@ -1993,12 +2010,14 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) * As PF_VCPU will be used in fault handler, between * guest_enter and guest_exit should be no uaccess. 
*/ - preempt_disable(); - kvm_guest_enter(); - preempt_enable(); + local_irq_disable(); + __kvm_guest_enter(); + local_irq_enable(); exit_reason = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs); - kvm_guest_exit(); + local_irq_disable(); + __kvm_guest_exit(); + local_irq_enable(); vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); rc = vcpu_post_run(vcpu, exit_reason); @@ -2206,8 +2225,7 @@ int kvm_s390_vcpu_store_adtl_status(struct kvm_vcpu *vcpu, unsigned long addr) static void __disable_ibs_on_vcpu(struct kvm_vcpu *vcpu) { kvm_check_request(KVM_REQ_ENABLE_IBS, vcpu); - kvm_make_request(KVM_REQ_DISABLE_IBS, vcpu); - exit_sie_sync(vcpu); + kvm_s390_sync_request(KVM_REQ_DISABLE_IBS, vcpu); } static void __disable_ibs_on_all_vcpus(struct kvm *kvm) @@ -2223,8 +2241,7 @@ static void __disable_ibs_on_all_vcpus(struct kvm *kvm) static void __enable_ibs_on_vcpu(struct kvm_vcpu *vcpu) { kvm_check_request(KVM_REQ_DISABLE_IBS, vcpu); - kvm_make_request(KVM_REQ_ENABLE_IBS, vcpu); - exit_sie_sync(vcpu); + kvm_s390_sync_request(KVM_REQ_ENABLE_IBS, vcpu); } void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu) diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index ca108b90ae56..c5704786e473 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -211,10 +211,10 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr); int kvm_s390_vcpu_store_adtl_status(struct kvm_vcpu *vcpu, unsigned long addr); void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu); -void s390_vcpu_block(struct kvm_vcpu *vcpu); -void s390_vcpu_unblock(struct kvm_vcpu *vcpu); +void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu); +void kvm_s390_vcpu_unblock(struct kvm_vcpu *vcpu); void exit_sie(struct kvm_vcpu *vcpu); -void exit_sie_sync(struct kvm_vcpu *vcpu); +void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu); int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu); /* is cmma enabled */ @@ -228,6 +228,25 @@ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu); int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu, struct kvm_s390_pgm_info *pgm_info); +static inline void kvm_s390_vcpu_block_all(struct kvm *kvm) +{ + int i; + struct kvm_vcpu *vcpu; + + WARN_ON(!mutex_is_locked(&kvm->lock)); + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_s390_vcpu_block(vcpu); +} + +static inline void kvm_s390_vcpu_unblock_all(struct kvm *kvm) +{ + int i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_s390_vcpu_unblock(vcpu); +} + /** * kvm_s390_inject_prog_cond - conditionally inject a program check * @vcpu: virtual cpu diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index d22d8ee1ff9d..ad4242245771 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -698,10 +698,14 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) case 0x00001000: end = (start + (1UL << 20)) & ~((1UL << 20) - 1); break; - /* We dont support EDAT2 case 0x00002000: + /* only support 2G frame size if EDAT2 is available and we are + not in 24-bit addressing mode */ + if (!test_kvm_facility(vcpu->kvm, 78) || + psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_AMODE_24BIT) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); end = (start + (1UL << 31)) & ~((1UL << 31) - 1); - break;*/ + break; default: return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); } diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f4a555beef19..1a4d6a054749 100644 --- 
a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -637,6 +637,8 @@ struct kvm_arch { #endif bool boot_vcpu_runs_old_kvmclock; + + u64 disabled_quirks; }; struct kvm_vm_stat { @@ -689,12 +691,13 @@ struct msr_data { struct kvm_lapic_irq { u32 vector; - u32 delivery_mode; - u32 dest_mode; - u32 level; - u32 trig_mode; + u16 delivery_mode; + u16 dest_mode; + bool level; + u16 trig_mode; u32 shorthand; u32 dest_id; + bool msi_redir_hint; }; struct kvm_x86_ops { @@ -711,7 +714,7 @@ struct kvm_x86_ops { /* Create, but do not attach this VCPU */ struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); void (*vcpu_free)(struct kvm_vcpu *vcpu); - void (*vcpu_reset)(struct kvm_vcpu *vcpu); + void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event); void (*prepare_guest_switch)(struct kvm_vcpu *vcpu); void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); @@ -1002,7 +1005,7 @@ void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); void kvm_inject_nmi(struct kvm_vcpu *vcpu); -int fx_init(struct kvm_vcpu *vcpu); +int fx_init(struct kvm_vcpu *vcpu, bool init_event); void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes); @@ -1146,7 +1149,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); -void kvm_vcpu_reset(struct kvm_vcpu *vcpu); +void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu); void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, unsigned long address); diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index d6b078e9fa28..628954ceede1 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -86,7 +86,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, offset = pvclock_get_nsec_offset(src); ret = src->system_time + offset; ret_flags = src->flags; - rdtsc_barrier(); *cycles = ret; *flags = ret_flags; diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index d7dcef58aefa..2fec75e4b1e1 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -345,4 +345,7 @@ struct kvm_xcrs { struct kvm_sync_regs { }; +#define KVM_QUIRK_LINT0_REENABLED (1 << 0) +#define KVM_QUIRK_CD_NW_CLEARED (1 << 1) + #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 9435620062df..cc34cece853e 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -331,7 +331,7 @@ static void kvm_guest_apic_eoi_write(u32 reg, u32 val) apic_write(APIC_EOI, APIC_EOI_ACK); } -void kvm_guest_cpu_init(void) +static void kvm_guest_cpu_init(void) { if (!kvm_para_available()) return; @@ -655,7 +655,7 @@ static inline void spin_time_accum_blocked(u64 start) static struct dentry *d_spin_debug; static struct dentry *d_kvm_debug; -struct dentry *kvm_init_debugfs(void) +static struct dentry *kvm_init_debugfs(void) { d_kvm_debug = debugfs_create_dir("kvm-guest", NULL); if (!d_kvm_debug) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 1d08ad3582d0..92a74a0428aa 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -97,7 +97,7 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) if (best && (best->eax & (F(XSAVES) | F(XSAVEC)))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - vcpu->arch.eager_fpu = guest_cpuid_has_mpx(vcpu); + vcpu->arch.eager_fpu = 
use_eager_fpu() || guest_cpuid_has_mpx(vcpu); /* * The existing code assumes virtual address is 48-bit in the canonical diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 630bcb0d7a04..9b655d113fc6 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -25,6 +25,7 @@ #include <linux/module.h> #include <asm/kvm_emulate.h> #include <linux/stringify.h> +#include <asm/debugreg.h> #include "x86.h" #include "tss.h" @@ -523,13 +524,9 @@ static void masked_increment(ulong *reg, ulong mask, int inc) static inline void register_address_increment(struct x86_emulate_ctxt *ctxt, int reg, int inc) { - ulong mask; + ulong *preg = reg_rmw(ctxt, reg); - if (ctxt->ad_bytes == sizeof(unsigned long)) - mask = ~0UL; - else - mask = ad_mask(ctxt); - masked_increment(reg_rmw(ctxt, reg), mask, inc); + assign_register(preg, *preg + inc, ctxt->ad_bytes); } static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc) @@ -2573,6 +2570,30 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, return true; } +static void string_registers_quirk(struct x86_emulate_ctxt *ctxt) +{ + /* + * Intel CPUs mask the counter and pointers in quite strange + * manner when ECX is zero due to REP-string optimizations. + */ +#ifdef CONFIG_X86_64 + if (ctxt->ad_bytes != 4 || !vendor_intel(ctxt)) + return; + + *reg_write(ctxt, VCPU_REGS_RCX) = 0; + + switch (ctxt->b) { + case 0xa4: /* movsb */ + case 0xa5: /* movsd/w */ + *reg_rmw(ctxt, VCPU_REGS_RSI) &= (u32)-1; + /* fall through */ + case 0xaa: /* stosb */ + case 0xab: /* stosd/w */ + *reg_rmw(ctxt, VCPU_REGS_RDI) &= (u32)-1; + } +#endif +} + static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, struct tss_segment_16 *tss) { @@ -2849,7 +2870,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, ulong old_tss_base = ops->get_cached_segment_base(ctxt, VCPU_SREG_TR); u32 desc_limit; - ulong desc_addr; + ulong desc_addr, dr7; /* FIXME: old_tss_base == ~0 ? 
*/ @@ -2934,6 +2955,9 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, ret = em_push(ctxt); } + ops->get_dr(ctxt, 7, &dr7); + ops->set_dr(ctxt, 7, dr7 & ~(DR_LOCAL_ENABLE_MASK | DR_LOCAL_SLOWDOWN)); + return ret; } @@ -3840,7 +3864,7 @@ static const struct opcode group5[] = { F(DstMem | SrcNone | Lock, em_inc), F(DstMem | SrcNone | Lock, em_dec), I(SrcMem | NearBranch, em_call_near_abs), - I(SrcMemFAddr | ImplicitOps | Stack, em_call_far), + I(SrcMemFAddr | ImplicitOps, em_call_far), I(SrcMem | NearBranch, em_jmp_abs), I(SrcMemFAddr | ImplicitOps, em_jmp_far), I(SrcMem | Stack, em_push), D(Undefined), @@ -4910,6 +4934,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) if (ctxt->rep_prefix && (ctxt->d & String)) { /* All REP prefixes have the same first termination condition */ if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) { + string_registers_quirk(ctxt); ctxt->eip = ctxt->_eip; ctxt->eflags &= ~X86_EFLAGS_RF; goto done; diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 28146f03c514..856f79105bb5 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -349,6 +349,7 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status) irqe.delivery_mode = entry->fields.delivery_mode << 8; irqe.level = 1; irqe.shorthand = 0; + irqe.msi_redir_hint = false; if (irqe.trig_mode == IOAPIC_EDGE_TRIG) ioapic->irr_delivered |= 1 << irq; @@ -637,11 +638,9 @@ void kvm_ioapic_destroy(struct kvm *kvm) struct kvm_ioapic *ioapic = kvm->arch.vioapic; cancel_delayed_work_sync(&ioapic->eoi_inject); - if (ioapic) { - kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); - kvm->arch.vioapic = NULL; - kfree(ioapic); - } + kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); + kvm->arch.vioapic = NULL; + kfree(ioapic); } int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 72298b3ac025..9efff9e5b58c 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -31,6 +31,8 @@ #include "ioapic.h" +#include "lapic.h" + static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) @@ -48,11 +50,6 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, line_status); } -inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq) -{ - return irq->delivery_mode == APIC_DM_LOWEST; -} - int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, unsigned long *dest_map) { @@ -60,7 +57,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_vcpu *vcpu, *lowest = NULL; if (irq->dest_mode == 0 && irq->dest_id == 0xff && - kvm_is_dm_lowest_prio(irq)) { + kvm_lowest_prio_delivery(irq)) { printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); irq->delivery_mode = APIC_DM_FIXED; } @@ -76,7 +73,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, irq->dest_id, irq->dest_mode)) continue; - if (!kvm_is_dm_lowest_prio(irq)) { + if (!kvm_lowest_prio_delivery(irq)) { if (r < 0) r = 0; r += kvm_apic_set_irq(vcpu, irq, dest_map); @@ -106,9 +103,10 @@ static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo; irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data; irq->delivery_mode = e->msi.data & 0x700; + irq->msi_redir_hint = ((e->msi.address_lo + & MSI_ADDR_REDIRECTION_LOWPRI) 
> 0); irq->level = 1; irq->shorthand = 0; - /* TODO Deal with RH bit of MSI message address */ } int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 629af0f1c5c4..dc5b57bb1f76 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -728,7 +728,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, dst = map->logical_map[cid]; - if (irq->delivery_mode == APIC_DM_LOWEST) { + if (kvm_lowest_prio_delivery(irq)) { int l = -1; for_each_set_bit(i, &bitmap, 16) { if (!dst[i]) @@ -914,9 +914,10 @@ static void apic_send_ipi(struct kvm_lapic *apic) irq.vector = icr_low & APIC_VECTOR_MASK; irq.delivery_mode = icr_low & APIC_MODE_MASK; irq.dest_mode = icr_low & APIC_DEST_MASK; - irq.level = icr_low & APIC_INT_ASSERT; + irq.level = (icr_low & APIC_INT_ASSERT) != 0; irq.trig_mode = icr_low & APIC_INT_LEVELTRIG; irq.shorthand = icr_low & APIC_SHORT_MASK; + irq.msi_redir_hint = false; if (apic_x2apic_mode(apic)) irq.dest_id = icr_high; else @@ -926,10 +927,11 @@ static void apic_send_ipi(struct kvm_lapic *apic) apic_debug("icr_high 0x%x, icr_low 0x%x, " "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " - "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n", + "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x, " + "msi_redir_hint 0x%x\n", icr_high, icr_low, irq.shorthand, irq.dest_id, irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, - irq.vector); + irq.vector, irq.msi_redir_hint); kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL); } @@ -1557,7 +1559,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) } -void kvm_lapic_reset(struct kvm_vcpu *vcpu) +void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) { struct kvm_lapic *apic; int i; @@ -1571,14 +1573,16 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) /* Stop the timer in case it's a reset to an active apic */ hrtimer_cancel(&apic->lapic_timer.timer); - kvm_apic_set_id(apic, vcpu->vcpu_id); + if (!init_event) + kvm_apic_set_id(apic, vcpu->vcpu_id); kvm_apic_set_version(apic->vcpu); for (i = 0; i < APIC_LVT_NUM; i++) apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); apic->lapic_timer.timer_mode = 0; - apic_set_reg(apic, APIC_LVT0, - SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); + if (!(vcpu->kvm->arch.disabled_quirks & KVM_QUIRK_LINT0_REENABLED)) + apic_set_reg(apic, APIC_LVT0, + SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); apic_set_reg(apic, APIC_DFR, 0xffffffffU); apic_set_spiv(apic, 0xff); @@ -1712,7 +1716,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE); static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */ - kvm_lapic_reset(vcpu); + kvm_lapic_reset(vcpu, false); kvm_iodevice_init(&apic->dev, &apic_mmio_ops); return 0; @@ -2046,8 +2050,8 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) pe = xchg(&apic->pending_events, 0); if (test_bit(KVM_APIC_INIT, &pe)) { - kvm_lapic_reset(vcpu); - kvm_vcpu_reset(vcpu); + kvm_lapic_reset(vcpu, true); + kvm_vcpu_reset(vcpu, true); if (kvm_vcpu_is_bsp(apic->vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 9d28383fc1e7..71b150cae5f9 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -48,7 +48,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); void kvm_apic_accept_events(struct kvm_vcpu *vcpu); -void 
kvm_lapic_reset(struct kvm_vcpu *vcpu); +void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event); u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu); @@ -153,6 +153,12 @@ static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) return vcpu->arch.apic->pending_events; } +static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq) +{ + return (irq->delivery_mode == APIC_DM_LOWEST || + irq->msi_redir_hint); +} + bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); void wait_lapic_expire(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 44a7d2515497..49c34e632b91 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -811,8 +811,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn) int i; slot = gfn_to_memslot(kvm, gfn); - for (i = PT_DIRECTORY_LEVEL; - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { linfo = lpage_info_slot(gfn, slot, i); linfo->write_count += 1; } @@ -826,8 +825,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) int i; slot = gfn_to_memslot(kvm, gfn); - for (i = PT_DIRECTORY_LEVEL; - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { linfo = lpage_info_slot(gfn, slot, i); linfo->write_count -= 1; WARN_ON(linfo->write_count < 0); @@ -858,8 +856,7 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn) page_size = kvm_host_page_size(kvm, gfn); - for (i = PT_PAGE_TABLE_LEVEL; - i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) { + for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { if (page_size >= KVM_HPAGE_SIZE(i)) ret = i; else @@ -1142,6 +1139,11 @@ static u64 *rmap_get_next(struct rmap_iterator *iter) return NULL; } +#define for_each_rmap_spte(_rmap_, _iter_, _spte_) \ + for (_spte_ = rmap_get_first(*_rmap_, _iter_); \ + _spte_ && ({BUG_ON(!is_shadow_present_pte(*_spte_)); 1;}); \ + _spte_ = rmap_get_next(_iter_)) + static void drop_spte(struct kvm *kvm, u64 *sptep) { if (mmu_spte_clear_track_bits(sptep)) @@ -1205,12 +1207,8 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, struct rmap_iterator iter; bool flush = false; - for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { - BUG_ON(!(*sptep & PT_PRESENT_MASK)); - + for_each_rmap_spte(rmapp, &iter, sptep) flush |= spte_write_protect(kvm, sptep, pt_protect); - sptep = rmap_get_next(&iter); - } return flush; } @@ -1232,12 +1230,8 @@ static bool __rmap_clear_dirty(struct kvm *kvm, unsigned long *rmapp) struct rmap_iterator iter; bool flush = false; - for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { - BUG_ON(!(*sptep & PT_PRESENT_MASK)); - + for_each_rmap_spte(rmapp, &iter, sptep) flush |= spte_clear_dirty(kvm, sptep); - sptep = rmap_get_next(&iter); - } return flush; } @@ -1259,12 +1253,8 @@ static bool __rmap_set_dirty(struct kvm *kvm, unsigned long *rmapp) struct rmap_iterator iter; bool flush = false; - for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { - BUG_ON(!(*sptep & PT_PRESENT_MASK)); - + for_each_rmap_spte(rmapp, &iter, sptep) flush |= spte_set_dirty(kvm, sptep); - sptep = rmap_get_next(&iter); - } return flush; } @@ -1351,8 +1341,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn) slot = gfn_to_memslot(kvm, gfn); - for (i = PT_PAGE_TABLE_LEVEL; - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + for (i = 
PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { rmapp = __gfn_to_rmap(gfn, i, slot); write_protected |= __rmap_write_protect(kvm, rmapp, true); } @@ -1360,24 +1349,28 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn) return write_protected; } -static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - unsigned long data) +static bool kvm_zap_rmapp(struct kvm *kvm, unsigned long *rmapp) { u64 *sptep; struct rmap_iterator iter; - int need_tlb_flush = 0; + bool flush = false; while ((sptep = rmap_get_first(*rmapp, &iter))) { BUG_ON(!(*sptep & PT_PRESENT_MASK)); - rmap_printk("kvm_rmap_unmap_hva: spte %p %llx gfn %llx (%d)\n", - sptep, *sptep, gfn, level); + rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep); drop_spte(kvm, sptep); - need_tlb_flush = 1; + flush = true; } - return need_tlb_flush; + return flush; +} + +static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, + struct kvm_memory_slot *slot, gfn_t gfn, int level, + unsigned long data) +{ + return kvm_zap_rmapp(kvm, rmapp); } static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, @@ -1394,8 +1387,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, WARN_ON(pte_huge(*ptep)); new_pfn = pte_pfn(*ptep); - for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { - BUG_ON(!is_shadow_present_pte(*sptep)); +restart: + for_each_rmap_spte(rmapp, &iter, sptep) { rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n", sptep, *sptep, gfn, level); @@ -1403,7 +1396,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, if (pte_write(*ptep)) { drop_spte(kvm, sptep); - sptep = rmap_get_first(*rmapp, &iter); + goto restart; } else { new_spte = *sptep & ~PT64_BASE_ADDR_MASK; new_spte |= (u64)new_pfn << PAGE_SHIFT; @@ -1414,7 +1407,6 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, mmu_spte_clear_track_bits(sptep); mmu_spte_set(sptep, new_spte); - sptep = rmap_get_next(&iter); } } @@ -1424,6 +1416,74 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, return 0; } +struct slot_rmap_walk_iterator { + /* input fields. */ + struct kvm_memory_slot *slot; + gfn_t start_gfn; + gfn_t end_gfn; + int start_level; + int end_level; + + /* output fields. */ + gfn_t gfn; + unsigned long *rmap; + int level; + + /* private field. 
*/ + unsigned long *end_rmap; +}; + +static void +rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level) +{ + iterator->level = level; + iterator->gfn = iterator->start_gfn; + iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot); + iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level, + iterator->slot); +} + +static void +slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator, + struct kvm_memory_slot *slot, int start_level, + int end_level, gfn_t start_gfn, gfn_t end_gfn) +{ + iterator->slot = slot; + iterator->start_level = start_level; + iterator->end_level = end_level; + iterator->start_gfn = start_gfn; + iterator->end_gfn = end_gfn; + + rmap_walk_init_level(iterator, iterator->start_level); +} + +static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator) +{ + return !!iterator->rmap; +} + +static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) +{ + if (++iterator->rmap <= iterator->end_rmap) { + iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level)); + return; + } + + if (++iterator->level > iterator->end_level) { + iterator->rmap = NULL; + return; + } + + rmap_walk_init_level(iterator, iterator->level); +} + +#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_, \ + _start_gfn, _end_gfn, _iter_) \ + for (slot_rmap_walk_init(_iter_, _slot_, _start_level_, \ + _end_level_, _start_gfn, _end_gfn); \ + slot_rmap_walk_okay(_iter_); \ + slot_rmap_walk_next(_iter_)) + static int kvm_handle_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, @@ -1435,10 +1495,10 @@ static int kvm_handle_hva_range(struct kvm *kvm, int level, unsigned long data)) { - int j; - int ret = 0; struct kvm_memslots *slots; struct kvm_memory_slot *memslot; + struct slot_rmap_walk_iterator iterator; + int ret = 0; slots = kvm_memslots(kvm); @@ -1458,26 +1518,11 @@ static int kvm_handle_hva_range(struct kvm *kvm, gfn_start = hva_to_gfn_memslot(hva_start, memslot); gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); - for (j = PT_PAGE_TABLE_LEVEL; - j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) { - unsigned long idx, idx_end; - unsigned long *rmapp; - gfn_t gfn = gfn_start; - - /* - * {idx(page_j) | page_j intersects with - * [hva_start, hva_end)} = {idx, idx+1, ..., idx_end}. 
- */ - idx = gfn_to_index(gfn_start, memslot->base_gfn, j); - idx_end = gfn_to_index(gfn_end - 1, memslot->base_gfn, j); - - rmapp = __gfn_to_rmap(gfn_start, j, memslot); - - for (; idx <= idx_end; - ++idx, gfn += (1UL << KVM_HPAGE_GFN_SHIFT(j))) - ret |= handler(kvm, rmapp++, memslot, - gfn, j, data); - } + for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL, + PT_MAX_HUGEPAGE_LEVEL, gfn_start, gfn_end - 1, + &iterator) + ret |= handler(kvm, iterator.rmap, memslot, + iterator.gfn, iterator.level, data); } return ret; @@ -1518,16 +1563,13 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, BUG_ON(!shadow_accessed_mask); - for (sptep = rmap_get_first(*rmapp, &iter); sptep; - sptep = rmap_get_next(&iter)) { - BUG_ON(!is_shadow_present_pte(*sptep)); - + for_each_rmap_spte(rmapp, &iter, sptep) if (*sptep & shadow_accessed_mask) { young = 1; clear_bit((ffs(shadow_accessed_mask) - 1), (unsigned long *)sptep); } - } + trace_kvm_age_page(gfn, level, slot, young); return young; } @@ -1548,15 +1590,11 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, if (!shadow_accessed_mask) goto out; - for (sptep = rmap_get_first(*rmapp, &iter); sptep; - sptep = rmap_get_next(&iter)) { - BUG_ON(!is_shadow_present_pte(*sptep)); - + for_each_rmap_spte(rmapp, &iter, sptep) if (*sptep & shadow_accessed_mask) { young = 1; break; } - } out: return young; } @@ -2393,19 +2431,20 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page); static int get_mtrr_type(struct mtrr_state_type *mtrr_state, u64 start, u64 end) { - int i; u64 base, mask; u8 prev_match, curr_match; - int num_var_ranges = KVM_NR_VAR_MTRR; + int i, num_var_ranges = KVM_NR_VAR_MTRR; - if (!mtrr_state->enabled) - return 0xFF; + /* MTRR is completely disabled, use UC for all of physical memory. */ + if (!(mtrr_state->enabled & 0x2)) + return MTRR_TYPE_UNCACHABLE; /* Make end inclusive end, instead of exclusive */ end--; /* Look in fixed ranges. Just return the type as per start */ - if (mtrr_state->have_fixed && (start < 0x100000)) { + if (mtrr_state->have_fixed && (mtrr_state->enabled & 0x1) && + (start < 0x100000)) { int idx; if (start < 0x80000) { @@ -2428,9 +2467,6 @@ static int get_mtrr_type(struct mtrr_state_type *mtrr_state, * Look of multiple ranges matching this address and pick type * as per MTRR precedence */ - if (!(mtrr_state->enabled & 2)) - return mtrr_state->def_type; - prev_match = 0xFF; for (i = 0; i < num_var_ranges; ++i) { unsigned short start_state, end_state; @@ -3475,10 +3511,12 @@ static bool can_do_async_pf(struct kvm_vcpu *vcpu) static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gva_t gva, pfn_t *pfn, bool write, bool *writable) { + struct kvm_memory_slot *slot; bool async; - *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable); - + slot = gfn_to_memslot(vcpu->kvm, gfn); + async = false; + *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable); if (!async) return false; /* *pfn has correct page already */ @@ -3492,8 +3530,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, return true; } - *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable); - + *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable); return false; } @@ -4420,36 +4457,113 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu) init_kvm_mmu(vcpu); } -void kvm_mmu_slot_remove_write_access(struct kvm *kvm, - struct kvm_memory_slot *memslot) +/* The return value indicates if tlb flush on all vcpus is needed. 
*/ +typedef bool (*slot_level_handler) (struct kvm *kvm, unsigned long *rmap); + +/* The caller should hold mmu-lock before calling this function. */ +static bool +slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, int start_level, int end_level, + gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb) { - gfn_t last_gfn; - int i; + struct slot_rmap_walk_iterator iterator; bool flush = false; - last_gfn = memslot->base_gfn + memslot->npages - 1; + for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn, + end_gfn, &iterator) { + if (iterator.rmap) + flush |= fn(kvm, iterator.rmap); - spin_lock(&kvm->mmu_lock); + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + if (flush && lock_flush_tlb) { + kvm_flush_remote_tlbs(kvm); + flush = false; + } + cond_resched_lock(&kvm->mmu_lock); + } + } - for (i = PT_PAGE_TABLE_LEVEL; - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { - unsigned long *rmapp; - unsigned long last_index, index; + if (flush && lock_flush_tlb) { + kvm_flush_remote_tlbs(kvm); + flush = false; + } - rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL]; - last_index = gfn_to_index(last_gfn, memslot->base_gfn, i); + return flush; +} - for (index = 0; index <= last_index; ++index, ++rmapp) { - if (*rmapp) - flush |= __rmap_write_protect(kvm, rmapp, - false); +static bool +slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, int start_level, int end_level, + bool lock_flush_tlb) +{ + return slot_handle_level_range(kvm, memslot, fn, start_level, + end_level, memslot->base_gfn, + memslot->base_gfn + memslot->npages - 1, + lock_flush_tlb); +} - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) - cond_resched_lock(&kvm->mmu_lock); - } +static bool +slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, bool lock_flush_tlb) +{ + return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, + PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); +} + +static bool +slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, bool lock_flush_tlb) +{ + return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1, + PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); +} + +static bool +slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, bool lock_flush_tlb) +{ + return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, + PT_PAGE_TABLE_LEVEL, lock_flush_tlb); +} + +void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + + slots = kvm_memslots(kvm); + + spin_lock(&kvm->mmu_lock); + kvm_for_each_memslot(memslot, slots) { + gfn_t start, end; + + start = max(gfn_start, memslot->base_gfn); + end = min(gfn_end, memslot->base_gfn + memslot->npages); + if (start >= end) + continue; + + slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, + PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL, + start, end - 1, true); } spin_unlock(&kvm->mmu_lock); +} + +static bool slot_rmap_write_protect(struct kvm *kvm, unsigned long *rmapp) +{ + return __rmap_write_protect(kvm, rmapp, false); +} + +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + bool flush; + + spin_lock(&kvm->mmu_lock); + flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect, + false); + spin_unlock(&kvm->mmu_lock); /* * kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log() @@ -4482,9 
+4596,8 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, pfn_t pfn; struct kvm_mmu_page *sp; - for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { - BUG_ON(!(*sptep & PT_PRESENT_MASK)); - +restart: + for_each_rmap_spte(rmapp, &iter, sptep) { sp = page_header(__pa(sptep)); pfn = spte_to_pfn(*sptep); @@ -4499,10 +4612,9 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, !kvm_is_reserved_pfn(pfn) && PageTransCompound(pfn_to_page(pfn))) { drop_spte(kvm, sptep); - sptep = rmap_get_first(*rmapp, &iter); need_tlb_flush = 1; - } else - sptep = rmap_get_next(&iter); + goto restart; + } } return need_tlb_flush; @@ -4511,59 +4623,18 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, struct kvm_memory_slot *memslot) { - bool flush = false; - unsigned long *rmapp; - unsigned long last_index, index; - spin_lock(&kvm->mmu_lock); - - rmapp = memslot->arch.rmap[0]; - last_index = gfn_to_index(memslot->base_gfn + memslot->npages - 1, - memslot->base_gfn, PT_PAGE_TABLE_LEVEL); - - for (index = 0; index <= last_index; ++index, ++rmapp) { - if (*rmapp) - flush |= kvm_mmu_zap_collapsible_spte(kvm, rmapp); - - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { - if (flush) { - kvm_flush_remote_tlbs(kvm); - flush = false; - } - cond_resched_lock(&kvm->mmu_lock); - } - } - - if (flush) - kvm_flush_remote_tlbs(kvm); - + slot_handle_leaf(kvm, memslot, kvm_mmu_zap_collapsible_spte, true); spin_unlock(&kvm->mmu_lock); } void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot) { - gfn_t last_gfn; - unsigned long *rmapp; - unsigned long last_index, index; - bool flush = false; - - last_gfn = memslot->base_gfn + memslot->npages - 1; + bool flush; spin_lock(&kvm->mmu_lock); - - rmapp = memslot->arch.rmap[PT_PAGE_TABLE_LEVEL - 1]; - last_index = gfn_to_index(last_gfn, memslot->base_gfn, - PT_PAGE_TABLE_LEVEL); - - for (index = 0; index <= last_index; ++index, ++rmapp) { - if (*rmapp) - flush |= __rmap_clear_dirty(kvm, rmapp); - - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) - cond_resched_lock(&kvm->mmu_lock); - } - + flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false); spin_unlock(&kvm->mmu_lock); lockdep_assert_held(&kvm->slots_lock); @@ -4582,31 +4653,11 @@ EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty); void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, struct kvm_memory_slot *memslot) { - gfn_t last_gfn; - int i; - bool flush = false; - - last_gfn = memslot->base_gfn + memslot->npages - 1; + bool flush; spin_lock(&kvm->mmu_lock); - - for (i = PT_PAGE_TABLE_LEVEL + 1; /* skip rmap for 4K page */ - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { - unsigned long *rmapp; - unsigned long last_index, index; - - rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL]; - last_index = gfn_to_index(last_gfn, memslot->base_gfn, i); - - for (index = 0; index <= last_index; ++index, ++rmapp) { - if (*rmapp) - flush |= __rmap_write_protect(kvm, rmapp, - false); - - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) - cond_resched_lock(&kvm->mmu_lock); - } - } + flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect, + false); spin_unlock(&kvm->mmu_lock); /* see kvm_mmu_slot_remove_write_access */ @@ -4620,31 +4671,10 @@ EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access); void kvm_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot) { - gfn_t last_gfn; - int i; - bool flush = false; - - last_gfn = memslot->base_gfn + memslot->npages 
- 1; + bool flush; spin_lock(&kvm->mmu_lock); - - for (i = PT_PAGE_TABLE_LEVEL; - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { - unsigned long *rmapp; - unsigned long last_index, index; - - rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL]; - last_index = gfn_to_index(last_gfn, memslot->base_gfn, i); - - for (index = 0; index <= last_index; ++index, ++rmapp) { - if (*rmapp) - flush |= __rmap_set_dirty(kvm, rmapp); - - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) - cond_resched_lock(&kvm->mmu_lock); - } - } - + flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false); spin_unlock(&kvm->mmu_lock); lockdep_assert_held(&kvm->slots_lock); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 0ada65ecddcf..398d21c0f6dd 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -43,6 +43,7 @@ #define PT_PDPE_LEVEL 3 #define PT_DIRECTORY_LEVEL 2 #define PT_PAGE_TABLE_LEVEL 1 +#define PT_MAX_HUGEPAGE_LEVEL (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES - 1) static inline u64 rsvd_bits(int s, int e) { @@ -170,4 +171,5 @@ static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, } void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm); +void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); #endif diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 9ade5cfb5a4c..368d53497314 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -197,13 +197,11 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) rmapp = gfn_to_rmap(kvm, sp->gfn, PT_PAGE_TABLE_LEVEL); - for (sptep = rmap_get_first(*rmapp, &iter); sptep; - sptep = rmap_get_next(&iter)) { + for_each_rmap_spte(rmapp, &iter, sptep) if (is_writable_pte(*sptep)) audit_printk(kvm, "shadow page has writable " "mappings: gfn %llx role %x\n", sp->gfn, sp->role.word); - } } static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9afa233b5482..b9f9e1073e50 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1082,7 +1082,7 @@ static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) return target_tsc - tsc; } -static void init_vmcb(struct vcpu_svm *svm) +static void init_vmcb(struct vcpu_svm *svm, bool init_event) { struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; @@ -1153,17 +1153,17 @@ static void init_vmcb(struct vcpu_svm *svm) init_sys_seg(&save->ldtr, SEG_TYPE_LDT); init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); - svm_set_efer(&svm->vcpu, 0); + if (!init_event) + svm_set_efer(&svm->vcpu, 0); save->dr6 = 0xffff0ff0; kvm_set_rflags(&svm->vcpu, 2); save->rip = 0x0000fff0; svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; /* - * This is the guest-visible cr0 value. * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. + * It also updates the guest-visible cr0 value. 
*/ - svm->vcpu.arch.cr0 = 0; (void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); save->cr4 = X86_CR4_PAE; @@ -1176,7 +1176,7 @@ static void init_vmcb(struct vcpu_svm *svm) clr_exception_intercept(svm, PF_VECTOR); clr_cr_intercept(svm, INTERCEPT_CR3_READ); clr_cr_intercept(svm, INTERCEPT_CR3_WRITE); - save->g_pat = 0x0007040600070406ULL; + save->g_pat = svm->vcpu.arch.pat; save->cr3 = 0; save->cr4 = 0; } @@ -1195,13 +1195,19 @@ static void init_vmcb(struct vcpu_svm *svm) enable_gif(svm); } -static void svm_vcpu_reset(struct kvm_vcpu *vcpu) +static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_svm *svm = to_svm(vcpu); u32 dummy; u32 eax = 1; - init_vmcb(svm); + if (!init_event) { + svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE | + MSR_IA32_APICBASE_ENABLE; + if (kvm_vcpu_is_reset_bsp(&svm->vcpu)) + svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; + } + init_vmcb(svm, init_event); kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); kvm_register_write(vcpu, VCPU_REGS_RDX, eax); @@ -1257,12 +1263,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) clear_page(svm->vmcb); svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; svm->asid_generation = 0; - init_vmcb(svm); - - svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE | - MSR_IA32_APICBASE_ENABLE; - if (kvm_vcpu_is_reset_bsp(&svm->vcpu)) - svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; + init_vmcb(svm, false); svm_init_osvw(&svm->vcpu); @@ -1575,7 +1576,8 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) * does not do it - this results in some delay at * reboot */ - cr0 &= ~(X86_CR0_CD | X86_CR0_NW); + if (!(vcpu->kvm->arch.disabled_quirks & KVM_QUIRK_CD_NW_CLEARED)) + cr0 &= ~(X86_CR0_CD | X86_CR0_NW); svm->vmcb->save.cr0 = cr0; mark_dirty(svm->vmcb, VMCB_CR); update_cr0_intercept(svm); @@ -1883,7 +1885,7 @@ static int shutdown_interception(struct vcpu_svm *svm) * so reinitialize it. 
*/ clear_page(svm->vmcb); - init_vmcb(svm); + init_vmcb(svm, false); kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; return 0; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2d73807f0d31..9cf5030927d5 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2170,8 +2170,7 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) if (is_guest_mode(vcpu)) msr_bitmap = vmx_msr_bitmap_nested; - else if (irqchip_in_kernel(vcpu->kvm) && - apic_x2apic_mode(vcpu->arch.apic)) { + else if (vcpu->arch.apic_base & X2APIC_ENABLE) { if (is_long_mode(vcpu)) msr_bitmap = vmx_msr_bitmap_longmode_x2apic; else @@ -4667,16 +4666,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); - if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { - u32 msr_low, msr_high; - u64 host_pat; - rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); - host_pat = msr_low | ((u64) msr_high << 32); - /* Write the default value follow host pat */ - vmcs_write64(GUEST_IA32_PAT, host_pat); - /* Keep arch.pat sync with GUEST_IA32_PAT */ - vmx->vcpu.arch.pat = host_pat; - } + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) + vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) { u32 index = vmx_msr_index[i]; @@ -4708,22 +4699,27 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) return 0; } -static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) +static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct msr_data apic_base_msr; + u64 cr0; vmx->rmode.vm86_active = 0; vmx->soft_vnmi_blocked = 0; vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); - kvm_set_cr8(&vmx->vcpu, 0); - apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; - if (kvm_vcpu_is_reset_bsp(&vmx->vcpu)) - apic_base_msr.data |= MSR_IA32_APICBASE_BSP; - apic_base_msr.host_initiated = true; - kvm_set_apic_base(&vmx->vcpu, &apic_base_msr); + kvm_set_cr8(vcpu, 0); + + if (!init_event) { + apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | + MSR_IA32_APICBASE_ENABLE; + if (kvm_vcpu_is_reset_bsp(vcpu)) + apic_base_msr.data |= MSR_IA32_APICBASE_BSP; + apic_base_msr.host_initiated = true; + kvm_set_apic_base(vcpu, &apic_base_msr); + } vmx_segment_cache_clear(vmx); @@ -4747,9 +4743,12 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); - vmcs_write32(GUEST_SYSENTER_CS, 0); - vmcs_writel(GUEST_SYSENTER_ESP, 0); - vmcs_writel(GUEST_SYSENTER_EIP, 0); + if (!init_event) { + vmcs_write32(GUEST_SYSENTER_CS, 0); + vmcs_writel(GUEST_SYSENTER_ESP, 0); + vmcs_writel(GUEST_SYSENTER_EIP, 0); + vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + } vmcs_writel(GUEST_RFLAGS, 0x02); kvm_rip_write(vcpu, 0xfff0); @@ -4764,18 +4763,15 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); - /* Special registers */ - vmcs_write64(GUEST_IA32_DEBUGCTL, 0); - setup_msrs(vmx); vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ - if (cpu_has_vmx_tpr_shadow()) { + if (cpu_has_vmx_tpr_shadow() && !init_event) { vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); - if (vm_need_tpr_shadow(vmx->vcpu.kvm)) + if (vm_need_tpr_shadow(vcpu->kvm)) vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, - __pa(vmx->vcpu.arch.apic->regs)); + __pa(vcpu->arch.apic->regs)); vmcs_write32(TPR_THRESHOLD, 0); } @@ -4787,12 +4783,14 @@ static void 
vmx_vcpu_reset(struct kvm_vcpu *vcpu) if (vmx->vpid != 0) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); - vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; - vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */ - vmx_set_cr4(&vmx->vcpu, 0); - vmx_set_efer(&vmx->vcpu, 0); - vmx_fpu_activate(&vmx->vcpu); - update_exception_bitmap(&vmx->vcpu); + cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; + vmx_set_cr0(vcpu, cr0); /* enter rmode */ + vmx->vcpu.arch.cr0 = cr0; + vmx_set_cr4(vcpu, 0); + if (!init_event) + vmx_set_efer(vcpu, 0); + vmx_fpu_activate(vcpu); + update_exception_bitmap(vcpu); vpid_sync_context(vmx); } @@ -5710,9 +5708,6 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) return 0; } - /* clear all local breakpoint enable flags */ - vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x155); - /* * TODO: What about debug traps on tss switch? * Are we supposed to inject them and update dr6? @@ -7691,6 +7686,158 @@ static void kvm_flush_pml_buffers(struct kvm *kvm) kvm_vcpu_kick(vcpu); } +static void vmx_dump_sel(char *name, uint32_t sel) +{ + pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", + name, vmcs_read32(sel), + vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), + vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), + vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); +} + +static void vmx_dump_dtsel(char *name, uint32_t limit) +{ + pr_err("%s limit=0x%08x, base=0x%016lx\n", + name, vmcs_read32(limit), + vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); +} + +static void dump_vmcs(void) +{ + u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); + u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); + u32 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); + u32 secondary_exec_control = 0; + unsigned long cr4 = vmcs_readl(GUEST_CR4); + u64 efer = vmcs_readl(GUEST_IA32_EFER); + int i, n; + + if (cpu_has_secondary_exec_ctrls()) + secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + + pr_err("*** Guest State ***\n"); + pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", + vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW), + vmcs_readl(CR0_GUEST_HOST_MASK)); + pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", + cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK)); + pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3)); + if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) && + (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA)) + { + pr_err("PDPTR0 = 0x%016lx PDPTR1 = 0x%016lx\n", + vmcs_readl(GUEST_PDPTR0), vmcs_readl(GUEST_PDPTR1)); + pr_err("PDPTR2 = 0x%016lx PDPTR3 = 0x%016lx\n", + vmcs_readl(GUEST_PDPTR2), vmcs_readl(GUEST_PDPTR3)); + } + pr_err("RSP = 0x%016lx RIP = 0x%016lx\n", + vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP)); + pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n", + vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7)); + pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", + vmcs_readl(GUEST_SYSENTER_ESP), + vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP)); + vmx_dump_sel("CS: ", GUEST_CS_SELECTOR); + vmx_dump_sel("DS: ", GUEST_DS_SELECTOR); + vmx_dump_sel("SS: ", GUEST_SS_SELECTOR); + vmx_dump_sel("ES: ", GUEST_ES_SELECTOR); + vmx_dump_sel("FS: ", GUEST_FS_SELECTOR); + vmx_dump_sel("GS: ", GUEST_GS_SELECTOR); + vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT); + vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR); + vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT); + vmx_dump_sel("TR: ", GUEST_TR_SELECTOR); + 
+	if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
+	    (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
+		pr_err("EFER = 0x%016llx PAT = 0x%016lx\n",
+		       efer, vmcs_readl(GUEST_IA32_PAT));
+	pr_err("DebugCtl = 0x%016lx DebugExceptions = 0x%016lx\n",
+	       vmcs_readl(GUEST_IA32_DEBUGCTL),
+	       vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
+	if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
+		pr_err("PerfGlobCtl = 0x%016lx\n",
+		       vmcs_readl(GUEST_IA32_PERF_GLOBAL_CTRL));
+	if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
+		pr_err("BndCfgS = 0x%016lx\n", vmcs_readl(GUEST_BNDCFGS));
+	pr_err("Interruptibility = %08x ActivityState = %08x\n",
+	       vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
+	       vmcs_read32(GUEST_ACTIVITY_STATE));
+	if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
+		pr_err("InterruptStatus = %04x\n",
+		       vmcs_read16(GUEST_INTR_STATUS));
+
+	pr_err("*** Host State ***\n");
+	pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
+	       vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
+	pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
+	       vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
+	       vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
+	       vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
+	       vmcs_read16(HOST_TR_SELECTOR));
+	pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
+	       vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
+	       vmcs_readl(HOST_TR_BASE));
+	pr_err("GDTBase=%016lx IDTBase=%016lx\n",
+	       vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
+	pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
+	       vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
+	       vmcs_readl(HOST_CR4));
+	pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
+	       vmcs_readl(HOST_IA32_SYSENTER_ESP),
+	       vmcs_read32(HOST_IA32_SYSENTER_CS),
+	       vmcs_readl(HOST_IA32_SYSENTER_EIP));
+	if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER))
+		pr_err("EFER = 0x%016lx PAT = 0x%016lx\n",
+		       vmcs_readl(HOST_IA32_EFER), vmcs_readl(HOST_IA32_PAT));
+	if (vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+		pr_err("PerfGlobCtl = 0x%016lx\n",
+		       vmcs_readl(HOST_IA32_PERF_GLOBAL_CTRL));
+
+	pr_err("*** Control State ***\n");
+	pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
+	       pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control);
+	pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);
+	pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
+	       vmcs_read32(EXCEPTION_BITMAP),
+	       vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
+	       vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
+	pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
+	       vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
+	       vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
+	       vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
+	pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
+	       vmcs_read32(VM_EXIT_INTR_INFO),
+	       vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
+	       vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
+	pr_err("        reason=%08x qualification=%016lx\n",
+	       vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
+	pr_err("IDTVectoring: info=%08x errcode=%08x\n",
+	       vmcs_read32(IDT_VECTORING_INFO_FIELD),
+	       vmcs_read32(IDT_VECTORING_ERROR_CODE));
+	pr_err("TSC Offset = 0x%016lx\n", vmcs_readl(TSC_OFFSET));
+	if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW)
+		pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
+	if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
+		pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
+	if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
pr_err("EPT pointer = 0x%016lx\n", vmcs_readl(EPT_POINTER)); + n = vmcs_read32(CR3_TARGET_COUNT); + for (i = 0; i + 1 < n; i += 4) + pr_err("CR3 target%u=%016lx target%u=%016lx\n", + i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2), + i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2)); + if (i < n) + pr_err("CR3 target%u=%016lx\n", + i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2)); + if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) + pr_err("PLE Gap=%08x Window=%08x\n", + vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW)); + if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID) + pr_err("Virtual processor ID = 0x%04x\n", + vmcs_read16(VIRTUAL_PROCESSOR_ID)); +} + /* * The guest has exited. See if we can fix it or if we need userspace * assistance. @@ -7723,6 +7870,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) } if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { + dump_vmcs(); vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason; @@ -8924,7 +9072,7 @@ static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, struct vmx_msr_entry *e) { /* x2APIC MSR accesses are not allowed */ - if (apic_x2apic_mode(vcpu->arch.apic) && e->index >> 8 == 0x8) + if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8) return -EINVAL; if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */ e->index == MSR_IA32_UCODE_REV) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ea306adbbc13..d42d8ace90f1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -572,8 +572,7 @@ out: int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { unsigned long old_cr0 = kvm_read_cr0(vcpu); - unsigned long update_bits = X86_CR0_PG | X86_CR0_WP | - X86_CR0_CD | X86_CR0_NW; + unsigned long update_bits = X86_CR0_PG | X86_CR0_WP; cr0 |= X86_CR0_ET; @@ -1852,6 +1851,63 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) } EXPORT_SYMBOL_GPL(kvm_mtrr_valid); +static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr) +{ + struct mtrr_state_type *mtrr_state = &vcpu->arch.mtrr_state; + unsigned char mtrr_enabled = mtrr_state->enabled; + gfn_t start, end, mask; + int index; + bool is_fixed = true; + + if (msr == MSR_IA32_CR_PAT || !tdp_enabled || + !kvm_arch_has_noncoherent_dma(vcpu->kvm)) + return; + + if (!(mtrr_enabled & 0x2) && msr != MSR_MTRRdefType) + return; + + switch (msr) { + case MSR_MTRRfix64K_00000: + start = 0x0; + end = 0x80000; + break; + case MSR_MTRRfix16K_80000: + start = 0x80000; + end = 0xa0000; + break; + case MSR_MTRRfix16K_A0000: + start = 0xa0000; + end = 0xc0000; + break; + case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000: + index = msr - MSR_MTRRfix4K_C0000; + start = 0xc0000 + index * (32 << 10); + end = start + (32 << 10); + break; + case MSR_MTRRdefType: + is_fixed = false; + start = 0x0; + end = ~0ULL; + break; + default: + /* variable range MTRRs. 
@@ -1852,6 +1851,63 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 }
 EXPORT_SYMBOL_GPL(kvm_mtrr_valid);
 
+static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr)
+{
+	struct mtrr_state_type *mtrr_state = &vcpu->arch.mtrr_state;
+	unsigned char mtrr_enabled = mtrr_state->enabled;
+	gfn_t start, end, mask;
+	int index;
+	bool is_fixed = true;
+
+	if (msr == MSR_IA32_CR_PAT || !tdp_enabled ||
+	      !kvm_arch_has_noncoherent_dma(vcpu->kvm))
+		return;
+
+	if (!(mtrr_enabled & 0x2) && msr != MSR_MTRRdefType)
+		return;
+
+	switch (msr) {
+	case MSR_MTRRfix64K_00000:
+		start = 0x0;
+		end = 0x80000;
+		break;
+	case MSR_MTRRfix16K_80000:
+		start = 0x80000;
+		end = 0xa0000;
+		break;
+	case MSR_MTRRfix16K_A0000:
+		start = 0xa0000;
+		end = 0xc0000;
+		break;
+	case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000:
+		index = msr - MSR_MTRRfix4K_C0000;
+		start = 0xc0000 + index * (32 << 10);
+		end = start + (32 << 10);
+		break;
+	case MSR_MTRRdefType:
+		is_fixed = false;
+		start = 0x0;
+		end = ~0ULL;
+		break;
+	default:
+		/* variable range MTRRs. */
+		is_fixed = false;
+		index = (msr - 0x200) / 2;
+		start = (((u64)mtrr_state->var_ranges[index].base_hi) << 32) +
+			(mtrr_state->var_ranges[index].base_lo & PAGE_MASK);
+		mask = (((u64)mtrr_state->var_ranges[index].mask_hi) << 32) +
+		       (mtrr_state->var_ranges[index].mask_lo & PAGE_MASK);
+		mask |= ~0ULL << cpuid_maxphyaddr(vcpu);
+
+		end = ((start & mask) | ~mask) + 1;
+	}
+
+	if (is_fixed && !(mtrr_enabled & 0x1))
+		return;
+
+	kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end));
+}
+
 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
 	u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
@@ -1885,7 +1941,7 @@ static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		*pt = data;
 	}
 
-	kvm_mmu_reset_context(vcpu);
+	update_mtrr(vcpu, msr);
 	return 0;
 }
@@ -2798,6 +2854,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_HYPERV_TIME:
 	case KVM_CAP_IOAPIC_POLARITY_IGNORED:
 	case KVM_CAP_TSC_DEADLINE_TIMER:
+	case KVM_CAP_ENABLE_CAP_VM:
+	case KVM_CAP_DISABLE_QUIRKS:
 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
 	case KVM_CAP_ASSIGN_DEV_IRQ:
 	case KVM_CAP_PCI_2_3:
@@ -3845,6 +3903,26 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
 	return 0;
 }
 
+static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
+				   struct kvm_enable_cap *cap)
+{
+	int r;
+
+	if (cap->flags)
+		return -EINVAL;
+
+	switch (cap->cap) {
+	case KVM_CAP_DISABLE_QUIRKS:
+		kvm->arch.disabled_quirks = cap->args[0];
+		r = 0;
+		break;
+	default:
+		r = -EINVAL;
+		break;
+	}
+	return r;
+}
+
 long kvm_arch_vm_ioctl(struct file *filp,
 		       unsigned int ioctl, unsigned long arg)
 {
@@ -4097,7 +4175,15 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = 0;
 		break;
 	}
+	case KVM_ENABLE_CAP: {
+		struct kvm_enable_cap cap;
+
+		r = -EFAULT;
+		if (copy_from_user(&cap, argp, sizeof(cap)))
+			goto out;
+		r = kvm_vm_ioctl_enable_cap(kvm, &cap);
+		break;
+	}
 	default:
 		r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
 	}
@@ -5952,6 +6038,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
 	lapic_irq.shorthand = 0;
 	lapic_irq.dest_mode = 0;
 	lapic_irq.dest_id = apicid;
+	lapic_irq.msi_redir_hint = false;
 
 	lapic_irq.delivery_mode = APIC_DM_REMRD;
 	kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
@@ -6347,7 +6434,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (req_immediate_exit)
 		smp_send_reschedule(vcpu->cpu);
 
-	kvm_guest_enter();
+	__kvm_guest_enter();
 
 	if (unlikely(vcpu->arch.switch_db_regs)) {
 		set_debugreg(0, 7);
@@ -7003,7 +7090,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 	return 0;
 }
 
-int fx_init(struct kvm_vcpu *vcpu)
+int fx_init(struct kvm_vcpu *vcpu, bool init_event)
 {
 	int err;
 
@@ -7011,7 +7098,9 @@ int fx_init(struct kvm_vcpu *vcpu)
 	if (err)
 		return err;
 
-	fpu_finit(&vcpu->arch.guest_fpu);
+	if (!init_event)
+		fpu_finit(&vcpu->arch.guest_fpu);
+
 	if (cpu_has_xsaves)
 		vcpu->arch.guest_fpu.state->xsave.xsave_hdr.xcomp_bv =
 			host_xcr0 | XSTATE_COMPACTION_ENABLED;
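With KVM_CAP_ENABLE_CAP_VM and KVM_CAP_DISABLE_QUIRKS advertised above, userspace can disable selected quirks per VM through the KVM_ENABLE_CAP ioctl on the VM fd. A hedged userspace sketch (the quirk mask value passed in args[0] is a placeholder; the real bits are defined by the capability):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hedged sketch: enable KVM_CAP_DISABLE_QUIRKS on an existing VM fd.
 * Returns 0 on success, -1 with errno set on failure. */
static int disable_quirks(int vm_fd, __u64 quirk_mask)
{
	struct kvm_enable_cap cap;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_DISABLE_QUIRKS;
	cap.args[0] = quirk_mask;	/* placeholder quirk bits to disable */

	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}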
@@ -7053,16 +7142,25 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 {
 	kvm_put_guest_xcr0(vcpu);
 
-	if (!vcpu->guest_fpu_loaded)
+	if (!vcpu->guest_fpu_loaded) {
+		vcpu->fpu_counter = 0;
 		return;
+	}
 
 	vcpu->guest_fpu_loaded = 0;
 	fpu_save_init(&vcpu->arch.guest_fpu);
 	__kernel_fpu_end();
 	++vcpu->stat.fpu_reload;
-	if (!vcpu->arch.eager_fpu)
-		kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
-
+	/*
+	 * If using eager FPU mode, or if the guest is a frequent user
+	 * of the FPU, just leave the FPU active for next time.
+	 * Every 255 times fpu_counter rolls over to 0; a guest that uses
+	 * the FPU in bursts will revert to loading it on demand.
+	 */
+	if (!vcpu->arch.eager_fpu) {
+		if (++vcpu->fpu_counter < 5)
+			kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
+	}
 	trace_kvm_fpu(0);
 }
@@ -7103,7 +7201,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	r = vcpu_load(vcpu);
 	if (r)
 		return r;
-	kvm_vcpu_reset(vcpu);
+	kvm_vcpu_reset(vcpu, false);
 	kvm_mmu_setup(vcpu);
 	vcpu_put(vcpu);
 
@@ -7141,7 +7239,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->vcpu_free(vcpu);
 }
 
-void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
+void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 {
 	atomic_set(&vcpu->arch.nmi_queued, 0);
 	vcpu->arch.nmi_pending = 0;
@@ -7168,13 +7266,14 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
 	kvm_async_pf_hash_reset(vcpu);
 	vcpu->arch.apf.halted = false;
 
-	kvm_pmu_reset(vcpu);
+	if (!init_event)
+		kvm_pmu_reset(vcpu);
 
 	memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
 	vcpu->arch.regs_avail = ~0;
 	vcpu->arch.regs_dirty = ~0;
 
-	kvm_x86_ops->vcpu_reset(vcpu);
+	kvm_x86_ops->vcpu_reset(vcpu, init_event);
 }
 
 void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
@@ -7363,7 +7462,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 		goto fail_free_mce_banks;
 	}
 
-	r = fx_init(vcpu);
+	r = fx_init(vcpu, false);
 	if (r)
 		goto fail_free_wbinvd_dirty_mask;
 
@@ -7375,6 +7474,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
 
+	vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
+
 	kvm_async_pf_hash_reset(vcpu);
 	kvm_pmu_init(vcpu);
 
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index f5fef1868096..01a1d011e073 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -4,6 +4,8 @@
 #include <linux/kvm_host.h>
 #include "kvm_cache_regs.h"
 
+#define MSR_IA32_CR_PAT_DEFAULT  0x0007040600070406ULL
+
 static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
 {
 	vcpu->arch.exception.pending = false;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ad45054309a0..87fd74a04005 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -230,6 +230,7 @@ struct kvm_vcpu {
 
 	int fpu_active;
 	int guest_fpu_loaded, guest_xcr0_loaded;
+	unsigned char fpu_counter;
 	wait_queue_head_t wq;
 	struct pid *pid;
 	int sigset_active;
@@ -538,13 +539,13 @@ void kvm_release_page_dirty(struct page *page);
 void kvm_set_page_accessed(struct page *page);
 
 pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
-pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
-		       bool write_fault, bool *writable);
 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
 pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
 		      bool *writable);
 pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
 pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn);
+pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
+			   bool *async, bool write_fault, bool *writable);
 
 void kvm_release_pfn_clean(pfn_t pfn);
 void kvm_set_pfn_dirty(pfn_t pfn);
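The kvm_host.h hunks that follow split kvm_guest_enter()/kvm_guest_exit() into irq-saving wrappers around new __kvm_guest_enter()/__kvm_guest_exit() helpers that require interrupts to already be off. A hedged sketch of the calling pattern the arch run loops switch to (everything except the four helpers is an illustrative placeholder):

/* Hedged sketch: how an arch vcpu-run loop uses the double-underscore
 * helpers.  local_irq_disable()/local_irq_enable() bracket the whole
 * guest entry, so the cheaper no-save variants can be used inside. */
static int vcpu_run_sketch(struct kvm_vcpu *vcpu)
{
	int ret;

	local_irq_disable();
	__kvm_guest_enter();		/* irqs are already off here */

	ret = arch_enter_guest(vcpu);	/* placeholder for the real entry */

	__kvm_guest_exit();		/* still with irqs off */
	local_irq_enable();

	return ret;
}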
@@ -762,16 +763,10 @@ static inline void kvm_iommu_unmap_pages(struct kvm *kvm,
 }
 #endif
 
-static inline void kvm_guest_enter(void)
+/* must be called with irqs disabled */
+static inline void __kvm_guest_enter(void)
 {
-	unsigned long flags;
-
-	BUG_ON(preemptible());
-
-	local_irq_save(flags);
 	guest_enter();
-	local_irq_restore(flags);
-
 	/* KVM does not hold any references to rcu protected data when it
 	 * switches CPU into a guest mode. In fact switching to a guest mode
 	 * is very similar to exiting to userspace from rcu point of view. In
@@ -783,12 +778,27 @@ static inline void kvm_guest_enter(void)
 	rcu_virt_note_context_switch(smp_processor_id());
 }
 
+/* must be called with irqs disabled */
+static inline void __kvm_guest_exit(void)
+{
+	guest_exit();
+}
+
+static inline void kvm_guest_enter(void)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__kvm_guest_enter();
+	local_irq_restore(flags);
+}
+
 static inline void kvm_guest_exit(void)
 {
 	unsigned long flags;
 
 	local_irq_save(flags);
-	guest_exit();
+	__kvm_guest_exit();
 	local_irq_restore(flags);
 }
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 4b60056776d1..75bd9f7fd846 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -814,6 +814,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_S390_INJECT_IRQ 113
 #define KVM_CAP_S390_IRQ_STATE 114
 #define KVM_CAP_PPC_HWRNG 115
+#define KVM_CAP_DISABLE_QUIRKS 116
 
 #ifdef KVM_CAP_IRQ_ROUTING
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 90977418aeb6..bd3c08a7c6c2 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1355,9 +1355,8 @@ exit:
 	return pfn;
 }
 
-static pfn_t
-__gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
-		     bool *async, bool write_fault, bool *writable)
+pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
+			   bool *async, bool write_fault, bool *writable)
 {
 	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
 
@@ -1376,44 +1375,35 @@ __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
 	return hva_to_pfn(addr, atomic, async, write_fault, writable);
 }
+EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
 
-static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
+static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic,
 			  bool write_fault, bool *writable)
 {
 	struct kvm_memory_slot *slot;
 
-	if (async)
-		*async = false;
-
 	slot = gfn_to_memslot(kvm, gfn);
 
-	return __gfn_to_pfn_memslot(slot, gfn, atomic, async, write_fault,
+	return __gfn_to_pfn_memslot(slot, gfn, atomic, NULL, write_fault,
 				    writable);
 }
 
 pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
 {
-	return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL);
+	return __gfn_to_pfn(kvm, gfn, true, true, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
 
-pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
-		       bool write_fault, bool *writable)
-{
-	return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable);
-}
-EXPORT_SYMBOL_GPL(gfn_to_pfn_async);
-
 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
 {
-	return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL);
+	return __gfn_to_pfn(kvm, gfn, false, true, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn);
 
 pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
 		      bool *writable)
 {
-	return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable);
+	return __gfn_to_pfn(kvm, gfn, false, write_fault, writable);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
 
@@ -1590,15 +1580,17 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
 			 int offset, int len)
 {
 	int r;
+	struct kvm_memory_slot *memslot;
 	unsigned long addr;
 
-	addr = gfn_to_hva(kvm, gfn);
+	memslot = gfn_to_memslot(kvm, gfn);
+	addr = gfn_to_hva_memslot(memslot, gfn);
 	if (kvm_is_error_hva(addr))
 		return -EFAULT;
 	r = __copy_to_user((void __user *)addr + offset, data, len);
 	if (r)
 		return -EFAULT;
-	mark_page_dirty(kvm, gfn);
+	mark_page_dirty_in_slot(kvm, memslot, gfn);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_write_guest_page);
 
@@ -2882,18 +2874,12 @@ static int hardware_enable_all(void)
 static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
 			   void *v)
 {
-	int cpu = (long)v;
-
 	val &= ~CPU_TASKS_FROZEN;
 	switch (val) {
 	case CPU_DYING:
-		pr_info("kvm: disabling virtualization on CPU%d\n",
-			cpu);
 		hardware_disable();
 		break;
 	case CPU_STARTING:
-		pr_info("kvm: enabling virtualization on CPU%d\n",
-			cpu);
 		hardware_enable();
 		break;
 	}
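With gfn_to_pfn_async() removed above, a caller that still needs the async or writable variants is expected to resolve the memslot itself and call the newly exported __gfn_to_pfn_memslot(). A hedged sketch of the equivalent call (kernel-internal types, illustrative function name):

/* Hedged sketch: rough equivalent of the removed
 * gfn_to_pfn_async(kvm, gfn, &async, write_fault, &writable). */
static pfn_t gfn_to_pfn_async_sketch(struct kvm *kvm, gfn_t gfn, bool *async,
				     bool write_fault, bool *writable)
{
	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);

	return __gfn_to_pfn_memslot(slot, gfn, false /* atomic */, async,
				    write_fault, writable);
}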