94 files changed, 6454 insertions, 2223 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 03145b7cafaa..e4f2cdcf78eb 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2061,6 +2061,8 @@ registers, find a list below: MIPS | KVM_REG_MIPS_LO | 64 MIPS | KVM_REG_MIPS_PC | 64 MIPS | KVM_REG_MIPS_CP0_INDEX | 32 + MIPS | KVM_REG_MIPS_CP0_ENTRYLO0 | 64 + MIPS | KVM_REG_MIPS_CP0_ENTRYLO1 | 64 MIPS | KVM_REG_MIPS_CP0_CONTEXT | 64 MIPS | KVM_REG_MIPS_CP0_USERLOCAL | 64 MIPS | KVM_REG_MIPS_CP0_PAGEMASK | 32 @@ -2071,9 +2073,11 @@ registers, find a list below: MIPS | KVM_REG_MIPS_CP0_ENTRYHI | 64 MIPS | KVM_REG_MIPS_CP0_COMPARE | 32 MIPS | KVM_REG_MIPS_CP0_STATUS | 32 + MIPS | KVM_REG_MIPS_CP0_INTCTL | 32 MIPS | KVM_REG_MIPS_CP0_CAUSE | 32 MIPS | KVM_REG_MIPS_CP0_EPC | 64 MIPS | KVM_REG_MIPS_CP0_PRID | 32 + MIPS | KVM_REG_MIPS_CP0_EBASE | 64 MIPS | KVM_REG_MIPS_CP0_CONFIG | 32 MIPS | KVM_REG_MIPS_CP0_CONFIG1 | 32 MIPS | KVM_REG_MIPS_CP0_CONFIG2 | 32 @@ -2148,6 +2152,12 @@ patterns depending on whether they're 32-bit or 64-bit registers: 0x7020 0000 0001 00 <reg:5> <sel:3> (32-bit) 0x7030 0000 0001 00 <reg:5> <sel:3> (64-bit) +Note: KVM_REG_MIPS_CP0_ENTRYLO0 and KVM_REG_MIPS_CP0_ENTRYLO1 are the MIPS64 +versions of the EntryLo registers regardless of the word size of the host +hardware, host kernel, guest, and whether XPA is present in the guest, i.e. +with the RI and XI bits (if they exist) in bits 63 and 62 respectively, and +the PFNX field starting at bit 30. + MIPS KVM control registers (see above) have the following id bit patterns: 0x7030 0000 0002 <reg:16> @@ -2443,18 +2453,20 @@ are, it will do nothing and return an EBUSY error. The parameter is a pointer to a 32-bit unsigned integer variable containing the order (log base 2) of the desired size of the hash table, which must be between 18 and 46. On successful return from the -ioctl, it will have been updated with the order of the hash table that -was allocated. +ioctl, the value will not be changed by the kernel. If no hash table has been allocated when any vcpu is asked to run (with the KVM_RUN ioctl), the host kernel will allocate a default-sized hash table (16 MB). If this ioctl is called when a hash table has already been allocated, -the kernel will clear out the existing hash table (zero all HPTEs) and -return the hash table order in the parameter. (If the guest is using -the virtualized real-mode area (VRMA) facility, the kernel will -re-create the VMRA HPTEs on the next KVM_RUN of any vcpu.) +with a different order from the existing hash table, the existing hash +table will be freed and a new one allocated. If this ioctl is +called when a hash table has already been allocated of the same order +as specified, the kernel will clear out the existing hash table (zero +all HPTEs). In either case, if the guest is using the virtualized +real-mode area (VRMA) facility, the kernel will re-create the VRMA +HPTEs on the next KVM_RUN of any vcpu. 4.77 KVM_S390_INTERRUPT @@ -3177,7 +3189,7 @@ of IOMMU pages. The rest of functionality is identical to KVM_CREATE_SPAPR_TCE. -4.98 KVM_REINJECT_CONTROL +4.99 KVM_REINJECT_CONTROL Capability: KVM_CAP_REINJECT_CONTROL Architectures: x86 @@ -3201,6 +3213,166 @@ struct kvm_reinject_control { pit_reinject = 0 (!reinject mode) is recommended, unless running an old operating system that uses the PIT for timing (e.g. Linux 2.4.x).
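A minimal userspace sketch (not part of the patch shown here): the 64-bit id bit pattern documented above can be applied mechanically. Assuming an open KVM vcpu file descriptor vcpu_fd, the id for the newly documented KVM_REG_MIPS_CP0_ENTRYLO0 (CP0 register 2, select 0) can be built and read with KVM_GET_ONE_REG; the MIPS_CP0_U64 macro and helper name below are illustrative assumptions.

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* 0x7030 0000 0001 00 <reg:5> <sel:3> -- 64-bit MIPS CP0 register id */
    #define MIPS_CP0_U64(reg, sel) \
            (0x7030000000010000ULL | ((uint64_t)(reg) << 3) | (sel))

    static int read_cp0_entrylo0(int vcpu_fd, uint64_t *val)
    {
            struct kvm_one_reg reg = {
                    .id   = MIPS_CP0_U64(2, 0), /* KVM_REG_MIPS_CP0_ENTRYLO0 */
                    .addr = (uintptr_t)val,     /* kernel writes the value here */
            };

            return ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
    }

Per the note above, a value read this way is always in the MIPS64 EntryLo layout, whatever the host or guest word size.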
+4.100 KVM_PPC_CONFIGURE_V3_MMU + +Capability: KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 +Architectures: ppc +Type: vm ioctl +Parameters: struct kvm_ppc_mmuv3_cfg (in) +Returns: 0 on success, + -EFAULT if struct kvm_ppc_mmuv3_cfg cannot be read, + -EINVAL if the configuration is invalid + +This ioctl controls whether the guest will use radix or HPT (hashed +page table) translation, and sets the pointer to the process table for +the guest. + +struct kvm_ppc_mmuv3_cfg { + __u64 flags; + __u64 process_table; +}; + +There are two bits that can be set in flags: KVM_PPC_MMUV3_RADIX and +KVM_PPC_MMUV3_GTSE. KVM_PPC_MMUV3_RADIX, if set, configures the guest +to use radix tree translation, and if clear, to use HPT translation. +KVM_PPC_MMUV3_GTSE, if set and if KVM permits it, configures the guest +to be able to use the global TLB and SLB invalidation instructions; +if clear, the guest may not use these instructions. + +The process_table field specifies the address and size of the guest +process table, which is in the guest's space. This field is formatted +as the second doubleword of the partition table entry, as defined in +the Power ISA V3.00, Book III section 5.7.6.1. + +4.101 KVM_PPC_GET_RMMU_INFO + +Capability: KVM_CAP_PPC_RADIX_MMU +Architectures: ppc +Type: vm ioctl +Parameters: struct kvm_ppc_rmmu_info (out) +Returns: 0 on success, + -EFAULT if struct kvm_ppc_rmmu_info cannot be written, + -EINVAL if no useful information can be returned + +This ioctl returns a structure containing two things: (a) a list +containing supported radix tree geometries, and (b) a list that maps +page sizes to the values to put in the "AP" (actual page size) field +for the tlbie (TLB invalidate entry) instruction. + +struct kvm_ppc_rmmu_info { + struct kvm_ppc_radix_geom { + __u8 page_shift; + __u8 level_bits[4]; + __u8 pad[3]; + } geometries[8]; + __u32 ap_encodings[8]; +}; + +The geometries[] field gives up to 8 supported geometries for the +radix page table, in terms of the log base 2 of the smallest page +size, and the number of bits indexed at each level of the tree, from +the PTE level up to the PGD level in that order. Any unused entries +will have 0 in the page_shift field. + +The ap_encodings field gives the supported page sizes and their AP field +encodings, encoded with the AP value in the top 3 bits and the log +base 2 of the page size in the bottom 6 bits. + +4.102 KVM_PPC_RESIZE_HPT_PREPARE + +Capability: KVM_CAP_SPAPR_RESIZE_HPT +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_ppc_resize_hpt (in) +Returns: 0 on successful completion, + >0 if a new HPT is being prepared; the value is an estimated + number of milliseconds until preparation is complete + -EFAULT if struct kvm_ppc_resize_hpt cannot be read, + -EINVAL if the supplied shift or flags are invalid + -ENOMEM if unable to allocate the new HPT + -ENOSPC if there was a hash collision when moving existing + HPT entries to the new HPT + -EIO on other error conditions + +Used to implement the PAPR extension for runtime resizing of a guest's +Hashed Page Table (HPT). Specifically this starts, stops or monitors +the preparation of a new potential HPT for the guest, essentially +implementing the H_RESIZE_HPT_PREPARE hypercall. + +If called with shift > 0 when there is no pending HPT for the guest, +this begins preparation of a new pending HPT of size 2^(shift) bytes. +It then returns a positive integer with the estimated number of +milliseconds until preparation is complete.
+ +If called when there is a pending HPT whose size does not match that +requested in the parameters, discards the existing pending HPT and +creates a new one as above. + +If called when there is a pending HPT of the size requested, it will: + * If preparation of the pending HPT is already complete, return 0 + * If preparation of the pending HPT has failed, return an error + code, then discard the pending HPT. + * If preparation of the pending HPT is still in progress, return an + estimated number of milliseconds until preparation is complete. + +If called with shift == 0, discards any currently pending HPT and +returns 0 (i.e. cancels any in-progress preparation). + +flags is reserved for future expansion; currently setting any bits in +flags will result in -EINVAL. + +Normally this will be called repeatedly with the same parameters until +it returns <= 0. The first call will initiate preparation, subsequent +ones will monitor preparation until it completes or fails. + +struct kvm_ppc_resize_hpt { + __u64 flags; + __u32 shift; + __u32 pad; +}; + +4.103 KVM_PPC_RESIZE_HPT_COMMIT + +Capability: KVM_CAP_SPAPR_RESIZE_HPT +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_ppc_resize_hpt (in) +Returns: 0 on successful completion, + -EFAULT if struct kvm_ppc_resize_hpt cannot be read, + -EINVAL if the supplied shift or flags are invalid + -ENXIO if there is no pending HPT, or the pending HPT doesn't + have the requested size + -EBUSY if the pending HPT is not fully prepared + -ENOSPC if there was a hash collision when moving existing + HPT entries to the new HPT + -EIO on other error conditions + +Used to implement the PAPR extension for runtime resizing of a guest's +Hashed Page Table (HPT). Specifically this requests that the guest be +transferred to working with the new HPT, essentially implementing the +H_RESIZE_HPT_COMMIT hypercall. + +This should only be called after KVM_PPC_RESIZE_HPT_PREPARE has +returned 0 with the same parameters. In other cases +KVM_PPC_RESIZE_HPT_COMMIT will return an error (usually -ENXIO or +-EBUSY, though others may be possible if the preparation was started +but failed). + +This will have undefined effects on the guest if it has not already +placed itself in a quiescent state where no vcpu will make MMU enabled +memory accesses. + +On successful completion, the pending HPT will become the guest's active +HPT and the previous HPT will be discarded. + +On failure, the guest will still be operating on its previous HPT. + +struct kvm_ppc_resize_hpt { + __u64 flags; + __u32 shift; + __u32 pad; +}; + 5. The kvm_run structure ------------------------ @@ -3942,3 +4114,21 @@ In order to use SynIC, it has to be activated by setting this capability via KVM_ENABLE_CAP ioctl on the vcpu fd. Note that this will disable the use of APIC hardware virtualization even if supported by the CPU, as it's incompatible with SynIC auto-EOI behavior. + +8.3 KVM_CAP_PPC_RADIX_MMU + +Architectures: ppc + +This capability, if KVM_CHECK_EXTENSION indicates that it is +available, means that the kernel can support guests using the +radix MMU defined in Power ISA V3.00 (as implemented in the POWER9 +processor). + +8.4 KVM_CAP_PPC_HASH_MMU_V3 + +Architectures: ppc + +This capability, if KVM_CHECK_EXTENSION indicates that it is +available, means that the kernel can support guests using the +hashed page table MMU defined in Power ISA V3.00 (as implemented in +the POWER9 processor), including in-memory segment tables.
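A minimal userspace sketch of the PREPARE/COMMIT flow documented above (not code from this patch). It assumes an open VM file descriptor vm_fd and a <linux/kvm.h> that exports the new ioctl numbers and struct kvm_ppc_resize_hpt; PREPARE is polled until it returns <= 0, sleeping for the returned estimate in between, and COMMIT is issued only once, after the guest has quiesced as required above.

    #include <stdint.h>
    #include <poll.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int resize_hpt(int vm_fd, uint32_t new_order)
    {
            struct kvm_ppc_resize_hpt rhpt = {
                    .flags = 0,          /* reserved, must be zero */
                    .shift = new_order,  /* log2 of the desired HPT size */
            };
            int ret;

            /* Poll PREPARE until the pending HPT is ready (0) or fails (<0). */
            do {
                    ret = ioctl(vm_fd, KVM_PPC_RESIZE_HPT_PREPARE, &rhpt);
                    if (ret > 0)
                            poll(NULL, 0, ret); /* estimated milliseconds left */
            } while (ret > 0);
            if (ret < 0)
                    return ret;

            /*
             * Guest vcpus must already be quiescent (no MMU enabled accesses)
             * before switching to the prepared HPT.
             */
            return ioctl(vm_fd, KVM_PPC_RESIZE_HPT_COMMIT, &rhpt);
    }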
diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt index c8d040e27046..feaaa634f154 100644 --- a/Documentation/virtual/kvm/hypercalls.txt +++ b/Documentation/virtual/kvm/hypercalls.txt @@ -81,3 +81,38 @@ the vcpu to sleep until occurrence of an appropriate event. Another vcpu of the same guest can wakeup the sleeping vcpu by issuing KVM_HC_KICK_CPU hypercall, specifying APIC ID (a1) of the vcpu to be woken up. An additional argument (a0) is used in the hypercall for future use. + + +6. KVM_HC_CLOCK_PAIRING +------------------------ +Architecture: x86 +Status: active +Purpose: Hypercall used to synchronize host and guest clocks. +Usage: + +a0: guest physical address where host copies +"struct kvm_clock_pairing" structure. + +a1: clock_type; at the moment only KVM_CLOCK_PAIRING_WALLCLOCK (0) +is supported (corresponding to the host's CLOCK_REALTIME clock). + + struct kvm_clock_pairing { + __s64 sec; + __s64 nsec; + __u64 tsc; + __u32 flags; + __u32 pad[9]; + }; + + Where: + * sec: seconds from clock_type clock. + * nsec: nanoseconds from clock_type clock. + * tsc: guest TSC value used to calculate sec/nsec pair. + * flags: flags, unused (0) at the moment. + +The hypercall lets a guest compute a precise timestamp across +host and guest. The guest can use the returned TSC value to +compute the CLOCK_REALTIME for its clock, at the same instant. + +Returns KVM_EOPNOTSUPP if the host does not use the TSC clocksource, +or if the clock type is different from KVM_CLOCK_PAIRING_WALLCLOCK. diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt index fd013bf4115b..1bb8bcaf8497 100644 --- a/Documentation/virtual/kvm/locking.txt +++ b/Documentation/virtual/kvm/locking.txt @@ -26,9 +26,16 @@ sections. Fast page fault: Fast page fault is the fast path which fixes the guest page fault out of -the mmu-lock on x86. Currently, the page fault can be fast only if the -shadow page table is present and it is caused by write-protect, that means -we just need change the W bit of the spte. +the mmu-lock on x86. Currently, the page fault can be fast in one of the +following two cases: + +1. Access Tracking: The SPTE is not present, but it is marked for access +tracking, i.e. the SPTE_SPECIAL_MASK is set. That means we need to +restore the saved R/X bits. This is described in more detail below. + +2. Write-Protection: The SPTE is present and the fault is +caused by write-protect. That means we just need to change the W bit of the +spte. What we use to avoid all the race is the SPTE_HOST_WRITEABLE bit and SPTE_MMU_WRITEABLE bit on the spte: @@ -38,7 +45,8 @@ SPTE_MMU_WRITEABLE bit on the spte: page write-protection. On fast page fault path, we will use cmpxchg to atomically set the spte W -bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, this +bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, or +restore the saved R/X bits if VMX_EPT_TRACK_ACCESS mask is set, or both. This is safe because whenever changing these bits can be detected by cmpxchg. But we need carefully check these cases: @@ -142,6 +150,21 @@ Since the spte is "volatile" if it can be updated out of mmu-lock, we always atomically update the spte, the race caused by fast page fault can be avoided, See the comments in spte_has_volatile_bits() and mmu_spte_update(). +Lockless Access Tracking: + +This is used for Intel CPUs that are using EPT but do not support the EPT A/D +bits.
In this case, when the KVM MMU notifier is called to track accesses to a +page (via kvm_mmu_notifier_clear_flush_young), it marks the PTE as not-present +by clearing the RWX bits in the PTE and storing the original R & X bits in +some unused/ignored bits. In addition, the SPTE_SPECIAL_MASK is set on the +PTE (using the ignored bit 62). When the VM tries to access the page later on, +a fault is generated and the fast page fault mechanism described above is used +to atomically restore the PTE to a Present state. The W bit is not saved when +the PTE is marked for access tracking; during restoration to the Present +state, the W bit is set depending on whether or not it was a write access. If +it wasn't, then the W bit will remain clear until a write access happens, at +which time it will be set using the Dirty tracking mechanism described above. + 3. Reference ------------ diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index bebec370324f..05e785fc061d 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -43,6 +43,7 @@ #define KVM_REG_MIPS_CP0_ENTRYHI MIPS_CP0_64(10, 0) #define KVM_REG_MIPS_CP0_COMPARE MIPS_CP0_32(11, 0) #define KVM_REG_MIPS_CP0_STATUS MIPS_CP0_32(12, 0) +#define KVM_REG_MIPS_CP0_INTCTL MIPS_CP0_32(12, 1) #define KVM_REG_MIPS_CP0_CAUSE MIPS_CP0_32(13, 0) #define KVM_REG_MIPS_CP0_EPC MIPS_CP0_64(14, 0) #define KVM_REG_MIPS_CP0_PRID MIPS_CP0_32(15, 0) @@ -64,7 +65,7 @@ #define KVM_REG_MIPS_CP0_KSCRATCH6 MIPS_CP0_64(31, 7) -#define KVM_MAX_VCPUS 1 +#define KVM_MAX_VCPUS 8 #define KVM_USER_MEM_SLOTS 8 /* memory slots that does not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 0 @@ -88,6 +89,7 @@ #define KVM_GUEST_KUSEG 0x00000000UL #define KVM_GUEST_KSEG0 0x40000000UL +#define KVM_GUEST_KSEG1 0x40000000UL #define KVM_GUEST_KSEG23 0x60000000UL #define KVM_GUEST_KSEGX(a) ((_ACAST32_(a)) & 0xe0000000) #define KVM_GUEST_CPHYSADDR(a) ((_ACAST32_(a)) & 0x1fffffff) @@ -104,7 +106,6 @@ #define KVM_GUEST_KSEG23ADDR(a) (KVM_GUEST_CPHYSADDR(a) | KVM_GUEST_KSEG23) #define KVM_INVALID_PAGE 0xdeadbeef -#define KVM_INVALID_INST 0xdeadbeef #define KVM_INVALID_ADDR 0xdeadbeef /* @@ -121,8 +122,6 @@ static inline bool kvm_is_error_hva(unsigned long addr) return IS_ERR_VALUE(addr); } -extern atomic_t kvm_mips_instance; - struct kvm_vm_stat { ulong remote_tlb_flush; }; @@ -156,12 +155,8 @@ struct kvm_arch_memory_slot { }; struct kvm_arch { - /* Guest GVA->HPA page table */ - unsigned long *guest_pmap; - unsigned long guest_pmap_npages; - - /* Wired host TLB used for the commpage */ - int commpage_tlb; + /* Guest physical mm */ + struct mm_struct gpa_mm; }; #define N_MIPS_COPROC_REGS 32 @@ -233,6 +228,7 @@ enum emulation_result { EMULATE_FAIL, /* can't emulate this instruction */ EMULATE_WAIT, /* WAIT instruction */ EMULATE_PRIV_FAIL, + EMULATE_EXCEPT, /* A guest exception has been generated */ }; #define mips3_paddr_to_tlbpfn(x) \ @@ -250,6 +246,7 @@ enum emulation_result { #define TLB_ASID(x) ((x).tlb_hi & KVM_ENTRYHI_ASID) #define TLB_LO_IDX(x, va) (((va) >> PAGE_SHIFT) & 1) #define TLB_IS_VALID(x, va) ((x).tlb_lo[TLB_LO_IDX(x, va)] & ENTRYLO_V) +#define TLB_IS_DIRTY(x, va) ((x).tlb_lo[TLB_LO_IDX(x, va)] & ENTRYLO_D) #define TLB_HI_VPN2_HIT(x, y) ((TLB_VPN2(x) & ~(x).tlb_mask) == \ ((y) & VPN2_MASK & ~(x).tlb_mask)) #define TLB_HI_ASID_HIT(x, y) (TLB_IS_GLOBAL(x) || \ @@ -261,6 +258,17 @@ struct kvm_mips_tlb { long tlb_lo[2]; }; +#define KVM_NR_MEM_OBJS 4 + +/* + * We don't want allocation failures within the mmu
code, so we preallocate + * enough memory for a single page fault in a cache. + */ +struct kvm_mmu_memory_cache { + int nobjs; + void *objects[KVM_NR_MEM_OBJS]; +}; + #define KVM_MIPS_AUX_FPU 0x1 #define KVM_MIPS_AUX_MSA 0x2 @@ -275,6 +283,8 @@ struct kvm_vcpu_arch { unsigned long host_cp0_badvaddr; unsigned long host_cp0_epc; u32 host_cp0_cause; + u32 host_cp0_badinstr; + u32 host_cp0_badinstrp; /* GPRS */ unsigned long gprs[32]; @@ -318,20 +328,18 @@ struct kvm_vcpu_arch { /* Bitmask of pending exceptions to be cleared */ unsigned long pending_exceptions_clr; - /* Save/Restore the entryhi register when are are preempted/scheduled back in */ - unsigned long preempt_entryhi; - /* S/W Based TLB for guest */ struct kvm_mips_tlb guest_tlb[KVM_MIPS_GUEST_TLB_SIZE]; - /* Cached guest kernel/user ASIDs */ - u32 guest_user_asid[NR_CPUS]; - u32 guest_kernel_asid[NR_CPUS]; + /* Guest kernel/user [partial] mm */ struct mm_struct guest_kernel_mm, guest_user_mm; /* Guest ASID of last user mode execution */ unsigned int last_user_gasid; + /* Cache some mmu pages needed inside spinlock regions */ + struct kvm_mmu_memory_cache mmu_page_cache; + int last_sched_cpu; /* WAIT executed */ @@ -339,14 +347,15 @@ struct kvm_vcpu_arch { u8 fpu_enabled; u8 msa_enabled; - u8 kscratch_enabled; }; #define kvm_read_c0_guest_index(cop0) (cop0->reg[MIPS_CP0_TLB_INDEX][0]) #define kvm_write_c0_guest_index(cop0, val) (cop0->reg[MIPS_CP0_TLB_INDEX][0] = val) #define kvm_read_c0_guest_entrylo0(cop0) (cop0->reg[MIPS_CP0_TLB_LO0][0]) +#define kvm_write_c0_guest_entrylo0(cop0, val) (cop0->reg[MIPS_CP0_TLB_LO0][0] = (val)) #define kvm_read_c0_guest_entrylo1(cop0) (cop0->reg[MIPS_CP0_TLB_LO1][0]) +#define kvm_write_c0_guest_entrylo1(cop0, val) (cop0->reg[MIPS_CP0_TLB_LO1][0] = (val)) #define kvm_read_c0_guest_context(cop0) (cop0->reg[MIPS_CP0_TLB_CONTEXT][0]) #define kvm_write_c0_guest_context(cop0, val) (cop0->reg[MIPS_CP0_TLB_CONTEXT][0] = (val)) #define kvm_read_c0_guest_userlocal(cop0) (cop0->reg[MIPS_CP0_TLB_CONTEXT][2]) @@ -522,9 +531,17 @@ struct kvm_mips_callbacks { int (*handle_msa_fpe)(struct kvm_vcpu *vcpu); int (*handle_fpe)(struct kvm_vcpu *vcpu); int (*handle_msa_disabled)(struct kvm_vcpu *vcpu); - int (*vm_init)(struct kvm *kvm); int (*vcpu_init)(struct kvm_vcpu *vcpu); + void (*vcpu_uninit)(struct kvm_vcpu *vcpu); int (*vcpu_setup)(struct kvm_vcpu *vcpu); + void (*flush_shadow_all)(struct kvm *kvm); + /* + * Must take care of flushing any cached GPA PTEs (e.g. guest entries in + * VZ root TLB, or T&E GVA page tables and corresponding root TLB + * mappings). 
+ */ + void (*flush_shadow_memslot)(struct kvm *kvm, + const struct kvm_memory_slot *slot); gpa_t (*gva_to_gpa)(gva_t gva); void (*queue_timer_int)(struct kvm_vcpu *vcpu); void (*dequeue_timer_int)(struct kvm_vcpu *vcpu); @@ -542,8 +559,10 @@ struct kvm_mips_callbacks { const struct kvm_one_reg *reg, s64 *v); int (*set_one_reg)(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, s64 v); - int (*vcpu_get_regs)(struct kvm_vcpu *vcpu); - int (*vcpu_set_regs)(struct kvm_vcpu *vcpu); + int (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); + int (*vcpu_put)(struct kvm_vcpu *vcpu, int cpu); + int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu); + void (*vcpu_reenter)(struct kvm_run *run, struct kvm_vcpu *vcpu); }; extern struct kvm_mips_callbacks *kvm_mips_callbacks; int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks); @@ -556,6 +575,7 @@ extern int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu); /* Building of entry/exception code */ int kvm_mips_entry_setup(void); void *kvm_mips_build_vcpu_run(void *addr); +void *kvm_mips_build_tlb_refill_exception(void *addr, void *handler); void *kvm_mips_build_exception(void *addr, void *handler); void *kvm_mips_build_exit(void *addr); @@ -580,54 +600,125 @@ u32 kvm_get_user_asid(struct kvm_vcpu *vcpu); u32 kvm_get_commpage_asid (struct kvm_vcpu *vcpu); extern int kvm_mips_handle_kseg0_tlb_fault(unsigned long badbaddr, - struct kvm_vcpu *vcpu); + struct kvm_vcpu *vcpu, + bool write_fault); extern int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr, struct kvm_vcpu *vcpu); extern int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, - struct kvm_mips_tlb *tlb); + struct kvm_mips_tlb *tlb, + unsigned long gva, + bool write_fault); extern enum emulation_result kvm_mips_handle_tlbmiss(u32 cause, u32 *opc, struct kvm_run *run, - struct kvm_vcpu *vcpu); - -extern enum emulation_result kvm_mips_handle_tlbmod(u32 cause, - u32 *opc, - struct kvm_run *run, - struct kvm_vcpu *vcpu); + struct kvm_vcpu *vcpu, + bool write_fault); extern void kvm_mips_dump_host_tlbs(void); extern void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu); -extern int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi, - unsigned long entrylo0, - unsigned long entrylo1, - int flush_dcache_mask); -extern void kvm_mips_flush_host_tlb(int skip_kseg0); -extern int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi); +extern int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi, + bool user, bool kernel); extern int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi); -extern int kvm_mips_host_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long vaddr); -extern unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu, - unsigned long gva); -extern void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu, - struct kvm_vcpu *vcpu); -extern void kvm_local_flush_tlb_all(void); -extern void kvm_mips_alloc_new_mmu_context(struct kvm_vcpu *vcpu); -extern void kvm_mips_vcpu_load(struct kvm_vcpu *vcpu, int cpu); -extern void kvm_mips_vcpu_put(struct kvm_vcpu *vcpu); + +void kvm_mips_suspend_mm(int cpu); +void kvm_mips_resume_mm(int cpu); + +/* MMU handling */ + +/** + * enum kvm_mips_flush - Types of MMU flushes. + * @KMF_USER: Flush guest user virtual memory mappings. + * Guest USeg only. + * @KMF_KERN: Flush guest kernel virtual memory mappings. + * Guest USeg and KSeg2/3. + * @KMF_GPA: Flush guest physical memory mappings. 
+ * Also includes KSeg0 if KMF_KERN is set. + */ +enum kvm_mips_flush { + KMF_USER = 0x0, + KMF_KERN = 0x1, + KMF_GPA = 0x2, +}; +void kvm_mips_flush_gva_pt(pgd_t *pgd, enum kvm_mips_flush flags); +bool kvm_mips_flush_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn); +int kvm_mips_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn); +pgd_t *kvm_pgd_alloc(void); +void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); +void kvm_trap_emul_invalidate_gva(struct kvm_vcpu *vcpu, unsigned long addr, + bool user); +void kvm_trap_emul_gva_lockless_begin(struct kvm_vcpu *vcpu); +void kvm_trap_emul_gva_lockless_end(struct kvm_vcpu *vcpu); + +enum kvm_mips_fault_result { + KVM_MIPS_MAPPED = 0, + KVM_MIPS_GVA, + KVM_MIPS_GPA, + KVM_MIPS_TLB, + KVM_MIPS_TLBINV, + KVM_MIPS_TLBMOD, +}; +enum kvm_mips_fault_result kvm_trap_emul_gva_fault(struct kvm_vcpu *vcpu, + unsigned long gva, + bool write); + +#define KVM_ARCH_WANT_MMU_NOTIFIER +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); +int kvm_unmap_hva_range(struct kvm *kvm, + unsigned long start, unsigned long end); +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); +int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); + +static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, + unsigned long address) +{ +} /* Emulation */ -u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu); +int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out); enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause); +int kvm_get_badinstr(u32 *opc, struct kvm_vcpu *vcpu, u32 *out); +int kvm_get_badinstrp(u32 *opc, struct kvm_vcpu *vcpu, u32 *out); + +/** + * kvm_is_ifetch_fault() - Find whether a TLBL exception is due to ifetch fault. + * @vcpu: Virtual CPU. + * + * Returns: Whether the TLBL exception was likely due to an instruction + * fetch fault rather than a data load fault. + */ +static inline bool kvm_is_ifetch_fault(struct kvm_vcpu_arch *vcpu) +{ + unsigned long badvaddr = vcpu->host_cp0_badvaddr; + unsigned long epc = msk_isa16_mode(vcpu->pc); + u32 cause = vcpu->host_cp0_cause; + + if (epc == badvaddr) + return true; + + /* + * Branches may be 32-bit or 16-bit instructions. + * This isn't exact, but we don't really support MIPS16 or microMIPS yet + * in KVM anyway. 
+ */ + if ((cause & CAUSEF_BD) && badvaddr - epc <= 4) + return true; + + return false; +} extern enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc, struct kvm_run *run, struct kvm_vcpu *vcpu); +long kvm_mips_guest_exception_base(struct kvm_vcpu *vcpu); + extern enum emulation_result kvm_mips_emulate_syscall(u32 cause, u32 *opc, struct kvm_run *run, @@ -761,10 +852,6 @@ static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {} static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {} -static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} -static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, - struct kvm_memory_slot *slot) {} -static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} diff --git a/arch/mips/include/asm/mmu_context.h b/arch/mips/include/asm/mmu_context.h index ddd57ade1aa8..2abf94f72c0a 100644 --- a/arch/mips/include/asm/mmu_context.h +++ b/arch/mips/include/asm/mmu_context.h @@ -29,9 +29,11 @@ do { \ } \ } while (0) +extern void tlbmiss_handler_setup_pgd(unsigned long); + +/* Note: This is also implemented with uasm in arch/mips/kvm/entry.c */ #define TLBMISS_HANDLER_SETUP_PGD(pgd) \ do { \ - extern void tlbmiss_handler_setup_pgd(unsigned long); \ tlbmiss_handler_setup_pgd((unsigned long)(pgd)); \ htw_set_pwbase((unsigned long)pgd); \ } while (0) @@ -97,17 +99,12 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) static inline void get_new_mmu_context(struct mm_struct *mm, unsigned long cpu) { - extern void kvm_local_flush_tlb_all(void); unsigned long asid = asid_cache(cpu); if (!((asid += cpu_asid_inc()) & cpu_asid_mask(&cpu_data[cpu]))) { if (cpu_has_vtag_icache) flush_icache_all(); -#ifdef CONFIG_KVM - kvm_local_flush_tlb_all(); /* start new asid cycle */ -#else local_flush_tlb_all(); /* start new asid cycle */ -#endif if (!asid) /* fix version if needed */ asid = asid_first_version(cpu); } diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h index a03e86969f78..a8705f6c8180 100644 --- a/arch/mips/include/asm/pgalloc.h +++ b/arch/mips/include/asm/pgalloc.h @@ -43,21 +43,7 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) * Initialize a new pgd / pmd table with invalid pointers. 
*/ extern void pgd_init(unsigned long page); - -static inline pgd_t *pgd_alloc(struct mm_struct *mm) -{ - pgd_t *ret, *init; - - ret = (pgd_t *) __get_free_pages(GFP_KERNEL, PGD_ORDER); - if (ret) { - init = pgd_offset(&init_mm, 0UL); - pgd_init((unsigned long)ret); - memcpy(ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD, - (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); - } - - return ret; -} +extern pgd_t *pgd_alloc(struct mm_struct *mm); static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { diff --git a/arch/mips/include/asm/r4kcache.h b/arch/mips/include/asm/r4kcache.h index b42b513007a2..7227c158cbf8 100644 --- a/arch/mips/include/asm/r4kcache.h +++ b/arch/mips/include/asm/r4kcache.h @@ -147,49 +147,64 @@ static inline void flush_scache_line(unsigned long addr) } #define protected_cache_op(op,addr) \ +({ \ + int __err = 0; \ __asm__ __volatile__( \ " .set push \n" \ " .set noreorder \n" \ " .set "MIPS_ISA_ARCH_LEVEL" \n" \ - "1: cache %0, (%1) \n" \ + "1: cache %1, (%2) \n" \ "2: .set pop \n" \ + " .section .fixup,\"ax\" \n" \ + "3: li %0, %3 \n" \ + " j 2b \n" \ + " .previous \n" \ " .section __ex_table,\"a\" \n" \ - " "STR(PTR)" 1b, 2b \n" \ + " "STR(PTR)" 1b, 3b \n" \ " .previous" \ - : \ - : "i" (op), "r" (addr)) + : "+r" (__err) \ + : "i" (op), "r" (addr), "i" (-EFAULT)); \ + __err; \ +}) + #define protected_cachee_op(op,addr) \ +({ \ + int __err = 0; \ __asm__ __volatile__( \ " .set push \n" \ " .set noreorder \n" \ " .set mips0 \n" \ " .set eva \n" \ - "1: cachee %0, (%1) \n" \ + "1: cachee %1, (%2) \n" \ "2: .set pop \n" \ + " .section .fixup,\"ax\" \n" \ + "3: li %0, %3 \n" \ + " j 2b \n" \ + " .previous \n" \ " .section __ex_table,\"a\" \n" \ - " "STR(PTR)" 1b, 2b \n" \ + " "STR(PTR)" 1b, 3b \n" \ " .previous" \ - : \ - : "i" (op), "r" (addr)) + : "+r" (__err) \ + : "i" (op), "r" (addr), "i" (-EFAULT)); \ + __err; \ +}) /* * The next two are for badland addresses like signal trampolines. */ -static inline void protected_flush_icache_line(unsigned long addr) +static inline int protected_flush_icache_line(unsigned long addr) { switch (boot_cpu_type()) { case CPU_LOONGSON2: - protected_cache_op(Hit_Invalidate_I_Loongson2, addr); - break; + return protected_cache_op(Hit_Invalidate_I_Loongson2, addr); default: #ifdef CONFIG_EVA - protected_cachee_op(Hit_Invalidate_I, addr); + return protected_cachee_op(Hit_Invalidate_I, addr); #else - protected_cache_op(Hit_Invalidate_I, addr); + return protected_cache_op(Hit_Invalidate_I, addr); #endif - break; } } @@ -199,21 +214,21 @@ static inline void protected_flush_icache_line(unsigned long addr) * caches. We're talking about one cacheline unnecessarily getting invalidated * here so the penalty isn't overly hard. 
*/ -static inline void protected_writeback_dcache_line(unsigned long addr) +static inline int protected_writeback_dcache_line(unsigned long addr) { #ifdef CONFIG_EVA - protected_cachee_op(Hit_Writeback_Inv_D, addr); + return protected_cachee_op(Hit_Writeback_Inv_D, addr); #else - protected_cache_op(Hit_Writeback_Inv_D, addr); + return protected_cache_op(Hit_Writeback_Inv_D, addr); #endif } -static inline void protected_writeback_scache_line(unsigned long addr) +static inline int protected_writeback_scache_line(unsigned long addr) { #ifdef CONFIG_EVA - protected_cachee_op(Hit_Writeback_Inv_SD, addr); + return protected_cachee_op(Hit_Writeback_Inv_SD, addr); #else - protected_cache_op(Hit_Writeback_Inv_SD, addr); + return protected_cache_op(Hit_Writeback_Inv_SD, addr); #endif } diff --git a/arch/mips/include/asm/tlbex.h b/arch/mips/include/asm/tlbex.h new file mode 100644 index 000000000000..53050e9dd2c9 --- /dev/null +++ b/arch/mips/include/asm/tlbex.h @@ -0,0 +1,26 @@ +#ifndef __ASM_TLBEX_H +#define __ASM_TLBEX_H + +#include <asm/uasm.h> + +/* + * Write random or indexed TLB entry, and care about the hazards from + * the preceding mtc0 and for the following eret. + */ +enum tlb_write_entry { + tlb_random, + tlb_indexed +}; + +extern int pgd_reg; + +void build_get_pmde64(u32 **p, struct uasm_label **l, struct uasm_reloc **r, + unsigned int tmp, unsigned int ptr); +void build_get_pgde32(u32 **p, unsigned int tmp, unsigned int ptr); +void build_get_ptep(u32 **p, unsigned int tmp, unsigned int ptr); +void build_update_entries(u32 **p, unsigned int tmp, unsigned int ptep); +void build_tlb_write_entry(u32 **p, struct uasm_label **l, + struct uasm_reloc **r, + enum tlb_write_entry wmode); + +#endif /* __ASM_TLBEX_H */ diff --git a/arch/mips/include/asm/uasm.h b/arch/mips/include/asm/uasm.h index f7929f65f7ca..e9a9e2ade1d2 100644 --- a/arch/mips/include/asm/uasm.h +++ b/arch/mips/include/asm/uasm.h @@ -9,6 +9,9 @@ * Copyright (C) 2012, 2013 MIPS Technologies, Inc. All rights reserved. */ +#ifndef __ASM_UASM_H +#define __ASM_UASM_H + #include <linux/types.h> #ifdef CONFIG_EXPORT_UASM @@ -309,3 +312,5 @@ void uasm_il_bltz(u32 **p, struct uasm_reloc **r, unsigned int reg, int lid); void uasm_il_bne(u32 **p, struct uasm_reloc **r, unsigned int reg1, unsigned int reg2, int lid); void uasm_il_bnez(u32 **p, struct uasm_reloc **r, unsigned int reg, int lid); + +#endif /* __ASM_UASM_H */ diff --git a/arch/mips/include/uapi/asm/kvm.h b/arch/mips/include/uapi/asm/kvm.h index 6985eb59b085..a8a0199bf760 100644 --- a/arch/mips/include/uapi/asm/kvm.h +++ b/arch/mips/include/uapi/asm/kvm.h @@ -19,6 +19,8 @@ * Some parts derived from the x86 version of this file. */ +#define __KVM_HAVE_READONLY_MEM + /* * for KVM_GET_REGS and KVM_SET_REGS * diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index 7c56d6b124d1..65067327db12 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -20,7 +20,9 @@ config KVM select EXPORT_UASM select PREEMPT_NOTIFIERS select ANON_INODES + select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_MMIO + select MMU_NOTIFIER select SRCU ---help--- Support for hosting Guest kernels. 
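Since the r4kcache.h hunk above turns the protected cache-line helpers from void into int, a fault caught by the new fixup path now surfaces as -EFAULT instead of being silently ignored. A hypothetical caller (illustrative only, not from this patch) can therefore propagate the failure; this is the behaviour the kvm_mips_guest_cache_op() retry loop added later in this series relies on.

    /* Sync one icache line with memory, propagating -EFAULT. */
    static int sync_icache_line(unsigned long addr)
    {
            int err;

            /* Write back (and invalidate) the matching D-cache line first. */
            err = protected_writeback_dcache_line(addr);
            if (err)
                    return err; /* -EFAULT: no valid mapping for addr */

            /* Then invalidate the stale I-cache line. */
            return protected_flush_icache_line(addr);
    }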
diff --git a/arch/mips/kvm/dyntrans.c b/arch/mips/kvm/dyntrans.c index 010cef240688..f8e772564d74 100644 --- a/arch/mips/kvm/dyntrans.c +++ b/arch/mips/kvm/dyntrans.c @@ -13,6 +13,7 @@ #include <linux/err.h> #include <linux/highmem.h> #include <linux/kvm_host.h> +#include <linux/uaccess.h> #include <linux/vmalloc.h> #include <linux/fs.h> #include <linux/bootmem.h> @@ -29,28 +30,37 @@ static int kvm_mips_trans_replace(struct kvm_vcpu *vcpu, u32 *opc, union mips_instruction replace) { - unsigned long paddr, flags; - void *vaddr; - - if (KVM_GUEST_KSEGX((unsigned long)opc) == KVM_GUEST_KSEG0) { - paddr = kvm_mips_translate_guest_kseg0_to_hpa(vcpu, - (unsigned long)opc); - vaddr = kmap_atomic(pfn_to_page(PHYS_PFN(paddr))); - vaddr += paddr & ~PAGE_MASK; - memcpy(vaddr, (void *)&replace, sizeof(u32)); - local_flush_icache_range((unsigned long)vaddr, - (unsigned long)vaddr + 32); - kunmap_atomic(vaddr); - } else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) { - local_irq_save(flags); - memcpy((void *)opc, (void *)&replace, sizeof(u32)); - __local_flush_icache_user_range((unsigned long)opc, - (unsigned long)opc + 32); - local_irq_restore(flags); - } else { - kvm_err("%s: Invalid address: %p\n", __func__, opc); - return -EFAULT; + unsigned long vaddr = (unsigned long)opc; + int err; + +retry: + /* The GVA page table is still active so use the Linux TLB handlers */ + kvm_trap_emul_gva_lockless_begin(vcpu); + err = put_user(replace.word, opc); + kvm_trap_emul_gva_lockless_end(vcpu); + + if (unlikely(err)) { + /* + * We write protect clean pages in GVA page table so normal + * Linux TLB mod handler doesn't silently dirty the page. + * Its also possible we raced with a GVA invalidation. + * Try to force the page to become dirty. + */ + err = kvm_trap_emul_gva_fault(vcpu, vaddr, true); + if (unlikely(err)) { + kvm_info("%s: Address unwriteable: %p\n", + __func__, opc); + return -EFAULT; + } + + /* + * Try again. This will likely trigger a TLB refill, which will + * fetch the new dirty entry from the GVA page table, which + * should then succeed. + */ + goto retry; } + __local_flush_icache_user_range(vaddr, vaddr + 4); return 0; } diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index aa0937423e28..d40cfaad4529 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -38,23 +38,25 @@ * Compute the return address and do emulate branch simulation, if required. * This function should be called only in branch delay slot active. */ -unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu, - unsigned long instpc) +static int kvm_compute_return_epc(struct kvm_vcpu *vcpu, unsigned long instpc, + unsigned long *out) { unsigned int dspcontrol; union mips_instruction insn; struct kvm_vcpu_arch *arch = &vcpu->arch; long epc = instpc; - long nextpc = KVM_INVALID_INST; + long nextpc; + int err; - if (epc & 3) - goto unaligned; + if (epc & 3) { + kvm_err("%s: unaligned epc\n", __func__); + return -EINVAL; + } /* Read the instruction */ - insn.word = kvm_get_inst((u32 *) epc, vcpu); - - if (insn.word == KVM_INVALID_INST) - return KVM_INVALID_INST; + err = kvm_get_badinstrp((u32 *)epc, vcpu, &insn.word); + if (err) + return err; switch (insn.i_format.opcode) { /* jr and jalr are in r_format format. 
*/ @@ -66,6 +68,8 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu, case jr_op: nextpc = arch->gprs[insn.r_format.rs]; break; + default: + return -EINVAL; } break; @@ -114,8 +118,11 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu, nextpc = epc; break; case bposge32_op: - if (!cpu_has_dsp) - goto sigill; + if (!cpu_has_dsp) { + kvm_err("%s: DSP branch but not DSP ASE\n", + __func__); + return -EINVAL; + } dspcontrol = rddsp(0x01); @@ -125,6 +132,8 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu, epc += 8; nextpc = epc; break; + default: + return -EINVAL; } break; @@ -189,7 +198,7 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu, /* And now the FPA/cp1 branch instructions. */ case cop1_op: kvm_err("%s: unsupported cop1_op\n", __func__); - break; + return -EINVAL; #ifdef CONFIG_CPU_MIPSR6 /* R6 added the following compact branches with forbidden slots */ @@ -198,19 +207,19 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu, /* only rt == 0 isn't compact branch */ if (insn.i_format.rt != 0) goto compact_branch; - break; + return -EINVAL; case pop10_op: case pop30_op: /* only rs == rt == 0 is reserved, rest are compact branches */ if (insn.i_format.rs != 0 || insn.i_format.rt != 0) goto compact_branch; - break; + return -EINVAL; case pop66_op: case pop76_op: /* only rs == 0 isn't compact branch */ if (insn.i_format.rs != 0) goto compact_branch; - break; + return -EINVAL; compact_branch: /* * If we've hit an exception on the forbidden slot, then @@ -221,42 +230,74 @@ compact_branch: break; #else compact_branch: - /* Compact branches not supported before R6 */ - break; + /* Fall through - Compact branches not supported before R6 */ #endif + default: + return -EINVAL; } - return nextpc; - -unaligned: - kvm_err("%s: unaligned epc\n", __func__); - return nextpc; - -sigill: - kvm_err("%s: DSP branch but not DSP ASE\n", __func__); - return nextpc; + *out = nextpc; + return 0; } enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause) { - unsigned long branch_pc; - enum emulation_result er = EMULATE_DONE; + int err; if (cause & CAUSEF_BD) { - branch_pc = kvm_compute_return_epc(vcpu, vcpu->arch.pc); - if (branch_pc == KVM_INVALID_INST) { - er = EMULATE_FAIL; - } else { - vcpu->arch.pc = branch_pc; - kvm_debug("BD update_pc(): New PC: %#lx\n", - vcpu->arch.pc); - } - } else + err = kvm_compute_return_epc(vcpu, vcpu->arch.pc, + &vcpu->arch.pc); + if (err) + return EMULATE_FAIL; + } else { vcpu->arch.pc += 4; + } kvm_debug("update_pc(): New PC: %#lx\n", vcpu->arch.pc); - return er; + return EMULATE_DONE; +} + +/** + * kvm_get_badinstr() - Get bad instruction encoding. + * @opc: Guest pointer to faulting instruction. + * @vcpu: KVM VCPU information. + * + * Gets the instruction encoding of the faulting instruction, using the saved + * BadInstr register value if it exists, otherwise falling back to reading guest + * memory at @opc. + * + * Returns: The instruction encoding of the faulting instruction. + */ +int kvm_get_badinstr(u32 *opc, struct kvm_vcpu *vcpu, u32 *out) +{ + if (cpu_has_badinstr) { + *out = vcpu->arch.host_cp0_badinstr; + return 0; + } else { + return kvm_get_inst(opc, vcpu, out); + } +} + +/** + * kvm_get_badinstrp() - Get bad prior instruction encoding. + * @opc: Guest pointer to prior faulting instruction. + * @vcpu: KVM VCPU information. 
+ * + * Gets the instruction encoding of the prior faulting instruction (the branch + * containing the delay slot which faulted), using the saved BadInstrP register + * value if it exists, otherwise falling back to reading guest memory at @opc. + * + * Returns: The instruction encoding of the prior faulting instruction. + */ +int kvm_get_badinstrp(u32 *opc, struct kvm_vcpu *vcpu, u32 *out) +{ + if (cpu_has_badinstrp) { + *out = vcpu->arch.host_cp0_badinstrp; + return 0; + } else { + return kvm_get_inst(opc, vcpu, out); + } } /** @@ -856,22 +897,30 @@ enum emulation_result kvm_mips_emul_tlbr(struct kvm_vcpu *vcpu) static void kvm_mips_invalidate_guest_tlb(struct kvm_vcpu *vcpu, struct kvm_mips_tlb *tlb) { + struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; + struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; int cpu, i; bool user; /* No need to flush for entries which are already invalid */ if (!((tlb->tlb_lo[0] | tlb->tlb_lo[1]) & ENTRYLO_V)) return; + /* Don't touch host kernel page tables or TLB mappings */ + if ((unsigned long)tlb->tlb_hi > 0x7fffffff) + return; /* User address space doesn't need flushing for KSeg2/3 changes */ user = tlb->tlb_hi < KVM_GUEST_KSEG0; preempt_disable(); + /* Invalidate page table entries */ + kvm_trap_emul_invalidate_gva(vcpu, tlb->tlb_hi & VPN2_MASK, user); + /* * Probe the shadow host TLB for the entry being overwritten, if one * matches, invalidate it */ - kvm_mips_host_tlb_inv(vcpu, tlb->tlb_hi); + kvm_mips_host_tlb_inv(vcpu, tlb->tlb_hi, user, true); /* Invalidate the whole ASID on other CPUs */ cpu = smp_processor_id(); @@ -879,8 +928,8 @@ static void kvm_mips_invalidate_guest_tlb(struct kvm_vcpu *vcpu, if (i == cpu) continue; if (user) - vcpu->arch.guest_user_asid[i] = 0; - vcpu->arch.guest_kernel_asid[i] = 0; + cpu_context(i, user_mm) = 0; + cpu_context(i, kern_mm) = 0; } preempt_enable(); @@ -1017,7 +1066,7 @@ unsigned int kvm_mips_config4_wrmask(struct kvm_vcpu *vcpu) unsigned int mask = MIPS_CONF_M; /* KScrExist */ - mask |= (unsigned int)vcpu->arch.kscratch_enabled << 16; + mask |= 0xfc << MIPS_CONF4_KSCREXIST_SHIFT; return mask; } @@ -1056,6 +1105,7 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; + struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; enum emulation_result er = EMULATE_DONE; u32 rt, rd, sel; unsigned long curr_pc; @@ -1150,14 +1200,13 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, er = EMULATE_FAIL; break; } -#define C0_EBASE_CORE_MASK 0xff if ((rd == MIPS_CP0_PRID) && (sel == 1)) { - /* Preserve CORE number */ - kvm_change_c0_guest_ebase(cop0, - ~(C0_EBASE_CORE_MASK), + /* + * Preserve core number, and keep the exception + * base in guest KSeg0. + */ + kvm_change_c0_guest_ebase(cop0, 0x1ffff000, vcpu->arch.gprs[rt]); - kvm_err("MTCz, cop0->reg[EBASE]: %#lx\n", - kvm_read_c0_guest_ebase(cop0)); } else if (rd == MIPS_CP0_TLB_HI && sel == 0) { u32 nasid = vcpu->arch.gprs[rt] & KVM_ENTRYHI_ASID; @@ -1169,6 +1218,17 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, nasid); /* + * Flush entries from the GVA page + * tables. + * Guest user page table will get + * flushed lazily on re-entry to guest + * user if the guest ASID actually + * changes. + */ + kvm_mips_flush_gva_pt(kern_mm->pgd, + KMF_KERN); + + /* * Regenerate/invalidate kernel MMU * context. 
* The user MMU context will be @@ -1178,13 +1238,10 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, */ preempt_disable(); cpu = smp_processor_id(); - kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, - cpu, vcpu); - vcpu->arch.guest_kernel_asid[cpu] = - vcpu->arch.guest_kernel_mm.context.asid[cpu]; + get_new_mmu_context(kern_mm, cpu); for_each_possible_cpu(i) if (i != cpu) - vcpu->arch.guest_kernel_asid[i] = 0; + cpu_context(i, kern_mm) = 0; preempt_enable(); } kvm_write_c0_guest_entryhi(cop0, @@ -1639,12 +1696,56 @@ enum emulation_result kvm_mips_emulate_load(union mips_instruction inst, return er; } +static enum emulation_result kvm_mips_guest_cache_op(int (*fn)(unsigned long), + unsigned long curr_pc, + unsigned long addr, + struct kvm_run *run, + struct kvm_vcpu *vcpu, + u32 cause) +{ + int err; + + for (;;) { + /* Carefully attempt the cache operation */ + kvm_trap_emul_gva_lockless_begin(vcpu); + err = fn(addr); + kvm_trap_emul_gva_lockless_end(vcpu); + + if (likely(!err)) + return EMULATE_DONE; + + /* + * Try to handle the fault and retry, maybe we just raced with a + * GVA invalidation. + */ + switch (kvm_trap_emul_gva_fault(vcpu, addr, false)) { + case KVM_MIPS_GVA: + case KVM_MIPS_GPA: + /* bad virtual or physical address */ + return EMULATE_FAIL; + case KVM_MIPS_TLB: + /* no matching guest TLB */ + vcpu->arch.host_cp0_badvaddr = addr; + vcpu->arch.pc = curr_pc; + kvm_mips_emulate_tlbmiss_ld(cause, NULL, run, vcpu); + return EMULATE_EXCEPT; + case KVM_MIPS_TLBINV: + /* invalid matching guest TLB */ + vcpu->arch.host_cp0_badvaddr = addr; + vcpu->arch.pc = curr_pc; + kvm_mips_emulate_tlbinv_ld(cause, NULL, run, vcpu); + return EMULATE_EXCEPT; + default: + break; + }; + } +} + enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst, u32 *opc, u32 cause, struct kvm_run *run, struct kvm_vcpu *vcpu) { - struct mips_coproc *cop0 = vcpu->arch.cop0; enum emulation_result er = EMULATE_DONE; u32 cache, op_inst, op, base; s16 offset; @@ -1701,80 +1802,16 @@ enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst, goto done; } - preempt_disable(); - if (KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG0) { - if (kvm_mips_host_tlb_lookup(vcpu, va) < 0 && - kvm_mips_handle_kseg0_tlb_fault(va, vcpu)) { - kvm_err("%s: handling mapped kseg0 tlb fault for %lx, vcpu: %p, ASID: %#lx\n", - __func__, va, vcpu, read_c0_entryhi()); - er = EMULATE_FAIL; - preempt_enable(); - goto done; - } - } else if ((KVM_GUEST_KSEGX(va) < KVM_GUEST_KSEG0) || - KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG23) { - int index; - - /* If an entry already exists then skip */ - if (kvm_mips_host_tlb_lookup(vcpu, va) >= 0) - goto skip_fault; - - /* - * If address not in the guest TLB, then give the guest a fault, - * the resulting handler will do the right thing - */ - index = kvm_mips_guest_tlb_lookup(vcpu, (va & VPN2_MASK) | - (kvm_read_c0_guest_entryhi - (cop0) & KVM_ENTRYHI_ASID)); - - if (index < 0) { - vcpu->arch.host_cp0_badvaddr = va; - vcpu->arch.pc = curr_pc; - er = kvm_mips_emulate_tlbmiss_ld(cause, NULL, run, - vcpu); - preempt_enable(); - goto dont_update_pc; - } else { - struct kvm_mips_tlb *tlb = &vcpu->arch.guest_tlb[index]; - /* - * Check if the entry is valid, if not then setup a TLB - * invalid exception to the guest - */ - if (!TLB_IS_VALID(*tlb, va)) { - vcpu->arch.host_cp0_badvaddr = va; - vcpu->arch.pc = curr_pc; - er = kvm_mips_emulate_tlbinv_ld(cause, NULL, - run, vcpu); - preempt_enable(); - goto dont_update_pc; - } - /* - * We fault an entry from the guest 
tlb to the - * shadow host TLB - */ - if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb)) { - kvm_err("%s: handling mapped seg tlb fault for %lx, index: %u, vcpu: %p, ASID: %#lx\n", - __func__, va, index, vcpu, - read_c0_entryhi()); - er = EMULATE_FAIL; - preempt_enable(); - goto done; - } - } - } else { - kvm_err("INVALID CACHE INDEX/ADDRESS (cache: %#x, op: %#x, base[%d]: %#lx, offset: %#x\n", - cache, op, base, arch->gprs[base], offset); - er = EMULATE_FAIL; - preempt_enable(); - goto done; - - } - -skip_fault: /* XXXKYMA: Only a subset of cache ops are supported, used by Linux */ if (op_inst == Hit_Writeback_Inv_D || op_inst == Hit_Invalidate_D) { - flush_dcache_line(va); - + /* + * Perform the dcache part of icache synchronisation on the + * guest's behalf. + */ + er = kvm_mips_guest_cache_op(protected_writeback_dcache_line, + curr_pc, va, run, vcpu, cause); + if (er != EMULATE_DONE) + goto done; #ifdef CONFIG_KVM_MIPS_DYN_TRANS /* * Replace the CACHE instruction, with a SYNCI, not the same, @@ -1783,8 +1820,15 @@ skip_fault: kvm_mips_trans_cache_va(inst, opc, vcpu); #endif } else if (op_inst == Hit_Invalidate_I) { - flush_dcache_line(va); - flush_icache_line(va); + /* Perform the icache synchronisation on the guest's behalf */ + er = kvm_mips_guest_cache_op(protected_writeback_dcache_line, + curr_pc, va, run, vcpu, cause); + if (er != EMULATE_DONE) + goto done; + er = kvm_mips_guest_cache_op(protected_flush_icache_line, + curr_pc, va, run, vcpu, cause); + if (er != EMULATE_DONE) + goto done; #ifdef CONFIG_KVM_MIPS_DYN_TRANS /* Replace the CACHE instruction, with a SYNCI */ @@ -1796,17 +1840,13 @@ skip_fault: er = EMULATE_FAIL; } - preempt_enable(); done: /* Rollback PC only if emulation was unsuccessful */ if (er == EMULATE_FAIL) vcpu->arch.pc = curr_pc; - -dont_update_pc: - /* - * This is for exceptions whose emulation updates the PC, so do not - * overwrite the PC under any circumstances - */ + /* Guest exception needs guest to resume */ + if (er == EMULATE_EXCEPT) + er = EMULATE_DONE; return er; } @@ -1817,12 +1857,14 @@ enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc, { union mips_instruction inst; enum emulation_result er = EMULATE_DONE; + int err; /* Fetch the instruction. */ if (cause & CAUSEF_BD) opc += 1; - - inst.word = kvm_get_inst(opc, vcpu); + err = kvm_get_badinstr(opc, vcpu, &inst.word); + if (err) + return EMULATE_FAIL; switch (inst.r_format.opcode) { case cop0_op: @@ -1874,6 +1916,22 @@ unknown: return er; } +/** + * kvm_mips_guest_exception_base() - Find guest exception vector base address. + * + * Returns: The base address of the current guest exception vector, taking + * both Guest.CP0_Status.BEV and Guest.CP0_EBase into account. 
+ */ +long kvm_mips_guest_exception_base(struct kvm_vcpu *vcpu) +{ + struct mips_coproc *cop0 = vcpu->arch.cop0; + + if (kvm_read_c0_guest_status(cop0) & ST0_BEV) + return KVM_GUEST_CKSEG1ADDR(0x1fc00200); + else + return kvm_read_c0_guest_ebase(cop0) & MIPS_EBASE_BASE; +} + enum emulation_result kvm_mips_emulate_syscall(u32 cause, u32 *opc, struct kvm_run *run, @@ -1899,7 +1957,7 @@ enum emulation_result kvm_mips_emulate_syscall(u32 cause, (EXCCODE_SYS << CAUSEB_EXCCODE)); /* Set PC to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; } else { kvm_err("Trying to deliver SYSCALL when EXL is already set\n"); @@ -1933,13 +1991,13 @@ enum emulation_result kvm_mips_emulate_tlbmiss_ld(u32 cause, arch->pc); /* set pc to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x0; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x0; } else { kvm_debug("[EXL == 1] delivering TLB MISS @ pc %#lx\n", arch->pc); - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; } kvm_change_c0_guest_cause(cop0, (0xff), @@ -1949,8 +2007,6 @@ enum emulation_result kvm_mips_emulate_tlbmiss_ld(u32 cause, kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr); /* XXXKYMA: is the context register used by linux??? */ kvm_write_c0_guest_entryhi(cop0, entryhi); - /* Blow away the shadow host TLBs */ - kvm_mips_flush_host_tlb(1); return EMULATE_DONE; } @@ -1978,16 +2034,14 @@ enum emulation_result kvm_mips_emulate_tlbinv_ld(u32 cause, kvm_debug("[EXL == 0] delivering TLB INV @ pc %#lx\n", arch->pc); - - /* set pc to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x180; - } else { kvm_debug("[EXL == 1] delivering TLB MISS @ pc %#lx\n", arch->pc); - arch->pc = KVM_GUEST_KSEG0 + 0x180; } + /* set pc to the exception entry point */ + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; + kvm_change_c0_guest_cause(cop0, (0xff), (EXCCODE_TLBL << CAUSEB_EXCCODE)); @@ -1995,8 +2049,6 @@ enum emulation_result kvm_mips_emulate_tlbinv_ld(u32 cause, kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr); /* XXXKYMA: is the context register used by linux??? */ kvm_write_c0_guest_entryhi(cop0, entryhi); - /* Blow away the shadow host TLBs */ - kvm_mips_flush_host_tlb(1); return EMULATE_DONE; } @@ -2025,11 +2077,11 @@ enum emulation_result kvm_mips_emulate_tlbmiss_st(u32 cause, arch->pc); /* Set PC to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x0; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x0; } else { kvm_debug("[EXL == 1] Delivering TLB MISS @ pc %#lx\n", arch->pc); - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; } kvm_change_c0_guest_cause(cop0, (0xff), @@ -2039,8 +2091,6 @@ enum emulation_result kvm_mips_emulate_tlbmiss_st(u32 cause, kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr); /* XXXKYMA: is the context register used by linux??? 
*/ kvm_write_c0_guest_entryhi(cop0, entryhi); - /* Blow away the shadow host TLBs */ - kvm_mips_flush_host_tlb(1); return EMULATE_DONE; } @@ -2067,15 +2117,14 @@ enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause, kvm_debug("[EXL == 0] Delivering TLB MISS @ pc %#lx\n", arch->pc); - - /* Set PC to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x180; } else { kvm_debug("[EXL == 1] Delivering TLB MISS @ pc %#lx\n", arch->pc); - arch->pc = KVM_GUEST_KSEG0 + 0x180; } + /* Set PC to the exception entry point */ + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; + kvm_change_c0_guest_cause(cop0, (0xff), (EXCCODE_TLBS << CAUSEB_EXCCODE)); @@ -2083,41 +2132,10 @@ enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause, kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr); /* XXXKYMA: is the context register used by linux??? */ kvm_write_c0_guest_entryhi(cop0, entryhi); - /* Blow away the shadow host TLBs */ - kvm_mips_flush_host_tlb(1); return EMULATE_DONE; } -/* TLBMOD: store into address matching TLB with Dirty bit off */ -enum emulation_result kvm_mips_handle_tlbmod(u32 cause, u32 *opc, - struct kvm_run *run, - struct kvm_vcpu *vcpu) -{ - enum emulation_result er = EMULATE_DONE; -#ifdef DEBUG - struct mips_coproc *cop0 = vcpu->arch.cop0; - unsigned long entryhi = (vcpu->arch.host_cp0_badvaddr & VPN2_MASK) | - (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID); - int index; - - /* If address not in the guest TLB, then we are in trouble */ - index = kvm_mips_guest_tlb_lookup(vcpu, entryhi); - if (index < 0) { - /* XXXKYMA Invalidate and retry */ - kvm_mips_host_tlb_inv(vcpu, vcpu->arch.host_cp0_badvaddr); - kvm_err("%s: host got TLBMOD for %#lx but entry not present in Guest TLB\n", - __func__, entryhi); - kvm_mips_dump_guest_tlbs(vcpu); - kvm_mips_dump_host_tlbs(); - return EMULATE_FAIL; - } -#endif - - er = kvm_mips_emulate_tlbmod(cause, opc, run, vcpu); - return er; -} - enum emulation_result kvm_mips_emulate_tlbmod(u32 cause, u32 *opc, struct kvm_run *run, @@ -2140,14 +2158,13 @@ enum emulation_result kvm_mips_emulate_tlbmod(u32 cause, kvm_debug("[EXL == 0] Delivering TLB MOD @ pc %#lx\n", arch->pc); - - arch->pc = KVM_GUEST_KSEG0 + 0x180; } else { kvm_debug("[EXL == 1] Delivering TLB MOD @ pc %#lx\n", arch->pc); - arch->pc = KVM_GUEST_KSEG0 + 0x180; } + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; + kvm_change_c0_guest_cause(cop0, (0xff), (EXCCODE_MOD << CAUSEB_EXCCODE)); @@ -2155,8 +2172,6 @@ enum emulation_result kvm_mips_emulate_tlbmod(u32 cause, kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr); /* XXXKYMA: is the context register used by linux??? 
*/ kvm_write_c0_guest_entryhi(cop0, entryhi); - /* Blow away the shadow host TLBs */ - kvm_mips_flush_host_tlb(1); return EMULATE_DONE; } @@ -2181,7 +2196,7 @@ enum emulation_result kvm_mips_emulate_fpu_exc(u32 cause, } - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; kvm_change_c0_guest_cause(cop0, (0xff), (EXCCODE_CPU << CAUSEB_EXCCODE)); @@ -2215,7 +2230,7 @@ enum emulation_result kvm_mips_emulate_ri_exc(u32 cause, (EXCCODE_RI << CAUSEB_EXCCODE)); /* Set PC to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; } else { kvm_err("Trying to deliver RI when EXL is already set\n"); @@ -2250,7 +2265,7 @@ enum emulation_result kvm_mips_emulate_bp_exc(u32 cause, (EXCCODE_BP << CAUSEB_EXCCODE)); /* Set PC to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; } else { kvm_err("Trying to deliver BP when EXL is already set\n"); @@ -2285,7 +2300,7 @@ enum emulation_result kvm_mips_emulate_trap_exc(u32 cause, (EXCCODE_TR << CAUSEB_EXCCODE)); /* Set PC to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; } else { kvm_err("Trying to deliver TRAP when EXL is already set\n"); @@ -2320,7 +2335,7 @@ enum emulation_result kvm_mips_emulate_msafpe_exc(u32 cause, (EXCCODE_MSAFPE << CAUSEB_EXCCODE)); /* Set PC to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; } else { kvm_err("Trying to deliver MSAFPE when EXL is already set\n"); @@ -2355,7 +2370,7 @@ enum emulation_result kvm_mips_emulate_fpe_exc(u32 cause, (EXCCODE_FPE << CAUSEB_EXCCODE)); /* Set PC to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; } else { kvm_err("Trying to deliver FPE when EXL is already set\n"); @@ -2390,7 +2405,7 @@ enum emulation_result kvm_mips_emulate_msadis_exc(u32 cause, (EXCCODE_MSADIS << CAUSEB_EXCCODE)); /* Set PC to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; } else { kvm_err("Trying to deliver MSADIS when EXL is already set\n"); @@ -2409,6 +2424,7 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc, enum emulation_result er = EMULATE_DONE; unsigned long curr_pc; union mips_instruction inst; + int err; /* * Update PC and hold onto current PC in case there is @@ -2422,11 +2438,9 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc, /* Fetch the instruction. 
*/ if (cause & CAUSEF_BD) opc += 1; - - inst.word = kvm_get_inst(opc, vcpu); - - if (inst.word == KVM_INVALID_INST) { - kvm_err("%s: Cannot get inst @ %p\n", __func__, opc); + err = kvm_get_badinstr(opc, vcpu, &inst.word); + if (err) { + kvm_err("%s: Cannot get inst @ %p (%d)\n", __func__, opc, err); return EMULATE_FAIL; } @@ -2557,7 +2571,7 @@ static enum emulation_result kvm_mips_emulate_exc(u32 cause, (exccode << CAUSEB_EXCCODE)); /* Set PC to the exception entry point */ - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180; kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr); kvm_debug("Delivering EXC %d @ pc %#lx, badVaddr: %#lx\n", @@ -2670,7 +2684,8 @@ enum emulation_result kvm_mips_check_privilege(u32 cause, enum emulation_result kvm_mips_handle_tlbmiss(u32 cause, u32 *opc, struct kvm_run *run, - struct kvm_vcpu *vcpu) + struct kvm_vcpu *vcpu, + bool write_fault) { enum emulation_result er = EMULATE_DONE; u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f; @@ -2726,7 +2741,8 @@ enum emulation_result kvm_mips_handle_tlbmiss(u32 cause, * OK we have a Guest TLB entry, now inject it into the * shadow host TLB */ - if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb)) { + if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, va, + write_fault)) { kvm_err("%s: handling mapped seg tlb fault for %lx, index: %u, vcpu: %p, ASID: %#lx\n", __func__, va, index, vcpu, read_c0_entryhi()); diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c index e92fb190e2d6..c5b254c4d0da 100644 --- a/arch/mips/kvm/entry.c +++ b/arch/mips/kvm/entry.c @@ -12,8 +12,11 @@ */ #include <linux/kvm_host.h> +#include <linux/log2.h> +#include <asm/mmu_context.h> #include <asm/msa.h> #include <asm/setup.h> +#include <asm/tlbex.h> #include <asm/uasm.h> /* Register names */ @@ -50,6 +53,8 @@ /* Some CP0 registers */ #define C0_HWRENA 7, 0 #define C0_BADVADDR 8, 0 +#define C0_BADINSTR 8, 1 +#define C0_BADINSTRP 8, 2 #define C0_ENTRYHI 10, 0 #define C0_STATUS 12, 0 #define C0_CAUSE 13, 0 @@ -89,6 +94,21 @@ static void *kvm_mips_build_ret_from_exit(void *addr); static void *kvm_mips_build_ret_to_guest(void *addr); static void *kvm_mips_build_ret_to_host(void *addr); +/* + * The version of this function in tlbex.c uses current_cpu_type(), but for KVM + * we assume symmetry. + */ +static int c0_kscratch(void) +{ + switch (boot_cpu_type()) { + case CPU_XLP: + case CPU_XLR: + return 22; + default: + return 31; + } +} + /** * kvm_mips_entry_setup() - Perform global setup for entry code. * @@ -103,18 +123,21 @@ int kvm_mips_entry_setup(void) * We prefer to use KScratchN registers if they are available over the * defaults above, which may not work on all cores. 
*/ - unsigned int kscratch_mask = cpu_data[0].kscratch_mask & 0xfc; + unsigned int kscratch_mask = cpu_data[0].kscratch_mask; + + if (pgd_reg != -1) + kscratch_mask &= ~BIT(pgd_reg); /* Pick a scratch register for storing VCPU */ if (kscratch_mask) { - scratch_vcpu[0] = 31; + scratch_vcpu[0] = c0_kscratch(); scratch_vcpu[1] = ffs(kscratch_mask) - 1; kscratch_mask &= ~BIT(scratch_vcpu[1]); } /* Pick a scratch register to use as a temp for saving state */ if (kscratch_mask) { - scratch_tmp[0] = 31; + scratch_tmp[0] = c0_kscratch(); scratch_tmp[1] = ffs(kscratch_mask) - 1; kscratch_mask &= ~BIT(scratch_tmp[1]); } @@ -130,7 +153,7 @@ static void kvm_mips_build_save_scratch(u32 **p, unsigned int tmp, UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame); /* Save the temp scratch register value in cp0_cause of stack frame */ - if (scratch_tmp[0] == 31) { + if (scratch_tmp[0] == c0_kscratch()) { UASM_i_MFC0(p, tmp, scratch_tmp[0], scratch_tmp[1]); UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame); } @@ -146,7 +169,7 @@ static void kvm_mips_build_restore_scratch(u32 **p, unsigned int tmp, UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame); UASM_i_MTC0(p, tmp, scratch_vcpu[0], scratch_vcpu[1]); - if (scratch_tmp[0] == 31) { + if (scratch_tmp[0] == c0_kscratch()) { UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame); UASM_i_MTC0(p, tmp, scratch_tmp[0], scratch_tmp[1]); } @@ -286,23 +309,26 @@ static void *kvm_mips_build_enter_guest(void *addr) uasm_i_andi(&p, T0, T0, KSU_USER | ST0_ERL | ST0_EXL); uasm_i_xori(&p, T0, T0, KSU_USER); uasm_il_bnez(&p, &r, T0, label_kernel_asid); - UASM_i_ADDIU(&p, T1, K1, - offsetof(struct kvm_vcpu_arch, guest_kernel_asid)); + UASM_i_ADDIU(&p, T1, K1, offsetof(struct kvm_vcpu_arch, + guest_kernel_mm.context.asid)); /* else user */ - UASM_i_ADDIU(&p, T1, K1, - offsetof(struct kvm_vcpu_arch, guest_user_asid)); + UASM_i_ADDIU(&p, T1, K1, offsetof(struct kvm_vcpu_arch, + guest_user_mm.context.asid)); uasm_l_kernel_asid(&l, p); /* t1: contains the base of the ASID array, need to get the cpu id */ /* smp_processor_id */ uasm_i_lw(&p, T2, offsetof(struct thread_info, cpu), GP); - /* x4 */ - uasm_i_sll(&p, T2, T2, 2); + /* index the ASID array */ + uasm_i_sll(&p, T2, T2, ilog2(sizeof(long))); UASM_i_ADDU(&p, T3, T1, T2); - uasm_i_lw(&p, K0, 0, T3); + UASM_i_LW(&p, K0, 0, T3); #ifdef CONFIG_MIPS_ASID_BITS_VARIABLE - /* x sizeof(struct cpuinfo_mips)/4 */ - uasm_i_addiu(&p, T3, ZERO, sizeof(struct cpuinfo_mips)/4); + /* + * reuse ASID array offset + * cpuinfo_mips is a multiple of sizeof(long) + */ + uasm_i_addiu(&p, T3, ZERO, sizeof(struct cpuinfo_mips)/sizeof(long)); uasm_i_mul(&p, T2, T2, T3); UASM_i_LA_mostly(&p, AT, (long)&cpu_data[0].asid_mask); @@ -312,7 +338,20 @@ static void *kvm_mips_build_enter_guest(void *addr) #else uasm_i_andi(&p, K0, K0, MIPS_ENTRYHI_ASID); #endif - uasm_i_mtc0(&p, K0, C0_ENTRYHI); + + /* + * Set up KVM T&E GVA pgd. + * This does roughly the same as TLBMISS_HANDLER_SETUP_PGD(): + * - call tlbmiss_handler_setup_pgd(mm->pgd) + * - but skips write into CP0_PWBase for now + */ + UASM_i_LW(&p, A0, (int)offsetof(struct mm_struct, pgd) - + (int)offsetof(struct mm_struct, context.asid), T1); + + UASM_i_LA(&p, T9, (unsigned long)tlbmiss_handler_setup_pgd); + uasm_i_jalr(&p, RA, T9); + uasm_i_mtc0(&p, K0, C0_ENTRYHI); + uasm_i_ehb(&p); /* Disable RDHWR access */ @@ -348,6 +387,80 @@ static void *kvm_mips_build_enter_guest(void *addr) } /** + * kvm_mips_build_tlb_refill_exception() - Assemble TLB refill handler. 
+ * @addr: Address to start writing code. + * @handler: Address of common handler (within range of @addr). + * + * Assemble TLB refill exception fast path handler for guest execution. + * + * Returns: Next address after end of written function. + */ +void *kvm_mips_build_tlb_refill_exception(void *addr, void *handler) +{ + u32 *p = addr; + struct uasm_label labels[2]; + struct uasm_reloc relocs[2]; + struct uasm_label *l = labels; + struct uasm_reloc *r = relocs; + + memset(labels, 0, sizeof(labels)); + memset(relocs, 0, sizeof(relocs)); + + /* Save guest k1 into scratch register */ + UASM_i_MTC0(&p, K1, scratch_tmp[0], scratch_tmp[1]); + + /* Get the VCPU pointer from the VCPU scratch register */ + UASM_i_MFC0(&p, K1, scratch_vcpu[0], scratch_vcpu[1]); + + /* Save guest k0 into VCPU structure */ + UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu, arch.gprs[K0]), K1); + + /* + * Some of the common tlbex code uses current_cpu_type(). For KVM we + * assume symmetry and just disable preemption to silence the warning. + */ + preempt_disable(); + + /* + * Now for the actual refill bit. A lot of this can be common with the + * Linux TLB refill handler, however we don't need to handle so many + * cases. We only need to handle user mode refills, and user mode runs + * with 32-bit addressing. + * + * Therefore the branch to label_vmalloc generated by build_get_pmde64() + * that isn't resolved should never actually get taken and is harmless + * to leave in place for now. + */ + +#ifdef CONFIG_64BIT + build_get_pmde64(&p, &l, &r, K0, K1); /* get pmd in K1 */ +#else + build_get_pgde32(&p, K0, K1); /* get pgd in K1 */ +#endif + + /* we don't support huge pages yet */ + + build_get_ptep(&p, K0, K1); + build_update_entries(&p, K0, K1); + build_tlb_write_entry(&p, &l, &r, tlb_random); + + preempt_enable(); + + /* Get the VCPU pointer from the VCPU scratch register again */ + UASM_i_MFC0(&p, K1, scratch_vcpu[0], scratch_vcpu[1]); + + /* Restore the guest's k0/k1 registers */ + UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu, arch.gprs[K0]), K1); + uasm_i_ehb(&p); + UASM_i_MFC0(&p, K1, scratch_tmp[0], scratch_tmp[1]); + + /* Jump to guest */ + uasm_i_eret(&p); + + return p; +} + +/** * kvm_mips_build_exception() - Assemble first level guest exception handler. * @addr: Address to start writing code. * @handler: Address of common handler (within range of @addr). 
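Taken together with the vcpu creation hunk later in this patch (refill handler at gebase, general exception at gebase + 0x180, common handlers at gebase + 0x2000) and the interrupt delivery hunk (base + 0x180, or base + 0x200 when the guest's Cause.IV is set), the layout can be summarised as below; the GEBASE_* names are made up for illustration:

	/* Layout of the per-vcpu guest exception base (gebase) allocation: */
	#define GEBASE_TLB_REFILL	0x0000	/* kvm_mips_build_tlb_refill_exception() */
	#define GEBASE_GEN_EXC		0x0180	/* kvm_mips_build_exception() */
	#define GEBASE_INT_VEC		0x0200	/* delivery target when guest Cause.IV=1 */
	#define GEBASE_HANDLERS		0x2000	/* kvm_mips_build_exit() handlers, then vcpu_run */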
@@ -468,6 +581,18 @@ void *kvm_mips_build_exit(void *addr) uasm_i_mfc0(&p, K0, C0_CAUSE); uasm_i_sw(&p, K0, offsetof(struct kvm_vcpu_arch, host_cp0_cause), K1); + if (cpu_has_badinstr) { + uasm_i_mfc0(&p, K0, C0_BADINSTR); + uasm_i_sw(&p, K0, offsetof(struct kvm_vcpu_arch, + host_cp0_badinstr), K1); + } + + if (cpu_has_badinstrp) { + uasm_i_mfc0(&p, K0, C0_BADINSTRP); + uasm_i_sw(&p, K0, offsetof(struct kvm_vcpu_arch, + host_cp0_badinstrp), K1); + } + /* Now restore the host state just enough to run the handlers */ /* Switch EBASE to the one used by Linux */ diff --git a/arch/mips/kvm/interrupt.c b/arch/mips/kvm/interrupt.c index e88403b3dcdd..aa0a1a00faf6 100644 --- a/arch/mips/kvm/interrupt.c +++ b/arch/mips/kvm/interrupt.c @@ -183,10 +183,11 @@ int kvm_mips_irq_deliver_cb(struct kvm_vcpu *vcpu, unsigned int priority, (exccode << CAUSEB_EXCCODE)); /* XXXSL Set PC to the interrupt exception entry point */ + arch->pc = kvm_mips_guest_exception_base(vcpu); if (kvm_read_c0_guest_cause(cop0) & CAUSEF_IV) - arch->pc = KVM_GUEST_KSEG0 + 0x200; + arch->pc += 0x200; else - arch->pc = KVM_GUEST_KSEG0 + 0x180; + arch->pc += 0x180; clear_bit(priority, &vcpu->arch.pending_exceptions); } diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 29ec9ab3fd55..31ee5ee0010b 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -22,6 +22,7 @@ #include <asm/page.h> #include <asm/cacheflush.h> #include <asm/mmu_context.h> +#include <asm/pgalloc.h> #include <asm/pgtable.h> #include <linux/kvm_host.h> @@ -63,18 +64,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { {NULL} }; -static int kvm_mips_reset_vcpu(struct kvm_vcpu *vcpu) -{ - int i; - - for_each_possible_cpu(i) { - vcpu->arch.guest_kernel_asid[i] = 0; - vcpu->arch.guest_user_asid[i] = 0; - } - - return 0; -} - /* * XXXKYMA: We are simulatoring a processor that has the WII bit set in * Config7, so we are "runnable" if interrupts are pending @@ -104,39 +93,12 @@ void kvm_arch_check_processor_compat(void *rtn) *(int *)rtn = 0; } -static void kvm_mips_init_tlbs(struct kvm *kvm) -{ - unsigned long wired; - - /* - * Add a wired entry to the TLB, it is used to map the commpage to - * the Guest kernel - */ - wired = read_c0_wired(); - write_c0_wired(wired + 1); - mtc0_tlbw_hazard(); - kvm->arch.commpage_tlb = wired; - - kvm_debug("[%d] commpage TLB: %d\n", smp_processor_id(), - kvm->arch.commpage_tlb); -} - -static void kvm_mips_init_vm_percpu(void *arg) -{ - struct kvm *kvm = (struct kvm *)arg; - - kvm_mips_init_tlbs(kvm); - kvm_mips_callbacks->vm_init(kvm); - -} - int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { - if (atomic_inc_return(&kvm_mips_instance) == 1) { - kvm_debug("%s: 1st KVM instance, setup host TLB parameters\n", - __func__); - on_each_cpu(kvm_mips_init_vm_percpu, kvm, 1); - } + /* Allocate page table to map GPA -> RPA */ + kvm->arch.gpa_mm.pgd = kvm_pgd_alloc(); + if (!kvm->arch.gpa_mm.pgd) + return -ENOMEM; return 0; } @@ -156,13 +118,6 @@ void kvm_mips_free_vcpus(struct kvm *kvm) unsigned int i; struct kvm_vcpu *vcpu; - /* Put the pages we reserved for the guest pmap */ - for (i = 0; i < kvm->arch.guest_pmap_npages; i++) { - if (kvm->arch.guest_pmap[i] != KVM_INVALID_PAGE) - kvm_release_pfn_clean(kvm->arch.guest_pmap[i]); - } - kfree(kvm->arch.guest_pmap); - kvm_for_each_vcpu(i, vcpu, kvm) { kvm_arch_vcpu_free(vcpu); } @@ -177,25 +132,17 @@ void kvm_mips_free_vcpus(struct kvm *kvm) mutex_unlock(&kvm->lock); } -static void kvm_mips_uninit_tlbs(void *arg) +static void kvm_mips_free_gpa_pt(struct kvm *kvm) { - 
/* Restore wired count */ - write_c0_wired(0); - mtc0_tlbw_hazard(); - /* Clear out all the TLBs */ - kvm_local_flush_tlb_all(); + /* It should always be safe to remove after flushing the whole range */ + WARN_ON(!kvm_mips_flush_gpa_pt(kvm, 0, ~0)); + pgd_free(NULL, kvm->arch.gpa_mm.pgd); } void kvm_arch_destroy_vm(struct kvm *kvm) { kvm_mips_free_vcpus(kvm); - - /* If this is the last instance, restore wired count */ - if (atomic_dec_return(&kvm_mips_instance) == 0) { - kvm_debug("%s: last KVM instance, restoring TLB parameters\n", - __func__); - on_each_cpu(kvm_mips_uninit_tlbs, NULL, 1); - } + kvm_mips_free_gpa_pt(kvm); } long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, @@ -210,6 +157,32 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, return 0; } +void kvm_arch_flush_shadow_all(struct kvm *kvm) +{ + /* Flush whole GPA */ + kvm_mips_flush_gpa_pt(kvm, 0, ~0); + + /* Let implementation do the rest */ + kvm_mips_callbacks->flush_shadow_all(kvm); +} + +void kvm_arch_flush_shadow_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + /* + * The slot has been made invalid (ready for moving or deletion), so we + * need to ensure that it can no longer be accessed by any guest VCPUs. + */ + + spin_lock(&kvm->mmu_lock); + /* Flush slot from GPA */ + kvm_mips_flush_gpa_pt(kvm, slot->base_gfn, + slot->base_gfn + slot->npages - 1); + /* Let implementation do the rest */ + kvm_mips_callbacks->flush_shadow_memslot(kvm, slot); + spin_unlock(&kvm->mmu_lock); +} + int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, const struct kvm_userspace_memory_region *mem, @@ -224,35 +197,32 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *new, enum kvm_mr_change change) { - unsigned long npages = 0; - int i; + int needs_flush; kvm_debug("%s: kvm: %p slot: %d, GPA: %llx, size: %llx, QVA: %llx\n", __func__, kvm, mem->slot, mem->guest_phys_addr, mem->memory_size, mem->userspace_addr); - /* Setup Guest PMAP table */ - if (!kvm->arch.guest_pmap) { - if (mem->slot == 0) - npages = mem->memory_size >> PAGE_SHIFT; - - if (npages) { - kvm->arch.guest_pmap_npages = npages; - kvm->arch.guest_pmap = - kzalloc(npages * sizeof(unsigned long), GFP_KERNEL); - - if (!kvm->arch.guest_pmap) { - kvm_err("Failed to allocate guest PMAP\n"); - return; - } - - kvm_debug("Allocated space for Guest PMAP Table (%ld pages) @ %p\n", - npages, kvm->arch.guest_pmap); - - /* Now setup the page table */ - for (i = 0; i < npages; i++) - kvm->arch.guest_pmap[i] = KVM_INVALID_PAGE; - } + /* + * If dirty page logging is enabled, write protect all pages in the slot + * ready for dirty logging. + * + * There is no need to do this in any of the following cases: + * CREATE: No dirty mappings will already exist. 
+ * MOVE/DELETE: The old mappings will already have been cleaned up by + * kvm_arch_flush_shadow_memslot() + */ + if (change == KVM_MR_FLAGS_ONLY && + (!(old->flags & KVM_MEM_LOG_DIRTY_PAGES) && + new->flags & KVM_MEM_LOG_DIRTY_PAGES)) { + spin_lock(&kvm->mmu_lock); + /* Write protect GPA page table entries */ + needs_flush = kvm_mips_mkclean_gpa_pt(kvm, new->base_gfn, + new->base_gfn + new->npages - 1); + /* Let implementation do the rest */ + if (needs_flush) + kvm_mips_callbacks->flush_shadow_memslot(kvm, new); + spin_unlock(&kvm->mmu_lock); } } @@ -276,7 +246,7 @@ static inline void dump_handler(const char *symbol, void *start, void *end) struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { int err, size; - void *gebase, *p, *handler; + void *gebase, *p, *handler, *refill_start, *refill_end; int i; struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL); @@ -329,8 +299,9 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) /* Build guest exception vectors dynamically in unmapped memory */ handler = gebase + 0x2000; - /* TLB Refill, EXL = 0 */ - kvm_mips_build_exception(gebase, handler); + /* TLB refill */ + refill_start = gebase; + refill_end = kvm_mips_build_tlb_refill_exception(refill_start, handler); /* General Exception Entry point */ kvm_mips_build_exception(gebase + 0x180, handler); @@ -356,6 +327,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) pr_debug("#include <asm/regdef.h>\n"); pr_debug("\n"); dump_handler("kvm_vcpu_run", vcpu->arch.vcpu_run, p); + dump_handler("kvm_tlb_refill", refill_start, refill_end); dump_handler("kvm_gen_exc", gebase + 0x180, gebase + 0x200); dump_handler("kvm_exit", gebase + 0x2000, vcpu->arch.vcpu_run); @@ -406,6 +378,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) kvm_mips_dump_stats(vcpu); + kvm_mmu_free_memory_caches(vcpu); kfree(vcpu->arch.guest_ebase); kfree(vcpu->arch.kseg0_commpage); kfree(vcpu); @@ -422,34 +395,6 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, return -ENOIOCTLCMD; } -/* Must be called with preemption disabled, just before entering guest */ -static void kvm_mips_check_asids(struct kvm_vcpu *vcpu) -{ - struct mips_coproc *cop0 = vcpu->arch.cop0; - int i, cpu = smp_processor_id(); - unsigned int gasid; - - /* - * Lazy host ASID regeneration for guest user mode. - * If the guest ASID has changed since the last guest usermode - * execution, regenerate the host ASID so as to invalidate stale TLB - * entries. 
- */ - if (!KVM_GUEST_KERNEL_MODE(vcpu)) { - gasid = kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID; - if (gasid != vcpu->arch.last_user_gasid) { - kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, - vcpu); - vcpu->arch.guest_user_asid[cpu] = - vcpu->arch.guest_user_mm.context.asid[cpu]; - for_each_possible_cpu(i) - if (i != cpu) - vcpu->arch.guest_user_asid[cpu] = 0; - vcpu->arch.last_user_gasid = gasid; - } - } -} - int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) { int r = 0; @@ -467,25 +412,20 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) lose_fpu(1); local_irq_disable(); - /* Check if we have any exceptions/interrupts pending */ - kvm_mips_deliver_interrupts(vcpu, - kvm_read_c0_guest_cause(vcpu->arch.cop0)); - guest_enter_irqoff(); - - /* Disable hardware page table walking while in guest */ - htw_stop(); - trace_kvm_enter(vcpu); - kvm_mips_check_asids(vcpu); + /* + * Make sure the read of VCPU requests in vcpu_run() callback is not + * reordered ahead of the write to vcpu->mode, or we could miss a TLB + * flush request while the requester sees the VCPU as outside of guest + * mode and not needing an IPI. + */ + smp_store_mb(vcpu->mode, IN_GUEST_MODE); - r = vcpu->arch.vcpu_run(run, vcpu); - trace_kvm_out(vcpu); - - /* Re-enable HTW before enabling interrupts */ - htw_start(); + r = kvm_mips_callbacks->vcpu_run(run, vcpu); + trace_kvm_out(vcpu); guest_exit_irqoff(); local_irq_enable(); @@ -580,33 +520,6 @@ static u64 kvm_mips_get_one_regs[] = { KVM_REG_MIPS_LO, #endif KVM_REG_MIPS_PC, - - KVM_REG_MIPS_CP0_INDEX, - KVM_REG_MIPS_CP0_CONTEXT, - KVM_REG_MIPS_CP0_USERLOCAL, - KVM_REG_MIPS_CP0_PAGEMASK, - KVM_REG_MIPS_CP0_WIRED, - KVM_REG_MIPS_CP0_HWRENA, - KVM_REG_MIPS_CP0_BADVADDR, - KVM_REG_MIPS_CP0_COUNT, - KVM_REG_MIPS_CP0_ENTRYHI, - KVM_REG_MIPS_CP0_COMPARE, - KVM_REG_MIPS_CP0_STATUS, - KVM_REG_MIPS_CP0_CAUSE, - KVM_REG_MIPS_CP0_EPC, - KVM_REG_MIPS_CP0_PRID, - KVM_REG_MIPS_CP0_CONFIG, - KVM_REG_MIPS_CP0_CONFIG1, - KVM_REG_MIPS_CP0_CONFIG2, - KVM_REG_MIPS_CP0_CONFIG3, - KVM_REG_MIPS_CP0_CONFIG4, - KVM_REG_MIPS_CP0_CONFIG5, - KVM_REG_MIPS_CP0_CONFIG7, - KVM_REG_MIPS_CP0_ERROREPC, - - KVM_REG_MIPS_COUNT_CTL, - KVM_REG_MIPS_COUNT_RESUME, - KVM_REG_MIPS_COUNT_HZ, }; static u64 kvm_mips_get_one_regs_fpu[] = { @@ -619,15 +532,6 @@ static u64 kvm_mips_get_one_regs_msa[] = { KVM_REG_MIPS_MSA_CSR, }; -static u64 kvm_mips_get_one_regs_kscratch[] = { - KVM_REG_MIPS_CP0_KSCRATCH1, - KVM_REG_MIPS_CP0_KSCRATCH2, - KVM_REG_MIPS_CP0_KSCRATCH3, - KVM_REG_MIPS_CP0_KSCRATCH4, - KVM_REG_MIPS_CP0_KSCRATCH5, - KVM_REG_MIPS_CP0_KSCRATCH6, -}; - static unsigned long kvm_mips_num_regs(struct kvm_vcpu *vcpu) { unsigned long ret; @@ -641,7 +545,6 @@ static unsigned long kvm_mips_num_regs(struct kvm_vcpu *vcpu) } if (kvm_mips_guest_can_have_msa(&vcpu->arch)) ret += ARRAY_SIZE(kvm_mips_get_one_regs_msa) + 32; - ret += __arch_hweight8(vcpu->arch.kscratch_enabled); ret += kvm_mips_callbacks->num_regs(vcpu); return ret; @@ -694,16 +597,6 @@ static int kvm_mips_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices) } } - for (i = 0; i < 6; ++i) { - if (!(vcpu->arch.kscratch_enabled & BIT(i + 2))) - continue; - - if (copy_to_user(indices, &kvm_mips_get_one_regs_kscratch[i], - sizeof(kvm_mips_get_one_regs_kscratch[i]))) - return -EFAULT; - ++indices; - } - return kvm_mips_callbacks->copy_reg_indices(vcpu, indices); } @@ -794,95 +687,6 @@ static int kvm_mips_get_reg(struct kvm_vcpu *vcpu, v = fpu->msacsr; break; - /* Co-processor 0 registers */ - case 
KVM_REG_MIPS_CP0_INDEX: - v = (long)kvm_read_c0_guest_index(cop0); - break; - case KVM_REG_MIPS_CP0_CONTEXT: - v = (long)kvm_read_c0_guest_context(cop0); - break; - case KVM_REG_MIPS_CP0_USERLOCAL: - v = (long)kvm_read_c0_guest_userlocal(cop0); - break; - case KVM_REG_MIPS_CP0_PAGEMASK: - v = (long)kvm_read_c0_guest_pagemask(cop0); - break; - case KVM_REG_MIPS_CP0_WIRED: - v = (long)kvm_read_c0_guest_wired(cop0); - break; - case KVM_REG_MIPS_CP0_HWRENA: - v = (long)kvm_read_c0_guest_hwrena(cop0); - break; - case KVM_REG_MIPS_CP0_BADVADDR: - v = (long)kvm_read_c0_guest_badvaddr(cop0); - break; - case KVM_REG_MIPS_CP0_ENTRYHI: - v = (long)kvm_read_c0_guest_entryhi(cop0); - break; - case KVM_REG_MIPS_CP0_COMPARE: - v = (long)kvm_read_c0_guest_compare(cop0); - break; - case KVM_REG_MIPS_CP0_STATUS: - v = (long)kvm_read_c0_guest_status(cop0); - break; - case KVM_REG_MIPS_CP0_CAUSE: - v = (long)kvm_read_c0_guest_cause(cop0); - break; - case KVM_REG_MIPS_CP0_EPC: - v = (long)kvm_read_c0_guest_epc(cop0); - break; - case KVM_REG_MIPS_CP0_PRID: - v = (long)kvm_read_c0_guest_prid(cop0); - break; - case KVM_REG_MIPS_CP0_CONFIG: - v = (long)kvm_read_c0_guest_config(cop0); - break; - case KVM_REG_MIPS_CP0_CONFIG1: - v = (long)kvm_read_c0_guest_config1(cop0); - break; - case KVM_REG_MIPS_CP0_CONFIG2: - v = (long)kvm_read_c0_guest_config2(cop0); - break; - case KVM_REG_MIPS_CP0_CONFIG3: - v = (long)kvm_read_c0_guest_config3(cop0); - break; - case KVM_REG_MIPS_CP0_CONFIG4: - v = (long)kvm_read_c0_guest_config4(cop0); - break; - case KVM_REG_MIPS_CP0_CONFIG5: - v = (long)kvm_read_c0_guest_config5(cop0); - break; - case KVM_REG_MIPS_CP0_CONFIG7: - v = (long)kvm_read_c0_guest_config7(cop0); - break; - case KVM_REG_MIPS_CP0_ERROREPC: - v = (long)kvm_read_c0_guest_errorepc(cop0); - break; - case KVM_REG_MIPS_CP0_KSCRATCH1 ... 
KVM_REG_MIPS_CP0_KSCRATCH6: - idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2; - if (!(vcpu->arch.kscratch_enabled & BIT(idx))) - return -EINVAL; - switch (idx) { - case 2: - v = (long)kvm_read_c0_guest_kscratch1(cop0); - break; - case 3: - v = (long)kvm_read_c0_guest_kscratch2(cop0); - break; - case 4: - v = (long)kvm_read_c0_guest_kscratch3(cop0); - break; - case 5: - v = (long)kvm_read_c0_guest_kscratch4(cop0); - break; - case 6: - v = (long)kvm_read_c0_guest_kscratch5(cop0); - break; - case 7: - v = (long)kvm_read_c0_guest_kscratch6(cop0); - break; - } - break; /* registers to be handled specially */ default: ret = kvm_mips_callbacks->get_one_reg(vcpu, reg, &v); @@ -1014,68 +818,6 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu, fpu->msacsr = v; break; - /* Co-processor 0 registers */ - case KVM_REG_MIPS_CP0_INDEX: - kvm_write_c0_guest_index(cop0, v); - break; - case KVM_REG_MIPS_CP0_CONTEXT: - kvm_write_c0_guest_context(cop0, v); - break; - case KVM_REG_MIPS_CP0_USERLOCAL: - kvm_write_c0_guest_userlocal(cop0, v); - break; - case KVM_REG_MIPS_CP0_PAGEMASK: - kvm_write_c0_guest_pagemask(cop0, v); - break; - case KVM_REG_MIPS_CP0_WIRED: - kvm_write_c0_guest_wired(cop0, v); - break; - case KVM_REG_MIPS_CP0_HWRENA: - kvm_write_c0_guest_hwrena(cop0, v); - break; - case KVM_REG_MIPS_CP0_BADVADDR: - kvm_write_c0_guest_badvaddr(cop0, v); - break; - case KVM_REG_MIPS_CP0_ENTRYHI: - kvm_write_c0_guest_entryhi(cop0, v); - break; - case KVM_REG_MIPS_CP0_STATUS: - kvm_write_c0_guest_status(cop0, v); - break; - case KVM_REG_MIPS_CP0_EPC: - kvm_write_c0_guest_epc(cop0, v); - break; - case KVM_REG_MIPS_CP0_PRID: - kvm_write_c0_guest_prid(cop0, v); - break; - case KVM_REG_MIPS_CP0_ERROREPC: - kvm_write_c0_guest_errorepc(cop0, v); - break; - case KVM_REG_MIPS_CP0_KSCRATCH1 ... KVM_REG_MIPS_CP0_KSCRATCH6: - idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2; - if (!(vcpu->arch.kscratch_enabled & BIT(idx))) - return -EINVAL; - switch (idx) { - case 2: - kvm_write_c0_guest_kscratch1(cop0, v); - break; - case 3: - kvm_write_c0_guest_kscratch2(cop0, v); - break; - case 4: - kvm_write_c0_guest_kscratch3(cop0, v); - break; - case 5: - kvm_write_c0_guest_kscratch4(cop0, v); - break; - case 6: - kvm_write_c0_guest_kscratch5(cop0, v); - break; - case 7: - kvm_write_c0_guest_kscratch6(cop0, v); - break; - } - break; /* registers to be handled specially */ default: return kvm_mips_callbacks->set_one_reg(vcpu, reg, v); @@ -1144,18 +886,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, return -E2BIG; return kvm_mips_copy_reg_indices(vcpu, user_list->reg); } - case KVM_NMI: - /* Treat the NMI as a CPU reset */ - r = kvm_mips_reset_vcpu(vcpu); - break; case KVM_INTERRUPT: { struct kvm_mips_interrupt irq; - r = -EFAULT; if (copy_from_user(&irq, argp, sizeof(irq))) - goto out; - + return -EFAULT; kvm_debug("[%d] %s: irq: %d\n", vcpu->vcpu_id, __func__, irq.irq); @@ -1165,56 +901,57 @@ long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, case KVM_ENABLE_CAP: { struct kvm_enable_cap cap; - r = -EFAULT; if (copy_from_user(&cap, argp, sizeof(cap))) - goto out; + return -EFAULT; r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap); break; } default: r = -ENOIOCTLCMD; } - -out: return r; } -/* Get (and clear) the dirty memory log for a memory slot. */ +/** + * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot + * @kvm: kvm instance + * @log: slot id and address to which we copy the log + * + * Steps 1-4 below provide general overview of dirty page logging. 
See + * kvm_get_dirty_log_protect() function description for additional details. + * + * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we + * always flush the TLB (step 4) even if previous step failed and the dirty + * bitmap may be corrupt. Regardless of previous outcome the KVM logging API + * does not preclude user space subsequent dirty log read. Flushing TLB ensures + * writes will be marked dirty for next log read. + * + * 1. Take a snapshot of the bit and clear it if needed. + * 2. Write protect the corresponding page. + * 3. Copy the snapshot to the userspace. + * 4. Flush TLB's if needed. + */ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; - unsigned long ga, ga_end; - int is_dirty = 0; + bool is_dirty = false; int r; - unsigned long n; mutex_lock(&kvm->slots_lock); - r = kvm_get_dirty_log(kvm, log, &is_dirty); - if (r) - goto out; + r = kvm_get_dirty_log_protect(kvm, log, &is_dirty); - /* If nothing is dirty, don't bother messing with page tables. */ if (is_dirty) { slots = kvm_memslots(kvm); memslot = id_to_memslot(slots, log->slot); - ga = memslot->base_gfn << PAGE_SHIFT; - ga_end = ga + (memslot->npages << PAGE_SHIFT); - - kvm_info("%s: dirty, ga: %#lx, ga_end %#lx\n", __func__, ga, - ga_end); - - n = kvm_dirty_bitmap_bytes(memslot); - memset(memslot->dirty_bitmap, 0, n); + /* Let implementation handle TLB/GVA invalidation */ + kvm_mips_callbacks->flush_shadow_memslot(kvm, memslot); } - r = 0; -out: mutex_unlock(&kvm->slots_lock); return r; - } long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) @@ -1282,11 +1019,19 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) switch (ext) { case KVM_CAP_ONE_REG: case KVM_CAP_ENABLE_CAP: + case KVM_CAP_READONLY_MEM: + case KVM_CAP_SYNC_MMU: r = 1; break; case KVM_CAP_COALESCED_MMIO: r = KVM_COALESCED_MMIO_PAGE_OFFSET; break; + case KVM_CAP_NR_VCPUS: + r = num_online_cpus(); + break; + case KVM_CAP_MAX_VCPUS: + r = KVM_MAX_VCPUS; + break; case KVM_CAP_MIPS_FPU: /* We don't handle systems with inconsistent cpu_has_fpu */ r = !!raw_cpu_has_fpu; @@ -1400,13 +1145,23 @@ static enum hrtimer_restart kvm_mips_comparecount_wakeup(struct hrtimer *timer) int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { - kvm_mips_callbacks->vcpu_init(vcpu); + int err; + + err = kvm_mips_callbacks->vcpu_init(vcpu); + if (err) + return err; + hrtimer_init(&vcpu->arch.comparecount_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); vcpu->arch.comparecount_timer.function = kvm_mips_comparecount_wakeup; return 0; } +void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) +{ + kvm_mips_callbacks->vcpu_uninit(vcpu); +} + int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, struct kvm_translation *tr) { @@ -1440,8 +1195,11 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; enum emulation_result er = EMULATE_DONE; + u32 inst; int ret = RESUME_GUEST; + vcpu->mode = OUTSIDE_GUEST_MODE; + /* re-enable HTW before enabling interrupts */ htw_start(); @@ -1564,8 +1322,12 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) break; default: + if (cause & CAUSEF_BD) + opc += 1; + inst = 0; + kvm_get_badinstr(opc, vcpu, &inst); kvm_err("Exception Code: %d, not yet handled, @ PC: %p, inst: 0x%08x BadVaddr: %#lx Status: %#lx\n", - exccode, opc, kvm_get_inst(opc, vcpu), badvaddr, + exccode, opc, inst, 
badvaddr, kvm_read_c0_guest_status(vcpu->arch.cop0)); kvm_arch_vcpu_dump_regs(vcpu); run->exit_reason = KVM_EXIT_INTERNAL_ERROR; @@ -1593,7 +1355,15 @@ skip_emul: if (ret == RESUME_GUEST) { trace_kvm_reenter(vcpu); - kvm_mips_check_asids(vcpu); + /* + * Make sure the read of VCPU requests in vcpu_reenter() + * callback is not reordered ahead of the write to vcpu->mode, + * or we could miss a TLB flush request while the requester sees + * the VCPU as outside of guest mode and not needing an IPI. + */ + smp_store_mb(vcpu->mode, IN_GUEST_MODE); + + kvm_mips_callbacks->vcpu_reenter(run, vcpu); /* * If FPU / MSA are enabled (i.e. the guest's FPU / MSA context diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 3b677c851be0..cb0faade311e 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -11,86 +11,995 @@ #include <linux/highmem.h> #include <linux/kvm_host.h> +#include <linux/uaccess.h> #include <asm/mmu_context.h> +#include <asm/pgalloc.h> -static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu) +/* + * KVM_MMU_CACHE_MIN_PAGES is the number of GPA page table translation levels + * for which pages need to be cached. + */ +#if defined(__PAGETABLE_PMD_FOLDED) +#define KVM_MMU_CACHE_MIN_PAGES 1 +#else +#define KVM_MMU_CACHE_MIN_PAGES 2 +#endif + +static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, + int min, int max) { - int cpu = smp_processor_id(); + void *page; + + BUG_ON(max > KVM_NR_MEM_OBJS); + if (cache->nobjs >= min) + return 0; + while (cache->nobjs < max) { + page = (void *)__get_free_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + cache->objects[cache->nobjs++] = page; + } + return 0; +} - return vcpu->arch.guest_kernel_asid[cpu] & - cpu_asid_mask(&cpu_data[cpu]); +static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) +{ + while (mc->nobjs) + free_page((unsigned long)mc->objects[--mc->nobjs]); } -static u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu) +static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) { - int cpu = smp_processor_id(); + void *p; - return vcpu->arch.guest_user_asid[cpu] & - cpu_asid_mask(&cpu_data[cpu]); + BUG_ON(!mc || !mc->nobjs); + p = mc->objects[--mc->nobjs]; + return p; } -static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn) +void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu) { - int srcu_idx, err = 0; - kvm_pfn_t pfn; + mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); +} + +/** + * kvm_pgd_init() - Initialise KVM GPA page directory. + * @page: Pointer to page directory (PGD) for KVM GPA. + * + * Initialise a KVM GPA page directory with pointers to the invalid table, i.e. + * representing no mappings. This is similar to pgd_init(), however it + * initialises all the page directory pointers, not just the ones corresponding + * to the userland address space (since it is for the guest physical address + * space rather than a virtual address space). + */ +static void kvm_pgd_init(void *page) +{ + unsigned long *p, *end; + unsigned long entry; + +#ifdef __PAGETABLE_PMD_FOLDED + entry = (unsigned long)invalid_pte_table; +#else + entry = (unsigned long)invalid_pmd_table; +#endif + + p = (unsigned long *)page; + end = p + PTRS_PER_PGD; + + do { + p[0] = entry; + p[1] = entry; + p[2] = entry; + p[3] = entry; + p[4] = entry; + p += 8; + p[-3] = entry; + p[-2] = entry; + p[-1] = entry; + } while (p != end); +} + +/** + * kvm_pgd_alloc() - Allocate and initialise a KVM GPA page directory. 
+ * + * Allocate a blank KVM GPA page directory (PGD) for representing guest physical + * to host physical page mappings. + * + * Returns: Pointer to new KVM GPA page directory. + * NULL on allocation failure. + */ +pgd_t *kvm_pgd_alloc(void) +{ + pgd_t *ret; + + ret = (pgd_t *)__get_free_pages(GFP_KERNEL, PGD_ORDER); + if (ret) + kvm_pgd_init(ret); + + return ret; +} + +/** + * kvm_mips_walk_pgd() - Walk page table with optional allocation. + * @pgd: Page directory pointer. + * @addr: Address to index page table using. + * @cache: MMU page cache to allocate new page tables from, or NULL. + * + * Walk the page tables pointed to by @pgd to find the PTE corresponding to the + * address @addr. If page tables don't exist for @addr, they will be created + * from the MMU cache if @cache is not NULL. + * + * Returns: Pointer to pte_t corresponding to @addr. + * NULL if a page table doesn't exist for @addr and !@cache. + * NULL if a page table allocation failed. + */ +static pte_t *kvm_mips_walk_pgd(pgd_t *pgd, struct kvm_mmu_memory_cache *cache, + unsigned long addr) +{ + pud_t *pud; + pmd_t *pmd; + + pgd += pgd_index(addr); + if (pgd_none(*pgd)) { + /* Not used on MIPS yet */ + BUG(); + return NULL; + } + pud = pud_offset(pgd, addr); + if (pud_none(*pud)) { + pmd_t *new_pmd; + + if (!cache) + return NULL; + new_pmd = mmu_memory_cache_alloc(cache); + pmd_init((unsigned long)new_pmd, + (unsigned long)invalid_pte_table); + pud_populate(NULL, pud, new_pmd); + } + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + pte_t *new_pte; + + if (!cache) + return NULL; + new_pte = mmu_memory_cache_alloc(cache); + clear_page(new_pte); + pmd_populate_kernel(NULL, pmd, new_pte); + } + return pte_offset(pmd, addr); +} + +/* Caller must hold kvm->mm_lock */ +static pte_t *kvm_mips_pte_for_gpa(struct kvm *kvm, + struct kvm_mmu_memory_cache *cache, + unsigned long addr) +{ + return kvm_mips_walk_pgd(kvm->arch.gpa_mm.pgd, cache, addr); +} + +/* + * kvm_mips_flush_gpa_{pte,pmd,pud,pgd,pt}. + * Flush a range of guest physical address space from the VM's GPA page tables. 
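A usage sketch for the walker above (gpa_pte_lookup() is a hypothetical wrapper, not from this patch): the memcache must be topped up before walking, since up to KVM_MMU_CACHE_MIN_PAGES intermediate tables may be allocated, and kvm_mips_pte_for_gpa() itself must be called under kvm->mmu_lock:

	static pte_t *gpa_pte_lookup(struct kvm_vcpu *vcpu, unsigned long gpa)
	{
		struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
		pte_t *ptep;

		/* top up first: the walk may allocate intermediate tables */
		if (mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES,
					   KVM_NR_MEM_OBJS))
			return NULL;

		spin_lock(&vcpu->kvm->mmu_lock);
		ptep = kvm_mips_pte_for_gpa(vcpu->kvm, memcache, gpa);
		spin_unlock(&vcpu->kvm->mmu_lock);

		return ptep;	/* sketch only: see note below */
	}

Real callers in this patch keep mmu_lock held while the returned PTE pointer is dereferenced; dropping the lock first, as the sketch does, would only be safe if the caller tolerates the entry changing underneath it.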
+ */
+
+static bool kvm_mips_flush_gpa_pte(pte_t *pte, unsigned long start_gpa,
+				   unsigned long end_gpa)
+{
+	int i_min = __pte_offset(start_gpa);
+	int i_max = __pte_offset(end_gpa);
+	bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PTE - 1);
+	int i;
+
+	for (i = i_min; i <= i_max; ++i) {
+		if (!pte_present(pte[i]))
+			continue;
+
+		set_pte(pte + i, __pte(0));
+	}
+	return safe_to_remove;
+}
+
+static bool kvm_mips_flush_gpa_pmd(pmd_t *pmd, unsigned long start_gpa,
+				   unsigned long end_gpa)
+{
+	pte_t *pte;
+	unsigned long end = ~0ul;
+	int i_min = __pmd_offset(start_gpa);
+	int i_max = __pmd_offset(end_gpa);
+	bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PMD - 1);
+	int i;
+
+	for (i = i_min; i <= i_max; ++i, start_gpa = 0) {
+		if (!pmd_present(pmd[i]))
+			continue;
+
+		pte = pte_offset(pmd + i, 0);
+		if (i == i_max)
+			end = end_gpa;
+
+		if (kvm_mips_flush_gpa_pte(pte, start_gpa, end)) {
+			pmd_clear(pmd + i);
+			pte_free_kernel(NULL, pte);
+		} else {
+			safe_to_remove = false;
+		}
+	}
+	return safe_to_remove;
+}
+
+static bool kvm_mips_flush_gpa_pud(pud_t *pud, unsigned long start_gpa,
+				   unsigned long end_gpa)
+{
+	pmd_t *pmd;
+	unsigned long end = ~0ul;
+	int i_min = __pud_offset(start_gpa);
+	int i_max = __pud_offset(end_gpa);
+	bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PUD - 1);
+	int i;
+
+	for (i = i_min; i <= i_max; ++i, start_gpa = 0) {
+		if (!pud_present(pud[i]))
+			continue;
+
+		pmd = pmd_offset(pud + i, 0);
+		if (i == i_max)
+			end = end_gpa;
+
+		if (kvm_mips_flush_gpa_pmd(pmd, start_gpa, end)) {
+			pud_clear(pud + i);
+			pmd_free(NULL, pmd);
+		} else {
+			safe_to_remove = false;
+		}
+	}
+	return safe_to_remove;
+}
+
+static bool kvm_mips_flush_gpa_pgd(pgd_t *pgd, unsigned long start_gpa,
+				   unsigned long end_gpa)
+{
+	pud_t *pud;
+	unsigned long end = ~0ul;
+	int i_min = pgd_index(start_gpa);
+	int i_max = pgd_index(end_gpa);
+	bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PGD - 1);
+	int i;
+
+	for (i = i_min; i <= i_max; ++i, start_gpa = 0) {
+		if (!pgd_present(pgd[i]))
+			continue;
+
+		pud = pud_offset(pgd + i, 0);
+		if (i == i_max)
+			end = end_gpa;
+
+		if (kvm_mips_flush_gpa_pud(pud, start_gpa, end)) {
+			pgd_clear(pgd + i);
+			pud_free(NULL, pud);
+		} else {
+			safe_to_remove = false;
+		}
+	}
+	return safe_to_remove;
+}
+
+/**
+ * kvm_mips_flush_gpa_pt() - Flush a range of guest physical addresses.
+ * @kvm:	KVM pointer.
+ * @start_gfn:	Guest frame number of first page in GPA range to flush.
+ * @end_gfn:	Guest frame number of last page in GPA range to flush.
+ *
+ * Flushes a range of GPA mappings from the GPA page tables.
+ *
+ * The caller must hold the @kvm->mmu_lock spinlock.
+ *
+ * Returns:	Whether it's safe to remove the top level page directory because
+ *		all lower levels have been removed.
+ */
+bool kvm_mips_flush_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn)
+{
+	return kvm_mips_flush_gpa_pgd(kvm->arch.gpa_mm.pgd,
+				      start_gfn << PAGE_SHIFT,
+				      end_gfn << PAGE_SHIFT);
+}
+
+#define BUILD_PTE_RANGE_OP(name, op) \
+static int kvm_mips_##name##_pte(pte_t *pte, unsigned long start, \
+				 unsigned long end) \
+{ \
+	int ret = 0; \
+	int i_min = __pte_offset(start); \
+	int i_max = __pte_offset(end); \
+	int i; \
+	pte_t old, new; \
+ \
+	for (i = i_min; i <= i_max; ++i) { \
+		if (!pte_present(pte[i])) \
+			continue; \
+ \
+		old = pte[i]; \
+		new = op(old); \
+		if (pte_val(new) == pte_val(old)) \
+			continue; \
+		set_pte(pte + i, new); \
+		ret = 1; \
+	} \
+	return ret; \
+} \
+ \
+/* returns true if anything was done */ \
+static int kvm_mips_##name##_pmd(pmd_t *pmd, unsigned long start, \
+				 unsigned long end) \
+{ \
+	int ret = 0; \
+	pte_t *pte; \
+	unsigned long cur_end = ~0ul; \
+	int i_min = __pmd_offset(start); \
+	int i_max = __pmd_offset(end); \
+	int i; \
+ \
+	for (i = i_min; i <= i_max; ++i, start = 0) { \
+		if (!pmd_present(pmd[i])) \
+			continue; \
+ \
+		pte = pte_offset(pmd + i, 0); \
+		if (i == i_max) \
+			cur_end = end; \
+ \
+		ret |= kvm_mips_##name##_pte(pte, start, cur_end); \
+	} \
+	return ret; \
+} \
+ \
+static int kvm_mips_##name##_pud(pud_t *pud, unsigned long start, \
+				 unsigned long end) \
+{ \
+	int ret = 0; \
+	pmd_t *pmd; \
+	unsigned long cur_end = ~0ul; \
+	int i_min = __pud_offset(start); \
+	int i_max = __pud_offset(end); \
+	int i; \
+ \
+	for (i = i_min; i <= i_max; ++i, start = 0) { \
+		if (!pud_present(pud[i])) \
+			continue; \
+ \
+		pmd = pmd_offset(pud + i, 0); \
+		if (i == i_max) \
+			cur_end = end; \
+ \
+		ret |= kvm_mips_##name##_pmd(pmd, start, cur_end); \
+	} \
+	return ret; \
+} \
+ \
+static int kvm_mips_##name##_pgd(pgd_t *pgd, unsigned long start, \
+				 unsigned long end) \
+{ \
+	int ret = 0; \
+	pud_t *pud; \
+	unsigned long cur_end = ~0ul; \
+	int i_min = pgd_index(start); \
+	int i_max = pgd_index(end); \
+	int i; \
+ \
+	for (i = i_min; i <= i_max; ++i, start = 0) { \
+		if (!pgd_present(pgd[i])) \
+			continue; \
+ \
+		pud = pud_offset(pgd + i, 0); \
+		if (i == i_max) \
+			cur_end = end; \
+ \
+		ret |= kvm_mips_##name##_pud(pud, start, cur_end); \
+	} \
+	return ret; \
+}
+
+/*
+ * kvm_mips_mkclean_gpa_pt.
+ * Mark a range of guest physical address space clean (writes fault) in the VM's
+ * GPA page table to allow dirty page tracking.
+ */
-	if (kvm->arch.guest_pmap[gfn] != KVM_INVALID_PAGE)
+BUILD_PTE_RANGE_OP(mkclean, pte_mkclean)
+
+/**
+ * kvm_mips_mkclean_gpa_pt() - Make a range of guest physical addresses clean.
+ * @kvm:	KVM pointer.
+ * @start_gfn:	Guest frame number of first page in GPA range to flush.
+ * @end_gfn:	Guest frame number of last page in GPA range to flush.
+ *
+ * Make a range of GPA mappings clean so that guest writes will fault and
+ * trigger dirty page logging.
+ *
+ * The caller must hold the @kvm->mmu_lock spinlock.
+ *
+ * Returns:	Whether any GPA mappings were modified, which would require
+ *		derived mappings (GVA page tables & TLB entries) to be
+ *		invalidated.
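BUILD_PTE_RANGE_OP() above stamps out the four per-level walkers (kvm_mips_mkclean_pte/_pmd/_pud/_pgd) behind the wrapper defined just below. Its intended use is already visible in the kvm_arch_commit_memory_region() hunk earlier in this patch; condensed:

	spin_lock(&kvm->mmu_lock);
	/* write protect the GPA range; returns whether anything changed */
	needs_flush = kvm_mips_mkclean_gpa_pt(kvm, slot->base_gfn,
					      slot->base_gfn + slot->npages - 1);
	if (needs_flush)	/* derived GVA mappings / TLB entries must go too */
		kvm_mips_callbacks->flush_shadow_memslot(kvm, slot);
	spin_unlock(&kvm->mmu_lock);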
+ */ +int kvm_mips_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn) +{ + return kvm_mips_mkclean_pgd(kvm->arch.gpa_mm.pgd, + start_gfn << PAGE_SHIFT, + end_gfn << PAGE_SHIFT); +} + +/** + * kvm_arch_mmu_enable_log_dirty_pt_masked() - write protect dirty pages + * @kvm: The KVM pointer + * @slot: The memory slot associated with mask + * @gfn_offset: The gfn offset in memory slot + * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory + * slot to be write protected + * + * Walks bits set in mask write protects the associated pte's. Caller must + * acquire @kvm->mmu_lock. + */ +void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) +{ + gfn_t base_gfn = slot->base_gfn + gfn_offset; + gfn_t start = base_gfn + __ffs(mask); + gfn_t end = base_gfn + __fls(mask); + + kvm_mips_mkclean_gpa_pt(kvm, start, end); +} + +/* + * kvm_mips_mkold_gpa_pt. + * Mark a range of guest physical address space old (all accesses fault) in the + * VM's GPA page table to allow detection of commonly used pages. + */ + +BUILD_PTE_RANGE_OP(mkold, pte_mkold) + +static int kvm_mips_mkold_gpa_pt(struct kvm *kvm, gfn_t start_gfn, + gfn_t end_gfn) +{ + return kvm_mips_mkold_pgd(kvm->arch.gpa_mm.pgd, + start_gfn << PAGE_SHIFT, + end_gfn << PAGE_SHIFT); +} + +static int handle_hva_to_gpa(struct kvm *kvm, + unsigned long start, + unsigned long end, + int (*handler)(struct kvm *kvm, gfn_t gfn, + gpa_t gfn_end, + struct kvm_memory_slot *memslot, + void *data), + void *data) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + int ret = 0; + + slots = kvm_memslots(kvm); + + /* we only care about the pages that the guest sees */ + kvm_for_each_memslot(memslot, slots) { + unsigned long hva_start, hva_end; + gfn_t gfn, gfn_end; + + hva_start = max(start, memslot->userspace_addr); + hva_end = min(end, memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT)); + if (hva_start >= hva_end) + continue; + + /* + * {gfn(page) | page intersects with [hva_start, hva_end)} = + * {gfn_start, gfn_start+1, ..., gfn_end-1}. 
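The gfn clamping that handle_hva_to_gpa() performs per memslot is easy to check numerically. A standalone toy model (all addresses and slot geometry are invented for illustration):

	#include <stdio.h>

	#define PAGE_SHIFT	12
	#define PAGE_SIZE	(1UL << PAGE_SHIFT)

	int main(void)
	{
		unsigned long slot_hva = 0x7f0000400000UL, slot_gfn = 0x80000UL;
		unsigned long npages = 16;
		unsigned long start = 0x7f0000402800UL, end = 0x7f0000405000UL;

		/* clamp the HVA range to the slot, as in handle_hva_to_gpa() */
		unsigned long hva_start = start > slot_hva ? start : slot_hva;
		unsigned long hva_end = end < slot_hva + (npages << PAGE_SHIFT) ?
					end : slot_hva + (npages << PAGE_SHIFT);

		/* hva_to_gfn_memslot(): gfn = slot_gfn + (hva - slot_hva) / PAGE_SIZE */
		unsigned long gfn = slot_gfn + ((hva_start - slot_hva) >> PAGE_SHIFT);
		unsigned long gfn_end = slot_gfn +
			((hva_end + PAGE_SIZE - 1 - slot_hva) >> PAGE_SHIFT);

		printf("gfn %#lx..%#lx\n", gfn, gfn_end);	/* 0x80002..0x80005 */
		return 0;
	}

Pages 2, 3 and 4 of the slot intersect the HVA range, so the handler is invoked for {0x80002, 0x80003, 0x80004}, exactly the set-builder comment above with gfn_end exclusive.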
+ */ + gfn = hva_to_gfn_memslot(hva_start, memslot); + gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); + + ret |= handler(kvm, gfn, gfn_end, memslot, data); + } + + return ret; +} + + +static int kvm_unmap_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, + struct kvm_memory_slot *memslot, void *data) +{ + kvm_mips_flush_gpa_pt(kvm, gfn, gfn_end); + return 1; +} + +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +{ + unsigned long end = hva + PAGE_SIZE; + + handle_hva_to_gpa(kvm, hva, end, &kvm_unmap_hva_handler, NULL); + + kvm_mips_callbacks->flush_shadow_all(kvm); + return 0; +} + +int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) +{ + handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL); + + kvm_mips_callbacks->flush_shadow_all(kvm); + return 0; +} + +static int kvm_set_spte_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, + struct kvm_memory_slot *memslot, void *data) +{ + gpa_t gpa = gfn << PAGE_SHIFT; + pte_t hva_pte = *(pte_t *)data; + pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa); + pte_t old_pte; + + if (!gpa_pte) + return 0; + + /* Mapping may need adjusting depending on memslot flags */ + old_pte = *gpa_pte; + if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES && !pte_dirty(old_pte)) + hva_pte = pte_mkclean(hva_pte); + else if (memslot->flags & KVM_MEM_READONLY) + hva_pte = pte_wrprotect(hva_pte); + + set_pte(gpa_pte, hva_pte); + + /* Replacing an absent or old page doesn't need flushes */ + if (!pte_present(old_pte) || !pte_young(old_pte)) return 0; + /* Pages swapped, aged, moved, or cleaned require flushes */ + return !pte_present(hva_pte) || + !pte_young(hva_pte) || + pte_pfn(old_pte) != pte_pfn(hva_pte) || + (pte_dirty(old_pte) && !pte_dirty(hva_pte)); +} + +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +{ + unsigned long end = hva + PAGE_SIZE; + int ret; + + ret = handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pte); + if (ret) + kvm_mips_callbacks->flush_shadow_all(kvm); +} + +static int kvm_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, + struct kvm_memory_slot *memslot, void *data) +{ + return kvm_mips_mkold_gpa_pt(kvm, gfn, gfn_end); +} + +static int kvm_test_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end, + struct kvm_memory_slot *memslot, void *data) +{ + gpa_t gpa = gfn << PAGE_SHIFT; + pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa); + + if (!gpa_pte) + return 0; + return pte_young(*gpa_pte); +} + +int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) +{ + return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL); +} + +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL); +} + +/** + * _kvm_mips_map_page_fast() - Fast path GPA fault handler. + * @vcpu: VCPU pointer. + * @gpa: Guest physical address of fault. + * @write_fault: Whether the fault was due to a write. + * @out_entry: New PTE for @gpa (written on success unless NULL). + * @out_buddy: New PTE for @gpa's buddy (written on success unless + * NULL). + * + * Perform fast path GPA fault handling, doing all that can be done without + * calling into KVM. This handles marking old pages young (for idle page + * tracking), and dirtying of clean pages (for dirty page logging). + * + * Returns: 0 on success, in which case we can update derived mappings and + * resume guest execution. 
+ *		-EFAULT on failure due to absent GPA mapping or write to
+ *		read-only page, in which case KVM must be consulted.
+ */
+static int _kvm_mips_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa,
+				   bool write_fault,
+				   pte_t *out_entry, pte_t *out_buddy)
+{
+	struct kvm *kvm = vcpu->kvm;
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+	pte_t *ptep;
+	kvm_pfn_t pfn = 0;	/* silence bogus GCC warning */
+	bool pfn_valid = false;
+	int ret = 0;
+
+	spin_lock(&kvm->mmu_lock);
+
+	/* Fast path - just check GPA page table for an existing entry */
+	ptep = kvm_mips_pte_for_gpa(kvm, NULL, gpa);
+	if (!ptep || !pte_present(*ptep)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	/* Track access to pages marked old */
+	if (!pte_young(*ptep)) {
+		set_pte(ptep, pte_mkyoung(*ptep));
+		pfn = pte_pfn(*ptep);
+		pfn_valid = true;
+		/* call kvm_set_pfn_accessed() after unlock */
+	}
+	if (write_fault && !pte_dirty(*ptep)) {
+		if (!pte_write(*ptep)) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		/* Track dirtying of writeable pages */
+		set_pte(ptep, pte_mkdirty(*ptep));
+		pfn = pte_pfn(*ptep);
+		mark_page_dirty(kvm, gfn);
+		kvm_set_pfn_dirty(pfn);
+	}
+
+	if (out_entry)
+		*out_entry = *ptep;
+	if (out_buddy)
+		*out_buddy = *ptep_buddy(ptep);
+
+out:
+	spin_unlock(&kvm->mmu_lock);
+	if (pfn_valid)
+		kvm_set_pfn_accessed(pfn);
+	return ret;
+}
+
+/**
+ * kvm_mips_map_page() - Map a guest physical page.
+ * @vcpu:		VCPU pointer.
+ * @gpa:		Guest physical address of fault.
+ * @write_fault:	Whether the fault was due to a write.
+ * @out_entry:		New PTE for @gpa (written on success unless NULL).
+ * @out_buddy:		New PTE for @gpa's buddy (written on success unless
+ *			NULL).
+ *
+ * Handle GPA faults by creating a new GPA mapping (or updating an existing
+ * one).
+ *
+ * This takes care of marking pages young or dirty (idle/dirty page tracking),
+ * asking KVM for the corresponding PFN, and creating a mapping in the GPA page
+ * tables. Derived mappings (GVA page tables and TLBs) must be handled by the
+ * caller.
+ *
+ * Returns:	0 on success, in which case the caller may use the @out_entry
+ *		and @out_buddy PTEs to update derived mappings and resume guest
+ *		execution.
+ *		-EFAULT if there is no memory region at @gpa or a write was
+ *		attempted to a read-only memory region. This is usually handled
+ *		as an MMIO access.
+ */
+static int kvm_mips_map_page(struct kvm_vcpu *vcpu, unsigned long gpa,
+			     bool write_fault,
+			     pte_t *out_entry, pte_t *out_buddy)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+	int srcu_idx, err;
+	kvm_pfn_t pfn;
+	pte_t *ptep, entry, old_pte;
+	bool writeable;
+	unsigned long prot_bits;
+	unsigned long mmu_seq;
+
+	/* Try the fast path to handle old / clean pages */
 	srcu_idx = srcu_read_lock(&kvm->srcu);
-	pfn = gfn_to_pfn(kvm, gfn);
+	err = _kvm_mips_map_page_fast(vcpu, gpa, write_fault, out_entry,
+				      out_buddy);
+	if (!err)
+		goto out;
+	/* We need a minimum of cached pages ready for page table creation */
+	err = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES,
+				     KVM_NR_MEM_OBJS);
+	if (err)
+		goto out;
+
+retry:
+	/*
+	 * Used to check for invalidations in progress, of the pfn that is
+	 * returned by gfn_to_pfn_prot below.
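For reference, this retry protocol pairs with the generic KVM MMU notifier side (simplified here, and not part of this patch): the range is zapped before mmu_notifier_seq is bumped under mmu_lock, so kvm_mips_map_page() either observes the new sequence in mmu_notifier_retry() or serialises behind the zap.

	/* invalidate_range_start (simplified sketch) */
	spin_lock(&kvm->mmu_lock);
	kvm->mmu_notifier_count++;
	kvm_unmap_hva_range(kvm, start, end);	/* implemented in this file */
	spin_unlock(&kvm->mmu_lock);

	/* invalidate_range_end (simplified sketch) */
	spin_lock(&kvm->mmu_lock);
	kvm->mmu_notifier_seq++;
	kvm->mmu_notifier_count--;
	spin_unlock(&kvm->mmu_lock);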
+ */ + mmu_seq = kvm->mmu_notifier_seq; + /* + * Ensure the read of mmu_notifier_seq isn't reordered with PTE reads in + * gfn_to_pfn_prot() (which calls get_user_pages()), so that we don't + * risk the page we get a reference to getting unmapped before we have a + * chance to grab the mmu_lock without mmu_notifier_retry() noticing. + * + * This smp_rmb() pairs with the effective smp_wmb() of the combination + * of the pte_unmap_unlock() after the PTE is zapped, and the + * spin_lock() in kvm_mmu_notifier_invalidate_<page|range_end>() before + * mmu_notifier_seq is incremented. + */ + smp_rmb(); + + /* Slow path - ask KVM core whether we can access this GPA */ + pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writeable); if (is_error_noslot_pfn(pfn)) { - kvm_err("Couldn't get pfn for gfn %#llx!\n", gfn); err = -EFAULT; goto out; } - kvm->arch.guest_pmap[gfn] = pfn; + spin_lock(&kvm->mmu_lock); + /* Check if an invalidation has taken place since we got pfn */ + if (mmu_notifier_retry(kvm, mmu_seq)) { + /* + * This can happen when mappings are changed asynchronously, but + * also synchronously if a COW is triggered by + * gfn_to_pfn_prot(). + */ + spin_unlock(&kvm->mmu_lock); + kvm_release_pfn_clean(pfn); + goto retry; + } + + /* Ensure page tables are allocated */ + ptep = kvm_mips_pte_for_gpa(kvm, memcache, gpa); + + /* Set up the PTE */ + prot_bits = _PAGE_PRESENT | __READABLE | _page_cachable_default; + if (writeable) { + prot_bits |= _PAGE_WRITE; + if (write_fault) { + prot_bits |= __WRITEABLE; + mark_page_dirty(kvm, gfn); + kvm_set_pfn_dirty(pfn); + } + } + entry = pfn_pte(pfn, __pgprot(prot_bits)); + + /* Write the PTE */ + old_pte = *ptep; + set_pte(ptep, entry); + + err = 0; + if (out_entry) + *out_entry = *ptep; + if (out_buddy) + *out_buddy = *ptep_buddy(ptep); + + spin_unlock(&kvm->mmu_lock); + kvm_release_pfn_clean(pfn); + kvm_set_pfn_accessed(pfn); out: srcu_read_unlock(&kvm->srcu, srcu_idx); return err; } -/* Translate guest KSEG0 addresses to Host PA */ -unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu, - unsigned long gva) +static pte_t *kvm_trap_emul_pte_for_gva(struct kvm_vcpu *vcpu, + unsigned long addr) { - gfn_t gfn; - unsigned long offset = gva & ~PAGE_MASK; - struct kvm *kvm = vcpu->kvm; + struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; + pgd_t *pgdp; + int ret; + + /* We need a minimum of cached pages ready for page table creation */ + ret = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES, + KVM_NR_MEM_OBJS); + if (ret) + return NULL; + + if (KVM_GUEST_KERNEL_MODE(vcpu)) + pgdp = vcpu->arch.guest_kernel_mm.pgd; + else + pgdp = vcpu->arch.guest_user_mm.pgd; + + return kvm_mips_walk_pgd(pgdp, memcache, addr); +} - if (KVM_GUEST_KSEGX(gva) != KVM_GUEST_KSEG0) { - kvm_err("%s/%p: Invalid gva: %#lx\n", __func__, - __builtin_return_address(0), gva); - return KVM_INVALID_PAGE; +void kvm_trap_emul_invalidate_gva(struct kvm_vcpu *vcpu, unsigned long addr, + bool user) +{ + pgd_t *pgdp; + pte_t *ptep; + + addr &= PAGE_MASK << 1; + + pgdp = vcpu->arch.guest_kernel_mm.pgd; + ptep = kvm_mips_walk_pgd(pgdp, NULL, addr); + if (ptep) { + ptep[0] = pfn_pte(0, __pgprot(0)); + ptep[1] = pfn_pte(0, __pgprot(0)); + } + + if (user) { + pgdp = vcpu->arch.guest_user_mm.pgd; + ptep = kvm_mips_walk_pgd(pgdp, NULL, addr); + if (ptep) { + ptep[0] = pfn_pte(0, __pgprot(0)); + ptep[1] = pfn_pte(0, __pgprot(0)); + } } +} - gfn = (KVM_GUEST_CPHYSADDR(gva) >> PAGE_SHIFT); +/* + * kvm_mips_flush_gva_{pte,pmd,pud,pgd,pt}. 
+ * Flush a range of guest physical address space from the VM's GPA page tables. + */ - if (gfn >= kvm->arch.guest_pmap_npages) { - kvm_err("%s: Invalid gfn: %#llx, GVA: %#lx\n", __func__, gfn, - gva); - return KVM_INVALID_PAGE; +static bool kvm_mips_flush_gva_pte(pte_t *pte, unsigned long start_gva, + unsigned long end_gva) +{ + int i_min = __pte_offset(start_gva); + int i_max = __pte_offset(end_gva); + bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PTE - 1); + int i; + + /* + * There's no freeing to do, so there's no point clearing individual + * entries unless only part of the last level page table needs flushing. + */ + if (safe_to_remove) + return true; + + for (i = i_min; i <= i_max; ++i) { + if (!pte_present(pte[i])) + continue; + + set_pte(pte + i, __pte(0)); } + return false; +} - if (kvm_mips_map_page(vcpu->kvm, gfn) < 0) - return KVM_INVALID_ADDR; +static bool kvm_mips_flush_gva_pmd(pmd_t *pmd, unsigned long start_gva, + unsigned long end_gva) +{ + pte_t *pte; + unsigned long end = ~0ul; + int i_min = __pmd_offset(start_gva); + int i_max = __pmd_offset(end_gva); + bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PMD - 1); + int i; + + for (i = i_min; i <= i_max; ++i, start_gva = 0) { + if (!pmd_present(pmd[i])) + continue; + + pte = pte_offset(pmd + i, 0); + if (i == i_max) + end = end_gva; + + if (kvm_mips_flush_gva_pte(pte, start_gva, end)) { + pmd_clear(pmd + i); + pte_free_kernel(NULL, pte); + } else { + safe_to_remove = false; + } + } + return safe_to_remove; +} - return (kvm->arch.guest_pmap[gfn] << PAGE_SHIFT) + offset; +static bool kvm_mips_flush_gva_pud(pud_t *pud, unsigned long start_gva, + unsigned long end_gva) +{ + pmd_t *pmd; + unsigned long end = ~0ul; + int i_min = __pud_offset(start_gva); + int i_max = __pud_offset(end_gva); + bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PUD - 1); + int i; + + for (i = i_min; i <= i_max; ++i, start_gva = 0) { + if (!pud_present(pud[i])) + continue; + + pmd = pmd_offset(pud + i, 0); + if (i == i_max) + end = end_gva; + + if (kvm_mips_flush_gva_pmd(pmd, start_gva, end)) { + pud_clear(pud + i); + pmd_free(NULL, pmd); + } else { + safe_to_remove = false; + } + } + return safe_to_remove; +} + +static bool kvm_mips_flush_gva_pgd(pgd_t *pgd, unsigned long start_gva, + unsigned long end_gva) +{ + pud_t *pud; + unsigned long end = ~0ul; + int i_min = pgd_index(start_gva); + int i_max = pgd_index(end_gva); + bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PGD - 1); + int i; + + for (i = i_min; i <= i_max; ++i, start_gva = 0) { + if (!pgd_present(pgd[i])) + continue; + + pud = pud_offset(pgd + i, 0); + if (i == i_max) + end = end_gva; + + if (kvm_mips_flush_gva_pud(pud, start_gva, end)) { + pgd_clear(pgd + i); + pud_free(NULL, pud); + } else { + safe_to_remove = false; + } + } + return safe_to_remove; +} + +void kvm_mips_flush_gva_pt(pgd_t *pgd, enum kvm_mips_flush flags) +{ + if (flags & KMF_GPA) { + /* all of guest virtual address space could be affected */ + if (flags & KMF_KERN) + /* useg, kseg0, seg2/3 */ + kvm_mips_flush_gva_pgd(pgd, 0, 0x7fffffff); + else + /* useg */ + kvm_mips_flush_gva_pgd(pgd, 0, 0x3fffffff); + } else { + /* useg */ + kvm_mips_flush_gva_pgd(pgd, 0, 0x3fffffff); + + /* kseg2/3 */ + if (flags & KMF_KERN) + kvm_mips_flush_gva_pgd(pgd, 0x60000000, 0x7fffffff); + } +} + +static pte_t kvm_mips_gpa_pte_to_gva_unmapped(pte_t pte) +{ + /* + * Don't leak writeable but clean entries from GPA page tables. 
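A hedged call-site sketch for the flag logic above (no caller appears in this diff): flushing every GVA mapping derived from the GPA tables for a guest kernel address space has to cover useg, kseg0 and kseg2/3, i.e.:

	/* hypothetical call site */
	kvm_mips_flush_gva_pt(vcpu->arch.guest_kernel_mm.pgd, KMF_GPA | KMF_KERN);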
We don't + * want the normal Linux tlbmod handler to handle dirtying when KVM + * accesses guest memory. + */ + if (!pte_dirty(pte)) + pte = pte_wrprotect(pte); + + return pte; +} + +static pte_t kvm_mips_gpa_pte_to_gva_mapped(pte_t pte, long entrylo) +{ + /* Guest EntryLo overrides host EntryLo */ + if (!(entrylo & ENTRYLO_D)) + pte = pte_mkclean(pte); + + return kvm_mips_gpa_pte_to_gva_unmapped(pte); } /* XXXKYMA: Must be called with interrupts disabled */ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, - struct kvm_vcpu *vcpu) + struct kvm_vcpu *vcpu, + bool write_fault) { - gfn_t gfn; - kvm_pfn_t pfn0, pfn1; - unsigned long vaddr = 0; - unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0; - struct kvm *kvm = vcpu->kvm; - const int flush_dcache_mask = 0; - int ret; + unsigned long gpa; + pte_t pte_gpa[2], *ptep_gva; + int idx; if (KVM_GUEST_KSEGX(badvaddr) != KVM_GUEST_KSEG0) { kvm_err("%s: Invalid BadVaddr: %#lx\n", __func__, badvaddr); @@ -98,49 +1007,39 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, return -1; } - gfn = (KVM_GUEST_CPHYSADDR(badvaddr) >> PAGE_SHIFT); - if ((gfn | 1) >= kvm->arch.guest_pmap_npages) { - kvm_err("%s: Invalid gfn: %#llx, BadVaddr: %#lx\n", __func__, - gfn, badvaddr); - kvm_mips_dump_host_tlbs(); + /* Get the GPA page table entry */ + gpa = KVM_GUEST_CPHYSADDR(badvaddr); + idx = (badvaddr >> PAGE_SHIFT) & 1; + if (kvm_mips_map_page(vcpu, gpa, write_fault, &pte_gpa[idx], + &pte_gpa[!idx]) < 0) return -1; - } - vaddr = badvaddr & (PAGE_MASK << 1); - if (kvm_mips_map_page(vcpu->kvm, gfn) < 0) + /* Get the GVA page table entry */ + ptep_gva = kvm_trap_emul_pte_for_gva(vcpu, badvaddr & ~PAGE_SIZE); + if (!ptep_gva) { + kvm_err("No ptep for gva %lx\n", badvaddr); return -1; + } - if (kvm_mips_map_page(vcpu->kvm, gfn ^ 0x1) < 0) - return -1; - - pfn0 = kvm->arch.guest_pmap[gfn & ~0x1]; - pfn1 = kvm->arch.guest_pmap[gfn | 0x1]; - - entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | - ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) | - ENTRYLO_D | ENTRYLO_V; - entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | - ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) | - ENTRYLO_D | ENTRYLO_V; - - preempt_disable(); - entryhi = (vaddr | kvm_mips_get_kernel_asid(vcpu)); - ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1, - flush_dcache_mask); - preempt_enable(); + /* Copy a pair of entries from GPA page table to GVA page table */ + ptep_gva[0] = kvm_mips_gpa_pte_to_gva_unmapped(pte_gpa[0]); + ptep_gva[1] = kvm_mips_gpa_pte_to_gva_unmapped(pte_gpa[1]); - return ret; + /* Invalidate this entry in the TLB, guest kernel ASID only */ + kvm_mips_host_tlb_inv(vcpu, badvaddr, false, true); + return 0; } int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, - struct kvm_mips_tlb *tlb) + struct kvm_mips_tlb *tlb, + unsigned long gva, + bool write_fault) { - unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0; struct kvm *kvm = vcpu->kvm; - kvm_pfn_t pfn0, pfn1; - gfn_t gfn0, gfn1; long tlb_lo[2]; - int ret; + pte_t pte_gpa[2], *ptep_buddy, *ptep_gva; + unsigned int idx = TLB_LO_IDX(*tlb, gva); + bool kernel = KVM_GUEST_KERNEL_MODE(vcpu); tlb_lo[0] = tlb->tlb_lo[0]; tlb_lo[1] = tlb->tlb_lo[1]; @@ -149,70 +1048,64 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, * The commpage address must not be mapped to anything else if the guest * TLB contains entries nearby, or commpage accesses will break. 
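For illustration: kvm_mips_gpa_pte_to_gva_unmapped() above relies on one invariant, that a clean GPA entry must never reach the GVA tables with write permission, so the first guest write re-faults into KVM and dirtying stays under KVM's control rather than the host tlbmod handler's. A minimal userspace sketch of that write-protect-clean idiom follows; the toy_* type, flag names, and values are hypothetical stand-ins, not the kernel's:

#include <stdio.h>

#define TOY_WRITE 0x1 /* writable */
#define TOY_DIRTY 0x2 /* has been written through this mapping */

typedef unsigned long toy_pte_t;

/* Shape of kvm_mips_gpa_pte_to_gva_unmapped(): drop write on clean PTEs */
static toy_pte_t toy_gpa_to_gva(toy_pte_t pte)
{
	if (!(pte & TOY_DIRTY))
		pte &= ~TOY_WRITE;
	return pte;
}

int main(void)
{
	toy_pte_t clean = TOY_WRITE;             /* writable, not yet dirty */
	toy_pte_t dirty = TOY_WRITE | TOY_DIRTY;

	/* clean loses write permission: first guest write faults back to KVM */
	printf("clean -> %#lx\n", toy_gpa_to_gva(clean));
	/* dirty keeps write permission: no further faults needed */
	printf("dirty -> %#lx\n", toy_gpa_to_gva(dirty));
	return 0;
}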
*/ - if (!((tlb->tlb_hi ^ KVM_GUEST_COMMPAGE_ADDR) & - VPN2_MASK & (PAGE_MASK << 1))) - tlb_lo[(KVM_GUEST_COMMPAGE_ADDR >> PAGE_SHIFT) & 1] = 0; - - gfn0 = mips3_tlbpfn_to_paddr(tlb_lo[0]) >> PAGE_SHIFT; - gfn1 = mips3_tlbpfn_to_paddr(tlb_lo[1]) >> PAGE_SHIFT; - if (gfn0 >= kvm->arch.guest_pmap_npages || - gfn1 >= kvm->arch.guest_pmap_npages) { - kvm_err("%s: Invalid gfn: [%#llx, %#llx], EHi: %#lx\n", - __func__, gfn0, gfn1, tlb->tlb_hi); - kvm_mips_dump_guest_tlbs(vcpu); - return -1; - } + if (!((gva ^ KVM_GUEST_COMMPAGE_ADDR) & VPN2_MASK & (PAGE_MASK << 1))) + tlb_lo[TLB_LO_IDX(*tlb, KVM_GUEST_COMMPAGE_ADDR)] = 0; - if (kvm_mips_map_page(kvm, gfn0) < 0) + /* Get the GPA page table entry */ + if (kvm_mips_map_page(vcpu, mips3_tlbpfn_to_paddr(tlb_lo[idx]), + write_fault, &pte_gpa[idx], NULL) < 0) return -1; - if (kvm_mips_map_page(kvm, gfn1) < 0) + /* And its GVA buddy's GPA page table entry if it also exists */ + pte_gpa[!idx] = pfn_pte(0, __pgprot(0)); + if (tlb_lo[!idx] & ENTRYLO_V) { + spin_lock(&kvm->mmu_lock); + ptep_buddy = kvm_mips_pte_for_gpa(kvm, NULL, + mips3_tlbpfn_to_paddr(tlb_lo[!idx])); + if (ptep_buddy) + pte_gpa[!idx] = *ptep_buddy; + spin_unlock(&kvm->mmu_lock); + } + + /* Get the GVA page table entry pair */ + ptep_gva = kvm_trap_emul_pte_for_gva(vcpu, gva & ~PAGE_SIZE); + if (!ptep_gva) { + kvm_err("No ptep for gva %lx\n", gva); return -1; + } - pfn0 = kvm->arch.guest_pmap[gfn0]; - pfn1 = kvm->arch.guest_pmap[gfn1]; + /* Copy a pair of entries from GPA page table to GVA page table */ + ptep_gva[0] = kvm_mips_gpa_pte_to_gva_mapped(pte_gpa[0], tlb_lo[0]); + ptep_gva[1] = kvm_mips_gpa_pte_to_gva_mapped(pte_gpa[1], tlb_lo[1]); - /* Get attributes from the Guest TLB */ - entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | - ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) | - (tlb_lo[0] & ENTRYLO_D) | - (tlb_lo[0] & ENTRYLO_V); - entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | - ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) | - (tlb_lo[1] & ENTRYLO_D) | - (tlb_lo[1] & ENTRYLO_V); + /* Invalidate this entry in the TLB, current guest mode ASID only */ + kvm_mips_host_tlb_inv(vcpu, gva, !kernel, kernel); kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc, tlb->tlb_lo[0], tlb->tlb_lo[1]); - preempt_disable(); - entryhi = (tlb->tlb_hi & VPN2_MASK) | (KVM_GUEST_KERNEL_MODE(vcpu) ? 
- kvm_mips_get_kernel_asid(vcpu) : - kvm_mips_get_user_asid(vcpu)); - ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1, - tlb->tlb_mask); - preempt_enable(); - - return ret; + return 0; } -void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu, - struct kvm_vcpu *vcpu) +int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr, + struct kvm_vcpu *vcpu) { - unsigned long asid = asid_cache(cpu); - - asid += cpu_asid_inc(); - if (!(asid & cpu_asid_mask(&cpu_data[cpu]))) { - if (cpu_has_vtag_icache) - flush_icache_all(); - - kvm_local_flush_tlb_all(); /* start new asid cycle */ + kvm_pfn_t pfn; + pte_t *ptep; - if (!asid) /* fix version if needed */ - asid = asid_first_version(cpu); + ptep = kvm_trap_emul_pte_for_gva(vcpu, badvaddr); + if (!ptep) { + kvm_err("No ptep for commpage %lx\n", badvaddr); + return -1; } - cpu_context(cpu, mm) = asid_cache(cpu) = asid; + pfn = PFN_DOWN(virt_to_phys(vcpu->arch.kseg0_commpage)); + /* Also set valid and dirty, so refill handler doesn't have to */ + *ptep = pte_mkyoung(pte_mkdirty(pfn_pte(pfn, PAGE_SHARED))); + + /* Invalidate this entry in the TLB, guest kernel ASID only */ + kvm_mips_host_tlb_inv(vcpu, badvaddr, false, true); + return 0; } /** @@ -235,42 +1128,13 @@ static void kvm_mips_migrate_count(struct kvm_vcpu *vcpu) /* Restore ASID once we are scheduled back after preemption */ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - unsigned long asid_mask = cpu_asid_mask(&cpu_data[cpu]); unsigned long flags; - int newasid = 0; kvm_debug("%s: vcpu %p, cpu: %d\n", __func__, vcpu, cpu); - /* Allocate new kernel and user ASIDs if needed */ - local_irq_save(flags); - if ((vcpu->arch.guest_kernel_asid[cpu] ^ asid_cache(cpu)) & - asid_version_mask(cpu)) { - kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, cpu, vcpu); - vcpu->arch.guest_kernel_asid[cpu] = - vcpu->arch.guest_kernel_mm.context.asid[cpu]; - newasid++; - - kvm_debug("[%d]: cpu_context: %#lx\n", cpu, - cpu_context(cpu, current->mm)); - kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n", - cpu, vcpu->arch.guest_kernel_asid[cpu]); - } - - if ((vcpu->arch.guest_user_asid[cpu] ^ asid_cache(cpu)) & - asid_version_mask(cpu)) { - kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, vcpu); - vcpu->arch.guest_user_asid[cpu] = - vcpu->arch.guest_user_mm.context.asid[cpu]; - newasid++; - - kvm_debug("[%d]: cpu_context: %#lx\n", cpu, - cpu_context(cpu, current->mm)); - kvm_debug("[%d]: Allocated new ASID for Guest User: %#x\n", cpu, - vcpu->arch.guest_user_asid[cpu]); - } - + vcpu->cpu = cpu; if (vcpu->arch.last_sched_cpu != cpu) { kvm_debug("[%d->%d]KVM VCPU[%d] switch\n", vcpu->arch.last_sched_cpu, cpu, vcpu->vcpu_id); @@ -282,42 +1146,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_mips_migrate_count(vcpu); } - if (!newasid) { - /* - * If we preempted while the guest was executing, then reload - * the pre-empted ASID - */ - if (current->flags & PF_VCPU) { - write_c0_entryhi(vcpu->arch. - preempt_entryhi & asid_mask); - ehb(); - } - } else { - /* New ASIDs were allocated for the VM */ - - /* - * Were we in guest context? If so then the pre-empted ASID is - * no longer valid, we need to set it to what it should be based - * on the mode of the Guest (Kernel/User) - */ - if (current->flags & PF_VCPU) { - if (KVM_GUEST_KERNEL_MODE(vcpu)) - write_c0_entryhi(vcpu->arch. - guest_kernel_asid[cpu] & - asid_mask); - else - write_c0_entryhi(vcpu->arch. 
- guest_user_asid[cpu] & - asid_mask); - ehb(); - } - } - /* restore guest state to registers */ - kvm_mips_callbacks->vcpu_set_regs(vcpu); + kvm_mips_callbacks->vcpu_load(vcpu, cpu); local_irq_restore(flags); - } /* ASID can change if another task is scheduled during preemption */ @@ -329,75 +1161,90 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) local_irq_save(flags); cpu = smp_processor_id(); - - vcpu->arch.preempt_entryhi = read_c0_entryhi(); vcpu->arch.last_sched_cpu = cpu; + vcpu->cpu = -1; /* save guest state in registers */ - kvm_mips_callbacks->vcpu_get_regs(vcpu); - - if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) & - asid_version_mask(cpu))) { - kvm_debug("%s: Dropping MMU Context: %#lx\n", __func__, - cpu_context(cpu, current->mm)); - drop_mmu_context(current->mm, cpu); - } - write_c0_entryhi(cpu_asid(cpu, current->mm)); - ehb(); + kvm_mips_callbacks->vcpu_put(vcpu, cpu); local_irq_restore(flags); } -u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu) +/** + * kvm_trap_emul_gva_fault() - Safely attempt to handle a GVA access fault. + * @vcpu: Virtual CPU. + * @gva: Guest virtual address to be accessed. + * @write: True if write attempted (must be dirtied and made writable). + * + * Safely attempt to handle a GVA fault, mapping GVA pages if necessary, and + * dirtying the page if @write so that guest instructions can be modified. + * + * Returns: KVM_MIPS_MAPPED on success. + * KVM_MIPS_GVA if bad guest virtual address. + * KVM_MIPS_GPA if bad guest physical address. + * KVM_MIPS_TLB if guest TLB not present. + * KVM_MIPS_TLBINV if guest TLB present but not valid. + * KVM_MIPS_TLBMOD if guest TLB read only. + */ +enum kvm_mips_fault_result kvm_trap_emul_gva_fault(struct kvm_vcpu *vcpu, + unsigned long gva, + bool write) { struct mips_coproc *cop0 = vcpu->arch.cop0; - unsigned long paddr, flags, vpn2, asid; - unsigned long va = (unsigned long)opc; - void *vaddr; - u32 inst; + struct kvm_mips_tlb *tlb; int index; - if (KVM_GUEST_KSEGX(va) < KVM_GUEST_KSEG0 || - KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG23) { - local_irq_save(flags); - index = kvm_mips_host_tlb_lookup(vcpu, va); - if (index >= 0) { - inst = *(opc); - } else { - vpn2 = va & VPN2_MASK; - asid = kvm_read_c0_guest_entryhi(cop0) & - KVM_ENTRYHI_ASID; - index = kvm_mips_guest_tlb_lookup(vcpu, vpn2 | asid); - if (index < 0) { - kvm_err("%s: get_user_failed for %p, vcpu: %p, ASID: %#lx\n", - __func__, opc, vcpu, read_c0_entryhi()); - kvm_mips_dump_host_tlbs(); - kvm_mips_dump_guest_tlbs(vcpu); - local_irq_restore(flags); - return KVM_INVALID_INST; - } - if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, - &vcpu->arch.guest_tlb[index])) { - kvm_err("%s: handling mapped seg tlb fault failed for %p, index: %u, vcpu: %p, ASID: %#lx\n", - __func__, opc, index, vcpu, - read_c0_entryhi()); - kvm_mips_dump_guest_tlbs(vcpu); - local_irq_restore(flags); - return KVM_INVALID_INST; - } - inst = *(opc); - } - local_irq_restore(flags); - } else if (KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG0) { - paddr = kvm_mips_translate_guest_kseg0_to_hpa(vcpu, va); - vaddr = kmap_atomic(pfn_to_page(PHYS_PFN(paddr))); - vaddr += paddr & ~PAGE_MASK; - inst = *(u32 *)vaddr; - kunmap_atomic(vaddr); + if (KVM_GUEST_KSEGX(gva) == KVM_GUEST_KSEG0) { + if (kvm_mips_handle_kseg0_tlb_fault(gva, vcpu, write) < 0) + return KVM_MIPS_GPA; + } else if ((KVM_GUEST_KSEGX(gva) < KVM_GUEST_KSEG0) || + KVM_GUEST_KSEGX(gva) == KVM_GUEST_KSEG23) { + /* Address should be in the guest TLB */ + index = kvm_mips_guest_tlb_lookup(vcpu, (gva & VPN2_MASK) | + 
(kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID)); + if (index < 0) + return KVM_MIPS_TLB; + tlb = &vcpu->arch.guest_tlb[index]; + + /* Entry should be valid, and dirty for writes */ + if (!TLB_IS_VALID(*tlb, gva)) + return KVM_MIPS_TLBINV; + if (write && !TLB_IS_DIRTY(*tlb, gva)) + return KVM_MIPS_TLBMOD; + + if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, gva, write)) + return KVM_MIPS_GPA; } else { - kvm_err("%s: illegal address: %p\n", __func__, opc); - return KVM_INVALID_INST; + return KVM_MIPS_GVA; } - return inst; + return KVM_MIPS_MAPPED; +} + +int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out) +{ + int err; + +retry: + kvm_trap_emul_gva_lockless_begin(vcpu); + err = get_user(*out, opc); + kvm_trap_emul_gva_lockless_end(vcpu); + + if (unlikely(err)) { + /* + * Try to handle the fault, maybe we just raced with a GVA + * invalidation. + */ + err = kvm_trap_emul_gva_fault(vcpu, (unsigned long)opc, + false); + if (unlikely(err)) { + kvm_err("%s: illegal address: %p\n", + __func__, opc); + return -EFAULT; + } + + /* Hopefully it'll work now */ + goto retry; + } + return 0; } diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index 254377d8e0b9..2819eb793345 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -33,28 +33,20 @@ #define KVM_GUEST_PC_TLB 0 #define KVM_GUEST_SP_TLB 1 -atomic_t kvm_mips_instance; -EXPORT_SYMBOL_GPL(kvm_mips_instance); - static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu) { + struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; int cpu = smp_processor_id(); - return vcpu->arch.guest_kernel_asid[cpu] & - cpu_asid_mask(&cpu_data[cpu]); + return cpu_asid(cpu, kern_mm); } static u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu) { + struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; int cpu = smp_processor_id(); - return vcpu->arch.guest_user_asid[cpu] & - cpu_asid_mask(&cpu_data[cpu]); -} - -inline u32 kvm_mips_get_commpage_asid(struct kvm_vcpu *vcpu) -{ - return vcpu->kvm->arch.commpage_tlb; + return cpu_asid(cpu, user_mm); } /* Structure defining a TLB entry data set.
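The new kvm_get_inst() above shows the shape shared by all GVA accessors in this series: read through the user-access fast path, and on fault try to repair the mapping (the read may simply have raced with an asynchronous GVA invalidation) before retrying. A self-contained toy model of that loop, with hypothetical toy_* helpers standing in for get_user() and kvm_trap_emul_gva_fault():

#include <stdbool.h>
#include <stdio.h>

static bool toy_mapped; /* models whether the GVA is currently mapped */

static int toy_get_user(unsigned int *out)
{
	if (!toy_mapped)
		return -1; /* "fault": mapping was invalidated under us */
	*out = 0x03e00008; /* a MIPS "jr ra" instruction word */
	return 0;
}

static int toy_fixup(void)
{
	toy_mapped = true; /* remap, as kvm_trap_emul_gva_fault() would */
	return 0;
}

int main(void)
{
	unsigned int inst;
	int err;

retry:
	err = toy_get_user(&inst);
	if (err) {
		if (toy_fixup())
			return 1; /* genuinely bad address: give up */
		goto retry;       /* raced with invalidation: try again */
	}
	printf("inst = %#x\n", inst);
	return 0;
}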
*/ @@ -104,109 +96,6 @@ void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_mips_dump_guest_tlbs); -/* XXXKYMA: Must be called with interrupts disabled */ -/* set flush_dcache_mask == 0 if no dcache flush required */ -int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi, - unsigned long entrylo0, unsigned long entrylo1, - int flush_dcache_mask) -{ - unsigned long flags; - unsigned long old_entryhi; - int idx; - - local_irq_save(flags); - - old_entryhi = read_c0_entryhi(); - write_c0_entryhi(entryhi); - mtc0_tlbw_hazard(); - - tlb_probe(); - tlb_probe_hazard(); - idx = read_c0_index(); - - if (idx > current_cpu_data.tlbsize) { - kvm_err("%s: Invalid Index: %d\n", __func__, idx); - kvm_mips_dump_host_tlbs(); - local_irq_restore(flags); - return -1; - } - - write_c0_entrylo0(entrylo0); - write_c0_entrylo1(entrylo1); - mtc0_tlbw_hazard(); - - if (idx < 0) - tlb_write_random(); - else - tlb_write_indexed(); - tlbw_use_hazard(); - - kvm_debug("@ %#lx idx: %2d [entryhi(R): %#lx] entrylo0(R): 0x%08lx, entrylo1(R): 0x%08lx\n", - vcpu->arch.pc, idx, read_c0_entryhi(), - read_c0_entrylo0(), read_c0_entrylo1()); - - /* Flush D-cache */ - if (flush_dcache_mask) { - if (entrylo0 & ENTRYLO_V) { - ++vcpu->stat.flush_dcache_exits; - flush_data_cache_page((entryhi & VPN2_MASK) & - ~flush_dcache_mask); - } - if (entrylo1 & ENTRYLO_V) { - ++vcpu->stat.flush_dcache_exits; - flush_data_cache_page(((entryhi & VPN2_MASK) & - ~flush_dcache_mask) | - (0x1 << PAGE_SHIFT)); - } - } - - /* Restore old ASID */ - write_c0_entryhi(old_entryhi); - mtc0_tlbw_hazard(); - local_irq_restore(flags); - return 0; -} -EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_write); - -int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr, - struct kvm_vcpu *vcpu) -{ - kvm_pfn_t pfn; - unsigned long flags, old_entryhi = 0, vaddr = 0; - unsigned long entrylo[2] = { 0, 0 }; - unsigned int pair_idx; - - pfn = PFN_DOWN(virt_to_phys(vcpu->arch.kseg0_commpage)); - pair_idx = (badvaddr >> PAGE_SHIFT) & 1; - entrylo[pair_idx] = mips3_paddr_to_tlbpfn(pfn << PAGE_SHIFT) | - ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) | - ENTRYLO_D | ENTRYLO_V; - - local_irq_save(flags); - - old_entryhi = read_c0_entryhi(); - vaddr = badvaddr & (PAGE_MASK << 1); - write_c0_entryhi(vaddr | kvm_mips_get_kernel_asid(vcpu)); - write_c0_entrylo0(entrylo[0]); - write_c0_entrylo1(entrylo[1]); - write_c0_index(kvm_mips_get_commpage_asid(vcpu)); - mtc0_tlbw_hazard(); - tlb_write_indexed(); - tlbw_use_hazard(); - - kvm_debug("@ %#lx idx: %2d [entryhi(R): %#lx] entrylo0 (R): 0x%08lx, entrylo1(R): 0x%08lx\n", - vcpu->arch.pc, read_c0_index(), read_c0_entryhi(), - read_c0_entrylo0(), read_c0_entrylo1()); - - /* Restore old ASID */ - write_c0_entryhi(old_entryhi); - mtc0_tlbw_hazard(); - local_irq_restore(flags); - - return 0; -} -EXPORT_SYMBOL_GPL(kvm_mips_handle_commpage_tlb_fault); - int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi) { int i; @@ -228,51 +117,11 @@ int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi) } EXPORT_SYMBOL_GPL(kvm_mips_guest_tlb_lookup); -int kvm_mips_host_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long vaddr) -{ - unsigned long old_entryhi, flags; - int idx; - - local_irq_save(flags); - - old_entryhi = read_c0_entryhi(); - - if (KVM_GUEST_KERNEL_MODE(vcpu)) - write_c0_entryhi((vaddr & VPN2_MASK) | - kvm_mips_get_kernel_asid(vcpu)); - else { - write_c0_entryhi((vaddr & VPN2_MASK) | - kvm_mips_get_user_asid(vcpu)); - } - - mtc0_tlbw_hazard(); - - 
tlb_probe(); - tlb_probe_hazard(); - idx = read_c0_index(); - - /* Restore old ASID */ - write_c0_entryhi(old_entryhi); - mtc0_tlbw_hazard(); - - local_irq_restore(flags); - - kvm_debug("Host TLB lookup, %#lx, idx: %2d\n", vaddr, idx); - - return idx; -} -EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_lookup); - -int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va) +static int _kvm_mips_host_tlb_inv(unsigned long entryhi) { int idx; - unsigned long flags, old_entryhi; - - local_irq_save(flags); - - old_entryhi = read_c0_entryhi(); - write_c0_entryhi((va & VPN2_MASK) | kvm_mips_get_user_asid(vcpu)); + write_c0_entryhi(entryhi); mtc0_tlbw_hazard(); tlb_probe(); @@ -282,7 +131,7 @@ int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va) if (idx >= current_cpu_data.tlbsize) BUG(); - if (idx > 0) { + if (idx >= 0) { write_c0_entryhi(UNIQUE_ENTRYHI(idx)); write_c0_entrylo0(0); write_c0_entrylo1(0); @@ -292,93 +141,75 @@ int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va) tlbw_use_hazard(); } - write_c0_entryhi(old_entryhi); - mtc0_tlbw_hazard(); - - local_irq_restore(flags); - - if (idx > 0) - kvm_debug("%s: Invalidated entryhi %#lx @ idx %d\n", __func__, - (va & VPN2_MASK) | kvm_mips_get_user_asid(vcpu), idx); - - return 0; + return idx; } -EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_inv); -void kvm_mips_flush_host_tlb(int skip_kseg0) +int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va, + bool user, bool kernel) { - unsigned long flags; - unsigned long old_entryhi, entryhi; - unsigned long old_pagemask; - int entry = 0; - int maxentry = current_cpu_data.tlbsize; + int idx_user, idx_kernel; + unsigned long flags, old_entryhi; local_irq_save(flags); old_entryhi = read_c0_entryhi(); - old_pagemask = read_c0_pagemask(); - - /* Blast 'em all away. */ - for (entry = 0; entry < maxentry; entry++) { - write_c0_index(entry); - - if (skip_kseg0) { - mtc0_tlbr_hazard(); - tlb_read(); - tlb_read_hazard(); - - entryhi = read_c0_entryhi(); - /* Don't blow away guest kernel entries */ - if (KVM_GUEST_KSEGX(entryhi) == KVM_GUEST_KSEG0) - continue; - - write_c0_pagemask(old_pagemask); - } - - /* Make sure all entries differ. */ - write_c0_entryhi(UNIQUE_ENTRYHI(entry)); - write_c0_entrylo0(0); - write_c0_entrylo1(0); - mtc0_tlbw_hazard(); - - tlb_write_indexed(); - tlbw_use_hazard(); - } + if (user) + idx_user = _kvm_mips_host_tlb_inv((va & VPN2_MASK) | + kvm_mips_get_user_asid(vcpu)); + if (kernel) + idx_kernel = _kvm_mips_host_tlb_inv((va & VPN2_MASK) | + kvm_mips_get_kernel_asid(vcpu)); write_c0_entryhi(old_entryhi); - write_c0_pagemask(old_pagemask); mtc0_tlbw_hazard(); local_irq_restore(flags); + + if (user && idx_user >= 0) + kvm_debug("%s: Invalidated guest user entryhi %#lx @ idx %d\n", + __func__, (va & VPN2_MASK) | + kvm_mips_get_user_asid(vcpu), idx_user); + if (kernel && idx_kernel >= 0) + kvm_debug("%s: Invalidated guest kernel entryhi %#lx @ idx %d\n", + __func__, (va & VPN2_MASK) | + kvm_mips_get_kernel_asid(vcpu), idx_kernel); + + return 0; } -EXPORT_SYMBOL_GPL(kvm_mips_flush_host_tlb); +EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_inv); -void kvm_local_flush_tlb_all(void) +/** + * kvm_mips_suspend_mm() - Suspend the active mm. + * @cpu: The CPU we're running on. + * + * Suspend the active_mm, ready for a switch to a KVM guest virtual address + * space. This is left active for the duration of guest context, including time + * with interrupts enabled, so we need to be careful not to confuse e.g. cache + * management IPIs.
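kvm_mips_host_tlb_inv() above probes the host TLB up to twice because one guest virtual address may be cached under two host ASIDs, one for guest kernel mode and one for guest user mode. A toy userspace model of the probe-and-clear step; an array stands in for the TLB, the low byte for the ASID, and every name is a hypothetical stand-in:

#include <stdio.h>

#define TOY_TLB_SIZE 4
#define TOY_VPN_MASK (~0xffUL) /* low byte models the ASID field */

static unsigned long toy_tlb[TOY_TLB_SIZE]; /* entryhi per entry, 0 = free */

/* Model of _kvm_mips_host_tlb_inv(): probe for entryhi, clear on hit */
static int toy_tlb_inv(unsigned long entryhi)
{
	int idx;

	for (idx = 0; idx < TOY_TLB_SIZE; idx++) {
		if (toy_tlb[idx] == entryhi) {
			toy_tlb[idx] = 0;
			return idx;
		}
	}
	return -1; /* not cached: nothing to do, like a failed tlb_probe() */
}

int main(void)
{
	unsigned long va = 0x1000, kern_asid = 0x01, user_asid = 0x02;

	toy_tlb[1] = (va & TOY_VPN_MASK) | kern_asid;
	toy_tlb[3] = (va & TOY_VPN_MASK) | user_asid;

	/* One GVA may be cached under both ASIDs; invalidate each separately */
	printf("user hit @ %d\n", toy_tlb_inv((va & TOY_VPN_MASK) | user_asid));
	printf("kern hit @ %d\n", toy_tlb_inv((va & TOY_VPN_MASK) | kern_asid));
	return 0;
}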
+ * + * kvm_mips_resume_mm() should be called before context switching to a different + * process so we don't need to worry about reference counting. + * + * This needs to be in static kernel code to avoid exporting init_mm. + */ +void kvm_mips_suspend_mm(int cpu) { - unsigned long flags; - unsigned long old_ctx; - int entry = 0; - - local_irq_save(flags); - /* Save old context and create impossible VPN2 value */ - old_ctx = read_c0_entryhi(); - write_c0_entrylo0(0); - write_c0_entrylo1(0); - - /* Blast 'em all away. */ - while (entry < current_cpu_data.tlbsize) { - /* Make sure all entries differ. */ - write_c0_entryhi(UNIQUE_ENTRYHI(entry)); - write_c0_index(entry); - mtc0_tlbw_hazard(); - tlb_write_indexed(); - tlbw_use_hazard(); - entry++; - } - write_c0_entryhi(old_ctx); - mtc0_tlbw_hazard(); + cpumask_clear_cpu(cpu, mm_cpumask(current->active_mm)); + current->active_mm = &init_mm; +} +EXPORT_SYMBOL_GPL(kvm_mips_suspend_mm); - local_irq_restore(flags); +/** + * kvm_mips_resume_mm() - Resume the current process mm. + * @cpu: The CPU we're running on. + * + * Resume the mm of the current process, after a switch back from a KVM guest + * virtual address space (see kvm_mips_suspend_mm()). + */ +void kvm_mips_resume_mm(int cpu) +{ + cpumask_set_cpu(cpu, mm_cpumask(current->mm)); + current->active_mm = current->mm; } -EXPORT_SYMBOL_GPL(kvm_local_flush_tlb_all); +EXPORT_SYMBOL_GPL(kvm_mips_resume_mm); diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 3b20441f2beb..b1fa53b252ea 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -11,9 +11,11 @@ #include <linux/errno.h> #include <linux/err.h> -#include <linux/vmalloc.h> - #include <linux/kvm_host.h> +#include <linux/uaccess.h> +#include <linux/vmalloc.h> +#include <asm/mmu_context.h> +#include <asm/pgalloc.h> #include "interrupt.h" @@ -21,9 +23,12 @@ static gpa_t kvm_trap_emul_gva_to_gpa_cb(gva_t gva) { gpa_t gpa; gva_t kseg = KSEGX(gva); + gva_t gkseg = KVM_GUEST_KSEGX(gva); if ((kseg == CKSEG0) || (kseg == CKSEG1)) gpa = CPHYSADDR(gva); + else if (gkseg == KVM_GUEST_KSEG0) + gpa = KVM_GUEST_CPHYSADDR(gva); else { kvm_err("%s: cannot find GPA for GVA: %#lx\n", __func__, gva); kvm_mips_dump_host_tlbs(); @@ -83,48 +88,134 @@ static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu) return ret; } +static int kvm_mips_bad_load(u32 cause, u32 *opc, struct kvm_run *run, + struct kvm_vcpu *vcpu) +{ + enum emulation_result er; + union mips_instruction inst; + int err; + + /* A code fetch fault doesn't count as an MMIO */ + if (kvm_is_ifetch_fault(&vcpu->arch)) { + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return RESUME_HOST; + } + + /* Fetch the instruction. */ + if (cause & CAUSEF_BD) + opc += 1; + err = kvm_get_badinstr(opc, vcpu, &inst.word); + if (err) { + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return RESUME_HOST; + } + + /* Emulate the load */ + er = kvm_mips_emulate_load(inst, cause, run, vcpu); + if (er == EMULATE_FAIL) { + kvm_err("Emulate load from MMIO space failed\n"); + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + } else { + run->exit_reason = KVM_EXIT_MMIO; + } + return RESUME_HOST; +} + +static int kvm_mips_bad_store(u32 cause, u32 *opc, struct kvm_run *run, + struct kvm_vcpu *vcpu) +{ + enum emulation_result er; + union mips_instruction inst; + int err; + + /* Fetch the instruction.
*/ + if (cause & CAUSEF_BD) + opc += 1; + err = kvm_get_badinstr(opc, vcpu, &inst.word); + if (err) { + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return RESUME_HOST; + } + + /* Emulate the store */ + er = kvm_mips_emulate_store(inst, cause, run, vcpu); + if (er == EMULATE_FAIL) { + kvm_err("Emulate store to MMIO space failed\n"); + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + } else { + run->exit_reason = KVM_EXIT_MMIO; + } + return RESUME_HOST; +} + +static int kvm_mips_bad_access(u32 cause, u32 *opc, struct kvm_run *run, + struct kvm_vcpu *vcpu, bool store) +{ + if (store) + return kvm_mips_bad_store(cause, opc, run, vcpu); + else + return kvm_mips_bad_load(cause, opc, run, vcpu); +} + static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu) { + struct mips_coproc *cop0 = vcpu->arch.cop0; struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; u32 cause = vcpu->arch.host_cp0_cause; - enum emulation_result er = EMULATE_DONE; - int ret = RESUME_GUEST; + struct kvm_mips_tlb *tlb; + unsigned long entryhi; + int index; if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0 || KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) { - kvm_debug("USER/KSEG23 ADDR TLB MOD fault: cause %#x, PC: %p, BadVaddr: %#lx\n", - cause, opc, badvaddr); - er = kvm_mips_handle_tlbmod(cause, opc, run, vcpu); + /* + * First find the mapping in the guest TLB. If the failure to + * write was due to the guest TLB, it should be up to the guest + * to handle it. + */ + entryhi = (badvaddr & VPN2_MASK) | + (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID); + index = kvm_mips_guest_tlb_lookup(vcpu, entryhi); - if (er == EMULATE_DONE) - ret = RESUME_GUEST; - else { + /* + * These should never happen. + * They would indicate stale host TLB entries. + */ + if (unlikely(index < 0)) { run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - ret = RESUME_HOST; + return RESUME_HOST; } - } else if (KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG0) { + tlb = vcpu->arch.guest_tlb + index; + if (unlikely(!TLB_IS_VALID(*tlb, badvaddr))) { + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return RESUME_HOST; + } + /* - * XXXKYMA: The guest kernel does not expect to get this fault - * when we are not using HIGHMEM. Need to address this in a - * HIGHMEM kernel + * Guest entry not dirty? That would explain the TLB modified + * exception. Relay that on to the guest so it can handle it. 
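The rewritten TLB-mod handler above is a decision ladder: a missing or invalid guest TLB entry indicates stale host state, a clean entry is relayed back to the guest as its own TLB modified exception, and only dirty entries get mapped writable (or emulated as MMIO when not backed by RAM). A compact sketch of that ladder, with hypothetical toy_* names:

#include <stdbool.h>
#include <stdio.h>

enum toy_action { TOY_ERROR, TOY_TO_GUEST, TOY_MMIO, TOY_MAP_WRITE };

/* Decision ladder for a write hitting a guest-TLB-mapped address */
static enum toy_action toy_tlb_mod(bool found, bool valid, bool dirty,
				   bool backed_by_ram)
{
	if (!found || !valid)
		return TOY_ERROR;     /* stale host TLB; should never happen */
	if (!dirty)
		return TOY_TO_GUEST;  /* deliver TLB modified to the guest */
	if (!backed_by_ram)
		return TOY_MMIO;      /* emulate the store as MMIO */
	return TOY_MAP_WRITE;         /* make the GVA mapping writable */
}

int main(void)
{
	printf("%d %d %d\n",
	       toy_tlb_mod(true, true, false, true),  /* TOY_TO_GUEST */
	       toy_tlb_mod(true, true, true, false),  /* TOY_MMIO */
	       toy_tlb_mod(true, true, true, true));  /* TOY_MAP_WRITE */
	return 0;
}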
*/ - kvm_err("TLB MOD fault not handled, cause %#x, PC: %p, BadVaddr: %#lx\n", - cause, opc, badvaddr); - kvm_mips_dump_host_tlbs(); - kvm_arch_vcpu_dump_regs(vcpu); - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - ret = RESUME_HOST; + if (!TLB_IS_DIRTY(*tlb, badvaddr)) { + kvm_mips_emulate_tlbmod(cause, opc, run, vcpu); + return RESUME_GUEST; + } + + if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, badvaddr, + true)) + /* Not writable, needs handling as MMIO */ + return kvm_mips_bad_store(cause, opc, run, vcpu); + return RESUME_GUEST; + } else if (KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG0) { + if (kvm_mips_handle_kseg0_tlb_fault(badvaddr, vcpu, true) < 0) + /* Not writable, needs handling as MMIO */ + return kvm_mips_bad_store(cause, opc, run, vcpu); + return RESUME_GUEST; } else { - kvm_err("Illegal TLB Mod fault address , cause %#x, PC: %p, BadVaddr: %#lx\n", - cause, opc, badvaddr); - kvm_mips_dump_host_tlbs(); - kvm_arch_vcpu_dump_regs(vcpu); - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - ret = RESUME_HOST; + /* host kernel addresses are all handled as MMIO */ + return kvm_mips_bad_store(cause, opc, run, vcpu); } - return ret; } static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store) @@ -157,7 +248,7 @@ static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store) * into the shadow host TLB */ - er = kvm_mips_handle_tlbmiss(cause, opc, run, vcpu); + er = kvm_mips_handle_tlbmiss(cause, opc, run, vcpu, store); if (er == EMULATE_DONE) ret = RESUME_GUEST; else { @@ -169,29 +260,15 @@ static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store) * All KSEG0 faults are handled by KVM, as the guest kernel does * not expect to ever get them */ - if (kvm_mips_handle_kseg0_tlb_fault - (vcpu->arch.host_cp0_badvaddr, vcpu) < 0) { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - ret = RESUME_HOST; - } + if (kvm_mips_handle_kseg0_tlb_fault(badvaddr, vcpu, store) < 0) + ret = kvm_mips_bad_access(cause, opc, run, vcpu, store); } else if (KVM_GUEST_KERNEL_MODE(vcpu) && (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1)) { /* * With EVA we may get a TLB exception instead of an address * error when the guest performs MMIO to KSeg1 addresses. */ - kvm_debug("Emulate %s MMIO space\n", - store ? "Store to" : "Load from"); - er = kvm_mips_emulate_inst(cause, opc, run, vcpu); - if (er == EMULATE_FAIL) { - kvm_err("Emulate %s MMIO space failed\n", - store ? "Store to" : "Load from"); - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - ret = RESUME_HOST; - } else { - run->exit_reason = KVM_EXIT_MMIO; - ret = RESUME_HOST; - } + ret = kvm_mips_bad_access(cause, opc, run, vcpu, store); } else { kvm_err("Illegal TLB %s fault address , cause %#x, PC: %p, BadVaddr: %#lx\n", store ? 
"ST" : "LD", cause, opc, badvaddr); @@ -219,21 +296,11 @@ static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu) u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; u32 cause = vcpu->arch.host_cp0_cause; - enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; if (KVM_GUEST_KERNEL_MODE(vcpu) && (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1)) { - kvm_debug("Emulate Store to MMIO space\n"); - er = kvm_mips_emulate_inst(cause, opc, run, vcpu); - if (er == EMULATE_FAIL) { - kvm_err("Emulate Store to MMIO space failed\n"); - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - ret = RESUME_HOST; - } else { - run->exit_reason = KVM_EXIT_MMIO; - ret = RESUME_HOST; - } + ret = kvm_mips_bad_store(cause, opc, run, vcpu); } else { kvm_err("Address Error (STORE): cause %#x, PC: %p, BadVaddr: %#lx\n", cause, opc, badvaddr); @@ -249,26 +316,15 @@ static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu) u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; u32 cause = vcpu->arch.host_cp0_cause; - enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; if (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1) { - kvm_debug("Emulate Load from MMIO space @ %#lx\n", badvaddr); - er = kvm_mips_emulate_inst(cause, opc, run, vcpu); - if (er == EMULATE_FAIL) { - kvm_err("Emulate Load from MMIO space failed\n"); - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - ret = RESUME_HOST; - } else { - run->exit_reason = KVM_EXIT_MMIO; - ret = RESUME_HOST; - } + ret = kvm_mips_bad_load(cause, opc, run, vcpu); } else { kvm_err("Address Error (LOAD): cause %#x, PC: %p, BadVaddr: %#lx\n", cause, opc, badvaddr); run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; - er = EMULATE_FAIL; } return ret; } @@ -428,16 +484,75 @@ static int kvm_trap_emul_handle_msa_disabled(struct kvm_vcpu *vcpu) return ret; } -static int kvm_trap_emul_vm_init(struct kvm *kvm) +static int kvm_trap_emul_vcpu_init(struct kvm_vcpu *vcpu) { + struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; + struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; + + /* + * Allocate GVA -> HPA page tables. + * MIPS doesn't use the mm_struct pointer argument. 
+ */ + kern_mm->pgd = pgd_alloc(kern_mm); + if (!kern_mm->pgd) + return -ENOMEM; + + user_mm->pgd = pgd_alloc(user_mm); + if (!user_mm->pgd) { + pgd_free(kern_mm, kern_mm->pgd); + return -ENOMEM; + } + return 0; } -static int kvm_trap_emul_vcpu_init(struct kvm_vcpu *vcpu) +static void kvm_mips_emul_free_gva_pt(pgd_t *pgd) { - vcpu->arch.kscratch_enabled = 0xfc; + /* Don't free host kernel page tables copied from init_mm.pgd */ + const unsigned long end = 0x80000000; + unsigned long pgd_va, pud_va, pmd_va; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + int i, j, k; + + for (i = 0; i < USER_PTRS_PER_PGD; i++) { + if (pgd_none(pgd[i])) + continue; + + pgd_va = (unsigned long)i << PGDIR_SHIFT; + if (pgd_va >= end) + break; + pud = pud_offset(pgd + i, 0); + for (j = 0; j < PTRS_PER_PUD; j++) { + if (pud_none(pud[j])) + continue; + + pud_va = pgd_va | ((unsigned long)j << PUD_SHIFT); + if (pud_va >= end) + break; + pmd = pmd_offset(pud + j, 0); + for (k = 0; k < PTRS_PER_PMD; k++) { + if (pmd_none(pmd[k])) + continue; + + pmd_va = pud_va | (k << PMD_SHIFT); + if (pmd_va >= end) + break; + pte = pte_offset(pmd + k, 0); + pte_free_kernel(NULL, pte); + } + pmd_free(NULL, pmd); + } + pud_free(NULL, pud); + } + pgd_free(NULL, pgd); +} - return 0; +static void kvm_trap_emul_vcpu_uninit(struct kvm_vcpu *vcpu) +{ + kvm_mips_emul_free_gva_pt(vcpu->arch.guest_kernel_mm.pgd); + kvm_mips_emul_free_gva_pt(vcpu->arch.guest_user_mm.pgd); } static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu) @@ -499,6 +614,9 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu) /* Set Wait IE/IXMT Ignore in Config7, IAR, AR */ kvm_write_c0_guest_config7(cop0, (MIPS_CONF7_WII) | (1 << 10)); + /* Status */ + kvm_write_c0_guest_status(cop0, ST0_BEV | ST0_ERL); + /* * Setup IntCtl defaults, compatibility mode for timer interrupts (HW5) */ @@ -508,17 +626,76 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu) kvm_write_c0_guest_ebase(cop0, KVM_GUEST_KSEG0 | (vcpu_id & MIPS_EBASE_CPUNUM)); + /* Put PC at guest reset vector */ + vcpu->arch.pc = KVM_GUEST_CKSEG1ADDR(0x1fc00000); + return 0; } +static void kvm_trap_emul_flush_shadow_all(struct kvm *kvm) +{ + /* Flush GVA page tables and invalidate GVA ASIDs on all VCPUs */ + kvm_flush_remote_tlbs(kvm); +} + +static void kvm_trap_emul_flush_shadow_memslot(struct kvm *kvm, + const struct kvm_memory_slot *slot) +{ + kvm_trap_emul_flush_shadow_all(kvm); +} + +static u64 kvm_trap_emul_get_one_regs[] = { + KVM_REG_MIPS_CP0_INDEX, + KVM_REG_MIPS_CP0_ENTRYLO0, + KVM_REG_MIPS_CP0_ENTRYLO1, + KVM_REG_MIPS_CP0_CONTEXT, + KVM_REG_MIPS_CP0_USERLOCAL, + KVM_REG_MIPS_CP0_PAGEMASK, + KVM_REG_MIPS_CP0_WIRED, + KVM_REG_MIPS_CP0_HWRENA, + KVM_REG_MIPS_CP0_BADVADDR, + KVM_REG_MIPS_CP0_COUNT, + KVM_REG_MIPS_CP0_ENTRYHI, + KVM_REG_MIPS_CP0_COMPARE, + KVM_REG_MIPS_CP0_STATUS, + KVM_REG_MIPS_CP0_INTCTL, + KVM_REG_MIPS_CP0_CAUSE, + KVM_REG_MIPS_CP0_EPC, + KVM_REG_MIPS_CP0_PRID, + KVM_REG_MIPS_CP0_EBASE, + KVM_REG_MIPS_CP0_CONFIG, + KVM_REG_MIPS_CP0_CONFIG1, + KVM_REG_MIPS_CP0_CONFIG2, + KVM_REG_MIPS_CP0_CONFIG3, + KVM_REG_MIPS_CP0_CONFIG4, + KVM_REG_MIPS_CP0_CONFIG5, + KVM_REG_MIPS_CP0_CONFIG7, + KVM_REG_MIPS_CP0_ERROREPC, + KVM_REG_MIPS_CP0_KSCRATCH1, + KVM_REG_MIPS_CP0_KSCRATCH2, + KVM_REG_MIPS_CP0_KSCRATCH3, + KVM_REG_MIPS_CP0_KSCRATCH4, + KVM_REG_MIPS_CP0_KSCRATCH5, + KVM_REG_MIPS_CP0_KSCRATCH6, + + KVM_REG_MIPS_COUNT_CTL, + KVM_REG_MIPS_COUNT_RESUME, + KVM_REG_MIPS_COUNT_HZ, +}; + static unsigned long kvm_trap_emul_num_regs(struct kvm_vcpu *vcpu) { - return 0; + return 
ARRAY_SIZE(kvm_trap_emul_get_one_regs); } static int kvm_trap_emul_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices) { + if (copy_to_user(indices, kvm_trap_emul_get_one_regs, + sizeof(kvm_trap_emul_get_one_regs))) + return -EFAULT; + indices += ARRAY_SIZE(kvm_trap_emul_get_one_regs); + return 0; } @@ -526,7 +703,81 @@ static int kvm_trap_emul_get_one_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, s64 *v) { + struct mips_coproc *cop0 = vcpu->arch.cop0; + switch (reg->id) { + case KVM_REG_MIPS_CP0_INDEX: + *v = (long)kvm_read_c0_guest_index(cop0); + break; + case KVM_REG_MIPS_CP0_ENTRYLO0: + *v = kvm_read_c0_guest_entrylo0(cop0); + break; + case KVM_REG_MIPS_CP0_ENTRYLO1: + *v = kvm_read_c0_guest_entrylo1(cop0); + break; + case KVM_REG_MIPS_CP0_CONTEXT: + *v = (long)kvm_read_c0_guest_context(cop0); + break; + case KVM_REG_MIPS_CP0_USERLOCAL: + *v = (long)kvm_read_c0_guest_userlocal(cop0); + break; + case KVM_REG_MIPS_CP0_PAGEMASK: + *v = (long)kvm_read_c0_guest_pagemask(cop0); + break; + case KVM_REG_MIPS_CP0_WIRED: + *v = (long)kvm_read_c0_guest_wired(cop0); + break; + case KVM_REG_MIPS_CP0_HWRENA: + *v = (long)kvm_read_c0_guest_hwrena(cop0); + break; + case KVM_REG_MIPS_CP0_BADVADDR: + *v = (long)kvm_read_c0_guest_badvaddr(cop0); + break; + case KVM_REG_MIPS_CP0_ENTRYHI: + *v = (long)kvm_read_c0_guest_entryhi(cop0); + break; + case KVM_REG_MIPS_CP0_COMPARE: + *v = (long)kvm_read_c0_guest_compare(cop0); + break; + case KVM_REG_MIPS_CP0_STATUS: + *v = (long)kvm_read_c0_guest_status(cop0); + break; + case KVM_REG_MIPS_CP0_INTCTL: + *v = (long)kvm_read_c0_guest_intctl(cop0); + break; + case KVM_REG_MIPS_CP0_CAUSE: + *v = (long)kvm_read_c0_guest_cause(cop0); + break; + case KVM_REG_MIPS_CP0_EPC: + *v = (long)kvm_read_c0_guest_epc(cop0); + break; + case KVM_REG_MIPS_CP0_PRID: + *v = (long)kvm_read_c0_guest_prid(cop0); + break; + case KVM_REG_MIPS_CP0_EBASE: + *v = (long)kvm_read_c0_guest_ebase(cop0); + break; + case KVM_REG_MIPS_CP0_CONFIG: + *v = (long)kvm_read_c0_guest_config(cop0); + break; + case KVM_REG_MIPS_CP0_CONFIG1: + *v = (long)kvm_read_c0_guest_config1(cop0); + break; + case KVM_REG_MIPS_CP0_CONFIG2: + *v = (long)kvm_read_c0_guest_config2(cop0); + break; + case KVM_REG_MIPS_CP0_CONFIG3: + *v = (long)kvm_read_c0_guest_config3(cop0); + break; + case KVM_REG_MIPS_CP0_CONFIG4: + *v = (long)kvm_read_c0_guest_config4(cop0); + break; + case KVM_REG_MIPS_CP0_CONFIG5: + *v = (long)kvm_read_c0_guest_config5(cop0); + break; + case KVM_REG_MIPS_CP0_CONFIG7: + *v = (long)kvm_read_c0_guest_config7(cop0); + break; case KVM_REG_MIPS_CP0_COUNT: *v = kvm_mips_read_count(vcpu); break; @@ -539,6 +790,27 @@ static int kvm_trap_emul_get_one_reg(struct kvm_vcpu *vcpu, case KVM_REG_MIPS_COUNT_HZ: *v = vcpu->arch.count_hz; break; + case KVM_REG_MIPS_CP0_ERROREPC: + *v = (long)kvm_read_c0_guest_errorepc(cop0); + break; + case KVM_REG_MIPS_CP0_KSCRATCH1: + *v = (long)kvm_read_c0_guest_kscratch1(cop0); + break; + case KVM_REG_MIPS_CP0_KSCRATCH2: + *v = (long)kvm_read_c0_guest_kscratch2(cop0); + break; + case KVM_REG_MIPS_CP0_KSCRATCH3: + *v = (long)kvm_read_c0_guest_kscratch3(cop0); + break; + case KVM_REG_MIPS_CP0_KSCRATCH4: + *v = (long)kvm_read_c0_guest_kscratch4(cop0); + break; + case KVM_REG_MIPS_CP0_KSCRATCH5: + *v = (long)kvm_read_c0_guest_kscratch5(cop0); + break; + case KVM_REG_MIPS_CP0_KSCRATCH6: + *v = (long)kvm_read_c0_guest_kscratch6(cop0); + break; default: return -EINVAL; } @@ -554,6 +826,56 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu, unsigned int cur, 
change; switch (reg->id) { + case KVM_REG_MIPS_CP0_INDEX: + kvm_write_c0_guest_index(cop0, v); + break; + case KVM_REG_MIPS_CP0_ENTRYLO0: + kvm_write_c0_guest_entrylo0(cop0, v); + break; + case KVM_REG_MIPS_CP0_ENTRYLO1: + kvm_write_c0_guest_entrylo1(cop0, v); + break; + case KVM_REG_MIPS_CP0_CONTEXT: + kvm_write_c0_guest_context(cop0, v); + break; + case KVM_REG_MIPS_CP0_USERLOCAL: + kvm_write_c0_guest_userlocal(cop0, v); + break; + case KVM_REG_MIPS_CP0_PAGEMASK: + kvm_write_c0_guest_pagemask(cop0, v); + break; + case KVM_REG_MIPS_CP0_WIRED: + kvm_write_c0_guest_wired(cop0, v); + break; + case KVM_REG_MIPS_CP0_HWRENA: + kvm_write_c0_guest_hwrena(cop0, v); + break; + case KVM_REG_MIPS_CP0_BADVADDR: + kvm_write_c0_guest_badvaddr(cop0, v); + break; + case KVM_REG_MIPS_CP0_ENTRYHI: + kvm_write_c0_guest_entryhi(cop0, v); + break; + case KVM_REG_MIPS_CP0_STATUS: + kvm_write_c0_guest_status(cop0, v); + break; + case KVM_REG_MIPS_CP0_INTCTL: + /* No VInt, so no VS, read-only for now */ + break; + case KVM_REG_MIPS_CP0_EPC: + kvm_write_c0_guest_epc(cop0, v); + break; + case KVM_REG_MIPS_CP0_PRID: + kvm_write_c0_guest_prid(cop0, v); + break; + case KVM_REG_MIPS_CP0_EBASE: + /* + * Allow core number to be written, but the exception base must + * remain in guest KSeg0. + */ + kvm_change_c0_guest_ebase(cop0, 0x1ffff000 | MIPS_EBASE_CPUNUM, + v); + break; case KVM_REG_MIPS_CP0_COUNT: kvm_mips_write_count(vcpu, v); break; @@ -618,6 +940,9 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu, kvm_write_c0_guest_config5(cop0, v); } break; + case KVM_REG_MIPS_CP0_CONFIG7: + /* writes ignored */ + break; case KVM_REG_MIPS_COUNT_CTL: ret = kvm_mips_set_count_ctl(vcpu, v); break; @@ -627,24 +952,269 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu, case KVM_REG_MIPS_COUNT_HZ: ret = kvm_mips_set_count_hz(vcpu, v); break; + case KVM_REG_MIPS_CP0_ERROREPC: + kvm_write_c0_guest_errorepc(cop0, v); + break; + case KVM_REG_MIPS_CP0_KSCRATCH1: + kvm_write_c0_guest_kscratch1(cop0, v); + break; + case KVM_REG_MIPS_CP0_KSCRATCH2: + kvm_write_c0_guest_kscratch2(cop0, v); + break; + case KVM_REG_MIPS_CP0_KSCRATCH3: + kvm_write_c0_guest_kscratch3(cop0, v); + break; + case KVM_REG_MIPS_CP0_KSCRATCH4: + kvm_write_c0_guest_kscratch4(cop0, v); + break; + case KVM_REG_MIPS_CP0_KSCRATCH5: + kvm_write_c0_guest_kscratch5(cop0, v); + break; + case KVM_REG_MIPS_CP0_KSCRATCH6: + kvm_write_c0_guest_kscratch6(cop0, v); + break; default: return -EINVAL; } return ret; } -static int kvm_trap_emul_vcpu_get_regs(struct kvm_vcpu *vcpu) +static int kvm_trap_emul_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - kvm_lose_fpu(vcpu); + struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; + struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; + struct mm_struct *mm; + + /* + * Were we in guest context? If so, restore the appropriate ASID based + * on the mode of the Guest (Kernel/User). + */ + if (current->flags & PF_VCPU) { + mm = KVM_GUEST_KERNEL_MODE(vcpu) ? 
kern_mm : user_mm; + if ((cpu_context(cpu, mm) ^ asid_cache(cpu)) & + asid_version_mask(cpu)) + get_new_mmu_context(mm, cpu); + write_c0_entryhi(cpu_asid(cpu, mm)); + TLBMISS_HANDLER_SETUP_PGD(mm->pgd); + kvm_mips_suspend_mm(cpu); + ehb(); + } return 0; } -static int kvm_trap_emul_vcpu_set_regs(struct kvm_vcpu *vcpu) +static int kvm_trap_emul_vcpu_put(struct kvm_vcpu *vcpu, int cpu) { + kvm_lose_fpu(vcpu); + + if (current->flags & PF_VCPU) { + /* Restore normal Linux process memory map */ + if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) & + asid_version_mask(cpu))) + get_new_mmu_context(current->mm, cpu); + write_c0_entryhi(cpu_asid(cpu, current->mm)); + TLBMISS_HANDLER_SETUP_PGD(current->mm->pgd); + kvm_mips_resume_mm(cpu); + ehb(); + } + return 0; } +static void kvm_trap_emul_check_requests(struct kvm_vcpu *vcpu, int cpu, + bool reload_asid) +{ + struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; + struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; + struct mm_struct *mm; + int i; + + if (likely(!vcpu->requests)) + return; + + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) { + /* + * Both kernel & user GVA mappings must be invalidated. The + * caller is just about to check whether the ASID is stale + * anyway so no need to reload it here. + */ + kvm_mips_flush_gva_pt(kern_mm->pgd, KMF_GPA | KMF_KERN); + kvm_mips_flush_gva_pt(user_mm->pgd, KMF_GPA | KMF_USER); + for_each_possible_cpu(i) { + cpu_context(i, kern_mm) = 0; + cpu_context(i, user_mm) = 0; + } + + /* Generate new ASID for current mode */ + if (reload_asid) { + mm = KVM_GUEST_KERNEL_MODE(vcpu) ? kern_mm : user_mm; + get_new_mmu_context(mm, cpu); + htw_stop(); + write_c0_entryhi(cpu_asid(cpu, mm)); + TLBMISS_HANDLER_SETUP_PGD(mm->pgd); + htw_start(); + } + } +} + +/** + * kvm_trap_emul_gva_lockless_begin() - Begin lockless access to GVA space. + * @vcpu: VCPU pointer. + * + * Call before a GVA space access outside of guest mode, to ensure that + * asynchronous TLB flush requests are handled or delayed until completion of + * the GVA access (as indicated by a matching kvm_trap_emul_gva_lockless_end()). + * + * Should be called with IRQs already enabled. + */ +void kvm_trap_emul_gva_lockless_begin(struct kvm_vcpu *vcpu) +{ + /* We re-enable IRQs in kvm_trap_emul_gva_lockless_end() */ + WARN_ON_ONCE(irqs_disabled()); + + /* + * The caller is about to access the GVA space, so we set the mode to + * force TLB flush requests to send an IPI, and also disable IRQs to + * delay IPI handling until kvm_trap_emul_gva_lockless_end(). + */ + local_irq_disable(); + + /* + * Make sure the read of VCPU requests is not reordered ahead of the + * write to vcpu->mode, or we could miss a TLB flush request while + * the requester sees the VCPU as outside of guest mode and not needing + * an IPI. + */ + smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES); + + /* + * If a TLB flush has been requested (potentially while + * OUTSIDE_GUEST_MODE and assumed immediately effective), perform it + * before accessing the GVA space, and be sure to reload the ASID if + * necessary as it'll be immediately used. + * + * TLB flush requests after this check will trigger an IPI due to the + * mode change above, which will be delayed due to IRQs disabled. + */ + kvm_trap_emul_check_requests(vcpu, smp_processor_id(), true); +} + +/** + * kvm_trap_emul_gva_lockless_end() - End lockless access to GVA space. + * @vcpu: VCPU pointer. + * + * Called after a GVA space access outside of guest mode. 
Should have a matching + * call to kvm_trap_emul_gva_lockless_begin(). + */ +void kvm_trap_emul_gva_lockless_end(struct kvm_vcpu *vcpu) +{ + /* + * Make sure the write to vcpu->mode is not reordered in front of GVA + * accesses, or a TLB flush requester may not think it necessary to send + * an IPI. + */ + smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); + + /* + * Now that the access to GVA space is complete, it's safe for pending + * TLB flush request IPIs to be handled (which indicates completion). + */ + local_irq_enable(); +} + +static void kvm_trap_emul_vcpu_reenter(struct kvm_run *run, + struct kvm_vcpu *vcpu) +{ + struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; + struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; + struct mm_struct *mm; + struct mips_coproc *cop0 = vcpu->arch.cop0; + int i, cpu = smp_processor_id(); + unsigned int gasid; + + /* + * No need to reload ASID, IRQs are disabled already so there's no rush, + * and we'll check if we need to regenerate below anyway before + * re-entering the guest. + */ + kvm_trap_emul_check_requests(vcpu, cpu, false); + + if (KVM_GUEST_KERNEL_MODE(vcpu)) { + mm = kern_mm; + } else { + mm = user_mm; + + /* + * Lazy host ASID regeneration / PT flush for guest user mode. + * If the guest ASID has changed since the last guest usermode + * execution, invalidate the stale TLB entries and flush GVA PT + * entries too. + */ + gasid = kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID; + if (gasid != vcpu->arch.last_user_gasid) { + kvm_mips_flush_gva_pt(user_mm->pgd, KMF_USER); + for_each_possible_cpu(i) + cpu_context(i, user_mm) = 0; + vcpu->arch.last_user_gasid = gasid; + } + } + + /* + * Check if ASID is stale. This may happen due to a TLB flush request or + * a lazy user MM invalidation. + */ + if ((cpu_context(cpu, mm) ^ asid_cache(cpu)) & + asid_version_mask(cpu)) + get_new_mmu_context(mm, cpu); +} + +static int kvm_trap_emul_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) +{ + int cpu = smp_processor_id(); + int r; + + /* Check if we have any exceptions/interrupts pending */ + kvm_mips_deliver_interrupts(vcpu, + kvm_read_c0_guest_cause(vcpu->arch.cop0)); + + kvm_trap_emul_vcpu_reenter(run, vcpu); + + /* + * We use user accessors to access guest memory, but we don't want to + * invoke Linux page faulting. + */ + pagefault_disable(); + + /* Disable hardware page table walking while in guest */ + htw_stop(); + + /* + * While in guest context we're in the guest's address space, not the + * host process address space, so we need to be careful not to confuse + * e.g. cache management IPIs.
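The lockless begin/end pair above is a small publication protocol: store the mode with a full barrier before checking pending requests, then store it back with release ordering after the accesses, so a concurrent flush requester either sees the new mode (and sends an IPI) or its request is seen and handled here. A userspace approximation using C11 atomics; the IRQ disable/enable half cannot be modelled in userspace, and all toy_* names are hypothetical:

#include <stdatomic.h>
#include <stdio.h>

enum toy_mode { TOY_OUTSIDE_GUEST_MODE, TOY_READING_SHADOW_PAGE_TABLES };

static _Atomic int toy_vcpu_mode = TOY_OUTSIDE_GUEST_MODE;
static _Atomic int toy_flush_requested;

static void toy_lockless_begin(void)
{
	/* Like smp_store_mb(): publish the mode with a full barrier before
	 * looking at pending requests, so the requester either sees the new
	 * mode or we see its request here. */
	atomic_store_explicit(&toy_vcpu_mode, TOY_READING_SHADOW_PAGE_TABLES,
			      memory_order_seq_cst);
	if (atomic_exchange_explicit(&toy_flush_requested, 0,
				     memory_order_seq_cst))
		printf("handled pending flush before the access\n");
}

static void toy_lockless_end(void)
{
	/* Like smp_store_release(): the accesses above cannot be reordered
	 * past this mode store. */
	atomic_store_explicit(&toy_vcpu_mode, TOY_OUTSIDE_GUEST_MODE,
			      memory_order_release);
}

int main(void)
{
	atomic_store(&toy_flush_requested, 1); /* a racing flush request */
	toy_lockless_begin();
	/* ... the GVA space access would happen here ... */
	toy_lockless_end();
	return 0;
}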
+ */ + kvm_mips_suspend_mm(cpu); + + r = vcpu->arch.vcpu_run(run, vcpu); + + /* We may have migrated while handling guest exits */ + cpu = smp_processor_id(); + + /* Restore normal Linux process memory map */ + if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) & + asid_version_mask(cpu))) + get_new_mmu_context(current->mm, cpu); + write_c0_entryhi(cpu_asid(cpu, current->mm)); + TLBMISS_HANDLER_SETUP_PGD(current->mm->pgd); + kvm_mips_resume_mm(cpu); + + htw_start(); + + pagefault_enable(); + + return r; +} + static struct kvm_mips_callbacks kvm_trap_emul_callbacks = { /* exit handlers */ .handle_cop_unusable = kvm_trap_emul_handle_cop_unusable, @@ -661,9 +1231,11 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = { .handle_fpe = kvm_trap_emul_handle_fpe, .handle_msa_disabled = kvm_trap_emul_handle_msa_disabled, - .vm_init = kvm_trap_emul_vm_init, .vcpu_init = kvm_trap_emul_vcpu_init, + .vcpu_uninit = kvm_trap_emul_vcpu_uninit, .vcpu_setup = kvm_trap_emul_vcpu_setup, + .flush_shadow_all = kvm_trap_emul_flush_shadow_all, + .flush_shadow_memslot = kvm_trap_emul_flush_shadow_memslot, .gva_to_gpa = kvm_trap_emul_gva_to_gpa_cb, .queue_timer_int = kvm_mips_queue_timer_int_cb, .dequeue_timer_int = kvm_mips_dequeue_timer_int_cb, @@ -675,8 +1247,10 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = { .copy_reg_indices = kvm_trap_emul_copy_reg_indices, .get_one_reg = kvm_trap_emul_get_one_reg, .set_one_reg = kvm_trap_emul_set_one_reg, - .vcpu_get_regs = kvm_trap_emul_vcpu_get_regs, - .vcpu_set_regs = kvm_trap_emul_vcpu_set_regs, + .vcpu_load = kvm_trap_emul_vcpu_load, + .vcpu_put = kvm_trap_emul_vcpu_put, + .vcpu_run = kvm_trap_emul_vcpu_run, + .vcpu_reenter = kvm_trap_emul_vcpu_reenter, }; int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks) diff --git a/arch/mips/mm/Makefile b/arch/mips/mm/Makefile index b4c64bd3f723..b4cc8811a664 100644 --- a/arch/mips/mm/Makefile +++ b/arch/mips/mm/Makefile @@ -4,7 +4,7 @@ obj-y += cache.o dma-default.o extable.o fault.o \ gup.o init.o mmap.o page.o page-funcs.o \ - tlbex.o tlbex-fault.o tlb-funcs.o + pgtable.o tlbex.o tlbex-fault.o tlb-funcs.o ifdef CONFIG_CPU_MICROMIPS obj-y += uasm-micromips.o diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index e86ebcf5c071..653569bc0da7 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -538,5 +538,6 @@ unsigned long pgd_current[NR_CPUS]; pgd_t swapper_pg_dir[_PTRS_PER_PGD] __section(.bss..swapper_pg_dir); #ifndef __PAGETABLE_PMD_FOLDED pmd_t invalid_pmd_table[PTRS_PER_PMD] __page_aligned_bss; +EXPORT_SYMBOL_GPL(invalid_pmd_table); #endif pte_t invalid_pte_table[PTRS_PER_PTE] __page_aligned_bss; diff --git a/arch/mips/mm/pgtable-64.c b/arch/mips/mm/pgtable-64.c index ce4473e7c0d2..0ae7b28b4db5 100644 --- a/arch/mips/mm/pgtable-64.c +++ b/arch/mips/mm/pgtable-64.c @@ -6,6 +6,7 @@ * Copyright (C) 1999, 2000 by Silicon Graphics * Copyright (C) 2003 by Ralf Baechle */ +#include <linux/export.h> #include <linux/init.h> #include <linux/mm.h> #include <asm/fixmap.h> @@ -60,6 +61,7 @@ void pmd_init(unsigned long addr, unsigned long pagetable) p[-1] = pagetable; } while (p != end); } +EXPORT_SYMBOL_GPL(pmd_init); #endif pmd_t mk_pmd(struct page *page, pgprot_t prot) diff --git a/arch/mips/mm/pgtable.c b/arch/mips/mm/pgtable.c new file mode 100644 index 000000000000..05560b042d82 --- /dev/null +++ b/arch/mips/mm/pgtable.c @@ -0,0 +1,25 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. 
See the file "COPYING" in the main directory of this archive + * for more details. + */ +#include <linux/export.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <asm/pgalloc.h> + +pgd_t *pgd_alloc(struct mm_struct *mm) +{ + pgd_t *ret, *init; + + ret = (pgd_t *) __get_free_pages(GFP_KERNEL, PGD_ORDER); + if (ret) { + init = pgd_offset(&init_mm, 0UL); + pgd_init((unsigned long)ret); + memcpy(ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD, + (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); + } + + return ret; +} +EXPORT_SYMBOL_GPL(pgd_alloc); diff --git a/arch/mips/mm/tlbex.c b/arch/mips/mm/tlbex.c index 55ce39606cb8..2465f83c79c3 100644 --- a/arch/mips/mm/tlbex.c +++ b/arch/mips/mm/tlbex.c @@ -22,6 +22,7 @@ */ #include <linux/bug.h> +#include <linux/export.h> #include <linux/kernel.h> #include <linux/types.h> #include <linux/smp.h> @@ -34,6 +35,7 @@ #include <asm/war.h> #include <asm/uasm.h> #include <asm/setup.h> +#include <asm/tlbex.h> static int mips_xpa_disabled; @@ -344,7 +346,8 @@ static int allocate_kscratch(void) } static int scratch_reg; -static int pgd_reg; +int pgd_reg; +EXPORT_SYMBOL_GPL(pgd_reg); enum vmalloc64_mode {not_refill, refill_scratch, refill_noscratch}; static struct work_registers build_get_work_registers(u32 **p) @@ -496,15 +499,9 @@ static void __maybe_unused build_tlb_probe_entry(u32 **p) } } -/* - * Write random or indexed TLB entry, and care about the hazards from - * the preceding mtc0 and for the following eret. - */ -enum tlb_write_entry { tlb_random, tlb_indexed }; - -static void build_tlb_write_entry(u32 **p, struct uasm_label **l, - struct uasm_reloc **r, - enum tlb_write_entry wmode) +void build_tlb_write_entry(u32 **p, struct uasm_label **l, + struct uasm_reloc **r, + enum tlb_write_entry wmode) { void(*tlbw)(u32 **) = NULL; @@ -627,6 +624,7 @@ static void build_tlb_write_entry(u32 **p, struct uasm_label **l, break; } } +EXPORT_SYMBOL_GPL(build_tlb_write_entry); static __maybe_unused void build_convert_pte_to_entrylo(u32 **p, unsigned int reg) @@ -781,9 +779,8 @@ static void build_huge_handler_tail(u32 **p, struct uasm_reloc **r, * TMP and PTR are scratch. * TMP will be clobbered, PTR will hold the pmd entry. */ -static void -build_get_pmde64(u32 **p, struct uasm_label **l, struct uasm_reloc **r, - unsigned int tmp, unsigned int ptr) +void build_get_pmde64(u32 **p, struct uasm_label **l, struct uasm_reloc **r, + unsigned int tmp, unsigned int ptr) { #ifndef CONFIG_MIPS_PGD_C0_CONTEXT long pgdc = (long)pgd_current; @@ -859,6 +856,7 @@ build_get_pmde64(u32 **p, struct uasm_label **l, struct uasm_reloc **r, uasm_i_daddu(p, ptr, ptr, tmp); /* add in pmd offset */ #endif } +EXPORT_SYMBOL_GPL(build_get_pmde64); /* * BVADDR is the faulting address, PTR is scratch. @@ -934,8 +932,7 @@ build_get_pgd_vmalloc64(u32 **p, struct uasm_label **l, struct uasm_reloc **r, * TMP and PTR are scratch. * TMP will be clobbered, PTR will hold the pgd entry. 
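pgd_alloc() above gives every new PGD a cleared, private user half while sharing the kernel half copied from init_mm, which is also why kvm_mips_emul_free_gva_pt() earlier stops at 0x80000000 rather than freeing kernel page tables it does not own. A toy model of that split, with hypothetical TOY_* constants and placeholder entry values:

#include <stdio.h>
#include <string.h>

#define TOY_PTRS_PER_PGD 8
#define TOY_USER_PTRS_PER_PGD 4 /* low half user, high half kernel */

/* Stand-in for init_mm.pgd: only the kernel half is populated */
static unsigned long toy_init_pgd[TOY_PTRS_PER_PGD] = {
	[TOY_USER_PTRS_PER_PGD] = 0xc0000001UL,
	[TOY_USER_PTRS_PER_PGD + 1] = 0xc0000002UL,
};

/* Mirror of pgd_alloc(): clear user half, share kernel half with init_mm */
static void toy_pgd_alloc(unsigned long *pgd)
{
	memset(pgd, 0, TOY_USER_PTRS_PER_PGD * sizeof(*pgd));
	memcpy(pgd + TOY_USER_PTRS_PER_PGD,
	       toy_init_pgd + TOY_USER_PTRS_PER_PGD,
	       (TOY_PTRS_PER_PGD - TOY_USER_PTRS_PER_PGD) * sizeof(*pgd));
}

int main(void)
{
	unsigned long pgd[TOY_PTRS_PER_PGD];
	int i;

	toy_pgd_alloc(pgd);
	for (i = 0; i < TOY_PTRS_PER_PGD; i++)
		printf("pgd[%d] = %#lx\n", i, pgd[i]);
	return 0;
}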
*/ -static void __maybe_unused -build_get_pgde32(u32 **p, unsigned int tmp, unsigned int ptr) +void build_get_pgde32(u32 **p, unsigned int tmp, unsigned int ptr) { if (pgd_reg != -1) { /* pgd is in pgd_reg */ @@ -960,6 +957,7 @@ build_get_pgde32(u32 **p, unsigned int tmp, unsigned int ptr) uasm_i_sll(p, tmp, tmp, PGD_T_LOG2); uasm_i_addu(p, ptr, ptr, tmp); /* add in pgd offset */ } +EXPORT_SYMBOL_GPL(build_get_pgde32); #endif /* !CONFIG_64BIT */ @@ -989,7 +987,7 @@ static void build_adjust_context(u32 **p, unsigned int ctx) uasm_i_andi(p, ctx, ctx, mask); } -static void build_get_ptep(u32 **p, unsigned int tmp, unsigned int ptr) +void build_get_ptep(u32 **p, unsigned int tmp, unsigned int ptr) { /* * Bug workaround for the Nevada. It seems as if under certain @@ -1013,8 +1011,9 @@ static void build_get_ptep(u32 **p, unsigned int tmp, unsigned int ptr) build_adjust_context(p, tmp); UASM_i_ADDU(p, ptr, ptr, tmp); /* add in offset */ } +EXPORT_SYMBOL_GPL(build_get_ptep); -static void build_update_entries(u32 **p, unsigned int tmp, unsigned int ptep) +void build_update_entries(u32 **p, unsigned int tmp, unsigned int ptep) { int pte_off_even = 0; int pte_off_odd = sizeof(pte_t); @@ -1063,6 +1062,7 @@ static void build_update_entries(u32 **p, unsigned int tmp, unsigned int ptep) UASM_i_MTC0(p, 0, C0_ENTRYLO1); UASM_i_MTC0(p, ptep, C0_ENTRYLO1); /* load it */ } +EXPORT_SYMBOL_GPL(build_update_entries); struct mips_huge_tlb_info { int huge_pte; @@ -1536,7 +1536,9 @@ static void build_loongson3_tlb_refill_handler(void) extern u32 handle_tlbl[], handle_tlbl_end[]; extern u32 handle_tlbs[], handle_tlbs_end[]; extern u32 handle_tlbm[], handle_tlbm_end[]; -extern u32 tlbmiss_handler_setup_pgd_start[], tlbmiss_handler_setup_pgd[]; +extern u32 tlbmiss_handler_setup_pgd_start[]; +extern u32 tlbmiss_handler_setup_pgd[]; +EXPORT_SYMBOL_GPL(tlbmiss_handler_setup_pgd); extern u32 tlbmiss_handler_setup_pgd_end[]; static void build_setup_pgd(void) diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index 8afb0e00f7d9..d73e9dfa5237 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -44,10 +44,20 @@ struct patb_entry { }; extern struct patb_entry *partition_tb; +/* Bits in patb0 field */ #define PATB_HR (1UL << 63) -#define PATB_GR (1UL << 63) #define RPDB_MASK 0x0ffffffffffff00fUL #define RPDB_SHIFT (1UL << 8) +#define RTS1_SHIFT 61 /* top 2 bits of radix tree size */ +#define RTS1_MASK (3UL << RTS1_SHIFT) +#define RTS2_SHIFT 5 /* bottom 3 bits of radix tree size */ +#define RTS2_MASK (7UL << RTS2_SHIFT) +#define RPDS_MASK 0x1f /* root page dir. size field */ + +/* Bits in patb1 field */ +#define PATB_GR (1UL << 63) /* guest uses radix; must match HR */ +#define PRTS_MASK 0x1f /* process table size field */ + /* * Limit process table to PAGE_SIZE table. This * also limits the max pid we can support.
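The patb0 macros above split the 5-bit radix tree size (RTS) across two fields, bits 62:61 and 7:5, with the Power ISA defining the tree to span 2^(RTS+31) bytes. A sketch composing a partition-table doubleword from those definitions; the values are illustrative and the toy_* helper name is hypothetical:

#include <stdio.h>

#define TOY_PATB_HR (1UL << 63)
#define TOY_RTS1_SHIFT 61
#define TOY_RTS2_SHIFT 5
#define TOY_RPDS_MASK 0x1fUL

/*
 * Compose the first doubleword of a partition-table entry. The 5-bit
 * radix tree size (address bits covered = RTS + 31) is split: top 2
 * bits at 62:61, bottom 3 bits at 7:5.
 */
static unsigned long toy_patb0(unsigned int rts, unsigned long rpdb,
			       unsigned int rpds)
{
	return TOY_PATB_HR |
	       ((unsigned long)(rts >> 3) << TOY_RTS1_SHIFT) |
	       ((unsigned long)(rts & 7) << TOY_RTS2_SHIFT) |
	       rpdb | (rpds & TOY_RPDS_MASK);
}

int main(void)
{
	/* 52-bit guest effective addresses: RTS = 52 - 31 = 21 (0b10101) */
	printf("patb0 = %#lx\n", toy_patb0(21, 0x1000UL, 13));
	return 0;
}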
@@ -138,5 +148,11 @@ static inline void setup_initial_memory_limit(phys_addr_t first_memblock_base, extern int (*register_process_table)(unsigned long base, unsigned long page_size, unsigned long tbl_size); +#ifdef CONFIG_PPC_PSERIES +extern void radix_init_pseries(void); +#else +static inline void radix_init_pseries(void) { }; +#endif + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_BOOK3S_64_MMU_H_ */ diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 9a3eee661297..8fa09fa500f0 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -97,6 +97,15 @@ ld reg,PACAKBASE(r13); \ ori reg,reg,(ABS_ADDR(label))@l; +/* + * Branches from unrelocated code (e.g., interrupts) to labels outside + * head-y require >64K offsets. + */ +#define __LOAD_FAR_HANDLER(reg, label) \ + ld reg,PACAKBASE(r13); \ + ori reg,reg,(ABS_ADDR(label))@l; \ + addis reg,reg,(ABS_ADDR(label))@h; + /* Exception register prefixes */ #define EXC_HV H #define EXC_STD @@ -227,13 +236,41 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) mtctr reg; \ bctr +/* + * KVM requires __LOAD_FAR_HANDLER. + * + * __BRANCH_TO_KVM_EXIT branches are also a special case because they + * explicitly use r9 then reload it from PACA before branching. Hence + * the double-underscore. + */ +#define __BRANCH_TO_KVM_EXIT(area, label) \ + mfctr r9; \ + std r9,HSTATE_SCRATCH1(r13); \ + __LOAD_FAR_HANDLER(r9, label); \ + mtctr r9; \ + ld r9,area+EX_R9(r13); \ + bctr + +#define BRANCH_TO_KVM(reg, label) \ + __LOAD_FAR_HANDLER(reg, label); \ + mtctr reg; \ + bctr + #else #define BRANCH_TO_COMMON(reg, label) \ b label +#define BRANCH_TO_KVM(reg, label) \ + b label + +#define __BRANCH_TO_KVM_EXIT(area, label) \ + ld r9,area+EX_R9(r13); \ + b label + #endif -#define __KVM_HANDLER_PROLOG(area, n) \ + +#define __KVM_HANDLER(area, h, n) \ BEGIN_FTR_SECTION_NESTED(947) \ ld r10,area+EX_CFAR(r13); \ std r10,HSTATE_CFAR(r13); \ @@ -243,30 +280,28 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) std r10,HSTATE_PPR(r13); \ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948); \ ld r10,area+EX_R10(r13); \ - stw r9,HSTATE_SCRATCH1(r13); \ - ld r9,area+EX_R9(r13); \ std r12,HSTATE_SCRATCH0(r13); \ - -#define __KVM_HANDLER(area, h, n) \ - __KVM_HANDLER_PROLOG(area, n) \ - li r12,n; \ - b kvmppc_interrupt + sldi r12,r9,32; \ + ori r12,r12,(n); \ + /* This reloads r9 before branching to kvmppc_interrupt */ \ + __BRANCH_TO_KVM_EXIT(area, kvmppc_interrupt) #define __KVM_HANDLER_SKIP(area, h, n) \ cmpwi r10,KVM_GUEST_MODE_SKIP; \ - ld r10,area+EX_R10(r13); \ beq 89f; \ - stw r9,HSTATE_SCRATCH1(r13); \ BEGIN_FTR_SECTION_NESTED(948) \ - ld r9,area+EX_PPR(r13); \ - std r9,HSTATE_PPR(r13); \ + ld r10,area+EX_PPR(r13); \ + std r10,HSTATE_PPR(r13); \ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948); \ - ld r9,area+EX_R9(r13); \ + ld r10,area+EX_R10(r13); \ std r12,HSTATE_SCRATCH0(r13); \ - li r12,n; \ - b kvmppc_interrupt; \ + sldi r12,r9,32; \ + ori r12,r12,(n); \ + /* This reloads r9 before branching to kvmppc_interrupt */ \ + __BRANCH_TO_KVM_EXIT(area, kvmppc_interrupt); \ 89: mtocrf 0x80,r9; \ ld r9,area+EX_R9(r13); \ + ld r10,area+EX_R10(r13); \ b kvmppc_skip_##h##interrupt #ifdef CONFIG_KVM_BOOK3S_64_HANDLER @@ -393,12 +428,12 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) EXCEPTION_RELON_PROLOG_PSERIES_1(label, EXC_STD) #define STD_RELON_EXCEPTION_HV(loc, vec, label) \ - /* No guest interrupts come through here */ \ SET_SCRATCH0(r13); /* save r13 */ \ - 
EXCEPTION_RELON_PROLOG_PSERIES(PACA_EXGEN, label, EXC_HV, NOTEST, vec); + EXCEPTION_RELON_PROLOG_PSERIES(PACA_EXGEN, label, \ + EXC_HV, KVMTEST_HV, vec); #define STD_RELON_EXCEPTION_HV_OOL(vec, label) \ - EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, vec); \ + EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_HV, vec); \ EXCEPTION_RELON_PROLOG_PSERIES_1(label, EXC_HV) /* This associate vector numbers with bits in paca->irq_happened */ @@ -475,10 +510,10 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) #define MASKABLE_RELON_EXCEPTION_HV(loc, vec, label) \ _MASKABLE_RELON_EXCEPTION_PSERIES(vec, label, \ - EXC_HV, SOFTEN_NOTEST_HV) + EXC_HV, SOFTEN_TEST_HV) #define MASKABLE_RELON_EXCEPTION_HV_OOL(vec, label) \ - EXCEPTION_PROLOG_1(PACA_EXGEN, SOFTEN_NOTEST_HV, vec); \ + EXCEPTION_PROLOG_1(PACA_EXGEN, SOFTEN_TEST_HV, vec); \ EXCEPTION_PROLOG_PSERIES_1(label, EXC_HV) /* diff --git a/arch/powerpc/include/asm/head-64.h b/arch/powerpc/include/asm/head-64.h index fca7033839a9..9bd81619d090 100644 --- a/arch/powerpc/include/asm/head-64.h +++ b/arch/powerpc/include/asm/head-64.h @@ -218,7 +218,7 @@ name: #ifdef CONFIG_KVM_BOOK3S_64_HANDLER #define TRAMP_KVM_BEGIN(name) \ - TRAMP_REAL_BEGIN(name) + TRAMP_VIRT_BEGIN(name) #else #define TRAMP_KVM_BEGIN(name) #endif diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index 77ff1ba99d1f..54d11b3a6bf7 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -276,6 +276,7 @@ #define H_GET_MPP_X 0x314 #define H_SET_MODE 0x31C #define H_CLEAR_HPT 0x358 +#define H_REGISTER_PROC_TBL 0x37C #define H_SIGNAL_SYS_RESET 0x380 #define MAX_HCALL_OPCODE H_SIGNAL_SYS_RESET @@ -313,6 +314,16 @@ #define H_SIGNAL_SYS_RESET_ALL_OTHERS -2 /* >= 0 values are CPU number */ +/* Flag values used in H_REGISTER_PROC_TBL hcall */ +#define PROC_TABLE_OP_MASK 0x18 +#define PROC_TABLE_DEREG 0x10 +#define PROC_TABLE_NEW 0x18 +#define PROC_TABLE_TYPE_MASK 0x06 +#define PROC_TABLE_HPT_SLB 0x00 +#define PROC_TABLE_HPT_PT 0x02 +#define PROC_TABLE_RADIX 0x04 +#define PROC_TABLE_GTSE 0x01 + #ifndef __ASSEMBLY__ /** diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 5cf306ae0ac3..2bf35017ffc0 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -170,6 +170,8 @@ extern int kvmppc_book3s_hv_page_fault(struct kvm_run *run, unsigned long status); extern long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, unsigned long valid); +extern int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned long gpa, gva_t ea, int is_store); extern void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte); extern struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu); @@ -182,6 +184,25 @@ extern void kvmppc_mmu_hpte_sysexit(void); extern int kvmppc_mmu_hv_init(void); extern int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hc); +extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run, + struct kvm_vcpu *vcpu, + unsigned long ea, unsigned long dsisr); +extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *gpte, bool data, bool iswrite); +extern int kvmppc_init_vm_radix(struct kvm *kvm); +extern void kvmppc_free_radix(struct kvm *kvm); +extern int kvmppc_radix_init(void); +extern void kvmppc_radix_exit(void); +extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn); +extern int kvm_age_radix(struct kvm 
*kvm, struct kvm_memory_slot *memslot, + unsigned long gfn); +extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn); +extern long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm, + struct kvm_memory_slot *memslot, unsigned long *map); +extern int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info); + /* XXX remove this export when load_last_inst() is generic */ extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data); extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec); @@ -211,8 +232,11 @@ extern long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, extern long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, unsigned long pte_index, unsigned long avpn, unsigned long *hpret); -extern long kvmppc_hv_get_dirty_log(struct kvm *kvm, +extern long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long *map); +extern void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa, + struct kvm_memory_slot *memslot, + unsigned long *map); extern void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, unsigned long mask); extern void kvmppc_set_fscr(struct kvm_vcpu *vcpu, u64 fscr); diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 848292176908..d9b48f5bb606 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -22,6 +22,10 @@ #include <asm/book3s/64/mmu-hash.h> +/* Power architecture requires HPT is at least 256kiB, at most 64TiB */ +#define PPC_MIN_HPT_ORDER 18 +#define PPC_MAX_HPT_ORDER 46 + #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE static inline struct kvmppc_book3s_shadow_vcpu *svcpu_get(struct kvm_vcpu *vcpu) { @@ -36,6 +40,12 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu) #endif #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + +static inline bool kvm_is_radix(struct kvm *kvm) +{ + return kvm->arch.radix; +} + #define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */ #endif @@ -350,6 +360,18 @@ extern void kvmppc_mmu_debugfs_init(struct kvm *kvm); extern void kvmhv_rm_send_ipi(int cpu); +static inline unsigned long kvmppc_hpt_npte(struct kvm_hpt_info *hpt) +{ + /* HPTEs are 2**4 bytes long */ + return 1UL << (hpt->order - 4); +} + +static inline unsigned long kvmppc_hpt_mask(struct kvm_hpt_info *hpt) +{ + /* 128 (2**7) bytes in each HPTEG */ + return (1UL << (hpt->order - 7)) - 1; +} + #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #endif /* __ASM_KVM_BOOK3S_64_H__ */ diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index e59b172666cd..7bba8f415627 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -241,12 +241,24 @@ struct kvm_arch_memory_slot { #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ }; +struct kvm_hpt_info { + /* Host virtual (linear mapping) address of guest HPT */ + unsigned long virt; + /* Array of reverse mapping entries for each guest HPTE */ + struct revmap_entry *rev; + /* Guest HPT size is 2**(order) bytes */ + u32 order; + /* 1 if HPT allocated with CMA, 0 otherwise */ + int cma; +}; + +struct kvm_resize_hpt; + struct kvm_arch { unsigned int lpid; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE unsigned int tlb_sets; - unsigned long hpt_virt; - struct revmap_entry *revmap; + struct kvm_hpt_info hpt; atomic64_t mmio_update; unsigned int host_lpid; unsigned long host_lpcr; @@ -256,16 +268,17 @@ struct kvm_arch { unsigned long lpcr; 
unsigned long vrma_slb_v; int hpte_setup_done; - u32 hpt_order; atomic_t vcpus_running; u32 online_vcores; - unsigned long hpt_npte; - unsigned long hpt_mask; atomic_t hpte_mod_interest; cpumask_t need_tlb_flush; - int hpt_cma_alloc; + cpumask_t cpu_in_guest; + u8 radix; + pgd_t *pgtable; + u64 process_table; struct dentry *debugfs_dir; struct dentry *htab_dentry; + struct kvm_resize_hpt *resize_hpt; /* protected by kvm->lock */ #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE struct mutex hpt_mutex; @@ -603,6 +616,7 @@ struct kvm_vcpu_arch { ulong fault_dar; u32 fault_dsisr; unsigned long intr_msr; + ulong fault_gpa; /* guest real address of page fault (POWER9) */ #endif #ifdef CONFIG_BOOKE @@ -657,6 +671,7 @@ struct kvm_vcpu_arch { int state; int ptid; int thread_cpu; + int prev_cpu; bool timer_running; wait_queue_head_t cpu_run; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 2da67bf1f2ec..dd11c4c8c56a 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -155,9 +155,10 @@ extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu); extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu); extern void kvmppc_map_magic(struct kvm_vcpu *vcpu); -extern long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp); -extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp); -extern void kvmppc_free_hpt(struct kvm *kvm); +extern int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order); +extern void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info); +extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order); +extern void kvmppc_free_hpt(struct kvm_hpt_info *info); extern long kvmppc_prepare_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem); extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu, @@ -186,8 +187,8 @@ extern long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, unsigned long tce_value, unsigned long npages); extern long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba); -extern struct page *kvm_alloc_hpt(unsigned long nr_pages); -extern void kvm_release_hpt(struct page *page, unsigned long nr_pages); +extern struct page *kvm_alloc_hpt_cma(unsigned long nr_pages); +extern void kvm_free_hpt_cma(struct page *page, unsigned long nr_pages); extern int kvmppc_core_init_vm(struct kvm *kvm); extern void kvmppc_core_destroy_vm(struct kvm *kvm); extern void kvmppc_core_free_memslot(struct kvm *kvm, @@ -214,6 +215,10 @@ extern void kvmppc_bookehv_exit(void); extern int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu); extern int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *); +extern long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm, + struct kvm_ppc_resize_hpt *rhpt); +extern long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm, + struct kvm_ppc_resize_hpt *rhpt); int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq); @@ -291,6 +296,8 @@ struct kvmppc_ops { struct irq_bypass_producer *); void (*irq_bypass_del_producer)(struct irq_bypass_consumer *, struct irq_bypass_producer *); + int (*configure_mmu)(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg); + int (*get_rmmu_info)(struct kvm *kvm, struct kvm_ppc_rmmu_info *info); }; extern struct kvmppc_ops *kvmppc_hv_ops; diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h index 5e57705b4759..8af2546ea593 100644 --- a/arch/powerpc/include/asm/prom.h +++ b/arch/powerpc/include/asm/prom.h @@ -121,6 +121,8 @@ struct 
of_drconf_cell { #define OV1_PPC_2_06 0x02 /* set if we support PowerPC 2.06 */ #define OV1_PPC_2_07 0x01 /* set if we support PowerPC 2.07 */ +#define OV1_PPC_3_00 0x80 /* set if we support PowerPC 3.00 */ + /* Option vector 2: Open Firmware options supported */ #define OV2_REAL_MODE 0x20 /* set if we want OF in real mode */ @@ -151,10 +153,17 @@ struct of_drconf_cell { #define OV5_XCMO 0x0440 /* Page Coalescing */ #define OV5_TYPE1_AFFINITY 0x0580 /* Type 1 NUMA affinity */ #define OV5_PRRN 0x0540 /* Platform Resource Reassignment */ -#define OV5_PFO_HW_RNG 0x0E80 /* PFO Random Number Generator */ -#define OV5_PFO_HW_842 0x0E40 /* PFO Compression Accelerator */ -#define OV5_PFO_HW_ENCR 0x0E20 /* PFO Encryption Accelerator */ -#define OV5_SUB_PROCESSORS 0x0F01 /* 1,2,or 4 Sub-Processors supported */ +#define OV5_PFO_HW_RNG 0x1180 /* PFO Random Number Generator */ +#define OV5_PFO_HW_842 0x1140 /* PFO Compression Accelerator */ +#define OV5_PFO_HW_ENCR 0x1120 /* PFO Encryption Accelerator */ +#define OV5_SUB_PROCESSORS 0x1501 /* 1,2,or 4 Sub-Processors supported */ +#define OV5_XIVE_EXPLOIT 0x1701 /* XIVE exploitation supported */ +#define OV5_MMU_RADIX_300 0x1880 /* ISA v3.00 radix MMU supported */ +#define OV5_MMU_HASH_300 0x1840 /* ISA v3.00 hash MMU supported */ +#define OV5_MMU_SEGM_RADIX 0x1820 /* radix mode (no segmentation) */ +#define OV5_MMU_PROC_TBL 0x1810 /* hcall selects SLB or proc table */ +#define OV5_MMU_SLB 0x1800 /* always use SLB */ +#define OV5_MMU_GTSE 0x1808 /* Guest translation shootdown */ /* Option Vector 6: IBM PAPR hints */ #define OV6_LINUX 0x02 /* Linux is our OS */ diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 0d4531aa2052..aa44a83ad3ec 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -274,10 +274,14 @@ #define SPRN_DSISR 0x012 /* Data Storage Interrupt Status Register */ #define DSISR_NOHPTE 0x40000000 /* no translation found */ #define DSISR_PROTFAULT 0x08000000 /* protection fault */ +#define DSISR_BADACCESS 0x04000000 /* bad access to CI or G */ #define DSISR_ISSTORE 0x02000000 /* access was a store */ #define DSISR_DABRMATCH 0x00400000 /* hit data breakpoint */ #define DSISR_NOSEGMENT 0x00200000 /* SLB miss */ #define DSISR_KEYFAULT 0x00200000 /* Key fault */ +#define DSISR_UNSUPP_MMU 0x00080000 /* Unsupported MMU config */ +#define DSISR_SET_RC 0x00040000 /* Failed setting of R/C bits */ +#define DSISR_PGDIRFAULT 0x00020000 /* Fault on page directory */ #define SPRN_TBRL 0x10C /* Time Base Read Lower Register (user, R/O) */ #define SPRN_TBRU 0x10D /* Time Base Read Upper Register (user, R/O) */ #define SPRN_CIR 0x11B /* Chip Information Register (hyper, R/0) */ diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 3603b6f51b11..4edbe4bb0e8b 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -413,6 +413,26 @@ struct kvm_get_htab_header { __u16 n_invalid; }; +/* For KVM_PPC_CONFIGURE_V3_MMU */ +struct kvm_ppc_mmuv3_cfg { + __u64 flags; + __u64 process_table; /* second doubleword of partition table entry */ +}; + +/* Flag values for KVM_PPC_CONFIGURE_V3_MMU */ +#define KVM_PPC_MMUV3_RADIX 1 /* 1 = radix mode, 0 = HPT */ +#define KVM_PPC_MMUV3_GTSE 2 /* global translation shootdown enb. 
*/ + +/* For KVM_PPC_GET_RMMU_INFO */ +struct kvm_ppc_rmmu_info { + struct kvm_ppc_radix_geom { + __u8 page_shift; + __u8 level_bits[4]; + __u8 pad[3]; + } geometries[8]; + __u32 ap_encodings[8]; +}; + /* Per-vcpu XICS interrupt controller state */ #define KVM_REG_PPC_ICP_STATE (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8c) @@ -613,5 +633,7 @@ struct kvm_get_htab_header { #define KVM_XICS_LEVEL_SENSITIVE (1ULL << 40) #define KVM_XICS_MASKED (1ULL << 41) #define KVM_XICS_PENDING (1ULL << 42) +#define KVM_XICS_PRESENTED (1ULL << 43) +#define KVM_XICS_QUEUED (1ULL << 44) #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 0601e6a7297c..3afa0ad9837f 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -498,6 +498,7 @@ int main(void) DEFINE(KVM_NEED_FLUSH, offsetof(struct kvm, arch.need_tlb_flush.bits)); DEFINE(KVM_ENABLED_HCALLS, offsetof(struct kvm, arch.enabled_hcalls)); DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v)); + DEFINE(KVM_RADIX, offsetof(struct kvm, arch.radix)); DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr)); DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar)); DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa.pinned_addr)); @@ -537,6 +538,7 @@ int main(void) DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr)); DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr)); DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar)); + DEFINE(VCPU_FAULT_GPA, offsetof(struct kvm_vcpu, arch.fault_gpa)); DEFINE(VCPU_INTR_MSR, offsetof(struct kvm_vcpu, arch.intr_msr)); DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst)); DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap)); diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index d39d6118c6e9..34a04a5fa468 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -142,7 +142,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) lbz r0,HSTATE_HWTHREAD_REQ(r13) cmpwi r0,0 beq 1f - b kvm_start_guest + BRANCH_TO_KVM(r10, kvm_start_guest) 1: #endif @@ -717,13 +717,9 @@ hardware_interrupt_hv: BEGIN_FTR_SECTION _MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt_common, EXC_HV, SOFTEN_TEST_HV) -do_kvm_H0x500: - KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x502) FTR_SECTION_ELSE _MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt_common, EXC_STD, SOFTEN_TEST_PR) -do_kvm_0x500: - KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500) ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) EXC_REAL_END(hardware_interrupt, 0x500, 0x600) @@ -737,6 +733,8 @@ hardware_interrupt_relon_hv: ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) EXC_VIRT_END(hardware_interrupt, 0x4500, 0x4600) +TRAMP_KVM(PACA_EXGEN, 0x500) +TRAMP_KVM_HV(PACA_EXGEN, 0x500) EXC_COMMON_ASYNC(hardware_interrupt_common, 0x500, do_IRQ) @@ -832,6 +830,31 @@ EXC_VIRT(trap_0b, 0x4b00, 0x4c00, 0xb00) TRAMP_KVM(PACA_EXGEN, 0xb00) EXC_COMMON(trap_0b_common, 0xb00, unknown_exception) +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER + /* + * If CONFIG_KVM_BOOK3S_64_HANDLER is set, save the PPR (on systems + * that support it) before changing to HMT_MEDIUM. That allows the KVM + * code to save that value into the guest state (it is the guest's PPR + * value). Otherwise just change to HMT_MEDIUM as userspace has + * already saved the PPR. 
+ */ +#define SYSCALL_KVMTEST \ + SET_SCRATCH0(r13); \ + GET_PACA(r13); \ + std r9,PACA_EXGEN+EX_R9(r13); \ + OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR); \ + HMT_MEDIUM; \ + std r10,PACA_EXGEN+EX_R10(r13); \ + OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r9, CPU_FTR_HAS_PPR); \ + mfcr r9; \ + KVMTEST_PR(0xc00); \ + GET_SCRATCH0(r13) + +#else +#define SYSCALL_KVMTEST \ + HMT_MEDIUM +#endif + #define LOAD_SYSCALL_HANDLER(reg) \ __LOAD_HANDLER(reg, system_call_common) @@ -885,34 +908,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ #endif EXC_REAL_BEGIN(system_call, 0xc00, 0xd00) - /* - * If CONFIG_KVM_BOOK3S_64_HANDLER is set, save the PPR (on systems - * that support it) before changing to HMT_MEDIUM. That allows the KVM - * code to save that value into the guest state (it is the guest's PPR - * value). Otherwise just change to HMT_MEDIUM as userspace has - * already saved the PPR. - */ -#ifdef CONFIG_KVM_BOOK3S_64_HANDLER - SET_SCRATCH0(r13) - GET_PACA(r13) - std r9,PACA_EXGEN+EX_R9(r13) - OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR); - HMT_MEDIUM; - std r10,PACA_EXGEN+EX_R10(r13) - OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r9, CPU_FTR_HAS_PPR); - mfcr r9 - KVMTEST_PR(0xc00) - GET_SCRATCH0(r13) -#else - HMT_MEDIUM; -#endif + SYSCALL_KVMTEST SYSCALL_PSERIES_1 SYSCALL_PSERIES_2_RFID SYSCALL_PSERIES_3 EXC_REAL_END(system_call, 0xc00, 0xd00) EXC_VIRT_BEGIN(system_call, 0x4c00, 0x4d00) - HMT_MEDIUM + SYSCALL_KVMTEST SYSCALL_PSERIES_1 SYSCALL_PSERIES_2_DIRECT SYSCALL_PSERIES_3 @@ -927,7 +930,7 @@ TRAMP_KVM(PACA_EXGEN, 0xd00) EXC_COMMON(single_step_common, 0xd00, single_step_exception) EXC_REAL_OOL_HV(h_data_storage, 0xe00, 0xe20) -EXC_VIRT_NONE(0x4e00, 0x4e20) +EXC_VIRT_OOL_HV(h_data_storage, 0x4e00, 0x4e20, 0xe00) TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0xe00) EXC_COMMON_BEGIN(h_data_storage_common) mfspr r10,SPRN_HDAR @@ -943,7 +946,7 @@ EXC_COMMON_BEGIN(h_data_storage_common) EXC_REAL_OOL_HV(h_instr_storage, 0xe20, 0xe40) -EXC_VIRT_NONE(0x4e20, 0x4e40) +EXC_VIRT_OOL_HV(h_instr_storage, 0x4e20, 0x4e40, 0xe20) TRAMP_KVM_HV(PACA_EXGEN, 0xe20) EXC_COMMON(h_instr_storage_common, 0xe20, unknown_exception) diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index ec47a939cbdd..358d43f8f84f 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -649,6 +649,7 @@ static void __init early_cmdline_parse(void) struct option_vector1 { u8 byte1; u8 arch_versions; + u8 arch_versions3; } __packed; struct option_vector2 { @@ -691,6 +692,9 @@ struct option_vector5 { u8 reserved2; __be16 reserved3; u8 subprocessors; + u8 byte22; + u8 intarch; + u8 mmu; } __packed; struct option_vector6 { @@ -700,7 +704,7 @@ struct option_vector6 { } __packed; struct ibm_arch_vec { - struct { u32 mask, val; } pvrs[10]; + struct { u32 mask, val; } pvrs[12]; u8 num_vectors; @@ -750,6 +754,14 @@ struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = { .val = cpu_to_be32(0x004d0000), }, { + .mask = cpu_to_be32(0xffff0000), /* POWER9 */ + .val = cpu_to_be32(0x004e0000), + }, + { + .mask = cpu_to_be32(0xffffffff), /* all 3.00-compliant */ + .val = cpu_to_be32(0x0f000005), + }, + { .mask = cpu_to_be32(0xffffffff), /* all 2.07-compliant */ .val = cpu_to_be32(0x0f000004), }, @@ -774,6 +786,7 @@ struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = { .byte1 = 0, .arch_versions = OV1_PPC_2_00 | OV1_PPC_2_01 | OV1_PPC_2_02 | OV1_PPC_2_03 | OV1_PPC_2_04 | OV1_PPC_2_05 | OV1_PPC_2_06 | OV1_PPC_2_07, + .arch_versions3 = OV1_PPC_3_00, }, .vec2_len = VECTOR_LENGTH(sizeof(struct 
option_vector2)), @@ -836,6 +849,9 @@ struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = { .reserved2 = 0, .reserved3 = 0, .subprocessors = 1, + .intarch = 0, + .mmu = OV5_FEAT(OV5_MMU_RADIX_300) | OV5_FEAT(OV5_MMU_HASH_300) | + OV5_FEAT(OV5_MMU_PROC_TBL) | OV5_FEAT(OV5_MMU_GTSE), }, /* option vector 6: IBM PAPR hints */ diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 7dd89b79d038..b87ccde2137a 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -70,7 +70,8 @@ endif kvm-hv-y += \ book3s_hv.o \ book3s_hv_interrupts.o \ - book3s_64_mmu_hv.o + book3s_64_mmu_hv.o \ + book3s_64_mmu_radix.o kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \ book3s_hv_rm_xics.o diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 019f008775b9..b6b5c185bd92 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -239,6 +239,7 @@ void kvmppc_core_queue_data_storage(struct kvm_vcpu *vcpu, ulong dar, kvmppc_set_dsisr(vcpu, flags); kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE); } +EXPORT_SYMBOL_GPL(kvmppc_core_queue_data_storage); /* used by kvm_hv */ void kvmppc_core_queue_inst_storage(struct kvm_vcpu *vcpu, ulong flags) { diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index b795dd1ac2ef..013552f05182 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -40,84 +40,104 @@ #include "trace_hv.h" -/* Power architecture requires HPT is at least 256kB */ -#define PPC_MIN_HPT_ORDER 18 +//#define DEBUG_RESIZE_HPT 1 + +#ifdef DEBUG_RESIZE_HPT +#define resize_hpt_debug(resize, ...) \ + do { \ + printk(KERN_DEBUG "RESIZE HPT %p: ", resize); \ + printk(__VA_ARGS__); \ + } while (0) +#else +#define resize_hpt_debug(resize, ...) 
\ + do { } while (0) +#endif static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, long pte_index, unsigned long pteh, unsigned long ptel, unsigned long *pte_idx_ret); + +struct kvm_resize_hpt { + /* These fields read-only after init */ + struct kvm *kvm; + struct work_struct work; + u32 order; + + /* These fields protected by kvm->lock */ + int error; + bool prepare_done; + + /* Private to the work thread, until prepare_done is true, + * then protected by kvm->resize_hpt_sem */ + struct kvm_hpt_info hpt; +}; + static void kvmppc_rmap_reset(struct kvm *kvm); -long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) +int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order) { unsigned long hpt = 0; - struct revmap_entry *rev; + int cma = 0; struct page *page = NULL; - long order = KVM_DEFAULT_HPT_ORDER; + struct revmap_entry *rev; + unsigned long npte; - if (htab_orderp) { - order = *htab_orderp; - if (order < PPC_MIN_HPT_ORDER) - order = PPC_MIN_HPT_ORDER; - } + if ((order < PPC_MIN_HPT_ORDER) || (order > PPC_MAX_HPT_ORDER)) + return -EINVAL; - kvm->arch.hpt_cma_alloc = 0; - page = kvm_alloc_hpt(1ul << (order - PAGE_SHIFT)); + page = kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT)); if (page) { hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); memset((void *)hpt, 0, (1ul << order)); - kvm->arch.hpt_cma_alloc = 1; + cma = 1; } - /* Lastly try successively smaller sizes from the page allocator */ - /* Only do this if userspace didn't specify a size via ioctl */ - while (!hpt && order > PPC_MIN_HPT_ORDER && !htab_orderp) { - hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT| - __GFP_NOWARN, order - PAGE_SHIFT); - if (!hpt) - --order; - } + if (!hpt) + hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT + |__GFP_NOWARN, order - PAGE_SHIFT); if (!hpt) return -ENOMEM; - kvm->arch.hpt_virt = hpt; - kvm->arch.hpt_order = order; /* HPTEs are 2**4 bytes long */ - kvm->arch.hpt_npte = 1ul << (order - 4); - /* 128 (2**7) bytes in each HPTEG */ - kvm->arch.hpt_mask = (1ul << (order - 7)) - 1; - - atomic64_set(&kvm->arch.mmio_update, 0); + npte = 1ul << (order - 4); /* Allocate reverse map array */ - rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt_npte); + rev = vmalloc(sizeof(struct revmap_entry) * npte); if (!rev) { - pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n"); - goto out_freehpt; + pr_err("kvmppc_allocate_hpt: Couldn't alloc reverse map array\n"); + if (cma) + kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT)); + else + free_pages(hpt, order - PAGE_SHIFT); + return -ENOMEM; } - kvm->arch.revmap = rev; - kvm->arch.sdr1 = __pa(hpt) | (order - 18); - pr_info("KVM guest htab at %lx (order %ld), LPID %x\n", - hpt, order, kvm->arch.lpid); + info->order = order; + info->virt = hpt; + info->cma = cma; + info->rev = rev; - if (htab_orderp) - *htab_orderp = order; return 0; +} - out_freehpt: - if (kvm->arch.hpt_cma_alloc) - kvm_release_hpt(page, 1 << (order - PAGE_SHIFT)); - else - free_pages(hpt, order - PAGE_SHIFT); - return -ENOMEM; +void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info) +{ + atomic64_set(&kvm->arch.mmio_update, 0); + kvm->arch.hpt = *info; + kvm->arch.sdr1 = __pa(info->virt) | (info->order - 18); + + pr_info("KVM guest htab at %lx (order %ld), LPID %x\n", + info->virt, (long)info->order, kvm->arch.lpid); } -long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) +long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order) { long err = -EBUSY; - long order; + struct kvm_hpt_info info; + + if 
(kvm_is_radix(kvm)) + return -EINVAL; mutex_lock(&kvm->lock); if (kvm->arch.hpte_setup_done) { @@ -129,37 +149,44 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) goto out; } } - if (kvm->arch.hpt_virt) { - order = kvm->arch.hpt_order; + if (kvm->arch.hpt.order == order) { + /* We already have a suitable HPT */ + /* Set the entire HPT to 0, i.e. invalid HPTEs */ - memset((void *)kvm->arch.hpt_virt, 0, 1ul << order); + memset((void *)kvm->arch.hpt.virt, 0, 1ul << order); /* * Reset all the reverse-mapping chains for all memslots */ kvmppc_rmap_reset(kvm); /* Ensure that each vcpu will flush its TLB on next entry. */ cpumask_setall(&kvm->arch.need_tlb_flush); - *htab_orderp = order; err = 0; - } else { - err = kvmppc_alloc_hpt(kvm, htab_orderp); - order = *htab_orderp; + goto out; } - out: + + if (kvm->arch.hpt.virt) + kvmppc_free_hpt(&kvm->arch.hpt); + + err = kvmppc_allocate_hpt(&info, order); + if (err < 0) + goto out; + kvmppc_set_hpt(kvm, &info); + +out: mutex_unlock(&kvm->lock); return err; } -void kvmppc_free_hpt(struct kvm *kvm) +void kvmppc_free_hpt(struct kvm_hpt_info *info) { - kvmppc_free_lpid(kvm->arch.lpid); - vfree(kvm->arch.revmap); - if (kvm->arch.hpt_cma_alloc) - kvm_release_hpt(virt_to_page(kvm->arch.hpt_virt), - 1 << (kvm->arch.hpt_order - PAGE_SHIFT)); - else - free_pages(kvm->arch.hpt_virt, - kvm->arch.hpt_order - PAGE_SHIFT); + vfree(info->rev); + if (info->cma) + kvm_free_hpt_cma(virt_to_page(info->virt), + 1 << (info->order - PAGE_SHIFT)); + else if (info->virt) + free_pages(info->virt, info->order - PAGE_SHIFT); + info->virt = 0; + info->order = 0; } /* Bits in first HPTE dword for pagesize 4k, 64k or 16M */ @@ -194,8 +221,8 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, if (npages > 1ul << (40 - porder)) npages = 1ul << (40 - porder); /* Can't use more than 1 HPTE per HPTEG */ - if (npages > kvm->arch.hpt_mask + 1) - npages = kvm->arch.hpt_mask + 1; + if (npages > kvmppc_hpt_mask(&kvm->arch.hpt) + 1) + npages = kvmppc_hpt_mask(&kvm->arch.hpt) + 1; hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | HPTE_V_BOLTED | hpte0_pgsize_encoding(psize); @@ -205,7 +232,8 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, for (i = 0; i < npages; ++i) { addr = i << porder; /* can't use hpt_hash since va > 64 bits */ - hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & kvm->arch.hpt_mask; + hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) + & kvmppc_hpt_mask(&kvm->arch.hpt); /* * We assume that the hash table is empty and no * vcpus are using it at this stage. 
Since we create @@ -338,11 +366,11 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, preempt_enable(); return -ENOENT; } - hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); + hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4)); v = orig_v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; if (cpu_has_feature(CPU_FTR_ARCH_300)) v = hpte_new_to_old_v(v, be64_to_cpu(hptep[1])); - gr = kvm->arch.revmap[index].guest_rpte; + gr = kvm->arch.hpt.rev[index].guest_rpte; unlock_hpte(hptep, orig_v); preempt_enable(); @@ -392,8 +420,8 @@ static int instruction_is_store(unsigned int instr) return (instr & mask) != 0; } -static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu, - unsigned long gpa, gva_t ea, int is_store) +int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned long gpa, gva_t ea, int is_store) { u32 last_inst; @@ -458,6 +486,9 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned long rcbits; long mmio_update; + if (kvm_is_radix(kvm)) + return kvmppc_book3s_radix_page_fault(run, vcpu, ea, dsisr); + /* * Real-mode code has already searched the HPT and found the * entry we're interested in. Lock the entry and check that @@ -480,8 +511,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, } } index = vcpu->arch.pgfault_index; - hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); - rev = &kvm->arch.revmap[index]; + hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4)); + rev = &kvm->arch.hpt.rev[index]; preempt_disable(); while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) cpu_relax(); @@ -695,12 +726,13 @@ static void kvmppc_rmap_reset(struct kvm *kvm) srcu_read_unlock(&kvm->srcu, srcu_idx); } +typedef int (*hva_handler_fn)(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn); + static int kvm_handle_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, - int (*handler)(struct kvm *kvm, - unsigned long *rmapp, - unsigned long gfn)) + hva_handler_fn handler) { int ret; int retval = 0; @@ -725,9 +757,7 @@ static int kvm_handle_hva_range(struct kvm *kvm, gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); for (; gfn < gfn_end; ++gfn) { - gfn_t gfn_offset = gfn - memslot->base_gfn; - - ret = handler(kvm, &memslot->arch.rmap[gfn_offset], gfn); + ret = handler(kvm, memslot, gfn); retval |= ret; } } @@ -736,20 +766,61 @@ static int kvm_handle_hva_range(struct kvm *kvm, } static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, - int (*handler)(struct kvm *kvm, unsigned long *rmapp, - unsigned long gfn)) + hva_handler_fn handler) { return kvm_handle_hva_range(kvm, hva, hva + 1, handler); } -static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, +/* Must be called with both HPTE and rmap locked */ +static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i, + unsigned long *rmapp, unsigned long gfn) +{ + __be64 *hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); + struct revmap_entry *rev = kvm->arch.hpt.rev; + unsigned long j, h; + unsigned long ptel, psize, rcbits; + + j = rev[i].forw; + if (j == i) { + /* chain is now empty */ + *rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX); + } else { + /* remove i from chain */ + h = rev[i].back; + rev[h].forw = j; + rev[j].back = h; + rev[i].forw = rev[i].back = i; + *rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j; + } + + /* Now check and modify the HPTE */ + ptel = rev[i].guest_rpte; + psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel); + if 
((be64_to_cpu(hptep[0]) & HPTE_V_VALID) && + hpte_rpn(ptel, psize) == gfn) { + hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); + kvmppc_invalidate_hpte(kvm, hptep, i); + hptep[1] &= ~cpu_to_be64(HPTE_R_KEY_HI | HPTE_R_KEY_LO); + /* Harvest R and C */ + rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); + *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT; + if (rcbits & HPTE_R_C) + kvmppc_update_rmap_change(rmapp, psize); + if (rcbits & ~rev[i].guest_rpte) { + rev[i].guest_rpte = ptel | rcbits; + note_hpte_modification(kvm, &rev[i]); + } + } +} + +static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) { - struct revmap_entry *rev = kvm->arch.revmap; - unsigned long h, i, j; + unsigned long i; __be64 *hptep; - unsigned long ptel, psize, rcbits; + unsigned long *rmapp; + rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; for (;;) { lock_rmap(rmapp); if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { @@ -763,7 +834,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, * rmap chain lock. */ i = *rmapp & KVMPPC_RMAP_INDEX; - hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4)); + hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { /* unlock rmap before spinning on the HPTE lock */ unlock_rmap(rmapp); @@ -771,37 +842,8 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, cpu_relax(); continue; } - j = rev[i].forw; - if (j == i) { - /* chain is now empty */ - *rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX); - } else { - /* remove i from chain */ - h = rev[i].back; - rev[h].forw = j; - rev[j].back = h; - rev[i].forw = rev[i].back = i; - *rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j; - } - /* Now check and modify the HPTE */ - ptel = rev[i].guest_rpte; - psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel); - if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) && - hpte_rpn(ptel, psize) == gfn) { - hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); - kvmppc_invalidate_hpte(kvm, hptep, i); - hptep[1] &= ~cpu_to_be64(HPTE_R_KEY_HI | HPTE_R_KEY_LO); - /* Harvest R and C */ - rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); - *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT; - if (rcbits & HPTE_R_C) - kvmppc_update_rmap_change(rmapp, psize); - if (rcbits & ~rev[i].guest_rpte) { - rev[i].guest_rpte = ptel | rcbits; - note_hpte_modification(kvm, &rev[i]); - } - } + kvmppc_unmap_hpte(kvm, i, rmapp, gfn); unlock_rmap(rmapp); __unlock_hpte(hptep, be64_to_cpu(hptep[0])); } @@ -810,26 +852,36 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva) { - kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); + hva_handler_fn handler; + + handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp; + kvm_handle_hva(kvm, hva, handler); return 0; } int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end) { - kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp); + hva_handler_fn handler; + + handler = kvm_is_radix(kvm) ? 
kvm_unmap_radix : kvm_unmap_rmapp; + kvm_handle_hva_range(kvm, start, end, handler); return 0; } void kvmppc_core_flush_memslot_hv(struct kvm *kvm, struct kvm_memory_slot *memslot) { - unsigned long *rmapp; unsigned long gfn; unsigned long n; + unsigned long *rmapp; - rmapp = memslot->arch.rmap; gfn = memslot->base_gfn; - for (n = memslot->npages; n; --n) { + rmapp = memslot->arch.rmap; + for (n = memslot->npages; n; --n, ++gfn) { + if (kvm_is_radix(kvm)) { + kvm_unmap_radix(kvm, memslot, gfn); + continue; + } /* * Testing the present bit without locking is OK because * the memslot has been marked invalid already, and hence @@ -837,20 +889,21 @@ void kvmppc_core_flush_memslot_hv(struct kvm *kvm, * thus the present bit can't go from 0 to 1. */ if (*rmapp & KVMPPC_RMAP_PRESENT) - kvm_unmap_rmapp(kvm, rmapp, gfn); + kvm_unmap_rmapp(kvm, memslot, gfn); ++rmapp; - ++gfn; } } -static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, +static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) { - struct revmap_entry *rev = kvm->arch.revmap; + struct revmap_entry *rev = kvm->arch.hpt.rev; unsigned long head, i, j; __be64 *hptep; int ret = 0; + unsigned long *rmapp; + rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; retry: lock_rmap(rmapp); if (*rmapp & KVMPPC_RMAP_REFERENCED) { @@ -864,7 +917,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, i = head = *rmapp & KVMPPC_RMAP_INDEX; do { - hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4)); + hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); j = rev[i].forw; /* If this HPTE isn't referenced, ignore it */ @@ -898,17 +951,22 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end) { - return kvm_handle_hva_range(kvm, start, end, kvm_age_rmapp); + hva_handler_fn handler; + + handler = kvm_is_radix(kvm) ? kvm_age_radix : kvm_age_rmapp; + return kvm_handle_hva_range(kvm, start, end, handler); } -static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, +static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) { - struct revmap_entry *rev = kvm->arch.revmap; + struct revmap_entry *rev = kvm->arch.hpt.rev; unsigned long head, i, j; unsigned long *hp; int ret = 1; + unsigned long *rmapp; + rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; if (*rmapp & KVMPPC_RMAP_REFERENCED) return 1; @@ -919,7 +977,7 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, if (*rmapp & KVMPPC_RMAP_PRESENT) { i = head = *rmapp & KVMPPC_RMAP_INDEX; do { - hp = (unsigned long *)(kvm->arch.hpt_virt + (i << 4)); + hp = (unsigned long *)(kvm->arch.hpt.virt + (i << 4)); j = rev[i].forw; if (be64_to_cpu(hp[1]) & HPTE_R_R) goto out; @@ -934,12 +992,18 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva) { - return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp); + hva_handler_fn handler; + + handler = kvm_is_radix(kvm) ? kvm_test_age_radix : kvm_test_age_rmapp; + return kvm_handle_hva(kvm, hva, handler); } void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte) { - kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); + hva_handler_fn handler; + + handler = kvm_is_radix(kvm) ? 
kvm_unmap_radix : kvm_unmap_rmapp; + kvm_handle_hva(kvm, hva, handler); } static int vcpus_running(struct kvm *kvm) @@ -953,7 +1017,7 @@ static int vcpus_running(struct kvm *kvm) */ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) { - struct revmap_entry *rev = kvm->arch.revmap; + struct revmap_entry *rev = kvm->arch.hpt.rev; unsigned long head, i, j; unsigned long n; unsigned long v, r; @@ -978,7 +1042,7 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) i = head = *rmapp & KVMPPC_RMAP_INDEX; do { unsigned long hptep1; - hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4)); + hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4)); j = rev[i].forw; /* @@ -1040,7 +1104,7 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) return npages_dirty; } -static void harvest_vpa_dirty(struct kvmppc_vpa *vpa, +void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa, struct kvm_memory_slot *memslot, unsigned long *map) { @@ -1058,12 +1122,11 @@ static void harvest_vpa_dirty(struct kvmppc_vpa *vpa, __set_bit_le(gfn - memslot->base_gfn, map); } -long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long *map) +long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm, + struct kvm_memory_slot *memslot, unsigned long *map) { unsigned long i, j; unsigned long *rmapp; - struct kvm_vcpu *vcpu; preempt_disable(); rmapp = memslot->arch.rmap; @@ -1079,15 +1142,6 @@ long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot, __set_bit_le(j, map); ++rmapp; } - - /* Harvest dirty bits from VPA and DTL updates */ - /* Note: we never modify the SLB shadow buffer areas */ - kvm_for_each_vcpu(i, vcpu, kvm) { - spin_lock(&vcpu->arch.vpa_update_lock); - harvest_vpa_dirty(&vcpu->arch.vpa, memslot, map); - harvest_vpa_dirty(&vcpu->arch.dtl, memslot, map); - spin_unlock(&vcpu->arch.vpa_update_lock); - } preempt_enable(); return 0; } @@ -1142,15 +1196,367 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa, srcu_idx = srcu_read_lock(&kvm->srcu); memslot = gfn_to_memslot(kvm, gfn); if (memslot) { - rmap = &memslot->arch.rmap[gfn - memslot->base_gfn]; - lock_rmap(rmap); - *rmap |= KVMPPC_RMAP_CHANGED; - unlock_rmap(rmap); + if (!kvm_is_radix(kvm)) { + rmap = &memslot->arch.rmap[gfn - memslot->base_gfn]; + lock_rmap(rmap); + *rmap |= KVMPPC_RMAP_CHANGED; + unlock_rmap(rmap); + } else if (memslot->dirty_bitmap) { + mark_page_dirty(kvm, gfn); + } } srcu_read_unlock(&kvm->srcu, srcu_idx); } /* + * HPT resizing + */ +static int resize_hpt_allocate(struct kvm_resize_hpt *resize) +{ + int rc; + + rc = kvmppc_allocate_hpt(&resize->hpt, resize->order); + if (rc < 0) + return rc; + + resize_hpt_debug(resize, "resize_hpt_allocate(): HPT @ 0x%lx\n", + resize->hpt.virt); + + return 0; +} + +static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize, + unsigned long idx) +{ + struct kvm *kvm = resize->kvm; + struct kvm_hpt_info *old = &kvm->arch.hpt; + struct kvm_hpt_info *new = &resize->hpt; + unsigned long old_hash_mask = (1ULL << (old->order - 7)) - 1; + unsigned long new_hash_mask = (1ULL << (new->order - 7)) - 1; + __be64 *hptep, *new_hptep; + unsigned long vpte, rpte, guest_rpte; + int ret; + struct revmap_entry *rev; + unsigned long apsize, psize, avpn, pteg, hash; + unsigned long new_idx, new_pteg, replace_vpte; + + hptep = (__be64 *)(old->virt + (idx << 4)); + + /* Guest is stopped, so new HPTEs can't be added or faulted + * in, only unmapped or altered by host 
actions. So, it's + * safe to check this before we take the HPTE lock */ + vpte = be64_to_cpu(hptep[0]); + if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT)) + return 0; /* nothing to do */ + + while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) + cpu_relax(); + + vpte = be64_to_cpu(hptep[0]); + + ret = 0; + if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT)) + /* Nothing to do */ + goto out; + + /* Unmap */ + rev = &old->rev[idx]; + guest_rpte = rev->guest_rpte; + + ret = -EIO; + apsize = hpte_page_size(vpte, guest_rpte); + if (!apsize) + goto out; + + if (vpte & HPTE_V_VALID) { + unsigned long gfn = hpte_rpn(guest_rpte, apsize); + int srcu_idx = srcu_read_lock(&kvm->srcu); + struct kvm_memory_slot *memslot = + __gfn_to_memslot(kvm_memslots(kvm), gfn); + + if (memslot) { + unsigned long *rmapp; + rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; + + lock_rmap(rmapp); + kvmppc_unmap_hpte(kvm, idx, rmapp, gfn); + unlock_rmap(rmapp); + } + + srcu_read_unlock(&kvm->srcu, srcu_idx); + } + + /* Reload PTE after unmap */ + vpte = be64_to_cpu(hptep[0]); + + BUG_ON(vpte & HPTE_V_VALID); + BUG_ON(!(vpte & HPTE_V_ABSENT)); + + ret = 0; + if (!(vpte & HPTE_V_BOLTED)) + goto out; + + rpte = be64_to_cpu(hptep[1]); + psize = hpte_base_page_size(vpte, rpte); + avpn = HPTE_V_AVPN_VAL(vpte) & ~((psize - 1) >> 23); + pteg = idx / HPTES_PER_GROUP; + if (vpte & HPTE_V_SECONDARY) + pteg = ~pteg; + + if (!(vpte & HPTE_V_1TB_SEG)) { + unsigned long offset, vsid; + + /* We only have 28 - 23 bits of offset in avpn */ + offset = (avpn & 0x1f) << 23; + vsid = avpn >> 5; + /* We can find more bits from the pteg value */ + if (psize < (1ULL << 23)) + offset |= ((vsid ^ pteg) & old_hash_mask) * psize; + + hash = vsid ^ (offset / psize); + } else { + unsigned long offset, vsid; + + /* We only have 40 - 23 bits of seg_off in avpn */ + offset = (avpn & 0x1ffff) << 23; + vsid = avpn >> 17; + if (psize < (1ULL << 23)) + offset |= ((vsid ^ (vsid << 25) ^ pteg) & old_hash_mask) * psize; + + hash = vsid ^ (vsid << 25) ^ (offset / psize); + } + + new_pteg = hash & new_hash_mask; + if (vpte & HPTE_V_SECONDARY) { + BUG_ON(~pteg != (hash & old_hash_mask)); + new_pteg = ~new_pteg; + } else { + BUG_ON(pteg != (hash & old_hash_mask)); + } + + new_idx = new_pteg * HPTES_PER_GROUP + (idx % HPTES_PER_GROUP); + new_hptep = (__be64 *)(new->virt + (new_idx << 4)); + + replace_vpte = be64_to_cpu(new_hptep[0]); + + if (replace_vpte & (HPTE_V_VALID | HPTE_V_ABSENT)) { + BUG_ON(new->order >= old->order); + + if (replace_vpte & HPTE_V_BOLTED) { + if (vpte & HPTE_V_BOLTED) + /* Bolted collision, nothing we can do */ + ret = -ENOSPC; + /* Discard the new HPTE */ + goto out; + } + + /* Discard the previous HPTE */ + } + + new_hptep[1] = cpu_to_be64(rpte); + new->rev[new_idx].guest_rpte = guest_rpte; + /* No need for a barrier, since new HPT isn't active */ + new_hptep[0] = cpu_to_be64(vpte); + unlock_hpte(new_hptep, vpte); + +out: + unlock_hpte(hptep, vpte); + return ret; +} + +static int resize_hpt_rehash(struct kvm_resize_hpt *resize) +{ + struct kvm *kvm = resize->kvm; + unsigned long i; + int rc; + + for (i = 0; i < kvmppc_hpt_npte(&kvm->arch.hpt); i++) { + rc = resize_hpt_rehash_hpte(resize, i); + if (rc != 0) + return rc; + } + + return 0; +} + +static void resize_hpt_pivot(struct kvm_resize_hpt *resize) +{ + struct kvm *kvm = resize->kvm; + struct kvm_hpt_info hpt_tmp; + + /* Exchange the pending tables in the resize structure with + * the active tables */ + + resize_hpt_debug(resize, "resize_hpt_pivot()\n"); + + 
spin_lock(&kvm->mmu_lock); + asm volatile("ptesync" : : : "memory"); + + hpt_tmp = kvm->arch.hpt; + kvmppc_set_hpt(kvm, &resize->hpt); + resize->hpt = hpt_tmp; + + spin_unlock(&kvm->mmu_lock); + + synchronize_srcu_expedited(&kvm->srcu); + + resize_hpt_debug(resize, "resize_hpt_pivot() done\n"); +} + +static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize) +{ + BUG_ON(kvm->arch.resize_hpt != resize); + + if (resize->hpt.virt) + kvmppc_free_hpt(&resize->hpt); + + kvm->arch.resize_hpt = NULL; + kfree(resize); +} + +static void resize_hpt_prepare_work(struct work_struct *work) +{ + struct kvm_resize_hpt *resize = container_of(work, + struct kvm_resize_hpt, + work); + struct kvm *kvm = resize->kvm; + int err; + + resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n", + resize->order); + + err = resize_hpt_allocate(resize); + + mutex_lock(&kvm->lock); + + resize->error = err; + resize->prepare_done = true; + + mutex_unlock(&kvm->lock); +} + +long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm, + struct kvm_ppc_resize_hpt *rhpt) +{ + unsigned long flags = rhpt->flags; + unsigned long shift = rhpt->shift; + struct kvm_resize_hpt *resize; + int ret; + + if (flags != 0) + return -EINVAL; + + if (shift && ((shift < 18) || (shift > 46))) + return -EINVAL; + + mutex_lock(&kvm->lock); + + resize = kvm->arch.resize_hpt; + + if (resize) { + if (resize->order == shift) { + /* Suitable resize in progress */ + if (resize->prepare_done) { + ret = resize->error; + if (ret != 0) + resize_hpt_release(kvm, resize); + } else { + ret = 100; /* estimated time in ms */ + } + + goto out; + } + + /* not suitable, cancel it */ + resize_hpt_release(kvm, resize); + } + + ret = 0; + if (!shift) + goto out; /* nothing to do */ + + /* start new resize */ + + resize = kzalloc(sizeof(*resize), GFP_KERNEL); + resize->order = shift; + resize->kvm = kvm; + INIT_WORK(&resize->work, resize_hpt_prepare_work); + kvm->arch.resize_hpt = resize; + + schedule_work(&resize->work); + + ret = 100; /* estimated time in ms */ + +out: + mutex_unlock(&kvm->lock); + return ret; +} + +static void resize_hpt_boot_vcpu(void *opaque) +{ + /* Nothing to do, just force a KVM exit */ +} + +long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm, + struct kvm_ppc_resize_hpt *rhpt) +{ + unsigned long flags = rhpt->flags; + unsigned long shift = rhpt->shift; + struct kvm_resize_hpt *resize; + long ret; + + if (flags != 0) + return -EINVAL; + + if (shift && ((shift < 18) || (shift > 46))) + return -EINVAL; + + mutex_lock(&kvm->lock); + + resize = kvm->arch.resize_hpt; + + /* This shouldn't be possible */ + ret = -EIO; + if (WARN_ON(!kvm->arch.hpte_setup_done)) + goto out_no_hpt; + + /* Stop VCPUs from running while we mess with the HPT */ + kvm->arch.hpte_setup_done = 0; + smp_mb(); + + /* Boot all CPUs out of the guest so they re-read + * hpte_setup_done */ + on_each_cpu(resize_hpt_boot_vcpu, NULL, 1); + + ret = -ENXIO; + if (!resize || (resize->order != shift)) + goto out; + + ret = -EBUSY; + if (!resize->prepare_done) + goto out; + + ret = resize->error; + if (ret != 0) + goto out; + + ret = resize_hpt_rehash(resize); + if (ret != 0) + goto out; + + resize_hpt_pivot(resize); + +out: + /* Let VCPUs run again */ + kvm->arch.hpte_setup_done = 1; + smp_mb(); +out_no_hpt: + resize_hpt_release(kvm, resize); + mutex_unlock(&kvm->lock); + return ret; +} + +/* * Functions for reading and writing the hash table via reads and * writes on a file descriptor. 
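Taken together, the two ioctl handlers above give userspace a poll-then-commit protocol: the prepare call starts (or polls) the background HPT allocation and returns an estimated number of milliseconds while work is still pending, and the commit call performs the rehash and pivot once preparation is done. A hypothetical userspace sketch, assuming a <linux/kvm.h> that carries this series' KVM_PPC_RESIZE_HPT_PREPARE/COMMIT definitions and a valid VM file descriptor:

#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/kvm.h>

/* Resize the guest HPT to 2^shift bytes; shift must be in [18, 46]. */
static int resize_guest_hpt(int vm_fd, unsigned int shift)
{
	struct kvm_ppc_resize_hpt rhpt = { .flags = 0, .shift = shift };
	int ret;

	do {
		ret = ioctl(vm_fd, KVM_PPC_RESIZE_HPT_PREPARE, &rhpt);
		if (ret > 0)		/* estimated ms until ready */
			usleep(ret * 1000);
	} while (ret > 0);

	if (ret == 0)
		ret = ioctl(vm_fd, KVM_PPC_RESIZE_HPT_COMMIT, &rhpt);
	return ret;
}

Note that the commit handler above bounces every vcpu out of the guest (clearing hpte_setup_done and firing an IPI) before rehashing, so no HPTE can change underneath the copy.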
* @@ -1290,8 +1696,8 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, flags = ctx->flags; i = ctx->index; - hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); - revp = kvm->arch.revmap + i; + hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); + revp = kvm->arch.hpt.rev + i; lbuf = (unsigned long __user *)buf; nb = 0; @@ -1306,7 +1712,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, /* Skip uninteresting entries, i.e. clean on not-first pass */ if (!first_pass) { - while (i < kvm->arch.hpt_npte && + while (i < kvmppc_hpt_npte(&kvm->arch.hpt) && !hpte_dirty(revp, hptp)) { ++i; hptp += 2; @@ -1316,7 +1722,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, hdr.index = i; /* Grab a series of valid entries */ - while (i < kvm->arch.hpt_npte && + while (i < kvmppc_hpt_npte(&kvm->arch.hpt) && hdr.n_valid < 0xffff && nb + HPTE_SIZE < count && record_hpte(flags, hptp, hpte, revp, 1, first_pass)) { @@ -1332,7 +1738,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, ++revp; } /* Now skip invalid entries while we can */ - while (i < kvm->arch.hpt_npte && + while (i < kvmppc_hpt_npte(&kvm->arch.hpt) && hdr.n_invalid < 0xffff && record_hpte(flags, hptp, hpte, revp, 0, first_pass)) { /* found an invalid entry */ @@ -1353,7 +1759,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf, } /* Check if we've wrapped around the hash table */ - if (i >= kvm->arch.hpt_npte) { + if (i >= kvmppc_hpt_npte(&kvm->arch.hpt)) { i = 0; ctx->first_pass = 0; break; @@ -1412,11 +1818,11 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, err = -EINVAL; i = hdr.index; - if (i >= kvm->arch.hpt_npte || - i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt_npte) + if (i >= kvmppc_hpt_npte(&kvm->arch.hpt) || + i + hdr.n_valid + hdr.n_invalid > kvmppc_hpt_npte(&kvm->arch.hpt)) break; - hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); + hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); lbuf = (unsigned long __user *)buf; for (j = 0; j < hdr.n_valid; ++j) { __be64 hpte_v; @@ -1603,8 +2009,9 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf, kvm = p->kvm; i = p->hpt_index; - hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); - for (; len != 0 && i < kvm->arch.hpt_npte; ++i, hptp += 2) { + hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE)); + for (; len != 0 && i < kvmppc_hpt_npte(&kvm->arch.hpt); + ++i, hptp += 2) { if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))) continue; @@ -1614,7 +2021,7 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf, cpu_relax(); v = be64_to_cpu(hptp[0]) & ~HPTE_V_HVLOCK; hr = be64_to_cpu(hptp[1]); - gr = kvm->arch.revmap[i].guest_rpte; + gr = kvm->arch.hpt.rev[i].guest_rpte; unlock_hpte(hptp, v); preempt_enable(); @@ -1675,7 +2082,10 @@ void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu) vcpu->arch.slb_nr = 32; /* POWER7/POWER8 */ - mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate; + if (kvm_is_radix(vcpu->kvm)) + mmu->xlate = kvmppc_mmu_radix_xlate; + else + mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate; mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr; vcpu->arch.hflags |= BOOK3S_HFLAG_SLB; diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c new file mode 100644 index 000000000000..4344651f408c --- /dev/null +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -0,0 +1,716 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under 
the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * Copyright 2016 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> + +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> +#include <asm/page.h> +#include <asm/mmu.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> + +/* + * Supported radix tree geometry. + * Like p9, we support either 5 or 9 bits at the first (lowest) level, + * for a page size of 64k or 4k. + */ +static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 }; + +int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *gpte, bool data, bool iswrite) +{ + struct kvm *kvm = vcpu->kvm; + u32 pid; + int ret, level, ps; + __be64 prte, rpte; + unsigned long root, pte, index; + unsigned long rts, bits, offset; + unsigned long gpa; + unsigned long proc_tbl_size; + + /* Work out effective PID */ + switch (eaddr >> 62) { + case 0: + pid = vcpu->arch.pid; + break; + case 3: + pid = 0; + break; + default: + return -EINVAL; + } + proc_tbl_size = 1 << ((kvm->arch.process_table & PRTS_MASK) + 12); + if (pid * 16 >= proc_tbl_size) + return -EINVAL; + + /* Read partition table to find root of tree for effective PID */ + ret = kvm_read_guest(kvm, kvm->arch.process_table + pid * 16, + &prte, sizeof(prte)); + if (ret) + return ret; + + root = be64_to_cpu(prte); + rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) | + ((root & RTS2_MASK) >> RTS2_SHIFT); + bits = root & RPDS_MASK; + root = root & RPDB_MASK; + + /* P9 DD1 interprets RTS (radix tree size) differently */ + offset = rts + 31; + if (cpu_has_feature(CPU_FTR_POWER9_DD1)) + offset -= 3; + + /* current implementations only support 52-bit space */ + if (offset != 52) + return -EINVAL; + + for (level = 3; level >= 0; --level) { + if (level && bits != p9_supported_radix_bits[level]) + return -EINVAL; + if (level == 0 && !(bits == 5 || bits == 9)) + return -EINVAL; + offset -= bits; + index = (eaddr >> offset) & ((1UL << bits) - 1); + /* check that low bits of page table base are zero */ + if (root & ((1UL << (bits + 3)) - 1)) + return -EINVAL; + ret = kvm_read_guest(kvm, root + index * 8, + &rpte, sizeof(rpte)); + if (ret) + return ret; + pte = __be64_to_cpu(rpte); + if (!(pte & _PAGE_PRESENT)) + return -ENOENT; + if (pte & _PAGE_PTE) + break; + bits = pte & 0x1f; + root = pte & 0x0fffffffffffff00ul; + } + /* need a leaf at lowest level; 512GB pages not supported */ + if (level < 0 || level == 3) + return -EINVAL; + + /* offset is now log base 2 of the page size */ + gpa = pte & 0x01fffffffffff000ul; + if (gpa & ((1ul << offset) - 1)) + return -EINVAL; + gpa += eaddr & ((1ul << offset) - 1); + for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps) + if (offset == mmu_psize_defs[ps].shift) + break; + gpte->page_size = ps; + + gpte->eaddr = eaddr; + gpte->raddr = gpa; + + /* Work out permissions */ + gpte->may_read = !!(pte & _PAGE_READ); + gpte->may_write = !!(pte & _PAGE_WRITE); + gpte->may_execute = !!(pte & _PAGE_EXEC); + if (kvmppc_get_msr(vcpu) & MSR_PR) { + if (pte & _PAGE_PRIVILEGED) { + gpte->may_read = 0; + gpte->may_write = 0; + gpte->may_execute = 0; + } + } else { + if (!(pte & _PAGE_PRIVILEGED)) { + /* Check AMR/IAMR to see if strict mode is in force */ + if (vcpu->arch.amr & (1ul << 62)) + gpte->may_read = 0; + if (vcpu->arch.amr & (1ul << 63)) + gpte->may_write = 0; + if (vcpu->arch.iamr & (1ul << 62)) + gpte->may_execute = 0; + } + } + + 
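	/*
	 * Illustration of the geometry above: a 52-bit space consumes
	 * 13 + 9 + 9 index bits at levels 3..1, stepping offset
	 * 52 -> 39 -> 30 -> 21; a 5-bit leaf level then leaves offset 16
	 * (64k pages), while a 9-bit leaf leaves offset 12 (4k pages).
	 */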
return 0; +} + +#ifdef CONFIG_PPC_64K_PAGES +#define MMU_BASE_PSIZE MMU_PAGE_64K +#else +#define MMU_BASE_PSIZE MMU_PAGE_4K +#endif + +static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr, + unsigned int pshift) +{ + int psize = MMU_BASE_PSIZE; + + if (pshift >= PMD_SHIFT) + psize = MMU_PAGE_2M; + addr &= ~0xfffUL; + addr |= mmu_psize_defs[psize].ap << 5; + asm volatile("ptesync": : :"memory"); + asm volatile(PPC_TLBIE_5(%0, %1, 0, 0, 1) + : : "r" (addr), "r" (kvm->arch.lpid) : "memory"); + asm volatile("ptesync": : :"memory"); +} + +unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep, + unsigned long clr, unsigned long set, + unsigned long addr, unsigned int shift) +{ + unsigned long old = 0; + + if (!(clr & _PAGE_PRESENT) && cpu_has_feature(CPU_FTR_POWER9_DD1) && + pte_present(*ptep)) { + /* have to invalidate it first */ + old = __radix_pte_update(ptep, _PAGE_PRESENT, 0); + kvmppc_radix_tlbie_page(kvm, addr, shift); + set |= _PAGE_PRESENT; + old &= _PAGE_PRESENT; + } + return __radix_pte_update(ptep, clr, set) | old; +} + +void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + radix__set_pte_at(kvm->mm, addr, ptep, pte, 0); +} + +static struct kmem_cache *kvm_pte_cache; + +static pte_t *kvmppc_pte_alloc(void) +{ + return kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL); +} + +static void kvmppc_pte_free(pte_t *ptep) +{ + kmem_cache_free(kvm_pte_cache, ptep); +} + +static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, + unsigned int level, unsigned long mmu_seq) +{ + pgd_t *pgd; + pud_t *pud, *new_pud = NULL; + pmd_t *pmd, *new_pmd = NULL; + pte_t *ptep, *new_ptep = NULL; + unsigned long old; + int ret; + + /* Traverse the guest's 2nd-level tree, allocate new levels needed */ + pgd = kvm->arch.pgtable + pgd_index(gpa); + pud = NULL; + if (pgd_present(*pgd)) + pud = pud_offset(pgd, gpa); + else + new_pud = pud_alloc_one(kvm->mm, gpa); + + pmd = NULL; + if (pud && pud_present(*pud)) + pmd = pmd_offset(pud, gpa); + else + new_pmd = pmd_alloc_one(kvm->mm, gpa); + + if (level == 0 && !(pmd && pmd_present(*pmd))) + new_ptep = kvmppc_pte_alloc(); + + /* Check if we might have been invalidated; let the guest retry if so */ + spin_lock(&kvm->mmu_lock); + ret = -EAGAIN; + if (mmu_notifier_retry(kvm, mmu_seq)) + goto out_unlock; + + /* Now traverse again under the lock and change the tree */ + ret = -ENOMEM; + if (pgd_none(*pgd)) { + if (!new_pud) + goto out_unlock; + pgd_populate(kvm->mm, pgd, new_pud); + new_pud = NULL; + } + pud = pud_offset(pgd, gpa); + if (pud_none(*pud)) { + if (!new_pmd) + goto out_unlock; + pud_populate(kvm->mm, pud, new_pmd); + new_pmd = NULL; + } + pmd = pmd_offset(pud, gpa); + if (pmd_large(*pmd)) { + /* Someone else has instantiated a large page here; retry */ + ret = -EAGAIN; + goto out_unlock; + } + if (level == 1 && !pmd_none(*pmd)) { + /* + * There's a page table page here, but we wanted + * to install a large page. Tell the caller and let + * it try installing a normal page if it wants. 
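+ * (The caller in kvmppc_book3s_radix_page_fault() does exactly
+ * that: on -EBUSY it drops back to level 0, ORs the low PFN bits
+ * back in and calls kvmppc_create_pte() again.)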
+ */ + ret = -EBUSY; + goto out_unlock; + } + if (level == 0) { + if (pmd_none(*pmd)) { + if (!new_ptep) + goto out_unlock; + pmd_populate(kvm->mm, pmd, new_ptep); + new_ptep = NULL; + } + ptep = pte_offset_kernel(pmd, gpa); + if (pte_present(*ptep)) { + /* PTE was previously valid, so invalidate it */ + old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, + 0, gpa, 0); + kvmppc_radix_tlbie_page(kvm, gpa, 0); + if (old & _PAGE_DIRTY) + mark_page_dirty(kvm, gpa >> PAGE_SHIFT); + } + kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte); + } else { + kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte); + } + ret = 0; + + out_unlock: + spin_unlock(&kvm->mmu_lock); + if (new_pud) + pud_free(kvm->mm, new_pud); + if (new_pmd) + pmd_free(kvm->mm, new_pmd); + if (new_ptep) + kvmppc_pte_free(new_ptep); + return ret; +} + +int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned long ea, unsigned long dsisr) +{ + struct kvm *kvm = vcpu->kvm; + unsigned long mmu_seq, pte_size; + unsigned long gpa, gfn, hva, pfn; + struct kvm_memory_slot *memslot; + struct page *page = NULL, *pages[1]; + long ret, npages, ok; + unsigned int writing; + struct vm_area_struct *vma; + unsigned long flags; + pte_t pte, *ptep; + unsigned long pgflags; + unsigned int shift, level; + + /* Check for unusual errors */ + if (dsisr & DSISR_UNSUPP_MMU) { + pr_err("KVM: Got unsupported MMU fault\n"); + return -EFAULT; + } + if (dsisr & DSISR_BADACCESS) { + /* Reflect to the guest as DSI */ + pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr); + kvmppc_core_queue_data_storage(vcpu, ea, dsisr); + return RESUME_GUEST; + } + + /* Translate the logical address and get the page */ + gpa = vcpu->arch.fault_gpa & ~0xfffUL; + gpa &= ~0xF000000000000000ul; + gfn = gpa >> PAGE_SHIFT; + if (!(dsisr & DSISR_PGDIRFAULT)) + gpa |= ea & 0xfff; + memslot = gfn_to_memslot(kvm, gfn); + + /* No memslot means it's an emulated MMIO region */ + if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) { + if (dsisr & (DSISR_PGDIRFAULT | DSISR_BADACCESS | + DSISR_SET_RC)) { + /* + * Bad address in guest page table tree, or other + * unusual error - reflect it to the guest as DSI. + */ + kvmppc_core_queue_data_storage(vcpu, ea, dsisr); + return RESUME_GUEST; + } + return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, + dsisr & DSISR_ISSTORE); + } + + /* used to check for invalidations in progress */ + mmu_seq = kvm->mmu_notifier_seq; + smp_rmb(); + + writing = (dsisr & DSISR_ISSTORE) != 0; + hva = gfn_to_hva_memslot(memslot, gfn); + if (dsisr & DSISR_SET_RC) { + /* + * Need to set an R or C bit in the 2nd-level tables; + * if the relevant bits aren't already set in the linux + * page tables, fall through to do the gup_fast to + * set them in the linux page tables too. 
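+ * (R and C are the POWER reference and change bits, which the
+ * hardware sets on access and store respectively; pgflags below
+ * maps them onto Linux's _PAGE_ACCESSED and _PAGE_DIRTY.)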
+ */ + ok = 0; + pgflags = _PAGE_ACCESSED; + if (writing) + pgflags |= _PAGE_DIRTY; + local_irq_save(flags); + ptep = __find_linux_pte_or_hugepte(current->mm->pgd, hva, + NULL, NULL); + if (ptep) { + pte = READ_ONCE(*ptep); + if (pte_present(pte) && + (pte_val(pte) & pgflags) == pgflags) + ok = 1; + } + local_irq_restore(flags); + if (ok) { + spin_lock(&kvm->mmu_lock); + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) { + spin_unlock(&kvm->mmu_lock); + return RESUME_GUEST; + } + ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, + gpa, NULL, &shift); + if (ptep && pte_present(*ptep)) { + kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, + gpa, shift); + spin_unlock(&kvm->mmu_lock); + return RESUME_GUEST; + } + spin_unlock(&kvm->mmu_lock); + } + } + + ret = -EFAULT; + pfn = 0; + pte_size = PAGE_SIZE; + pgflags = _PAGE_READ | _PAGE_EXEC; + level = 0; + npages = get_user_pages_fast(hva, 1, writing, pages); + if (npages < 1) { + /* Check if it's an I/O mapping */ + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, hva); + if (vma && vma->vm_start <= hva && hva < vma->vm_end && + (vma->vm_flags & VM_PFNMAP)) { + pfn = vma->vm_pgoff + + ((hva - vma->vm_start) >> PAGE_SHIFT); + pgflags = pgprot_val(vma->vm_page_prot); + } + up_read(¤t->mm->mmap_sem); + if (!pfn) + return -EFAULT; + } else { + page = pages[0]; + pfn = page_to_pfn(page); + if (PageHuge(page)) { + page = compound_head(page); + pte_size <<= compound_order(page); + /* See if we can insert a 2MB large-page PTE here */ + if (pte_size >= PMD_SIZE && + (gpa & PMD_MASK & PAGE_MASK) == + (hva & PMD_MASK & PAGE_MASK)) { + level = 1; + pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1); + } + } + /* See if we can provide write access */ + if (writing) { + /* + * We assume gup_fast has set dirty on the host PTE. + */ + pgflags |= _PAGE_WRITE; + } else { + local_irq_save(flags); + ptep = __find_linux_pte_or_hugepte(current->mm->pgd, + hva, NULL, NULL); + if (ptep && pte_write(*ptep) && pte_dirty(*ptep)) + pgflags |= _PAGE_WRITE; + local_irq_restore(flags); + } + } + + /* + * Compute the PTE value that we need to insert. + */ + pgflags |= _PAGE_PRESENT | _PAGE_PTE | _PAGE_ACCESSED; + if (pgflags & _PAGE_WRITE) + pgflags |= _PAGE_DIRTY; + pte = pfn_pte(pfn, __pgprot(pgflags)); + + /* Allocate space in the tree and write the PTE */ + ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq); + if (ret == -EBUSY) { + /* + * There's already a PMD where wanted to install a large page; + * for now, fall back to installing a small page. 
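+ * (The 2MB-aligned pfn computed earlier had its low bits cleared,
+ * so the gfn offset within the 2MB region is ORed back in before
+ * the small-page PTE is rebuilt.)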
+ */ + level = 0; + pfn |= gfn & ((PMD_SIZE >> PAGE_SHIFT) - 1); + pte = pfn_pte(pfn, __pgprot(pgflags)); + ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq); + } + if (ret == 0 || ret == -EAGAIN) + ret = RESUME_GUEST; + + if (page) { + /* + * We drop pages[0] here, not page because page might + * have been set to the head page of a compound, but + * we have to drop the reference on the correct tail + * page to match the get inside gup() + */ + put_page(pages[0]); + } + return ret; +} + +static void mark_pages_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn, unsigned int order) +{ + unsigned long i, limit; + unsigned long *dp; + + if (!memslot->dirty_bitmap) + return; + limit = 1ul << order; + if (limit < BITS_PER_LONG) { + for (i = 0; i < limit; ++i) + mark_page_dirty(kvm, gfn + i); + return; + } + dp = memslot->dirty_bitmap + (gfn - memslot->base_gfn); + limit /= BITS_PER_LONG; + for (i = 0; i < limit; ++i) + *dp++ = ~0ul; +} + +/* Called with kvm->lock held */ +int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn) +{ + pte_t *ptep; + unsigned long gpa = gfn << PAGE_SHIFT; + unsigned int shift; + unsigned long old; + + ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa, + NULL, &shift); + if (ptep && pte_present(*ptep)) { + old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, 0, + gpa, shift); + kvmppc_radix_tlbie_page(kvm, gpa, shift); + if (old & _PAGE_DIRTY) { + if (!shift) + mark_page_dirty(kvm, gfn); + else + mark_pages_dirty(kvm, memslot, + gfn, shift - PAGE_SHIFT); + } + } + return 0; +} + +/* Called with kvm->lock held */ +int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn) +{ + pte_t *ptep; + unsigned long gpa = gfn << PAGE_SHIFT; + unsigned int shift; + int ref = 0; + + ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa, + NULL, &shift); + if (ptep && pte_present(*ptep) && pte_young(*ptep)) { + kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0, + gpa, shift); + /* XXX need to flush tlb here? */ + ref = 1; + } + return ref; +} + +/* Called with kvm->lock held */ +int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn) +{ + pte_t *ptep; + unsigned long gpa = gfn << PAGE_SHIFT; + unsigned int shift; + int ref = 0; + + ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa, + NULL, &shift); + if (ptep && pte_present(*ptep) && pte_young(*ptep)) + ref = 1; + return ref; +} + +/* Returns the number of PAGE_SIZE pages that are dirty */ +static int kvm_radix_test_clear_dirty(struct kvm *kvm, + struct kvm_memory_slot *memslot, int pagenum) +{ + unsigned long gfn = memslot->base_gfn + pagenum; + unsigned long gpa = gfn << PAGE_SHIFT; + pte_t *ptep; + unsigned int shift; + int ret = 0; + + ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa, + NULL, &shift); + if (ptep && pte_present(*ptep) && pte_dirty(*ptep)) { + ret = 1; + if (shift) + ret = 1 << (shift - PAGE_SHIFT); + kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0, + gpa, shift); + kvmppc_radix_tlbie_page(kvm, gpa, shift); + } + return ret; +} + +long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm, + struct kvm_memory_slot *memslot, unsigned long *map) +{ + unsigned long i, j; + unsigned long n, *p; + int npages; + + /* + * Radix accumulates dirty bits in the first half of the + * memslot's dirty_bitmap area, for when pages are paged + * out or modified by the host directly. Pick up these + * bits and add them to the map. 
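+ * (Layout, with n = kvm_dirty_bitmap_bytes(memslot):
+ *	bytes [0, n)  - accumulation area, harvested here;
+ *	bytes [n, 2n) - buffer that KVM_GET_DIRTY_LOG actually
+ *			returns to userspace.)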
+ */ + n = kvm_dirty_bitmap_bytes(memslot) / sizeof(long); + p = memslot->dirty_bitmap; + for (i = 0; i < n; ++i) + map[i] |= xchg(&p[i], 0); + + for (i = 0; i < memslot->npages; i = j) { + npages = kvm_radix_test_clear_dirty(kvm, memslot, i); + + /* + * Note that if npages > 0 then i must be a multiple of npages, + * since huge pages are only used to back the guest at guest + * real addresses that are a multiple of their size. + * Since we have at most one PTE covering any given guest + * real address, if npages > 1 we can skip to i + npages. + */ + j = i + 1; + if (npages) + for (j = i; npages; ++j, --npages) + __set_bit_le(j, map); + } + return 0; +} + +static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info, + int psize, int *indexp) +{ + if (!mmu_psize_defs[psize].shift) + return; + info->ap_encodings[*indexp] = mmu_psize_defs[psize].shift | + (mmu_psize_defs[psize].ap << 29); + ++(*indexp); +} + +int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info) +{ + int i; + + if (!radix_enabled()) + return -EINVAL; + memset(info, 0, sizeof(*info)); + + /* 4k page size */ + info->geometries[0].page_shift = 12; + info->geometries[0].level_bits[0] = 9; + for (i = 1; i < 4; ++i) + info->geometries[0].level_bits[i] = p9_supported_radix_bits[i]; + /* 64k page size */ + info->geometries[1].page_shift = 16; + for (i = 0; i < 4; ++i) + info->geometries[1].level_bits[i] = p9_supported_radix_bits[i]; + + i = 0; + add_rmmu_ap_encoding(info, MMU_PAGE_4K, &i); + add_rmmu_ap_encoding(info, MMU_PAGE_64K, &i); + add_rmmu_ap_encoding(info, MMU_PAGE_2M, &i); + add_rmmu_ap_encoding(info, MMU_PAGE_1G, &i); + + return 0; +} + +int kvmppc_init_vm_radix(struct kvm *kvm) +{ + kvm->arch.pgtable = pgd_alloc(kvm->mm); + if (!kvm->arch.pgtable) + return -ENOMEM; + return 0; +} + +void kvmppc_free_radix(struct kvm *kvm) +{ + unsigned long ig, iu, im; + pte_t *pte; + pmd_t *pmd; + pud_t *pud; + pgd_t *pgd; + + if (!kvm->arch.pgtable) + return; + pgd = kvm->arch.pgtable; + for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) { + if (!pgd_present(*pgd)) + continue; + pud = pud_offset(pgd, 0); + for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++pud) { + if (!pud_present(*pud)) + continue; + pmd = pmd_offset(pud, 0); + for (im = 0; im < PTRS_PER_PMD; ++im, ++pmd) { + if (pmd_huge(*pmd)) { + pmd_clear(pmd); + continue; + } + if (!pmd_present(*pmd)) + continue; + pte = pte_offset_map(pmd, 0); + memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE); + kvmppc_pte_free(pte); + pmd_clear(pmd); + } + pmd_free(kvm->mm, pmd_offset(pud, 0)); + pud_clear(pud); + } + pud_free(kvm->mm, pud_offset(pgd, 0)); + pgd_clear(pgd); + } + pgd_free(kvm->mm, kvm->arch.pgtable); +} + +static void pte_ctor(void *addr) +{ + memset(addr, 0, PTE_TABLE_SIZE); +} + +int kvmppc_radix_init(void) +{ + unsigned long size = sizeof(void *) << PTE_INDEX_SIZE; + + kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor); + if (!kvm_pte_cache) + return -ENOMEM; + return 0; +} + +void kvmppc_radix_exit(void) +{ + kmem_cache_destroy(kvm_pte_cache); +} diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index ec34e39471a7..1e107ece4e37 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -182,7 +182,8 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) ++vcpu->stat.halt_wakeup; } - if (kvmppc_ipi_thread(vcpu->arch.thread_cpu)) + cpu = READ_ONCE(vcpu->arch.thread_cpu); + if (cpu >= 0 && kvmppc_ipi_thread(cpu)) return; /* CPU points to the first thread of the core */ @@ -773,12 +774,8 @@ int 
kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) } tvcpu->arch.prodded = 1; smp_mb(); - if (vcpu->arch.ceded) { - if (swait_active(&vcpu->wq)) { - swake_up(&vcpu->wq); - vcpu->stat.halt_wakeup++; - } - } + if (tvcpu->arch.ceded) + kvmppc_fast_vcpu_kick_hv(tvcpu); break; case H_CONFER: target = kvmppc_get_gpr(vcpu, 4); @@ -1135,7 +1132,7 @@ static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr, /* * Userspace can only modify DPFD (default prefetch depth), * ILE (interrupt little-endian) and TC (translation control). - * On POWER8 userspace can also modify AIL (alt. interrupt loc.) + * On POWER8 and POWER9 userspace can also modify AIL (alt. interrupt loc.). */ mask = LPCR_DPFD | LPCR_ILE | LPCR_TC; if (cpu_has_feature(CPU_FTR_ARCH_207S)) @@ -1821,6 +1818,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, vcpu->arch.vcore = vcore; vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid; vcpu->arch.thread_cpu = -1; + vcpu->arch.prev_cpu = -1; vcpu->arch.cpu_type = KVM_CPU_3S_64; kvmppc_sanity_check(vcpu); @@ -1950,11 +1948,33 @@ static void kvmppc_release_hwthread(int cpu) tpaca->kvm_hstate.kvm_split_mode = NULL; } +static void do_nothing(void *x) +{ +} + +static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu) +{ + int i; + + cpu = cpu_first_thread_sibling(cpu); + cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush); + /* + * Make sure setting of bit in need_tlb_flush precedes + * testing of cpu_in_guest bits. The matching barrier on + * the other side is the first smp_mb() in kvmppc_run_core(). + */ + smp_mb(); + for (i = 0; i < threads_per_core; ++i) + if (cpumask_test_cpu(cpu + i, &kvm->arch.cpu_in_guest)) + smp_call_function_single(cpu + i, do_nothing, NULL, 1); +} + static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc) { int cpu; struct paca_struct *tpaca; struct kvmppc_vcore *mvc = vc->master_vcore; + struct kvm *kvm = vc->kvm; cpu = vc->pcpu; if (vcpu) { @@ -1965,6 +1985,27 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc) cpu += vcpu->arch.ptid; vcpu->cpu = mvc->pcpu; vcpu->arch.thread_cpu = cpu; + + /* + * With radix, the guest can do TLB invalidations itself, + * and it could choose to use the local form (tlbiel) if + * it is invalidating a translation that has only ever been + * used on one vcpu. However, that doesn't mean it has + * only ever been used on one physical cpu, since vcpus + * can move around between pcpus. To cope with this, when + * a vcpu moves from one pcpu to another, we need to tell + * any vcpus running on the same core as this vcpu previously + * ran to flush the TLB. The TLB is shared between threads, + * so we use a single bit in .need_tlb_flush for all 4 threads. 
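+ * (radix_flush_cpu() above sets that bit for the previous core
+ * and IPIs any of its threads still in the guest; the bit is then
+ * consumed by the TLB flush loop on guest entry in
+ * book3s_hv_rmhandlers.S.)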
+ */ + if (kvm_is_radix(kvm) && vcpu->arch.prev_cpu != cpu) { + if (vcpu->arch.prev_cpu >= 0 && + cpu_first_thread_sibling(vcpu->arch.prev_cpu) != + cpu_first_thread_sibling(cpu)) + radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu); + vcpu->arch.prev_cpu = cpu; + } + cpumask_set_cpu(cpu, &kvm->arch.cpu_in_guest); } tpaca = &paca[cpu]; tpaca->kvm_hstate.kvm_vcpu = vcpu; @@ -2552,6 +2593,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) kvmppc_release_hwthread(pcpu + i); if (sip && sip->napped[i]) kvmppc_ipi_thread(pcpu + i); + cpumask_clear_cpu(pcpu + i, &vc->kvm->arch.cpu_in_guest); } kvmppc_set_host_core(pcpu); @@ -2620,7 +2662,8 @@ static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc) int i; for_each_runnable_thread(i, vcpu, vc) { - if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded) + if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded || + vcpu->arch.prodded) return 1; } @@ -2806,7 +2849,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) break; n_ceded = 0; for_each_runnable_thread(i, v, vc) { - if (!v->arch.pending_exceptions) + if (!v->arch.pending_exceptions && !v->arch.prodded) n_ceded += v->arch.ceded; else v->arch.ceded = 0; @@ -2877,7 +2920,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) smp_mb(); /* On the first time here, set up HTAB and VRMA */ - if (!vcpu->kvm->arch.hpte_setup_done) { + if (!kvm_is_radix(vcpu->kvm) && !vcpu->kvm->arch.hpte_setup_done) { r = kvmppc_hv_setup_htab_rma(vcpu); if (r) goto out; @@ -2939,6 +2982,13 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm, { struct kvm_ppc_one_seg_page_size *sps; + /* + * Since we don't yet support HPT guests on a radix host, + * return an error if the host uses radix. + */ + if (radix_enabled()) + return -EINVAL; + info->flags = KVM_PPC_PAGE_SIZES_REAL; if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) info->flags |= KVM_PPC_1T_SEGMENTS; @@ -2961,8 +3011,10 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm, { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; - int r; + int i, r; unsigned long n; + unsigned long *buf; + struct kvm_vcpu *vcpu; mutex_lock(&kvm->slots_lock); @@ -2976,15 +3028,32 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm, if (!memslot->dirty_bitmap) goto out; + /* + * Use second half of bitmap area because radix accumulates + * bits in the first half. 
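+ * (kvmppc_hv_get_dirty_log_radix() transfers the first half into
+ * this buffer with xchg(), so dirty bits recorded by the page-out
+ * and host-write paths are not lost between calls.)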
+ */ n = kvm_dirty_bitmap_bytes(memslot); - memset(memslot->dirty_bitmap, 0, n); + buf = memslot->dirty_bitmap + n / sizeof(long); + memset(buf, 0, n); - r = kvmppc_hv_get_dirty_log(kvm, memslot, memslot->dirty_bitmap); + if (kvm_is_radix(kvm)) + r = kvmppc_hv_get_dirty_log_radix(kvm, memslot, buf); + else + r = kvmppc_hv_get_dirty_log_hpt(kvm, memslot, buf); if (r) goto out; + /* Harvest dirty bits from VPA and DTL updates */ + /* Note: we never modify the SLB shadow buffer areas */ + kvm_for_each_vcpu(i, vcpu, kvm) { + spin_lock(&vcpu->arch.vpa_update_lock); + kvmppc_harvest_vpa_dirty(&vcpu->arch.vpa, memslot, buf); + kvmppc_harvest_vpa_dirty(&vcpu->arch.dtl, memslot, buf); + spin_unlock(&vcpu->arch.vpa_update_lock); + } + r = -EFAULT; - if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) + if (copy_to_user(log->dirty_bitmap, buf, n)) goto out; r = 0; @@ -3005,6 +3074,15 @@ static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free, static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot, unsigned long npages) { + /* + * For now, if radix_enabled() then we only support radix guests, + * and in that case we don't need the rmap array. + */ + if (radix_enabled()) { + slot->arch.rmap = NULL; + return 0; + } + slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap)); if (!slot->arch.rmap) return -ENOMEM; @@ -3037,7 +3115,7 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, if (npages) atomic64_inc(&kvm->arch.mmio_update); - if (npages && old->npages) { + if (npages && old->npages && !kvm_is_radix(kvm)) { /* * If modifying a memslot, reset all the rmap dirty bits. * If this is a new memslot, we don't need to do anything @@ -3046,7 +3124,7 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, */ slots = kvm_memslots(kvm); memslot = id_to_memslot(slots, mem->slot); - kvmppc_hv_get_dirty_log(kvm, memslot, NULL); + kvmppc_hv_get_dirty_log_hpt(kvm, memslot, NULL); } } @@ -3085,14 +3163,20 @@ static void kvmppc_setup_partition_table(struct kvm *kvm) { unsigned long dw0, dw1; - /* PS field - page size for VRMA */ - dw0 = ((kvm->arch.vrma_slb_v & SLB_VSID_L) >> 1) | - ((kvm->arch.vrma_slb_v & SLB_VSID_LP) << 1); - /* HTABSIZE and HTABORG fields */ - dw0 |= kvm->arch.sdr1; + if (!kvm_is_radix(kvm)) { + /* PS field - page size for VRMA */ + dw0 = ((kvm->arch.vrma_slb_v & SLB_VSID_L) >> 1) | + ((kvm->arch.vrma_slb_v & SLB_VSID_LP) << 1); + /* HTABSIZE and HTABORG fields */ + dw0 |= kvm->arch.sdr1; - /* Second dword has GR=0; other fields are unused since UPRT=0 */ - dw1 = 0; + /* Second dword as set by userspace */ + dw1 = kvm->arch.process_table; + } else { + dw0 = PATB_HR | radix__get_tree_size() | + __pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE; + dw1 = PATB_GR | kvm->arch.process_table; + } mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1); } @@ -3113,12 +3197,23 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) goto out; /* another vcpu beat us to it */ /* Allocate hashed page table (if not done already) and reset it */ - if (!kvm->arch.hpt_virt) { - err = kvmppc_alloc_hpt(kvm, NULL); - if (err) { + if (!kvm->arch.hpt.virt) { + int order = KVM_DEFAULT_HPT_ORDER; + struct kvm_hpt_info info; + + err = kvmppc_allocate_hpt(&info, order); + /* If we get here, it means userspace didn't specify a + * size explicitly. So, try successively smaller + * sizes if the default failed. 
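+ * (order is log2 of the HPT size in bytes; KVM_DEFAULT_HPT_ORDER
+ * is 24, i.e. the default 16MB table, and the loop below steps
+ * down towards PPC_MIN_HPT_ORDER.)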
*/ + while ((err == -ENOMEM) && --order >= PPC_MIN_HPT_ORDER) + err = kvmppc_allocate_hpt(&info, order); + + if (err < 0) { pr_err("KVM: Couldn't alloc HPT\n"); goto out; } + + kvmppc_set_hpt(kvm, &info); } /* Look up the memslot for guest physical address 0 */ @@ -3262,6 +3357,7 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) { unsigned long lpcr, lpid; char buf[32]; + int ret; /* Allocate the guest's logical partition ID */ @@ -3309,13 +3405,33 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) lpcr |= LPCR_HVICE; } + /* + * For now, if the host uses radix, the guest must be radix. + */ + if (radix_enabled()) { + kvm->arch.radix = 1; + lpcr &= ~LPCR_VPM1; + lpcr |= LPCR_UPRT | LPCR_GTSE | LPCR_HR; + ret = kvmppc_init_vm_radix(kvm); + if (ret) { + kvmppc_free_lpid(kvm->arch.lpid); + return ret; + } + kvmppc_setup_partition_table(kvm); + } + kvm->arch.lpcr = lpcr; + /* Initialization for future HPT resizes */ + kvm->arch.resize_hpt = NULL; + /* * Work out how many sets the TLB has, for the use of * the TLB invalidation loop in book3s_hv_rmhandlers.S. */ - if (cpu_has_feature(CPU_FTR_ARCH_300)) + if (kvm_is_radix(kvm)) + kvm->arch.tlb_sets = POWER9_TLB_SETS_RADIX; /* 128 */ + else if (cpu_has_feature(CPU_FTR_ARCH_300)) kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH; /* 256 */ else if (cpu_has_feature(CPU_FTR_ARCH_207S)) kvm->arch.tlb_sets = POWER8_TLB_SETS; /* 512 */ @@ -3325,8 +3441,11 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) /* * Track that we now have a HV mode VM active. This blocks secondary * CPU threads from coming online. + * On POWER9, we only need to do this for HPT guests on a radix + * host, which is not yet supported. */ - kvm_hv_vm_activated(); + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + kvm_hv_vm_activated(); /* * Create a debugfs directory for the VM @@ -3352,11 +3471,17 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm) { debugfs_remove_recursive(kvm->arch.debugfs_dir); - kvm_hv_vm_deactivated(); + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + kvm_hv_vm_deactivated(); kvmppc_free_vcores(kvm); - kvmppc_free_hpt(kvm); + kvmppc_free_lpid(kvm->arch.lpid); + + if (kvm_is_radix(kvm)) + kvmppc_free_radix(kvm); + else + kvmppc_free_hpt(&kvm->arch.hpt); kvmppc_free_pimap(kvm); } @@ -3385,11 +3510,6 @@ static int kvmppc_core_check_processor_compat_hv(void) if (!cpu_has_feature(CPU_FTR_HVMODE) || !cpu_has_feature(CPU_FTR_ARCH_206)) return -EIO; - /* - * Disable KVM for Power9 in radix mode. 
- */ - if (cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled()) - return -EIO; return 0; } @@ -3587,12 +3707,9 @@ static long kvm_arch_vm_ioctl_hv(struct file *filp, r = -EFAULT; if (get_user(htab_order, (u32 __user *)argp)) break; - r = kvmppc_alloc_reset_hpt(kvm, &htab_order); + r = kvmppc_alloc_reset_hpt(kvm, htab_order); if (r) break; - r = -EFAULT; - if (put_user(htab_order, (u32 __user *)argp)) - break; r = 0; break; } @@ -3607,6 +3724,28 @@ static long kvm_arch_vm_ioctl_hv(struct file *filp, break; } + case KVM_PPC_RESIZE_HPT_PREPARE: { + struct kvm_ppc_resize_hpt rhpt; + + r = -EFAULT; + if (copy_from_user(&rhpt, argp, sizeof(rhpt))) + break; + + r = kvm_vm_ioctl_resize_hpt_prepare(kvm, &rhpt); + break; + } + + case KVM_PPC_RESIZE_HPT_COMMIT: { + struct kvm_ppc_resize_hpt rhpt; + + r = -EFAULT; + if (copy_from_user(&rhpt, argp, sizeof(rhpt))) + break; + + r = kvm_vm_ioctl_resize_hpt_commit(kvm, &rhpt); + break; + } + default: r = -ENOTTY; } @@ -3657,6 +3796,41 @@ static void init_default_hcalls(void) } } +static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) +{ + unsigned long lpcr; + int radix; + + /* If not on a POWER9, reject it */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -ENODEV; + + /* If any unknown flags set, reject it */ + if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE)) + return -EINVAL; + + /* We can't change a guest to/from radix yet */ + radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX); + if (radix != kvm_is_radix(kvm)) + return -EINVAL; + + /* GR (guest radix) bit in process_table field must match */ + if (!!(cfg->process_table & PATB_GR) != radix) + return -EINVAL; + + /* Process table size field must be reasonable, i.e. <= 24 */ + if ((cfg->process_table & PRTS_MASK) > 24) + return -EINVAL; + + kvm->arch.process_table = cfg->process_table; + kvmppc_setup_partition_table(kvm); + + lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? 
LPCR_GTSE : 0; + kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE); + + return 0; +} + static struct kvmppc_ops kvm_ops_hv = { .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv, .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv, @@ -3694,6 +3868,8 @@ static struct kvmppc_ops kvm_ops_hv = { .irq_bypass_add_producer = kvmppc_irq_bypass_add_producer_hv, .irq_bypass_del_producer = kvmppc_irq_bypass_del_producer_hv, #endif + .configure_mmu = kvmhv_configure_mmu, + .get_rmmu_info = kvmhv_get_rmmu_info, }; static int kvm_init_subcore_bitmap(void) @@ -3728,6 +3904,11 @@ static int kvm_init_subcore_bitmap(void) return 0; } +static int kvmppc_radix_possible(void) +{ + return cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled(); +} + static int kvmppc_book3s_init_hv(void) { int r; @@ -3767,12 +3948,19 @@ static int kvmppc_book3s_init_hv(void) init_vcore_lists(); r = kvmppc_mmu_hv_init(); + if (r) + return r; + + if (kvmppc_radix_possible()) + r = kvmppc_radix_init(); return r; } static void kvmppc_book3s_exit_hv(void) { kvmppc_free_host_rm_ops(); + if (kvmppc_radix_possible()) + kvmppc_radix_exit(); kvmppc_hv_ops = NULL; } diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 5bb24be0b346..96e7e609f621 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -29,6 +29,11 @@ #include <asm/opal.h> #include <asm/smp.h> +static bool in_realmode(void) +{ + return !(mfmsr() & MSR_IR); +} + #define KVM_CMA_CHUNK_ORDER 18 /* @@ -52,19 +57,19 @@ static int __init early_parse_kvm_cma_resv(char *p) } early_param("kvm_cma_resv_ratio", early_parse_kvm_cma_resv); -struct page *kvm_alloc_hpt(unsigned long nr_pages) +struct page *kvm_alloc_hpt_cma(unsigned long nr_pages) { VM_BUG_ON(order_base_2(nr_pages) < KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); return cma_alloc(kvm_cma, nr_pages, order_base_2(HPT_ALIGN_PAGES)); } -EXPORT_SYMBOL_GPL(kvm_alloc_hpt); +EXPORT_SYMBOL_GPL(kvm_alloc_hpt_cma); -void kvm_release_hpt(struct page *page, unsigned long nr_pages) +void kvm_free_hpt_cma(struct page *page, unsigned long nr_pages) { cma_release(kvm_cma, page, nr_pages); } -EXPORT_SYMBOL_GPL(kvm_release_hpt); +EXPORT_SYMBOL_GPL(kvm_free_hpt_cma); /** * kvm_cma_reserve() - reserve area for kvm hash pagetable @@ -200,7 +205,6 @@ static inline void rm_writeb(unsigned long paddr, u8 val) /* * Send an interrupt or message to another CPU. - * This can only be called in real mode. * The caller needs to include any barrier needed to order writes * to memory vs. the IPI/message. */ @@ -226,7 +230,9 @@ void kvmhv_rm_send_ipi(int cpu) /* Else poke the target with an IPI */ xics_phys = paca[cpu].kvm_hstate.xics_phys; - if (xics_phys) + if (!in_realmode()) + opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY); + else if (xics_phys) rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY); else opal_rm_int_set_mfrr(get_hard_smp_processor_id(cpu), @@ -412,14 +418,15 @@ static long kvmppc_read_one_intr(bool *again) /* Now read the interrupt from the ICP */ xics_phys = local_paca->kvm_hstate.xics_phys; - if (!xics_phys) { - /* Use OPAL to read the XIRR */ + rc = 0; + if (!in_realmode()) + rc = opal_int_get_xirr(&xirr, false); + else if (!xics_phys) rc = opal_rm_int_get_xirr(&xirr, false); - if (rc < 0) - return 1; - } else { + else xirr = _lwzcix(xics_phys + XICS_XIRR); - } + if (rc < 0) + return 1; /* * Save XIRR for later. Since we get control in reverse endian @@ -445,15 +452,19 @@ static long kvmppc_read_one_intr(bool *again) * If it is an IPI, clear the MFRR and EOI it. 
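 * There are three cases below: virtual mode, where we use the
 * normal OPAL calls; real mode with a directly accessible XICS,
 * where we use cache-inhibited stores; and real mode without one,
 * where we fall back to the real-mode OPAL calls.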
*/ if (xisr == XICS_IPI) { - if (xics_phys) { + rc = 0; + if (!in_realmode()) { + opal_int_set_mfrr(hard_smp_processor_id(), 0xff); + rc = opal_int_eoi(h_xirr); + } else if (xics_phys) { _stbcix(xics_phys + XICS_MFRR, 0xff); _stwcix(xics_phys + XICS_XIRR, xirr); } else { opal_rm_int_set_mfrr(hard_smp_processor_id(), 0xff); rc = opal_rm_int_eoi(h_xirr); - /* If rc > 0, there is another interrupt pending */ - *again = rc > 0; } + /* If rc > 0, there is another interrupt pending */ + *again = rc > 0; /* * Need to ensure side effects of above stores @@ -471,7 +482,10 @@ static long kvmppc_read_one_intr(bool *again) /* We raced with the host, * we need to resend that IPI, bummer */ - if (xics_phys) + if (!in_realmode()) + opal_int_set_mfrr(hard_smp_processor_id(), + IPI_PRIORITY); + else if (xics_phys) _stbcix(xics_phys + XICS_MFRR, IPI_PRIORITY); else opal_rm_int_set_mfrr(hard_smp_processor_id(), diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 9ef3c4be952f..6fca970373ee 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -43,6 +43,7 @@ static void *real_vmalloc_addr(void *x) static int global_invalidates(struct kvm *kvm, unsigned long flags) { int global; + int cpu; /* * If there is only one vcore, and it's currently running, @@ -60,8 +61,14 @@ static int global_invalidates(struct kvm *kvm, unsigned long flags) /* any other core might now have stale TLB entries... */ smp_wmb(); cpumask_setall(&kvm->arch.need_tlb_flush); - cpumask_clear_cpu(local_paca->kvm_hstate.kvm_vcore->pcpu, - &kvm->arch.need_tlb_flush); + cpu = local_paca->kvm_hstate.kvm_vcore->pcpu; + /* + * On POWER9, threads are independent but the TLB is shared, + * so use the bit for the first thread to represent the core. 
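+ * (cpu_first_thread_sibling() folds all four thread ids of a
+ * core onto the first, matching radix_flush_cpu() and the
+ * need_tlb_flush test on guest entry in rmhandlers.S.)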
+ */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) + cpu = cpu_first_thread_sibling(cpu); + cpumask_clear_cpu(cpu, &kvm->arch.need_tlb_flush); } return global; @@ -79,10 +86,10 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, if (*rmap & KVMPPC_RMAP_PRESENT) { i = *rmap & KVMPPC_RMAP_INDEX; - head = &kvm->arch.revmap[i]; + head = &kvm->arch.hpt.rev[i]; if (realmode) head = real_vmalloc_addr(head); - tail = &kvm->arch.revmap[head->back]; + tail = &kvm->arch.hpt.rev[head->back]; if (realmode) tail = real_vmalloc_addr(tail); rev->forw = i; @@ -147,8 +154,8 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index, lock_rmap(rmap); head = *rmap & KVMPPC_RMAP_INDEX; - next = real_vmalloc_addr(&kvm->arch.revmap[rev->forw]); - prev = real_vmalloc_addr(&kvm->arch.revmap[rev->back]); + next = real_vmalloc_addr(&kvm->arch.hpt.rev[rev->forw]); + prev = real_vmalloc_addr(&kvm->arch.hpt.rev[rev->back]); next->back = rev->back; prev->forw = rev->forw; if (head == pte_index) { @@ -182,6 +189,8 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, unsigned long mmu_seq; unsigned long rcbits, irq_flags = 0; + if (kvm_is_radix(kvm)) + return H_FUNCTION; psize = hpte_page_size(pteh, ptel); if (!psize) return H_PARAMETER; @@ -283,11 +292,11 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, /* Find and lock the HPTEG slot to use */ do_insert: - if (pte_index >= kvm->arch.hpt_npte) + if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt)) return H_PARAMETER; if (likely((flags & H_EXACT) == 0)) { pte_index &= ~7UL; - hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); + hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); for (i = 0; i < 8; ++i) { if ((be64_to_cpu(*hpte) & HPTE_V_VALID) == 0 && try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID | @@ -318,7 +327,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, } pte_index += i; } else { - hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); + hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID | HPTE_V_ABSENT)) { /* Lock the slot and check again */ @@ -335,7 +344,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, } /* Save away the guest's idea of the second HPTE dword */ - rev = &kvm->arch.revmap[pte_index]; + rev = &kvm->arch.hpt.rev[pte_index]; if (realmode) rev = real_vmalloc_addr(rev); if (rev) { @@ -458,9 +467,11 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, struct revmap_entry *rev; u64 pte, orig_pte, pte_r; - if (pte_index >= kvm->arch.hpt_npte) + if (kvm_is_radix(kvm)) + return H_FUNCTION; + if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt)) return H_PARAMETER; - hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); + hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); pte = orig_pte = be64_to_cpu(hpte[0]); @@ -476,7 +487,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, return H_NOT_FOUND; } - rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]); v = pte & ~HPTE_V_HVLOCK; if (v & HPTE_V_VALID) { hpte[0] &= ~cpu_to_be64(HPTE_V_VALID); @@ -529,6 +540,8 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) struct revmap_entry *rev, *revs[4]; u64 hp0, hp1; + if (kvm_is_radix(kvm)) + return H_FUNCTION; global = global_invalidates(kvm, 0); for (i = 0; i < 4 && ret == H_SUCCESS; ) { n = 0; @@ -544,13 +557,13 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu 
*vcpu) break; } if (req != 1 || flags == 3 || - pte_index >= kvm->arch.hpt_npte) { + pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt)) { /* parameter error */ args[j] = ((0xa0 | flags) << 56) + pte_index; ret = H_PARAMETER; break; } - hp = (__be64 *) (kvm->arch.hpt_virt + (pte_index << 4)); + hp = (__be64 *) (kvm->arch.hpt.virt + (pte_index << 4)); /* to avoid deadlock, don't spin except for first */ if (!try_lock_hpte(hp, HPTE_V_HVLOCK)) { if (n) @@ -587,7 +600,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) } args[j] = ((0x80 | flags) << 56) + pte_index; - rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]); note_hpte_modification(kvm, rev); if (!(hp0 & HPTE_V_VALID)) { @@ -642,10 +655,12 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, unsigned long v, r, rb, mask, bits; u64 pte_v, pte_r; - if (pte_index >= kvm->arch.hpt_npte) + if (kvm_is_radix(kvm)) + return H_FUNCTION; + if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt)) return H_PARAMETER; - hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); + hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); v = pte_v = be64_to_cpu(hpte[0]); @@ -665,7 +680,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, /* Update guest view of 2nd HPTE dword */ mask = HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_HI | HPTE_R_KEY_LO; - rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]); if (rev) { r = (rev->guest_rpte & ~mask) | bits; rev->guest_rpte = r; @@ -711,15 +726,17 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags, int i, n = 1; struct revmap_entry *rev = NULL; - if (pte_index >= kvm->arch.hpt_npte) + if (kvm_is_radix(kvm)) + return H_FUNCTION; + if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt)) return H_PARAMETER; if (flags & H_READ_4) { pte_index &= ~3; n = 4; } - rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]); for (i = 0; i < n; ++i, ++pte_index) { - hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); + hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK; r = be64_to_cpu(hpte[1]); if (cpu_has_feature(CPU_FTR_ARCH_300)) { @@ -750,11 +767,13 @@ long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags, unsigned long *rmap; long ret = H_NOT_FOUND; - if (pte_index >= kvm->arch.hpt_npte) + if (kvm_is_radix(kvm)) + return H_FUNCTION; + if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt)) return H_PARAMETER; - rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); - hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); + rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]); + hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); v = be64_to_cpu(hpte[0]); @@ -796,11 +815,13 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags, unsigned long *rmap; long ret = H_NOT_FOUND; - if (pte_index >= kvm->arch.hpt_npte) + if (kvm_is_radix(kvm)) + return H_FUNCTION; + if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt)) return H_PARAMETER; - rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); - hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); + rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]); + hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) 
cpu_relax(); v = be64_to_cpu(hpte[0]); @@ -949,7 +970,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, somask = (1UL << 28) - 1; vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT; } - hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvm->arch.hpt_mask; + hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvmppc_hpt_mask(&kvm->arch.hpt); avpn = slb_v & ~(somask >> 16); /* also includes B */ avpn |= (eaddr & somask) >> 16; @@ -960,7 +981,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, val |= avpn; for (;;) { - hpte = (__be64 *)(kvm->arch.hpt_virt + (hash << 7)); + hpte = (__be64 *)(kvm->arch.hpt.virt + (hash << 7)); for (i = 0; i < 16; i += 2) { /* Read the PTE racily */ @@ -996,7 +1017,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, if (val & HPTE_V_SECONDARY) break; val |= HPTE_V_SECONDARY; - hash = hash ^ kvm->arch.hpt_mask; + hash = hash ^ kvmppc_hpt_mask(&kvm->arch.hpt); } return -1; } @@ -1045,14 +1066,14 @@ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, return status; /* there really was no HPTE */ return 0; /* for prot fault, HPTE disappeared */ } - hpte = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); + hpte = (__be64 *)(kvm->arch.hpt.virt + (index << 4)); v = orig_v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK; r = be64_to_cpu(hpte[1]); if (cpu_has_feature(CPU_FTR_ARCH_300)) { v = hpte_new_to_old_v(v, r); r = hpte_new_to_old_r(r); } - rev = real_vmalloc_addr(&kvm->arch.revmap[index]); + rev = real_vmalloc_addr(&kvm->arch.hpt.rev[index]); gr = rev->guest_rpte; unlock_hpte(hpte, orig_v); diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 06edc4366639..0b2e388f4cdf 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -35,7 +35,7 @@ int kvm_irq_bypass = 1; EXPORT_SYMBOL(kvm_irq_bypass); static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, - u32 new_irq); + u32 new_irq, bool check_resend); static int xics_opal_rm_set_server(unsigned int hw_irq, int server_cpu); /* -- ICS routines -- */ @@ -44,20 +44,12 @@ static void ics_rm_check_resend(struct kvmppc_xics *xics, { int i; - arch_spin_lock(&ics->lock); - for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { struct ics_irq_state *state = &ics->irq_state[i]; - - if (!state->resend) - continue; - - arch_spin_unlock(&ics->lock); - icp_rm_deliver_irq(xics, icp, state->number); - arch_spin_lock(&ics->lock); + if (state->resend) + icp_rm_deliver_irq(xics, icp, state->number, true); } - arch_spin_unlock(&ics->lock); } /* -- ICP routines -- */ @@ -70,11 +62,9 @@ static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu) hcpu = hcore << threads_shift; kvmppc_host_rm_ops_hv->rm_core[hcore].rm_data = vcpu; smp_muxed_ipi_set_message(hcpu, PPC_MSG_RM_HOST_ACTION); - if (paca[hcpu].kvm_hstate.xics_phys) - icp_native_cause_ipi_rm(hcpu); - else - opal_rm_int_set_mfrr(get_hard_smp_processor_id(hcpu), - IPI_PRIORITY); + kvmppc_set_host_ipi(hcpu, 1); + smp_mb(); + kvmhv_rm_send_ipi(hcpu); } #else static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu) { } @@ -290,7 +280,7 @@ static bool icp_rm_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority, } static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, - u32 new_irq) + u32 new_irq, bool check_resend) { struct ics_irq_state *state; struct kvmppc_ics *ics; @@ -335,6 +325,10 @@ static void icp_rm_deliver_irq(struct 
kvmppc_xics *xics, struct kvmppc_icp *icp, } } + if (check_resend) + if (!state->resend) + goto out; + /* Clear the resend bit of that interrupt */ state->resend = 0; @@ -380,7 +374,9 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, */ if (reject && reject != XICS_IPI) { arch_spin_unlock(&ics->lock); + icp->n_reject++; new_irq = reject; + check_resend = 0; goto again; } } else { @@ -388,10 +384,16 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, * We failed to deliver the interrupt we need to set the * resend map bit and mark the ICS state as needing a resend */ - set_bit(ics->icsid, icp->resend_map); state->resend = 1; /* + * Make sure when checking resend, we don't miss the resend + * if resend_map bit is seen and cleared. + */ + smp_wmb(); + set_bit(ics->icsid, icp->resend_map); + + /* * If the need_resend flag got cleared in the ICP some time * between icp_rm_try_to_deliver() atomic update and now, then * we know it might have missed the resend_map bit. So we @@ -399,7 +401,9 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, */ smp_mb(); if (!icp->state.need_resend) { + state->resend = 0; arch_spin_unlock(&ics->lock); + check_resend = 0; goto again; } } @@ -594,7 +598,7 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, /* Handle reject in real mode */ if (reject && reject != XICS_IPI) { this_icp->n_reject++; - icp_rm_deliver_irq(xics, icp, reject); + icp_rm_deliver_irq(xics, icp, reject, false); } /* Handle resends in real mode */ @@ -662,59 +666,45 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) */ if (reject && reject != XICS_IPI) { icp->n_reject++; - icp_rm_deliver_irq(xics, icp, reject); + icp_rm_deliver_irq(xics, icp, reject, false); } bail: return check_too_hard(xics, icp); } -int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +static int ics_rm_eoi(struct kvm_vcpu *vcpu, u32 irq) { struct kvmppc_xics *xics = vcpu->kvm->arch.xics; struct kvmppc_icp *icp = vcpu->arch.icp; struct kvmppc_ics *ics; struct ics_irq_state *state; - u32 irq = xirr & 0x00ffffff; u16 src; - - if (!xics || !xics->real_mode) - return H_TOO_HARD; + u32 pq_old, pq_new; /* - * ICP State: EOI - * - * Note: If EOI is incorrectly used by SW to lower the CPPR - * value (ie more favored), we do not check for rejection of - * a pending interrupt, this is a SW error and PAPR sepcifies - * that we don't have to deal with it. + * ICS EOI handling: For LSI, if P bit is still set, we need to + * resend it. * - * The sending of an EOI to the ICS is handled after the - * CPPR update - * - * ICP State: Down_CPPR which we handle - * in a separate function as it's shared with H_CPPR. + * For MSI, we move Q bit into P (and clear Q). If it is set, + * resend it. */ - icp_rm_down_cppr(xics, icp, xirr >> 24); - /* IPIs have no EOI */ - if (irq == XICS_IPI) - goto bail; - /* - * EOI handling: If the interrupt is still asserted, we need to - * resend it. We can take a lockless "peek" at the ICS state here. 
- * - * "Message" interrupts will never have "asserted" set - */ ics = kvmppc_xics_find_ics(xics, irq, &src); if (!ics) goto bail; + state = &ics->irq_state[src]; - /* Still asserted, resend it */ - if (state->asserted) { - icp->n_reject++; - icp_rm_deliver_irq(xics, icp, irq); - } + if (state->lsi) + pq_new = state->pq_state; + else + do { + pq_old = state->pq_state; + pq_new = pq_old >> 1; + } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old); + + if (pq_new & PQ_PRESENTED) + icp_rm_deliver_irq(xics, NULL, irq, false); if (!hlist_empty(&vcpu->kvm->irq_ack_notifier_list)) { icp->rm_action |= XICS_RM_NOTIFY_EOI; @@ -735,10 +725,43 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) state->intr_cpu = -1; } } + bail: return check_too_hard(xics, icp); } +int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +{ + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp = vcpu->arch.icp; + u32 irq = xirr & 0x00ffffff; + + if (!xics || !xics->real_mode) + return H_TOO_HARD; + + /* + * ICP State: EOI + * + * Note: If EOI is incorrectly used by SW to lower the CPPR + * value (ie more favored), we do not check for rejection of + * a pending interrupt, this is a SW error and PAPR specifies + * that we don't have to deal with it. + * + * The sending of an EOI to the ICS is handled after the + * CPPR update + * + * ICP State: Down_CPPR which we handle + * in a separate function as it's shared with H_CPPR. + */ + icp_rm_down_cppr(xics, icp, xirr >> 24); + + /* IPIs have no EOI */ + if (irq == XICS_IPI) + return check_too_hard(xics, icp); + + return ics_rm_eoi(vcpu, irq); +} + unsigned long eoi_rc; static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again) @@ -825,14 +848,33 @@ long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, { struct kvmppc_xics *xics; struct kvmppc_icp *icp; + struct kvmppc_ics *ics; + struct ics_irq_state *state; u32 irq; + u16 src; + u32 pq_old, pq_new; irq = irq_map->v_hwirq; xics = vcpu->kvm->arch.xics; icp = vcpu->arch.icp; kvmppc_rm_handle_irq_desc(irq_map->desc); - icp_rm_deliver_irq(xics, icp, irq); + + ics = kvmppc_xics_find_ics(xics, irq, &src); + if (!ics) + return 2; + + state = &ics->irq_state[src]; + + /* only MSIs register bypass producers, so it must be MSI here */ + do { + pq_old = state->pq_state; + pq_new = ((pq_old << 1) & 3) | PQ_PRESENTED; + } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old); + + /* Test P=1, Q=0, this is the only case where we present */ + if (pq_new == PQ_PRESENTED) + icp_rm_deliver_irq(xics, icp, irq, false); /* EOI the interrupt */ icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr, diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 9338a818e05c..47414a6fe2dd 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -148,6 +148,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) addi r1, r1, 112 ld r7, HSTATE_HOST_MSR(r13) + /* + * If we came back from the guest via a relocation-on interrupt, + * we will be in virtual mode at this point, which makes it a + * little easier to get back to the caller. + */ + mfmsr r0 + andi. r0, r0, MSR_IR /* in real mode? 
*/ + bne .Lvirt_return + cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL beq 11f @@ -181,6 +190,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) mtspr SPRN_HSRR1, r7 ba 0xe80 + /* Virtual-mode return - can't get here for HMI or machine check */ +.Lvirt_return: + cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL + beq 16f + cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL + beq 17f + andi. r0, r7, MSR_EE /* were interrupts hard-enabled? */ + beq 18f + mtmsrd r7, 1 /* if so then re-enable them */ +18: mtlr r8 + blr + +16: mtspr SPRN_HSRR0, r8 /* jump to reloc-on external vector */ + mtspr SPRN_HSRR1, r7 + b exc_virt_0x4500_hardware_interrupt + +17: mtspr SPRN_HSRR0, r8 + mtspr SPRN_HSRR1, r7 + b exc_virt_0x4e80_h_doorbell + kvmppc_primary_no_guest: /* We handle this much like a ceded vcpu */ /* put the HDEC into the DEC, since HDEC interrupts don't wake us */ @@ -518,6 +547,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) /* Stack frame offsets */ #define STACK_SLOT_TID (112-16) #define STACK_SLOT_PSSCR (112-24) +#define STACK_SLOT_PID (112-32) .global kvmppc_hv_entry kvmppc_hv_entry: @@ -530,6 +560,7 @@ kvmppc_hv_entry: * R1 = host R1 * R2 = TOC * all other volatile GPRS = free + * Does not preserve non-volatile GPRs or CR fields */ mflr r0 std r0, PPC_LR_STKOFF(r1) @@ -549,32 +580,38 @@ kvmppc_hv_entry: bl kvmhv_start_timing 1: #endif - /* Clear out SLB */ + + /* Use cr7 as an indication of radix mode */ + ld r5, HSTATE_KVM_VCORE(r13) + ld r9, VCORE_KVM(r5) /* pointer to struct kvm */ + lbz r0, KVM_RADIX(r9) + cmpwi cr7, r0, 0 + + /* Clear out SLB if hash */ + bne cr7, 2f li r6,0 slbmte r6,r6 slbia ptesync - +2: /* * POWER7/POWER8 host -> guest partition switch code. * We don't have to lock against concurrent tlbies, * but we do have to coordinate across hardware threads. */ /* Set bit in entry map iff exit map is zero. */ - ld r5, HSTATE_KVM_VCORE(r13) li r7, 1 lbz r6, HSTATE_PTID(r13) sld r7, r7, r6 - addi r9, r5, VCORE_ENTRY_EXIT -21: lwarx r3, 0, r9 + addi r8, r5, VCORE_ENTRY_EXIT +21: lwarx r3, 0, r8 cmpwi r3, 0x100 /* any threads starting to exit? */ bge secondary_too_late /* if so we're too late to the party */ or r3, r3, r7 - stwcx. r3, 0, r9 + stwcx. r3, 0, r8 bne 21b /* Primary thread switches to guest partition. */ - ld r9,VCORE_KVM(r5) /* pointer to struct kvm */ cmpwi r6,0 bne 10f lwz r7,KVM_LPID(r9) @@ -590,30 +627,44 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) /* See if we need to flush the TLB */ lhz r6,PACAPACAINDEX(r13) /* test_bit(cpu, need_tlb_flush) */ +BEGIN_FTR_SECTION + /* + * On POWER9, individual threads can come in here, but the + * TLB is shared between the 4 threads in a core, hence + * invalidating on one thread invalidates for all. + * Thus we make all 4 threads use the same bit here. + */ + clrrdi r6,r6,2 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) clrldi r7,r6,64-6 /* extract bit number (6 bits) */ srdi r6,r6,6 /* doubleword number */ sldi r6,r6,3 /* address offset */ add r6,r6,r9 addi r6,r6,KVM_NEED_FLUSH /* dword in kvm->arch.need_tlb_flush */ - li r0,1 - sld r0,r0,r7 + li r8,1 + sld r8,r8,r7 ld r7,0(r6) - and. r7,r7,r0 + and. r7,r7,r8 beq 22f -23: ldarx r7,0,r6 /* if set, clear the bit */ - andc r7,r7,r0 - stdcx. 
r7,0,r6 - bne 23b /* Flush the TLB of any entries for this LPID */ - lwz r6,KVM_TLB_SETS(r9) - li r0,0 /* RS for P9 version of tlbiel */ - mtctr r6 + lwz r0,KVM_TLB_SETS(r9) + mtctr r0 li r7,0x800 /* IS field = 0b10 */ ptesync -28: tlbiel r7 + li r0,0 /* RS for P9 version of tlbiel */ + bne cr7, 29f +28: tlbiel r7 /* On P9, rs=0, RIC=0, PRS=0, R=0 */ addi r7,r7,0x1000 bdnz 28b - ptesync + b 30f +29: PPC_TLBIEL(7,0,2,1,1) /* for radix, RIC=2, PRS=1, R=1 */ + addi r7,r7,0x1000 + bdnz 29b +30: ptesync +23: ldarx r7,0,r6 /* clear the bit after TLB flushed */ + andc r7,r7,r8 + stdcx. r7,0,r6 + bne 23b /* Add timebase offset onto timebase */ 22: ld r8,VCORE_TB_OFFSET(r5) @@ -658,7 +709,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) beq kvmppc_primary_no_guest kvmppc_got_guest: - /* Load up guest SLB entries */ + /* Load up guest SLB entries (N.B. slb_max will be 0 for radix) */ lwz r5,VCPU_SLB_MAX(r4) cmpwi r5,0 beq 9f @@ -696,8 +747,10 @@ kvmppc_got_guest: BEGIN_FTR_SECTION mfspr r5, SPRN_TIDR mfspr r6, SPRN_PSSCR + mfspr r7, SPRN_PID std r5, STACK_SLOT_TID(r1) std r6, STACK_SLOT_PSSCR(r1) + std r7, STACK_SLOT_PID(r1) END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) BEGIN_FTR_SECTION @@ -824,6 +877,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) mtspr SPRN_PID, r7 mtspr SPRN_WORT, r8 BEGIN_FTR_SECTION + PPC_INVALIDATE_ERAT +END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1) +BEGIN_FTR_SECTION /* POWER8-only registers */ ld r5, VCPU_TCSCR(r4) ld r6, VCPU_ACOP(r4) @@ -1057,13 +1113,13 @@ hdec_soon: kvmppc_interrupt_hv: /* * Register contents: - * R12 = interrupt vector + * R12 = (guest CR << 32) | interrupt vector * R13 = PACA - * guest CR, R12 saved in shadow VCPU SCRATCH1/0 + * guest R12 saved in shadow VCPU SCRATCH0 + * guest CTR saved in shadow VCPU SCRATCH1 if RELOCATABLE * guest R13 saved in SPRN_SCRATCH0 */ std r9, HSTATE_SCRATCH2(r13) - lbz r9, HSTATE_IN_GUEST(r13) cmpwi r9, KVM_GUEST_MODE_HOST_HV beq kvmppc_bad_host_intr @@ -1094,8 +1150,9 @@ kvmppc_interrupt_hv: std r10, VCPU_GPR(R10)(r9) std r11, VCPU_GPR(R11)(r9) ld r3, HSTATE_SCRATCH0(r13) - lwz r4, HSTATE_SCRATCH1(r13) std r3, VCPU_GPR(R12)(r9) + /* CR is in the high half of r12 */ + srdi r4, r12, 32 stw r4, VCPU_CR(r9) BEGIN_FTR_SECTION ld r3, HSTATE_CFAR(r13) @@ -1114,6 +1171,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) mfspr r11, SPRN_SRR1 std r10, VCPU_SRR0(r9) std r11, VCPU_SRR1(r9) + /* trap is in the low half of r12, clear CR from the high half */ + clrldi r12, r12, 32 andi. r0, r12, 2 /* need to read HSRR0/1? */ beq 1f mfspr r10, SPRN_HSRR0 @@ -1149,7 +1208,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) 11: stw r3,VCPU_HEIR(r9) /* these are volatile across C function calls */ +#ifdef CONFIG_RELOCATABLE + ld r3, HSTATE_SCRATCH1(r13) + mtctr r3 +#else mfctr r3 +#endif mfxer r4 std r3, VCPU_CTR(r9) std r4, VCPU_XER(r9) @@ -1285,11 +1349,15 @@ mc_cont: mtspr SPRN_CTRLT,r6 4: /* Read the guest SLB and save it away */ + ld r5, VCPU_KVM(r9) + lbz r0, KVM_RADIX(r5) + cmpwi r0, 0 + li r5, 0 + bne 3f /* for radix, save 0 entries */ lwz r0,VCPU_SLB_NR(r9) /* number of entries in SLB */ mtctr r0 li r6,0 addi r7,r9,VCPU_SLB - li r5,0 1: slbmfee r8,r6 andis. 
r0,r8,SLB_ESID_V@h beq 2f @@ -1301,7 +1369,7 @@ mc_cont: addi r5,r5,1 2: addi r6,r6,1 bdnz 1b - stw r5,VCPU_SLB_MAX(r9) +3: stw r5,VCPU_SLB_MAX(r9) /* * Save the guest PURR/SPURR @@ -1550,9 +1618,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) BEGIN_FTR_SECTION ld r5, STACK_SLOT_TID(r1) ld r6, STACK_SLOT_PSSCR(r1) + ld r7, STACK_SLOT_PID(r1) mtspr SPRN_TIDR, r5 mtspr SPRN_PSSCR, r6 + mtspr SPRN_PID, r7 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) +BEGIN_FTR_SECTION + PPC_INVALIDATE_ERAT +END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1) /* * POWER7/POWER8 guest -> host partition switch code. @@ -1663,6 +1736,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) isync /* load host SLB entries */ +BEGIN_MMU_FTR_SECTION + b 0f +END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX) ld r8,PACA_SLBSHADOWPTR(r13) .rept SLB_NUM_BOLTED @@ -1675,7 +1751,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) slbmte r6,r5 1: addi r8,r8,16 .endr - +0: #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING /* Finish timing, if we have a vcpu */ ld r4, HSTATE_KVM_VCPU(r13) @@ -1702,11 +1778,19 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) * reflect the HDSI to the guest as a DSI. */ kvmppc_hdsi: + ld r3, VCPU_KVM(r9) + lbz r0, KVM_RADIX(r3) + cmpwi r0, 0 mfspr r4, SPRN_HDAR mfspr r6, SPRN_HDSISR + bne .Lradix_hdsi /* on radix, just save DAR/DSISR/ASDR */ /* HPTE not found fault or protection fault? */ andis. r0, r6, (DSISR_NOHPTE | DSISR_PROTFAULT)@h beq 1f /* if not, send it to the guest */ +BEGIN_FTR_SECTION + mfspr r5, SPRN_ASDR /* on POWER9, use ASDR to get VSID */ + b 4f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) andi. r0, r11, MSR_DR /* data relocation enabled? */ beq 3f clrrdi r0, r4, 28 @@ -1776,13 +1860,29 @@ fast_interrupt_c_return: stb r0, HSTATE_IN_GUEST(r13) b guest_exit_cont +.Lradix_hdsi: + std r4, VCPU_FAULT_DAR(r9) + stw r6, VCPU_FAULT_DSISR(r9) +.Lradix_hisi: + mfspr r5, SPRN_ASDR + std r5, VCPU_FAULT_GPA(r9) + b guest_exit_cont + /* * Similarly for an HISI, reflect it to the guest as an ISI unless * it is an HPTE not found fault for a page that we have paged out. */ kvmppc_hisi: + ld r3, VCPU_KVM(r9) + lbz r0, KVM_RADIX(r3) + cmpwi r0, 0 + bne .Lradix_hisi /* for radix, just save ASDR */ andis. r0, r11, SRR1_ISI_NOPT@h beq 1f +BEGIN_FTR_SECTION + mfspr r5, SPRN_ASDR /* on POWER9, use ASDR to get VSID */ + b 4f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) andi. r0, r11, MSR_IR /* instruction relocation enabled? */ beq 3f clrrdi r0, r10, 28 diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 1482961ceb4d..d4dfc0ca2a44 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -902,6 +902,69 @@ static void kvmppc_clear_debug(struct kvm_vcpu *vcpu) } } +static int kvmppc_exit_pr_progint(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int exit_nr) +{ + enum emulation_result er; + ulong flags; + u32 last_inst; + int emul, r; + + /* + * shadow_srr1 only contains valid flags if we came here via a program + * exception. The other exceptions (emulation assist, FP unavailable, + * etc.) do not provide flags in SRR1, so use an illegal-instruction + * exception when injecting a program interrupt into the guest. 
+ */ + if (exit_nr == BOOK3S_INTERRUPT_PROGRAM) + flags = vcpu->arch.shadow_srr1 & 0x1f0000ull; + else + flags = SRR1_PROGILL; + + emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst); + if (emul != EMULATE_DONE) + return RESUME_GUEST; + + if (kvmppc_get_msr(vcpu) & MSR_PR) { +#ifdef EXIT_DEBUG + pr_info("Userspace triggered 0x700 exception at\n 0x%lx (0x%x)\n", + kvmppc_get_pc(vcpu), last_inst); +#endif + if ((last_inst & 0xff0007ff) != (INS_DCBZ & 0xfffffff7)) { + kvmppc_core_queue_program(vcpu, flags); + return RESUME_GUEST; + } + } + + vcpu->stat.emulated_inst_exits++; + er = kvmppc_emulate_instruction(run, vcpu); + switch (er) { + case EMULATE_DONE: + r = RESUME_GUEST_NV; + break; + case EMULATE_AGAIN: + r = RESUME_GUEST; + break; + case EMULATE_FAIL: + pr_crit("%s: emulation at %lx failed (%08x)\n", + __func__, kvmppc_get_pc(vcpu), last_inst); + kvmppc_core_queue_program(vcpu, flags); + r = RESUME_GUEST; + break; + case EMULATE_DO_MMIO: + run->exit_reason = KVM_EXIT_MMIO; + r = RESUME_HOST_NV; + break; + case EMULATE_EXIT_USER: + r = RESUME_HOST_NV; + break; + default: + BUG(); + } + + return r; +} + int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int exit_nr) { @@ -1044,71 +1107,8 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, break; case BOOK3S_INTERRUPT_PROGRAM: case BOOK3S_INTERRUPT_H_EMUL_ASSIST: - { - enum emulation_result er; - ulong flags; - u32 last_inst; - int emul; - -program_interrupt: - /* - * shadow_srr1 only contains valid flags if we came here via - * a program exception. The other exceptions (emulation assist, - * FP unavailable, etc.) do not provide flags in SRR1, so use - * an illegal-instruction exception when injecting a program - * interrupt into the guest. - */ - if (exit_nr == BOOK3S_INTERRUPT_PROGRAM) - flags = vcpu->arch.shadow_srr1 & 0x1f0000ull; - else - flags = SRR1_PROGILL; - - emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst); - if (emul != EMULATE_DONE) { - r = RESUME_GUEST; - break; - } - - if (kvmppc_get_msr(vcpu) & MSR_PR) { -#ifdef EXIT_DEBUG - pr_info("Userspace triggered 0x700 exception at\n 0x%lx (0x%x)\n", - kvmppc_get_pc(vcpu), last_inst); -#endif - if ((last_inst & 0xff0007ff) != - (INS_DCBZ & 0xfffffff7)) { - kvmppc_core_queue_program(vcpu, flags); - r = RESUME_GUEST; - break; - } - } - - vcpu->stat.emulated_inst_exits++; - er = kvmppc_emulate_instruction(run, vcpu); - switch (er) { - case EMULATE_DONE: - r = RESUME_GUEST_NV; - break; - case EMULATE_AGAIN: - r = RESUME_GUEST; - break; - case EMULATE_FAIL: - printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n", - __func__, kvmppc_get_pc(vcpu), last_inst); - kvmppc_core_queue_program(vcpu, flags); - r = RESUME_GUEST; - break; - case EMULATE_DO_MMIO: - run->exit_reason = KVM_EXIT_MMIO; - r = RESUME_HOST_NV; - break; - case EMULATE_EXIT_USER: - r = RESUME_HOST_NV; - break; - default: - BUG(); - } + r = kvmppc_exit_pr_progint(run, vcpu, exit_nr); break; - } case BOOK3S_INTERRUPT_SYSCALL: { u32 last_sc; @@ -1185,7 +1185,7 @@ program_interrupt: emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst); if (emul == EMULATE_DONE) - goto program_interrupt; + r = kvmppc_exit_pr_progint(run, vcpu, exit_nr); else r = RESUME_GUEST; diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S index ca8f174289bb..2a2b96d53999 100644 --- a/arch/powerpc/kvm/book3s_segment.S +++ b/arch/powerpc/kvm/book3s_segment.S @@ -167,20 +167,38 @@ kvmppc_handler_trampoline_enter_end: * * 
*****************************************************************************/ -.global kvmppc_handler_trampoline_exit -kvmppc_handler_trampoline_exit: - .global kvmppc_interrupt_pr kvmppc_interrupt_pr: + /* 64-bit entry. Register usage at this point: + * + * SPRG_SCRATCH0 = guest R13 + * R12 = (guest CR << 32) | exit handler id + * R13 = PACA + * HSTATE.SCRATCH0 = guest R12 + * HSTATE.SCRATCH1 = guest CTR if RELOCATABLE + */ +#ifdef CONFIG_PPC64 + /* Match 32-bit entry */ +#ifdef CONFIG_RELOCATABLE + std r9, HSTATE_SCRATCH2(r13) + ld r9, HSTATE_SCRATCH1(r13) + mtctr r9 + ld r9, HSTATE_SCRATCH2(r13) +#endif + rotldi r12, r12, 32 /* Flip R12 halves for stw */ + stw r12, HSTATE_SCRATCH1(r13) /* CR is now in the low half */ + srdi r12, r12, 32 /* shift trap into low half */ +#endif +.global kvmppc_handler_trampoline_exit +kvmppc_handler_trampoline_exit: /* Register usage at this point: * - * SPRG_SCRATCH0 = guest R13 - * R12 = exit handler id - * R13 = shadow vcpu (32-bit) or PACA (64-bit) + * SPRG_SCRATCH0 = guest R13 + * R12 = exit handler id + * R13 = shadow vcpu (32-bit) or PACA (64-bit) * HSTATE.SCRATCH0 = guest R12 * HSTATE.SCRATCH1 = guest CR - * */ /* Save registers */ diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index 20dff102a06f..e48803e2918d 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -63,7 +63,7 @@ /* -- ICS routines -- */ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, - u32 new_irq); + u32 new_irq, bool check_resend); /* * Return value ideally indicates how the interrupt was handled, but no @@ -75,6 +75,7 @@ static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level) struct ics_irq_state *state; struct kvmppc_ics *ics; u16 src; + u32 pq_old, pq_new; XICS_DBG("ics deliver %#x (level: %d)\n", irq, level); @@ -87,25 +88,41 @@ static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level) if (!state->exists) return -EINVAL; + if (level == KVM_INTERRUPT_SET_LEVEL || level == KVM_INTERRUPT_SET) + level = 1; + else if (level == KVM_INTERRUPT_UNSET) + level = 0; /* - * We set state->asserted locklessly. This should be fine as - * we are the only setter, thus concurrent access is undefined - * to begin with. + * Take other values the same as 1, consistent with original code. + * maybe WARN here? */ - if ((level == 1 && state->lsi) || level == KVM_INTERRUPT_SET_LEVEL) - state->asserted = 1; - else if (level == 0 || level == KVM_INTERRUPT_UNSET) { - state->asserted = 0; + + if (!state->lsi && level == 0) /* noop for MSI */ return 0; - } + + do { + pq_old = state->pq_state; + if (state->lsi) { + if (level) { + if (pq_old & PQ_PRESENTED) + /* Setting already set LSI ... 
*/ + return 0; + + pq_new = PQ_PRESENTED; + } else + pq_new = 0; + } else + pq_new = ((pq_old << 1) & 3) | PQ_PRESENTED; + } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old); + + /* Test P=1, Q=0, this is the only case where we present */ + if (pq_new == PQ_PRESENTED) + icp_deliver_irq(xics, NULL, irq, false); /* Record which CPU this arrived on for passed-through interrupts */ if (state->host_irq) state->intr_cpu = raw_smp_processor_id(); - /* Attempt delivery */ - icp_deliver_irq(xics, NULL, irq); - return 0; } @@ -114,29 +131,14 @@ static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics, { int i; - unsigned long flags; - - local_irq_save(flags); - arch_spin_lock(&ics->lock); - for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { struct ics_irq_state *state = &ics->irq_state[i]; - - if (!state->resend) - continue; - - XICS_DBG("resend %#x prio %#x\n", state->number, - state->priority); - - arch_spin_unlock(&ics->lock); - local_irq_restore(flags); - icp_deliver_irq(xics, icp, state->number); - local_irq_save(flags); - arch_spin_lock(&ics->lock); + if (state->resend) { + XICS_DBG("resend %#x prio %#x\n", state->number, + state->priority); + icp_deliver_irq(xics, icp, state->number, true); + } } - - arch_spin_unlock(&ics->lock); - local_irq_restore(flags); } static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics, @@ -155,6 +157,7 @@ static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics, deliver = false; if ((state->masked_pending || state->resend) && priority != MASKED) { state->masked_pending = 0; + state->resend = 0; deliver = true; } @@ -189,7 +192,7 @@ int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority) state->masked_pending, state->resend); if (write_xive(xics, ics, state, server, priority, priority)) - icp_deliver_irq(xics, icp, irq); + icp_deliver_irq(xics, icp, irq, false); return 0; } @@ -242,7 +245,7 @@ int kvmppc_xics_int_on(struct kvm *kvm, u32 irq) if (write_xive(xics, ics, state, state->server, state->saved_priority, state->saved_priority)) - icp_deliver_irq(xics, icp, irq); + icp_deliver_irq(xics, icp, irq, false); return 0; } @@ -376,7 +379,7 @@ static bool icp_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority, } static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, - u32 new_irq) + u32 new_irq, bool check_resend) { struct ics_irq_state *state; struct kvmppc_ics *ics; @@ -422,6 +425,10 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, } } + if (check_resend) + if (!state->resend) + goto out; + /* Clear the resend bit of that interrupt */ state->resend = 0; @@ -470,6 +477,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, arch_spin_unlock(&ics->lock); local_irq_restore(flags); new_irq = reject; + check_resend = 0; goto again; } } else { @@ -477,10 +485,16 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, * We failed to deliver the interrupt we need to set the * resend map bit and mark the ICS state as needing a resend */ - set_bit(ics->icsid, icp->resend_map); state->resend = 1; /* + * Make sure when checking resend, we don't miss the resend + * if resend_map bit is seen and cleared. + */ + smp_wmb(); + set_bit(ics->icsid, icp->resend_map); + + /* * If the need_resend flag got cleared in the ICP some time * between icp_try_to_deliver() atomic update and now, then * we know it might have missed the resend_map bit. 
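The barrier added here pairs with the lockless consumer on the check_resend path; roughly, the protocol is (a sketch using the names from this patch, with the reader side simplified):

    /* Writer (delivery failed):            Reader (check_resend path):
     *   state->resend = 1;                   if (test_and_clear_bit(icsid,
     *   smp_wmb();                                                  icp->resend_map))
     *   set_bit(icsid, icp->resend_map);         if (state->resend)
     *                                                icp_deliver_irq(..., true);
     *
     * The smp_wmb() ensures that any reader that observes the resend_map
     * bit also observes state->resend == 1, so clearing the map bit before
     * re-checking can no longer lose a resend.
     */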
So we @@ -488,8 +502,10 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, */ smp_mb(); if (!icp->state.need_resend) { + state->resend = 0; arch_spin_unlock(&ics->lock); local_irq_restore(flags); + check_resend = 0; goto again; } } @@ -681,7 +697,7 @@ static noinline int kvmppc_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, /* Handle reject */ if (reject && reject != XICS_IPI) - icp_deliver_irq(xics, icp, reject); + icp_deliver_irq(xics, icp, reject, false); /* Handle resend */ if (resend) @@ -761,17 +777,54 @@ static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) * attempt (see comments in icp_deliver_irq). */ if (reject && reject != XICS_IPI) - icp_deliver_irq(xics, icp, reject); + icp_deliver_irq(xics, icp, reject, false); } -static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +static int ics_eoi(struct kvm_vcpu *vcpu, u32 irq) { struct kvmppc_xics *xics = vcpu->kvm->arch.xics; struct kvmppc_icp *icp = vcpu->arch.icp; struct kvmppc_ics *ics; struct ics_irq_state *state; - u32 irq = xirr & 0x00ffffff; u16 src; + u32 pq_old, pq_new; + + /* + * ICS EOI handling: For LSI, if P bit is still set, we need to + * resend it. + * + * For MSI, we move Q bit into P (and clear Q). If it is set, + * resend it. + */ + + ics = kvmppc_xics_find_ics(xics, irq, &src); + if (!ics) { + XICS_DBG("ios_eoi: IRQ 0x%06x not found !\n", irq); + return H_PARAMETER; + } + state = &ics->irq_state[src]; + + if (state->lsi) + pq_new = state->pq_state; + else + do { + pq_old = state->pq_state; + pq_new = pq_old >> 1; + } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old); + + if (pq_new & PQ_PRESENTED) + icp_deliver_irq(xics, icp, irq, false); + + kvm_notify_acked_irq(vcpu->kvm, 0, irq); + + return H_SUCCESS; +} + +static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +{ + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp = vcpu->arch.icp; + u32 irq = xirr & 0x00ffffff; XICS_DBG("h_eoi vcpu %d eoi %#lx\n", vcpu->vcpu_id, xirr); @@ -794,26 +847,8 @@ static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) /* IPIs have no EOI */ if (irq == XICS_IPI) return H_SUCCESS; - /* - * EOI handling: If the interrupt is still asserted, we need to - * resend it. We can take a lockless "peek" at the ICS state here. 
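For reference, the P/Q scheme this patch introduces reduces to a small state machine; a minimal sketch (PQ_PRESENTED = P, PQ_QUEUED = Q, matching the cmpxchg loops above and the definitions added to book3s_xics.h below):

    /* MSI arrival: old P shifts into Q, P is set; the interrupt is pushed
     * to the ICP only when the result is exactly P=1,Q=0. */
    static u32 pq_msi_deliver(u32 pq) { return ((pq << 1) & 3) | PQ_PRESENTED; }

    /* MSI EOI: Q shifts into P; if P is now set, the queued interrupt
     * gets re-presented. */
    static u32 pq_msi_eoi(u32 pq) { return pq >> 1; }

    /* LSI: assert sets P (no-op if already presented), deassert clears the
     * state; EOI leaves the state alone and re-presents while P is set. */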
- * - * "Message" interrupts will never have "asserted" set - */ - ics = kvmppc_xics_find_ics(xics, irq, &src); - if (!ics) { - XICS_DBG("h_eoi: IRQ 0x%06x not found !\n", irq); - return H_PARAMETER; - } - state = &ics->irq_state[src]; - /* Still asserted, resend it */ - if (state->asserted) - icp_deliver_irq(xics, icp, irq); - - kvm_notify_acked_irq(vcpu->kvm, 0, irq); - - return H_SUCCESS; + return ics_eoi(vcpu, irq); } int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall) @@ -832,10 +867,6 @@ int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall) icp->n_rm_check_resend++; icp_check_resend(xics, icp->rm_resend_icp); } - if (icp->rm_action & XICS_RM_REJECT) { - icp->n_rm_reject++; - icp_deliver_irq(xics, icp, icp->rm_reject); - } if (icp->rm_action & XICS_RM_NOTIFY_EOI) { icp->n_rm_notify_eoi++; kvm_notify_acked_irq(vcpu->kvm, 0, icp->rm_eoied_irq); @@ -920,7 +951,7 @@ static int xics_debug_show(struct seq_file *m, void *private) int icsid, i; unsigned long flags; unsigned long t_rm_kick_vcpu, t_rm_check_resend; - unsigned long t_rm_reject, t_rm_notify_eoi; + unsigned long t_rm_notify_eoi; unsigned long t_reject, t_check_resend; if (!kvm) @@ -929,7 +960,6 @@ static int xics_debug_show(struct seq_file *m, void *private) t_rm_kick_vcpu = 0; t_rm_notify_eoi = 0; t_rm_check_resend = 0; - t_rm_reject = 0; t_check_resend = 0; t_reject = 0; @@ -952,14 +982,13 @@ static int xics_debug_show(struct seq_file *m, void *private) t_rm_kick_vcpu += icp->n_rm_kick_vcpu; t_rm_notify_eoi += icp->n_rm_notify_eoi; t_rm_check_resend += icp->n_rm_check_resend; - t_rm_reject += icp->n_rm_reject; t_check_resend += icp->n_check_resend; t_reject += icp->n_reject; } - seq_printf(m, "ICP Guest->Host totals: kick_vcpu=%lu check_resend=%lu reject=%lu notify_eoi=%lu\n", + seq_printf(m, "ICP Guest->Host totals: kick_vcpu=%lu check_resend=%lu notify_eoi=%lu\n", t_rm_kick_vcpu, t_rm_check_resend, - t_rm_reject, t_rm_notify_eoi); + t_rm_notify_eoi); seq_printf(m, "ICP Real Mode totals: check_resend=%lu resend=%lu\n", t_check_resend, t_reject); for (icsid = 0; icsid <= KVMPPC_XICS_MAX_ICS_ID; icsid++) { @@ -977,9 +1006,9 @@ static int xics_debug_show(struct seq_file *m, void *private) for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { struct ics_irq_state *irq = &ics->irq_state[i]; - seq_printf(m, "irq 0x%06x: server %#x prio %#x save prio %#x asserted %d resend %d masked pending %d\n", + seq_printf(m, "irq 0x%06x: server %#x prio %#x save prio %#x pq_state %d resend %d masked pending %d\n", irq->number, irq->server, irq->priority, - irq->saved_priority, irq->asserted, + irq->saved_priority, irq->pq_state, irq->resend, irq->masked_pending); } @@ -1198,10 +1227,17 @@ static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr) val |= prio << KVM_XICS_PRIORITY_SHIFT; if (irqp->lsi) { val |= KVM_XICS_LEVEL_SENSITIVE; - if (irqp->asserted) + if (irqp->pq_state & PQ_PRESENTED) val |= KVM_XICS_PENDING; } else if (irqp->masked_pending || irqp->resend) val |= KVM_XICS_PENDING; + + if (irqp->pq_state & PQ_PRESENTED) + val |= KVM_XICS_PRESENTED; + + if (irqp->pq_state & PQ_QUEUED) + val |= KVM_XICS_QUEUED; + ret = 0; } arch_spin_unlock(&ics->lock); @@ -1253,18 +1289,20 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr) irqp->resend = 0; irqp->masked_pending = 0; irqp->lsi = 0; - irqp->asserted = 0; - if (val & KVM_XICS_LEVEL_SENSITIVE) { + irqp->pq_state = 0; + if (val & KVM_XICS_LEVEL_SENSITIVE) irqp->lsi = 1; - if (val & KVM_XICS_PENDING) - irqp->asserted = 1; - } + /* If PENDING, 
set P in case P is not saved because of old code */ + if (val & KVM_XICS_PRESENTED || val & KVM_XICS_PENDING) + irqp->pq_state |= PQ_PRESENTED; + if (val & KVM_XICS_QUEUED) + irqp->pq_state |= PQ_QUEUED; irqp->exists = 1; arch_spin_unlock(&ics->lock); local_irq_restore(flags); if (val & KVM_XICS_PENDING) - icp_deliver_irq(xics, NULL, irqp->number); + icp_deliver_irq(xics, NULL, irqp->number, false); return 0; } diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h index 2a50320b55ca..ec5474cf70c6 100644 --- a/arch/powerpc/kvm/book3s_xics.h +++ b/arch/powerpc/kvm/book3s_xics.h @@ -31,16 +31,19 @@ /* Priority value to use for disabling an interrupt */ #define MASKED 0xff +#define PQ_PRESENTED 1 +#define PQ_QUEUED 2 + /* State for one irq source */ struct ics_irq_state { u32 number; u32 server; + u32 pq_state; u8 priority; u8 saved_priority; u8 resend; u8 masked_pending; u8 lsi; /* level-sensitive interrupt */ - u8 asserted; /* Only for LSI */ u8 exists; int intr_cpu; u32 host_irq; @@ -73,7 +76,6 @@ struct kvmppc_icp { */ #define XICS_RM_KICK_VCPU 0x1 #define XICS_RM_CHECK_RESEND 0x2 -#define XICS_RM_REJECT 0x4 #define XICS_RM_NOTIFY_EOI 0x8 u32 rm_action; struct kvm_vcpu *rm_kick_target; @@ -84,7 +86,6 @@ struct kvmppc_icp { /* Counters for each reason we exited real mode */ unsigned long n_rm_kick_vcpu; unsigned long n_rm_check_resend; - unsigned long n_rm_reject; unsigned long n_rm_notify_eoi; /* Counters for handling ICP processing in real mode */ unsigned long n_check_resend; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index cd892dec7cb6..2b3e4e620078 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -565,6 +565,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_PPC_HWRNG: r = kvmppc_hwrng_present(); break; + case KVM_CAP_PPC_MMU_RADIX: + r = !!(hv_enabled && radix_enabled()); + break; + case KVM_CAP_PPC_MMU_HASH_V3: + r = !!(hv_enabled && !radix_enabled() && + cpu_has_feature(CPU_FTR_ARCH_300)); + break; #endif case KVM_CAP_SYNC_MMU: #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE @@ -605,6 +612,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SPAPR_MULTITCE: r = 1; break; + case KVM_CAP_SPAPR_RESIZE_HPT: + r = !!hv_enabled; + break; #endif case KVM_CAP_PPC_HTM: r = cpu_has_feature(CPU_FTR_TM_COMP) && @@ -1468,6 +1478,31 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_rtas_define_token(kvm, argp); break; } + case KVM_PPC_CONFIGURE_V3_MMU: { + struct kvm *kvm = filp->private_data; + struct kvm_ppc_mmuv3_cfg cfg; + + r = -EINVAL; + if (!kvm->arch.kvm_ops->configure_mmu) + goto out; + r = -EFAULT; + if (copy_from_user(&cfg, argp, sizeof(cfg))) + goto out; + r = kvm->arch.kvm_ops->configure_mmu(kvm, &cfg); + break; + } + case KVM_PPC_GET_RMMU_INFO: { + struct kvm *kvm = filp->private_data; + struct kvm_ppc_rmmu_info info; + + r = -EINVAL; + if (!kvm->arch.kvm_ops->get_rmmu_info) + goto out; + r = kvm->arch.kvm_ops->get_rmmu_info(kvm, &info); + if (r >= 0 && copy_to_user(argp, &info, sizeof(info))) + r = -EFAULT; + break; + } default: { struct kvm *kvm = filp->private_data; r = kvm->arch.kvm_ops->arch_vm_ioctl(filp, ioctl, arg); diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c index f2108c40e697..eb8c6c8c4851 100644 --- a/arch/powerpc/mm/init-common.c +++ b/arch/powerpc/mm/init-common.c @@ -41,6 +41,7 @@ static void pmd_ctor(void *addr) } struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE]; +EXPORT_SYMBOL_GPL(pgtable_cache); /* 
used by kvm_hv module */ /* * Create a kmem_cache() for pagetables. This is not used for PTE @@ -86,7 +87,7 @@ void pgtable_cache_add(unsigned shift, void (*ctor)(void *)) pr_debug("Allocated pgtable cache for order %d\n", shift); } - +EXPORT_SYMBOL_GPL(pgtable_cache_add); /* used by kvm_hv module */ void pgtable_cache_init(void) { diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 93abf8a9813d..10c9a545a646 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -42,6 +42,8 @@ #include <linux/memblock.h> #include <linux/hugetlb.h> #include <linux/slab.h> +#include <linux/of_fdt.h> +#include <linux/libfdt.h> #include <asm/pgalloc.h> #include <asm/page.h> @@ -344,12 +346,45 @@ static int __init parse_disable_radix(char *p) } early_param("disable_radix", parse_disable_radix); +/* + * If we're running under a hypervisor, we need to check the contents of + * /chosen/ibm,architecture-vec-5 to see if the hypervisor is willing to do + * radix. If not, we clear the radix feature bit so we fall back to hash. + */ +static void early_check_vec5(void) +{ + unsigned long root, chosen; + int size; + const u8 *vec5; + + root = of_get_flat_dt_root(); + chosen = of_get_flat_dt_subnode_by_name(root, "chosen"); + if (chosen == -FDT_ERR_NOTFOUND) + return; + vec5 = of_get_flat_dt_prop(chosen, "ibm,architecture-vec-5", &size); + if (!vec5) + return; + if (size <= OV5_INDX(OV5_MMU_RADIX_300) || + !(vec5[OV5_INDX(OV5_MMU_RADIX_300)] & OV5_FEAT(OV5_MMU_RADIX_300))) + /* Hypervisor doesn't support radix */ + cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX; +} + void __init mmu_early_init_devtree(void) { /* Disable radix mode based on kernel command line. */ if (disable_radix) cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX; + /* + * Check /chosen/ibm,architecture-vec-5 if running as a guest. + * When running bare-metal, we can use radix if we like + * even though the ibm,architecture-vec-5 property created by + * skiboot doesn't have the necessary bits set. + */ + if (early_radix_enabled() && !(mfmsr() & MSR_HV)) + early_check_vec5(); + if (early_radix_enabled()) radix__early_init_devtree(); else diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index cfa53ccc8baf..94323c4ececc 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -401,6 +401,8 @@ void __init radix__early_init_mmu(void) mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); radix_init_partition_table(); radix_init_amor(); + } else { + radix_init_pseries(); } memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 8bca7f58afc4..d6b5e5cde412 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -454,13 +454,23 @@ void __init mmu_partition_table_init(void) void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0, unsigned long dw1) { + unsigned long old = be64_to_cpu(partition_tb[lpid].patb0); + partition_tb[lpid].patb0 = cpu_to_be64(dw0); partition_tb[lpid].patb1 = cpu_to_be64(dw1); - /* Global flush of TLBs and partition table caches for this lpid */ + /* + * Global flush of TLBs and partition table caches for this lpid. + * The type of flush (hash or radix) depends on what the previous + * use of this partition ID was, not the new use. 
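The flush-type selection the asm below implements can be read as (a sketch; the flush helper is hypothetical, the PATB_HR test is what the patch adds):

    unsigned long old = be64_to_cpu(partition_tb[lpid].patb0);
    if (old & PATB_HR)
            flush_lpid_tlb(lpid, /* R= */ 1);   /* hypothetical: radix-format tlbie */
    else
            flush_lpid_tlb(lpid, /* R= */ 0);   /* hypothetical: hash-format tlbie */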
+ */ asm volatile("ptesync" : : : "memory"); - asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : - "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); + if (old & PATB_HR) + asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); + else + asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); asm volatile("eieio; tlbsync; ptesync" : : : "memory"); } EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry); diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c index ea7f09bd73b1..7d67623203b8 100644 --- a/arch/powerpc/platforms/pseries/firmware.c +++ b/arch/powerpc/platforms/pseries/firmware.c @@ -126,7 +126,7 @@ static void __init fw_vec5_feature_init(const char *vec5, unsigned long len) index = OV5_INDX(vec5_fw_features_table[i].feature); feat = OV5_FEAT(vec5_fw_features_table[i].feature); - if (vec5[index] & feat) + if (index < len && (vec5[index] & feat)) powerpc_firmware_features |= vec5_fw_features_table[i].val; } diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 5dc1c3c6e716..0587655aea69 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -609,6 +609,29 @@ static int __init disable_bulk_remove(char *str) __setup("bulk_remove=", disable_bulk_remove); +/* Actually only used for radix, so far */ +static int pseries_lpar_register_process_table(unsigned long base, + unsigned long page_size, unsigned long table_size) +{ + long rc; + unsigned long flags = PROC_TABLE_NEW; + + if (radix_enabled()) + flags |= PROC_TABLE_RADIX | PROC_TABLE_GTSE; + for (;;) { + rc = plpar_hcall_norets(H_REGISTER_PROC_TBL, flags, base, + page_size, table_size); + if (!H_IS_LONG_BUSY(rc)) + break; + mdelay(get_longbusy_msecs(rc)); + } + if (rc != H_SUCCESS) { + pr_err("Failed to register process table (rc=%ld)\n", rc); + BUG(); + } + return rc; +} + void __init hpte_init_pseries(void) { mmu_hash_ops.hpte_invalidate = pSeries_lpar_hpte_invalidate; @@ -622,6 +645,12 @@ void __init hpte_init_pseries(void) mmu_hash_ops.hugepage_invalidate = pSeries_lpar_hugepage_invalidate; } +void radix_init_pseries(void) +{ + pr_info("Using radix MMU under hypervisor\n"); + register_process_table = pseries_lpar_register_process_table; +} + #ifdef CONFIG_PPC_SMLPAR #define CMO_FREE_HINT_DEFAULT 1 static int cmo_free_hint_flag = CMO_FREE_HINT_DEFAULT; diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 4aa8a7e2a1da..4492c9363178 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -373,7 +373,7 @@ void ipte_unlock(struct kvm_vcpu *vcpu) ipte_unlock_simple(vcpu); } -static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, ar_t ar, +static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, u8 ar, enum gacc_mode mode) { union alet alet; @@ -465,7 +465,9 @@ static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, ar_t ar, struct trans_exc_code_bits { unsigned long addr : 52; /* Translation-exception Address */ unsigned long fsi : 2; /* Access Exception Fetch/Store Indication */ - unsigned long : 6; + unsigned long : 2; + unsigned long b56 : 1; + unsigned long : 3; unsigned long b60 : 1; unsigned long b61 : 1; unsigned long as : 2; /* ASCE Identifier */ @@ -485,7 +487,7 @@ enum prot_type { }; static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, - ar_t ar, enum gacc_mode mode, enum prot_type prot) + u8 ar, enum gacc_mode mode, enum prot_type prot) { struct kvm_s390_pgm_info *pgm = 
&vcpu->arch.pgm; struct trans_exc_code_bits *tec; @@ -497,14 +499,18 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, switch (code) { case PGM_PROTECTION: switch (prot) { + case PROT_TYPE_LA: + tec->b56 = 1; + break; + case PROT_TYPE_KEYC: + tec->b60 = 1; + break; case PROT_TYPE_ALC: tec->b60 = 1; /* FALL THROUGH */ case PROT_TYPE_DAT: tec->b61 = 1; break; - default: /* LA and KEYC set b61 to 0, other params undefined */ - return code; } /* FALL THROUGH */ case PGM_ASCE_TYPE: @@ -539,7 +545,7 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, } static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce, - unsigned long ga, ar_t ar, enum gacc_mode mode) + unsigned long ga, u8 ar, enum gacc_mode mode) { int rc; struct psw_bits psw = psw_bits(vcpu->arch.sie_block->gpsw); @@ -771,7 +777,7 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu, return 1; } -static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, +static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, unsigned long *pages, unsigned long nr_pages, const union asce asce, enum gacc_mode mode) { @@ -803,7 +809,7 @@ static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, return 0; } -int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data, +int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, unsigned long len, enum gacc_mode mode) { psw_t *psw = &vcpu->arch.sie_block->gpsw; @@ -877,7 +883,7 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, * Note: The IPTE lock is not taken during this function, so the caller * has to take care of this. */ -int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar, +int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, unsigned long *gpa, enum gacc_mode mode) { psw_t *psw = &vcpu->arch.sie_block->gpsw; @@ -910,7 +916,7 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar, /** * check_gva_range - test a range of guest virtual addresses for accessibility */ -int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar, +int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, unsigned long length, enum gacc_mode mode) { unsigned long gpa; diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index 8756569ad938..7ce47fd36f28 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -162,11 +162,11 @@ enum gacc_mode { }; int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, - ar_t ar, unsigned long *gpa, enum gacc_mode mode); -int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar, + u8 ar, unsigned long *gpa, enum gacc_mode mode); +int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, unsigned long length, enum gacc_mode mode); -int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data, +int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, unsigned long len, enum gacc_mode mode); int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, @@ -218,7 +218,7 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, * if data has been changed in guest space in case of an exception. 
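A typical caller pattern for these accessors (a hedged sketch; kvm_s390_inject_prog_cond() is the usual way a positive return code is turned into a guest program interrupt):

    u64 val = 0x1234;
    int rc = write_guest(vcpu, ga, ar, &val, sizeof(val));
    if (rc)         /* > 0: PGM code for the guest, < 0: host error */
            return kvm_s390_inject_prog_cond(vcpu, rc);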
*/ static inline __must_check -int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data, +int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, unsigned long len) { return access_guest(vcpu, ga, ar, data, len, GACC_STORE); @@ -238,7 +238,7 @@ int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data, * data will be copied from guest space to kernel space. */ static inline __must_check -int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data, +int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, unsigned long len) { return access_guest(vcpu, ga, ar, data, len, GACC_FETCH); @@ -247,10 +247,11 @@ int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data, /** * read_guest_instr - copy instruction data from guest space to kernel space * @vcpu: virtual cpu + * @ga: guest address * @data: destination address in kernel space * @len: number of bytes to copy * - * Copy @len bytes from the current psw address (guest space) to @data (kernel + * Copy @len bytes from the given address (guest space) to @data (kernel * space). * * The behaviour of read_guest_instr is identical to read_guest, except that @@ -258,10 +259,10 @@ int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data, * address-space mode. */ static inline __must_check -int read_guest_instr(struct kvm_vcpu *vcpu, void *data, unsigned long len) +int read_guest_instr(struct kvm_vcpu *vcpu, unsigned long ga, void *data, + unsigned long len) { - return access_guest(vcpu, vcpu->arch.sie_block->gpsw.addr, 0, data, len, - GACC_IFETCH); + return access_guest(vcpu, ga, 0, data, len, GACC_IFETCH); } /** diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c index d7c6a7f53ced..23d9a4e12da1 100644 --- a/arch/s390/kvm/guestdbg.c +++ b/arch/s390/kvm/guestdbg.c @@ -388,14 +388,13 @@ void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu) #define per_write_wp_event(code) \ (code & (PER_CODE_STORE | PER_CODE_STORE_REAL)) -static int debug_exit_required(struct kvm_vcpu *vcpu) +static int debug_exit_required(struct kvm_vcpu *vcpu, u8 perc, + unsigned long peraddr) { - u8 perc = vcpu->arch.sie_block->perc; struct kvm_debug_exit_arch *debug_exit = &vcpu->run->debug.arch; struct kvm_hw_wp_info_arch *wp_info = NULL; struct kvm_hw_bp_info_arch *bp_info = NULL; unsigned long addr = vcpu->arch.sie_block->gpsw.addr; - unsigned long peraddr = vcpu->arch.sie_block->peraddr; if (guestdbg_hw_bp_enabled(vcpu)) { if (per_write_wp_event(perc) && @@ -437,36 +436,118 @@ exit_required: return 1; } +static int per_fetched_addr(struct kvm_vcpu *vcpu, unsigned long *addr) +{ + u8 exec_ilen = 0; + u16 opcode[3]; + int rc; + + if (vcpu->arch.sie_block->icptcode == ICPT_PROGI) { + /* PER address references the fetched or the execute instr */ + *addr = vcpu->arch.sie_block->peraddr; + /* + * Manually detect if we have an EXECUTE instruction. 
As + * instructions are always 2 byte aligned we can read the + * first two bytes unconditionally + */ + rc = read_guest_instr(vcpu, *addr, &opcode, 2); + if (rc) + return rc; + if (opcode[0] >> 8 == 0x44) + exec_ilen = 4; + if ((opcode[0] & 0xff0f) == 0xc600) + exec_ilen = 6; + } else { + /* instr was suppressed, calculate the responsible instr */ + *addr = __rewind_psw(vcpu->arch.sie_block->gpsw, + kvm_s390_get_ilen(vcpu)); + if (vcpu->arch.sie_block->icptstatus & 0x01) { + exec_ilen = (vcpu->arch.sie_block->icptstatus & 0x60) >> 4; + if (!exec_ilen) + exec_ilen = 4; + } + } + + if (exec_ilen) { + /* read the complete EXECUTE instr to detect the fetched addr */ + rc = read_guest_instr(vcpu, *addr, &opcode, exec_ilen); + if (rc) + return rc; + if (exec_ilen == 6) { + /* EXECUTE RELATIVE LONG - RIL-b format */ + s32 rl = *((s32 *) (opcode + 1)); + + /* rl is a _signed_ 32 bit value specifying halfwords */ + *addr += (u64)(s64) rl * 2; + } else { + /* EXECUTE - RX-a format */ + u32 base = (opcode[1] & 0xf000) >> 12; + u32 disp = opcode[1] & 0x0fff; + u32 index = opcode[0] & 0x000f; + + *addr = base ? vcpu->run->s.regs.gprs[base] : 0; + *addr += index ? vcpu->run->s.regs.gprs[index] : 0; + *addr += disp; + } + *addr = kvm_s390_logical_to_effective(vcpu, *addr); + } + return 0; +} + #define guest_per_enabled(vcpu) \ (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER) int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu) { + const u64 cr10 = vcpu->arch.sie_block->gcr[10]; + const u64 cr11 = vcpu->arch.sie_block->gcr[11]; const u8 ilen = kvm_s390_get_ilen(vcpu); struct kvm_s390_pgm_info pgm_info = { .code = PGM_PER, .per_code = PER_CODE_IFETCH, .per_address = __rewind_psw(vcpu->arch.sie_block->gpsw, ilen), }; + unsigned long fetched_addr; + int rc; /* * The PSW points to the next instruction, therefore the intercepted * instruction generated a PER i-fetch event. PER address therefore * points at the previous PSW address (could be an EXECUTE function). 
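A worked example of the EXECUTE RELATIVE LONG arithmetic in per_fetched_addr() (illustrative values):

    /* EXEC RL at guest address 0x20000 with RI2 = 0xfffffffe (-2 halfwords): */
    s32 rl = -2;
    u64 target = 0x20000 + (u64)(s64)rl * 2;    /* = 0x1fffc */
    /* i.e. the executed instruction lies 4 bytes before the EXECUTE itself */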
*/ - return kvm_s390_inject_prog_irq(vcpu, &pgm_info); + if (!guestdbg_enabled(vcpu)) + return kvm_s390_inject_prog_irq(vcpu, &pgm_info); + + if (debug_exit_required(vcpu, pgm_info.per_code, pgm_info.per_address)) + vcpu->guest_debug |= KVM_GUESTDBG_EXIT_PENDING; + + if (!guest_per_enabled(vcpu) || + !(vcpu->arch.sie_block->gcr[9] & PER_EVENT_IFETCH)) + return 0; + + rc = per_fetched_addr(vcpu, &fetched_addr); + if (rc < 0) + return rc; + if (rc) + /* instruction-fetching exceptions */ + return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + + if (in_addr_range(fetched_addr, cr10, cr11)) + return kvm_s390_inject_prog_irq(vcpu, &pgm_info); + return 0; } -static void filter_guest_per_event(struct kvm_vcpu *vcpu) +static int filter_guest_per_event(struct kvm_vcpu *vcpu) { const u8 perc = vcpu->arch.sie_block->perc; - u64 peraddr = vcpu->arch.sie_block->peraddr; u64 addr = vcpu->arch.sie_block->gpsw.addr; u64 cr9 = vcpu->arch.sie_block->gcr[9]; u64 cr10 = vcpu->arch.sie_block->gcr[10]; u64 cr11 = vcpu->arch.sie_block->gcr[11]; /* filter all events, demanded by the guest */ u8 guest_perc = perc & (cr9 >> 24) & PER_CODE_MASK; + unsigned long fetched_addr; + int rc; if (!guest_per_enabled(vcpu)) guest_perc = 0; @@ -478,9 +559,17 @@ static void filter_guest_per_event(struct kvm_vcpu *vcpu) guest_perc &= ~PER_CODE_BRANCH; /* filter "instruction-fetching" events */ - if (guest_perc & PER_CODE_IFETCH && - !in_addr_range(peraddr, cr10, cr11)) - guest_perc &= ~PER_CODE_IFETCH; + if (guest_perc & PER_CODE_IFETCH) { + rc = per_fetched_addr(vcpu, &fetched_addr); + if (rc < 0) + return rc; + /* + * Don't inject an irq on exceptions. This would make handling + * on icpt code 8 very complex (as PSW was already rewound). + */ + if (rc || !in_addr_range(fetched_addr, cr10, cr11)) + guest_perc &= ~PER_CODE_IFETCH; + } /* All other PER events will be given to the guest */ /* TODO: Check altered address/address space */ @@ -489,6 +578,7 @@ static void filter_guest_per_event(struct kvm_vcpu *vcpu) if (!guest_perc) vcpu->arch.sie_block->iprcc &= ~PGM_PER; + return 0; } #define pssec(vcpu) (vcpu->arch.sie_block->gcr[1] & _ASCE_SPACE_SWITCH) @@ -496,14 +586,17 @@ static void filter_guest_per_event(struct kvm_vcpu *vcpu) #define old_ssec(vcpu) ((vcpu->arch.sie_block->tecmc >> 31) & 0x1) #define old_as_is_home(vcpu) !(vcpu->arch.sie_block->tecmc & 0xffff) -void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu) +int kvm_s390_handle_per_event(struct kvm_vcpu *vcpu) { - int new_as; + int rc, new_as; - if (debug_exit_required(vcpu)) + if (debug_exit_required(vcpu, vcpu->arch.sie_block->perc, + vcpu->arch.sie_block->peraddr)) vcpu->guest_debug |= KVM_GUESTDBG_EXIT_PENDING; - filter_guest_per_event(vcpu); + rc = filter_guest_per_event(vcpu); + if (rc) + return rc; /* * Only RP, SAC, SACF, PT, PTI, PR, PC instructions can trigger @@ -532,4 +625,5 @@ void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu) (pssec(vcpu) || old_ssec(vcpu))) vcpu->arch.sie_block->iprcc = PGM_SPACE_SWITCH; } + return 0; } diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 7a27eebab28a..59920f96ebc0 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -238,7 +238,9 @@ static int handle_prog(struct kvm_vcpu *vcpu) vcpu->stat.exit_program_interruption++; if (guestdbg_enabled(vcpu) && per_event(vcpu)) { - kvm_s390_handle_per_event(vcpu); + rc = kvm_s390_handle_per_event(vcpu); + if (rc) + return rc; /* the interrupt might have been filtered out completely */ if (vcpu->arch.sie_block->iprcc == 0) return 0; @@ 
-359,6 +361,9 @@ static int handle_partial_execution(struct kvm_vcpu *vcpu) static int handle_operexc(struct kvm_vcpu *vcpu) { + psw_t oldpsw, newpsw; + int rc; + vcpu->stat.exit_operation_exception++; trace_kvm_s390_handle_operexc(vcpu, vcpu->arch.sie_block->ipa, vcpu->arch.sie_block->ipb); @@ -369,6 +374,24 @@ static int handle_operexc(struct kvm_vcpu *vcpu) if (vcpu->arch.sie_block->ipa == 0 && vcpu->kvm->arch.user_instr0) return -EOPNOTSUPP; + rc = read_guest_lc(vcpu, __LC_PGM_NEW_PSW, &newpsw, sizeof(psw_t)); + if (rc) + return rc; + /* + * Avoid endless loops of operation exceptions, if the pgm new + * PSW will cause a new operation exception. + * The heuristic checks if the pgm new psw is within 6 bytes before + * the faulting psw address (with same DAT, AS settings) and the + * new psw is not a wait psw and the fault was not triggered by + * problem state. + */ + oldpsw = vcpu->arch.sie_block->gpsw; + if (oldpsw.addr - newpsw.addr <= 6 && + !(newpsw.mask & PSW_MASK_WAIT) && + !(oldpsw.mask & PSW_MASK_PSTATE) && + (newpsw.mask & PSW_MASK_ASC) == (oldpsw.mask & PSW_MASK_ASC) && + (newpsw.mask & PSW_MASK_DAT) == (oldpsw.mask & PSW_MASK_DAT)) + return -EOPNOTSUPP; return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 6484a250021e..502de74ea984 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -217,7 +217,7 @@ static void allow_cpu_feat(unsigned long nr) static inline int plo_test_bit(unsigned char nr) { register unsigned long r0 asm("0") = (unsigned long) nr | 0x100; - int cc = 3; /* subfunction not available */ + int cc; asm volatile( /* Parameter registers are ignored for "test bit" */ @@ -442,6 +442,9 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot; int is_dirty = 0; + if (kvm_is_ucontrol(kvm)) + return -EINVAL; + mutex_lock(&kvm->slots_lock); r = -EINVAL; @@ -505,6 +508,14 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) } else if (MACHINE_HAS_VX) { set_kvm_facility(kvm->arch.model.fac_mask, 129); set_kvm_facility(kvm->arch.model.fac_list, 129); + if (test_facility(134)) { + set_kvm_facility(kvm->arch.model.fac_mask, 134); + set_kvm_facility(kvm->arch.model.fac_list, 134); + } + if (test_facility(135)) { + set_kvm_facility(kvm->arch.model.fac_mask, 135); + set_kvm_facility(kvm->arch.model.fac_list, 135); + } r = 0; } else r = -EINVAL; @@ -821,6 +832,13 @@ static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr) } memcpy(kvm->arch.model.fac_list, proc->fac_list, S390_ARCH_FAC_LIST_SIZE_BYTE); + VM_EVENT(kvm, 3, "SET: guest ibc: 0x%4.4x, guest cpuid: 0x%16.16llx", + kvm->arch.model.ibc, + kvm->arch.model.cpuid); + VM_EVENT(kvm, 3, "SET: guest faclist: 0x%16.16llx.%16.16llx.%16.16llx", + kvm->arch.model.fac_list[0], + kvm->arch.model.fac_list[1], + kvm->arch.model.fac_list[2]); } else ret = -EFAULT; kfree(proc); @@ -894,6 +912,13 @@ static int kvm_s390_get_processor(struct kvm *kvm, struct kvm_device_attr *attr) proc->ibc = kvm->arch.model.ibc; memcpy(&proc->fac_list, kvm->arch.model.fac_list, S390_ARCH_FAC_LIST_SIZE_BYTE); + VM_EVENT(kvm, 3, "GET: guest ibc: 0x%4.4x, guest cpuid: 0x%16.16llx", + kvm->arch.model.ibc, + kvm->arch.model.cpuid); + VM_EVENT(kvm, 3, "GET: guest faclist: 0x%16.16llx.%16.16llx.%16.16llx", + kvm->arch.model.fac_list[0], + kvm->arch.model.fac_list[1], + kvm->arch.model.fac_list[2]); if (copy_to_user((void __user *)attr->addr, proc, sizeof(*proc))) ret = -EFAULT; 
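The operation-exception loop check in handle_operexc() above compares unsigned PSW addresses, so a pgm new PSW that points forward fails the test by wrapping; a quick worked case (illustrative addresses):

    u64 old = 0x4e0;    /* PSW after the faulting opcode */
    u64 new = 0x4dc;    /* pgm new PSW points back into it */
    bool looping = (old - new) <= 6;    /* true: guest would re-fault forever */
    /* with new = 0x500 the subtraction wraps to a huge value and the
     * heuristic correctly stays out of the way */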
kfree(proc); @@ -917,6 +942,17 @@ static int kvm_s390_get_machine(struct kvm *kvm, struct kvm_device_attr *attr) S390_ARCH_FAC_LIST_SIZE_BYTE); memcpy((unsigned long *)&mach->fac_list, S390_lowcore.stfle_fac_list, sizeof(S390_lowcore.stfle_fac_list)); + VM_EVENT(kvm, 3, "GET: host ibc: 0x%4.4x, host cpuid: 0x%16.16llx", + kvm->arch.model.ibc, + kvm->arch.model.cpuid); + VM_EVENT(kvm, 3, "GET: host facmask: 0x%16.16llx.%16.16llx.%16.16llx", + mach->fac_mask[0], + mach->fac_mask[1], + mach->fac_mask[2]); + VM_EVENT(kvm, 3, "GET: host faclist: 0x%16.16llx.%16.16llx.%16.16llx", + mach->fac_list[0], + mach->fac_list[1], + mach->fac_list[2]); if (copy_to_user((void __user *)attr->addr, mach, sizeof(*mach))) ret = -EFAULT; kfree(mach); @@ -1938,6 +1974,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) if (test_kvm_facility(vcpu->kvm, 8) && sclp.has_pfmfi) vcpu->arch.sie_block->ecb2 |= 0x08; + if (test_kvm_facility(vcpu->kvm, 130)) + vcpu->arch.sie_block->ecb2 |= 0x20; vcpu->arch.sie_block->eca = 0x1002000U; if (sclp.has_cei) vcpu->arch.sie_block->eca |= 0x80000000U; @@ -2578,7 +2616,7 @@ static int vcpu_post_run_fault_in_sie(struct kvm_vcpu *vcpu) * to look up the current opcode to get the length of the instruction * to be able to forward the PSW. */ - rc = read_guest_instr(vcpu, &opcode, 1); + rc = read_guest_instr(vcpu, vcpu->arch.sie_block->gpsw.addr, &opcode, 1); ilen = insn_length(opcode); if (rc < 0) { return rc; diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 3a4e97f1a9e6..af9fa91a0c91 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -86,9 +86,7 @@ static inline void kvm_s390_set_prefix(struct kvm_vcpu *vcpu, u32 prefix) kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); } -typedef u8 __bitwise ar_t; - -static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu, ar_t *ar) +static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu, u8 *ar) { u32 base2 = vcpu->arch.sie_block->ipb >> 28; u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16); @@ -101,7 +99,7 @@ static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu, ar_t *ar) static inline void kvm_s390_get_base_disp_sse(struct kvm_vcpu *vcpu, u64 *address1, u64 *address2, - ar_t *ar_b1, ar_t *ar_b2) + u8 *ar_b1, u8 *ar_b2) { u32 base1 = (vcpu->arch.sie_block->ipb & 0xf0000000) >> 28; u32 disp1 = (vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16; @@ -125,7 +123,7 @@ static inline void kvm_s390_get_regs_rre(struct kvm_vcpu *vcpu, int *r1, int *r2 *r2 = (vcpu->arch.sie_block->ipb & 0x000f0000) >> 16; } -static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu, ar_t *ar) +static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu, u8 *ar) { u32 base2 = vcpu->arch.sie_block->ipb >> 28; u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16) + @@ -140,7 +138,7 @@ static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu, ar_t *ar) return (base2 ? 
vcpu->run->s.regs.gprs[base2] : 0) + (long)(int)disp2; } -static inline u64 kvm_s390_get_base_disp_rs(struct kvm_vcpu *vcpu, ar_t *ar) +static inline u64 kvm_s390_get_base_disp_rs(struct kvm_vcpu *vcpu, u8 *ar) { u32 base2 = vcpu->arch.sie_block->ipb >> 28; u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16); @@ -379,7 +377,7 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, void kvm_s390_clear_bp_data(struct kvm_vcpu *vcpu); void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu); int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu); -void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu); +int kvm_s390_handle_per_event(struct kvm_vcpu *vcpu); /* support for Basic/Extended SCA handling */ static inline union ipte_control *kvm_s390_get_ipte_control(struct kvm *kvm) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index e18435355c16..1ecc1cffdf7c 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -54,7 +54,7 @@ int kvm_s390_handle_aa(struct kvm_vcpu *vcpu) static int handle_set_clock(struct kvm_vcpu *vcpu) { int rc; - ar_t ar; + u8 ar; u64 op2, val; if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) @@ -79,7 +79,7 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu) u64 operand2; u32 address; int rc; - ar_t ar; + u8 ar; vcpu->stat.instruction_spx++; @@ -117,7 +117,7 @@ static int handle_store_prefix(struct kvm_vcpu *vcpu) u64 operand2; u32 address; int rc; - ar_t ar; + u8 ar; vcpu->stat.instruction_stpx++; @@ -147,7 +147,7 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu) u16 vcpu_id = vcpu->vcpu_id; u64 ga; int rc; - ar_t ar; + u8 ar; vcpu->stat.instruction_stap++; @@ -380,7 +380,7 @@ static int handle_tpi(struct kvm_vcpu *vcpu) u32 tpi_data[3]; int rc; u64 addr; - ar_t ar; + u8 ar; addr = kvm_s390_get_base_disp_s(vcpu, &ar); if (addr & 3) @@ -548,7 +548,7 @@ int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu) psw_compat_t new_psw; u64 addr; int rc; - ar_t ar; + u8 ar; if (gpsw->mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); @@ -575,7 +575,7 @@ static int handle_lpswe(struct kvm_vcpu *vcpu) psw_t new_psw; u64 addr; int rc; - ar_t ar; + u8 ar; if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); @@ -597,7 +597,7 @@ static int handle_stidp(struct kvm_vcpu *vcpu) u64 stidp_data = vcpu->kvm->arch.model.cpuid; u64 operand2; int rc; - ar_t ar; + u8 ar; vcpu->stat.instruction_stidp++; @@ -644,7 +644,7 @@ static void handle_stsi_3_2_2(struct kvm_vcpu *vcpu, struct sysinfo_3_2_2 *mem) ASCEBC(mem->vm[0].cpi, 16); } -static void insert_stsi_usr_data(struct kvm_vcpu *vcpu, u64 addr, ar_t ar, +static void insert_stsi_usr_data(struct kvm_vcpu *vcpu, u64 addr, u8 ar, u8 fc, u8 sel1, u16 sel2) { vcpu->run->exit_reason = KVM_EXIT_S390_STSI; @@ -663,7 +663,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu) unsigned long mem = 0; u64 operand2; int rc = 0; - ar_t ar; + u8 ar; vcpu->stat.instruction_stsi++; VCPU_EVENT(vcpu, 3, "STSI: fc: %u sel1: %u sel2: %u", fc, sel1, sel2); @@ -970,7 +970,7 @@ int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu) int reg, rc, nr_regs; u32 ctl_array[16]; u64 ga; - ar_t ar; + u8 ar; vcpu->stat.instruction_lctl++; @@ -1009,7 +1009,7 @@ int kvm_s390_handle_stctl(struct kvm_vcpu *vcpu) int reg, rc, nr_regs; u32 ctl_array[16]; u64 ga; - ar_t ar; + u8 ar; vcpu->stat.instruction_stctl++; @@ -1043,7 +1043,7 @@ static int handle_lctlg(struct kvm_vcpu *vcpu) int reg, rc, nr_regs; u64 ctl_array[16]; u64 ga; - ar_t ar; + u8 ar; 
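The base+displacement helpers above all follow the same decode; a worked example for kvm_s390_get_base_disp_s() (values illustrative):

    u32 ipb   = 0x34560000;
    u32 base2 = ipb >> 28;                          /* 0x3 */
    u32 disp2 = (ipb & 0x0fff0000) >> 16;           /* 0x456 */
    u64 addr  = (base2 ? gprs[base2] : 0) + disp2;  /* gprs = vcpu->run->s.regs.gprs */
    /* *ar = base2: in access-register mode the base register number also
     * selects the access register, which is why these helpers now take a
     * plain u8 pointer instead of the removed ar_t typedef */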
vcpu->stat.instruction_lctlg++; @@ -1081,7 +1081,7 @@ static int handle_stctg(struct kvm_vcpu *vcpu) int reg, rc, nr_regs; u64 ctl_array[16]; u64 ga; - ar_t ar; + u8 ar; vcpu->stat.instruction_stctg++; @@ -1132,7 +1132,7 @@ static int handle_tprot(struct kvm_vcpu *vcpu) unsigned long hva, gpa; int ret = 0, cc = 0; bool writable; - ar_t ar; + u8 ar; vcpu->stat.instruction_tprot++; diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index d8673e243f13..ed62c6d57d93 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -324,6 +324,9 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) /* Run-time-Instrumentation */ if (test_kvm_facility(vcpu->kvm, 64)) scb_s->ecb3 |= scb_o->ecb3 & 0x01U; + /* Instruction Execution Prevention */ + if (test_kvm_facility(vcpu->kvm, 130)) + scb_s->ecb2 |= scb_o->ecb2 & 0x20U; if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIIF)) scb_s->eca |= scb_o->eca & 0x00000001U; if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IB)) diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 7a1897c51c54..f70db837ddc4 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -741,7 +741,7 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) pgste_set_unlock(ptep, new); pte_unmap_unlock(ptep, ptl); - return 0; + return cc; } EXPORT_SYMBOL(reset_guest_reference_bit); diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c index 8cc53b1e6d03..0cf802de52a1 100644 --- a/arch/s390/tools/gen_facilities.c +++ b/arch/s390/tools/gen_facilities.c @@ -80,6 +80,8 @@ static struct facility_def facility_defs[] = { 76, /* msa extension 3 */ 77, /* msa extension 4 */ 78, /* enhanced-DAT 2 */ + 130, /* instruction-execution-protection */ + 131, /* enhanced-SOP 2 and side-effect */ -1 /* END */ } }, diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index eafee3161d1c..d9d7136edf05 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -288,6 +288,7 @@ #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ +#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ #define X86_FEATURE_RDPID (16*32+ 22) /* RDPID instruction */ /* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */ @@ -320,5 +321,4 @@ #define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ #define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ #define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ - #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index e9cd7befcb76..3e8c287090e4 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -441,5 +441,6 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq); void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt); void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt); +bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt); #endif /* _ASM_X86_KVM_X86_EMULATE_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a7066dc1a7e9..417502cf42b6 100644 --- a/arch/x86/include/asm/kvm_host.h 
+++ b/arch/x86/include/asm/kvm_host.h @@ -115,7 +115,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) #define KVM_PERMILLE_MMU_PAGES 20 #define KVM_MIN_ALLOC_MMU_PAGES 64 -#define KVM_MMU_HASH_SHIFT 10 +#define KVM_MMU_HASH_SHIFT 12 #define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT) #define KVM_MIN_FREE_MMU_PAGES 5 #define KVM_REFILL_PAGES 25 @@ -208,6 +208,13 @@ enum { PFERR_WRITE_MASK | \ PFERR_PRESENT_MASK) +/* + * The mask used to denote special SPTEs, which can be either MMIO SPTEs or + * Access Tracking SPTEs. We use bit 62 instead of bit 63 to avoid conflicting + * with the SVE bit in EPT PTEs. + */ +#define SPTE_SPECIAL_MASK (1ULL << 62) + /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 /* @@ -668,6 +675,9 @@ struct kvm_vcpu_arch { int pending_ioapic_eoi; int pending_external_vector; + + /* GPA available (AMD only) */ + bool gpa_available; }; struct kvm_lpage_info { @@ -716,6 +726,12 @@ struct kvm_hv { HV_REFERENCE_TSC_PAGE tsc_ref; }; +enum kvm_irqchip_mode { + KVM_IRQCHIP_NONE, + KVM_IRQCHIP_KERNEL, /* created with KVM_CREATE_IRQCHIP */ + KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */ +}; + struct kvm_arch { unsigned int n_used_mmu_pages; unsigned int n_requested_mmu_pages; @@ -788,7 +804,7 @@ struct kvm_arch { u64 disabled_quirks; - bool irqchip_split; + enum kvm_irqchip_mode irqchip_mode; u8 nr_reserved_ioapic_pins; bool disabled_lapic_found; @@ -815,6 +831,7 @@ struct kvm_vm_stat { ulong mmu_unsync; ulong remote_tlb_flush; ulong lpages; + ulong max_mmu_page_hash_collisions; }; struct kvm_vcpu_stat { @@ -844,6 +861,7 @@ struct kvm_vcpu_stat { u64 hypercalls; u64 irq_injections; u64 nmi_injections; + u64 req_event; }; struct x86_instruction_info; @@ -1050,7 +1068,8 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu); void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask); + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, + u64 acc_track_mask); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, diff --git a/arch/x86/include/asm/kvmclock.h b/arch/x86/include/asm/kvmclock.h new file mode 100644 index 000000000000..f260bef63591 --- /dev/null +++ b/arch/x86/include/asm/kvmclock.h @@ -0,0 +1,6 @@ +#ifndef _ASM_X86_KVM_CLOCK_H +#define _ASM_X86_KVM_CLOCK_H + +extern struct clocksource kvm_clock; + +#endif /* _ASM_X86_KVM_CLOCK_H */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 2b5b2d4b924e..cc54b7026567 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -467,8 +467,16 @@ enum vmcs_field { #define VMX_EPT_WRITABLE_MASK 0x2ull #define VMX_EPT_EXECUTABLE_MASK 0x4ull #define VMX_EPT_IPAT_BIT (1ull << 6) -#define VMX_EPT_ACCESS_BIT (1ull << 8) -#define VMX_EPT_DIRTY_BIT (1ull << 9) +#define VMX_EPT_ACCESS_BIT (1ull << 8) +#define VMX_EPT_DIRTY_BIT (1ull << 9) +#define VMX_EPT_RWX_MASK (VMX_EPT_READABLE_MASK | \ + VMX_EPT_WRITABLE_MASK | \ + VMX_EPT_EXECUTABLE_MASK) +#define VMX_EPT_MT_MASK (7ull << VMX_EPT_MT_EPTE_SHIFT) + +/* The mask to use to trigger an EPT Misconfiguration in order to track MMIO */ +#define VMX_EPT_MISCONFIG_WX_VALUE (VMX_EPT_WRITABLE_MASK | \ + VMX_EPT_EXECUTABLE_MASK) #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul @@ -500,6 +508,22 @@ struct vmx_msr_entry { #define ENTRY_FAIL_VMCS_LINK_PTR 4 /* + * Exit Qualifications for EPT Violations + */ +#define 
EPT_VIOLATION_ACC_READ_BIT 0 +#define EPT_VIOLATION_ACC_WRITE_BIT 1 +#define EPT_VIOLATION_ACC_INSTR_BIT 2 +#define EPT_VIOLATION_READABLE_BIT 3 +#define EPT_VIOLATION_WRITABLE_BIT 4 +#define EPT_VIOLATION_EXECUTABLE_BIT 5 +#define EPT_VIOLATION_ACC_READ (1 << EPT_VIOLATION_ACC_READ_BIT) +#define EPT_VIOLATION_ACC_WRITE (1 << EPT_VIOLATION_ACC_WRITE_BIT) +#define EPT_VIOLATION_ACC_INSTR (1 << EPT_VIOLATION_ACC_INSTR_BIT) +#define EPT_VIOLATION_READABLE (1 << EPT_VIOLATION_READABLE_BIT) +#define EPT_VIOLATION_WRITABLE (1 << EPT_VIOLATION_WRITABLE_BIT) +#define EPT_VIOLATION_EXECUTABLE (1 << EPT_VIOLATION_EXECUTABLE_BIT) + +/* * VM-instruction error numbers */ enum vm_instruction_error_number { diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 1421a6585126..cff0bb6556f8 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -50,6 +50,15 @@ struct kvm_steal_time { __u32 pad[11]; }; +#define KVM_CLOCK_PAIRING_WALLCLOCK 0 +struct kvm_clock_pairing { + __s64 sec; + __s64 nsec; + __u64 tsc; + __u32 flags; + __u32 pad[9]; +}; + #define KVM_STEAL_ALIGNMENT_BITS 5 #define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1))) #define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 1d7770447b3e..35f7024aace5 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -78,6 +78,7 @@ void fpu__xstate_clear_all_cpu_caps(void) setup_clear_cpu_cap(X86_FEATURE_PKU); setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW); setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS); + setup_clear_cpu_cap(X86_FEATURE_AVX512_VPOPCNTDQ); } /* diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 2a5cafdf8808..995fa260a6da 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -28,6 +28,7 @@ #include <asm/x86_init.h> #include <asm/reboot.h> +#include <asm/kvmclock.h> static int kvmclock __ro_after_init = 1; static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; @@ -49,6 +50,7 @@ struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void) { return hv_clock; } +EXPORT_SYMBOL_GPL(pvclock_pvti_cpu0_va); /* * The wallclock is the time of day when we booted. Since then, some time may @@ -174,13 +176,14 @@ bool kvm_check_and_clear_guest_paused(void) return ret; } -static struct clocksource kvm_clock = { +struct clocksource kvm_clock = { .name = "kvm-clock", .read = kvm_clock_get_cycles, .rating = 400, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; +EXPORT_SYMBOL_GPL(kvm_clock); int kvm_register_clock(char *txt) { diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index e85f6bd7b9d5..c0e2036217ad 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -383,7 +383,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ecx*/ const u32 kvm_cpuid_7_0_ecx_x86_features = - F(AVX512VBMI) | F(PKU) | 0 /*OSPKE*/; + F(AVX512VBMI) | F(PKU) | 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ); /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = @@ -861,12 +861,6 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) if (!best) best = check_cpuid_limit(vcpu, function, index); - /* - * Perfmon not yet supported for L2 guest. 
- */ - if (is_guest_mode(vcpu) && function == 0xa) - best = NULL; - if (best) { *eax = best->eax; *ebx = best->ebx; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index cedbba0f3402..45c7306c8780 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -173,6 +173,7 @@ #define NearBranch ((u64)1 << 52) /* Near branches */ #define No16 ((u64)1 << 53) /* No 16 bit operand */ #define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */ +#define TwoMemOp ((u64)1 << 55) /* Instruction has two memory operand */ #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) @@ -4298,7 +4299,7 @@ static const struct opcode group1[] = { }; static const struct opcode group1A[] = { - I(DstMem | SrcNone | Mov | Stack | IncSP, em_pop), N, N, N, N, N, N, N, + I(DstMem | SrcNone | Mov | Stack | IncSP | TwoMemOp, em_pop), N, N, N, N, N, N, N, }; static const struct opcode group2[] = { @@ -4336,7 +4337,7 @@ static const struct opcode group5[] = { I(SrcMemFAddr | ImplicitOps, em_call_far), I(SrcMem | NearBranch, em_jmp_abs), I(SrcMemFAddr | ImplicitOps, em_jmp_far), - I(SrcMem | Stack, em_push), D(Undefined), + I(SrcMem | Stack | TwoMemOp, em_push), D(Undefined), }; static const struct opcode group6[] = { @@ -4556,8 +4557,8 @@ static const struct opcode opcode_table[256] = { /* 0xA0 - 0xA7 */ I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), - I2bv(SrcSI | DstDI | Mov | String, em_mov), - F2bv(SrcSI | DstDI | String | NoWrite, em_cmp_r), + I2bv(SrcSI | DstDI | Mov | String | TwoMemOp, em_mov), + F2bv(SrcSI | DstDI | String | NoWrite | TwoMemOp, em_cmp_r), /* 0xA8 - 0xAF */ F2bv(DstAcc | SrcImm | NoWrite, em_test), I2bv(SrcAcc | DstDI | Mov | String, em_mov), @@ -5671,3 +5672,14 @@ void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt) { writeback_registers(ctxt); } + +bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt) +{ + if (ctxt->rep_prefix && (ctxt->d & String)) + return false; + + if (ctxt->d & TwoMemOp) + return false; + + return true; +} diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 1572c35b4f1a..08b27e0c7b71 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -305,13 +305,13 @@ static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) return -ENOENT; memset(&irq, 0, sizeof(irq)); - irq.dest_id = kvm_apic_id(vcpu->arch.apic); + irq.shorthand = APIC_DEST_SELF; irq.dest_mode = APIC_DEST_PHYSICAL; irq.delivery_mode = APIC_DM_FIXED; irq.vector = vector; irq.level = 1; - ret = kvm_irq_delivery_to_apic(vcpu->kvm, NULL, &irq, NULL); + ret = kvm_irq_delivery_to_apic(vcpu->kvm, vcpu->arch.apic, &irq, NULL); trace_kvm_hv_synic_set_irq(vcpu->vcpu_id, sint, irq.vector, ret); return ret; } diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 7cc2360f1848..73ea24d4f119 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -598,14 +598,14 @@ static const struct kvm_io_device_ops picdev_eclr_ops = { .write = picdev_eclr_write, }; -struct kvm_pic *kvm_create_pic(struct kvm *kvm) +int kvm_pic_init(struct kvm *kvm) { struct kvm_pic *s; int ret; s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); if (!s) - return NULL; + return -ENOMEM; spin_lock_init(&s->lock); s->kvm = kvm; s->pics[0].elcr_mask = 0xf8; @@ -635,7 +635,9 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) mutex_unlock(&kvm->slots_lock); - return s; + kvm->arch.vpic = s; + + return 0; fail_unreg_1: kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_slave); @@ -648,13 +650,17 @@ fail_unlock: kfree(s); 
- return NULL; + return ret; } -void kvm_destroy_pic(struct kvm_pic *vpic) +void kvm_pic_destroy(struct kvm *kvm) { + struct kvm_pic *vpic = kvm->arch.vpic; + kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr); + + kvm->arch.vpic = NULL; kfree(vpic); } diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 035731eb3897..40d5b2cf6061 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -73,8 +73,8 @@ struct kvm_pic { unsigned long irq_states[PIC_NUM_PINS]; }; -struct kvm_pic *kvm_create_pic(struct kvm *kvm); -void kvm_destroy_pic(struct kvm_pic *vpic); +int kvm_pic_init(struct kvm *kvm); +void kvm_pic_destroy(struct kvm *kvm); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); @@ -93,18 +93,19 @@ static inline int pic_in_kernel(struct kvm *kvm) static inline int irqchip_split(struct kvm *kvm) { - return kvm->arch.irqchip_split; + return kvm->arch.irqchip_mode == KVM_IRQCHIP_SPLIT; } -static inline int irqchip_in_kernel(struct kvm *kvm) +static inline int irqchip_kernel(struct kvm *kvm) { - struct kvm_pic *vpic = pic_irqchip(kvm); - bool ret; + return kvm->arch.irqchip_mode == KVM_IRQCHIP_KERNEL; +} - ret = (vpic != NULL); - ret |= irqchip_split(kvm); +static inline int irqchip_in_kernel(struct kvm *kvm) +{ + bool ret = kvm->arch.irqchip_mode != KVM_IRQCHIP_NONE; - /* Read vpic before kvm->irq_routing. */ + /* Matches with wmb after initializing kvm->irq_routing. */ smp_rmb(); return ret; } diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 6c0191615f23..b96d3893f121 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -41,15 +41,6 @@ static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, bool line_status) { struct kvm_pic *pic = pic_irqchip(kvm); - - /* - * XXX: rejecting pic routes when pic isn't in use would be better, - * but the default routing table is installed while kvm->arch.vpic is - * NULL and KVM_CREATE_IRQCHIP can race with KVM_IRQ_LINE. 
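With kvm->arch.irqchip_mode replacing the old pair of signals (a non-NULL vpic pointer for the full in-kernel model, irqchip_split for the split one), each predicate in irq.h collapses to a single comparison against one three-valued state. Here is a stand-alone sketch of that state machine; only the enum comes from the patch, while the stub struct and the *_mode/_configured names are illustrative:

#include <stdbool.h>

enum kvm_irqchip_mode {
	KVM_IRQCHIP_NONE,
	KVM_IRQCHIP_KERNEL,	/* created with KVM_CREATE_IRQCHIP */
	KVM_IRQCHIP_SPLIT,	/* created with KVM_CAP_SPLIT_IRQCHIP */
};

/* Illustrative stand-in for the irqchip_mode field of struct kvm_arch. */
struct arch_stub {
	enum kvm_irqchip_mode irqchip_mode;
};

static bool irqchip_split_mode(const struct arch_stub *a)
{
	return a->irqchip_mode == KVM_IRQCHIP_SPLIT;
}

static bool irqchip_kernel_mode(const struct arch_stub *a)
{
	return a->irqchip_mode == KVM_IRQCHIP_KERNEL;
}

static bool irqchip_configured(const struct arch_stub *a)
{
	/* The kernel version adds an smp_rmb() after this load, pairing
	 * with the smp_wmb() issued once kvm->irq_routing is set up. */
	return a->irqchip_mode != KVM_IRQCHIP_NONE;
}

This single state is also what lets KVM_GET_IRQCHIP/KVM_SET_IRQCHIP later in the series test irqchip_kernel() directly instead of the error-prone "in kernel but not split" double test.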
- */ - if (!pic) - return -1; - return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level); } @@ -58,10 +49,6 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, bool line_status) { struct kvm_ioapic *ioapic = kvm->arch.vioapic; - - if (!ioapic) - return -1; - return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level, line_status); } @@ -297,16 +284,20 @@ int kvm_set_routing_entry(struct kvm *kvm, case KVM_IRQ_ROUTING_IRQCHIP: delta = 0; switch (ue->u.irqchip.irqchip) { - case KVM_IRQCHIP_PIC_MASTER: - e->set = kvm_set_pic_irq; - max_pin = PIC_NUM_PINS; - break; case KVM_IRQCHIP_PIC_SLAVE: + delta = 8; + /* fall through */ + case KVM_IRQCHIP_PIC_MASTER: + if (!pic_in_kernel(kvm)) + goto out; + e->set = kvm_set_pic_irq; max_pin = PIC_NUM_PINS; - delta = 8; break; case KVM_IRQCHIP_IOAPIC: + if (!ioapic_in_kernel(kvm)) + goto out; + max_pin = KVM_IOAPIC_NUM_PINS; e->set = kvm_set_ioapic_irq; break; @@ -409,7 +400,7 @@ int kvm_setup_empty_irq_routing(struct kvm *kvm) void kvm_arch_post_irq_routing_update(struct kvm *kvm) { - if (ioapic_in_kernel(kvm) || !irqchip_in_kernel(kvm)) + if (!irqchip_split(kvm)) return; kvm_make_scan_ioapic_request(kvm); } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 2f6ef5121a4c..33b799fd3a6e 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -115,6 +115,16 @@ static inline int apic_enabled(struct kvm_lapic *apic) (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) +static inline u8 kvm_xapic_id(struct kvm_lapic *apic) +{ + return kvm_lapic_get_reg(apic, APIC_ID) >> 24; +} + +static inline u32 kvm_x2apic_id(struct kvm_lapic *apic) +{ + return apic->vcpu->vcpu_id; +} + static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) { switch (map->mode) { @@ -159,13 +169,13 @@ static void recalculate_apic_map(struct kvm *kvm) struct kvm_apic_map *new, *old = NULL; struct kvm_vcpu *vcpu; int i; - u32 max_id = 255; + u32 max_id = 255; /* enough space for any xAPIC ID */ mutex_lock(&kvm->arch.apic_map_lock); kvm_for_each_vcpu(i, vcpu, kvm) if (kvm_apic_present(vcpu)) - max_id = max(max_id, kvm_apic_id(vcpu->arch.apic)); + max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic)); new = kvm_kvzalloc(sizeof(struct kvm_apic_map) + sizeof(struct kvm_lapic *) * ((u64)max_id + 1)); @@ -179,16 +189,28 @@ static void recalculate_apic_map(struct kvm *kvm) struct kvm_lapic *apic = vcpu->arch.apic; struct kvm_lapic **cluster; u16 mask; - u32 ldr, aid; + u32 ldr; + u8 xapic_id; + u32 x2apic_id; if (!kvm_apic_present(vcpu)) continue; - aid = kvm_apic_id(apic); - ldr = kvm_lapic_get_reg(apic, APIC_LDR); + xapic_id = kvm_xapic_id(apic); + x2apic_id = kvm_x2apic_id(apic); - if (aid <= new->max_apic_id) - new->phys_map[aid] = apic; + /* Hotplug hack: see kvm_apic_match_physical_addr(), ... */ + if ((apic_x2apic_mode(apic) || x2apic_id > 0xff) && + x2apic_id <= new->max_apic_id) + new->phys_map[x2apic_id] = apic; + /* + * ... xAPIC ID of VCPUs with APIC ID > 0xff will wrap-around, + * prevent them from masking VCPUs with APIC ID <= 0xff. 
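The recalculate_apic_map() hunk above is easiest to check with concrete numbers. Below is a minimal user-space model of the new population rule; struct vapic, phys_map and the helper names are stand-ins for illustration only (the kernel additionally bounds the x2APIC ID by max_apic_id and visits VCPUs in ID order):

#include <stdbool.h>
#include <stdio.h>

#define MAP_SIZE 512

struct vapic {			/* minimal model of one VCPU's APIC state */
	unsigned int vcpu_id;	/* equals the x2APIC ID by construction */
	bool x2apic_mode;
};

static const struct vapic *phys_map[MAP_SIZE];

static unsigned char xapic_id(const struct vapic *a)
{
	return (unsigned char)a->vcpu_id;	/* the 8-bit xAPIC ID wraps */
}

static void map_one(const struct vapic *a)
{
	/* Hotplug hack: IDs above 0xff always claim their x2APIC slot... */
	if (a->x2apic_mode || a->vcpu_id > 0xff)
		phys_map[a->vcpu_id] = a;
	/* ...and the wrapped xAPIC alias must not mask a low-ID VCPU. */
	if (!a->x2apic_mode && !phys_map[xapic_id(a)])
		phys_map[xapic_id(a)] = a;
}

int main(void)
{
	const struct vapic low = { .vcpu_id = 1,   .x2apic_mode = false };
	const struct vapic hot = { .vcpu_id = 257, .x2apic_mode = false };

	map_one(&low);
	map_one(&hot);	/* 257 & 0xff == 1, but slot 1 is already taken */
	printf("slot 1 -> vcpu %u, slot 257 -> vcpu %u\n",
	       phys_map[1]->vcpu_id, phys_map[257]->vcpu_id);
	return 0;
}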
+ */ + if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id]) + new->phys_map[xapic_id] = apic; + + ldr = kvm_lapic_get_reg(apic, APIC_LDR); if (apic_x2apic_mode(apic)) { new->mode |= KVM_APIC_MODE_X2APIC; @@ -250,6 +272,8 @@ static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id) { u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); + WARN_ON_ONCE(id != apic->vcpu->vcpu_id); + kvm_lapic_set_reg(apic, APIC_ID, id); kvm_lapic_set_reg(apic, APIC_LDR, ldr); recalculate_apic_map(apic->vcpu->kvm); @@ -546,7 +570,15 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); } -static void apic_update_ppr(struct kvm_lapic *apic) +static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr) +{ + int highest_irr = apic_find_highest_irr(apic); + if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr) + return -1; + return highest_irr; +} + +static bool __apic_update_ppr(struct kvm_lapic *apic, u32 *new_ppr) { u32 tpr, isrv, ppr, old_ppr; int isr; @@ -564,12 +596,27 @@ static void apic_update_ppr(struct kvm_lapic *apic) apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x", apic, ppr, isr, isrv); - if (old_ppr != ppr) { + *new_ppr = ppr; + if (old_ppr != ppr) kvm_lapic_set_reg(apic, APIC_PROCPRI, ppr); - if (ppr < old_ppr) - kvm_make_request(KVM_REQ_EVENT, apic->vcpu); - } + + return ppr < old_ppr; +} + +static void apic_update_ppr(struct kvm_lapic *apic) +{ + u32 ppr; + + if (__apic_update_ppr(apic, &ppr) && + apic_has_interrupt_for_ppr(apic, ppr) != -1) + kvm_make_request(KVM_REQ_EVENT, apic->vcpu); +} + +void kvm_apic_update_ppr(struct kvm_vcpu *vcpu) +{ + apic_update_ppr(vcpu->arch.apic); } +EXPORT_SYMBOL_GPL(kvm_apic_update_ppr); static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) { @@ -579,10 +626,8 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 mda) { - if (apic_x2apic_mode(apic)) - return mda == X2APIC_BROADCAST; - - return GET_APIC_DEST_FIELD(mda) == APIC_BROADCAST; + return mda == (apic_x2apic_mode(apic) ? + X2APIC_BROADCAST : APIC_BROADCAST); } static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda) @@ -591,9 +636,18 @@ static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda) return true; if (apic_x2apic_mode(apic)) - return mda == kvm_apic_id(apic); + return mda == kvm_x2apic_id(apic); + + /* + * Hotplug hack: Make LAPIC in xAPIC mode also accept interrupts as if + * it were in x2APIC mode. Hotplugged VCPUs start in xAPIC mode and + * this allows unique addressing of VCPUs with APIC ID over 0xff. + * The 0xff condition is needed because writeable xAPIC ID. + */ + if (kvm_x2apic_id(apic) > 0xff && mda == kvm_x2apic_id(apic)) + return true; - return mda == SET_APIC_DEST_FIELD(kvm_apic_id(apic)); + return mda == kvm_xapic_id(apic); } static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) @@ -610,7 +664,6 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) && (logical_id & mda & 0xffff) != 0; logical_id = GET_APIC_LOGICAL_ID(logical_id); - mda = GET_APIC_DEST_FIELD(mda); switch (kvm_lapic_get_reg(apic, APIC_DFR)) { case APIC_DFR_FLAT: @@ -627,9 +680,9 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) /* The KVM local APIC implementation has two quirks: * - * - the xAPIC MDA stores the destination at bits 24-31, while this - * is not true of struct kvm_lapic_irq's dest_id field. 
This is - * just a quirk in the API and is not problematic. + * - Real hardware delivers interrupts destined to x2APIC ID > 0xff to LAPICs + * in xAPIC mode if the "destination & 0xff" matches its xAPIC ID. + * KVM doesn't do that aliasing. * * - in-kernel IOAPIC messages have to be delivered directly to * x2APIC, because the kernel does not support interrupt remapping. @@ -645,13 +698,12 @@ static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id, struct kvm_lapic *source, struct kvm_lapic *target) { bool ipi = source != NULL; - bool x2apic_mda = apic_x2apic_mode(ipi ? source : target); if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled && - !ipi && dest_id == APIC_BROADCAST && x2apic_mda) + !ipi && dest_id == APIC_BROADCAST && apic_x2apic_mode(target)) return X2APIC_BROADCAST; - return x2apic_mda ? dest_id : SET_APIC_DEST_FIELD(dest_id); + return dest_id; } bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, @@ -1907,9 +1959,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.apic_arb_prio = 0; vcpu->arch.apic_attention = 0; - apic_debug("%s: vcpu=%p, id=%d, base_msr=" + apic_debug("%s: vcpu=%p, id=0x%x, base_msr=" "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__, - vcpu, kvm_apic_id(apic), + vcpu, kvm_lapic_get_reg(apic, APIC_ID), vcpu->arch.apic_base, apic->base_address); } @@ -2021,17 +2073,13 @@ nomem: int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - int highest_irr; + u32 ppr; if (!apic_enabled(apic)) return -1; - apic_update_ppr(apic); - highest_irr = apic_find_highest_irr(apic); - if ((highest_irr == -1) || - ((highest_irr & 0xF0) <= kvm_lapic_get_reg(apic, APIC_PROCPRI))) - return -1; - return highest_irr; + __apic_update_ppr(apic, &ppr); + return apic_has_interrupt_for_ppr(apic, ppr); } int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) @@ -2067,6 +2115,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) { int vector = kvm_apic_has_interrupt(vcpu); struct kvm_lapic *apic = vcpu->arch.apic; + u32 ppr; if (vector == -1) return -1; @@ -2078,13 +2127,23 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) * because the process would deliver it through the IDT. */ - apic_set_isr(vector, apic); - apic_update_ppr(apic); apic_clear_irr(vector, apic); - if (test_bit(vector, vcpu_to_synic(vcpu)->auto_eoi_bitmap)) { - apic_clear_isr(vector, apic); + /* + * For auto-EOI interrupts, there might be another pending + * interrupt above PPR, so check whether to raise another + * KVM_REQ_EVENT. + */ apic_update_ppr(apic); + } else { + /* + * For normal interrupts, PPR has been raised and there cannot + * be a higher-priority pending interrupt---except if there was + * a concurrent interrupt injection, but that would have + * triggered KVM_REQ_EVENT already. 
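Since the refactoring above splits PPR recalculation (__apic_update_ppr) from deliverability checking (apic_has_interrupt_for_ppr), the underlying arithmetic is worth restating with numbers. A self-contained sketch: compute_ppr follows the x86 SDM rule implemented by the unchanged body of __apic_update_ppr, and interrupt_for_ppr mirrors the new helper:

#include <stdio.h>

/*
 * PPR is TPR if TPR's priority class (bits 7:4) is at least that of the
 * highest in-service vector, else it is that vector's class.
 */
static unsigned int compute_ppr(unsigned int tpr, int highest_isr)
{
	unsigned int isrv = (highest_isr >= 0) ? (unsigned int)highest_isr : 0;

	return ((tpr & 0xf0) >= (isrv & 0xf0)) ? (tpr & 0xff) : (isrv & 0xf0);
}

/* Mirrors apic_has_interrupt_for_ppr(): -1 means nothing deliverable. */
static int interrupt_for_ppr(int highest_irr, unsigned int ppr)
{
	if (highest_irr < 0 || ((unsigned int)highest_irr & 0xf0) <= ppr)
		return -1;
	return highest_irr;
}

int main(void)
{
	/* TPR class 3, vector 0x51 in service: PPR becomes 0x50. */
	unsigned int ppr = compute_ppr(0x30, 0x51);

	/* Pending vector 0x62 outranks class 5, so it is deliverable. */
	printf("ppr=0x%02x vector=%d\n", ppr, interrupt_for_ppr(0x62, ppr));
	return 0;
}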
+ */ + apic_set_isr(vector, apic); + __apic_update_ppr(apic, &ppr); } return vector; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index ff8039d61672..05abd837b78a 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -73,6 +73,7 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, void __kvm_apic_update_irr(u32 *pir, void *regs); void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); +void kvm_apic_update_ppr(struct kvm_vcpu *vcpu); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, struct dest_map *dest_map); int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); @@ -203,17 +204,6 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); } -static inline u32 kvm_apic_id(struct kvm_lapic *apic) -{ - /* To avoid a race between apic_base and following APIC_ID update when - * switching to x2apic_mode, the x2apic mode returns initial x2apic id. - */ - if (apic_x2apic_mode(apic)) - return apic->vcpu->vcpu_id; - - return kvm_lapic_get_reg(apic, APIC_ID) >> 24; -} - bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); void wait_lapic_expire(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7012de4a1fed..2fd7586aad4d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -37,6 +37,8 @@ #include <linux/srcu.h> #include <linux/slab.h> #include <linux/uaccess.h> +#include <linux/hash.h> +#include <linux/kern_levels.h> #include <asm/page.h> #include <asm/cmpxchg.h> @@ -129,6 +131,10 @@ module_param(dbg, bool, 0644); #define ACC_USER_MASK PT_USER_MASK #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) +/* The mask for the R/X bits in EPT PTEs */ +#define PT64_EPT_READABLE_MASK 0x1ull +#define PT64_EPT_EXECUTABLE_MASK 0x4ull + #include <trace/events/kvm.h> #define CREATE_TRACE_POINTS @@ -178,15 +184,40 @@ static u64 __read_mostly shadow_dirty_mask; static u64 __read_mostly shadow_mmio_mask; static u64 __read_mostly shadow_present_mask; +/* + * The mask/value to distinguish a PTE that has been marked not-present for + * access tracking purposes. + * The mask would be either 0 if access tracking is disabled, or + * SPTE_SPECIAL_MASK|VMX_EPT_RWX_MASK if access tracking is enabled. + */ +static u64 __read_mostly shadow_acc_track_mask; +static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK; + +/* + * The mask/shift to use for saving the original R/X bits when marking the PTE + * as not-present for access tracking purposes. We do not save the W bit as the + * PTEs being access tracked also need to be dirty tracked, so the W bit will be + * restored only when a write is attempted to the page. + */ +static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK | + PT64_EPT_EXECUTABLE_MASK; +static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT; + static void mmu_spte_set(u64 *sptep, u64 spte); static void mmu_free_roots(struct kvm_vcpu *vcpu); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) { - shadow_mmio_mask = mmio_mask; + shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); +static inline bool is_access_track_spte(u64 spte) +{ + /* Always false if shadow_acc_track_mask is zero. */ + return (spte & shadow_acc_track_mask) == shadow_acc_track_value; +} + /* * the low bit of the generation number is always presumed to be zero. * This disables mmio caching during memslot updates. 
The concept is @@ -284,17 +315,35 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) } void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask) + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, + u64 acc_track_mask) { + if (acc_track_mask != 0) + acc_track_mask |= SPTE_SPECIAL_MASK; + shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; shadow_dirty_mask = dirty_mask; shadow_nx_mask = nx_mask; shadow_x_mask = x_mask; shadow_present_mask = p_mask; + shadow_acc_track_mask = acc_track_mask; + WARN_ON(shadow_accessed_mask != 0 && shadow_acc_track_mask != 0); } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); +void kvm_mmu_clear_all_pte_masks(void) +{ + shadow_user_mask = 0; + shadow_accessed_mask = 0; + shadow_dirty_mask = 0; + shadow_nx_mask = 0; + shadow_x_mask = 0; + shadow_mmio_mask = 0; + shadow_present_mask = 0; + shadow_acc_track_mask = 0; +} + static int is_cpuid_PSE36(void) { return 1; @@ -307,7 +356,7 @@ static int is_nx(struct kvm_vcpu *vcpu) static int is_shadow_present_pte(u64 pte) { - return (pte & 0xFFFFFFFFull) && !is_mmio_spte(pte); + return (pte != 0) && !is_mmio_spte(pte); } static int is_large_pte(u64 pte) @@ -324,6 +373,11 @@ static int is_last_spte(u64 pte, int level) return 0; } +static bool is_executable_pte(u64 spte) +{ + return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask; +} + static kvm_pfn_t spte_to_pfn(u64 pte) { return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; @@ -473,7 +527,7 @@ retry: } #endif -static bool spte_is_locklessly_modifiable(u64 spte) +static bool spte_can_locklessly_be_made_writable(u64 spte) { return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) == (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE); @@ -481,36 +535,38 @@ static bool spte_is_locklessly_modifiable(u64 spte) static bool spte_has_volatile_bits(u64 spte) { + if (!is_shadow_present_pte(spte)) + return false; + /* * Always atomically update spte if it can be updated * out of mmu-lock, it can ensure dirty bit is not lost, * also, it can help us to get a stable is_writable_pte() * to ensure tlb flush is not missed. */ - if (spte_is_locklessly_modifiable(spte)) + if (spte_can_locklessly_be_made_writable(spte) || + is_access_track_spte(spte)) return true; - if (!shadow_accessed_mask) - return false; - - if (!is_shadow_present_pte(spte)) - return false; - - if ((spte & shadow_accessed_mask) && - (!is_writable_pte(spte) || (spte & shadow_dirty_mask))) - return false; + if (shadow_accessed_mask) { + if ((spte & shadow_accessed_mask) == 0 || + (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0)) + return true; + } - return true; + return false; } -static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask) +static bool is_accessed_spte(u64 spte) { - return (old_spte & bit_mask) && !(new_spte & bit_mask); + return shadow_accessed_mask ? spte & shadow_accessed_mask + : !is_access_track_spte(spte); } -static bool spte_is_bit_changed(u64 old_spte, u64 new_spte, u64 bit_mask) +static bool is_dirty_spte(u64 spte) { - return (old_spte & bit_mask) != (new_spte & bit_mask); + return shadow_dirty_mask ? spte & shadow_dirty_mask + : spte & PT_WRITABLE_MASK; } /* Rules for using mmu_spte_set: @@ -525,25 +581,19 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte) __set_spte(sptep, new_spte); } -/* Rules for using mmu_spte_update: - * Update the state bits, it means the mapped pfn is not changed. 
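The new is_accessed_spte()/is_dirty_spte() helpers above fold the no-hardware-A/D-bits case into single predicates. A compressed model of the idea; the mask constants here are placeholders standing in for the values normally handed to kvm_mmu_set_mask_ptes():

#include <stdbool.h>
#include <stdint.h>

#define PT_WRITABLE	(1ULL << 1)		/* stand-in for PT_WRITABLE_MASK */
#define ACC_TRACK_MASK	((1ULL << 62) | 0x7ULL)	/* special bit | RWX, EPT case */
#define ACC_TRACK_VALUE	(1ULL << 62)

/* Both masks are zero when the hardware provides no A/D bits. */
static uint64_t accessed_mask;
static uint64_t dirty_mask;

static bool is_access_track(uint64_t spte)
{
	return (spte & ACC_TRACK_MASK) == ACC_TRACK_VALUE;
}

static bool is_accessed(uint64_t spte)
{
	/* No A bit: any SPTE not yet converted into an access-track SPTE
	 * must have been reachable through the TLB, so count it as accessed. */
	return accessed_mask ? (spte & accessed_mask) != 0
			     : !is_access_track(spte);
}

static bool is_dirty(uint64_t spte)
{
	/* No D bit: writability is the proxy, because write access is only
	 * granted once the pfn has been marked dirty. */
	return (dirty_mask ? (spte & dirty_mask)
			   : (spte & PT_WRITABLE)) != 0;
}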
- * - * Whenever we overwrite a writable spte with a read-only one we - * should flush remote TLBs. Otherwise rmap_write_protect - * will find a read-only spte, even though the writable spte - * might be cached on a CPU's TLB, the return value indicates this - * case. +/* + * Update the SPTE (excluding the PFN), but do not track changes in its + * accessed/dirty status. */ -static bool mmu_spte_update(u64 *sptep, u64 new_spte) +static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) { u64 old_spte = *sptep; - bool ret = false; WARN_ON(!is_shadow_present_pte(new_spte)); if (!is_shadow_present_pte(old_spte)) { mmu_spte_set(sptep, new_spte); - return ret; + return old_spte; } if (!spte_has_volatile_bits(old_spte)) @@ -551,45 +601,62 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) else old_spte = __update_clear_spte_slow(sptep, new_spte); + WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte)); + + return old_spte; +} + +/* Rules for using mmu_spte_update: + * Update the state bits, it means the mapped pfn is not changed. + * + * Whenever we overwrite a writable spte with a read-only one we + * should flush remote TLBs. Otherwise rmap_write_protect + * will find a read-only spte, even though the writable spte + * might be cached on a CPU's TLB, the return value indicates this + * case. + * + * Returns true if the TLB needs to be flushed + */ +static bool mmu_spte_update(u64 *sptep, u64 new_spte) +{ + bool flush = false; + u64 old_spte = mmu_spte_update_no_track(sptep, new_spte); + + if (!is_shadow_present_pte(old_spte)) + return false; + /* * For the spte updated out of mmu-lock is safe, since * we always atomically update it, see the comments in * spte_has_volatile_bits(). */ - if (spte_is_locklessly_modifiable(old_spte) && + if (spte_can_locklessly_be_made_writable(old_spte) && !is_writable_pte(new_spte)) - ret = true; - - if (!shadow_accessed_mask) { - /* - * We don't set page dirty when dropping non-writable spte. - * So do it now if the new spte is becoming non-writable. - */ - if (ret) - kvm_set_pfn_dirty(spte_to_pfn(old_spte)); - return ret; - } + flush = true; /* - * Flush TLB when accessed/dirty bits are changed in the page tables, + * Flush TLB when accessed/dirty states are changed in the page tables, * to guarantee consistency between TLB and page tables. */ - if (spte_is_bit_changed(old_spte, new_spte, - shadow_accessed_mask | shadow_dirty_mask)) - ret = true; - if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) + if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) { + flush = true; kvm_set_pfn_accessed(spte_to_pfn(old_spte)); - if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) + } + + if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) { + flush = true; kvm_set_pfn_dirty(spte_to_pfn(old_spte)); + } - return ret; + return flush; } /* * Rules for using mmu_spte_clear_track_bits: * It sets the sptep from present to nonpresent, and track the * state bits, it is used to clear the last level sptep. + * Returns non-zero if the PTE was previously valid. */ static int mmu_spte_clear_track_bits(u64 *sptep) { @@ -613,11 +680,12 @@ static int mmu_spte_clear_track_bits(u64 *sptep) */ WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn))); - if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) + if (is_accessed_spte(old_spte)) kvm_set_pfn_accessed(pfn); - if (old_spte & (shadow_dirty_mask ? 
shadow_dirty_mask : - PT_WRITABLE_MASK)) + + if (is_dirty_spte(old_spte)) kvm_set_pfn_dirty(pfn); + return 1; } @@ -636,6 +704,78 @@ static u64 mmu_spte_get_lockless(u64 *sptep) return __get_spte_lockless(sptep); } +static u64 mark_spte_for_access_track(u64 spte) +{ + if (shadow_accessed_mask != 0) + return spte & ~shadow_accessed_mask; + + if (shadow_acc_track_mask == 0 || is_access_track_spte(spte)) + return spte; + + /* + * Making an Access Tracking PTE will result in removal of write access + * from the PTE. So, verify that we will be able to restore the write + * access in the fast page fault path later on. + */ + WARN_ONCE((spte & PT_WRITABLE_MASK) && + !spte_can_locklessly_be_made_writable(spte), + "kvm: Writable SPTE is not locklessly dirty-trackable\n"); + + WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask << + shadow_acc_track_saved_bits_shift), + "kvm: Access Tracking saved bit locations are not zero\n"); + + spte |= (spte & shadow_acc_track_saved_bits_mask) << + shadow_acc_track_saved_bits_shift; + spte &= ~shadow_acc_track_mask; + spte |= shadow_acc_track_value; + + return spte; +} + +/* Restore an acc-track PTE back to a regular PTE */ +static u64 restore_acc_track_spte(u64 spte) +{ + u64 new_spte = spte; + u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift) + & shadow_acc_track_saved_bits_mask; + + WARN_ON_ONCE(!is_access_track_spte(spte)); + + new_spte &= ~shadow_acc_track_mask; + new_spte &= ~(shadow_acc_track_saved_bits_mask << + shadow_acc_track_saved_bits_shift); + new_spte |= saved_bits; + + return new_spte; +} + +/* Returns the Accessed status of the PTE and resets it at the same time. */ +static bool mmu_spte_age(u64 *sptep) +{ + u64 spte = mmu_spte_get_lockless(sptep); + + if (!is_accessed_spte(spte)) + return false; + + if (shadow_accessed_mask) { + clear_bit((ffs(shadow_accessed_mask) - 1), + (unsigned long *)sptep); + } else { + /* + * Capture the dirty status of the page, so that it doesn't get + * lost when the SPTE is marked for access tracking. 
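mark_spte_for_access_track() and restore_acc_track_spte() above round-trip the R/X permissions through the PTE's software-available bits. A stand-alone sketch of that round trip with EPT-style constants; the shift of 52 is assumed here to correspond to PT64_SECOND_AVAIL_BITS_SHIFT, which the hunk references but does not show:

#include <stdint.h>
#include <stdio.h>

#define EPT_R		0x1ULL
#define EPT_W		0x2ULL
#define EPT_X		0x4ULL
#define SPTE_SPECIAL	(1ULL << 62)
#define ACC_TRACK_MASK	(SPTE_SPECIAL | EPT_R | EPT_W | EPT_X)
#define ACC_TRACK_VALUE	SPTE_SPECIAL
#define SAVED_MASK	(EPT_R | EPT_X)	/* W is deliberately not saved */
#define SAVED_SHIFT	52		/* assumed PT64_SECOND_AVAIL_BITS_SHIFT */

static uint64_t mark_for_access_track(uint64_t spte)
{
	spte |= (spte & SAVED_MASK) << SAVED_SHIFT;	/* stash R and X */
	spte &= ~ACC_TRACK_MASK;			/* revoke R/W/X */
	return spte | ACC_TRACK_VALUE;			/* tag as tracked */
}

static uint64_t restore_access_track(uint64_t spte)
{
	uint64_t saved = (spte >> SAVED_SHIFT) & SAVED_MASK;

	spte &= ~(ACC_TRACK_MASK | (SAVED_MASK << SAVED_SHIFT));
	return spte | saved;
}

int main(void)
{
	uint64_t pte = 0xabc000ULL | EPT_R | EPT_X;	/* R+X, write-protected */
	uint64_t tracked = mark_for_access_track(pte);

	printf("tracked:  %#llx\n", (unsigned long long)tracked);
	printf("restored: %#llx (round trip ok: %d)\n",
	       (unsigned long long)restore_access_track(tracked),
	       restore_access_track(tracked) == pte);
	return 0;
}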
+ */ + if (is_writable_pte(spte)) + kvm_set_pfn_dirty(spte_to_pfn(spte)); + + spte = mark_spte_for_access_track(spte); + mmu_spte_update_no_track(sptep, spte); + } + + return true; +} + static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) { /* @@ -1212,7 +1352,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect) u64 spte = *sptep; if (!is_writable_pte(spte) && - !(pt_protect && spte_is_locklessly_modifiable(spte))) + !(pt_protect && spte_can_locklessly_be_made_writable(spte))) return false; rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); @@ -1420,7 +1560,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, restart: for_each_rmap_spte(rmap_head, &iter, sptep) { rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n", - sptep, *sptep, gfn, level); + sptep, *sptep, gfn, level); need_flush = 1; @@ -1433,7 +1573,8 @@ restart: new_spte &= ~PT_WRITABLE_MASK; new_spte &= ~SPTE_HOST_WRITEABLE; - new_spte &= ~shadow_accessed_mask; + + new_spte = mark_spte_for_access_track(new_spte); mmu_spte_clear_track_bits(sptep); mmu_spte_set(sptep, new_spte); @@ -1595,15 +1736,8 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct rmap_iterator uninitialized_var(iter); int young = 0; - BUG_ON(!shadow_accessed_mask); - - for_each_rmap_spte(rmap_head, &iter, sptep) { - if (*sptep & shadow_accessed_mask) { - young = 1; - clear_bit((ffs(shadow_accessed_mask) - 1), - (unsigned long *)sptep); - } - } + for_each_rmap_spte(rmap_head, &iter, sptep) + young |= mmu_spte_age(sptep); trace_kvm_age_page(gfn, level, slot, young); return young; @@ -1615,24 +1749,20 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, { u64 *sptep; struct rmap_iterator iter; - int young = 0; /* - * If there's no access bit in the secondary pte set by the - * hardware it's up to gup-fast/gup to set the access bit in - * the primary pte or in the page structure. + * If there's no access bit in the secondary pte set by the hardware and + * fast access tracking is also not enabled, it's up to gup-fast/gup to + * set the access bit in the primary pte or in the page structure. */ - if (!shadow_accessed_mask) + if (!shadow_accessed_mask && !shadow_acc_track_mask) goto out; - for_each_rmap_spte(rmap_head, &iter, sptep) { - if (*sptep & shadow_accessed_mask) { - young = 1; - break; - } - } + for_each_rmap_spte(rmap_head, &iter, sptep) + if (is_accessed_spte(*sptep)) + return 1; out: - return young; + return 0; } #define RMAP_RECYCLE_THRESHOLD 1000 @@ -1660,7 +1790,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) * This has some overhead, but not as much as the cost of swapping * out actively used pages or breaking up actively used hugepages. */ - if (!shadow_accessed_mask) + if (!shadow_accessed_mask && !shadow_acc_track_mask) return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); @@ -1713,7 +1843,7 @@ static void kvm_mmu_free_page(struct kvm_mmu_page *sp) static unsigned kvm_page_table_hashfn(gfn_t gfn) { - return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1); + return hash_64(gfn, KVM_MMU_HASH_SHIFT); } static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, @@ -1904,17 +2034,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, * since it has been deleted from active_mmu_pages but still can be found * at hast list. * - * for_each_gfn_valid_sp() has skipped that kind of pages. + * for_each_valid_sp() has skipped that kind of pages. 
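The kvm_page_table_hashfn() change above (low-bit masking replaced by hash_64()) matters because guest frame numbers are highly regular, so gfns that are a multiple of the table size apart used to pile into the same bucket; the new max_mmu_page_hash_collisions stat exists to confirm exactly this. A quick demonstration, with GOLDEN_RATIO_64 taken from linux/hash.h:

#include <stdint.h>
#include <stdio.h>

#define KVM_MMU_HASH_SHIFT 12
#define GOLDEN_RATIO_64 0x61C8864680B583EBull	/* multiplier from linux/hash.h */

/* New scheme: multiplicative hash, take the top bits of the product. */
static unsigned int new_hash(uint64_t gfn)
{
	return (unsigned int)((gfn * GOLDEN_RATIO_64) >>
			      (64 - KVM_MMU_HASH_SHIFT));
}

/* Old scheme: low bits only, so gfns 2^12 apart always collided. */
static unsigned int old_hash(uint64_t gfn)
{
	return (unsigned int)(gfn & ((1u << KVM_MMU_HASH_SHIFT) - 1));
}

int main(void)
{
	uint64_t a = 0x1000, b = a + (1u << KVM_MMU_HASH_SHIFT);

	printf("old: %u %u (collision)\n", old_hash(a), old_hash(b));
	printf("new: %u %u (spread out)\n", new_hash(a), new_hash(b));
	return 0;
}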
*/ -#define for_each_gfn_valid_sp(_kvm, _sp, _gfn) \ +#define for_each_valid_sp(_kvm, _sp, _gfn) \ hlist_for_each_entry(_sp, \ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ - if ((_sp)->gfn != (_gfn) || is_obsolete_sp((_kvm), (_sp)) \ - || (_sp)->role.invalid) {} else + if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) { \ + } else #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \ - for_each_gfn_valid_sp(_kvm, _sp, _gfn) \ - if ((_sp)->role.direct) {} else + for_each_valid_sp(_kvm, _sp, _gfn) \ + if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else /* @sp->gfn should be write-protected at the call site */ static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, @@ -2116,6 +2246,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp; bool need_sync = false; bool flush = false; + int collisions = 0; LIST_HEAD(invalid_list); role = vcpu->arch.mmu.base_role; @@ -2130,7 +2261,12 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } - for_each_gfn_valid_sp(vcpu->kvm, sp, gfn) { + for_each_valid_sp(vcpu->kvm, sp, gfn) { + if (sp->gfn != gfn) { + collisions++; + continue; + } + if (!need_sync && sp->unsync) need_sync = true; @@ -2153,7 +2289,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, __clear_sp_write_flooding_count(sp); trace_kvm_mmu_get_page(sp, false); - return sp; + goto out; } ++vcpu->kvm->stat.mmu_cache_miss; @@ -2183,6 +2319,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, trace_kvm_mmu_get_page(sp, true); kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); +out: + if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions) + vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions; return sp; } @@ -2583,6 +2722,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= shadow_dirty_mask; } + if (speculative) + spte = mark_spte_for_access_track(spte); + set_pte: if (mmu_spte_update(sptep, spte)) kvm_flush_remote_tlbs(vcpu->kvm); @@ -2636,7 +2778,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, pgprintk("%s: setting spte %llx\n", __func__, *sptep); pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", is_large_pte(*sptep)? "2MB" : "4kB", - *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, + *sptep & PT_WRITABLE_MASK ? "RW" : "R", gfn, *sptep, sptep); if (!was_rmapped && is_large_pte(*sptep)) ++vcpu->kvm->stat.lpages; @@ -2869,33 +3011,43 @@ static bool page_fault_can_be_fast(u32 error_code) if (unlikely(error_code & PFERR_RSVD_MASK)) return false; + /* See if the page fault is due to an NX violation */ + if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK)) + == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK)))) + return false; + /* - * #PF can be fast only if the shadow page table is present and it - * is caused by write-protect, that means we just need change the - * W bit of the spte which can be done out of mmu-lock. + * #PF can be fast if: + * 1. The shadow page table entry is not present, which could mean that + * the fault is potentially caused by access tracking (if enabled). + * 2. The shadow page table entry is present and the fault + * is caused by write-protect, that means we just need change the W + * bit of the spte which can be done out of mmu-lock. 
+ * + * However, if access tracking is disabled we know that a non-present + * page must be a genuine page fault where we have to create a new SPTE. + * So, if access tracking is disabled, we return true only for write + * accesses to a present page. */ - if (!(error_code & PFERR_PRESENT_MASK) || - !(error_code & PFERR_WRITE_MASK)) - return false; - return true; + return shadow_acc_track_mask != 0 || + ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK)) + == (PFERR_WRITE_MASK | PFERR_PRESENT_MASK)); } +/* + * Returns true if the SPTE was fixed successfully. Otherwise, + * someone else modified the SPTE from its original value. + */ static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *sptep, u64 spte) + u64 *sptep, u64 old_spte, u64 new_spte) { gfn_t gfn; WARN_ON(!sp->role.direct); /* - * The gfn of direct spte is stable since it is calculated - * by sp->gfn. - */ - gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); - - /* * Theoretically we could also set dirty bit (and flush TLB) here in * order to eliminate unnecessary PML logging. See comments in * set_spte. But fast_page_fault is very unlikely to happen with PML @@ -2907,12 +3059,33 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * * Compare with set_spte where instead shadow_dirty_mask is set. */ - if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) + if (cmpxchg64(sptep, old_spte, new_spte) != old_spte) + return false; + + if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) { + /* + * The gfn of direct spte is stable since it is + * calculated by sp->gfn. + */ + gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); kvm_vcpu_mark_page_dirty(vcpu, gfn); + } return true; } +static bool is_access_allowed(u32 fault_err_code, u64 spte) +{ + if (fault_err_code & PFERR_FETCH_MASK) + return is_executable_pte(spte); + + if (fault_err_code & PFERR_WRITE_MASK) + return is_writable_pte(spte); + + /* Fault was on Read access */ + return spte & PT_PRESENT_MASK; +} + /* * Return value: * - true: let the vcpu to access on the same address again. @@ -2923,8 +3096,9 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; - bool ret = false; + bool fault_handled = false; u64 spte = 0ull; + uint retry_count = 0; if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return false; @@ -2933,66 +3107,93 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, return false; walk_shadow_page_lockless_begin(vcpu); - for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) - if (!is_shadow_present_pte(spte) || iterator.level < level) + + do { + u64 new_spte; + + for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) + if (!is_shadow_present_pte(spte) || + iterator.level < level) + break; + + sp = page_header(__pa(iterator.sptep)); + if (!is_last_spte(spte, sp->role.level)) break; - /* - * If the mapping has been changed, let the vcpu fault on the - * same address again. - */ - if (!is_shadow_present_pte(spte)) { - ret = true; - goto exit; - } + /* + * Check whether the memory access that caused the fault would + * still cause it if it were to be performed right now. If not, + * then this is a spurious fault caused by TLB lazily flushed, + * or some other CPU has already fixed the PTE after the + * current CPU took the fault. + * + * Need not check the access of upper level table entries since + * they are always ACC_ALL. 
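The rewritten page_fault_can_be_fast() above encodes a small decision table over the #PF error code. The same table restated as a stand-alone function, using the standard x86 error-code bit positions (names shortened here):

#include <stdbool.h>
#include <stdio.h>

#define PFERR_PRESENT	(1u << 0)
#define PFERR_WRITE	(1u << 1)
#define PFERR_RSVD	(1u << 3)
#define PFERR_FETCH	(1u << 4)

static bool fault_can_be_fast(unsigned int ec, bool acc_track_enabled)
{
	if (ec & PFERR_RSVD)	/* reserved-bit faults flag MMIO SPTEs */
		return false;

	/* NX violation: the fast path never grants execute permission. */
	if ((ec & (PFERR_FETCH | PFERR_PRESENT)) ==
	    (PFERR_FETCH | PFERR_PRESENT))
		return false;

	/*
	 * A non-present fault may just be an access-track SPTE waiting to
	 * be restored; without access tracking, only write faults on
	 * present pages are fixable without mmu_lock.
	 */
	return acc_track_enabled ||
	       ((ec & (PFERR_WRITE | PFERR_PRESENT)) ==
		(PFERR_WRITE | PFERR_PRESENT));
}

int main(void)
{
	printf("w+p:%d np:%d np+track:%d\n",
	       fault_can_be_fast(PFERR_WRITE | PFERR_PRESENT, false),
	       fault_can_be_fast(0, false),
	       fault_can_be_fast(0, true));
	return 0;
}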
+ */ + if (is_access_allowed(error_code, spte)) { + fault_handled = true; + break; + } - sp = page_header(__pa(iterator.sptep)); - if (!is_last_spte(spte, sp->role.level)) - goto exit; + new_spte = spte; - /* - * Check if it is a spurious fault caused by TLB lazily flushed. - * - * Need not check the access of upper level table entries since - * they are always ACC_ALL. - */ - if (is_writable_pte(spte)) { - ret = true; - goto exit; - } + if (is_access_track_spte(spte)) + new_spte = restore_acc_track_spte(new_spte); - /* - * Currently, to simplify the code, only the spte write-protected - * by dirty-log can be fast fixed. - */ - if (!spte_is_locklessly_modifiable(spte)) - goto exit; + /* + * Currently, to simplify the code, write-protection can + * be removed in the fast path only if the SPTE was + * write-protected for dirty-logging or access tracking. + */ + if ((error_code & PFERR_WRITE_MASK) && + spte_can_locklessly_be_made_writable(spte)) + { + new_spte |= PT_WRITABLE_MASK; - /* - * Do not fix write-permission on the large spte since we only dirty - * the first page into the dirty-bitmap in fast_pf_fix_direct_spte() - * that means other pages are missed if its slot is dirty-logged. - * - * Instead, we let the slow page fault path create a normal spte to - * fix the access. - * - * See the comments in kvm_arch_commit_memory_region(). - */ - if (sp->role.level > PT_PAGE_TABLE_LEVEL) - goto exit; + /* + * Do not fix write-permission on the large spte. Since + * we only dirty the first page into the dirty-bitmap in + * fast_pf_fix_direct_spte(), other pages are missed + * if its slot has dirty logging enabled. + * + * Instead, we let the slow page fault path create a + * normal spte to fix the access. + * + * See the comments in kvm_arch_commit_memory_region(). + */ + if (sp->role.level > PT_PAGE_TABLE_LEVEL) + break; + } + + /* Verify that the fault can be handled in the fast path */ + if (new_spte == spte || + !is_access_allowed(error_code, new_spte)) + break; + + /* + * Currently, fast page fault only works for direct mapping + * since the gfn is not stable for indirect shadow page. See + * Documentation/virtual/kvm/locking.txt to get more detail. + */ + fault_handled = fast_pf_fix_direct_spte(vcpu, sp, + iterator.sptep, spte, + new_spte); + if (fault_handled) + break; + + if (++retry_count > 4) { + printk_once(KERN_WARNING + "kvm: Fast #PF retrying more than 4 times.\n"); + break; + } + + } while (true); - /* - * Currently, fast page fault only works for direct mapping since - * the gfn is not stable for indirect shadow page. - * See Documentation/virtual/kvm/locking.txt to get more detail. 
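fast_page_fault() now loops rather than giving up after a single attempt: read the SPTE locklessly, compute the fixed value, publish it with cmpxchg, and retry a bounded number of times if another CPU won the race. A skeleton of that pattern reduced to the write-protect case; C11 atomics stand in for the kernel's cmpxchg64, and the bound of 4 mirrors the patch's warning threshold:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SPTE_WRITABLE (1ULL << 1)

/* One lockless fix attempt; fails if another CPU changed the SPTE. */
static bool publish_spte(_Atomic uint64_t *sptep, uint64_t old, uint64_t new)
{
	return atomic_compare_exchange_strong(sptep, &old, new);
}

static bool fast_write_fault(_Atomic uint64_t *sptep)
{
	unsigned int retry = 0;

	do {
		uint64_t spte = atomic_load(sptep);

		if (spte & SPTE_WRITABLE)	/* spurious: already fixed */
			return true;
		if (publish_spte(sptep, spte, spte | SPTE_WRITABLE))
			return true;
	} while (++retry < 4);			/* bounded, as in the patch */

	return false;				/* fall back to the slow path */
}

int main(void)
{
	_Atomic uint64_t spte = 0xabc000;
	bool handled = fast_write_fault(&spte);

	printf("handled=%d spte=%#llx\n", handled,
	       (unsigned long long)atomic_load(&spte));
	return 0;
}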
- */ - ret = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte); -exit: trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep, - spte, ret); + spte, fault_handled); walk_shadow_page_lockless_end(vcpu); - return ret; + return fault_handled; } static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, @@ -5063,6 +5264,8 @@ static void mmu_destroy_caches(void) int kvm_mmu_module_init(void) { + kvm_mmu_clear_all_pte_masks(); + pte_list_desc_cache = kmem_cache_create("pte_list_desc", sizeof(struct pte_list_desc), 0, 0, NULL); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 08a4d3ab3455..d0414f054bdf 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4182,6 +4182,8 @@ static int handle_exit(struct kvm_vcpu *vcpu) trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM); + vcpu->arch.gpa_available = (exit_code == SVM_EXIT_NPF); + if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE)) vcpu->arch.cr0 = svm->vmcb->save.cr0; if (npt_enabled) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a236decb81e4..7c3e42623090 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4953,7 +4953,7 @@ static bool vmx_get_enable_apicv(void) return enable_apicv; } -static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) +static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); int max_irr; @@ -4964,19 +4964,15 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) vmx->nested.pi_pending) { vmx->nested.pi_pending = false; if (!pi_test_and_clear_on(vmx->nested.pi_desc)) - return 0; + return; max_irr = find_last_bit( (unsigned long *)vmx->nested.pi_desc->pir, 256); if (max_irr == 256) - return 0; + return; vapic_page = kmap(vmx->nested.virtual_apic_page); - if (!vapic_page) { - WARN_ON(1); - return -ENOMEM; - } __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); kunmap(vmx->nested.virtual_apic_page); @@ -4987,7 +4983,6 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) vmcs_write16(GUEST_INTR_STATUS, status); } } - return 0; } static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) @@ -5236,10 +5231,8 @@ static void ept_set_mmio_spte_mask(void) /* * EPT Misconfigurations can be generated if the value of bits 2:0 * of an EPT paging-structure entry is 110b (write/execute). - * Also, magic bits (0x3ull << 62) is set to quickly identify mmio - * spte. */ - kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull); + kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE); } #define VMX_XSS_EXIT_BITMAP 0 @@ -6152,7 +6145,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu) static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) { - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_apic_update_ppr(vcpu); return 1; } @@ -6374,14 +6367,20 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); trace_kvm_page_fault(gpa, exit_qualification); - /* it is a read fault? */ - error_code = (exit_qualification << 2) & PFERR_USER_MASK; - /* it is a write fault? */ - error_code |= exit_qualification & PFERR_WRITE_MASK; - /* It is a fetch fault? */ - error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK; - /* ept page table is present? */ - error_code |= (exit_qualification & 0x38) != 0; + /* Is it a read fault? */ + error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) + ? PFERR_USER_MASK : 0; + /* Is it a write fault? */ + error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) + ? 
PFERR_WRITE_MASK : 0; + /* Is it a fetch fault? */ + error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) + ? PFERR_FETCH_MASK : 0; + /* ept page table entry is present? */ + error_code |= (exit_qualification & + (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE | + EPT_VIOLATION_EXECUTABLE)) + ? PFERR_PRESENT_MASK : 0; vcpu->arch.exit_qualification = exit_qualification; @@ -6572,6 +6571,19 @@ static void wakeup_handler(void) spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); } +void vmx_enable_tdp(void) +{ + kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, + enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull, + enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, + 0ull, VMX_EPT_EXECUTABLE_MASK, + cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, + enable_ept_ad_bits ? 0ull : VMX_EPT_RWX_MASK); + + ept_set_mmio_spte_mask(); + kvm_enable_tdp(); +} + static __init int hardware_setup(void) { int r = -ENOMEM, i, msr; @@ -6697,16 +6709,9 @@ static __init int hardware_setup(void) /* SELF-IPI */ vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true); - if (enable_ept) { - kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, - (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, - (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull, - 0ull, VMX_EPT_EXECUTABLE_MASK, - cpu_has_vmx_ept_execute_only() ? - 0ull : VMX_EPT_READABLE_MASK); - ept_set_mmio_spte_mask(); - kvm_enable_tdp(); - } else + if (enable_ept) + vmx_enable_tdp(); + else kvm_disable_tdp(); update_ple_window_actual_max(); @@ -7168,9 +7173,6 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } - if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL)) - return 1; - if (vmx->nested.vmxon) { nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); return kvm_skip_emulated_instruction(vcpu); @@ -7182,6 +7184,9 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } + if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL)) + return 1; + if (cpu_has_vmx_msr_bitmap()) { vmx->nested.msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); @@ -8191,8 +8196,6 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_TASK_SWITCH: return true; case EXIT_REASON_CPUID: - if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa) - return false; return true; case EXIT_REASON_HLT: return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); @@ -9730,11 +9733,6 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, return false; } msr_bitmap_l1 = (unsigned long *)kmap(page); - if (!msr_bitmap_l1) { - nested_release_page_clean(page); - WARN_ON(1); - return false; - } memset(msr_bitmap_l0, 0xff, PAGE_SIZE); @@ -10696,7 +10694,8 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) return 0; } - return vmx_complete_nested_posted_interrupt(vcpu); + vmx_complete_nested_posted_interrupt(vcpu); + return 0; } static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d153be8929a6..500008f800dc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -180,6 +180,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, { "irq_injections", VCPU_STAT(irq_injections) }, { "nmi_injections", VCPU_STAT(nmi_injections) }, + { "req_event", VCPU_STAT(req_event) }, { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, { "mmu_pte_write", VM_STAT(mmu_pte_write) }, { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, @@ -190,6 +191,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] 
= { { "mmu_unsync", VM_STAT(mmu_unsync) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, { "largepages", VM_STAT(lpages) }, + { "max_mmu_page_hash_collisions", + VM_STAT(max_mmu_page_hash_collisions) }, { NULL } }; @@ -1139,6 +1142,7 @@ struct pvclock_gtod_data { u64 boot_ns; u64 nsec_base; + u64 wall_time_sec; }; static struct pvclock_gtod_data pvclock_gtod_data; @@ -1162,6 +1166,8 @@ static void update_pvclock_gtod(struct timekeeper *tk) vdata->boot_ns = boot_ns; vdata->nsec_base = tk->tkr_mono.xtime_nsec; + vdata->wall_time_sec = tk->xtime_sec; + write_seqcount_end(&vdata->seq); } #endif @@ -1623,6 +1629,28 @@ static int do_monotonic_boot(s64 *t, u64 *cycle_now) return mode; } +static int do_realtime(struct timespec *ts, u64 *cycle_now) +{ + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + unsigned long seq; + int mode; + u64 ns; + + do { + seq = read_seqcount_begin(&gtod->seq); + mode = gtod->clock.vclock_mode; + ts->tv_sec = gtod->wall_time_sec; + ns = gtod->nsec_base; + ns += vgettsc(cycle_now); + ns >>= gtod->clock.shift; + } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); + + ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); + ts->tv_nsec = ns; + + return mode; +} + /* returns true if host is using tsc clocksource */ static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now) { @@ -1632,6 +1660,17 @@ static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now) return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC; } + +/* returns true if host is using tsc clocksource */ +static bool kvm_get_walltime_and_clockread(struct timespec *ts, + u64 *cycle_now) +{ + /* checked again under seqlock below */ + if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC) + return false; + + return do_realtime(ts, cycle_now) == VCLOCK_TSC; +} #endif /* @@ -3896,7 +3935,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, goto split_irqchip_unlock; /* Pairs with irqchip_in_kernel. */ smp_wmb(); - kvm->arch.irqchip_split = true; + kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT; kvm->arch.nr_reserved_ioapic_pins = cap->args[0]; r = 0; split_irqchip_unlock: @@ -3959,40 +3998,41 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); break; case KVM_CREATE_IRQCHIP: { - struct kvm_pic *vpic; - mutex_lock(&kvm->lock); + r = -EEXIST; - if (kvm->arch.vpic) + if (irqchip_in_kernel(kvm)) goto create_irqchip_unlock; + r = -EINVAL; if (kvm->created_vcpus) goto create_irqchip_unlock; - r = -ENOMEM; - vpic = kvm_create_pic(kvm); - if (vpic) { - r = kvm_ioapic_init(kvm); - if (r) { - mutex_lock(&kvm->slots_lock); - kvm_destroy_pic(vpic); - mutex_unlock(&kvm->slots_lock); - goto create_irqchip_unlock; - } - } else + + r = kvm_pic_init(kvm); + if (r) goto create_irqchip_unlock; + + r = kvm_ioapic_init(kvm); + if (r) { + mutex_lock(&kvm->slots_lock); + kvm_pic_destroy(kvm); + mutex_unlock(&kvm->slots_lock); + goto create_irqchip_unlock; + } + r = kvm_setup_default_irq_routing(kvm); if (r) { mutex_lock(&kvm->slots_lock); mutex_lock(&kvm->irq_lock); kvm_ioapic_destroy(kvm); - kvm_destroy_pic(vpic); + kvm_pic_destroy(kvm); mutex_unlock(&kvm->irq_lock); mutex_unlock(&kvm->slots_lock); goto create_irqchip_unlock; } - /* Write kvm->irq_routing before kvm->arch.vpic. */ + /* Write kvm->irq_routing before enabling irqchip_in_kernel.
*/ smp_wmb(); - kvm->arch.vpic = vpic; + kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL; create_irqchip_unlock: mutex_unlock(&kvm->lock); break; @@ -4028,7 +4068,7 @@ long kvm_arch_vm_ioctl(struct file *filp, } r = -ENXIO; - if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) + if (!irqchip_kernel(kvm)) goto get_irqchip_out; r = kvm_vm_ioctl_get_irqchip(kvm, chip); if (r) @@ -4052,7 +4092,7 @@ long kvm_arch_vm_ioctl(struct file *filp, } r = -ENXIO; - if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) + if (!irqchip_kernel(kvm)) goto set_irqchip_out; r = kvm_vm_ioctl_set_irqchip(kvm, chip); if (r) @@ -4461,6 +4501,21 @@ out: } EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); +static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva, + gpa_t gpa, bool write) +{ + /* For APIC access vmexit */ + if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) + return 1; + + if (vcpu_match_mmio_gpa(vcpu, gpa)) { + trace_vcpu_match_mmio(gva, gpa, write, true); + return 1; + } + + return 0; +} + static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, gpa_t *gpa, struct x86_exception *exception, bool write) @@ -4487,16 +4542,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, if (*gpa == UNMAPPED_GVA) return -1; - /* For APIC access vmexit */ - if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) - return 1; - - if (vcpu_match_mmio_gpa(vcpu, *gpa)) { - trace_vcpu_match_mmio(gva, *gpa, write, true); - return 1; - } - - return 0; + return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write); } int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, @@ -4593,6 +4639,22 @@ static int emulator_read_write_onepage(unsigned long addr, void *val, int handled, ret; bool write = ops->write; struct kvm_mmio_fragment *frag; + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + + /* + * If the exit was due to a NPF we may already have a GPA. + * If the GPA is present, use it to avoid the GVA to GPA table walk. + * Note, this cannot be used on string operations since string + * operation using rep will only have the initial GPA from the NPF + * occurred. + */ + if (vcpu->arch.gpa_available && + emulator_can_use_gpa(ctxt) && + vcpu_is_mmio_gpa(vcpu, addr, exception->address, write) && + (addr & ~PAGE_MASK) == (exception->address & ~PAGE_MASK)) { + gpa = exception->address; + goto mmio; + } ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); @@ -5609,6 +5671,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, } restart: + /* Save the faulting GPA (cr2) in the address field */ + ctxt->exception.address = cr2; + r = x86_emulate_insn(ctxt); if (r == EMULATION_INTERCEPTED) @@ -5923,9 +5988,6 @@ static void kvm_set_mmio_spte_mask(void) /* Mask the reserved physical address bits. */ mask = rsvd_bits(maxphyaddr, 51); - /* Bit 62 is always reserved for 32bit host. */ - mask |= 0x3ull << 62; - /* Set the present bit. 
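The emulator_read_write_onepage() hunk above only reuses the GPA captured at NPF time when emulator_can_use_gpa() (added in emulate.c earlier in this series) allows it and the page offsets agree. Both tests condensed into a sketch; struct emu_ctxt is an illustrative subset, not the real x86_emulate_ctxt:

#include <stdbool.h>
#include <stdint.h>

struct emu_ctxt {		/* illustrative subset of x86_emulate_ctxt */
	bool rep_prefix;
	bool string_op;		/* decode flag String */
	bool two_mem_ops;	/* decode flag TwoMemOp */
};

/*
 * Mirrors emulator_can_use_gpa(): rep-string and two-memory-operand
 * instructions touch more than the faulting address, so the GPA saved
 * at #NPF time is only valid for their first access.
 */
static bool can_use_cached_gpa(const struct emu_ctxt *ctxt)
{
	if (ctxt->rep_prefix && ctxt->string_op)
		return false;
	return !ctxt->two_mem_ops;
}

/* The fast path additionally requires matching offsets within the page. */
static bool same_page_offset(uint64_t gva, uint64_t gpa)
{
	return (gva & 0xfffULL) == (gpa & 0xfffULL);
}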
	 */
 	mask |= 1ull;
@@ -6024,7 +6086,7 @@ int kvm_arch_init(void *opaque)
 
 	kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
 			PT_DIRTY_MASK, PT64_NX_MASK, 0,
-			PT_PRESENT_MASK);
+			PT_PRESENT_MASK, 0);
 	kvm_timer_init();
 
 	perf_register_guest_info_callbacks(&kvm_guest_cbs);
@@ -6086,6 +6148,33 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
 
+static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
+				unsigned long clock_type)
+{
+	struct kvm_clock_pairing clock_pairing;
+	struct timespec ts;
+	u64 cycle;
+	int ret;
+
+	if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK)
+		return -KVM_EOPNOTSUPP;
+
+	if (kvm_get_walltime_and_clockread(&ts, &cycle) == false)
+		return -KVM_EOPNOTSUPP;
+
+	clock_pairing.sec = ts.tv_sec;
+	clock_pairing.nsec = ts.tv_nsec;
+	clock_pairing.tsc = kvm_read_l1_tsc(vcpu, cycle);
+	clock_pairing.flags = 0;
+
+	ret = 0;
+	if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing,
+			    sizeof(struct kvm_clock_pairing)))
+		ret = -KVM_EFAULT;
+
+	return ret;
+}
+
 /*
  * kvm_pv_kick_cpu_op:  Kick a vcpu.
  *
@@ -6150,6 +6239,9 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 		kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
 		ret = 0;
 		break;
+	case KVM_HC_CLOCK_PAIRING:
+		ret = kvm_pv_clock_pairing(vcpu, a0, a1);
+		break;
 	default:
 		ret = -KVM_ENOSYS;
 		break;
@@ -6732,6 +6824,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	}
 
 	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
+		++vcpu->stat.req_event;
 		kvm_apic_accept_events(vcpu);
 		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
 			r = 1;
diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig
index bdce33291161..384f661a6496 100644
--- a/drivers/ptp/Kconfig
+++ b/drivers/ptp/Kconfig
@@ -90,4 +90,16 @@ config PTP_1588_CLOCK_PCH
 	  To compile this driver as a module, choose M here: the module
 	  will be called ptp_pch.
 
+config PTP_1588_CLOCK_KVM
+	tristate "KVM virtual PTP clock"
+	depends on PTP_1588_CLOCK
+	depends on KVM_GUEST && X86
+	default y
+	help
+	  This driver adds support for using kvm infrastructure as a PTP
+	  clock. This clock is only useful if you are using KVM guests.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called ptp_kvm.
+
 endmenu
diff --git a/drivers/ptp/Makefile b/drivers/ptp/Makefile
index 8b58597298de..530736161a8b 100644
--- a/drivers/ptp/Makefile
+++ b/drivers/ptp/Makefile
@@ -6,3 +6,4 @@ ptp-y					:= ptp_clock.o ptp_chardev.o ptp_sysfs.o
 obj-$(CONFIG_PTP_1588_CLOCK)		+= ptp.o
 obj-$(CONFIG_PTP_1588_CLOCK_IXP46X)	+= ptp_ixp46x.o
 obj-$(CONFIG_PTP_1588_CLOCK_PCH)	+= ptp_pch.o
+obj-$(CONFIG_PTP_1588_CLOCK_KVM)	+= ptp_kvm.o
diff --git a/drivers/ptp/ptp_kvm.c b/drivers/ptp/ptp_kvm.c
new file mode 100644
index 000000000000..0a54e8326a90
--- /dev/null
+++ b/drivers/ptp/ptp_kvm.c
@@ -0,0 +1,200 @@
+/*
+ * Virtual PTP 1588 clock for use with KVM guests
+ *
+ * Copyright (C) 2017 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <uapi/linux/kvm_para.h>
+#include <asm/kvm_para.h>
+#include <asm/pvclock.h>
+#include <asm/kvmclock.h>
+#include <uapi/asm/kvm_para.h>
+
+#include <linux/ptp_clock_kernel.h>
+
+struct kvm_ptp_clock {
+	struct ptp_clock *ptp_clock;
+	struct ptp_clock_info caps;
+};
+
+DEFINE_SPINLOCK(kvm_ptp_lock);
+
+static struct pvclock_vsyscall_time_info *hv_clock;
+
+static struct kvm_clock_pairing clock_pair;
+static phys_addr_t clock_pair_gpa;
+
+static int ptp_kvm_get_time_fn(ktime_t *device_time,
+			       struct system_counterval_t *system_counter,
+			       void *ctx)
+{
+	unsigned long ret;
+	struct timespec64 tspec;
+	unsigned version;
+	int cpu;
+	struct pvclock_vcpu_time_info *src;
+
+	spin_lock(&kvm_ptp_lock);
+
+	preempt_disable_notrace();
+	cpu = smp_processor_id();
+	src = &hv_clock[cpu].pvti;
+
+	do {
+		/*
+		 * We are using a TSC value read in the host's
+		 * kvm_hc_clock_pairing handling, so any change to
+		 * tsc_to_system_mul, tsc_shift, or any other pvclock
+		 * data invalidates that measurement.
+		 */
+		version = pvclock_read_begin(src);
+
+		ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
+				     clock_pair_gpa,
+				     KVM_CLOCK_PAIRING_WALLCLOCK);
+		if (ret != 0) {
+			pr_err_ratelimited("clock pairing hypercall ret %lu\n", ret);
+			spin_unlock(&kvm_ptp_lock);
+			preempt_enable_notrace();
+			return -EOPNOTSUPP;
+		}
+
+		tspec.tv_sec = clock_pair.sec;
+		tspec.tv_nsec = clock_pair.nsec;
+		ret = __pvclock_read_cycles(src, clock_pair.tsc);
+	} while (pvclock_read_retry(src, version));
+
+	preempt_enable_notrace();
+
+	system_counter->cycles = ret;
+	system_counter->cs = &kvm_clock;
+
+	*device_time = timespec64_to_ktime(tspec);
+
+	spin_unlock(&kvm_ptp_lock);
+
+	return 0;
+}
+
+static int ptp_kvm_getcrosststamp(struct ptp_clock_info *ptp,
+				  struct system_device_crosststamp *xtstamp)
+{
+	return get_device_system_crosststamp(ptp_kvm_get_time_fn, NULL,
+					     NULL, xtstamp);
+}
+
+/*
+ * PTP clock operations
+ */
+
+static int ptp_kvm_adjfreq(struct ptp_clock_info *ptp, s32 ppb)
+{
+	return -EOPNOTSUPP;
+}
+
+static int ptp_kvm_adjtime(struct ptp_clock_info *ptp, s64 delta)
+{
+	return -EOPNOTSUPP;
+}
+
+static int ptp_kvm_settime(struct ptp_clock_info *ptp,
+			   const struct timespec64 *ts)
+{
+	return -EOPNOTSUPP;
+}
+
+static int ptp_kvm_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts)
+{
+	unsigned long ret;
+	struct timespec64 tspec;
+
+	spin_lock(&kvm_ptp_lock);
+
+	ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
+			     clock_pair_gpa,
+			     KVM_CLOCK_PAIRING_WALLCLOCK);
+	if (ret != 0) {
+		pr_err_ratelimited("clock offset hypercall ret %lu\n", ret);
+		spin_unlock(&kvm_ptp_lock);
+		return -EOPNOTSUPP;
+	}
+
+	tspec.tv_sec = clock_pair.sec;
+	tspec.tv_nsec = clock_pair.nsec;
+	spin_unlock(&kvm_ptp_lock);
+
+	memcpy(ts, &tspec, sizeof(struct timespec64));
+
+	return 0;
+}
+
+static int ptp_kvm_enable(struct ptp_clock_info *ptp,
+			  struct ptp_clock_request *rq, int on)
+{
+	return -EOPNOTSUPP;
+}
+
+static struct ptp_clock_info ptp_kvm_caps = {
+	.owner		= THIS_MODULE,
+	.name		= "KVM virtual PTP",
+	.max_adj	= 0,
+	.n_ext_ts	= 0,
+	.n_pins		= 0,
+	.pps		= 0,
+	.adjfreq	= ptp_kvm_adjfreq,
+	.adjtime	= ptp_kvm_adjtime,
+	.gettime64	= ptp_kvm_gettime,
+	.settime64	= ptp_kvm_settime,
+	.enable		= ptp_kvm_enable,
+	.getcrosststamp = ptp_kvm_getcrosststamp,
+};
+
+/* module operations */
+
+static struct kvm_ptp_clock kvm_ptp_clock;
+
+static void __exit ptp_kvm_exit(void)
+{
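+	/* Tear down the clock device registered by ptp_kvm_init() below. */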
+	ptp_clock_unregister(kvm_ptp_clock.ptp_clock);
+}
+
+static int __init ptp_kvm_init(void)
+{
+	clock_pair_gpa = slow_virt_to_phys(&clock_pair);
+	hv_clock = pvclock_pvti_cpu0_va();
+
+	if (!hv_clock)
+		return -ENODEV;
+
+	kvm_ptp_clock.caps = ptp_kvm_caps;
+
+	kvm_ptp_clock.ptp_clock = ptp_clock_register(&kvm_ptp_clock.caps, NULL);
+
+	if (IS_ERR(kvm_ptp_clock.ptp_clock))
+		return PTR_ERR(kvm_ptp_clock.ptp_clock);
+
+	return 0;
+}
+
+module_init(ptp_kvm_init);
+module_exit(ptp_kvm_exit);
+
+MODULE_AUTHOR("Marcelo Tosatti <mtosatti@redhat.com>");
+MODULE_DESCRIPTION("PTP clock using KVMCLOCK");
+MODULE_LICENSE("GPL");
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index cac48eda1075..7964b970b9ad 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -685,6 +685,13 @@ struct kvm_ppc_smmu_info {
 	struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ];
 };
 
+/* for KVM_PPC_RESIZE_HPT_{PREPARE,COMMIT} */
+struct kvm_ppc_resize_hpt {
+	__u64 flags;
+	__u32 shift;
+	__u32 pad;
+};
+
 #define KVMIO 0xAE
 
 /* machine type bits, to be used as argument to KVM_CREATE_VM */
@@ -871,6 +878,9 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_S390_USER_INSTR0 130
 #define KVM_CAP_MSI_DEVID 131
 #define KVM_CAP_PPC_HTM 132
+#define KVM_CAP_SPAPR_RESIZE_HPT 133
+#define KVM_CAP_PPC_MMU_RADIX 134
+#define KVM_CAP_PPC_MMU_HASH_V3 135
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1187,6 +1197,13 @@ struct kvm_s390_ucas_mapping {
 #define KVM_ARM_SET_DEVICE_ADDR	  _IOW(KVMIO,  0xab, struct kvm_arm_device_addr)
 /* Available with KVM_CAP_PPC_RTAS */
 #define KVM_PPC_RTAS_DEFINE_TOKEN _IOW(KVMIO,  0xac, struct kvm_rtas_token_args)
+/* Available with KVM_CAP_SPAPR_RESIZE_HPT */
+#define KVM_PPC_RESIZE_HPT_PREPARE _IOR(KVMIO, 0xad, struct kvm_ppc_resize_hpt)
+#define KVM_PPC_RESIZE_HPT_COMMIT  _IOR(KVMIO, 0xae, struct kvm_ppc_resize_hpt)
+/* Available with KVM_CAP_PPC_MMU_RADIX or KVM_CAP_PPC_MMU_HASH_V3 */
+#define KVM_PPC_CONFIGURE_V3_MMU  _IOW(KVMIO,  0xaf, struct kvm_ppc_mmuv3_cfg)
+/* Available with KVM_CAP_PPC_MMU_RADIX */
+#define KVM_PPC_GET_RMMU_INFO	  _IOW(KVMIO,  0xb0, struct kvm_ppc_rmmu_info)
 
 /* ioctl for vm fd */
 #define KVM_CREATE_DEVICE	  _IOWR(KVMIO,  0xe0, struct kvm_create_device)
diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
index bf6cd7d5cac2..fed506aeff62 100644
--- a/include/uapi/linux/kvm_para.h
+++ b/include/uapi/linux/kvm_para.h
@@ -14,6 +14,7 @@
 #define KVM_EFAULT		EFAULT
 #define KVM_E2BIG		E2BIG
 #define KVM_EPERM		EPERM
+#define KVM_EOPNOTSUPP		95
 
 #define KVM_HC_VAPIC_POLL_IRQ		1
 #define KVM_HC_MMU_OP			2
@@ -23,6 +24,7 @@
 #define KVM_HC_MIPS_GET_CLOCK_FREQ	6
 #define KVM_HC_MIPS_EXIT_VM		7
 #define KVM_HC_MIPS_CONSOLE_OUTPUT	8
+#define KVM_HC_CLOCK_PAIRING		9
 
 /*
  * hypercalls use architecture specific
diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
index cddd5d06e1cb..3603556fa0d9 100644
--- a/tools/arch/x86/include/asm/cpufeatures.h
+++ b/tools/arch/x86/include/asm/cpufeatures.h
@@ -280,6 +280,7 @@
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */
 #define X86_FEATURE_PKU		(16*32+ 3) /* Protection Keys for Userspace */
 #define X86_FEATURE_OSPKE	(16*32+ 4) /* OS Protection Keys Enable */
+#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */
 
 /* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */
 #define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */
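
As a usage illustration for the ptp_kvm driver above: once it probes, the guest sees an ordinary PTP clock device, and the standard PTP_SYS_OFFSET_PRECISE ioctl from linux/ptp_clock.h reaches the driver through the .getcrosststamp hook registered in ptp_kvm_caps. A minimal userspace sketch follows; it assumes the KVM clock registered as /dev/ptp0, which depends on probe order and is not guaranteed.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/ptp_clock.h>

int main(void)
{
	struct ptp_sys_offset_precise precise;
	/* Assumption: the KVM virtual PTP clock is clock index 0. */
	int fd = open("/dev/ptp0", O_RDONLY);

	if (fd < 0) {
		perror("open /dev/ptp0");
		return 1;
	}

	/*
	 * The kernel fills in a crosstimestamped pair: device time (the
	 * host wall clock, obtained via KVM_HC_CLOCK_PAIRING) and the
	 * guest realtime clock sampled at the same counter value.
	 */
	if (ioctl(fd, PTP_SYS_OFFSET_PRECISE, &precise)) {
		perror("PTP_SYS_OFFSET_PRECISE");
		close(fd);
		return 1;
	}

	printf("host: %lld.%09u guest: %lld.%09u\n",
	       (long long)precise.device.sec, precise.device.nsec,
	       (long long)precise.sys_realtime.sec,
	       precise.sys_realtime.nsec);
	close(fd);
	return 0;
}

The difference between the two timestamps is the guest's offset from the host clock, which tools like chrony consume to discipline the guest clock without network PTP traffic.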