diff options
Diffstat (limited to 'arch/s390/kvm')
-rw-r--r-- | arch/s390/kvm/Kconfig | 16 | ||||
-rw-r--r-- | arch/s390/kvm/Makefile | 8 | ||||
-rw-r--r-- | arch/s390/kvm/diag.c | 56 | ||||
-rw-r--r-- | arch/s390/kvm/gaccess.c | 746 | ||||
-rw-r--r-- | arch/s390/kvm/gaccess.h | 153 | ||||
-rw-r--r-- | arch/s390/kvm/guestdbg.c | 12 | ||||
-rw-r--r-- | arch/s390/kvm/intercept.c | 239 | ||||
-rw-r--r-- | arch/s390/kvm/interrupt.c | 661 | ||||
-rw-r--r-- | arch/s390/kvm/irq.h | 19 | ||||
-rw-r--r-- | arch/s390/kvm/kvm-s390.c | 2479 | ||||
-rw-r--r-- | arch/s390/kvm/kvm-s390.h | 117 | ||||
-rw-r--r-- | arch/s390/kvm/pci.c | 704 | ||||
-rw-r--r-- | arch/s390/kvm/pci.h | 87 | ||||
-rw-r--r-- | arch/s390/kvm/priv.c | 195 | ||||
-rw-r--r-- | arch/s390/kvm/pv.c | 894 | ||||
-rw-r--r-- | arch/s390/kvm/sigp.c | 48 | ||||
-rw-r--r-- | arch/s390/kvm/trace-s390.h | 23 | ||||
-rw-r--r-- | arch/s390/kvm/vsie.c | 234 |
18 files changed, 5435 insertions, 1256 deletions
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index d3db3d7ed077..72e9b7dcdf7d 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -7,7 +7,7 @@ source "virt/kvm/Kconfig" menuconfig VIRTUALIZATION def_bool y prompt "KVM" - ---help--- + help Say Y here to get to see options for using your Linux host to run other operating systems inside virtual machines (guests). This option alone does not add any kernel code. @@ -20,20 +20,18 @@ config KVM def_tristate y prompt "Kernel-based Virtual Machine (KVM) support" depends on HAVE_KVM - select PREEMPT_NOTIFIERS select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_VCPU_ASYNC_IOCTL - select HAVE_KVM_EVENTFD select KVM_ASYNC_PF select KVM_ASYNC_PF_SYNC + select KVM_COMMON select HAVE_KVM_IRQCHIP - select HAVE_KVM_IRQFD select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_INVALID_WAKEUPS select HAVE_KVM_NO_POLL - select SRCU select KVM_VFIO - ---help--- + select MMU_NOTIFIER + help Support hosting paravirtualized guest machines using the SIE virtualization capability on the mainframe. This should work on any 64bit machine. @@ -49,14 +47,10 @@ config KVM config KVM_S390_UCONTROL bool "Userspace controlled virtual machines" depends on KVM - ---help--- + help Allow CAP_SYS_ADMIN users to create KVM virtual machines that are controlled by userspace. If unsure, say N. -# OK, it's a little counter-intuitive to do this, but it puts it neatly under -# the virtualization menu. -source "drivers/vhost/Kconfig" - endif # VIRTUALIZATION diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index 05ee90a5ea08..02217fb4ae10 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -3,12 +3,12 @@ # # Copyright IBM Corp. 2008 -KVM := ../../../virt/kvm -common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/async_pf.o $(KVM)/irqchip.o $(KVM)/vfio.o +include $(srctree)/virt/kvm/Makefile.kvm ccflags-y := -Ivirt/kvm -Iarch/s390/kvm -kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o -kvm-objs += diag.o gaccess.o guestdbg.o vsie.o +kvm-y += kvm-s390.o intercept.o interrupt.o priv.o sigp.o +kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o +kvm-$(CONFIG_VFIO_PCI_ZDEV_KVM) += pci.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index 3fb54ec2cf3e..3c65b8258ae6 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -2,7 +2,7 @@ /* * handling diagnose instructions * - * Copyright IBM Corp. 2008, 2011 + * Copyright IBM Corp. 2008, 2020 * * Author(s): Carsten Otte <cotte@de.ibm.com> * Christian Borntraeger <borntraeger@de.ibm.com> @@ -10,7 +10,6 @@ #include <linux/kvm.h> #include <linux/kvm_host.h> -#include <asm/pgalloc.h> #include <asm/gmap.h> #include <asm/virtio-ccw.h> #include "kvm-s390.h" @@ -25,7 +24,7 @@ static int diag_release_pages(struct kvm_vcpu *vcpu) start = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4]; end = vcpu->run->s.regs.gprs[vcpu->arch.sie_block->ipa & 0xf] + PAGE_SIZE; - vcpu->stat.diagnose_10++; + vcpu->stat.instruction_diagnose_10++; if (start & ~PAGE_MASK || end & ~PAGE_MASK || start >= end || start < 2 * PAGE_SIZE) @@ -75,7 +74,7 @@ static int __diag_page_ref_service(struct kvm_vcpu *vcpu) VCPU_EVENT(vcpu, 3, "diag page reference parameter block at 0x%llx", vcpu->run->s.regs.gprs[rx]); - vcpu->stat.diagnose_258++; + vcpu->stat.instruction_diagnose_258++; if (vcpu->run->s.regs.gprs[rx] & 7) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); rc = read_guest(vcpu, vcpu->run->s.regs.gprs[rx], rx, &parm, sizeof(parm)); @@ -146,18 +145,32 @@ static int __diag_page_ref_service(struct kvm_vcpu *vcpu) static int __diag_time_slice_end(struct kvm_vcpu *vcpu) { VCPU_EVENT(vcpu, 5, "%s", "diag time slice end"); - vcpu->stat.diagnose_44++; + vcpu->stat.instruction_diagnose_44++; kvm_vcpu_on_spin(vcpu, true); return 0; } +static int forward_cnt; +static unsigned long cur_slice; + +static int diag9c_forwarding_overrun(void) +{ + /* Reset the count on a new slice */ + if (time_after(jiffies, cur_slice)) { + cur_slice = jiffies; + forward_cnt = diag9c_forwarding_hz / HZ; + } + return forward_cnt-- <= 0 ? 1 : 0; +} + static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu) { struct kvm_vcpu *tcpu; + int tcpu_cpu; int tid; tid = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4]; - vcpu->stat.diagnose_9c++; + vcpu->stat.instruction_diagnose_9c++; /* yield to self */ if (tid == vcpu->vcpu_id) @@ -168,9 +181,22 @@ static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu) if (!tcpu) goto no_yield; - /* target already running */ - if (READ_ONCE(tcpu->cpu) >= 0) - goto no_yield; + /* target guest VCPU already running */ + tcpu_cpu = READ_ONCE(tcpu->cpu); + if (tcpu_cpu >= 0) { + if (!diag9c_forwarding_hz || diag9c_forwarding_overrun()) + goto no_yield; + + /* target host CPU already running */ + if (!vcpu_is_preempted(tcpu_cpu)) + goto no_yield; + smp_yield_cpu(tcpu_cpu); + VCPU_EVENT(vcpu, 5, + "diag time slice end directed to %d: yield forwarded", + tid); + vcpu->stat.diag_9c_forward++; + return 0; + } if (kvm_vcpu_yield_to(tcpu) <= 0) goto no_yield; @@ -179,7 +205,7 @@ static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu) return 0; no_yield: VCPU_EVENT(vcpu, 5, "diag time slice end directed to %d: ignored", tid); - vcpu->stat.diagnose_9c_ignored++; + vcpu->stat.diag_9c_ignored++; return 0; } @@ -189,7 +215,7 @@ static int __diag_ipl_functions(struct kvm_vcpu *vcpu) unsigned long subcode = vcpu->run->s.regs.gprs[reg] & 0xffff; VCPU_EVENT(vcpu, 3, "diag ipl functions, subcode %lx", subcode); - vcpu->stat.diagnose_308++; + vcpu->stat.instruction_diagnose_308++; switch (subcode) { case 3: vcpu->run->s390_reset_flags = KVM_S390_RESET_CLEAR; @@ -201,6 +227,10 @@ static int __diag_ipl_functions(struct kvm_vcpu *vcpu) return -EOPNOTSUPP; } + /* + * no need to check the return value of vcpu_stop as it can only have + * an error for protvirt, but protvirt means user cpu state + */ if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) kvm_s390_vcpu_stop(vcpu); vcpu->run->s390_reset_flags |= KVM_S390_RESET_SUBSYSTEM; @@ -217,7 +247,7 @@ static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu) { int ret; - vcpu->stat.diagnose_500++; + vcpu->stat.instruction_diagnose_500++; /* No virtio-ccw notification? Get out quickly. */ if (!vcpu->kvm->arch.css_support || (vcpu->run->s.regs.gprs[1] != KVM_S390_VIRTIO_CCW_NOTIFY)) @@ -271,7 +301,7 @@ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu) case 0x500: return __diag_virtio_hypercall(vcpu); default: - vcpu->stat.diagnose_other++; + vcpu->stat.instruction_diagnose_other++; return -EOPNOTSUPP; } } diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 07d30ffcfa41..5bfcc50c1a68 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -9,8 +9,9 @@ #include <linux/vmalloc.h> #include <linux/mm_types.h> #include <linux/err.h> - -#include <asm/pgtable.h> +#include <linux/pgtable.h> +#include <linux/bitfield.h> +#include <asm/fault.h> #include <asm/gmap.h> #include "kvm-s390.h" #include "gaccess.h" @@ -261,77 +262,77 @@ struct aste { /* .. more fields there */ }; -int ipte_lock_held(struct kvm_vcpu *vcpu) +int ipte_lock_held(struct kvm *kvm) { - if (vcpu->arch.sie_block->eca & ECA_SII) { + if (sclp.has_siif) { int rc; - read_lock(&vcpu->kvm->arch.sca_lock); - rc = kvm_s390_get_ipte_control(vcpu->kvm)->kh != 0; - read_unlock(&vcpu->kvm->arch.sca_lock); + read_lock(&kvm->arch.sca_lock); + rc = kvm_s390_get_ipte_control(kvm)->kh != 0; + read_unlock(&kvm->arch.sca_lock); return rc; } - return vcpu->kvm->arch.ipte_lock_count != 0; + return kvm->arch.ipte_lock_count != 0; } -static void ipte_lock_simple(struct kvm_vcpu *vcpu) +static void ipte_lock_simple(struct kvm *kvm) { union ipte_control old, new, *ic; - mutex_lock(&vcpu->kvm->arch.ipte_mutex); - vcpu->kvm->arch.ipte_lock_count++; - if (vcpu->kvm->arch.ipte_lock_count > 1) + mutex_lock(&kvm->arch.ipte_mutex); + kvm->arch.ipte_lock_count++; + if (kvm->arch.ipte_lock_count > 1) goto out; retry: - read_lock(&vcpu->kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(vcpu->kvm); + read_lock(&kvm->arch.sca_lock); + ic = kvm_s390_get_ipte_control(kvm); do { old = READ_ONCE(*ic); if (old.k) { - read_unlock(&vcpu->kvm->arch.sca_lock); + read_unlock(&kvm->arch.sca_lock); cond_resched(); goto retry; } new = old; new.k = 1; } while (cmpxchg(&ic->val, old.val, new.val) != old.val); - read_unlock(&vcpu->kvm->arch.sca_lock); + read_unlock(&kvm->arch.sca_lock); out: - mutex_unlock(&vcpu->kvm->arch.ipte_mutex); + mutex_unlock(&kvm->arch.ipte_mutex); } -static void ipte_unlock_simple(struct kvm_vcpu *vcpu) +static void ipte_unlock_simple(struct kvm *kvm) { union ipte_control old, new, *ic; - mutex_lock(&vcpu->kvm->arch.ipte_mutex); - vcpu->kvm->arch.ipte_lock_count--; - if (vcpu->kvm->arch.ipte_lock_count) + mutex_lock(&kvm->arch.ipte_mutex); + kvm->arch.ipte_lock_count--; + if (kvm->arch.ipte_lock_count) goto out; - read_lock(&vcpu->kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(vcpu->kvm); + read_lock(&kvm->arch.sca_lock); + ic = kvm_s390_get_ipte_control(kvm); do { old = READ_ONCE(*ic); new = old; new.k = 0; } while (cmpxchg(&ic->val, old.val, new.val) != old.val); - read_unlock(&vcpu->kvm->arch.sca_lock); - wake_up(&vcpu->kvm->arch.ipte_wq); + read_unlock(&kvm->arch.sca_lock); + wake_up(&kvm->arch.ipte_wq); out: - mutex_unlock(&vcpu->kvm->arch.ipte_mutex); + mutex_unlock(&kvm->arch.ipte_mutex); } -static void ipte_lock_siif(struct kvm_vcpu *vcpu) +static void ipte_lock_siif(struct kvm *kvm) { union ipte_control old, new, *ic; retry: - read_lock(&vcpu->kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(vcpu->kvm); + read_lock(&kvm->arch.sca_lock); + ic = kvm_s390_get_ipte_control(kvm); do { old = READ_ONCE(*ic); if (old.kg) { - read_unlock(&vcpu->kvm->arch.sca_lock); + read_unlock(&kvm->arch.sca_lock); cond_resched(); goto retry; } @@ -339,15 +340,15 @@ retry: new.k = 1; new.kh++; } while (cmpxchg(&ic->val, old.val, new.val) != old.val); - read_unlock(&vcpu->kvm->arch.sca_lock); + read_unlock(&kvm->arch.sca_lock); } -static void ipte_unlock_siif(struct kvm_vcpu *vcpu) +static void ipte_unlock_siif(struct kvm *kvm) { union ipte_control old, new, *ic; - read_lock(&vcpu->kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(vcpu->kvm); + read_lock(&kvm->arch.sca_lock); + ic = kvm_s390_get_ipte_control(kvm); do { old = READ_ONCE(*ic); new = old; @@ -355,25 +356,25 @@ static void ipte_unlock_siif(struct kvm_vcpu *vcpu) if (!new.kh) new.k = 0; } while (cmpxchg(&ic->val, old.val, new.val) != old.val); - read_unlock(&vcpu->kvm->arch.sca_lock); + read_unlock(&kvm->arch.sca_lock); if (!new.kh) - wake_up(&vcpu->kvm->arch.ipte_wq); + wake_up(&kvm->arch.ipte_wq); } -void ipte_lock(struct kvm_vcpu *vcpu) +void ipte_lock(struct kvm *kvm) { - if (vcpu->arch.sie_block->eca & ECA_SII) - ipte_lock_siif(vcpu); + if (sclp.has_siif) + ipte_lock_siif(kvm); else - ipte_lock_simple(vcpu); + ipte_lock_simple(kvm); } -void ipte_unlock(struct kvm_vcpu *vcpu) +void ipte_unlock(struct kvm *kvm) { - if (vcpu->arch.sie_block->eca & ECA_SII) - ipte_unlock_siif(vcpu); + if (sclp.has_siif) + ipte_unlock_siif(kvm); else - ipte_unlock_simple(vcpu); + ipte_unlock_simple(kvm); } static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, u8 ar, @@ -465,61 +466,55 @@ static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, u8 ar, return 0; } -struct trans_exc_code_bits { - unsigned long addr : 52; /* Translation-exception Address */ - unsigned long fsi : 2; /* Access Exception Fetch/Store Indication */ - unsigned long : 2; - unsigned long b56 : 1; - unsigned long : 3; - unsigned long b60 : 1; - unsigned long b61 : 1; - unsigned long as : 2; /* ASCE Identifier */ -}; - -enum { - FSI_UNKNOWN = 0, /* Unknown wether fetch or store */ - FSI_STORE = 1, /* Exception was due to store operation */ - FSI_FETCH = 2 /* Exception was due to fetch operation */ -}; - enum prot_type { PROT_TYPE_LA = 0, PROT_TYPE_KEYC = 1, PROT_TYPE_ALC = 2, PROT_TYPE_DAT = 3, PROT_TYPE_IEP = 4, + /* Dummy value for passing an initialized value when code != PGM_PROTECTION */ + PROT_NONE, }; -static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, - u8 ar, enum gacc_mode mode, enum prot_type prot) +static int trans_exc_ending(struct kvm_vcpu *vcpu, int code, unsigned long gva, u8 ar, + enum gacc_mode mode, enum prot_type prot, bool terminate) { struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm; - struct trans_exc_code_bits *tec; + union teid *teid; memset(pgm, 0, sizeof(*pgm)); pgm->code = code; - tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code; + teid = (union teid *)&pgm->trans_exc_code; switch (code) { case PGM_PROTECTION: switch (prot) { + case PROT_NONE: + /* We should never get here, acts like termination */ + WARN_ON_ONCE(1); + break; case PROT_TYPE_IEP: - tec->b61 = 1; - /* FALL THROUGH */ + teid->b61 = 1; + fallthrough; case PROT_TYPE_LA: - tec->b56 = 1; + teid->b56 = 1; break; case PROT_TYPE_KEYC: - tec->b60 = 1; + teid->b60 = 1; break; case PROT_TYPE_ALC: - tec->b60 = 1; - /* FALL THROUGH */ + teid->b60 = 1; + fallthrough; case PROT_TYPE_DAT: - tec->b61 = 1; + teid->b61 = 1; break; } - /* FALL THROUGH */ + if (terminate) { + teid->b56 = 0; + teid->b60 = 0; + teid->b61 = 0; + } + fallthrough; case PGM_ASCE_TYPE: case PGM_PAGE_TRANSLATION: case PGM_REGION_FIRST_TRANS: @@ -531,10 +526,10 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, * exc_access_id has to be set to 0 for some instructions. Both * cases have to be handled by the caller. */ - tec->addr = gva >> PAGE_SHIFT; - tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH; - tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as; - /* FALL THROUGH */ + teid->addr = gva >> PAGE_SHIFT; + teid->fsi = mode == GACC_STORE ? TEID_FSI_STORE : TEID_FSI_FETCH; + teid->as = psw_bits(vcpu->arch.sie_block->gpsw).as; + fallthrough; case PGM_ALEN_TRANSLATION: case PGM_ALE_SEQUENCE: case PGM_ASTE_VALIDITY: @@ -551,6 +546,12 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, return code; } +static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, u8 ar, + enum gacc_mode mode, enum prot_type prot) +{ + return trans_exc_ending(vcpu, code, gva, ar, mode, prot, false); +} + static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce, unsigned long ga, u8 ar, enum gacc_mode mode) { @@ -607,7 +608,7 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val) * Returns: - zero on success; @gpa contains the resulting absolute address * - a negative value if guest access failed due to e.g. broken * guest mapping - * - a positve value if an access exception happened. In this case + * - a positive value if an access exception happened. In this case * the returned value is the program interruption code as defined * by the architecture */ @@ -677,7 +678,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, dat_protection |= rfte.p; ptr = rfte.rto * PAGE_SIZE + vaddr.rsx * 8; } - /* fallthrough */ + fallthrough; case ASCE_TYPE_REGION2: { union region2_table_entry rste; @@ -695,7 +696,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, dat_protection |= rste.p; ptr = rste.rto * PAGE_SIZE + vaddr.rtx * 8; } - /* fallthrough */ + fallthrough; case ASCE_TYPE_REGION3: { union region3_table_entry rtte; @@ -723,7 +724,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, dat_protection |= rtte.fc0.p; ptr = rtte.fc0.sto * PAGE_SIZE + vaddr.sx * 8; } - /* fallthrough */ + fallthrough; case ASCE_TYPE_SEGMENT: { union segment_table_entry ste; @@ -794,48 +795,270 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu, return 1; } -static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, - unsigned long *pages, unsigned long nr_pages, - const union asce asce, enum gacc_mode mode) +static int vm_check_access_key(struct kvm *kvm, u8 access_key, + enum gacc_mode mode, gpa_t gpa) +{ + u8 storage_key, access_control; + bool fetch_protected; + unsigned long hva; + int r; + + if (access_key == 0) + return 0; + + hva = gfn_to_hva(kvm, gpa_to_gfn(gpa)); + if (kvm_is_error_hva(hva)) + return PGM_ADDRESSING; + + mmap_read_lock(current->mm); + r = get_guest_storage_key(current->mm, hva, &storage_key); + mmap_read_unlock(current->mm); + if (r) + return r; + access_control = FIELD_GET(_PAGE_ACC_BITS, storage_key); + if (access_control == access_key) + return 0; + fetch_protected = storage_key & _PAGE_FP_BIT; + if ((mode == GACC_FETCH || mode == GACC_IFETCH) && !fetch_protected) + return 0; + return PGM_PROTECTION; +} + +static bool fetch_prot_override_applicable(struct kvm_vcpu *vcpu, enum gacc_mode mode, + union asce asce) +{ + psw_t *psw = &vcpu->arch.sie_block->gpsw; + unsigned long override; + + if (mode == GACC_FETCH || mode == GACC_IFETCH) { + /* check if fetch protection override enabled */ + override = vcpu->arch.sie_block->gcr[0]; + override &= CR0_FETCH_PROTECTION_OVERRIDE; + /* not applicable if subject to DAT && private space */ + override = override && !(psw_bits(*psw).dat && asce.p); + return override; + } + return false; +} + +static bool fetch_prot_override_applies(unsigned long ga, unsigned int len) +{ + return ga < 2048 && ga + len <= 2048; +} + +static bool storage_prot_override_applicable(struct kvm_vcpu *vcpu) +{ + /* check if storage protection override enabled */ + return vcpu->arch.sie_block->gcr[0] & CR0_STORAGE_PROTECTION_OVERRIDE; +} + +static bool storage_prot_override_applies(u8 access_control) +{ + /* matches special storage protection override key (9) -> allow */ + return access_control == PAGE_SPO_ACC; +} + +static int vcpu_check_access_key(struct kvm_vcpu *vcpu, u8 access_key, + enum gacc_mode mode, union asce asce, gpa_t gpa, + unsigned long ga, unsigned int len) +{ + u8 storage_key, access_control; + unsigned long hva; + int r; + + /* access key 0 matches any storage key -> allow */ + if (access_key == 0) + return 0; + /* + * caller needs to ensure that gfn is accessible, so we can + * assume that this cannot fail + */ + hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gpa)); + mmap_read_lock(current->mm); + r = get_guest_storage_key(current->mm, hva, &storage_key); + mmap_read_unlock(current->mm); + if (r) + return r; + access_control = FIELD_GET(_PAGE_ACC_BITS, storage_key); + /* access key matches storage key -> allow */ + if (access_control == access_key) + return 0; + if (mode == GACC_FETCH || mode == GACC_IFETCH) { + /* it is a fetch and fetch protection is off -> allow */ + if (!(storage_key & _PAGE_FP_BIT)) + return 0; + if (fetch_prot_override_applicable(vcpu, mode, asce) && + fetch_prot_override_applies(ga, len)) + return 0; + } + if (storage_prot_override_applicable(vcpu) && + storage_prot_override_applies(access_control)) + return 0; + return PGM_PROTECTION; +} + +/** + * guest_range_to_gpas() - Calculate guest physical addresses of page fragments + * covering a logical range + * @vcpu: virtual cpu + * @ga: guest address, start of range + * @ar: access register + * @gpas: output argument, may be NULL + * @len: length of range in bytes + * @asce: address-space-control element to use for translation + * @mode: access mode + * @access_key: access key to mach the range's storage keys against + * + * Translate a logical range to a series of guest absolute addresses, + * such that the concatenation of page fragments starting at each gpa make up + * the whole range. + * The translation is performed as if done by the cpu for the given @asce, @ar, + * @mode and state of the @vcpu. + * If the translation causes an exception, its program interruption code is + * returned and the &struct kvm_s390_pgm_info pgm member of @vcpu is modified + * such that a subsequent call to kvm_s390_inject_prog_vcpu() will inject + * a correct exception into the guest. + * The resulting gpas are stored into @gpas, unless it is NULL. + * + * Note: All fragments except the first one start at the beginning of a page. + * When deriving the boundaries of a fragment from a gpa, all but the last + * fragment end at the end of the page. + * + * Return: + * * 0 - success + * * <0 - translation could not be performed, for example if guest + * memory could not be accessed + * * >0 - an access exception occurred. In this case the returned value + * is the program interruption code and the contents of pgm may + * be used to inject an exception into the guest. + */ +static int guest_range_to_gpas(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, + unsigned long *gpas, unsigned long len, + const union asce asce, enum gacc_mode mode, + u8 access_key) { psw_t *psw = &vcpu->arch.sie_block->gpsw; + unsigned int offset = offset_in_page(ga); + unsigned int fragment_len; int lap_enabled, rc = 0; enum prot_type prot; + unsigned long gpa; lap_enabled = low_address_protection_enabled(vcpu, asce); - while (nr_pages) { + while (min(PAGE_SIZE - offset, len) > 0) { + fragment_len = min(PAGE_SIZE - offset, len); ga = kvm_s390_logical_to_effective(vcpu, ga); if (mode == GACC_STORE && lap_enabled && is_low_address(ga)) return trans_exc(vcpu, PGM_PROTECTION, ga, ar, mode, PROT_TYPE_LA); - ga &= PAGE_MASK; if (psw_bits(*psw).dat) { - rc = guest_translate(vcpu, ga, pages, asce, mode, &prot); + rc = guest_translate(vcpu, ga, &gpa, asce, mode, &prot); if (rc < 0) return rc; } else { - *pages = kvm_s390_real_to_abs(vcpu, ga); - if (kvm_is_error_gpa(vcpu->kvm, *pages)) + gpa = kvm_s390_real_to_abs(vcpu, ga); + if (kvm_is_error_gpa(vcpu->kvm, gpa)) { rc = PGM_ADDRESSING; + prot = PROT_NONE; + } } if (rc) return trans_exc(vcpu, rc, ga, ar, mode, prot); - ga += PAGE_SIZE; - pages++; - nr_pages--; + rc = vcpu_check_access_key(vcpu, access_key, mode, asce, gpa, ga, + fragment_len); + if (rc) + return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_KEYC); + if (gpas) + *gpas++ = gpa; + offset = 0; + ga += fragment_len; + len -= fragment_len; + } + return 0; +} + +static int access_guest_page(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, + void *data, unsigned int len) +{ + const unsigned int offset = offset_in_page(gpa); + const gfn_t gfn = gpa_to_gfn(gpa); + int rc; + + if (mode == GACC_STORE) + rc = kvm_write_guest_page(kvm, gfn, data, offset, len); + else + rc = kvm_read_guest_page(kvm, gfn, data, offset, len); + return rc; +} + +static int +access_guest_page_with_key(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, + void *data, unsigned int len, u8 access_key) +{ + struct kvm_memory_slot *slot; + bool writable; + gfn_t gfn; + hva_t hva; + int rc; + + gfn = gpa >> PAGE_SHIFT; + slot = gfn_to_memslot(kvm, gfn); + hva = gfn_to_hva_memslot_prot(slot, gfn, &writable); + + if (kvm_is_error_hva(hva)) + return PGM_ADDRESSING; + /* + * Check if it's a ro memslot, even tho that can't occur (they're unsupported). + * Don't try to actually handle that case. + */ + if (!writable && mode == GACC_STORE) + return -EOPNOTSUPP; + hva += offset_in_page(gpa); + if (mode == GACC_STORE) + rc = copy_to_user_key((void __user *)hva, data, len, access_key); + else + rc = copy_from_user_key(data, (void __user *)hva, len, access_key); + if (rc) + return PGM_PROTECTION; + if (mode == GACC_STORE) + mark_page_dirty_in_slot(kvm, slot, gfn); + return 0; +} + +int access_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, void *data, + unsigned long len, enum gacc_mode mode, u8 access_key) +{ + int offset = offset_in_page(gpa); + int fragment_len; + int rc; + + while (min(PAGE_SIZE - offset, len) > 0) { + fragment_len = min(PAGE_SIZE - offset, len); + rc = access_guest_page_with_key(kvm, mode, gpa, data, fragment_len, access_key); + if (rc) + return rc; + offset = 0; + len -= fragment_len; + data += fragment_len; + gpa += fragment_len; } return 0; } -int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, - unsigned long len, enum gacc_mode mode) +int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, + void *data, unsigned long len, enum gacc_mode mode, + u8 access_key) { psw_t *psw = &vcpu->arch.sie_block->gpsw; - unsigned long _len, nr_pages, gpa, idx; - unsigned long pages_array[2]; - unsigned long *pages; + unsigned long nr_pages, idx; + unsigned long gpa_array[2]; + unsigned int fragment_len; + unsigned long *gpas; + enum prot_type prot; int need_ipte_lock; union asce asce; + bool try_storage_prot_override; + bool try_fetch_prot_override; int rc; if (!len) @@ -845,55 +1068,199 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, if (rc) return rc; nr_pages = (((ga & ~PAGE_MASK) + len - 1) >> PAGE_SHIFT) + 1; - pages = pages_array; - if (nr_pages > ARRAY_SIZE(pages_array)) - pages = vmalloc(array_size(nr_pages, sizeof(unsigned long))); - if (!pages) + gpas = gpa_array; + if (nr_pages > ARRAY_SIZE(gpa_array)) + gpas = vmalloc(array_size(nr_pages, sizeof(unsigned long))); + if (!gpas) return -ENOMEM; + try_fetch_prot_override = fetch_prot_override_applicable(vcpu, mode, asce); + try_storage_prot_override = storage_prot_override_applicable(vcpu); need_ipte_lock = psw_bits(*psw).dat && !asce.r; if (need_ipte_lock) - ipte_lock(vcpu); - rc = guest_page_range(vcpu, ga, ar, pages, nr_pages, asce, mode); - for (idx = 0; idx < nr_pages && !rc; idx++) { - gpa = *(pages + idx) + (ga & ~PAGE_MASK); - _len = min(PAGE_SIZE - (gpa & ~PAGE_MASK), len); - if (mode == GACC_STORE) - rc = kvm_write_guest(vcpu->kvm, gpa, data, _len); + ipte_lock(vcpu->kvm); + /* + * Since we do the access further down ultimately via a move instruction + * that does key checking and returns an error in case of a protection + * violation, we don't need to do the check during address translation. + * Skip it by passing access key 0, which matches any storage key, + * obviating the need for any further checks. As a result the check is + * handled entirely in hardware on access, we only need to take care to + * forego key protection checking if fetch protection override applies or + * retry with the special key 9 in case of storage protection override. + */ + rc = guest_range_to_gpas(vcpu, ga, ar, gpas, len, asce, mode, 0); + if (rc) + goto out_unlock; + for (idx = 0; idx < nr_pages; idx++) { + fragment_len = min(PAGE_SIZE - offset_in_page(gpas[idx]), len); + if (try_fetch_prot_override && fetch_prot_override_applies(ga, fragment_len)) { + rc = access_guest_page(vcpu->kvm, mode, gpas[idx], + data, fragment_len); + } else { + rc = access_guest_page_with_key(vcpu->kvm, mode, gpas[idx], + data, fragment_len, access_key); + } + if (rc == PGM_PROTECTION && try_storage_prot_override) + rc = access_guest_page_with_key(vcpu->kvm, mode, gpas[idx], + data, fragment_len, PAGE_SPO_ACC); + if (rc) + break; + len -= fragment_len; + data += fragment_len; + ga = kvm_s390_logical_to_effective(vcpu, ga + fragment_len); + } + if (rc > 0) { + bool terminate = (mode == GACC_STORE) && (idx > 0); + + if (rc == PGM_PROTECTION) + prot = PROT_TYPE_KEYC; else - rc = kvm_read_guest(vcpu->kvm, gpa, data, _len); - len -= _len; - ga += _len; - data += _len; + prot = PROT_NONE; + rc = trans_exc_ending(vcpu, rc, ga, ar, mode, prot, terminate); } +out_unlock: if (need_ipte_lock) - ipte_unlock(vcpu); - if (nr_pages > ARRAY_SIZE(pages_array)) - vfree(pages); + ipte_unlock(vcpu->kvm); + if (nr_pages > ARRAY_SIZE(gpa_array)) + vfree(gpas); return rc; } int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data, unsigned long len, enum gacc_mode mode) { - unsigned long _len, gpa; + unsigned int fragment_len; + unsigned long gpa; int rc = 0; while (len && !rc) { gpa = kvm_s390_real_to_abs(vcpu, gra); - _len = min(PAGE_SIZE - (gpa & ~PAGE_MASK), len); - if (mode) - rc = write_guest_abs(vcpu, gpa, data, _len); - else - rc = read_guest_abs(vcpu, gpa, data, _len); - len -= _len; - gra += _len; - data += _len; + fragment_len = min(PAGE_SIZE - offset_in_page(gpa), len); + rc = access_guest_page(vcpu->kvm, mode, gpa, data, fragment_len); + len -= fragment_len; + gra += fragment_len; + data += fragment_len; } return rc; } /** - * guest_translate_address - translate guest logical into guest absolute address + * cmpxchg_guest_abs_with_key() - Perform cmpxchg on guest absolute address. + * @kvm: Virtual machine instance. + * @gpa: Absolute guest address of the location to be changed. + * @len: Operand length of the cmpxchg, required: 1 <= len <= 16. Providing a + * non power of two will result in failure. + * @old_addr: Pointer to old value. If the location at @gpa contains this value, + * the exchange will succeed. After calling cmpxchg_guest_abs_with_key() + * *@old_addr contains the value at @gpa before the attempt to + * exchange the value. + * @new: The value to place at @gpa. + * @access_key: The access key to use for the guest access. + * @success: output value indicating if an exchange occurred. + * + * Atomically exchange the value at @gpa by @new, if it contains *@old. + * Honors storage keys. + * + * Return: * 0: successful exchange + * * >0: a program interruption code indicating the reason cmpxchg could + * not be attempted + * * -EINVAL: address misaligned or len not power of two + * * -EAGAIN: transient failure (len 1 or 2) + * * -EOPNOTSUPP: read-only memslot (should never occur) + */ +int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, + __uint128_t *old_addr, __uint128_t new, + u8 access_key, bool *success) +{ + gfn_t gfn = gpa_to_gfn(gpa); + struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); + bool writable; + hva_t hva; + int ret; + + if (!IS_ALIGNED(gpa, len)) + return -EINVAL; + + hva = gfn_to_hva_memslot_prot(slot, gfn, &writable); + if (kvm_is_error_hva(hva)) + return PGM_ADDRESSING; + /* + * Check if it's a read-only memslot, even though that cannot occur + * since those are unsupported. + * Don't try to actually handle that case. + */ + if (!writable) + return -EOPNOTSUPP; + + hva += offset_in_page(gpa); + /* + * The cmpxchg_user_key macro depends on the type of "old", so we need + * a case for each valid length and get some code duplication as long + * as we don't introduce a new macro. + */ + switch (len) { + case 1: { + u8 old; + + ret = cmpxchg_user_key((u8 __user *)hva, &old, *old_addr, new, access_key); + *success = !ret && old == *old_addr; + *old_addr = old; + break; + } + case 2: { + u16 old; + + ret = cmpxchg_user_key((u16 __user *)hva, &old, *old_addr, new, access_key); + *success = !ret && old == *old_addr; + *old_addr = old; + break; + } + case 4: { + u32 old; + + ret = cmpxchg_user_key((u32 __user *)hva, &old, *old_addr, new, access_key); + *success = !ret && old == *old_addr; + *old_addr = old; + break; + } + case 8: { + u64 old; + + ret = cmpxchg_user_key((u64 __user *)hva, &old, *old_addr, new, access_key); + *success = !ret && old == *old_addr; + *old_addr = old; + break; + } + case 16: { + __uint128_t old; + + ret = cmpxchg_user_key((__uint128_t __user *)hva, &old, *old_addr, new, access_key); + *success = !ret && old == *old_addr; + *old_addr = old; + break; + } + default: + return -EINVAL; + } + if (*success) + mark_page_dirty_in_slot(kvm, slot, gfn); + /* + * Assume that the fault is caused by protection, either key protection + * or user page write protection. + */ + if (ret == -EFAULT) + ret = PGM_PROTECTION; + return ret; +} + +/** + * guest_translate_address_with_key - translate guest logical into guest absolute address + * @vcpu: virtual cpu + * @gva: Guest virtual address + * @ar: Access register + * @gpa: Guest physical address + * @mode: Translation access mode + * @access_key: access key to mach the storage key with * * Parameter semantics are the same as the ones from guest_translate. * The memory contents at the guest address are not changed. @@ -901,11 +1268,10 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, * Note: The IPTE lock is not taken during this function, so the caller * has to take care of this. */ -int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, - unsigned long *gpa, enum gacc_mode mode) +int guest_translate_address_with_key(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, + unsigned long *gpa, enum gacc_mode mode, + u8 access_key) { - psw_t *psw = &vcpu->arch.sie_block->gpsw; - enum prot_type prot; union asce asce; int rc; @@ -913,49 +1279,62 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, rc = get_vcpu_asce(vcpu, &asce, gva, ar, mode); if (rc) return rc; - if (is_low_address(gva) && low_address_protection_enabled(vcpu, asce)) { - if (mode == GACC_STORE) - return trans_exc(vcpu, PGM_PROTECTION, gva, 0, - mode, PROT_TYPE_LA); - } + return guest_range_to_gpas(vcpu, gva, ar, gpa, 1, asce, mode, + access_key); +} - if (psw_bits(*psw).dat && !asce.r) { /* Use DAT? */ - rc = guest_translate(vcpu, gva, gpa, asce, mode, &prot); - if (rc > 0) - return trans_exc(vcpu, rc, gva, 0, mode, prot); - } else { - *gpa = kvm_s390_real_to_abs(vcpu, gva); - if (kvm_is_error_gpa(vcpu->kvm, *gpa)) - return trans_exc(vcpu, rc, gva, PGM_ADDRESSING, mode, 0); - } +/** + * check_gva_range - test a range of guest virtual addresses for accessibility + * @vcpu: virtual cpu + * @gva: Guest virtual address + * @ar: Access register + * @length: Length of test range + * @mode: Translation access mode + * @access_key: access key to mach the storage keys with + */ +int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, + unsigned long length, enum gacc_mode mode, u8 access_key) +{ + union asce asce; + int rc = 0; + + rc = get_vcpu_asce(vcpu, &asce, gva, ar, mode); + if (rc) + return rc; + ipte_lock(vcpu->kvm); + rc = guest_range_to_gpas(vcpu, gva, ar, NULL, length, asce, mode, + access_key); + ipte_unlock(vcpu->kvm); return rc; } /** - * check_gva_range - test a range of guest virtual addresses for accessibility + * check_gpa_range - test a range of guest physical addresses for accessibility + * @kvm: virtual machine instance + * @gpa: guest physical address + * @length: length of test range + * @mode: access mode to test, relevant for storage keys + * @access_key: access key to mach the storage keys with */ -int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, - unsigned long length, enum gacc_mode mode) +int check_gpa_range(struct kvm *kvm, unsigned long gpa, unsigned long length, + enum gacc_mode mode, u8 access_key) { - unsigned long gpa; - unsigned long currlen; + unsigned int fragment_len; int rc = 0; - ipte_lock(vcpu); - while (length > 0 && !rc) { - currlen = min(length, PAGE_SIZE - (gva % PAGE_SIZE)); - rc = guest_translate_address(vcpu, gva, ar, &gpa, mode); - gva += currlen; - length -= currlen; + while (length && !rc) { + fragment_len = min(PAGE_SIZE - offset_in_page(gpa), length); + rc = vm_check_access_key(kvm, access_key, mode, gpa); + length -= fragment_len; + gpa += fragment_len; } - ipte_unlock(vcpu); - return rc; } /** * kvm_s390_check_low_addr_prot_real - check for low-address protection + * @vcpu: virtual cpu * @gra: Guest real address * * Checks whether an address is subject to low-address protection and set @@ -976,13 +1355,17 @@ int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra) * kvm_s390_shadow_tables - walk the guest page table and create shadow tables * @sg: pointer to the shadow guest address space structure * @saddr: faulting address in the shadow gmap - * @pgt: pointer to the page table address result + * @pgt: pointer to the beginning of the page table for the given address if + * successful (return value 0), or to the first invalid DAT entry in + * case of exceptions (return value > 0) + * @dat_protection: referenced memory is write protected * @fake: pgt references contiguous guest memory block, not a pgtable */ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, unsigned long *pgt, int *dat_protection, int *fake) { + struct kvm *kvm; struct gmap *parent; union asce asce; union vaddress vaddr; @@ -991,6 +1374,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, *fake = 0; *dat_protection = 0; + kvm = sg->private; parent = sg->parent; vaddr.addr = saddr; asce.val = sg->orig_asce; @@ -1034,6 +1418,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, rfte.val = ptr; goto shadow_r2t; } + *pgt = ptr + vaddr.rfx * 8; rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val); if (rc) return rc; @@ -1050,7 +1435,9 @@ shadow_r2t: rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake); if (rc) return rc; - } /* fallthrough */ + kvm->stat.gmap_shadow_r1_entry++; + } + fallthrough; case ASCE_TYPE_REGION2: { union region2_table_entry rste; @@ -1059,6 +1446,7 @@ shadow_r2t: rste.val = ptr; goto shadow_r3t; } + *pgt = ptr + vaddr.rsx * 8; rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val); if (rc) return rc; @@ -1076,7 +1464,9 @@ shadow_r3t: rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake); if (rc) return rc; - } /* fallthrough */ + kvm->stat.gmap_shadow_r2_entry++; + } + fallthrough; case ASCE_TYPE_REGION3: { union region3_table_entry rtte; @@ -1085,6 +1475,7 @@ shadow_r3t: rtte.val = ptr; goto shadow_sgt; } + *pgt = ptr + vaddr.rtx * 8; rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val); if (rc) return rc; @@ -1111,7 +1502,9 @@ shadow_sgt: rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake); if (rc) return rc; - } /* fallthrough */ + kvm->stat.gmap_shadow_r3_entry++; + } + fallthrough; case ASCE_TYPE_SEGMENT: { union segment_table_entry ste; @@ -1120,6 +1513,7 @@ shadow_sgt: ste.val = ptr; goto shadow_pgt; } + *pgt = ptr + vaddr.sx * 8; rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val); if (rc) return rc; @@ -1142,6 +1536,7 @@ shadow_pgt: rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake); if (rc) return rc; + kvm->stat.gmap_shadow_sg_entry++; } } /* Return the parent address of the page table */ @@ -1154,6 +1549,8 @@ shadow_pgt: * @vcpu: virtual cpu * @sg: pointer to the shadow guest address space structure * @saddr: faulting address in the shadow gmap + * @datptr: will contain the address of the faulting DAT table entry, or of + * the valid leaf, plus some flags * * Returns: - 0 if the shadow fault was successfully resolved * - > 0 (pgm exception code) on exceptions while faulting @@ -1162,21 +1559,21 @@ shadow_pgt: * - -ENOMEM if out of memory */ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, - unsigned long saddr) + unsigned long saddr, unsigned long *datptr) { union vaddress vaddr; union page_table_entry pte; - unsigned long pgt; + unsigned long pgt = 0; int dat_protection, fake; int rc; - down_read(&sg->mm->mmap_sem); + mmap_read_lock(sg->mm); /* * We don't want any guest-2 tables to change - so the parent * tables/pointers we read stay valid - unshadowing is however * always possible - only guest_table_lock protects us. */ - ipte_lock(vcpu); + ipte_lock(vcpu->kvm); rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake); if (rc) @@ -1188,8 +1585,20 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, pte.val = pgt + vaddr.px * PAGE_SIZE; goto shadow_page; } - if (!rc) - rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val); + + switch (rc) { + case PGM_SEGMENT_TRANSLATION: + case PGM_REGION_THIRD_TRANS: + case PGM_REGION_SECOND_TRANS: + case PGM_REGION_FIRST_TRANS: + pgt |= PEI_NOT_PTE; + break; + case 0: + pgt += vaddr.px * 8; + rc = gmap_read_table(sg->parent, pgt, &pte.val); + } + if (datptr) + *datptr = pgt | dat_protection * PEI_DAT_PROT; if (!rc && pte.i) rc = PGM_PAGE_TRANSLATION; if (!rc && pte.z) @@ -1198,7 +1607,8 @@ shadow_page: pte.p |= dat_protection; if (!rc) rc = gmap_shadow_page(sg, saddr, __pte(pte.val)); - ipte_unlock(vcpu); - up_read(&sg->mm->mmap_sem); + vcpu->kvm->stat.gmap_shadow_pg_entry++; + ipte_unlock(vcpu->kvm); + mmap_read_unlock(sg->mm); return rc; } diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index f4c51756c462..b320d12aa049 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -18,17 +18,14 @@ /** * kvm_s390_real_to_abs - convert guest real address to guest absolute address - * @vcpu - guest virtual cpu + * @prefix - guest prefix * @gra - guest real address * * Returns the guest absolute address that corresponds to the passed guest real - * address @gra of a virtual guest cpu by applying its prefix. + * address @gra of by applying the given prefix. */ -static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu, - unsigned long gra) +static inline unsigned long _kvm_s390_real_to_abs(u32 prefix, unsigned long gra) { - unsigned long prefix = kvm_s390_get_prefix(vcpu); - if (gra < 2 * PAGE_SIZE) gra += prefix; else if (gra >= prefix && gra < prefix + 2 * PAGE_SIZE) @@ -37,6 +34,43 @@ static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu, } /** + * kvm_s390_real_to_abs - convert guest real address to guest absolute address + * @vcpu - guest virtual cpu + * @gra - guest real address + * + * Returns the guest absolute address that corresponds to the passed guest real + * address @gra of a virtual guest cpu by applying its prefix. + */ +static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu, + unsigned long gra) +{ + return _kvm_s390_real_to_abs(kvm_s390_get_prefix(vcpu), gra); +} + +/** + * _kvm_s390_logical_to_effective - convert guest logical to effective address + * @psw: psw of the guest + * @ga: guest logical address + * + * Convert a guest logical address to an effective address by applying the + * rules of the addressing mode defined by bits 31 and 32 of the given PSW + * (extendended/basic addressing mode). + * + * Depending on the addressing mode, the upper 40 bits (24 bit addressing + * mode), 33 bits (31 bit addressing mode) or no bits (64 bit addressing + * mode) of @ga will be zeroed and the remaining bits will be returned. + */ +static inline unsigned long _kvm_s390_logical_to_effective(psw_t *psw, + unsigned long ga) +{ + if (psw_bits(*psw).eaba == PSW_BITS_AMODE_64BIT) + return ga; + if (psw_bits(*psw).eaba == PSW_BITS_AMODE_31BIT) + return ga & ((1UL << 31) - 1); + return ga & ((1UL << 24) - 1); +} + +/** * kvm_s390_logical_to_effective - convert guest logical to effective address * @vcpu: guest virtual cpu * @ga: guest logical address @@ -52,13 +86,7 @@ static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu, static inline unsigned long kvm_s390_logical_to_effective(struct kvm_vcpu *vcpu, unsigned long ga) { - psw_t *psw = &vcpu->arch.sie_block->gpsw; - - if (psw_bits(*psw).eaba == PSW_BITS_AMODE_64BIT) - return ga; - if (psw_bits(*psw).eaba == PSW_BITS_AMODE_31BIT) - return ga & ((1UL << 31) - 1); - return ga & ((1UL << 24) - 1); + return _kvm_s390_logical_to_effective(&vcpu->arch.sie_block->gpsw, ga); } /* @@ -158,24 +186,37 @@ enum gacc_mode { GACC_IFETCH, }; -int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, - u8 ar, unsigned long *gpa, enum gacc_mode mode); +int guest_translate_address_with_key(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, + unsigned long *gpa, enum gacc_mode mode, + u8 access_key); + int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, - unsigned long length, enum gacc_mode mode); + unsigned long length, enum gacc_mode mode, u8 access_key); + +int check_gpa_range(struct kvm *kvm, unsigned long gpa, unsigned long length, + enum gacc_mode mode, u8 access_key); + +int access_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, void *data, + unsigned long len, enum gacc_mode mode, u8 access_key); -int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, - unsigned long len, enum gacc_mode mode); +int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, + void *data, unsigned long len, enum gacc_mode mode, + u8 access_key); int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data, unsigned long len, enum gacc_mode mode); +int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, __uint128_t *old, + __uint128_t new, u8 access_key, bool *success); + /** - * write_guest - copy data from kernel space to guest space + * write_guest_with_key - copy data from kernel space to guest space * @vcpu: virtual cpu * @ga: guest address * @ar: access register * @data: source address in kernel space * @len: number of bytes to copy + * @access_key: access key the storage key needs to match * * Copy @len bytes from @data (kernel space) to @ga (guest address). * In order to copy data to guest space the PSW of the vcpu is inspected: @@ -186,8 +227,8 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, * The addressing mode of the PSW is also inspected, so that address wrap * around is taken into account for 24-, 31- and 64-bit addressing mode, * if the to be copied data crosses page boundaries in guest address space. - * In addition also low address and DAT protection are inspected before - * copying any data (key protection is currently not implemented). + * In addition low address, DAT and key protection checks are performed before + * copying any data. * * This function modifies the 'struct kvm_s390_pgm_info pgm' member of @vcpu. * In case of an access exception (e.g. protection exception) pgm will contain @@ -215,10 +256,53 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, * if data has been changed in guest space in case of an exception. */ static inline __must_check +int write_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, + void *data, unsigned long len, u8 access_key) +{ + return access_guest_with_key(vcpu, ga, ar, data, len, GACC_STORE, + access_key); +} + +/** + * write_guest - copy data from kernel space to guest space + * @vcpu: virtual cpu + * @ga: guest address + * @ar: access register + * @data: source address in kernel space + * @len: number of bytes to copy + * + * The behaviour of write_guest is identical to write_guest_with_key, except + * that the PSW access key is used instead of an explicit argument. + */ +static inline __must_check int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, unsigned long len) { - return access_guest(vcpu, ga, ar, data, len, GACC_STORE); + u8 access_key = psw_bits(vcpu->arch.sie_block->gpsw).key; + + return write_guest_with_key(vcpu, ga, ar, data, len, access_key); +} + +/** + * read_guest_with_key - copy data from guest space to kernel space + * @vcpu: virtual cpu + * @ga: guest address + * @ar: access register + * @data: destination address in kernel space + * @len: number of bytes to copy + * @access_key: access key the storage key needs to match + * + * Copy @len bytes from @ga (guest address) to @data (kernel space). + * + * The behaviour of read_guest_with_key is identical to write_guest_with_key, + * except that data will be copied from guest space to kernel space. + */ +static inline __must_check +int read_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, + void *data, unsigned long len, u8 access_key) +{ + return access_guest_with_key(vcpu, ga, ar, data, len, GACC_FETCH, + access_key); } /** @@ -231,14 +315,16 @@ int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, * * Copy @len bytes from @ga (guest address) to @data (kernel space). * - * The behaviour of read_guest is identical to write_guest, except that - * data will be copied from guest space to kernel space. + * The behaviour of read_guest is identical to read_guest_with_key, except + * that the PSW access key is used instead of an explicit argument. */ static inline __must_check int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data, unsigned long len) { - return access_guest(vcpu, ga, ar, data, len, GACC_FETCH); + u8 access_key = psw_bits(vcpu->arch.sie_block->gpsw).key; + + return read_guest_with_key(vcpu, ga, ar, data, len, access_key); } /** @@ -259,7 +345,10 @@ static inline __must_check int read_guest_instr(struct kvm_vcpu *vcpu, unsigned long ga, void *data, unsigned long len) { - return access_guest(vcpu, ga, 0, data, len, GACC_IFETCH); + u8 access_key = psw_bits(vcpu->arch.sie_block->gpsw).key; + + return access_guest_with_key(vcpu, ga, 0, data, len, GACC_IFETCH, + access_key); } /** @@ -354,12 +443,16 @@ int read_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data, return access_guest_real(vcpu, gra, data, len, 0); } -void ipte_lock(struct kvm_vcpu *vcpu); -void ipte_unlock(struct kvm_vcpu *vcpu); -int ipte_lock_held(struct kvm_vcpu *vcpu); +void ipte_lock(struct kvm *kvm); +void ipte_unlock(struct kvm *kvm); +int ipte_lock_held(struct kvm *kvm); int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra); +/* MVPG PEI indication bits */ +#define PEI_DAT_PROT 2 +#define PEI_NOT_PTE 4 + int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow, - unsigned long saddr); + unsigned long saddr, unsigned long *datptr); #endif /* __KVM_S390_GACCESS_H */ diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c index 394a5f53805b..80879fc73c90 100644 --- a/arch/s390/kvm/guestdbg.c +++ b/arch/s390/kvm/guestdbg.c @@ -184,7 +184,7 @@ static int __import_wp_info(struct kvm_vcpu *vcpu, if (wp_info->len < 0 || wp_info->len > MAX_WP_SIZE) return -EINVAL; - wp_info->old_data = kmalloc(bp_data->len, GFP_KERNEL); + wp_info->old_data = kmalloc(bp_data->len, GFP_KERNEL_ACCOUNT); if (!wp_info->old_data) return -ENOMEM; /* try to backup the original value */ @@ -213,8 +213,8 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, else if (dbg->arch.nr_hw_bp > MAX_BP_COUNT) return -EINVAL; - bp_data = memdup_user(dbg->arch.hw_bp, - sizeof(*bp_data) * dbg->arch.nr_hw_bp); + bp_data = memdup_array_user(dbg->arch.hw_bp, dbg->arch.nr_hw_bp, + sizeof(*bp_data)); if (IS_ERR(bp_data)) return PTR_ERR(bp_data); @@ -234,7 +234,7 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, if (nr_wp > 0) { wp_info = kmalloc_array(nr_wp, sizeof(*wp_info), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!wp_info) { ret = -ENOMEM; goto error; @@ -243,7 +243,7 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu, if (nr_bp > 0) { bp_info = kmalloc_array(nr_bp, sizeof(*bp_info), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!bp_info) { ret = -ENOMEM; goto error; @@ -349,7 +349,7 @@ static struct kvm_hw_wp_info_arch *any_wp_changed(struct kvm_vcpu *vcpu) if (!wp_info || !wp_info->old_data || wp_info->len <= 0) continue; - temp = kmalloc(wp_info->len, GFP_KERNEL); + temp = kmalloc(wp_info->len, GFP_KERNEL_ACCOUNT); if (!temp) continue; diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index a389fa85cca2..b16352083ff9 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -2,7 +2,7 @@ /* * in-kernel handling for sie intercepts * - * Copyright IBM Corp. 2008, 2014 + * Copyright IBM Corp. 2008, 2020 * * Author(s): Carsten Otte <cotte@de.ibm.com> * Christian Borntraeger <borntraeger@de.ibm.com> @@ -12,10 +12,10 @@ #include <linux/errno.h> #include <linux/pagemap.h> -#include <asm/kvm_host.h> #include <asm/asm-offsets.h> #include <asm/irq.h> #include <asm/sysinfo.h> +#include <asm/uv.h> #include "kvm-s390.h" #include "gaccess.h" @@ -79,6 +79,10 @@ static int handle_stop(struct kvm_vcpu *vcpu) return rc; } + /* + * no need to check the return value of vcpu_stop as it can only have + * an error for protvirt, but protvirt means user cpu state + */ if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) kvm_s390_vcpu_stop(vcpu); return -EOPNOTSUPP; @@ -213,7 +217,7 @@ static int handle_itdb(struct kvm_vcpu *vcpu) return 0; if (current->thread.per_flags & PER_FLAG_NO_TE) return 0; - itdb = (struct kvm_s390_itdb *)vcpu->arch.sie_block->itdba; + itdb = phys_to_virt(vcpu->arch.sie_block->itdba); rc = write_guest_lc(vcpu, __LC_PGM_TDB, itdb, sizeof(*itdb)); if (rc) return rc; @@ -224,6 +228,21 @@ static int handle_itdb(struct kvm_vcpu *vcpu) #define per_event(vcpu) (vcpu->arch.sie_block->iprcc & PGM_PER) +static bool should_handle_per_event(const struct kvm_vcpu *vcpu) +{ + if (!guestdbg_enabled(vcpu) || !per_event(vcpu)) + return false; + if (guestdbg_sstep_enabled(vcpu) && + vcpu->arch.sie_block->iprcc != PGM_PER) { + /* + * __vcpu_run() will exit after delivering the concurrently + * indicated condition. + */ + return false; + } + return true; +} + static int handle_prog(struct kvm_vcpu *vcpu) { psw_t psw; @@ -231,7 +250,14 @@ static int handle_prog(struct kvm_vcpu *vcpu) vcpu->stat.exit_program_interruption++; - if (guestdbg_enabled(vcpu) && per_event(vcpu)) { + /* + * Intercept 8 indicates a loop of specification exceptions + * for protected guests. + */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) + return -EOPNOTSUPP; + + if (should_handle_per_event(vcpu)) { rc = kvm_s390_handle_per_event(vcpu); if (rc) return rc; @@ -258,11 +284,20 @@ static int handle_prog(struct kvm_vcpu *vcpu) /** * handle_external_interrupt - used for external interruption interceptions + * @vcpu: virtual cpu + * + * This interception occurs if: + * - the CPUSTAT_EXT_INT bit was already set when the external interrupt + * occurred. In this case, the interrupt needs to be injected manually to + * preserve interrupt priority. + * - the external new PSW has external interrupts enabled, which will cause an + * interruption loop. We drop to userspace in this case. + * + * The latter case can be detected by inspecting the external mask bit in the + * external new psw. * - * This interception only occurs if the CPUSTAT_EXT_INT bit was set, or if - * the new PSW does not have external interrupts disabled. In the first case, - * we've got to deliver the interrupt manually, and in the second case, we - * drop to userspace to handle the situation there. + * Under PV, only the latter case can occur, since interrupt priorities are + * handled in the ultravisor. */ static int handle_external_interrupt(struct kvm_vcpu *vcpu) { @@ -273,10 +308,18 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu) vcpu->stat.exit_external_interrupt++; - rc = read_guest_lc(vcpu, __LC_EXT_NEW_PSW, &newpsw, sizeof(psw_t)); - if (rc) - return rc; - /* We can not handle clock comparator or timer interrupt with bad PSW */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + newpsw = vcpu->arch.sie_block->gpsw; + } else { + rc = read_guest_lc(vcpu, __LC_EXT_NEW_PSW, &newpsw, sizeof(psw_t)); + if (rc) + return rc; + } + + /* + * Clock comparator or timer interrupt with external interrupt enabled + * will cause interrupt loop. Drop to userspace. + */ if ((eic == EXT_IRQ_CLK_COMP || eic == EXT_IRQ_CPU_TIMER) && (newpsw.mask & PSW_MASK_EXT)) return -EOPNOTSUPP; @@ -304,7 +347,8 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu) } /** - * Handle MOVE PAGE partial execution interception. + * handle_mvpg_pei - Handle MOVE PAGE partial execution interception. + * @vcpu: virtual cpu * * This interception can only happen for guests with DAT disabled and * addresses that are currently not mapped in the host. Thus we try to @@ -318,18 +362,18 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu) kvm_s390_get_regs_rre(vcpu, ®1, ®2); - /* Make sure that the source is paged-in */ - rc = guest_translate_address(vcpu, vcpu->run->s.regs.gprs[reg2], - reg2, &srcaddr, GACC_FETCH); + /* Ensure that the source is paged-in, no actual access -> no key checking */ + rc = guest_translate_address_with_key(vcpu, vcpu->run->s.regs.gprs[reg2], + reg2, &srcaddr, GACC_FETCH, 0); if (rc) return kvm_s390_inject_prog_cond(vcpu, rc); rc = kvm_arch_fault_in_page(vcpu, srcaddr, 0); if (rc != 0) return rc; - /* Make sure that the destination is paged-in */ - rc = guest_translate_address(vcpu, vcpu->run->s.regs.gprs[reg1], - reg1, &dstaddr, GACC_STORE); + /* Ensure that the source is paged-in, no actual access -> no key checking */ + rc = guest_translate_address_with_key(vcpu, vcpu->run->s.regs.gprs[reg1], + reg1, &dstaddr, GACC_STORE, 0); if (rc) return kvm_s390_inject_prog_cond(vcpu, rc); rc = kvm_arch_fault_in_page(vcpu, dstaddr, 1); @@ -360,8 +404,8 @@ static int handle_partial_execution(struct kvm_vcpu *vcpu) */ int handle_sthyi(struct kvm_vcpu *vcpu) { - int reg1, reg2, r = 0; - u64 code, addr, cc = 0, rc = 0; + int reg1, reg2, cc = 0, r = 0; + u64 code, addr, rc = 0; struct sthyi_sctns *sctns = NULL; if (!test_kvm_facility(vcpu->kvm, 74)) @@ -384,21 +428,28 @@ int handle_sthyi(struct kvm_vcpu *vcpu) goto out; } - if (addr & ~PAGE_MASK) + if (!kvm_s390_pv_cpu_is_protected(vcpu) && (addr & ~PAGE_MASK)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - sctns = (void *)get_zeroed_page(GFP_KERNEL); + sctns = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!sctns) return -ENOMEM; cc = sthyi_fill(sctns, &rc); - + if (cc < 0) { + free_page((unsigned long)sctns); + return cc; + } out: if (!cc) { - r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE); - if (r) { - free_page((unsigned long)sctns); - return kvm_s390_inject_prog_cond(vcpu, r); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + memcpy(sida_addr(vcpu->arch.sie_block), sctns, PAGE_SIZE); + } else { + r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE); + if (r) { + free_page((unsigned long)sctns); + return kvm_s390_inject_prog_cond(vcpu, r); + } } } @@ -444,6 +495,110 @@ static int handle_operexc(struct kvm_vcpu *vcpu) return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); } +static int handle_pv_spx(struct kvm_vcpu *vcpu) +{ + u32 pref = *(u32 *)sida_addr(vcpu->arch.sie_block); + + kvm_s390_set_prefix(vcpu, pref); + trace_kvm_s390_handle_prefix(vcpu, 1, pref); + return 0; +} + +static int handle_pv_sclp(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; + + spin_lock(&fi->lock); + /* + * 2 cases: + * a: an sccb answering interrupt was already pending or in flight. + * As the sccb value is not known we can simply set some value to + * trigger delivery of a saved SCCB. UV will then use its saved + * copy of the SCCB value. + * b: an error SCCB interrupt needs to be injected so we also inject + * a fake SCCB address. Firmware will use the proper one. + * This makes sure, that both errors and real sccb returns will only + * be delivered after a notification intercept (instruction has + * finished) but not after others. + */ + fi->srv_signal.ext_params |= 0x43000; + set_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs); + clear_bit(IRQ_PEND_EXT_SERVICE, &fi->masked_irqs); + spin_unlock(&fi->lock); + return 0; +} + +static int handle_pv_uvc(struct kvm_vcpu *vcpu) +{ + struct uv_cb_share *guest_uvcb = sida_addr(vcpu->arch.sie_block); + struct uv_cb_cts uvcb = { + .header.cmd = UVC_CMD_UNPIN_PAGE_SHARED, + .header.len = sizeof(uvcb), + .guest_handle = kvm_s390_pv_get_handle(vcpu->kvm), + .gaddr = guest_uvcb->paddr, + }; + int rc; + + if (guest_uvcb->header.cmd != UVC_CMD_REMOVE_SHARED_ACCESS) { + WARN_ONCE(1, "Unexpected notification intercept for UVC 0x%x\n", + guest_uvcb->header.cmd); + return 0; + } + rc = gmap_make_secure(vcpu->arch.gmap, uvcb.gaddr, &uvcb); + /* + * If the unpin did not succeed, the guest will exit again for the UVC + * and we will retry the unpin. + */ + if (rc == -EINVAL) + return 0; + /* + * If we got -EAGAIN here, we simply return it. It will eventually + * get propagated all the way to userspace, which should then try + * again. + */ + return rc; +} + +static int handle_pv_notification(struct kvm_vcpu *vcpu) +{ + int ret; + + if (vcpu->arch.sie_block->ipa == 0xb210) + return handle_pv_spx(vcpu); + if (vcpu->arch.sie_block->ipa == 0xb220) + return handle_pv_sclp(vcpu); + if (vcpu->arch.sie_block->ipa == 0xb9a4) + return handle_pv_uvc(vcpu); + if (vcpu->arch.sie_block->ipa >> 8 == 0xae) { + /* + * Besides external call, other SIGP orders also cause a + * 108 (pv notify) intercept. In contrast to external call, + * these orders need to be emulated and hence the appropriate + * place to handle them is in handle_instruction(). + * So first try kvm_s390_handle_sigp_pei() and if that isn't + * successful, go on with handle_instruction(). + */ + ret = kvm_s390_handle_sigp_pei(vcpu); + if (!ret) + return ret; + } + + return handle_instruction(vcpu); +} + +static bool should_handle_per_ifetch(const struct kvm_vcpu *vcpu, int rc) +{ + /* Process PER, also if the instruction is processed in user space. */ + if (!(vcpu->arch.sie_block->icptstatus & 0x02)) + return false; + if (rc != 0 && rc != -EOPNOTSUPP) + return false; + if (guestdbg_sstep_enabled(vcpu) && vcpu->arch.local_int.pending_irqs) + /* __vcpu_run() will exit after delivering the interrupt. */ + return false; + return true; +} + int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) { int rc, per_rc = 0; @@ -478,15 +633,35 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) rc = handle_partial_execution(vcpu); break; case ICPT_KSS: - rc = kvm_s390_skey_check_enable(vcpu); + /* Instruction will be redriven, skip the PER check. */ + return kvm_s390_skey_check_enable(vcpu); + case ICPT_MCHKREQ: + case ICPT_INT_ENABLE: + /* + * PSW bit 13 or a CR (0, 6, 14) changed and we might + * now be able to deliver interrupts. The pre-run code + * will take care of this. + */ + rc = 0; + break; + case ICPT_PV_INSTR: + rc = handle_instruction(vcpu); + break; + case ICPT_PV_NOTIFY: + rc = handle_pv_notification(vcpu); + break; + case ICPT_PV_PREF: + rc = 0; + gmap_convert_to_secure(vcpu->arch.gmap, + kvm_s390_get_prefix(vcpu)); + gmap_convert_to_secure(vcpu->arch.gmap, + kvm_s390_get_prefix(vcpu) + PAGE_SIZE); break; default: return -EOPNOTSUPP; } - /* process PER, also if the instrution is processed in user space */ - if (vcpu->arch.sie_block->icptstatus & 0x02 && - (!rc || rc == -EOPNOTSUPP)) + if (should_handle_per_ifetch(vcpu, rc)) per_rc = kvm_s390_handle_per_ifetch_icpt(vcpu); return per_rc ? per_rc : rc; } diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 165dea4c7f19..fc4007cc067a 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2,7 +2,7 @@ /* * handling kvm guest interrupts * - * Copyright IBM Corp. 2008, 2015 + * Copyright IBM Corp. 2008, 2020 * * Author(s): Carsten Otte <cotte@de.ibm.com> */ @@ -28,9 +28,11 @@ #include <asm/switch_to.h> #include <asm/nmi.h> #include <asm/airq.h> +#include <asm/tpi.h> #include "kvm-s390.h" #include "gaccess.h" #include "trace-s390.h" +#include "pci.h" #define PFAULT_INIT 0x0600 #define PFAULT_DONE 0x0680 @@ -81,8 +83,9 @@ static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id) struct esca_block *sca = vcpu->kvm->arch.sca; union esca_sigp_ctrl *sigp_ctrl = &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); - union esca_sigp_ctrl new_val = {0}, old_val = *sigp_ctrl; + union esca_sigp_ctrl new_val = {0}, old_val; + old_val = READ_ONCE(*sigp_ctrl); new_val.scn = src_id; new_val.c = 1; old_val.c = 0; @@ -93,8 +96,9 @@ static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id) struct bsca_block *sca = vcpu->kvm->arch.sca; union bsca_sigp_ctrl *sigp_ctrl = &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); - union bsca_sigp_ctrl new_val = {0}, old_val = *sigp_ctrl; + union bsca_sigp_ctrl new_val = {0}, old_val; + old_val = READ_ONCE(*sigp_ctrl); new_val.scn = src_id; new_val.c = 1; old_val.c = 0; @@ -124,16 +128,18 @@ static void sca_clear_ext_call(struct kvm_vcpu *vcpu) struct esca_block *sca = vcpu->kvm->arch.sca; union esca_sigp_ctrl *sigp_ctrl = &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); - union esca_sigp_ctrl old = *sigp_ctrl; + union esca_sigp_ctrl old; + old = READ_ONCE(*sigp_ctrl); expect = old.value; rc = cmpxchg(&sigp_ctrl->value, old.value, 0); } else { struct bsca_block *sca = vcpu->kvm->arch.sca; union bsca_sigp_ctrl *sigp_ctrl = &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); - union bsca_sigp_ctrl old = *sigp_ctrl; + union bsca_sigp_ctrl old; + old = READ_ONCE(*sigp_ctrl); expect = old.value; rc = cmpxchg(&sigp_ctrl->value, old.value, 0); } @@ -297,11 +303,6 @@ static inline u8 gisa_get_ipm_or_restore_iam(struct kvm_s390_gisa_interrupt *gi) return 0; } -static inline int gisa_in_alert_list(struct kvm_s390_gisa *gisa) -{ - return READ_ONCE(gisa->next_alert) != (u32)(u64)gisa; -} - static inline void gisa_set_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) { set_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); @@ -312,11 +313,6 @@ static inline u8 gisa_get_ipm(struct kvm_s390_gisa *gisa) return READ_ONCE(gisa->ipm); } -static inline void gisa_clear_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) -{ - clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); -} - static inline int gisa_tac_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) { return test_and_clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); @@ -324,8 +320,11 @@ static inline int gisa_tac_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) static inline unsigned long pending_irqs_no_gisa(struct kvm_vcpu *vcpu) { - return vcpu->kvm->arch.float_int.pending_irqs | - vcpu->arch.local_int.pending_irqs; + unsigned long pending = vcpu->kvm->arch.float_int.pending_irqs | + vcpu->arch.local_int.pending_irqs; + + pending &= ~vcpu->kvm->arch.float_int.masked_irqs; + return pending; } static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu) @@ -383,10 +382,18 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu) __clear_bit(IRQ_PEND_EXT_CLOCK_COMP, &active_mask); if (!(vcpu->arch.sie_block->gcr[0] & CR0_CPU_TIMER_SUBMASK)) __clear_bit(IRQ_PEND_EXT_CPU_TIMER, &active_mask); - if (!(vcpu->arch.sie_block->gcr[0] & CR0_SERVICE_SIGNAL_SUBMASK)) + if (!(vcpu->arch.sie_block->gcr[0] & CR0_SERVICE_SIGNAL_SUBMASK)) { __clear_bit(IRQ_PEND_EXT_SERVICE, &active_mask); + __clear_bit(IRQ_PEND_EXT_SERVICE_EV, &active_mask); + } if (psw_mchk_disabled(vcpu)) active_mask &= ~IRQ_PEND_MCHK_MASK; + /* PV guest cpus can have a single interruption injected at a time. */ + if (kvm_s390_pv_cpu_get_handle(vcpu) && + vcpu->arch.sie_block->iictl != IICTL_CODE_NONE) + active_mask &= ~(IRQ_PEND_EXT_II_MASK | + IRQ_PEND_IO_MASK | + IRQ_PEND_MCHK_MASK); /* * Check both floating and local interrupt's cr14 because * bit IRQ_PEND_MCHK_REP could be set in both cases. @@ -408,13 +415,13 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu) static void __set_cpu_idle(struct kvm_vcpu *vcpu) { kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT); - set_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); + set_bit(vcpu->vcpu_idx, vcpu->kvm->arch.idle_mask); } static void __unset_cpu_idle(struct kvm_vcpu *vcpu) { kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT); - clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); + clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.idle_mask); } static void __reset_intercept_indicators(struct kvm_vcpu *vcpu) @@ -479,19 +486,23 @@ static void set_intercept_indicators(struct kvm_vcpu *vcpu) static int __must_check __deliver_cpu_timer(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - int rc; + int rc = 0; vcpu->stat.deliver_cputm++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_CPU_TIMER, 0, 0); - - rc = put_guest_lc(vcpu, EXT_IRQ_CPU_TIMER, - (u16 *)__LC_EXT_INT_CODE); - rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR); - rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_EXT; + vcpu->arch.sie_block->eic = EXT_IRQ_CPU_TIMER; + } else { + rc = put_guest_lc(vcpu, EXT_IRQ_CPU_TIMER, + (u16 *)__LC_EXT_INT_CODE); + rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR); + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + } clear_bit(IRQ_PEND_EXT_CPU_TIMER, &li->pending_irqs); return rc ? -EFAULT : 0; } @@ -499,19 +510,23 @@ static int __must_check __deliver_cpu_timer(struct kvm_vcpu *vcpu) static int __must_check __deliver_ckc(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - int rc; + int rc = 0; vcpu->stat.deliver_ckc++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_CLOCK_COMP, 0, 0); - - rc = put_guest_lc(vcpu, EXT_IRQ_CLK_COMP, - (u16 __user *)__LC_EXT_INT_CODE); - rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR); - rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_EXT; + vcpu->arch.sie_block->eic = EXT_IRQ_CLK_COMP; + } else { + rc = put_guest_lc(vcpu, EXT_IRQ_CLK_COMP, + (u16 __user *)__LC_EXT_INT_CODE); + rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR); + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + } clear_bit(IRQ_PEND_EXT_CLOCK_COMP, &li->pending_irqs); return rc ? -EFAULT : 0; } @@ -553,6 +568,20 @@ static int __write_machine_check(struct kvm_vcpu *vcpu, union mci mci; int rc; + /* + * All other possible payload for a machine check (e.g. the register + * contents in the save area) will be handled by the ultravisor, as + * the hypervisor does not not have the needed information for + * protected guests. + */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_MCHK; + vcpu->arch.sie_block->mcic = mchk->mcic; + vcpu->arch.sie_block->faddr = mchk->failing_storage_address; + vcpu->arch.sie_block->edc = mchk->ext_damage_code; + return 0; + } + mci.val = mchk->mcic; /* take care of lazy register loading */ save_fpu_regs(); @@ -610,7 +639,7 @@ static int __write_machine_check(struct kvm_vcpu *vcpu, rc |= put_guest_lc(vcpu, mci.val, (u64 __user *) __LC_MCCK_CODE); /* Register-save areas */ - if (MACHINE_HAS_VX) { + if (cpu_has_vx()) { convert_vx_to_fp(fprs, (__vector128 *) vcpu->run->s.regs.vrs); rc |= write_guest_lc(vcpu, __LC_FPREGS_SAVE_AREA, fprs, 128); } else { @@ -669,7 +698,7 @@ static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu) /* * We indicate floating repressible conditions along with * other pending conditions. Channel Report Pending and Channel - * Subsystem damage are the only two and and are indicated by + * Subsystem damage are the only two and are indicated by * bits in mcic and masked in cr14. */ if (test_and_clear_bit(IRQ_PEND_MCHK_REP, &fi->pending_irqs)) { @@ -696,17 +725,21 @@ static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu) static int __must_check __deliver_restart(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - int rc; + int rc = 0; VCPU_EVENT(vcpu, 3, "%s", "deliver: cpu restart"); vcpu->stat.deliver_restart_signal++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_RESTART, 0, 0); - rc = write_guest_lc(vcpu, - offsetof(struct lowcore, restart_old_psw), - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - rc |= read_guest_lc(vcpu, offsetof(struct lowcore, restart_psw), - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_RESTART; + } else { + rc = write_guest_lc(vcpu, + offsetof(struct lowcore, restart_old_psw), + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= read_guest_lc(vcpu, offsetof(struct lowcore, restart_psw), + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + } clear_bit(IRQ_PEND_RESTART, &li->pending_irqs); return rc ? -EFAULT : 0; } @@ -748,6 +781,12 @@ static int __must_check __deliver_emergency_signal(struct kvm_vcpu *vcpu) vcpu->stat.deliver_emergency_signal++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_EMERGENCY, cpu_addr, 0); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_EXT; + vcpu->arch.sie_block->eic = EXT_IRQ_EMERGENCY_SIG; + vcpu->arch.sie_block->extcpuaddr = cpu_addr; + return 0; + } rc = put_guest_lc(vcpu, EXT_IRQ_EMERGENCY_SIG, (u16 *)__LC_EXT_INT_CODE); @@ -776,6 +815,12 @@ static int __must_check __deliver_external_call(struct kvm_vcpu *vcpu) trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_EXTERNAL_CALL, extcall.code, 0); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_EXT; + vcpu->arch.sie_block->eic = EXT_IRQ_EXTERNAL_CALL; + vcpu->arch.sie_block->extcpuaddr = extcall.code; + return 0; + } rc = put_guest_lc(vcpu, EXT_IRQ_EXTERNAL_CALL, (u16 *)__LC_EXT_INT_CODE); @@ -787,6 +832,21 @@ static int __must_check __deliver_external_call(struct kvm_vcpu *vcpu) return rc ? -EFAULT : 0; } +static int __deliver_prog_pv(struct kvm_vcpu *vcpu, u16 code) +{ + switch (code) { + case PGM_SPECIFICATION: + vcpu->arch.sie_block->iictl = IICTL_CODE_SPECIFICATION; + break; + case PGM_OPERAND: + vcpu->arch.sie_block->iictl = IICTL_CODE_OPERAND; + break; + default: + return -EINVAL; + } + return 0; +} + static int __must_check __deliver_prog(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; @@ -807,6 +867,10 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu) trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_PROGRAM_INT, pgm_info.code, 0); + /* PER is handled by the ultravisor */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) + return __deliver_prog_pv(vcpu, pgm_info.code & ~PGM_PER); + switch (pgm_info.code & ~PGM_PER) { case PGM_AFX_TRANSLATION: case PGM_ASX_TRANSLATION: @@ -818,7 +882,7 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu) case PGM_PRIMARY_AUTHORITY: case PGM_SECONDARY_AUTHORITY: nullifying = true; - /* fall through */ + fallthrough; case PGM_SPACE_SWITCH: rc = put_guest_lc(vcpu, pgm_info.trans_exc_code, (u64 *)__LC_TRANS_EXC_CODE); @@ -892,7 +956,7 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu) /* bit 1+2 of the target are the ilc, so we can directly use ilen */ rc |= put_guest_lc(vcpu, ilen, (u16 *) __LC_PGM_ILC); rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->gbea, - (u64 *) __LC_LAST_BREAK); + (u64 *) __LC_PGM_LAST_BREAK); rc |= put_guest_lc(vcpu, pgm_info.code, (u16 *)__LC_PGM_INT_CODE); rc |= write_guest_lc(vcpu, __LC_PGM_OLD_PSW, @@ -902,20 +966,49 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu) return rc ? -EFAULT : 0; } +#define SCCB_MASK 0xFFFFFFF8 +#define SCCB_EVENT_PENDING 0x3 + +static int write_sclp(struct kvm_vcpu *vcpu, u32 parm) +{ + int rc; + + if (kvm_s390_pv_cpu_get_handle(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_EXT; + vcpu->arch.sie_block->eic = EXT_IRQ_SERVICE_SIG; + vcpu->arch.sie_block->eiparams = parm; + return 0; + } + + rc = put_guest_lc(vcpu, EXT_IRQ_SERVICE_SIG, (u16 *)__LC_EXT_INT_CODE); + rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR); + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= put_guest_lc(vcpu, parm, + (u32 *)__LC_EXT_PARAMS); + + return rc ? -EFAULT : 0; +} + static int __must_check __deliver_service(struct kvm_vcpu *vcpu) { struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; struct kvm_s390_ext_info ext; - int rc = 0; spin_lock(&fi->lock); - if (!(test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs))) { + if (test_bit(IRQ_PEND_EXT_SERVICE, &fi->masked_irqs) || + !(test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs))) { spin_unlock(&fi->lock); return 0; } ext = fi->srv_signal; memset(&fi->srv_signal, 0, sizeof(ext)); clear_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs); + clear_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs); + if (kvm_s390_pv_cpu_is_protected(vcpu)) + set_bit(IRQ_PEND_EXT_SERVICE, &fi->masked_irqs); spin_unlock(&fi->lock); VCPU_EVENT(vcpu, 4, "deliver: sclp parameter 0x%x", @@ -924,16 +1017,31 @@ static int __must_check __deliver_service(struct kvm_vcpu *vcpu) trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_SERVICE, ext.ext_params, 0); - rc = put_guest_lc(vcpu, EXT_IRQ_SERVICE_SIG, (u16 *)__LC_EXT_INT_CODE); - rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR); - rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - rc |= put_guest_lc(vcpu, ext.ext_params, - (u32 *)__LC_EXT_PARAMS); + return write_sclp(vcpu, ext.ext_params); +} - return rc ? -EFAULT : 0; +static int __must_check __deliver_service_ev(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; + struct kvm_s390_ext_info ext; + + spin_lock(&fi->lock); + if (!(test_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs))) { + spin_unlock(&fi->lock); + return 0; + } + ext = fi->srv_signal; + /* only clear the event bit */ + fi->srv_signal.ext_params &= ~SCCB_EVENT_PENDING; + clear_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs); + spin_unlock(&fi->lock); + + VCPU_EVENT(vcpu, 4, "%s", "deliver: sclp parameter event"); + vcpu->stat.deliver_service_signal++; + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_SERVICE, + ext.ext_params, 0); + + return write_sclp(vcpu, SCCB_EVENT_PENDING); } static int __must_check __deliver_pfault_done(struct kvm_vcpu *vcpu) @@ -1028,6 +1136,15 @@ static int __do_deliver_io(struct kvm_vcpu *vcpu, struct kvm_s390_io_info *io) { int rc; + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_IO; + vcpu->arch.sie_block->subchannel_id = io->subchannel_id; + vcpu->arch.sie_block->subchannel_nr = io->subchannel_nr; + vcpu->arch.sie_block->io_int_parm = io->io_int_parm; + vcpu->arch.sie_block->io_int_word = io->io_int_word; + return 0; + } + rc = put_guest_lc(vcpu, io->subchannel_id, (u16 *)__LC_SUBCHANNEL_ID); rc |= put_guest_lc(vcpu, io->subchannel_nr, (u16 *)__LC_SUBCHANNEL_NR); rc |= put_guest_lc(vcpu, io->io_int_parm, (u32 *)__LC_IO_INT_PARM); @@ -1166,7 +1283,7 @@ static u64 __calculate_sltime(struct kvm_vcpu *vcpu) /* already expired? */ if (cputm >> 63) return 0; - return min(sltime, tod_to_ns(cputm)); + return min_t(u64, sltime, tod_to_ns(cputm)); } } else if (cpu_timer_interrupts_enabled(vcpu)) { sltime = kvm_s390_get_cpu_timer(vcpu); @@ -1213,10 +1330,11 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu) hrtimer_start(&vcpu->arch.ckc_timer, sltime, HRTIMER_MODE_REL); VCPU_EVENT(vcpu, 4, "enabled wait: %llu ns", sltime); no_timer: - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - kvm_vcpu_block(vcpu); + kvm_vcpu_srcu_read_unlock(vcpu); + kvm_vcpu_halt(vcpu); + vcpu->valid_wakeup = false; __unset_cpu_idle(vcpu); - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); hrtimer_cancel(&vcpu->arch.ckc_timer); return 0; @@ -1269,6 +1387,7 @@ int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; int rc = 0; + bool delivered = false; unsigned long irq_type; unsigned long irqs; @@ -1329,6 +1448,9 @@ int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu) case IRQ_PEND_EXT_SERVICE: rc = __deliver_service(vcpu); break; + case IRQ_PEND_EXT_SERVICE_EV: + rc = __deliver_service_ev(vcpu); + break; case IRQ_PEND_PFAULT_DONE: rc = __deliver_pfault_done(vcpu); break; @@ -1339,6 +1461,19 @@ int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu) WARN_ONCE(1, "Unknown pending irq type %ld", irq_type); clear_bit(irq_type, &li->pending_irqs); } + delivered |= !rc; + } + + /* + * We delivered at least one interrupt and modified the PC. Force a + * singlestep event now. + */ + if (delivered && guestdbg_sstep_enabled(vcpu)) { + struct kvm_debug_exit_arch *debug_exit = &vcpu->run->debug.arch; + + debug_exit->addr = vcpu->arch.sie_block->gpsw.addr; + debug_exit->type = KVM_SINGLESTEP; + vcpu->guest_debug |= KVM_GUESTDBG_EXIT_PENDING; } set_intercept_indicators(vcpu); @@ -1421,7 +1556,7 @@ static int __inject_extcall(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) if (kvm_get_vcpu_by_id(vcpu->kvm, src_id) == NULL) return -EINVAL; - if (sclp.has_sigpif) + if (sclp.has_sigpif && !kvm_s390_pv_cpu_get_handle(vcpu)) return sca_inject_ext_call(vcpu, src_id); if (test_and_set_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs)) @@ -1668,7 +1803,7 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm, goto out; } gisa_out: - tmp_inti = kzalloc(sizeof(*inti), GFP_KERNEL); + tmp_inti = kzalloc(sizeof(*inti), GFP_KERNEL_ACCOUNT); if (tmp_inti) { tmp_inti->type = KVM_S390_INT_IO(1, 0, 0, 0); tmp_inti->io.io_int_word = isc_to_int_word(isc); @@ -1681,9 +1816,6 @@ out: return inti; } -#define SCCB_MASK 0xFFFFFFF8 -#define SCCB_EVENT_PENDING 0x3 - static int __inject_service(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) { @@ -1692,6 +1824,11 @@ static int __inject_service(struct kvm *kvm, kvm->stat.inject_service_signal++; spin_lock(&fi->lock); fi->srv_signal.ext_params |= inti->ext.ext_params & SCCB_EVENT_PENDING; + + /* We always allow events, track them separately from the sccb ints */ + if (fi->srv_signal.ext_params & SCCB_EVENT_PENDING) + set_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs); + /* * Early versions of the QEMU s390 bios will inject several * service interrupts after another without handling a @@ -1773,6 +1910,12 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) kvm->stat.inject_io++; isc = int_word_to_isc(inti->io.io_int_word); + /* + * We do not use the lock checking variant as this is just a + * performance optimization and we do not hold the lock here. + * This is ok as the code will pick interrupts from both "lists" + * for delivery. + */ if (gi->origin && inti->type & KVM_S390_INT_IO_AI_MASK) { VM_EVENT(kvm, 4, "%s isc %1u", "inject: I/O (AI/gisa)", isc); gisa_set_ipm_gisc(gi->origin, isc); @@ -1834,7 +1977,8 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type) break; case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: if (!(type & KVM_S390_INT_IO_AI_MASK && - kvm->arch.gisa_int.origin)) + kvm->arch.gisa_int.origin) || + kvm_s390_pv_cpu_get_handle(dst_vcpu)) kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_IO_INT); break; default: @@ -1881,7 +2025,7 @@ int kvm_s390_inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti; int rc; - inti = kzalloc(sizeof(*inti), GFP_KERNEL); + inti = kzalloc(sizeof(*inti), GFP_KERNEL_ACCOUNT); if (!inti) return -ENOMEM; @@ -1981,6 +2125,13 @@ int kvm_s390_is_stop_irq_pending(struct kvm_vcpu *vcpu) return test_bit(IRQ_PEND_SIGP_STOP, &li->pending_irqs); } +int kvm_s390_is_restart_irq_pending(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + + return test_bit(IRQ_PEND_RESTART, &li->pending_irqs); +} + void kvm_s390_clear_stop_irq(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; @@ -2080,6 +2231,10 @@ void kvm_s390_clear_float_irqs(struct kvm *kvm) struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int; int i; + mutex_lock(&kvm->lock); + if (!kvm_s390_pv_is_protected(kvm)) + fi->masked_irqs = 0; + mutex_unlock(&kvm->lock); spin_lock(&fi->lock); fi->pending_irqs = 0; memset(&fi->srv_signal, 0, sizeof(fi->srv_signal)); @@ -2146,7 +2301,8 @@ static int get_all_floating_irqs(struct kvm *kvm, u8 __user *usrbuf, u64 len) n++; } } - if (test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs)) { + if (test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs) || + test_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs)) { if (n == max_irqs) { /* signal userspace to try again */ ret = -ENOMEM; @@ -2190,7 +2346,7 @@ static int flic_ais_mode_get_all(struct kvm *kvm, struct kvm_device_attr *attr) return -EINVAL; if (!test_kvm_facility(kvm, 72)) - return -ENOTSUPP; + return -EOPNOTSUPP; mutex_lock(&fi->ais_lock); ais.simm = fi->simm; @@ -2275,7 +2431,7 @@ static int enqueue_floating_irq(struct kvm_device *dev, return -EINVAL; while (len >= sizeof(struct kvm_s390_irq)) { - inti = kzalloc(sizeof(*inti), GFP_KERNEL); + inti = kzalloc(sizeof(*inti), GFP_KERNEL_ACCOUNT); if (!inti) return -ENOMEM; @@ -2323,13 +2479,10 @@ static int register_io_adapter(struct kvm_device *dev, if (dev->kvm->arch.adapters[adapter_info.id] != NULL) return -EINVAL; - adapter = kzalloc(sizeof(*adapter), GFP_KERNEL); + adapter = kzalloc(sizeof(*adapter), GFP_KERNEL_ACCOUNT); if (!adapter) return -ENOMEM; - INIT_LIST_HEAD(&adapter->maps); - init_rwsem(&adapter->maps_lock); - atomic_set(&adapter->nr_maps, 0); adapter->id = adapter_info.id; adapter->isc = adapter_info.isc; adapter->maskable = adapter_info.maskable; @@ -2354,87 +2507,12 @@ int kvm_s390_mask_adapter(struct kvm *kvm, unsigned int id, bool masked) return ret; } -static int kvm_s390_adapter_map(struct kvm *kvm, unsigned int id, __u64 addr) -{ - struct s390_io_adapter *adapter = get_io_adapter(kvm, id); - struct s390_map_info *map; - int ret; - - if (!adapter || !addr) - return -EINVAL; - - map = kzalloc(sizeof(*map), GFP_KERNEL); - if (!map) { - ret = -ENOMEM; - goto out; - } - INIT_LIST_HEAD(&map->list); - map->guest_addr = addr; - map->addr = gmap_translate(kvm->arch.gmap, addr); - if (map->addr == -EFAULT) { - ret = -EFAULT; - goto out; - } - ret = get_user_pages_fast(map->addr, 1, FOLL_WRITE, &map->page); - if (ret < 0) - goto out; - BUG_ON(ret != 1); - down_write(&adapter->maps_lock); - if (atomic_inc_return(&adapter->nr_maps) < MAX_S390_ADAPTER_MAPS) { - list_add_tail(&map->list, &adapter->maps); - ret = 0; - } else { - put_page(map->page); - ret = -EINVAL; - } - up_write(&adapter->maps_lock); -out: - if (ret) - kfree(map); - return ret; -} - -static int kvm_s390_adapter_unmap(struct kvm *kvm, unsigned int id, __u64 addr) -{ - struct s390_io_adapter *adapter = get_io_adapter(kvm, id); - struct s390_map_info *map, *tmp; - int found = 0; - - if (!adapter || !addr) - return -EINVAL; - - down_write(&adapter->maps_lock); - list_for_each_entry_safe(map, tmp, &adapter->maps, list) { - if (map->guest_addr == addr) { - found = 1; - atomic_dec(&adapter->nr_maps); - list_del(&map->list); - put_page(map->page); - kfree(map); - break; - } - } - up_write(&adapter->maps_lock); - - return found ? 0 : -EINVAL; -} - void kvm_s390_destroy_adapters(struct kvm *kvm) { int i; - struct s390_map_info *map, *tmp; - for (i = 0; i < MAX_S390_IO_ADAPTERS; i++) { - if (!kvm->arch.adapters[i]) - continue; - list_for_each_entry_safe(map, tmp, - &kvm->arch.adapters[i]->maps, list) { - list_del(&map->list); - put_page(map->page); - kfree(map); - } + for (i = 0; i < MAX_S390_IO_ADAPTERS; i++) kfree(kvm->arch.adapters[i]); - } } static int modify_io_adapter(struct kvm_device *dev, @@ -2456,11 +2534,14 @@ static int modify_io_adapter(struct kvm_device *dev, if (ret > 0) ret = 0; break; + /* + * The following operations are no longer needed and therefore no-ops. + * The gpa to hva translation is done when an IRQ route is set up. The + * set_irq code uses get_user_pages_remote() to do the actual write. + */ case KVM_S390_IO_ADAPTER_MAP: - ret = kvm_s390_adapter_map(dev->kvm, req.id, req.addr); - break; case KVM_S390_IO_ADAPTER_UNMAP: - ret = kvm_s390_adapter_unmap(dev->kvm, req.id, req.addr); + ret = 0; break; default: ret = -EINVAL; @@ -2499,7 +2580,7 @@ static int modify_ais_mode(struct kvm *kvm, struct kvm_device_attr *attr) int ret = 0; if (!test_kvm_facility(kvm, 72)) - return -ENOTSUPP; + return -EOPNOTSUPP; if (copy_from_user(&req, (void __user *)attr->addr, sizeof(req))) return -EFAULT; @@ -2579,7 +2660,7 @@ static int flic_ais_mode_set_all(struct kvm *kvm, struct kvm_device_attr *attr) struct kvm_s390_ais_all ais; if (!test_kvm_facility(kvm, 72)) - return -ENOTSUPP; + return -EOPNOTSUPP; if (copy_from_user(&ais, (void __user *)attr->addr, sizeof(ais))) return -EFAULT; @@ -2595,7 +2676,7 @@ static int flic_ais_mode_set_all(struct kvm *kvm, struct kvm_device_attr *attr) static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { int r = 0; - unsigned int i; + unsigned long i; struct kvm_vcpu *vcpu; switch (attr->group) { @@ -2699,19 +2780,15 @@ static unsigned long get_ind_bit(__u64 addr, unsigned long bit_nr, bool swap) return swap ? (bit ^ (BITS_PER_LONG - 1)) : bit; } -static struct s390_map_info *get_map_info(struct s390_io_adapter *adapter, - u64 addr) +static struct page *get_map_page(struct kvm *kvm, u64 uaddr) { - struct s390_map_info *map; + struct page *page = NULL; - if (!adapter) - return NULL; - - list_for_each_entry(map, &adapter->maps, list) { - if (map->guest_addr == addr) - return map; - } - return NULL; + mmap_read_lock(kvm->mm); + get_user_pages_remote(kvm->mm, uaddr, 1, FOLL_WRITE, + &page, NULL); + mmap_read_unlock(kvm->mm); + return page; } static int adapter_indicators_set(struct kvm *kvm, @@ -2720,30 +2797,35 @@ static int adapter_indicators_set(struct kvm *kvm, { unsigned long bit; int summary_set, idx; - struct s390_map_info *info; + struct page *ind_page, *summary_page; void *map; - info = get_map_info(adapter, adapter_int->ind_addr); - if (!info) + ind_page = get_map_page(kvm, adapter_int->ind_addr); + if (!ind_page) return -1; - map = page_address(info->page); - bit = get_ind_bit(info->addr, adapter_int->ind_offset, adapter->swap); - set_bit(bit, map); - idx = srcu_read_lock(&kvm->srcu); - mark_page_dirty(kvm, info->guest_addr >> PAGE_SHIFT); - set_page_dirty_lock(info->page); - info = get_map_info(adapter, adapter_int->summary_addr); - if (!info) { - srcu_read_unlock(&kvm->srcu, idx); + summary_page = get_map_page(kvm, adapter_int->summary_addr); + if (!summary_page) { + put_page(ind_page); return -1; } - map = page_address(info->page); - bit = get_ind_bit(info->addr, adapter_int->summary_offset, - adapter->swap); + + idx = srcu_read_lock(&kvm->srcu); + map = page_address(ind_page); + bit = get_ind_bit(adapter_int->ind_addr, + adapter_int->ind_offset, adapter->swap); + set_bit(bit, map); + mark_page_dirty(kvm, adapter_int->ind_addr >> PAGE_SHIFT); + set_page_dirty_lock(ind_page); + map = page_address(summary_page); + bit = get_ind_bit(adapter_int->summary_addr, + adapter_int->summary_offset, adapter->swap); summary_set = test_and_set_bit(bit, map); - mark_page_dirty(kvm, info->guest_addr >> PAGE_SHIFT); - set_page_dirty_lock(info->page); + mark_page_dirty(kvm, adapter_int->summary_addr >> PAGE_SHIFT); + set_page_dirty_lock(summary_page); srcu_read_unlock(&kvm->srcu, idx); + + put_page(ind_page); + put_page(summary_page); return summary_set ? 0 : 1; } @@ -2765,9 +2847,7 @@ static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e, adapter = get_io_adapter(kvm, e->adapter.adapter_id); if (!adapter) return -1; - down_read(&adapter->maps_lock); ret = adapter_indicators_set(kvm, adapter, &e->adapter); - up_read(&adapter->maps_lock); if ((ret > 0) && !adapter->masked) { ret = kvm_s390_inject_airq(kvm, adapter); if (ret == 0) @@ -2818,23 +2898,27 @@ int kvm_set_routing_entry(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { - int ret; + u64 uaddr; switch (ue->type) { + /* we store the userspace addresses instead of the guest addresses */ case KVM_IRQ_ROUTING_S390_ADAPTER: e->set = set_adapter_int; - e->adapter.summary_addr = ue->u.adapter.summary_addr; - e->adapter.ind_addr = ue->u.adapter.ind_addr; + uaddr = gmap_translate(kvm->arch.gmap, ue->u.adapter.summary_addr); + if (uaddr == -EFAULT) + return -EFAULT; + e->adapter.summary_addr = uaddr; + uaddr = gmap_translate(kvm->arch.gmap, ue->u.adapter.ind_addr); + if (uaddr == -EFAULT) + return -EFAULT; + e->adapter.ind_addr = uaddr; e->adapter.summary_offset = ue->u.adapter.summary_offset; e->adapter.ind_offset = ue->u.adapter.ind_offset; e->adapter.adapter_id = ue->u.adapter.adapter_id; - ret = 0; - break; + return 0; default: - ret = -EINVAL; + return -EINVAL; } - - return ret; } int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, @@ -2983,18 +3067,19 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len) static void __airqs_kick_single_vcpu(struct kvm *kvm, u8 deliverable_mask) { - int vcpu_id, online_vcpus = atomic_read(&kvm->online_vcpus); + int vcpu_idx, online_vcpus = atomic_read(&kvm->online_vcpus); struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; struct kvm_vcpu *vcpu; + u8 vcpu_isc_mask; - for_each_set_bit(vcpu_id, kvm->arch.idle_mask, online_vcpus) { - vcpu = kvm_get_vcpu(kvm, vcpu_id); + for_each_set_bit(vcpu_idx, kvm->arch.idle_mask, online_vcpus) { + vcpu = kvm_get_vcpu(kvm, vcpu_idx); if (psw_ioint_disabled(vcpu)) continue; - deliverable_mask &= (u8)(vcpu->arch.sie_block->gcr[6] >> 24); - if (deliverable_mask) { + vcpu_isc_mask = (u8)(vcpu->arch.sie_block->gcr[6] >> 24); + if (deliverable_mask & vcpu_isc_mask) { /* lately kicked but not yet running */ - if (test_and_set_bit(vcpu_id, gi->kicked_mask)) + if (test_and_set_bit(vcpu_idx, gi->kicked_mask)) return; kvm_s390_vcpu_wakeup(vcpu); return; @@ -3015,7 +3100,7 @@ static enum hrtimer_restart gisa_vcpu_kicker(struct hrtimer *timer) __airqs_kick_single_vcpu(kvm, pending_mask); hrtimer_forward_now(timer, ns_to_ktime(gi->expires)); return HRTIMER_RESTART; - }; + } return HRTIMER_NORESTART; } @@ -3027,9 +3112,9 @@ static enum hrtimer_restart gisa_vcpu_kicker(struct hrtimer *timer) static void process_gib_alert_list(void) { struct kvm_s390_gisa_interrupt *gi; + u32 final, gisa_phys, origin = 0UL; struct kvm_s390_gisa *gisa; struct kvm *kvm; - u32 final, origin = 0UL; do { /* @@ -3055,9 +3140,10 @@ static void process_gib_alert_list(void) * interruptions asap. */ while (origin & GISA_ADDR_MASK) { - gisa = (struct kvm_s390_gisa *)(u64)origin; + gisa_phys = origin; + gisa = phys_to_virt(gisa_phys); origin = gisa->next_alert; - gisa->next_alert = (u32)(u64)gisa; + gisa->next_alert = gisa_phys; kvm = container_of(gisa, struct sie_page2, gisa)->kvm; gi = &kvm->arch.gisa_int; if (hrtimer_active(&gi->timer)) @@ -3091,23 +3177,67 @@ void kvm_s390_gisa_init(struct kvm *kvm) hrtimer_init(&gi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); gi->timer.function = gisa_vcpu_kicker; memset(gi->origin, 0, sizeof(struct kvm_s390_gisa)); - gi->origin->next_alert = (u32)(u64)gi->origin; + gi->origin->next_alert = (u32)virt_to_phys(gi->origin); VM_EVENT(kvm, 3, "gisa 0x%pK initialized", gi->origin); } +void kvm_s390_gisa_enable(struct kvm *kvm) +{ + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + struct kvm_vcpu *vcpu; + unsigned long i; + u32 gisa_desc; + + if (gi->origin) + return; + kvm_s390_gisa_init(kvm); + gisa_desc = kvm_s390_get_gisa_desc(kvm); + if (!gisa_desc) + return; + kvm_for_each_vcpu(i, vcpu, kvm) { + mutex_lock(&vcpu->mutex); + vcpu->arch.sie_block->gd = gisa_desc; + vcpu->arch.sie_block->eca |= ECA_AIV; + VCPU_EVENT(vcpu, 3, "AIV gisa format-%u enabled for cpu %03u", + vcpu->arch.sie_block->gd & 0x3, vcpu->vcpu_id); + mutex_unlock(&vcpu->mutex); + } +} + void kvm_s390_gisa_destroy(struct kvm *kvm) { struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + struct kvm_s390_gisa *gisa = gi->origin; if (!gi->origin) return; - if (gi->alert.mask) - KVM_EVENT(3, "vm 0x%pK has unexpected iam 0x%02x", - kvm, gi->alert.mask); - while (gisa_in_alert_list(gi->origin)) - cpu_relax(); + WARN(gi->alert.mask != 0x00, + "unexpected non zero alert.mask 0x%02x", + gi->alert.mask); + gi->alert.mask = 0x00; + if (gisa_set_iam(gi->origin, gi->alert.mask)) + process_gib_alert_list(); hrtimer_cancel(&gi->timer); gi->origin = NULL; + VM_EVENT(kvm, 3, "gisa 0x%pK destroyed", gisa); +} + +void kvm_s390_gisa_disable(struct kvm *kvm) +{ + struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; + struct kvm_vcpu *vcpu; + unsigned long i; + + if (!gi->origin) + return; + kvm_for_each_vcpu(i, vcpu, kvm) { + mutex_lock(&vcpu->mutex); + vcpu->arch.sie_block->eca &= ~ECA_AIV; + vcpu->arch.sie_block->gd = 0U; + mutex_unlock(&vcpu->mutex); + VCPU_EVENT(vcpu, 3, "AIV disabled for cpu %03u", vcpu->vcpu_id); + } + kvm_s390_gisa_destroy(kvm); } /** @@ -3193,29 +3323,111 @@ out: } EXPORT_SYMBOL_GPL(kvm_s390_gisc_unregister); -static void gib_alert_irq_handler(struct airq_struct *airq, bool floating) +static void aen_host_forward(unsigned long si) +{ + struct kvm_s390_gisa_interrupt *gi; + struct zpci_gaite *gaite; + struct kvm *kvm; + + gaite = (struct zpci_gaite *)aift->gait + + (si * sizeof(struct zpci_gaite)); + if (gaite->count == 0) + return; + if (gaite->aisb != 0) + set_bit_inv(gaite->aisbo, phys_to_virt(gaite->aisb)); + + kvm = kvm_s390_pci_si_to_kvm(aift, si); + if (!kvm) + return; + gi = &kvm->arch.gisa_int; + + if (!(gi->origin->g1.simm & AIS_MODE_MASK(gaite->gisc)) || + !(gi->origin->g1.nimm & AIS_MODE_MASK(gaite->gisc))) { + gisa_set_ipm_gisc(gi->origin, gaite->gisc); + if (hrtimer_active(&gi->timer)) + hrtimer_cancel(&gi->timer); + hrtimer_start(&gi->timer, 0, HRTIMER_MODE_REL); + kvm->stat.aen_forward++; + } +} + +static void aen_process_gait(u8 isc) { + bool found = false, first = true; + union zpci_sic_iib iib = {{0}}; + unsigned long si, flags; + + spin_lock_irqsave(&aift->gait_lock, flags); + + if (!aift->gait) { + spin_unlock_irqrestore(&aift->gait_lock, flags); + return; + } + + for (si = 0;;) { + /* Scan adapter summary indicator bit vector */ + si = airq_iv_scan(aift->sbv, si, airq_iv_end(aift->sbv)); + if (si == -1UL) { + if (first || found) { + /* Re-enable interrupts. */ + zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, isc, + &iib); + first = found = false; + } else { + /* Interrupts on and all bits processed */ + break; + } + found = false; + si = 0; + /* Scan again after re-enabling interrupts */ + continue; + } + found = true; + aen_host_forward(si); + } + + spin_unlock_irqrestore(&aift->gait_lock, flags); +} + +static void gib_alert_irq_handler(struct airq_struct *airq, + struct tpi_info *tpi_info) +{ + struct tpi_adapter_info *info = (struct tpi_adapter_info *)tpi_info; + inc_irq_stat(IRQIO_GAL); - process_gib_alert_list(); + + if ((info->forward || info->error) && + IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) { + aen_process_gait(info->isc); + if (info->aism != 0) + process_gib_alert_list(); + } else { + process_gib_alert_list(); + } } static struct airq_struct gib_alert_irq = { .handler = gib_alert_irq_handler, - .lsi_ptr = &gib_alert_irq.lsi_mask, }; void kvm_s390_gib_destroy(void) { if (!gib) return; + if (kvm_s390_pci_interp_allowed() && aift) { + mutex_lock(&aift->aift_lock); + kvm_s390_pci_aen_exit(); + mutex_unlock(&aift->aift_lock); + } chsc_sgib(0); unregister_adapter_interrupt(&gib_alert_irq); free_page((unsigned long)gib); gib = NULL; } -int kvm_s390_gib_init(u8 nisc) +int __init kvm_s390_gib_init(u8 nisc) { + u32 gib_origin; int rc = 0; if (!css_general_characteristics.aiv) { @@ -3223,7 +3435,7 @@ int kvm_s390_gib_init(u8 nisc) goto out; } - gib = (struct kvm_s390_gib *)get_zeroed_page(GFP_KERNEL | GFP_DMA); + gib = (struct kvm_s390_gib *)get_zeroed_page(GFP_KERNEL_ACCOUNT | GFP_DMA); if (!gib) { rc = -ENOMEM; goto out; @@ -3235,9 +3447,12 @@ int kvm_s390_gib_init(u8 nisc) rc = -EIO; goto out_free_gib; } + /* adapter interrupts used for AP (applicable here) don't use the LSI */ + *gib_alert_irq.lsi_ptr = 0xff; gib->nisc = nisc; - if (chsc_sgib((u32)(u64)gib)) { + gib_origin = virt_to_phys(gib); + if (chsc_sgib(gib_origin)) { pr_err("Associating the GIB with the AIV facility failed\n"); free_page((unsigned long)gib); gib = NULL; @@ -3245,6 +3460,14 @@ int kvm_s390_gib_init(u8 nisc) goto out_unreg_gal; } + if (kvm_s390_pci_interp_allowed()) { + if (kvm_s390_pci_aen_init(nisc)) { + pr_err("Initializing AEN for PCI failed\n"); + rc = -EIO; + goto out_unreg_gal; + } + } + KVM_EVENT(3, "gib 0x%pK (nisc=%d) initialized", gib, gib->nisc); goto out; diff --git a/arch/s390/kvm/irq.h b/arch/s390/kvm/irq.h deleted file mode 100644 index 484608c71dd0..000000000000 --- a/arch/s390/kvm/irq.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * s390 irqchip routines - * - * Copyright IBM Corp. 2014 - * - * Author(s): Cornelia Huck <cornelia.huck@de.ibm.com> - */ -#ifndef __KVM_IRQ_H -#define __KVM_IRQ_H - -#include <linux/kvm_host.h> - -static inline int irqchip_in_kernel(struct kvm *kvm) -{ - return 1; -} - -#endif diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index d9e6bf3d54f0..ea63ac769889 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2,11 +2,10 @@ /* * hosting IBM Z kernel virtual machines (s390x) * - * Copyright IBM Corp. 2008, 2018 + * Copyright IBM Corp. 2008, 2020 * * Author(s): Carsten Otte <cotte@de.ibm.com> * Christian Borntraeger <borntraeger@de.ibm.com> - * Heiko Carstens <heiko.carstens@de.ibm.com> * Christian Ehrhardt <ehrhardt@de.ibm.com> * Jason J. Herne <jjherne@us.ibm.com> */ @@ -31,11 +30,12 @@ #include <linux/bitmap.h> #include <linux/sched/signal.h> #include <linux/string.h> +#include <linux/pgtable.h> +#include <linux/mmu_notifier.h> #include <asm/asm-offsets.h> #include <asm/lowcore.h> #include <asm/stp.h> -#include <asm/pgtable.h> #include <asm/gmap.h> #include <asm/nmi.h> #include <asm/switch_to.h> @@ -44,8 +44,11 @@ #include <asm/cpacf.h> #include <asm/timex.h> #include <asm/ap.h> +#include <asm/uv.h> +#include <asm/fpu/api.h> #include "kvm-s390.h" #include "gaccess.h" +#include "pci.h" #define CREATE_TRACE_POINTS #include "trace.h" @@ -56,118 +59,137 @@ #define VCPU_IRQS_MAX_BUF (sizeof(struct kvm_s390_irq) * \ (KVM_MAX_VCPUS + LOCAL_IRQS)) -#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU -#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM - -struct kvm_stats_debugfs_item debugfs_entries[] = { - { "userspace_handled", VCPU_STAT(exit_userspace) }, - { "exit_null", VCPU_STAT(exit_null) }, - { "exit_validity", VCPU_STAT(exit_validity) }, - { "exit_stop_request", VCPU_STAT(exit_stop_request) }, - { "exit_external_request", VCPU_STAT(exit_external_request) }, - { "exit_io_request", VCPU_STAT(exit_io_request) }, - { "exit_external_interrupt", VCPU_STAT(exit_external_interrupt) }, - { "exit_instruction", VCPU_STAT(exit_instruction) }, - { "exit_pei", VCPU_STAT(exit_pei) }, - { "exit_program_interruption", VCPU_STAT(exit_program_interruption) }, - { "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) }, - { "exit_operation_exception", VCPU_STAT(exit_operation_exception) }, - { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, - { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) }, - { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, - { "halt_no_poll_steal", VCPU_STAT(halt_no_poll_steal) }, - { "halt_wakeup", VCPU_STAT(halt_wakeup) }, - { "instruction_lctlg", VCPU_STAT(instruction_lctlg) }, - { "instruction_lctl", VCPU_STAT(instruction_lctl) }, - { "instruction_stctl", VCPU_STAT(instruction_stctl) }, - { "instruction_stctg", VCPU_STAT(instruction_stctg) }, - { "deliver_ckc", VCPU_STAT(deliver_ckc) }, - { "deliver_cputm", VCPU_STAT(deliver_cputm) }, - { "deliver_emergency_signal", VCPU_STAT(deliver_emergency_signal) }, - { "deliver_external_call", VCPU_STAT(deliver_external_call) }, - { "deliver_service_signal", VCPU_STAT(deliver_service_signal) }, - { "deliver_virtio", VCPU_STAT(deliver_virtio) }, - { "deliver_stop_signal", VCPU_STAT(deliver_stop_signal) }, - { "deliver_prefix_signal", VCPU_STAT(deliver_prefix_signal) }, - { "deliver_restart_signal", VCPU_STAT(deliver_restart_signal) }, - { "deliver_program", VCPU_STAT(deliver_program) }, - { "deliver_io", VCPU_STAT(deliver_io) }, - { "deliver_machine_check", VCPU_STAT(deliver_machine_check) }, - { "exit_wait_state", VCPU_STAT(exit_wait_state) }, - { "inject_ckc", VCPU_STAT(inject_ckc) }, - { "inject_cputm", VCPU_STAT(inject_cputm) }, - { "inject_external_call", VCPU_STAT(inject_external_call) }, - { "inject_float_mchk", VM_STAT(inject_float_mchk) }, - { "inject_emergency_signal", VCPU_STAT(inject_emergency_signal) }, - { "inject_io", VM_STAT(inject_io) }, - { "inject_mchk", VCPU_STAT(inject_mchk) }, - { "inject_pfault_done", VM_STAT(inject_pfault_done) }, - { "inject_program", VCPU_STAT(inject_program) }, - { "inject_restart", VCPU_STAT(inject_restart) }, - { "inject_service_signal", VM_STAT(inject_service_signal) }, - { "inject_set_prefix", VCPU_STAT(inject_set_prefix) }, - { "inject_stop_signal", VCPU_STAT(inject_stop_signal) }, - { "inject_pfault_init", VCPU_STAT(inject_pfault_init) }, - { "inject_virtio", VM_STAT(inject_virtio) }, - { "instruction_epsw", VCPU_STAT(instruction_epsw) }, - { "instruction_gs", VCPU_STAT(instruction_gs) }, - { "instruction_io_other", VCPU_STAT(instruction_io_other) }, - { "instruction_lpsw", VCPU_STAT(instruction_lpsw) }, - { "instruction_lpswe", VCPU_STAT(instruction_lpswe) }, - { "instruction_pfmf", VCPU_STAT(instruction_pfmf) }, - { "instruction_ptff", VCPU_STAT(instruction_ptff) }, - { "instruction_stidp", VCPU_STAT(instruction_stidp) }, - { "instruction_sck", VCPU_STAT(instruction_sck) }, - { "instruction_sckpf", VCPU_STAT(instruction_sckpf) }, - { "instruction_spx", VCPU_STAT(instruction_spx) }, - { "instruction_stpx", VCPU_STAT(instruction_stpx) }, - { "instruction_stap", VCPU_STAT(instruction_stap) }, - { "instruction_iske", VCPU_STAT(instruction_iske) }, - { "instruction_ri", VCPU_STAT(instruction_ri) }, - { "instruction_rrbe", VCPU_STAT(instruction_rrbe) }, - { "instruction_sske", VCPU_STAT(instruction_sske) }, - { "instruction_ipte_interlock", VCPU_STAT(instruction_ipte_interlock) }, - { "instruction_essa", VCPU_STAT(instruction_essa) }, - { "instruction_stsi", VCPU_STAT(instruction_stsi) }, - { "instruction_stfl", VCPU_STAT(instruction_stfl) }, - { "instruction_tb", VCPU_STAT(instruction_tb) }, - { "instruction_tpi", VCPU_STAT(instruction_tpi) }, - { "instruction_tprot", VCPU_STAT(instruction_tprot) }, - { "instruction_tsch", VCPU_STAT(instruction_tsch) }, - { "instruction_sthyi", VCPU_STAT(instruction_sthyi) }, - { "instruction_sie", VCPU_STAT(instruction_sie) }, - { "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) }, - { "instruction_sigp_sense_running", VCPU_STAT(instruction_sigp_sense_running) }, - { "instruction_sigp_external_call", VCPU_STAT(instruction_sigp_external_call) }, - { "instruction_sigp_emergency", VCPU_STAT(instruction_sigp_emergency) }, - { "instruction_sigp_cond_emergency", VCPU_STAT(instruction_sigp_cond_emergency) }, - { "instruction_sigp_start", VCPU_STAT(instruction_sigp_start) }, - { "instruction_sigp_stop", VCPU_STAT(instruction_sigp_stop) }, - { "instruction_sigp_stop_store_status", VCPU_STAT(instruction_sigp_stop_store_status) }, - { "instruction_sigp_store_status", VCPU_STAT(instruction_sigp_store_status) }, - { "instruction_sigp_store_adtl_status", VCPU_STAT(instruction_sigp_store_adtl_status) }, - { "instruction_sigp_set_arch", VCPU_STAT(instruction_sigp_arch) }, - { "instruction_sigp_set_prefix", VCPU_STAT(instruction_sigp_prefix) }, - { "instruction_sigp_restart", VCPU_STAT(instruction_sigp_restart) }, - { "instruction_sigp_cpu_reset", VCPU_STAT(instruction_sigp_cpu_reset) }, - { "instruction_sigp_init_cpu_reset", VCPU_STAT(instruction_sigp_init_cpu_reset) }, - { "instruction_sigp_unknown", VCPU_STAT(instruction_sigp_unknown) }, - { "instruction_diag_10", VCPU_STAT(diagnose_10) }, - { "instruction_diag_44", VCPU_STAT(diagnose_44) }, - { "instruction_diag_9c", VCPU_STAT(diagnose_9c) }, - { "diag_9c_ignored", VCPU_STAT(diagnose_9c_ignored) }, - { "instruction_diag_258", VCPU_STAT(diagnose_258) }, - { "instruction_diag_308", VCPU_STAT(diagnose_308) }, - { "instruction_diag_500", VCPU_STAT(diagnose_500) }, - { "instruction_diag_other", VCPU_STAT(diagnose_other) }, - { NULL } +const struct _kvm_stats_desc kvm_vm_stats_desc[] = { + KVM_GENERIC_VM_STATS(), + STATS_DESC_COUNTER(VM, inject_io), + STATS_DESC_COUNTER(VM, inject_float_mchk), + STATS_DESC_COUNTER(VM, inject_pfault_done), + STATS_DESC_COUNTER(VM, inject_service_signal), + STATS_DESC_COUNTER(VM, inject_virtio), + STATS_DESC_COUNTER(VM, aen_forward), + STATS_DESC_COUNTER(VM, gmap_shadow_reuse), + STATS_DESC_COUNTER(VM, gmap_shadow_create), + STATS_DESC_COUNTER(VM, gmap_shadow_r1_entry), + STATS_DESC_COUNTER(VM, gmap_shadow_r2_entry), + STATS_DESC_COUNTER(VM, gmap_shadow_r3_entry), + STATS_DESC_COUNTER(VM, gmap_shadow_sg_entry), + STATS_DESC_COUNTER(VM, gmap_shadow_pg_entry), }; -struct kvm_s390_tod_clock_ext { - __u8 epoch_idx; - __u64 tod; - __u8 reserved[7]; -} __packed; +const struct kvm_stats_header kvm_vm_stats_header = { + .name_size = KVM_STATS_NAME_SIZE, + .num_desc = ARRAY_SIZE(kvm_vm_stats_desc), + .id_offset = sizeof(struct kvm_stats_header), + .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, + .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + + sizeof(kvm_vm_stats_desc), +}; + +const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { + KVM_GENERIC_VCPU_STATS(), + STATS_DESC_COUNTER(VCPU, exit_userspace), + STATS_DESC_COUNTER(VCPU, exit_null), + STATS_DESC_COUNTER(VCPU, exit_external_request), + STATS_DESC_COUNTER(VCPU, exit_io_request), + STATS_DESC_COUNTER(VCPU, exit_external_interrupt), + STATS_DESC_COUNTER(VCPU, exit_stop_request), + STATS_DESC_COUNTER(VCPU, exit_validity), + STATS_DESC_COUNTER(VCPU, exit_instruction), + STATS_DESC_COUNTER(VCPU, exit_pei), + STATS_DESC_COUNTER(VCPU, halt_no_poll_steal), + STATS_DESC_COUNTER(VCPU, instruction_lctl), + STATS_DESC_COUNTER(VCPU, instruction_lctlg), + STATS_DESC_COUNTER(VCPU, instruction_stctl), + STATS_DESC_COUNTER(VCPU, instruction_stctg), + STATS_DESC_COUNTER(VCPU, exit_program_interruption), + STATS_DESC_COUNTER(VCPU, exit_instr_and_program), + STATS_DESC_COUNTER(VCPU, exit_operation_exception), + STATS_DESC_COUNTER(VCPU, deliver_ckc), + STATS_DESC_COUNTER(VCPU, deliver_cputm), + STATS_DESC_COUNTER(VCPU, deliver_external_call), + STATS_DESC_COUNTER(VCPU, deliver_emergency_signal), + STATS_DESC_COUNTER(VCPU, deliver_service_signal), + STATS_DESC_COUNTER(VCPU, deliver_virtio), + STATS_DESC_COUNTER(VCPU, deliver_stop_signal), + STATS_DESC_COUNTER(VCPU, deliver_prefix_signal), + STATS_DESC_COUNTER(VCPU, deliver_restart_signal), + STATS_DESC_COUNTER(VCPU, deliver_program), + STATS_DESC_COUNTER(VCPU, deliver_io), + STATS_DESC_COUNTER(VCPU, deliver_machine_check), + STATS_DESC_COUNTER(VCPU, exit_wait_state), + STATS_DESC_COUNTER(VCPU, inject_ckc), + STATS_DESC_COUNTER(VCPU, inject_cputm), + STATS_DESC_COUNTER(VCPU, inject_external_call), + STATS_DESC_COUNTER(VCPU, inject_emergency_signal), + STATS_DESC_COUNTER(VCPU, inject_mchk), + STATS_DESC_COUNTER(VCPU, inject_pfault_init), + STATS_DESC_COUNTER(VCPU, inject_program), + STATS_DESC_COUNTER(VCPU, inject_restart), + STATS_DESC_COUNTER(VCPU, inject_set_prefix), + STATS_DESC_COUNTER(VCPU, inject_stop_signal), + STATS_DESC_COUNTER(VCPU, instruction_epsw), + STATS_DESC_COUNTER(VCPU, instruction_gs), + STATS_DESC_COUNTER(VCPU, instruction_io_other), + STATS_DESC_COUNTER(VCPU, instruction_lpsw), + STATS_DESC_COUNTER(VCPU, instruction_lpswe), + STATS_DESC_COUNTER(VCPU, instruction_pfmf), + STATS_DESC_COUNTER(VCPU, instruction_ptff), + STATS_DESC_COUNTER(VCPU, instruction_sck), + STATS_DESC_COUNTER(VCPU, instruction_sckpf), + STATS_DESC_COUNTER(VCPU, instruction_stidp), + STATS_DESC_COUNTER(VCPU, instruction_spx), + STATS_DESC_COUNTER(VCPU, instruction_stpx), + STATS_DESC_COUNTER(VCPU, instruction_stap), + STATS_DESC_COUNTER(VCPU, instruction_iske), + STATS_DESC_COUNTER(VCPU, instruction_ri), + STATS_DESC_COUNTER(VCPU, instruction_rrbe), + STATS_DESC_COUNTER(VCPU, instruction_sske), + STATS_DESC_COUNTER(VCPU, instruction_ipte_interlock), + STATS_DESC_COUNTER(VCPU, instruction_stsi), + STATS_DESC_COUNTER(VCPU, instruction_stfl), + STATS_DESC_COUNTER(VCPU, instruction_tb), + STATS_DESC_COUNTER(VCPU, instruction_tpi), + STATS_DESC_COUNTER(VCPU, instruction_tprot), + STATS_DESC_COUNTER(VCPU, instruction_tsch), + STATS_DESC_COUNTER(VCPU, instruction_sie), + STATS_DESC_COUNTER(VCPU, instruction_essa), + STATS_DESC_COUNTER(VCPU, instruction_sthyi), + STATS_DESC_COUNTER(VCPU, instruction_sigp_sense), + STATS_DESC_COUNTER(VCPU, instruction_sigp_sense_running), + STATS_DESC_COUNTER(VCPU, instruction_sigp_external_call), + STATS_DESC_COUNTER(VCPU, instruction_sigp_emergency), + STATS_DESC_COUNTER(VCPU, instruction_sigp_cond_emergency), + STATS_DESC_COUNTER(VCPU, instruction_sigp_start), + STATS_DESC_COUNTER(VCPU, instruction_sigp_stop), + STATS_DESC_COUNTER(VCPU, instruction_sigp_stop_store_status), + STATS_DESC_COUNTER(VCPU, instruction_sigp_store_status), + STATS_DESC_COUNTER(VCPU, instruction_sigp_store_adtl_status), + STATS_DESC_COUNTER(VCPU, instruction_sigp_arch), + STATS_DESC_COUNTER(VCPU, instruction_sigp_prefix), + STATS_DESC_COUNTER(VCPU, instruction_sigp_restart), + STATS_DESC_COUNTER(VCPU, instruction_sigp_init_cpu_reset), + STATS_DESC_COUNTER(VCPU, instruction_sigp_cpu_reset), + STATS_DESC_COUNTER(VCPU, instruction_sigp_unknown), + STATS_DESC_COUNTER(VCPU, instruction_diagnose_10), + STATS_DESC_COUNTER(VCPU, instruction_diagnose_44), + STATS_DESC_COUNTER(VCPU, instruction_diagnose_9c), + STATS_DESC_COUNTER(VCPU, diag_9c_ignored), + STATS_DESC_COUNTER(VCPU, diag_9c_forward), + STATS_DESC_COUNTER(VCPU, instruction_diagnose_258), + STATS_DESC_COUNTER(VCPU, instruction_diagnose_308), + STATS_DESC_COUNTER(VCPU, instruction_diagnose_500), + STATS_DESC_COUNTER(VCPU, instruction_diagnose_other), + STATS_DESC_COUNTER(VCPU, pfault_sync) +}; + +const struct kvm_stats_header kvm_vcpu_stats_header = { + .name_size = KVM_STATS_NAME_SIZE, + .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc), + .id_offset = sizeof(struct kvm_stats_header), + .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, + .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + + sizeof(kvm_vcpu_stats_desc), +}; /* allow nested virtualization in KVM (if enabled by user space) */ static int nested; @@ -184,6 +206,24 @@ static u8 halt_poll_max_steal = 10; module_param(halt_poll_max_steal, byte, 0644); MODULE_PARM_DESC(halt_poll_max_steal, "Maximum percentage of steal time to allow polling"); +/* if set to true, the GISA will be initialized and used if available */ +static bool use_gisa = true; +module_param(use_gisa, bool, 0644); +MODULE_PARM_DESC(use_gisa, "Use the GISA if the host supports it."); + +/* maximum diag9c forwarding per second */ +unsigned int diag9c_forwarding_hz; +module_param(diag9c_forwarding_hz, uint, 0644); +MODULE_PARM_DESC(diag9c_forwarding_hz, "Maximum diag9c forwarding per second, 0 to turn off"); + +/* + * allow asynchronous deinit for protected guests; enable by default since + * the feature is opt-in anyway + */ +static int async_destroy = 1; +module_param(async_destroy, int, 0444); +MODULE_PARM_DESC(async_destroy, "Asynchronous destroy for protected guests"); + /* * For now we handle at most 16 double words as this is what the s390 base * kernel handles and stores in the prefix page. If we ever need to go beyond @@ -207,7 +247,7 @@ static unsigned long kvm_s390_fac_size(void) BUILD_BUG_ON(SIZE_INTERNAL > S390_ARCH_FAC_MASK_SIZE_U64); BUILD_BUG_ON(SIZE_INTERNAL > S390_ARCH_FAC_LIST_SIZE_U64); BUILD_BUG_ON(SIZE_INTERNAL * sizeof(unsigned long) > - sizeof(S390_lowcore.stfle_fac_list)); + sizeof(stfle_fac_list)); return SIZE_INTERNAL; } @@ -220,21 +260,13 @@ static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc; static struct gmap_notifier gmap_notifier; static struct gmap_notifier vsie_gmap_notifier; debug_info_t *kvm_s390_dbf; +debug_info_t *kvm_s390_dbf_uv; /* Section: not file related */ -int kvm_arch_hardware_enable(void) -{ - /* every s390 is virtualization enabled ;-) */ - return 0; -} - -int kvm_arch_check_processor_compat(void) -{ - return 0; -} - +/* forward declarations */ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, unsigned long end); +static int sca_switch_to_extended(struct kvm *kvm); static void kvm_clock_sync_scb(struct kvm_s390_sie_block *scb, u64 delta) { @@ -269,7 +301,7 @@ static int kvm_clock_sync(struct notifier_block *notifier, unsigned long val, { struct kvm *kvm; struct kvm_vcpu *vcpu; - int i; + unsigned long i; unsigned long long *delta = v; list_for_each_entry(kvm, &vm_list, vm_list) { @@ -293,25 +325,6 @@ static struct notifier_block kvm_clock_notifier = { .notifier_call = kvm_clock_sync, }; -int kvm_arch_hardware_setup(void) -{ - gmap_notifier.notifier_call = kvm_gmap_notifier; - gmap_register_pte_notifier(&gmap_notifier); - vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier; - gmap_register_pte_notifier(&vsie_gmap_notifier); - atomic_notifier_chain_register(&s390_epoch_delta_notifier, - &kvm_clock_notifier); - return 0; -} - -void kvm_arch_hardware_unsetup(void) -{ - gmap_unregister_pte_notifier(&gmap_notifier); - gmap_unregister_pte_notifier(&vsie_gmap_notifier); - atomic_notifier_chain_unregister(&s390_epoch_delta_notifier, - &kvm_clock_notifier); -} - static void allow_cpu_feat(unsigned long nr) { set_bit_inv(nr, kvm_s390_available_cpu_feat); @@ -319,37 +332,37 @@ static void allow_cpu_feat(unsigned long nr) static inline int plo_test_bit(unsigned char nr) { - register unsigned long r0 asm("0") = (unsigned long) nr | 0x100; + unsigned long function = (unsigned long)nr | 0x100; int cc; asm volatile( + " lgr 0,%[function]\n" /* Parameter registers are ignored for "test bit" */ " plo 0,0,0,0(0)\n" " ipm %0\n" " srl %0,28\n" : "=d" (cc) - : "d" (r0) - : "cc"); + : [function] "d" (function) + : "cc", "0"); return cc == 0; } static __always_inline void __insn32_query(unsigned int opcode, u8 *query) { - register unsigned long r0 asm("0") = 0; /* query function */ - register unsigned long r1 asm("1") = (unsigned long) query; - asm volatile( - /* Parameter regs are ignored */ + " lghi 0,0\n" + " lgr 1,%[query]\n" + /* Parameter registers are ignored */ " .insn rrf,%[opc] << 16,2,4,6,0\n" : - : "d" (r0), "a" (r1), [opc] "i" (opcode) - : "cc", "memory"); + : [query] "d" ((unsigned long)query), [opc] "i" (opcode) + : "cc", "memory", "0", "1"); } #define INSN_SORTL 0xb938 #define INSN_DFLTCC 0xb939 -static void kvm_s390_cpu_feat_init(void) +static void __init kvm_s390_cpu_feat_init(void) { int i; @@ -452,7 +465,7 @@ static void kvm_s390_cpu_feat_init(void) */ } -int kvm_arch_init(void *opaque) +static int __init __kvm_s390_init(void) { int rc = -ENOMEM; @@ -460,8 +473,13 @@ int kvm_arch_init(void *opaque) if (!kvm_s390_dbf) return -ENOMEM; - if (debug_register_view(kvm_s390_dbf, &debug_sprintf_view)) - goto out; + kvm_s390_dbf_uv = debug_register("kvm-uv", 32, 1, 7 * sizeof(long)); + if (!kvm_s390_dbf_uv) + goto err_kvm_uv; + + if (debug_register_view(kvm_s390_dbf, &debug_sprintf_view) || + debug_register_view(kvm_s390_dbf_uv, &debug_sprintf_view)) + goto err_debug_view; kvm_s390_cpu_feat_init(); @@ -469,24 +487,54 @@ int kvm_arch_init(void *opaque) rc = kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC); if (rc) { pr_err("A FLIC registration call failed with rc=%d\n", rc); - goto out; + goto err_flic; + } + + if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) { + rc = kvm_s390_pci_init(); + if (rc) { + pr_err("Unable to allocate AIFT for PCI\n"); + goto err_pci; + } } rc = kvm_s390_gib_init(GAL_ISC); if (rc) - goto out; + goto err_gib; + + gmap_notifier.notifier_call = kvm_gmap_notifier; + gmap_register_pte_notifier(&gmap_notifier); + vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier; + gmap_register_pte_notifier(&vsie_gmap_notifier); + atomic_notifier_chain_register(&s390_epoch_delta_notifier, + &kvm_clock_notifier); return 0; -out: - kvm_arch_exit(); +err_gib: + if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) + kvm_s390_pci_exit(); +err_pci: +err_flic: +err_debug_view: + debug_unregister(kvm_s390_dbf_uv); +err_kvm_uv: + debug_unregister(kvm_s390_dbf); return rc; } -void kvm_arch_exit(void) +static void __kvm_s390_exit(void) { + gmap_unregister_pte_notifier(&gmap_notifier); + gmap_unregister_pte_notifier(&vsie_gmap_notifier); + atomic_notifier_chain_unregister(&s390_epoch_delta_notifier, + &kvm_clock_notifier); + kvm_s390_gib_destroy(); + if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) + kvm_s390_pci_exit(); debug_unregister(kvm_s390_dbf); + debug_unregister(kvm_s390_dbf_uv); } /* Section: device related */ @@ -515,7 +563,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ENABLE_CAP: case KVM_CAP_S390_CSS_SUPPORT: case KVM_CAP_IOEVENTFD: - case KVM_CAP_DEVICE_CTRL: case KVM_CAP_S390_IRQCHIP: case KVM_CAP_VM_ATTRIBUTES: case KVM_CAP_MP_STATE: @@ -529,8 +576,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_CMMA_MIGRATION: case KVM_CAP_S390_AIS: case KVM_CAP_S390_AIS_MIGRATION: + case KVM_CAP_S390_VCPU_RESETS: + case KVM_CAP_SET_GUEST_DEBUG: + case KVM_CAP_S390_DIAG318: + case KVM_CAP_IRQFD_RESAMPLE: r = 1; break; + case KVM_CAP_SET_GUEST_DEBUG2: + r = KVM_GUESTDBG_VALID_MASK; + break; case KVM_CAP_S390_HPAGE_1M: r = 0; if (hpage && !kvm_is_ucontrol(kvm)) @@ -539,6 +593,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_MEM_OP: r = MEM_OP_MAX_SIZE; break; + case KVM_CAP_S390_MEM_OP_EXTENSION: + /* + * Flag bits indicating which extensions are supported. + * If r > 0, the base extension must also be supported/indicated, + * in order to maintain backwards compatibility. + */ + r = KVM_S390_MEMOP_EXTENSION_CAP_BASE | + KVM_S390_MEMOP_EXTENSION_CAP_CMPXCHG; + break; case KVM_CAP_NR_VCPUS: case KVM_CAP_MAX_VCPUS: case KVM_CAP_MAX_VCPU_ID: @@ -547,12 +610,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_MAX_VCPUS; else if (sclp.has_esca && sclp.has_64bscao) r = KVM_S390_ESCA_CPU_SLOTS; + if (ext == KVM_CAP_NR_VCPUS) + r = min_t(unsigned int, num_online_cpus(), r); break; case KVM_CAP_S390_COW: r = MACHINE_HAS_ESOP; break; case KVM_CAP_S390_VECTOR_REGISTERS: - r = MACHINE_HAS_VX; + r = test_facility(129); break; case KVM_CAP_S390_RI: r = test_facility(64); @@ -563,14 +628,45 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_BPB: r = test_facility(82); break; + case KVM_CAP_S390_PROTECTED_ASYNC_DISABLE: + r = async_destroy && is_prot_virt_host(); + break; + case KVM_CAP_S390_PROTECTED: + r = is_prot_virt_host(); + break; + case KVM_CAP_S390_PROTECTED_DUMP: { + u64 pv_cmds_dump[] = { + BIT_UVC_CMD_DUMP_INIT, + BIT_UVC_CMD_DUMP_CONFIG_STOR_STATE, + BIT_UVC_CMD_DUMP_CPU, + BIT_UVC_CMD_DUMP_COMPLETE, + }; + int i; + + r = is_prot_virt_host(); + + for (i = 0; i < ARRAY_SIZE(pv_cmds_dump); i++) { + if (!test_bit_inv(pv_cmds_dump[i], + (unsigned long *)&uv_info.inst_calls_list)) { + r = 0; + break; + } + } + break; + } + case KVM_CAP_S390_ZPCI_OP: + r = kvm_s390_pci_interp_allowed(); + break; + case KVM_CAP_S390_CPU_TOPOLOGY: + r = test_facility(11); + break; default: r = 0; } return r; } -static void kvm_s390_sync_dirty_log(struct kvm *kvm, - struct kvm_memory_slot *memslot) +void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { int i; gfn_t cur_gfn, last_gfn; @@ -611,9 +707,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, { int r; unsigned long n; - struct kvm_memslots *slots; struct kvm_memory_slot *memslot; - int is_dirty = 0; + int is_dirty; if (kvm_is_ucontrol(kvm)) return -EINVAL; @@ -624,14 +719,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, if (log->slot >= KVM_USER_MEM_SLOTS) goto out; - slots = kvm_memslots(kvm); - memslot = id_to_memslot(slots, log->slot); - r = -ENOENT; - if (!memslot->dirty_bitmap) - goto out; - - kvm_s390_sync_dirty_log(kvm, memslot); - r = kvm_get_dirty_log(kvm, log, &is_dirty); + r = kvm_get_dirty_log(kvm, log, &is_dirty, &memslot); if (r) goto out; @@ -648,7 +736,7 @@ out: static void icpt_operexc_on_all_vcpus(struct kvm *kvm) { - unsigned int i; + unsigned long i; struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) { @@ -678,7 +766,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) mutex_lock(&kvm->lock); if (kvm->created_vcpus) { r = -EBUSY; - } else if (MACHINE_HAS_VX) { + } else if (cpu_has_vx()) { set_kvm_facility(kvm->arch.model.fac_mask, 129); set_kvm_facility(kvm->arch.model.fac_list, 129); if (test_facility(134)) { @@ -697,6 +785,10 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) set_kvm_facility(kvm->arch.model.fac_mask, 152); set_kvm_facility(kvm->arch.model.fac_list, 152); } + if (test_facility(192)) { + set_kvm_facility(kvm->arch.model.fac_mask, 192); + set_kvm_facility(kvm->arch.model.fac_list, 192); + } r = 0; } else r = -EINVAL; @@ -753,9 +845,9 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) r = -EINVAL; else { r = 0; - down_write(&kvm->mm->mmap_sem); + mmap_write_lock(kvm->mm); kvm->mm->context.allow_gmap_hpage_1m = 1; - up_write(&kvm->mm->mmap_sem); + mmap_write_unlock(kvm->mm); /* * We might have to create fake 4k page * tables. To avoid that the hardware works on @@ -779,6 +871,20 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) icpt_operexc_on_all_vcpus(kvm); r = 0; break; + case KVM_CAP_S390_CPU_TOPOLOGY: + r = -EINVAL; + mutex_lock(&kvm->lock); + if (kvm->created_vcpus) { + r = -EBUSY; + } else if (test_facility(11)) { + set_kvm_facility(kvm->arch.model.fac_mask, 11); + set_kvm_facility(kvm->arch.model.fac_list, 11); + r = 0; + } + mutex_unlock(&kvm->lock); + VM_EVENT(kvm, 3, "ENABLE: CAP_S390_CPU_TOPOLOGY %s", + r ? "(not available)" : "(success)"); + break; default: r = -EINVAL; break; @@ -898,7 +1004,7 @@ static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_crypto_reset_all(struct kvm *kvm) { struct kvm_vcpu *vcpu; - int i; + unsigned long i; kvm_s390_vcpu_block_all(kvm); @@ -981,9 +1087,45 @@ static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr) return 0; } +static void kvm_s390_vcpu_pci_setup(struct kvm_vcpu *vcpu) +{ + /* Only set the ECB bits after guest requests zPCI interpretation */ + if (!vcpu->kvm->arch.use_zpci_interp) + return; + + vcpu->arch.sie_block->ecb2 |= ECB2_ZPCI_LSI; + vcpu->arch.sie_block->ecb3 |= ECB3_AISII + ECB3_AISI; +} + +void kvm_s390_vcpu_pci_enable_interp(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + unsigned long i; + + lockdep_assert_held(&kvm->lock); + + if (!kvm_s390_pci_interp_allowed()) + return; + + /* + * If host is configured for PCI and the necessary facilities are + * available, turn on interpretation for the life of this guest + */ + kvm->arch.use_zpci_interp = 1; + + kvm_s390_vcpu_block_all(kvm); + + kvm_for_each_vcpu(i, vcpu, kvm) { + kvm_s390_vcpu_pci_setup(vcpu); + kvm_s390_sync_request(KVM_REQ_VSIE_RESTART, vcpu); + } + + kvm_s390_vcpu_unblock_all(kvm); +} + static void kvm_s390_sync_request_broadcast(struct kvm *kvm, int req) { - int cx; + unsigned long cx; struct kvm_vcpu *vcpu; kvm_for_each_vcpu(cx, vcpu, kvm) @@ -999,13 +1141,13 @@ static int kvm_s390_vm_start_migration(struct kvm *kvm) struct kvm_memory_slot *ms; struct kvm_memslots *slots; unsigned long ram_pages = 0; - int slotnr; + int bkt; /* migration mode already enabled */ if (kvm->arch.migration_mode) return 0; slots = kvm_memslots(kvm); - if (!slots || !slots->used_slots) + if (!slots || kvm_memslots_empty(slots)) return -EINVAL; if (!kvm->arch.use_cmma) { @@ -1013,8 +1155,7 @@ static int kvm_s390_vm_start_migration(struct kvm *kvm) return 0; } /* mark all the pages in active slots as dirty */ - for (slotnr = 0; slotnr < slots->used_slots; slotnr++) { - ms = slots->memslots + slotnr; + kvm_for_each_memslot(ms, bkt, slots) { if (!ms->dirty_bitmap) return -EINVAL; /* @@ -1081,6 +1222,8 @@ static int kvm_s390_vm_get_migration(struct kvm *kvm, return 0; } +static void __kvm_s390_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod); + static int kvm_s390_set_tod_ext(struct kvm *kvm, struct kvm_device_attr *attr) { struct kvm_s390_vm_tod_clock gtod; @@ -1090,7 +1233,7 @@ static int kvm_s390_set_tod_ext(struct kvm *kvm, struct kvm_device_attr *attr) if (!test_kvm_facility(kvm, 139) && gtod.epoch_idx) return -EINVAL; - kvm_s390_set_tod_clock(kvm, >od); + __kvm_s390_set_tod_clock(kvm, >od); VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x, TOD base: 0x%llx", gtod.epoch_idx, gtod.tod); @@ -1121,7 +1264,7 @@ static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr) sizeof(gtod.tod))) return -EFAULT; - kvm_s390_set_tod_clock(kvm, >od); + __kvm_s390_set_tod_clock(kvm, >od); VM_EVENT(kvm, 3, "SET: TOD base: 0x%llx", gtod.tod); return 0; } @@ -1133,6 +1276,16 @@ static int kvm_s390_set_tod(struct kvm *kvm, struct kvm_device_attr *attr) if (attr->flags) return -EINVAL; + mutex_lock(&kvm->lock); + /* + * For protected guests, the TOD is managed by the ultravisor, so trying + * to change it will never bring the expected results. + */ + if (kvm_s390_pv_is_protected(kvm)) { + ret = -EOPNOTSUPP; + goto out_unlock; + } + switch (attr->attr) { case KVM_S390_VM_TOD_EXT: ret = kvm_s390_set_tod_ext(kvm, attr); @@ -1147,23 +1300,26 @@ static int kvm_s390_set_tod(struct kvm *kvm, struct kvm_device_attr *attr) ret = -ENXIO; break; } + +out_unlock: + mutex_unlock(&kvm->lock); return ret; } static void kvm_s390_get_tod_clock(struct kvm *kvm, struct kvm_s390_vm_tod_clock *gtod) { - struct kvm_s390_tod_clock_ext htod; + union tod_clock clk; preempt_disable(); - get_tod_clock_ext((char *)&htod); + store_tod_clock_ext(&clk); - gtod->tod = htod.tod + kvm->arch.epoch; + gtod->tod = clk.tod + kvm->arch.epoch; gtod->epoch_idx = 0; if (test_kvm_facility(kvm, 139)) { - gtod->epoch_idx = htod.epoch_idx + kvm->arch.epdx; - if (gtod->tod < htod.tod) + gtod->epoch_idx = clk.ei + kvm->arch.epdx; + if (gtod->tod < clk.tod) gtod->epoch_idx += 1; } @@ -1243,7 +1399,7 @@ static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr) ret = -EBUSY; goto out; } - proc = kzalloc(sizeof(*proc), GFP_KERNEL); + proc = kzalloc(sizeof(*proc), GFP_KERNEL_ACCOUNT); if (!proc) { ret = -ENOMEM; goto out; @@ -1295,8 +1451,7 @@ static int kvm_s390_set_processor_feat(struct kvm *kvm, mutex_unlock(&kvm->lock); return -EBUSY; } - bitmap_copy(kvm->arch.cpu_feat, (unsigned long *) data.feat, - KVM_S390_VM_CPU_FEAT_NR_BITS); + bitmap_from_arr64(kvm->arch.cpu_feat, data.feat, KVM_S390_VM_CPU_FEAT_NR_BITS); mutex_unlock(&kvm->lock); VM_EVENT(kvm, 3, "SET: guest feat: 0x%16.16llx.0x%16.16llx.0x%16.16llx", data.feat[0], @@ -1382,6 +1537,39 @@ static int kvm_s390_set_processor_subfunc(struct kvm *kvm, return 0; } +#define KVM_S390_VM_CPU_UV_FEAT_GUEST_MASK \ +( \ + ((struct kvm_s390_vm_cpu_uv_feat){ \ + .ap = 1, \ + .ap_intr = 1, \ + }) \ + .feat \ +) + +static int kvm_s390_set_uv_feat(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_cpu_uv_feat __user *ptr = (void __user *)attr->addr; + unsigned long data, filter; + + filter = uv_info.uv_feature_indications & KVM_S390_VM_CPU_UV_FEAT_GUEST_MASK; + if (get_user(data, &ptr->feat)) + return -EFAULT; + if (!bitmap_subset(&data, &filter, KVM_S390_VM_CPU_UV_FEAT_NR_BITS)) + return -EINVAL; + + mutex_lock(&kvm->lock); + if (kvm->created_vcpus) { + mutex_unlock(&kvm->lock); + return -EBUSY; + } + kvm->arch.model.uv_feat_guest.feat = data; + mutex_unlock(&kvm->lock); + + VM_EVENT(kvm, 3, "SET: guest UV-feat: 0x%16.16lx", data); + + return 0; +} + static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) { int ret = -ENXIO; @@ -1396,6 +1584,9 @@ static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC: ret = kvm_s390_set_processor_subfunc(kvm, attr); break; + case KVM_S390_VM_CPU_PROCESSOR_UV_FEAT_GUEST: + ret = kvm_s390_set_uv_feat(kvm, attr); + break; } return ret; } @@ -1405,7 +1596,7 @@ static int kvm_s390_get_processor(struct kvm *kvm, struct kvm_device_attr *attr) struct kvm_s390_vm_cpu_processor *proc; int ret = 0; - proc = kzalloc(sizeof(*proc), GFP_KERNEL); + proc = kzalloc(sizeof(*proc), GFP_KERNEL_ACCOUNT); if (!proc) { ret = -ENOMEM; goto out; @@ -1433,7 +1624,7 @@ static int kvm_s390_get_machine(struct kvm *kvm, struct kvm_device_attr *attr) struct kvm_s390_vm_cpu_machine *mach; int ret = 0; - mach = kzalloc(sizeof(*mach), GFP_KERNEL); + mach = kzalloc(sizeof(*mach), GFP_KERNEL_ACCOUNT); if (!mach) { ret = -ENOMEM; goto out; @@ -1442,8 +1633,8 @@ static int kvm_s390_get_machine(struct kvm *kvm, struct kvm_device_attr *attr) mach->ibc = sclp.ibc; memcpy(&mach->fac_mask, kvm->arch.model.fac_mask, S390_ARCH_FAC_LIST_SIZE_BYTE); - memcpy((unsigned long *)&mach->fac_list, S390_lowcore.stfle_fac_list, - sizeof(S390_lowcore.stfle_fac_list)); + memcpy((unsigned long *)&mach->fac_list, stfle_fac_list, + sizeof(stfle_fac_list)); VM_EVENT(kvm, 3, "GET: host ibc: 0x%4.4x, host cpuid: 0x%16.16llx", kvm->arch.model.ibc, kvm->arch.model.cpuid); @@ -1467,8 +1658,7 @@ static int kvm_s390_get_processor_feat(struct kvm *kvm, { struct kvm_s390_vm_cpu_feat data; - bitmap_copy((unsigned long *) data.feat, kvm->arch.cpu_feat, - KVM_S390_VM_CPU_FEAT_NR_BITS); + bitmap_to_arr64(data.feat, kvm->arch.cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); if (copy_to_user((void __user *)attr->addr, &data, sizeof(data))) return -EFAULT; VM_EVENT(kvm, 3, "GET: guest feat: 0x%16.16llx.0x%16.16llx.0x%16.16llx", @@ -1483,9 +1673,7 @@ static int kvm_s390_get_machine_feat(struct kvm *kvm, { struct kvm_s390_vm_cpu_feat data; - bitmap_copy((unsigned long *) data.feat, - kvm_s390_available_cpu_feat, - KVM_S390_VM_CPU_FEAT_NR_BITS); + bitmap_to_arr64(data.feat, kvm_s390_available_cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); if (copy_to_user((void __user *)attr->addr, &data, sizeof(data))) return -EFAULT; VM_EVENT(kvm, 3, "GET: host feat: 0x%16.16llx.0x%16.16llx.0x%16.16llx", @@ -1631,6 +1819,33 @@ static int kvm_s390_get_machine_subfunc(struct kvm *kvm, return 0; } +static int kvm_s390_get_processor_uv_feat(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_cpu_uv_feat __user *dst = (void __user *)attr->addr; + unsigned long feat = kvm->arch.model.uv_feat_guest.feat; + + if (put_user(feat, &dst->feat)) + return -EFAULT; + VM_EVENT(kvm, 3, "GET: guest UV-feat: 0x%16.16lx", feat); + + return 0; +} + +static int kvm_s390_get_machine_uv_feat(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_vm_cpu_uv_feat __user *dst = (void __user *)attr->addr; + unsigned long feat; + + BUILD_BUG_ON(sizeof(*dst) != sizeof(uv_info.uv_feature_indications)); + + feat = uv_info.uv_feature_indications & KVM_S390_VM_CPU_UV_FEAT_GUEST_MASK; + if (put_user(feat, &dst->feat)) + return -EFAULT; + VM_EVENT(kvm, 3, "GET: guest UV-feat: 0x%16.16lx", feat); + + return 0; +} + static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) { int ret = -ENXIO; @@ -1654,10 +1869,67 @@ static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_CPU_MACHINE_SUBFUNC: ret = kvm_s390_get_machine_subfunc(kvm, attr); break; + case KVM_S390_VM_CPU_PROCESSOR_UV_FEAT_GUEST: + ret = kvm_s390_get_processor_uv_feat(kvm, attr); + break; + case KVM_S390_VM_CPU_MACHINE_UV_FEAT_GUEST: + ret = kvm_s390_get_machine_uv_feat(kvm, attr); + break; } return ret; } +/** + * kvm_s390_update_topology_change_report - update CPU topology change report + * @kvm: guest KVM description + * @val: set or clear the MTCR bit + * + * Updates the Multiprocessor Topology-Change-Report bit to signal + * the guest with a topology change. + * This is only relevant if the topology facility is present. + * + * The SCA version, bsca or esca, doesn't matter as offset is the same. + */ +static void kvm_s390_update_topology_change_report(struct kvm *kvm, bool val) +{ + union sca_utility new, old; + struct bsca_block *sca; + + read_lock(&kvm->arch.sca_lock); + sca = kvm->arch.sca; + do { + old = READ_ONCE(sca->utility); + new = old; + new.mtcr = val; + } while (cmpxchg(&sca->utility.val, old.val, new.val) != old.val); + read_unlock(&kvm->arch.sca_lock); +} + +static int kvm_s390_set_topo_change_indication(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + if (!test_kvm_facility(kvm, 11)) + return -ENXIO; + + kvm_s390_update_topology_change_report(kvm, !!attr->attr); + return 0; +} + +static int kvm_s390_get_topo_change_indication(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + u8 topo; + + if (!test_kvm_facility(kvm, 11)) + return -ENXIO; + + read_lock(&kvm->arch.sca_lock); + topo = ((struct bsca_block *)kvm->arch.sca)->utility.mtcr; + read_unlock(&kvm->arch.sca_lock); + + return put_user(topo, (u8 __user *)attr->addr); +} + static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr) { int ret; @@ -1678,6 +1950,9 @@ static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_MIGRATION: ret = kvm_s390_vm_set_migration(kvm, attr); break; + case KVM_S390_VM_CPU_TOPOLOGY: + ret = kvm_s390_set_topo_change_indication(kvm, attr); + break; default: ret = -ENXIO; break; @@ -1703,6 +1978,9 @@ static int kvm_s390_vm_get_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_MIGRATION: ret = kvm_s390_vm_get_migration(kvm, attr); break; + case KVM_S390_VM_CPU_TOPOLOGY: + ret = kvm_s390_get_topo_change_indication(kvm, attr); + break; default: ret = -ENXIO; break; @@ -1749,6 +2027,8 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_CPU_MACHINE_FEAT: case KVM_S390_VM_CPU_MACHINE_SUBFUNC: case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC: + case KVM_S390_VM_CPU_MACHINE_UV_FEAT_GUEST: + case KVM_S390_VM_CPU_PROCESSOR_UV_FEAT_GUEST: ret = 0; break; default: @@ -1776,6 +2056,9 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_MIGRATION: ret = 0; break; + case KVM_S390_VM_CPU_TOPOLOGY: + ret = test_kvm_facility(kvm, 11) ? 0 : -ENXIO; + break; default: ret = -ENXIO; break; @@ -1784,7 +2067,7 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) return ret; } -static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) +static int kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) { uint8_t *keys; uint64_t hva; @@ -1801,11 +2084,11 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX) return -EINVAL; - keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL); + keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL_ACCOUNT); if (!keys) return -ENOMEM; - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); srcu_idx = srcu_read_lock(&kvm->srcu); for (i = 0; i < args->count; i++) { hva = gfn_to_hva(kvm, args->start_gfn + i); @@ -1819,7 +2102,7 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) break; } srcu_read_unlock(&kvm->srcu, srcu_idx); - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); if (!r) { r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys, @@ -1832,7 +2115,7 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) return r; } -static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) +static int kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) { uint8_t *keys; uint64_t hva; @@ -1846,7 +2129,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX) return -EINVAL; - keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL); + keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL_ACCOUNT); if (!keys) return -ENOMEM; @@ -1863,7 +2146,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) goto out; i = 0; - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); srcu_idx = srcu_read_lock(&kvm->srcu); while (i < args->count) { unlocked = false; @@ -1881,7 +2164,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) r = set_guest_storage_key(current->mm, hva, keys[i], 0); if (r) { - r = fixup_user_fault(current, current->mm, hva, + r = fixup_user_fault(current->mm, hva, FAULT_FLAG_WRITE, &unlocked); if (r) break; @@ -1890,7 +2173,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) i++; } srcu_read_unlock(&kvm->srcu, srcu_idx); - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); out: kvfree(keys); return r; @@ -1905,38 +2188,6 @@ out: /* for consistency */ #define KVM_S390_CMMA_SIZE_MAX ((u32)KVM_S390_SKEYS_MAX) -/* - * Similar to gfn_to_memslot, but returns the index of a memslot also when the - * address falls in a hole. In that case the index of one of the memslots - * bordering the hole is returned. - */ -static int gfn_to_memslot_approx(struct kvm_memslots *slots, gfn_t gfn) -{ - int start = 0, end = slots->used_slots; - int slot = atomic_read(&slots->lru_slot); - struct kvm_memory_slot *memslots = slots->memslots; - - if (gfn >= memslots[slot].base_gfn && - gfn < memslots[slot].base_gfn + memslots[slot].npages) - return slot; - - while (start < end) { - slot = start + (end - start) / 2; - - if (gfn >= memslots[slot].base_gfn) - end = slot; - else - start = slot + 1; - } - - if (gfn >= memslots[start].base_gfn && - gfn < memslots[start].base_gfn + memslots[start].npages) { - atomic_set(&slots->lru_slot, start); - } - - return start; -} - static int kvm_s390_peek_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, u8 *res, unsigned long bufsize) { @@ -1960,27 +2211,36 @@ static int kvm_s390_peek_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, return 0; } +static struct kvm_memory_slot *gfn_to_memslot_approx(struct kvm_memslots *slots, + gfn_t gfn) +{ + return ____gfn_to_memslot(slots, gfn, true); +} + static unsigned long kvm_s390_next_dirty_cmma(struct kvm_memslots *slots, unsigned long cur_gfn) { - int slotidx = gfn_to_memslot_approx(slots, cur_gfn); - struct kvm_memory_slot *ms = slots->memslots + slotidx; + struct kvm_memory_slot *ms = gfn_to_memslot_approx(slots, cur_gfn); unsigned long ofs = cur_gfn - ms->base_gfn; + struct rb_node *mnode = &ms->gfn_node[slots->node_idx]; if (ms->base_gfn + ms->npages <= cur_gfn) { - slotidx--; + mnode = rb_next(mnode); /* If we are above the highest slot, wrap around */ - if (slotidx < 0) - slotidx = slots->used_slots - 1; + if (!mnode) + mnode = rb_first(&slots->gfn_tree); - ms = slots->memslots + slotidx; + ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]); ofs = 0; } + + if (cur_gfn < ms->base_gfn) + ofs = 0; + ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, ofs); - while ((slotidx > 0) && (ofs >= ms->npages)) { - slotidx--; - ms = slots->memslots + slotidx; - ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, 0); + while (ofs >= ms->npages && (mnode = rb_next(mnode))) { + ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]); + ofs = find_first_bit(kvm_second_dirty_bitmap(ms), ms->npages); } return ms->base_gfn + ofs; } @@ -1992,6 +2252,9 @@ static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, struct kvm_memslots *slots = kvm_memslots(kvm); struct kvm_memory_slot *ms; + if (unlikely(kvm_memslots_empty(slots))) + return 0; + cur_gfn = kvm_s390_next_dirty_cmma(slots, args->start_gfn); ms = gfn_to_memslot(kvm, cur_gfn); args->count = 0; @@ -1999,7 +2262,7 @@ static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, if (!ms) return 0; next_gfn = kvm_s390_next_dirty_cmma(slots, cur_gfn + 1); - mem_end = slots->memslots[0].base_gfn + slots->memslots[0].npages; + mem_end = kvm_s390_get_gfn_end(slots); while (args->count < bufsize) { hva = gfn_to_hva(kvm, cur_gfn); @@ -2073,14 +2336,14 @@ static int kvm_s390_get_cmma_bits(struct kvm *kvm, if (!values) return -ENOMEM; - down_read(&kvm->mm->mmap_sem); + mmap_read_lock(kvm->mm); srcu_idx = srcu_read_lock(&kvm->srcu); if (peek) ret = kvm_s390_peek_cmma(kvm, args, values, bufsize); else ret = kvm_s390_get_cmma(kvm, args, values, bufsize); srcu_read_unlock(&kvm->srcu, srcu_idx); - up_read(&kvm->mm->mmap_sem); + mmap_read_unlock(kvm->mm); if (kvm->arch.migration_mode) args->remaining = atomic64_read(&kvm->arch.cmma_dirty_pages); @@ -2130,7 +2393,7 @@ static int kvm_s390_set_cmma_bits(struct kvm *kvm, goto out; } - down_read(&kvm->mm->mmap_sem); + mmap_read_lock(kvm->mm); srcu_idx = srcu_read_lock(&kvm->srcu); for (i = 0; i < args->count; i++) { hva = gfn_to_hva(kvm, args->start_gfn + i); @@ -2145,20 +2408,579 @@ static int kvm_s390_set_cmma_bits(struct kvm *kvm, set_pgste_bits(kvm->mm, hva, mask, pgstev); } srcu_read_unlock(&kvm->srcu, srcu_idx); - up_read(&kvm->mm->mmap_sem); + mmap_read_unlock(kvm->mm); if (!kvm->mm->context.uses_cmm) { - down_write(&kvm->mm->mmap_sem); + mmap_write_lock(kvm->mm); kvm->mm->context.uses_cmm = 1; - up_write(&kvm->mm->mmap_sem); + mmap_write_unlock(kvm->mm); } out: vfree(bits); return r; } -long kvm_arch_vm_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +/** + * kvm_s390_cpus_from_pv - Convert all protected vCPUs in a protected VM to + * non protected. + * @kvm: the VM whose protected vCPUs are to be converted + * @rc: return value for the RC field of the UVC (in case of error) + * @rrc: return value for the RRC field of the UVC (in case of error) + * + * Does not stop in case of error, tries to convert as many + * CPUs as possible. In case of error, the RC and RRC of the last error are + * returned. + * + * Return: 0 in case of success, otherwise -EIO + */ +int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct kvm_vcpu *vcpu; + unsigned long i; + u16 _rc, _rrc; + int ret = 0; + + /* + * We ignore failures and try to destroy as many CPUs as possible. + * At the same time we must not free the assigned resources when + * this fails, as the ultravisor has still access to that memory. + * So kvm_s390_pv_destroy_cpu can leave a "wanted" memory leak + * behind. + * We want to return the first failure rc and rrc, though. + */ + kvm_for_each_vcpu(i, vcpu, kvm) { + mutex_lock(&vcpu->mutex); + if (kvm_s390_pv_destroy_cpu(vcpu, &_rc, &_rrc) && !ret) { + *rc = _rc; + *rrc = _rrc; + ret = -EIO; + } + mutex_unlock(&vcpu->mutex); + } + /* Ensure that we re-enable gisa if the non-PV guest used it but the PV guest did not. */ + if (use_gisa) + kvm_s390_gisa_enable(kvm); + return ret; +} + +/** + * kvm_s390_cpus_to_pv - Convert all non-protected vCPUs in a protected VM + * to protected. + * @kvm: the VM whose protected vCPUs are to be converted + * @rc: return value for the RC field of the UVC (in case of error) + * @rrc: return value for the RRC field of the UVC (in case of error) + * + * Tries to undo the conversion in case of error. + * + * Return: 0 in case of success, otherwise -EIO + */ +static int kvm_s390_cpus_to_pv(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + unsigned long i; + int r = 0; + u16 dummy; + + struct kvm_vcpu *vcpu; + + /* Disable the GISA if the ultravisor does not support AIV. */ + if (!uv_has_feature(BIT_UV_FEAT_AIV)) + kvm_s390_gisa_disable(kvm); + + kvm_for_each_vcpu(i, vcpu, kvm) { + mutex_lock(&vcpu->mutex); + r = kvm_s390_pv_create_cpu(vcpu, rc, rrc); + mutex_unlock(&vcpu->mutex); + if (r) + break; + } + if (r) + kvm_s390_cpus_from_pv(kvm, &dummy, &dummy); + return r; +} + +/* + * Here we provide user space with a direct interface to query UV + * related data like UV maxima and available features as well as + * feature specific data. + * + * To facilitate future extension of the data structures we'll try to + * write data up to the maximum requested length. + */ +static ssize_t kvm_s390_handle_pv_info(struct kvm_s390_pv_info *info) +{ + ssize_t len_min; + + switch (info->header.id) { + case KVM_PV_INFO_VM: { + len_min = sizeof(info->header) + sizeof(info->vm); + + if (info->header.len_max < len_min) + return -EINVAL; + + memcpy(info->vm.inst_calls_list, + uv_info.inst_calls_list, + sizeof(uv_info.inst_calls_list)); + + /* It's max cpuid not max cpus, so it's off by one */ + info->vm.max_cpus = uv_info.max_guest_cpu_id + 1; + info->vm.max_guests = uv_info.max_num_sec_conf; + info->vm.max_guest_addr = uv_info.max_sec_stor_addr; + info->vm.feature_indication = uv_info.uv_feature_indications; + + return len_min; + } + case KVM_PV_INFO_DUMP: { + len_min = sizeof(info->header) + sizeof(info->dump); + + if (info->header.len_max < len_min) + return -EINVAL; + + info->dump.dump_cpu_buffer_len = uv_info.guest_cpu_stor_len; + info->dump.dump_config_mem_buffer_per_1m = uv_info.conf_dump_storage_state_len; + info->dump.dump_config_finalize_len = uv_info.conf_dump_finalize_len; + return len_min; + } + default: + return -EINVAL; + } +} + +static int kvm_s390_pv_dmp(struct kvm *kvm, struct kvm_pv_cmd *cmd, + struct kvm_s390_pv_dmp dmp) +{ + int r = -EINVAL; + void __user *result_buff = (void __user *)dmp.buff_addr; + + switch (dmp.subcmd) { + case KVM_PV_DUMP_INIT: { + if (kvm->arch.pv.dumping) + break; + + /* + * Block SIE entry as concurrent dump UVCs could lead + * to validities. + */ + kvm_s390_vcpu_block_all(kvm); + + r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_DUMP_INIT, &cmd->rc, &cmd->rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT DUMP INIT: rc %x rrc %x", + cmd->rc, cmd->rrc); + if (!r) { + kvm->arch.pv.dumping = true; + } else { + kvm_s390_vcpu_unblock_all(kvm); + r = -EINVAL; + } + break; + } + case KVM_PV_DUMP_CONFIG_STOR_STATE: { + if (!kvm->arch.pv.dumping) + break; + + /* + * gaddr is an output parameter since we might stop + * early. As dmp will be copied back in our caller, we + * don't need to do it ourselves. + */ + r = kvm_s390_pv_dump_stor_state(kvm, result_buff, &dmp.gaddr, dmp.buff_len, + &cmd->rc, &cmd->rrc); + break; + } + case KVM_PV_DUMP_COMPLETE: { + if (!kvm->arch.pv.dumping) + break; + + r = -EINVAL; + if (dmp.buff_len < uv_info.conf_dump_finalize_len) + break; + + r = kvm_s390_pv_dump_complete(kvm, result_buff, + &cmd->rc, &cmd->rrc); + break; + } + default: + r = -ENOTTY; + break; + } + + return r; +} + +static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) +{ + const bool need_lock = (cmd->cmd != KVM_PV_ASYNC_CLEANUP_PERFORM); + void __user *argp = (void __user *)cmd->data; + int r = 0; + u16 dummy; + + if (need_lock) + mutex_lock(&kvm->lock); + + switch (cmd->cmd) { + case KVM_PV_ENABLE: { + r = -EINVAL; + if (kvm_s390_pv_is_protected(kvm)) + break; + + /* + * FMT 4 SIE needs esca. As we never switch back to bsca from + * esca, we need no cleanup in the error cases below + */ + r = sca_switch_to_extended(kvm); + if (r) + break; + + mmap_write_lock(current->mm); + r = gmap_mark_unmergeable(); + mmap_write_unlock(current->mm); + if (r) + break; + + r = kvm_s390_pv_init_vm(kvm, &cmd->rc, &cmd->rrc); + if (r) + break; + + r = kvm_s390_cpus_to_pv(kvm, &cmd->rc, &cmd->rrc); + if (r) + kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy); + + /* we need to block service interrupts from now on */ + set_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs); + break; + } + case KVM_PV_ASYNC_CLEANUP_PREPARE: + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm) || !async_destroy) + break; + + r = kvm_s390_cpus_from_pv(kvm, &cmd->rc, &cmd->rrc); + /* + * If a CPU could not be destroyed, destroy VM will also fail. + * There is no point in trying to destroy it. Instead return + * the rc and rrc from the first CPU that failed destroying. + */ + if (r) + break; + r = kvm_s390_pv_set_aside(kvm, &cmd->rc, &cmd->rrc); + + /* no need to block service interrupts any more */ + clear_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs); + break; + case KVM_PV_ASYNC_CLEANUP_PERFORM: + r = -EINVAL; + if (!async_destroy) + break; + /* kvm->lock must not be held; this is asserted inside the function. */ + r = kvm_s390_pv_deinit_aside_vm(kvm, &cmd->rc, &cmd->rrc); + break; + case KVM_PV_DISABLE: { + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = kvm_s390_cpus_from_pv(kvm, &cmd->rc, &cmd->rrc); + /* + * If a CPU could not be destroyed, destroy VM will also fail. + * There is no point in trying to destroy it. Instead return + * the rc and rrc from the first CPU that failed destroying. + */ + if (r) + break; + r = kvm_s390_pv_deinit_cleanup_all(kvm, &cmd->rc, &cmd->rrc); + + /* no need to block service interrupts any more */ + clear_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs); + break; + } + case KVM_PV_SET_SEC_PARMS: { + struct kvm_s390_pv_sec_parm parms = {}; + void *hdr; + + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = -EFAULT; + if (copy_from_user(&parms, argp, sizeof(parms))) + break; + + /* Currently restricted to 8KB */ + r = -EINVAL; + if (parms.length > PAGE_SIZE * 2) + break; + + r = -ENOMEM; + hdr = vmalloc(parms.length); + if (!hdr) + break; + + r = -EFAULT; + if (!copy_from_user(hdr, (void __user *)parms.origin, + parms.length)) + r = kvm_s390_pv_set_sec_parms(kvm, hdr, parms.length, + &cmd->rc, &cmd->rrc); + + vfree(hdr); + break; + } + case KVM_PV_UNPACK: { + struct kvm_s390_pv_unp unp = {}; + + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm) || !mm_is_protected(kvm->mm)) + break; + + r = -EFAULT; + if (copy_from_user(&unp, argp, sizeof(unp))) + break; + + r = kvm_s390_pv_unpack(kvm, unp.addr, unp.size, unp.tweak, + &cmd->rc, &cmd->rrc); + break; + } + case KVM_PV_VERIFY: { + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_VERIFY_IMG, &cmd->rc, &cmd->rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT VERIFY: rc %x rrc %x", cmd->rc, + cmd->rrc); + break; + } + case KVM_PV_PREP_RESET: { + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_PREPARE_RESET, &cmd->rc, &cmd->rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT PREP RESET: rc %x rrc %x", + cmd->rc, cmd->rrc); + break; + } + case KVM_PV_UNSHARE_ALL: { + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_SET_UNSHARE_ALL, &cmd->rc, &cmd->rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT UNSHARE: rc %x rrc %x", + cmd->rc, cmd->rrc); + break; + } + case KVM_PV_INFO: { + struct kvm_s390_pv_info info = {}; + ssize_t data_len; + + /* + * No need to check the VM protection here. + * + * Maybe user space wants to query some of the data + * when the VM is still unprotected. If we see the + * need to fence a new data command we can still + * return an error in the info handler. + */ + + r = -EFAULT; + if (copy_from_user(&info, argp, sizeof(info.header))) + break; + + r = -EINVAL; + if (info.header.len_max < sizeof(info.header)) + break; + + data_len = kvm_s390_handle_pv_info(&info); + if (data_len < 0) { + r = data_len; + break; + } + /* + * If a data command struct is extended (multiple + * times) this can be used to determine how much of it + * is valid. + */ + info.header.len_written = data_len; + + r = -EFAULT; + if (copy_to_user(argp, &info, data_len)) + break; + + r = 0; + break; + } + case KVM_PV_DUMP: { + struct kvm_s390_pv_dmp dmp; + + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = -EFAULT; + if (copy_from_user(&dmp, argp, sizeof(dmp))) + break; + + r = kvm_s390_pv_dmp(kvm, cmd, dmp); + if (r) + break; + + if (copy_to_user(argp, &dmp, sizeof(dmp))) { + r = -EFAULT; + break; + } + + break; + } + default: + r = -ENOTTY; + } + if (need_lock) + mutex_unlock(&kvm->lock); + + return r; +} + +static int mem_op_validate_common(struct kvm_s390_mem_op *mop, u64 supported_flags) +{ + if (mop->flags & ~supported_flags || !mop->size) + return -EINVAL; + if (mop->size > MEM_OP_MAX_SIZE) + return -E2BIG; + if (mop->flags & KVM_S390_MEMOP_F_SKEY_PROTECTION) { + if (mop->key > 0xf) + return -EINVAL; + } else { + mop->key = 0; + } + return 0; +} + +static int kvm_s390_vm_mem_op_abs(struct kvm *kvm, struct kvm_s390_mem_op *mop) +{ + void __user *uaddr = (void __user *)mop->buf; + enum gacc_mode acc_mode; + void *tmpbuf = NULL; + int r, srcu_idx; + + r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_SKEY_PROTECTION | + KVM_S390_MEMOP_F_CHECK_ONLY); + if (r) + return r; + + if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) { + tmpbuf = vmalloc(mop->size); + if (!tmpbuf) + return -ENOMEM; + } + + srcu_idx = srcu_read_lock(&kvm->srcu); + + if (kvm_is_error_gpa(kvm, mop->gaddr)) { + r = PGM_ADDRESSING; + goto out_unlock; + } + + acc_mode = mop->op == KVM_S390_MEMOP_ABSOLUTE_READ ? GACC_FETCH : GACC_STORE; + if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { + r = check_gpa_range(kvm, mop->gaddr, mop->size, acc_mode, mop->key); + goto out_unlock; + } + if (acc_mode == GACC_FETCH) { + r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf, + mop->size, GACC_FETCH, mop->key); + if (r) + goto out_unlock; + if (copy_to_user(uaddr, tmpbuf, mop->size)) + r = -EFAULT; + } else { + if (copy_from_user(tmpbuf, uaddr, mop->size)) { + r = -EFAULT; + goto out_unlock; + } + r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf, + mop->size, GACC_STORE, mop->key); + } + +out_unlock: + srcu_read_unlock(&kvm->srcu, srcu_idx); + + vfree(tmpbuf); + return r; +} + +static int kvm_s390_vm_mem_op_cmpxchg(struct kvm *kvm, struct kvm_s390_mem_op *mop) +{ + void __user *uaddr = (void __user *)mop->buf; + void __user *old_addr = (void __user *)mop->old_addr; + union { + __uint128_t quad; + char raw[sizeof(__uint128_t)]; + } old = { .quad = 0}, new = { .quad = 0 }; + unsigned int off_in_quad = sizeof(new) - mop->size; + int r, srcu_idx; + bool success; + + r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_SKEY_PROTECTION); + if (r) + return r; + /* + * This validates off_in_quad. Checking that size is a power + * of two is not necessary, as cmpxchg_guest_abs_with_key + * takes care of that + */ + if (mop->size > sizeof(new)) + return -EINVAL; + if (copy_from_user(&new.raw[off_in_quad], uaddr, mop->size)) + return -EFAULT; + if (copy_from_user(&old.raw[off_in_quad], old_addr, mop->size)) + return -EFAULT; + + srcu_idx = srcu_read_lock(&kvm->srcu); + + if (kvm_is_error_gpa(kvm, mop->gaddr)) { + r = PGM_ADDRESSING; + goto out_unlock; + } + + r = cmpxchg_guest_abs_with_key(kvm, mop->gaddr, mop->size, &old.quad, + new.quad, mop->key, &success); + if (!success && copy_to_user(old_addr, &old.raw[off_in_quad], mop->size)) + r = -EFAULT; + +out_unlock: + srcu_read_unlock(&kvm->srcu, srcu_idx); + return r; +} + +static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop) +{ + /* + * This is technically a heuristic only, if the kvm->lock is not + * taken, it is not guaranteed that the vm is/remains non-protected. + * This is ok from a kernel perspective, wrongdoing is detected + * on the access, -EFAULT is returned and the vm may crash the + * next time it accesses the memory in question. + * There is no sane usecase to do switching and a memop on two + * different CPUs at the same time. + */ + if (kvm_s390_pv_get_handle(kvm)) + return -EINVAL; + + switch (mop->op) { + case KVM_S390_MEMOP_ABSOLUTE_READ: + case KVM_S390_MEMOP_ABSOLUTE_WRITE: + return kvm_s390_vm_mem_op_abs(kvm, mop); + case KVM_S390_MEMOP_ABSOLUTE_CMPXCHG: + return kvm_s390_vm_mem_op_cmpxchg(kvm, mop); + default: + return -EINVAL; + } +} + +int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm *kvm = filp->private_data; void __user *argp = (void __user *)arg; @@ -2254,6 +3076,54 @@ long kvm_arch_vm_ioctl(struct file *filp, mutex_unlock(&kvm->slots_lock); break; } + case KVM_S390_PV_COMMAND: { + struct kvm_pv_cmd args; + + /* protvirt means user cpu state */ + kvm_s390_set_user_cpu_state_ctrl(kvm); + r = 0; + if (!is_prot_virt_host()) { + r = -EINVAL; + break; + } + if (copy_from_user(&args, argp, sizeof(args))) { + r = -EFAULT; + break; + } + if (args.flags) { + r = -EINVAL; + break; + } + /* must be called without kvm->lock */ + r = kvm_s390_handle_pv(kvm, &args); + if (copy_to_user(argp, &args, sizeof(args))) { + r = -EFAULT; + break; + } + break; + } + case KVM_S390_MEM_OP: { + struct kvm_s390_mem_op mem_op; + + if (copy_from_user(&mem_op, argp, sizeof(mem_op)) == 0) + r = kvm_s390_vm_mem_op(kvm, &mem_op); + else + r = -EFAULT; + break; + } + case KVM_S390_ZPCI_OP: { + struct kvm_s390_zpci_op args; + + r = -EINVAL; + if (!IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) + break; + if (copy_from_user(&args, argp, sizeof(args))) { + r = -EFAULT; + break; + } + r = kvm_s390_pci_zpci_op(kvm, &args); + break; + } default: r = -ENOTTY; } @@ -2298,12 +3168,26 @@ static void kvm_s390_set_crycb_format(struct kvm *kvm) kvm->arch.crypto.crycbd |= CRYCB_FORMAT1; } +/* + * kvm_arch_crypto_set_masks + * + * @kvm: pointer to the target guest's KVM struct containing the crypto masks + * to be set. + * @apm: the mask identifying the accessible AP adapters + * @aqm: the mask identifying the accessible AP domains + * @adm: the mask identifying the accessible AP control domains + * + * Set the masks that identify the adapters, domains and control domains to + * which the KVM guest is granted access. + * + * Note: The kvm->lock mutex must be locked by the caller before invoking this + * function. + */ void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm, unsigned long *aqm, unsigned long *adm) { struct kvm_s390_crypto_cb *crycb = kvm->arch.crypto.crycb; - mutex_lock(&kvm->lock); kvm_s390_vcpu_block_all(kvm); switch (kvm->arch.crypto.crycbd & CRYCB_FORMAT_MASK) { @@ -2334,13 +3218,23 @@ void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm, /* recreate the shadow crycb for each vcpu */ kvm_s390_sync_request_broadcast(kvm, KVM_REQ_VSIE_RESTART); kvm_s390_vcpu_unblock_all(kvm); - mutex_unlock(&kvm->lock); } EXPORT_SYMBOL_GPL(kvm_arch_crypto_set_masks); +/* + * kvm_arch_crypto_clear_masks + * + * @kvm: pointer to the target guest's KVM struct containing the crypto masks + * to be cleared. + * + * Clear the masks that identify the adapters, domains and control domains to + * which the KVM guest is granted access. + * + * Note: The kvm->lock mutex must be locked by the caller before invoking this + * function. + */ void kvm_arch_crypto_clear_masks(struct kvm *kvm) { - mutex_lock(&kvm->lock); kvm_s390_vcpu_block_all(kvm); memset(&kvm->arch.crypto.crycb->apcb0, 0, @@ -2352,7 +3246,6 @@ void kvm_arch_crypto_clear_masks(struct kvm *kvm) /* recreate the shadow crycb for each vcpu */ kvm_s390_sync_request_broadcast(kvm, KVM_REQ_VSIE_RESTART); kvm_s390_vcpu_unblock_all(kvm); - mutex_unlock(&kvm->lock); } EXPORT_SYMBOL_GPL(kvm_arch_crypto_clear_masks); @@ -2369,6 +3262,7 @@ static void kvm_s390_crypto_init(struct kvm *kvm) { kvm->arch.crypto.crycb = &kvm->arch.sie_page2->crycb; kvm_s390_set_crycb_format(kvm); + init_rwsem(&kvm->arch.crypto.pqap_hook_rwsem); if (!test_kvm_facility(kvm, 76)) return; @@ -2391,9 +3285,17 @@ static void sca_dispose(struct kvm *kvm) kvm->arch.sca = NULL; } +void kvm_arch_free_vm(struct kvm *kvm) +{ + if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) + kvm_s390_pci_clear_list(kvm); + + __kvm_arch_free_vm(kvm); +} + int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { - gfp_t alloc_flags = GFP_KERNEL; + gfp_t alloc_flags = GFP_KERNEL_ACCOUNT; int i, rc; char debug_name[16]; static unsigned long sca_offset; @@ -2438,7 +3340,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) BUILD_BUG_ON(sizeof(struct sie_page2) != 4096); kvm->arch.sie_page2 = - (struct sie_page2 *) get_zeroed_page(GFP_KERNEL | GFP_DMA); + (struct sie_page2 *) get_zeroed_page(GFP_KERNEL_ACCOUNT | GFP_DMA); if (!kvm->arch.sie_page2) goto out_err; @@ -2446,10 +3348,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.model.fac_list = kvm->arch.sie_page2->fac_list; for (i = 0; i < kvm_s390_fac_size(); i++) { - kvm->arch.model.fac_mask[i] = S390_lowcore.stfle_fac_list[i] & + kvm->arch.model.fac_mask[i] = stfle_fac_list[i] & (kvm_s390_fac_base[i] | kvm_s390_fac_ext[i]); - kvm->arch.model.fac_list[i] = S390_lowcore.stfle_fac_list[i] & + kvm->arch.model.fac_list[i] = stfle_fac_list[i] & kvm_s390_fac_base[i]; } kvm->arch.model.subfuncs = kvm_s390_available_subfunc; @@ -2471,8 +3373,17 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.model.cpuid = kvm_s390_get_initial_cpuid(); kvm->arch.model.ibc = sclp.ibc & 0x0fff; + kvm->arch.model.uv_feat_guest.feat = 0; + kvm_s390_crypto_init(kvm); + if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) { + mutex_lock(&kvm->lock); + kvm_s390_pci_init_list(kvm); + kvm_s390_vcpu_pci_enable_interp(kvm); + mutex_unlock(&kvm->lock); + } + mutex_init(&kvm->arch.float_int.ais_lock); spin_lock_init(&kvm->arch.float_int.lock); for (i = 0; i < FIRQ_LIST_COUNT; i++) @@ -2503,7 +3414,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.use_skf = sclp.has_skey; spin_lock_init(&kvm->arch.start_stop_lock); kvm_s390_vsie_init(kvm); - kvm_s390_gisa_init(kvm); + if (use_gisa) + kvm_s390_gisa_init(kvm); + INIT_LIST_HEAD(&kvm->arch.pv.need_cleanup); + kvm->arch.pv.set_aside = NULL; KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid); return 0; @@ -2517,46 +3431,50 @@ out_err: void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { + u16 rc, rrc; + VCPU_EVENT(vcpu, 3, "%s", "free cpu"); trace_kvm_s390_destroy_vcpu(vcpu->vcpu_id); kvm_s390_clear_local_irqs(vcpu); kvm_clear_async_pf_completion_queue(vcpu); if (!kvm_is_ucontrol(vcpu->kvm)) sca_del_vcpu(vcpu); + kvm_s390_update_topology_change_report(vcpu->kvm, 1); if (kvm_is_ucontrol(vcpu->kvm)) gmap_remove(vcpu->arch.gmap); if (vcpu->kvm->arch.use_cmma) kvm_s390_vcpu_unsetup_cmma(vcpu); + /* We can not hold the vcpu mutex here, we are already dying */ + if (kvm_s390_pv_cpu_get_handle(vcpu)) + kvm_s390_pv_destroy_cpu(vcpu, &rc, &rrc); free_page((unsigned long)(vcpu->arch.sie_block)); - - kvm_vcpu_uninit(vcpu); - kmem_cache_free(kvm_vcpu_cache, vcpu); -} - -static void kvm_free_vcpus(struct kvm *kvm) -{ - unsigned int i; - struct kvm_vcpu *vcpu; - - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_arch_vcpu_destroy(vcpu); - - mutex_lock(&kvm->lock); - for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) - kvm->vcpus[i] = NULL; - - atomic_set(&kvm->online_vcpus, 0); - mutex_unlock(&kvm->lock); } void kvm_arch_destroy_vm(struct kvm *kvm) { - kvm_free_vcpus(kvm); + u16 rc, rrc; + + kvm_destroy_vcpus(kvm); sca_dispose(kvm); - debug_unregister(kvm->arch.dbf); kvm_s390_gisa_destroy(kvm); + /* + * We are already at the end of life and kvm->lock is not taken. + * This is ok as the file descriptor is closed by now and nobody + * can mess with the pv state. + */ + kvm_s390_pv_deinit_cleanup_all(kvm, &rc, &rrc); + /* + * Remove the mmu notifier only when the whole KVM VM is torn down, + * and only if one was registered to begin with. If the VM is + * currently not protected, but has been previously been protected, + * then it's possible that the notifier is still registered. + */ + if (kvm->arch.pv.mmu_notifier.ops) + mmu_notifier_unregister(&kvm->arch.pv.mmu_notifier, kvm->mm); + + debug_unregister(kvm->arch.dbf); free_page((unsigned long)kvm->arch.sie_page2); if (!kvm_is_ucontrol(kvm)) gmap_remove(kvm->arch.gmap); @@ -2599,28 +3517,30 @@ static void sca_del_vcpu(struct kvm_vcpu *vcpu) static void sca_add_vcpu(struct kvm_vcpu *vcpu) { if (!kvm_s390_use_sca_entries()) { - struct bsca_block *sca = vcpu->kvm->arch.sca; + phys_addr_t sca_phys = virt_to_phys(vcpu->kvm->arch.sca); /* we still need the basic sca for the ipte control */ - vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32); - vcpu->arch.sie_block->scaol = (__u32)(__u64)sca; + vcpu->arch.sie_block->scaoh = sca_phys >> 32; + vcpu->arch.sie_block->scaol = sca_phys; return; } read_lock(&vcpu->kvm->arch.sca_lock); if (vcpu->kvm->arch.use_esca) { struct esca_block *sca = vcpu->kvm->arch.sca; + phys_addr_t sca_phys = virt_to_phys(sca); - sca->cpu[vcpu->vcpu_id].sda = (__u64) vcpu->arch.sie_block; - vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32); - vcpu->arch.sie_block->scaol = (__u32)(__u64)sca & ~0x3fU; + sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block); + vcpu->arch.sie_block->scaoh = sca_phys >> 32; + vcpu->arch.sie_block->scaol = sca_phys & ESCA_SCAOL_MASK; vcpu->arch.sie_block->ecb2 |= ECB2_ESCA; set_bit_inv(vcpu->vcpu_id, (unsigned long *) sca->mcn); } else { struct bsca_block *sca = vcpu->kvm->arch.sca; + phys_addr_t sca_phys = virt_to_phys(sca); - sca->cpu[vcpu->vcpu_id].sda = (__u64) vcpu->arch.sie_block; - vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32); - vcpu->arch.sie_block->scaol = (__u32)(__u64)sca; + sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block); + vcpu->arch.sie_block->scaoh = sca_phys >> 32; + vcpu->arch.sie_block->scaol = sca_phys; set_bit_inv(vcpu->vcpu_id, (unsigned long *) &sca->mcn); } read_unlock(&vcpu->kvm->arch.sca_lock); @@ -2649,15 +3569,20 @@ static int sca_switch_to_extended(struct kvm *kvm) struct bsca_block *old_sca = kvm->arch.sca; struct esca_block *new_sca; struct kvm_vcpu *vcpu; - unsigned int vcpu_idx; + unsigned long vcpu_idx; u32 scaol, scaoh; + phys_addr_t new_sca_phys; - new_sca = alloc_pages_exact(sizeof(*new_sca), GFP_KERNEL|__GFP_ZERO); + if (kvm->arch.use_esca) + return 0; + + new_sca = alloc_pages_exact(sizeof(*new_sca), GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!new_sca) return -ENOMEM; - scaoh = (u32)((u64)(new_sca) >> 32); - scaol = (u32)(u64)(new_sca) & ~0x3fU; + new_sca_phys = virt_to_phys(new_sca); + scaoh = new_sca_phys >> 32; + scaol = new_sca_phys & ESCA_SCAOL_MASK; kvm_s390_vcpu_block_all(kvm); write_lock(&kvm->arch.sca_lock); @@ -2696,46 +3621,11 @@ static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id) if (!sclp.has_esca || !sclp.has_64bscao) return false; - mutex_lock(&kvm->lock); rc = kvm->arch.use_esca ? 0 : sca_switch_to_extended(kvm); - mutex_unlock(&kvm->lock); return rc == 0 && id < KVM_S390_ESCA_CPU_SLOTS; } -int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) -{ - vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; - kvm_clear_async_pf_completion_queue(vcpu); - vcpu->run->kvm_valid_regs = KVM_SYNC_PREFIX | - KVM_SYNC_GPRS | - KVM_SYNC_ACRS | - KVM_SYNC_CRS | - KVM_SYNC_ARCH0 | - KVM_SYNC_PFAULT; - kvm_s390_set_prefix(vcpu, 0); - if (test_kvm_facility(vcpu->kvm, 64)) - vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB; - if (test_kvm_facility(vcpu->kvm, 82)) - vcpu->run->kvm_valid_regs |= KVM_SYNC_BPBC; - if (test_kvm_facility(vcpu->kvm, 133)) - vcpu->run->kvm_valid_regs |= KVM_SYNC_GSCB; - if (test_kvm_facility(vcpu->kvm, 156)) - vcpu->run->kvm_valid_regs |= KVM_SYNC_ETOKEN; - /* fprs can be synchronized via vrs, even if the guest has no vx. With - * MACHINE_HAS_VX, (load|store)_fpu_regs() will work with vrs format. - */ - if (MACHINE_HAS_VX) - vcpu->run->kvm_valid_regs |= KVM_SYNC_VRS; - else - vcpu->run->kvm_valid_regs |= KVM_SYNC_FPRS; - - if (kvm_is_ucontrol(vcpu->kvm)) - return __kvm_ucontrol_vcpu_init(vcpu); - - return 0; -} - /* needs disabled preemption to protect from TOD sync and vcpu_load/put */ static void __start_cpu_timer_accounting(struct kvm_vcpu *vcpu) { @@ -2844,35 +3734,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) } -static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu) -{ - /* this equals initial cpu reset in pop, but we don't switch to ESA */ - vcpu->arch.sie_block->gpsw.mask = 0UL; - vcpu->arch.sie_block->gpsw.addr = 0UL; - kvm_s390_set_prefix(vcpu, 0); - kvm_s390_set_cpu_timer(vcpu, 0); - vcpu->arch.sie_block->ckc = 0UL; - vcpu->arch.sie_block->todpr = 0; - memset(vcpu->arch.sie_block->gcr, 0, 16 * sizeof(__u64)); - vcpu->arch.sie_block->gcr[0] = CR0_UNUSED_56 | - CR0_INTERRUPT_KEY_SUBMASK | - CR0_MEASUREMENT_ALERT_SUBMASK; - vcpu->arch.sie_block->gcr[14] = CR14_UNUSED_32 | - CR14_UNUSED_33 | - CR14_EXTERNAL_DAMAGE_SUBMASK; - /* make sure the new fpc will be lazily loaded */ - save_fpu_regs(); - current->thread.fpu.fpc = 0; - vcpu->arch.sie_block->gbea = 1; - vcpu->arch.sie_block->pp = 0; - vcpu->arch.sie_block->fpf &= ~FPF_BPBC; - vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; - kvm_clear_async_pf_completion_queue(vcpu); - if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) - kvm_s390_vcpu_stop(vcpu); - kvm_s390_clear_local_irqs(vcpu); -} - void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { mutex_lock(&vcpu->kvm->lock); @@ -2941,15 +3802,18 @@ static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu) void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu) { - free_page(vcpu->arch.sie_block->cbrlo); + free_page((unsigned long)phys_to_virt(vcpu->arch.sie_block->cbrlo)); vcpu->arch.sie_block->cbrlo = 0; } int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu) { - vcpu->arch.sie_block->cbrlo = get_zeroed_page(GFP_KERNEL); - if (!vcpu->arch.sie_block->cbrlo) + void *cbrlo_page = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + + if (!cbrlo_page) return -ENOMEM; + + vcpu->arch.sie_block->cbrlo = virt_to_phys(cbrlo_page); return 0; } @@ -2959,12 +3823,13 @@ static void kvm_s390_vcpu_setup_model(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->ibc = model->ibc; if (test_kvm_facility(vcpu->kvm, 7)) - vcpu->arch.sie_block->fac = (u32)(u64) model->fac_list; + vcpu->arch.sie_block->fac = virt_to_phys(model->fac_list); } -int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) +static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) { int rc = 0; + u16 uvrc, uvrrc; atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH | CPUSTAT_SM | @@ -2982,8 +3847,12 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->ecb |= ECB_HOSTPROTINT; if (test_kvm_facility(vcpu->kvm, 9)) vcpu->arch.sie_block->ecb |= ECB_SRSI; + if (test_kvm_facility(vcpu->kvm, 11)) + vcpu->arch.sie_block->ecb |= ECB_PTF; if (test_kvm_facility(vcpu->kvm, 73)) vcpu->arch.sie_block->ecb |= ECB_TE; + if (!kvm_is_ucontrol(vcpu->kvm)) + vcpu->arch.sie_block->ecb |= ECB_SPECI; if (test_kvm_facility(vcpu->kvm, 8) && vcpu->kvm->arch.use_pfmfi) vcpu->arch.sie_block->ecb2 |= ECB2_PFMFI; @@ -3011,9 +3880,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) VCPU_EVENT(vcpu, 3, "AIV gisa format-%u enabled for cpu %03u", vcpu->arch.sie_block->gd & 0x3, vcpu->vcpu_id); } - vcpu->arch.sie_block->sdnxo = ((unsigned long) &vcpu->run->s.regs.sdnx) - | SDNXC; - vcpu->arch.sie_block->riccbd = (unsigned long) &vcpu->run->s.regs.riccb; + vcpu->arch.sie_block->sdnxo = virt_to_phys(&vcpu->run->s.regs.sdnx) | SDNXC; + vcpu->arch.sie_block->riccbd = virt_to_phys(&vcpu->run->s.regs.riccb); if (sclp.has_kss) kvm_s390_set_cpuflags(vcpu, CPUSTAT_KSS); @@ -3032,62 +3900,102 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) kvm_s390_vcpu_crypto_setup(vcpu); + kvm_s390_vcpu_pci_setup(vcpu); + + mutex_lock(&vcpu->kvm->lock); + if (kvm_s390_pv_is_protected(vcpu->kvm)) { + rc = kvm_s390_pv_create_cpu(vcpu, &uvrc, &uvrrc); + if (rc) + kvm_s390_vcpu_unsetup_cmma(vcpu); + } + mutex_unlock(&vcpu->kvm->lock); + return rc; } -struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, - unsigned int id) +int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) { - struct kvm_vcpu *vcpu; - struct sie_page *sie_page; - int rc = -EINVAL; - if (!kvm_is_ucontrol(kvm) && !sca_can_add_vcpu(kvm, id)) - goto out; - - rc = -ENOMEM; + return -EINVAL; + return 0; +} - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); - if (!vcpu) - goto out; +int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) +{ + struct sie_page *sie_page; + int rc; BUILD_BUG_ON(sizeof(struct sie_page) != 4096); - sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL); + sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!sie_page) - goto out_free_cpu; + return -ENOMEM; vcpu->arch.sie_block = &sie_page->sie_block; - vcpu->arch.sie_block->itdba = (unsigned long) &sie_page->itdb; + vcpu->arch.sie_block->itdba = virt_to_phys(&sie_page->itdb); /* the real guest size will always be smaller than msl */ vcpu->arch.sie_block->mso = 0; vcpu->arch.sie_block->msl = sclp.hamax; - vcpu->arch.sie_block->icpua = id; + vcpu->arch.sie_block->icpua = vcpu->vcpu_id; spin_lock_init(&vcpu->arch.local_int.lock); - vcpu->arch.sie_block->gd = (u32)(u64)kvm->arch.gisa_int.origin; - if (vcpu->arch.sie_block->gd && sclp.has_gisaf) - vcpu->arch.sie_block->gd |= GISA_FORMAT1; + vcpu->arch.sie_block->gd = kvm_s390_get_gisa_desc(vcpu->kvm); seqcount_init(&vcpu->arch.cputm_seqcount); - rc = kvm_vcpu_init(vcpu, kvm, id); + vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; + kvm_clear_async_pf_completion_queue(vcpu); + vcpu->run->kvm_valid_regs = KVM_SYNC_PREFIX | + KVM_SYNC_GPRS | + KVM_SYNC_ACRS | + KVM_SYNC_CRS | + KVM_SYNC_ARCH0 | + KVM_SYNC_PFAULT | + KVM_SYNC_DIAG318; + kvm_s390_set_prefix(vcpu, 0); + if (test_kvm_facility(vcpu->kvm, 64)) + vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB; + if (test_kvm_facility(vcpu->kvm, 82)) + vcpu->run->kvm_valid_regs |= KVM_SYNC_BPBC; + if (test_kvm_facility(vcpu->kvm, 133)) + vcpu->run->kvm_valid_regs |= KVM_SYNC_GSCB; + if (test_kvm_facility(vcpu->kvm, 156)) + vcpu->run->kvm_valid_regs |= KVM_SYNC_ETOKEN; + /* fprs can be synchronized via vrs, even if the guest has no vx. With + * cpu_has_vx(), (load|store)_fpu_regs() will work with vrs format. + */ + if (cpu_has_vx()) + vcpu->run->kvm_valid_regs |= KVM_SYNC_VRS; + else + vcpu->run->kvm_valid_regs |= KVM_SYNC_FPRS; + + if (kvm_is_ucontrol(vcpu->kvm)) { + rc = __kvm_ucontrol_vcpu_init(vcpu); + if (rc) + goto out_free_sie_block; + } + + VM_EVENT(vcpu->kvm, 3, "create cpu %d at 0x%pK, sie block at 0x%pK", + vcpu->vcpu_id, vcpu, vcpu->arch.sie_block); + trace_kvm_s390_create_vcpu(vcpu->vcpu_id, vcpu, vcpu->arch.sie_block); + + rc = kvm_s390_vcpu_setup(vcpu); if (rc) - goto out_free_sie_block; - VM_EVENT(kvm, 3, "create cpu %d at 0x%pK, sie block at 0x%pK", id, vcpu, - vcpu->arch.sie_block); - trace_kvm_s390_create_vcpu(id, vcpu, vcpu->arch.sie_block); + goto out_ucontrol_uninit; - return vcpu; + kvm_s390_update_topology_change_report(vcpu->kvm, 1); + return 0; + +out_ucontrol_uninit: + if (kvm_is_ucontrol(vcpu->kvm)) + gmap_remove(vcpu->arch.gmap); out_free_sie_block: free_page((unsigned long)(vcpu->arch.sie_block)); -out_free_cpu: - kmem_cache_free(kvm_vcpu_cache, vcpu); -out: - return ERR_PTR(rc); + return rc; } int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { + clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.gisa_int.kicked_mask); return kvm_s390_vcpu_has_irq(vcpu, 0); } @@ -3139,7 +4047,7 @@ void exit_sie(struct kvm_vcpu *vcpu) /* Kick a guest cpu out of SIE to process a request synchronously */ void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu) { - kvm_make_request(req, vcpu); + __kvm_make_request(req, vcpu); kvm_s390_vcpu_request(vcpu); } @@ -3149,7 +4057,9 @@ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, struct kvm *kvm = gmap->private; struct kvm_vcpu *vcpu; unsigned long prefix; - int i; + unsigned long i; + + trace_kvm_s390_gmap_notifier(start, end, gmap_is_shadow(gmap)); if (gmap_is_shadow(gmap)) return; @@ -3162,7 +4072,7 @@ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, if (prefix <= end && start <= prefix + 2*PAGE_SIZE - 1) { VCPU_EVENT(vcpu, 2, "gmap notifier for %lx-%lx", start, end); - kvm_s390_sync_request(KVM_REQ_MMU_RELOAD, vcpu); + kvm_s390_sync_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu); } } } @@ -3171,7 +4081,7 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) { /* do not poll with more than halt_poll_max_steal percent of steal time */ if (S390_lowcore.avg_steal_timer * 100 / (TICK_USEC << 12) >= - halt_poll_max_steal) { + READ_ONCE(halt_poll_max_steal)) { vcpu->stat.halt_no_poll_steal++; return true; } @@ -3287,10 +4197,76 @@ static int kvm_arch_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, return r; } -static int kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu) +static void kvm_arch_vcpu_ioctl_normal_reset(struct kvm_vcpu *vcpu) { - kvm_s390_vcpu_initial_reset(vcpu); - return 0; + vcpu->arch.sie_block->gpsw.mask &= ~PSW_MASK_RI; + vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; + memset(vcpu->run->s.regs.riccb, 0, sizeof(vcpu->run->s.regs.riccb)); + + kvm_clear_async_pf_completion_queue(vcpu); + if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) + kvm_s390_vcpu_stop(vcpu); + kvm_s390_clear_local_irqs(vcpu); +} + +static void kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu) +{ + /* Initial reset is a superset of the normal reset */ + kvm_arch_vcpu_ioctl_normal_reset(vcpu); + + /* + * This equals initial cpu reset in pop, but we don't switch to ESA. + * We do not only reset the internal data, but also ... + */ + vcpu->arch.sie_block->gpsw.mask = 0; + vcpu->arch.sie_block->gpsw.addr = 0; + kvm_s390_set_prefix(vcpu, 0); + kvm_s390_set_cpu_timer(vcpu, 0); + vcpu->arch.sie_block->ckc = 0; + memset(vcpu->arch.sie_block->gcr, 0, sizeof(vcpu->arch.sie_block->gcr)); + vcpu->arch.sie_block->gcr[0] = CR0_INITIAL_MASK; + vcpu->arch.sie_block->gcr[14] = CR14_INITIAL_MASK; + + /* ... the data in sync regs */ + memset(vcpu->run->s.regs.crs, 0, sizeof(vcpu->run->s.regs.crs)); + vcpu->run->s.regs.ckc = 0; + vcpu->run->s.regs.crs[0] = CR0_INITIAL_MASK; + vcpu->run->s.regs.crs[14] = CR14_INITIAL_MASK; + vcpu->run->psw_addr = 0; + vcpu->run->psw_mask = 0; + vcpu->run->s.regs.todpr = 0; + vcpu->run->s.regs.cputm = 0; + vcpu->run->s.regs.ckc = 0; + vcpu->run->s.regs.pp = 0; + vcpu->run->s.regs.gbea = 1; + vcpu->run->s.regs.fpc = 0; + /* + * Do not reset these registers in the protected case, as some of + * them are overlaid and they are not accessible in this case + * anyway. + */ + if (!kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->gbea = 1; + vcpu->arch.sie_block->pp = 0; + vcpu->arch.sie_block->fpf &= ~FPF_BPBC; + vcpu->arch.sie_block->todpr = 0; + } +} + +static void kvm_arch_vcpu_ioctl_clear_reset(struct kvm_vcpu *vcpu) +{ + struct kvm_sync_regs *regs = &vcpu->run->s.regs; + + /* Clear reset is a superset of the initial reset */ + kvm_arch_vcpu_ioctl_initial_reset(vcpu); + + memset(®s->gprs, 0, sizeof(regs->gprs)); + memset(®s->vrs, 0, sizeof(regs->vrs)); + memset(®s->acrs, 0, sizeof(regs->acrs)); + memset(®s->gscb, 0, sizeof(regs->gscb)); + + regs->etoken = 0; + regs->etoken_extension = 0; } int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) @@ -3339,18 +4315,13 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) vcpu_load(vcpu); - if (test_fp_ctl(fpu->fpc)) { - ret = -EINVAL; - goto out; - } vcpu->run->s.regs.fpc = fpu->fpc; - if (MACHINE_HAS_VX) + if (cpu_has_vx()) convert_fp_to_vx((__vector128 *) vcpu->run->s.regs.vrs, (freg_t *) fpu->fprs); else memcpy(vcpu->run->s.regs.fprs, &fpu->fprs, sizeof(fpu->fprs)); -out: vcpu_put(vcpu); return ret; } @@ -3359,9 +4330,7 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { vcpu_load(vcpu); - /* make sure we have the latest values */ - save_fpu_regs(); - if (MACHINE_HAS_VX) + if (cpu_has_vx()) convert_vx_to_fp((freg_t *) fpu->fprs, (__vector128 *) vcpu->run->s.regs.vrs); else @@ -3460,18 +4429,24 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, vcpu_load(vcpu); /* user space knows about this interface - let it control the state */ - vcpu->kvm->arch.user_cpu_state_ctrl = 1; + kvm_s390_set_user_cpu_state_ctrl(vcpu->kvm); switch (mp_state->mp_state) { case KVM_MP_STATE_STOPPED: - kvm_s390_vcpu_stop(vcpu); + rc = kvm_s390_vcpu_stop(vcpu); break; case KVM_MP_STATE_OPERATING: - kvm_s390_vcpu_start(vcpu); + rc = kvm_s390_vcpu_start(vcpu); break; case KVM_MP_STATE_LOAD: + if (!kvm_s390_pv_cpu_is_protected(vcpu)) { + rc = -ENXIO; + break; + } + rc = kvm_s390_pv_set_cpu_state(vcpu, PV_CPU_STATE_OPR_LOAD); + break; case KVM_MP_STATE_CHECK_STOP: - /* fall through - CHECK_STOP and LOAD are not supported yet */ + fallthrough; /* CHECK_STOP and LOAD are not supported yet */ default: rc = -ENXIO; } @@ -3492,19 +4467,19 @@ retry: if (!kvm_request_pending(vcpu)) return 0; /* - * We use MMU_RELOAD just to re-arm the ipte notifier for the + * If the guest prefix changed, re-arm the ipte notifier for the * guest prefix page. gmap_mprotect_notify will wait on the ptl lock. * This ensures that the ipte instruction for this request has * already finished. We might race against a second unmapper that * wants to set the blocking bit. Lets just retry the request loop. */ - if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) { + if (kvm_check_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu)) { int rc; rc = gmap_mprotect_notify(vcpu->arch.gmap, kvm_s390_get_prefix(vcpu), PAGE_SIZE * 2, PROT_WRITE); if (rc) { - kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); + kvm_make_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu); return rc; } goto retry; @@ -3557,30 +4532,26 @@ retry: goto retry; } - /* nothing to do, just clear the request */ - kvm_clear_request(KVM_REQ_UNHALT, vcpu); /* we left the vsie handler, nothing to do, just clear the request */ kvm_clear_request(KVM_REQ_VSIE_RESTART, vcpu); return 0; } -void kvm_s390_set_tod_clock(struct kvm *kvm, - const struct kvm_s390_vm_tod_clock *gtod) +static void __kvm_s390_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod) { struct kvm_vcpu *vcpu; - struct kvm_s390_tod_clock_ext htod; - int i; + union tod_clock clk; + unsigned long i; - mutex_lock(&kvm->lock); preempt_disable(); - get_tod_clock_ext((char *)&htod); + store_tod_clock_ext(&clk); - kvm->arch.epoch = gtod->tod - htod.tod; + kvm->arch.epoch = gtod->tod - clk.tod; kvm->arch.epdx = 0; if (test_kvm_facility(kvm, 139)) { - kvm->arch.epdx = gtod->epoch_idx - htod.epoch_idx; + kvm->arch.epdx = gtod->epoch_idx - clk.ei; if (kvm->arch.epoch > gtod->tod) kvm->arch.epdx -= 1; } @@ -3593,7 +4564,15 @@ void kvm_s390_set_tod_clock(struct kvm *kvm, kvm_s390_vcpu_unblock_all(kvm); preempt_enable(); +} + +int kvm_s390_try_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod) +{ + if (!mutex_trylock(&kvm->lock)) + return 0; + __kvm_s390_set_tod_clock(kvm, gtod); mutex_unlock(&kvm->lock); + return 1; } /** @@ -3629,11 +4608,13 @@ static void __kvm_inject_pfault_token(struct kvm_vcpu *vcpu, bool start_token, } } -void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, +bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { trace_kvm_s390_pfault_init(vcpu, work->arch.pfault_token); __kvm_inject_pfault_token(vcpu, true, work->arch.pfault_token); + + return true; } void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, @@ -3649,7 +4630,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, /* s390 will always inject the page directly */ } -bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) +bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu) { /* * s390 will always inject the page directly, @@ -3658,33 +4639,31 @@ bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) return true; } -static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu) +static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu) { hva_t hva; struct kvm_arch_async_pf arch; - int rc; if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID) - return 0; + return false; if ((vcpu->arch.sie_block->gpsw.mask & vcpu->arch.pfault_select) != vcpu->arch.pfault_compare) - return 0; + return false; if (psw_extint_disabled(vcpu)) - return 0; + return false; if (kvm_s390_vcpu_has_irq(vcpu, 0)) - return 0; + return false; if (!(vcpu->arch.sie_block->gcr[0] & CR0_SERVICE_SIGNAL_SUBMASK)) - return 0; + return false; if (!vcpu->arch.gmap->pfault_enabled) - return 0; + return false; hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(current->thread.gmap_addr)); hva += current->thread.gmap_addr & ~PAGE_MASK; if (read_guest_real(vcpu, vcpu->arch.pfault_token, &arch.pfault_token, 8)) - return 0; + return false; - rc = kvm_setup_async_pf(vcpu, current->thread.gmap_addr, hva, &arch); - return rc; + return kvm_setup_async_pf(vcpu, current->thread.gmap_addr, hva, &arch); } static int vcpu_pre_run(struct kvm_vcpu *vcpu) @@ -3704,12 +4683,9 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu) if (need_resched()) schedule(); - if (test_cpu_flag(CIF_MCCK_PENDING)) - s390_handle_mcck(); - if (!kvm_is_ucontrol(vcpu->kvm)) { rc = kvm_s390_deliver_pending_interrupts(vcpu); - if (rc) + if (rc || guestdbg_exit_pending(vcpu)) return rc; } @@ -3722,7 +4698,7 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu) kvm_s390_patch_guest_per_regs(vcpu); } - clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.gisa_int.kicked_mask); + clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.gisa_int.kicked_mask); vcpu->arch.sie_block->icptcode = 0; cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags); @@ -3816,27 +4792,30 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) current->thread.gmap_pfault = 0; if (kvm_arch_setup_async_pf(vcpu)) return 0; + vcpu->stat.pfault_sync++; return kvm_arch_fault_in_page(vcpu, current->thread.gmap_addr, 1); } return vcpu_post_run_fault_in_sie(vcpu); } +#define PSW_INT_MASK (PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_MCHECK) static int __vcpu_run(struct kvm_vcpu *vcpu) { int rc, exit_reason; + struct sie_page *sie_page = (struct sie_page *)vcpu->arch.sie_block; /* * We try to hold kvm->srcu during most of vcpu_run (except when run- * ning the guest), so that memslots (and other stuff) are protected */ - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); do { rc = vcpu_pre_run(vcpu); - if (rc) + if (rc || guestdbg_exit_pending(vcpu)) break; - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + kvm_vcpu_srcu_read_unlock(vcpu); /* * As PF_VCPU will be used in fault handler, between * guest_enter and guest_exit should be no uaccess. @@ -3845,23 +4824,46 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) guest_enter_irqoff(); __disable_cpu_timer_accounting(vcpu); local_irq_enable(); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + memcpy(sie_page->pv_grregs, + vcpu->run->s.regs.gprs, + sizeof(sie_page->pv_grregs)); + } + if (test_cpu_flag(CIF_FPU)) + load_fpu_regs(); exit_reason = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + memcpy(vcpu->run->s.regs.gprs, + sie_page->pv_grregs, + sizeof(sie_page->pv_grregs)); + /* + * We're not allowed to inject interrupts on intercepts + * that leave the guest state in an "in-between" state + * where the next SIE entry will do a continuation. + * Fence interrupts in our "internal" PSW. + */ + if (vcpu->arch.sie_block->icptcode == ICPT_PV_INSTR || + vcpu->arch.sie_block->icptcode == ICPT_PV_PREF) { + vcpu->arch.sie_block->gpsw.mask &= ~PSW_INT_MASK; + } + } local_irq_disable(); __enable_cpu_timer_accounting(vcpu); guest_exit_irqoff(); local_irq_enable(); - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); rc = vcpu_post_run(vcpu, exit_reason); } while (!signal_pending(current) && !guestdbg_exit_pending(vcpu) && !rc); - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + kvm_vcpu_srcu_read_unlock(vcpu); return rc; } -static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static void sync_regs_fmt2(struct kvm_vcpu *vcpu) { + struct kvm_run *kvm_run = vcpu->run; struct runtime_instr_cb *riccb; struct gs_cb *gscb; @@ -3869,16 +4871,7 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) gscb = (struct gs_cb *) &kvm_run->s.regs.gscb; vcpu->arch.sie_block->gpsw.mask = kvm_run->psw_mask; vcpu->arch.sie_block->gpsw.addr = kvm_run->psw_addr; - if (kvm_run->kvm_dirty_regs & KVM_SYNC_PREFIX) - kvm_s390_set_prefix(vcpu, kvm_run->s.regs.prefix); - if (kvm_run->kvm_dirty_regs & KVM_SYNC_CRS) { - memcpy(&vcpu->arch.sie_block->gcr, &kvm_run->s.regs.crs, 128); - /* some control register changes require a tlb flush */ - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - } if (kvm_run->kvm_dirty_regs & KVM_SYNC_ARCH0) { - kvm_s390_set_cpu_timer(vcpu, kvm_run->s.regs.cputm); - vcpu->arch.sie_block->ckc = kvm_run->s.regs.ckc; vcpu->arch.sie_block->todpr = kvm_run->s.regs.todpr; vcpu->arch.sie_block->pp = kvm_run->s.regs.pp; vcpu->arch.sie_block->gbea = kvm_run->s.regs.gbea; @@ -3890,6 +4883,11 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID) kvm_clear_async_pf_completion_queue(vcpu); } + if (kvm_run->kvm_dirty_regs & KVM_SYNC_DIAG318) { + vcpu->arch.diag318_info.val = kvm_run->s.regs.diag318; + vcpu->arch.sie_block->cpnc = vcpu->arch.diag318_info.cpnc; + VCPU_EVENT(vcpu, 3, "setting cpnc to %d", vcpu->arch.diag318_info.cpnc); + } /* * If userspace sets the riccb (e.g. after migration) to a valid state, * we should enable RI here instead of doing the lazy enablement. @@ -3919,23 +4917,9 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->arch.sie_block->fpf &= ~FPF_BPBC; vcpu->arch.sie_block->fpf |= kvm_run->s.regs.bpbc ? FPF_BPBC : 0; } - save_access_regs(vcpu->arch.host_acrs); - restore_access_regs(vcpu->run->s.regs.acrs); - /* save host (userspace) fprs/vrs */ - save_fpu_regs(); - vcpu->arch.host_fpregs.fpc = current->thread.fpu.fpc; - vcpu->arch.host_fpregs.regs = current->thread.fpu.regs; - if (MACHINE_HAS_VX) - current->thread.fpu.regs = vcpu->run->s.regs.vrs; - else - current->thread.fpu.regs = vcpu->run->s.regs.fprs; - current->thread.fpu.fpc = vcpu->run->s.regs.fpc; - if (test_fp_ctl(current->thread.fpu.fpc)) - /* User space provided an invalid FPC, let's clear it */ - current->thread.fpu.fpc = 0; if (MACHINE_HAS_GS) { preempt_disable(); - __ctl_set_bit(2, 4); + local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT); if (current->thread.gs_cb) { vcpu->arch.host_gscb = current->thread.gs_cb; save_gs_cb(vcpu->arch.host_gscb); @@ -3948,25 +4932,93 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) preempt_enable(); } /* SIE will load etoken directly from SDNX and therefore kvm_run */ +} + +static void sync_regs(struct kvm_vcpu *vcpu) +{ + struct kvm_run *kvm_run = vcpu->run; + + if (kvm_run->kvm_dirty_regs & KVM_SYNC_PREFIX) + kvm_s390_set_prefix(vcpu, kvm_run->s.regs.prefix); + if (kvm_run->kvm_dirty_regs & KVM_SYNC_CRS) { + memcpy(&vcpu->arch.sie_block->gcr, &kvm_run->s.regs.crs, 128); + /* some control register changes require a tlb flush */ + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } + if (kvm_run->kvm_dirty_regs & KVM_SYNC_ARCH0) { + kvm_s390_set_cpu_timer(vcpu, kvm_run->s.regs.cputm); + vcpu->arch.sie_block->ckc = kvm_run->s.regs.ckc; + } + save_access_regs(vcpu->arch.host_acrs); + restore_access_regs(vcpu->run->s.regs.acrs); + /* save host (userspace) fprs/vrs */ + save_fpu_regs(); + vcpu->arch.host_fpregs.fpc = current->thread.fpu.fpc; + vcpu->arch.host_fpregs.regs = current->thread.fpu.regs; + if (cpu_has_vx()) + current->thread.fpu.regs = vcpu->run->s.regs.vrs; + else + current->thread.fpu.regs = vcpu->run->s.regs.fprs; + current->thread.fpu.fpc = vcpu->run->s.regs.fpc; + + /* Sync fmt2 only data */ + if (likely(!kvm_s390_pv_cpu_is_protected(vcpu))) { + sync_regs_fmt2(vcpu); + } else { + /* + * In several places we have to modify our internal view to + * not do things that are disallowed by the ultravisor. For + * example we must not inject interrupts after specific exits + * (e.g. 112 prefix page not secure). We do this by turning + * off the machine check, external and I/O interrupt bits + * of our PSW copy. To avoid getting validity intercepts, we + * do only accept the condition code from userspace. + */ + vcpu->arch.sie_block->gpsw.mask &= ~PSW_MASK_CC; + vcpu->arch.sie_block->gpsw.mask |= kvm_run->psw_mask & + PSW_MASK_CC; + } kvm_run->kvm_dirty_regs = 0; } -static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static void store_regs_fmt2(struct kvm_vcpu *vcpu) { + struct kvm_run *kvm_run = vcpu->run; + + kvm_run->s.regs.todpr = vcpu->arch.sie_block->todpr; + kvm_run->s.regs.pp = vcpu->arch.sie_block->pp; + kvm_run->s.regs.gbea = vcpu->arch.sie_block->gbea; + kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC; + kvm_run->s.regs.diag318 = vcpu->arch.diag318_info.val; + if (MACHINE_HAS_GS) { + preempt_disable(); + local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT); + if (vcpu->arch.gs_enabled) + save_gs_cb(current->thread.gs_cb); + current->thread.gs_cb = vcpu->arch.host_gscb; + restore_gs_cb(vcpu->arch.host_gscb); + if (!vcpu->arch.host_gscb) + local_ctl_clear_bit(2, CR2_GUARDED_STORAGE_BIT); + vcpu->arch.host_gscb = NULL; + preempt_enable(); + } + /* SIE will save etoken directly into SDNX and therefore kvm_run */ +} + +static void store_regs(struct kvm_vcpu *vcpu) +{ + struct kvm_run *kvm_run = vcpu->run; + kvm_run->psw_mask = vcpu->arch.sie_block->gpsw.mask; kvm_run->psw_addr = vcpu->arch.sie_block->gpsw.addr; kvm_run->s.regs.prefix = kvm_s390_get_prefix(vcpu); memcpy(&kvm_run->s.regs.crs, &vcpu->arch.sie_block->gcr, 128); kvm_run->s.regs.cputm = kvm_s390_get_cpu_timer(vcpu); kvm_run->s.regs.ckc = vcpu->arch.sie_block->ckc; - kvm_run->s.regs.todpr = vcpu->arch.sie_block->todpr; - kvm_run->s.regs.pp = vcpu->arch.sie_block->pp; - kvm_run->s.regs.gbea = vcpu->arch.sie_block->gbea; kvm_run->s.regs.pft = vcpu->arch.pfault_token; kvm_run->s.regs.pfs = vcpu->arch.pfault_select; kvm_run->s.regs.pfc = vcpu->arch.pfault_compare; - kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC; save_access_regs(vcpu->run->s.regs.acrs); restore_access_regs(vcpu->arch.host_acrs); /* Save guest register state */ @@ -3975,25 +5027,24 @@ static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) /* Restore will be done lazily at return */ current->thread.fpu.fpc = vcpu->arch.host_fpregs.fpc; current->thread.fpu.regs = vcpu->arch.host_fpregs.regs; - if (MACHINE_HAS_GS) { - __ctl_set_bit(2, 4); - if (vcpu->arch.gs_enabled) - save_gs_cb(current->thread.gs_cb); - preempt_disable(); - current->thread.gs_cb = vcpu->arch.host_gscb; - restore_gs_cb(vcpu->arch.host_gscb); - preempt_enable(); - if (!vcpu->arch.host_gscb) - __ctl_clear_bit(2, 4); - vcpu->arch.host_gscb = NULL; - } - /* SIE will save etoken directly into SDNX and therefore kvm_run */ + if (likely(!kvm_s390_pv_cpu_is_protected(vcpu))) + store_regs_fmt2(vcpu); } -int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { + struct kvm_run *kvm_run = vcpu->run; int rc; + /* + * Running a VM while dumping always has the potential to + * produce inconsistent dump data. But for PV vcpus a SIE + * entry while dumping could also lead to a fatal validity + * intercept which we absolutely want to avoid. + */ + if (vcpu->kvm->arch.pv.dumping) + return -EINVAL; + if (kvm_run->immediate_exit) return -EINTR; @@ -4011,6 +5062,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_sigset_activate(vcpu); + /* + * no need to check the return value of vcpu_start as it can only have + * an error for protvirt, but protvirt means user cpu state + */ if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) { kvm_s390_vcpu_start(vcpu); } else if (is_vcpu_stopped(vcpu)) { @@ -4020,7 +5075,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) goto out; } - sync_regs(vcpu, kvm_run); + sync_regs(vcpu); enable_cpu_timer_accounting(vcpu); might_fault(); @@ -4042,7 +5097,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } disable_cpu_timer_accounting(vcpu); - store_regs(vcpu, kvm_run); + store_regs(vcpu); kvm_sigset_deactivate(vcpu); @@ -4079,7 +5134,7 @@ int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long gpa) gpa -= __LC_FPREGS_SAVE_AREA; /* manually convert vector registers if necessary */ - if (MACHINE_HAS_VX) { + if (cpu_has_vx()) { convert_vx_to_fp(fprs, (__vector128 *) vcpu->run->s.regs.vrs); rc = write_guest_abs(vcpu, gpa + __LC_FPREGS_SAVE_AREA, fprs, 128); @@ -4132,7 +5187,7 @@ static void __disable_ibs_on_vcpu(struct kvm_vcpu *vcpu) static void __disable_ibs_on_all_vcpus(struct kvm *kvm) { - unsigned int i; + unsigned long i; struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) { @@ -4148,20 +5203,29 @@ static void __enable_ibs_on_vcpu(struct kvm_vcpu *vcpu) kvm_s390_sync_request(KVM_REQ_ENABLE_IBS, vcpu); } -void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu) +int kvm_s390_vcpu_start(struct kvm_vcpu *vcpu) { - int i, online_vcpus, started_vcpus = 0; + int i, online_vcpus, r = 0, started_vcpus = 0; if (!is_vcpu_stopped(vcpu)) - return; + return 0; trace_kvm_s390_vcpu_start_stop(vcpu->vcpu_id, 1); /* Only one cpu at a time may enter/leave the STOPPED state. */ spin_lock(&vcpu->kvm->arch.start_stop_lock); online_vcpus = atomic_read(&vcpu->kvm->online_vcpus); + /* Let's tell the UV that we want to change into the operating state */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + r = kvm_s390_pv_set_cpu_state(vcpu, PV_CPU_STATE_OPR); + if (r) { + spin_unlock(&vcpu->kvm->arch.start_stop_lock); + return r; + } + } + for (i = 0; i < online_vcpus; i++) { - if (!is_vcpu_stopped(vcpu->kvm->vcpus[i])) + if (!is_vcpu_stopped(kvm_get_vcpu(vcpu->kvm, i))) started_vcpus++; } @@ -4172,44 +5236,67 @@ void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu) /* * As we are starting a second VCPU, we have to disable * the IBS facility on all VCPUs to remove potentially - * oustanding ENABLE requests. + * outstanding ENABLE requests. */ __disable_ibs_on_all_vcpus(vcpu->kvm); } kvm_s390_clear_cpuflags(vcpu, CPUSTAT_STOPPED); /* + * The real PSW might have changed due to a RESTART interpreted by the + * ultravisor. We block all interrupts and let the next sie exit + * refresh our view. + */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) + vcpu->arch.sie_block->gpsw.mask &= ~PSW_INT_MASK; + /* * Another VCPU might have used IBS while we were offline. * Let's play safe and flush the VCPU at startup. */ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); spin_unlock(&vcpu->kvm->arch.start_stop_lock); - return; + return 0; } -void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu) +int kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu) { - int i, online_vcpus, started_vcpus = 0; + int i, online_vcpus, r = 0, started_vcpus = 0; struct kvm_vcpu *started_vcpu = NULL; if (is_vcpu_stopped(vcpu)) - return; + return 0; trace_kvm_s390_vcpu_start_stop(vcpu->vcpu_id, 0); /* Only one cpu at a time may enter/leave the STOPPED state. */ spin_lock(&vcpu->kvm->arch.start_stop_lock); online_vcpus = atomic_read(&vcpu->kvm->online_vcpus); - /* SIGP STOP and SIGP STOP AND STORE STATUS has been fully processed */ - kvm_s390_clear_stop_irq(vcpu); + /* Let's tell the UV that we want to change into the stopped state */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + r = kvm_s390_pv_set_cpu_state(vcpu, PV_CPU_STATE_STP); + if (r) { + spin_unlock(&vcpu->kvm->arch.start_stop_lock); + return r; + } + } + /* + * Set the VCPU to STOPPED and THEN clear the interrupt flag, + * now that the SIGP STOP and SIGP STOP AND STORE STATUS orders + * have been fully processed. This will ensure that the VCPU + * is kept BUSY if another VCPU is inquiring with SIGP SENSE. + */ kvm_s390_set_cpuflags(vcpu, CPUSTAT_STOPPED); + kvm_s390_clear_stop_irq(vcpu); + __disable_ibs_on_vcpu(vcpu); for (i = 0; i < online_vcpus; i++) { - if (!is_vcpu_stopped(vcpu->kvm->vcpus[i])) { + struct kvm_vcpu *tmp = kvm_get_vcpu(vcpu->kvm, i); + + if (!is_vcpu_stopped(tmp)) { started_vcpus++; - started_vcpu = vcpu->kvm->vcpus[i]; + started_vcpu = tmp; } } @@ -4222,7 +5309,7 @@ void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu) } spin_unlock(&vcpu->kvm->arch.start_stop_lock); - return; + return 0; } static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, @@ -4249,64 +5336,116 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return r; } -static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu, +static long kvm_s390_vcpu_sida_op(struct kvm_vcpu *vcpu, struct kvm_s390_mem_op *mop) { void __user *uaddr = (void __user *)mop->buf; - void *tmpbuf = NULL; - int r, srcu_idx; - const u64 supported_flags = KVM_S390_MEMOP_F_INJECT_EXCEPTION - | KVM_S390_MEMOP_F_CHECK_ONLY; + void *sida_addr; + int r = 0; - if (mop->flags & ~supported_flags || mop->ar >= NUM_ACRS || !mop->size) + if (mop->flags || !mop->size) return -EINVAL; - - if (mop->size > MEM_OP_MAX_SIZE) + if (mop->size + mop->sida_offset < mop->size) + return -EINVAL; + if (mop->size + mop->sida_offset > sida_size(vcpu->arch.sie_block)) return -E2BIG; + if (!kvm_s390_pv_cpu_is_protected(vcpu)) + return -EINVAL; + + sida_addr = (char *)sida_addr(vcpu->arch.sie_block) + mop->sida_offset; + switch (mop->op) { + case KVM_S390_MEMOP_SIDA_READ: + if (copy_to_user(uaddr, sida_addr, mop->size)) + r = -EFAULT; + + break; + case KVM_S390_MEMOP_SIDA_WRITE: + if (copy_from_user(sida_addr, uaddr, mop->size)) + r = -EFAULT; + break; + } + return r; +} + +static long kvm_s390_vcpu_mem_op(struct kvm_vcpu *vcpu, + struct kvm_s390_mem_op *mop) +{ + void __user *uaddr = (void __user *)mop->buf; + enum gacc_mode acc_mode; + void *tmpbuf = NULL; + int r; + + r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_INJECT_EXCEPTION | + KVM_S390_MEMOP_F_CHECK_ONLY | + KVM_S390_MEMOP_F_SKEY_PROTECTION); + if (r) + return r; + if (mop->ar >= NUM_ACRS) + return -EINVAL; + if (kvm_s390_pv_cpu_is_protected(vcpu)) + return -EINVAL; if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) { tmpbuf = vmalloc(mop->size); if (!tmpbuf) return -ENOMEM; } + acc_mode = mop->op == KVM_S390_MEMOP_LOGICAL_READ ? GACC_FETCH : GACC_STORE; + if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { + r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size, + acc_mode, mop->key); + goto out_inject; + } + if (acc_mode == GACC_FETCH) { + r = read_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf, + mop->size, mop->key); + if (r) + goto out_inject; + if (copy_to_user(uaddr, tmpbuf, mop->size)) { + r = -EFAULT; + goto out_free; + } + } else { + if (copy_from_user(tmpbuf, uaddr, mop->size)) { + r = -EFAULT; + goto out_free; + } + r = write_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf, + mop->size, mop->key); + } + +out_inject: + if (r > 0 && (mop->flags & KVM_S390_MEMOP_F_INJECT_EXCEPTION) != 0) + kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); + +out_free: + vfree(tmpbuf); + return r; +} + +static long kvm_s390_vcpu_memsida_op(struct kvm_vcpu *vcpu, + struct kvm_s390_mem_op *mop) +{ + int r, srcu_idx; + srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); switch (mop->op) { case KVM_S390_MEMOP_LOGICAL_READ: - if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { - r = check_gva_range(vcpu, mop->gaddr, mop->ar, - mop->size, GACC_FETCH); - break; - } - r = read_guest(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size); - if (r == 0) { - if (copy_to_user(uaddr, tmpbuf, mop->size)) - r = -EFAULT; - } - break; case KVM_S390_MEMOP_LOGICAL_WRITE: - if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { - r = check_gva_range(vcpu, mop->gaddr, mop->ar, - mop->size, GACC_STORE); - break; - } - if (copy_from_user(tmpbuf, uaddr, mop->size)) { - r = -EFAULT; - break; - } - r = write_guest(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size); + r = kvm_s390_vcpu_mem_op(vcpu, mop); + break; + case KVM_S390_MEMOP_SIDA_READ: + case KVM_S390_MEMOP_SIDA_WRITE: + /* we are locked against sida going away by the vcpu->mutex */ + r = kvm_s390_vcpu_sida_op(vcpu, mop); break; default: r = -EINVAL; } srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); - - if (r > 0 && (mop->flags & KVM_S390_MEMOP_F_INJECT_EXCEPTION) != 0) - kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); - - vfree(tmpbuf); return r; } @@ -4315,6 +5454,7 @@ long kvm_arch_vcpu_async_ioctl(struct file *filp, { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; + int rc; switch (ioctl) { case KVM_S390_IRQ: { @@ -4322,7 +5462,8 @@ long kvm_arch_vcpu_async_ioctl(struct file *filp, if (copy_from_user(&s390irq, argp, sizeof(s390irq))) return -EFAULT; - return kvm_s390_inject_vcpu(vcpu, &s390irq); + rc = kvm_s390_inject_vcpu(vcpu, &s390irq); + break; } case KVM_S390_INTERRUPT: { struct kvm_s390_interrupt s390int; @@ -4332,10 +5473,67 @@ long kvm_arch_vcpu_async_ioctl(struct file *filp, return -EFAULT; if (s390int_to_s390irq(&s390int, &s390irq)) return -EINVAL; - return kvm_s390_inject_vcpu(vcpu, &s390irq); + rc = kvm_s390_inject_vcpu(vcpu, &s390irq); + break; } + default: + rc = -ENOIOCTLCMD; + break; } - return -ENOIOCTLCMD; + + /* + * To simplify single stepping of userspace-emulated instructions, + * KVM_EXIT_S390_SIEIC exit sets KVM_GUESTDBG_EXIT_PENDING (see + * should_handle_per_ifetch()). However, if userspace emulation injects + * an interrupt, it needs to be cleared, so that KVM_EXIT_DEBUG happens + * after (and not before) the interrupt delivery. + */ + if (!rc) + vcpu->guest_debug &= ~KVM_GUESTDBG_EXIT_PENDING; + + return rc; +} + +static int kvm_s390_handle_pv_vcpu_dump(struct kvm_vcpu *vcpu, + struct kvm_pv_cmd *cmd) +{ + struct kvm_s390_pv_dmp dmp; + void *data; + int ret; + + /* Dump initialization is a prerequisite */ + if (!vcpu->kvm->arch.pv.dumping) + return -EINVAL; + + if (copy_from_user(&dmp, (__u8 __user *)cmd->data, sizeof(dmp))) + return -EFAULT; + + /* We only handle this subcmd right now */ + if (dmp.subcmd != KVM_PV_DUMP_CPU) + return -EINVAL; + + /* CPU dump length is the same as create cpu storage donation. */ + if (dmp.buff_len != uv_info.guest_cpu_stor_len) + return -EINVAL; + + data = kvzalloc(uv_info.guest_cpu_stor_len, GFP_KERNEL); + if (!data) + return -ENOMEM; + + ret = kvm_s390_pv_dump_cpu(vcpu, data, &cmd->rc, &cmd->rrc); + + VCPU_EVENT(vcpu, 3, "PROTVIRT DUMP CPU %d rc %x rrc %x", + vcpu->vcpu_id, cmd->rc, cmd->rrc); + + if (ret) + ret = -EINVAL; + + /* On success copy over the dump data */ + if (!ret && copy_to_user((__u8 __user *)dmp.buff_addr, data, uv_info.guest_cpu_stor_len)) + ret = -EFAULT; + + kvfree(data); + return ret; } long kvm_arch_vcpu_ioctl(struct file *filp, @@ -4345,13 +5543,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp, void __user *argp = (void __user *)arg; int idx; long r; + u16 rc, rrc; vcpu_load(vcpu); switch (ioctl) { case KVM_S390_STORE_STATUS: idx = srcu_read_lock(&vcpu->kvm->srcu); - r = kvm_s390_vcpu_store_status(vcpu, arg); + r = kvm_s390_store_status_unloaded(vcpu, arg); srcu_read_unlock(&vcpu->kvm->srcu, idx); break; case KVM_S390_SET_INITIAL_PSW: { @@ -4363,12 +5562,43 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_arch_vcpu_ioctl_set_initial_psw(vcpu, psw); break; } + case KVM_S390_CLEAR_RESET: + r = 0; + kvm_arch_vcpu_ioctl_clear_reset(vcpu); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + r = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu), + UVC_CMD_CPU_RESET_CLEAR, &rc, &rrc); + VCPU_EVENT(vcpu, 3, "PROTVIRT RESET CLEAR VCPU: rc %x rrc %x", + rc, rrc); + } + break; case KVM_S390_INITIAL_RESET: - r = kvm_arch_vcpu_ioctl_initial_reset(vcpu); + r = 0; + kvm_arch_vcpu_ioctl_initial_reset(vcpu); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + r = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu), + UVC_CMD_CPU_RESET_INITIAL, + &rc, &rrc); + VCPU_EVENT(vcpu, 3, "PROTVIRT RESET INITIAL VCPU: rc %x rrc %x", + rc, rrc); + } + break; + case KVM_S390_NORMAL_RESET: + r = 0; + kvm_arch_vcpu_ioctl_normal_reset(vcpu); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + r = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu), + UVC_CMD_CPU_RESET, &rc, &rrc); + VCPU_EVENT(vcpu, 3, "PROTVIRT RESET NORMAL VCPU: rc %x rrc %x", + rc, rrc); + } break; case KVM_SET_ONE_REG: case KVM_GET_ONE_REG: { struct kvm_one_reg reg; + r = -EINVAL; + if (kvm_s390_pv_cpu_is_protected(vcpu)) + break; r = -EFAULT; if (copy_from_user(®, argp, sizeof(reg))) break; @@ -4431,7 +5661,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_s390_mem_op mem_op; if (copy_from_user(&mem_op, argp, sizeof(mem_op)) == 0) - r = kvm_s390_guest_mem_op(vcpu, &mem_op); + r = kvm_s390_vcpu_memsida_op(vcpu, &mem_op); else r = -EFAULT; break; @@ -4470,6 +5700,33 @@ long kvm_arch_vcpu_ioctl(struct file *filp, irq_state.len); break; } + case KVM_S390_PV_CPU_COMMAND: { + struct kvm_pv_cmd cmd; + + r = -EINVAL; + if (!is_prot_virt_host()) + break; + + r = -EFAULT; + if (copy_from_user(&cmd, argp, sizeof(cmd))) + break; + + r = -EINVAL; + if (cmd.flags) + break; + + /* We only handle this cmd right now */ + if (cmd.cmd != KVM_PV_DUMP) + break; + + r = kvm_s390_handle_pv_vcpu_dump(vcpu, &cmd); + + /* Always copy over UV rc / rrc data */ + if (copy_to_user((__u8 __user *)argp, &cmd.rc, + sizeof(cmd.rc) + sizeof(cmd.rrc))) + r = -EFAULT; + break; + } default: r = -ENOTTY; } @@ -4491,38 +5748,63 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } -int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, - unsigned long npages) +bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) { - return 0; + return true; } /* Section: memory related */ int kvm_arch_prepare_memory_region(struct kvm *kvm, - struct kvm_memory_slot *memslot, - const struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old, + struct kvm_memory_slot *new, enum kvm_mr_change change) { - /* A few sanity checks. We can have memory slots which have to be - located/ended at a segment boundary (1MB). The memory in userland is - ok to be fragmented into various different vmas. It is okay to mmap() - and munmap() stuff in this slot after doing this call at any time */ + gpa_t size; - if (mem->userspace_addr & 0xffffful) + /* When we are protected, we should not change the memory slots */ + if (kvm_s390_pv_get_handle(kvm)) return -EINVAL; - if (mem->memory_size & 0xffffful) - return -EINVAL; + if (change != KVM_MR_DELETE && change != KVM_MR_FLAGS_ONLY) { + /* + * A few sanity checks. We can have memory slots which have to be + * located/ended at a segment boundary (1MB). The memory in userland is + * ok to be fragmented into various different vmas. It is okay to mmap() + * and munmap() stuff in this slot after doing this call at any time + */ - if (mem->guest_phys_addr + mem->memory_size > kvm->arch.mem_limit) - return -EINVAL; + if (new->userspace_addr & 0xffffful) + return -EINVAL; + + size = new->npages * PAGE_SIZE; + if (size & 0xffffful) + return -EINVAL; + + if ((new->base_gfn * PAGE_SIZE) + size > kvm->arch.mem_limit) + return -EINVAL; + } + + if (!kvm->arch.migration_mode) + return 0; + + /* + * Turn off migration mode when: + * - userspace creates a new memslot with dirty logging off, + * - userspace modifies an existing memslot (MOVE or FLAGS_ONLY) and + * dirty logging is turned off. + * Migration mode expects dirty page logging being enabled to store + * its dirty bitmap. + */ + if (change != KVM_MR_DELETE && + !(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) + WARN(kvm_s390_vm_stop_migration(kvm), + "Failed to stop migration mode"); return 0; } void kvm_arch_commit_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem, - const struct kvm_memory_slot *old, + struct kvm_memory_slot *old, const struct kvm_memory_slot *new, enum kvm_mr_change change) { @@ -4538,10 +5820,11 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, old->npages * PAGE_SIZE); if (rc) break; - /* FALLTHROUGH */ + fallthrough; case KVM_MR_CREATE: - rc = gmap_map_segment(kvm->arch.gmap, mem->userspace_addr, - mem->guest_phys_addr, mem->memory_size); + rc = gmap_map_segment(kvm->arch.gmap, new->userspace_addr, + new->base_gfn * PAGE_SIZE, + new->npages * PAGE_SIZE); break; case KVM_MR_FLAGS_ONLY: break; @@ -4560,14 +5843,9 @@ static inline unsigned long nonhyp_mask(int i) return 0x0000ffffffffffffUL >> (nonhyp_fai << 4); } -void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) -{ - vcpu->valid_wakeup = false; -} - static int __init kvm_s390_init(void) { - int i; + int i, r; if (!sclp.has_sief2) { pr_info("SIE is not available\n"); @@ -4581,14 +5859,25 @@ static int __init kvm_s390_init(void) for (i = 0; i < 16; i++) kvm_s390_fac_base[i] |= - S390_lowcore.stfle_fac_list[i] & nonhyp_mask(i); + stfle_fac_list[i] & nonhyp_mask(i); + + r = __kvm_s390_init(); + if (r) + return r; - return kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); + r = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); + if (r) { + __kvm_s390_exit(); + return r; + } + return 0; } static void __exit kvm_s390_exit(void) { kvm_exit(); + + __kvm_s390_exit(); } module_init(kvm_s390_init); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 6d9448dbd052..a7ea80cfa445 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -2,7 +2,7 @@ /* * definition for kvm on s390 * - * Copyright IBM Corp. 2008, 2009 + * Copyright IBM Corp. 2008, 2020 * * Author(s): Carsten Otte <cotte@de.ibm.com> * Christian Borntraeger <borntraeger@de.ibm.com> @@ -15,6 +15,7 @@ #include <linux/hrtimer.h> #include <linux/kvm.h> #include <linux/kvm_host.h> +#include <linux/lockdep.h> #include <asm/facility.h> #include <asm/processor.h> #include <asm/sclp.h> @@ -22,9 +23,21 @@ /* Transactional Memory Execution related macros */ #define IS_TE_ENABLED(vcpu) ((vcpu->arch.sie_block->ecb & ECB_TE)) #define TDB_FORMAT1 1 -#define IS_ITDB_VALID(vcpu) ((*(char *)vcpu->arch.sie_block->itdba == TDB_FORMAT1)) +#define IS_ITDB_VALID(vcpu) \ + ((*(char *)phys_to_virt((vcpu)->arch.sie_block->itdba) == TDB_FORMAT1)) extern debug_info_t *kvm_s390_dbf; +extern debug_info_t *kvm_s390_dbf_uv; + +#define KVM_UV_EVENT(d_kvm, d_loglevel, d_string, d_args...)\ +do { \ + debug_sprintf_event((d_kvm)->arch.dbf, d_loglevel, d_string "\n", \ + d_args); \ + debug_sprintf_event(kvm_s390_dbf_uv, d_loglevel, \ + "%d: " d_string "\n", (d_kvm)->userspace_pid, \ + d_args); \ +} while (0) + #define KVM_EVENT(d_loglevel, d_string, d_args...)\ do { \ debug_sprintf_event(kvm_s390_dbf, d_loglevel, d_string "\n", \ @@ -67,7 +80,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu) static inline int is_vcpu_idle(struct kvm_vcpu *vcpu) { - return test_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); + return test_bit(vcpu->vcpu_idx, vcpu->kvm->arch.idle_mask); } static inline int kvm_is_ucontrol(struct kvm *kvm) @@ -93,7 +106,7 @@ static inline void kvm_s390_set_prefix(struct kvm_vcpu *vcpu, u32 prefix) prefix); vcpu->arch.sie_block->prefix = prefix >> GUEST_PREFIX_SHIFT; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); + kvm_make_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu); } static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu, u8 *ar) @@ -196,6 +209,67 @@ static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm) return kvm->arch.user_cpu_state_ctrl != 0; } +static inline void kvm_s390_set_user_cpu_state_ctrl(struct kvm *kvm) +{ + if (kvm->arch.user_cpu_state_ctrl) + return; + + VM_EVENT(kvm, 3, "%s", "ENABLE: Userspace CPU state control"); + kvm->arch.user_cpu_state_ctrl = 1; +} + +/* get the end gfn of the last (highest gfn) memslot */ +static inline unsigned long kvm_s390_get_gfn_end(struct kvm_memslots *slots) +{ + struct rb_node *node; + struct kvm_memory_slot *ms; + + if (WARN_ON(kvm_memslots_empty(slots))) + return 0; + + node = rb_last(&slots->gfn_tree); + ms = container_of(node, struct kvm_memory_slot, gfn_node[slots->node_idx]); + return ms->base_gfn + ms->npages; +} + +static inline u32 kvm_s390_get_gisa_desc(struct kvm *kvm) +{ + u32 gd = virt_to_phys(kvm->arch.gisa_int.origin); + + if (gd && sclp.has_gisaf) + gd |= GISA_FORMAT1; + return gd; +} + +/* implemented in pv.c */ +int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc); +int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc); +int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc); +int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc); +int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc); +int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc); +int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc); +int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc, + u16 *rrc); +int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size, + unsigned long tweak, u16 *rc, u16 *rrc); +int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state); +int kvm_s390_pv_dump_cpu(struct kvm_vcpu *vcpu, void *buff, u16 *rc, u16 *rrc); +int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user, + u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc); +int kvm_s390_pv_dump_complete(struct kvm *kvm, void __user *buff_user, + u16 *rc, u16 *rrc); + +static inline u64 kvm_s390_pv_get_handle(struct kvm *kvm) +{ + return kvm->arch.pv.handle; +} + +static inline u64 kvm_s390_pv_cpu_get_handle(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.pv.handle; +} + /* implemented in interrupt.c */ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu); @@ -281,13 +355,12 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu); /* implemented in kvm-s390.c */ -void kvm_s390_set_tod_clock(struct kvm *kvm, - const struct kvm_s390_vm_tod_clock *gtod); +int kvm_s390_try_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod); long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable); int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr); int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr); -void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu); -void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu); +int kvm_s390_vcpu_start(struct kvm_vcpu *vcpu); +int kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_unblock(struct kvm_vcpu *vcpu); bool kvm_s390_vcpu_sie_inhibited(struct kvm_vcpu *vcpu); @@ -297,13 +370,14 @@ int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu); void kvm_s390_set_cpu_timer(struct kvm_vcpu *vcpu, __u64 cputm); __u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu); +int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc); /* implemented in diag.c */ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu); static inline void kvm_s390_vcpu_block_all(struct kvm *kvm) { - int i; + unsigned long i; struct kvm_vcpu *vcpu; WARN_ON(!mutex_is_locked(&kvm->lock)); @@ -313,7 +387,7 @@ static inline void kvm_s390_vcpu_block_all(struct kvm *kvm) static inline void kvm_s390_vcpu_unblock_all(struct kvm *kvm) { - int i; + unsigned long i; struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) @@ -373,6 +447,7 @@ void kvm_s390_destroy_adapters(struct kvm *kvm); int kvm_s390_ext_call_pending(struct kvm_vcpu *vcpu); extern struct kvm_device_ops kvm_flic_ops; int kvm_s390_is_stop_irq_pending(struct kvm_vcpu *vcpu); +int kvm_s390_is_restart_irq_pending(struct kvm_vcpu *vcpu); void kvm_s390_clear_stop_irq(struct kvm_vcpu *vcpu); int kvm_s390_set_irq_state(struct kvm_vcpu *vcpu, void __user *buf, int len); @@ -381,7 +456,9 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, void kvm_s390_gisa_init(struct kvm *kvm); void kvm_s390_gisa_clear(struct kvm *kvm); void kvm_s390_gisa_destroy(struct kvm *kvm); -int kvm_s390_gib_init(u8 nisc); +void kvm_s390_gisa_disable(struct kvm *kvm); +void kvm_s390_gisa_enable(struct kvm *kvm); +int __init kvm_s390_gib_init(u8 nisc); void kvm_s390_gib_destroy(void); /* implemented in guestdbg.c */ @@ -426,4 +503,22 @@ void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu, * @kvm: the KVM guest */ void kvm_s390_vcpu_crypto_reset_all(struct kvm *kvm); + +/** + * kvm_s390_vcpu_pci_enable_interp + * + * Set the associated PCI attributes for each vcpu to allow for zPCI Load/Store + * interpretation as well as adapter interruption forwarding. + * + * @kvm: the KVM guest + */ +void kvm_s390_vcpu_pci_enable_interp(struct kvm *kvm); + +/** + * diag9c_forwarding_hz + * + * Set the maximum number of diag9c forwarding per second + */ +extern unsigned int diag9c_forwarding_hz; + #endif diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c new file mode 100644 index 000000000000..ffa7739c7a28 --- /dev/null +++ b/arch/s390/kvm/pci.c @@ -0,0 +1,704 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * s390 kvm PCI passthrough support + * + * Copyright IBM Corp. 2022 + * + * Author(s): Matthew Rosato <mjrosato@linux.ibm.com> + */ + +#include <linux/kvm_host.h> +#include <linux/pci.h> +#include <asm/pci.h> +#include <asm/pci_insn.h> +#include <asm/pci_io.h> +#include <asm/sclp.h> +#include "pci.h" +#include "kvm-s390.h" + +struct zpci_aift *aift; + +static inline int __set_irq_noiib(u16 ctl, u8 isc) +{ + union zpci_sic_iib iib = {{0}}; + + return zpci_set_irq_ctrl(ctl, isc, &iib); +} + +void kvm_s390_pci_aen_exit(void) +{ + unsigned long flags; + struct kvm_zdev **gait_kzdev; + + lockdep_assert_held(&aift->aift_lock); + + /* + * Contents of the aipb remain registered for the life of the host + * kernel, the information preserved in zpci_aipb and zpci_aif_sbv + * in case we insert the KVM module again later. Clear the AIFT + * information and free anything not registered with underlying + * firmware. + */ + spin_lock_irqsave(&aift->gait_lock, flags); + gait_kzdev = aift->kzdev; + aift->gait = NULL; + aift->sbv = NULL; + aift->kzdev = NULL; + spin_unlock_irqrestore(&aift->gait_lock, flags); + + kfree(gait_kzdev); +} + +static int zpci_setup_aipb(u8 nisc) +{ + struct page *page; + int size, rc; + + zpci_aipb = kzalloc(sizeof(union zpci_sic_iib), GFP_KERNEL); + if (!zpci_aipb) + return -ENOMEM; + + aift->sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC, NULL); + if (!aift->sbv) { + rc = -ENOMEM; + goto free_aipb; + } + zpci_aif_sbv = aift->sbv; + size = get_order(PAGE_ALIGN(ZPCI_NR_DEVICES * + sizeof(struct zpci_gaite))); + page = alloc_pages(GFP_KERNEL | __GFP_ZERO, size); + if (!page) { + rc = -ENOMEM; + goto free_sbv; + } + aift->gait = (struct zpci_gaite *)page_to_virt(page); + + zpci_aipb->aipb.faisb = virt_to_phys(aift->sbv->vector); + zpci_aipb->aipb.gait = virt_to_phys(aift->gait); + zpci_aipb->aipb.afi = nisc; + zpci_aipb->aipb.faal = ZPCI_NR_DEVICES; + + /* Setup Adapter Event Notification Interpretation */ + if (zpci_set_irq_ctrl(SIC_SET_AENI_CONTROLS, 0, zpci_aipb)) { + rc = -EIO; + goto free_gait; + } + + return 0; + +free_gait: + free_pages((unsigned long)aift->gait, size); +free_sbv: + airq_iv_release(aift->sbv); + zpci_aif_sbv = NULL; +free_aipb: + kfree(zpci_aipb); + zpci_aipb = NULL; + + return rc; +} + +static int zpci_reset_aipb(u8 nisc) +{ + /* + * AEN registration can only happen once per system boot. If + * an aipb already exists then AEN was already registered and + * we can re-use the aipb contents. This can only happen if + * the KVM module was removed and re-inserted. However, we must + * ensure that the same forwarding ISC is used as this is assigned + * during KVM module load. + */ + if (zpci_aipb->aipb.afi != nisc) + return -EINVAL; + + aift->sbv = zpci_aif_sbv; + aift->gait = phys_to_virt(zpci_aipb->aipb.gait); + + return 0; +} + +int kvm_s390_pci_aen_init(u8 nisc) +{ + int rc = 0; + + /* If already enabled for AEN, bail out now */ + if (aift->gait || aift->sbv) + return -EPERM; + + mutex_lock(&aift->aift_lock); + aift->kzdev = kcalloc(ZPCI_NR_DEVICES, sizeof(struct kvm_zdev *), + GFP_KERNEL); + if (!aift->kzdev) { + rc = -ENOMEM; + goto unlock; + } + + if (!zpci_aipb) + rc = zpci_setup_aipb(nisc); + else + rc = zpci_reset_aipb(nisc); + if (rc) + goto free_zdev; + + /* Enable floating IRQs */ + if (__set_irq_noiib(SIC_IRQ_MODE_SINGLE, nisc)) { + rc = -EIO; + kvm_s390_pci_aen_exit(); + } + + goto unlock; + +free_zdev: + kfree(aift->kzdev); +unlock: + mutex_unlock(&aift->aift_lock); + return rc; +} + +/* Modify PCI: Register floating adapter interruption forwarding */ +static int kvm_zpci_set_airq(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_REG_INT); + struct zpci_fib fib = {}; + u8 status; + + fib.fmt0.isc = zdev->kzdev->fib.fmt0.isc; + fib.fmt0.sum = 1; /* enable summary notifications */ + fib.fmt0.noi = airq_iv_end(zdev->aibv); + fib.fmt0.aibv = virt_to_phys(zdev->aibv->vector); + fib.fmt0.aibvo = 0; + fib.fmt0.aisb = virt_to_phys(aift->sbv->vector + (zdev->aisb / 64) * 8); + fib.fmt0.aisbo = zdev->aisb & 63; + fib.gd = zdev->gisa; + + return zpci_mod_fc(req, &fib, &status) ? -EIO : 0; +} + +/* Modify PCI: Unregister floating adapter interruption forwarding */ +static int kvm_zpci_clear_airq(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_DEREG_INT); + struct zpci_fib fib = {}; + u8 cc, status; + + fib.gd = zdev->gisa; + + cc = zpci_mod_fc(req, &fib, &status); + if (cc == 3 || (cc == 1 && status == 24)) + /* Function already gone or IRQs already deregistered. */ + cc = 0; + + return cc ? -EIO : 0; +} + +static inline void unaccount_mem(unsigned long nr_pages) +{ + struct user_struct *user = get_uid(current_user()); + + if (user) + atomic_long_sub(nr_pages, &user->locked_vm); + if (current->mm) + atomic64_sub(nr_pages, ¤t->mm->pinned_vm); +} + +static inline int account_mem(unsigned long nr_pages) +{ + struct user_struct *user = get_uid(current_user()); + unsigned long page_limit, cur_pages, new_pages; + + page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + do { + cur_pages = atomic_long_read(&user->locked_vm); + new_pages = cur_pages + nr_pages; + if (new_pages > page_limit) + return -ENOMEM; + } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages, + new_pages) != cur_pages); + + atomic64_add(nr_pages, ¤t->mm->pinned_vm); + + return 0; +} + +static int kvm_s390_pci_aif_enable(struct zpci_dev *zdev, struct zpci_fib *fib, + bool assist) +{ + struct page *pages[1], *aibv_page, *aisb_page = NULL; + unsigned int msi_vecs, idx; + struct zpci_gaite *gaite; + unsigned long hva, bit; + struct kvm *kvm; + phys_addr_t gaddr; + int rc = 0, gisc, npages, pcount = 0; + + /* + * Interrupt forwarding is only applicable if the device is already + * enabled for interpretation + */ + if (zdev->gisa == 0) + return -EINVAL; + + kvm = zdev->kzdev->kvm; + msi_vecs = min_t(unsigned int, fib->fmt0.noi, zdev->max_msi); + + /* Get the associated forwarding ISC - if invalid, return the error */ + gisc = kvm_s390_gisc_register(kvm, fib->fmt0.isc); + if (gisc < 0) + return gisc; + + /* Replace AIBV address */ + idx = srcu_read_lock(&kvm->srcu); + hva = gfn_to_hva(kvm, gpa_to_gfn((gpa_t)fib->fmt0.aibv)); + npages = pin_user_pages_fast(hva, 1, FOLL_WRITE | FOLL_LONGTERM, pages); + srcu_read_unlock(&kvm->srcu, idx); + if (npages < 1) { + rc = -EIO; + goto out; + } + aibv_page = pages[0]; + pcount++; + gaddr = page_to_phys(aibv_page) + (fib->fmt0.aibv & ~PAGE_MASK); + fib->fmt0.aibv = gaddr; + + /* Pin the guest AISB if one was specified */ + if (fib->fmt0.sum == 1) { + idx = srcu_read_lock(&kvm->srcu); + hva = gfn_to_hva(kvm, gpa_to_gfn((gpa_t)fib->fmt0.aisb)); + npages = pin_user_pages_fast(hva, 1, FOLL_WRITE | FOLL_LONGTERM, + pages); + srcu_read_unlock(&kvm->srcu, idx); + if (npages < 1) { + rc = -EIO; + goto unpin1; + } + aisb_page = pages[0]; + pcount++; + } + + /* Account for pinned pages, roll back on failure */ + if (account_mem(pcount)) + goto unpin2; + + /* AISB must be allocated before we can fill in GAITE */ + mutex_lock(&aift->aift_lock); + bit = airq_iv_alloc_bit(aift->sbv); + if (bit == -1UL) + goto unlock; + zdev->aisb = bit; /* store the summary bit number */ + zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | + AIRQ_IV_BITLOCK | + AIRQ_IV_GUESTVEC, + phys_to_virt(fib->fmt0.aibv)); + + spin_lock_irq(&aift->gait_lock); + gaite = (struct zpci_gaite *)aift->gait + (zdev->aisb * + sizeof(struct zpci_gaite)); + + /* If assist not requested, host will get all alerts */ + if (assist) + gaite->gisa = (u32)virt_to_phys(&kvm->arch.sie_page2->gisa); + else + gaite->gisa = 0; + + gaite->gisc = fib->fmt0.isc; + gaite->count++; + gaite->aisbo = fib->fmt0.aisbo; + gaite->aisb = virt_to_phys(page_address(aisb_page) + (fib->fmt0.aisb & + ~PAGE_MASK)); + aift->kzdev[zdev->aisb] = zdev->kzdev; + spin_unlock_irq(&aift->gait_lock); + + /* Update guest FIB for re-issue */ + fib->fmt0.aisbo = zdev->aisb & 63; + fib->fmt0.aisb = virt_to_phys(aift->sbv->vector + (zdev->aisb / 64) * 8); + fib->fmt0.isc = gisc; + + /* Save some guest fib values in the host for later use */ + zdev->kzdev->fib.fmt0.isc = fib->fmt0.isc; + zdev->kzdev->fib.fmt0.aibv = fib->fmt0.aibv; + mutex_unlock(&aift->aift_lock); + + /* Issue the clp to setup the irq now */ + rc = kvm_zpci_set_airq(zdev); + return rc; + +unlock: + mutex_unlock(&aift->aift_lock); +unpin2: + if (fib->fmt0.sum == 1) + unpin_user_page(aisb_page); +unpin1: + unpin_user_page(aibv_page); +out: + return rc; +} + +static int kvm_s390_pci_aif_disable(struct zpci_dev *zdev, bool force) +{ + struct kvm_zdev *kzdev = zdev->kzdev; + struct zpci_gaite *gaite; + struct page *vpage = NULL, *spage = NULL; + int rc, pcount = 0; + u8 isc; + + if (zdev->gisa == 0) + return -EINVAL; + + mutex_lock(&aift->aift_lock); + + /* + * If the clear fails due to an error, leave now unless we know this + * device is about to go away (force) -- In that case clear the GAITE + * regardless. + */ + rc = kvm_zpci_clear_airq(zdev); + if (rc && !force) + goto out; + + if (zdev->kzdev->fib.fmt0.aibv == 0) + goto out; + spin_lock_irq(&aift->gait_lock); + gaite = (struct zpci_gaite *)aift->gait + (zdev->aisb * + sizeof(struct zpci_gaite)); + isc = gaite->gisc; + gaite->count--; + if (gaite->count == 0) { + /* Release guest AIBV and AISB */ + vpage = phys_to_page(kzdev->fib.fmt0.aibv); + if (gaite->aisb != 0) + spage = phys_to_page(gaite->aisb); + /* Clear the GAIT entry */ + gaite->aisb = 0; + gaite->gisc = 0; + gaite->aisbo = 0; + gaite->gisa = 0; + aift->kzdev[zdev->aisb] = NULL; + /* Clear zdev info */ + airq_iv_free_bit(aift->sbv, zdev->aisb); + airq_iv_release(zdev->aibv); + zdev->aisb = 0; + zdev->aibv = NULL; + } + spin_unlock_irq(&aift->gait_lock); + kvm_s390_gisc_unregister(kzdev->kvm, isc); + kzdev->fib.fmt0.isc = 0; + kzdev->fib.fmt0.aibv = 0; + + if (vpage) { + unpin_user_page(vpage); + pcount++; + } + if (spage) { + unpin_user_page(spage); + pcount++; + } + if (pcount > 0) + unaccount_mem(pcount); +out: + mutex_unlock(&aift->aift_lock); + + return rc; +} + +static int kvm_s390_pci_dev_open(struct zpci_dev *zdev) +{ + struct kvm_zdev *kzdev; + + kzdev = kzalloc(sizeof(struct kvm_zdev), GFP_KERNEL); + if (!kzdev) + return -ENOMEM; + + kzdev->zdev = zdev; + zdev->kzdev = kzdev; + + return 0; +} + +static void kvm_s390_pci_dev_release(struct zpci_dev *zdev) +{ + struct kvm_zdev *kzdev; + + kzdev = zdev->kzdev; + WARN_ON(kzdev->zdev != zdev); + zdev->kzdev = NULL; + kfree(kzdev); +} + + +/* + * Register device with the specified KVM. If interpretation facilities are + * available, enable them and let userspace indicate whether or not they will + * be used (specify SHM bit to disable). + */ +static int kvm_s390_pci_register_kvm(void *opaque, struct kvm *kvm) +{ + struct zpci_dev *zdev = opaque; + u8 status; + int rc; + + if (!zdev) + return -EINVAL; + + mutex_lock(&zdev->kzdev_lock); + + if (zdev->kzdev || zdev->gisa != 0 || !kvm) { + mutex_unlock(&zdev->kzdev_lock); + return -EINVAL; + } + + kvm_get_kvm(kvm); + + mutex_lock(&kvm->lock); + + rc = kvm_s390_pci_dev_open(zdev); + if (rc) + goto err; + + /* + * If interpretation facilities aren't available, add the device to + * the kzdev list but don't enable for interpretation. + */ + if (!kvm_s390_pci_interp_allowed()) + goto out; + + /* + * If this is the first request to use an interpreted device, make the + * necessary vcpu changes + */ + if (!kvm->arch.use_zpci_interp) + kvm_s390_vcpu_pci_enable_interp(kvm); + + if (zdev_enabled(zdev)) { + rc = zpci_disable_device(zdev); + if (rc) + goto err; + } + + /* + * Store information about the identity of the kvm guest allowed to + * access this device via interpretation to be used by host CLP + */ + zdev->gisa = (u32)virt_to_phys(&kvm->arch.sie_page2->gisa); + + rc = zpci_enable_device(zdev); + if (rc) + goto clear_gisa; + + /* Re-register the IOMMU that was already created */ + rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, + virt_to_phys(zdev->dma_table), &status); + if (rc) + goto clear_gisa; + +out: + zdev->kzdev->kvm = kvm; + + spin_lock(&kvm->arch.kzdev_list_lock); + list_add_tail(&zdev->kzdev->entry, &kvm->arch.kzdev_list); + spin_unlock(&kvm->arch.kzdev_list_lock); + + mutex_unlock(&kvm->lock); + mutex_unlock(&zdev->kzdev_lock); + return 0; + +clear_gisa: + zdev->gisa = 0; +err: + if (zdev->kzdev) + kvm_s390_pci_dev_release(zdev); + mutex_unlock(&kvm->lock); + mutex_unlock(&zdev->kzdev_lock); + kvm_put_kvm(kvm); + return rc; +} + +static void kvm_s390_pci_unregister_kvm(void *opaque) +{ + struct zpci_dev *zdev = opaque; + struct kvm *kvm; + u8 status; + + if (!zdev) + return; + + mutex_lock(&zdev->kzdev_lock); + + if (WARN_ON(!zdev->kzdev)) { + mutex_unlock(&zdev->kzdev_lock); + return; + } + + kvm = zdev->kzdev->kvm; + mutex_lock(&kvm->lock); + + /* + * A 0 gisa means interpretation was never enabled, just remove the + * device from the list. + */ + if (zdev->gisa == 0) + goto out; + + /* Forwarding must be turned off before interpretation */ + if (zdev->kzdev->fib.fmt0.aibv != 0) + kvm_s390_pci_aif_disable(zdev, true); + + /* Remove the host CLP guest designation */ + zdev->gisa = 0; + + if (zdev_enabled(zdev)) { + if (zpci_disable_device(zdev)) + goto out; + } + + if (zpci_enable_device(zdev)) + goto out; + + /* Re-register the IOMMU that was already created */ + zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, + virt_to_phys(zdev->dma_table), &status); + +out: + spin_lock(&kvm->arch.kzdev_list_lock); + list_del(&zdev->kzdev->entry); + spin_unlock(&kvm->arch.kzdev_list_lock); + kvm_s390_pci_dev_release(zdev); + + mutex_unlock(&kvm->lock); + mutex_unlock(&zdev->kzdev_lock); + + kvm_put_kvm(kvm); +} + +void kvm_s390_pci_init_list(struct kvm *kvm) +{ + spin_lock_init(&kvm->arch.kzdev_list_lock); + INIT_LIST_HEAD(&kvm->arch.kzdev_list); +} + +void kvm_s390_pci_clear_list(struct kvm *kvm) +{ + /* + * This list should already be empty, either via vfio device closures + * or kvm fd cleanup. + */ + spin_lock(&kvm->arch.kzdev_list_lock); + WARN_ON_ONCE(!list_empty(&kvm->arch.kzdev_list)); + spin_unlock(&kvm->arch.kzdev_list_lock); +} + +static struct zpci_dev *get_zdev_from_kvm_by_fh(struct kvm *kvm, u32 fh) +{ + struct zpci_dev *zdev = NULL; + struct kvm_zdev *kzdev; + + spin_lock(&kvm->arch.kzdev_list_lock); + list_for_each_entry(kzdev, &kvm->arch.kzdev_list, entry) { + if (kzdev->zdev->fh == fh) { + zdev = kzdev->zdev; + break; + } + } + spin_unlock(&kvm->arch.kzdev_list_lock); + + return zdev; +} + +static int kvm_s390_pci_zpci_reg_aen(struct zpci_dev *zdev, + struct kvm_s390_zpci_op *args) +{ + struct zpci_fib fib = {}; + bool hostflag; + + fib.fmt0.aibv = args->u.reg_aen.ibv; + fib.fmt0.isc = args->u.reg_aen.isc; + fib.fmt0.noi = args->u.reg_aen.noi; + if (args->u.reg_aen.sb != 0) { + fib.fmt0.aisb = args->u.reg_aen.sb; + fib.fmt0.aisbo = args->u.reg_aen.sbo; + fib.fmt0.sum = 1; + } else { + fib.fmt0.aisb = 0; + fib.fmt0.aisbo = 0; + fib.fmt0.sum = 0; + } + + hostflag = !(args->u.reg_aen.flags & KVM_S390_ZPCIOP_REGAEN_HOST); + return kvm_s390_pci_aif_enable(zdev, &fib, hostflag); +} + +int kvm_s390_pci_zpci_op(struct kvm *kvm, struct kvm_s390_zpci_op *args) +{ + struct kvm_zdev *kzdev; + struct zpci_dev *zdev; + int r; + + zdev = get_zdev_from_kvm_by_fh(kvm, args->fh); + if (!zdev) + return -ENODEV; + + mutex_lock(&zdev->kzdev_lock); + mutex_lock(&kvm->lock); + + kzdev = zdev->kzdev; + if (!kzdev) { + r = -ENODEV; + goto out; + } + if (kzdev->kvm != kvm) { + r = -EPERM; + goto out; + } + + switch (args->op) { + case KVM_S390_ZPCIOP_REG_AEN: + /* Fail on unknown flags */ + if (args->u.reg_aen.flags & ~KVM_S390_ZPCIOP_REGAEN_HOST) { + r = -EINVAL; + break; + } + r = kvm_s390_pci_zpci_reg_aen(zdev, args); + break; + case KVM_S390_ZPCIOP_DEREG_AEN: + r = kvm_s390_pci_aif_disable(zdev, false); + break; + default: + r = -EINVAL; + } + +out: + mutex_unlock(&kvm->lock); + mutex_unlock(&zdev->kzdev_lock); + return r; +} + +int __init kvm_s390_pci_init(void) +{ + zpci_kvm_hook.kvm_register = kvm_s390_pci_register_kvm; + zpci_kvm_hook.kvm_unregister = kvm_s390_pci_unregister_kvm; + + if (!kvm_s390_pci_interp_allowed()) + return 0; + + aift = kzalloc(sizeof(struct zpci_aift), GFP_KERNEL); + if (!aift) + return -ENOMEM; + + spin_lock_init(&aift->gait_lock); + mutex_init(&aift->aift_lock); + + return 0; +} + +void kvm_s390_pci_exit(void) +{ + zpci_kvm_hook.kvm_register = NULL; + zpci_kvm_hook.kvm_unregister = NULL; + + if (!kvm_s390_pci_interp_allowed()) + return; + + mutex_destroy(&aift->aift_lock); + + kfree(aift); +} diff --git a/arch/s390/kvm/pci.h b/arch/s390/kvm/pci.h new file mode 100644 index 000000000000..ff0972dd5e71 --- /dev/null +++ b/arch/s390/kvm/pci.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * s390 kvm PCI passthrough support + * + * Copyright IBM Corp. 2022 + * + * Author(s): Matthew Rosato <mjrosato@linux.ibm.com> + */ + +#ifndef __KVM_S390_PCI_H +#define __KVM_S390_PCI_H + +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/mutex.h> +#include <linux/pci.h> +#include <asm/airq.h> +#include <asm/cpu.h> + +struct kvm_zdev { + struct zpci_dev *zdev; + struct kvm *kvm; + struct zpci_fib fib; + struct list_head entry; +}; + +struct zpci_gaite { + u32 gisa; + u8 gisc; + u8 count; + u8 reserved; + u8 aisbo; + u64 aisb; +}; + +struct zpci_aift { + struct zpci_gaite *gait; + struct airq_iv *sbv; + struct kvm_zdev **kzdev; + spinlock_t gait_lock; /* Protects the gait, used during AEN forward */ + struct mutex aift_lock; /* Protects the other structures in aift */ +}; + +extern struct zpci_aift *aift; + +static inline struct kvm *kvm_s390_pci_si_to_kvm(struct zpci_aift *aift, + unsigned long si) +{ + if (!IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM) || !aift->kzdev || + !aift->kzdev[si]) + return NULL; + return aift->kzdev[si]->kvm; +}; + +int kvm_s390_pci_aen_init(u8 nisc); +void kvm_s390_pci_aen_exit(void); + +void kvm_s390_pci_init_list(struct kvm *kvm); +void kvm_s390_pci_clear_list(struct kvm *kvm); + +int kvm_s390_pci_zpci_op(struct kvm *kvm, struct kvm_s390_zpci_op *args); + +int __init kvm_s390_pci_init(void); +void kvm_s390_pci_exit(void); + +static inline bool kvm_s390_pci_interp_allowed(void) +{ + struct cpuid cpu_id; + + get_cpu_id(&cpu_id); + switch (cpu_id.machine) { + case 0x2817: + case 0x2818: + case 0x2827: + case 0x2828: + case 0x2964: + case 0x2965: + /* No SHM on certain machines */ + return false; + default: + return (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM) && + sclp.has_zpci_lsi && sclp.has_aeni && sclp.has_aisi && + sclp.has_aisii); + } +} + +#endif /* __KVM_S390_PCI_H */ diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index ed52ffa8d5d4..621a17fd1a1b 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -2,7 +2,7 @@ /* * handling privileged instructions * - * Copyright IBM Corp. 2008, 2018 + * Copyright IBM Corp. 2008, 2020 * * Author(s): Carsten Otte <cotte@de.ibm.com> * Christian Borntraeger <borntraeger@de.ibm.com> @@ -11,20 +11,17 @@ #include <linux/kvm.h> #include <linux/gfp.h> #include <linux/errno.h> -#include <linux/compat.h> #include <linux/mm_types.h> - +#include <linux/pgtable.h> +#include <linux/io.h> #include <asm/asm-offsets.h> #include <asm/facility.h> #include <asm/current.h> #include <asm/debug.h> #include <asm/ebcdic.h> #include <asm/sysinfo.h> -#include <asm/pgtable.h> #include <asm/page-states.h> -#include <asm/pgalloc.h> #include <asm/gmap.h> -#include <asm/io.h> #include <asm/ptrace.h> #include <asm/sclp.h> #include <asm/ap.h> @@ -60,7 +57,7 @@ static int handle_gs(struct kvm_vcpu *vcpu) if (test_kvm_facility(vcpu->kvm, 133)) { VCPU_EVENT(vcpu, 3, "%s", "ENABLE: GS (lazy)"); preempt_disable(); - __ctl_set_bit(2, 4); + local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT); current->thread.gs_cb = (struct gs_cb *)&vcpu->run->s.regs.gscb; restore_gs_cb(current->thread.gs_cb); preempt_enable(); @@ -103,7 +100,20 @@ static int handle_set_clock(struct kvm_vcpu *vcpu) return kvm_s390_inject_prog_cond(vcpu, rc); VCPU_EVENT(vcpu, 3, "SCK: setting guest TOD to 0x%llx", gtod.tod); - kvm_s390_set_tod_clock(vcpu->kvm, >od); + /* + * To set the TOD clock the kvm lock must be taken, but the vcpu lock + * is already held in handle_set_clock. The usual lock order is the + * opposite. As SCK is deprecated and should not be used in several + * cases, for example when the multiple epoch facility or TOD clock + * steering facility is installed (see Principles of Operation), a + * slow path can be used. If the lock can not be taken via try_lock, + * the instruction will be retried via -EAGAIN at a later point in + * time. + */ + if (!kvm_s390_try_set_tod_clock(vcpu->kvm, >od)) { + kvm_s390_retry_instr(vcpu); + return -EAGAIN; + } kvm_s390_set_psw_cc(vcpu, 0); return 0; @@ -270,18 +280,18 @@ static int handle_iske(struct kvm_vcpu *vcpu) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); retry: unlocked = false; - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); rc = get_guest_storage_key(current->mm, vmaddr, &key); if (rc) { - rc = fixup_user_fault(current, current->mm, vmaddr, + rc = fixup_user_fault(current->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); if (!rc) { - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); goto retry; } } - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); if (rc == -EFAULT) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); if (rc < 0) @@ -317,17 +327,17 @@ static int handle_rrbe(struct kvm_vcpu *vcpu) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); retry: unlocked = false; - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); rc = reset_guest_reference_bit(current->mm, vmaddr); if (rc < 0) { - rc = fixup_user_fault(current, current->mm, vmaddr, + rc = fixup_user_fault(current->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); if (!rc) { - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); goto retry; } } - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); if (rc == -EFAULT) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); if (rc < 0) @@ -385,19 +395,21 @@ static int handle_sske(struct kvm_vcpu *vcpu) if (kvm_is_error_hva(vmaddr)) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); rc = cond_set_guest_storage_key(current->mm, vmaddr, key, &oldkey, m3 & SSKE_NQ, m3 & SSKE_MR, m3 & SSKE_MC); if (rc < 0) { - rc = fixup_user_fault(current, current->mm, vmaddr, + rc = fixup_user_fault(current->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); rc = !rc ? -EAGAIN : rc; } - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); if (rc == -EFAULT) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); + if (rc == -EAGAIN) + continue; if (rc < 0) return rc; start += PAGE_SIZE; @@ -429,7 +441,7 @@ static int handle_ipte_interlock(struct kvm_vcpu *vcpu) vcpu->stat.instruction_ipte_interlock++; if (psw_bits(vcpu->arch.sie_block->gpsw).pstate) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); - wait_event(vcpu->kvm->arch.ipte_wq, !ipte_lock_held(vcpu)); + wait_event(vcpu->kvm->arch.ipte_wq, !ipte_lock_held(vcpu->kvm)); kvm_s390_retry_instr(vcpu); VCPU_EVENT(vcpu, 4, "%s", "retrying ipte interlock operation"); return 0; @@ -611,6 +623,7 @@ static int handle_io_inst(struct kvm_vcpu *vcpu) static int handle_pqap(struct kvm_vcpu *vcpu) { struct ap_queue_status status = {}; + crypto_hook pqap_hook; unsigned long reg0; int ret; uint8_t fc; @@ -626,10 +639,12 @@ static int handle_pqap(struct kvm_vcpu *vcpu) * available for the guest are AQIC and TAPQ with the t bit set * since we do not set IC.3 (FIII) we currently will only intercept * the AQIC function code. + * Note: running nested under z/VM can result in intercepts for other + * function codes, e.g. PQAP(QCI). We do not support this and bail out. */ reg0 = vcpu->run->s.regs.gprs[0]; fc = (reg0 >> 24) & 0xff; - if (WARN_ON_ONCE(fc != 0x03)) + if (fc != 0x03) return -EOPNOTSUPP; /* PQAP instruction is allowed for guest kernel only */ @@ -653,18 +668,20 @@ static int handle_pqap(struct kvm_vcpu *vcpu) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); /* - * Verify that the hook callback is registered, lock the owner - * and call the hook. + * If the hook callback is registered, there will be a pointer to the + * hook function pointer in the kvm_s390_crypto structure. Lock the + * owner, retrieve the hook function pointer and call the hook. */ + down_read(&vcpu->kvm->arch.crypto.pqap_hook_rwsem); if (vcpu->kvm->arch.crypto.pqap_hook) { - if (!try_module_get(vcpu->kvm->arch.crypto.pqap_hook->owner)) - return -EOPNOTSUPP; - ret = vcpu->kvm->arch.crypto.pqap_hook->hook(vcpu); - module_put(vcpu->kvm->arch.crypto.pqap_hook->owner); + pqap_hook = *vcpu->kvm->arch.crypto.pqap_hook; + ret = pqap_hook(vcpu); if (!ret && vcpu->run->s.regs.gprs[1] & 0x00ff0000) kvm_s390_set_psw_cc(vcpu, 3); + up_read(&vcpu->kvm->arch.crypto.pqap_hook_rwsem); return ret; } + up_read(&vcpu->kvm->arch.crypto.pqap_hook_rwsem); /* * A vfio_driver must register a hook. * No hook means no driver to enable the SIE CRYCB and no queues. @@ -855,10 +872,18 @@ static int handle_stsi(struct kvm_vcpu *vcpu) if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); - if (fc > 3) { - kvm_s390_set_psw_cc(vcpu, 3); - return 0; - } + /* Bailout forbidden function codes */ + if (fc > 3 && fc != 15) + goto out_no_data; + + /* + * fc 15 is provided only with + * - PTF/CPU topology support through facility 15 + * - KVM_CAP_S390_USER_STSI + */ + if (fc == 15 && (!test_kvm_facility(vcpu->kvm, 11) || + !vcpu->kvm->arch.user_stsi)) + goto out_no_data; if (vcpu->run->s.regs.gprs[0] & 0x0fffff00 || vcpu->run->s.regs.gprs[1] & 0xffff0000) @@ -872,13 +897,13 @@ static int handle_stsi(struct kvm_vcpu *vcpu) operand2 = kvm_s390_get_base_disp_s(vcpu, &ar); - if (operand2 & 0xfff) + if (!kvm_s390_pv_cpu_is_protected(vcpu) && (operand2 & 0xfff)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); switch (fc) { case 1: /* same handling for 1 and 2 */ case 2: - mem = get_zeroed_page(GFP_KERNEL); + mem = get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!mem) goto out_no_data; if (stsi((void *) mem, fc, sel1, sel2)) @@ -887,14 +912,22 @@ static int handle_stsi(struct kvm_vcpu *vcpu) case 3: if (sel1 != 2 || sel2 != 2) goto out_no_data; - mem = get_zeroed_page(GFP_KERNEL); + mem = get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!mem) goto out_no_data; handle_stsi_3_2_2(vcpu, (void *) mem); break; + case 15: /* fc 15 is fully handled in userspace */ + insert_stsi_usr_data(vcpu, operand2, ar, fc, sel1, sel2); + trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2); + return -EREMOTE; + } + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + memcpy(sida_addr(vcpu->arch.sie_block), (void *)mem, PAGE_SIZE); + rc = 0; + } else { + rc = write_guest(vcpu, operand2, ar, (void *)mem, PAGE_SIZE); } - - rc = write_guest(vcpu, operand2, ar, (void *)mem, PAGE_SIZE); if (rc) { rc = kvm_s390_inject_prog_cond(vcpu, rc); goto out; @@ -1084,15 +1117,15 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) if (rc) return rc; - down_read(¤t->mm->mmap_sem); + mmap_read_lock(current->mm); rc = cond_set_guest_storage_key(current->mm, vmaddr, key, NULL, nq, mr, mc); if (rc < 0) { - rc = fixup_user_fault(current, current->mm, vmaddr, + rc = fixup_user_fault(current->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); rc = !rc ? -EAGAIN : rc; } - up_read(¤t->mm->mmap_sem); + mmap_read_unlock(current->mm); if (rc == -EFAULT) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); if (rc == -EAGAIN) @@ -1115,7 +1148,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) } /* - * Must be called with relevant read locks held (kvm->mm->mmap_sem, kvm->srcu) + * Must be called with relevant read locks held (kvm->mm->mmap_lock, kvm->srcu) */ static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc) { @@ -1213,9 +1246,9 @@ static int handle_essa(struct kvm_vcpu *vcpu) * already correct, we do nothing and avoid the lock. */ if (vcpu->kvm->mm->context.uses_cmm == 0) { - down_write(&vcpu->kvm->mm->mmap_sem); + mmap_write_lock(vcpu->kvm->mm); vcpu->kvm->mm->context.uses_cmm = 1; - up_write(&vcpu->kvm->mm->mmap_sem); + mmap_write_unlock(vcpu->kvm->mm); } /* * If we are here, we are supposed to have CMMA enabled in @@ -1232,11 +1265,11 @@ static int handle_essa(struct kvm_vcpu *vcpu) } else { int srcu_idx; - down_read(&vcpu->kvm->mm->mmap_sem); + mmap_read_lock(vcpu->kvm->mm); srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); i = __do_essa(vcpu, orc); srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); - up_read(&vcpu->kvm->mm->mmap_sem); + mmap_read_unlock(vcpu->kvm->mm); if (i < 0) return i; /* Account for the possible extra cbrl entry */ @@ -1244,10 +1277,10 @@ static int handle_essa(struct kvm_vcpu *vcpu) } vcpu->arch.sie_block->cbrlo &= PAGE_MASK; /* reset nceo */ cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo); - down_read(&gmap->mm->mmap_sem); + mmap_read_lock(gmap->mm); for (i = 0; i < entries; ++i) __gmap_zap(gmap, cbrlo[i]); - up_read(&gmap->mm->mmap_sem); + mmap_read_unlock(gmap->mm); return 0; } @@ -1432,10 +1465,11 @@ int kvm_s390_handle_eb(struct kvm_vcpu *vcpu) static int handle_tprot(struct kvm_vcpu *vcpu) { - u64 address1, address2; - unsigned long hva, gpa; - int ret = 0, cc = 0; + u64 address, operand2; + unsigned long gpa; + u8 access_key; bool writable; + int ret, cc; u8 ar; vcpu->stat.instruction_tprot++; @@ -1443,45 +1477,48 @@ static int handle_tprot(struct kvm_vcpu *vcpu) if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); - kvm_s390_get_base_disp_sse(vcpu, &address1, &address2, &ar, NULL); + kvm_s390_get_base_disp_sse(vcpu, &address, &operand2, &ar, NULL); + access_key = (operand2 & 0xf0) >> 4; - /* we only handle the Linux memory detection case: - * access key == 0 - * everything else goes to userspace. */ - if (address2 & 0xf0) - return -EOPNOTSUPP; if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT) - ipte_lock(vcpu); - ret = guest_translate_address(vcpu, address1, ar, &gpa, GACC_STORE); - if (ret == PGM_PROTECTION) { + ipte_lock(vcpu->kvm); + + ret = guest_translate_address_with_key(vcpu, address, ar, &gpa, + GACC_STORE, access_key); + if (ret == 0) { + gfn_to_hva_prot(vcpu->kvm, gpa_to_gfn(gpa), &writable); + } else if (ret == PGM_PROTECTION) { + writable = false; /* Write protected? Try again with read-only... */ - cc = 1; - ret = guest_translate_address(vcpu, address1, ar, &gpa, - GACC_FETCH); + ret = guest_translate_address_with_key(vcpu, address, ar, &gpa, + GACC_FETCH, access_key); } - if (ret) { - if (ret == PGM_ADDRESSING || ret == PGM_TRANSLATION_SPEC) { - ret = kvm_s390_inject_program_int(vcpu, ret); - } else if (ret > 0) { - /* Translation not available */ - kvm_s390_set_psw_cc(vcpu, 3); + if (ret >= 0) { + cc = -1; + + /* Fetching permitted; storing permitted */ + if (ret == 0 && writable) + cc = 0; + /* Fetching permitted; storing not permitted */ + else if (ret == 0 && !writable) + cc = 1; + /* Fetching not permitted; storing not permitted */ + else if (ret == PGM_PROTECTION) + cc = 2; + /* Translation not available */ + else if (ret != PGM_ADDRESSING && ret != PGM_TRANSLATION_SPEC) + cc = 3; + + if (cc != -1) { + kvm_s390_set_psw_cc(vcpu, cc); ret = 0; + } else { + ret = kvm_s390_inject_program_int(vcpu, ret); } - goto out_unlock; } - hva = gfn_to_hva_prot(vcpu->kvm, gpa_to_gfn(gpa), &writable); - if (kvm_is_error_hva(hva)) { - ret = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - } else { - if (!writable) - cc = 1; /* Write not permitted ==> read-only */ - kvm_s390_set_psw_cc(vcpu, cc); - /* Note: CC2 only occurs for storage keys (not supported yet) */ - } -out_unlock: if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT) - ipte_unlock(vcpu); + ipte_unlock(vcpu->kvm); return ret; } diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c new file mode 100644 index 000000000000..75e81ba26d04 --- /dev/null +++ b/arch/s390/kvm/pv.c @@ -0,0 +1,894 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hosting Protected Virtual Machines + * + * Copyright IBM Corp. 2019, 2020 + * Author(s): Janosch Frank <frankja@linux.ibm.com> + */ +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/minmax.h> +#include <linux/pagemap.h> +#include <linux/sched/signal.h> +#include <asm/gmap.h> +#include <asm/uv.h> +#include <asm/mman.h> +#include <linux/pagewalk.h> +#include <linux/sched/mm.h> +#include <linux/mmu_notifier.h> +#include "kvm-s390.h" + +bool kvm_s390_pv_is_protected(struct kvm *kvm) +{ + lockdep_assert_held(&kvm->lock); + return !!kvm_s390_pv_get_handle(kvm); +} +EXPORT_SYMBOL_GPL(kvm_s390_pv_is_protected); + +bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu) +{ + lockdep_assert_held(&vcpu->mutex); + return !!kvm_s390_pv_cpu_get_handle(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_s390_pv_cpu_is_protected); + +/** + * struct pv_vm_to_be_destroyed - Represents a protected VM that needs to + * be destroyed + * + * @list: list head for the list of leftover VMs + * @old_gmap_table: the gmap table of the leftover protected VM + * @handle: the handle of the leftover protected VM + * @stor_var: pointer to the variable storage of the leftover protected VM + * @stor_base: address of the base storage of the leftover protected VM + * + * Represents a protected VM that is still registered with the Ultravisor, + * but which does not correspond any longer to an active KVM VM. It should + * be destroyed at some point later, either asynchronously or when the + * process terminates. + */ +struct pv_vm_to_be_destroyed { + struct list_head list; + unsigned long old_gmap_table; + u64 handle; + void *stor_var; + unsigned long stor_base; +}; + +static void kvm_s390_clear_pv_state(struct kvm *kvm) +{ + kvm->arch.pv.handle = 0; + kvm->arch.pv.guest_len = 0; + kvm->arch.pv.stor_base = 0; + kvm->arch.pv.stor_var = NULL; +} + +int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc) +{ + int cc; + + if (!kvm_s390_pv_cpu_get_handle(vcpu)) + return 0; + + cc = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu), UVC_CMD_DESTROY_SEC_CPU, rc, rrc); + + KVM_UV_EVENT(vcpu->kvm, 3, "PROTVIRT DESTROY VCPU %d: rc %x rrc %x", + vcpu->vcpu_id, *rc, *rrc); + WARN_ONCE(cc, "protvirt destroy cpu failed rc %x rrc %x", *rc, *rrc); + + /* Intended memory leak for something that should never happen. */ + if (!cc) + free_pages(vcpu->arch.pv.stor_base, + get_order(uv_info.guest_cpu_stor_len)); + + free_page((unsigned long)sida_addr(vcpu->arch.sie_block)); + vcpu->arch.sie_block->pv_handle_cpu = 0; + vcpu->arch.sie_block->pv_handle_config = 0; + memset(&vcpu->arch.pv, 0, sizeof(vcpu->arch.pv)); + vcpu->arch.sie_block->sdf = 0; + /* + * The sidad field (for sdf == 2) is now the gbea field (for sdf == 0). + * Use the reset value of gbea to avoid leaking the kernel pointer of + * the just freed sida. + */ + vcpu->arch.sie_block->gbea = 1; + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + + return cc ? EIO : 0; +} + +int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc) +{ + struct uv_cb_csc uvcb = { + .header.cmd = UVC_CMD_CREATE_SEC_CPU, + .header.len = sizeof(uvcb), + }; + void *sida_addr; + int cc; + + if (kvm_s390_pv_cpu_get_handle(vcpu)) + return -EINVAL; + + vcpu->arch.pv.stor_base = __get_free_pages(GFP_KERNEL_ACCOUNT, + get_order(uv_info.guest_cpu_stor_len)); + if (!vcpu->arch.pv.stor_base) + return -ENOMEM; + + /* Input */ + uvcb.guest_handle = kvm_s390_pv_get_handle(vcpu->kvm); + uvcb.num = vcpu->arch.sie_block->icpua; + uvcb.state_origin = virt_to_phys(vcpu->arch.sie_block); + uvcb.stor_origin = virt_to_phys((void *)vcpu->arch.pv.stor_base); + + /* Alloc Secure Instruction Data Area Designation */ + sida_addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!sida_addr) { + free_pages(vcpu->arch.pv.stor_base, + get_order(uv_info.guest_cpu_stor_len)); + return -ENOMEM; + } + vcpu->arch.sie_block->sidad = virt_to_phys(sida_addr); + + cc = uv_call(0, (u64)&uvcb); + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + KVM_UV_EVENT(vcpu->kvm, 3, + "PROTVIRT CREATE VCPU: cpu %d handle %llx rc %x rrc %x", + vcpu->vcpu_id, uvcb.cpu_handle, uvcb.header.rc, + uvcb.header.rrc); + + if (cc) { + u16 dummy; + + kvm_s390_pv_destroy_cpu(vcpu, &dummy, &dummy); + return -EIO; + } + + /* Output */ + vcpu->arch.pv.handle = uvcb.cpu_handle; + vcpu->arch.sie_block->pv_handle_cpu = uvcb.cpu_handle; + vcpu->arch.sie_block->pv_handle_config = kvm_s390_pv_get_handle(vcpu->kvm); + vcpu->arch.sie_block->sdf = 2; + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + return 0; +} + +/* only free resources when the destroy was successful */ +static void kvm_s390_pv_dealloc_vm(struct kvm *kvm) +{ + vfree(kvm->arch.pv.stor_var); + free_pages(kvm->arch.pv.stor_base, + get_order(uv_info.guest_base_stor_len)); + kvm_s390_clear_pv_state(kvm); +} + +static int kvm_s390_pv_alloc_vm(struct kvm *kvm) +{ + unsigned long base = uv_info.guest_base_stor_len; + unsigned long virt = uv_info.guest_virt_var_stor_len; + unsigned long npages = 0, vlen = 0; + + kvm->arch.pv.stor_var = NULL; + kvm->arch.pv.stor_base = __get_free_pages(GFP_KERNEL_ACCOUNT, get_order(base)); + if (!kvm->arch.pv.stor_base) + return -ENOMEM; + + /* + * Calculate current guest storage for allocation of the + * variable storage, which is based on the length in MB. + * + * Slots are sorted by GFN + */ + mutex_lock(&kvm->slots_lock); + npages = kvm_s390_get_gfn_end(kvm_memslots(kvm)); + mutex_unlock(&kvm->slots_lock); + + kvm->arch.pv.guest_len = npages * PAGE_SIZE; + + /* Allocate variable storage */ + vlen = ALIGN(virt * ((npages * PAGE_SIZE) / HPAGE_SIZE), PAGE_SIZE); + vlen += uv_info.guest_virt_base_stor_len; + kvm->arch.pv.stor_var = vzalloc(vlen); + if (!kvm->arch.pv.stor_var) + goto out_err; + return 0; + +out_err: + kvm_s390_pv_dealloc_vm(kvm); + return -ENOMEM; +} + +/** + * kvm_s390_pv_dispose_one_leftover - Clean up one leftover protected VM. + * @kvm: the KVM that was associated with this leftover protected VM + * @leftover: details about the leftover protected VM that needs a clean up + * @rc: the RC code of the Destroy Secure Configuration UVC + * @rrc: the RRC code of the Destroy Secure Configuration UVC + * + * Destroy one leftover protected VM. + * On success, kvm->mm->context.protected_count will be decremented atomically + * and all other resources used by the VM will be freed. + * + * Return: 0 in case of success, otherwise 1 + */ +static int kvm_s390_pv_dispose_one_leftover(struct kvm *kvm, + struct pv_vm_to_be_destroyed *leftover, + u16 *rc, u16 *rrc) +{ + int cc; + + /* It used the destroy-fast UVC, nothing left to do here */ + if (!leftover->handle) + goto done_fast; + cc = uv_cmd_nodata(leftover->handle, UVC_CMD_DESTROY_SEC_CONF, rc, rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY LEFTOVER VM: rc %x rrc %x", *rc, *rrc); + WARN_ONCE(cc, "protvirt destroy leftover vm failed rc %x rrc %x", *rc, *rrc); + if (cc) + return cc; + /* + * Intentionally leak unusable memory. If the UVC fails, the memory + * used for the VM and its metadata is permanently unusable. + * This can only happen in case of a serious KVM or hardware bug; it + * is not expected to happen in normal operation. + */ + free_pages(leftover->stor_base, get_order(uv_info.guest_base_stor_len)); + free_pages(leftover->old_gmap_table, CRST_ALLOC_ORDER); + vfree(leftover->stor_var); +done_fast: + atomic_dec(&kvm->mm->context.protected_count); + return 0; +} + +/** + * kvm_s390_destroy_lower_2g - Destroy the first 2GB of protected guest memory. + * @kvm: the VM whose memory is to be cleared. + * + * Destroy the first 2GB of guest memory, to avoid prefix issues after reboot. + * The CPUs of the protected VM need to be destroyed beforehand. + */ +static void kvm_s390_destroy_lower_2g(struct kvm *kvm) +{ + const unsigned long pages_2g = SZ_2G / PAGE_SIZE; + struct kvm_memory_slot *slot; + unsigned long len; + int srcu_idx; + + srcu_idx = srcu_read_lock(&kvm->srcu); + + /* Take the memslot containing guest absolute address 0 */ + slot = gfn_to_memslot(kvm, 0); + /* Clear all slots or parts thereof that are below 2GB */ + while (slot && slot->base_gfn < pages_2g) { + len = min_t(u64, slot->npages, pages_2g - slot->base_gfn) * PAGE_SIZE; + s390_uv_destroy_range(kvm->mm, slot->userspace_addr, slot->userspace_addr + len); + /* Take the next memslot */ + slot = gfn_to_memslot(kvm, slot->base_gfn + slot->npages); + } + + srcu_read_unlock(&kvm->srcu, srcu_idx); +} + +static int kvm_s390_pv_deinit_vm_fast(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct uv_cb_destroy_fast uvcb = { + .header.cmd = UVC_CMD_DESTROY_SEC_CONF_FAST, + .header.len = sizeof(uvcb), + .handle = kvm_s390_pv_get_handle(kvm), + }; + int cc; + + cc = uv_call_sched(0, (u64)&uvcb); + if (rc) + *rc = uvcb.header.rc; + if (rrc) + *rrc = uvcb.header.rrc; + WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); + KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM FAST: rc %x rrc %x", + uvcb.header.rc, uvcb.header.rrc); + WARN_ONCE(cc && uvcb.header.rc != 0x104, + "protvirt destroy vm fast failed handle %llx rc %x rrc %x", + kvm_s390_pv_get_handle(kvm), uvcb.header.rc, uvcb.header.rrc); + /* Intended memory leak on "impossible" error */ + if (!cc) + kvm_s390_pv_dealloc_vm(kvm); + return cc ? -EIO : 0; +} + +static inline bool is_destroy_fast_available(void) +{ + return test_bit_inv(BIT_UVC_CMD_DESTROY_SEC_CONF_FAST, uv_info.inst_calls_list); +} + +/** + * kvm_s390_pv_set_aside - Set aside a protected VM for later teardown. + * @kvm: the VM + * @rc: return value for the RC field of the UVCB + * @rrc: return value for the RRC field of the UVCB + * + * Set aside the protected VM for a subsequent teardown. The VM will be able + * to continue immediately as a non-secure VM, and the information needed to + * properly tear down the protected VM is set aside. If another protected VM + * was already set aside without starting its teardown, this function will + * fail. + * The CPUs of the protected VM need to be destroyed beforehand. + * + * Context: kvm->lock needs to be held + * + * Return: 0 in case of success, -EINVAL if another protected VM was already set + * aside, -ENOMEM if the system ran out of memory. + */ +int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct pv_vm_to_be_destroyed *priv; + int res = 0; + + lockdep_assert_held(&kvm->lock); + /* + * If another protected VM was already prepared for teardown, refuse. + * A normal deinitialization has to be performed instead. + */ + if (kvm->arch.pv.set_aside) + return -EINVAL; + + /* Guest with segment type ASCE, refuse to destroy asynchronously */ + if ((kvm->arch.gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) + return -EINVAL; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + if (is_destroy_fast_available()) { + res = kvm_s390_pv_deinit_vm_fast(kvm, rc, rrc); + } else { + priv->stor_var = kvm->arch.pv.stor_var; + priv->stor_base = kvm->arch.pv.stor_base; + priv->handle = kvm_s390_pv_get_handle(kvm); + priv->old_gmap_table = (unsigned long)kvm->arch.gmap->table; + WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); + if (s390_replace_asce(kvm->arch.gmap)) + res = -ENOMEM; + } + + if (res) { + kfree(priv); + return res; + } + + kvm_s390_destroy_lower_2g(kvm); + kvm_s390_clear_pv_state(kvm); + kvm->arch.pv.set_aside = priv; + + *rc = UVC_RC_EXECUTED; + *rrc = 42; + return 0; +} + +/** + * kvm_s390_pv_deinit_vm - Deinitialize the current protected VM + * @kvm: the KVM whose protected VM needs to be deinitialized + * @rc: the RC code of the UVC + * @rrc: the RRC code of the UVC + * + * Deinitialize the current protected VM. This function will destroy and + * cleanup the current protected VM, but it will not cleanup the guest + * memory. This function should only be called when the protected VM has + * just been created and therefore does not have any guest memory, or when + * the caller cleans up the guest memory separately. + * + * This function should not fail, but if it does, the donated memory must + * not be freed. + * + * Context: kvm->lock needs to be held + * + * Return: 0 in case of success, otherwise -EIO + */ +int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + int cc; + + cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_DESTROY_SEC_CONF, rc, rrc); + WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); + if (!cc) { + atomic_dec(&kvm->mm->context.protected_count); + kvm_s390_pv_dealloc_vm(kvm); + } else { + /* Intended memory leak on "impossible" error */ + s390_replace_asce(kvm->arch.gmap); + } + KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM: rc %x rrc %x", *rc, *rrc); + WARN_ONCE(cc, "protvirt destroy vm failed rc %x rrc %x", *rc, *rrc); + + return cc ? -EIO : 0; +} + +/** + * kvm_s390_pv_deinit_cleanup_all - Clean up all protected VMs associated + * with a specific KVM. + * @kvm: the KVM to be cleaned up + * @rc: the RC code of the first failing UVC + * @rrc: the RRC code of the first failing UVC + * + * This function will clean up all protected VMs associated with a KVM. + * This includes the active one, the one prepared for deinitialization with + * kvm_s390_pv_set_aside, and any still pending in the need_cleanup list. + * + * Context: kvm->lock needs to be held unless being called from + * kvm_arch_destroy_vm. + * + * Return: 0 if all VMs are successfully cleaned up, otherwise -EIO + */ +int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct pv_vm_to_be_destroyed *cur; + bool need_zap = false; + u16 _rc, _rrc; + int cc = 0; + + /* + * Nothing to do if the counter was already 0. Otherwise make sure + * the counter does not reach 0 before calling s390_uv_destroy_range. + */ + if (!atomic_inc_not_zero(&kvm->mm->context.protected_count)) + return 0; + + *rc = 1; + /* If the current VM is protected, destroy it */ + if (kvm_s390_pv_get_handle(kvm)) { + cc = kvm_s390_pv_deinit_vm(kvm, rc, rrc); + need_zap = true; + } + + /* If a previous protected VM was set aside, put it in the need_cleanup list */ + if (kvm->arch.pv.set_aside) { + list_add(kvm->arch.pv.set_aside, &kvm->arch.pv.need_cleanup); + kvm->arch.pv.set_aside = NULL; + } + + /* Cleanup all protected VMs in the need_cleanup list */ + while (!list_empty(&kvm->arch.pv.need_cleanup)) { + cur = list_first_entry(&kvm->arch.pv.need_cleanup, typeof(*cur), list); + need_zap = true; + if (kvm_s390_pv_dispose_one_leftover(kvm, cur, &_rc, &_rrc)) { + cc = 1; + /* + * Only return the first error rc and rrc, so make + * sure it is not overwritten. All destroys will + * additionally be reported via KVM_UV_EVENT(). + */ + if (*rc == UVC_RC_EXECUTED) { + *rc = _rc; + *rrc = _rrc; + } + } + list_del(&cur->list); + kfree(cur); + } + + /* + * If the mm still has a mapping, try to mark all its pages as + * accessible. The counter should not reach zero before this + * cleanup has been performed. + */ + if (need_zap && mmget_not_zero(kvm->mm)) { + s390_uv_destroy_range(kvm->mm, 0, TASK_SIZE); + mmput(kvm->mm); + } + + /* Now the counter can safely reach 0 */ + atomic_dec(&kvm->mm->context.protected_count); + return cc ? -EIO : 0; +} + +/** + * kvm_s390_pv_deinit_aside_vm - Teardown a previously set aside protected VM. + * @kvm: the VM previously associated with the protected VM + * @rc: return value for the RC field of the UVCB + * @rrc: return value for the RRC field of the UVCB + * + * Tear down the protected VM that had been previously prepared for teardown + * using kvm_s390_pv_set_aside_vm. Ideally this should be called by + * userspace asynchronously from a separate thread. + * + * Context: kvm->lock must not be held. + * + * Return: 0 in case of success, -EINVAL if no protected VM had been + * prepared for asynchronous teardowm, -EIO in case of other errors. + */ +int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct pv_vm_to_be_destroyed *p; + int ret = 0; + + lockdep_assert_not_held(&kvm->lock); + mutex_lock(&kvm->lock); + p = kvm->arch.pv.set_aside; + kvm->arch.pv.set_aside = NULL; + mutex_unlock(&kvm->lock); + if (!p) + return -EINVAL; + + /* When a fatal signal is received, stop immediately */ + if (s390_uv_destroy_range_interruptible(kvm->mm, 0, TASK_SIZE_MAX)) + goto done; + if (kvm_s390_pv_dispose_one_leftover(kvm, p, rc, rrc)) + ret = -EIO; + kfree(p); + p = NULL; +done: + /* + * p is not NULL if we aborted because of a fatal signal, in which + * case queue the leftover for later cleanup. + */ + if (p) { + mutex_lock(&kvm->lock); + list_add(&p->list, &kvm->arch.pv.need_cleanup); + mutex_unlock(&kvm->lock); + /* Did not finish, but pretend things went well */ + *rc = UVC_RC_EXECUTED; + *rrc = 42; + } + return ret; +} + +static void kvm_s390_pv_mmu_notifier_release(struct mmu_notifier *subscription, + struct mm_struct *mm) +{ + struct kvm *kvm = container_of(subscription, struct kvm, arch.pv.mmu_notifier); + u16 dummy; + int r; + + /* + * No locking is needed since this is the last thread of the last user of this + * struct mm. + * When the struct kvm gets deinitialized, this notifier is also + * unregistered. This means that if this notifier runs, then the + * struct kvm is still valid. + */ + r = kvm_s390_cpus_from_pv(kvm, &dummy, &dummy); + if (!r && is_destroy_fast_available() && kvm_s390_pv_get_handle(kvm)) + kvm_s390_pv_deinit_vm_fast(kvm, &dummy, &dummy); +} + +static const struct mmu_notifier_ops kvm_s390_pv_mmu_notifier_ops = { + .release = kvm_s390_pv_mmu_notifier_release, +}; + +int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct uv_cb_cgc uvcb = { + .header.cmd = UVC_CMD_CREATE_SEC_CONF, + .header.len = sizeof(uvcb) + }; + int cc, ret; + u16 dummy; + + ret = kvm_s390_pv_alloc_vm(kvm); + if (ret) + return ret; + + /* Inputs */ + uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */ + uvcb.guest_stor_len = kvm->arch.pv.guest_len; + uvcb.guest_asce = kvm->arch.gmap->asce; + uvcb.guest_sca = virt_to_phys(kvm->arch.sca); + uvcb.conf_base_stor_origin = + virt_to_phys((void *)kvm->arch.pv.stor_base); + uvcb.conf_virt_stor_origin = (u64)kvm->arch.pv.stor_var; + uvcb.flags.ap_allow_instr = kvm->arch.model.uv_feat_guest.ap; + uvcb.flags.ap_instr_intr = kvm->arch.model.uv_feat_guest.ap_intr; + + cc = uv_call_sched(0, (u64)&uvcb); + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + KVM_UV_EVENT(kvm, 3, "PROTVIRT CREATE VM: handle %llx len %llx rc %x rrc %x flags %04x", + uvcb.guest_handle, uvcb.guest_stor_len, *rc, *rrc, uvcb.flags.raw); + + /* Outputs */ + kvm->arch.pv.handle = uvcb.guest_handle; + + atomic_inc(&kvm->mm->context.protected_count); + if (cc) { + if (uvcb.header.rc & UVC_RC_NEED_DESTROY) { + kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy); + } else { + atomic_dec(&kvm->mm->context.protected_count); + kvm_s390_pv_dealloc_vm(kvm); + } + return -EIO; + } + kvm->arch.gmap->guest_handle = uvcb.guest_handle; + /* Add the notifier only once. No races because we hold kvm->lock */ + if (kvm->arch.pv.mmu_notifier.ops != &kvm_s390_pv_mmu_notifier_ops) { + kvm->arch.pv.mmu_notifier.ops = &kvm_s390_pv_mmu_notifier_ops; + mmu_notifier_register(&kvm->arch.pv.mmu_notifier, kvm->mm); + } + return 0; +} + +int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc, + u16 *rrc) +{ + struct uv_cb_ssc uvcb = { + .header.cmd = UVC_CMD_SET_SEC_CONF_PARAMS, + .header.len = sizeof(uvcb), + .sec_header_origin = (u64)hdr, + .sec_header_len = length, + .guest_handle = kvm_s390_pv_get_handle(kvm), + }; + int cc = uv_call(0, (u64)&uvcb); + + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + KVM_UV_EVENT(kvm, 3, "PROTVIRT VM SET PARMS: rc %x rrc %x", + *rc, *rrc); + return cc ? -EINVAL : 0; +} + +static int unpack_one(struct kvm *kvm, unsigned long addr, u64 tweak, + u64 offset, u16 *rc, u16 *rrc) +{ + struct uv_cb_unp uvcb = { + .header.cmd = UVC_CMD_UNPACK_IMG, + .header.len = sizeof(uvcb), + .guest_handle = kvm_s390_pv_get_handle(kvm), + .gaddr = addr, + .tweak[0] = tweak, + .tweak[1] = offset, + }; + int ret = gmap_make_secure(kvm->arch.gmap, addr, &uvcb); + + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + + if (ret && ret != -EAGAIN) + KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: failed addr %llx with rc %x rrc %x", + uvcb.gaddr, *rc, *rrc); + return ret; +} + +int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size, + unsigned long tweak, u16 *rc, u16 *rrc) +{ + u64 offset = 0; + int ret = 0; + + if (addr & ~PAGE_MASK || !size || size & ~PAGE_MASK) + return -EINVAL; + + KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: start addr %lx size %lx", + addr, size); + + while (offset < size) { + ret = unpack_one(kvm, addr, tweak, offset, rc, rrc); + if (ret == -EAGAIN) { + cond_resched(); + if (fatal_signal_pending(current)) + break; + continue; + } + if (ret) + break; + addr += PAGE_SIZE; + offset += PAGE_SIZE; + } + if (!ret) + KVM_UV_EVENT(kvm, 3, "%s", "PROTVIRT VM UNPACK: successful"); + return ret; +} + +int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state) +{ + struct uv_cb_cpu_set_state uvcb = { + .header.cmd = UVC_CMD_CPU_SET_STATE, + .header.len = sizeof(uvcb), + .cpu_handle = kvm_s390_pv_cpu_get_handle(vcpu), + .state = state, + }; + int cc; + + cc = uv_call(0, (u64)&uvcb); + KVM_UV_EVENT(vcpu->kvm, 3, "PROTVIRT SET CPU %d STATE %d rc %x rrc %x", + vcpu->vcpu_id, state, uvcb.header.rc, uvcb.header.rrc); + if (cc) + return -EINVAL; + return 0; +} + +int kvm_s390_pv_dump_cpu(struct kvm_vcpu *vcpu, void *buff, u16 *rc, u16 *rrc) +{ + struct uv_cb_dump_cpu uvcb = { + .header.cmd = UVC_CMD_DUMP_CPU, + .header.len = sizeof(uvcb), + .cpu_handle = vcpu->arch.pv.handle, + .dump_area_origin = (u64)buff, + }; + int cc; + + cc = uv_call_sched(0, (u64)&uvcb); + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + return cc; +} + +/* Size of the cache for the storage state dump data. 1MB for now */ +#define DUMP_BUFF_LEN HPAGE_SIZE + +/** + * kvm_s390_pv_dump_stor_state + * + * @kvm: pointer to the guest's KVM struct + * @buff_user: Userspace pointer where we will write the results to + * @gaddr: Starting absolute guest address for which the storage state + * is requested. + * @buff_user_len: Length of the buff_user buffer + * @rc: Pointer to where the uvcb return code is stored + * @rrc: Pointer to where the uvcb return reason code is stored + * + * Stores buff_len bytes of tweak component values to buff_user + * starting with the 1MB block specified by the absolute guest address + * (gaddr). The gaddr pointer will be updated with the last address + * for which data was written when returning to userspace. buff_user + * might be written to even if an error rc is returned. For instance + * if we encounter a fault after writing the first page of data. + * + * Context: kvm->lock needs to be held + * + * Return: + * 0 on success + * -ENOMEM if allocating the cache fails + * -EINVAL if gaddr is not aligned to 1MB + * -EINVAL if buff_user_len is not aligned to uv_info.conf_dump_storage_state_len + * -EINVAL if the UV call fails, rc and rrc will be set in this case + * -EFAULT if copying the result to buff_user failed + */ +int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user, + u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc) +{ + struct uv_cb_dump_stor_state uvcb = { + .header.cmd = UVC_CMD_DUMP_CONF_STOR_STATE, + .header.len = sizeof(uvcb), + .config_handle = kvm->arch.pv.handle, + .gaddr = *gaddr, + .dump_area_origin = 0, + }; + const u64 increment_len = uv_info.conf_dump_storage_state_len; + size_t buff_kvm_size; + size_t size_done = 0; + u8 *buff_kvm = NULL; + int cc, ret; + + ret = -EINVAL; + /* UV call processes 1MB guest storage chunks at a time */ + if (!IS_ALIGNED(*gaddr, HPAGE_SIZE)) + goto out; + + /* + * We provide the storage state for 1MB chunks of guest + * storage. The buffer will need to be aligned to + * conf_dump_storage_state_len so we don't end on a partial + * chunk. + */ + if (!buff_user_len || + !IS_ALIGNED(buff_user_len, increment_len)) + goto out; + + /* + * Allocate a buffer from which we will later copy to the user + * process. We don't want userspace to dictate our buffer size + * so we limit it to DUMP_BUFF_LEN. + */ + ret = -ENOMEM; + buff_kvm_size = min_t(u64, buff_user_len, DUMP_BUFF_LEN); + buff_kvm = vzalloc(buff_kvm_size); + if (!buff_kvm) + goto out; + + ret = 0; + uvcb.dump_area_origin = (u64)buff_kvm; + /* We will loop until the user buffer is filled or an error occurs */ + do { + /* Get 1MB worth of guest storage state data */ + cc = uv_call_sched(0, (u64)&uvcb); + + /* All or nothing */ + if (cc) { + ret = -EINVAL; + break; + } + + size_done += increment_len; + uvcb.dump_area_origin += increment_len; + buff_user_len -= increment_len; + uvcb.gaddr += HPAGE_SIZE; + + /* KVM Buffer full, time to copy to the process */ + if (!buff_user_len || size_done == DUMP_BUFF_LEN) { + if (copy_to_user(buff_user, buff_kvm, size_done)) { + ret = -EFAULT; + break; + } + + buff_user += size_done; + size_done = 0; + uvcb.dump_area_origin = (u64)buff_kvm; + } + } while (buff_user_len); + + /* Report back where we ended dumping */ + *gaddr = uvcb.gaddr; + + /* Lets only log errors, we don't want to spam */ +out: + if (ret) + KVM_UV_EVENT(kvm, 3, + "PROTVIRT DUMP STORAGE STATE: addr %llx ret %d, uvcb rc %x rrc %x", + uvcb.gaddr, ret, uvcb.header.rc, uvcb.header.rrc); + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + vfree(buff_kvm); + + return ret; +} + +/** + * kvm_s390_pv_dump_complete + * + * @kvm: pointer to the guest's KVM struct + * @buff_user: Userspace pointer where we will write the results to + * @rc: Pointer to where the uvcb return code is stored + * @rrc: Pointer to where the uvcb return reason code is stored + * + * Completes the dumping operation and writes the completion data to + * user space. + * + * Context: kvm->lock needs to be held + * + * Return: + * 0 on success + * -ENOMEM if allocating the completion buffer fails + * -EINVAL if the UV call fails, rc and rrc will be set in this case + * -EFAULT if copying the result to buff_user failed + */ +int kvm_s390_pv_dump_complete(struct kvm *kvm, void __user *buff_user, + u16 *rc, u16 *rrc) +{ + struct uv_cb_dump_complete complete = { + .header.len = sizeof(complete), + .header.cmd = UVC_CMD_DUMP_COMPLETE, + .config_handle = kvm_s390_pv_get_handle(kvm), + }; + u64 *compl_data; + int ret; + + /* Allocate dump area */ + compl_data = vzalloc(uv_info.conf_dump_finalize_len); + if (!compl_data) + return -ENOMEM; + complete.dump_area_origin = (u64)compl_data; + + ret = uv_call_sched(0, (u64)&complete); + *rc = complete.header.rc; + *rrc = complete.header.rrc; + KVM_UV_EVENT(kvm, 3, "PROTVIRT DUMP COMPLETE: rc %x rrc %x", + complete.header.rc, complete.header.rrc); + + if (!ret) { + /* + * kvm_s390_pv_dealloc_vm() will also (mem)set + * this to false on a reboot or other destroy + * operation for this vm. + */ + kvm->arch.pv.dumping = false; + kvm_s390_vcpu_unblock_all(kvm); + ret = copy_to_user(buff_user, compl_data, uv_info.conf_dump_finalize_len); + if (ret) + ret = -EFAULT; + } + vfree(compl_data); + /* If the UVC returned an error, translate it to -EINVAL */ + if (ret > 0) + ret = -EINVAL; + return ret; +} diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 683036c1c92a..d9696b530064 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -151,22 +151,10 @@ static int __sigp_stop_and_store_status(struct kvm_vcpu *vcpu, static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter, u64 *status_reg) { - unsigned int i; - struct kvm_vcpu *v; - bool all_stopped = true; - - kvm_for_each_vcpu(i, v, vcpu->kvm) { - if (v == vcpu) - continue; - if (!is_vcpu_stopped(v)) - all_stopped = false; - } - *status_reg &= 0xffffffff00000000UL; /* Reject set arch order, with czam we're always in z/Arch mode. */ - *status_reg |= (all_stopped ? SIGP_STATUS_INVALID_PARAMETER : - SIGP_STATUS_INCORRECT_STATE); + *status_reg |= SIGP_STATUS_INVALID_PARAMETER; return SIGP_CC_STATUS_STORED; } @@ -288,6 +276,34 @@ static int handle_sigp_dst(struct kvm_vcpu *vcpu, u8 order_code, if (!dst_vcpu) return SIGP_CC_NOT_OPERATIONAL; + /* + * SIGP RESTART, SIGP STOP, and SIGP STOP AND STORE STATUS orders + * are processed asynchronously. Until the affected VCPU finishes + * its work and calls back into KVM to clear the (RESTART or STOP) + * interrupt, we need to return any new non-reset orders "busy". + * + * This is important because a single VCPU could issue: + * 1) SIGP STOP $DESTINATION + * 2) SIGP SENSE $DESTINATION + * + * If the SIGP SENSE would not be rejected as "busy", it could + * return an incorrect answer as to whether the VCPU is STOPPED + * or OPERATING. + */ + if (order_code != SIGP_INITIAL_CPU_RESET && + order_code != SIGP_CPU_RESET) { + /* + * Lockless check. Both SIGP STOP and SIGP (RE)START + * properly synchronize everything while processing + * their orders, while the guest cannot observe a + * difference when issuing other orders from two + * different VCPUs. + */ + if (kvm_s390_is_stop_irq_pending(dst_vcpu) || + kvm_s390_is_restart_irq_pending(dst_vcpu)) + return SIGP_CC_BUSY; + } + switch (order_code) { case SIGP_SENSE: vcpu->stat.instruction_sigp_sense++; @@ -453,7 +469,7 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu) * * This interception will occur at the source cpu when a source cpu sends an * external call to a target cpu and the target cpu has the WAIT bit set in - * its cpuflags. Interception will occurr after the interrupt indicator bits at + * its cpuflags. Interception will occur after the interrupt indicator bits at * the target cpu have been set. All error cases will lead to instruction * interception, therefore nothing is to be checked or prepared. */ @@ -464,9 +480,9 @@ int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu) struct kvm_vcpu *dest_vcpu; u8 order_code = kvm_s390_get_base_disp_rs(vcpu, NULL); - trace_kvm_s390_handle_sigp_pei(vcpu, order_code, cpu_addr); - if (order_code == SIGP_EXTERNAL_CALL) { + trace_kvm_s390_handle_sigp_pei(vcpu, order_code, cpu_addr); + dest_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, cpu_addr); BUG_ON(dest_vcpu == NULL); diff --git a/arch/s390/kvm/trace-s390.h b/arch/s390/kvm/trace-s390.h index 6f0209d45164..9ac92dbf680d 100644 --- a/arch/s390/kvm/trace-s390.h +++ b/arch/s390/kvm/trace-s390.h @@ -333,6 +333,29 @@ TRACE_EVENT(kvm_s390_airq_suppressed, __entry->id, __entry->isc) ); +/* + * Trace point for gmap notifier calls. + */ +TRACE_EVENT(kvm_s390_gmap_notifier, + TP_PROTO(unsigned long start, unsigned long end, unsigned int shadow), + TP_ARGS(start, end, shadow), + + TP_STRUCT__entry( + __field(unsigned long, start) + __field(unsigned long, end) + __field(unsigned int, shadow) + ), + + TP_fast_assign( + __entry->start = start; + __entry->end = end; + __entry->shadow = shadow; + ), + + TP_printk("gmap notified (start:0x%lx end:0x%lx shadow:%d)", + __entry->start, __entry->end, __entry->shadow) + ); + #endif /* _TRACE_KVMS390_H */ diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 076090f9e666..fef42e2a80a2 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -18,6 +18,8 @@ #include <asm/sclp.h> #include <asm/nmi.h> #include <asm/dis.h> +#include <asm/fpu/api.h> +#include <asm/facility.h> #include "kvm-s390.h" #include "gaccess.h" @@ -137,11 +139,15 @@ static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) } /* Copy to APCB FORMAT1 from APCB FORMAT0 */ static int setup_apcb10(struct kvm_vcpu *vcpu, struct kvm_s390_apcb1 *apcb_s, - unsigned long apcb_o, struct kvm_s390_apcb1 *apcb_h) + unsigned long crycb_gpa, struct kvm_s390_apcb1 *apcb_h) { struct kvm_s390_apcb0 tmp; + unsigned long apcb_gpa; - if (read_guest_real(vcpu, apcb_o, &tmp, sizeof(struct kvm_s390_apcb0))) + apcb_gpa = crycb_gpa + offsetof(struct kvm_s390_crypto_cb, apcb0); + + if (read_guest_real(vcpu, apcb_gpa, &tmp, + sizeof(struct kvm_s390_apcb0))) return -EFAULT; apcb_s->apm[0] = apcb_h->apm[0] & tmp.apm[0]; @@ -156,19 +162,24 @@ static int setup_apcb10(struct kvm_vcpu *vcpu, struct kvm_s390_apcb1 *apcb_s, * setup_apcb00 - Copy to APCB FORMAT0 from APCB FORMAT0 * @vcpu: pointer to the virtual CPU * @apcb_s: pointer to start of apcb in the shadow crycb - * @apcb_o: pointer to start of original apcb in the guest2 + * @crycb_gpa: guest physical address to start of original guest crycb * @apcb_h: pointer to start of apcb in the guest1 * * Returns 0 and -EFAULT on error reading guest apcb */ static int setup_apcb00(struct kvm_vcpu *vcpu, unsigned long *apcb_s, - unsigned long apcb_o, unsigned long *apcb_h) + unsigned long crycb_gpa, unsigned long *apcb_h) { - if (read_guest_real(vcpu, apcb_o, apcb_s, + unsigned long apcb_gpa; + + apcb_gpa = crycb_gpa + offsetof(struct kvm_s390_crypto_cb, apcb0); + + if (read_guest_real(vcpu, apcb_gpa, apcb_s, sizeof(struct kvm_s390_apcb0))) return -EFAULT; - bitmap_and(apcb_s, apcb_s, apcb_h, sizeof(struct kvm_s390_apcb0)); + bitmap_and(apcb_s, apcb_s, apcb_h, + BITS_PER_BYTE * sizeof(struct kvm_s390_apcb0)); return 0; } @@ -177,20 +188,25 @@ static int setup_apcb00(struct kvm_vcpu *vcpu, unsigned long *apcb_s, * setup_apcb11 - Copy the FORMAT1 APCB from the guest to the shadow CRYCB * @vcpu: pointer to the virtual CPU * @apcb_s: pointer to start of apcb in the shadow crycb - * @apcb_o: pointer to start of original guest apcb + * @crycb_gpa: guest physical address to start of original guest crycb * @apcb_h: pointer to start of apcb in the host * * Returns 0 and -EFAULT on error reading guest apcb */ static int setup_apcb11(struct kvm_vcpu *vcpu, unsigned long *apcb_s, - unsigned long apcb_o, + unsigned long crycb_gpa, unsigned long *apcb_h) { - if (read_guest_real(vcpu, apcb_o, apcb_s, + unsigned long apcb_gpa; + + apcb_gpa = crycb_gpa + offsetof(struct kvm_s390_crypto_cb, apcb1); + + if (read_guest_real(vcpu, apcb_gpa, apcb_s, sizeof(struct kvm_s390_apcb1))) return -EFAULT; - bitmap_and(apcb_s, apcb_s, apcb_h, sizeof(struct kvm_s390_apcb1)); + bitmap_and(apcb_s, apcb_s, apcb_h, + BITS_PER_BYTE * sizeof(struct kvm_s390_apcb1)); return 0; } @@ -199,7 +215,7 @@ static int setup_apcb11(struct kvm_vcpu *vcpu, unsigned long *apcb_s, * setup_apcb - Create a shadow copy of the apcb. * @vcpu: pointer to the virtual CPU * @crycb_s: pointer to shadow crycb - * @crycb_o: pointer to original guest crycb + * @crycb_gpa: guest physical address of original guest crycb * @crycb_h: pointer to the host crycb * @fmt_o: format of the original guest crycb. * @fmt_h: format of the host crycb. @@ -210,50 +226,46 @@ static int setup_apcb11(struct kvm_vcpu *vcpu, unsigned long *apcb_s, * Return 0 or an error number if the guest and host crycb are incompatible. */ static int setup_apcb(struct kvm_vcpu *vcpu, struct kvm_s390_crypto_cb *crycb_s, - const u32 crycb_o, + const u32 crycb_gpa, struct kvm_s390_crypto_cb *crycb_h, int fmt_o, int fmt_h) { - struct kvm_s390_crypto_cb *crycb; - - crycb = (struct kvm_s390_crypto_cb *) (unsigned long)crycb_o; - switch (fmt_o) { case CRYCB_FORMAT2: - if ((crycb_o & PAGE_MASK) != ((crycb_o + 256) & PAGE_MASK)) + if ((crycb_gpa & PAGE_MASK) != ((crycb_gpa + 256) & PAGE_MASK)) return -EACCES; if (fmt_h != CRYCB_FORMAT2) return -EINVAL; return setup_apcb11(vcpu, (unsigned long *)&crycb_s->apcb1, - (unsigned long) &crycb->apcb1, + crycb_gpa, (unsigned long *)&crycb_h->apcb1); case CRYCB_FORMAT1: switch (fmt_h) { case CRYCB_FORMAT2: return setup_apcb10(vcpu, &crycb_s->apcb1, - (unsigned long) &crycb->apcb0, + crycb_gpa, &crycb_h->apcb1); case CRYCB_FORMAT1: return setup_apcb00(vcpu, (unsigned long *) &crycb_s->apcb0, - (unsigned long) &crycb->apcb0, + crycb_gpa, (unsigned long *) &crycb_h->apcb0); } break; case CRYCB_FORMAT0: - if ((crycb_o & PAGE_MASK) != ((crycb_o + 32) & PAGE_MASK)) + if ((crycb_gpa & PAGE_MASK) != ((crycb_gpa + 32) & PAGE_MASK)) return -EACCES; switch (fmt_h) { case CRYCB_FORMAT2: return setup_apcb10(vcpu, &crycb_s->apcb1, - (unsigned long) &crycb->apcb0, + crycb_gpa, &crycb_h->apcb1); case CRYCB_FORMAT1: case CRYCB_FORMAT0: return setup_apcb00(vcpu, (unsigned long *) &crycb_s->apcb0, - (unsigned long) &crycb->apcb0, + crycb_gpa, (unsigned long *) &crycb_h->apcb0); } } @@ -416,11 +428,6 @@ static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) memcpy((void *)((u64)scb_o + 0xc0), (void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0); break; - case ICPT_PARTEXEC: - /* MVPG only */ - memcpy((void *)((u64)scb_o + 0xc0), - (void *)((u64)scb_s + 0xc0), 0xd0 - 0xc0); - break; } if (scb_s->ihcpu != 0xffffU) @@ -498,7 +505,7 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) scb_s->mso = new_mso; scb_s->prefix = new_prefix; - /* We have to definetly flush the tlb if this scb never ran */ + /* We have to definitely flush the tlb if this scb never ran */ if (scb_s->ihcpu != 0xffffU) scb_s->ihcpu = scb_o->ihcpu; @@ -507,6 +514,14 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) /* Host-protection-interruption introduced with ESOP */ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP)) scb_s->ecb |= scb_o->ecb & ECB_HOSTPROTINT; + /* + * CPU Topology + * This facility only uses the utility field of the SCA and none of + * the cpu entries that are problematic with the other interpretation + * facilities so we can pass it through + */ + if (test_kvm_facility(vcpu->kvm, 11)) + scb_s->ecb |= scb_o->ecb & ECB_PTF; /* transactional execution */ if (test_kvm_facility(vcpu->kvm, 73) && wants_tx) { /* remap the prefix is tx is toggled on */ @@ -514,6 +529,8 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) prefix_unmapped(vsie_page); scb_s->ecb |= ECB_TE; } + /* specification exception interpretation */ + scb_s->ecb |= scb_o->ecb & ECB_SPECI; /* branch prediction */ if (test_kvm_facility(vcpu->kvm, 82)) scb_s->fpf |= scb_o->fpf & FPF_BPBC; @@ -540,14 +557,17 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_CEI)) scb_s->eca |= scb_o->eca & ECA_CEI; /* Epoch Extension */ - if (test_kvm_facility(vcpu->kvm, 139)) + if (test_kvm_facility(vcpu->kvm, 139)) { scb_s->ecd |= scb_o->ecd & ECD_MEF; + scb_s->epdx = scb_o->epdx; + } /* etoken */ if (test_kvm_facility(vcpu->kvm, 156)) scb_s->ecd |= scb_o->ecd & ECD_ETOKENF; scb_s->hpid = HPID_VSIE; + scb_s->cpnc = scb_o->cpnc; prepare_ibc(vcpu, vsie_page); rc = shadow_crycb(vcpu, vsie_page); @@ -568,10 +588,6 @@ void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, if (!gmap_is_shadow(gmap)) return; - if (start >= 1UL << 31) - /* We are only interested in prefix pages */ - return; - /* * Only new shadow blocks are added to the list during runtime, * therefore we can safely reference them all the time. @@ -618,10 +634,10 @@ static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) /* with mso/msl, the prefix lies at offset *mso* */ prefix += scb_s->mso; - rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix); + rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix, NULL); if (!rc && (scb_s->ecb & ECB_TE)) rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, - prefix + PAGE_SIZE); + prefix + PAGE_SIZE, NULL); /* * We don't have to mprotect, we will be called for all unshadows. * SIE will detect if protection applies and trigger a validity. @@ -647,7 +663,7 @@ static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa) page = gfn_to_page(kvm, gpa_to_gfn(gpa)); if (is_error_page(page)) return -EINVAL; - *hpa = (hpa_t) page_to_virt(page) + (gpa & ~PAGE_MASK); + *hpa = (hpa_t)page_to_phys(page) + (gpa & ~PAGE_MASK); return 0; } @@ -862,7 +878,7 @@ static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, WARN_ON_ONCE(rc); return 1; } - vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa; + vsie_page->scb_o = phys_to_virt(hpa); return 0; } @@ -882,7 +898,7 @@ static int inject_fault(struct kvm_vcpu *vcpu, __u16 code, __u64 vaddr, (vaddr & 0xfffffffffffff000UL) | /* 52-53: store / fetch */ (((unsigned int) !write_flag) + 1) << 10, - /* 62-63: asce id (alway primary == 0) */ + /* 62-63: asce id (always primary == 0) */ .exc_access_id = 0, /* always primary */ .op_access_id = 0, /* not MVPG */ }; @@ -912,7 +928,7 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) current->thread.gmap_addr, 1); rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, - current->thread.gmap_addr); + current->thread.gmap_addr, NULL); if (rc > 0) { rc = inject_fault(vcpu, rc, current->thread.gmap_addr, @@ -934,7 +950,7 @@ static void handle_last_fault(struct kvm_vcpu *vcpu, { if (vsie_page->fault_addr) kvm_s390_shadow_fault(vcpu, vsie_page->gmap, - vsie_page->fault_addr); + vsie_page->fault_addr, NULL); vsie_page->fault_addr = 0; } @@ -969,12 +985,26 @@ static void retry_vsie_icpt(struct vsie_page *vsie_page) static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) { struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; - __u32 fac = READ_ONCE(vsie_page->scb_o->fac) & 0x7ffffff8U; + __u32 fac = READ_ONCE(vsie_page->scb_o->fac); + /* + * Alternate-STFLE-Interpretive-Execution facilities are not supported + * -> format-0 flcb + */ if (fac && test_kvm_facility(vcpu->kvm, 7)) { retry_vsie_icpt(vsie_page); + /* + * The facility list origin (FLO) is in bits 1 - 28 of the FLD + * so we need to mask here before reading. + */ + fac = fac & 0x7ffffff8U; + /* + * format-0 -> size of nested guest's facility list == guest's size + * guest's size == host's size, since STFLE is interpretatively executed + * using a format-0 for the guest, too. + */ if (read_guest_real(vcpu, fac, &vsie_page->fac, - sizeof(vsie_page->fac))) + stfle_size() * sizeof(u64))) return set_validity_icpt(scb_s, 0x1090U); scb_s->fac = (__u32)(__u64) &vsie_page->fac; } @@ -982,6 +1012,98 @@ static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) } /* + * Get a register for a nested guest. + * @vcpu the vcpu of the guest + * @vsie_page the vsie_page for the nested guest + * @reg the register number, the upper 4 bits are ignored. + * returns: the value of the register. + */ +static u64 vsie_get_register(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, u8 reg) +{ + /* no need to validate the parameter and/or perform error handling */ + reg &= 0xf; + switch (reg) { + case 15: + return vsie_page->scb_s.gg15; + case 14: + return vsie_page->scb_s.gg14; + default: + return vcpu->run->s.regs.gprs[reg]; + } +} + +static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) +{ + struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; + unsigned long pei_dest, pei_src, src, dest, mask, prefix; + u64 *pei_block = &vsie_page->scb_o->mcic; + int edat, rc_dest, rc_src; + union ctlreg0 cr0; + + cr0.val = vcpu->arch.sie_block->gcr[0]; + edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8); + mask = _kvm_s390_logical_to_effective(&scb_s->gpsw, PAGE_MASK); + prefix = scb_s->prefix << GUEST_PREFIX_SHIFT; + + dest = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 20) & mask; + dest = _kvm_s390_real_to_abs(prefix, dest) + scb_s->mso; + src = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 16) & mask; + src = _kvm_s390_real_to_abs(prefix, src) + scb_s->mso; + + rc_dest = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, dest, &pei_dest); + rc_src = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, src, &pei_src); + /* + * Either everything went well, or something non-critical went wrong + * e.g. because of a race. In either case, simply retry. + */ + if (rc_dest == -EAGAIN || rc_src == -EAGAIN || (!rc_dest && !rc_src)) { + retry_vsie_icpt(vsie_page); + return -EAGAIN; + } + /* Something more serious went wrong, propagate the error */ + if (rc_dest < 0) + return rc_dest; + if (rc_src < 0) + return rc_src; + + /* The only possible suppressing exception: just deliver it */ + if (rc_dest == PGM_TRANSLATION_SPEC || rc_src == PGM_TRANSLATION_SPEC) { + clear_vsie_icpt(vsie_page); + rc_dest = kvm_s390_inject_program_int(vcpu, PGM_TRANSLATION_SPEC); + WARN_ON_ONCE(rc_dest); + return 1; + } + + /* + * Forward the PEI intercept to the guest if it was a page fault, or + * also for segment and region table faults if EDAT applies. + */ + if (edat) { + rc_dest = rc_dest == PGM_ASCE_TYPE ? rc_dest : 0; + rc_src = rc_src == PGM_ASCE_TYPE ? rc_src : 0; + } else { + rc_dest = rc_dest != PGM_PAGE_TRANSLATION ? rc_dest : 0; + rc_src = rc_src != PGM_PAGE_TRANSLATION ? rc_src : 0; + } + if (!rc_dest && !rc_src) { + pei_block[0] = pei_dest; + pei_block[1] = pei_src; + return 1; + } + + retry_vsie_icpt(vsie_page); + + /* + * The host has edat, and the guest does not, or it was an ASCE type + * exception. The host needs to inject the appropriate DAT interrupts + * into the guest. + */ + if (rc_dest) + return inject_fault(vcpu, rc_dest, dest, 1); + return inject_fault(vcpu, rc_src, src, 0); +} + +/* * Run the vsie on a shadow scb and a shadow gmap, without any further * sanity checks, handling SIE faults. * @@ -1000,12 +1122,7 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) handle_last_fault(vcpu, vsie_page); - if (need_resched()) - schedule(); - if (test_cpu_flag(CIF_MCCK_PENDING)) - s390_handle_mcck(); - - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + kvm_vcpu_srcu_read_unlock(vcpu); /* save current guest state of bp isolation override */ guest_bp_isolation = test_thread_flag(TIF_ISOLATE_BP_GUEST); @@ -1032,6 +1149,8 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) */ vcpu->arch.sie_block->prog0c |= PROG_IN_SIE; barrier(); + if (test_cpu_flag(CIF_FPU)) + load_fpu_regs(); if (!kvm_s390_vcpu_sie_inhibited(vcpu)) rc = sie64a(scb_s, vcpu->run->s.regs.gprs); barrier(); @@ -1045,7 +1164,7 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) if (!guest_bp_isolation) clear_thread_flag(TIF_ISOLATE_BP_GUEST); - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); if (rc == -EINTR) { VCPU_EVENT(vcpu, 3, "%s", "machine check"); @@ -1072,6 +1191,10 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) if ((scb_s->ipa & 0xf000) != 0xf000) scb_s->ipa += 0x1000; break; + case ICPT_PARTEXEC: + if (scb_s->ipa == 0xb254) + rc = vsie_handle_mvpg(vcpu, vsie_page); + break; } return rc; } @@ -1102,8 +1225,10 @@ static int acquire_gmap_shadow(struct kvm_vcpu *vcpu, * we're holding has been unshadowed. If the gmap is still valid, * we can safely reuse it. */ - if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat)) + if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat)) { + vcpu->kvm->stat.gmap_shadow_reuse++; return 0; + } /* release the old shadow - if any, and mark the prefix as unmapped */ release_gmap_shadow(vsie_page); @@ -1111,6 +1236,7 @@ static int acquire_gmap_shadow(struct kvm_vcpu *vcpu, if (IS_ERR(gmap)) return PTR_ERR(gmap); gmap->private = vcpu->kvm; + vcpu->kvm->stat.gmap_shadow_create++; WRITE_ONCE(vsie_page->gmap, gmap); return 0; } @@ -1185,6 +1311,7 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) kvm_s390_vcpu_has_irq(vcpu, 0) || kvm_s390_vcpu_sie_inhibited(vcpu)) break; + cond_resched(); } if (rc == -EFAULT) { @@ -1202,6 +1329,7 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) scb_s->iprcc = PGM_ADDRESSING; scb_s->pgmilc = 4; scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, 4); + rc = 1; } return rc; } @@ -1236,7 +1364,7 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr) mutex_lock(&kvm->arch.vsie.mutex); if (kvm->arch.vsie.page_count < nr_vcpus) { - page = alloc_page(GFP_KERNEL | __GFP_ZERO | GFP_DMA); + page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO | GFP_DMA); if (!page) { mutex_unlock(&kvm->arch.vsie.mutex); return ERR_PTR(-ENOMEM); @@ -1338,7 +1466,7 @@ out_put: void kvm_s390_vsie_init(struct kvm *kvm) { mutex_init(&kvm->arch.vsie.mutex); - INIT_RADIX_TREE(&kvm->arch.vsie.addr_to_page, GFP_KERNEL); + INIT_RADIX_TREE(&kvm->arch.vsie.addr_to_page, GFP_KERNEL_ACCOUNT); } /* Destroy the vsie data structures. To be called when a vm is destroyed. */ |