summaryrefslogtreecommitdiff
path: root/arch/s390/kvm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2016-08-02 16:11:27 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2016-08-02 16:11:27 -0400
commit221bb8a46e230b9824204ae86537183d9991ff2a (patch)
tree92510d72285b2285be7cb87288bf088cb28af4c1 /arch/s390/kvm
parentf7b32e4c021fd788f13f6785e17efbc3eb05b351 (diff)
parent23528bb21ee2c9b27f3feddd77a2a3351a8df148 (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Paolo Bonzini: - ARM: GICv3 ITS emulation and various fixes. Removal of the old VGIC implementation. - s390: support for trapping software breakpoints, nested virtualization (vSIE), the STHYI opcode, initial extensions for CPU model support. - MIPS: support for MIPS64 hosts (32-bit guests only) and lots of cleanups, preliminary to this and the upcoming support for hardware virtualization extensions. - x86: support for execute-only mappings in nested EPT; reduced vmexit latency for TSC deadline timer (by about 30%) on Intel hosts; support for more than 255 vCPUs. - PPC: bugfixes. * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (302 commits) KVM: PPC: Introduce KVM_CAP_PPC_HTM MIPS: Select HAVE_KVM for MIPS64_R{2,6} MIPS: KVM: Reset CP0_PageMask during host TLB flush MIPS: KVM: Fix ptr->int cast via KVM_GUEST_KSEGX() MIPS: KVM: Sign extend MFC0/RDHWR results MIPS: KVM: Fix 64-bit big endian dynamic translation MIPS: KVM: Fail if ebase doesn't fit in CP0_EBase MIPS: KVM: Use 64-bit CP0_EBase when appropriate MIPS: KVM: Set CP0_Status.KX on MIPS64 MIPS: KVM: Make entry code MIPS64 friendly MIPS: KVM: Use kmap instead of CKSEG0ADDR() MIPS: KVM: Use virt_to_phys() to get commpage PFN MIPS: Fix definition of KSEGX() for 64-bit KVM: VMX: Add VMCS to CPU's loaded VMCSs before VMPTRLD kvm: x86: nVMX: maintain internal copy of current VMCS KVM: PPC: Book3S HV: Save/restore TM state in H_CEDE KVM: PPC: Book3S HV: Pull out TM state save/restore into separate procedures KVM: arm64: vgic-its: Simplify MAPI error handling KVM: arm64: vgic-its: Make vgic_its_cmd_handle_mapi similar to other handlers KVM: arm64: vgic-its: Turn device_id validation into generic ID validation ...
Diffstat (limited to 'arch/s390/kvm')
-rw-r--r--arch/s390/kvm/Makefile2
-rw-r--r--arch/s390/kvm/diag.c5
-rw-r--r--arch/s390/kvm/gaccess.c387
-rw-r--r--arch/s390/kvm/gaccess.h3
-rw-r--r--arch/s390/kvm/guestdbg.c19
-rw-r--r--arch/s390/kvm/intercept.c33
-rw-r--r--arch/s390/kvm/interrupt.c34
-rw-r--r--arch/s390/kvm/kvm-s390.c404
-rw-r--r--arch/s390/kvm/kvm-s390.h22
-rw-r--r--arch/s390/kvm/priv.c226
-rw-r--r--arch/s390/kvm/sigp.c10
-rw-r--r--arch/s390/kvm/sthyi.c471
-rw-r--r--arch/s390/kvm/trace.h49
-rw-r--r--arch/s390/kvm/vsie.c1091
14 files changed, 2573 insertions, 183 deletions
diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile
index d42fa38c2429..09a9e6dfc09f 100644
--- a/arch/s390/kvm/Makefile
+++ b/arch/s390/kvm/Makefile
@@ -12,6 +12,6 @@ common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/async_pf.o $(KVM)/irqch
ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o
-kvm-objs += diag.o gaccess.o guestdbg.o
+kvm-objs += diag.o gaccess.o guestdbg.o sthyi.o vsie.o
obj-$(CONFIG_KVM) += kvm.o
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index 1ea4095b67d7..ce865bd4f81d 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -212,6 +212,11 @@ static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu)
(vcpu->run->s.regs.gprs[1] != KVM_S390_VIRTIO_CCW_NOTIFY))
return -EOPNOTSUPP;
+ VCPU_EVENT(vcpu, 4, "diag 0x500 schid 0x%8.8x queue 0x%x cookie 0x%llx",
+ (u32) vcpu->run->s.regs.gprs[2],
+ (u32) vcpu->run->s.regs.gprs[3],
+ vcpu->run->s.regs.gprs[4]);
+
/*
* The layout is as follows:
* - gpr 2 contains the subchannel id (passed as addr)
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 66938d283b77..54200208bf24 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -8,6 +8,7 @@
#include <linux/vmalloc.h>
#include <linux/err.h>
#include <asm/pgtable.h>
+#include <asm/gmap.h>
#include "kvm-s390.h"
#include "gaccess.h"
#include <asm/switch_to.h>
@@ -476,18 +477,73 @@ enum {
FSI_FETCH = 2 /* Exception was due to fetch operation */
};
-static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
- ar_t ar, enum gacc_mode mode)
+enum prot_type {
+ PROT_TYPE_LA = 0,
+ PROT_TYPE_KEYC = 1,
+ PROT_TYPE_ALC = 2,
+ PROT_TYPE_DAT = 3,
+};
+
+static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
+ ar_t ar, enum gacc_mode mode, enum prot_type prot)
{
- int rc;
- struct psw_bits psw = psw_bits(vcpu->arch.sie_block->gpsw);
struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
- struct trans_exc_code_bits *tec_bits;
+ struct trans_exc_code_bits *tec;
memset(pgm, 0, sizeof(*pgm));
- tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
- tec_bits->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
- tec_bits->as = psw.as;
+ pgm->code = code;
+ tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
+
+ switch (code) {
+ case PGM_ASCE_TYPE:
+ case PGM_PAGE_TRANSLATION:
+ case PGM_REGION_FIRST_TRANS:
+ case PGM_REGION_SECOND_TRANS:
+ case PGM_REGION_THIRD_TRANS:
+ case PGM_SEGMENT_TRANSLATION:
+ /*
+ * op_access_id only applies to MOVE_PAGE -> set bit 61
+ * exc_access_id has to be set to 0 for some instructions. Both
+ * cases have to be handled by the caller. We can always store
+ * exc_access_id, as it is undefined for non-ar cases.
+ */
+ tec->addr = gva >> PAGE_SHIFT;
+ tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
+ tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as;
+ /* FALL THROUGH */
+ case PGM_ALEN_TRANSLATION:
+ case PGM_ALE_SEQUENCE:
+ case PGM_ASTE_VALIDITY:
+ case PGM_ASTE_SEQUENCE:
+ case PGM_EXTENDED_AUTHORITY:
+ pgm->exc_access_id = ar;
+ break;
+ case PGM_PROTECTION:
+ switch (prot) {
+ case PROT_TYPE_ALC:
+ tec->b60 = 1;
+ /* FALL THROUGH */
+ case PROT_TYPE_DAT:
+ tec->b61 = 1;
+ tec->addr = gva >> PAGE_SHIFT;
+ tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
+ tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as;
+ /* exc_access_id is undefined for most cases */
+ pgm->exc_access_id = ar;
+ break;
+ default: /* LA and KEYC set b61 to 0, other params undefined */
+ break;
+ }
+ break;
+ }
+ return code;
+}
+
+static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
+ unsigned long ga, ar_t ar, enum gacc_mode mode)
+{
+ int rc;
+ struct psw_bits psw = psw_bits(vcpu->arch.sie_block->gpsw);
if (!psw.t) {
asce->val = 0;
@@ -510,21 +566,8 @@ static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
return 0;
case PSW_AS_ACCREG:
rc = ar_translation(vcpu, asce, ar, mode);
- switch (rc) {
- case PGM_ALEN_TRANSLATION:
- case PGM_ALE_SEQUENCE:
- case PGM_ASTE_VALIDITY:
- case PGM_ASTE_SEQUENCE:
- case PGM_EXTENDED_AUTHORITY:
- vcpu->arch.pgm.exc_access_id = ar;
- break;
- case PGM_PROTECTION:
- tec_bits->b60 = 1;
- tec_bits->b61 = 1;
- break;
- }
if (rc > 0)
- pgm->code = rc;
+ return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_ALC);
return rc;
}
return 0;
@@ -729,40 +772,31 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu,
return 1;
}
-static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga,
+static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar,
unsigned long *pages, unsigned long nr_pages,
const union asce asce, enum gacc_mode mode)
{
- struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
psw_t *psw = &vcpu->arch.sie_block->gpsw;
- struct trans_exc_code_bits *tec_bits;
- int lap_enabled, rc;
+ int lap_enabled, rc = 0;
- tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
lap_enabled = low_address_protection_enabled(vcpu, asce);
while (nr_pages) {
ga = kvm_s390_logical_to_effective(vcpu, ga);
- tec_bits->addr = ga >> PAGE_SHIFT;
- if (mode == GACC_STORE && lap_enabled && is_low_address(ga)) {
- pgm->code = PGM_PROTECTION;
- return pgm->code;
- }
+ if (mode == GACC_STORE && lap_enabled && is_low_address(ga))
+ return trans_exc(vcpu, PGM_PROTECTION, ga, ar, mode,
+ PROT_TYPE_LA);
ga &= PAGE_MASK;
if (psw_bits(*psw).t) {
rc = guest_translate(vcpu, ga, pages, asce, mode);
if (rc < 0)
return rc;
- if (rc == PGM_PROTECTION)
- tec_bits->b61 = 1;
- if (rc)
- pgm->code = rc;
} else {
*pages = kvm_s390_real_to_abs(vcpu, ga);
if (kvm_is_error_gpa(vcpu->kvm, *pages))
- pgm->code = PGM_ADDRESSING;
+ rc = PGM_ADDRESSING;
}
- if (pgm->code)
- return pgm->code;
+ if (rc)
+ return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_DAT);
ga += PAGE_SIZE;
pages++;
nr_pages--;
@@ -783,7 +817,8 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
if (!len)
return 0;
- rc = get_vcpu_asce(vcpu, &asce, ar, mode);
+ ga = kvm_s390_logical_to_effective(vcpu, ga);
+ rc = get_vcpu_asce(vcpu, &asce, ga, ar, mode);
if (rc)
return rc;
nr_pages = (((ga & ~PAGE_MASK) + len - 1) >> PAGE_SHIFT) + 1;
@@ -795,7 +830,7 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
need_ipte_lock = psw_bits(*psw).t && !asce.r;
if (need_ipte_lock)
ipte_lock(vcpu);
- rc = guest_page_range(vcpu, ga, pages, nr_pages, asce, mode);
+ rc = guest_page_range(vcpu, ga, ar, pages, nr_pages, asce, mode);
for (idx = 0; idx < nr_pages && !rc; idx++) {
gpa = *(pages + idx) + (ga & ~PAGE_MASK);
_len = min(PAGE_SIZE - (gpa & ~PAGE_MASK), len);
@@ -846,37 +881,28 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
unsigned long *gpa, enum gacc_mode mode)
{
- struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
psw_t *psw = &vcpu->arch.sie_block->gpsw;
- struct trans_exc_code_bits *tec;
union asce asce;
int rc;
gva = kvm_s390_logical_to_effective(vcpu, gva);
- tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
- rc = get_vcpu_asce(vcpu, &asce, ar, mode);
- tec->addr = gva >> PAGE_SHIFT;
+ rc = get_vcpu_asce(vcpu, &asce, gva, ar, mode);
if (rc)
return rc;
if (is_low_address(gva) && low_address_protection_enabled(vcpu, asce)) {
- if (mode == GACC_STORE) {
- rc = pgm->code = PGM_PROTECTION;
- return rc;
- }
+ if (mode == GACC_STORE)
+ return trans_exc(vcpu, PGM_PROTECTION, gva, 0,
+ mode, PROT_TYPE_LA);
}
if (psw_bits(*psw).t && !asce.r) { /* Use DAT? */
rc = guest_translate(vcpu, gva, gpa, asce, mode);
- if (rc > 0) {
- if (rc == PGM_PROTECTION)
- tec->b61 = 1;
- pgm->code = rc;
- }
+ if (rc > 0)
+ return trans_exc(vcpu, rc, gva, 0, mode, PROT_TYPE_DAT);
} else {
- rc = 0;
*gpa = kvm_s390_real_to_abs(vcpu, gva);
if (kvm_is_error_gpa(vcpu->kvm, *gpa))
- rc = pgm->code = PGM_ADDRESSING;
+ return trans_exc(vcpu, rc, gva, PGM_ADDRESSING, mode, 0);
}
return rc;
@@ -915,20 +941,247 @@ int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
*/
int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra)
{
- struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
- psw_t *psw = &vcpu->arch.sie_block->gpsw;
- struct trans_exc_code_bits *tec_bits;
union ctlreg0 ctlreg0 = {.val = vcpu->arch.sie_block->gcr[0]};
if (!ctlreg0.lap || !is_low_address(gra))
return 0;
+ return trans_exc(vcpu, PGM_PROTECTION, gra, 0, GACC_STORE, PROT_TYPE_LA);
+}
- memset(pgm, 0, sizeof(*pgm));
- tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
- tec_bits->fsi = FSI_STORE;
- tec_bits->as = psw_bits(*psw).as;
- tec_bits->addr = gra >> PAGE_SHIFT;
- pgm->code = PGM_PROTECTION;
+/**
+ * kvm_s390_shadow_tables - walk the guest page table and create shadow tables
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @pgt: pointer to the page table address result
+ * @fake: pgt references contiguous guest memory block, not a pgtable
+ */
+static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
+ unsigned long *pgt, int *dat_protection,
+ int *fake)
+{
+ struct gmap *parent;
+ union asce asce;
+ union vaddress vaddr;
+ unsigned long ptr;
+ int rc;
+
+ *fake = 0;
+ *dat_protection = 0;
+ parent = sg->parent;
+ vaddr.addr = saddr;
+ asce.val = sg->orig_asce;
+ ptr = asce.origin * 4096;
+ if (asce.r) {
+ *fake = 1;
+ asce.dt = ASCE_TYPE_REGION1;
+ }
+ switch (asce.dt) {
+ case ASCE_TYPE_REGION1:
+ if (vaddr.rfx01 > asce.tl && !asce.r)
+ return PGM_REGION_FIRST_TRANS;
+ break;
+ case ASCE_TYPE_REGION2:
+ if (vaddr.rfx)
+ return PGM_ASCE_TYPE;
+ if (vaddr.rsx01 > asce.tl)
+ return PGM_REGION_SECOND_TRANS;
+ break;
+ case ASCE_TYPE_REGION3:
+ if (vaddr.rfx || vaddr.rsx)
+ return PGM_ASCE_TYPE;
+ if (vaddr.rtx01 > asce.tl)
+ return PGM_REGION_THIRD_TRANS;
+ break;
+ case ASCE_TYPE_SEGMENT:
+ if (vaddr.rfx || vaddr.rsx || vaddr.rtx)
+ return PGM_ASCE_TYPE;
+ if (vaddr.sx01 > asce.tl)
+ return PGM_SEGMENT_TRANSLATION;
+ break;
+ }
+
+ switch (asce.dt) {
+ case ASCE_TYPE_REGION1: {
+ union region1_table_entry rfte;
- return pgm->code;
+ if (*fake) {
+ /* offset in 16EB guest memory block */
+ ptr = ptr + ((unsigned long) vaddr.rsx << 53UL);
+ rfte.val = ptr;
+ goto shadow_r2t;
+ }
+ rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val);
+ if (rc)
+ return rc;
+ if (rfte.i)
+ return PGM_REGION_FIRST_TRANS;
+ if (rfte.tt != TABLE_TYPE_REGION1)
+ return PGM_TRANSLATION_SPEC;
+ if (vaddr.rsx01 < rfte.tf || vaddr.rsx01 > rfte.tl)
+ return PGM_REGION_SECOND_TRANS;
+ if (sg->edat_level >= 1)
+ *dat_protection |= rfte.p;
+ ptr = rfte.rto << 12UL;
+shadow_r2t:
+ rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake);
+ if (rc)
+ return rc;
+ /* fallthrough */
+ }
+ case ASCE_TYPE_REGION2: {
+ union region2_table_entry rste;
+
+ if (*fake) {
+ /* offset in 8PB guest memory block */
+ ptr = ptr + ((unsigned long) vaddr.rtx << 42UL);
+ rste.val = ptr;
+ goto shadow_r3t;
+ }
+ rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val);
+ if (rc)
+ return rc;
+ if (rste.i)
+ return PGM_REGION_SECOND_TRANS;
+ if (rste.tt != TABLE_TYPE_REGION2)
+ return PGM_TRANSLATION_SPEC;
+ if (vaddr.rtx01 < rste.tf || vaddr.rtx01 > rste.tl)
+ return PGM_REGION_THIRD_TRANS;
+ if (sg->edat_level >= 1)
+ *dat_protection |= rste.p;
+ ptr = rste.rto << 12UL;
+shadow_r3t:
+ rste.p |= *dat_protection;
+ rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake);
+ if (rc)
+ return rc;
+ /* fallthrough */
+ }
+ case ASCE_TYPE_REGION3: {
+ union region3_table_entry rtte;
+
+ if (*fake) {
+ /* offset in 4TB guest memory block */
+ ptr = ptr + ((unsigned long) vaddr.sx << 31UL);
+ rtte.val = ptr;
+ goto shadow_sgt;
+ }
+ rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val);
+ if (rc)
+ return rc;
+ if (rtte.i)
+ return PGM_REGION_THIRD_TRANS;
+ if (rtte.tt != TABLE_TYPE_REGION3)
+ return PGM_TRANSLATION_SPEC;
+ if (rtte.cr && asce.p && sg->edat_level >= 2)
+ return PGM_TRANSLATION_SPEC;
+ if (rtte.fc && sg->edat_level >= 2) {
+ *dat_protection |= rtte.fc0.p;
+ *fake = 1;
+ ptr = rtte.fc1.rfaa << 31UL;
+ rtte.val = ptr;
+ goto shadow_sgt;
+ }
+ if (vaddr.sx01 < rtte.fc0.tf || vaddr.sx01 > rtte.fc0.tl)
+ return PGM_SEGMENT_TRANSLATION;
+ if (sg->edat_level >= 1)
+ *dat_protection |= rtte.fc0.p;
+ ptr = rtte.fc0.sto << 12UL;
+shadow_sgt:
+ rtte.fc0.p |= *dat_protection;
+ rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake);
+ if (rc)
+ return rc;
+ /* fallthrough */
+ }
+ case ASCE_TYPE_SEGMENT: {
+ union segment_table_entry ste;
+
+ if (*fake) {
+ /* offset in 2G guest memory block */
+ ptr = ptr + ((unsigned long) vaddr.sx << 20UL);
+ ste.val = ptr;
+ goto shadow_pgt;
+ }
+ rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val);
+ if (rc)
+ return rc;
+ if (ste.i)
+ return PGM_SEGMENT_TRANSLATION;
+ if (ste.tt != TABLE_TYPE_SEGMENT)
+ return PGM_TRANSLATION_SPEC;
+ if (ste.cs && asce.p)
+ return PGM_TRANSLATION_SPEC;
+ *dat_protection |= ste.fc0.p;
+ if (ste.fc && sg->edat_level >= 1) {
+ *fake = 1;
+ ptr = ste.fc1.sfaa << 20UL;
+ ste.val = ptr;
+ goto shadow_pgt;
+ }
+ ptr = ste.fc0.pto << 11UL;
+shadow_pgt:
+ ste.fc0.p |= *dat_protection;
+ rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake);
+ if (rc)
+ return rc;
+ }
+ }
+ /* Return the parent address of the page table */
+ *pgt = ptr;
+ return 0;
+}
+
+/**
+ * kvm_s390_shadow_fault - handle fault on a shadow page table
+ * @vcpu: virtual cpu
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ *
+ * Returns: - 0 if the shadow fault was successfully resolved
+ * - > 0 (pgm exception code) on exceptions while faulting
+ * - -EAGAIN if the caller can retry immediately
+ * - -EFAULT when accessing invalid guest addresses
+ * - -ENOMEM if out of memory
+ */
+int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
+ unsigned long saddr)
+{
+ union vaddress vaddr;
+ union page_table_entry pte;
+ unsigned long pgt;
+ int dat_protection, fake;
+ int rc;
+
+ down_read(&sg->mm->mmap_sem);
+ /*
+ * We don't want any guest-2 tables to change - so the parent
+ * tables/pointers we read stay valid - unshadowing is however
+ * always possible - only guest_table_lock protects us.
+ */
+ ipte_lock(vcpu);
+
+ rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake);
+ if (rc)
+ rc = kvm_s390_shadow_tables(sg, saddr, &pgt, &dat_protection,
+ &fake);
+
+ vaddr.addr = saddr;
+ if (fake) {
+ /* offset in 1MB guest memory block */
+ pte.val = pgt + ((unsigned long) vaddr.px << 12UL);
+ goto shadow_page;
+ }
+ if (!rc)
+ rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val);
+ if (!rc && pte.i)
+ rc = PGM_PAGE_TRANSLATION;
+ if (!rc && (pte.z || (pte.co && sg->edat_level < 1)))
+ rc = PGM_TRANSLATION_SPEC;
+shadow_page:
+ pte.p |= dat_protection;
+ if (!rc)
+ rc = gmap_shadow_page(sg, saddr, __pte(pte.val));
+ ipte_unlock(vcpu);
+ up_read(&sg->mm->mmap_sem);
+ return rc;
}
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
index df0a79dd8159..8756569ad938 100644
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -361,4 +361,7 @@ void ipte_unlock(struct kvm_vcpu *vcpu);
int ipte_lock_held(struct kvm_vcpu *vcpu);
int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra);
+int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow,
+ unsigned long saddr);
+
#endif /* __KVM_S390_GACCESS_H */
diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c
index e8c6843b9600..31a05330d11c 100644
--- a/arch/s390/kvm/guestdbg.c
+++ b/arch/s390/kvm/guestdbg.c
@@ -439,6 +439,23 @@ exit_required:
#define guest_per_enabled(vcpu) \
(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER)
+int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu)
+{
+ const u8 ilen = kvm_s390_get_ilen(vcpu);
+ struct kvm_s390_pgm_info pgm_info = {
+ .code = PGM_PER,
+ .per_code = PER_EVENT_IFETCH >> 24,
+ .per_address = __rewind_psw(vcpu->arch.sie_block->gpsw, ilen),
+ };
+
+ /*
+ * The PSW points to the next instruction, therefore the intercepted
+ * instruction generated a PER i-fetch event. PER address therefore
+ * points at the previous PSW address (could be an EXECUTE function).
+ */
+ return kvm_s390_inject_prog_irq(vcpu, &pgm_info);
+}
+
static void filter_guest_per_event(struct kvm_vcpu *vcpu)
{
u32 perc = vcpu->arch.sie_block->perc << 24;
@@ -465,7 +482,7 @@ static void filter_guest_per_event(struct kvm_vcpu *vcpu)
guest_perc &= ~PER_EVENT_IFETCH;
/* All other PER events will be given to the guest */
- /* TODO: Check alterated address/address space */
+ /* TODO: Check altered address/address space */
vcpu->arch.sie_block->perc = guest_perc >> 24;
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 252157181302..dfd0ca2638fa 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -351,8 +351,26 @@ static int handle_partial_execution(struct kvm_vcpu *vcpu)
return -EOPNOTSUPP;
}
+static int handle_operexc(struct kvm_vcpu *vcpu)
+{
+ vcpu->stat.exit_operation_exception++;
+ trace_kvm_s390_handle_operexc(vcpu, vcpu->arch.sie_block->ipa,
+ vcpu->arch.sie_block->ipb);
+
+ if (vcpu->arch.sie_block->ipa == 0xb256 &&
+ test_kvm_facility(vcpu->kvm, 74))
+ return handle_sthyi(vcpu);
+
+ if (vcpu->arch.sie_block->ipa == 0 && vcpu->kvm->arch.user_instr0)
+ return -EOPNOTSUPP;
+
+ return kvm_s390_inject_program_int(vcpu, PGM_OPERATION);
+}
+
int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
{
+ int rc, per_rc = 0;
+
if (kvm_is_ucontrol(vcpu->kvm))
return -EOPNOTSUPP;
@@ -361,7 +379,8 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
case 0x18:
return handle_noop(vcpu);
case 0x04:
- return handle_instruction(vcpu);
+ rc = handle_instruction(vcpu);
+ break;
case 0x08:
return handle_prog(vcpu);
case 0x14:
@@ -372,9 +391,19 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
return handle_validity(vcpu);
case 0x28:
return handle_stop(vcpu);
+ case 0x2c:
+ rc = handle_operexc(vcpu);
+ break;
case 0x38:
- return handle_partial_execution(vcpu);
+ rc = handle_partial_execution(vcpu);
+ break;
default:
return -EOPNOTSUPP;
}
+
+ /* process PER, also if the instrution is processed in user space */
+ if (vcpu->arch.sie_block->icptstatus & 0x02 &&
+ (!rc || rc == -EOPNOTSUPP))
+ per_rc = kvm_s390_handle_per_ifetch_icpt(vcpu);
+ return per_rc ? per_rc : rc;
}
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 5a80af740d3e..24524c0f3ef8 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -28,9 +28,6 @@
#include "gaccess.h"
#include "trace-s390.h"
-#define IOINT_SCHID_MASK 0x0000ffff
-#define IOINT_SSID_MASK 0x00030000
-#define IOINT_CSSID_MASK 0x03fc0000
#define PFAULT_INIT 0x0600
#define PFAULT_DONE 0x0680
#define VIRTIO_PARAM 0x0d00
@@ -821,7 +818,14 @@ static int __must_check __deliver_io(struct kvm_vcpu *vcpu,
struct kvm_s390_interrupt_info,
list);
if (inti) {
- VCPU_EVENT(vcpu, 4, "deliver: I/O 0x%llx", inti->type);
+ if (inti->type & KVM_S390_INT_IO_AI_MASK)
+ VCPU_EVENT(vcpu, 4, "%s", "deliver: I/O (AI)");
+ else
+ VCPU_EVENT(vcpu, 4, "deliver: I/O %x ss %x schid %04x",
+ inti->io.subchannel_id >> 8,
+ inti->io.subchannel_id >> 1 & 0x3,
+ inti->io.subchannel_nr);
+
vcpu->stat.deliver_io_int++;
trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id,
inti->type,
@@ -991,6 +995,11 @@ void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
swake_up(&vcpu->wq);
vcpu->stat.halt_wakeup++;
}
+ /*
+ * The VCPU might not be sleeping but is executing the VSIE. Let's
+ * kick it, so it leaves the SIE to process the request.
+ */
+ kvm_s390_vsie_kick(vcpu);
}
enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer)
@@ -1415,6 +1424,13 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
}
fi->counters[FIRQ_CNTR_IO] += 1;
+ if (inti->type & KVM_S390_INT_IO_AI_MASK)
+ VM_EVENT(kvm, 4, "%s", "inject: I/O (AI)");
+ else
+ VM_EVENT(kvm, 4, "inject: I/O %x ss %x schid %04x",
+ inti->io.subchannel_id >> 8,
+ inti->io.subchannel_id >> 1 & 0x3,
+ inti->io.subchannel_nr);
isc = int_word_to_isc(inti->io.io_int_word);
list = &fi->lists[FIRQ_LIST_IO_ISC_0 + isc];
list_add_tail(&inti->list, list);
@@ -1531,13 +1547,6 @@ int kvm_s390_inject_vm(struct kvm *kvm,
inti->mchk.mcic = s390int->parm64;
break;
case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
- if (inti->type & KVM_S390_INT_IO_AI_MASK)
- VM_EVENT(kvm, 5, "%s", "inject: I/O (AI)");
- else
- VM_EVENT(kvm, 5, "inject: I/O css %x ss %x schid %04x",
- s390int->type & IOINT_CSSID_MASK,
- s390int->type & IOINT_SSID_MASK,
- s390int->type & IOINT_SCHID_MASK);
inti->io.subchannel_id = s390int->parm >> 16;
inti->io.subchannel_nr = s390int->parm & 0x0000ffffu;
inti->io.io_int_parm = s390int->parm64 >> 32;
@@ -2237,7 +2246,8 @@ static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e,
return ret;
}
-int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
+int kvm_set_routing_entry(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue)
{
int ret;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 6f5c344cd785..3f3ae4865d57 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -21,11 +21,13 @@
#include <linux/init.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
+#include <linux/mman.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/slab.h>
#include <linux/timer.h>
#include <linux/vmalloc.h>
+#include <linux/bitmap.h>
#include <asm/asm-offsets.h>
#include <asm/lowcore.h>
#include <asm/stp.h>
@@ -35,6 +37,8 @@
#include <asm/switch_to.h>
#include <asm/isc.h>
#include <asm/sclp.h>
+#include <asm/cpacf.h>
+#include <asm/timex.h>
#include "kvm-s390.h"
#include "gaccess.h"
@@ -64,6 +68,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "exit_pei", VCPU_STAT(exit_pei) },
{ "exit_program_interruption", VCPU_STAT(exit_program_interruption) },
{ "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) },
+ { "exit_operation_exception", VCPU_STAT(exit_operation_exception) },
{ "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
{ "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
{ "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
@@ -94,6 +99,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "instruction_stsi", VCPU_STAT(instruction_stsi) },
{ "instruction_stfl", VCPU_STAT(instruction_stfl) },
{ "instruction_tprot", VCPU_STAT(instruction_tprot) },
+ { "instruction_sthyi", VCPU_STAT(instruction_sthyi) },
+ { "instruction_sie", VCPU_STAT(instruction_sie) },
{ "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) },
{ "instruction_sigp_sense_running", VCPU_STAT(instruction_sigp_sense_running) },
{ "instruction_sigp_external_call", VCPU_STAT(instruction_sigp_external_call) },
@@ -119,6 +126,11 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ NULL }
};
+/* allow nested virtualization in KVM (if enabled by user space) */
+static int nested;
+module_param(nested, int, S_IRUGO);
+MODULE_PARM_DESC(nested, "Nested virtualization support");
+
/* upper facilities limit for kvm */
unsigned long kvm_s390_fac_list_mask[16] = {
0xffe6000000000000UL,
@@ -131,7 +143,13 @@ unsigned long kvm_s390_fac_list_mask_size(void)
return ARRAY_SIZE(kvm_s390_fac_list_mask);
}
+/* available cpu features supported by kvm */
+static DECLARE_BITMAP(kvm_s390_available_cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
+/* available subfunctions indicated via query / "test bit" */
+static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc;
+
static struct gmap_notifier gmap_notifier;
+static struct gmap_notifier vsie_gmap_notifier;
debug_info_t *kvm_s390_dbf;
/* Section: not file related */
@@ -141,7 +159,8 @@ int kvm_arch_hardware_enable(void)
return 0;
}
-static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address);
+static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
+ unsigned long end);
/*
* This callback is executed during stop_machine(). All CPUs are therefore
@@ -163,6 +182,8 @@ static int kvm_clock_sync(struct notifier_block *notifier, unsigned long val,
vcpu->arch.sie_block->epoch -= *delta;
if (vcpu->arch.cputm_enabled)
vcpu->arch.cputm_start += *delta;
+ if (vcpu->arch.vsie_block)
+ vcpu->arch.vsie_block->epoch -= *delta;
}
}
return NOTIFY_OK;
@@ -175,7 +196,9 @@ static struct notifier_block kvm_clock_notifier = {
int kvm_arch_hardware_setup(void)
{
gmap_notifier.notifier_call = kvm_gmap_notifier;
- gmap_register_ipte_notifier(&gmap_notifier);
+ gmap_register_pte_notifier(&gmap_notifier);
+ vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier;
+ gmap_register_pte_notifier(&vsie_gmap_notifier);
atomic_notifier_chain_register(&s390_epoch_delta_notifier,
&kvm_clock_notifier);
return 0;
@@ -183,11 +206,109 @@ int kvm_arch_hardware_setup(void)
void kvm_arch_hardware_unsetup(void)
{
- gmap_unregister_ipte_notifier(&gmap_notifier);
+ gmap_unregister_pte_notifier(&gmap_notifier);
+ gmap_unregister_pte_notifier(&vsie_gmap_notifier);
atomic_notifier_chain_unregister(&s390_epoch_delta_notifier,
&kvm_clock_notifier);
}
+static void allow_cpu_feat(unsigned long nr)
+{
+ set_bit_inv(nr, kvm_s390_available_cpu_feat);
+}
+
+static inline int plo_test_bit(unsigned char nr)
+{
+ register unsigned long r0 asm("0") = (unsigned long) nr | 0x100;
+ int cc = 3; /* subfunction not available */
+
+ asm volatile(
+ /* Parameter registers are ignored for "test bit" */
+ " plo 0,0,0,0(0)\n"
+ " ipm %0\n"
+ " srl %0,28\n"
+ : "=d" (cc)
+ : "d" (r0)
+ : "cc");
+ return cc == 0;
+}
+
+static void kvm_s390_cpu_feat_init(void)
+{
+ int i;
+
+ for (i = 0; i < 256; ++i) {
+ if (plo_test_bit(i))
+ kvm_s390_available_subfunc.plo[i >> 3] |= 0x80 >> (i & 7);
+ }
+
+ if (test_facility(28)) /* TOD-clock steering */
+ ptff(kvm_s390_available_subfunc.ptff,
+ sizeof(kvm_s390_available_subfunc.ptff),
+ PTFF_QAF);
+
+ if (test_facility(17)) { /* MSA */
+ __cpacf_query(CPACF_KMAC, kvm_s390_available_subfunc.kmac);
+ __cpacf_query(CPACF_KMC, kvm_s390_available_subfunc.kmc);
+ __cpacf_query(CPACF_KM, kvm_s390_available_subfunc.km);
+ __cpacf_query(CPACF_KIMD, kvm_s390_available_subfunc.kimd);
+ __cpacf_query(CPACF_KLMD, kvm_s390_available_subfunc.klmd);
+ }
+ if (test_facility(76)) /* MSA3 */
+ __cpacf_query(CPACF_PCKMO, kvm_s390_available_subfunc.pckmo);
+ if (test_facility(77)) { /* MSA4 */
+ __cpacf_query(CPACF_KMCTR, kvm_s390_available_subfunc.kmctr);
+ __cpacf_query(CPACF_KMF, kvm_s390_available_subfunc.kmf);
+ __cpacf_query(CPACF_KMO, kvm_s390_available_subfunc.kmo);
+ __cpacf_query(CPACF_PCC, kvm_s390_available_subfunc.pcc);
+ }
+ if (test_facility(57)) /* MSA5 */
+ __cpacf_query(CPACF_PPNO, kvm_s390_available_subfunc.ppno);
+
+ if (MACHINE_HAS_ESOP)
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP);
+ /*
+ * We need SIE support, ESOP (PROT_READ protection for gmap_shadow),
+ * 64bit SCAO (SCA passthrough) and IDTE (for gmap_shadow unshadowing).
+ */
+ if (!sclp.has_sief2 || !MACHINE_HAS_ESOP || !sclp.has_64bscao ||
+ !test_facility(3) || !nested)
+ return;
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIEF2);
+ if (sclp.has_64bscao)
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_64BSCAO);
+ if (sclp.has_siif)
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIIF);
+ if (sclp.has_gpere)
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GPERE);
+ if (sclp.has_gsls)
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GSLS);
+ if (sclp.has_ib)
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IB);
+ if (sclp.has_cei)
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_CEI);
+ if (sclp.has_ibs)
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IBS);
+ /*
+ * KVM_S390_VM_CPU_FEAT_SKEY: Wrong shadow of PTE.I bits will make
+ * all skey handling functions read/set the skey from the PGSTE
+ * instead of the real storage key.
+ *
+ * KVM_S390_VM_CPU_FEAT_CMMA: Wrong shadow of PTE.I bits will make
+ * pages being detected as preserved although they are resident.
+ *
+ * KVM_S390_VM_CPU_FEAT_PFMFI: Wrong shadow of PTE.I bits will
+ * have the same effect as for KVM_S390_VM_CPU_FEAT_SKEY.
+ *
+ * For KVM_S390_VM_CPU_FEAT_SKEY, KVM_S390_VM_CPU_FEAT_CMMA and
+ * KVM_S390_VM_CPU_FEAT_PFMFI, all PTE.I and PGSTE bits have to be
+ * correctly shadowed. We can do that for the PGSTE but not for PTE.I.
+ *
+ * KVM_S390_VM_CPU_FEAT_SIGPIF: Wrong SCB addresses in the SCA. We
+ * cannot easily shadow the SCA because of the ipte lock.
+ */
+}
+
int kvm_arch_init(void *opaque)
{
kvm_s390_dbf = debug_register("kvm-trace", 32, 1, 7 * sizeof(long));
@@ -199,6 +320,8 @@ int kvm_arch_init(void *opaque)
return -ENOMEM;
}
+ kvm_s390_cpu_feat_init();
+
/* Register floating interrupt controller interface. */
return kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC);
}
@@ -244,6 +367,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_S390_USER_STSI:
case KVM_CAP_S390_SKEYS:
case KVM_CAP_S390_IRQ_STATE:
+ case KVM_CAP_S390_USER_INSTR0:
r = 1;
break;
case KVM_CAP_S390_MEM_OP:
@@ -251,8 +375,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
break;
case KVM_CAP_NR_VCPUS:
case KVM_CAP_MAX_VCPUS:
- r = sclp.has_esca ? KVM_S390_ESCA_CPU_SLOTS
- : KVM_S390_BSCA_CPU_SLOTS;
+ r = KVM_S390_BSCA_CPU_SLOTS;
+ if (sclp.has_esca && sclp.has_64bscao)
+ r = KVM_S390_ESCA_CPU_SLOTS;
break;
case KVM_CAP_NR_MEMSLOTS:
r = KVM_USER_MEM_SLOTS;
@@ -335,6 +460,16 @@ out:
return r;
}
+static void icpt_operexc_on_all_vcpus(struct kvm *kvm)
+{
+ unsigned int i;
+ struct kvm_vcpu *vcpu;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ kvm_s390_sync_request(KVM_REQ_ICPT_OPEREXC, vcpu);
+ }
+}
+
static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
{
int r;
@@ -355,7 +490,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
break;
case KVM_CAP_S390_VECTOR_REGISTERS:
mutex_lock(&kvm->lock);
- if (atomic_read(&kvm->online_vcpus)) {
+ if (kvm->created_vcpus) {
r = -EBUSY;
} else if (MACHINE_HAS_VX) {
set_kvm_facility(kvm->arch.model.fac_mask, 129);
@@ -370,7 +505,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
case KVM_CAP_S390_RI:
r = -EINVAL;
mutex_lock(&kvm->lock);
- if (atomic_read(&kvm->online_vcpus)) {
+ if (kvm->created_vcpus) {
r = -EBUSY;
} else if (test_facility(64)) {
set_kvm_facility(kvm->arch.model.fac_mask, 64);
@@ -386,6 +521,12 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
kvm->arch.user_stsi = 1;
r = 0;
break;
+ case KVM_CAP_S390_USER_INSTR0:
+ VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_INSTR0");
+ kvm->arch.user_instr0 = 1;
+ icpt_operexc_on_all_vcpus(kvm);
+ r = 0;
+ break;
default:
r = -EINVAL;
break;
@@ -418,21 +559,23 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
unsigned int idx;
switch (attr->attr) {
case KVM_S390_VM_MEM_ENABLE_CMMA:
- /* enable CMMA only for z10 and later (EDAT_1) */
- ret = -EINVAL;
- if (!MACHINE_IS_LPAR || !MACHINE_HAS_EDAT1)
+ ret = -ENXIO;
+ if (!sclp.has_cmma)
break;
ret = -EBUSY;
VM_EVENT(kvm, 3, "%s", "ENABLE: CMMA support");
mutex_lock(&kvm->lock);
- if (atomic_read(&kvm->online_vcpus) == 0) {
+ if (!kvm->created_vcpus) {
kvm->arch.use_cmma = 1;
ret = 0;
}
mutex_unlock(&kvm->lock);
break;
case KVM_S390_VM_MEM_CLR_CMMA:
+ ret = -ENXIO;
+ if (!sclp.has_cmma)
+ break;
ret = -EINVAL;
if (!kvm->arch.use_cmma)
break;
@@ -461,20 +604,20 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
if (!new_limit)
return -EINVAL;
- /* gmap_alloc takes last usable address */
+ /* gmap_create takes last usable address */
if (new_limit != KVM_S390_NO_MEM_LIMIT)
new_limit -= 1;
ret = -EBUSY;
mutex_lock(&kvm->lock);
- if (atomic_read(&kvm->online_vcpus) == 0) {
- /* gmap_alloc will round the limit up */
- struct gmap *new = gmap_alloc(current->mm, new_limit);
+ if (!kvm->created_vcpus) {
+ /* gmap_create will round the limit up */
+ struct gmap *new = gmap_create(current->mm, new_limit);
if (!new) {
ret = -ENOMEM;
} else {
- gmap_free(kvm->arch.gmap);
+ gmap_remove(kvm->arch.gmap);
new->private = kvm;
kvm->arch.gmap = new;
ret = 0;
@@ -644,7 +787,7 @@ static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr)
int ret = 0;
mutex_lock(&kvm->lock);
- if (atomic_read(&kvm->online_vcpus)) {
+ if (kvm->created_vcpus) {
ret = -EBUSY;
goto out;
}
@@ -676,6 +819,39 @@ out:
return ret;
}
+static int kvm_s390_set_processor_feat(struct kvm *kvm,
+ struct kvm_device_attr *attr)
+{
+ struct kvm_s390_vm_cpu_feat data;
+ int ret = -EBUSY;
+
+ if (copy_from_user(&data, (void __user *)attr->addr, sizeof(data)))
+ return -EFAULT;
+ if (!bitmap_subset((unsigned long *) data.feat,
+ kvm_s390_available_cpu_feat,
+ KVM_S390_VM_CPU_FEAT_NR_BITS))
+ return -EINVAL;
+
+ mutex_lock(&kvm->lock);
+ if (!atomic_read(&kvm->online_vcpus)) {
+ bitmap_copy(kvm->arch.cpu_feat, (unsigned long *) data.feat,
+ KVM_S390_VM_CPU_FEAT_NR_BITS);
+ ret = 0;
+ }
+ mutex_unlock(&kvm->lock);
+ return ret;
+}
+
+static int kvm_s390_set_processor_subfunc(struct kvm *kvm,
+ struct kvm_device_attr *attr)
+{
+ /*
+ * Once supported by kernel + hw, we have to store the subfunctions
+ * in kvm->arch and remember that user space configured them.
+ */
+ return -ENXIO;
+}
+
static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
{
int ret = -ENXIO;
@@ -684,6 +860,12 @@ static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
case KVM_S390_VM_CPU_PROCESSOR:
ret = kvm_s390_set_processor(kvm, attr);
break;
+ case KVM_S390_VM_CPU_PROCESSOR_FEAT:
+ ret = kvm_s390_set_processor_feat(kvm, attr);
+ break;
+ case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
+ ret = kvm_s390_set_processor_subfunc(kvm, attr);
+ break;
}
return ret;
}
@@ -732,6 +914,50 @@ out:
return ret;
}
+static int kvm_s390_get_processor_feat(struct kvm *kvm,
+ struct kvm_device_attr *attr)
+{
+ struct kvm_s390_vm_cpu_feat data;
+
+ bitmap_copy((unsigned long *) data.feat, kvm->arch.cpu_feat,
+ KVM_S390_VM_CPU_FEAT_NR_BITS);
+ if (copy_to_user((void __user *)attr->addr, &data, sizeof(data)))
+ return -EFAULT;
+ return 0;
+}
+
+static int kvm_s390_get_machine_feat(struct kvm *kvm,
+ struct kvm_device_attr *attr)
+{
+ struct kvm_s390_vm_cpu_feat data;
+
+ bitmap_copy((unsigned long *) data.feat,
+ kvm_s390_available_cpu_feat,
+ KVM_S390_VM_CPU_FEAT_NR_BITS);
+ if (copy_to_user((void __user *)attr->addr, &data, sizeof(data)))
+ return -EFAULT;
+ return 0;
+}
+
+static int kvm_s390_get_processor_subfunc(struct kvm *kvm,
+ struct kvm_device_attr *attr)
+{
+ /*
+ * Once we can actually configure subfunctions (kernel + hw support),
+ * we have to check if they were already set by user space, if so copy
+ * them from kvm->arch.
+ */
+ return -ENXIO;
+}
+
+static int kvm_s390_get_machine_subfunc(struct kvm *kvm,
+ struct kvm_device_attr *attr)
+{
+ if (copy_to_user((void __user *)attr->addr, &kvm_s390_available_subfunc,
+ sizeof(struct kvm_s390_vm_cpu_subfunc)))
+ return -EFAULT;
+ return 0;
+}
static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
{
int ret = -ENXIO;
@@ -743,6 +969,18 @@ static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
case KVM_S390_VM_CPU_MACHINE:
ret = kvm_s390_get_machine(kvm, attr);
break;
+ case KVM_S390_VM_CPU_PROCESSOR_FEAT:
+ ret = kvm_s390_get_processor_feat(kvm, attr);
+ break;
+ case KVM_S390_VM_CPU_MACHINE_FEAT:
+ ret = kvm_s390_get_machine_feat(kvm, attr);
+ break;
+ case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
+ ret = kvm_s390_get_processor_subfunc(kvm, attr);
+ break;
+ case KVM_S390_VM_CPU_MACHINE_SUBFUNC:
+ ret = kvm_s390_get_machine_subfunc(kvm, attr);
+ break;
}
return ret;
}
@@ -803,6 +1041,8 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
switch (attr->attr) {
case KVM_S390_VM_MEM_ENABLE_CMMA:
case KVM_S390_VM_MEM_CLR_CMMA:
+ ret = sclp.has_cmma ? 0 : -ENXIO;
+ break;
case KVM_S390_VM_MEM_LIMIT_SIZE:
ret = 0;
break;
@@ -826,8 +1066,13 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
switch (attr->attr) {
case KVM_S390_VM_CPU_PROCESSOR:
case KVM_S390_VM_CPU_MACHINE:
+ case KVM_S390_VM_CPU_PROCESSOR_FEAT:
+ case KVM_S390_VM_CPU_MACHINE_FEAT:
+ case KVM_S390_VM_CPU_MACHINE_SUBFUNC:
ret = 0;
break;
+ /* configuring subfunctions is not supported yet */
+ case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
default:
ret = -ENXIO;
break;
@@ -858,7 +1103,6 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
{
uint8_t *keys;
uint64_t hva;
- unsigned long curkey;
int i, r = 0;
if (args->flags != 0)
@@ -879,26 +1123,27 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
if (!keys)
return -ENOMEM;
+ down_read(&current->mm->mmap_sem);
for (i = 0; i < args->count; i++) {
hva = gfn_to_hva(kvm, args->start_gfn + i);
if (kvm_is_error_hva(hva)) {
r = -EFAULT;
- goto out;
+ break;
}
- curkey = get_guest_storage_key(current->mm, hva);
- if (IS_ERR_VALUE(curkey)) {
- r = curkey;
- goto out;
- }
- keys[i] = curkey;
+ r = get_guest_storage_key(current->mm, hva, &keys[i]);
+ if (r)
+ break;
+ }
+ up_read(&current->mm->mmap_sem);
+
+ if (!r) {
+ r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys,
+ sizeof(uint8_t) * args->count);
+ if (r)
+ r = -EFAULT;
}
- r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys,
- sizeof(uint8_t) * args->count);
- if (r)
- r = -EFAULT;
-out:
kvfree(keys);
return r;
}
@@ -935,24 +1180,25 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
if (r)
goto out;
+ down_read(&current->mm->mmap_sem);
for (i = 0; i < args->count; i++) {
hva = gfn_to_hva(kvm, args->start_gfn + i);
if (kvm_is_error_hva(hva)) {
r = -EFAULT;
- goto out;
+ break;
}
/* Lowest order bit is reserved */
if (keys[i] & 0x01) {
r = -EINVAL;
- goto out;
+ break;
}
- r = set_guest_storage_key(current->mm, hva,
- (unsigned long)keys[i], 0);
+ r = set_guest_storage_key(current->mm, hva, keys[i], 0);
if (r)
- goto out;
+ break;
}
+ up_read(&current->mm->mmap_sem);
out:
kvfree(keys);
return r;
@@ -1129,6 +1375,7 @@ static void sca_dispose(struct kvm *kvm)
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
+ gfp_t alloc_flags = GFP_KERNEL;
int i, rc;
char debug_name[16];
static unsigned long sca_offset;
@@ -1150,9 +1397,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
rc = -ENOMEM;
+ ratelimit_state_init(&kvm->arch.sthyi_limit, 5 * HZ, 500);
+
kvm->arch.use_esca = 0; /* start with basic SCA */
+ if (!sclp.has_64bscao)
+ alloc_flags |= GFP_DMA;
rwlock_init(&kvm->arch.sca_lock);
- kvm->arch.sca = (struct bsca_block *) get_zeroed_page(GFP_KERNEL);
+ kvm->arch.sca = (struct bsca_block *) get_zeroed_page(alloc_flags);
if (!kvm->arch.sca)
goto out_err;
spin_lock(&kvm_lock);
@@ -1189,6 +1440,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
memcpy(kvm->arch.model.fac_list, kvm->arch.model.fac_mask,
S390_ARCH_FAC_LIST_SIZE_BYTE);
+ set_kvm_facility(kvm->arch.model.fac_mask, 74);
+ set_kvm_facility(kvm->arch.model.fac_list, 74);
+
kvm->arch.model.cpuid = kvm_s390_get_initial_cpuid();
kvm->arch.model.ibc = sclp.ibc & 0x0fff;
@@ -1212,7 +1466,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
else
kvm->arch.mem_limit = min_t(unsigned long, TASK_MAX_SIZE,
sclp.hamax + 1);
- kvm->arch.gmap = gmap_alloc(current->mm, kvm->arch.mem_limit - 1);
+ kvm->arch.gmap = gmap_create(current->mm, kvm->arch.mem_limit - 1);
if (!kvm->arch.gmap)
goto out_err;
kvm->arch.gmap->private = kvm;
@@ -1224,6 +1478,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm->arch.epoch = 0;
spin_lock_init(&kvm->arch.start_stop_lock);
+ kvm_s390_vsie_init(kvm);
KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid);
return 0;
@@ -1245,7 +1500,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
sca_del_vcpu(vcpu);
if (kvm_is_ucontrol(vcpu->kvm))
- gmap_free(vcpu->arch.gmap);
+ gmap_remove(vcpu->arch.gmap);
if (vcpu->kvm->arch.use_cmma)
kvm_s390_vcpu_unsetup_cmma(vcpu);
@@ -1278,16 +1533,17 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
debug_unregister(kvm->arch.dbf);
free_page((unsigned long)kvm->arch.sie_page2);
if (!kvm_is_ucontrol(kvm))
- gmap_free(kvm->arch.gmap);
+ gmap_remove(kvm->arch.gmap);
kvm_s390_destroy_adapters(kvm);
kvm_s390_clear_float_irqs(kvm);
+ kvm_s390_vsie_destroy(kvm);
KVM_EVENT(3, "vm 0x%pK destroyed", kvm);
}
/* Section: vcpu related */
static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu)
{
- vcpu->arch.gmap = gmap_alloc(current->mm, -1UL);
+ vcpu->arch.gmap = gmap_create(current->mm, -1UL);
if (!vcpu->arch.gmap)
return -ENOMEM;
vcpu->arch.gmap->private = vcpu->kvm;
@@ -1396,7 +1652,7 @@ static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id)
if (id < KVM_S390_BSCA_CPU_SLOTS)
return true;
- if (!sclp.has_esca)
+ if (!sclp.has_esca || !sclp.has_64bscao)
return false;
mutex_lock(&kvm->lock);
@@ -1537,7 +1793,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
save_access_regs(vcpu->arch.host_acrs);
restore_access_regs(vcpu->run->s.regs.acrs);
- gmap_enable(vcpu->arch.gmap);
+ gmap_enable(vcpu->arch.enabled_gmap);
atomic_or(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
__start_cpu_timer_accounting(vcpu);
@@ -1550,7 +1806,8 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
__stop_cpu_timer_accounting(vcpu);
atomic_andnot(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
- gmap_disable(vcpu->arch.gmap);
+ vcpu->arch.enabled_gmap = gmap_get_enabled();
+ gmap_disable(vcpu->arch.enabled_gmap);
/* Save guest register state */
save_fpu_regs();
@@ -1599,7 +1856,10 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
vcpu->arch.gmap = vcpu->kvm->arch.gmap;
sca_add_vcpu(vcpu);
}
-
+ if (test_kvm_facility(vcpu->kvm, 74) || vcpu->kvm->arch.user_instr0)
+ vcpu->arch.sie_block->ictl |= ICTL_OPEREXC;
+ /* make vcpu_load load the right gmap on the first trigger */
+ vcpu->arch.enabled_gmap = vcpu->arch.gmap;
}
static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu)
@@ -1658,15 +1918,21 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
kvm_s390_vcpu_setup_model(vcpu);
- vcpu->arch.sie_block->ecb = 0x02;
+ /* pgste_set_pte has special handling for !MACHINE_HAS_ESOP */
+ if (MACHINE_HAS_ESOP)
+ vcpu->arch.sie_block->ecb |= 0x02;
if (test_kvm_facility(vcpu->kvm, 9))
vcpu->arch.sie_block->ecb |= 0x04;
- if (test_kvm_facility(vcpu->kvm, 50) && test_kvm_facility(vcpu->kvm, 73))
+ if (test_kvm_facility(vcpu->kvm, 73))
vcpu->arch.sie_block->ecb |= 0x10;
- if (test_kvm_facility(vcpu->kvm, 8))
+ if (test_kvm_facility(vcpu->kvm, 8) && sclp.has_pfmfi)
vcpu->arch.sie_block->ecb2 |= 0x08;
- vcpu->arch.sie_block->eca = 0xC1002000U;
+ vcpu->arch.sie_block->eca = 0x1002000U;
+ if (sclp.has_cei)
+ vcpu->arch.sie_block->eca |= 0x80000000U;
+ if (sclp.has_ib)
+ vcpu->arch.sie_block->eca |= 0x40000000U;
if (sclp.has_siif)
vcpu->arch.sie_block->eca |= 1;
if (sclp.has_sigpif)
@@ -1716,6 +1982,10 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
vcpu->arch.sie_block = &sie_page->sie_block;
vcpu->arch.sie_block->itdba = (unsigned long) &sie_page->itdb;
+ /* the real guest size will always be smaller than msl */
+ vcpu->arch.sie_block->mso = 0;
+ vcpu->arch.sie_block->msl = sclp.hamax;
+
vcpu->arch.sie_block->icpua = id;
spin_lock_init(&vcpu->arch.local_int.lock);
vcpu->arch.local_int.float_int = &kvm->arch.float_int;
@@ -1784,16 +2054,25 @@ void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu)
kvm_s390_vcpu_request(vcpu);
}
-static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address)
+static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
+ unsigned long end)
{
- int i;
struct kvm *kvm = gmap->private;
struct kvm_vcpu *vcpu;
+ unsigned long prefix;
+ int i;
+ if (gmap_is_shadow(gmap))
+ return;
+ if (start >= 1UL << 31)
+ /* We are only interested in prefix pages */
+ return;
kvm_for_each_vcpu(i, vcpu, kvm) {
/* match against both prefix pages */
- if (kvm_s390_get_prefix(vcpu) == (address & ~0x1000UL)) {
- VCPU_EVENT(vcpu, 2, "gmap notifier for %lx", address);
+ prefix = kvm_s390_get_prefix(vcpu);
+ if (prefix <= end && start <= prefix + 2*PAGE_SIZE - 1) {
+ VCPU_EVENT(vcpu, 2, "gmap notifier for %lx-%lx",
+ start, end);
kvm_s390_sync_request(KVM_REQ_MMU_RELOAD, vcpu);
}
}
@@ -2002,6 +2281,8 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
if (dbg->control & ~VALID_GUESTDBG_FLAGS)
return -EINVAL;
+ if (!sclp.has_gpere)
+ return -EINVAL;
if (dbg->control & KVM_GUESTDBG_ENABLE) {
vcpu->guest_debug = dbg->control;
@@ -2070,16 +2351,16 @@ retry:
return 0;
/*
* We use MMU_RELOAD just to re-arm the ipte notifier for the
- * guest prefix page. gmap_ipte_notify will wait on the ptl lock.
+ * guest prefix page. gmap_mprotect_notify will wait on the ptl lock.
* This ensures that the ipte instruction for this request has
* already finished. We might race against a second unmapper that
* wants to set the blocking bit. Lets just retry the request loop.
*/
if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) {
int rc;
- rc = gmap_ipte_notify(vcpu->arch.gmap,
- kvm_s390_get_prefix(vcpu),
- PAGE_SIZE * 2);
+ rc = gmap_mprotect_notify(vcpu->arch.gmap,
+ kvm_s390_get_prefix(vcpu),
+ PAGE_SIZE * 2, PROT_WRITE);
if (rc)
return rc;
goto retry;
@@ -2108,6 +2389,11 @@ retry:
goto retry;
}
+ if (kvm_check_request(KVM_REQ_ICPT_OPEREXC, vcpu)) {
+ vcpu->arch.sie_block->ictl |= ICTL_OPEREXC;
+ goto retry;
+ }
+
/* nothing to do, just clear the request */
clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
@@ -2362,14 +2648,14 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
* guest_enter and guest_exit should be no uaccess.
*/
local_irq_disable();
- __kvm_guest_enter();
+ guest_enter_irqoff();
__disable_cpu_timer_accounting(vcpu);
local_irq_enable();
exit_reason = sie64a(vcpu->arch.sie_block,
vcpu->run->s.regs.gprs);
local_irq_disable();
__enable_cpu_timer_accounting(vcpu);
- __kvm_guest_exit();
+ guest_exit_irqoff();
local_irq_enable();
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
@@ -2598,6 +2884,8 @@ static void __disable_ibs_on_all_vcpus(struct kvm *kvm)
static void __enable_ibs_on_vcpu(struct kvm_vcpu *vcpu)
{
+ if (!sclp.has_ibs)
+ return;
kvm_check_request(KVM_REQ_DISABLE_IBS, vcpu);
kvm_s390_sync_request(KVM_REQ_ENABLE_IBS, vcpu);
}
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 8621ab00ec8e..b8432862a817 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -56,7 +56,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu)
static inline int is_vcpu_idle(struct kvm_vcpu *vcpu)
{
- return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_WAIT;
+ return test_bit(vcpu->vcpu_id, vcpu->arch.local_int.float_int->idle_mask);
}
static inline int kvm_is_ucontrol(struct kvm *kvm)
@@ -175,6 +175,12 @@ static inline int set_kvm_facility(u64 *fac_list, unsigned long nr)
return 0;
}
+static inline int test_kvm_cpu_feat(struct kvm *kvm, unsigned long nr)
+{
+ WARN_ON_ONCE(nr >= KVM_S390_VM_CPU_FEAT_NR_BITS);
+ return test_bit_inv(nr, kvm->arch.cpu_feat);
+}
+
/* are cpu states controlled by user space */
static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm)
{
@@ -232,6 +238,8 @@ static inline void kvm_s390_forward_psw(struct kvm_vcpu *vcpu, int ilen)
}
static inline void kvm_s390_retry_instr(struct kvm_vcpu *vcpu)
{
+ /* don't inject PER events if we re-execute the instruction */
+ vcpu->arch.sie_block->icptstatus &= ~0x02;
kvm_s390_rewind_psw(vcpu, kvm_s390_get_ilen(vcpu));
}
@@ -246,10 +254,21 @@ int kvm_s390_handle_stctl(struct kvm_vcpu *vcpu);
int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu);
int kvm_s390_handle_eb(struct kvm_vcpu *vcpu);
+/* implemented in vsie.c */
+int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu);
+void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu);
+void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
+ unsigned long end);
+void kvm_s390_vsie_init(struct kvm *kvm);
+void kvm_s390_vsie_destroy(struct kvm *kvm);
+
/* implemented in sigp.c */
int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu);
+/* implemented in sthyi.c */
+int handle_sthyi(struct kvm_vcpu *vcpu);
+
/* implemented in kvm-s390.c */
void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod);
long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable);
@@ -360,6 +379,7 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu,
struct kvm_guest_debug *dbg);
void kvm_s390_clear_bp_data(struct kvm_vcpu *vcpu);
void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu);
+int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu);
void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu);
/* support for Basic/Extended SCA handling */
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 95916fa7c670..46160388e996 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -27,6 +27,7 @@
#include <asm/io.h>
#include <asm/ptrace.h>
#include <asm/compat.h>
+#include <asm/sclp.h>
#include "gaccess.h"
#include "kvm-s390.h"
#include "trace.h"
@@ -152,30 +153,166 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
static int __skey_check_enable(struct kvm_vcpu *vcpu)
{
int rc = 0;
+
+ trace_kvm_s390_skey_related_inst(vcpu);
if (!(vcpu->arch.sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE)))
return rc;
rc = s390_enable_skey();
- VCPU_EVENT(vcpu, 3, "%s", "enabling storage keys for guest");
- trace_kvm_s390_skey_related_inst(vcpu);
- vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE);
+ VCPU_EVENT(vcpu, 3, "enabling storage keys for guest: %d", rc);
+ if (!rc)
+ vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE);
return rc;
}
-
-static int handle_skey(struct kvm_vcpu *vcpu)
+static int try_handle_skey(struct kvm_vcpu *vcpu)
{
- int rc = __skey_check_enable(vcpu);
+ int rc;
+ vcpu->stat.instruction_storage_key++;
+ rc = __skey_check_enable(vcpu);
if (rc)
return rc;
- vcpu->stat.instruction_storage_key++;
-
+ if (sclp.has_skey) {
+ /* with storage-key facility, SIE interprets it for us */
+ kvm_s390_retry_instr(vcpu);
+ VCPU_EVENT(vcpu, 4, "%s", "retrying storage key operation");
+ return -EAGAIN;
+ }
if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+ return 0;
+}
- kvm_s390_retry_instr(vcpu);
- VCPU_EVENT(vcpu, 4, "%s", "retrying storage key operation");
+static int handle_iske(struct kvm_vcpu *vcpu)
+{
+ unsigned long addr;
+ unsigned char key;
+ int reg1, reg2;
+ int rc;
+
+ rc = try_handle_skey(vcpu);
+ if (rc)
+ return rc != -EAGAIN ? rc : 0;
+
+ kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
+
+ addr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
+ addr = kvm_s390_logical_to_effective(vcpu, addr);
+ addr = kvm_s390_real_to_abs(vcpu, addr);
+ addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(addr));
+ if (kvm_is_error_hva(addr))
+ return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+ down_read(&current->mm->mmap_sem);
+ rc = get_guest_storage_key(current->mm, addr, &key);
+ up_read(&current->mm->mmap_sem);
+ if (rc)
+ return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+ vcpu->run->s.regs.gprs[reg1] &= ~0xff;
+ vcpu->run->s.regs.gprs[reg1] |= key;
+ return 0;
+}
+
+static int handle_rrbe(struct kvm_vcpu *vcpu)
+{
+ unsigned long addr;
+ int reg1, reg2;
+ int rc;
+
+ rc = try_handle_skey(vcpu);
+ if (rc)
+ return rc != -EAGAIN ? rc : 0;
+
+ kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
+
+ addr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
+ addr = kvm_s390_logical_to_effective(vcpu, addr);
+ addr = kvm_s390_real_to_abs(vcpu, addr);
+ addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(addr));
+ if (kvm_is_error_hva(addr))
+ return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+ down_read(&current->mm->mmap_sem);
+ rc = reset_guest_reference_bit(current->mm, addr);
+ up_read(&current->mm->mmap_sem);
+ if (rc < 0)
+ return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+ kvm_s390_set_psw_cc(vcpu, rc);
+ return 0;
+}
+
+#define SSKE_NQ 0x8
+#define SSKE_MR 0x4
+#define SSKE_MC 0x2
+#define SSKE_MB 0x1
+static int handle_sske(struct kvm_vcpu *vcpu)
+{
+ unsigned char m3 = vcpu->arch.sie_block->ipb >> 28;
+ unsigned long start, end;
+ unsigned char key, oldkey;
+ int reg1, reg2;
+ int rc;
+
+ rc = try_handle_skey(vcpu);
+ if (rc)
+ return rc != -EAGAIN ? rc : 0;
+
+ if (!test_kvm_facility(vcpu->kvm, 8))
+ m3 &= ~SSKE_MB;
+ if (!test_kvm_facility(vcpu->kvm, 10))
+ m3 &= ~(SSKE_MC | SSKE_MR);
+ if (!test_kvm_facility(vcpu->kvm, 14))
+ m3 &= ~SSKE_NQ;
+
+ kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
+
+ key = vcpu->run->s.regs.gprs[reg1] & 0xfe;
+ start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
+ start = kvm_s390_logical_to_effective(vcpu, start);
+ if (m3 & SSKE_MB) {
+ /* start already designates an absolute address */
+ end = (start + (1UL << 20)) & ~((1UL << 20) - 1);
+ } else {
+ start = kvm_s390_real_to_abs(vcpu, start);
+ end = start + PAGE_SIZE;
+ }
+
+ while (start != end) {
+ unsigned long addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start));
+
+ if (kvm_is_error_hva(addr))
+ return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+ down_read(&current->mm->mmap_sem);
+ rc = cond_set_guest_storage_key(current->mm, addr, key, &oldkey,
+ m3 & SSKE_NQ, m3 & SSKE_MR,
+ m3 & SSKE_MC);
+ up_read(&current->mm->mmap_sem);
+ if (rc < 0)
+ return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+ start += PAGE_SIZE;
+ };
+
+ if (m3 & (SSKE_MC | SSKE_MR)) {
+ if (m3 & SSKE_MB) {
+ /* skey in reg1 is unpredictable */
+ kvm_s390_set_psw_cc(vcpu, 3);
+ } else {
+ kvm_s390_set_psw_cc(vcpu, rc);
+ vcpu->run->s.regs.gprs[reg1] &= ~0xff00UL;
+ vcpu->run->s.regs.gprs[reg1] |= (u64) oldkey << 8;
+ }
+ }
+ if (m3 & SSKE_MB) {
+ if (psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_AMODE_64BIT)
+ vcpu->run->s.regs.gprs[reg2] &= ~PAGE_MASK;
+ else
+ vcpu->run->s.regs.gprs[reg2] &= ~0xfffff000UL;
+ end = kvm_s390_logical_to_effective(vcpu, end);
+ vcpu->run->s.regs.gprs[reg2] |= end;
+ }
return 0;
}
@@ -582,10 +719,11 @@ static const intercept_handler_t b2_handlers[256] = {
[0x10] = handle_set_prefix,
[0x11] = handle_store_prefix,
[0x12] = handle_store_cpu_address,
+ [0x14] = kvm_s390_handle_vsie,
[0x21] = handle_ipte_interlock,
- [0x29] = handle_skey,
- [0x2a] = handle_skey,
- [0x2b] = handle_skey,
+ [0x29] = handle_iske,
+ [0x2a] = handle_rrbe,
+ [0x2b] = handle_sske,
[0x2c] = handle_test_block,
[0x30] = handle_io_inst,
[0x31] = handle_io_inst,
@@ -654,8 +792,10 @@ static int handle_epsw(struct kvm_vcpu *vcpu)
static int handle_pfmf(struct kvm_vcpu *vcpu)
{
+ bool mr = false, mc = false, nq;
int reg1, reg2;
unsigned long start, end;
+ unsigned char key;
vcpu->stat.instruction_pfmf++;
@@ -675,15 +815,27 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
!test_kvm_facility(vcpu->kvm, 14))
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
- /* No support for conditional-SSKE */
- if (vcpu->run->s.regs.gprs[reg1] & (PFMF_MR | PFMF_MC))
- return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+ /* Only provide conditional-SSKE support if enabled for the guest */
+ if (vcpu->run->s.regs.gprs[reg1] & PFMF_SK &&
+ test_kvm_facility(vcpu->kvm, 10)) {
+ mr = vcpu->run->s.regs.gprs[reg1] & PFMF_MR;
+ mc = vcpu->run->s.regs.gprs[reg1] & PFMF_MC;
+ }
+ nq = vcpu->run->s.regs.gprs[reg1] & PFMF_NQ;
+ key = vcpu->run->s.regs.gprs[reg1] & PFMF_KEY;
start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
start = kvm_s390_logical_to_effective(vcpu, start);
+ if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) {
+ if (kvm_s390_check_low_addr_prot_real(vcpu, start))
+ return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
+ }
+
switch (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) {
case 0x00000000:
+ /* only 4k frames specify a real address */
+ start = kvm_s390_real_to_abs(vcpu, start);
end = (start + (1UL << 12)) & ~((1UL << 12) - 1);
break;
case 0x00001000:
@@ -701,20 +853,11 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
}
- if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) {
- if (kvm_s390_check_low_addr_prot_real(vcpu, start))
- return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
- }
-
- while (start < end) {
- unsigned long useraddr, abs_addr;
+ while (start != end) {
+ unsigned long useraddr;
/* Translate guest address to host address */
- if ((vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) == 0)
- abs_addr = kvm_s390_real_to_abs(vcpu, start);
- else
- abs_addr = start;
- useraddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(abs_addr));
+ useraddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start));
if (kvm_is_error_hva(useraddr))
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
@@ -728,16 +871,25 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
if (rc)
return rc;
- if (set_guest_storage_key(current->mm, useraddr,
- vcpu->run->s.regs.gprs[reg1] & PFMF_KEY,
- vcpu->run->s.regs.gprs[reg1] & PFMF_NQ))
+ down_read(&current->mm->mmap_sem);
+ rc = cond_set_guest_storage_key(current->mm, useraddr,
+ key, NULL, nq, mr, mc);
+ up_read(&current->mm->mmap_sem);
+ if (rc < 0)
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
}
start += PAGE_SIZE;
}
- if (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC)
- vcpu->run->s.regs.gprs[reg2] = end;
+ if (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) {
+ if (psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_AMODE_64BIT) {
+ vcpu->run->s.regs.gprs[reg2] = end;
+ } else {
+ vcpu->run->s.regs.gprs[reg2] &= ~0xffffffffUL;
+ end = kvm_s390_logical_to_effective(vcpu, end);
+ vcpu->run->s.regs.gprs[reg2] |= end;
+ }
+ }
return 0;
}
@@ -1033,7 +1185,15 @@ static int handle_sckpf(struct kvm_vcpu *vcpu)
return 0;
}
+static int handle_ptff(struct kvm_vcpu *vcpu)
+{
+ /* we don't emulate any control instructions yet */
+ kvm_s390_set_psw_cc(vcpu, 3);
+ return 0;
+}
+
static const intercept_handler_t x01_handlers[256] = {
+ [0x04] = handle_ptff,
[0x07] = handle_sckpf,
};
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index 28ea0cab1f1b..1a252f537081 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -77,18 +77,18 @@ static int __sigp_conditional_emergency(struct kvm_vcpu *vcpu,
const u64 psw_int_mask = PSW_MASK_IO | PSW_MASK_EXT;
u16 p_asn, s_asn;
psw_t *psw;
- u32 flags;
+ bool idle;
- flags = atomic_read(&dst_vcpu->arch.sie_block->cpuflags);
+ idle = is_vcpu_idle(vcpu);
psw = &dst_vcpu->arch.sie_block->gpsw;
p_asn = dst_vcpu->arch.sie_block->gcr[4] & 0xffff; /* Primary ASN */
s_asn = dst_vcpu->arch.sie_block->gcr[3] & 0xffff; /* Secondary ASN */
/* Inject the emergency signal? */
- if (!(flags & CPUSTAT_STOPPED)
+ if (!is_vcpu_stopped(vcpu)
|| (psw->mask & psw_int_mask) != psw_int_mask
- || ((flags & CPUSTAT_WAIT) && psw->addr != 0)
- || (!(flags & CPUSTAT_WAIT) && (asn == p_asn || asn == s_asn))) {
+ || (idle && psw->addr != 0)
+ || (!idle && (asn == p_asn || asn == s_asn))) {
return __inject_sigp_emergency(vcpu, dst_vcpu);
} else {
*reg &= 0xffffffff00000000UL;
diff --git a/arch/s390/kvm/sthyi.c b/arch/s390/kvm/sthyi.c
new file mode 100644
index 000000000000..bd98b7d25200
--- /dev/null
+++ b/arch/s390/kvm/sthyi.c
@@ -0,0 +1,471 @@
+/*
+ * store hypervisor information instruction emulation functions.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ * Copyright IBM Corp. 2016
+ * Author(s): Janosch Frank <frankja@linux.vnet.ibm.com>
+ */
+#include <linux/kvm_host.h>
+#include <linux/errno.h>
+#include <linux/pagemap.h>
+#include <linux/vmalloc.h>
+#include <linux/ratelimit.h>
+
+#include <asm/kvm_host.h>
+#include <asm/asm-offsets.h>
+#include <asm/sclp.h>
+#include <asm/diag.h>
+#include <asm/sysinfo.h>
+#include <asm/ebcdic.h>
+
+#include "kvm-s390.h"
+#include "gaccess.h"
+#include "trace.h"
+
+#define DED_WEIGHT 0xffff
+/*
+ * CP and IFL as EBCDIC strings, SP/0x40 determines the end of string
+ * as they are justified with spaces.
+ */
+#define CP 0xc3d7404040404040UL
+#define IFL 0xc9c6d34040404040UL
+
+enum hdr_flags {
+ HDR_NOT_LPAR = 0x10,
+ HDR_STACK_INCM = 0x20,
+ HDR_STSI_UNAV = 0x40,
+ HDR_PERF_UNAV = 0x80,
+};
+
+enum mac_validity {
+ MAC_NAME_VLD = 0x20,
+ MAC_ID_VLD = 0x40,
+ MAC_CNT_VLD = 0x80,
+};
+
+enum par_flag {
+ PAR_MT_EN = 0x80,
+};
+
+enum par_validity {
+ PAR_GRP_VLD = 0x08,
+ PAR_ID_VLD = 0x10,
+ PAR_ABS_VLD = 0x20,
+ PAR_WGHT_VLD = 0x40,
+ PAR_PCNT_VLD = 0x80,
+};
+
+struct hdr_sctn {
+ u8 infhflg1;
+ u8 infhflg2; /* reserved */
+ u8 infhval1; /* reserved */
+ u8 infhval2; /* reserved */
+ u8 reserved[3];
+ u8 infhygct;
+ u16 infhtotl;
+ u16 infhdln;
+ u16 infmoff;
+ u16 infmlen;
+ u16 infpoff;
+ u16 infplen;
+ u16 infhoff1;
+ u16 infhlen1;
+ u16 infgoff1;
+ u16 infglen1;
+ u16 infhoff2;
+ u16 infhlen2;
+ u16 infgoff2;
+ u16 infglen2;
+ u16 infhoff3;
+ u16 infhlen3;
+ u16 infgoff3;
+ u16 infglen3;
+ u8 reserved2[4];
+} __packed;
+
+struct mac_sctn {
+ u8 infmflg1; /* reserved */
+ u8 infmflg2; /* reserved */
+ u8 infmval1;
+ u8 infmval2; /* reserved */
+ u16 infmscps;
+ u16 infmdcps;
+ u16 infmsifl;
+ u16 infmdifl;
+ char infmname[8];
+ char infmtype[4];
+ char infmmanu[16];
+ char infmseq[16];
+ char infmpman[4];
+ u8 reserved[4];
+} __packed;
+
+struct par_sctn {
+ u8 infpflg1;
+ u8 infpflg2; /* reserved */
+ u8 infpval1;
+ u8 infpval2; /* reserved */
+ u16 infppnum;
+ u16 infpscps;
+ u16 infpdcps;
+ u16 infpsifl;
+ u16 infpdifl;
+ u16 reserved;
+ char infppnam[8];
+ u32 infpwbcp;
+ u32 infpabcp;
+ u32 infpwbif;
+ u32 infpabif;
+ char infplgnm[8];
+ u32 infplgcp;
+ u32 infplgif;
+} __packed;
+
+struct sthyi_sctns {
+ struct hdr_sctn hdr;
+ struct mac_sctn mac;
+ struct par_sctn par;
+} __packed;
+
+struct cpu_inf {
+ u64 lpar_cap;
+ u64 lpar_grp_cap;
+ u64 lpar_weight;
+ u64 all_weight;
+ int cpu_num_ded;
+ int cpu_num_shd;
+};
+
+struct lpar_cpu_inf {
+ struct cpu_inf cp;
+ struct cpu_inf ifl;
+};
+
+static inline u64 cpu_id(u8 ctidx, void *diag224_buf)
+{
+ return *((u64 *)(diag224_buf + (ctidx + 1) * DIAG204_CPU_NAME_LEN));
+}
+
+/*
+ * Scales the cpu capping from the lpar range to the one expected in
+ * sthyi data.
+ *
+ * diag204 reports a cap in hundredths of processor units.
+ * z/VM's range for one core is 0 - 0x10000.
+ */
+static u32 scale_cap(u32 in)
+{
+ return (0x10000 * in) / 100;
+}
+
+static void fill_hdr(struct sthyi_sctns *sctns)
+{
+ sctns->hdr.infhdln = sizeof(sctns->hdr);
+ sctns->hdr.infmoff = sizeof(sctns->hdr);
+ sctns->hdr.infmlen = sizeof(sctns->mac);
+ sctns->hdr.infplen = sizeof(sctns->par);
+ sctns->hdr.infpoff = sctns->hdr.infhdln + sctns->hdr.infmlen;
+ sctns->hdr.infhtotl = sctns->hdr.infpoff + sctns->hdr.infplen;
+}
+
+static void fill_stsi_mac(struct sthyi_sctns *sctns,
+ struct sysinfo_1_1_1 *sysinfo)
+{
+ if (stsi(sysinfo, 1, 1, 1))
+ return;
+
+ sclp_ocf_cpc_name_copy(sctns->mac.infmname);
+
+ memcpy(sctns->mac.infmtype, sysinfo->type, sizeof(sctns->mac.infmtype));
+ memcpy(sctns->mac.infmmanu, sysinfo->manufacturer, sizeof(sctns->mac.infmmanu));
+ memcpy(sctns->mac.infmpman, sysinfo->plant, sizeof(sctns->mac.infmpman));
+ memcpy(sctns->mac.infmseq, sysinfo->sequence, sizeof(sctns->mac.infmseq));
+
+ sctns->mac.infmval1 |= MAC_ID_VLD | MAC_NAME_VLD;
+}
+
+static void fill_stsi_par(struct sthyi_sctns *sctns,
+ struct sysinfo_2_2_2 *sysinfo)
+{
+ if (stsi(sysinfo, 2, 2, 2))
+ return;
+
+ sctns->par.infppnum = sysinfo->lpar_number;
+ memcpy(sctns->par.infppnam, sysinfo->name, sizeof(sctns->par.infppnam));
+
+ sctns->par.infpval1 |= PAR_ID_VLD;
+}
+
+static void fill_stsi(struct sthyi_sctns *sctns)
+{
+ void *sysinfo;
+
+ /* Errors are handled through the validity bits in the response. */
+ sysinfo = (void *)__get_free_page(GFP_KERNEL);
+ if (!sysinfo)
+ return;
+
+ fill_stsi_mac(sctns, sysinfo);
+ fill_stsi_par(sctns, sysinfo);
+
+ free_pages((unsigned long)sysinfo, 0);
+}
+
+static void fill_diag_mac(struct sthyi_sctns *sctns,
+ struct diag204_x_phys_block *block,
+ void *diag224_buf)
+{
+ int i;
+
+ for (i = 0; i < block->hdr.cpus; i++) {
+ switch (cpu_id(block->cpus[i].ctidx, diag224_buf)) {
+ case CP:
+ if (block->cpus[i].weight == DED_WEIGHT)
+ sctns->mac.infmdcps++;
+ else
+ sctns->mac.infmscps++;
+ break;
+ case IFL:
+ if (block->cpus[i].weight == DED_WEIGHT)
+ sctns->mac.infmdifl++;
+ else
+ sctns->mac.infmsifl++;
+ break;
+ }
+ }
+ sctns->mac.infmval1 |= MAC_CNT_VLD;
+}
+
+/* Returns a pointer to the the next partition block. */
+static struct diag204_x_part_block *lpar_cpu_inf(struct lpar_cpu_inf *part_inf,
+ bool this_lpar,
+ void *diag224_buf,
+ struct diag204_x_part_block *block)
+{
+ int i, capped = 0, weight_cp = 0, weight_ifl = 0;
+ struct cpu_inf *cpu_inf;
+
+ for (i = 0; i < block->hdr.rcpus; i++) {
+ if (!(block->cpus[i].cflag & DIAG204_CPU_ONLINE))
+ continue;
+
+ switch (cpu_id(block->cpus[i].ctidx, diag224_buf)) {
+ case CP:
+ cpu_inf = &part_inf->cp;
+ if (block->cpus[i].cur_weight < DED_WEIGHT)
+ weight_cp |= block->cpus[i].cur_weight;
+ break;
+ case IFL:
+ cpu_inf = &part_inf->ifl;
+ if (block->cpus[i].cur_weight < DED_WEIGHT)
+ weight_ifl |= block->cpus[i].cur_weight;
+ break;
+ default:
+ continue;
+ }
+
+ if (!this_lpar)
+ continue;
+
+ capped |= block->cpus[i].cflag & DIAG204_CPU_CAPPED;
+ cpu_inf->lpar_cap |= block->cpus[i].cpu_type_cap;
+ cpu_inf->lpar_grp_cap |= block->cpus[i].group_cpu_type_cap;
+
+ if (block->cpus[i].weight == DED_WEIGHT)
+ cpu_inf->cpu_num_ded += 1;
+ else
+ cpu_inf->cpu_num_shd += 1;
+ }
+
+ if (this_lpar && capped) {
+ part_inf->cp.lpar_weight = weight_cp;
+ part_inf->ifl.lpar_weight = weight_ifl;
+ }
+ part_inf->cp.all_weight += weight_cp;
+ part_inf->ifl.all_weight += weight_ifl;
+ return (struct diag204_x_part_block *)&block->cpus[i];
+}
+
+static void fill_diag(struct sthyi_sctns *sctns)
+{
+ int i, r, pages;
+ bool this_lpar;
+ void *diag204_buf;
+ void *diag224_buf = NULL;
+ struct diag204_x_info_blk_hdr *ti_hdr;
+ struct diag204_x_part_block *part_block;
+ struct diag204_x_phys_block *phys_block;
+ struct lpar_cpu_inf lpar_inf = {};
+
+ /* Errors are handled through the validity bits in the response. */
+ pages = diag204((unsigned long)DIAG204_SUBC_RSI |
+ (unsigned long)DIAG204_INFO_EXT, 0, NULL);
+ if (pages <= 0)
+ return;
+
+ diag204_buf = vmalloc(PAGE_SIZE * pages);
+ if (!diag204_buf)
+ return;
+
+ r = diag204((unsigned long)DIAG204_SUBC_STIB7 |
+ (unsigned long)DIAG204_INFO_EXT, pages, diag204_buf);
+ if (r < 0)
+ goto out;
+
+ diag224_buf = kmalloc(PAGE_SIZE, GFP_KERNEL | GFP_DMA);
+ if (!diag224_buf || diag224(diag224_buf))
+ goto out;
+
+ ti_hdr = diag204_buf;
+ part_block = diag204_buf + sizeof(*ti_hdr);
+
+ for (i = 0; i < ti_hdr->npar; i++) {
+ /*
+ * For the calling lpar we also need to get the cpu
+ * caps and weights. The time information block header
+ * specifies the offset to the partition block of the
+ * caller lpar, so we know when we process its data.
+ */
+ this_lpar = (void *)part_block - diag204_buf == ti_hdr->this_part;
+ part_block = lpar_cpu_inf(&lpar_inf, this_lpar, diag224_buf,
+ part_block);
+ }
+
+ phys_block = (struct diag204_x_phys_block *)part_block;
+ part_block = diag204_buf + ti_hdr->this_part;
+ if (part_block->hdr.mtid)
+ sctns->par.infpflg1 = PAR_MT_EN;
+
+ sctns->par.infpval1 |= PAR_GRP_VLD;
+ sctns->par.infplgcp = scale_cap(lpar_inf.cp.lpar_grp_cap);
+ sctns->par.infplgif = scale_cap(lpar_inf.ifl.lpar_grp_cap);
+ memcpy(sctns->par.infplgnm, part_block->hdr.hardware_group_name,
+ sizeof(sctns->par.infplgnm));
+
+ sctns->par.infpscps = lpar_inf.cp.cpu_num_shd;
+ sctns->par.infpdcps = lpar_inf.cp.cpu_num_ded;
+ sctns->par.infpsifl = lpar_inf.ifl.cpu_num_shd;
+ sctns->par.infpdifl = lpar_inf.ifl.cpu_num_ded;
+ sctns->par.infpval1 |= PAR_PCNT_VLD;
+
+ sctns->par.infpabcp = scale_cap(lpar_inf.cp.lpar_cap);
+ sctns->par.infpabif = scale_cap(lpar_inf.ifl.lpar_cap);
+ sctns->par.infpval1 |= PAR_ABS_VLD;
+
+ /*
+ * Everything below needs global performance data to be
+ * meaningful.
+ */
+ if (!(ti_hdr->flags & DIAG204_LPAR_PHYS_FLG)) {
+ sctns->hdr.infhflg1 |= HDR_PERF_UNAV;
+ goto out;
+ }
+
+ fill_diag_mac(sctns, phys_block, diag224_buf);
+
+ if (lpar_inf.cp.lpar_weight) {
+ sctns->par.infpwbcp = sctns->mac.infmscps * 0x10000 *
+ lpar_inf.cp.lpar_weight / lpar_inf.cp.all_weight;
+ }
+
+ if (lpar_inf.ifl.lpar_weight) {
+ sctns->par.infpwbif = sctns->mac.infmsifl * 0x10000 *
+ lpar_inf.ifl.lpar_weight / lpar_inf.ifl.all_weight;
+ }
+ sctns->par.infpval1 |= PAR_WGHT_VLD;
+
+out:
+ kfree(diag224_buf);
+ vfree(diag204_buf);
+}
+
+static int sthyi(u64 vaddr)
+{
+ register u64 code asm("0") = 0;
+ register u64 addr asm("2") = vaddr;
+ int cc;
+
+ asm volatile(
+ ".insn rre,0xB2560000,%[code],%[addr]\n"
+ "ipm %[cc]\n"
+ "srl %[cc],28\n"
+ : [cc] "=d" (cc)
+ : [code] "d" (code), [addr] "a" (addr)
+ : "memory", "cc");
+ return cc;
+}
+
+int handle_sthyi(struct kvm_vcpu *vcpu)
+{
+ int reg1, reg2, r = 0;
+ u64 code, addr, cc = 0;
+ struct sthyi_sctns *sctns = NULL;
+
+ /*
+ * STHYI requires extensive locking in the higher hypervisors
+ * and is very computational/memory expensive. Therefore we
+ * ratelimit the executions per VM.
+ */
+ if (!__ratelimit(&vcpu->kvm->arch.sthyi_limit)) {
+ kvm_s390_retry_instr(vcpu);
+ return 0;
+ }
+
+ kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
+ code = vcpu->run->s.regs.gprs[reg1];
+ addr = vcpu->run->s.regs.gprs[reg2];
+
+ vcpu->stat.instruction_sthyi++;
+ VCPU_EVENT(vcpu, 3, "STHYI: fc: %llu addr: 0x%016llx", code, addr);
+ trace_kvm_s390_handle_sthyi(vcpu, code, addr);
+
+ if (reg1 == reg2 || reg1 & 1 || reg2 & 1 || addr & ~PAGE_MASK)
+ return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+ if (code & 0xffff) {
+ cc = 3;
+ goto out;
+ }
+
+ /*
+ * If the page has not yet been faulted in, we want to do that
+ * now and not after all the expensive calculations.
+ */
+ r = write_guest(vcpu, addr, reg2, &cc, 1);
+ if (r)
+ return kvm_s390_inject_prog_cond(vcpu, r);
+
+ sctns = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!sctns)
+ return -ENOMEM;
+
+ /*
+ * If we are a guest, we don't want to emulate an emulated
+ * instruction. We ask the hypervisor to provide the data.
+ */
+ if (test_facility(74)) {
+ cc = sthyi((u64)sctns);
+ goto out;
+ }
+
+ fill_hdr(sctns);
+ fill_stsi(sctns);
+ fill_diag(sctns);
+
+out:
+ if (!cc) {
+ r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE);
+ if (r) {
+ free_page((unsigned long)sctns);
+ return kvm_s390_inject_prog_cond(vcpu, r);
+ }
+ }
+
+ free_page((unsigned long)sctns);
+ vcpu->run->s.regs.gprs[reg2 + 1] = cc ? 4 : 0;
+ kvm_s390_set_psw_cc(vcpu, cc);
+ return r;
+}
diff --git a/arch/s390/kvm/trace.h b/arch/s390/kvm/trace.h
index 916834d7a73a..4fc9d4e5be89 100644
--- a/arch/s390/kvm/trace.h
+++ b/arch/s390/kvm/trace.h
@@ -41,7 +41,7 @@ TRACE_EVENT(kvm_s390_skey_related_inst,
TP_fast_assign(
VCPU_ASSIGN_COMMON
),
- VCPU_TP_PRINTK("%s", "first instruction related to skeys on vcpu")
+ VCPU_TP_PRINTK("%s", "storage key related instruction")
);
TRACE_EVENT(kvm_s390_major_guest_pfault,
@@ -185,8 +185,10 @@ TRACE_EVENT(kvm_s390_intercept_prog,
__entry->code = code;
),
- VCPU_TP_PRINTK("intercepted program interruption %04x",
- __entry->code)
+ VCPU_TP_PRINTK("intercepted program interruption %04x (%s)",
+ __entry->code,
+ __print_symbolic(__entry->code,
+ icpt_prog_codes))
);
/*
@@ -412,6 +414,47 @@ TRACE_EVENT(kvm_s390_handle_stsi,
__entry->addr)
);
+TRACE_EVENT(kvm_s390_handle_operexc,
+ TP_PROTO(VCPU_PROTO_COMMON, __u16 ipa, __u32 ipb),
+ TP_ARGS(VCPU_ARGS_COMMON, ipa, ipb),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(__u64, instruction)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->instruction = ((__u64)ipa << 48) |
+ ((__u64)ipb << 16);
+ ),
+
+ VCPU_TP_PRINTK("operation exception on instruction %016llx (%s)",
+ __entry->instruction,
+ __print_symbolic(icpt_insn_decoder(__entry->instruction),
+ icpt_insn_codes))
+ );
+
+TRACE_EVENT(kvm_s390_handle_sthyi,
+ TP_PROTO(VCPU_PROTO_COMMON, u64 code, u64 addr),
+ TP_ARGS(VCPU_ARGS_COMMON, code, addr),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(u64, code)
+ __field(u64, addr)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->code = code;
+ __entry->addr = addr;
+ ),
+
+ VCPU_TP_PRINTK("STHYI fc: %llu addr: %016llx",
+ __entry->code, __entry->addr)
+ );
+
#endif /* _TRACE_KVM_H */
/* This part must be outside protection */
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
new file mode 100644
index 000000000000..c106488b4137
--- /dev/null
+++ b/arch/s390/kvm/vsie.c
@@ -0,0 +1,1091 @@
+/*
+ * kvm nested virtualization support for s390x
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ * Author(s): David Hildenbrand <dahi@linux.vnet.ibm.com>
+ */
+#include <linux/vmalloc.h>
+#include <linux/kvm_host.h>
+#include <linux/bug.h>
+#include <linux/list.h>
+#include <linux/bitmap.h>
+#include <asm/gmap.h>
+#include <asm/mmu_context.h>
+#include <asm/sclp.h>
+#include <asm/nmi.h>
+#include <asm/dis.h>
+#include "kvm-s390.h"
+#include "gaccess.h"
+
+struct vsie_page {
+ struct kvm_s390_sie_block scb_s; /* 0x0000 */
+ /* the pinned originial scb */
+ struct kvm_s390_sie_block *scb_o; /* 0x0200 */
+ /* the shadow gmap in use by the vsie_page */
+ struct gmap *gmap; /* 0x0208 */
+ /* address of the last reported fault to guest2 */
+ unsigned long fault_addr; /* 0x0210 */
+ __u8 reserved[0x0700 - 0x0218]; /* 0x0218 */
+ struct kvm_s390_crypto_cb crycb; /* 0x0700 */
+ __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */
+} __packed;
+
+/* trigger a validity icpt for the given scb */
+static int set_validity_icpt(struct kvm_s390_sie_block *scb,
+ __u16 reason_code)
+{
+ scb->ipa = 0x1000;
+ scb->ipb = ((__u32) reason_code) << 16;
+ scb->icptcode = ICPT_VALIDITY;
+ return 1;
+}
+
+/* mark the prefix as unmapped, this will block the VSIE */
+static void prefix_unmapped(struct vsie_page *vsie_page)
+{
+ atomic_or(PROG_REQUEST, &vsie_page->scb_s.prog20);
+}
+
+/* mark the prefix as unmapped and wait until the VSIE has been left */
+static void prefix_unmapped_sync(struct vsie_page *vsie_page)
+{
+ prefix_unmapped(vsie_page);
+ if (vsie_page->scb_s.prog0c & PROG_IN_SIE)
+ atomic_or(CPUSTAT_STOP_INT, &vsie_page->scb_s.cpuflags);
+ while (vsie_page->scb_s.prog0c & PROG_IN_SIE)
+ cpu_relax();
+}
+
+/* mark the prefix as mapped, this will allow the VSIE to run */
+static void prefix_mapped(struct vsie_page *vsie_page)
+{
+ atomic_andnot(PROG_REQUEST, &vsie_page->scb_s.prog20);
+}
+
+/* test if the prefix is mapped into the gmap shadow */
+static int prefix_is_mapped(struct vsie_page *vsie_page)
+{
+ return !(atomic_read(&vsie_page->scb_s.prog20) & PROG_REQUEST);
+}
+
+/* copy the updated intervention request bits into the shadow scb */
+static void update_intervention_requests(struct vsie_page *vsie_page)
+{
+ const int bits = CPUSTAT_STOP_INT | CPUSTAT_IO_INT | CPUSTAT_EXT_INT;
+ int cpuflags;
+
+ cpuflags = atomic_read(&vsie_page->scb_o->cpuflags);
+ atomic_andnot(bits, &vsie_page->scb_s.cpuflags);
+ atomic_or(cpuflags & bits, &vsie_page->scb_s.cpuflags);
+}
+
+/* shadow (filter and validate) the cpuflags */
+static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+ int newflags, cpuflags = atomic_read(&scb_o->cpuflags);
+
+ /* we don't allow ESA/390 guests */
+ if (!(cpuflags & CPUSTAT_ZARCH))
+ return set_validity_icpt(scb_s, 0x0001U);
+
+ if (cpuflags & (CPUSTAT_RRF | CPUSTAT_MCDS))
+ return set_validity_icpt(scb_s, 0x0001U);
+ else if (cpuflags & (CPUSTAT_SLSV | CPUSTAT_SLSR))
+ return set_validity_icpt(scb_s, 0x0007U);
+
+ /* intervention requests will be set later */
+ newflags = CPUSTAT_ZARCH;
+ if (cpuflags & CPUSTAT_GED && test_kvm_facility(vcpu->kvm, 8))
+ newflags |= CPUSTAT_GED;
+ if (cpuflags & CPUSTAT_GED2 && test_kvm_facility(vcpu->kvm, 78)) {
+ if (cpuflags & CPUSTAT_GED)
+ return set_validity_icpt(scb_s, 0x0001U);
+ newflags |= CPUSTAT_GED2;
+ }
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GPERE))
+ newflags |= cpuflags & CPUSTAT_P;
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GSLS))
+ newflags |= cpuflags & CPUSTAT_SM;
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IBS))
+ newflags |= cpuflags & CPUSTAT_IBS;
+
+ atomic_set(&scb_s->cpuflags, newflags);
+ return 0;
+}
+
+/*
+ * Create a shadow copy of the crycb block and setup key wrapping, if
+ * requested for guest 3 and enabled for guest 2.
+ *
+ * We only accept format-1 (no AP in g2), but convert it into format-2
+ * There is nothing to do for format-0.
+ *
+ * Returns: - 0 if shadowed or nothing to do
+ * - > 0 if control has to be given to guest 2
+ */
+static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+ u32 crycb_addr = scb_o->crycbd & 0x7ffffff8U;
+ unsigned long *b1, *b2;
+ u8 ecb3_flags;
+
+ scb_s->crycbd = 0;
+ if (!(scb_o->crycbd & vcpu->arch.sie_block->crycbd & CRYCB_FORMAT1))
+ return 0;
+ /* format-1 is supported with message-security-assist extension 3 */
+ if (!test_kvm_facility(vcpu->kvm, 76))
+ return 0;
+ /* we may only allow it if enabled for guest 2 */
+ ecb3_flags = scb_o->ecb3 & vcpu->arch.sie_block->ecb3 &
+ (ECB3_AES | ECB3_DEA);
+ if (!ecb3_flags)
+ return 0;
+
+ if ((crycb_addr & PAGE_MASK) != ((crycb_addr + 128) & PAGE_MASK))
+ return set_validity_icpt(scb_s, 0x003CU);
+ else if (!crycb_addr)
+ return set_validity_icpt(scb_s, 0x0039U);
+
+ /* copy only the wrapping keys */
+ if (read_guest_real(vcpu, crycb_addr + 72, &vsie_page->crycb, 56))
+ return set_validity_icpt(scb_s, 0x0035U);
+
+ scb_s->ecb3 |= ecb3_flags;
+ scb_s->crycbd = ((__u32)(__u64) &vsie_page->crycb) | CRYCB_FORMAT1 |
+ CRYCB_FORMAT2;
+
+ /* xor both blocks in one run */
+ b1 = (unsigned long *) vsie_page->crycb.dea_wrapping_key_mask;
+ b2 = (unsigned long *)
+ vcpu->kvm->arch.crypto.crycb->dea_wrapping_key_mask;
+ /* as 56%8 == 0, bitmap_xor won't overwrite any data */
+ bitmap_xor(b1, b1, b2, BITS_PER_BYTE * 56);
+ return 0;
+}
+
+/* shadow (round up/down) the ibc to avoid validity icpt */
+static void prepare_ibc(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+ __u64 min_ibc = (sclp.ibc >> 16) & 0x0fffU;
+
+ scb_s->ibc = 0;
+ /* ibc installed in g2 and requested for g3 */
+ if (vcpu->kvm->arch.model.ibc && (scb_o->ibc & 0x0fffU)) {
+ scb_s->ibc = scb_o->ibc & 0x0fffU;
+ /* takte care of the minimum ibc level of the machine */
+ if (scb_s->ibc < min_ibc)
+ scb_s->ibc = min_ibc;
+ /* take care of the maximum ibc level set for the guest */
+ if (scb_s->ibc > vcpu->kvm->arch.model.ibc)
+ scb_s->ibc = vcpu->kvm->arch.model.ibc;
+ }
+}
+
+/* unshadow the scb, copying parameters back to the real scb */
+static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+
+ /* interception */
+ scb_o->icptcode = scb_s->icptcode;
+ scb_o->icptstatus = scb_s->icptstatus;
+ scb_o->ipa = scb_s->ipa;
+ scb_o->ipb = scb_s->ipb;
+ scb_o->gbea = scb_s->gbea;
+
+ /* timer */
+ scb_o->cputm = scb_s->cputm;
+ scb_o->ckc = scb_s->ckc;
+ scb_o->todpr = scb_s->todpr;
+
+ /* guest state */
+ scb_o->gpsw = scb_s->gpsw;
+ scb_o->gg14 = scb_s->gg14;
+ scb_o->gg15 = scb_s->gg15;
+ memcpy(scb_o->gcr, scb_s->gcr, 128);
+ scb_o->pp = scb_s->pp;
+
+ /* interrupt intercept */
+ switch (scb_s->icptcode) {
+ case ICPT_PROGI:
+ case ICPT_INSTPROGI:
+ case ICPT_EXTINT:
+ memcpy((void *)((u64)scb_o + 0xc0),
+ (void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0);
+ break;
+ case ICPT_PARTEXEC:
+ /* MVPG only */
+ memcpy((void *)((u64)scb_o + 0xc0),
+ (void *)((u64)scb_s + 0xc0), 0xd0 - 0xc0);
+ break;
+ }
+
+ if (scb_s->ihcpu != 0xffffU)
+ scb_o->ihcpu = scb_s->ihcpu;
+}
+
+/*
+ * Setup the shadow scb by copying and checking the relevant parts of the g2
+ * provided scb.
+ *
+ * Returns: - 0 if the scb has been shadowed
+ * - > 0 if control has to be given to guest 2
+ */
+static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ bool had_tx = scb_s->ecb & 0x10U;
+ unsigned long new_mso = 0;
+ int rc;
+
+ /* make sure we don't have any leftovers when reusing the scb */
+ scb_s->icptcode = 0;
+ scb_s->eca = 0;
+ scb_s->ecb = 0;
+ scb_s->ecb2 = 0;
+ scb_s->ecb3 = 0;
+ scb_s->ecd = 0;
+ scb_s->fac = 0;
+
+ rc = prepare_cpuflags(vcpu, vsie_page);
+ if (rc)
+ goto out;
+
+ /* timer */
+ scb_s->cputm = scb_o->cputm;
+ scb_s->ckc = scb_o->ckc;
+ scb_s->todpr = scb_o->todpr;
+ scb_s->epoch = scb_o->epoch;
+
+ /* guest state */
+ scb_s->gpsw = scb_o->gpsw;
+ scb_s->gg14 = scb_o->gg14;
+ scb_s->gg15 = scb_o->gg15;
+ memcpy(scb_s->gcr, scb_o->gcr, 128);
+ scb_s->pp = scb_o->pp;
+
+ /* interception / execution handling */
+ scb_s->gbea = scb_o->gbea;
+ scb_s->lctl = scb_o->lctl;
+ scb_s->svcc = scb_o->svcc;
+ scb_s->ictl = scb_o->ictl;
+ /*
+ * SKEY handling functions can't deal with false setting of PTE invalid
+ * bits. Therefore we cannot provide interpretation and would later
+ * have to provide own emulation handlers.
+ */
+ scb_s->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE;
+ scb_s->icpua = scb_o->icpua;
+
+ if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_SM))
+ new_mso = scb_o->mso & 0xfffffffffff00000UL;
+ /* if the hva of the prefix changes, we have to remap the prefix */
+ if (scb_s->mso != new_mso || scb_s->prefix != scb_o->prefix)
+ prefix_unmapped(vsie_page);
+ /* SIE will do mso/msl validity and exception checks for us */
+ scb_s->msl = scb_o->msl & 0xfffffffffff00000UL;
+ scb_s->mso = new_mso;
+ scb_s->prefix = scb_o->prefix;
+
+ /* We have to definetly flush the tlb if this scb never ran */
+ if (scb_s->ihcpu != 0xffffU)
+ scb_s->ihcpu = scb_o->ihcpu;
+
+ /* MVPG and Protection Exception Interpretation are always available */
+ scb_s->eca |= scb_o->eca & 0x01002000U;
+ /* Host-protection-interruption introduced with ESOP */
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP))
+ scb_s->ecb |= scb_o->ecb & 0x02U;
+ /* transactional execution */
+ if (test_kvm_facility(vcpu->kvm, 73)) {
+ /* remap the prefix is tx is toggled on */
+ if ((scb_o->ecb & 0x10U) && !had_tx)
+ prefix_unmapped(vsie_page);
+ scb_s->ecb |= scb_o->ecb & 0x10U;
+ }
+ /* SIMD */
+ if (test_kvm_facility(vcpu->kvm, 129)) {
+ scb_s->eca |= scb_o->eca & 0x00020000U;
+ scb_s->ecd |= scb_o->ecd & 0x20000000U;
+ }
+ /* Run-time-Instrumentation */
+ if (test_kvm_facility(vcpu->kvm, 64))
+ scb_s->ecb3 |= scb_o->ecb3 & 0x01U;
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIIF))
+ scb_s->eca |= scb_o->eca & 0x00000001U;
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IB))
+ scb_s->eca |= scb_o->eca & 0x40000000U;
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_CEI))
+ scb_s->eca |= scb_o->eca & 0x80000000U;
+
+ prepare_ibc(vcpu, vsie_page);
+ rc = shadow_crycb(vcpu, vsie_page);
+out:
+ if (rc)
+ unshadow_scb(vcpu, vsie_page);
+ return rc;
+}
+
+void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
+ unsigned long end)
+{
+ struct kvm *kvm = gmap->private;
+ struct vsie_page *cur;
+ unsigned long prefix;
+ struct page *page;
+ int i;
+
+ if (!gmap_is_shadow(gmap))
+ return;
+ if (start >= 1UL << 31)
+ /* We are only interested in prefix pages */
+ return;
+
+ /*
+ * Only new shadow blocks are added to the list during runtime,
+ * therefore we can safely reference them all the time.
+ */
+ for (i = 0; i < kvm->arch.vsie.page_count; i++) {
+ page = READ_ONCE(kvm->arch.vsie.pages[i]);
+ if (!page)
+ continue;
+ cur = page_to_virt(page);
+ if (READ_ONCE(cur->gmap) != gmap)
+ continue;
+ prefix = cur->scb_s.prefix << GUEST_PREFIX_SHIFT;
+ /* with mso/msl, the prefix lies at an offset */
+ prefix += cur->scb_s.mso;
+ if (prefix <= end && start <= prefix + 2 * PAGE_SIZE - 1)
+ prefix_unmapped_sync(cur);
+ }
+}
+
+/*
+ * Map the first prefix page and if tx is enabled also the second prefix page.
+ *
+ * The prefix will be protected, a gmap notifier will inform about unmaps.
+ * The shadow scb must not be executed until the prefix is remapped, this is
+ * guaranteed by properly handling PROG_REQUEST.
+ *
+ * Returns: - 0 on if successfully mapped or already mapped
+ * - > 0 if control has to be given to guest 2
+ * - -EAGAIN if the caller can retry immediately
+ * - -ENOMEM if out of memory
+ */
+static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ u64 prefix = scb_s->prefix << GUEST_PREFIX_SHIFT;
+ int rc;
+
+ if (prefix_is_mapped(vsie_page))
+ return 0;
+
+ /* mark it as mapped so we can catch any concurrent unmappers */
+ prefix_mapped(vsie_page);
+
+ /* with mso/msl, the prefix lies at offset *mso* */
+ prefix += scb_s->mso;
+
+ rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix);
+ if (!rc && (scb_s->ecb & 0x10U))
+ rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
+ prefix + PAGE_SIZE);
+ /*
+ * We don't have to mprotect, we will be called for all unshadows.
+ * SIE will detect if protection applies and trigger a validity.
+ */
+ if (rc)
+ prefix_unmapped(vsie_page);
+ if (rc > 0 || rc == -EFAULT)
+ rc = set_validity_icpt(scb_s, 0x0037U);
+ return rc;
+}
+
+/*
+ * Pin the guest page given by gpa and set hpa to the pinned host address.
+ * Will always be pinned writable.
+ *
+ * Returns: - 0 on success
+ * - -EINVAL if the gpa is not valid guest storage
+ * - -ENOMEM if out of memory
+ */
+static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa)
+{
+ struct page *page;
+ hva_t hva;
+ int rc;
+
+ hva = gfn_to_hva(kvm, gpa_to_gfn(gpa));
+ if (kvm_is_error_hva(hva))
+ return -EINVAL;
+ rc = get_user_pages_fast(hva, 1, 1, &page);
+ if (rc < 0)
+ return rc;
+ else if (rc != 1)
+ return -ENOMEM;
+ *hpa = (hpa_t) page_to_virt(page) + (gpa & ~PAGE_MASK);
+ return 0;
+}
+
+/* Unpins a page previously pinned via pin_guest_page, marking it as dirty. */
+static void unpin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t hpa)
+{
+ struct page *page;
+
+ page = virt_to_page(hpa);
+ set_page_dirty_lock(page);
+ put_page(page);
+ /* mark the page always as dirty for migration */
+ mark_page_dirty(kvm, gpa_to_gfn(gpa));
+}
+
+/* unpin all blocks previously pinned by pin_blocks(), marking them dirty */
+static void unpin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ hpa_t hpa;
+ gpa_t gpa;
+
+ hpa = (u64) scb_s->scaoh << 32 | scb_s->scaol;
+ if (hpa) {
+ gpa = scb_o->scaol & ~0xfUL;
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO))
+ gpa |= (u64) scb_o->scaoh << 32;
+ unpin_guest_page(vcpu->kvm, gpa, hpa);
+ scb_s->scaol = 0;
+ scb_s->scaoh = 0;
+ }
+
+ hpa = scb_s->itdba;
+ if (hpa) {
+ gpa = scb_o->itdba & ~0xffUL;
+ unpin_guest_page(vcpu->kvm, gpa, hpa);
+ scb_s->itdba = 0;
+ }
+
+ hpa = scb_s->gvrd;
+ if (hpa) {
+ gpa = scb_o->gvrd & ~0x1ffUL;
+ unpin_guest_page(vcpu->kvm, gpa, hpa);
+ scb_s->gvrd = 0;
+ }
+
+ hpa = scb_s->riccbd;
+ if (hpa) {
+ gpa = scb_o->riccbd & ~0x3fUL;
+ unpin_guest_page(vcpu->kvm, gpa, hpa);
+ scb_s->riccbd = 0;
+ }
+}
+
+/*
+ * Instead of shadowing some blocks, we can simply forward them because the
+ * addresses in the scb are 64 bit long.
+ *
+ * This works as long as the data lies in one page. If blocks ever exceed one
+ * page, we have to fall back to shadowing.
+ *
+ * As we reuse the sca, the vcpu pointers contained in it are invalid. We must
+ * therefore not enable any facilities that access these pointers (e.g. SIGPIF).
+ *
+ * Returns: - 0 if all blocks were pinned.
+ * - > 0 if control has to be given to guest 2
+ * - -ENOMEM if out of memory
+ */
+static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ hpa_t hpa;
+ gpa_t gpa;
+ int rc = 0;
+
+ gpa = scb_o->scaol & ~0xfUL;
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO))
+ gpa |= (u64) scb_o->scaoh << 32;
+ if (gpa) {
+ if (!(gpa & ~0x1fffUL))
+ rc = set_validity_icpt(scb_s, 0x0038U);
+ else if ((gpa & ~0x1fffUL) == kvm_s390_get_prefix(vcpu))
+ rc = set_validity_icpt(scb_s, 0x0011U);
+ else if ((gpa & PAGE_MASK) !=
+ ((gpa + sizeof(struct bsca_block) - 1) & PAGE_MASK))
+ rc = set_validity_icpt(scb_s, 0x003bU);
+ if (!rc) {
+ rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+ if (rc == -EINVAL)
+ rc = set_validity_icpt(scb_s, 0x0034U);
+ }
+ if (rc)
+ goto unpin;
+ scb_s->scaoh = (u32)((u64)hpa >> 32);
+ scb_s->scaol = (u32)(u64)hpa;
+ }
+
+ gpa = scb_o->itdba & ~0xffUL;
+ if (gpa && (scb_s->ecb & 0x10U)) {
+ if (!(gpa & ~0x1fffU)) {
+ rc = set_validity_icpt(scb_s, 0x0080U);
+ goto unpin;
+ }
+ /* 256 bytes cannot cross page boundaries */
+ rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+ if (rc == -EINVAL)
+ rc = set_validity_icpt(scb_s, 0x0080U);
+ if (rc)
+ goto unpin;
+ scb_s->itdba = hpa;
+ }
+
+ gpa = scb_o->gvrd & ~0x1ffUL;
+ if (gpa && (scb_s->eca & 0x00020000U) &&
+ !(scb_s->ecd & 0x20000000U)) {
+ if (!(gpa & ~0x1fffUL)) {
+ rc = set_validity_icpt(scb_s, 0x1310U);
+ goto unpin;
+ }
+ /*
+ * 512 bytes vector registers cannot cross page boundaries
+ * if this block gets bigger, we have to shadow it.
+ */
+ rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+ if (rc == -EINVAL)
+ rc = set_validity_icpt(scb_s, 0x1310U);
+ if (rc)
+ goto unpin;
+ scb_s->gvrd = hpa;
+ }
+
+ gpa = scb_o->riccbd & ~0x3fUL;
+ if (gpa && (scb_s->ecb3 & 0x01U)) {
+ if (!(gpa & ~0x1fffUL)) {
+ rc = set_validity_icpt(scb_s, 0x0043U);
+ goto unpin;
+ }
+ /* 64 bytes cannot cross page boundaries */
+ rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+ if (rc == -EINVAL)
+ rc = set_validity_icpt(scb_s, 0x0043U);
+ /* Validity 0x0044 will be checked by SIE */
+ if (rc)
+ goto unpin;
+ scb_s->gvrd = hpa;
+ }
+ return 0;
+unpin:
+ unpin_blocks(vcpu, vsie_page);
+ return rc;
+}
+
+/* unpin the scb provided by guest 2, marking it as dirty */
+static void unpin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
+ gpa_t gpa)
+{
+ hpa_t hpa = (hpa_t) vsie_page->scb_o;
+
+ if (hpa)
+ unpin_guest_page(vcpu->kvm, gpa, hpa);
+ vsie_page->scb_o = NULL;
+}
+
+/*
+ * Pin the scb at gpa provided by guest 2 at vsie_page->scb_o.
+ *
+ * Returns: - 0 if the scb was pinned.
+ * - > 0 if control has to be given to guest 2
+ * - -ENOMEM if out of memory
+ */
+static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
+ gpa_t gpa)
+{
+ hpa_t hpa;
+ int rc;
+
+ rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+ if (rc == -EINVAL) {
+ rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+ if (!rc)
+ rc = 1;
+ }
+ if (!rc)
+ vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa;
+ return rc;
+}
+
+/*
+ * Inject a fault into guest 2.
+ *
+ * Returns: - > 0 if control has to be given to guest 2
+ * < 0 if an error occurred during injection.
+ */
+static int inject_fault(struct kvm_vcpu *vcpu, __u16 code, __u64 vaddr,
+ bool write_flag)
+{
+ struct kvm_s390_pgm_info pgm = {
+ .code = code,
+ .trans_exc_code =
+ /* 0-51: virtual address */
+ (vaddr & 0xfffffffffffff000UL) |
+ /* 52-53: store / fetch */
+ (((unsigned int) !write_flag) + 1) << 10,
+ /* 62-63: asce id (alway primary == 0) */
+ .exc_access_id = 0, /* always primary */
+ .op_access_id = 0, /* not MVPG */
+ };
+ int rc;
+
+ if (code == PGM_PROTECTION)
+ pgm.trans_exc_code |= 0x4UL;
+
+ rc = kvm_s390_inject_prog_irq(vcpu, &pgm);
+ return rc ? rc : 1;
+}
+
+/*
+ * Handle a fault during vsie execution on a gmap shadow.
+ *
+ * Returns: - 0 if the fault was resolved
+ * - > 0 if control has to be given to guest 2
+ * - < 0 if an error occurred
+ */
+static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ int rc;
+
+ if (current->thread.gmap_int_code == PGM_PROTECTION)
+ /* we can directly forward all protection exceptions */
+ return inject_fault(vcpu, PGM_PROTECTION,
+ current->thread.gmap_addr, 1);
+
+ rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
+ current->thread.gmap_addr);
+ if (rc > 0) {
+ rc = inject_fault(vcpu, rc,
+ current->thread.gmap_addr,
+ current->thread.gmap_write_flag);
+ if (rc >= 0)
+ vsie_page->fault_addr = current->thread.gmap_addr;
+ }
+ return rc;
+}
+
+/*
+ * Retry the previous fault that required guest 2 intervention. This avoids
+ * one superfluous SIE re-entry and direct exit.
+ *
+ * Will ignore any errors. The next SIE fault will do proper fault handling.
+ */
+static void handle_last_fault(struct kvm_vcpu *vcpu,
+ struct vsie_page *vsie_page)
+{
+ if (vsie_page->fault_addr)
+ kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
+ vsie_page->fault_addr);
+ vsie_page->fault_addr = 0;
+}
+
+static inline void clear_vsie_icpt(struct vsie_page *vsie_page)
+{
+ vsie_page->scb_s.icptcode = 0;
+}
+
+/* rewind the psw and clear the vsie icpt, so we can retry execution */
+static void retry_vsie_icpt(struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ int ilen = insn_length(scb_s->ipa >> 8);
+
+ /* take care of EXECUTE instructions */
+ if (scb_s->icptstatus & 1) {
+ ilen = (scb_s->icptstatus >> 4) & 0x6;
+ if (!ilen)
+ ilen = 4;
+ }
+ scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, ilen);
+ clear_vsie_icpt(vsie_page);
+}
+
+/*
+ * Try to shadow + enable the guest 2 provided facility list.
+ * Retry instruction execution if enabled for and provided by guest 2.
+ *
+ * Returns: - 0 if handled (retry or guest 2 icpt)
+ * - > 0 if control has to be given to guest 2
+ */
+static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ __u32 fac = vsie_page->scb_o->fac & 0x7ffffff8U;
+
+ if (fac && test_kvm_facility(vcpu->kvm, 7)) {
+ retry_vsie_icpt(vsie_page);
+ if (read_guest_real(vcpu, fac, &vsie_page->fac,
+ sizeof(vsie_page->fac)))
+ return set_validity_icpt(scb_s, 0x1090U);
+ scb_s->fac = (__u32)(__u64) &vsie_page->fac;
+ }
+ return 0;
+}
+
+/*
+ * Run the vsie on a shadow scb and a shadow gmap, without any further
+ * sanity checks, handling SIE faults.
+ *
+ * Returns: - 0 everything went fine
+ * - > 0 if control has to be given to guest 2
+ * - < 0 if an error occurred
+ */
+static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+ int rc;
+
+ handle_last_fault(vcpu, vsie_page);
+
+ if (need_resched())
+ schedule();
+ if (test_cpu_flag(CIF_MCCK_PENDING))
+ s390_handle_mcck();
+
+ srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+ local_irq_disable();
+ guest_enter_irqoff();
+ local_irq_enable();
+
+ rc = sie64a(scb_s, vcpu->run->s.regs.gprs);
+
+ local_irq_disable();
+ guest_exit_irqoff();
+ local_irq_enable();
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+ if (rc > 0)
+ rc = 0; /* we could still have an icpt */
+ else if (rc == -EFAULT)
+ return handle_fault(vcpu, vsie_page);
+
+ switch (scb_s->icptcode) {
+ case ICPT_INST:
+ if (scb_s->ipa == 0xb2b0)
+ rc = handle_stfle(vcpu, vsie_page);
+ break;
+ case ICPT_STOP:
+ /* stop not requested by g2 - must have been a kick */
+ if (!(atomic_read(&scb_o->cpuflags) & CPUSTAT_STOP_INT))
+ clear_vsie_icpt(vsie_page);
+ break;
+ case ICPT_VALIDITY:
+ if ((scb_s->ipa & 0xf000) != 0xf000)
+ scb_s->ipa += 0x1000;
+ break;
+ }
+ return rc;
+}
+
+static void release_gmap_shadow(struct vsie_page *vsie_page)
+{
+ if (vsie_page->gmap)
+ gmap_put(vsie_page->gmap);
+ WRITE_ONCE(vsie_page->gmap, NULL);
+ prefix_unmapped(vsie_page);
+}
+
+static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
+ struct vsie_page *vsie_page)
+{
+ unsigned long asce;
+ union ctlreg0 cr0;
+ struct gmap *gmap;
+ int edat;
+
+ asce = vcpu->arch.sie_block->gcr[1];
+ cr0.val = vcpu->arch.sie_block->gcr[0];
+ edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8);
+ edat += edat && test_kvm_facility(vcpu->kvm, 78);
+
+ /*
+ * ASCE or EDAT could have changed since last icpt, or the gmap
+ * we're holding has been unshadowed. If the gmap is still valid,
+ * we can safely reuse it.
+ */
+ if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat))
+ return 0;
+
+ /* release the old shadow - if any, and mark the prefix as unmapped */
+ release_gmap_shadow(vsie_page);
+ gmap = gmap_shadow(vcpu->arch.gmap, asce, edat);
+ if (IS_ERR(gmap))
+ return PTR_ERR(gmap);
+ gmap->private = vcpu->kvm;
+ WRITE_ONCE(vsie_page->gmap, gmap);
+ return 0;
+}
+
+/*
+ * Register the shadow scb at the VCPU, e.g. for kicking out of vsie.
+ */
+static void register_shadow_scb(struct kvm_vcpu *vcpu,
+ struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+
+ WRITE_ONCE(vcpu->arch.vsie_block, &vsie_page->scb_s);
+ /*
+ * External calls have to lead to a kick of the vcpu and
+ * therefore the vsie -> Simulate Wait state.
+ */
+ atomic_or(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
+ /*
+ * We have to adjust the g3 epoch by the g2 epoch. The epoch will
+ * automatically be adjusted on tod clock changes via kvm_sync_clock.
+ */
+ preempt_disable();
+ scb_s->epoch += vcpu->kvm->arch.epoch;
+ preempt_enable();
+}
+
+/*
+ * Unregister a shadow scb from a VCPU.
+ */
+static void unregister_shadow_scb(struct kvm_vcpu *vcpu)
+{
+ atomic_andnot(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
+ WRITE_ONCE(vcpu->arch.vsie_block, NULL);
+}
+
+/*
+ * Run the vsie on a shadowed scb, managing the gmap shadow, handling
+ * prefix pages and faults.
+ *
+ * Returns: - 0 if no errors occurred
+ * - > 0 if control has to be given to guest 2
+ * - -ENOMEM if out of memory
+ */
+static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ int rc = 0;
+
+ while (1) {
+ rc = acquire_gmap_shadow(vcpu, vsie_page);
+ if (!rc)
+ rc = map_prefix(vcpu, vsie_page);
+ if (!rc) {
+ gmap_enable(vsie_page->gmap);
+ update_intervention_requests(vsie_page);
+ rc = do_vsie_run(vcpu, vsie_page);
+ gmap_enable(vcpu->arch.gmap);
+ }
+ atomic_andnot(PROG_BLOCK_SIE, &scb_s->prog20);
+
+ if (rc == -EAGAIN)
+ rc = 0;
+ if (rc || scb_s->icptcode || signal_pending(current) ||
+ kvm_s390_vcpu_has_irq(vcpu, 0))
+ break;
+ };
+
+ if (rc == -EFAULT) {
+ /*
+ * Addressing exceptions are always presentes as intercepts.
+ * As addressing exceptions are suppressing and our guest 3 PSW
+ * points at the responsible instruction, we have to
+ * forward the PSW and set the ilc. If we can't read guest 3
+ * instruction, we can use an arbitrary ilc. Let's always use
+ * ilen = 4 for now, so we can avoid reading in guest 3 virtual
+ * memory. (we could also fake the shadow so the hardware
+ * handles it).
+ */
+ scb_s->icptcode = ICPT_PROGI;
+ scb_s->iprcc = PGM_ADDRESSING;
+ scb_s->pgmilc = 4;
+ scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, 4);
+ }
+ return rc;
+}
+
+/*
+ * Get or create a vsie page for a scb address.
+ *
+ * Returns: - address of a vsie page (cached or new one)
+ * - NULL if the same scb address is already used by another VCPU
+ * - ERR_PTR(-ENOMEM) if out of memory
+ */
+static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr)
+{
+ struct vsie_page *vsie_page;
+ struct page *page;
+ int nr_vcpus;
+
+ rcu_read_lock();
+ page = radix_tree_lookup(&kvm->arch.vsie.addr_to_page, addr >> 9);
+ rcu_read_unlock();
+ if (page) {
+ if (page_ref_inc_return(page) == 2)
+ return page_to_virt(page);
+ page_ref_dec(page);
+ }
+
+ /*
+ * We want at least #online_vcpus shadows, so every VCPU can execute
+ * the VSIE in parallel.
+ */
+ nr_vcpus = atomic_read(&kvm->online_vcpus);
+
+ mutex_lock(&kvm->arch.vsie.mutex);
+ if (kvm->arch.vsie.page_count < nr_vcpus) {
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO | GFP_DMA);
+ if (!page) {
+ mutex_unlock(&kvm->arch.vsie.mutex);
+ return ERR_PTR(-ENOMEM);
+ }
+ page_ref_inc(page);
+ kvm->arch.vsie.pages[kvm->arch.vsie.page_count] = page;
+ kvm->arch.vsie.page_count++;
+ } else {
+ /* reuse an existing entry that belongs to nobody */
+ while (true) {
+ page = kvm->arch.vsie.pages[kvm->arch.vsie.next];
+ if (page_ref_inc_return(page) == 2)
+ break;
+ page_ref_dec(page);
+ kvm->arch.vsie.next++;
+ kvm->arch.vsie.next %= nr_vcpus;
+ }
+ radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9);
+ }
+ page->index = addr;
+ /* double use of the same address */
+ if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, page)) {
+ page_ref_dec(page);
+ mutex_unlock(&kvm->arch.vsie.mutex);
+ return NULL;
+ }
+ mutex_unlock(&kvm->arch.vsie.mutex);
+
+ vsie_page = page_to_virt(page);
+ memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block));
+ release_gmap_shadow(vsie_page);
+ vsie_page->fault_addr = 0;
+ vsie_page->scb_s.ihcpu = 0xffffU;
+ return vsie_page;
+}
+
+/* put a vsie page acquired via get_vsie_page */
+static void put_vsie_page(struct kvm *kvm, struct vsie_page *vsie_page)
+{
+ struct page *page = pfn_to_page(__pa(vsie_page) >> PAGE_SHIFT);
+
+ page_ref_dec(page);
+}
+
+int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu)
+{
+ struct vsie_page *vsie_page;
+ unsigned long scb_addr;
+ int rc;
+
+ vcpu->stat.instruction_sie++;
+ if (!test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIEF2))
+ return -EOPNOTSUPP;
+ if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+ return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+
+ BUILD_BUG_ON(sizeof(struct vsie_page) != 4096);
+ scb_addr = kvm_s390_get_base_disp_s(vcpu, NULL);
+
+ /* 512 byte alignment */
+ if (unlikely(scb_addr & 0x1ffUL))
+ return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+ if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0))
+ return 0;
+
+ vsie_page = get_vsie_page(vcpu->kvm, scb_addr);
+ if (IS_ERR(vsie_page))
+ return PTR_ERR(vsie_page);
+ else if (!vsie_page)
+ /* double use of sie control block - simply do nothing */
+ return 0;
+
+ rc = pin_scb(vcpu, vsie_page, scb_addr);
+ if (rc)
+ goto out_put;
+ rc = shadow_scb(vcpu, vsie_page);
+ if (rc)
+ goto out_unpin_scb;
+ rc = pin_blocks(vcpu, vsie_page);
+ if (rc)
+ goto out_unshadow;
+ register_shadow_scb(vcpu, vsie_page);
+ rc = vsie_run(vcpu, vsie_page);
+ unregister_shadow_scb(vcpu);
+ unpin_blocks(vcpu, vsie_page);
+out_unshadow:
+ unshadow_scb(vcpu, vsie_page);
+out_unpin_scb:
+ unpin_scb(vcpu, vsie_page, scb_addr);
+out_put:
+ put_vsie_page(vcpu->kvm, vsie_page);
+
+ return rc < 0 ? rc : 0;
+}
+
+/* Init the vsie data structures. To be called when a vm is initialized. */
+void kvm_s390_vsie_init(struct kvm *kvm)
+{
+ mutex_init(&kvm->arch.vsie.mutex);
+ INIT_RADIX_TREE(&kvm->arch.vsie.addr_to_page, GFP_KERNEL);
+}
+
+/* Destroy the vsie data structures. To be called when a vm is destroyed. */
+void kvm_s390_vsie_destroy(struct kvm *kvm)
+{
+ struct vsie_page *vsie_page;
+ struct page *page;
+ int i;
+
+ mutex_lock(&kvm->arch.vsie.mutex);
+ for (i = 0; i < kvm->arch.vsie.page_count; i++) {
+ page = kvm->arch.vsie.pages[i];
+ kvm->arch.vsie.pages[i] = NULL;
+ vsie_page = page_to_virt(page);
+ release_gmap_shadow(vsie_page);
+ /* free the radix tree entry */
+ radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9);
+ __free_page(page);
+ }
+ kvm->arch.vsie.page_count = 0;
+ mutex_unlock(&kvm->arch.vsie.mutex);
+}
+
+void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu)
+{
+ struct kvm_s390_sie_block *scb = READ_ONCE(vcpu->arch.vsie_block);
+
+ /*
+ * Even if the VCPU lets go of the shadow sie block reference, it is
+ * still valid in the cache. So we can safely kick it.
+ */
+ if (scb) {
+ atomic_or(PROG_BLOCK_SIE, &scb->prog20);
+ if (scb->prog0c & PROG_IN_SIE)
+ atomic_or(CPUSTAT_STOP_INT, &scb->cpuflags);
+ }
+}