Diffstat (limited to 'arch/s390/kvm')
-rw-r--r--  arch/s390/kvm/Kconfig        |    9
-rw-r--r--  arch/s390/kvm/Makefile       |    2
-rw-r--r--  arch/s390/kvm/diag.c         |   42
-rw-r--r--  arch/s390/kvm/gaccess.c      |  443
-rw-r--r--  arch/s390/kvm/gaccess.h      |   17
-rw-r--r--  arch/s390/kvm/gmap-vsie.c    |  141
-rw-r--r--  arch/s390/kvm/guestdbg.c     |    4
-rw-r--r--  arch/s390/kvm/intercept.c    |   98
-rw-r--r--  arch/s390/kvm/interrupt.c    |  240
-rw-r--r--  arch/s390/kvm/kvm-s390.c     | 1273
-rw-r--r--  arch/s390/kvm/kvm-s390.h     |  124
-rw-r--r--  arch/s390/kvm/pci.c          |   30
-rw-r--r--  arch/s390/kvm/pci.h          |    2
-rw-r--r--  arch/s390/kvm/priv.c         |   65
-rw-r--r--  arch/s390/kvm/pv.c           |  136
-rw-r--r--  arch/s390/kvm/sigp.c         |    4
-rw-r--r--  arch/s390/kvm/trace-s390.h   |   27
-rw-r--r--  arch/s390/kvm/vsie.c         |  301
18 files changed, 1897 insertions, 1061 deletions
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index 33f4ff909476..f4ec8c1ce214 100644
--- a/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@ -19,22 +19,17 @@ if VIRTUALIZATION
config KVM
def_tristate y
prompt "Kernel-based Virtual Machine (KVM) support"
- depends on HAVE_KVM
- select PREEMPT_NOTIFIERS
select HAVE_KVM_CPU_RELAX_INTERCEPT
- select HAVE_KVM_VCPU_ASYNC_IOCTL
- select HAVE_KVM_EVENTFD
select KVM_ASYNC_PF
select KVM_ASYNC_PF_SYNC
+ select KVM_COMMON
select HAVE_KVM_IRQCHIP
- select HAVE_KVM_IRQFD
select HAVE_KVM_IRQ_ROUTING
select HAVE_KVM_INVALID_WAKEUPS
select HAVE_KVM_NO_POLL
- select SRCU
select KVM_VFIO
- select INTERVAL_TREE
select MMU_NOTIFIER
+ select VIRT_XFER_TO_GUEST_WORK
help
Support hosting paravirtualized guest machines using the SIE
virtualization capability on the mainframe. This should work
diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile
index 02217fb4ae10..9a723c48b05a 100644
--- a/arch/s390/kvm/Makefile
+++ b/arch/s390/kvm/Makefile
@@ -8,7 +8,7 @@ include $(srctree)/virt/kvm/Makefile.kvm
ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
kvm-y += kvm-s390.o intercept.o interrupt.o priv.o sigp.o
-kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o
+kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o gmap-vsie.o
kvm-$(CONFIG_VFIO_PCI_ZDEV_KVM) += pci.o
obj-$(CONFIG_KVM) += kvm.o
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index 807fa9da1e72..53233dec8cad 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -11,12 +11,30 @@
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <asm/gmap.h>
+#include <asm/gmap_helpers.h>
#include <asm/virtio-ccw.h>
#include "kvm-s390.h"
#include "trace.h"
#include "trace-s390.h"
#include "gaccess.h"
+static void do_discard_gfn_range(struct kvm_vcpu *vcpu, gfn_t gfn_start, gfn_t gfn_end)
+{
+ struct kvm_memslot_iter iter;
+ struct kvm_memory_slot *slot;
+ struct kvm_memslots *slots;
+ unsigned long start, end;
+
+ slots = kvm_vcpu_memslots(vcpu);
+
+ kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) {
+ slot = iter.slot;
+ start = __gfn_to_hva_memslot(slot, max(gfn_start, slot->base_gfn));
+ end = __gfn_to_hva_memslot(slot, min(gfn_end, slot->base_gfn + slot->npages));
+ gmap_helper_discard(vcpu->kvm->mm, start, end);
+ }
+}
+
static int diag_release_pages(struct kvm_vcpu *vcpu)
{
unsigned long start, end;
@@ -32,12 +50,13 @@ static int diag_release_pages(struct kvm_vcpu *vcpu)
VCPU_EVENT(vcpu, 5, "diag release pages %lX %lX", start, end);
+ mmap_read_lock(vcpu->kvm->mm);
/*
* We checked for start >= end above, so lets check for the
* fast path (no prefix swap page involved)
*/
if (end <= prefix || start >= prefix + 2 * PAGE_SIZE) {
- gmap_discard(vcpu->arch.gmap, start, end);
+ do_discard_gfn_range(vcpu, gpa_to_gfn(start), gpa_to_gfn(end));
} else {
/*
* This is slow path. gmap_discard will check for start
@@ -45,13 +64,14 @@ static int diag_release_pages(struct kvm_vcpu *vcpu)
* prefix and let gmap_discard make some of these calls
* NOPs.
*/
- gmap_discard(vcpu->arch.gmap, start, prefix);
+ do_discard_gfn_range(vcpu, gpa_to_gfn(start), gpa_to_gfn(prefix));
if (start <= prefix)
- gmap_discard(vcpu->arch.gmap, 0, PAGE_SIZE);
+ do_discard_gfn_range(vcpu, 0, 1);
if (end > prefix + PAGE_SIZE)
- gmap_discard(vcpu->arch.gmap, PAGE_SIZE, 2 * PAGE_SIZE);
- gmap_discard(vcpu->arch.gmap, prefix + 2 * PAGE_SIZE, end);
+ do_discard_gfn_range(vcpu, 1, 2);
+ do_discard_gfn_range(vcpu, gpa_to_gfn(prefix) + 2, gpa_to_gfn(end));
}
+ mmap_read_unlock(vcpu->kvm->mm);
return 0;
}
@@ -77,7 +97,7 @@ static int __diag_page_ref_service(struct kvm_vcpu *vcpu)
vcpu->stat.instruction_diagnose_258++;
if (vcpu->run->s.regs.gprs[rx] & 7)
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
- rc = read_guest(vcpu, vcpu->run->s.regs.gprs[rx], rx, &parm, sizeof(parm));
+ rc = read_guest_real(vcpu, vcpu->run->s.regs.gprs[rx], &parm, sizeof(parm));
if (rc)
return kvm_s390_inject_prog_cond(vcpu, rc);
if (parm.parm_version != 2 || parm.parm_len < 5 || parm.code != 0x258)
@@ -102,7 +122,7 @@ static int __diag_page_ref_service(struct kvm_vcpu *vcpu)
parm.token_addr & 7 || parm.zarch != 0x8000000000000000ULL)
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
- if (kvm_is_error_gpa(vcpu->kvm, parm.token_addr))
+ if (!kvm_is_gpa_in_memslot(vcpu->kvm, parm.token_addr))
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
vcpu->arch.pfault_token = parm.token_addr;
@@ -166,6 +186,7 @@ static int diag9c_forwarding_overrun(void)
static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu)
{
struct kvm_vcpu *tcpu;
+ int tcpu_cpu;
int tid;
tid = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4];
@@ -181,14 +202,15 @@ static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu)
goto no_yield;
/* target guest VCPU already running */
- if (READ_ONCE(tcpu->cpu) >= 0) {
+ tcpu_cpu = READ_ONCE(tcpu->cpu);
+ if (tcpu_cpu >= 0) {
if (!diag9c_forwarding_hz || diag9c_forwarding_overrun())
goto no_yield;
/* target host CPU already running */
- if (!vcpu_is_preempted(tcpu->cpu))
+ if (!vcpu_is_preempted(tcpu_cpu))
goto no_yield;
- smp_yield_cpu(tcpu->cpu);
+ smp_yield_cpu(tcpu_cpu);
VCPU_EVENT(vcpu, 5,
"diag time slice end directed to %d: yield forwarded",
tid);
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 0243b6e38d36..41ca6b0ee7a9 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -11,169 +11,14 @@
#include <linux/err.h>
#include <linux/pgtable.h>
#include <linux/bitfield.h>
-
+#include <asm/access-regs.h>
+#include <asm/fault.h>
#include <asm/gmap.h>
+#include <asm/dat-bits.h>
#include "kvm-s390.h"
#include "gaccess.h"
-#include <asm/switch_to.h>
-
-union asce {
- unsigned long val;
- struct {
- unsigned long origin : 52; /* Region- or Segment-Table Origin */
- unsigned long : 2;
- unsigned long g : 1; /* Subspace Group Control */
- unsigned long p : 1; /* Private Space Control */
- unsigned long s : 1; /* Storage-Alteration-Event Control */
- unsigned long x : 1; /* Space-Switch-Event Control */
- unsigned long r : 1; /* Real-Space Control */
- unsigned long : 1;
- unsigned long dt : 2; /* Designation-Type Control */
- unsigned long tl : 2; /* Region- or Segment-Table Length */
- };
-};
-
-enum {
- ASCE_TYPE_SEGMENT = 0,
- ASCE_TYPE_REGION3 = 1,
- ASCE_TYPE_REGION2 = 2,
- ASCE_TYPE_REGION1 = 3
-};
-
-union region1_table_entry {
- unsigned long val;
- struct {
- unsigned long rto: 52;/* Region-Table Origin */
- unsigned long : 2;
- unsigned long p : 1; /* DAT-Protection Bit */
- unsigned long : 1;
- unsigned long tf : 2; /* Region-Second-Table Offset */
- unsigned long i : 1; /* Region-Invalid Bit */
- unsigned long : 1;
- unsigned long tt : 2; /* Table-Type Bits */
- unsigned long tl : 2; /* Region-Second-Table Length */
- };
-};
-
-union region2_table_entry {
- unsigned long val;
- struct {
- unsigned long rto: 52;/* Region-Table Origin */
- unsigned long : 2;
- unsigned long p : 1; /* DAT-Protection Bit */
- unsigned long : 1;
- unsigned long tf : 2; /* Region-Third-Table Offset */
- unsigned long i : 1; /* Region-Invalid Bit */
- unsigned long : 1;
- unsigned long tt : 2; /* Table-Type Bits */
- unsigned long tl : 2; /* Region-Third-Table Length */
- };
-};
-
-struct region3_table_entry_fc0 {
- unsigned long sto: 52;/* Segment-Table Origin */
- unsigned long : 1;
- unsigned long fc : 1; /* Format-Control */
- unsigned long p : 1; /* DAT-Protection Bit */
- unsigned long : 1;
- unsigned long tf : 2; /* Segment-Table Offset */
- unsigned long i : 1; /* Region-Invalid Bit */
- unsigned long cr : 1; /* Common-Region Bit */
- unsigned long tt : 2; /* Table-Type Bits */
- unsigned long tl : 2; /* Segment-Table Length */
-};
-
-struct region3_table_entry_fc1 {
- unsigned long rfaa : 33; /* Region-Frame Absolute Address */
- unsigned long : 14;
- unsigned long av : 1; /* ACCF-Validity Control */
- unsigned long acc: 4; /* Access-Control Bits */
- unsigned long f : 1; /* Fetch-Protection Bit */
- unsigned long fc : 1; /* Format-Control */
- unsigned long p : 1; /* DAT-Protection Bit */
- unsigned long iep: 1; /* Instruction-Execution-Protection */
- unsigned long : 2;
- unsigned long i : 1; /* Region-Invalid Bit */
- unsigned long cr : 1; /* Common-Region Bit */
- unsigned long tt : 2; /* Table-Type Bits */
- unsigned long : 2;
-};
-
-union region3_table_entry {
- unsigned long val;
- struct region3_table_entry_fc0 fc0;
- struct region3_table_entry_fc1 fc1;
- struct {
- unsigned long : 53;
- unsigned long fc : 1; /* Format-Control */
- unsigned long : 4;
- unsigned long i : 1; /* Region-Invalid Bit */
- unsigned long cr : 1; /* Common-Region Bit */
- unsigned long tt : 2; /* Table-Type Bits */
- unsigned long : 2;
- };
-};
-
-struct segment_entry_fc0 {
- unsigned long pto: 53;/* Page-Table Origin */
- unsigned long fc : 1; /* Format-Control */
- unsigned long p : 1; /* DAT-Protection Bit */
- unsigned long : 3;
- unsigned long i : 1; /* Segment-Invalid Bit */
- unsigned long cs : 1; /* Common-Segment Bit */
- unsigned long tt : 2; /* Table-Type Bits */
- unsigned long : 2;
-};
-struct segment_entry_fc1 {
- unsigned long sfaa : 44; /* Segment-Frame Absolute Address */
- unsigned long : 3;
- unsigned long av : 1; /* ACCF-Validity Control */
- unsigned long acc: 4; /* Access-Control Bits */
- unsigned long f : 1; /* Fetch-Protection Bit */
- unsigned long fc : 1; /* Format-Control */
- unsigned long p : 1; /* DAT-Protection Bit */
- unsigned long iep: 1; /* Instruction-Execution-Protection */
- unsigned long : 2;
- unsigned long i : 1; /* Segment-Invalid Bit */
- unsigned long cs : 1; /* Common-Segment Bit */
- unsigned long tt : 2; /* Table-Type Bits */
- unsigned long : 2;
-};
-
-union segment_table_entry {
- unsigned long val;
- struct segment_entry_fc0 fc0;
- struct segment_entry_fc1 fc1;
- struct {
- unsigned long : 53;
- unsigned long fc : 1; /* Format-Control */
- unsigned long : 4;
- unsigned long i : 1; /* Segment-Invalid Bit */
- unsigned long cs : 1; /* Common-Segment Bit */
- unsigned long tt : 2; /* Table-Type Bits */
- unsigned long : 2;
- };
-};
-
-enum {
- TABLE_TYPE_SEGMENT = 0,
- TABLE_TYPE_REGION3 = 1,
- TABLE_TYPE_REGION2 = 2,
- TABLE_TYPE_REGION1 = 3
-};
-
-union page_table_entry {
- unsigned long val;
- struct {
- unsigned long pfra : 52; /* Page-Frame Real Address */
- unsigned long z : 1; /* Zero Bit */
- unsigned long i : 1; /* Page-Invalid Bit */
- unsigned long p : 1; /* DAT-Protection Bit */
- unsigned long iep: 1; /* Instruction-Execution-Protection */
- unsigned long : 8;
- };
-};
+#define GMAP_SHADOW_FAKE_TABLE 1ULL
/*
* vaddress union in order to easily decode a virtual address into its
@@ -264,14 +109,9 @@ struct aste {
int ipte_lock_held(struct kvm *kvm)
{
- if (sclp.has_siif) {
- int rc;
+ if (sclp.has_siif)
+ return kvm->arch.sca->ipte_control.kh != 0;
- read_lock(&kvm->arch.sca_lock);
- rc = kvm_s390_get_ipte_control(kvm)->kh != 0;
- read_unlock(&kvm->arch.sca_lock);
- return rc;
- }
return kvm->arch.ipte_lock_count != 0;
}
@@ -284,19 +124,16 @@ static void ipte_lock_simple(struct kvm *kvm)
if (kvm->arch.ipte_lock_count > 1)
goto out;
retry:
- read_lock(&kvm->arch.sca_lock);
- ic = kvm_s390_get_ipte_control(kvm);
+ ic = &kvm->arch.sca->ipte_control;
+ old = READ_ONCE(*ic);
do {
- old = READ_ONCE(*ic);
if (old.k) {
- read_unlock(&kvm->arch.sca_lock);
cond_resched();
goto retry;
}
new = old;
new.k = 1;
- } while (cmpxchg(&ic->val, old.val, new.val) != old.val);
- read_unlock(&kvm->arch.sca_lock);
+ } while (!try_cmpxchg(&ic->val, &old.val, new.val));
out:
mutex_unlock(&kvm->arch.ipte_mutex);
}
@@ -309,14 +146,12 @@ static void ipte_unlock_simple(struct kvm *kvm)
kvm->arch.ipte_lock_count--;
if (kvm->arch.ipte_lock_count)
goto out;
- read_lock(&kvm->arch.sca_lock);
- ic = kvm_s390_get_ipte_control(kvm);
+ ic = &kvm->arch.sca->ipte_control;
+ old = READ_ONCE(*ic);
do {
- old = READ_ONCE(*ic);
new = old;
new.k = 0;
- } while (cmpxchg(&ic->val, old.val, new.val) != old.val);
- read_unlock(&kvm->arch.sca_lock);
+ } while (!try_cmpxchg(&ic->val, &old.val, new.val));
wake_up(&kvm->arch.ipte_wq);
out:
mutex_unlock(&kvm->arch.ipte_mutex);
@@ -327,36 +162,31 @@ static void ipte_lock_siif(struct kvm *kvm)
union ipte_control old, new, *ic;
retry:
- read_lock(&kvm->arch.sca_lock);
- ic = kvm_s390_get_ipte_control(kvm);
+ ic = &kvm->arch.sca->ipte_control;
+ old = READ_ONCE(*ic);
do {
- old = READ_ONCE(*ic);
if (old.kg) {
- read_unlock(&kvm->arch.sca_lock);
cond_resched();
goto retry;
}
new = old;
new.k = 1;
new.kh++;
- } while (cmpxchg(&ic->val, old.val, new.val) != old.val);
- read_unlock(&kvm->arch.sca_lock);
+ } while (!try_cmpxchg(&ic->val, &old.val, new.val));
}
static void ipte_unlock_siif(struct kvm *kvm)
{
union ipte_control old, new, *ic;
- read_lock(&kvm->arch.sca_lock);
- ic = kvm_s390_get_ipte_control(kvm);
+ ic = &kvm->arch.sca->ipte_control;
+ old = READ_ONCE(*ic);
do {
- old = READ_ONCE(*ic);
new = old;
new.kh--;
if (!new.kh)
new.k = 0;
- } while (cmpxchg(&ic->val, old.val, new.val) != old.val);
- read_unlock(&kvm->arch.sca_lock);
+ } while (!try_cmpxchg(&ic->val, &old.val, new.val));
if (!new.kh)
wake_up(&kvm->arch.ipte_wq);
}
@@ -391,7 +221,8 @@ static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, u8 ar,
if (ar >= NUM_ACRS)
return -EINVAL;
- save_access_regs(vcpu->run->s.regs.acrs);
+ if (vcpu->arch.acrs_loaded)
+ save_access_regs(vcpu->run->s.regs.acrs);
alet.val = vcpu->run->s.regs.acrs[ar];
if (ar == 0 || alet.val == 0) {
@@ -466,23 +297,6 @@ static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, u8 ar,
return 0;
}
-struct trans_exc_code_bits {
- unsigned long addr : 52; /* Translation-exception Address */
- unsigned long fsi : 2; /* Access Exception Fetch/Store Indication */
- unsigned long : 2;
- unsigned long b56 : 1;
- unsigned long : 3;
- unsigned long b60 : 1;
- unsigned long b61 : 1;
- unsigned long as : 2; /* ASCE Identifier */
-};
-
-enum {
- FSI_UNKNOWN = 0, /* Unknown wether fetch or store */
- FSI_STORE = 1, /* Exception was due to store operation */
- FSI_FETCH = 2 /* Exception was due to fetch operation */
-};
-
enum prot_type {
PROT_TYPE_LA = 0,
PROT_TYPE_KEYC = 1,
@@ -490,46 +304,46 @@ enum prot_type {
PROT_TYPE_DAT = 3,
PROT_TYPE_IEP = 4,
/* Dummy value for passing an initialized value when code != PGM_PROTECTION */
- PROT_NONE,
+ PROT_TYPE_DUMMY,
};
static int trans_exc_ending(struct kvm_vcpu *vcpu, int code, unsigned long gva, u8 ar,
enum gacc_mode mode, enum prot_type prot, bool terminate)
{
struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
- struct trans_exc_code_bits *tec;
+ union teid *teid;
memset(pgm, 0, sizeof(*pgm));
pgm->code = code;
- tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
+ teid = (union teid *)&pgm->trans_exc_code;
switch (code) {
case PGM_PROTECTION:
switch (prot) {
- case PROT_NONE:
+ case PROT_TYPE_DUMMY:
/* We should never get here, acts like termination */
WARN_ON_ONCE(1);
break;
case PROT_TYPE_IEP:
- tec->b61 = 1;
+ teid->b61 = 1;
fallthrough;
case PROT_TYPE_LA:
- tec->b56 = 1;
+ teid->b56 = 1;
break;
case PROT_TYPE_KEYC:
- tec->b60 = 1;
+ teid->b60 = 1;
break;
case PROT_TYPE_ALC:
- tec->b60 = 1;
+ teid->b60 = 1;
fallthrough;
case PROT_TYPE_DAT:
- tec->b61 = 1;
+ teid->b61 = 1;
break;
}
if (terminate) {
- tec->b56 = 0;
- tec->b60 = 0;
- tec->b61 = 0;
+ teid->b56 = 0;
+ teid->b60 = 0;
+ teid->b61 = 0;
}
fallthrough;
case PGM_ASCE_TYPE:
@@ -543,9 +357,9 @@ static int trans_exc_ending(struct kvm_vcpu *vcpu, int code, unsigned long gva,
* exc_access_id has to be set to 0 for some instructions. Both
* cases have to be handled by the caller.
*/
- tec->addr = gva >> PAGE_SHIFT;
- tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
- tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as;
+ teid->addr = gva >> PAGE_SHIFT;
+ teid->fsi = mode == GACC_STORE ? TEID_FSI_STORE : TEID_FSI_FETCH;
+ teid->as = psw_bits(vcpu->arch.sie_block->gpsw).as;
fallthrough;
case PGM_ALEN_TRANSLATION:
case PGM_ALE_SEQUENCE:
@@ -625,7 +439,7 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val)
* Returns: - zero on success; @gpa contains the resulting absolute address
* - a negative value if guest access failed due to e.g. broken
* guest mapping
- * - a positve value if an access exception happened. In this case
+ * - a positive value if an access exception happened. In this case
* the returned value is the program interruption code as defined
* by the architecture
*/
@@ -648,7 +462,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
iep = ctlreg0.iep && test_kvm_facility(vcpu->kvm, 130);
if (asce.r)
goto real_address;
- ptr = asce.origin * PAGE_SIZE;
+ ptr = asce.rsto * PAGE_SIZE;
switch (asce.dt) {
case ASCE_TYPE_REGION1:
if (vaddr.rfx01 > asce.tl)
@@ -681,7 +495,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
case ASCE_TYPE_REGION1: {
union region1_table_entry rfte;
- if (kvm_is_error_gpa(vcpu->kvm, ptr))
+ if (!kvm_is_gpa_in_memslot(vcpu->kvm, ptr))
return PGM_ADDRESSING;
if (deref_table(vcpu->kvm, ptr, &rfte.val))
return -EFAULT;
@@ -699,7 +513,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
case ASCE_TYPE_REGION2: {
union region2_table_entry rste;
- if (kvm_is_error_gpa(vcpu->kvm, ptr))
+ if (!kvm_is_gpa_in_memslot(vcpu->kvm, ptr))
return PGM_ADDRESSING;
if (deref_table(vcpu->kvm, ptr, &rste.val))
return -EFAULT;
@@ -717,7 +531,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
case ASCE_TYPE_REGION3: {
union region3_table_entry rtte;
- if (kvm_is_error_gpa(vcpu->kvm, ptr))
+ if (!kvm_is_gpa_in_memslot(vcpu->kvm, ptr))
return PGM_ADDRESSING;
if (deref_table(vcpu->kvm, ptr, &rtte.val))
return -EFAULT;
@@ -745,7 +559,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
case ASCE_TYPE_SEGMENT: {
union segment_table_entry ste;
- if (kvm_is_error_gpa(vcpu->kvm, ptr))
+ if (!kvm_is_gpa_in_memslot(vcpu->kvm, ptr))
return PGM_ADDRESSING;
if (deref_table(vcpu->kvm, ptr, &ste.val))
return -EFAULT;
@@ -765,7 +579,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
ptr = ste.fc0.pto * (PAGE_SIZE / 2) + vaddr.px * 8;
}
}
- if (kvm_is_error_gpa(vcpu->kvm, ptr))
+ if (!kvm_is_gpa_in_memslot(vcpu->kvm, ptr))
return PGM_ADDRESSING;
if (deref_table(vcpu->kvm, ptr, &pte.val))
return -EFAULT;
@@ -787,7 +601,7 @@ absolute_address:
*prot = PROT_TYPE_IEP;
return PGM_PROTECTION;
}
- if (kvm_is_error_gpa(vcpu->kvm, raddr.addr))
+ if (!kvm_is_gpa_in_memslot(vcpu->kvm, raddr.addr))
return PGM_ADDRESSING;
*gpa = raddr.addr;
return 0;
@@ -974,9 +788,9 @@ static int guest_range_to_gpas(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
return rc;
} else {
gpa = kvm_s390_real_to_abs(vcpu, ga);
- if (kvm_is_error_gpa(vcpu->kvm, gpa)) {
+ if (!kvm_is_gpa_in_memslot(vcpu->kvm, gpa)) {
rc = PGM_ADDRESSING;
- prot = PROT_NONE;
+ prot = PROT_TYPE_DUMMY;
}
}
if (rc)
@@ -1001,6 +815,8 @@ static int access_guest_page(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa,
const gfn_t gfn = gpa_to_gfn(gpa);
int rc;
+ if (!gfn_to_memslot(kvm, gfn))
+ return PGM_ADDRESSING;
if (mode == GACC_STORE)
rc = kvm_write_guest_page(kvm, gfn, data, offset, len);
else
@@ -1132,7 +948,7 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
if (rc == PGM_PROTECTION)
prot = PROT_TYPE_KEYC;
else
- prot = PROT_NONE;
+ prot = PROT_TYPE_DUMMY;
rc = trans_exc_ending(vcpu, rc, ga, ar, mode, prot, terminate);
}
out_unlock:
@@ -1158,10 +974,121 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
gra += fragment_len;
data += fragment_len;
}
+ if (rc > 0)
+ vcpu->arch.pgm.code = rc;
return rc;
}
/**
+ * cmpxchg_guest_abs_with_key() - Perform cmpxchg on guest absolute address.
+ * @kvm: Virtual machine instance.
+ * @gpa: Absolute guest address of the location to be changed.
+ * @len: Operand length of the cmpxchg, required: 1 <= len <= 16. Providing a
+ * non power of two will result in failure.
+ * @old_addr: Pointer to old value. If the location at @gpa contains this value,
+ * the exchange will succeed. After calling cmpxchg_guest_abs_with_key()
+ * *@old_addr contains the value at @gpa before the attempt to
+ * exchange the value.
+ * @new: The value to place at @gpa.
+ * @access_key: The access key to use for the guest access.
+ * @success: output value indicating if an exchange occurred.
+ *
+ * Atomically exchange the value at @gpa with @new, if it contains *@old_addr.
+ * Honors storage keys.
+ *
+ * Return: * 0: successful exchange
+ * * >0: a program interruption code indicating the reason cmpxchg could
+ * not be attempted
+ * * -EINVAL: address misaligned or len not power of two
+ * * -EAGAIN: transient failure (len 1 or 2)
+ * * -EOPNOTSUPP: read-only memslot (should never occur)
+ */
+int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len,
+ __uint128_t *old_addr, __uint128_t new,
+ u8 access_key, bool *success)
+{
+ gfn_t gfn = gpa_to_gfn(gpa);
+ struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+ bool writable;
+ hva_t hva;
+ int ret;
+
+ if (!IS_ALIGNED(gpa, len))
+ return -EINVAL;
+
+ hva = gfn_to_hva_memslot_prot(slot, gfn, &writable);
+ if (kvm_is_error_hva(hva))
+ return PGM_ADDRESSING;
+ /*
+ * Check if it's a read-only memslot, even though that cannot occur
+ * since those are unsupported.
+ * Don't try to actually handle that case.
+ */
+ if (!writable)
+ return -EOPNOTSUPP;
+
+ hva += offset_in_page(gpa);
+ /*
+ * The cmpxchg_user_key macro depends on the type of "old", so we need
+ * a case for each valid length and get some code duplication as long
+ * as we don't introduce a new macro.
+ */
+ switch (len) {
+ case 1: {
+ u8 old;
+
+ ret = cmpxchg_user_key((u8 __user *)hva, &old, *old_addr, new, access_key);
+ *success = !ret && old == *old_addr;
+ *old_addr = old;
+ break;
+ }
+ case 2: {
+ u16 old;
+
+ ret = cmpxchg_user_key((u16 __user *)hva, &old, *old_addr, new, access_key);
+ *success = !ret && old == *old_addr;
+ *old_addr = old;
+ break;
+ }
+ case 4: {
+ u32 old;
+
+ ret = cmpxchg_user_key((u32 __user *)hva, &old, *old_addr, new, access_key);
+ *success = !ret && old == *old_addr;
+ *old_addr = old;
+ break;
+ }
+ case 8: {
+ u64 old;
+
+ ret = cmpxchg_user_key((u64 __user *)hva, &old, *old_addr, new, access_key);
+ *success = !ret && old == *old_addr;
+ *old_addr = old;
+ break;
+ }
+ case 16: {
+ __uint128_t old;
+
+ ret = cmpxchg_user_key((__uint128_t __user *)hva, &old, *old_addr, new, access_key);
+ *success = !ret && old == *old_addr;
+ *old_addr = old;
+ break;
+ }
+ default:
+ return -EINVAL;
+ }
+ if (*success)
+ mark_page_dirty_in_slot(kvm, slot, gfn);
+ /*
+ * Assume that the fault is caused by protection, either key protection
+ * or user page write protection.
+ */
+ if (ret == -EFAULT)
+ ret = PGM_PROTECTION;
+ return ret;
+}
+
+/**
* guest_translate_address_with_key - translate guest logical into guest absolute address
* @vcpu: virtual cpu
* @gva: Guest virtual address
@@ -1273,6 +1200,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
unsigned long *pgt, int *dat_protection,
int *fake)
{
+ struct kvm *kvm;
struct gmap *parent;
union asce asce;
union vaddress vaddr;
@@ -1281,10 +1209,11 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
*fake = 0;
*dat_protection = 0;
+ kvm = sg->private;
parent = sg->parent;
vaddr.addr = saddr;
asce.val = sg->orig_asce;
- ptr = asce.origin * PAGE_SIZE;
+ ptr = asce.rsto * PAGE_SIZE;
if (asce.r) {
*fake = 1;
ptr = 0;
@@ -1341,6 +1270,7 @@ shadow_r2t:
rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake);
if (rc)
return rc;
+ kvm->stat.gmap_shadow_r1_entry++;
}
fallthrough;
case ASCE_TYPE_REGION2: {
@@ -1369,6 +1299,7 @@ shadow_r3t:
rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake);
if (rc)
return rc;
+ kvm->stat.gmap_shadow_r2_entry++;
}
fallthrough;
case ASCE_TYPE_REGION3: {
@@ -1406,6 +1337,7 @@ shadow_sgt:
rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake);
if (rc)
return rc;
+ kvm->stat.gmap_shadow_r3_entry++;
}
fallthrough;
case ASCE_TYPE_SEGMENT: {
@@ -1439,6 +1371,7 @@ shadow_pgt:
rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake);
if (rc)
return rc;
+ kvm->stat.gmap_shadow_sg_entry++;
}
}
/* Return the parent address of the page table */
@@ -1447,6 +1380,44 @@ shadow_pgt:
}
/**
+ * shadow_pgt_lookup() - find a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: the address in the shadow guest address space
+ * @pgt: parent gmap address of the page table to get shadowed
+ * @dat_protection: if the pgtable is marked as protected by dat
+ * @fake: pgt references contiguous guest memory block, not a pgtable
+ *
+ * Returns 0 if the shadow page table was found and -EAGAIN if the page
+ * table was not found.
+ *
+ * Called with sg->mm->mmap_lock in read.
+ */
+static int shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, unsigned long *pgt,
+ int *dat_protection, int *fake)
+{
+ unsigned long pt_index;
+ unsigned long *table;
+ struct page *page;
+ int rc;
+
+ spin_lock(&sg->guest_table_lock);
+ table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
+ if (table && !(*table & _SEGMENT_ENTRY_INVALID)) {
+ /* Shadow page tables are full pages (pte+pgste) */
+ page = pfn_to_page(*table >> PAGE_SHIFT);
+ pt_index = gmap_pgste_get_pgt_addr(page_to_virt(page));
+ *pgt = pt_index & ~GMAP_SHADOW_FAKE_TABLE;
+ *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT);
+ *fake = !!(pt_index & GMAP_SHADOW_FAKE_TABLE);
+ rc = 0;
+ } else {
+ rc = -EAGAIN;
+ }
+ spin_unlock(&sg->guest_table_lock);
+ return rc;
+}
+
+/**
* kvm_s390_shadow_fault - handle fault on a shadow page table
* @vcpu: virtual cpu
* @sg: pointer to the shadow guest address space structure
@@ -1469,6 +1440,9 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
int dat_protection, fake;
int rc;
+ if (KVM_BUG_ON(!gmap_is_shadow(sg), vcpu->kvm))
+ return -EFAULT;
+
mmap_read_lock(sg->mm);
/*
* We don't want any guest-2 tables to change - so the parent
@@ -1477,7 +1451,7 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
*/
ipte_lock(vcpu->kvm);
- rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake);
+ rc = shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake);
if (rc)
rc = kvm_s390_shadow_tables(sg, saddr, &pgt, &dat_protection,
&fake);
@@ -1509,6 +1483,7 @@ shadow_page:
pte.p |= dat_protection;
if (!rc)
rc = gmap_shadow_page(sg, saddr, __pte(pte.val));
+ vcpu->kvm->stat.gmap_shadow_pg_entry++;
ipte_unlock(vcpu->kvm);
mmap_read_unlock(sg->mm);
return rc;
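
The return convention documented for cmpxchg_guest_abs_with_key() above is consumed roughly as in the following sketch. The demo_guest_cmpxchg_u64() wrapper is illustrative only and not part of this patch; it assumes the usual <linux/kvm_host.h> and "gaccess.h" includes.

	/* Illustrative caller only: exercises the documented return values. */
	static int demo_guest_cmpxchg_u64(struct kvm *kvm, gpa_t gpa, u64 expected,
					  u64 new, u8 access_key, u64 *actual)
	{
		__uint128_t old = expected;
		bool success;
		int rc;

		rc = cmpxchg_guest_abs_with_key(kvm, gpa, sizeof(u64), &old, new,
						access_key, &success);
		if (rc > 0)
			return rc;	/* PGM_* code, e.g. PGM_PROTECTION */
		if (rc < 0)
			return rc;	/* -EINVAL, -EAGAIN or -EOPNOTSUPP */
		*actual = old;		/* value found at gpa before the attempt */
		return success ? 0 : 1;	/* 1: compare failed, caller may retry */
	}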
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
index 9408d6cc8e2c..3fde45a151f2 100644
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -206,6 +206,9 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
void *data, unsigned long len, enum gacc_mode mode);
+int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, __uint128_t *old,
+ __uint128_t new, u8 access_key, bool *success);
+
/**
* write_guest_with_key - copy data from kernel space to guest space
* @vcpu: virtual cpu
@@ -402,11 +405,12 @@ int read_guest_abs(struct kvm_vcpu *vcpu, unsigned long gpa, void *data,
* @len: number of bytes to copy
*
* Copy @len bytes from @data (kernel space) to @gra (guest real address).
- * It is up to the caller to ensure that the entire guest memory range is
- * valid memory before calling this function.
* Guest low address and key protection are not checked.
*
- * Returns zero on success or -EFAULT on error.
+ * Returns zero on success, -EFAULT when copying from @data failed, or
+ * PGM_ADDRESSING in case @gra is outside a memslot. In this case, pgm check info
+ * is also stored to allow injecting into the guest (if applicable) using
+ * kvm_s390_inject_prog_cond().
*
* If an error occurs data may have been copied partially to guest memory.
*/
@@ -425,11 +429,12 @@ int write_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data,
* @len: number of bytes to copy
*
* Copy @len bytes from @gra (guest real address) to @data (kernel space).
- * It is up to the caller to ensure that the entire guest memory range is
- * valid memory before calling this function.
* Guest key protection is not checked.
*
- * Returns zero on success or -EFAULT on error.
+ * Returns zero on success, -EFAULT when copying to @data failed, or
+ * PGM_ADDRESSING in case @gra is outside a memslot. In this case, pgm check info
+ * is also stored to allow injecting into the guest (if applicable) using
+ * kvm_s390_inject_prog_cond().
*
* If an error occurs data may have been copied partially to kernel space.
*/
diff --git a/arch/s390/kvm/gmap-vsie.c b/arch/s390/kvm/gmap-vsie.c
new file mode 100644
index 000000000000..56ef153eb8fe
--- /dev/null
+++ b/arch/s390/kvm/gmap-vsie.c
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Guest memory management for KVM/s390 nested VMs.
+ *
+ * Copyright IBM Corp. 2008, 2020, 2024
+ *
+ * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com>
+ * Martin Schwidefsky <schwidefsky@de.ibm.com>
+ * David Hildenbrand <david@redhat.com>
+ * Janosch Frank <frankja@linux.vnet.ibm.com>
+ */
+
+#include <linux/compiler.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/pgtable.h>
+#include <linux/pagemap.h>
+#include <linux/mman.h>
+
+#include <asm/lowcore.h>
+#include <asm/gmap.h>
+#include <asm/uv.h>
+
+#include "kvm-s390.h"
+
+/**
+ * gmap_find_shadow - find a specific asce in the list of shadow tables
+ * @parent: pointer to the parent gmap
+ * @asce: ASCE for which the shadow table is created
+ * @edat_level: edat level to be used for the shadow translation
+ *
+ * Returns the pointer to a gmap if a shadow table with the given asce is
+ * already available, ERR_PTR(-EAGAIN) if another one is just being created,
+ * otherwise NULL
+ *
+ * Context: Called with parent->shadow_lock held
+ */
+static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce, int edat_level)
+{
+ struct gmap *sg;
+
+ lockdep_assert_held(&parent->shadow_lock);
+ list_for_each_entry(sg, &parent->children, list) {
+ if (!gmap_shadow_valid(sg, asce, edat_level))
+ continue;
+ if (!sg->initialized)
+ return ERR_PTR(-EAGAIN);
+ refcount_inc(&sg->ref_count);
+ return sg;
+ }
+ return NULL;
+}
+
+/**
+ * gmap_shadow - create/find a shadow guest address space
+ * @parent: pointer to the parent gmap
+ * @asce: ASCE for which the shadow table is created
+ * @edat_level: edat level to be used for the shadow translation
+ *
+ * The pages of the top level page table referred by the asce parameter
+ * will be set to read-only and marked in the PGSTEs of the kvm process.
+ * The shadow table will be removed automatically on any change to the
+ * PTE mapping for the source table.
+ *
+ * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory,
+ * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the
+ * parent gmap table could not be protected.
+ */
+struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level)
+{
+ struct gmap *sg, *new;
+ unsigned long limit;
+ int rc;
+
+ if (KVM_BUG_ON(parent->mm->context.allow_gmap_hpage_1m, (struct kvm *)parent->private) ||
+ KVM_BUG_ON(gmap_is_shadow(parent), (struct kvm *)parent->private))
+ return ERR_PTR(-EFAULT);
+ spin_lock(&parent->shadow_lock);
+ sg = gmap_find_shadow(parent, asce, edat_level);
+ spin_unlock(&parent->shadow_lock);
+ if (sg)
+ return sg;
+ /* Create a new shadow gmap */
+ limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11));
+ if (asce & _ASCE_REAL_SPACE)
+ limit = -1UL;
+ new = gmap_alloc(limit);
+ if (!new)
+ return ERR_PTR(-ENOMEM);
+ new->mm = parent->mm;
+ new->parent = gmap_get(parent);
+ new->private = parent->private;
+ new->orig_asce = asce;
+ new->edat_level = edat_level;
+ new->initialized = false;
+ spin_lock(&parent->shadow_lock);
+ /* Recheck if another CPU created the same shadow */
+ sg = gmap_find_shadow(parent, asce, edat_level);
+ if (sg) {
+ spin_unlock(&parent->shadow_lock);
+ gmap_free(new);
+ return sg;
+ }
+ if (asce & _ASCE_REAL_SPACE) {
+ /* only allow one real-space gmap shadow */
+ list_for_each_entry(sg, &parent->children, list) {
+ if (sg->orig_asce & _ASCE_REAL_SPACE) {
+ spin_lock(&sg->guest_table_lock);
+ gmap_unshadow(sg);
+ spin_unlock(&sg->guest_table_lock);
+ list_del(&sg->list);
+ gmap_put(sg);
+ break;
+ }
+ }
+ }
+ refcount_set(&new->ref_count, 2);
+ list_add(&new->list, &parent->children);
+ if (asce & _ASCE_REAL_SPACE) {
+ /* nothing to protect, return right away */
+ new->initialized = true;
+ spin_unlock(&parent->shadow_lock);
+ return new;
+ }
+ spin_unlock(&parent->shadow_lock);
+ /* protect after insertion, so it will get properly invalidated */
+ mmap_read_lock(parent->mm);
+ rc = __kvm_s390_mprotect_many(parent, asce & _ASCE_ORIGIN,
+ ((asce & _ASCE_TABLE_LENGTH) + 1),
+ PROT_READ, GMAP_NOTIFY_SHADOW);
+ mmap_read_unlock(parent->mm);
+ spin_lock(&parent->shadow_lock);
+ new->initialized = true;
+ if (rc) {
+ list_del(&new->list);
+ gmap_free(new);
+ new = ERR_PTR(rc);
+ }
+ spin_unlock(&parent->shadow_lock);
+ return new;
+}
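
A caller acquires a shadow gmap along the lines of the sketch below; the vsie code adds caching, locking and the new create/reuse statistics, which are omitted here. demo_acquire_shadow() is illustrative only.

	/* Illustrative only: acquiring a shadow gmap per the semantics above. */
	static struct gmap *demo_acquire_shadow(struct kvm_vcpu *vcpu,
						unsigned long asce, int edat_level)
	{
		struct gmap *sg;

		sg = gmap_shadow(vcpu->arch.gmap, asce, edat_level);
		if (IS_ERR(sg))
			return sg;	/* -ENOMEM, -EFAULT, or -EAGAIN (retry) */
		/*
		 * gmap_shadow() returns with a reference held for the caller
		 * (see the refcount_set(&new->ref_count, 2) above); release
		 * it with gmap_put() when done.
		 */
		return sg;
	}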
diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c
index 3765c4223bf9..80879fc73c90 100644
--- a/arch/s390/kvm/guestdbg.c
+++ b/arch/s390/kvm/guestdbg.c
@@ -213,8 +213,8 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu,
else if (dbg->arch.nr_hw_bp > MAX_BP_COUNT)
return -EINVAL;
- bp_data = memdup_user(dbg->arch.hw_bp,
- sizeof(*bp_data) * dbg->arch.nr_hw_bp);
+ bp_data = memdup_array_user(dbg->arch.hw_bp, dbg->arch.nr_hw_bp,
+ sizeof(*bp_data));
if (IS_ERR(bp_data))
return PTR_ERR(bp_data);
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 0ee02dae14b2..420ae62977e2 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -94,7 +94,7 @@ static int handle_validity(struct kvm_vcpu *vcpu)
vcpu->stat.exit_validity++;
trace_kvm_s390_intercept_validity(vcpu, viwhy);
- KVM_EVENT(3, "validity intercept 0x%x for pid %u (kvm 0x%pK)", viwhy,
+ KVM_EVENT(3, "validity intercept 0x%x for pid %u (kvm 0x%p)", viwhy,
current->pid, vcpu->kvm);
/* do not warn on invalid runtime instrumentation mode */
@@ -228,6 +228,21 @@ static int handle_itdb(struct kvm_vcpu *vcpu)
#define per_event(vcpu) (vcpu->arch.sie_block->iprcc & PGM_PER)
+static bool should_handle_per_event(const struct kvm_vcpu *vcpu)
+{
+ if (!guestdbg_enabled(vcpu) || !per_event(vcpu))
+ return false;
+ if (guestdbg_sstep_enabled(vcpu) &&
+ vcpu->arch.sie_block->iprcc != PGM_PER) {
+ /*
+ * __vcpu_run() will exit after delivering the concurrently
+ * indicated condition.
+ */
+ return false;
+ }
+ return true;
+}
+
static int handle_prog(struct kvm_vcpu *vcpu)
{
psw_t psw;
@@ -242,7 +257,7 @@ static int handle_prog(struct kvm_vcpu *vcpu)
if (kvm_s390_pv_cpu_is_protected(vcpu))
return -EOPNOTSUPP;
- if (guestdbg_enabled(vcpu) && per_event(vcpu)) {
+ if (should_handle_per_event(vcpu)) {
rc = kvm_s390_handle_per_event(vcpu);
if (rc)
return rc;
@@ -271,10 +286,18 @@ static int handle_prog(struct kvm_vcpu *vcpu)
* handle_external_interrupt - used for external interruption interceptions
* @vcpu: virtual cpu
*
- * This interception only occurs if the CPUSTAT_EXT_INT bit was set, or if
- * the new PSW does not have external interrupts disabled. In the first case,
- * we've got to deliver the interrupt manually, and in the second case, we
- * drop to userspace to handle the situation there.
+ * This interception occurs if:
+ * - the CPUSTAT_EXT_INT bit was already set when the external interrupt
+ * occurred. In this case, the interrupt needs to be injected manually to
+ * preserve interrupt priority.
+ * - the external new PSW has external interrupts enabled, which will cause an
+ * interruption loop. We drop to userspace in this case.
+ *
+ * The latter case can be detected by inspecting the external mask bit in the
+ * external new psw.
+ *
+ * Under PV, only the latter case can occur, since interrupt priorities are
+ * handled in the ultravisor.
*/
static int handle_external_interrupt(struct kvm_vcpu *vcpu)
{
@@ -285,10 +308,18 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu)
vcpu->stat.exit_external_interrupt++;
- rc = read_guest_lc(vcpu, __LC_EXT_NEW_PSW, &newpsw, sizeof(psw_t));
- if (rc)
- return rc;
- /* We can not handle clock comparator or timer interrupt with bad PSW */
+ if (kvm_s390_pv_cpu_is_protected(vcpu)) {
+ newpsw = vcpu->arch.sie_block->gpsw;
+ } else {
+ rc = read_guest_lc(vcpu, __LC_EXT_NEW_PSW, &newpsw, sizeof(psw_t));
+ if (rc)
+ return rc;
+ }
+
+ /*
+ * Clock comparator or timer interrupt with external interrupt enabled
+ * will cause interrupt loop. Drop to userspace.
+ */
if ((eic == EXT_IRQ_CLK_COMP || eic == EXT_IRQ_CPU_TIMER) &&
(newpsw.mask & PSW_MASK_EXT))
return -EOPNOTSUPP;
@@ -336,7 +367,7 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu)
reg2, &srcaddr, GACC_FETCH, 0);
if (rc)
return kvm_s390_inject_prog_cond(vcpu, rc);
- rc = kvm_arch_fault_in_page(vcpu, srcaddr, 0);
+ rc = kvm_s390_handle_dat_fault(vcpu, srcaddr, 0);
if (rc != 0)
return rc;
@@ -345,7 +376,7 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu)
reg1, &dstaddr, GACC_STORE, 0);
if (rc)
return kvm_s390_inject_prog_cond(vcpu, rc);
- rc = kvm_arch_fault_in_page(vcpu, dstaddr, 1);
+ rc = kvm_s390_handle_dat_fault(vcpu, dstaddr, FOLL_WRITE);
if (rc != 0)
return rc;
@@ -373,8 +404,8 @@ static int handle_partial_execution(struct kvm_vcpu *vcpu)
*/
int handle_sthyi(struct kvm_vcpu *vcpu)
{
- int reg1, reg2, r = 0;
- u64 code, addr, cc = 0, rc = 0;
+ int reg1, reg2, cc = 0, r = 0;
+ u64 code, addr, rc = 0;
struct sthyi_sctns *sctns = NULL;
if (!test_kvm_facility(vcpu->kvm, 74))
@@ -405,7 +436,10 @@ int handle_sthyi(struct kvm_vcpu *vcpu)
return -ENOMEM;
cc = sthyi_fill(sctns, &rc);
-
+ if (cc < 0) {
+ free_page((unsigned long)sctns);
+ return cc;
+ }
out:
if (!cc) {
if (kvm_s390_pv_cpu_is_protected(vcpu)) {
@@ -437,6 +471,9 @@ static int handle_operexc(struct kvm_vcpu *vcpu)
if (vcpu->arch.sie_block->ipa == 0xb256)
return handle_sthyi(vcpu);
+ if (vcpu->kvm->arch.user_operexec)
+ return -EOPNOTSUPP;
+
if (vcpu->arch.sie_block->ipa == 0 && vcpu->kvm->arch.user_instr0)
return -EOPNOTSUPP;
rc = read_guest_lc(vcpu, __LC_PGM_NEW_PSW, &newpsw, sizeof(psw_t));
@@ -510,12 +547,12 @@ static int handle_pv_uvc(struct kvm_vcpu *vcpu)
guest_uvcb->header.cmd);
return 0;
}
- rc = gmap_make_secure(vcpu->arch.gmap, uvcb.gaddr, &uvcb);
+ rc = kvm_s390_pv_make_secure(vcpu->kvm, uvcb.gaddr, &uvcb);
/*
* If the unpin did not succeed, the guest will exit again for the UVC
* and we will retry the unpin.
*/
- if (rc == -EINVAL)
+ if (rc == -EINVAL || rc == -ENXIO)
return 0;
/*
* If we got -EAGAIN here, we simply return it. It will eventually
@@ -552,6 +589,19 @@ static int handle_pv_notification(struct kvm_vcpu *vcpu)
return handle_instruction(vcpu);
}
+static bool should_handle_per_ifetch(const struct kvm_vcpu *vcpu, int rc)
+{
+ /* Process PER, also if the instruction is processed in user space. */
+ if (!(vcpu->arch.sie_block->icptstatus & 0x02))
+ return false;
+ if (rc != 0 && rc != -EOPNOTSUPP)
+ return false;
+ if (guestdbg_sstep_enabled(vcpu) && vcpu->arch.local_int.pending_irqs)
+ /* __vcpu_run() will exit after delivering the interrupt. */
+ return false;
+ return true;
+}
+
int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
{
int rc, per_rc = 0;
@@ -586,8 +636,8 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
rc = handle_partial_execution(vcpu);
break;
case ICPT_KSS:
- rc = kvm_s390_skey_check_enable(vcpu);
- break;
+ /* Instruction will be redriven, skip the PER check. */
+ return kvm_s390_skey_check_enable(vcpu);
case ICPT_MCHKREQ:
case ICPT_INT_ENABLE:
/*
@@ -605,18 +655,14 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
break;
case ICPT_PV_PREF:
rc = 0;
- gmap_convert_to_secure(vcpu->arch.gmap,
- kvm_s390_get_prefix(vcpu));
- gmap_convert_to_secure(vcpu->arch.gmap,
- kvm_s390_get_prefix(vcpu) + PAGE_SIZE);
+ kvm_s390_pv_convert_to_secure(vcpu->kvm, kvm_s390_get_prefix(vcpu));
+ kvm_s390_pv_convert_to_secure(vcpu->kvm, kvm_s390_get_prefix(vcpu) + PAGE_SIZE);
break;
default:
return -EOPNOTSUPP;
}
- /* process PER, also if the instrution is processed in user space */
- if (vcpu->arch.sie_block->icptstatus & 0x02 &&
- (!rc || rc == -EOPNOTSUPP))
+ if (should_handle_per_ifetch(vcpu, rc))
per_rc = kvm_s390_handle_per_ifetch_icpt(vcpu);
return per_rc ? per_rc : rc;
}
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index ab26aa53ee37..249cdc822ec5 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -7,25 +7,26 @@
* Author(s): Carsten Otte <cotte@de.ibm.com>
*/
-#define KMSG_COMPONENT "kvm-s390"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "kvm-s390: " fmt
+#include <linux/cpufeature.h>
#include <linux/interrupt.h>
#include <linux/kvm_host.h>
#include <linux/hrtimer.h>
+#include <linux/export.h>
#include <linux/mmu_context.h>
#include <linux/nospec.h>
#include <linux/signal.h>
#include <linux/slab.h>
#include <linux/bitmap.h>
#include <linux/vmalloc.h>
+#include <asm/access-regs.h>
#include <asm/asm-offsets.h>
#include <asm/dis.h>
#include <linux/uaccess.h>
#include <asm/sclp.h>
#include <asm/isc.h>
#include <asm/gmap.h>
-#include <asm/switch_to.h>
#include <asm/nmi.h>
#include <asm/airq.h>
#include <asm/tpi.h>
@@ -43,70 +44,34 @@ static struct kvm_s390_gib *gib;
/* handle external calls via sigp interpretation facility */
static int sca_ext_call_pending(struct kvm_vcpu *vcpu, int *src_id)
{
- int c, scn;
+ struct esca_block *sca = vcpu->kvm->arch.sca;
+ union esca_sigp_ctrl sigp_ctrl = sca->cpu[vcpu->vcpu_id].sigp_ctrl;
if (!kvm_s390_test_cpuflags(vcpu, CPUSTAT_ECALL_PEND))
return 0;
BUG_ON(!kvm_s390_use_sca_entries());
- read_lock(&vcpu->kvm->arch.sca_lock);
- if (vcpu->kvm->arch.use_esca) {
- struct esca_block *sca = vcpu->kvm->arch.sca;
- union esca_sigp_ctrl sigp_ctrl =
- sca->cpu[vcpu->vcpu_id].sigp_ctrl;
-
- c = sigp_ctrl.c;
- scn = sigp_ctrl.scn;
- } else {
- struct bsca_block *sca = vcpu->kvm->arch.sca;
- union bsca_sigp_ctrl sigp_ctrl =
- sca->cpu[vcpu->vcpu_id].sigp_ctrl;
-
- c = sigp_ctrl.c;
- scn = sigp_ctrl.scn;
- }
- read_unlock(&vcpu->kvm->arch.sca_lock);
if (src_id)
- *src_id = scn;
+ *src_id = sigp_ctrl.scn;
- return c;
+ return sigp_ctrl.c;
}
static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id)
{
+ struct esca_block *sca = vcpu->kvm->arch.sca;
+ union esca_sigp_ctrl *sigp_ctrl = &sca->cpu[vcpu->vcpu_id].sigp_ctrl;
+ union esca_sigp_ctrl old_val, new_val = {.scn = src_id, .c = 1};
int expect, rc;
BUG_ON(!kvm_s390_use_sca_entries());
- read_lock(&vcpu->kvm->arch.sca_lock);
- if (vcpu->kvm->arch.use_esca) {
- struct esca_block *sca = vcpu->kvm->arch.sca;
- union esca_sigp_ctrl *sigp_ctrl =
- &(sca->cpu[vcpu->vcpu_id].sigp_ctrl);
- union esca_sigp_ctrl new_val = {0}, old_val;
-
- old_val = READ_ONCE(*sigp_ctrl);
- new_val.scn = src_id;
- new_val.c = 1;
- old_val.c = 0;
-
- expect = old_val.value;
- rc = cmpxchg(&sigp_ctrl->value, old_val.value, new_val.value);
- } else {
- struct bsca_block *sca = vcpu->kvm->arch.sca;
- union bsca_sigp_ctrl *sigp_ctrl =
- &(sca->cpu[vcpu->vcpu_id].sigp_ctrl);
- union bsca_sigp_ctrl new_val = {0}, old_val;
- old_val = READ_ONCE(*sigp_ctrl);
- new_val.scn = src_id;
- new_val.c = 1;
- old_val.c = 0;
+ old_val = READ_ONCE(*sigp_ctrl);
+ old_val.c = 0;
- expect = old_val.value;
- rc = cmpxchg(&sigp_ctrl->value, old_val.value, new_val.value);
- }
- read_unlock(&vcpu->kvm->arch.sca_lock);
+ expect = old_val.value;
+ rc = cmpxchg(&sigp_ctrl->value, old_val.value, new_val.value);
if (rc != expect) {
/* another external call is pending */
@@ -118,33 +83,14 @@ static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id)
static void sca_clear_ext_call(struct kvm_vcpu *vcpu)
{
- int rc, expect;
+ struct esca_block *sca = vcpu->kvm->arch.sca;
+ union esca_sigp_ctrl *sigp_ctrl = &sca->cpu[vcpu->vcpu_id].sigp_ctrl;
if (!kvm_s390_use_sca_entries())
return;
kvm_s390_clear_cpuflags(vcpu, CPUSTAT_ECALL_PEND);
- read_lock(&vcpu->kvm->arch.sca_lock);
- if (vcpu->kvm->arch.use_esca) {
- struct esca_block *sca = vcpu->kvm->arch.sca;
- union esca_sigp_ctrl *sigp_ctrl =
- &(sca->cpu[vcpu->vcpu_id].sigp_ctrl);
- union esca_sigp_ctrl old;
-
- old = READ_ONCE(*sigp_ctrl);
- expect = old.value;
- rc = cmpxchg(&sigp_ctrl->value, old.value, 0);
- } else {
- struct bsca_block *sca = vcpu->kvm->arch.sca;
- union bsca_sigp_ctrl *sigp_ctrl =
- &(sca->cpu[vcpu->vcpu_id].sigp_ctrl);
- union bsca_sigp_ctrl old;
- old = READ_ONCE(*sigp_ctrl);
- expect = old.value;
- rc = cmpxchg(&sigp_ctrl->value, old.value, 0);
- }
- read_unlock(&vcpu->kvm->arch.sca_lock);
- WARN_ON(rc != expect); /* cannot clear? */
+ WRITE_ONCE(sigp_ctrl->value, 0);
}
int psw_extint_disabled(struct kvm_vcpu *vcpu)
@@ -247,12 +193,12 @@ static inline int gisa_set_iam(struct kvm_s390_gisa *gisa, u8 iam)
{
u64 word, _word;
+ word = READ_ONCE(gisa->u64.word[0]);
do {
- word = READ_ONCE(gisa->u64.word[0]);
if ((u64)gisa != word >> 32)
return -EBUSY;
_word = (word & ~0xffUL) | iam;
- } while (cmpxchg(&gisa->u64.word[0], word, _word) != word);
+ } while (!try_cmpxchg(&gisa->u64.word[0], &word, _word));
return 0;
}
@@ -270,10 +216,10 @@ static inline void gisa_clear_ipm(struct kvm_s390_gisa *gisa)
{
u64 word, _word;
+ word = READ_ONCE(gisa->u64.word[0]);
do {
- word = READ_ONCE(gisa->u64.word[0]);
_word = word & ~(0xffUL << 24);
- } while (cmpxchg(&gisa->u64.word[0], word, _word) != word);
+ } while (!try_cmpxchg(&gisa->u64.word[0], &word, _word));
}
/**
@@ -291,23 +237,18 @@ static inline u8 gisa_get_ipm_or_restore_iam(struct kvm_s390_gisa_interrupt *gi)
u8 pending_mask, alert_mask;
u64 word, _word;
+ word = READ_ONCE(gi->origin->u64.word[0]);
do {
- word = READ_ONCE(gi->origin->u64.word[0]);
alert_mask = READ_ONCE(gi->alert.mask);
pending_mask = (u8)(word >> 24) & alert_mask;
if (pending_mask)
return pending_mask;
_word = (word & ~0xffUL) | alert_mask;
- } while (cmpxchg(&gi->origin->u64.word[0], word, _word) != word);
+ } while (!try_cmpxchg(&gi->origin->u64.word[0], &word, _word));
return 0;
}
-static inline int gisa_in_alert_list(struct kvm_s390_gisa *gisa)
-{
- return READ_ONCE(gisa->next_alert) != (u32)(u64)gisa;
-}
-
static inline void gisa_set_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc)
{
set_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa);
@@ -589,9 +530,9 @@ static int __write_machine_check(struct kvm_vcpu *vcpu,
mci.val = mchk->mcic;
/* take care of lazy register loading */
- save_fpu_regs();
+ kvm_s390_fpu_store(vcpu->run);
save_access_regs(vcpu->run->s.regs.acrs);
- if (MACHINE_HAS_GS && vcpu->arch.gs_enabled)
+ if (cpu_has_gs() && vcpu->arch.gs_enabled)
save_gs_cb(current->thread.gs_cb);
/* Extended save area */
@@ -644,7 +585,7 @@ static int __write_machine_check(struct kvm_vcpu *vcpu,
rc |= put_guest_lc(vcpu, mci.val, (u64 __user *) __LC_MCCK_CODE);
/* Register-save areas */
- if (MACHINE_HAS_VX) {
+ if (cpu_has_vx()) {
convert_vx_to_fp(fprs, (__vector128 *) vcpu->run->s.regs.vrs);
rc |= write_guest_lc(vcpu, __LC_FPREGS_SAVE_AREA, fprs, 128);
} else {
@@ -653,7 +594,7 @@ static int __write_machine_check(struct kvm_vcpu *vcpu,
}
rc |= write_guest_lc(vcpu, __LC_GPREGS_SAVE_AREA,
vcpu->run->s.regs.gprs, 128);
- rc |= put_guest_lc(vcpu, current->thread.fpu.fpc,
+ rc |= put_guest_lc(vcpu, vcpu->run->s.regs.fpc,
(u32 __user *) __LC_FP_CREG_SAVE_AREA);
rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->todpr,
(u32 __user *) __LC_TOD_PROGREG_SAVE_AREA);
@@ -962,8 +903,7 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
rc |= put_guest_lc(vcpu, ilen, (u16 *) __LC_PGM_ILC);
rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->gbea,
(u64 *) __LC_PGM_LAST_BREAK);
- rc |= put_guest_lc(vcpu, pgm_info.code,
- (u16 *)__LC_PGM_INT_CODE);
+ rc |= put_guest_lc(vcpu, pgm_info.code, (u16 *)__LC_PGM_CODE);
rc |= write_guest_lc(vcpu, __LC_PGM_OLD_PSW,
&vcpu->arch.sie_block->gpsw, sizeof(psw_t));
rc |= read_guest_lc(vcpu, __LC_PGM_NEW_PSW,
@@ -1036,7 +976,7 @@ static int __must_check __deliver_service_ev(struct kvm_vcpu *vcpu)
return 0;
}
ext = fi->srv_signal;
- /* only clear the event bit */
+ /* only clear the event bits */
fi->srv_signal.ext_params &= ~SCCB_EVENT_PENDING;
clear_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs);
spin_unlock(&fi->lock);
@@ -1046,7 +986,7 @@ static int __must_check __deliver_service_ev(struct kvm_vcpu *vcpu)
trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_SERVICE,
ext.ext_params, 0);
- return write_sclp(vcpu, SCCB_EVENT_PENDING);
+ return write_sclp(vcpu, ext.ext_params & SCCB_EVENT_PENDING);
}
static int __must_check __deliver_pfault_done(struct kvm_vcpu *vcpu)
@@ -1237,7 +1177,7 @@ int kvm_s390_ext_call_pending(struct kvm_vcpu *vcpu)
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
- if (!sclp.has_sigpif)
+ if (!kvm_s390_use_sca_entries())
return test_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs);
return sca_ext_call_pending(vcpu, NULL);
@@ -1336,6 +1276,7 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
VCPU_EVENT(vcpu, 4, "enabled wait: %llu ns", sltime);
no_timer:
kvm_vcpu_srcu_read_unlock(vcpu);
+ vcpu->kvm->arch.float_int.last_sleep_cpu = vcpu->vcpu_idx;
kvm_vcpu_halt(vcpu);
vcpu->valid_wakeup = false;
__unset_cpu_idle(vcpu);
@@ -1392,6 +1333,7 @@ int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
{
struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
int rc = 0;
+ bool delivered = false;
unsigned long irq_type;
unsigned long irqs;
@@ -1465,6 +1407,19 @@ int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
WARN_ONCE(1, "Unknown pending irq type %ld", irq_type);
clear_bit(irq_type, &li->pending_irqs);
}
+ delivered |= !rc;
+ }
+
+ /*
+ * We delivered at least one interrupt and modified the PC. Force a
+ * singlestep event now.
+ */
+ if (delivered && guestdbg_sstep_enabled(vcpu)) {
+ struct kvm_debug_exit_arch *debug_exit = &vcpu->run->debug.arch;
+
+ debug_exit->addr = vcpu->arch.sie_block->gpsw.addr;
+ debug_exit->type = KVM_SINGLESTEP;
+ vcpu->guest_debug |= KVM_GUESTDBG_EXIT_PENDING;
}
set_intercept_indicators(vcpu);
@@ -1547,7 +1502,7 @@ static int __inject_extcall(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
if (kvm_get_vcpu_by_id(vcpu->kvm, src_id) == NULL)
return -EINVAL;
- if (sclp.has_sigpif && !kvm_s390_pv_cpu_get_handle(vcpu))
+ if (kvm_s390_use_sca_entries() && !kvm_s390_pv_cpu_get_handle(vcpu))
return sca_inject_ext_call(vcpu, src_id);
if (test_and_set_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs))
@@ -1948,18 +1903,15 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type)
if (!online_vcpus)
return;
- /* find idle VCPUs first, then round robin */
- sigcpu = find_first_bit(kvm->arch.idle_mask, online_vcpus);
- if (sigcpu == online_vcpus) {
- do {
- sigcpu = kvm->arch.float_int.next_rr_cpu++;
- kvm->arch.float_int.next_rr_cpu %= online_vcpus;
- /* avoid endless loops if all vcpus are stopped */
- if (nr_tries++ >= online_vcpus)
- return;
- } while (is_vcpu_stopped(kvm_get_vcpu(kvm, sigcpu)));
+ for (sigcpu = kvm->arch.float_int.last_sleep_cpu; ; sigcpu++) {
+ sigcpu %= online_vcpus;
+ dst_vcpu = kvm_get_vcpu(kvm, sigcpu);
+ if (!is_vcpu_stopped(dst_vcpu))
+ break;
+ /* avoid endless loops if all vcpus are stopped */
+ if (nr_tries++ >= online_vcpus)
+ return;
}
- dst_vcpu = kvm_get_vcpu(kvm, sigcpu);
/* make the VCPU drop out of the SIE, or wake it up if sleeping */
switch (type) {
@@ -2678,9 +2630,13 @@ static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
kvm_s390_clear_float_irqs(dev->kvm);
break;
case KVM_DEV_FLIC_APF_ENABLE:
+ if (kvm_is_ucontrol(dev->kvm))
+ return -EINVAL;
dev->kvm->arch.gmap->pfault_enabled = 1;
break;
case KVM_DEV_FLIC_APF_DISABLE_WAIT:
+ if (kvm_is_ucontrol(dev->kvm))
+ return -EINVAL;
dev->kvm->arch.gmap->pfault_enabled = 0;
/*
* Make sure no async faults are in transition when
@@ -2773,12 +2729,19 @@ static unsigned long get_ind_bit(__u64 addr, unsigned long bit_nr, bool swap)
static struct page *get_map_page(struct kvm *kvm, u64 uaddr)
{
+ struct mm_struct *mm = kvm->mm;
struct page *page = NULL;
+ int locked = 1;
+
+ if (mmget_not_zero(mm)) {
+ mmap_read_lock(mm);
+ get_user_pages_remote(mm, uaddr, 1, FOLL_WRITE,
+ &page, &locked);
+ if (locked)
+ mmap_read_unlock(mm);
+ mmput(mm);
+ }
- mmap_read_lock(kvm->mm);
- get_user_pages_remote(kvm->mm, uaddr, 1, FOLL_WRITE,
- &page, NULL, NULL);
- mmap_read_unlock(kvm->mm);
return page;
}
@@ -2889,20 +2852,25 @@ int kvm_set_routing_entry(struct kvm *kvm,
struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue)
{
- u64 uaddr;
+ u64 uaddr_s, uaddr_i;
+ int idx;
switch (ue->type) {
/* we store the userspace addresses instead of the guest addresses */
case KVM_IRQ_ROUTING_S390_ADAPTER:
+ if (kvm_is_ucontrol(kvm))
+ return -EINVAL;
e->set = set_adapter_int;
- uaddr = gmap_translate(kvm->arch.gmap, ue->u.adapter.summary_addr);
- if (uaddr == -EFAULT)
- return -EFAULT;
- e->adapter.summary_addr = uaddr;
- uaddr = gmap_translate(kvm->arch.gmap, ue->u.adapter.ind_addr);
- if (uaddr == -EFAULT)
+
+ idx = srcu_read_lock(&kvm->srcu);
+ uaddr_s = gpa_to_hva(kvm, ue->u.adapter.summary_addr);
+ uaddr_i = gpa_to_hva(kvm, ue->u.adapter.ind_addr);
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ if (kvm_is_error_hva(uaddr_s) || kvm_is_error_hva(uaddr_i))
return -EFAULT;
- e->adapter.ind_addr = uaddr;
+ e->adapter.summary_addr = uaddr_s;
+ e->adapter.ind_addr = uaddr_i;
e->adapter.summary_offset = ue->u.adapter.summary_offset;
e->adapter.ind_offset = ue->u.adapter.ind_offset;
e->adapter.adapter_id = ue->u.adapter.adapter_id;
@@ -3103,9 +3071,9 @@ static enum hrtimer_restart gisa_vcpu_kicker(struct hrtimer *timer)
static void process_gib_alert_list(void)
{
struct kvm_s390_gisa_interrupt *gi;
+ u32 final, gisa_phys, origin = 0UL;
struct kvm_s390_gisa *gisa;
struct kvm *kvm;
- u32 final, origin = 0UL;
do {
/*
@@ -3131,9 +3099,10 @@ static void process_gib_alert_list(void)
* interruptions asap.
*/
while (origin & GISA_ADDR_MASK) {
- gisa = (struct kvm_s390_gisa *)(u64)origin;
+ gisa_phys = origin;
+ gisa = phys_to_virt(gisa_phys);
origin = gisa->next_alert;
- gisa->next_alert = (u32)(u64)gisa;
+ gisa->next_alert = gisa_phys;
kvm = container_of(gisa, struct sie_page2, gisa)->kvm;
gi = &kvm->arch.gisa_int;
if (hrtimer_active(&gi->timer))
@@ -3151,7 +3120,7 @@ void kvm_s390_gisa_clear(struct kvm *kvm)
if (!gi->origin)
return;
gisa_clear_ipm(gi->origin);
- VM_EVENT(kvm, 3, "gisa 0x%pK cleared", gi->origin);
+ VM_EVENT(kvm, 3, "gisa 0x%p cleared", gi->origin);
}
void kvm_s390_gisa_init(struct kvm *kvm)
@@ -3164,11 +3133,10 @@ void kvm_s390_gisa_init(struct kvm *kvm)
gi->alert.mask = 0;
spin_lock_init(&gi->alert.ref_lock);
gi->expires = 50 * 1000; /* 50 usec */
- hrtimer_init(&gi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- gi->timer.function = gisa_vcpu_kicker;
+ hrtimer_setup(&gi->timer, gisa_vcpu_kicker, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
memset(gi->origin, 0, sizeof(struct kvm_s390_gisa));
- gi->origin->next_alert = (u32)(u64)gi->origin;
- VM_EVENT(kvm, 3, "gisa 0x%pK initialized", gi->origin);
+ gi->origin->next_alert = (u32)virt_to_phys(gi->origin);
+ VM_EVENT(kvm, 3, "gisa 0x%p initialized", gi->origin);
}
void kvm_s390_gisa_enable(struct kvm *kvm)
@@ -3201,14 +3169,15 @@ void kvm_s390_gisa_destroy(struct kvm *kvm)
if (!gi->origin)
return;
- if (gi->alert.mask)
- KVM_EVENT(3, "vm 0x%pK has unexpected iam 0x%02x",
- kvm, gi->alert.mask);
- while (gisa_in_alert_list(gi->origin))
- cpu_relax();
+ WARN(gi->alert.mask != 0x00,
+ "unexpected non zero alert.mask 0x%02x",
+ gi->alert.mask);
+ gi->alert.mask = 0x00;
+ if (gisa_set_iam(gi->origin, gi->alert.mask))
+ process_gib_alert_list();
hrtimer_cancel(&gi->timer);
gi->origin = NULL;
- VM_EVENT(kvm, 3, "gisa 0x%pK destroyed", gisa);
+ VM_EVENT(kvm, 3, "gisa 0x%p destroyed", gisa);
}
void kvm_s390_gisa_disable(struct kvm *kvm)
@@ -3397,7 +3366,6 @@ static void gib_alert_irq_handler(struct airq_struct *airq,
static struct airq_struct gib_alert_irq = {
.handler = gib_alert_irq_handler,
- .lsi_ptr = &gib_alert_irq.lsi_mask,
};
void kvm_s390_gib_destroy(void)
@@ -3415,8 +3383,9 @@ void kvm_s390_gib_destroy(void)
gib = NULL;
}
-int kvm_s390_gib_init(u8 nisc)
+int __init kvm_s390_gib_init(u8 nisc)
{
+ u32 gib_origin;
int rc = 0;
if (!css_general_characteristics.aiv) {
@@ -3436,9 +3405,12 @@ int kvm_s390_gib_init(u8 nisc)
rc = -EIO;
goto out_free_gib;
}
+ /* adapter interrupts used for AP (applicable here) don't use the LSI */
+ *gib_alert_irq.lsi_ptr = 0xff;
gib->nisc = nisc;
- if (chsc_sgib((u32)(u64)gib)) {
+ gib_origin = virt_to_phys(gib);
+ if (chsc_sgib(gib_origin)) {
pr_err("Associating the GIB with the AIV facility failed\n");
free_page((unsigned long)gib);
gib = NULL;
@@ -3454,7 +3426,7 @@ int kvm_s390_gib_init(u8 nisc)
}
}
- KVM_EVENT(3, "gib 0x%pK (nisc=%d) initialized", gib, gib->nisc);
+ KVM_EVENT(3, "gib 0x%p (nisc=%d) initialized", gib, gib->nisc);
goto out;
out_unreg_gal:
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index e4890e04b210..56a50524b3ee 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -10,10 +10,11 @@
* Jason J. Herne <jjherne@us.ibm.com>
*/
-#define KMSG_COMPONENT "kvm-s390"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "kvm-s390: " fmt
#include <linux/compiler.h>
+#include <linux/entry-virt.h>
+#include <linux/export.h>
#include <linux/err.h>
#include <linux/fs.h>
#include <linux/hrtimer.h>
@@ -23,6 +24,7 @@
#include <linux/mman.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
+#include <linux/cpufeature.h>
#include <linux/random.h>
#include <linux/slab.h>
#include <linux/timer.h>
@@ -33,19 +35,22 @@
#include <linux/pgtable.h>
#include <linux/mmu_notifier.h>
+#include <asm/access-regs.h>
#include <asm/asm-offsets.h>
#include <asm/lowcore.h>
+#include <asm/machine.h>
#include <asm/stp.h>
#include <asm/gmap.h>
+#include <asm/gmap_helpers.h>
#include <asm/nmi.h>
-#include <asm/switch_to.h>
#include <asm/isc.h>
#include <asm/sclp.h>
#include <asm/cpacf.h>
#include <asm/timex.h>
+#include <asm/asm.h>
+#include <asm/fpu.h>
#include <asm/ap.h>
#include <asm/uv.h>
-#include <asm/fpu/api.h>
#include "kvm-s390.h"
#include "gaccess.h"
#include "pci.h"
@@ -66,7 +71,14 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
STATS_DESC_COUNTER(VM, inject_pfault_done),
STATS_DESC_COUNTER(VM, inject_service_signal),
STATS_DESC_COUNTER(VM, inject_virtio),
- STATS_DESC_COUNTER(VM, aen_forward)
+ STATS_DESC_COUNTER(VM, aen_forward),
+ STATS_DESC_COUNTER(VM, gmap_shadow_reuse),
+ STATS_DESC_COUNTER(VM, gmap_shadow_create),
+ STATS_DESC_COUNTER(VM, gmap_shadow_r1_entry),
+ STATS_DESC_COUNTER(VM, gmap_shadow_r2_entry),
+ STATS_DESC_COUNTER(VM, gmap_shadow_r3_entry),
+ STATS_DESC_COUNTER(VM, gmap_shadow_sg_entry),
+ STATS_DESC_COUNTER(VM, gmap_shadow_pg_entry),
};
const struct kvm_stats_header kvm_vm_stats_header = {
@@ -125,6 +137,7 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
STATS_DESC_COUNTER(VCPU, instruction_io_other),
STATS_DESC_COUNTER(VCPU, instruction_lpsw),
STATS_DESC_COUNTER(VCPU, instruction_lpswe),
+ STATS_DESC_COUNTER(VCPU, instruction_lpswey),
STATS_DESC_COUNTER(VCPU, instruction_pfmf),
STATS_DESC_COUNTER(VCPU, instruction_ptff),
STATS_DESC_COUNTER(VCPU, instruction_sck),
@@ -172,7 +185,8 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
STATS_DESC_COUNTER(VCPU, instruction_diagnose_308),
STATS_DESC_COUNTER(VCPU, instruction_diagnose_500),
STATS_DESC_COUNTER(VCPU, instruction_diagnose_other),
- STATS_DESC_COUNTER(VCPU, pfault_sync)
+ STATS_DESC_COUNTER(VCPU, pfault_sync),
+ STATS_DESC_COUNTER(VCPU, signal_exits)
};
const struct kvm_stats_header kvm_vcpu_stats_header = {
@@ -256,21 +270,9 @@ debug_info_t *kvm_s390_dbf;
debug_info_t *kvm_s390_dbf_uv;
/* Section: not file related */
-int kvm_arch_hardware_enable(void)
-{
- /* every s390 is virtualization enabled ;-) */
- return 0;
-}
-
-int kvm_arch_check_processor_compat(void *opaque)
-{
- return 0;
-}
-
/* forward declarations */
static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
unsigned long end);
-static int sca_switch_to_extended(struct kvm *kvm);
static void kvm_clock_sync_scb(struct kvm_s390_sie_block *scb, u64 delta)
{
@@ -329,25 +331,6 @@ static struct notifier_block kvm_clock_notifier = {
.notifier_call = kvm_clock_sync,
};
-int kvm_arch_hardware_setup(void *opaque)
-{
- gmap_notifier.notifier_call = kvm_gmap_notifier;
- gmap_register_pte_notifier(&gmap_notifier);
- vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier;
- gmap_register_pte_notifier(&vsie_gmap_notifier);
- atomic_notifier_chain_register(&s390_epoch_delta_notifier,
- &kvm_clock_notifier);
- return 0;
-}
-
-void kvm_arch_hardware_unsetup(void)
-{
- gmap_unregister_pte_notifier(&gmap_notifier);
- gmap_unregister_pte_notifier(&vsie_gmap_notifier);
- atomic_notifier_chain_unregister(&s390_epoch_delta_notifier,
- &kvm_clock_notifier);
-}
-
static void allow_cpu_feat(unsigned long nr)
{
set_bit_inv(nr, kvm_s390_available_cpu_feat);
@@ -362,30 +345,48 @@ static inline int plo_test_bit(unsigned char nr)
" lgr 0,%[function]\n"
/* Parameter registers are ignored for "test bit" */
" plo 0,0,0,0(0)\n"
- " ipm %0\n"
- " srl %0,28\n"
- : "=d" (cc)
+ CC_IPM(cc)
+ : CC_OUT(cc, cc)
: [function] "d" (function)
+ : CC_CLOBBER_LIST("0"));
+ return CC_TRANSFORM(cc) == 0;
+}
+
+static __always_inline void pfcr_query(u8 (*query)[16])
+{
+ asm volatile(
+ " lghi 0,0\n"
+ " .insn rsy,0xeb0000000016,0,0,%[query]"
+ : [query] "=QS" (*query)
+ :
: "cc", "0");
- return cc == 0;
}
-static __always_inline void __insn32_query(unsigned int opcode, u8 *query)
+static __always_inline void __sortl_query(u8 (*query)[32])
{
asm volatile(
" lghi 0,0\n"
- " lgr 1,%[query]\n"
+ " la 1,%[query]\n"
/* Parameter registers are ignored */
- " .insn rrf,%[opc] << 16,2,4,6,0\n"
+ " .insn rre,0xb9380000,2,4"
+ : [query] "=R" (*query)
:
- : [query] "d" ((unsigned long)query), [opc] "i" (opcode)
- : "cc", "memory", "0", "1");
+ : "cc", "0", "1");
}
-#define INSN_SORTL 0xb938
-#define INSN_DFLTCC 0xb939
+static __always_inline void __dfltcc_query(u8 (*query)[32])
+{
+ asm volatile(
+ " lghi 0,0\n"
+ " la 1,%[query]\n"
+ /* Parameter registers are ignored */
+ " .insn rrf,0xb9390000,2,4,6,0"
+ : [query] "=R" (*query)
+ :
+ : "cc", "0", "1");
+}
-static void kvm_s390_cpu_feat_init(void)
+static void __init kvm_s390_cpu_feat_init(void)
{
int i;
@@ -437,18 +438,21 @@ static void kvm_s390_cpu_feat_init(void)
kvm_s390_available_subfunc.kdsa);
if (test_facility(150)) /* SORTL */
- __insn32_query(INSN_SORTL, kvm_s390_available_subfunc.sortl);
+ __sortl_query(&kvm_s390_available_subfunc.sortl);
if (test_facility(151)) /* DFLTCC */
- __insn32_query(INSN_DFLTCC, kvm_s390_available_subfunc.dfltcc);
+ __dfltcc_query(&kvm_s390_available_subfunc.dfltcc);
- if (MACHINE_HAS_ESOP)
+ if (test_facility(201)) /* PFCR */
+ pfcr_query(&kvm_s390_available_subfunc.pfcr);
+
+ if (machine_has_esop())
allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP);
/*
* We need SIE support, ESOP (PROT_READ protection for gmap_shadow),
* 64bit SCAO (SCA passthrough) and IDTE (for gmap_shadow unshadowing).
*/
- if (!sclp.has_sief2 || !MACHINE_HAS_ESOP || !sclp.has_64bscao ||
+ if (!sclp.has_sief2 || !machine_has_esop() || !sclp.has_64bscao ||
!test_facility(3) || !nested)
return;
allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIEF2);
@@ -488,7 +492,7 @@ static void kvm_s390_cpu_feat_init(void)
*/
}
-int kvm_arch_init(void *opaque)
+static int __init __kvm_s390_init(void)
{
int rc = -ENOMEM;
@@ -498,11 +502,11 @@ int kvm_arch_init(void *opaque)
kvm_s390_dbf_uv = debug_register("kvm-uv", 32, 1, 7 * sizeof(long));
if (!kvm_s390_dbf_uv)
- goto out;
+ goto err_kvm_uv;
if (debug_register_view(kvm_s390_dbf, &debug_sprintf_view) ||
debug_register_view(kvm_s390_dbf_uv, &debug_sprintf_view))
- goto out;
+ goto err_debug_view;
kvm_s390_cpu_feat_init();
@@ -510,30 +514,49 @@ int kvm_arch_init(void *opaque)
rc = kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC);
if (rc) {
pr_err("A FLIC registration call failed with rc=%d\n", rc);
- goto out;
+ goto err_flic;
}
if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) {
rc = kvm_s390_pci_init();
if (rc) {
pr_err("Unable to allocate AIFT for PCI\n");
- goto out;
+ goto err_pci;
}
}
rc = kvm_s390_gib_init(GAL_ISC);
if (rc)
- goto out;
+ goto err_gib;
+
+ gmap_notifier.notifier_call = kvm_gmap_notifier;
+ gmap_register_pte_notifier(&gmap_notifier);
+ vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier;
+ gmap_register_pte_notifier(&vsie_gmap_notifier);
+ atomic_notifier_chain_register(&s390_epoch_delta_notifier,
+ &kvm_clock_notifier);
return 0;
-out:
- kvm_arch_exit();
+err_gib:
+ if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM))
+ kvm_s390_pci_exit();
+err_pci:
+err_flic:
+err_debug_view:
+ debug_unregister(kvm_s390_dbf_uv);
+err_kvm_uv:
+ debug_unregister(kvm_s390_dbf);
return rc;
}
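__kvm_s390_init() above now unwinds partially set up state in reverse order through labeled gotos instead of calling a catch-all teardown. A generic illustration of the idiom, with hypothetical setup_*/teardown_* helpers:

static int init_sketch(void)
{
	int rc;

	rc = setup_a();			/* hypothetical resource A */
	if (rc)
		return rc;
	rc = setup_b();			/* hypothetical resource B */
	if (rc)
		goto err_b;
	rc = setup_c();			/* hypothetical resource C */
	if (rc)
		goto err_c;
	return 0;
err_c:
	teardown_b();			/* undo only what was acquired, newest first */
err_b:
	teardown_a();
	return rc;
}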
-void kvm_arch_exit(void)
+static void __kvm_s390_exit(void)
{
+ gmap_unregister_pte_notifier(&gmap_notifier);
+ gmap_unregister_pte_notifier(&vsie_gmap_notifier);
+ atomic_notifier_chain_unregister(&s390_epoch_delta_notifier,
+ &kvm_clock_notifier);
+
kvm_s390_gib_destroy();
if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM))
kvm_s390_pci_exit();
@@ -567,7 +590,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ENABLE_CAP:
case KVM_CAP_S390_CSS_SUPPORT:
case KVM_CAP_IOEVENTFD:
- case KVM_CAP_DEVICE_CTRL:
case KVM_CAP_S390_IRQCHIP:
case KVM_CAP_VM_ATTRIBUTES:
case KVM_CAP_MP_STATE:
@@ -584,7 +606,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_S390_VCPU_RESETS:
case KVM_CAP_SET_GUEST_DEBUG:
case KVM_CAP_S390_DIAG318:
- case KVM_CAP_S390_MEM_OP_EXTENSION:
+ case KVM_CAP_IRQFD_RESAMPLE:
+ case KVM_CAP_S390_USER_OPEREXEC:
r = 1;
break;
case KVM_CAP_SET_GUEST_DEBUG2:
@@ -592,28 +615,39 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
break;
case KVM_CAP_S390_HPAGE_1M:
r = 0;
- if (hpage && !kvm_is_ucontrol(kvm))
+ if (hpage && !(kvm && kvm_is_ucontrol(kvm)))
r = 1;
break;
case KVM_CAP_S390_MEM_OP:
r = MEM_OP_MAX_SIZE;
break;
+ case KVM_CAP_S390_MEM_OP_EXTENSION:
+ /*
+ * Flag bits indicating which extensions are supported.
+ * If r > 0, the base extension must also be supported/indicated,
+ * in order to maintain backwards compatibility.
+ */
+ r = KVM_S390_MEMOP_EXTENSION_CAP_BASE |
+ KVM_S390_MEMOP_EXTENSION_CAP_CMPXCHG;
+ break;
case KVM_CAP_NR_VCPUS:
case KVM_CAP_MAX_VCPUS:
case KVM_CAP_MAX_VCPU_ID:
- r = KVM_S390_BSCA_CPU_SLOTS;
+ /*
+ * Return the same value for KVM_CAP_MAX_VCPUS and
+ * KVM_CAP_MAX_VCPU_ID to conform with the KVM API.
+ */
+ r = KVM_S390_ESCA_CPU_SLOTS;
if (!kvm_s390_use_sca_entries())
r = KVM_MAX_VCPUS;
- else if (sclp.has_esca && sclp.has_64bscao)
- r = KVM_S390_ESCA_CPU_SLOTS;
if (ext == KVM_CAP_NR_VCPUS)
r = min_t(unsigned int, num_online_cpus(), r);
break;
case KVM_CAP_S390_COW:
- r = MACHINE_HAS_ESOP;
+ r = machine_has_esop();
break;
case KVM_CAP_S390_VECTOR_REGISTERS:
- r = MACHINE_HAS_VX;
+ r = test_facility(129);
break;
case KVM_CAP_S390_RI:
r = test_facility(64);
@@ -762,7 +796,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
mutex_lock(&kvm->lock);
if (kvm->created_vcpus) {
r = -EBUSY;
- } else if (MACHINE_HAS_VX) {
+ } else if (cpu_has_vx()) {
set_kvm_facility(kvm->arch.model.fac_mask, 129);
set_kvm_facility(kvm->arch.model.fac_list, 129);
if (test_facility(134)) {
@@ -785,6 +819,14 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
set_kvm_facility(kvm->arch.model.fac_mask, 192);
set_kvm_facility(kvm->arch.model.fac_list, 192);
}
+ if (test_facility(198)) {
+ set_kvm_facility(kvm->arch.model.fac_mask, 198);
+ set_kvm_facility(kvm->arch.model.fac_list, 198);
+ }
+ if (test_facility(199)) {
+ set_kvm_facility(kvm->arch.model.fac_mask, 199);
+ set_kvm_facility(kvm->arch.model.fac_list, 199);
+ }
r = 0;
} else
r = -EINVAL;
@@ -881,6 +923,12 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
VM_EVENT(kvm, 3, "ENABLE: CAP_S390_CPU_TOPOLOGY %s",
r ? "(not available)" : "(success)");
break;
+ case KVM_CAP_S390_USER_OPEREXEC:
+ VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_OPEREXEC");
+ kvm->arch.user_operexec = 1;
+ icpt_operexc_on_all_vcpus(kvm);
+ r = 0;
+ break;
default:
r = -EINVAL;
break;
@@ -984,7 +1032,7 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
}
mutex_unlock(&kvm->lock);
VM_EVENT(kvm, 3, "SET: max guest address: %lu", new_limit);
- VM_EVENT(kvm, 3, "New guest asce: 0x%pK",
+ VM_EVENT(kvm, 3, "New guest asce: 0x%p",
(void *) kvm->arch.gmap->asce);
break;
}
@@ -1529,6 +1577,42 @@ static int kvm_s390_set_processor_subfunc(struct kvm *kvm,
((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[1],
((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[2],
((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[3]);
+ VM_EVENT(kvm, 3, "GET: guest PFCR subfunc 0x%16.16lx.%16.16lx",
+ ((unsigned long *) &kvm_s390_available_subfunc.pfcr)[0],
+ ((unsigned long *) &kvm_s390_available_subfunc.pfcr)[1]);
+
+ return 0;
+}
+
+#define KVM_S390_VM_CPU_UV_FEAT_GUEST_MASK \
+( \
+ ((struct kvm_s390_vm_cpu_uv_feat){ \
+ .ap = 1, \
+ .ap_intr = 1, \
+ }) \
+ .feat \
+)
+
+static int kvm_s390_set_uv_feat(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+ struct kvm_s390_vm_cpu_uv_feat __user *ptr = (void __user *)attr->addr;
+ unsigned long data, filter;
+
+ filter = uv_info.uv_feature_indications & KVM_S390_VM_CPU_UV_FEAT_GUEST_MASK;
+ if (get_user(data, &ptr->feat))
+ return -EFAULT;
+ if (!bitmap_subset(&data, &filter, KVM_S390_VM_CPU_UV_FEAT_NR_BITS))
+ return -EINVAL;
+
+ mutex_lock(&kvm->lock);
+ if (kvm->created_vcpus) {
+ mutex_unlock(&kvm->lock);
+ return -EBUSY;
+ }
+ kvm->arch.model.uv_feat_guest.feat = data;
+ mutex_unlock(&kvm->lock);
+
+ VM_EVENT(kvm, 3, "SET: guest UV-feat: 0x%16.16lx", data);
return 0;
}
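kvm_s390_set_uv_feat() rejects any bit that the ultravisor does not advertise by requiring the user value to be a subset of the filtered host indications. A minimal standalone illustration of the same bitmap_subset() check (names are hypothetical):

/* Accept a user-supplied feature word only if it requests nothing beyond "allowed". */
static int check_requested_feats(unsigned long requested, unsigned long allowed)
{
	if (!bitmap_subset(&requested, &allowed, BITS_PER_LONG))
		return -EINVAL;
	return 0;
}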
@@ -1547,6 +1631,9 @@ static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
ret = kvm_s390_set_processor_subfunc(kvm, attr);
break;
+ case KVM_S390_VM_CPU_PROCESSOR_UV_FEAT_GUEST:
+ ret = kvm_s390_set_uv_feat(kvm, attr);
+ break;
}
return ret;
}
@@ -1707,6 +1794,9 @@ static int kvm_s390_get_processor_subfunc(struct kvm *kvm,
((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[1],
((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[2],
((unsigned long *) &kvm->arch.model.subfuncs.dfltcc)[3]);
+ VM_EVENT(kvm, 3, "GET: guest PFCR subfunc 0x%16.16lx.%16.16lx",
+ ((unsigned long *) &kvm_s390_available_subfunc.pfcr)[0],
+ ((unsigned long *) &kvm_s390_available_subfunc.pfcr)[1]);
return 0;
}
@@ -1775,6 +1865,36 @@ static int kvm_s390_get_machine_subfunc(struct kvm *kvm,
((unsigned long *) &kvm_s390_available_subfunc.dfltcc)[1],
((unsigned long *) &kvm_s390_available_subfunc.dfltcc)[2],
((unsigned long *) &kvm_s390_available_subfunc.dfltcc)[3]);
+ VM_EVENT(kvm, 3, "GET: host PFCR subfunc 0x%16.16lx.%16.16lx",
+ ((unsigned long *) &kvm_s390_available_subfunc.pfcr)[0],
+ ((unsigned long *) &kvm_s390_available_subfunc.pfcr)[1]);
+
+ return 0;
+}
+
+static int kvm_s390_get_processor_uv_feat(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+ struct kvm_s390_vm_cpu_uv_feat __user *dst = (void __user *)attr->addr;
+ unsigned long feat = kvm->arch.model.uv_feat_guest.feat;
+
+ if (put_user(feat, &dst->feat))
+ return -EFAULT;
+ VM_EVENT(kvm, 3, "GET: guest UV-feat: 0x%16.16lx", feat);
+
+ return 0;
+}
+
+static int kvm_s390_get_machine_uv_feat(struct kvm *kvm, struct kvm_device_attr *attr)
+{
+ struct kvm_s390_vm_cpu_uv_feat __user *dst = (void __user *)attr->addr;
+ unsigned long feat;
+
+ BUILD_BUG_ON(sizeof(*dst) != sizeof(uv_info.uv_feature_indications));
+
+ feat = uv_info.uv_feature_indications & KVM_S390_VM_CPU_UV_FEAT_GUEST_MASK;
+ if (put_user(feat, &dst->feat))
+ return -EFAULT;
+ VM_EVENT(kvm, 3, "GET: guest UV-feat: 0x%16.16lx", feat);
return 0;
}
@@ -1802,6 +1922,12 @@ static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
case KVM_S390_VM_CPU_MACHINE_SUBFUNC:
ret = kvm_s390_get_machine_subfunc(kvm, attr);
break;
+ case KVM_S390_VM_CPU_PROCESSOR_UV_FEAT_GUEST:
+ ret = kvm_s390_get_processor_uv_feat(kvm, attr);
+ break;
+ case KVM_S390_VM_CPU_MACHINE_UV_FEAT_GUEST:
+ ret = kvm_s390_get_machine_uv_feat(kvm, attr);
+ break;
}
return ret;
}
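From userspace, the UV-feature attributes handled above are reached through the VM device-attribute interface. A hypothetical caller, assuming <linux/kvm.h>, <sys/ioctl.h>, <stdio.h>, <stdint.h> and an already open VM file descriptor vm_fd:

/* Query which UV features can be offered to guests (hypothetical helper). */
static void dump_guest_uv_feats(int vm_fd)
{
	struct kvm_s390_vm_cpu_uv_feat uv_feat = {};
	struct kvm_device_attr attr = {
		.group = KVM_S390_VM_CPU_MODEL,
		.attr  = KVM_S390_VM_CPU_MACHINE_UV_FEAT_GUEST,
		.addr  = (uint64_t)(uintptr_t)&uv_feat,
	};

	if (ioctl(vm_fd, KVM_GET_DEVICE_ATTR, &attr) == 0)
		printf("UV features offered to guests: 0x%llx\n",
		       (unsigned long long)uv_feat.feat);
}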
@@ -1814,22 +1940,18 @@ static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
* Updates the Multiprocessor Topology-Change-Report bit to signal
* the guest with a topology change.
* This is only relevant if the topology facility is present.
- *
- * The SCA version, bsca or esca, doesn't matter as offset is the same.
*/
static void kvm_s390_update_topology_change_report(struct kvm *kvm, bool val)
{
union sca_utility new, old;
- struct bsca_block *sca;
+ struct esca_block *sca;
- read_lock(&kvm->arch.sca_lock);
sca = kvm->arch.sca;
+ old = READ_ONCE(sca->utility);
do {
- old = READ_ONCE(sca->utility);
new = old;
new.mtcr = val;
- } while (cmpxchg(&sca->utility.val, old.val, new.val) != old.val);
- read_unlock(&kvm->arch.sca_lock);
+ } while (!try_cmpxchg(&sca->utility.val, &old.val, new.val));
}
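With a single, always-extended SCA there is no lock around the utility word any more; try_cmpxchg() refreshes the expected value on failure, so the retry loop needs no explicit re-read. A standalone illustration of the pattern:

/* Set one flag bit in a word atomically, without a lock (generic sketch,
 * not the SCA type used above). */
static void set_flag_atomically(unsigned long *word, unsigned long flag)
{
	unsigned long old = READ_ONCE(*word), new;

	do {
		new = old | flag;
	} while (!try_cmpxchg(word, &old, new));
}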
static int kvm_s390_set_topo_change_indication(struct kvm *kvm,
@@ -1850,9 +1972,7 @@ static int kvm_s390_get_topo_change_indication(struct kvm *kvm,
if (!test_kvm_facility(kvm, 11))
return -ENXIO;
- read_lock(&kvm->arch.sca_lock);
- topo = ((struct bsca_block *)kvm->arch.sca)->utility.mtcr;
- read_unlock(&kvm->arch.sca_lock);
+ topo = kvm->arch.sca->utility.mtcr;
return put_user(topo, (u8 __user *)attr->addr);
}
@@ -1954,6 +2074,8 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
case KVM_S390_VM_CPU_MACHINE_FEAT:
case KVM_S390_VM_CPU_MACHINE_SUBFUNC:
case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
+ case KVM_S390_VM_CPU_MACHINE_UV_FEAT_GUEST:
+ case KVM_S390_VM_CPU_PROCESSOR_UV_FEAT_GUEST:
ret = 0;
break;
default:
@@ -1992,7 +2114,7 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
return ret;
}
-static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
+static int kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
{
uint8_t *keys;
uint64_t hva;
@@ -2040,7 +2162,7 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
return r;
}
-static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
+static int kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
{
uint8_t *keys;
uint64_t hva;
@@ -2158,6 +2280,10 @@ static unsigned long kvm_s390_next_dirty_cmma(struct kvm_memslots *slots,
ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]);
ofs = 0;
}
+
+ if (cur_gfn < ms->base_gfn)
+ ofs = 0;
+
ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, ofs);
while (ofs >= ms->npages && (mnode = rb_next(mnode))) {
ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]);
@@ -2404,7 +2530,7 @@ static int kvm_s390_cpus_to_pv(struct kvm *kvm, u16 *rc, u16 *rrc)
struct kvm_vcpu *vcpu;
/* Disable the GISA if the ultravisor does not support AIV. */
- if (!test_bit_inv(BIT_UV_FEAT_AIV, &uv_info.uv_feature_indications))
+ if (!uv_has_feature(BIT_UV_FEAT_AIV))
kvm_s390_gisa_disable(kvm);
kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -2544,17 +2670,9 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
if (kvm_s390_pv_is_protected(kvm))
break;
- /*
- * FMT 4 SIE needs esca. As we never switch back to bsca from
- * esca, we need no cleanup in the error cases below
- */
- r = sca_switch_to_extended(kvm);
- if (r)
- break;
-
- mmap_write_lock(current->mm);
- r = gmap_mark_unmergeable();
- mmap_write_unlock(current->mm);
+ mmap_write_lock(kvm->mm);
+ r = gmap_helper_disable_cow_sharing();
+ mmap_write_unlock(kvm->mm);
if (r)
break;
@@ -2764,41 +2882,33 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
return r;
}
-static bool access_key_invalid(u8 access_key)
+static int mem_op_validate_common(struct kvm_s390_mem_op *mop, u64 supported_flags)
{
- return access_key > 0xf;
-}
-
-static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop)
-{
- void __user *uaddr = (void __user *)mop->buf;
- u64 supported_flags;
- void *tmpbuf = NULL;
- int r, srcu_idx;
-
- supported_flags = KVM_S390_MEMOP_F_SKEY_PROTECTION
- | KVM_S390_MEMOP_F_CHECK_ONLY;
if (mop->flags & ~supported_flags || !mop->size)
return -EINVAL;
if (mop->size > MEM_OP_MAX_SIZE)
return -E2BIG;
- /*
- * This is technically a heuristic only, if the kvm->lock is not
- * taken, it is not guaranteed that the vm is/remains non-protected.
- * This is ok from a kernel perspective, wrongdoing is detected
- * on the access, -EFAULT is returned and the vm may crash the
- * next time it accesses the memory in question.
- * There is no sane usecase to do switching and a memop on two
- * different CPUs at the same time.
- */
- if (kvm_s390_pv_get_handle(kvm))
- return -EINVAL;
if (mop->flags & KVM_S390_MEMOP_F_SKEY_PROTECTION) {
- if (access_key_invalid(mop->key))
+ if (mop->key > 0xf)
return -EINVAL;
} else {
mop->key = 0;
}
+ return 0;
+}
+
+static int kvm_s390_vm_mem_op_abs(struct kvm *kvm, struct kvm_s390_mem_op *mop)
+{
+ void __user *uaddr = (void __user *)mop->buf;
+ enum gacc_mode acc_mode;
+ void *tmpbuf = NULL;
+ int r, srcu_idx;
+
+ r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_SKEY_PROTECTION |
+ KVM_S390_MEMOP_F_CHECK_ONLY);
+ if (r)
+ return r;
+
if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) {
tmpbuf = vmalloc(mop->size);
if (!tmpbuf)
@@ -2807,40 +2917,30 @@ static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop)
srcu_idx = srcu_read_lock(&kvm->srcu);
- if (kvm_is_error_gpa(kvm, mop->gaddr)) {
+ if (!kvm_is_gpa_in_memslot(kvm, mop->gaddr)) {
r = PGM_ADDRESSING;
goto out_unlock;
}
- switch (mop->op) {
- case KVM_S390_MEMOP_ABSOLUTE_READ: {
- if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
- r = check_gpa_range(kvm, mop->gaddr, mop->size, GACC_FETCH, mop->key);
- } else {
- r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf,
- mop->size, GACC_FETCH, mop->key);
- if (r == 0) {
- if (copy_to_user(uaddr, tmpbuf, mop->size))
- r = -EFAULT;
- }
- }
- break;
+ acc_mode = mop->op == KVM_S390_MEMOP_ABSOLUTE_READ ? GACC_FETCH : GACC_STORE;
+ if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
+ r = check_gpa_range(kvm, mop->gaddr, mop->size, acc_mode, mop->key);
+ goto out_unlock;
}
- case KVM_S390_MEMOP_ABSOLUTE_WRITE: {
- if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
- r = check_gpa_range(kvm, mop->gaddr, mop->size, GACC_STORE, mop->key);
- } else {
- if (copy_from_user(tmpbuf, uaddr, mop->size)) {
- r = -EFAULT;
- break;
- }
- r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf,
- mop->size, GACC_STORE, mop->key);
+ if (acc_mode == GACC_FETCH) {
+ r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf,
+ mop->size, GACC_FETCH, mop->key);
+ if (r)
+ goto out_unlock;
+ if (copy_to_user(uaddr, tmpbuf, mop->size))
+ r = -EFAULT;
+ } else {
+ if (copy_from_user(tmpbuf, uaddr, mop->size)) {
+ r = -EFAULT;
+ goto out_unlock;
}
- break;
- }
- default:
- r = -EINVAL;
+ r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf,
+ mop->size, GACC_STORE, mop->key);
}
out_unlock:
@@ -2850,8 +2950,76 @@ out_unlock:
return r;
}
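The absolute read/write handlers above are driven from userspace through the VM-scoped KVM_S390_MEM_OP ioctl. A hypothetical caller, assuming <linux/kvm.h>, <sys/ioctl.h>, <stdint.h> and an open VM file descriptor vm_fd:

/* Read guest absolute memory into a local buffer (hypothetical helper). */
static int read_guest_absolute(int vm_fd, uint64_t gaddr, void *data, uint32_t len)
{
	struct kvm_s390_mem_op mop = {
		.op    = KVM_S390_MEMOP_ABSOLUTE_READ,
		.gaddr = gaddr,
		.size  = len,
		.buf   = (uint64_t)(uintptr_t)data,
	};

	return ioctl(vm_fd, KVM_S390_MEM_OP, &mop);
}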
-long kvm_arch_vm_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
+static int kvm_s390_vm_mem_op_cmpxchg(struct kvm *kvm, struct kvm_s390_mem_op *mop)
+{
+ void __user *uaddr = (void __user *)mop->buf;
+ void __user *old_addr = (void __user *)mop->old_addr;
+ union {
+ __uint128_t quad;
+ char raw[sizeof(__uint128_t)];
+ } old = { .quad = 0}, new = { .quad = 0 };
+ unsigned int off_in_quad = sizeof(new) - mop->size;
+ int r, srcu_idx;
+ bool success;
+
+ r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_SKEY_PROTECTION);
+ if (r)
+ return r;
+ /*
+ * This validates off_in_quad. Checking that size is a power
+ * of two is not necessary, as cmpxchg_guest_abs_with_key
+ * takes care of that
+ */
+ if (mop->size > sizeof(new))
+ return -EINVAL;
+ if (copy_from_user(&new.raw[off_in_quad], uaddr, mop->size))
+ return -EFAULT;
+ if (copy_from_user(&old.raw[off_in_quad], old_addr, mop->size))
+ return -EFAULT;
+
+ srcu_idx = srcu_read_lock(&kvm->srcu);
+
+ if (!kvm_is_gpa_in_memslot(kvm, mop->gaddr)) {
+ r = PGM_ADDRESSING;
+ goto out_unlock;
+ }
+
+ r = cmpxchg_guest_abs_with_key(kvm, mop->gaddr, mop->size, &old.quad,
+ new.quad, mop->key, &success);
+ if (!success && copy_to_user(old_addr, &old.raw[off_in_quad], mop->size))
+ r = -EFAULT;
+
+out_unlock:
+ srcu_read_unlock(&kvm->srcu, srcu_idx);
+ return r;
+}
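The cmpxchg variant always performs the compare-and-swap on a full 16-byte quad; on big-endian s390 a smaller operand therefore has to be right-aligned inside that quad, which is what the off_in_quad computation above expresses. A minimal sketch of the placement:

/* Place an n-byte operand into the low-order (rightmost) bytes of a zeroed
 * 16-byte quad, as done for both "old" and "new" above (sketch only). */
union quad {
	__uint128_t val;
	unsigned char raw[sizeof(__uint128_t)];
};

static void place_operand(union quad *q, const void *src, unsigned int size)
{
	unsigned int off_in_quad = sizeof(*q) - size;	/* e.g. 8-byte op -> offset 8 */

	memset(q, 0, sizeof(*q));
	memcpy(&q->raw[off_in_quad], src, size);
}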
+
+static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop)
+{
+ /*
+ * This is technically a heuristic only, if the kvm->lock is not
+ * taken, it is not guaranteed that the vm is/remains non-protected.
+ * This is ok from a kernel perspective, wrongdoing is detected
+ * on the access, -EFAULT is returned and the vm may crash the
+ * next time it accesses the memory in question.
+ * There is no sane usecase to do switching and a memop on two
+ * different CPUs at the same time.
+ */
+ if (kvm_s390_pv_get_handle(kvm))
+ return -EINVAL;
+
+ switch (mop->op) {
+ case KVM_S390_MEMOP_ABSOLUTE_READ:
+ case KVM_S390_MEMOP_ABSOLUTE_WRITE:
+ return kvm_s390_vm_mem_op_abs(kvm, mop);
+ case KVM_S390_MEMOP_ABSOLUTE_CMPXCHG:
+ return kvm_s390_vm_mem_op_cmpxchg(kvm, mop);
+ default:
+ return -EINVAL;
+ }
+}
+
+int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
struct kvm *kvm = filp->private_data;
void __user *argp = (void __user *)arg;
@@ -2869,14 +3037,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
break;
}
case KVM_CREATE_IRQCHIP: {
- struct kvm_irq_routing_entry routing;
-
r = -EINVAL;
- if (kvm->arch.use_irqchip) {
- /* Set up dummy routing. */
- memset(&routing, 0, sizeof(routing));
- r = kvm_set_irq_routing(kvm, &routing, 0, 0);
- }
+ if (kvm->arch.use_irqchip)
+ r = 0;
break;
}
case KVM_SET_DEVICE_ATTR: {
@@ -3024,7 +3187,7 @@ static int kvm_s390_apxa_installed(void)
*/
static void kvm_s390_set_crycb_format(struct kvm *kvm)
{
- kvm->arch.crypto.crycbd = (__u32)(unsigned long) kvm->arch.crypto.crycb;
+ kvm->arch.crypto.crycbd = virt_to_phys(kvm->arch.crypto.crycb);
/* Clear the CRYCB format bits - i.e., set format 0 by default */
kvm->arch.crypto.crycbd &= ~(CRYCB_FORMAT_MASK);
@@ -3149,10 +3312,7 @@ static void kvm_s390_crypto_init(struct kvm *kvm)
static void sca_dispose(struct kvm *kvm)
{
- if (kvm->arch.use_esca)
- free_pages_exact(kvm->arch.sca, sizeof(struct esca_block));
- else
- free_page((unsigned long)(kvm->arch.sca));
+ free_pages_exact(kvm->arch.sca, sizeof(*kvm->arch.sca));
kvm->arch.sca = NULL;
}
@@ -3166,10 +3326,9 @@ void kvm_arch_free_vm(struct kvm *kvm)
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
- gfp_t alloc_flags = GFP_KERNEL_ACCOUNT;
- int i, rc;
+ gfp_t alloc_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO;
char debug_name[16];
- static unsigned long sca_offset;
+ int i, rc;
rc = -EINVAL;
#ifdef CONFIG_KVM_S390_UCONTROL
@@ -3190,20 +3349,14 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
if (!sclp.has_64bscao)
alloc_flags |= GFP_DMA;
- rwlock_init(&kvm->arch.sca_lock);
- /* start with basic SCA */
- kvm->arch.sca = (struct bsca_block *) get_zeroed_page(alloc_flags);
- if (!kvm->arch.sca)
- goto out_err;
mutex_lock(&kvm_lock);
- sca_offset += 16;
- if (sca_offset + sizeof(struct bsca_block) > PAGE_SIZE)
- sca_offset = 0;
- kvm->arch.sca = (struct bsca_block *)
- ((char *) kvm->arch.sca + sca_offset);
+
+ kvm->arch.sca = alloc_pages_exact(sizeof(*kvm->arch.sca), alloc_flags);
mutex_unlock(&kvm_lock);
+ if (!kvm->arch.sca)
+ goto out_err;
- sprintf(debug_name, "kvm-%u", current->pid);
+ snprintf(debug_name, sizeof(debug_name), "kvm-%u", current->pid);
kvm->arch.dbf = debug_register(debug_name, 32, 1, 7 * sizeof(long));
if (!kvm->arch.dbf)
@@ -3233,7 +3386,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
/* we emulate STHYI in kvm */
set_kvm_facility(kvm->arch.model.fac_mask, 74);
set_kvm_facility(kvm->arch.model.fac_list, 74);
- if (MACHINE_HAS_TLB_GUEST) {
+ if (machine_has_tlb_guest()) {
set_kvm_facility(kvm->arch.model.fac_mask, 147);
set_kvm_facility(kvm->arch.model.fac_list, 147);
}
@@ -3244,6 +3397,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm->arch.model.cpuid = kvm_s390_get_initial_cpuid();
kvm->arch.model.ibc = sclp.ibc & 0x0fff;
+ kvm->arch.model.uv_feat_guest.feat = 0;
+
kvm_s390_crypto_init(kvm);
if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) {
@@ -3264,8 +3419,20 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
VM_EVENT(kvm, 3, "vm created with type %lu", type);
if (type & KVM_VM_S390_UCONTROL) {
+ struct kvm_userspace_memory_region2 fake_memslot = {
+ .slot = KVM_S390_UCONTROL_MEMSLOT,
+ .guest_phys_addr = 0,
+ .userspace_addr = 0,
+ .memory_size = ALIGN_DOWN(TASK_SIZE, _SEGMENT_SIZE),
+ .flags = 0,
+ };
+
kvm->arch.gmap = NULL;
kvm->arch.mem_limit = KVM_S390_NO_MEM_LIMIT;
+ /* one flat fake memslot covering the whole address-space */
+ mutex_lock(&kvm->slots_lock);
+ KVM_BUG_ON(kvm_set_internal_memslot(kvm, &fake_memslot), kvm);
+ mutex_unlock(&kvm->slots_lock);
} else {
if (sclp.hamax == U64_MAX)
kvm->arch.mem_limit = TASK_SIZE_MAX;
@@ -3287,7 +3454,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm_s390_gisa_init(kvm);
INIT_LIST_HEAD(&kvm->arch.pv.need_cleanup);
kvm->arch.pv.set_aside = NULL;
- KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid);
+ KVM_EVENT(3, "vm 0x%p created by pid %u", kvm, current->pid);
return 0;
out_err:
@@ -3350,7 +3517,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kvm_s390_destroy_adapters(kvm);
kvm_s390_clear_float_irqs(kvm);
kvm_s390_vsie_destroy(kvm);
- KVM_EVENT(3, "vm 0x%pK destroyed", kvm);
+ KVM_EVENT(3, "vm 0x%p destroyed", kvm);
}
/* Section: vcpu related */
@@ -3366,133 +3533,38 @@ static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu)
static void sca_del_vcpu(struct kvm_vcpu *vcpu)
{
+ struct esca_block *sca = vcpu->kvm->arch.sca;
+
if (!kvm_s390_use_sca_entries())
return;
- read_lock(&vcpu->kvm->arch.sca_lock);
- if (vcpu->kvm->arch.use_esca) {
- struct esca_block *sca = vcpu->kvm->arch.sca;
- clear_bit_inv(vcpu->vcpu_id, (unsigned long *) sca->mcn);
- sca->cpu[vcpu->vcpu_id].sda = 0;
- } else {
- struct bsca_block *sca = vcpu->kvm->arch.sca;
-
- clear_bit_inv(vcpu->vcpu_id, (unsigned long *) &sca->mcn);
- sca->cpu[vcpu->vcpu_id].sda = 0;
- }
- read_unlock(&vcpu->kvm->arch.sca_lock);
+ clear_bit_inv(vcpu->vcpu_id, (unsigned long *)sca->mcn);
+ sca->cpu[vcpu->vcpu_id].sda = 0;
}
static void sca_add_vcpu(struct kvm_vcpu *vcpu)
{
- if (!kvm_s390_use_sca_entries()) {
- phys_addr_t sca_phys = virt_to_phys(vcpu->kvm->arch.sca);
-
- /* we still need the basic sca for the ipte control */
- vcpu->arch.sie_block->scaoh = sca_phys >> 32;
- vcpu->arch.sie_block->scaol = sca_phys;
- return;
- }
- read_lock(&vcpu->kvm->arch.sca_lock);
- if (vcpu->kvm->arch.use_esca) {
- struct esca_block *sca = vcpu->kvm->arch.sca;
- phys_addr_t sca_phys = virt_to_phys(sca);
-
- sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block);
- vcpu->arch.sie_block->scaoh = sca_phys >> 32;
- vcpu->arch.sie_block->scaol = sca_phys & ESCA_SCAOL_MASK;
- vcpu->arch.sie_block->ecb2 |= ECB2_ESCA;
- set_bit_inv(vcpu->vcpu_id, (unsigned long *) sca->mcn);
- } else {
- struct bsca_block *sca = vcpu->kvm->arch.sca;
- phys_addr_t sca_phys = virt_to_phys(sca);
-
- sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block);
- vcpu->arch.sie_block->scaoh = sca_phys >> 32;
- vcpu->arch.sie_block->scaol = sca_phys;
- set_bit_inv(vcpu->vcpu_id, (unsigned long *) &sca->mcn);
- }
- read_unlock(&vcpu->kvm->arch.sca_lock);
-}
-
-/* Basic SCA to Extended SCA data copy routines */
-static inline void sca_copy_entry(struct esca_entry *d, struct bsca_entry *s)
-{
- d->sda = s->sda;
- d->sigp_ctrl.c = s->sigp_ctrl.c;
- d->sigp_ctrl.scn = s->sigp_ctrl.scn;
-}
-
-static void sca_copy_b_to_e(struct esca_block *d, struct bsca_block *s)
-{
- int i;
-
- d->ipte_control = s->ipte_control;
- d->mcn[0] = s->mcn;
- for (i = 0; i < KVM_S390_BSCA_CPU_SLOTS; i++)
- sca_copy_entry(&d->cpu[i], &s->cpu[i]);
-}
-
-static int sca_switch_to_extended(struct kvm *kvm)
-{
- struct bsca_block *old_sca = kvm->arch.sca;
- struct esca_block *new_sca;
- struct kvm_vcpu *vcpu;
- unsigned long vcpu_idx;
- u32 scaol, scaoh;
- phys_addr_t new_sca_phys;
-
- if (kvm->arch.use_esca)
- return 0;
-
- new_sca = alloc_pages_exact(sizeof(*new_sca), GFP_KERNEL_ACCOUNT | __GFP_ZERO);
- if (!new_sca)
- return -ENOMEM;
+ struct esca_block *sca = vcpu->kvm->arch.sca;
+ phys_addr_t sca_phys = virt_to_phys(sca);
- new_sca_phys = virt_to_phys(new_sca);
- scaoh = new_sca_phys >> 32;
- scaol = new_sca_phys & ESCA_SCAOL_MASK;
+ /* we still need the sca header for the ipte control */
+ vcpu->arch.sie_block->scaoh = sca_phys >> 32;
+ vcpu->arch.sie_block->scaol = sca_phys & ESCA_SCAOL_MASK;
+ vcpu->arch.sie_block->ecb2 |= ECB2_ESCA;
- kvm_s390_vcpu_block_all(kvm);
- write_lock(&kvm->arch.sca_lock);
-
- sca_copy_b_to_e(new_sca, old_sca);
-
- kvm_for_each_vcpu(vcpu_idx, vcpu, kvm) {
- vcpu->arch.sie_block->scaoh = scaoh;
- vcpu->arch.sie_block->scaol = scaol;
- vcpu->arch.sie_block->ecb2 |= ECB2_ESCA;
- }
- kvm->arch.sca = new_sca;
- kvm->arch.use_esca = 1;
-
- write_unlock(&kvm->arch.sca_lock);
- kvm_s390_vcpu_unblock_all(kvm);
-
- free_page((unsigned long)old_sca);
+ if (!kvm_s390_use_sca_entries())
+ return;
- VM_EVENT(kvm, 2, "Switched to ESCA (0x%pK -> 0x%pK)",
- old_sca, kvm->arch.sca);
- return 0;
+ set_bit_inv(vcpu->vcpu_id, (unsigned long *)sca->mcn);
+ sca->cpu[vcpu->vcpu_id].sda = virt_to_phys(vcpu->arch.sie_block);
}
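The MCN mask in the SCA numbers its bits MSB-first, as the hardware does, which is why the code above uses the *_inv bit helpers. Their effect is roughly the following mapping onto the kernel's LSB-first bit numbering (sketch, the real s390 helpers may differ in detail):

static inline void set_bit_inv_sketch(unsigned long nr, volatile unsigned long *ptr)
{
	/* hardware bit 0 == most significant bit of the word */
	set_bit(nr ^ (BITS_PER_LONG - 1), ptr);
}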
static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id)
{
- int rc;
-
- if (!kvm_s390_use_sca_entries()) {
- if (id < KVM_MAX_VCPUS)
- return true;
- return false;
- }
- if (id < KVM_S390_BSCA_CPU_SLOTS)
- return true;
- if (!sclp.has_esca || !sclp.has_64bscao)
- return false;
-
- rc = kvm->arch.use_esca ? 0 : sca_switch_to_extended(kvm);
+ if (!kvm_s390_use_sca_entries())
+ return id < KVM_MAX_VCPUS;
- return rc == 0 && id < KVM_S390_ESCA_CPU_SLOTS;
+ return id < KVM_S390_ESCA_CPU_SLOTS;
}
/* needs disabled preemption to protect from TOD sync and vcpu_load/put */
@@ -3585,7 +3657,6 @@ __u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
- gmap_enable(vcpu->arch.enabled_gmap);
kvm_s390_set_cpuflags(vcpu, CPUSTAT_RUNNING);
if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
__start_cpu_timer_accounting(vcpu);
@@ -3598,8 +3669,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
__stop_cpu_timer_accounting(vcpu);
kvm_s390_clear_cpuflags(vcpu, CPUSTAT_RUNNING);
- vcpu->arch.enabled_gmap = gmap_get_enabled();
- gmap_disable(vcpu->arch.enabled_gmap);
}
@@ -3617,8 +3686,6 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
}
if (test_kvm_facility(vcpu->kvm, 74) || vcpu->kvm->arch.user_instr0)
vcpu->arch.sie_block->ictl |= ICTL_OPEREXC;
- /* make vcpu_load load the right gmap on the first trigger */
- vcpu->arch.enabled_gmap = vcpu->arch.gmap;
}
static bool kvm_has_pckmo_subfunc(struct kvm *kvm, unsigned long nr)
@@ -3640,6 +3707,13 @@ static bool kvm_has_pckmo_ecc(struct kvm *kvm)
}
+static bool kvm_has_pckmo_hmac(struct kvm *kvm)
+{
+ /* At least one HMAC subfunction must be present */
+ return kvm_has_pckmo_subfunc(kvm, 118) ||
+ kvm_has_pckmo_subfunc(kvm, 122);
+}
+
static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu)
{
/*
@@ -3652,7 +3726,7 @@ static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu)
vcpu->arch.sie_block->crycbd = vcpu->kvm->arch.crypto.crycbd;
vcpu->arch.sie_block->ecb3 &= ~(ECB3_AES | ECB3_DEA);
vcpu->arch.sie_block->eca &= ~ECA_APIE;
- vcpu->arch.sie_block->ecd &= ~ECD_ECC;
+ vcpu->arch.sie_block->ecd &= ~(ECD_ECC | ECD_HMAC);
if (vcpu->kvm->arch.crypto.apie)
vcpu->arch.sie_block->eca |= ECA_APIE;
@@ -3660,9 +3734,11 @@ static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu)
/* Set up protected key support */
if (vcpu->kvm->arch.crypto.aes_kw) {
vcpu->arch.sie_block->ecb3 |= ECB3_AES;
- /* ecc is also wrapped with AES key */
+ /* ecc/hmac is also wrapped with AES key */
if (kvm_has_pckmo_ecc(vcpu->kvm))
vcpu->arch.sie_block->ecd |= ECD_ECC;
+ if (kvm_has_pckmo_hmac(vcpu->kvm))
+ vcpu->arch.sie_block->ecd |= ECD_HMAC;
}
if (vcpu->kvm->arch.crypto.dea_kw)
@@ -3711,8 +3787,8 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu)
kvm_s390_vcpu_setup_model(vcpu);
- /* pgste_set_pte has special handling for !MACHINE_HAS_ESOP */
- if (MACHINE_HAS_ESOP)
+ /* pgste_set_pte has special handling for !machine_has_esop() */
+ if (machine_has_esop())
vcpu->arch.sie_block->ecb |= ECB_HOSTPROTINT;
if (test_kvm_facility(vcpu->kvm, 9))
vcpu->arch.sie_block->ecb |= ECB_SRSI;
@@ -3734,7 +3810,7 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu)
vcpu->arch.sie_block->eca |= ECA_IB;
if (sclp.has_siif)
vcpu->arch.sie_block->eca |= ECA_SII;
- if (sclp.has_sigpif)
+ if (kvm_s390_use_sca_entries())
vcpu->arch.sie_block->eca |= ECA_SIGPI;
if (test_kvm_facility(vcpu->kvm, 129)) {
vcpu->arch.sie_block->eca |= ECA_VX;
@@ -3762,8 +3838,8 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu)
if (rc)
return rc;
}
- hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- vcpu->arch.ckc_timer.function = kvm_s390_idle_wakeup;
+ hrtimer_setup(&vcpu->arch.ckc_timer, kvm_s390_idle_wakeup, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL);
vcpu->arch.sie_block->hpid = HPID_KVM;
@@ -3820,6 +3896,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
KVM_SYNC_ARCH0 |
KVM_SYNC_PFAULT |
KVM_SYNC_DIAG318;
+ vcpu->arch.acrs_loaded = false;
kvm_s390_set_prefix(vcpu, 0);
if (test_kvm_facility(vcpu->kvm, 64))
vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB;
@@ -3830,9 +3907,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
if (test_kvm_facility(vcpu->kvm, 156))
vcpu->run->kvm_valid_regs |= KVM_SYNC_ETOKEN;
/* fprs can be synchronized via vrs, even if the guest has no vx. With
- * MACHINE_HAS_VX, (load|store)_fpu_regs() will work with vrs format.
+ * cpu_has_vx(), (load|store)_fpu_regs() will work with vrs format.
*/
- if (MACHINE_HAS_VX)
+ if (cpu_has_vx())
vcpu->run->kvm_valid_regs |= KVM_SYNC_VRS;
else
vcpu->run->kvm_valid_regs |= KVM_SYNC_FPRS;
@@ -3843,7 +3920,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
goto out_free_sie_block;
}
- VM_EVENT(vcpu->kvm, 3, "create cpu %d at 0x%pK, sie block at 0x%pK",
+ VM_EVENT(vcpu->kvm, 3, "create cpu %d at 0x%p, sie block at 0x%p",
vcpu->vcpu_id, vcpu, vcpu->arch.sie_block);
trace_kvm_s390_create_vcpu(vcpu->vcpu_id, vcpu, vcpu->arch.sie_block);
@@ -3928,6 +4005,8 @@ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
unsigned long prefix;
unsigned long i;
+ trace_kvm_s390_gmap_notifier(start, end, gmap_is_shadow(gmap));
+
if (gmap_is_shadow(gmap))
return;
if (start >= 1UL << 31)
@@ -3947,7 +4026,7 @@ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
{
/* do not poll with more than halt_poll_max_steal percent of steal time */
- if (S390_lowcore.avg_steal_timer * 100 / (TICK_USEC << 12) >=
+ if (get_lowcore()->avg_steal_timer * 100 / (TICK_USEC << 12) >=
READ_ONCE(halt_poll_max_steal)) {
vcpu->stat.halt_no_poll_steal++;
return true;
@@ -4109,7 +4188,7 @@ static void kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu)
vcpu->run->s.regs.fpc = 0;
/*
* Do not reset these registers in the protected case, as some of
- * them are overlayed and they are not accessible in this case
+ * them are overlaid and they are not accessible in this case
* anyway.
*/
if (!kvm_s390_pv_cpu_is_protected(vcpu)) {
@@ -4178,33 +4257,24 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
- int ret = 0;
-
vcpu_load(vcpu);
- if (test_fp_ctl(fpu->fpc)) {
- ret = -EINVAL;
- goto out;
- }
vcpu->run->s.regs.fpc = fpu->fpc;
- if (MACHINE_HAS_VX)
+ if (cpu_has_vx())
convert_fp_to_vx((__vector128 *) vcpu->run->s.regs.vrs,
(freg_t *) fpu->fprs);
else
memcpy(vcpu->run->s.regs.fprs, &fpu->fprs, sizeof(fpu->fprs));
-out:
vcpu_put(vcpu);
- return ret;
+ return 0;
}
int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
vcpu_load(vcpu);
- /* make sure we have the latest values */
- save_fpu_regs();
- if (MACHINE_HAS_VX)
+ if (cpu_has_vx())
convert_vx_to_fp((freg_t *) fpu->fprs,
(__vector128 *) vcpu->run->s.regs.vrs);
else
@@ -4334,6 +4404,75 @@ static bool ibs_enabled(struct kvm_vcpu *vcpu)
return kvm_s390_test_cpuflags(vcpu, CPUSTAT_IBS);
}
+static int __kvm_s390_fixup_fault_sync(struct gmap *gmap, gpa_t gaddr, unsigned int flags)
+{
+ struct kvm *kvm = gmap->private;
+ gfn_t gfn = gpa_to_gfn(gaddr);
+ bool unlocked;
+ hva_t vmaddr;
+ gpa_t tmp;
+ int rc;
+
+ if (kvm_is_ucontrol(kvm)) {
+ tmp = __gmap_translate(gmap, gaddr);
+ gfn = gpa_to_gfn(tmp);
+ }
+
+ vmaddr = gfn_to_hva(kvm, gfn);
+ rc = fixup_user_fault(gmap->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked);
+ if (!rc)
+ rc = __gmap_link(gmap, gaddr, vmaddr);
+ return rc;
+}
+
+/**
+ * __kvm_s390_mprotect_many() - Apply specified protection to guest pages
+ * @gmap: the gmap of the guest
+ * @gpa: the starting guest address
+ * @npages: how many pages to protect
+ * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ * @bits: pgste notification bits to set
+ *
+ * Returns: 0 in case of success, < 0 in case of error - see gmap_protect_one()
+ *
+ * Context: kvm->srcu and gmap->mm need to be held in read mode
+ */
+int __kvm_s390_mprotect_many(struct gmap *gmap, gpa_t gpa, u8 npages, unsigned int prot,
+ unsigned long bits)
+{
+ unsigned int fault_flag = (prot & PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
+ gpa_t end = gpa + npages * PAGE_SIZE;
+ int rc;
+
+ for (; gpa < end; gpa = ALIGN(gpa + 1, rc)) {
+ rc = gmap_protect_one(gmap, gpa, prot, bits);
+ if (rc == -EAGAIN) {
+ __kvm_s390_fixup_fault_sync(gmap, gpa, fault_flag);
+ rc = gmap_protect_one(gmap, gpa, prot, bits);
+ }
+ if (rc < 0)
+ return rc;
+ }
+
+ return 0;
+}
+
+static int kvm_s390_mprotect_notify_prefix(struct kvm_vcpu *vcpu)
+{
+ gpa_t gaddr = kvm_s390_get_prefix(vcpu);
+ int idx, rc;
+
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ mmap_read_lock(vcpu->arch.gmap->mm);
+
+ rc = __kvm_s390_mprotect_many(vcpu->arch.gmap, gaddr, 2, PROT_WRITE, GMAP_NOTIFY_MPROT);
+
+ mmap_read_unlock(vcpu->arch.gmap->mm);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+ return rc;
+}
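Protecting the two prefix pages with GMAP_NOTIFY_MPROT means that any later invalidation of those host mappings fires the gmap notifier, which re-queues the refresh request before the vCPU re-enters SIE. The consumer side, simplified from kvm_gmap_notifier elsewhere in this file, looks roughly like:

static void notify_prefix_overlap_sketch(struct kvm *kvm, unsigned long start,
					 unsigned long end)
{
	struct kvm_vcpu *vcpu;
	unsigned long prefix;
	unsigned long i;

	kvm_for_each_vcpu(i, vcpu, kvm) {
		prefix = kvm_s390_get_prefix(vcpu);
		/* the prefix area spans two pages: [prefix, prefix + 2 * PAGE_SIZE) */
		if (prefix <= end && start <= prefix + 2 * PAGE_SIZE - 1)
			kvm_s390_sync_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu);
	}
}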
+
static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu)
{
retry:
@@ -4349,9 +4488,8 @@ retry:
*/
if (kvm_check_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu)) {
int rc;
- rc = gmap_mprotect_notify(vcpu->arch.gmap,
- kvm_s390_get_prefix(vcpu),
- PAGE_SIZE * 2, PROT_WRITE);
+
+ rc = kvm_s390_mprotect_notify_prefix(vcpu);
if (rc) {
kvm_make_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu);
return rc;
@@ -4449,22 +4587,6 @@ int kvm_s390_try_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clo
return 1;
}
-/**
- * kvm_arch_fault_in_page - fault-in guest page if necessary
- * @vcpu: The corresponding virtual cpu
- * @gpa: Guest physical address
- * @writable: Whether the page should be writable or not
- *
- * Make sure that a guest page has been faulted-in on the host.
- *
- * Return: Zero on success, negative error code otherwise.
- */
-long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable)
-{
- return gmap_fault(vcpu->arch.gmap, gpa,
- writable ? FAULT_FLAG_WRITE : 0);
-}
-
static void __kvm_inject_pfault_token(struct kvm_vcpu *vcpu, bool start_token,
unsigned long token)
{
@@ -4532,12 +4654,11 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu)
if (!vcpu->arch.gmap->pfault_enabled)
return false;
- hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(current->thread.gmap_addr));
- hva += current->thread.gmap_addr & ~PAGE_MASK;
+ hva = gfn_to_hva(vcpu->kvm, current->thread.gmap_teid.addr);
if (read_guest_real(vcpu, vcpu->arch.pfault_token, &arch.pfault_token, 8))
return false;
- return kvm_setup_async_pf(vcpu, current->thread.gmap_addr, hva, &arch);
+ return kvm_setup_async_pf(vcpu, current->thread.gmap_teid.addr * PAGE_SIZE, hva, &arch);
}
static int vcpu_pre_run(struct kvm_vcpu *vcpu)
@@ -4554,12 +4675,9 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu)
vcpu->arch.sie_block->gg14 = vcpu->run->s.regs.gprs[14];
vcpu->arch.sie_block->gg15 = vcpu->run->s.regs.gprs[15];
- if (need_resched())
- schedule();
-
if (!kvm_is_ucontrol(vcpu->kvm)) {
rc = kvm_s390_deliver_pending_interrupts(vcpu);
- if (rc)
+ if (rc || guestdbg_exit_pending(vcpu))
return rc;
}
@@ -4575,6 +4693,7 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu)
clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.gisa_int.kicked_mask);
vcpu->arch.sie_block->icptcode = 0;
+ current->thread.gmap_int_code = 0;
cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags);
VCPU_EVENT(vcpu, 6, "entering sie flags %x", cpuflags);
trace_kvm_s390_sie_enter(vcpu, cpuflags);
@@ -4582,7 +4701,7 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu)
return 0;
}
-static int vcpu_post_run_fault_in_sie(struct kvm_vcpu *vcpu)
+static int vcpu_post_run_addressing_exception(struct kvm_vcpu *vcpu)
{
struct kvm_s390_pgm_info pgm_info = {
.code = PGM_ADDRESSING,
@@ -4618,10 +4737,182 @@ static int vcpu_post_run_fault_in_sie(struct kvm_vcpu *vcpu)
return kvm_s390_inject_prog_irq(vcpu, &pgm_info);
}
+static void kvm_s390_assert_primary_as(struct kvm_vcpu *vcpu)
+{
+ KVM_BUG(current->thread.gmap_teid.as != PSW_BITS_AS_PRIMARY, vcpu->kvm,
+ "Unexpected program interrupt 0x%x, TEID 0x%016lx",
+ current->thread.gmap_int_code, current->thread.gmap_teid.val);
+}
+
+/*
+ * __kvm_s390_handle_dat_fault() - handle a dat fault for the gmap of a vcpu
+ * @vcpu: the vCPU whose gmap is to be fixed up
+ * @gfn: the guest frame number used for memslots (including fake memslots)
+ * @gaddr: the gmap address, does not have to match @gfn for ucontrol gmaps
+ * @foll: FOLL_* flags
+ *
+ * Return: 0 on success, < 0 in case of error.
+ * Context: The mm lock must not be held before calling. May sleep.
+ */
+int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int foll)
+{
+ struct kvm_memory_slot *slot;
+ unsigned int fault_flags;
+ bool writable, unlocked;
+ unsigned long vmaddr;
+ struct page *page;
+ kvm_pfn_t pfn;
+ int rc;
+
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
+ return vcpu_post_run_addressing_exception(vcpu);
+
+ fault_flags = foll & FOLL_WRITE ? FAULT_FLAG_WRITE : 0;
+ if (vcpu->arch.gmap->pfault_enabled)
+ foll |= FOLL_NOWAIT;
+ vmaddr = __gfn_to_hva_memslot(slot, gfn);
+
+try_again:
+ pfn = __kvm_faultin_pfn(slot, gfn, foll, &writable, &page);
+
+ /* Access outside memory, inject addressing exception */
+ if (is_noslot_pfn(pfn))
+ return vcpu_post_run_addressing_exception(vcpu);
+ /* Signal pending: try again */
+ if (pfn == KVM_PFN_ERR_SIGPENDING)
+ return -EAGAIN;
+
+ /* Needs I/O, try to setup async pfault (only possible with FOLL_NOWAIT) */
+ if (pfn == KVM_PFN_ERR_NEEDS_IO) {
+ trace_kvm_s390_major_guest_pfault(vcpu);
+ if (kvm_arch_setup_async_pf(vcpu))
+ return 0;
+ vcpu->stat.pfault_sync++;
+ /* Could not setup async pfault, try again synchronously */
+ foll &= ~FOLL_NOWAIT;
+ goto try_again;
+ }
+ /* Any other error */
+ if (is_error_pfn(pfn))
+ return -EFAULT;
+
+ /* Success */
+ mmap_read_lock(vcpu->arch.gmap->mm);
+ /* Mark the userspace PTEs as young and/or dirty, to avoid page fault loops */
+ rc = fixup_user_fault(vcpu->arch.gmap->mm, vmaddr, fault_flags, &unlocked);
+ if (!rc)
+ rc = __gmap_link(vcpu->arch.gmap, gaddr, vmaddr);
+ scoped_guard(spinlock, &vcpu->kvm->mmu_lock) {
+ kvm_release_faultin_page(vcpu->kvm, page, false, writable);
+ }
+ mmap_read_unlock(vcpu->arch.gmap->mm);
+ return rc;
+}
+
+static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, unsigned long gaddr, unsigned int foll)
+{
+ unsigned long gaddr_tmp;
+ gfn_t gfn;
+
+ gfn = gpa_to_gfn(gaddr);
+ if (kvm_is_ucontrol(vcpu->kvm)) {
+ /*
+ * This translates the per-vCPU guest address into a
+ * fake guest address, which can then be used with the
+ * fake memslots that are identity mapping userspace.
+ * This allows ucontrol VMs to use the normal fault
+ * resolution path, like normal VMs.
+ */
+ mmap_read_lock(vcpu->arch.gmap->mm);
+ gaddr_tmp = __gmap_translate(vcpu->arch.gmap, gaddr);
+ mmap_read_unlock(vcpu->arch.gmap->mm);
+ if (gaddr_tmp == -EFAULT) {
+ vcpu->run->exit_reason = KVM_EXIT_S390_UCONTROL;
+ vcpu->run->s390_ucontrol.trans_exc_code = gaddr;
+ vcpu->run->s390_ucontrol.pgm_code = PGM_SEGMENT_TRANSLATION;
+ return -EREMOTE;
+ }
+ gfn = gpa_to_gfn(gaddr_tmp);
+ }
+ return __kvm_s390_handle_dat_fault(vcpu, gfn, gaddr, foll);
+}
+
+static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu)
+{
+ unsigned int foll = 0;
+ unsigned long gaddr;
+ int rc;
+
+ gaddr = current->thread.gmap_teid.addr * PAGE_SIZE;
+ if (kvm_s390_cur_gmap_fault_is_write())
+ foll = FOLL_WRITE;
+
+ switch (current->thread.gmap_int_code & PGM_INT_CODE_MASK) {
+ case 0:
+ vcpu->stat.exit_null++;
+ break;
+ case PGM_SECURE_STORAGE_ACCESS:
+ case PGM_SECURE_STORAGE_VIOLATION:
+ kvm_s390_assert_primary_as(vcpu);
+ /*
+ * This can happen after a reboot with asynchronous teardown;
+ * the new guest (normal or protected) will run on top of the
+ * previous protected guest. The old pages need to be destroyed
+ * so the new guest can use them.
+ */
+ if (kvm_s390_pv_destroy_page(vcpu->kvm, gaddr)) {
+ /*
+ * Either KVM messed up the secure guest mapping or the
+ * same page is mapped into multiple secure guests.
+ *
+ * This exception is only triggered when a guest 2 is
+ * running and can therefore never occur in kernel
+ * context.
+ */
+ pr_warn_ratelimited("Secure storage violation (%x) in task: %s, pid %d\n",
+ current->thread.gmap_int_code, current->comm,
+ current->pid);
+ send_sig(SIGSEGV, current, 0);
+ }
+ break;
+ case PGM_NON_SECURE_STORAGE_ACCESS:
+ kvm_s390_assert_primary_as(vcpu);
+ /*
+ * This is normal operation; a page belonging to a protected
+ * guest has not been imported yet. Try to import the page into
+ * the protected guest.
+ */
+ rc = kvm_s390_pv_convert_to_secure(vcpu->kvm, gaddr);
+ if (rc == -EINVAL)
+ send_sig(SIGSEGV, current, 0);
+ if (rc != -ENXIO)
+ break;
+ foll = FOLL_WRITE;
+ fallthrough;
+ case PGM_PROTECTION:
+ case PGM_SEGMENT_TRANSLATION:
+ case PGM_PAGE_TRANSLATION:
+ case PGM_ASCE_TYPE:
+ case PGM_REGION_FIRST_TRANS:
+ case PGM_REGION_SECOND_TRANS:
+ case PGM_REGION_THIRD_TRANS:
+ kvm_s390_assert_primary_as(vcpu);
+ return vcpu_dat_fault_handler(vcpu, gaddr, foll);
+ default:
+ KVM_BUG(1, vcpu->kvm, "Unexpected program interrupt 0x%x, TEID 0x%016lx",
+ current->thread.gmap_int_code, current->thread.gmap_teid.val);
+ send_sig(SIGSEGV, current, 0);
+ break;
+ }
+ return 0;
+}
+
static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
{
struct mcck_volatile_info *mcck_info;
struct sie_page *sie_page;
+ int rc;
VCPU_EVENT(vcpu, 6, "exit sie icptcode %d",
vcpu->arch.sie_block->icptcode);
@@ -4643,7 +4934,7 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
}
if (vcpu->arch.sie_block->icptcode > 0) {
- int rc = kvm_handle_sie_intercept(vcpu);
+ rc = kvm_handle_sie_intercept(vcpu);
if (rc != -EOPNOTSUPP)
return rc;
@@ -4652,24 +4943,28 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
vcpu->run->s390_sieic.ipa = vcpu->arch.sie_block->ipa;
vcpu->run->s390_sieic.ipb = vcpu->arch.sie_block->ipb;
return -EREMOTE;
- } else if (exit_reason != -EFAULT) {
- vcpu->stat.exit_null++;
- return 0;
- } else if (kvm_is_ucontrol(vcpu->kvm)) {
- vcpu->run->exit_reason = KVM_EXIT_S390_UCONTROL;
- vcpu->run->s390_ucontrol.trans_exc_code =
- current->thread.gmap_addr;
- vcpu->run->s390_ucontrol.pgm_code = 0x10;
- return -EREMOTE;
- } else if (current->thread.gmap_pfault) {
- trace_kvm_s390_major_guest_pfault(vcpu);
- current->thread.gmap_pfault = 0;
- if (kvm_arch_setup_async_pf(vcpu))
- return 0;
- vcpu->stat.pfault_sync++;
- return kvm_arch_fault_in_page(vcpu, current->thread.gmap_addr, 1);
}
- return vcpu_post_run_fault_in_sie(vcpu);
+
+ return vcpu_post_run_handle_fault(vcpu);
+}
+
+int noinstr kvm_s390_enter_exit_sie(struct kvm_s390_sie_block *scb,
+ u64 *gprs, unsigned long gasce)
+{
+ int ret;
+
+ guest_state_enter_irqoff();
+
+ /*
+ * The guest_state_{enter,exit}_irqoff() functions inform lockdep and
+ * tracing that entry to the guest will enable host IRQs, and exit from
+ * the guest will disable host IRQs.
+ */
+ ret = sie64a(scb, gprs, gasce);
+
+ guest_state_exit_irqoff();
+
+ return ret;
}
#define PSW_INT_MASK (PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_MCHECK)
@@ -4684,29 +4979,45 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
*/
kvm_vcpu_srcu_read_lock(vcpu);
- do {
+ while (true) {
rc = vcpu_pre_run(vcpu);
- if (rc)
+ kvm_vcpu_srcu_read_unlock(vcpu);
+ if (rc || guestdbg_exit_pending(vcpu))
break;
- kvm_vcpu_srcu_read_unlock(vcpu);
/*
* As PF_VCPU will be used in fault handler, between
- * guest_enter and guest_exit should be no uaccess.
+ * guest_timing_enter_irqoff and guest_timing_exit_irqoff
+ * should be no uaccess.
*/
- local_irq_disable();
- guest_enter_irqoff();
- __disable_cpu_timer_accounting(vcpu);
- local_irq_enable();
if (kvm_s390_pv_cpu_is_protected(vcpu)) {
memcpy(sie_page->pv_grregs,
vcpu->run->s.regs.gprs,
sizeof(sie_page->pv_grregs));
}
- if (test_cpu_flag(CIF_FPU))
- load_fpu_regs();
- exit_reason = sie64a(vcpu->arch.sie_block,
- vcpu->run->s.regs.gprs);
+
+xfer_to_guest_mode_check:
+ local_irq_disable();
+ xfer_to_guest_mode_prepare();
+ if (xfer_to_guest_mode_work_pending()) {
+ local_irq_enable();
+ rc = kvm_xfer_to_guest_mode_handle_work(vcpu);
+ if (rc)
+ break;
+ goto xfer_to_guest_mode_check;
+ }
+
+ guest_timing_enter_irqoff();
+ __disable_cpu_timer_accounting(vcpu);
+
+ exit_reason = kvm_s390_enter_exit_sie(vcpu->arch.sie_block,
+ vcpu->run->s.regs.gprs,
+ vcpu->arch.gmap->asce);
+
+ __enable_cpu_timer_accounting(vcpu);
+ guest_timing_exit_irqoff();
+ local_irq_enable();
+
if (kvm_s390_pv_cpu_is_protected(vcpu)) {
memcpy(vcpu->run->s.regs.gprs,
sie_page->pv_grregs,
@@ -4722,16 +5033,15 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
vcpu->arch.sie_block->gpsw.mask &= ~PSW_INT_MASK;
}
}
- local_irq_disable();
- __enable_cpu_timer_accounting(vcpu);
- guest_exit_irqoff();
- local_irq_enable();
kvm_vcpu_srcu_read_lock(vcpu);
rc = vcpu_post_run(vcpu, exit_reason);
- } while (!signal_pending(current) && !guestdbg_exit_pending(vcpu) && !rc);
+ if (rc || guestdbg_exit_pending(vcpu)) {
+ kvm_vcpu_srcu_read_unlock(vcpu);
+ break;
+ }
+ }
- kvm_vcpu_srcu_read_unlock(vcpu);
return rc;
}
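The TIF-work check before entering SIE has to be done with interrupts disabled: a signal or rescheduling request arriving between the check and the actual entry would otherwise go unnoticed until the next intercept. A condensed sketch of the sequence implemented above (not the actual function, error paths trimmed):

static int enter_guest_sketch(struct kvm_vcpu *vcpu)
{
	int rc;

	for (;;) {
		local_irq_disable();
		xfer_to_guest_mode_prepare();
		if (!xfer_to_guest_mode_work_pending())
			break;
		local_irq_enable();
		rc = kvm_xfer_to_guest_mode_handle_work(vcpu);
		if (rc)
			return rc;	/* back out to userspace */
	}
	guest_timing_enter_irqoff();
	rc = kvm_s390_enter_exit_sie(vcpu->arch.sie_block, vcpu->run->s.regs.gprs,
				     vcpu->arch.gmap->asce);
	guest_timing_exit_irqoff();
	local_irq_enable();
	return rc;
}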
@@ -4791,9 +5101,9 @@ static void sync_regs_fmt2(struct kvm_vcpu *vcpu)
vcpu->arch.sie_block->fpf &= ~FPF_BPBC;
vcpu->arch.sie_block->fpf |= kvm_run->s.regs.bpbc ? FPF_BPBC : 0;
}
- if (MACHINE_HAS_GS) {
+ if (cpu_has_gs()) {
preempt_disable();
- __ctl_set_bit(2, 4);
+ local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT);
if (current->thread.gs_cb) {
vcpu->arch.host_gscb = current->thread.gs_cb;
save_gs_cb(vcpu->arch.host_gscb);
@@ -4825,19 +5135,8 @@ static void sync_regs(struct kvm_vcpu *vcpu)
}
save_access_regs(vcpu->arch.host_acrs);
restore_access_regs(vcpu->run->s.regs.acrs);
- /* save host (userspace) fprs/vrs */
- save_fpu_regs();
- vcpu->arch.host_fpregs.fpc = current->thread.fpu.fpc;
- vcpu->arch.host_fpregs.regs = current->thread.fpu.regs;
- if (MACHINE_HAS_VX)
- current->thread.fpu.regs = vcpu->run->s.regs.vrs;
- else
- current->thread.fpu.regs = vcpu->run->s.regs.fprs;
- current->thread.fpu.fpc = vcpu->run->s.regs.fpc;
- if (test_fp_ctl(current->thread.fpu.fpc))
- /* User space provided an invalid FPC, let's clear it */
- current->thread.fpu.fpc = 0;
-
+ vcpu->arch.acrs_loaded = true;
+ kvm_s390_fpu_load(vcpu->run);
/* Sync fmt2 only data */
if (likely(!kvm_s390_pv_cpu_is_protected(vcpu))) {
sync_regs_fmt2(vcpu);
@@ -4868,15 +5167,15 @@ static void store_regs_fmt2(struct kvm_vcpu *vcpu)
kvm_run->s.regs.gbea = vcpu->arch.sie_block->gbea;
kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC;
kvm_run->s.regs.diag318 = vcpu->arch.diag318_info.val;
- if (MACHINE_HAS_GS) {
+ if (cpu_has_gs()) {
preempt_disable();
- __ctl_set_bit(2, 4);
+ local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT);
if (vcpu->arch.gs_enabled)
save_gs_cb(current->thread.gs_cb);
current->thread.gs_cb = vcpu->arch.host_gscb;
restore_gs_cb(vcpu->arch.host_gscb);
if (!vcpu->arch.host_gscb)
- __ctl_clear_bit(2, 4);
+ local_ctl_clear_bit(2, CR2_GUARDED_STORAGE_BIT);
vcpu->arch.host_gscb = NULL;
preempt_enable();
}
@@ -4898,12 +5197,8 @@ static void store_regs(struct kvm_vcpu *vcpu)
kvm_run->s.regs.pfc = vcpu->arch.pfault_compare;
save_access_regs(vcpu->run->s.regs.acrs);
restore_access_regs(vcpu->arch.host_acrs);
- /* Save guest register state */
- save_fpu_regs();
- vcpu->run->s.regs.fpc = current->thread.fpu.fpc;
- /* Restore will be done lazily at return */
- current->thread.fpu.fpc = vcpu->arch.host_fpregs.fpc;
- current->thread.fpu.regs = vcpu->arch.host_fpregs.regs;
+ vcpu->arch.acrs_loaded = false;
+ kvm_s390_fpu_store(vcpu->run);
if (likely(!kvm_s390_pv_cpu_is_protected(vcpu)))
store_regs_fmt2(vcpu);
}
@@ -4911,6 +5206,7 @@ static void store_regs(struct kvm_vcpu *vcpu)
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
struct kvm_run *kvm_run = vcpu->run;
+ DECLARE_KERNEL_FPU_ONSTACK32(fpu);
int rc;
/*
@@ -4922,7 +5218,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
if (vcpu->kvm->arch.pv.dumping)
return -EINVAL;
- if (kvm_run->immediate_exit)
+ if (!vcpu->wants_to_run)
return -EINTR;
if (kvm_run->kvm_valid_regs & ~KVM_SYNC_S390_VALID_FIELDS ||
@@ -4952,6 +5248,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
goto out;
}
+ kernel_fpu_begin(&fpu, KERNEL_FPC | KERNEL_VXR);
sync_regs(vcpu);
enable_cpu_timer_accounting(vcpu);
@@ -4960,6 +5257,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
if (signal_pending(current) && !rc) {
kvm_run->exit_reason = KVM_EXIT_INTR;
+ vcpu->stat.signal_exits++;
rc = -EINTR;
}
@@ -4975,6 +5273,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
disable_cpu_timer_accounting(vcpu);
store_regs(vcpu);
+ kernel_fpu_end(&fpu, KERNEL_FPC | KERNEL_VXR);
kvm_sigset_deactivate(vcpu);
@@ -5011,7 +5310,7 @@ int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long gpa)
gpa -= __LC_FPREGS_SAVE_AREA;
/* manually convert vector registers if necessary */
- if (MACHINE_HAS_VX) {
+ if (cpu_has_vx()) {
convert_vx_to_fp(fprs, (__vector128 *) vcpu->run->s.regs.vrs);
rc = write_guest_abs(vcpu, gpa + __LC_FPREGS_SAVE_AREA,
fprs, 128);
@@ -5049,8 +5348,7 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
* switch in the run ioctl. Let's update our copies before we save
* it into the save area
*/
- save_fpu_regs();
- vcpu->run->s.regs.fpc = current->thread.fpu.fpc;
+ kvm_s390_fpu_store(vcpu->run);
save_access_regs(vcpu->run->s.regs.acrs);
return kvm_s390_store_status_unloaded(vcpu, addr);
@@ -5249,62 +5547,54 @@ static long kvm_s390_vcpu_mem_op(struct kvm_vcpu *vcpu,
struct kvm_s390_mem_op *mop)
{
void __user *uaddr = (void __user *)mop->buf;
+ enum gacc_mode acc_mode;
void *tmpbuf = NULL;
- int r = 0;
- const u64 supported_flags = KVM_S390_MEMOP_F_INJECT_EXCEPTION
- | KVM_S390_MEMOP_F_CHECK_ONLY
- | KVM_S390_MEMOP_F_SKEY_PROTECTION;
+ int r;
- if (mop->flags & ~supported_flags || mop->ar >= NUM_ACRS || !mop->size)
+ r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_INJECT_EXCEPTION |
+ KVM_S390_MEMOP_F_CHECK_ONLY |
+ KVM_S390_MEMOP_F_SKEY_PROTECTION);
+ if (r)
+ return r;
+ if (mop->ar >= NUM_ACRS)
return -EINVAL;
- if (mop->size > MEM_OP_MAX_SIZE)
- return -E2BIG;
if (kvm_s390_pv_cpu_is_protected(vcpu))
return -EINVAL;
- if (mop->flags & KVM_S390_MEMOP_F_SKEY_PROTECTION) {
- if (access_key_invalid(mop->key))
- return -EINVAL;
- } else {
- mop->key = 0;
- }
if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) {
tmpbuf = vmalloc(mop->size);
if (!tmpbuf)
return -ENOMEM;
}
- switch (mop->op) {
- case KVM_S390_MEMOP_LOGICAL_READ:
- if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
- r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size,
- GACC_FETCH, mop->key);
- break;
- }
+ acc_mode = mop->op == KVM_S390_MEMOP_LOGICAL_READ ? GACC_FETCH : GACC_STORE;
+ if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
+ r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size,
+ acc_mode, mop->key);
+ goto out_inject;
+ }
+ if (acc_mode == GACC_FETCH) {
r = read_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf,
mop->size, mop->key);
- if (r == 0) {
- if (copy_to_user(uaddr, tmpbuf, mop->size))
- r = -EFAULT;
- }
- break;
- case KVM_S390_MEMOP_LOGICAL_WRITE:
- if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
- r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size,
- GACC_STORE, mop->key);
- break;
+ if (r)
+ goto out_inject;
+ if (copy_to_user(uaddr, tmpbuf, mop->size)) {
+ r = -EFAULT;
+ goto out_free;
}
+ } else {
if (copy_from_user(tmpbuf, uaddr, mop->size)) {
r = -EFAULT;
- break;
+ goto out_free;
}
r = write_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf,
mop->size, mop->key);
- break;
}
+out_inject:
if (r > 0 && (mop->flags & KVM_S390_MEMOP_F_INJECT_EXCEPTION) != 0)
kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
+out_free:
vfree(tmpbuf);
return r;
}
@@ -5334,11 +5624,12 @@ static long kvm_s390_vcpu_memsida_op(struct kvm_vcpu *vcpu,
return r;
}
-long kvm_arch_vcpu_async_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
+long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg)
{
struct kvm_vcpu *vcpu = filp->private_data;
void __user *argp = (void __user *)arg;
+ int rc;
switch (ioctl) {
case KVM_S390_IRQ: {
@@ -5346,7 +5637,8 @@ long kvm_arch_vcpu_async_ioctl(struct file *filp,
if (copy_from_user(&s390irq, argp, sizeof(s390irq)))
return -EFAULT;
- return kvm_s390_inject_vcpu(vcpu, &s390irq);
+ rc = kvm_s390_inject_vcpu(vcpu, &s390irq);
+ break;
}
case KVM_S390_INTERRUPT: {
struct kvm_s390_interrupt s390int;
@@ -5356,10 +5648,25 @@ long kvm_arch_vcpu_async_ioctl(struct file *filp,
return -EFAULT;
if (s390int_to_s390irq(&s390int, &s390irq))
return -EINVAL;
- return kvm_s390_inject_vcpu(vcpu, &s390irq);
+ rc = kvm_s390_inject_vcpu(vcpu, &s390irq);
+ break;
}
+ default:
+ rc = -ENOIOCTLCMD;
+ break;
}
- return -ENOIOCTLCMD;
+
+ /*
+ * To simplify single stepping of userspace-emulated instructions,
+ * KVM_EXIT_S390_SIEIC exit sets KVM_GUESTDBG_EXIT_PENDING (see
+ * should_handle_per_ifetch()). However, if userspace emulation injects
+ * an interrupt, the flag needs to be cleared so that KVM_EXIT_DEBUG happens
+ * after (and not before) the interrupt delivery.
+ */
+ if (!rc)
+ vcpu->guest_debug &= ~KVM_GUESTDBG_EXIT_PENDING;
+
+ return rc;
}
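As a side note, the injection path served by this unlocked ioctl can be driven from userspace with a plain ioctl() on the vCPU file descriptor. The sketch below is illustrative only and not part of the patch; vcpu_fd is assumed to be an already-created vCPU fd, and the SIGP emergency signal is just one arbitrary interrupt type.

#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

/* Hedged userspace sketch: inject a SIGP emergency signal into a vCPU.
 * The request is routed through kvm_arch_vcpu_unlocked_ioctl() above and
 * ends up in kvm_s390_inject_vcpu().
 */
static int inject_emergency_signal(int vcpu_fd, unsigned short src_cpu_addr)
{
	struct kvm_s390_irq irq;

	memset(&irq, 0, sizeof(irq));
	irq.type = KVM_S390_INT_EMERGENCY;
	irq.u.emerg.code = src_cpu_addr;	/* address of the signalling CPU */

	return ioctl(vcpu_fd, KVM_S390_IRQ, &irq);
}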
static int kvm_s390_handle_pv_vcpu_dump(struct kvm_vcpu *vcpu,
@@ -5513,7 +5820,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
}
#endif
case KVM_S390_VCPU_FAULT: {
- r = gmap_fault(vcpu->arch.gmap, arg, 0);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ r = vcpu_dat_fault_handler(vcpu, arg, 0);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
break;
}
case KVM_ENABLE_CAP:
@@ -5629,27 +5938,47 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
{
gpa_t size;
+ if (kvm_is_ucontrol(kvm) && new->id < KVM_USER_MEM_SLOTS)
+ return -EINVAL;
+
/* When we are protected, we should not change the memory slots */
if (kvm_s390_pv_get_handle(kvm))
return -EINVAL;
- if (change == KVM_MR_DELETE || change == KVM_MR_FLAGS_ONLY)
- return 0;
+ if (change != KVM_MR_DELETE && change != KVM_MR_FLAGS_ONLY) {
+ /*
+ * A few sanity checks. Memory slots have to start and end at a segment
+ * boundary (1MB). The memory backing them in userland may be fragmented
+ * into various different vmas, and it is okay to mmap() and munmap()
+ * memory in this slot at any time after this call.
+ */
- /* A few sanity checks. We can have memory slots which have to be
- located/ended at a segment boundary (1MB). The memory in userland is
- ok to be fragmented into various different vmas. It is okay to mmap()
- and munmap() stuff in this slot after doing this call at any time */
+ if (new->userspace_addr & 0xffffful)
+ return -EINVAL;
- if (new->userspace_addr & 0xffffful)
- return -EINVAL;
+ size = new->npages * PAGE_SIZE;
+ if (size & 0xffffful)
+ return -EINVAL;
- size = new->npages * PAGE_SIZE;
- if (size & 0xffffful)
- return -EINVAL;
+ if ((new->base_gfn * PAGE_SIZE) + size > kvm->arch.mem_limit)
+ return -EINVAL;
+ }
- if ((new->base_gfn * PAGE_SIZE) + size > kvm->arch.mem_limit)
- return -EINVAL;
+ if (!kvm->arch.migration_mode)
+ return 0;
+
+ /*
+ * Turn off migration mode when:
+ * - userspace creates a new memslot with dirty logging off,
+ * - userspace modifies an existing memslot (MOVE or FLAGS_ONLY) and
+ * dirty logging is turned off.
+ * Migration mode expects dirty page logging to be enabled so that it
+ * can store its dirty bitmap.
+ */
+ if (change != KVM_MR_DELETE &&
+ !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
+ WARN(kvm_s390_vm_stop_migration(kvm),
+ "Failed to stop migration mode");
return 0;
}
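To make the alignment rules above concrete, here is a minimal userspace sketch of creating a memslot that satisfies them. It is not part of the patch; vm_fd is assumed to be an existing VM file descriptor, error handling is reduced to the bare minimum, and only the userspace_addr and memory_size alignment actually checked above is strictly enforced.

#include <linux/kvm.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

#define SEG_SIZE (1UL << 20)	/* 1 MB segment */

/* Illustrative only: register guest memory at gpa with a segment-aligned
 * host mapping, as required by kvm_arch_prepare_memory_region() above.
 */
static int add_segment_aligned_slot(int vm_fd, uint64_t gpa, uint64_t size)
{
	struct kvm_userspace_memory_region region;
	uint64_t hva;
	void *raw;

	/* Over-allocate so a 1 MB-aligned host address can be carved out. */
	raw = mmap(NULL, size + SEG_SIZE, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (raw == MAP_FAILED)
		return -1;
	hva = ((uintptr_t)raw + SEG_SIZE - 1) & ~(SEG_SIZE - 1);

	region.slot = 0;
	region.flags = 0;
	region.guest_phys_addr = gpa;	/* kept segment-aligned as well */
	region.memory_size = size;	/* must be a multiple of 1 MB */
	region.userspace_addr = hva;	/* must be 1 MB aligned */
	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}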
@@ -5661,6 +5990,9 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
{
int rc = 0;
+ if (kvm_is_ucontrol(kvm))
+ return;
+
switch (change) {
case KVM_MR_DELETE:
rc = gmap_unmap_segment(kvm->arch.gmap, old->base_gfn * PAGE_SIZE,
@@ -5696,7 +6028,7 @@ static inline unsigned long nonhyp_mask(int i)
static int __init kvm_s390_init(void)
{
- int i;
+ int i, r;
if (!sclp.has_sief2) {
pr_info("SIE is not available\n");
@@ -5712,12 +6044,23 @@ static int __init kvm_s390_init(void)
kvm_s390_fac_base[i] |=
stfle_fac_list[i] & nonhyp_mask(i);
- return kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
+ r = __kvm_s390_init();
+ if (r)
+ return r;
+
+ r = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE);
+ if (r) {
+ __kvm_s390_exit();
+ return r;
+ }
+ return 0;
}
static void __exit kvm_s390_exit(void)
{
kvm_exit();
+
+ __kvm_s390_exit();
}
module_init(kvm_s390_init);
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index d48588c207d8..65c950760993 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -20,6 +20,26 @@
#include <asm/processor.h>
#include <asm/sclp.h>
+#define KVM_S390_UCONTROL_MEMSLOT (KVM_USER_MEM_SLOTS + 0)
+
+static inline void kvm_s390_fpu_store(struct kvm_run *run)
+{
+ fpu_stfpc(&run->s.regs.fpc);
+ if (cpu_has_vx())
+ save_vx_regs((__vector128 *)&run->s.regs.vrs);
+ else
+ save_fp_regs((freg_t *)&run->s.regs.fprs);
+}
+
+static inline void kvm_s390_fpu_load(struct kvm_run *run)
+{
+ fpu_lfpc_safe(&run->s.regs.fpc);
+ if (cpu_has_vx())
+ load_vx_regs((__vector128 *)&run->s.regs.vrs);
+ else
+ load_fp_regs((freg_t *)&run->s.regs.fprs);
+}
+
/* Transactional Memory Execution related macros */
#define IS_TE_ENABLED(vcpu) ((vcpu->arch.sie_block->ecb & ECB_TE))
#define TDB_FORMAT1 1
@@ -120,6 +140,21 @@ static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu, u8 *ar)
return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + disp2;
}
+static inline u64 kvm_s390_get_base_disp_siy(struct kvm_vcpu *vcpu, u8 *ar)
+{
+ u32 base1 = vcpu->arch.sie_block->ipb >> 28;
+ s64 disp1;
+
+ /* The displacement is a 20-bit _SIGNED_ value */
+ disp1 = sign_extend64(((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16) +
+ ((vcpu->arch.sie_block->ipb & 0xff00) << 4), 19);
+
+ if (ar)
+ *ar = base1;
+
+ return (base1 ? vcpu->run->s.regs.gprs[base1] : 0) + disp1;
+}
+
static inline void kvm_s390_get_base_disp_sse(struct kvm_vcpu *vcpu,
u64 *address1, u64 *address2,
u8 *ar_b1, u8 *ar_b2)
@@ -234,13 +269,27 @@ static inline unsigned long kvm_s390_get_gfn_end(struct kvm_memslots *slots)
static inline u32 kvm_s390_get_gisa_desc(struct kvm *kvm)
{
- u32 gd = virt_to_phys(kvm->arch.gisa_int.origin);
+ u32 gd;
+
+ if (!kvm->arch.gisa_int.origin)
+ return 0;
+
+ gd = virt_to_phys(kvm->arch.gisa_int.origin);
if (gd && sclp.has_gisaf)
gd |= GISA_FORMAT1;
return gd;
}
+static inline hva_t gpa_to_hva(struct kvm *kvm, gpa_t gpa)
+{
+ hva_t hva = gfn_to_hva(kvm, gpa_to_gfn(gpa));
+
+ if (!kvm_is_error_hva(hva))
+ hva |= offset_in_page(gpa);
+ return hva;
+}
+
/* implemented in pv.c */
int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc);
int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc);
@@ -259,6 +308,9 @@ int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user,
u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc);
int kvm_s390_pv_dump_complete(struct kvm *kvm, void __user *buff_user,
u16 *rc, u16 *rrc);
+int kvm_s390_pv_destroy_page(struct kvm *kvm, unsigned long gaddr);
+int kvm_s390_pv_convert_to_secure(struct kvm *kvm, unsigned long gaddr);
+int kvm_s390_pv_make_secure(struct kvm *kvm, unsigned long gaddr, void *uvcb);
static inline u64 kvm_s390_pv_get_handle(struct kvm *kvm)
{
@@ -270,16 +322,39 @@ static inline u64 kvm_s390_pv_cpu_get_handle(struct kvm_vcpu *vcpu)
return vcpu->arch.pv.handle;
}
-static inline bool kvm_s390_pv_is_protected(struct kvm *kvm)
+/**
+ * __kvm_s390_pv_destroy_page() - Destroy a guest page.
+ * @page: the page to destroy
+ *
+ * An attempt will be made to destroy the given guest page. If the attempt
+ * fails, an attempt is made to export the page. If both attempts fail, an
+ * appropriate error is returned.
+ *
+ * Context: must be called holding the mm lock for gmap->mm
+ */
+static inline int __kvm_s390_pv_destroy_page(struct page *page)
{
- lockdep_assert_held(&kvm->lock);
- return !!kvm_s390_pv_get_handle(kvm);
-}
+ struct folio *folio = page_folio(page);
+ int rc;
-static inline bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu)
-{
- lockdep_assert_held(&vcpu->mutex);
- return !!kvm_s390_pv_cpu_get_handle(vcpu);
+ /* Large folios cannot be secure. Small folio implies FW_LEVEL_PTE. */
+ if (folio_test_large(folio))
+ return -EFAULT;
+
+ rc = uv_destroy_folio(folio);
+ /*
+ * Fault handlers can race; it is possible that two CPUs will fault
+ * on the same secure page. One CPU can destroy the page, reboot,
+ * re-enter secure mode and import it, while the second CPU was
+ * stuck at the beginning of the handler. At some point the second
+ * CPU will be able to progress, and it will not be able to destroy
+ * the page. In that case we do not want to terminate the process,
+ * we instead try to export the page.
+ */
+ if (rc)
+ rc = uv_convert_from_secure_folio(folio);
+
+ return rc;
}
/* implemented in interrupt.c */
@@ -361,6 +436,10 @@ void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
unsigned long end);
void kvm_s390_vsie_init(struct kvm *kvm);
void kvm_s390_vsie_destroy(struct kvm *kvm);
+int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level);
+
+/* implemented in gmap-vsie.c */
+struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level);
/* implemented in sigp.c */
int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
@@ -368,7 +447,6 @@ int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu);
/* implemented in kvm-s390.c */
int kvm_s390_try_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod);
-long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable);
int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr);
int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr);
int kvm_s390_vcpu_start(struct kvm_vcpu *vcpu);
@@ -383,6 +461,14 @@ void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu);
void kvm_s390_set_cpu_timer(struct kvm_vcpu *vcpu, __u64 cputm);
__u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu);
int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc);
+int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int flags);
+int __kvm_s390_mprotect_many(struct gmap *gmap, gpa_t gpa, u8 npages, unsigned int prot,
+ unsigned long bits);
+
+static inline int kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gpa_t gaddr, unsigned int flags)
+{
+ return __kvm_s390_handle_dat_fault(vcpu, gpa_to_gfn(gaddr), gaddr, flags);
+}
/* implemented in diag.c */
int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
@@ -470,7 +556,7 @@ void kvm_s390_gisa_clear(struct kvm *kvm);
void kvm_s390_gisa_destroy(struct kvm *kvm);
void kvm_s390_gisa_disable(struct kvm *kvm);
void kvm_s390_gisa_enable(struct kvm *kvm);
-int kvm_s390_gib_init(u8 nisc);
+int __init kvm_s390_gib_init(u8 nisc);
void kvm_s390_gib_destroy(void);
/* implemented in guestdbg.c */
@@ -484,13 +570,6 @@ void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu);
int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu);
int kvm_s390_handle_per_event(struct kvm_vcpu *vcpu);
-/* support for Basic/Extended SCA handling */
-static inline union ipte_control *kvm_s390_get_ipte_control(struct kvm *kvm)
-{
- struct bsca_block *sca = kvm->arch.sca; /* SCA version doesn't matter */
-
- return &sca->ipte_control;
-}
static inline int kvm_s390_use_sca_entries(void)
{
/*
@@ -498,11 +577,18 @@ static inline int kvm_s390_use_sca_entries(void)
* might use the entries. By not setting the entries and keeping them
* invalid, hardware will not access them but intercept.
*/
- return sclp.has_sigpif;
+ return sclp.has_sigpif && sclp.has_esca;
}
void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu,
struct mcck_volatile_info *mcck_info);
+static inline bool kvm_s390_cur_gmap_fault_is_write(void)
+{
+ if (current->thread.gmap_int_code == PGM_PROTECTION)
+ return true;
+ return test_facility(75) && (current->thread.gmap_teid.fsi == TEID_FSI_STORE);
+}
+
/**
* kvm_s390_vcpu_crypto_reset_all
*
diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c
index ec51e810e381..8c40154ff50f 100644
--- a/arch/s390/kvm/pci.c
+++ b/arch/s390/kvm/pci.c
@@ -103,7 +103,7 @@ static int zpci_reset_aipb(u8 nisc)
/*
* AEN registration can only happen once per system boot. If
* an aipb already exists then AEN was already registered and
- * we can re-use the aipb contents. This can only happen if
+ * we can reuse the aipb contents. This can only happen if
* the KVM module was removed and re-inserted. However, we must
* ensure that the same forwarding ISC is used as this is assigned
* during KVM module load.
@@ -112,7 +112,7 @@ static int zpci_reset_aipb(u8 nisc)
return -EINVAL;
aift->sbv = zpci_aif_sbv;
- aift->gait = (struct zpci_gaite *)zpci_aipb->aipb.gait;
+ aift->gait = phys_to_virt(zpci_aipb->aipb.gait);
return 0;
}
@@ -208,13 +208,12 @@ static inline int account_mem(unsigned long nr_pages)
page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ cur_pages = atomic_long_read(&user->locked_vm);
do {
- cur_pages = atomic_long_read(&user->locked_vm);
new_pages = cur_pages + nr_pages;
if (new_pages > page_limit)
return -ENOMEM;
- } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
- new_pages) != cur_pages);
+ } while (!atomic_long_try_cmpxchg(&user->locked_vm, &cur_pages, new_pages));
atomic64_add(nr_pages, &current->mm->pinned_vm);
@@ -427,14 +426,13 @@ static void kvm_s390_pci_dev_release(struct zpci_dev *zdev)
/*
- * Register device with the specified KVM. If interpetation facilities are
+ * Register device with the specified KVM. If interpretation facilities are
* available, enable them and let userspace indicate whether or not they will
* be used (specify SHM bit to disable).
*/
static int kvm_s390_pci_register_kvm(void *opaque, struct kvm *kvm)
{
struct zpci_dev *zdev = opaque;
- u8 status;
int rc;
if (!zdev)
@@ -481,13 +479,7 @@ static int kvm_s390_pci_register_kvm(void *opaque, struct kvm *kvm)
*/
zdev->gisa = (u32)virt_to_phys(&kvm->arch.sie_page2->gisa);
- rc = zpci_enable_device(zdev);
- if (rc)
- goto clear_gisa;
-
- /* Re-register the IOMMU that was already created */
- rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
- virt_to_phys(zdev->dma_table), &status);
+ rc = zpci_reenable_device(zdev);
if (rc)
goto clear_gisa;
@@ -517,7 +509,6 @@ static void kvm_s390_pci_unregister_kvm(void *opaque)
{
struct zpci_dev *zdev = opaque;
struct kvm *kvm;
- u8 status;
if (!zdev)
return;
@@ -551,12 +542,7 @@ static void kvm_s390_pci_unregister_kvm(void *opaque)
goto out;
}
- if (zpci_enable_device(zdev))
- goto out;
-
- /* Re-register the IOMMU that was already created */
- zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
- virt_to_phys(zdev->dma_table), &status);
+ zpci_reenable_device(zdev);
out:
spin_lock(&kvm->arch.kzdev_list_lock);
@@ -672,7 +658,7 @@ out:
return r;
}
-int kvm_s390_pci_init(void)
+int __init kvm_s390_pci_init(void)
{
zpci_kvm_hook.kvm_register = kvm_s390_pci_register_kvm;
zpci_kvm_hook.kvm_unregister = kvm_s390_pci_unregister_kvm;
diff --git a/arch/s390/kvm/pci.h b/arch/s390/kvm/pci.h
index 486d06ef563f..ff0972dd5e71 100644
--- a/arch/s390/kvm/pci.h
+++ b/arch/s390/kvm/pci.h
@@ -60,7 +60,7 @@ void kvm_s390_pci_clear_list(struct kvm *kvm);
int kvm_s390_pci_zpci_op(struct kvm *kvm, struct kvm_s390_zpci_op *args);
-int kvm_s390_pci_init(void);
+int __init kvm_s390_pci_init(void);
void kvm_s390_pci_exit(void);
static inline bool kvm_s390_pci_interp_allowed(void)
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 9f8a192bd750..0b14d894f38a 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -13,7 +13,7 @@
#include <linux/errno.h>
#include <linux/mm_types.h>
#include <linux/pgtable.h>
-
+#include <linux/io.h>
#include <asm/asm-offsets.h>
#include <asm/facility.h>
#include <asm/current.h>
@@ -22,7 +22,6 @@
#include <asm/sysinfo.h>
#include <asm/page-states.h>
#include <asm/gmap.h>
-#include <asm/io.h>
#include <asm/ptrace.h>
#include <asm/sclp.h>
#include <asm/ap.h>
@@ -58,7 +57,7 @@ static int handle_gs(struct kvm_vcpu *vcpu)
if (test_kvm_facility(vcpu->kvm, 133)) {
VCPU_EVENT(vcpu, 3, "%s", "ENABLE: GS (lazy)");
preempt_disable();
- __ctl_set_bit(2, 4);
+ local_ctl_set_bit(2, CR2_GUARDED_STORAGE_BIT);
current->thread.gs_cb = (struct gs_cb *)&vcpu->run->s.regs.gscb;
restore_gs_cb(current->thread.gs_cb);
preempt_enable();
@@ -150,7 +149,7 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu)
* first page, since address is 8k aligned and memory pieces are always
* at least 1MB aligned and have at least a size of 1MB.
*/
- if (kvm_is_error_gpa(vcpu->kvm, address))
+ if (!kvm_is_gpa_in_memslot(vcpu->kvm, address))
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
kvm_s390_set_prefix(vcpu, address);
@@ -465,7 +464,7 @@ static int handle_test_block(struct kvm_vcpu *vcpu)
return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
addr = kvm_s390_real_to_abs(vcpu, addr);
- if (kvm_is_error_gpa(vcpu->kvm, addr))
+ if (!kvm_is_gpa_in_memslot(vcpu->kvm, addr))
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
/*
* We don't expect errors on modern systems, and do not care
@@ -606,6 +605,14 @@ static int handle_io_inst(struct kvm_vcpu *vcpu)
}
}
+#if IS_ENABLED(CONFIG_VFIO_AP)
+bool kvm_s390_is_gpa_in_memslot(struct kvm *kvm, gpa_t gpa)
+{
+ return kvm_is_gpa_in_memslot(kvm, gpa);
+}
+EXPORT_SYMBOL_FOR_MODULES(kvm_s390_is_gpa_in_memslot, "vfio_ap");
+#endif
+
/*
* handle_pqap: Handling pqap interception
* @vcpu: the vcpu that issued the pqap instruction
@@ -677,8 +684,12 @@ static int handle_pqap(struct kvm_vcpu *vcpu)
if (vcpu->kvm->arch.crypto.pqap_hook) {
pqap_hook = *vcpu->kvm->arch.crypto.pqap_hook;
ret = pqap_hook(vcpu);
- if (!ret && vcpu->run->s.regs.gprs[1] & 0x00ff0000)
- kvm_s390_set_psw_cc(vcpu, 3);
+ if (!ret) {
+ if (vcpu->run->s.regs.gprs[1] & 0x00ff0000)
+ kvm_s390_set_psw_cc(vcpu, 3);
+ else
+ kvm_s390_set_psw_cc(vcpu, 0);
+ }
up_read(&vcpu->kvm->arch.crypto.pqap_hook_rwsem);
return ret;
}
@@ -743,7 +754,7 @@ int is_valid_psw(psw_t *psw)
int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu)
{
psw_t *gpsw = &vcpu->arch.sie_block->gpsw;
- psw_compat_t new_psw;
+ psw32_t new_psw;
u64 addr;
int rc;
u8 ar;
@@ -794,6 +805,36 @@ static int handle_lpswe(struct kvm_vcpu *vcpu)
return 0;
}
+static int handle_lpswey(struct kvm_vcpu *vcpu)
+{
+ psw_t new_psw;
+ u64 addr;
+ int rc;
+ u8 ar;
+
+ vcpu->stat.instruction_lpswey++;
+
+ if (!test_kvm_facility(vcpu->kvm, 193))
+ return kvm_s390_inject_program_int(vcpu, PGM_OPERATION);
+
+ if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+ return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+
+ addr = kvm_s390_get_base_disp_siy(vcpu, &ar);
+ if (addr & 7)
+ return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+ rc = read_guest(vcpu, addr, ar, &new_psw, sizeof(new_psw));
+ if (rc)
+ return kvm_s390_inject_prog_cond(vcpu, rc);
+
+ vcpu->arch.sie_block->gpsw = new_psw;
+ if (!is_valid_psw(&vcpu->arch.sie_block->gpsw))
+ return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+ return 0;
+}
+
static int handle_stidp(struct kvm_vcpu *vcpu)
{
u64 stidp_data = vcpu->kvm->arch.model.cpuid;
@@ -1215,6 +1256,8 @@ static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc)
static int handle_essa(struct kvm_vcpu *vcpu)
{
+ lockdep_assert_held(&vcpu->kvm->srcu);
+
/* entries expected to be 1FF */
int entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3;
unsigned long *cbrlo;
@@ -1264,12 +1307,8 @@ static int handle_essa(struct kvm_vcpu *vcpu)
/* Retry the ESSA instruction */
kvm_s390_retry_instr(vcpu);
} else {
- int srcu_idx;
-
mmap_read_lock(vcpu->kvm->mm);
- srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
i = __do_essa(vcpu, orc);
- srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
mmap_read_unlock(vcpu->kvm->mm);
if (i < 0)
return i;
@@ -1459,6 +1498,8 @@ int kvm_s390_handle_eb(struct kvm_vcpu *vcpu)
case 0x61:
case 0x62:
return handle_ri(vcpu);
+ case 0x71:
+ return handle_lpswey(vcpu);
default:
return -EOPNOTSUPP;
}
diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
index e032ebbf51b9..6ba5a0305e25 100644
--- a/arch/s390/kvm/pv.c
+++ b/arch/s390/kvm/pv.c
@@ -5,6 +5,8 @@
* Copyright IBM Corp. 2019, 2020
* Author(s): Janosch Frank <frankja@linux.ibm.com>
*/
+
+#include <linux/export.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/minmax.h>
@@ -18,6 +20,78 @@
#include <linux/mmu_notifier.h>
#include "kvm-s390.h"
+bool kvm_s390_pv_is_protected(struct kvm *kvm)
+{
+ lockdep_assert_held(&kvm->lock);
+ return !!kvm_s390_pv_get_handle(kvm);
+}
+EXPORT_SYMBOL_GPL(kvm_s390_pv_is_protected);
+
+bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu)
+{
+ lockdep_assert_held(&vcpu->mutex);
+ return !!kvm_s390_pv_cpu_get_handle(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_s390_pv_cpu_is_protected);
+
+/**
+ * kvm_s390_pv_make_secure() - make one guest page secure
+ * @kvm: the guest
+ * @gaddr: the guest address that needs to be made secure
+ * @uvcb: the UVCB specifying which operation needs to be performed
+ *
+ * Context: needs to be called with kvm->srcu held.
+ * Return: 0 on success, < 0 in case of error.
+ */
+int kvm_s390_pv_make_secure(struct kvm *kvm, unsigned long gaddr, void *uvcb)
+{
+ unsigned long vmaddr;
+
+ lockdep_assert_held(&kvm->srcu);
+
+ vmaddr = gfn_to_hva(kvm, gpa_to_gfn(gaddr));
+ if (kvm_is_error_hva(vmaddr))
+ return -EFAULT;
+ return make_hva_secure(kvm->mm, vmaddr, uvcb);
+}
+
+int kvm_s390_pv_convert_to_secure(struct kvm *kvm, unsigned long gaddr)
+{
+ struct uv_cb_cts uvcb = {
+ .header.cmd = UVC_CMD_CONV_TO_SEC_STOR,
+ .header.len = sizeof(uvcb),
+ .guest_handle = kvm_s390_pv_get_handle(kvm),
+ .gaddr = gaddr,
+ };
+
+ return kvm_s390_pv_make_secure(kvm, gaddr, &uvcb);
+}
+
+/**
+ * kvm_s390_pv_destroy_page() - Destroy a guest page.
+ * @kvm: the guest
+ * @gaddr: the guest address to destroy
+ *
+ * An attempt will be made to destroy the given guest page. If the attempt
+ * fails, an attempt is made to export the page. If both attempts fail, an
+ * appropriate error is returned.
+ *
+ * Context: may sleep.
+ */
+int kvm_s390_pv_destroy_page(struct kvm *kvm, unsigned long gaddr)
+{
+ struct page *page;
+ int rc = 0;
+
+ mmap_read_lock(kvm->mm);
+ page = gfn_to_page(kvm, gpa_to_gfn(gaddr));
+ if (page)
+ rc = __kvm_s390_pv_destroy_page(page);
+ kvm_release_page_clean(page);
+ mmap_read_unlock(kvm->mm);
+ return rc;
+}
+
/**
* struct pv_vm_to_be_destroyed - Represents a protected VM that needs to
* be destroyed
@@ -271,9 +345,10 @@ static int kvm_s390_pv_deinit_vm_fast(struct kvm *kvm, u16 *rc, u16 *rrc)
WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM FAST: rc %x rrc %x",
uvcb.header.rc, uvcb.header.rrc);
- WARN_ONCE(cc, "protvirt destroy vm fast failed handle %llx rc %x rrc %x",
+ WARN_ONCE(cc && uvcb.header.rc != 0x104,
+ "protvirt destroy vm fast failed handle %llx rc %x rrc %x",
kvm_s390_pv_get_handle(kvm), uvcb.header.rc, uvcb.header.rrc);
- /* Inteded memory leak on "impossible" error */
+ /* Intended memory leak on "impossible" error */
if (!cc)
kvm_s390_pv_dealloc_vm(kvm);
return cc ? -EIO : 0;
@@ -314,6 +389,11 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc)
*/
if (kvm->arch.pv.set_aside)
return -EINVAL;
+
+ /* Guest with segment type ASCE, refuse to destroy asynchronously */
+ if ((kvm->arch.gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT)
+ return -EINVAL;
+
priv = kzalloc(sizeof(*priv), GFP_KERNEL);
if (!priv)
return -ENOMEM;
@@ -406,8 +486,12 @@ int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc)
u16 _rc, _rrc;
int cc = 0;
- /* Make sure the counter does not reach 0 before calling s390_uv_destroy_range */
- atomic_inc(&kvm->mm->context.protected_count);
+ /*
+ * Nothing to do if the counter was already 0. Otherwise make sure
+ * the counter does not reach 0 before calling s390_uv_destroy_range.
+ */
+ if (!atomic_inc_not_zero(&kvm->mm->context.protected_count))
+ return 0;
*rc = 1;
/* If the current VM is protected, destroy it */
@@ -540,6 +624,17 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
int cc, ret;
u16 dummy;
+ /* Add the notifier only once. No races because we hold kvm->lock */
+ if (kvm->arch.pv.mmu_notifier.ops != &kvm_s390_pv_mmu_notifier_ops) {
+ /* The notifier will be unregistered when the VM is destroyed */
+ kvm->arch.pv.mmu_notifier.ops = &kvm_s390_pv_mmu_notifier_ops;
+ ret = mmu_notifier_register(&kvm->arch.pv.mmu_notifier, kvm->mm);
+ if (ret) {
+ kvm->arch.pv.mmu_notifier.ops = NULL;
+ return ret;
+ }
+ }
+
ret = kvm_s390_pv_alloc_vm(kvm);
if (ret)
return ret;
@@ -552,12 +647,14 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
uvcb.conf_base_stor_origin =
virt_to_phys((void *)kvm->arch.pv.stor_base);
uvcb.conf_virt_stor_origin = (u64)kvm->arch.pv.stor_var;
+ uvcb.flags.ap_allow_instr = kvm->arch.model.uv_feat_guest.ap;
+ uvcb.flags.ap_instr_intr = kvm->arch.model.uv_feat_guest.ap_intr;
cc = uv_call_sched(0, (u64)&uvcb);
*rc = uvcb.header.rc;
*rrc = uvcb.header.rrc;
- KVM_UV_EVENT(kvm, 3, "PROTVIRT CREATE VM: handle %llx len %llx rc %x rrc %x",
- uvcb.guest_handle, uvcb.guest_stor_len, *rc, *rrc);
+ KVM_UV_EVENT(kvm, 3, "PROTVIRT CREATE VM: handle %llx len %llx rc %x rrc %x flags %04x",
+ uvcb.guest_handle, uvcb.guest_stor_len, *rc, *rrc, uvcb.flags.raw);
/* Outputs */
kvm->arch.pv.handle = uvcb.guest_handle;
@@ -573,11 +670,6 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
return -EIO;
}
kvm->arch.gmap->guest_handle = uvcb.guest_handle;
- /* Add the notifier only once. No races because we hold kvm->lock */
- if (kvm->arch.pv.mmu_notifier.ops != &kvm_s390_pv_mmu_notifier_ops) {
- kvm->arch.pv.mmu_notifier.ops = &kvm_s390_pv_mmu_notifier_ops;
- mmu_notifier_register(&kvm->arch.pv.mmu_notifier, kvm->mm);
- }
return 0;
}
@@ -611,11 +703,29 @@ static int unpack_one(struct kvm *kvm, unsigned long addr, u64 tweak,
.tweak[0] = tweak,
.tweak[1] = offset,
};
- int ret = gmap_make_secure(kvm->arch.gmap, addr, &uvcb);
+ int ret = kvm_s390_pv_make_secure(kvm, addr, &uvcb);
+ unsigned long vmaddr;
+ bool unlocked;
*rc = uvcb.header.rc;
*rrc = uvcb.header.rrc;
+ if (ret == -ENXIO) {
+ mmap_read_lock(kvm->mm);
+ vmaddr = gfn_to_hva(kvm, gpa_to_gfn(addr));
+ if (kvm_is_error_hva(vmaddr)) {
+ ret = -EFAULT;
+ } else {
+ ret = fixup_user_fault(kvm->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked);
+ if (!ret)
+ ret = __gmap_link(kvm->arch.gmap, addr, vmaddr);
+ }
+ mmap_read_unlock(kvm->mm);
+ if (!ret)
+ return -EAGAIN;
+ return ret;
+ }
+
if (ret && ret != -EAGAIN)
KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: failed addr %llx with rc %x rrc %x",
uvcb.gaddr, *rc, *rrc);
@@ -634,6 +744,8 @@ int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size,
KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: start addr %lx size %lx",
addr, size);
+ guard(srcu)(&kvm->srcu);
+
while (offset < size) {
ret = unpack_one(kvm, addr, tweak, offset, rc, rrc);
if (ret == -EAGAIN) {
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index cb747bf6c798..55c34cb35428 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -172,7 +172,7 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu,
* first page, since address is 8k aligned and memory pieces are always
* at least 1MB aligned and have at least a size of 1MB.
*/
- if (kvm_is_error_gpa(vcpu->kvm, irq.u.prefix.address)) {
+ if (!kvm_is_gpa_in_memslot(vcpu->kvm, irq.u.prefix.address)) {
*reg &= 0xffffffff00000000UL;
*reg |= SIGP_STATUS_INVALID_PARAMETER;
return SIGP_CC_STATUS_STORED;
@@ -469,7 +469,7 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu)
*
* This interception will occur at the source cpu when a source cpu sends an
* external call to a target cpu and the target cpu has the WAIT bit set in
- * its cpuflags. Interception will occurr after the interrupt indicator bits at
+ * its cpuflags. Interception will occur after the interrupt indicator bits at
* the target cpu have been set. All error cases will lead to instruction
* interception, therefore nothing is to be checked or prepared.
*/
diff --git a/arch/s390/kvm/trace-s390.h b/arch/s390/kvm/trace-s390.h
index 6f0209d45164..9e28f165c114 100644
--- a/arch/s390/kvm/trace-s390.h
+++ b/arch/s390/kvm/trace-s390.h
@@ -56,7 +56,7 @@ TRACE_EVENT(kvm_s390_create_vcpu,
__entry->sie_block = sie_block;
),
- TP_printk("create cpu %d at 0x%pK, sie block at 0x%pK",
+ TP_printk("create cpu %d at 0x%p, sie block at 0x%p",
__entry->id, __entry->vcpu, __entry->sie_block)
);
@@ -255,7 +255,7 @@ TRACE_EVENT(kvm_s390_enable_css,
__entry->kvm = kvm;
),
- TP_printk("enabling channel I/O support (kvm @ %pK)\n",
+ TP_printk("enabling channel I/O support (kvm @ %p)\n",
__entry->kvm)
);
@@ -333,6 +333,29 @@ TRACE_EVENT(kvm_s390_airq_suppressed,
__entry->id, __entry->isc)
);
+/*
+ * Trace point for gmap notifier calls.
+ */
+TRACE_EVENT(kvm_s390_gmap_notifier,
+ TP_PROTO(unsigned long start, unsigned long end, unsigned int shadow),
+ TP_ARGS(start, end, shadow),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, start)
+ __field(unsigned long, end)
+ __field(unsigned int, shadow)
+ ),
+
+ TP_fast_assign(
+ __entry->start = start;
+ __entry->end = end;
+ __entry->shadow = shadow;
+ ),
+
+ TP_printk("gmap notified (start:0x%lx end:0x%lx shadow:%d)",
+ __entry->start, __entry->end, __entry->shadow)
+ );
+
#endif /* _TRACE_KVMS390_H */
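For orientation, a hedged sketch of how the new trace event would typically be fired; TRACE_EVENT() generates trace_kvm_s390_gmap_notifier(), but the actual call site is not part of this hunk and the function below is purely hypothetical.

/* Hypothetical notifier body emitting the trace event defined above. */
static void example_gmap_notifier(struct gmap *gmap, unsigned long start,
				  unsigned long end)
{
	trace_kvm_s390_gmap_notifier(start, end, gmap_is_shadow(gmap));

	/* ... invalidate shadow structures covering [start, end) ... */
}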
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index b6a0219e470a..b526621d2a1b 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -12,16 +12,22 @@
#include <linux/list.h>
#include <linux/bitmap.h>
#include <linux/sched/signal.h>
+#include <linux/io.h>
+#include <linux/mman.h>
#include <asm/gmap.h>
#include <asm/mmu_context.h>
#include <asm/sclp.h>
#include <asm/nmi.h>
#include <asm/dis.h>
-#include <asm/fpu/api.h>
+#include <asm/facility.h>
#include "kvm-s390.h"
#include "gaccess.h"
+enum vsie_page_flags {
+ VSIE_PAGE_IN_USE = 0,
+};
+
struct vsie_page {
struct kvm_s390_sie_block scb_s; /* 0x0000 */
/*
@@ -45,11 +51,40 @@ struct vsie_page {
gpa_t gvrd_gpa; /* 0x0240 */
gpa_t riccbd_gpa; /* 0x0248 */
gpa_t sdnx_gpa; /* 0x0250 */
- __u8 reserved[0x0700 - 0x0258]; /* 0x0258 */
+ /*
+ * guest address of the original SCB. Remains set for free vsie
+ * pages, so we can properly look them up in our addr_to_page
+ * radix tree.
+ */
+ gpa_t scb_gpa; /* 0x0258 */
+ /*
+ * Flags: must be set/cleared atomically after the vsie page can be
+ * looked up by other CPUs.
+ */
+ unsigned long flags; /* 0x0260 */
+ __u8 reserved[0x0700 - 0x0268]; /* 0x0268 */
struct kvm_s390_crypto_cb crycb; /* 0x0700 */
__u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */
};
+/**
+ * gmap_shadow_valid() - check if a shadow guest address space matches the
+ * given properties and is still valid
+ * @sg: pointer to the shadow guest address space structure
+ * @asce: ASCE for which the shadow table is requested
+ * @edat_level: edat level to be used for the shadow translation
+ *
+ * Returns 1 if the gmap shadow is still valid and matches the given
+ * properties; the caller can continue using it. Returns 0 otherwise; the
+ * caller has to request a new shadow gmap in this case.
+ */
+int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level)
+{
+ if (sg->removed)
+ return 0;
+ return sg->orig_asce == asce && sg->edat_level == edat_level;
+}
+
/* trigger a validity icpt for the given scb */
static int set_validity_icpt(struct kvm_s390_sie_block *scb,
__u16 reason_code)
@@ -138,11 +173,15 @@ static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
}
/* Copy to APCB FORMAT1 from APCB FORMAT0 */
static int setup_apcb10(struct kvm_vcpu *vcpu, struct kvm_s390_apcb1 *apcb_s,
- unsigned long apcb_o, struct kvm_s390_apcb1 *apcb_h)
+ unsigned long crycb_gpa, struct kvm_s390_apcb1 *apcb_h)
{
struct kvm_s390_apcb0 tmp;
+ unsigned long apcb_gpa;
+
+ apcb_gpa = crycb_gpa + offsetof(struct kvm_s390_crypto_cb, apcb0);
- if (read_guest_real(vcpu, apcb_o, &tmp, sizeof(struct kvm_s390_apcb0)))
+ if (read_guest_real(vcpu, apcb_gpa, &tmp,
+ sizeof(struct kvm_s390_apcb0)))
return -EFAULT;
apcb_s->apm[0] = apcb_h->apm[0] & tmp.apm[0];
@@ -157,19 +196,24 @@ static int setup_apcb10(struct kvm_vcpu *vcpu, struct kvm_s390_apcb1 *apcb_s,
* setup_apcb00 - Copy to APCB FORMAT0 from APCB FORMAT0
* @vcpu: pointer to the virtual CPU
* @apcb_s: pointer to start of apcb in the shadow crycb
- * @apcb_o: pointer to start of original apcb in the guest2
+ * @crycb_gpa: guest physical address to start of original guest crycb
* @apcb_h: pointer to start of apcb in the guest1
*
* Returns 0 on success, -EFAULT on error reading guest apcb
*/
static int setup_apcb00(struct kvm_vcpu *vcpu, unsigned long *apcb_s,
- unsigned long apcb_o, unsigned long *apcb_h)
+ unsigned long crycb_gpa, unsigned long *apcb_h)
{
- if (read_guest_real(vcpu, apcb_o, apcb_s,
+ unsigned long apcb_gpa;
+
+ apcb_gpa = crycb_gpa + offsetof(struct kvm_s390_crypto_cb, apcb0);
+
+ if (read_guest_real(vcpu, apcb_gpa, apcb_s,
sizeof(struct kvm_s390_apcb0)))
return -EFAULT;
- bitmap_and(apcb_s, apcb_s, apcb_h, sizeof(struct kvm_s390_apcb0));
+ bitmap_and(apcb_s, apcb_s, apcb_h,
+ BITS_PER_BYTE * sizeof(struct kvm_s390_apcb0));
return 0;
}
@@ -178,20 +222,25 @@ static int setup_apcb00(struct kvm_vcpu *vcpu, unsigned long *apcb_s,
* setup_apcb11 - Copy the FORMAT1 APCB from the guest to the shadow CRYCB
* @vcpu: pointer to the virtual CPU
* @apcb_s: pointer to start of apcb in the shadow crycb
- * @apcb_o: pointer to start of original guest apcb
+ * @crycb_gpa: guest physical address to start of original guest crycb
* @apcb_h: pointer to start of apcb in the host
*
* Returns 0 on success, -EFAULT on error reading guest apcb
*/
static int setup_apcb11(struct kvm_vcpu *vcpu, unsigned long *apcb_s,
- unsigned long apcb_o,
+ unsigned long crycb_gpa,
unsigned long *apcb_h)
{
- if (read_guest_real(vcpu, apcb_o, apcb_s,
+ unsigned long apcb_gpa;
+
+ apcb_gpa = crycb_gpa + offsetof(struct kvm_s390_crypto_cb, apcb1);
+
+ if (read_guest_real(vcpu, apcb_gpa, apcb_s,
sizeof(struct kvm_s390_apcb1)))
return -EFAULT;
- bitmap_and(apcb_s, apcb_s, apcb_h, sizeof(struct kvm_s390_apcb1));
+ bitmap_and(apcb_s, apcb_s, apcb_h,
+ BITS_PER_BYTE * sizeof(struct kvm_s390_apcb1));
return 0;
}
@@ -200,7 +249,7 @@ static int setup_apcb11(struct kvm_vcpu *vcpu, unsigned long *apcb_s,
* setup_apcb - Create a shadow copy of the apcb.
* @vcpu: pointer to the virtual CPU
* @crycb_s: pointer to shadow crycb
- * @crycb_o: pointer to original guest crycb
+ * @crycb_gpa: guest physical address of original guest crycb
* @crycb_h: pointer to the host crycb
* @fmt_o: format of the original guest crycb.
* @fmt_h: format of the host crycb.
@@ -211,50 +260,46 @@ static int setup_apcb11(struct kvm_vcpu *vcpu, unsigned long *apcb_s,
* Return 0 on success, or an error number if the guest and host crycb are incompatible.
*/
static int setup_apcb(struct kvm_vcpu *vcpu, struct kvm_s390_crypto_cb *crycb_s,
- const u32 crycb_o,
+ const u32 crycb_gpa,
struct kvm_s390_crypto_cb *crycb_h,
int fmt_o, int fmt_h)
{
- struct kvm_s390_crypto_cb *crycb;
-
- crycb = (struct kvm_s390_crypto_cb *) (unsigned long)crycb_o;
-
switch (fmt_o) {
case CRYCB_FORMAT2:
- if ((crycb_o & PAGE_MASK) != ((crycb_o + 256) & PAGE_MASK))
+ if ((crycb_gpa & PAGE_MASK) != ((crycb_gpa + 256) & PAGE_MASK))
return -EACCES;
if (fmt_h != CRYCB_FORMAT2)
return -EINVAL;
return setup_apcb11(vcpu, (unsigned long *)&crycb_s->apcb1,
- (unsigned long) &crycb->apcb1,
+ crycb_gpa,
(unsigned long *)&crycb_h->apcb1);
case CRYCB_FORMAT1:
switch (fmt_h) {
case CRYCB_FORMAT2:
return setup_apcb10(vcpu, &crycb_s->apcb1,
- (unsigned long) &crycb->apcb0,
+ crycb_gpa,
&crycb_h->apcb1);
case CRYCB_FORMAT1:
return setup_apcb00(vcpu,
(unsigned long *) &crycb_s->apcb0,
- (unsigned long) &crycb->apcb0,
+ crycb_gpa,
(unsigned long *) &crycb_h->apcb0);
}
break;
case CRYCB_FORMAT0:
- if ((crycb_o & PAGE_MASK) != ((crycb_o + 32) & PAGE_MASK))
+ if ((crycb_gpa & PAGE_MASK) != ((crycb_gpa + 32) & PAGE_MASK))
return -EACCES;
switch (fmt_h) {
case CRYCB_FORMAT2:
return setup_apcb10(vcpu, &crycb_s->apcb1,
- (unsigned long) &crycb->apcb0,
+ crycb_gpa,
&crycb_h->apcb1);
case CRYCB_FORMAT1:
case CRYCB_FORMAT0:
return setup_apcb00(vcpu,
(unsigned long *) &crycb_s->apcb0,
- (unsigned long) &crycb->apcb0,
+ crycb_gpa,
(unsigned long *) &crycb_h->apcb0);
}
}
@@ -324,7 +369,8 @@ static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
/* we may only allow it if enabled for guest 2 */
ecb3_flags = scb_o->ecb3 & vcpu->arch.sie_block->ecb3 &
(ECB3_AES | ECB3_DEA);
- ecd_flags = scb_o->ecd & vcpu->arch.sie_block->ecd & ECD_ECC;
+ ecd_flags = scb_o->ecd & vcpu->arch.sie_block->ecd &
+ (ECD_ECC | ECD_HMAC);
if (!ecb3_flags && !ecd_flags)
goto end;
@@ -351,7 +397,7 @@ end:
case -EACCES:
return set_validity_icpt(scb_s, 0x003CU);
}
- scb_s->crycbd = ((__u32)(__u64) &vsie_page->crycb) | CRYCB_FORMAT2;
+ scb_s->crycbd = (u32)virt_to_phys(&vsie_page->crycb) | CRYCB_FORMAT2;
return 0;
}
@@ -494,7 +540,7 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
scb_s->mso = new_mso;
scb_s->prefix = new_prefix;
- /* We have to definetly flush the tlb if this scb never ran */
+ /* We have to definitely flush the tlb if this scb never ran */
if (scb_s->ihcpu != 0xffffU)
scb_s->ihcpu = scb_o->ihcpu;
@@ -572,24 +618,18 @@ void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
struct kvm *kvm = gmap->private;
struct vsie_page *cur;
unsigned long prefix;
- struct page *page;
int i;
if (!gmap_is_shadow(gmap))
return;
- if (start >= 1UL << 31)
- /* We are only interested in prefix pages */
- return;
-
/*
* Only new shadow blocks are added to the list during runtime,
* therefore we can safely reference them all the time.
*/
for (i = 0; i < kvm->arch.vsie.page_count; i++) {
- page = READ_ONCE(kvm->arch.vsie.pages[i]);
- if (!page)
+ cur = READ_ONCE(kvm->arch.vsie.pages[i]);
+ if (!cur)
continue;
- cur = page_to_virt(page);
if (READ_ONCE(cur->gmap) != gmap)
continue;
prefix = cur->scb_s.prefix << GUEST_PREFIX_SHIFT;
@@ -654,7 +694,7 @@ static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa)
struct page *page;
page = gfn_to_page(kvm, gpa_to_gfn(gpa));
- if (is_error_page(page))
+ if (!page)
return -EINVAL;
*hpa = (hpa_t)page_to_phys(page) + (gpa & ~PAGE_MASK);
return 0;
@@ -663,7 +703,7 @@ static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa)
/* Unpins a page previously pinned via pin_guest_page, marking it as dirty. */
static void unpin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t hpa)
{
- kvm_release_pfn_dirty(hpa >> PAGE_SHIFT);
+ kvm_release_page_dirty(pfn_to_page(hpa >> PAGE_SHIFT));
/* mark the page always as dirty for migration */
mark_page_dirty(kvm, gpa_to_gfn(gpa));
}
@@ -742,7 +782,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
else if ((gpa & ~0x1fffUL) == kvm_s390_get_prefix(vcpu))
rc = set_validity_icpt(scb_s, 0x0011U);
else if ((gpa & PAGE_MASK) !=
- ((gpa + sizeof(struct bsca_block) - 1) & PAGE_MASK))
+ ((gpa + offsetof(struct bsca_block, cpu[0]) - 1) & PAGE_MASK))
rc = set_validity_icpt(scb_s, 0x003bU);
if (!rc) {
rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
@@ -846,7 +886,7 @@ unpin:
static void unpin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
gpa_t gpa)
{
- hpa_t hpa = (hpa_t) vsie_page->scb_o;
+ hpa_t hpa = virt_to_phys(vsie_page->scb_o);
if (hpa)
unpin_guest_page(vcpu->kvm, gpa, hpa);
@@ -891,7 +931,7 @@ static int inject_fault(struct kvm_vcpu *vcpu, __u16 code, __u64 vaddr,
(vaddr & 0xfffffffffffff000UL) |
/* 52-53: store / fetch */
(((unsigned int) !write_flag) + 1) << 10,
- /* 62-63: asce id (alway primary == 0) */
+ /* 62-63: asce id (always primary == 0) */
.exc_access_id = 0, /* always primary */
.op_access_id = 0, /* not MVPG */
};
@@ -915,19 +955,19 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
{
int rc;
- if (current->thread.gmap_int_code == PGM_PROTECTION)
+ if ((current->thread.gmap_int_code & PGM_INT_CODE_MASK) == PGM_PROTECTION)
/* we can directly forward all protection exceptions */
return inject_fault(vcpu, PGM_PROTECTION,
- current->thread.gmap_addr, 1);
+ current->thread.gmap_teid.addr * PAGE_SIZE, 1);
rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
- current->thread.gmap_addr, NULL);
+ current->thread.gmap_teid.addr * PAGE_SIZE, NULL);
if (rc > 0) {
rc = inject_fault(vcpu, rc,
- current->thread.gmap_addr,
- current->thread.gmap_write_flag);
+ current->thread.gmap_teid.addr * PAGE_SIZE,
+ kvm_s390_cur_gmap_fault_is_write());
if (rc >= 0)
- vsie_page->fault_addr = current->thread.gmap_addr;
+ vsie_page->fault_addr = current->thread.gmap_teid.addr * PAGE_SIZE;
}
return rc;
}
@@ -978,14 +1018,28 @@ static void retry_vsie_icpt(struct vsie_page *vsie_page)
static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
{
struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
- __u32 fac = READ_ONCE(vsie_page->scb_o->fac) & 0x7ffffff8U;
+ __u32 fac = READ_ONCE(vsie_page->scb_o->fac);
+ /*
+ * Alternate-STFLE-Interpretive-Execution facilities are not supported
+ * -> format-0 flcb
+ */
if (fac && test_kvm_facility(vcpu->kvm, 7)) {
retry_vsie_icpt(vsie_page);
+ /*
+ * The facility list origin (FLO) is in bits 1 - 28 of the FLD
+ * so we need to mask here before reading.
+ */
+ fac = fac & 0x7ffffff8U;
+ /*
+ * format-0 -> size of nested guest's facility list == guest's size
+ * guest's size == host's size, since STFLE is interpretatively executed
+ * using a format-0 for the guest, too.
+ */
if (read_guest_real(vcpu, fac, &vsie_page->fac,
- sizeof(vsie_page->fac)))
+ stfle_size() * sizeof(u64)))
return set_validity_icpt(scb_s, 0x1090U);
- scb_s->fac = (__u32)(__u64) &vsie_page->fac;
+ scb_s->fac = (u32)virt_to_phys(&vsie_page->fac);
}
return 0;
}
@@ -1116,10 +1170,6 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
vcpu->arch.sie_block->fpf & FPF_BPBC)
set_thread_flag(TIF_ISOLATE_BP_GUEST);
- local_irq_disable();
- guest_enter_irqoff();
- local_irq_enable();
-
/*
* Simulate a SIE entry of the VCPU (see sie64a), so VCPU blocking
* and VCPU requests also hinder the vSIE from running and lead
@@ -1127,18 +1177,29 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
* also kick the vSIE.
*/
vcpu->arch.sie_block->prog0c |= PROG_IN_SIE;
+ current->thread.gmap_int_code = 0;
barrier();
- if (test_cpu_flag(CIF_FPU))
- load_fpu_regs();
- if (!kvm_s390_vcpu_sie_inhibited(vcpu))
- rc = sie64a(scb_s, vcpu->run->s.regs.gprs);
+ if (!kvm_s390_vcpu_sie_inhibited(vcpu)) {
+xfer_to_guest_mode_check:
+ local_irq_disable();
+ xfer_to_guest_mode_prepare();
+ if (xfer_to_guest_mode_work_pending()) {
+ local_irq_enable();
+ rc = kvm_xfer_to_guest_mode_handle_work(vcpu);
+ if (rc)
+ goto skip_sie;
+ goto xfer_to_guest_mode_check;
+ }
+ guest_timing_enter_irqoff();
+ rc = kvm_s390_enter_exit_sie(scb_s, vcpu->run->s.regs.gprs, vsie_page->gmap->asce);
+ guest_timing_exit_irqoff();
+ local_irq_enable();
+ }
+
+skip_sie:
barrier();
vcpu->arch.sie_block->prog0c &= ~PROG_IN_SIE;
- local_irq_disable();
- guest_exit_irqoff();
- local_irq_enable();
-
/* restore guest state for bp isolation override */
if (!guest_bp_isolation)
clear_thread_flag(TIF_ISOLATE_BP_GUEST);
@@ -1153,7 +1214,7 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
if (rc > 0)
rc = 0; /* we could still have an icpt */
- else if (rc == -EFAULT)
+ else if (current->thread.gmap_int_code)
return handle_fault(vcpu, vsie_page);
switch (scb_s->icptcode) {
@@ -1204,15 +1265,17 @@ static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
* we're holding has been unshadowed. If the gmap is still valid,
* we can safely reuse it.
*/
- if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat))
+ if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat)) {
+ vcpu->kvm->stat.gmap_shadow_reuse++;
return 0;
+ }
/* release the old shadow - if any, and mark the prefix as unmapped */
release_gmap_shadow(vsie_page);
gmap = gmap_shadow(vcpu->arch.gmap, asce, edat);
if (IS_ERR(gmap))
return PTR_ERR(gmap);
- gmap->private = vcpu->kvm;
+ vcpu->kvm->stat.gmap_shadow_create++;
WRITE_ONCE(vsie_page->gmap, gmap);
return 0;
}
@@ -1274,20 +1337,30 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
if (!rc)
rc = map_prefix(vcpu, vsie_page);
if (!rc) {
- gmap_enable(vsie_page->gmap);
update_intervention_requests(vsie_page);
rc = do_vsie_run(vcpu, vsie_page);
- gmap_enable(vcpu->arch.gmap);
}
atomic_andnot(PROG_BLOCK_SIE, &scb_s->prog20);
if (rc == -EAGAIN)
rc = 0;
- if (rc || scb_s->icptcode || signal_pending(current) ||
- kvm_s390_vcpu_has_irq(vcpu, 0) ||
- kvm_s390_vcpu_sie_inhibited(vcpu))
+
+ /*
+ * Exit the loop if the guest needs to process the intercept
+ */
+ if (rc || scb_s->icptcode)
break;
- cond_resched();
+
+ /*
+ * Exit the loop if the host needs to process an intercept,
+ * but rewind the PSW to re-enter SIE once that's completed
+ * instead of passing a "no action" intercept to the guest.
+ */
+ if (kvm_s390_vcpu_has_irq(vcpu, 0) ||
+ kvm_s390_vcpu_sie_inhibited(vcpu)) {
+ kvm_s390_rewind_psw(vcpu, 4);
+ break;
+ }
}
if (rc == -EFAULT) {
@@ -1310,6 +1383,20 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
return rc;
}
+/* Try getting a given vsie page, returning "true" on success. */
+static inline bool try_get_vsie_page(struct vsie_page *vsie_page)
+{
+ if (test_bit(VSIE_PAGE_IN_USE, &vsie_page->flags))
+ return false;
+ return !test_and_set_bit(VSIE_PAGE_IN_USE, &vsie_page->flags);
+}
+
+/* Put a vsie page acquired through get_vsie_page / try_get_vsie_page. */
+static void put_vsie_page(struct vsie_page *vsie_page)
+{
+ clear_bit(VSIE_PAGE_IN_USE, &vsie_page->flags);
+}
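A minimal sketch of the acquire/use/release pattern these helpers are meant for, replacing the old page-refcount trick; the caller below is hypothetical and elides the actual vSIE work.

/* Hypothetical caller: claim a cached vsie_page, use it, release it. */
static int example_use_vsie_page(struct vsie_page *vsie_page)
{
	if (!try_get_vsie_page(vsie_page))
		return -EBUSY;	/* another CPU currently owns this page */

	/* ... shadow the SCB, run the nested guest, handle intercepts ... */

	put_vsie_page(vsie_page);
	return 0;
}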
+
/*
* Get or create a vsie page for a scb address.
*
@@ -1320,16 +1407,21 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr)
{
struct vsie_page *vsie_page;
- struct page *page;
int nr_vcpus;
rcu_read_lock();
- page = radix_tree_lookup(&kvm->arch.vsie.addr_to_page, addr >> 9);
+ vsie_page = radix_tree_lookup(&kvm->arch.vsie.addr_to_page, addr >> 9);
rcu_read_unlock();
- if (page) {
- if (page_ref_inc_return(page) == 2)
- return page_to_virt(page);
- page_ref_dec(page);
+ if (vsie_page) {
+ if (try_get_vsie_page(vsie_page)) {
+ if (vsie_page->scb_gpa == addr)
+ return vsie_page;
+ /*
+ * We raced with someone reusing + putting this vsie
+ * page before we grabbed it.
+ */
+ put_vsie_page(vsie_page);
+ }
}
/*
@@ -1340,36 +1432,40 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr)
mutex_lock(&kvm->arch.vsie.mutex);
if (kvm->arch.vsie.page_count < nr_vcpus) {
- page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO | GFP_DMA);
- if (!page) {
+ vsie_page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO | GFP_DMA);
+ if (!vsie_page) {
mutex_unlock(&kvm->arch.vsie.mutex);
return ERR_PTR(-ENOMEM);
}
- page_ref_inc(page);
- kvm->arch.vsie.pages[kvm->arch.vsie.page_count] = page;
+ __set_bit(VSIE_PAGE_IN_USE, &vsie_page->flags);
+ kvm->arch.vsie.pages[kvm->arch.vsie.page_count] = vsie_page;
kvm->arch.vsie.page_count++;
} else {
/* reuse an existing entry that belongs to nobody */
while (true) {
- page = kvm->arch.vsie.pages[kvm->arch.vsie.next];
- if (page_ref_inc_return(page) == 2)
+ vsie_page = kvm->arch.vsie.pages[kvm->arch.vsie.next];
+ if (try_get_vsie_page(vsie_page))
break;
- page_ref_dec(page);
kvm->arch.vsie.next++;
kvm->arch.vsie.next %= nr_vcpus;
}
- radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9);
+ if (vsie_page->scb_gpa != ULONG_MAX)
+ radix_tree_delete(&kvm->arch.vsie.addr_to_page,
+ vsie_page->scb_gpa >> 9);
}
- page->index = addr;
- /* double use of the same address */
- if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, page)) {
- page_ref_dec(page);
+ /* Mark it as invalid until it resides in the tree. */
+ vsie_page->scb_gpa = ULONG_MAX;
+
+ /* Double use of the same address or allocation failure. */
+ if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9,
+ vsie_page)) {
+ put_vsie_page(vsie_page);
mutex_unlock(&kvm->arch.vsie.mutex);
return NULL;
}
+ vsie_page->scb_gpa = addr;
mutex_unlock(&kvm->arch.vsie.mutex);
- vsie_page = page_to_virt(page);
memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block));
release_gmap_shadow(vsie_page);
vsie_page->fault_addr = 0;
@@ -1377,14 +1473,6 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr)
return vsie_page;
}
-/* put a vsie page acquired via get_vsie_page */
-static void put_vsie_page(struct kvm *kvm, struct vsie_page *vsie_page)
-{
- struct page *page = pfn_to_page(__pa(vsie_page) >> PAGE_SHIFT);
-
- page_ref_dec(page);
-}
-
int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu)
{
struct vsie_page *vsie_page;
@@ -1404,9 +1492,10 @@ int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu)
if (unlikely(scb_addr & 0x1ffUL))
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
- if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0) ||
- kvm_s390_vcpu_sie_inhibited(vcpu))
+ if (kvm_s390_vcpu_has_irq(vcpu, 0) || kvm_s390_vcpu_sie_inhibited(vcpu)) {
+ kvm_s390_rewind_psw(vcpu, 4);
return 0;
+ }
vsie_page = get_vsie_page(vcpu->kvm, scb_addr);
if (IS_ERR(vsie_page))
@@ -1433,7 +1522,7 @@ out_unshadow:
out_unpin_scb:
unpin_scb(vcpu, vsie_page, scb_addr);
out_put:
- put_vsie_page(vcpu->kvm, vsie_page);
+ put_vsie_page(vsie_page);
return rc < 0 ? rc : 0;
}
@@ -1449,18 +1538,18 @@ void kvm_s390_vsie_init(struct kvm *kvm)
void kvm_s390_vsie_destroy(struct kvm *kvm)
{
struct vsie_page *vsie_page;
- struct page *page;
int i;
mutex_lock(&kvm->arch.vsie.mutex);
for (i = 0; i < kvm->arch.vsie.page_count; i++) {
- page = kvm->arch.vsie.pages[i];
+ vsie_page = kvm->arch.vsie.pages[i];
kvm->arch.vsie.pages[i] = NULL;
- vsie_page = page_to_virt(page);
release_gmap_shadow(vsie_page);
/* free the radix tree entry */
- radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9);
- __free_page(page);
+ if (vsie_page->scb_gpa != ULONG_MAX)
+ radix_tree_delete(&kvm->arch.vsie.addr_to_page,
+ vsie_page->scb_gpa >> 9);
+ free_page((unsigned long)vsie_page);
}
kvm->arch.vsie.page_count = 0;
mutex_unlock(&kvm->arch.vsie.mutex);