From b7c92f1a4e131e459bcf53a570e7265e5ce64455 Mon Sep 17 00:00:00 2001 From: QingFeng Hao Date: Fri, 29 Sep 2017 12:41:50 +0200 Subject: s390/sthyi: reorganize sthyi implementation As we need to support sthyi instruction on LPAR too, move the common code to kernel part and kvm related code to intercept.c for better reuse. Signed-off-by: QingFeng Hao Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/sysinfo.h | 1 + arch/s390/kernel/Makefile | 2 +- arch/s390/kernel/sthyi.c | 428 ++++++++++++++++++++++++++++++++++++ arch/s390/kvm/Makefile | 2 +- arch/s390/kvm/intercept.c | 66 ++++++ arch/s390/kvm/kvm-s390.h | 5 +- arch/s390/kvm/sthyi.c | 469 ---------------------------------------- 7 files changed, 499 insertions(+), 474 deletions(-) create mode 100644 arch/s390/kernel/sthyi.c delete mode 100644 arch/s390/kvm/sthyi.c (limited to 'arch/s390') diff --git a/arch/s390/include/asm/sysinfo.h b/arch/s390/include/asm/sysinfo.h index 2b498e58b914..e4a28307bc5d 100644 --- a/arch/s390/include/asm/sysinfo.h +++ b/arch/s390/include/asm/sysinfo.h @@ -198,4 +198,5 @@ struct service_level { int register_service_level(struct service_level *); int unregister_service_level(struct service_level *); +int sthyi_fill(void *dst, u64 *rc); #endif /* __ASM_S390_SYSINFO_H */ diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index adb3fe2e3d42..1fefb7f9216f 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -55,7 +55,7 @@ obj-y := traps.o time.o process.o base.o early.o setup.o idle.o vtime.o obj-y += processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o obj-y += debug.o irq.o ipl.o dis.o diag.o vdso.o als.o obj-y += sysinfo.o jump_label.o lgr.o os_info.o machine_kexec.o pgm_check.o -obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o +obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o obj-y += entry.o reipl.o relocate_kernel.o kdebugfs.o extra-y += head.o head64.o vmlinux.lds diff --git a/arch/s390/kernel/sthyi.c b/arch/s390/kernel/sthyi.c new file mode 100644 index 000000000000..3d51f86f9dec --- /dev/null +++ b/arch/s390/kernel/sthyi.c @@ -0,0 +1,428 @@ +/* + * store hypervisor information instruction emulation functions. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License (version 2 only) + * as published by the Free Software Foundation. + * + * Copyright IBM Corp. 2016 + * Author(s): Janosch Frank + */ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define DED_WEIGHT 0xffff +/* + * CP and IFL as EBCDIC strings, SP/0x40 determines the end of string + * as they are justified with spaces. + */ +#define CP 0xc3d7404040404040UL +#define IFL 0xc9c6d34040404040UL + +enum hdr_flags { + HDR_NOT_LPAR = 0x10, + HDR_STACK_INCM = 0x20, + HDR_STSI_UNAV = 0x40, + HDR_PERF_UNAV = 0x80, +}; + +enum mac_validity { + MAC_NAME_VLD = 0x20, + MAC_ID_VLD = 0x40, + MAC_CNT_VLD = 0x80, +}; + +enum par_flag { + PAR_MT_EN = 0x80, +}; + +enum par_validity { + PAR_GRP_VLD = 0x08, + PAR_ID_VLD = 0x10, + PAR_ABS_VLD = 0x20, + PAR_WGHT_VLD = 0x40, + PAR_PCNT_VLD = 0x80, +}; + +struct hdr_sctn { + u8 infhflg1; + u8 infhflg2; /* reserved */ + u8 infhval1; /* reserved */ + u8 infhval2; /* reserved */ + u8 reserved[3]; + u8 infhygct; + u16 infhtotl; + u16 infhdln; + u16 infmoff; + u16 infmlen; + u16 infpoff; + u16 infplen; + u16 infhoff1; + u16 infhlen1; + u16 infgoff1; + u16 infglen1; + u16 infhoff2; + u16 infhlen2; + u16 infgoff2; + u16 infglen2; + u16 infhoff3; + u16 infhlen3; + u16 infgoff3; + u16 infglen3; + u8 reserved2[4]; +} __packed; + +struct mac_sctn { + u8 infmflg1; /* reserved */ + u8 infmflg2; /* reserved */ + u8 infmval1; + u8 infmval2; /* reserved */ + u16 infmscps; + u16 infmdcps; + u16 infmsifl; + u16 infmdifl; + char infmname[8]; + char infmtype[4]; + char infmmanu[16]; + char infmseq[16]; + char infmpman[4]; + u8 reserved[4]; +} __packed; + +struct par_sctn { + u8 infpflg1; + u8 infpflg2; /* reserved */ + u8 infpval1; + u8 infpval2; /* reserved */ + u16 infppnum; + u16 infpscps; + u16 infpdcps; + u16 infpsifl; + u16 infpdifl; + u16 reserved; + char infppnam[8]; + u32 infpwbcp; + u32 infpabcp; + u32 infpwbif; + u32 infpabif; + char infplgnm[8]; + u32 infplgcp; + u32 infplgif; +} __packed; + +struct sthyi_sctns { + struct hdr_sctn hdr; + struct mac_sctn mac; + struct par_sctn par; +} __packed; + +struct cpu_inf { + u64 lpar_cap; + u64 lpar_grp_cap; + u64 lpar_weight; + u64 all_weight; + int cpu_num_ded; + int cpu_num_shd; +}; + +struct lpar_cpu_inf { + struct cpu_inf cp; + struct cpu_inf ifl; +}; + +static inline u64 cpu_id(u8 ctidx, void *diag224_buf) +{ + return *((u64 *)(diag224_buf + (ctidx + 1) * DIAG204_CPU_NAME_LEN)); +} + +/* + * Scales the cpu capping from the lpar range to the one expected in + * sthyi data. + * + * diag204 reports a cap in hundredths of processor units. + * z/VM's range for one core is 0 - 0x10000. + */ +static u32 scale_cap(u32 in) +{ + return (0x10000 * in) / 100; +} + +static void fill_hdr(struct sthyi_sctns *sctns) +{ + sctns->hdr.infhdln = sizeof(sctns->hdr); + sctns->hdr.infmoff = sizeof(sctns->hdr); + sctns->hdr.infmlen = sizeof(sctns->mac); + sctns->hdr.infplen = sizeof(sctns->par); + sctns->hdr.infpoff = sctns->hdr.infhdln + sctns->hdr.infmlen; + sctns->hdr.infhtotl = sctns->hdr.infpoff + sctns->hdr.infplen; +} + +static void fill_stsi_mac(struct sthyi_sctns *sctns, + struct sysinfo_1_1_1 *sysinfo) +{ + if (stsi(sysinfo, 1, 1, 1)) + return; + + sclp_ocf_cpc_name_copy(sctns->mac.infmname); + + memcpy(sctns->mac.infmtype, sysinfo->type, sizeof(sctns->mac.infmtype)); + memcpy(sctns->mac.infmmanu, sysinfo->manufacturer, sizeof(sctns->mac.infmmanu)); + memcpy(sctns->mac.infmpman, sysinfo->plant, sizeof(sctns->mac.infmpman)); + memcpy(sctns->mac.infmseq, sysinfo->sequence, sizeof(sctns->mac.infmseq)); + + sctns->mac.infmval1 |= MAC_ID_VLD | MAC_NAME_VLD; +} + +static void fill_stsi_par(struct sthyi_sctns *sctns, + struct sysinfo_2_2_2 *sysinfo) +{ + if (stsi(sysinfo, 2, 2, 2)) + return; + + sctns->par.infppnum = sysinfo->lpar_number; + memcpy(sctns->par.infppnam, sysinfo->name, sizeof(sctns->par.infppnam)); + + sctns->par.infpval1 |= PAR_ID_VLD; +} + +static void fill_stsi(struct sthyi_sctns *sctns) +{ + void *sysinfo; + + /* Errors are handled through the validity bits in the response. */ + sysinfo = (void *)__get_free_page(GFP_KERNEL); + if (!sysinfo) + return; + + fill_stsi_mac(sctns, sysinfo); + fill_stsi_par(sctns, sysinfo); + + free_pages((unsigned long)sysinfo, 0); +} + +static void fill_diag_mac(struct sthyi_sctns *sctns, + struct diag204_x_phys_block *block, + void *diag224_buf) +{ + int i; + + for (i = 0; i < block->hdr.cpus; i++) { + switch (cpu_id(block->cpus[i].ctidx, diag224_buf)) { + case CP: + if (block->cpus[i].weight == DED_WEIGHT) + sctns->mac.infmdcps++; + else + sctns->mac.infmscps++; + break; + case IFL: + if (block->cpus[i].weight == DED_WEIGHT) + sctns->mac.infmdifl++; + else + sctns->mac.infmsifl++; + break; + } + } + sctns->mac.infmval1 |= MAC_CNT_VLD; +} + +/* Returns a pointer to the the next partition block. */ +static struct diag204_x_part_block *lpar_cpu_inf(struct lpar_cpu_inf *part_inf, + bool this_lpar, + void *diag224_buf, + struct diag204_x_part_block *block) +{ + int i, capped = 0, weight_cp = 0, weight_ifl = 0; + struct cpu_inf *cpu_inf; + + for (i = 0; i < block->hdr.rcpus; i++) { + if (!(block->cpus[i].cflag & DIAG204_CPU_ONLINE)) + continue; + + switch (cpu_id(block->cpus[i].ctidx, diag224_buf)) { + case CP: + cpu_inf = &part_inf->cp; + if (block->cpus[i].cur_weight < DED_WEIGHT) + weight_cp |= block->cpus[i].cur_weight; + break; + case IFL: + cpu_inf = &part_inf->ifl; + if (block->cpus[i].cur_weight < DED_WEIGHT) + weight_ifl |= block->cpus[i].cur_weight; + break; + default: + continue; + } + + if (!this_lpar) + continue; + + capped |= block->cpus[i].cflag & DIAG204_CPU_CAPPED; + cpu_inf->lpar_cap |= block->cpus[i].cpu_type_cap; + cpu_inf->lpar_grp_cap |= block->cpus[i].group_cpu_type_cap; + + if (block->cpus[i].weight == DED_WEIGHT) + cpu_inf->cpu_num_ded += 1; + else + cpu_inf->cpu_num_shd += 1; + } + + if (this_lpar && capped) { + part_inf->cp.lpar_weight = weight_cp; + part_inf->ifl.lpar_weight = weight_ifl; + } + part_inf->cp.all_weight += weight_cp; + part_inf->ifl.all_weight += weight_ifl; + return (struct diag204_x_part_block *)&block->cpus[i]; +} + +static void fill_diag(struct sthyi_sctns *sctns) +{ + int i, r, pages; + bool this_lpar; + void *diag204_buf; + void *diag224_buf = NULL; + struct diag204_x_info_blk_hdr *ti_hdr; + struct diag204_x_part_block *part_block; + struct diag204_x_phys_block *phys_block; + struct lpar_cpu_inf lpar_inf = {}; + + /* Errors are handled through the validity bits in the response. */ + pages = diag204((unsigned long)DIAG204_SUBC_RSI | + (unsigned long)DIAG204_INFO_EXT, 0, NULL); + if (pages <= 0) + return; + + diag204_buf = vmalloc(PAGE_SIZE * pages); + if (!diag204_buf) + return; + + r = diag204((unsigned long)DIAG204_SUBC_STIB7 | + (unsigned long)DIAG204_INFO_EXT, pages, diag204_buf); + if (r < 0) + goto out; + + diag224_buf = (void *)__get_free_page(GFP_KERNEL | GFP_DMA); + if (!diag224_buf || diag224(diag224_buf)) + goto out; + + ti_hdr = diag204_buf; + part_block = diag204_buf + sizeof(*ti_hdr); + + for (i = 0; i < ti_hdr->npar; i++) { + /* + * For the calling lpar we also need to get the cpu + * caps and weights. The time information block header + * specifies the offset to the partition block of the + * caller lpar, so we know when we process its data. + */ + this_lpar = (void *)part_block - diag204_buf == ti_hdr->this_part; + part_block = lpar_cpu_inf(&lpar_inf, this_lpar, diag224_buf, + part_block); + } + + phys_block = (struct diag204_x_phys_block *)part_block; + part_block = diag204_buf + ti_hdr->this_part; + if (part_block->hdr.mtid) + sctns->par.infpflg1 = PAR_MT_EN; + + sctns->par.infpval1 |= PAR_GRP_VLD; + sctns->par.infplgcp = scale_cap(lpar_inf.cp.lpar_grp_cap); + sctns->par.infplgif = scale_cap(lpar_inf.ifl.lpar_grp_cap); + memcpy(sctns->par.infplgnm, part_block->hdr.hardware_group_name, + sizeof(sctns->par.infplgnm)); + + sctns->par.infpscps = lpar_inf.cp.cpu_num_shd; + sctns->par.infpdcps = lpar_inf.cp.cpu_num_ded; + sctns->par.infpsifl = lpar_inf.ifl.cpu_num_shd; + sctns->par.infpdifl = lpar_inf.ifl.cpu_num_ded; + sctns->par.infpval1 |= PAR_PCNT_VLD; + + sctns->par.infpabcp = scale_cap(lpar_inf.cp.lpar_cap); + sctns->par.infpabif = scale_cap(lpar_inf.ifl.lpar_cap); + sctns->par.infpval1 |= PAR_ABS_VLD; + + /* + * Everything below needs global performance data to be + * meaningful. + */ + if (!(ti_hdr->flags & DIAG204_LPAR_PHYS_FLG)) { + sctns->hdr.infhflg1 |= HDR_PERF_UNAV; + goto out; + } + + fill_diag_mac(sctns, phys_block, diag224_buf); + + if (lpar_inf.cp.lpar_weight) { + sctns->par.infpwbcp = sctns->mac.infmscps * 0x10000 * + lpar_inf.cp.lpar_weight / lpar_inf.cp.all_weight; + } + + if (lpar_inf.ifl.lpar_weight) { + sctns->par.infpwbif = sctns->mac.infmsifl * 0x10000 * + lpar_inf.ifl.lpar_weight / lpar_inf.ifl.all_weight; + } + sctns->par.infpval1 |= PAR_WGHT_VLD; + +out: + free_page((unsigned long)diag224_buf); + vfree(diag204_buf); +} + +static int sthyi(u64 vaddr, u64 *rc) +{ + register u64 code asm("0") = 0; + register u64 addr asm("2") = vaddr; + register u64 rcode asm("3"); + int cc; + + asm volatile( + ".insn rre,0xB2560000,%[code],%[addr]\n" + "ipm %[cc]\n" + "srl %[cc],28\n" + : [cc] "=d" (cc), "=d" (rcode) + : [code] "d" (code), [addr] "a" (addr) + : "memory", "cc"); + *rc = rcode; + return cc; +} + +/* + * sthyi_fill - Fill page with data returned by the STHYI instruction + * + * @dst: Pointer to zeroed page + * @rc: Pointer for storing the return code of the instruction + * + * Fills the destination with system information returned by the STHYI + * instruction. The data is generated by emulation or execution of STHYI, + * if available. The return value is the condition code that would be + * returned, the rc parameter is the return code which is passed in + * register R2 + 1. + */ +int sthyi_fill(void *dst, u64 *rc) +{ + struct sthyi_sctns *sctns = (struct sthyi_sctns *)dst; + + /* + * If the facility is on, we don't want to emulate the instruction. + * We ask the hypervisor to provide the data. + */ + if (test_facility(74)) + return sthyi((u64)dst, rc); + + fill_hdr(sctns); + fill_stsi(sctns); + fill_diag(sctns); + + *rc = 0; + return 0; +} +EXPORT_SYMBOL_GPL(sthyi_fill); diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index 09a9e6dfc09f..6048b1c6e580 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -12,6 +12,6 @@ common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/async_pf.o $(KVM)/irqch ccflags-y := -Ivirt/kvm -Iarch/s390/kvm kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o -kvm-objs += diag.o gaccess.o guestdbg.o sthyi.o vsie.o +kvm-objs += diag.o gaccess.o guestdbg.o vsie.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index a4752bf6b526..46adda5e2b2c 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "kvm-s390.h" #include "gaccess.h" @@ -360,6 +361,71 @@ static int handle_partial_execution(struct kvm_vcpu *vcpu) return -EOPNOTSUPP; } +/* + * Handle the sthyi instruction that provides the guest with system + * information, like current CPU resources available at each level of + * the machine. + */ +int handle_sthyi(struct kvm_vcpu *vcpu) +{ + int reg1, reg2, r = 0; + u64 code, addr, cc = 0, rc = 0; + struct sthyi_sctns *sctns = NULL; + + if (!test_kvm_facility(vcpu->kvm, 74)) + return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); + + /* + * STHYI requires extensive locking in the higher hypervisors + * and is very computational/memory expensive. Therefore we + * ratelimit the executions per VM. + */ + if (!__ratelimit(&vcpu->kvm->arch.sthyi_limit)) { + kvm_s390_retry_instr(vcpu); + return 0; + } + + kvm_s390_get_regs_rre(vcpu, ®1, ®2); + code = vcpu->run->s.regs.gprs[reg1]; + addr = vcpu->run->s.regs.gprs[reg2]; + + vcpu->stat.instruction_sthyi++; + VCPU_EVENT(vcpu, 3, "STHYI: fc: %llu addr: 0x%016llx", code, addr); + trace_kvm_s390_handle_sthyi(vcpu, code, addr); + + if (reg1 == reg2 || reg1 & 1 || reg2 & 1) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + if (code & 0xffff) { + cc = 3; + rc = 4; + goto out; + } + + if (addr & ~PAGE_MASK) + return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); + + sctns = (void *)get_zeroed_page(GFP_KERNEL); + if (!sctns) + return -ENOMEM; + + cc = sthyi_fill(sctns, &rc); + +out: + if (!cc) { + r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE); + if (r) { + free_page((unsigned long)sctns); + return kvm_s390_inject_prog_cond(vcpu, r); + } + } + + free_page((unsigned long)sctns); + vcpu->run->s.regs.gprs[reg2 + 1] = rc; + kvm_s390_set_psw_cc(vcpu, cc); + return r; +} + static int handle_operexc(struct kvm_vcpu *vcpu) { psw_t oldpsw, newpsw; diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 9f8fdd7b2311..10d65dfbc306 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -242,6 +242,8 @@ static inline void kvm_s390_retry_instr(struct kvm_vcpu *vcpu) kvm_s390_rewind_psw(vcpu, kvm_s390_get_ilen(vcpu)); } +int handle_sthyi(struct kvm_vcpu *vcpu); + /* implemented in priv.c */ int is_valid_psw(psw_t *psw); int kvm_s390_handle_aa(struct kvm_vcpu *vcpu); @@ -268,9 +270,6 @@ void kvm_s390_vsie_destroy(struct kvm *kvm); int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu); -/* implemented in sthyi.c */ -int handle_sthyi(struct kvm_vcpu *vcpu); - /* implemented in kvm-s390.c */ void kvm_s390_set_tod_clock_ext(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod); diff --git a/arch/s390/kvm/sthyi.c b/arch/s390/kvm/sthyi.c deleted file mode 100644 index 395926b8c1ed..000000000000 --- a/arch/s390/kvm/sthyi.c +++ /dev/null @@ -1,469 +0,0 @@ -/* - * store hypervisor information instruction emulation functions. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2 only) - * as published by the Free Software Foundation. - * - * Copyright IBM Corp. 2016 - * Author(s): Janosch Frank - */ -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "kvm-s390.h" -#include "gaccess.h" -#include "trace.h" - -#define DED_WEIGHT 0xffff -/* - * CP and IFL as EBCDIC strings, SP/0x40 determines the end of string - * as they are justified with spaces. - */ -#define CP 0xc3d7404040404040UL -#define IFL 0xc9c6d34040404040UL - -enum hdr_flags { - HDR_NOT_LPAR = 0x10, - HDR_STACK_INCM = 0x20, - HDR_STSI_UNAV = 0x40, - HDR_PERF_UNAV = 0x80, -}; - -enum mac_validity { - MAC_NAME_VLD = 0x20, - MAC_ID_VLD = 0x40, - MAC_CNT_VLD = 0x80, -}; - -enum par_flag { - PAR_MT_EN = 0x80, -}; - -enum par_validity { - PAR_GRP_VLD = 0x08, - PAR_ID_VLD = 0x10, - PAR_ABS_VLD = 0x20, - PAR_WGHT_VLD = 0x40, - PAR_PCNT_VLD = 0x80, -}; - -struct hdr_sctn { - u8 infhflg1; - u8 infhflg2; /* reserved */ - u8 infhval1; /* reserved */ - u8 infhval2; /* reserved */ - u8 reserved[3]; - u8 infhygct; - u16 infhtotl; - u16 infhdln; - u16 infmoff; - u16 infmlen; - u16 infpoff; - u16 infplen; - u16 infhoff1; - u16 infhlen1; - u16 infgoff1; - u16 infglen1; - u16 infhoff2; - u16 infhlen2; - u16 infgoff2; - u16 infglen2; - u16 infhoff3; - u16 infhlen3; - u16 infgoff3; - u16 infglen3; - u8 reserved2[4]; -} __packed; - -struct mac_sctn { - u8 infmflg1; /* reserved */ - u8 infmflg2; /* reserved */ - u8 infmval1; - u8 infmval2; /* reserved */ - u16 infmscps; - u16 infmdcps; - u16 infmsifl; - u16 infmdifl; - char infmname[8]; - char infmtype[4]; - char infmmanu[16]; - char infmseq[16]; - char infmpman[4]; - u8 reserved[4]; -} __packed; - -struct par_sctn { - u8 infpflg1; - u8 infpflg2; /* reserved */ - u8 infpval1; - u8 infpval2; /* reserved */ - u16 infppnum; - u16 infpscps; - u16 infpdcps; - u16 infpsifl; - u16 infpdifl; - u16 reserved; - char infppnam[8]; - u32 infpwbcp; - u32 infpabcp; - u32 infpwbif; - u32 infpabif; - char infplgnm[8]; - u32 infplgcp; - u32 infplgif; -} __packed; - -struct sthyi_sctns { - struct hdr_sctn hdr; - struct mac_sctn mac; - struct par_sctn par; -} __packed; - -struct cpu_inf { - u64 lpar_cap; - u64 lpar_grp_cap; - u64 lpar_weight; - u64 all_weight; - int cpu_num_ded; - int cpu_num_shd; -}; - -struct lpar_cpu_inf { - struct cpu_inf cp; - struct cpu_inf ifl; -}; - -static inline u64 cpu_id(u8 ctidx, void *diag224_buf) -{ - return *((u64 *)(diag224_buf + (ctidx + 1) * DIAG204_CPU_NAME_LEN)); -} - -/* - * Scales the cpu capping from the lpar range to the one expected in - * sthyi data. - * - * diag204 reports a cap in hundredths of processor units. - * z/VM's range for one core is 0 - 0x10000. - */ -static u32 scale_cap(u32 in) -{ - return (0x10000 * in) / 100; -} - -static void fill_hdr(struct sthyi_sctns *sctns) -{ - sctns->hdr.infhdln = sizeof(sctns->hdr); - sctns->hdr.infmoff = sizeof(sctns->hdr); - sctns->hdr.infmlen = sizeof(sctns->mac); - sctns->hdr.infplen = sizeof(sctns->par); - sctns->hdr.infpoff = sctns->hdr.infhdln + sctns->hdr.infmlen; - sctns->hdr.infhtotl = sctns->hdr.infpoff + sctns->hdr.infplen; -} - -static void fill_stsi_mac(struct sthyi_sctns *sctns, - struct sysinfo_1_1_1 *sysinfo) -{ - if (stsi(sysinfo, 1, 1, 1)) - return; - - sclp_ocf_cpc_name_copy(sctns->mac.infmname); - - memcpy(sctns->mac.infmtype, sysinfo->type, sizeof(sctns->mac.infmtype)); - memcpy(sctns->mac.infmmanu, sysinfo->manufacturer, sizeof(sctns->mac.infmmanu)); - memcpy(sctns->mac.infmpman, sysinfo->plant, sizeof(sctns->mac.infmpman)); - memcpy(sctns->mac.infmseq, sysinfo->sequence, sizeof(sctns->mac.infmseq)); - - sctns->mac.infmval1 |= MAC_ID_VLD | MAC_NAME_VLD; -} - -static void fill_stsi_par(struct sthyi_sctns *sctns, - struct sysinfo_2_2_2 *sysinfo) -{ - if (stsi(sysinfo, 2, 2, 2)) - return; - - sctns->par.infppnum = sysinfo->lpar_number; - memcpy(sctns->par.infppnam, sysinfo->name, sizeof(sctns->par.infppnam)); - - sctns->par.infpval1 |= PAR_ID_VLD; -} - -static void fill_stsi(struct sthyi_sctns *sctns) -{ - void *sysinfo; - - /* Errors are handled through the validity bits in the response. */ - sysinfo = (void *)__get_free_page(GFP_KERNEL); - if (!sysinfo) - return; - - fill_stsi_mac(sctns, sysinfo); - fill_stsi_par(sctns, sysinfo); - - free_pages((unsigned long)sysinfo, 0); -} - -static void fill_diag_mac(struct sthyi_sctns *sctns, - struct diag204_x_phys_block *block, - void *diag224_buf) -{ - int i; - - for (i = 0; i < block->hdr.cpus; i++) { - switch (cpu_id(block->cpus[i].ctidx, diag224_buf)) { - case CP: - if (block->cpus[i].weight == DED_WEIGHT) - sctns->mac.infmdcps++; - else - sctns->mac.infmscps++; - break; - case IFL: - if (block->cpus[i].weight == DED_WEIGHT) - sctns->mac.infmdifl++; - else - sctns->mac.infmsifl++; - break; - } - } - sctns->mac.infmval1 |= MAC_CNT_VLD; -} - -/* Returns a pointer to the the next partition block. */ -static struct diag204_x_part_block *lpar_cpu_inf(struct lpar_cpu_inf *part_inf, - bool this_lpar, - void *diag224_buf, - struct diag204_x_part_block *block) -{ - int i, capped = 0, weight_cp = 0, weight_ifl = 0; - struct cpu_inf *cpu_inf; - - for (i = 0; i < block->hdr.rcpus; i++) { - if (!(block->cpus[i].cflag & DIAG204_CPU_ONLINE)) - continue; - - switch (cpu_id(block->cpus[i].ctidx, diag224_buf)) { - case CP: - cpu_inf = &part_inf->cp; - if (block->cpus[i].cur_weight < DED_WEIGHT) - weight_cp |= block->cpus[i].cur_weight; - break; - case IFL: - cpu_inf = &part_inf->ifl; - if (block->cpus[i].cur_weight < DED_WEIGHT) - weight_ifl |= block->cpus[i].cur_weight; - break; - default: - continue; - } - - if (!this_lpar) - continue; - - capped |= block->cpus[i].cflag & DIAG204_CPU_CAPPED; - cpu_inf->lpar_cap |= block->cpus[i].cpu_type_cap; - cpu_inf->lpar_grp_cap |= block->cpus[i].group_cpu_type_cap; - - if (block->cpus[i].weight == DED_WEIGHT) - cpu_inf->cpu_num_ded += 1; - else - cpu_inf->cpu_num_shd += 1; - } - - if (this_lpar && capped) { - part_inf->cp.lpar_weight = weight_cp; - part_inf->ifl.lpar_weight = weight_ifl; - } - part_inf->cp.all_weight += weight_cp; - part_inf->ifl.all_weight += weight_ifl; - return (struct diag204_x_part_block *)&block->cpus[i]; -} - -static void fill_diag(struct sthyi_sctns *sctns) -{ - int i, r, pages; - bool this_lpar; - void *diag204_buf; - void *diag224_buf = NULL; - struct diag204_x_info_blk_hdr *ti_hdr; - struct diag204_x_part_block *part_block; - struct diag204_x_phys_block *phys_block; - struct lpar_cpu_inf lpar_inf = {}; - - /* Errors are handled through the validity bits in the response. */ - pages = diag204((unsigned long)DIAG204_SUBC_RSI | - (unsigned long)DIAG204_INFO_EXT, 0, NULL); - if (pages <= 0) - return; - - diag204_buf = vmalloc(PAGE_SIZE * pages); - if (!diag204_buf) - return; - - r = diag204((unsigned long)DIAG204_SUBC_STIB7 | - (unsigned long)DIAG204_INFO_EXT, pages, diag204_buf); - if (r < 0) - goto out; - - diag224_buf = (void *)__get_free_page(GFP_KERNEL | GFP_DMA); - if (!diag224_buf || diag224(diag224_buf)) - goto out; - - ti_hdr = diag204_buf; - part_block = diag204_buf + sizeof(*ti_hdr); - - for (i = 0; i < ti_hdr->npar; i++) { - /* - * For the calling lpar we also need to get the cpu - * caps and weights. The time information block header - * specifies the offset to the partition block of the - * caller lpar, so we know when we process its data. - */ - this_lpar = (void *)part_block - diag204_buf == ti_hdr->this_part; - part_block = lpar_cpu_inf(&lpar_inf, this_lpar, diag224_buf, - part_block); - } - - phys_block = (struct diag204_x_phys_block *)part_block; - part_block = diag204_buf + ti_hdr->this_part; - if (part_block->hdr.mtid) - sctns->par.infpflg1 = PAR_MT_EN; - - sctns->par.infpval1 |= PAR_GRP_VLD; - sctns->par.infplgcp = scale_cap(lpar_inf.cp.lpar_grp_cap); - sctns->par.infplgif = scale_cap(lpar_inf.ifl.lpar_grp_cap); - memcpy(sctns->par.infplgnm, part_block->hdr.hardware_group_name, - sizeof(sctns->par.infplgnm)); - - sctns->par.infpscps = lpar_inf.cp.cpu_num_shd; - sctns->par.infpdcps = lpar_inf.cp.cpu_num_ded; - sctns->par.infpsifl = lpar_inf.ifl.cpu_num_shd; - sctns->par.infpdifl = lpar_inf.ifl.cpu_num_ded; - sctns->par.infpval1 |= PAR_PCNT_VLD; - - sctns->par.infpabcp = scale_cap(lpar_inf.cp.lpar_cap); - sctns->par.infpabif = scale_cap(lpar_inf.ifl.lpar_cap); - sctns->par.infpval1 |= PAR_ABS_VLD; - - /* - * Everything below needs global performance data to be - * meaningful. - */ - if (!(ti_hdr->flags & DIAG204_LPAR_PHYS_FLG)) { - sctns->hdr.infhflg1 |= HDR_PERF_UNAV; - goto out; - } - - fill_diag_mac(sctns, phys_block, diag224_buf); - - if (lpar_inf.cp.lpar_weight) { - sctns->par.infpwbcp = sctns->mac.infmscps * 0x10000 * - lpar_inf.cp.lpar_weight / lpar_inf.cp.all_weight; - } - - if (lpar_inf.ifl.lpar_weight) { - sctns->par.infpwbif = sctns->mac.infmsifl * 0x10000 * - lpar_inf.ifl.lpar_weight / lpar_inf.ifl.all_weight; - } - sctns->par.infpval1 |= PAR_WGHT_VLD; - -out: - free_page((unsigned long)diag224_buf); - vfree(diag204_buf); -} - -static int sthyi(u64 vaddr) -{ - register u64 code asm("0") = 0; - register u64 addr asm("2") = vaddr; - int cc; - - asm volatile( - ".insn rre,0xB2560000,%[code],%[addr]\n" - "ipm %[cc]\n" - "srl %[cc],28\n" - : [cc] "=d" (cc) - : [code] "d" (code), [addr] "a" (addr) - : "3", "memory", "cc"); - return cc; -} - -int handle_sthyi(struct kvm_vcpu *vcpu) -{ - int reg1, reg2, r = 0; - u64 code, addr, cc = 0; - struct sthyi_sctns *sctns = NULL; - - if (!test_kvm_facility(vcpu->kvm, 74)) - return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); - - /* - * STHYI requires extensive locking in the higher hypervisors - * and is very computational/memory expensive. Therefore we - * ratelimit the executions per VM. - */ - if (!__ratelimit(&vcpu->kvm->arch.sthyi_limit)) { - kvm_s390_retry_instr(vcpu); - return 0; - } - - kvm_s390_get_regs_rre(vcpu, ®1, ®2); - code = vcpu->run->s.regs.gprs[reg1]; - addr = vcpu->run->s.regs.gprs[reg2]; - - vcpu->stat.instruction_sthyi++; - VCPU_EVENT(vcpu, 3, "STHYI: fc: %llu addr: 0x%016llx", code, addr); - trace_kvm_s390_handle_sthyi(vcpu, code, addr); - - if (reg1 == reg2 || reg1 & 1 || reg2 & 1) - return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - - if (code & 0xffff) { - cc = 3; - goto out; - } - - if (addr & ~PAGE_MASK) - return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - - sctns = (void *)get_zeroed_page(GFP_KERNEL); - if (!sctns) - return -ENOMEM; - - /* - * If we are a guest, we don't want to emulate an emulated - * instruction. We ask the hypervisor to provide the data. - */ - if (test_facility(74)) { - cc = sthyi((u64)sctns); - goto out; - } - - fill_hdr(sctns); - fill_stsi(sctns); - fill_diag(sctns); - -out: - if (!cc) { - r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE); - if (r) { - free_page((unsigned long)sctns); - return kvm_s390_inject_prog_cond(vcpu, r); - } - } - - free_page((unsigned long)sctns); - vcpu->run->s.regs.gprs[reg2 + 1] = cc ? 4 : 0; - kvm_s390_set_psw_cc(vcpu, cc); - return r; -} -- cgit From 9fb6c9b3fea1b1d1c6f14178373e8f7235f3b681 Mon Sep 17 00:00:00 2001 From: QingFeng Hao Date: Fri, 29 Sep 2017 12:41:51 +0200 Subject: s390/sthyi: add cache to store hypervisor info STHYI requires extensive locking in the higher hypervisors and is very computational/memory expensive. Therefore we cache the retrieved hypervisor info whose valid period is 1s with mutex to allow concurrent access. rw semaphore can't benefit here due to cache line bounce. Signed-off-by: QingFeng Hao Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/kvm_host.h | 1 - arch/s390/kernel/sthyi.c | 84 +++++++++++++++++++++++++++++++++------- arch/s390/kvm/intercept.c | 10 ----- arch/s390/kvm/kvm-s390.c | 2 - 4 files changed, 71 insertions(+), 26 deletions(-) (limited to 'arch/s390') diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 51375e766e90..fd006a272024 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -736,7 +736,6 @@ struct kvm_arch{ wait_queue_head_t ipte_wq; int ipte_lock_count; struct mutex ipte_mutex; - struct ratelimit_state sthyi_limit; spinlock_t start_stop_lock; struct sie_page2 *sie_page2; struct kvm_s390_cpu_model model; diff --git a/arch/s390/kernel/sthyi.c b/arch/s390/kernel/sthyi.c index 3d51f86f9dec..27e3c3d87379 100644 --- a/arch/s390/kernel/sthyi.c +++ b/arch/s390/kernel/sthyi.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include @@ -139,6 +138,21 @@ struct lpar_cpu_inf { struct cpu_inf ifl; }; +/* + * STHYI requires extensive locking in the higher hypervisors + * and is very computational/memory expensive. Therefore we + * cache the retrieved data whose valid period is 1s. + */ +#define CACHE_VALID_JIFFIES HZ + +struct sthyi_info { + void *info; + unsigned long end; +}; + +static DEFINE_MUTEX(sthyi_mutex); +static struct sthyi_info sthyi_cache; + static inline u64 cpu_id(u8 ctidx, void *diag224_buf) { return *((u64 *)(diag224_buf + (ctidx + 1) * DIAG204_CPU_NAME_LEN)); @@ -395,6 +409,47 @@ static int sthyi(u64 vaddr, u64 *rc) return cc; } +static int fill_dst(void *dst, u64 *rc) +{ + struct sthyi_sctns *sctns = (struct sthyi_sctns *)dst; + + /* + * If the facility is on, we don't want to emulate the instruction. + * We ask the hypervisor to provide the data. + */ + if (test_facility(74)) + return sthyi((u64)dst, rc); + + fill_hdr(sctns); + fill_stsi(sctns); + fill_diag(sctns); + *rc = 0; + return 0; +} + +static int sthyi_init_cache(void) +{ + if (sthyi_cache.info) + return 0; + sthyi_cache.info = (void *)get_zeroed_page(GFP_KERNEL); + if (!sthyi_cache.info) + return -ENOMEM; + sthyi_cache.end = jiffies - 1; /* expired */ + return 0; +} + +static int sthyi_update_cache(u64 *rc) +{ + int r; + + memset(sthyi_cache.info, 0, PAGE_SIZE); + r = fill_dst(sthyi_cache.info, rc); + if (r) + return r; + sthyi_cache.end = jiffies + CACHE_VALID_JIFFIES; + return r; +} + /* * sthyi_fill - Fill page with data returned by the STHYI instruction * @@ -409,20 +464,23 @@ static int sthyi(u64 vaddr, u64 *rc) */ int sthyi_fill(void *dst, u64 *rc) { - struct sthyi_sctns *sctns = (struct sthyi_sctns *)dst; - - /* - * If the facility is on, we don't want to emulate the instruction. - * We ask the hypervisor to provide the data. - */ - if (test_facility(74)) - return sthyi((u64)dst, rc); + int r; - fill_hdr(sctns); - fill_stsi(sctns); - fill_diag(sctns); + mutex_lock(&sthyi_mutex); + r = sthyi_init_cache(); + if (r) + goto out; + if (time_is_before_jiffies(sthyi_cache.end)) { + /* cache expired */ + r = sthyi_update_cache(rc); + if (r) + goto out; + } *rc = 0; - return 0; + memcpy(dst, sthyi_cache.info, PAGE_SIZE); +out: + mutex_unlock(&sthyi_mutex); + return r; } EXPORT_SYMBOL_GPL(sthyi_fill); diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 46adda5e2b2c..8fe034beb623 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -375,16 +375,6 @@ int handle_sthyi(struct kvm_vcpu *vcpu) if (!test_kvm_facility(vcpu->kvm, 74)) return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); - /* - * STHYI requires extensive locking in the higher hypervisors - * and is very computational/memory expensive. Therefore we - * ratelimit the executions per VM. - */ - if (!__ratelimit(&vcpu->kvm->arch.sthyi_limit)) { - kvm_s390_retry_instr(vcpu); - return 0; - } - kvm_s390_get_regs_rre(vcpu, ®1, ®2); code = vcpu->run->s.regs.gprs[reg1]; addr = vcpu->run->s.regs.gprs[reg2]; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 40d0a1a97889..de6a5b790da0 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1884,8 +1884,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) rc = -ENOMEM; - ratelimit_state_init(&kvm->arch.sthyi_limit, 5 * HZ, 500); - kvm->arch.use_esca = 0; /* start with basic SCA */ if (!sclp.has_64bscao) alloc_flags |= GFP_DMA; -- cgit From 3d8757b87d7fc15a87928bc970f060bc9c6dc618 Mon Sep 17 00:00:00 2001 From: QingFeng Hao Date: Fri, 29 Sep 2017 12:41:52 +0200 Subject: s390/sthyi: add s390_sthyi system call Add a syscall of s390_sthyi to implement STHYI instruction in LPAR which reuses the implementation for KVM by Janosch Frank - commit 95ca2cb57985 ("KVM: s390: Add sthyi emulation"). STHYI(Store Hypervisor Information) is an emulated z/VM instruction that provides a guest with basic information about the layers it is running on. This includes information about the cpu configuration of both the machine and the lpar, as well as their names, machine model and machine type. This information enables an application to determine the maximum capacity of CPs and IFLs available to software. For the arguments of s390_sthyi, code shall be 0 and flags is reserved for future use, info is the output argument to store the required hypervisor info. Signed-off-by: QingFeng Hao Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/uapi/asm/sthyi.h | 6 ++++++ arch/s390/include/uapi/asm/unistd.h | 3 ++- arch/s390/kernel/compat_wrapper.c | 1 + arch/s390/kernel/entry.h | 1 + arch/s390/kernel/sthyi.c | 33 ++++++++++++++++++++++++++++++++- arch/s390/kernel/syscalls.S | 1 + 6 files changed, 43 insertions(+), 2 deletions(-) create mode 100644 arch/s390/include/uapi/asm/sthyi.h (limited to 'arch/s390') diff --git a/arch/s390/include/uapi/asm/sthyi.h b/arch/s390/include/uapi/asm/sthyi.h new file mode 100644 index 000000000000..ec113db4eb7e --- /dev/null +++ b/arch/s390/include/uapi/asm/sthyi.h @@ -0,0 +1,6 @@ +#ifndef _UAPI_ASM_STHYI_H +#define _UAPI_ASM_STHYI_H + +#define STHYI_FC_CP_IFL_CAP 0 + +#endif /* _UAPI_ASM_STHYI_H */ diff --git a/arch/s390/include/uapi/asm/unistd.h b/arch/s390/include/uapi/asm/unistd.h index ea42290e7d51..61c64f543769 100644 --- a/arch/s390/include/uapi/asm/unistd.h +++ b/arch/s390/include/uapi/asm/unistd.h @@ -315,7 +315,8 @@ #define __NR_pwritev2 377 #define __NR_s390_guarded_storage 378 #define __NR_statx 379 -#define NR_syscalls 380 +#define __NR_s390_sthyi 380 +#define NR_syscalls 381 /* * There are some system calls that are not present on 64 bit, some diff --git a/arch/s390/kernel/compat_wrapper.c b/arch/s390/kernel/compat_wrapper.c index 986642a3543b..eb0b17ed95b6 100644 --- a/arch/s390/kernel/compat_wrapper.c +++ b/arch/s390/kernel/compat_wrapper.c @@ -180,3 +180,4 @@ COMPAT_SYSCALL_WRAP3(mlock2, unsigned long, start, size_t, len, int, flags); COMPAT_SYSCALL_WRAP6(copy_file_range, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, size_t, len, unsigned int, flags); COMPAT_SYSCALL_WRAP2(s390_guarded_storage, int, command, struct gs_cb *, gs_cb); COMPAT_SYSCALL_WRAP5(statx, int, dfd, const char __user *, path, unsigned, flags, unsigned, mask, struct statx __user *, buffer); +COMPAT_SYSCALL_WRAP4(s390_sthyi, unsigned long, code, void __user *, info, u64 __user *, rc, unsigned long, flags); diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h index dbf5f7e18246..bb5301eeb4f4 100644 --- a/arch/s390/kernel/entry.h +++ b/arch/s390/kernel/entry.h @@ -77,6 +77,7 @@ long sys_s390_runtime_instr(int command, int signum); long sys_s390_guarded_storage(int command, struct gs_cb __user *); long sys_s390_pci_mmio_write(unsigned long, const void __user *, size_t); long sys_s390_pci_mmio_read(unsigned long, void __user *, size_t); +long sys_s390_sthyi(unsigned long function_code, void __user *buffer, u64 __user *return_code, unsigned long flags); DECLARE_PER_CPU(u64, mt_cycles[8]); diff --git a/arch/s390/kernel/sthyi.c b/arch/s390/kernel/sthyi.c index 27e3c3d87379..12981e197f01 100644 --- a/arch/s390/kernel/sthyi.c +++ b/arch/s390/kernel/sthyi.c @@ -11,13 +11,16 @@ #include #include #include - +#include +#include #include #include #include #include #include #include +#include +#include "entry.h" #define DED_WEIGHT 0xffff /* @@ -484,3 +487,31 @@ out: return r; } EXPORT_SYMBOL_GPL(sthyi_fill); + +SYSCALL_DEFINE4(s390_sthyi, unsigned long, function_code, void __user *, buffer, + u64 __user *, return_code, unsigned long, flags) +{ + u64 sthyi_rc; + void *info; + int r; + + if (flags) + return -EINVAL; + if (function_code != STHYI_FC_CP_IFL_CAP) + return -EOPNOTSUPP; + info = (void *)get_zeroed_page(GFP_KERNEL); + if (!info) + return -ENOMEM; + r = sthyi_fill(info, &sthyi_rc); + if (r < 0) + goto out; + if (return_code && put_user(sthyi_rc, return_code)) { + r = -EFAULT; + goto out; + } + if (copy_to_user(buffer, info, PAGE_SIZE)) + r = -EFAULT; +out: + free_page((unsigned long)info); + return r; +} diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S index 54fce7b065de..0fb407ebbf46 100644 --- a/arch/s390/kernel/syscalls.S +++ b/arch/s390/kernel/syscalls.S @@ -388,3 +388,4 @@ SYSCALL(sys_preadv2,compat_sys_preadv2) SYSCALL(sys_pwritev2,compat_sys_pwritev2) SYSCALL(sys_s390_guarded_storage,compat_sys_s390_guarded_storage) /* 378 */ SYSCALL(sys_statx,compat_sys_statx) +SYSCALL(sys_s390_sthyi,compat_sys_s390_sthyi) -- cgit From 72e1ad4200d5ed5c5adf120b77fb2900a29a48e5 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 19 Sep 2017 12:34:06 +0200 Subject: KVM: s390: document memory ordering for kvm_s390_vcpu_wakeup swait_active does not enforce any ordering and it can therefore trigger some subtle races when the CPU moves the read for the check before a previous store and that store is then used on another CPU that is preparing the swait. On s390 there is a call to swait_active in kvm_s390_vcpu_wakeup. The good thing is, on s390 all potential races cannot happen because all callers of kvm_s390_vcpu_wakeup do not store (no race) or use an atomic operation, which handles memory ordering. Since this is not guaranteed by the Linux semantics (but by the implementation on s390) let's add smp_mb_after_atomic to make this obvious and document the ordering. Suggested-by: Paolo Bonzini Acked-by: Halil Pasic Reviewed-by: Cornelia Huck Reviewed-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/s390') diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index a832ad031cee..23d8fb25c5dd 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -1074,6 +1074,12 @@ void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu) * in kvm_vcpu_block without having the waitqueue set (polling) */ vcpu->valid_wakeup = true; + /* + * This is mostly to document, that the read in swait_active could + * be moved before other stores, leading to subtle races. + * All current users do not store or use an atomic like update + */ + smp_mb__after_atomic(); if (swait_active(&vcpu->wq)) { /* * The vcpu gave up the cpu voluntarily, mark it as a good -- cgit From ba850a8e64fbbd5f6407d5931124d00ced0528cc Mon Sep 17 00:00:00 2001 From: Tony Krowiak Date: Fri, 13 Oct 2017 13:38:46 -0400 Subject: KVM: s390: SIE considerations for AP Queue virtualization The Crypto Control Block (CRYCB) is referenced by the SIE state description and controls KVM guest access to the Adjunct Processor (AP) adapters, usage domains and control domains. This patch defines the AP control blocks to be used for controlling guest access to the AP adapters and domains. Signed-off-by: Tony Krowiak Message-Id: <1507916344-3896-2-git-send-email-akrowiak@linux.vnet.ibm.com> Acked-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'arch/s390') diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index fd006a272024..f3a9b5a445b6 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -685,11 +685,28 @@ struct kvm_s390_crypto { __u8 dea_kw; }; +#define APCB0_MASK_SIZE 1 +struct kvm_s390_apcb0 { + __u64 apm[APCB0_MASK_SIZE]; /* 0x0000 */ + __u64 aqm[APCB0_MASK_SIZE]; /* 0x0008 */ + __u64 adm[APCB0_MASK_SIZE]; /* 0x0010 */ + __u64 reserved18; /* 0x0018 */ +}; + +#define APCB1_MASK_SIZE 4 +struct kvm_s390_apcb1 { + __u64 apm[APCB1_MASK_SIZE]; /* 0x0000 */ + __u64 aqm[APCB1_MASK_SIZE]; /* 0x0020 */ + __u64 adm[APCB1_MASK_SIZE]; /* 0x0040 */ + __u64 reserved60[4]; /* 0x0060 */ +}; + struct kvm_s390_crypto_cb { - __u8 reserved00[72]; /* 0x0000 */ - __u8 dea_wrapping_key_mask[24]; /* 0x0048 */ - __u8 aes_wrapping_key_mask[32]; /* 0x0060 */ - __u8 reserved80[128]; /* 0x0080 */ + struct kvm_s390_apcb0 apcb0; /* 0x0000 */ + __u8 reserved20[0x0048 - 0x0020]; /* 0x0020 */ + __u8 dea_wrapping_key_mask[24]; /* 0x0048 */ + __u8 aes_wrapping_key_mask[32]; /* 0x0060 */ + struct kvm_s390_apcb1 apcb1; /* 0x0080 */ }; /* -- cgit From f7a6509fe002e3909cb41c09e807b7f3ca4a361b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 1 Sep 2017 17:11:43 +0200 Subject: KVM: s390: vsie: use common code functions for pinning We will not see -ENOMEM (gfn_to_hva() will return KVM_ERR_PTR_BAD_PAGE for all errors). So we can also get rid of special handling in the callers of pin_guest_page() and always assume that it is a g2 error. As also kvm_s390_inject_program_int() should never fail, we can simplify pin_scb(), too. Signed-off-by: David Hildenbrand Message-Id: <20170901151143.22714-1-david@redhat.com> Acked-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/kvm/vsie.c | 50 ++++++++++++++++++-------------------------------- 1 file changed, 18 insertions(+), 32 deletions(-) (limited to 'arch/s390') diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index b18b5652e5c5..a311938b63b3 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -443,22 +443,14 @@ static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) * * Returns: - 0 on success * - -EINVAL if the gpa is not valid guest storage - * - -ENOMEM if out of memory */ static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa) { struct page *page; - hva_t hva; - int rc; - hva = gfn_to_hva(kvm, gpa_to_gfn(gpa)); - if (kvm_is_error_hva(hva)) + page = gfn_to_page(kvm, gpa_to_gfn(gpa)); + if (is_error_page(page)) return -EINVAL; - rc = get_user_pages_fast(hva, 1, 1, &page); - if (rc < 0) - return rc; - else if (rc != 1) - return -ENOMEM; *hpa = (hpa_t) page_to_virt(page) + (gpa & ~PAGE_MASK); return 0; } @@ -466,11 +458,7 @@ static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa) /* Unpins a page previously pinned via pin_guest_page, marking it as dirty. */ static void unpin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t hpa) { - struct page *page; - - page = virt_to_page(hpa); - set_page_dirty_lock(page); - put_page(page); + kvm_release_pfn_dirty(hpa >> PAGE_SHIFT); /* mark the page always as dirty for migration */ mark_page_dirty(kvm, gpa_to_gfn(gpa)); } @@ -557,7 +545,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) rc = set_validity_icpt(scb_s, 0x003bU); if (!rc) { rc = pin_guest_page(vcpu->kvm, gpa, &hpa); - if (rc == -EINVAL) + if (rc) rc = set_validity_icpt(scb_s, 0x0034U); } if (rc) @@ -574,10 +562,10 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) } /* 256 bytes cannot cross page boundaries */ rc = pin_guest_page(vcpu->kvm, gpa, &hpa); - if (rc == -EINVAL) + if (rc) { rc = set_validity_icpt(scb_s, 0x0080U); - if (rc) goto unpin; + } scb_s->itdba = hpa; } @@ -592,10 +580,10 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) * if this block gets bigger, we have to shadow it. */ rc = pin_guest_page(vcpu->kvm, gpa, &hpa); - if (rc == -EINVAL) + if (rc) { rc = set_validity_icpt(scb_s, 0x1310U); - if (rc) goto unpin; + } scb_s->gvrd = hpa; } @@ -607,11 +595,11 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) } /* 64 bytes cannot cross page boundaries */ rc = pin_guest_page(vcpu->kvm, gpa, &hpa); - if (rc == -EINVAL) + if (rc) { rc = set_validity_icpt(scb_s, 0x0043U); - /* Validity 0x0044 will be checked by SIE */ - if (rc) goto unpin; + } + /* Validity 0x0044 will be checked by SIE */ scb_s->riccbd = hpa; } if ((scb_s->ecb & ECB_GS) && !(scb_s->ecd & ECD_HOSTREGMGMT)) { @@ -635,10 +623,10 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) * cross page boundaries */ rc = pin_guest_page(vcpu->kvm, gpa, &hpa); - if (rc == -EINVAL) + if (rc) { rc = set_validity_icpt(scb_s, 0x10b0U); - if (rc) goto unpin; + } scb_s->sdnxo = hpa | sdnxc; } return 0; @@ -663,7 +651,6 @@ static void unpin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, * * Returns: - 0 if the scb was pinned. * - > 0 if control has to be given to guest 2 - * - -ENOMEM if out of memory */ static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, gpa_t gpa) @@ -672,14 +659,13 @@ static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, int rc; rc = pin_guest_page(vcpu->kvm, gpa, &hpa); - if (rc == -EINVAL) { + if (rc) { rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); - if (!rc) - rc = 1; + WARN_ON_ONCE(rc); + return 1; } - if (!rc) - vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa; - return rc; + vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa; + return 0; } /* -- cgit From ee739f4b216e9394281cf99e6d93c67bdf4a37d2 Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Mon, 3 Jul 2017 15:32:50 +0200 Subject: KVM: s390: abstract conversion between isc and enum irq_types The abstraction of the conversion between an isc value and an irq_type by means of functions isc_to_irq_type() and irq_type_to_isc() allows to clarify the respective operations where used. Signed-off-by: Michael Mueller Reviewed-by: Halil Pasic Reviewed-by: Pierre Morel Reviewed-by: Christian Borntraeger Reviewed-by: Cornelia Huck Reviewed-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'arch/s390') diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 23d8fb25c5dd..a3da4f3065aa 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -213,6 +213,16 @@ static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu) vcpu->arch.local_int.pending_irqs; } +static inline int isc_to_irq_type(unsigned long isc) +{ + return IRQ_PEND_IO_ISC_0 + isc; +} + +static inline int irq_type_to_isc(unsigned long irq_type) +{ + return irq_type - IRQ_PEND_IO_ISC_0; +} + static unsigned long disable_iscs(struct kvm_vcpu *vcpu, unsigned long active_mask) { @@ -220,7 +230,7 @@ static unsigned long disable_iscs(struct kvm_vcpu *vcpu, for (i = 0; i <= MAX_ISC; i++) if (!(vcpu->arch.sie_block->gcr[6] & isc_to_isc_bits(i))) - active_mask &= ~(1UL << (IRQ_PEND_IO_ISC_0 + i)); + active_mask &= ~(1UL << (isc_to_irq_type(i))); return active_mask; } @@ -901,7 +911,7 @@ static int __must_check __deliver_io(struct kvm_vcpu *vcpu, fi = &vcpu->kvm->arch.float_int; spin_lock(&fi->lock); - isc_list = &fi->lists[irq_type - IRQ_PEND_IO_ISC_0]; + isc_list = &fi->lists[irq_type_to_isc(irq_type)]; inti = list_first_entry_or_null(isc_list, struct kvm_s390_interrupt_info, list); @@ -1401,7 +1411,7 @@ static struct kvm_s390_interrupt_info *get_io_int(struct kvm *kvm, list_del_init(&iter->list); fi->counters[FIRQ_CNTR_IO] -= 1; if (list_empty(isc_list)) - clear_bit(IRQ_PEND_IO_ISC_0 + isc, &fi->pending_irqs); + clear_bit(isc_to_irq_type(isc), &fi->pending_irqs); spin_unlock(&fi->lock); return iter; } @@ -1528,7 +1538,7 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) isc = int_word_to_isc(inti->io.io_int_word); list = &fi->lists[FIRQ_LIST_IO_ISC_0 + isc]; list_add_tail(&inti->list, list); - set_bit(IRQ_PEND_IO_ISC_0 + isc, &fi->pending_irqs); + set_bit(isc_to_irq_type(isc), &fi->pending_irqs); spin_unlock(&fi->lock); return 0; } -- cgit From 4dd6f17eb913d3d23dd6c07950627ac2c3068dca Mon Sep 17 00:00:00 2001 From: Michael Mueller Date: Thu, 6 Jul 2017 14:22:20 +0200 Subject: KVM: s390: clear_io_irq() requests are not expected for adapter interrupts There is a chance to delete not yet delivered I/O interrupts if an exploiter uses the subsystem identification word 0x0000 while processing a KVM_DEV_FLIC_CLEAR_IO_IRQ ioctl. -EINVAL will be returned now instead in that case. Classic interrupts will always have bit 0x10000 set in the schid while adapter interrupts have a zero schid. The clear_io_irq interface is only useful for classic interrupts (as adapter interrupts belong to many devices). Let's make this interface more strict and forbid a schid of 0. Signed-off-by: Michael Mueller Reviewed-by: Halil Pasic Reviewed-by: Christian Borntraeger Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/s390') diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index a3da4f3065aa..c8aacced23fb 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2191,6 +2191,8 @@ static int clear_io_irq(struct kvm *kvm, struct kvm_device_attr *attr) return -EINVAL; if (copy_from_user(&schid, (void __user *) attr->addr, sizeof(schid))) return -EFAULT; + if (!schid) + return -EINVAL; kfree(kvm_s390_get_io_int(kvm, isc_mask, schid)); /* * If userspace is conforming to the architecture, we can have at most -- cgit From da9a1446d248f673a8560ce46251ff620214ab7b Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 9 Nov 2017 10:00:45 +0100 Subject: KVM: s390: provide a capability for AIS state migration The AIS capability was introduced in 4.12, while the interface to migrate the state was added in 4.13. Unfortunately it is not possible for userspace to detect the migration capability without creating a flic kvm device. As in QEMU the cpu model detection runs on the "none" machine this will result in cpu model issues regarding the "ais" capability. To get the "ais" capability properly let's add a new KVM capability that tells userspace that AIS states can be migrated. Signed-off-by: Christian Borntraeger Reviewed-by: Cornelia Huck Reviewed-by: David Hildenbrand Acked-by: Halil Pasic --- arch/s390/kvm/kvm-s390.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/s390') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index de6a5b790da0..8f4b655f65d7 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -395,6 +395,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_USER_INSTR0: case KVM_CAP_S390_CMMA_MIGRATION: case KVM_CAP_S390_AIS: + case KVM_CAP_S390_AIS_MIGRATION: r = 1; break; case KVM_CAP_S390_MEM_OP: -- cgit