diff options
Diffstat (limited to 'arch/x86')
460 files changed, 13814 insertions, 9143 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index 30dec019756b..f384cb1a4f7a 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -25,3 +25,6 @@ obj-y += platform/ obj-y += net/ obj-$(CONFIG_KEXEC_FILE) += purgatory/ + +# for cleaning +subdir- += boot tools diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ab83c22d274e..407533c835fe 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -61,8 +61,9 @@ config X86 select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI select ARCH_32BIT_OFF_T if X86_32 select ARCH_CLOCKSOURCE_INIT + select ARCH_CORRECT_STACKTRACE_ON_KRETPROBE select ARCH_ENABLE_HUGEPAGE_MIGRATION if X86_64 && HUGETLB_PAGE && MIGRATION - select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64 || (X86_32 && HIGHMEM) + select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64 select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG select ARCH_ENABLE_SPLIT_PMD_PTLOCK if (PGTABLE_LEVELS > 2) && (X86_64 || X86_PAE) select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE @@ -103,6 +104,7 @@ config X86 select ARCH_SUPPORTS_ACPI select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_DEBUG_PAGEALLOC + select ARCH_SUPPORTS_PAGE_TABLE_CHECK if X86_64 select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096 select ARCH_SUPPORTS_LTO_CLANG @@ -125,6 +127,7 @@ config X86 select CLOCKSOURCE_VALIDATE_LAST_CYCLE select CLOCKSOURCE_WATCHDOG select DCACHE_WORD_ACCESS + select DYNAMIC_SIGFRAME select EDAC_ATOMIC_SCRUB select EDAC_SUPPORT select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC) @@ -190,6 +193,8 @@ config X86 select HAVE_DYNAMIC_FTRACE_WITH_REGS select HAVE_DYNAMIC_FTRACE_WITH_ARGS if X86_64 select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + select HAVE_SAMPLE_FTRACE_DIRECT if X86_64 + select HAVE_SAMPLE_FTRACE_DIRECT_MULTI if X86_64 select HAVE_EBPF_JIT select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_EISA @@ -197,7 +202,7 @@ config X86 select HAVE_FAST_GUP select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD - select HAVE_FUNCTION_GRAPH_TRACER + select HAVE_FUNCTION_GRAPH_TRACER if X86_32 || (X86_64 && DYNAMIC_FTRACE) select HAVE_FUNCTION_TRACER select HAVE_GCC_PLUGINS select HAVE_HW_BREAKPOINT @@ -265,6 +270,7 @@ config X86 select HAVE_ARCH_KCSAN if X86_64 select X86_FEATURE_NAMES if PROC_FS select PROC_PID_ARCH_STATUS if PROC_FS + select HAVE_ARCH_NODE_DEV_GROUP if X86_SGX imply IMA_SECURE_AND_OR_TRUSTED_BOOT if EFI config INSTRUCTION_DECODER @@ -468,6 +474,18 @@ config RETPOLINE branches. Requires a compiler with -mindirect-branch=thunk-extern support for full protection. The kernel may run slower. +config CC_HAS_SLS + def_bool $(cc-option,-mharden-sls=all) + +config SLS + bool "Mitigate Straight-Line-Speculation" + depends on CC_HAS_SLS && X86_64 + default n + help + Compile the kernel with straight-line-speculation options to guard + against straight line speculation. The kernel image might be slightly + larger. + config X86_CPU_RESCTRL bool "x86 CPU resource control support" depends on X86 && (CPU_SUP_INTEL || CPU_SUP_AMD) @@ -610,9 +628,7 @@ config X86_INTEL_MID depends on X86_IO_APIC select I2C select DW_APB_TIMER - select APB_TIMER select INTEL_SCU_PCI - select MFD_INTEL_MSIC help Select to build a kernel capable of supporting Intel MID (Mobile Internet Device) platform systems which do not have the PCI legacy @@ -1001,6 +1017,17 @@ config NR_CPUS This is purely to save memory: each supported CPU adds about 8KB to the kernel image. +config SCHED_CLUSTER + bool "Cluster scheduler support" + depends on SMP + default y + help + Cluster scheduler support improves the CPU scheduler's decision + making when dealing with machines that have clusters of CPUs. + Cluster usually means a couple of CPUs which are placed closely + by sharing mid-level caches, last-level cache tags or internal + busses. + config SCHED_SMT def_bool y if SMP @@ -1256,7 +1283,8 @@ config TOSHIBA config I8K tristate "Dell i8k legacy laptop support" - select HWMON + depends on HWMON + depends on PROC_FS select SENSORS_DELL_SMM help This option enables legacy /proc/i8k userspace interface in hwmon @@ -1405,7 +1433,7 @@ config HIGHMEM4G config HIGHMEM64G bool "64GB" - depends on !M486SX && !M486 && !M586 && !M586TSC && !M586MMX && !MGEODE_LX && !MGEODEGX1 && !MCYRIXIII && !MELAN && !MWINCHIPC6 && !WINCHIP3D && !MK6 + depends on !M486SX && !M486 && !M586 && !M586TSC && !M586MMX && !MGEODE_LX && !MGEODEGX1 && !MCYRIXIII && !MELAN && !MWINCHIPC6 && !MWINCHIP3D && !MK6 select X86_PAE help Select this if you have a 32-bit processor and more than 4 @@ -1509,15 +1537,20 @@ config X86_CPA_STATISTICS helps to determine the effectiveness of preserving large and huge page mappings when mapping protections are changed. +config X86_MEM_ENCRYPT + select ARCH_HAS_FORCE_DMA_UNENCRYPTED + select DYNAMIC_PHYSICAL_MASK + select ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS + def_bool n + config AMD_MEM_ENCRYPT bool "AMD Secure Memory Encryption (SME) support" depends on X86_64 && CPU_SUP_AMD select DMA_COHERENT_POOL - select DYNAMIC_PHYSICAL_MASK select ARCH_USE_MEMREMAP_PROT - select ARCH_HAS_FORCE_DMA_UNENCRYPTED select INSTRUCTION_DECODER - select ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS + select ARCH_HAS_CC_PLATFORM + select X86_MEM_ENCRYPT help Say yes to enable support for the encryption of system memory. This requires an AMD processor that supports Secure Memory @@ -1525,7 +1558,6 @@ config AMD_MEM_ENCRYPT config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT bool "Activate AMD Secure Memory Encryption (SME) by default" - default y depends on AMD_MEM_ENCRYPT help Say yes to have system memory encrypted by default if running on @@ -1615,7 +1647,7 @@ config ARCH_SELECT_MEMORY_MODEL config ARCH_MEMORY_PROBE bool "Enable sysfs memory/probe interface" - depends on X86_64 && MEMORY_HOTPLUG + depends on MEMORY_HOTPLUG help This option enables a sysfs memory/probe interface for testing. See Documentation/admin-guide/mm/memory-hotplug.rst for more information. @@ -1903,6 +1935,7 @@ config X86_SGX select SRCU select MMU_NOTIFIER select NUMA_KEEP_MEMINFO if NUMA + select XARRAY_MULTI help Intel(R) Software Guard eXtensions (SGX) is a set of CPU instructions that can be used by applications to set aside private regions of code @@ -1918,6 +1951,7 @@ config EFI depends on ACPI select UCS2_STRING select EFI_RUNTIME_WRAPPERS + select ARCH_USE_MEMREMAP_PROT help This enables the kernel to use EFI runtime services that are available (such as the EFI variable services). @@ -1931,7 +1965,7 @@ config EFI config EFI_STUB bool "EFI stub support" - depends on EFI && !X86_USE_3DNOW + depends on EFI depends on $(cc-option,-mabi=ms) || X86_32 select RELOCATABLE help @@ -2389,13 +2423,29 @@ config MODIFY_LDT_SYSCALL Saying 'N' here may make sense for embedded or server kernels. +config STRICT_SIGALTSTACK_SIZE + bool "Enforce strict size checking for sigaltstack" + depends on DYNAMIC_SIGFRAME + help + For historical reasons MINSIGSTKSZ is a constant which became + already too small with AVX512 support. Add a mechanism to + enforce strict checking of the sigaltstack size against the + real size of the FPU frame. This option enables the check + by default. It can also be controlled via the kernel command + line option 'strict_sas_size' independent of this config + switch. Enabling it might break existing applications which + allocate a too small sigaltstack but 'work' because they + never get a signal delivered. + + Say 'N' unless you want to really enforce this check. + source "kernel/livepatch/Kconfig" endmenu config ARCH_HAS_ADD_PAGES def_bool y - depends on X86_64 && ARCH_ENABLE_MEMORY_HOTPLUG + depends on ARCH_ENABLE_MEMORY_HOTPLUG config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE def_bool y @@ -2832,8 +2882,6 @@ config HAVE_ATOMIC_IOMAP def_bool y depends on X86_32 -source "drivers/firmware/Kconfig" - source "arch/x86/kvm/Kconfig" source "arch/x86/Kconfig.assembler" diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 814fe0d349b0..542377cd419d 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -342,10 +342,6 @@ config X86_USE_PPRO_CHECKSUM def_bool y depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM -config X86_USE_3DNOW - def_bool y - depends on (MCYRIXIII || MK7 || MGEODE_LX) && !UML - # # P6_NOPs are a relatively minor optimization that require a family >= # 6 processor, except that it is broken on certain VIA chips. @@ -508,3 +504,16 @@ config CPU_SUP_ZHAOXIN CPU might render the kernel unbootable. If unsure, say N. + +config CPU_SUP_VORTEX_32 + default y + bool "Support Vortex processors" if PROCESSOR_SELECT + depends on X86_32 + help + This enables detection, tunings and quirks for Vortex processors + + You need this enabled if you want your kernel to run on a + Vortex CPU. Disabling this option on other types of CPUs + makes the kernel a tiny bit smaller. + + If unsure, say N. diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 7488cfbbd2f6..e84cdd409b64 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -12,6 +12,18 @@ else KBUILD_DEFCONFIG := $(ARCH)_defconfig endif +ifdef CONFIG_CC_IS_GCC +RETPOLINE_CFLAGS := $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register) +RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch-cs-prefix) +RETPOLINE_VDSO_CFLAGS := $(call cc-option,-mindirect-branch=thunk-inline -mindirect-branch-register) +endif +ifdef CONFIG_CC_IS_CLANG +RETPOLINE_CFLAGS := -mretpoline-external-thunk +RETPOLINE_VDSO_CFLAGS := -mretpoline +endif +export RETPOLINE_CFLAGS +export RETPOLINE_VDSO_CFLAGS + # For gcc stack alignment is specified with -mpreferred-stack-boundary, # clang has the option -mstack-alignment for that purpose. ifneq ($(call cc-option, -mpreferred-stack-boundary=4),) @@ -179,6 +191,10 @@ ifdef CONFIG_RETPOLINE endif endif +ifdef CONFIG_SLS + KBUILD_CFLAGS += -mharden-sls=all +endif + KBUILD_LDFLAGS += -m elf_$(UTS_MACHINE) ifdef CONFIG_LTO_CLANG @@ -283,8 +299,6 @@ endif archclean: $(Q)rm -rf $(objtree)/arch/i386 $(Q)rm -rf $(objtree)/arch/x86_64 - $(Q)$(MAKE) $(clean)=$(boot) - $(Q)$(MAKE) $(clean)=arch/x86/tools define archhelp echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)' @@ -299,7 +313,7 @@ define archhelp echo ' isoimage - Create a boot CD-ROM image (arch/x86/boot/image.iso)' echo ' bzdisk/fdimage*/hdimage/isoimage also accept:' echo ' FDARGS="..." arguments for the booted kernel' - echo ' FDINITRD=file initrd for the booted kernel' + echo ' FDINITRD=file initrd for the booted kernel' echo '' echo ' kvm_guest.config - Enable Kconfig items for running this kernel as a KVM guest' echo ' xen.config - Enable Kconfig items for running this kernel as a Xen guest' diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 431bf7f846c3..e11813646051 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -28,7 +28,11 @@ KCOV_INSTRUMENT := n targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 vmlinux.bin.zst -KBUILD_CFLAGS := -m$(BITS) -O2 +# CLANG_FLAGS must come before any cc-disable-warning or cc-option calls in +# case of cross compiling, as it has the '--target=' flag, which is needed to +# avoid errors with '-march=i386', and future flags may depend on the target to +# be valid. +KBUILD_CFLAGS := -m$(BITS) -O2 $(CLANG_FLAGS) KBUILD_CFLAGS += -fno-strict-aliasing -fPIE KBUILD_CFLAGS += -Wundef KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING @@ -47,7 +51,6 @@ KBUILD_CFLAGS += -D__DISABLE_EXPORTS # Disable relocation relaxation in case the link is not PIE. KBUILD_CFLAGS += $(call as-option,-Wa$(comma)-mrelax-relocations=no) KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h -KBUILD_CFLAGS += $(CLANG_FLAGS) # sev.c indirectly inludes inat-table.h which is generated during # compilation and stored in $(objtree). Add the directory to the includes so diff --git a/arch/x86/boot/compressed/efi_thunk_64.S b/arch/x86/boot/compressed/efi_thunk_64.S index 8bb92e9f4e97..67e7edcdfea8 100644 --- a/arch/x86/boot/compressed/efi_thunk_64.S +++ b/arch/x86/boot/compressed/efi_thunk_64.S @@ -26,8 +26,6 @@ SYM_FUNC_START(__efi64_thunk) push %rbp push %rbx - leaq 1f(%rip), %rbp - movl %ds, %eax push %rax movl %es, %eax @@ -35,6 +33,11 @@ SYM_FUNC_START(__efi64_thunk) movl %ss, %eax push %rax + /* Copy args passed on stack */ + movq 0x30(%rsp), %rbp + movq 0x38(%rsp), %rbx + movq 0x40(%rsp), %rax + /* * Convert x86-64 ABI params to i386 ABI */ @@ -44,13 +47,18 @@ SYM_FUNC_START(__efi64_thunk) movl %ecx, 0x8(%rsp) movl %r8d, 0xc(%rsp) movl %r9d, 0x10(%rsp) + movl %ebp, 0x14(%rsp) + movl %ebx, 0x18(%rsp) + movl %eax, 0x1c(%rsp) - leaq 0x14(%rsp), %rbx + leaq 0x20(%rsp), %rbx sgdt (%rbx) addq $16, %rbx sidt (%rbx) + leaq 1f(%rip), %rbp + /* * Switch to IDT and GDT with 32-bit segments. This is the firmware GDT * and IDT that was installed when the kernel started executing. The @@ -93,7 +101,7 @@ SYM_FUNC_START(__efi64_thunk) pop %rbx pop %rbp - ret + RET SYM_FUNC_END(__efi64_thunk) .code32 diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 572c535cf45b..fd9441f40457 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -813,7 +813,7 @@ SYM_FUNC_START(efi32_pe_entry) 2: popl %edi // restore callee-save registers popl %ebx leave - ret + RET SYM_FUNC_END(efi32_pe_entry) .section ".rodata" @@ -868,7 +868,7 @@ SYM_FUNC_START(startup32_set_idt_entry) pop %ecx pop %ebx - ret + RET SYM_FUNC_END(startup32_set_idt_entry) #endif @@ -884,7 +884,7 @@ SYM_FUNC_START(startup32_load_idt) movl %eax, rva(boot32_idt_desc+2)(%ebp) lidt rva(boot32_idt_desc)(%ebp) #endif - ret + RET SYM_FUNC_END(startup32_load_idt) /* @@ -954,7 +954,7 @@ SYM_FUNC_START(startup32_check_sev_cbit) popl %ebx popl %eax #endif - ret + RET SYM_FUNC_END(startup32_check_sev_cbit) /* diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 67c3208b668a..411b268bc0a2 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -32,10 +32,6 @@ #include <generated/utsrelease.h> #include <asm/efi.h> -/* Macros used by the included decompressor code below. */ -#define STATIC -#include <linux/decompress/mm.h> - #define _SETUP #include <asm/setup.h> /* For COMMAND_LINE_SIZE */ #undef _SETUP diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S index c1e81a848b2a..a63424d13627 100644 --- a/arch/x86/boot/compressed/mem_encrypt.S +++ b/arch/x86/boot/compressed/mem_encrypt.S @@ -58,7 +58,7 @@ SYM_FUNC_START(get_sev_encryption_bit) #endif /* CONFIG_AMD_MEM_ENCRYPT */ - ret + RET SYM_FUNC_END(get_sev_encryption_bit) /** @@ -92,7 +92,7 @@ SYM_CODE_START_LOCAL(sev_es_req_cpuid) /* All good - return success */ xorl %eax, %eax 1: - ret + RET 2: movl $-1, %eax jmp 1b @@ -221,7 +221,7 @@ SYM_FUNC_START(set_sev_encryption_mask) #endif xor %rax, %rax - ret + RET SYM_FUNC_END(set_sev_encryption_mask) .data diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 743f13ea25c1..a4339cb2d247 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -28,6 +28,9 @@ /* Macros used by the included decompressor code below. */ #define STATIC static +/* Define an externally visible malloc()/free(). */ +#define MALLOC_VISIBLE +#include <linux/decompress/mm.h> /* * Provide definitions of memzero and memmove as some of the decompressors will diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 31139256859f..16ed360b6692 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -14,6 +14,8 @@ #undef CONFIG_KASAN #undef CONFIG_KASAN_GENERIC +#define __NO_FORTIFY + /* cpu_feature_enabled() cannot be used this early */ #define USE_EARLY_PGTABLE_L5 @@ -44,6 +46,8 @@ extern char _head[], _end[]; /* misc.c */ extern memptr free_mem_ptr; extern memptr free_mem_end_ptr; +void *malloc(int size); +void free(void *where); extern struct boot_params *boot_params; void __putstr(const char *s); void __puthex(unsigned long value); diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index 2a78746f5a4c..a1733319a22a 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "misc.h" #include <linux/efi.h> #include <asm/e820/types.h> #include <asm/processor.h> diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 670e998fe930..28bcf04c022e 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -122,7 +122,7 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, static bool early_setup_sev_es(void) { if (!sev_es_negotiate_protocol()) - sev_es_terminate(GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED); + sev_es_terminate(GHCB_SEV_ES_PROT_UNSUPPORTED); if (set_page_decrypted((unsigned long)&boot_ghcb_page)) return false; @@ -175,7 +175,7 @@ void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code) enum es_result result; if (!boot_ghcb && !early_setup_sev_es()) - sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); + sev_es_terminate(GHCB_SEV_ES_GEN_REQ); vc_ghcb_invalidate(boot_ghcb); result = vc_init_em_ctxt(&ctxt, regs, exit_code); @@ -202,5 +202,5 @@ finish: if (result == ES_OK) vc_finish_insn(&ctxt); else if (result != ES_RETRY) - sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); + sev_es_terminate(GHCB_SEV_ES_GEN_REQ); } diff --git a/arch/x86/boot/genimage.sh b/arch/x86/boot/genimage.sh index 0673fdfc1a11..c9299aeb7333 100644 --- a/arch/x86/boot/genimage.sh +++ b/arch/x86/boot/genimage.sh @@ -120,12 +120,13 @@ efiarch() { } # Get the combined sizes in bytes of the files given, counting sparse -# files as full length, and padding each file to a 4K block size +# files as full length, and padding each file to cluster size +cluster=16384 filesizes() { local t=0 local s for s in $(ls -lnL "$@" 2>/dev/null | awk '/^-/{ print $5; }'); do - t=$((t + ((s+4095)/4096)*4096)) + t=$((t + ((s+cluster-1)/cluster)*cluster)) done echo $t } @@ -230,14 +231,14 @@ genhdimage() { ptype='-T 0xef' # EFI system partition, no GPT fi sizes=$(filesizes "$FBZIMAGE" "${FDINITRDS[@]}" "$efishell") - # Allow 1% + 1 MiB for filesystem and partition table overhead, - # syslinux, and config files + # Allow 1% + 2 MiB for filesystem and partition table overhead, + # syslinux, and config files; this is probably excessive... megs=$(((sizes + sizes/100 + 2*1024*1024 - 1)/(1024*1024))) $dd if=/dev/zero of="$FIMAGE" bs=$((1024*1024)) count=$megs 2>/dev/null - mpartition -I -c -s 32 -h 64 -t $megs $ptype -b 512 -a h: + mpartition -I -c -s 32 -h 64 $ptype -b 64 -a p: $dd if="$mbr" of="$FIMAGE" bs=440 count=1 conv=notrunc 2>/dev/null - mformat -v 'LINUX_BOOT' -s 32 -h 64 -t $megs h: - syslinux --offset $((512*512)) "$FIMAGE" + mformat -v 'LINUX_BOOT' -s 32 -h 64 -c $((cluster/512)) -t $megs h: + syslinux --offset $((64*512)) "$FIMAGE" do_mcopy h: } diff --git a/arch/x86/boot/mtools.conf.in b/arch/x86/boot/mtools.conf.in index 9e2662d01364..174c60508766 100644 --- a/arch/x86/boot/mtools.conf.in +++ b/arch/x86/boot/mtools.conf.in @@ -14,7 +14,8 @@ drive v: drive w: file="@OBJ@/fdimage" cylinders=80 heads=2 sectors=36 filter -# Hard disk +# Hard disk (h: for the filesystem, p: for format - old mtools bug?) drive h: + file="@OBJ@/hdimage" offset=32768 mformat_only +drive p: file="@OBJ@/hdimage" partition=1 mformat_only - diff --git a/arch/x86/boot/string.h b/arch/x86/boot/string.h index a232da487cd2..e5d2c6b8c2f1 100644 --- a/arch/x86/boot/string.h +++ b/arch/x86/boot/string.h @@ -8,8 +8,10 @@ #undef memcmp void *memcpy(void *dst, const void *src, size_t len); +void *memmove(void *dst, const void *src, size_t len); void *memset(void *dst, int c, size_t len); int memcmp(const void *s1, const void *s2, size_t len); +int bcmp(const void *s1, const void *s2, size_t len); /* Access builtin version by default. */ #define memcpy(d,s,l) __builtin_memcpy(d,s,l) @@ -25,6 +27,7 @@ extern size_t strnlen(const char *s, size_t maxlen); extern unsigned int atou(const char *s); extern unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base); +long simple_strtol(const char *cp, char **endp, unsigned int base); int kstrtoull(const char *s, unsigned int base, unsigned long long *res); int boot_kstrtoul(const char *s, unsigned int base, unsigned long *res); diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index e81885384f60..71124cf8630c 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -1,4 +1,3 @@ -# CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y CONFIG_AUDIT=y @@ -262,3 +261,4 @@ CONFIG_BLK_DEV_IO_TRACE=y CONFIG_PROVIDE_OHCI1394_DMA_INIT=y CONFIG_EARLY_PRINTK_DBGP=y CONFIG_DEBUG_BOOT_PARAMS=y +CONFIG_KALLSYMS_ALL=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index e8a7a0af2bda..92b1169ec90b 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -1,4 +1,3 @@ -# CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y CONFIG_AUDIT=y @@ -258,3 +257,4 @@ CONFIG_BLK_DEV_IO_TRACE=y CONFIG_PROVIDE_OHCI1394_DMA_INIT=y CONFIG_EARLY_PRINTK_DBGP=y CONFIG_DEBUG_BOOT_PARAMS=y +CONFIG_KALLSYMS_ALL=y diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index f307c93fc90a..c3af959648e6 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -62,7 +62,9 @@ obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o -blake2s-x86_64-y := blake2s-core.o blake2s-glue.o +blake2s-x86_64-y := blake2s-shash.o +obj-$(if $(CONFIG_CRYPTO_BLAKE2S_X86),y) += libblake2s-x86_64.o +libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S index 51d46d93efbc..b48ddebb4748 100644 --- a/arch/x86/crypto/aegis128-aesni-asm.S +++ b/arch/x86/crypto/aegis128-aesni-asm.S @@ -122,7 +122,7 @@ SYM_FUNC_START_LOCAL(__load_partial) pxor T0, MSG .Lld_partial_8: - ret + RET SYM_FUNC_END(__load_partial) /* @@ -180,7 +180,7 @@ SYM_FUNC_START_LOCAL(__store_partial) mov %r10b, (%r9) .Lst_partial_1: - ret + RET SYM_FUNC_END(__store_partial) /* @@ -225,7 +225,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_init) movdqu STATE4, 0x40(STATEP) FRAME_END - ret + RET SYM_FUNC_END(crypto_aegis128_aesni_init) /* @@ -337,7 +337,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) movdqu STATE3, 0x30(STATEP) movdqu STATE4, 0x40(STATEP) FRAME_END - ret + RET .Lad_out_1: movdqu STATE4, 0x00(STATEP) @@ -346,7 +346,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) movdqu STATE2, 0x30(STATEP) movdqu STATE3, 0x40(STATEP) FRAME_END - ret + RET .Lad_out_2: movdqu STATE3, 0x00(STATEP) @@ -355,7 +355,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) movdqu STATE1, 0x30(STATEP) movdqu STATE2, 0x40(STATEP) FRAME_END - ret + RET .Lad_out_3: movdqu STATE2, 0x00(STATEP) @@ -364,7 +364,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) movdqu STATE0, 0x30(STATEP) movdqu STATE1, 0x40(STATEP) FRAME_END - ret + RET .Lad_out_4: movdqu STATE1, 0x00(STATEP) @@ -373,11 +373,11 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) movdqu STATE4, 0x30(STATEP) movdqu STATE0, 0x40(STATEP) FRAME_END - ret + RET .Lad_out: FRAME_END - ret + RET SYM_FUNC_END(crypto_aegis128_aesni_ad) .macro encrypt_block a s0 s1 s2 s3 s4 i @@ -452,7 +452,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc) movdqu STATE2, 0x30(STATEP) movdqu STATE3, 0x40(STATEP) FRAME_END - ret + RET .Lenc_out_1: movdqu STATE3, 0x00(STATEP) @@ -461,7 +461,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc) movdqu STATE1, 0x30(STATEP) movdqu STATE2, 0x40(STATEP) FRAME_END - ret + RET .Lenc_out_2: movdqu STATE2, 0x00(STATEP) @@ -470,7 +470,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc) movdqu STATE0, 0x30(STATEP) movdqu STATE1, 0x40(STATEP) FRAME_END - ret + RET .Lenc_out_3: movdqu STATE1, 0x00(STATEP) @@ -479,7 +479,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc) movdqu STATE4, 0x30(STATEP) movdqu STATE0, 0x40(STATEP) FRAME_END - ret + RET .Lenc_out_4: movdqu STATE0, 0x00(STATEP) @@ -488,11 +488,11 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc) movdqu STATE3, 0x30(STATEP) movdqu STATE4, 0x40(STATEP) FRAME_END - ret + RET .Lenc_out: FRAME_END - ret + RET SYM_FUNC_END(crypto_aegis128_aesni_enc) /* @@ -532,7 +532,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc_tail) movdqu STATE3, 0x40(STATEP) FRAME_END - ret + RET SYM_FUNC_END(crypto_aegis128_aesni_enc_tail) .macro decrypt_block a s0 s1 s2 s3 s4 i @@ -606,7 +606,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec) movdqu STATE2, 0x30(STATEP) movdqu STATE3, 0x40(STATEP) FRAME_END - ret + RET .Ldec_out_1: movdqu STATE3, 0x00(STATEP) @@ -615,7 +615,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec) movdqu STATE1, 0x30(STATEP) movdqu STATE2, 0x40(STATEP) FRAME_END - ret + RET .Ldec_out_2: movdqu STATE2, 0x00(STATEP) @@ -624,7 +624,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec) movdqu STATE0, 0x30(STATEP) movdqu STATE1, 0x40(STATEP) FRAME_END - ret + RET .Ldec_out_3: movdqu STATE1, 0x00(STATEP) @@ -633,7 +633,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec) movdqu STATE4, 0x30(STATEP) movdqu STATE0, 0x40(STATEP) FRAME_END - ret + RET .Ldec_out_4: movdqu STATE0, 0x00(STATEP) @@ -642,11 +642,11 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec) movdqu STATE3, 0x30(STATEP) movdqu STATE4, 0x40(STATEP) FRAME_END - ret + RET .Ldec_out: FRAME_END - ret + RET SYM_FUNC_END(crypto_aegis128_aesni_dec) /* @@ -696,7 +696,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec_tail) movdqu STATE3, 0x40(STATEP) FRAME_END - ret + RET SYM_FUNC_END(crypto_aegis128_aesni_dec_tail) /* @@ -743,5 +743,5 @@ SYM_FUNC_START(crypto_aegis128_aesni_final) movdqu MSG, (%rsi) FRAME_END - ret + RET SYM_FUNC_END(crypto_aegis128_aesni_final) diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S index 3f0fc7dd87d7..c799838242a6 100644 --- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S +++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S @@ -525,7 +525,7 @@ ddq_add_8: /* return updated IV */ vpshufb xbyteswap, xcounter, xcounter vmovdqu xcounter, (p_iv) - ret + RET .endm /* diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 4e3972570916..363699dd7220 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -1594,7 +1594,7 @@ SYM_FUNC_START(aesni_gcm_dec) GCM_ENC_DEC dec GCM_COMPLETE arg10, arg11 FUNC_RESTORE - ret + RET SYM_FUNC_END(aesni_gcm_dec) @@ -1683,7 +1683,7 @@ SYM_FUNC_START(aesni_gcm_enc) GCM_COMPLETE arg10, arg11 FUNC_RESTORE - ret + RET SYM_FUNC_END(aesni_gcm_enc) /***************************************************************************** @@ -1701,7 +1701,7 @@ SYM_FUNC_START(aesni_gcm_init) FUNC_SAVE GCM_INIT %arg3, %arg4,%arg5, %arg6 FUNC_RESTORE - ret + RET SYM_FUNC_END(aesni_gcm_init) /***************************************************************************** @@ -1716,7 +1716,7 @@ SYM_FUNC_START(aesni_gcm_enc_update) FUNC_SAVE GCM_ENC_DEC enc FUNC_RESTORE - ret + RET SYM_FUNC_END(aesni_gcm_enc_update) /***************************************************************************** @@ -1731,7 +1731,7 @@ SYM_FUNC_START(aesni_gcm_dec_update) FUNC_SAVE GCM_ENC_DEC dec FUNC_RESTORE - ret + RET SYM_FUNC_END(aesni_gcm_dec_update) /***************************************************************************** @@ -1746,7 +1746,7 @@ SYM_FUNC_START(aesni_gcm_finalize) FUNC_SAVE GCM_COMPLETE %arg3 %arg4 FUNC_RESTORE - ret + RET SYM_FUNC_END(aesni_gcm_finalize) #endif @@ -1762,7 +1762,7 @@ SYM_FUNC_START_LOCAL(_key_expansion_256a) pxor %xmm1, %xmm0 movaps %xmm0, (TKEYP) add $0x10, TKEYP - ret + RET SYM_FUNC_END(_key_expansion_256a) SYM_FUNC_END_ALIAS(_key_expansion_128) @@ -1787,7 +1787,7 @@ SYM_FUNC_START_LOCAL(_key_expansion_192a) shufps $0b01001110, %xmm2, %xmm1 movaps %xmm1, 0x10(TKEYP) add $0x20, TKEYP - ret + RET SYM_FUNC_END(_key_expansion_192a) SYM_FUNC_START_LOCAL(_key_expansion_192b) @@ -1806,7 +1806,7 @@ SYM_FUNC_START_LOCAL(_key_expansion_192b) movaps %xmm0, (TKEYP) add $0x10, TKEYP - ret + RET SYM_FUNC_END(_key_expansion_192b) SYM_FUNC_START_LOCAL(_key_expansion_256b) @@ -1818,7 +1818,7 @@ SYM_FUNC_START_LOCAL(_key_expansion_256b) pxor %xmm1, %xmm2 movaps %xmm2, (TKEYP) add $0x10, TKEYP - ret + RET SYM_FUNC_END(_key_expansion_256b) /* @@ -1933,7 +1933,7 @@ SYM_FUNC_START(aesni_set_key) popl KEYP #endif FRAME_END - ret + RET SYM_FUNC_END(aesni_set_key) /* @@ -1957,7 +1957,7 @@ SYM_FUNC_START(aesni_enc) popl KEYP #endif FRAME_END - ret + RET SYM_FUNC_END(aesni_enc) /* @@ -2014,7 +2014,7 @@ SYM_FUNC_START_LOCAL(_aesni_enc1) aesenc KEY, STATE movaps 0x70(TKEYP), KEY aesenclast KEY, STATE - ret + RET SYM_FUNC_END(_aesni_enc1) /* @@ -2122,7 +2122,7 @@ SYM_FUNC_START_LOCAL(_aesni_enc4) aesenclast KEY, STATE2 aesenclast KEY, STATE3 aesenclast KEY, STATE4 - ret + RET SYM_FUNC_END(_aesni_enc4) /* @@ -2147,7 +2147,7 @@ SYM_FUNC_START(aesni_dec) popl KEYP #endif FRAME_END - ret + RET SYM_FUNC_END(aesni_dec) /* @@ -2204,7 +2204,7 @@ SYM_FUNC_START_LOCAL(_aesni_dec1) aesdec KEY, STATE movaps 0x70(TKEYP), KEY aesdeclast KEY, STATE - ret + RET SYM_FUNC_END(_aesni_dec1) /* @@ -2312,7 +2312,7 @@ SYM_FUNC_START_LOCAL(_aesni_dec4) aesdeclast KEY, STATE2 aesdeclast KEY, STATE3 aesdeclast KEY, STATE4 - ret + RET SYM_FUNC_END(_aesni_dec4) /* @@ -2372,7 +2372,7 @@ SYM_FUNC_START(aesni_ecb_enc) popl LEN #endif FRAME_END - ret + RET SYM_FUNC_END(aesni_ecb_enc) /* @@ -2433,7 +2433,7 @@ SYM_FUNC_START(aesni_ecb_dec) popl LEN #endif FRAME_END - ret + RET SYM_FUNC_END(aesni_ecb_dec) /* @@ -2477,7 +2477,7 @@ SYM_FUNC_START(aesni_cbc_enc) popl IVP #endif FRAME_END - ret + RET SYM_FUNC_END(aesni_cbc_enc) /* @@ -2570,7 +2570,7 @@ SYM_FUNC_START(aesni_cbc_dec) popl IVP #endif FRAME_END - ret + RET SYM_FUNC_END(aesni_cbc_dec) /* @@ -2627,7 +2627,7 @@ SYM_FUNC_START(aesni_cts_cbc_enc) popl IVP #endif FRAME_END - ret + RET SYM_FUNC_END(aesni_cts_cbc_enc) /* @@ -2688,7 +2688,7 @@ SYM_FUNC_START(aesni_cts_cbc_dec) popl IVP #endif FRAME_END - ret + RET SYM_FUNC_END(aesni_cts_cbc_dec) .pushsection .rodata @@ -2725,7 +2725,7 @@ SYM_FUNC_START_LOCAL(_aesni_inc_init) mov $1, TCTR_LOW movq TCTR_LOW, INC movq CTR, TCTR_LOW - ret + RET SYM_FUNC_END(_aesni_inc_init) /* @@ -2753,7 +2753,7 @@ SYM_FUNC_START_LOCAL(_aesni_inc) .Linc_low: movaps CTR, IV pshufb BSWAP_MASK, IV - ret + RET SYM_FUNC_END(_aesni_inc) /* @@ -2816,7 +2816,7 @@ SYM_FUNC_START(aesni_ctr_enc) movups IV, (IVP) .Lctr_enc_just_ret: FRAME_END - ret + RET SYM_FUNC_END(aesni_ctr_enc) #endif @@ -2932,7 +2932,7 @@ SYM_FUNC_START(aesni_xts_encrypt) popl IVP #endif FRAME_END - ret + RET .Lxts_enc_1x: add $64, LEN @@ -3092,7 +3092,7 @@ SYM_FUNC_START(aesni_xts_decrypt) popl IVP #endif FRAME_END - ret + RET .Lxts_dec_1x: add $64, LEN diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index 98e3552b6e03..0852ab573fd3 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -1767,7 +1767,7 @@ SYM_FUNC_START(aesni_gcm_init_avx_gen2) FUNC_SAVE INIT GHASH_MUL_AVX, PRECOMPUTE_AVX FUNC_RESTORE - ret + RET SYM_FUNC_END(aesni_gcm_init_avx_gen2) ############################################################################### @@ -1788,15 +1788,15 @@ SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2) # must be 192 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11 FUNC_RESTORE - ret + RET key_128_enc_update: GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9 FUNC_RESTORE - ret + RET key_256_enc_update: GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13 FUNC_RESTORE - ret + RET SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2) ############################################################################### @@ -1817,15 +1817,15 @@ SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2) # must be 192 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11 FUNC_RESTORE - ret + RET key_128_dec_update: GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9 FUNC_RESTORE - ret + RET key_256_dec_update: GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13 FUNC_RESTORE - ret + RET SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2) ############################################################################### @@ -1846,15 +1846,15 @@ SYM_FUNC_START(aesni_gcm_finalize_avx_gen2) # must be 192 GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4 FUNC_RESTORE - ret + RET key_128_finalize: GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4 FUNC_RESTORE - ret + RET key_256_finalize: GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4 FUNC_RESTORE - ret + RET SYM_FUNC_END(aesni_gcm_finalize_avx_gen2) ############################################################################### @@ -2735,7 +2735,7 @@ SYM_FUNC_START(aesni_gcm_init_avx_gen4) FUNC_SAVE INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2 FUNC_RESTORE - ret + RET SYM_FUNC_END(aesni_gcm_init_avx_gen4) ############################################################################### @@ -2756,15 +2756,15 @@ SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4) # must be 192 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11 FUNC_RESTORE - ret + RET key_128_enc_update4: GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9 FUNC_RESTORE - ret + RET key_256_enc_update4: GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13 FUNC_RESTORE - ret + RET SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4) ############################################################################### @@ -2785,15 +2785,15 @@ SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4) # must be 192 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11 FUNC_RESTORE - ret + RET key_128_dec_update4: GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9 FUNC_RESTORE - ret + RET key_256_dec_update4: GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13 FUNC_RESTORE - ret + RET SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4) ############################################################################### @@ -2814,13 +2814,13 @@ SYM_FUNC_START(aesni_gcm_finalize_avx_gen4) # must be 192 GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4 FUNC_RESTORE - ret + RET key_128_finalize4: GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4 FUNC_RESTORE - ret + RET key_256_finalize4: GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4 FUNC_RESTORE - ret + RET SYM_FUNC_END(aesni_gcm_finalize_avx_gen4) diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 0fc961bef299..41901ba9d3a2 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -866,7 +866,7 @@ static int xts_crypt(struct skcipher_request *req, bool encrypt) req = &subreq; err = skcipher_walk_virt(&walk, req, false); - if (err) + if (!walk.nbytes) return err; } else { tail = 0; @@ -1107,7 +1107,7 @@ static struct aead_alg aesni_aeads[] = { { .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx), - .cra_alignmask = AESNI_ALIGN - 1, + .cra_alignmask = 0, .cra_module = THIS_MODULE, }, }, { @@ -1124,7 +1124,7 @@ static struct aead_alg aesni_aeads[] = { { .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct generic_gcmaes_ctx), - .cra_alignmask = AESNI_ALIGN - 1, + .cra_alignmask = 0, .cra_module = THIS_MODULE, }, } }; diff --git a/arch/x86/crypto/blake2s-core.S b/arch/x86/crypto/blake2s-core.S index 2ca79974f819..b50b35ff1fdb 100644 --- a/arch/x86/crypto/blake2s-core.S +++ b/arch/x86/crypto/blake2s-core.S @@ -171,7 +171,7 @@ SYM_FUNC_START(blake2s_compress_ssse3) movdqu %xmm1,0x10(%rdi) movdqu %xmm14,0x20(%rdi) .Lendofloop: - ret + RET SYM_FUNC_END(blake2s_compress_ssse3) #ifdef CONFIG_AS_AVX512 @@ -251,6 +251,6 @@ SYM_FUNC_START(blake2s_compress_avx512) vmovdqu %xmm1,0x10(%rdi) vmovdqu %xmm4,0x20(%rdi) vzeroupper - retq + RET SYM_FUNC_END(blake2s_compress_avx512) #endif /* CONFIG_AS_AVX512 */ diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c index a40365ab301e..69853c13e8fb 100644 --- a/arch/x86/crypto/blake2s-glue.c +++ b/arch/x86/crypto/blake2s-glue.c @@ -5,7 +5,6 @@ #include <crypto/internal/blake2s.h> #include <crypto/internal/simd.h> -#include <crypto/internal/hash.h> #include <linux/types.h> #include <linux/jump_label.h> @@ -28,9 +27,8 @@ asmlinkage void blake2s_compress_avx512(struct blake2s_state *state, static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3); static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512); -void blake2s_compress_arch(struct blake2s_state *state, - const u8 *block, size_t nblocks, - const u32 inc) +void blake2s_compress(struct blake2s_state *state, const u8 *block, + size_t nblocks, const u32 inc) { /* SIMD disables preemption, so relax after processing each page. */ BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8); @@ -56,49 +54,12 @@ void blake2s_compress_arch(struct blake2s_state *state, block += blocks * BLAKE2S_BLOCK_SIZE; } while (nblocks); } -EXPORT_SYMBOL(blake2s_compress_arch); - -static int crypto_blake2s_update_x86(struct shash_desc *desc, - const u8 *in, unsigned int inlen) -{ - return crypto_blake2s_update(desc, in, inlen, blake2s_compress_arch); -} - -static int crypto_blake2s_final_x86(struct shash_desc *desc, u8 *out) -{ - return crypto_blake2s_final(desc, out, blake2s_compress_arch); -} - -#define BLAKE2S_ALG(name, driver_name, digest_size) \ - { \ - .base.cra_name = name, \ - .base.cra_driver_name = driver_name, \ - .base.cra_priority = 200, \ - .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \ - .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, \ - .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), \ - .base.cra_module = THIS_MODULE, \ - .digestsize = digest_size, \ - .setkey = crypto_blake2s_setkey, \ - .init = crypto_blake2s_init, \ - .update = crypto_blake2s_update_x86, \ - .final = crypto_blake2s_final_x86, \ - .descsize = sizeof(struct blake2s_state), \ - } - -static struct shash_alg blake2s_algs[] = { - BLAKE2S_ALG("blake2s-128", "blake2s-128-x86", BLAKE2S_128_HASH_SIZE), - BLAKE2S_ALG("blake2s-160", "blake2s-160-x86", BLAKE2S_160_HASH_SIZE), - BLAKE2S_ALG("blake2s-224", "blake2s-224-x86", BLAKE2S_224_HASH_SIZE), - BLAKE2S_ALG("blake2s-256", "blake2s-256-x86", BLAKE2S_256_HASH_SIZE), -}; +EXPORT_SYMBOL(blake2s_compress); static int __init blake2s_mod_init(void) { - if (!boot_cpu_has(X86_FEATURE_SSSE3)) - return 0; - - static_branch_enable(&blake2s_use_ssse3); + if (boot_cpu_has(X86_FEATURE_SSSE3)) + static_branch_enable(&blake2s_use_ssse3); if (IS_ENABLED(CONFIG_AS_AVX512) && boot_cpu_has(X86_FEATURE_AVX) && @@ -109,26 +70,9 @@ static int __init blake2s_mod_init(void) XFEATURE_MASK_AVX512, NULL)) static_branch_enable(&blake2s_use_avx512); - return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? - crypto_register_shashes(blake2s_algs, - ARRAY_SIZE(blake2s_algs)) : 0; -} - -static void __exit blake2s_mod_exit(void) -{ - if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3)) - crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs)); + return 0; } module_init(blake2s_mod_init); -module_exit(blake2s_mod_exit); -MODULE_ALIAS_CRYPTO("blake2s-128"); -MODULE_ALIAS_CRYPTO("blake2s-128-x86"); -MODULE_ALIAS_CRYPTO("blake2s-160"); -MODULE_ALIAS_CRYPTO("blake2s-160-x86"); -MODULE_ALIAS_CRYPTO("blake2s-224"); -MODULE_ALIAS_CRYPTO("blake2s-224-x86"); -MODULE_ALIAS_CRYPTO("blake2s-256"); -MODULE_ALIAS_CRYPTO("blake2s-256-x86"); MODULE_LICENSE("GPL v2"); diff --git a/arch/x86/crypto/blake2s-shash.c b/arch/x86/crypto/blake2s-shash.c new file mode 100644 index 000000000000..f9e2fecdb761 --- /dev/null +++ b/arch/x86/crypto/blake2s-shash.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + */ + +#include <crypto/internal/blake2s.h> +#include <crypto/internal/simd.h> +#include <crypto/internal/hash.h> + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sizes.h> + +#include <asm/cpufeature.h> +#include <asm/processor.h> + +static int crypto_blake2s_update_x86(struct shash_desc *desc, + const u8 *in, unsigned int inlen) +{ + return crypto_blake2s_update(desc, in, inlen, blake2s_compress); +} + +static int crypto_blake2s_final_x86(struct shash_desc *desc, u8 *out) +{ + return crypto_blake2s_final(desc, out, blake2s_compress); +} + +#define BLAKE2S_ALG(name, driver_name, digest_size) \ + { \ + .base.cra_name = name, \ + .base.cra_driver_name = driver_name, \ + .base.cra_priority = 200, \ + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \ + .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, \ + .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), \ + .base.cra_module = THIS_MODULE, \ + .digestsize = digest_size, \ + .setkey = crypto_blake2s_setkey, \ + .init = crypto_blake2s_init, \ + .update = crypto_blake2s_update_x86, \ + .final = crypto_blake2s_final_x86, \ + .descsize = sizeof(struct blake2s_state), \ + } + +static struct shash_alg blake2s_algs[] = { + BLAKE2S_ALG("blake2s-128", "blake2s-128-x86", BLAKE2S_128_HASH_SIZE), + BLAKE2S_ALG("blake2s-160", "blake2s-160-x86", BLAKE2S_160_HASH_SIZE), + BLAKE2S_ALG("blake2s-224", "blake2s-224-x86", BLAKE2S_224_HASH_SIZE), + BLAKE2S_ALG("blake2s-256", "blake2s-256-x86", BLAKE2S_256_HASH_SIZE), +}; + +static int __init blake2s_mod_init(void) +{ + if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3)) + return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs)); + return 0; +} + +static void __exit blake2s_mod_exit(void) +{ + if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3)) + crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs)); +} + +module_init(blake2s_mod_init); +module_exit(blake2s_mod_exit); + +MODULE_ALIAS_CRYPTO("blake2s-128"); +MODULE_ALIAS_CRYPTO("blake2s-128-x86"); +MODULE_ALIAS_CRYPTO("blake2s-160"); +MODULE_ALIAS_CRYPTO("blake2s-160-x86"); +MODULE_ALIAS_CRYPTO("blake2s-224"); +MODULE_ALIAS_CRYPTO("blake2s-224-x86"); +MODULE_ALIAS_CRYPTO("blake2s-256"); +MODULE_ALIAS_CRYPTO("blake2s-256-x86"); +MODULE_LICENSE("GPL v2"); diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S index 4222ac6d6584..802d71582689 100644 --- a/arch/x86/crypto/blowfish-x86_64-asm_64.S +++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S @@ -135,10 +135,10 @@ SYM_FUNC_START(__blowfish_enc_blk) jnz .L__enc_xor; write_block(); - ret; + RET; .L__enc_xor: xor_block(); - ret; + RET; SYM_FUNC_END(__blowfish_enc_blk) SYM_FUNC_START(blowfish_dec_blk) @@ -170,7 +170,7 @@ SYM_FUNC_START(blowfish_dec_blk) movq %r11, %r12; - ret; + RET; SYM_FUNC_END(blowfish_dec_blk) /********************************************************************** @@ -322,14 +322,14 @@ SYM_FUNC_START(__blowfish_enc_blk_4way) popq %rbx; popq %r12; - ret; + RET; .L__enc_xor4: xor_block4(); popq %rbx; popq %r12; - ret; + RET; SYM_FUNC_END(__blowfish_enc_blk_4way) SYM_FUNC_START(blowfish_dec_blk_4way) @@ -364,5 +364,5 @@ SYM_FUNC_START(blowfish_dec_blk_4way) popq %rbx; popq %r12; - ret; + RET; SYM_FUNC_END(blowfish_dec_blk_4way) diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S index e2a0e0f4bf9d..2e1658ddbe1a 100644 --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S @@ -192,7 +192,7 @@ SYM_FUNC_START_LOCAL(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_c roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %rcx, (%r9)); - ret; + RET; SYM_FUNC_END(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd) .align 8 @@ -200,7 +200,7 @@ SYM_FUNC_START_LOCAL(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_a roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11, %rax, (%r9)); - ret; + RET; SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) /* @@ -778,7 +778,7 @@ SYM_FUNC_START_LOCAL(__camellia_enc_blk16) %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax)); FRAME_END - ret; + RET; .align 8 .Lenc_max32: @@ -865,7 +865,7 @@ SYM_FUNC_START_LOCAL(__camellia_dec_blk16) %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax)); FRAME_END - ret; + RET; .align 8 .Ldec_max32: @@ -906,7 +906,7 @@ SYM_FUNC_START(camellia_ecb_enc_16way) %xmm8, %rsi); FRAME_END - ret; + RET; SYM_FUNC_END(camellia_ecb_enc_16way) SYM_FUNC_START(camellia_ecb_dec_16way) @@ -936,7 +936,7 @@ SYM_FUNC_START(camellia_ecb_dec_16way) %xmm8, %rsi); FRAME_END - ret; + RET; SYM_FUNC_END(camellia_ecb_dec_16way) SYM_FUNC_START(camellia_cbc_dec_16way) @@ -987,5 +987,5 @@ SYM_FUNC_START(camellia_cbc_dec_16way) %xmm8, %rsi); FRAME_END - ret; + RET; SYM_FUNC_END(camellia_cbc_dec_16way) diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index 706f70829a07..0e4e9abbf4de 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -226,7 +226,7 @@ SYM_FUNC_START_LOCAL(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_c roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, %rcx, (%r9)); - ret; + RET; SYM_FUNC_END(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd) .align 8 @@ -234,7 +234,7 @@ SYM_FUNC_START_LOCAL(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_a roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3, %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11, %rax, (%r9)); - ret; + RET; SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) /* @@ -814,7 +814,7 @@ SYM_FUNC_START_LOCAL(__camellia_enc_blk32) %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax)); FRAME_END - ret; + RET; .align 8 .Lenc_max32: @@ -901,7 +901,7 @@ SYM_FUNC_START_LOCAL(__camellia_dec_blk32) %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax)); FRAME_END - ret; + RET; .align 8 .Ldec_max32: @@ -946,7 +946,7 @@ SYM_FUNC_START(camellia_ecb_enc_32way) vzeroupper; FRAME_END - ret; + RET; SYM_FUNC_END(camellia_ecb_enc_32way) SYM_FUNC_START(camellia_ecb_dec_32way) @@ -980,7 +980,7 @@ SYM_FUNC_START(camellia_ecb_dec_32way) vzeroupper; FRAME_END - ret; + RET; SYM_FUNC_END(camellia_ecb_dec_32way) SYM_FUNC_START(camellia_cbc_dec_32way) @@ -1047,5 +1047,5 @@ SYM_FUNC_START(camellia_cbc_dec_32way) addq $(16 * 32), %rsp; FRAME_END - ret; + RET; SYM_FUNC_END(camellia_cbc_dec_32way) diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S index 1372e6408850..347c059f5940 100644 --- a/arch/x86/crypto/camellia-x86_64-asm_64.S +++ b/arch/x86/crypto/camellia-x86_64-asm_64.S @@ -213,13 +213,13 @@ SYM_FUNC_START(__camellia_enc_blk) enc_outunpack(mov, RT1); movq RR12, %r12; - ret; + RET; .L__enc_xor: enc_outunpack(xor, RT1); movq RR12, %r12; - ret; + RET; SYM_FUNC_END(__camellia_enc_blk) SYM_FUNC_START(camellia_dec_blk) @@ -257,7 +257,7 @@ SYM_FUNC_START(camellia_dec_blk) dec_outunpack(); movq RR12, %r12; - ret; + RET; SYM_FUNC_END(camellia_dec_blk) /********************************************************************** @@ -448,14 +448,14 @@ SYM_FUNC_START(__camellia_enc_blk_2way) movq RR12, %r12; popq %rbx; - ret; + RET; .L__enc2_xor: enc_outunpack2(xor, RT2); movq RR12, %r12; popq %rbx; - ret; + RET; SYM_FUNC_END(__camellia_enc_blk_2way) SYM_FUNC_START(camellia_dec_blk_2way) @@ -495,5 +495,5 @@ SYM_FUNC_START(camellia_dec_blk_2way) movq RR12, %r12; movq RXOR, %rbx; - ret; + RET; SYM_FUNC_END(camellia_dec_blk_2way) diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S index 8a6181b08b59..b258af420c92 100644 --- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S @@ -279,7 +279,7 @@ SYM_FUNC_START_LOCAL(__cast5_enc_blk16) outunpack_blocks(RR3, RL3, RTMP, RX, RKM); outunpack_blocks(RR4, RL4, RTMP, RX, RKM); - ret; + RET; SYM_FUNC_END(__cast5_enc_blk16) .align 16 @@ -352,7 +352,7 @@ SYM_FUNC_START_LOCAL(__cast5_dec_blk16) outunpack_blocks(RR3, RL3, RTMP, RX, RKM); outunpack_blocks(RR4, RL4, RTMP, RX, RKM); - ret; + RET; .L__skip_dec: vpsrldq $4, RKR, RKR; @@ -393,7 +393,7 @@ SYM_FUNC_START(cast5_ecb_enc_16way) popq %r15; FRAME_END - ret; + RET; SYM_FUNC_END(cast5_ecb_enc_16way) SYM_FUNC_START(cast5_ecb_dec_16way) @@ -431,7 +431,7 @@ SYM_FUNC_START(cast5_ecb_dec_16way) popq %r15; FRAME_END - ret; + RET; SYM_FUNC_END(cast5_ecb_dec_16way) SYM_FUNC_START(cast5_cbc_dec_16way) @@ -483,7 +483,7 @@ SYM_FUNC_START(cast5_cbc_dec_16way) popq %r15; popq %r12; FRAME_END - ret; + RET; SYM_FUNC_END(cast5_cbc_dec_16way) SYM_FUNC_START(cast5_ctr_16way) @@ -559,5 +559,5 @@ SYM_FUNC_START(cast5_ctr_16way) popq %r15; popq %r12; FRAME_END - ret; + RET; SYM_FUNC_END(cast5_ctr_16way) diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S index fbddcecc3e3f..82b716fd5dba 100644 --- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S @@ -289,7 +289,7 @@ SYM_FUNC_START_LOCAL(__cast6_enc_blk8) outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); - ret; + RET; SYM_FUNC_END(__cast6_enc_blk8) .align 8 @@ -336,7 +336,7 @@ SYM_FUNC_START_LOCAL(__cast6_dec_blk8) outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); - ret; + RET; SYM_FUNC_END(__cast6_dec_blk8) SYM_FUNC_START(cast6_ecb_enc_8way) @@ -359,7 +359,7 @@ SYM_FUNC_START(cast6_ecb_enc_8way) popq %r15; FRAME_END - ret; + RET; SYM_FUNC_END(cast6_ecb_enc_8way) SYM_FUNC_START(cast6_ecb_dec_8way) @@ -382,7 +382,7 @@ SYM_FUNC_START(cast6_ecb_dec_8way) popq %r15; FRAME_END - ret; + RET; SYM_FUNC_END(cast6_ecb_dec_8way) SYM_FUNC_START(cast6_cbc_dec_8way) @@ -408,5 +408,5 @@ SYM_FUNC_START(cast6_cbc_dec_8way) popq %r15; popq %r12; FRAME_END - ret; + RET; SYM_FUNC_END(cast6_cbc_dec_8way) diff --git a/arch/x86/crypto/chacha-avx2-x86_64.S b/arch/x86/crypto/chacha-avx2-x86_64.S index ee9a40ab4109..f3d8fc018249 100644 --- a/arch/x86/crypto/chacha-avx2-x86_64.S +++ b/arch/x86/crypto/chacha-avx2-x86_64.S @@ -193,7 +193,7 @@ SYM_FUNC_START(chacha_2block_xor_avx2) .Ldone2: vzeroupper - ret + RET .Lxorpart2: # xor remaining bytes from partial register into output @@ -498,7 +498,7 @@ SYM_FUNC_START(chacha_4block_xor_avx2) .Ldone4: vzeroupper - ret + RET .Lxorpart4: # xor remaining bytes from partial register into output @@ -992,7 +992,7 @@ SYM_FUNC_START(chacha_8block_xor_avx2) .Ldone8: vzeroupper lea -8(%r10),%rsp - ret + RET .Lxorpart8: # xor remaining bytes from partial register into output diff --git a/arch/x86/crypto/chacha-avx512vl-x86_64.S b/arch/x86/crypto/chacha-avx512vl-x86_64.S index bb193fde123a..946f74dd6fba 100644 --- a/arch/x86/crypto/chacha-avx512vl-x86_64.S +++ b/arch/x86/crypto/chacha-avx512vl-x86_64.S @@ -166,7 +166,7 @@ SYM_FUNC_START(chacha_2block_xor_avx512vl) .Ldone2: vzeroupper - ret + RET .Lxorpart2: # xor remaining bytes from partial register into output @@ -432,7 +432,7 @@ SYM_FUNC_START(chacha_4block_xor_avx512vl) .Ldone4: vzeroupper - ret + RET .Lxorpart4: # xor remaining bytes from partial register into output @@ -812,7 +812,7 @@ SYM_FUNC_START(chacha_8block_xor_avx512vl) .Ldone8: vzeroupper - ret + RET .Lxorpart8: # xor remaining bytes from partial register into output diff --git a/arch/x86/crypto/chacha-ssse3-x86_64.S b/arch/x86/crypto/chacha-ssse3-x86_64.S index ca1788bfee16..7111949cd5b9 100644 --- a/arch/x86/crypto/chacha-ssse3-x86_64.S +++ b/arch/x86/crypto/chacha-ssse3-x86_64.S @@ -108,7 +108,7 @@ SYM_FUNC_START_LOCAL(chacha_permute) sub $2,%r8d jnz .Ldoubleround - ret + RET SYM_FUNC_END(chacha_permute) SYM_FUNC_START(chacha_block_xor_ssse3) @@ -166,7 +166,7 @@ SYM_FUNC_START(chacha_block_xor_ssse3) .Ldone: FRAME_END - ret + RET .Lxorpart: # xor remaining bytes from partial register into output @@ -217,7 +217,7 @@ SYM_FUNC_START(hchacha_block_ssse3) movdqu %xmm3,0x10(%rsi) FRAME_END - ret + RET SYM_FUNC_END(hchacha_block_ssse3) SYM_FUNC_START(chacha_4block_xor_ssse3) @@ -762,7 +762,7 @@ SYM_FUNC_START(chacha_4block_xor_ssse3) .Ldone4: lea -8(%r10),%rsp - ret + RET .Lxorpart4: # xor remaining bytes from partial register into output diff --git a/arch/x86/crypto/crc32-pclmul_asm.S b/arch/x86/crypto/crc32-pclmul_asm.S index 6e7d4c4d3208..c392a6edbfff 100644 --- a/arch/x86/crypto/crc32-pclmul_asm.S +++ b/arch/x86/crypto/crc32-pclmul_asm.S @@ -236,5 +236,5 @@ fold_64: pxor %xmm2, %xmm1 pextrd $0x01, %xmm1, %eax - ret + RET SYM_FUNC_END(crc32_pclmul_le_16) diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index ac1f303eed0f..80c0d22fc42c 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -306,7 +306,7 @@ do_return: popq %rsi popq %rdi popq %rbx - ret + RET SYM_FUNC_END(crc_pcl) .section .rodata, "a", @progbits diff --git a/arch/x86/crypto/crct10dif-pcl-asm_64.S b/arch/x86/crypto/crct10dif-pcl-asm_64.S index b2533d63030e..721474abfb71 100644 --- a/arch/x86/crypto/crct10dif-pcl-asm_64.S +++ b/arch/x86/crypto/crct10dif-pcl-asm_64.S @@ -257,7 +257,7 @@ SYM_FUNC_START(crc_t10dif_pcl) # Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of xmm0. pextrw $0, %xmm0, %eax - ret + RET .align 16 .Lless_than_256_bytes: diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c index 38caf61cd5b7..d55fa9e9b9e6 100644 --- a/arch/x86/crypto/curve25519-x86_64.c +++ b/arch/x86/crypto/curve25519-x86_64.c @@ -64,10 +64,9 @@ static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2) /* Return the carry bit in a register */ " adcx %%r11, %1;" - : "+&r" (f2), "=&r" (carry_r) - : "r" (out), "r" (f1) - : "%r8", "%r9", "%r10", "%r11", "memory", "cc" - ); + : "+&r"(f2), "=&r"(carry_r) + : "r"(out), "r"(f1) + : "%r8", "%r9", "%r10", "%r11", "memory", "cc"); return carry_r; } @@ -108,10 +107,9 @@ static inline void fadd(u64 *out, const u64 *f1, const u64 *f2) " cmovc %0, %%rax;" " add %%rax, %%r8;" " movq %%r8, 0(%1);" - : "+&r" (f2) - : "r" (out), "r" (f1) - : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc" - ); + : "+&r"(f2) + : "r"(out), "r"(f1) + : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"); } /* Computes the field subtraction of two field elements */ @@ -151,10 +149,9 @@ static inline void fsub(u64 *out, const u64 *f1, const u64 *f2) " movq %%r9, 8(%0);" " movq %%r10, 16(%0);" " movq %%r11, 24(%0);" - : - : "r" (out), "r" (f1), "r" (f2) - : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc" - ); + : + : "r"(out), "r"(f1), "r"(f2) + : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"); } /* Computes a field multiplication: out <- f1 * f2 @@ -162,239 +159,400 @@ static inline void fsub(u64 *out, const u64 *f1, const u64 *f2) static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp) { asm volatile( + /* Compute the raw multiplication: tmp <- src1 * src2 */ /* Compute src1[0] * src2 */ - " movq 0(%1), %%rdx;" - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 0(%0);" - " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);" - " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" - " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;" - " adox %%rdx, %%rax;" + " movq 0(%0), %%rdx;" + " mulxq 0(%1), %%r8, %%r9;" + " xor %%r10d, %%r10d;" + " movq %%r8, 0(%2);" + " mulxq 8(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " movq %%r10, 8(%2);" + " mulxq 16(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " mulxq 24(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + /* Compute src1[1] * src2 */ - " movq 8(%1), %%rdx;" - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);" - " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%0);" - " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;" - " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;" - " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" + " movq 8(%0), %%rdx;" + " mulxq 0(%1), %%r8, %%r9;" + " xor %%r10d, %%r10d;" + " adcxq 8(%2), %%r8;" + " movq %%r8, 8(%2);" + " mulxq 8(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%rbx, %%r10;" + " movq %%r10, 16(%2);" + " mulxq 16(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " adcx %%r14, %%rbx;" + " mov $0, %%r8;" + " mulxq 24(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + /* Compute src1[2] * src2 */ - " movq 16(%1), %%rdx;" - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);" - " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%0);" - " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;" - " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;" - " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" + " movq 16(%0), %%rdx;" + " mulxq 0(%1), %%r8, %%r9;" + " xor %%r10d, %%r10d;" + " adcxq 16(%2), %%r8;" + " movq %%r8, 16(%2);" + " mulxq 8(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%rbx, %%r10;" + " movq %%r10, 24(%2);" + " mulxq 16(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " adcx %%r14, %%rbx;" + " mov $0, %%r8;" + " mulxq 24(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + /* Compute src1[3] * src2 */ - " movq 24(%1), %%rdx;" - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);" - " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%0);" - " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%0);" " mov $0, %%r8;" - " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;" - " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 56(%0);" + " movq 24(%0), %%rdx;" + " mulxq 0(%1), %%r8, %%r9;" + " xor %%r10d, %%r10d;" + " adcxq 24(%2), %%r8;" + " movq %%r8, 24(%2);" + " mulxq 8(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%rbx, %%r10;" + " movq %%r10, 32(%2);" + " mulxq 16(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " adcx %%r14, %%rbx;" + " movq %%rbx, 40(%2);" + " mov $0, %%r8;" + " mulxq 24(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " movq %%r14, 48(%2);" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + " movq %%rax, 56(%2);" + /* Line up pointers */ - " mov %0, %1;" " mov %2, %0;" + " mov %3, %2;" /* Wrap the result back into the field */ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ " mov $38, %%rdx;" - " mulxq 32(%1), %%r8, %%r13;" - " xor %k3, %k3;" - " adoxq 0(%1), %%r8;" - " mulxq 40(%1), %%r9, %%rbx;" + " mulxq 32(%0), %%r8, %%r13;" + " xor %k1, %k1;" + " adoxq 0(%0), %%r8;" + " mulxq 40(%0), %%r9, %%rbx;" " adcx %%r13, %%r9;" - " adoxq 8(%1), %%r9;" - " mulxq 48(%1), %%r10, %%r13;" + " adoxq 8(%0), %%r9;" + " mulxq 48(%0), %%r10, %%r13;" " adcx %%rbx, %%r10;" - " adoxq 16(%1), %%r10;" - " mulxq 56(%1), %%r11, %%rax;" + " adoxq 16(%0), %%r10;" + " mulxq 56(%0), %%r11, %%rax;" " adcx %%r13, %%r11;" - " adoxq 24(%1), %%r11;" - " adcx %3, %%rax;" - " adox %3, %%rax;" + " adoxq 24(%0), %%r11;" + " adcx %1, %%rax;" + " adox %1, %%rax;" " imul %%rdx, %%rax;" /* Step 2: Fold the carry back into dst */ " add %%rax, %%r8;" - " adcx %3, %%r9;" - " movq %%r9, 8(%0);" - " adcx %3, %%r10;" - " movq %%r10, 16(%0);" - " adcx %3, %%r11;" - " movq %%r11, 24(%0);" + " adcx %1, %%r9;" + " movq %%r9, 8(%2);" + " adcx %1, %%r10;" + " movq %%r10, 16(%2);" + " adcx %1, %%r11;" + " movq %%r11, 24(%2);" /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ " mov $0, %%rax;" " cmovc %%rdx, %%rax;" " add %%rax, %%r8;" - " movq %%r8, 0(%0);" - : "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2) - : - : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "memory", "cc" - ); + " movq %%r8, 0(%2);" + : "+&r"(f1), "+&r"(f2), "+&r"(tmp) + : "r"(out) + : "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13", + "%r14", "memory", "cc"); } /* Computes two field multiplications: - * out[0] <- f1[0] * f2[0] - * out[1] <- f1[1] * f2[1] - * Uses the 16-element buffer tmp for intermediate results. */ + * out[0] <- f1[0] * f2[0] + * out[1] <- f1[1] * f2[1] + * Uses the 16-element buffer tmp for intermediate results: */ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp) { asm volatile( + /* Compute the raw multiplication tmp[0] <- f1[0] * f2[0] */ /* Compute src1[0] * src2 */ - " movq 0(%1), %%rdx;" - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 0(%0);" - " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);" - " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" - " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;" - " adox %%rdx, %%rax;" + " movq 0(%0), %%rdx;" + " mulxq 0(%1), %%r8, %%r9;" + " xor %%r10d, %%r10d;" + " movq %%r8, 0(%2);" + " mulxq 8(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " movq %%r10, 8(%2);" + " mulxq 16(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " mulxq 24(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + /* Compute src1[1] * src2 */ - " movq 8(%1), %%rdx;" - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);" - " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%0);" - " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;" - " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;" - " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" + " movq 8(%0), %%rdx;" + " mulxq 0(%1), %%r8, %%r9;" + " xor %%r10d, %%r10d;" + " adcxq 8(%2), %%r8;" + " movq %%r8, 8(%2);" + " mulxq 8(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%rbx, %%r10;" + " movq %%r10, 16(%2);" + " mulxq 16(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " adcx %%r14, %%rbx;" + " mov $0, %%r8;" + " mulxq 24(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + /* Compute src1[2] * src2 */ - " movq 16(%1), %%rdx;" - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);" - " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%0);" - " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;" - " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;" - " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" + " movq 16(%0), %%rdx;" + " mulxq 0(%1), %%r8, %%r9;" + " xor %%r10d, %%r10d;" + " adcxq 16(%2), %%r8;" + " movq %%r8, 16(%2);" + " mulxq 8(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%rbx, %%r10;" + " movq %%r10, 24(%2);" + " mulxq 16(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " adcx %%r14, %%rbx;" + " mov $0, %%r8;" + " mulxq 24(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + /* Compute src1[3] * src2 */ - " movq 24(%1), %%rdx;" - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);" - " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%0);" - " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%0);" " mov $0, %%r8;" - " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;" - " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 56(%0);" + " movq 24(%0), %%rdx;" + " mulxq 0(%1), %%r8, %%r9;" + " xor %%r10d, %%r10d;" + " adcxq 24(%2), %%r8;" + " movq %%r8, 24(%2);" + " mulxq 8(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%rbx, %%r10;" + " movq %%r10, 32(%2);" + " mulxq 16(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " adcx %%r14, %%rbx;" + " movq %%rbx, 40(%2);" + " mov $0, %%r8;" + " mulxq 24(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " movq %%r14, 48(%2);" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + " movq %%rax, 56(%2);" /* Compute the raw multiplication tmp[1] <- f1[1] * f2[1] */ /* Compute src1[0] * src2 */ - " movq 32(%1), %%rdx;" - " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 64(%0);" - " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 72(%0);" - " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" - " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;" - " adox %%rdx, %%rax;" + " movq 32(%0), %%rdx;" + " mulxq 32(%1), %%r8, %%r9;" + " xor %%r10d, %%r10d;" + " movq %%r8, 64(%2);" + " mulxq 40(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " movq %%r10, 72(%2);" + " mulxq 48(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " mulxq 56(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + /* Compute src1[1] * src2 */ - " movq 40(%1), %%rdx;" - " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 72(%0), %%r8;" " movq %%r8, 72(%0);" - " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 80(%0);" - " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;" - " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;" - " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" + " movq 40(%0), %%rdx;" + " mulxq 32(%1), %%r8, %%r9;" + " xor %%r10d, %%r10d;" + " adcxq 72(%2), %%r8;" + " movq %%r8, 72(%2);" + " mulxq 40(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%rbx, %%r10;" + " movq %%r10, 80(%2);" + " mulxq 48(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " adcx %%r14, %%rbx;" + " mov $0, %%r8;" + " mulxq 56(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + /* Compute src1[2] * src2 */ - " movq 48(%1), %%rdx;" - " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 80(%0), %%r8;" " movq %%r8, 80(%0);" - " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 88(%0);" - " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;" - " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;" - " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" + " movq 48(%0), %%rdx;" + " mulxq 32(%1), %%r8, %%r9;" + " xor %%r10d, %%r10d;" + " adcxq 80(%2), %%r8;" + " movq %%r8, 80(%2);" + " mulxq 40(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%rbx, %%r10;" + " movq %%r10, 88(%2);" + " mulxq 48(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " adcx %%r14, %%rbx;" + " mov $0, %%r8;" + " mulxq 56(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + /* Compute src1[3] * src2 */ - " movq 56(%1), %%rdx;" - " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 88(%0), %%r8;" " movq %%r8, 88(%0);" - " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 96(%0);" - " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 104(%0);" " mov $0, %%r8;" - " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 112(%0);" " mov $0, %%rax;" - " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 120(%0);" + " movq 56(%0), %%rdx;" + " mulxq 32(%1), %%r8, %%r9;" + " xor %%r10d, %%r10d;" + " adcxq 88(%2), %%r8;" + " movq %%r8, 88(%2);" + " mulxq 40(%1), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%rbx, %%r10;" + " movq %%r10, 96(%2);" + " mulxq 48(%1), %%rbx, %%r13;" + " adox %%r11, %%rbx;" + " adcx %%r14, %%rbx;" + " movq %%rbx, 104(%2);" + " mov $0, %%r8;" + " mulxq 56(%1), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " movq %%r14, 112(%2);" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + " movq %%rax, 120(%2);" + /* Line up pointers */ - " mov %0, %1;" " mov %2, %0;" + " mov %3, %2;" /* Wrap the results back into the field */ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ " mov $38, %%rdx;" - " mulxq 32(%1), %%r8, %%r13;" - " xor %k3, %k3;" - " adoxq 0(%1), %%r8;" - " mulxq 40(%1), %%r9, %%rbx;" + " mulxq 32(%0), %%r8, %%r13;" + " xor %k1, %k1;" + " adoxq 0(%0), %%r8;" + " mulxq 40(%0), %%r9, %%rbx;" " adcx %%r13, %%r9;" - " adoxq 8(%1), %%r9;" - " mulxq 48(%1), %%r10, %%r13;" + " adoxq 8(%0), %%r9;" + " mulxq 48(%0), %%r10, %%r13;" " adcx %%rbx, %%r10;" - " adoxq 16(%1), %%r10;" - " mulxq 56(%1), %%r11, %%rax;" + " adoxq 16(%0), %%r10;" + " mulxq 56(%0), %%r11, %%rax;" " adcx %%r13, %%r11;" - " adoxq 24(%1), %%r11;" - " adcx %3, %%rax;" - " adox %3, %%rax;" + " adoxq 24(%0), %%r11;" + " adcx %1, %%rax;" + " adox %1, %%rax;" " imul %%rdx, %%rax;" /* Step 2: Fold the carry back into dst */ " add %%rax, %%r8;" - " adcx %3, %%r9;" - " movq %%r9, 8(%0);" - " adcx %3, %%r10;" - " movq %%r10, 16(%0);" - " adcx %3, %%r11;" - " movq %%r11, 24(%0);" + " adcx %1, %%r9;" + " movq %%r9, 8(%2);" + " adcx %1, %%r10;" + " movq %%r10, 16(%2);" + " adcx %1, %%r11;" + " movq %%r11, 24(%2);" /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ " mov $0, %%rax;" " cmovc %%rdx, %%rax;" " add %%rax, %%r8;" - " movq %%r8, 0(%0);" + " movq %%r8, 0(%2);" /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ " mov $38, %%rdx;" - " mulxq 96(%1), %%r8, %%r13;" - " xor %k3, %k3;" - " adoxq 64(%1), %%r8;" - " mulxq 104(%1), %%r9, %%rbx;" + " mulxq 96(%0), %%r8, %%r13;" + " xor %k1, %k1;" + " adoxq 64(%0), %%r8;" + " mulxq 104(%0), %%r9, %%rbx;" " adcx %%r13, %%r9;" - " adoxq 72(%1), %%r9;" - " mulxq 112(%1), %%r10, %%r13;" + " adoxq 72(%0), %%r9;" + " mulxq 112(%0), %%r10, %%r13;" " adcx %%rbx, %%r10;" - " adoxq 80(%1), %%r10;" - " mulxq 120(%1), %%r11, %%rax;" + " adoxq 80(%0), %%r10;" + " mulxq 120(%0), %%r11, %%rax;" " adcx %%r13, %%r11;" - " adoxq 88(%1), %%r11;" - " adcx %3, %%rax;" - " adox %3, %%rax;" + " adoxq 88(%0), %%r11;" + " adcx %1, %%rax;" + " adox %1, %%rax;" " imul %%rdx, %%rax;" /* Step 2: Fold the carry back into dst */ " add %%rax, %%r8;" - " adcx %3, %%r9;" - " movq %%r9, 40(%0);" - " adcx %3, %%r10;" - " movq %%r10, 48(%0);" - " adcx %3, %%r11;" - " movq %%r11, 56(%0);" + " adcx %1, %%r9;" + " movq %%r9, 40(%2);" + " adcx %1, %%r10;" + " movq %%r10, 48(%2);" + " adcx %1, %%r11;" + " movq %%r11, 56(%2);" /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ " mov $0, %%rax;" " cmovc %%rdx, %%rax;" " add %%rax, %%r8;" - " movq %%r8, 32(%0);" - : "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2) - : - : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "memory", "cc" - ); + " movq %%r8, 32(%2);" + : "+&r"(f1), "+&r"(f2), "+&r"(tmp) + : "r"(out) + : "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13", + "%r14", "memory", "cc"); } -/* Computes the field multiplication of four-element f1 with value in f2 */ +/* Computes the field multiplication of four-element f1 with value in f2 + * Requires f2 to be smaller than 2^17 */ static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2) { register u64 f2_r asm("rdx") = f2; asm volatile( /* Compute the raw multiplication of f1*f2 */ - " mulxq 0(%2), %%r8, %%rcx;" /* f1[0]*f2 */ - " mulxq 8(%2), %%r9, %%rbx;" /* f1[1]*f2 */ + " mulxq 0(%2), %%r8, %%rcx;" /* f1[0]*f2 */ + " mulxq 8(%2), %%r9, %%rbx;" /* f1[1]*f2 */ " add %%rcx, %%r9;" " mov $0, %%rcx;" - " mulxq 16(%2), %%r10, %%r13;" /* f1[2]*f2 */ + " mulxq 16(%2), %%r10, %%r13;" /* f1[2]*f2 */ " adcx %%rbx, %%r10;" - " mulxq 24(%2), %%r11, %%rax;" /* f1[3]*f2 */ + " mulxq 24(%2), %%r11, %%rax;" /* f1[3]*f2 */ " adcx %%r13, %%r11;" " adcx %%rcx, %%rax;" @@ -418,17 +576,17 @@ static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2) " cmovc %%rdx, %%rax;" " add %%rax, %%r8;" " movq %%r8, 0(%1);" - : "+&r" (f2_r) - : "r" (out), "r" (f1) - : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "memory", "cc" - ); + : "+&r"(f2_r) + : "r"(out), "r"(f1) + : "%rax", "%rbx", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r13", + "memory", "cc"); } /* Computes p1 <- bit ? p2 : p1 in constant time */ static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2) { asm volatile( - /* Invert the polarity of bit to match cmov expectations */ + /* Transfer bit into CF flag */ " add $18446744073709551615, %0;" /* cswap p1[0], p2[0] */ @@ -502,10 +660,9 @@ static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2) " cmovc %%r10, %%r9;" " movq %%r8, 56(%1);" " movq %%r9, 56(%2);" - : "+&r" (bit) - : "r" (p1), "r" (p2) - : "%r8", "%r9", "%r10", "memory", "cc" - ); + : "+&r"(bit) + : "r"(p1), "r"(p2) + : "%r8", "%r9", "%r10", "memory", "cc"); } /* Computes the square of a field element: out <- f * f @@ -516,15 +673,22 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp) /* Compute the raw multiplication: tmp <- f * f */ /* Step 1: Compute all partial products */ - " movq 0(%1), %%rdx;" /* f[0] */ - " mulxq 8(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */ - " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */ - " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */ - " movq 24(%1), %%rdx;" /* f[3] */ - " mulxq 8(%1), %%r11, %%rbx;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */ - " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%rbx;" /* f[2]*f[3] */ - " movq 8(%1), %%rdx;" " adcx %%r15, %%r13;" /* f1 */ - " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */ + " movq 0(%0), %%rdx;" /* f[0] */ + " mulxq 8(%0), %%r8, %%r14;" + " xor %%r15d, %%r15d;" /* f[1]*f[0] */ + " mulxq 16(%0), %%r9, %%r10;" + " adcx %%r14, %%r9;" /* f[2]*f[0] */ + " mulxq 24(%0), %%rax, %%rcx;" + " adcx %%rax, %%r10;" /* f[3]*f[0] */ + " movq 24(%0), %%rdx;" /* f[3] */ + " mulxq 8(%0), %%r11, %%rbx;" + " adcx %%rcx, %%r11;" /* f[1]*f[3] */ + " mulxq 16(%0), %%rax, %%r13;" + " adcx %%rax, %%rbx;" /* f[2]*f[3] */ + " movq 8(%0), %%rdx;" + " adcx %%r15, %%r13;" /* f1 */ + " mulxq 16(%0), %%rax, %%rcx;" + " mov $0, %%r14;" /* f[2]*f[1] */ /* Step 2: Compute two parallel carry chains */ " xor %%r15d, %%r15d;" @@ -542,39 +706,50 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp) " adcx %%r14, %%r14;" /* Step 3: Compute intermediate squares */ - " movq 0(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */ - " movq %%rax, 0(%0);" - " add %%rcx, %%r8;" " movq %%r8, 8(%0);" - " movq 8(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */ - " adcx %%rax, %%r9;" " movq %%r9, 16(%0);" - " adcx %%rcx, %%r10;" " movq %%r10, 24(%0);" - " movq 16(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */ - " adcx %%rax, %%r11;" " movq %%r11, 32(%0);" - " adcx %%rcx, %%rbx;" " movq %%rbx, 40(%0);" - " movq 24(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */ - " adcx %%rax, %%r13;" " movq %%r13, 48(%0);" - " adcx %%rcx, %%r14;" " movq %%r14, 56(%0);" + " movq 0(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */ + " movq %%rax, 0(%1);" + " add %%rcx, %%r8;" + " movq %%r8, 8(%1);" + " movq 8(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */ + " adcx %%rax, %%r9;" + " movq %%r9, 16(%1);" + " adcx %%rcx, %%r10;" + " movq %%r10, 24(%1);" + " movq 16(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */ + " adcx %%rax, %%r11;" + " movq %%r11, 32(%1);" + " adcx %%rcx, %%rbx;" + " movq %%rbx, 40(%1);" + " movq 24(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */ + " adcx %%rax, %%r13;" + " movq %%r13, 48(%1);" + " adcx %%rcx, %%r14;" + " movq %%r14, 56(%1);" /* Line up pointers */ - " mov %0, %1;" - " mov %2, %0;" + " mov %1, %0;" + " mov %2, %1;" /* Wrap the result back into the field */ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ " mov $38, %%rdx;" - " mulxq 32(%1), %%r8, %%r13;" + " mulxq 32(%0), %%r8, %%r13;" " xor %%ecx, %%ecx;" - " adoxq 0(%1), %%r8;" - " mulxq 40(%1), %%r9, %%rbx;" + " adoxq 0(%0), %%r8;" + " mulxq 40(%0), %%r9, %%rbx;" " adcx %%r13, %%r9;" - " adoxq 8(%1), %%r9;" - " mulxq 48(%1), %%r10, %%r13;" + " adoxq 8(%0), %%r9;" + " mulxq 48(%0), %%r10, %%r13;" " adcx %%rbx, %%r10;" - " adoxq 16(%1), %%r10;" - " mulxq 56(%1), %%r11, %%rax;" + " adoxq 16(%0), %%r10;" + " mulxq 56(%0), %%r11, %%rax;" " adcx %%r13, %%r11;" - " adoxq 24(%1), %%r11;" + " adoxq 24(%0), %%r11;" " adcx %%rcx, %%rax;" " adox %%rcx, %%rax;" " imul %%rdx, %%rax;" @@ -582,40 +757,47 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp) /* Step 2: Fold the carry back into dst */ " add %%rax, %%r8;" " adcx %%rcx, %%r9;" - " movq %%r9, 8(%0);" + " movq %%r9, 8(%1);" " adcx %%rcx, %%r10;" - " movq %%r10, 16(%0);" + " movq %%r10, 16(%1);" " adcx %%rcx, %%r11;" - " movq %%r11, 24(%0);" + " movq %%r11, 24(%1);" /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ " mov $0, %%rax;" " cmovc %%rdx, %%rax;" " add %%rax, %%r8;" - " movq %%r8, 0(%0);" - : "+&r" (tmp), "+&r" (f), "+&r" (out) - : - : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "%r15", "memory", "cc" - ); + " movq %%r8, 0(%1);" + : "+&r"(f), "+&r"(tmp) + : "r"(out) + : "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", + "%r13", "%r14", "%r15", "memory", "cc"); } /* Computes two field squarings: - * out[0] <- f[0] * f[0] - * out[1] <- f[1] * f[1] + * out[0] <- f[0] * f[0] + * out[1] <- f[1] * f[1] * Uses the 16-element buffer tmp for intermediate results */ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp) { asm volatile( /* Step 1: Compute all partial products */ - " movq 0(%1), %%rdx;" /* f[0] */ - " mulxq 8(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */ - " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */ - " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */ - " movq 24(%1), %%rdx;" /* f[3] */ - " mulxq 8(%1), %%r11, %%rbx;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */ - " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%rbx;" /* f[2]*f[3] */ - " movq 8(%1), %%rdx;" " adcx %%r15, %%r13;" /* f1 */ - " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */ + " movq 0(%0), %%rdx;" /* f[0] */ + " mulxq 8(%0), %%r8, %%r14;" + " xor %%r15d, %%r15d;" /* f[1]*f[0] */ + " mulxq 16(%0), %%r9, %%r10;" + " adcx %%r14, %%r9;" /* f[2]*f[0] */ + " mulxq 24(%0), %%rax, %%rcx;" + " adcx %%rax, %%r10;" /* f[3]*f[0] */ + " movq 24(%0), %%rdx;" /* f[3] */ + " mulxq 8(%0), %%r11, %%rbx;" + " adcx %%rcx, %%r11;" /* f[1]*f[3] */ + " mulxq 16(%0), %%rax, %%r13;" + " adcx %%rax, %%rbx;" /* f[2]*f[3] */ + " movq 8(%0), %%rdx;" + " adcx %%r15, %%r13;" /* f1 */ + " mulxq 16(%0), %%rax, %%rcx;" + " mov $0, %%r14;" /* f[2]*f[1] */ /* Step 2: Compute two parallel carry chains */ " xor %%r15d, %%r15d;" @@ -633,29 +815,47 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp) " adcx %%r14, %%r14;" /* Step 3: Compute intermediate squares */ - " movq 0(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */ - " movq %%rax, 0(%0);" - " add %%rcx, %%r8;" " movq %%r8, 8(%0);" - " movq 8(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */ - " adcx %%rax, %%r9;" " movq %%r9, 16(%0);" - " adcx %%rcx, %%r10;" " movq %%r10, 24(%0);" - " movq 16(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */ - " adcx %%rax, %%r11;" " movq %%r11, 32(%0);" - " adcx %%rcx, %%rbx;" " movq %%rbx, 40(%0);" - " movq 24(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */ - " adcx %%rax, %%r13;" " movq %%r13, 48(%0);" - " adcx %%rcx, %%r14;" " movq %%r14, 56(%0);" + " movq 0(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */ + " movq %%rax, 0(%1);" + " add %%rcx, %%r8;" + " movq %%r8, 8(%1);" + " movq 8(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */ + " adcx %%rax, %%r9;" + " movq %%r9, 16(%1);" + " adcx %%rcx, %%r10;" + " movq %%r10, 24(%1);" + " movq 16(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */ + " adcx %%rax, %%r11;" + " movq %%r11, 32(%1);" + " adcx %%rcx, %%rbx;" + " movq %%rbx, 40(%1);" + " movq 24(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */ + " adcx %%rax, %%r13;" + " movq %%r13, 48(%1);" + " adcx %%rcx, %%r14;" + " movq %%r14, 56(%1);" /* Step 1: Compute all partial products */ - " movq 32(%1), %%rdx;" /* f[0] */ - " mulxq 40(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */ - " mulxq 48(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */ - " mulxq 56(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */ - " movq 56(%1), %%rdx;" /* f[3] */ - " mulxq 40(%1), %%r11, %%rbx;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */ - " mulxq 48(%1), %%rax, %%r13;" " adcx %%rax, %%rbx;" /* f[2]*f[3] */ - " movq 40(%1), %%rdx;" " adcx %%r15, %%r13;" /* f1 */ - " mulxq 48(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */ + " movq 32(%0), %%rdx;" /* f[0] */ + " mulxq 40(%0), %%r8, %%r14;" + " xor %%r15d, %%r15d;" /* f[1]*f[0] */ + " mulxq 48(%0), %%r9, %%r10;" + " adcx %%r14, %%r9;" /* f[2]*f[0] */ + " mulxq 56(%0), %%rax, %%rcx;" + " adcx %%rax, %%r10;" /* f[3]*f[0] */ + " movq 56(%0), %%rdx;" /* f[3] */ + " mulxq 40(%0), %%r11, %%rbx;" + " adcx %%rcx, %%r11;" /* f[1]*f[3] */ + " mulxq 48(%0), %%rax, %%r13;" + " adcx %%rax, %%rbx;" /* f[2]*f[3] */ + " movq 40(%0), %%rdx;" + " adcx %%r15, %%r13;" /* f1 */ + " mulxq 48(%0), %%rax, %%rcx;" + " mov $0, %%r14;" /* f[2]*f[1] */ /* Step 2: Compute two parallel carry chains */ " xor %%r15d, %%r15d;" @@ -673,37 +873,48 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp) " adcx %%r14, %%r14;" /* Step 3: Compute intermediate squares */ - " movq 32(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */ - " movq %%rax, 64(%0);" - " add %%rcx, %%r8;" " movq %%r8, 72(%0);" - " movq 40(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */ - " adcx %%rax, %%r9;" " movq %%r9, 80(%0);" - " adcx %%rcx, %%r10;" " movq %%r10, 88(%0);" - " movq 48(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */ - " adcx %%rax, %%r11;" " movq %%r11, 96(%0);" - " adcx %%rcx, %%rbx;" " movq %%rbx, 104(%0);" - " movq 56(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */ - " adcx %%rax, %%r13;" " movq %%r13, 112(%0);" - " adcx %%rcx, %%r14;" " movq %%r14, 120(%0);" + " movq 32(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */ + " movq %%rax, 64(%1);" + " add %%rcx, %%r8;" + " movq %%r8, 72(%1);" + " movq 40(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */ + " adcx %%rax, %%r9;" + " movq %%r9, 80(%1);" + " adcx %%rcx, %%r10;" + " movq %%r10, 88(%1);" + " movq 48(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */ + " adcx %%rax, %%r11;" + " movq %%r11, 96(%1);" + " adcx %%rcx, %%rbx;" + " movq %%rbx, 104(%1);" + " movq 56(%0), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */ + " adcx %%rax, %%r13;" + " movq %%r13, 112(%1);" + " adcx %%rcx, %%r14;" + " movq %%r14, 120(%1);" /* Line up pointers */ - " mov %0, %1;" - " mov %2, %0;" + " mov %1, %0;" + " mov %2, %1;" /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ " mov $38, %%rdx;" - " mulxq 32(%1), %%r8, %%r13;" + " mulxq 32(%0), %%r8, %%r13;" " xor %%ecx, %%ecx;" - " adoxq 0(%1), %%r8;" - " mulxq 40(%1), %%r9, %%rbx;" + " adoxq 0(%0), %%r8;" + " mulxq 40(%0), %%r9, %%rbx;" " adcx %%r13, %%r9;" - " adoxq 8(%1), %%r9;" - " mulxq 48(%1), %%r10, %%r13;" + " adoxq 8(%0), %%r9;" + " mulxq 48(%0), %%r10, %%r13;" " adcx %%rbx, %%r10;" - " adoxq 16(%1), %%r10;" - " mulxq 56(%1), %%r11, %%rax;" + " adoxq 16(%0), %%r10;" + " mulxq 56(%0), %%r11, %%rax;" " adcx %%r13, %%r11;" - " adoxq 24(%1), %%r11;" + " adoxq 24(%0), %%r11;" " adcx %%rcx, %%rax;" " adox %%rcx, %%rax;" " imul %%rdx, %%rax;" @@ -711,32 +922,32 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp) /* Step 2: Fold the carry back into dst */ " add %%rax, %%r8;" " adcx %%rcx, %%r9;" - " movq %%r9, 8(%0);" + " movq %%r9, 8(%1);" " adcx %%rcx, %%r10;" - " movq %%r10, 16(%0);" + " movq %%r10, 16(%1);" " adcx %%rcx, %%r11;" - " movq %%r11, 24(%0);" + " movq %%r11, 24(%1);" /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ " mov $0, %%rax;" " cmovc %%rdx, %%rax;" " add %%rax, %%r8;" - " movq %%r8, 0(%0);" + " movq %%r8, 0(%1);" /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ " mov $38, %%rdx;" - " mulxq 96(%1), %%r8, %%r13;" + " mulxq 96(%0), %%r8, %%r13;" " xor %%ecx, %%ecx;" - " adoxq 64(%1), %%r8;" - " mulxq 104(%1), %%r9, %%rbx;" + " adoxq 64(%0), %%r8;" + " mulxq 104(%0), %%r9, %%rbx;" " adcx %%r13, %%r9;" - " adoxq 72(%1), %%r9;" - " mulxq 112(%1), %%r10, %%r13;" + " adoxq 72(%0), %%r9;" + " mulxq 112(%0), %%r10, %%r13;" " adcx %%rbx, %%r10;" - " adoxq 80(%1), %%r10;" - " mulxq 120(%1), %%r11, %%rax;" + " adoxq 80(%0), %%r10;" + " mulxq 120(%0), %%r11, %%rax;" " adcx %%r13, %%r11;" - " adoxq 88(%1), %%r11;" + " adoxq 88(%0), %%r11;" " adcx %%rcx, %%rax;" " adox %%rcx, %%rax;" " imul %%rdx, %%rax;" @@ -744,21 +955,21 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp) /* Step 2: Fold the carry back into dst */ " add %%rax, %%r8;" " adcx %%rcx, %%r9;" - " movq %%r9, 40(%0);" + " movq %%r9, 40(%1);" " adcx %%rcx, %%r10;" - " movq %%r10, 48(%0);" + " movq %%r10, 48(%1);" " adcx %%rcx, %%r11;" - " movq %%r11, 56(%0);" + " movq %%r11, 56(%1);" /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ " mov $0, %%rax;" " cmovc %%rdx, %%rax;" " add %%rax, %%r8;" - " movq %%r8, 32(%0);" - : "+&r" (tmp), "+&r" (f), "+&r" (out) - : - : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "%r15", "memory", "cc" - ); + " movq %%r8, 32(%1);" + : "+&r"(f), "+&r"(tmp) + : "r"(out) + : "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", + "%r13", "%r14", "%r15", "memory", "cc"); } static void point_add_and_double(u64 *q, u64 *p01_tmp1, u64 *tmp2) diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S index fac0fdc3f25d..f4c760f4cade 100644 --- a/arch/x86/crypto/des3_ede-asm_64.S +++ b/arch/x86/crypto/des3_ede-asm_64.S @@ -243,7 +243,7 @@ SYM_FUNC_START(des3_ede_x86_64_crypt_blk) popq %r12; popq %rbx; - ret; + RET; SYM_FUNC_END(des3_ede_x86_64_crypt_blk) /*********************************************************************** @@ -528,7 +528,7 @@ SYM_FUNC_START(des3_ede_x86_64_crypt_blk_3way) popq %r12; popq %rbx; - ret; + RET; SYM_FUNC_END(des3_ede_x86_64_crypt_blk_3way) .section .rodata, "a", @progbits diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c index e7cb68a3db3b..787c234d2469 100644 --- a/arch/x86/crypto/des3_ede_glue.c +++ b/arch/x86/crypto/des3_ede_glue.c @@ -164,7 +164,7 @@ static int cbc_encrypt(struct skcipher_request *req) err = skcipher_walk_virt(&walk, req, false); - while ((nbytes = walk.nbytes)) { + while (walk.nbytes) { nbytes = __cbc_encrypt(ctx, &walk); err = skcipher_walk_done(&walk, nbytes); } @@ -243,7 +243,7 @@ static int cbc_decrypt(struct skcipher_request *req) err = skcipher_walk_virt(&walk, req, false); - while ((nbytes = walk.nbytes)) { + while (walk.nbytes) { nbytes = __cbc_decrypt(ctx, &walk); err = skcipher_walk_done(&walk, nbytes); } diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S index 99ac25e18e09..2bf871899920 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_asm.S +++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S @@ -85,7 +85,7 @@ SYM_FUNC_START_LOCAL(__clmul_gf128mul_ble) psrlq $1, T2 pxor T2, T1 pxor T1, DATA - ret + RET SYM_FUNC_END(__clmul_gf128mul_ble) /* void clmul_ghash_mul(char *dst, const u128 *shash) */ @@ -99,7 +99,7 @@ SYM_FUNC_START(clmul_ghash_mul) pshufb BSWAP, DATA movups DATA, (%rdi) FRAME_END - ret + RET SYM_FUNC_END(clmul_ghash_mul) /* @@ -128,5 +128,5 @@ SYM_FUNC_START(clmul_ghash_update) movups DATA, (%rdi) .Lupdate_just_ret: FRAME_END - ret + RET SYM_FUNC_END(clmul_ghash_update) diff --git a/arch/x86/crypto/nh-avx2-x86_64.S b/arch/x86/crypto/nh-avx2-x86_64.S index b22c7b936272..6a0b15e7196a 100644 --- a/arch/x86/crypto/nh-avx2-x86_64.S +++ b/arch/x86/crypto/nh-avx2-x86_64.S @@ -153,5 +153,5 @@ SYM_FUNC_START(nh_avx2) vpaddq T1, T0, T0 vpaddq T4, T0, T0 vmovdqu T0, (HASH) - ret + RET SYM_FUNC_END(nh_avx2) diff --git a/arch/x86/crypto/nh-sse2-x86_64.S b/arch/x86/crypto/nh-sse2-x86_64.S index d7ae22dd6683..34c567bbcb4f 100644 --- a/arch/x86/crypto/nh-sse2-x86_64.S +++ b/arch/x86/crypto/nh-sse2-x86_64.S @@ -119,5 +119,5 @@ SYM_FUNC_START(nh_sse2) paddq PASS2_SUMS, T1 movdqu T0, 0x00(HASH) movdqu T1, 0x10(HASH) - ret + RET SYM_FUNC_END(nh_sse2) diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S index b7ee24df7fba..82f2313f512b 100644 --- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S @@ -601,7 +601,7 @@ SYM_FUNC_START_LOCAL(__serpent_enc_blk8_avx) write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); - ret; + RET; SYM_FUNC_END(__serpent_enc_blk8_avx) .align 8 @@ -655,7 +655,7 @@ SYM_FUNC_START_LOCAL(__serpent_dec_blk8_avx) write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2); write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2); - ret; + RET; SYM_FUNC_END(__serpent_dec_blk8_avx) SYM_FUNC_START(serpent_ecb_enc_8way_avx) @@ -673,7 +673,7 @@ SYM_FUNC_START(serpent_ecb_enc_8way_avx) store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); FRAME_END - ret; + RET; SYM_FUNC_END(serpent_ecb_enc_8way_avx) SYM_FUNC_START(serpent_ecb_dec_8way_avx) @@ -691,7 +691,7 @@ SYM_FUNC_START(serpent_ecb_dec_8way_avx) store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); FRAME_END - ret; + RET; SYM_FUNC_END(serpent_ecb_dec_8way_avx) SYM_FUNC_START(serpent_cbc_dec_8way_avx) @@ -709,5 +709,5 @@ SYM_FUNC_START(serpent_cbc_dec_8way_avx) store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); FRAME_END - ret; + RET; SYM_FUNC_END(serpent_cbc_dec_8way_avx) diff --git a/arch/x86/crypto/serpent-avx2-asm_64.S b/arch/x86/crypto/serpent-avx2-asm_64.S index 9161b6e441f3..8ea34c9b9316 100644 --- a/arch/x86/crypto/serpent-avx2-asm_64.S +++ b/arch/x86/crypto/serpent-avx2-asm_64.S @@ -601,7 +601,7 @@ SYM_FUNC_START_LOCAL(__serpent_enc_blk16) write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); - ret; + RET; SYM_FUNC_END(__serpent_enc_blk16) .align 8 @@ -655,7 +655,7 @@ SYM_FUNC_START_LOCAL(__serpent_dec_blk16) write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2); write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2); - ret; + RET; SYM_FUNC_END(__serpent_dec_blk16) SYM_FUNC_START(serpent_ecb_enc_16way) @@ -677,7 +677,7 @@ SYM_FUNC_START(serpent_ecb_enc_16way) vzeroupper; FRAME_END - ret; + RET; SYM_FUNC_END(serpent_ecb_enc_16way) SYM_FUNC_START(serpent_ecb_dec_16way) @@ -699,7 +699,7 @@ SYM_FUNC_START(serpent_ecb_dec_16way) vzeroupper; FRAME_END - ret; + RET; SYM_FUNC_END(serpent_ecb_dec_16way) SYM_FUNC_START(serpent_cbc_dec_16way) @@ -722,5 +722,5 @@ SYM_FUNC_START(serpent_cbc_dec_16way) vzeroupper; FRAME_END - ret; + RET; SYM_FUNC_END(serpent_cbc_dec_16way) diff --git a/arch/x86/crypto/serpent-sse2-i586-asm_32.S b/arch/x86/crypto/serpent-sse2-i586-asm_32.S index 6379b99cb722..8ccb03ad7cef 100644 --- a/arch/x86/crypto/serpent-sse2-i586-asm_32.S +++ b/arch/x86/crypto/serpent-sse2-i586-asm_32.S @@ -553,12 +553,12 @@ SYM_FUNC_START(__serpent_enc_blk_4way) write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE); - ret; + RET; .L__enc_xor4: xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE); - ret; + RET; SYM_FUNC_END(__serpent_enc_blk_4way) SYM_FUNC_START(serpent_dec_blk_4way) @@ -612,5 +612,5 @@ SYM_FUNC_START(serpent_dec_blk_4way) movl arg_dst(%esp), %eax; write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA); - ret; + RET; SYM_FUNC_END(serpent_dec_blk_4way) diff --git a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S index efb6dc17dc90..e0998a011d1d 100644 --- a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S @@ -675,13 +675,13 @@ SYM_FUNC_START(__serpent_enc_blk_8way) write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2); write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); - ret; + RET; .L__enc_xor8: xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2); xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); - ret; + RET; SYM_FUNC_END(__serpent_enc_blk_8way) SYM_FUNC_START(serpent_dec_blk_8way) @@ -735,5 +735,5 @@ SYM_FUNC_START(serpent_dec_blk_8way) write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2); write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2); - ret; + RET; SYM_FUNC_END(serpent_dec_blk_8way) diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S index 5eed620f4676..a96b2fd26dab 100644 --- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S +++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S @@ -674,7 +674,7 @@ _loop3: pop %r12 pop %rbx - ret + RET SYM_FUNC_END(\name) .endm diff --git a/arch/x86/crypto/sha1_ni_asm.S b/arch/x86/crypto/sha1_ni_asm.S index 5d8415f482bd..2f94ec0e763b 100644 --- a/arch/x86/crypto/sha1_ni_asm.S +++ b/arch/x86/crypto/sha1_ni_asm.S @@ -290,7 +290,7 @@ SYM_FUNC_START(sha1_ni_transform) mov %rbp, %rsp pop %rbp - ret + RET SYM_FUNC_END(sha1_ni_transform) .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S index d25668d2a1e9..263f916362e0 100644 --- a/arch/x86/crypto/sha1_ssse3_asm.S +++ b/arch/x86/crypto/sha1_ssse3_asm.S @@ -99,7 +99,7 @@ pop %rbp pop %r12 pop %rbx - ret + RET SYM_FUNC_END(\name) .endm diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S index 4739cd31b9db..3baa1ec39097 100644 --- a/arch/x86/crypto/sha256-avx-asm.S +++ b/arch/x86/crypto/sha256-avx-asm.S @@ -458,7 +458,7 @@ done_hash: popq %r13 popq %r12 popq %rbx - ret + RET SYM_FUNC_END(sha256_transform_avx) .section .rodata.cst256.K256, "aM", @progbits, 256 diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S index 4087f7432a7e..9bcdbc47b8b4 100644 --- a/arch/x86/crypto/sha256-avx2-asm.S +++ b/arch/x86/crypto/sha256-avx2-asm.S @@ -710,7 +710,7 @@ done_hash: popq %r13 popq %r12 popq %rbx - ret + RET SYM_FUNC_END(sha256_transform_rorx) .section .rodata.cst512.K256, "aM", @progbits, 512 diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S index ddfa863b4ee3..c4a5db612c32 100644 --- a/arch/x86/crypto/sha256-ssse3-asm.S +++ b/arch/x86/crypto/sha256-ssse3-asm.S @@ -472,7 +472,7 @@ done_hash: popq %r12 popq %rbx - ret + RET SYM_FUNC_END(sha256_transform_ssse3) .section .rodata.cst256.K256, "aM", @progbits, 256 diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S index 7abade04a3a3..94d50dd27cb5 100644 --- a/arch/x86/crypto/sha256_ni_asm.S +++ b/arch/x86/crypto/sha256_ni_asm.S @@ -326,7 +326,7 @@ SYM_FUNC_START(sha256_ni_transform) .Ldone_hash: - ret + RET SYM_FUNC_END(sha256_ni_transform) .section .rodata.cst256.K256, "aM", @progbits, 256 diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S index 3d8f0fd4eea8..1fefe6dd3a9e 100644 --- a/arch/x86/crypto/sha512-avx-asm.S +++ b/arch/x86/crypto/sha512-avx-asm.S @@ -361,7 +361,7 @@ updateblock: pop %rbx nowork: - ret + RET SYM_FUNC_END(sha512_transform_avx) ######################################################################## diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S index 072cb0f0deae..5cdaab7d6901 100644 --- a/arch/x86/crypto/sha512-avx2-asm.S +++ b/arch/x86/crypto/sha512-avx2-asm.S @@ -679,7 +679,7 @@ done_hash: pop %r12 pop %rbx - ret + RET SYM_FUNC_END(sha512_transform_rorx) ######################################################################## diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S index bd51c9070bed..b84c22e06c5f 100644 --- a/arch/x86/crypto/sha512-ssse3-asm.S +++ b/arch/x86/crypto/sha512-ssse3-asm.S @@ -363,7 +363,7 @@ updateblock: pop %rbx nowork: - ret + RET SYM_FUNC_END(sha512_transform_ssse3) ######################################################################## diff --git a/arch/x86/crypto/sm4-aesni-avx-asm_64.S b/arch/x86/crypto/sm4-aesni-avx-asm_64.S index 18d2f5199194..4767ab61ff48 100644 --- a/arch/x86/crypto/sm4-aesni-avx-asm_64.S +++ b/arch/x86/crypto/sm4-aesni-avx-asm_64.S @@ -78,7 +78,7 @@ vpxor tmp0, x, x; -.section .rodata.cst164, "aM", @progbits, 164 +.section .rodata.cst16, "aM", @progbits, 16 .align 16 /* @@ -133,6 +133,10 @@ .L0f0f0f0f: .long 0x0f0f0f0f +/* 12 bytes, only for padding */ +.Lpadding_deadbeef: + .long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef + .text .align 16 @@ -242,7 +246,7 @@ SYM_FUNC_START(sm4_aesni_avx_crypt4) .Lblk4_store_output_done: vzeroall; FRAME_END - ret; + RET; SYM_FUNC_END(sm4_aesni_avx_crypt4) .align 8 @@ -352,7 +356,7 @@ SYM_FUNC_START_LOCAL(__sm4_crypt_blk8) vpshufb RTMP2, RB3, RB3; FRAME_END - ret; + RET; SYM_FUNC_END(__sm4_crypt_blk8) /* @@ -408,7 +412,7 @@ SYM_FUNC_START(sm4_aesni_avx_crypt8) .Lblk8_store_output_done: vzeroall; FRAME_END - ret; + RET; SYM_FUNC_END(sm4_aesni_avx_crypt8) /* @@ -483,7 +487,7 @@ SYM_FUNC_START(sm4_aesni_avx_ctr_enc_blk8) vzeroall; FRAME_END - ret; + RET; SYM_FUNC_END(sm4_aesni_avx_ctr_enc_blk8) /* @@ -533,7 +537,7 @@ SYM_FUNC_START(sm4_aesni_avx_cbc_dec_blk8) vzeroall; FRAME_END - ret; + RET; SYM_FUNC_END(sm4_aesni_avx_cbc_dec_blk8) /* @@ -586,5 +590,5 @@ SYM_FUNC_START(sm4_aesni_avx_cfb_dec_blk8) vzeroall; FRAME_END - ret; + RET; SYM_FUNC_END(sm4_aesni_avx_cfb_dec_blk8) diff --git a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S index d2ffd7f76ee2..4732fe8bb65b 100644 --- a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S @@ -93,7 +93,7 @@ vpxor tmp0, x, x; -.section .rodata.cst164, "aM", @progbits, 164 +.section .rodata.cst16, "aM", @progbits, 16 .align 16 /* @@ -148,6 +148,10 @@ .L0f0f0f0f: .long 0x0f0f0f0f +/* 12 bytes, only for padding */ +.Lpadding_deadbeef: + .long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef + .text .align 16 @@ -264,7 +268,7 @@ SYM_FUNC_START_LOCAL(__sm4_crypt_blk16) vpshufb RTMP2, RB3, RB3; FRAME_END - ret; + RET; SYM_FUNC_END(__sm4_crypt_blk16) #define inc_le128(x, minus_one, tmp) \ @@ -383,7 +387,7 @@ SYM_FUNC_START(sm4_aesni_avx2_ctr_enc_blk16) vzeroall; FRAME_END - ret; + RET; SYM_FUNC_END(sm4_aesni_avx2_ctr_enc_blk16) /* @@ -437,7 +441,7 @@ SYM_FUNC_START(sm4_aesni_avx2_cbc_dec_blk16) vzeroall; FRAME_END - ret; + RET; SYM_FUNC_END(sm4_aesni_avx2_cbc_dec_blk16) /* @@ -493,5 +497,5 @@ SYM_FUNC_START(sm4_aesni_avx2_cfb_dec_blk16) vzeroall; FRAME_END - ret; + RET; SYM_FUNC_END(sm4_aesni_avx2_cfb_dec_blk16) diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S index 37e63b3c664e..31f9b2ec3857 100644 --- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S @@ -267,7 +267,7 @@ SYM_FUNC_START_LOCAL(__twofish_enc_blk8) outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2); outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2); - ret; + RET; SYM_FUNC_END(__twofish_enc_blk8) .align 8 @@ -307,7 +307,7 @@ SYM_FUNC_START_LOCAL(__twofish_dec_blk8) outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2); outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2); - ret; + RET; SYM_FUNC_END(__twofish_dec_blk8) SYM_FUNC_START(twofish_ecb_enc_8way) @@ -327,7 +327,7 @@ SYM_FUNC_START(twofish_ecb_enc_8way) store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2); FRAME_END - ret; + RET; SYM_FUNC_END(twofish_ecb_enc_8way) SYM_FUNC_START(twofish_ecb_dec_8way) @@ -347,7 +347,7 @@ SYM_FUNC_START(twofish_ecb_dec_8way) store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); FRAME_END - ret; + RET; SYM_FUNC_END(twofish_ecb_dec_8way) SYM_FUNC_START(twofish_cbc_dec_8way) @@ -372,5 +372,5 @@ SYM_FUNC_START(twofish_cbc_dec_8way) popq %r12; FRAME_END - ret; + RET; SYM_FUNC_END(twofish_cbc_dec_8way) diff --git a/arch/x86/crypto/twofish-i586-asm_32.S b/arch/x86/crypto/twofish-i586-asm_32.S index a6f09e4f2e46..3abcad661884 100644 --- a/arch/x86/crypto/twofish-i586-asm_32.S +++ b/arch/x86/crypto/twofish-i586-asm_32.S @@ -260,7 +260,7 @@ SYM_FUNC_START(twofish_enc_blk) pop %ebx pop %ebp mov $1, %eax - ret + RET SYM_FUNC_END(twofish_enc_blk) SYM_FUNC_START(twofish_dec_blk) @@ -317,5 +317,5 @@ SYM_FUNC_START(twofish_dec_blk) pop %ebx pop %ebp mov $1, %eax - ret + RET SYM_FUNC_END(twofish_dec_blk) diff --git a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S index bca4cea757ce..d2288bf38a8a 100644 --- a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S +++ b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S @@ -258,7 +258,7 @@ SYM_FUNC_START(__twofish_enc_blk_3way) popq %rbx; popq %r12; popq %r13; - ret; + RET; .L__enc_xor3: outunpack_enc3(xor); @@ -266,7 +266,7 @@ SYM_FUNC_START(__twofish_enc_blk_3way) popq %rbx; popq %r12; popq %r13; - ret; + RET; SYM_FUNC_END(__twofish_enc_blk_3way) SYM_FUNC_START(twofish_dec_blk_3way) @@ -301,5 +301,5 @@ SYM_FUNC_START(twofish_dec_blk_3way) popq %rbx; popq %r12; popq %r13; - ret; + RET; SYM_FUNC_END(twofish_dec_blk_3way) diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S index d2e56232494a..775af290cd19 100644 --- a/arch/x86/crypto/twofish-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-x86_64-asm_64.S @@ -252,7 +252,7 @@ SYM_FUNC_START(twofish_enc_blk) popq R1 movl $1,%eax - ret + RET SYM_FUNC_END(twofish_enc_blk) SYM_FUNC_START(twofish_dec_blk) @@ -304,5 +304,5 @@ SYM_FUNC_START(twofish_dec_blk) popq R1 movl $1,%eax - ret + RET SYM_FUNC_END(twofish_dec_blk) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index ccb9d32768f3..a7ec22b1d06c 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -268,19 +268,16 @@ 1: popl %ds 2: popl %es 3: popl %fs - addl $(4 + \pop), %esp /* pop the unused "gs" slot */ +4: addl $(4 + \pop), %esp /* pop the unused "gs" slot */ IRET_FRAME -.pushsection .fixup, "ax" -4: movl $0, (%esp) - jmp 1b -5: movl $0, (%esp) - jmp 2b -6: movl $0, (%esp) - jmp 3b -.popsection - _ASM_EXTABLE(1b, 4b) - _ASM_EXTABLE(2b, 5b) - _ASM_EXTABLE(3b, 6b) + + /* + * There is no _ASM_EXTABLE_TYPE_REG() for ASM, however since this is + * ASM the registers are known and we can trivially hard-code them. + */ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_POP_ZERO|EX_REG_DS) + _ASM_EXTABLE_TYPE(2b, 3b, EX_TYPE_POP_ZERO|EX_REG_ES) + _ASM_EXTABLE_TYPE(3b, 4b, EX_TYPE_POP_ZERO|EX_REG_FS) .endm .macro RESTORE_ALL_NMI cr3_reg:req pop=0 @@ -740,7 +737,7 @@ SYM_FUNC_START(schedule_tail_wrapper) popl %eax FRAME_END - ret + RET SYM_FUNC_END(schedule_tail_wrapper) .popsection @@ -925,10 +922,8 @@ SYM_FUNC_START(entry_SYSENTER_32) sti sysexit -.pushsection .fixup, "ax" -2: movl $0, PT_FS(%esp) - jmp 1b -.popsection +2: movl $0, PT_FS(%esp) + jmp 1b _ASM_EXTABLE(1b, 2b) .Lsysenter_fix_flags: @@ -996,8 +991,7 @@ restore_all_switch_stack: */ iret -.section .fixup, "ax" -SYM_CODE_START(asm_iret_error) +.Lasm_iret_error: pushl $0 # no error code pushl $iret_error @@ -1014,9 +1008,8 @@ SYM_CODE_START(asm_iret_error) #endif jmp handle_exception -SYM_CODE_END(asm_iret_error) -.previous - _ASM_EXTABLE(.Lirq_return, asm_iret_error) + + _ASM_EXTABLE(.Lirq_return, .Lasm_iret_error) SYM_FUNC_END(entry_INT80_32) .macro FIXUP_ESPFIX_STACK diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index e38a4cf795d9..1ffdbfaad2e2 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -574,6 +574,10 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) ud2 1: #endif +#ifdef CONFIG_XEN_PV + ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV +#endif + POP_REGS pop_rdi=0 /* @@ -734,14 +738,10 @@ SYM_FUNC_START(asm_load_gs_index) 2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE swapgs FRAME_END - ret -SYM_FUNC_END(asm_load_gs_index) -EXPORT_SYMBOL(asm_load_gs_index) + RET - _ASM_EXTABLE(.Lgs_change, .Lbad_gs) - .section .fixup, "ax" /* running with kernelgs */ -SYM_CODE_START_LOCAL_NOALIGN(.Lbad_gs) +.Lbad_gs: swapgs /* switch back to user gs */ .macro ZAP_GS /* This can't be a string because the preprocessor needs to see it. */ @@ -752,8 +752,11 @@ SYM_CODE_START_LOCAL_NOALIGN(.Lbad_gs) xorl %eax, %eax movl %eax, %gs jmp 2b -SYM_CODE_END(.Lbad_gs) - .previous + + _ASM_EXTABLE(.Lgs_change, .Lbad_gs) + +SYM_FUNC_END(asm_load_gs_index) +EXPORT_SYMBOL(asm_load_gs_index) #ifdef CONFIG_XEN_PV /* @@ -885,11 +888,12 @@ SYM_CODE_START_LOCAL(paranoid_entry) * is needed here. */ SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx - ret + RET .Lparanoid_entry_checkgs: /* EBX = 1 -> kernel GSBASE active, no restore required */ movl $1, %ebx + /* * The kernel-enforced convention is a negative GSBASE indicates * a kernel value. No SWAPGS needed on entry and exit. @@ -897,22 +901,15 @@ SYM_CODE_START_LOCAL(paranoid_entry) movl $MSR_GS_BASE, %ecx rdmsr testl %edx, %edx - jns .Lparanoid_entry_swapgs - ret + js .Lparanoid_kernel_gsbase -.Lparanoid_entry_swapgs: + /* EBX = 0 -> SWAPGS required on exit */ + xorl %ebx, %ebx swapgs +.Lparanoid_kernel_gsbase: - /* - * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an - * unconditional CR3 write, even in the PTI case. So do an lfence - * to prevent GS speculation, regardless of whether PTI is enabled. - */ FENCE_SWAPGS_KERNEL_ENTRY - - /* EBX = 0 -> SWAPGS required on exit */ - xorl %ebx, %ebx - ret + RET SYM_CODE_END(paranoid_entry) /* @@ -991,12 +988,7 @@ SYM_CODE_START_LOCAL(error_entry) movq %rax, %rsp /* switch stack */ ENCODE_FRAME_POINTER pushq %r12 - ret - -.Lerror_entry_done_lfence: - FENCE_SWAPGS_KERNEL_ENTRY -.Lerror_entry_done: - ret + RET /* * There are two places in the kernel that can potentially fault with @@ -1020,8 +1012,14 @@ SYM_CODE_START_LOCAL(error_entry) * .Lgs_change's error handler with kernel gsbase. */ SWAPGS - FENCE_SWAPGS_USER_ENTRY - jmp .Lerror_entry_done + + /* + * Issue an LFENCE to prevent GS speculation, regardless of whether it is a + * kernel or user gsbase. + */ +.Lerror_entry_done_lfence: + FENCE_SWAPGS_KERNEL_ENTRY + RET .Lbstep_iret: /* Fix truncated RIP */ diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 960a021d543e..320480a8db4f 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -453,3 +453,5 @@ 446 i386 landlock_restrict_self sys_landlock_restrict_self 447 i386 memfd_secret sys_memfd_secret 448 i386 process_mrelease sys_process_mrelease +449 i386 futex_waitv sys_futex_waitv +450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 18b5500ea8bf..c84d12608cd2 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -370,6 +370,8 @@ 446 common landlock_restrict_self sys_landlock_restrict_self 447 common memfd_secret sys_memfd_secret 448 common process_mrelease sys_process_mrelease +449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S index f1f96d4d8cd6..7591bab060f7 100644 --- a/arch/x86/entry/thunk_32.S +++ b/arch/x86/entry/thunk_32.S @@ -24,7 +24,7 @@ SYM_CODE_START_NOALIGN(\name) popl %edx popl %ecx popl %eax - ret + RET _ASM_NOKPROBE(\name) SYM_CODE_END(\name) .endm diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S index 496b11ec469d..505b488fcc65 100644 --- a/arch/x86/entry/thunk_64.S +++ b/arch/x86/entry/thunk_64.S @@ -50,7 +50,7 @@ SYM_CODE_START_LOCAL_NOALIGN(__thunk_restore) popq %rsi popq %rdi popq %rbp - ret + RET _ASM_NOKPROBE(__thunk_restore) SYM_CODE_END(__thunk_restore) #endif diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index a2dddcc189f6..693f8b9031fb 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -172,7 +172,7 @@ $(obj)/vdso32.so.dbg: $(obj)/vdso32/vdso32.lds $(vobjs32) FORCE # The DSO images are built using a special linker script. # quiet_cmd_vdso = VDSO $@ - cmd_vdso = $(LD) -nostdlib -o $@ \ + cmd_vdso = $(LD) -o $@ \ $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ -T $(filter %.lds,$^) $(filter %.o,$^) && \ sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@' diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S index dc8da7695859..bafa73f09e92 100644 --- a/arch/x86/entry/vdso/vdso-layout.lds.S +++ b/arch/x86/entry/vdso/vdso-layout.lds.S @@ -77,7 +77,6 @@ SECTIONS .text : { *(.text*) - *(.fixup) } :text =0x90909090, diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S index 6ddd7a937b3e..d33c6513fd2c 100644 --- a/arch/x86/entry/vdso/vdso32/system_call.S +++ b/arch/x86/entry/vdso/vdso32/system_call.S @@ -78,7 +78,7 @@ SYM_INNER_LABEL(int80_landing_pad, SYM_L_GLOBAL) popl %ecx CFI_RESTORE ecx CFI_ADJUST_CFA_OFFSET -4 - ret + RET CFI_ENDPROC .size __kernel_vsyscall,.-__kernel_vsyscall diff --git a/arch/x86/entry/vdso/vsgx.S b/arch/x86/entry/vdso/vsgx.S index 99dafac992e2..d77d278ee9dd 100644 --- a/arch/x86/entry/vdso/vsgx.S +++ b/arch/x86/entry/vdso/vsgx.S @@ -81,7 +81,7 @@ SYM_FUNC_START(__vdso_sgx_enter_enclave) pop %rbx leave .cfi_def_cfa %rsp, 8 - ret + RET /* The out-of-line code runs with the pre-leave stack frame. */ .cfi_def_cfa %rbp, 16 diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 1b40b9297083..fd2ee9408e91 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -226,7 +226,8 @@ bool emulate_vsyscall(unsigned long error_code, if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { warn_bad_vsyscall(KERN_DEBUG, regs, "seccomp tried to change syscall nr or ip"); - do_exit(SIGSYS); + force_exit_sig(SIGSYS); + return true; } regs->orig_ax = -1; if (tmp) diff --git a/arch/x86/entry/vsyscall/vsyscall_emu_64.S b/arch/x86/entry/vsyscall/vsyscall_emu_64.S index 2e203f3a25a7..15e35159ebb6 100644 --- a/arch/x86/entry/vsyscall/vsyscall_emu_64.S +++ b/arch/x86/entry/vsyscall/vsyscall_emu_64.S @@ -19,17 +19,17 @@ __vsyscall_page: mov $__NR_gettimeofday, %rax syscall - ret + RET .balign 1024, 0xcc mov $__NR_time, %rax syscall - ret + RET .balign 1024, 0xcc mov $__NR_getcpu, %rax syscall - ret + RET .balign 4096, 0xcc diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c index 913745f1419b..b15f7b950d2e 100644 --- a/arch/x86/events/amd/iommu.c +++ b/arch/x86/events/amd/iommu.c @@ -161,7 +161,7 @@ static int get_next_avail_iommu_bnk_cntr(struct perf_event *event) raw_spin_lock_irqsave(&piommu->lock, flags); - for (bank = 0, shift = 0; bank < max_banks; bank++) { + for (bank = 0; bank < max_banks; bank++) { for (cntr = 0; cntr < max_cntrs; cntr++) { shift = bank + (bank*3) + cntr; if (piommu->cntr_assign_mask & BIT_ULL(shift)) { diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 6dfa8ddaa60f..e686c5e0537b 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -66,6 +66,8 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_enable_all, *x86_pmu.enable_all); DEFINE_STATIC_CALL_NULL(x86_pmu_enable, *x86_pmu.enable); DEFINE_STATIC_CALL_NULL(x86_pmu_disable, *x86_pmu.disable); +DEFINE_STATIC_CALL_NULL(x86_pmu_assign, *x86_pmu.assign); + DEFINE_STATIC_CALL_NULL(x86_pmu_add, *x86_pmu.add); DEFINE_STATIC_CALL_NULL(x86_pmu_del, *x86_pmu.del); DEFINE_STATIC_CALL_NULL(x86_pmu_read, *x86_pmu.read); @@ -1215,6 +1217,8 @@ static inline void x86_assign_hw_event(struct perf_event *event, hwc->last_cpu = smp_processor_id(); hwc->last_tag = ++cpuc->tags[i]; + static_call_cond(x86_pmu_assign)(event, idx); + switch (hwc->idx) { case INTEL_PMC_IDX_FIXED_BTS: case INTEL_PMC_IDX_FIXED_VLBR: @@ -2005,6 +2009,8 @@ static void x86_pmu_static_call_update(void) static_call_update(x86_pmu_enable, x86_pmu.enable); static_call_update(x86_pmu_disable, x86_pmu.disable); + static_call_update(x86_pmu_assign, x86_pmu.assign); + static_call_update(x86_pmu_add, x86_pmu.add); static_call_update(x86_pmu_del, x86_pmu.del); static_call_update(x86_pmu_read, x86_pmu.read); @@ -2470,7 +2476,7 @@ static int x86_pmu_event_init(struct perf_event *event) if (READ_ONCE(x86_pmu.attr_rdpmc) && !(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS)) - event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED; + event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT; return err; } @@ -2504,7 +2510,7 @@ void perf_clear_dirty_counters(void) static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) { - if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) + if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)) return; /* @@ -2525,7 +2531,7 @@ static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm) { - if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) + if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)) return; if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed)) @@ -2536,7 +2542,7 @@ static int x86_pmu_event_idx(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - if (!(hwc->flags & PERF_X86_EVENT_RDPMC_ALLOWED)) + if (!(hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT)) return 0; if (is_metric_idx(hwc->idx)) @@ -2719,7 +2725,7 @@ void arch_perf_update_userpage(struct perf_event *event, userpg->cap_user_time = 0; userpg->cap_user_time_zero = 0; userpg->cap_user_rdpmc = - !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED); + !!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT); userpg->pmc_width = x86_pmu.cntval_bits; if (!using_native_sched_clock() || !sched_clock_stable()) @@ -2765,7 +2771,7 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re struct unwind_state state; unsigned long addr; - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + if (perf_guest_state()) { /* TODO: We don't support guest os callchain now */ return; } @@ -2868,7 +2874,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs struct stack_frame frame; const struct stack_frame __user *fp; - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + if (perf_guest_state()) { /* TODO: We don't support guest os callchain now */ return; } @@ -2945,18 +2951,19 @@ static unsigned long code_segment_base(struct pt_regs *regs) unsigned long perf_instruction_pointer(struct pt_regs *regs) { - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) - return perf_guest_cbs->get_guest_ip(); + if (perf_guest_state()) + return perf_guest_get_ip(); return regs->ip + code_segment_base(regs); } unsigned long perf_misc_flags(struct pt_regs *regs) { + unsigned int guest_state = perf_guest_state(); int misc = 0; - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { - if (perf_guest_cbs->is_user_mode()) + if (guest_state) { + if (guest_state & PERF_GUEST_USER) misc |= PERF_RECORD_MISC_GUEST_USER; else misc |= PERF_RECORD_MISC_GUEST_KERNEL; diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 6320d2cfd9d3..974e917e65b2 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -209,6 +209,12 @@ static void bts_update(struct bts_ctx *bts) } else { local_set(&buf->data_size, head); } + + /* + * Since BTS is coherent, just add compiler barrier to ensure + * BTS updating is ordered against bts::handle::event. + */ + barrier(); } static int diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 9a044438072b..fd9f908debe5 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -243,7 +243,8 @@ static struct extra_reg intel_skl_extra_regs[] __read_mostly = { static struct event_constraint intel_icl_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x01c0, 0), /* INST_RETIRED.PREC_DIST */ + FIXED_EVENT_CONSTRAINT(0x01c0, 0), /* old INST_RETIRED.PREC_DIST */ + FIXED_EVENT_CONSTRAINT(0x0100, 0), /* INST_RETIRED.PREC_DIST */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */ @@ -288,7 +289,7 @@ static struct extra_reg intel_spr_extra_regs[] __read_mostly = { static struct event_constraint intel_spr_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x01c0, 0), /* INST_RETIRED.PREC_DIST */ + FIXED_EVENT_CONSTRAINT(0x0100, 0), /* INST_RETIRED.PREC_DIST */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */ @@ -2144,19 +2145,19 @@ static __initconst const u64 knl_hw_cache_extra_regs * However, there are some cases which may change PEBS status, e.g. PMI * throttle. The PEBS_ENABLE should be updated where the status changes. */ -static void __intel_pmu_disable_all(void) +static __always_inline void __intel_pmu_disable_all(bool bts) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); - if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) + if (bts && test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) intel_pmu_disable_bts(); } -static void intel_pmu_disable_all(void) +static __always_inline void intel_pmu_disable_all(void) { - __intel_pmu_disable_all(); + __intel_pmu_disable_all(true); intel_pmu_pebs_disable_all(); intel_pmu_lbr_disable_all(); } @@ -2187,6 +2188,47 @@ static void intel_pmu_enable_all(int added) __intel_pmu_enable_all(added, false); } +static noinline int +__intel_pmu_snapshot_branch_stack(struct perf_branch_entry *entries, + unsigned int cnt, unsigned long flags) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + intel_pmu_lbr_read(); + cnt = min_t(unsigned int, cnt, x86_pmu.lbr_nr); + + memcpy(entries, cpuc->lbr_entries, sizeof(struct perf_branch_entry) * cnt); + intel_pmu_enable_all(0); + local_irq_restore(flags); + return cnt; +} + +static int +intel_pmu_snapshot_branch_stack(struct perf_branch_entry *entries, unsigned int cnt) +{ + unsigned long flags; + + /* must not have branches... */ + local_irq_save(flags); + __intel_pmu_disable_all(false); /* we don't care about BTS */ + __intel_pmu_lbr_disable(); + /* ... until here */ + return __intel_pmu_snapshot_branch_stack(entries, cnt, flags); +} + +static int +intel_pmu_snapshot_arch_branch_stack(struct perf_branch_entry *entries, unsigned int cnt) +{ + unsigned long flags; + + /* must not have branches... */ + local_irq_save(flags); + __intel_pmu_disable_all(false); /* we don't care about BTS */ + __intel_pmu_arch_lbr_disable(); + /* ... until here */ + return __intel_pmu_snapshot_branch_stack(entries, cnt, flags); +} + /* * Workaround for: * Intel Errata AAK100 (model 26) @@ -2403,6 +2445,12 @@ static void intel_pmu_disable_event(struct perf_event *event) intel_pmu_pebs_disable(event); } +static void intel_pmu_assign_event(struct perf_event *event, int idx) +{ + if (is_pebs_pt(event)) + perf_report_aux_output_id(event, idx); +} + static void intel_pmu_del_event(struct perf_event *event) { if (needs_branch_stack(event)) @@ -2853,10 +2901,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) */ if (__test_and_clear_bit(GLOBAL_STATUS_TRACE_TOPAPMI_BIT, (unsigned long *)&status)) { handled++; - if (unlikely(perf_guest_cbs && perf_guest_cbs->is_in_guest() && - perf_guest_cbs->handle_intel_pt_intr)) - perf_guest_cbs->handle_intel_pt_intr(); - else + if (!perf_guest_handle_intel_pt_intr()) intel_pt_interrupt(); } @@ -2930,7 +2975,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) apic_write(APIC_LVTPC, APIC_DM_NMI); intel_bts_disable_local(); cpuc->enabled = 0; - __intel_pmu_disable_all(); + __intel_pmu_disable_all(true); handled = intel_pmu_drain_bts_buffer(); handled += intel_bts_interrupt(); status = intel_pmu_get_status(); @@ -2998,8 +3043,10 @@ intel_vlbr_constraints(struct perf_event *event) { struct event_constraint *c = &vlbr_constraint; - if (unlikely(constraint_match(c, event->hw.config))) + if (unlikely(constraint_match(c, event->hw.config))) { + event->hw.flags |= c->flags; return c; + } return NULL; } @@ -4495,8 +4542,16 @@ static int intel_pmu_check_period(struct perf_event *event, u64 value) return intel_pmu_has_bts_period(event, value) ? -EINVAL : 0; } +static void intel_aux_output_init(void) +{ + /* Refer also intel_pmu_aux_output_match() */ + if (x86_pmu.intel_cap.pebs_output_pt_available) + x86_pmu.assign = intel_pmu_assign_event; +} + static int intel_pmu_aux_output_match(struct perf_event *event) { + /* intel_pmu_assign_event() is needed, refer intel_aux_output_init() */ if (!x86_pmu.intel_cap.pebs_output_pt_available) return 0; @@ -6284,9 +6339,21 @@ __init int intel_pmu_init(void) x86_pmu.lbr_nr = 0; } - if (x86_pmu.lbr_nr) + if (x86_pmu.lbr_nr) { pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr); + /* only support branch_stack snapshot for perfmon >= v2 */ + if (x86_pmu.disable_all == intel_pmu_disable_all) { + if (boot_cpu_has(X86_FEATURE_ARCH_LBR)) { + static_call_update(perf_snapshot_branch_stack, + intel_pmu_snapshot_arch_branch_stack); + } else { + static_call_update(perf_snapshot_branch_stack, + intel_pmu_snapshot_branch_stack); + } + } + } + intel_pmu_check_extra_regs(x86_pmu.extra_regs); /* Support full width counters using alternative MSR range */ @@ -6302,6 +6369,8 @@ __init int intel_pmu_init(void) if (is_hybrid()) intel_pmu_check_hybrid_pmus((u64)fixed_mask); + intel_aux_output_init(); + return 0; } diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 8647713276a7..2e215369df4a 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -923,7 +923,8 @@ struct event_constraint intel_skl_pebs_event_constraints[] = { }; struct event_constraint intel_icl_pebs_event_constraints[] = { - INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x100000000ULL), /* old INST_RETIRED.PREC_DIST */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x0100, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */ INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL), /* SLOTS */ INTEL_PLD_CONSTRAINT(0x1cd, 0xff), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ @@ -943,7 +944,7 @@ struct event_constraint intel_icl_pebs_event_constraints[] = { }; struct event_constraint intel_spr_pebs_event_constraints[] = { - INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x100000000ULL), + INTEL_FLAGS_UEVENT_CONSTRAINT(0x100, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */ INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL), INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xfe), @@ -1301,7 +1302,7 @@ void intel_pmu_pebs_disable_all(void) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); if (cpuc->pebs_enabled) - wrmsrl(MSR_IA32_PEBS_ENABLE, 0); + __intel_pmu_pebs_disable_all(); } static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 9e6d6eaeb4cb..8043213b75a5 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -228,20 +228,6 @@ static void __intel_pmu_lbr_enable(bool pmi) wrmsrl(MSR_ARCH_LBR_CTL, lbr_select | ARCH_LBR_CTL_LBREN); } -static void __intel_pmu_lbr_disable(void) -{ - u64 debugctl; - - if (static_cpu_has(X86_FEATURE_ARCH_LBR)) { - wrmsrl(MSR_ARCH_LBR_CTL, 0); - return; - } - - rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); - debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); - wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); -} - void intel_pmu_lbr_reset_32(void) { int i; @@ -279,6 +265,8 @@ void intel_pmu_lbr_reset(void) cpuc->last_task_ctx = NULL; cpuc->last_log_id = 0; + if (!static_cpu_has(X86_FEATURE_ARCH_LBR) && cpuc->lbr_select) + wrmsrl(MSR_LBR_SELECT, 0); } /* @@ -779,8 +767,12 @@ void intel_pmu_lbr_disable_all(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - if (cpuc->lbr_users && !vlbr_exclude_host()) + if (cpuc->lbr_users && !vlbr_exclude_host()) { + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) + return __intel_pmu_arch_lbr_disable(); + __intel_pmu_lbr_disable(); + } } void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index c72e368dd164..f1ba6ab2e97e 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1187,7 +1187,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id * PCI slot and func to indicate the uncore box. */ if (id->driver_data & ~0xffff) { - struct pci_driver *pci_drv = pdev->driver; + struct pci_driver *pci_drv = to_pci_driver(pdev->dev.driver); pmu = uncore_pci_find_dev_pmu(pdev, pci_drv->id_table); if (pmu == NULL) diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h index 7280c8a3c831..6d735611c281 100644 --- a/arch/x86/events/intel/uncore_discovery.h +++ b/arch/x86/events/intel/uncore_discovery.h @@ -30,7 +30,7 @@ #define uncore_discovery_invalid_unit(unit) \ - (!unit.table1 || !unit.ctl || !unit.table3 || \ + (!unit.table1 || !unit.ctl || \ unit.table1 == -1ULL || unit.ctl == -1ULL || \ unit.table3 == -1ULL) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 5ddc0f30db6f..3660f698fb2a 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -452,7 +452,7 @@ #define ICX_M3UPI_PCI_PMON_BOX_CTL 0xa0 /* ICX IMC */ -#define ICX_NUMBER_IMC_CHN 2 +#define ICX_NUMBER_IMC_CHN 3 #define ICX_IMC_MEM_STRIDE 0x4 /* SPR */ @@ -3608,6 +3608,9 @@ static int skx_cha_hw_config(struct intel_uncore_box *box, struct perf_event *ev struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; struct extra_reg *er; int idx = 0; + /* Any of the CHA events may be filtered by Thread/Core-ID.*/ + if (event->hw.config & SNBEP_CBO_PMON_CTL_TID_EN) + idx = SKX_CHA_MSR_PMON_BOX_FILTER_TID; for (er = skx_uncore_cha_extra_regs; er->msr; er++) { if (er->event != (event->hw.config & er->config_mask)) @@ -3675,6 +3678,7 @@ static struct event_constraint skx_uncore_iio_constraints[] = { UNCORE_EVENT_CONSTRAINT(0xc0, 0xc), UNCORE_EVENT_CONSTRAINT(0xc5, 0xc), UNCORE_EVENT_CONSTRAINT(0xd4, 0xc), + UNCORE_EVENT_CONSTRAINT(0xd5, 0xc), EVENT_CONSTRAINT_END }; @@ -4525,6 +4529,13 @@ static void snr_iio_cleanup_mapping(struct intel_uncore_type *type) pmu_iio_cleanup_mapping(type, &snr_iio_mapping_group); } +static struct event_constraint snr_uncore_iio_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x83, 0x3), + UNCORE_EVENT_CONSTRAINT(0xc0, 0xc), + UNCORE_EVENT_CONSTRAINT(0xd5, 0xc), + EVENT_CONSTRAINT_END +}; + static struct intel_uncore_type snr_uncore_iio = { .name = "iio", .num_counters = 4, @@ -4536,6 +4547,7 @@ static struct intel_uncore_type snr_uncore_iio = { .event_mask_ext = SNR_IIO_PMON_RAW_EVENT_MASK_EXT, .box_ctl = SNR_IIO_MSR_PMON_BOX_CTL, .msr_offset = SNR_IIO_MSR_OFFSET, + .constraints = snr_uncore_iio_constraints, .ops = &ivbep_uncore_msr_ops, .format_group = &snr_uncore_iio_format_group, .attr_update = snr_iio_attr_update, @@ -5076,8 +5088,10 @@ static struct event_constraint icx_uncore_iio_constraints[] = { UNCORE_EVENT_CONSTRAINT(0x02, 0x3), UNCORE_EVENT_CONSTRAINT(0x03, 0x3), UNCORE_EVENT_CONSTRAINT(0x83, 0x3), + UNCORE_EVENT_CONSTRAINT(0x88, 0xc), UNCORE_EVENT_CONSTRAINT(0xc0, 0xc), UNCORE_EVENT_CONSTRAINT(0xc5, 0xc), + UNCORE_EVENT_CONSTRAINT(0xd5, 0xc), EVENT_CONSTRAINT_END }; @@ -5463,7 +5477,7 @@ static struct intel_uncore_ops icx_uncore_mmio_ops = { static struct intel_uncore_type icx_uncore_imc = { .name = "imc", .num_counters = 4, - .num_boxes = 8, + .num_boxes = 12, .perf_ctr_bits = 48, .fixed_ctr_bits = 48, .fixed_ctr = SNR_IMC_MMIO_PMON_FIXED_CTR, @@ -5647,6 +5661,7 @@ static struct intel_uncore_type spr_uncore_chabox = { .event_mask = SPR_CHA_PMON_EVENT_MASK, .event_mask_ext = SPR_RAW_EVENT_MASK_EXT, .num_shared_regs = 1, + .constraints = skx_uncore_chabox_constraints, .ops = &spr_uncore_chabox_ops, .format_group = &spr_uncore_chabox_format_group, .attr_update = uncore_alias_groups, @@ -5658,6 +5673,7 @@ static struct intel_uncore_type spr_uncore_iio = { .event_mask_ext = SNR_IIO_PMON_RAW_EVENT_MASK_EXT, .format_group = &snr_uncore_iio_format_group, .attr_update = uncore_alias_groups, + .constraints = icx_uncore_iio_constraints, }; static struct attribute *spr_uncore_raw_formats_attr[] = { @@ -5686,9 +5702,16 @@ static struct intel_uncore_type spr_uncore_irp = { }; +static struct event_constraint spr_uncore_m2pcie_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x14, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), + EVENT_CONSTRAINT_END +}; + static struct intel_uncore_type spr_uncore_m2pcie = { SPR_UNCORE_COMMON_FORMAT(), .name = "m2pcie", + .constraints = spr_uncore_m2pcie_constraints, }; static struct intel_uncore_type spr_uncore_pcu = { @@ -5765,6 +5788,7 @@ static struct intel_uncore_type spr_uncore_upi = { static struct intel_uncore_type spr_uncore_m3upi = { SPR_UNCORE_PCI_COMMON_FORMAT(), .name = "m3upi", + .constraints = icx_uncore_m3upi_constraints, }; static struct intel_uncore_type spr_uncore_mdf = { diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index c853b28efa33..96c775abe31f 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -68,6 +68,7 @@ static bool test_intel(int idx, void *data) case INTEL_FAM6_BROADWELL_D: case INTEL_FAM6_BROADWELL_G: case INTEL_FAM6_BROADWELL_X: + case INTEL_FAM6_SAPPHIRERAPIDS_X: case INTEL_FAM6_ATOM_SILVERMONT: case INTEL_FAM6_ATOM_SILVERMONT_D: diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index e3ac05c97b5e..9d376e528dfc 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -14,6 +14,7 @@ #include <linux/perf_event.h> +#include <asm/fpu/xstate.h> #include <asm/intel_ds.h> #include <asm/cpu.h> @@ -73,7 +74,7 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode) #define PERF_X86_EVENT_PEBS_NA_HSW 0x0010 /* haswell style datala, unknown */ #define PERF_X86_EVENT_EXCL 0x0020 /* HT exclusivity on counter */ #define PERF_X86_EVENT_DYNAMIC 0x0040 /* dynamic alloc'd constraint */ -#define PERF_X86_EVENT_RDPMC_ALLOWED 0x0080 /* grant rdpmc permission */ + #define PERF_X86_EVENT_EXCL_ACCT 0x0100 /* accounted EXCL event */ #define PERF_X86_EVENT_AUTO_RELOAD 0x0200 /* use PEBS auto-reload */ #define PERF_X86_EVENT_LARGE_PEBS 0x0400 /* use large PEBS */ @@ -726,6 +727,7 @@ struct x86_pmu { void (*enable_all)(int added); void (*enable)(struct perf_event *); void (*disable)(struct perf_event *); + void (*assign)(struct perf_event *event, int idx); void (*add)(struct perf_event *); void (*del)(struct perf_event *); void (*read)(struct perf_event *event); @@ -1240,6 +1242,25 @@ static inline bool intel_pmu_has_bts(struct perf_event *event) return intel_pmu_has_bts_period(event, hwc->sample_period); } +static __always_inline void __intel_pmu_pebs_disable_all(void) +{ + wrmsrl(MSR_IA32_PEBS_ENABLE, 0); +} + +static __always_inline void __intel_pmu_arch_lbr_disable(void) +{ + wrmsrl(MSR_ARCH_LBR_CTL, 0); +} + +static __always_inline void __intel_pmu_lbr_disable(void) +{ + u64 debugctl; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); +} + int intel_pmu_save_and_restart(struct perf_event *event); struct event_constraint * diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile index 48e2c51464e8..5d2de10809ae 100644 --- a/arch/x86/hyperv/Makefile +++ b/arch/x86/hyperv/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-y := hv_init.o mmu.o nested.o irqdomain.o +obj-y := hv_init.o mmu.o nested.o irqdomain.o ivm.o obj-$(CONFIG_X86_64) += hv_apic.o hv_proc.o ifdef CONFIG_X86_64 diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index 32a1ad356c18..db2d92fb44da 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -122,17 +122,27 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector, ipi_arg->reserved = 0; ipi_arg->vp_set.valid_bank_mask = 0; - if (!cpumask_equal(mask, cpu_present_mask)) { + /* + * Use HV_GENERIC_SET_ALL and avoid converting cpumask to VP_SET + * when the IPI is sent to all currently present CPUs. + */ + if (!cpumask_equal(mask, cpu_present_mask) || exclude_self) { ipi_arg->vp_set.format = HV_GENERIC_SET_SPARSE_4K; if (exclude_self) nr_bank = cpumask_to_vpset_noself(&(ipi_arg->vp_set), mask); else nr_bank = cpumask_to_vpset(&(ipi_arg->vp_set), mask); - } - if (nr_bank < 0) - goto ipi_mask_ex_done; - if (!nr_bank) + + /* + * 'nr_bank <= 0' means some CPUs in cpumask can't be + * represented in VP_SET. Return an error and fall back to + * native (architectural) method of sending IPIs. + */ + if (nr_bank <= 0) + goto ipi_mask_ex_done; + } else { ipi_arg->vp_set.format = HV_GENERIC_SET_ALL; + } status = hv_do_rep_hypercall(HVCALL_SEND_IPI_EX, 0, nr_bank, ipi_arg, NULL); diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 708a2712a516..8b392b6b7b93 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -28,6 +28,7 @@ #include <linux/syscore_ops.h> #include <clocksource/hyperv_timer.h> #include <linux/highmem.h> +#include <linux/swiotlb.h> int hyperv_init_cpuhp; u64 hv_current_partition_id = ~0ull; @@ -36,12 +37,42 @@ EXPORT_SYMBOL_GPL(hv_current_partition_id); void *hv_hypercall_pg; EXPORT_SYMBOL_GPL(hv_hypercall_pg); +union hv_ghcb * __percpu *hv_ghcb_pg; + /* Storage to save the hypercall page temporarily for hibernation */ static void *hv_hypercall_pg_saved; struct hv_vp_assist_page **hv_vp_assist_page; EXPORT_SYMBOL_GPL(hv_vp_assist_page); +static int hyperv_init_ghcb(void) +{ + u64 ghcb_gpa; + void *ghcb_va; + void **ghcb_base; + + if (!hv_isolation_type_snp()) + return 0; + + if (!hv_ghcb_pg) + return -EINVAL; + + /* + * GHCB page is allocated by paravisor. The address + * returned by MSR_AMD64_SEV_ES_GHCB is above shared + * memory boundary and map it here. + */ + rdmsrl(MSR_AMD64_SEV_ES_GHCB, ghcb_gpa); + ghcb_va = memremap(ghcb_gpa, HV_HYP_PAGE_SIZE, MEMREMAP_WB); + if (!ghcb_va) + return -ENOMEM; + + ghcb_base = (void **)this_cpu_ptr(hv_ghcb_pg); + *ghcb_base = ghcb_va; + + return 0; +} + static int hv_cpu_init(unsigned int cpu) { union hv_vp_assist_msr_contents msr = { 0 }; @@ -85,7 +116,7 @@ static int hv_cpu_init(unsigned int cpu) } } - return 0; + return hyperv_init_ghcb(); } static void (*hv_reenlightenment_cb)(void); @@ -139,7 +170,6 @@ void set_hv_tscchange_cb(void (*cb)(void)) struct hv_reenlightenment_control re_ctrl = { .vector = HYPERV_REENLIGHTENMENT_VECTOR, .enabled = 1, - .target_vp = hv_vp_index[smp_processor_id()] }; struct hv_tsc_emulation_control emu_ctrl = {.enabled = 1}; @@ -148,13 +178,20 @@ void set_hv_tscchange_cb(void (*cb)(void)) return; } + if (!hv_vp_index) + return; + hv_reenlightenment_cb = cb; /* Make sure callback is registered before we write to MSRs */ wmb(); + re_ctrl.target_vp = hv_vp_index[get_cpu()]; + wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); wrmsrl(HV_X64_MSR_TSC_EMULATION_CONTROL, *((u64 *)&emu_ctrl)); + + put_cpu(); } EXPORT_SYMBOL_GPL(set_hv_tscchange_cb); @@ -177,6 +214,14 @@ static int hv_cpu_die(unsigned int cpu) { struct hv_reenlightenment_control re_ctrl; unsigned int new_cpu; + void **ghcb_va; + + if (hv_ghcb_pg) { + ghcb_va = (void **)this_cpu_ptr(hv_ghcb_pg); + if (*ghcb_va) + memunmap(*ghcb_va); + *ghcb_va = NULL; + } hv_common_cpu_die(cpu); @@ -342,20 +387,13 @@ static void __init hv_get_partition_id(void) */ void __init hyperv_init(void) { - u64 guest_id, required_msrs; + u64 guest_id; union hv_x64_msr_hypercall_contents hypercall_msr; int cpuhp; if (x86_hyper_type != X86_HYPER_MS_HYPERV) return; - /* Absolutely required MSRs */ - required_msrs = HV_MSR_HYPERCALL_AVAILABLE | - HV_MSR_VP_INDEX_AVAILABLE; - - if ((ms_hyperv.features & required_msrs) != required_msrs) - return; - if (hv_common_init()) return; @@ -366,10 +404,16 @@ void __init hyperv_init(void) goto common_free; } + if (hv_isolation_type_snp()) { + hv_ghcb_pg = alloc_percpu(union hv_ghcb *); + if (!hv_ghcb_pg) + goto free_vp_assist_page; + } + cpuhp = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online", hv_cpu_init, hv_cpu_die); if (cpuhp < 0) - goto free_vp_assist_page; + goto free_ghcb_page; /* * Setup the hypercall page and enable hypercalls. @@ -379,14 +423,15 @@ void __init hyperv_init(void) guest_id = generate_guest_id(0, LINUX_VERSION_CODE, 0); wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); + /* Hyper-V requires to write guest os id via ghcb in SNP IVM. */ + hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, guest_id); + hv_hypercall_pg = __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, VMALLOC_END, GFP_KERNEL, PAGE_KERNEL_ROX, VM_FLUSH_RESET_PERMS, NUMA_NO_NODE, __builtin_return_address(0)); - if (hv_hypercall_pg == NULL) { - wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); - goto remove_cpuhp_state; - } + if (hv_hypercall_pg == NULL) + goto clean_guest_os_id; rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); hypercall_msr.enable = 1; @@ -454,10 +499,25 @@ void __init hyperv_init(void) /* Query the VMs extended capability once, so that it can be cached. */ hv_query_ext_cap(0); + +#ifdef CONFIG_SWIOTLB + /* + * Swiotlb bounce buffer needs to be mapped in extra address + * space. Map function doesn't work in the early place and so + * call swiotlb_update_mem_attributes() here. + */ + if (hv_is_isolation_supported()) + swiotlb_update_mem_attributes(); +#endif + return; -remove_cpuhp_state: +clean_guest_os_id: + wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); + hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, 0); cpuhp_remove_state(cpuhp); +free_ghcb_page: + free_percpu(hv_ghcb_pg); free_vp_assist_page: kfree(hv_vp_assist_page); hv_vp_assist_page = NULL; @@ -476,6 +536,7 @@ void hyperv_cleanup(void) /* Reset our OS id */ wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); + hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, 0); /* * Reset hypercall page reference before reset the page, @@ -546,16 +607,3 @@ bool hv_is_hyperv_initialized(void) return hypercall_msr.enable; } EXPORT_SYMBOL_GPL(hv_is_hyperv_initialized); - -enum hv_isolation_type hv_get_isolation_type(void) -{ - if (!(ms_hyperv.priv_high & HV_ISOLATION)) - return HV_ISOLATION_TYPE_NONE; - return FIELD_GET(HV_ISOLATION_TYPE, ms_hyperv.isolation_config_b); -} -EXPORT_SYMBOL_GPL(hv_get_isolation_type); - -bool hv_is_isolation_supported(void) -{ - return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE; -} diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c index 514fc64e23d5..7e0f6bedc248 100644 --- a/arch/x86/hyperv/irqdomain.c +++ b/arch/x86/hyperv/irqdomain.c @@ -253,64 +253,43 @@ static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry return hv_unmap_interrupt(hv_build_pci_dev_id(dev).as_uint64, old_entry); } -static void hv_teardown_msi_irq_common(struct pci_dev *dev, struct msi_desc *msidesc, int irq) +static void hv_teardown_msi_irq(struct pci_dev *dev, struct irq_data *irqd) { - u64 status; struct hv_interrupt_entry old_entry; - struct irq_desc *desc; - struct irq_data *data; struct msi_msg msg; + u64 status; - desc = irq_to_desc(irq); - if (!desc) { - pr_debug("%s: no irq desc\n", __func__); - return; - } - - data = &desc->irq_data; - if (!data) { - pr_debug("%s: no irq data\n", __func__); - return; - } - - if (!data->chip_data) { + if (!irqd->chip_data) { pr_debug("%s: no chip data\n!", __func__); return; } - old_entry = *(struct hv_interrupt_entry *)data->chip_data; + old_entry = *(struct hv_interrupt_entry *)irqd->chip_data; entry_to_msi_msg(&old_entry, &msg); - kfree(data->chip_data); - data->chip_data = NULL; + kfree(irqd->chip_data); + irqd->chip_data = NULL; status = hv_unmap_msi_interrupt(dev, &old_entry); - if (status != HV_STATUS_SUCCESS) { + if (status != HV_STATUS_SUCCESS) pr_err("%s: hypercall failed, status %lld\n", __func__, status); - return; - } } -static void hv_msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) +static void hv_msi_free_irq(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq) { - int i; - struct msi_desc *entry; - struct pci_dev *pdev; + struct irq_data *irqd = irq_get_irq_data(virq); + struct msi_desc *desc; - if (WARN_ON_ONCE(!dev_is_pci(dev))) + if (!irqd) return; - pdev = to_pci_dev(dev); + desc = irq_data_get_msi_desc(irqd); + if (!desc || !desc->irq || WARN_ON_ONCE(!dev_is_pci(desc->dev))) + return; - for_each_pci_msi_entry(entry, pdev) { - if (entry->irq) { - for (i = 0; i < entry->nvec_used; i++) { - hv_teardown_msi_irq_common(pdev, entry, entry->irq + i); - irq_domain_free_irqs(entry->irq + i, 1); - } - } - } + hv_teardown_msi_irq(to_pci_dev(desc->dev), irqd); } /* @@ -329,7 +308,7 @@ static struct irq_chip hv_pci_msi_controller = { }; static struct msi_domain_ops pci_msi_domain_ops = { - .domain_free_irqs = hv_msi_domain_free_irqs, + .msi_free = hv_msi_free_irq, .msi_prepare = pci_msi_prepare, }; diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c new file mode 100644 index 000000000000..2b994117581e --- /dev/null +++ b/arch/x86/hyperv/ivm.c @@ -0,0 +1,317 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hyper-V Isolation VM interface with paravisor and hypervisor + * + * Author: + * Tianyu Lan <Tianyu.Lan@microsoft.com> + */ + +#include <linux/bitfield.h> +#include <linux/hyperv.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <asm/svm.h> +#include <asm/sev.h> +#include <asm/io.h> +#include <asm/mshyperv.h> +#include <asm/hypervisor.h> + +#ifdef CONFIG_AMD_MEM_ENCRYPT + +#define GHCB_USAGE_HYPERV_CALL 1 + +union hv_ghcb { + struct ghcb ghcb; + struct { + u64 hypercalldata[509]; + u64 outputgpa; + union { + union { + struct { + u32 callcode : 16; + u32 isfast : 1; + u32 reserved1 : 14; + u32 isnested : 1; + u32 countofelements : 12; + u32 reserved2 : 4; + u32 repstartindex : 12; + u32 reserved3 : 4; + }; + u64 asuint64; + } hypercallinput; + union { + struct { + u16 callstatus; + u16 reserved1; + u32 elementsprocessed : 12; + u32 reserved2 : 20; + }; + u64 asunit64; + } hypercalloutput; + }; + u64 reserved2; + } hypercall; +} __packed __aligned(HV_HYP_PAGE_SIZE); + +u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size) +{ + union hv_ghcb *hv_ghcb; + void **ghcb_base; + unsigned long flags; + u64 status; + + if (!hv_ghcb_pg) + return -EFAULT; + + WARN_ON(in_nmi()); + + local_irq_save(flags); + ghcb_base = (void **)this_cpu_ptr(hv_ghcb_pg); + hv_ghcb = (union hv_ghcb *)*ghcb_base; + if (!hv_ghcb) { + local_irq_restore(flags); + return -EFAULT; + } + + hv_ghcb->ghcb.protocol_version = GHCB_PROTOCOL_MAX; + hv_ghcb->ghcb.ghcb_usage = GHCB_USAGE_HYPERV_CALL; + + hv_ghcb->hypercall.outputgpa = (u64)output; + hv_ghcb->hypercall.hypercallinput.asuint64 = 0; + hv_ghcb->hypercall.hypercallinput.callcode = control; + + if (input_size) + memcpy(hv_ghcb->hypercall.hypercalldata, input, input_size); + + VMGEXIT(); + + hv_ghcb->ghcb.ghcb_usage = 0xffffffff; + memset(hv_ghcb->ghcb.save.valid_bitmap, 0, + sizeof(hv_ghcb->ghcb.save.valid_bitmap)); + + status = hv_ghcb->hypercall.hypercalloutput.callstatus; + + local_irq_restore(flags); + + return status; +} + +void hv_ghcb_msr_write(u64 msr, u64 value) +{ + union hv_ghcb *hv_ghcb; + void **ghcb_base; + unsigned long flags; + struct es_em_ctxt ctxt; + + if (!hv_ghcb_pg) + return; + + WARN_ON(in_nmi()); + + local_irq_save(flags); + ghcb_base = (void **)this_cpu_ptr(hv_ghcb_pg); + hv_ghcb = (union hv_ghcb *)*ghcb_base; + if (!hv_ghcb) { + local_irq_restore(flags); + return; + } + + ghcb_set_rcx(&hv_ghcb->ghcb, msr); + ghcb_set_rax(&hv_ghcb->ghcb, lower_32_bits(value)); + ghcb_set_rdx(&hv_ghcb->ghcb, upper_32_bits(value)); + + if (sev_es_ghcb_hv_call(&hv_ghcb->ghcb, false, &ctxt, + SVM_EXIT_MSR, 1, 0)) + pr_warn("Fail to write msr via ghcb %llx.\n", msr); + + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(hv_ghcb_msr_write); + +void hv_ghcb_msr_read(u64 msr, u64 *value) +{ + union hv_ghcb *hv_ghcb; + void **ghcb_base; + unsigned long flags; + struct es_em_ctxt ctxt; + + /* Check size of union hv_ghcb here. */ + BUILD_BUG_ON(sizeof(union hv_ghcb) != HV_HYP_PAGE_SIZE); + + if (!hv_ghcb_pg) + return; + + WARN_ON(in_nmi()); + + local_irq_save(flags); + ghcb_base = (void **)this_cpu_ptr(hv_ghcb_pg); + hv_ghcb = (union hv_ghcb *)*ghcb_base; + if (!hv_ghcb) { + local_irq_restore(flags); + return; + } + + ghcb_set_rcx(&hv_ghcb->ghcb, msr); + if (sev_es_ghcb_hv_call(&hv_ghcb->ghcb, false, &ctxt, + SVM_EXIT_MSR, 0, 0)) + pr_warn("Fail to read msr via ghcb %llx.\n", msr); + else + *value = (u64)lower_32_bits(hv_ghcb->ghcb.save.rax) + | ((u64)lower_32_bits(hv_ghcb->ghcb.save.rdx) << 32); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(hv_ghcb_msr_read); +#endif + +enum hv_isolation_type hv_get_isolation_type(void) +{ + if (!(ms_hyperv.priv_high & HV_ISOLATION)) + return HV_ISOLATION_TYPE_NONE; + return FIELD_GET(HV_ISOLATION_TYPE, ms_hyperv.isolation_config_b); +} +EXPORT_SYMBOL_GPL(hv_get_isolation_type); + +/* + * hv_is_isolation_supported - Check system runs in the Hyper-V + * isolation VM. + */ +bool hv_is_isolation_supported(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) + return false; + + if (!hypervisor_is_type(X86_HYPER_MS_HYPERV)) + return false; + + return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE; +} + +DEFINE_STATIC_KEY_FALSE(isolation_type_snp); + +/* + * hv_isolation_type_snp - Check system runs in the AMD SEV-SNP based + * isolation VM. + */ +bool hv_isolation_type_snp(void) +{ + return static_branch_unlikely(&isolation_type_snp); +} + +/* + * hv_mark_gpa_visibility - Set pages visible to host via hvcall. + * + * In Isolation VM, all guest memory is encrypted from host and guest + * needs to set memory visible to host via hvcall before sharing memory + * with host. + */ +static int hv_mark_gpa_visibility(u16 count, const u64 pfn[], + enum hv_mem_host_visibility visibility) +{ + struct hv_gpa_range_for_visibility **input_pcpu, *input; + u16 pages_processed; + u64 hv_status; + unsigned long flags; + + /* no-op if partition isolation is not enabled */ + if (!hv_is_isolation_supported()) + return 0; + + if (count > HV_MAX_MODIFY_GPA_REP_COUNT) { + pr_err("Hyper-V: GPA count:%d exceeds supported:%lu\n", count, + HV_MAX_MODIFY_GPA_REP_COUNT); + return -EINVAL; + } + + local_irq_save(flags); + input_pcpu = (struct hv_gpa_range_for_visibility **) + this_cpu_ptr(hyperv_pcpu_input_arg); + input = *input_pcpu; + if (unlikely(!input)) { + local_irq_restore(flags); + return -EINVAL; + } + + input->partition_id = HV_PARTITION_ID_SELF; + input->host_visibility = visibility; + input->reserved0 = 0; + input->reserved1 = 0; + memcpy((void *)input->gpa_page_list, pfn, count * sizeof(*pfn)); + hv_status = hv_do_rep_hypercall( + HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY, count, + 0, input, &pages_processed); + local_irq_restore(flags); + + if (hv_result_success(hv_status)) + return 0; + else + return -EFAULT; +} + +/* + * hv_set_mem_host_visibility - Set specified memory visible to host. + * + * In Isolation VM, all guest memory is encrypted from host and guest + * needs to set memory visible to host via hvcall before sharing memory + * with host. This function works as wrap of hv_mark_gpa_visibility() + * with memory base and size. + */ +int hv_set_mem_host_visibility(unsigned long kbuffer, int pagecount, bool visible) +{ + enum hv_mem_host_visibility visibility = visible ? + VMBUS_PAGE_VISIBLE_READ_WRITE : VMBUS_PAGE_NOT_VISIBLE; + u64 *pfn_array; + int ret = 0; + int i, pfn; + + if (!hv_is_isolation_supported() || !hv_hypercall_pg) + return 0; + + pfn_array = kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL); + if (!pfn_array) + return -ENOMEM; + + for (i = 0, pfn = 0; i < pagecount; i++) { + pfn_array[pfn] = virt_to_hvpfn((void *)kbuffer + i * HV_HYP_PAGE_SIZE); + pfn++; + + if (pfn == HV_MAX_MODIFY_GPA_REP_COUNT || i == pagecount - 1) { + ret = hv_mark_gpa_visibility(pfn, pfn_array, + visibility); + if (ret) + goto err_free_pfn_array; + pfn = 0; + } + } + + err_free_pfn_array: + kfree(pfn_array); + return ret; +} + +/* + * hv_map_memory - map memory to extra space in the AMD SEV-SNP Isolation VM. + */ +void *hv_map_memory(void *addr, unsigned long size) +{ + unsigned long *pfns = kcalloc(size / PAGE_SIZE, + sizeof(unsigned long), GFP_KERNEL); + void *vaddr; + int i; + + if (!pfns) + return NULL; + + for (i = 0; i < size / PAGE_SIZE; i++) + pfns[i] = vmalloc_to_pfn(addr + i * PAGE_SIZE) + + (ms_hyperv.shared_gpa_boundary >> PAGE_SHIFT); + + vaddr = vmap_pfn(pfns, size / PAGE_SIZE, PAGE_KERNEL_IO); + kfree(pfns); + + return vaddr; +} + +void hv_unmap_memory(void *addr) +{ + vunmap(addr); +} diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c index bd13736d0c05..0ad2378fe6ad 100644 --- a/arch/x86/hyperv/mmu.c +++ b/arch/x86/hyperv/mmu.c @@ -68,15 +68,6 @@ static void hyperv_flush_tlb_multi(const struct cpumask *cpus, local_irq_save(flags); - /* - * Only check the mask _after_ interrupt has been disabled to avoid the - * mask changing under our feet. - */ - if (cpumask_empty(cpus)) { - local_irq_restore(flags); - return; - } - flush_pcpu = (struct hv_tlb_flush **) this_cpu_ptr(hyperv_pcpu_input_arg); @@ -115,7 +106,9 @@ static void hyperv_flush_tlb_multi(const struct cpumask *cpus, * must. We will also check all VP numbers when walking the * supplied CPU set to remain correct in all cases. */ - if (hv_cpu_number_to_vp_number(cpumask_last(cpus)) >= 64) + cpu = cpumask_last(cpus); + + if (cpu < nr_cpumask_bits && hv_cpu_number_to_vp_number(cpu) >= 64) goto do_ex_hypercall; for_each_cpu(cpu, cpus) { @@ -131,6 +124,12 @@ static void hyperv_flush_tlb_multi(const struct cpumask *cpus, __set_bit(vcpu, (unsigned long *) &flush->processor_mask); } + + /* nothing to flush if 'processor_mask' ends up being empty */ + if (!flush->processor_mask) { + local_irq_restore(flags); + return; + } } /* diff --git a/arch/x86/ia32/audit.c b/arch/x86/ia32/audit.c index 6efe6cb3768a..59e19549e759 100644 --- a/arch/x86/ia32/audit.c +++ b/arch/x86/ia32/audit.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/audit_arch.h> #include <asm/unistd_32.h> #include <asm/audit.h> @@ -31,15 +32,17 @@ int ia32_classify_syscall(unsigned syscall) { switch (syscall) { case __NR_open: - return 2; + return AUDITSC_OPEN; case __NR_openat: - return 3; + return AUDITSC_OPENAT; case __NR_socketcall: - return 4; + return AUDITSC_SOCKETCALL; case __NR_execve: case __NR_execveat: - return 5; + return AUDITSC_EXECVE; + case __NR_openat2: + return AUDITSC_OPENAT2; default: - return 1; + return AUDITSC_COMPAT; } } diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 5e3d9b7fd5fb..c9c3859322fa 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -24,7 +24,6 @@ #include <linux/syscalls.h> #include <asm/ucontext.h> #include <linux/uaccess.h> -#include <asm/fpu/internal.h> #include <asm/fpu/signal.h> #include <asm/ptrace.h> #include <asm/ia32_unistd.h> @@ -57,8 +56,8 @@ static inline void reload_segments(struct sigcontext_32 *sc) /* * Do a signal return; undo the signal stack. */ -static int ia32_restore_sigcontext(struct pt_regs *regs, - struct sigcontext_32 __user *usc) +static bool ia32_restore_sigcontext(struct pt_regs *regs, + struct sigcontext_32 __user *usc) { struct sigcontext_32 sc; @@ -66,7 +65,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, current->restart_block.fn = do_no_restart_syscall; if (unlikely(copy_from_user(&sc, usc, sizeof(sc)))) - return -EFAULT; + return false; /* Get only the ia32 registers. */ regs->bx = sc.bx; @@ -111,7 +110,7 @@ COMPAT_SYSCALL_DEFINE0(sigreturn) set_current_blocked(&set); - if (ia32_restore_sigcontext(regs, &frame->sc)) + if (!ia32_restore_sigcontext(regs, &frame->sc)) goto badframe; return regs->ax; @@ -135,7 +134,7 @@ COMPAT_SYSCALL_DEFINE0(rt_sigreturn) set_current_blocked(&set); - if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext)) + if (!ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext)) goto badframe; if (compat_restore_altstack(&frame->uc.uc_stack)) @@ -220,8 +219,8 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size); *fpstate = (struct _fpstate_32 __user *) sp; - if (copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned, - math_size) < 0) + if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned, + math_size)) return (void __user *) -1L; sp -= frame_size; diff --git a/arch/x86/include/asm/GEN-for-each-reg.h b/arch/x86/include/asm/GEN-for-each-reg.h index 1b07fb102c4e..07949102a08d 100644 --- a/arch/x86/include/asm/GEN-for-each-reg.h +++ b/arch/x86/include/asm/GEN-for-each-reg.h @@ -1,11 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * These are in machine order; things rely on that. + */ #ifdef CONFIG_64BIT GEN(rax) -GEN(rbx) GEN(rcx) GEN(rdx) +GEN(rbx) +GEN(rsp) +GEN(rbp) GEN(rsi) GEN(rdi) -GEN(rbp) GEN(r8) GEN(r9) GEN(r10) @@ -16,10 +21,11 @@ GEN(r14) GEN(r15) #else GEN(eax) -GEN(ebx) GEN(ecx) GEN(edx) +GEN(ebx) +GEN(esp) +GEN(ebp) GEN(esi) GEN(edi) -GEN(ebp) #endif diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index a3c2315aca12..58eee6402832 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -75,6 +75,7 @@ extern int alternatives_patched; extern void alternative_instructions(void); extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); +extern void apply_retpolines(s32 *start, s32 *end); struct module; diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 455066a06f60..00d1a400b7a1 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -24,7 +24,6 @@ extern int amd_set_subcaches(int, unsigned long); extern int amd_smn_read(u16 node, u32 address, u32 *value); extern int amd_smn_write(u16 node, u32 address, u32 value); -extern int amd_df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo); struct amd_l3_cache { unsigned indices; diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index 4cb726c71ed8..8f80de627c60 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -17,21 +17,3 @@ extern void cmpxchg8b_emu(void); #endif -#ifdef CONFIG_RETPOLINE - -#undef GEN -#define GEN(reg) \ - extern asmlinkage void __x86_indirect_thunk_ ## reg (void); -#include <asm/GEN-for-each-reg.h> - -#undef GEN -#define GEN(reg) \ - extern asmlinkage void __x86_indirect_alt_call_ ## reg (void); -#include <asm/GEN-for-each-reg.h> - -#undef GEN -#define GEN(reg) \ - extern asmlinkage void __x86_indirect_alt_jmp_ ## reg (void); -#include <asm/GEN-for-each-reg.h> - -#endif /* CONFIG_RETPOLINE */ diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 3ad3da9a7d97..c878fed3056f 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -6,11 +6,13 @@ # define __ASM_FORM(x, ...) x,## __VA_ARGS__ # define __ASM_FORM_RAW(x, ...) x,## __VA_ARGS__ # define __ASM_FORM_COMMA(x, ...) x,## __VA_ARGS__, +# define __ASM_REGPFX % #else #include <linux/stringify.h> # define __ASM_FORM(x, ...) " " __stringify(x,##__VA_ARGS__) " " # define __ASM_FORM_RAW(x, ...) __stringify(x,##__VA_ARGS__) # define __ASM_FORM_COMMA(x, ...) " " __stringify(x,##__VA_ARGS__) "," +# define __ASM_REGPFX %% #endif #define _ASM_BYTES(x, ...) __ASM_FORM(.byte x,##__VA_ARGS__ ;) @@ -49,6 +51,9 @@ #define _ASM_SI __ASM_REG(si) #define _ASM_DI __ASM_REG(di) +/* Adds a (%rip) suffix on 64 bits only; for immediate memory references */ +#define _ASM_RIP(x) __ASM_SEL_RAW(x, x (__ASM_REGPFX rip)) + #ifndef __x86_64__ /* 32 bit */ @@ -122,28 +127,19 @@ #ifdef __KERNEL__ +# include <asm/extable_fixup_types.h> + /* Exception table entry */ #ifdef __ASSEMBLY__ -# define _ASM_EXTABLE_HANDLE(from, to, handler) \ + +# define _ASM_EXTABLE_TYPE(from, to, type) \ .pushsection "__ex_table","a" ; \ .balign 4 ; \ .long (from) - . ; \ .long (to) - . ; \ - .long (handler) - . ; \ + .long type ; \ .popsection -# define _ASM_EXTABLE(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_default) - -# define _ASM_EXTABLE_UA(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_uaccess) - -# define _ASM_EXTABLE_CPY(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_copy) - -# define _ASM_EXTABLE_FAULT(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) - # ifdef CONFIG_KPROBES # define _ASM_NOKPROBE(entry) \ .pushsection "_kprobe_blacklist","aw" ; \ @@ -155,26 +151,51 @@ # endif #else /* ! __ASSEMBLY__ */ -# define _EXPAND_EXTABLE_HANDLE(x) #x -# define _ASM_EXTABLE_HANDLE(from, to, handler) \ + +# define DEFINE_EXTABLE_TYPE_REG \ + ".macro extable_type_reg type:req reg:req\n" \ + ".set found, 0\n" \ + ".set regnr, 0\n" \ + ".irp rs,rax,rcx,rdx,rbx,rsp,rbp,rsi,rdi,r8,r9,r10,r11,r12,r13,r14,r15\n" \ + ".ifc \\reg, %%\\rs\n" \ + ".set found, found+1\n" \ + ".long \\type + (regnr << 8)\n" \ + ".endif\n" \ + ".set regnr, regnr+1\n" \ + ".endr\n" \ + ".set regnr, 0\n" \ + ".irp rs,eax,ecx,edx,ebx,esp,ebp,esi,edi,r8d,r9d,r10d,r11d,r12d,r13d,r14d,r15d\n" \ + ".ifc \\reg, %%\\rs\n" \ + ".set found, found+1\n" \ + ".long \\type + (regnr << 8)\n" \ + ".endif\n" \ + ".set regnr, regnr+1\n" \ + ".endr\n" \ + ".if (found != 1)\n" \ + ".error \"extable_type_reg: bad register argument\"\n" \ + ".endif\n" \ + ".endm\n" + +# define UNDEFINE_EXTABLE_TYPE_REG \ + ".purgem extable_type_reg\n" + +# define _ASM_EXTABLE_TYPE(from, to, type) \ " .pushsection \"__ex_table\",\"a\"\n" \ " .balign 4\n" \ " .long (" #from ") - .\n" \ " .long (" #to ") - .\n" \ - " .long (" _EXPAND_EXTABLE_HANDLE(handler) ") - .\n" \ + " .long " __stringify(type) " \n" \ " .popsection\n" -# define _ASM_EXTABLE(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_default) - -# define _ASM_EXTABLE_UA(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_uaccess) - -# define _ASM_EXTABLE_CPY(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_copy) - -# define _ASM_EXTABLE_FAULT(from, to) \ - _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) +# define _ASM_EXTABLE_TYPE_REG(from, to, type, reg) \ + " .pushsection \"__ex_table\",\"a\"\n" \ + " .balign 4\n" \ + " .long (" #from ") - .\n" \ + " .long (" #to ") - .\n" \ + DEFINE_EXTABLE_TYPE_REG \ + "extable_type_reg reg=" __stringify(reg) ", type=" __stringify(type) " \n"\ + UNDEFINE_EXTABLE_TYPE_REG \ + " .popsection\n" /* For C file, we already have NOKPROBE_SYMBOL macro */ @@ -188,6 +209,17 @@ register unsigned long current_stack_pointer asm(_ASM_SP); #define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer) #endif /* __ASSEMBLY__ */ -#endif /* __KERNEL__ */ +#define _ASM_EXTABLE(from, to) \ + _ASM_EXTABLE_TYPE(from, to, EX_TYPE_DEFAULT) +#define _ASM_EXTABLE_UA(from, to) \ + _ASM_EXTABLE_TYPE(from, to, EX_TYPE_UACCESS) + +#define _ASM_EXTABLE_CPY(from, to) \ + _ASM_EXTABLE_TYPE(from, to, EX_TYPE_COPY) + +#define _ASM_EXTABLE_FAULT(from, to) \ + _ASM_EXTABLE_TYPE(from, to, EX_TYPE_FAULT) + +#endif /* __KERNEL__ */ #endif /* _ASM_X86_ASM_H */ diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 3ba772a69cc8..35389b2af88e 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -19,9 +19,9 @@ #define wmb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "sfence", \ X86_FEATURE_XMM2) ::: "memory", "cc") #else -#define mb() asm volatile("mfence":::"memory") -#define rmb() asm volatile("lfence":::"memory") -#define wmb() asm volatile("sfence" ::: "memory") +#define __mb() asm volatile("mfence":::"memory") +#define __rmb() asm volatile("lfence":::"memory") +#define __wmb() asm volatile("sfence" ::: "memory") #endif /** @@ -51,8 +51,8 @@ static inline unsigned long array_index_mask_nospec(unsigned long index, /* Prevent speculative execution past this barrier. */ #define barrier_nospec() alternative("", "lfence", X86_FEATURE_LFENCE_RDTSC) -#define dma_rmb() barrier() -#define dma_wmb() barrier() +#define __dma_rmb() barrier() +#define __dma_wmb() barrier() #define __smp_mb() asm volatile("lock; addl $0,-4(%%" _ASM_SP ")" ::: "memory", "cc") diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index 3d52b094850a..dd5ea1bdf04c 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -10,6 +10,12 @@ #ifdef CONFIG_X86_64 +#ifdef CONFIG_AMD_MEM_ENCRYPT +#define VC_EXCEPTION_STKSZ EXCEPTION_STKSZ +#else +#define VC_EXCEPTION_STKSZ 0 +#endif + /* Macro to enforce the same ordering and stack sizes */ #define ESTACKS_MEMBERS(guardsize, optional_stack_size) \ char DF_stack_guard[guardsize]; \ @@ -28,7 +34,7 @@ /* The exception stacks' physical storage. No guard pages required */ struct exception_stacks { - ESTACKS_MEMBERS(0, 0) + ESTACKS_MEMBERS(0, VC_EXCEPTION_STKSZ) }; /* The effective cpu entry area mapping with guard pages. */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 16a51e7288d5..1261842d006c 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -173,20 +173,25 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); * means that the boot_cpu_has() variant is already fast enough for the * majority of cases and you should stick to using it as it is generally * only two instructions: a RIP-relative MOV and a TEST. + * + * Do not use an "m" constraint for [cap_byte] here: gcc doesn't know + * that this is only used on a fallback path and will sometimes cause + * it to manifest the address of boot_cpu_data in a register, fouling + * the mainline (post-initialization) code. */ static __always_inline bool _static_cpu_has(u16 bit) { asm_volatile_goto( ALTERNATIVE_TERNARY("jmp 6f", %P[feature], "", "jmp %l[t_no]") - ".section .altinstr_aux,\"ax\"\n" + ".pushsection .altinstr_aux,\"ax\"\n" "6:\n" - " testb %[bitnum],%[cap_byte]\n" + " testb %[bitnum]," _ASM_RIP(%P[cap_byte]) "\n" " jnz %l[t_yes]\n" " jmp %l[t_no]\n" - ".previous\n" + ".popsection\n" : : [feature] "i" (bit), [bitnum] "i" (1 << (bit & 7)), - [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3]) + [cap_byte] "i" (&((const char *)boot_cpu_data.x86_capability)[bit >> 3]) : : t_yes, t_no); t_yes: return true; diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index d0ce5cfd3ac1..6db4e2932b3d 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -277,6 +277,7 @@ #define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC instruction */ #define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 instruction */ #define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS instructions */ +#define X86_FEATURE_XFD (10*32+ 4) /* "" eXtended Feature Disabling */ /* * Extended auxiliary flags: Linux defined - for features scattered in various @@ -298,6 +299,9 @@ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ +#define X86_FEATURE_AMX_BF16 (18*32+22) /* AMX bf16 Support */ +#define X86_FEATURE_AMX_TILE (18*32+24) /* AMX tile Support */ +#define X86_FEATURE_AMX_INT8 (18*32+25) /* AMX int8 Support */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ @@ -313,6 +317,7 @@ #define X86_FEATURE_AMD_SSBD (13*32+24) /* "" Speculative Store Bypass Disable */ #define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ #define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */ +#define X86_FEATURE_CPPC (13*32+27) /* Collaborative Processor Performance Control */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 4d0b126835b8..03cb12775043 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -46,13 +46,14 @@ extern unsigned long efi_mixed_mode_stack_pa; #define __efi_nargs(...) __efi_nargs_(__VA_ARGS__) #define __efi_nargs_(...) __efi_nargs__(0, ##__VA_ARGS__, \ + __efi_arg_sentinel(9), __efi_arg_sentinel(8), \ __efi_arg_sentinel(7), __efi_arg_sentinel(6), \ __efi_arg_sentinel(5), __efi_arg_sentinel(4), \ __efi_arg_sentinel(3), __efi_arg_sentinel(2), \ __efi_arg_sentinel(1), __efi_arg_sentinel(0)) -#define __efi_nargs__(_0, _1, _2, _3, _4, _5, _6, _7, n, ...) \ +#define __efi_nargs__(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, n, ...) \ __take_second_arg(n, \ - ({ BUILD_BUG_ON_MSG(1, "__efi_nargs limit exceeded"); 8; })) + ({ BUILD_BUG_ON_MSG(1, "__efi_nargs limit exceeded"); 10; })) #define __efi_arg_sentinel(n) , n /* @@ -176,8 +177,9 @@ extern u64 efi_setup; extern efi_status_t __efi64_thunk(u32, ...); #define efi64_thunk(...) ({ \ - __efi_nargs_check(efi64_thunk, 6, __VA_ARGS__); \ - __efi64_thunk(__VA_ARGS__); \ + u64 __pad[3]; /* must have space for 3 args on the stack */ \ + __efi_nargs_check(efi64_thunk, 9, __VA_ARGS__); \ + __efi64_thunk(__VA_ARGS__, __pad); \ }) static inline bool efi_is_mixed(void) @@ -197,8 +199,6 @@ static inline bool efi_runtime_supported(void) extern void parse_efi_setup(u64 phys_addr, u32 data_len); -extern void efifb_setup_from_dmi(struct screen_info *si, const char *opt); - extern void efi_thunk_runtime_setup(void); efi_status_t efi_set_virtual_address_map(unsigned long memory_map_size, unsigned long descriptor_size, @@ -308,6 +308,10 @@ static inline u32 efi64_convert_status(efi_status_t status) #define __efi64_argmap_query_mode(gop, mode, size, info) \ ((gop), (mode), efi64_zero_upper(size), efi64_zero_upper(info)) +/* TCG2 protocol */ +#define __efi64_argmap_hash_log_extend_event(prot, fl, addr, size, ev) \ + ((prot), (fl), 0ULL, (u64)(addr), 0ULL, (u64)(size), 0ULL, ev) + /* * The macros below handle the plumbing for the argument mapping. To add a * mapping for a specific EFI method, simply define a macro diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h index 14ebd2196569..43184640b579 100644 --- a/arch/x86/include/asm/entry-common.h +++ b/arch/x86/include/asm/entry-common.h @@ -25,7 +25,7 @@ static __always_inline void arch_check_user_regs(struct pt_regs *regs) * For !SMAP hardware we patch out CLAC on entry. */ if (boot_cpu_has(X86_FEATURE_SMAP) || - (IS_ENABLED(CONFIG_64_BIT) && boot_cpu_has(X86_FEATURE_XENPV))) + (IS_ENABLED(CONFIG_64BIT) && boot_cpu_has(X86_FEATURE_XENPV))) mask |= X86_EFLAGS_AC; WARN_ON_ONCE(flags & mask); diff --git a/arch/x86/include/asm/extable.h b/arch/x86/include/asm/extable.h index 1f0cbc52937c..155c991ba95e 100644 --- a/arch/x86/include/asm/extable.h +++ b/arch/x86/include/asm/extable.h @@ -1,12 +1,18 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_EXTABLE_H #define _ASM_X86_EXTABLE_H + +#include <asm/extable_fixup_types.h> + /* - * The exception table consists of triples of addresses relative to the - * exception table entry itself. The first address is of an instruction - * that is allowed to fault, the second is the target at which the program - * should continue. The third is a handler function to deal with the fault - * caused by the instruction in the first field. + * The exception table consists of two addresses relative to the + * exception table entry itself and a type selector field. + * + * The first address is of an instruction that is allowed to fault, the + * second is the target at which the program should continue. + * + * The type entry is used by fixup_exception() to select the handler to + * deal with the fault caused by the instruction in the first field. * * All the routines below use bits of fixup code that are out of line * with the main instruction path. This means when everything is well, @@ -15,7 +21,7 @@ */ struct exception_table_entry { - int insn, fixup, handler; + int insn, fixup, data; }; struct pt_regs; @@ -25,21 +31,27 @@ struct pt_regs; do { \ (a)->fixup = (b)->fixup + (delta); \ (b)->fixup = (tmp).fixup - (delta); \ - (a)->handler = (b)->handler + (delta); \ - (b)->handler = (tmp).handler - (delta); \ + (a)->data = (b)->data; \ + (b)->data = (tmp).data; \ } while (0) -enum handler_type { - EX_HANDLER_NONE, - EX_HANDLER_FAULT, - EX_HANDLER_UACCESS, - EX_HANDLER_OTHER -}; - extern int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, unsigned long fault_addr); extern int fixup_bug(struct pt_regs *regs, int trapnr); -extern enum handler_type ex_get_fault_handler_type(unsigned long ip); +extern int ex_get_fixup_type(unsigned long ip); extern void early_fixup_exception(struct pt_regs *regs, int trapnr); +#ifdef CONFIG_X86_MCE +extern void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr); +#else +static inline void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr) { } +#endif + +#if defined(CONFIG_BPF_JIT) && defined(CONFIG_X86_64) +bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs); +#else +static inline bool ex_handler_bpf(const struct exception_table_entry *x, + struct pt_regs *regs) { return false; } +#endif + #endif diff --git a/arch/x86/include/asm/extable_fixup_types.h b/arch/x86/include/asm/extable_fixup_types.h new file mode 100644 index 000000000000..503622627400 --- /dev/null +++ b/arch/x86/include/asm/extable_fixup_types.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_EXTABLE_FIXUP_TYPES_H +#define _ASM_X86_EXTABLE_FIXUP_TYPES_H + +/* + * Our IMM is signed, as such it must live at the top end of the word. Also, + * since C99 hex constants are of ambigious type, force cast the mask to 'int' + * so that FIELD_GET() will DTRT and sign extend the value when it extracts it. + */ +#define EX_DATA_TYPE_MASK ((int)0x000000FF) +#define EX_DATA_REG_MASK ((int)0x00000F00) +#define EX_DATA_FLAG_MASK ((int)0x0000F000) +#define EX_DATA_IMM_MASK ((int)0xFFFF0000) + +#define EX_DATA_REG_SHIFT 8 +#define EX_DATA_FLAG_SHIFT 12 +#define EX_DATA_IMM_SHIFT 16 + +#define EX_DATA_REG(reg) ((reg) << EX_DATA_REG_SHIFT) +#define EX_DATA_FLAG(flag) ((flag) << EX_DATA_FLAG_SHIFT) +#define EX_DATA_IMM(imm) ((imm) << EX_DATA_IMM_SHIFT) + +/* segment regs */ +#define EX_REG_DS EX_DATA_REG(8) +#define EX_REG_ES EX_DATA_REG(9) +#define EX_REG_FS EX_DATA_REG(10) +#define EX_REG_GS EX_DATA_REG(11) + +/* flags */ +#define EX_FLAG_CLEAR_AX EX_DATA_FLAG(1) +#define EX_FLAG_CLEAR_DX EX_DATA_FLAG(2) +#define EX_FLAG_CLEAR_AX_DX EX_DATA_FLAG(3) + +/* types */ +#define EX_TYPE_NONE 0 +#define EX_TYPE_DEFAULT 1 +#define EX_TYPE_FAULT 2 +#define EX_TYPE_UACCESS 3 +#define EX_TYPE_COPY 4 +#define EX_TYPE_CLEAR_FS 5 +#define EX_TYPE_FPU_RESTORE 6 +#define EX_TYPE_BPF 7 +#define EX_TYPE_WRMSR 8 +#define EX_TYPE_RDMSR 9 +#define EX_TYPE_WRMSR_SAFE 10 /* reg := -EIO */ +#define EX_TYPE_RDMSR_SAFE 11 /* reg := -EIO */ +#define EX_TYPE_WRMSR_IN_MCE 12 +#define EX_TYPE_RDMSR_IN_MCE 13 +#define EX_TYPE_DEFAULT_MCE_SAFE 14 +#define EX_TYPE_FAULT_MCE_SAFE 15 + +#define EX_TYPE_POP_REG 16 /* sp += sizeof(long) */ +#define EX_TYPE_POP_ZERO (EX_TYPE_POP_REG | EX_DATA_IMM(0)) + +#define EX_TYPE_IMM_REG 17 /* reg := (long)imm */ +#define EX_TYPE_EFAULT_REG (EX_TYPE_IMM_REG | EX_DATA_IMM(-EFAULT)) +#define EX_TYPE_ZERO_REG (EX_TYPE_IMM_REG | EX_DATA_IMM(0)) +#define EX_TYPE_ONE_REG (EX_TYPE_IMM_REG | EX_DATA_IMM(1)) + +#define EX_TYPE_FAULT_SGX 18 + +#define EX_TYPE_UCOPY_LEN 19 /* cx := reg + imm*cx */ +#define EX_TYPE_UCOPY_LEN1 (EX_TYPE_UCOPY_LEN | EX_DATA_IMM(1)) +#define EX_TYPE_UCOPY_LEN4 (EX_TYPE_UCOPY_LEN | EX_DATA_IMM(4)) +#define EX_TYPE_UCOPY_LEN8 (EX_TYPE_UCOPY_LEN | EX_DATA_IMM(8)) + +#endif diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 23bef08a8388..c83b3020350a 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -12,6 +12,8 @@ #define _ASM_X86_FPU_API_H #include <linux/bottom_half.h> +#include <asm/fpu/types.h> + /* * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It * disables preemption so be careful if you intend to use it for long periods @@ -48,9 +50,9 @@ static inline void kernel_fpu_begin(void) } /* - * Use fpregs_lock() while editing CPU's FPU registers or fpu->state. + * Use fpregs_lock() while editing CPU's FPU registers or fpu->fpstate. * A context switch will (and softirq might) save CPU's FPU registers to - * fpu->state and set TIF_NEED_FPU_LOAD leaving CPU's FPU registers in + * fpu->fpstate.regs and set TIF_NEED_FPU_LOAD leaving CPU's FPU registers in * a random state. * * local_bh_disable() protects against both preemption and soft interrupts @@ -100,12 +102,67 @@ extern void switch_fpu_return(void); */ extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name); -/* - * Tasks that are not using SVA have mm->pasid set to zero to note that they - * will not have the valid bit set in MSR_IA32_PASID while they are running. - */ -#define PASID_DISABLED 0 +/* Trap handling */ +extern int fpu__exception_code(struct fpu *fpu, int trap_nr); +extern void fpu_sync_fpstate(struct fpu *fpu); +extern void fpu_reset_from_exception_fixup(void); + +/* Boot, hotplug and resume */ +extern void fpu__init_cpu(void); +extern void fpu__init_system(struct cpuinfo_x86 *c); +extern void fpu__init_check_bugs(void); +extern void fpu__resume_cpu(void); + +#ifdef CONFIG_MATH_EMULATION +extern void fpstate_init_soft(struct swregs_state *soft); +#else +static inline void fpstate_init_soft(struct swregs_state *soft) {} +#endif + +/* State tracking */ +DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); + +/* Process cleanup */ +#ifdef CONFIG_X86_64 +extern void fpstate_free(struct fpu *fpu); +#else +static inline void fpstate_free(struct fpu *fpu) { } +#endif + +/* fpstate-related functions which are exported to KVM */ +extern void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature); + +extern u64 xstate_get_guest_group_perm(void); + +/* KVM specific functions */ +extern bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu); +extern void fpu_free_guest_fpstate(struct fpu_guest *gfpu); +extern int fpu_swap_kvm_fpstate(struct fpu_guest *gfpu, bool enter_guest); +extern int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures); + +#ifdef CONFIG_X86_64 +extern void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd); +extern void fpu_sync_guest_vmexit_xfd_state(void); +#else +static inline void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) { } +static inline void fpu_sync_guest_vmexit_xfd_state(void) { } +#endif + +extern void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, unsigned int size, u32 pkru); +extern int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, u64 xcr0, u32 *vpkru); + +static inline void fpstate_set_confidential(struct fpu_guest *gfpu) +{ + gfpu->fpstate->is_confidential = true; +} + +static inline bool fpstate_is_confidential(struct fpu_guest *gfpu) +{ + return gfpu->fpstate->is_confidential; +} -static inline void update_pasid(void) { } +/* prctl */ +struct task_struct; +extern long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2); #endif /* _ASM_X86_FPU_API_H */ diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 5a18694a89b2..e69de29bb2d1 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -1,540 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 1994 Linus Torvalds - * - * Pentium III FXSR, SSE support - * General FPU state handling cleanups - * Gareth Hughes <gareth@valinux.com>, May 2000 - * x86-64 work by Andi Kleen 2002 - */ - -#ifndef _ASM_X86_FPU_INTERNAL_H -#define _ASM_X86_FPU_INTERNAL_H - -#include <linux/compat.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/mm.h> - -#include <asm/user.h> -#include <asm/fpu/api.h> -#include <asm/fpu/xstate.h> -#include <asm/fpu/xcr.h> -#include <asm/cpufeature.h> -#include <asm/trace/fpu.h> - -/* - * High level FPU state handling functions: - */ -extern int fpu__restore_sig(void __user *buf, int ia32_frame); -extern void fpu__drop(struct fpu *fpu); -extern void fpu__clear_user_states(struct fpu *fpu); -extern int fpu__exception_code(struct fpu *fpu, int trap_nr); - -extern void fpu_sync_fpstate(struct fpu *fpu); - -/* Clone and exit operations */ -extern int fpu_clone(struct task_struct *dst); -extern void fpu_flush_thread(void); - -/* - * Boot time FPU initialization functions: - */ -extern void fpu__init_cpu(void); -extern void fpu__init_system_xstate(void); -extern void fpu__init_cpu_xstate(void); -extern void fpu__init_system(struct cpuinfo_x86 *c); -extern void fpu__init_check_bugs(void); -extern void fpu__resume_cpu(void); - -/* - * Debugging facility: - */ -#ifdef CONFIG_X86_DEBUG_FPU -# define WARN_ON_FPU(x) WARN_ON_ONCE(x) -#else -# define WARN_ON_FPU(x) ({ (void)(x); 0; }) -#endif - -/* - * FPU related CPU feature flag helper routines: - */ -static __always_inline __pure bool use_xsaveopt(void) -{ - return static_cpu_has(X86_FEATURE_XSAVEOPT); -} - -static __always_inline __pure bool use_xsave(void) -{ - return static_cpu_has(X86_FEATURE_XSAVE); -} - -static __always_inline __pure bool use_fxsr(void) -{ - return static_cpu_has(X86_FEATURE_FXSR); -} - -/* - * fpstate handling functions: - */ - -extern union fpregs_state init_fpstate; - -extern void fpstate_init(union fpregs_state *state); -#ifdef CONFIG_MATH_EMULATION -extern void fpstate_init_soft(struct swregs_state *soft); -#else -static inline void fpstate_init_soft(struct swregs_state *soft) {} -#endif -extern void save_fpregs_to_fpstate(struct fpu *fpu); - -/* Returns 0 or the negated trap number, which results in -EFAULT for #PF */ -#define user_insn(insn, output, input...) \ -({ \ - int err; \ - \ - might_fault(); \ - \ - asm volatile(ASM_STAC "\n" \ - "1: " #insn "\n" \ - "2: " ASM_CLAC "\n" \ - ".section .fixup,\"ax\"\n" \ - "3: negl %%eax\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE_FAULT(1b, 3b) \ - : [err] "=a" (err), output \ - : "0"(0), input); \ - err; \ -}) - -#define kernel_insn_err(insn, output, input...) \ -({ \ - int err; \ - asm volatile("1:" #insn "\n\t" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: movl $-1,%[err]\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ - : [err] "=r" (err), output \ - : "0"(0), input); \ - err; \ -}) - -#define kernel_insn(insn, output, input...) \ - asm volatile("1:" #insn "\n\t" \ - "2:\n" \ - _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_fprestore) \ - : output : input) - -static inline int fnsave_to_user_sigframe(struct fregs_state __user *fx) -{ - return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx)); -} - -static inline int fxsave_to_user_sigframe(struct fxregs_state __user *fx) -{ - if (IS_ENABLED(CONFIG_X86_32)) - return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); - else - return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx)); - -} - -static inline void fxrstor(struct fxregs_state *fx) -{ - if (IS_ENABLED(CONFIG_X86_32)) - kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); - else - kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline int fxrstor_safe(struct fxregs_state *fx) -{ - if (IS_ENABLED(CONFIG_X86_32)) - return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); - else - return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline int fxrstor_from_user_sigframe(struct fxregs_state __user *fx) -{ - if (IS_ENABLED(CONFIG_X86_32)) - return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); - else - return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline void frstor(struct fregs_state *fx) -{ - kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline int frstor_safe(struct fregs_state *fx) -{ - return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline int frstor_from_user_sigframe(struct fregs_state __user *fx) -{ - return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); -} - -static inline void fxsave(struct fxregs_state *fx) -{ - if (IS_ENABLED(CONFIG_X86_32)) - asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx)); - else - asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx)); -} - -/* These macros all use (%edi)/(%rdi) as the single memory argument. */ -#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" -#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" -#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" -#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" -#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" - -/* - * After this @err contains 0 on success or the negated trap number when - * the operation raises an exception. For faults this results in -EFAULT. - */ -#define XSTATE_OP(op, st, lmask, hmask, err) \ - asm volatile("1:" op "\n\t" \ - "xor %[err], %[err]\n" \ - "2:\n\t" \ - ".pushsection .fixup,\"ax\"\n\t" \ - "3: negl %%eax\n\t" \ - "jmp 2b\n\t" \ - ".popsection\n\t" \ - _ASM_EXTABLE_FAULT(1b, 3b) \ - : [err] "=a" (err) \ - : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ - : "memory") - -/* - * If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact - * format and supervisor states in addition to modified optimization in - * XSAVEOPT. - * - * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT - * supports modified optimization which is not supported by XSAVE. - * - * We use XSAVE as a fallback. - * - * The 661 label is defined in the ALTERNATIVE* macros as the address of the - * original instruction which gets replaced. We need to use it here as the - * address of the instruction where we might get an exception at. - */ -#define XSTATE_XSAVE(st, lmask, hmask, err) \ - asm volatile(ALTERNATIVE_2(XSAVE, \ - XSAVEOPT, X86_FEATURE_XSAVEOPT, \ - XSAVES, X86_FEATURE_XSAVES) \ - "\n" \ - "xor %[err], %[err]\n" \ - "3:\n" \ - ".pushsection .fixup,\"ax\"\n" \ - "4: movl $-2, %[err]\n" \ - "jmp 3b\n" \ - ".popsection\n" \ - _ASM_EXTABLE(661b, 4b) \ - : [err] "=r" (err) \ - : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ - : "memory") - -/* - * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact - * XSAVE area format. - */ -#define XSTATE_XRESTORE(st, lmask, hmask) \ - asm volatile(ALTERNATIVE(XRSTOR, \ - XRSTORS, X86_FEATURE_XSAVES) \ - "\n" \ - "3:\n" \ - _ASM_EXTABLE_HANDLE(661b, 3b, ex_handler_fprestore)\ - : \ - : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ - : "memory") - -/* - * This function is called only during boot time when x86 caps are not set - * up and alternative can not be used yet. - */ -static inline void os_xrstor_booting(struct xregs_state *xstate) -{ - u64 mask = xfeatures_mask_fpstate(); - u32 lmask = mask; - u32 hmask = mask >> 32; - int err; - - WARN_ON(system_state != SYSTEM_BOOTING); - - if (boot_cpu_has(X86_FEATURE_XSAVES)) - XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); - else - XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); - - /* - * We should never fault when copying from a kernel buffer, and the FPU - * state we set at boot time should be valid. - */ - WARN_ON_FPU(err); -} - -/* - * Save processor xstate to xsave area. - * - * Uses either XSAVE or XSAVEOPT or XSAVES depending on the CPU features - * and command line options. The choice is permanent until the next reboot. - */ -static inline void os_xsave(struct xregs_state *xstate) -{ - u64 mask = xfeatures_mask_all; - u32 lmask = mask; - u32 hmask = mask >> 32; - int err; - - WARN_ON_FPU(!alternatives_patched); - - XSTATE_XSAVE(xstate, lmask, hmask, err); - - /* We should never fault when copying to a kernel buffer: */ - WARN_ON_FPU(err); -} - -/* - * Restore processor xstate from xsave area. - * - * Uses XRSTORS when XSAVES is used, XRSTOR otherwise. - */ -static inline void os_xrstor(struct xregs_state *xstate, u64 mask) -{ - u32 lmask = mask; - u32 hmask = mask >> 32; - - XSTATE_XRESTORE(xstate, lmask, hmask); -} - -/* - * Save xstate to user space xsave area. - * - * We don't use modified optimization because xrstor/xrstors might track - * a different application. - * - * We don't use compacted format xsave area for - * backward compatibility for old applications which don't understand - * compacted format of xsave area. - */ -static inline int xsave_to_user_sigframe(struct xregs_state __user *buf) -{ - /* - * Include the features which are not xsaved/rstored by the kernel - * internally, e.g. PKRU. That's user space ABI and also required - * to allow the signal handler to modify PKRU. - */ - u64 mask = xfeatures_mask_uabi(); - u32 lmask = mask; - u32 hmask = mask >> 32; - int err; - - /* - * Clear the xsave header first, so that reserved fields are - * initialized to zero. - */ - err = __clear_user(&buf->header, sizeof(buf->header)); - if (unlikely(err)) - return -EFAULT; - - stac(); - XSTATE_OP(XSAVE, buf, lmask, hmask, err); - clac(); - - return err; -} - -/* - * Restore xstate from user space xsave area. - */ -static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 mask) -{ - struct xregs_state *xstate = ((__force struct xregs_state *)buf); - u32 lmask = mask; - u32 hmask = mask >> 32; - int err; - - stac(); - XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); - clac(); - - return err; -} - -/* - * Restore xstate from kernel space xsave area, return an error code instead of - * an exception. - */ -static inline int os_xrstor_safe(struct xregs_state *xstate, u64 mask) -{ - u32 lmask = mask; - u32 hmask = mask >> 32; - int err; - - if (cpu_feature_enabled(X86_FEATURE_XSAVES)) - XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); - else - XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); - - return err; -} - -extern void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask); - -static inline void restore_fpregs_from_fpstate(union fpregs_state *fpstate) -{ - __restore_fpregs_from_fpstate(fpstate, xfeatures_mask_fpstate()); -} - -extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); - -/* - * FPU context switch related helper methods: - */ - -DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); - -/* - * The in-register FPU state for an FPU context on a CPU is assumed to be - * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx - * matches the FPU. - * - * If the FPU register state is valid, the kernel can skip restoring the - * FPU state from memory. - * - * Any code that clobbers the FPU registers or updates the in-memory - * FPU state for a task MUST let the rest of the kernel know that the - * FPU registers are no longer valid for this task. - * - * Either one of these invalidation functions is enough. Invalidate - * a resource you control: CPU if using the CPU for something else - * (with preemption disabled), FPU for the current task, or a task that - * is prevented from running by the current task. - */ -static inline void __cpu_invalidate_fpregs_state(void) -{ - __this_cpu_write(fpu_fpregs_owner_ctx, NULL); -} - -static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu) -{ - fpu->last_cpu = -1; -} - -static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) -{ - return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; -} - -/* - * These generally need preemption protection to work, - * do try to avoid using these on their own: - */ -static inline void fpregs_deactivate(struct fpu *fpu) -{ - this_cpu_write(fpu_fpregs_owner_ctx, NULL); - trace_x86_fpu_regs_deactivated(fpu); -} - -static inline void fpregs_activate(struct fpu *fpu) -{ - this_cpu_write(fpu_fpregs_owner_ctx, fpu); - trace_x86_fpu_regs_activated(fpu); -} - -/* Internal helper for switch_fpu_return() and signal frame setup */ -static inline void fpregs_restore_userregs(void) -{ - struct fpu *fpu = ¤t->thread.fpu; - int cpu = smp_processor_id(); - - if (WARN_ON_ONCE(current->flags & PF_KTHREAD)) - return; - - if (!fpregs_state_valid(fpu, cpu)) { - u64 mask; - - /* - * This restores _all_ xstate which has not been - * established yet. - * - * If PKRU is enabled, then the PKRU value is already - * correct because it was either set in switch_to() or in - * flush_thread(). So it is excluded because it might be - * not up to date in current->thread.fpu.xsave state. - */ - mask = xfeatures_mask_restore_user() | - xfeatures_mask_supervisor(); - __restore_fpregs_from_fpstate(&fpu->state, mask); - - fpregs_activate(fpu); - fpu->last_cpu = cpu; - } - clear_thread_flag(TIF_NEED_FPU_LOAD); -} - -/* - * FPU state switching for scheduling. - * - * This is a two-stage process: - * - * - switch_fpu_prepare() saves the old state. - * This is done within the context of the old process. - * - * - switch_fpu_finish() sets TIF_NEED_FPU_LOAD; the floating point state - * will get loaded on return to userspace, or when the kernel needs it. - * - * If TIF_NEED_FPU_LOAD is cleared then the CPU's FPU registers - * are saved in the current thread's FPU register state. - * - * If TIF_NEED_FPU_LOAD is set then CPU's FPU registers may not - * hold current()'s FPU registers. It is required to load the - * registers before returning to userland or using the content - * otherwise. - * - * The FPU context is only stored/restored for a user task and - * PF_KTHREAD is used to distinguish between kernel and user threads. - */ -static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) -{ - if (static_cpu_has(X86_FEATURE_FPU) && !(current->flags & PF_KTHREAD)) { - save_fpregs_to_fpstate(old_fpu); - /* - * The save operation preserved register state, so the - * fpu_fpregs_owner_ctx is still @old_fpu. Store the - * current CPU number in @old_fpu, so the next return - * to user space can avoid the FPU register restore - * when is returns on the same CPU and still owns the - * context. - */ - old_fpu->last_cpu = cpu; - - trace_x86_fpu_regs_deactivated(old_fpu); - } -} - -/* - * Misc helper functions: - */ - -/* - * Delay loading of the complete FPU state until the return to userland. - * PKRU is handled separately. - */ -static inline void switch_fpu_finish(struct fpu *new_fpu) -{ - if (cpu_feature_enabled(X86_FEATURE_FPU)) - set_thread_flag(TIF_NEED_FPU_LOAD); -} - -#endif /* _ASM_X86_FPU_INTERNAL_H */ diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h new file mode 100644 index 000000000000..99a8820e8cc4 --- /dev/null +++ b/arch/x86/include/asm/fpu/sched.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_FPU_SCHED_H +#define _ASM_X86_FPU_SCHED_H + +#include <linux/sched.h> + +#include <asm/cpufeature.h> +#include <asm/fpu/types.h> + +#include <asm/trace/fpu.h> + +extern void save_fpregs_to_fpstate(struct fpu *fpu); +extern void fpu__drop(struct fpu *fpu); +extern int fpu_clone(struct task_struct *dst, unsigned long clone_flags); +extern void fpu_flush_thread(void); + +/* + * FPU state switching for scheduling. + * + * This is a two-stage process: + * + * - switch_fpu_prepare() saves the old state. + * This is done within the context of the old process. + * + * - switch_fpu_finish() sets TIF_NEED_FPU_LOAD; the floating point state + * will get loaded on return to userspace, or when the kernel needs it. + * + * If TIF_NEED_FPU_LOAD is cleared then the CPU's FPU registers + * are saved in the current thread's FPU register state. + * + * If TIF_NEED_FPU_LOAD is set then CPU's FPU registers may not + * hold current()'s FPU registers. It is required to load the + * registers before returning to userland or using the content + * otherwise. + * + * The FPU context is only stored/restored for a user task and + * PF_KTHREAD is used to distinguish between kernel and user threads. + */ +static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) +{ + if (cpu_feature_enabled(X86_FEATURE_FPU) && + !(current->flags & PF_KTHREAD)) { + save_fpregs_to_fpstate(old_fpu); + /* + * The save operation preserved register state, so the + * fpu_fpregs_owner_ctx is still @old_fpu. Store the + * current CPU number in @old_fpu, so the next return + * to user space can avoid the FPU register restore + * when is returns on the same CPU and still owns the + * context. + */ + old_fpu->last_cpu = cpu; + + trace_x86_fpu_regs_deactivated(old_fpu); + } +} + +/* + * Delay loading of the complete FPU state until the return to userland. + * PKRU is handled separately. + */ +static inline void switch_fpu_finish(void) +{ + if (cpu_feature_enabled(X86_FEATURE_FPU)) + set_thread_flag(TIF_NEED_FPU_LOAD); +} + +#endif /* _ASM_X86_FPU_SCHED_H */ diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h index 8b6631dffefd..e1c9df9102a5 100644 --- a/arch/x86/include/asm/fpu/signal.h +++ b/arch/x86/include/asm/fpu/signal.h @@ -5,6 +5,11 @@ #ifndef _ASM_X86_FPU_SIGNAL_H #define _ASM_X86_FPU_SIGNAL_H +#include <linux/compat.h> +#include <linux/user.h> + +#include <asm/fpu/types.h> + #ifdef CONFIG_X86_64 # include <uapi/asm/sigcontext.h> # include <asm/user32.h> @@ -31,6 +36,9 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long fpu__get_fpstate_size(void); -extern void fpu__init_prepare_fx_sw_frame(void); +extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); +extern void fpu__clear_user_states(struct fpu *fpu); +extern bool fpu__restore_sig(void __user *buf, int ia32_frame); +extern void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask); #endif /* _ASM_X86_FPU_SIGNAL_H */ diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index f5a38a5f3ae1..eb7cd1139d97 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -120,6 +120,9 @@ enum xfeature { XFEATURE_RSRVD_COMP_13, XFEATURE_RSRVD_COMP_14, XFEATURE_LBR, + XFEATURE_RSRVD_COMP_16, + XFEATURE_XTILE_CFG, + XFEATURE_XTILE_DATA, XFEATURE_MAX, }; @@ -136,12 +139,21 @@ enum xfeature { #define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU) #define XFEATURE_MASK_PASID (1 << XFEATURE_PASID) #define XFEATURE_MASK_LBR (1 << XFEATURE_LBR) +#define XFEATURE_MASK_XTILE_CFG (1 << XFEATURE_XTILE_CFG) +#define XFEATURE_MASK_XTILE_DATA (1 << XFEATURE_XTILE_DATA) #define XFEATURE_MASK_FPSSE (XFEATURE_MASK_FP | XFEATURE_MASK_SSE) #define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK \ | XFEATURE_MASK_ZMM_Hi256 \ | XFEATURE_MASK_Hi16_ZMM) +#ifdef CONFIG_X86_64 +# define XFEATURE_MASK_XTILE (XFEATURE_MASK_XTILE_DATA \ + | XFEATURE_MASK_XTILE_CFG) +#else +# define XFEATURE_MASK_XTILE (0) +#endif + #define FIRST_EXTENDED_XFEATURE XFEATURE_YMM struct reg_128_bit { @@ -153,6 +165,9 @@ struct reg_256_bit { struct reg_512_bit { u8 regbytes[512/8]; }; +struct reg_1024_byte { + u8 regbytes[1024]; +}; /* * State component 2: @@ -255,6 +270,23 @@ struct arch_lbr_state { u64 ler_to; u64 ler_info; struct lbr_entry entries[]; +}; + +/* + * State component 17: 64-byte tile configuration register. + */ +struct xtile_cfg { + u64 tcfg[8]; +} __packed; + +/* + * State component 18: 1KB tile data register. + * Each register represents 16 64-byte rows of the matrix + * data. But the number of registers depends on the actual + * implementation. + */ +struct xtile_data { + struct reg_1024_byte tmm; } __packed; /* @@ -309,6 +341,93 @@ union fpregs_state { u8 __padding[PAGE_SIZE]; }; +struct fpstate { + /* @kernel_size: The size of the kernel register image */ + unsigned int size; + + /* @user_size: The size in non-compacted UABI format */ + unsigned int user_size; + + /* @xfeatures: xfeatures for which the storage is sized */ + u64 xfeatures; + + /* @user_xfeatures: xfeatures valid in UABI buffers */ + u64 user_xfeatures; + + /* @xfd: xfeatures disabled to trap userspace use. */ + u64 xfd; + + /* @is_valloc: Indicator for dynamically allocated state */ + unsigned int is_valloc : 1; + + /* @is_guest: Indicator for guest state (KVM) */ + unsigned int is_guest : 1; + + /* + * @is_confidential: Indicator for KVM confidential mode. + * The FPU registers are restored by the + * vmentry firmware from encrypted guest + * memory. On vmexit the FPU registers are + * saved by firmware to encrypted guest memory + * and the registers are scrubbed before + * returning to the host. So there is no + * content which is worth saving and restoring. + * The fpstate has to be there so that + * preemption and softirq FPU usage works + * without special casing. + */ + unsigned int is_confidential : 1; + + /* @in_use: State is in use */ + unsigned int in_use : 1; + + /* @regs: The register state union for all supported formats */ + union fpregs_state regs; + + /* @regs is dynamically sized! Don't add anything after @regs! */ +} __aligned(64); + +#define FPU_GUEST_PERM_LOCKED BIT_ULL(63) + +struct fpu_state_perm { + /* + * @__state_perm: + * + * This bitmap indicates the permission for state components, which + * are available to a thread group. The permission prctl() sets the + * enabled state bits in thread_group_leader()->thread.fpu. + * + * All run time operations use the per thread information in the + * currently active fpu.fpstate which contains the xfeature masks + * and sizes for kernel and user space. + * + * This master permission field is only to be used when + * task.fpu.fpstate based checks fail to validate whether the task + * is allowed to expand it's xfeatures set which requires to + * allocate a larger sized fpstate buffer. + * + * Do not access this field directly. Use the provided helper + * function. Unlocked access is possible for quick checks. + */ + u64 __state_perm; + + /* + * @__state_size: + * + * The size required for @__state_perm. Only valid to access + * with sighand locked. + */ + unsigned int __state_size; + + /* + * @__user_state_size: + * + * The size required for @__state_perm user part. Only valid to + * access with sighand locked. + */ + unsigned int __user_state_size; +}; + /* * Highest level per task FPU state data structure that * contains the FPU register state plus various FPU @@ -337,19 +456,130 @@ struct fpu { unsigned long avx512_timestamp; /* - * @state: + * @fpstate: + * + * Pointer to the active struct fpstate. Initialized to + * point at @__fpstate below. + */ + struct fpstate *fpstate; + + /* + * @__task_fpstate: + * + * Pointer to an inactive struct fpstate. Initialized to NULL. Is + * used only for KVM support to swap out the regular task fpstate. + */ + struct fpstate *__task_fpstate; + + /* + * @perm: + * + * Permission related information + */ + struct fpu_state_perm perm; + + /* + * @guest_perm: + * + * Permission related information for guest pseudo FPUs + */ + struct fpu_state_perm guest_perm; + + /* + * @__fpstate: * - * In-memory copy of all FPU registers that we save/restore - * over context switches. If the task is using the FPU then - * the registers in the FPU are more recent than this state - * copy. If the task context-switches away then they get - * saved here and represent the FPU state. + * Initial in-memory storage for FPU registers which are saved in + * context switch and when the kernel uses the FPU. The registers + * are restored from this storage on return to user space if they + * are not longer containing the tasks FPU register state. */ - union fpregs_state state; + struct fpstate __fpstate; /* - * WARNING: 'state' is dynamically-sized. Do not put + * WARNING: '__fpstate' is dynamically-sized. Do not put * anything after it here. */ }; +/* + * Guest pseudo FPU container + */ +struct fpu_guest { + /* + * @xfeatures: xfeature bitmap of features which are + * currently enabled for the guest vCPU. + */ + u64 xfeatures; + + /* + * @perm: xfeature bitmap of features which are + * permitted to be enabled for the guest + * vCPU. + */ + u64 perm; + + /* + * @xfd_err: Save the guest value. + */ + u64 xfd_err; + + /* + * @uabi_size: Size required for save/restore + */ + unsigned int uabi_size; + + /* + * @fpstate: Pointer to the allocated guest fpstate + */ + struct fpstate *fpstate; +}; + +/* + * FPU state configuration data. Initialized at boot time. Read only after init. + */ +struct fpu_state_config { + /* + * @max_size: + * + * The maximum size of the register state buffer. Includes all + * supported features except independent managed features. + */ + unsigned int max_size; + + /* + * @default_size: + * + * The default size of the register state buffer. Includes all + * supported features except independent managed features and + * features which have to be requested by user space before usage. + */ + unsigned int default_size; + + /* + * @max_features: + * + * The maximum supported features bitmap. Does not include + * independent managed features. + */ + u64 max_features; + + /* + * @default_features: + * + * The default supported features bitmap. Does not include + * independent managed features and features which have to + * be requested by user space before usage. + */ + u64 default_features; + /* + * @legacy_features: + * + * Features which can be reported back to user space + * even without XSAVE support, i.e. legacy features FP + SSE + */ + u64 legacy_features; +}; + +/* FPU state configuration information */ +extern struct fpu_state_config fpu_kernel_cfg, fpu_user_cfg; + #endif /* _ASM_X86_FPU_H */ diff --git a/arch/x86/include/asm/fpu/xcr.h b/arch/x86/include/asm/fpu/xcr.h index 1c7ab8d95da5..9656a5bc6fea 100644 --- a/arch/x86/include/asm/fpu/xcr.h +++ b/arch/x86/include/asm/fpu/xcr.h @@ -2,18 +2,8 @@ #ifndef _ASM_X86_FPU_XCR_H #define _ASM_X86_FPU_XCR_H -/* - * MXCSR and XCR definitions: - */ - -static inline void ldmxcsr(u32 mxcsr) -{ - asm volatile("ldmxcsr %0" :: "m" (mxcsr)); -} - -extern unsigned int mxcsr_feature_mask; - #define XCR_XFEATURE_ENABLED_MASK 0x00000000 +#define XCR_XFEATURE_IN_USE_MASK 0x00000001 static inline u64 xgetbv(u32 index) { @@ -31,4 +21,15 @@ static inline void xsetbv(u32 index, u64 value) asm volatile("xsetbv" :: "a" (eax), "d" (edx), "c" (index)); } +/* + * Return a mask of xfeatures which are currently being tracked + * by the processor as being in the initial configuration. + * + * Callers should check X86_FEATURE_XGETBV1. + */ +static inline u64 xfeatures_in_use(void) +{ + return xgetbv(XCR_XFEATURE_IN_USE_MASK); +} + #endif /* _ASM_X86_FPU_XCR_H */ diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 109dfcc75299..cd3dd170e23a 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -14,6 +14,8 @@ #define XSTATE_CPUID 0x0000000d +#define TILE_CPUID 0x0000001d + #define FXSAVE_SIZE 512 #define XSAVE_HDR_SIZE 64 @@ -33,7 +35,8 @@ XFEATURE_MASK_Hi16_ZMM | \ XFEATURE_MASK_PKRU | \ XFEATURE_MASK_BNDREGS | \ - XFEATURE_MASK_BNDCSR) + XFEATURE_MASK_BNDCSR | \ + XFEATURE_MASK_XTILE) /* * Features which are restored when returning to user space. @@ -43,6 +46,9 @@ #define XFEATURE_MASK_USER_RESTORE \ (XFEATURE_MASK_USER_SUPPORTED & ~XFEATURE_MASK_PKRU) +/* Features which are dynamically enabled for a process on request */ +#define XFEATURE_MASK_USER_DYNAMIC XFEATURE_MASK_XTILE_DATA + /* All currently supported supervisor features */ #define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID) @@ -78,78 +84,49 @@ XFEATURE_MASK_INDEPENDENT | \ XFEATURE_MASK_SUPERVISOR_UNSUPPORTED) -#ifdef CONFIG_X86_64 -#define REX_PREFIX "0x48, " -#else -#define REX_PREFIX -#endif - -extern u64 xfeatures_mask_all; - -static inline u64 xfeatures_mask_supervisor(void) -{ - return xfeatures_mask_all & XFEATURE_MASK_SUPERVISOR_SUPPORTED; -} - -/* - * The xfeatures which are enabled in XCR0 and expected to be in ptrace - * buffers and signal frames. - */ -static inline u64 xfeatures_mask_uabi(void) -{ - return xfeatures_mask_all & XFEATURE_MASK_USER_SUPPORTED; -} - /* - * The xfeatures which are restored by the kernel when returning to user - * mode. This is not necessarily the same as xfeatures_mask_uabi() as the - * kernel does not manage all XCR0 enabled features via xsave/xrstor as - * some of them have to be switched eagerly on context switch and exec(). + * The feature mask required to restore FPU state: + * - All user states which are not eagerly switched in switch_to()/exec() + * - The suporvisor states */ -static inline u64 xfeatures_mask_restore_user(void) -{ - return xfeatures_mask_all & XFEATURE_MASK_USER_RESTORE; -} +#define XFEATURE_MASK_FPSTATE (XFEATURE_MASK_USER_RESTORE | \ + XFEATURE_MASK_SUPERVISOR_SUPPORTED) /* - * Like xfeatures_mask_restore_user() but additionally restors the - * supported supervisor states. + * Features in this mask have space allocated in the signal frame, but may not + * have that space initialized when the feature is in its init state. */ -static inline u64 xfeatures_mask_fpstate(void) -{ - return xfeatures_mask_all & \ - (XFEATURE_MASK_USER_RESTORE | XFEATURE_MASK_SUPERVISOR_SUPPORTED); -} - -static inline u64 xfeatures_mask_independent(void) -{ - if (!boot_cpu_has(X86_FEATURE_ARCH_LBR)) - return XFEATURE_MASK_INDEPENDENT & ~XFEATURE_MASK_LBR; - - return XFEATURE_MASK_INDEPENDENT; -} +#define XFEATURE_MASK_SIGFRAME_INITOPT (XFEATURE_MASK_XTILE | \ + XFEATURE_MASK_USER_DYNAMIC) extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask); -void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); int xfeature_size(int xfeature_nr); -int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); -int copy_sigframe_from_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); void xsaves(struct xregs_state *xsave, u64 mask); void xrstors(struct xregs_state *xsave, u64 mask); -enum xstate_copy_mode { - XSTATE_COPY_FP, - XSTATE_COPY_FX, - XSTATE_COPY_XSAVE, -}; +int xfd_enable_feature(u64 xfd_err); + +#ifdef CONFIG_X86_64 +DECLARE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); +#endif -struct membuf; -void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, - enum xstate_copy_mode mode); +#ifdef CONFIG_X86_64 +DECLARE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); + +static __always_inline __pure bool fpu_state_size_dynamic(void) +{ + return static_branch_unlikely(&__fpu_state_size_dynamic); +} +#else +static __always_inline __pure bool fpu_state_size_dynamic(void) +{ + return false; +} +#endif #endif diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 9f3130f40807..024d9797646e 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -57,6 +57,13 @@ arch_ftrace_get_regs(struct ftrace_regs *fregs) #define ftrace_instruction_pointer_set(fregs, _ip) \ do { (fregs)->regs.ip = (_ip); } while (0) + +struct ftrace_ops; +#define ftrace_graph_func ftrace_graph_func +void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs); +#else +#define FTRACE_GRAPH_TRAMP_ADDR FTRACE_GRAPH_ADDR #endif #ifdef CONFIG_DYNAMIC_FTRACE @@ -65,8 +72,6 @@ struct dyn_arch_ftrace { /* No extra data needed for x86 */ }; -#define FTRACE_GRAPH_TRAMP_ADDR FTRACE_GRAPH_ADDR - #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* __ASSEMBLY__ */ #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h index f9c00110a69a..99d345b686fa 100644 --- a/arch/x86/include/asm/futex.h +++ b/arch/x86/include/asm/futex.h @@ -17,13 +17,9 @@ do { \ int oldval = 0, ret; \ asm volatile("1:\t" insn "\n" \ "2:\n" \ - "\t.section .fixup,\"ax\"\n" \ - "3:\tmov\t%3, %1\n" \ - "\tjmp\t2b\n" \ - "\t.previous\n" \ - _ASM_EXTABLE_UA(1b, 3b) \ + _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %1) \ : "=r" (oldval), "=r" (ret), "+m" (*uaddr) \ - : "i" (-EFAULT), "0" (oparg), "1" (0)); \ + : "0" (oparg), "1" (0)); \ if (ret) \ goto label; \ *oval = oldval; \ @@ -39,15 +35,11 @@ do { \ "3:\t" LOCK_PREFIX "cmpxchgl %3, %2\n" \ "\tjnz\t2b\n" \ "4:\n" \ - "\t.section .fixup,\"ax\"\n" \ - "5:\tmov\t%5, %1\n" \ - "\tjmp\t4b\n" \ - "\t.previous\n" \ - _ASM_EXTABLE_UA(1b, 5b) \ - _ASM_EXTABLE_UA(3b, 5b) \ + _ASM_EXTABLE_TYPE_REG(1b, 4b, EX_TYPE_EFAULT_REG, %1) \ + _ASM_EXTABLE_TYPE_REG(3b, 4b, EX_TYPE_EFAULT_REG, %1) \ : "=&a" (oldval), "=&r" (ret), \ "+m" (*uaddr), "=&r" (tem) \ - : "r" (oparg), "i" (-EFAULT), "1" (0)); \ + : "r" (oparg), "1" (0)); \ if (ret) \ goto label; \ *oval = oldval; \ @@ -95,15 +87,11 @@ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, if (!user_access_begin(uaddr, sizeof(u32))) return -EFAULT; asm volatile("\n" - "1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n" + "1:\t" LOCK_PREFIX "cmpxchgl %3, %2\n" "2:\n" - "\t.section .fixup, \"ax\"\n" - "3:\tmov %3, %0\n" - "\tjmp 2b\n" - "\t.previous\n" - _ASM_EXTABLE_UA(1b, 3b) + _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %0) \ : "+r" (ret), "=a" (oldval), "+m" (*uaddr) - : "i" (-EFAULT), "r" (newval), "1" (oldval) + : "r" (newval), "1" (oldval) : "memory" ); user_access_end(); diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 2322d6bd5883..0a9407dc0859 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -276,6 +276,23 @@ enum hv_isolation_type { #define HV_X64_MSR_TIME_REF_COUNT HV_REGISTER_TIME_REF_COUNT #define HV_X64_MSR_REFERENCE_TSC HV_REGISTER_REFERENCE_TSC +/* Hyper-V memory host visibility */ +enum hv_mem_host_visibility { + VMBUS_PAGE_NOT_VISIBLE = 0, + VMBUS_PAGE_VISIBLE_READ_ONLY = 1, + VMBUS_PAGE_VISIBLE_READ_WRITE = 3 +}; + +/* HvCallModifySparseGpaPageHostVisibility hypercall */ +#define HV_MAX_MODIFY_GPA_REP_COUNT ((PAGE_SIZE / sizeof(u64)) - 2) +struct hv_gpa_range_for_visibility { + u64 partition_id; + u32 host_visibility:2; + u32 reserved0:30; + u32 reserved1; + u64 gpa_page_list[HV_MAX_MODIFY_GPA_REP_COUNT]; +} __packed; + /* * Declare the MSR used to setup pages used to communicate with the hypervisor. */ @@ -585,6 +602,39 @@ enum hv_interrupt_type { HV_X64_INTERRUPT_TYPE_MAXIMUM = 0x000A, }; +union hv_msi_address_register { + u32 as_uint32; + struct { + u32 reserved1:2; + u32 destination_mode:1; + u32 redirection_hint:1; + u32 reserved2:8; + u32 destination_id:8; + u32 msi_base:12; + }; +} __packed; + +union hv_msi_data_register { + u32 as_uint32; + struct { + u32 vector:8; + u32 delivery_mode:3; + u32 reserved1:3; + u32 level_assert:1; + u32 trigger_mode:1; + u32 reserved2:16; + }; +} __packed; + +/* HvRetargetDeviceInterrupt hypercall */ +union hv_msi_entry { + u64 as_uint64; + struct { + union hv_msi_address_register address; + union hv_msi_data_register data; + } __packed; +}; + #include <asm-generic/hyperv-tlfs.h> #endif diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index 2c5f7861d373..fada857f0a1e 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -68,6 +68,6 @@ extern void ia32_pick_mmap_layout(struct mm_struct *mm); #endif -#endif /* !CONFIG_IA32_SUPPORT */ +#endif /* CONFIG_IA32_EMULATION */ #endif /* _ASM_X86_IA32_H */ diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h index 91d7182ad2d6..f07faa61c7f3 100644 --- a/arch/x86/include/asm/insn-eval.h +++ b/arch/x86/include/asm/insn-eval.h @@ -15,12 +15,16 @@ #define INSN_CODE_SEG_OPND_SZ(params) (params & 0xf) #define INSN_CODE_SEG_PARAMS(oper_sz, addr_sz) (oper_sz | (addr_sz << 4)) +int pt_regs_offset(struct pt_regs *regs, int regno); + bool insn_has_rep_prefix(struct insn *insn); void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs); int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs); int insn_get_modrm_reg_off(struct insn *insn, struct pt_regs *regs); +unsigned long *insn_get_modrm_reg_ptr(struct insn *insn, struct pt_regs *regs); unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx); int insn_get_code_seg_params(struct pt_regs *regs); +int insn_get_effective_ip(struct pt_regs *regs, unsigned long *ip); int insn_fetch_from_user(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]); int insn_fetch_from_user_inatomic(struct pt_regs *regs, @@ -28,4 +32,16 @@ int insn_fetch_from_user_inatomic(struct pt_regs *regs, bool insn_decode_from_regs(struct insn *insn, struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE], int buf_size); +enum mmio_type { + MMIO_DECODE_FAILED, + MMIO_WRITE, + MMIO_WRITE_IMM, + MMIO_READ, + MMIO_READ_ZERO_EXTEND, + MMIO_READ_SIGN_EXTEND, + MMIO_MOVS, +}; + +enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes); + #endif /* _ASM_X86_INSN_EVAL_H */ diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 27158436f322..048b6d5aff50 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -108,6 +108,8 @@ #define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / Gracemont */ #define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */ +#define INTEL_FAM6_RAPTORLAKE 0xB7 + /* "Small Core" Processors (Atom) */ #define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 841a5d104afa..f6d91ecb8026 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -40,6 +40,7 @@ #include <linux/string.h> #include <linux/compiler.h> +#include <linux/cc_platform.h> #include <asm/page.h> #include <asm/early_ioremap.h> #include <asm/pgtable_types.h> @@ -256,21 +257,6 @@ static inline void slow_down_io(void) #endif -#ifdef CONFIG_AMD_MEM_ENCRYPT -#include <linux/jump_label.h> - -extern struct static_key_false sev_enable_key; -static inline bool sev_key_active(void) -{ - return static_branch_unlikely(&sev_enable_key); -} - -#else /* !CONFIG_AMD_MEM_ENCRYPT */ - -static inline bool sev_key_active(void) { return false; } - -#endif /* CONFIG_AMD_MEM_ENCRYPT */ - #define BUILDIO(bwl, bw, type) \ static inline void out##bwl(unsigned type value, int port) \ { \ @@ -301,7 +287,7 @@ static inline unsigned type in##bwl##_p(int port) \ \ static inline void outs##bwl(int port, const void *addr, unsigned long count) \ { \ - if (sev_key_active()) { \ + if (cc_platform_has(CC_ATTR_GUEST_UNROLL_STRING_IO)) { \ unsigned type *value = (unsigned type *)addr; \ while (count) { \ out##bwl(*value, port); \ @@ -317,7 +303,7 @@ static inline void outs##bwl(int port, const void *addr, unsigned long count) \ \ static inline void ins##bwl(int port, void *addr, unsigned long count) \ { \ - if (sev_key_active()) { \ + if (cc_platform_has(CC_ATTR_GUEST_UNROLL_STRING_IO)) { \ unsigned type *value = (unsigned type *)addr; \ while (count) { \ *value = in##bwl(port); \ @@ -391,6 +377,7 @@ extern void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size) #define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc #endif +#ifdef CONFIG_AMD_MEM_ENCRYPT extern bool arch_memremap_can_ram_remap(resource_size_t offset, unsigned long size, unsigned long flags); @@ -398,6 +385,13 @@ extern bool arch_memremap_can_ram_remap(resource_size_t offset, extern bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size); +#else +static inline bool phys_mem_access_encrypted(unsigned long phys_addr, + unsigned long size) +{ + return true; +} +#endif /** * iosubmit_cmds512 - copy data to single MMIO location, in 512-bit units diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h index 562854c60808..ae9d40f6c706 100644 --- a/arch/x86/include/asm/irq_stack.h +++ b/arch/x86/include/asm/irq_stack.h @@ -58,7 +58,7 @@ * the output constraints to make the compiler aware that R11 cannot be * reused after the asm() statement. * - * For builds with CONFIG_UNWIND_FRAME_POINTER ASM_CALL_CONSTRAINT is + * For builds with CONFIG_UNWINDER_FRAME_POINTER, ASM_CALL_CONSTRAINT is * required as well as this prevents certain creative GCC variants from * misplacing the ASM code. * @@ -77,11 +77,11 @@ * Function calls can clobber anything except the callee-saved * registers. Tell the compiler. */ -#define call_on_irqstack(func, asm_call, argconstr...) \ +#define call_on_stack(stack, func, asm_call, argconstr...) \ { \ register void *tos asm("r11"); \ \ - tos = ((void *)__this_cpu_read(hardirq_stack_ptr)); \ + tos = ((void *)(stack)); \ \ asm_inline volatile( \ "movq %%rsp, (%[tos]) \n" \ @@ -98,6 +98,25 @@ ); \ } +#define ASM_CALL_ARG0 \ + "call %P[__func] \n" + +#define ASM_CALL_ARG1 \ + "movq %[arg1], %%rdi \n" \ + ASM_CALL_ARG0 + +#define ASM_CALL_ARG2 \ + "movq %[arg2], %%rsi \n" \ + ASM_CALL_ARG1 + +#define ASM_CALL_ARG3 \ + "movq %[arg3], %%rdx \n" \ + ASM_CALL_ARG2 + +#define call_on_irqstack(func, asm_call, argconstr...) \ + call_on_stack(__this_cpu_read(hardirq_stack_ptr), \ + func, asm_call, argconstr) + /* Macros to assert type correctness for run_*_on_irqstack macros */ #define assert_function_type(func, proto) \ static_assert(__builtin_types_compatible_p(typeof(&func), proto)) @@ -147,8 +166,7 @@ */ #define ASM_CALL_SYSVEC \ "call irq_enter_rcu \n" \ - "movq %[arg1], %%rdi \n" \ - "call %P[__func] \n" \ + ASM_CALL_ARG1 \ "call irq_exit_rcu \n" #define SYSVEC_CONSTRAINTS , [arg1] "r" (regs) @@ -168,12 +186,10 @@ */ #define ASM_CALL_IRQ \ "call irq_enter_rcu \n" \ - "movq %[arg1], %%rdi \n" \ - "movl %[arg2], %%esi \n" \ - "call %P[__func] \n" \ + ASM_CALL_ARG2 \ "call irq_exit_rcu \n" -#define IRQ_CONSTRAINTS , [arg1] "r" (regs), [arg2] "r" (vector) +#define IRQ_CONSTRAINTS , [arg1] "r" (regs), [arg2] "r" ((unsigned long)vector) #define run_irq_on_irqstack_cond(func, regs, vector) \ { \ @@ -185,9 +201,7 @@ IRQ_CONSTRAINTS, regs, vector); \ } -#define ASM_CALL_SOFTIRQ \ - "call %P[__func] \n" - +#ifndef CONFIG_PREEMPT_RT /* * Macro to invoke __do_softirq on the irq stack. This is only called from * task context when bottom halves are about to be reenabled and soft @@ -197,10 +211,12 @@ #define do_softirq_own_stack() \ { \ __this_cpu_write(hardirq_stack_inuse, true); \ - call_on_irqstack(__do_softirq, ASM_CALL_SOFTIRQ); \ + call_on_irqstack(__do_softirq, ASM_CALL_ARG0); \ __this_cpu_write(hardirq_stack_inuse, false); \ } +#endif + #else /* CONFIG_X86_64 */ /* System vector handlers always run on the stack they interrupted. */ #define run_sysvec_on_irqstack_cond(func, regs) \ diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index c5ce9845c999..87761396e8cc 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -114,8 +114,6 @@ static __always_inline unsigned long arch_local_irq_save(void) #define SAVE_FLAGS pushfq; popq %rax #endif -#define INTERRUPT_RETURN jmp native_iret - #endif #endif /* __ASSEMBLY__ */ @@ -143,8 +141,13 @@ static __always_inline void arch_local_irq_restore(unsigned long flags) #ifdef CONFIG_X86_64 #ifdef CONFIG_XEN_PV #define SWAPGS ALTERNATIVE "swapgs", "", X86_FEATURE_XENPV +#define INTERRUPT_RETURN \ + ANNOTATE_RETPOLINE_SAFE; \ + ALTERNATIVE_TERNARY("jmp *paravirt_iret(%rip);", \ + X86_FEATURE_XENPV, "jmp xen_iret;", "jmp native_iret;") #else #define SWAPGS swapgs +#define INTERRUPT_RETURN jmp native_iret #endif #endif #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 0a6e34b07017..11b7c06e2828 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -129,7 +129,7 @@ relocate_kernel(unsigned long indirection_page, unsigned long page_list, unsigned long start_address, unsigned int preserve_context, - unsigned int sme_active); + unsigned int host_mem_enc_active); #endif #define ARCH_HAS_KIMAGE_ARCH diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index bd7f5886a789..71ea2eab43d5 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -49,7 +49,6 @@ extern __visible kprobe_opcode_t optprobe_template_end[]; extern const int kretprobe_blacklist_size; void arch_remove_kprobe(struct kprobe *p); -asmlinkage void kretprobe_trampoline(void); extern void arch_kprobe_override_function(struct pt_regs *regs); diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index cefe1d81e2e8..f658bb4dbb74 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -35,6 +35,7 @@ KVM_X86_OP(get_cpl) KVM_X86_OP(set_segment) KVM_X86_OP_NULL(get_cs_db_l_bits) KVM_X86_OP(set_cr0) +KVM_X86_OP_NULL(post_set_cr3) KVM_X86_OP(is_valid_cr4) KVM_X86_OP(set_cr4) KVM_X86_OP(set_efer) @@ -47,6 +48,7 @@ KVM_X86_OP(set_dr7) KVM_X86_OP(cache_reg) KVM_X86_OP(get_rflags) KVM_X86_OP(set_rflags) +KVM_X86_OP(get_if_flag) KVM_X86_OP(tlb_flush_all) KVM_X86_OP(tlb_flush_current) KVM_X86_OP_NULL(tlb_remote_flush) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f8f48a7ec577..0677b9ea01c9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -38,7 +38,6 @@ #define __KVM_HAVE_ARCH_VCPU_DEBUGFS #define KVM_MAX_VCPUS 1024 -#define KVM_SOFT_MAX_VCPUS 710 /* * In x86, the VCPU ID corresponds to the APIC ID, and APIC IDs @@ -50,7 +49,7 @@ * so ratio of 4 should be enough. */ #define KVM_VCPU_ID_RATIO 4 -#define KVM_MAX_VCPU_ID (KVM_MAX_VCPUS * KVM_VCPU_ID_RATIO) +#define KVM_MAX_VCPU_IDS (KVM_MAX_VCPUS * KVM_VCPU_ID_RATIO) /* memory slots that are not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 3 @@ -98,7 +97,7 @@ KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_TLB_FLUSH_CURRENT KVM_ARCH_REQ(26) #define KVM_REQ_TLB_FLUSH_GUEST \ - KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP) + KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_APF_READY KVM_ARCH_REQ(28) #define KVM_REQ_MSR_FILTER_CHANGED KVM_ARCH_REQ(29) #define KVM_REQ_UPDATE_CPU_DIRTY_LOGGING \ @@ -136,7 +135,7 @@ #define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) #define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) -#define KVM_PERMILLE_MMU_PAGES 20 +#define KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO 50 #define KVM_MIN_ALLOC_MMU_PAGES 64UL #define KVM_MMU_HASH_SHIFT 12 #define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT) @@ -292,25 +291,31 @@ struct kvm_kernel_irq_routing_entry; * the number of unique SPs that can theoretically be created is 2^n, where n * is the number of bits that are used to compute the role. * - * But, even though there are 18 bits in the mask below, not all combinations - * of modes and flags are possible. The maximum number of possible upper-level - * shadow pages for a single gfn is in the neighborhood of 2^13. + * But, even though there are 19 bits in the mask below, not all combinations + * of modes and flags are possible: * - * - invalid shadow pages are not accounted. - * - level is effectively limited to four combinations, not 16 as the number - * bits would imply, as 4k SPs are not tracked (allowed to go unsync). - * - level is effectively unused for non-PAE paging because there is exactly - * one upper level (see 4k SP exception above). - * - quadrant is used only for non-PAE paging and is exclusive with - * gpte_is_8_bytes. - * - execonly and ad_disabled are used only for nested EPT, which makes it - * exclusive with quadrant. + * - invalid shadow pages are not accounted, so the bits are effectively 18 + * + * - quadrant will only be used if has_4_byte_gpte=1 (non-PAE paging); + * execonly and ad_disabled are only used for nested EPT which has + * has_4_byte_gpte=0. Therefore, 2 bits are always unused. + * + * - the 4 bits of level are effectively limited to the values 2/3/4/5, + * as 4k SPs are not tracked (allowed to go unsync). In addition non-PAE + * paging has exactly one upper level, making level completely redundant + * when has_4_byte_gpte=1. + * + * - on top of this, smep_andnot_wp and smap_andnot_wp are only set if + * cr0_wp=0, therefore these three bits only give rise to 5 possibilities. + * + * Therefore, the maximum number of possible upper-level shadow pages for a + * single gfn is a bit less than 2^13. */ union kvm_mmu_page_role { u32 word; struct { unsigned level:4; - unsigned gpte_is_8_bytes:1; + unsigned has_4_byte_gpte:1; unsigned quadrant:2; unsigned direct:1; unsigned access:3; @@ -364,6 +369,7 @@ union kvm_mmu_extended_role { unsigned int cr4_smap:1; unsigned int cr4_smep:1; unsigned int cr4_la57:1; + unsigned int efer_lma:1; }; }; @@ -407,6 +413,7 @@ struct kvm_mmu_root_info { #define KVM_HAVE_MMU_RWLOCK struct kvm_mmu_page; +struct kvm_page_fault; /* * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit, @@ -416,14 +423,12 @@ struct kvm_mmu_page; struct kvm_mmu { unsigned long (*get_guest_pgd)(struct kvm_vcpu *vcpu); u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index); - int (*page_fault)(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 err, - bool prefault); + int (*page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); void (*inject_page_fault)(struct kvm_vcpu *vcpu, struct x86_exception *fault); - gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t gva_or_gpa, - u32 access, struct x86_exception *exception); - gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, - struct x86_exception *exception); + gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + gpa_t gva_or_gpa, u32 access, + struct x86_exception *exception); int (*sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp); void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa); @@ -490,6 +495,7 @@ struct kvm_pmc { */ u64 current_config; bool is_paused; + bool intr; }; struct kvm_pmu { @@ -499,7 +505,6 @@ struct kvm_pmu { u64 fixed_ctr_ctrl; u64 global_ctrl; u64 global_status; - u64 global_ovf_ctrl; u64 counter_bitmask[2]; u64 global_ctrl_mask; u64 global_ovf_ctrl_mask; @@ -581,7 +586,6 @@ struct kvm_vcpu_hv { struct kvm_hyperv_exit exit; struct kvm_vcpu_hv_stimer stimer[HV_SYNIC_STIMER_COUNT]; DECLARE_BITMAP(stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT); - cpumask_t tlb_flush; bool enforce_cpuid; struct { u32 features_eax; /* HYPERV_CPUID_FEATURES.EAX */ @@ -606,6 +610,7 @@ struct kvm_vcpu_xen { u64 last_steal; u64 runstate_entry_time; u64 runstate_times[4]; + unsigned long evtchn_pending_sel; }; struct kvm_vcpu_arch { @@ -642,6 +647,7 @@ struct kvm_vcpu_arch { u64 smi_count; bool tpr_access_reporting; bool xsaves_enabled; + bool xfd_no_write_intercept; u64 ia32_xss; u64 microcode_version; u64 arch_capabilities; @@ -691,18 +697,18 @@ struct kvm_vcpu_arch { * * Note that while the PKRU state lives inside the fpu registers, * it is switched out separately at VMENTER and VMEXIT time. The - * "guest_fpu" state here contains the guest FPU context, with the + * "guest_fpstate" state here contains the guest FPU context, with the * host PRKU bits. */ - struct fpu *user_fpu; - struct fpu *guest_fpu; + struct fpu_guest guest_fpu; u64 xcr0; u64 guest_supported_xcr0; struct kvm_pio_request pio; void *pio_data; - void *guest_ins_data; + void *sev_pio_data; + unsigned sev_pio_count; u8 event_exit_inst_len; @@ -727,6 +733,7 @@ struct kvm_vcpu_arch { int cpuid_nent; struct kvm_cpuid_entry2 *cpuid_entries; + u32 kvm_cpuid_base; u64 reserved_gpa_bits; int maxphyaddr; @@ -750,7 +757,7 @@ struct kvm_vcpu_arch { u8 preempted; u64 msr_val; u64 last_steal; - struct gfn_to_pfn_cache cache; + struct gfn_to_hva_cache cache; } st; u64 l1_tsc_offset; @@ -775,6 +782,7 @@ struct kvm_vcpu_arch { unsigned nmi_pending; /* NMI queued after currently running handler */ bool nmi_injected; /* Trying to inject an NMI this entry */ bool smi_pending; /* SMI queued after currently running handler */ + u8 handling_intr_from_guest; struct kvm_mtrr mtrr_state; u64 pat; @@ -1015,7 +1023,7 @@ struct msr_bitmap_range { struct kvm_xen { bool long_mode; u8 upcall_vector; - gfn_t shinfo_gfn; + struct gfn_to_pfn_cache shinfo_cache; }; enum kvm_irqchip_mode { @@ -1036,6 +1044,8 @@ struct kvm_x86_msr_filter { #define APICV_INHIBIT_REASON_IRQWIN 3 #define APICV_INHIBIT_REASON_PIT_REINJ 4 #define APICV_INHIBIT_REASON_X2APIC 5 +#define APICV_INHIBIT_REASON_BLOCKIRQ 6 +#define APICV_INHIBIT_REASON_ABSENT 7 struct kvm_arch { unsigned long n_used_mmu_pages; @@ -1073,7 +1083,7 @@ struct kvm_arch { atomic_t apic_map_dirty; /* Protects apic_access_memslot_enabled and apicv_inhibit_reasons */ - struct mutex apicv_update_lock; + struct rw_semaphore apicv_update_lock; bool apic_access_memslot_enabled; unsigned long apicv_inhibit_reasons; @@ -1087,17 +1097,23 @@ struct kvm_arch { unsigned long irq_sources_bitmap; s64 kvmclock_offset; + + /* + * This also protects nr_vcpus_matched_tsc which is read from a + * preemption-disabled region, so it must be a raw spinlock. + */ raw_spinlock_t tsc_write_lock; u64 last_tsc_nsec; u64 last_tsc_write; u32 last_tsc_khz; + u64 last_tsc_offset; u64 cur_tsc_nsec; u64 cur_tsc_write; u64 cur_tsc_offset; u64 cur_tsc_generation; int nr_vcpus_matched_tsc; - spinlock_t pvclock_gtod_sync_lock; + seqcount_raw_spinlock_t pvclock_sc; bool use_master_clock; u64 master_kernel_ns; u64 master_cycle_now; @@ -1207,10 +1223,11 @@ struct kvm_arch { #endif /* CONFIG_X86_64 */ /* - * If set, rmaps have been allocated for all memslots and should be - * allocated for any newly created or modified memslots. + * If set, at least one shadow root has been allocated. This flag + * is used as one input when determining whether certain memslot + * related allocations are necessary. */ - bool memslots_have_rmaps; + bool shadow_root_allocated; #if IS_ENABLED(CONFIG_HYPERV) hpa_t hv_root_tdp; @@ -1296,6 +1313,8 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical) } struct kvm_x86_ops { + const char *name; + int (*hardware_enable)(void); void (*hardware_disable)(void); void (*hardware_unsetup)(void); @@ -1327,6 +1346,7 @@ struct kvm_x86_ops { struct kvm_segment *var, int seg); void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); + void (*post_set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); bool (*is_valid_cr4)(struct kvm_vcpu *vcpu, unsigned long cr0); void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); int (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); @@ -1339,6 +1359,7 @@ struct kvm_x86_ops { void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); + bool (*get_if_flag)(struct kvm_vcpu *vcpu); void (*tlb_flush_all)(struct kvm_vcpu *vcpu); void (*tlb_flush_current)(struct kvm_vcpu *vcpu); @@ -1405,10 +1426,11 @@ struct kvm_x86_ops { void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu, u64 multiplier); /* - * Retrieve somewhat arbitrary exit information. Intended to be used - * only from within tracepoints to avoid VMREADs when tracing is off. + * Retrieve somewhat arbitrary exit information. Intended to + * be used only from within tracepoints or error paths. */ - void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2, + void (*get_exit_info)(struct kvm_vcpu *vcpu, u32 *reason, + u64 *info1, u64 *info2, u32 *exit_int_info, u32 *exit_int_info_err_code); int (*check_intercept)(struct kvm_vcpu *vcpu, @@ -1468,6 +1490,7 @@ struct kvm_x86_ops { int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd); + int (*vm_move_enc_context_from)(struct kvm *kvm, unsigned int source_fd); int (*get_msr_feature)(struct kvm_msr_entry *entry); @@ -1506,6 +1529,7 @@ struct kvm_x86_init_ops { int (*disabled_by_bios)(void); int (*check_processor_compatibility)(void); int (*hardware_setup)(void); + unsigned int (*handle_intel_pt_intr)(void); struct kvm_x86_ops *runtime_ops; }; @@ -1541,6 +1565,8 @@ static inline struct kvm *kvm_arch_alloc_vm(void) { return __vmalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); } + +#define __KVM_HAVE_ARCH_VM_FREE void kvm_arch_free_vm(struct kvm *kvm); #define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB @@ -1553,6 +1579,9 @@ static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm) return -ENOTSUPP; } +#define kvm_arch_pmi_in_guest(vcpu) \ + ((vcpu) && (vcpu)->arch.handling_intr_from_guest) + int kvm_mmu_module_init(void); void kvm_mmu_module_exit(void); @@ -1572,10 +1601,9 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, const struct kvm_memory_slot *memslot); void kvm_mmu_zap_all(struct kvm *kvm); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); -unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages); -int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3); +int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes); @@ -1625,7 +1653,8 @@ extern u64 kvm_mce_cap_supported; * * EMULTYPE_SKIP - Set when emulating solely to skip an instruction, i.e. to * decode the instruction length. For use *only* by - * kvm_x86_ops.skip_emulated_instruction() implementations. + * kvm_x86_ops.skip_emulated_instruction() implementations if + * EMULTYPE_COMPLETE_USER_EXIT is not set. * * EMULTYPE_ALLOW_RETRY_PF - Set when the emulator should resume the guest to * retry native execution under certain conditions, @@ -1645,6 +1674,10 @@ extern u64 kvm_mce_cap_supported; * * EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in which * case the CR2/GPA value pass on the stack is valid. + * + * EMULTYPE_COMPLETE_USER_EXIT - Set when the emulator should update interruptibility + * state and inject single-step #DBs after skipping + * an instruction (after completing userspace I/O). */ #define EMULTYPE_NO_DECODE (1 << 0) #define EMULTYPE_TRAP_UD (1 << 1) @@ -1653,10 +1686,14 @@ extern u64 kvm_mce_cap_supported; #define EMULTYPE_TRAP_UD_FORCED (1 << 4) #define EMULTYPE_VMWARE_GP (1 << 5) #define EMULTYPE_PF (1 << 6) +#define EMULTYPE_COMPLETE_USER_EXIT (1 << 7) int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type); int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, void *insn, int insn_len); +void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, + u64 *data, u8 ndata); +void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu); void kvm_enable_efer_bits(u64); bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); @@ -1674,7 +1711,7 @@ int kvm_emulate_monitor(struct kvm_vcpu *vcpu); int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in); int kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int kvm_emulate_halt(struct kvm_vcpu *vcpu); -int kvm_vcpu_halt(struct kvm_vcpu *vcpu); +int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu); int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu); int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); @@ -1685,8 +1722,6 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code); -void kvm_free_guest_fpu(struct kvm_vcpu *vcpu); - void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0); void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4); int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); @@ -1715,9 +1750,6 @@ void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); -int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gfn_t gfn, void *data, int offset, int len, - u32 access); bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr); @@ -1745,12 +1777,9 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu); void kvm_update_dr7(struct kvm_vcpu *vcpu); int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); -void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, ulong roots_to_free); void kvm_mmu_free_guest_mode_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu); -gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, - struct x86_exception *exception); gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception); gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, @@ -1866,7 +1895,6 @@ u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier); unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu); bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); -void kvm_make_mclock_inprogress_request(struct kvm *kvm); void kvm_make_scan_ioapic_request(struct kvm *kvm); void kvm_make_scan_ioapic_request_mask(struct kvm *kvm, unsigned long *vcpu_bitmap); @@ -1885,8 +1913,6 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu); int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu); -int kvm_is_in_guest(void); - void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size); bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); @@ -1915,8 +1941,6 @@ static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) static_call_cond(kvm_x86_vcpu_unblocking)(vcpu); } -static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} - static inline int kvm_cpu_get_apicid(int mps_cpu) { #ifdef CONFIG_X86_LOCAL_APIC @@ -1935,6 +1959,9 @@ static inline int kvm_cpu_get_apicid(int mps_cpu) int kvm_cpu_dirty_log_size(void); -int alloc_all_memslots_rmaps(struct kvm *kvm); +int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); + +#define KVM_CLOCK_VALID_FLAGS \ + (KVM_CLOCK_TSC_STABLE | KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC) #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h index 6a5f3acf2b33..eb186bc57f6a 100644 --- a/arch/x86/include/asm/kvm_page_track.h +++ b/arch/x86/include/asm/kvm_page_track.h @@ -49,8 +49,12 @@ struct kvm_page_track_notifier_node { int kvm_page_track_init(struct kvm *kvm); void kvm_page_track_cleanup(struct kvm *kvm); +bool kvm_page_track_write_tracking_enabled(struct kvm *kvm); +int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot); + void kvm_page_track_free_memslot(struct kvm_memory_slot *slot); -int kvm_page_track_create_memslot(struct kvm_memory_slot *slot, +int kvm_page_track_create_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot, unsigned long npages); void kvm_slot_page_track_add_page(struct kvm *kvm, @@ -59,8 +63,9 @@ void kvm_slot_page_track_add_page(struct kvm *kvm, void kvm_slot_page_track_remove_page(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, enum kvm_page_track_mode mode); -bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn, - enum kvm_page_track_mode mode); +bool kvm_slot_page_track_is_active(struct kvm *kvm, + const struct kvm_memory_slot *slot, + gfn_t gfn, enum kvm_page_track_mode mode); void kvm_page_track_register_notifier(struct kvm *kvm, diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 69299878b200..56935ebb1dfe 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -83,6 +83,18 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, return ret; } +static inline long kvm_sev_hypercall3(unsigned int nr, unsigned long p1, + unsigned long p2, unsigned long p3) +{ + long ret; + + asm volatile("vmmcall" + : "=a"(ret) + : "a"(nr), "b"(p1), "c"(p2), "d"(p3) + : "memory"); + return ret; +} + #ifdef CONFIG_KVM_GUEST void kvmclock_init(void); void kvmclock_disable(void); diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index 365111789cc6..030907922bd0 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -18,6 +18,20 @@ #define __ALIGN_STR __stringify(__ALIGN) #endif +#ifdef CONFIG_SLS +#define RET ret; int3 +#else +#define RET ret +#endif + +#else /* __ASSEMBLY__ */ + +#ifdef CONFIG_SLS +#define ASM_RET "ret; int3\n\t" +#else +#define ASM_RET "ret\n\t" +#endif + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_LINKAGE_H */ diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index da9321548f6f..cc73061e7255 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -205,28 +205,16 @@ struct cper_ia_proc_ctx; int mcheck_init(void); void mcheck_cpu_init(struct cpuinfo_x86 *c); void mcheck_cpu_clear(struct cpuinfo_x86 *c); -void mcheck_vendor_init_severity(void); int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id); #else static inline int mcheck_init(void) { return 0; } static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {} static inline void mcheck_cpu_clear(struct cpuinfo_x86 *c) {} -static inline void mcheck_vendor_init_severity(void) {} static inline int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) { return -EINVAL; } #endif -#ifdef CONFIG_X86_ANCIENT_MCE -void intel_p5_mcheck_init(struct cpuinfo_x86 *c); -void winchip_mcheck_init(struct cpuinfo_x86 *c); -static inline void enable_p5_mce(void) { mce_p5_enabled = 1; } -#else -static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {} -static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {} -static inline void enable_p5_mce(void) {} -#endif - void mce_setup(struct mce *m); void mce_log(struct mce *m); DECLARE_PER_CPU(struct device *, mce_device); @@ -325,31 +313,22 @@ enum smca_bank_types { SMCA_SMU, /* System Management Unit */ SMCA_SMU_V2, SMCA_MP5, /* Microprocessor 5 Unit */ + SMCA_MPDMA, /* MPDMA Unit */ SMCA_NBIO, /* Northbridge IO Unit */ SMCA_PCIE, /* PCI Express Unit */ SMCA_PCIE_V2, SMCA_XGMI_PCS, /* xGMI PCS Unit */ + SMCA_NBIF, /* NBIF Unit */ + SMCA_SHUB, /* System HUB Unit */ + SMCA_SATA, /* SATA Unit */ + SMCA_USB, /* USB Unit */ + SMCA_GMI_PCS, /* GMI PCS Unit */ SMCA_XGMI_PHY, /* xGMI PHY Unit */ SMCA_WAFL_PHY, /* WAFL PHY Unit */ + SMCA_GMI_PHY, /* GMI PHY Unit */ N_SMCA_BANK_TYPES }; -#define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype)) - -struct smca_hwid { - unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */ - u32 hwid_mcatype; /* (hwid,mcatype) tuple */ - u8 count; /* Number of instances. */ -}; - -struct smca_bank { - struct smca_hwid *hwid; - u32 id; /* Value of MCA_IPID[InstanceId]. */ - u8 sysfs_id; /* Value used for sysfs name. */ -}; - -extern struct smca_bank smca_banks[MAX_NR_BANKS]; - extern const char *smca_get_long_name(enum smca_bank_types t); extern bool amd_mce_is_memory_error(struct mce *m); @@ -357,16 +336,13 @@ extern int mce_threshold_create_device(unsigned int cpu); extern int mce_threshold_remove_device(unsigned int cpu); void mce_amd_feature_init(struct cpuinfo_x86 *c); -int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr); - +enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank); #else static inline int mce_threshold_create_device(unsigned int cpu) { return 0; }; static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; }; static inline bool amd_mce_is_memory_error(struct mce *m) { return false; }; static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } -static inline int -umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) { return -EINVAL; }; #endif static inline void mce_hygon_feature_init(struct cpuinfo_x86 *c) { return mce_amd_feature_init(c); } diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 9c80c68d75b5..e2c6f433ed10 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -13,6 +13,7 @@ #ifndef __ASSEMBLY__ #include <linux/init.h> +#include <linux/cc_platform.h> #include <asm/bootparam.h> @@ -43,6 +44,8 @@ void __init sme_enable(struct boot_params *bp); int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size); int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size); +void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, + bool enc); void __init mem_encrypt_free_decrypted_mem(void); @@ -50,9 +53,6 @@ void __init mem_encrypt_free_decrypted_mem(void); void __init mem_encrypt_init(void); void __init sev_es_init_vc_handling(void); -bool sme_active(void); -bool sev_active(void); -bool sev_es_active(void); #define __bss_decrypted __section(".bss..decrypted") @@ -75,14 +75,13 @@ static inline void __init sme_encrypt_kernel(struct boot_params *bp) { } static inline void __init sme_enable(struct boot_params *bp) { } static inline void sev_es_init_vc_handling(void) { } -static inline bool sme_active(void) { return false; } -static inline bool sev_active(void) { return false; } -static inline bool sev_es_active(void) { return false; } static inline int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; } static inline int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; } +static inline void __init +early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, bool enc) {} static inline void mem_encrypt_free_decrypted_mem(void) { } @@ -101,11 +100,6 @@ static inline void mem_encrypt_free_decrypted_mem(void) { } extern char __start_bss_decrypted[], __end_bss_decrypted[], __start_bss_decrypted_unused[]; -static inline bool mem_encrypt_active(void) -{ - return sme_me_mask; -} - static inline u64 sme_get_me_mask(void) { return sme_me_mask; diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index ab45a220fac4..d6bfdfb0f0af 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -130,14 +130,11 @@ static inline unsigned int x86_cpuid_family(void) extern void __init load_ucode_bsp(void); extern void load_ucode_ap(void); void reload_early_microcode(void); -extern bool get_builtin_firmware(struct cpio_data *cd, const char *name); extern bool initrd_gone; #else static inline void __init load_ucode_bsp(void) { } static inline void load_ucode_ap(void) { } static inline void reload_early_microcode(void) { } -static inline bool -get_builtin_firmware(struct cpio_data *cd, const char *name) { return false; } #endif #endif /* _ASM_X86_MICROCODE_H */ diff --git a/arch/x86/include/asm/mmx.h b/arch/x86/include/asm/mmx.h index f572d0f944bb..e69de29bb2d1 100644 --- a/arch/x86/include/asm/mmx.h +++ b/arch/x86/include/asm/mmx.h @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_MMX_H -#define _ASM_X86_MMX_H - -/* - * MMX 3Dnow! helper operations - */ - -#include <linux/types.h> - -extern void *_mmx_memcpy(void *to, const void *from, size_t size); -extern void mmx_clear_page(void *page); -extern void mmx_copy_page(void *to, void *from); - -#endif /* _ASM_X86_MMX_H */ diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index adccbc209169..a82f603d4312 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -11,23 +11,14 @@ #include <asm/paravirt.h> #include <asm/mshyperv.h> +union hv_ghcb; + +DECLARE_STATIC_KEY_FALSE(isolation_type_snp); + typedef int (*hyperv_fill_flush_list_func)( struct hv_guest_mapping_flush_list *flush, void *data); -static inline void hv_set_register(unsigned int reg, u64 value) -{ - wrmsrl(reg, value); -} - -static inline u64 hv_get_register(unsigned int reg) -{ - u64 value; - - rdmsrl(reg, value); - return value; -} - #define hv_get_raw_timer() rdtsc_ordered() void hyperv_vector_handler(struct pt_regs *regs); @@ -39,6 +30,8 @@ extern void *hv_hypercall_pg; extern u64 hv_current_partition_id; +extern union hv_ghcb * __percpu *hv_ghcb_pg; + int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages); int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id); int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags); @@ -176,18 +169,55 @@ bool hv_vcpu_is_preempted(int vcpu); static inline void hv_apic_init(void) {} #endif -static inline void hv_set_msi_entry_from_desc(union hv_msi_entry *msi_entry, - struct msi_desc *msi_desc) -{ - msi_entry->address.as_uint32 = msi_desc->msg.address_lo; - msi_entry->data.as_uint32 = msi_desc->msg.data; -} - struct irq_domain *hv_create_pci_msi_domain(void); int hv_map_ioapic_interrupt(int ioapic_id, bool level, int vcpu, int vector, struct hv_interrupt_entry *entry); int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry); +int hv_set_mem_host_visibility(unsigned long addr, int numpages, bool visible); + +#ifdef CONFIG_AMD_MEM_ENCRYPT +void hv_ghcb_msr_write(u64 msr, u64 value); +void hv_ghcb_msr_read(u64 msr, u64 *value); +#else +static inline void hv_ghcb_msr_write(u64 msr, u64 value) {} +static inline void hv_ghcb_msr_read(u64 msr, u64 *value) {} +#endif + +extern bool hv_isolation_type_snp(void); + +static inline bool hv_is_synic_reg(unsigned int reg) +{ + if ((reg >= HV_REGISTER_SCONTROL) && + (reg <= HV_REGISTER_SINT15)) + return true; + return false; +} + +static inline u64 hv_get_register(unsigned int reg) +{ + u64 value; + + if (hv_is_synic_reg(reg) && hv_isolation_type_snp()) + hv_ghcb_msr_read(reg, &value); + else + rdmsrl(reg, value); + return value; +} + +static inline void hv_set_register(unsigned int reg, u64 value) +{ + if (hv_is_synic_reg(reg) && hv_isolation_type_snp()) { + hv_ghcb_msr_write(reg, value); + + /* Write proxy bit via wrmsl instruction */ + if (reg >= HV_REGISTER_SINT0 && + reg <= HV_REGISTER_SINT15) + wrmsrl(reg, value | 1 << 20); + } else { + wrmsrl(reg, value); + } +} #else /* CONFIG_HYPERV */ static inline void hyperv_init(void) {} @@ -205,6 +235,13 @@ static inline int hyperv_flush_guest_mapping_range(u64 as, { return -1; } +static inline void hv_set_register(unsigned int reg, u64 value) { } +static inline u64 hv_get_register(unsigned int reg) { return 0; } +static inline int hv_set_mem_host_visibility(unsigned long addr, int numpages, + bool visible) +{ + return -1; +} #endif /* CONFIG_HYPERV */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index a7c413432b33..3faf0f97edb1 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -486,6 +486,23 @@ #define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f +/* AMD Collaborative Processor Performance Control MSRs */ +#define MSR_AMD_CPPC_CAP1 0xc00102b0 +#define MSR_AMD_CPPC_ENABLE 0xc00102b1 +#define MSR_AMD_CPPC_CAP2 0xc00102b2 +#define MSR_AMD_CPPC_REQ 0xc00102b3 +#define MSR_AMD_CPPC_STATUS 0xc00102b4 + +#define AMD_CPPC_LOWEST_PERF(x) (((x) >> 0) & 0xff) +#define AMD_CPPC_LOWNONLIN_PERF(x) (((x) >> 8) & 0xff) +#define AMD_CPPC_NOMINAL_PERF(x) (((x) >> 16) & 0xff) +#define AMD_CPPC_HIGHEST_PERF(x) (((x) >> 24) & 0xff) + +#define AMD_CPPC_MAX_PERF(x) (((x) & 0xff) << 0) +#define AMD_CPPC_MIN_PERF(x) (((x) & 0xff) << 8) +#define AMD_CPPC_DES_PERF(x) (((x) & 0xff) << 16) +#define AMD_CPPC_ENERGY_PERF_PREF(x) (((x) & 0xff) << 24) + /* Fam 17h MSRs */ #define MSR_F17H_IRPERF 0xc00000e9 @@ -625,6 +642,8 @@ #define MSR_IA32_BNDCFGS_RSVD 0x00000ffc +#define MSR_IA32_XFD 0x000001c4 +#define MSR_IA32_XFD_ERR 0x000001c5 #define MSR_IA32_XSS 0x00000da0 #define MSR_IA32_APICBASE 0x0000001b diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index a3f87f1015d3..d42e6c6b47b1 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -92,7 +92,7 @@ static __always_inline unsigned long long __rdmsr(unsigned int msr) asm volatile("1: rdmsr\n" "2:\n" - _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_unsafe) + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR) : EAX_EDX_RET(val, low, high) : "c" (msr)); return EAX_EDX_VAL(val, low, high); @@ -102,7 +102,7 @@ static __always_inline void __wrmsr(unsigned int msr, u32 low, u32 high) { asm volatile("1: wrmsr\n" "2:\n" - _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_unsafe) + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR) : : "c" (msr), "a"(low), "d" (high) : "memory"); } @@ -137,17 +137,11 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr, { DECLARE_ARGS(val, low, high); - asm volatile("2: rdmsr ; xor %[err],%[err]\n" - "1:\n\t" - ".section .fixup,\"ax\"\n\t" - "3: mov %[fault],%[err]\n\t" - "xorl %%eax, %%eax\n\t" - "xorl %%edx, %%edx\n\t" - "jmp 1b\n\t" - ".previous\n\t" - _ASM_EXTABLE(2b, 3b) + asm volatile("1: rdmsr ; xor %[err],%[err]\n" + "2:\n\t" + _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_RDMSR_SAFE, %[err]) : [err] "=r" (*err), EAX_EDX_RET(val, low, high) - : "c" (msr), [fault] "i" (-EIO)); + : "c" (msr)); if (tracepoint_enabled(read_msr)) do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), *err); return EAX_EDX_VAL(val, low, high); @@ -169,15 +163,11 @@ native_write_msr_safe(unsigned int msr, u32 low, u32 high) { int err; - asm volatile("2: wrmsr ; xor %[err],%[err]\n" - "1:\n\t" - ".section .fixup,\"ax\"\n\t" - "3: mov %[fault],%[err] ; jmp 1b\n\t" - ".previous\n\t" - _ASM_EXTABLE(2b, 3b) + asm volatile("1: wrmsr ; xor %[err],%[err]\n" + "2:\n\t" + _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_WRMSR_SAFE, %[err]) : [err] "=a" (err) - : "c" (msr), "0" (low), "d" (high), - [fault] "i" (-EIO) + : "c" (msr), "0" (low), "d" (high) : "memory"); if (tracepoint_enabled(write_msr)) do_trace_write_msr(msr, ((u64)high << 32 | low), err); diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index 829df26fd7a3..76d726074c16 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -24,8 +24,8 @@ #define _ASM_X86_MTRR_H #include <uapi/asm/mtrr.h> -#include <asm/memtype.h> +void mtrr_bp_init(void); /* * The following functions are for use by other drivers that cannot use @@ -43,7 +43,6 @@ extern int mtrr_del(int reg, unsigned long base, unsigned long size); extern int mtrr_del_page(int reg, unsigned long base, unsigned long size); extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi); extern void mtrr_ap_init(void); -extern void mtrr_bp_init(void); extern void set_mtrr_aps_delayed_init(void); extern void mtrr_aps_init(void); extern void mtrr_bp_restore(void); @@ -84,11 +83,6 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn) static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) { } -static inline void mtrr_bp_init(void) -{ - pat_disable("PAT support disabled because CONFIG_MTRR is disabled in the kernel."); -} - #define mtrr_ap_init() do {} while (0) #define set_mtrr_aps_delayed_init() do {} while (0) #define mtrr_aps_init() do {} while (0) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index ec2d5c8c6694..cc74dc584836 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -5,12 +5,15 @@ #include <linux/static_key.h> #include <linux/objtool.h> +#include <linux/linkage.h> #include <asm/alternative.h> #include <asm/cpufeatures.h> #include <asm/msr-index.h> #include <asm/unwind_hints.h> +#define RETPOLINE_THUNK_SIZE 32 + /* * Fill the CPU return stack buffer. * @@ -118,6 +121,16 @@ ".popsection\n\t" #ifdef CONFIG_RETPOLINE + +typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE]; + +#define GEN(reg) \ + extern retpoline_thunk_t __x86_indirect_thunk_ ## reg; +#include <asm/GEN-for-each-reg.h> +#undef GEN + +extern retpoline_thunk_t __x86_indirect_thunk_array[]; + #ifdef CONFIG_X86_64 /* @@ -303,63 +316,4 @@ static inline void mds_idle_clear_cpu_buffers(void) #endif /* __ASSEMBLY__ */ -/* - * Below is used in the eBPF JIT compiler and emits the byte sequence - * for the following assembly: - * - * With retpolines configured: - * - * callq do_rop - * spec_trap: - * pause - * lfence - * jmp spec_trap - * do_rop: - * mov %rcx,(%rsp) for x86_64 - * mov %edx,(%esp) for x86_32 - * retq - * - * Without retpolines configured: - * - * jmp *%rcx for x86_64 - * jmp *%edx for x86_32 - */ -#ifdef CONFIG_RETPOLINE -# ifdef CONFIG_X86_64 -# define RETPOLINE_RCX_BPF_JIT_SIZE 17 -# define RETPOLINE_RCX_BPF_JIT() \ -do { \ - EMIT1_off32(0xE8, 7); /* callq do_rop */ \ - /* spec_trap: */ \ - EMIT2(0xF3, 0x90); /* pause */ \ - EMIT3(0x0F, 0xAE, 0xE8); /* lfence */ \ - EMIT2(0xEB, 0xF9); /* jmp spec_trap */ \ - /* do_rop: */ \ - EMIT4(0x48, 0x89, 0x0C, 0x24); /* mov %rcx,(%rsp) */ \ - EMIT1(0xC3); /* retq */ \ -} while (0) -# else /* !CONFIG_X86_64 */ -# define RETPOLINE_EDX_BPF_JIT() \ -do { \ - EMIT1_off32(0xE8, 7); /* call do_rop */ \ - /* spec_trap: */ \ - EMIT2(0xF3, 0x90); /* pause */ \ - EMIT3(0x0F, 0xAE, 0xE8); /* lfence */ \ - EMIT2(0xEB, 0xF9); /* jmp spec_trap */ \ - /* do_rop: */ \ - EMIT3(0x89, 0x14, 0x24); /* mov %edx,(%esp) */ \ - EMIT1(0xC3); /* ret */ \ -} while (0) -# endif -#else /* !CONFIG_RETPOLINE */ -# ifdef CONFIG_X86_64 -# define RETPOLINE_RCX_BPF_JIT_SIZE 2 -# define RETPOLINE_RCX_BPF_JIT() \ - EMIT2(0xFF, 0xE1); /* jmp *%rcx */ -# else /* !CONFIG_X86_64 */ -# define RETPOLINE_EDX_BPF_JIT() \ - EMIT2(0xFF, 0xE2) /* jmp *%edx */ -# endif -#endif - #endif /* _ASM_X86_NOSPEC_BRANCH_H_ */ diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h index 94dbd51df58f..df42f8aa99e4 100644 --- a/arch/x86/include/asm/page_32.h +++ b/arch/x86/include/asm/page_32.h @@ -19,19 +19,6 @@ extern unsigned long __phys_addr(unsigned long); #define pfn_valid(pfn) ((pfn) < max_mapnr) #endif /* CONFIG_FLATMEM */ -#ifdef CONFIG_X86_USE_3DNOW -#include <asm/mmx.h> - -static inline void clear_page(void *page) -{ - mmx_clear_page(page); -} - -static inline void copy_page(void *to, void *from) -{ - mmx_copy_page(to, from); -} -#else /* !CONFIG_X86_USE_3DNOW */ #include <linux/string.h> static inline void clear_page(void *page) @@ -43,7 +30,6 @@ static inline void copy_page(void *to, void *from) { memcpy(to, from, PAGE_SIZE); } -#endif /* CONFIG_X86_3DNOW */ #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PAGE_32_H */ diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index 4bde0dc66100..e9c86299b835 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -5,6 +5,7 @@ #include <asm/page_64_types.h> #ifndef __ASSEMBLY__ +#include <asm/cpufeatures.h> #include <asm/alternative.h> /* duplicated to the one in bootmem.h */ diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index a8d4ad856568..e9e2c3ba5923 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -15,7 +15,7 @@ #define THREAD_SIZE_ORDER (2 + KASAN_STACK_ORDER) #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) -#define EXCEPTION_STACK_ORDER (0 + KASAN_STACK_ORDER) +#define EXCEPTION_STACK_ORDER (1 + KASAN_STACK_ORDER) #define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER) #define IRQ_STACK_ORDER (2 + KASAN_STACK_ORDER) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index da3a1ac82be5..0d76502cc6f5 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -52,11 +52,11 @@ void __init paravirt_set_cap(void); /* The paravirtualized I/O functions */ static inline void slow_down_io(void) { - pv_ops.cpu.io_delay(); + PVOP_VCALL0(cpu.io_delay); #ifdef REALLY_SLOW_IO - pv_ops.cpu.io_delay(); - pv_ops.cpu.io_delay(); - pv_ops.cpu.io_delay(); + PVOP_VCALL0(cpu.io_delay); + PVOP_VCALL0(cpu.io_delay); + PVOP_VCALL0(cpu.io_delay); #endif } @@ -97,6 +97,12 @@ static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) PVOP_VCALL1(mmu.exit_mmap, mm); } +static inline void notify_page_enc_status_changed(unsigned long pfn, + int npages, bool enc) +{ + PVOP_VCALL3(mmu.notify_page_enc_status_changed, pfn, npages, enc); +} + #ifdef CONFIG_PARAVIRT_XXL static inline void load_sp0(unsigned long sp0) { @@ -113,12 +119,12 @@ static inline void __cpuid(unsigned int *eax, unsigned int *ebx, /* * These special macros can be used to get or set a debugging register */ -static inline unsigned long paravirt_get_debugreg(int reg) +static __always_inline unsigned long paravirt_get_debugreg(int reg) { return PVOP_CALL1(unsigned long, cpu.get_debugreg, reg); } #define get_debugreg(var, reg) var = paravirt_get_debugreg(reg) -static inline void set_debugreg(unsigned long val, int reg) +static __always_inline void set_debugreg(unsigned long val, int reg) { PVOP_VCALL2(cpu.set_debugreg, reg, val); } @@ -133,14 +139,14 @@ static inline void write_cr0(unsigned long x) PVOP_VCALL1(cpu.write_cr0, x); } -static inline unsigned long read_cr2(void) +static __always_inline unsigned long read_cr2(void) { return PVOP_ALT_CALLEE0(unsigned long, mmu.read_cr2, "mov %%cr2, %%rax;", ALT_NOT(X86_FEATURE_XENPV)); } -static inline void write_cr2(unsigned long x) +static __always_inline void write_cr2(unsigned long x) { PVOP_VCALL1(mmu.write_cr2, x); } @@ -653,10 +659,10 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu); * functions. */ #define PV_THUNK_NAME(func) "__raw_callee_save_" #func -#define PV_CALLEE_SAVE_REGS_THUNK(func) \ +#define __PV_CALLEE_SAVE_REGS_THUNK(func, section) \ extern typeof(func) __raw_callee_save_##func; \ \ - asm(".pushsection .text;" \ + asm(".pushsection " section ", \"ax\";" \ ".globl " PV_THUNK_NAME(func) ";" \ ".type " PV_THUNK_NAME(func) ", @function;" \ PV_THUNK_NAME(func) ":" \ @@ -665,10 +671,13 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu); "call " #func ";" \ PV_RESTORE_ALL_CALLER_REGS \ FRAME_END \ - "ret;" \ + ASM_RET \ ".size " PV_THUNK_NAME(func) ", .-" PV_THUNK_NAME(func) ";" \ ".popsection") +#define PV_CALLEE_SAVE_REGS_THUNK(func) \ + __PV_CALLEE_SAVE_REGS_THUNK(func, ".text") + /* Get a reference to a callee-save function */ #define PV_CALLEE_SAVE(func) \ ((struct paravirt_callee_save) { __raw_callee_save_##func }) @@ -678,23 +687,23 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu); ((struct paravirt_callee_save) { func }) #ifdef CONFIG_PARAVIRT_XXL -static inline notrace unsigned long arch_local_save_flags(void) +static __always_inline unsigned long arch_local_save_flags(void) { return PVOP_ALT_CALLEE0(unsigned long, irq.save_fl, "pushf; pop %%rax;", ALT_NOT(X86_FEATURE_XENPV)); } -static inline notrace void arch_local_irq_disable(void) +static __always_inline void arch_local_irq_disable(void) { PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT(X86_FEATURE_XENPV)); } -static inline notrace void arch_local_irq_enable(void) +static __always_inline void arch_local_irq_enable(void) { PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT(X86_FEATURE_XENPV)); } -static inline notrace unsigned long arch_local_irq_save(void) +static __always_inline unsigned long arch_local_irq_save(void) { unsigned long f; @@ -743,11 +752,6 @@ extern void default_banner(void); #define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .quad, 8) #define PARA_INDIRECT(addr) *addr(%rip) -#define INTERRUPT_RETURN \ - ANNOTATE_RETPOLINE_SAFE; \ - ALTERNATIVE_TERNARY("jmp *paravirt_iret(%rip);", \ - X86_FEATURE_XENPV, "jmp xen_iret;", "jmp native_iret;") - #ifdef CONFIG_DEBUG_ENTRY .macro PARA_IRQ_save_fl PARA_SITE(PARA_PATCH(PV_IRQ_save_fl), diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index d9d6b0203ec4..a69012e1903f 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -168,6 +168,7 @@ struct pv_mmu_ops { /* Hook for intercepting the destruction of an mm_struct. */ void (*exit_mmap)(struct mm_struct *mm); + void (*notify_page_enc_status_changed)(unsigned long pfn, int npages, bool enc); #ifdef CONFIG_PARAVIRT_XXL struct paravirt_callee_save read_cr2; @@ -577,7 +578,9 @@ void paravirt_leave_lazy_mmu(void); void paravirt_flush_lazy_mmu(void); void _paravirt_nop(void); +void paravirt_BUG(void); u64 _paravirt_ident_64(u64); +unsigned long paravirt_ret0(void); #define paravirt_nop ((void *)_paravirt_nop) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 448cd01eb3ec..8a9432fb3802 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -22,10 +22,12 @@ #define pgprot_decrypted(prot) __pgprot(__sme_clr(pgprot_val(prot))) #ifndef __ASSEMBLY__ +#include <linux/spinlock.h> #include <asm/x86_init.h> #include <asm/pkru.h> #include <asm/fpu/api.h> #include <asm-generic/pgtable_uffd.h> +#include <linux/page_table_check.h> extern pgd_t early_top_pgt[PTRS_PER_PGD]; bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd); @@ -752,7 +754,7 @@ static inline bool pte_accessible(struct mm_struct *mm, pte_t a) return true; if ((pte_flags(a) & _PAGE_PROTNONE) && - mm_tlb_flush_pending(mm)) + atomic_read(&mm->tlb_flush_pending)) return true; return false; @@ -1006,18 +1008,21 @@ static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp) static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { + page_table_check_pte_set(mm, addr, ptep, pte); set_pte(ptep, pte); } static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { + page_table_check_pmd_set(mm, addr, pmdp, pmd); set_pmd(pmdp, pmd); } static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud) { + page_table_check_pud_set(mm, addr, pudp, pud); native_set_pud(pudp, pud); } @@ -1048,6 +1053,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t pte = native_ptep_get_and_clear(ptep); + page_table_check_pte_clear(mm, addr, pte); return pte; } @@ -1063,12 +1069,23 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, * care about updates and native needs no locking */ pte = native_local_ptep_get_and_clear(ptep); + page_table_check_pte_clear(mm, addr, pte); } else { pte = ptep_get_and_clear(mm, addr, ptep); } return pte; } +#define __HAVE_ARCH_PTEP_CLEAR +static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + if (IS_ENABLED(CONFIG_PAGE_TABLE_CHECK)) + ptep_get_and_clear(mm, addr, ptep); + else + pte_clear(mm, addr, ptep); +} + #define __HAVE_ARCH_PTEP_SET_WRPROTECT static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) @@ -1109,14 +1126,22 @@ static inline int pmd_write(pmd_t pmd) static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { - return native_pmdp_get_and_clear(pmdp); + pmd_t pmd = native_pmdp_get_and_clear(pmdp); + + page_table_check_pmd_clear(mm, addr, pmd); + + return pmd; } #define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pud_t *pudp) { - return native_pudp_get_and_clear(pudp); + pud_t pud = native_pudp_get_and_clear(pudp); + + page_table_check_pud_clear(mm, addr, pud); + + return pud; } #define __HAVE_ARCH_PMDP_SET_WRPROTECT @@ -1137,6 +1162,7 @@ static inline int pud_write(pud_t pud) static inline pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { + page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd); if (IS_ENABLED(CONFIG_SMP)) { return xchg(pmdp, pmd); } else { diff --git a/arch/x86/include/asm/pkru.h b/arch/x86/include/asm/pkru.h index ccc539faa5bb..74f0a2d34ffd 100644 --- a/arch/x86/include/asm/pkru.h +++ b/arch/x86/include/asm/pkru.h @@ -2,10 +2,10 @@ #ifndef _ASM_X86_PKRU_H #define _ASM_X86_PKRU_H -#include <asm/fpu/xstate.h> +#include <asm/cpufeature.h> -#define PKRU_AD_BIT 0x1 -#define PKRU_WD_BIT 0x2 +#define PKRU_AD_BIT 0x1u +#define PKRU_WD_BIT 0x2u #define PKRU_BITS_PER_PKEY 2 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 9ad2acaaae9b..2c5f12ae7d04 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -164,7 +164,8 @@ enum cpuid_regs_idx { #define X86_VENDOR_NSC 8 #define X86_VENDOR_HYGON 9 #define X86_VENDOR_ZHAOXIN 10 -#define X86_VENDOR_NUM 11 +#define X86_VENDOR_VORTEX 11 +#define X86_VENDOR_NUM 12 #define X86_VENDOR_UNKNOWN 0xff @@ -461,9 +462,6 @@ DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr); #endif /* !X86_64 */ -extern unsigned int fpu_kernel_xstate_size; -extern unsigned int fpu_user_xstate_size; - struct perf_event; struct thread_struct { @@ -518,6 +516,7 @@ struct thread_struct { */ unsigned long iopl_emul; + unsigned int iopl_warn:1; unsigned int sig_on_uaccess_err:1; /* @@ -537,12 +536,12 @@ struct thread_struct { */ }; -/* Whitelist the FPU state from the task_struct for hardened usercopy. */ +extern void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size); + static inline void arch_thread_struct_whitelist(unsigned long *offset, unsigned long *size) { - *offset = offsetof(struct thread_struct, fpu.state); - *size = fpu_kernel_xstate_size; + fpu_thread_struct_whitelist(offset, size); } static inline void @@ -589,7 +588,7 @@ static inline void load_sp0(unsigned long sp0) /* Free all resources held by a thread. */ extern void release_thread(struct task_struct *); -unsigned long get_wchan(struct task_struct *p); +unsigned long __get_wchan(struct task_struct *p); /* * Generic CPUID function @@ -807,11 +806,14 @@ static inline u32 amd_get_nodes_per_socket(void) { return 0; } static inline u32 amd_get_highest_perf(void) { return 0; } #endif +#define for_each_possible_hypervisor_cpuid_base(function) \ + for (function = 0x40000000; function < 0x40010000; function += 0x100) + static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) { uint32_t base, eax, signature[3]; - for (base = 0x40000000; base < 0x40010000; base += 0x100) { + for_each_possible_hypervisor_cpuid_base(base) { cpuid(base, &eax, &signature[0], &signature[1], &signature[2]); if (!memcmp(sig, signature, 12) && @@ -853,4 +855,12 @@ enum mds_mitigations { MDS_MITIGATION_VMWERV, }; +#ifdef CONFIG_X86_SGX +int arch_memory_failure(unsigned long pfn, int flags); +#define arch_memory_failure arch_memory_failure + +bool arch_is_platform_page(u64 paddr); +#define arch_is_platform_page arch_is_platform_page +#endif + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 8c5d1910a848..feed36d44d04 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -40,6 +40,6 @@ void x86_report_nx(void); extern int reboot_force; long do_arch_prctl_common(struct task_struct *task, int option, - unsigned long cpuid_enabled); + unsigned long arg2); #endif /* _ASM_X86_PROTO_H */ diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index b94f615600d5..703663175a5a 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -181,7 +181,7 @@ static inline bool any_64bit_mode(struct pt_regs *regs) #define current_user_stack_pointer() current_pt_regs()->sp #define compat_user_stack_pointer() current_pt_regs()->sp -static inline bool ip_within_syscall_gap(struct pt_regs *regs) +static __always_inline bool ip_within_syscall_gap(struct pt_regs *regs) { bool ret = (regs->ip >= (unsigned long)entry_SYSCALL_64 && regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack); diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index d86ab942219c..d87451df480b 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -53,6 +53,7 @@ static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) static inline void queued_spin_unlock(struct qspinlock *lock) { + kcsan_release(); pv_queued_spin_unlock(lock); } diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h index 159622ee0674..1474cf96251d 100644 --- a/arch/x86/include/asm/qspinlock_paravirt.h +++ b/arch/x86/include/asm/qspinlock_paravirt.h @@ -48,7 +48,7 @@ asm (".pushsection .text;" "jne .slowpath;" "pop %rdx;" FRAME_END - "ret;" + ASM_RET ".slowpath: " "push %rsi;" "movzbl %al,%esi;" @@ -56,7 +56,7 @@ asm (".pushsection .text;" "pop %rsi;" "pop %rdx;" FRAME_END - "ret;" + ASM_RET ".size " PV_UNLOCK ", .-" PV_UNLOCK ";" ".popsection"); diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index 5db5d083c873..331474b150f1 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -89,6 +89,7 @@ static inline void set_real_mode_mem(phys_addr_t mem) } void reserve_real_mode(void); +void load_trampoline_pgtable(void); #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index b2d504f11937..aff774775c67 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -35,11 +35,7 @@ # define NEED_CMOV 0 #endif -#ifdef CONFIG_X86_USE_3DNOW -# define NEED_3DNOW (1<<(X86_FEATURE_3DNOW & 31)) -#else # define NEED_3DNOW 0 -#endif #if defined(CONFIG_X86_P6_NOP) || defined(CONFIG_X86_64) # define NEED_NOPL (1<<(X86_FEATURE_NOPL & 31)) diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 72044026eb3c..b228c9d44ee7 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -307,14 +307,7 @@ do { \ \ asm volatile(" \n" \ "1: movl %k0,%%" #seg " \n" \ - \ - ".section .fixup,\"ax\" \n" \ - "2: xorl %k0,%k0 \n" \ - " jmp 1b \n" \ - ".previous \n" \ - \ - _ASM_EXTABLE(1b, 2b) \ - \ + _ASM_EXTABLE_TYPE_REG(1b, 1b, EX_TYPE_ZERO_REG, %k0)\ : "+r" (__val) : : "memory"); \ } while (0) @@ -339,7 +332,7 @@ static inline void __loadsegment_fs(unsigned short value) "1: movw %0, %%fs \n" "2: \n" - _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_clear_fs) + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_CLEAR_FS) : : "rm" (value) : "memory"); } diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index 43fa081a1adb..ff0f2d90338a 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -2,6 +2,7 @@ #ifndef _ASM_X86_SET_MEMORY_H #define _ASM_X86_SET_MEMORY_H +#include <linux/mm.h> #include <asm/page.h> #include <asm-generic/set_memory.h> @@ -83,6 +84,7 @@ int set_pages_rw(struct page *page, int numpages); int set_direct_map_invalid_noflush(struct page *page); int set_direct_map_default_noflush(struct page *page); bool kernel_page_present(struct page *page); +void notify_range_enc_status_changed(unsigned long vaddr, int npages, bool enc); extern int kernel_set_to_readonly; @@ -98,6 +100,9 @@ static inline int set_mce_nospec(unsigned long pfn, bool unmap) unsigned long decoy_addr; int rc; + /* SGX pages are not in the 1:1 map */ + if (arch_is_platform_page(pfn << PAGE_SHIFT)) + return 0; /* * We would like to just call: * set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1); diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 2cef6c5a52c2..1b2fd32b42fe 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -18,20 +18,19 @@ /* SEV Information Request/Response */ #define GHCB_MSR_SEV_INFO_RESP 0x001 #define GHCB_MSR_SEV_INFO_REQ 0x002 -#define GHCB_MSR_VER_MAX_POS 48 -#define GHCB_MSR_VER_MAX_MASK 0xffff -#define GHCB_MSR_VER_MIN_POS 32 -#define GHCB_MSR_VER_MIN_MASK 0xffff -#define GHCB_MSR_CBIT_POS 24 -#define GHCB_MSR_CBIT_MASK 0xff -#define GHCB_MSR_SEV_INFO(_max, _min, _cbit) \ - ((((_max) & GHCB_MSR_VER_MAX_MASK) << GHCB_MSR_VER_MAX_POS) | \ - (((_min) & GHCB_MSR_VER_MIN_MASK) << GHCB_MSR_VER_MIN_POS) | \ - (((_cbit) & GHCB_MSR_CBIT_MASK) << GHCB_MSR_CBIT_POS) | \ + +#define GHCB_MSR_SEV_INFO(_max, _min, _cbit) \ + /* GHCBData[63:48] */ \ + ((((_max) & 0xffff) << 48) | \ + /* GHCBData[47:32] */ \ + (((_min) & 0xffff) << 32) | \ + /* GHCBData[31:24] */ \ + (((_cbit) & 0xff) << 24) | \ GHCB_MSR_SEV_INFO_RESP) + #define GHCB_MSR_INFO(v) ((v) & 0xfffUL) -#define GHCB_MSR_PROTO_MAX(v) (((v) >> GHCB_MSR_VER_MAX_POS) & GHCB_MSR_VER_MAX_MASK) -#define GHCB_MSR_PROTO_MIN(v) (((v) >> GHCB_MSR_VER_MIN_POS) & GHCB_MSR_VER_MIN_MASK) +#define GHCB_MSR_PROTO_MAX(v) (((v) >> 48) & 0xffff) +#define GHCB_MSR_PROTO_MIN(v) (((v) >> 32) & 0xffff) /* CPUID Request/Response */ #define GHCB_MSR_CPUID_REQ 0x004 @@ -46,31 +45,48 @@ #define GHCB_CPUID_REQ_EBX 1 #define GHCB_CPUID_REQ_ECX 2 #define GHCB_CPUID_REQ_EDX 3 -#define GHCB_CPUID_REQ(fn, reg) \ - (GHCB_MSR_CPUID_REQ | \ - (((unsigned long)reg & GHCB_MSR_CPUID_REG_MASK) << GHCB_MSR_CPUID_REG_POS) | \ - (((unsigned long)fn) << GHCB_MSR_CPUID_FUNC_POS)) +#define GHCB_CPUID_REQ(fn, reg) \ + /* GHCBData[11:0] */ \ + (GHCB_MSR_CPUID_REQ | \ + /* GHCBData[31:12] */ \ + (((unsigned long)(reg) & 0x3) << 30) | \ + /* GHCBData[63:32] */ \ + (((unsigned long)fn) << 32)) /* AP Reset Hold */ -#define GHCB_MSR_AP_RESET_HOLD_REQ 0x006 -#define GHCB_MSR_AP_RESET_HOLD_RESP 0x007 +#define GHCB_MSR_AP_RESET_HOLD_REQ 0x006 +#define GHCB_MSR_AP_RESET_HOLD_RESP 0x007 /* GHCB Hypervisor Feature Request/Response */ -#define GHCB_MSR_HV_FT_REQ 0x080 -#define GHCB_MSR_HV_FT_RESP 0x081 +#define GHCB_MSR_HV_FT_REQ 0x080 +#define GHCB_MSR_HV_FT_RESP 0x081 #define GHCB_MSR_TERM_REQ 0x100 #define GHCB_MSR_TERM_REASON_SET_POS 12 #define GHCB_MSR_TERM_REASON_SET_MASK 0xf #define GHCB_MSR_TERM_REASON_POS 16 #define GHCB_MSR_TERM_REASON_MASK 0xff -#define GHCB_SEV_TERM_REASON(reason_set, reason_val) \ - (((((u64)reason_set) & GHCB_MSR_TERM_REASON_SET_MASK) << GHCB_MSR_TERM_REASON_SET_POS) | \ - ((((u64)reason_val) & GHCB_MSR_TERM_REASON_MASK) << GHCB_MSR_TERM_REASON_POS)) -#define GHCB_SEV_ES_REASON_GENERAL_REQUEST 0 -#define GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED 1 +#define GHCB_SEV_TERM_REASON(reason_set, reason_val) \ + /* GHCBData[15:12] */ \ + (((((u64)reason_set) & 0xf) << 12) | \ + /* GHCBData[23:16] */ \ + ((((u64)reason_val) & 0xff) << 16)) + +#define GHCB_SEV_ES_GEN_REQ 0 +#define GHCB_SEV_ES_PROT_UNSUPPORTED 1 #define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK) +/* + * Error codes related to GHCB input that can be communicated back to the guest + * by setting the lower 32-bits of the GHCB SW_EXITINFO1 field to 2. + */ +#define GHCB_ERR_NOT_REGISTERED 1 +#define GHCB_ERR_INVALID_USAGE 2 +#define GHCB_ERR_INVALID_SCRATCH_AREA 3 +#define GHCB_ERR_MISSING_INPUT 4 +#define GHCB_ERR_INVALID_INPUT 5 +#define GHCB_ERR_INVALID_EVENT 6 + #endif diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index fa5cd05d3b5b..ec060c433589 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -53,6 +53,7 @@ static inline u64 lower_bits(u64 val, unsigned int bits) struct real_mode_header; enum stack_type; +struct ghcb; /* Early IDT entry points for #VC handler */ extern void vc_no_ghcb(void); @@ -81,6 +82,11 @@ static __always_inline void sev_es_nmi_complete(void) __sev_es_nmi_complete(); } extern int __init sev_es_efi_map_ghcbs(pgd_t *pgd); +extern enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, + bool set_ghcb_msr, + struct es_em_ctxt *ctxt, + u64 exit_code, u64 exit_info_1, + u64 exit_info_2); #else static inline void sev_es_ist_enter(struct pt_regs *regs) { } static inline void sev_es_ist_exit(void) { } diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h index 05f3e21f01a7..3f9334ef67cd 100644 --- a/arch/x86/include/asm/sgx.h +++ b/arch/x86/include/asm/sgx.h @@ -46,6 +46,24 @@ enum sgx_encls_function { }; /** + * SGX_ENCLS_FAULT_FLAG - flag signifying an ENCLS return code is a trapnr + * + * ENCLS has its own (positive value) error codes and also generates + * ENCLS specific #GP and #PF faults. And the ENCLS values get munged + * with system error codes as everything percolates back up the stack. + * Unfortunately (for us), we need to precisely identify each unique + * error code, e.g. the action taken if EWB fails varies based on the + * type of fault and on the exact SGX error code, i.e. we can't simply + * convert all faults to -EFAULT. + * + * To make all three error types coexist, we set bit 30 to identify an + * ENCLS fault. Bit 31 (technically bits N:31) is used to differentiate + * between positive (faults and SGX error codes) and negative (system + * error codes) values. + */ +#define SGX_ENCLS_FAULT_FLAG 0x40000000 + +/** * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV * %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not * been completed yet. diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 630ff08532be..81a0211a372d 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -16,7 +16,9 @@ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map); DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map); /* cpus sharing the last level cache: */ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); +DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map); DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id); +DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id); DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number); static inline struct cpumask *cpu_llc_shared_mask(int cpu) @@ -24,6 +26,11 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu) return per_cpu(cpu_llc_shared_map, cpu); } +static inline struct cpumask *cpu_l2c_shared_mask(int cpu) +{ + return per_cpu(cpu_l2c_shared_map, cpu); +} + DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid); DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid); DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid); @@ -119,6 +126,7 @@ static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask) void cpu_disable_common(void); void native_smp_prepare_boot_cpu(void); +void smp_prepare_cpus_common(void); void native_smp_prepare_cpus(unsigned int max_cpus); void calculate_max_logical_packages(void); void native_smp_cpus_done(unsigned int max_cpus); diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index f248eb2ac2d4..3881b5333eb8 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -38,6 +38,16 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, bool get_stack_info_noinstr(unsigned long *stack, struct task_struct *task, struct stack_info *info); +static __always_inline +bool get_stack_guard_info(unsigned long *stack, struct stack_info *info) +{ + /* make sure it's not in the stack proper */ + if (get_stack_info_noinstr(stack, current, info)) + return false; + /* but if it is in the page below it, we hit a guard */ + return get_stack_info_noinstr((void *)stack + PAGE_SIZE, current, info); +} + const char *stack_type_name(enum stack_type type); static inline bool on_stack(struct stack_info *info, void *addr, size_t len) diff --git a/arch/x86/include/asm/static_call.h b/arch/x86/include/asm/static_call.h index cbb67b6030f9..ed4f8bb6c2d9 100644 --- a/arch/x86/include/asm/static_call.h +++ b/arch/x86/include/asm/static_call.h @@ -27,6 +27,7 @@ ".globl " STATIC_CALL_TRAMP_STR(name) " \n" \ STATIC_CALL_TRAMP_STR(name) ": \n" \ insns " \n" \ + ".byte 0x53, 0x43, 0x54 \n" \ ".type " STATIC_CALL_TRAMP_STR(name) ", @function \n" \ ".size " STATIC_CALL_TRAMP_STR(name) ", . - " STATIC_CALL_TRAMP_STR(name) " \n" \ ".popsection \n") @@ -35,7 +36,7 @@ __ARCH_DEFINE_STATIC_CALL_TRAMP(name, ".byte 0xe9; .long " #func " - (. + 4)") #define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) \ - __ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret; nop; nop; nop; nop") + __ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret; int3; nop; nop; nop") #define ARCH_ADD_TRAMP_KEY(name) \ diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h index f74362b05619..32c0d981a82a 100644 --- a/arch/x86/include/asm/string_32.h +++ b/arch/x86/include/asm/string_32.h @@ -146,42 +146,9 @@ static __always_inline void *__constant_memcpy(void *to, const void *from, extern void *memcpy(void *, const void *, size_t); #ifndef CONFIG_FORTIFY_SOURCE -#ifdef CONFIG_X86_USE_3DNOW - -#include <asm/mmx.h> - -/* - * This CPU favours 3DNow strongly (eg AMD Athlon) - */ - -static inline void *__constant_memcpy3d(void *to, const void *from, size_t len) -{ - if (len < 512) - return __constant_memcpy(to, from, len); - return _mmx_memcpy(to, from, len); -} - -static inline void *__memcpy3d(void *to, const void *from, size_t len) -{ - if (len < 512) - return __memcpy(to, from, len); - return _mmx_memcpy(to, from, len); -} - -#define memcpy(t, f, n) \ - (__builtin_constant_p((n)) \ - ? __constant_memcpy3d((t), (f), (n)) \ - : __memcpy3d((t), (f), (n))) - -#else - -/* - * No 3D Now! - */ #define memcpy(t, f, n) __builtin_memcpy(t, f, n) -#endif #endif /* !CONFIG_FORTIFY_SOURCE */ #define __HAVE_ARCH_MEMMOVE diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index f7e2d82d24fb..5b85987a5e97 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -87,15 +87,6 @@ static inline void syscall_get_arguments(struct task_struct *task, memcpy(args, ®s->bx, 6 * sizeof(args[0])); } -static inline void syscall_set_arguments(struct task_struct *task, - struct pt_regs *regs, - unsigned int i, unsigned int n, - const unsigned long *args) -{ - BUG_ON(i + n > 6); - memcpy(®s->bx + i, args, n * sizeof(args[0])); -} - static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_I386; @@ -127,30 +118,6 @@ static inline void syscall_get_arguments(struct task_struct *task, } } -static inline void syscall_set_arguments(struct task_struct *task, - struct pt_regs *regs, - const unsigned long *args) -{ -# ifdef CONFIG_IA32_EMULATION - if (task->thread_info.status & TS_COMPAT) { - regs->bx = *args++; - regs->cx = *args++; - regs->dx = *args++; - regs->si = *args++; - regs->di = *args++; - regs->bp = *args; - } else -# endif - { - regs->di = *args++; - regs->si = *args++; - regs->dx = *args++; - regs->r10 = *args++; - regs->r8 = *args++; - regs->r9 = *args; - } -} - static inline int syscall_get_arch(struct task_struct *task) { /* x32 tasks should be considered AUDIT_ARCH_X86_64. */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index cf132663c219..ebec69c35e95 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -57,6 +57,9 @@ struct thread_info { unsigned long flags; /* low level flags */ unsigned long syscall_work; /* SYSCALL_WORK_ flags */ u32 status; /* thread synchronous flags */ +#ifdef CONFIG_SMP + u32 cpu; /* current CPU */ +#endif }; #define INIT_THREAD_INFO(tsk) \ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index b587a9ee9cb2..98fa0a114074 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -261,4 +261,9 @@ extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch); #endif /* !MODULE */ +static inline void __native_tlb_flush_global(unsigned long cr4) +{ + native_write_cr4(cr4 ^ X86_CR4_PGE); + native_write_cr4(cr4); +} #endif /* _ASM_X86_TLBFLUSH_H */ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 9239399e5491..2f0b6be8eaab 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -103,6 +103,7 @@ static inline void setup_node_to_cpumask_map(void) { } #include <asm-generic/topology.h> extern const struct cpumask *cpu_coregroup_mask(int cpu); +extern const struct cpumask *cpu_clustergroup_mask(int cpu); #define topology_logical_package_id(cpu) (cpu_data(cpu).logical_proc_id) #define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id) @@ -113,7 +114,9 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu); extern unsigned int __max_die_per_package; #ifdef CONFIG_SMP +#define topology_cluster_id(cpu) (per_cpu(cpu_l2c_id, cpu)) #define topology_die_cpumask(cpu) (per_cpu(cpu_die_map, cpu)) +#define topology_cluster_cpumask(cpu) (cpu_clustergroup_mask(cpu)) #define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) #define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) @@ -218,7 +221,7 @@ static inline void arch_set_max_freq_ratio(bool turbo_disabled) } #endif -#ifdef CONFIG_ACPI_CPPC_LIB +#if defined(CONFIG_ACPI_CPPC_LIB) && defined(CONFIG_SMP) void init_freq_invariance_cppc(void); #define init_freq_invariance_cppc init_freq_invariance_cppc #endif diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h index 879b77792f94..4645a6334063 100644 --- a/arch/x86/include/asm/trace/fpu.h +++ b/arch/x86/include/asm/trace/fpu.h @@ -22,8 +22,8 @@ DECLARE_EVENT_CLASS(x86_fpu, __entry->fpu = fpu; __entry->load_fpu = test_thread_flag(TIF_NEED_FPU_LOAD); if (boot_cpu_has(X86_FEATURE_OSXSAVE)) { - __entry->xfeatures = fpu->state.xsave.header.xfeatures; - __entry->xcomp_bv = fpu->state.xsave.header.xcomp_bv; + __entry->xfeatures = fpu->fpstate->regs.xsave.header.xfeatures; + __entry->xcomp_bv = fpu->fpstate->regs.xsave.header.xcomp_bv; } ), TP_printk("x86/fpu: %p load: %d xfeatures: %llx xcomp_bv: %llx", diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 7f7200021bd1..6221be7cafc3 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -40,9 +40,9 @@ void math_emulate(struct math_emu_info *); bool fault_in_kernel_space(unsigned long address); #ifdef CONFIG_VMAP_STACK -void __noreturn handle_stack_overflow(const char *message, - struct pt_regs *regs, - unsigned long fault_address); +void __noreturn handle_stack_overflow(struct pt_regs *regs, + unsigned long fault_address, + struct stack_info *info); #endif #endif /* _ASM_X86_TRAPS_H */ diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 5c95d242f38d..ac96f9b2d64b 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -314,11 +314,12 @@ do { \ do { \ __chk_user_ptr(ptr); \ switch (size) { \ - unsigned char x_u8__; \ - case 1: \ + case 1: { \ + unsigned char x_u8__; \ __get_user_asm(x_u8__, ptr, "b", "=q", label); \ (x) = x_u8__; \ break; \ + } \ case 2: \ __get_user_asm(x, ptr, "w", "=r", label); \ break; \ @@ -351,24 +352,22 @@ do { \ "1: movl %[lowbits],%%eax\n" \ "2: movl %[highbits],%%edx\n" \ "3:\n" \ - ".section .fixup,\"ax\"\n" \ - "4: mov %[efault],%[errout]\n" \ - " xorl %%eax,%%eax\n" \ - " xorl %%edx,%%edx\n" \ - " jmp 3b\n" \ - ".previous\n" \ - _ASM_EXTABLE_UA(1b, 4b) \ - _ASM_EXTABLE_UA(2b, 4b) \ + _ASM_EXTABLE_TYPE_REG(1b, 3b, EX_TYPE_EFAULT_REG | \ + EX_FLAG_CLEAR_AX_DX, \ + %[errout]) \ + _ASM_EXTABLE_TYPE_REG(2b, 3b, EX_TYPE_EFAULT_REG | \ + EX_FLAG_CLEAR_AX_DX, \ + %[errout]) \ : [errout] "=r" (retval), \ [output] "=&A"(x) \ : [lowbits] "m" (__m(__ptr)), \ [highbits] "m" __m(((u32 __user *)(__ptr)) + 1), \ - [efault] "i" (-EFAULT), "0" (retval)); \ + "0" (retval)); \ }) #else #define __get_user_asm_u64(x, ptr, retval) \ - __get_user_asm(x, ptr, retval, "q", "=r") + __get_user_asm(x, ptr, retval, "q") #endif #define __get_user_size(x, ptr, size, retval) \ @@ -379,14 +378,14 @@ do { \ __chk_user_ptr(ptr); \ switch (size) { \ case 1: \ - __get_user_asm(x_u8__, ptr, retval, "b", "=q"); \ + __get_user_asm(x_u8__, ptr, retval, "b"); \ (x) = x_u8__; \ break; \ case 2: \ - __get_user_asm(x, ptr, retval, "w", "=r"); \ + __get_user_asm(x, ptr, retval, "w"); \ break; \ case 4: \ - __get_user_asm(x, ptr, retval, "l", "=r"); \ + __get_user_asm(x, ptr, retval, "l"); \ break; \ case 8: \ __get_user_asm_u64(x, ptr, retval); \ @@ -396,22 +395,19 @@ do { \ } \ } while (0) -#define __get_user_asm(x, addr, err, itype, ltype) \ +#define __get_user_asm(x, addr, err, itype) \ asm volatile("\n" \ "1: mov"itype" %[umem],%[output]\n" \ "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: mov %[efault],%[errout]\n" \ - " xorl %k[output],%k[output]\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE_UA(1b, 3b) \ + _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG | \ + EX_FLAG_CLEAR_AX, \ + %[errout]) \ : [errout] "=r" (err), \ - [output] ltype(x) \ + [output] "=a" (x) \ : [umem] "m" (__m(addr)), \ - [efault] "i" (-EFAULT), "0" (err)) + "0" (err)) -#endif // CONFIG_CC_ASM_GOTO_OUTPUT +#endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT /* FIXME: this hack is definitely wrong -AK */ struct __large_struct { unsigned long buf[100]; }; diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index 70fc159ebe69..2a1f8734416d 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -4,6 +4,7 @@ #include <linux/sched.h> #include <linux/ftrace.h> +#include <linux/kprobes.h> #include <asm/ptrace.h> #include <asm/stacktrace.h> @@ -15,6 +16,9 @@ struct unwind_state { unsigned long stack_mask; struct task_struct *task; int graph_idx; +#ifdef CONFIG_KRETPROBES + struct llist_node *kr_cur; +#endif bool error; #if defined(CONFIG_UNWINDER_ORC) bool signal, full_regs; @@ -99,6 +103,31 @@ void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, void *orc, size_t orc_size) {} #endif +static inline +unsigned long unwind_recover_kretprobe(struct unwind_state *state, + unsigned long addr, unsigned long *addr_p) +{ +#ifdef CONFIG_KRETPROBES + return is_kretprobe_trampoline(addr) ? + kretprobe_find_ret_addr(state->task, addr_p, &state->kr_cur) : + addr; +#else + return addr; +#endif +} + +/* Recover the return address modified by kretprobe and ftrace_graph. */ +static inline +unsigned long unwind_recover_ret_addr(struct unwind_state *state, + unsigned long addr, unsigned long *addr_p) +{ + unsigned long ret; + + ret = ftrace_graph_ret_addr(state->task, &state->graph_idx, + addr, addr_p); + return unwind_recover_kretprobe(state, ret, addr_p); +} + /* * This disables KASAN checking when reading a value from another task's stack, * since the other task could be running on another CPU and could have poisoned diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h index 8e574c0afef8..8b33674288ea 100644 --- a/arch/x86/include/asm/unwind_hints.h +++ b/arch/x86/include/asm/unwind_hints.h @@ -52,6 +52,11 @@ UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=8 type=UNWIND_HINT_TYPE_FUNC .endm +#else + +#define UNWIND_HINT_FUNC \ + UNWIND_HINT(ORC_REG_SP, 8, UNWIND_HINT_TYPE_FUNC, 0) + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_UNWIND_HINTS_H */ diff --git a/arch/x86/include/asm/word-at-a-time.h b/arch/x86/include/asm/word-at-a-time.h index 06006b0351f3..8338b0432b50 100644 --- a/arch/x86/include/asm/word-at-a-time.h +++ b/arch/x86/include/asm/word-at-a-time.h @@ -77,30 +77,58 @@ static inline unsigned long find_zero(unsigned long mask) * and the next page not being mapped, take the exception and * return zeroes in the non-existing part. */ +#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT + static inline unsigned long load_unaligned_zeropad(const void *addr) { - unsigned long ret, dummy; + unsigned long offset, data; + unsigned long ret; + + asm_volatile_goto( + "1: mov %[mem], %[ret]\n" + + _ASM_EXTABLE(1b, %l[do_exception]) + + : [ret] "=r" (ret) + : [mem] "m" (*(unsigned long *)addr) + : : do_exception); + + return ret; + +do_exception: + offset = (unsigned long)addr & (sizeof(long) - 1); + addr = (void *)((unsigned long)addr & ~(sizeof(long) - 1)); + data = *(unsigned long *)addr; + ret = data >> offset * 8; + + return ret; +} - asm( - "1:\tmov %2,%0\n" +#else /* !CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ + +static inline unsigned long load_unaligned_zeropad(const void *addr) +{ + unsigned long offset, data; + unsigned long ret, err = 0; + + asm( "1: mov %[mem], %[ret]\n" "2:\n" - ".section .fixup,\"ax\"\n" - "3:\t" - "lea %2,%1\n\t" - "and %3,%1\n\t" - "mov (%1),%0\n\t" - "leal %2,%%ecx\n\t" - "andl %4,%%ecx\n\t" - "shll $3,%%ecx\n\t" - "shr %%cl,%0\n\t" - "jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) - :"=&r" (ret),"=&c" (dummy) - :"m" (*(unsigned long *)addr), - "i" (-sizeof(unsigned long)), - "i" (sizeof(unsigned long)-1)); + + _ASM_EXTABLE_FAULT(1b, 2b) + + : [ret] "=&r" (ret), "+a" (err) + : [mem] "m" (*(unsigned long *)addr)); + + if (unlikely(err)) { + offset = (unsigned long)addr & (sizeof(long) - 1); + addr = (void *)((unsigned long)addr & ~(sizeof(long) - 1)); + data = *(unsigned long *)addr; + ret = data >> offset * 8; + } + return ret; } +#endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ + #endif /* _ASM_WORD_AT_A_TIME_H */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 5c69f7eb5d47..22b7412c08f6 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -289,12 +289,6 @@ struct x86_platform_ops { struct x86_hyper_runtime hyper; }; -struct pci_dev; - -struct x86_msi_ops { - void (*restore_msi_irqs)(struct pci_dev *dev); -}; - struct x86_apic_ops { unsigned int (*io_apic_read) (unsigned int apic, unsigned int reg); void (*restore)(void); diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 454b20815f35..e5e0fe10c692 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -248,6 +248,7 @@ privcmd_call(unsigned int call, return res; } +#ifdef CONFIG_XEN_PV static inline int HYPERVISOR_set_trap_table(struct trap_info *table) { @@ -280,6 +281,107 @@ HYPERVISOR_callback_op(int cmd, void *arg) return _hypercall2(int, callback_op, cmd, arg); } +static __always_inline int +HYPERVISOR_set_debugreg(int reg, unsigned long value) +{ + return _hypercall2(int, set_debugreg, reg, value); +} + +static __always_inline unsigned long +HYPERVISOR_get_debugreg(int reg) +{ + return _hypercall1(unsigned long, get_debugreg, reg); +} + +static inline int +HYPERVISOR_update_descriptor(u64 ma, u64 desc) +{ + return _hypercall2(int, update_descriptor, ma, desc); +} + +static inline int +HYPERVISOR_update_va_mapping(unsigned long va, pte_t new_val, + unsigned long flags) +{ + return _hypercall3(int, update_va_mapping, va, new_val.pte, flags); +} + +static inline int +HYPERVISOR_set_segment_base(int reg, unsigned long value) +{ + return _hypercall2(int, set_segment_base, reg, value); +} + +static inline void +MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) +{ + mcl->op = __HYPERVISOR_fpu_taskswitch; + mcl->args[0] = set; + + trace_xen_mc_entry(mcl, 1); +} + +static inline void +MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va, + pte_t new_val, unsigned long flags) +{ + mcl->op = __HYPERVISOR_update_va_mapping; + mcl->args[0] = va; + mcl->args[1] = new_val.pte; + mcl->args[2] = flags; + + trace_xen_mc_entry(mcl, 3); +} + +static inline void +MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr, + struct desc_struct desc) +{ + mcl->op = __HYPERVISOR_update_descriptor; + mcl->args[0] = maddr; + mcl->args[1] = *(unsigned long *)&desc; + + trace_xen_mc_entry(mcl, 2); +} + +static inline void +MULTI_mmu_update(struct multicall_entry *mcl, struct mmu_update *req, + int count, int *success_count, domid_t domid) +{ + mcl->op = __HYPERVISOR_mmu_update; + mcl->args[0] = (unsigned long)req; + mcl->args[1] = count; + mcl->args[2] = (unsigned long)success_count; + mcl->args[3] = domid; + + trace_xen_mc_entry(mcl, 4); +} + +static inline void +MULTI_mmuext_op(struct multicall_entry *mcl, struct mmuext_op *op, int count, + int *success_count, domid_t domid) +{ + mcl->op = __HYPERVISOR_mmuext_op; + mcl->args[0] = (unsigned long)op; + mcl->args[1] = count; + mcl->args[2] = (unsigned long)success_count; + mcl->args[3] = domid; + + trace_xen_mc_entry(mcl, 4); +} + +static inline void +MULTI_stack_switch(struct multicall_entry *mcl, + unsigned long ss, unsigned long esp) +{ + mcl->op = __HYPERVISOR_stack_switch; + mcl->args[0] = ss; + mcl->args[1] = esp; + + trace_xen_mc_entry(mcl, 2); +} +#endif + static inline int HYPERVISOR_sched_op(int cmd, void *arg) { @@ -308,26 +410,6 @@ HYPERVISOR_platform_op(struct xen_platform_op *op) return _hypercall1(int, platform_op, op); } -static inline int -HYPERVISOR_set_debugreg(int reg, unsigned long value) -{ - return _hypercall2(int, set_debugreg, reg, value); -} - -static inline unsigned long -HYPERVISOR_get_debugreg(int reg) -{ - return _hypercall1(unsigned long, get_debugreg, reg); -} - -static inline int -HYPERVISOR_update_descriptor(u64 ma, u64 desc) -{ - if (sizeof(u64) == sizeof(long)) - return _hypercall2(int, update_descriptor, ma, desc); - return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32); -} - static inline long HYPERVISOR_memory_op(unsigned int cmd, void *arg) { @@ -341,24 +423,12 @@ HYPERVISOR_multicall(void *call_list, uint32_t nr_calls) } static inline int -HYPERVISOR_update_va_mapping(unsigned long va, pte_t new_val, - unsigned long flags) -{ - if (sizeof(new_val) == sizeof(long)) - return _hypercall3(int, update_va_mapping, va, - new_val.pte, flags); - else - return _hypercall4(int, update_va_mapping, va, - new_val.pte, new_val.pte >> 32, flags); -} - -static inline int HYPERVISOR_event_channel_op(int cmd, void *arg) { return _hypercall2(int, event_channel_op, cmd, arg); } -static inline int +static __always_inline int HYPERVISOR_xen_version(int cmd, void *arg) { return _hypercall2(int, xen_version, cmd, arg); @@ -394,14 +464,6 @@ HYPERVISOR_vcpu_op(int cmd, int vcpuid, void *extra_args) return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args); } -#ifdef CONFIG_X86_64 -static inline int -HYPERVISOR_set_segment_base(int reg, unsigned long value) -{ - return _hypercall2(int, set_segment_base, reg, value); -} -#endif - static inline int HYPERVISOR_suspend(unsigned long start_info_mfn) { @@ -423,13 +485,6 @@ HYPERVISOR_hvm_op(int op, void *arg) } static inline int -HYPERVISOR_tmem_op( - struct tmem_op *op) -{ - return _hypercall1(int, tmem_op, op); -} - -static inline int HYPERVISOR_xenpmu_op(unsigned int op, void *arg) { return _hypercall2(int, xenpmu_op, op, arg); @@ -446,88 +501,4 @@ HYPERVISOR_dm_op( return ret; } -static inline void -MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) -{ - mcl->op = __HYPERVISOR_fpu_taskswitch; - mcl->args[0] = set; - - trace_xen_mc_entry(mcl, 1); -} - -static inline void -MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va, - pte_t new_val, unsigned long flags) -{ - mcl->op = __HYPERVISOR_update_va_mapping; - mcl->args[0] = va; - if (sizeof(new_val) == sizeof(long)) { - mcl->args[1] = new_val.pte; - mcl->args[2] = flags; - } else { - mcl->args[1] = new_val.pte; - mcl->args[2] = new_val.pte >> 32; - mcl->args[3] = flags; - } - - trace_xen_mc_entry(mcl, sizeof(new_val) == sizeof(long) ? 3 : 4); -} - -static inline void -MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr, - struct desc_struct desc) -{ - mcl->op = __HYPERVISOR_update_descriptor; - if (sizeof(maddr) == sizeof(long)) { - mcl->args[0] = maddr; - mcl->args[1] = *(unsigned long *)&desc; - } else { - u32 *p = (u32 *)&desc; - - mcl->args[0] = maddr; - mcl->args[1] = maddr >> 32; - mcl->args[2] = *p++; - mcl->args[3] = *p; - } - - trace_xen_mc_entry(mcl, sizeof(maddr) == sizeof(long) ? 2 : 4); -} - -static inline void -MULTI_mmu_update(struct multicall_entry *mcl, struct mmu_update *req, - int count, int *success_count, domid_t domid) -{ - mcl->op = __HYPERVISOR_mmu_update; - mcl->args[0] = (unsigned long)req; - mcl->args[1] = count; - mcl->args[2] = (unsigned long)success_count; - mcl->args[3] = domid; - - trace_xen_mc_entry(mcl, 4); -} - -static inline void -MULTI_mmuext_op(struct multicall_entry *mcl, struct mmuext_op *op, int count, - int *success_count, domid_t domid) -{ - mcl->op = __HYPERVISOR_mmuext_op; - mcl->args[0] = (unsigned long)op; - mcl->args[1] = count; - mcl->args[2] = (unsigned long)success_count; - mcl->args[3] = domid; - - trace_xen_mc_entry(mcl, 4); -} - -static inline void -MULTI_stack_switch(struct multicall_entry *mcl, - unsigned long ss, unsigned long esp) -{ - mcl->op = __HYPERVISOR_stack_switch; - mcl->args[0] = ss; - mcl->args[1] = esp; - - trace_xen_mc_entry(mcl, 2); -} - #endif /* _ASM_X86_XEN_HYPERCALL_H */ diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index ff4b52e37e60..1bf2ad34188a 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -57,9 +57,22 @@ static inline bool __init xen_x2apic_para_available(void) } #endif +struct pci_dev; + +#ifdef CONFIG_XEN_PV_DOM0 +bool xen_initdom_restore_msi(struct pci_dev *dev); +#else +static inline bool xen_initdom_restore_msi(struct pci_dev *dev) { return true; } +#endif + #ifdef CONFIG_HOTPLUG_CPU void xen_arch_register_cpu(int num); void xen_arch_unregister_cpu(int num); #endif +#ifdef CONFIG_PVH +void __init xen_pvh_init(struct boot_params *boot_params); +void __init mem_map_via_hcall(struct boot_params *boot_params_p); +#endif + #endif /* _ASM_X86_XEN_HYPERVISOR_H */ diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 1a162e559753..e989bc2269f5 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -96,11 +96,7 @@ static inline int xen_safe_write_ulong(unsigned long *addr, unsigned long val) asm volatile("1: mov %[val], %[ptr]\n" "2:\n" - ".section .fixup, \"ax\"\n" - "3: sub $1, %[ret]\n" - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) + _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %[ret]) : [ret] "+r" (ret), [ptr] "=m" (*addr) : [val] "r" (val)); @@ -110,16 +106,12 @@ static inline int xen_safe_write_ulong(unsigned long *addr, unsigned long val) static inline int xen_safe_read_ulong(const unsigned long *addr, unsigned long *val) { - int ret = 0; unsigned long rval = ~0ul; + int ret = 0; asm volatile("1: mov %[ptr], %[rval]\n" "2:\n" - ".section .fixup, \"ax\"\n" - "3: sub $1, %[ret]\n" - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) + _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %[ret]) : [ret] "+r" (ret), [rval] "+r" (rval) : [ptr] "m" (*addr)); *val = rval; diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h index 3506d8c598c1..9015b888edd6 100644 --- a/arch/x86/include/asm/xen/pci.h +++ b/arch/x86/include/asm/xen/pci.h @@ -14,29 +14,13 @@ static inline int pci_xen_hvm_init(void) return -1; } #endif -#if defined(CONFIG_XEN_DOM0) +#ifdef CONFIG_XEN_PV_DOM0 int __init pci_xen_initial_domain(void); -int xen_find_device_domain_owner(struct pci_dev *dev); -int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain); -int xen_unregister_device_domain_owner(struct pci_dev *dev); #else static inline int __init pci_xen_initial_domain(void) { return -1; } -static inline int xen_find_device_domain_owner(struct pci_dev *dev) -{ - return -1; -} -static inline int xen_register_device_domain_owner(struct pci_dev *dev, - uint16_t domain) -{ - return -1; -} -static inline int xen_unregister_device_domain_owner(struct pci_dev *dev) -{ - return -1; -} #endif #if defined(CONFIG_PCI_MSI) diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 2ef1f6513c68..2da3316bb559 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -373,9 +373,23 @@ struct kvm_debugregs { __u64 reserved[9]; }; -/* for KVM_CAP_XSAVE */ +/* for KVM_CAP_XSAVE and KVM_CAP_XSAVE2 */ struct kvm_xsave { + /* + * KVM_GET_XSAVE2 and KVM_SET_XSAVE write and read as many bytes + * as are returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) + * respectively, when invoked on the vm file descriptor. + * + * The size value returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2) + * will always be at least 4096. Currently, it is only greater + * than 4096 if a dynamic feature has been enabled with + * ``arch_prctl()``, but this may change in the future. + * + * The offsets of the state save areas in struct kvm_xsave follow + * the contents of CPUID leaf 0xD on the host. + */ __u32 region[1024]; + __u32 extra[0]; }; #define KVM_MAX_XCRS 16 @@ -504,4 +518,8 @@ struct kvm_pmu_event_filter { #define KVM_PMU_EVENT_ALLOW 0 #define KVM_PMU_EVENT_DENY 1 +/* for KVM_{GET,SET,HAS}_DEVICE_ATTR */ +#define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */ +#define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */ + #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 5146bbab84d4..6e64b27b2c1e 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -8,6 +8,7 @@ * should be used to determine that a VM is running under KVM. */ #define KVM_CPUID_SIGNATURE 0x40000000 +#define KVM_SIGNATURE "KVMKVMKVM\0\0\0" /* This CPUID returns two feature bitmaps in eax, edx. Before enabling * a particular paravirtualization, the appropriate feature bit should diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h index 5a6aac9fa41f..500b96e71f18 100644 --- a/arch/x86/include/uapi/asm/prctl.h +++ b/arch/x86/include/uapi/asm/prctl.h @@ -2,16 +2,22 @@ #ifndef _ASM_X86_PRCTL_H #define _ASM_X86_PRCTL_H -#define ARCH_SET_GS 0x1001 -#define ARCH_SET_FS 0x1002 -#define ARCH_GET_FS 0x1003 -#define ARCH_GET_GS 0x1004 +#define ARCH_SET_GS 0x1001 +#define ARCH_SET_FS 0x1002 +#define ARCH_GET_FS 0x1003 +#define ARCH_GET_GS 0x1004 -#define ARCH_GET_CPUID 0x1011 -#define ARCH_SET_CPUID 0x1012 +#define ARCH_GET_CPUID 0x1011 +#define ARCH_SET_CPUID 0x1012 -#define ARCH_MAP_VDSO_X32 0x2001 -#define ARCH_MAP_VDSO_32 0x2002 -#define ARCH_MAP_VDSO_64 0x2003 +#define ARCH_GET_XCOMP_SUPP 0x1021 +#define ARCH_GET_XCOMP_PERM 0x1022 +#define ARCH_REQ_XCOMP_PERM 0x1023 +#define ARCH_GET_XCOMP_GUEST_PERM 0x1024 +#define ARCH_REQ_XCOMP_GUEST_PERM 0x1025 + +#define ARCH_MAP_VDSO_X32 0x2001 +#define ARCH_MAP_VDSO_32 0x2002 +#define ARCH_MAP_VDSO_64 0x2003 #endif /* _ASM_X86_PRCTL_H */ diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h index 9690d6899ad9..f4b81587e90b 100644 --- a/arch/x86/include/uapi/asm/sgx.h +++ b/arch/x86/include/uapi/asm/sgx.h @@ -27,6 +27,8 @@ enum sgx_page_flags { _IOW(SGX_MAGIC, 0x02, struct sgx_enclave_init) #define SGX_IOC_ENCLAVE_PROVISION \ _IOW(SGX_MAGIC, 0x03, struct sgx_enclave_provision) +#define SGX_IOC_VEPC_REMOVE_ALL \ + _IO(SGX_MAGIC, 0x04) /** * struct sgx_enclave_create - parameter structure for the diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 8f4e8fa6ed75..6aef9ee28a39 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -21,6 +21,7 @@ CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_early_printk.o = -pg CFLAGS_REMOVE_head64.o = -pg CFLAGS_REMOVE_sev.o = -pg +CFLAGS_REMOVE_cc_platform.o = -pg endif KASAN_SANITIZE_head$(BITS).o := n @@ -29,6 +30,7 @@ KASAN_SANITIZE_dumpstack_$(BITS).o := n KASAN_SANITIZE_stacktrace.o := n KASAN_SANITIZE_paravirt.o := n KASAN_SANITIZE_sev.o := n +KASAN_SANITIZE_cc_platform.o := n # With some compiler versions the generated code results in boot hangs, caused # by several compilation units. To be safe, disable all instrumentation. @@ -47,6 +49,7 @@ endif KCOV_INSTRUMENT := n CFLAGS_head$(BITS).o += -fno-stack-protector +CFLAGS_cc_platform.o += -fno-stack-protector CFLAGS_irq.o := -I $(srctree)/$(src)/../include/asm/trace @@ -81,7 +84,7 @@ obj-$(CONFIG_IA32_EMULATION) += tls.o obj-y += step.o obj-$(CONFIG_INTEL_TXT) += tboot.o obj-$(CONFIG_ISA_DMA_API) += i8237.o -obj-$(CONFIG_STACKTRACE) += stacktrace.o +obj-y += stacktrace.o obj-y += cpu/ obj-y += acpi/ obj-y += reboot.o @@ -147,6 +150,9 @@ obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev.o + +obj-$(CONFIG_ARCH_HAS_CC_PLATFORM) += cc_platform.o + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 14bcd59bcdee..5b6d1a95776f 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -62,6 +62,7 @@ int acpi_fix_pin2_polarity __initdata; #ifdef CONFIG_X86_LOCAL_APIC static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; +static bool acpi_support_online_capable; #endif #ifdef CONFIG_X86_IO_APIC @@ -138,6 +139,8 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) pr_debug("Local APIC address 0x%08x\n", madt->address); } + if (madt->header.revision >= 5) + acpi_support_online_capable = true; default_acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id); @@ -239,6 +242,12 @@ acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end) if (processor->id == 0xff) return 0; + /* don't register processors that can not be onlined */ + if (acpi_support_online_capable && + !(processor->lapic_flags & ACPI_MADT_ENABLED) && + !(processor->lapic_flags & ACPI_MADT_ONLINE_CAPABLE)) + return 0; + /* * We need to register disabled CPU as well to permit * counting disabled CPUs. This allows us to size diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 7de599eba7f0..7945eae5b315 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -79,6 +79,21 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags, */ flags->bm_control = 0; } + if (c->x86_vendor == X86_VENDOR_AMD && c->x86 >= 0x17) { + /* + * For all AMD Zen or newer CPUs that support C3, caches + * should not be flushed by software while entering C3 + * type state. Set bm->check to 1 so that kernel doesn't + * need to execute cache flush operation. + */ + flags->bm_check = 1; + /* + * In current AMD C state implementation ARB_DIS is no longer + * used. So set bm_control to zero to indicate ARB_DIS is not + * required while entering C3 type state. + */ + flags->bm_control = 0; + } } EXPORT_SYMBOL(acpi_processor_power_init_bm_check); diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 3f85fcae450c..1e97f944b47d 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -139,8 +139,10 @@ static int __init acpi_sleep_setup(char *str) if (strncmp(str, "s3_beep", 7) == 0) acpi_realmode_flags |= 4; #ifdef CONFIG_HIBERNATION + if (strncmp(str, "s4_hwsig", 8) == 0) + acpi_check_s4_hw_signature(1); if (strncmp(str, "s4_nohwsig", 10) == 0) - acpi_no_s4_hw_signature(); + acpi_check_s4_hw_signature(0); #endif if (strncmp(str, "nonvs", 5) == 0) acpi_nvs_nosave(); diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S index daf88f8143c5..cf69081073b5 100644 --- a/arch/x86/kernel/acpi/wakeup_32.S +++ b/arch/x86/kernel/acpi/wakeup_32.S @@ -60,7 +60,7 @@ save_registers: popl saved_context_eflags movl $ret_point, saved_eip - ret + RET restore_registers: @@ -70,7 +70,7 @@ restore_registers: movl saved_context_edi, %edi pushl saved_context_eflags popfl - ret + RET SYM_CODE_START(do_suspend_lowlevel) call save_processor_state @@ -86,7 +86,7 @@ SYM_CODE_START(do_suspend_lowlevel) ret_point: call restore_registers call restore_processor_state - ret + RET SYM_CODE_END(do_suspend_lowlevel) .data diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index e9da3dc71254..5007c3ffe96f 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -29,6 +29,7 @@ #include <asm/io.h> #include <asm/fixmap.h> #include <asm/paravirt.h> +#include <asm/asm-prototypes.h> int __read_mostly alternatives_patched; @@ -113,6 +114,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len) } } +extern s32 __retpoline_sites[], __retpoline_sites_end[]; extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; void text_poke_early(void *addr, const void *opcode, size_t len); @@ -221,7 +223,7 @@ static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off) * "noinline" to cause control flow change and thus invalidate I$ and * cause refetch after modification. */ -static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr) +static void __init_or_module noinline optimize_nops(u8 *instr, size_t len) { struct insn insn; int i = 0; @@ -239,11 +241,11 @@ static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *ins * optimized. */ if (insn.length == 1 && insn.opcode.bytes[0] == 0x90) - i += optimize_nops_range(instr, a->instrlen, i); + i += optimize_nops_range(instr, len, i); else i += insn.length; - if (i >= a->instrlen) + if (i >= len) return; } } @@ -331,10 +333,185 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, text_poke_early(instr, insn_buff, insn_buff_sz); next: - optimize_nops(a, instr); + optimize_nops(instr, a->instrlen); } } +#if defined(CONFIG_RETPOLINE) && defined(CONFIG_STACK_VALIDATION) + +/* + * CALL/JMP *%\reg + */ +static int emit_indirect(int op, int reg, u8 *bytes) +{ + int i = 0; + u8 modrm; + + switch (op) { + case CALL_INSN_OPCODE: + modrm = 0x10; /* Reg = 2; CALL r/m */ + break; + + case JMP32_INSN_OPCODE: + modrm = 0x20; /* Reg = 4; JMP r/m */ + break; + + default: + WARN_ON_ONCE(1); + return -1; + } + + if (reg >= 8) { + bytes[i++] = 0x41; /* REX.B prefix */ + reg -= 8; + } + + modrm |= 0xc0; /* Mod = 3 */ + modrm += reg; + + bytes[i++] = 0xff; /* opcode */ + bytes[i++] = modrm; + + return i; +} + +/* + * Rewrite the compiler generated retpoline thunk calls. + * + * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate + * indirect instructions, avoiding the extra indirection. + * + * For example, convert: + * + * CALL __x86_indirect_thunk_\reg + * + * into: + * + * CALL *%\reg + * + * It also tries to inline spectre_v2=retpoline,amd when size permits. + */ +static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) +{ + retpoline_thunk_t *target; + int reg, ret, i = 0; + u8 op, cc; + + target = addr + insn->length + insn->immediate.value; + reg = target - __x86_indirect_thunk_array; + + if (WARN_ON_ONCE(reg & ~0xf)) + return -1; + + /* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */ + BUG_ON(reg == 4); + + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) && + !cpu_feature_enabled(X86_FEATURE_RETPOLINE_AMD)) + return -1; + + op = insn->opcode.bytes[0]; + + /* + * Convert: + * + * Jcc.d32 __x86_indirect_thunk_\reg + * + * into: + * + * Jncc.d8 1f + * [ LFENCE ] + * JMP *%\reg + * [ NOP ] + * 1: + */ + /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */ + if (op == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80) { + cc = insn->opcode.bytes[1] & 0xf; + cc ^= 1; /* invert condition */ + + bytes[i++] = 0x70 + cc; /* Jcc.d8 */ + bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */ + + /* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */ + op = JMP32_INSN_OPCODE; + } + + /* + * For RETPOLINE_AMD: prepend the indirect CALL/JMP with an LFENCE. + */ + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_AMD)) { + bytes[i++] = 0x0f; + bytes[i++] = 0xae; + bytes[i++] = 0xe8; /* LFENCE */ + } + + ret = emit_indirect(op, reg, bytes + i); + if (ret < 0) + return ret; + i += ret; + + for (; i < insn->length;) + bytes[i++] = BYTES_NOP1; + + return i; +} + +/* + * Generated by 'objtool --retpoline'. + */ +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + struct insn insn; + int len, ret; + u8 bytes[16]; + u8 op1, op2; + + ret = insn_decode_kernel(&insn, addr); + if (WARN_ON_ONCE(ret < 0)) + continue; + + op1 = insn.opcode.bytes[0]; + op2 = insn.opcode.bytes[1]; + + switch (op1) { + case CALL_INSN_OPCODE: + case JMP32_INSN_OPCODE: + break; + + case 0x0f: /* escape */ + if (op2 >= 0x80 && op2 <= 0x8f) + break; + fallthrough; + default: + WARN_ON_ONCE(1); + continue; + } + + DPRINTK("retpoline at: %pS (%px) len: %d to: %pS", + addr, addr, insn.length, + addr + insn.length + insn.immediate.value); + + len = patch_retpoline(addr, &insn, bytes); + if (len == insn.length) { + optimize_nops(bytes, len); + DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr); + DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr); + text_poke_early(addr, bytes, len); + } + } +} + +#else /* !RETPOLINES || !CONFIG_STACK_VALIDATION */ + +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } + +#endif /* CONFIG_RETPOLINE && CONFIG_STACK_VALIDATION */ + #ifdef CONFIG_SMP static void alternatives_smp_lock(const s32 *start, const s32 *end, u8 *text, u8 *text_end) @@ -537,7 +714,7 @@ asm ( " .type int3_magic, @function\n" "int3_magic:\n" " movl $1, (%" _ASM_ARG1 ")\n" -" ret\n" + ASM_RET " .size int3_magic, .-int3_magic\n" " .popsection\n" ); @@ -643,6 +820,12 @@ void __init alternative_instructions(void) apply_paravirt(__parainstructions, __parainstructions_end); /* + * Rewrite the retpolines, must be done before alternatives since + * those can rewrite the retpoline thunks. + */ + apply_retpolines(__retpoline_sites, __retpoline_sites_end); + + /* * Then patch alternatives, such that those paravirt calls that are in * alternatives can be overwritten by their immediate fragments. */ @@ -930,10 +1113,13 @@ void text_poke_sync(void) } struct text_poke_loc { - s32 rel_addr; /* addr := _stext + rel_addr */ - s32 rel32; + /* addr := _stext + rel_addr */ + s32 rel_addr; + s32 disp; + u8 len; u8 opcode; const u8 text[POKE_MAX_OPCODE_SIZE]; + /* see text_poke_bp_batch() */ u8 old; }; @@ -948,7 +1134,8 @@ static struct bp_patching_desc *bp_desc; static __always_inline struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp) { - struct bp_patching_desc *desc = __READ_ONCE(*descp); /* rcu_dereference */ + /* rcu_dereference */ + struct bp_patching_desc *desc = __READ_ONCE(*descp); if (!desc || !arch_atomic_inc_not_zero(&desc->refs)) return NULL; @@ -982,7 +1169,7 @@ noinstr int poke_int3_handler(struct pt_regs *regs) { struct bp_patching_desc *desc; struct text_poke_loc *tp; - int len, ret = 0; + int ret = 0; void *ip; if (user_mode(regs)) @@ -1022,8 +1209,7 @@ noinstr int poke_int3_handler(struct pt_regs *regs) goto out_put; } - len = text_opcode_size(tp->opcode); - ip += len; + ip += tp->len; switch (tp->opcode) { case INT3_INSN_OPCODE: @@ -1038,12 +1224,12 @@ noinstr int poke_int3_handler(struct pt_regs *regs) break; case CALL_INSN_OPCODE: - int3_emulate_call(regs, (long)ip + tp->rel32); + int3_emulate_call(regs, (long)ip + tp->disp); break; case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: - int3_emulate_jmp(regs, (long)ip + tp->rel32); + int3_emulate_jmp(regs, (long)ip + tp->disp); break; default: @@ -1118,7 +1304,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries */ for (do_sync = 0, i = 0; i < nr_entries; i++) { u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, }; - int len = text_opcode_size(tp[i].opcode); + int len = tp[i].len; if (len - INT3_INSN_SIZE > 0) { memcpy(old + INT3_INSN_SIZE, @@ -1195,21 +1381,37 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, const void *opcode, size_t len, const void *emulate) { struct insn insn; - int ret; + int ret, i; memcpy((void *)tp->text, opcode, len); if (!emulate) emulate = opcode; ret = insn_decode_kernel(&insn, emulate); - BUG_ON(ret < 0); - BUG_ON(len != insn.length); tp->rel_addr = addr - (void *)_stext; + tp->len = len; tp->opcode = insn.opcode.bytes[0]; switch (tp->opcode) { + case RET_INSN_OPCODE: + case JMP32_INSN_OPCODE: + case JMP8_INSN_OPCODE: + /* + * Control flow instructions without implied execution of the + * next instruction can be padded with INT3. + */ + for (i = insn.length; i < len; i++) + BUG_ON(tp->text[i] != INT3_INSN_OPCODE); + break; + + default: + BUG_ON(len != insn.length); + }; + + + switch (tp->opcode) { case INT3_INSN_OPCODE: case RET_INSN_OPCODE: break; @@ -1217,7 +1419,7 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, case CALL_INSN_OPCODE: case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: - tp->rel32 = insn.immediate.value; + tp->disp = insn.immediate.value; break; default: /* assume NOP */ @@ -1225,13 +1427,13 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, case 2: /* NOP2 -- emulate as JMP8+0 */ BUG_ON(memcmp(emulate, x86_nops[len], len)); tp->opcode = JMP8_INSN_OPCODE; - tp->rel32 = 0; + tp->disp = 0; break; case 5: /* NOP5 -- emulate as JMP32+0 */ BUG_ON(memcmp(emulate, x86_nops[len], len)); tp->opcode = JMP32_INSN_OPCODE; - tp->rel32 = 0; + tp->disp = 0; break; default: /* unknown instruction */ diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index c92c9c774c0e..020c906f7934 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -19,17 +19,19 @@ #define PCI_DEVICE_ID_AMD_17H_M10H_ROOT 0x15d0 #define PCI_DEVICE_ID_AMD_17H_M30H_ROOT 0x1480 #define PCI_DEVICE_ID_AMD_17H_M60H_ROOT 0x1630 +#define PCI_DEVICE_ID_AMD_19H_M10H_ROOT 0x14a4 #define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464 #define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec #define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494 #define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c #define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444 #define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654 +#define PCI_DEVICE_ID_AMD_19H_M10H_DF_F4 0x14b1 #define PCI_DEVICE_ID_AMD_19H_M40H_ROOT 0x14b5 #define PCI_DEVICE_ID_AMD_19H_M40H_DF_F4 0x167d #define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e -/* Protect the PCI config register pairs used for SMN and DF indirect access. */ +/* Protect the PCI config register pairs used for SMN. */ static DEFINE_MUTEX(smn_mutex); static u32 *flush_words; @@ -39,6 +41,7 @@ static const struct pci_device_id amd_root_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_ROOT) }, {} }; @@ -61,6 +64,7 @@ static const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F3) }, {} @@ -78,6 +82,7 @@ static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) }, @@ -182,53 +187,6 @@ int amd_smn_write(u16 node, u32 address, u32 value) } EXPORT_SYMBOL_GPL(amd_smn_write); -/* - * Data Fabric Indirect Access uses FICAA/FICAD. - * - * Fabric Indirect Configuration Access Address (FICAA): Constructed based - * on the device's Instance Id and the PCI function and register offset of - * the desired register. - * - * Fabric Indirect Configuration Access Data (FICAD): There are FICAD LO - * and FICAD HI registers but so far we only need the LO register. - */ -int amd_df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo) -{ - struct pci_dev *F4; - u32 ficaa; - int err = -ENODEV; - - if (node >= amd_northbridges.num) - goto out; - - F4 = node_to_amd_nb(node)->link; - if (!F4) - goto out; - - ficaa = 1; - ficaa |= reg & 0x3FC; - ficaa |= (func & 0x7) << 11; - ficaa |= instance_id << 16; - - mutex_lock(&smn_mutex); - - err = pci_write_config_dword(F4, 0x5C, ficaa); - if (err) { - pr_warn("Error writing DF Indirect FICAA, FICAA=0x%x\n", ficaa); - goto out_unlock; - } - - err = pci_read_config_dword(F4, 0x98, lo); - if (err) - pr_warn("Error reading DF Indirect FICAD LO, FICAA=0x%x.\n", ficaa); - -out_unlock: - mutex_unlock(&smn_mutex); - -out: - return err; -} -EXPORT_SYMBOL_GPL(amd_df_indirect_read); int amd_cache_northbridges(void) { diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 10562885f5fc..af3ba08b684b 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -73,12 +73,23 @@ static int gart_mem_pfn_is_ram(unsigned long pfn) (pfn >= aperture_pfn_start + aperture_page_count)); } +#ifdef CONFIG_PROC_VMCORE +static bool gart_oldmem_pfn_is_ram(struct vmcore_cb *cb, unsigned long pfn) +{ + return !!gart_mem_pfn_is_ram(pfn); +} + +static struct vmcore_cb gart_vmcore_cb = { + .pfn_is_ram = gart_oldmem_pfn_is_ram, +}; +#endif + static void __init exclude_from_core(u64 aper_base, u32 aper_order) { aperture_pfn_start = aper_base >> PAGE_SHIFT; aperture_page_count = (32 * 1024 * 1024) << aper_order >> PAGE_SHIFT; #ifdef CONFIG_PROC_VMCORE - WARN_ON(register_oldmem_pfn_is_ram(&gart_mem_pfn_is_ram)); + register_vmcore_cb(&gart_vmcore_cb); #endif #ifdef CONFIG_PROC_KCORE WARN_ON(register_mem_pfn_is_ram(&gart_mem_pfn_is_ram)); diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index dbacb9ec8843..7517eb05bdc1 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -19,6 +19,7 @@ #include <asm/hw_irq.h> #include <asm/apic.h> #include <asm/irq_remapping.h> +#include <asm/xen/hypervisor.h> struct irq_domain *x86_pci_msi_default_domain __ro_after_init; @@ -159,11 +160,8 @@ static struct irq_chip pci_msi_controller = { int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, msi_alloc_info_t *arg) { - struct pci_dev *pdev = to_pci_dev(dev); - struct msi_desc *desc = first_pci_msi_entry(pdev); - init_irq_alloc_info(arg, NULL); - if (desc->msi_attrib.is_msix) { + if (to_pci_dev(dev)->msix_enabled) { arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX; } else { arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSI; @@ -345,3 +343,8 @@ void dmar_free_hwirq(int irq) irq_domain_free_irqs(irq, 1); } #endif + +bool arch_restore_msi_irqs(struct pci_dev *dev) +{ + return xen_initdom_restore_msi(dev); +} diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index f4da9bb69a88..e696e22d0531 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -15,9 +15,15 @@ struct cluster_mask { struct cpumask mask; }; -static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); +/* + * __x2apic_send_IPI_mask() possibly needs to read + * x86_cpu_to_logical_apicid for all online cpus in a sequential way. + * Using per cpu variable would cost one cache line per cpu. + */ +static u32 *x86_cpu_to_logical_apicid __read_mostly; + static DEFINE_PER_CPU(cpumask_var_t, ipi_mask); -static DEFINE_PER_CPU(struct cluster_mask *, cluster_masks); +static DEFINE_PER_CPU_READ_MOSTLY(struct cluster_mask *, cluster_masks); static struct cluster_mask *cluster_hotplug_mask; static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) @@ -27,7 +33,7 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) static void x2apic_send_IPI(int cpu, int vector) { - u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu); + u32 dest = x86_cpu_to_logical_apicid[cpu]; /* x2apic MSRs are special and need a special fence: */ weak_wrmsr_fence(); @@ -58,7 +64,7 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) dest = 0; for_each_cpu_and(clustercpu, tmpmsk, &cmsk->mask) - dest |= per_cpu(x86_cpu_to_logical_apicid, clustercpu); + dest |= x86_cpu_to_logical_apicid[clustercpu]; if (!dest) continue; @@ -94,7 +100,7 @@ static void x2apic_send_IPI_all(int vector) static u32 x2apic_calc_apicid(unsigned int cpu) { - return per_cpu(x86_cpu_to_logical_apicid, cpu); + return x86_cpu_to_logical_apicid[cpu]; } static void init_x2apic_ldr(void) @@ -103,7 +109,7 @@ static void init_x2apic_ldr(void) u32 cluster, apicid = apic_read(APIC_LDR); unsigned int cpu; - this_cpu_write(x86_cpu_to_logical_apicid, apicid); + x86_cpu_to_logical_apicid[smp_processor_id()] = apicid; if (cmsk) goto update; @@ -166,12 +172,21 @@ static int x2apic_dead_cpu(unsigned int dead_cpu) static int x2apic_cluster_probe(void) { + u32 slots; + if (!x2apic_mode) return 0; + slots = max_t(u32, L1_CACHE_BYTES/sizeof(u32), nr_cpu_ids); + x86_cpu_to_logical_apicid = kcalloc(slots, sizeof(u32), GFP_KERNEL); + if (!x86_cpu_to_logical_apicid) + return 0; + if (cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "x86/x2apic:prepare", x2apic_prepare_cpu, x2apic_dead_cpu) < 0) { pr_err("Failed to register X2APIC_PREPARE\n"); + kfree(x86_cpu_to_logical_apicid); + x86_cpu_to_logical_apicid = NULL; return 0; } init_x2apic_ldr(); diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index ecd3fd6993d1..9fb0a2f8b62a 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -38,9 +38,6 @@ static void __used common(void) #endif BLANK(); - OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); - - BLANK(); OFFSET(pbe_address, pbe, address); OFFSET(pbe_orig_address, pbe, orig_address); OFFSET(pbe_next, pbe, next); diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c index 83d9cad4e68b..44c3601cfdc4 100644 --- a/arch/x86/kernel/audit_64.c +++ b/arch/x86/kernel/audit_64.c @@ -47,14 +47,16 @@ int audit_classify_syscall(int abi, unsigned syscall) #endif switch(syscall) { case __NR_open: - return 2; + return AUDITSC_OPEN; case __NR_openat: - return 3; + return AUDITSC_OPENAT; case __NR_execve: case __NR_execveat: - return 5; + return AUDITSC_EXECVE; + case __NR_openat2: + return AUDITSC_OPENAT2; default: - return 0; + return AUDITSC_NATIVE; } } diff --git a/arch/x86/kernel/cc_platform.c b/arch/x86/kernel/cc_platform.c new file mode 100644 index 000000000000..6a6ffcd978f6 --- /dev/null +++ b/arch/x86/kernel/cc_platform.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Confidential Computing Platform Capability checks + * + * Copyright (C) 2021 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + */ + +#include <linux/export.h> +#include <linux/cc_platform.h> +#include <linux/mem_encrypt.h> + +#include <asm/mshyperv.h> +#include <asm/processor.h> + +static bool __maybe_unused intel_cc_platform_has(enum cc_attr attr) +{ +#ifdef CONFIG_INTEL_TDX_GUEST + return false; +#else + return false; +#endif +} + +/* + * SME and SEV are very similar but they are not the same, so there are + * times that the kernel will need to distinguish between SME and SEV. The + * cc_platform_has() function is used for this. When a distinction isn't + * needed, the CC_ATTR_MEM_ENCRYPT attribute can be used. + * + * The trampoline code is a good example for this requirement. Before + * paging is activated, SME will access all memory as decrypted, but SEV + * will access all memory as encrypted. So, when APs are being brought + * up under SME the trampoline area cannot be encrypted, whereas under SEV + * the trampoline area must be encrypted. + */ +static bool amd_cc_platform_has(enum cc_attr attr) +{ +#ifdef CONFIG_AMD_MEM_ENCRYPT + switch (attr) { + case CC_ATTR_MEM_ENCRYPT: + return sme_me_mask; + + case CC_ATTR_HOST_MEM_ENCRYPT: + return sme_me_mask && !(sev_status & MSR_AMD64_SEV_ENABLED); + + case CC_ATTR_GUEST_MEM_ENCRYPT: + return sev_status & MSR_AMD64_SEV_ENABLED; + + case CC_ATTR_GUEST_STATE_ENCRYPT: + return sev_status & MSR_AMD64_SEV_ES_ENABLED; + + /* + * With SEV, the rep string I/O instructions need to be unrolled + * but SEV-ES supports them through the #VC handler. + */ + case CC_ATTR_GUEST_UNROLL_STRING_IO: + return (sev_status & MSR_AMD64_SEV_ENABLED) && + !(sev_status & MSR_AMD64_SEV_ES_ENABLED); + + default: + return false; + } +#else + return false; +#endif +} + +static bool hyperv_cc_platform_has(enum cc_attr attr) +{ + return attr == CC_ATTR_GUEST_MEM_ENCRYPT; +} + +bool cc_platform_has(enum cc_attr attr) +{ + if (sme_me_mask) + return amd_cc_platform_has(attr); + + if (hv_is_isolation_supported()) + return hyperv_cc_platform_has(attr); + + return false; +} +EXPORT_SYMBOL_GPL(cc_platform_has); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 637b499450d1..9661e3e802be 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -43,6 +43,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o obj-$(CONFIG_CPU_SUP_ZHAOXIN) += zhaoxin.o +obj-$(CONFIG_CPU_SUP_VORTEX_32) += vortex.o obj-$(CONFIG_X86_MCE) += mce/ obj-$(CONFIG_MTRR) += mtrr/ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 2131af9f2fa2..4edb6f0f628c 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -989,6 +989,8 @@ static void init_amd(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_IRPERF) && !cpu_has_amd_erratum(c, amd_erratum_1054)) msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT); + + check_null_seg_clears_base(c); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index ecfca3bbcd96..1c1f218a701d 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -22,7 +22,7 @@ #include <asm/bugs.h> #include <asm/processor.h> #include <asm/processor-flags.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> #include <asm/msr.h> #include <asm/vmx.h> #include <asm/paravirt.h> @@ -758,11 +758,11 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) case SPECTRE_V2_USER_CMD_FORCE: mode = SPECTRE_V2_USER_STRICT; break; + case SPECTRE_V2_USER_CMD_AUTO: case SPECTRE_V2_USER_CMD_PRCTL: case SPECTRE_V2_USER_CMD_PRCTL_IBPB: mode = SPECTRE_V2_USER_PRCTL; break; - case SPECTRE_V2_USER_CMD_AUTO: case SPECTRE_V2_USER_CMD_SECCOMP: case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: if (IS_ENABLED(CONFIG_SECCOMP)) @@ -882,13 +882,6 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) return SPECTRE_V2_CMD_AUTO; } - if (cmd == SPECTRE_V2_CMD_RETPOLINE_AMD && - boot_cpu_data.x86_vendor != X86_VENDOR_HYGON && - boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { - pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n"); - return SPECTRE_V2_CMD_AUTO; - } - spec_v2_print_cond(mitigation_options[i].option, mitigation_options[i].secure); return cmd; @@ -1169,7 +1162,6 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) return mode; switch (cmd) { - case SPEC_STORE_BYPASS_CMD_AUTO: case SPEC_STORE_BYPASS_CMD_SECCOMP: /* * Choose prctl+seccomp as the default mode if seccomp is @@ -1183,6 +1175,7 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) case SPEC_STORE_BYPASS_CMD_ON: mode = SPEC_STORE_BYPASS_DISABLE; break; + case SPEC_STORE_BYPASS_CMD_AUTO: case SPEC_STORE_BYPASS_CMD_PRCTL: mode = SPEC_STORE_BYPASS_PRCTL; break; diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index b5e36bd0425b..fe98a1465be6 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -846,6 +846,7 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) l2 = new_l2; #ifdef CONFIG_SMP per_cpu(cpu_llc_id, cpu) = l2_id; + per_cpu(cpu_l2c_id, cpu) = l2_id; #endif } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0f8885949e8c..7b8382c11788 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -42,7 +42,7 @@ #include <asm/setup.h> #include <asm/apic.h> #include <asm/desc.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> #include <asm/mtrr.h> #include <asm/hwcap2.h> #include <linux/numa.h> @@ -85,6 +85,9 @@ u16 get_llc_id(unsigned int cpu) } EXPORT_SYMBOL_GPL(get_llc_id); +/* L2 cache ID of each logical CPU */ +DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id) = BAD_APICID; + /* correctly size the local cpu masks */ void __init setup_cpu_local_masks(void) { @@ -326,6 +329,7 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_SMAP cr4_set_bits(X86_CR4_SMAP); #else + clear_cpu_cap(c, X86_FEATURE_SMAP); cr4_clear_bits(X86_CR4_SMAP); #endif } @@ -380,7 +384,7 @@ set_register: } EXPORT_SYMBOL(native_write_cr0); -void native_write_cr4(unsigned long val) +void __no_profile native_write_cr4(unsigned long val) { unsigned long bits_changed = 0; @@ -1044,6 +1048,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL(CENTAUR, 5, X86_MODEL_ANY, NO_SPECULATION), VULNWL(INTEL, 5, X86_MODEL_ANY, NO_SPECULATION), VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION), + VULNWL(VORTEX, 5, X86_MODEL_ANY, NO_SPECULATION), + VULNWL(VORTEX, 6, X86_MODEL_ANY, NO_SPECULATION), /* Intel Family 6 */ VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT), @@ -1395,9 +1401,8 @@ void __init early_cpu_init(void) early_identify_cpu(&boot_cpu_data); } -static void detect_null_seg_behavior(struct cpuinfo_x86 *c) +static bool detect_null_seg_behavior(void) { -#ifdef CONFIG_X86_64 /* * Empirically, writing zero to a segment selector on AMD does * not clear the base, whereas writing zero to a segment @@ -1418,10 +1423,43 @@ static void detect_null_seg_behavior(struct cpuinfo_x86 *c) wrmsrl(MSR_FS_BASE, 1); loadsegment(fs, 0); rdmsrl(MSR_FS_BASE, tmp); - if (tmp != 0) - set_cpu_bug(c, X86_BUG_NULL_SEG); wrmsrl(MSR_FS_BASE, old_base); -#endif + return tmp == 0; +} + +void check_null_seg_clears_base(struct cpuinfo_x86 *c) +{ + /* BUG_NULL_SEG is only relevant with 64bit userspace */ + if (!IS_ENABLED(CONFIG_X86_64)) + return; + + /* Zen3 CPUs advertise Null Selector Clears Base in CPUID. */ + if (c->extended_cpuid_level >= 0x80000021 && + cpuid_eax(0x80000021) & BIT(6)) + return; + + /* + * CPUID bit above wasn't set. If this kernel is still running + * as a HV guest, then the HV has decided not to advertize + * that CPUID bit for whatever reason. For example, one + * member of the migration pool might be vulnerable. Which + * means, the bug is present: set the BUG flag and return. + */ + if (cpu_has(c, X86_FEATURE_HYPERVISOR)) { + set_cpu_bug(c, X86_BUG_NULL_SEG); + return; + } + + /* + * Zen2 CPUs also have this behaviour, but no CPUID bit. + * 0x18 is the respective family for Hygon. + */ + if ((c->x86 == 0x17 || c->x86 == 0x18) && + detect_null_seg_behavior()) + return; + + /* All the remaining ones are affected */ + set_cpu_bug(c, X86_BUG_NULL_SEG); } static void generic_identify(struct cpuinfo_x86 *c) @@ -1457,8 +1495,6 @@ static void generic_identify(struct cpuinfo_x86 *c) get_model_name(c); /* Default name */ - detect_null_seg_behavior(c); - /* * ESPFIX is a strange bug. All real CPUs have it. Paravirt * systems that run Linux at CPL > 0 may or may not have the @@ -1751,6 +1787,17 @@ EXPORT_PER_CPU_SYMBOL(__preempt_count); DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK; +static void wrmsrl_cstar(unsigned long val) +{ + /* + * Intel CPUs do not support 32-bit SYSCALL. Writing to MSR_CSTAR + * is so far ignored by the CPU, but raises a #VE trap in a TDX + * guest. Avoid the pointless write on all Intel CPUs. + */ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + wrmsrl(MSR_CSTAR, val); +} + /* May not be marked __init: used by software suspend */ void syscall_init(void) { @@ -1758,7 +1805,7 @@ void syscall_init(void) wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); #ifdef CONFIG_IA32_EMULATION - wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); + wrmsrl_cstar((unsigned long)entry_SYSCALL_compat); /* * This only works on Intel CPUs. * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. @@ -1770,7 +1817,7 @@ void syscall_init(void) (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1)); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); #else - wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); + wrmsrl_cstar((unsigned long)ignore_sysret); wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 95521302630d..ee6f23f7587d 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -75,6 +75,7 @@ extern int detect_extended_topology_early(struct cpuinfo_x86 *c); extern int detect_extended_topology(struct cpuinfo_x86 *c); extern int detect_ht_early(struct cpuinfo_x86 *c); extern void detect_ht(struct cpuinfo_x86 *c); +extern void check_null_seg_clears_base(struct cpuinfo_x86 *c); unsigned int aperfmperf_get_khz(int cpu); diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index defda61f372d..c881bcafba7d 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -75,6 +75,9 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_SGX_LC, X86_FEATURE_SGX }, { X86_FEATURE_SGX1, X86_FEATURE_SGX }, { X86_FEATURE_SGX2, X86_FEATURE_SGX1 }, + { X86_FEATURE_XFD, X86_FEATURE_XSAVES }, + { X86_FEATURE_XFD, X86_FEATURE_XGETBV1 }, + { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, {} }; diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index 6d50136f7ab9..3fcdda4c1e11 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -335,6 +335,8 @@ static void init_hygon(struct cpuinfo_x86 *c) /* Hygon CPUs don't reset SS attributes on SYSRET, Xen does. */ if (!cpu_has(c, X86_FEATURE_XENPV)) set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); + + check_null_seg_clears_base(c); } static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c index f4dd73396f28..fbaf12e43f41 100644 --- a/arch/x86/kernel/cpu/intel_epb.c +++ b/arch/x86/kernel/cpu/intel_epb.c @@ -16,6 +16,7 @@ #include <linux/syscore_ops.h> #include <linux/pm.h> +#include <asm/cpu_device_id.h> #include <asm/cpufeature.h> #include <asm/msr.h> @@ -58,6 +59,22 @@ static DEFINE_PER_CPU(u8, saved_epb); #define EPB_SAVED 0x10ULL #define MAX_EPB EPB_MASK +enum energy_perf_value_index { + EPB_INDEX_PERFORMANCE, + EPB_INDEX_BALANCE_PERFORMANCE, + EPB_INDEX_NORMAL, + EPB_INDEX_BALANCE_POWERSAVE, + EPB_INDEX_POWERSAVE, +}; + +static u8 energ_perf_values[] = { + [EPB_INDEX_PERFORMANCE] = ENERGY_PERF_BIAS_PERFORMANCE, + [EPB_INDEX_BALANCE_PERFORMANCE] = ENERGY_PERF_BIAS_BALANCE_PERFORMANCE, + [EPB_INDEX_NORMAL] = ENERGY_PERF_BIAS_NORMAL, + [EPB_INDEX_BALANCE_POWERSAVE] = ENERGY_PERF_BIAS_BALANCE_POWERSAVE, + [EPB_INDEX_POWERSAVE] = ENERGY_PERF_BIAS_POWERSAVE, +}; + static int intel_epb_save(void) { u64 epb; @@ -90,7 +107,7 @@ static void intel_epb_restore(void) */ val = epb & EPB_MASK; if (val == ENERGY_PERF_BIAS_PERFORMANCE) { - val = ENERGY_PERF_BIAS_NORMAL; + val = energ_perf_values[EPB_INDEX_NORMAL]; pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); } } @@ -103,18 +120,11 @@ static struct syscore_ops intel_epb_syscore_ops = { }; static const char * const energy_perf_strings[] = { - "performance", - "balance-performance", - "normal", - "balance-power", - "power" -}; -static const u8 energ_perf_values[] = { - ENERGY_PERF_BIAS_PERFORMANCE, - ENERGY_PERF_BIAS_BALANCE_PERFORMANCE, - ENERGY_PERF_BIAS_NORMAL, - ENERGY_PERF_BIAS_BALANCE_POWERSAVE, - ENERGY_PERF_BIAS_POWERSAVE + [EPB_INDEX_PERFORMANCE] = "performance", + [EPB_INDEX_BALANCE_PERFORMANCE] = "balance-performance", + [EPB_INDEX_NORMAL] = "normal", + [EPB_INDEX_BALANCE_POWERSAVE] = "balance-power", + [EPB_INDEX_POWERSAVE] = "power", }; static ssize_t energy_perf_bias_show(struct device *dev, @@ -193,13 +203,22 @@ static int intel_epb_offline(unsigned int cpu) return 0; } +static const struct x86_cpu_id intel_epb_normal[] = { + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, 7), + {} +}; + static __init int intel_epb_init(void) { + const struct x86_cpu_id *id = x86_match_cpu(intel_epb_normal); int ret; if (!boot_cpu_has(X86_FEATURE_EPB)) return -ENODEV; + if (id) + energ_perf_values[EPB_INDEX_NORMAL] = id->driver_data; + ret = cpuhp_setup_state(CPUHP_AP_X86_INTEL_EPB_ONLINE, "x86/intel/epb:online", intel_epb_online, intel_epb_offline); diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 08831acc1d03..a1e2f41796dc 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -71,6 +71,22 @@ static const char * const smca_umc_block_names[] = { "misc_umc" }; +#define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype)) + +struct smca_hwid { + unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */ + u32 hwid_mcatype; /* (hwid,mcatype) tuple */ +}; + +struct smca_bank { + const struct smca_hwid *hwid; + u32 id; /* Value of MCA_IPID[InstanceId]. */ + u8 sysfs_id; /* Value used for sysfs name. */ +}; + +static DEFINE_PER_CPU_READ_MOSTLY(struct smca_bank[MAX_NR_BANKS], smca_banks); +static DEFINE_PER_CPU_READ_MOSTLY(u8[N_SMCA_BANK_TYPES], smca_bank_counts); + struct smca_bank_name { const char *name; /* Short name for sysfs */ const char *long_name; /* Long name for pretty-printing */ @@ -95,11 +111,18 @@ static struct smca_bank_name smca_names[] = { [SMCA_PSP ... SMCA_PSP_V2] = { "psp", "Platform Security Processor" }, [SMCA_SMU ... SMCA_SMU_V2] = { "smu", "System Management Unit" }, [SMCA_MP5] = { "mp5", "Microprocessor 5 Unit" }, + [SMCA_MPDMA] = { "mpdma", "MPDMA Unit" }, [SMCA_NBIO] = { "nbio", "Northbridge IO Unit" }, [SMCA_PCIE ... SMCA_PCIE_V2] = { "pcie", "PCI Express Unit" }, [SMCA_XGMI_PCS] = { "xgmi_pcs", "Ext Global Memory Interconnect PCS Unit" }, + [SMCA_NBIF] = { "nbif", "NBIF Unit" }, + [SMCA_SHUB] = { "shub", "System Hub Unit" }, + [SMCA_SATA] = { "sata", "SATA Unit" }, + [SMCA_USB] = { "usb", "USB Unit" }, + [SMCA_GMI_PCS] = { "gmi_pcs", "Global Memory Interconnect PCS Unit" }, [SMCA_XGMI_PHY] = { "xgmi_phy", "Ext Global Memory Interconnect PHY Unit" }, [SMCA_WAFL_PHY] = { "wafl_phy", "WAFL PHY Unit" }, + [SMCA_GMI_PHY] = { "gmi_phy", "Global Memory Interconnect PHY Unit" }, }; static const char *smca_get_name(enum smca_bank_types t) @@ -119,21 +142,22 @@ const char *smca_get_long_name(enum smca_bank_types t) } EXPORT_SYMBOL_GPL(smca_get_long_name); -static enum smca_bank_types smca_get_bank_type(unsigned int bank) +enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank) { struct smca_bank *b; if (bank >= MAX_NR_BANKS) return N_SMCA_BANK_TYPES; - b = &smca_banks[bank]; + b = &per_cpu(smca_banks, cpu)[bank]; if (!b->hwid) return N_SMCA_BANK_TYPES; return b->hwid->bank_type; } +EXPORT_SYMBOL_GPL(smca_get_bank_type); -static struct smca_hwid smca_hwid_mcatypes[] = { +static const struct smca_hwid smca_hwid_mcatypes[] = { /* { bank_type, hwid_mcatype } */ /* Reserved type */ @@ -173,6 +197,9 @@ static struct smca_hwid smca_hwid_mcatypes[] = { /* Microprocessor 5 Unit MCA type */ { SMCA_MP5, HWID_MCATYPE(0x01, 0x2) }, + /* MPDMA MCA type */ + { SMCA_MPDMA, HWID_MCATYPE(0x01, 0x3) }, + /* Northbridge IO Unit MCA type */ { SMCA_NBIO, HWID_MCATYPE(0x18, 0x0) }, @@ -180,19 +207,17 @@ static struct smca_hwid smca_hwid_mcatypes[] = { { SMCA_PCIE, HWID_MCATYPE(0x46, 0x0) }, { SMCA_PCIE_V2, HWID_MCATYPE(0x46, 0x1) }, - /* xGMI PCS MCA type */ { SMCA_XGMI_PCS, HWID_MCATYPE(0x50, 0x0) }, - - /* xGMI PHY MCA type */ + { SMCA_NBIF, HWID_MCATYPE(0x6C, 0x0) }, + { SMCA_SHUB, HWID_MCATYPE(0x80, 0x0) }, + { SMCA_SATA, HWID_MCATYPE(0xA8, 0x0) }, + { SMCA_USB, HWID_MCATYPE(0xAA, 0x0) }, + { SMCA_GMI_PCS, HWID_MCATYPE(0x241, 0x0) }, { SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0) }, - - /* WAFL PHY MCA type */ { SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0) }, + { SMCA_GMI_PHY, HWID_MCATYPE(0x269, 0x0) }, }; -struct smca_bank smca_banks[MAX_NR_BANKS]; -EXPORT_SYMBOL_GPL(smca_banks); - /* * In SMCA enabled processors, we can have multiple banks for a given IP type. * So to define a unique name for each bank, we use a temp c-string to append @@ -248,8 +273,9 @@ static void smca_set_misc_banks_map(unsigned int bank, unsigned int cpu) static void smca_configure(unsigned int bank, unsigned int cpu) { + u8 *bank_counts = this_cpu_ptr(smca_bank_counts); + const struct smca_hwid *s_hwid; unsigned int i, hwid_mcatype; - struct smca_hwid *s_hwid; u32 high, low; u32 smca_config = MSR_AMD64_SMCA_MCx_CONFIG(bank); @@ -285,10 +311,6 @@ static void smca_configure(unsigned int bank, unsigned int cpu) smca_set_misc_banks_map(bank, cpu); - /* Return early if this bank was already initialized. */ - if (smca_banks[bank].hwid && smca_banks[bank].hwid->hwid_mcatype != 0) - return; - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) { pr_warn("Failed to read MCA_IPID for bank %d\n", bank); return; @@ -299,10 +321,11 @@ static void smca_configure(unsigned int bank, unsigned int cpu) for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { s_hwid = &smca_hwid_mcatypes[i]; + if (hwid_mcatype == s_hwid->hwid_mcatype) { - smca_banks[bank].hwid = s_hwid; - smca_banks[bank].id = low; - smca_banks[bank].sysfs_id = s_hwid->count++; + this_cpu_ptr(smca_banks)[bank].hwid = s_hwid; + this_cpu_ptr(smca_banks)[bank].id = low; + this_cpu_ptr(smca_banks)[bank].sysfs_id = bank_counts[s_hwid->bank_type]++; break; } } @@ -526,7 +549,7 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high, /* Fall back to method we used for older processors: */ switch (block) { case 0: - addr = msr_ops.misc(bank); + addr = mca_msr_reg(bank, MCA_MISC); break; case 1: offset = ((low & MASK_BLKPTR_LO) >> 21); @@ -588,7 +611,7 @@ out: bool amd_filter_mce(struct mce *m) { - enum smca_bank_types bank_type = smca_get_bank_type(m->bank); + enum smca_bank_types bank_type = smca_get_bank_type(m->extcpu, m->bank); struct cpuinfo_x86 *c = &boot_cpu_data; /* See Family 17h Models 10h-2Fh Erratum #1114. */ @@ -626,7 +649,7 @@ static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) } else if (c->x86 == 0x17 && (c->x86_model >= 0x10 && c->x86_model <= 0x2F)) { - if (smca_get_bank_type(bank) != SMCA_IF) + if (smca_get_bank_type(smp_processor_id(), bank) != SMCA_IF) return; msrs[0] = MSR_AMD64_SMCA_MCx_MISC(bank); @@ -688,213 +711,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) deferred_error_interrupt_enable(c); } -int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) -{ - u64 dram_base_addr, dram_limit_addr, dram_hole_base; - /* We start from the normalized address */ - u64 ret_addr = norm_addr; - - u32 tmp; - - u8 die_id_shift, die_id_mask, socket_id_shift, socket_id_mask; - u8 intlv_num_dies, intlv_num_chan, intlv_num_sockets; - u8 intlv_addr_sel, intlv_addr_bit; - u8 num_intlv_bits, hashed_bit; - u8 lgcy_mmio_hole_en, base = 0; - u8 cs_mask, cs_id = 0; - bool hash_enabled = false; - - /* Read D18F0x1B4 (DramOffset), check if base 1 is used. */ - if (amd_df_indirect_read(nid, 0, 0x1B4, umc, &tmp)) - goto out_err; - - /* Remove HiAddrOffset from normalized address, if enabled: */ - if (tmp & BIT(0)) { - u64 hi_addr_offset = (tmp & GENMASK_ULL(31, 20)) << 8; - - if (norm_addr >= hi_addr_offset) { - ret_addr -= hi_addr_offset; - base = 1; - } - } - - /* Read D18F0x110 (DramBaseAddress). */ - if (amd_df_indirect_read(nid, 0, 0x110 + (8 * base), umc, &tmp)) - goto out_err; - - /* Check if address range is valid. */ - if (!(tmp & BIT(0))) { - pr_err("%s: Invalid DramBaseAddress range: 0x%x.\n", - __func__, tmp); - goto out_err; - } - - lgcy_mmio_hole_en = tmp & BIT(1); - intlv_num_chan = (tmp >> 4) & 0xF; - intlv_addr_sel = (tmp >> 8) & 0x7; - dram_base_addr = (tmp & GENMASK_ULL(31, 12)) << 16; - - /* {0, 1, 2, 3} map to address bits {8, 9, 10, 11} respectively */ - if (intlv_addr_sel > 3) { - pr_err("%s: Invalid interleave address select %d.\n", - __func__, intlv_addr_sel); - goto out_err; - } - - /* Read D18F0x114 (DramLimitAddress). */ - if (amd_df_indirect_read(nid, 0, 0x114 + (8 * base), umc, &tmp)) - goto out_err; - - intlv_num_sockets = (tmp >> 8) & 0x1; - intlv_num_dies = (tmp >> 10) & 0x3; - dram_limit_addr = ((tmp & GENMASK_ULL(31, 12)) << 16) | GENMASK_ULL(27, 0); - - intlv_addr_bit = intlv_addr_sel + 8; - - /* Re-use intlv_num_chan by setting it equal to log2(#channels) */ - switch (intlv_num_chan) { - case 0: intlv_num_chan = 0; break; - case 1: intlv_num_chan = 1; break; - case 3: intlv_num_chan = 2; break; - case 5: intlv_num_chan = 3; break; - case 7: intlv_num_chan = 4; break; - - case 8: intlv_num_chan = 1; - hash_enabled = true; - break; - default: - pr_err("%s: Invalid number of interleaved channels %d.\n", - __func__, intlv_num_chan); - goto out_err; - } - - num_intlv_bits = intlv_num_chan; - - if (intlv_num_dies > 2) { - pr_err("%s: Invalid number of interleaved nodes/dies %d.\n", - __func__, intlv_num_dies); - goto out_err; - } - - num_intlv_bits += intlv_num_dies; - - /* Add a bit if sockets are interleaved. */ - num_intlv_bits += intlv_num_sockets; - - /* Assert num_intlv_bits <= 4 */ - if (num_intlv_bits > 4) { - pr_err("%s: Invalid interleave bits %d.\n", - __func__, num_intlv_bits); - goto out_err; - } - - if (num_intlv_bits > 0) { - u64 temp_addr_x, temp_addr_i, temp_addr_y; - u8 die_id_bit, sock_id_bit, cs_fabric_id; - - /* - * Read FabricBlockInstanceInformation3_CS[BlockFabricID]. - * This is the fabric id for this coherent slave. Use - * umc/channel# as instance id of the coherent slave - * for FICAA. - */ - if (amd_df_indirect_read(nid, 0, 0x50, umc, &tmp)) - goto out_err; - - cs_fabric_id = (tmp >> 8) & 0xFF; - die_id_bit = 0; - - /* If interleaved over more than 1 channel: */ - if (intlv_num_chan) { - die_id_bit = intlv_num_chan; - cs_mask = (1 << die_id_bit) - 1; - cs_id = cs_fabric_id & cs_mask; - } - - sock_id_bit = die_id_bit; - - /* Read D18F1x208 (SystemFabricIdMask). */ - if (intlv_num_dies || intlv_num_sockets) - if (amd_df_indirect_read(nid, 1, 0x208, umc, &tmp)) - goto out_err; - - /* If interleaved over more than 1 die. */ - if (intlv_num_dies) { - sock_id_bit = die_id_bit + intlv_num_dies; - die_id_shift = (tmp >> 24) & 0xF; - die_id_mask = (tmp >> 8) & 0xFF; - - cs_id |= ((cs_fabric_id & die_id_mask) >> die_id_shift) << die_id_bit; - } - - /* If interleaved over more than 1 socket. */ - if (intlv_num_sockets) { - socket_id_shift = (tmp >> 28) & 0xF; - socket_id_mask = (tmp >> 16) & 0xFF; - - cs_id |= ((cs_fabric_id & socket_id_mask) >> socket_id_shift) << sock_id_bit; - } - - /* - * The pre-interleaved address consists of XXXXXXIIIYYYYY - * where III is the ID for this CS, and XXXXXXYYYYY are the - * address bits from the post-interleaved address. - * "num_intlv_bits" has been calculated to tell us how many "I" - * bits there are. "intlv_addr_bit" tells us how many "Y" bits - * there are (where "I" starts). - */ - temp_addr_y = ret_addr & GENMASK_ULL(intlv_addr_bit-1, 0); - temp_addr_i = (cs_id << intlv_addr_bit); - temp_addr_x = (ret_addr & GENMASK_ULL(63, intlv_addr_bit)) << num_intlv_bits; - ret_addr = temp_addr_x | temp_addr_i | temp_addr_y; - } - - /* Add dram base address */ - ret_addr += dram_base_addr; - - /* If legacy MMIO hole enabled */ - if (lgcy_mmio_hole_en) { - if (amd_df_indirect_read(nid, 0, 0x104, umc, &tmp)) - goto out_err; - - dram_hole_base = tmp & GENMASK(31, 24); - if (ret_addr >= dram_hole_base) - ret_addr += (BIT_ULL(32) - dram_hole_base); - } - - if (hash_enabled) { - /* Save some parentheses and grab ls-bit at the end. */ - hashed_bit = (ret_addr >> 12) ^ - (ret_addr >> 18) ^ - (ret_addr >> 21) ^ - (ret_addr >> 30) ^ - cs_id; - - hashed_bit &= BIT(0); - - if (hashed_bit != ((ret_addr >> intlv_addr_bit) & BIT(0))) - ret_addr ^= BIT(intlv_addr_bit); - } - - /* Is calculated system address is above DRAM limit address? */ - if (ret_addr > dram_limit_addr) - goto out_err; - - *sys_addr = ret_addr; - return 0; - -out_err: - return -EINVAL; -} -EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr); - bool amd_mce_is_memory_error(struct mce *m) { /* ErrCodeExt[20:16] */ u8 xec = (m->status >> 16) & 0x1f; if (mce_flags.smca) - return smca_get_bank_type(m->bank) == SMCA_UMC && xec == 0x0; + return smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC && xec == 0x0; return m->bank == 4 && xec == 0x8; } @@ -978,8 +801,8 @@ static void log_error_deferred(unsigned int bank) { bool defrd; - defrd = _log_error_bank(bank, msr_ops.status(bank), - msr_ops.addr(bank), 0); + defrd = _log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS), + mca_msr_reg(bank, MCA_ADDR), 0); if (!mce_flags.smca) return; @@ -1009,7 +832,7 @@ static void amd_deferred_error_interrupt(void) static void log_error_thresholding(unsigned int bank, u64 misc) { - _log_error_bank(bank, msr_ops.status(bank), msr_ops.addr(bank), misc); + _log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS), mca_msr_reg(bank, MCA_ADDR), misc); } static void log_and_reset_block(struct threshold_block *block) @@ -1210,7 +1033,7 @@ static struct kobj_type threshold_ktype = { .release = threshold_block_release, }; -static const char *get_name(unsigned int bank, struct threshold_block *b) +static const char *get_name(unsigned int cpu, unsigned int bank, struct threshold_block *b) { enum smca_bank_types bank_type; @@ -1221,7 +1044,7 @@ static const char *get_name(unsigned int bank, struct threshold_block *b) return th_names[bank]; } - bank_type = smca_get_bank_type(bank); + bank_type = smca_get_bank_type(cpu, bank); if (bank_type >= N_SMCA_BANK_TYPES) return NULL; @@ -1231,12 +1054,12 @@ static const char *get_name(unsigned int bank, struct threshold_block *b) return NULL; } - if (smca_banks[bank].hwid->count == 1) + if (per_cpu(smca_bank_counts, cpu)[bank_type] == 1) return smca_get_name(bank_type); snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, - "%s_%x", smca_get_name(bank_type), - smca_banks[bank].sysfs_id); + "%s_%u", smca_get_name(bank_type), + per_cpu(smca_banks, cpu)[bank].sysfs_id); return buf_mcatype; } @@ -1292,7 +1115,7 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb else tb->blocks = b; - err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(bank, b)); + err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(cpu, bank, b)); if (err) goto out_free; recurse: @@ -1347,7 +1170,7 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, struct device *dev = this_cpu_read(mce_device); struct amd_northbridge *nb = NULL; struct threshold_bank *b = NULL; - const char *name = get_name(bank, NULL); + const char *name = get_name(cpu, bank, NULL); int err = 0; if (!dev) @@ -1397,7 +1220,7 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, } } - err = allocate_threshold_blocks(cpu, b, bank, 0, msr_ops.misc(bank)); + err = allocate_threshold_blocks(cpu, b, bank, 0, mca_msr_reg(bank, MCA_MISC)); if (err) goto out_kobj; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 193204aee880..5818b837fd4d 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -99,7 +99,6 @@ struct mca_config mca_cfg __read_mostly = { static DEFINE_PER_CPU(struct mce, mces_seen); static unsigned long mce_need_notify; -static int cpu_missing; /* * MCA banks polled by the period polling timer for corrected events. @@ -121,8 +120,6 @@ mce_banks_t mce_banks_ce_disabled; static struct work_struct mce_work; static struct irq_work mce_irq_work; -static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); - /* * CPU/chipset specific EDAC code can register a notifier call here to print * MCE errors in a human-readable form. @@ -130,7 +127,7 @@ static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain); /* Do initial initialization of a struct mce */ -noinstr void mce_setup(struct mce *m) +void mce_setup(struct mce *m) { memset(m, 0, sizeof(struct mce)); m->cpu = m->extcpu = smp_processor_id(); @@ -176,53 +173,27 @@ void mce_unregister_decode_chain(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); -static inline u32 ctl_reg(int bank) -{ - return MSR_IA32_MCx_CTL(bank); -} - -static inline u32 status_reg(int bank) -{ - return MSR_IA32_MCx_STATUS(bank); -} - -static inline u32 addr_reg(int bank) -{ - return MSR_IA32_MCx_ADDR(bank); -} - -static inline u32 misc_reg(int bank) -{ - return MSR_IA32_MCx_MISC(bank); -} - -static inline u32 smca_ctl_reg(int bank) +u32 mca_msr_reg(int bank, enum mca_msr reg) { - return MSR_AMD64_SMCA_MCx_CTL(bank); -} - -static inline u32 smca_status_reg(int bank) -{ - return MSR_AMD64_SMCA_MCx_STATUS(bank); -} + if (mce_flags.smca) { + switch (reg) { + case MCA_CTL: return MSR_AMD64_SMCA_MCx_CTL(bank); + case MCA_ADDR: return MSR_AMD64_SMCA_MCx_ADDR(bank); + case MCA_MISC: return MSR_AMD64_SMCA_MCx_MISC(bank); + case MCA_STATUS: return MSR_AMD64_SMCA_MCx_STATUS(bank); + } + } -static inline u32 smca_addr_reg(int bank) -{ - return MSR_AMD64_SMCA_MCx_ADDR(bank); -} + switch (reg) { + case MCA_CTL: return MSR_IA32_MCx_CTL(bank); + case MCA_ADDR: return MSR_IA32_MCx_ADDR(bank); + case MCA_MISC: return MSR_IA32_MCx_MISC(bank); + case MCA_STATUS: return MSR_IA32_MCx_STATUS(bank); + } -static inline u32 smca_misc_reg(int bank) -{ - return MSR_AMD64_SMCA_MCx_MISC(bank); + return 0; } -struct mca_msr_regs msr_ops = { - .ctl = ctl_reg, - .status = status_reg, - .addr = addr_reg, - .misc = misc_reg -}; - static void __print_mce(struct mce *m) { pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n", @@ -295,11 +266,17 @@ static void wait_for_panic(void) panic("Panicing machine check CPU died"); } -static void mce_panic(const char *msg, struct mce *final, char *exp) +static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) { - int apei_err = 0; struct llist_node *pending; struct mce_evt_llist *l; + int apei_err = 0; + + /* + * Allow instrumentation around external facilities usage. Not that it + * matters a whole lot since the machine is going to panic anyway. + */ + instrumentation_begin(); if (!fake_panic) { /* @@ -314,7 +291,7 @@ static void mce_panic(const char *msg, struct mce *final, char *exp) } else { /* Don't log too much for fake panic */ if (atomic_inc_return(&mce_fake_panicked) > 1) - return; + goto out; } pending = mce_gen_pool_prepare_records(); /* First print corrected ones that are still unlogged */ @@ -342,8 +319,6 @@ static void mce_panic(const char *msg, struct mce *final, char *exp) if (!apei_err) apei_err = apei_write_mce(final); } - if (cpu_missing) - pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n"); if (exp) pr_emerg(HW_ERR "Machine check: %s\n", exp); if (!fake_panic) { @@ -352,6 +327,9 @@ static void mce_panic(const char *msg, struct mce *final, char *exp) panic(msg); } else pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); + +out: + instrumentation_end(); } /* Support code for software error injection */ @@ -362,24 +340,27 @@ static int msr_to_offset(u32 msr) if (msr == mca_cfg.rip_msr) return offsetof(struct mce, ip); - if (msr == msr_ops.status(bank)) + if (msr == mca_msr_reg(bank, MCA_STATUS)) return offsetof(struct mce, status); - if (msr == msr_ops.addr(bank)) + if (msr == mca_msr_reg(bank, MCA_ADDR)) return offsetof(struct mce, addr); - if (msr == msr_ops.misc(bank)) + if (msr == mca_msr_reg(bank, MCA_MISC)) return offsetof(struct mce, misc); if (msr == MSR_IA32_MCG_STATUS) return offsetof(struct mce, mcgstatus); return -1; } -__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr) { - pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", - (unsigned int)regs->cx, regs->ip, (void *)regs->ip); + if (wrmsr) { + pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", + (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax, + regs->ip, (void *)regs->ip); + } else { + pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", + (unsigned int)regs->cx, regs->ip, (void *)regs->ip); + } show_stack_regs(regs); @@ -387,12 +368,10 @@ __visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, while (true) cpu_relax(); - - return true; } /* MSR access wrappers used for error injection */ -static noinstr u64 mce_rdmsrl(u32 msr) +noinstr u64 mce_rdmsrl(u32 msr) { DECLARE_ARGS(val, low, high); @@ -420,32 +399,13 @@ static noinstr u64 mce_rdmsrl(u32 msr) */ asm volatile("1: rdmsr\n" "2:\n" - _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_fault) + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR_IN_MCE) : EAX_EDX_RET(val, low, high) : "c" (msr)); return EAX_EDX_VAL(val, low, high); } -__visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) -{ - pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", - (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax, - regs->ip, (void *)regs->ip); - - show_stack_regs(regs); - - panic("MCA architectural violation!\n"); - - while (true) - cpu_relax(); - - return true; -} - static noinstr void mce_wrmsrl(u32 msr, u64 v) { u32 low, high; @@ -470,7 +430,7 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v) /* See comment in mce_rdmsrl() */ asm volatile("1: wrmsr\n" "2:\n" - _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_fault) + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR_IN_MCE) : : "c" (msr), "a"(low), "d" (high) : "memory"); } @@ -479,9 +439,15 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v) * check into our "mce" struct so that we can use it later to assess * the severity of the problem as we read per-bank specific details. */ -static inline void mce_gather_info(struct mce *m, struct pt_regs *regs) +static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs) { + /* + * Enable instrumentation around mce_setup() which calls external + * facilities. + */ + instrumentation_begin(); mce_setup(m); + instrumentation_end(); m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); if (regs) { @@ -682,13 +648,13 @@ static struct notifier_block mce_default_nb = { /* * Read ADDR and MISC registers. */ -static void mce_read_aux(struct mce *m, int i) +static noinstr void mce_read_aux(struct mce *m, int i) { if (m->status & MCI_STATUS_MISCV) - m->misc = mce_rdmsrl(msr_ops.misc(i)); + m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC)); if (m->status & MCI_STATUS_ADDRV) { - m->addr = mce_rdmsrl(msr_ops.addr(i)); + m->addr = mce_rdmsrl(mca_msr_reg(i, MCA_ADDR)); /* * Mask the reported address by the reported granularity. @@ -758,7 +724,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m.bank = i; barrier(); - m.status = mce_rdmsrl(msr_ops.status(i)); + m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); /* If this entry is not valid, ignore it */ if (!(m.status & MCI_STATUS_VAL)) @@ -826,7 +792,7 @@ clear_it: /* * Clear state for this bank. */ - mce_wrmsrl(msr_ops.status(i), 0); + mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); } /* @@ -841,6 +807,34 @@ clear_it: EXPORT_SYMBOL_GPL(machine_check_poll); /* + * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and + * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM + * Vol 3B Table 15-20). But this confuses both the code that determines + * whether the machine check occurred in kernel or user mode, and also + * the severity assessment code. Pretend that EIPV was set, and take the + * ip/cs values from the pt_regs that mce_gather_info() ignored earlier. + */ +static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) +{ + if (bank != 0) + return; + if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0) + return; + if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC| + MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV| + MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR| + MCACOD)) != + (MCI_STATUS_UC|MCI_STATUS_EN| + MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S| + MCI_STATUS_AR|MCACOD_INSTR)) + return; + + m->mcgstatus |= MCG_STATUS_EIPV; + m->ip = regs->ip; + m->cs = regs->cs; +} + +/* * Do a quick check if any of the events requires a panic. * This decides if we keep the events around or clear them. */ @@ -851,13 +845,13 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, int i; for (i = 0; i < this_cpu_read(mce_num_banks); i++) { - m->status = mce_rdmsrl(msr_ops.status(i)); + m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); if (!(m->status & MCI_STATUS_VAL)) continue; __set_bit(i, validp); - if (quirk_no_way_out) - quirk_no_way_out(i, m, regs); + if (mce_flags.snb_ifu_quirk) + quirk_sandybridge_ifu(i, m, regs); m->bank = i; if (mce_severity(m, regs, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) { @@ -889,8 +883,13 @@ static cpumask_t mce_missing_cpus = CPU_MASK_ALL; /* * Check if a timeout waiting for other CPUs happened. */ -static int mce_timed_out(u64 *t, const char *msg) +static noinstr int mce_timed_out(u64 *t, const char *msg) { + int ret = 0; + + /* Enable instrumentation around calls to external facilities */ + instrumentation_begin(); + /* * The others already did panic for some reason. * Bail out like in a timeout. @@ -909,13 +908,17 @@ static int mce_timed_out(u64 *t, const char *msg) cpumask_pr_args(&mce_missing_cpus)); mce_panic(msg, NULL, NULL); } - cpu_missing = 1; - return 1; + ret = 1; + goto out; } *t -= SPINUNIT; + out: touch_nmi_watchdog(); - return 0; + + instrumentation_end(); + + return ret; } /* @@ -1004,14 +1007,13 @@ static atomic_t global_nwo; * in the entry order. * TBD double check parallel CPU hotunplug */ -static int mce_start(int *no_way_out) +static noinstr int mce_start(int *no_way_out) { - int order; - int cpus = num_online_cpus(); u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; + int order, ret = -1; if (!timeout) - return -1; + return ret; atomic_add(*no_way_out, &global_nwo); /* @@ -1021,14 +1023,17 @@ static int mce_start(int *no_way_out) order = atomic_inc_return(&mce_callin); cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus); + /* Enable instrumentation around calls to external facilities */ + instrumentation_begin(); + /* * Wait for everyone. */ - while (atomic_read(&mce_callin) != cpus) { + while (atomic_read(&mce_callin) != num_online_cpus()) { if (mce_timed_out(&timeout, "Timeout: Not all CPUs entered broadcast exception handler")) { atomic_set(&global_nwo, 0); - return -1; + goto out; } ndelay(SPINUNIT); } @@ -1054,7 +1059,7 @@ static int mce_start(int *no_way_out) if (mce_timed_out(&timeout, "Timeout: Subject CPUs unable to finish machine check processing")) { atomic_set(&global_nwo, 0); - return -1; + goto out; } ndelay(SPINUNIT); } @@ -1065,17 +1070,25 @@ static int mce_start(int *no_way_out) */ *no_way_out = atomic_read(&global_nwo); - return order; + ret = order; + +out: + instrumentation_end(); + + return ret; } /* * Synchronize between CPUs after main scanning loop. * This invokes the bulk of the Monarch processing. */ -static int mce_end(int order) +static noinstr int mce_end(int order) { - int ret = -1; u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; + int ret = -1; + + /* Allow instrumentation around external facilities. */ + instrumentation_begin(); if (!timeout) goto reset; @@ -1088,14 +1101,11 @@ static int mce_end(int order) atomic_inc(&mce_executing); if (order == 1) { - /* CHECKME: Can this race with a parallel hotplug? */ - int cpus = num_online_cpus(); - /* * Monarch: Wait for everyone to go through their scanning * loops. */ - while (atomic_read(&mce_executing) <= cpus) { + while (atomic_read(&mce_executing) <= num_online_cpus()) { if (mce_timed_out(&timeout, "Timeout: Monarch CPU unable to finish machine check processing")) goto reset; @@ -1119,7 +1129,8 @@ static int mce_end(int order) /* * Don't reset anything. That's done by the Monarch. */ - return 0; + ret = 0; + goto out; } /* @@ -1135,6 +1146,10 @@ reset: * Let others run again. */ atomic_set(&mce_executing, 0); + +out: + instrumentation_end(); + return ret; } @@ -1144,7 +1159,7 @@ static void mce_clear_state(unsigned long *toclear) for (i = 0; i < this_cpu_read(mce_num_banks); i++) { if (test_bit(i, toclear)) - mce_wrmsrl(msr_ops.status(i), 0); + mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); } } @@ -1183,13 +1198,14 @@ static noinstr bool mce_check_crashing_cpu(void) return false; } -static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, - unsigned long *toclear, unsigned long *valid_banks, - int no_way_out, int *worst) +static __always_inline int +__mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, + unsigned long *toclear, unsigned long *valid_banks, int no_way_out, + int *worst) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); struct mca_config *cfg = &mca_cfg; - int severity, i; + int severity, i, taint = 0; for (i = 0; i < this_cpu_read(mce_num_banks); i++) { __clear_bit(i, toclear); @@ -1203,7 +1219,7 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin m->addr = 0; m->bank = i; - m->status = mce_rdmsrl(msr_ops.status(i)); + m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); if (!(m->status & MCI_STATUS_VAL)) continue; @@ -1216,7 +1232,7 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin continue; /* Set taint even when machine check was not enabled. */ - add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); + taint++; severity = mce_severity(m, regs, cfg->tolerant, NULL, true); @@ -1239,7 +1255,13 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin /* assuming valid severity level != 0 */ m->severity = severity; + /* + * Enable instrumentation around the mce_log() call which is + * done in #MC context, where instrumentation is disabled. + */ + instrumentation_begin(); mce_log(m); + instrumentation_end(); if (severity > *worst) { *final = *m; @@ -1249,6 +1271,8 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin /* mce_clear_state will clear *final, save locally for use later */ *m = *final; + + return taint; } static void kill_me_now(struct callback_head *ch) @@ -1272,7 +1296,7 @@ static void kill_me_maybe(struct callback_head *cb) flags |= MF_MUST_KILL; ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags); - if (!ret && !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) { + if (!ret) { set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); sync_core(); return; @@ -1286,15 +1310,21 @@ static void kill_me_maybe(struct callback_head *cb) if (ret == -EHWPOISON) return; - if (p->mce_vaddr != (void __user *)-1l) { - force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT); - } else { - pr_err("Memory error not recovered"); - kill_me_now(cb); - } + pr_err("Memory error not recovered"); + kill_me_now(cb); } -static void queue_task_work(struct mce *m, char *msg, int kill_current_task) +static void kill_me_never(struct callback_head *cb) +{ + struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); + + p->mce_count = 0; + pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr); + if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0)) + set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); +} + +static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *)) { int count = ++current->mce_count; @@ -1304,11 +1334,7 @@ static void queue_task_work(struct mce *m, char *msg, int kill_current_task) current->mce_kflags = m->kflags; current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV); current->mce_whole_page = whole_page(m); - - if (kill_current_task) - current->mce_kill_me.func = kill_me_now; - else - current->mce_kill_me.func = kill_me_maybe; + current->mce_kill_me.func = func; } /* Ten is likely overkill. Don't expect more than two faults before task_work() */ @@ -1326,12 +1352,21 @@ static void queue_task_work(struct mce *m, char *msg, int kill_current_task) task_work_add(current, ¤t->mce_kill_me, TWA_RESUME); } +/* Handle unconfigured int18 (should never happen) */ +static noinstr void unexpected_machine_check(struct pt_regs *regs) +{ + instrumentation_begin(); + pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", + smp_processor_id()); + instrumentation_end(); +} + /* - * The actual machine check handler. This only handles real - * exceptions when something got corrupted coming in through int 18. + * The actual machine check handler. This only handles real exceptions when + * something got corrupted coming in through int 18. * - * This is executed in NMI context not subject to normal locking rules. This - * implies that most kernel services cannot be safely used. Don't even + * This is executed in #MC context not subject to normal locking rules. + * This implies that most kernel services cannot be safely used. Don't even * think about putting a printk in there! * * On Intel systems this is entered on all CPUs in parallel through @@ -1343,39 +1378,54 @@ static void queue_task_work(struct mce *m, char *msg, int kill_current_task) * issues: if the machine check was due to a failure of the memory * backing the user stack, tracing that reads the user stack will cause * potentially infinite recursion. + * + * Currently, the #MC handler calls out to a number of external facilities + * and, therefore, allows instrumentation around them. The optimal thing to + * have would be to do the absolutely minimal work required in #MC context + * and have instrumentation disabled only around that. Further processing can + * then happen in process context where instrumentation is allowed. Achieving + * that requires careful auditing and modifications. Until then, the code + * allows instrumentation temporarily, where required. * */ noinstr void do_machine_check(struct pt_regs *regs) { - DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); - DECLARE_BITMAP(toclear, MAX_NR_BANKS); + int worst = 0, order, no_way_out, kill_current_task, lmce, taint = 0; + DECLARE_BITMAP(valid_banks, MAX_NR_BANKS) = { 0 }; + DECLARE_BITMAP(toclear, MAX_NR_BANKS) = { 0 }; struct mca_config *cfg = &mca_cfg; struct mce m, *final; char *msg = NULL; - int worst = 0; + + if (unlikely(mce_flags.p5)) + return pentium_machine_check(regs); + else if (unlikely(mce_flags.winchip)) + return winchip_machine_check(regs); + else if (unlikely(!mca_cfg.initialized)) + return unexpected_machine_check(regs); /* * Establish sequential order between the CPUs entering the machine * check handler. */ - int order = -1; + order = -1; /* * If no_way_out gets set, there is no safe way to recover from this * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway. */ - int no_way_out = 0; + no_way_out = 0; /* * If kill_current_task is not set, there might be a way to recover from this * error. */ - int kill_current_task = 0; + kill_current_task = 0; /* * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES * on Intel. */ - int lmce = 1; + lmce = 1; this_cpu_inc(mce_exception_count); @@ -1385,7 +1435,6 @@ noinstr void do_machine_check(struct pt_regs *regs) final = this_cpu_ptr(&mces_seen); *final = m; - memset(valid_banks, 0, sizeof(valid_banks)); no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs); barrier(); @@ -1419,7 +1468,7 @@ noinstr void do_machine_check(struct pt_regs *regs) order = mce_start(&no_way_out); } - __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst); + taint = __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst); if (!no_way_out) mce_clear_state(toclear); @@ -1451,6 +1500,16 @@ noinstr void do_machine_check(struct pt_regs *regs) } } + /* + * Enable instrumentation around the external facilities like task_work_add() + * (via queue_task_work()), fixup_exception() etc. For now, that is. Fixing this + * properly would need a lot more involved reorganization. + */ + instrumentation_begin(); + + if (taint) + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); + if (worst != MCE_AR_SEVERITY && !kill_current_task) goto out; @@ -1459,7 +1518,10 @@ noinstr void do_machine_check(struct pt_regs *regs) /* If this triggers there is no way to recover. Die hard. */ BUG_ON(!on_thread_stack() || !user_mode(regs)); - queue_task_work(&m, msg, kill_current_task); + if (kill_current_task) + queue_task_work(&m, msg, kill_me_now); + else + queue_task_work(&m, msg, kill_me_maybe); } else { /* @@ -1477,9 +1539,12 @@ noinstr void do_machine_check(struct pt_regs *regs) } if (m.kflags & MCE_IN_KERNEL_COPYIN) - queue_task_work(&m, msg, kill_current_task); + queue_task_work(&m, msg, kill_me_never); } + out: + instrumentation_end(); + mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); } EXPORT_SYMBOL_GPL(do_machine_check); @@ -1687,8 +1752,8 @@ static void __mcheck_cpu_init_clear_banks(void) if (!b->init) continue; - wrmsrl(msr_ops.ctl(i), b->ctl); - wrmsrl(msr_ops.status(i), 0); + wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl); + wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); } } @@ -1714,39 +1779,11 @@ static void __mcheck_cpu_check_banks(void) if (!b->init) continue; - rdmsrl(msr_ops.ctl(i), msrval); + rdmsrl(mca_msr_reg(i, MCA_CTL), msrval); b->init = !!msrval; } } -/* - * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and - * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM - * Vol 3B Table 15-20). But this confuses both the code that determines - * whether the machine check occurred in kernel or user mode, and also - * the severity assessment code. Pretend that EIPV was set, and take the - * ip/cs values from the pt_regs that mce_gather_info() ignored earlier. - */ -static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) -{ - if (bank != 0) - return; - if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0) - return; - if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC| - MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV| - MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR| - MCACOD)) != - (MCI_STATUS_UC|MCI_STATUS_EN| - MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S| - MCI_STATUS_AR|MCACOD_INSTR)) - return; - - m->mcgstatus |= MCG_STATUS_EIPV; - m->ip = regs->ip; - m->cs = regs->cs; -} - /* Add per CPU specific workarounds here */ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) { @@ -1820,7 +1857,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) cfg->bootlog = 0; if (c->x86 == 6 && c->x86_model == 45) - quirk_no_way_out = quirk_sandybridge_ifu; + mce_flags.snb_ifu_quirk = 1; } if (c->x86_vendor == X86_VENDOR_ZHAOXIN) { @@ -1850,9 +1887,11 @@ static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) switch (c->x86_vendor) { case X86_VENDOR_INTEL: intel_p5_mcheck_init(c); + mce_flags.p5 = 1; return 1; case X86_VENDOR_CENTAUR: winchip_mcheck_init(c); + mce_flags.winchip = 1; return 1; default: return 0; @@ -1871,13 +1910,6 @@ static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c) mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR); mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA); mce_flags.amd_threshold = 1; - - if (mce_flags.smca) { - msr_ops.ctl = smca_ctl_reg; - msr_ops.status = smca_status_reg; - msr_ops.addr = smca_addr_reg; - msr_ops.misc = smca_misc_reg; - } } } @@ -2007,18 +2039,6 @@ bool filter_mce(struct mce *m) return false; } -/* Handle unconfigured int18 (should never happen) */ -static noinstr void unexpected_machine_check(struct pt_regs *regs) -{ - instrumentation_begin(); - pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", - smp_processor_id()); - instrumentation_end(); -} - -/* Call the installed machine check handler for this CPU setup. */ -void (*machine_check_vector)(struct pt_regs *) = unexpected_machine_check; - static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) { irqentry_state_t irq_state; @@ -2029,31 +2049,22 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) * Only required when from kernel mode. See * mce_check_crashing_cpu() for details. */ - if (machine_check_vector == do_machine_check && - mce_check_crashing_cpu()) + if (mca_cfg.initialized && mce_check_crashing_cpu()) return; irq_state = irqentry_nmi_enter(regs); - /* - * The call targets are marked noinstr, but objtool can't figure - * that out because it's an indirect call. Annotate it. - */ - instrumentation_begin(); - machine_check_vector(regs); + do_machine_check(regs); - instrumentation_end(); irqentry_nmi_exit(regs, irq_state); } static __always_inline void exc_machine_check_user(struct pt_regs *regs) { irqentry_enter_from_user_mode(regs); - instrumentation_begin(); - machine_check_vector(regs); + do_machine_check(regs); - instrumentation_end(); irqentry_exit_to_user_mode(regs); } @@ -2120,7 +2131,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) return; } - machine_check_vector = do_machine_check; + mca_cfg.initialized = 1; __mcheck_cpu_init_early(c); __mcheck_cpu_init_generic(); @@ -2228,7 +2239,6 @@ int __init mcheck_init(void) mce_register_decode_chain(&early_nb); mce_register_decode_chain(&mce_uc_nb); mce_register_decode_chain(&mce_default_nb); - mcheck_vendor_init_severity(); INIT_WORK(&mce_work, mce_gen_pool_process); init_irq_work(&mce_irq_work, mce_irq_work_cb); @@ -2253,7 +2263,7 @@ static void mce_disable_error_reporting(void) struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(msr_ops.ctl(i), 0); + wrmsrl(mca_msr_reg(i, MCA_CTL), 0); } return; } @@ -2605,7 +2615,7 @@ static void mce_reenable_cpu(void) struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(msr_ops.ctl(i), b->ctl); + wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl); } } @@ -2754,7 +2764,6 @@ struct dentry *mce_get_debugfs_dir(void) static void mce_reset(void) { - cpu_missing = 0; atomic_set(&mce_fake_panicked, 0); atomic_set(&mce_executing, 0); atomic_set(&mce_callin, 0); diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 0bfc14041bbb..5fbd7ffb3233 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -74,7 +74,6 @@ MCE_INJECT_SET(status); MCE_INJECT_SET(misc); MCE_INJECT_SET(addr); MCE_INJECT_SET(synd); -MCE_INJECT_SET(ipid); #define MCE_INJECT_GET(reg) \ static int inj_##reg##_get(void *data, u64 *val) \ @@ -95,6 +94,20 @@ DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n"); DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n"); DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n"); DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n"); + +/* Use the user provided IPID value on a sw injection. */ +static int inj_ipid_set(void *data, u64 val) +{ + struct mce *m = (struct mce *)data; + + if (cpu_feature_enabled(X86_FEATURE_SMCA)) { + if (inj_type == SW_INJ) + m->ipid = val; + } + + return 0; +} + DEFINE_SIMPLE_ATTRIBUTE(ipid_fops, inj_ipid_get, inj_ipid_set, "%llx\n"); static void setup_inj_struct(struct mce *m) @@ -350,7 +363,7 @@ static ssize_t flags_write(struct file *filp, const char __user *ubuf, char buf[MAX_FLAG_OPT_SIZE], *__buf; int err; - if (cnt > MAX_FLAG_OPT_SIZE) + if (!cnt || cnt > MAX_FLAG_OPT_SIZE) return -EINVAL; if (copy_from_user(&buf, ubuf, cnt)) @@ -490,6 +503,8 @@ static void do_inject(void) i_mce.tsc = rdtsc_ordered(); + i_mce.status |= MCI_STATUS_VAL; + if (i_mce.misc) i_mce.status |= MCI_STATUS_MISCV; @@ -577,6 +592,33 @@ static int inj_bank_set(void *data, u64 val) } m->bank = val; + + /* + * sw-only injection allows to write arbitrary values into the MCA + * registers because it tests only the decoding paths. + */ + if (inj_type == SW_INJ) + goto inject; + + /* + * Read IPID value to determine if a bank is populated on the target + * CPU. + */ + if (cpu_feature_enabled(X86_FEATURE_SMCA)) { + u64 ipid; + + if (rdmsrl_on_cpu(m->extcpu, MSR_AMD64_SMCA_MCx_IPID(val), &ipid)) { + pr_err("Error reading IPID on CPU%d\n", m->extcpu); + return -EINVAL; + } + + if (!ipid) { + pr_err("Cannot inject into unpopulated bank %llu\n", val); + return -ENODEV; + } + } + +inject: do_inject(); /* Reset injection struct */ diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index acfd5d9f93c6..bb9a46a804bf 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -547,12 +547,13 @@ bool intel_filter_mce(struct mce *m) { struct cpuinfo_x86 *c = &boot_cpu_data; - /* MCE errata HSD131, HSM142, HSW131, BDM48, and HSM142 */ + /* MCE errata HSD131, HSM142, HSW131, BDM48, HSM142 and SKX37 */ if ((c->x86 == 6) && ((c->x86_model == INTEL_FAM6_HASWELL) || (c->x86_model == INTEL_FAM6_HASWELL_L) || (c->x86_model == INTEL_FAM6_BROADWELL) || - (c->x86_model == INTEL_FAM6_HASWELL_G)) && + (c->x86_model == INTEL_FAM6_HASWELL_G) || + (c->x86_model == INTEL_FAM6_SKYLAKE_X)) && (m->bank == 0) && ((m->status & 0xa0000000ffffffff) == 0x80000000000f0005)) return true; diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 88dcc79cfb07..52c633950b38 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -8,9 +8,6 @@ #include <linux/device.h> #include <asm/mce.h> -/* Pointer to the installed machine check handler for this CPU setup. */ -extern void (*machine_check_vector)(struct pt_regs *); - enum severity_level { MCE_NO_SEVERITY, MCE_DEFERRED_SEVERITY, @@ -38,8 +35,7 @@ int mce_gen_pool_add(struct mce *mce); int mce_gen_pool_init(void); struct llist_node *mce_gen_pool_prepare_records(void); -extern int (*mce_severity)(struct mce *a, struct pt_regs *regs, - int tolerant, char **msg, bool is_excp); +int mce_severity(struct mce *a, struct pt_regs *regs, int tolerant, char **msg, bool is_excp); struct dentry *mce_get_debugfs_dir(void); extern mce_banks_t mce_banks_ce_disabled; @@ -61,7 +57,7 @@ static inline void cmci_disable_bank(int bank) { } static inline void intel_init_cmci(void) { } static inline void intel_init_lmce(void) { } static inline void intel_clear_lmce(void) { } -static inline bool intel_filter_mce(struct mce *m) { return false; }; +static inline bool intel_filter_mce(struct mce *m) { return false; } #endif void mce_timer_kick(unsigned long interval); @@ -117,23 +113,25 @@ static inline void mce_unregister_injector_chain(struct notifier_block *nb) { } #endif struct mca_config { - bool dont_log_ce; - bool cmci_disabled; - bool ignore_ce; - bool print_all; - __u64 lmce_disabled : 1, disabled : 1, ser : 1, recovery : 1, bios_cmci_threshold : 1, - __reserved : 59; + /* Proper #MC exception handler is set */ + initialized : 1, + __reserved : 58; + + bool dont_log_ce; + bool cmci_disabled; + bool ignore_ce; + bool print_all; - s8 bootlog; int tolerant; int monarch_timeout; int panic_timeout; u32 rip_msr; + s8 bootlog; }; extern struct mca_config mca_cfg; @@ -163,19 +161,28 @@ struct mce_vendor_flags { /* AMD-style error thresholding banks present. */ amd_threshold : 1, - __reserved_0 : 60; + /* Pentium, family 5-style MCA */ + p5 : 1, + + /* Centaur Winchip C6-style MCA */ + winchip : 1, + + /* SandyBridge IFU quirk */ + snb_ifu_quirk : 1, + + __reserved_0 : 57; }; extern struct mce_vendor_flags mce_flags; -struct mca_msr_regs { - u32 (*ctl) (int bank); - u32 (*status) (int bank); - u32 (*addr) (int bank); - u32 (*misc) (int bank); +enum mca_msr { + MCA_CTL, + MCA_STATUS, + MCA_ADDR, + MCA_MISC, }; -extern struct mca_msr_regs msr_ops; +u32 mca_msr_reg(int bank, enum mca_msr reg); /* Decide whether to add MCE record to MCE event pool or filter it out. */ extern bool filter_mce(struct mce *m); @@ -183,17 +190,23 @@ extern bool filter_mce(struct mce *m); #ifdef CONFIG_X86_MCE_AMD extern bool amd_filter_mce(struct mce *m); #else -static inline bool amd_filter_mce(struct mce *m) { return false; }; +static inline bool amd_filter_mce(struct mce *m) { return false; } #endif -__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr); +#ifdef CONFIG_X86_ANCIENT_MCE +void intel_p5_mcheck_init(struct cpuinfo_x86 *c); +void winchip_mcheck_init(struct cpuinfo_x86 *c); +noinstr void pentium_machine_check(struct pt_regs *regs); +noinstr void winchip_machine_check(struct pt_regs *regs); +static inline void enable_p5_mce(void) { mce_p5_enabled = 1; } +#else +static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {} +static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {} +static inline void enable_p5_mce(void) {} +static inline void pentium_machine_check(struct pt_regs *regs) {} +static inline void winchip_machine_check(struct pt_regs *regs) {} +#endif -__visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr); +noinstr u64 mce_rdmsrl(u32 msr); #endif /* __X86_MCE_INTERNAL_H__ */ diff --git a/arch/x86/kernel/cpu/mce/p5.c b/arch/x86/kernel/cpu/mce/p5.c index 19e90cae8e97..2272ad53fc33 100644 --- a/arch/x86/kernel/cpu/mce/p5.c +++ b/arch/x86/kernel/cpu/mce/p5.c @@ -21,7 +21,7 @@ int mce_p5_enabled __read_mostly; /* Machine check handler for Pentium class Intel CPUs: */ -static noinstr void pentium_machine_check(struct pt_regs *regs) +noinstr void pentium_machine_check(struct pt_regs *regs) { u32 loaddr, hi, lotype; @@ -54,10 +54,6 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c) if (!cpu_has(c, X86_FEATURE_MCE)) return; - machine_check_vector = pentium_machine_check; - /* Make sure the vector pointer is visible before we enable MCEs: */ - wmb(); - /* Read registers before enabling: */ rdmsr(MSR_IA32_P5_MC_ADDR, l, h); rdmsr(MSR_IA32_P5_MC_TYPE, l, h); diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index 17e631443116..7aa2bda93cbb 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -222,6 +222,9 @@ static bool is_copy_from_user(struct pt_regs *regs) struct insn insn; int ret; + if (!regs) + return false; + if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip, MAX_INSN_SIZE)) return false; @@ -263,33 +266,44 @@ static bool is_copy_from_user(struct pt_regs *regs) * distinguish an exception taken in user from from one * taken in the kernel. */ -static int error_context(struct mce *m, struct pt_regs *regs) +static noinstr int error_context(struct mce *m, struct pt_regs *regs) { - enum handler_type t; + int fixup_type; + bool copy_user; if ((m->cs & 3) == 3) return IN_USER; + if (!mc_recoverable(m->mcgstatus)) return IN_KERNEL; - t = ex_get_fault_handler_type(m->ip); - if (t == EX_HANDLER_FAULT) { - m->kflags |= MCE_IN_KERNEL_RECOV; - return IN_KERNEL_RECOV; - } - if (t == EX_HANDLER_UACCESS && regs && is_copy_from_user(regs)) { - m->kflags |= MCE_IN_KERNEL_RECOV; + /* Allow instrumentation around external facilities usage. */ + instrumentation_begin(); + fixup_type = ex_get_fixup_type(m->ip); + copy_user = is_copy_from_user(regs); + instrumentation_end(); + + switch (fixup_type) { + case EX_TYPE_UACCESS: + case EX_TYPE_COPY: + if (!copy_user) + return IN_KERNEL; m->kflags |= MCE_IN_KERNEL_COPYIN; + fallthrough; + + case EX_TYPE_FAULT_MCE_SAFE: + case EX_TYPE_DEFAULT_MCE_SAFE: + m->kflags |= MCE_IN_KERNEL_RECOV; return IN_KERNEL_RECOV; - } - return IN_KERNEL; + default: + return IN_KERNEL; + } } static int mce_severity_amd_smca(struct mce *m, enum context err_ctx) { - u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank); - u32 low, high; + u64 mcx_cfg; /* * We need to look at the following bits: @@ -300,11 +314,10 @@ static int mce_severity_amd_smca(struct mce *m, enum context err_ctx) if (!mce_flags.succor) return MCE_PANIC_SEVERITY; - if (rdmsr_safe(addr, &low, &high)) - return MCE_PANIC_SEVERITY; + mcx_cfg = mce_rdmsrl(MSR_AMD64_SMCA_MCx_CONFIG(m->bank)); /* TCC (Task context corrupt). If set and if IN_KERNEL, panic. */ - if ((low & MCI_CONFIG_MCAX) && + if ((mcx_cfg & MCI_CONFIG_MCAX) && (m->status & MCI_STATUS_TCC) && (err_ctx == IN_KERNEL)) return MCE_PANIC_SEVERITY; @@ -317,8 +330,8 @@ static int mce_severity_amd_smca(struct mce *m, enum context err_ctx) * See AMD Error Scope Hierarchy table in a newer BKDG. For example * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features" */ -static int mce_severity_amd(struct mce *m, struct pt_regs *regs, int tolerant, - char **msg, bool is_excp) +static noinstr int mce_severity_amd(struct mce *m, struct pt_regs *regs, int tolerant, + char **msg, bool is_excp) { enum context ctx = error_context(m, regs); @@ -370,8 +383,8 @@ static int mce_severity_amd(struct mce *m, struct pt_regs *regs, int tolerant, return MCE_KEEP_SEVERITY; } -static int mce_severity_intel(struct mce *m, struct pt_regs *regs, - int tolerant, char **msg, bool is_excp) +static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs, + int tolerant, char **msg, bool is_excp) { enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP); enum context ctx = error_context(m, regs); @@ -407,15 +420,14 @@ static int mce_severity_intel(struct mce *m, struct pt_regs *regs, } } -/* Default to mce_severity_intel */ -int (*mce_severity)(struct mce *m, struct pt_regs *regs, int tolerant, char **msg, bool is_excp) = - mce_severity_intel; - -void __init mcheck_vendor_init_severity(void) +int noinstr mce_severity(struct mce *m, struct pt_regs *regs, int tolerant, char **msg, + bool is_excp) { if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) - mce_severity = mce_severity_amd; + return mce_severity_amd(m, regs, tolerant, msg, is_excp); + else + return mce_severity_intel(m, regs, tolerant, msg, is_excp); } #ifdef CONFIG_DEBUG_FS diff --git a/arch/x86/kernel/cpu/mce/winchip.c b/arch/x86/kernel/cpu/mce/winchip.c index 9c9f0abd2d7f..6c99f2941909 100644 --- a/arch/x86/kernel/cpu/mce/winchip.c +++ b/arch/x86/kernel/cpu/mce/winchip.c @@ -17,7 +17,7 @@ #include "internal.h" /* Machine check handler for WinChip C6: */ -static noinstr void winchip_machine_check(struct pt_regs *regs) +noinstr void winchip_machine_check(struct pt_regs *regs) { instrumentation_begin(); pr_emerg("CPU0: Machine Check Exception.\n"); @@ -30,10 +30,6 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c) { u32 lo, hi; - machine_check_vector = winchip_machine_check; - /* Make sure the vector pointer is visible before we enable MCEs: */ - wmb(); - rdmsr(MSR_IDT_FCR1, lo, hi); lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */ lo &= ~(1<<4); /* Enable MCE */ diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 3d4a48336084..8b2fcdfa6d31 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -456,17 +456,23 @@ apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_p static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family) { -#ifdef CONFIG_X86_64 char fw_name[36] = "amd-ucode/microcode_amd.bin"; + struct firmware fw; + + if (IS_ENABLED(CONFIG_X86_32)) + return false; if (family >= 0x15) snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", family); - return get_builtin_firmware(cp, fw_name); -#else + if (firmware_request_builtin(&fw, fw_name)) { + cp->size = fw.size; + cp->data = (void *)fw.data; + return true; + } + return false; -#endif } static void __load_ucode_amd(unsigned int cpuid_1_eax, struct cpio_data *ret) diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index efb69be41ab1..f955d25076ba 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -140,23 +140,6 @@ static bool __init check_loader_disabled_bsp(void) return *res; } -extern struct builtin_fw __start_builtin_fw[]; -extern struct builtin_fw __end_builtin_fw[]; - -bool get_builtin_firmware(struct cpio_data *cd, const char *name) -{ - struct builtin_fw *b_fw; - - for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) { - if (!strcmp(name, b_fw->name)) { - cd->size = b_fw->size; - cd->data = b_fw->data; - return true; - } - } - return false; -} - void __init load_ucode_bsp(void) { unsigned int cpuid_1_eax; diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 7e8e07bddd5f..d28a9f8f3fec 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -456,6 +456,7 @@ static void save_mc_for_early(struct ucode_cpu_info *uci, u8 *mc, unsigned int s static bool load_builtin_intel_microcode(struct cpio_data *cp) { unsigned int eax = 1, ebx, ecx = 0, edx; + struct firmware fw; char name[30]; if (IS_ENABLED(CONFIG_X86_32)) @@ -466,7 +467,13 @@ static bool load_builtin_intel_microcode(struct cpio_data *cp) sprintf(name, "intel-ucode/%02x-%02x-%02x", x86_family(eax), x86_model(eax), x86_stepping(eax)); - return get_builtin_firmware(cp, name); + if (firmware_request_builtin(&fw, name)) { + cp->size = fw.size; + cp->data = (void *)fw.data; + return true; + } + + return false; } /* diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index e095c28d27ae..5a99f993e639 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -18,6 +18,7 @@ #include <linux/kexec.h> #include <linux/i8253.h> #include <linux/random.h> +#include <linux/swiotlb.h> #include <asm/processor.h> #include <asm/hypervisor.h> #include <asm/hyperv-tlfs.h> @@ -79,7 +80,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0) inc_irq_stat(hyperv_stimer0_count); if (hv_stimer0_handler) hv_stimer0_handler(); - add_interrupt_randomness(HYPERV_STIMER0_VECTOR, 0); + add_interrupt_randomness(HYPERV_STIMER0_VECTOR); ack_APIC_irq(); set_irq_regs(old_regs); @@ -163,12 +164,22 @@ static uint32_t __init ms_hyperv_platform(void) cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS, &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]); - if (eax >= HYPERV_CPUID_MIN && - eax <= HYPERV_CPUID_MAX && - !memcmp("Microsoft Hv", hyp_signature, 12)) - return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS; + if (eax < HYPERV_CPUID_MIN || eax > HYPERV_CPUID_MAX || + memcmp("Microsoft Hv", hyp_signature, 12)) + return 0; - return 0; + /* HYPERCALL and VP_INDEX MSRs are mandatory for all features. */ + eax = cpuid_eax(HYPERV_CPUID_FEATURES); + if (!(eax & HV_MSR_HYPERCALL_AVAILABLE)) { + pr_warn("x86/hyperv: HYPERCALL MSR not available.\n"); + return 0; + } + if (!(eax & HV_MSR_VP_INDEX_AVAILABLE)) { + pr_warn("x86/hyperv: VP_INDEX MSR not available.\n"); + return 0; + } + + return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS; } static unsigned char hv_get_nmi_reason(void) @@ -313,9 +324,26 @@ static void __init ms_hyperv_init_platform(void) if (ms_hyperv.priv_high & HV_ISOLATION) { ms_hyperv.isolation_config_a = cpuid_eax(HYPERV_CPUID_ISOLATION_CONFIG); ms_hyperv.isolation_config_b = cpuid_ebx(HYPERV_CPUID_ISOLATION_CONFIG); + ms_hyperv.shared_gpa_boundary = + BIT_ULL(ms_hyperv.shared_gpa_boundary_bits); pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n", ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b); + + if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) { + static_branch_enable(&isolation_type_snp); +#ifdef CONFIG_SWIOTLB + swiotlb_unencrypted_base = ms_hyperv.shared_gpa_boundary; +#endif + } + +#ifdef CONFIG_SWIOTLB + /* + * Enable swiotlb force mode in Isolation VM to + * use swiotlb bounce buffer for dma transaction. + */ + swiotlb_force = SWIOTLB_FORCE; +#endif } if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) { diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 4b8813bafffd..bb1c3f5f60c8 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -527,12 +527,14 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) rdt_domain_reconfigure_cdp(r); if (r->alloc_capable && domain_setup_ctrlval(r, d)) { - kfree(d); + kfree(hw_dom); return; } if (r->mon_capable && domain_setup_mon_state(r, d)) { - kfree(d); + kfree(hw_dom->ctrl_val); + kfree(hw_dom->mbps_val); + kfree(hw_dom); return; } diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index c9f0f3d63f75..eaf25a234ff5 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -282,7 +282,7 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) u64 shift = 64 - width, chunks; chunks = (cur_msr << shift) - (prev_msr << shift); - return chunks >>= shift; + return chunks >> shift; } static u64 __mon_event_count(u32 rmid, struct rmid_read *rr) diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h index 9b204843b78d..fa04a73daf9c 100644 --- a/arch/x86/kernel/cpu/sgx/encls.h +++ b/arch/x86/kernel/cpu/sgx/encls.h @@ -11,26 +11,8 @@ #include <asm/traps.h> #include "sgx.h" -/** - * ENCLS_FAULT_FLAG - flag signifying an ENCLS return code is a trapnr - * - * ENCLS has its own (positive value) error codes and also generates - * ENCLS specific #GP and #PF faults. And the ENCLS values get munged - * with system error codes as everything percolates back up the stack. - * Unfortunately (for us), we need to precisely identify each unique - * error code, e.g. the action taken if EWB fails varies based on the - * type of fault and on the exact SGX error code, i.e. we can't simply - * convert all faults to -EFAULT. - * - * To make all three error types coexist, we set bit 30 to identify an - * ENCLS fault. Bit 31 (technically bits N:31) is used to differentiate - * between positive (faults and SGX error codes) and negative (system - * error codes) values. - */ -#define ENCLS_FAULT_FLAG 0x40000000 - /* Retrieve the encoded trapnr from the specified return code. */ -#define ENCLS_TRAPNR(r) ((r) & ~ENCLS_FAULT_FLAG) +#define ENCLS_TRAPNR(r) ((r) & ~SGX_ENCLS_FAULT_FLAG) /* Issue a WARN() about an ENCLS function. */ #define ENCLS_WARN(r, name) { \ @@ -50,7 +32,7 @@ */ static inline bool encls_faulted(int ret) { - return ret & ENCLS_FAULT_FLAG; + return ret & SGX_ENCLS_FAULT_FLAG; } /** @@ -88,11 +70,7 @@ static inline bool encls_failed(int ret) asm volatile( \ "1: .byte 0x0f, 0x01, 0xcf;\n\t" \ "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: orl $"__stringify(ENCLS_FAULT_FLAG)",%%eax\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE_FAULT(1b, 3b) \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_SGX) \ : "=a"(ret) \ : "a"(rax), inputs \ : "memory", "cc"); \ @@ -127,7 +105,7 @@ static inline bool encls_failed(int ret) * * Return: * 0 on success, - * trapnr with ENCLS_FAULT_FLAG set on fault + * trapnr with SGX_ENCLS_FAULT_FLAG set on fault */ #define __encls_N(rax, rbx_out, inputs...) \ ({ \ @@ -136,11 +114,7 @@ static inline bool encls_failed(int ret) "1: .byte 0x0f, 0x01, 0xcf;\n\t" \ " xor %%eax,%%eax;\n" \ "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: orl $"__stringify(ENCLS_FAULT_FLAG)",%%eax\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE_FAULT(1b, 3b) \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_SGX) \ : "=a"(ret), "=b"(rbx_out) \ : "a"(rax), inputs \ : "memory"); \ diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 63d3de02bbcc..4b41efc9e367 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -6,11 +6,13 @@ #include <linux/highmem.h> #include <linux/kthread.h> #include <linux/miscdevice.h> +#include <linux/node.h> #include <linux/pagemap.h> #include <linux/ratelimit.h> #include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/slab.h> +#include <linux/sysfs.h> #include <asm/sgx.h> #include "driver.h" #include "encl.h" @@ -20,6 +22,7 @@ struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS]; static int sgx_nr_epc_sections; static struct task_struct *ksgxd_tsk; static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq); +static DEFINE_XARRAY(sgx_epc_address_space); /* * These variables are part of the state of the reclaimer, and must be accessed @@ -28,8 +31,7 @@ static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq); static LIST_HEAD(sgx_active_page_list); static DEFINE_SPINLOCK(sgx_reclaimer_lock); -/* The free page list lock protected variables prepend the lock. */ -static unsigned long sgx_nr_free_pages; +static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0); /* Nodes with one or more EPC sections. */ static nodemask_t sgx_numa_mask; @@ -61,6 +63,24 @@ static void __sgx_sanitize_pages(struct list_head *dirty_page_list) page = list_first_entry(dirty_page_list, struct sgx_epc_page, list); + /* + * Checking page->poison without holding the node->lock + * is racy, but losing the race (i.e. poison is set just + * after the check) just means __eremove() will be uselessly + * called for a page that sgx_free_epc_page() will put onto + * the node->sgx_poison_page_list later. + */ + if (page->poison) { + struct sgx_epc_section *section = &sgx_epc_sections[page->section]; + struct sgx_numa_node *node = section->node; + + spin_lock(&node->lock); + list_move(&page->list, &node->sgx_poison_page_list); + spin_unlock(&node->lock); + + continue; + } + ret = __eremove(sgx_get_epc_virt_addr(page)); if (!ret) { /* @@ -403,14 +423,15 @@ skip: spin_lock(&node->lock); list_add_tail(&epc_page->list, &node->free_page_list); - sgx_nr_free_pages++; spin_unlock(&node->lock); + atomic_long_inc(&sgx_nr_free_pages); } } static bool sgx_should_reclaim(unsigned long watermark) { - return sgx_nr_free_pages < watermark && !list_empty(&sgx_active_page_list); + return atomic_long_read(&sgx_nr_free_pages) < watermark && + !list_empty(&sgx_active_page_list); } static int ksgxd(void *p) @@ -471,9 +492,10 @@ static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid) page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list); list_del_init(&page->list); - sgx_nr_free_pages--; + page->flags = 0; spin_unlock(&node->lock); + atomic_long_dec(&sgx_nr_free_pages); return page; } @@ -624,10 +646,15 @@ void sgx_free_epc_page(struct sgx_epc_page *page) spin_lock(&node->lock); - list_add_tail(&page->list, &node->free_page_list); - sgx_nr_free_pages++; + page->owner = NULL; + if (page->poison) + list_add(&page->list, &node->sgx_poison_page_list); + else + list_add_tail(&page->list, &node->free_page_list); + page->flags = SGX_EPC_PAGE_IS_FREE; spin_unlock(&node->lock); + atomic_long_inc(&sgx_nr_free_pages); } static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, @@ -648,17 +675,102 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, } section->phys_addr = phys_addr; + xa_store_range(&sgx_epc_address_space, section->phys_addr, + phys_addr + size - 1, section, GFP_KERNEL); for (i = 0; i < nr_pages; i++) { section->pages[i].section = index; section->pages[i].flags = 0; section->pages[i].owner = NULL; + section->pages[i].poison = 0; list_add_tail(§ion->pages[i].list, &sgx_dirty_page_list); } return true; } +bool arch_is_platform_page(u64 paddr) +{ + return !!xa_load(&sgx_epc_address_space, paddr); +} +EXPORT_SYMBOL_GPL(arch_is_platform_page); + +static struct sgx_epc_page *sgx_paddr_to_page(u64 paddr) +{ + struct sgx_epc_section *section; + + section = xa_load(&sgx_epc_address_space, paddr); + if (!section) + return NULL; + + return §ion->pages[PFN_DOWN(paddr - section->phys_addr)]; +} + +/* + * Called in process context to handle a hardware reported + * error in an SGX EPC page. + * If the MF_ACTION_REQUIRED bit is set in flags, then the + * context is the task that consumed the poison data. Otherwise + * this is called from a kernel thread unrelated to the page. + */ +int arch_memory_failure(unsigned long pfn, int flags) +{ + struct sgx_epc_page *page = sgx_paddr_to_page(pfn << PAGE_SHIFT); + struct sgx_epc_section *section; + struct sgx_numa_node *node; + + /* + * mm/memory-failure.c calls this routine for all errors + * where there isn't a "struct page" for the address. But that + * includes other address ranges besides SGX. + */ + if (!page) + return -ENXIO; + + /* + * If poison was consumed synchronously. Send a SIGBUS to + * the task. Hardware has already exited the SGX enclave and + * will not allow re-entry to an enclave that has a memory + * error. The signal may help the task understand why the + * enclave is broken. + */ + if (flags & MF_ACTION_REQUIRED) + force_sig(SIGBUS); + + section = &sgx_epc_sections[page->section]; + node = section->node; + + spin_lock(&node->lock); + + /* Already poisoned? Nothing more to do */ + if (page->poison) + goto out; + + page->poison = 1; + + /* + * If the page is on a free list, move it to the per-node + * poison page list. + */ + if (page->flags & SGX_EPC_PAGE_IS_FREE) { + list_move(&page->list, &node->sgx_poison_page_list); + goto out; + } + + /* + * TBD: Add additional plumbing to enable pre-emptive + * action for asynchronous poison notification. Until + * then just hope that the poison: + * a) is not accessed - sgx_free_epc_page() will deal with it + * when the user gives it back + * b) results in a recoverable machine check rather than + * a fatal one + */ +out: + spin_unlock(&node->lock); + return 0; +} + /** * A section metric is concatenated in a way that @low bits 12-31 define the * bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the @@ -670,6 +782,48 @@ static inline u64 __init sgx_calc_section_metric(u64 low, u64 high) ((high & GENMASK_ULL(19, 0)) << 32); } +#ifdef CONFIG_NUMA +static ssize_t sgx_total_bytes_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lu\n", sgx_numa_nodes[dev->id].size); +} +static DEVICE_ATTR_RO(sgx_total_bytes); + +static umode_t arch_node_attr_is_visible(struct kobject *kobj, + struct attribute *attr, int idx) +{ + /* Make all x86/ attributes invisible when SGX is not initialized: */ + if (nodes_empty(sgx_numa_mask)) + return 0; + + return attr->mode; +} + +static struct attribute *arch_node_dev_attrs[] = { + &dev_attr_sgx_total_bytes.attr, + NULL, +}; + +const struct attribute_group arch_node_dev_group = { + .name = "x86", + .attrs = arch_node_dev_attrs, + .is_visible = arch_node_attr_is_visible, +}; + +static void __init arch_update_sysfs_visibility(int nid) +{ + struct node *node = node_devices[nid]; + int ret; + + ret = sysfs_update_group(&node->dev.kobj, &arch_node_dev_group); + + if (ret) + pr_err("sysfs update failed (%d), files may be invisible", ret); +} +#else /* !CONFIG_NUMA */ +static void __init arch_update_sysfs_visibility(int nid) {} +#endif + static bool __init sgx_page_cache_init(void) { u32 eax, ebx, ecx, edx, type; @@ -713,10 +867,16 @@ static bool __init sgx_page_cache_init(void) if (!node_isset(nid, sgx_numa_mask)) { spin_lock_init(&sgx_numa_nodes[nid].lock); INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list); + INIT_LIST_HEAD(&sgx_numa_nodes[nid].sgx_poison_page_list); node_set(nid, sgx_numa_mask); + sgx_numa_nodes[nid].size = 0; + + /* Make SGX-specific node sysfs files visible: */ + arch_update_sysfs_visibility(nid); } sgx_epc_sections[i].node = &sgx_numa_nodes[nid]; + sgx_numa_nodes[nid].size += size; sgx_nr_epc_sections++; } diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index 4628acec0009..0f17def9fe6f 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -26,9 +26,13 @@ /* Pages, which are being tracked by the page reclaimer. */ #define SGX_EPC_PAGE_RECLAIMER_TRACKED BIT(0) +/* Pages on free list */ +#define SGX_EPC_PAGE_IS_FREE BIT(1) + struct sgx_epc_page { unsigned int section; - unsigned int flags; + u16 flags; + u16 poison; struct sgx_encl_page *owner; struct list_head list; }; @@ -39,6 +43,8 @@ struct sgx_epc_page { */ struct sgx_numa_node { struct list_head free_page_list; + struct list_head sgx_poison_page_list; + unsigned long size; spinlock_t lock; }; diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c index 64511c4a5200..6a77a14eee38 100644 --- a/arch/x86/kernel/cpu/sgx/virt.c +++ b/arch/x86/kernel/cpu/sgx/virt.c @@ -111,10 +111,8 @@ static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -static int sgx_vepc_free_page(struct sgx_epc_page *epc_page) +static int sgx_vepc_remove_page(struct sgx_epc_page *epc_page) { - int ret; - /* * Take a previously guest-owned EPC page and return it to the * general EPC page pool. @@ -124,7 +122,12 @@ static int sgx_vepc_free_page(struct sgx_epc_page *epc_page) * case that a guest properly EREMOVE'd this page, a superfluous * EREMOVE is harmless. */ - ret = __eremove(sgx_get_epc_virt_addr(epc_page)); + return __eremove(sgx_get_epc_virt_addr(epc_page)); +} + +static int sgx_vepc_free_page(struct sgx_epc_page *epc_page) +{ + int ret = sgx_vepc_remove_page(epc_page); if (ret) { /* * Only SGX_CHILD_PRESENT is expected, which is because of @@ -144,10 +147,44 @@ static int sgx_vepc_free_page(struct sgx_epc_page *epc_page) } sgx_free_epc_page(epc_page); - return 0; } +static long sgx_vepc_remove_all(struct sgx_vepc *vepc) +{ + struct sgx_epc_page *entry; + unsigned long index; + long failures = 0; + + xa_for_each(&vepc->page_array, index, entry) { + int ret = sgx_vepc_remove_page(entry); + if (ret) { + if (ret == SGX_CHILD_PRESENT) { + /* The page is a SECS, userspace will retry. */ + failures++; + } else { + /* + * Report errors due to #GP or SGX_ENCLAVE_ACT; do not + * WARN, as userspace can induce said failures by + * calling the ioctl concurrently on multiple vEPCs or + * while one or more CPUs is running the enclave. Only + * a #PF on EREMOVE indicates a kernel/hardware issue. + */ + WARN_ON_ONCE(encls_faulted(ret) && + ENCLS_TRAPNR(ret) != X86_TRAP_GP); + return -EBUSY; + } + } + cond_resched(); + } + + /* + * Return the number of SECS pages that failed to be removed, so + * userspace knows that it has to retry. + */ + return failures; +} + static int sgx_vepc_release(struct inode *inode, struct file *file) { struct sgx_vepc *vepc = file->private_data; @@ -233,9 +270,27 @@ static int sgx_vepc_open(struct inode *inode, struct file *file) return 0; } +static long sgx_vepc_ioctl(struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct sgx_vepc *vepc = file->private_data; + + switch (cmd) { + case SGX_IOC_VEPC_REMOVE_ALL: + if (arg) + return -EINVAL; + return sgx_vepc_remove_all(vepc); + + default: + return -ENOTTY; + } +} + static const struct file_operations sgx_vepc_fops = { .owner = THIS_MODULE, .open = sgx_vepc_open, + .unlocked_ioctl = sgx_vepc_ioctl, + .compat_ioctl = sgx_vepc_ioctl, .release = sgx_vepc_release, .mmap = sgx_vepc_mmap, }; diff --git a/arch/x86/kernel/cpu/vortex.c b/arch/x86/kernel/cpu/vortex.c new file mode 100644 index 000000000000..e2685470ba94 --- /dev/null +++ b/arch/x86/kernel/cpu/vortex.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <asm/processor.h> +#include "cpu.h" + +/* + * No special init required for Vortex processors. + */ + +static const struct cpu_dev vortex_cpu_dev = { + .c_vendor = "Vortex", + .c_ident = { "Vortex86 SoC" }, + .legacy_models = { + { + .family = 5, + .model_names = { + [2] = "Vortex86DX", + [8] = "Vortex86MX", + }, + }, + { + .family = 6, + .model_names = { + /* + * Both the Vortex86EX and the Vortex86EX2 + * have the same family and model id. + * + * However, the -EX2 supports the product name + * CPUID call, so this name will only be used + * for the -EX, which does not. + */ + [0] = "Vortex86EX", + }, + }, + }, + .c_x86_vendor = X86_VENDOR_VORTEX, +}; + +cpu_dev_register(vortex_cpu_dev); diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index 045e82e8945b..a7f617a3981d 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -10,6 +10,7 @@ #include <linux/crash_dump.h> #include <linux/uaccess.h> #include <linux/io.h> +#include <linux/cc_platform.h> static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, unsigned long offset, int userbuf, @@ -73,5 +74,6 @@ ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize, ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos) { - return read_from_oldmem(buf, count, ppos, 0, sev_active()); + return read_from_oldmem(buf, count, ppos, 0, + cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)); } diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 6a4cb71c2498..5cd51f25f446 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -31,11 +31,6 @@ char __initdata cmd_line[COMMAND_LINE_SIZE]; int __initdata of_ioapic; -void __init early_init_dt_scan_chosen_arch(unsigned long node) -{ - BUG(); -} - void __init early_init_dt_add_memory_arch(u64 base, u64 size) { BUG(); @@ -139,12 +134,11 @@ static void __init dtb_cpu_setup(void) { struct device_node *dn; u32 apic_id, version; - int ret; version = GET_APIC_VERSION(apic_read(APIC_LVR)); for_each_of_cpu_node(dn) { - ret = of_property_read_u32(dn, "reg", &apic_id); - if (ret < 0) { + apic_id = of_get_cpu_hwid(dn, 0); + if (apic_id == ~0U) { pr_warn("%pOF: missing local APIC ID\n", dn); continue; } diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c index d1d49e3d536b..3b58d8703094 100644 --- a/arch/x86/kernel/doublefault_32.c +++ b/arch/x86/kernel/doublefault_32.c @@ -77,9 +77,6 @@ asmlinkage noinstr void __noreturn doublefault_shim(void) * some way to reconstruct CR3. We could make a credible guess based * on cpu_tlbstate, but that would be racy and would not account for * PTI. - * - * Instead, don't bother. We can return through - * rewind_stack_do_exit() instead. */ panic("cannot return from double fault\n"); } diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 5601b95944fa..6c5defd6569a 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -32,9 +32,15 @@ const char *stack_type_name(enum stack_type type) { BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); + if (type == STACK_TYPE_TASK) + return "TASK"; + if (type == STACK_TYPE_IRQ) return "IRQ"; + if (type == STACK_TYPE_SOFTIRQ) + return "SOFTIRQ"; + if (type == STACK_TYPE_ENTRY) { /* * On 64-bit, we have a generic entry stack that we diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 38837dad46e6..fd2d3ab38ebb 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -554,6 +554,7 @@ static const struct pci_device_id intel_early_ids[] __initconst = { INTEL_RKL_IDS(&gen11_early_ops), INTEL_ADLS_IDS(&gen11_early_ops), INTEL_ADLP_IDS(&gen11_early_ops), + INTEL_RPLS_IDS(&gen11_early_ops), }; struct resource intel_graphics_stolen_res __ro_after_init = DEFINE_RES_MEM(0, 0); @@ -714,12 +715,6 @@ static struct chipset early_qrk[] __initdata = { */ { PCI_VENDOR_ID_INTEL, 0x0f00, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, - { PCI_VENDOR_ID_INTEL, 0x3e20, - PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, - { PCI_VENDOR_ID_INTEL, 0x3ec4, - PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, - { PCI_VENDOR_ID_INTEL, 0x8a12, - PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, { PCI_VENDOR_ID_BROADCOM, 0x4331, PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset}, {} diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c index 2954fab15e51..794e70151203 100644 --- a/arch/x86/kernel/fpu/bugs.c +++ b/arch/x86/kernel/fpu/bugs.c @@ -2,7 +2,7 @@ /* * x86 FPU bug checks: */ -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> /* * Boot time CPU/FPU FDIV bug detection code: diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h new file mode 100644 index 000000000000..958accf2ccf0 --- /dev/null +++ b/arch/x86/kernel/fpu/context.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_KERNEL_FPU_CONTEXT_H +#define __X86_KERNEL_FPU_CONTEXT_H + +#include <asm/fpu/xstate.h> +#include <asm/trace/fpu.h> + +/* Functions related to FPU context tracking */ + +/* + * The in-register FPU state for an FPU context on a CPU is assumed to be + * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx + * matches the FPU. + * + * If the FPU register state is valid, the kernel can skip restoring the + * FPU state from memory. + * + * Any code that clobbers the FPU registers or updates the in-memory + * FPU state for a task MUST let the rest of the kernel know that the + * FPU registers are no longer valid for this task. + * + * Either one of these invalidation functions is enough. Invalidate + * a resource you control: CPU if using the CPU for something else + * (with preemption disabled), FPU for the current task, or a task that + * is prevented from running by the current task. + */ +static inline void __cpu_invalidate_fpregs_state(void) +{ + __this_cpu_write(fpu_fpregs_owner_ctx, NULL); +} + +static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu) +{ + fpu->last_cpu = -1; +} + +static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) +{ + return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; +} + +static inline void fpregs_deactivate(struct fpu *fpu) +{ + __this_cpu_write(fpu_fpregs_owner_ctx, NULL); + trace_x86_fpu_regs_deactivated(fpu); +} + +static inline void fpregs_activate(struct fpu *fpu) +{ + __this_cpu_write(fpu_fpregs_owner_ctx, fpu); + trace_x86_fpu_regs_activated(fpu); +} + +/* Internal helper for switch_fpu_return() and signal frame setup */ +static inline void fpregs_restore_userregs(void) +{ + struct fpu *fpu = ¤t->thread.fpu; + int cpu = smp_processor_id(); + + if (WARN_ON_ONCE(current->flags & PF_KTHREAD)) + return; + + if (!fpregs_state_valid(fpu, cpu)) { + /* + * This restores _all_ xstate which has not been + * established yet. + * + * If PKRU is enabled, then the PKRU value is already + * correct because it was either set in switch_to() or in + * flush_thread(). So it is excluded because it might be + * not up to date in current->thread.fpu.xsave state. + * + * XFD state is handled in restore_fpregs_from_fpstate(). + */ + restore_fpregs_from_fpstate(fpu->fpstate, XFEATURE_MASK_FPSTATE); + + fpregs_activate(fpu); + fpu->last_cpu = cpu; + } + clear_thread_flag(TIF_NEED_FPU_LOAD); +} + +#endif diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 7ada7bd03a32..8dea01ffc5c1 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -6,8 +6,9 @@ * General FPU state handling cleanups * Gareth Hughes <gareth@valinux.com>, May 2000 */ -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> #include <asm/fpu/regset.h> +#include <asm/fpu/sched.h> #include <asm/fpu/signal.h> #include <asm/fpu/types.h> #include <asm/traps.h> @@ -15,15 +16,30 @@ #include <linux/hardirq.h> #include <linux/pkeys.h> +#include <linux/vmalloc.h> + +#include "context.h" +#include "internal.h" +#include "legacy.h" +#include "xstate.h" #define CREATE_TRACE_POINTS #include <asm/trace/fpu.h> +#ifdef CONFIG_X86_64 +DEFINE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); +DEFINE_PER_CPU(u64, xfd_state); +#endif + +/* The FPU state configuration data for kernel and user space */ +struct fpu_state_config fpu_kernel_cfg __ro_after_init; +struct fpu_state_config fpu_user_cfg __ro_after_init; + /* * Represents the initial FPU state. It's mostly (but not completely) zeroes, * depending on the FPU hardware format: */ -union fpregs_state init_fpstate __ro_after_init; +struct fpstate init_fpstate __ro_after_init; /* * Track whether the kernel is using the FPU state @@ -83,7 +99,20 @@ bool irq_fpu_usable(void) EXPORT_SYMBOL(irq_fpu_usable); /* - * Save the FPU register state in fpu->state. The register state is + * Track AVX512 state use because it is known to slow the max clock + * speed of the core. + */ +static void update_avx_timestamp(struct fpu *fpu) +{ + +#define AVX512_TRACKING_MASK (XFEATURE_MASK_ZMM_Hi256 | XFEATURE_MASK_Hi16_ZMM) + + if (fpu->fpstate->regs.xsave.header.xfeatures & AVX512_TRACKING_MASK) + fpu->avx512_timestamp = jiffies; +} + +/* + * Save the FPU register state in fpu->fpstate->regs. The register state is * preserved. * * Must be called with fpregs_lock() held. @@ -99,19 +128,13 @@ EXPORT_SYMBOL(irq_fpu_usable); void save_fpregs_to_fpstate(struct fpu *fpu) { if (likely(use_xsave())) { - os_xsave(&fpu->state.xsave); - - /* - * AVX512 state is tracked here because its use is - * known to slow the max clock speed of the core. - */ - if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512) - fpu->avx512_timestamp = jiffies; + os_xsave(fpu->fpstate); + update_avx_timestamp(fpu); return; } if (likely(use_fxsr())) { - fxsave(&fpu->state.fxsave); + fxsave(&fpu->fpstate->regs.fxsave); return; } @@ -119,12 +142,11 @@ void save_fpregs_to_fpstate(struct fpu *fpu) * Legacy FPU register saving, FNSAVE always clears FPU registers, * so we have to reload them from the memory state. */ - asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave)); - frstor(&fpu->state.fsave); + asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->fpstate->regs.fsave)); + frstor(&fpu->fpstate->regs.fsave); } -EXPORT_SYMBOL(save_fpregs_to_fpstate); -void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask) +void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask) { /* * AMD K7/K8 and later CPUs up to Zen don't save/restore @@ -141,15 +163,265 @@ void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask) } if (use_xsave()) { - os_xrstor(&fpstate->xsave, mask); + /* + * Dynamically enabled features are enabled in XCR0, but + * usage requires also that the corresponding bits in XFD + * are cleared. If the bits are set then using a related + * instruction will raise #NM. This allows to do the + * allocation of the larger FPU buffer lazy from #NM or if + * the task has no permission to kill it which would happen + * via #UD if the feature is disabled in XCR0. + * + * XFD state is following the same life time rules as + * XSTATE and to restore state correctly XFD has to be + * updated before XRSTORS otherwise the component would + * stay in or go into init state even if the bits are set + * in fpstate::regs::xsave::xfeatures. + */ + xfd_update_state(fpstate); + + /* + * Restoring state always needs to modify all features + * which are in @mask even if the current task cannot use + * extended features. + * + * So fpstate->xfeatures cannot be used here, because then + * a feature for which the task has no permission but was + * used by the previous task would not go into init state. + */ + mask = fpu_kernel_cfg.max_features & mask; + + os_xrstor(fpstate, mask); } else { if (use_fxsr()) - fxrstor(&fpstate->fxsave); + fxrstor(&fpstate->regs.fxsave); else - frstor(&fpstate->fsave); + frstor(&fpstate->regs.fsave); + } +} + +void fpu_reset_from_exception_fixup(void) +{ + restore_fpregs_from_fpstate(&init_fpstate, XFEATURE_MASK_FPSTATE); +} + +#if IS_ENABLED(CONFIG_KVM) +static void __fpstate_reset(struct fpstate *fpstate, u64 xfd); + +static void fpu_init_guest_permissions(struct fpu_guest *gfpu) +{ + struct fpu_state_perm *fpuperm; + u64 perm; + + if (!IS_ENABLED(CONFIG_X86_64)) + return; + + spin_lock_irq(¤t->sighand->siglock); + fpuperm = ¤t->group_leader->thread.fpu.guest_perm; + perm = fpuperm->__state_perm; + + /* First fpstate allocation locks down permissions. */ + WRITE_ONCE(fpuperm->__state_perm, perm | FPU_GUEST_PERM_LOCKED); + + spin_unlock_irq(¤t->sighand->siglock); + + gfpu->perm = perm & ~FPU_GUEST_PERM_LOCKED; +} + +bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) +{ + struct fpstate *fpstate; + unsigned int size; + + size = fpu_user_cfg.default_size + ALIGN(offsetof(struct fpstate, regs), 64); + fpstate = vzalloc(size); + if (!fpstate) + return false; + + /* Leave xfd to 0 (the reset value defined by spec) */ + __fpstate_reset(fpstate, 0); + fpstate_init_user(fpstate); + fpstate->is_valloc = true; + fpstate->is_guest = true; + + gfpu->fpstate = fpstate; + gfpu->xfeatures = fpu_user_cfg.default_features; + gfpu->perm = fpu_user_cfg.default_features; + gfpu->uabi_size = fpu_user_cfg.default_size; + fpu_init_guest_permissions(gfpu); + + return true; +} +EXPORT_SYMBOL_GPL(fpu_alloc_guest_fpstate); + +void fpu_free_guest_fpstate(struct fpu_guest *gfpu) +{ + struct fpstate *fps = gfpu->fpstate; + + if (!fps) + return; + + if (WARN_ON_ONCE(!fps->is_valloc || !fps->is_guest || fps->in_use)) + return; + + gfpu->fpstate = NULL; + vfree(fps); +} +EXPORT_SYMBOL_GPL(fpu_free_guest_fpstate); + +/* + * fpu_enable_guest_xfd_features - Check xfeatures against guest perm and enable + * @guest_fpu: Pointer to the guest FPU container + * @xfeatures: Features requested by guest CPUID + * + * Enable all dynamic xfeatures according to guest perm and requested CPUID. + * + * Return: 0 on success, error code otherwise + */ +int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures) +{ + lockdep_assert_preemption_enabled(); + + /* Nothing to do if all requested features are already enabled. */ + xfeatures &= ~guest_fpu->xfeatures; + if (!xfeatures) + return 0; + + return __xfd_enable_feature(xfeatures, guest_fpu); +} +EXPORT_SYMBOL_GPL(fpu_enable_guest_xfd_features); + +#ifdef CONFIG_X86_64 +void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) +{ + fpregs_lock(); + guest_fpu->fpstate->xfd = xfd; + if (guest_fpu->fpstate->in_use) + xfd_update_state(guest_fpu->fpstate); + fpregs_unlock(); +} +EXPORT_SYMBOL_GPL(fpu_update_guest_xfd); + +/** + * fpu_sync_guest_vmexit_xfd_state - Synchronize XFD MSR and software state + * + * Must be invoked from KVM after a VMEXIT before enabling interrupts when + * XFD write emulation is disabled. This is required because the guest can + * freely modify XFD and the state at VMEXIT is not guaranteed to be the + * same as the state on VMENTER. So software state has to be udpated before + * any operation which depends on it can take place. + * + * Note: It can be invoked unconditionally even when write emulation is + * enabled for the price of a then pointless MSR read. + */ +void fpu_sync_guest_vmexit_xfd_state(void) +{ + struct fpstate *fps = current->thread.fpu.fpstate; + + lockdep_assert_irqs_disabled(); + if (fpu_state_size_dynamic()) { + rdmsrl(MSR_IA32_XFD, fps->xfd); + __this_cpu_write(xfd_state, fps->xfd); + } +} +EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state); +#endif /* CONFIG_X86_64 */ + +int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) +{ + struct fpstate *guest_fps = guest_fpu->fpstate; + struct fpu *fpu = ¤t->thread.fpu; + struct fpstate *cur_fps = fpu->fpstate; + + fpregs_lock(); + if (!cur_fps->is_confidential && !test_thread_flag(TIF_NEED_FPU_LOAD)) + save_fpregs_to_fpstate(fpu); + + /* Swap fpstate */ + if (enter_guest) { + fpu->__task_fpstate = cur_fps; + fpu->fpstate = guest_fps; + guest_fps->in_use = true; + } else { + guest_fps->in_use = false; + fpu->fpstate = fpu->__task_fpstate; + fpu->__task_fpstate = NULL; } + + cur_fps = fpu->fpstate; + + if (!cur_fps->is_confidential) { + /* Includes XFD update */ + restore_fpregs_from_fpstate(cur_fps, XFEATURE_MASK_FPSTATE); + } else { + /* + * XSTATE is restored by firmware from encrypted + * memory. Make sure XFD state is correct while + * running with guest fpstate + */ + xfd_update_state(cur_fps); + } + + fpregs_mark_activate(); + fpregs_unlock(); + return 0; } -EXPORT_SYMBOL_GPL(__restore_fpregs_from_fpstate); +EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate); + +void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, + unsigned int size, u32 pkru) +{ + struct fpstate *kstate = gfpu->fpstate; + union fpregs_state *ustate = buf; + struct membuf mb = { .p = buf, .left = size }; + + if (cpu_feature_enabled(X86_FEATURE_XSAVE)) { + __copy_xstate_to_uabi_buf(mb, kstate, pkru, XSTATE_COPY_XSAVE); + } else { + memcpy(&ustate->fxsave, &kstate->regs.fxsave, + sizeof(ustate->fxsave)); + /* Make it restorable on a XSAVE enabled host */ + ustate->xsave.header.xfeatures = XFEATURE_MASK_FPSSE; + } +} +EXPORT_SYMBOL_GPL(fpu_copy_guest_fpstate_to_uabi); + +int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, + u64 xcr0, u32 *vpkru) +{ + struct fpstate *kstate = gfpu->fpstate; + const union fpregs_state *ustate = buf; + struct pkru_state *xpkru; + int ret; + + if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) { + if (ustate->xsave.header.xfeatures & ~XFEATURE_MASK_FPSSE) + return -EINVAL; + if (ustate->fxsave.mxcsr & ~mxcsr_feature_mask) + return -EINVAL; + memcpy(&kstate->regs.fxsave, &ustate->fxsave, sizeof(ustate->fxsave)); + return 0; + } + + if (ustate->xsave.header.xfeatures & ~xcr0) + return -EINVAL; + + ret = copy_uabi_from_kernel_to_xstate(kstate, ustate); + if (ret) + return ret; + + /* Retrieve PKRU if not in init state */ + if (kstate->regs.xsave.header.xfeatures & XFEATURE_MASK_PKRU) { + xpkru = get_xsave_addr(&kstate->regs.xsave, XFEATURE_PKRU); + *vpkru = xpkru->pkru; + } + + /* Ensure that XCOMP_BV is set up for XSAVES */ + xstate_init_xcomp_bv(&kstate->regs.xsave, kstate->xfeatures); + return 0; +} +EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate); +#endif /* CONFIG_KVM */ void kernel_fpu_begin_mask(unsigned int kfpu_mask) { @@ -203,52 +475,91 @@ void fpu_sync_fpstate(struct fpu *fpu) fpregs_unlock(); } -static inline void fpstate_init_xstate(struct xregs_state *xsave) +static inline unsigned int init_fpstate_copy_size(void) { - /* - * XRSTORS requires these bits set in xcomp_bv, or it will - * trigger #GP: - */ - xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask_all; + if (!use_xsave()) + return fpu_kernel_cfg.default_size; + + /* XSAVE(S) just needs the legacy and the xstate header part */ + return sizeof(init_fpstate.regs.xsave); } -static inline void fpstate_init_fxstate(struct fxregs_state *fx) +static inline void fpstate_init_fxstate(struct fpstate *fpstate) { - fx->cwd = 0x37f; - fx->mxcsr = MXCSR_DEFAULT; + fpstate->regs.fxsave.cwd = 0x37f; + fpstate->regs.fxsave.mxcsr = MXCSR_DEFAULT; } /* * Legacy x87 fpstate state init: */ -static inline void fpstate_init_fstate(struct fregs_state *fp) +static inline void fpstate_init_fstate(struct fpstate *fpstate) { - fp->cwd = 0xffff037fu; - fp->swd = 0xffff0000u; - fp->twd = 0xffffffffu; - fp->fos = 0xffff0000u; + fpstate->regs.fsave.cwd = 0xffff037fu; + fpstate->regs.fsave.swd = 0xffff0000u; + fpstate->regs.fsave.twd = 0xffffffffu; + fpstate->regs.fsave.fos = 0xffff0000u; } -void fpstate_init(union fpregs_state *state) +/* + * Used in two places: + * 1) Early boot to setup init_fpstate for non XSAVE systems + * 2) fpu_init_fpstate_user() which is invoked from KVM + */ +void fpstate_init_user(struct fpstate *fpstate) { - if (!static_cpu_has(X86_FEATURE_FPU)) { - fpstate_init_soft(&state->soft); + if (!cpu_feature_enabled(X86_FEATURE_FPU)) { + fpstate_init_soft(&fpstate->regs.soft); return; } - memset(state, 0, fpu_kernel_xstate_size); + xstate_init_xcomp_bv(&fpstate->regs.xsave, fpstate->xfeatures); - if (static_cpu_has(X86_FEATURE_XSAVES)) - fpstate_init_xstate(&state->xsave); - if (static_cpu_has(X86_FEATURE_FXSR)) - fpstate_init_fxstate(&state->fxsave); + if (cpu_feature_enabled(X86_FEATURE_FXSR)) + fpstate_init_fxstate(fpstate); else - fpstate_init_fstate(&state->fsave); + fpstate_init_fstate(fpstate); +} + +static void __fpstate_reset(struct fpstate *fpstate, u64 xfd) +{ + /* Initialize sizes and feature masks */ + fpstate->size = fpu_kernel_cfg.default_size; + fpstate->user_size = fpu_user_cfg.default_size; + fpstate->xfeatures = fpu_kernel_cfg.default_features; + fpstate->user_xfeatures = fpu_user_cfg.default_features; + fpstate->xfd = xfd; +} + +void fpstate_reset(struct fpu *fpu) +{ + /* Set the fpstate pointer to the default fpstate */ + fpu->fpstate = &fpu->__fpstate; + __fpstate_reset(fpu->fpstate, init_fpstate.xfd); + + /* Initialize the permission related info in fpu */ + fpu->perm.__state_perm = fpu_kernel_cfg.default_features; + fpu->perm.__state_size = fpu_kernel_cfg.default_size; + fpu->perm.__user_state_size = fpu_user_cfg.default_size; + /* Same defaults for guests */ + fpu->guest_perm = fpu->perm; +} + +static inline void fpu_inherit_perms(struct fpu *dst_fpu) +{ + if (fpu_state_size_dynamic()) { + struct fpu *src_fpu = ¤t->group_leader->thread.fpu; + + spin_lock_irq(¤t->sighand->siglock); + /* Fork also inherits the permissions of the parent */ + dst_fpu->perm = src_fpu->perm; + dst_fpu->guest_perm = src_fpu->guest_perm; + spin_unlock_irq(¤t->sighand->siglock); + } } -EXPORT_SYMBOL_GPL(fpstate_init); /* Clone current's FPU state on fork */ -int fpu_clone(struct task_struct *dst) +int fpu_clone(struct task_struct *dst, unsigned long clone_flags) { struct fpu *src_fpu = ¤t->thread.fpu; struct fpu *dst_fpu = &dst->thread.fpu; @@ -256,30 +567,51 @@ int fpu_clone(struct task_struct *dst) /* The new task's FPU state cannot be valid in the hardware. */ dst_fpu->last_cpu = -1; + fpstate_reset(dst_fpu); + if (!cpu_feature_enabled(X86_FEATURE_FPU)) return 0; /* - * Don't let 'init optimized' areas of the XSAVE area - * leak into the child task: + * Enforce reload for user space tasks and prevent kernel threads + * from trying to save the FPU registers on context switch. */ - memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size); + set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD); /* - * If the FPU registers are not owned by current just memcpy() the - * state. Otherwise save the FPU registers directly into the - * child's FPU context, without any memory-to-memory copying. + * No FPU state inheritance for kernel threads and IO + * worker threads. + */ + if (dst->flags & (PF_KTHREAD | PF_IO_WORKER)) { + /* Clear out the minimal state */ + memcpy(&dst_fpu->fpstate->regs, &init_fpstate.regs, + init_fpstate_copy_size()); + return 0; + } + + /* + * If a new feature is added, ensure all dynamic features are + * caller-saved from here! + */ + BUILD_BUG_ON(XFEATURE_MASK_USER_DYNAMIC != XFEATURE_MASK_XTILE_DATA); + + /* + * Save the default portion of the current FPU state into the + * clone. Assume all dynamic features to be defined as caller- + * saved, which enables skipping both the expansion of fpstate + * and the copying of any dynamic state. + * + * Do not use memcpy() when TIF_NEED_FPU_LOAD is set because + * copying is not valid when current uses non-default states. */ fpregs_lock(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) - memcpy(&dst_fpu->state, &src_fpu->state, fpu_kernel_xstate_size); - - else - save_fpregs_to_fpstate(dst_fpu); + fpregs_restore_userregs(); + save_fpregs_to_fpstate(dst_fpu); + if (!(clone_flags & CLONE_THREAD)) + fpu_inherit_perms(dst_fpu); fpregs_unlock(); - set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD); - trace_x86_fpu_copy_src(src_fpu); trace_x86_fpu_copy_dst(dst_fpu); @@ -287,6 +619,16 @@ int fpu_clone(struct task_struct *dst) } /* + * Whitelist the FPU register state embedded into task_struct for hardened + * usercopy. + */ +void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size) +{ + *offset = offsetof(struct thread_struct, fpu.__fpstate.regs); + *size = fpu_kernel_cfg.default_size; +} + +/* * Drops current FPU state: deactivates the fpregs and * the fpstate. NOTE: it still leaves previous contents * in the fpregs in the eager-FPU case. @@ -319,28 +661,19 @@ void fpu__drop(struct fpu *fpu) static inline void restore_fpregs_from_init_fpstate(u64 features_mask) { if (use_xsave()) - os_xrstor(&init_fpstate.xsave, features_mask); + os_xrstor(&init_fpstate, features_mask); else if (use_fxsr()) - fxrstor(&init_fpstate.fxsave); + fxrstor(&init_fpstate.regs.fxsave); else - frstor(&init_fpstate.fsave); + frstor(&init_fpstate.regs.fsave); pkru_write_default(); } -static inline unsigned int init_fpstate_copy_size(void) -{ - if (!use_xsave()) - return fpu_kernel_xstate_size; - - /* XSAVE(S) just needs the legacy and the xstate header part */ - return sizeof(init_fpstate.xsave); -} - /* * Reset current->fpu memory state to the init values. */ -static void fpu_reset_fpstate(void) +static void fpu_reset_fpregs(void) { struct fpu *fpu = ¤t->thread.fpu; @@ -359,7 +692,7 @@ static void fpu_reset_fpstate(void) * user space as PKRU is eagerly written in switch_to() and * flush_thread(). */ - memcpy(&fpu->state, &init_fpstate, init_fpstate_copy_size()); + memcpy(&fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size()); set_thread_flag(TIF_NEED_FPU_LOAD); fpregs_unlock(); } @@ -375,7 +708,7 @@ void fpu__clear_user_states(struct fpu *fpu) fpregs_lock(); if (!cpu_feature_enabled(X86_FEATURE_FPU)) { - fpu_reset_fpstate(); + fpu_reset_fpregs(); fpregs_unlock(); return; } @@ -385,12 +718,11 @@ void fpu__clear_user_states(struct fpu *fpu) * corresponding registers. */ if (xfeatures_mask_supervisor() && - !fpregs_state_valid(fpu, smp_processor_id())) { - os_xrstor(&fpu->state.xsave, xfeatures_mask_supervisor()); - } + !fpregs_state_valid(fpu, smp_processor_id())) + os_xrstor_supervisor(fpu->fpstate); /* Reset user states in registers. */ - restore_fpregs_from_init_fpstate(xfeatures_mask_restore_user()); + restore_fpregs_from_init_fpstate(XFEATURE_MASK_USER_RESTORE); /* * Now all FPU registers have their desired values. Inform the FPU @@ -405,7 +737,8 @@ void fpu__clear_user_states(struct fpu *fpu) void fpu_flush_thread(void) { - fpu_reset_fpstate(); + fpstate_reset(¤t->thread.fpu); + fpu_reset_fpregs(); } /* * Load FPU context before returning to userspace. @@ -445,7 +778,6 @@ void fpregs_mark_activate(void) fpu->last_cpu = smp_processor_id(); clear_thread_flag(TIF_NEED_FPU_LOAD); } -EXPORT_SYMBOL_GPL(fpregs_mark_activate); /* * x87 math exception handling: @@ -468,11 +800,11 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr) * fully reproduce the context of the exception. */ if (boot_cpu_has(X86_FEATURE_FXSR)) { - cwd = fpu->state.fxsave.cwd; - swd = fpu->state.fxsave.swd; + cwd = fpu->fpstate->regs.fxsave.cwd; + swd = fpu->fpstate->regs.fxsave.swd; } else { - cwd = (unsigned short)fpu->state.fsave.cwd; - swd = (unsigned short)fpu->state.fsave.swd; + cwd = (unsigned short)fpu->fpstate->regs.fsave.cwd; + swd = (unsigned short)fpu->fpstate->regs.fsave.swd; } err = swd & ~cwd; @@ -486,7 +818,7 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr) unsigned short mxcsr = MXCSR_DEFAULT; if (boot_cpu_has(X86_FEATURE_XMM)) - mxcsr = fpu->state.fxsave.mxcsr; + mxcsr = fpu->fpstate->regs.fxsave.mxcsr; err = ~(mxcsr >> 7) & mxcsr; } diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 64e29927cc32..621f4b6cac4a 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -2,7 +2,7 @@ /* * x86 FPU boot time init code: */ -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> #include <asm/tlbflush.h> #include <asm/setup.h> @@ -10,6 +10,10 @@ #include <linux/sched/task.h> #include <linux/init.h> +#include "internal.h" +#include "legacy.h" +#include "xstate.h" + /* * Initialize the registers found in all CPUs, CR0 and CR4: */ @@ -34,7 +38,7 @@ static void fpu__init_cpu_generic(void) /* Flush out any pending x87 state: */ #ifdef CONFIG_MATH_EMULATION if (!boot_cpu_has(X86_FEATURE_FPU)) - fpstate_init_soft(¤t->thread.fpu.state.soft); + fpstate_init_soft(¤t->thread.fpu.fpstate->regs.soft); else #endif asm volatile ("fninit"); @@ -121,23 +125,14 @@ static void __init fpu__init_system_mxcsr(void) static void __init fpu__init_system_generic(void) { /* - * Set up the legacy init FPU context. (xstate init might overwrite this - * with a more modern format, if the CPU supports it.) + * Set up the legacy init FPU context. Will be updated when the + * CPU supports XSAVE[S]. */ - fpstate_init(&init_fpstate); + fpstate_init_user(&init_fpstate); fpu__init_system_mxcsr(); } -/* - * Size of the FPU context state. All tasks in the system use the - * same context size, regardless of what portion they use. - * This is inherent to the XSAVE architecture which puts all state - * components into a single, continuous memory block: - */ -unsigned int fpu_kernel_xstate_size __ro_after_init; -EXPORT_SYMBOL_GPL(fpu_kernel_xstate_size); - /* Get alignment of the TYPE. */ #define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test) @@ -162,13 +157,13 @@ static void __init fpu__init_task_struct_size(void) * Subtract off the static size of the register state. * It potentially has a bunch of padding. */ - task_size -= sizeof(((struct task_struct *)0)->thread.fpu.state); + task_size -= sizeof(current->thread.fpu.__fpstate.regs); /* * Add back the dynamically-calculated register state * size. */ - task_size += fpu_kernel_xstate_size; + task_size += fpu_kernel_cfg.default_size; /* * We dynamically size 'struct fpu', so we require that @@ -177,7 +172,7 @@ static void __init fpu__init_task_struct_size(void) * you hit a compile error here, check the structure to * see if something got added to the end. */ - CHECK_MEMBER_AT_END_OF(struct fpu, state); + CHECK_MEMBER_AT_END_OF(struct fpu, __fpstate); CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu); CHECK_MEMBER_AT_END_OF(struct task_struct, thread); @@ -192,37 +187,34 @@ static void __init fpu__init_task_struct_size(void) */ static void __init fpu__init_system_xstate_size_legacy(void) { - static int on_boot_cpu __initdata = 1; - - WARN_ON_FPU(!on_boot_cpu); - on_boot_cpu = 0; + unsigned int size; /* - * Note that xstate sizes might be overwritten later during - * fpu__init_system_xstate(). + * Note that the size configuration might be overwritten later + * during fpu__init_system_xstate(). */ - - if (!boot_cpu_has(X86_FEATURE_FPU)) { - fpu_kernel_xstate_size = sizeof(struct swregs_state); + if (!cpu_feature_enabled(X86_FEATURE_FPU)) { + size = sizeof(struct swregs_state); + } else if (cpu_feature_enabled(X86_FEATURE_FXSR)) { + size = sizeof(struct fxregs_state); + fpu_user_cfg.legacy_features = XFEATURE_MASK_FPSSE; } else { - if (boot_cpu_has(X86_FEATURE_FXSR)) - fpu_kernel_xstate_size = - sizeof(struct fxregs_state); - else - fpu_kernel_xstate_size = - sizeof(struct fregs_state); + size = sizeof(struct fregs_state); + fpu_user_cfg.legacy_features = XFEATURE_MASK_FP; } - fpu_user_xstate_size = fpu_kernel_xstate_size; + fpu_kernel_cfg.max_size = size; + fpu_kernel_cfg.default_size = size; + fpu_user_cfg.max_size = size; + fpu_user_cfg.default_size = size; + fpstate_reset(¤t->thread.fpu); } -/* Legacy code to initialize eager fpu mode. */ -static void __init fpu__init_system_ctx_switch(void) +static void __init fpu__init_init_fpstate(void) { - static bool on_boot_cpu __initdata = 1; - - WARN_ON_FPU(!on_boot_cpu); - on_boot_cpu = 0; + /* Bring init_fpstate size and features up to date */ + init_fpstate.size = fpu_kernel_cfg.max_size; + init_fpstate.xfeatures = fpu_kernel_cfg.max_features; } /* @@ -231,6 +223,7 @@ static void __init fpu__init_system_ctx_switch(void) */ void __init fpu__init_system(struct cpuinfo_x86 *c) { + fpstate_reset(¤t->thread.fpu); fpu__init_system_early_generic(c); /* @@ -241,8 +234,7 @@ void __init fpu__init_system(struct cpuinfo_x86 *c) fpu__init_system_generic(); fpu__init_system_xstate_size_legacy(); - fpu__init_system_xstate(); + fpu__init_system_xstate(fpu_kernel_cfg.max_size); fpu__init_task_struct_size(); - - fpu__init_system_ctx_switch(); + fpu__init_init_fpstate(); } diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h new file mode 100644 index 000000000000..dbdb31f55fc7 --- /dev/null +++ b/arch/x86/kernel/fpu/internal.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_KERNEL_FPU_INTERNAL_H +#define __X86_KERNEL_FPU_INTERNAL_H + +extern struct fpstate init_fpstate; + +/* CPU feature check wrappers */ +static __always_inline __pure bool use_xsave(void) +{ + return cpu_feature_enabled(X86_FEATURE_XSAVE); +} + +static __always_inline __pure bool use_fxsr(void) +{ + return cpu_feature_enabled(X86_FEATURE_FXSR); +} + +#ifdef CONFIG_X86_DEBUG_FPU +# define WARN_ON_FPU(x) WARN_ON_ONCE(x) +#else +# define WARN_ON_FPU(x) ({ (void)(x); 0; }) +#endif + +/* Used in init.c */ +extern void fpstate_init_user(struct fpstate *fpstate); +extern void fpstate_reset(struct fpu *fpu); + +#endif diff --git a/arch/x86/kernel/fpu/legacy.h b/arch/x86/kernel/fpu/legacy.h new file mode 100644 index 000000000000..098f367bb8a7 --- /dev/null +++ b/arch/x86/kernel/fpu/legacy.h @@ -0,0 +1,111 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_KERNEL_FPU_LEGACY_H +#define __X86_KERNEL_FPU_LEGACY_H + +#include <asm/fpu/types.h> + +extern unsigned int mxcsr_feature_mask; + +static inline void ldmxcsr(u32 mxcsr) +{ + asm volatile("ldmxcsr %0" :: "m" (mxcsr)); +} + +/* + * Returns 0 on success or the trap number when the operation raises an + * exception. + */ +#define user_insn(insn, output, input...) \ +({ \ + int err; \ + \ + might_fault(); \ + \ + asm volatile(ASM_STAC "\n" \ + "1: " #insn "\n" \ + "2: " ASM_CLAC "\n" \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \ + : [err] "=a" (err), output \ + : "0"(0), input); \ + err; \ +}) + +#define kernel_insn_err(insn, output, input...) \ +({ \ + int err; \ + asm volatile("1:" #insn "\n\t" \ + "2:\n" \ + _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %[err]) \ + : [err] "=r" (err), output \ + : "0"(0), input); \ + err; \ +}) + +#define kernel_insn(insn, output, input...) \ + asm volatile("1:" #insn "\n\t" \ + "2:\n" \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FPU_RESTORE) \ + : output : input) + +static inline int fnsave_to_user_sigframe(struct fregs_state __user *fx) +{ + return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx)); +} + +static inline int fxsave_to_user_sigframe(struct fxregs_state __user *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); + else + return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx)); + +} + +static inline void fxrstor(struct fxregs_state *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else + kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline int fxrstor_safe(struct fxregs_state *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else + return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline int fxrstor_from_user_sigframe(struct fxregs_state __user *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else + return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline void frstor(struct fregs_state *fx) +{ + kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline int frstor_safe(struct fregs_state *fx) +{ + return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline int frstor_from_user_sigframe(struct fregs_state __user *fx) +{ + return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + +static inline void fxsave(struct fxregs_state *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx)); + else + asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx)); +} + +#endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 66ed317ebc0d..437d7c930c0b 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -5,10 +5,14 @@ #include <linux/sched/task_stack.h> #include <linux/vmalloc.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> #include <asm/fpu/signal.h> #include <asm/fpu/regset.h> -#include <asm/fpu/xstate.h> + +#include "context.h" +#include "internal.h" +#include "legacy.h" +#include "xstate.h" /* * The xstateregs_active() routine is the same as the regset_fpregs_active() routine, @@ -74,8 +78,8 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, sync_fpstate(fpu); if (!use_xsave()) { - return membuf_write(&to, &fpu->state.fxsave, - sizeof(fpu->state.fxsave)); + return membuf_write(&to, &fpu->fpstate->regs.fxsave, + sizeof(fpu->fpstate->regs.fxsave)); } copy_xstate_to_uabi_buf(to, target, XSTATE_COPY_FX); @@ -110,15 +114,15 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, fpu_force_restore(fpu); /* Copy the state */ - memcpy(&fpu->state.fxsave, &newstate, sizeof(newstate)); + memcpy(&fpu->fpstate->regs.fxsave, &newstate, sizeof(newstate)); /* Clear xmm8..15 */ - BUILD_BUG_ON(sizeof(fpu->state.fxsave.xmm_space) != 16 * 16); - memset(&fpu->state.fxsave.xmm_space[8], 0, 8 * 16); + BUILD_BUG_ON(sizeof(fpu->__fpstate.regs.fxsave.xmm_space) != 16 * 16); + memset(&fpu->fpstate->regs.fxsave.xmm_space[8], 0, 8 * 16); /* Mark FP and SSE as in use when XSAVE is enabled */ if (use_xsave()) - fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; + fpu->fpstate->regs.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; return 0; } @@ -149,7 +153,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, /* * A whole standard-format XSAVE buffer is needed: */ - if (pos != 0 || count != fpu_user_xstate_size) + if (pos != 0 || count != fpu_user_cfg.max_size) return -EFAULT; if (!kbuf) { @@ -164,7 +168,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, } fpu_force_restore(fpu); - ret = copy_uabi_from_kernel_to_xstate(&fpu->state.xsave, kbuf ?: tmpbuf); + ret = copy_uabi_from_kernel_to_xstate(fpu->fpstate, kbuf ?: tmpbuf); out: vfree(tmpbuf); @@ -283,7 +287,7 @@ static void __convert_from_fxsr(struct user_i387_ia32_struct *env, void convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) { - __convert_from_fxsr(env, tsk, &tsk->thread.fpu.state.fxsave); + __convert_from_fxsr(env, tsk, &tsk->thread.fpu.fpstate->regs.fxsave); } void convert_to_fxsr(struct fxregs_state *fxsave, @@ -326,7 +330,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, return fpregs_soft_get(target, regset, to); if (!cpu_feature_enabled(X86_FEATURE_FXSR)) { - return membuf_write(&to, &fpu->state.fsave, + return membuf_write(&to, &fpu->fpstate->regs.fsave, sizeof(struct fregs_state)); } @@ -337,7 +341,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, copy_xstate_to_uabi_buf(mb, target, XSTATE_COPY_FP); fx = &fxsave; } else { - fx = &fpu->state.fxsave; + fx = &fpu->fpstate->regs.fxsave; } __convert_from_fxsr(&env, target, fx); @@ -366,16 +370,16 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, fpu_force_restore(fpu); if (cpu_feature_enabled(X86_FEATURE_FXSR)) - convert_to_fxsr(&fpu->state.fxsave, &env); + convert_to_fxsr(&fpu->fpstate->regs.fxsave, &env); else - memcpy(&fpu->state.fsave, &env, sizeof(env)); + memcpy(&fpu->fpstate->regs.fsave, &env, sizeof(env)); /* * Update the header bit in the xsave header, indicating the * presence of FP. */ if (cpu_feature_enabled(X86_FEATURE_XSAVE)) - fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FP; + fpu->fpstate->regs.xsave.header.xfeatures |= XFEATURE_MASK_FP; return 0; } diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 445c57c9c539..91d4b6de58ab 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -7,23 +7,25 @@ #include <linux/cpu.h> #include <linux/pagemap.h> -#include <asm/fpu/internal.h> #include <asm/fpu/signal.h> #include <asm/fpu/regset.h> #include <asm/fpu/xstate.h> #include <asm/sigframe.h> +#include <asm/trapnr.h> #include <asm/trace/fpu.h> -static struct _fpx_sw_bytes fx_sw_reserved __ro_after_init; -static struct _fpx_sw_bytes fx_sw_reserved_ia32 __ro_after_init; +#include "context.h" +#include "internal.h" +#include "legacy.h" +#include "xstate.h" /* * Check for the presence of extended state information in the * user fpstate pointer in the sigcontext. */ -static inline int check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, - struct _fpx_sw_bytes *fx_sw) +static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, + struct _fpx_sw_bytes *fx_sw) { int min_xstate_size = sizeof(struct fxregs_state) + sizeof(struct xstate_header); @@ -31,12 +33,12 @@ static inline int check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, unsigned int magic2; if (__copy_from_user(fx_sw, &fxbuf->sw_reserved[0], sizeof(*fx_sw))) - return -EFAULT; + return false; /* Check for the first magic field and other error scenarios. */ if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || fx_sw->xstate_size < min_xstate_size || - fx_sw->xstate_size > fpu_user_xstate_size || + fx_sw->xstate_size > current->thread.fpu.fpstate->user_size || fx_sw->xstate_size > fx_sw->extended_size) goto setfx; @@ -47,10 +49,10 @@ static inline int check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, * in the memory layout. */ if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size))) - return -EFAULT; + return false; if (likely(magic2 == FP_XSTATE_MAGIC2)) - return 0; + return true; setfx: trace_x86_fpu_xstate_check_failed(¤t->thread.fpu); @@ -58,22 +60,22 @@ setfx: fx_sw->magic1 = 0; fx_sw->xstate_size = sizeof(struct fxregs_state); fx_sw->xfeatures = XFEATURE_MASK_FPSSE; - return 0; + return true; } /* * Signal frame handlers. */ -static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) +static inline bool save_fsave_header(struct task_struct *tsk, void __user *buf) { if (use_fxsr()) { - struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; + struct xregs_state *xsave = &tsk->thread.fpu.fpstate->regs.xsave; struct user_i387_ia32_struct env; struct _fpstate_32 __user *fp = buf; fpregs_lock(); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) - fxsave(&tsk->thread.fpu.state.fxsave); + fxsave(&tsk->thread.fpu.fpstate->regs.fxsave); fpregs_unlock(); convert_from_fxsr(&env, tsk); @@ -81,33 +83,54 @@ static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) if (__copy_to_user(buf, &env, sizeof(env)) || __put_user(xsave->i387.swd, &fp->status) || __put_user(X86_FXSR_MAGIC, &fp->magic)) - return -1; + return false; } else { struct fregs_state __user *fp = buf; u32 swd; + if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status)) - return -1; + return false; } - return 0; + return true; +} + +/* + * Prepare the SW reserved portion of the fxsave memory layout, indicating + * the presence of the extended state information in the memory layout + * pointed to by the fpstate pointer in the sigcontext. + * This is saved when ever the FP and extended state context is + * saved on the user stack during the signal handler delivery to the user. + */ +static inline void save_sw_bytes(struct _fpx_sw_bytes *sw_bytes, bool ia32_frame, + struct fpstate *fpstate) +{ + sw_bytes->magic1 = FP_XSTATE_MAGIC1; + sw_bytes->extended_size = fpstate->user_size + FP_XSTATE_MAGIC2_SIZE; + sw_bytes->xfeatures = fpstate->user_xfeatures; + sw_bytes->xstate_size = fpstate->user_size; + + if (ia32_frame) + sw_bytes->extended_size += sizeof(struct fregs_state); } -static inline int save_xstate_epilog(void __user *buf, int ia32_frame) +static inline bool save_xstate_epilog(void __user *buf, int ia32_frame, + struct fpstate *fpstate) { struct xregs_state __user *x = buf; - struct _fpx_sw_bytes *sw_bytes; + struct _fpx_sw_bytes sw_bytes = {}; u32 xfeatures; int err; /* Setup the bytes not touched by the [f]xsave and reserved for SW. */ - sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved; - err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes)); + save_sw_bytes(&sw_bytes, ia32_frame, fpstate); + err = __copy_to_user(&x->i387.sw_reserved, &sw_bytes, sizeof(sw_bytes)); if (!use_xsave()) - return err; + return !err; err |= __put_user(FP_XSTATE_MAGIC2, - (__u32 __user *)(buf + fpu_user_xstate_size)); + (__u32 __user *)(buf + fpstate->user_size)); /* * Read the xfeatures which we copied (directly from the cpu or @@ -130,23 +153,17 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame) err |= __put_user(xfeatures, (__u32 __user *)&x->header.xfeatures); - return err; + return !err; } static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) { - int err; - if (use_xsave()) - err = xsave_to_user_sigframe(buf); - else if (use_fxsr()) - err = fxsave_to_user_sigframe((struct fxregs_state __user *) buf); + return xsave_to_user_sigframe(buf); + if (use_fxsr()) + return fxsave_to_user_sigframe((struct fxregs_state __user *) buf); else - err = fnsave_to_user_sigframe((struct fregs_state __user *) buf); - - if (unlikely(err) && __clear_user(buf, fpu_user_xstate_size)) - err = -EFAULT; - return err; + return fnsave_to_user_sigframe((struct fregs_state __user *) buf); } /* @@ -159,10 +176,8 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) * buf == buf_fx for 64-bit frames and 32-bit fsave frame. * buf != buf_fx for 32-bit frames with fxstate. * - * Try to save it directly to the user frame with disabled page fault handler. - * If this fails then do the slow path where the FPU state is first saved to - * task's fpu->state and then copy it to the user frame pointed to by the - * aligned pointer 'buf_fx'. + * Save it directly to the user frame with disabled page fault handler. If + * that faults, try to clear the frame which handles the page fault. * * If this is a 32-bit frame with fxstate, put a fsave header before * the aligned state at 'buf_fx'. @@ -170,10 +185,11 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) * For [f]xsave state, update the SW reserved fields in the [f]xsave frame * indicating the absence/presence of the extended state to the user. */ -int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) +bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) { struct task_struct *tsk = current; - int ia32_fxstate = (buf != buf_fx); + struct fpstate *fpstate = tsk->thread.fpu.fpstate; + bool ia32_fxstate = (buf != buf_fx); int ret; ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || @@ -181,13 +197,25 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) if (!static_cpu_has(X86_FEATURE_FPU)) { struct user_i387_ia32_struct fp; + fpregs_soft_get(current, NULL, (struct membuf){.p = &fp, .left = sizeof(fp)}); - return copy_to_user(buf, &fp, sizeof(fp)) ? -EFAULT : 0; + return !copy_to_user(buf, &fp, sizeof(fp)); } if (!access_ok(buf, size)) - return -EACCES; + return false; + + if (use_xsave()) { + struct xregs_state __user *xbuf = buf_fx; + + /* + * Clear the xsave header first, so that reserved fields are + * initialized to zero. + */ + if (__clear_user(&xbuf->header, sizeof(xbuf->header))) + return false; + } retry: /* * Load the FPU registers if they are not valid for the current task. @@ -205,26 +233,26 @@ retry: fpregs_unlock(); if (ret) { - if (!fault_in_pages_writeable(buf_fx, fpu_user_xstate_size)) + if (!__clear_user(buf_fx, fpstate->user_size)) goto retry; - return -EFAULT; + return false; } /* Save the fsave header for the 32-bit frames. */ - if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf)) - return -1; + if ((ia32_fxstate || !use_fxsr()) && !save_fsave_header(tsk, buf)) + return false; - if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate)) - return -1; + if (use_fxsr() && !save_xstate_epilog(buf_fx, ia32_fxstate, fpstate)) + return false; - return 0; + return true; } -static int __restore_fpregs_from_user(void __user *buf, u64 xrestore, - bool fx_only) +static int __restore_fpregs_from_user(void __user *buf, u64 ufeatures, + u64 xrestore, bool fx_only) { if (use_xsave()) { - u64 init_bv = xfeatures_mask_uabi() & ~xrestore; + u64 init_bv = ufeatures & ~xrestore; int ret; if (likely(!fx_only)) @@ -233,7 +261,7 @@ static int __restore_fpregs_from_user(void __user *buf, u64 xrestore, ret = fxrstor_from_user_sigframe(buf); if (!ret && unlikely(init_bv)) - os_xrstor(&init_fpstate.xsave, init_bv); + os_xrstor(&init_fpstate, init_bv); return ret; } else if (use_fxsr()) { return fxrstor_from_user_sigframe(buf); @@ -246,16 +274,19 @@ static int __restore_fpregs_from_user(void __user *buf, u64 xrestore, * Attempt to restore the FPU registers directly from user memory. * Pagefaults are handled and any errors returned are fatal. */ -static int restore_fpregs_from_user(void __user *buf, u64 xrestore, - bool fx_only, unsigned int size) +static bool restore_fpregs_from_user(void __user *buf, u64 xrestore, + bool fx_only, unsigned int size) { struct fpu *fpu = ¤t->thread.fpu; int ret; retry: fpregs_lock(); + /* Ensure that XFD is up to date */ + xfd_update_state(fpu->fpstate); pagefault_disable(); - ret = __restore_fpregs_from_user(buf, xrestore, fx_only); + ret = __restore_fpregs_from_user(buf, fpu->fpstate->user_xfeatures, + xrestore, fx_only); pagefault_enable(); if (unlikely(ret)) { @@ -275,13 +306,12 @@ retry: fpregs_unlock(); /* Try to handle #PF, but anything else is fatal. */ - if (ret != -EFAULT) - return -EINVAL; + if (ret != X86_TRAP_PF) + return false; - ret = fault_in_pages_readable(buf, size); - if (!ret) + if (!fault_in_readable(buf, size)) goto retry; - return ret; + return false; } /* @@ -294,45 +324,40 @@ retry: * been restored from a user buffer directly. */ if (test_thread_flag(TIF_NEED_FPU_LOAD) && xfeatures_mask_supervisor()) - os_xrstor(&fpu->state.xsave, xfeatures_mask_supervisor()); + os_xrstor_supervisor(fpu->fpstate); fpregs_mark_activate(); fpregs_unlock(); - return 0; + return true; } -static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, - bool ia32_fxstate) +static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, + bool ia32_fxstate) { - int state_size = fpu_kernel_xstate_size; struct task_struct *tsk = current; struct fpu *fpu = &tsk->thread.fpu; struct user_i387_ia32_struct env; + bool success, fx_only = false; + union fpregs_state *fpregs; + unsigned int state_size; u64 user_xfeatures = 0; - bool fx_only = false; - int ret; if (use_xsave()) { struct _fpx_sw_bytes fx_sw_user; - ret = check_xstate_in_sigframe(buf_fx, &fx_sw_user); - if (unlikely(ret)) - return ret; + if (!check_xstate_in_sigframe(buf_fx, &fx_sw_user)) + return false; fx_only = !fx_sw_user.magic1; state_size = fx_sw_user.xstate_size; user_xfeatures = fx_sw_user.xfeatures; } else { user_xfeatures = XFEATURE_MASK_FPSSE; + state_size = fpu->fpstate->user_size; } if (likely(!ia32_fxstate)) { - /* - * Attempt to restore the FPU registers directly from user - * memory. For that to succeed, the user access cannot cause page - * faults. If it does, fall back to the slow path below, going - * through the kernel buffer with the enabled pagefault handler. - */ + /* Restore the FPU registers directly from user memory. */ return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only, state_size); } @@ -342,9 +367,8 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, * to be ignored for histerical raisins. The legacy state is folded * in once the larger state has been copied. */ - ret = __copy_from_user(&env, buf, sizeof(env)); - if (ret) - return ret; + if (__copy_from_user(&env, buf, sizeof(env))) + return false; /* * By setting TIF_NEED_FPU_LOAD it is ensured that our xstate is @@ -363,33 +387,38 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, * the right place in memory. It's ia32 mode. Shrug. */ if (xfeatures_mask_supervisor()) - os_xsave(&fpu->state.xsave); + os_xsave(fpu->fpstate); set_thread_flag(TIF_NEED_FPU_LOAD); } __fpu_invalidate_fpregs_state(fpu); __cpu_invalidate_fpregs_state(); fpregs_unlock(); + fpregs = &fpu->fpstate->regs; if (use_xsave() && !fx_only) { - ret = copy_sigframe_from_user_to_xstate(&fpu->state.xsave, buf_fx); - if (ret) - return ret; + if (copy_sigframe_from_user_to_xstate(fpu->fpstate, buf_fx)) + return false; } else { - if (__copy_from_user(&fpu->state.fxsave, buf_fx, - sizeof(fpu->state.fxsave))) - return -EFAULT; - - /* Reject invalid MXCSR values. */ - if (fpu->state.fxsave.mxcsr & ~mxcsr_feature_mask) - return -EINVAL; + if (__copy_from_user(&fpregs->fxsave, buf_fx, + sizeof(fpregs->fxsave))) + return false; + + if (IS_ENABLED(CONFIG_X86_64)) { + /* Reject invalid MXCSR values. */ + if (fpregs->fxsave.mxcsr & ~mxcsr_feature_mask) + return false; + } else { + /* Mask invalid bits out for historical reasons (broken hardware). */ + fpregs->fxsave.mxcsr &= mxcsr_feature_mask; + } /* Enforce XFEATURE_MASK_FPSSE when XSAVE is enabled */ if (use_xsave()) - fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; + fpregs->xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; } /* Fold the legacy FP storage */ - convert_to_fxsr(&fpu->state.fxsave, &env); + convert_to_fxsr(&fpregs->fxsave, &env); fpregs_lock(); if (use_xsave()) { @@ -404,40 +433,45 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, */ u64 mask = user_xfeatures | xfeatures_mask_supervisor(); - fpu->state.xsave.header.xfeatures &= mask; - ret = os_xrstor_safe(&fpu->state.xsave, xfeatures_mask_all); + fpregs->xsave.header.xfeatures &= mask; + success = !os_xrstor_safe(fpu->fpstate, + fpu_kernel_cfg.max_features); } else { - ret = fxrstor_safe(&fpu->state.fxsave); + success = !fxrstor_safe(&fpregs->fxsave); } - if (likely(!ret)) + if (likely(success)) fpregs_mark_activate(); fpregs_unlock(); - return ret; + return success; } -static inline int xstate_sigframe_size(void) + +static inline unsigned int xstate_sigframe_size(struct fpstate *fpstate) { - return use_xsave() ? fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE : - fpu_user_xstate_size; + unsigned int size = fpstate->user_size; + + return use_xsave() ? size + FP_XSTATE_MAGIC2_SIZE : size; } /* * Restore FPU state from a sigframe: */ -int fpu__restore_sig(void __user *buf, int ia32_frame) +bool fpu__restore_sig(void __user *buf, int ia32_frame) { - unsigned int size = xstate_sigframe_size(); struct fpu *fpu = ¤t->thread.fpu; void __user *buf_fx = buf; bool ia32_fxstate = false; - int ret; + bool success = false; + unsigned int size; if (unlikely(!buf)) { fpu__clear_user_states(fpu); - return 0; + return true; } + size = xstate_sigframe_size(fpu->fpstate); + ia32_frame &= (IS_ENABLED(CONFIG_X86_32) || IS_ENABLED(CONFIG_IA32_EMULATION)); @@ -451,30 +485,28 @@ int fpu__restore_sig(void __user *buf, int ia32_frame) ia32_fxstate = true; } - if (!access_ok(buf, size)) { - ret = -EACCES; + if (!access_ok(buf, size)) goto out; - } if (!IS_ENABLED(CONFIG_X86_64) && !cpu_feature_enabled(X86_FEATURE_FPU)) { - ret = fpregs_soft_set(current, NULL, 0, - sizeof(struct user_i387_ia32_struct), - NULL, buf); + success = !fpregs_soft_set(current, NULL, 0, + sizeof(struct user_i387_ia32_struct), + NULL, buf); } else { - ret = __fpu_restore_sig(buf, buf_fx, ia32_fxstate); + success = __fpu_restore_sig(buf, buf_fx, ia32_fxstate); } out: - if (unlikely(ret)) + if (unlikely(!success)) fpu__clear_user_states(fpu); - return ret; + return success; } unsigned long fpu__alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long *buf_fx, unsigned long *size) { - unsigned long frame_size = xstate_sigframe_size(); + unsigned long frame_size = xstate_sigframe_size(current->thread.fpu.fpstate); *buf_fx = sp = round_down(sp - frame_size, 64); if (ia32_frame && use_fxsr()) { @@ -487,9 +519,12 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame, return sp; } -unsigned long fpu__get_fpstate_size(void) +unsigned long __init fpu__get_fpstate_size(void) { - unsigned long ret = xstate_sigframe_size(); + unsigned long ret = fpu_user_cfg.max_size; + + if (use_xsave()) + ret += FP_XSTATE_MAGIC2_SIZE; /* * This space is needed on (most) 32-bit kernels, or when a 32-bit @@ -505,28 +540,3 @@ unsigned long fpu__get_fpstate_size(void) return ret; } -/* - * Prepare the SW reserved portion of the fxsave memory layout, indicating - * the presence of the extended state information in the memory layout - * pointed by the fpstate pointer in the sigcontext. - * This will be saved when ever the FP and extended state context is - * saved on the user stack during the signal handler delivery to the user. - */ -void fpu__init_prepare_fx_sw_frame(void) -{ - int size = fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE; - - fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; - fx_sw_reserved.extended_size = size; - fx_sw_reserved.xfeatures = xfeatures_mask_uabi(); - fx_sw_reserved.xstate_size = fpu_user_xstate_size; - - if (IS_ENABLED(CONFIG_IA32_EMULATION) || - IS_ENABLED(CONFIG_X86_32)) { - int fsave_header_size = sizeof(struct fregs_state); - - fx_sw_reserved_ia32 = fx_sw_reserved; - fx_sw_reserved_ia32.extended_size = size + fsave_header_size; - } -} - diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index c8def1b7f8fb..02b3ddaf4f75 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -4,21 +4,33 @@ * * Author: Suresh Siddha <suresh.b.siddha@intel.com> */ +#include <linux/bitops.h> #include <linux/compat.h> #include <linux/cpu.h> #include <linux/mman.h> +#include <linux/nospec.h> #include <linux/pkeys.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> +#include <linux/vmalloc.h> #include <asm/fpu/api.h> -#include <asm/fpu/internal.h> -#include <asm/fpu/signal.h> #include <asm/fpu/regset.h> -#include <asm/fpu/xstate.h> +#include <asm/fpu/signal.h> +#include <asm/fpu/xcr.h> #include <asm/tlbflush.h> -#include <asm/cpufeature.h> +#include <asm/prctl.h> +#include <asm/elf.h> + +#include "context.h" +#include "internal.h" +#include "legacy.h" +#include "xstate.h" + +#define for_each_extended_xfeature(bit, mask) \ + (bit) = FIRST_EXTENDED_XFEATURE; \ + for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask)) /* * Although we spell it out in here, the Processor Trace @@ -39,29 +51,32 @@ static const char *xfeature_names[] = "Protection Keys User registers", "PASID state", "unknown xstate feature" , + "unknown xstate feature" , + "unknown xstate feature" , + "unknown xstate feature" , + "unknown xstate feature" , + "unknown xstate feature" , + "AMX Tile config" , + "AMX Tile data" , + "unknown xstate feature" , }; -static short xsave_cpuid_features[] __initdata = { - X86_FEATURE_FPU, - X86_FEATURE_XMM, - X86_FEATURE_AVX, - X86_FEATURE_MPX, - X86_FEATURE_MPX, - X86_FEATURE_AVX512F, - X86_FEATURE_AVX512F, - X86_FEATURE_AVX512F, - X86_FEATURE_INTEL_PT, - X86_FEATURE_PKU, - X86_FEATURE_ENQCMD, +static unsigned short xsave_cpuid_features[] __initdata = { + [XFEATURE_FP] = X86_FEATURE_FPU, + [XFEATURE_SSE] = X86_FEATURE_XMM, + [XFEATURE_YMM] = X86_FEATURE_AVX, + [XFEATURE_BNDREGS] = X86_FEATURE_MPX, + [XFEATURE_BNDCSR] = X86_FEATURE_MPX, + [XFEATURE_OPMASK] = X86_FEATURE_AVX512F, + [XFEATURE_ZMM_Hi256] = X86_FEATURE_AVX512F, + [XFEATURE_Hi16_ZMM] = X86_FEATURE_AVX512F, + [XFEATURE_PT_UNIMPLEMENTED_SO_FAR] = X86_FEATURE_INTEL_PT, + [XFEATURE_PKRU] = X86_FEATURE_PKU, + [XFEATURE_PASID] = X86_FEATURE_ENQCMD, + [XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE, + [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE, }; -/* - * This represents the full set of bits that should ever be set in a kernel - * XSAVE buffer, both supervisor and user xstates. - */ -u64 xfeatures_mask_all __ro_after_init; -EXPORT_SYMBOL_GPL(xfeatures_mask_all); - static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init = @@ -72,20 +87,13 @@ static unsigned int xstate_supervisor_only_offsets[XFEATURE_MAX] __ro_after_init { [ 0 ... XFEATURE_MAX - 1] = -1}; /* - * The XSAVE area of kernel can be in standard or compacted format; - * it is always in standard format for user mode. This is the user - * mode standard format size used for signal and ptrace frames. - */ -unsigned int fpu_user_xstate_size __ro_after_init; - -/* * Return whether the system supports a given xfeature. * * Also return the name of the (most advanced) feature that the caller requested: */ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) { - u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask_all; + u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features; if (unlikely(feature_name)) { long xfeature_idx, max_idx; @@ -135,17 +143,26 @@ static bool xfeature_is_supervisor(int xfeature_nr) */ void fpu__init_cpu_xstate(void) { - if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask_all) + if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features) return; cr4_set_bits(X86_CR4_OSXSAVE); /* + * Must happen after CR4 setup and before xsetbv() to allow KVM + * lazy passthrough. Write independent of the dynamic state static + * key as that does not work on the boot CPU. This also ensures + * that any stale state is wiped out from XFD. + */ + if (cpu_feature_enabled(X86_FEATURE_XFD)) + wrmsrl(MSR_IA32_XFD, init_fpstate.xfd); + + /* * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user * states can be set here. */ - xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_uabi()); + xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); /* * MSR_IA32_XSS sets supervisor states managed by XSAVES. @@ -158,7 +175,7 @@ void fpu__init_cpu_xstate(void) static bool xfeature_enabled(enum xfeature xfeature) { - return xfeatures_mask_all & BIT_ULL(xfeature); + return fpu_kernel_cfg.max_features & BIT_ULL(xfeature); } /* @@ -184,10 +201,7 @@ static void __init setup_xstate_features(void) xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state, xmm_space); - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (!xfeature_enabled(i)) - continue; - + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); xstate_sizes[i] = eax; @@ -236,6 +250,8 @@ static void __init print_xstate_features(void) print_xstate_feature(XFEATURE_MASK_Hi16_ZMM); print_xstate_feature(XFEATURE_MASK_PKRU); print_xstate_feature(XFEATURE_MASK_PASID); + print_xstate_feature(XFEATURE_MASK_XTILE_CFG); + print_xstate_feature(XFEATURE_MASK_XTILE_DATA); } /* @@ -291,20 +307,15 @@ static void __init setup_xstate_comp_offsets(void) xstate_comp_offsets[XFEATURE_SSE] = offsetof(struct fxregs_state, xmm_space); - if (!boot_cpu_has(X86_FEATURE_XSAVES)) { - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (xfeature_enabled(i)) - xstate_comp_offsets[i] = xstate_offsets[i]; - } + if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) { + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) + xstate_comp_offsets[i] = xstate_offsets[i]; return; } next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE; - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (!xfeature_enabled(i)) - continue; - + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { if (xfeature_is_aligned(i)) next_offset = ALIGN(next_offset, 64); @@ -328,8 +339,8 @@ static void __init setup_supervisor_only_offsets(void) next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE; - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (!xfeature_enabled(i) || !xfeature_is_supervisor(i)) + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { + if (!xfeature_is_supervisor(i)) continue; if (xfeature_is_aligned(i)) @@ -347,15 +358,36 @@ static void __init print_xstate_offset_size(void) { int i; - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (!xfeature_enabled(i)) - continue; + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", i, xstate_comp_offsets[i], i, xstate_sizes[i]); } } /* + * This function is called only during boot time when x86 caps are not set + * up and alternative can not be used yet. + */ +static __init void os_xrstor_booting(struct xregs_state *xstate) +{ + u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + if (cpu_feature_enabled(X86_FEATURE_XSAVES)) + XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); + else + XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); + + /* + * We should never fault when copying from a kernel buffer, and the FPU + * state we set at boot time should be valid. + */ + WARN_ON_FPU(err); +} + +/* * All supported features have either init state all zeros or are * handled in setup_init_fpu() individually. This is an explicit * feature list and does not use XFEATURE_MASK*SUPPORTED to catch @@ -372,36 +404,30 @@ static void __init print_xstate_offset_size(void) XFEATURE_MASK_PKRU | \ XFEATURE_MASK_BNDREGS | \ XFEATURE_MASK_BNDCSR | \ - XFEATURE_MASK_PASID) + XFEATURE_MASK_PASID | \ + XFEATURE_MASK_XTILE) /* * setup the xstate image representing the init state */ static void __init setup_init_fpu_buf(void) { - static int on_boot_cpu __initdata = 1; - BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED | XFEATURE_MASK_SUPERVISOR_SUPPORTED) != XFEATURES_INIT_FPSTATE_HANDLED); - WARN_ON_FPU(!on_boot_cpu); - on_boot_cpu = 0; - if (!boot_cpu_has(X86_FEATURE_XSAVE)) return; setup_xstate_features(); print_xstate_features(); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - init_fpstate.xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | - xfeatures_mask_all; + xstate_init_xcomp_bv(&init_fpstate.regs.xsave, fpu_kernel_cfg.max_features); /* * Init all the features state with header.xfeatures being 0x0 */ - os_xrstor_booting(&init_fpstate.xsave); + os_xrstor_booting(&init_fpstate.regs.xsave); /* * All components are now in init state. Read the state back so @@ -419,7 +445,7 @@ static void __init setup_init_fpu_buf(void) * state is all zeroes or if not to add the necessary handling * here. */ - fxsave(&init_fpstate.fxsave); + fxsave(&init_fpstate.regs.fxsave); } static int xfeature_uncompacted_offset(int xfeature_nr) @@ -451,10 +477,11 @@ int xfeature_size(int xfeature_nr) } /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ -static int validate_user_xstate_header(const struct xstate_header *hdr) +static int validate_user_xstate_header(const struct xstate_header *hdr, + struct fpstate *fpstate) { /* No unknown or supervisor features may be set */ - if (hdr->xfeatures & ~xfeatures_mask_uabi()) + if (hdr->xfeatures & ~fpstate->user_xfeatures) return -EINVAL; /* Userspace must use the uncompacted format */ @@ -474,7 +501,7 @@ static int validate_user_xstate_header(const struct xstate_header *hdr) return 0; } -static void __xstate_dump_leaves(void) +static void __init __xstate_dump_leaves(void) { int i; u32 eax, ebx, ecx, edx; @@ -509,12 +536,73 @@ static void __xstate_dump_leaves(void) } \ } while (0) +/** + * check_xtile_data_against_struct - Check tile data state size. + * + * Calculate the state size by multiplying the single tile size which is + * recorded in a C struct, and the number of tiles that the CPU informs. + * Compare the provided size with the calculation. + * + * @size: The tile data state size + * + * Returns: 0 on success, -EINVAL on mismatch. + */ +static int __init check_xtile_data_against_struct(int size) +{ + u32 max_palid, palid, state_size; + u32 eax, ebx, ecx, edx; + u16 max_tile; + + /* + * Check the maximum palette id: + * eax: the highest numbered palette subleaf. + */ + cpuid_count(TILE_CPUID, 0, &max_palid, &ebx, &ecx, &edx); + + /* + * Cross-check each tile size and find the maximum number of + * supported tiles. + */ + for (palid = 1, max_tile = 0; palid <= max_palid; palid++) { + u16 tile_size, max; + + /* + * Check the tile size info: + * eax[31:16]: bytes per title + * ebx[31:16]: the max names (or max number of tiles) + */ + cpuid_count(TILE_CPUID, palid, &eax, &ebx, &edx, &edx); + tile_size = eax >> 16; + max = ebx >> 16; + + if (tile_size != sizeof(struct xtile_data)) { + pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n", + __stringify(XFEATURE_XTILE_DATA), + sizeof(struct xtile_data), tile_size); + __xstate_dump_leaves(); + return -EINVAL; + } + + if (max > max_tile) + max_tile = max; + } + + state_size = sizeof(struct xtile_data) * max_tile; + if (size != state_size) { + pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n", + __stringify(XFEATURE_XTILE_DATA), state_size, size); + __xstate_dump_leaves(); + return -EINVAL; + } + return 0; +} + /* * We have a C struct for each 'xstate'. We need to ensure * that our software representation matches what the CPU * tells us about the state's size. */ -static void check_xstate_against_struct(int nr) +static bool __init check_xstate_against_struct(int nr) { /* * Ask the CPU for the size of the state. @@ -532,6 +620,11 @@ static void check_xstate_against_struct(int nr) XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state); XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state); XCHECK_SZ(sz, nr, XFEATURE_PASID, struct ia32_pasid_state); + XCHECK_SZ(sz, nr, XFEATURE_XTILE_CFG, struct xtile_cfg); + + /* The tile data size varies between implementations. */ + if (nr == XFEATURE_XTILE_DATA) + check_xtile_data_against_struct(sz); /* * Make *SURE* to add any feature numbers in below if @@ -541,10 +634,39 @@ static void check_xstate_against_struct(int nr) if ((nr < XFEATURE_YMM) || (nr >= XFEATURE_MAX) || (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) || - ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_LBR))) { + ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_RSRVD_COMP_16))) { WARN_ONCE(1, "no structure for xstate: %d\n", nr); XSTATE_WARN_ON(1); + return false; } + return true; +} + +static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) +{ + unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE; + int i; + + for_each_extended_xfeature(i, xfeatures) { + /* Align from the end of the previous feature */ + if (xfeature_is_aligned(i)) + size = ALIGN(size, 64); + /* + * In compacted format the enabled features are packed, + * i.e. disabled features do not occupy space. + * + * In non-compacted format the offsets are fixed and + * disabled states still occupy space in the memory buffer. + */ + if (!compacted) + size = xfeature_uncompacted_offset(i); + /* + * Add the feature size even for non-compacted format + * to make the end result correct + */ + size += xfeature_size(i); + } + return size; } /* @@ -556,44 +678,29 @@ static void check_xstate_against_struct(int nr) * covered by these checks. Only the size of the buffer for task->fpu * is checked here. */ -static void do_extra_xstate_size_checks(void) +static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) { - int paranoid_xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE; + bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); + unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE; int i; - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - if (!xfeature_enabled(i)) - continue; - - check_xstate_against_struct(i); + for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { + if (!check_xstate_against_struct(i)) + return false; /* * Supervisor state components can be managed only by * XSAVES. */ - if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) - XSTATE_WARN_ON(xfeature_is_supervisor(i)); - - /* Align from the end of the previous feature */ - if (xfeature_is_aligned(i)) - paranoid_xstate_size = ALIGN(paranoid_xstate_size, 64); - /* - * The offset of a given state in the non-compacted - * format is given to us in a CPUID leaf. We check - * them for being ordered (increasing offsets) in - * setup_xstate_features(). XSAVES uses compacted format. - */ - if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) - paranoid_xstate_size = xfeature_uncompacted_offset(i); - /* - * The compacted-format offset always depends on where - * the previous state ended. - */ - paranoid_xstate_size += xfeature_size(i); + if (!compacted && xfeature_is_supervisor(i)) { + XSTATE_WARN_ON(1); + return false; + } } - XSTATE_WARN_ON(paranoid_xstate_size != fpu_kernel_xstate_size); + size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted); + XSTATE_WARN_ON(size != kernel_size); + return size == kernel_size; } - /* * Get total size of enabled xstates in XCR0 | IA32_XSS. * @@ -644,7 +751,7 @@ static unsigned int __init get_xsaves_size_no_independent(void) return size; } -static unsigned int __init get_xsave_size(void) +static unsigned int __init get_xsave_size_user(void) { unsigned int eax, ebx, ecx, edx; /* @@ -662,44 +769,54 @@ static unsigned int __init get_xsave_size(void) * Will the runtime-enumerated 'xstate_size' fit in the init * task's statically-allocated buffer? */ -static bool is_supported_xstate_size(unsigned int test_xstate_size) +static bool __init is_supported_xstate_size(unsigned int test_xstate_size) { - if (test_xstate_size <= sizeof(union fpregs_state)) + if (test_xstate_size <= sizeof(init_fpstate.regs)) return true; pr_warn("x86/fpu: xstate buffer too small (%zu < %d), disabling xsave\n", - sizeof(union fpregs_state), test_xstate_size); + sizeof(init_fpstate.regs), test_xstate_size); return false; } static int __init init_xstate_size(void) { /* Recompute the context size for enabled features: */ - unsigned int possible_xstate_size; - unsigned int xsave_size; + unsigned int user_size, kernel_size, kernel_default_size; + bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); - xsave_size = get_xsave_size(); + /* Uncompacted user space size */ + user_size = get_xsave_size_user(); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - possible_xstate_size = get_xsaves_size_no_independent(); + /* + * XSAVES kernel size includes supervisor states and + * uses compacted format when available. + * + * XSAVE does not support supervisor states so + * kernel and user size is identical. + */ + if (compacted) + kernel_size = get_xsaves_size_no_independent(); else - possible_xstate_size = xsave_size; + kernel_size = user_size; + + kernel_default_size = + xstate_calculate_size(fpu_kernel_cfg.default_features, compacted); - /* Ensure we have the space to store all enabled: */ - if (!is_supported_xstate_size(possible_xstate_size)) + /* Ensure we have the space to store all default enabled features. */ + if (!is_supported_xstate_size(kernel_default_size)) return -EINVAL; - /* - * The size is OK, we are definitely going to use xsave, - * make it known to the world that we need more space. - */ - fpu_kernel_xstate_size = possible_xstate_size; - do_extra_xstate_size_checks(); + if (!paranoid_xstate_size_valid(kernel_size)) + return -EINVAL; + + fpu_kernel_cfg.max_size = kernel_size; + fpu_user_cfg.max_size = user_size; + + fpu_kernel_cfg.default_size = kernel_default_size; + fpu_user_cfg.default_size = + xstate_calculate_size(fpu_user_cfg.default_features, false); - /* - * User space is always in standard format. - */ - fpu_user_xstate_size = xsave_size; return 0; } @@ -707,28 +824,38 @@ static int __init init_xstate_size(void) * We enabled the XSAVE hardware, but something went wrong and * we can not use it. Disable it. */ -static void fpu__init_disable_system_xstate(void) +static void __init fpu__init_disable_system_xstate(unsigned int legacy_size) { - xfeatures_mask_all = 0; + fpu_kernel_cfg.max_features = 0; cr4_clear_bits(X86_CR4_OSXSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVE); + + /* Restore the legacy size.*/ + fpu_kernel_cfg.max_size = legacy_size; + fpu_kernel_cfg.default_size = legacy_size; + fpu_user_cfg.max_size = legacy_size; + fpu_user_cfg.default_size = legacy_size; + + /* + * Prevent enabling the static branch which enables writes to the + * XFD MSR. + */ + init_fpstate.xfd = 0; + + fpstate_reset(¤t->thread.fpu); } /* * Enable and initialize the xsave feature. * Called once per system bootup. */ -void __init fpu__init_system_xstate(void) +void __init fpu__init_system_xstate(unsigned int legacy_size) { unsigned int eax, ebx, ecx, edx; - static int on_boot_cpu __initdata = 1; u64 xfeatures; int err; int i; - WARN_ON_FPU(!on_boot_cpu); - on_boot_cpu = 0; - if (!boot_cpu_has(X86_FEATURE_FPU)) { pr_info("x86/fpu: No FPU detected\n"); return; @@ -749,22 +876,22 @@ void __init fpu__init_system_xstate(void) * Find user xstates supported by the processor. */ cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); - xfeatures_mask_all = eax + ((u64)edx << 32); + fpu_kernel_cfg.max_features = eax + ((u64)edx << 32); /* * Find supervisor xstates supported by the processor. */ cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); - xfeatures_mask_all |= ecx + ((u64)edx << 32); + fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32); - if ((xfeatures_mask_uabi() & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { + if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { /* * This indicates that something really unexpected happened * with the enumeration. Disable XSAVE and try to continue * booting without it. This is too early to BUG(). */ pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", - xfeatures_mask_all); + fpu_kernel_cfg.max_features); goto out_disable; } @@ -772,15 +899,39 @@ void __init fpu__init_system_xstate(void) * Clear XSAVE features that are disabled in the normal CPUID. */ for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { - if (!boot_cpu_has(xsave_cpuid_features[i])) - xfeatures_mask_all &= ~BIT_ULL(i); + unsigned short cid = xsave_cpuid_features[i]; + + /* Careful: X86_FEATURE_FPU is 0! */ + if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid)) + fpu_kernel_cfg.max_features &= ~BIT_ULL(i); } - xfeatures_mask_all &= XFEATURE_MASK_USER_SUPPORTED | + if (!cpu_feature_enabled(X86_FEATURE_XFD)) + fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC; + + fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED | XFEATURE_MASK_SUPERVISOR_SUPPORTED; + fpu_user_cfg.max_features = fpu_kernel_cfg.max_features; + fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; + + /* Clean out dynamic features from default */ + fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features; + fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC; + + fpu_user_cfg.default_features = fpu_user_cfg.max_features; + fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC; + /* Store it for paranoia check at the end */ - xfeatures = xfeatures_mask_all; + xfeatures = fpu_kernel_cfg.max_features; + + /* + * Initialize the default XFD state in initfp_state and enable the + * dynamic sizing mechanism if dynamic states are available. The + * static key cannot be enabled here because this runs before + * jump_label_init(). This is delayed to an initcall. + */ + init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC; /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); @@ -788,13 +939,16 @@ void __init fpu__init_system_xstate(void) if (err) goto out_disable; + /* Reset the state for the current task */ + fpstate_reset(¤t->thread.fpu); + /* * Update info used for ptrace frames; use standard-format size and no * supervisor xstates: */ - update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask_uabi()); + update_regset_xstate_info(fpu_user_cfg.max_size, + fpu_user_cfg.max_features); - fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); setup_xstate_comp_offsets(); setup_supervisor_only_offsets(); @@ -803,22 +957,22 @@ void __init fpu__init_system_xstate(void) * Paranoia check whether something in the setup modified the * xfeatures mask. */ - if (xfeatures != xfeatures_mask_all) { + if (xfeatures != fpu_kernel_cfg.max_features) { pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n", - xfeatures, xfeatures_mask_all); + xfeatures, fpu_kernel_cfg.max_features); goto out_disable; } print_xstate_offset_size(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", - xfeatures_mask_all, - fpu_kernel_xstate_size, + fpu_kernel_cfg.max_features, + fpu_kernel_cfg.max_size, boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard"); return; out_disable: /* something went wrong, try to boot without any XSAVE support */ - fpu__init_disable_system_xstate(); + fpu__init_disable_system_xstate(legacy_size); } /* @@ -830,7 +984,7 @@ void fpu__resume_cpu(void) * Restore XCR0 on xsave capable CPUs: */ if (cpu_feature_enabled(X86_FEATURE_XSAVE)) - xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_uabi()); + xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); /* * Restore IA32_XSS. The same CPUID bit enumerates support @@ -840,6 +994,9 @@ void fpu__resume_cpu(void) wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | xfeatures_mask_independent()); } + + if (fpu_state_size_dynamic()) + wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd); } /* @@ -886,7 +1043,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) * We should not ever be requesting features that we * have not enabled. */ - WARN_ONCE(!(xfeatures_mask_all & BIT_ULL(xfeature_nr)), + WARN_ONCE(!(fpu_kernel_cfg.max_features & BIT_ULL(xfeature_nr)), "get of unsupported state"); /* * This assumes the last 'xsave*' instruction to @@ -904,7 +1061,6 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) return __raw_xsave_addr(xsave, xfeature_nr); } -EXPORT_SYMBOL_GPL(get_xsave_addr); #ifdef CONFIG_ARCH_HAS_PKEYS @@ -961,9 +1117,10 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, } /** - * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer + * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer * @to: membuf descriptor - * @tsk: The task from which to copy the saved xstate + * @fpstate: The fpstate buffer from which to copy + * @pkru_val: The PKRU value to store in the PKRU component * @copy_mode: The requested copy mode * * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming @@ -972,14 +1129,15 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, * * It supports partial copy but @to.pos always starts from zero. */ -void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, - enum xstate_copy_mode copy_mode) +void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, + u32 pkru_val, enum xstate_copy_mode copy_mode) { const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); - struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; - struct xregs_state *xinit = &init_fpstate.xsave; + struct xregs_state *xinit = &init_fpstate.regs.xsave; + struct xregs_state *xsave = &fpstate->regs.xsave; struct xstate_header header; unsigned int zerofrom; + u64 mask; int i; memset(&header, 0, sizeof(header)); @@ -996,7 +1154,7 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, break; case XSTATE_COPY_XSAVE: - header.xfeatures &= xfeatures_mask_uabi(); + header.xfeatures &= fpstate->user_xfeatures; break; } @@ -1033,17 +1191,15 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, zerofrom = offsetof(struct xregs_state, extended_state_area); - for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { - /* - * The ptrace buffer is in non-compacted XSAVE format. - * In non-compacted format disabled features still occupy - * state space, but there is no state to copy from in the - * compacted init_fpstate. The gap tracking will zero this - * later. - */ - if (!(xfeatures_mask_uabi() & BIT_ULL(i))) - continue; + /* + * The ptrace buffer is in non-compacted XSAVE format. In + * non-compacted format disabled features still occupy state space, + * but there is no state to copy from in the compacted + * init_fpstate. The gap tracking will zero these states. + */ + mask = fpstate->user_xfeatures; + for_each_extended_xfeature(i, mask) { /* * If there was a feature or alignment gap, zero the space * in the destination buffer. @@ -1055,10 +1211,9 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, struct pkru_state pkru = {0}; /* * PKRU is not necessarily up to date in the - * thread's XSAVE buffer. Fill this part from the - * per-thread storage. + * XSAVE buffer. Use the provided value. */ - pkru.pkru = tsk->thread.pkru; + pkru.pkru = pkru_val; membuf_write(&to, &pkru, sizeof(pkru)); } else { copy_feature(header.xfeatures & BIT_ULL(i), &to, @@ -1078,6 +1233,25 @@ out: membuf_zero(&to, to.left); } +/** + * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer + * @to: membuf descriptor + * @tsk: The task from which to copy the saved xstate + * @copy_mode: The requested copy mode + * + * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming + * format, i.e. from the kernel internal hardware dependent storage format + * to the requested @mode. UABI XSTATE is always uncompacted! + * + * It supports partial copy but @to.pos always starts from zero. + */ +void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, + enum xstate_copy_mode copy_mode) +{ + __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate, + tsk->thread.pkru, copy_mode); +} + static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size, const void *kbuf, const void __user *ubuf) { @@ -1091,9 +1265,10 @@ static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size, } -static int copy_uabi_to_xstate(struct xregs_state *xsave, const void *kbuf, +static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf, const void __user *ubuf) { + struct xregs_state *xsave = &fpstate->regs.xsave; unsigned int offset, size; struct xstate_header hdr; u64 mask; @@ -1103,7 +1278,7 @@ static int copy_uabi_to_xstate(struct xregs_state *xsave, const void *kbuf, if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf)) return -EFAULT; - if (validate_user_xstate_header(&hdr)) + if (validate_user_xstate_header(&hdr, fpstate)) return -EINVAL; /* Validate MXCSR when any of the related features is in use */ @@ -1156,12 +1331,11 @@ static int copy_uabi_to_xstate(struct xregs_state *xsave, const void *kbuf, /* * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S] - * format and copy to the target thread. This is called from - * xstateregs_set(). + * format and copy to the target thread. Used by ptrace and KVM. */ -int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) +int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf) { - return copy_uabi_to_xstate(xsave, kbuf, NULL); + return copy_uabi_to_xstate(fpstate, kbuf, NULL); } /* @@ -1169,26 +1343,20 @@ int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) * XSAVE[S] format and copy to the target thread. This is called from the * sigreturn() and rt_sigreturn() system calls. */ -int copy_sigframe_from_user_to_xstate(struct xregs_state *xsave, +int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate, const void __user *ubuf) { - return copy_uabi_to_xstate(xsave, NULL, ubuf); + return copy_uabi_to_xstate(fpstate, NULL, ubuf); } -static bool validate_xsaves_xrstors(u64 mask) +static bool validate_independent_components(u64 mask) { u64 xchk; if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES))) return false; - /* - * Validate that this is either a task->fpstate related component - * subset or an independent one. - */ - if (mask & xfeatures_mask_independent()) - xchk = ~xfeatures_mask_independent(); - else - xchk = ~xfeatures_mask_all; + + xchk = ~xfeatures_mask_independent(); if (WARN_ON_ONCE(!mask || mask & xchk)) return false; @@ -1206,14 +1374,13 @@ static bool validate_xsaves_xrstors(u64 mask) * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer * can #GP. * - * The feature mask must either be a subset of the independent features or - * a subset of the task->fpstate related features. + * The feature mask must be a subset of the independent features. */ void xsaves(struct xregs_state *xstate, u64 mask) { int err; - if (!validate_xsaves_xrstors(mask)) + if (!validate_independent_components(mask)) return; XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err); @@ -1231,20 +1398,420 @@ void xsaves(struct xregs_state *xstate, u64 mask) * Proper usage is to restore the state which was saved with * xsaves() into @xstate. * - * The feature mask must either be a subset of the independent features or - * a subset of the task->fpstate related features. + * The feature mask must be a subset of the independent features. */ void xrstors(struct xregs_state *xstate, u64 mask) { int err; - if (!validate_xsaves_xrstors(mask)) + if (!validate_independent_components(mask)) return; XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err); WARN_ON_ONCE(err); } +#if IS_ENABLED(CONFIG_KVM) +void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature) +{ + void *addr = get_xsave_addr(&fps->regs.xsave, xfeature); + + if (addr) + memset(addr, 0, xstate_sizes[xfeature]); +} +EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component); +#endif + +#ifdef CONFIG_X86_64 + +#ifdef CONFIG_X86_DEBUG_FPU +/* + * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask + * can safely operate on the @fpstate buffer. + */ +static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor) +{ + u64 xfd = __this_cpu_read(xfd_state); + + if (fpstate->xfd == xfd) + return true; + + /* + * The XFD MSR does not match fpstate->xfd. That's invalid when + * the passed in fpstate is current's fpstate. + */ + if (fpstate->xfd == current->thread.fpu.fpstate->xfd) + return false; + + /* + * XRSTOR(S) from init_fpstate are always correct as it will just + * bring all components into init state and not read from the + * buffer. XSAVE(S) raises #PF after init. + */ + if (fpstate == &init_fpstate) + return rstor; + + /* + * XSAVE(S): clone(), fpu_swap_kvm_fpu() + * XRSTORS(S): fpu_swap_kvm_fpu() + */ + + /* + * No XSAVE/XRSTOR instructions (except XSAVE itself) touch + * the buffer area for XFD-disabled state components. + */ + mask &= ~xfd; + + /* + * Remove features which are valid in fpstate. They + * have space allocated in fpstate. + */ + mask &= ~fpstate->xfeatures; + + /* + * Any remaining state components in 'mask' might be written + * by XSAVE/XRSTOR. Fail validation it found. + */ + return !mask; +} + +void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) +{ + WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor)); +} +#endif /* CONFIG_X86_DEBUG_FPU */ + +static int __init xfd_update_static_branch(void) +{ + /* + * If init_fpstate.xfd has bits set then dynamic features are + * available and the dynamic sizing must be enabled. + */ + if (init_fpstate.xfd) + static_branch_enable(&__fpu_state_size_dynamic); + return 0; +} +arch_initcall(xfd_update_static_branch) + +void fpstate_free(struct fpu *fpu) +{ + if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate) + vfree(fpu->fpstate); +} + +/** + * fpstate_realloc - Reallocate struct fpstate for the requested new features + * + * @xfeatures: A bitmap of xstate features which extend the enabled features + * of that task + * @ksize: The required size for the kernel buffer + * @usize: The required size for user space buffers + * @guest_fpu: Pointer to a guest FPU container. NULL for host allocations + * + * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer + * terminates quickly, vfree()-induced IPIs may be a concern, but tasks + * with large states are likely to live longer. + * + * Returns: 0 on success, -ENOMEM on allocation error. + */ +static int fpstate_realloc(u64 xfeatures, unsigned int ksize, + unsigned int usize, struct fpu_guest *guest_fpu) +{ + struct fpu *fpu = ¤t->thread.fpu; + struct fpstate *curfps, *newfps = NULL; + unsigned int fpsize; + bool in_use; + + fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64); + + newfps = vzalloc(fpsize); + if (!newfps) + return -ENOMEM; + newfps->size = ksize; + newfps->user_size = usize; + newfps->is_valloc = true; + + /* + * When a guest FPU is supplied, use @guest_fpu->fpstate + * as reference independent whether it is in use or not. + */ + curfps = guest_fpu ? guest_fpu->fpstate : fpu->fpstate; + + /* Determine whether @curfps is the active fpstate */ + in_use = fpu->fpstate == curfps; + + if (guest_fpu) { + newfps->is_guest = true; + newfps->is_confidential = curfps->is_confidential; + newfps->in_use = curfps->in_use; + guest_fpu->xfeatures |= xfeatures; + guest_fpu->uabi_size = usize; + } + + fpregs_lock(); + /* + * If @curfps is in use, ensure that the current state is in the + * registers before swapping fpstate as that might invalidate it + * due to layout changes. + */ + if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD)) + fpregs_restore_userregs(); + + newfps->xfeatures = curfps->xfeatures | xfeatures; + newfps->user_xfeatures = curfps->user_xfeatures | xfeatures; + newfps->xfd = curfps->xfd & ~xfeatures; + + /* Do the final updates within the locked region */ + xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures); + + if (guest_fpu) { + guest_fpu->fpstate = newfps; + /* If curfps is active, update the FPU fpstate pointer */ + if (in_use) + fpu->fpstate = newfps; + } else { + fpu->fpstate = newfps; + } + + if (in_use) + xfd_update_state(fpu->fpstate); + fpregs_unlock(); + + /* Only free valloc'ed state */ + if (curfps && curfps->is_valloc) + vfree(curfps); + + return 0; +} + +static int validate_sigaltstack(unsigned int usize) +{ + struct task_struct *thread, *leader = current->group_leader; + unsigned long framesize = get_sigframe_size(); + + lockdep_assert_held(¤t->sighand->siglock); + + /* get_sigframe_size() is based on fpu_user_cfg.max_size */ + framesize -= fpu_user_cfg.max_size; + framesize += usize; + for_each_thread(leader, thread) { + if (thread->sas_ss_size && thread->sas_ss_size < framesize) + return -ENOSPC; + } + return 0; +} + +static int __xstate_request_perm(u64 permitted, u64 requested, bool guest) +{ + /* + * This deliberately does not exclude !XSAVES as we still might + * decide to optionally context switch XCR0 or talk the silicon + * vendors into extending XFD for the pre AMX states, especially + * AVX512. + */ + bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); + struct fpu *fpu = ¤t->group_leader->thread.fpu; + struct fpu_state_perm *perm; + unsigned int ksize, usize; + u64 mask; + int ret = 0; + + /* Check whether fully enabled */ + if ((permitted & requested) == requested) + return 0; + + /* Calculate the resulting kernel state size */ + mask = permitted | requested; + ksize = xstate_calculate_size(mask, compacted); + + /* Calculate the resulting user state size */ + mask &= XFEATURE_MASK_USER_SUPPORTED; + usize = xstate_calculate_size(mask, false); + + if (!guest) { + ret = validate_sigaltstack(usize); + if (ret) + return ret; + } + + perm = guest ? &fpu->guest_perm : &fpu->perm; + /* Pairs with the READ_ONCE() in xstate_get_group_perm() */ + WRITE_ONCE(perm->__state_perm, requested); + /* Protected by sighand lock */ + perm->__state_size = ksize; + perm->__user_state_size = usize; + return ret; +} + +/* + * Permissions array to map facilities with more than one component + */ +static const u64 xstate_prctl_req[XFEATURE_MAX] = { + [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA, +}; + +static int xstate_request_perm(unsigned long idx, bool guest) +{ + u64 permitted, requested; + int ret; + + if (idx >= XFEATURE_MAX) + return -EINVAL; + + /* + * Look up the facility mask which can require more than + * one xstate component. + */ + idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req)); + requested = xstate_prctl_req[idx]; + if (!requested) + return -EOPNOTSUPP; + + if ((fpu_user_cfg.max_features & requested) != requested) + return -EOPNOTSUPP; + + /* Lockless quick check */ + permitted = xstate_get_group_perm(guest); + if ((permitted & requested) == requested) + return 0; + + /* Protect against concurrent modifications */ + spin_lock_irq(¤t->sighand->siglock); + permitted = xstate_get_group_perm(guest); + + /* First vCPU allocation locks the permissions. */ + if (guest && (permitted & FPU_GUEST_PERM_LOCKED)) + ret = -EBUSY; + else + ret = __xstate_request_perm(permitted, requested, guest); + spin_unlock_irq(¤t->sighand->siglock); + return ret; +} + +int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu) +{ + u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC; + struct fpu_state_perm *perm; + unsigned int ksize, usize; + struct fpu *fpu; + + if (!xfd_event) { + if (!guest_fpu) + pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err); + return 0; + } + + /* Protect against concurrent modifications */ + spin_lock_irq(¤t->sighand->siglock); + + /* If not permitted let it die */ + if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) { + spin_unlock_irq(¤t->sighand->siglock); + return -EPERM; + } + + fpu = ¤t->group_leader->thread.fpu; + perm = guest_fpu ? &fpu->guest_perm : &fpu->perm; + ksize = perm->__state_size; + usize = perm->__user_state_size; + + /* + * The feature is permitted. State size is sufficient. Dropping + * the lock is safe here even if more features are added from + * another task, the retrieved buffer sizes are valid for the + * currently requested feature(s). + */ + spin_unlock_irq(¤t->sighand->siglock); + + /* + * Try to allocate a new fpstate. If that fails there is no way + * out. + */ + if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu)) + return -EFAULT; + return 0; +} + +int xfd_enable_feature(u64 xfd_err) +{ + return __xfd_enable_feature(xfd_err, NULL); +} + +#else /* CONFIG_X86_64 */ +static inline int xstate_request_perm(unsigned long idx, bool guest) +{ + return -EPERM; +} +#endif /* !CONFIG_X86_64 */ + +u64 xstate_get_guest_group_perm(void) +{ + return xstate_get_group_perm(true); +} +EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm); + +/** + * fpu_xstate_prctl - xstate permission operations + * @tsk: Redundant pointer to current + * @option: A subfunction of arch_prctl() + * @arg2: option argument + * Return: 0 if successful; otherwise, an error code + * + * Option arguments: + * + * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info + * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info + * ARCH_REQ_XCOMP_PERM: Facility number requested + * + * For facilities which require more than one XSTATE component, the request + * must be the highest state component number related to that facility, + * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and + * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18). + */ +long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2) +{ + u64 __user *uptr = (u64 __user *)arg2; + u64 permitted, supported; + unsigned long idx = arg2; + bool guest = false; + + if (tsk != current) + return -EPERM; + + switch (option) { + case ARCH_GET_XCOMP_SUPP: + supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features; + return put_user(supported, uptr); + + case ARCH_GET_XCOMP_PERM: + /* + * Lockless snapshot as it can also change right after the + * dropping the lock. + */ + permitted = xstate_get_host_group_perm(); + permitted &= XFEATURE_MASK_USER_SUPPORTED; + return put_user(permitted, uptr); + + case ARCH_GET_XCOMP_GUEST_PERM: + permitted = xstate_get_guest_group_perm(); + permitted &= XFEATURE_MASK_USER_SUPPORTED; + return put_user(permitted, uptr); + + case ARCH_REQ_XCOMP_GUEST_PERM: + guest = true; + fallthrough; + + case ARCH_REQ_XCOMP_PERM: + if (!IS_ENABLED(CONFIG_X86_64)) + return -EOPNOTSUPP; + + return xstate_request_perm(idx, guest); + + default: + return -EINVAL; + } +} + #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * Report the amount of time elapsed in millisecond since last AVX512 diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h new file mode 100644 index 000000000000..d22ace092ca2 --- /dev/null +++ b/arch/x86/kernel/fpu/xstate.h @@ -0,0 +1,322 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_KERNEL_FPU_XSTATE_H +#define __X86_KERNEL_FPU_XSTATE_H + +#include <asm/cpufeature.h> +#include <asm/fpu/xstate.h> +#include <asm/fpu/xcr.h> + +#ifdef CONFIG_X86_64 +DECLARE_PER_CPU(u64, xfd_state); +#endif + +static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask) +{ + /* + * XRSTORS requires these bits set in xcomp_bv, or it will + * trigger #GP: + */ + if (cpu_feature_enabled(X86_FEATURE_XSAVES)) + xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT; +} + +static inline u64 xstate_get_group_perm(bool guest) +{ + struct fpu *fpu = ¤t->group_leader->thread.fpu; + struct fpu_state_perm *perm; + + /* Pairs with WRITE_ONCE() in xstate_request_perm() */ + perm = guest ? &fpu->guest_perm : &fpu->perm; + return READ_ONCE(perm->__state_perm); +} + +static inline u64 xstate_get_host_group_perm(void) +{ + return xstate_get_group_perm(false); +} + +enum xstate_copy_mode { + XSTATE_COPY_FP, + XSTATE_COPY_FX, + XSTATE_COPY_XSAVE, +}; + +struct membuf; +extern void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, + u32 pkru_val, enum xstate_copy_mode copy_mode); +extern void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, + enum xstate_copy_mode mode); +extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf); +extern int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate, const void __user *ubuf); + + +extern void fpu__init_cpu_xstate(void); +extern void fpu__init_system_xstate(unsigned int legacy_size); + +extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); + +static inline u64 xfeatures_mask_supervisor(void) +{ + return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED; +} + +static inline u64 xfeatures_mask_independent(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) + return XFEATURE_MASK_INDEPENDENT & ~XFEATURE_MASK_LBR; + + return XFEATURE_MASK_INDEPENDENT; +} + +/* XSAVE/XRSTOR wrapper functions */ + +#ifdef CONFIG_X86_64 +#define REX_PREFIX "0x48, " +#else +#define REX_PREFIX +#endif + +/* These macros all use (%edi)/(%rdi) as the single memory argument. */ +#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" +#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" +#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" +#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" +#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" + +/* + * After this @err contains 0 on success or the trap number when the + * operation raises an exception. + */ +#define XSTATE_OP(op, st, lmask, hmask, err) \ + asm volatile("1:" op "\n\t" \ + "xor %[err], %[err]\n" \ + "2:\n\t" \ + _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \ + : [err] "=a" (err) \ + : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : "memory") + +/* + * If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact + * format and supervisor states in addition to modified optimization in + * XSAVEOPT. + * + * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT + * supports modified optimization which is not supported by XSAVE. + * + * We use XSAVE as a fallback. + * + * The 661 label is defined in the ALTERNATIVE* macros as the address of the + * original instruction which gets replaced. We need to use it here as the + * address of the instruction where we might get an exception at. + */ +#define XSTATE_XSAVE(st, lmask, hmask, err) \ + asm volatile(ALTERNATIVE_2(XSAVE, \ + XSAVEOPT, X86_FEATURE_XSAVEOPT, \ + XSAVES, X86_FEATURE_XSAVES) \ + "\n" \ + "xor %[err], %[err]\n" \ + "3:\n" \ + _ASM_EXTABLE_TYPE_REG(661b, 3b, EX_TYPE_EFAULT_REG, %[err]) \ + : [err] "=r" (err) \ + : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : "memory") + +/* + * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact + * XSAVE area format. + */ +#define XSTATE_XRESTORE(st, lmask, hmask) \ + asm volatile(ALTERNATIVE(XRSTOR, \ + XRSTORS, X86_FEATURE_XSAVES) \ + "\n" \ + "3:\n" \ + _ASM_EXTABLE_TYPE(661b, 3b, EX_TYPE_FPU_RESTORE) \ + : \ + : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : "memory") + +#if defined(CONFIG_X86_64) && defined(CONFIG_X86_DEBUG_FPU) +extern void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor); +#else +static inline void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) { } +#endif + +#ifdef CONFIG_X86_64 +static inline void xfd_update_state(struct fpstate *fpstate) +{ + if (fpu_state_size_dynamic()) { + u64 xfd = fpstate->xfd; + + if (__this_cpu_read(xfd_state) != xfd) { + wrmsrl(MSR_IA32_XFD, xfd); + __this_cpu_write(xfd_state, xfd); + } + } +} + +extern int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu); +#else +static inline void xfd_update_state(struct fpstate *fpstate) { } + +static inline int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu) { + return -EPERM; +} +#endif + +/* + * Save processor xstate to xsave area. + * + * Uses either XSAVE or XSAVEOPT or XSAVES depending on the CPU features + * and command line options. The choice is permanent until the next reboot. + */ +static inline void os_xsave(struct fpstate *fpstate) +{ + u64 mask = fpstate->xfeatures; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + WARN_ON_FPU(!alternatives_patched); + xfd_validate_state(fpstate, mask, false); + + XSTATE_XSAVE(&fpstate->regs.xsave, lmask, hmask, err); + + /* We should never fault when copying to a kernel buffer: */ + WARN_ON_FPU(err); +} + +/* + * Restore processor xstate from xsave area. + * + * Uses XRSTORS when XSAVES is used, XRSTOR otherwise. + */ +static inline void os_xrstor(struct fpstate *fpstate, u64 mask) +{ + u32 lmask = mask; + u32 hmask = mask >> 32; + + xfd_validate_state(fpstate, mask, true); + XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask); +} + +/* Restore of supervisor state. Does not require XFD */ +static inline void os_xrstor_supervisor(struct fpstate *fpstate) +{ + u64 mask = xfeatures_mask_supervisor(); + u32 lmask = mask; + u32 hmask = mask >> 32; + + XSTATE_XRESTORE(&fpstate->regs.xsave, lmask, hmask); +} + +/* + * XSAVE itself always writes all requested xfeatures. Removing features + * from the request bitmap reduces the features which are written. + * Generate a mask of features which must be written to a sigframe. The + * unset features can be optimized away and not written. + * + * This optimization is user-visible. Only use for states where + * uninitialized sigframe contents are tolerable, like dynamic features. + * + * Users of buffers produced with this optimization must check XSTATE_BV + * to determine which features have been optimized out. + */ +static inline u64 xfeatures_need_sigframe_write(void) +{ + u64 xfeaures_to_write; + + /* In-use features must be written: */ + xfeaures_to_write = xfeatures_in_use(); + + /* Also write all non-optimizable sigframe features: */ + xfeaures_to_write |= XFEATURE_MASK_USER_SUPPORTED & + ~XFEATURE_MASK_SIGFRAME_INITOPT; + + return xfeaures_to_write; +} + +/* + * Save xstate to user space xsave area. + * + * We don't use modified optimization because xrstor/xrstors might track + * a different application. + * + * We don't use compacted format xsave area for backward compatibility for + * old applications which don't understand the compacted format of the + * xsave area. + * + * The caller has to zero buf::header before calling this because XSAVE* + * does not touch the reserved fields in the header. + */ +static inline int xsave_to_user_sigframe(struct xregs_state __user *buf) +{ + /* + * Include the features which are not xsaved/rstored by the kernel + * internally, e.g. PKRU. That's user space ABI and also required + * to allow the signal handler to modify PKRU. + */ + struct fpstate *fpstate = current->thread.fpu.fpstate; + u64 mask = fpstate->user_xfeatures; + u32 lmask; + u32 hmask; + int err; + + /* Optimize away writing unnecessary xfeatures: */ + if (fpu_state_size_dynamic()) + mask &= xfeatures_need_sigframe_write(); + + lmask = mask; + hmask = mask >> 32; + xfd_validate_state(fpstate, mask, false); + + stac(); + XSTATE_OP(XSAVE, buf, lmask, hmask, err); + clac(); + + return err; +} + +/* + * Restore xstate from user space xsave area. + */ +static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 mask) +{ + struct xregs_state *xstate = ((__force struct xregs_state *)buf); + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + xfd_validate_state(current->thread.fpu.fpstate, mask, true); + + stac(); + XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); + clac(); + + return err; +} + +/* + * Restore xstate from kernel space xsave area, return an error code instead of + * an exception. + */ +static inline int os_xrstor_safe(struct fpstate *fpstate, u64 mask) +{ + struct xregs_state *xstate = &fpstate->regs.xsave; + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + /* Ensure that XFD is up to date */ + xfd_update_state(fpstate); + + if (cpu_feature_enabled(X86_FEATURE_XSAVES)) + XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); + else + XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); + + return err; +} + + +#endif diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 1b3ce3b4a2a2..7cc540e6de0c 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -252,11 +252,6 @@ void arch_ftrace_update_code(int command) ftrace_modify_all_code(command); } -int __init ftrace_dyn_arch_init(void) -{ - return 0; -} - /* Currently only x86_64 supports dynamic trampolines */ #ifdef CONFIG_X86_64 @@ -308,7 +303,7 @@ union ftrace_op_code_union { } __attribute__((packed)); }; -#define RET_SIZE 1 +#define RET_SIZE 1 + IS_ENABLED(CONFIG_SLS) static unsigned long create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) @@ -527,7 +522,7 @@ static void *addr_from_call(void *ptr) return ptr + CALL_INSN_SIZE + call.disp; } -void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, +void prepare_ftrace_return(unsigned long ip, unsigned long *parent, unsigned long frame_pointer); /* @@ -541,7 +536,8 @@ static void *static_tramp_func(struct ftrace_ops *ops, struct dyn_ftrace *rec) void *ptr; if (ops && ops->trampoline) { -#ifdef CONFIG_FUNCTION_GRAPH_TRACER +#if !defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS) && \ + defined(CONFIG_FUNCTION_GRAPH_TRACER) /* * We only know about function graph tracer setting as static * trampoline. @@ -589,8 +585,9 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops) #ifdef CONFIG_FUNCTION_GRAPH_TRACER #ifdef CONFIG_DYNAMIC_FTRACE -extern void ftrace_graph_call(void); +#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS +extern void ftrace_graph_call(void); static const char *ftrace_jmp_replace(unsigned long ip, unsigned long addr) { return text_gen_insn(JMP32_INSN_OPCODE, (void *)ip, (void *)addr); @@ -618,19 +615,28 @@ int ftrace_disable_ftrace_graph_caller(void) return ftrace_mod_jmp(ip, &ftrace_stub); } +#else /* !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */ +int ftrace_enable_ftrace_graph_caller(void) +{ + return 0; +} +int ftrace_disable_ftrace_graph_caller(void) +{ + return 0; +} +#endif /* CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */ #endif /* !CONFIG_DYNAMIC_FTRACE */ /* * Hook the return address and push it in the stack of return addrs * in current thread info. */ -void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, +void prepare_ftrace_return(unsigned long ip, unsigned long *parent, unsigned long frame_pointer) { unsigned long return_hooker = (unsigned long)&return_to_handler; - unsigned long old; - int faulted; + int bit; /* * When resuming from suspend-to-ram, this function can be indirectly @@ -650,37 +656,25 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, if (unlikely(atomic_read(¤t->tracing_graph_pause))) return; - /* - * Protect against fault, even if it shouldn't - * happen. This tool is too much intrusive to - * ignore such a protection. - */ - asm volatile( - "1: " _ASM_MOV " (%[parent]), %[old]\n" - "2: " _ASM_MOV " %[return_hooker], (%[parent])\n" - " movl $0, %[faulted]\n" - "3:\n" - - ".section .fixup, \"ax\"\n" - "4: movl $1, %[faulted]\n" - " jmp 3b\n" - ".previous\n" - - _ASM_EXTABLE(1b, 4b) - _ASM_EXTABLE(2b, 4b) - - : [old] "=&r" (old), [faulted] "=r" (faulted) - : [parent] "r" (parent), [return_hooker] "r" (return_hooker) - : "memory" - ); - - if (unlikely(faulted)) { - ftrace_graph_stop(); - WARN_ON(1); + bit = ftrace_test_recursion_trylock(ip, *parent); + if (bit < 0) return; - } - if (function_graph_enter(old, self_addr, frame_pointer, parent)) - *parent = old; + if (!function_graph_enter(*parent, ip, frame_pointer, parent)) + *parent = return_hooker; + + ftrace_test_recursion_unlock(bit); } + +#ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS +void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) +{ + struct pt_regs *regs = &fregs->regs; + unsigned long *stack = (unsigned long *)kernel_stack_pointer(regs); + + prepare_ftrace_return(ip, (unsigned long *)stack, 0); +} +#endif + #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S index e405fe1a8bf4..a0ed0e4a2c0c 100644 --- a/arch/x86/kernel/ftrace_32.S +++ b/arch/x86/kernel/ftrace_32.S @@ -19,7 +19,7 @@ #endif SYM_FUNC_START(__fentry__) - ret + RET SYM_FUNC_END(__fentry__) EXPORT_SYMBOL(__fentry__) @@ -84,7 +84,7 @@ ftrace_graph_call: /* This is weak to keep gas from relaxing the jumps */ SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK) - ret + RET SYM_CODE_END(ftrace_caller) SYM_CODE_START(ftrace_regs_caller) @@ -177,7 +177,7 @@ SYM_CODE_START(ftrace_graph_caller) popl %edx popl %ecx popl %eax - ret + RET SYM_CODE_END(ftrace_graph_caller) .globl return_to_handler diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 7c273846c687..11ac028e30e4 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -132,7 +132,7 @@ #ifdef CONFIG_DYNAMIC_FTRACE SYM_FUNC_START(__fentry__) - retq + RET SYM_FUNC_END(__fentry__) EXPORT_SYMBOL(__fentry__) @@ -174,18 +174,13 @@ SYM_INNER_LABEL(ftrace_caller_end, SYM_L_GLOBAL) SYM_FUNC_END(ftrace_caller); SYM_FUNC_START(ftrace_epilogue) -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL) - jmp ftrace_stub -#endif - /* * This is weak to keep gas from relaxing the jumps. - * It is also used to copy the retq for trampolines. + * It is also used to copy the RET for trampolines. */ SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK) UNWIND_HINT_FUNC - retq + RET SYM_FUNC_END(ftrace_epilogue) SYM_FUNC_START(ftrace_regs_caller) @@ -251,7 +246,6 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) * If ORIG_RAX is anything but zero, make this a call to that. * See arch_ftrace_set_direct_caller(). */ - movq ORIG_RAX(%rsp), %rax testq %rax, %rax SYM_INNER_LABEL(ftrace_regs_caller_jmp, SYM_L_GLOBAL) jnz 1f @@ -289,17 +283,8 @@ SYM_FUNC_START(__fentry__) cmpq $ftrace_stub, ftrace_trace_function jnz trace -fgraph_trace: -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - cmpq $ftrace_stub, ftrace_graph_return - jnz ftrace_graph_caller - - cmpq $ftrace_graph_entry_stub, ftrace_graph_entry - jnz ftrace_graph_caller -#endif - SYM_INNER_LABEL(ftrace_stub, SYM_L_GLOBAL) - retq + RET trace: /* save_mcount_regs fills in first two parameters */ @@ -315,25 +300,12 @@ trace: CALL_NOSPEC r8 restore_mcount_regs - jmp fgraph_trace + jmp ftrace_stub SYM_FUNC_END(__fentry__) EXPORT_SYMBOL(__fentry__) #endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER -SYM_FUNC_START(ftrace_graph_caller) - /* Saves rbp into %rdx and fills first parameter */ - save_mcount_regs - - leaq MCOUNT_REG_SIZE+8(%rsp), %rsi - movq $0, %rdx /* No framepointers needed */ - call prepare_ftrace_return - - restore_mcount_regs - - retq -SYM_FUNC_END(ftrace_graph_caller) - SYM_FUNC_START(return_to_handler) subq $24, %rsp diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index de01903c3735..de563db9cdcd 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -19,7 +19,7 @@ #include <linux/start_kernel.h> #include <linux/io.h> #include <linux/memblock.h> -#include <linux/mem_encrypt.h> +#include <linux/cc_platform.h> #include <linux/pgtable.h> #include <asm/processor.h> @@ -126,6 +126,36 @@ static bool __head check_la57_support(unsigned long physaddr) } #endif +static unsigned long sme_postprocess_startup(struct boot_params *bp, pmdval_t *pmd) +{ + unsigned long vaddr, vaddr_end; + int i; + + /* Encrypt the kernel and related (if SME is active) */ + sme_encrypt_kernel(bp); + + /* + * Clear the memory encryption mask from the .bss..decrypted section. + * The bss section will be memset to zero later in the initialization so + * there is no need to zero it after changing the memory encryption + * attribute. + */ + if (sme_get_me_mask()) { + vaddr = (unsigned long)__start_bss_decrypted; + vaddr_end = (unsigned long)__end_bss_decrypted; + for (; vaddr < vaddr_end; vaddr += PMD_SIZE) { + i = pmd_index(vaddr); + pmd[i] -= sme_get_me_mask(); + } + } + + /* + * Return the SME encryption mask (if SME is active) to be used as a + * modifier for the initial pgdir entry programmed into CR3. + */ + return sme_get_me_mask(); +} + /* Code in __startup_64() can be relocated during execution, but the compiler * doesn't have to generate PC-relative relocations when accessing globals from * that function. Clang actually does not generate them, which leads to @@ -135,7 +165,6 @@ static bool __head check_la57_support(unsigned long physaddr) unsigned long __head __startup_64(unsigned long physaddr, struct boot_params *bp) { - unsigned long vaddr, vaddr_end; unsigned long load_delta, *p; unsigned long pgtable_flags; pgdval_t *pgd; @@ -276,29 +305,7 @@ unsigned long __head __startup_64(unsigned long physaddr, */ *fixup_long(&phys_base, physaddr) += load_delta - sme_get_me_mask(); - /* Encrypt the kernel and related (if SME is active) */ - sme_encrypt_kernel(bp); - - /* - * Clear the memory encryption mask from the .bss..decrypted section. - * The bss section will be memset to zero later in the initialization so - * there is no need to zero it after changing the memory encryption - * attribute. - */ - if (mem_encrypt_active()) { - vaddr = (unsigned long)__start_bss_decrypted; - vaddr_end = (unsigned long)__end_bss_decrypted; - for (; vaddr < vaddr_end; vaddr += PMD_SIZE) { - i = pmd_index(vaddr); - pmd[i] -= sme_get_me_mask(); - } - } - - /* - * Return the SME encryption mask (if SME is active) to be used as a - * modifier for the initial pgdir entry programmed into CR3. - */ - return sme_get_me_mask(); + return sme_postprocess_startup(bp, pmd); } unsigned long __startup_secondary_64(void) @@ -480,6 +487,10 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) clear_bss(); + /* + * This needs to happen *before* kasan_early_init() because latter maps stuff + * into that page. + */ clear_page(init_top_pgt); /* @@ -491,6 +502,16 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) kasan_early_init(); + /* + * Flush global TLB entries which could be left over from the trampoline page + * table. + * + * This needs to happen *after* kasan_early_init() as KASAN-enabled .configs + * instrument native_write_cr4() so KASAN must be initialized for that + * instrumentation to work. + */ + __native_tlb_flush_global(this_cpu_read(cpu_tlbstate.cr4)); + idt_setup_early_handler(); copy_bootdata(__va(real_mode_data)); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index d8c64dab0efe..eb8656bac99b 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -340,7 +340,7 @@ SYM_FUNC_END(startup_32_smp) __INIT setup_once: andl $0,setup_once_ref /* Once is enough, thanks */ - ret + RET SYM_FUNC_START(early_idt_handler_array) # 36(%esp) %eflags diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index d8b3ebd2bb85..9c63fc5988cd 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -166,9 +166,26 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) call sev_verify_cbit popq %rsi - /* Switch to new page-table */ + /* + * Switch to new page-table + * + * For the boot CPU this switches to early_top_pgt which still has the + * indentity mappings present. The secondary CPUs will switch to the + * init_top_pgt here, away from the trampoline_pgd and unmap the + * indentity mapped ranges. + */ movq %rax, %cr3 + /* + * Do a global TLB flush after the CR3 switch to make sure the TLB + * entries from the identity mapping are flushed. + */ + movq %cr4, %rcx + movq %rcx, %rax + xorq $X86_CR4_PGE, %rcx + movq %rcx, %cr4 + movq %rax, %cr4 + /* Ensure I am executing from virtual addresses */ movq $1f, %rax ANNOTATE_RETPOLINE_SAFE diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 42fc41dd0e1f..882213df3713 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -10,6 +10,7 @@ #include <asm/irq_remapping.h> #include <asm/hpet.h> #include <asm/time.h> +#include <asm/mwait.h> #undef pr_fmt #define pr_fmt(fmt) "hpet: " fmt @@ -916,6 +917,83 @@ static bool __init hpet_counting(void) return false; } +static bool __init mwait_pc10_supported(void) +{ + unsigned int eax, ebx, ecx, mwait_substates; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return false; + + if (!cpu_feature_enabled(X86_FEATURE_MWAIT)) + return false; + + if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) + return false; + + cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates); + + return (ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) && + (ecx & CPUID5_ECX_INTERRUPT_BREAK) && + (mwait_substates & (0xF << 28)); +} + +/* + * Check whether the system supports PC10. If so force disable HPET as that + * stops counting in PC10. This check is overbroad as it does not take any + * of the following into account: + * + * - ACPI tables + * - Enablement of intel_idle + * - Command line arguments which limit intel_idle C-state support + * + * That's perfectly fine. HPET is a piece of hardware designed by committee + * and the only reasons why it is still in use on modern systems is the + * fact that it is impossible to reliably query TSC and CPU frequency via + * CPUID or firmware. + * + * If HPET is functional it is useful for calibrating TSC, but this can be + * done via PMTIMER as well which seems to be the last remaining timer on + * X86/INTEL platforms that has not been completely wreckaged by feature + * creep. + * + * In theory HPET support should be removed altogether, but there are older + * systems out there which depend on it because TSC and APIC timer are + * dysfunctional in deeper C-states. + * + * It's only 20 years now that hardware people have been asked to provide + * reliable and discoverable facilities which can be used for timekeeping + * and per CPU timer interrupts. + * + * The probability that this problem is going to be solved in the + * forseeable future is close to zero, so the kernel has to be cluttered + * with heuristics to keep up with the ever growing amount of hardware and + * firmware trainwrecks. Hopefully some day hardware people will understand + * that the approach of "This can be fixed in software" is not sustainable. + * Hope dies last... + */ +static bool __init hpet_is_pc10_damaged(void) +{ + unsigned long long pcfg; + + /* Check whether PC10 substates are supported */ + if (!mwait_pc10_supported()) + return false; + + /* Check whether PC10 is enabled in PKG C-state limit */ + rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, pcfg); + if ((pcfg & 0xF) < 8) + return false; + + if (hpet_force_user) { + pr_warn("HPET force enabled via command line, but dysfunctional in PC10.\n"); + return false; + } + + pr_info("HPET dysfunctional in PC10. Force disabled.\n"); + boot_hpet_disable = true; + return true; +} + /** * hpet_enable - Try to setup the HPET timer. Returns 1 on success. */ @@ -929,6 +1007,9 @@ int __init hpet_enable(void) if (!is_hpet_capable()) return 0; + if (hpet_is_pc10_damaged()) + return 0; + hpet_set_mapping(); if (!hpet_virt_address) return 0; diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index e28f6a5d14f1..766ffe3ba313 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -291,8 +291,10 @@ void kvm_set_posted_intr_wakeup_handler(void (*handler)(void)) { if (handler) kvm_posted_intr_wakeup_handler = handler; - else + else { kvm_posted_intr_wakeup_handler = dummy_handler; + synchronize_rcu(); + } } EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler); diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 044902d5a3c4..e5dd6da78713 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -132,6 +132,7 @@ int irq_init_percpu_irqstack(unsigned int cpu) return 0; } +#ifndef CONFIG_PREEMPT_RT void do_softirq_own_stack(void) { struct irq_stack *irqstk; @@ -148,6 +149,7 @@ void do_softirq_own_stack(void) call_on_stack(__do_softirq, isp); } +#endif void __handle_irq(struct irq_desc *desc, struct pt_regs *regs) { diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S index 8ef35063964b..aaf9e776f323 100644 --- a/arch/x86/kernel/irqflags.S +++ b/arch/x86/kernel/irqflags.S @@ -7,9 +7,11 @@ /* * unsigned long native_save_fl(void) */ +.pushsection .noinstr.text, "ax" SYM_FUNC_START(native_save_fl) pushf pop %_ASM_AX - ret + RET SYM_FUNC_END(native_save_fl) +.popsection EXPORT_SYMBOL(native_save_fl) diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index 1afbdd1dd777..9ff480e94511 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c @@ -198,7 +198,7 @@ void sched_set_itmt_core_prio(int prio, int core_cpu) * of the priority chain and only used when * all other high priority cpus are out of capacity. */ - smt_prio = prio * smp_num_siblings / i; + smt_prio = prio * smp_num_siblings / (i * i); per_cpu(sched_core_priority, cpu) = smt_prio; i++; } diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index b6e046e4b289..6290712cb36d 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -809,7 +809,7 @@ void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) ri->fp = sara; /* Replace the return addr with trampoline addr */ - *sara = (unsigned long) &kretprobe_trampoline; + *sara = (unsigned long) &__kretprobe_trampoline; } NOKPROBE_SYMBOL(arch_prepare_kretprobe); @@ -1019,52 +1019,91 @@ NOKPROBE_SYMBOL(kprobe_int3_handler); */ asm( ".text\n" - ".global kretprobe_trampoline\n" - ".type kretprobe_trampoline, @function\n" - "kretprobe_trampoline:\n" - /* We don't bother saving the ss register */ + ".global __kretprobe_trampoline\n" + ".type __kretprobe_trampoline, @function\n" + "__kretprobe_trampoline:\n" #ifdef CONFIG_X86_64 + /* Push a fake return address to tell the unwinder it's a kretprobe. */ + " pushq $__kretprobe_trampoline\n" + UNWIND_HINT_FUNC + /* Save the 'sp - 8', this will be fixed later. */ " pushq %rsp\n" " pushfq\n" SAVE_REGS_STRING " movq %rsp, %rdi\n" " call trampoline_handler\n" - /* Replace saved sp with true return address. */ - " movq %rax, 19*8(%rsp)\n" RESTORE_REGS_STRING + /* In trampoline_handler(), 'regs->flags' is copied to 'regs->sp'. */ + " addq $8, %rsp\n" " popfq\n" #else + /* Push a fake return address to tell the unwinder it's a kretprobe. */ + " pushl $__kretprobe_trampoline\n" + UNWIND_HINT_FUNC + /* Save the 'sp - 4', this will be fixed later. */ " pushl %esp\n" " pushfl\n" SAVE_REGS_STRING " movl %esp, %eax\n" " call trampoline_handler\n" - /* Replace saved sp with true return address. */ - " movl %eax, 15*4(%esp)\n" RESTORE_REGS_STRING + /* In trampoline_handler(), 'regs->flags' is copied to 'regs->sp'. */ + " addl $4, %esp\n" " popfl\n" #endif - " ret\n" - ".size kretprobe_trampoline, .-kretprobe_trampoline\n" + ASM_RET + ".size __kretprobe_trampoline, .-__kretprobe_trampoline\n" ); -NOKPROBE_SYMBOL(kretprobe_trampoline); -STACK_FRAME_NON_STANDARD(kretprobe_trampoline); +NOKPROBE_SYMBOL(__kretprobe_trampoline); +/* + * __kretprobe_trampoline() skips updating frame pointer. The frame pointer + * saved in trampoline_handler() points to the real caller function's + * frame pointer. Thus the __kretprobe_trampoline() doesn't have a + * standard stack frame with CONFIG_FRAME_POINTER=y. + * Let's mark it non-standard function. Anyway, FP unwinder can correctly + * unwind without the hint. + */ +STACK_FRAME_NON_STANDARD_FP(__kretprobe_trampoline); + +/* This is called from kretprobe_trampoline_handler(). */ +void arch_kretprobe_fixup_return(struct pt_regs *regs, + kprobe_opcode_t *correct_ret_addr) +{ + unsigned long *frame_pointer = ®s->sp + 1; + /* Replace fake return address with real one. */ + *frame_pointer = (unsigned long)correct_ret_addr; +} /* - * Called from kretprobe_trampoline + * Called from __kretprobe_trampoline */ -__used __visible void *trampoline_handler(struct pt_regs *regs) +__used __visible void trampoline_handler(struct pt_regs *regs) { + unsigned long *frame_pointer; + /* fixup registers */ regs->cs = __KERNEL_CS; #ifdef CONFIG_X86_32 regs->gs = 0; #endif - regs->ip = (unsigned long)&kretprobe_trampoline; + regs->ip = (unsigned long)&__kretprobe_trampoline; regs->orig_ax = ~0UL; + regs->sp += sizeof(long); + frame_pointer = ®s->sp + 1; - return (void *)kretprobe_trampoline_handler(regs, &kretprobe_trampoline, ®s->sp); + /* + * The return address at 'frame_pointer' is recovered by the + * arch_kretprobe_fixup_return() which called from the + * kretprobe_trampoline_handler(). + */ + kretprobe_trampoline_handler(regs, frame_pointer); + + /* + * Copy FLAGS to 'pt_regs::sp' so that __kretprobe_trapmoline() + * can do RET right after POPF. + */ + regs->sp = regs->flags; } NOKPROBE_SYMBOL(trampoline_handler); diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 596de2f6d3a5..dd2ec14adb77 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -25,7 +25,6 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, if (bit < 0) return; - preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)ip); if (unlikely(!p) || kprobe_disabled(p)) goto out; @@ -59,7 +58,6 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, __this_cpu_write(current_kprobe, NULL); } out: - preempt_enable_notrace(); ftrace_test_recursion_unlock(bit); } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 71425ebba98a..b4a54a52aa59 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -367,10 +367,10 @@ int arch_check_optimized_kprobe(struct optimized_kprobe *op) /* Check the addr is within the optimized instructions. */ int arch_within_optimized_kprobe(struct optimized_kprobe *op, - unsigned long addr) + kprobe_opcode_t *addr) { - return ((unsigned long)op->kp.addr <= addr && - (unsigned long)op->kp.addr + op->optinsn.size > addr); + return (op->kp.addr <= addr && + op->kp.addr + op->optinsn.size > addr); } /* Free optimized instruction slot */ diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index b656456c3a94..a438217cbfac 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -27,6 +27,8 @@ #include <linux/nmi.h> #include <linux/swait.h> #include <linux/syscore_ops.h> +#include <linux/cc_platform.h> +#include <linux/efi.h> #include <asm/timer.h> #include <asm/cpu.h> #include <asm/traps.h> @@ -40,6 +42,7 @@ #include <asm/ptrace.h> #include <asm/reboot.h> #include <asm/svm.h> +#include <asm/e820/api.h> DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled); @@ -310,7 +313,7 @@ static void kvm_register_steal_time(void) return; wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); - pr_info("stealtime: cpu %d, msr %llx\n", cpu, + pr_debug("stealtime: cpu %d, msr %llx\n", cpu, (unsigned long long) slow_virt_to_phys(st)); } @@ -347,7 +350,7 @@ static void kvm_guest_cpu_init(void) wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); __this_cpu_write(apf_reason.enabled, 1); - pr_info("setup async PF for cpu %d\n", smp_processor_id()); + pr_debug("setup async PF for cpu %d\n", smp_processor_id()); } if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { @@ -373,7 +376,7 @@ static void kvm_pv_disable_apf(void) wrmsrl(MSR_KVM_ASYNC_PF_EN, 0); __this_cpu_write(apf_reason.enabled, 0); - pr_info("disable async PF for cpu %d\n", smp_processor_id()); + pr_debug("disable async PF for cpu %d\n", smp_processor_id()); } static void kvm_disable_steal_time(void) @@ -418,7 +421,7 @@ static void __init sev_map_percpu_data(void) { int cpu; - if (!sev_active()) + if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) return; for_each_possible_cpu(cpu) { @@ -433,6 +436,8 @@ static void kvm_guest_cpu_offline(bool shutdown) kvm_disable_steal_time(); if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) wrmsrl(MSR_KVM_PV_EOI_EN, 0); + if (kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) + wrmsrl(MSR_KVM_MIGRATION_CONTROL, 0); kvm_pv_disable_apf(); if (!shutdown) apf_task_wake_all(); @@ -547,6 +552,55 @@ static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector) __send_ipi_mask(local_mask, vector); } +static int __init setup_efi_kvm_sev_migration(void) +{ + efi_char16_t efi_sev_live_migration_enabled[] = L"SevLiveMigrationEnabled"; + efi_guid_t efi_variable_guid = AMD_SEV_MEM_ENCRYPT_GUID; + efi_status_t status; + unsigned long size; + bool enabled; + + if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) || + !kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) + return 0; + + if (!efi_enabled(EFI_BOOT)) + return 0; + + if (!efi_enabled(EFI_RUNTIME_SERVICES)) { + pr_info("%s : EFI runtime services are not enabled\n", __func__); + return 0; + } + + size = sizeof(enabled); + + /* Get variable contents into buffer */ + status = efi.get_variable(efi_sev_live_migration_enabled, + &efi_variable_guid, NULL, &size, &enabled); + + if (status == EFI_NOT_FOUND) { + pr_info("%s : EFI live migration variable not found\n", __func__); + return 0; + } + + if (status != EFI_SUCCESS) { + pr_info("%s : EFI variable retrieval failed\n", __func__); + return 0; + } + + if (enabled == 0) { + pr_info("%s: live migration disabled in EFI\n", __func__); + return 0; + } + + pr_info("%s : live migration enabled in EFI\n", __func__); + wrmsrl(MSR_KVM_MIGRATION_CONTROL, KVM_MIGRATION_READY); + + return 1; +} + +late_initcall(setup_efi_kvm_sev_migration); + /* * Set the IPI entry points */ @@ -755,7 +809,7 @@ static noinline uint32_t __kvm_cpuid_base(void) return 0; /* So we don't blow up on old processors */ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) - return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0); + return hypervisor_cpuid_base(KVM_SIGNATURE, 0); return 0; } @@ -805,8 +859,62 @@ static bool __init kvm_msi_ext_dest_id(void) return kvm_para_has_feature(KVM_FEATURE_MSI_EXT_DEST_ID); } +static void kvm_sev_hc_page_enc_status(unsigned long pfn, int npages, bool enc) +{ + kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, pfn << PAGE_SHIFT, npages, + KVM_MAP_GPA_RANGE_ENC_STAT(enc) | KVM_MAP_GPA_RANGE_PAGE_SZ_4K); +} + static void __init kvm_init_platform(void) { + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) && + kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) { + unsigned long nr_pages; + int i; + + pv_ops.mmu.notify_page_enc_status_changed = + kvm_sev_hc_page_enc_status; + + /* + * Reset the host's shared pages list related to kernel + * specific page encryption status settings before we load a + * new kernel by kexec. Reset the page encryption status + * during early boot intead of just before kexec to avoid SMP + * races during kvm_pv_guest_cpu_reboot(). + * NOTE: We cannot reset the complete shared pages list + * here as we need to retain the UEFI/OVMF firmware + * specific settings. + */ + + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; + + if (entry->type != E820_TYPE_RAM) + continue; + + nr_pages = DIV_ROUND_UP(entry->size, PAGE_SIZE); + + kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, entry->addr, + nr_pages, + KVM_MAP_GPA_RANGE_ENCRYPTED | KVM_MAP_GPA_RANGE_PAGE_SZ_4K); + } + + /* + * Ensure that _bss_decrypted section is marked as decrypted in the + * shared pages list. + */ + nr_pages = DIV_ROUND_UP(__end_bss_decrypted - __start_bss_decrypted, + PAGE_SIZE); + early_set_mem_enc_dec_hypercall((unsigned long)__start_bss_decrypted, + nr_pages, 0); + + /* + * If not booted using EFI, enable Live migration support. + */ + if (!efi_enabled(EFI_BOOT)) + wrmsrl(MSR_KVM_MIGRATION_CONTROL, + KVM_MIGRATION_READY); + } kvmclock_init(); x86_platform.apic_post_init = kvm_apic_init; } diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 73c74b961d0f..a35cbf9107af 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -16,9 +16,9 @@ #include <linux/mm.h> #include <linux/slab.h> #include <linux/set_memory.h> +#include <linux/cc_platform.h> #include <asm/hypervisor.h> -#include <asm/mem_encrypt.h> #include <asm/x86_init.h> #include <asm/kvmclock.h> @@ -174,7 +174,7 @@ static void kvm_register_clock(char *txt) pa = slow_virt_to_phys(&src->pvti) | 0x01ULL; wrmsrl(msr_kvm_system_time, pa); - pr_info("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt); + pr_debug("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt); } static void kvm_save_sched_clock_state(void) @@ -223,7 +223,7 @@ static void __init kvmclock_init_mem(void) * hvclock is shared between the guest and the hypervisor, must * be mapped decrypted. */ - if (sev_active()) { + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { r = set_memory_decrypted((unsigned long) hvclock_mem, 1UL << order); if (r) { diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 131f30fdcfbd..f5da4a18070a 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -17,6 +17,7 @@ #include <linux/suspend.h> #include <linux/vmalloc.h> #include <linux/efi.h> +#include <linux/cc_platform.h> #include <asm/init.h> #include <asm/tlbflush.h> @@ -166,7 +167,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) } pte = pte_offset_kernel(pmd, vaddr); - if (sev_active()) + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) prot = PAGE_KERNEL_EXEC; set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot)); @@ -206,7 +207,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) level4p = (pgd_t *)__va(start_pgtable); clear_page(level4p); - if (sev_active()) { + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { info.page_flag |= _PAGE_ENC; info.kernpg_flag |= _PAGE_ENC; } @@ -358,7 +359,7 @@ void machine_kexec(struct kimage *image) (unsigned long)page_list, image->start, image->preserve_context, - sme_active()); + cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)); #ifdef CONFIG_KEXEC_JUMP if (image->preserve_context) @@ -569,12 +570,12 @@ void arch_kexec_unprotect_crashkres(void) */ int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) { - if (sev_active()) + if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) return 0; /* - * If SME is active we need to be sure that kexec pages are - * not encrypted because when we boot to the new kernel the + * If host memory encryption is active we need to be sure that kexec + * pages are not encrypted because when we boot to the new kernel the * pages won't be accessed encrypted (initially). */ return set_memory_decrypted((unsigned long)vaddr, pages); @@ -582,12 +583,12 @@ int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) { - if (sev_active()) + if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) return; /* - * If SME is active we need to reset the pages back to being - * an encrypted mapping before freeing them. + * If host memory encryption is active we need to reset the pages back + * to being an encrypted mapping before freeing them. */ set_memory_encrypted((unsigned long)vaddr, pages); } diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 5e9a34b5bd74..95fa745e310a 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -67,6 +67,7 @@ static unsigned long int get_module_load_offset(void) void *module_alloc(unsigned long size) { + gfp_t gfp_mask = GFP_KERNEL; void *p; if (PAGE_ALIGN(size) > MODULES_LEN) @@ -74,10 +75,10 @@ void *module_alloc(unsigned long size) p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR + get_module_load_offset(), - MODULES_END, GFP_KERNEL, - PAGE_KERNEL, 0, NUMA_NO_NODE, + MODULES_END, gfp_mask, + PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, __builtin_return_address(0)); - if (p && (kasan_module_alloc(p, size) < 0)) { + if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) { vfree(p); return NULL; } @@ -251,7 +252,8 @@ int module_finalize(const Elf_Ehdr *hdr, struct module *me) { const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, - *para = NULL, *orc = NULL, *orc_ip = NULL; + *para = NULL, *orc = NULL, *orc_ip = NULL, + *retpolines = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { @@ -267,8 +269,14 @@ int module_finalize(const Elf_Ehdr *hdr, orc = s; if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name)) orc_ip = s; + if (!strcmp(".retpoline_sites", secstrings + s->sh_name)) + retpolines = s; } + if (retpolines) { + void *rseg = (void *)retpolines->sh_addr; + apply_retpolines(rseg, rseg + retpolines->sh_size); + } if (alt) { /* patch .altinstructions */ void *aseg = (void *)alt->sh_addr; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 04cafc057bed..4420499f7bb4 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -41,11 +41,22 @@ extern void _paravirt_nop(void); asm (".pushsection .entry.text, \"ax\"\n" ".global _paravirt_nop\n" "_paravirt_nop:\n\t" - "ret\n\t" + ASM_RET ".size _paravirt_nop, . - _paravirt_nop\n\t" ".type _paravirt_nop, @function\n\t" ".popsection"); +/* stub always returning 0. */ +asm (".pushsection .entry.text, \"ax\"\n" + ".global paravirt_ret0\n" + "paravirt_ret0:\n\t" + "xor %" _ASM_AX ", %" _ASM_AX ";\n\t" + ASM_RET + ".size paravirt_ret0, . - paravirt_ret0\n\t" + ".type paravirt_ret0, @function\n\t" + ".popsection"); + + void __init default_banner(void) { printk(KERN_INFO "Booting paravirtualized kernel on %s\n", @@ -53,7 +64,7 @@ void __init default_banner(void) } /* Undefined instruction for dealing with missing ops pointers. */ -static void paravirt_BUG(void) +noinstr void paravirt_BUG(void) { BUG(); } @@ -218,6 +229,36 @@ void paravirt_end_context_switch(struct task_struct *next) if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES)) arch_enter_lazy_mmu_mode(); } + +static noinstr unsigned long pv_native_read_cr2(void) +{ + return native_read_cr2(); +} + +static noinstr void pv_native_write_cr2(unsigned long val) +{ + native_write_cr2(val); +} + +static noinstr unsigned long pv_native_get_debugreg(int regno) +{ + return native_get_debugreg(regno); +} + +static noinstr void pv_native_set_debugreg(int regno, unsigned long val) +{ + native_set_debugreg(regno, val); +} + +static noinstr void pv_native_irq_enable(void) +{ + native_irq_enable(); +} + +static noinstr void pv_native_irq_disable(void) +{ + native_irq_disable(); +} #endif enum paravirt_lazy_mode paravirt_get_lazy_mode(void) @@ -244,8 +285,8 @@ struct paravirt_patch_template pv_ops = { #ifdef CONFIG_PARAVIRT_XXL .cpu.cpuid = native_cpuid, - .cpu.get_debugreg = native_get_debugreg, - .cpu.set_debugreg = native_set_debugreg, + .cpu.get_debugreg = pv_native_get_debugreg, + .cpu.set_debugreg = pv_native_set_debugreg, .cpu.read_cr0 = native_read_cr0, .cpu.write_cr0 = native_write_cr0, .cpu.write_cr4 = native_write_cr4, @@ -281,8 +322,8 @@ struct paravirt_patch_template pv_ops = { /* Irq ops. */ .irq.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), - .irq.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), - .irq.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable), + .irq.irq_disable = __PV_IS_CALLEE_SAVE(pv_native_irq_disable), + .irq.irq_enable = __PV_IS_CALLEE_SAVE(pv_native_irq_enable), .irq.safe_halt = native_safe_halt, .irq.halt = native_halt, #endif /* CONFIG_PARAVIRT_XXL */ @@ -296,10 +337,11 @@ struct paravirt_patch_template pv_ops = { (void (*)(struct mmu_gather *, void *))tlb_remove_page, .mmu.exit_mmap = paravirt_nop, + .mmu.notify_page_enc_status_changed = paravirt_nop, #ifdef CONFIG_PARAVIRT_XXL - .mmu.read_cr2 = __PV_IS_CALLEE_SAVE(native_read_cr2), - .mmu.write_cr2 = native_write_cr2, + .mmu.read_cr2 = __PV_IS_CALLEE_SAVE(pv_native_read_cr2), + .mmu.write_cr2 = pv_native_write_cr2, .mmu.read_cr3 = __native_read_cr3, .mmu.write_cr3 = native_write_cr3, @@ -371,9 +413,6 @@ struct paravirt_patch_template pv_ops = { }; #ifdef CONFIG_PARAVIRT_XXL -/* At this point, native_get/set_debugreg has real function entries */ -NOKPROBE_SYMBOL(native_get_debugreg); -NOKPROBE_SYMBOL(native_set_debugreg); NOKPROBE_SYMBOL(native_load_idt); void (*paravirt_iret)(void) = native_iret; diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index c2cfa5e7c152..814ab46a0dad 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -6,7 +6,7 @@ #include <linux/swiotlb.h> #include <linux/memblock.h> #include <linux/dma-direct.h> -#include <linux/mem_encrypt.h> +#include <linux/cc_platform.h> #include <asm/iommu.h> #include <asm/swiotlb.h> @@ -45,11 +45,10 @@ int __init pci_swiotlb_detect_4gb(void) swiotlb = 1; /* - * If SME is active then swiotlb will be set to 1 so that bounce - * buffers are allocated and used for devices that do not support - * the addressing range required for the encryption mask. + * Set swiotlb to 1 so that bounce buffers are allocated and used for + * devices that can't support DMA to encrypted memory. */ - if (sme_active()) + if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) swiotlb = 1; return swiotlb; diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c index 9e1def3744f2..36e84d904260 100644 --- a/arch/x86/kernel/probe_roms.c +++ b/arch/x86/kernel/probe_roms.c @@ -80,7 +80,7 @@ static struct resource video_rom_resource = { */ static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short device) { - struct pci_driver *drv = pdev->driver; + struct pci_driver *drv = to_pci_driver(pdev->dev.driver); const struct pci_device_id *id; if (pdev->vendor == vendor && pdev->device == device) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 1d9463e3096b..81d8ef036637 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -30,7 +30,9 @@ #include <asm/apic.h> #include <linux/uaccess.h> #include <asm/mwait.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> +#include <asm/fpu/sched.h> +#include <asm/fpu/xstate.h> #include <asm/debugreg.h> #include <asm/nmi.h> #include <asm/tlbflush.h> @@ -43,6 +45,7 @@ #include <asm/io_bitmap.h> #include <asm/proto.h> #include <asm/frame.h> +#include <asm/unwind.h> #include "process.h" @@ -87,9 +90,20 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) #ifdef CONFIG_VM86 dst->thread.vm86 = NULL; #endif - return fpu_clone(dst); + /* Drop the copied pointer to current's fpstate */ + dst->thread.fpu.fpstate = NULL; + + return 0; } +#ifdef CONFIG_X86_64 +void arch_release_task_struct(struct task_struct *tsk) +{ + if (fpu_state_size_dynamic()) + fpstate_free(&tsk->thread.fpu); +} +#endif + /* * Free thread data structures etc.. */ @@ -132,6 +146,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, frame->ret_addr = (unsigned long) ret_from_fork; p->thread.sp = (unsigned long) fork_frame; p->thread.io_bitmap = NULL; + p->thread.iopl_warn = 0; memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); #ifdef CONFIG_X86_64 @@ -154,6 +169,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, frame->flags = X86_EFLAGS_FIXED; #endif + fpu_clone(p, clone_flags); + /* Kernel thread ? */ if (unlikely(p->flags & PF_KTHREAD)) { p->thread.pkru = pkru_get_init_value(); @@ -348,7 +365,7 @@ void arch_setup_new_exec(void) clear_thread_flag(TIF_SSBD); task_clear_spec_ssb_disable(current); task_clear_spec_ssb_noexec(current); - speculation_ctrl_update(task_thread_info(current)->flags); + speculation_ctrl_update(read_thread_flags()); } } @@ -600,7 +617,7 @@ static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk) clear_tsk_thread_flag(tsk, TIF_SPEC_IB); } /* Return the updated threadinfo flags*/ - return task_thread_info(tsk)->flags; + return read_task_thread_flags(tsk); } void speculation_ctrl_update(unsigned long tif) @@ -636,8 +653,8 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) { unsigned long tifp, tifn; - tifn = READ_ONCE(task_thread_info(next_p)->flags); - tifp = READ_ONCE(task_thread_info(prev_p)->flags); + tifn = read_task_thread_flags(next_p); + tifp = read_task_thread_flags(prev_p); switch_to_bitmap(tifp); @@ -942,70 +959,43 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) * because the task might wake up and we might look at a stack * changing under us. */ -unsigned long get_wchan(struct task_struct *p) +unsigned long __get_wchan(struct task_struct *p) { - unsigned long start, bottom, top, sp, fp, ip, ret = 0; - int count = 0; - - if (p == current || task_is_running(p)) - return 0; + struct unwind_state state; + unsigned long addr = 0; if (!try_get_task_stack(p)) return 0; - start = (unsigned long)task_stack_page(p); - if (!start) - goto out; - - /* - * Layout of the stack page: - * - * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long) - * PADDING - * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING - * stack - * ----------- bottom = start - * - * The tasks stack pointer points at the location where the - * framepointer is stored. The data on the stack is: - * ... IP FP ... IP FP - * - * We need to read FP and IP, so we need to adjust the upper - * bound by another unsigned long. - */ - top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; - top -= 2 * sizeof(unsigned long); - bottom = start; - - sp = READ_ONCE(p->thread.sp); - if (sp < bottom || sp > top) - goto out; - - fp = READ_ONCE_NOCHECK(((struct inactive_task_frame *)sp)->bp); - do { - if (fp < bottom || fp > top) - goto out; - ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long))); - if (!in_sched_functions(ip)) { - ret = ip; - goto out; - } - fp = READ_ONCE_NOCHECK(*(unsigned long *)fp); - } while (count++ < 16 && !task_is_running(p)); + for (unwind_start(&state, p, NULL, NULL); !unwind_done(&state); + unwind_next_frame(&state)) { + addr = unwind_get_return_address(&state); + if (!addr) + break; + if (in_sched_functions(addr)) + continue; + break; + } -out: put_task_stack(p); - return ret; + + return addr; } long do_arch_prctl_common(struct task_struct *task, int option, - unsigned long cpuid_enabled) + unsigned long arg2) { switch (option) { case ARCH_GET_CPUID: return get_cpuid_mode(); case ARCH_SET_CPUID: - return set_cpuid_mode(task, cpuid_enabled); + return set_cpuid_mode(task, arg2); + case ARCH_GET_XCOMP_SUPP: + case ARCH_GET_XCOMP_PERM: + case ARCH_REQ_XCOMP_PERM: + case ARCH_GET_XCOMP_GUEST_PERM: + case ARCH_REQ_XCOMP_GUEST_PERM: + return fpu_xstate_prctl(task, option, arg2); } return -EINVAL; diff --git a/arch/x86/kernel/process.h b/arch/x86/kernel/process.h index 1d0797b2338a..76b547b83232 100644 --- a/arch/x86/kernel/process.h +++ b/arch/x86/kernel/process.h @@ -13,8 +13,8 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p); static inline void switch_to_extra(struct task_struct *prev, struct task_struct *next) { - unsigned long next_tif = task_thread_info(next)->flags; - unsigned long prev_tif = task_thread_info(prev)->flags; + unsigned long next_tif = read_task_thread_flags(next); + unsigned long prev_tif = read_task_thread_flags(prev); if (IS_ENABLED(CONFIG_SMP)) { /* diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 4f2f54e1281c..26edb1cd07a4 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -41,7 +41,7 @@ #include <asm/ldt.h> #include <asm/processor.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/sched.h> #include <asm/desc.h> #include <linux/err.h> @@ -160,7 +160,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; struct fpu *prev_fpu = &prev->fpu; - struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ @@ -213,7 +212,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(current_task, next_p); - switch_fpu_finish(next_fpu); + switch_fpu_finish(); /* Load the Intel cache allocation PQR MSR. */ resctrl_sched_in(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ec0d836a13b1..3402edec236c 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -42,7 +42,7 @@ #include <asm/processor.h> #include <asm/pkru.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/sched.h> #include <asm/mmu_context.h> #include <asm/prctl.h> #include <asm/desc.h> @@ -559,7 +559,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct thread_struct *prev = &prev_p->thread; struct thread_struct *next = &next_p->thread; struct fpu *prev_fpu = &prev->fpu; - struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && @@ -620,7 +619,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(current_task, next_p); this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); - switch_fpu_finish(next_fpu); + switch_fpu_finish(); /* Reload sp0. */ update_task_stack(next_p); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 4c208ea3bd9f..6d2244c94799 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -29,9 +29,9 @@ #include <linux/uaccess.h> #include <asm/processor.h> -#include <asm/fpu/internal.h> #include <asm/fpu/signal.h> #include <asm/fpu/regset.h> +#include <asm/fpu/xstate.h> #include <asm/debugreg.h> #include <asm/ldt.h> #include <asm/desc.h> diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 0a40df66a40d..fa700b46588e 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -113,17 +113,9 @@ void __noreturn machine_real_restart(unsigned int type) spin_unlock(&rtc_lock); /* - * Switch back to the initial page table. + * Switch to the trampoline page table. */ -#ifdef CONFIG_X86_32 - load_cr3(initial_page_table); -#else - write_cr3(real_mode_header->trampoline_pgd); - - /* Exiting long mode will fail if CR4.PCIDE is set. */ - if (boot_cpu_has(X86_FEATURE_PCID)) - cr4_clear_bits(X86_CR4_PCIDE); -#endif + load_trampoline_pgtable(); /* Jump to the identity-mapped low memory code */ #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index f469153eca8a..fcc8a7699103 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -91,7 +91,7 @@ SYM_CODE_START_NOALIGN(relocate_kernel) movl %edi, %eax addl $(identity_mapped - relocate_kernel), %eax pushl %eax - ret + RET SYM_CODE_END(relocate_kernel) SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) @@ -159,7 +159,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) xorl %edx, %edx xorl %esi, %esi xorl %ebp, %ebp - ret + RET 1: popl %edx movl CP_PA_SWAP_PAGE(%edi), %esp @@ -190,7 +190,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) movl %edi, %eax addl $(virtual_mapped - relocate_kernel), %eax pushl %eax - ret + RET SYM_CODE_END(identity_mapped) SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) @@ -208,7 +208,7 @@ SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) popl %edi popl %esi popl %ebx - ret + RET SYM_CODE_END(virtual_mapped) /* Do the copies */ @@ -271,7 +271,7 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) popl %edi popl %ebx popl %ebp - ret + RET SYM_CODE_END(swap_pages) .globl kexec_control_code_size diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index c53271aebb64..399f075ccdc4 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -47,7 +47,7 @@ SYM_CODE_START_NOALIGN(relocate_kernel) * %rsi page_list * %rdx start address * %rcx preserve_context - * %r8 sme_active + * %r8 host_mem_enc_active */ /* Save the CPU context, used for jumping back */ @@ -104,7 +104,7 @@ SYM_CODE_START_NOALIGN(relocate_kernel) /* jump to identity mapped page */ addq $(identity_mapped - relocate_kernel), %r8 pushq %r8 - ret + RET SYM_CODE_END(relocate_kernel) SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) @@ -191,7 +191,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) xorl %r14d, %r14d xorl %r15d, %r15d - ret + RET 1: popq %rdx @@ -210,7 +210,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) call swap_pages movq $virtual_mapped, %rax pushq %rax - ret + RET SYM_CODE_END(identity_mapped) SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) @@ -231,7 +231,7 @@ SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) popq %r12 popq %rbp popq %rbx - ret + RET SYM_CODE_END(virtual_mapped) /* Do the copies */ @@ -288,7 +288,7 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) lea PAGE_SIZE(%rax), %rsi jmp 0b 3: - ret + RET SYM_CODE_END(swap_pages) .globl kexec_control_code_size diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 40ed44ead063..f7a132eb794d 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -40,6 +40,7 @@ #include <asm/kasan.h> #include <asm/kaslr.h> #include <asm/mce.h> +#include <asm/memtype.h> #include <asm/mtrr.h> #include <asm/realmode.h> #include <asm/olpc_ofw.h> @@ -322,7 +323,7 @@ static void __init reserve_initrd(void) relocate_initrd(); - memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); + memblock_phys_free(ramdisk_image, ramdisk_end - ramdisk_image); } #else @@ -521,7 +522,7 @@ static void __init reserve_crashkernel(void) } if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) { - memblock_free(crash_base, crash_size); + memblock_phys_free(crash_base, crash_size); return; } @@ -713,9 +714,6 @@ static void __init early_reserve_memory(void) early_reserve_initrd(); - if (efi_enabled(EFI_BOOT)) - efi_memblock_x86_reserve_range(); - memblock_x86_reserve_range_setup_data(); reserve_ibft_region(); @@ -890,6 +888,9 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); + if (efi_enabled(EFI_BOOT)) + efi_memblock_x86_reserve_range(); + #ifdef CONFIG_MEMORY_HOTPLUG /* * Memory used by the kernel cannot be hot-removed because Linux @@ -967,7 +968,11 @@ void __init setup_arch(char **cmdline_p) max_pfn = e820__end_of_ram_pfn(); /* update e820 for memory not covered by WB MTRRs */ - mtrr_bp_init(); + if (IS_ENABLED(CONFIG_MTRR)) + mtrr_bp_init(); + else + pat_disable("PAT support disabled because CONFIG_MTRR is disabled in the kernel."); + if (mtrr_trim_uncached_memory(max_pfn)) max_pfn = e820__end_of_ram_pfn(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 5afd98559193..7b65275544b2 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -135,7 +135,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) static void __init pcpu_fc_free(void *ptr, size_t size) { - memblock_free_ptr(ptr, size); + memblock_free(ptr, size); } static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index 9f90f460a28c..ce987688bbc0 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -64,7 +64,7 @@ static bool sev_es_negotiate_protocol(void) static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) { ghcb->save.sw_exit_code = 0; - memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); + __builtin_memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); } static bool vc_decoding_needed(unsigned long exit_code) @@ -94,25 +94,15 @@ static void vc_finish_insn(struct es_em_ctxt *ctxt) ctxt->regs->ip += ctxt->insn.length; } -static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, - struct es_em_ctxt *ctxt, - u64 exit_code, u64 exit_info_1, - u64 exit_info_2) +static enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt) { - enum es_result ret; - - /* Fill in protocol and format specifiers */ - ghcb->protocol_version = GHCB_PROTOCOL_MAX; - ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; - - ghcb_set_sw_exit_code(ghcb, exit_code); - ghcb_set_sw_exit_info_1(ghcb, exit_info_1); - ghcb_set_sw_exit_info_2(ghcb, exit_info_2); + u32 ret; - sev_es_wr_ghcb_msr(__pa(ghcb)); - VMGEXIT(); + ret = ghcb->save.sw_exit_info_1 & GENMASK_ULL(31, 0); + if (!ret) + return ES_OK; - if ((ghcb->save.sw_exit_info_1 & 0xffffffff) == 1) { + if (ret == 1) { u64 info = ghcb->save.sw_exit_info_2; unsigned long v; @@ -124,17 +114,40 @@ static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, ((v == X86_TRAP_GP) || (v == X86_TRAP_UD)) && ((info & SVM_EVTINJ_TYPE_MASK) == SVM_EVTINJ_TYPE_EXEPT)) { ctxt->fi.vector = v; + if (info & SVM_EVTINJ_VALID_ERR) ctxt->fi.error_code = info >> 32; - ret = ES_EXCEPTION; - } else { - ret = ES_VMM_ERROR; + + return ES_EXCEPTION; } - } else { - ret = ES_OK; } - return ret; + return ES_VMM_ERROR; +} + +enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, bool set_ghcb_msr, + struct es_em_ctxt *ctxt, u64 exit_code, + u64 exit_info_1, u64 exit_info_2) +{ + /* Fill in protocol and format specifiers */ + ghcb->protocol_version = GHCB_PROTOCOL_MAX; + ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; + + ghcb_set_sw_exit_code(ghcb, exit_code); + ghcb_set_sw_exit_info_1(ghcb, exit_info_1); + ghcb_set_sw_exit_info_2(ghcb, exit_info_2); + + /* + * Hyper-V unenlightened guests use a paravisor for communicating and + * GHCB pages are being allocated and set up by that paravisor. Linux + * should not change the GHCB page's physical address. + */ + if (set_ghcb_msr) + sev_es_wr_ghcb_msr(__pa(ghcb)); + + VMGEXIT(); + + return verify_exception_info(ghcb, ctxt); } /* @@ -208,7 +221,7 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) fail: /* Terminate the guest */ - sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); + sev_es_terminate(GHCB_SEV_ES_GEN_REQ); } static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt, @@ -411,7 +424,7 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) */ sw_scratch = __pa(ghcb) + offsetof(struct ghcb, shared_buffer); ghcb_set_sw_scratch(ghcb, sw_scratch); - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, + ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_IOIO, exit_info_1, exit_info_2); if (ret != ES_OK) return ret; @@ -453,7 +466,8 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) ghcb_set_rax(ghcb, rax); - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, exit_info_1, 0); + ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, + SVM_EXIT_IOIO, exit_info_1, 0); if (ret != ES_OK) return ret; @@ -484,7 +498,7 @@ static enum es_result vc_handle_cpuid(struct ghcb *ghcb, /* xgetbv will cause #GP - use reset value for xcr0 */ ghcb_set_xcr0(ghcb, 1); - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); + ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_CPUID, 0, 0); if (ret != ES_OK) return ret; @@ -509,7 +523,7 @@ static enum es_result vc_handle_rdtsc(struct ghcb *ghcb, bool rdtscp = (exit_code == SVM_EXIT_RDTSCP); enum es_result ret; - ret = sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, 0, 0); + ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, exit_code, 0, 0); if (ret != ES_OK) return ret; diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index a6895e440bc3..e6d316a01fdd 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -11,7 +11,7 @@ #include <linux/sched/debug.h> /* For show_regs() */ #include <linux/percpu-defs.h> -#include <linux/mem_encrypt.h> +#include <linux/cc_platform.h> #include <linux/printk.h> #include <linux/mm_types.h> #include <linux/set_memory.h> @@ -23,9 +23,10 @@ #include <asm/stacktrace.h> #include <asm/sev.h> #include <asm/insn-eval.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/xcr.h> #include <asm/processor.h> #include <asm/realmode.h> +#include <asm/setup.h> #include <asm/traps.h> #include <asm/svm.h> #include <asm/smp.h> @@ -46,16 +47,6 @@ static struct ghcb __initdata *boot_ghcb; struct sev_es_runtime_data { struct ghcb ghcb_page; - /* Physical storage for the per-CPU IST stack of the #VC handler */ - char ist_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE); - - /* - * Physical storage for the per-CPU fall-back stack of the #VC handler. - * The fall-back stack is used when it is not safe to switch back to the - * interrupted stack in the #VC entry code. - */ - char fallback_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE); - /* * Reserve one page per CPU as backup storage for the unencrypted GHCB. * It is needed when an NMI happens while the #VC handler uses the real @@ -96,30 +87,6 @@ struct ghcb_state { static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data); DEFINE_STATIC_KEY_FALSE(sev_es_enable_key); -/* Needed in vc_early_forward_exception */ -void do_early_exception(struct pt_regs *regs, int trapnr); - -static void __init setup_vc_stacks(int cpu) -{ - struct sev_es_runtime_data *data; - struct cpu_entry_area *cea; - unsigned long vaddr; - phys_addr_t pa; - - data = per_cpu(runtime_data, cpu); - cea = get_cpu_entry_area(cpu); - - /* Map #VC IST stack */ - vaddr = CEA_ESTACK_BOT(&cea->estacks, VC); - pa = __pa(data->ist_stack); - cea_set_pte((void *)vaddr, pa, PAGE_KERNEL); - - /* Map VC fall-back stack */ - vaddr = CEA_ESTACK_BOT(&cea->estacks, VC2); - pa = __pa(data->fallback_stack); - cea_set_pte((void *)vaddr, pa, PAGE_KERNEL); -} - static __always_inline bool on_vc_stack(struct pt_regs *regs) { unsigned long sp = regs->sp; @@ -240,9 +207,6 @@ static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state) return ghcb; } -/* Needed in vc_early_forward_exception */ -void do_early_exception(struct pt_regs *regs, int trapnr); - static inline u64 sev_es_rd_ghcb_msr(void) { return __rdmsr(MSR_AMD64_SEV_ES_GHCB); @@ -325,11 +289,6 @@ static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, char *dst, char *buf, size_t size) { unsigned long error_code = X86_PF_PROT | X86_PF_WRITE; - char __user *target = (char __user *)dst; - u64 d8; - u32 d4; - u16 d2; - u8 d1; /* * This function uses __put_user() independent of whether kernel or user @@ -351,26 +310,42 @@ static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, * instructions here would cause infinite nesting. */ switch (size) { - case 1: + case 1: { + u8 d1; + u8 __user *target = (u8 __user *)dst; + memcpy(&d1, buf, 1); if (__put_user(d1, target)) goto fault; break; - case 2: + } + case 2: { + u16 d2; + u16 __user *target = (u16 __user *)dst; + memcpy(&d2, buf, 2); if (__put_user(d2, target)) goto fault; break; - case 4: + } + case 4: { + u32 d4; + u32 __user *target = (u32 __user *)dst; + memcpy(&d4, buf, 4); if (__put_user(d4, target)) goto fault; break; - case 8: + } + case 8: { + u64 d8; + u64 __user *target = (u64 __user *)dst; + memcpy(&d8, buf, 8); if (__put_user(d8, target)) goto fault; break; + } default: WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); return ES_UNSUPPORTED; @@ -393,11 +368,6 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, char *src, char *buf, size_t size) { unsigned long error_code = X86_PF_PROT; - char __user *s = (char __user *)src; - u64 d8; - u32 d4; - u16 d2; - u8 d1; /* * This function uses __get_user() independent of whether kernel or user @@ -419,26 +389,41 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, * instructions here would cause infinite nesting. */ switch (size) { - case 1: + case 1: { + u8 d1; + u8 __user *s = (u8 __user *)src; + if (__get_user(d1, s)) goto fault; memcpy(buf, &d1, 1); break; - case 2: + } + case 2: { + u16 d2; + u16 __user *s = (u16 __user *)src; + if (__get_user(d2, s)) goto fault; memcpy(buf, &d2, 2); break; - case 4: + } + case 4: { + u32 d4; + u32 __user *s = (u32 __user *)src; + if (__get_user(d4, s)) goto fault; memcpy(buf, &d4, 4); break; - case 8: + } + case 8: { + u64 d8; + u64 __user *s = (u64 __user *)src; if (__get_user(d8, s)) goto fault; memcpy(buf, &d8, 8); break; + } default: WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); return ES_UNSUPPORTED; @@ -615,7 +600,7 @@ int __init sev_es_efi_map_ghcbs(pgd_t *pgd) int cpu; u64 pfn; - if (!sev_es_active()) + if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) return 0; pflags = _PAGE_NX | _PAGE_RW; @@ -648,7 +633,8 @@ static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) ghcb_set_rdx(ghcb, regs->dx); } - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, exit_info_1, 0); + ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_MSR, + exit_info_1, 0); if ((ret == ES_OK) && (!exit_info_1)) { regs->ax = ghcb->save.rax; @@ -774,7 +760,7 @@ void __init sev_es_init_vc_handling(void) BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE); - if (!sev_es_active()) + if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) return; if (!sev_es_check_cpu_features()) @@ -787,7 +773,6 @@ void __init sev_es_init_vc_handling(void) for_each_possible_cpu(cpu) { alloc_runtime_data(cpu); init_ghcb(cpu); - setup_vc_stacks(cpu); } sev_es_setup_play_dead(); @@ -807,22 +792,6 @@ static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt) do_early_exception(ctxt->regs, trapnr); } -static long *vc_insn_get_reg(struct es_em_ctxt *ctxt) -{ - long *reg_array; - int offset; - - reg_array = (long *)ctxt->regs; - offset = insn_get_modrm_reg_off(&ctxt->insn, ctxt->regs); - - if (offset < 0) - return NULL; - - offset /= sizeof(long); - - return reg_array + offset; -} - static long *vc_insn_get_rm(struct es_em_ctxt *ctxt) { long *reg_array; @@ -867,77 +836,7 @@ static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt, ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer)); - return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2); -} - -static enum es_result vc_handle_mmio_twobyte_ops(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - struct insn *insn = &ctxt->insn; - unsigned int bytes = 0; - enum es_result ret; - int sign_byte; - long *reg_data; - - switch (insn->opcode.bytes[1]) { - /* MMIO Read w/ zero-extension */ - case 0xb6: - bytes = 1; - fallthrough; - case 0xb7: - if (!bytes) - bytes = 2; - - ret = vc_do_mmio(ghcb, ctxt, bytes, true); - if (ret) - break; - - /* Zero extend based on operand size */ - reg_data = vc_insn_get_reg(ctxt); - if (!reg_data) - return ES_DECODE_FAILED; - - memset(reg_data, 0, insn->opnd_bytes); - - memcpy(reg_data, ghcb->shared_buffer, bytes); - break; - - /* MMIO Read w/ sign-extension */ - case 0xbe: - bytes = 1; - fallthrough; - case 0xbf: - if (!bytes) - bytes = 2; - - ret = vc_do_mmio(ghcb, ctxt, bytes, true); - if (ret) - break; - - /* Sign extend based on operand size */ - reg_data = vc_insn_get_reg(ctxt); - if (!reg_data) - return ES_DECODE_FAILED; - - if (bytes == 1) { - u8 *val = (u8 *)ghcb->shared_buffer; - - sign_byte = (*val & 0x80) ? 0xff : 0x00; - } else { - u16 *val = (u16 *)ghcb->shared_buffer; - - sign_byte = (*val & 0x8000) ? 0xff : 0x00; - } - memset(reg_data, sign_byte, insn->opnd_bytes); - - memcpy(reg_data, ghcb->shared_buffer, bytes); - break; - - default: - ret = ES_UNSUPPORTED; - } - - return ret; + return sev_es_ghcb_hv_call(ghcb, true, ctxt, exit_code, exit_info_1, exit_info_2); } /* @@ -1007,83 +906,79 @@ static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt, return ES_RETRY; } -static enum es_result vc_handle_mmio(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) +static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) { struct insn *insn = &ctxt->insn; unsigned int bytes = 0; + enum mmio_type mmio; enum es_result ret; + u8 sign_byte; long *reg_data; - switch (insn->opcode.bytes[0]) { - /* MMIO Write */ - case 0x88: - bytes = 1; - fallthrough; - case 0x89: - if (!bytes) - bytes = insn->opnd_bytes; + mmio = insn_decode_mmio(insn, &bytes); + if (mmio == MMIO_DECODE_FAILED) + return ES_DECODE_FAILED; - reg_data = vc_insn_get_reg(ctxt); + if (mmio != MMIO_WRITE_IMM && mmio != MMIO_MOVS) { + reg_data = insn_get_modrm_reg_ptr(insn, ctxt->regs); if (!reg_data) return ES_DECODE_FAILED; + } + switch (mmio) { + case MMIO_WRITE: memcpy(ghcb->shared_buffer, reg_data, bytes); - ret = vc_do_mmio(ghcb, ctxt, bytes, false); break; - - case 0xc6: - bytes = 1; - fallthrough; - case 0xc7: - if (!bytes) - bytes = insn->opnd_bytes; - + case MMIO_WRITE_IMM: memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes); - ret = vc_do_mmio(ghcb, ctxt, bytes, false); break; - - /* MMIO Read */ - case 0x8a: - bytes = 1; - fallthrough; - case 0x8b: - if (!bytes) - bytes = insn->opnd_bytes; - + case MMIO_READ: ret = vc_do_mmio(ghcb, ctxt, bytes, true); if (ret) break; - reg_data = vc_insn_get_reg(ctxt); - if (!reg_data) - return ES_DECODE_FAILED; - /* Zero-extend for 32-bit operation */ if (bytes == 4) *reg_data = 0; memcpy(reg_data, ghcb->shared_buffer, bytes); break; + case MMIO_READ_ZERO_EXTEND: + ret = vc_do_mmio(ghcb, ctxt, bytes, true); + if (ret) + break; - /* MOVS instruction */ - case 0xa4: - bytes = 1; - fallthrough; - case 0xa5: - if (!bytes) - bytes = insn->opnd_bytes; + /* Zero extend based on operand size */ + memset(reg_data, 0, insn->opnd_bytes); + memcpy(reg_data, ghcb->shared_buffer, bytes); + break; + case MMIO_READ_SIGN_EXTEND: + ret = vc_do_mmio(ghcb, ctxt, bytes, true); + if (ret) + break; - ret = vc_handle_mmio_movs(ctxt, bytes); + if (bytes == 1) { + u8 *val = (u8 *)ghcb->shared_buffer; + + sign_byte = (*val & 0x80) ? 0xff : 0x00; + } else { + u16 *val = (u16 *)ghcb->shared_buffer; + + sign_byte = (*val & 0x8000) ? 0xff : 0x00; + } + + /* Sign extend based on operand size */ + memset(reg_data, sign_byte, insn->opnd_bytes); + memcpy(reg_data, ghcb->shared_buffer, bytes); break; - /* Two-Byte Opcodes */ - case 0x0f: - ret = vc_handle_mmio_twobyte_ops(ghcb, ctxt); + case MMIO_MOVS: + ret = vc_handle_mmio_movs(ctxt, bytes); break; default: ret = ES_UNSUPPORTED; + break; } return ret; @@ -1117,7 +1012,7 @@ static enum es_result vc_handle_dr7_write(struct ghcb *ghcb, /* Using a value of 0 for ExitInfo1 means RAX holds the value */ ghcb_set_rax(ghcb, val); - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0); + ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_WRITE_DR7, 0, 0); if (ret != ES_OK) return ret; @@ -1147,7 +1042,7 @@ static enum es_result vc_handle_dr7_read(struct ghcb *ghcb, static enum es_result vc_handle_wbinvd(struct ghcb *ghcb, struct es_em_ctxt *ctxt) { - return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0); + return sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_WBINVD, 0, 0); } static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt) @@ -1156,7 +1051,7 @@ static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt ghcb_set_rcx(ghcb, ctxt->regs->cx); - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0); + ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_RDPMC, 0, 0); if (ret != ES_OK) return ret; @@ -1197,7 +1092,7 @@ static enum es_result vc_handle_vmmcall(struct ghcb *ghcb, if (x86_platform.hyper.sev_es_hcall_prepare) x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs); - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0); + ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_VMMCALL, 0, 0); if (ret != ES_OK) return ret; @@ -1319,13 +1214,26 @@ static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt) } } -static __always_inline bool on_vc_fallback_stack(struct pt_regs *regs) +static __always_inline bool is_vc2_stack(unsigned long sp) { - unsigned long sp = (unsigned long)regs; - return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2)); } +static __always_inline bool vc_from_invalid_context(struct pt_regs *regs) +{ + unsigned long sp, prev_sp; + + sp = (unsigned long)regs; + prev_sp = regs->sp; + + /* + * If the code was already executing on the VC2 stack when the #VC + * happened, let it proceed to the normal handling routine. This way the + * code executing on the VC2 stack can cause #VC exceptions to get handled. + */ + return is_vc2_stack(sp) && !is_vc2_stack(prev_sp); +} + static bool vc_raw_handle_exception(struct pt_regs *regs, unsigned long error_code) { struct ghcb_state state; @@ -1406,7 +1314,7 @@ DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication) * But keep this here in case the noinstr annotations are violated due * to bug elsewhere. */ - if (unlikely(on_vc_fallback_stack(regs))) { + if (unlikely(vc_from_invalid_context(regs))) { instrumentation_begin(); panic("Can't handle #VC exception from unsupported context\n"); instrumentation_end(); @@ -1429,7 +1337,7 @@ DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication) show_regs(regs); /* Ask hypervisor to sev_es_terminate */ - sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); + sev_es_terminate(GHCB_SEV_ES_GEN_REQ); /* If that fails and we get here - just panic */ panic("Returned from Terminate-Request to Hypervisor\n"); @@ -1477,7 +1385,7 @@ bool __init handle_vc_boot_ghcb(struct pt_regs *regs) /* Do initial setup or terminate the guest */ if (unlikely(boot_ghcb == NULL && !sev_es_setup_ghcb())) - sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); + sev_es_terminate(GHCB_SEV_ES_GEN_REQ); vc_ghcb_invalidate(boot_ghcb); diff --git a/arch/x86/kernel/sev_verify_cbit.S b/arch/x86/kernel/sev_verify_cbit.S index ee04941a6546..3355e27c69eb 100644 --- a/arch/x86/kernel/sev_verify_cbit.S +++ b/arch/x86/kernel/sev_verify_cbit.S @@ -85,5 +85,5 @@ SYM_FUNC_START(sev_verify_cbit) #endif /* Return page-table pointer */ movq %rdi, %rax - ret + RET SYM_FUNC_END(sev_verify_cbit) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index f4d21e470083..ec71e06ae364 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -15,6 +15,7 @@ #include <linux/mm.h> #include <linux/smp.h> #include <linux/kernel.h> +#include <linux/kstrtox.h> #include <linux/errno.h> #include <linux/wait.h> #include <linux/tracehook.h> @@ -30,8 +31,8 @@ #include <asm/processor.h> #include <asm/ucontext.h> -#include <asm/fpu/internal.h> #include <asm/fpu/signal.h> +#include <asm/fpu/xstate.h> #include <asm/vdso.h> #include <asm/mce.h> #include <asm/sighandling.h> @@ -41,6 +42,7 @@ #include <linux/compat.h> #include <asm/proto.h> #include <asm/ia32_unistd.h> +#include <asm/fpu/xstate.h> #endif /* CONFIG_X86_64 */ #include <asm/syscall.h> @@ -79,9 +81,9 @@ static void force_valid_ss(struct pt_regs *regs) # define CONTEXT_COPY_SIZE sizeof(struct sigcontext) #endif -static int restore_sigcontext(struct pt_regs *regs, - struct sigcontext __user *usc, - unsigned long uc_flags) +static bool restore_sigcontext(struct pt_regs *regs, + struct sigcontext __user *usc, + unsigned long uc_flags) { struct sigcontext sc; @@ -89,7 +91,7 @@ static int restore_sigcontext(struct pt_regs *regs, current->restart_block.fn = do_no_restart_syscall; if (copy_from_user(&sc, usc, CONTEXT_COPY_SIZE)) - return -EFAULT; + return false; #ifdef CONFIG_X86_32 set_user_gs(regs, sc.gs); @@ -244,7 +246,6 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, unsigned long math_size = 0; unsigned long sp = regs->sp; unsigned long buf_fx = 0; - int ret; /* redzone */ if (IS_ENABLED(CONFIG_X86_64)) @@ -292,8 +293,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, } /* save i387 and extended state */ - ret = copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size); - if (ret < 0) + if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size)) return (void __user *)-1L; return (void __user *)sp; @@ -643,7 +643,7 @@ SYSCALL_DEFINE0(sigreturn) * x86_32 has no uc_flags bits relevant to restore_sigcontext. * Save a few cycles by skipping the __get_user. */ - if (restore_sigcontext(regs, &frame->sc, 0)) + if (!restore_sigcontext(regs, &frame->sc, 0)) goto badframe; return regs->ax; @@ -671,7 +671,7 @@ SYSCALL_DEFINE0(rt_sigreturn) set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) + if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; if (restore_altstack(&frame->uc.uc_stack)) @@ -721,12 +721,15 @@ badframe: /* max_frame_size tells userspace the worst case signal stack size. */ static unsigned long __ro_after_init max_frame_size; +static unsigned int __ro_after_init fpu_default_state_size; void __init init_sigframe_size(void) { + fpu_default_state_size = fpu__get_fpstate_size(); + max_frame_size = MAX_FRAME_SIGINFO_UCTXT_SIZE + MAX_FRAME_PADDING; - max_frame_size += fpu__get_fpstate_size() + MAX_XSAVE_PADDING; + max_frame_size += fpu_default_state_size + MAX_XSAVE_PADDING; /* Userspace expects an aligned size. */ max_frame_size = round_up(max_frame_size, FRAME_ALIGNMENT); @@ -910,6 +913,62 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where) force_sig(SIGSEGV); } +#ifdef CONFIG_DYNAMIC_SIGFRAME +#ifdef CONFIG_STRICT_SIGALTSTACK_SIZE +static bool strict_sigaltstack_size __ro_after_init = true; +#else +static bool strict_sigaltstack_size __ro_after_init = false; +#endif + +static int __init strict_sas_size(char *arg) +{ + return kstrtobool(arg, &strict_sigaltstack_size); +} +__setup("strict_sas_size", strict_sas_size); + +/* + * MINSIGSTKSZ is 2048 and can't be changed despite the fact that AVX512 + * exceeds that size already. As such programs might never use the + * sigaltstack they just continued to work. While always checking against + * the real size would be correct, this might be considered a regression. + * + * Therefore avoid the sanity check, unless enforced by kernel + * configuration or command line option. + * + * When dynamic FPU features are supported, the check is also enforced when + * the task has permissions to use dynamic features. Tasks which have no + * permission are checked against the size of the non-dynamic feature set + * if strict checking is enabled. This avoids forcing all tasks on the + * system to allocate large sigaltstacks even if they are never going + * to use a dynamic feature. As this is serialized via sighand::siglock + * any permission request for a dynamic feature either happened already + * or will see the newly install sigaltstack size in the permission checks. + */ +bool sigaltstack_size_valid(size_t ss_size) +{ + unsigned long fsize = max_frame_size - fpu_default_state_size; + u64 mask; + + lockdep_assert_held(¤t->sighand->siglock); + + if (!fpu_state_size_dynamic() && !strict_sigaltstack_size) + return true; + + fsize += current->group_leader->thread.fpu.perm.__user_state_size; + if (likely(ss_size > fsize)) + return true; + + if (strict_sigaltstack_size) + return ss_size > fsize; + + mask = current->group_leader->thread.fpu.perm.__state_perm; + if (mask & XFEATURE_MASK_USER_DYNAMIC) + return ss_size > fsize; + + return true; +} +#endif /* CONFIG_DYNAMIC_SIGFRAME */ + #ifdef CONFIG_X86_X32_ABI COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn) { @@ -929,7 +988,7 @@ COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn) set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) + if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; if (compat_restore_altstack(&frame->uc.uc_stack)) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 85f6e242b6b4..617012f4619f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -70,7 +70,7 @@ #include <asm/mwait.h> #include <asm/apic.h> #include <asm/io_apic.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> #include <asm/setup.h> #include <asm/uv/uv.h> #include <linux/mc146818rtc.h> @@ -101,6 +101,8 @@ EXPORT_PER_CPU_SYMBOL(cpu_die_map); DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map); + /* Per CPU bogomips and other parameters */ DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); @@ -464,6 +466,21 @@ static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) return false; } +static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + + /* If the arch didn't set up l2c_id, fall back to SMT */ + if (per_cpu(cpu_l2c_id, cpu1) == BAD_APICID) + return match_smt(c, o); + + /* Do not match if L2 cache id does not match: */ + if (per_cpu(cpu_l2c_id, cpu1) != per_cpu(cpu_l2c_id, cpu2)) + return false; + + return topology_sane(c, o, "l2c"); +} + /* * Unlike the other levels, we do not enforce keeping a * multicore group inside a NUMA node. If this happens, we will @@ -523,7 +540,7 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) } -#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) +#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_CLUSTER) || defined(CONFIG_SCHED_MC) static inline int x86_sched_itmt_flags(void) { return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0; @@ -541,15 +558,35 @@ static int x86_smt_flags(void) return cpu_smt_flags() | x86_sched_itmt_flags(); } #endif +#ifdef CONFIG_SCHED_CLUSTER +static int x86_cluster_flags(void) +{ + return cpu_cluster_flags() | x86_sched_itmt_flags(); +} +#endif #endif static struct sched_domain_topology_level x86_numa_in_package_topology[] = { #ifdef CONFIG_SCHED_SMT { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, #endif +#ifdef CONFIG_SCHED_CLUSTER + { cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) }, +#endif +#ifdef CONFIG_SCHED_MC + { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, +#endif + { NULL, }, +}; + +static struct sched_domain_topology_level x86_hybrid_topology[] = { +#ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, +#endif #ifdef CONFIG_SCHED_MC { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, #endif + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, { NULL, }, }; @@ -557,6 +594,9 @@ static struct sched_domain_topology_level x86_topology[] = { #ifdef CONFIG_SCHED_SMT { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, #endif +#ifdef CONFIG_SCHED_CLUSTER + { cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) }, +#endif #ifdef CONFIG_SCHED_MC { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, #endif @@ -584,6 +624,7 @@ void set_cpu_sibling_map(int cpu) if (!has_mp) { cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu)); cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu)); + cpumask_set_cpu(cpu, cpu_l2c_shared_mask(cpu)); cpumask_set_cpu(cpu, topology_core_cpumask(cpu)); cpumask_set_cpu(cpu, topology_die_cpumask(cpu)); c->booted_cores = 1; @@ -602,6 +643,9 @@ void set_cpu_sibling_map(int cpu) if ((i == cpu) || (has_mp && match_llc(c, o))) link_mask(cpu_llc_shared_mask, cpu, i); + if ((i == cpu) || (has_mp && match_l2c(c, o))) + link_mask(cpu_l2c_shared_mask, cpu, i); + if ((i == cpu) || (has_mp && match_die(c, o))) link_mask(topology_die_cpumask, cpu, i); } @@ -652,6 +696,11 @@ const struct cpumask *cpu_coregroup_mask(int cpu) return cpu_llc_shared_mask(cpu); } +const struct cpumask *cpu_clustergroup_mask(int cpu) +{ + return cpu_l2c_shared_mask(cpu); +} + static void impress_friends(void) { int cpu; @@ -1312,12 +1361,7 @@ static void __init smp_get_logical_apicid(void) cpu0_logical_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); } -/* - * Prepare for SMP bootup. - * @max_cpus: configured maximum number of CPUs, It is a legacy parameter - * for common interface support. - */ -void __init native_smp_prepare_cpus(unsigned int max_cpus) +void __init smp_prepare_cpus_common(void) { unsigned int i; @@ -1335,6 +1379,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL); } /* @@ -1347,6 +1392,17 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) set_sched_topology(x86_topology); set_cpu_sibling_map(0); +} + +/* + * Prepare for SMP bootup. + * @max_cpus: configured maximum number of CPUs, It is a legacy parameter + * for common interface support. + */ +void __init native_smp_prepare_cpus(unsigned int max_cpus) +{ + smp_prepare_cpus_common(); + init_freq_invariance(false, false); smp_sanity_check(); @@ -1424,8 +1480,11 @@ void __init native_smp_cpus_done(unsigned int max_cpus) calculate_max_logical_packages(); + /* XXX for now assume numa-in-package and hybrid don't overlap */ if (x86_has_numa_in_package) set_sched_topology(x86_numa_in_package_topology); + if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) + set_sched_topology(x86_hybrid_topology); nmi_selftest(); impress_friends(); @@ -1564,7 +1623,10 @@ static void remove_siblinginfo(int cpu) for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling)); + for_each_cpu(sibling, cpu_l2c_shared_mask(cpu)) + cpumask_clear_cpu(cpu, cpu_l2c_shared_mask(sibling)); cpumask_clear(cpu_llc_shared_mask(cpu)); + cpumask_clear(cpu_l2c_shared_mask(cpu)); cpumask_clear(topology_sibling_cpumask(cpu)); cpumask_clear(topology_core_cpumask(cpu)); cpumask_clear(topology_die_cpumask(cpu)); @@ -2166,7 +2228,7 @@ DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; void arch_scale_freq_tick(void) { - u64 freq_scale = SCHED_CAPACITY_SCALE; + u64 freq_scale; u64 aperf, mperf; u64 acnt, mcnt; diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index ea028e736831..531fb4cbb63f 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -17,6 +17,8 @@ enum insn_type { */ static const u8 xor5rax[] = { 0x66, 0x66, 0x48, 0x31, 0xc0 }; +static const u8 retinsn[] = { RET_INSN_OPCODE, 0xcc, 0xcc, 0xcc, 0xcc }; + static void __ref __static_call_transform(void *insn, enum insn_type type, void *func) { const void *emulate = NULL; @@ -42,8 +44,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, void break; case RET: - code = text_gen_insn(RET_INSN_OPCODE, insn, func); - size = RET_INSN_SIZE; + code = &retinsn; break; } @@ -56,10 +57,15 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, void text_poke_bp(insn, code, size, emulate); } -static void __static_call_validate(void *insn, bool tail) +static void __static_call_validate(void *insn, bool tail, bool tramp) { u8 opcode = *(u8 *)insn; + if (tramp && memcmp(insn+5, "SCT", 3)) { + pr_err("trampoline signature fail"); + BUG(); + } + if (tail) { if (opcode == JMP32_INSN_OPCODE || opcode == RET_INSN_OPCODE) @@ -74,7 +80,8 @@ static void __static_call_validate(void *insn, bool tail) /* * If we ever trigger this, our text is corrupt, we'll probably not live long. */ - WARN_ONCE(1, "unexpected static_call insn opcode 0x%x at %pS\n", opcode, insn); + pr_err("unexpected static_call insn opcode 0x%x at %pS\n", opcode, insn); + BUG(); } static inline enum insn_type __sc_insn(bool null, bool tail) @@ -97,12 +104,12 @@ void arch_static_call_transform(void *site, void *tramp, void *func, bool tail) mutex_lock(&text_mutex); if (tramp) { - __static_call_validate(tramp, true); + __static_call_validate(tramp, true, true); __static_call_transform(tramp, __sc_insn(!func, true), func); } if (IS_ENABLED(CONFIG_HAVE_STATIC_CALL_INLINE) && site) { - __static_call_validate(site, tail); + __static_call_validate(site, tail, false); __static_call_transform(site, __sc_insn(!func, tail), func); } diff --git a/arch/x86/kernel/trace.c b/arch/x86/kernel/trace.c index 6b73b6f92ad3..8322e8352777 100644 --- a/arch/x86/kernel/trace.c +++ b/arch/x86/kernel/trace.c @@ -231,4 +231,4 @@ void osnoise_arch_unregister(void) unregister_trace_local_timer_exit(trace_intel_irq_exit, "local_timer"); unregister_trace_local_timer_entry(trace_intel_irq_entry, NULL); } -#endif /* CONFIG_OSNOISE_TRAECR && CONFIG_X86_LOCAL_APIC */ +#endif /* CONFIG_OSNOISE_TRACER && CONFIG_X86_LOCAL_APIC */ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a58800973aed..c9d566dcf89a 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -48,7 +48,7 @@ #include <asm/ftrace.h> #include <asm/traps.h> #include <asm/desc.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> #include <asm/cpu.h> #include <asm/cpu_entry_area.h> #include <asm/mce.h> @@ -313,17 +313,19 @@ out: } #ifdef CONFIG_VMAP_STACK -__visible void __noreturn handle_stack_overflow(const char *message, - struct pt_regs *regs, - unsigned long fault_address) +__visible void __noreturn handle_stack_overflow(struct pt_regs *regs, + unsigned long fault_address, + struct stack_info *info) { - printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n", - (void *)fault_address, current->stack, - (char *)current->stack + THREAD_SIZE - 1); - die(message, regs, 0); + const char *name = stack_type_name(info->type); + + printk(KERN_EMERG "BUG: %s stack guard page was hit at %p (stack is %p..%p)\n", + name, (void *)fault_address, info->begin, info->end); + + die("stack guard page", regs, 0); /* Be absolutely certain we don't return. */ - panic("%s", message); + panic("%s stack guard hit", name); } #endif @@ -353,6 +355,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault) #ifdef CONFIG_VMAP_STACK unsigned long address = read_cr2(); + struct stack_info info; #endif #ifdef CONFIG_X86_ESPFIX64 @@ -455,10 +458,8 @@ DEFINE_IDTENTRY_DF(exc_double_fault) * stack even if the actual trigger for the double fault was * something else. */ - if ((unsigned long)task_stack_page(tsk) - 1 - address < PAGE_SIZE) { - handle_stack_overflow("kernel stack overflow (double-fault)", - regs, address); - } + if (get_stack_guard_info((void *)address, &info)) + handle_stack_overflow(regs, address, &info); #endif pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code); @@ -528,6 +529,36 @@ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs, #define GPFSTR "general protection fault" +static bool fixup_iopl_exception(struct pt_regs *regs) +{ + struct thread_struct *t = ¤t->thread; + unsigned char byte; + unsigned long ip; + + if (!IS_ENABLED(CONFIG_X86_IOPL_IOPERM) || t->iopl_emul != 3) + return false; + + if (insn_get_effective_ip(regs, &ip)) + return false; + + if (get_user(byte, (const char __user *)ip)) + return false; + + if (byte != 0xfa && byte != 0xfb) + return false; + + if (!t->iopl_warn && printk_ratelimit()) { + pr_err("%s[%d] attempts to use CLI/STI, pretending it's a NOP, ip:%lx", + current->comm, task_pid_nr(current), ip); + print_vma_addr(KERN_CONT " in ", ip); + pr_cont("\n"); + t->iopl_warn = 1; + } + + regs->ip += 1; + return true; +} + DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) { char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR; @@ -553,6 +584,9 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) tsk = current; if (user_mode(regs)) { + if (fixup_iopl_exception(regs)) + goto exit; + tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_GP; @@ -709,7 +743,7 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *r stack = (unsigned long *)sp; if (!get_stack_info_noinstr(stack, current, &info) || info.type == STACK_TYPE_ENTRY || - info.type >= STACK_TYPE_EXCEPTION_LAST) + info.type > STACK_TYPE_EXCEPTION_LAST) sp = __this_cpu_ist_top_va(VC2); sync: @@ -1108,10 +1142,48 @@ DEFINE_IDTENTRY(exc_spurious_interrupt_bug) */ } +static bool handle_xfd_event(struct pt_regs *regs) +{ + u64 xfd_err; + int err; + + if (!IS_ENABLED(CONFIG_X86_64) || !cpu_feature_enabled(X86_FEATURE_XFD)) + return false; + + rdmsrl(MSR_IA32_XFD_ERR, xfd_err); + if (!xfd_err) + return false; + + wrmsrl(MSR_IA32_XFD_ERR, 0); + + /* Die if that happens in kernel space */ + if (WARN_ON(!user_mode(regs))) + return false; + + local_irq_enable(); + + err = xfd_enable_feature(xfd_err); + + switch (err) { + case -EPERM: + force_sig_fault(SIGILL, ILL_ILLOPC, error_get_trap_addr(regs)); + break; + case -EFAULT: + force_sig(SIGSEGV); + break; + } + + local_irq_disable(); + return true; +} + DEFINE_IDTENTRY(exc_device_not_available) { unsigned long cr0 = read_cr0(); + if (handle_xfd_event(regs)) + return; + #ifdef CONFIG_MATH_EMULATION if (!boot_cpu_has(X86_FEATURE_FPU) && (cr0 & X86_CR0_EM)) { struct math_emu_info info = { }; diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 2e076a459a0c..a698196377be 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1180,6 +1180,12 @@ void mark_tsc_unstable(char *reason) EXPORT_SYMBOL_GPL(mark_tsc_unstable); +static void __init tsc_disable_clocksource_watchdog(void) +{ + clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY; + clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; +} + static void __init check_system_tsc_reliable(void) { #if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC) @@ -1196,6 +1202,23 @@ static void __init check_system_tsc_reliable(void) #endif if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) tsc_clocksource_reliable = 1; + + /* + * Disable the clocksource watchdog when the system has: + * - TSC running at constant frequency + * - TSC which does not stop in C-States + * - the TSC_ADJUST register which allows to detect even minimal + * modifications + * - not more than two sockets. As the number of sockets cannot be + * evaluated at the early boot stage where this has to be + * invoked, check the number of online memory nodes as a + * fallback solution which is an reasonable estimate. + */ + if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && + boot_cpu_has(X86_FEATURE_NONSTOP_TSC) && + boot_cpu_has(X86_FEATURE_TSC_ADJUST) && + nr_online_nodes <= 2) + tsc_disable_clocksource_watchdog(); } /* @@ -1387,9 +1410,6 @@ static int __init init_tsc_clocksource(void) if (tsc_unstable) goto unreg; - if (tsc_clocksource_reliable || no_tsc_watchdog) - clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; - if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3)) clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP; @@ -1527,7 +1547,7 @@ void __init tsc_init(void) } if (tsc_clocksource_reliable || no_tsc_watchdog) - clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY; + tsc_disable_clocksource_watchdog(); clocksource_register_khz(&clocksource_tsc_early, tsc_khz); detect_art(); diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 50a4515fe0ad..9452dc9664b5 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -30,6 +30,7 @@ struct tsc_adjust { }; static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust); +static struct timer_list tsc_sync_check_timer; /* * TSC's on different sockets may be reset asynchronously. @@ -77,6 +78,46 @@ void tsc_verify_tsc_adjust(bool resume) } } +/* + * Normally the tsc_sync will be checked every time system enters idle + * state, but there is still caveat that a system won't enter idle, + * either because it's too busy or configured purposely to not enter + * idle. + * + * So setup a periodic timer (every 10 minutes) to make sure the check + * is always on. + */ + +#define SYNC_CHECK_INTERVAL (HZ * 600) + +static void tsc_sync_check_timer_fn(struct timer_list *unused) +{ + int next_cpu; + + tsc_verify_tsc_adjust(false); + + /* Run the check for all onlined CPUs in turn */ + next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); + if (next_cpu >= nr_cpu_ids) + next_cpu = cpumask_first(cpu_online_mask); + + tsc_sync_check_timer.expires += SYNC_CHECK_INTERVAL; + add_timer_on(&tsc_sync_check_timer, next_cpu); +} + +static int __init start_sync_check_timer(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_TSC_ADJUST) || tsc_clocksource_reliable) + return 0; + + timer_setup(&tsc_sync_check_timer, tsc_sync_check_timer_fn, 0); + tsc_sync_check_timer.expires = jiffies + SYNC_CHECK_INTERVAL; + add_timer(&tsc_sync_check_timer); + + return 0; +} +late_initcall(start_sync_check_timer); + static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval, unsigned int cpu, bool bootcpu) { diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index 576b47e7523d..5a4b21389b1d 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -92,8 +92,8 @@ static const char * const umip_insns[5] = { #define umip_pr_err(regs, fmt, ...) \ umip_printk(regs, KERN_ERR, fmt, ##__VA_ARGS__) -#define umip_pr_warn(regs, fmt, ...) \ - umip_printk(regs, KERN_WARNING, fmt, ##__VA_ARGS__) +#define umip_pr_debug(regs, fmt, ...) \ + umip_printk(regs, KERN_DEBUG, fmt, ##__VA_ARGS__) /** * umip_printk() - Print a rate-limited message @@ -361,10 +361,10 @@ bool fixup_umip_exception(struct pt_regs *regs) if (umip_inst < 0) return false; - umip_pr_warn(regs, "%s instruction cannot be used by applications.\n", + umip_pr_debug(regs, "%s instruction cannot be used by applications.\n", umip_insns[umip_inst]); - umip_pr_warn(regs, "For now, expensive software emulation returns the result.\n"); + umip_pr_debug(regs, "For now, expensive software emulation returns the result.\n"); if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size, user_64bit_mode(regs))) diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index d7c44b257f7f..8e1c50c86e5d 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -240,8 +240,7 @@ static bool update_stack_state(struct unwind_state *state, else { addr_p = unwind_get_return_address_ptr(state); addr = READ_ONCE_TASK_STACK(state->task, *addr_p); - state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, - addr, addr_p); + state->ip = unwind_recover_ret_addr(state, addr, addr_p); } /* Save the original stack pointer for unwind_dump(): */ diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c index c49f10ffd8cd..884d68a6e714 100644 --- a/arch/x86/kernel/unwind_guess.c +++ b/arch/x86/kernel/unwind_guess.c @@ -15,8 +15,7 @@ unsigned long unwind_get_return_address(struct unwind_state *state) addr = READ_ONCE_NOCHECK(*state->sp); - return ftrace_graph_ret_addr(state->task, &state->graph_idx, - addr, state->sp); + return unwind_recover_ret_addr(state, addr, state->sp); } EXPORT_SYMBOL_GPL(unwind_get_return_address); diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index a1202536fc57..2de3c8c5eba9 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -175,7 +175,7 @@ static struct orc_entry *orc_find(unsigned long ip) } /* vmlinux .init slow lookup: */ - if (init_kernel_text(ip)) + if (is_kernel_inittext(ip)) return __orc_find(__start_orc_unwind_ip, __start_orc_unwind, __stop_orc_unwind_ip - __start_orc_unwind_ip, ip); @@ -534,9 +534,8 @@ bool unwind_next_frame(struct unwind_state *state) if (!deref_stack_reg(state, ip_p, &state->ip)) goto err; - state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, - state->ip, (void *)ip_p); - + state->ip = unwind_recover_ret_addr(state, state->ip, + (unsigned long *)ip_p); state->sp = sp; state->regs = NULL; state->prev_regs = NULL; @@ -549,7 +548,18 @@ bool unwind_next_frame(struct unwind_state *state) (void *)orig_ip); goto err; } - + /* + * There is a small chance to interrupt at the entry of + * __kretprobe_trampoline() where the ORC info doesn't exist. + * That point is right after the RET to __kretprobe_trampoline() + * which was modified return address. + * At that point, the @addr_p of the unwind_recover_kretprobe() + * (this has to point the address of the stack entry storing + * the modified return address) must be "SP - (a stack entry)" + * because SP is incremented by the RET. + */ + state->ip = unwind_recover_kretprobe(state, state->ip, + (unsigned long *)(state->sp - sizeof(long))); state->regs = (struct pt_regs *)sp; state->prev_regs = NULL; state->full_regs = true; @@ -562,6 +572,9 @@ bool unwind_next_frame(struct unwind_state *state) (void *)orig_ip); goto err; } + /* See UNWIND_HINT_TYPE_REGS case comment. */ + state->ip = unwind_recover_kretprobe(state, state->ip, + (unsigned long *)(state->sp - sizeof(long))); if (state->full_regs) state->prev_regs = state->regs; diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S index 641f0fe1e5b4..1258a5872d12 100644 --- a/arch/x86/kernel/verify_cpu.S +++ b/arch/x86/kernel/verify_cpu.S @@ -132,9 +132,9 @@ SYM_FUNC_START_LOCAL(verify_cpu) .Lverify_cpu_no_longmode: popf # Restore caller passed flags movl $1,%eax - ret + RET .Lverify_cpu_sse_ok: popf # Restore caller passed flags xorl %eax, %eax - ret + RET SYM_FUNC_END(verify_cpu) diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index e5a7a10a0164..c21bcd668284 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -106,10 +106,8 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) */ local_irq_enable(); - if (!vm86 || !vm86->user_vm86) { - pr_alert("no user_vm86: BAD\n"); - do_exit(SIGSEGV); - } + BUG_ON(!vm86); + set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->veflags_mask); user = vm86->user_vm86; @@ -142,6 +140,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) user_access_end(); +exit_vm86: preempt_disable(); tsk->thread.sp0 = vm86->saved_sp0; tsk->thread.sysenter_cs = __KERNEL_CS; @@ -161,7 +160,8 @@ Efault_end: user_access_end(); Efault: pr_alert("could not access userspace vm86 info\n"); - do_exit(SIGSEGV); + force_exit_sig(SIGSEGV); + goto exit_vm86; } static int do_vm86_irq_handling(int subfunction, int irqnumber); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index efd9e9ea17f2..27f830345b6f 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -137,7 +137,6 @@ SECTIONS ALIGN_ENTRY_TEXT_END SOFTIRQENTRY_TEXT STATIC_CALL_TEXT - *(.fixup) *(.gnu.warning) #ifdef CONFIG_RETPOLINE @@ -272,6 +271,20 @@ SECTIONS __parainstructions_end = .; } +#ifdef CONFIG_RETPOLINE + /* + * List of instructions that call/jmp/jcc to retpoline thunks + * __x86_indirect_thunk_*(). These instructions can be patched along + * with alternatives, after which the section can be freed. + */ + . = ALIGN(8); + .retpoline_sites : AT(ADDR(.retpoline_sites) - LOAD_OFFSET) { + __retpoline_sites = .; + *(.retpoline_sites) + __retpoline_sites_end = .; + } +#endif + /* * struct alt_inst entries. From the header (alternative.h): * "Alternative instructions for different CPU types or capabilities" diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 8b395821cb8d..7d20c1d34a3c 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -145,18 +145,6 @@ struct x86_platform_ops x86_platform __ro_after_init = { EXPORT_SYMBOL_GPL(x86_platform); -#if defined(CONFIG_PCI_MSI) -struct x86_msi_ops x86_msi __ro_after_init = { - .restore_msi_irqs = default_restore_msi_irqs, -}; - -/* MSI arch specific hooks */ -void arch_restore_msi_irqs(struct pci_dev *dev) -{ - x86_msi.restore_msi_irqs(dev); -} -#endif - struct x86_apic_ops x86_apic_ops __ro_after_init = { .io_apic_read = native_io_apic_read, .restore = native_restore_boot_irq_mode, diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index ac69894eab88..2b1548da00eb 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -26,7 +26,9 @@ config KVM select PREEMPT_NOTIFIERS select MMU_NOTIFIER select HAVE_KVM_IRQCHIP + select HAVE_KVM_PFNCACHE select HAVE_KVM_IRQFD + select HAVE_KVM_DIRTY_RING select IRQ_BYPASS_MANAGER select HAVE_KVM_IRQ_BYPASS select HAVE_KVM_IRQ_ROUTING @@ -36,6 +38,7 @@ config KVM select KVM_MMIO select SCHED_INFO select PERF_EVENTS + select GUEST_PERF_EVENTS select HAVE_KVM_MSI select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_NO_POLL @@ -43,6 +46,7 @@ config KVM select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_VFIO select SRCU + select INTERVAL_TREE select HAVE_KVM_PM_NOTIFIER if PM help Support hosting fully virtualized guest machines using hardware @@ -129,4 +133,7 @@ config KVM_MMU_AUDIT This option adds a R/W kVM module parameter 'mmu_audit', which allows auditing of KVM MMU events at runtime. +config KVM_EXTERNAL_WRITE_TRACKING + bool + endif # VIRTUALIZATION diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 75dfd27b6e8a..30f244b64523 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -7,12 +7,7 @@ ifeq ($(CONFIG_FRAME_POINTER),y) OBJECT_FILES_NON_STANDARD_vmenter.o := y endif -KVM := ../../../virt/kvm - -kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ - $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o \ - $(KVM)/dirty_ring.o $(KVM)/binary_stats.o -kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o +include $(srctree)/virt/kvm/Makefile.kvm kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 751aa85a3001..c55e57b30e81 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -32,7 +32,7 @@ u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; EXPORT_SYMBOL_GPL(kvm_cpu_caps); -static u32 xstate_required_size(u64 xstate_bv, bool compacted) +u32 xstate_required_size(u64 xstate_bv, bool compacted) { int feature_bit = 0; u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; @@ -42,7 +42,11 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted) if (xstate_bv & 0x1) { u32 eax, ebx, ecx, edx, offset; cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx); - offset = compacted ? ret : ebx; + /* ECX[1]: 64B alignment in compacted form */ + if (compacted) + offset = (ecx & 0x2) ? ALIGN(ret, 64) : ret; + else + offset = ebx; ret = max(ret, offset + eax); } @@ -53,9 +57,16 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted) return ret; } +/* + * This one is tied to SSB in the user API, and not + * visible in /proc/cpuinfo. + */ +#define KVM_X86_FEATURE_PSFD (13*32+28) /* Predictive Store Forwarding Disable */ + #define F feature_bit #define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0) + static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index) { @@ -73,9 +84,12 @@ static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( return NULL; } -static int kvm_check_cpuid(struct kvm_cpuid_entry2 *entries, int nent) +static int kvm_check_cpuid(struct kvm_vcpu *vcpu, + struct kvm_cpuid_entry2 *entries, + int nent) { struct kvm_cpuid_entry2 *best; + u64 xfeatures; /* * The existing code assumes virtual address is 48-bit or 57-bit in the @@ -89,14 +103,61 @@ static int kvm_check_cpuid(struct kvm_cpuid_entry2 *entries, int nent) return -EINVAL; } - return 0; + /* + * Exposing dynamic xfeatures to the guest requires additional + * enabling in the FPU, e.g. to expand the guest XSAVE state size. + */ + best = cpuid_entry2_find(entries, nent, 0xd, 0); + if (!best) + return 0; + + xfeatures = best->eax | ((u64)best->edx << 32); + xfeatures &= XFEATURE_MASK_USER_DYNAMIC; + if (!xfeatures) + return 0; + + return fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, xfeatures); } -void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) +static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *best; + u32 function; + struct kvm_cpuid_entry2 *entry; + + vcpu->arch.kvm_cpuid_base = 0; + + for_each_possible_hypervisor_cpuid_base(function) { + entry = kvm_find_cpuid_entry(vcpu, function, 0); + + if (entry) { + u32 signature[3]; + + signature[0] = entry->ebx; + signature[1] = entry->ecx; + signature[2] = entry->edx; + + BUILD_BUG_ON(sizeof(signature) > sizeof(KVM_SIGNATURE)); + if (!memcmp(signature, KVM_SIGNATURE, sizeof(signature))) { + vcpu->arch.kvm_cpuid_base = function; + break; + } + } + } +} - best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0); +static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu) +{ + u32 base = vcpu->arch.kvm_cpuid_base; + + if (!base) + return NULL; + + return kvm_find_cpuid_entry(vcpu, base | KVM_CPUID_FEATURES, 0); +} + +void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best = kvm_find_kvm_cpuid_features(vcpu); /* * save the feature bitmap to avoid cpuid lookup for every PV @@ -135,7 +196,7 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) cpuid_entry_has(best, X86_FEATURE_XSAVEC))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0); + best = kvm_find_kvm_cpuid_features(vcpu); if (kvm_hlt_in_guest(vcpu->kvm) && best && (best->eax & (1 << KVM_FEATURE_PV_UNHALT))) best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); @@ -232,6 +293,26 @@ u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu) return rsvd_bits(cpuid_maxphyaddr(vcpu), 63); } +static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, + int nent) +{ + int r; + + r = kvm_check_cpuid(vcpu, e2, nent); + if (r) + return r; + + kvfree(vcpu->arch.cpuid_entries); + vcpu->arch.cpuid_entries = e2; + vcpu->arch.cpuid_nent = nent; + + kvm_update_kvm_cpuid_base(vcpu); + kvm_update_cpuid_runtime(vcpu); + kvm_vcpu_after_set_cpuid(vcpu); + + return 0; +} + /* when an old userspace process fills a new kernel module */ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid *cpuid, @@ -268,18 +349,9 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, e2[i].padding[2] = 0; } - r = kvm_check_cpuid(e2, cpuid->nent); - if (r) { + r = kvm_set_cpuid(vcpu, e2, cpuid->nent); + if (r) kvfree(e2); - goto out_free_cpuid; - } - - kvfree(vcpu->arch.cpuid_entries); - vcpu->arch.cpuid_entries = e2; - vcpu->arch.cpuid_nent = cpuid->nent; - - kvm_update_cpuid_runtime(vcpu); - kvm_vcpu_after_set_cpuid(vcpu); out_free_cpuid: kvfree(e); @@ -303,20 +375,11 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, return PTR_ERR(e2); } - r = kvm_check_cpuid(e2, cpuid->nent); - if (r) { + r = kvm_set_cpuid(vcpu, e2, cpuid->nent); + if (r) kvfree(e2); - return r; - } - kvfree(vcpu->arch.cpuid_entries); - vcpu->arch.cpuid_entries = e2; - vcpu->arch.cpuid_nent = cpuid->nent; - - kvm_update_cpuid_runtime(vcpu); - kvm_vcpu_after_set_cpuid(vcpu); - - return 0; + return r; } int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, @@ -379,9 +442,11 @@ void kvm_set_cpu_caps(void) #ifdef CONFIG_X86_64 unsigned int f_gbpages = F(GBPAGES); unsigned int f_lm = F(LM); + unsigned int f_xfd = F(XFD); #else unsigned int f_gbpages = 0; unsigned int f_lm = 0; + unsigned int f_xfd = 0; #endif memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps)); @@ -449,7 +514,8 @@ void kvm_set_cpu_caps(void) F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) | - F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) + F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) | + F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) ); /* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */ @@ -468,7 +534,7 @@ void kvm_set_cpu_caps(void) ); kvm_cpu_cap_mask(CPUID_D_1_EAX, - F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) + F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) | f_xfd ); kvm_cpu_cap_init_scattered(CPUID_12_EAX, @@ -480,7 +546,7 @@ void kvm_set_cpu_caps(void) F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) | 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) | - F(TOPOEXT) | F(PERFCTR_CORE) + F(TOPOEXT) | 0 /* PERFCTR_CORE */ ); kvm_cpu_cap_mask(CPUID_8000_0001_EDX, @@ -500,7 +566,8 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_mask(CPUID_8000_0008_EBX, F(CLZERO) | F(XSAVEERPTR) | F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | - F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) + F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) | + __feature_bit(KVM_X86_FEATURE_PSFD) ); /* @@ -593,6 +660,8 @@ static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array, case 0x14: case 0x17: case 0x18: + case 0x1d: + case 0x1e: case 0x1f: case 0x8000001d: entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; @@ -767,11 +836,13 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) goto out; } break; - case 0xd: - entry->eax &= supported_xcr0; + case 0xd: { + u64 guest_perm = xstate_get_guest_group_perm(); + + entry->eax &= supported_xcr0 & guest_perm; entry->ebx = xstate_required_size(supported_xcr0, false); entry->ecx = entry->ebx; - entry->edx &= supported_xcr0 >> 32; + entry->edx &= (supported_xcr0 & guest_perm) >> 32; if (!supported_xcr0) break; @@ -818,6 +889,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->edx = 0; } break; + } case 0x12: /* Intel SGX */ if (!kvm_cpu_cap_has(X86_FEATURE_SGX)) { @@ -862,9 +934,26 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) goto out; } break; + /* Intel AMX TILE */ + case 0x1d: + if (!kvm_cpu_cap_has(X86_FEATURE_AMX_TILE)) { + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; + break; + } + + for (i = 1, max_idx = entry->eax; i <= max_idx; ++i) { + if (!do_host_cpuid(array, function, i)) + goto out; + } + break; + case 0x1e: /* TMUL information */ + if (!kvm_cpu_cap_has(X86_FEATURE_AMX_TILE)) { + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; + break; + } + break; case KVM_CPUID_SIGNATURE: { - static const char signature[12] = "KVMKVMKVM\0\0"; - const u32 *sigptr = (const u32 *)signature; + const u32 *sigptr = (const u32 *)KVM_SIGNATURE; entry->eax = KVM_CPUID_FEATURES; entry->ebx = sigptr[0]; entry->ecx = sigptr[1]; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index c99edfff7f82..8a770b481d9d 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -30,6 +30,8 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool exact_only); +u32 xstate_required_size(u64 xstate_bv, bool compacted); + int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu); u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index 54a83a744538..9240b3b7f8dd 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -95,6 +95,9 @@ static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v) unsigned int *log[KVM_NR_PAGE_SIZES], *cur; int i, j, k, l, ret; + if (!kvm_memslots_have_rmaps(kvm)) + return 0; + ret = -ENOMEM; memset(log, 0, sizeof(log)); for (i = 0; i < KVM_NR_PAGE_SIZES; i++) { @@ -107,9 +110,10 @@ static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v) write_lock(&kvm->mmu_lock); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + int bkt; + slots = __kvm_memslots(kvm, i); - for (j = 0; j < slots->used_slots; j++) { - slot = &slots->memslots[j]; + kvm_for_each_memslot(slot, bkt, slots) for (k = 0; k < KVM_NR_PAGE_SIZES; k++) { rmap = slot->arch.rmap[k]; lpage_size = kvm_mmu_slot_lpages(slot, k + 1); @@ -121,7 +125,6 @@ static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v) cur[index]++; } } - } } write_unlock(&kvm->mmu_lock); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 9a144ca8e146..5719d8cfdbd9 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -175,6 +175,7 @@ #define No16 ((u64)1 << 53) /* No 16 bit operand */ #define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */ #define TwoMemOp ((u64)1 << 55) /* Instruction has two memory operand */ +#define IsBranch ((u64)1 << 56) /* Instruction is considered a branch. */ #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) @@ -191,8 +192,9 @@ #define FASTOP_SIZE 8 struct opcode { - u64 flags : 56; - u64 intercept : 8; + u64 flags; + u8 intercept; + u8 pad[7]; union { int (*execute)(struct x86_emulate_ctxt *ctxt); const struct opcode *group; @@ -315,7 +317,7 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop); __FOP_FUNC(#name) #define __FOP_RET(name) \ - "ret \n\t" \ + "11: " ASM_RET \ ".size " name ", .-" name "\n\t" #define FOP_RET(name) \ @@ -344,7 +346,7 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop); __FOP_RET(#op "_" #dst) #define FOP1EEX(op, dst) \ - FOP1E(op, dst) _ASM_EXTABLE(10b, kvm_fastop_exception) + FOP1E(op, dst) _ASM_EXTABLE_TYPE_REG(10b, 11b, EX_TYPE_ZERO_REG, %%esi) #define FASTOP1(op) \ FOP_START(op) \ @@ -434,10 +436,6 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop); #op " %al \n\t" \ __FOP_RET(#op) -asm(".pushsection .fixup, \"ax\"\n" - "kvm_fastop_exception: xor %esi, %esi; ret\n" - ".popsection"); - FOP_START(setcc) FOP_SETCC(seto) FOP_SETCC(setno) @@ -473,12 +471,8 @@ FOP_END; \ asm volatile("1:" insn "\n" \ "2:\n" \ - ".pushsection .fixup, \"ax\"\n" \ - "3: movl $1, %[_fault]\n" \ - " jmp 2b\n" \ - ".popsection\n" \ - _ASM_EXTABLE(1b, 3b) \ - : [_fault] "+qm"(_fault) inoutclob ); \ + _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %[_fault]) \ + : [_fault] "+r"(_fault) inoutclob ); \ \ _fault ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; \ }) @@ -4222,6 +4216,11 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt) if (enable_vmware_backdoor && is_vmware_backdoor_pmc(rcx)) return X86EMUL_CONTINUE; + /* + * If CR4.PCE is set, the SDM requires CPL=0 or CR0.PE=0. The CR0.PE + * check however is unnecessary because CPL is always 0 outside + * protected mode. + */ if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) || ctxt->ops->check_pmc(ctxt, rcx)) return emulate_gp(ctxt, 0); @@ -4359,10 +4358,10 @@ static const struct opcode group4[] = { static const struct opcode group5[] = { F(DstMem | SrcNone | Lock, em_inc), F(DstMem | SrcNone | Lock, em_dec), - I(SrcMem | NearBranch, em_call_near_abs), - I(SrcMemFAddr | ImplicitOps, em_call_far), - I(SrcMem | NearBranch, em_jmp_abs), - I(SrcMemFAddr | ImplicitOps, em_jmp_far), + I(SrcMem | NearBranch | IsBranch, em_call_near_abs), + I(SrcMemFAddr | ImplicitOps | IsBranch, em_call_far), + I(SrcMem | NearBranch | IsBranch, em_jmp_abs), + I(SrcMemFAddr | ImplicitOps | IsBranch, em_jmp_far), I(SrcMem | Stack | TwoMemOp, em_push), D(Undefined), }; @@ -4572,7 +4571,7 @@ static const struct opcode opcode_table[256] = { I2bvIP(DstDI | SrcDX | Mov | String | Unaligned, em_in, ins, check_perm_in), /* insb, insw/insd */ I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */ /* 0x70 - 0x7F */ - X16(D(SrcImmByte | NearBranch)), + X16(D(SrcImmByte | NearBranch | IsBranch)), /* 0x80 - 0x87 */ G(ByteOp | DstMem | SrcImm, group1), G(DstMem | SrcImm, group1), @@ -4591,7 +4590,7 @@ static const struct opcode opcode_table[256] = { DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)), /* 0x98 - 0x9F */ D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), - I(SrcImmFAddr | No64, em_call_far), N, + I(SrcImmFAddr | No64 | IsBranch, em_call_far), N, II(ImplicitOps | Stack, em_pushf, pushf), II(ImplicitOps | Stack, em_popf, popf), I(ImplicitOps, em_sahf), I(ImplicitOps, em_lahf), @@ -4611,17 +4610,19 @@ static const struct opcode opcode_table[256] = { X8(I(DstReg | SrcImm64 | Mov, em_mov)), /* 0xC0 - 0xC7 */ G(ByteOp | Src2ImmByte, group2), G(Src2ImmByte, group2), - I(ImplicitOps | NearBranch | SrcImmU16, em_ret_near_imm), - I(ImplicitOps | NearBranch, em_ret), + I(ImplicitOps | NearBranch | SrcImmU16 | IsBranch, em_ret_near_imm), + I(ImplicitOps | NearBranch | IsBranch, em_ret), I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg), I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg), G(ByteOp, group11), G(0, group11), /* 0xC8 - 0xCF */ - I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave), - I(ImplicitOps | SrcImmU16, em_ret_far_imm), - I(ImplicitOps, em_ret_far), - D(ImplicitOps), DI(SrcImmByte, intn), - D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret), + I(Stack | SrcImmU16 | Src2ImmByte | IsBranch, em_enter), + I(Stack | IsBranch, em_leave), + I(ImplicitOps | SrcImmU16 | IsBranch, em_ret_far_imm), + I(ImplicitOps | IsBranch, em_ret_far), + D(ImplicitOps | IsBranch), DI(SrcImmByte | IsBranch, intn), + D(ImplicitOps | No64 | IsBranch), + II(ImplicitOps | IsBranch, em_iret, iret), /* 0xD0 - 0xD7 */ G(Src2One | ByteOp, group2), G(Src2One, group2), G(Src2CL | ByteOp, group2), G(Src2CL, group2), @@ -4632,14 +4633,15 @@ static const struct opcode opcode_table[256] = { /* 0xD8 - 0xDF */ N, E(0, &escape_d9), N, E(0, &escape_db), N, E(0, &escape_dd), N, N, /* 0xE0 - 0xE7 */ - X3(I(SrcImmByte | NearBranch, em_loop)), - I(SrcImmByte | NearBranch, em_jcxz), + X3(I(SrcImmByte | NearBranch | IsBranch, em_loop)), + I(SrcImmByte | NearBranch | IsBranch, em_jcxz), I2bvIP(SrcImmUByte | DstAcc, em_in, in, check_perm_in), I2bvIP(SrcAcc | DstImmUByte, em_out, out, check_perm_out), /* 0xE8 - 0xEF */ - I(SrcImm | NearBranch, em_call), D(SrcImm | ImplicitOps | NearBranch), - I(SrcImmFAddr | No64, em_jmp_far), - D(SrcImmByte | ImplicitOps | NearBranch), + I(SrcImm | NearBranch | IsBranch, em_call), + D(SrcImm | ImplicitOps | NearBranch | IsBranch), + I(SrcImmFAddr | No64 | IsBranch, em_jmp_far), + D(SrcImmByte | ImplicitOps | NearBranch | IsBranch), I2bvIP(SrcDX | DstAcc, em_in, in, check_perm_in), I2bvIP(SrcAcc | DstDX, em_out, out, check_perm_out), /* 0xF0 - 0xF7 */ @@ -4655,7 +4657,7 @@ static const struct opcode opcode_table[256] = { static const struct opcode twobyte_table[256] = { /* 0x00 - 0x0F */ G(0, group6), GD(0, &group7), N, N, - N, I(ImplicitOps | EmulateOnUD, em_syscall), + N, I(ImplicitOps | EmulateOnUD | IsBranch, em_syscall), II(ImplicitOps | Priv, em_clts, clts), N, DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, N, D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N, @@ -4686,8 +4688,8 @@ static const struct opcode twobyte_table[256] = { IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), II(ImplicitOps | Priv, em_rdmsr, rdmsr), IIP(ImplicitOps, em_rdpmc, rdpmc, check_rdpmc), - I(ImplicitOps | EmulateOnUD, em_sysenter), - I(ImplicitOps | Priv | EmulateOnUD, em_sysexit), + I(ImplicitOps | EmulateOnUD | IsBranch, em_sysenter), + I(ImplicitOps | Priv | EmulateOnUD | IsBranch, em_sysexit), N, N, N, N, N, N, N, N, N, N, /* 0x40 - 0x4F */ @@ -4705,7 +4707,7 @@ static const struct opcode twobyte_table[256] = { N, N, N, N, N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_6f_0f_7f), /* 0x80 - 0x8F */ - X16(D(SrcImm | NearBranch)), + X16(D(SrcImm | NearBranch | IsBranch)), /* 0x90 - 0x9F */ X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), /* 0xA0 - 0xA7 */ @@ -5219,6 +5221,8 @@ done_prefixes: ctxt->d |= opcode.flags; } + ctxt->is_branch = opcode.flags & IsBranch; + /* Unrecognised? */ if (ctxt->d == 0) return EMULATION_FAILED; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index d5124b520f76..6e38a7d22e97 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -112,7 +112,7 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic *synic, if (!!auto_eoi_old == !!auto_eoi_new) return; - mutex_lock(&vcpu->kvm->arch.apicv_update_lock); + down_write(&vcpu->kvm->arch.apicv_update_lock); if (auto_eoi_new) hv->synic_auto_eoi_used++; @@ -123,7 +123,7 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic *synic, !hv->synic_auto_eoi_used, APICV_INHIBIT_REASON_HYPERV); - mutex_unlock(&vcpu->kvm->arch.apicv_update_lock); + up_write(&vcpu->kvm->arch.apicv_update_lock); } static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint, @@ -164,7 +164,7 @@ static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint, static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx) { struct kvm_vcpu *vcpu = NULL; - int i; + unsigned long i; if (vpidx >= KVM_MAX_VCPUS) return NULL; @@ -1472,7 +1472,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) if (!(data & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) { hv_vcpu->hv_vapic = data; - if (kvm_lapic_enable_pv_eoi(vcpu, 0, 0)) + if (kvm_lapic_set_pv_eoi(vcpu, 0, 0)) return 1; break; } @@ -1490,7 +1490,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return 1; hv_vcpu->hv_vapic = data; kvm_vcpu_mark_page_dirty(vcpu, gfn); - if (kvm_lapic_enable_pv_eoi(vcpu, + if (kvm_lapic_set_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED, sizeof(struct hv_vp_assist_page))) return 1; @@ -1716,7 +1716,8 @@ static __always_inline unsigned long *sparse_set_to_vcpu_mask( { struct kvm_hv *hv = to_kvm_hv(kvm); struct kvm_vcpu *vcpu; - int i, bank, sbank = 0; + int bank, sbank = 0; + unsigned long i; memset(vp_bitmap, 0, KVM_HV_MAX_SPARSE_VCPU_SET_BITS * sizeof(*vp_bitmap)); @@ -1754,7 +1755,6 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool int i; gpa_t gpa; struct kvm *kvm = vcpu->kvm; - struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct hv_tlb_flush_ex flush_ex; struct hv_tlb_flush flush; u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; @@ -1836,18 +1836,19 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool } } - cpumask_clear(&hv_vcpu->tlb_flush); - - vcpu_mask = all_cpus ? NULL : - sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, - vp_bitmap, vcpu_bitmap); - /* * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't * analyze it here, flush TLB regardless of the specified address space. */ - kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST, - NULL, vcpu_mask, &hv_vcpu->tlb_flush); + if (all_cpus) { + kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH_GUEST); + } else { + vcpu_mask = sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, + vp_bitmap, vcpu_bitmap); + + kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST, + vcpu_mask); + } ret_success: /* We always do full TLB flush, set 'Reps completed' = 'Rep Count' */ @@ -1863,7 +1864,7 @@ static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector, .vector = vector }; struct kvm_vcpu *vcpu; - int i; + unsigned long i; kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu_bitmap && !test_bit(i, vcpu_bitmap)) @@ -1922,11 +1923,13 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL; + if (all_cpus) + goto check_and_send_ipi; + if (!sparse_banks_len) goto ret_success; - if (!all_cpus && - kvm_read_guest(kvm, + if (kvm_read_guest(kvm, hc->ingpa + offsetof(struct hv_send_ipi_ex, vp_set.bank_contents), sparse_banks, @@ -1934,6 +1937,7 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool return HV_STATUS_INVALID_HYPERCALL_INPUT; } +check_and_send_ipi: if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) return HV_STATUS_INVALID_HYPERCALL_INPUT; @@ -2022,7 +2026,7 @@ static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) { bool longmode; - longmode = is_64_bit_mode(vcpu); + longmode = is_64_bit_hypercall(vcpu); if (longmode) kvm_rax_write(vcpu, result); else { @@ -2171,7 +2175,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) } #ifdef CONFIG_X86_64 - if (is_64_bit_mode(vcpu)) { + if (is_64_bit_hypercall(vcpu)) { hc.param = kvm_rcx_read(vcpu); hc.ingpa = kvm_rdx_read(vcpu); hc.outgpa = kvm_r8_read(vcpu); @@ -2516,6 +2520,8 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, case HYPERV_CPUID_NESTED_FEATURES: ent->eax = evmcs_ver; + if (evmcs_ver) + ent->eax |= HV_X64_NESTED_MSR_BITMAP; break; diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 5a69cce4d72d..0b65a764ed3a 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -242,7 +242,7 @@ static void pit_do_work(struct kthread_work *work) struct kvm_pit *pit = container_of(work, struct kvm_pit, expired); struct kvm *kvm = pit->kvm; struct kvm_vcpu *vcpu; - int i; + unsigned long i; struct kvm_kpit_state *ps = &pit->pit_state; if (atomic_read(&ps->reinject) && !atomic_xchg(&ps->irq_ack, 0)) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 0b80263d46d8..814064d06016 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -50,7 +50,7 @@ static void pic_unlock(struct kvm_pic *s) { bool wakeup = s->wakeup_needed; struct kvm_vcpu *vcpu; - int i; + unsigned long i; s->wakeup_needed = false; @@ -270,7 +270,8 @@ int kvm_pic_read_irq(struct kvm *kvm) static void kvm_pic_reset(struct kvm_kpic_state *s) { - int irq, i; + int irq; + unsigned long i; struct kvm_vcpu *vcpu; u8 edge_irr = s->irr & ~s->elcr; bool found = false; diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 8c065da73f8e..decfa36b7891 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -96,7 +96,7 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic) { ioapic->rtc_status.pending_eoi = 0; - bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_ID + 1); + bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_IDS); } static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic); @@ -149,7 +149,7 @@ void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu) static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic) { struct kvm_vcpu *vcpu; - int i; + unsigned long i; if (RTC_GSI >= IOAPIC_NUM_PINS) return; @@ -184,7 +184,7 @@ static bool rtc_irq_check_coalesced(struct kvm_ioapic *ioapic) static void ioapic_lazy_update_eoi(struct kvm_ioapic *ioapic, int irq) { - int i; + unsigned long i; struct kvm_vcpu *vcpu; union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq]; diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index bbd4a5d18b5d..539333ac4b38 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -39,13 +39,13 @@ struct kvm_vcpu; struct dest_map { /* vcpu bitmap where IRQ has been sent */ - DECLARE_BITMAP(map, KVM_MAX_VCPU_ID + 1); + DECLARE_BITMAP(map, KVM_MAX_VCPU_IDS); /* * Vector sent to a given vcpu, only valid when * the vcpu's bit in map is set */ - u8 vectors[KVM_MAX_VCPU_ID + 1]; + u8 vectors[KVM_MAX_VCPU_IDS]; }; @@ -81,7 +81,6 @@ struct kvm_ioapic { unsigned long irq_states[IOAPIC_NUM_PINS]; struct kvm_io_device dev; struct kvm *kvm; - void (*ack_notifier)(void *opaque, int irq); spinlock_t lock; struct rtc_status rtc_status; struct delayed_work eoi_inject; diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 650642b18d15..c2d7cfe82d00 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -56,7 +56,6 @@ struct kvm_pic { struct kvm_io_device dev_master; struct kvm_io_device dev_slave; struct kvm_io_device dev_elcr; - void (*ack_notifier)(void *opaque, int irq); unsigned long irq_states[PIC_NUM_PINS]; }; diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index d5b72a08e566..6e0dab04320e 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -24,6 +24,7 @@ #include "hyperv.h" #include "x86.h" +#include "xen.h" static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, @@ -45,9 +46,9 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, struct dest_map *dest_map) { - int i, r = -1; + int r = -1; struct kvm_vcpu *vcpu, *lowest = NULL; - unsigned long dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; + unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; unsigned int dest_vcpus = 0; if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) @@ -175,6 +176,13 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, return r; break; +#ifdef CONFIG_KVM_XEN + case KVM_IRQ_ROUTING_XEN_EVTCHN: + if (!level) + return -1; + + return kvm_xen_set_evtchn_fast(e, kvm); +#endif default: break; } @@ -310,6 +318,10 @@ int kvm_set_routing_entry(struct kvm *kvm, e->hv_sint.vcpu = ue->u.hv_sint.vcpu; e->hv_sint.sint = ue->u.hv_sint.sint; break; +#ifdef CONFIG_KVM_XEN + case KVM_IRQ_ROUTING_XEN_EVTCHN: + return kvm_xen_setup_evtchn(kvm, e, ue); +#endif default: return -EINVAL; } @@ -320,7 +332,8 @@ int kvm_set_routing_entry(struct kvm *kvm, bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu) { - int i, r = 0; + int r = 0; + unsigned long i; struct kvm_vcpu *vcpu; if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu)) diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 90e1ffdc05b7..3febc342360c 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -9,6 +9,12 @@ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ | X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE) +#define X86_CR0_PDPTR_BITS (X86_CR0_CD | X86_CR0_NW | X86_CR0_PG) +#define X86_CR4_TLBFLUSH_BITS (X86_CR4_PGE | X86_CR4_PCIDE | X86_CR4_PAE | X86_CR4_SMEP) +#define X86_CR4_PDPTR_BITS (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_SMEP) + +static_assert(!(KVM_POSSIBLE_CR0_GUEST_BITS & X86_CR0_PDPTR_BITS)); + #define BUILD_KVM_GPR_ACCESSORS(lname, uname) \ static __always_inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)\ { \ @@ -37,6 +43,13 @@ BUILD_KVM_GPR_ACCESSORS(r14, R14) BUILD_KVM_GPR_ACCESSORS(r15, R15) #endif +/* + * avail dirty + * 0 0 register in VMCS/VMCB + * 0 1 *INVALID* + * 1 0 register in vcpu->arch + * 1 1 register in vcpu->arch, needs to be stored back + */ static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu, enum kvm_reg reg) { @@ -55,13 +68,6 @@ static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu, __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); } -static inline void kvm_register_clear_available(struct kvm_vcpu *vcpu, - enum kvm_reg reg) -{ - __clear_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); - __clear_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); -} - static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu, enum kvm_reg reg) { diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 68b420289d7e..39eded2426ff 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -369,6 +369,7 @@ struct x86_emulate_ctxt { struct fetch_cache fetch; struct read_cache io_read; struct read_cache mem_read; + bool is_branch; }; /* Repeat String Operation Prefix */ diff --git a/arch/x86/kvm/kvm_onhyperv.c b/arch/x86/kvm/kvm_onhyperv.c index c7db2df50a7a..b469f45e3fe4 100644 --- a/arch/x86/kvm/kvm_onhyperv.c +++ b/arch/x86/kvm/kvm_onhyperv.c @@ -33,7 +33,8 @@ int hv_remote_flush_tlb_with_range(struct kvm *kvm, { struct kvm_arch *kvm_arch = &kvm->arch; struct kvm_vcpu *vcpu; - int ret = 0, i, nr_unique_valid_roots; + int ret = 0, nr_unique_valid_roots; + unsigned long i; hpa_t root; spin_lock(&kvm_arch->hv_root_tdp_lock); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 76fb00921203..c5028e6b0f96 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -185,7 +185,7 @@ void kvm_recalculate_apic_map(struct kvm *kvm) { struct kvm_apic_map *new, *old = NULL; struct kvm_vcpu *vcpu; - int i; + unsigned long i; u32 max_id = 255; /* enough space for any xAPIC ID */ /* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map. */ @@ -673,41 +673,40 @@ static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu) return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED; } -static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu) -{ - u8 val; - if (pv_eoi_get_user(vcpu, &val) < 0) { - printk(KERN_WARNING "Can't read EOI MSR value: 0x%llx\n", - (unsigned long long)vcpu->arch.pv_eoi.msr_val); - return false; - } - return val & KVM_PV_EOI_ENABLED; -} - static void pv_eoi_set_pending(struct kvm_vcpu *vcpu) { - if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) { - printk(KERN_WARNING "Can't set EOI MSR value: 0x%llx\n", - (unsigned long long)vcpu->arch.pv_eoi.msr_val); + if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) return; - } + __set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); } -static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) +static bool pv_eoi_test_and_clr_pending(struct kvm_vcpu *vcpu) { - if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) { - printk(KERN_WARNING "Can't clear EOI MSR value: 0x%llx\n", - (unsigned long long)vcpu->arch.pv_eoi.msr_val); - return; - } + u8 val; + + if (pv_eoi_get_user(vcpu, &val) < 0) + return false; + + val &= KVM_PV_EOI_ENABLED; + + if (val && pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) + return false; + + /* + * Clear pending bit in any case: it will be set again on vmentry. + * While this might not be ideal from performance point of view, + * this makes sure pv eoi is only enabled when we know it's safe. + */ __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); + + return val; } static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr) { int highest_irr; - if (apic->vcpu->arch.apicv_active) + if (kvm_x86_ops.sync_pir_to_irr) highest_irr = static_call(kvm_x86_sync_pir_to_irr)(apic->vcpu); else highest_irr = apic_find_highest_irr(apic); @@ -1101,6 +1100,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, kvm_lapic_set_irr(vector, apic); kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); + } else { + trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, + trig_mode, vector); } break; @@ -1172,8 +1174,8 @@ void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_lapic *src = NULL; struct kvm_apic_map *map; struct kvm_vcpu *vcpu; - unsigned long bitmap; - int i, vcpu_idx; + unsigned long bitmap, i; + int vcpu_idx; bool ret; rcu_read_lock(); @@ -1931,7 +1933,7 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) /* If the preempt notifier has already run, it also called apic_timer_expired */ if (!apic->lapic_timer.hv_timer_in_use) goto out; - WARN_ON(rcuwait_active(&vcpu->wait)); + WARN_ON(kvm_vcpu_is_blocking(vcpu)); apic_timer_expired(apic, false); cancel_hv_timer(apic); @@ -2321,13 +2323,14 @@ EXPORT_SYMBOL_GPL(kvm_apic_update_apicv); void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) { struct kvm_lapic *apic = vcpu->arch.apic; + u64 msr_val; int i; if (!init_event) { - vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE | - MSR_IA32_APICBASE_ENABLE; + msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_reset_bsp(vcpu)) - vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; + msr_val |= MSR_IA32_APICBASE_BSP; + kvm_lapic_set_base(vcpu, msr_val); } if (!apic) @@ -2336,11 +2339,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) /* Stop the timer in case it's a reset to an active apic */ hrtimer_cancel(&apic->lapic_timer.timer); - if (!init_event) { - apic->base_address = APIC_DEFAULT_PHYS_BASE; - + /* The xAPIC ID is set at RESET even if the APIC was already enabled. */ + if (!init_event) kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); - } kvm_apic_set_version(apic->vcpu); for (i = 0; i < KVM_APIC_LVT_NUM; i++) @@ -2481,6 +2482,11 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) lapic_timer_advance_dynamic = false; } + /* + * Stuff the APIC ENABLE bit in lieu of temporarily incrementing + * apic_hw_disabled; the full RESET value is set by kvm_lapic_reset(). + */ + vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE; static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */ kvm_iodevice_init(&apic->dev, &apic_mmio_ops); @@ -2673,7 +2679,6 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu, struct kvm_lapic *apic) { - bool pending; int vector; /* * PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in host @@ -2687,14 +2692,8 @@ static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu, * -> host enabled PV EOI, guest executed EOI. */ BUG_ON(!pv_eoi_enabled(vcpu)); - pending = pv_eoi_get_pending(vcpu); - /* - * Clear pending bit in any case: it will be set again on vmentry. - * While this might not be ideal from performance point of view, - * this makes sure pv eoi is only enabled when we know it's safe. - */ - pv_eoi_clr_pending(vcpu); - if (pending) + + if (pv_eoi_test_and_clr_pending(vcpu)) return; vector = apic_set_eoi(apic); trace_kvm_pv_eoi(apic, vector); @@ -2852,25 +2851,30 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) return 0; } -int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len) +int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len) { u64 addr = data & ~KVM_MSR_ENABLED; struct gfn_to_hva_cache *ghc = &vcpu->arch.pv_eoi.data; unsigned long new_len; + int ret; if (!IS_ALIGNED(addr, 4)) return 1; - vcpu->arch.pv_eoi.msr_val = data; - if (!pv_eoi_enabled(vcpu)) - return 0; + if (data & KVM_MSR_ENABLED) { + if (addr == ghc->gpa && len <= ghc->len) + new_len = ghc->len; + else + new_len = len; - if (addr == ghc->gpa && len <= ghc->len) - new_len = ghc->len; - else - new_len = len; + ret = kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, addr, new_len); + if (ret) + return ret; + } - return kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, addr, new_len); + vcpu->arch.pv_eoi.msr_val = data; + + return 0; } int kvm_apic_accept_events(struct kvm_vcpu *vcpu) @@ -2942,5 +2946,7 @@ int kvm_apic_accept_events(struct kvm_vcpu *vcpu) void kvm_lapic_exit(void) { static_key_deferred_flush(&apic_hw_disabled); + WARN_ON(static_branch_unlikely(&apic_hw_disabled.key)); static_key_deferred_flush(&apic_sw_disabled); + WARN_ON(static_branch_unlikely(&apic_sw_disabled.key)); } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index d7c25d0c1354..2b44e533fc8d 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -127,7 +127,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); -int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len); +int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len); void kvm_lapic_exit(void); #define VEC_POS(v) ((v) & (32 - 1)) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index e9688a9f7b57..e9fbb2c8bbe2 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -44,9 +44,8 @@ #define PT32_ROOT_LEVEL 2 #define PT32E_ROOT_LEVEL 3 -#define KVM_MMU_CR4_ROLE_BITS (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | \ - X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE | \ - X86_CR4_LA57) +#define KVM_MMU_CR4_ROLE_BITS (X86_CR4_PSE | X86_CR4_PAE | X86_CR4_LA57 | \ + X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE) #define KVM_MMU_CR0_ROLE_BITS (X86_CR0_PG | X86_CR0_WP) @@ -72,7 +71,8 @@ void kvm_init_mmu(struct kvm_vcpu *vcpu); void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, unsigned long cr4, u64 efer, gpa_t nested_cr3); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, - bool accessed_dirty, gpa_t new_eptp); + int huge_page_level, bool accessed_dirty, + gpa_t new_eptp); bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, u64 fault_address, char *insn, int insn_len); @@ -80,6 +80,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); +void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu); static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) { @@ -114,17 +115,91 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu) vcpu->arch.mmu->shadow_root_level); } -int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, - bool prefault); +struct kvm_page_fault { + /* arguments to kvm_mmu_do_page_fault. */ + const gpa_t addr; + const u32 error_code; + const bool prefetch; + + /* Derived from error_code. */ + const bool exec; + const bool write; + const bool present; + const bool rsvd; + const bool user; + + /* Derived from mmu and global state. */ + const bool is_tdp; + const bool nx_huge_page_workaround_enabled; + + /* + * Whether a >4KB mapping can be created or is forbidden due to NX + * hugepages. + */ + bool huge_page_disallowed; + + /* + * Maximum page size that can be created for this fault; input to + * FNAME(fetch), __direct_map and kvm_tdp_mmu_map. + */ + u8 max_level; + + /* + * Page size that can be created based on the max_level and the + * page size used by the host mapping. + */ + u8 req_level; + + /* + * Page size that will be created based on the req_level and + * huge_page_disallowed. + */ + u8 goal_level; + + /* Shifted addr, or result of guest page table walk if addr is a gva. */ + gfn_t gfn; + + /* The memslot containing gfn. May be NULL. */ + struct kvm_memory_slot *slot; + + /* Outputs of kvm_faultin_pfn. */ + kvm_pfn_t pfn; + hva_t hva; + bool map_writable; +}; + +int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); + +extern int nx_huge_pages; +static inline bool is_nx_huge_page_enabled(void) +{ + return READ_ONCE(nx_huge_pages); +} static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - u32 err, bool prefault) + u32 err, bool prefetch) { + struct kvm_page_fault fault = { + .addr = cr2_or_gpa, + .error_code = err, + .exec = err & PFERR_FETCH_MASK, + .write = err & PFERR_WRITE_MASK, + .present = err & PFERR_PRESENT_MASK, + .rsvd = err & PFERR_RSVD_MASK, + .user = err & PFERR_USER_MASK, + .prefetch = prefetch, + .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault), + .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(), + + .max_level = KVM_MAX_HUGEPAGE_LEVEL, + .req_level = PG_LEVEL_4K, + .goal_level = PG_LEVEL_4K, + }; #ifdef CONFIG_RETPOLINE - if (likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault)) - return kvm_tdp_page_fault(vcpu, cr2_or_gpa, err, prefault); + if (fault.is_tdp) + return kvm_tdp_page_fault(vcpu, &fault); #endif - return vcpu->arch.mmu->page_fault(vcpu, cr2_or_gpa, err, prefault); + return vcpu->arch.mmu->page_fault(vcpu, &fault); } /* @@ -230,14 +305,26 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); int kvm_mmu_post_init_vm(struct kvm *kvm); void kvm_mmu_pre_destroy_vm(struct kvm *kvm); -static inline bool kvm_memslots_have_rmaps(struct kvm *kvm) +static inline bool kvm_shadow_root_allocated(struct kvm *kvm) { /* - * Read memslot_have_rmaps before rmap pointers. Hence, threads reading - * memslots_have_rmaps in any lock context are guaranteed to see the - * pointers. Pairs with smp_store_release in alloc_all_memslots_rmaps. + * Read shadow_root_allocated before related pointers. Hence, threads + * reading shadow_root_allocated in any lock context are guaranteed to + * see the pointers. Pairs with smp_store_release in + * mmu_first_shadow_root_alloc. */ - return smp_load_acquire(&kvm->arch.memslots_have_rmaps); + return smp_load_acquire(&kvm->arch.shadow_root_allocated); +} + +#ifdef CONFIG_X86_64 +static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; } +#else +static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; } +#endif + +static inline bool kvm_memslots_have_rmaps(struct kvm *kvm) +{ + return !is_tdp_mmu_enabled(kvm) || kvm_shadow_root_allocated(kvm); } static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) @@ -265,4 +352,17 @@ static inline void kvm_update_page_stats(struct kvm *kvm, int level, int count) { atomic64_add(count, &kvm->stat.pages[level - 1]); } + +gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, + struct x86_exception *exception); + +static inline gpa_t kvm_translate_gpa(struct kvm_vcpu *vcpu, + struct kvm_mmu *mmu, + gpa_t gpa, u32 access, + struct x86_exception *exception) +{ + if (mmu != &vcpu->arch.nested_mmu) + return gpa; + return translate_nested_gpa(vcpu, gpa, access, exception); +} #endif diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 1a64ba5b9437..1d275e9d76b5 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -58,6 +58,7 @@ extern bool itlb_multihit_kvm_mitigation; int __read_mostly nx_huge_pages = -1; +static uint __read_mostly nx_huge_pages_recovery_period_ms; #ifdef CONFIG_PREEMPT_RT /* Recovery can cause latency spikes, disable it for PREEMPT_RT. */ static uint __read_mostly nx_huge_pages_recovery_ratio = 0; @@ -66,23 +67,26 @@ static uint __read_mostly nx_huge_pages_recovery_ratio = 60; #endif static int set_nx_huge_pages(const char *val, const struct kernel_param *kp); -static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp); +static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp); static const struct kernel_param_ops nx_huge_pages_ops = { .set = set_nx_huge_pages, .get = param_get_bool, }; -static const struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = { - .set = set_nx_huge_pages_recovery_ratio, +static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = { + .set = set_nx_huge_pages_recovery_param, .get = param_get_uint, }; module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644); __MODULE_PARM_TYPE(nx_huge_pages, "bool"); -module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops, +module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_param_ops, &nx_huge_pages_recovery_ratio, 0644); __MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint"); +module_param_cb(nx_huge_pages_recovery_period_ms, &nx_huge_pages_recovery_param_ops, + &nx_huge_pages_recovery_period_ms, 0644); +__MODULE_PARM_TYPE(nx_huge_pages_recovery_period_ms, "uint"); static bool __read_mostly force_flush_and_sync_on_reuse; module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644); @@ -331,12 +335,6 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) return likely(kvm_gen == spte_gen); } -static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, - struct x86_exception *exception) -{ - return gpa; -} - static int is_cpuid_PSE36(void) { return 1; @@ -1071,20 +1069,6 @@ static bool rmap_can_add(struct kvm_vcpu *vcpu) return kvm_mmu_memory_cache_nr_free_objects(mc); } -static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) -{ - struct kvm_memory_slot *slot; - struct kvm_mmu_page *sp; - struct kvm_rmap_head *rmap_head; - - sp = sptep_to_sp(spte); - kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); - return pte_list_add(vcpu, spte, rmap_head); -} - - static void rmap_remove(struct kvm *kvm, u64 *spte) { struct kvm_memslots *slots; @@ -1097,9 +1081,9 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); /* - * Unlike rmap_add and rmap_recycle, rmap_remove does not run in the - * context of a vCPU so have to determine which memslots to use based - * on context information in sp->role. + * Unlike rmap_add, rmap_remove does not run in the context of a vCPU + * so we have to determine which memslots to use based on context + * information in sp->role. */ slots = kvm_memslots_for_spte_role(kvm, sp->role); @@ -1464,7 +1448,7 @@ static bool kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, { u64 *sptep; struct rmap_iterator iter; - int need_flush = 0; + bool need_flush = false; u64 new_spte; kvm_pfn_t new_pfn; @@ -1476,7 +1460,7 @@ restart: rmap_printk("spte %p %llx gfn %llx (%d)\n", sptep, *sptep, gfn, level); - need_flush = 1; + need_flush = true; if (pte_write(pte)) { pte_list_remove(kvm, rmap_head, sptep); @@ -1492,7 +1476,7 @@ restart: if (need_flush && kvm_available_flush_tlb_with_range()) { kvm_flush_remote_tlbs_with_address(kvm, gfn, 1); - return 0; + return false; } return need_flush; @@ -1592,7 +1576,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp); if (is_tdp_mmu_enabled(kvm)) - flush |= kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); + flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); return flush; } @@ -1633,25 +1617,29 @@ static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, for_each_rmap_spte(rmap_head, &iter, sptep) if (is_accessed_spte(*sptep)) - return 1; - return 0; + return true; + return false; } #define RMAP_RECYCLE_THRESHOLD 1000 -static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) +static void rmap_add(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, + u64 *spte, gfn_t gfn) { - struct kvm_memory_slot *slot; - struct kvm_rmap_head *rmap_head; struct kvm_mmu_page *sp; + struct kvm_rmap_head *rmap_head; + int rmap_count; sp = sptep_to_sp(spte); - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); + rmap_count = pte_list_add(vcpu, spte, rmap_head); - kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0)); - kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, - KVM_PAGES_PER_HPAGE(sp->role.level)); + if (rmap_count > RMAP_RECYCLE_THRESHOLD) { + kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0)); + kvm_flush_remote_tlbs_with_address( + vcpu->kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); + } } bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) @@ -1795,7 +1783,7 @@ static void mark_unsync(u64 *spte) static int nonpaging_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { - return 0; + return -1; } #define KVM_PAGE_ARRAY_NR 16 @@ -1909,12 +1897,14 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { - if (vcpu->arch.mmu->sync_page(vcpu, sp) == 0) { + int ret = vcpu->arch.mmu->sync_page(vcpu, sp); + + if (ret < 0) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return false; } - return true; + return !!ret; } static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm, @@ -1931,17 +1921,6 @@ static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm, return true; } -static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu, - struct list_head *invalid_list, - bool remote_flush, bool local_flush) -{ - if (kvm_mmu_remote_flush_or_zap(vcpu->kvm, invalid_list, remote_flush)) - return; - - if (local_flush) - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); -} - #ifdef CONFIG_KVM_MMU_AUDIT #include "mmu_audit.c" #else @@ -1951,7 +1930,11 @@ static void mmu_audit_disable(void) { } static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) { - return sp->role.invalid || + if (sp->role.invalid) + return true; + + /* TDP MMU pages due not use the MMU generation. */ + return !sp->tdp_mmu_page && unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); } @@ -2044,7 +2027,7 @@ static int mmu_sync_children(struct kvm_vcpu *vcpu, protected |= rmap_write_protect(vcpu, sp->gfn); if (protected) { - kvm_flush_remote_tlbs(vcpu->kvm); + kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, true); flush = false; } @@ -2054,7 +2037,7 @@ static int mmu_sync_children(struct kvm_vcpu *vcpu, mmu_pages_clear_parents(&parents); } if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) { - kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); + kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush); if (!can_yield) { kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); return -EINTR; @@ -2065,7 +2048,7 @@ static int mmu_sync_children(struct kvm_vcpu *vcpu, } } - kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); + kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush); return 0; } @@ -2097,10 +2080,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, role = vcpu->arch.mmu->mmu_role.base; role.level = level; role.direct = direct; - if (role.direct) - role.gpte_is_8_bytes = true; role.access = access; - if (!direct_mmu && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) { + if (role.has_4_byte_gpte) { quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; @@ -2149,7 +2130,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, break; WARN_ON(!list_empty(&invalid_list)); - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + kvm_flush_remote_tlbs(vcpu->kvm); } __clear_sp_write_flooding_count(sp); @@ -2188,10 +2169,10 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato iterator->shadow_addr = root; iterator->level = vcpu->arch.mmu->shadow_root_level; - if (iterator->level == PT64_ROOT_4LEVEL && + if (iterator->level >= PT64_ROOT_4LEVEL && vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL && !vcpu->arch.mmu->direct_map) - --iterator->level; + iterator->level = PT32E_ROOT_LEVEL; if (iterator->level == PT32E_ROOT_LEVEL) { /* @@ -2229,7 +2210,7 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator, u64 spte) { - if (is_last_spte(spte, iterator->level)) { + if (!is_shadow_present_pte(spte) || is_last_spte(spte, iterator->level)) { iterator->level = 0; return; } @@ -2576,10 +2557,10 @@ static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) return r; } -static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) { trace_kvm_mmu_unsync_page(sp); - ++vcpu->kvm->stat.mmu_unsync; + ++kvm->stat.mmu_unsync; sp->unsync = 1; kvm_mmu_mark_parents_unsync(sp); @@ -2591,7 +2572,8 @@ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) * were marked unsync (or if there is no shadow page), -EPERM if the SPTE must * be write-protected. */ -int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) +int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, + gfn_t gfn, bool can_unsync, bool prefetch) { struct kvm_mmu_page *sp; bool locked = false; @@ -2601,7 +2583,7 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) * track machinery is used to write-protect upper-level shadow pages, * i.e. this guards the role.level == 4K assertion below! */ - if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE)) + if (kvm_slot_page_track_is_active(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE)) return -EPERM; /* @@ -2610,13 +2592,16 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) * that case, KVM must complete emulation of the guest TLB flush before * allowing shadow pages to become unsync (writable by the guest). */ - for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { + for_each_gfn_indirect_valid_sp(kvm, sp, gfn) { if (!can_unsync) return -EPERM; if (sp->unsync) continue; + if (prefetch) + return -EEXIST; + /* * TDP MMU page faults require an additional spinlock as they * run with mmu_lock held for read, not write, and the unsync @@ -2626,7 +2611,7 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) */ if (!locked) { locked = true; - spin_lock(&vcpu->kvm->arch.mmu_unsync_pages_lock); + spin_lock(&kvm->arch.mmu_unsync_pages_lock); /* * Recheck after taking the spinlock, a different vCPU @@ -2641,10 +2626,10 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) } WARN_ON(sp->role.level != PG_LEVEL_4K); - kvm_unsync_page(vcpu, sp); + kvm_unsync_page(kvm, sp); } if (locked) - spin_unlock(&vcpu->kvm->arch.mmu_unsync_pages_lock); + spin_unlock(&kvm->arch.mmu_unsync_pages_lock); /* * We need to ensure that the marking of unsync pages is visible @@ -2680,48 +2665,30 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) * (sp->unsync = true) * * The write barrier below ensures that 1.1 happens before 1.2 and thus - * the situation in 2.4 does not arise. The implicit barrier in 2.2 - * pairs with this write barrier. + * the situation in 2.4 does not arise. It pairs with the read barrier + * in is_unsync_root(), placed between 2.1's load of SPTE.W and 2.3. */ smp_wmb(); return 0; } -static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, - unsigned int pte_access, int level, - gfn_t gfn, kvm_pfn_t pfn, bool speculative, - bool can_unsync, bool host_writable) -{ - u64 spte; - struct kvm_mmu_page *sp; - int ret; - - sp = sptep_to_sp(sptep); - - ret = make_spte(vcpu, pte_access, level, gfn, pfn, *sptep, speculative, - can_unsync, host_writable, sp_ad_disabled(sp), &spte); - - if (spte & PT_WRITABLE_MASK) - kvm_vcpu_mark_page_dirty(vcpu, gfn); - - if (*sptep == spte) - ret |= SET_SPTE_SPURIOUS; - else if (mmu_spte_update(sptep, spte)) - ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; - return ret; -} - -static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, - unsigned int pte_access, bool write_fault, int level, - gfn_t gfn, kvm_pfn_t pfn, bool speculative, - bool host_writable) +static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, + u64 *sptep, unsigned int pte_access, gfn_t gfn, + kvm_pfn_t pfn, struct kvm_page_fault *fault) { + struct kvm_mmu_page *sp = sptep_to_sp(sptep); + int level = sp->role.level; int was_rmapped = 0; - int rmap_count; - int set_spte_ret; int ret = RET_PF_FIXED; bool flush = false; + bool wrprot; + u64 spte; + + /* Prefetching always gets a writable pfn. */ + bool host_writable = !fault || fault->map_writable; + bool prefetch = !fault || fault->prefetch; + bool write_fault = fault && fault->write; pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, *sptep, write_fault, gfn); @@ -2752,52 +2719,36 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, was_rmapped = 1; } - set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn, - speculative, true, host_writable); - if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) { + wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch, + true, host_writable, &spte); + + if (*sptep == spte) { + ret = RET_PF_SPURIOUS; + } else { + trace_kvm_mmu_set_spte(level, gfn, sptep); + flush |= mmu_spte_update(sptep, spte); + } + + if (wrprot) { if (write_fault) ret = RET_PF_EMULATE; - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } - if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush) + if (flush) kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, KVM_PAGES_PER_HPAGE(level)); - /* - * The fault is fully spurious if and only if the new SPTE and old SPTE - * are identical, and emulation is not required. - */ - if ((set_spte_ret & SET_SPTE_SPURIOUS) && ret == RET_PF_FIXED) { - WARN_ON_ONCE(!was_rmapped); - return RET_PF_SPURIOUS; - } - pgprintk("%s: setting spte %llx\n", __func__, *sptep); - trace_kvm_mmu_set_spte(level, gfn, sptep); if (!was_rmapped) { + WARN_ON_ONCE(ret == RET_PF_SPURIOUS); kvm_update_page_stats(vcpu->kvm, level, 1); - rmap_count = rmap_add(vcpu, sptep, gfn); - if (rmap_count > RMAP_RECYCLE_THRESHOLD) - rmap_recycle(vcpu, sptep, gfn); + rmap_add(vcpu, slot, sptep, gfn); } return ret; } -static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, - bool no_dirty_log) -{ - struct kvm_memory_slot *slot; - - slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); - if (!slot) - return KVM_PFN_ERR_FAULT; - - return gfn_to_pfn_memslot_atomic(slot, gfn); -} - static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *start, u64 *end) @@ -2818,8 +2769,8 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, return -1; for (i = 0; i < ret; i++, gfn++, start++) { - mmu_set_spte(vcpu, start, access, false, sp->role.level, gfn, - page_to_pfn(pages[i]), true, true); + mmu_set_spte(vcpu, slot, start, access, gfn, + page_to_pfn(pages[i]), NULL); put_page(pages[i]); } @@ -2842,11 +2793,13 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, if (!start) continue; if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) - break; + return; start = NULL; } else if (!start) start = spte; } + if (start) + direct_pte_prefetch_many(vcpu, sp, start, spte); } static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) @@ -2924,52 +2877,46 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, return min(host_level, max_level); } -int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, - int max_level, kvm_pfn_t *pfnp, - bool huge_page_disallowed, int *req_level) +void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - struct kvm_memory_slot *slot; - kvm_pfn_t pfn = *pfnp; + struct kvm_memory_slot *slot = fault->slot; kvm_pfn_t mask; - int level; - *req_level = PG_LEVEL_4K; + fault->huge_page_disallowed = fault->exec && fault->nx_huge_page_workaround_enabled; - if (unlikely(max_level == PG_LEVEL_4K)) - return PG_LEVEL_4K; + if (unlikely(fault->max_level == PG_LEVEL_4K)) + return; - if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn)) - return PG_LEVEL_4K; + if (is_error_noslot_pfn(fault->pfn) || kvm_is_reserved_pfn(fault->pfn)) + return; - slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, true); - if (!slot) - return PG_LEVEL_4K; + if (kvm_slot_dirty_track_enabled(slot)) + return; /* * Enforce the iTLB multihit workaround after capturing the requested * level, which will be used to do precise, accurate accounting. */ - *req_level = level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level); - if (level == PG_LEVEL_4K || huge_page_disallowed) - return PG_LEVEL_4K; + fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, + fault->gfn, fault->pfn, + fault->max_level); + if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed) + return; /* * mmu_notifier_retry() was successful and mmu_lock is held, so * the pmd can't be split from under us. */ - mask = KVM_PAGES_PER_HPAGE(level) - 1; - VM_BUG_ON((gfn & mask) != (pfn & mask)); - *pfnp = pfn & ~mask; - - return level; + fault->goal_level = fault->req_level; + mask = KVM_PAGES_PER_HPAGE(fault->goal_level) - 1; + VM_BUG_ON((fault->gfn & mask) != (fault->pfn & mask)); + fault->pfn &= ~mask; } -void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, - kvm_pfn_t *pfnp, int *goal_levelp) +void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level) { - int level = *goal_levelp; - - if (cur_level == level && level > PG_LEVEL_4K && + if (cur_level > PG_LEVEL_4K && + cur_level == fault->goal_level && is_shadow_present_pte(spte) && !is_large_pte(spte)) { /* @@ -2979,42 +2926,33 @@ void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, * patching back for them into pfn the next 9 bits of * the address. */ - u64 page_mask = KVM_PAGES_PER_HPAGE(level) - - KVM_PAGES_PER_HPAGE(level - 1); - *pfnp |= gfn & page_mask; - (*goal_levelp)--; + u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) - + KVM_PAGES_PER_HPAGE(cur_level - 1); + fault->pfn |= fault->gfn & page_mask; + fault->goal_level--; } } -static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, - int map_writable, int max_level, kvm_pfn_t pfn, - bool prefault, bool is_tdp) +static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(); - bool write = error_code & PFERR_WRITE_MASK; - bool exec = error_code & PFERR_FETCH_MASK; - bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; - int level, req_level, ret; - gfn_t gfn = gpa >> PAGE_SHIFT; - gfn_t base_gfn = gfn; + int ret; + gfn_t base_gfn = fault->gfn; - level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, - huge_page_disallowed, &req_level); + kvm_mmu_hugepage_adjust(vcpu, fault); - trace_kvm_mmu_spte_requested(gpa, level, pfn); - for_each_shadow_entry(vcpu, gpa, it) { + trace_kvm_mmu_spte_requested(fault); + for_each_shadow_entry(vcpu, fault->addr, it) { /* * We cannot overwrite existing page tables with an NX * large page, as the leaf could be executable. */ - if (nx_huge_page_workaround_enabled) - disallowed_hugepage_adjust(*it.sptep, gfn, it.level, - &pfn, &level); + if (fault->nx_huge_page_workaround_enabled) + disallowed_hugepage_adjust(fault, *it.sptep, it.level); - base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); - if (it.level == level) + base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + if (it.level == fault->goal_level) break; drop_large_spte(vcpu, it.sptep); @@ -3025,14 +2963,16 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, it.level - 1, true, ACC_ALL); link_shadow_page(vcpu, it.sptep, sp); - if (is_tdp && huge_page_disallowed && - req_level >= it.level) + if (fault->is_tdp && fault->huge_page_disallowed && + fault->req_level >= it.level) account_huge_nx_page(vcpu->kvm, sp); } - ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL, - write, level, base_gfn, pfn, prefault, - map_writable); + if (WARN_ON_ONCE(it.level != fault->goal_level)) + return -EFAULT; + + ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL, + base_gfn, fault->pfn, fault); if (ret == RET_PF_SPURIOUS) return ret; @@ -3064,18 +3004,19 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) return -EFAULT; } -static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, - kvm_pfn_t pfn, unsigned int access, - int *ret_val) +static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, + unsigned int access, int *ret_val) { /* The pfn is invalid, report the error! */ - if (unlikely(is_error_pfn(pfn))) { - *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn); + if (unlikely(is_error_pfn(fault->pfn))) { + *ret_val = kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn); return true; } - if (unlikely(is_noslot_pfn(pfn))) { - vcpu_cache_mmio_info(vcpu, gva, gfn, + if (unlikely(!fault->slot)) { + gva_t gva = fault->is_tdp ? 0 : fault->addr; + + vcpu_cache_mmio_info(vcpu, gva, fault->gfn, access & shadow_mmio_access_mask); /* * If MMIO caching is disabled, emulate immediately without @@ -3091,18 +3032,17 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, return false; } -static bool page_fault_can_be_fast(u32 error_code) +static bool page_fault_can_be_fast(struct kvm_page_fault *fault) { /* * Do not fix the mmio spte with invalid generation number which * need to be updated by slow page fault path. */ - if (unlikely(error_code & PFERR_RSVD_MASK)) + if (fault->rsvd) return false; /* See if the page fault is due to an NX violation */ - if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK)) - == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK)))) + if (unlikely(fault->exec && fault->present)) return false; /* @@ -3119,9 +3059,7 @@ static bool page_fault_can_be_fast(u32 error_code) * accesses to a present page. */ - return shadow_acc_track_mask != 0 || - ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK)) - == (PFERR_WRITE_MASK | PFERR_PRESENT_MASK)); + return shadow_acc_track_mask != 0 || (fault->write && fault->present); } /* @@ -3129,13 +3067,9 @@ static bool page_fault_can_be_fast(u32 error_code) * someone else modified the SPTE from its original value. */ static bool -fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, +fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, u64 *sptep, u64 old_spte, u64 new_spte) { - gfn_t gfn; - - WARN_ON(!sp->role.direct); - /* * Theoretically we could also set dirty bit (and flush TLB) here in * order to eliminate unnecessary PML logging. See comments in @@ -3151,24 +3085,18 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (cmpxchg64(sptep, old_spte, new_spte) != old_spte) return false; - if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) { - /* - * The gfn of direct spte is stable since it is - * calculated by sp->gfn. - */ - gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); - kvm_vcpu_mark_page_dirty(vcpu, gfn); - } + if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) + mark_page_dirty_in_slot(vcpu->kvm, fault->slot, fault->gfn); return true; } -static bool is_access_allowed(u32 fault_err_code, u64 spte) +static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte) { - if (fault_err_code & PFERR_FETCH_MASK) + if (fault->exec) return is_executable_pte(spte); - if (fault_err_code & PFERR_WRITE_MASK) + if (fault->write) return is_writable_pte(spte); /* Fault was on Read access */ @@ -3193,9 +3121,6 @@ static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte) for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) { sptep = iterator.sptep; *spte = old_spte; - - if (!is_shadow_present_pte(old_spte)) - break; } return sptep; @@ -3204,7 +3129,7 @@ static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte) /* * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS. */ -static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) +static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_mmu_page *sp; int ret = RET_PF_INVALID; @@ -3212,7 +3137,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) u64 *sptep = NULL; uint retry_count = 0; - if (!page_fault_can_be_fast(error_code)) + if (!page_fault_can_be_fast(fault)) return ret; walk_shadow_page_lockless_begin(vcpu); @@ -3221,9 +3146,9 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) u64 new_spte; if (is_tdp_mmu(vcpu->arch.mmu)) - sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, gpa, &spte); + sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte); else - sptep = fast_pf_get_last_sptep(vcpu, gpa, &spte); + sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte); if (!is_shadow_present_pte(spte)) break; @@ -3242,7 +3167,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) * Need not check the access of upper level table entries since * they are always ACC_ALL. */ - if (is_access_allowed(error_code, spte)) { + if (is_access_allowed(fault, spte)) { ret = RET_PF_SPURIOUS; break; } @@ -3257,28 +3182,28 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) * be removed in the fast path only if the SPTE was * write-protected for dirty-logging or access tracking. */ - if ((error_code & PFERR_WRITE_MASK) && + if (fault->write && spte_can_locklessly_be_made_writable(spte)) { new_spte |= PT_WRITABLE_MASK; /* - * Do not fix write-permission on the large spte. Since - * we only dirty the first page into the dirty-bitmap in + * Do not fix write-permission on the large spte when + * dirty logging is enabled. Since we only dirty the + * first page into the dirty-bitmap in * fast_pf_fix_direct_spte(), other pages are missed * if its slot has dirty logging enabled. * * Instead, we let the slow page fault path create a * normal spte to fix the access. - * - * See the comments in kvm_arch_commit_memory_region(). */ - if (sp->role.level > PG_LEVEL_4K) + if (sp->role.level > PG_LEVEL_4K && + kvm_slot_dirty_track_enabled(fault->slot)) break; } /* Verify that the fault can be handled in the fast path */ if (new_spte == spte || - !is_access_allowed(error_code, new_spte)) + !is_access_allowed(fault, new_spte)) break; /* @@ -3286,7 +3211,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) * since the gfn is not stable for indirect shadow page. See * Documentation/virt/kvm/locking.rst to get more detail. */ - if (fast_pf_fix_direct_spte(vcpu, sp, sptep, spte, new_spte)) { + if (fast_pf_fix_direct_spte(vcpu, fault, sptep, spte, new_spte)) { ret = RET_PF_FIXED; break; } @@ -3299,7 +3224,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) } while (true); - trace_fast_page_fault(vcpu, gpa, error_code, sptep, spte, ret); + trace_fast_page_fault(vcpu, fault, sptep, spte, ret); walk_shadow_page_lockless_end(vcpu); return ret; @@ -3472,6 +3397,67 @@ out_unlock: return r; } +static int mmu_first_shadow_root_alloc(struct kvm *kvm) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *slot; + int r = 0, i, bkt; + + /* + * Check if this is the first shadow root being allocated before + * taking the lock. + */ + if (kvm_shadow_root_allocated(kvm)) + return 0; + + mutex_lock(&kvm->slots_arch_lock); + + /* Recheck, under the lock, whether this is the first shadow root. */ + if (kvm_shadow_root_allocated(kvm)) + goto out_unlock; + + /* + * Check if anything actually needs to be allocated, e.g. all metadata + * will be allocated upfront if TDP is disabled. + */ + if (kvm_memslots_have_rmaps(kvm) && + kvm_page_track_write_tracking_enabled(kvm)) + goto out_success; + + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + kvm_for_each_memslot(slot, bkt, slots) { + /* + * Both of these functions are no-ops if the target is + * already allocated, so unconditionally calling both + * is safe. Intentionally do NOT free allocations on + * failure to avoid having to track which allocations + * were made now versus when the memslot was created. + * The metadata is guaranteed to be freed when the slot + * is freed, and will be kept/used if userspace retries + * KVM_RUN instead of killing the VM. + */ + r = memslot_rmap_alloc(slot, slot->npages); + if (r) + goto out_unlock; + r = kvm_page_track_write_tracking_alloc(slot); + if (r) + goto out_unlock; + } + } + + /* + * Ensure that shadow_root_allocated becomes true strictly after + * all the related pointers are set. + */ +out_success: + smp_store_release(&kvm->arch.shadow_root_allocated, true); + +out_unlock: + mutex_unlock(&kvm->slots_arch_lock); + return r; +} + static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; @@ -3502,7 +3488,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) } } - r = alloc_all_memslots_rmaps(vcpu->kvm); + r = mmu_first_shadow_root_alloc(vcpu->kvm); if (r) return r; @@ -3653,6 +3639,33 @@ err_pml4: #endif } +static bool is_unsync_root(hpa_t root) +{ + struct kvm_mmu_page *sp; + + if (!VALID_PAGE(root)) + return false; + + /* + * The read barrier orders the CPU's read of SPTE.W during the page table + * walk before the reads of sp->unsync/sp->unsync_children here. + * + * Even if another CPU was marking the SP as unsync-ed simultaneously, + * any guest page table changes are not guaranteed to be visible anyway + * until this VCPU issues a TLB flush strictly after those changes are + * made. We only need to ensure that the other CPU sets these flags + * before any actual changes to the page tables are made. The comments + * in mmu_try_to_unsync_pages() describe what could go wrong if this + * requirement isn't satisfied. + */ + smp_rmb(); + sp = to_shadow_page(root); + if (sp->unsync || sp->unsync_children) + return true; + + return false; +} + void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) { int i; @@ -3670,18 +3683,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) hpa_t root = vcpu->arch.mmu->root_hpa; sp = to_shadow_page(root); - /* - * Even if another CPU was marking the SP as unsync-ed - * simultaneously, any guest page table changes are not - * guaranteed to be visible anyway until this VCPU issues a TLB - * flush strictly after those changes are made. We only need to - * ensure that the other CPU sets these flags before any actual - * changes to the page tables are made. The comments in - * mmu_try_to_unsync_pages() describe what could go wrong if - * this requirement isn't satisfied. - */ - if (!smp_load_acquire(&sp->unsync) && - !smp_load_acquire(&sp->unsync_children)) + if (!is_unsync_root(root)) return; write_lock(&vcpu->kvm->mmu_lock); @@ -3711,21 +3713,26 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) write_unlock(&vcpu->kvm->mmu_lock); } -static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr, - u32 access, struct x86_exception *exception) +void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu) { - if (exception) - exception->error_code = 0; - return vaddr; + unsigned long roots_to_free = 0; + int i; + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if (is_unsync_root(vcpu->arch.mmu->prev_roots[i].hpa)) + roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); + + /* sync prev_roots by simply freeing them */ + kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free); } -static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr, - u32 access, - struct x86_exception *exception) +static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + gpa_t vaddr, u32 access, + struct x86_exception *exception) { if (exception) exception->error_code = 0; - return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception); + return kvm_translate_gpa(vcpu, mmu, vaddr, access, exception); } static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) @@ -3763,9 +3770,6 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level spte = mmu_spte_get_lockless(iterator.sptep); sptes[leaf] = spte; - - if (!is_shadow_present_pte(spte)) - break; } return leaf; @@ -3856,20 +3860,19 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) } static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu, - u32 error_code, gfn_t gfn) + struct kvm_page_fault *fault) { - if (unlikely(error_code & PFERR_RSVD_MASK)) + if (unlikely(fault->rsvd)) return false; - if (!(error_code & PFERR_PRESENT_MASK) || - !(error_code & PFERR_WRITE_MASK)) + if (!fault->present || !fault->write) return false; /* * guest is writing the page which is write tracked which can * not be fixed by page fault handler. */ - if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE)) + if (kvm_slot_page_track_is_active(vcpu->kvm, fault->slot, fault->gfn, KVM_PAGE_TRACK_WRITE)) return true; return false; @@ -3881,11 +3884,8 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) u64 spte; walk_shadow_page_lockless_begin(vcpu); - for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { + for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) clear_sp_write_flooding_count(iterator.sptep); - if (!is_shadow_present_pte(spte)) - break; - } walk_shadow_page_lockless_end(vcpu); } @@ -3903,11 +3903,9 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); } -static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, - gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva, - bool write, bool *writable, int *r) +static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, int *r) { - struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + struct kvm_memory_slot *slot = fault->slot; bool async; /* @@ -3921,8 +3919,9 @@ static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, if (!kvm_is_visible_memslot(slot)) { /* Don't expose private memslots to L2. */ if (is_guest_mode(vcpu)) { - *pfn = KVM_PFN_NOSLOT; - *writable = false; + fault->slot = NULL; + fault->pfn = KVM_PFN_NOSLOT; + fault->map_writable = false; return false; } /* @@ -3939,46 +3938,74 @@ static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, } async = false; - *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, - write, writable, hva); + fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async, + fault->write, &fault->map_writable, + &fault->hva); if (!async) return false; /* *pfn has correct page already */ - if (!prefault && kvm_can_do_async_pf(vcpu)) { - trace_kvm_try_async_get_page(cr2_or_gpa, gfn); - if (kvm_find_async_pf_gfn(vcpu, gfn)) { - trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn); + if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) { + trace_kvm_try_async_get_page(fault->addr, fault->gfn); + if (kvm_find_async_pf_gfn(vcpu, fault->gfn)) { + trace_kvm_async_pf_doublefault(fault->addr, fault->gfn); kvm_make_request(KVM_REQ_APF_HALT, vcpu); goto out_retry; - } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn)) + } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) goto out_retry; } - *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, - write, writable, hva); + fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, NULL, + fault->write, &fault->map_writable, + &fault->hva); + return false; out_retry: *r = RET_PF_RETRY; return true; } -static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, - bool prefault, int max_level, bool is_tdp) +/* + * Returns true if the page fault is stale and needs to be retried, i.e. if the + * root was invalidated by a memslot update or a relevant mmu_notifier fired. + */ +static bool is_page_fault_stale(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault, int mmu_seq) +{ + struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root_hpa); + + /* Special roots, e.g. pae_root, are not backed by shadow pages. */ + if (sp && is_obsolete_sp(vcpu->kvm, sp)) + return true; + + /* + * Roots without an associated shadow page are considered invalid if + * there is a pending request to free obsolete roots. The request is + * only a hint that the current root _may_ be obsolete and needs to be + * reloaded, e.g. if the guest frees a PGD that KVM is tracking as a + * previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs + * to reload even if no vCPU is actively using the root. + */ + if (!sp && kvm_test_request(KVM_REQ_MMU_RELOAD, vcpu)) + return true; + + return fault->slot && + mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva); +} + +static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu); - bool write = error_code & PFERR_WRITE_MASK; - bool map_writable; - gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; - kvm_pfn_t pfn; - hva_t hva; int r; - if (page_fault_handle_page_track(vcpu, error_code, gfn)) + fault->gfn = fault->addr >> PAGE_SHIFT; + fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn); + + if (page_fault_handle_page_track(vcpu, fault)) return RET_PF_EMULATE; - r = fast_page_fault(vcpu, gpa, error_code); + r = fast_page_fault(vcpu, fault); if (r != RET_PF_INVALID) return r; @@ -3989,11 +4016,10 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (kvm_faultin_pfn(vcpu, prefault, gfn, gpa, &pfn, &hva, - write, &map_writable, &r)) + if (kvm_faultin_pfn(vcpu, fault, &r)) return r; - if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r)) + if (handle_abnormal_pfn(vcpu, fault, ACC_ALL, &r)) return r; r = RET_PF_RETRY; @@ -4003,36 +4029,35 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, else write_lock(&vcpu->kvm->mmu_lock); - if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva)) + if (is_page_fault_stale(vcpu, fault, mmu_seq)) goto out_unlock; + r = make_mmu_pages_available(vcpu); if (r) goto out_unlock; if (is_tdp_mmu_fault) - r = kvm_tdp_mmu_map(vcpu, gpa, error_code, map_writable, max_level, - pfn, prefault); + r = kvm_tdp_mmu_map(vcpu, fault); else - r = __direct_map(vcpu, gpa, error_code, map_writable, max_level, pfn, - prefault, is_tdp); + r = __direct_map(vcpu, fault); out_unlock: if (is_tdp_mmu_fault) read_unlock(&vcpu->kvm->mmu_lock); else write_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(pfn); + kvm_release_pfn_clean(fault->pfn); return r; } -static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, - u32 error_code, bool prefault) +static int nonpaging_page_fault(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) { - pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code); + pgprintk("%s: gva %lx error %x\n", __func__, fault->addr, fault->error_code); /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */ - return direct_page_fault(vcpu, gpa & PAGE_MASK, error_code, prefault, - PG_LEVEL_2M, false); + fault->max_level = PG_LEVEL_2M; + return direct_page_fault(vcpu, fault); } int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, @@ -4068,23 +4093,19 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, } EXPORT_SYMBOL_GPL(kvm_handle_page_fault); -int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, - bool prefault) +int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - int max_level; - - for (max_level = KVM_MAX_HUGEPAGE_LEVEL; - max_level > PG_LEVEL_4K; - max_level--) { - int page_num = KVM_PAGES_PER_HPAGE(max_level); - gfn_t base = (gpa >> PAGE_SHIFT) & ~(page_num - 1); + while (fault->max_level > PG_LEVEL_4K) { + int page_num = KVM_PAGES_PER_HPAGE(fault->max_level); + gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1); if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num)) break; + + --fault->max_level; } - return direct_page_fault(vcpu, gpa, error_code, prefault, - max_level, true); + return direct_page_fault(vcpu, fault); } static void nonpaging_init_context(struct kvm_mmu *context) @@ -4205,7 +4226,7 @@ static unsigned long get_cr3(struct kvm_vcpu *vcpu) } static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, - unsigned int access, int *nr_present) + unsigned int access) { if (unlikely(is_mmio_spte(*sptep))) { if (gfn != get_mmio_spte_gfn(*sptep)) { @@ -4213,7 +4234,6 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, return true; } - (*nr_present)++; mark_mmio_spte(vcpu, sptep, gfn, access); return true; } @@ -4352,22 +4372,28 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, - u64 pa_bits_rsvd, bool execonly) + u64 pa_bits_rsvd, bool execonly, int huge_page_level) { u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51); + u64 large_1g_rsvd = 0, large_2m_rsvd = 0; u64 bad_mt_xwr; + if (huge_page_level < PG_LEVEL_1G) + large_1g_rsvd = rsvd_bits(7, 7); + if (huge_page_level < PG_LEVEL_2M) + large_2m_rsvd = rsvd_bits(7, 7); + rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7); rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7); - rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6); - rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6); + rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6) | large_1g_rsvd; + rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6) | large_2m_rsvd; rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; /* large page */ rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4]; rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; - rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29); - rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20); + rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29) | large_1g_rsvd; + rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20) | large_2m_rsvd; rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */ @@ -4383,10 +4409,11 @@ __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, } static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, - struct kvm_mmu *context, bool execonly) + struct kvm_mmu *context, bool execonly, int huge_page_level) { __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check, - vcpu->arch.reserved_gpa_bits, execonly); + vcpu->arch.reserved_gpa_bits, execonly, + huge_page_level); } static inline u64 reserved_hpa_bits(void) @@ -4462,7 +4489,8 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, false, true); else __reset_rsvds_bits_mask_ept(shadow_zero_check, - reserved_hpa_bits(), false); + reserved_hpa_bits(), false, + max_huge_page_level); if (!shadow_me_mask) return; @@ -4482,7 +4510,8 @@ reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context, bool execonly) { __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, - reserved_hpa_bits(), execonly); + reserved_hpa_bits(), execonly, + max_huge_page_level); } #define BYTE_MASK(access) \ @@ -4596,10 +4625,10 @@ static void update_pkru_bitmask(struct kvm_mmu *mmu) unsigned bit; bool wp; - if (!is_cr4_pke(mmu)) { - mmu->pkru_mask = 0; + mmu->pkru_mask = 0; + + if (!is_cr4_pke(mmu)) return; - } wp = is_cr0_wp(mmu); @@ -4679,6 +4708,7 @@ static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu, /* PKEY and LA57 are active iff long mode is active. */ ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs); ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs); + ext.efer_lma = ____is_efer_lma(regs); } ext.valid = 1; @@ -4730,7 +4760,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, role.base.ad_disabled = (shadow_accessed_mask == 0); role.base.level = kvm_mmu_get_tdp_level(vcpu); role.base.direct = true; - role.base.gpte_is_8_bytes = true; + role.base.has_4_byte_gpte = false; return role; } @@ -4775,7 +4805,7 @@ kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu, role.base.smep_andnot_wp = role.ext.cr4_smep && !____is_cr0_wp(regs); role.base.smap_andnot_wp = role.ext.cr4_smap && !____is_cr0_wp(regs); - role.base.gpte_is_8_bytes = ____is_cr0_pg(regs) && ____is_cr4_pae(regs); + role.base.has_4_byte_gpte = ____is_cr0_pg(regs) && !____is_cr4_pae(regs); return role; } @@ -4851,7 +4881,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, struct kvm_mmu *context = &vcpu->arch.guest_mmu; struct kvm_mmu_role_regs regs = { .cr0 = cr0, - .cr4 = cr4, + .cr4 = cr4 & ~X86_CR4_PKE, .efer = efer, }; union kvm_mmu_role new_role; @@ -4874,7 +4904,7 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm; role.base.level = level; - role.base.gpte_is_8_bytes = true; + role.base.has_4_byte_gpte = false; role.base.direct = false; role.base.ad_disabled = !accessed_dirty; role.base.guest_mode = true; @@ -4889,7 +4919,8 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, } void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, - bool accessed_dirty, gpa_t new_eptp) + int huge_page_level, bool accessed_dirty, + gpa_t new_eptp) { struct kvm_mmu *context = &vcpu->arch.guest_mmu; u8 level = vmx_eptp_page_walk_level(new_eptp); @@ -4915,8 +4946,8 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, context->direct_map = false; update_permission_bitmask(context, true); - update_pkru_bitmask(context); - reset_rsvds_bits_mask_ept(vcpu, context, execonly); + context->pkru_mask = 0; + reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level); reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); } EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); @@ -4980,13 +5011,13 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) * the gva_to_gpa functions between mmu and nested_mmu are swapped. */ if (!is_paging(vcpu)) - g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; + g_context->gva_to_gpa = nonpaging_gva_to_gpa; else if (is_long_mode(vcpu)) - g_context->gva_to_gpa = paging64_gva_to_gpa_nested; + g_context->gva_to_gpa = paging64_gva_to_gpa; else if (is_pae(vcpu)) - g_context->gva_to_gpa = paging64_gva_to_gpa_nested; + g_context->gva_to_gpa = paging64_gva_to_gpa; else - g_context->gva_to_gpa = paging32_gva_to_gpa_nested; + g_context->gva_to_gpa = paging32_gva_to_gpa; reset_guest_paging_metadata(vcpu, g_context); } @@ -5021,6 +5052,14 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) /* * Invalidate all MMU roles to force them to reinitialize as CPUID * information is factored into reserved bit calculations. + * + * Correctly handling multiple vCPU models with respect to paging and + * physical address properties) in a single VM would require tracking + * all relevant CPUID information in kvm_mmu_page_role. That is very + * undesirable as it would increase the memory requirements for + * gfn_track (see struct kvm_mmu_page_role comments). For now that + * problem is swept under the rug; KVM's CPUID API is horrific and + * it's all but impossible to solve it without introducing a new API. */ vcpu->arch.root_mmu.mmu_role.ext.valid = 0; vcpu->arch.guest_mmu.mmu_role.ext.valid = 0; @@ -5028,24 +5067,10 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_mmu_reset_context(vcpu); /* - * KVM does not correctly handle changing guest CPUID after KVM_RUN, as - * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't - * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page - * faults due to reusing SPs/SPTEs. Alert userspace, but otherwise - * sweep the problem under the rug. - * - * KVM's horrific CPUID ABI makes the problem all but impossible to - * solve, as correctly handling multiple vCPU models (with respect to - * paging and physical address properties) in a single VM would require - * tracking all relevant CPUID information in kvm_mmu_page_role. That - * is very undesirable as it would double the memory requirements for - * gfn_track (see struct kvm_mmu_page_role comments), and in practice - * no sane VMM mucks with the core vCPU model on the fly. + * Changing guest CPUID after KVM_RUN is forbidden, see the comment in + * kvm_arch_vcpu_ioctl(). */ - if (vcpu->arch.last_vmentry_cpu != -1) { - pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} after KVM_RUN may cause guest instability\n"); - pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} will fail after KVM_RUN starting with Linux 5.16\n"); - } + KVM_BUG_ON(vcpu->arch.last_vmentry_cpu != -1, vcpu->kvm); } void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) @@ -5157,7 +5182,7 @@ static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa, gpa, bytes, sp->role.word); offset = offset_in_page(gpa); - pte_size = sp->role.gpte_is_8_bytes ? 8 : 4; + pte_size = sp->role.has_4_byte_gpte ? 4 : 8; /* * Sometimes, the OS only writes the last one bytes to update status @@ -5181,7 +5206,7 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte) page_offset = offset_in_page(gpa); level = sp->role.level; *nspte = 1; - if (!sp->role.gpte_is_8_bytes) { + if (sp->role.has_4_byte_gpte) { page_offset <<= 1; /* 32->64 */ /* * A 32-bit pde maps 4MB while the shadow pdes map @@ -5212,7 +5237,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, LIST_HEAD(invalid_list); u64 entry, gentry, *spte; int npte; - bool remote_flush, local_flush; + bool flush = false; /* * If we don't have indirect shadow pages, it means no page is @@ -5221,8 +5246,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) return; - remote_flush = local_flush = false; - pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); /* @@ -5251,18 +5274,17 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if (!spte) continue; - local_flush = true; while (npte--) { entry = *spte; mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL); if (gentry && sp->role.level != PG_LEVEL_4K) ++vcpu->kvm->stat.mmu_pde_zapped; if (need_remote_flush(entry, *spte)) - remote_flush = true; + flush = true; ++spte; } } - kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush); + kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush); kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); write_unlock(&vcpu->kvm->mmu_lock); } @@ -5368,7 +5390,7 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { - kvm_mmu_invalidate_gva(vcpu, vcpu->arch.mmu, gva, INVALID_PAGE); + kvm_mmu_invalidate_gva(vcpu, vcpu->arch.walk_mmu, gva, INVALID_PAGE); ++vcpu->stat.invlpg; } EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); @@ -5473,8 +5495,8 @@ slot_handle_level(struct kvm *kvm, const struct kvm_memory_slot *memslot, } static __always_inline bool -slot_handle_leaf(struct kvm *kvm, const struct kvm_memory_slot *memslot, - slot_level_handler fn, bool flush_on_yield) +slot_handle_level_4k(struct kvm *kvm, const struct kvm_memory_slot *memslot, + slot_level_handler fn, bool flush_on_yield) { return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield); @@ -5496,10 +5518,13 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) mmu->root_hpa = INVALID_PAGE; mmu->root_pgd = 0; - mmu->translate_gpa = translate_gpa; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; + /* vcpu->arch.guest_mmu isn't used when !tdp_enabled. */ + if (!tdp_enabled && mmu == &vcpu->arch.guest_mmu) + return 0; + /* * When using PAE paging, the four PDPTEs are treated as 'root' pages, * while the PDP table is a per-vCPU construct that's allocated at MMU @@ -5509,7 +5534,7 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) * generally doesn't use PAE paging and can skip allocating the PDP * table. The main exception, handled here, is SVM's 32-bit NPT. The * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit - * KVM; that horror is handled on-demand by mmu_alloc_shadow_roots(). + * KVM; that horror is handled on-demand by mmu_alloc_special_roots(). */ if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL) return 0; @@ -5554,8 +5579,6 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) vcpu->arch.mmu = &vcpu->arch.root_mmu; vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; - vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; - ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu); if (ret) return ret; @@ -5694,13 +5717,7 @@ void kvm_mmu_init_vm(struct kvm *kvm) spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); - if (!kvm_mmu_init_tdp_mmu(kvm)) - /* - * No smp_load/store wrappers needed here as we are in - * VM init and there cannot be any memslots / other threads - * accessing this struct kvm yet. - */ - kvm->arch.memslots_have_rmaps = true; + kvm_mmu_init_tdp_mmu(kvm); node->track_write = kvm_mmu_pte_write; node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; @@ -5716,55 +5733,64 @@ void kvm_mmu_uninit_vm(struct kvm *kvm) kvm_mmu_uninit_tdp_mmu(kvm); } +static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) +{ + const struct kvm_memory_slot *memslot; + struct kvm_memslots *slots; + struct kvm_memslot_iter iter; + bool flush = false; + gfn_t start, end; + int i; + + if (!kvm_memslots_have_rmaps(kvm)) + return flush; + + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + + kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) { + memslot = iter.slot; + start = max(gfn_start, memslot->base_gfn); + end = min(gfn_end, memslot->base_gfn + memslot->npages); + if (WARN_ON_ONCE(start >= end)) + continue; + + flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, + PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, + start, end - 1, true, flush); + } + } + + return flush; +} + /* * Invalidate (zap) SPTEs that cover GFNs from gfn_start and up to gfn_end * (not including it) */ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; + bool flush; int i; - bool flush = false; + + if (WARN_ON_ONCE(gfn_end <= gfn_start)) + return; write_lock(&kvm->mmu_lock); kvm_inc_notifier_count(kvm, gfn_start, gfn_end); - if (kvm_memslots_have_rmaps(kvm)) { - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - slots = __kvm_memslots(kvm, i); - kvm_for_each_memslot(memslot, slots) { - gfn_t start, end; - - start = max(gfn_start, memslot->base_gfn); - end = min(gfn_end, memslot->base_gfn + memslot->npages); - if (start >= end) - continue; - - flush = slot_handle_level_range(kvm, - (const struct kvm_memory_slot *) memslot, - kvm_zap_rmapp, PG_LEVEL_4K, - KVM_MAX_HUGEPAGE_LEVEL, start, - end - 1, true, flush); - } - } - if (flush) - kvm_flush_remote_tlbs_with_address(kvm, gfn_start, - gfn_end - gfn_start); - } + flush = __kvm_zap_rmaps(kvm, gfn_start, gfn_end); if (is_tdp_mmu_enabled(kvm)) { for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start, gfn_end, flush); - if (flush) - kvm_flush_remote_tlbs_with_address(kvm, gfn_start, - gfn_end - gfn_start); } if (flush) - kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end); + kvm_flush_remote_tlbs_with_address(kvm, gfn_start, + gfn_end - gfn_start); kvm_dec_notifier_count(kvm, gfn_start, gfn_end); @@ -5856,21 +5882,21 @@ restart: void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *slot) { - bool flush = false; - if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); - flush = slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true); - if (flush) + /* + * Zap only 4k SPTEs since the legacy MMU only supports dirty + * logging at a 4k granularity and never creates collapsible + * 2m SPTEs during dirty logging. + */ + if (slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true)) kvm_arch_flush_remote_tlbs_memslot(kvm, slot); write_unlock(&kvm->mmu_lock); } if (is_tdp_mmu_enabled(kvm)) { read_lock(&kvm->mmu_lock); - flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, flush); - if (flush) - kvm_arch_flush_remote_tlbs_memslot(kvm, slot); + kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot); read_unlock(&kvm->mmu_lock); } } @@ -5897,8 +5923,11 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); - flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, - false); + /* + * Clear dirty bits only on 4k SPTEs since the legacy MMU only + * support dirty logging at a 4k granularity. + */ + flush = slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false); write_unlock(&kvm->mmu_lock); } @@ -6136,30 +6165,6 @@ out: return ret; } -/* - * Calculate mmu pages needed for kvm. - */ -unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm) -{ - unsigned long nr_mmu_pages; - unsigned long nr_pages = 0; - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - int i; - - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - slots = __kvm_memslots(kvm, i); - - kvm_for_each_memslot(memslot, slots) - nr_pages += memslot->npages; - } - - nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; - nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES); - - return nr_mmu_pages; -} - void kvm_mmu_destroy(struct kvm_vcpu *vcpu) { kvm_mmu_unload(vcpu); @@ -6176,18 +6181,47 @@ void kvm_mmu_module_exit(void) mmu_audit_disable(); } -static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp) +/* + * Calculate the effective recovery period, accounting for '0' meaning "let KVM + * select a halving time of 1 hour". Returns true if recovery is enabled. + */ +static bool calc_nx_huge_pages_recovery_period(uint *period) +{ + /* + * Use READ_ONCE to get the params, this may be called outside of the + * param setters, e.g. by the kthread to compute its next timeout. + */ + bool enabled = READ_ONCE(nx_huge_pages); + uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio); + + if (!enabled || !ratio) + return false; + + *period = READ_ONCE(nx_huge_pages_recovery_period_ms); + if (!*period) { + /* Make sure the period is not less than one second. */ + ratio = min(ratio, 3600u); + *period = 60 * 60 * 1000 / ratio; + } + return true; +} + +static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp) { - unsigned int old_val; + bool was_recovery_enabled, is_recovery_enabled; + uint old_period, new_period; int err; - old_val = nx_huge_pages_recovery_ratio; + was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period); + err = param_set_uint(val, kp); if (err) return err; - if (READ_ONCE(nx_huge_pages) && - !old_val && nx_huge_pages_recovery_ratio) { + is_recovery_enabled = calc_nx_huge_pages_recovery_period(&new_period); + + if (is_recovery_enabled && + (!was_recovery_enabled || old_period > new_period)) { struct kvm *kvm; mutex_lock(&kvm_lock); @@ -6250,9 +6284,13 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) static long get_nx_lpage_recovery_timeout(u64 start_time) { - return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio) - ? start_time + 60 * HZ - get_jiffies_64() - : MAX_SCHEDULE_TIMEOUT; + bool enabled; + uint period; + + enabled = calc_nx_huge_pages_recovery_period(&period); + + return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64() + : MAX_SCHEDULE_TIMEOUT; } static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data) diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index bf2bdbf333c2..da6166b5c377 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -104,7 +104,7 @@ static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp) return kvm_mmu_role_as_id(sp->role); } -static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) +static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp) { /* * When using the EPT page-modification log, the GPAs in the CPU dirty @@ -112,19 +112,13 @@ static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) * on write protection to record dirty pages, which bypasses PML, since * writes now result in a vmexit. Note, the check on CPU dirty logging * being enabled is mandatory as the bits used to denote WP-only SPTEs - * are reserved for NPT w/ PAE (32-bit KVM). + * are reserved for PAE paging (32-bit KVM). */ - return vcpu->arch.mmu == &vcpu->arch.guest_mmu && - kvm_x86_ops.cpu_dirty_log_size; + return kvm_x86_ops.cpu_dirty_log_size && sp->role.guest_mode; } -extern int nx_huge_pages; -static inline bool is_nx_huge_page_enabled(void) -{ - return READ_ONCE(nx_huge_pages); -} - -int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync); +int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, + gfn_t gfn, bool can_unsync, bool prefetch); void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); @@ -155,19 +149,11 @@ enum { RET_PF_SPURIOUS, }; -/* Bits which may be returned by set_spte() */ -#define SET_SPTE_WRITE_PROTECTED_PT BIT(0) -#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) -#define SET_SPTE_SPURIOUS BIT(2) - int kvm_mmu_max_mapping_level(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn, kvm_pfn_t pfn, int max_level); -int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, - int max_level, kvm_pfn_t *pfnp, - bool huge_page_disallowed, int *req_level); -void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, - kvm_pfn_t *pfnp, int *goal_levelp); +void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); +void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level); void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index 2924a4081a19..de5e8e4e1aa7 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -35,7 +35,7 @@ " %snxe %sad root %u %s%c", \ __entry->mmu_valid_gen, \ __entry->gfn, role.level, \ - role.gpte_is_8_bytes ? 8 : 4, \ + role.has_4_byte_gpte ? 4 : 8, \ role.quadrant, \ role.direct ? " direct" : "", \ access_str[role.access], \ @@ -252,9 +252,9 @@ TRACE_EVENT( TRACE_EVENT( fast_page_fault, - TP_PROTO(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 error_code, + TP_PROTO(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, u64 *sptep, u64 old_spte, int ret), - TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, ret), + TP_ARGS(vcpu, fault, sptep, old_spte, ret), TP_STRUCT__entry( __field(int, vcpu_id) @@ -268,8 +268,8 @@ TRACE_EVENT( TP_fast_assign( __entry->vcpu_id = vcpu->vcpu_id; - __entry->cr2_or_gpa = cr2_or_gpa; - __entry->error_code = error_code; + __entry->cr2_or_gpa = fault->addr; + __entry->error_code = fault->error_code; __entry->sptep = sptep; __entry->old_spte = old_spte; __entry->new_spte = *sptep; @@ -367,8 +367,8 @@ TRACE_EVENT( TRACE_EVENT( kvm_mmu_spte_requested, - TP_PROTO(gpa_t addr, int level, kvm_pfn_t pfn), - TP_ARGS(addr, level, pfn), + TP_PROTO(struct kvm_page_fault *fault), + TP_ARGS(fault), TP_STRUCT__entry( __field(u64, gfn) @@ -377,9 +377,9 @@ TRACE_EVENT( ), TP_fast_assign( - __entry->gfn = addr >> PAGE_SHIFT; - __entry->pfn = pfn | (__entry->gfn & (KVM_PAGES_PER_HPAGE(level) - 1)); - __entry->level = level; + __entry->gfn = fault->gfn; + __entry->pfn = fault->pfn | (fault->gfn & (KVM_PAGES_PER_HPAGE(fault->goal_level) - 1)); + __entry->level = fault->goal_level; ), TP_printk("gfn %llx pfn %llx level %d", diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index 21427e84a82e..68eb1fb548b6 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -19,6 +19,12 @@ #include "mmu.h" #include "mmu_internal.h" +bool kvm_page_track_write_tracking_enabled(struct kvm *kvm) +{ + return IS_ENABLED(CONFIG_KVM_EXTERNAL_WRITE_TRACKING) || + !tdp_enabled || kvm_shadow_root_allocated(kvm); +} + void kvm_page_track_free_memslot(struct kvm_memory_slot *slot) { int i; @@ -29,12 +35,17 @@ void kvm_page_track_free_memslot(struct kvm_memory_slot *slot) } } -int kvm_page_track_create_memslot(struct kvm_memory_slot *slot, +int kvm_page_track_create_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot, unsigned long npages) { - int i; + int i; for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) { + if (i == KVM_PAGE_TRACK_WRITE && + !kvm_page_track_write_tracking_enabled(kvm)) + continue; + slot->arch.gfn_track[i] = kvcalloc(npages, sizeof(*slot->arch.gfn_track[i]), GFP_KERNEL_ACCOUNT); @@ -57,6 +68,21 @@ static inline bool page_track_mode_is_valid(enum kvm_page_track_mode mode) return true; } +int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot) +{ + unsigned short *gfn_track; + + if (slot->arch.gfn_track[KVM_PAGE_TRACK_WRITE]) + return 0; + + gfn_track = kvcalloc(slot->npages, sizeof(*gfn_track), GFP_KERNEL_ACCOUNT); + if (gfn_track == NULL) + return -ENOMEM; + + slot->arch.gfn_track[KVM_PAGE_TRACK_WRITE] = gfn_track; + return 0; +} + static void update_gfn_track(struct kvm_memory_slot *slot, gfn_t gfn, enum kvm_page_track_mode mode, short count) { @@ -92,6 +118,10 @@ void kvm_slot_page_track_add_page(struct kvm *kvm, if (WARN_ON(!page_track_mode_is_valid(mode))) return; + if (WARN_ON(mode == KVM_PAGE_TRACK_WRITE && + !kvm_page_track_write_tracking_enabled(kvm))) + return; + update_gfn_track(slot, gfn, mode, 1); /* @@ -126,6 +156,10 @@ void kvm_slot_page_track_remove_page(struct kvm *kvm, if (WARN_ON(!page_track_mode_is_valid(mode))) return; + if (WARN_ON(mode == KVM_PAGE_TRACK_WRITE && + !kvm_page_track_write_tracking_enabled(kvm))) + return; + update_gfn_track(slot, gfn, mode, -1); /* @@ -139,19 +173,22 @@ EXPORT_SYMBOL_GPL(kvm_slot_page_track_remove_page); /* * check if the corresponding access on the specified guest page is tracked. */ -bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn, - enum kvm_page_track_mode mode) +bool kvm_slot_page_track_is_active(struct kvm *kvm, + const struct kvm_memory_slot *slot, + gfn_t gfn, enum kvm_page_track_mode mode) { - struct kvm_memory_slot *slot; int index; if (WARN_ON(!page_track_mode_is_valid(mode))) return false; - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); if (!slot) return false; + if (mode == KVM_PAGE_TRACK_WRITE && + !kvm_page_track_write_tracking_enabled(kvm)) + return false; + index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K); return !!READ_ONCE(slot->arch.gfn_track[mode][index]); } diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 913d52a7923e..5b5bdac97c7b 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -403,9 +403,8 @@ retry_walk: walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; - real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn), - nested_access, - &walker->fault); + real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(table_gfn), + nested_access, &walker->fault); /* * FIXME: This can happen if emulation (for of an INS/OUTS @@ -467,7 +466,7 @@ retry_walk: if (PTTYPE == 32 && walker->level > PG_LEVEL_4K && is_cpuid_PSE36()) gfn += pse36_gfn_delta(pte); - real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access, &walker->fault); + real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), access, &walker->fault); if (real_gpa == UNMAPPED_GVA) return 0; @@ -547,20 +546,11 @@ static int FNAME(walk_addr)(struct guest_walker *walker, access); } -#if PTTYPE != PTTYPE_EPT -static int FNAME(walk_addr_nested)(struct guest_walker *walker, - struct kvm_vcpu *vcpu, gva_t addr, - u32 access) -{ - return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu, - addr, access); -} -#endif - static bool FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, pt_element_t gpte, bool no_dirty_log) { + struct kvm_memory_slot *slot; unsigned pte_access; gfn_t gfn; kvm_pfn_t pfn; @@ -573,30 +563,21 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, gfn = gpte_to_gfn(gpte); pte_access = sp->role.access & FNAME(gpte_access)(gpte); FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); - pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, + + slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log && (pte_access & ACC_WRITE_MASK)); - if (is_error_pfn(pfn)) + if (!slot) return false; - /* - * we call mmu_set_spte() with host_writable = true because - * pte_prefetch_gfn_to_pfn always gets a writable pfn. - */ - mmu_set_spte(vcpu, spte, pte_access, false, PG_LEVEL_4K, gfn, pfn, - true, true); + pfn = gfn_to_pfn_memslot_atomic(slot, gfn); + if (is_error_pfn(pfn)) + return false; + mmu_set_spte(vcpu, slot, spte, pte_access, gfn, pfn, NULL); kvm_release_pfn_clean(pfn); return true; } -static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *spte, const void *pte) -{ - pt_element_t gpte = *(const pt_element_t *)pte; - - FNAME(prefetch_gpte)(vcpu, sp, spte, gpte, false); -} - static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, struct guest_walker *gw, int level) { @@ -663,21 +644,16 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, * If the guest tries to write a write-protected page, we need to * emulate this operation, return 1 to indicate this case. */ -static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, - struct guest_walker *gw, u32 error_code, - int max_level, kvm_pfn_t pfn, bool map_writable, - bool prefault) +static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, + struct guest_walker *gw) { - bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(); - bool write_fault = error_code & PFERR_WRITE_MASK; - bool exec = error_code & PFERR_FETCH_MASK; - bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; struct kvm_mmu_page *sp = NULL; struct kvm_shadow_walk_iterator it; unsigned int direct_access, access; - int top_level, level, req_level, ret; - gfn_t base_gfn = gw->gfn; + int top_level, ret; + gfn_t base_gfn = fault->gfn; + WARN_ON_ONCE(gw->gfn != base_gfn); direct_access = gw->pte_access; top_level = vcpu->arch.mmu->root_level; @@ -695,7 +671,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) goto out_gpte_changed; - for (shadow_walk_init(&it, vcpu, addr); + for (shadow_walk_init(&it, vcpu, fault->addr); shadow_walk_okay(&it) && it.level > gw->level; shadow_walk_next(&it)) { gfn_t table_gfn; @@ -707,7 +683,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, if (!is_shadow_present_pte(*it.sptep)) { table_gfn = gw->table_gfn[it.level - 2]; access = gw->pt_access[it.level - 2]; - sp = kvm_mmu_get_page(vcpu, table_gfn, addr, + sp = kvm_mmu_get_page(vcpu, table_gfn, fault->addr, it.level-1, false, access); /* * We must synchronize the pagetable before linking it @@ -741,10 +717,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, link_shadow_page(vcpu, it.sptep, sp); } - level = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn, - huge_page_disallowed, &req_level); + kvm_mmu_hugepage_adjust(vcpu, fault); - trace_kvm_mmu_spte_requested(addr, gw->level, pfn); + trace_kvm_mmu_spte_requested(fault); for (; shadow_walk_okay(&it); shadow_walk_next(&it)) { clear_sp_write_flooding_count(it.sptep); @@ -753,12 +728,11 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, * We cannot overwrite existing page tables with an NX * large page, as the leaf could be executable. */ - if (nx_huge_page_workaround_enabled) - disallowed_hugepage_adjust(*it.sptep, gw->gfn, it.level, - &pfn, &level); + if (fault->nx_huge_page_workaround_enabled) + disallowed_hugepage_adjust(fault, *it.sptep, it.level); - base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); - if (it.level == level) + base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + if (it.level == fault->goal_level) break; validate_direct_spte(vcpu, it.sptep, direct_access); @@ -766,16 +740,20 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, drop_large_spte(vcpu, it.sptep); if (!is_shadow_present_pte(*it.sptep)) { - sp = kvm_mmu_get_page(vcpu, base_gfn, addr, + sp = kvm_mmu_get_page(vcpu, base_gfn, fault->addr, it.level - 1, true, direct_access); link_shadow_page(vcpu, it.sptep, sp); - if (huge_page_disallowed && req_level >= it.level) + if (fault->huge_page_disallowed && + fault->req_level >= it.level) account_huge_nx_page(vcpu->kvm, sp); } } - ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, - it.level, base_gfn, pfn, prefault, map_writable); + if (WARN_ON_ONCE(it.level != fault->goal_level)) + return -EFAULT; + + ret = mmu_set_spte(vcpu, fault->slot, it.sptep, gw->pte_access, + base_gfn, fault->pfn, fault); if (ret == RET_PF_SPURIOUS) return ret; @@ -841,45 +819,40 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, * Returns: 1 if we need to emulate the instruction, 0 otherwise, or * a negative value on error. */ -static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, - bool prefault) +static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - bool write_fault = error_code & PFERR_WRITE_MASK; - bool user_fault = error_code & PFERR_USER_MASK; struct guest_walker walker; int r; - kvm_pfn_t pfn; - hva_t hva; unsigned long mmu_seq; - bool map_writable, is_self_change_mapping; - int max_level; + bool is_self_change_mapping; - pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); + pgprintk("%s: addr %lx err %x\n", __func__, fault->addr, fault->error_code); + WARN_ON_ONCE(fault->is_tdp); /* + * Look up the guest pte for the faulting address. * If PFEC.RSVD is set, this is a shadow page fault. * The bit needs to be cleared before walking guest page tables. */ - error_code &= ~PFERR_RSVD_MASK; - - /* - * Look up the guest pte for the faulting address. - */ - r = FNAME(walk_addr)(&walker, vcpu, addr, error_code); + r = FNAME(walk_addr)(&walker, vcpu, fault->addr, + fault->error_code & ~PFERR_RSVD_MASK); /* * The page is not mapped by the guest. Let the guest handle it. */ if (!r) { pgprintk("%s: guest page fault\n", __func__); - if (!prefault) + if (!fault->prefetch) kvm_inject_emulated_page_fault(vcpu, &walker.fault); return RET_PF_RETRY; } - if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) { - shadow_page_table_clear_flood(vcpu, addr); + fault->gfn = walker.gfn; + fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn); + + if (page_fault_handle_page_track(vcpu, fault)) { + shadow_page_table_clear_flood(vcpu, fault->addr); return RET_PF_EMULATE; } @@ -890,29 +863,28 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, vcpu->arch.write_fault_to_shadow_pgtable = false; is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, - &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); + &walker, fault->user, &vcpu->arch.write_fault_to_shadow_pgtable); if (is_self_change_mapping) - max_level = PG_LEVEL_4K; + fault->max_level = PG_LEVEL_4K; else - max_level = walker.level; + fault->max_level = walker.level; mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (kvm_faultin_pfn(vcpu, prefault, walker.gfn, addr, &pfn, &hva, - write_fault, &map_writable, &r)) + if (kvm_faultin_pfn(vcpu, fault, &r)) return r; - if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r)) + if (handle_abnormal_pfn(vcpu, fault, walker.pte_access, &r)) return r; /* * Do not change pte_access if the pfn is a mmio page, otherwise * we will cache the incorrect access into mmio spte. */ - if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) && - !is_cr0_wp(vcpu->arch.mmu) && !user_fault && !is_noslot_pfn(pfn)) { + if (fault->write && !(walker.pte_access & ACC_WRITE_MASK) && + !is_cr0_wp(vcpu->arch.mmu) && !fault->user && fault->slot) { walker.pte_access |= ACC_WRITE_MASK; walker.pte_access &= ~ACC_USER_MASK; @@ -928,20 +900,20 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, r = RET_PF_RETRY; write_lock(&vcpu->kvm->mmu_lock); - if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva)) + + if (is_page_fault_stale(vcpu, fault, mmu_seq)) goto out_unlock; kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); r = make_mmu_pages_available(vcpu); if (r) goto out_unlock; - r = FNAME(fetch)(vcpu, addr, &walker, error_code, max_level, pfn, - map_writable, prefault); + r = FNAME(fetch)(vcpu, fault, &walker); kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); out_unlock: write_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(pfn); + kvm_release_pfn_clean(fault->pfn); return r; } @@ -1007,73 +979,57 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) sizeof(pt_element_t))) break; - FNAME(update_pte)(vcpu, sp, sptep, &gpte); + FNAME(prefetch_gpte)(vcpu, sp, sptep, gpte, false); } - if (!is_shadow_present_pte(*sptep) || !sp->unsync_children) + if (!sp->unsync_children) break; } write_unlock(&vcpu->kvm->mmu_lock); } /* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */ -static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t addr, u32 access, +static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + gpa_t addr, u32 access, struct x86_exception *exception) { struct guest_walker walker; gpa_t gpa = UNMAPPED_GVA; int r; - r = FNAME(walk_addr)(&walker, vcpu, addr, access); - - if (r) { - gpa = gfn_to_gpa(walker.gfn); - gpa |= addr & ~PAGE_MASK; - } else if (exception) - *exception = walker.fault; - - return gpa; -} - -#if PTTYPE != PTTYPE_EPT -/* Note, gva_to_gpa_nested() is only used to translate L2 GVAs. */ -static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr, - u32 access, - struct x86_exception *exception) -{ - struct guest_walker walker; - gpa_t gpa = UNMAPPED_GVA; - int r; - #ifndef CONFIG_X86_64 /* A 64-bit GVA should be impossible on 32-bit KVM. */ - WARN_ON_ONCE(vaddr >> 32); + WARN_ON_ONCE((addr >> 32) && mmu == vcpu->arch.walk_mmu); #endif - r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access); + r = FNAME(walk_addr_generic)(&walker, vcpu, mmu, addr, access); if (r) { gpa = gfn_to_gpa(walker.gfn); - gpa |= vaddr & ~PAGE_MASK; + gpa |= addr & ~PAGE_MASK; } else if (exception) *exception = walker.fault; return gpa; } -#endif /* * Using the cached information from sp->gfns is safe because: * - The spte has a reference to the struct page, so the pfn for a given gfn * can't change unless all sptes pointing to it are nuked first. + * + * Returns + * < 0: the sp should be zapped + * 0: the sp is synced and no tlb flushing is required + * > 0: the sp is synced and tlb flushing is required */ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { union kvm_mmu_page_role mmu_role = vcpu->arch.mmu->mmu_role.base; - int i, nr_present = 0; + int i; bool host_writable; gpa_t first_pte_gpa; - int set_spte_ret = 0; + bool flush = false; /* * Ignore various flags when verifying that it's safe to sync a shadow @@ -1098,11 +1054,13 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) */ if (WARN_ON_ONCE(sp->role.direct || (sp->role.word ^ mmu_role.word) & ~sync_role_ign.word)) - return 0; + return -1; first_pte_gpa = FNAME(get_level1_sp_gpa)(sp); for (i = 0; i < PT64_ENT_PER_PAGE; i++) { + u64 *sptep, spte; + struct kvm_memory_slot *slot; unsigned pte_access; pt_element_t gpte; gpa_t pte_gpa; @@ -1115,10 +1073,10 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, sizeof(pt_element_t))) - return 0; + return -1; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { - set_spte_ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; + flush = true; continue; } @@ -1127,30 +1085,27 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) pte_access &= FNAME(gpte_access)(gpte); FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); - if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access, - &nr_present)) + if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access)) continue; if (gfn != sp->gfns[i]) { drop_spte(vcpu->kvm, &sp->spt[i]); - set_spte_ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; + flush = true; continue; } - nr_present++; - - host_writable = sp->spt[i] & shadow_host_writable_mask; + sptep = &sp->spt[i]; + spte = *sptep; + host_writable = spte & shadow_host_writable_mask; + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + make_spte(vcpu, sp, slot, pte_access, gfn, + spte_to_pfn(spte), spte, true, false, + host_writable, &spte); - set_spte_ret |= set_spte(vcpu, &sp->spt[i], - pte_access, PG_LEVEL_4K, - gfn, spte_to_pfn(sp->spt[i]), - true, false, host_writable); + flush |= mmu_spte_update(sptep, spte); } - if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH) - kvm_flush_remote_tlbs(vcpu->kvm); - - return nr_present; + return flush; } #undef pt_element_t diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 3e97cdb13eb7..351b04ad62a1 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -16,6 +16,7 @@ #include "spte.h" #include <asm/e820/api.h> +#include <asm/memtype.h> #include <asm/vmx.h> static bool __read_mostly enable_mmio_caching = true; @@ -89,17 +90,19 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) E820_TYPE_RAM); } -int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, - gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool speculative, - bool can_unsync, bool host_writable, bool ad_disabled, - u64 *new_spte) +bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + const struct kvm_memory_slot *slot, + unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, + u64 old_spte, bool prefetch, bool can_unsync, + bool host_writable, u64 *new_spte) { + int level = sp->role.level; u64 spte = SPTE_MMU_PRESENT_MASK; - int ret = 0; + bool wrprot = false; - if (ad_disabled) + if (sp->role.ad_disabled) spte |= SPTE_TDP_AD_DISABLED_MASK; - else if (kvm_vcpu_ad_need_write_protect(vcpu)) + else if (kvm_mmu_page_ad_need_write_protect(sp)) spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK; /* @@ -109,7 +112,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, * read access. See FNAME(gpte_access) in paging_tmpl.h. */ spte |= shadow_present_mask; - if (!speculative) + if (!prefetch) spte |= spte_shadow_accessed_mask(spte); if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) && @@ -150,7 +153,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, * is responsibility of kvm_mmu_get_page / kvm_mmu_sync_roots. * Same reasoning can be applied to dirty page accounting. */ - if (!can_unsync && is_writable_pte(old_spte)) + if (is_writable_pte(old_spte)) goto out; /* @@ -159,10 +162,10 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, * e.g. it's write-tracked (upper-level SPs) or has one or more * shadow pages and unsync'ing pages is not allowed. */ - if (mmu_try_to_unsync_pages(vcpu, gfn, can_unsync)) { + if (mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, can_unsync, prefetch)) { pgprintk("%s: found shadow page for %llx, marking ro\n", __func__, gfn); - ret |= SET_SPTE_WRITE_PROTECTED_PT; + wrprot = true; pte_access &= ~ACC_WRITE_MASK; spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); } @@ -171,16 +174,22 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, if (pte_access & ACC_WRITE_MASK) spte |= spte_shadow_dirty_mask(spte); - if (speculative) +out: + if (prefetch) spte = mark_spte_for_access_track(spte); -out: WARN_ONCE(is_rsvd_spte(&vcpu->arch.mmu->shadow_zero_check, spte, level), "spte = 0x%llx, level = %d, rsvd bits = 0x%llx", spte, level, get_rsvd_bits(&vcpu->arch.mmu->shadow_zero_check, spte, level)); + if ((spte & PT_WRITABLE_MASK) && kvm_slot_dirty_track_enabled(slot)) { + /* Enforced by kvm_mmu_hugepage_adjust. */ + WARN_ON(level > PG_LEVEL_4K); + mark_page_dirty_in_slot(vcpu->kvm, slot, gfn); + } + *new_spte = spte; - return ret; + return wrprot; } u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index eb7b227fc6cf..a4af2a42695c 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -310,12 +310,7 @@ static inline bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check, static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check, u64 spte, int level) { - /* - * Use a bitwise-OR instead of a logical-OR to aggregate the reserved - * bits and EPT's invalid memtype/XWR checks to avoid an extra Jcc - * (this is extremely unlikely to be short-circuited as true). - */ - return __is_bad_mt_xwr(rsvd_check, spte) | + return __is_bad_mt_xwr(rsvd_check, spte) || __is_rsvd_bits_set(rsvd_check, spte, level); } @@ -334,15 +329,11 @@ static inline u64 get_mmio_spte_generation(u64 spte) return gen; } -/* Bits which may be returned by set_spte() */ -#define SET_SPTE_WRITE_PROTECTED_PT BIT(0) -#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) -#define SET_SPTE_SPURIOUS BIT(2) - -int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, - gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool speculative, - bool can_unsync, bool host_writable, bool ad_disabled, - u64 *new_spte); +bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + const struct kvm_memory_slot *slot, + unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, + u64 old_spte, bool prefetch, bool can_unsync, + bool host_writable, u64 *new_spte); u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access); u64 mark_spte_for_access_track(u64 spte); diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c index b3ed302c1a35..caa96c270b95 100644 --- a/arch/x86/kvm/mmu/tdp_iter.c +++ b/arch/x86/kvm/mmu/tdp_iter.c @@ -26,6 +26,7 @@ static gfn_t round_gfn_for_level(gfn_t gfn, int level) */ void tdp_iter_restart(struct tdp_iter *iter) { + iter->yielded = false; iter->yielded_gfn = iter->next_last_level_gfn; iter->level = iter->root_level; @@ -160,6 +161,11 @@ static bool try_step_up(struct tdp_iter *iter) */ void tdp_iter_next(struct tdp_iter *iter) { + if (iter->yielded) { + tdp_iter_restart(iter); + return; + } + if (try_step_down(iter)) return; diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index b1748b988d3a..e19cabbcb65c 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -45,6 +45,12 @@ struct tdp_iter { * iterator walks off the end of the paging structure. */ bool valid; + /* + * True if KVM dropped mmu_lock and yielded in the middle of a walk, in + * which case tdp_iter_next() needs to restart the walk at the root + * level instead of advancing to the next entry. + */ + bool yielded; }; /* diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 64ccfc1fa553..7b1bc816b7c3 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -165,8 +165,9 @@ static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu, role = vcpu->arch.mmu->mmu_role.base; role.level = level; role.direct = true; - role.gpte_is_8_bytes = true; + role.has_4_byte_gpte = false; role.access = ACC_ALL; + role.ad_disabled = !shadow_accessed_mask; return role; } @@ -316,9 +317,6 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt)); int level = sp->role.level; gfn_t base_gfn = sp->gfn; - u64 old_child_spte; - u64 *sptep; - gfn_t gfn; int i; trace_kvm_mmu_prepare_zap_page(sp); @@ -326,8 +324,9 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, tdp_mmu_unlink_page(kvm, sp, shared); for (i = 0; i < PT64_ENT_PER_PAGE; i++) { - sptep = rcu_dereference(pt) + i; - gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level); + u64 *sptep = rcu_dereference(pt) + i; + gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level); + u64 old_child_spte; if (shared) { /* @@ -373,7 +372,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, shared); } - kvm_flush_remote_tlbs_with_address(kvm, gfn, + kvm_flush_remote_tlbs_with_address(kvm, base_gfn, KVM_PAGES_PER_HPAGE(level + 1)); call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback); @@ -489,8 +488,8 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, } /* - * tdp_mmu_set_spte_atomic_no_dirty_log - Set a TDP MMU SPTE atomically - * and handle the associated bookkeeping, but do not mark the page dirty + * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically + * and handle the associated bookkeeping. Do not mark the page dirty * in KVM's dirty bitmaps. * * @kvm: kvm instance @@ -499,10 +498,12 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, * Returns: true if the SPTE was set, false if it was not. If false is returned, * this function will have no side-effects. */ -static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm, - struct tdp_iter *iter, - u64 new_spte) +static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, + struct tdp_iter *iter, + u64 new_spte) { + WARN_ON_ONCE(iter->yielded); + lockdep_assert_held_read(&kvm->mmu_lock); /* @@ -527,43 +528,6 @@ static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm, return true; } -/* - * tdp_mmu_map_set_spte_atomic - Set a leaf TDP MMU SPTE atomically to resolve a - * TDP page fault. - * - * @vcpu: The vcpu instance that took the TDP page fault. - * @iter: a tdp_iter instance currently on the SPTE that should be set - * @new_spte: The value the SPTE should be set to - * - * Returns: true if the SPTE was set, false if it was not. If false is returned, - * this function will have no side-effects. - */ -static inline bool tdp_mmu_map_set_spte_atomic(struct kvm_vcpu *vcpu, - struct tdp_iter *iter, - u64 new_spte) -{ - struct kvm *kvm = vcpu->kvm; - - if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte)) - return false; - - /* - * Use kvm_vcpu_gfn_to_memslot() instead of going through - * handle_changed_spte_dirty_log() to leverage vcpu->last_used_slot. - */ - if (is_writable_pte(new_spte)) { - struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, iter->gfn); - - if (slot && kvm_slot_dirty_track_enabled(slot)) { - /* Enforced by kvm_mmu_hugepage_adjust. */ - WARN_ON_ONCE(iter->level > PG_LEVEL_4K); - mark_page_dirty_in_slot(kvm, slot, iter->gfn); - } - } - - return true; -} - static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm, struct tdp_iter *iter) { @@ -573,7 +537,7 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm, * immediately installing a present entry in its place * before the TLBs are flushed. */ - if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, REMOVED_SPTE)) + if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE)) return false; kvm_flush_remote_tlbs_with_address(kvm, iter->gfn, @@ -613,6 +577,8 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter, u64 new_spte, bool record_acc_track, bool record_dirty_log) { + WARN_ON_ONCE(iter->yielded); + lockdep_assert_held_write(&kvm->mmu_lock); /* @@ -678,18 +644,19 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm, * If this function should yield and flush is set, it will perform a remote * TLB flush before yielding. * - * If this function yields, it will also reset the tdp_iter's walk over the - * paging structure and the calling function should skip to the next - * iteration to allow the iterator to continue its traversal from the - * paging structure root. + * If this function yields, iter->yielded is set and the caller must skip to + * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk + * over the paging structures to allow the iterator to continue its traversal + * from the paging structure root. * - * Return true if this function yielded and the iterator's traversal was reset. - * Return false if a yield was not needed. + * Returns true if this function yielded. */ -static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, - struct tdp_iter *iter, bool flush, - bool shared) +static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm, + struct tdp_iter *iter, + bool flush, bool shared) { + WARN_ON(iter->yielded); + /* Ensure forward progress has been made before yielding. */ if (iter->next_last_level_gfn == iter->yielded_gfn) return false; @@ -709,12 +676,10 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, WARN_ON(iter->gfn > iter->next_last_level_gfn); - tdp_iter_restart(iter); - - return true; + iter->yielded = true; } - return false; + return iter->yielded; } /* @@ -929,26 +894,26 @@ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) * Installs a last-level SPTE to handle a TDP page fault. * (NPT/EPT violation/misconfiguration) */ -static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, - int map_writable, - struct tdp_iter *iter, - kvm_pfn_t pfn, bool prefault) +static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault, + struct tdp_iter *iter) { + struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep)); u64 new_spte; int ret = RET_PF_FIXED; - int make_spte_ret = 0; + bool wrprot = false; - if (unlikely(is_noslot_pfn(pfn))) + WARN_ON(sp->role.level != fault->goal_level); + if (unlikely(!fault->slot)) new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); else - make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn, - pfn, iter->old_spte, prefault, true, - map_writable, !shadow_accessed_mask, - &new_spte); + wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn, + fault->pfn, iter->old_spte, fault->prefetch, true, + fault->map_writable, &new_spte); if (new_spte == iter->old_spte) ret = RET_PF_SPURIOUS; - else if (!tdp_mmu_map_set_spte_atomic(vcpu, iter, new_spte)) + else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) return RET_PF_RETRY; /* @@ -956,10 +921,9 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, * protected, emulation is needed. If the emulation was skipped, * the vCPU would have the same fault again. */ - if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) { - if (write) + if (wrprot) { + if (fault->write) ret = RET_PF_EMULATE; - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */ @@ -986,37 +950,26 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing * page tables and SPTEs to translate the faulting guest physical address. */ -int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, - int map_writable, int max_level, kvm_pfn_t pfn, - bool prefault) +int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(); - bool write = error_code & PFERR_WRITE_MASK; - bool exec = error_code & PFERR_FETCH_MASK; - bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; struct kvm_mmu *mmu = vcpu->arch.mmu; struct tdp_iter iter; struct kvm_mmu_page *sp; u64 *child_pt; u64 new_spte; int ret; - gfn_t gfn = gpa >> PAGE_SHIFT; - int level; - int req_level; - level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, - huge_page_disallowed, &req_level); + kvm_mmu_hugepage_adjust(vcpu, fault); - trace_kvm_mmu_spte_requested(gpa, level, pfn); + trace_kvm_mmu_spte_requested(fault); rcu_read_lock(); - tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { - if (nx_huge_page_workaround_enabled) - disallowed_hugepage_adjust(iter.old_spte, gfn, - iter.level, &pfn, &level); + tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) { + if (fault->nx_huge_page_workaround_enabled) + disallowed_hugepage_adjust(fault, iter.old_spte, iter.level); - if (iter.level == level) + if (iter.level == fault->goal_level) break; /* @@ -1052,10 +1005,10 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, new_spte = make_nonleaf_spte(child_pt, !shadow_accessed_mask); - if (tdp_mmu_set_spte_atomic_no_dirty_log(vcpu->kvm, &iter, new_spte)) { + if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, new_spte)) { tdp_mmu_link_page(vcpu->kvm, sp, - huge_page_disallowed && - req_level >= iter.level); + fault->huge_page_disallowed && + fault->req_level >= iter.level); trace_kvm_mmu_get_page(sp, true); } else { @@ -1065,13 +1018,12 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, } } - if (iter.level != level) { + if (iter.level != fault->goal_level) { rcu_read_unlock(); return RET_PF_RETRY; } - ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter, - pfn, prefault); + ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter); rcu_read_unlock(); return ret; @@ -1082,9 +1034,9 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, { struct kvm_mmu_page *root; - for_each_tdp_mmu_root(kvm, root, range->slot->as_id) - flush |= zap_gfn_range(kvm, root, range->start, range->end, - range->may_block, flush, false); + for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false) + flush = zap_gfn_range(kvm, root, range->start, range->end, + range->may_block, flush, false); return flush; } @@ -1241,8 +1193,7 @@ retry: new_spte = iter.old_spte & ~PT_WRITABLE_MASK; - if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter, - new_spte)) { + if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) { /* * The iter must explicitly re-read the SPTE because * the atomic cmpxchg failed. @@ -1310,8 +1261,7 @@ retry: continue; } - if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter, - new_spte)) { + if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) { /* * The iter must explicitly re-read the SPTE because * the atomic cmpxchg failed. @@ -1415,10 +1365,9 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, * Clear leaf entries which could be replaced by large mappings, for * GFNs within the slot. */ -static bool zap_collapsible_spte_range(struct kvm *kvm, +static void zap_collapsible_spte_range(struct kvm *kvm, struct kvm_mmu_page *root, - const struct kvm_memory_slot *slot, - bool flush) + const struct kvm_memory_slot *slot) { gfn_t start = slot->base_gfn; gfn_t end = start + slot->npages; @@ -1429,10 +1378,8 @@ static bool zap_collapsible_spte_range(struct kvm *kvm, tdp_root_for_each_pte(iter, root, start, end) { retry: - if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) { - flush = false; + if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) continue; - } if (!is_shadow_present_pte(iter.old_spte) || !is_last_spte(iter.old_spte, iter.level)) @@ -1444,6 +1391,7 @@ retry: pfn, PG_LEVEL_NUM)) continue; + /* Note, a successful atomic zap also does a remote TLB flush. */ if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) { /* * The iter must explicitly re-read the SPTE because @@ -1452,30 +1400,24 @@ retry: iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); goto retry; } - flush = true; } rcu_read_unlock(); - - return flush; } /* * Clear non-leaf entries (and free associated page tables) which could * be replaced by large mappings, for GFNs within the slot. */ -bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot, - bool flush) +void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, + const struct kvm_memory_slot *slot) { struct kvm_mmu_page *root; lockdep_assert_held_read(&kvm->mmu_lock); for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) - flush = zap_collapsible_spte_range(kvm, root, slot, flush); - - return flush; + zap_collapsible_spte_range(kvm, root, slot); } /* diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 358f447d4012..3899004a5d91 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -48,9 +48,7 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm); void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm); void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm); -int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, - int map_writable, int max_level, kvm_pfn_t pfn, - bool prefault); +int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, bool flush); @@ -66,9 +64,8 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, unsigned long mask, bool wrprot); -bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot, - bool flush); +void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, + const struct kvm_memory_slot *slot); bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, @@ -92,7 +89,6 @@ u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, #ifdef CONFIG_X86_64 bool kvm_mmu_init_tdp_mmu(struct kvm *kvm); void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); -static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; } static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; } static inline bool is_tdp_mmu(struct kvm_mmu *mmu) @@ -114,7 +110,6 @@ static inline bool is_tdp_mmu(struct kvm_mmu *mmu) #else static inline bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return false; } static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {} -static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; } static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; } static inline bool is_tdp_mmu(struct kvm_mmu *mmu) { return false; } #endif diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 0772bad9165c..261b39cbef6e 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -55,43 +55,41 @@ static void kvm_pmi_trigger_fn(struct irq_work *irq_work) kvm_pmu_deliver_pmi(vcpu); } -static void kvm_perf_overflow(struct perf_event *perf_event, - struct perf_sample_data *data, - struct pt_regs *regs) +static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) { - struct kvm_pmc *pmc = perf_event->overflow_handler_context; struct kvm_pmu *pmu = pmc_to_pmu(pmc); - if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) { - __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); - kvm_make_request(KVM_REQ_PMU, pmc->vcpu); - } + /* Ignore counters that have been reprogrammed already. */ + if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) + return; + + __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); + kvm_make_request(KVM_REQ_PMU, pmc->vcpu); + + if (!pmc->intr) + return; + + /* + * Inject PMI. If vcpu was in a guest mode during NMI PMI + * can be ejected on a guest mode re-entry. Otherwise we can't + * be sure that vcpu wasn't executing hlt instruction at the + * time of vmexit and is not going to re-enter guest mode until + * woken up. So we should wake it, but this is impossible from + * NMI context. Do it from irq work instead. + */ + if (in_pmi && !kvm_handling_nmi_from_guest(pmc->vcpu)) + irq_work_queue(&pmc_to_pmu(pmc)->irq_work); + else + kvm_make_request(KVM_REQ_PMI, pmc->vcpu); } -static void kvm_perf_overflow_intr(struct perf_event *perf_event, - struct perf_sample_data *data, - struct pt_regs *regs) +static void kvm_perf_overflow(struct perf_event *perf_event, + struct perf_sample_data *data, + struct pt_regs *regs) { struct kvm_pmc *pmc = perf_event->overflow_handler_context; - struct kvm_pmu *pmu = pmc_to_pmu(pmc); - - if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) { - __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); - kvm_make_request(KVM_REQ_PMU, pmc->vcpu); - /* - * Inject PMI. If vcpu was in a guest mode during NMI PMI - * can be ejected on a guest mode re-entry. Otherwise we can't - * be sure that vcpu wasn't executing hlt instruction at the - * time of vmexit and is not going to re-enter guest mode until - * woken up. So we should wake it, but this is impossible from - * NMI context. Do it from irq work instead. - */ - if (!kvm_is_in_guest()) - irq_work_queue(&pmc_to_pmu(pmc)->irq_work); - else - kvm_make_request(KVM_REQ_PMI, pmc->vcpu); - } + __kvm_perf_overflow(pmc, true); } static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, @@ -126,7 +124,6 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, } event = perf_event_create_kernel_counter(&attr, -1, current, - intr ? kvm_perf_overflow_intr : kvm_perf_overflow, pmc); if (IS_ERR(event)) { pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n", @@ -138,6 +135,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, pmc_to_pmu(pmc)->event_count++; clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); pmc->is_paused = false; + pmc->intr = intr; } static void pmc_pause_counter(struct kvm_pmc *pmc) @@ -174,7 +172,6 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) { unsigned config, type = PERF_TYPE_RAW; - u8 event_select, unit_mask; struct kvm *kvm = pmc->vcpu->kvm; struct kvm_pmu_event_filter *filter; int i; @@ -206,17 +203,12 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) if (!allow_event) return; - event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT; - unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; - if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE | ARCH_PERFMON_EVENTSEL_INV | ARCH_PERFMON_EVENTSEL_CMASK | HSW_IN_TX | HSW_IN_TX_CHECKPOINTED))) { - config = kvm_x86_ops.pmu_ops->find_arch_event(pmc_to_pmu(pmc), - event_select, - unit_mask); + config = kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc); if (config != PERF_COUNT_HW_MAX) type = PERF_TYPE_HARDWARE; } @@ -268,7 +260,7 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) pmc->current_config = (u64)ctrl; pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE, - kvm_x86_ops.pmu_ops->find_fixed_event(idx), + kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc), !(en_field & 0x2), /* exclude user */ !(en_field & 0x1), /* exclude kernel */ pmi, false, false); @@ -319,7 +311,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) } /* check if idx is a valid index to access PMU */ -int kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) +bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { return kvm_x86_ops.pmu_ops->is_valid_rdpmc_ecx(vcpu, idx); } @@ -490,6 +482,66 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu) kvm_pmu_reset(vcpu); } +static void kvm_pmu_incr_counter(struct kvm_pmc *pmc) +{ + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + u64 prev_count; + + prev_count = pmc->counter; + pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc); + + reprogram_counter(pmu, pmc->idx); + if (pmc->counter < prev_count) + __kvm_perf_overflow(pmc, false); +} + +static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc, + unsigned int perf_hw_id) +{ + u64 old_eventsel = pmc->eventsel; + unsigned int config; + + pmc->eventsel &= (ARCH_PERFMON_EVENTSEL_EVENT | ARCH_PERFMON_EVENTSEL_UMASK); + config = kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc); + pmc->eventsel = old_eventsel; + return config == perf_hw_id; +} + +static inline bool cpl_is_matched(struct kvm_pmc *pmc) +{ + bool select_os, select_user; + u64 config = pmc->current_config; + + if (pmc_is_gp(pmc)) { + select_os = config & ARCH_PERFMON_EVENTSEL_OS; + select_user = config & ARCH_PERFMON_EVENTSEL_USR; + } else { + select_os = config & 0x1; + select_user = config & 0x2; + } + + return (static_call(kvm_x86_get_cpl)(pmc->vcpu) == 0) ? select_os : select_user; +} + +void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + int i; + + for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) { + pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, i); + + if (!pmc || !pmc_is_enabled(pmc) || !pmc_speculative_in_use(pmc)) + continue; + + /* Ignore checks for edge detect, pin control, invert and CMASK bits */ + if (eventsel_match_perf_hw_id(pmc, perf_hw_id) && cpl_is_matched(pmc)) + kvm_pmu_incr_counter(pmc); + } +} +EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event); + int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) { struct kvm_pmu_event_filter tmp, *filter; diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 0e4f2b1fa9fb..7a7b8d5b775e 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -24,15 +24,13 @@ struct kvm_event_hw_type_mapping { }; struct kvm_pmu_ops { - unsigned (*find_arch_event)(struct kvm_pmu *pmu, u8 event_select, - u8 unit_mask); - unsigned (*find_fixed_event)(int idx); + unsigned int (*pmc_perf_hw_id)(struct kvm_pmc *pmc); bool (*pmc_is_enabled)(struct kvm_pmc *pmc); struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx); struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu, unsigned int idx, u64 *mask); struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, u32 msr); - int (*is_valid_rdpmc_ecx)(struct kvm_vcpu *vcpu, unsigned int idx); + bool (*is_valid_rdpmc_ecx)(struct kvm_vcpu *vcpu, unsigned int idx); bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr); int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info); int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info); @@ -149,7 +147,7 @@ void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx); void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); -int kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx); +bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx); bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr); int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); @@ -159,6 +157,7 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu); void kvm_pmu_cleanup(struct kvm_vcpu *vcpu); void kvm_pmu_destroy(struct kvm_vcpu *vcpu); int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp); +void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id); bool is_vmware_backdoor_pmc(u32 pmc_idx); diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 8052d92069e0..0e5b49294086 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -293,7 +293,7 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, u32 icrl, u32 icrh) { struct kvm_vcpu *vcpu; - int i; + unsigned long i; kvm_for_each_vcpu(i, vcpu, kvm) { bool m = kvm_apic_match_dest(vcpu, source, @@ -675,10 +675,18 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) smp_mb__after_atomic(); if (avic_vcpu_is_running(vcpu)) { - int cpuid = vcpu->cpu; + int cpu = READ_ONCE(vcpu->cpu); - if (cpuid != get_cpu()) - wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpuid)); + /* + * Note, the vCPU could get migrated to a different pCPU at any + * point, which could result in signalling the wrong/previous + * pCPU. But if that happens the vCPU is guaranteed to do a + * VMRUN (after being migrated) and thus will process pending + * interrupts, i.e. a doorbell is not needed (and the spurious + * one is harmless). + */ + if (cpu != get_cpu()) + wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); put_cpu(); } else kvm_vcpu_wake_up(vcpu); @@ -900,11 +908,13 @@ out: bool svm_check_apicv_inhibit_reasons(ulong bit) { ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | + BIT(APICV_INHIBIT_REASON_ABSENT) | BIT(APICV_INHIBIT_REASON_HYPERV) | BIT(APICV_INHIBIT_REASON_NESTED) | BIT(APICV_INHIBIT_REASON_IRQWIN) | BIT(APICV_INHIBIT_REASON_PIT_REINJ) | - BIT(APICV_INHIBIT_REASON_X2APIC); + BIT(APICV_INHIBIT_REASON_X2APIC) | + BIT(APICV_INHIBIT_REASON_BLOCKIRQ); return supported & BIT(bit); } @@ -988,16 +998,18 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run) { struct vcpu_svm *svm = to_svm(vcpu); + int cpu = get_cpu(); + WARN_ON(cpu != vcpu->cpu); svm->avic_is_running = is_run; - if (!kvm_vcpu_apicv_active(vcpu)) - return; - - if (is_run) - avic_vcpu_load(vcpu, vcpu->cpu); - else - avic_vcpu_put(vcpu); + if (kvm_vcpu_apicv_active(vcpu)) { + if (is_run) + avic_vcpu_load(vcpu, cpu); + else + avic_vcpu_put(vcpu); + } + put_cpu(); } void svm_vcpu_blocking(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 510b833cbd39..cf206855ebf0 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -58,8 +58,9 @@ static void svm_inject_page_fault_nested(struct kvm_vcpu *vcpu, struct x86_excep struct vcpu_svm *svm = to_svm(vcpu); WARN_ON(!is_guest_mode(vcpu)); - if (vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR) && - !svm->nested.nested_run_pending) { + if (vmcb12_is_intercept(&svm->nested.ctl, + INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR) && + !svm->nested.nested_run_pending) { svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR; svm->vmcb->control.exit_code_hi = 0; svm->vmcb->control.exit_info_1 = fault->error_code; @@ -121,7 +122,8 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) void recalc_intercepts(struct vcpu_svm *svm) { - struct vmcb_control_area *c, *h, *g; + struct vmcb_control_area *c, *h; + struct vmcb_ctrl_area_cached *g; unsigned int i; vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); @@ -163,37 +165,6 @@ void recalc_intercepts(struct vcpu_svm *svm) vmcb_set_intercept(c, INTERCEPT_VMSAVE); } -static void copy_vmcb_control_area(struct vmcb_control_area *dst, - struct vmcb_control_area *from) -{ - unsigned int i; - - for (i = 0; i < MAX_INTERCEPT; i++) - dst->intercepts[i] = from->intercepts[i]; - - dst->iopm_base_pa = from->iopm_base_pa; - dst->msrpm_base_pa = from->msrpm_base_pa; - dst->tsc_offset = from->tsc_offset; - /* asid not copied, it is handled manually for svm->vmcb. */ - dst->tlb_ctl = from->tlb_ctl; - dst->int_ctl = from->int_ctl; - dst->int_vector = from->int_vector; - dst->int_state = from->int_state; - dst->exit_code = from->exit_code; - dst->exit_code_hi = from->exit_code_hi; - dst->exit_info_1 = from->exit_info_1; - dst->exit_info_2 = from->exit_info_2; - dst->exit_int_info = from->exit_int_info; - dst->exit_int_info_err = from->exit_int_info_err; - dst->nested_ctl = from->nested_ctl; - dst->event_inj = from->event_inj; - dst->event_inj_err = from->event_inj_err; - dst->nested_cr3 = from->nested_cr3; - dst->virt_ext = from->virt_ext; - dst->pause_filter_count = from->pause_filter_count; - dst->pause_filter_thresh = from->pause_filter_thresh; -} - static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) { /* @@ -203,7 +174,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) */ int i; - if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) + if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) return true; for (i = 0; i < MSRPM_OFFSETS; i++) { @@ -238,10 +209,22 @@ static bool nested_svm_check_bitmap_pa(struct kvm_vcpu *vcpu, u64 pa, u32 size) kvm_vcpu_is_legal_gpa(vcpu, addr + size - 1); } -static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu, - struct vmcb_control_area *control) +static bool nested_svm_check_tlb_ctl(struct kvm_vcpu *vcpu, u8 tlb_ctl) +{ + /* Nested FLUSHBYASID is not supported yet. */ + switch(tlb_ctl) { + case TLB_CONTROL_DO_NOTHING: + case TLB_CONTROL_FLUSH_ALL_ASID: + return true; + default: + return false; + } +} + +static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu, + struct vmcb_ctrl_area_cached *control) { - if (CC(!vmcb_is_intercept(control, INTERCEPT_VMRUN))) + if (CC(!vmcb12_is_intercept(control, INTERCEPT_VMRUN))) return false; if (CC(control->asid == 0)) @@ -257,12 +240,26 @@ static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu, IOPM_SIZE))) return false; + if (CC(!nested_svm_check_tlb_ctl(vcpu, control->tlb_ctl))) + return false; + return true; } -static bool nested_vmcb_check_cr3_cr4(struct kvm_vcpu *vcpu, - struct vmcb_save_area *save) +/* Common checks that apply to both L1 and L2 state. */ +static bool __nested_vmcb_check_save(struct kvm_vcpu *vcpu, + struct vmcb_save_area_cached *save) { + if (CC(!(save->efer & EFER_SVME))) + return false; + + if (CC((save->cr0 & X86_CR0_CD) == 0 && (save->cr0 & X86_CR0_NW)) || + CC(save->cr0 & ~0xffffffffULL)) + return false; + + if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7))) + return false; + /* * These checks are also performed by KVM_SET_SREGS, * except that EFER.LMA is not checked by SVM against @@ -278,48 +275,90 @@ static bool nested_vmcb_check_cr3_cr4(struct kvm_vcpu *vcpu, if (CC(!kvm_is_valid_cr4(vcpu, save->cr4))) return false; + if (CC(!kvm_valid_efer(vcpu, save->efer))) + return false; + return true; } -/* Common checks that apply to both L1 and L2 state. */ -static bool nested_vmcb_valid_sregs(struct kvm_vcpu *vcpu, - struct vmcb_save_area *save) +static bool nested_vmcb_check_save(struct kvm_vcpu *vcpu) { - /* - * FIXME: these should be done after copying the fields, - * to avoid TOC/TOU races. For these save area checks - * the possible damage is limited since kvm_set_cr0 and - * kvm_set_cr4 handle failure; EFER_SVME is an exception - * so it is force-set later in nested_prepare_vmcb_save. - */ - if (CC(!(save->efer & EFER_SVME))) - return false; + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_save_area_cached *save = &svm->nested.save; - if (CC((save->cr0 & X86_CR0_CD) == 0 && (save->cr0 & X86_CR0_NW)) || - CC(save->cr0 & ~0xffffffffULL)) - return false; + return __nested_vmcb_check_save(vcpu, save); +} - if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7))) - return false; +static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_ctrl_area_cached *ctl = &svm->nested.ctl; - if (!nested_vmcb_check_cr3_cr4(vcpu, save)) - return false; + return __nested_vmcb_check_controls(vcpu, ctl); +} - if (CC(!kvm_valid_efer(vcpu, save->efer))) - return false; +static +void __nested_copy_vmcb_control_to_cache(struct vmcb_ctrl_area_cached *to, + struct vmcb_control_area *from) +{ + unsigned int i; - return true; + for (i = 0; i < MAX_INTERCEPT; i++) + to->intercepts[i] = from->intercepts[i]; + + to->iopm_base_pa = from->iopm_base_pa; + to->msrpm_base_pa = from->msrpm_base_pa; + to->tsc_offset = from->tsc_offset; + to->tlb_ctl = from->tlb_ctl; + to->int_ctl = from->int_ctl; + to->int_vector = from->int_vector; + to->int_state = from->int_state; + to->exit_code = from->exit_code; + to->exit_code_hi = from->exit_code_hi; + to->exit_info_1 = from->exit_info_1; + to->exit_info_2 = from->exit_info_2; + to->exit_int_info = from->exit_int_info; + to->exit_int_info_err = from->exit_int_info_err; + to->nested_ctl = from->nested_ctl; + to->event_inj = from->event_inj; + to->event_inj_err = from->event_inj_err; + to->nested_cr3 = from->nested_cr3; + to->virt_ext = from->virt_ext; + to->pause_filter_count = from->pause_filter_count; + to->pause_filter_thresh = from->pause_filter_thresh; + + /* Copy asid here because nested_vmcb_check_controls will check it. */ + to->asid = from->asid; + to->msrpm_base_pa &= ~0x0fffULL; + to->iopm_base_pa &= ~0x0fffULL; } -void nested_load_control_from_vmcb12(struct vcpu_svm *svm, - struct vmcb_control_area *control) +void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm, + struct vmcb_control_area *control) { - copy_vmcb_control_area(&svm->nested.ctl, control); + __nested_copy_vmcb_control_to_cache(&svm->nested.ctl, control); +} - /* Copy it here because nested_svm_check_controls will check it. */ - svm->nested.ctl.asid = control->asid; - svm->nested.ctl.msrpm_base_pa &= ~0x0fffULL; - svm->nested.ctl.iopm_base_pa &= ~0x0fffULL; +static void __nested_copy_vmcb_save_to_cache(struct vmcb_save_area_cached *to, + struct vmcb_save_area *from) +{ + /* + * Copy only fields that are validated, as we need them + * to avoid TOC/TOU races. + */ + to->efer = from->efer; + to->cr0 = from->cr0; + to->cr3 = from->cr3; + to->cr4 = from->cr4; + + to->dr6 = from->dr6; + to->dr7 = from->dr7; +} + +void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm, + struct vmcb_save_area *save) +{ + __nested_copy_vmcb_save_to_cache(&svm->nested.save, save); } /* @@ -422,14 +461,13 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, return -EINVAL; if (reload_pdptrs && !nested_npt && is_pae_paging(vcpu) && - CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) + CC(!load_pdptrs(vcpu, cr3))) return -EINVAL; if (!nested_npt) kvm_mmu_new_pgd(vcpu, cr3); vcpu->arch.cr3 = cr3; - kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */ kvm_init_mmu(vcpu); @@ -475,15 +513,10 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED); - /* - * Force-set EFER_SVME even though it is checked earlier on the - * VMCB12, because the guest can flip the bit between the check - * and now. Clearing EFER_SVME would call svm_free_nested. - */ - svm_set_efer(&svm->vcpu, vmcb12->save.efer | EFER_SVME); + svm_set_efer(&svm->vcpu, svm->nested.save.efer); - svm_set_cr0(&svm->vcpu, vmcb12->save.cr0); - svm_set_cr4(&svm->vcpu, vmcb12->save.cr4); + svm_set_cr0(&svm->vcpu, svm->nested.save.cr0); + svm_set_cr4(&svm->vcpu, svm->nested.save.cr4); svm->vcpu.arch.cr2 = vmcb12->save.cr2; @@ -498,8 +531,8 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 /* These bits will be set properly on the first execution when new_vmc12 is true */ if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) { - svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1; - svm->vcpu.arch.dr6 = vmcb12->save.dr6 | DR6_ACTIVE_LOW; + svm->vmcb->save.dr7 = svm->nested.save.dr7 | DR7_FIXED_1; + svm->vcpu.arch.dr6 = svm->nested.save.dr6 | DR6_ACTIVE_LOW; vmcb_mark_dirty(svm->vmcb, VMCB_DR); } } @@ -538,8 +571,17 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) if (nested_npt_enabled(svm)) nested_svm_init_mmu_context(vcpu); - svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset = - vcpu->arch.l1_tsc_offset + svm->nested.ctl.tsc_offset; + vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( + vcpu->arch.l1_tsc_offset, + svm->nested.ctl.tsc_offset, + svm->tsc_ratio_msr); + + svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset; + + if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) { + WARN_ON(!svm->tsc_scaling_enabled); + nested_svm_update_tsc_ratio_msr(vcpu); + } svm->vmcb->control.int_ctl = (svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) | @@ -550,9 +592,6 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) svm->vmcb->control.event_inj = svm->nested.ctl.event_inj; svm->vmcb->control.event_inj_err = svm->nested.ctl.event_inj_err; - svm->vmcb->control.pause_filter_count = svm->nested.ctl.pause_filter_count; - svm->vmcb->control.pause_filter_thresh = svm->nested.ctl.pause_filter_thresh; - nested_svm_transition_tlb_flush(vcpu); /* Enter Guest-Mode */ @@ -607,7 +646,7 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, nested_vmcb02_prepare_control(svm); nested_vmcb02_prepare_save(svm, vmcb12); - ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3, + ret = nested_svm_load_cr3(&svm->vcpu, svm->nested.save.cr3, nested_npt_enabled(svm), from_vmrun); if (ret) return ret; @@ -657,10 +696,11 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) if (WARN_ON_ONCE(!svm->nested.initialized)) return -EINVAL; - nested_load_control_from_vmcb12(svm, &vmcb12->control); + nested_copy_vmcb_control_to_cache(svm, &vmcb12->control); + nested_copy_vmcb_save_to_cache(svm, &vmcb12->save); - if (!nested_vmcb_valid_sregs(vcpu, &vmcb12->save) || - !nested_vmcb_check_controls(vcpu, &svm->nested.ctl)) { + if (!nested_vmcb_check_save(vcpu) || + !nested_vmcb_check_controls(vcpu)) { vmcb12->control.exit_code = SVM_EXIT_ERR; vmcb12->control.exit_code_hi = 0; vmcb12->control.exit_info_1 = 0; @@ -810,11 +850,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->control.event_inj = svm->nested.ctl.event_inj; vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err; - vmcb12->control.pause_filter_count = - svm->vmcb->control.pause_filter_count; - vmcb12->control.pause_filter_thresh = - svm->vmcb->control.pause_filter_thresh; - nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr); svm_switch_vmcb(svm, &svm->vmcb01); @@ -832,6 +867,12 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } + if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) { + WARN_ON(!svm->tsc_scaling_enabled); + vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; + svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio); + } + svm->nested.ctl.nested_cr3 = 0; /* @@ -966,7 +1007,7 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) u32 offset, msr, value; int write, mask; - if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) + if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT))) return NESTED_EXIT_HOST; msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; @@ -993,7 +1034,7 @@ static int nested_svm_intercept_ioio(struct vcpu_svm *svm) u8 start_bit; u64 gpa; - if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_IOIO_PROT))) + if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_IOIO_PROT))) return NESTED_EXIT_HOST; port = svm->vmcb->control.exit_info_1 >> 16; @@ -1024,12 +1065,12 @@ static int nested_svm_intercept(struct vcpu_svm *svm) vmexit = nested_svm_intercept_ioio(svm); break; case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: { - if (vmcb_is_intercept(&svm->nested.ctl, exit_code)) + if (vmcb12_is_intercept(&svm->nested.ctl, exit_code)) vmexit = NESTED_EXIT_DONE; break; } case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: { - if (vmcb_is_intercept(&svm->nested.ctl, exit_code)) + if (vmcb12_is_intercept(&svm->nested.ctl, exit_code)) vmexit = NESTED_EXIT_DONE; break; } @@ -1047,7 +1088,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm) break; } default: { - if (vmcb_is_intercept(&svm->nested.ctl, exit_code)) + if (vmcb12_is_intercept(&svm->nested.ctl, exit_code)) vmexit = NESTED_EXIT_DONE; } } @@ -1125,7 +1166,7 @@ static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm) static inline bool nested_exit_on_init(struct vcpu_svm *svm) { - return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INIT); + return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_INIT); } static int svm_check_nested_events(struct kvm_vcpu *vcpu) @@ -1219,11 +1260,57 @@ int nested_svm_exit_special(struct vcpu_svm *svm) return NESTED_EXIT_CONTINUE; } +void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + vcpu->arch.tsc_scaling_ratio = + kvm_calc_nested_tsc_multiplier(vcpu->arch.l1_tsc_scaling_ratio, + svm->tsc_ratio_msr); + svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio); +} + +/* Inverse operation of nested_copy_vmcb_control_to_cache(). asid is copied too. */ +static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst, + struct vmcb_ctrl_area_cached *from) +{ + unsigned int i; + + memset(dst, 0, sizeof(struct vmcb_control_area)); + + for (i = 0; i < MAX_INTERCEPT; i++) + dst->intercepts[i] = from->intercepts[i]; + + dst->iopm_base_pa = from->iopm_base_pa; + dst->msrpm_base_pa = from->msrpm_base_pa; + dst->tsc_offset = from->tsc_offset; + dst->asid = from->asid; + dst->tlb_ctl = from->tlb_ctl; + dst->int_ctl = from->int_ctl; + dst->int_vector = from->int_vector; + dst->int_state = from->int_state; + dst->exit_code = from->exit_code; + dst->exit_code_hi = from->exit_code_hi; + dst->exit_info_1 = from->exit_info_1; + dst->exit_info_2 = from->exit_info_2; + dst->exit_int_info = from->exit_int_info; + dst->exit_int_info_err = from->exit_int_info_err; + dst->nested_ctl = from->nested_ctl; + dst->event_inj = from->event_inj; + dst->event_inj_err = from->event_inj_err; + dst->nested_cr3 = from->nested_cr3; + dst->virt_ext = from->virt_ext; + dst->pause_filter_count = from->pause_filter_count; + dst->pause_filter_thresh = from->pause_filter_thresh; +} + static int svm_get_nested_state(struct kvm_vcpu *vcpu, struct kvm_nested_state __user *user_kvm_nested_state, u32 user_data_size) { struct vcpu_svm *svm; + struct vmcb_control_area *ctl; + unsigned long r; struct kvm_nested_state kvm_state = { .flags = 0, .format = KVM_STATE_NESTED_FORMAT_SVM, @@ -1265,9 +1352,18 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu, */ if (clear_user(user_vmcb, KVM_STATE_NESTED_SVM_VMCB_SIZE)) return -EFAULT; - if (copy_to_user(&user_vmcb->control, &svm->nested.ctl, - sizeof(user_vmcb->control))) + + ctl = kzalloc(sizeof(*ctl), GFP_KERNEL); + if (!ctl) + return -ENOMEM; + + nested_copy_vmcb_cache_to_control(ctl, &svm->nested.ctl); + r = copy_to_user(&user_vmcb->control, ctl, + sizeof(user_vmcb->control)); + kfree(ctl); + if (r) return -EFAULT; + if (copy_to_user(&user_vmcb->save, &svm->vmcb01.ptr->save, sizeof(user_vmcb->save))) return -EFAULT; @@ -1284,6 +1380,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, &user_kvm_nested_state->data.svm[0]; struct vmcb_control_area *ctl; struct vmcb_save_area *save; + struct vmcb_save_area_cached save_cached; + struct vmcb_ctrl_area_cached ctl_cached; unsigned long cr0; int ret; @@ -1336,7 +1434,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, goto out_free; ret = -EINVAL; - if (!nested_vmcb_check_controls(vcpu, ctl)) + __nested_copy_vmcb_control_to_cache(&ctl_cached, ctl); + if (!__nested_vmcb_check_controls(vcpu, &ctl_cached)) goto out_free; /* @@ -1351,10 +1450,11 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, * Validate host state saved from before VMRUN (see * nested_svm_check_permissions). */ + __nested_copy_vmcb_save_to_cache(&save_cached, save); if (!(save->cr0 & X86_CR0_PG) || !(save->cr0 & X86_CR0_PE) || (save->rflags & X86_EFLAGS_VM) || - !nested_vmcb_valid_sregs(vcpu, save)) + !__nested_vmcb_check_save(vcpu, &save_cached)) goto out_free; /* @@ -1390,7 +1490,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa; svm_copy_vmrun_state(&svm->vmcb01.ptr->save, save); - nested_load_control_from_vmcb12(svm, ctl); + nested_copy_vmcb_control_to_cache(svm, ctl); svm_switch_vmcb(svm, &svm->nested.vmcb02); nested_vmcb02_prepare_control(svm); @@ -1417,7 +1517,7 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) * the guest CR3 might be restored prior to setting the nested * state which can lead to a load of wrong PDPTRs. */ - if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3))) + if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3))) return false; if (!nested_svm_vmrun_msrpm(svm)) { diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index fdf587f19c5f..12d8b301065a 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -16,6 +16,7 @@ #include "cpuid.h" #include "lapic.h" #include "pmu.h" +#include "svm.h" enum pmu_type { PMU_TYPE_COUNTER = 0, @@ -100,6 +101,9 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, { struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); + if (!pmu) + return NULL; + switch (msr) { case MSR_F15H_PERF_CTL0: case MSR_F15H_PERF_CTL1: @@ -134,12 +138,16 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, return &pmu->gp_counters[msr_to_index(msr)]; } -static unsigned amd_find_arch_event(struct kvm_pmu *pmu, - u8 event_select, - u8 unit_mask) +static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc) { + u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT; + u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; int i; + /* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */ + if (WARN_ON(pmc_is_fixed(pmc))) + return PERF_COUNT_HW_MAX; + for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++) if (amd_event_mapping[i].eventsel == event_select && amd_event_mapping[i].unit_mask == unit_mask) @@ -151,12 +159,6 @@ static unsigned amd_find_arch_event(struct kvm_pmu *pmu, return amd_event_mapping[i].event_type; } -/* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */ -static unsigned amd_find_fixed_event(int idx) -{ - return PERF_COUNT_HW_MAX; -} - /* check if a PMC is enabled by comparing it against global_ctrl bits. Because * AMD CPU doesn't have global_ctrl MSR, all PMCs are enabled (return TRUE). */ @@ -181,14 +183,13 @@ static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) return get_gp_pmc_amd(pmu, base + pmc_idx, PMU_TYPE_COUNTER); } -/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */ -static int amd_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) +static bool amd_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); idx &= ~(3u << 30); - return (idx >= pmu->nr_arch_gp_counters); + return idx < pmu->nr_arch_gp_counters; } /* idx is the ECX register of RDPMC instruction */ @@ -282,7 +283,7 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu) pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS; pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1; - pmu->reserved_bits = 0xffffffff00200000ull; + pmu->reserved_bits = 0xfffffff000280000ull; pmu->version = 1; /* not applicable to AMD; but clean them to prevent any fall out */ pmu->counter_bitmask[KVM_PMC_FIXED] = 0; @@ -320,8 +321,7 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu) } struct kvm_pmu_ops amd_pmu_ops = { - .find_arch_event = amd_find_arch_event, - .find_fixed_event = amd_find_fixed_event, + .pmc_perf_hw_id = amd_pmc_perf_hw_id, .pmc_is_enabled = amd_pmc_is_enabled, .pmc_idx_to_pmc = amd_pmc_idx_to_pmc, .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc, diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index c36b5fe4c27c..6a22798eaaee 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -17,10 +17,10 @@ #include <linux/misc_cgroup.h> #include <linux/processor.h> #include <linux/trace_events.h> -#include <asm/fpu/internal.h> #include <asm/pkru.h> #include <asm/trapnr.h> +#include <asm/fpu/xcr.h> #include "x86.h" #include "svm.h" @@ -120,16 +120,26 @@ static bool __sev_recycle_asids(int min_asid, int max_asid) return true; } +static int sev_misc_cg_try_charge(struct kvm_sev_info *sev) +{ + enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV; + return misc_cg_try_charge(type, sev->misc_cg, 1); +} + +static void sev_misc_cg_uncharge(struct kvm_sev_info *sev) +{ + enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV; + misc_cg_uncharge(type, sev->misc_cg, 1); +} + static int sev_asid_new(struct kvm_sev_info *sev) { int asid, min_asid, max_asid, ret; bool retry = true; - enum misc_res_type type; - type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV; WARN_ON(sev->misc_cg); sev->misc_cg = get_current_misc_cg(); - ret = misc_cg_try_charge(type, sev->misc_cg, 1); + ret = sev_misc_cg_try_charge(sev); if (ret) { put_misc_cg(sev->misc_cg); sev->misc_cg = NULL; @@ -162,7 +172,7 @@ again: return asid; e_uncharge: - misc_cg_uncharge(type, sev->misc_cg, 1); + sev_misc_cg_uncharge(sev); put_misc_cg(sev->misc_cg); sev->misc_cg = NULL; return ret; @@ -179,7 +189,6 @@ static void sev_asid_free(struct kvm_sev_info *sev) { struct svm_cpu_data *sd; int cpu; - enum misc_res_type type; mutex_lock(&sev_bitmap_lock); @@ -192,8 +201,7 @@ static void sev_asid_free(struct kvm_sev_info *sev) mutex_unlock(&sev_bitmap_lock); - type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV; - misc_cg_uncharge(type, sev->misc_cg, 1); + sev_misc_cg_uncharge(sev); put_misc_cg(sev->misc_cg); sev->misc_cg = NULL; } @@ -229,7 +237,6 @@ static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - bool es_active = argp->id == KVM_SEV_ES_INIT; int asid, ret; if (kvm->created_vcpus) @@ -239,7 +246,8 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) if (unlikely(sev->active)) return ret; - sev->es_active = es_active; + sev->active = true; + sev->es_active = argp->id == KVM_SEV_ES_INIT; asid = sev_asid_new(sev); if (asid < 0) goto e_no_asid; @@ -249,8 +257,6 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) if (ret) goto e_free; - sev->active = true; - sev->asid = asid; INIT_LIST_HEAD(&sev->regions_list); return 0; @@ -260,6 +266,7 @@ e_free: sev->asid = 0; e_no_asid: sev->es_active = false; + sev->active = false; return ret; } @@ -590,7 +597,7 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) * traditional VMSA as it has been built so far (in prep * for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state. */ - memcpy(svm->vmsa, save, sizeof(*save)); + memcpy(svm->sev_es.vmsa, save, sizeof(*save)); return 0; } @@ -612,19 +619,25 @@ static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu, * the VMSA memory content (i.e it will write the same memory region * with the guest's key), so invalidate it first. */ - clflush_cache_range(svm->vmsa, PAGE_SIZE); + clflush_cache_range(svm->sev_es.vmsa, PAGE_SIZE); vmsa.reserved = 0; vmsa.handle = to_kvm_svm(kvm)->sev_info.handle; - vmsa.address = __sme_pa(svm->vmsa); + vmsa.address = __sme_pa(svm->sev_es.vmsa); vmsa.len = PAGE_SIZE; - return sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error); + ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error); + if (ret) + return ret; + + vcpu->arch.guest_state_protected = true; + return 0; } static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) { struct kvm_vcpu *vcpu; - int i, ret; + unsigned long i; + int ret; if (!sev_es_guest(kvm)) return -ENOTTY; @@ -1479,6 +1492,13 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) goto e_free_trans; } + /* + * Flush (on non-coherent CPUs) before RECEIVE_UPDATE_DATA, the PSP + * encrypts the written data with the guest's key, and the cache may + * contain dirty, unencrypted data. + */ + sev_clflush_pages(guest_page, n); + /* The RECEIVE_UPDATE_DATA command requires C-bit to be always set. */ data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset; data.guest_address |= sev_me_mask; @@ -1510,7 +1530,7 @@ static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error); } -static bool cmd_allowed_from_miror(u32 cmd_id) +static bool is_cmd_allowed_from_mirror(u32 cmd_id) { /* * Allow mirrors VM to call KVM_SEV_LAUNCH_UPDATE_VMSA to enable SEV-ES @@ -1524,6 +1544,223 @@ static bool cmd_allowed_from_miror(u32 cmd_id) return false; } +static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) +{ + struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info; + struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info; + int r = -EBUSY; + + if (dst_kvm == src_kvm) + return -EINVAL; + + /* + * Bail if these VMs are already involved in a migration to avoid + * deadlock between two VMs trying to migrate to/from each other. + */ + if (atomic_cmpxchg_acquire(&dst_sev->migration_in_progress, 0, 1)) + return -EBUSY; + + if (atomic_cmpxchg_acquire(&src_sev->migration_in_progress, 0, 1)) + goto release_dst; + + r = -EINTR; + if (mutex_lock_killable(&dst_kvm->lock)) + goto release_src; + if (mutex_lock_killable_nested(&src_kvm->lock, SINGLE_DEPTH_NESTING)) + goto unlock_dst; + return 0; + +unlock_dst: + mutex_unlock(&dst_kvm->lock); +release_src: + atomic_set_release(&src_sev->migration_in_progress, 0); +release_dst: + atomic_set_release(&dst_sev->migration_in_progress, 0); + return r; +} + +static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) +{ + struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info; + struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info; + + mutex_unlock(&dst_kvm->lock); + mutex_unlock(&src_kvm->lock); + atomic_set_release(&dst_sev->migration_in_progress, 0); + atomic_set_release(&src_sev->migration_in_progress, 0); +} + + +static int sev_lock_vcpus_for_migration(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + unsigned long i, j; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (mutex_lock_killable(&vcpu->mutex)) + goto out_unlock; + } + + return 0; + +out_unlock: + kvm_for_each_vcpu(j, vcpu, kvm) { + if (i == j) + break; + + mutex_unlock(&vcpu->mutex); + } + return -EINTR; +} + +static void sev_unlock_vcpus_for_migration(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + unsigned long i; + + kvm_for_each_vcpu(i, vcpu, kvm) { + mutex_unlock(&vcpu->mutex); + } +} + +static void sev_migrate_from(struct kvm_sev_info *dst, + struct kvm_sev_info *src) +{ + dst->active = true; + dst->asid = src->asid; + dst->handle = src->handle; + dst->pages_locked = src->pages_locked; + dst->enc_context_owner = src->enc_context_owner; + + src->asid = 0; + src->active = false; + src->handle = 0; + src->pages_locked = 0; + src->enc_context_owner = NULL; + + list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list); +} + +static int sev_es_migrate_from(struct kvm *dst, struct kvm *src) +{ + unsigned long i; + struct kvm_vcpu *dst_vcpu, *src_vcpu; + struct vcpu_svm *dst_svm, *src_svm; + + if (atomic_read(&src->online_vcpus) != atomic_read(&dst->online_vcpus)) + return -EINVAL; + + kvm_for_each_vcpu(i, src_vcpu, src) { + if (!src_vcpu->arch.guest_state_protected) + return -EINVAL; + } + + kvm_for_each_vcpu(i, src_vcpu, src) { + src_svm = to_svm(src_vcpu); + dst_vcpu = kvm_get_vcpu(dst, i); + dst_svm = to_svm(dst_vcpu); + + /* + * Transfer VMSA and GHCB state to the destination. Nullify and + * clear source fields as appropriate, the state now belongs to + * the destination. + */ + memcpy(&dst_svm->sev_es, &src_svm->sev_es, sizeof(src_svm->sev_es)); + dst_svm->vmcb->control.ghcb_gpa = src_svm->vmcb->control.ghcb_gpa; + dst_svm->vmcb->control.vmsa_pa = src_svm->vmcb->control.vmsa_pa; + dst_vcpu->arch.guest_state_protected = true; + + memset(&src_svm->sev_es, 0, sizeof(src_svm->sev_es)); + src_svm->vmcb->control.ghcb_gpa = INVALID_PAGE; + src_svm->vmcb->control.vmsa_pa = INVALID_PAGE; + src_vcpu->arch.guest_state_protected = false; + } + to_kvm_svm(src)->sev_info.es_active = false; + to_kvm_svm(dst)->sev_info.es_active = true; + + return 0; +} + +int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd) +{ + struct kvm_sev_info *dst_sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *src_sev, *cg_cleanup_sev; + struct file *source_kvm_file; + struct kvm *source_kvm; + bool charged = false; + int ret; + + source_kvm_file = fget(source_fd); + if (!file_is_kvm(source_kvm_file)) { + ret = -EBADF; + goto out_fput; + } + + source_kvm = source_kvm_file->private_data; + ret = sev_lock_two_vms(kvm, source_kvm); + if (ret) + goto out_fput; + + if (sev_guest(kvm) || !sev_guest(source_kvm)) { + ret = -EINVAL; + goto out_unlock; + } + + src_sev = &to_kvm_svm(source_kvm)->sev_info; + + /* + * VMs mirroring src's encryption context rely on it to keep the + * ASID allocated, but below we are clearing src_sev->asid. + */ + if (src_sev->num_mirrored_vms) { + ret = -EBUSY; + goto out_unlock; + } + + dst_sev->misc_cg = get_current_misc_cg(); + cg_cleanup_sev = dst_sev; + if (dst_sev->misc_cg != src_sev->misc_cg) { + ret = sev_misc_cg_try_charge(dst_sev); + if (ret) + goto out_dst_cgroup; + charged = true; + } + + ret = sev_lock_vcpus_for_migration(kvm); + if (ret) + goto out_dst_cgroup; + ret = sev_lock_vcpus_for_migration(source_kvm); + if (ret) + goto out_dst_vcpu; + + if (sev_es_guest(source_kvm)) { + ret = sev_es_migrate_from(kvm, source_kvm); + if (ret) + goto out_source_vcpu; + } + sev_migrate_from(dst_sev, src_sev); + kvm_vm_dead(source_kvm); + cg_cleanup_sev = src_sev; + ret = 0; + +out_source_vcpu: + sev_unlock_vcpus_for_migration(source_kvm); +out_dst_vcpu: + sev_unlock_vcpus_for_migration(kvm); +out_dst_cgroup: + /* Operates on the source on success, on the destination on failure. */ + if (charged) + sev_misc_cg_uncharge(cg_cleanup_sev); + put_misc_cg(cg_cleanup_sev->misc_cg); + cg_cleanup_sev->misc_cg = NULL; +out_unlock: + sev_unlock_two_vms(kvm, source_kvm); +out_fput: + if (source_kvm_file) + fput(source_kvm_file); + return ret; +} + int svm_mem_enc_op(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; @@ -1542,7 +1779,7 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp) /* Only the enc_context_owner handles some memory enc operations. */ if (is_mirroring_enc_context(kvm) && - !cmd_allowed_from_miror(sev_cmd.id)) { + !is_cmd_allowed_from_mirror(sev_cmd.id)) { r = -EINVAL; goto out; } @@ -1739,71 +1976,60 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd) { struct file *source_kvm_file; struct kvm *source_kvm; - struct kvm_sev_info source_sev, *mirror_sev; + struct kvm_sev_info *source_sev, *mirror_sev; int ret; source_kvm_file = fget(source_fd); if (!file_is_kvm(source_kvm_file)) { ret = -EBADF; - goto e_source_put; + goto e_source_fput; } source_kvm = source_kvm_file->private_data; - mutex_lock(&source_kvm->lock); - - if (!sev_guest(source_kvm)) { - ret = -EINVAL; - goto e_source_unlock; - } + ret = sev_lock_two_vms(kvm, source_kvm); + if (ret) + goto e_source_fput; - /* Mirrors of mirrors should work, but let's not get silly */ - if (is_mirroring_enc_context(source_kvm) || source_kvm == kvm) { + /* + * Mirrors of mirrors should work, but let's not get silly. Also + * disallow out-of-band SEV/SEV-ES init if the target is already an + * SEV guest, or if vCPUs have been created. KVM relies on vCPUs being + * created after SEV/SEV-ES initialization, e.g. to init intercepts. + */ + if (sev_guest(kvm) || !sev_guest(source_kvm) || + is_mirroring_enc_context(source_kvm) || kvm->created_vcpus) { ret = -EINVAL; - goto e_source_unlock; + goto e_unlock; } - memcpy(&source_sev, &to_kvm_svm(source_kvm)->sev_info, - sizeof(source_sev)); - /* * The mirror kvm holds an enc_context_owner ref so its asid can't * disappear until we're done with it */ + source_sev = &to_kvm_svm(source_kvm)->sev_info; kvm_get_kvm(source_kvm); - - fput(source_kvm_file); - mutex_unlock(&source_kvm->lock); - mutex_lock(&kvm->lock); - - if (sev_guest(kvm)) { - ret = -EINVAL; - goto e_mirror_unlock; - } + source_sev->num_mirrored_vms++; /* Set enc_context_owner and copy its encryption context over */ mirror_sev = &to_kvm_svm(kvm)->sev_info; mirror_sev->enc_context_owner = source_kvm; mirror_sev->active = true; - mirror_sev->asid = source_sev.asid; - mirror_sev->fd = source_sev.fd; - mirror_sev->es_active = source_sev.es_active; - mirror_sev->handle = source_sev.handle; + mirror_sev->asid = source_sev->asid; + mirror_sev->fd = source_sev->fd; + mirror_sev->es_active = source_sev->es_active; + mirror_sev->handle = source_sev->handle; + INIT_LIST_HEAD(&mirror_sev->regions_list); + ret = 0; + /* * Do not copy ap_jump_table. Since the mirror does not share the same * KVM contexts as the original, and they may have different * memory-views. */ - mutex_unlock(&kvm->lock); - return 0; - -e_mirror_unlock: - mutex_unlock(&kvm->lock); - kvm_put_kvm(source_kvm); - return ret; -e_source_unlock: - mutex_unlock(&source_kvm->lock); -e_source_put: +e_unlock: + sev_unlock_two_vms(kvm, source_kvm); +e_source_fput: if (source_kvm_file) fput(source_kvm_file); return ret; @@ -1815,17 +2041,24 @@ void sev_vm_destroy(struct kvm *kvm) struct list_head *head = &sev->regions_list; struct list_head *pos, *q; + WARN_ON(sev->num_mirrored_vms); + if (!sev_guest(kvm)) return; /* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */ if (is_mirroring_enc_context(kvm)) { - kvm_put_kvm(sev->enc_context_owner); + struct kvm *owner_kvm = sev->enc_context_owner; + struct kvm_sev_info *owner_sev = &to_kvm_svm(owner_kvm)->sev_info; + + mutex_lock(&owner_kvm->lock); + if (!WARN_ON(!owner_sev->num_mirrored_vms)) + owner_sev->num_mirrored_vms--; + mutex_unlock(&owner_kvm->lock); + kvm_put_kvm(owner_kvm); return; } - mutex_lock(&kvm->lock); - /* * Ensure that all guest tagged cache entries are flushed before * releasing the pages back to the system for use. CLFLUSH will @@ -1845,8 +2078,6 @@ void sev_vm_destroy(struct kvm *kvm) } } - mutex_unlock(&kvm->lock); - sev_unbind_asid(kvm, sev->handle); sev_asid_free(sev); } @@ -2026,16 +2257,16 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu) svm = to_svm(vcpu); if (vcpu->arch.guest_state_protected) - sev_flush_guest_memory(svm, svm->vmsa, PAGE_SIZE); - __free_page(virt_to_page(svm->vmsa)); + sev_flush_guest_memory(svm, svm->sev_es.vmsa, PAGE_SIZE); + __free_page(virt_to_page(svm->sev_es.vmsa)); - if (svm->ghcb_sa_free) - kfree(svm->ghcb_sa); + if (svm->sev_es.ghcb_sa_free) + kvfree(svm->sev_es.ghcb_sa); } static void dump_ghcb(struct vcpu_svm *svm) { - struct ghcb *ghcb = svm->ghcb; + struct ghcb *ghcb = svm->sev_es.ghcb; unsigned int nbits; /* Re-use the dump_invalid_vmcb module parameter */ @@ -2061,7 +2292,7 @@ static void dump_ghcb(struct vcpu_svm *svm) static void sev_es_sync_to_ghcb(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; - struct ghcb *ghcb = svm->ghcb; + struct ghcb *ghcb = svm->sev_es.ghcb; /* * The GHCB protocol so far allows for the following data @@ -2081,7 +2312,7 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; struct kvm_vcpu *vcpu = &svm->vcpu; - struct ghcb *ghcb = svm->ghcb; + struct ghcb *ghcb = svm->sev_es.ghcb; u64 exit_code; /* @@ -2122,24 +2353,29 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); } -static int sev_es_validate_vmgexit(struct vcpu_svm *svm) +static bool sev_es_validate_vmgexit(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu; struct ghcb *ghcb; - u64 exit_code = 0; - - ghcb = svm->ghcb; + u64 exit_code; + u64 reason; - /* Only GHCB Usage code 0 is supported */ - if (ghcb->ghcb_usage) - goto vmgexit_err; + ghcb = svm->sev_es.ghcb; /* - * Retrieve the exit code now even though is may not be marked valid + * Retrieve the exit code now even though it may not be marked valid * as it could help with debugging. */ exit_code = ghcb_get_sw_exit_code(ghcb); + /* Only GHCB Usage code 0 is supported */ + if (ghcb->ghcb_usage) { + reason = GHCB_ERR_INVALID_USAGE; + goto vmgexit_err; + } + + reason = GHCB_ERR_MISSING_INPUT; + if (!ghcb_sw_exit_code_is_valid(ghcb) || !ghcb_sw_exit_info_1_is_valid(ghcb) || !ghcb_sw_exit_info_2_is_valid(ghcb)) @@ -2218,61 +2454,66 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) case SVM_VMGEXIT_UNSUPPORTED_EVENT: break; default: + reason = GHCB_ERR_INVALID_EVENT; goto vmgexit_err; } - return 0; + return true; vmgexit_err: vcpu = &svm->vcpu; - if (ghcb->ghcb_usage) { + if (reason == GHCB_ERR_INVALID_USAGE) { vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n", ghcb->ghcb_usage); + } else if (reason == GHCB_ERR_INVALID_EVENT) { + vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n", + exit_code); } else { - vcpu_unimpl(vcpu, "vmgexit: exit reason %#llx is not valid\n", + vcpu_unimpl(vcpu, "vmgexit: exit code %#llx input is not valid\n", exit_code); dump_ghcb(svm); } - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; - vcpu->run->internal.ndata = 2; - vcpu->run->internal.data[0] = exit_code; - vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; + /* Clear the valid entries fields */ + memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); - return -EINVAL; + ghcb_set_sw_exit_info_1(ghcb, 2); + ghcb_set_sw_exit_info_2(ghcb, reason); + + return false; } void sev_es_unmap_ghcb(struct vcpu_svm *svm) { - if (!svm->ghcb) + if (!svm->sev_es.ghcb) return; - if (svm->ghcb_sa_free) { + if (svm->sev_es.ghcb_sa_free) { /* * The scratch area lives outside the GHCB, so there is a * buffer that, depending on the operation performed, may * need to be synced, then freed. */ - if (svm->ghcb_sa_sync) { + if (svm->sev_es.ghcb_sa_sync) { kvm_write_guest(svm->vcpu.kvm, - ghcb_get_sw_scratch(svm->ghcb), - svm->ghcb_sa, svm->ghcb_sa_len); - svm->ghcb_sa_sync = false; + ghcb_get_sw_scratch(svm->sev_es.ghcb), + svm->sev_es.ghcb_sa, + svm->sev_es.ghcb_sa_len); + svm->sev_es.ghcb_sa_sync = false; } - kfree(svm->ghcb_sa); - svm->ghcb_sa = NULL; - svm->ghcb_sa_free = false; + kvfree(svm->sev_es.ghcb_sa); + svm->sev_es.ghcb_sa = NULL; + svm->sev_es.ghcb_sa_free = false; } - trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, svm->ghcb); + trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, svm->sev_es.ghcb); sev_es_sync_to_ghcb(svm); - kvm_vcpu_unmap(&svm->vcpu, &svm->ghcb_map, true); - svm->ghcb = NULL; + kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map, true); + svm->sev_es.ghcb = NULL; } void pre_sev_run(struct vcpu_svm *svm, int cpu) @@ -2302,7 +2543,7 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu) static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) { struct vmcb_control_area *control = &svm->vmcb->control; - struct ghcb *ghcb = svm->ghcb; + struct ghcb *ghcb = svm->sev_es.ghcb; u64 ghcb_scratch_beg, ghcb_scratch_end; u64 scratch_gpa_beg, scratch_gpa_end; void *scratch_va; @@ -2310,14 +2551,14 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) scratch_gpa_beg = ghcb_get_sw_scratch(ghcb); if (!scratch_gpa_beg) { pr_err("vmgexit: scratch gpa not provided\n"); - return false; + goto e_scratch; } scratch_gpa_end = scratch_gpa_beg + len; if (scratch_gpa_end < scratch_gpa_beg) { pr_err("vmgexit: scratch length (%#llx) not valid for scratch address (%#llx)\n", len, scratch_gpa_beg); - return false; + goto e_scratch; } if ((scratch_gpa_beg & PAGE_MASK) == control->ghcb_gpa) { @@ -2335,10 +2576,10 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) scratch_gpa_end > ghcb_scratch_end) { pr_err("vmgexit: scratch area is outside of GHCB shared buffer area (%#llx - %#llx)\n", scratch_gpa_beg, scratch_gpa_end); - return false; + goto e_scratch; } - scratch_va = (void *)svm->ghcb; + scratch_va = (void *)svm->sev_es.ghcb; scratch_va += (scratch_gpa_beg - control->ghcb_gpa); } else { /* @@ -2348,18 +2589,18 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) if (len > GHCB_SCRATCH_AREA_LIMIT) { pr_err("vmgexit: scratch area exceeds KVM limits (%#llx requested, %#llx limit)\n", len, GHCB_SCRATCH_AREA_LIMIT); - return false; + goto e_scratch; } - scratch_va = kzalloc(len, GFP_KERNEL_ACCOUNT); + scratch_va = kvzalloc(len, GFP_KERNEL_ACCOUNT); if (!scratch_va) - return false; + goto e_scratch; if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, len)) { /* Unable to copy scratch area from guest */ pr_err("vmgexit: kvm_read_guest for scratch area failed\n"); - kfree(scratch_va); - return false; + kvfree(scratch_va); + goto e_scratch; } /* @@ -2368,14 +2609,20 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) * the vCPU next time (i.e. a read was requested so the data * must be written back to the guest memory). */ - svm->ghcb_sa_sync = sync; - svm->ghcb_sa_free = true; + svm->sev_es.ghcb_sa_sync = sync; + svm->sev_es.ghcb_sa_free = true; } - svm->ghcb_sa = scratch_va; - svm->ghcb_sa_len = len; + svm->sev_es.ghcb_sa = scratch_va; + svm->sev_es.ghcb_sa_len = len; return true; + +e_scratch: + ghcb_set_sw_exit_info_1(ghcb, 2); + ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_SCRATCH_AREA); + + return false; } static void set_ghcb_msr_bits(struct vcpu_svm *svm, u64 value, u64 mask, @@ -2426,7 +2673,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_CPUID); if (!ret) { - ret = -EINVAL; + /* Error, keep GHCB MSR value as-is */ break; } @@ -2462,10 +2709,13 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm) GHCB_MSR_TERM_REASON_POS); pr_info("SEV-ES guest requested termination: %#llx:%#llx\n", reason_set, reason_code); - fallthrough; + + ret = -EINVAL; + break; } default: - ret = -EINVAL; + /* Error, keep GHCB MSR value as-is */ + break; } trace_kvm_vmgexit_msr_protocol_exit(svm->vcpu.vcpu_id, @@ -2489,32 +2739,35 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) if (!ghcb_gpa) { vcpu_unimpl(vcpu, "vmgexit: GHCB gpa is not set\n"); - return -EINVAL; + + /* Without a GHCB, just return right back to the guest */ + return 1; } - if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->ghcb_map)) { + if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->sev_es.ghcb_map)) { /* Unable to map GHCB from guest */ vcpu_unimpl(vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n", ghcb_gpa); - return -EINVAL; + + /* Without a GHCB, just return right back to the guest */ + return 1; } - svm->ghcb = svm->ghcb_map.hva; - ghcb = svm->ghcb_map.hva; + svm->sev_es.ghcb = svm->sev_es.ghcb_map.hva; + ghcb = svm->sev_es.ghcb_map.hva; trace_kvm_vmgexit_enter(vcpu->vcpu_id, ghcb); exit_code = ghcb_get_sw_exit_code(ghcb); - ret = sev_es_validate_vmgexit(svm); - if (ret) - return ret; + if (!sev_es_validate_vmgexit(svm)) + return 1; sev_es_sync_from_ghcb(svm); ghcb_set_sw_exit_info_1(ghcb, 0); ghcb_set_sw_exit_info_2(ghcb, 0); - ret = -EINVAL; + ret = 1; switch (exit_code) { case SVM_VMGEXIT_MMIO_READ: if (!setup_vmgexit_scratch(svm, true, control->exit_info_2)) @@ -2523,7 +2776,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) ret = kvm_sev_es_mmio_read(vcpu, control->exit_info_1, control->exit_info_2, - svm->ghcb_sa); + svm->sev_es.ghcb_sa); break; case SVM_VMGEXIT_MMIO_WRITE: if (!setup_vmgexit_scratch(svm, false, control->exit_info_2)) @@ -2532,7 +2785,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) ret = kvm_sev_es_mmio_write(vcpu, control->exit_info_1, control->exit_info_2, - svm->ghcb_sa); + svm->sev_es.ghcb_sa); break; case SVM_VMGEXIT_NMI_COMPLETE: ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_IRET); @@ -2555,20 +2808,17 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) default: pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n", control->exit_info_1); - ghcb_set_sw_exit_info_1(ghcb, 1); - ghcb_set_sw_exit_info_2(ghcb, - X86_TRAP_UD | - SVM_EVTINJ_TYPE_EXEPT | - SVM_EVTINJ_VALID); + ghcb_set_sw_exit_info_1(ghcb, 2); + ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_INPUT); } - ret = 1; break; } case SVM_VMGEXIT_UNSUPPORTED_EVENT: vcpu_unimpl(vcpu, "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n", control->exit_info_1, control->exit_info_2); + ret = -EINVAL; break; default: ret = svm_invoke_exit_handler(vcpu, exit_code); @@ -2579,11 +2829,21 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in) { - if (!setup_vmgexit_scratch(svm, in, svm->vmcb->control.exit_info_2)) + int count; + int bytes; + + if (svm->vmcb->control.exit_info_2 > INT_MAX) return -EINVAL; - return kvm_sev_es_string_io(&svm->vcpu, size, port, - svm->ghcb_sa, svm->ghcb_sa_len, in); + count = svm->vmcb->control.exit_info_2; + if (unlikely(check_mul_overflow(count, size, &bytes))) + return -EINVAL; + + if (!setup_vmgexit_scratch(svm, in, bytes)) + return 1; + + return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->sev_es.ghcb_sa, + count, in); } void sev_es_init_vmcb(struct vcpu_svm *svm) @@ -2598,7 +2858,7 @@ void sev_es_init_vmcb(struct vcpu_svm *svm) * VMCB page. Do not include the encryption mask on the VMSA physical * address since hardware will access it using the guest key. */ - svm->vmcb->control.vmsa_pa = __pa(svm->vmsa); + svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa); /* Can't intercept CR register access, HV can't modify CR registers */ svm_clr_intercept(svm, INTERCEPT_CR0_READ); @@ -2631,11 +2891,11 @@ void sev_es_init_vmcb(struct vcpu_svm *svm) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); } -void sev_es_create_vcpu(struct vcpu_svm *svm) +void sev_es_vcpu_reset(struct vcpu_svm *svm) { /* - * Set the GHCB MSR value as per the GHCB specification when creating - * a vCPU for an SEV-ES guest. + * Set the GHCB MSR value as per the GHCB specification when emulating + * vCPU RESET for an SEV-ES guest. */ set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX, GHCB_VERSION_MIN, @@ -2670,8 +2930,8 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) struct vcpu_svm *svm = to_svm(vcpu); /* First SIPI: Use the values as initially set by the VMM */ - if (!svm->received_first_sipi) { - svm->received_first_sipi = true; + if (!svm->sev_es.received_first_sipi) { + svm->sev_es.received_first_sipi = true; return; } @@ -2680,8 +2940,8 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) * the guest will set the CS and RIP. Set SW_EXIT_INFO_2 to a * non-zero value. */ - if (!svm->ghcb) + if (!svm->sev_es.ghcb) return; - ghcb_set_sw_exit_info_2(svm->ghcb, 1); + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1); } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 989685098b3e..46bcc706f257 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -25,6 +25,7 @@ #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/rwsem.h> +#include <linux/cc_platform.h> #include <asm/apic.h> #include <asm/perf_event.h> @@ -36,6 +37,7 @@ #include <asm/spec-ctrl.h> #include <asm/cpu_device_id.h> #include <asm/traps.h> +#include <asm/fpu/api.h> #include <asm/virtext.h> #include "trace.h" @@ -186,6 +188,17 @@ module_param(vls, int, 0444); static int vgif = true; module_param(vgif, int, 0444); +/* enable/disable LBR virtualization */ +static int lbrv = true; +module_param(lbrv, int, 0444); + +/* enable/disable PMU virtualization */ +bool pmu = true; +module_param(pmu, bool, 0444); + +static int tsc_scaling = true; +module_param(tsc_scaling, int, 0444); + /* * enable / disable AVIC. Because the defaults differ for APICv * support between VMX and SVM we cannot use module_param_named. @@ -256,7 +269,7 @@ u32 svm_msrpm_offset(u32 msr) #define MAX_INST_SIZE 15 -static int get_max_npt_level(void) +static int get_npt_level(void) { #ifdef CONFIG_X86_64 return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; @@ -455,7 +468,7 @@ static int has_svm(void) return 0; } - if (sev_active()) { + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { pr_info("KVM is unsupported when running as an SEV guest\n"); return 0; } @@ -466,7 +479,7 @@ static int has_svm(void) static void svm_hardware_disable(void) { /* Make sure we clean up behind us */ - if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) + if (tsc_scaling) wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); cpu_svm_disable(); @@ -509,6 +522,10 @@ static int svm_hardware_enable(void) wrmsrl(MSR_VM_HSAVE_PA, __sme_page_pa(sd->save_area)); if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { + /* + * Set the default value, even if we don't use TSC scaling + * to avoid having stale value in the msr + */ wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); __this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT); } @@ -572,12 +589,10 @@ static int svm_cpu_init(int cpu) if (!sd) return ret; sd->cpu = cpu; - sd->save_area = alloc_page(GFP_KERNEL); + sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!sd->save_area) goto free_cpu_data; - clear_page(page_address(sd->save_area)); - ret = sev_cpu_init(sd); if (ret) goto free_save_area; @@ -929,6 +944,9 @@ static __init void svm_set_cpu_caps(void) if (npt_enabled) kvm_cpu_cap_set(X86_FEATURE_NPT); + if (tsc_scaling) + kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR); + /* Nested VM can receive #VMEXIT instead of triggering #GP */ kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); } @@ -938,6 +956,10 @@ static __init void svm_set_cpu_caps(void) boot_cpu_has(X86_FEATURE_AMD_SSBD)) kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); + /* AMD PMU PERFCTR_CORE CPUID */ + if (pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) + kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE); + /* CPUID 0x8000001F (SME/SEV features) */ sev_set_cpu_caps(); } @@ -976,10 +998,15 @@ static __init int svm_hardware_setup(void) if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) kvm_enable_efer_bits(EFER_FFXSR); - if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { - kvm_has_tsc_control = true; - kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX; - kvm_tsc_scaling_ratio_frac_bits = 32; + if (tsc_scaling) { + if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { + tsc_scaling = false; + } else { + pr_info("TSC scaling supported\n"); + kvm_has_tsc_control = true; + kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX; + kvm_tsc_scaling_ratio_frac_bits = 32; + } } tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX); @@ -1008,9 +1035,9 @@ static __init int svm_hardware_setup(void) if (!boot_cpu_has(X86_FEATURE_NPT)) npt_enabled = false; - /* Force VM NPT level equal to the host's max NPT level */ - kvm_configure_mmu(npt_enabled, get_max_npt_level(), - get_max_npt_level(), PG_LEVEL_1G); + /* Force VM NPT level equal to the host's paging level */ + kvm_configure_mmu(npt_enabled, get_npt_level(), + get_npt_level(), PG_LEVEL_1G); pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); /* Note, SEV setup consumes npt_enabled. */ @@ -1059,6 +1086,16 @@ static __init int svm_hardware_setup(void) pr_info("Virtual GIF supported\n"); } + if (lbrv) { + if (!boot_cpu_has(X86_FEATURE_LBRV)) + lbrv = false; + else + pr_info("LBR virtualization supported\n"); + } + + if (!pmu) + pr_info("PMU virtualization is disabled\n"); + svm_set_cpu_caps(); /* @@ -1109,7 +1146,9 @@ static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu) static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) { - return kvm_default_tsc_scaling_ratio; + struct vcpu_svm *svm = to_svm(vcpu); + + return svm->tsc_ratio_msr; } static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) @@ -1121,7 +1160,7 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } -static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier) +void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier) { wrmsrl(MSR_AMD64_TSC_RATIO, multiplier); } @@ -1150,6 +1189,38 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu, } } +static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (guest_cpuid_is_intel(vcpu)) { + /* + * We must intercept SYSENTER_EIP and SYSENTER_ESP + * accesses because the processor only stores 32 bits. + * For the same reason we cannot use virtual VMLOAD/VMSAVE. + */ + svm_set_intercept(svm, INTERCEPT_VMLOAD); + svm_set_intercept(svm, INTERCEPT_VMSAVE); + svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0); + } else { + /* + * If hardware supports Virtual VMLOAD VMSAVE then enable it + * in VMCB and clear intercepts to avoid #VMEXIT. + */ + if (vls) { + svm_clr_intercept(svm, INTERCEPT_VMLOAD); + svm_clr_intercept(svm, INTERCEPT_VMSAVE); + svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + } + /* No need to intercept these MSRs */ + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1); + } +} + static void init_vmcb(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1296,11 +1367,25 @@ static void init_vmcb(struct kvm_vcpu *vcpu) } svm_hv_init_vmcb(svm->vmcb); + init_vmcb_after_set_cpuid(vcpu); vmcb_mark_all_dirty(svm->vmcb); enable_gif(svm); +} + +static void __svm_vcpu_reset(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + svm_vcpu_init_msrpm(vcpu, svm->msrpm); + + svm_init_osvw(vcpu); + vcpu->arch.microcode_version = 0x01000065; + svm->tsc_ratio_msr = kvm_default_tsc_scaling_ratio; + + if (sev_es_guest(vcpu->kvm)) + sev_es_vcpu_reset(svm); } static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -1311,6 +1396,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) svm->virt_spec_ctrl = 0; init_vmcb(vcpu); + + if (!init_event) + __svm_vcpu_reset(vcpu); } void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb) @@ -1346,10 +1434,10 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) /* * SEV-ES guests maintain an encrypted version of their FPU * state which is restored and saved on VMRUN and VMEXIT. - * Free the fpu structure to prevent KVM from attempting to - * access the FPU state. + * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't + * do xsave/xrstor on it. */ - kvm_free_guest_fpu(vcpu); + fpstate_set_confidential(&vcpu->arch.guest_fpu); } err = avic_init_vcpu(svm); @@ -1370,24 +1458,13 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) svm->vmcb01.ptr = page_address(vmcb01_page); svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT); + svm_switch_vmcb(svm, &svm->vmcb01); if (vmsa_page) - svm->vmsa = page_address(vmsa_page); + svm->sev_es.vmsa = page_address(vmsa_page); svm->guest_state_loaded = false; - svm_switch_vmcb(svm, &svm->vmcb01); - init_vmcb(vcpu); - - svm_vcpu_init_msrpm(vcpu, svm->msrpm); - - svm_init_osvw(vcpu); - vcpu->arch.microcode_version = 0x01000065; - - if (sev_es_guest(vcpu->kvm)) - /* Perform SEV-ES specific VMCB creation updates */ - sev_es_create_vcpu(svm); - return 0; error_free_vmsa_page: @@ -1447,7 +1524,7 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) vmsave(__sme_page_pa(sd->save_area)); } - if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { + if (tsc_scaling) { u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio; if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) { __this_cpu_write(current_tsc_ratio, tsc_ratio); @@ -1517,12 +1594,27 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) to_svm(vcpu)->vmcb->save.rflags = rflags; } +static bool svm_get_if_flag(struct kvm_vcpu *vcpu) +{ + struct vmcb *vmcb = to_svm(vcpu)->vmcb; + + return sev_es_guest(vcpu->kvm) + ? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK + : kvm_get_rflags(vcpu) & X86_EFLAGS_IF; +} + static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { + kvm_register_mark_available(vcpu, reg); + switch (reg) { case VCPU_EXREG_PDPTR: - BUG_ON(!npt_enabled); - load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); + /* + * When !npt_enabled, mmu->pdptrs[] is already available since + * it is always updated per SDM when moving to CRs. + */ + if (npt_enabled) + load_pdptrs(vcpu, kvm_read_cr3(vcpu)); break; default: KVM_BUG_ON(1, vcpu->kvm); @@ -1709,6 +1801,24 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) vmcb_mark_dirty(svm->vmcb, VMCB_DT); } +static void svm_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + /* + * For guests that don't set guest_state_protected, the cr3 update is + * handled via kvm_mmu_load() while entering the guest. For guests + * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to + * VMCB save area now, since the save area will become the initial + * contents of the VMSA, and future VMCB save area updates won't be + * seen. + */ + if (sev_es_guest(vcpu->kvm)) { + svm->vmcb->save.cr3 = cr3; + vmcb_mark_dirty(svm->vmcb, VMCB_CR); + } +} + void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_svm *svm = to_svm(vcpu); @@ -2440,7 +2550,7 @@ static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu, bool ret = false; if (!is_guest_mode(vcpu) || - (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0)))) + (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0)))) return false; cr0 &= ~SVM_CR0_SELECTIVE_MASK; @@ -2657,6 +2767,11 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) struct vcpu_svm *svm = to_svm(vcpu); switch (msr_info->index) { + case MSR_AMD64_TSC_RATIO: + if (!msr_info->host_initiated && !svm->tsc_scaling_enabled) + return 1; + msr_info->data = svm->tsc_ratio_msr; + break; case MSR_STAR: msr_info->data = svm->vmcb01.ptr->save.star; break; @@ -2762,11 +2877,11 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) { struct vcpu_svm *svm = to_svm(vcpu); - if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->ghcb)) + if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb)) return kvm_complete_insn_gp(vcpu, err); - ghcb_set_sw_exit_info_1(svm->ghcb, 1); - ghcb_set_sw_exit_info_2(svm->ghcb, + ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1); + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, X86_TRAP_GP | SVM_EVTINJ_TYPE_EXEPT | SVM_EVTINJ_VALID); @@ -2806,6 +2921,19 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) u32 ecx = msr->index; u64 data = msr->data; switch (ecx) { + case MSR_AMD64_TSC_RATIO: + if (!msr->host_initiated && !svm->tsc_scaling_enabled) + return 1; + + if (data & TSC_RATIO_RSVD) + return 1; + + svm->tsc_ratio_msr = data; + + if (svm->tsc_scaling_enabled && is_guest_mode(vcpu)) + nested_svm_update_tsc_ratio_msr(vcpu); + + break; case MSR_IA32_CR_PAT: if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) return 1; @@ -2918,7 +3046,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->tsc_aux = data; break; case MSR_IA32_DEBUGCTLMSR: - if (!boot_cpu_has(X86_FEATURE_LBRV)) { + if (!lbrv) { vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", __func__, data); break; @@ -3035,11 +3163,6 @@ static int invpcid_interception(struct kvm_vcpu *vcpu) type = svm->vmcb->control.exit_info_2; gva = svm->vmcb->control.exit_info_1; - if (type > 3) { - kvm_inject_gp(vcpu, 0); - return 1; - } - return kvm_handle_invpcid(vcpu, type, gva); } @@ -3278,11 +3401,13 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) return svm_exit_handlers[exit_code](vcpu); } -static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2, +static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, + u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code) { struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; + *reason = control->exit_code; *info1 = control->exit_info_1; *info2 = control->exit_info_2; *intr_info = control->exit_int_info; @@ -3299,7 +3424,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) struct kvm_run *kvm_run = vcpu->run; u32 exit_code = svm->vmcb->control.exit_code; - trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM); + trace_kvm_exit(vcpu, KVM_ISA_SVM); /* SEV-ES guests must use the CR write traps to track CR registers. */ if (!sev_es_guest(vcpu->kvm)) { @@ -3312,7 +3437,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) if (is_guest_mode(vcpu)) { int vmexit; - trace_kvm_nested_vmexit(exit_code, vcpu, KVM_ISA_SVM); + trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM); vmexit = nested_svm_exit_special(svm); @@ -3485,14 +3610,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) if (!gif_set(svm)) return true; - if (sev_es_guest(vcpu->kvm)) { - /* - * SEV-ES guests to not expose RFLAGS. Use the VMCB interrupt mask - * bit to determine the state of the IF flag. - */ - if (!(vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK)) - return true; - } else if (is_guest_mode(vcpu)) { + if (is_guest_mode(vcpu)) { /* As long as interrupts are being delivered... */ if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF) @@ -3503,7 +3621,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) if (nested_exit_on_intr(svm)) return false; } else { - if (!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF)) + if (!svm_get_if_flag(vcpu)) return true; } @@ -3780,8 +3898,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) pre_svm_run(vcpu); - WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu)); - sync_lapic_to_cr8(vcpu); if (unlikely(svm->asid != svm->vmcb->control.asid)) { @@ -3848,9 +3964,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; } + vcpu->arch.regs_dirty = 0; if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) - kvm_before_interrupt(vcpu); + kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); kvm_load_host_xsave_state(vcpu); stgi(); @@ -3882,8 +3999,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); - if (npt_enabled) - kvm_register_clear_available(vcpu, VCPU_EXREG_PDPTR); + vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET; /* * We need to handle MC intercepts here before the vcpu has a chance to @@ -3913,9 +4029,6 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, hv_track_root_tdp(vcpu, root_hpa); - /* Loading L2's CR3 is handled by enter_svm_guest_mode. */ - if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) - return; cr3 = vcpu->arch.cr3; } else if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) { cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu); @@ -4001,6 +4114,8 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) && guest_cpuid_has(vcpu, X86_FEATURE_NRIPS); + svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR); + svm_recalc_instruction_intercepts(vcpu, svm); /* For sev guests, the memory encryption bit is not reserved in CR3. */ @@ -4027,33 +4142,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_NESTED); } - - if (guest_cpuid_is_intel(vcpu)) { - /* - * We must intercept SYSENTER_EIP and SYSENTER_ESP - * accesses because the processor only stores 32 bits. - * For the same reason we cannot use virtual VMLOAD/VMSAVE. - */ - svm_set_intercept(svm, INTERCEPT_VMLOAD); - svm_set_intercept(svm, INTERCEPT_VMSAVE); - svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; - - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0); - } else { - /* - * If hardware supports Virtual VMLOAD VMSAVE then enable it - * in VMCB and clear intercepts to avoid #VMEXIT. - */ - if (vls) { - svm_clr_intercept(svm, INTERCEPT_VMLOAD); - svm_clr_intercept(svm, INTERCEPT_VMSAVE); - svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; - } - /* No need to intercept these MSRs */ - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1); - } + init_vmcb_after_set_cpuid(vcpu); } static bool svm_has_wbinvd_exit(void) @@ -4158,7 +4247,7 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, info->intercept == x86_intercept_clts) break; - if (!(vmcb_is_intercept(&svm->nested.ctl, + if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))) break; @@ -4377,7 +4466,8 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) */ vmcb12 = map.hva; - nested_load_control_from_vmcb12(svm, &vmcb12->control); + nested_copy_vmcb_control_to_cache(svm, &vmcb12->control); + nested_copy_vmcb_save_to_cache(svm, &vmcb12->save); ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false); unmap_save: @@ -4520,6 +4610,8 @@ static int svm_vm_init(struct kvm *kvm) } static struct kvm_x86_ops svm_x86_ops __initdata = { + .name = "kvm_amd", + .hardware_unsetup = svm_hardware_teardown, .hardware_enable = svm_hardware_enable, .hardware_disable = svm_hardware_disable, @@ -4550,6 +4642,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .get_cpl = svm_get_cpl, .get_cs_db_l_bits = kvm_get_cs_db_l_bits, .set_cr0 = svm_set_cr0, + .post_set_cr3 = svm_post_set_cr3, .is_valid_cr4 = svm_is_valid_cr4, .set_cr4 = svm_set_cr4, .set_efer = svm_set_efer, @@ -4562,6 +4655,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, + .get_if_flag = svm_get_if_flag, .tlb_flush_all = svm_flush_tlb, .tlb_flush_current = svm_flush_tlb, @@ -4592,7 +4686,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .load_eoi_exitmap = svm_load_eoi_exitmap, .hwapic_irr_update = svm_hwapic_irr_update, .hwapic_isr_update = svm_hwapic_isr_update, - .sync_pir_to_irr = kvm_lapic_find_highest_irr, .apicv_post_state_restore = avic_post_state_restore, .set_tss_addr = svm_set_tss_addr, @@ -4637,6 +4730,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .mem_enc_unreg_region = svm_unregister_enc_region, .vm_copy_enc_context_from = svm_vm_copy_asid_from, + .vm_move_enc_context_from = svm_vm_migrate_from, .can_emulate_instruction = svm_can_emulate_instruction, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 128a54b1fbf1..9f153c59f2c8 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -32,6 +32,7 @@ extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; extern bool intercept_smi; +extern bool pmu; /* * Clean bits in VMCB. @@ -79,7 +80,9 @@ struct kvm_sev_info { struct list_head regions_list; /* List of registered regions */ u64 ap_jump_table; /* SEV-ES AP Jump Table address */ struct kvm *enc_context_owner; /* Owner of copied encryption context */ + unsigned long num_mirrored_vms; /* Number of VMs sharing this ASID */ struct misc_cg *misc_cg; /* For misc cgroup accounting */ + atomic_t migration_in_progress; }; struct kvm_svm { @@ -103,6 +106,40 @@ struct kvm_vmcb_info { uint64_t asid_generation; }; +struct vmcb_save_area_cached { + u64 efer; + u64 cr4; + u64 cr3; + u64 cr0; + u64 dr7; + u64 dr6; +}; + +struct vmcb_ctrl_area_cached { + u32 intercepts[MAX_INTERCEPT]; + u16 pause_filter_thresh; + u16 pause_filter_count; + u64 iopm_base_pa; + u64 msrpm_base_pa; + u64 tsc_offset; + u32 asid; + u8 tlb_ctl; + u32 int_ctl; + u32 int_vector; + u32 int_state; + u32 exit_code; + u32 exit_code_hi; + u64 exit_info_1; + u64 exit_info_2; + u32 exit_int_info; + u32 exit_int_info_err; + u64 nested_ctl; + u32 event_inj; + u32 event_inj_err; + u64 nested_cr3; + u64 virt_ext; +}; + struct svm_nested_state { struct kvm_vmcb_info vmcb02; u64 hsave_msr; @@ -118,11 +155,31 @@ struct svm_nested_state { bool nested_run_pending; /* cache for control fields of the guest */ - struct vmcb_control_area ctl; + struct vmcb_ctrl_area_cached ctl; + + /* + * Note: this struct is not kept up-to-date while L2 runs; it is only + * valid within nested_svm_vmrun. + */ + struct vmcb_save_area_cached save; bool initialized; }; +struct vcpu_sev_es_state { + /* SEV-ES support */ + struct vmcb_save_area *vmsa; + struct ghcb *ghcb; + struct kvm_host_map ghcb_map; + bool received_first_sipi; + + /* SEV-ES scratch area support */ + void *ghcb_sa; + u32 ghcb_sa_len; + bool ghcb_sa_sync; + bool ghcb_sa_free; +}; + struct vcpu_svm { struct kvm_vcpu vcpu; /* vmcb always points at current_vmcb->ptr, it's purely a shorthand. */ @@ -140,6 +197,8 @@ struct vcpu_svm { u64 next_rip; u64 spec_ctrl; + + u64 tsc_ratio_msr; /* * Contains guest-controlled bits of VIRT_SPEC_CTRL, which will be * translated into the appropriate L2_CFG bits on the host to @@ -160,7 +219,8 @@ struct vcpu_svm { unsigned long int3_rip; /* cached guest cpuid flags for faster access */ - bool nrips_enabled : 1; + bool nrips_enabled : 1; + bool tsc_scaling_enabled : 1; u32 ldr_reg; u32 dfr_reg; @@ -183,17 +243,7 @@ struct vcpu_svm { DECLARE_BITMAP(write, MAX_DIRECT_ACCESS_MSRS); } shadow_msr_intercept; - /* SEV-ES support */ - struct vmcb_save_area *vmsa; - struct ghcb *ghcb; - struct kvm_host_map ghcb_map; - bool received_first_sipi; - - /* SEV-ES scratch area support */ - void *ghcb_sa; - u64 ghcb_sa_len; - bool ghcb_sa_sync; - bool ghcb_sa_free; + struct vcpu_sev_es_state sev_es; bool guest_state_loaded; }; @@ -218,12 +268,12 @@ DECLARE_PER_CPU(struct svm_cpu_data *, svm_data); void recalc_intercepts(struct vcpu_svm *svm); -static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm) +static __always_inline struct kvm_svm *to_kvm_svm(struct kvm *kvm) { return container_of(kvm, struct kvm_svm, kvm); } -static inline bool sev_guest(struct kvm *kvm) +static __always_inline bool sev_guest(struct kvm *kvm) { #ifdef CONFIG_KVM_AMD_SEV struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; @@ -234,12 +284,12 @@ static inline bool sev_guest(struct kvm *kvm) #endif } -static inline bool sev_es_guest(struct kvm *kvm) +static __always_inline bool sev_es_guest(struct kvm *kvm) { #ifdef CONFIG_KVM_AMD_SEV struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - return sev_guest(kvm) && sev->es_active; + return sev->es_active && !WARN_ON_ONCE(!sev->active); #else return false; #endif @@ -271,11 +321,21 @@ static inline bool vmcb_is_dirty(struct vmcb *vmcb, int bit) return !test_bit(bit, (unsigned long *)&vmcb->control.clean); } -static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) +static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) { return container_of(vcpu, struct vcpu_svm, vcpu); } +/* + * Only the PDPTRs are loaded on demand into the shadow MMU. All other + * fields are synchronized in handle_exit, because accessing the VMCB is cheap. + * + * CR3 might be out of date in the VMCB but it is not marked dirty; instead, + * KVM_REQ_LOAD_MMU_PGD is always requested when the cached vcpu->arch.cr3 + * is changed. svm_load_mmu_pgd() then syncs the new CR3 value into the VMCB. + */ +#define SVM_REGS_LAZY_LOAD_SET (1 << VCPU_EXREG_PDPTR) + static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit) { WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT); @@ -294,6 +354,12 @@ static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit) return test_bit(bit, (unsigned long *)&control->intercepts); } +static inline bool vmcb12_is_intercept(struct vmcb_ctrl_area_cached *control, u32 bit) +{ + WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT); + return test_bit(bit, (unsigned long *)&control->intercepts); +} + static inline void set_dr_intercepts(struct vcpu_svm *svm) { struct vmcb *vmcb = svm->vmcb01.ptr; @@ -446,17 +512,17 @@ static inline bool nested_svm_virtualize_tpr(struct kvm_vcpu *vcpu) static inline bool nested_exit_on_smi(struct vcpu_svm *svm) { - return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SMI); + return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SMI); } static inline bool nested_exit_on_intr(struct vcpu_svm *svm) { - return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INTR); + return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_INTR); } static inline bool nested_exit_on_nmi(struct vcpu_svm *svm) { - return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_NMI); + return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_NMI); } int enter_svm_guest_mode(struct kvm_vcpu *vcpu, @@ -483,8 +549,12 @@ int nested_svm_check_permissions(struct kvm_vcpu *vcpu); int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); int nested_svm_exit_special(struct vcpu_svm *svm); -void nested_load_control_from_vmcb12(struct vcpu_svm *svm, - struct vmcb_control_area *control); +void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu); +void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier); +void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm, + struct vmcb_control_area *control); +void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm, + struct vmcb_save_area *save); void nested_sync_control_from_vmcb02(struct vcpu_svm *svm); void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm); void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb); @@ -553,6 +623,7 @@ int svm_register_enc_region(struct kvm *kvm, int svm_unregister_enc_region(struct kvm *kvm, struct kvm_enc_region *range); int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd); +int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd); void pre_sev_run(struct vcpu_svm *svm, int cpu); void __init sev_set_cpu_caps(void); void __init sev_hardware_setup(void); @@ -562,7 +633,7 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu); int sev_handle_vmgexit(struct kvm_vcpu *vcpu); int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in); void sev_es_init_vmcb(struct vcpu_svm *svm); -void sev_es_create_vcpu(struct vcpu_svm *svm); +void sev_es_vcpu_reset(struct vcpu_svm *svm); void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu); void sev_es_unmap_ghcb(struct vcpu_svm *svm); diff --git a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h index 22e2b019de37..9430d6437c9f 100644 --- a/arch/x86/kvm/svm/svm_ops.h +++ b/arch/x86/kvm/svm/svm_ops.h @@ -56,12 +56,12 @@ static inline void invlpga(unsigned long addr, u32 asid) * VMSAVE, VMLOAD, etc... is still controlled by the effective address size, * hence 'unsigned long' instead of 'hpa_t'. */ -static inline void vmsave(unsigned long pa) +static __always_inline void vmsave(unsigned long pa) { svm_asm1(vmsave, "a" (pa), "memory"); } -static inline void vmload(unsigned long pa) +static __always_inline void vmload(unsigned long pa) { svm_asm1(vmload, "a" (pa), "memory"); } diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 4fa17df123cd..dfaeb47fcf2a 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -148,7 +148,7 @@ SYM_FUNC_START(__svm_vcpu_run) pop %edi #endif pop %_ASM_BP - ret + RET 3: cmpb $0, kvm_rebooting jne 2b @@ -202,7 +202,7 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) pop %edi #endif pop %_ASM_BP - ret + RET 3: cmpb $0, kvm_rebooting jne 2b diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 03ebe368333e..92e6f6702f00 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -288,8 +288,8 @@ TRACE_EVENT(kvm_apic, #define TRACE_EVENT_KVM_EXIT(name) \ TRACE_EVENT(name, \ - TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa), \ - TP_ARGS(exit_reason, vcpu, isa), \ + TP_PROTO(struct kvm_vcpu *vcpu, u32 isa), \ + TP_ARGS(vcpu, isa), \ \ TP_STRUCT__entry( \ __field( unsigned int, exit_reason ) \ @@ -303,11 +303,12 @@ TRACE_EVENT(name, \ ), \ \ TP_fast_assign( \ - __entry->exit_reason = exit_reason; \ __entry->guest_rip = kvm_rip_read(vcpu); \ __entry->isa = isa; \ __entry->vcpu_id = vcpu->vcpu_id; \ - static_call(kvm_x86_get_exit_info)(vcpu, &__entry->info1, \ + static_call(kvm_x86_get_exit_info)(vcpu, \ + &__entry->exit_reason, \ + &__entry->info1, \ &__entry->info2, \ &__entry->intr_info, \ &__entry->error_code); \ @@ -1355,6 +1356,30 @@ TRACE_EVENT(kvm_apicv_update_request, __entry->bit) ); +TRACE_EVENT(kvm_apicv_accept_irq, + TP_PROTO(__u32 apicid, __u16 dm, __u16 tm, __u8 vec), + TP_ARGS(apicid, dm, tm, vec), + + TP_STRUCT__entry( + __field( __u32, apicid ) + __field( __u16, dm ) + __field( __u16, tm ) + __field( __u8, vec ) + ), + + TP_fast_assign( + __entry->apicid = apicid; + __entry->dm = dm; + __entry->tm = tm; + __entry->vec = vec; + ), + + TP_printk("apicid %x vec %u (%s|%s)", + __entry->apicid, __entry->vec, + __print_symbolic((__entry->dm >> 8 & 0x7), kvm_deliver_mode), + __entry->tm ? "level" : "edge") +); + /* * Tracepoint for AMD AVIC */ diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 4705ad55abb5..c8029b7845b6 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -312,6 +312,15 @@ static inline bool cpu_has_vmx_ept_1g_page(void) return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT; } +static inline int ept_caps_to_lpage_level(u32 ept_caps) +{ + if (ept_caps & VMX_EPT_1GB_PAGE_BIT) + return PG_LEVEL_1G; + if (ept_caps & VMX_EPT_2MB_PAGE_BIT) + return PG_LEVEL_2M; + return PG_LEVEL_4K; +} + static inline bool cpu_has_vmx_ept_ad_bits(void) { return vmx_capability.ept & VMX_EPT_AD_BIT; diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index 152ab0aa82cf..16731d2cf231 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -93,7 +93,7 @@ static __always_inline int get_evmcs_offset(unsigned long field, return evmcs_field->offset; } -static inline void evmcs_write64(unsigned long field, u64 value) +static __always_inline void evmcs_write64(unsigned long field, u64 value) { u16 clean_field; int offset = get_evmcs_offset(field, &clean_field); @@ -183,7 +183,7 @@ static inline void evmcs_load(u64 phys_addr) __init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf); #else /* !IS_ENABLED(CONFIG_HYPERV) */ -static inline void evmcs_write64(unsigned long field, u64 value) {} +static __always_inline void evmcs_write64(unsigned long field, u64 value) {} static inline void evmcs_write32(unsigned long field, u32 value) {} static inline void evmcs_write16(unsigned long field, u16 value) {} static inline u64 evmcs_read64(unsigned long field) { return 0; } diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index eedcebf58004..f235f77cbc03 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -191,7 +191,7 @@ static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error) * failValid writes the error number to the current VMCS, which * can't be done if there isn't a current VMCS. */ - if (vmx->nested.current_vmptr == -1ull && + if (vmx->nested.current_vmptr == INVALID_GPA && !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) return nested_vmx_failInvalid(vcpu); @@ -218,7 +218,7 @@ static inline u64 vmx_control_msr(u32 low, u32 high) static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) { secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); - vmcs_write64(VMCS_LINK_POINTER, -1ull); + vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); vmx->nested.need_vmcs12_to_shadow_sync = false; } @@ -245,7 +245,8 @@ static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, src = &prev->host_state; dest = &vmx->loaded_vmcs->host_state; - vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base); + vmx_set_vmcs_host_state(dest, src->cr3, src->fs_sel, src->gs_sel, + src->fs_base, src->gs_base); dest->ldt_sel = src->ldt_sel; #ifdef CONFIG_X86_64 dest->ds_sel = src->ds_sel; @@ -269,7 +270,13 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) vmx_sync_vmcs_host_state(vmx, prev); put_cpu(); - vmx_register_cache_reset(vcpu); + vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET; + + /* + * All lazily updated registers will be reloaded from VMCS12 on both + * vmentry and vmexit. + */ + vcpu->arch.regs_dirty = 0; } /* @@ -290,9 +297,10 @@ static void free_nested(struct kvm_vcpu *vcpu) vmx->nested.vmxon = false; vmx->nested.smm.vmxon = false; + vmx->nested.vmxon_ptr = INVALID_GPA; free_vpid(vmx->nested.vpid02); vmx->nested.posted_intr_nv = -1; - vmx->nested.current_vmptr = -1ull; + vmx->nested.current_vmptr = INVALID_GPA; if (enable_shadow_vmcs) { vmx_disable_shadow_vmcs(vmx); vmcs_clear(vmx->vmcs01.shadow_vmcs); @@ -390,9 +398,11 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, static void nested_ept_new_eptp(struct kvm_vcpu *vcpu) { - kvm_init_shadow_ept_mmu(vcpu, - to_vmx(vcpu)->nested.msrs.ept_caps & - VMX_EPT_EXECUTE_ONLY_BIT, + struct vcpu_vmx *vmx = to_vmx(vcpu); + bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT; + int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps); + + kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level, nested_ept_ad_enabled(vcpu), nested_ept_get_eptp(vcpu)); } @@ -524,67 +534,19 @@ static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, } /* - * Check if MSR is intercepted for L01 MSR bitmap. - */ -static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr) -{ - unsigned long *msr_bitmap; - int f = sizeof(unsigned long); - - if (!cpu_has_vmx_msr_bitmap()) - return true; - - msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; - - if (msr <= 0x1fff) { - return !!test_bit(msr, msr_bitmap + 0x800 / f); - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - return !!test_bit(msr, msr_bitmap + 0xc00 / f); - } - - return true; -} - -/* - * If a msr is allowed by L0, we should check whether it is allowed by L1. - * The corresponding bit will be cleared unless both of L0 and L1 allow it. + * For x2APIC MSRs, ignore the vmcs01 bitmap. L1 can enable x2APIC without L1 + * itself utilizing x2APIC. All MSRs were previously set to be intercepted, + * only the "disable intercept" case needs to be handled. */ -static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, - unsigned long *msr_bitmap_nested, - u32 msr, int type) +static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1, + unsigned long *msr_bitmap_l0, + u32 msr, int type) { - int f = sizeof(unsigned long); - - /* - * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals - * have the write-low and read-high bitmap offsets the wrong way round. - * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. - */ - if (msr <= 0x1fff) { - if (type & MSR_TYPE_R && - !test_bit(msr, msr_bitmap_l1 + 0x000 / f)) - /* read-low */ - __clear_bit(msr, msr_bitmap_nested + 0x000 / f); - - if (type & MSR_TYPE_W && - !test_bit(msr, msr_bitmap_l1 + 0x800 / f)) - /* write-low */ - __clear_bit(msr, msr_bitmap_nested + 0x800 / f); - - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - if (type & MSR_TYPE_R && - !test_bit(msr, msr_bitmap_l1 + 0x400 / f)) - /* read-high */ - __clear_bit(msr, msr_bitmap_nested + 0x400 / f); + if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr)) + vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr); - if (type & MSR_TYPE_W && - !test_bit(msr, msr_bitmap_l1 + 0xc00 / f)) - /* write-high */ - __clear_bit(msr, msr_bitmap_nested + 0xc00 / f); - - } + if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr)) + vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr); } static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) @@ -599,6 +561,34 @@ static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) } } +#define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw) \ +static inline \ +void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx, \ + unsigned long *msr_bitmap_l1, \ + unsigned long *msr_bitmap_l0, u32 msr) \ +{ \ + if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) || \ + vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr)) \ + vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr); \ + else \ + vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr); \ +} +BUILD_NVMX_MSR_INTERCEPT_HELPER(read) +BUILD_NVMX_MSR_INTERCEPT_HELPER(write) + +static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx, + unsigned long *msr_bitmap_l1, + unsigned long *msr_bitmap_l0, + u32 msr, int types) +{ + if (types & MSR_TYPE_R) + nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1, + msr_bitmap_l0, msr); + if (types & MSR_TYPE_W) + nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1, + msr_bitmap_l0, msr); +} + /* * Merge L0's and L1's MSR bitmap, return false to indicate that * we do not use the hardware. @@ -606,16 +596,31 @@ static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { + struct vcpu_vmx *vmx = to_vmx(vcpu); int msr; unsigned long *msr_bitmap_l1; - unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap; - struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map; + unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap; + struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; + struct kvm_host_map *map = &vmx->nested.msr_bitmap_map; /* Nothing to do if the MSR bitmap is not in use. */ if (!cpu_has_vmx_msr_bitmap() || !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) return false; + /* + * MSR bitmap update can be skipped when: + * - MSR bitmap for L1 hasn't changed. + * - Nested hypervisor (L1) is attempting to launch the same L2 as + * before. + * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature + * and tells KVM (L0) there were no changes in MSR bitmap for L2. + */ + if (!vmx->nested.force_msr_bitmap_recalc && evmcs && + evmcs->hv_enlightenments_control.msr_bitmap && + evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP) + return true; + if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map)) return false; @@ -624,7 +629,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, /* * To keep the control flow simple, pay eight 8-byte writes (sixteen * 4-byte writes on 32-bit systems) up front to enable intercepts for - * the x2APIC MSR range and selectively disable them below. + * the x2APIC MSR range and selectively toggle those relevant to L2. */ enable_x2apic_msr_intercepts(msr_bitmap_l0); @@ -643,61 +648,46 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, } } - nested_vmx_disable_intercept_for_msr( + nested_vmx_disable_intercept_for_x2apic_msr( msr_bitmap_l1, msr_bitmap_l0, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_R | MSR_TYPE_W); if (nested_cpu_has_vid(vmcs12)) { - nested_vmx_disable_intercept_for_msr( + nested_vmx_disable_intercept_for_x2apic_msr( msr_bitmap_l1, msr_bitmap_l0, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); - nested_vmx_disable_intercept_for_msr( + nested_vmx_disable_intercept_for_x2apic_msr( msr_bitmap_l1, msr_bitmap_l0, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); } } - /* KVM unconditionally exposes the FS/GS base MSRs to L1. */ + /* + * Always check vmcs01's bitmap to honor userspace MSR filters and any + * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through. + */ #ifdef CONFIG_X86_64 - nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0, - MSR_FS_BASE, MSR_TYPE_RW); + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_FS_BASE, MSR_TYPE_RW); - nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0, - MSR_GS_BASE, MSR_TYPE_RW); + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_GS_BASE, MSR_TYPE_RW); - nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0, - MSR_KERNEL_GS_BASE, MSR_TYPE_RW); + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_KERNEL_GS_BASE, MSR_TYPE_RW); #endif + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); - /* - * Checking the L0->L1 bitmap is trying to verify two things: - * - * 1. L0 gave a permission to L1 to actually passthrough the MSR. This - * ensures that we do not accidentally generate an L02 MSR bitmap - * from the L12 MSR bitmap that is too permissive. - * 2. That L1 or L2s have actually used the MSR. This avoids - * unnecessarily merging of the bitmap if the MSR is unused. This - * works properly because we only update the L01 MSR bitmap lazily. - * So even if L0 should pass L1 these MSRs, the L01 bitmap is only - * updated to reflect this when L1 (or its L2s) actually write to - * the MSR. - */ - if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL)) - nested_vmx_disable_intercept_for_msr( - msr_bitmap_l1, msr_bitmap_l0, - MSR_IA32_SPEC_CTRL, - MSR_TYPE_R | MSR_TYPE_W); + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_PRED_CMD, MSR_TYPE_W); - if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD)) - nested_vmx_disable_intercept_for_msr( - msr_bitmap_l1, msr_bitmap_l0, - MSR_IA32_PRED_CMD, - MSR_TYPE_W); + kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false); - kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false); + vmx->nested.force_msr_bitmap_recalc = false; return true; } @@ -705,33 +695,39 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - struct kvm_host_map map; - struct vmcs12 *shadow; + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; if (!nested_cpu_has_shadow_vmcs(vmcs12) || - vmcs12->vmcs_link_pointer == -1ull) + vmcs12->vmcs_link_pointer == INVALID_GPA) return; - shadow = get_shadow_vmcs12(vcpu); - - if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map)) + if (ghc->gpa != vmcs12->vmcs_link_pointer && + kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, + vmcs12->vmcs_link_pointer, VMCS12_SIZE)) return; - memcpy(shadow, map.hva, VMCS12_SIZE); - kvm_vcpu_unmap(vcpu, &map, false); + kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu), + VMCS12_SIZE); } static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); + struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; if (!nested_cpu_has_shadow_vmcs(vmcs12) || - vmcs12->vmcs_link_pointer == -1ull) + vmcs12->vmcs_link_pointer == INVALID_GPA) + return; + + if (ghc->gpa != vmcs12->vmcs_link_pointer && + kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, + vmcs12->vmcs_link_pointer, VMCS12_SIZE)) return; - kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer, - get_shadow_vmcs12(vcpu), VMCS12_SIZE); + kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu), + VMCS12_SIZE); } /* @@ -1124,7 +1120,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, * must not be dereferenced. */ if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) && - CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) { + CC(!load_pdptrs(vcpu, cr3))) { *entry_failure_code = ENTRY_FAIL_PDPTE; return -EINVAL; } @@ -1133,7 +1129,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, kvm_mmu_new_pgd(vcpu, cr3); vcpu->arch.cr3 = cr3; - kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); + kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */ kvm_init_mmu(vcpu); @@ -1191,29 +1187,26 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, WARN_ON(!enable_vpid); /* - * If VPID is enabled and used by vmc12, but L2 does not have a unique - * TLB tag (ASID), i.e. EPT is disabled and KVM was unable to allocate - * a VPID for L2, flush the current context as the effective ASID is - * common to both L1 and L2. - * - * Defer the flush so that it runs after vmcs02.EPTP has been set by - * KVM_REQ_LOAD_MMU_PGD (if nested EPT is enabled) and to avoid - * redundant flushes further down the nested pipeline. - * - * If a TLB flush isn't required due to any of the above, and vpid12 is - * changing then the new "virtual" VPID (vpid12) will reuse the same - * "real" VPID (vpid02), and so needs to be flushed. There's no direct - * mapping between vpid02 and vpid12, vpid02 is per-vCPU and reused for - * all nested vCPUs. Remember, a flush on VM-Enter does not invalidate - * guest-physical mappings, so there is no need to sync the nEPT MMU. + * VPID is enabled and in use by vmcs12. If vpid12 is changing, then + * emulate a guest TLB flush as KVM does not track vpid12 history nor + * is the VPID incorporated into the MMU context. I.e. KVM must assume + * that the new vpid12 has never been used and thus represents a new + * guest ASID that cannot have entries in the TLB. */ - if (!nested_has_guest_tlb_tag(vcpu)) { - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); - } else if (is_vmenter && - vmcs12->virtual_processor_id != vmx->nested.last_vpid) { + if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) { vmx->nested.last_vpid = vmcs12->virtual_processor_id; - vpid_sync_context(nested_get_vpid02(vcpu)); + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); + return; } + + /* + * If VPID is enabled, used by vmc12, and vpid12 is not changing but + * does not have a unique TLB tag (ASID), i.e. EPT is disabled and + * KVM was unable to allocate a VPID for L2, flush the current context + * as the effective ASID is common to both L1 and L2. + */ + if (!nested_has_guest_tlb_tag(vcpu)) + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) @@ -1994,7 +1987,7 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( } if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { - vmx->nested.current_vmptr = -1ull; + vmx->nested.current_vmptr = INVALID_GPA; nested_release_evmcs(vcpu); @@ -2053,10 +2046,13 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( * Clean fields data can't be used on VMLAUNCH and when we switch * between different L2 guests as KVM keeps a single VMCS12 per L1. */ - if (from_launch || evmcs_gpa_changed) + if (from_launch || evmcs_gpa_changed) { vmx->nested.hv_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + vmx->nested.force_msr_bitmap_recalc = true; + } + return EVMPTRLD_SUCCEEDED; } @@ -2178,7 +2174,7 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) } if (cpu_has_vmx_encls_vmexit()) - vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); + vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA); /* * Set the MSR load/store lists to match L0's settings. Only the @@ -2197,7 +2193,7 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, { prepare_vmcs02_constant_state(vmx); - vmcs_write64(VMCS_LINK_POINTER, -1ull); + vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); if (enable_vpid) { if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) @@ -2623,8 +2619,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, - vmcs12->guest_ia32_perf_global_ctrl))) + vmcs12->guest_ia32_perf_global_ctrl))) { + *entry_failure_code = ENTRY_FAIL_DEFAULT; return -EINVAL; + } kvm_rsp_write(vcpu, vmcs12->guest_rsp); kvm_rip_write(vcpu, vmcs12->guest_rip); @@ -2865,6 +2863,17 @@ static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, return 0; } +static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ +#ifdef CONFIG_X86_64 + if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) != + !!(vcpu->arch.efer & EFER_LMA))) + return -EINVAL; +#endif + return 0; +} + static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { @@ -2889,18 +2898,16 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, return -EINVAL; #ifdef CONFIG_X86_64 - ia32e = !!(vcpu->arch.efer & EFER_LMA); + ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE); #else ia32e = false; #endif if (ia32e) { - if (CC(!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) || - CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) + if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) return -EINVAL; } else { - if (CC(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) || - CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || + if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || CC((vmcs12->host_rip) >> 32)) return -EINVAL; @@ -2945,27 +2952,31 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - int r = 0; - struct vmcs12 *shadow; - struct kvm_host_map map; + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; + struct vmcs_hdr hdr; - if (vmcs12->vmcs_link_pointer == -1ull) + if (vmcs12->vmcs_link_pointer == INVALID_GPA) return 0; if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) return -EINVAL; - if (CC(kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))) - return -EINVAL; + if (ghc->gpa != vmcs12->vmcs_link_pointer && + CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, + vmcs12->vmcs_link_pointer, VMCS12_SIZE))) + return -EINVAL; - shadow = map.hva; + if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, + offsetof(struct vmcs12, hdr), + sizeof(hdr)))) + return -EINVAL; - if (CC(shadow->hdr.revision_id != VMCS12_REVISION) || - CC(shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) - r = -EINVAL; + if (CC(hdr.revision_id != VMCS12_REVISION) || + CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) + return -EINVAL; - kvm_vcpu_unmap(vcpu, &map, false); - return r; + return 0; } /* @@ -3044,7 +3055,7 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long cr3, cr4; + unsigned long cr4; bool vm_fail; if (!nested_early_check) @@ -3067,12 +3078,6 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) */ vmcs_writel(GUEST_RFLAGS, 0); - cr3 = __get_current_cr3_fast(); - if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { - vmcs_writel(HOST_CR3, cr3); - vmx->loaded_vmcs->host_state.cr3 = cr3; - } - cr4 = cr4_read_shadow(); if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { vmcs_writel(HOST_CR4, cr4); @@ -3162,7 +3167,7 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) * the guest CR3 might be restored prior to setting the nested * state which can lead to a load of wrong PDPTRs. */ - if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3))) + if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3))) return false; } @@ -3216,7 +3221,7 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to * force VM-Entry to fail. */ - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull); + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA); } } @@ -3360,8 +3365,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, }; u32 failed_index; - if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) - kvm_vcpu_flush_tlb_current(vcpu); + kvm_service_local_tlb_flush_requests(vcpu); evaluate_pending_interrupts = exec_controls_get(vmx) & (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); @@ -3522,12 +3526,15 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (evmptrld_status == EVMPTRLD_ERROR) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; - } else if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) { - return nested_vmx_failInvalid(vcpu); } + kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS); + + if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) + return nested_vmx_failInvalid(vcpu); + if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) && - vmx->nested.current_vmptr == -1ull)) + vmx->nested.current_vmptr == INVALID_GPA)) return nested_vmx_failInvalid(vcpu); vmcs12 = get_vmcs12(vcpu); @@ -3570,6 +3577,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (nested_vmx_check_controls(vcpu, vmcs12)) return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + if (nested_vmx_check_address_space_size(vcpu, vmcs12)) + return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); + if (nested_vmx_check_host_state(vcpu, vmcs12)) return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); @@ -3618,7 +3628,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) && (vmcs12->guest_rflags & X86_EFLAGS_IF))) { vmx->nested.nested_run_pending = 0; - return kvm_vcpu_halt(vcpu); + return kvm_emulate_halt_noskip(vcpu); } break; case GUEST_ACTIVITY_WAIT_SIPI: @@ -4515,9 +4525,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, (void)nested_get_evmcs_page(vcpu); } - /* Service the TLB flush request for L2 before switching to L1. */ - if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) - kvm_vcpu_flush_tlb_current(vcpu); + /* Service pending TLB flush requests for L2 before switching to L1. */ + kvm_service_local_tlb_flush_requests(vcpu); /* * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between @@ -4870,6 +4879,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) if (!vmx->nested.cached_vmcs12) goto out_cached_vmcs12; + vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA; vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); if (!vmx->nested.cached_shadow_vmcs12) goto out_cached_shadow_vmcs12; @@ -4975,7 +4985,7 @@ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (vmx->nested.current_vmptr == -1ull) + if (vmx->nested.current_vmptr == INVALID_GPA) return; copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); @@ -4995,7 +5005,7 @@ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); - vmx->nested.current_vmptr = -1ull; + vmx->nested.current_vmptr = INVALID_GPA; } /* Emulate the VMXOFF instruction */ @@ -5090,12 +5100,12 @@ static int handle_vmread(struct kvm_vcpu *vcpu) return 1; /* - * In VMX non-root operation, when the VMCS-link pointer is -1ull, + * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, * any VMREAD sets the ALU flags for VMfailInvalid. */ - if (vmx->nested.current_vmptr == -1ull || + if (vmx->nested.current_vmptr == INVALID_GPA || (is_guest_mode(vcpu) && - get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)) + get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) return nested_vmx_failInvalid(vcpu); /* Decode instruction info and find the field to read */ @@ -5182,12 +5192,12 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return 1; /* - * In VMX non-root operation, when the VMCS-link pointer is -1ull, + * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, * any VMWRITE sets the ALU flags for VMfailInvalid. */ - if (vmx->nested.current_vmptr == -1ull || + if (vmx->nested.current_vmptr == INVALID_GPA || (is_guest_mode(vcpu) && - get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)) + get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) return nested_vmx_failInvalid(vcpu); if (instr_info & BIT(10)) @@ -5273,6 +5283,7 @@ static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) vmx->nested.need_vmcs12_to_shadow_sync = true; } vmx->nested.dirty_vmcs12 = true; + vmx->nested.force_msr_bitmap_recalc = true; } /* Emulate the VMPTRLD instruction */ @@ -5299,10 +5310,10 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) return 1; if (vmx->nested.current_vmptr != vmptr) { - struct kvm_host_map map; - struct vmcs12 *new_vmcs12; + struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache; + struct vmcs_hdr hdr; - if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) { + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) { /* * Reads from an unbacked page return all 1s, * which means that the 32 bits located at the @@ -5313,12 +5324,16 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); } - new_vmcs12 = map.hva; + if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, + offsetof(struct vmcs12, hdr), + sizeof(hdr))) { + return nested_vmx_fail(vcpu, + VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); + } - if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || - (new_vmcs12->hdr.shadow_vmcs && + if (hdr.revision_id != VMCS12_REVISION || + (hdr.shadow_vmcs && !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { - kvm_vcpu_unmap(vcpu, &map, false); return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); } @@ -5329,8 +5344,11 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) * Load VMCS12 from guest memory since it is not already * cached. */ - memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE); - kvm_vcpu_unmap(vcpu, &map, false); + if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12, + VMCS12_SIZE)) { + return nested_vmx_fail(vcpu, + VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); + } set_current_vmptr(vmx, vmptr); } @@ -5378,7 +5396,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) struct { u64 eptp, gpa; } operand; - int i, r; + int i, r, gpr_index; if (!(vmx->nested.msrs.secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || @@ -5391,7 +5409,8 @@ static int handle_invept(struct kvm_vcpu *vcpu) return 1; vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf); + gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); + type = kvm_register_read(vcpu, gpr_index); types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; @@ -5458,7 +5477,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) u64 gla; } operand; u16 vpid02; - int r; + int r, gpr_index; if (!(vmx->nested.msrs.secondary_ctls_high & SECONDARY_EXEC_ENABLE_VPID) || @@ -5471,7 +5490,8 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) return 1; vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf); + gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); + type = kvm_register_read(vcpu, gpr_index); types = (vmx->nested.msrs.vpid_caps & VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; @@ -5630,7 +5650,7 @@ bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, gpa_t bitmap, last_bitmap; u8 b; - last_bitmap = (gpa_t)-1; + last_bitmap = INVALID_GPA; b = -1; while (size > 0) { @@ -6065,7 +6085,7 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) goto reflect_vmexit; } - trace_kvm_nested_vmexit(exit_reason.full, vcpu, KVM_ISA_VMX); + trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX); /* If L0 (KVM) wants the exit, it trumps L1's desires. */ if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) @@ -6106,8 +6126,8 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, .format = KVM_STATE_NESTED_FORMAT_VMX, .size = sizeof(kvm_state), .hdr.vmx.flags = 0, - .hdr.vmx.vmxon_pa = -1ull, - .hdr.vmx.vmcs12_pa = -1ull, + .hdr.vmx.vmxon_pa = INVALID_GPA, + .hdr.vmx.vmcs12_pa = INVALID_GPA, .hdr.vmx.preemption_timer_deadline = 0, }; struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = @@ -6133,7 +6153,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, if (is_guest_mode(vcpu) && nested_cpu_has_shadow_vmcs(vmcs12) && - vmcs12->vmcs_link_pointer != -1ull) + vmcs12->vmcs_link_pointer != INVALID_GPA) kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12); } @@ -6209,7 +6229,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, return -EFAULT; if (nested_cpu_has_shadow_vmcs(vmcs12) && - vmcs12->vmcs_link_pointer != -1ull) { + vmcs12->vmcs_link_pointer != INVALID_GPA) { if (copy_to_user(user_vmx_nested_state->shadow_vmcs12, get_shadow_vmcs12(vcpu), VMCS12_SIZE)) return -EFAULT; @@ -6244,11 +6264,11 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX) return -EINVAL; - if (kvm_state->hdr.vmx.vmxon_pa == -1ull) { + if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) { if (kvm_state->hdr.vmx.smm.flags) return -EINVAL; - if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) + if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) return -EINVAL; /* @@ -6302,7 +6322,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, vmx_leave_nested(vcpu); - if (kvm_state->hdr.vmx.vmxon_pa == -1ull) + if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) return 0; vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa; @@ -6315,13 +6335,13 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, /* See vmx_has_valid_vmcs12. */ if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) || (kvm_state->flags & KVM_STATE_NESTED_EVMCS) || - (kvm_state->hdr.vmx.vmcs12_pa != -1ull)) + (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)) return -EINVAL; else return 0; } - if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) { + if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) { if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa || !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa)) return -EINVAL; @@ -6366,7 +6386,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, ret = -EINVAL; if (nested_cpu_has_shadow_vmcs(vmcs12) && - vmcs12->vmcs_link_pointer != -1ull) { + vmcs12->vmcs_link_pointer != INVALID_GPA) { struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); if (kvm_state->size < @@ -6399,6 +6419,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, goto error_guest_mode; vmx->nested.dirty_vmcs12 = true; + vmx->nested.force_msr_bitmap_recalc = true; ret = nested_vmx_enter_non_root_mode(vcpu, false); if (ret) goto error_guest_mode; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 10cc4f65c4ef..5e0ac57d6d1b 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -68,16 +68,17 @@ static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data) reprogram_counter(pmu, bit); } -static unsigned intel_find_arch_event(struct kvm_pmu *pmu, - u8 event_select, - u8 unit_mask) +static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc) { + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT; + u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; int i; for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) - if (intel_arch_events[i].eventsel == event_select - && intel_arch_events[i].unit_mask == unit_mask - && (pmu->available_event_types & (1 << i))) + if (intel_arch_events[i].eventsel == event_select && + intel_arch_events[i].unit_mask == unit_mask && + (pmc_is_fixed(pmc) || pmu->available_event_types & (1 << i))) break; if (i == ARRAY_SIZE(intel_arch_events)) @@ -86,18 +87,6 @@ static unsigned intel_find_arch_event(struct kvm_pmu *pmu, return intel_arch_events[i].event_type; } -static unsigned intel_find_fixed_event(int idx) -{ - u32 event; - size_t size = ARRAY_SIZE(fixed_pmc_events); - - if (idx >= size) - return PERF_COUNT_HW_MAX; - - event = fixed_pmc_events[array_index_nospec(idx, size)]; - return intel_arch_events[event].event_type; -} - /* check if a PMC is enabled by comparing it with globl_ctrl bits. */ static bool intel_pmc_is_enabled(struct kvm_pmc *pmc) { @@ -118,16 +107,15 @@ static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) } } -/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */ -static int intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) +static bool intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); bool fixed = idx & (1u << 30); idx &= ~(3u << 30); - return (!fixed && idx >= pmu->nr_arch_gp_counters) || - (fixed && idx >= pmu->nr_arch_fixed_counters); + return fixed ? idx < pmu->nr_arch_fixed_counters + : idx < pmu->nr_arch_gp_counters; } static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, @@ -365,7 +353,7 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = pmu->global_ctrl; return 0; case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - msr_info->data = pmu->global_ovf_ctrl; + msr_info->data = 0; return 0; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || @@ -423,7 +411,6 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!(data & pmu->global_ovf_ctrl_mask)) { if (!msr_info->host_initiated) pmu->global_status &= ~data; - pmu->global_ovf_ctrl = data; return 0; } break; @@ -461,6 +448,21 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; } +static void setup_fixed_pmc_eventsel(struct kvm_pmu *pmu) +{ + size_t size = ARRAY_SIZE(fixed_pmc_events); + struct kvm_pmc *pmc; + u32 event; + int i; + + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { + pmc = &pmu->fixed_counters[i]; + event = fixed_pmc_events[array_index_nospec(i, size)]; + pmc->eventsel = (intel_arch_events[event].unit_mask << 8) | + intel_arch_events[event].eventsel; + } +} + static void intel_pmu_refresh(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -502,12 +504,14 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->nr_arch_fixed_counters = 0; } else { pmu->nr_arch_fixed_counters = - min_t(int, edx.split.num_counters_fixed, - x86_pmu.num_counters_fixed); + min3(ARRAY_SIZE(fixed_pmc_events), + (size_t) edx.split.num_counters_fixed, + (size_t) x86_pmu.num_counters_fixed); edx.split.bit_width_fixed = min_t(int, edx.split.bit_width_fixed, x86_pmu.bit_width_fixed); pmu->counter_bitmask[KVM_PMC_FIXED] = ((u64)1 << edx.split.bit_width_fixed) - 1; + setup_fixed_pmc_eventsel(pmu); } pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) | @@ -588,8 +592,7 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu) pmc->counter = 0; } - pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = - pmu->global_ovf_ctrl = 0; + pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0; intel_pmu_release_guest_lbr_event(vcpu); } @@ -706,8 +709,7 @@ static void intel_pmu_cleanup(struct kvm_vcpu *vcpu) } struct kvm_pmu_ops intel_pmu_ops = { - .find_arch_event = intel_find_arch_event, - .find_fixed_event = intel_find_fixed_event, + .pmc_perf_hw_id = intel_pmc_perf_hw_id, .pmc_is_enabled = intel_pmc_is_enabled, .pmc_idx_to_pmc = intel_pmc_idx_to_pmc, .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc, diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 5f81ef092bd4..88c53c521094 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -5,15 +5,28 @@ #include <asm/cpu.h> #include "lapic.h" +#include "irq.h" #include "posted_intr.h" #include "trace.h" #include "vmx.h" /* - * We maintain a per-CPU linked-list of vCPU, so in wakeup_handler() we - * can find which vCPU should be waken up. + * Maintain a per-CPU list of vCPUs that need to be awakened by wakeup_handler() + * when a WAKEUP_VECTOR interrupted is posted. vCPUs are added to the list when + * the vCPU is scheduled out and is blocking (e.g. in HLT) with IRQs enabled. + * The vCPUs posted interrupt descriptor is updated at the same time to set its + * notification vector to WAKEUP_VECTOR, so that posted interrupt from devices + * wake the target vCPUs. vCPUs are removed from the list and the notification + * vector is reset when the vCPU is scheduled in. */ static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); +/* + * Protect the per-CPU list with a per-CPU spinlock to handle task migration. + * When a blocking vCPU is awakened _and_ migrated to a different pCPU, the + * ->sched_in() path will need to take the vCPU off the list of the _previous_ + * CPU. IRQs must be disabled when taking this lock, otherwise deadlock will + * occur if a wakeup IRQ arrives and attempts to acquire the lock. + */ static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) @@ -21,6 +34,20 @@ static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) return &(to_vmx(vcpu)->pi_desc); } +static int pi_try_set_control(struct pi_desc *pi_desc, u64 old, u64 new) +{ + /* + * PID.ON can be set at any time by a different vCPU or by hardware, + * e.g. a device. PID.control must be written atomically, and the + * update must be retried with a fresh snapshot an ON change causes + * the cmpxchg to fail. + */ + if (cmpxchg64(&pi_desc->control, old, new) != old) + return -EBUSY; + + return 0; +} + void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); @@ -28,11 +55,14 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) unsigned int dest; /* - * In case of hot-plug or hot-unplug, we may have to undo - * vmx_vcpu_pi_put even if there is no assigned device. And we - * always keep PI.NDST up to date for simplicity: it makes the - * code easier, and CPU migration is not a fast path. + * To simplify hot-plug and dynamic toggling of APICv, keep PI.NDST and + * PI.SN up-to-date even if there is no assigned device or if APICv is + * deactivated due to a dynamic inhibit bit, e.g. for Hyper-V's SyncIC. */ + if (!enable_apicv || !lapic_in_kernel(vcpu)) + return; + + /* Nothing to do if PI.SN and PI.NDST both have the desired value. */ if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) return; @@ -48,20 +78,17 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) goto after_clear_sn; } - /* The full case. */ - do { - old.control = new.control = pi_desc->control; - - dest = cpu_physical_id(cpu); + /* The full case. Set the new destination and clear SN. */ + dest = cpu_physical_id(cpu); + if (!x2apic_mode) + dest = (dest << 8) & 0xFF00; - if (x2apic_mode) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; + do { + old.control = new.control = READ_ONCE(pi_desc->control); + new.ndst = dest; new.sn = 0; - } while (cmpxchg64(&pi_desc->control, old.control, - new.control) != old.control); + } while (pi_try_set_control(pi_desc, old.control, new.control)); after_clear_sn: @@ -77,13 +104,18 @@ after_clear_sn: pi_set_on(pi_desc); } +static bool vmx_can_use_vtd_pi(struct kvm *kvm) +{ + return irqchip_in_kernel(kvm) && enable_apicv && + kvm_arch_has_assigned_device(kvm) && + irq_remapping_cap(IRQ_POSTING_CAP); +} + void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) + if (!vmx_can_use_vtd_pi(vcpu->kvm)) return; /* Set SN when the vCPU is preempted */ @@ -97,29 +129,31 @@ static void __pi_post_block(struct kvm_vcpu *vcpu) struct pi_desc old, new; unsigned int dest; - do { - old.control = new.control = pi_desc->control; - WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR, - "Wakeup handler not enabled while the VCPU is blocked\n"); + /* + * Remove the vCPU from the wakeup list of the _previous_ pCPU, which + * will not be the same as the current pCPU if the task was migrated. + */ + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + list_del(&vcpu->blocked_vcpu_list); + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - dest = cpu_physical_id(vcpu->cpu); + dest = cpu_physical_id(vcpu->cpu); + if (!x2apic_mode) + dest = (dest << 8) & 0xFF00; - if (x2apic_mode) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; + WARN(pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR, + "Wakeup handler not enabled while the vCPU was blocking"); + + do { + old.control = new.control = READ_ONCE(pi_desc->control); + + new.ndst = dest; /* set 'NV' to 'notification vector' */ new.nv = POSTED_INTR_VECTOR; - } while (cmpxchg64(&pi_desc->control, old.control, - new.control) != old.control); - - if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) { - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - list_del(&vcpu->blocked_vcpu_list); - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - vcpu->pre_pcpu = -1; - } + } while (pi_try_set_control(pi_desc, old.control, new.control)); + + vcpu->pre_pcpu = -1; } /* @@ -128,7 +162,6 @@ static void __pi_post_block(struct kvm_vcpu *vcpu) * - Store the vCPU to the wakeup list, so when interrupts happen * we can find the right vCPU to wake up. * - Change the Posted-interrupt descriptor as below: - * 'NDST' <-- vcpu->pre_pcpu * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR * - If 'ON' is set during this process, which means at least one * interrupt is posted for this vCPU, we cannot block it, in @@ -137,70 +170,50 @@ static void __pi_post_block(struct kvm_vcpu *vcpu) */ int pi_pre_block(struct kvm_vcpu *vcpu) { - unsigned int dest; struct pi_desc old, new; struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + unsigned long flags; - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) + if (!vmx_can_use_vtd_pi(vcpu->kvm) || + vmx_interrupt_blocked(vcpu)) return 0; - WARN_ON(irqs_disabled()); - local_irq_disable(); - if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) { - vcpu->pre_pcpu = vcpu->cpu; - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - list_add_tail(&vcpu->blocked_vcpu_list, - &per_cpu(blocked_vcpu_on_cpu, - vcpu->pre_pcpu)); - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - } - - do { - old.control = new.control = pi_desc->control; + local_irq_save(flags); - WARN((pi_desc->sn == 1), - "Warning: SN field of posted-interrupts " - "is set before blocking\n"); + vcpu->pre_pcpu = vcpu->cpu; + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); + list_add_tail(&vcpu->blocked_vcpu_list, + &per_cpu(blocked_vcpu_on_cpu, vcpu->cpu)); + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); - /* - * Since vCPU can be preempted during this process, - * vcpu->cpu could be different with pre_pcpu, we - * need to set pre_pcpu as the destination of wakeup - * notification event, then we can find the right vCPU - * to wakeup in wakeup handler if interrupts happen - * when the vCPU is in blocked state. - */ - dest = cpu_physical_id(vcpu->pre_pcpu); + WARN(pi_desc->sn == 1, + "Posted Interrupt Suppress Notification set before blocking"); - if (x2apic_mode) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; + do { + old.control = new.control = READ_ONCE(pi_desc->control); /* set 'NV' to 'wakeup vector' */ new.nv = POSTED_INTR_WAKEUP_VECTOR; - } while (cmpxchg64(&pi_desc->control, old.control, - new.control) != old.control); + } while (pi_try_set_control(pi_desc, old.control, new.control)); /* We should not block the vCPU if an interrupt is posted for it. */ - if (pi_test_on(pi_desc) == 1) + if (pi_test_on(pi_desc)) __pi_post_block(vcpu); - local_irq_enable(); + local_irq_restore(flags); return (vcpu->pre_pcpu == -1); } void pi_post_block(struct kvm_vcpu *vcpu) { + unsigned long flags; + if (vcpu->pre_pcpu == -1) return; - WARN_ON(irqs_disabled()); - local_irq_disable(); + local_irq_save(flags); __pi_post_block(vcpu); - local_irq_enable(); + local_irq_restore(flags); } /* @@ -216,7 +229,7 @@ void pi_wakeup_handler(void) blocked_vcpu_list) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - if (pi_test_on(pi_desc) == 1) + if (pi_test_on(pi_desc)) kvm_vcpu_kick(vcpu); } spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); @@ -270,9 +283,7 @@ int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, struct vcpu_data vcpu_info; int idx, ret = 0; - if (!kvm_arch_has_assigned_device(kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(kvm->vcpus[0])) + if (!vmx_can_use_vtd_pi(kvm)) return 0; idx = srcu_read_lock(&kvm->irq_srcu); diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index 7f7b2326caf5..36ae035f14aa 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -40,7 +40,7 @@ static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc) (unsigned long *)&pi_desc->control); } -static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) +static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) { return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); } @@ -74,13 +74,13 @@ static inline void pi_clear_sn(struct pi_desc *pi_desc) (unsigned long *)&pi_desc->control); } -static inline int pi_test_on(struct pi_desc *pi_desc) +static inline bool pi_test_on(struct pi_desc *pi_desc) { return test_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control); } -static inline int pi_test_sn(struct pi_desc *pi_desc) +static inline bool pi_test_sn(struct pi_desc *pi_desc) { return test_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control); diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index 6693ebdc0770..35e7ec91ae86 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -53,11 +53,9 @@ static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset, static void sgx_handle_emulation_failure(struct kvm_vcpu *vcpu, u64 addr, unsigned int size) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 2; - vcpu->run->internal.data[0] = addr; - vcpu->run->internal.data[1] = size; + uint64_t data[2] = { addr, size }; + + __kvm_prepare_emulation_failure_exit(vcpu, data, ARRAY_SIZE(data)); } static int sgx_read_hva(struct kvm_vcpu *vcpu, unsigned long hva, void *data, @@ -112,9 +110,7 @@ static int sgx_inject_fault(struct kvm_vcpu *vcpu, gva_t gva, int trapnr) * but the error code isn't (yet) plumbed through the ENCLS helpers. */ if (trapnr == PF_VECTOR && !boot_cpu_has(X86_FEATURE_SGX2)) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; + kvm_prepare_emulation_failure_exit(vcpu); return 0; } @@ -155,9 +151,7 @@ static int __handle_encls_ecreate(struct kvm_vcpu *vcpu, sgx_12_0 = kvm_find_cpuid_entry(vcpu, 0x12, 0); sgx_12_1 = kvm_find_cpuid_entry(vcpu, 0x12, 1); if (!sgx_12_0 || !sgx_12_1) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; + kvm_prepare_emulation_failure_exit(vcpu); return 0; } diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 6e5de2e2b0da..e325c290a816 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -129,6 +129,11 @@ static inline bool is_machine_check(u32 intr_info) return is_exception_n(intr_info, MC_VECTOR); } +static inline bool is_nm_fault(u32 intr_info) +{ + return is_exception_n(intr_info, NM_VECTOR); +} + /* Undocumented: icebp/int1 */ static inline bool is_icebp(u32 intr_info) { diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 3a6461694fc2..435c187927c4 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -49,14 +49,14 @@ SYM_FUNC_START_LOCAL(vmx_vmenter) je 2f 1: vmresume - ret + RET 2: vmlaunch - ret + RET 3: cmpb $0, kvm_rebooting je 4f - ret + RET 4: ud2 _ASM_EXTABLE(1b, 3b) @@ -89,7 +89,7 @@ SYM_FUNC_START(vmx_vmexit) pop %_ASM_AX .Lvmexit_skip_rsb: #endif - ret + RET SYM_FUNC_END(vmx_vmexit) /** @@ -228,7 +228,7 @@ SYM_FUNC_START(__vmx_vcpu_run) pop %edi #endif pop %_ASM_BP - ret + RET /* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. */ 2: mov $1, %eax @@ -293,7 +293,7 @@ SYM_FUNC_START(vmread_error_trampoline) pop %_ASM_AX pop %_ASM_BP - ret + RET SYM_FUNC_END(vmread_error_trampoline) SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff) @@ -326,5 +326,5 @@ SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff) */ mov %_ASM_BP, %_ASM_SP pop %_ASM_BP - ret + RET SYM_FUNC_END(vmx_do_interrupt_nmi_irqoff) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 116b08904ac3..1b2e9d8c5cc9 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -35,7 +35,8 @@ #include <asm/cpu_device_id.h> #include <asm/debugreg.h> #include <asm/desc.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> +#include <asm/fpu/xstate.h> #include <asm/idtentry.h> #include <asm/io.h> #include <asm/irq_remapping.h> @@ -161,6 +162,8 @@ static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = { MSR_FS_BASE, MSR_GS_BASE, MSR_KERNEL_GS_BASE, + MSR_IA32_XFD, + MSR_IA32_XFD_ERR, #endif MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, @@ -602,15 +605,13 @@ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, unsigned int slot = msr - vmx->guest_uret_msrs; int ret = 0; - u64 old_msr_data = msr->data; - msr->data = data; if (msr->load_into_hardware) { preempt_disable(); - ret = kvm_set_user_return_msr(slot, msr->data, msr->mask); + ret = kvm_set_user_return_msr(slot, data, msr->mask); preempt_enable(); - if (ret) - msr->data = old_msr_data; } + if (!ret) + msr->data = data; return ret; } @@ -763,30 +764,27 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu) vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match); } + /* + * Disabling xfd interception indicates that dynamic xfeatures + * might be used in the guest. Always trap #NM in this case + * to save guest xfd_err timely. + */ + if (vcpu->arch.xfd_no_write_intercept) + eb |= (1u << NM_VECTOR); + vmcs_write32(EXCEPTION_BITMAP, eb); } /* * Check if MSR is intercepted for currently loaded MSR bitmap. */ -static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) +static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr) { - unsigned long *msr_bitmap; - int f = sizeof(unsigned long); - - if (!cpu_has_vmx_msr_bitmap()) + if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS)) return true; - msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap; - - if (msr <= 0x1fff) { - return !!test_bit(msr, msr_bitmap + 0x800 / f); - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - return !!test_bit(msr, msr_bitmap + 0xc00 / f); - } - - return true; + return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, + MSR_IA32_SPEC_CTRL); } static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, @@ -1059,8 +1057,8 @@ static void pt_guest_enter(struct vcpu_vmx *vmx) rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { wrmsrl(MSR_IA32_RTIT_CTL, 0); - pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range); - pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range); + pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges); + pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges); } } @@ -1070,17 +1068,26 @@ static void pt_guest_exit(struct vcpu_vmx *vmx) return; if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { - pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range); - pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range); + pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges); + pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges); } - /* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */ - wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); + /* + * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest, + * i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary. + */ + if (vmx->pt_desc.host.ctl) + wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); } -void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, - unsigned long fs_base, unsigned long gs_base) +void vmx_set_vmcs_host_state(struct vmcs_host_state *host, unsigned long cr3, + u16 fs_sel, u16 gs_sel, + unsigned long fs_base, unsigned long gs_base) { + if (unlikely(cr3 != host->cr3)) { + vmcs_writel(HOST_CR3, cr3); + host->cr3 = cr3; + } if (unlikely(fs_sel != host->fs_sel)) { if (!(fs_sel & 7)) vmcs_write16(HOST_FS_SELECTOR, fs_sel); @@ -1175,7 +1182,9 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) gs_base = segment_base(gs_sel); #endif - vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base); + vmx_set_vmcs_host_state(host_state, __get_current_cr3_fast(), + fs_sel, gs_sel, fs_base, gs_base); + vmx->guest_state_loaded = true; } @@ -1278,7 +1287,6 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, if (!already_loaded) { void *gdt = get_current_gdt_ro(); - unsigned long sysenter_esp; /* * Flush all EPTP/VPID contexts, the new pCPU may have stale @@ -1294,8 +1302,11 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss); vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ - rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); - vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ + if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) { + /* 22.2.3 */ + vmcs_writel(HOST_IA32_SYSENTER_ESP, + (unsigned long)(cpu_entry_stack(cpu) + 1)); + } vmx->loaded_vmcs->cpu = cpu; } @@ -1370,6 +1381,11 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) vmx->emulation_required = vmx_emulation_required(vcpu); } +static bool vmx_get_if_flag(struct kvm_vcpu *vcpu) +{ + return vmx_get_rflags(vcpu) & X86_EFLAGS_IF; +} + u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) { u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); @@ -1456,16 +1472,16 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) * cause a #GP fault. */ value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET; - if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2)) + if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2)) return 1; value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET; - if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2)) + if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2)) return 1; value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET; - if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2)) + if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2)) return 1; value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET; - if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2)) + if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2)) return 1; return 0; @@ -1755,7 +1771,7 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr) } /* - * Reads an msr value (of 'msr_index') into 'pdata'. + * Reads an msr value (of 'msr_info->index') into 'msr_info->data'. * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ @@ -1886,8 +1902,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; if (!vmx_pt_mode_is_host_guest() || - (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps, - PT_CAP_num_address_ranges))) + (index >= 2 * vmx->pt_desc.num_address_ranges)) return 1; if (index % 2) msr_info->data = vmx->pt_desc.guest.addr_b[index / 2]; @@ -1963,6 +1978,24 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_KERNEL_GS_BASE: vmx_write_guest_kernel_gs_base(vmx, data); break; + case MSR_IA32_XFD: + ret = kvm_set_msr_common(vcpu, msr_info); + /* + * Always intercepting WRMSR could incur non-negligible + * overhead given xfd might be changed frequently in + * guest context switch. Disable write interception + * upon the first write with a non-zero value (indicating + * potential usage on dynamic xfeatures). Also update + * exception bitmap to trap #NM for proper virtualization + * of guest xfd_err. + */ + if (!ret && data) { + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, + MSR_TYPE_RW); + vcpu->arch.xfd_no_write_intercept = true; + vmx_update_exception_bitmap(vcpu); + } + break; #endif case MSR_IA32_SYSENTER_CS: if (is_guest_mode(vcpu)) @@ -2103,9 +2136,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } ret = kvm_set_msr_common(vcpu, msr_info); break; - case MSR_IA32_TSC_ADJUST: - ret = kvm_set_msr_common(vcpu, msr_info); - break; case MSR_IA32_MCG_EXT_CTL: if ((!msr_info->host_initiated && !(to_vmx(vcpu)->msr_ia32_feature_control & @@ -2202,8 +2232,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!pt_can_write_msr(vmx)) return 1; index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; - if (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps, - PT_CAP_num_address_ranges)) + if (index >= 2 * vmx->pt_desc.num_address_ranges) return 1; if (is_noncanonical_address(data, vcpu)) return 1; @@ -2655,15 +2684,6 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) if (!loaded_vmcs->msr_bitmap) goto out_vmcs; memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); - - if (IS_ENABLED(CONFIG_HYPERV) && - static_branch_unlikely(&enable_evmcs) && - (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) { - struct hv_enlightened_vmcs *evmcs = - (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs; - - evmcs->hv_enlightenments_control.msr_bitmap = 1; - } } memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state)); @@ -2927,6 +2947,13 @@ static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu) } } +static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu) +{ + if (is_guest_mode(vcpu)) + return nested_get_vpid02(vcpu); + return to_vmx(vcpu)->vpid; +} + static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; @@ -2939,31 +2966,29 @@ static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) if (enable_ept) ept_sync_context(construct_eptp(vcpu, root_hpa, mmu->shadow_root_level)); - else if (!is_guest_mode(vcpu)) - vpid_sync_context(to_vmx(vcpu)->vpid); else - vpid_sync_context(nested_get_vpid02(vcpu)); + vpid_sync_context(vmx_get_current_vpid(vcpu)); } static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) { /* - * vpid_sync_vcpu_addr() is a nop if vmx->vpid==0, see the comment in + * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in * vmx_flush_tlb_guest() for an explanation of why this is ok. */ - vpid_sync_vcpu_addr(to_vmx(vcpu)->vpid, addr); + vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr); } static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu) { /* - * vpid_sync_context() is a nop if vmx->vpid==0, e.g. if enable_vpid==0 - * or a vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit - * are required to flush GVA->{G,H}PA mappings from the TLB if vpid is + * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a + * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are + * required to flush GVA->{G,H}PA mappings from the TLB if vpid is * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed), * i.e. no explicit INVVPID is necessary. */ - vpid_sync_context(to_vmx(vcpu)->vpid); + vpid_sync_context(vmx_get_current_vpid(vcpu)); } void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu) @@ -2993,7 +3018,7 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu) mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); - kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); + kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR); } #define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \ @@ -3075,6 +3100,13 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */ if ((old_cr0_pg ^ cr0) & X86_CR0_PG) vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); + + /* + * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but + * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG. + */ + if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG)) + kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); } /* depends on vcpu->arch.cr0 to be set to a new value */ @@ -3118,9 +3150,9 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, if (!enable_unrestricted_guest && !is_paging(vcpu)) guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; - else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) + else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3)) guest_cr3 = vcpu->arch.cr3; - else /* vmcs01.GUEST_CR3 is already up-to-date. */ + else /* vmcs.GUEST_CR3 is already up-to-date. */ update_guest_cr3 = false; vmx_ept_load_pdptrs(vcpu); } else { @@ -3131,6 +3163,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, vmcs_writel(GUEST_CR3, guest_cr3); } + static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { /* @@ -3695,44 +3728,17 @@ void free_vpid(int vpid) spin_unlock(&vmx_vpid_lock); } -static void vmx_clear_msr_bitmap_read(ulong *msr_bitmap, u32 msr) +static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx) { - int f = sizeof(unsigned long); - - if (msr <= 0x1fff) - __clear_bit(msr, msr_bitmap + 0x000 / f); - else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) - __clear_bit(msr & 0x1fff, msr_bitmap + 0x400 / f); -} - -static void vmx_clear_msr_bitmap_write(ulong *msr_bitmap, u32 msr) -{ - int f = sizeof(unsigned long); - - if (msr <= 0x1fff) - __clear_bit(msr, msr_bitmap + 0x800 / f); - else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) - __clear_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f); -} - -static void vmx_set_msr_bitmap_read(ulong *msr_bitmap, u32 msr) -{ - int f = sizeof(unsigned long); - - if (msr <= 0x1fff) - __set_bit(msr, msr_bitmap + 0x000 / f); - else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) - __set_bit(msr & 0x1fff, msr_bitmap + 0x400 / f); -} - -static void vmx_set_msr_bitmap_write(ulong *msr_bitmap, u32 msr) -{ - int f = sizeof(unsigned long); + /* + * When KVM is a nested hypervisor on top of Hyper-V and uses + * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR + * bitmap has changed. + */ + if (static_branch_unlikely(&enable_evmcs)) + evmcs_touch_msr_bitmap(); - if (msr <= 0x1fff) - __set_bit(msr, msr_bitmap + 0x800 / f); - else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) - __set_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f); + vmx->nested.force_msr_bitmap_recalc = true; } void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) @@ -3743,8 +3749,7 @@ void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) if (!cpu_has_vmx_msr_bitmap()) return; - if (static_branch_unlikely(&enable_evmcs)) - evmcs_touch_msr_bitmap(); + vmx_msr_bitmap_l01_changed(vmx); /* * Mark the desired intercept state in shadow bitmap, this is needed @@ -3788,8 +3793,7 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) if (!cpu_has_vmx_msr_bitmap()) return; - if (static_branch_unlikely(&enable_evmcs)) - evmcs_touch_msr_bitmap(); + vmx_msr_bitmap_l01_changed(vmx); /* * Mark the desired intercept state in shadow bitmap, this is needed @@ -3879,7 +3883,7 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag); vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag); vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag); - for (i = 0; i < vmx->pt_desc.addr_range; i++) { + for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) { vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag); } @@ -3979,6 +3983,19 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, */ vmx->nested.pi_pending = true; kvm_make_request(KVM_REQ_EVENT, vcpu); + + /* + * This pairs with the smp_mb_*() after setting vcpu->mode in + * vcpu_enter_guest() to guarantee the vCPU sees the event + * request if triggering a posted interrupt "fails" because + * vcpu->mode != IN_GUEST_MODE. The extra barrier is needed as + * the smb_wmb() in kvm_make_request() only ensures everything + * done before making the request is visible when the request + * is visible, it doesn't ensure ordering between the store to + * vcpu->requests and the load from vcpu->mode. + */ + smp_mb__after_atomic(); + /* the PIR and ON have been set by L1. */ if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true)) kvm_vcpu_kick(vcpu); @@ -4012,8 +4029,13 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) if (pi_test_and_set_on(&vmx->pi_desc)) return 0; - if (vcpu != kvm_get_running_vcpu() && - !kvm_vcpu_trigger_posted_interrupt(vcpu, false)) + /* + * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*() + * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is + * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a + * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE. + */ + if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false)) kvm_vcpu_kick(vcpu); return 0; @@ -4070,6 +4092,12 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); vmcs_write32(HOST_IA32_SYSENTER_CS, low32); + + /* + * If 32-bit syscall is enabled, vmx_vcpu_load_vcms rewrites + * HOST_IA32_SYSENTER_ESP. + */ + vmcs_writel(HOST_IA32_SYSENTER_ESP, 0); rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ @@ -4088,8 +4116,10 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS & ~vcpu->arch.cr4_guest_rsvd_bits; - if (!enable_ept) - vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PGE; + if (!enable_ept) { + vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS; + vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS; + } if (is_guest_mode(&vmx->vcpu)) vcpu->arch.cr4_guest_owned_bits &= ~get_vmcs12(vcpu)->cr4_guest_host_mask; @@ -4328,10 +4358,6 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) #define VMX_XSS_EXIT_BITMAP 0 -/* - * Noting that the initialization of Guest-state Area of VMCS is in - * vmx_vcpu_reset(). - */ static void init_vmcs(struct vcpu_vmx *vmx) { if (nested) @@ -4340,7 +4366,7 @@ static void init_vmcs(struct vcpu_vmx *vmx) if (cpu_has_vmx_msr_bitmap()) vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap)); - vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ + vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */ /* Control */ pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); @@ -4436,10 +4462,40 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmx_setup_uret_msrs(vmx); } +static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + init_vmcs(vmx); + + if (nested) + memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs)); + + vcpu_setup_sgx_lepubkeyhash(vcpu); + + vmx->nested.posted_intr_nv = -1; + vmx->nested.vmxon_ptr = INVALID_GPA; + vmx->nested.current_vmptr = INVALID_GPA; + vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; + + vcpu->arch.microcode_version = 0x100000000ULL; + vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED; + + /* + * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR + * or POSTED_INTR_WAKEUP_VECTOR. + */ + vmx->pi_desc.nv = POSTED_INTR_VECTOR; + vmx->pi_desc.sn = 1; +} + static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (!init_event) + __vmx_vcpu_reset(vcpu); + vmx->rmode.vm86_active = 0; vmx->spec_ctrl = 0; @@ -4449,6 +4505,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_set_cr8(vcpu, 0); vmx_segment_cache_clear(vmx); + kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS); seg_setup(VCPU_SREG_CS); vmcs_write16(GUEST_CS_SELECTOR, 0xf000); @@ -4714,7 +4771,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, if (kvm_emulate_instruction(vcpu, 0)) { if (vcpu->arch.halt_request) { vcpu->arch.halt_request = 0; - return kvm_vcpu_halt(vcpu); + return kvm_emulate_halt_noskip(vcpu); } return 1; } @@ -4770,6 +4827,17 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) if (is_machine_check(intr_info) || is_nmi(intr_info)) return 1; /* handled by handle_exception_nmi_irqoff() */ + /* + * Queue the exception here instead of in handle_nm_fault_irqoff(). + * This ensures the nested_vmx check is not skipped so vmexit can + * be reflected to L1 (when it intercepts #NM) before reaching this + * point. + */ + if (is_nm_fault(intr_info)) { + kvm_queue_exception(vcpu, NM_VECTOR); + return 1; + } + if (is_invalid_opcode(intr_info)) return handle_ud(vcpu); @@ -5379,16 +5447,13 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (vmx->emulation_required && !vmx->rmode.vm86_active && vcpu->arch.exception.pending) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = - KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; + kvm_prepare_emulation_failure_exit(vcpu); return 0; } if (vcpu->arch.halt_request) { vcpu->arch.halt_request = 0; - return kvm_vcpu_halt(vcpu); + return kvm_emulate_halt_noskip(vcpu); } /* @@ -5468,6 +5533,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) u64 pcid; u64 gla; } operand; + int gpr_index; if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { kvm_queue_exception(vcpu, UD_VECTOR); @@ -5475,12 +5541,8 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) } vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf); - - if (type > 3) { - kvm_inject_gp(vcpu, 0); - return 1; - } + gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); + type = kvm_register_read(vcpu, gpr_index); /* According to the Intel instruction reference, the memory operand * is read even if it isn't needed (e.g., for type==all) @@ -5562,9 +5624,13 @@ static int handle_encls(struct kvm_vcpu *vcpu) static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) { - vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; - vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK; - return 0; + /* + * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK + * VM-Exits. Unconditionally set the flag here and leave the handling to + * vmx_handle_exit(). + */ + to_vmx(vcpu)->exit_reason.bus_lock_detected = true; + return 1; } /* @@ -5629,11 +5695,13 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { static const int kvm_vmx_max_exit_handlers = ARRAY_SIZE(kvm_vmx_exit_handlers); -static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2, +static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, + u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); + *reason = vmx->exit_reason.full; *info1 = vmx_get_exit_qual(vcpu); if (!(vmx->exit_reason.failed_vmentry)) { *info2 = vmx->idt_vectoring_info; @@ -5903,18 +5971,14 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) vmx_flush_pml_buffer(vcpu); /* - * We should never reach this point with a pending nested VM-Enter, and - * more specifically emulation of L2 due to invalid guest state (see - * below) should never happen as that means we incorrectly allowed a - * nested VM-Enter with an invalid vmcs12. + * KVM should never reach this point with a pending nested VM-Enter. + * More specifically, short-circuiting VM-Entry to emulate L2 due to + * invalid guest state should never happen as that means KVM knowingly + * allowed a nested VM-Enter with an invalid vmcs12. More below. */ if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm)) return -EIO; - /* If guest state is invalid, start emulating */ - if (vmx->emulation_required) - return handle_invalid_guest_state(vcpu); - if (is_guest_mode(vcpu)) { /* * PML is never enabled when running L2, bail immediately if a @@ -5936,10 +6000,30 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) */ nested_mark_vmcs12_pages_dirty(vcpu); + /* + * Synthesize a triple fault if L2 state is invalid. In normal + * operation, nested VM-Enter rejects any attempt to enter L2 + * with invalid state. However, those checks are skipped if + * state is being stuffed via RSM or KVM_SET_NESTED_STATE. If + * L2 state is invalid, it means either L1 modified SMRAM state + * or userspace provided bad state. Synthesize TRIPLE_FAULT as + * doing so is architecturally allowed in the RSM case, and is + * the least awful solution for the userspace case without + * risking false positives. + */ + if (vmx->emulation_required) { + nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); + return 1; + } + if (nested_vmx_reflect_vmexit(vcpu)) return 1; } + /* If guest state is invalid, start emulating. L2 is handled above. */ + if (vmx->emulation_required) + return handle_invalid_guest_state(vcpu); + if (exit_reason.failed_vmentry) { dump_vmcs(vcpu); vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; @@ -6051,9 +6135,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) int ret = __vmx_handle_exit(vcpu, exit_fastpath); /* - * Even when current exit reason is handled by KVM internally, we - * still need to exit to user space when bus lock detected to inform - * that there is a bus lock in guest. + * Exit to user space when bus lock detected to inform that there is + * a bus lock in guest. */ if (to_vmx(vcpu)->exit_reason.bus_lock_detected) { if (ret > 0) @@ -6285,9 +6368,9 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); int max_irr; - bool max_irr_updated; + bool got_posted_interrupt; - if (KVM_BUG_ON(!vcpu->arch.apicv_active, vcpu->kvm)) + if (KVM_BUG_ON(!enable_apicv, vcpu->kvm)) return -EIO; if (pi_test_on(&vmx->pi_desc)) { @@ -6297,27 +6380,33 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) * But on x86 this is just a compiler barrier anyway. */ smp_mb__after_atomic(); - max_irr_updated = + got_posted_interrupt = kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr); - - /* - * If we are running L2 and L1 has a new pending interrupt - * which can be injected, we should re-evaluate - * what should be done with this new L1 interrupt. - * If L1 intercepts external-interrupts, we should - * exit from L2 to L1. Otherwise, interrupt should be - * delivered directly to L2. - */ - if (is_guest_mode(vcpu) && max_irr_updated) { - if (nested_exit_on_intr(vcpu)) - kvm_vcpu_exiting_guest_mode(vcpu); - else - kvm_make_request(KVM_REQ_EVENT, vcpu); - } } else { max_irr = kvm_lapic_find_highest_irr(vcpu); + got_posted_interrupt = false; } - vmx_hwapic_irr_update(vcpu, max_irr); + + /* + * Newly recognized interrupts are injected via either virtual interrupt + * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is + * disabled in two cases: + * + * 1) If L2 is running and the vCPU has a new pending interrupt. If L1 + * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a + * VM-Exit to L1. If L1 doesn't want to exit, the interrupt is injected + * into L2, but KVM doesn't use virtual interrupt delivery to inject + * interrupts into L2, and so KVM_REQ_EVENT is again needed. + * + * 2) If APICv is disabled for this vCPU, assigned devices may still + * attempt to post interrupts. The posted interrupt vector will cause + * a VM-Exit and the subsequent entry will call sync_pir_to_irr. + */ + if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu)) + vmx_set_rvi(max_irr); + else if (got_posted_interrupt) + kvm_make_request(KVM_REQ_EVENT, vcpu); + return max_irr; } @@ -6345,11 +6434,33 @@ void vmx_do_interrupt_nmi_irqoff(unsigned long entry); static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, unsigned long entry) { - kvm_before_interrupt(vcpu); + bool is_nmi = entry == (unsigned long)asm_exc_nmi_noist; + + kvm_before_interrupt(vcpu, is_nmi ? KVM_HANDLING_NMI : KVM_HANDLING_IRQ); vmx_do_interrupt_nmi_irqoff(entry); kvm_after_interrupt(vcpu); } +static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) +{ + /* + * Save xfd_err to guest_fpu before interrupt is enabled, so the + * MSR value is not clobbered by the host activity before the guest + * has chance to consume it. + * + * Do not blindly read xfd_err here, since this exception might + * be caused by L1 interception on a platform which doesn't + * support xfd at all. + * + * Do it conditionally upon guest_fpu::xfd. xfd_err matters + * only when xfd contains a non-zero value. + * + * Queuing exception is done in vmx_handle_exit. See comment there. + */ + if (vcpu->arch.guest_fpu.fpstate->xfd) + rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); +} + static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) { const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist; @@ -6358,6 +6469,9 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) /* if exit due to PF check for async PF */ if (is_page_fault(intr_info)) vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); + /* if exit due to NM, handle before interrupts are enabled */ + else if (is_nm_fault(intr_info)) + handle_nm_fault_irqoff(&vmx->vcpu); /* Handle machine checks before interrupts are enabled */ else if (is_machine_check(intr_info)) kvm_machine_check(); @@ -6408,6 +6522,7 @@ static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index) case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: return nested; case MSR_AMD64_VIRT_SPEC_CTRL: + case MSR_AMD64_TSC_RATIO: /* This is AMD only. */ return false; default: @@ -6615,7 +6730,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long cr3, cr4; + unsigned long cr4; /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!enable_vnmi && @@ -6628,9 +6743,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) * consistency check VM-Exit due to invalid guest state and bail. */ if (unlikely(vmx->emulation_required)) { - - /* We don't emulate invalid state of a nested guest */ - vmx->fail = is_guest_mode(vcpu); + vmx->fail = 0; vmx->exit_reason.full = EXIT_REASON_INVALID_STATE; vmx->exit_reason.failed_vmentry = 1; @@ -6658,12 +6771,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP)) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); - - cr3 = __get_current_cr3_fast(); - if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { - vmcs_writel(HOST_CR3, cr3); - vmx->loaded_vmcs->host_state.cr3 = cr3; - } + vcpu->arch.regs_dirty = 0; cr4 = cr4_read_shadow(); if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { @@ -6722,7 +6830,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) * If the L02 MSR bitmap does not intercept the MSR, then we need to * save it. */ - if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) + if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))) vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0); @@ -6752,7 +6860,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) loadsegment(es, __USER_DS); #endif - vmx_register_cache_reset(vcpu); + vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET; pt_guest_exit(vmx); @@ -6784,7 +6892,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) if (likely(!vmx->exit_reason.failed_vmentry)) vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - trace_kvm_exit(vmx->exit_reason.full, vcpu, KVM_ISA_VMX); + trace_kvm_exit(vcpu, KVM_ISA_VMX); if (unlikely(vmx->exit_reason.failed_vmentry)) return EXIT_FASTPATH_NONE; @@ -6815,7 +6923,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) { struct vmx_uret_msr *tsx_ctrl; struct vcpu_vmx *vmx; - int i, cpu, err; + int i, err; BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); vmx = to_vmx(vcpu); @@ -6836,10 +6944,8 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) goto free_vpid; } - for (i = 0; i < kvm_nr_uret_msrs; ++i) { - vmx->guest_uret_msrs[i].data = 0; + for (i = 0; i < kvm_nr_uret_msrs; ++i) vmx->guest_uret_msrs[i].mask = -1ull; - } if (boot_cpu_has(X86_FEATURE_RTM)) { /* * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception. @@ -6855,6 +6961,19 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) if (err < 0) goto free_pml; + /* + * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a + * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the + * feature only for vmcs01, KVM currently isn't equipped to realize any + * performance benefits from enabling it for vmcs02. + */ + if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs) && + (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) { + struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs; + + evmcs->hv_enlightenments_control.msr_bitmap = 1; + } + /* The MSR bitmap starts with all ones */ bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS); bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS); @@ -6876,12 +6995,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) } vmx->loaded_vmcs = &vmx->vmcs01; - cpu = get_cpu(); - vmx_vcpu_load(vcpu, cpu); - vcpu->cpu = cpu; - init_vmcs(vmx); - vmx_vcpu_put(vcpu); - put_cpu(); + if (cpu_need_virtualize_apic_accesses(vcpu)) { err = alloc_apic_access_page(vcpu->kvm); if (err) @@ -6894,27 +7008,6 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) goto free_vmcs; } - if (nested) - memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs)); - else - memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs)); - - vcpu_setup_sgx_lepubkeyhash(vcpu); - - vmx->nested.posted_intr_nv = -1; - vmx->nested.current_vmptr = -1ull; - vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; - - vcpu->arch.microcode_version = 0x100000000ULL; - vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED; - - /* - * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR - * or POSTED_INTR_WAKEUP_VECTOR. - */ - vmx->pi_desc.nv = POSTED_INTR_VECTOR; - vmx->pi_desc.sn = 1; - return 0; free_vmcs: @@ -6986,7 +7079,6 @@ static int __init vmx_check_processor_compat(void) static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { u8 cache; - u64 ipat = 0; /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in * memory aliases with conflicting memory types and sometimes MCEs. @@ -7006,30 +7098,22 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) * EPT memory type is used to emulate guest CD/MTRR. */ - if (is_mmio) { - cache = MTRR_TYPE_UNCACHABLE; - goto exit; - } + if (is_mmio) + return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; - if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) { - ipat = VMX_EPT_IPAT_BIT; - cache = MTRR_TYPE_WRBACK; - goto exit; - } + if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) + return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; if (kvm_read_cr0(vcpu) & X86_CR0_CD) { - ipat = VMX_EPT_IPAT_BIT; if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) cache = MTRR_TYPE_WRBACK; else cache = MTRR_TYPE_UNCACHABLE; - goto exit; - } - cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn); + return (cache << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; + } -exit: - return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat; + return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT; } static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl) @@ -7129,12 +7213,13 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) } /* Get the number of configurable Address Ranges for filtering */ - vmx->pt_desc.addr_range = intel_pt_validate_cap(vmx->pt_desc.caps, + vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_num_address_ranges); /* Initialize and clear the no dependency bits */ vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS | - RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC); + RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC | + RTIT_CTL_BRANCH_EN); /* * If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set otherwise @@ -7152,12 +7237,11 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ); /* - * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn BranchEn and - * MTCFreq can be set + * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set */ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc)) vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN | - RTIT_CTL_BRANCH_EN | RTIT_CTL_MTC_RANGE); + RTIT_CTL_MTC_RANGE); /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite)) @@ -7177,7 +7261,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN; /* unmask address range configure area */ - for (i = 0; i < vmx->pt_desc.addr_range; i++) + for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4)); } @@ -7221,6 +7305,11 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) } } + if (kvm_cpu_cap_has(X86_FEATURE_XFD)) + vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R, + !guest_cpuid_has(vcpu, X86_FEATURE_XFD)); + + set_cr4_guest_host_mask(vmx); vmx_write_encls_bitmap(vcpu, NULL); @@ -7553,6 +7642,8 @@ static void vmx_migrate_timers(struct kvm_vcpu *vcpu) static void hardware_unsetup(void) { + kvm_set_posted_intr_wakeup_handler(NULL); + if (nested) nested_vmx_hardware_unsetup(); @@ -7562,12 +7653,16 @@ static void hardware_unsetup(void) static bool vmx_check_apicv_inhibit_reasons(ulong bit) { ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | - BIT(APICV_INHIBIT_REASON_HYPERV); + BIT(APICV_INHIBIT_REASON_ABSENT) | + BIT(APICV_INHIBIT_REASON_HYPERV) | + BIT(APICV_INHIBIT_REASON_BLOCKIRQ); return supported & BIT(bit); } static struct kvm_x86_ops vmx_x86_ops __initdata = { + .name = "kvm_intel", + .hardware_unsetup = hardware_unsetup, .hardware_enable = hardware_enable, @@ -7608,6 +7703,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, + .get_if_flag = vmx_get_if_flag, .tlb_flush_all = vmx_flush_tlb_all, .tlb_flush_current = vmx_flush_tlb_current, @@ -7703,6 +7799,20 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, }; +static unsigned int vmx_handle_intel_pt_intr(void) +{ + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); + + /* '0' on failure so that the !PT case can use a RET0 static call. */ + if (!kvm_arch_pmi_in_guest(vcpu)) + return 0; + + kvm_make_request(KVM_REQ_PMI, vcpu); + __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT, + (unsigned long *)&vcpu->arch.pmu.global_status); + return 1; +} + static __init void vmx_setup_user_return_msrs(void) { @@ -7729,11 +7839,13 @@ static __init void vmx_setup_user_return_msrs(void) kvm_add_user_return_msr(vmx_uret_msrs_list[i]); } +static struct kvm_x86_init_ops vmx_init_ops __initdata; + static __init int hardware_setup(void) { unsigned long host_bndcfgs; struct desc_ptr dt; - int r, ept_lpage_level; + int r; store_idt(&dt); host_idt_base = dt.address; @@ -7811,10 +7923,10 @@ static __init int hardware_setup(void) ple_window_shrink = 0; } - if (!cpu_has_vmx_apicv()) { + if (!cpu_has_vmx_apicv()) enable_apicv = 0; + if (!enable_apicv) vmx_x86_ops.sync_pir_to_irr = NULL; - } if (cpu_has_vmx_tsc_scaling()) { kvm_has_tsc_control = true; @@ -7830,16 +7942,8 @@ static __init int hardware_setup(void) kvm_mmu_set_ept_masks(enable_ept_ad_bits, cpu_has_vmx_ept_execute_only()); - if (!enable_ept) - ept_lpage_level = 0; - else if (cpu_has_vmx_ept_1g_page()) - ept_lpage_level = PG_LEVEL_1G; - else if (cpu_has_vmx_ept_2m_page()) - ept_lpage_level = PG_LEVEL_2M; - else - ept_lpage_level = PG_LEVEL_4K; kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(), - ept_lpage_level); + ept_caps_to_lpage_level(vmx_capability.ept)); /* * Only enable PML when hardware supports PML feature, and both EPT @@ -7881,14 +7985,16 @@ static __init int hardware_setup(void) vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit; } - kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); - kvm_mce_cap_supported |= MCG_LMCE_P; if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST) return -EINVAL; if (!enable_ept || !cpu_has_vmx_intel_pt()) pt_mode = PT_MODE_SYSTEM; + if (pt_mode == PT_MODE_HOST_GUEST) + vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr; + else + vmx_init_ops.handle_intel_pt_intr = NULL; setup_default_sgx_lepubkeyhash(); @@ -7906,6 +8012,9 @@ static __init int hardware_setup(void) r = alloc_kvm_area(); if (r) nested_vmx_hardware_unsetup(); + + kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); + return r; } @@ -7914,6 +8023,7 @@ static struct kvm_x86_init_ops vmx_init_ops __initdata = { .disabled_by_bios = vmx_disabled_by_bios, .check_processor_compatibility = vmx_check_processor_compat, .hardware_setup = hardware_setup, + .handle_intel_pt_intr = NULL, .runtime_ops = &vmx_x86_ops, }; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 592217fd7d92..f8fc7441baea 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -62,7 +62,7 @@ struct pt_ctx { struct pt_desc { u64 ctl_bitmask; - u32 addr_range; + u32 num_address_ranges; u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; struct pt_ctx host; struct pt_ctx guest; @@ -142,6 +142,16 @@ struct nested_vmx { struct vmcs12 *cached_shadow_vmcs12; /* + * GPA to HVA cache for accessing vmcs12->vmcs_link_pointer + */ + struct gfn_to_hva_cache shadow_vmcs12_cache; + + /* + * GPA to HVA cache for VMCS12 + */ + struct gfn_to_hva_cache vmcs12_cache; + + /* * Indicates if the shadow vmcs or enlightened vmcs must be updated * with the data held by struct vmcs12. */ @@ -149,6 +159,15 @@ struct nested_vmx { bool dirty_vmcs12; /* + * Indicates whether MSR bitmap for L2 needs to be rebuilt due to + * changes in MSR bitmap for L1 or switching to a different L2. Note, + * this flag can only be used reliably in conjunction with a paravirt L1 + * which informs L0 whether any changes to MSR bitmap for L2 were done + * on its side. + */ + bool force_msr_bitmap_recalc; + + /* * Indicates lazily loaded guest state has not yet been decached from * vmcs02. */ @@ -330,7 +349,7 @@ struct vcpu_vmx { struct lbr_desc lbr_desc; /* Save desired MSR intercept (read: pass-through) state */ -#define MAX_POSSIBLE_PASSTHROUGH_MSRS 13 +#define MAX_POSSIBLE_PASSTHROUGH_MSRS 15 struct { DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS); DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS); @@ -352,8 +371,9 @@ int allocate_vpid(void); void free_vpid(int vpid); void vmx_set_constant_host_state(struct vcpu_vmx *vmx); void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); -void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, - unsigned long fs_base, unsigned long gs_base); +void vmx_set_vmcs_host_state(struct vmcs_host_state *host, unsigned long cr3, + u16 fs_sel, u16 gs_sel, + unsigned long fs_base, unsigned long gs_base); int vmx_get_cpl(struct kvm_vcpu *vcpu); bool vmx_emulation_required(struct kvm_vcpu *vcpu); unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu); @@ -400,6 +420,34 @@ static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu); +/* + * Note, early Intel manuals have the write-low and read-high bitmap offsets + * the wrong way round. The bitmaps control MSRs 0x00000000-0x00001fff and + * 0xc0000000-0xc0001fff. The former (low) uses bytes 0-0x3ff for reads and + * 0x800-0xbff for writes. The latter (high) uses 0x400-0x7ff for reads and + * 0xc00-0xfff for writes. MSRs not covered by either of the ranges always + * VM-Exit. + */ +#define __BUILD_VMX_MSR_BITMAP_HELPER(rtype, action, bitop, access, base) \ +static inline rtype vmx_##action##_msr_bitmap_##access(unsigned long *bitmap, \ + u32 msr) \ +{ \ + int f = sizeof(unsigned long); \ + \ + if (msr <= 0x1fff) \ + return bitop##_bit(msr, bitmap + base / f); \ + else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) \ + return bitop##_bit(msr & 0x1fff, bitmap + (base + 0x400) / f); \ + return (rtype)true; \ +} +#define BUILD_VMX_MSR_BITMAP_HELPERS(ret_type, action, bitop) \ + __BUILD_VMX_MSR_BITMAP_HELPER(ret_type, action, bitop, read, 0x0) \ + __BUILD_VMX_MSR_BITMAP_HELPER(ret_type, action, bitop, write, 0x800) + +BUILD_VMX_MSR_BITMAP_HELPERS(bool, test, test) +BUILD_VMX_MSR_BITMAP_HELPERS(void, clear, __clear) +BUILD_VMX_MSR_BITMAP_HELPERS(void, set, __set) + static inline u8 vmx_get_rvi(void) { return vmcs_read16(GUEST_INTR_STATUS) & 0xff; @@ -435,19 +483,21 @@ BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL) BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL) BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL) -static inline void vmx_register_cache_reset(struct kvm_vcpu *vcpu) -{ - vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) - | (1 << VCPU_EXREG_RFLAGS) - | (1 << VCPU_EXREG_PDPTR) - | (1 << VCPU_EXREG_SEGMENTS) - | (1 << VCPU_EXREG_CR0) - | (1 << VCPU_EXREG_CR3) - | (1 << VCPU_EXREG_CR4) - | (1 << VCPU_EXREG_EXIT_INFO_1) - | (1 << VCPU_EXREG_EXIT_INFO_2)); - vcpu->arch.regs_dirty = 0; -} +/* + * VMX_REGS_LAZY_LOAD_SET - The set of registers that will be updated in the + * cache on demand. Other registers not listed here are synced to + * the cache immediately after VM-Exit. + */ +#define VMX_REGS_LAZY_LOAD_SET ((1 << VCPU_REGS_RIP) | \ + (1 << VCPU_REGS_RSP) | \ + (1 << VCPU_EXREG_RFLAGS) | \ + (1 << VCPU_EXREG_PDPTR) | \ + (1 << VCPU_EXREG_SEGMENTS) | \ + (1 << VCPU_EXREG_CR0) | \ + (1 << VCPU_EXREG_CR3) | \ + (1 << VCPU_EXREG_CR4) | \ + (1 << VCPU_EXREG_EXIT_INFO_1) | \ + (1 << VCPU_EXREG_EXIT_INFO_2)) static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm) { @@ -522,4 +572,9 @@ static inline bool vmx_guest_state_valid(struct kvm_vcpu *vcpu) void dump_vmcs(struct kvm_vcpu *vcpu); +static inline int vmx_get_instr_info_reg2(u32 vmx_instr_info) +{ + return (vmx_instr_info >> 28) & 0xf; +} + #endif /* __KVM_X86_VMX_H */ diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index 9e9ef47e988c..5e7f41225780 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -71,6 +71,31 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field) { unsigned long value; +#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT + + asm_volatile_goto("1: vmread %[field], %[output]\n\t" + "jna %l[do_fail]\n\t" + + _ASM_EXTABLE(1b, %l[do_exception]) + + : [output] "=r" (value) + : [field] "r" (field) + : "cc" + : do_fail, do_exception); + + return value; + +do_fail: + WARN_ONCE(1, "kvm: vmread failed: field=%lx\n", field); + pr_warn_ratelimited("kvm: vmread failed: field=%lx\n", field); + return 0; + +do_exception: + kvm_spurious_fault(); + return 0; + +#else /* !CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ + asm volatile("1: vmread %2, %1\n\t" ".byte 0x3e\n\t" /* branch taken hint */ "ja 3f\n\t" @@ -80,9 +105,11 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field) * @field, and bounce through the trampoline to preserve * volatile registers. */ - "push $0\n\t" + "xorl %k1, %k1\n\t" + "2:\n\t" + "push %1\n\t" "push %2\n\t" - "2:call vmread_error_trampoline\n\t" + "call vmread_error_trampoline\n\t" /* * Unwind the stack. Note, the trampoline zeros out the @@ -93,14 +120,12 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field) "3:\n\t" /* VMREAD faulted. As above, except push '1' for @fault. */ - ".pushsection .fixup, \"ax\"\n\t" - "4: push $1\n\t" - "push %2\n\t" - "jmp 2b\n\t" - ".popsection\n\t" - _ASM_EXTABLE(1b, 4b) - : ASM_CALL_CONSTRAINT, "=r"(value) : "r"(field) : "cc"); + _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %1) + + : ASM_CALL_CONSTRAINT, "=&r"(value) : "r"(field) : "cc"); return value; + +#endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ } static __always_inline u16 vmcs_read16(unsigned long field) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index aabd3a2ec1bc..76b4803dd3bd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -68,7 +68,9 @@ #include <asm/mce.h> #include <asm/pkru.h> #include <linux/kernel_stat.h> -#include <asm/fpu/internal.h> /* Ugh! */ +#include <asm/fpu/api.h> +#include <asm/fpu/xcr.h> +#include <asm/fpu/xstate.h> #include <asm/pvclock.h> #include <asm/div64.h> #include <asm/irq_remapping.h> @@ -116,6 +118,7 @@ static void enter_smm(struct kvm_vcpu *vcpu); static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); static void store_regs(struct kvm_vcpu *vcpu); static int sync_regs(struct kvm_vcpu *vcpu); +static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu); static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); @@ -208,7 +211,7 @@ static struct kvm_user_return_msrs __percpu *user_return_msrs; #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ - | XFEATURE_MASK_PKRU) + | XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE) u64 __read_mostly host_efer; EXPORT_SYMBOL_GPL(host_efer); @@ -293,8 +296,6 @@ u64 __read_mostly host_xcr0; u64 __read_mostly supported_xcr0; EXPORT_SYMBOL_GPL(supported_xcr0); -static struct kmem_cache *x86_fpu_cache; - static struct kmem_cache *x86_emulator_cache; /* @@ -710,6 +711,17 @@ int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) } EXPORT_SYMBOL_GPL(kvm_complete_insn_gp); +static int complete_emulated_insn_gp(struct kvm_vcpu *vcpu, int err) +{ + if (err) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE | EMULTYPE_SKIP | + EMULTYPE_COMPLETE_USER_EXIT); +} + void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { ++vcpu->stat.pf_guest; @@ -790,30 +802,6 @@ bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr) } EXPORT_SYMBOL_GPL(kvm_require_dr); -/* - * This function will be used to read from the physical memory of the currently - * running guest. The difference to kvm_vcpu_read_guest_page is that this function - * can read from guest physical or from the guest's guest physical memory. - */ -int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gfn_t ngfn, void *data, int offset, int len, - u32 access) -{ - struct x86_exception exception; - gfn_t real_gfn; - gpa_t ngpa; - - ngpa = gfn_to_gpa(ngfn); - real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception); - if (real_gfn == UNMAPPED_GVA) - return -EFAULT; - - real_gfn = gpa_to_gfn(real_gfn); - - return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len); -} -EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); - static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu) { return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2); @@ -822,37 +810,50 @@ static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu) /* * Load the pae pdptrs. Return 1 if they are all valid, 0 otherwise. */ -int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) +int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) { + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; - unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; + gpa_t real_gpa; int i; int ret; u64 pdpte[ARRAY_SIZE(mmu->pdptrs)]; - ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte, - offset * sizeof(u64), sizeof(pdpte), - PFERR_USER_MASK|PFERR_WRITE_MASK); - if (ret < 0) { - ret = 0; - goto out; - } + /* + * If the MMU is nested, CR3 holds an L2 GPA and needs to be translated + * to an L1 GPA. + */ + real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(pdpt_gfn), + PFERR_USER_MASK | PFERR_WRITE_MASK, NULL); + if (real_gpa == UNMAPPED_GVA) + return 0; + + /* Note the offset, PDPTRs are 32 byte aligned when using PAE paging. */ + ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(real_gpa), pdpte, + cr3 & GENMASK(11, 5), sizeof(pdpte)); + if (ret < 0) + return 0; + for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { if ((pdpte[i] & PT_PRESENT_MASK) && (pdpte[i] & pdptr_rsvd_bits(vcpu))) { - ret = 0; - goto out; + return 0; } } - ret = 1; + + /* + * Marking VCPU_EXREG_PDPTR dirty doesn't work for !tdp_enabled. + * Shadow page roots need to be reconstructed instead. + */ + if (!tdp_enabled && memcmp(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs))) + kvm_mmu_free_roots(vcpu, mmu, KVM_MMU_ROOT_CURRENT); memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); + kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu); vcpu->arch.pdptrs_from_userspace = false; -out: - - return ret; + return 1; } EXPORT_SYMBOL_GPL(load_pdptrs); @@ -876,7 +877,6 @@ EXPORT_SYMBOL_GPL(kvm_post_set_cr0); int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { unsigned long old_cr0 = kvm_read_cr0(vcpu); - unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG; cr0 |= X86_CR0_ET; @@ -906,11 +906,12 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } #endif if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) && - is_pae(vcpu) && ((cr0 ^ old_cr0) & pdptr_bits) && - !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu))) + is_pae(vcpu) && ((cr0 ^ old_cr0) & X86_CR0_PDPTR_BITS) && + !load_pdptrs(vcpu, kvm_read_cr3(vcpu))) return 1; - if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) + if (!(cr0 & X86_CR0_PG) && + (is_64_bit_mode(vcpu) || kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))) return 1; static_call(kvm_x86_set_cr0)(vcpu, cr0); @@ -993,7 +994,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) /* * Do not allow the guest to set bits that we do not support * saving. However, xcr0 bit 0 is always set, even if the - * emulated CPU does not support XSAVE (see fx_init). + * emulated CPU does not support XSAVE (see kvm_vcpu_reset()). */ valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP; if (xcr0 & ~valid_bits) @@ -1009,6 +1010,11 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512) return 1; } + + if ((xcr0 & XFEATURE_MASK_XTILE) && + ((xcr0 & XFEATURE_MASK_XTILE) != XFEATURE_MASK_XTILE)) + return 1; + vcpu->arch.xcr0 = xcr0; if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND) @@ -1042,17 +1048,34 @@ EXPORT_SYMBOL_GPL(kvm_is_valid_cr4); void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4) { - if (((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS) || - (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) + /* + * If any role bit is changed, the MMU needs to be reset. + * + * If CR4.PCIDE is changed 1 -> 0, the guest TLB must be flushed. + * If CR4.PCIDE is changed 0 -> 1, there is no need to flush the TLB + * according to the SDM; however, stale prev_roots could be reused + * incorrectly in the future after a MOV to CR3 with NOFLUSH=1, so we + * free them all. KVM_REQ_MMU_RELOAD is fit for the both cases; it + * is slow, but changing CR4.PCIDE is a rare case. + * + * If CR4.PGE is changed, the guest TLB must be flushed. + * + * Note: resetting MMU is a superset of KVM_REQ_MMU_RELOAD and + * KVM_REQ_MMU_RELOAD is a superset of KVM_REQ_TLB_FLUSH_GUEST, hence + * the usage of "else if". + */ + if ((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS) kvm_mmu_reset_context(vcpu); + else if ((cr4 ^ old_cr4) & X86_CR4_PCIDE) + kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); + else if ((cr4 ^ old_cr4) & X86_CR4_PGE) + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); } EXPORT_SYMBOL_GPL(kvm_post_set_cr4); int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); - unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | - X86_CR4_SMEP; if (!kvm_is_valid_cr4(vcpu, cr4)) return 1; @@ -1063,9 +1086,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if ((cr4 ^ old_cr4) & X86_CR4_LA57) return 1; } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) - && ((cr4 ^ old_cr4) & pdptr_bits) - && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, - kvm_read_cr3(vcpu))) + && ((cr4 ^ old_cr4) & X86_CR4_PDPTR_BITS) + && !load_pdptrs(vcpu, kvm_read_cr3(vcpu))) return 1; if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) { @@ -1092,6 +1114,18 @@ static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid) int i; /* + * MOV CR3 and INVPCID are usually not intercepted when using TDP, but + * this is reachable when running EPT=1 and unrestricted_guest=0, and + * also via the emulator. KVM's TDP page tables are not in the scope of + * the invalidation, but the guest's TLB entries need to be flushed as + * the CPU may have cached entries in its TLB for the target PCID. + */ + if (unlikely(tdp_enabled)) { + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); + return; + } + + /* * If neither the current CR3 nor any of the prev_roots use the given * PCID, then nothing needs to be done here because a resync will * happen anyway before switching to any other CR3. @@ -1101,6 +1135,14 @@ static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid) kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } + /* + * If PCID is disabled, there is no need to free prev_roots even if the + * PCIDs for them are also 0, because MOV to CR3 always flushes the TLB + * with PCIDE=0. + */ + if (!kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) + return; + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid) roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); @@ -1134,14 +1176,15 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (kvm_vcpu_is_illegal_gpa(vcpu, cr3)) return 1; - if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) + if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, cr3)) return 1; if (cr3 != kvm_read_cr3(vcpu)) kvm_mmu_new_pgd(vcpu, cr3); vcpu->arch.cr3 = cr3; - kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); + kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); + /* Do not call post_set_cr3, we do not get here for confidential guests. */ handle_tlb_flush: /* @@ -1311,7 +1354,7 @@ static const u32 msrs_to_save_all[] = { MSR_IA32_UMWAIT_CONTROL, MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, - MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3, + MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL, MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, @@ -1339,6 +1382,7 @@ static const u32 msrs_to_save_all[] = { MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, + MSR_IA32_XFD, MSR_IA32_XFD_ERR, }; static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)]; @@ -1381,6 +1425,7 @@ static const u32 emulated_msrs_all[] = { MSR_PLATFORM_INFO, MSR_MISC_FEATURES_ENABLES, MSR_AMD64_VIRT_SPEC_CTRL, + MSR_AMD64_TSC_RATIO, MSR_IA32_POWER_CTL, MSR_IA32_UCODE_REV, @@ -1794,22 +1839,36 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) } EXPORT_SYMBOL_GPL(kvm_set_msr); -static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu) +static void complete_userspace_rdmsr(struct kvm_vcpu *vcpu) { - int err = vcpu->run->msr.error; - if (!err) { + if (!vcpu->run->msr.error) { kvm_rax_write(vcpu, (u32)vcpu->run->msr.data); kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32); } +} + +static int complete_emulated_msr_access(struct kvm_vcpu *vcpu) +{ + return complete_emulated_insn_gp(vcpu, vcpu->run->msr.error); +} - return static_call(kvm_x86_complete_emulated_msr)(vcpu, err); +static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu) +{ + complete_userspace_rdmsr(vcpu); + return complete_emulated_msr_access(vcpu); } -static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu) +static int complete_fast_msr_access(struct kvm_vcpu *vcpu) { return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error); } +static int complete_fast_rdmsr(struct kvm_vcpu *vcpu) +{ + complete_userspace_rdmsr(vcpu); + return complete_fast_msr_access(vcpu); +} + static u64 kvm_msr_reason(int r) { switch (r) { @@ -1844,18 +1903,6 @@ static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index, return 1; } -static int kvm_get_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int r) -{ - return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0, - complete_emulated_rdmsr, r); -} - -static int kvm_set_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u64 data, int r) -{ - return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data, - complete_emulated_wrmsr, r); -} - int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) { u32 ecx = kvm_rcx_read(vcpu); @@ -1864,18 +1911,16 @@ int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) r = kvm_get_msr(vcpu, ecx, &data); - /* MSR read failed? See if we should ask user space */ - if (r && kvm_get_msr_user_space(vcpu, ecx, r)) { - /* Bounce to user space */ - return 0; - } - if (!r) { trace_kvm_msr_read(ecx, data); kvm_rax_write(vcpu, data & -1u); kvm_rdx_write(vcpu, (data >> 32) & -1u); } else { + /* MSR read failed? See if we should ask user space */ + if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_RDMSR, 0, + complete_fast_rdmsr, r)) + return 0; trace_kvm_msr_read_ex(ecx); } @@ -1891,19 +1936,18 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) r = kvm_set_msr(vcpu, ecx, data); - /* MSR write failed? See if we should ask user space */ - if (r && kvm_set_msr_user_space(vcpu, ecx, data, r)) - /* Bounce to user space */ - return 0; - - /* Signal all other negative errors to userspace */ - if (r < 0) - return r; - - if (!r) + if (!r) { trace_kvm_msr_write(ecx, data); - else + } else { + /* MSR write failed? See if we should ask user space */ + if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_WRMSR, data, + complete_fast_msr_access, r)) + return 0; + /* Signal all other negative errors to userspace */ + if (r < 0) + return r; trace_kvm_msr_write_ex(ecx, data); + } return static_call(kvm_x86_complete_emulated_msr)(vcpu, r); } @@ -2098,7 +2142,7 @@ static s64 get_kvmclock_base_ns(void) } #endif -void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs) +static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs) { int version; int r; @@ -2454,13 +2498,64 @@ static inline bool kvm_check_tsc_unstable(void) return check_tsc_unstable(); } +/* + * Infers attempts to synchronize the guest's tsc from host writes. Sets the + * offset for the vcpu and tracks the TSC matching generation that the vcpu + * participates in. + */ +static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc, + u64 ns, bool matched) +{ + struct kvm *kvm = vcpu->kvm; + + lockdep_assert_held(&kvm->arch.tsc_write_lock); + + /* + * We also track th most recent recorded KHZ, write and time to + * allow the matching interval to be extended at each write. + */ + kvm->arch.last_tsc_nsec = ns; + kvm->arch.last_tsc_write = tsc; + kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; + kvm->arch.last_tsc_offset = offset; + + vcpu->arch.last_guest_tsc = tsc; + + kvm_vcpu_write_tsc_offset(vcpu, offset); + + if (!matched) { + /* + * We split periods of matched TSC writes into generations. + * For each generation, we track the original measured + * nanosecond time, offset, and write, so if TSCs are in + * sync, we can match exact offset, and if not, we can match + * exact software computation in compute_guest_tsc() + * + * These values are tracked in kvm->arch.cur_xxx variables. + */ + kvm->arch.cur_tsc_generation++; + kvm->arch.cur_tsc_nsec = ns; + kvm->arch.cur_tsc_write = tsc; + kvm->arch.cur_tsc_offset = offset; + kvm->arch.nr_vcpus_matched_tsc = 0; + } else if (vcpu->arch.this_tsc_generation != kvm->arch.cur_tsc_generation) { + kvm->arch.nr_vcpus_matched_tsc++; + } + + /* Keep track of which generation this VCPU has synchronized to */ + vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation; + vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; + vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; + + kvm_track_tsc_matching(vcpu); +} + static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) { struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; - bool matched; - bool already_matched; + bool matched = false; bool synchronizing = false; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); @@ -2506,51 +2601,10 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) offset = kvm_compute_l1_tsc_offset(vcpu, data); } matched = true; - already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation); - } else { - /* - * We split periods of matched TSC writes into generations. - * For each generation, we track the original measured - * nanosecond time, offset, and write, so if TSCs are in - * sync, we can match exact offset, and if not, we can match - * exact software computation in compute_guest_tsc() - * - * These values are tracked in kvm->arch.cur_xxx variables. - */ - kvm->arch.cur_tsc_generation++; - kvm->arch.cur_tsc_nsec = ns; - kvm->arch.cur_tsc_write = data; - kvm->arch.cur_tsc_offset = offset; - matched = false; } - /* - * We also track th most recent recorded KHZ, write and time to - * allow the matching interval to be extended at each write. - */ - kvm->arch.last_tsc_nsec = ns; - kvm->arch.last_tsc_write = data; - kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; - - vcpu->arch.last_guest_tsc = data; - - /* Keep track of which generation this VCPU has synchronized to */ - vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation; - vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; - vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; - - kvm_vcpu_write_tsc_offset(vcpu, offset); + __kvm_synchronize_tsc(vcpu, offset, data, ns, matched); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); - - spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags); - if (!matched) { - kvm->arch.nr_vcpus_matched_tsc = 0; - } else if (!already_matched) { - kvm->arch.nr_vcpus_matched_tsc++; - } - - kvm_track_tsc_matching(vcpu); - spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags); } static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, @@ -2738,6 +2792,7 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) int vclock_mode; bool host_tsc_clocksource, vcpus_matched; + lockdep_assert_held(&kvm->arch.tsc_write_lock); vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == atomic_read(&kvm->online_vcpus)); @@ -2762,68 +2817,101 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) #endif } -void kvm_make_mclock_inprogress_request(struct kvm *kvm) +static void kvm_make_mclock_inprogress_request(struct kvm *kvm) { kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); } -static void kvm_gen_update_masterclock(struct kvm *kvm) +static void __kvm_start_pvclock_update(struct kvm *kvm) { -#ifdef CONFIG_X86_64 - int i; - struct kvm_vcpu *vcpu; - struct kvm_arch *ka = &kvm->arch; - unsigned long flags; - - kvm_hv_invalidate_tsc_page(kvm); + raw_spin_lock_irq(&kvm->arch.tsc_write_lock); + write_seqcount_begin(&kvm->arch.pvclock_sc); +} +static void kvm_start_pvclock_update(struct kvm *kvm) +{ kvm_make_mclock_inprogress_request(kvm); /* no guest entries from this point */ - spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); - pvclock_update_vm_gtod_copy(kvm); - spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); + __kvm_start_pvclock_update(kvm); +} +static void kvm_end_pvclock_update(struct kvm *kvm) +{ + struct kvm_arch *ka = &kvm->arch; + struct kvm_vcpu *vcpu; + unsigned long i; + + write_seqcount_end(&ka->pvclock_sc); + raw_spin_unlock_irq(&ka->tsc_write_lock); kvm_for_each_vcpu(i, vcpu, kvm) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); /* guest entries allowed */ kvm_for_each_vcpu(i, vcpu, kvm) kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); -#endif } -u64 get_kvmclock_ns(struct kvm *kvm) +static void kvm_update_masterclock(struct kvm *kvm) +{ + kvm_hv_invalidate_tsc_page(kvm); + kvm_start_pvclock_update(kvm); + pvclock_update_vm_gtod_copy(kvm); + kvm_end_pvclock_update(kvm); +} + +/* Called within read_seqcount_begin/retry for kvm->pvclock_sc. */ +static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data) { struct kvm_arch *ka = &kvm->arch; struct pvclock_vcpu_time_info hv_clock; - unsigned long flags; - u64 ret; - - spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); - if (!ka->use_master_clock) { - spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); - return get_kvmclock_base_ns() + ka->kvmclock_offset; - } - - hv_clock.tsc_timestamp = ka->master_cycle_now; - hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; - spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); /* both __this_cpu_read() and rdtsc() should be on the same cpu */ get_cpu(); - if (__this_cpu_read(cpu_tsc_khz)) { + data->flags = 0; + if (ka->use_master_clock && __this_cpu_read(cpu_tsc_khz)) { +#ifdef CONFIG_X86_64 + struct timespec64 ts; + + if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) { + data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec; + data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC; + } else +#endif + data->host_tsc = rdtsc(); + + data->flags |= KVM_CLOCK_TSC_STABLE; + hv_clock.tsc_timestamp = ka->master_cycle_now; + hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL, &hv_clock.tsc_shift, &hv_clock.tsc_to_system_mul); - ret = __pvclock_read_cycles(&hv_clock, rdtsc()); - } else - ret = get_kvmclock_base_ns() + ka->kvmclock_offset; + data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc); + } else { + data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset; + } put_cpu(); +} - return ret; +static void get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data) +{ + struct kvm_arch *ka = &kvm->arch; + unsigned seq; + + do { + seq = read_seqcount_begin(&ka->pvclock_sc); + __get_kvmclock(kvm, data); + } while (read_seqcount_retry(&ka->pvclock_sc, seq)); +} + +u64 get_kvmclock_ns(struct kvm *kvm) +{ + struct kvm_clock_data data; + + get_kvmclock(kvm, &data); + return data.clock; } static void kvm_setup_pvclock_page(struct kvm_vcpu *v, @@ -2888,6 +2976,7 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v, static int kvm_guest_time_update(struct kvm_vcpu *v) { unsigned long flags, tgt_tsc_khz; + unsigned seq; struct kvm_vcpu_arch *vcpu = &v->arch; struct kvm_arch *ka = &v->kvm->arch; s64 kernel_ns; @@ -2902,13 +2991,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) * If the host uses TSC clock, then passthrough TSC as stable * to the guest. */ - spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); - use_master_clock = ka->use_master_clock; - if (use_master_clock) { - host_tsc = ka->master_cycle_now; - kernel_ns = ka->master_kernel_ns; - } - spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); + do { + seq = read_seqcount_begin(&ka->pvclock_sc); + use_master_clock = ka->use_master_clock; + if (use_master_clock) { + host_tsc = ka->master_cycle_now; + kernel_ns = ka->master_kernel_ns; + } + } while (read_seqcount_retry(&ka->pvclock_sc, seq)); /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); @@ -2999,7 +3089,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) static void kvmclock_update_fn(struct work_struct *work) { - int i; + unsigned long i; struct delayed_work *dwork = to_delayed_work(work); struct kvm_arch *ka = container_of(dwork, struct kvm_arch, kvmclock_update_work); @@ -3179,24 +3269,49 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) ++vcpu->stat.tlb_flush; if (!tdp_enabled) { - /* + /* * A TLB flush on behalf of the guest is equivalent to * INVPCID(all), toggling CR4.PGE, etc., which requires - * a forced sync of the shadow page tables. Unload the - * entire MMU here and the subsequent load will sync the - * shadow page tables, and also flush the TLB. + * a forced sync of the shadow page tables. Ensure all the + * roots are synced and the guest TLB in hardware is clean. */ - kvm_mmu_unload(vcpu); - return; + kvm_mmu_sync_roots(vcpu); + kvm_mmu_sync_prev_roots(vcpu); } static_call(kvm_x86_tlb_flush_guest)(vcpu); } + +static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.tlb_flush; + static_call(kvm_x86_tlb_flush_current)(vcpu); +} + +/* + * Service "local" TLB flush requests, which are specific to the current MMU + * context. In addition to the generic event handling in vcpu_enter_guest(), + * TLB flushes that are targeted at an MMU context also need to be serviced + * prior before nested VM-Enter/VM-Exit. + */ +void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu) +{ + if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) + kvm_vcpu_flush_tlb_current(vcpu); + + if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu)) + kvm_vcpu_flush_tlb_guest(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_service_local_tlb_flush_requests); + static void record_steal_time(struct kvm_vcpu *vcpu) { - struct kvm_host_map map; - struct kvm_steal_time *st; + struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache; + struct kvm_steal_time __user *st; + struct kvm_memslots *slots; + u64 steal; + u32 version; if (kvm_xen_msr_enabled(vcpu->kvm)) { kvm_xen_runstate_set_running(vcpu); @@ -3206,47 +3321,86 @@ static void record_steal_time(struct kvm_vcpu *vcpu) if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; - /* -EAGAIN is returned in atomic context so we can just return. */ - if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, - &map, &vcpu->arch.st.cache, false)) + if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm)) return; - st = map.hva + - offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); + slots = kvm_memslots(vcpu->kvm); + + if (unlikely(slots->generation != ghc->generation || + kvm_is_error_hva(ghc->hva) || !ghc->memslot)) { + gfn_t gfn = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS; + + /* We rely on the fact that it fits in a single page. */ + BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS); + + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gfn, sizeof(*st)) || + kvm_is_error_hva(ghc->hva) || !ghc->memslot) + return; + } + st = (struct kvm_steal_time __user *)ghc->hva; /* * Doing a TLB flush here, on the guest's behalf, can avoid * expensive IPIs. */ if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) { - u8 st_preempted = xchg(&st->preempted, 0); + u8 st_preempted = 0; + int err = -EFAULT; + + if (!user_access_begin(st, sizeof(*st))) + return; + + asm volatile("1: xchgb %0, %2\n" + "xor %1, %1\n" + "2:\n" + _ASM_EXTABLE_UA(1b, 2b) + : "+q" (st_preempted), + "+&r" (err), + "+m" (st->preempted)); + if (err) + goto out; + + user_access_end(); + + vcpu->arch.st.preempted = 0; trace_kvm_pv_tlb_flush(vcpu->vcpu_id, st_preempted & KVM_VCPU_FLUSH_TLB); if (st_preempted & KVM_VCPU_FLUSH_TLB) kvm_vcpu_flush_tlb_guest(vcpu); + + if (!user_access_begin(st, sizeof(*st))) + goto dirty; } else { - st->preempted = 0; - } + if (!user_access_begin(st, sizeof(*st))) + return; - vcpu->arch.st.preempted = 0; + unsafe_put_user(0, &st->preempted, out); + vcpu->arch.st.preempted = 0; + } - if (st->version & 1) - st->version += 1; /* first time write, random junk */ + unsafe_get_user(version, &st->version, out); + if (version & 1) + version += 1; /* first time write, random junk */ - st->version += 1; + version += 1; + unsafe_put_user(version, &st->version, out); smp_wmb(); - st->steal += current->sched_info.run_delay - + unsafe_get_user(steal, &st->steal, out); + steal += current->sched_info.run_delay - vcpu->arch.st.last_steal; vcpu->arch.st.last_steal = current->sched_info.run_delay; + unsafe_put_user(steal, &st->steal, out); - smp_wmb(); - - st->version += 1; + version += 1; + unsafe_put_user(version, &st->version, out); - kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false); + out: + user_access_end(); + dirty: + mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); } int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) @@ -3282,7 +3436,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated) return 1; - if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM) && kvm_get_msr_feature(&msr_ent)) + if (kvm_get_msr_feature(&msr_ent)) return 1; if (data & ~msr_ent.data) return 1; @@ -3452,7 +3606,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) return 1; - if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8))) + if (kvm_lapic_set_pv_eoi(vcpu, data, sizeof(u8))) return 1; break; @@ -3538,6 +3692,30 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.msr_misc_features_enables = data; break; +#ifdef CONFIG_X86_64 + case MSR_IA32_XFD: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) + return 1; + + if (data & ~(XFEATURE_MASK_USER_DYNAMIC & + vcpu->arch.guest_supported_xcr0)) + return 1; + + fpu_update_guest_xfd(&vcpu->arch.guest_fpu, data); + break; + case MSR_IA32_XFD_ERR: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) + return 1; + + if (data & ~(XFEATURE_MASK_USER_DYNAMIC & + vcpu->arch.guest_supported_xcr0)) + return 1; + + vcpu->arch.guest_fpu.xfd_err = data; + break; +#endif default: if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); @@ -3858,6 +4036,22 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_K7_HWCR: msr_info->data = vcpu->arch.msr_hwcr; break; +#ifdef CONFIG_X86_64 + case MSR_IA32_XFD: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) + return 1; + + msr_info->data = vcpu->arch.guest_fpu.fpstate->xfd; + break; + case MSR_IA32_XFD_ERR: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) + return 1; + + msr_info->data = vcpu->arch.guest_fpu.xfd_err; + break; +#endif default: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); @@ -4026,8 +4220,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SGX_ATTRIBUTE: #endif case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM: + case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM: case KVM_CAP_SREGS2: case KVM_CAP_EXIT_ON_EMULATION_FAILURE: + case KVM_CAP_VCPU_ATTRIBUTES: r = 1; break; case KVM_CAP_EXIT_HYPERCALL: @@ -4039,7 +4235,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_XEN_HVM: r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR | KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | - KVM_XEN_HVM_CONFIG_SHARED_INFO; + KVM_XEN_HVM_CONFIG_SHARED_INFO | + KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL; if (sched_info_on()) r |= KVM_XEN_HVM_CONFIG_RUNSTATE; break; @@ -4048,7 +4245,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_SYNC_X86_VALID_FIELDS; break; case KVM_CAP_ADJUST_CLOCK: - r = KVM_CLOCK_TSC_STABLE; + r = KVM_CLOCK_VALID_FLAGS; break; case KVM_CAP_X86_DISABLE_EXITS: r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE | @@ -4071,13 +4268,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = !static_call(kvm_x86_cpu_has_accelerated_tpr)(); break; case KVM_CAP_NR_VCPUS: - r = KVM_SOFT_MAX_VCPUS; + r = min_t(unsigned int, num_online_cpus(), KVM_MAX_VCPUS); break; case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; break; case KVM_CAP_MAX_VCPU_ID: - r = KVM_MAX_VCPU_ID; + r = KVM_MAX_VCPU_IDS; break; case KVM_CAP_PV_MMU: /* obsolete */ r = 0; @@ -4117,6 +4314,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) else r = 0; break; + case KVM_CAP_XSAVE2: { + u64 guest_perm = xstate_get_guest_group_perm(); + + r = xstate_required_size(supported_xcr0 & guest_perm, false); + if (r < sizeof(struct kvm_xsave)) + r = sizeof(struct kvm_xsave); + break; + } default: break; } @@ -4285,8 +4490,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) { - struct kvm_host_map map; - struct kvm_steal_time *st; + struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache; + struct kvm_steal_time __user *st; + struct kvm_memslots *slots; + static const u8 preempted = KVM_VCPU_PREEMPTED; if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; @@ -4294,16 +4501,23 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) if (vcpu->arch.st.preempted) return; - if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map, - &vcpu->arch.st.cache, true)) + /* This happens on process exit */ + if (unlikely(current->mm != vcpu->kvm->mm)) + return; + + slots = kvm_memslots(vcpu->kvm); + + if (unlikely(slots->generation != ghc->generation || + kvm_is_error_hva(ghc->hva) || !ghc->memslot)) return; - st = map.hva + - offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); + st = (struct kvm_steal_time __user *)ghc->hva; + BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted)); - st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED; + if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted))) + vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED; - kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true); + mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) @@ -4331,8 +4545,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - if (vcpu->arch.apicv_active) - static_call(kvm_x86_sync_pir_to_irr)(vcpu); + static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); return kvm_apic_get_state(vcpu, s); } @@ -4700,144 +4913,37 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, return 0; } -#define XSTATE_COMPACTION_ENABLED (1ULL << 63) - -static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) -{ - struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave; - u64 xstate_bv = xsave->header.xfeatures; - u64 valid; - - /* - * Copy legacy XSAVE area, to avoid complications with CPUID - * leaves 0 and 1 in the loop below. - */ - memcpy(dest, xsave, XSAVE_HDR_OFFSET); - - /* Set XSTATE_BV */ - xstate_bv &= vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE; - *(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv; - - /* - * Copy each region from the possibly compacted offset to the - * non-compacted offset. - */ - valid = xstate_bv & ~XFEATURE_MASK_FPSSE; - while (valid) { - u32 size, offset, ecx, edx; - u64 xfeature_mask = valid & -valid; - int xfeature_nr = fls64(xfeature_mask) - 1; - void *src; - - cpuid_count(XSTATE_CPUID, xfeature_nr, - &size, &offset, &ecx, &edx); - - if (xfeature_nr == XFEATURE_PKRU) { - memcpy(dest + offset, &vcpu->arch.pkru, - sizeof(vcpu->arch.pkru)); - } else { - src = get_xsave_addr(xsave, xfeature_nr); - if (src) - memcpy(dest + offset, src, size); - } - - valid -= xfeature_mask; - } -} - -static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) +static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, + struct kvm_xsave *guest_xsave) { - struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave; - u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET); - u64 valid; - - /* - * Copy legacy XSAVE area, to avoid complications with CPUID - * leaves 0 and 1 in the loop below. - */ - memcpy(xsave, src, XSAVE_HDR_OFFSET); - - /* Set XSTATE_BV and possibly XCOMP_BV. */ - xsave->header.xfeatures = xstate_bv; - if (boot_cpu_has(X86_FEATURE_XSAVES)) - xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; - - /* - * Copy each region from the non-compacted offset to the - * possibly compacted offset. - */ - valid = xstate_bv & ~XFEATURE_MASK_FPSSE; - while (valid) { - u32 size, offset, ecx, edx; - u64 xfeature_mask = valid & -valid; - int xfeature_nr = fls64(xfeature_mask) - 1; - - cpuid_count(XSTATE_CPUID, xfeature_nr, - &size, &offset, &ecx, &edx); - - if (xfeature_nr == XFEATURE_PKRU) { - memcpy(&vcpu->arch.pkru, src + offset, - sizeof(vcpu->arch.pkru)); - } else { - void *dest = get_xsave_addr(xsave, xfeature_nr); - - if (dest) - memcpy(dest, src + offset, size); - } + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) + return; - valid -= xfeature_mask; - } + fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, + guest_xsave->region, + sizeof(guest_xsave->region), + vcpu->arch.pkru); } -static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, - struct kvm_xsave *guest_xsave) +static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu, + u8 *state, unsigned int size) { - if (!vcpu->arch.guest_fpu) + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) return; - if (boot_cpu_has(X86_FEATURE_XSAVE)) { - memset(guest_xsave, 0, sizeof(struct kvm_xsave)); - fill_xsave((u8 *) guest_xsave->region, vcpu); - } else { - memcpy(guest_xsave->region, - &vcpu->arch.guest_fpu->state.fxsave, - sizeof(struct fxregs_state)); - *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = - XFEATURE_MASK_FPSSE; - } + fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, + state, size, vcpu->arch.pkru); } -#define XSAVE_MXCSR_OFFSET 24 - static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { - u64 xstate_bv; - u32 mxcsr; - - if (!vcpu->arch.guest_fpu) + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) return 0; - xstate_bv = *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; - mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)]; - - if (boot_cpu_has(X86_FEATURE_XSAVE)) { - /* - * Here we allow setting states that are not present in - * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility - * with old userspace. - */ - if (xstate_bv & ~supported_xcr0 || mxcsr & ~mxcsr_feature_mask) - return -EINVAL; - load_xsave(vcpu, (u8 *)guest_xsave->region); - } else { - if (xstate_bv & ~XFEATURE_MASK_FPSSE || - mxcsr & ~mxcsr_feature_mask) - return -EINVAL; - memcpy(&vcpu->arch.guest_fpu->state.fxsave, - guest_xsave->region, sizeof(struct fxregs_state)); - } - return 0; + return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu, + guest_xsave->region, + supported_xcr0, &vcpu->arch.pkru); } static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, @@ -4892,6 +4998,115 @@ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) return 0; } +static int kvm_arch_tsc_has_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + int r; + + switch (attr->attr) { + case KVM_VCPU_TSC_OFFSET: + r = 0; + break; + default: + r = -ENXIO; + } + + return r; +} + +static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + u64 __user *uaddr = (u64 __user *)(unsigned long)attr->addr; + int r; + + if ((u64)(unsigned long)uaddr != attr->addr) + return -EFAULT; + + switch (attr->attr) { + case KVM_VCPU_TSC_OFFSET: + r = -EFAULT; + if (put_user(vcpu->arch.l1_tsc_offset, uaddr)) + break; + r = 0; + break; + default: + r = -ENXIO; + } + + return r; +} + +static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + u64 __user *uaddr = (u64 __user *)(unsigned long)attr->addr; + struct kvm *kvm = vcpu->kvm; + int r; + + if ((u64)(unsigned long)uaddr != attr->addr) + return -EFAULT; + + switch (attr->attr) { + case KVM_VCPU_TSC_OFFSET: { + u64 offset, tsc, ns; + unsigned long flags; + bool matched; + + r = -EFAULT; + if (get_user(offset, uaddr)) + break; + + raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); + + matched = (vcpu->arch.virtual_tsc_khz && + kvm->arch.last_tsc_khz == vcpu->arch.virtual_tsc_khz && + kvm->arch.last_tsc_offset == offset); + + tsc = kvm_scale_tsc(vcpu, rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset; + ns = get_kvmclock_base_ns(); + + __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched); + raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); + + r = 0; + break; + } + default: + r = -ENXIO; + } + + return r; +} + +static int kvm_vcpu_ioctl_device_attr(struct kvm_vcpu *vcpu, + unsigned int ioctl, + void __user *argp) +{ + struct kvm_device_attr attr; + int r; + + if (copy_from_user(&attr, argp, sizeof(attr))) + return -EFAULT; + + if (attr.group != KVM_VCPU_TSC_CTRL) + return -ENXIO; + + switch (ioctl) { + case KVM_HAS_DEVICE_ATTR: + r = kvm_arch_tsc_has_attr(vcpu, &attr); + break; + case KVM_GET_DEVICE_ATTR: + r = kvm_arch_tsc_get_attr(vcpu, &attr); + break; + case KVM_SET_DEVICE_ATTR: + r = kvm_arch_tsc_set_attr(vcpu, &attr); + break; + } + + return r; +} + static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, struct kvm_enable_cap *cap) { @@ -5015,6 +5230,17 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_cpuid __user *cpuid_arg = argp; struct kvm_cpuid cpuid; + /* + * KVM does not correctly handle changing guest CPUID after KVM_RUN, as + * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't + * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page + * faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with + * the core vCPU model on the fly, so fail. + */ + r = -EINVAL; + if (vcpu->arch.last_vmentry_cpu != -1) + goto out; + r = -EFAULT; if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) goto out; @@ -5025,6 +5251,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_cpuid2 __user *cpuid_arg = argp; struct kvm_cpuid2 cpuid; + /* + * KVM_SET_CPUID{,2} after KVM_RUN is forbidded, see the comment in + * KVM_SET_CPUID case above. + */ + r = -EINVAL; + if (vcpu->arch.last_vmentry_cpu != -1) + goto out; + r = -EFAULT; if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) goto out; @@ -5154,6 +5388,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_GET_XSAVE: { + r = -EINVAL; + if (vcpu->arch.guest_fpu.uabi_size > sizeof(struct kvm_xsave)) + break; + u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT); r = -ENOMEM; if (!u.xsave) @@ -5168,7 +5406,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_SET_XSAVE: { - u.xsave = memdup_user(argp, sizeof(*u.xsave)); + int size = vcpu->arch.guest_fpu.uabi_size; + + u.xsave = memdup_user(argp, size); if (IS_ERR(u.xsave)) { r = PTR_ERR(u.xsave); goto out_nofree; @@ -5177,6 +5417,25 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); break; } + + case KVM_GET_XSAVE2: { + int size = vcpu->arch.guest_fpu.uabi_size; + + u.xsave = kzalloc(size, GFP_KERNEL_ACCOUNT); + r = -ENOMEM; + if (!u.xsave) + break; + + kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size); + + r = -EFAULT; + if (copy_to_user(argp, u.xsave, size)) + break; + + r = 0; + break; + } + case KVM_GET_XCRS: { u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT); r = -ENOMEM; @@ -5346,6 +5605,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = __set_sregs2(vcpu, u.sregs2); break; } + case KVM_HAS_DEVICE_ATTR: + case KVM_GET_DEVICE_ATTR: + case KVM_SET_DEVICE_ATTR: + r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp); + break; default: r = -EINVAL; } @@ -5536,7 +5800,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) * VM-Exit. */ struct kvm_vcpu *vcpu; - int i; + unsigned long i; kvm_for_each_vcpu(i, vcpu, kvm) kvm_vcpu_kick(vcpu); @@ -5584,6 +5848,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, smp_wmb(); kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT; kvm->arch.nr_reserved_ioapic_pins = cap->args[0]; + kvm_request_apicv_update(kvm, true, APICV_INHIBIT_REASON_ABSENT); r = 0; split_irqchip_unlock: mutex_unlock(&kvm->lock); @@ -5665,6 +5930,12 @@ split_irqchip_unlock: if (kvm_x86_ops.vm_copy_enc_context_from) r = kvm_x86_ops.vm_copy_enc_context_from(kvm, cap->args[0]); return r; + case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM: + r = -EINVAL; + if (kvm_x86_ops.vm_move_enc_context_from) + r = kvm_x86_ops.vm_move_enc_context_from( + kvm, cap->args[0]); + return r; case KVM_CAP_EXIT_HYPERCALL: if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) { r = -EINVAL; @@ -5798,7 +6069,8 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) static int kvm_arch_suspend_notifier(struct kvm *kvm) { struct kvm_vcpu *vcpu; - int i, ret = 0; + unsigned long i; + int ret = 0; mutex_lock(&kvm->lock); kvm_for_each_vcpu(i, vcpu, kvm) { @@ -5829,6 +6101,63 @@ int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state) } #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */ +static int kvm_vm_ioctl_get_clock(struct kvm *kvm, void __user *argp) +{ + struct kvm_clock_data data = { 0 }; + + get_kvmclock(kvm, &data); + if (copy_to_user(argp, &data, sizeof(data))) + return -EFAULT; + + return 0; +} + +static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp) +{ + struct kvm_arch *ka = &kvm->arch; + struct kvm_clock_data data; + u64 now_raw_ns; + + if (copy_from_user(&data, argp, sizeof(data))) + return -EFAULT; + + /* + * Only KVM_CLOCK_REALTIME is used, but allow passing the + * result of KVM_GET_CLOCK back to KVM_SET_CLOCK. + */ + if (data.flags & ~KVM_CLOCK_VALID_FLAGS) + return -EINVAL; + + kvm_hv_invalidate_tsc_page(kvm); + kvm_start_pvclock_update(kvm); + pvclock_update_vm_gtod_copy(kvm); + + /* + * This pairs with kvm_guest_time_update(): when masterclock is + * in use, we use master_kernel_ns + kvmclock_offset to set + * unsigned 'system_time' so if we use get_kvmclock_ns() (which + * is slightly ahead) here we risk going negative on unsigned + * 'system_time' when 'data.clock' is very small. + */ + if (data.flags & KVM_CLOCK_REALTIME) { + u64 now_real_ns = ktime_get_real_ns(); + + /* + * Avoid stepping the kvmclock backwards. + */ + if (now_real_ns > data.realtime) + data.clock += now_real_ns - data.realtime; + } + + if (ka->use_master_clock) + now_raw_ns = ka->master_kernel_ns; + else + now_raw_ns = get_kvmclock_base_ns(); + ka->kvmclock_offset = data.clock - now_raw_ns; + kvm_end_pvclock_update(kvm); + return 0; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -5901,6 +6230,7 @@ set_identity_unlock: /* Write kvm->irq_routing before enabling irqchip_in_kernel. */ smp_wmb(); kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL; + kvm_request_apicv_update(kvm, true, APICV_INHIBIT_REASON_ABSENT); create_irqchip_unlock: mutex_unlock(&kvm->lock); break; @@ -6072,60 +6402,12 @@ set_pit2_out: break; } #endif - case KVM_SET_CLOCK: { - struct kvm_arch *ka = &kvm->arch; - struct kvm_clock_data user_ns; - u64 now_ns; - - r = -EFAULT; - if (copy_from_user(&user_ns, argp, sizeof(user_ns))) - goto out; - - r = -EINVAL; - if (user_ns.flags) - goto out; - - r = 0; - /* - * TODO: userspace has to take care of races with VCPU_RUN, so - * kvm_gen_update_masterclock() can be cut down to locked - * pvclock_update_vm_gtod_copy(). - */ - kvm_gen_update_masterclock(kvm); - - /* - * This pairs with kvm_guest_time_update(): when masterclock is - * in use, we use master_kernel_ns + kvmclock_offset to set - * unsigned 'system_time' so if we use get_kvmclock_ns() (which - * is slightly ahead) here we risk going negative on unsigned - * 'system_time' when 'user_ns.clock' is very small. - */ - spin_lock_irq(&ka->pvclock_gtod_sync_lock); - if (kvm->arch.use_master_clock) - now_ns = ka->master_kernel_ns; - else - now_ns = get_kvmclock_base_ns(); - ka->kvmclock_offset = user_ns.clock - now_ns; - spin_unlock_irq(&ka->pvclock_gtod_sync_lock); - - kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE); + case KVM_SET_CLOCK: + r = kvm_vm_ioctl_set_clock(kvm, argp); break; - } - case KVM_GET_CLOCK: { - struct kvm_clock_data user_ns; - u64 now_ns; - - now_ns = get_kvmclock_ns(kvm); - user_ns.clock = now_ns; - user_ns.flags = kvm->arch.use_master_clock ? KVM_CLOCK_TSC_STABLE : 0; - memset(&user_ns.pad, 0, sizeof(user_ns.pad)); - - r = -EFAULT; - if (copy_to_user(argp, &user_ns, sizeof(user_ns))) - goto out; - r = 0; + case KVM_GET_CLOCK: + r = kvm_vm_ioctl_get_clock(kvm, argp); break; - } case KVM_MEMORY_ENCRYPT_OP: { r = -ENOTTY; if (kvm_x86_ops.mem_enc_op) @@ -6248,6 +6530,11 @@ static void kvm_init_msr_list(void) min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) continue; break; + case MSR_IA32_XFD: + case MSR_IA32_XFD_ERR: + if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) + continue; + break; default: break; } @@ -6331,13 +6618,14 @@ void kvm_get_segment(struct kvm_vcpu *vcpu, gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, struct x86_exception *exception) { + struct kvm_mmu *mmu = vcpu->arch.mmu; gpa_t t_gpa; BUG_ON(!mmu_is_nested(vcpu)); /* NPT walks are always user-walks */ access |= PFERR_USER_MASK; - t_gpa = vcpu->arch.mmu->gva_to_gpa(vcpu, gpa, access, exception); + t_gpa = mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception); return t_gpa; } @@ -6345,25 +6633,31 @@ gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; - return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); + return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); } EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read); gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_FETCH_MASK; - return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); + return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); } gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_WRITE_MASK; - return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); + return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); } EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write); @@ -6371,19 +6665,21 @@ EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write); gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { - return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception); + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + + return mmu->gva_to_gpa(vcpu, mmu, gva, 0, exception); } static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, struct kvm_vcpu *vcpu, u32 access, struct x86_exception *exception) { + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; void *data = val; int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access, - exception); + gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception); unsigned offset = addr & (PAGE_SIZE-1); unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; @@ -6411,13 +6707,14 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, struct x86_exception *exception) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; unsigned offset; int ret; /* Inline kvm_read_guest_virt_helper for speed. */ - gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access|PFERR_FETCH_MASK, - exception); + gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access|PFERR_FETCH_MASK, + exception); if (unlikely(gpa == UNMAPPED_GVA)) return X86EMUL_PROPAGATE_FAULT; @@ -6476,13 +6773,12 @@ static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes struct kvm_vcpu *vcpu, u32 access, struct x86_exception *exception) { + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; void *data = val; int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, - access, - exception); + gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception); unsigned offset = addr & (PAGE_SIZE-1); unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; @@ -6569,6 +6865,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, gpa_t *gpa, struct x86_exception *exception, bool write) { + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; u32 access = ((static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0) | (write ? PFERR_WRITE_MASK : 0); @@ -6586,7 +6883,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, return 1; } - *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); + *gpa = mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); if (*gpa == UNMAPPED_GVA) return -1; @@ -6906,7 +7203,7 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) } static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, - unsigned short port, void *val, + unsigned short port, unsigned int count, bool in) { vcpu->arch.pio.port = port; @@ -6914,10 +7211,8 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, vcpu->arch.pio.count = count; vcpu->arch.pio.size = size; - if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { - vcpu->arch.pio.count = 0; + if (!kernel_pio(vcpu, vcpu->arch.pio_data)) return 1; - } vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; @@ -6929,26 +7224,44 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, return 0; } -static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, - unsigned short port, void *val, unsigned int count) +static int __emulator_pio_in(struct kvm_vcpu *vcpu, int size, + unsigned short port, unsigned int count) { - int ret; + WARN_ON(vcpu->arch.pio.count); + memset(vcpu->arch.pio_data, 0, size * count); + return emulator_pio_in_out(vcpu, size, port, count, true); +} - if (vcpu->arch.pio.count) - goto data_avail; +static void complete_emulator_pio_in(struct kvm_vcpu *vcpu, void *val) +{ + int size = vcpu->arch.pio.size; + unsigned count = vcpu->arch.pio.count; + memcpy(val, vcpu->arch.pio_data, size * count); + trace_kvm_pio(KVM_PIO_IN, vcpu->arch.pio.port, size, count, vcpu->arch.pio_data); + vcpu->arch.pio.count = 0; +} - memset(vcpu->arch.pio_data, 0, size * count); +static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, + unsigned short port, void *val, unsigned int count) +{ + if (vcpu->arch.pio.count) { + /* + * Complete a previous iteration that required userspace I/O. + * Note, @count isn't guaranteed to match pio.count as userspace + * can modify ECX before rerunning the vCPU. Ignore any such + * shenanigans as KVM doesn't support modifying the rep count, + * and the emulator ensures @count doesn't overflow the buffer. + */ + } else { + int r = __emulator_pio_in(vcpu, size, port, count); + if (!r) + return r; - ret = emulator_pio_in_out(vcpu, size, port, val, count, true); - if (ret) { -data_avail: - memcpy(val, vcpu->arch.pio_data, size * count); - trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data); - vcpu->arch.pio.count = 0; - return 1; + /* Results already available, fall through. */ } - return 0; + complete_emulator_pio_in(vcpu, val); + return 1; } static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, @@ -6963,9 +7276,15 @@ static int emulator_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port, const void *val, unsigned int count) { + int ret; + memcpy(vcpu->arch.pio_data, val, size * count); trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data); - return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); + ret = emulator_pio_in_out(vcpu, size, port, count, false); + if (ret) + vcpu->arch.pio.count = 0; + + return ret; } static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, @@ -7198,7 +7517,8 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, r = kvm_get_msr(vcpu, msr_index, pdata); - if (r && kvm_get_msr_user_space(vcpu, msr_index, r)) { + if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0, + complete_emulated_rdmsr, r)) { /* Bounce to user space */ return X86EMUL_IO_NEEDED; } @@ -7214,7 +7534,8 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, r = kvm_set_msr(vcpu, msr_index, data); - if (r && kvm_set_msr_user_space(vcpu, msr_index, data, r)) { + if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data, + complete_emulated_msr_access, r)) { /* Bounce to user space */ return X86EMUL_IO_NEEDED; } @@ -7239,7 +7560,9 @@ static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase) static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt, u32 pmc) { - return kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc); + if (kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc)) + return 0; + return -EINVAL; } static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, @@ -7475,28 +7798,77 @@ void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) } EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); -static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu) +static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data, + u8 ndata, u8 *insn_bytes, u8 insn_size) { - struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; - u32 insn_size = ctxt->fetch.end - ctxt->fetch.data; struct kvm_run *run = vcpu->run; + u64 info[5]; + u8 info_start; + + /* + * Zero the whole array used to retrieve the exit info, as casting to + * u32 for select entries will leave some chunks uninitialized. + */ + memset(&info, 0, sizeof(info)); + + static_call(kvm_x86_get_exit_info)(vcpu, (u32 *)&info[0], &info[1], + &info[2], (u32 *)&info[3], + (u32 *)&info[4]); run->exit_reason = KVM_EXIT_INTERNAL_ERROR; run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION; - run->emulation_failure.ndata = 0; + + /* + * There's currently space for 13 entries, but 5 are used for the exit + * reason and info. Restrict to 4 to reduce the maintenance burden + * when expanding kvm_run.emulation_failure in the future. + */ + if (WARN_ON_ONCE(ndata > 4)) + ndata = 4; + + /* Always include the flags as a 'data' entry. */ + info_start = 1; run->emulation_failure.flags = 0; if (insn_size) { - run->emulation_failure.ndata = 3; + BUILD_BUG_ON((sizeof(run->emulation_failure.insn_size) + + sizeof(run->emulation_failure.insn_bytes) != 16)); + info_start += 2; run->emulation_failure.flags |= KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES; run->emulation_failure.insn_size = insn_size; memset(run->emulation_failure.insn_bytes, 0x90, sizeof(run->emulation_failure.insn_bytes)); - memcpy(run->emulation_failure.insn_bytes, - ctxt->fetch.data, insn_size); + memcpy(run->emulation_failure.insn_bytes, insn_bytes, insn_size); } + + memcpy(&run->internal.data[info_start], info, sizeof(info)); + memcpy(&run->internal.data[info_start + ARRAY_SIZE(info)], data, + ndata * sizeof(data[0])); + + run->emulation_failure.ndata = info_start + ARRAY_SIZE(info) + ndata; +} + +static void prepare_emulation_ctxt_failure_exit(struct kvm_vcpu *vcpu) +{ + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; + + prepare_emulation_failure_exit(vcpu, NULL, 0, ctxt->fetch.data, + ctxt->fetch.end - ctxt->fetch.data); +} + +void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data, + u8 ndata) +{ + prepare_emulation_failure_exit(vcpu, data, ndata, NULL, 0); +} +EXPORT_SYMBOL_GPL(__kvm_prepare_emulation_failure_exit); + +void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu) +{ + __kvm_prepare_emulation_failure_exit(vcpu, NULL, 0); } +EXPORT_SYMBOL_GPL(kvm_prepare_emulation_failure_exit); static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) { @@ -7512,16 +7884,14 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) if (kvm->arch.exit_on_emulation_error || (emulation_type & EMULTYPE_SKIP)) { - prepare_emulation_failure_exit(vcpu); + prepare_emulation_ctxt_failure_exit(vcpu); return 0; } kvm_queue_exception(vcpu, UD_VECTOR); if (!is_guest_mode(vcpu) && static_call(kvm_x86_get_cpl)(vcpu) == 0) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; + prepare_emulation_ctxt_failure_exit(vcpu); return 0; } @@ -7716,6 +8086,8 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) if (unlikely(!r)) return 0; + kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS); + /* * rflags is the old, "raw" value of the flags. The new value has * not been saved yet. @@ -7883,12 +8255,23 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, } /* - * Note, EMULTYPE_SKIP is intended for use *only* by vendor callbacks - * for kvm_skip_emulated_instruction(). The caller is responsible for - * updating interruptibility state and injecting single-step #DBs. + * EMULTYPE_SKIP without EMULTYPE_COMPLETE_USER_EXIT is intended for + * use *only* by vendor callbacks for kvm_skip_emulated_instruction(). + * The caller is responsible for updating interruptibility state and + * injecting single-step #DBs. */ if (emulation_type & EMULTYPE_SKIP) { - kvm_rip_write(vcpu, ctxt->_eip); + if (ctxt->mode != X86EMUL_MODE_PROT64) + ctxt->eip = (u32)ctxt->_eip; + else + ctxt->eip = ctxt->_eip; + + if (emulation_type & EMULTYPE_COMPLETE_USER_EXIT) { + r = 1; + goto writeback; + } + + kvm_rip_write(vcpu, ctxt->eip); if (ctxt->eflags & X86_EFLAGS_RF) kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF); return 1; @@ -7952,17 +8335,24 @@ restart: writeback = false; r = 0; vcpu->arch.complete_userspace_io = complete_emulated_mmio; + } else if (vcpu->arch.complete_userspace_io) { + writeback = false; + r = 0; } else if (r == EMULATION_RESTART) goto restart; else r = 1; +writeback: if (writeback) { unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu); toggle_interruptibility(vcpu, ctxt->interruptibility); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; if (!ctxt->have_exception || exception_type(ctxt->exception.vector) == EXCPT_TRAP) { + kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS); + if (ctxt->is_branch) + kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS); kvm_rip_write(vcpu, ctxt->eip); if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) r = kvm_vcpu_do_singlestep(vcpu); @@ -8121,14 +8511,13 @@ static void tsc_khz_changed(void *data) static void kvm_hyperv_tsc_notifier(void) { struct kvm *kvm; - struct kvm_vcpu *vcpu; int cpu; - unsigned long flags; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) kvm_make_mclock_inprogress_request(kvm); + /* no guest entries from this point */ hyperv_stop_tsc_emulation(); /* TSC frequency always matches when on Hyper-V */ @@ -8137,18 +8526,11 @@ static void kvm_hyperv_tsc_notifier(void) kvm_max_guest_tsc_khz = tsc_khz; list_for_each_entry(kvm, &vm_list, vm_list) { - struct kvm_arch *ka = &kvm->arch; - - spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); + __kvm_start_pvclock_update(kvm); pvclock_update_vm_gtod_copy(kvm); - spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); - - kvm_for_each_vcpu(cpu, vcpu, kvm) - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - - kvm_for_each_vcpu(cpu, vcpu, kvm) - kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); + kvm_end_pvclock_update(kvm); } + mutex_unlock(&kvm_lock); } #endif @@ -8157,7 +8539,8 @@ static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu) { struct kvm *kvm; struct kvm_vcpu *vcpu; - int i, send_ipi = 0; + int send_ipi = 0; + unsigned long i; /* * We allow guests to temporarily run on slowing clocks, @@ -8282,57 +8665,12 @@ static void kvm_timer_init(void) kvmclock_cpu_online, kvmclock_cpu_down_prep); } -DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); -EXPORT_PER_CPU_SYMBOL_GPL(current_vcpu); - -int kvm_is_in_guest(void) -{ - return __this_cpu_read(current_vcpu) != NULL; -} - -static int kvm_is_user_mode(void) -{ - int user_mode = 3; - - if (__this_cpu_read(current_vcpu)) - user_mode = static_call(kvm_x86_get_cpl)(__this_cpu_read(current_vcpu)); - - return user_mode != 0; -} - -static unsigned long kvm_get_guest_ip(void) -{ - unsigned long ip = 0; - - if (__this_cpu_read(current_vcpu)) - ip = kvm_rip_read(__this_cpu_read(current_vcpu)); - - return ip; -} - -static void kvm_handle_intel_pt_intr(void) -{ - struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu); - - kvm_make_request(KVM_REQ_PMI, vcpu); - __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT, - (unsigned long *)&vcpu->arch.pmu.global_status); -} - -static struct perf_guest_info_callbacks kvm_guest_cbs = { - .is_in_guest = kvm_is_in_guest, - .is_user_mode = kvm_is_user_mode, - .get_guest_ip = kvm_get_guest_ip, - .handle_intel_pt_intr = kvm_handle_intel_pt_intr, -}; - #ifdef CONFIG_X86_64 static void pvclock_gtod_update_fn(struct work_struct *work) { struct kvm *kvm; - struct kvm_vcpu *vcpu; - int i; + unsigned long i; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) @@ -8389,18 +8727,20 @@ int kvm_arch_init(void *opaque) int r; if (kvm_x86_ops.hardware_enable) { - printk(KERN_ERR "kvm: already loaded the other module\n"); + pr_err("kvm: already loaded vendor module '%s'\n", kvm_x86_ops.name); r = -EEXIST; goto out; } if (!ops->cpu_has_kvm_support()) { - pr_err_ratelimited("kvm: no hardware support\n"); + pr_err_ratelimited("kvm: no hardware support for '%s'\n", + ops->runtime_ops->name); r = -EOPNOTSUPP; goto out; } if (ops->disabled_by_bios()) { - pr_err_ratelimited("kvm: disabled by bios\n"); + pr_err_ratelimited("kvm: support for '%s' disabled by bios\n", + ops->runtime_ops->name); r = -EOPNOTSUPP; goto out; } @@ -8417,18 +8757,11 @@ int kvm_arch_init(void *opaque) } r = -ENOMEM; - x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu), - __alignof__(struct fpu), SLAB_ACCOUNT, - NULL); - if (!x86_fpu_cache) { - printk(KERN_ERR "kvm: failed to allocate cache for x86 fpu\n"); - goto out; - } x86_emulator_cache = kvm_alloc_emulator_cache(); if (!x86_emulator_cache) { pr_err("kvm: failed to allocate cache for x86 emulator\n"); - goto out_free_x86_fpu_cache; + goto out; } user_return_msrs = alloc_percpu(struct kvm_user_return_msrs); @@ -8444,8 +8777,6 @@ int kvm_arch_init(void *opaque) kvm_timer_init(); - perf_register_guest_info_callbacks(&kvm_guest_cbs); - if (boot_cpu_has(X86_FEATURE_XSAVE)) { host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0; @@ -8466,8 +8797,6 @@ out_free_percpu: free_percpu(user_return_msrs); out_free_x86_emulator_cache: kmem_cache_destroy(x86_emulator_cache); -out_free_x86_fpu_cache: - kmem_cache_destroy(x86_fpu_cache); out: return r; } @@ -8479,7 +8808,6 @@ void kvm_arch_exit(void) clear_hv_tscchange_cb(); #endif kvm_lapic_exit(); - perf_unregister_guest_info_callbacks(&kvm_guest_cbs); if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, @@ -8494,15 +8822,21 @@ void kvm_arch_exit(void) kvm_mmu_module_exit(); free_percpu(user_return_msrs); kmem_cache_destroy(x86_emulator_cache); - kmem_cache_destroy(x86_fpu_cache); #ifdef CONFIG_KVM_XEN static_key_deferred_flush(&kvm_xen_enabled); WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key)); #endif } -static int __kvm_vcpu_halt(struct kvm_vcpu *vcpu, int state, int reason) +static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason) { + /* + * The vCPU has halted, e.g. executed HLT. Update the run state if the + * local APIC is in-kernel, the run loop will detect the non-runnable + * state and halt the vCPU. Exit to userspace if the local APIC is + * managed by userspace, in which case userspace is responsible for + * handling wake events. + */ ++vcpu->stat.halt_exits; if (lapic_in_kernel(vcpu)) { vcpu->arch.mp_state = state; @@ -8513,11 +8847,11 @@ static int __kvm_vcpu_halt(struct kvm_vcpu *vcpu, int state, int reason) } } -int kvm_vcpu_halt(struct kvm_vcpu *vcpu) +int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu) { - return __kvm_vcpu_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT); + return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT); } -EXPORT_SYMBOL_GPL(kvm_vcpu_halt); +EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip); int kvm_emulate_halt(struct kvm_vcpu *vcpu) { @@ -8526,7 +8860,7 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered * KVM_EXIT_DEBUG here. */ - return kvm_vcpu_halt(vcpu) && ret; + return kvm_emulate_halt_noskip(vcpu) && ret; } EXPORT_SYMBOL_GPL(kvm_emulate_halt); @@ -8534,7 +8868,8 @@ int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu) { int ret = kvm_skip_emulated_instruction(vcpu); - return __kvm_vcpu_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD, KVM_EXIT_AP_RESET_HOLD) && ret; + return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD, + KVM_EXIT_AP_RESET_HOLD) && ret; } EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold); @@ -8595,12 +8930,11 @@ EXPORT_SYMBOL_GPL(kvm_apicv_activated); static void kvm_apicv_init(struct kvm *kvm) { - mutex_init(&kvm->arch.apicv_update_lock); + init_rwsem(&kvm->arch.apicv_update_lock); - if (enable_apicv) - clear_bit(APICV_INHIBIT_REASON_DISABLE, - &kvm->arch.apicv_inhibit_reasons); - else + set_bit(APICV_INHIBIT_REASON_ABSENT, + &kvm->arch.apicv_inhibit_reasons); + if (!enable_apicv) set_bit(APICV_INHIBIT_REASON_DISABLE, &kvm->arch.apicv_inhibit_reasons); } @@ -8669,7 +9003,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) trace_kvm_hypercall(nr, a0, a1, a2, a3); - op_64_bit = is_64_bit_mode(vcpu); + op_64_bit = is_64_bit_hypercall(vcpu); if (!op_64_bit) { nr &= 0xFFFFFFFF; a0 &= 0xFFFFFFFF; @@ -8773,19 +9107,20 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) { struct kvm_run *kvm_run = vcpu->run; - /* - * if_flag is obsolete and useless, so do not bother - * setting it for SEV-ES guests. Userspace can just - * use kvm_run->ready_for_interrupt_injection. - */ - kvm_run->if_flag = !vcpu->arch.guest_state_protected - && (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; - + kvm_run->if_flag = static_call(kvm_x86_get_if_flag)(vcpu); kvm_run->cr8 = kvm_get_cr8(vcpu); kvm_run->apic_base = kvm_get_apic_base(vcpu); + + /* + * The call to kvm_ready_for_interrupt_injection() may end up in + * kvm_xen_has_interrupt() which may require the srcu lock to be + * held, to protect against changes in the vcpu_info address. + */ + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_run->ready_for_interrupt_injection = pic_in_kernel(vcpu->kvm) || kvm_vcpu_ready_for_interrupt_injection(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); if (is_smm(vcpu)) kvm_run->flags |= KVM_RUN_X86_SMM; @@ -9242,14 +9577,7 @@ static void process_smi(struct kvm_vcpu *vcpu) void kvm_make_scan_ioapic_request_mask(struct kvm *kvm, unsigned long *vcpu_bitmap) { - cpumask_var_t cpus; - - zalloc_cpumask_var(&cpus, GFP_ATOMIC); - - kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC, - NULL, vcpu_bitmap, cpus); - - free_cpumask_var(cpus); + kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC, vcpu_bitmap); } void kvm_make_scan_ioapic_request(struct kvm *kvm) @@ -9264,7 +9592,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) if (!lapic_in_kernel(vcpu)) return; - mutex_lock(&vcpu->kvm->arch.apicv_update_lock); + down_read(&vcpu->kvm->arch.apicv_update_lock); activate = kvm_apicv_activated(vcpu->kvm); if (vcpu->arch.apicv_active == activate) @@ -9284,7 +9612,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); out: - mutex_unlock(&vcpu->kvm->arch.apicv_update_lock); + up_read(&vcpu->kvm->arch.apicv_update_lock); } EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv); @@ -9292,6 +9620,8 @@ void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) { unsigned long old, new; + lockdep_assert_held_write(&kvm->arch.apicv_update_lock); + if (!kvm_x86_ops.check_apicv_inhibit_reasons || !static_call(kvm_x86_check_apicv_inhibit_reasons)(bit)) return; @@ -9305,6 +9635,18 @@ void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) if (!!old != !!new) { trace_kvm_apicv_update_request(activate, bit); + /* + * Kick all vCPUs before setting apicv_inhibit_reasons to avoid + * false positives in the sanity check WARN in svm_vcpu_run(). + * This task will wait for all vCPUs to ack the kick IRQ before + * updating apicv_inhibit_reasons, and all other vCPUs will + * block on acquiring apicv_update_lock so that vCPUs can't + * redo svm_vcpu_run() without seeing the new inhibit state. + * + * Note, holding apicv_update_lock and taking it in the read + * side (handling the request) also prevents other vCPUs from + * servicing the request with a stale apicv_inhibit_reasons. + */ kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE); kvm->arch.apicv_inhibit_reasons = new; if (new) { @@ -9318,9 +9660,9 @@ EXPORT_SYMBOL_GPL(__kvm_request_apicv_update); void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) { - mutex_lock(&kvm->arch.apicv_update_lock); + down_write(&kvm->arch.apicv_update_lock); __kvm_request_apicv_update(kvm, activate, bit); - mutex_unlock(&kvm->arch.apicv_update_lock); + up_write(&kvm->arch.apicv_update_lock); } EXPORT_SYMBOL_GPL(kvm_request_apicv_update); @@ -9334,8 +9676,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) if (irqchip_split(vcpu->kvm)) kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors); else { - if (vcpu->arch.apicv_active) - static_call(kvm_x86_sync_pir_to_irr)(vcpu); + static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); if (ioapic_in_kernel(vcpu->kvm)) kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); } @@ -9353,12 +9694,16 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu) if (!kvm_apic_hw_enabled(vcpu->arch.apic)) return; - if (to_hv_vcpu(vcpu)) + if (to_hv_vcpu(vcpu)) { bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors, to_hv_synic(vcpu)->vec_bitmap, 256); + static_call(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap); + return; + } - static_call(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap); + static_call(kvm_x86_load_eoi_exitmap)( + vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors); } void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, @@ -9417,7 +9762,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_request_pending(vcpu)) { - if (kvm_check_request(KVM_REQ_VM_BUGGED, vcpu)) { + if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) { r = -EIO; goto out; } @@ -9432,7 +9777,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) __kvm_migrate_timers(vcpu); if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu)) - kvm_gen_update_masterclock(vcpu->kvm); + kvm_update_masterclock(vcpu->kvm); if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu)) kvm_gen_kvmclock_update(vcpu); if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { @@ -9450,10 +9795,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) /* Flushing all ASIDs flushes the current ASID... */ kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } - if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) - kvm_vcpu_flush_tlb_current(vcpu); - if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu)) - kvm_vcpu_flush_tlb_guest(vcpu); + kvm_service_local_tlb_flush_requests(vcpu); if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; @@ -9604,10 +9946,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) /* * This handles the case where a posted interrupt was - * notified with kvm_vcpu_kick. + * notified with kvm_vcpu_kick. Assigned devices can + * use the POSTED_INTR_VECTOR even if APICv is disabled, + * so do it even if APICv is disabled on this vCPU. */ - if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active) - static_call(kvm_x86_sync_pir_to_irr)(vcpu); + if (kvm_lapic_enabled(vcpu)) + static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); if (kvm_vcpu_exit_request(vcpu)) { vcpu->mode = OUTSIDE_GUEST_MODE; @@ -9628,6 +9972,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (test_thread_flag(TIF_NEED_FPU_LOAD)) switch_fpu_return(); + if (vcpu->arch.guest_fpu.xfd_err) + wrmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); + if (unlikely(vcpu->arch.switch_db_regs)) { set_debugreg(0, 7); set_debugreg(vcpu->arch.eff_db[0], 0); @@ -9639,18 +9986,26 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } for (;;) { + /* + * Assert that vCPU vs. VM APICv state is consistent. An APICv + * update must kick and wait for all vCPUs before toggling the + * per-VM state, and responsing vCPUs must wait for the update + * to complete before servicing KVM_REQ_APICV_UPDATE. + */ + WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu)); + exit_fastpath = static_call(kvm_x86_run)(vcpu); if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) break; - if (unlikely(kvm_vcpu_exit_request(vcpu))) { + if (kvm_lapic_enabled(vcpu)) + static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); + + if (unlikely(kvm_vcpu_exit_request(vcpu))) { exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED; break; } - - if (vcpu->arch.apicv_active) - static_call(kvm_x86_sync_pir_to_irr)(vcpu); - } + } /* * Do this here before restoring debug registers on the host. And @@ -9681,8 +10036,19 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); + /* + * Sync xfd before calling handle_exit_irqoff() which may + * rely on the fact that guest_fpu::xfd is up-to-date (e.g. + * in #NM irqoff handler). + */ + if (vcpu->arch.xfd_no_write_intercept) + fpu_sync_guest_vmexit_xfd_state(); + static_call(kvm_x86_handle_exit_irqoff)(vcpu); + if (vcpu->arch.guest_fpu.xfd_err) + wrmsrl(MSR_IA32_XFD_ERR, 0); + /* * Consume any pending interrupts, including the possible source of * VM-Exit on SVM and any ticks that occur between VM-Exit and now. @@ -9690,7 +10056,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * interrupts on processors that implement an interrupt shadow, the * stat.exits increment will do nicely. */ - kvm_before_interrupt(vcpu); + kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ); local_irq_enable(); ++vcpu->stat.exits; local_irq_disable(); @@ -9750,7 +10116,10 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) if (!kvm_arch_vcpu_runnable(vcpu) && (!kvm_x86_ops.pre_block || static_call(kvm_x86_pre_block)(vcpu) == 0)) { srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); - kvm_vcpu_block(vcpu); + if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) + kvm_vcpu_halt(vcpu); + else + kvm_vcpu_block(vcpu); vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); if (kvm_x86_ops.post_block) @@ -9913,58 +10282,21 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) return 0; } -static void kvm_save_current_fpu(struct fpu *fpu) -{ - /* - * If the target FPU state is not resident in the CPU registers, just - * memcpy() from current, else save CPU state directly to the target. - */ - if (test_thread_flag(TIF_NEED_FPU_LOAD)) - memcpy(&fpu->state, ¤t->thread.fpu.state, - fpu_kernel_xstate_size); - else - save_fpregs_to_fpstate(fpu); -} - /* Swap (qemu) user FPU context for the guest FPU context. */ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { - fpregs_lock(); - - kvm_save_current_fpu(vcpu->arch.user_fpu); - /* - * Guests with protected state can't have it set by the hypervisor, - * so skip trying to set it. + * Exclude PKRU from restore as restored separately in + * kvm_x86_ops.run(). */ - if (vcpu->arch.guest_fpu) - /* PKRU is separately restored in kvm_x86_ops.run. */ - __restore_fpregs_from_fpstate(&vcpu->arch.guest_fpu->state, - ~XFEATURE_MASK_PKRU); - - fpregs_mark_activate(); - fpregs_unlock(); - + fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true); trace_kvm_fpu(1); } /* When vcpu_run ends, restore user space FPU context. */ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { - fpregs_lock(); - - /* - * Guests with protected state can't have it read by the hypervisor, - * so skip trying to save it. - */ - if (vcpu->arch.guest_fpu) - kvm_save_current_fpu(vcpu->arch.guest_fpu); - - restore_fpregs_from_fpstate(&vcpu->arch.user_fpu->state); - - fpregs_mark_activate(); - fpregs_unlock(); - + fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false); ++vcpu->stat.fpu_reload; trace_kvm_fpu(0); } @@ -10347,7 +10679,8 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs, vcpu->arch.cr2 = sregs->cr2; *mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; vcpu->arch.cr3 = sregs->cr3; - kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); + kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); + static_call_cond(kvm_x86_post_set_cr3)(vcpu, sregs->cr3); kvm_set_cr8(vcpu, sregs->cr8); @@ -10364,7 +10697,7 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs, if (update_pdptrs) { idx = srcu_read_lock(&vcpu->kvm->srcu); if (is_pae_paging(vcpu)) { - load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); + load_pdptrs(vcpu, kvm_read_cr3(vcpu)); *mmu_reset_needed = 1; } srcu_read_unlock(&vcpu->kvm->srcu, idx); @@ -10458,6 +10791,24 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return ret; } +static void kvm_arch_vcpu_guestdbg_update_apicv_inhibit(struct kvm *kvm) +{ + bool inhibit = false; + struct kvm_vcpu *vcpu; + unsigned long i; + + down_write(&kvm->arch.apicv_update_lock); + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ) { + inhibit = true; + break; + } + } + __kvm_request_apicv_update(kvm, !inhibit, APICV_INHIBIT_REASON_BLOCKIRQ); + up_write(&kvm->arch.apicv_update_lock); +} + int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { @@ -10510,6 +10861,8 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, static_call(kvm_x86_update_exception_bitmap)(vcpu); + kvm_arch_vcpu_guestdbg_update_apicv_inhibit(vcpu->kvm); + r = 0; out: @@ -10545,12 +10898,12 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { struct fxregs_state *fxsave; - if (!vcpu->arch.guest_fpu) + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) return 0; vcpu_load(vcpu); - fxsave = &vcpu->arch.guest_fpu->state.fxsave; + fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave; memcpy(fpu->fpr, fxsave->st_space, 128); fpu->fcw = fxsave->cwd; fpu->fsw = fxsave->swd; @@ -10568,12 +10921,12 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { struct fxregs_state *fxsave; - if (!vcpu->arch.guest_fpu) + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) return 0; vcpu_load(vcpu); - fxsave = &vcpu->arch.guest_fpu->state.fxsave; + fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave; memcpy(fxsave->st_space, fpu->fpr, 128); fxsave->cwd = fpu->fcw; @@ -10624,33 +10977,6 @@ static int sync_regs(struct kvm_vcpu *vcpu) return 0; } -static void fx_init(struct kvm_vcpu *vcpu) -{ - if (!vcpu->arch.guest_fpu) - return; - - fpstate_init(&vcpu->arch.guest_fpu->state); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv = - host_xcr0 | XSTATE_COMPACTION_ENABLED; - - /* - * Ensure guest xcr0 is valid for loading - */ - vcpu->arch.xcr0 = XFEATURE_MASK_FP; - - vcpu->arch.cr0 |= X86_CR0_ET; -} - -void kvm_free_guest_fpu(struct kvm_vcpu *vcpu) -{ - if (vcpu->arch.guest_fpu) { - kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); - vcpu->arch.guest_fpu = NULL; - } -} -EXPORT_SYMBOL_GPL(kvm_free_guest_fpu); - int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) { if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) @@ -10707,20 +11033,10 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) if (!alloc_emulate_ctxt(vcpu)) goto free_wbinvd_dirty_mask; - vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.user_fpu) { - pr_err("kvm: failed to allocate userspace's fpu\n"); - goto free_emulate_ctxt; - } - - vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.guest_fpu) { + if (!fpu_alloc_guest_fpstate(&vcpu->arch.guest_fpu)) { pr_err("kvm: failed to allocate vcpu's fpu\n"); - goto free_user_fpu; + goto free_emulate_ctxt; } - fx_init(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu); @@ -10752,9 +11068,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) return 0; free_guest_fpu: - kvm_free_guest_fpu(vcpu); -free_user_fpu: - kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); + fpu_free_guest_fpstate(&vcpu->arch.guest_fpu); free_emulate_ctxt: kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt); free_wbinvd_dirty_mask: @@ -10792,19 +11106,15 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { - struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache; int idx; - kvm_release_pfn(cache->pfn, cache->dirty, cache); - kvmclock_reset(vcpu); static_call(kvm_x86_vcpu_free)(vcpu); kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt); free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); - kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); - kvm_free_guest_fpu(vcpu); + fpu_free_guest_fpstate(&vcpu->arch.guest_fpu); kvm_hv_vcpu_uninit(vcpu); kvm_pmu_destroy(vcpu); @@ -10821,9 +11131,19 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { + struct kvm_cpuid_entry2 *cpuid_0x1; unsigned long old_cr0 = kvm_read_cr0(vcpu); unsigned long new_cr0; - u32 eax, dummy; + + /* + * Several of the "set" flows, e.g. ->set_cr0(), read other registers + * to handle side effects. RESET emulation hits those flows and relies + * on emulated/virtualized registers, including those that are loaded + * into hardware, to be zeroed at vCPU creation. Use CRs as a sentinel + * to detect improper or missing initialization. + */ + WARN_ON_ONCE(!init_event && + (old_cr0 || kvm_read_cr3(vcpu) || kvm_read_cr4(vcpu))); kvm_lapic_reset(vcpu, init_event); @@ -10856,8 +11176,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_async_pf_hash_reset(vcpu); vcpu->arch.apf.halted = false; - if (vcpu->arch.guest_fpu && kvm_mpx_supported()) { - void *mpx_state_buffer; + if (vcpu->arch.guest_fpu.fpstate && kvm_mpx_supported()) { + struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate; /* * To avoid have the INIT path from kvm_apic_has_events() that be @@ -10865,14 +11185,10 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) */ if (init_event) kvm_put_guest_fpu(vcpu); - mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave, - XFEATURE_BNDREGS); - if (mpx_state_buffer) - memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state)); - mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave, - XFEATURE_BNDCSR); - if (mpx_state_buffer) - memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr)); + + fpstate_clear_xstate_component(fpstate, XFEATURE_BNDREGS); + fpstate_clear_xstate_component(fpstate, XFEATURE_BNDCSR); + if (init_event) kvm_load_guest_fpu(vcpu); } @@ -10886,21 +11202,19 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.xcr0 = XFEATURE_MASK_FP; } + /* All GPRs except RDX (handled below) are zeroed on RESET/INIT. */ memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); - vcpu->arch.regs_avail = ~0; - vcpu->arch.regs_dirty = ~0; + kvm_register_mark_dirty(vcpu, VCPU_REGS_RSP); /* * Fall back to KVM's default Family/Model/Stepping of 0x600 (P6/Athlon) * if no CPUID match is found. Note, it's impossible to get a match at * RESET since KVM emulates RESET before exposing the vCPU to userspace, - * i.e. it'simpossible for kvm_cpuid() to find a valid entry on RESET. - * But, go through the motions in case that's ever remedied. + * i.e. it's impossible for kvm_find_cpuid_entry() to find a valid entry + * on RESET. But, go through the motions in case that's ever remedied. */ - eax = 1; - if (!kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true)) - eax = 0x600; - kvm_rdx_write(vcpu, eax); + cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1, 0); + kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600); vcpu->arch.ia32_xss = 0; @@ -10969,7 +11283,7 @@ int kvm_arch_hardware_enable(void) { struct kvm *kvm; struct kvm_vcpu *vcpu; - int i; + unsigned long i; int ret; u64 local_tsc; u64 max_tsc = 0; @@ -11079,6 +11393,8 @@ int kvm_arch_hardware_setup(void *opaque) memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops)); kvm_ops_static_call_update(); + kvm_register_perf_callbacks(ops->handle_intel_pt_intr); + if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) supported_xss = 0; @@ -11106,6 +11422,8 @@ int kvm_arch_hardware_setup(void *opaque) void kvm_arch_hardware_unsetup(void) { + kvm_unregister_perf_callbacks(); + static_call(kvm_x86_hardware_unsetup)(); } @@ -11152,13 +11470,14 @@ void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) void kvm_arch_free_vm(struct kvm *kvm) { kfree(to_kvm_hv(kvm)->hv_pa_pg); - vfree(kvm); + __kvm_arch_free_vm(kvm); } int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { int ret; + unsigned long flags; if (type) return -EINVAL; @@ -11182,10 +11501,12 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) raw_spin_lock_init(&kvm->arch.tsc_write_lock); mutex_init(&kvm->arch.apic_map_lock); - spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); - + seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock); kvm->arch.kvmclock_offset = -get_kvmclock_base_ns(); + + raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); pvclock_update_vm_gtod_copy(kvm); + raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); kvm->arch.guest_can_read_msr_platform_info = true; @@ -11219,7 +11540,7 @@ static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) static void kvm_free_vcpus(struct kvm *kvm) { - unsigned int i; + unsigned long i; struct kvm_vcpu *vcpu; /* @@ -11229,15 +11550,8 @@ static void kvm_free_vcpus(struct kvm *kvm) kvm_clear_async_pf_completion_queue(vcpu); kvm_unload_vcpu_mmu(vcpu); } - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_vcpu_destroy(vcpu); - - mutex_lock(&kvm->lock); - for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) - kvm->vcpus[i] = NULL; - atomic_set(&kvm->online_vcpus, 0); - mutex_unlock(&kvm->lock); + kvm_destroy_vcpus(kvm); } void kvm_arch_sync_events(struct kvm *kvm) @@ -11382,8 +11696,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) kvm_page_track_free_memslot(slot); } -static int memslot_rmap_alloc(struct kvm_memory_slot *slot, - unsigned long npages) +int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages) { const int sz = sizeof(*slot->arch.rmap[0]); int i; @@ -11392,7 +11705,8 @@ static int memslot_rmap_alloc(struct kvm_memory_slot *slot, int level = i + 1; int lpages = __kvm_mmu_slot_lpages(slot, npages, level); - WARN_ON(slot->arch.rmap[i]); + if (slot->arch.rmap[i]) + continue; slot->arch.rmap[i] = kvcalloc(lpages, sz, GFP_KERNEL_ACCOUNT); if (!slot->arch.rmap[i]) { @@ -11404,54 +11718,10 @@ static int memslot_rmap_alloc(struct kvm_memory_slot *slot, return 0; } -int alloc_all_memslots_rmaps(struct kvm *kvm) -{ - struct kvm_memslots *slots; - struct kvm_memory_slot *slot; - int r, i; - - /* - * Check if memslots alreday have rmaps early before acquiring - * the slots_arch_lock below. - */ - if (kvm_memslots_have_rmaps(kvm)) - return 0; - - mutex_lock(&kvm->slots_arch_lock); - - /* - * Read memslots_have_rmaps again, under the slots arch lock, - * before allocating the rmaps - */ - if (kvm_memslots_have_rmaps(kvm)) { - mutex_unlock(&kvm->slots_arch_lock); - return 0; - } - - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - slots = __kvm_memslots(kvm, i); - kvm_for_each_memslot(slot, slots) { - r = memslot_rmap_alloc(slot, slot->npages); - if (r) { - mutex_unlock(&kvm->slots_arch_lock); - return r; - } - } - } - - /* - * Ensure that memslots_have_rmaps becomes true strictly after - * all the rmap pointers are set. - */ - smp_store_release(&kvm->arch.memslots_have_rmaps, true); - mutex_unlock(&kvm->slots_arch_lock); - return 0; -} - static int kvm_alloc_memslot_metadata(struct kvm *kvm, - struct kvm_memory_slot *slot, - unsigned long npages) + struct kvm_memory_slot *slot) { + unsigned long npages = slot->npages; int i, r; /* @@ -11498,7 +11768,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm, } } - if (kvm_page_track_create_memslot(slot, npages)) + if (kvm_page_track_create_memslot(kvm, slot, npages)) goto out_free; return 0; @@ -11516,7 +11786,7 @@ out_free: void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) { struct kvm_vcpu *vcpu; - int i; + unsigned long i; /* * memslots->generation has been incremented. @@ -11530,13 +11800,18 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) } int kvm_arch_prepare_memory_region(struct kvm *kvm, - struct kvm_memory_slot *memslot, - const struct kvm_userspace_memory_region *mem, - enum kvm_mr_change change) + const struct kvm_memory_slot *old, + struct kvm_memory_slot *new, + enum kvm_mr_change change) { if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) - return kvm_alloc_memslot_metadata(kvm, memslot, - mem->memory_size >> PAGE_SHIFT); + return kvm_alloc_memslot_metadata(kvm, new); + + if (change == KVM_MR_FLAGS_ONLY) + memcpy(&new->arch, &old->arch, sizeof(old->arch)); + else if (WARN_ON_ONCE(change != KVM_MR_DELETE)) + return -EIO; + return 0; } @@ -11560,13 +11835,15 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, const struct kvm_memory_slot *new, enum kvm_mr_change change) { - bool log_dirty_pages = new->flags & KVM_MEM_LOG_DIRTY_PAGES; + u32 old_flags = old ? old->flags : 0; + u32 new_flags = new ? new->flags : 0; + bool log_dirty_pages = new_flags & KVM_MEM_LOG_DIRTY_PAGES; /* * Update CPU dirty logging if dirty logging is being toggled. This * applies to all operations. */ - if ((old->flags ^ new->flags) & KVM_MEM_LOG_DIRTY_PAGES) + if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES) kvm_mmu_update_cpu_dirty_logging(kvm, log_dirty_pages); /* @@ -11584,7 +11861,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, * MOVE/DELETE: The old mappings will already have been cleaned up by * kvm_arch_flush_shadow_memslot(). */ - if ((change != KVM_MR_FLAGS_ONLY) || (new->flags & KVM_MEM_READONLY)) + if ((change != KVM_MR_FLAGS_ONLY) || (new_flags & KVM_MEM_READONLY)) return; /* @@ -11592,7 +11869,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, * other flag is LOG_DIRTY_PAGES, i.e. something is wrong if dirty * logging isn't being toggled on or off. */ - if (WARN_ON_ONCE(!((old->flags ^ new->flags) & KVM_MEM_LOG_DIRTY_PAGES))) + if (WARN_ON_ONCE(!((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES))) return; if (!log_dirty_pages) { @@ -11628,14 +11905,18 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, } void kvm_arch_commit_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem, struct kvm_memory_slot *old, const struct kvm_memory_slot *new, enum kvm_mr_change change) { - if (!kvm->arch.n_requested_mmu_pages) - kvm_mmu_change_mmu_pages(kvm, - kvm_mmu_calculate_default_mmu_pages(kvm)); + if (!kvm->arch.n_requested_mmu_pages && + (change == KVM_MR_CREATE || change == KVM_MR_DELETE)) { + unsigned long nr_mmu_pages; + + nr_mmu_pages = kvm->nr_memslot_pages / KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO; + nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES); + kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); + } kvm_mmu_slot_apply_flags(kvm, old, new, change); @@ -11736,6 +12017,11 @@ bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) return vcpu->arch.preempted_in_kernel; } +unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu) +{ + return kvm_rip_read(vcpu); +} + int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) { return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; @@ -12096,6 +12382,15 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, return static_call(kvm_x86_update_pi_irte)(kvm, host_irq, guest_irq, set); } +bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, + struct kvm_kernel_irq_routing_entry *new) +{ + if (new->type != KVM_IRQ_ROUTING_MSI) + return true; + + return !!memcmp(&old->msi, &new->msi, sizeof(new->msi)); +} + bool kvm_vector_hashing_enabled(void) { return vector_hashing; @@ -12136,12 +12431,13 @@ EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value); void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code) { + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; struct x86_exception fault; u32 access = error_code & (PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK); if (!(error_code & PFERR_PRESENT_MASK) || - vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, &fault) != UNMAPPED_GVA) { + mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != UNMAPPED_GVA) { /* * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page * tables probably do not match the TLB. Just proceed @@ -12177,9 +12473,7 @@ int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, * doesn't seem to be a real use-case behind such requests, just return * KVM_EXIT_INTERNAL_ERROR for now. */ - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; + kvm_prepare_emulation_failure_exit(vcpu); return 0; } @@ -12239,7 +12533,8 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) return kvm_skip_emulated_instruction(vcpu); default: - BUG(); /* We have already checked above that type <= 3 */ + kvm_inject_gp(vcpu, 0); + return 1; } } EXPORT_SYMBOL_GPL(kvm_handle_invpcid); @@ -12367,44 +12662,81 @@ int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes, } EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_read); -static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu) +static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size, + unsigned int port); + +static int complete_sev_es_emulated_outs(struct kvm_vcpu *vcpu) { - memcpy(vcpu->arch.guest_ins_data, vcpu->arch.pio_data, - vcpu->arch.pio.count * vcpu->arch.pio.size); - vcpu->arch.pio.count = 0; + int size = vcpu->arch.pio.size; + int port = vcpu->arch.pio.port; + vcpu->arch.pio.count = 0; + if (vcpu->arch.sev_pio_count) + return kvm_sev_es_outs(vcpu, size, port); return 1; } static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size, - unsigned int port, void *data, unsigned int count) + unsigned int port) { - int ret; - - ret = emulator_pio_out_emulated(vcpu->arch.emulate_ctxt, size, port, - data, count); - if (ret) - return ret; + for (;;) { + unsigned int count = + min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count); + int ret = emulator_pio_out(vcpu, size, port, vcpu->arch.sev_pio_data, count); + + /* memcpy done already by emulator_pio_out. */ + vcpu->arch.sev_pio_count -= count; + vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size; + if (!ret) + break; - vcpu->arch.pio.count = 0; + /* Emulation done by the kernel. */ + if (!vcpu->arch.sev_pio_count) + return 1; + } + vcpu->arch.complete_userspace_io = complete_sev_es_emulated_outs; return 0; } static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size, - unsigned int port, void *data, unsigned int count) + unsigned int port); + +static void advance_sev_es_emulated_ins(struct kvm_vcpu *vcpu) { - int ret; + unsigned count = vcpu->arch.pio.count; + complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data); + vcpu->arch.sev_pio_count -= count; + vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size; +} - ret = emulator_pio_in_emulated(vcpu->arch.emulate_ctxt, size, port, - data, count); - if (ret) { - vcpu->arch.pio.count = 0; - } else { - vcpu->arch.guest_ins_data = data; - vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins; +static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu) +{ + int size = vcpu->arch.pio.size; + int port = vcpu->arch.pio.port; + + advance_sev_es_emulated_ins(vcpu); + if (vcpu->arch.sev_pio_count) + return kvm_sev_es_ins(vcpu, size, port); + return 1; +} + +static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size, + unsigned int port) +{ + for (;;) { + unsigned int count = + min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count); + if (!__emulator_pio_in(vcpu, size, port, count)) + break; + + /* Emulation done by the kernel. */ + advance_sev_es_emulated_ins(vcpu); + if (!vcpu->arch.sev_pio_count) + return 1; } + vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins; return 0; } @@ -12412,8 +12744,10 @@ int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size, unsigned int port, void *data, unsigned int count, int in) { - return in ? kvm_sev_es_ins(vcpu, size, port, data, count) - : kvm_sev_es_outs(vcpu, size, port, data, count); + vcpu->arch.sev_pio_data = data; + vcpu->arch.sev_pio_count = count; + return in ? kvm_sev_es_ins(vcpu, size, port) + : kvm_sev_es_outs(vcpu, size, port); } EXPORT_SYMBOL_GPL(kvm_sev_es_string_io); @@ -12440,6 +12774,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_accept_irq); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 7d66d63dc55a..bec8ed090abc 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -103,6 +103,7 @@ static inline unsigned int __shrink_ple_window(unsigned int val, #define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL +void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu); int kvm_check_nested_events(struct kvm_vcpu *vcpu); static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) @@ -153,12 +154,24 @@ static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu) { int cs_db, cs_l; + WARN_ON_ONCE(vcpu->arch.guest_state_protected); + if (!is_long_mode(vcpu)) return false; static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); return cs_l; } +static inline bool is_64_bit_hypercall(struct kvm_vcpu *vcpu) +{ + /* + * If running with protected guest state, the CS register is not + * accessible. The hypercall register values will have had to been + * provided in 64-bit mode, so assume the guest is in 64-bit. + */ + return vcpu->arch.guest_state_protected || is_64_bit_mode(vcpu); +} + static inline bool x86_exception_has_error_code(unsigned int vector) { static u32 exception_has_error_code = BIT(DF_VECTOR) | BIT(TS_VECTOR) | @@ -173,12 +186,6 @@ static inline bool mmu_is_nested(struct kvm_vcpu *vcpu) return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu; } -static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu) -{ - ++vcpu->stat.tlb_flush; - static_call(kvm_x86_tlb_flush_current)(vcpu); -} - static inline int is_pae(struct kvm_vcpu *vcpu) { return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); @@ -294,7 +301,6 @@ static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu) return is_smm(vcpu) || static_call(kvm_x86_apic_init_signal_blocked)(vcpu); } -void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs); void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); u64 get_kvmclock_ns(struct kvm *kvm); @@ -343,8 +349,6 @@ extern bool enable_vmware_backdoor; extern int pi_inject_timer; -extern struct static_key kvm_no_apic_vcpu; - extern bool report_ignored_msrs; static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) @@ -387,18 +391,27 @@ static inline bool kvm_cstate_in_guest(struct kvm *kvm) return kvm->arch.cstate_in_guest; } -DECLARE_PER_CPU(struct kvm_vcpu *, current_vcpu); +enum kvm_intr_type { + /* Values are arbitrary, but must be non-zero. */ + KVM_HANDLING_IRQ = 1, + KVM_HANDLING_NMI, +}; -static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu) +static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu, + enum kvm_intr_type intr) { - __this_cpu_write(current_vcpu, vcpu); + WRITE_ONCE(vcpu->arch.handling_intr_from_guest, (u8)intr); } static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu) { - __this_cpu_write(current_vcpu, NULL); + WRITE_ONCE(vcpu->arch.handling_intr_from_guest, 0); } +static inline bool kvm_handling_nmi_from_guest(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.handling_intr_from_guest == KVM_HANDLING_NMI; +} static inline bool kvm_pat_valid(u64 data) { diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 9ea9c3dabe37..0e3f7d6e9fd7 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -16,6 +16,7 @@ #include <trace/events/kvm.h> #include <xen/interface/xen.h> #include <xen/interface/vcpu.h> +#include <xen/interface/event_channel.h> #include "trace.h" @@ -23,38 +24,77 @@ DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ); static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) { + struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache; + struct pvclock_wall_clock *wc; gpa_t gpa = gfn_to_gpa(gfn); - int wc_ofs, sec_hi_ofs; + u32 *wc_sec_hi; + u32 wc_version; + u64 wall_nsec; int ret = 0; int idx = srcu_read_lock(&kvm->srcu); - if (kvm_is_error_hva(gfn_to_hva(kvm, gfn))) { - ret = -EFAULT; + if (gfn == GPA_INVALID) { + kvm_gfn_to_pfn_cache_destroy(kvm, gpc); goto out; } - kvm->arch.xen.shinfo_gfn = gfn; + + do { + ret = kvm_gfn_to_pfn_cache_init(kvm, gpc, NULL, false, true, + gpa, PAGE_SIZE, false); + if (ret) + goto out; + + /* + * This code mirrors kvm_write_wall_clock() except that it writes + * directly through the pfn cache and doesn't mark the page dirty. + */ + wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm); + + /* It could be invalid again already, so we need to check */ + read_lock_irq(&gpc->lock); + + if (gpc->valid) + break; + + read_unlock_irq(&gpc->lock); + } while (1); /* Paranoia checks on the 32-bit struct layout */ BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900); BUILD_BUG_ON(offsetof(struct compat_shared_info, arch.wc_sec_hi) != 0x924); BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); - /* 32-bit location by default */ - wc_ofs = offsetof(struct compat_shared_info, wc); - sec_hi_ofs = offsetof(struct compat_shared_info, arch.wc_sec_hi); - #ifdef CONFIG_X86_64 /* Paranoia checks on the 64-bit struct layout */ BUILD_BUG_ON(offsetof(struct shared_info, wc) != 0xc00); BUILD_BUG_ON(offsetof(struct shared_info, wc_sec_hi) != 0xc0c); - if (kvm->arch.xen.long_mode) { - wc_ofs = offsetof(struct shared_info, wc); - sec_hi_ofs = offsetof(struct shared_info, wc_sec_hi); - } + if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) { + struct shared_info *shinfo = gpc->khva; + + wc_sec_hi = &shinfo->wc_sec_hi; + wc = &shinfo->wc; + } else #endif + { + struct compat_shared_info *shinfo = gpc->khva; + + wc_sec_hi = &shinfo->arch.wc_sec_hi; + wc = &shinfo->wc; + } + + /* Increment and ensure an odd value */ + wc_version = wc->version = (wc->version + 1) | 1; + smp_wmb(); + + wc->nsec = do_div(wall_nsec, 1000000000); + wc->sec = (u32)wall_nsec; + *wc_sec_hi = wall_nsec >> 32; + smp_wmb(); + + wc->version = wc_version + 1; + read_unlock_irq(&gpc->lock); - kvm_write_wall_clock(kvm, gpa + wc_ofs, sec_hi_ofs - wc_ofs); kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE); out: @@ -127,9 +167,9 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) state_entry_time = vx->runstate_entry_time; state_entry_time |= XEN_RUNSTATE_UPDATE; - BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->state_entry_time) != + BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) != sizeof(state_entry_time)); - BUILD_BUG_ON(sizeof(((struct compat_vcpu_runstate_info *)0)->state_entry_time) != + BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) != sizeof(state_entry_time)); if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache, @@ -144,9 +184,9 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) */ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != offsetof(struct compat_vcpu_runstate_info, state)); - BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->state) != + BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state) != sizeof(vx->current_runstate)); - BUILD_BUG_ON(sizeof(((struct compat_vcpu_runstate_info *)0)->state) != + BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) != sizeof(vx->current_runstate)); if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache, @@ -163,9 +203,9 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) offsetof(struct vcpu_runstate_info, time) - sizeof(u64)); BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) != offsetof(struct compat_vcpu_runstate_info, time) - sizeof(u64)); - BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->time) != - sizeof(((struct compat_vcpu_runstate_info *)0)->time)); - BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->time) != + BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) != + sizeof_field(struct compat_vcpu_runstate_info, time)); + BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) != sizeof(vx->runstate_times)); if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache, @@ -190,6 +230,9 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) int __kvm_xen_has_interrupt(struct kvm_vcpu *v) { + unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel); + bool atomic = in_atomic() || !task_is_running(current); + int err; u8 rc = 0; /* @@ -198,29 +241,118 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) */ struct gfn_to_hva_cache *ghc = &v->arch.xen.vcpu_info_cache; struct kvm_memslots *slots = kvm_memslots(v->kvm); + bool ghc_valid = slots->generation == ghc->generation && + !kvm_is_error_hva(ghc->hva) && ghc->memslot; + unsigned int offset = offsetof(struct vcpu_info, evtchn_upcall_pending); /* No need for compat handling here */ BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) != offsetof(struct compat_vcpu_info, evtchn_upcall_pending)); BUILD_BUG_ON(sizeof(rc) != - sizeof(((struct vcpu_info *)0)->evtchn_upcall_pending)); + sizeof_field(struct vcpu_info, evtchn_upcall_pending)); BUILD_BUG_ON(sizeof(rc) != - sizeof(((struct compat_vcpu_info *)0)->evtchn_upcall_pending)); + sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending)); /* * For efficiency, this mirrors the checks for using the valid * cache in kvm_read_guest_offset_cached(), but just uses * __get_user() instead. And falls back to the slow path. */ - if (likely(slots->generation == ghc->generation && - !kvm_is_error_hva(ghc->hva) && ghc->memslot)) { + if (!evtchn_pending_sel && ghc_valid) { /* Fast path */ - __get_user(rc, (u8 __user *)ghc->hva + offset); + pagefault_disable(); + err = __get_user(rc, (u8 __user *)ghc->hva + offset); + pagefault_enable(); + if (!err) + return rc; + } + + /* Slow path */ + + /* + * This function gets called from kvm_vcpu_block() after setting the + * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately + * from a HLT. So we really mustn't sleep. If the page ended up absent + * at that point, just return 1 in order to trigger an immediate wake, + * and we'll end up getting called again from a context where we *can* + * fault in the page and wait for it. + */ + if (atomic) + return 1; + + if (!ghc_valid) { + err = kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len); + if (err || !ghc->memslot) { + /* + * If this failed, userspace has screwed up the + * vcpu_info mapping. No interrupts for you. + */ + return 0; + } + } + + /* + * Now we have a valid (protected by srcu) userspace HVA in + * ghc->hva which points to the struct vcpu_info. If there + * are any bits in the in-kernel evtchn_pending_sel then + * we need to write those to the guest vcpu_info and set + * its evtchn_upcall_pending flag. If there aren't any bits + * to add, we only want to *check* evtchn_upcall_pending. + */ + if (evtchn_pending_sel) { + bool long_mode = v->kvm->arch.xen.long_mode; + + if (!user_access_begin((void __user *)ghc->hva, sizeof(struct vcpu_info))) + return 0; + + if (IS_ENABLED(CONFIG_64BIT) && long_mode) { + struct vcpu_info __user *vi = (void __user *)ghc->hva; + + /* Attempt to set the evtchn_pending_sel bits in the + * guest, and if that succeeds then clear the same + * bits in the in-kernel version. */ + asm volatile("1:\t" LOCK_PREFIX "orq %0, %1\n" + "\tnotq %0\n" + "\t" LOCK_PREFIX "andq %0, %2\n" + "2:\n" + "\t.section .fixup,\"ax\"\n" + "3:\tjmp\t2b\n" + "\t.previous\n" + _ASM_EXTABLE_UA(1b, 3b) + : "=r" (evtchn_pending_sel), + "+m" (vi->evtchn_pending_sel), + "+m" (v->arch.xen.evtchn_pending_sel) + : "0" (evtchn_pending_sel)); + } else { + struct compat_vcpu_info __user *vi = (void __user *)ghc->hva; + u32 evtchn_pending_sel32 = evtchn_pending_sel; + + /* Attempt to set the evtchn_pending_sel bits in the + * guest, and if that succeeds then clear the same + * bits in the in-kernel version. */ + asm volatile("1:\t" LOCK_PREFIX "orl %0, %1\n" + "\tnotl %0\n" + "\t" LOCK_PREFIX "andl %0, %2\n" + "2:\n" + "\t.section .fixup,\"ax\"\n" + "3:\tjmp\t2b\n" + "\t.previous\n" + _ASM_EXTABLE_UA(1b, 3b) + : "=r" (evtchn_pending_sel32), + "+m" (vi->evtchn_pending_sel), + "+m" (v->arch.xen.evtchn_pending_sel) + : "0" (evtchn_pending_sel32)); + } + rc = 1; + unsafe_put_user(rc, (u8 __user *)ghc->hva + offset, err); + + err: + user_access_end(); + + mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT); } else { - /* Slow path */ - kvm_read_guest_offset_cached(v->kvm, ghc, &rc, offset, - sizeof(rc)); + __get_user(rc, (u8 __user *)ghc->hva + offset); } return rc; @@ -243,15 +375,9 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) break; case KVM_XEN_ATTR_TYPE_SHARED_INFO: - if (data->u.shared_info.gfn == GPA_INVALID) { - kvm->arch.xen.shinfo_gfn = GPA_INVALID; - r = 0; - break; - } r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn); break; - case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR: if (data->u.vector && data->u.vector < 0x10) r = -EINVAL; @@ -282,7 +408,10 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) break; case KVM_XEN_ATTR_TYPE_SHARED_INFO: - data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_gfn); + if (kvm->arch.xen.shinfo_cache.active) + data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa); + else + data->u.shared_info.gfn = GPA_INVALID; r = 0; break; @@ -644,11 +773,12 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc) void kvm_xen_init_vm(struct kvm *kvm) { - kvm->arch.xen.shinfo_gfn = GPA_INVALID; } void kvm_xen_destroy_vm(struct kvm *kvm) { + kvm_gfn_to_pfn_cache_destroy(kvm, &kvm->arch.xen.shinfo_cache); + if (kvm->arch.xen_hvm_config.msr) static_branch_slow_dec_deferred(&kvm_xen_enabled); } @@ -681,7 +811,7 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu) kvm_hv_hypercall_enabled(vcpu)) return kvm_hv_hypercall(vcpu); - longmode = is_64_bit_mode(vcpu); + longmode = is_64_bit_hypercall(vcpu); if (!longmode) { params[0] = (u32)kvm_rbx_read(vcpu); params[1] = (u32)kvm_rcx_read(vcpu); @@ -720,3 +850,179 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu) return 0; } + +static inline int max_evtchn_port(struct kvm *kvm) +{ + if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) + return EVTCHN_2L_NR_CHANNELS; + else + return COMPAT_EVTCHN_2L_NR_CHANNELS; +} + +/* + * This follows the kvm_set_irq() API, so it returns: + * < 0 Interrupt was ignored (masked or not delivered for other reasons) + * = 0 Interrupt was coalesced (previous irq is still pending) + * > 0 Number of CPUs interrupt was delivered to + */ +int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm) +{ + struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache; + struct kvm_vcpu *vcpu; + unsigned long *pending_bits, *mask_bits; + unsigned long flags; + int port_word_bit; + bool kick_vcpu = false; + int idx; + int rc; + + vcpu = kvm_get_vcpu_by_id(kvm, e->xen_evtchn.vcpu); + if (!vcpu) + return -1; + + if (!vcpu->arch.xen.vcpu_info_set) + return -1; + + if (e->xen_evtchn.port >= max_evtchn_port(kvm)) + return -1; + + rc = -EWOULDBLOCK; + read_lock_irqsave(&gpc->lock, flags); + + idx = srcu_read_lock(&kvm->srcu); + if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE)) + goto out_rcu; + + if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) { + struct shared_info *shinfo = gpc->khva; + pending_bits = (unsigned long *)&shinfo->evtchn_pending; + mask_bits = (unsigned long *)&shinfo->evtchn_mask; + port_word_bit = e->xen_evtchn.port / 64; + } else { + struct compat_shared_info *shinfo = gpc->khva; + pending_bits = (unsigned long *)&shinfo->evtchn_pending; + mask_bits = (unsigned long *)&shinfo->evtchn_mask; + port_word_bit = e->xen_evtchn.port / 32; + } + + /* + * If this port wasn't already set, and if it isn't masked, then + * we try to set the corresponding bit in the in-kernel shadow of + * evtchn_pending_sel for the target vCPU. And if *that* wasn't + * already set, then we kick the vCPU in question to write to the + * *real* evtchn_pending_sel in its own guest vcpu_info struct. + */ + if (test_and_set_bit(e->xen_evtchn.port, pending_bits)) { + rc = 0; /* It was already raised */ + } else if (test_bit(e->xen_evtchn.port, mask_bits)) { + rc = -1; /* Masked */ + } else { + rc = 1; /* Delivered. But was the vCPU waking already? */ + if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel)) + kick_vcpu = true; + } + + out_rcu: + srcu_read_unlock(&kvm->srcu, idx); + read_unlock_irqrestore(&gpc->lock, flags); + + if (kick_vcpu) { + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_vcpu_kick(vcpu); + } + + return rc; +} + +/* This is the version called from kvm_set_irq() as the .set function */ +static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, + int irq_source_id, int level, bool line_status) +{ + bool mm_borrowed = false; + int rc; + + if (!level) + return -1; + + rc = kvm_xen_set_evtchn_fast(e, kvm); + if (rc != -EWOULDBLOCK) + return rc; + + if (current->mm != kvm->mm) { + /* + * If not on a thread which already belongs to this KVM, + * we'd better be in the irqfd workqueue. + */ + if (WARN_ON_ONCE(current->mm)) + return -EINVAL; + + kthread_use_mm(kvm->mm); + mm_borrowed = true; + } + + /* + * For the irqfd workqueue, using the main kvm->lock mutex is + * fine since this function is invoked from kvm_set_irq() with + * no other lock held, no srcu. In future if it will be called + * directly from a vCPU thread (e.g. on hypercall for an IPI) + * then it may need to switch to using a leaf-node mutex for + * serializing the shared_info mapping. + */ + mutex_lock(&kvm->lock); + + /* + * It is theoretically possible for the page to be unmapped + * and the MMU notifier to invalidate the shared_info before + * we even get to use it. In that case, this looks like an + * infinite loop. It was tempting to do it via the userspace + * HVA instead... but that just *hides* the fact that it's + * an infinite loop, because if a fault occurs and it waits + * for the page to come back, it can *still* immediately + * fault and have to wait again, repeatedly. + * + * Conversely, the page could also have been reinstated by + * another thread before we even obtain the mutex above, so + * check again *first* before remapping it. + */ + do { + struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache; + int idx; + + rc = kvm_xen_set_evtchn_fast(e, kvm); + if (rc != -EWOULDBLOCK) + break; + + idx = srcu_read_lock(&kvm->srcu); + rc = kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpc->gpa, + PAGE_SIZE, false); + srcu_read_unlock(&kvm->srcu, idx); + } while(!rc); + + mutex_unlock(&kvm->lock); + + if (mm_borrowed) + kthread_unuse_mm(kvm->mm); + + return rc; +} + +int kvm_xen_setup_evtchn(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, + const struct kvm_irq_routing_entry *ue) + +{ + if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm)) + return -EINVAL; + + /* We only support 2 level event channels for now */ + if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) + return -EINVAL; + + e->xen_evtchn.port = ue->u.xen_evtchn.port; + e->xen_evtchn.vcpu = ue->u.xen_evtchn.vcpu; + e->xen_evtchn.priority = ue->u.xen_evtchn.priority; + e->set = evtchn_set_fn; + + return 0; +} diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h index cc0cf5f37450..adbcc9ed59db 100644 --- a/arch/x86/kvm/xen.h +++ b/arch/x86/kvm/xen.h @@ -24,6 +24,12 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc); void kvm_xen_init_vm(struct kvm *kvm); void kvm_xen_destroy_vm(struct kvm *kvm); +int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm); +int kvm_xen_setup_evtchn(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, + const struct kvm_irq_routing_entry *ue); + static inline bool kvm_xen_msr_enabled(struct kvm *kvm) { return static_branch_unlikely(&kvm_xen_enabled.key) && @@ -134,6 +140,9 @@ struct compat_shared_info { struct compat_arch_shared_info arch; }; +#define COMPAT_EVTCHN_2L_NR_CHANNELS (8 * \ + sizeof_field(struct compat_shared_info, \ + evtchn_pending)) struct compat_vcpu_runstate_info { int state; uint64_t state_entry_time; diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index c6506c6a7092..f76747862bd2 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -63,7 +63,6 @@ ifeq ($(CONFIG_X86_32),y) ifneq ($(CONFIG_X86_CMPXCHG64),y) lib-y += cmpxchg8b_emu.o atomic64_386_32.o endif - lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o else obj-y += iomap_copy_64.o lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S index 16bc9130e7a5..e768815e58ae 100644 --- a/arch/x86/lib/atomic64_386_32.S +++ b/arch/x86/lib/atomic64_386_32.S @@ -9,81 +9,83 @@ #include <asm/alternative.h> /* if you want SMP support, implement these with real spinlocks */ -.macro LOCK reg +.macro IRQ_SAVE reg pushfl cli .endm -.macro UNLOCK reg +.macro IRQ_RESTORE reg popfl .endm -#define BEGIN(op) \ +#define BEGIN_IRQ_SAVE(op) \ .macro endp; \ SYM_FUNC_END(atomic64_##op##_386); \ .purgem endp; \ .endm; \ SYM_FUNC_START(atomic64_##op##_386); \ - LOCK v; + IRQ_SAVE v; #define ENDP endp -#define RET \ - UNLOCK v; \ - ret - -#define RET_ENDP \ - RET; \ - ENDP +#define RET_IRQ_RESTORE \ + IRQ_RESTORE v; \ + RET #define v %ecx -BEGIN(read) +BEGIN_IRQ_SAVE(read) movl (v), %eax movl 4(v), %edx -RET_ENDP + RET_IRQ_RESTORE +ENDP #undef v #define v %esi -BEGIN(set) +BEGIN_IRQ_SAVE(set) movl %ebx, (v) movl %ecx, 4(v) -RET_ENDP + RET_IRQ_RESTORE +ENDP #undef v #define v %esi -BEGIN(xchg) +BEGIN_IRQ_SAVE(xchg) movl (v), %eax movl 4(v), %edx movl %ebx, (v) movl %ecx, 4(v) -RET_ENDP + RET_IRQ_RESTORE +ENDP #undef v #define v %ecx -BEGIN(add) +BEGIN_IRQ_SAVE(add) addl %eax, (v) adcl %edx, 4(v) -RET_ENDP + RET_IRQ_RESTORE +ENDP #undef v #define v %ecx -BEGIN(add_return) +BEGIN_IRQ_SAVE(add_return) addl (v), %eax adcl 4(v), %edx movl %eax, (v) movl %edx, 4(v) -RET_ENDP + RET_IRQ_RESTORE +ENDP #undef v #define v %ecx -BEGIN(sub) +BEGIN_IRQ_SAVE(sub) subl %eax, (v) sbbl %edx, 4(v) -RET_ENDP + RET_IRQ_RESTORE +ENDP #undef v #define v %ecx -BEGIN(sub_return) +BEGIN_IRQ_SAVE(sub_return) negl %edx negl %eax sbbl $0, %edx @@ -91,47 +93,52 @@ BEGIN(sub_return) adcl 4(v), %edx movl %eax, (v) movl %edx, 4(v) -RET_ENDP + RET_IRQ_RESTORE +ENDP #undef v #define v %esi -BEGIN(inc) +BEGIN_IRQ_SAVE(inc) addl $1, (v) adcl $0, 4(v) -RET_ENDP + RET_IRQ_RESTORE +ENDP #undef v #define v %esi -BEGIN(inc_return) +BEGIN_IRQ_SAVE(inc_return) movl (v), %eax movl 4(v), %edx addl $1, %eax adcl $0, %edx movl %eax, (v) movl %edx, 4(v) -RET_ENDP + RET_IRQ_RESTORE +ENDP #undef v #define v %esi -BEGIN(dec) +BEGIN_IRQ_SAVE(dec) subl $1, (v) sbbl $0, 4(v) -RET_ENDP + RET_IRQ_RESTORE +ENDP #undef v #define v %esi -BEGIN(dec_return) +BEGIN_IRQ_SAVE(dec_return) movl (v), %eax movl 4(v), %edx subl $1, %eax sbbl $0, %edx movl %eax, (v) movl %edx, 4(v) -RET_ENDP + RET_IRQ_RESTORE +ENDP #undef v #define v %esi -BEGIN(add_unless) +BEGIN_IRQ_SAVE(add_unless) addl %eax, %ecx adcl %edx, %edi addl (v), %eax @@ -143,7 +150,7 @@ BEGIN(add_unless) movl %edx, 4(v) movl $1, %eax 2: - RET + RET_IRQ_RESTORE 3: cmpl %edx, %edi jne 1b @@ -153,7 +160,7 @@ ENDP #undef v #define v %esi -BEGIN(inc_not_zero) +BEGIN_IRQ_SAVE(inc_not_zero) movl (v), %eax movl 4(v), %edx testl %eax, %eax @@ -165,7 +172,7 @@ BEGIN(inc_not_zero) movl %edx, 4(v) movl $1, %eax 2: - RET + RET_IRQ_RESTORE 3: testl %edx, %edx jne 1b @@ -174,7 +181,7 @@ ENDP #undef v #define v %esi -BEGIN(dec_if_positive) +BEGIN_IRQ_SAVE(dec_if_positive) movl (v), %eax movl 4(v), %edx subl $1, %eax @@ -183,5 +190,6 @@ BEGIN(dec_if_positive) movl %eax, (v) movl %edx, 4(v) 1: -RET_ENDP + RET_IRQ_RESTORE +ENDP #undef v diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S index ce6935690766..90afb488b396 100644 --- a/arch/x86/lib/atomic64_cx8_32.S +++ b/arch/x86/lib/atomic64_cx8_32.S @@ -18,7 +18,7 @@ SYM_FUNC_START(atomic64_read_cx8) read64 %ecx - ret + RET SYM_FUNC_END(atomic64_read_cx8) SYM_FUNC_START(atomic64_set_cx8) @@ -28,7 +28,7 @@ SYM_FUNC_START(atomic64_set_cx8) cmpxchg8b (%esi) jne 1b - ret + RET SYM_FUNC_END(atomic64_set_cx8) SYM_FUNC_START(atomic64_xchg_cx8) @@ -37,7 +37,7 @@ SYM_FUNC_START(atomic64_xchg_cx8) cmpxchg8b (%esi) jne 1b - ret + RET SYM_FUNC_END(atomic64_xchg_cx8) .macro addsub_return func ins insc @@ -68,7 +68,7 @@ SYM_FUNC_START(atomic64_\func\()_return_cx8) popl %esi popl %ebx popl %ebp - ret + RET SYM_FUNC_END(atomic64_\func\()_return_cx8) .endm @@ -93,7 +93,7 @@ SYM_FUNC_START(atomic64_\func\()_return_cx8) movl %ebx, %eax movl %ecx, %edx popl %ebx - ret + RET SYM_FUNC_END(atomic64_\func\()_return_cx8) .endm @@ -118,7 +118,7 @@ SYM_FUNC_START(atomic64_dec_if_positive_cx8) movl %ebx, %eax movl %ecx, %edx popl %ebx - ret + RET SYM_FUNC_END(atomic64_dec_if_positive_cx8) SYM_FUNC_START(atomic64_add_unless_cx8) @@ -149,7 +149,7 @@ SYM_FUNC_START(atomic64_add_unless_cx8) addl $8, %esp popl %ebx popl %ebp - ret + RET 4: cmpl %edx, 4(%esp) jne 2b @@ -176,5 +176,5 @@ SYM_FUNC_START(atomic64_inc_not_zero_cx8) movl $1, %eax 3: popl %ebx - ret + RET SYM_FUNC_END(atomic64_inc_not_zero_cx8) diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S index 4304320e51f4..23318c338db0 100644 --- a/arch/x86/lib/checksum_32.S +++ b/arch/x86/lib/checksum_32.S @@ -127,7 +127,7 @@ SYM_FUNC_START(csum_partial) 8: popl %ebx popl %esi - ret + RET SYM_FUNC_END(csum_partial) #else @@ -245,7 +245,7 @@ SYM_FUNC_START(csum_partial) 90: popl %ebx popl %esi - ret + RET SYM_FUNC_END(csum_partial) #endif @@ -260,9 +260,9 @@ unsigned int csum_partial_copy_generic (const char *src, char *dst, * Copy from ds while checksumming, otherwise like csum_partial */ -#define EXC(y...) \ - 9999: y; \ - _ASM_EXTABLE_UA(9999b, 6001f) +#define EXC(y...) \ + 9999: y; \ + _ASM_EXTABLE_TYPE(9999b, 7f, EX_TYPE_UACCESS | EX_FLAG_CLEAR_AX) #ifndef CONFIG_X86_USE_PPRO_CHECKSUM @@ -358,20 +358,11 @@ EXC( movb %cl, (%edi) ) adcl $0, %eax 7: -# Exception handler: -.section .fixup, "ax" - -6001: - xorl %eax, %eax - jmp 7b - -.previous - popl %ebx popl %esi popl %edi popl %ecx # equivalent to addl $4,%esp - ret + RET SYM_FUNC_END(csum_partial_copy_generic) #else @@ -439,15 +430,11 @@ EXC( movb %dl, (%edi) ) 6: addl %edx, %eax adcl $0, %eax 7: -.section .fixup, "ax" -6001: xorl %eax, %eax - jmp 7b -.previous popl %esi popl %edi popl %ebx - ret + RET SYM_FUNC_END(csum_partial_copy_generic) #undef ROUND diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index c4c7dd115953..fe59b8ac4fcc 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -17,7 +17,7 @@ SYM_FUNC_START(clear_page_rep) movl $4096/8,%ecx xorl %eax,%eax rep stosq - ret + RET SYM_FUNC_END(clear_page_rep) EXPORT_SYMBOL_GPL(clear_page_rep) @@ -39,7 +39,7 @@ SYM_FUNC_START(clear_page_orig) leaq 64(%rdi),%rdi jnz .Lloop nop - ret + RET SYM_FUNC_END(clear_page_orig) EXPORT_SYMBOL_GPL(clear_page_orig) @@ -47,6 +47,6 @@ SYM_FUNC_START(clear_page_erms) movl $4096,%ecx xorl %eax,%eax rep stosb - ret + RET SYM_FUNC_END(clear_page_erms) EXPORT_SYMBOL_GPL(clear_page_erms) diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S index 3542502faa3b..33c70c0160ea 100644 --- a/arch/x86/lib/cmpxchg16b_emu.S +++ b/arch/x86/lib/cmpxchg16b_emu.S @@ -37,11 +37,11 @@ SYM_FUNC_START(this_cpu_cmpxchg16b_emu) popfq mov $1, %al - ret + RET .Lnot_same: popfq xor %al,%al - ret + RET SYM_FUNC_END(this_cpu_cmpxchg16b_emu) diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S index ca01ed6029f4..6a912d58fecc 100644 --- a/arch/x86/lib/cmpxchg8b_emu.S +++ b/arch/x86/lib/cmpxchg8b_emu.S @@ -32,7 +32,7 @@ SYM_FUNC_START(cmpxchg8b_emu) movl %ecx, 4(%esi) popfl - ret + RET .Lnot_same: movl (%esi), %eax @@ -40,7 +40,7 @@ SYM_FUNC_START(cmpxchg8b_emu) movl 4(%esi), %edx popfl - ret + RET SYM_FUNC_END(cmpxchg8b_emu) EXPORT_SYMBOL(cmpxchg8b_emu) diff --git a/arch/x86/lib/copy_mc_64.S b/arch/x86/lib/copy_mc_64.S index e5f77e293034..c859a8a09860 100644 --- a/arch/x86/lib/copy_mc_64.S +++ b/arch/x86/lib/copy_mc_64.S @@ -77,10 +77,8 @@ SYM_FUNC_START(copy_mc_fragile) .L_done_memcpy_trap: xorl %eax, %eax .L_done: - ret -SYM_FUNC_END(copy_mc_fragile) + RET - .section .fixup, "ax" /* * Return number of bytes not copied for any failure. Note that * there is no "tail" handling since the source buffer is 8-byte @@ -105,14 +103,14 @@ SYM_FUNC_END(copy_mc_fragile) movl %ecx, %edx jmp copy_mc_fragile_handle_tail - .previous - - _ASM_EXTABLE_FAULT(.L_read_leading_bytes, .E_leading_bytes) - _ASM_EXTABLE_FAULT(.L_read_words, .E_read_words) - _ASM_EXTABLE_FAULT(.L_read_trailing_bytes, .E_trailing_bytes) + _ASM_EXTABLE_TYPE(.L_read_leading_bytes, .E_leading_bytes, EX_TYPE_DEFAULT_MCE_SAFE) + _ASM_EXTABLE_TYPE(.L_read_words, .E_read_words, EX_TYPE_DEFAULT_MCE_SAFE) + _ASM_EXTABLE_TYPE(.L_read_trailing_bytes, .E_trailing_bytes, EX_TYPE_DEFAULT_MCE_SAFE) _ASM_EXTABLE(.L_write_leading_bytes, .E_leading_bytes) _ASM_EXTABLE(.L_write_words, .E_write_words) _ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes) + +SYM_FUNC_END(copy_mc_fragile) #endif /* CONFIG_X86_MCE */ /* @@ -132,10 +130,8 @@ SYM_FUNC_START(copy_mc_enhanced_fast_string) rep movsb /* Copy successful. Return zero */ xorl %eax, %eax - ret -SYM_FUNC_END(copy_mc_enhanced_fast_string) + RET - .section .fixup, "ax" .E_copy: /* * On fault %rcx is updated such that the copy instruction could @@ -145,9 +141,9 @@ SYM_FUNC_END(copy_mc_enhanced_fast_string) * user-copy routines. */ movq %rcx, %rax - ret + RET - .previous + _ASM_EXTABLE_TYPE(.L_copy, .E_copy, EX_TYPE_DEFAULT_MCE_SAFE) - _ASM_EXTABLE_FAULT(.L_copy, .E_copy) +SYM_FUNC_END(copy_mc_enhanced_fast_string) #endif /* !CONFIG_UML */ diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index db4b4f9197c7..30ea644bf446 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -17,7 +17,7 @@ SYM_FUNC_START(copy_page) ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD movl $4096/8, %ecx rep movsq - ret + RET SYM_FUNC_END(copy_page) EXPORT_SYMBOL(copy_page) @@ -85,5 +85,5 @@ SYM_FUNC_START_LOCAL(copy_page_regs) movq (%rsp), %rbx movq 1*8(%rsp), %r12 addq $2*8, %rsp - ret + RET SYM_FUNC_END(copy_page_regs) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 57b79c577496..8ca5ecf16dc4 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -32,14 +32,10 @@ decl %ecx jnz 100b 102: - .section .fixup,"ax" -103: addl %ecx,%edx /* ecx is zerorest also */ - jmp .Lcopy_user_handle_tail - .previous - _ASM_EXTABLE_CPY(100b, 103b) - _ASM_EXTABLE_CPY(101b, 103b) - .endm + _ASM_EXTABLE_CPY(100b, .Lcopy_user_handle_align) + _ASM_EXTABLE_CPY(101b, .Lcopy_user_handle_align) +.endm /* * copy_user_generic_unrolled - memory copy with exception handling. @@ -105,9 +101,8 @@ SYM_FUNC_START(copy_user_generic_unrolled) jnz 21b 23: xor %eax,%eax ASM_CLAC - ret + RET - .section .fixup,"ax" 30: shll $6,%ecx addl %ecx,%edx jmp 60f @@ -115,7 +110,6 @@ SYM_FUNC_START(copy_user_generic_unrolled) jmp 60f 50: movl %ecx,%edx 60: jmp .Lcopy_user_handle_tail /* ecx is zerorest also */ - .previous _ASM_EXTABLE_CPY(1b, 30b) _ASM_EXTABLE_CPY(2b, 30b) @@ -166,20 +160,16 @@ SYM_FUNC_START(copy_user_generic_string) movl %edx,%ecx shrl $3,%ecx andl $7,%edx -1: rep - movsq +1: rep movsq 2: movl %edx,%ecx -3: rep - movsb +3: rep movsb xorl %eax,%eax ASM_CLAC - ret + RET - .section .fixup,"ax" 11: leal (%rdx,%rcx,8),%ecx 12: movl %ecx,%edx /* ecx is zerorest also */ jmp .Lcopy_user_handle_tail - .previous _ASM_EXTABLE_CPY(1b, 11b) _ASM_EXTABLE_CPY(3b, 12b) @@ -200,19 +190,16 @@ EXPORT_SYMBOL(copy_user_generic_string) */ SYM_FUNC_START(copy_user_enhanced_fast_string) ASM_STAC - cmpl $64,%edx - jb .L_copy_short_string /* less then 64 bytes, avoid the costly 'rep' */ + /* CPUs without FSRM should avoid rep movsb for short copies */ + ALTERNATIVE "cmpl $64, %edx; jb .L_copy_short_string", "", X86_FEATURE_FSRM movl %edx,%ecx -1: rep - movsb +1: rep movsb xorl %eax,%eax ASM_CLAC - ret + RET - .section .fixup,"ax" 12: movl %ecx,%edx /* ecx is zerorest also */ jmp .Lcopy_user_handle_tail - .previous _ASM_EXTABLE_CPY(1b, 12b) SYM_FUNC_END(copy_user_enhanced_fast_string) @@ -225,6 +212,7 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string) * Don't try to copy the tail if machine check happened * * Input: + * eax trap number written by ex_handler_copy() * rdi destination * rsi source * rdx count @@ -233,26 +221,26 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string) * eax uncopied bytes or 0 if successful. */ SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail) - movl %edx,%ecx - cmp $X86_TRAP_MC,%eax /* check if X86_TRAP_MC */ + cmp $X86_TRAP_MC,%eax je 3f + + movl %edx,%ecx 1: rep movsb 2: mov %ecx,%eax ASM_CLAC - ret - - /* - * Return zero to pretend that this copy succeeded. This - * is counter-intuitive, but needed to prevent the code - * in lib/iov_iter.c from retrying and running back into - * the poison cache line again. The machine check handler - * will ensure that a SIGBUS is sent to the task. - */ -3: xorl %eax,%eax + RET + +3: + movl %edx,%eax ASM_CLAC - ret + RET _ASM_EXTABLE_CPY(1b, 2b) + +.Lcopy_user_handle_align: + addl %ecx,%edx /* ecx is zerorest also */ + jmp .Lcopy_user_handle_tail + SYM_CODE_END(.Lcopy_user_handle_tail) /* @@ -361,9 +349,8 @@ SYM_FUNC_START(__copy_user_nocache) xorl %eax,%eax ASM_CLAC sfence - ret + RET - .section .fixup,"ax" .L_fixup_4x8b_copy: shll $6,%ecx addl %ecx,%edx @@ -379,7 +366,6 @@ SYM_FUNC_START(__copy_user_nocache) .L_fixup_handle_tail: sfence jmp .Lcopy_user_handle_tail - .previous _ASM_EXTABLE_CPY(1b, .L_fixup_4x8b_copy) _ASM_EXTABLE_CPY(2b, .L_fixup_4x8b_copy) diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S index 1fbd8ee9642d..d9e16a2cf285 100644 --- a/arch/x86/lib/csum-copy_64.S +++ b/arch/x86/lib/csum-copy_64.S @@ -201,7 +201,7 @@ SYM_FUNC_START(csum_partial_copy_generic) movq 3*8(%rsp), %r13 movq 4*8(%rsp), %r15 addq $5*8, %rsp - ret + RET .Lshort: movl %ecx, %r10d jmp .L1 diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c index e7925d668b68..1f8a8f895173 100644 --- a/arch/x86/lib/csum-partial_64.c +++ b/arch/x86/lib/csum-partial_64.c @@ -9,6 +9,7 @@ #include <linux/compiler.h> #include <linux/export.h> #include <asm/checksum.h> +#include <asm/word-at-a-time.h> static inline unsigned short from32to16(unsigned a) { @@ -21,120 +22,119 @@ static inline unsigned short from32to16(unsigned a) } /* - * Do a 64-bit checksum on an arbitrary memory area. + * Do a checksum on an arbitrary memory area. * Returns a 32bit checksum. * * This isn't as time critical as it used to be because many NICs * do hardware checksumming these days. - * - * Things tried and found to not make it faster: - * Manual Prefetching - * Unrolling to an 128 bytes inner loop. - * Using interleaving with more registers to break the carry chains. + * + * Still, with CHECKSUM_COMPLETE this is called to compute + * checksums on IPv6 headers (40 bytes) and other small parts. + * it's best to have buff aligned on a 64-bit boundary */ -static unsigned do_csum(const unsigned char *buff, unsigned len) +__wsum csum_partial(const void *buff, int len, __wsum sum) { - unsigned odd, count; - unsigned long result = 0; + u64 temp64 = (__force u64)sum; + unsigned odd, result; - if (unlikely(len == 0)) - return result; odd = 1 & (unsigned long) buff; if (unlikely(odd)) { - result = *buff << 8; + if (unlikely(len == 0)) + return sum; + temp64 = ror32((__force u32)sum, 8); + temp64 += (*(unsigned char *)buff << 8); len--; buff++; } - count = len >> 1; /* nr of 16-bit words.. */ - if (count) { - if (2 & (unsigned long) buff) { - result += *(unsigned short *)buff; - count--; - len -= 2; - buff += 2; - } - count >>= 1; /* nr of 32-bit words.. */ - if (count) { - unsigned long zero; - unsigned count64; - if (4 & (unsigned long) buff) { - result += *(unsigned int *) buff; - count--; - len -= 4; - buff += 4; - } - count >>= 1; /* nr of 64-bit words.. */ - /* main loop using 64byte blocks */ - zero = 0; - count64 = count >> 3; - while (count64) { - asm("addq 0*8(%[src]),%[res]\n\t" - "adcq 1*8(%[src]),%[res]\n\t" - "adcq 2*8(%[src]),%[res]\n\t" - "adcq 3*8(%[src]),%[res]\n\t" - "adcq 4*8(%[src]),%[res]\n\t" - "adcq 5*8(%[src]),%[res]\n\t" - "adcq 6*8(%[src]),%[res]\n\t" - "adcq 7*8(%[src]),%[res]\n\t" - "adcq %[zero],%[res]" - : [res] "=r" (result) - : [src] "r" (buff), [zero] "r" (zero), - "[res]" (result)); - buff += 64; - count64--; - } + while (unlikely(len >= 64)) { + asm("addq 0*8(%[src]),%[res]\n\t" + "adcq 1*8(%[src]),%[res]\n\t" + "adcq 2*8(%[src]),%[res]\n\t" + "adcq 3*8(%[src]),%[res]\n\t" + "adcq 4*8(%[src]),%[res]\n\t" + "adcq 5*8(%[src]),%[res]\n\t" + "adcq 6*8(%[src]),%[res]\n\t" + "adcq 7*8(%[src]),%[res]\n\t" + "adcq $0,%[res]" + : [res] "+r" (temp64) + : [src] "r" (buff) + : "memory"); + buff += 64; + len -= 64; + } + + if (len & 32) { + asm("addq 0*8(%[src]),%[res]\n\t" + "adcq 1*8(%[src]),%[res]\n\t" + "adcq 2*8(%[src]),%[res]\n\t" + "adcq 3*8(%[src]),%[res]\n\t" + "adcq $0,%[res]" + : [res] "+r" (temp64) + : [src] "r" (buff) + : "memory"); + buff += 32; + } + if (len & 16) { + asm("addq 0*8(%[src]),%[res]\n\t" + "adcq 1*8(%[src]),%[res]\n\t" + "adcq $0,%[res]" + : [res] "+r" (temp64) + : [src] "r" (buff) + : "memory"); + buff += 16; + } + if (len & 8) { + asm("addq 0*8(%[src]),%[res]\n\t" + "adcq $0,%[res]" + : [res] "+r" (temp64) + : [src] "r" (buff) + : "memory"); + buff += 8; + } + if (len & 7) { +#ifdef CONFIG_DCACHE_WORD_ACCESS + unsigned int shift = (8 - (len & 7)) * 8; + unsigned long trail; - /* last up to 7 8byte blocks */ - count %= 8; - while (count) { - asm("addq %1,%0\n\t" - "adcq %2,%0\n" - : "=r" (result) - : "m" (*(unsigned long *)buff), - "r" (zero), "0" (result)); - --count; - buff += 8; - } - result = add32_with_carry(result>>32, - result&0xffffffff); + trail = (load_unaligned_zeropad(buff) << shift) >> shift; - if (len & 4) { - result += *(unsigned int *) buff; - buff += 4; - } + asm("addq %[trail],%[res]\n\t" + "adcq $0,%[res]" + : [res] "+r" (temp64) + : [trail] "r" (trail)); +#else + if (len & 4) { + asm("addq %[val],%[res]\n\t" + "adcq $0,%[res]" + : [res] "+r" (temp64) + : [val] "r" ((u64)*(u32 *)buff) + : "memory"); + buff += 4; } if (len & 2) { - result += *(unsigned short *) buff; + asm("addq %[val],%[res]\n\t" + "adcq $0,%[res]" + : [res] "+r" (temp64) + : [val] "r" ((u64)*(u16 *)buff) + : "memory"); buff += 2; } + if (len & 1) { + asm("addq %[val],%[res]\n\t" + "adcq $0,%[res]" + : [res] "+r" (temp64) + : [val] "r" ((u64)*(u8 *)buff) + : "memory"); + } +#endif } - if (len & 1) - result += *buff; - result = add32_with_carry(result>>32, result & 0xffffffff); - if (unlikely(odd)) { + result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff); + if (unlikely(odd)) { result = from32to16(result); result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); } - return result; -} - -/* - * computes the checksum of a memory block at buff, length len, - * and adds in "sum" (32-bit) - * - * returns a 32-bit number suitable for feeding into itself - * or csum_tcpudp_magic - * - * this function must be called with even lengths, except - * for the last fragment, which may be odd - * - * it's best to have buff aligned on a 64-bit boundary - */ -__wsum csum_partial(const void *buff, int len, __wsum sum) -{ - return (__force __wsum)add32_with_carry(do_csum(buff, len), - (__force u32)sum); + return (__force __wsum)result; } EXPORT_SYMBOL(csum_partial); @@ -147,4 +147,3 @@ __sum16 ip_compute_csum(const void *buff, int len) return csum_fold(csum_partial(buff,len,0)); } EXPORT_SYMBOL(ip_compute_csum); - diff --git a/arch/x86/lib/error-inject.c b/arch/x86/lib/error-inject.c index be5b5fb1598b..520897061ee0 100644 --- a/arch/x86/lib/error-inject.c +++ b/arch/x86/lib/error-inject.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/linkage.h> #include <linux/error-injection.h> #include <linux/kprobes.h> @@ -10,7 +11,7 @@ asm( ".type just_return_func, @function\n" ".globl just_return_func\n" "just_return_func:\n" - " ret\n" + ASM_RET ".size just_return_func, .-just_return_func\n" ); diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index fa1bc2104b32..b70d98d79a9d 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -57,7 +57,7 @@ SYM_FUNC_START(__get_user_1) 1: movzbl (%_ASM_AX),%edx xor %eax,%eax ASM_CLAC - ret + RET SYM_FUNC_END(__get_user_1) EXPORT_SYMBOL(__get_user_1) @@ -71,7 +71,7 @@ SYM_FUNC_START(__get_user_2) 2: movzwl (%_ASM_AX),%edx xor %eax,%eax ASM_CLAC - ret + RET SYM_FUNC_END(__get_user_2) EXPORT_SYMBOL(__get_user_2) @@ -85,7 +85,7 @@ SYM_FUNC_START(__get_user_4) 3: movl (%_ASM_AX),%edx xor %eax,%eax ASM_CLAC - ret + RET SYM_FUNC_END(__get_user_4) EXPORT_SYMBOL(__get_user_4) @@ -100,7 +100,7 @@ SYM_FUNC_START(__get_user_8) 4: movq (%_ASM_AX),%rdx xor %eax,%eax ASM_CLAC - ret + RET #else LOAD_TASK_SIZE_MINUS_N(7) cmp %_ASM_DX,%_ASM_AX @@ -112,7 +112,7 @@ SYM_FUNC_START(__get_user_8) 5: movl 4(%_ASM_AX),%ecx xor %eax,%eax ASM_CLAC - ret + RET #endif SYM_FUNC_END(__get_user_8) EXPORT_SYMBOL(__get_user_8) @@ -124,7 +124,7 @@ SYM_FUNC_START(__get_user_nocheck_1) 6: movzbl (%_ASM_AX),%edx xor %eax,%eax ASM_CLAC - ret + RET SYM_FUNC_END(__get_user_nocheck_1) EXPORT_SYMBOL(__get_user_nocheck_1) @@ -134,7 +134,7 @@ SYM_FUNC_START(__get_user_nocheck_2) 7: movzwl (%_ASM_AX),%edx xor %eax,%eax ASM_CLAC - ret + RET SYM_FUNC_END(__get_user_nocheck_2) EXPORT_SYMBOL(__get_user_nocheck_2) @@ -144,7 +144,7 @@ SYM_FUNC_START(__get_user_nocheck_4) 8: movl (%_ASM_AX),%edx xor %eax,%eax ASM_CLAC - ret + RET SYM_FUNC_END(__get_user_nocheck_4) EXPORT_SYMBOL(__get_user_nocheck_4) @@ -159,7 +159,7 @@ SYM_FUNC_START(__get_user_nocheck_8) #endif xor %eax,%eax ASM_CLAC - ret + RET SYM_FUNC_END(__get_user_nocheck_8) EXPORT_SYMBOL(__get_user_nocheck_8) @@ -169,7 +169,7 @@ SYM_CODE_START_LOCAL(.Lbad_get_user_clac) bad_get_user: xor %edx,%edx mov $(-EFAULT),%_ASM_AX - ret + RET SYM_CODE_END(.Lbad_get_user_clac) #ifdef CONFIG_X86_32 @@ -179,7 +179,7 @@ bad_get_user_8: xor %edx,%edx xor %ecx,%ecx mov $(-EFAULT),%_ASM_AX - ret + RET SYM_CODE_END(.Lbad_get_user_8_clac) #endif diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S index dbf8cc97b7f5..12c16c6aa44a 100644 --- a/arch/x86/lib/hweight.S +++ b/arch/x86/lib/hweight.S @@ -32,7 +32,7 @@ SYM_FUNC_START(__sw_hweight32) imull $0x01010101, %eax, %eax # w_tmp *= 0x01010101 shrl $24, %eax # w = w_tmp >> 24 __ASM_SIZE(pop,) %__ASM_REG(dx) - ret + RET SYM_FUNC_END(__sw_hweight32) EXPORT_SYMBOL(__sw_hweight32) @@ -65,7 +65,7 @@ SYM_FUNC_START(__sw_hweight64) popq %rdx popq %rdi - ret + RET #else /* CONFIG_X86_32 */ /* We're getting an u64 arg in (%eax,%edx): unsigned long hweight64(__u64 w) */ pushl %ecx @@ -77,7 +77,7 @@ SYM_FUNC_START(__sw_hweight64) addl %ecx, %eax # result popl %ecx - ret + RET #endif SYM_FUNC_END(__sw_hweight64) EXPORT_SYMBOL(__sw_hweight64) diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index a1d24fdc07cf..b781d324211b 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -37,8 +37,6 @@ enum reg_type { */ static bool is_string_insn(struct insn *insn) { - insn_get_opcode(insn); - /* All string instructions have a 1-byte opcode. */ if (insn->opcode.nbytes != 1) return false; @@ -412,32 +410,44 @@ static short get_segment_selector(struct pt_regs *regs, int seg_reg_idx) #endif /* CONFIG_X86_64 */ } -static int get_reg_offset(struct insn *insn, struct pt_regs *regs, - enum reg_type type) +static const int pt_regoff[] = { + offsetof(struct pt_regs, ax), + offsetof(struct pt_regs, cx), + offsetof(struct pt_regs, dx), + offsetof(struct pt_regs, bx), + offsetof(struct pt_regs, sp), + offsetof(struct pt_regs, bp), + offsetof(struct pt_regs, si), + offsetof(struct pt_regs, di), +#ifdef CONFIG_X86_64 + offsetof(struct pt_regs, r8), + offsetof(struct pt_regs, r9), + offsetof(struct pt_regs, r10), + offsetof(struct pt_regs, r11), + offsetof(struct pt_regs, r12), + offsetof(struct pt_regs, r13), + offsetof(struct pt_regs, r14), + offsetof(struct pt_regs, r15), +#else + offsetof(struct pt_regs, ds), + offsetof(struct pt_regs, es), + offsetof(struct pt_regs, fs), + offsetof(struct pt_regs, gs), +#endif +}; + +int pt_regs_offset(struct pt_regs *regs, int regno) +{ + if ((unsigned)regno < ARRAY_SIZE(pt_regoff)) + return pt_regoff[regno]; + return -EDOM; +} + +static int get_regno(struct insn *insn, enum reg_type type) { + int nr_registers = ARRAY_SIZE(pt_regoff); int regno = 0; - static const int regoff[] = { - offsetof(struct pt_regs, ax), - offsetof(struct pt_regs, cx), - offsetof(struct pt_regs, dx), - offsetof(struct pt_regs, bx), - offsetof(struct pt_regs, sp), - offsetof(struct pt_regs, bp), - offsetof(struct pt_regs, si), - offsetof(struct pt_regs, di), -#ifdef CONFIG_X86_64 - offsetof(struct pt_regs, r8), - offsetof(struct pt_regs, r9), - offsetof(struct pt_regs, r10), - offsetof(struct pt_regs, r11), - offsetof(struct pt_regs, r12), - offsetof(struct pt_regs, r13), - offsetof(struct pt_regs, r14), - offsetof(struct pt_regs, r15), -#endif - }; - int nr_registers = ARRAY_SIZE(regoff); /* * Don't possibly decode a 32-bit instructions as * reading a 64-bit-only register. @@ -505,7 +515,18 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs, WARN_ONCE(1, "decoded an instruction with an invalid register"); return -EINVAL; } - return regoff[regno]; + return regno; +} + +static int get_reg_offset(struct insn *insn, struct pt_regs *regs, + enum reg_type type) +{ + int regno = get_regno(insn, type); + + if (regno < 0) + return regno; + + return pt_regs_offset(regs, regno); } /** @@ -851,6 +872,26 @@ int insn_get_modrm_reg_off(struct insn *insn, struct pt_regs *regs) } /** + * insn_get_modrm_reg_ptr() - Obtain register pointer based on ModRM byte + * @insn: Instruction containing the ModRM byte + * @regs: Register values as seen when entering kernel mode + * + * Returns: + * + * The register indicated by the reg part of the ModRM byte. + * The register is obtained as a pointer within pt_regs. + */ +unsigned long *insn_get_modrm_reg_ptr(struct insn *insn, struct pt_regs *regs) +{ + int offset; + + offset = insn_get_modrm_reg_off(insn, regs); + if (offset < 0) + return NULL; + return (void *)regs + offset; +} + +/** * get_seg_base_limit() - obtain base address and limit of a segment * @insn: Instruction. Must be valid. * @regs: Register values as seen when entering kernel mode @@ -1405,6 +1446,9 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) if (!insn || !regs) return (void __user *)-1L; + if (insn_get_opcode(insn)) + return (void __user *)-1L; + switch (insn->addr_bytes) { case 2: return get_addr_ref_16(insn, regs); @@ -1417,7 +1461,7 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) } } -static int insn_get_effective_ip(struct pt_regs *regs, unsigned long *ip) +int insn_get_effective_ip(struct pt_regs *regs, unsigned long *ip) { unsigned long seg_base = 0; @@ -1539,3 +1583,87 @@ bool insn_decode_from_regs(struct insn *insn, struct pt_regs *regs, return true; } + +/** + * insn_decode_mmio() - Decode a MMIO instruction + * @insn: Structure to store decoded instruction + * @bytes: Returns size of memory operand + * + * Decodes instruction that used for Memory-mapped I/O. + * + * Returns: + * + * Type of the instruction. Size of the memory operand is stored in + * @bytes. If decode failed, MMIO_DECODE_FAILED returned. + */ +enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes) +{ + enum mmio_type type = MMIO_DECODE_FAILED; + + *bytes = 0; + + if (insn_get_opcode(insn)) + return MMIO_DECODE_FAILED; + + switch (insn->opcode.bytes[0]) { + case 0x88: /* MOV m8,r8 */ + *bytes = 1; + fallthrough; + case 0x89: /* MOV m16/m32/m64, r16/m32/m64 */ + if (!*bytes) + *bytes = insn->opnd_bytes; + type = MMIO_WRITE; + break; + + case 0xc6: /* MOV m8, imm8 */ + *bytes = 1; + fallthrough; + case 0xc7: /* MOV m16/m32/m64, imm16/imm32/imm64 */ + if (!*bytes) + *bytes = insn->opnd_bytes; + type = MMIO_WRITE_IMM; + break; + + case 0x8a: /* MOV r8, m8 */ + *bytes = 1; + fallthrough; + case 0x8b: /* MOV r16/r32/r64, m16/m32/m64 */ + if (!*bytes) + *bytes = insn->opnd_bytes; + type = MMIO_READ; + break; + + case 0xa4: /* MOVS m8, m8 */ + *bytes = 1; + fallthrough; + case 0xa5: /* MOVS m16/m32/m64, m16/m32/m64 */ + if (!*bytes) + *bytes = insn->opnd_bytes; + type = MMIO_MOVS; + break; + + case 0x0f: /* Two-byte instruction */ + switch (insn->opcode.bytes[1]) { + case 0xb6: /* MOVZX r16/r32/r64, m8 */ + *bytes = 1; + fallthrough; + case 0xb7: /* MOVZX r32/r64, m16 */ + if (!*bytes) + *bytes = 2; + type = MMIO_READ_ZERO_EXTEND; + break; + + case 0xbe: /* MOVSX r16/r32/r64, m8 */ + *bytes = 1; + fallthrough; + case 0xbf: /* MOVSX r32/r64, m16 */ + if (!*bytes) + *bytes = 2; + type = MMIO_READ_SIGN_EXTEND; + break; + } + break; + } + + return type; +} diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index c565def611e2..55e371cc69fd 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -13,6 +13,7 @@ #endif #include <asm/inat.h> /*__ignore_sync_check__ */ #include <asm/insn.h> /* __ignore_sync_check__ */ +#include <asm/unaligned.h> /* __ignore_sync_check__ */ #include <linux/errno.h> #include <linux/kconfig.h> @@ -37,10 +38,10 @@ ((insn)->next_byte + sizeof(t) + n <= (insn)->end_kaddr) #define __get_next(t, insn) \ - ({ t r; memcpy(&r, insn->next_byte, sizeof(t)); insn->next_byte += sizeof(t); leXX_to_cpu(t, r); }) + ({ t r = get_unaligned((t *)(insn)->next_byte); (insn)->next_byte += sizeof(t); leXX_to_cpu(t, r); }) #define __peek_nbyte_next(t, insn, n) \ - ({ t r; memcpy(&r, (insn)->next_byte + n, sizeof(t)); leXX_to_cpu(t, r); }) + ({ t r = get_unaligned((t *)(insn)->next_byte + n); leXX_to_cpu(t, r); }) #define get_next(t, insn) \ ({ if (unlikely(!validate_next(t, insn, 0))) goto err_out; __get_next(t, insn); }) diff --git a/arch/x86/lib/iomap_copy_64.S b/arch/x86/lib/iomap_copy_64.S index cb5a1964506b..a1f9416bf67a 100644 --- a/arch/x86/lib/iomap_copy_64.S +++ b/arch/x86/lib/iomap_copy_64.S @@ -11,5 +11,5 @@ SYM_FUNC_START(__iowrite32_copy) movl %edx,%ecx rep movsd - ret + RET SYM_FUNC_END(__iowrite32_copy) diff --git a/arch/x86/lib/kaslr.c b/arch/x86/lib/kaslr.c index a53665116458..2b3eb8c948a3 100644 --- a/arch/x86/lib/kaslr.c +++ b/arch/x86/lib/kaslr.c @@ -56,11 +56,14 @@ unsigned long kaslr_get_random_long(const char *purpose) unsigned long raw, random = get_boot_seed(); bool use_i8254 = true; - debug_putstr(purpose); - debug_putstr(" KASLR using"); + if (purpose) { + debug_putstr(purpose); + debug_putstr(" KASLR using"); + } if (has_cpuflag(X86_FEATURE_RDRAND)) { - debug_putstr(" RDRAND"); + if (purpose) + debug_putstr(" RDRAND"); if (rdrand_long(&raw)) { random ^= raw; use_i8254 = false; @@ -68,7 +71,8 @@ unsigned long kaslr_get_random_long(const char *purpose) } if (has_cpuflag(X86_FEATURE_TSC)) { - debug_putstr(" RDTSC"); + if (purpose) + debug_putstr(" RDTSC"); raw = rdtsc(); random ^= raw; @@ -76,7 +80,8 @@ unsigned long kaslr_get_random_long(const char *purpose) } if (use_i8254) { - debug_putstr(" i8254"); + if (purpose) + debug_putstr(" i8254"); random ^= i8254(); } @@ -86,7 +91,8 @@ unsigned long kaslr_get_random_long(const char *purpose) : "a" (random), "rm" (mix_const)); random += raw; - debug_putstr("...\n"); + if (purpose) + debug_putstr("...\n"); return random; } diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c index e565d1c9019e..3a6e6cfe8c35 100644 --- a/arch/x86/lib/memcpy_32.c +++ b/arch/x86/lib/memcpy_32.c @@ -7,11 +7,7 @@ __visible void *memcpy(void *to, const void *from, size_t n) { -#if defined(CONFIG_X86_USE_3DNOW) && !defined(CONFIG_FORTIFY_SOURCE) - return __memcpy3d(to, from, n); -#else return __memcpy(to, from, n); -#endif } EXPORT_SYMBOL(memcpy); diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 1cc9da6e29c7..59cf2343f3d9 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -39,7 +39,7 @@ SYM_FUNC_START_WEAK(memcpy) rep movsq movl %edx, %ecx rep movsb - ret + RET SYM_FUNC_END(memcpy) SYM_FUNC_END_ALIAS(__memcpy) EXPORT_SYMBOL(memcpy) @@ -53,7 +53,7 @@ SYM_FUNC_START_LOCAL(memcpy_erms) movq %rdi, %rax movq %rdx, %rcx rep movsb - ret + RET SYM_FUNC_END(memcpy_erms) SYM_FUNC_START_LOCAL(memcpy_orig) @@ -137,7 +137,7 @@ SYM_FUNC_START_LOCAL(memcpy_orig) movq %r9, 1*8(%rdi) movq %r10, -2*8(%rdi, %rdx) movq %r11, -1*8(%rdi, %rdx) - retq + RET .p2align 4 .Lless_16bytes: cmpl $8, %edx @@ -149,7 +149,7 @@ SYM_FUNC_START_LOCAL(memcpy_orig) movq -1*8(%rsi, %rdx), %r9 movq %r8, 0*8(%rdi) movq %r9, -1*8(%rdi, %rdx) - retq + RET .p2align 4 .Lless_8bytes: cmpl $4, %edx @@ -162,7 +162,7 @@ SYM_FUNC_START_LOCAL(memcpy_orig) movl -4(%rsi, %rdx), %r8d movl %ecx, (%rdi) movl %r8d, -4(%rdi, %rdx) - retq + RET .p2align 4 .Lless_3bytes: subl $1, %edx @@ -180,7 +180,7 @@ SYM_FUNC_START_LOCAL(memcpy_orig) movb %cl, (%rdi) .Lend: - retq + RET SYM_FUNC_END(memcpy_orig) .popsection diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index 64801010d312..50ea390df712 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -40,7 +40,7 @@ SYM_FUNC_START(__memmove) /* FSRM implies ERMS => no length checks, do the copy directly */ .Lmemmove_begin_forward: ALTERNATIVE "cmp $0x20, %rdx; jb 1f", "", X86_FEATURE_FSRM - ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS + ALTERNATIVE "", __stringify(movq %rdx, %rcx; rep movsb; RET), X86_FEATURE_ERMS /* * movsq instruction have many startup latency @@ -205,7 +205,7 @@ SYM_FUNC_START(__memmove) movb (%rsi), %r11b movb %r11b, (%rdi) 13: - retq + RET SYM_FUNC_END(__memmove) SYM_FUNC_END_ALIAS(memmove) EXPORT_SYMBOL(__memmove) diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 9827ae267f96..d624f2bc42f1 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -40,7 +40,7 @@ SYM_FUNC_START(__memset) movl %edx,%ecx rep stosb movq %r9,%rax - ret + RET SYM_FUNC_END(__memset) SYM_FUNC_END_ALIAS(memset) EXPORT_SYMBOL(memset) @@ -63,7 +63,7 @@ SYM_FUNC_START_LOCAL(memset_erms) movq %rdx,%rcx rep stosb movq %r9,%rax - ret + RET SYM_FUNC_END(memset_erms) SYM_FUNC_START_LOCAL(memset_orig) @@ -125,7 +125,7 @@ SYM_FUNC_START_LOCAL(memset_orig) .Lende: movq %r10,%rax - ret + RET .Lbad_alignment: cmpq $7,%rdx diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c index cc5f4ea943d3..e69de29bb2d1 100644 --- a/arch/x86/lib/mmx_32.c +++ b/arch/x86/lib/mmx_32.c @@ -1,388 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * MMX 3DNow! library helper functions - * - * To do: - * We can use MMX just for prefetch in IRQ's. This may be a win. - * (reported so on K6-III) - * We should use a better code neutral filler for the short jump - * leal ebx. [ebx] is apparently best for K6-2, but Cyrix ?? - * We also want to clobber the filler register so we don't get any - * register forwarding stalls on the filler. - * - * Add *user handling. Checksums are not a win with MMX on any CPU - * tested so far for any MMX solution figured. - * - * 22/09/2000 - Arjan van de Ven - * Improved for non-engineering-sample Athlons - * - */ -#include <linux/hardirq.h> -#include <linux/string.h> -#include <linux/export.h> -#include <linux/sched.h> -#include <linux/types.h> - -#include <asm/fpu/api.h> -#include <asm/asm.h> - -/* - * Use KFPU_387. MMX instructions are not affected by MXCSR, - * but both AMD and Intel documentation states that even integer MMX - * operations will result in #MF if an exception is pending in FCW. - * - * EMMS is not needed afterwards because, after calling kernel_fpu_end(), - * any subsequent user of the 387 stack will reinitialize it using - * KFPU_387. - */ - -void *_mmx_memcpy(void *to, const void *from, size_t len) -{ - void *p; - int i; - - if (unlikely(in_interrupt())) - return __memcpy(to, from, len); - - p = to; - i = len >> 6; /* len/64 */ - - kernel_fpu_begin_mask(KFPU_387); - - __asm__ __volatile__ ( - "1: prefetch (%0)\n" /* This set is 28 bytes */ - " prefetch 64(%0)\n" - " prefetch 128(%0)\n" - " prefetch 192(%0)\n" - " prefetch 256(%0)\n" - "2: \n" - ".section .fixup, \"ax\"\n" - "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) - : : "r" (from)); - - for ( ; i > 5; i--) { - __asm__ __volatile__ ( - "1: prefetch 320(%0)\n" - "2: movq (%0), %%mm0\n" - " movq 8(%0), %%mm1\n" - " movq 16(%0), %%mm2\n" - " movq 24(%0), %%mm3\n" - " movq %%mm0, (%1)\n" - " movq %%mm1, 8(%1)\n" - " movq %%mm2, 16(%1)\n" - " movq %%mm3, 24(%1)\n" - " movq 32(%0), %%mm0\n" - " movq 40(%0), %%mm1\n" - " movq 48(%0), %%mm2\n" - " movq 56(%0), %%mm3\n" - " movq %%mm0, 32(%1)\n" - " movq %%mm1, 40(%1)\n" - " movq %%mm2, 48(%1)\n" - " movq %%mm3, 56(%1)\n" - ".section .fixup, \"ax\"\n" - "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) - : : "r" (from), "r" (to) : "memory"); - - from += 64; - to += 64; - } - - for ( ; i > 0; i--) { - __asm__ __volatile__ ( - " movq (%0), %%mm0\n" - " movq 8(%0), %%mm1\n" - " movq 16(%0), %%mm2\n" - " movq 24(%0), %%mm3\n" - " movq %%mm0, (%1)\n" - " movq %%mm1, 8(%1)\n" - " movq %%mm2, 16(%1)\n" - " movq %%mm3, 24(%1)\n" - " movq 32(%0), %%mm0\n" - " movq 40(%0), %%mm1\n" - " movq 48(%0), %%mm2\n" - " movq 56(%0), %%mm3\n" - " movq %%mm0, 32(%1)\n" - " movq %%mm1, 40(%1)\n" - " movq %%mm2, 48(%1)\n" - " movq %%mm3, 56(%1)\n" - : : "r" (from), "r" (to) : "memory"); - - from += 64; - to += 64; - } - /* - * Now do the tail of the block: - */ - __memcpy(to, from, len & 63); - kernel_fpu_end(); - - return p; -} -EXPORT_SYMBOL(_mmx_memcpy); - -#ifdef CONFIG_MK7 - -/* - * The K7 has streaming cache bypass load/store. The Cyrix III, K6 and - * other MMX using processors do not. - */ - -static void fast_clear_page(void *page) -{ - int i; - - kernel_fpu_begin_mask(KFPU_387); - - __asm__ __volatile__ ( - " pxor %%mm0, %%mm0\n" : : - ); - - for (i = 0; i < 4096/64; i++) { - __asm__ __volatile__ ( - " movntq %%mm0, (%0)\n" - " movntq %%mm0, 8(%0)\n" - " movntq %%mm0, 16(%0)\n" - " movntq %%mm0, 24(%0)\n" - " movntq %%mm0, 32(%0)\n" - " movntq %%mm0, 40(%0)\n" - " movntq %%mm0, 48(%0)\n" - " movntq %%mm0, 56(%0)\n" - : : "r" (page) : "memory"); - page += 64; - } - - /* - * Since movntq is weakly-ordered, a "sfence" is needed to become - * ordered again: - */ - __asm__ __volatile__("sfence\n"::); - - kernel_fpu_end(); -} - -static void fast_copy_page(void *to, void *from) -{ - int i; - - kernel_fpu_begin_mask(KFPU_387); - - /* - * maybe the prefetch stuff can go before the expensive fnsave... - * but that is for later. -AV - */ - __asm__ __volatile__( - "1: prefetch (%0)\n" - " prefetch 64(%0)\n" - " prefetch 128(%0)\n" - " prefetch 192(%0)\n" - " prefetch 256(%0)\n" - "2: \n" - ".section .fixup, \"ax\"\n" - "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) : : "r" (from)); - - for (i = 0; i < (4096-320)/64; i++) { - __asm__ __volatile__ ( - "1: prefetch 320(%0)\n" - "2: movq (%0), %%mm0\n" - " movntq %%mm0, (%1)\n" - " movq 8(%0), %%mm1\n" - " movntq %%mm1, 8(%1)\n" - " movq 16(%0), %%mm2\n" - " movntq %%mm2, 16(%1)\n" - " movq 24(%0), %%mm3\n" - " movntq %%mm3, 24(%1)\n" - " movq 32(%0), %%mm4\n" - " movntq %%mm4, 32(%1)\n" - " movq 40(%0), %%mm5\n" - " movntq %%mm5, 40(%1)\n" - " movq 48(%0), %%mm6\n" - " movntq %%mm6, 48(%1)\n" - " movq 56(%0), %%mm7\n" - " movntq %%mm7, 56(%1)\n" - ".section .fixup, \"ax\"\n" - "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) : : "r" (from), "r" (to) : "memory"); - - from += 64; - to += 64; - } - - for (i = (4096-320)/64; i < 4096/64; i++) { - __asm__ __volatile__ ( - "2: movq (%0), %%mm0\n" - " movntq %%mm0, (%1)\n" - " movq 8(%0), %%mm1\n" - " movntq %%mm1, 8(%1)\n" - " movq 16(%0), %%mm2\n" - " movntq %%mm2, 16(%1)\n" - " movq 24(%0), %%mm3\n" - " movntq %%mm3, 24(%1)\n" - " movq 32(%0), %%mm4\n" - " movntq %%mm4, 32(%1)\n" - " movq 40(%0), %%mm5\n" - " movntq %%mm5, 40(%1)\n" - " movq 48(%0), %%mm6\n" - " movntq %%mm6, 48(%1)\n" - " movq 56(%0), %%mm7\n" - " movntq %%mm7, 56(%1)\n" - : : "r" (from), "r" (to) : "memory"); - from += 64; - to += 64; - } - /* - * Since movntq is weakly-ordered, a "sfence" is needed to become - * ordered again: - */ - __asm__ __volatile__("sfence \n"::); - kernel_fpu_end(); -} - -#else /* CONFIG_MK7 */ - -/* - * Generic MMX implementation without K7 specific streaming - */ -static void fast_clear_page(void *page) -{ - int i; - - kernel_fpu_begin_mask(KFPU_387); - - __asm__ __volatile__ ( - " pxor %%mm0, %%mm0\n" : : - ); - - for (i = 0; i < 4096/128; i++) { - __asm__ __volatile__ ( - " movq %%mm0, (%0)\n" - " movq %%mm0, 8(%0)\n" - " movq %%mm0, 16(%0)\n" - " movq %%mm0, 24(%0)\n" - " movq %%mm0, 32(%0)\n" - " movq %%mm0, 40(%0)\n" - " movq %%mm0, 48(%0)\n" - " movq %%mm0, 56(%0)\n" - " movq %%mm0, 64(%0)\n" - " movq %%mm0, 72(%0)\n" - " movq %%mm0, 80(%0)\n" - " movq %%mm0, 88(%0)\n" - " movq %%mm0, 96(%0)\n" - " movq %%mm0, 104(%0)\n" - " movq %%mm0, 112(%0)\n" - " movq %%mm0, 120(%0)\n" - : : "r" (page) : "memory"); - page += 128; - } - - kernel_fpu_end(); -} - -static void fast_copy_page(void *to, void *from) -{ - int i; - - kernel_fpu_begin_mask(KFPU_387); - - __asm__ __volatile__ ( - "1: prefetch (%0)\n" - " prefetch 64(%0)\n" - " prefetch 128(%0)\n" - " prefetch 192(%0)\n" - " prefetch 256(%0)\n" - "2: \n" - ".section .fixup, \"ax\"\n" - "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) : : "r" (from)); - - for (i = 0; i < 4096/64; i++) { - __asm__ __volatile__ ( - "1: prefetch 320(%0)\n" - "2: movq (%0), %%mm0\n" - " movq 8(%0), %%mm1\n" - " movq 16(%0), %%mm2\n" - " movq 24(%0), %%mm3\n" - " movq %%mm0, (%1)\n" - " movq %%mm1, 8(%1)\n" - " movq %%mm2, 16(%1)\n" - " movq %%mm3, 24(%1)\n" - " movq 32(%0), %%mm0\n" - " movq 40(%0), %%mm1\n" - " movq 48(%0), %%mm2\n" - " movq 56(%0), %%mm3\n" - " movq %%mm0, 32(%1)\n" - " movq %%mm1, 40(%1)\n" - " movq %%mm2, 48(%1)\n" - " movq %%mm3, 56(%1)\n" - ".section .fixup, \"ax\"\n" - "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) - : : "r" (from), "r" (to) : "memory"); - - from += 64; - to += 64; - } - kernel_fpu_end(); -} - -#endif /* !CONFIG_MK7 */ - -/* - * Favour MMX for page clear and copy: - */ -static void slow_zero_page(void *page) -{ - int d0, d1; - - __asm__ __volatile__( - "cld\n\t" - "rep ; stosl" - - : "=&c" (d0), "=&D" (d1) - :"a" (0), "1" (page), "0" (1024) - :"memory"); -} - -void mmx_clear_page(void *page) -{ - if (unlikely(in_interrupt())) - slow_zero_page(page); - else - fast_clear_page(page); -} -EXPORT_SYMBOL(mmx_clear_page); - -static void slow_copy_page(void *to, void *from) -{ - int d0, d1, d2; - - __asm__ __volatile__( - "cld\n\t" - "rep ; movsl" - : "=&c" (d0), "=&D" (d1), "=&S" (d2) - : "0" (1024), "1" ((long) to), "2" ((long) from) - : "memory"); -} - -void mmx_copy_page(void *to, void *from) -{ - if (unlikely(in_interrupt())) - slow_copy_page(to, from); - else - fast_copy_page(to, from); -} -EXPORT_SYMBOL(mmx_copy_page); diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S index a2b9caa5274c..ebd259f31496 100644 --- a/arch/x86/lib/msr-reg.S +++ b/arch/x86/lib/msr-reg.S @@ -35,7 +35,7 @@ SYM_FUNC_START(\op\()_safe_regs) movl %edi, 28(%r10) popq %r12 popq %rbx - ret + RET 3: movl $-EIO, %r11d jmp 2b @@ -77,7 +77,7 @@ SYM_FUNC_START(\op\()_safe_regs) popl %esi popl %ebp popl %ebx - ret + RET 3: movl $-EIO, 4(%esp) jmp 2b diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index 0ea344c5ea43..ecb2049c1273 100644 --- a/arch/x86/lib/putuser.S +++ b/arch/x86/lib/putuser.S @@ -52,7 +52,7 @@ SYM_INNER_LABEL(__put_user_nocheck_1, SYM_L_GLOBAL) 1: movb %al,(%_ASM_CX) xor %ecx,%ecx ASM_CLAC - ret + RET SYM_FUNC_END(__put_user_1) EXPORT_SYMBOL(__put_user_1) EXPORT_SYMBOL(__put_user_nocheck_1) @@ -66,7 +66,7 @@ SYM_INNER_LABEL(__put_user_nocheck_2, SYM_L_GLOBAL) 2: movw %ax,(%_ASM_CX) xor %ecx,%ecx ASM_CLAC - ret + RET SYM_FUNC_END(__put_user_2) EXPORT_SYMBOL(__put_user_2) EXPORT_SYMBOL(__put_user_nocheck_2) @@ -80,7 +80,7 @@ SYM_INNER_LABEL(__put_user_nocheck_4, SYM_L_GLOBAL) 3: movl %eax,(%_ASM_CX) xor %ecx,%ecx ASM_CLAC - ret + RET SYM_FUNC_END(__put_user_4) EXPORT_SYMBOL(__put_user_4) EXPORT_SYMBOL(__put_user_nocheck_4) diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index ec9922cba30a..89b3fb244e15 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -23,50 +23,18 @@ .Ldo_rop_\@: mov %\reg, (%_ASM_SP) UNWIND_HINT_FUNC - ret + RET .endm .macro THUNK reg - .align 32 - -SYM_FUNC_START(__x86_indirect_thunk_\reg) + .align RETPOLINE_THUNK_SIZE +SYM_INNER_LABEL(__x86_indirect_thunk_\reg, SYM_L_GLOBAL) + UNWIND_HINT_EMPTY ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \ __stringify(RETPOLINE \reg), X86_FEATURE_RETPOLINE, \ - __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_AMD - -SYM_FUNC_END(__x86_indirect_thunk_\reg) - -.endm - -/* - * This generates .altinstr_replacement symbols for use by objtool. They, - * however, must not actually live in .altinstr_replacement since that will be - * discarded after init, but module alternatives will also reference these - * symbols. - * - * Their names matches the "__x86_indirect_" prefix to mark them as retpolines. - */ -.macro ALT_THUNK reg - - .align 1 - -SYM_FUNC_START_NOALIGN(__x86_indirect_alt_call_\reg) - ANNOTATE_RETPOLINE_SAFE -1: call *%\reg -2: .skip 5-(2b-1b), 0x90 -SYM_FUNC_END(__x86_indirect_alt_call_\reg) - -STACK_FRAME_NON_STANDARD(__x86_indirect_alt_call_\reg) - -SYM_FUNC_START_NOALIGN(__x86_indirect_alt_jmp_\reg) - ANNOTATE_RETPOLINE_SAFE -1: jmp *%\reg -2: .skip 5-(2b-1b), 0x90 -SYM_FUNC_END(__x86_indirect_alt_jmp_\reg) - -STACK_FRAME_NON_STANDARD(__x86_indirect_alt_jmp_\reg) + __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg; int3), X86_FEATURE_RETPOLINE_AMD .endm @@ -85,22 +53,16 @@ STACK_FRAME_NON_STANDARD(__x86_indirect_alt_jmp_\reg) #define __EXPORT_THUNK(sym) _ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym) #define EXPORT_THUNK(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg) -#undef GEN + .align RETPOLINE_THUNK_SIZE +SYM_CODE_START(__x86_indirect_thunk_array) + #define GEN(reg) THUNK reg #include <asm/GEN-for-each-reg.h> - #undef GEN -#define GEN(reg) EXPORT_THUNK(reg) -#include <asm/GEN-for-each-reg.h> -#undef GEN -#define GEN(reg) ALT_THUNK reg -#include <asm/GEN-for-each-reg.h> + .align RETPOLINE_THUNK_SIZE +SYM_CODE_END(__x86_indirect_thunk_array) -#undef GEN -#define GEN(reg) __EXPORT_THUNK(__x86_indirect_alt_call_ ## reg) +#define GEN(reg) EXPORT_THUNK(reg) #include <asm/GEN-for-each-reg.h> - #undef GEN -#define GEN(reg) __EXPORT_THUNK(__x86_indirect_alt_jmp_ ## reg) -#include <asm/GEN-for-each-reg.h> diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c index d15fdae9656e..53b3f202267c 100644 --- a/arch/x86/lib/string_32.c +++ b/arch/x86/lib/string_32.c @@ -11,6 +11,7 @@ * strings. */ +#define __NO_FORTIFY #include <linux/string.h> #include <linux/export.h> diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 7d290777246d..422257c350c6 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -8,7 +8,6 @@ */ #include <linux/export.h> #include <linux/uaccess.h> -#include <asm/mmx.h> #include <asm/asm.h> #ifdef CONFIG_X86_INTEL_USERCOPY @@ -43,11 +42,7 @@ do { \ " movl %2,%0\n" \ "1: rep; stosb\n" \ "2: " ASM_CLAC "\n" \ - ".section .fixup,\"ax\"\n" \ - "3: lea 0(%2,%0,4),%0\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE_UA(0b, 3b) \ + _ASM_EXTABLE_TYPE_REG(0b, 2b, EX_TYPE_UCOPY_LEN4, %2) \ _ASM_EXTABLE_UA(1b, 2b) \ : "=&c"(size), "=&D" (__d0) \ : "r"(size & 3), "0"(size / 4), "1"(addr), "a"(0)); \ @@ -149,10 +144,6 @@ __copy_user_intel(void __user *to, const void *from, unsigned long size) "36: movl %%eax, %0\n" "37: rep; movsb\n" "100:\n" - ".section .fixup,\"ax\"\n" - "101: lea 0(%%eax,%0,4),%0\n" - " jmp 100b\n" - ".previous\n" _ASM_EXTABLE_UA(1b, 100b) _ASM_EXTABLE_UA(2b, 100b) _ASM_EXTABLE_UA(3b, 100b) @@ -190,7 +181,7 @@ __copy_user_intel(void __user *to, const void *from, unsigned long size) _ASM_EXTABLE_UA(35b, 100b) _ASM_EXTABLE_UA(36b, 100b) _ASM_EXTABLE_UA(37b, 100b) - _ASM_EXTABLE_UA(99b, 101b) + _ASM_EXTABLE_TYPE_REG(99b, 100b, EX_TYPE_UCOPY_LEN4, %%eax) : "=&c"(size), "=&D" (d0), "=&S" (d1) : "1"(to), "2"(from), "0"(size) : "eax", "edx", "memory"); @@ -255,30 +246,26 @@ static unsigned long __copy_user_intel_nocache(void *to, " movl %%eax,%0\n" "7: rep; movsb\n" "8:\n" - ".section .fixup,\"ax\"\n" - "9: lea 0(%%eax,%0,4),%0\n" - "16: jmp 8b\n" - ".previous\n" - _ASM_EXTABLE_UA(0b, 16b) - _ASM_EXTABLE_UA(1b, 16b) - _ASM_EXTABLE_UA(2b, 16b) - _ASM_EXTABLE_UA(21b, 16b) - _ASM_EXTABLE_UA(3b, 16b) - _ASM_EXTABLE_UA(31b, 16b) - _ASM_EXTABLE_UA(4b, 16b) - _ASM_EXTABLE_UA(41b, 16b) - _ASM_EXTABLE_UA(10b, 16b) - _ASM_EXTABLE_UA(51b, 16b) - _ASM_EXTABLE_UA(11b, 16b) - _ASM_EXTABLE_UA(61b, 16b) - _ASM_EXTABLE_UA(12b, 16b) - _ASM_EXTABLE_UA(71b, 16b) - _ASM_EXTABLE_UA(13b, 16b) - _ASM_EXTABLE_UA(81b, 16b) - _ASM_EXTABLE_UA(14b, 16b) - _ASM_EXTABLE_UA(91b, 16b) - _ASM_EXTABLE_UA(6b, 9b) - _ASM_EXTABLE_UA(7b, 16b) + _ASM_EXTABLE_UA(0b, 8b) + _ASM_EXTABLE_UA(1b, 8b) + _ASM_EXTABLE_UA(2b, 8b) + _ASM_EXTABLE_UA(21b, 8b) + _ASM_EXTABLE_UA(3b, 8b) + _ASM_EXTABLE_UA(31b, 8b) + _ASM_EXTABLE_UA(4b, 8b) + _ASM_EXTABLE_UA(41b, 8b) + _ASM_EXTABLE_UA(10b, 8b) + _ASM_EXTABLE_UA(51b, 8b) + _ASM_EXTABLE_UA(11b, 8b) + _ASM_EXTABLE_UA(61b, 8b) + _ASM_EXTABLE_UA(12b, 8b) + _ASM_EXTABLE_UA(71b, 8b) + _ASM_EXTABLE_UA(13b, 8b) + _ASM_EXTABLE_UA(81b, 8b) + _ASM_EXTABLE_UA(14b, 8b) + _ASM_EXTABLE_UA(91b, 8b) + _ASM_EXTABLE_TYPE_REG(6b, 8b, EX_TYPE_UCOPY_LEN4, %%eax) + _ASM_EXTABLE_UA(7b, 8b) : "=&c"(size), "=&D" (d0), "=&S" (d1) : "1"(to), "2"(from), "0"(size) : "eax", "edx", "memory"); @@ -315,14 +302,8 @@ do { \ " movl %3,%0\n" \ "1: rep; movsb\n" \ "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "5: addl %3,%0\n" \ - " jmp 2b\n" \ - "3: lea 0(%3,%0,4),%0\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE_UA(4b, 5b) \ - _ASM_EXTABLE_UA(0b, 3b) \ + _ASM_EXTABLE_TYPE_REG(4b, 2b, EX_TYPE_UCOPY_LEN1, %3) \ + _ASM_EXTABLE_TYPE_REG(0b, 2b, EX_TYPE_UCOPY_LEN4, %3) \ _ASM_EXTABLE_UA(1b, 2b) \ : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \ : "3"(size), "0"(size), "1"(to), "2"(from) \ diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 508c81e97ab1..0402a749f3a0 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -35,12 +35,10 @@ unsigned long __clear_user(void __user *addr, unsigned long size) " incq %[dst]\n" " decl %%ecx ; jnz 1b\n" "2:\n" - ".section .fixup,\"ax\"\n" - "3: lea 0(%[size1],%[size8],8),%[size8]\n" - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE_UA(0b, 3b) + + _ASM_EXTABLE_TYPE_REG(0b, 2b, EX_TYPE_UCOPY_LEN8, %[size1]) _ASM_EXTABLE_UA(1b, 2b) + : [size8] "=&c"(size), [dst] "=&D" (__d0) : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr)); clac(); diff --git a/arch/x86/math-emu/div_Xsig.S b/arch/x86/math-emu/div_Xsig.S index 951da2ad54bb..8c270ab415be 100644 --- a/arch/x86/math-emu/div_Xsig.S +++ b/arch/x86/math-emu/div_Xsig.S @@ -341,7 +341,7 @@ L_exit: popl %esi leave - ret + RET #ifdef PARANOID diff --git a/arch/x86/math-emu/div_small.S b/arch/x86/math-emu/div_small.S index d047d1816abe..637439bfefa4 100644 --- a/arch/x86/math-emu/div_small.S +++ b/arch/x86/math-emu/div_small.S @@ -44,5 +44,5 @@ SYM_FUNC_START(FPU_div_small) popl %esi leave - ret + RET SYM_FUNC_END(FPU_div_small) diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c index 034748459482..d62662bdd460 100644 --- a/arch/x86/math-emu/fpu_aux.c +++ b/arch/x86/math-emu/fpu_aux.c @@ -53,7 +53,7 @@ void fpstate_init_soft(struct swregs_state *soft) void finit(void) { - fpstate_init_soft(¤t->thread.fpu.state.soft); + fpstate_init_soft(¤t->thread.fpu.fpstate->regs.soft); } /* diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 8679a9d6c47f..7fe56c594aa6 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -31,7 +31,7 @@ #include <linux/uaccess.h> #include <asm/traps.h> #include <asm/user.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> #include "fpu_system.h" #include "fpu_emu.h" @@ -640,7 +640,7 @@ int fpregs_soft_set(struct task_struct *target, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - struct swregs_state *s387 = &target->thread.fpu.state.soft; + struct swregs_state *s387 = &target->thread.fpu.fpstate->regs.soft; void *space = s387->st_space; int ret; int offset, other, i, tags, regnr, tag, newtop; @@ -691,7 +691,7 @@ int fpregs_soft_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { - struct swregs_state *s387 = &target->thread.fpu.state.soft; + struct swregs_state *s387 = &target->thread.fpu.fpstate->regs.soft; const void *space = s387->st_space; int offset = (S387->ftop & 7) * 10, other = 80 - offset; diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h index 9b41391867dc..eec3e4805c75 100644 --- a/arch/x86/math-emu/fpu_system.h +++ b/arch/x86/math-emu/fpu_system.h @@ -73,7 +73,7 @@ static inline bool seg_writable(struct desc_struct *d) return (d->type & SEG_TYPE_EXECUTE_MASK) == SEG_TYPE_WRITABLE; } -#define I387 (¤t->thread.fpu.state) +#define I387 (¤t->thread.fpu.fpstate->regs) #define FPU_info (I387->soft.info) #define FPU_CS (*(unsigned short *) &(FPU_info->regs->cs)) diff --git a/arch/x86/math-emu/mul_Xsig.S b/arch/x86/math-emu/mul_Xsig.S index 4afc7b1fa6e9..54a031b66142 100644 --- a/arch/x86/math-emu/mul_Xsig.S +++ b/arch/x86/math-emu/mul_Xsig.S @@ -62,7 +62,7 @@ SYM_FUNC_START(mul32_Xsig) popl %esi leave - ret + RET SYM_FUNC_END(mul32_Xsig) @@ -115,7 +115,7 @@ SYM_FUNC_START(mul64_Xsig) popl %esi leave - ret + RET SYM_FUNC_END(mul64_Xsig) @@ -175,5 +175,5 @@ SYM_FUNC_START(mul_Xsig_Xsig) popl %esi leave - ret + RET SYM_FUNC_END(mul_Xsig_Xsig) diff --git a/arch/x86/math-emu/polynom_Xsig.S b/arch/x86/math-emu/polynom_Xsig.S index 702315eecb86..35fd723fc0df 100644 --- a/arch/x86/math-emu/polynom_Xsig.S +++ b/arch/x86/math-emu/polynom_Xsig.S @@ -133,5 +133,5 @@ L_accum_done: popl %edi popl %esi leave - ret + RET SYM_FUNC_END(polynomial_Xsig) diff --git a/arch/x86/math-emu/reg_norm.S b/arch/x86/math-emu/reg_norm.S index cad1d60b1e84..594936eeed67 100644 --- a/arch/x86/math-emu/reg_norm.S +++ b/arch/x86/math-emu/reg_norm.S @@ -72,7 +72,7 @@ L_exit_valid: L_exit: popl %ebx leave - ret + RET L_zero: @@ -138,7 +138,7 @@ L_exit_nuo_valid: popl %ebx leave - ret + RET L_exit_nuo_zero: movl TAG_Zero,%eax @@ -146,5 +146,5 @@ L_exit_nuo_zero: popl %ebx leave - ret + RET SYM_FUNC_END(FPU_normalize_nuo) diff --git a/arch/x86/math-emu/reg_round.S b/arch/x86/math-emu/reg_round.S index 4a9fc3cc5a4d..0bb2a092161a 100644 --- a/arch/x86/math-emu/reg_round.S +++ b/arch/x86/math-emu/reg_round.S @@ -437,7 +437,7 @@ fpu_Arith_exit: popl %edi popl %esi leave - ret + RET /* diff --git a/arch/x86/math-emu/reg_u_add.S b/arch/x86/math-emu/reg_u_add.S index 9c9e2c810afe..07247287a3af 100644 --- a/arch/x86/math-emu/reg_u_add.S +++ b/arch/x86/math-emu/reg_u_add.S @@ -164,6 +164,6 @@ L_exit: popl %edi popl %esi leave - ret + RET #endif /* PARANOID */ SYM_FUNC_END(FPU_u_add) diff --git a/arch/x86/math-emu/reg_u_div.S b/arch/x86/math-emu/reg_u_div.S index e2fb5c2644c5..b5a41e2fc484 100644 --- a/arch/x86/math-emu/reg_u_div.S +++ b/arch/x86/math-emu/reg_u_div.S @@ -468,7 +468,7 @@ L_exit: popl %esi leave - ret + RET #endif /* PARANOID */ SYM_FUNC_END(FPU_u_div) diff --git a/arch/x86/math-emu/reg_u_mul.S b/arch/x86/math-emu/reg_u_mul.S index 0c779c87ac5b..e2588b24b8c2 100644 --- a/arch/x86/math-emu/reg_u_mul.S +++ b/arch/x86/math-emu/reg_u_mul.S @@ -144,7 +144,7 @@ L_exit: popl %edi popl %esi leave - ret + RET #endif /* PARANOID */ SYM_FUNC_END(FPU_u_mul) diff --git a/arch/x86/math-emu/reg_u_sub.S b/arch/x86/math-emu/reg_u_sub.S index e9bb7c248649..4c900c29e4ff 100644 --- a/arch/x86/math-emu/reg_u_sub.S +++ b/arch/x86/math-emu/reg_u_sub.S @@ -270,5 +270,5 @@ L_exit: popl %edi popl %esi leave - ret + RET SYM_FUNC_END(FPU_u_sub) diff --git a/arch/x86/math-emu/round_Xsig.S b/arch/x86/math-emu/round_Xsig.S index d9d7de8dbd7b..126c40473bad 100644 --- a/arch/x86/math-emu/round_Xsig.S +++ b/arch/x86/math-emu/round_Xsig.S @@ -78,7 +78,7 @@ L_exit: popl %esi popl %ebx leave - ret + RET SYM_FUNC_END(round_Xsig) @@ -138,5 +138,5 @@ L_n_exit: popl %esi popl %ebx leave - ret + RET SYM_FUNC_END(norm_Xsig) diff --git a/arch/x86/math-emu/shr_Xsig.S b/arch/x86/math-emu/shr_Xsig.S index 726af985f758..f726bf6f6396 100644 --- a/arch/x86/math-emu/shr_Xsig.S +++ b/arch/x86/math-emu/shr_Xsig.S @@ -45,7 +45,7 @@ SYM_FUNC_START(shr_Xsig) popl %ebx popl %esi leave - ret + RET L_more_than_31: cmpl $64,%ecx @@ -61,7 +61,7 @@ L_more_than_31: movl $0,8(%esi) popl %esi leave - ret + RET L_more_than_63: cmpl $96,%ecx @@ -76,7 +76,7 @@ L_more_than_63: movl %edx,8(%esi) popl %esi leave - ret + RET L_more_than_95: xorl %eax,%eax @@ -85,5 +85,5 @@ L_more_than_95: movl %eax,8(%esi) popl %esi leave - ret + RET SYM_FUNC_END(shr_Xsig) diff --git a/arch/x86/math-emu/wm_shrx.S b/arch/x86/math-emu/wm_shrx.S index 4fc89174caf0..f608a28a4c43 100644 --- a/arch/x86/math-emu/wm_shrx.S +++ b/arch/x86/math-emu/wm_shrx.S @@ -55,7 +55,7 @@ SYM_FUNC_START(FPU_shrx) popl %ebx popl %esi leave - ret + RET L_more_than_31: cmpl $64,%ecx @@ -70,7 +70,7 @@ L_more_than_31: movl $0,4(%esi) popl %esi leave - ret + RET L_more_than_63: cmpl $96,%ecx @@ -84,7 +84,7 @@ L_more_than_63: movl %edx,4(%esi) popl %esi leave - ret + RET L_more_than_95: xorl %eax,%eax @@ -92,7 +92,7 @@ L_more_than_95: movl %eax,4(%esi) popl %esi leave - ret + RET SYM_FUNC_END(FPU_shrx) @@ -146,7 +146,7 @@ SYM_FUNC_START(FPU_shrxs) popl %ebx popl %esi leave - ret + RET /* Shift by [0..31] bits */ Ls_less_than_32: @@ -163,7 +163,7 @@ Ls_less_than_32: popl %ebx popl %esi leave - ret + RET /* Shift by [64..95] bits */ Ls_more_than_63: @@ -189,7 +189,7 @@ Ls_more_than_63: popl %ebx popl %esi leave - ret + RET Ls_more_than_95: /* Shift by [96..inf) bits */ @@ -203,5 +203,5 @@ Ls_more_than_95: popl %ebx popl %esi leave - ret + RET SYM_FUNC_END(FPU_shrxs) diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 5864219221ca..fe3d3061fc11 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -2,9 +2,11 @@ # Kernel does not boot with instrumentation of tlb.c and mem_encrypt*.c KCOV_INSTRUMENT_tlb.o := n KCOV_INSTRUMENT_mem_encrypt.o := n +KCOV_INSTRUMENT_mem_encrypt_amd.o := n KCOV_INSTRUMENT_mem_encrypt_identity.o := n KASAN_SANITIZE_mem_encrypt.o := n +KASAN_SANITIZE_mem_encrypt_amd.o := n KASAN_SANITIZE_mem_encrypt_identity.o := n # Disable KCSAN entirely, because otherwise we get warnings that some functions @@ -13,6 +15,7 @@ KCSAN_SANITIZE := n ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_mem_encrypt.o = -pg +CFLAGS_REMOVE_mem_encrypt_amd.o = -pg CFLAGS_REMOVE_mem_encrypt_identity.o = -pg endif @@ -52,6 +55,8 @@ obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o -obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o +obj-$(CONFIG_X86_MEM_ENCRYPT) += mem_encrypt.o +obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_amd.o + obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_identity.o obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index f5e1e60c9095..6c2f1b76a0b6 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -110,6 +110,13 @@ static void __init percpu_setup_exception_stacks(unsigned int cpu) cea_map_stack(NMI); cea_map_stack(DB); cea_map_stack(MCE); + + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) { + if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) { + cea_map_stack(VC); + cea_map_stack(VC2); + } + } } #else static inline void percpu_setup_exception_stacks(unsigned int cpu) diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index e1664e9f969c..dba2197c05c3 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -2,48 +2,58 @@ #include <linux/extable.h> #include <linux/uaccess.h> #include <linux/sched/debug.h> +#include <linux/bitfield.h> #include <xen/xen.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> #include <asm/sev.h> #include <asm/traps.h> #include <asm/kdebug.h> +#include <asm/insn-eval.h> +#include <asm/sgx.h> -typedef bool (*ex_handler_t)(const struct exception_table_entry *, - struct pt_regs *, int, unsigned long, - unsigned long); +static inline unsigned long *pt_regs_nr(struct pt_regs *regs, int nr) +{ + int reg_offset = pt_regs_offset(regs, nr); + static unsigned long __dummy; + + if (WARN_ON_ONCE(reg_offset < 0)) + return &__dummy; + + return (unsigned long *)((unsigned long)regs + reg_offset); +} static inline unsigned long ex_fixup_addr(const struct exception_table_entry *x) { return (unsigned long)&x->fixup + x->fixup; } -static inline ex_handler_t -ex_fixup_handler(const struct exception_table_entry *x) -{ - return (ex_handler_t)((unsigned long)&x->handler + x->handler); -} -__visible bool ex_handler_default(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_default(const struct exception_table_entry *e, + struct pt_regs *regs) { - regs->ip = ex_fixup_addr(fixup); + if (e->data & EX_FLAG_CLEAR_AX) + regs->ax = 0; + if (e->data & EX_FLAG_CLEAR_DX) + regs->dx = 0; + + regs->ip = ex_fixup_addr(e); return true; } -EXPORT_SYMBOL(ex_handler_default); -__visible bool ex_handler_fault(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) { - regs->ip = ex_fixup_addr(fixup); regs->ax = trapnr; - return true; + return ex_handler_default(fixup, regs); +} + +static bool ex_handler_sgx(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + regs->ax = trapnr | SGX_ENCLS_FAULT_FLAG; + return ex_handler_default(fixup, regs); } -EXPORT_SYMBOL_GPL(ex_handler_fault); /* * Handler for when we fail to restore a task's FPU state. We should never get @@ -55,111 +65,93 @@ EXPORT_SYMBOL_GPL(ex_handler_fault); * of vulnerability by restoring from the initial state (essentially, zeroing * out all the FPU registers) if we can't restore from the task's FPU state. */ -__visible bool ex_handler_fprestore(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_fprestore(const struct exception_table_entry *fixup, + struct pt_regs *regs) { regs->ip = ex_fixup_addr(fixup); WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.", (void *)instruction_pointer(regs)); - __restore_fpregs_from_fpstate(&init_fpstate, xfeatures_mask_fpstate()); + fpu_reset_from_exception_fixup(); return true; } -EXPORT_SYMBOL_GPL(ex_handler_fprestore); -__visible bool ex_handler_uaccess(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_uaccess(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) { WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?"); - regs->ip = ex_fixup_addr(fixup); - return true; + return ex_handler_default(fixup, regs); } -EXPORT_SYMBOL(ex_handler_uaccess); -__visible bool ex_handler_copy(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_copy(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) { WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?"); - regs->ip = ex_fixup_addr(fixup); - regs->ax = trapnr; - return true; + return ex_handler_fault(fixup, regs, trapnr); } -EXPORT_SYMBOL(ex_handler_copy); -__visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_msr(const struct exception_table_entry *fixup, + struct pt_regs *regs, bool wrmsr, bool safe, int reg) { - if (pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", + if (!safe && wrmsr && + pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", + (unsigned int)regs->cx, (unsigned int)regs->dx, + (unsigned int)regs->ax, regs->ip, (void *)regs->ip)) + show_stack_regs(regs); + + if (!safe && !wrmsr && + pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", (unsigned int)regs->cx, regs->ip, (void *)regs->ip)) show_stack_regs(regs); - /* Pretend that the read succeeded and returned 0. */ - regs->ip = ex_fixup_addr(fixup); - regs->ax = 0; - regs->dx = 0; - return true; -} -EXPORT_SYMBOL(ex_handler_rdmsr_unsafe); + if (!wrmsr) { + /* Pretend that the read succeeded and returned 0. */ + regs->ax = 0; + regs->dx = 0; + } -__visible bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) -{ - if (pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", - (unsigned int)regs->cx, (unsigned int)regs->dx, - (unsigned int)regs->ax, regs->ip, (void *)regs->ip)) - show_stack_regs(regs); + if (safe) + *pt_regs_nr(regs, reg) = -EIO; - /* Pretend that the write succeeded. */ - regs->ip = ex_fixup_addr(fixup); - return true; + return ex_handler_default(fixup, regs); } -EXPORT_SYMBOL(ex_handler_wrmsr_unsafe); -__visible bool ex_handler_clear_fs(const struct exception_table_entry *fixup, - struct pt_regs *regs, int trapnr, - unsigned long error_code, - unsigned long fault_addr) +static bool ex_handler_clear_fs(const struct exception_table_entry *fixup, + struct pt_regs *regs) { if (static_cpu_has(X86_BUG_NULL_SEG)) asm volatile ("mov %0, %%fs" : : "rm" (__USER_DS)); asm volatile ("mov %0, %%fs" : : "rm" (0)); - return ex_handler_default(fixup, regs, trapnr, error_code, fault_addr); + return ex_handler_default(fixup, regs); } -EXPORT_SYMBOL(ex_handler_clear_fs); -enum handler_type ex_get_fault_handler_type(unsigned long ip) +static bool ex_handler_imm_reg(const struct exception_table_entry *fixup, + struct pt_regs *regs, int reg, int imm) { - const struct exception_table_entry *e; - ex_handler_t handler; + *pt_regs_nr(regs, reg) = (long)imm; + return ex_handler_default(fixup, regs); +} - e = search_exception_tables(ip); - if (!e) - return EX_HANDLER_NONE; - handler = ex_fixup_handler(e); - if (handler == ex_handler_fault) - return EX_HANDLER_FAULT; - else if (handler == ex_handler_uaccess || handler == ex_handler_copy) - return EX_HANDLER_UACCESS; - else - return EX_HANDLER_OTHER; +static bool ex_handler_ucopy_len(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr, int reg, int imm) +{ + regs->cx = imm * regs->cx + *pt_regs_nr(regs, reg); + return ex_handler_uaccess(fixup, regs, trapnr); +} + +int ex_get_fixup_type(unsigned long ip) +{ + const struct exception_table_entry *e = search_exception_tables(ip); + + return e ? FIELD_GET(EX_DATA_TYPE_MASK, e->data) : EX_TYPE_NONE; } int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, unsigned long fault_addr) { const struct exception_table_entry *e; - ex_handler_t handler; + int type, reg, imm; #ifdef CONFIG_PNPBIOS if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) { @@ -179,8 +171,52 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, if (!e) return 0; - handler = ex_fixup_handler(e); - return handler(e, regs, trapnr, error_code, fault_addr); + type = FIELD_GET(EX_DATA_TYPE_MASK, e->data); + reg = FIELD_GET(EX_DATA_REG_MASK, e->data); + imm = FIELD_GET(EX_DATA_IMM_MASK, e->data); + + switch (type) { + case EX_TYPE_DEFAULT: + case EX_TYPE_DEFAULT_MCE_SAFE: + return ex_handler_default(e, regs); + case EX_TYPE_FAULT: + case EX_TYPE_FAULT_MCE_SAFE: + return ex_handler_fault(e, regs, trapnr); + case EX_TYPE_UACCESS: + return ex_handler_uaccess(e, regs, trapnr); + case EX_TYPE_COPY: + return ex_handler_copy(e, regs, trapnr); + case EX_TYPE_CLEAR_FS: + return ex_handler_clear_fs(e, regs); + case EX_TYPE_FPU_RESTORE: + return ex_handler_fprestore(e, regs); + case EX_TYPE_BPF: + return ex_handler_bpf(e, regs); + case EX_TYPE_WRMSR: + return ex_handler_msr(e, regs, true, false, reg); + case EX_TYPE_RDMSR: + return ex_handler_msr(e, regs, false, false, reg); + case EX_TYPE_WRMSR_SAFE: + return ex_handler_msr(e, regs, true, true, reg); + case EX_TYPE_RDMSR_SAFE: + return ex_handler_msr(e, regs, false, true, reg); + case EX_TYPE_WRMSR_IN_MCE: + ex_handler_msr_mce(regs, true); + break; + case EX_TYPE_RDMSR_IN_MCE: + ex_handler_msr_mce(regs, false); + break; + case EX_TYPE_POP_REG: + regs->sp += sizeof(long); + fallthrough; + case EX_TYPE_IMM_REG: + return ex_handler_imm_reg(e, regs, reg, imm); + case EX_TYPE_FAULT_SGX: + return ex_handler_sgx(e, regs, trapnr); + case EX_TYPE_UCOPY_LEN: + return ex_handler_ucopy_len(e, regs, trapnr, reg, imm); + } + BUG(); } extern unsigned int early_recursion_flag; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 84a2c8c4af73..d0074c6ed31a 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -32,6 +32,7 @@ #include <asm/pgtable_areas.h> /* VMALLOC_START, ... */ #include <asm/kvm_para.h> /* kvm_handle_async_pf */ #include <asm/vdso.h> /* fixup_vdso_exception() */ +#include <asm/irq_stack.h> #define CREATE_TRACE_POINTS #include <asm/trace/exceptions.h> @@ -631,6 +632,9 @@ static noinline void page_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) { +#ifdef CONFIG_VMAP_STACK + struct stack_info info; +#endif unsigned long flags; int sig; @@ -649,9 +653,7 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code, * that we're in vmalloc space to avoid this. */ if (is_vmalloc_addr((void *)address) && - (((unsigned long)current->stack - 1 - address < PAGE_SIZE) || - address - ((unsigned long)current->stack + THREAD_SIZE) < PAGE_SIZE)) { - unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *); + get_stack_guard_info((void *)address, &info)) { /* * We're likely to be running with very little stack space * left. It's plausible that we'd hit this condition but @@ -662,13 +664,11 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code, * and then double-fault, though, because we're likely to * break the console driver and lose most of the stack dump. */ - asm volatile ("movq %[stack], %%rsp\n\t" - "call handle_stack_overflow\n\t" - "1: jmp 1b" - : ASM_CALL_CONSTRAINT - : "D" ("kernel stack overflow (page fault)"), - "S" (regs), "d" (address), - [stack] "rm" (stack)); + call_on_stack(__this_cpu_ist_top_va(DF) - sizeof(void*), + handle_stack_overflow, + ASM_CALL_ARG3, + , [arg1] "r" (regs), [arg2] "r" (address), [arg3] "r" (&info)); + unreachable(); } #endif @@ -1413,8 +1413,7 @@ good_area: * and if there is a fatal signal pending there is no guarantee * that we made any progress. Handle this case first. */ - if (unlikely((fault & VM_FAULT_RETRY) && - (flags & FAULT_FLAG_ALLOW_RETRY))) { + if (unlikely(fault & VM_FAULT_RETRY)) { flags |= FAULT_FLAG_TRIED; goto retry; } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 23a14d82e783..4ba024d5b63a 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -618,7 +618,7 @@ static void __init memory_map_top_down(unsigned long map_start, */ addr = memblock_phys_alloc_range(PMD_SIZE, PMD_SIZE, map_start, map_end); - memblock_free(addr, PMD_SIZE); + memblock_phys_free(addr, PMD_SIZE); real_end = addr + PMD_SIZE; /* step_size need to be small so pgt_buf from BRK could cover it */ @@ -714,6 +714,11 @@ static void __init memory_map_bottom_up(unsigned long map_start, static void __init init_trampoline(void) { #ifdef CONFIG_X86_64 + /* + * The code below will alias kernel page-tables in the user-range of the + * address space, including the Global bit. So global TLB entries will + * be created when using the trampoline page-table. + */ if (!kaslr_memory_enabled()) trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)]; else diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index bd90b8fe81e4..d4e2648a1dfb 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -238,11 +238,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base) } } -/* - * The <linux/kallsyms.h> already defines is_kernel_text, - * using '__' prefix not to get in conflict. - */ -static inline int __is_kernel_text(unsigned long addr) +static inline int is_x86_32_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end) return 1; @@ -333,8 +329,8 @@ repeat: addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1; - if (__is_kernel_text(addr) || - __is_kernel_text(addr2)) + if (is_x86_32_kernel_text(addr) || + is_x86_32_kernel_text(addr2)) prot = PAGE_KERNEL_LARGE_EXEC; pages_2m++; @@ -359,7 +355,7 @@ repeat: */ pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR); - if (__is_kernel_text(addr)) + if (is_x86_32_kernel_text(addr)) prot = PAGE_KERNEL_EXEC; pages_4k++; @@ -779,37 +775,6 @@ void __init mem_init(void) test_wp_bit(); } -#ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, - struct mhp_params *params) -{ - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long nr_pages = size >> PAGE_SHIFT; - int ret; - - /* - * The page tables were already mapped at boot so if the caller - * requests a different mapping type then we must change all the - * pages with __set_memory_prot(). - */ - if (params->pgprot.pgprot != PAGE_KERNEL.pgprot) { - ret = __set_memory_prot(start, nr_pages, params->pgprot); - if (ret) - return ret; - } - - return __add_pages(nid, start_pfn, nr_pages, params); -} - -void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) -{ - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long nr_pages = size >> PAGE_SHIFT; - - __remove_pages(start_pfn, nr_pages, altmap); -} -#endif - int kernel_set_to_readonly __read_mostly; static void mark_nxdata_nx(void) @@ -820,7 +785,7 @@ static void mark_nxdata_nx(void) */ unsigned long start = PFN_ALIGN(_etext); /* - * This comes from __is_kernel_text upper limit. Also HPAGE where used: + * This comes from is_x86_32_kernel_text upper limit. Also HPAGE where used: */ unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 36098226a957..96d34ebb20a9 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -981,7 +981,7 @@ static void __meminit free_pagetable(struct page *page, int order) if (PageReserved(page)) { __ClearPageReserved(page); - magic = (unsigned long)page->freelist; + magic = page->index; if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) { while (nr_pages--) put_page_bootmem(page++); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 60ade7dd71bd..026031b3b782 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -14,7 +14,7 @@ #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/mmiotrace.h> -#include <linux/mem_encrypt.h> +#include <linux/cc_platform.h> #include <linux/efi.h> #include <linux/pgtable.h> @@ -92,7 +92,7 @@ static unsigned int __ioremap_check_ram(struct resource *res) */ static unsigned int __ioremap_check_encrypted(struct resource *res) { - if (!sev_active()) + if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) return 0; switch (res->desc) { @@ -112,7 +112,7 @@ static unsigned int __ioremap_check_encrypted(struct resource *res) */ static void __ioremap_check_other(resource_size_t addr, struct ioremap_desc *desc) { - if (!sev_active()) + if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) return; if (!IS_ENABLED(CONFIG_EFI)) @@ -508,6 +508,7 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) memunmap((void *)((unsigned long)addr & PAGE_MASK)); } +#ifdef CONFIG_AMD_MEM_ENCRYPT /* * Examine the physical address to determine if it is an area of memory * that should be mapped decrypted. If the memory is not part of the @@ -555,7 +556,7 @@ static bool memremap_should_map_decrypted(resource_size_t phys_addr, case E820_TYPE_NVS: case E820_TYPE_UNUSABLE: /* For SEV, these areas are encrypted */ - if (sev_active()) + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) break; fallthrough; @@ -693,7 +694,7 @@ static bool __init early_memremap_is_setup_data(resource_size_t phys_addr, bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size, unsigned long flags) { - if (!mem_encrypt_active()) + if (!cc_platform_has(CC_ATTR_MEM_ENCRYPT)) return true; if (flags & MEMREMAP_ENC) @@ -702,7 +703,7 @@ bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size, if (flags & MEMREMAP_DEC) return false; - if (sme_active()) { + if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) { if (memremap_is_setup_data(phys_addr, size) || memremap_is_efi_data(phys_addr, size)) return false; @@ -723,12 +724,12 @@ pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr, { bool encrypted_prot; - if (!mem_encrypt_active()) + if (!cc_platform_has(CC_ATTR_MEM_ENCRYPT)) return prot; encrypted_prot = true; - if (sme_active()) { + if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) { if (early_memremap_is_setup_data(phys_addr, size) || memremap_is_efi_data(phys_addr, size)) encrypted_prot = false; @@ -746,7 +747,6 @@ bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size) return arch_memremap_can_ram_remap(phys_addr, size, 0); } -#ifdef CONFIG_AMD_MEM_ENCRYPT /* Remap memory with encryption */ void __init *early_memremap_encrypted(resource_size_t phys_addr, unsigned long size) diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index ef885370719a..e7b9b464a82f 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -49,7 +49,7 @@ static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr, p = early_alloc(PMD_SIZE, nid, false); if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL)) return; - memblock_free_ptr(p, PMD_SIZE); + memblock_free(p, PMD_SIZE); } p = early_alloc(PAGE_SIZE, nid, true); @@ -85,7 +85,7 @@ static void __init kasan_populate_pud(pud_t *pud, unsigned long addr, p = early_alloc(PUD_SIZE, nid, false); if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL)) return; - memblock_free_ptr(p, PUD_SIZE); + memblock_free(p, PUD_SIZE); } p = early_alloc(PAGE_SIZE, nid, true); diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index ff08dc463634..50d209939c66 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -1,401 +1,26 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * AMD Memory Encryption Support + * Memory Encryption Support Common Code * * Copyright (C) 2016 Advanced Micro Devices, Inc. * * Author: Tom Lendacky <thomas.lendacky@amd.com> */ -#define DISABLE_BRANCH_PROFILING - -#include <linux/linkage.h> -#include <linux/init.h> -#include <linux/mm.h> #include <linux/dma-direct.h> +#include <linux/dma-mapping.h> #include <linux/swiotlb.h> +#include <linux/cc_platform.h> #include <linux/mem_encrypt.h> -#include <linux/device.h> -#include <linux/kernel.h> -#include <linux/bitops.h> -#include <linux/dma-mapping.h> #include <linux/virtio_config.h> -#include <asm/tlbflush.h> -#include <asm/fixmap.h> -#include <asm/setup.h> -#include <asm/bootparam.h> -#include <asm/set_memory.h> -#include <asm/cacheflush.h> -#include <asm/processor-flags.h> -#include <asm/msr.h> -#include <asm/cmdline.h> - -#include "mm_internal.h" - -/* - * Since SME related variables are set early in the boot process they must - * reside in the .data section so as not to be zeroed out when the .bss - * section is later cleared. - */ -u64 sme_me_mask __section(".data") = 0; -u64 sev_status __section(".data") = 0; -u64 sev_check_data __section(".data") = 0; -EXPORT_SYMBOL(sme_me_mask); -DEFINE_STATIC_KEY_FALSE(sev_enable_key); -EXPORT_SYMBOL_GPL(sev_enable_key); - -/* Buffer used for early in-place encryption by BSP, no locking needed */ -static char sme_early_buffer[PAGE_SIZE] __initdata __aligned(PAGE_SIZE); - -/* - * This routine does not change the underlying encryption setting of the - * page(s) that map this memory. It assumes that eventually the memory is - * meant to be accessed as either encrypted or decrypted but the contents - * are currently not in the desired state. - * - * This routine follows the steps outlined in the AMD64 Architecture - * Programmer's Manual Volume 2, Section 7.10.8 Encrypt-in-Place. - */ -static void __init __sme_early_enc_dec(resource_size_t paddr, - unsigned long size, bool enc) -{ - void *src, *dst; - size_t len; - - if (!sme_me_mask) - return; - - wbinvd(); - - /* - * There are limited number of early mapping slots, so map (at most) - * one page at time. - */ - while (size) { - len = min_t(size_t, sizeof(sme_early_buffer), size); - - /* - * Create mappings for the current and desired format of - * the memory. Use a write-protected mapping for the source. - */ - src = enc ? early_memremap_decrypted_wp(paddr, len) : - early_memremap_encrypted_wp(paddr, len); - - dst = enc ? early_memremap_encrypted(paddr, len) : - early_memremap_decrypted(paddr, len); - - /* - * If a mapping can't be obtained to perform the operation, - * then eventual access of that area in the desired mode - * will cause a crash. - */ - BUG_ON(!src || !dst); - - /* - * Use a temporary buffer, of cache-line multiple size, to - * avoid data corruption as documented in the APM. - */ - memcpy(sme_early_buffer, src, len); - memcpy(dst, sme_early_buffer, len); - - early_memunmap(dst, len); - early_memunmap(src, len); - - paddr += len; - size -= len; - } -} - -void __init sme_early_encrypt(resource_size_t paddr, unsigned long size) -{ - __sme_early_enc_dec(paddr, size, true); -} - -void __init sme_early_decrypt(resource_size_t paddr, unsigned long size) -{ - __sme_early_enc_dec(paddr, size, false); -} - -static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size, - bool map) -{ - unsigned long paddr = (unsigned long)vaddr - __PAGE_OFFSET; - pmdval_t pmd_flags, pmd; - - /* Use early_pmd_flags but remove the encryption mask */ - pmd_flags = __sme_clr(early_pmd_flags); - - do { - pmd = map ? (paddr & PMD_MASK) + pmd_flags : 0; - __early_make_pgtable((unsigned long)vaddr, pmd); - - vaddr += PMD_SIZE; - paddr += PMD_SIZE; - size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE; - } while (size); - - flush_tlb_local(); -} - -void __init sme_unmap_bootdata(char *real_mode_data) -{ - struct boot_params *boot_data; - unsigned long cmdline_paddr; - - if (!sme_active()) - return; - - /* Get the command line address before unmapping the real_mode_data */ - boot_data = (struct boot_params *)real_mode_data; - cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32); - - __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), false); - - if (!cmdline_paddr) - return; - - __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, false); -} - -void __init sme_map_bootdata(char *real_mode_data) -{ - struct boot_params *boot_data; - unsigned long cmdline_paddr; - - if (!sme_active()) - return; - - __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), true); - - /* Get the command line address after mapping the real_mode_data */ - boot_data = (struct boot_params *)real_mode_data; - cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32); - - if (!cmdline_paddr) - return; - - __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true); -} - -void __init sme_early_init(void) -{ - unsigned int i; - - if (!sme_me_mask) - return; - - early_pmd_flags = __sme_set(early_pmd_flags); - - __supported_pte_mask = __sme_set(__supported_pte_mask); - - /* Update the protection map with memory encryption mask */ - for (i = 0; i < ARRAY_SIZE(protection_map); i++) - protection_map[i] = pgprot_encrypted(protection_map[i]); - - if (sev_active()) - swiotlb_force = SWIOTLB_FORCE; -} - -void __init sev_setup_arch(void) -{ - phys_addr_t total_mem = memblock_phys_mem_size(); - unsigned long size; - - if (!sev_active()) - return; - - /* - * For SEV, all DMA has to occur via shared/unencrypted pages. - * SEV uses SWIOTLB to make this happen without changing device - * drivers. However, depending on the workload being run, the - * default 64MB of SWIOTLB may not be enough and SWIOTLB may - * run out of buffers for DMA, resulting in I/O errors and/or - * performance degradation especially with high I/O workloads. - * - * Adjust the default size of SWIOTLB for SEV guests using - * a percentage of guest memory for SWIOTLB buffers. - * Also, as the SWIOTLB bounce buffer memory is allocated - * from low memory, ensure that the adjusted size is within - * the limits of low available memory. - * - * The percentage of guest memory used here for SWIOTLB buffers - * is more of an approximation of the static adjustment which - * 64MB for <1G, and ~128M to 256M for 1G-to-4G, i.e., the 6% - */ - size = total_mem * 6 / 100; - size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G); - swiotlb_adjust_size(size); -} - -static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc) -{ - pgprot_t old_prot, new_prot; - unsigned long pfn, pa, size; - pte_t new_pte; - - switch (level) { - case PG_LEVEL_4K: - pfn = pte_pfn(*kpte); - old_prot = pte_pgprot(*kpte); - break; - case PG_LEVEL_2M: - pfn = pmd_pfn(*(pmd_t *)kpte); - old_prot = pmd_pgprot(*(pmd_t *)kpte); - break; - case PG_LEVEL_1G: - pfn = pud_pfn(*(pud_t *)kpte); - old_prot = pud_pgprot(*(pud_t *)kpte); - break; - default: - return; - } - - new_prot = old_prot; - if (enc) - pgprot_val(new_prot) |= _PAGE_ENC; - else - pgprot_val(new_prot) &= ~_PAGE_ENC; - - /* If prot is same then do nothing. */ - if (pgprot_val(old_prot) == pgprot_val(new_prot)) - return; - - pa = pfn << PAGE_SHIFT; - size = page_level_size(level); - - /* - * We are going to perform in-place en-/decryption and change the - * physical page attribute from C=1 to C=0 or vice versa. Flush the - * caches to ensure that data gets accessed with the correct C-bit. - */ - clflush_cache_range(__va(pa), size); - - /* Encrypt/decrypt the contents in-place */ - if (enc) - sme_early_encrypt(pa, size); - else - sme_early_decrypt(pa, size); - - /* Change the page encryption mask. */ - new_pte = pfn_pte(pfn, new_prot); - set_pte_atomic(kpte, new_pte); -} - -static int __init early_set_memory_enc_dec(unsigned long vaddr, - unsigned long size, bool enc) -{ - unsigned long vaddr_end, vaddr_next; - unsigned long psize, pmask; - int split_page_size_mask; - int level, ret; - pte_t *kpte; - - vaddr_next = vaddr; - vaddr_end = vaddr + size; - - for (; vaddr < vaddr_end; vaddr = vaddr_next) { - kpte = lookup_address(vaddr, &level); - if (!kpte || pte_none(*kpte)) { - ret = 1; - goto out; - } - - if (level == PG_LEVEL_4K) { - __set_clr_pte_enc(kpte, level, enc); - vaddr_next = (vaddr & PAGE_MASK) + PAGE_SIZE; - continue; - } - - psize = page_level_size(level); - pmask = page_level_mask(level); - - /* - * Check whether we can change the large page in one go. - * We request a split when the address is not aligned and - * the number of pages to set/clear encryption bit is smaller - * than the number of pages in the large page. - */ - if (vaddr == (vaddr & pmask) && - ((vaddr_end - vaddr) >= psize)) { - __set_clr_pte_enc(kpte, level, enc); - vaddr_next = (vaddr & pmask) + psize; - continue; - } - - /* - * The virtual address is part of a larger page, create the next - * level page table mapping (4K or 2M). If it is part of a 2M - * page then we request a split of the large page into 4K - * chunks. A 1GB large page is split into 2M pages, resp. - */ - if (level == PG_LEVEL_2M) - split_page_size_mask = 0; - else - split_page_size_mask = 1 << PG_LEVEL_2M; - - /* - * kernel_physical_mapping_change() does not flush the TLBs, so - * a TLB flush is required after we exit from the for loop. - */ - kernel_physical_mapping_change(__pa(vaddr & pmask), - __pa((vaddr_end & pmask) + psize), - split_page_size_mask); - } - - ret = 0; - -out: - __flush_tlb_all(); - return ret; -} - -int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size) -{ - return early_set_memory_enc_dec(vaddr, size, false); -} - -int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size) -{ - return early_set_memory_enc_dec(vaddr, size, true); -} - -/* - * SME and SEV are very similar but they are not the same, so there are - * times that the kernel will need to distinguish between SME and SEV. The - * sme_active() and sev_active() functions are used for this. When a - * distinction isn't needed, the mem_encrypt_active() function can be used. - * - * The trampoline code is a good example for this requirement. Before - * paging is activated, SME will access all memory as decrypted, but SEV - * will access all memory as encrypted. So, when APs are being brought - * up under SME the trampoline area cannot be encrypted, whereas under SEV - * the trampoline area must be encrypted. - */ -bool sev_active(void) -{ - return sev_status & MSR_AMD64_SEV_ENABLED; -} - -bool sme_active(void) -{ - return sme_me_mask && !sev_active(); -} -EXPORT_SYMBOL_GPL(sev_active); - -/* Needs to be called from non-instrumentable code */ -bool noinstr sev_es_active(void) -{ - return sev_status & MSR_AMD64_SEV_ES_ENABLED; -} - /* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */ bool force_dma_unencrypted(struct device *dev) { /* * For SEV, all DMA must be to unencrypted addresses. */ - if (sev_active()) + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) return true; /* @@ -403,7 +28,7 @@ bool force_dma_unencrypted(struct device *dev) * device does not support DMA to addresses that include the * encryption mask. */ - if (sme_active()) { + if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) { u64 dma_enc_mask = DMA_BIT_MASK(__ffs64(sme_me_mask)); u64 dma_dev_mask = min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); @@ -415,36 +40,12 @@ bool force_dma_unencrypted(struct device *dev) return false; } -void __init mem_encrypt_free_decrypted_mem(void) -{ - unsigned long vaddr, vaddr_end, npages; - int r; - - vaddr = (unsigned long)__start_bss_decrypted_unused; - vaddr_end = (unsigned long)__end_bss_decrypted; - npages = (vaddr_end - vaddr) >> PAGE_SHIFT; - - /* - * The unused memory range was mapped decrypted, change the encryption - * attribute from decrypted to encrypted before freeing it. - */ - if (mem_encrypt_active()) { - r = set_memory_encrypted(vaddr, npages); - if (r) { - pr_warn("failed to free unused decrypted pages\n"); - return; - } - } - - free_init_pages("unused decrypted", vaddr, vaddr_end); -} - static void print_mem_encrypt_feature_info(void) { pr_info("AMD Memory Encryption Features active:"); /* Secure Memory Encryption */ - if (sme_active()) { + if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) { /* * SME is mutually exclusive with any of the SEV * features below. @@ -454,11 +55,11 @@ static void print_mem_encrypt_feature_info(void) } /* Secure Encrypted Virtualization */ - if (sev_active()) + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) pr_cont(" SEV"); /* Encrypted Register State */ - if (sev_es_active()) + if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) pr_cont(" SEV-ES"); pr_cont("\n"); @@ -467,24 +68,17 @@ static void print_mem_encrypt_feature_info(void) /* Architecture __weak replacement functions */ void __init mem_encrypt_init(void) { - if (!sme_me_mask) + if (!cc_platform_has(CC_ATTR_MEM_ENCRYPT)) return; /* Call into SWIOTLB to update the SWIOTLB DMA buffers */ swiotlb_update_mem_attributes(); - /* - * With SEV, we need to unroll the rep string I/O instructions, - * but SEV-ES supports them through the #VC handler. - */ - if (sev_active() && !sev_es_active()) - static_branch_enable(&sev_enable_key); - print_mem_encrypt_feature_info(); } int arch_has_restricted_virtio_memory_access(void) { - return sev_active(); + return cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT); } EXPORT_SYMBOL_GPL(arch_has_restricted_virtio_memory_access); diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c new file mode 100644 index 000000000000..2b2d018ea345 --- /dev/null +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -0,0 +1,438 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + */ + +#define DISABLE_BRANCH_PROFILING + +#include <linux/linkage.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/dma-direct.h> +#include <linux/swiotlb.h> +#include <linux/mem_encrypt.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/bitops.h> +#include <linux/dma-mapping.h> +#include <linux/virtio_config.h> +#include <linux/cc_platform.h> + +#include <asm/tlbflush.h> +#include <asm/fixmap.h> +#include <asm/setup.h> +#include <asm/bootparam.h> +#include <asm/set_memory.h> +#include <asm/cacheflush.h> +#include <asm/processor-flags.h> +#include <asm/msr.h> +#include <asm/cmdline.h> + +#include "mm_internal.h" + +/* + * Since SME related variables are set early in the boot process they must + * reside in the .data section so as not to be zeroed out when the .bss + * section is later cleared. + */ +u64 sme_me_mask __section(".data") = 0; +u64 sev_status __section(".data") = 0; +u64 sev_check_data __section(".data") = 0; +EXPORT_SYMBOL(sme_me_mask); + +/* Buffer used for early in-place encryption by BSP, no locking needed */ +static char sme_early_buffer[PAGE_SIZE] __initdata __aligned(PAGE_SIZE); + +/* + * This routine does not change the underlying encryption setting of the + * page(s) that map this memory. It assumes that eventually the memory is + * meant to be accessed as either encrypted or decrypted but the contents + * are currently not in the desired state. + * + * This routine follows the steps outlined in the AMD64 Architecture + * Programmer's Manual Volume 2, Section 7.10.8 Encrypt-in-Place. + */ +static void __init __sme_early_enc_dec(resource_size_t paddr, + unsigned long size, bool enc) +{ + void *src, *dst; + size_t len; + + if (!sme_me_mask) + return; + + wbinvd(); + + /* + * There are limited number of early mapping slots, so map (at most) + * one page at time. + */ + while (size) { + len = min_t(size_t, sizeof(sme_early_buffer), size); + + /* + * Create mappings for the current and desired format of + * the memory. Use a write-protected mapping for the source. + */ + src = enc ? early_memremap_decrypted_wp(paddr, len) : + early_memremap_encrypted_wp(paddr, len); + + dst = enc ? early_memremap_encrypted(paddr, len) : + early_memremap_decrypted(paddr, len); + + /* + * If a mapping can't be obtained to perform the operation, + * then eventual access of that area in the desired mode + * will cause a crash. + */ + BUG_ON(!src || !dst); + + /* + * Use a temporary buffer, of cache-line multiple size, to + * avoid data corruption as documented in the APM. + */ + memcpy(sme_early_buffer, src, len); + memcpy(dst, sme_early_buffer, len); + + early_memunmap(dst, len); + early_memunmap(src, len); + + paddr += len; + size -= len; + } +} + +void __init sme_early_encrypt(resource_size_t paddr, unsigned long size) +{ + __sme_early_enc_dec(paddr, size, true); +} + +void __init sme_early_decrypt(resource_size_t paddr, unsigned long size) +{ + __sme_early_enc_dec(paddr, size, false); +} + +static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size, + bool map) +{ + unsigned long paddr = (unsigned long)vaddr - __PAGE_OFFSET; + pmdval_t pmd_flags, pmd; + + /* Use early_pmd_flags but remove the encryption mask */ + pmd_flags = __sme_clr(early_pmd_flags); + + do { + pmd = map ? (paddr & PMD_MASK) + pmd_flags : 0; + __early_make_pgtable((unsigned long)vaddr, pmd); + + vaddr += PMD_SIZE; + paddr += PMD_SIZE; + size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE; + } while (size); + + flush_tlb_local(); +} + +void __init sme_unmap_bootdata(char *real_mode_data) +{ + struct boot_params *boot_data; + unsigned long cmdline_paddr; + + if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) + return; + + /* Get the command line address before unmapping the real_mode_data */ + boot_data = (struct boot_params *)real_mode_data; + cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32); + + __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), false); + + if (!cmdline_paddr) + return; + + __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, false); +} + +void __init sme_map_bootdata(char *real_mode_data) +{ + struct boot_params *boot_data; + unsigned long cmdline_paddr; + + if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) + return; + + __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), true); + + /* Get the command line address after mapping the real_mode_data */ + boot_data = (struct boot_params *)real_mode_data; + cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32); + + if (!cmdline_paddr) + return; + + __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true); +} + +void __init sme_early_init(void) +{ + unsigned int i; + + if (!sme_me_mask) + return; + + early_pmd_flags = __sme_set(early_pmd_flags); + + __supported_pte_mask = __sme_set(__supported_pte_mask); + + /* Update the protection map with memory encryption mask */ + for (i = 0; i < ARRAY_SIZE(protection_map); i++) + protection_map[i] = pgprot_encrypted(protection_map[i]); + + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) + swiotlb_force = SWIOTLB_FORCE; +} + +void __init sev_setup_arch(void) +{ + phys_addr_t total_mem = memblock_phys_mem_size(); + unsigned long size; + + if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) + return; + + /* + * For SEV, all DMA has to occur via shared/unencrypted pages. + * SEV uses SWIOTLB to make this happen without changing device + * drivers. However, depending on the workload being run, the + * default 64MB of SWIOTLB may not be enough and SWIOTLB may + * run out of buffers for DMA, resulting in I/O errors and/or + * performance degradation especially with high I/O workloads. + * + * Adjust the default size of SWIOTLB for SEV guests using + * a percentage of guest memory for SWIOTLB buffers. + * Also, as the SWIOTLB bounce buffer memory is allocated + * from low memory, ensure that the adjusted size is within + * the limits of low available memory. + * + * The percentage of guest memory used here for SWIOTLB buffers + * is more of an approximation of the static adjustment which + * 64MB for <1G, and ~128M to 256M for 1G-to-4G, i.e., the 6% + */ + size = total_mem * 6 / 100; + size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G); + swiotlb_adjust_size(size); +} + +static unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot) +{ + unsigned long pfn = 0; + pgprot_t prot; + + switch (level) { + case PG_LEVEL_4K: + pfn = pte_pfn(*kpte); + prot = pte_pgprot(*kpte); + break; + case PG_LEVEL_2M: + pfn = pmd_pfn(*(pmd_t *)kpte); + prot = pmd_pgprot(*(pmd_t *)kpte); + break; + case PG_LEVEL_1G: + pfn = pud_pfn(*(pud_t *)kpte); + prot = pud_pgprot(*(pud_t *)kpte); + break; + default: + WARN_ONCE(1, "Invalid level for kpte\n"); + return 0; + } + + if (ret_prot) + *ret_prot = prot; + + return pfn; +} + +void notify_range_enc_status_changed(unsigned long vaddr, int npages, bool enc) +{ +#ifdef CONFIG_PARAVIRT + unsigned long sz = npages << PAGE_SHIFT; + unsigned long vaddr_end = vaddr + sz; + + while (vaddr < vaddr_end) { + int psize, pmask, level; + unsigned long pfn; + pte_t *kpte; + + kpte = lookup_address(vaddr, &level); + if (!kpte || pte_none(*kpte)) { + WARN_ONCE(1, "kpte lookup for vaddr\n"); + return; + } + + pfn = pg_level_to_pfn(level, kpte, NULL); + if (!pfn) + continue; + + psize = page_level_size(level); + pmask = page_level_mask(level); + + notify_page_enc_status_changed(pfn, psize >> PAGE_SHIFT, enc); + + vaddr = (vaddr & pmask) + psize; + } +#endif +} + +static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc) +{ + pgprot_t old_prot, new_prot; + unsigned long pfn, pa, size; + pte_t new_pte; + + pfn = pg_level_to_pfn(level, kpte, &old_prot); + if (!pfn) + return; + + new_prot = old_prot; + if (enc) + pgprot_val(new_prot) |= _PAGE_ENC; + else + pgprot_val(new_prot) &= ~_PAGE_ENC; + + /* If prot is same then do nothing. */ + if (pgprot_val(old_prot) == pgprot_val(new_prot)) + return; + + pa = pfn << PAGE_SHIFT; + size = page_level_size(level); + + /* + * We are going to perform in-place en-/decryption and change the + * physical page attribute from C=1 to C=0 or vice versa. Flush the + * caches to ensure that data gets accessed with the correct C-bit. + */ + clflush_cache_range(__va(pa), size); + + /* Encrypt/decrypt the contents in-place */ + if (enc) + sme_early_encrypt(pa, size); + else + sme_early_decrypt(pa, size); + + /* Change the page encryption mask. */ + new_pte = pfn_pte(pfn, new_prot); + set_pte_atomic(kpte, new_pte); +} + +static int __init early_set_memory_enc_dec(unsigned long vaddr, + unsigned long size, bool enc) +{ + unsigned long vaddr_end, vaddr_next, start; + unsigned long psize, pmask; + int split_page_size_mask; + int level, ret; + pte_t *kpte; + + start = vaddr; + vaddr_next = vaddr; + vaddr_end = vaddr + size; + + for (; vaddr < vaddr_end; vaddr = vaddr_next) { + kpte = lookup_address(vaddr, &level); + if (!kpte || pte_none(*kpte)) { + ret = 1; + goto out; + } + + if (level == PG_LEVEL_4K) { + __set_clr_pte_enc(kpte, level, enc); + vaddr_next = (vaddr & PAGE_MASK) + PAGE_SIZE; + continue; + } + + psize = page_level_size(level); + pmask = page_level_mask(level); + + /* + * Check whether we can change the large page in one go. + * We request a split when the address is not aligned and + * the number of pages to set/clear encryption bit is smaller + * than the number of pages in the large page. + */ + if (vaddr == (vaddr & pmask) && + ((vaddr_end - vaddr) >= psize)) { + __set_clr_pte_enc(kpte, level, enc); + vaddr_next = (vaddr & pmask) + psize; + continue; + } + + /* + * The virtual address is part of a larger page, create the next + * level page table mapping (4K or 2M). If it is part of a 2M + * page then we request a split of the large page into 4K + * chunks. A 1GB large page is split into 2M pages, resp. + */ + if (level == PG_LEVEL_2M) + split_page_size_mask = 0; + else + split_page_size_mask = 1 << PG_LEVEL_2M; + + /* + * kernel_physical_mapping_change() does not flush the TLBs, so + * a TLB flush is required after we exit from the for loop. + */ + kernel_physical_mapping_change(__pa(vaddr & pmask), + __pa((vaddr_end & pmask) + psize), + split_page_size_mask); + } + + ret = 0; + + notify_range_enc_status_changed(start, PAGE_ALIGN(size) >> PAGE_SHIFT, enc); +out: + __flush_tlb_all(); + return ret; +} + +int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size) +{ + return early_set_memory_enc_dec(vaddr, size, false); +} + +int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size) +{ + return early_set_memory_enc_dec(vaddr, size, true); +} + +void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, bool enc) +{ + notify_range_enc_status_changed(vaddr, npages, enc); +} + +void __init mem_encrypt_free_decrypted_mem(void) +{ + unsigned long vaddr, vaddr_end, npages; + int r; + + vaddr = (unsigned long)__start_bss_decrypted_unused; + vaddr_end = (unsigned long)__end_bss_decrypted; + npages = (vaddr_end - vaddr) >> PAGE_SHIFT; + + /* + * The unused memory range was mapped decrypted, change the encryption + * attribute from decrypted to encrypted before freeing it. + */ + if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) { + r = set_memory_encrypted(vaddr, npages); + if (r) { + pr_warn("failed to free unused decrypted pages\n"); + return; + } + } + + free_init_pages("unused decrypted", vaddr, vaddr_end); +} diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S index 17d292b7072f..3d1dba05fce4 100644 --- a/arch/x86/mm/mem_encrypt_boot.S +++ b/arch/x86/mm/mem_encrypt_boot.S @@ -65,7 +65,7 @@ SYM_FUNC_START(sme_encrypt_execute) movq %rbp, %rsp /* Restore original stack pointer */ pop %rbp - ret + RET SYM_FUNC_END(sme_encrypt_execute) SYM_FUNC_START(__enc_copy) @@ -151,6 +151,6 @@ SYM_FUNC_START(__enc_copy) pop %r12 pop %r15 - ret + RET .L__enc_copy_end: SYM_FUNC_END(__enc_copy) diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index 470b20208430..3f0abb403340 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -27,9 +27,19 @@ #undef CONFIG_PARAVIRT_XXL #undef CONFIG_PARAVIRT_SPINLOCKS +/* + * This code runs before CPU feature bits are set. By default, the + * pgtable_l5_enabled() function uses bit X86_FEATURE_LA57 to determine if + * 5-level paging is active, so that won't work here. USE_EARLY_PGTABLE_L5 + * is provided to handle this situation and, instead, use a variable that + * has been set by the early boot code. + */ +#define USE_EARLY_PGTABLE_L5 + #include <linux/kernel.h> #include <linux/mm.h> #include <linux/mem_encrypt.h> +#include <linux/cc_platform.h> #include <asm/setup.h> #include <asm/sections.h> @@ -287,7 +297,13 @@ void __init sme_encrypt_kernel(struct boot_params *bp) unsigned long pgtable_area_len; unsigned long decrypted_base; - if (!sme_active()) + /* + * This is early code, use an open coded check for SME instead of + * using cc_platform_has(). This eliminates worries about removing + * instrumentation or checking boot_cpu_data in the cc_platform_has() + * function. + */ + if (!sme_get_me_mask() || sev_status & MSR_AMD64_SEV_ENABLED) return; /* diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 1e9b93b088db..c6b1213086d6 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -355,7 +355,7 @@ void __init numa_reset_distance(void) /* numa_distance could be 1LU marking allocation failure, test cnt */ if (numa_distance_cnt) - memblock_free_ptr(numa_distance, size); + memblock_free(numa_distance, size); numa_distance_cnt = 0; numa_distance = NULL; /* enable table creation */ } diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index e801e30089c4..1a02b791d273 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -517,7 +517,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) } /* free the copied physical distance table */ - memblock_free_ptr(phys_dist, phys_size); + memblock_free(phys_dist, phys_size); return; no_emu: diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index ad8a5c586a35..b4072115c8ef 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -18,6 +18,7 @@ #include <linux/libnvdimm.h> #include <linux/vmstat.h> #include <linux/kernel.h> +#include <linux/cc_platform.h> #include <asm/e820/api.h> #include <asm/processor.h> @@ -29,6 +30,8 @@ #include <asm/proto.h> #include <asm/memtype.h> #include <asm/set_memory.h> +#include <asm/hyperv-tlfs.h> +#include <asm/mshyperv.h> #include "../mm_internal.h" @@ -1980,15 +1983,15 @@ int set_memory_global(unsigned long addr, int numpages) __pgprot(_PAGE_GLOBAL), 0); } -static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) +/* + * __set_memory_enc_pgtable() is used for the hypervisors that get + * informed about "encryption" status via page tables. + */ +static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc) { struct cpa_data cpa; int ret; - /* Nothing to do if memory encryption is not active */ - if (!mem_encrypt_active()) - return 0; - /* Should not be working on unaligned addresses */ if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr)) addr &= PAGE_MASK; @@ -2020,9 +2023,26 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) */ cpa_flush(&cpa, 0); + /* + * Notify hypervisor that a given memory range is mapped encrypted + * or decrypted. + */ + notify_range_enc_status_changed(addr, numpages, enc); + return ret; } +static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) +{ + if (hv_is_isolation_supported()) + return hv_set_mem_host_visibility(addr, numpages, !enc); + + if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) + return __set_memory_enc_pgtable(addr, numpages, enc); + + return 0; +} + int set_memory_encrypted(unsigned long addr, int numpages) { return __set_memory_enc_dec(addr, numpages, true); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 59ba2968af1b..a6cf56a14939 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -361,7 +361,7 @@ static void l1d_flush_evaluate(unsigned long prev_mm, unsigned long next_mm, static unsigned long mm_mangle_tif_spec_bits(struct task_struct *next) { - unsigned long next_tif = task_thread_info(next)->flags; + unsigned long next_tif = read_task_thread_flags(next); unsigned long spec_bits = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_SPEC_MASK; /* @@ -1148,7 +1148,7 @@ void flush_tlb_one_user(unsigned long addr) */ STATIC_NOPV void native_flush_tlb_global(void) { - unsigned long cr4, flags; + unsigned long flags; if (static_cpu_has(X86_FEATURE_INVPCID)) { /* @@ -1168,11 +1168,7 @@ STATIC_NOPV void native_flush_tlb_global(void) */ raw_local_irq_save(flags); - cr4 = this_cpu_read(cpu_tlbstate.cr4); - /* toggle PGE */ - native_write_cr4(cr4 ^ X86_CR4_PGE); - /* write old PGE again and flush TLBs */ - native_write_cr4(cr4); + __native_tlb_flush_global(this_cpu_read(cpu_tlbstate.cr4)); raw_local_irq_restore(flags); } diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 9ea57389c554..2b1e266ff95c 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1,9 +1,9 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * bpf_jit_comp.c: BPF JIT compiler + * BPF JIT compiler * * Copyright (C) 2011-2013 Eric Dumazet (eric.dumazet@gmail.com) - * Internal BPF Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */ #include <linux/netdevice.h> #include <linux/filter.h> @@ -15,7 +15,6 @@ #include <asm/set_memory.h> #include <asm/nospec-branch.h> #include <asm/text-patching.h> -#include <asm/asm-prototypes.h> static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) { @@ -225,6 +224,14 @@ static void jit_fill_hole(void *area, unsigned int size) struct jit_context { int cleanup_addr; /* Epilogue code offset */ + + /* + * Program specific offsets of labels in the code; these rely on the + * JIT doing at least 2 passes, recording the position on the first + * pass, only to generate the correct offset on the second pass. + */ + int tail_call_direct_label; + int tail_call_indirect_label; }; /* Maximum number of bytes emitted while JITing one eBPF insn */ @@ -380,20 +387,23 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, return __bpf_arch_text_poke(ip, t, old_addr, new_addr, true); } -static int get_pop_bytes(bool *callee_regs_used) +#define EMIT_LFENCE() EMIT3(0x0F, 0xAE, 0xE8) + +static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip) { - int bytes = 0; + u8 *prog = *pprog; - if (callee_regs_used[3]) - bytes += 2; - if (callee_regs_used[2]) - bytes += 2; - if (callee_regs_used[1]) - bytes += 2; - if (callee_regs_used[0]) - bytes += 1; +#ifdef CONFIG_RETPOLINE + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_AMD)) { + EMIT_LFENCE(); + EMIT2(0xFF, 0xE0 + reg); + } else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) { + emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip); + } else +#endif + EMIT2(0xFF, 0xE0 + reg); - return bytes; + *pprog = prog; } /* @@ -402,7 +412,7 @@ static int get_pop_bytes(bool *callee_regs_used) * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ... * if (index >= array->map.max_entries) * goto out; - * if (++tail_call_cnt > MAX_TAIL_CALL_CNT) + * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT) * goto out; * prog = array->ptrs[index]; * if (prog == NULL) @@ -411,29 +421,12 @@ static int get_pop_bytes(bool *callee_regs_used) * out: */ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, - u32 stack_depth) + u32 stack_depth, u8 *ip, + struct jit_context *ctx) { int tcc_off = -4 - round_up(stack_depth, 8); - u8 *prog = *pprog; - int pop_bytes = 0; - int off1 = 42; - int off2 = 31; - int off3 = 9; - - /* count the additional bytes used for popping callee regs from stack - * that need to be taken into account for each of the offsets that - * are used for bailing out of the tail call - */ - pop_bytes = get_pop_bytes(callee_regs_used); - off1 += pop_bytes; - off2 += pop_bytes; - off3 += pop_bytes; - - if (stack_depth) { - off1 += 7; - off2 += 7; - off3 += 7; - } + u8 *prog = *pprog, *start = *pprog; + int offset; /* * rdi - pointer to ctx @@ -448,17 +441,19 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, EMIT2(0x89, 0xD2); /* mov edx, edx */ EMIT3(0x39, 0x56, /* cmp dword ptr [rsi + 16], edx */ offsetof(struct bpf_array, map.max_entries)); -#define OFFSET1 (off1 + RETPOLINE_RCX_BPF_JIT_SIZE) /* Number of bytes to jump */ - EMIT2(X86_JBE, OFFSET1); /* jbe out */ + + offset = ctx->tail_call_indirect_label - (prog + 2 - start); + EMIT2(X86_JBE, offset); /* jbe out */ /* - * if (tail_call_cnt > MAX_TAIL_CALL_CNT) + * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT) * goto out; */ EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ -#define OFFSET2 (off2 + RETPOLINE_RCX_BPF_JIT_SIZE) - EMIT2(X86_JA, OFFSET2); /* ja out */ + + offset = ctx->tail_call_indirect_label - (prog + 2 - start); + EMIT2(X86_JAE, offset); /* jae out */ EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr [rbp - tcc_off], eax */ @@ -471,12 +466,11 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, * goto out; */ EMIT3(0x48, 0x85, 0xC9); /* test rcx,rcx */ -#define OFFSET3 (off3 + RETPOLINE_RCX_BPF_JIT_SIZE) - EMIT2(X86_JE, OFFSET3); /* je out */ - *pprog = prog; - pop_callee_regs(pprog, callee_regs_used); - prog = *pprog; + offset = ctx->tail_call_indirect_label - (prog + 2 - start); + EMIT2(X86_JE, offset); /* je out */ + + pop_callee_regs(&prog, callee_regs_used); EMIT1(0x58); /* pop rax */ if (stack_depth) @@ -493,70 +487,52 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, * rdi == ctx (1st arg) * rcx == prog->bpf_func + X86_TAIL_CALL_OFFSET */ - RETPOLINE_RCX_BPF_JIT(); + emit_indirect_jump(&prog, 1 /* rcx */, ip + (prog - start)); /* out: */ + ctx->tail_call_indirect_label = prog - start; *pprog = prog; } static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke, - u8 **pprog, int addr, u8 *image, - bool *callee_regs_used, u32 stack_depth) + u8 **pprog, u8 *ip, + bool *callee_regs_used, u32 stack_depth, + struct jit_context *ctx) { int tcc_off = -4 - round_up(stack_depth, 8); - u8 *prog = *pprog; - int pop_bytes = 0; - int off1 = 20; - int poke_off; - - /* count the additional bytes used for popping callee regs to stack - * that need to be taken into account for jump offset that is used for - * bailing out from of the tail call when limit is reached - */ - pop_bytes = get_pop_bytes(callee_regs_used); - off1 += pop_bytes; + u8 *prog = *pprog, *start = *pprog; + int offset; /* - * total bytes for: - * - nop5/ jmpq $off - * - pop callee regs - * - sub rsp, $val if depth > 0 - * - pop rax - */ - poke_off = X86_PATCH_SIZE + pop_bytes + 1; - if (stack_depth) { - poke_off += 7; - off1 += 7; - } - - /* - * if (tail_call_cnt > MAX_TAIL_CALL_CNT) + * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT) * goto out; */ EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ - EMIT2(X86_JA, off1); /* ja out */ + + offset = ctx->tail_call_direct_label - (prog + 2 - start); + EMIT2(X86_JAE, offset); /* jae out */ EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr [rbp - tcc_off], eax */ - poke->tailcall_bypass = image + (addr - poke_off - X86_PATCH_SIZE); + poke->tailcall_bypass = ip + (prog - start); poke->adj_off = X86_TAIL_CALL_OFFSET; - poke->tailcall_target = image + (addr - X86_PATCH_SIZE); + poke->tailcall_target = ip + ctx->tail_call_direct_label - X86_PATCH_SIZE; poke->bypass_addr = (u8 *)poke->tailcall_target + X86_PATCH_SIZE; emit_jump(&prog, (u8 *)poke->tailcall_target + X86_PATCH_SIZE, poke->tailcall_bypass); - *pprog = prog; - pop_callee_regs(pprog, callee_regs_used); - prog = *pprog; + pop_callee_regs(&prog, callee_regs_used); EMIT1(0x58); /* pop rax */ if (stack_depth) EMIT3_off32(0x48, 0x81, 0xC4, round_up(stack_depth, 8)); memcpy(prog, x86_nops[5], X86_PATCH_SIZE); prog += X86_PATCH_SIZE; + /* out: */ + ctx->tail_call_direct_label = prog - start; *pprog = prog; } @@ -721,6 +697,20 @@ static void maybe_emit_mod(u8 **pprog, u32 dst_reg, u32 src_reg, bool is64) *pprog = prog; } +/* + * Similar version of maybe_emit_mod() for a single register + */ +static void maybe_emit_1mod(u8 **pprog, u32 reg, bool is64) +{ + u8 *prog = *pprog; + + if (is64) + EMIT1(add_1mod(0x48, reg)); + else if (is_ereg(reg)) + EMIT1(add_1mod(0x40, reg)); + *pprog = prog; +} + /* LDX: dst_reg = *(u8*)(src_reg + off) */ static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) { @@ -827,9 +817,7 @@ static int emit_atomic(u8 **pprog, u8 atomic_op, return 0; } -static bool ex_handler_bpf(const struct exception_table_entry *x, - struct pt_regs *regs, int trapnr, - unsigned long error_code, unsigned long fault_addr) +bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs) { u32 reg = x->fixup >> 8; @@ -951,10 +939,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, /* neg dst */ case BPF_ALU | BPF_NEG: case BPF_ALU64 | BPF_NEG: - if (BPF_CLASS(insn->code) == BPF_ALU64) - EMIT1(add_1mod(0x48, dst_reg)); - else if (is_ereg(dst_reg)) - EMIT1(add_1mod(0x40, dst_reg)); + maybe_emit_1mod(&prog, dst_reg, + BPF_CLASS(insn->code) == BPF_ALU64); EMIT2(0xF7, add_1reg(0xD8, dst_reg)); break; @@ -968,10 +954,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, case BPF_ALU64 | BPF_AND | BPF_K: case BPF_ALU64 | BPF_OR | BPF_K: case BPF_ALU64 | BPF_XOR | BPF_K: - if (BPF_CLASS(insn->code) == BPF_ALU64) - EMIT1(add_1mod(0x48, dst_reg)); - else if (is_ereg(dst_reg)) - EMIT1(add_1mod(0x40, dst_reg)); + maybe_emit_1mod(&prog, dst_reg, + BPF_CLASS(insn->code) == BPF_ALU64); /* * b3 holds 'normal' opcode, b2 short form only valid @@ -1028,19 +1012,30 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, case BPF_ALU64 | BPF_MOD | BPF_X: case BPF_ALU64 | BPF_DIV | BPF_X: case BPF_ALU64 | BPF_MOD | BPF_K: - case BPF_ALU64 | BPF_DIV | BPF_K: - EMIT1(0x50); /* push rax */ - EMIT1(0x52); /* push rdx */ + case BPF_ALU64 | BPF_DIV | BPF_K: { + bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; - if (BPF_SRC(insn->code) == BPF_X) - /* mov r11, src_reg */ - EMIT_mov(AUX_REG, src_reg); - else + if (dst_reg != BPF_REG_0) + EMIT1(0x50); /* push rax */ + if (dst_reg != BPF_REG_3) + EMIT1(0x52); /* push rdx */ + + if (BPF_SRC(insn->code) == BPF_X) { + if (src_reg == BPF_REG_0 || + src_reg == BPF_REG_3) { + /* mov r11, src_reg */ + EMIT_mov(AUX_REG, src_reg); + src_reg = AUX_REG; + } + } else { /* mov r11, imm32 */ EMIT3_off32(0x49, 0xC7, 0xC3, imm32); + src_reg = AUX_REG; + } - /* mov rax, dst_reg */ - EMIT_mov(BPF_REG_0, dst_reg); + if (dst_reg != BPF_REG_0) + /* mov rax, dst_reg */ + emit_mov_reg(&prog, is64, BPF_REG_0, dst_reg); /* * xor edx, edx @@ -1048,63 +1043,51 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, */ EMIT2(0x31, 0xd2); - if (BPF_CLASS(insn->code) == BPF_ALU64) - /* div r11 */ - EMIT3(0x49, 0xF7, 0xF3); - else - /* div r11d */ - EMIT3(0x41, 0xF7, 0xF3); - - if (BPF_OP(insn->code) == BPF_MOD) - /* mov r11, rdx */ - EMIT3(0x49, 0x89, 0xD3); - else - /* mov r11, rax */ - EMIT3(0x49, 0x89, 0xC3); + /* div src_reg */ + maybe_emit_1mod(&prog, src_reg, is64); + EMIT2(0xF7, add_1reg(0xF0, src_reg)); - EMIT1(0x5A); /* pop rdx */ - EMIT1(0x58); /* pop rax */ + if (BPF_OP(insn->code) == BPF_MOD && + dst_reg != BPF_REG_3) + /* mov dst_reg, rdx */ + emit_mov_reg(&prog, is64, dst_reg, BPF_REG_3); + else if (BPF_OP(insn->code) == BPF_DIV && + dst_reg != BPF_REG_0) + /* mov dst_reg, rax */ + emit_mov_reg(&prog, is64, dst_reg, BPF_REG_0); - /* mov dst_reg, r11 */ - EMIT_mov(dst_reg, AUX_REG); + if (dst_reg != BPF_REG_3) + EMIT1(0x5A); /* pop rdx */ + if (dst_reg != BPF_REG_0) + EMIT1(0x58); /* pop rax */ break; + } case BPF_ALU | BPF_MUL | BPF_K: - case BPF_ALU | BPF_MUL | BPF_X: case BPF_ALU64 | BPF_MUL | BPF_K: - case BPF_ALU64 | BPF_MUL | BPF_X: - { - bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; - - if (dst_reg != BPF_REG_0) - EMIT1(0x50); /* push rax */ - if (dst_reg != BPF_REG_3) - EMIT1(0x52); /* push rdx */ - - /* mov r11, dst_reg */ - EMIT_mov(AUX_REG, dst_reg); + maybe_emit_mod(&prog, dst_reg, dst_reg, + BPF_CLASS(insn->code) == BPF_ALU64); - if (BPF_SRC(insn->code) == BPF_X) - emit_mov_reg(&prog, is64, BPF_REG_0, src_reg); + if (is_imm8(imm32)) + /* imul dst_reg, dst_reg, imm8 */ + EMIT3(0x6B, add_2reg(0xC0, dst_reg, dst_reg), + imm32); else - emit_mov_imm32(&prog, is64, BPF_REG_0, imm32); + /* imul dst_reg, dst_reg, imm32 */ + EMIT2_off32(0x69, + add_2reg(0xC0, dst_reg, dst_reg), + imm32); + break; - if (is64) - EMIT1(add_1mod(0x48, AUX_REG)); - else if (is_ereg(AUX_REG)) - EMIT1(add_1mod(0x40, AUX_REG)); - /* mul(q) r11 */ - EMIT2(0xF7, add_1reg(0xE0, AUX_REG)); + case BPF_ALU | BPF_MUL | BPF_X: + case BPF_ALU64 | BPF_MUL | BPF_X: + maybe_emit_mod(&prog, src_reg, dst_reg, + BPF_CLASS(insn->code) == BPF_ALU64); - if (dst_reg != BPF_REG_3) - EMIT1(0x5A); /* pop rdx */ - if (dst_reg != BPF_REG_0) { - /* mov dst_reg, rax */ - EMIT_mov(dst_reg, BPF_REG_0); - EMIT1(0x58); /* pop rax */ - } + /* imul dst_reg, src_reg */ + EMIT3(0x0F, 0xAF, add_2reg(0xC0, src_reg, dst_reg)); break; - } + /* Shifts */ case BPF_ALU | BPF_LSH | BPF_K: case BPF_ALU | BPF_RSH | BPF_K: @@ -1112,10 +1095,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, case BPF_ALU64 | BPF_LSH | BPF_K: case BPF_ALU64 | BPF_RSH | BPF_K: case BPF_ALU64 | BPF_ARSH | BPF_K: - if (BPF_CLASS(insn->code) == BPF_ALU64) - EMIT1(add_1mod(0x48, dst_reg)); - else if (is_ereg(dst_reg)) - EMIT1(add_1mod(0x40, dst_reg)); + maybe_emit_1mod(&prog, dst_reg, + BPF_CLASS(insn->code) == BPF_ALU64); b3 = simple_alu_opcodes[BPF_OP(insn->code)]; if (imm32 == 1) @@ -1146,10 +1127,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, } /* shl %rax, %cl | shr %rax, %cl | sar %rax, %cl */ - if (BPF_CLASS(insn->code) == BPF_ALU64) - EMIT1(add_1mod(0x48, dst_reg)); - else if (is_ereg(dst_reg)) - EMIT1(add_1mod(0x40, dst_reg)); + maybe_emit_1mod(&prog, dst_reg, + BPF_CLASS(insn->code) == BPF_ALU64); b3 = simple_alu_opcodes[BPF_OP(insn->code)]; EMIT2(0xD3, add_1reg(b3, dst_reg)); @@ -1222,8 +1201,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, /* speculation barrier */ case BPF_ST | BPF_NOSPEC: if (boot_cpu_has(X86_FEATURE_XMM2)) - /* Emit 'lfence' */ - EMIT3(0x0F, 0xAE, 0xE8); + EMIT_LFENCE(); break; /* ST: *(u8*)(dst_reg + off) = imm */ @@ -1274,19 +1252,54 @@ st: if (is_imm8(insn->off)) case BPF_LDX | BPF_MEM | BPF_DW: case BPF_LDX | BPF_PROBE_MEM | BPF_DW: if (BPF_MODE(insn->code) == BPF_PROBE_MEM) { - /* test src_reg, src_reg */ - maybe_emit_mod(&prog, src_reg, src_reg, true); /* always 1 byte */ - EMIT2(0x85, add_2reg(0xC0, src_reg, src_reg)); - /* jne start_of_ldx */ - EMIT2(X86_JNE, 0); + /* Though the verifier prevents negative insn->off in BPF_PROBE_MEM + * add abs(insn->off) to the limit to make sure that negative + * offset won't be an issue. + * insn->off is s16, so it won't affect valid pointers. + */ + u64 limit = TASK_SIZE_MAX + PAGE_SIZE + abs(insn->off); + u8 *end_of_jmp1, *end_of_jmp2; + + /* Conservatively check that src_reg + insn->off is a kernel address: + * 1. src_reg + insn->off >= limit + * 2. src_reg + insn->off doesn't become small positive. + * Cannot do src_reg + insn->off >= limit in one branch, + * since it needs two spare registers, but JIT has only one. + */ + + /* movabsq r11, limit */ + EMIT2(add_1mod(0x48, AUX_REG), add_1reg(0xB8, AUX_REG)); + EMIT((u32)limit, 4); + EMIT(limit >> 32, 4); + /* cmp src_reg, r11 */ + maybe_emit_mod(&prog, src_reg, AUX_REG, true); + EMIT2(0x39, add_2reg(0xC0, src_reg, AUX_REG)); + /* if unsigned '<' goto end_of_jmp2 */ + EMIT2(X86_JB, 0); + end_of_jmp1 = prog; + + /* mov r11, src_reg */ + emit_mov_reg(&prog, true, AUX_REG, src_reg); + /* add r11, insn->off */ + maybe_emit_1mod(&prog, AUX_REG, true); + EMIT2_off32(0x81, add_1reg(0xC0, AUX_REG), insn->off); + /* jmp if not carry to start_of_ldx + * Otherwise ERR_PTR(-EINVAL) + 128 will be the user addr + * that has to be rejected. + */ + EMIT2(0x73 /* JNC */, 0); + end_of_jmp2 = prog; + /* xor dst_reg, dst_reg */ emit_mov_imm32(&prog, false, dst_reg, 0); /* jmp byte_after_ldx */ EMIT2(0xEB, 0); - /* populate jmp_offset for JNE above */ - temp[4] = prog - temp - 5 /* sizeof(test + jne) */; + /* populate jmp_offset for JB above to jump to xor dst_reg */ + end_of_jmp1[-1] = end_of_jmp2 - end_of_jmp1; + /* populate jmp_offset for JNC above to jump to start_of_ldx */ start_of_ldx = prog; + end_of_jmp2[-1] = start_of_ldx - end_of_jmp2; } emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); if (BPF_MODE(insn->code) == BPF_PROBE_MEM) { @@ -1313,12 +1326,7 @@ st: if (is_imm8(insn->off)) } ex->insn = delta; - delta = (u8 *)ex_handler_bpf - (u8 *)&ex->handler; - if (!is_simm32(delta)) { - pr_err("extable->handler doesn't fit into 32-bit\n"); - return -EFAULT; - } - ex->handler = delta; + ex->data = EX_TYPE_BPF; if (dst_reg > BPF_REG_9) { pr_err("verifier error\n"); @@ -1332,7 +1340,7 @@ st: if (is_imm8(insn->off)) * End result: x86 insn "mov rbx, qword ptr [rax+0x14]" * of 4 bytes will be ignored and rbx will be zero inited. */ - ex->fixup = (prog - temp) | (reg2pt_regs[dst_reg] << 8); + ex->fixup = (prog - start_of_ldx) | (reg2pt_regs[dst_reg] << 8); } break; @@ -1412,13 +1420,16 @@ st: if (is_imm8(insn->off)) case BPF_JMP | BPF_TAIL_CALL: if (imm32) emit_bpf_tail_call_direct(&bpf_prog->aux->poke_tab[imm32 - 1], - &prog, addrs[i], image, + &prog, image + addrs[i - 1], callee_regs_used, - bpf_prog->aux->stack_depth); + bpf_prog->aux->stack_depth, + ctx); else emit_bpf_tail_call_indirect(&prog, callee_regs_used, - bpf_prog->aux->stack_depth); + bpf_prog->aux->stack_depth, + image + addrs[i - 1], + ctx); break; /* cond jump */ @@ -1459,10 +1470,8 @@ st: if (is_imm8(insn->off)) case BPF_JMP | BPF_JSET | BPF_K: case BPF_JMP32 | BPF_JSET | BPF_K: /* test dst_reg, imm32 */ - if (BPF_CLASS(insn->code) == BPF_JMP) - EMIT1(add_1mod(0x48, dst_reg)); - else if (is_ereg(dst_reg)) - EMIT1(add_1mod(0x40, dst_reg)); + maybe_emit_1mod(&prog, dst_reg, + BPF_CLASS(insn->code) == BPF_JMP); EMIT2_off32(0xF7, add_1reg(0xC0, dst_reg), imm32); goto emit_cond_jmp; @@ -1495,10 +1504,8 @@ st: if (is_imm8(insn->off)) } /* cmp dst_reg, imm8/32 */ - if (BPF_CLASS(insn->code) == BPF_JMP) - EMIT1(add_1mod(0x48, dst_reg)); - else if (is_ereg(dst_reg)) - EMIT1(add_1mod(0x40, dst_reg)); + maybe_emit_1mod(&prog, dst_reg, + BPF_CLASS(insn->code) == BPF_JMP); if (is_imm8(imm32)) EMIT3(0x83, add_1reg(0xF8, dst_reg), imm32); @@ -1969,7 +1976,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i void *orig_call) { int ret, i, nr_args = m->nr_args; - int stack_size = nr_args * 8; + int regs_off, ip_off, args_off, stack_size = nr_args * 8; struct bpf_tramp_progs *fentry = &tprogs[BPF_TRAMP_FENTRY]; struct bpf_tramp_progs *fexit = &tprogs[BPF_TRAMP_FEXIT]; struct bpf_tramp_progs *fmod_ret = &tprogs[BPF_TRAMP_MODIFY_RETURN]; @@ -1984,14 +1991,39 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i if (!is_valid_bpf_tramp_flags(flags)) return -EINVAL; + /* Generated trampoline stack layout: + * + * RBP + 8 [ return address ] + * RBP + 0 [ RBP ] + * + * RBP - 8 [ return value ] BPF_TRAMP_F_CALL_ORIG or + * BPF_TRAMP_F_RET_FENTRY_RET flags + * + * [ reg_argN ] always + * [ ... ] + * RBP - regs_off [ reg_arg1 ] program's ctx pointer + * + * RBP - args_off [ args count ] always + * + * RBP - ip_off [ traced function ] BPF_TRAMP_F_IP_ARG flag + */ + /* room for return value of orig_call or fentry prog */ save_ret = flags & (BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_RET_FENTRY_RET); if (save_ret) stack_size += 8; + regs_off = stack_size; + + /* args count */ + stack_size += 8; + args_off = stack_size; + if (flags & BPF_TRAMP_F_IP_ARG) stack_size += 8; /* room for IP address argument */ + ip_off = stack_size; + if (flags & BPF_TRAMP_F_SKIP_FRAME) /* skip patched call instruction and point orig_call to actual * body of the kernel function. @@ -2005,23 +2037,25 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */ EMIT1(0x53); /* push rbx */ + /* Store number of arguments of the traced function: + * mov rax, nr_args + * mov QWORD PTR [rbp - args_off], rax + */ + emit_mov_imm64(&prog, BPF_REG_0, 0, (u32) nr_args); + emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -args_off); + if (flags & BPF_TRAMP_F_IP_ARG) { /* Store IP address of the traced function: * mov rax, QWORD PTR [rbp + 8] * sub rax, X86_PATCH_SIZE - * mov QWORD PTR [rbp - stack_size], rax + * mov QWORD PTR [rbp - ip_off], rax */ emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, 8); EMIT4(0x48, 0x83, 0xe8, X86_PATCH_SIZE); - emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -stack_size); - - /* Continue with stack_size for regs storage, stack will - * be correctly restored with 'leave' instruction. - */ - stack_size -= 8; + emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -ip_off); } - save_regs(m, &prog, nr_args, stack_size); + save_regs(m, &prog, nr_args, regs_off); if (flags & BPF_TRAMP_F_CALL_ORIG) { /* arg1: mov rdi, im */ @@ -2033,7 +2067,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i } if (fentry->nr_progs) - if (invoke_bpf(m, &prog, fentry, stack_size, + if (invoke_bpf(m, &prog, fentry, regs_off, flags & BPF_TRAMP_F_RET_FENTRY_RET)) return -EINVAL; @@ -2043,7 +2077,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i if (!branches) return -ENOMEM; - if (invoke_bpf_mod_ret(m, &prog, fmod_ret, stack_size, + if (invoke_bpf_mod_ret(m, &prog, fmod_ret, regs_off, branches)) { ret = -EINVAL; goto cleanup; @@ -2051,7 +2085,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i } if (flags & BPF_TRAMP_F_CALL_ORIG) { - restore_regs(m, &prog, nr_args, stack_size); + restore_regs(m, &prog, nr_args, regs_off); /* call original function */ if (emit_call(&prog, orig_call, prog)) { @@ -2081,13 +2115,13 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i } if (fexit->nr_progs) - if (invoke_bpf(m, &prog, fexit, stack_size, false)) { + if (invoke_bpf(m, &prog, fexit, regs_off, false)) { ret = -EINVAL; goto cleanup; } if (flags & BPF_TRAMP_F_RESTORE_REGS) - restore_regs(m, &prog, nr_args, stack_size); + restore_regs(m, &prog, nr_args, regs_off); /* This needs to be done regardless. If there were fmod_ret programs, * the return value is only updated on the stack and still needs to be @@ -2124,24 +2158,6 @@ cleanup: return ret; } -static int emit_fallback_jump(u8 **pprog) -{ - u8 *prog = *pprog; - int err = 0; - -#ifdef CONFIG_RETPOLINE - /* Note that this assumes the the compiler uses external - * thunks for indirect calls. Both clang and GCC use the same - * naming convention for external thunks. - */ - err = emit_jump(&prog, __x86_indirect_thunk_rdx, prog); -#else - EMIT2(0xFF, 0xE2); /* jmp rdx */ -#endif - *pprog = prog; - return err; -} - static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs) { u8 *jg_reloc, *prog = *pprog; @@ -2163,9 +2179,7 @@ static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs) if (err) return err; - err = emit_fallback_jump(&prog); /* jmp thunk/indirect */ - if (err) - return err; + emit_indirect_jump(&prog, 2 /* rdx */, prog); *pprog = prog; return 0; diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index 3bfda5f502cb..429a89c5468b 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -15,6 +15,7 @@ #include <asm/cacheflush.h> #include <asm/set_memory.h> #include <asm/nospec-branch.h> +#include <asm/asm-prototypes.h> #include <linux/bpf.h> /* @@ -1267,6 +1268,21 @@ static void emit_epilogue(u8 **pprog, u32 stack_depth) *pprog = prog; } +static int emit_jmp_edx(u8 **pprog, u8 *ip) +{ + u8 *prog = *pprog; + int cnt = 0; + +#ifdef CONFIG_RETPOLINE + EMIT1_off32(0xE9, (u8 *)__x86_indirect_thunk_edx - (ip + 5)); +#else + EMIT2(0xFF, 0xE2); +#endif + *pprog = prog; + + return cnt; +} + /* * Generate the following code: * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ... @@ -1280,7 +1296,7 @@ static void emit_epilogue(u8 **pprog, u32 stack_depth) * goto *(prog->bpf_func + prologue_size); * out: */ -static void emit_bpf_tail_call(u8 **pprog) +static void emit_bpf_tail_call(u8 **pprog, u8 *ip) { u8 *prog = *pprog; int cnt = 0; @@ -1307,7 +1323,7 @@ static void emit_bpf_tail_call(u8 **pprog) EMIT2(IA32_JBE, jmp_label(jmp_label1, 2)); /* - * if (tail_call_cnt > MAX_TAIL_CALL_CNT) + * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT) * goto out; */ lo = (u32)MAX_TAIL_CALL_CNT; @@ -1321,7 +1337,7 @@ static void emit_bpf_tail_call(u8 **pprog) /* cmp ecx,lo */ EMIT3(0x83, add_1reg(0xF8, IA32_ECX), lo); - /* ja out */ + /* jae out */ EMIT2(IA32_JAE, jmp_label(jmp_label1, 2)); /* add eax,0x1 */ @@ -1362,7 +1378,7 @@ static void emit_bpf_tail_call(u8 **pprog) * eax == ctx (1st arg) * edx == prog->bpf_func + prologue_size */ - RETPOLINE_EDX_BPF_JIT(); + cnt += emit_jmp_edx(&prog, ip + cnt); if (jmp_label1 == -1) jmp_label1 = cnt; @@ -2122,7 +2138,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, break; } case BPF_JMP | BPF_TAIL_CALL: - emit_bpf_tail_call(&prog); + emit_bpf_tail_call(&prog, image + addrs[i - 1]); break; /* cond jump */ diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 948656069cdd..052f1d78a562 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -20,7 +20,7 @@ struct pci_root_info { }; static bool pci_use_crs = true; -static bool pci_ignore_seg = false; +static bool pci_ignore_seg; static int __init set_use_crs(const struct dmi_system_id *id) { diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 3507f456fcd0..9e1e6b8d8876 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -632,7 +632,7 @@ static void set_dev_domain_options(struct pci_dev *pdev) pdev->hotplug_user_indicators = 1; } -int pcibios_add_device(struct pci_dev *dev) +int pcibios_device_add(struct pci_dev *dev) { struct pci_setup_rom *rom; struct irq_domain *msidom; diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 3d41a09c2c14..9bb1e2941179 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -23,6 +23,7 @@ #include <xen/features.h> #include <xen/events.h> +#include <xen/pci.h> #include <asm/xen/pci.h> #include <asm/xen/cpuid.h> #include <asm/apic.h> @@ -113,7 +114,7 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi, false /* no mapping of GSI to PIRQ */); } -#ifdef CONFIG_XEN_DOM0 +#ifdef CONFIG_XEN_PV_DOM0 static int xen_register_gsi(u32 gsi, int triggering, int polarity) { int rc, irq; @@ -183,7 +184,7 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) if (ret) goto error; i = 0; - for_each_pci_msi_entry(msidesc, dev) { + msi_for_each_desc(msidesc, &dev->dev, MSI_DESC_NOTASSOCIATED) { irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], (type == PCI_CAP_ID_MSI) ? nvec : 1, (type == PCI_CAP_ID_MSIX) ? @@ -234,7 +235,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) if (type == PCI_CAP_ID_MSI && nvec > 1) return 1; - for_each_pci_msi_entry(msidesc, dev) { + msi_for_each_desc(msidesc, &dev->dev, MSI_DESC_NOTASSOCIATED) { pirq = xen_allocate_pirq_msi(dev, msidesc); if (pirq < 0) { irq = -ENODEV; @@ -261,7 +262,7 @@ error: return irq; } -#ifdef CONFIG_XEN_DOM0 +#ifdef CONFIG_XEN_PV_DOM0 static bool __read_mostly pci_seg_supported = true; static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) @@ -269,7 +270,7 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) int ret = 0; struct msi_desc *msidesc; - for_each_pci_msi_entry(msidesc, dev) { + msi_for_each_desc(msidesc, &dev->dev, MSI_DESC_NOTASSOCIATED) { struct physdev_map_pirq map_irq; domid_t domid; @@ -305,7 +306,7 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) return -EINVAL; map_irq.table_base = pci_resource_start(dev, bir); - map_irq.entry_nr = msidesc->msi_attrib.entry_nr; + map_irq.entry_nr = msidesc->msi_index; } ret = -EINVAL; @@ -350,10 +351,13 @@ out: return ret; } -static void xen_initdom_restore_msi_irqs(struct pci_dev *dev) +bool xen_initdom_restore_msi(struct pci_dev *dev) { int ret = 0; + if (!xen_initial_domain()) + return true; + if (pci_seg_supported) { struct physdev_pci_device restore_ext; @@ -374,30 +378,26 @@ static void xen_initdom_restore_msi_irqs(struct pci_dev *dev) ret = HYPERVISOR_physdev_op(PHYSDEVOP_restore_msi, &restore); WARN(ret && ret != -ENOSYS, "restore_msi -> %d\n", ret); } + return false; } -#else /* CONFIG_XEN_DOM0 */ +#else /* CONFIG_XEN_PV_DOM0 */ #define xen_initdom_setup_msi_irqs NULL -#define xen_initdom_restore_msi_irqs NULL -#endif /* !CONFIG_XEN_DOM0 */ +#endif /* !CONFIG_XEN_PV_DOM0 */ static void xen_teardown_msi_irqs(struct pci_dev *dev) { struct msi_desc *msidesc; int i; - for_each_pci_msi_entry(msidesc, dev) { - if (msidesc->irq) { - for (i = 0; i < msidesc->nvec_used; i++) - xen_destroy_irq(msidesc->irq + i); - } + msi_for_each_desc(msidesc, &dev->dev, MSI_DESC_ASSOCIATED) { + for (i = 0; i < msidesc->nvec_used; i++) + xen_destroy_irq(msidesc->irq + i); } } static void xen_pv_teardown_msi_irqs(struct pci_dev *dev) { - struct msi_desc *msidesc = first_pci_msi_entry(dev); - - if (msidesc->msi_attrib.is_msix) + if (dev->msix_enabled) xen_pci_frontend_disable_msix(dev); else xen_pci_frontend_disable_msi(dev); @@ -413,10 +413,7 @@ static int xen_msi_domain_alloc_irqs(struct irq_domain *domain, if (WARN_ON_ONCE(!dev_is_pci(dev))) return -EINVAL; - if (first_msi_entry(dev)->msi_attrib.is_msix) - type = PCI_CAP_ID_MSIX; - else - type = PCI_CAP_ID_MSI; + type = to_pci_dev(dev)->msix_enabled ? PCI_CAP_ID_MSIX : PCI_CAP_ID_MSI; return xen_msi_ops.setup_msi_irqs(to_pci_dev(dev), nvec, type); } @@ -465,12 +462,10 @@ static __init struct irq_domain *xen_create_pci_msi_domain(void) static __init void xen_setup_pci_msi(void) { if (xen_pv_domain()) { - if (xen_initial_domain()) { + if (xen_initial_domain()) xen_msi_ops.setup_msi_irqs = xen_initdom_setup_msi_irqs; - x86_msi.restore_msi_irqs = xen_initdom_restore_msi_irqs; - } else { + else xen_msi_ops.setup_msi_irqs = xen_setup_msi_irqs; - } xen_msi_ops.teardown_msi_irqs = xen_pv_teardown_msi_irqs; pci_msi_ignore_mask = 1; } else if (xen_hvm_domain()) { @@ -555,7 +550,7 @@ int __init pci_xen_hvm_init(void) return 0; } -#ifdef CONFIG_XEN_DOM0 +#ifdef CONFIG_XEN_PV_DOM0 int __init pci_xen_initial_domain(void) { int irq; @@ -583,77 +578,5 @@ int __init pci_xen_initial_domain(void) } return 0; } - -struct xen_device_domain_owner { - domid_t domain; - struct pci_dev *dev; - struct list_head list; -}; - -static DEFINE_SPINLOCK(dev_domain_list_spinlock); -static struct list_head dev_domain_list = LIST_HEAD_INIT(dev_domain_list); - -static struct xen_device_domain_owner *find_device(struct pci_dev *dev) -{ - struct xen_device_domain_owner *owner; - - list_for_each_entry(owner, &dev_domain_list, list) { - if (owner->dev == dev) - return owner; - } - return NULL; -} - -int xen_find_device_domain_owner(struct pci_dev *dev) -{ - struct xen_device_domain_owner *owner; - int domain = -ENODEV; - - spin_lock(&dev_domain_list_spinlock); - owner = find_device(dev); - if (owner) - domain = owner->domain; - spin_unlock(&dev_domain_list_spinlock); - return domain; -} -EXPORT_SYMBOL_GPL(xen_find_device_domain_owner); - -int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain) -{ - struct xen_device_domain_owner *owner; - - owner = kzalloc(sizeof(struct xen_device_domain_owner), GFP_KERNEL); - if (!owner) - return -ENODEV; - - spin_lock(&dev_domain_list_spinlock); - if (find_device(dev)) { - spin_unlock(&dev_domain_list_spinlock); - kfree(owner); - return -EEXIST; - } - owner->domain = domain; - owner->dev = dev; - list_add_tail(&owner->list, &dev_domain_list); - spin_unlock(&dev_domain_list_spinlock); - return 0; -} -EXPORT_SYMBOL_GPL(xen_register_device_domain_owner); - -int xen_unregister_device_domain_owner(struct pci_dev *dev) -{ - struct xen_device_domain_owner *owner; - - spin_lock(&dev_domain_list_spinlock); - owner = find_device(dev); - if (!owner) { - spin_unlock(&dev_domain_list_spinlock); - return -ENODEV; - } - list_del(&owner->list); - spin_unlock(&dev_domain_list_spinlock); - kfree(owner); - return 0; -} -EXPORT_SYMBOL_GPL(xen_unregister_device_domain_owner); #endif + diff --git a/arch/x86/platform/ce4100/falconfalls.dts b/arch/x86/platform/ce4100/falconfalls.dts index 0ac3d4357136..65fa3d866226 100644 --- a/arch/x86/platform/ce4100/falconfalls.dts +++ b/arch/x86/platform/ce4100/falconfalls.dts @@ -249,7 +249,7 @@ gpio@26 { #gpio-cells = <2>; - compatible = "ti,pcf8575"; + compatible = "nxp,pcf8575"; reg = <0x26>; gpio-controller; }; @@ -263,7 +263,7 @@ gpio@26 { #gpio-cells = <2>; - compatible = "ti,pcf8575"; + compatible = "nxp,pcf8575"; reg = <0x26>; gpio-controller; }; diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 7515e78ef898..1f3675453a57 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -33,7 +33,7 @@ #include <linux/reboot.h> #include <linux/slab.h> #include <linux/ucs2_string.h> -#include <linux/mem_encrypt.h> +#include <linux/cc_platform.h> #include <linux/sched/task.h> #include <asm/setup.h> @@ -284,7 +284,8 @@ static void __init __map_region(efi_memory_desc_t *md, u64 va) if (!(md->attribute & EFI_MEMORY_WB)) flags |= _PAGE_PCD; - if (sev_active() && md->type != EFI_MEMORY_MAPPED_IO) + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) && + md->type != EFI_MEMORY_MAPPED_IO) flags |= _PAGE_ENC; pfn = md->phys_addr >> PAGE_SHIFT; @@ -390,7 +391,7 @@ static int __init efi_update_mem_attr(struct mm_struct *mm, efi_memory_desc_t *m if (!(md->attribute & EFI_MEMORY_RO)) pf |= _PAGE_RW; - if (sev_active()) + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) pf |= _PAGE_ENC; return efi_update_mappings(md, pf); @@ -438,7 +439,7 @@ void __init efi_runtime_update_mappings(void) (md->type != EFI_RUNTIME_SERVICES_CODE)) pf |= _PAGE_RW; - if (sev_active()) + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) pf |= _PAGE_ENC; efi_update_mappings(md, pf); diff --git a/arch/x86/platform/efi/efi_stub_32.S b/arch/x86/platform/efi/efi_stub_32.S index 09ec84f6ef51..f3cfdb1c9a35 100644 --- a/arch/x86/platform/efi/efi_stub_32.S +++ b/arch/x86/platform/efi/efi_stub_32.S @@ -56,5 +56,5 @@ SYM_FUNC_START(efi_call_svam) movl 16(%esp), %ebx leave - ret + RET SYM_FUNC_END(efi_call_svam) diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S index 90380a17ab23..2206b8bc47b8 100644 --- a/arch/x86/platform/efi/efi_stub_64.S +++ b/arch/x86/platform/efi/efi_stub_64.S @@ -23,5 +23,5 @@ SYM_FUNC_START(__efi_call) mov %rsi, %rcx CALL_NOSPEC rdi leave - ret + RET SYM_FUNC_END(__efi_call) diff --git a/arch/x86/platform/efi/efi_thunk_64.S b/arch/x86/platform/efi/efi_thunk_64.S index fd3dd1708eba..25799d768624 100644 --- a/arch/x86/platform/efi/efi_thunk_64.S +++ b/arch/x86/platform/efi/efi_thunk_64.S @@ -37,6 +37,17 @@ SYM_CODE_START(__efi64_thunk) push %rax /* + * Copy args passed via the stack + */ + subq $0x24, %rsp + movq 0x18(%rax), %rbp + movq 0x20(%rax), %rbx + movq 0x28(%rax), %rax + movl %ebp, 0x18(%rsp) + movl %ebx, 0x1c(%rsp) + movl %eax, 0x20(%rsp) + + /* * Calculate the physical address of the kernel text. */ movq $__START_KERNEL_map, %rax @@ -47,7 +58,6 @@ SYM_CODE_START(__efi64_thunk) subq %rax, %rbp subq %rax, %rbx - subq $28, %rsp movl %ebx, 0x0(%rsp) /* return address */ movl %esi, 0x4(%rsp) movl %edx, 0x8(%rsp) @@ -60,10 +70,10 @@ SYM_CODE_START(__efi64_thunk) pushq %rdi /* EFI runtime service address */ lretq -1: movq 24(%rsp), %rsp +1: movq 0x20(%rsp), %rsp pop %rbx pop %rbp - retq + RET .code32 2: pushl $__KERNEL_CS diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index b15ebfe40a73..b0b848d6933a 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -277,7 +277,8 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size) return; } - new = early_memremap(data.phys_map, data.size); + new = early_memremap_prot(data.phys_map, data.size, + pgprot_val(pgprot_encrypted(FIXMAP_PAGE_NORMAL))); if (!new) { pr_err("Failed to map new boot services memmap\n"); return; diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c index ee2beda590d0..1d4a00e767ec 100644 --- a/arch/x86/platform/olpc/olpc.c +++ b/arch/x86/platform/olpc/olpc.c @@ -274,7 +274,7 @@ static struct olpc_ec_driver ec_xo1_driver = { static struct olpc_ec_driver ec_xo1_5_driver = { .ec_cmd = olpc_xo1_ec_cmd, -#ifdef CONFIG_OLPC_XO1_5_SCI +#ifdef CONFIG_OLPC_XO15_SCI /* * XO-1.5 EC wakeups are available when olpc-xo15-sci driver is * compiled in diff --git a/arch/x86/platform/olpc/xo1-wakeup.S b/arch/x86/platform/olpc/xo1-wakeup.S index 75f4faff8468..3a5abffe5660 100644 --- a/arch/x86/platform/olpc/xo1-wakeup.S +++ b/arch/x86/platform/olpc/xo1-wakeup.S @@ -77,7 +77,7 @@ save_registers: pushfl popl saved_context_eflags - ret + RET restore_registers: movl saved_context_ebp, %ebp @@ -88,7 +88,7 @@ restore_registers: pushl saved_context_eflags popfl - ret + RET SYM_CODE_START(do_olpc_suspend_lowlevel) call save_processor_state @@ -109,7 +109,7 @@ ret_point: call restore_registers call restore_processor_state - ret + RET SYM_CODE_END(do_olpc_suspend_lowlevel) .data diff --git a/arch/x86/platform/pvh/enlighten.c b/arch/x86/platform/pvh/enlighten.c index 9ac7457f52a3..ed0442e35434 100644 --- a/arch/x86/platform/pvh/enlighten.c +++ b/arch/x86/platform/pvh/enlighten.c @@ -16,15 +16,15 @@ /* * PVH variables. * - * pvh_bootparams and pvh_start_info need to live in the data segment since + * pvh_bootparams and pvh_start_info need to live in a data segment since * they are used after startup_{32|64}, which clear .bss, are invoked. */ -struct boot_params pvh_bootparams __section(".data"); -struct hvm_start_info pvh_start_info __section(".data"); +struct boot_params __initdata pvh_bootparams; +struct hvm_start_info __initdata pvh_start_info; -unsigned int pvh_start_info_sz = sizeof(pvh_start_info); +const unsigned int __initconst pvh_start_info_sz = sizeof(pvh_start_info); -static u64 pvh_get_root_pointer(void) +static u64 __init pvh_get_root_pointer(void) { return pvh_start_info.rsdp_paddr; } @@ -107,7 +107,7 @@ void __init __weak xen_pvh_init(struct boot_params *boot_params) BUG(); } -static void hypervisor_specific_init(bool xen_guest) +static void __init hypervisor_specific_init(bool xen_guest) { if (xen_guest) xen_pvh_init(&pvh_bootparams); diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 6665f8802098..9f2b251e83c5 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -20,7 +20,7 @@ #include <asm/page.h> #include <asm/mce.h> #include <asm/suspend.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> #include <asm/debugreg.h> #include <asm/cpu.h> #include <asm/mmu_context.h> diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S index 8786653ad3c0..5606a15cf9a1 100644 --- a/arch/x86/power/hibernate_asm_32.S +++ b/arch/x86/power/hibernate_asm_32.S @@ -32,7 +32,7 @@ SYM_FUNC_START(swsusp_arch_suspend) FRAME_BEGIN call swsusp_save FRAME_END - ret + RET SYM_FUNC_END(swsusp_arch_suspend) SYM_CODE_START(restore_image) @@ -108,5 +108,5 @@ SYM_FUNC_START(restore_registers) /* tell the hibernation core that we've just restored the memory */ movl %eax, in_suspend - ret + RET SYM_FUNC_END(restore_registers) diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S index d9bed596d849..0a0539e1cc81 100644 --- a/arch/x86/power/hibernate_asm_64.S +++ b/arch/x86/power/hibernate_asm_64.S @@ -66,7 +66,7 @@ SYM_FUNC_START(restore_registers) /* tell the hibernation core that we've just restored the memory */ movq %rax, in_suspend(%rip) - ret + RET SYM_FUNC_END(restore_registers) SYM_FUNC_START(swsusp_arch_suspend) @@ -96,7 +96,7 @@ SYM_FUNC_START(swsusp_arch_suspend) FRAME_BEGIN call swsusp_save FRAME_END - ret + RET SYM_FUNC_END(swsusp_arch_suspend) SYM_FUNC_START(restore_image) diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 95ea17a9d20c..ae53d54d7959 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -16,7 +16,7 @@ CFLAGS_sha256.o := -D__DISABLE_EXPORTS # When linking purgatory.ro with -r unresolved symbols are not checked, # also link a purgatory.chk binary without -r to check for unresolved symbols. -PURGATORY_LDFLAGS := -e purgatory_start -nostdlib -z nodefaultlib +PURGATORY_LDFLAGS := -e purgatory_start -z nodefaultlib LDFLAGS_purgatory.ro := -r $(PURGATORY_LDFLAGS) LDFLAGS_purgatory.chk := $(PURGATORY_LDFLAGS) targets += purgatory.ro purgatory.chk diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 31b5856010cb..c5e29db02a46 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -2,7 +2,7 @@ #include <linux/io.h> #include <linux/slab.h> #include <linux/memblock.h> -#include <linux/mem_encrypt.h> +#include <linux/cc_platform.h> #include <linux/pgtable.h> #include <asm/set_memory.h> @@ -17,6 +17,32 @@ u32 *trampoline_cr4_features; /* Hold the pgd entry used on booting additional CPUs */ pgd_t trampoline_pgd_entry; +void load_trampoline_pgtable(void) +{ +#ifdef CONFIG_X86_32 + load_cr3(initial_page_table); +#else + /* + * This function is called before exiting to real-mode and that will + * fail with CR4.PCIDE still set. + */ + if (boot_cpu_has(X86_FEATURE_PCID)) + cr4_clear_bits(X86_CR4_PCIDE); + + write_cr3(real_mode_header->trampoline_pgd); +#endif + + /* + * The CR3 write above will not flush global TLB entries. + * Stale, global entries from previous page tables may still be + * present. Flush those stale entries. + * + * This ensures that memory accessed while running with + * trampoline_pgd is *actually* mapped into trampoline_pgd. + */ + __flush_tlb_all(); +} + void __init reserve_real_mode(void) { phys_addr_t mem; @@ -44,10 +70,10 @@ void __init reserve_real_mode(void) static void sme_sev_setup_real_mode(struct trampoline_header *th) { #ifdef CONFIG_AMD_MEM_ENCRYPT - if (sme_active()) + if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) th->flags |= TH_FLAGS_SME_ACTIVE; - if (sev_es_active()) { + if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) { /* * Skip the call to verify_cpu() in secondary_startup_64 as it * will cause #VC exceptions when the AP can't handle them yet. @@ -72,6 +98,7 @@ static void __init setup_real_mode(void) #ifdef CONFIG_X86_64 u64 *trampoline_pgd; u64 efer; + int i; #endif base = (unsigned char *)real_mode_header; @@ -81,7 +108,7 @@ static void __init setup_real_mode(void) * decrypted memory in order to bring up other processors * successfully. This is not needed for SEV. */ - if (sme_active()) + if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) set_memory_decrypted((unsigned long)base, size >> PAGE_SHIFT); memcpy(base, real_mode_blob, size); @@ -128,8 +155,17 @@ static void __init setup_real_mode(void) trampoline_header->flags = 0; trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); + + /* Map the real mode stub as virtual == physical */ trampoline_pgd[0] = trampoline_pgd_entry.pgd; - trampoline_pgd[511] = init_top_pgt[511].pgd; + + /* + * Include the entirety of the kernel mapping into the trampoline + * PGD. This way, all mappings present in the normal kernel page + * tables are usable while running on trampoline_pgd. + */ + for (i = pgd_index(__PAGE_OFFSET); i < PTRS_PER_PGD; i++) + trampoline_pgd[i] = init_top_pgt[i].pgd; #endif sme_sev_setup_real_mode(trampoline_header); diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 27c82207d387..e2c5b296120d 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -14,6 +14,10 @@ static Elf_Ehdr ehdr; static unsigned long shnum; static unsigned int shstrndx; +static unsigned int shsymtabndx; +static unsigned int shxsymtabndx; + +static int sym_index(Elf_Sym *sym); struct relocs { uint32_t *offset; @@ -35,6 +39,7 @@ struct section { Elf_Shdr shdr; struct section *link; Elf_Sym *symtab; + Elf32_Word *xsymtab; Elf_Rel *reltab; char *strtab; }; @@ -63,7 +68,9 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = { "(__parainstructions|__alt_instructions)(_end)?|" "(__iommu_table|__apicdrivers|__smp_locks)(_end)?|" "__(start|end)_pci_.*|" +#if CONFIG_FW_LOADER "__(start|end)_builtin_fw|" +#endif "__(start|stop)___ksymtab(_gpl)?|" "__(start|stop)___kcrctab(_gpl)?|" "__(start|stop)___param|" @@ -268,7 +275,7 @@ static const char *sym_name(const char *sym_strtab, Elf_Sym *sym) name = sym_strtab + sym->st_name; } else { - name = sec_name(sym->st_shndx); + name = sec_name(sym_index(sym)); } return name; } @@ -338,6 +345,23 @@ static uint64_t elf64_to_cpu(uint64_t val) #define elf_xword_to_cpu(x) elf32_to_cpu(x) #endif +static int sym_index(Elf_Sym *sym) +{ + Elf_Sym *symtab = secs[shsymtabndx].symtab; + Elf32_Word *xsymtab = secs[shxsymtabndx].xsymtab; + unsigned long offset; + int index; + + if (sym->st_shndx != SHN_XINDEX) + return sym->st_shndx; + + /* calculate offset of sym from head of table. */ + offset = (unsigned long)sym - (unsigned long)symtab; + index = offset / sizeof(*sym); + + return elf32_to_cpu(xsymtab[index]); +} + static void read_ehdr(FILE *fp) { if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1) { @@ -471,31 +495,60 @@ static void read_strtabs(FILE *fp) static void read_symtabs(FILE *fp) { int i,j; + for (i = 0; i < shnum; i++) { struct section *sec = &secs[i]; - if (sec->shdr.sh_type != SHT_SYMTAB) { + int num_syms; + + switch (sec->shdr.sh_type) { + case SHT_SYMTAB_SHNDX: + sec->xsymtab = malloc(sec->shdr.sh_size); + if (!sec->xsymtab) { + die("malloc of %" FMT " bytes for xsymtab failed\n", + sec->shdr.sh_size); + } + if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) { + die("Seek to %" FMT " failed: %s\n", + sec->shdr.sh_offset, strerror(errno)); + } + if (fread(sec->xsymtab, 1, sec->shdr.sh_size, fp) + != sec->shdr.sh_size) { + die("Cannot read extended symbol table: %s\n", + strerror(errno)); + } + shxsymtabndx = i; + continue; + + case SHT_SYMTAB: + num_syms = sec->shdr.sh_size / sizeof(Elf_Sym); + + sec->symtab = malloc(sec->shdr.sh_size); + if (!sec->symtab) { + die("malloc of %" FMT " bytes for symtab failed\n", + sec->shdr.sh_size); + } + if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) { + die("Seek to %" FMT " failed: %s\n", + sec->shdr.sh_offset, strerror(errno)); + } + if (fread(sec->symtab, 1, sec->shdr.sh_size, fp) + != sec->shdr.sh_size) { + die("Cannot read symbol table: %s\n", + strerror(errno)); + } + for (j = 0; j < num_syms; j++) { + Elf_Sym *sym = &sec->symtab[j]; + + sym->st_name = elf_word_to_cpu(sym->st_name); + sym->st_value = elf_addr_to_cpu(sym->st_value); + sym->st_size = elf_xword_to_cpu(sym->st_size); + sym->st_shndx = elf_half_to_cpu(sym->st_shndx); + } + shsymtabndx = i; + continue; + + default: continue; - } - sec->symtab = malloc(sec->shdr.sh_size); - if (!sec->symtab) { - die("malloc of %" FMT " bytes for symtab failed\n", - sec->shdr.sh_size); - } - if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) { - die("Seek to %" FMT " failed: %s\n", - sec->shdr.sh_offset, strerror(errno)); - } - if (fread(sec->symtab, 1, sec->shdr.sh_size, fp) - != sec->shdr.sh_size) { - die("Cannot read symbol table: %s\n", - strerror(errno)); - } - for (j = 0; j < sec->shdr.sh_size/sizeof(Elf_Sym); j++) { - Elf_Sym *sym = &sec->symtab[j]; - sym->st_name = elf_word_to_cpu(sym->st_name); - sym->st_value = elf_addr_to_cpu(sym->st_value); - sym->st_size = elf_xword_to_cpu(sym->st_size); - sym->st_shndx = elf_half_to_cpu(sym->st_shndx); } } } @@ -762,7 +815,9 @@ static void percpu_init(void) */ static int is_percpu_sym(ElfW(Sym) *sym, const char *symname) { - return (sym->st_shndx == per_cpu_shndx) && + int shndx = sym_index(sym); + + return (shndx == per_cpu_shndx) && strcmp(symname, "__init_begin") && strcmp(symname, "__per_cpu_load") && strncmp(symname, "init_per_cpu_", 13); @@ -1095,7 +1150,7 @@ static int do_reloc_info(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, sec_name(sec->shdr.sh_info), rel_type(ELF_R_TYPE(rel->r_info)), symname, - sec_name(sym->st_shndx)); + sec_name(sym_index(sym))); return 0; } diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index 5ccb18290d71..ba5789c35809 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -40,7 +40,7 @@ $(obj)/user-offsets.s: c_flags = -Wp,-MD,$(depfile) $(USER_CFLAGS) \ -Iarch/x86/include/generated targets += user-offsets.s -include/generated/user_constants.h: $(obj)/user-offsets.s +include/generated/user_constants.h: $(obj)/user-offsets.s FORCE $(call filechk,offsets,__USER_CONSTANT_H__) UNPROFILE_OBJS := stub_segv.o diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h index 165be7f9a964..4da336965698 100644 --- a/arch/x86/um/asm/barrier.h +++ b/arch/x86/um/asm/barrier.h @@ -2,6 +2,7 @@ #ifndef _ASM_UM_BARRIER_H_ #define _ASM_UM_BARRIER_H_ +#include <asm/cpufeatures.h> #include <asm/alternative.h> /* diff --git a/arch/x86/um/asm/segment.h b/arch/x86/um/asm/segment.h index 453db377150d..2ef507bc6989 100644 --- a/arch/x86/um/asm/segment.h +++ b/arch/x86/um/asm/segment.h @@ -8,12 +8,4 @@ extern int host_gdt_entry_tls_min; #define GDT_ENTRY_TLS_MIN host_gdt_entry_tls_min #define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1) -typedef struct { - unsigned long seg; -} mm_segment_t; - -#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) }) -#define KERNEL_DS MAKE_MM_SEG(~0UL) -#define USER_DS MAKE_MM_SEG(TASK_SIZE) - #endif diff --git a/arch/x86/um/checksum_32.S b/arch/x86/um/checksum_32.S index 13f118dec74f..aed782ab7721 100644 --- a/arch/x86/um/checksum_32.S +++ b/arch/x86/um/checksum_32.S @@ -110,7 +110,7 @@ csum_partial: 7: popl %ebx popl %esi - ret + RET #else @@ -208,7 +208,7 @@ csum_partial: 80: popl %ebx popl %esi - ret + RET #endif EXPORT_SYMBOL(csum_partial) diff --git a/arch/x86/um/os-Linux/registers.c b/arch/x86/um/os-Linux/registers.c index 3c423dfcd78b..df8f4b4bf98b 100644 --- a/arch/x86/um/os-Linux/registers.c +++ b/arch/x86/um/os-Linux/registers.c @@ -15,6 +15,7 @@ #include <sys/uio.h> #include <asm/sigcontext.h> #include <linux/elf.h> +#include <registers.h> int have_xstate_support; diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c index 2497bac56066..0bc4b73a9cde 100644 --- a/arch/x86/um/ptrace_32.c +++ b/arch/x86/um/ptrace_32.c @@ -7,6 +7,7 @@ #include <linux/sched.h> #include <linux/uaccess.h> #include <asm/ptrace-abi.h> +#include <registers.h> #include <skas.h> extern int arch_switch_tls(struct task_struct *to); diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c index 1401899dee9b..289d0159b041 100644 --- a/arch/x86/um/ptrace_64.c +++ b/arch/x86/um/ptrace_64.c @@ -11,6 +11,7 @@ #define __FRAME_OFFSETS #include <asm/ptrace.h> #include <linux/uaccess.h> +#include <registers.h> #include <asm/ptrace-abi.h> /* diff --git a/arch/x86/um/setjmp_32.S b/arch/x86/um/setjmp_32.S index 62eaf8c80e04..2d991ddbcca5 100644 --- a/arch/x86/um/setjmp_32.S +++ b/arch/x86/um/setjmp_32.S @@ -34,7 +34,7 @@ kernel_setjmp: movl %esi,12(%edx) movl %edi,16(%edx) movl %ecx,20(%edx) # Return address - ret + RET .size kernel_setjmp,.-kernel_setjmp diff --git a/arch/x86/um/setjmp_64.S b/arch/x86/um/setjmp_64.S index 1b5d40d4ff46..b46acb6a8ebd 100644 --- a/arch/x86/um/setjmp_64.S +++ b/arch/x86/um/setjmp_64.S @@ -33,7 +33,7 @@ kernel_setjmp: movq %r14,40(%rdi) movq %r15,48(%rdi) movq %rsi,56(%rdi) # Return address - ret + RET .size kernel_setjmp,.-kernel_setjmp diff --git a/arch/x86/um/shared/sysdep/syscalls_64.h b/arch/x86/um/shared/sysdep/syscalls_64.h index 8a7d5e1da98e..48d6cd12f8a5 100644 --- a/arch/x86/um/shared/sysdep/syscalls_64.h +++ b/arch/x86/um/shared/sysdep/syscalls_64.h @@ -23,9 +23,6 @@ extern syscall_handler_t *sys_call_table[]; UPT_SYSCALL_ARG5(®s->regs), \ UPT_SYSCALL_ARG6(®s->regs))) -extern long old_mmap(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff); extern syscall_handler_t sys_modify_ldt; extern syscall_handler_t sys_arch_prctl; diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index 7c11c9e5d7ea..263e1d08f216 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -12,6 +12,7 @@ #include <linux/uaccess.h> #include <asm/ucontext.h> #include <frame_kern.h> +#include <registers.h> #include <skas.h> #ifdef CONFIG_X86_32 diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index 0575decb5e54..89df5d89d664 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -9,8 +9,6 @@ #include <linux/cache.h> #include <asm/syscall.h> -#define __NO_STUBS - /* * Below you can see, in terms of #define's, the differences between the x86-64 * and the UML syscall table. @@ -23,8 +21,6 @@ #define sys_vm86old sys_ni_syscall #define sys_vm86 sys_ni_syscall -#define old_mmap sys_old_mmap - #define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native) #define __SYSCALL(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index 95725b5a41ac..b0b4cfd2308c 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -9,8 +9,6 @@ #include <linux/cache.h> #include <asm/syscall.h> -#define __NO_STUBS - /* * Below you can see, in terms of #define's, the differences between the x86-64 * and the UML syscall table. @@ -20,21 +18,6 @@ #define sys_iopl sys_ni_syscall #define sys_ioperm sys_ni_syscall -/* - * The UML TLS problem. Note that x86_64 does not implement this, so the below - * is needed only for the ia32 compatibility. - */ - -/* On UML we call it this way ("old" means it's not mmap2) */ -#define sys_mmap old_mmap - -#define stub_clone sys_clone -#define stub_fork sys_fork -#define stub_vfork sys_vfork -#define stub_execve sys_execve -#define stub_execveat sys_execveat -#define stub_rt_sigreturn sys_rt_sigreturn - #define __SYSCALL(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); #include <asm/syscalls_64.h> diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c index 58f51667e2e4..fe5323f0c42d 100644 --- a/arch/x86/um/syscalls_64.c +++ b/arch/x86/um/syscalls_64.c @@ -10,7 +10,9 @@ #include <linux/syscalls.h> #include <linux/uaccess.h> #include <asm/prctl.h> /* XXX This should get the constants from libc */ +#include <registers.h> #include <os.h> +#include <registers.h> long arch_prctl(struct task_struct *task, int option, unsigned long __user *arg2) @@ -35,7 +37,7 @@ long arch_prctl(struct task_struct *task, int option, switch (option) { case ARCH_SET_FS: case ARCH_SET_GS: - ret = restore_registers(pid, ¤t->thread.regs.regs); + ret = restore_pid_registers(pid, ¤t->thread.regs.regs); if (ret) return ret; break; @@ -87,3 +89,13 @@ void arch_switch_to(struct task_struct *to) arch_prctl(to, ARCH_SET_FS, (void __user *) to->thread.arch.fs); } + +SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, + unsigned long, prot, unsigned long, flags, + unsigned long, fd, unsigned long, off) +{ + if (off & ~PAGE_MASK) + return -EINVAL; + + return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); +} diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index afc1da68b06d..85246dd9faa1 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -23,6 +23,7 @@ config XEN_PV select PARAVIRT_XXL select XEN_HAVE_PVMMU select XEN_HAVE_VPMU + select GUEST_PERF_EVENTS help Support running as a Xen PV guest. @@ -43,13 +44,9 @@ config XEN_PV_SMP def_bool y depends on XEN_PV && SMP -config XEN_DOM0 - bool "Xen PV Dom0 support" - default y - depends on XEN_PV && PCI_XEN && SWIOTLB_XEN - depends on X86_IO_APIC && ACPI && PCI - help - Support running as a Xen PV Dom0 guest. +config XEN_PV_DOM0 + def_bool y + depends on XEN_PV && XEN_DOM0 config XEN_PVHVM def_bool y @@ -86,3 +83,12 @@ config XEN_PVH def_bool n help Support for running as a Xen PVH guest. + +config XEN_DOM0 + bool "Xen Dom0 support" + default XEN_PV + depends on (XEN_PV && SWIOTLB_XEN) || (XEN_PVH && X86_64) + depends on X86_IO_APIC && ACPI && PCI + select X86_X2APIC if XEN_PVH && X86_64 + help + Support running as a Xen Dom0 guest. diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 40b5779fce21..4953260e281c 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -45,7 +45,7 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o -obj-$(CONFIG_XEN_DOM0) += vga.o +obj-$(CONFIG_XEN_PV_DOM0) += vga.o obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c79bd0af2e8c..30c6e986a6cd 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -3,6 +3,7 @@ #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG #include <linux/memblock.h> #endif +#include <linux/console.h> #include <linux/cpu.h> #include <linux/kexec.h> #include <linux/slab.h> @@ -10,12 +11,15 @@ #include <xen/xen.h> #include <xen/features.h> +#include <xen/interface/sched.h> +#include <xen/interface/version.h> #include <xen/page.h> #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> #include <asm/cpu.h> #include <asm/e820/api.h> +#include <asm/setup.h> #include "xen-ops.h" #include "smp.h" @@ -27,34 +31,16 @@ EXPORT_SYMBOL_GPL(hypercall_page); * Pointer to the xen_vcpu_info structure or * &HYPERVISOR_shared_info->vcpu_info[cpu]. See xen_hvm_init_shared_info * and xen_vcpu_setup for details. By default it points to share_info->vcpu_info - * but if the hypervisor supports VCPUOP_register_vcpu_info then it can point - * to xen_vcpu_info. The pointer is used in __xen_evtchn_do_upcall to - * acknowledge pending events. - * Also more subtly it is used by the patched version of irq enable/disable - * e.g. xen_irq_enable_direct and xen_iret in PV mode. - * - * The desire to be able to do those mask/unmask operations as a single - * instruction by using the per-cpu offset held in %gs is the real reason - * vcpu info is in a per-cpu pointer and the original reason for this - * hypercall. - * + * but during boot it is switched to point to xen_vcpu_info. + * The pointer is used in __xen_evtchn_do_upcall to acknowledge pending events. */ DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); - -/* - * Per CPU pages used if hypervisor supports VCPUOP_register_vcpu_info - * hypercall. This can be used both in PV and PVHVM mode. The structure - * overrides the default per_cpu(xen_vcpu, cpu) value. - */ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); /* Linux <-> Xen vCPU id mapping */ DEFINE_PER_CPU(uint32_t, xen_vcpu_id); EXPORT_PER_CPU_SYMBOL(xen_vcpu_id); -enum xen_domain_type xen_domain_type = XEN_NATIVE; -EXPORT_SYMBOL_GPL(xen_domain_type); - unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START; EXPORT_SYMBOL(machine_to_phys_mapping); unsigned long machine_to_phys_nr; @@ -69,10 +55,12 @@ __read_mostly int xen_have_vector_callback; EXPORT_SYMBOL_GPL(xen_have_vector_callback); /* - * NB: needs to live in .data because it's used by xen_prepare_pvh which runs - * before clearing the bss. + * NB: These need to live in .data or alike because they're used by + * xen_prepare_pvh() which runs before clearing the bss. */ -uint32_t xen_start_flags __section(".data") = 0; +enum xen_domain_type __ro_after_init xen_domain_type = XEN_NATIVE; +EXPORT_SYMBOL_GPL(xen_domain_type); +uint32_t __ro_after_init xen_start_flags; EXPORT_SYMBOL(xen_start_flags); /* @@ -81,21 +69,6 @@ EXPORT_SYMBOL(xen_start_flags); */ struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info; -/* - * Flag to determine whether vcpu info placement is available on all - * VCPUs. We assume it is to start with, and then set it to zero on - * the first failure. This is because it can succeed on some VCPUs - * and not others, since it can involve hypervisor memory allocation, - * or because the guest failed to guarantee all the appropriate - * constraints on all VCPUs (ie buffer can't cross a page boundary). - * - * Note that any particular CPU may be using a placed vcpu structure, - * but we can only optimise if the all are. - * - * 0: not available, 1: available - */ -int xen_have_vcpu_info_placement = 1; - static int xen_cpu_up_online(unsigned int cpu) { xen_init_lock_cpu(cpu); @@ -121,10 +94,8 @@ int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int), return rc >= 0 ? 0 : rc; } -static int xen_vcpu_setup_restore(int cpu) +static void xen_vcpu_setup_restore(int cpu) { - int rc = 0; - /* Any per_cpu(xen_vcpu) is stale, so reset it */ xen_vcpu_info_reset(cpu); @@ -133,11 +104,8 @@ static int xen_vcpu_setup_restore(int cpu) * be handled by hotplug. */ if (xen_pv_domain() || - (xen_hvm_domain() && cpu_online(cpu))) { - rc = xen_vcpu_setup(cpu); - } - - return rc; + (xen_hvm_domain() && cpu_online(cpu))) + xen_vcpu_setup(cpu); } /* @@ -147,7 +115,7 @@ static int xen_vcpu_setup_restore(int cpu) */ void xen_vcpu_restore(void) { - int cpu, rc; + int cpu; for_each_possible_cpu(cpu) { bool other_cpu = (cpu != smp_processor_id()); @@ -167,20 +135,9 @@ void xen_vcpu_restore(void) if (xen_pv_domain() || xen_feature(XENFEAT_hvm_safe_pvclock)) xen_setup_runstate_info(cpu); - rc = xen_vcpu_setup_restore(cpu); - if (rc) - pr_emerg_once("vcpu restore failed for cpu=%d err=%d. " - "System will hang.\n", cpu, rc); - /* - * In case xen_vcpu_setup_restore() fails, do not bring up the - * VCPU. This helps us avoid the resulting OOPS when the VCPU - * accesses pvclock_vcpu_time via xen_vcpu (which is NULL.) - * Note that this does not improve the situation much -- now the - * VM hangs instead of OOPSing -- with the VCPUs that did not - * fail, spinning in stop_machine(), waiting for the failed - * VCPUs to come up. - */ - if (other_cpu && is_up && (rc == 0) && + xen_vcpu_setup_restore(cpu); + + if (other_cpu && is_up && HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL)) BUG(); } @@ -197,7 +154,7 @@ void xen_vcpu_info_reset(int cpu) } } -int xen_vcpu_setup(int cpu) +void xen_vcpu_setup(int cpu) { struct vcpu_register_vcpu_info info; int err; @@ -218,44 +175,65 @@ int xen_vcpu_setup(int cpu) */ if (xen_hvm_domain()) { if (per_cpu(xen_vcpu, cpu) == &per_cpu(xen_vcpu_info, cpu)) - return 0; + return; } - if (xen_have_vcpu_info_placement) { - vcpup = &per_cpu(xen_vcpu_info, cpu); - info.mfn = arbitrary_virt_to_mfn(vcpup); - info.offset = offset_in_page(vcpup); + vcpup = &per_cpu(xen_vcpu_info, cpu); + info.mfn = arbitrary_virt_to_mfn(vcpup); + info.offset = offset_in_page(vcpup); - /* - * Check to see if the hypervisor will put the vcpu_info - * structure where we want it, which allows direct access via - * a percpu-variable. - * N.B. This hypercall can _only_ be called once per CPU. - * Subsequent calls will error out with -EINVAL. This is due to - * the fact that hypervisor has no unregister variant and this - * hypercall does not allow to over-write info.mfn and - * info.offset. - */ - err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, - xen_vcpu_nr(cpu), &info); - - if (err) { - pr_warn_once("register_vcpu_info failed: cpu=%d err=%d\n", - cpu, err); - xen_have_vcpu_info_placement = 0; - } else { - /* - * This cpu is using the registered vcpu info, even if - * later ones fail to. - */ - per_cpu(xen_vcpu, cpu) = vcpup; - } - } + /* + * N.B. This hypercall can _only_ be called once per CPU. + * Subsequent calls will error out with -EINVAL. This is due to + * the fact that hypervisor has no unregister variant and this + * hypercall does not allow to over-write info.mfn and + * info.offset. + */ + err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, xen_vcpu_nr(cpu), + &info); + if (err) + panic("register_vcpu_info failed: cpu=%d err=%d\n", cpu, err); + + per_cpu(xen_vcpu, cpu) = vcpup; +} + +void __init xen_banner(void) +{ + unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL); + struct xen_extraversion extra; - if (!xen_have_vcpu_info_placement) - xen_vcpu_info_reset(cpu); + HYPERVISOR_xen_version(XENVER_extraversion, &extra); - return ((per_cpu(xen_vcpu, cpu) == NULL) ? -ENODEV : 0); + pr_info("Booting kernel on %s\n", pv_info.name); + pr_info("Xen version: %u.%u%s%s\n", + version >> 16, version & 0xffff, extra.extraversion, + xen_feature(XENFEAT_mmu_pt_update_preserve_ad) + ? " (preserve-AD)" : ""); +} + +/* Check if running on Xen version (major, minor) or later */ +bool xen_running_on_version_or_later(unsigned int major, unsigned int minor) +{ + unsigned int version; + + if (!xen_domain()) + return false; + + version = HYPERVISOR_xen_version(XENVER_version, NULL); + if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) || + ((version >> 16) > major)) + return true; + return false; +} + +void __init xen_add_preferred_consoles(void) +{ + add_preferred_console("xenboot", 0, NULL); + if (!boot_params.screen_info.orig_video_isVGA) + add_preferred_console("tty", 0, NULL); + add_preferred_console("hvc", 0, NULL); + if (boot_params.screen_info.orig_video_isVGA) + add_preferred_console("tty", 0, NULL); } void xen_reboot(int reason) diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index e68ea5f4ad1c..42300941ec29 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -163,9 +163,9 @@ static int xen_cpu_up_prepare_hvm(unsigned int cpu) per_cpu(xen_vcpu_id, cpu) = cpu_acpi_id(cpu); else per_cpu(xen_vcpu_id, cpu) = cpu; - rc = xen_vcpu_setup(cpu); - if (rc || !xen_have_vector_callback) - return rc; + xen_vcpu_setup(cpu); + if (!xen_have_vector_callback) + return 0; if (xen_feature(XENFEAT_hvm_safe_pvclock)) xen_setup_timer(cpu); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 6e0d0754f94f..5004feb16783 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -27,8 +27,6 @@ #include <linux/export.h> #include <linux/mm.h> #include <linux/page-flags.h> -#include <linux/highmem.h> -#include <linux/console.h> #include <linux/pci.h> #include <linux/gfp.h> #include <linux/edd.h> @@ -109,17 +107,6 @@ struct tls_descs { */ static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc); -static void __init xen_banner(void) -{ - unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL); - struct xen_extraversion extra; - HYPERVISOR_xen_version(XENVER_extraversion, &extra); - - pr_info("Booting paravirtualized kernel on %s\n", pv_info.name); - pr_info("Xen version: %d.%d%s (preserve-AD)\n", - version >> 16, version & 0xffff, extra.extraversion); -} - static void __init xen_pv_init_platform(void) { populate_extra_pte(fix_to_virt(FIX_PARAVIRT_BOOTMAP)); @@ -142,22 +129,6 @@ static void __init xen_pv_guest_late_init(void) #endif } -/* Check if running on Xen version (major, minor) or later */ -bool -xen_running_on_version_or_later(unsigned int major, unsigned int minor) -{ - unsigned int version; - - if (!xen_domain()) - return false; - - version = HYPERVISOR_xen_version(XENVER_version, NULL); - if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) || - ((version >> 16) > major)) - return true; - return false; -} - static __read_mostly unsigned int cpuid_leaf5_ecx_val; static __read_mostly unsigned int cpuid_leaf5_edx_val; @@ -311,12 +282,12 @@ static void __init xen_init_capabilities(void) } } -static void xen_set_debugreg(int reg, unsigned long val) +static noinstr void xen_set_debugreg(int reg, unsigned long val) { HYPERVISOR_set_debugreg(reg, val); } -static unsigned long xen_get_debugreg(int reg) +static noinstr unsigned long xen_get_debugreg(int reg) { return HYPERVISOR_get_debugreg(reg); } @@ -1021,31 +992,13 @@ void __init xen_setup_vcpu_info_placement(void) for_each_possible_cpu(cpu) { /* Set up direct vCPU id mapping for PV guests. */ per_cpu(xen_vcpu_id, cpu) = cpu; - - /* - * xen_vcpu_setup(cpu) can fail -- in which case it - * falls back to the shared_info version for cpus - * where xen_vcpu_nr(cpu) < MAX_VIRT_CPUS. - * - * xen_cpu_up_prepare_pv() handles the rest by failing - * them in hotplug. - */ - (void) xen_vcpu_setup(cpu); + xen_vcpu_setup(cpu); } - /* - * xen_vcpu_setup managed to place the vcpu_info within the - * percpu area for all cpus, so make use of it. - */ - if (xen_have_vcpu_info_placement) { - pv_ops.irq.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); - pv_ops.irq.irq_disable = - __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); - pv_ops.irq.irq_enable = - __PV_IS_CALLEE_SAVE(xen_irq_enable_direct); - pv_ops.mmu.read_cr2 = - __PV_IS_CALLEE_SAVE(xen_read_cr2_direct); - } + pv_ops.irq.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); + pv_ops.irq.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); + pv_ops.irq.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct); + pv_ops.mmu.read_cr2 = __PV_IS_CALLEE_SAVE(xen_read_cr2_direct); } static const struct pv_info xen_info __initconst = { @@ -1053,52 +1006,54 @@ static const struct pv_info xen_info __initconst = { .name = "Xen", }; -static const struct pv_cpu_ops xen_cpu_ops __initconst = { - .cpuid = xen_cpuid, +static const typeof(pv_ops) xen_cpu_ops __initconst = { + .cpu = { + .cpuid = xen_cpuid, - .set_debugreg = xen_set_debugreg, - .get_debugreg = xen_get_debugreg, + .set_debugreg = xen_set_debugreg, + .get_debugreg = xen_get_debugreg, - .read_cr0 = xen_read_cr0, - .write_cr0 = xen_write_cr0, + .read_cr0 = xen_read_cr0, + .write_cr0 = xen_write_cr0, - .write_cr4 = xen_write_cr4, + .write_cr4 = xen_write_cr4, - .wbinvd = native_wbinvd, + .wbinvd = native_wbinvd, - .read_msr = xen_read_msr, - .write_msr = xen_write_msr, + .read_msr = xen_read_msr, + .write_msr = xen_write_msr, - .read_msr_safe = xen_read_msr_safe, - .write_msr_safe = xen_write_msr_safe, + .read_msr_safe = xen_read_msr_safe, + .write_msr_safe = xen_write_msr_safe, - .read_pmc = xen_read_pmc, + .read_pmc = xen_read_pmc, - .load_tr_desc = paravirt_nop, - .set_ldt = xen_set_ldt, - .load_gdt = xen_load_gdt, - .load_idt = xen_load_idt, - .load_tls = xen_load_tls, - .load_gs_index = xen_load_gs_index, + .load_tr_desc = paravirt_nop, + .set_ldt = xen_set_ldt, + .load_gdt = xen_load_gdt, + .load_idt = xen_load_idt, + .load_tls = xen_load_tls, + .load_gs_index = xen_load_gs_index, - .alloc_ldt = xen_alloc_ldt, - .free_ldt = xen_free_ldt, + .alloc_ldt = xen_alloc_ldt, + .free_ldt = xen_free_ldt, - .store_tr = xen_store_tr, + .store_tr = xen_store_tr, - .write_ldt_entry = xen_write_ldt_entry, - .write_gdt_entry = xen_write_gdt_entry, - .write_idt_entry = xen_write_idt_entry, - .load_sp0 = xen_load_sp0, + .write_ldt_entry = xen_write_ldt_entry, + .write_gdt_entry = xen_write_gdt_entry, + .write_idt_entry = xen_write_idt_entry, + .load_sp0 = xen_load_sp0, #ifdef CONFIG_X86_IOPL_IOPERM - .invalidate_io_bitmap = xen_invalidate_io_bitmap, - .update_io_bitmap = xen_update_io_bitmap, + .invalidate_io_bitmap = xen_invalidate_io_bitmap, + .update_io_bitmap = xen_update_io_bitmap, #endif - .io_delay = xen_io_delay, + .io_delay = xen_io_delay, - .start_context_switch = paravirt_start_context_switch, - .end_context_switch = xen_end_context_switch, + .start_context_switch = paravirt_start_context_switch, + .end_context_switch = xen_end_context_switch, + }, }; static void xen_restart(char *msg) @@ -1239,7 +1194,7 @@ asmlinkage __visible void __init xen_start_kernel(void) /* Install Xen paravirt ops */ pv_info = xen_info; - pv_ops.cpu = xen_cpu_ops; + pv_ops.cpu = xen_cpu_ops.cpu; paravirt_iret = xen_iret; xen_init_irq_ops(); @@ -1273,12 +1228,6 @@ asmlinkage __visible void __init xen_start_kernel(void) __supported_pte_mask &= ~_PAGE_GLOBAL; __default_kernel_pte_mask &= ~_PAGE_GLOBAL; - /* - * Prevent page tables from being allocated in highmem, even - * if CONFIG_HIGHPTE is enabled. - */ - __userpte_alloc_gfp &= ~__GFP_HIGHMEM; - /* Get mfn list */ xen_build_dynamic_phys_to_machine(); @@ -1364,7 +1313,6 @@ asmlinkage __visible void __init xen_start_kernel(void) boot_params.hdr.hardware_subarch = X86_SUBARCH_XEN; if (!xen_initial_domain()) { - add_preferred_console("xenboot", 0, NULL); if (pci_xen) x86_init.pci.arch_init = pci_xen_init; x86_platform.set_legacy_features = @@ -1409,11 +1357,7 @@ asmlinkage __visible void __init xen_start_kernel(void) #endif } - if (!boot_params.screen_info.orig_video_isVGA) - add_preferred_console("tty", 0, NULL); - add_preferred_console("hvc", 0, NULL); - if (boot_params.screen_info.orig_video_isVGA) - add_preferred_console("tty", 0, NULL); + xen_add_preferred_consoles(); #ifdef CONFIG_PCI /* PCI BIOS service won't work from a PV guest. */ diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c index 0d5e34b9e6f9..bcae606bbc5c 100644 --- a/arch/x86/xen/enlighten_pvh.c +++ b/arch/x86/xen/enlighten_pvh.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/acpi.h> +#include <linux/export.h> #include <xen/hvc-console.h> @@ -18,10 +19,11 @@ /* * PVH variables. * - * The variable xen_pvh needs to live in the data segment since it is used + * The variable xen_pvh needs to live in a data segment since it is used * after startup_{32|64} is invoked, which will clear the .bss segment. */ -bool xen_pvh __section(".data") = 0; +bool __ro_after_init xen_pvh; +EXPORT_SYMBOL_GPL(xen_pvh); void __init xen_pvh_init(struct boot_params *boot_params) { @@ -36,6 +38,10 @@ void __init xen_pvh_init(struct boot_params *boot_params) pfn = __pa(hypercall_page); wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); + if (xen_initial_domain()) + x86_init.oem.arch_setup = xen_add_preferred_consoles; + x86_init.oem.banner = xen_banner; + xen_efi_init(boot_params); } diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index dfa091d79c2e..06c3c2fb4b06 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -19,65 +19,11 @@ * callback mask. We do this in a very simple manner, by making a call * down into Xen. The pending flag will be checked by Xen on return. */ -void xen_force_evtchn_callback(void) +noinstr void xen_force_evtchn_callback(void) { (void)HYPERVISOR_xen_version(0, NULL); } -asmlinkage __visible unsigned long xen_save_fl(void) -{ - struct vcpu_info *vcpu; - unsigned long flags; - - vcpu = this_cpu_read(xen_vcpu); - - /* flag has opposite sense of mask */ - flags = !vcpu->evtchn_upcall_mask; - - /* convert to IF type flag - -0 -> 0x00000000 - -1 -> 0xffffffff - */ - return (-flags) & X86_EFLAGS_IF; -} -PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl); - -asmlinkage __visible void xen_irq_disable(void) -{ - /* There's a one instruction preempt window here. We need to - make sure we're don't switch CPUs between getting the vcpu - pointer and updating the mask. */ - preempt_disable(); - this_cpu_read(xen_vcpu)->evtchn_upcall_mask = 1; - preempt_enable_no_resched(); -} -PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable); - -asmlinkage __visible void xen_irq_enable(void) -{ - struct vcpu_info *vcpu; - - /* - * We may be preempted as soon as vcpu->evtchn_upcall_mask is - * cleared, so disable preemption to ensure we check for - * events on the VCPU we are still running on. - */ - preempt_disable(); - - vcpu = this_cpu_read(xen_vcpu); - vcpu->evtchn_upcall_mask = 0; - - /* Doesn't matter if we get preempted here, because any - pending event will get dealt with anyway. */ - - barrier(); /* unmask then check (avoid races) */ - if (unlikely(vcpu->evtchn_upcall_pending)) - xen_force_evtchn_callback(); - - preempt_enable(); -} -PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable); - static void xen_safe_halt(void) { /* Blocking includes an implicit local_irq_enable(). */ @@ -94,17 +40,20 @@ static void xen_halt(void) xen_safe_halt(); } -static const struct pv_irq_ops xen_irq_ops __initconst = { - .save_fl = PV_CALLEE_SAVE(xen_save_fl), - .irq_disable = PV_CALLEE_SAVE(xen_irq_disable), - .irq_enable = PV_CALLEE_SAVE(xen_irq_enable), +static const typeof(pv_ops) xen_irq_ops __initconst = { + .irq = { + /* Initial interrupt flag handling only called while interrupts off. */ + .save_fl = __PV_IS_CALLEE_SAVE(paravirt_ret0), + .irq_disable = __PV_IS_CALLEE_SAVE(paravirt_nop), + .irq_enable = __PV_IS_CALLEE_SAVE(paravirt_BUG), - .safe_halt = xen_safe_halt, - .halt = xen_halt, + .safe_halt = xen_safe_halt, + .halt = xen_halt, + }, }; void __init xen_init_irq_ops(void) { - pv_ops.irq = xen_irq_ops; + pv_ops.irq = xen_irq_ops.irq; x86_init.irqs.intr_init = xen_init_IRQ; } diff --git a/arch/x86/xen/mmu_hvm.c b/arch/x86/xen/mmu_hvm.c index 57409373750f..509bdee3ab90 100644 --- a/arch/x86/xen/mmu_hvm.c +++ b/arch/x86/xen/mmu_hvm.c @@ -9,39 +9,28 @@ #ifdef CONFIG_PROC_VMCORE /* - * This function is used in two contexts: - * - the kdump kernel has to check whether a pfn of the crashed kernel - * was a ballooned page. vmcore is using this function to decide - * whether to access a pfn of the crashed kernel. - * - the kexec kernel has to check whether a pfn was ballooned by the - * previous kernel. If the pfn is ballooned, handle it properly. - * Returns 0 if the pfn is not backed by a RAM page, the caller may + * The kdump kernel has to check whether a pfn of the crashed kernel + * was a ballooned page. vmcore is using this function to decide + * whether to access a pfn of the crashed kernel. + * Returns "false" if the pfn is not backed by a RAM page, the caller may * handle the pfn special in this case. */ -static int xen_oldmem_pfn_is_ram(unsigned long pfn) +static bool xen_vmcore_pfn_is_ram(struct vmcore_cb *cb, unsigned long pfn) { struct xen_hvm_get_mem_type a = { .domid = DOMID_SELF, .pfn = pfn, }; - int ram; - if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a)) - return -ENXIO; - - switch (a.mem_type) { - case HVMMEM_mmio_dm: - ram = 0; - break; - case HVMMEM_ram_rw: - case HVMMEM_ram_ro: - default: - ram = 1; - break; + if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a)) { + pr_warn_once("Unexpected HVMOP_get_mem_type failure\n"); + return true; } - - return ram; + return a.mem_type != HVMMEM_mmio_dm; } +static struct vmcore_cb xen_vmcore_cb = { + .pfn_is_ram = xen_vmcore_pfn_is_ram, +}; #endif static void xen_hvm_exit_mmap(struct mm_struct *mm) @@ -75,6 +64,6 @@ void __init xen_hvm_init_mmu_ops(void) if (is_pagetable_dying_supported()) pv_ops.mmu.exit_mmap = xen_hvm_exit_mmap; #ifdef CONFIG_PROC_VMCORE - WARN_ON(register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram)); + register_vmcore_cb(&xen_vmcore_cb); #endif } diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 8d751939c6f3..00354866921b 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -41,7 +41,6 @@ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 */ #include <linux/sched/mm.h> -#include <linux/highmem.h> #include <linux/debugfs.h> #include <linux/bug.h> #include <linux/vmalloc.h> @@ -86,8 +85,10 @@ #include "mmu.h" #include "debugfs.h" +#ifdef CONFIG_X86_VSYSCALL_EMULATION /* l3 pud for userspace vsyscall mapping */ static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; +#endif /* * Protects atomic reservation decrease/increase against concurrent increases. @@ -241,9 +242,11 @@ static void xen_set_pmd(pmd_t *ptr, pmd_t val) * Associate a virtual page frame with a given physical page frame * and protection flags for that frame. */ -void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) +void __init set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) { - set_pte_vaddr(vaddr, mfn_pte(mfn, flags)); + if (HYPERVISOR_update_va_mapping(vaddr, mfn_pte(mfn, flags), + UVMF_INVLPG)) + BUG(); } static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval) @@ -789,7 +792,9 @@ static void __init xen_mark_pinned(struct mm_struct *mm, struct page *page, static void __init xen_after_bootmem(void) { static_branch_enable(&xen_struct_pages_ready); +#ifdef CONFIG_X86_VSYSCALL_EMULATION SetPagePinned(virt_to_page(level3_user_vsyscall)); +#endif xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP); } @@ -1025,7 +1030,7 @@ static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size) for (; vaddr < vaddr_end; vaddr += PAGE_SIZE) make_lowmem_page_readwrite(vaddr); - memblock_free(paddr, size); + memblock_phys_free(paddr, size); } static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin) @@ -1151,7 +1156,7 @@ static void __init xen_pagetable_p2m_free(void) xen_cleanhighmap(addr, addr + size); size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); - memblock_free(__pa(addr), size); + memblock_free((void *)addr, size); } else { xen_cleanmfnmap(addr); } @@ -1192,6 +1197,13 @@ static void __init xen_pagetable_p2m_setup(void) static void __init xen_pagetable_init(void) { + /* + * The majority of further PTE writes is to pagetables already + * announced as such to Xen. Hence it is more efficient to use + * hypercalls for these updates. + */ + pv_ops.mmu.set_pte = __xen_set_pte; + paging_init(); xen_post_allocator_init(); @@ -1204,7 +1216,8 @@ static void __init xen_pagetable_init(void) xen_remap_memory(); xen_setup_mfn_list_list(); } -static void xen_write_cr2(unsigned long cr2) + +static noinstr void xen_write_cr2(unsigned long cr2) { this_cpu_read(xen_vcpu)->arch.cr2 = cr2; } @@ -1420,10 +1433,18 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) * * Many of these PTE updates are done on unpinned and writable pages * and doing a hypercall for these is unnecessary and expensive. At - * this point it is not possible to tell if a page is pinned or not, - * so always write the PTE directly and rely on Xen trapping and + * this point it is rarely possible to tell if a page is pinned, so + * mostly write the PTE directly and rely on Xen trapping and * emulating any updates as necessary. */ +static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) +{ + if (unlikely(is_early_ioremap_ptep(ptep))) + __xen_set_pte(ptep, pte); + else + native_set_pte(ptep, pte); +} + __visible pte_t xen_make_pte_init(pteval_t pte) { unsigned long pfn; @@ -1445,11 +1466,6 @@ __visible pte_t xen_make_pte_init(pteval_t pte) } PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_init); -static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) -{ - __xen_set_pte(ptep, pte); -} - /* Early in boot, while setting up the initial pagetable, assume everything is pinned. */ static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) @@ -1749,7 +1765,6 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) set_page_prot(init_top_pgt, PAGE_KERNEL_RO); set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); - set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); @@ -1766,6 +1781,13 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) /* Unpin Xen-provided one */ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); +#ifdef CONFIG_X86_VSYSCALL_EMULATION + /* Pin user vsyscall L3 */ + set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, + PFN_DOWN(__pa_symbol(level3_user_vsyscall))); +#endif + /* * At this stage there can be no user pgd, and no page structure to * attach it to, so make sure we just set kernel pgd. @@ -1955,7 +1977,7 @@ void __init xen_relocate_p2m(void) pfn_end = p2m_pfn_end; } - memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn)); + memblock_phys_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn)); while (pfn < pfn_end) { if (pfn == p2m_pfn) { pfn = p2m_pfn_end; @@ -1998,6 +2020,7 @@ static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss; static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) { pte_t pte; + unsigned long vaddr; phys >>= PAGE_SHIFT; @@ -2038,15 +2061,15 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) break; } - __native_set_fixmap(idx, pte); + vaddr = __fix_to_virt(idx); + if (HYPERVISOR_update_va_mapping(vaddr, pte, UVMF_INVLPG)) + BUG(); #ifdef CONFIG_X86_VSYSCALL_EMULATION /* Replicate changes to map the vsyscall page into the user pagetable vsyscall mapping. */ - if (idx == VSYSCALL_PAGE) { - unsigned long vaddr = __fix_to_virt(idx); + if (idx == VSYSCALL_PAGE) set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte); - } #endif } @@ -2078,67 +2101,69 @@ static void xen_leave_lazy_mmu(void) preempt_enable(); } -static const struct pv_mmu_ops xen_mmu_ops __initconst = { - .read_cr2 = __PV_IS_CALLEE_SAVE(xen_read_cr2), - .write_cr2 = xen_write_cr2, +static const typeof(pv_ops) xen_mmu_ops __initconst = { + .mmu = { + .read_cr2 = __PV_IS_CALLEE_SAVE(xen_read_cr2), + .write_cr2 = xen_write_cr2, - .read_cr3 = xen_read_cr3, - .write_cr3 = xen_write_cr3_init, + .read_cr3 = xen_read_cr3, + .write_cr3 = xen_write_cr3_init, - .flush_tlb_user = xen_flush_tlb, - .flush_tlb_kernel = xen_flush_tlb, - .flush_tlb_one_user = xen_flush_tlb_one_user, - .flush_tlb_multi = xen_flush_tlb_multi, - .tlb_remove_table = tlb_remove_table, + .flush_tlb_user = xen_flush_tlb, + .flush_tlb_kernel = xen_flush_tlb, + .flush_tlb_one_user = xen_flush_tlb_one_user, + .flush_tlb_multi = xen_flush_tlb_multi, + .tlb_remove_table = tlb_remove_table, - .pgd_alloc = xen_pgd_alloc, - .pgd_free = xen_pgd_free, + .pgd_alloc = xen_pgd_alloc, + .pgd_free = xen_pgd_free, - .alloc_pte = xen_alloc_pte_init, - .release_pte = xen_release_pte_init, - .alloc_pmd = xen_alloc_pmd_init, - .release_pmd = xen_release_pmd_init, + .alloc_pte = xen_alloc_pte_init, + .release_pte = xen_release_pte_init, + .alloc_pmd = xen_alloc_pmd_init, + .release_pmd = xen_release_pmd_init, - .set_pte = xen_set_pte_init, - .set_pmd = xen_set_pmd_hyper, + .set_pte = xen_set_pte_init, + .set_pmd = xen_set_pmd_hyper, - .ptep_modify_prot_start = xen_ptep_modify_prot_start, - .ptep_modify_prot_commit = xen_ptep_modify_prot_commit, + .ptep_modify_prot_start = xen_ptep_modify_prot_start, + .ptep_modify_prot_commit = xen_ptep_modify_prot_commit, - .pte_val = PV_CALLEE_SAVE(xen_pte_val), - .pgd_val = PV_CALLEE_SAVE(xen_pgd_val), + .pte_val = PV_CALLEE_SAVE(xen_pte_val), + .pgd_val = PV_CALLEE_SAVE(xen_pgd_val), - .make_pte = PV_CALLEE_SAVE(xen_make_pte_init), - .make_pgd = PV_CALLEE_SAVE(xen_make_pgd), + .make_pte = PV_CALLEE_SAVE(xen_make_pte_init), + .make_pgd = PV_CALLEE_SAVE(xen_make_pgd), - .set_pud = xen_set_pud_hyper, + .set_pud = xen_set_pud_hyper, - .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), - .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), + .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), + .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), - .pud_val = PV_CALLEE_SAVE(xen_pud_val), - .make_pud = PV_CALLEE_SAVE(xen_make_pud), - .set_p4d = xen_set_p4d_hyper, + .pud_val = PV_CALLEE_SAVE(xen_pud_val), + .make_pud = PV_CALLEE_SAVE(xen_make_pud), + .set_p4d = xen_set_p4d_hyper, - .alloc_pud = xen_alloc_pmd_init, - .release_pud = xen_release_pmd_init, + .alloc_pud = xen_alloc_pmd_init, + .release_pud = xen_release_pmd_init, #if CONFIG_PGTABLE_LEVELS >= 5 - .p4d_val = PV_CALLEE_SAVE(xen_p4d_val), - .make_p4d = PV_CALLEE_SAVE(xen_make_p4d), + .p4d_val = PV_CALLEE_SAVE(xen_p4d_val), + .make_p4d = PV_CALLEE_SAVE(xen_make_p4d), #endif - .activate_mm = xen_activate_mm, - .dup_mmap = xen_dup_mmap, - .exit_mmap = xen_exit_mmap, + .activate_mm = xen_activate_mm, + .dup_mmap = xen_dup_mmap, + .exit_mmap = xen_exit_mmap, - .lazy_mode = { - .enter = paravirt_enter_lazy_mmu, - .leave = xen_leave_lazy_mmu, - .flush = paravirt_flush_lazy_mmu, - }, + .lazy_mode = { + .enter = paravirt_enter_lazy_mmu, + .leave = xen_leave_lazy_mmu, + .flush = paravirt_flush_lazy_mmu, + }, - .set_fixmap = xen_set_fixmap, + .set_fixmap = xen_set_fixmap, + }, }; void __init xen_init_mmu_ops(void) @@ -2146,7 +2171,7 @@ void __init xen_init_mmu_ops(void) x86_init.paging.pagetable_init = xen_pagetable_init; x86_init.hyper.init_after_bootmem = xen_after_bootmem; - pv_ops.mmu = xen_mmu_ops; + pv_ops.mmu = xen_mmu_ops.mmu; memset(dummy_mapping, 0xff, PAGE_SIZE); } @@ -2398,7 +2423,7 @@ static int remap_area_pfn_pte_fn(pte_t *ptep, unsigned long addr, void *data) int xen_remap_pfn(struct vm_area_struct *vma, unsigned long addr, xen_pfn_t *pfn, int nr, int *err_ptr, pgprot_t prot, - unsigned int domid, bool no_translate, struct page **pages) + unsigned int domid, bool no_translate) { int err = 0; struct remap_data rmd; diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 5e6e236977c7..58db86f7b384 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -197,7 +197,7 @@ static void * __ref alloc_p2m_page(void) static void __ref free_p2m_page(void *p) { if (unlikely(!slab_is_available())) { - memblock_free((unsigned long)p, PAGE_SIZE); + memblock_free(p, PAGE_SIZE); return; } diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c index e13b0b49fcdf..89dd6b1708b0 100644 --- a/arch/x86/xen/pmu.c +++ b/arch/x86/xen/pmu.c @@ -413,34 +413,29 @@ int pmu_apic_update(uint32_t val) } /* perf callbacks */ -static int xen_is_in_guest(void) +static unsigned int xen_guest_state(void) { const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + unsigned int state = 0; if (!xenpmu_data) { pr_warn_once("%s: pmudata not initialized\n", __func__); - return 0; + return state; } if (!xen_initial_domain() || (xenpmu_data->domain_id >= DOMID_SELF)) - return 0; + return state; - return 1; -} - -static int xen_is_user_mode(void) -{ - const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + state |= PERF_GUEST_ACTIVE; - if (!xenpmu_data) { - pr_warn_once("%s: pmudata not initialized\n", __func__); - return 0; + if (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_PV) { + if (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_USER) + state |= PERF_GUEST_USER; + } else if (xenpmu_data->pmu.r.regs.cpl & 3) { + state |= PERF_GUEST_USER; } - if (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_PV) - return (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_USER); - else - return !!(xenpmu_data->pmu.r.regs.cpl & 3); + return state; } static unsigned long xen_get_guest_ip(void) @@ -456,9 +451,8 @@ static unsigned long xen_get_guest_ip(void) } static struct perf_guest_info_callbacks xen_guest_cbs = { - .is_in_guest = xen_is_in_guest, - .is_user_mode = xen_is_user_mode, - .get_guest_ip = xen_get_guest_ip, + .state = xen_guest_state, + .get_ip = xen_get_guest_ip, }; /* Convert registers from Xen's format to Linux' */ diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 8bfc10330107..af216feb63d9 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -153,7 +153,7 @@ static void __init xen_del_extra_mem(unsigned long start_pfn, break; } } - memblock_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns)); + memblock_phys_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns)); } /* @@ -306,10 +306,6 @@ static void __init xen_update_mem_tables(unsigned long pfn, unsigned long mfn) BUG(); } - /* Update kernel mapping, but not for highmem. */ - if (pfn >= PFN_UP(__pa(high_memory - 1))) - return; - if (HYPERVISOR_update_va_mapping((unsigned long)__va(pfn << PAGE_SHIFT), mfn_pte(mfn, PAGE_KERNEL), 0)) { WARN(1, "Failed to update kernel mapping for mfn=%ld pfn=%ld\n", @@ -429,13 +425,13 @@ static unsigned long __init xen_set_identity_and_remap_chunk( } /* - * If the PFNs are currently mapped, the VA mapping also needs - * to be updated to be 1:1. + * If the PFNs are currently mapped, their VA mappings need to be + * zapped. */ for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) (void)HYPERVISOR_update_va_mapping( (unsigned long)__va(pfn << PAGE_SHIFT), - mfn_pte(pfn, PAGE_KERNEL_IO), 0); + native_make_pte(0), 0); return remap_pfn; } @@ -719,7 +715,7 @@ static void __init xen_reserve_xen_mfnlist(void) return; xen_relocate_p2m(); - memblock_free(start, size); + memblock_phys_free(start, size); } /** @@ -885,7 +881,7 @@ char * __init xen_memory_setup(void) xen_phys_memcpy(new_area, start, size); pr_info("initrd moved from [mem %#010llx-%#010llx] to [mem %#010llx-%#010llx]\n", start, start + size, new_area, new_area + size); - memblock_free(start, size); + memblock_phys_free(start, size); boot_params.hdr.ramdisk_image = new_area; boot_params.ext_ramdisk_image = new_area >> 32; } diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index c1b2f764b29a..c3e1f9a7d43a 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -121,34 +121,10 @@ int xen_smp_intr_init(unsigned int cpu) void __init xen_smp_cpus_done(unsigned int max_cpus) { - int cpu, rc, count = 0; - if (xen_hvm_domain()) native_smp_cpus_done(max_cpus); else calculate_max_logical_packages(); - - if (xen_have_vcpu_info_placement) - return; - - for_each_online_cpu(cpu) { - if (xen_vcpu_nr(cpu) < MAX_VIRT_CPUS) - continue; - - rc = remove_cpu(cpu); - - if (rc == 0) { - /* - * Reset vcpu_info so this cpu cannot be onlined again. - */ - xen_vcpu_info_reset(cpu); - count++; - } else { - pr_warn("%s: failed to bring CPU %d down, error %d\n", - __func__, cpu, rc); - } - } - WARN(count, "%s: brought %d CPUs offline\n", __func__, count); } void xen_smp_send_reschedule(int cpu) @@ -268,20 +244,16 @@ void xen_send_IPI_allbutself(int vector) static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id) { - irq_enter(); generic_smp_call_function_interrupt(); inc_irq_stat(irq_call_count); - irq_exit(); return IRQ_HANDLED; } static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id) { - irq_enter(); generic_smp_call_function_single_interrupt(); inc_irq_stat(irq_call_count); - irq_exit(); return IRQ_HANDLED; } diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 7ed56c6075b0..6a8f3b53ab83 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -225,7 +225,6 @@ static void __init xen_pv_smp_prepare_boot_cpu(void) static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus) { unsigned cpu; - unsigned int i; if (skip_ioapic_setup) { char *m = (max_cpus == 0) ? @@ -238,16 +237,9 @@ static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus) } xen_init_lock_cpu(0); - smp_store_boot_cpu_info(); - cpu_data(0).x86_max_cores = 1; + smp_prepare_cpus_common(); - for_each_possible_cpu(i) { - zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); - zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); - zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL); - zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); - } - set_cpu_sibling_map(0); + cpu_data(0).x86_max_cores = 1; speculative_store_bypass_ht_init(); @@ -458,10 +450,8 @@ static void xen_pv_stop_other_cpus(int wait) static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id) { - irq_enter(); irq_work_run(); inc_irq_stat(apic_irq_work_irqs); - irq_exit(); return IRQ_HANDLED; } diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c index e336f223f7f4..31b1e3477cb6 100644 --- a/arch/x86/xen/vga.c +++ b/arch/x86/xen/vga.c @@ -63,13 +63,17 @@ void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size) } if (size >= offsetof(struct dom0_vga_console_info, - u.vesa_lfb.gbl_caps) - + sizeof(info->u.vesa_lfb.gbl_caps)) - screen_info->capabilities = info->u.vesa_lfb.gbl_caps; - if (size >= offsetof(struct dom0_vga_console_info, u.vesa_lfb.mode_attrs) + sizeof(info->u.vesa_lfb.mode_attrs)) screen_info->vesa_attributes = info->u.vesa_lfb.mode_attrs; + + if (size >= offsetof(struct dom0_vga_console_info, + u.vesa_lfb.ext_lfb_base) + + sizeof(info->u.vesa_lfb.ext_lfb_base) + && info->u.vesa_lfb.ext_lfb_base) { + screen_info->ext_lfb_base = info->u.vesa_lfb.ext_lfb_base; + screen_info->capabilities |= VIDEO_CAPABILITY_64BIT_BASE; + } break; } } diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 1e626444712b..e730e6200e64 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -20,6 +20,46 @@ #include <linux/init.h> #include <linux/linkage.h> +#include <../entry/calling.h> + +.pushsection .noinstr.text, "ax" +/* + * Disabling events is simply a matter of making the event mask + * non-zero. + */ +SYM_FUNC_START(xen_irq_disable_direct) + movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask + RET +SYM_FUNC_END(xen_irq_disable_direct) + +/* + * Force an event check by making a hypercall, but preserve regs + * before making the call. + */ +SYM_FUNC_START(check_events) + FRAME_BEGIN + push %rax + push %rcx + push %rdx + push %rsi + push %rdi + push %r8 + push %r9 + push %r10 + push %r11 + call xen_force_evtchn_callback + pop %r11 + pop %r10 + pop %r9 + pop %r8 + pop %rdi + pop %rsi + pop %rdx + pop %rcx + pop %rax + FRAME_END + RET +SYM_FUNC_END(check_events) /* * Enable events. This clears the event mask and tests the pending @@ -44,19 +84,9 @@ SYM_FUNC_START(xen_irq_enable_direct) call check_events 1: FRAME_END - ret + RET SYM_FUNC_END(xen_irq_enable_direct) - -/* - * Disabling events is simply a matter of making the event mask - * non-zero. - */ -SYM_FUNC_START(xen_irq_disable_direct) - movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask - ret -SYM_FUNC_END(xen_irq_disable_direct) - /* * (xen_)save_fl is used to get the current interrupt enable status. * Callers expect the status to be in X86_EFLAGS_IF, and other bits @@ -70,52 +100,24 @@ SYM_FUNC_START(xen_save_fl_direct) testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask setz %ah addb %ah, %ah - ret + RET SYM_FUNC_END(xen_save_fl_direct) -/* - * Force an event check by making a hypercall, but preserve regs - * before making the call. - */ -SYM_FUNC_START(check_events) - FRAME_BEGIN - push %rax - push %rcx - push %rdx - push %rsi - push %rdi - push %r8 - push %r9 - push %r10 - push %r11 - call xen_force_evtchn_callback - pop %r11 - pop %r10 - pop %r9 - pop %r8 - pop %rdi - pop %rsi - pop %rdx - pop %rcx - pop %rax - FRAME_END - ret -SYM_FUNC_END(check_events) - SYM_FUNC_START(xen_read_cr2) FRAME_BEGIN _ASM_MOV PER_CPU_VAR(xen_vcpu), %_ASM_AX _ASM_MOV XEN_vcpu_info_arch_cr2(%_ASM_AX), %_ASM_AX FRAME_END - ret + RET SYM_FUNC_END(xen_read_cr2); SYM_FUNC_START(xen_read_cr2_direct) FRAME_BEGIN _ASM_MOV PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_arch_cr2, %_ASM_AX FRAME_END - ret + RET SYM_FUNC_END(xen_read_cr2_direct); +.popsection .macro xen_pv_trap name SYM_CODE_START(xen_\name) @@ -192,6 +194,25 @@ SYM_CODE_START(xen_iret) SYM_CODE_END(xen_iret) /* + * XEN pv doesn't use trampoline stack, PER_CPU_VAR(cpu_tss_rw + TSS_sp0) is + * also the kernel stack. Reusing swapgs_restore_regs_and_return_to_usermode() + * in XEN pv would cause %rsp to move up to the top of the kernel stack and + * leave the IRET frame below %rsp, which is dangerous to be corrupted if #NMI + * interrupts. And swapgs_restore_regs_and_return_to_usermode() pushing the IRET + * frame at the same address is useless. + */ +SYM_CODE_START(xenpv_restore_regs_and_return_to_usermode) + UNWIND_HINT_REGS + POP_REGS + + /* stackleak_erase() can work safely on the kernel stack. */ + STACKLEAK_ERASE_NOCLOBBER + + addq $8, %rsp /* skip regs->orig_ax */ + jmp xen_iret +SYM_CODE_END(xenpv_restore_regs_and_return_to_usermode) + +/* * Xen handles syscall callbacks much like ordinary exceptions, which * means we have: * - kernel gs diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index cb6538ae2fe0..11d286529fe5 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -20,6 +20,23 @@ #include <xen/interface/xen-mca.h> #include <asm/xen/interface.h> +.pushsection .noinstr.text, "ax" + .balign PAGE_SIZE +SYM_CODE_START(hypercall_page) + .rept (PAGE_SIZE / 32) + UNWIND_HINT_FUNC + .skip 31, 0x90 + RET + .endr + +#define HYPERCALL(n) \ + .equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \ + .type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32 +#include <asm/xen-hypercalls.h> +#undef HYPERCALL +SYM_CODE_END(hypercall_page) +.popsection + #ifdef CONFIG_XEN_PV __INIT SYM_CODE_START(startup_xen) @@ -28,13 +45,13 @@ SYM_CODE_START(startup_xen) /* Clear .bss */ xor %eax,%eax - mov $__bss_start, %_ASM_DI - mov $__bss_stop, %_ASM_CX - sub %_ASM_DI, %_ASM_CX - shr $__ASM_SEL(2, 3), %_ASM_CX - rep __ASM_SIZE(stos) + mov $__bss_start, %rdi + mov $__bss_stop, %rcx + sub %rdi, %rcx + shr $3, %rcx + rep stosq - mov %_ASM_SI, xen_start_info + mov %rsi, xen_start_info mov initial_stack(%rip), %rsp /* Set up %gs. @@ -64,23 +81,6 @@ SYM_CODE_END(asm_cpu_bringup_and_idle) #endif #endif -.pushsection .text - .balign PAGE_SIZE -SYM_CODE_START(hypercall_page) - .rept (PAGE_SIZE / 32) - UNWIND_HINT_FUNC - .skip 31, 0x90 - ret - .endr - -#define HYPERCALL(n) \ - .equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \ - .type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32 -#include <asm/xen-hypercalls.h> -#undef HYPERCALL -SYM_CODE_END(hypercall_page) -.popsection - ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 8d7ec49a35fb..fd0fec6e92f4 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -51,6 +51,7 @@ void __init xen_remap_memory(void); phys_addr_t __init xen_find_free_area(phys_addr_t size); char * __init xen_memory_setup(void); void __init xen_arch_setup(void); +void xen_banner(void); void xen_enable_sysenter(void); void xen_enable_syscall(void); void xen_vcpu_restore(void); @@ -75,9 +76,7 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id); bool xen_vcpu_stolen(int vcpu); -extern int xen_have_vcpu_info_placement; - -int xen_vcpu_setup(int cpu); +void xen_vcpu_setup(int cpu); void xen_vcpu_info_reset(int cpu); void xen_setup_vcpu_info_placement(void); @@ -109,7 +108,7 @@ static inline void xen_uninit_lock_cpu(int cpu) struct dom0_vga_console_info; -#ifdef CONFIG_XEN_DOM0 +#ifdef CONFIG_XEN_PV_DOM0 void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size); #else static inline void __init xen_init_vga(const struct dom0_vga_console_info *info, @@ -118,6 +117,8 @@ static inline void __init xen_init_vga(const struct dom0_vga_console_info *info, } #endif +void xen_add_preferred_consoles(void); + void __init xen_init_apic(void); #ifdef CONFIG_XEN_EFI |