Diffstat (limited to 'arch/x86')
946 files changed, 17799 insertions, 12398 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index 586b786b3edf..0038a2d10a7a 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -8,10 +8,7 @@ obj-$(CONFIG_KVM) += kvm/ obj-$(CONFIG_XEN) += xen/ # Hyper-V paravirtualization support -obj-$(CONFIG_HYPERVISOR_GUEST) += hyperv/ - -# lguest paravirtualization support -obj-$(CONFIG_LGUEST_GUEST) += lguest/ +obj-$(subst m,y,$(CONFIG_HYPERV)) += hyperv/ obj-y += realmode/ obj-y += kernel/ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0efb4c9497bc..2fdb23313dd5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # Select 32 or 64 bit config 64BIT bool "64-bit kernel" if ARCH = "x86" @@ -22,7 +23,7 @@ config X86_64 def_bool y depends on 64BIT # Options that are inherently 64-bit kernel only: - select ARCH_HAS_GIGANTIC_PAGE + select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA select ARCH_SUPPORTS_INT128 select ARCH_USE_CMPXCHG_LOCKREF select HAVE_ARCH_SOFT_DIRTY @@ -50,15 +51,19 @@ config X86 select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER + select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_KCOV if X86_64 - select ARCH_HAS_MMIO_FLUSH select ARCH_HAS_PMEM_API if X86_64 + # Causing hangs/crashes, see the commit that added this change for details. + select ARCH_HAS_REFCOUNT if BROKEN + select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN select ARCH_HAS_STRICT_KERNEL_RWX select ARCH_HAS_STRICT_MODULE_RWX select ARCH_HAS_UBSAN_SANITIZE_ALL + select ARCH_HAS_ZONE_DEVICE if X86_64 select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_MIGHT_HAVE_PC_PARPORT @@ -69,9 +74,9 @@ config X86 select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS - select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP - select ARCH_WANT_FRAME_POINTERS + select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH select ARCH_WANTS_DYNAMIC_TASK_STRUCT + select ARCH_WANTS_THP_SWAP if X86_64 select BUILDTIME_EXTABLE_SORT select CLKEVT_I8253 select CLOCKSOURCE_VALIDATE_LAST_CYCLE @@ -87,6 +92,8 @@ config X86 select GENERIC_EARLY_IOREMAP select GENERIC_FIND_FIRST_BIT select GENERIC_IOMAP + select GENERIC_IRQ_EFFECTIVE_AFF_MASK if SMP + select GENERIC_IRQ_MIGRATION if SMP select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW select GENERIC_PENDING_IRQ if SMP @@ -94,6 +101,7 @@ config X86 select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER select GENERIC_TIME_VSYSCALL + select HARDLOCKUP_CHECK_TIMESTAMP if X86_64 select HAVE_ACPI_APEI if ACPI select HAVE_ACPI_APEI_NMI if ACPI select HAVE_ALIGNED_STRUCT_PAGE if SLUB @@ -151,21 +159,25 @@ config X86 select HAVE_MEMBLOCK select HAVE_MEMBLOCK_NODE_MAP select HAVE_MIXED_BREAKPOINTS_REGS + select HAVE_MOD_ARCH_SPECIFIC select HAVE_NMI select HAVE_OPROFILE select HAVE_OPTPROBES select HAVE_PCSPKR_PLATFORM select HAVE_PERF_EVENTS select HAVE_PERF_EVENTS_NMI + select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_RCU_TABLE_FREE select HAVE_REGS_AND_STACK_ACCESS_API - select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER && STACK_VALIDATION + select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER_UNWINDER && STACK_VALIDATION select HAVE_STACK_VALIDATION if X86_64 select HAVE_SYSCALL_TRACEPOINTS select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_USER_RETURN_NOTIFIER select IRQ_FORCED_THREADING + select 
PCI_LOCKLESS_CONFIG select PERF_EVENTS select RTC_LIB select RTC_MC146818_LIB @@ -318,6 +330,7 @@ config FIX_EARLYCON_MEM config PGTABLE_LEVELS int + default 5 if X86_5LEVEL default 4 if X86_64 default 3 if X86_PAE default 2 @@ -416,16 +429,16 @@ config GOLDFISH def_bool y depends on X86_GOLDFISH -config INTEL_RDT_A - bool "Intel Resource Director Technology Allocation support" +config INTEL_RDT + bool "Intel Resource Director Technology support" default n depends on X86 && CPU_SUP_INTEL select KERNFS help - Select to enable resource allocation which is a sub-feature of - Intel Resource Director Technology(RDT). More information about - RDT can be found in the Intel x86 Architecture Software - Developer Manual. + Select to enable resource allocation and monitoring which are + sub-features of Intel Resource Director Technology(RDT). More + information about RDT can be found in the Intel x86 + Architecture Software Developer Manual. Say N if unsure. @@ -769,8 +782,6 @@ config KVM_DEBUG_FS Statistics are displayed in debugfs filesystem. Enabling this option may incur significant overhead. -source "arch/x86/lguest/Kconfig" - config PARAVIRT_TIME_ACCOUNTING bool "Paravirtual steal time accounting" depends on PARAVIRT @@ -1082,7 +1093,7 @@ config X86_MCE_THRESHOLD def_bool y config X86_MCE_INJECT - depends on X86_MCE && X86_LOCAL_APIC && X86_MCELOG_LEGACY + depends on X86_MCE && X86_LOCAL_APIC && DEBUG_FS tristate "Machine check injector support" ---help--- Provide support for injecting machine checks for testing purposes. @@ -1390,6 +1401,24 @@ config X86_PAE has the cost of more pagetable lookup overhead, and also consumes more pagetable space per process. +config X86_5LEVEL + bool "Enable 5-level page tables support" + depends on X86_64 + ---help--- + 5-level paging enables access to larger address space: + upto 128 PiB of virtual address space and 4 PiB of + physical address space. + + It will be supported by future Intel CPUs. + + Note: a kernel with this option enabled can only be booted + on machines that support the feature. + + See Documentation/x86/x86_64/5level-paging.txt for more + information. + + Say N if unsure. + config ARCH_PHYS_ADDR_T_64BIT def_bool y depends on X86_64 || X86_PAE @@ -1407,6 +1436,35 @@ config X86_DIRECT_GBPAGES supports them), so don't confuse the user by printing that we have them enabled. +config ARCH_HAS_MEM_ENCRYPT + def_bool y + +config AMD_MEM_ENCRYPT + bool "AMD Secure Memory Encryption (SME) support" + depends on X86_64 && CPU_SUP_AMD + ---help--- + Say yes to enable support for the encryption of system memory. + This requires an AMD processor that supports Secure Memory + Encryption (SME). + +config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT + bool "Activate AMD Secure Memory Encryption (SME) by default" + default y + depends on AMD_MEM_ENCRYPT + ---help--- + Say yes to have system memory encrypted by default if running on + an AMD processor that supports Secure Memory Encryption (SME). + + If set to Y, then the encryption of system memory can be + deactivated with the mem_encrypt=off command line option. + + If set to N, then the encryption of system memory can be + activated with the mem_encrypt=on command line option. 
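The pair of SME options above encodes a simple policy: CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT picks the boot-time starting state, and the mem_encrypt=on/off command line parameter can override it in either direction. A minimal C sketch of that decision logic follows; the names (sme_enabled, parse_mem_encrypt) are hypothetical stand-ins chosen for illustration, not the symbols used by the actual patch:

	/* Illustrative sketch only -- names are hypothetical, not the kernel's. */
	#include <stdbool.h>
	#include <string.h>

	/* Build-time default, standing in for CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y */
	static bool sme_enabled = true;

	/* Called with the value of a "mem_encrypt=" boot parameter, if one was given. */
	static void parse_mem_encrypt(const char *val)
	{
		if (!strcmp(val, "on"))
			sme_enabled = true;	/* activate even if the Kconfig default is N */
		else if (!strcmp(val, "off"))
			sme_enabled = false;	/* deactivate even if the Kconfig default is Y */
		/* any other value: keep the Kconfig default */
	}

In other words, the Kconfig option only selects the default; the command line always has the last word.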
+ +config ARCH_USE_MEMREMAP_PROT + def_bool y + depends on AMD_MEM_ENCRYPT + # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support" @@ -1748,7 +1806,9 @@ config X86_SMAP config X86_INTEL_MPX prompt "Intel MPX (Memory Protection Extensions)" def_bool n - depends on CPU_SUP_INTEL + # Note: only available in 64-bit mode due to VMA flags shortage + depends on CPU_SUP_INTEL && X86_64 + select ARCH_USES_HIGH_VMA_FLAGS ---help--- MPX provides hardware features that can be used in conjunction with compiler-instrumented code to check @@ -2263,6 +2323,10 @@ source "kernel/livepatch/Kconfig" endmenu +config ARCH_HAS_ADD_PAGES + def_bool y + depends on X86_64 && ARCH_ENABLE_MEMORY_HOTPLUG + config ARCH_ENABLE_MEMORY_HOTPLUG def_bool y depends on X86_64 || (X86_32 && HIGHMEM) @@ -2283,6 +2347,10 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION def_bool y depends on X86_64 && HUGETLB_PAGE && MIGRATION +config ARCH_ENABLE_THP_MIGRATION + def_bool y + depends on X86_64 && TRANSPARENT_HUGEPAGE + menu "Power management and ACPI options" config ARCH_HIBERNATION_HEADER @@ -2793,6 +2861,9 @@ config X86_DMA_REMAP bool depends on STA2X11 +config HAVE_GENERIC_GUP + def_bool y + source "net/Kconfig" source "drivers/Kconfig" diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 3ba5ff2f2d08..65a9a4716e34 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # Put here option for CPU selection and depending optimization choice prompt "Processor family" diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index fcb7604172ce..90b123056f4b 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 menu "Kernel hacking" config TRACE_IRQFLAGS_SUPPORT @@ -305,8 +306,6 @@ config DEBUG_ENTRY Some of these sanity checks may slow down kernel entries and exits or otherwise impact performance. - This is currently used to help test NMI code. - If unsure, say N. config DEBUG_NMI_SELFTEST @@ -348,6 +347,7 @@ config X86_DEBUG_FPU config PUNIT_ATOM_DEBUG tristate "ATOM Punit debug driver" + depends on PCI select DEBUG_FS select IOSF_MBI ---help--- @@ -357,4 +357,61 @@ config PUNIT_ATOM_DEBUG The current power state can be read from /sys/kernel/debug/punit_atom/dev_power_state +choice + prompt "Choose kernel unwinder" + default FRAME_POINTER_UNWINDER + ---help--- + This determines which method will be used for unwinding kernel stack + traces for panics, oopses, bugs, warnings, perf, /proc/<pid>/stack, + livepatch, lockdep, and more. + +config FRAME_POINTER_UNWINDER + bool "Frame pointer unwinder" + select FRAME_POINTER + ---help--- + This option enables the frame pointer unwinder for unwinding kernel + stack traces. + + The unwinder itself is fast and it uses less RAM than the ORC + unwinder, but the kernel text size will grow by ~3% and the kernel's + overall performance will degrade by roughly 5-10%. + + This option is recommended if you want to use the livepatch + consistency model, as this is currently the only way to get a + reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE). + +config ORC_UNWINDER + bool "ORC unwinder" + depends on X86_64 + select STACK_VALIDATION + ---help--- + This option enables the ORC (Oops Rewind Capability) unwinder for + unwinding kernel stack traces. It uses a custom data format which is + a simplified version of the DWARF Call Frame Information standard. 
+ + This unwinder is more accurate across interrupt entry frames than the + frame pointer unwinder. It also enables a 5-10% performance + improvement across the entire kernel compared to frame pointers. + + Enabling this option will increase the kernel's runtime memory usage + by roughly 2-4MB, depending on your kernel config. + +config GUESS_UNWINDER + bool "Guess unwinder" + depends on EXPERT + ---help--- + This option enables the "guess" unwinder for unwinding kernel stack + traces. It scans the stack and reports every kernel text address it + finds. Some of the addresses it reports may be incorrect. + + While this option often produces false positives, it can still be + useful in many cases. Unlike the other unwinders, it has no runtime + overhead. + +endchoice + +config FRAME_POINTER + depends on !ORC_UNWINDER && !GUESS_UNWINDER + bool + endmenu diff --git a/arch/x86/Makefile b/arch/x86/Makefile index bf240b920473..a20eacd9c7e9 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # Unified Makefile for i386 and x86_64 # select defconfig based on actual architecture @@ -11,6 +12,16 @@ else KBUILD_DEFCONFIG := $(ARCH)_defconfig endif +# For gcc stack alignment is specified with -mpreferred-stack-boundary, +# clang has the option -mstack-alignment for that purpose. +ifneq ($(call cc-option, -mpreferred-stack-boundary=4),) + cc_stack_align4 := -mpreferred-stack-boundary=2 + cc_stack_align8 := -mpreferred-stack-boundary=3 +else ifneq ($(call cc-option, -mstack-alignment=16),) + cc_stack_align4 := -mstack-alignment=4 + cc_stack_align8 := -mstack-alignment=8 +endif + # How to compile the 16-bit code. Note we always compile for -march=i386; # that way we can complain to the user if the CPU is insufficient. # @@ -24,10 +35,11 @@ REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -D__KERNEL__ \ -DDISABLE_BRANCH_PROFILING \ -Wall -Wstrict-prototypes -march=i386 -mregparm=3 \ -fno-strict-aliasing -fomit-frame-pointer -fno-pic \ - -mno-mmx -mno-sse \ - $(call cc-option, -ffreestanding) \ - $(call cc-option, -fno-stack-protector) \ - $(call cc-option, -mpreferred-stack-boundary=2) + -mno-mmx -mno-sse + +REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -ffreestanding) +REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -fno-stack-protector) +REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align4)) export REALMODE_CFLAGS # BITS is used as extension for files which are available in a 32 bit @@ -64,8 +76,10 @@ ifeq ($(CONFIG_X86_32),y) # with nonstandard options KBUILD_CFLAGS += -fno-pic - # prevent gcc from keeping the stack 16 byte aligned - KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2) + # Align the stack to the register width instead of using the default + # alignment of 16 bytes. This reduces stack usage and the number of + # alignment instructions. + KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align4)) # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use # a lot more stack due to the lack of sharing of stacklots: @@ -97,8 +111,14 @@ else KBUILD_CFLAGS += $(call cc-option,-mno-80387) KBUILD_CFLAGS += $(call cc-option,-mno-fp-ret-in-387) - # Use -mpreferred-stack-boundary=3 if supported. - KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=3) + # By default gcc and clang use a stack alignment of 16 bytes for x86. + # However the standard kernel entry on x86-64 leaves the stack on an + # 8-byte boundary. 
If the compiler isn't informed about the actual + # alignment it will generate extra alignment instructions for the + # default alignment which keep the stack *mis*aligned. + # Furthermore an alignment to the register width reduces stack usage + # and the number of alignment instructions. + KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align8)) # Use -mskip-rax-setup if supported. KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup) @@ -215,9 +235,6 @@ KBUILD_CFLAGS += -Wno-sign-compare # KBUILD_CFLAGS += -fno-asynchronous-unwind-tables -KBUILD_CFLAGS += $(mflags-y) -KBUILD_AFLAGS += $(mflags-y) - archscripts: scripts_basic $(Q)$(MAKE) $(build)=arch/x86/tools relocs @@ -257,8 +274,6 @@ drivers-$(CONFIG_PM) += arch/x86/power/ drivers-$(CONFIG_FB) += arch/x86/video/ -drivers-$(CONFIG_RAS) += arch/x86/ras/ - #### # boot loader support. Several targets are kept for legacy purposes diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um index 5b7e898ffd9a..45af19921ebd 100644 --- a/arch/x86/Makefile.um +++ b/arch/x86/Makefile.um @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 core-y += arch/x86/crypto/ ifeq ($(CONFIG_X86_32),y) diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu index a45eb15b7cf2..1f5faf8606b4 100644 --- a/arch/x86/Makefile_32.cpu +++ b/arch/x86/Makefile_32.cpu @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # CPU tuning section - shared with UML. # Must change only cflags-y (or [yn]), not CFLAGS! That makes a difference for UML. @@ -9,7 +10,6 @@ else tune = $(call cc-option,-mcpu=$(1),$(2)) endif -align := $(cc-option-align) cflags-$(CONFIG_M486) += -march=i486 cflags-$(CONFIG_M586) += -march=i586 cflags-$(CONFIG_M586TSC) += -march=i586 @@ -24,11 +24,11 @@ cflags-$(CONFIG_MK6) += -march=k6 # They make zero difference whatsosever to performance at this time. 
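As a side note on the stack-alignment flags probed in the arch/x86/Makefile hunk above: gcc's -mpreferred-stack-boundary takes a log2 exponent while clang's -mstack-alignment takes a byte count, so the two cc_stack_align variables end up requesting the same alignment from either compiler:

	cc_stack_align4:  gcc   -mpreferred-stack-boundary=2   (2^2 = 4-byte alignment, 32-bit kernels)
	                  clang -mstack-alignment=4             (4-byte alignment)
	cc_stack_align8:  gcc   -mpreferred-stack-boundary=3   (2^3 = 8-byte alignment, 64-bit kernels)
	                  clang -mstack-alignment=8             (8-byte alignment)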
cflags-$(CONFIG_MK7) += -march=athlon cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon) -cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 -cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 +cflags-$(CONFIG_MCRUSOE) += -march=i686 -falign-functions=0 -falign-jumps=0 -falign-loops=0 +cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) -falign-functions=0 -falign-jumps=0 -falign-loops=0 cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586) -cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 +cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) -falign-functions=0 -falign-jumps=0 -falign-loops=0 cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) cflags-$(CONFIG_MVIAC7) += -march=i686 cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2) diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 0d810fb15eac..d88a2fddba8c 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -73,12 +73,13 @@ UBSAN_SANITIZE := n $(obj)/bzImage: asflags-y := $(SVGA_MODE) quiet_cmd_image = BUILD $@ +silent_redirect_image = >/dev/null cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin \ - $(obj)/zoffset.h $@ + $(obj)/zoffset.h $@ $($(quiet)redirect_image) $(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/tools/build FORCE $(call if_changed,image) - @echo 'Kernel: $@ is ready' ' (#'`cat .version`')' + @$(kecho) 'Kernel: $@ is ready' ' (#'`cat .version`')' OBJCOPYFLAGS_vmlinux.bin := -O binary -R .note -R .comment -S $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE diff --git a/arch/x86/boot/code16gcc.h b/arch/x86/boot/code16gcc.h index 5ff426535397..e19fd7536307 100644 --- a/arch/x86/boot/code16gcc.h +++ b/arch/x86/boot/code16gcc.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ # # code16gcc.h # diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 2c860ad4fe06..4b7575b00563 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # linux/arch/x86/boot/compressed/Makefile # @@ -34,6 +35,7 @@ KBUILD_CFLAGS += $(cflags-y) KBUILD_CFLAGS += -mno-mmx -mno-sse KBUILD_CFLAGS += $(call cc-option,-ffreestanding) KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector) +KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c index 73ccf63b0f48..0cb325734cfb 100644 --- a/arch/x86/boot/compressed/cmdline.c +++ b/arch/x86/boot/compressed/cmdline.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "misc.h" #if CONFIG_EARLY_PRINTK || CONFIG_RANDOMIZE_BASE @@ -13,7 +14,7 @@ static inline char rdfs8(addr_t addr) return *((char *)(fs + addr)); } #include "../cmdline.c" -static unsigned long get_cmd_line_ptr(void) +unsigned long get_cmd_line_ptr(void) { unsigned long cmd_line_ptr = boot_params->hdr.cmd_line_ptr; diff --git a/arch/x86/boot/compressed/cpuflags.c b/arch/x86/boot/compressed/cpuflags.c index aa313466118b..6448a8196d32 100644 --- a/arch/x86/boot/compressed/cpuflags.c +++ b/arch/x86/boot/compressed/cpuflags.c @@ -1,3 +1,4 
@@ +// SPDX-License-Identifier: GPL-2.0 #ifdef CONFIG_RANDOMIZE_BASE #include "../cpuflags.c" diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index cbf4b87f55b9..e56dbc67e837 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -767,7 +767,7 @@ static efi_status_t setup_e820(struct boot_params *params, m |= (u64)efi->efi_memmap_hi << 32; #endif - d = (efi_memory_desc_t *)(m + (i * efi->efi_memdesc_size)); + d = efi_early_memdesc_ptr(m, efi->efi_memdesc_size, i); switch (d->type) { case EFI_RESERVED_TYPE: case EFI_RUNTIME_SERVICES_CODE: @@ -997,6 +997,9 @@ struct boot_params *efi_main(struct efi_config *c, if (boot_params->secure_boot == efi_secureboot_mode_unset) boot_params->secure_boot = efi_get_secureboot(sys_table); + /* Ask the firmware to clear memory on unclean shutdown */ + efi_enable_reset_attack_mitigation(sys_table); + setup_graphics(boot_params); setup_efi_pci(boot_params); @@ -1046,9 +1049,31 @@ struct boot_params *efi_main(struct efi_config *c, memset((char *)gdt->address, 0x0, gdt->size); desc = (struct desc_struct *)gdt->address; - /* The first GDT is a dummy and the second is unused. */ - desc += 2; + /* The first GDT is a dummy. */ + desc++; + + if (IS_ENABLED(CONFIG_X86_64)) { + /* __KERNEL32_CS */ + desc->limit0 = 0xffff; + desc->base0 = 0x0000; + desc->base1 = 0x0000; + desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ; + desc->s = DESC_TYPE_CODE_DATA; + desc->dpl = 0; + desc->p = 1; + desc->limit1 = 0xf; + desc->avl = 0; + desc->l = 0; + desc->d = SEG_OP_SIZE_32BIT; + desc->g = SEG_GRANULARITY_4KB; + desc->base2 = 0x00; + desc++; + } else { + /* Second entry is unused on 32-bit */ + desc++; + } + /* __KERNEL_CS */ desc->limit0 = 0xffff; desc->base0 = 0x0000; desc->base1 = 0x0000; @@ -1056,14 +1081,20 @@ struct boot_params *efi_main(struct efi_config *c, desc->s = DESC_TYPE_CODE_DATA; desc->dpl = 0; desc->p = 1; - desc->limit = 0xf; + desc->limit1 = 0xf; desc->avl = 0; - desc->l = 0; - desc->d = SEG_OP_SIZE_32BIT; + if (IS_ENABLED(CONFIG_X86_64)) { + desc->l = 1; + desc->d = 0; + } else { + desc->l = 0; + desc->d = SEG_OP_SIZE_32BIT; + } desc->g = SEG_GRANULARITY_4KB; desc->base2 = 0x00; - desc++; + + /* __KERNEL_DS */ desc->limit0 = 0xffff; desc->base0 = 0x0000; desc->base1 = 0x0000; @@ -1071,30 +1102,31 @@ struct boot_params *efi_main(struct efi_config *c, desc->s = DESC_TYPE_CODE_DATA; desc->dpl = 0; desc->p = 1; - desc->limit = 0xf; + desc->limit1 = 0xf; desc->avl = 0; desc->l = 0; desc->d = SEG_OP_SIZE_32BIT; desc->g = SEG_GRANULARITY_4KB; desc->base2 = 0x00; - -#ifdef CONFIG_X86_64 - /* Task segment value */ desc++; - desc->limit0 = 0x0000; - desc->base0 = 0x0000; - desc->base1 = 0x0000; - desc->type = SEG_TYPE_TSS; - desc->s = 0; - desc->dpl = 0; - desc->p = 1; - desc->limit = 0x0; - desc->avl = 0; - desc->l = 0; - desc->d = 0; - desc->g = SEG_GRANULARITY_4KB; - desc->base2 = 0x00; -#endif /* CONFIG_X86_64 */ + + if (IS_ENABLED(CONFIG_X86_64)) { + /* Task segment value */ + desc->limit0 = 0x0000; + desc->base0 = 0x0000; + desc->base1 = 0x0000; + desc->type = SEG_TYPE_TSS; + desc->s = 0; + desc->dpl = 0; + desc->p = 1; + desc->limit1 = 0x0; + desc->avl = 0; + desc->l = 0; + desc->d = 0; + desc->g = SEG_GRANULARITY_4KB; + desc->base2 = 0x00; + desc++; + } asm volatile("cli"); asm volatile ("lgdt %0" : : "m" (*gdt)); diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h index c0223f1a89d7..e799dc5c6448 100644 --- a/arch/x86/boot/compressed/eboot.h +++ 
b/arch/x86/boot/compressed/eboot.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef BOOT_COMPRESSED_EBOOT_H #define BOOT_COMPRESSED_EBOOT_H diff --git a/arch/x86/boot/compressed/efi_stub_32.S b/arch/x86/boot/compressed/efi_stub_32.S index a53440e81d52..257e341fd2c8 100644 --- a/arch/x86/boot/compressed/efi_stub_32.S +++ b/arch/x86/boot/compressed/efi_stub_32.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * EFI call stub for IA32. * diff --git a/arch/x86/boot/compressed/efi_thunk_64.S b/arch/x86/boot/compressed/efi_thunk_64.S index 630384a4c14a..bff9ab7c6317 100644 --- a/arch/x86/boot/compressed/efi_thunk_64.S +++ b/arch/x86/boot/compressed/efi_thunk_64.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2014, 2015 Intel Corporation; author Matt Fleming * diff --git a/arch/x86/boot/compressed/error.c b/arch/x86/boot/compressed/error.c index 31922023de49..c881878e56d3 100644 --- a/arch/x86/boot/compressed/error.c +++ b/arch/x86/boot/compressed/error.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Callers outside of misc.c need access to the error reporting routines, * but the *_putstr() functions need to stay in misc.c because of how diff --git a/arch/x86/boot/compressed/error.h b/arch/x86/boot/compressed/error.h index d732e608e3af..1de5821184f1 100644 --- a/arch/x86/boot/compressed/error.h +++ b/arch/x86/boot/compressed/error.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef BOOT_COMPRESSED_ERROR_H #define BOOT_COMPRESSED_ERROR_H diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index d85b9625e836..37380c0d5999 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * linux/boot/head.S * @@ -61,71 +62,6 @@ __HEAD ENTRY(startup_32) -#ifdef CONFIG_EFI_STUB - jmp preferred_addr - - /* - * We don't need the return address, so set up the stack so - * efi_main() can find its arguments. - */ -ENTRY(efi_pe_entry) - add $0x4, %esp - - call 1f -1: popl %esi - subl $1b, %esi - - popl %ecx - movl %ecx, efi32_config(%esi) /* Handle */ - popl %ecx - movl %ecx, efi32_config+8(%esi) /* EFI System table pointer */ - - /* Relocate efi_config->call() */ - leal efi32_config(%esi), %eax - add %esi, 40(%eax) - pushl %eax - - call make_boot_params - cmpl $0, %eax - je fail - movl %esi, BP_code32_start(%eax) - popl %ecx - pushl %eax - pushl %ecx - jmp 2f /* Skip efi_config initialization */ - -ENTRY(efi32_stub_entry) - add $0x4, %esp - popl %ecx - popl %edx - - call 1f -1: popl %esi - subl $1b, %esi - - movl %ecx, efi32_config(%esi) /* Handle */ - movl %edx, efi32_config+8(%esi) /* EFI System table pointer */ - - /* Relocate efi_config->call() */ - leal efi32_config(%esi), %eax - add %esi, 40(%eax) - pushl %eax -2: - call efi_main - cmpl $0, %eax - movl %eax, %esi - jne 2f -fail: - /* EFI init failed, so hang. */ - hlt - jmp fail -2: - movl BP_code32_start(%esi), %eax - leal preferred_addr(%eax), %eax - jmp *%eax - -preferred_addr: -#endif cld /* * Test KEEP_SEGMENTS flag to see if the bootloader is asking @@ -208,6 +144,70 @@ preferred_addr: jmp *%eax ENDPROC(startup_32) +#ifdef CONFIG_EFI_STUB +/* + * We don't need the return address, so set up the stack so efi_main() can find + * its arguments. 
+ */ +ENTRY(efi_pe_entry) + add $0x4, %esp + + call 1f +1: popl %esi + subl $1b, %esi + + popl %ecx + movl %ecx, efi32_config(%esi) /* Handle */ + popl %ecx + movl %ecx, efi32_config+8(%esi) /* EFI System table pointer */ + + /* Relocate efi_config->call() */ + leal efi32_config(%esi), %eax + add %esi, 40(%eax) + pushl %eax + + call make_boot_params + cmpl $0, %eax + je fail + movl %esi, BP_code32_start(%eax) + popl %ecx + pushl %eax + pushl %ecx + jmp 2f /* Skip efi_config initialization */ +ENDPROC(efi_pe_entry) + +ENTRY(efi32_stub_entry) + add $0x4, %esp + popl %ecx + popl %edx + + call 1f +1: popl %esi + subl $1b, %esi + + movl %ecx, efi32_config(%esi) /* Handle */ + movl %edx, efi32_config+8(%esi) /* EFI System table pointer */ + + /* Relocate efi_config->call() */ + leal efi32_config(%esi), %eax + add %esi, 40(%eax) + pushl %eax +2: + call efi_main + cmpl $0, %eax + movl %eax, %esi + jne 2f +fail: + /* EFI init failed, so hang. */ + hlt + jmp fail +2: + movl BP_code32_start(%esi), %eax + leal startup_32(%eax), %eax + jmp *%eax +ENDPROC(efi32_stub_entry) +#endif + .text relocated: diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index d2ae1f821e0c..beb255b66447 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * linux/boot/head.S * @@ -243,65 +244,6 @@ ENTRY(startup_64) * that maps our entire kernel(text+data+bss+brk), zero page * and command line. */ -#ifdef CONFIG_EFI_STUB - /* - * The entry point for the PE/COFF executable is efi_pe_entry, so - * only legacy boot loaders will execute this jmp. - */ - jmp preferred_addr - -ENTRY(efi_pe_entry) - movq %rcx, efi64_config(%rip) /* Handle */ - movq %rdx, efi64_config+8(%rip) /* EFI System table pointer */ - - leaq efi64_config(%rip), %rax - movq %rax, efi_config(%rip) - - call 1f -1: popq %rbp - subq $1b, %rbp - - /* - * Relocate efi_config->call(). - */ - addq %rbp, efi64_config+40(%rip) - - movq %rax, %rdi - call make_boot_params - cmpq $0,%rax - je fail - mov %rax, %rsi - leaq startup_32(%rip), %rax - movl %eax, BP_code32_start(%rsi) - jmp 2f /* Skip the relocation */ - -handover_entry: - call 1f -1: popq %rbp - subq $1b, %rbp - - /* - * Relocate efi_config->call(). - */ - movq efi_config(%rip), %rax - addq %rbp, 40(%rax) -2: - movq efi_config(%rip), %rdi - call efi_main - movq %rax,%rsi - cmpq $0,%rax - jne 2f -fail: - /* EFI init failed, so hang. */ - hlt - jmp fail -2: - movl BP_code32_start(%esi), %eax - leaq preferred_addr(%rax), %rax - jmp *%rax - -preferred_addr: -#endif /* Setup data segments. */ xorl %eax, %eax @@ -346,6 +288,48 @@ preferred_addr: /* Set up the stack */ leaq boot_stack_end(%rbx), %rsp +#ifdef CONFIG_X86_5LEVEL + /* Check if 5-level paging has already enabled */ + movq %cr4, %rax + testl $X86_CR4_LA57, %eax + jnz lvl5 + + /* + * At this point we are in long mode with 4-level paging enabled, + * but we want to enable 5-level paging. + * + * The problem is that we cannot do it directly. Setting LA57 in + * long mode would trigger #GP. So we need to switch off long mode + * first. + * + * NOTE: This is not going to work if bootloader put us above 4G + * limit. + * + * The first step is go into compatibility mode. + */ + + /* Clear additional page table */ + leaq lvl5_pgtable(%rbx), %rdi + xorq %rax, %rax + movq $(PAGE_SIZE/8), %rcx + rep stosq + + /* + * Setup current CR3 as the first and only entry in a new top level + * page table. 
+ */ + movq %cr3, %rdi + leaq 0x7 (%rdi), %rax + movq %rax, lvl5_pgtable(%rbx) + + /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */ + pushq $__KERNEL32_CS + leaq compatible_mode(%rip), %rax + pushq %rax + lretq +lvl5: +#endif + /* Zero EFLAGS */ pushq $0 popfq @@ -371,6 +355,59 @@ preferred_addr: jmp *%rax #ifdef CONFIG_EFI_STUB + +/* The entry point for the PE/COFF executable is efi_pe_entry. */ +ENTRY(efi_pe_entry) + movq %rcx, efi64_config(%rip) /* Handle */ + movq %rdx, efi64_config+8(%rip) /* EFI System table pointer */ + + leaq efi64_config(%rip), %rax + movq %rax, efi_config(%rip) + + call 1f +1: popq %rbp + subq $1b, %rbp + + /* + * Relocate efi_config->call(). + */ + addq %rbp, efi64_config+40(%rip) + + movq %rax, %rdi + call make_boot_params + cmpq $0,%rax + je fail + mov %rax, %rsi + leaq startup_32(%rip), %rax + movl %eax, BP_code32_start(%rsi) + jmp 2f /* Skip the relocation */ + +handover_entry: + call 1f +1: popq %rbp + subq $1b, %rbp + + /* + * Relocate efi_config->call(). + */ + movq efi_config(%rip), %rax + addq %rbp, 40(%rax) +2: + movq efi_config(%rip), %rdi + call efi_main + movq %rax,%rsi + cmpq $0,%rax + jne 2f +fail: + /* EFI init failed, so hang. */ + hlt + jmp fail +2: + movl BP_code32_start(%esi), %eax + leaq startup_64(%rax), %rax + jmp *%rax +ENDPROC(efi_pe_entry) + .org 0x390 ENTRY(efi64_stub_entry) movq %rdi, efi64_config(%rip) /* Handle */ @@ -429,6 +466,44 @@ relocated: jmp *%rax .code32 +#ifdef CONFIG_X86_5LEVEL +compatible_mode: + /* Setup data and stack segments */ + movl $__KERNEL_DS, %eax + movl %eax, %ds + movl %eax, %ss + + /* Disable paging */ + movl %cr0, %eax + btrl $X86_CR0_PG_BIT, %eax + movl %eax, %cr0 + + /* Point CR3 to 5-level paging */ + leal lvl5_pgtable(%ebx), %eax + movl %eax, %cr3 + + /* Enable PAE and LA57 mode */ + movl %cr4, %eax + orl $(X86_CR4_PAE | X86_CR4_LA57), %eax + movl %eax, %cr4 + + /* Calculate address we are running at */ + call 1f +1: popl %edi + subl $1b, %edi + + /* Prepare stack for far return to Long Mode */ + pushl $__KERNEL_CS + leal lvl5(%edi), %eax + push %eax + + /* Enable paging back */ + movl $(X86_CR0_PG | X86_CR0_PE), %eax + movl %eax, %cr0 + + lret +#endif + no_longmode: /* This isn't an x86-64 CPU so hang */ 1: @@ -442,7 +517,7 @@ gdt: .word gdt_end - gdt .long gdt .word 0 - .quad 0x0000000000000000 /* NULL descriptor */ + .quad 0x00cf9a000000ffff /* __KERNEL32_CS */ .quad 0x00af9a000000ffff /* __KERNEL_CS */ .quad 0x00cf92000000ffff /* __KERNEL_DS */ .quad 0x0080890000000000 /* TS descriptor */ @@ -486,3 +561,7 @@ boot_stack_end: .balign 4096 pgtable: .fill BOOT_PGT_SIZE, 1, 0 +#ifdef CONFIG_X86_5LEVEL +lvl5_pgtable: + .fill PAGE_SIZE, 1, 0 +#endif diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 56a7e9201741..a63fbc25ce84 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * kaslr.c * @@ -9,15 +10,43 @@ * contain the entire properly aligned running kernel image. * */ + +/* + * isspace() in linux/ctype.h is expected by next_args() to filter + * out "space/lf/tab". While boot/ctype.h conflicts with linux/ctype.h, + * since isdigit() is implemented in both of them. Hence disable it + * here. + */ +#define BOOT_CTYPE_H + +/* + * _ctype[] in lib/ctype.c is needed by isspace() of linux/ctype.h. + * While both lib/ctype.c and lib/cmdline.c will bring EXPORT_SYMBOL + * which is meaningless and will cause compiling error in some cases. 
+ * So do not include linux/export.h and define EXPORT_SYMBOL(sym) + * as empty. + */ +#define _LINUX_EXPORT_H +#define EXPORT_SYMBOL(sym) + #include "misc.h" #include "error.h" -#include "../boot.h" +#include "../string.h" #include <generated/compile.h> #include <linux/module.h> #include <linux/uts.h> #include <linux/utsname.h> +#include <linux/ctype.h> +#include <linux/efi.h> #include <generated/utsrelease.h> +#include <asm/efi.h> + +/* Macros used by the included decompressor code below. */ +#define STATIC +#include <linux/decompress/mm.h> + +extern unsigned long get_cmd_line_ptr(void); /* Simplified build-specific string for starting entropy. */ static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" @@ -62,6 +91,11 @@ struct mem_vector { static bool memmap_too_large; + +/* Store memory limit specified by "mem=nn[KMG]" or "memmap=nn[KMG]" */ +unsigned long long mem_limit = ULLONG_MAX; + + enum mem_avoid_index { MEM_AVOID_ZO_RANGE = 0, MEM_AVOID_INITRD, @@ -85,49 +119,14 @@ static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two) return true; } -/** - * _memparse - Parse a string with mem suffixes into a number - * @ptr: Where parse begins - * @retptr: (output) Optional pointer to next char after parse completes - * - * Parses a string into a number. The number stored at @ptr is - * potentially suffixed with K, M, G, T, P, E. - */ -static unsigned long long _memparse(const char *ptr, char **retptr) +char *skip_spaces(const char *str) { - char *endptr; /* Local pointer to end of parsed string */ - - unsigned long long ret = simple_strtoull(ptr, &endptr, 0); - - switch (*endptr) { - case 'E': - case 'e': - ret <<= 10; - case 'P': - case 'p': - ret <<= 10; - case 'T': - case 't': - ret <<= 10; - case 'G': - case 'g': - ret <<= 10; - case 'M': - case 'm': - ret <<= 10; - case 'K': - case 'k': - ret <<= 10; - endptr++; - default: - break; - } - - if (retptr) - *retptr = endptr; - - return ret; + while (isspace(*str)) + ++str; + return (char *)str; } +#include "../../../../lib/ctype.c" +#include "../../../../lib/cmdline.c" static int parse_memmap(char *p, unsigned long long *start, unsigned long long *size) @@ -142,40 +141,41 @@ parse_memmap(char *p, unsigned long long *start, unsigned long long *size) return -EINVAL; oldp = p; - *size = _memparse(p, &p); + *size = memparse(p, &p); if (p == oldp) return -EINVAL; switch (*p) { - case '@': - /* Skip this region, usable */ - *start = 0; - *size = 0; - return 0; case '#': case '$': case '!': - *start = _memparse(p + 1, &p); + *start = memparse(p + 1, &p); + return 0; + case '@': + /* memmap=nn@ss specifies usable region, should be skipped */ + *size = 0; + /* Fall through */ + default: + /* + * If w/o offset, only size specified, memmap=nn[KMG] has the + * same behaviour as mem=nn[KMG]. It limits the max address + * system can use. Region above the limit should be avoided. 
+ */ + *start = 0; return 0; } return -EINVAL; } -static void mem_avoid_memmap(void) +static void mem_avoid_memmap(char *str) { - char arg[128]; + static int i; int rc; - int i; - char *str; - /* See if we have any memmap areas */ - rc = cmdline_find_option("memmap", arg, sizeof(arg)); - if (rc <= 0) + if (i >= MAX_MEMMAP_REGIONS) return; - i = 0; - str = arg; while (str && (i < MAX_MEMMAP_REGIONS)) { int rc; unsigned long long start, size; @@ -188,9 +188,14 @@ static void mem_avoid_memmap(void) if (rc < 0) break; str = k; - /* A usable region that should not be skipped */ - if (size == 0) + + if (start == 0) { + /* Store the specified memory limit if size > 0 */ + if (size > 0) + mem_limit = size; + continue; + } mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].start = start; mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].size = size; @@ -202,6 +207,57 @@ static void mem_avoid_memmap(void) memmap_too_large = true; } +static int handle_mem_memmap(void) +{ + char *args = (char *)get_cmd_line_ptr(); + size_t len = strlen((char *)args); + char *tmp_cmdline; + char *param, *val; + u64 mem_size; + + if (!strstr(args, "memmap=") && !strstr(args, "mem=")) + return 0; + + tmp_cmdline = malloc(len + 1); + if (!tmp_cmdline ) + error("Failed to allocate space for tmp_cmdline"); + + memcpy(tmp_cmdline, args, len); + tmp_cmdline[len] = 0; + args = tmp_cmdline; + + /* Chew leading spaces */ + args = skip_spaces(args); + + while (*args) { + args = next_arg(args, ¶m, &val); + /* Stop at -- */ + if (!val && strcmp(param, "--") == 0) { + warn("Only '--' specified in cmdline"); + free(tmp_cmdline); + return -1; + } + + if (!strcmp(param, "memmap")) { + mem_avoid_memmap(val); + } else if (!strcmp(param, "mem")) { + char *p = val; + + if (!strcmp(p, "nopentium")) + continue; + mem_size = memparse(p, &p); + if (mem_size == 0) { + free(tmp_cmdline); + return -EINVAL; + } + mem_limit = mem_size; + } + } + + free(tmp_cmdline); + return 0; +} + /* * In theory, KASLR can put the kernel anywhere in the range of [16M, 64T). * The mem_avoid array is used to store the ranges that need to be avoided @@ -323,7 +379,7 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, /* We don't need to set a mapping for setup_data. */ /* Mark the memmap regions we need to avoid */ - mem_avoid_memmap(); + handle_mem_memmap(); #ifdef CONFIG_X86_VERBOSE_BOOTUP /* Make sure video RAM can be used. */ @@ -426,28 +482,32 @@ static unsigned long slots_fetch_random(void) return 0; } -static void process_e820_entry(struct boot_e820_entry *entry, +static void process_mem_region(struct mem_vector *entry, unsigned long minimum, unsigned long image_size) { struct mem_vector region, overlap; struct slot_area slot_area; - unsigned long start_orig; - - /* Skip non-RAM entries. */ - if (entry->type != E820_TYPE_RAM) - return; + unsigned long start_orig, end; + struct mem_vector cur_entry; /* On 32-bit, ignore entries entirely above our maximum. */ - if (IS_ENABLED(CONFIG_X86_32) && entry->addr >= KERNEL_IMAGE_SIZE) + if (IS_ENABLED(CONFIG_X86_32) && entry->start >= KERNEL_IMAGE_SIZE) return; /* Ignore entries entirely below our minimum. 
*/ - if (entry->addr + entry->size < minimum) + if (entry->start + entry->size < minimum) + return; + + /* Ignore entries above memory limit */ + end = min(entry->size + entry->start, mem_limit); + if (entry->start >= end) return; + cur_entry.start = entry->start; + cur_entry.size = end - entry->start; - region.start = entry->addr; - region.size = entry->size; + region.start = cur_entry.start; + region.size = cur_entry.size; /* Give up if slot area array is full. */ while (slot_area_index < MAX_SLOT_AREA) { @@ -460,8 +520,8 @@ static void process_e820_entry(struct boot_e820_entry *entry, /* Potentially raise address to meet alignment needs. */ region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN); - /* Did we raise the address above this e820 region? */ - if (region.start > entry->addr + entry->size) + /* Did we raise the address above the passed in memory entry? */ + if (region.start > cur_entry.start + cur_entry.size) return; /* Reduce size by any delta from the original address. */ @@ -501,31 +561,126 @@ static void process_e820_entry(struct boot_e820_entry *entry, } } -static unsigned long find_random_phys_addr(unsigned long minimum, - unsigned long image_size) +#ifdef CONFIG_EFI +/* + * Returns true if mirror region found (and must have been processed + * for slots adding) + */ +static bool +process_efi_entries(unsigned long minimum, unsigned long image_size) { + struct efi_info *e = &boot_params->efi_info; + bool efi_mirror_found = false; + struct mem_vector region; + efi_memory_desc_t *md; + unsigned long pmap; + char *signature; + u32 nr_desc; int i; - unsigned long addr; - /* Check if we had too many memmaps. */ - if (memmap_too_large) { - debug_putstr("Aborted e820 scan (more than 4 memmap= args)!\n"); - return 0; + signature = (char *)&e->efi_loader_signature; + if (strncmp(signature, EFI32_LOADER_SIGNATURE, 4) && + strncmp(signature, EFI64_LOADER_SIGNATURE, 4)) + return false; + +#ifdef CONFIG_X86_32 + /* Can't handle data above 4GB at this time */ + if (e->efi_memmap_hi) { + warn("EFI memmap is above 4GB, can't be handled now on x86_32. EFI should be disabled.\n"); + return false; } + pmap = e->efi_memmap; +#else + pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32)); +#endif - /* Make sure minimum is aligned. */ - minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); + nr_desc = e->efi_memmap_size / e->efi_memdesc_size; + for (i = 0; i < nr_desc; i++) { + md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i); + if (md->attribute & EFI_MEMORY_MORE_RELIABLE) { + efi_mirror_found = true; + break; + } + } + + for (i = 0; i < nr_desc; i++) { + md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i); + + /* + * Here we are more conservative in picking free memory than + * the EFI spec allows: + * + * According to the spec, EFI_BOOT_SERVICES_{CODE|DATA} are also + * free memory and thus available to place the kernel image into, + * but in practice there's firmware where using that memory leads + * to crashes. + * + * Only EFI_CONVENTIONAL_MEMORY is guaranteed to be free. 
+ */ + if (md->type != EFI_CONVENTIONAL_MEMORY) + continue; + + if (efi_mirror_found && + !(md->attribute & EFI_MEMORY_MORE_RELIABLE)) + continue; + + region.start = md->phys_addr; + region.size = md->num_pages << EFI_PAGE_SHIFT; + process_mem_region(®ion, minimum, image_size); + if (slot_area_index == MAX_SLOT_AREA) { + debug_putstr("Aborted EFI scan (slot_areas full)!\n"); + break; + } + } + return true; +} +#else +static inline bool +process_efi_entries(unsigned long minimum, unsigned long image_size) +{ + return false; +} +#endif + +static void process_e820_entries(unsigned long minimum, + unsigned long image_size) +{ + int i; + struct mem_vector region; + struct boot_e820_entry *entry; /* Verify potential e820 positions, appending to slots list. */ for (i = 0; i < boot_params->e820_entries; i++) { - process_e820_entry(&boot_params->e820_table[i], minimum, - image_size); + entry = &boot_params->e820_table[i]; + /* Skip non-RAM entries. */ + if (entry->type != E820_TYPE_RAM) + continue; + region.start = entry->addr; + region.size = entry->size; + process_mem_region(®ion, minimum, image_size); if (slot_area_index == MAX_SLOT_AREA) { debug_putstr("Aborted e820 scan (slot_areas full)!\n"); break; } } +} + +static unsigned long find_random_phys_addr(unsigned long minimum, + unsigned long image_size) +{ + /* Check if we had too many memmaps. */ + if (memmap_too_large) { + debug_putstr("Aborted memory entries scan (more than 4 memmap= args)!\n"); + return 0; + } + + /* Make sure minimum is aligned. */ + minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN); + + if (process_efi_entries(minimum, image_size)) + return slots_fetch_random(); + process_e820_entries(minimum, image_size); return slots_fetch_random(); } @@ -584,7 +739,7 @@ void choose_random_location(unsigned long input, */ min_addr = min(*output, 512UL << 20); - /* Walk e820 and find a random address. */ + /* Walk available memory entries to find a random address. 
*/ random_addr = find_random_phys_addr(min_addr, output_size); if (!random_addr) { warn("Physical KASLR disabled: no suitable memory region!"); diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 00241c815524..b50c42455e25 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * misc.c * @@ -116,8 +117,7 @@ void __putstr(const char *s) } } - if (boot_params->screen_info.orig_video_mode == 0 && - lines == 0 && cols == 0) + if (lines == 0 || cols == 0) return; x = boot_params->screen_info.orig_x; @@ -411,3 +411,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, debug_putstr("done.\nBooting the kernel.\n"); return output; } + +void fortify_panic(const char *name) +{ + error("detected buffer overflow"); +} diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 766a5211f827..32d4ec2e0243 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef BOOT_COMPRESSED_MISC_H #define BOOT_COMPRESSED_MISC_H diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c index 1d78f1739087..972319ff5b01 100644 --- a/arch/x86/boot/compressed/pagetable.c +++ b/arch/x86/boot/compressed/pagetable.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * This code is used on x86_64 to create page table identity mappings on * demand by building up a new set of page tables (or appending to the @@ -15,6 +16,13 @@ #define __pa(x) ((unsigned long)(x)) #define __va(x) ((void *)((unsigned long)(x))) +/* + * The pgtable.h and mm/ident_map.c includes make use of the SME related + * information which is not used in the compressed image support. Un-define + * the SME support to avoid any compile and link errors. + */ +#undef CONFIG_AMD_MEM_ENCRYPT + #include "misc.h" /* These actually do the work of building the kernel identity maps. */ @@ -63,7 +71,7 @@ static void *alloc_pgt_page(void *context) static struct alloc_pgt_data pgt_data; /* The top level page table entry pointer. */ -static unsigned long level4p; +static unsigned long top_level_pgt; /* * Mapping information structure passed to kernel_ident_mapping_init(). @@ -91,9 +99,15 @@ void initialize_identity_maps(void) * If we came here via startup_32(), cr3 will be _pgtable already * and we must append to the existing area instead of entirely * overwriting it. + * + * With 5-level paging, we use '_pgtable' to allocate the p4d page table, + * the top-level page table is allocated separately. + * + * p4d_offset(top_level_pgt, 0) would cover both the 4- and 5-level + * cases. On 4-level paging it's equal to 'top_level_pgt'. */ - level4p = read_cr3(); - if (level4p == (unsigned long)_pgtable) { + top_level_pgt = read_cr3_pa(); + if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) { debug_putstr("booted via startup_32()\n"); pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE; pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE; @@ -103,7 +117,7 @@ void initialize_identity_maps(void) pgt_data.pgt_buf = _pgtable; pgt_data.pgt_buf_size = BOOT_PGT_SIZE; memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); - level4p = (unsigned long)alloc_pgt_page(&pgt_data); + top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data); } } @@ -123,7 +137,7 @@ void add_identity_map(unsigned long start, unsigned long size) return; /* Build the mapping. 
*/ - kernel_ident_mapping_init(&mapping_info, (pgd_t *)level4p, + kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt, start, end); } @@ -134,5 +148,5 @@ void add_identity_map(unsigned long start, unsigned long size) */ void finalize_identity_maps(void) { - write_cr3(level4p); + write_cr3(top_level_pgt); } diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c index cea140ce6b42..19dbbcdd1a53 100644 --- a/arch/x86/boot/compressed/string.c +++ b/arch/x86/boot/compressed/string.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * This provides an optimized implementation of memcpy, and a simplified * implementation of memset and memmove. These are used here because the diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S index e24e0a0c90c9..f491bbde8493 100644 --- a/arch/x86/boot/compressed/vmlinux.lds.S +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <asm-generic/vmlinux.lds.h> OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT) diff --git a/arch/x86/boot/copy.S b/arch/x86/boot/copy.S index 1eb7d298b47d..15d9f74b0008 100644 --- a/arch/x86/boot/copy.S +++ b/arch/x86/boot/copy.S @@ -65,23 +65,3 @@ GLOBAL(copy_to_fs) popw %es retl ENDPROC(copy_to_fs) - -#if 0 /* Not currently used, but can be enabled as needed */ -GLOBAL(copy_from_gs) - pushw %ds - pushw %gs - popw %ds - calll memcpy - popw %ds - retl -ENDPROC(copy_from_gs) - -GLOBAL(copy_to_gs) - pushw %es - pushw %gs - popw %es - calll memcpy - popw %es - retl -ENDPROC(copy_to_gs) -#endif diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c index 9e77c23c2422..a0b75f73dc63 100644 --- a/arch/x86/boot/cpuflags.c +++ b/arch/x86/boot/cpuflags.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/types.h> #include "bitops.h" diff --git a/arch/x86/boot/cpuflags.h b/arch/x86/boot/cpuflags.h index 15ad56a3f905..2e20814d3ce3 100644 --- a/arch/x86/boot/cpuflags.h +++ b/arch/x86/boot/cpuflags.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef BOOT_CPUFLAGS_H #define BOOT_CPUFLAGS_H diff --git a/arch/x86/boot/ctype.h b/arch/x86/boot/ctype.h index 020f137df7a2..8f5ef2994b5e 100644 --- a/arch/x86/boot/ctype.h +++ b/arch/x86/boot/ctype.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef BOOT_CTYPE_H #define BOOT_CTYPE_H diff --git a/arch/x86/boot/early_serial_console.c b/arch/x86/boot/early_serial_console.c index f0b8d6d93164..b25c53527a94 100644 --- a/arch/x86/boot/early_serial_console.c +++ b/arch/x86/boot/early_serial_console.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Serial port routines for use during early boot reporting. This code is * included from both the compressed kernel and the regular kernel. diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 2ed8f0c25def..9c7ea597eee6 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * header.S * @@ -520,8 +521,14 @@ pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr # the description in lib/decompressor_xxx.c for specific information. # # extra_bytes = (uncompressed_size >> 12) + 65536 + 128 +# +# LZ4 is even worse: data that cannot be further compressed grows by 0.4%, +# or one byte per 256 bytes. OTOH, we can safely get rid of the +128 as +# the size-dependent part now grows so fast. 
+# +# extra_bytes = (uncompressed_size >> 8) + 65536 -#define ZO_z_extra_bytes ((ZO_z_output_len >> 12) + 65536 + 128) +#define ZO_z_extra_bytes ((ZO_z_output_len >> 8) + 65536) #if ZO_z_output_len > ZO_z_input_len # define ZO_z_extract_offset (ZO_z_output_len + ZO_z_extra_bytes - \ ZO_z_input_len) diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index 5457b02fc050..16f49123d747 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -16,6 +16,15 @@ #include "ctype.h" #include "string.h" +/* + * Undef these macros so that the functions that we provide + * here will have the correct names regardless of how string.h + * may have chosen to #define them. + */ +#undef memcpy +#undef memset +#undef memcmp + int memcmp(const void *s1, const void *s2, size_t len) { bool diff; @@ -122,6 +131,14 @@ unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int bas return result; } +long simple_strtol(const char *cp, char **endp, unsigned int base) +{ + if (*cp == '-') + return -simple_strtoull(cp + 1, endp, base); + + return simple_strtoull(cp, endp, base); +} + /** * strlen - Find the length of a string * @s: The string to be sized diff --git a/arch/x86/boot/string.h b/arch/x86/boot/string.h index 113588ddb43f..3d78e27077f4 100644 --- a/arch/x86/boot/string.h +++ b/arch/x86/boot/string.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef BOOT_STRING_H #define BOOT_STRING_H @@ -22,6 +23,7 @@ extern int strcmp(const char *str1, const char *str2); extern int strncmp(const char *cs, const char *ct, size_t count); extern size_t strlen(const char *s); extern char *strstr(const char *s1, const char *s2); +extern char *strchr(const char *s, int c); extern size_t strnlen(const char *s, size_t maxlen); extern unsigned int atou(const char *s); extern unsigned long long simple_strtoull(const char *cp, char **endp, diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c index 0702d2531bc7..d4e6cd4577e5 100644 --- a/arch/x86/boot/tools/build.c +++ b/arch/x86/boot/tools/build.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 1997 Martin Mares diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 6cf79e1a6830..0eb9f92f3717 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -1,5 +1,4 @@ # CONFIG_64BIT is not set -CONFIG_EXPERIMENTAL=y # CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y @@ -125,7 +124,6 @@ CONFIG_NF_CONNTRACK_IPV4=y CONFIG_IP_NF_IPTABLES=y CONFIG_IP_NF_FILTER=y CONFIG_IP_NF_TARGET_REJECT=y -CONFIG_IP_NF_TARGET_ULOG=y CONFIG_NF_NAT=y CONFIG_IP_NF_TARGET_MASQUERADE=y CONFIG_IP_NF_MANGLE=y @@ -255,7 +253,6 @@ CONFIG_USB_OHCI_HCD=y CONFIG_USB_UHCI_HCD=y CONFIG_USB_PRINTER=y CONFIG_USB_STORAGE=y -CONFIG_USB_LIBUSUAL=y CONFIG_EDAC=y CONFIG_RTC_CLASS=y # CONFIG_RTC_HCTOSYS is not set diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config index 4b429df40d7a..550cd5012b73 100644 --- a/arch/x86/configs/tiny.config +++ b/arch/x86/configs/tiny.config @@ -1,3 +1,5 @@ CONFIG_NOHIGHMEM=y # CONFIG_HIGHMEM4G is not set # CONFIG_HIGHMEM64G is not set +CONFIG_GUESS_UNWINDER=y +# CONFIG_FRAME_POINTER_UNWINDER is not set diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index de45f57b410d..4a4b16e56d35 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -1,4 +1,3 @@ -CONFIG_EXPERIMENTAL=y # 
CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y @@ -124,7 +123,6 @@ CONFIG_NF_CONNTRACK_IPV4=y CONFIG_IP_NF_IPTABLES=y CONFIG_IP_NF_FILTER=y CONFIG_IP_NF_TARGET_REJECT=y -CONFIG_IP_NF_TARGET_ULOG=y CONFIG_NF_NAT=y CONFIG_IP_NF_TARGET_MASQUERADE=y CONFIG_IP_NF_MANGLE=y @@ -251,7 +249,6 @@ CONFIG_USB_OHCI_HCD=y CONFIG_USB_UHCI_HCD=y CONFIG_USB_PRINTER=y CONFIG_USB_STORAGE=y -CONFIG_USB_LIBUSUAL=y CONFIG_EDAC=y CONFIG_RTC_CLASS=y # CONFIG_RTC_HCTOSYS is not set diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 34b3fa2889d1..5f07333bb224 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -1,7 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0 # # Arch-specific CryptoAPI modules. # +OBJECT_FILES_NON_STANDARD := y + avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no) avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\ $(comma)4)$(comma)%ymm2,yes,no) diff --git a/arch/x86/crypto/aes-x86_64-asm_64.S b/arch/x86/crypto/aes-x86_64-asm_64.S index 910565547163..8739cf7795de 100644 --- a/arch/x86/crypto/aes-x86_64-asm_64.S +++ b/arch/x86/crypto/aes-x86_64-asm_64.S @@ -42,17 +42,15 @@ #define R5E %esi #define R6 %rdi #define R6E %edi -#define R7 %rbp -#define R7E %ebp +#define R7 %r9 /* don't use %rbp; it breaks stack traces */ +#define R7E %r9d #define R8 %r8 -#define R9 %r9 #define R10 %r10 #define R11 %r11 -#define prologue(FUNC,KEY,B128,B192,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11) \ +#define prologue(FUNC,KEY,B128,B192,r1,r2,r5,r6,r7,r8,r9,r10,r11) \ ENTRY(FUNC); \ movq r1,r2; \ - movq r3,r4; \ leaq KEY+48(r8),r9; \ movq r10,r11; \ movl (r7),r5 ## E; \ @@ -70,9 +68,8 @@ je B192; \ leaq 32(r9),r9; -#define epilogue(FUNC,r1,r2,r3,r4,r5,r6,r7,r8,r9) \ +#define epilogue(FUNC,r1,r2,r5,r6,r7,r8,r9) \ movq r1,r2; \ - movq r3,r4; \ movl r5 ## E,(r9); \ movl r6 ## E,4(r9); \ movl r7 ## E,8(r9); \ @@ -88,12 +85,12 @@ movl TAB(,r6,4),r6 ## E; \ roll $16,r2 ## E; \ shrl $16,r4 ## E; \ - movzbl r4 ## H,r7 ## E; \ - movzbl r4 ## L,r4 ## E; \ + movzbl r4 ## L,r7 ## E; \ + movzbl r4 ## H,r4 ## E; \ xorl OFFSET(r8),ra ## E; \ xorl OFFSET+4(r8),rb ## E; \ - xorl TAB+3072(,r7,4),r5 ## E;\ - xorl TAB+2048(,r4,4),r6 ## E;\ + xorl TAB+3072(,r4,4),r5 ## E;\ + xorl TAB+2048(,r7,4),r6 ## E;\ movzbl r1 ## L,r7 ## E; \ movzbl r1 ## H,r4 ## E; \ movl TAB+1024(,r4,4),r4 ## E;\ @@ -101,19 +98,19 @@ roll $16,r1 ## E; \ shrl $16,r3 ## E; \ xorl TAB(,r7,4),r5 ## E; \ - movzbl r3 ## H,r7 ## E; \ - movzbl r3 ## L,r3 ## E; \ - xorl TAB+3072(,r7,4),r4 ## E;\ - xorl TAB+2048(,r3,4),r5 ## E;\ - movzbl r1 ## H,r7 ## E; \ - movzbl r1 ## L,r3 ## E; \ + movzbl r3 ## L,r7 ## E; \ + movzbl r3 ## H,r3 ## E; \ + xorl TAB+3072(,r3,4),r4 ## E;\ + xorl TAB+2048(,r7,4),r5 ## E;\ + movzbl r1 ## L,r7 ## E; \ + movzbl r1 ## H,r3 ## E; \ shrl $16,r1 ## E; \ - xorl TAB+3072(,r7,4),r6 ## E;\ - movl TAB+2048(,r3,4),r3 ## E;\ - movzbl r1 ## H,r7 ## E; \ - movzbl r1 ## L,r1 ## E; \ - xorl TAB+1024(,r7,4),r6 ## E;\ - xorl TAB(,r1,4),r3 ## E; \ + xorl TAB+3072(,r3,4),r6 ## E;\ + movl TAB+2048(,r7,4),r3 ## E;\ + movzbl r1 ## L,r7 ## E; \ + movzbl r1 ## H,r1 ## E; \ + xorl TAB+1024(,r1,4),r6 ## E;\ + xorl TAB(,r7,4),r3 ## E; \ movzbl r2 ## H,r1 ## E; \ movzbl r2 ## L,r7 ## E; \ shrl $16,r2 ## E; \ @@ -131,9 +128,9 @@ movl r4 ## E,r2 ## E; #define entry(FUNC,KEY,B128,B192) \ - prologue(FUNC,KEY,B128,B192,R2,R8,R7,R9,R1,R3,R4,R6,R10,R5,R11) + prologue(FUNC,KEY,B128,B192,R2,R8,R1,R3,R4,R6,R10,R5,R11) -#define return(FUNC) 
epilogue(FUNC,R8,R2,R9,R7,R5,R6,R3,R4,R11) +#define return(FUNC) epilogue(FUNC,R8,R2,R5,R6,R3,R4,R11) #define encrypt_round(TAB,OFFSET) \ round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4) \ diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 3c465184ff8a..16627fec80b2 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -89,6 +89,29 @@ SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 ALL_F: .octa 0xffffffffffffffffffffffffffffffff .octa 0x00000000000000000000000000000000 +.section .rodata +.align 16 +.type aad_shift_arr, @object +.size aad_shift_arr, 272 +aad_shift_arr: + .octa 0xffffffffffffffffffffffffffffffff + .octa 0xffffffffffffffffffffffffffffff0C + .octa 0xffffffffffffffffffffffffffff0D0C + .octa 0xffffffffffffffffffffffffff0E0D0C + .octa 0xffffffffffffffffffffffff0F0E0D0C + .octa 0xffffffffffffffffffffff0C0B0A0908 + .octa 0xffffffffffffffffffff0D0C0B0A0908 + .octa 0xffffffffffffffffff0E0D0C0B0A0908 + .octa 0xffffffffffffffff0F0E0D0C0B0A0908 + .octa 0xffffffffffffff0C0B0A090807060504 + .octa 0xffffffffffff0D0C0B0A090807060504 + .octa 0xffffffffff0E0D0C0B0A090807060504 + .octa 0xffffffff0F0E0D0C0B0A090807060504 + .octa 0xffffff0C0B0A09080706050403020100 + .octa 0xffff0D0C0B0A09080706050403020100 + .octa 0xff0E0D0C0B0A09080706050403020100 + .octa 0x0F0E0D0C0B0A09080706050403020100 + .text @@ -252,32 +275,66 @@ XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation mov arg8, %r12 # %r12 = aadLen mov %r12, %r11 pxor %xmm\i, %xmm\i + pxor \XMM2, \XMM2 -_get_AAD_loop\num_initial_blocks\operation: - movd (%r10), \TMP1 - pslldq $12, \TMP1 - psrldq $4, %xmm\i + cmp $16, %r11 + jl _get_AAD_rest8\num_initial_blocks\operation +_get_AAD_blocks\num_initial_blocks\operation: + movdqu (%r10), %xmm\i + PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data + pxor %xmm\i, \XMM2 + GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 + add $16, %r10 + sub $16, %r12 + sub $16, %r11 + cmp $16, %r11 + jge _get_AAD_blocks\num_initial_blocks\operation + + movdqu \XMM2, %xmm\i + cmp $0, %r11 + je _get_AAD_done\num_initial_blocks\operation + + pxor %xmm\i,%xmm\i + + /* read the last <16B of AAD. since we have at least 4B of + data right after the AAD (the ICV, and maybe some CT), we can + read 4B/8B blocks safely, and then get rid of the extra stuff */ +_get_AAD_rest8\num_initial_blocks\operation: + cmp $4, %r11 + jle _get_AAD_rest4\num_initial_blocks\operation + movq (%r10), \TMP1 + add $8, %r10 + sub $8, %r11 + pslldq $8, \TMP1 + psrldq $8, %xmm\i pxor \TMP1, %xmm\i + jmp _get_AAD_rest8\num_initial_blocks\operation +_get_AAD_rest4\num_initial_blocks\operation: + cmp $0, %r11 + jle _get_AAD_rest0\num_initial_blocks\operation + mov (%r10), %eax + movq %rax, \TMP1 add $4, %r10 - sub $4, %r12 - jne _get_AAD_loop\num_initial_blocks\operation - - cmp $16, %r11 - je _get_AAD_loop2_done\num_initial_blocks\operation - - mov $16, %r12 -_get_AAD_loop2\num_initial_blocks\operation: + sub $4, %r10 + pslldq $12, \TMP1 psrldq $4, %xmm\i - sub $4, %r12 - cmp %r11, %r12 - jne _get_AAD_loop2\num_initial_blocks\operation - -_get_AAD_loop2_done\num_initial_blocks\operation: + pxor \TMP1, %xmm\i +_get_AAD_rest0\num_initial_blocks\operation: + /* finalize: shift out the extra bytes we read, and align + left. 
since pslldq can only shift by an immediate, we use + vpshufb and an array of shuffle masks */ + movq %r12, %r11 + salq $4, %r11 + movdqu aad_shift_arr(%r11), \TMP1 + PSHUFB_XMM \TMP1, %xmm\i +_get_AAD_rest_final\num_initial_blocks\operation: PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data + pxor \XMM2, %xmm\i + GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 +_get_AAD_done\num_initial_blocks\operation: xor %r11, %r11 # initialise the data pointer offset as zero - - # start AES for num_initial_blocks blocks + # start AES for num_initial_blocks blocks mov %arg5, %rax # %rax = *Y0 movdqu (%rax), \XMM0 # XMM0 = Y0 @@ -322,7 +379,7 @@ aes_loop_initial_dec\num_initial_blocks: # prepare plaintext/ciphertext for GHASH computation .endr .endif - GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 + # apply GHASH on num_initial_blocks blocks .if \i == 5 @@ -477,28 +534,66 @@ XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation mov arg8, %r12 # %r12 = aadLen mov %r12, %r11 pxor %xmm\i, %xmm\i -_get_AAD_loop\num_initial_blocks\operation: - movd (%r10), \TMP1 - pslldq $12, \TMP1 - psrldq $4, %xmm\i + pxor \XMM2, \XMM2 + + cmp $16, %r11 + jl _get_AAD_rest8\num_initial_blocks\operation +_get_AAD_blocks\num_initial_blocks\operation: + movdqu (%r10), %xmm\i + PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data + pxor %xmm\i, \XMM2 + GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 + add $16, %r10 + sub $16, %r12 + sub $16, %r11 + cmp $16, %r11 + jge _get_AAD_blocks\num_initial_blocks\operation + + movdqu \XMM2, %xmm\i + cmp $0, %r11 + je _get_AAD_done\num_initial_blocks\operation + + pxor %xmm\i,%xmm\i + + /* read the last <16B of AAD. since we have at least 4B of + data right after the AAD (the ICV, and maybe some PT), we can + read 4B/8B blocks safely, and then get rid of the extra stuff */ +_get_AAD_rest8\num_initial_blocks\operation: + cmp $4, %r11 + jle _get_AAD_rest4\num_initial_blocks\operation + movq (%r10), \TMP1 + add $8, %r10 + sub $8, %r11 + pslldq $8, \TMP1 + psrldq $8, %xmm\i pxor \TMP1, %xmm\i + jmp _get_AAD_rest8\num_initial_blocks\operation +_get_AAD_rest4\num_initial_blocks\operation: + cmp $0, %r11 + jle _get_AAD_rest0\num_initial_blocks\operation + mov (%r10), %eax + movq %rax, \TMP1 add $4, %r10 - sub $4, %r12 - jne _get_AAD_loop\num_initial_blocks\operation - cmp $16, %r11 - je _get_AAD_loop2_done\num_initial_blocks\operation - mov $16, %r12 -_get_AAD_loop2\num_initial_blocks\operation: + sub $4, %r10 + pslldq $12, \TMP1 psrldq $4, %xmm\i - sub $4, %r12 - cmp %r11, %r12 - jne _get_AAD_loop2\num_initial_blocks\operation -_get_AAD_loop2_done\num_initial_blocks\operation: + pxor \TMP1, %xmm\i +_get_AAD_rest0\num_initial_blocks\operation: + /* finalize: shift out the extra bytes we read, and align + left. 
since pslldq can only shift by an immediate, we use + vpshufb and an array of shuffle masks */ + movq %r12, %r11 + salq $4, %r11 + movdqu aad_shift_arr(%r11), \TMP1 + PSHUFB_XMM \TMP1, %xmm\i +_get_AAD_rest_final\num_initial_blocks\operation: PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data + pxor \XMM2, %xmm\i + GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 +_get_AAD_done\num_initial_blocks\operation: xor %r11, %r11 # initialise the data pointer offset as zero - - # start AES for num_initial_blocks blocks + # start AES for num_initial_blocks blocks mov %arg5, %rax # %rax = *Y0 movdqu (%rax), \XMM0 # XMM0 = Y0 @@ -543,7 +638,7 @@ aes_loop_initial_enc\num_initial_blocks: # prepare plaintext/ciphertext for GHASH computation .endr .endif - GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 + # apply GHASH on num_initial_blocks blocks .if \i == 5 @@ -1454,18 +1549,35 @@ _return_T_decrypt: mov arg10, %r11 # %r11 = auth_tag_len cmp $16, %r11 je _T_16_decrypt - cmp $12, %r11 - je _T_12_decrypt + cmp $8, %r11 + jl _T_4_decrypt _T_8_decrypt: MOVQ_R64_XMM %xmm0, %rax mov %rax, (%r10) - jmp _return_T_done_decrypt -_T_12_decrypt: - MOVQ_R64_XMM %xmm0, %rax - mov %rax, (%r10) + add $8, %r10 + sub $8, %r11 psrldq $8, %xmm0 + cmp $0, %r11 + je _return_T_done_decrypt +_T_4_decrypt: + movd %xmm0, %eax + mov %eax, (%r10) + add $4, %r10 + sub $4, %r11 + psrldq $4, %xmm0 + cmp $0, %r11 + je _return_T_done_decrypt +_T_123_decrypt: movd %xmm0, %eax - mov %eax, 8(%r10) + cmp $2, %r11 + jl _T_1_decrypt + mov %ax, (%r10) + cmp $2, %r11 + je _return_T_done_decrypt + add $2, %r10 + sar $16, %eax +_T_1_decrypt: + mov %al, (%r10) jmp _return_T_done_decrypt _T_16_decrypt: movdqu %xmm0, (%r10) @@ -1718,18 +1830,35 @@ _return_T_encrypt: mov arg10, %r11 # %r11 = auth_tag_len cmp $16, %r11 je _T_16_encrypt - cmp $12, %r11 - je _T_12_encrypt + cmp $8, %r11 + jl _T_4_encrypt _T_8_encrypt: MOVQ_R64_XMM %xmm0, %rax mov %rax, (%r10) - jmp _return_T_done_encrypt -_T_12_encrypt: - MOVQ_R64_XMM %xmm0, %rax - mov %rax, (%r10) + add $8, %r10 + sub $8, %r11 psrldq $8, %xmm0 + cmp $0, %r11 + je _return_T_done_encrypt +_T_4_encrypt: + movd %xmm0, %eax + mov %eax, (%r10) + add $4, %r10 + sub $4, %r11 + psrldq $4, %xmm0 + cmp $0, %r11 + je _return_T_done_encrypt +_T_123_encrypt: movd %xmm0, %eax - mov %eax, 8(%r10) + cmp $2, %r11 + jl _T_1_encrypt + mov %ax, (%r10) + cmp $2, %r11 + je _return_T_done_encrypt + add $2, %r10 + sar $16, %eax +_T_1_encrypt: + mov %al, (%r10) jmp _return_T_done_encrypt _T_16_encrypt: movdqu %xmm0, (%r10) diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index d664382c6e56..faecb1518bf8 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -155,6 +155,30 @@ SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 ALL_F: .octa 0xffffffffffffffffffffffffffffffff .octa 0x00000000000000000000000000000000 +.section .rodata +.align 16 +.type aad_shift_arr, @object +.size aad_shift_arr, 272 +aad_shift_arr: + .octa 0xffffffffffffffffffffffffffffffff + .octa 0xffffffffffffffffffffffffffffff0C + .octa 0xffffffffffffffffffffffffffff0D0C + .octa 0xffffffffffffffffffffffffff0E0D0C + .octa 0xffffffffffffffffffffffff0F0E0D0C + .octa 0xffffffffffffffffffffff0C0B0A0908 + .octa 0xffffffffffffffffffff0D0C0B0A0908 + .octa 0xffffffffffffffffff0E0D0C0B0A0908 + .octa 0xffffffffffffffff0F0E0D0C0B0A0908 + .octa 0xffffffffffffff0C0B0A090807060504 + .octa 0xffffffffffff0D0C0B0A090807060504 + .octa 
0xffffffffff0E0D0C0B0A090807060504 + .octa 0xffffffff0F0E0D0C0B0A090807060504 + .octa 0xffffff0C0B0A09080706050403020100 + .octa 0xffff0D0C0B0A09080706050403020100 + .octa 0xff0E0D0C0B0A09080706050403020100 + .octa 0x0F0E0D0C0B0A09080706050403020100 + + .text @@ -372,41 +396,72 @@ VARIABLE_OFFSET = 16*8 .macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC i = (8-\num_initial_blocks) + j = 0 setreg - mov arg6, %r10 # r10 = AAD - mov arg7, %r12 # r12 = aadLen - - - mov %r12, %r11 - - vpxor reg_i, reg_i, reg_i -_get_AAD_loop\@: - vmovd (%r10), \T1 - vpslldq $12, \T1, \T1 - vpsrldq $4, reg_i, reg_i - vpxor \T1, reg_i, reg_i - - add $4, %r10 - sub $4, %r12 - jg _get_AAD_loop\@ - - - cmp $16, %r11 - je _get_AAD_loop2_done\@ - mov $16, %r12 - -_get_AAD_loop2\@: - vpsrldq $4, reg_i, reg_i - sub $4, %r12 - cmp %r11, %r12 - jg _get_AAD_loop2\@ - -_get_AAD_loop2_done\@: - - #byte-reflect the AAD data - vpshufb SHUF_MASK(%rip), reg_i, reg_i - + mov arg6, %r10 # r10 = AAD + mov arg7, %r12 # r12 = aadLen + + + mov %r12, %r11 + + vpxor reg_j, reg_j, reg_j + vpxor reg_i, reg_i, reg_i + cmp $16, %r11 + jl _get_AAD_rest8\@ +_get_AAD_blocks\@: + vmovdqu (%r10), reg_i + vpshufb SHUF_MASK(%rip), reg_i, reg_i + vpxor reg_i, reg_j, reg_j + GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 + add $16, %r10 + sub $16, %r12 + sub $16, %r11 + cmp $16, %r11 + jge _get_AAD_blocks\@ + vmovdqu reg_j, reg_i + cmp $0, %r11 + je _get_AAD_done\@ + + vpxor reg_i, reg_i, reg_i + + /* read the last <16B of AAD. since we have at least 4B of + data right after the AAD (the ICV, and maybe some CT), we can + read 4B/8B blocks safely, and then get rid of the extra stuff */ +_get_AAD_rest8\@: + cmp $4, %r11 + jle _get_AAD_rest4\@ + movq (%r10), \T1 + add $8, %r10 + sub $8, %r11 + vpslldq $8, \T1, \T1 + vpsrldq $8, reg_i, reg_i + vpxor \T1, reg_i, reg_i + jmp _get_AAD_rest8\@ +_get_AAD_rest4\@: + cmp $0, %r11 + jle _get_AAD_rest0\@ + mov (%r10), %eax + movq %rax, \T1 + add $4, %r10 + sub $4, %r11 + vpslldq $12, \T1, \T1 + vpsrldq $4, reg_i, reg_i + vpxor \T1, reg_i, reg_i +_get_AAD_rest0\@: + /* finalize: shift out the extra bytes we read, and align + left. 
since pslldq can only shift by an immediate, we use + vpshufb and an array of shuffle masks */ + movq %r12, %r11 + salq $4, %r11 + movdqu aad_shift_arr(%r11), \T1 + vpshufb \T1, reg_i, reg_i +_get_AAD_rest_final\@: + vpshufb SHUF_MASK(%rip), reg_i, reg_i + vpxor reg_j, reg_i, reg_i + GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6 + +_get_AAD_done\@: # initialize the data pointer offset as zero xor %r11, %r11 @@ -480,7 +535,6 @@ _get_AAD_loop2_done\@: i = (8-\num_initial_blocks) j = (9-\num_initial_blocks) setreg - GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6 .rep \num_initial_blocks vpxor reg_i, reg_j, reg_j @@ -1427,19 +1481,36 @@ _return_T\@: cmp $16, %r11 je _T_16\@ - cmp $12, %r11 - je _T_12\@ + cmp $8, %r11 + jl _T_4\@ _T_8\@: vmovq %xmm9, %rax mov %rax, (%r10) - jmp _return_T_done\@ -_T_12\@: - vmovq %xmm9, %rax - mov %rax, (%r10) + add $8, %r10 + sub $8, %r11 vpsrldq $8, %xmm9, %xmm9 + cmp $0, %r11 + je _return_T_done\@ +_T_4\@: vmovd %xmm9, %eax - mov %eax, 8(%r10) + mov %eax, (%r10) + add $4, %r10 + sub $4, %r11 + vpsrldq $4, %xmm9, %xmm9 + cmp $0, %r11 + je _return_T_done\@ +_T_123\@: + vmovd %xmm9, %eax + cmp $2, %r11 + jl _T_1\@ + mov %ax, (%r10) + cmp $2, %r11 + je _return_T_done\@ + add $2, %r10 + sar $16, %eax +_T_1\@: + mov %al, (%r10) jmp _return_T_done\@ _T_16\@: @@ -1631,41 +1702,73 @@ ENDPROC(aesni_gcm_dec_avx_gen2) .macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER i = (8-\num_initial_blocks) + j = 0 setreg - mov arg6, %r10 # r10 = AAD - mov arg7, %r12 # r12 = aadLen - - - mov %r12, %r11 - - vpxor reg_i, reg_i, reg_i -_get_AAD_loop\@: - vmovd (%r10), \T1 - vpslldq $12, \T1, \T1 - vpsrldq $4, reg_i, reg_i - vpxor \T1, reg_i, reg_i - - add $4, %r10 - sub $4, %r12 - jg _get_AAD_loop\@ - - - cmp $16, %r11 - je _get_AAD_loop2_done\@ - mov $16, %r12 - -_get_AAD_loop2\@: - vpsrldq $4, reg_i, reg_i - sub $4, %r12 - cmp %r11, %r12 - jg _get_AAD_loop2\@ - -_get_AAD_loop2_done\@: - - #byte-reflect the AAD data - vpshufb SHUF_MASK(%rip), reg_i, reg_i - + mov arg6, %r10 # r10 = AAD + mov arg7, %r12 # r12 = aadLen + + + mov %r12, %r11 + + vpxor reg_j, reg_j, reg_j + vpxor reg_i, reg_i, reg_i + + cmp $16, %r11 + jl _get_AAD_rest8\@ +_get_AAD_blocks\@: + vmovdqu (%r10), reg_i + vpshufb SHUF_MASK(%rip), reg_i, reg_i + vpxor reg_i, reg_j, reg_j + GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 + add $16, %r10 + sub $16, %r12 + sub $16, %r11 + cmp $16, %r11 + jge _get_AAD_blocks\@ + vmovdqu reg_j, reg_i + cmp $0, %r11 + je _get_AAD_done\@ + + vpxor reg_i, reg_i, reg_i + + /* read the last <16B of AAD. since we have at least 4B of + data right after the AAD (the ICV, and maybe some CT), we can + read 4B/8B blocks safely, and then get rid of the extra stuff */ +_get_AAD_rest8\@: + cmp $4, %r11 + jle _get_AAD_rest4\@ + movq (%r10), \T1 + add $8, %r10 + sub $8, %r11 + vpslldq $8, \T1, \T1 + vpsrldq $8, reg_i, reg_i + vpxor \T1, reg_i, reg_i + jmp _get_AAD_rest8\@ +_get_AAD_rest4\@: + cmp $0, %r11 + jle _get_AAD_rest0\@ + mov (%r10), %eax + movq %rax, \T1 + add $4, %r10 + sub $4, %r11 + vpslldq $12, \T1, \T1 + vpsrldq $4, reg_i, reg_i + vpxor \T1, reg_i, reg_i +_get_AAD_rest0\@: + /* finalize: shift out the extra bytes we read, and align + left. 
since pslldq can only shift by an immediate, we use + vpshufb and an array of shuffle masks */ + movq %r12, %r11 + salq $4, %r11 + movdqu aad_shift_arr(%r11), \T1 + vpshufb \T1, reg_i, reg_i +_get_AAD_rest_final\@: + vpshufb SHUF_MASK(%rip), reg_i, reg_i + vpxor reg_j, reg_i, reg_i + GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6 + +_get_AAD_done\@: # initialize the data pointer offset as zero xor %r11, %r11 @@ -1740,7 +1843,6 @@ _get_AAD_loop2_done\@: i = (8-\num_initial_blocks) j = (9-\num_initial_blocks) setreg - GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6 .rep \num_initial_blocks vpxor reg_i, reg_j, reg_j @@ -2702,19 +2804,36 @@ _return_T\@: cmp $16, %r11 je _T_16\@ - cmp $12, %r11 - je _T_12\@ + cmp $8, %r11 + jl _T_4\@ _T_8\@: vmovq %xmm9, %rax mov %rax, (%r10) - jmp _return_T_done\@ -_T_12\@: - vmovq %xmm9, %rax - mov %rax, (%r10) + add $8, %r10 + sub $8, %r11 vpsrldq $8, %xmm9, %xmm9 + cmp $0, %r11 + je _return_T_done\@ +_T_4\@: vmovd %xmm9, %eax - mov %eax, 8(%r10) + mov %eax, (%r10) + add $4, %r10 + sub $4, %r11 + vpsrldq $4, %xmm9, %xmm9 + cmp $0, %r11 + je _return_T_done\@ +_T_123\@: + vmovd %xmm9, %eax + cmp $2, %r11 + jl _T_1\@ + mov %ax, (%r10) + cmp $2, %r11 + je _return_T_done\@ + add $2, %r10 + sar $16, %eax +_T_1\@: + mov %al, (%r10) jmp _return_T_done\@ _T_16\@: diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 93de8ea51548..5c15d6b57329 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -61,6 +61,11 @@ struct aesni_rfc4106_gcm_ctx { u8 nonce[4]; }; +struct generic_gcmaes_ctx { + u8 hash_subkey[16] AESNI_ALIGN_ATTR; + struct crypto_aes_ctx aes_key_expanded AESNI_ALIGN_ATTR; +}; + struct aesni_xts_ctx { u8 raw_tweak_ctx[sizeof(struct crypto_aes_ctx)] AESNI_ALIGN_ATTR; u8 raw_crypt_ctx[sizeof(struct crypto_aes_ctx)] AESNI_ALIGN_ATTR; @@ -102,13 +107,11 @@ asmlinkage void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *out, * u8 *out, Ciphertext output. Encrypt in-place is allowed. * const u8 *in, Plaintext input * unsigned long plaintext_len, Length of data in bytes for encryption. - * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association) - * concatenated with 8 byte Initialisation Vector (from IPSec ESP - * Payload) concatenated with 0x00000001. 16-byte aligned pointer. + * u8 *iv, Pre-counter block j0: 12 byte IV concatenated with 0x00000001. + * 16-byte aligned pointer. * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. * const u8 *aad, Additional Authentication Data (AAD) - * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this - * is going to be 8 or 12 bytes + * unsigned long aad_len, Length of AAD in bytes. * u8 *auth_tag, Authenticated Tag output. * unsigned long auth_tag_len), Authenticated Tag Length in bytes. * Valid values are 16 (most likely), 12 or 8. @@ -123,9 +126,8 @@ asmlinkage void aesni_gcm_enc(void *ctx, u8 *out, * u8 *out, Plaintext output. Decrypt in-place is allowed. * const u8 *in, Ciphertext input * unsigned long ciphertext_len, Length of data in bytes for decryption. - * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association) - * concatenated with 8 byte Initialisation Vector (from IPSec ESP - * Payload) concatenated with 0x00000001. 16-byte aligned pointer. + * u8 *iv, Pre-counter block j0: 12 byte IV concatenated with 0x00000001. + * 16-byte aligned pointer. * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. 
* const u8 *aad, Additional Authentication Data (AAD) * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this is going @@ -275,6 +277,16 @@ aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm) align = 1; return PTR_ALIGN(crypto_aead_ctx(tfm), align); } + +static inline struct +generic_gcmaes_ctx *generic_gcmaes_ctx_get(struct crypto_aead *tfm) +{ + unsigned long align = AESNI_ALIGN; + + if (align <= crypto_tfm_ctx_alignment()) + align = 1; + return PTR_ALIGN(crypto_aead_ctx(tfm), align); +} #endif static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) @@ -463,8 +475,8 @@ static void ctr_crypt_final(struct crypto_aes_ctx *ctx, unsigned int nbytes = walk->nbytes; aesni_enc(ctx, keystream, ctrblk); - crypto_xor(keystream, src, nbytes); - memcpy(dst, keystream, nbytes); + crypto_xor_cpy(dst, keystream, src, nbytes); + crypto_inc(ctrblk, AES_BLOCK_SIZE); } @@ -712,32 +724,34 @@ static int rfc4106_set_authsize(struct crypto_aead *parent, return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); } -static int helper_rfc4106_encrypt(struct aead_request *req) +static int generic_gcmaes_set_authsize(struct crypto_aead *tfm, + unsigned int authsize) +{ + switch (authsize) { + case 4: + case 8: + case 12: + case 13: + case 14: + case 15: + case 16: + break; + default: + return -EINVAL; + } + + return 0; +} + +static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen, + u8 *hash_subkey, u8 *iv, void *aes_ctx) { u8 one_entry_in_sg = 0; u8 *src, *dst, *assoc; - __be32 counter = cpu_to_be32(1); struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); - void *aes_ctx = &(ctx->aes_key_expanded); unsigned long auth_tag_len = crypto_aead_authsize(tfm); - u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); struct scatter_walk src_sg_walk; struct scatter_walk dst_sg_walk = {}; - unsigned int i; - - /* Assuming we are supporting rfc4106 64-bit extended */ - /* sequence numbers We need to have the AAD length equal */ - /* to 16 or 20 bytes */ - if (unlikely(req->assoclen != 16 && req->assoclen != 20)) - return -EINVAL; - - /* IV below built */ - for (i = 0; i < 4; i++) - *(iv+i) = ctx->nonce[i]; - for (i = 0; i < 8; i++) - *(iv+4+i) = req->iv[i]; - *((__be32 *)(iv+12)) = counter; if (sg_is_last(req->src) && (!PageHighMem(sg_page(req->src)) || @@ -768,7 +782,7 @@ static int helper_rfc4106_encrypt(struct aead_request *req) kernel_fpu_begin(); aesni_gcm_enc_tfm(aes_ctx, dst, src, req->cryptlen, iv, - ctx->hash_subkey, assoc, req->assoclen - 8, + hash_subkey, assoc, assoclen, dst + req->cryptlen, auth_tag_len); kernel_fpu_end(); @@ -791,37 +805,20 @@ static int helper_rfc4106_encrypt(struct aead_request *req) return 0; } -static int helper_rfc4106_decrypt(struct aead_request *req) +static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen, + u8 *hash_subkey, u8 *iv, void *aes_ctx) { u8 one_entry_in_sg = 0; u8 *src, *dst, *assoc; unsigned long tempCipherLen = 0; - __be32 counter = cpu_to_be32(1); - int retval = 0; struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); - void *aes_ctx = &(ctx->aes_key_expanded); unsigned long auth_tag_len = crypto_aead_authsize(tfm); - u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); u8 authTag[16]; struct scatter_walk src_sg_walk; struct scatter_walk dst_sg_walk = {}; - unsigned int i; - - if (unlikely(req->assoclen != 16 && req->assoclen != 20)) - return -EINVAL; - - /* Assuming 
we are supporting rfc4106 64-bit extended */ - /* sequence numbers We need to have the AAD length */ - /* equal to 16 or 20 bytes */ + int retval = 0; tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len); - /* IV below built */ - for (i = 0; i < 4; i++) - *(iv+i) = ctx->nonce[i]; - for (i = 0; i < 8; i++) - *(iv+4+i) = req->iv[i]; - *((__be32 *)(iv+12)) = counter; if (sg_is_last(req->src) && (!PageHighMem(sg_page(req->src)) || @@ -838,7 +835,6 @@ static int helper_rfc4106_decrypt(struct aead_request *req) scatterwalk_start(&dst_sg_walk, req->dst); dst = scatterwalk_map(&dst_sg_walk) + req->assoclen; } - } else { /* Allocate memory for src, dst, assoc */ assoc = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC); @@ -850,9 +846,10 @@ static int helper_rfc4106_decrypt(struct aead_request *req) dst = src; } + kernel_fpu_begin(); aesni_gcm_dec_tfm(aes_ctx, dst, src, tempCipherLen, iv, - ctx->hash_subkey, assoc, req->assoclen - 8, + hash_subkey, assoc, assoclen, authTag, auth_tag_len); kernel_fpu_end(); @@ -875,6 +872,60 @@ static int helper_rfc4106_decrypt(struct aead_request *req) kfree(assoc); } return retval; + +} + +static int helper_rfc4106_encrypt(struct aead_request *req) +{ + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); + void *aes_ctx = &(ctx->aes_key_expanded); + u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); + unsigned int i; + __be32 counter = cpu_to_be32(1); + + /* Assuming we are supporting rfc4106 64-bit extended */ + /* sequence numbers We need to have the AAD length equal */ + /* to 16 or 20 bytes */ + if (unlikely(req->assoclen != 16 && req->assoclen != 20)) + return -EINVAL; + + /* IV below built */ + for (i = 0; i < 4; i++) + *(iv+i) = ctx->nonce[i]; + for (i = 0; i < 8; i++) + *(iv+4+i) = req->iv[i]; + *((__be32 *)(iv+12)) = counter; + + return gcmaes_encrypt(req, req->assoclen - 8, ctx->hash_subkey, iv, + aes_ctx); +} + +static int helper_rfc4106_decrypt(struct aead_request *req) +{ + __be32 counter = cpu_to_be32(1); + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); + void *aes_ctx = &(ctx->aes_key_expanded); + u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); + unsigned int i; + + if (unlikely(req->assoclen != 16 && req->assoclen != 20)) + return -EINVAL; + + /* Assuming we are supporting rfc4106 64-bit extended */ + /* sequence numbers We need to have the AAD length */ + /* equal to 16 or 20 bytes */ + + /* IV below built */ + for (i = 0; i < 4; i++) + *(iv+i) = ctx->nonce[i]; + for (i = 0; i < 8; i++) + *(iv+4+i) = req->iv[i]; + *((__be32 *)(iv+12)) = counter; + + return gcmaes_decrypt(req, req->assoclen - 8, ctx->hash_subkey, iv, + aes_ctx); } static int rfc4106_encrypt(struct aead_request *req) @@ -1035,6 +1086,46 @@ struct { }; #ifdef CONFIG_X86_64 +static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key, + unsigned int key_len) +{ + struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(aead); + + return aes_set_key_common(crypto_aead_tfm(aead), + &ctx->aes_key_expanded, key, key_len) ?: + rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len); +} + +static int generic_gcmaes_encrypt(struct aead_request *req) +{ + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm); + void *aes_ctx = &(ctx->aes_key_expanded); + u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); + __be32 counter = cpu_to_be32(1); + + memcpy(iv, 
req->iv, 12); + *((__be32 *)(iv+12)) = counter; + + return gcmaes_encrypt(req, req->assoclen, ctx->hash_subkey, iv, + aes_ctx); +} + +static int generic_gcmaes_decrypt(struct aead_request *req) +{ + __be32 counter = cpu_to_be32(1); + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); + void *aes_ctx = &(ctx->aes_key_expanded); + u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); + + memcpy(iv, req->iv, 12); + *((__be32 *)(iv+12)) = counter; + + return gcmaes_decrypt(req, req->assoclen, ctx->hash_subkey, iv, + aes_ctx); +} + static struct aead_alg aesni_aead_algs[] = { { .setkey = common_rfc4106_set_key, .setauthsize = common_rfc4106_set_authsize, @@ -1069,6 +1160,23 @@ static struct aead_alg aesni_aead_algs[] = { { .cra_ctxsize = sizeof(struct cryptd_aead *), .cra_module = THIS_MODULE, }, +}, { + .setkey = generic_gcmaes_set_key, + .setauthsize = generic_gcmaes_set_authsize, + .encrypt = generic_gcmaes_encrypt, + .decrypt = generic_gcmaes_decrypt, + .ivsize = 12, + .maxauthsize = 16, + .base = { + .cra_name = "gcm(aes)", + .cra_driver_name = "generic-gcm-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct generic_gcmaes_ctx), + .cra_alignmask = AESNI_ALIGN - 1, + .cra_module = THIS_MODULE, + }, } }; #else static struct aead_alg aesni_aead_algs[0]; diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S index 246c67006ed0..8c1fcb6bad21 100644 --- a/arch/x86/crypto/blowfish-x86_64-asm_64.S +++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S @@ -33,7 +33,7 @@ #define s3 ((16 + 2 + (3 * 256)) * 4) /* register macros */ -#define CTX %rdi +#define CTX %r12 #define RIO %rsi #define RX0 %rax @@ -56,12 +56,12 @@ #define RX2bh %ch #define RX3bh %dh -#define RT0 %rbp +#define RT0 %rdi #define RT1 %rsi #define RT2 %r8 #define RT3 %r9 -#define RT0d %ebp +#define RT0d %edi #define RT1d %esi #define RT2d %r8d #define RT3d %r9d @@ -120,13 +120,14 @@ ENTRY(__blowfish_enc_blk) /* input: - * %rdi: ctx, CTX + * %rdi: ctx * %rsi: dst * %rdx: src * %rcx: bool, if true: xor output */ - movq %rbp, %r11; + movq %r12, %r11; + movq %rdi, CTX; movq %rsi, %r10; movq %rdx, RIO; @@ -142,7 +143,7 @@ ENTRY(__blowfish_enc_blk) round_enc(14); add_roundkey_enc(16); - movq %r11, %rbp; + movq %r11, %r12; movq %r10, RIO; test %cl, %cl; @@ -157,12 +158,13 @@ ENDPROC(__blowfish_enc_blk) ENTRY(blowfish_dec_blk) /* input: - * %rdi: ctx, CTX + * %rdi: ctx * %rsi: dst * %rdx: src */ - movq %rbp, %r11; + movq %r12, %r11; + movq %rdi, CTX; movq %rsi, %r10; movq %rdx, RIO; @@ -181,7 +183,7 @@ ENTRY(blowfish_dec_blk) movq %r10, RIO; write_block(); - movq %r11, %rbp; + movq %r11, %r12; ret; ENDPROC(blowfish_dec_blk) @@ -298,20 +300,21 @@ ENDPROC(blowfish_dec_blk) ENTRY(__blowfish_enc_blk_4way) /* input: - * %rdi: ctx, CTX + * %rdi: ctx * %rsi: dst * %rdx: src * %rcx: bool, if true: xor output */ - pushq %rbp; + pushq %r12; pushq %rbx; pushq %rcx; - preload_roundkey_enc(0); - + movq %rdi, CTX movq %rsi, %r11; movq %rdx, RIO; + preload_roundkey_enc(0); + read_block4(); round_enc4(0); @@ -324,39 +327,40 @@ ENTRY(__blowfish_enc_blk_4way) round_enc4(14); add_preloaded_roundkey4(); - popq %rbp; + popq %r12; movq %r11, RIO; - test %bpl, %bpl; + test %r12b, %r12b; jnz .L__enc_xor4; write_block4(); popq %rbx; - popq %rbp; + popq %r12; ret; .L__enc_xor4: xor_block4(); popq %rbx; - popq %rbp; + popq %r12; ret; ENDPROC(__blowfish_enc_blk_4way) ENTRY(blowfish_dec_blk_4way) /* 
input: - * %rdi: ctx, CTX + * %rdi: ctx * %rsi: dst * %rdx: src */ - pushq %rbp; + pushq %r12; pushq %rbx; - preload_roundkey_dec(17); - movq %rsi, %r11; + movq %rdi, CTX; + movq %rsi, %r11 movq %rdx, RIO; + preload_roundkey_dec(17); read_block4(); round_dec4(17); @@ -373,7 +377,7 @@ ENTRY(blowfish_dec_blk_4way) write_block4(); popq %rbx; - popq %rbp; + popq %r12; ret; ENDPROC(blowfish_dec_blk_4way) diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index 17c05531dfd1..f9eca34301e2 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c @@ -271,8 +271,7 @@ static void ctr_crypt_final(struct bf_ctx *ctx, struct blkcipher_walk *walk) unsigned int nbytes = walk->nbytes; blowfish_enc_blk(ctx, keystream, ctrblk); - crypto_xor(keystream, src, nbytes); - memcpy(dst, keystream, nbytes); + crypto_xor_cpy(dst, keystream, src, nbytes); crypto_inc(ctrblk, BF_BLOCK_SIZE); } diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S index 310319c601ed..95ba6956a7f6 100644 --- a/arch/x86/crypto/camellia-x86_64-asm_64.S +++ b/arch/x86/crypto/camellia-x86_64-asm_64.S @@ -75,17 +75,17 @@ #define RCD1bh %dh #define RT0 %rsi -#define RT1 %rbp +#define RT1 %r12 #define RT2 %r8 #define RT0d %esi -#define RT1d %ebp +#define RT1d %r12d #define RT2d %r8d #define RT2bl %r8b #define RXOR %r9 -#define RRBP %r10 +#define RR12 %r10 #define RDST %r11 #define RXORd %r9d @@ -197,7 +197,7 @@ ENTRY(__camellia_enc_blk) * %rdx: src * %rcx: bool xor */ - movq %rbp, RRBP; + movq %r12, RR12; movq %rcx, RXOR; movq %rsi, RDST; @@ -227,13 +227,13 @@ ENTRY(__camellia_enc_blk) enc_outunpack(mov, RT1); - movq RRBP, %rbp; + movq RR12, %r12; ret; .L__enc_xor: enc_outunpack(xor, RT1); - movq RRBP, %rbp; + movq RR12, %r12; ret; ENDPROC(__camellia_enc_blk) @@ -248,7 +248,7 @@ ENTRY(camellia_dec_blk) movl $24, RXORd; cmovel RXORd, RT2d; /* max */ - movq %rbp, RRBP; + movq %r12, RR12; movq %rsi, RDST; movq %rdx, RIO; @@ -271,7 +271,7 @@ ENTRY(camellia_dec_blk) dec_outunpack(); - movq RRBP, %rbp; + movq RR12, %r12; ret; ENDPROC(camellia_dec_blk) @@ -433,7 +433,7 @@ ENTRY(__camellia_enc_blk_2way) */ pushq %rbx; - movq %rbp, RRBP; + movq %r12, RR12; movq %rcx, RXOR; movq %rsi, RDST; movq %rdx, RIO; @@ -461,14 +461,14 @@ ENTRY(__camellia_enc_blk_2way) enc_outunpack2(mov, RT2); - movq RRBP, %rbp; + movq RR12, %r12; popq %rbx; ret; .L__enc2_xor: enc_outunpack2(xor, RT2); - movq RRBP, %rbp; + movq RR12, %r12; popq %rbx; ret; ENDPROC(__camellia_enc_blk_2way) @@ -485,7 +485,7 @@ ENTRY(camellia_dec_blk_2way) cmovel RXORd, RT2d; /* max */ movq %rbx, RXOR; - movq %rbp, RRBP; + movq %r12, RR12; movq %rsi, RDST; movq %rdx, RIO; @@ -508,7 +508,7 @@ ENTRY(camellia_dec_blk_2way) dec_outunpack2(); - movq RRBP, %rbp; + movq RR12, %r12; movq RXOR, %rbx; ret; ENDPROC(camellia_dec_blk_2way) diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S index b4a8806234ea..86107c961bb4 100644 --- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S @@ -47,7 +47,7 @@ /********************************************************************** 16-way AVX cast5 **********************************************************************/ -#define CTX %rdi +#define CTX %r15 #define RL1 %xmm0 #define RR1 %xmm1 @@ -70,8 +70,8 @@ #define RTMP %xmm15 -#define RID1 %rbp -#define RID1d %ebp +#define RID1 %rdi +#define RID1d %edi #define RID2 %rsi #define RID2d %esi @@ -226,7 +226,7 @@ .align 16 __cast5_enc_blk16: /* 
input: - * %rdi: ctx, CTX + * %rdi: ctx * RL1: blocks 1 and 2 * RR1: blocks 3 and 4 * RL2: blocks 5 and 6 @@ -246,9 +246,11 @@ __cast5_enc_blk16: * RR4: encrypted blocks 15 and 16 */ - pushq %rbp; + pushq %r15; pushq %rbx; + movq %rdi, CTX; + vmovdqa .Lbswap_mask, RKM; vmovd .Lfirst_mask, R1ST; vmovd .L32_mask, R32; @@ -283,7 +285,7 @@ __cast5_enc_blk16: .L__skip_enc: popq %rbx; - popq %rbp; + popq %r15; vmovdqa .Lbswap_mask, RKM; @@ -298,7 +300,7 @@ ENDPROC(__cast5_enc_blk16) .align 16 __cast5_dec_blk16: /* input: - * %rdi: ctx, CTX + * %rdi: ctx * RL1: encrypted blocks 1 and 2 * RR1: encrypted blocks 3 and 4 * RL2: encrypted blocks 5 and 6 @@ -318,9 +320,11 @@ __cast5_dec_blk16: * RR4: decrypted blocks 15 and 16 */ - pushq %rbp; + pushq %r15; pushq %rbx; + movq %rdi, CTX; + vmovdqa .Lbswap_mask, RKM; vmovd .Lfirst_mask, R1ST; vmovd .L32_mask, R32; @@ -356,7 +360,7 @@ __cast5_dec_blk16: vmovdqa .Lbswap_mask, RKM; popq %rbx; - popq %rbp; + popq %r15; outunpack_blocks(RR1, RL1, RTMP, RX, RKM); outunpack_blocks(RR2, RL2, RTMP, RX, RKM); @@ -372,12 +376,14 @@ ENDPROC(__cast5_dec_blk16) ENTRY(cast5_ecb_enc_16way) /* input: - * %rdi: ctx, CTX + * %rdi: ctx * %rsi: dst * %rdx: src */ FRAME_BEGIN + pushq %r15; + movq %rdi, CTX; movq %rsi, %r11; vmovdqu (0*4*4)(%rdx), RL1; @@ -400,18 +406,22 @@ ENTRY(cast5_ecb_enc_16way) vmovdqu RR4, (6*4*4)(%r11); vmovdqu RL4, (7*4*4)(%r11); + popq %r15; FRAME_END ret; ENDPROC(cast5_ecb_enc_16way) ENTRY(cast5_ecb_dec_16way) /* input: - * %rdi: ctx, CTX + * %rdi: ctx * %rsi: dst * %rdx: src */ FRAME_BEGIN + pushq %r15; + + movq %rdi, CTX; movq %rsi, %r11; vmovdqu (0*4*4)(%rdx), RL1; @@ -434,20 +444,22 @@ ENTRY(cast5_ecb_dec_16way) vmovdqu RR4, (6*4*4)(%r11); vmovdqu RL4, (7*4*4)(%r11); + popq %r15; FRAME_END ret; ENDPROC(cast5_ecb_dec_16way) ENTRY(cast5_cbc_dec_16way) /* input: - * %rdi: ctx, CTX + * %rdi: ctx * %rsi: dst * %rdx: src */ FRAME_BEGIN - pushq %r12; + pushq %r15; + movq %rdi, CTX; movq %rsi, %r11; movq %rdx, %r12; @@ -483,23 +495,24 @@ ENTRY(cast5_cbc_dec_16way) vmovdqu RR4, (6*16)(%r11); vmovdqu RL4, (7*16)(%r11); + popq %r15; popq %r12; - FRAME_END ret; ENDPROC(cast5_cbc_dec_16way) ENTRY(cast5_ctr_16way) /* input: - * %rdi: ctx, CTX + * %rdi: ctx * %rsi: dst * %rdx: src * %rcx: iv (big endian, 64bit) */ FRAME_BEGIN - pushq %r12; + pushq %r15; + movq %rdi, CTX; movq %rsi, %r11; movq %rdx, %r12; @@ -558,8 +571,8 @@ ENTRY(cast5_ctr_16way) vmovdqu RR4, (6*16)(%r11); vmovdqu RL4, (7*16)(%r11); + popq %r15; popq %r12; - FRAME_END ret; ENDPROC(cast5_ctr_16way) diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c index 8648158f3916..dbea6020ffe7 100644 --- a/arch/x86/crypto/cast5_avx_glue.c +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -256,8 +256,7 @@ static void ctr_crypt_final(struct blkcipher_desc *desc, unsigned int nbytes = walk->nbytes; __cast5_encrypt(ctx, keystream, ctrblk); - crypto_xor(keystream, src, nbytes); - memcpy(dst, keystream, nbytes); + crypto_xor_cpy(dst, keystream, src, nbytes); crypto_inc(ctrblk, CAST5_BLOCK_SIZE); } diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S index 952d3156a933..7f30b6f0d72c 100644 --- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S @@ -47,7 +47,7 @@ /********************************************************************** 8-way AVX cast6 **********************************************************************/ -#define CTX %rdi +#define CTX %r15 #define RA1 %xmm0 #define RB1 %xmm1 @@ -70,8 +70,8 @@ 
#define RTMP %xmm15 -#define RID1 %rbp -#define RID1d %ebp +#define RID1 %rdi +#define RID1d %edi #define RID2 %rsi #define RID2d %esi @@ -264,15 +264,17 @@ .align 8 __cast6_enc_blk8: /* input: - * %rdi: ctx, CTX + * %rdi: ctx * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks * output: * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks */ - pushq %rbp; + pushq %r15; pushq %rbx; + movq %rdi, CTX; + vmovdqa .Lbswap_mask, RKM; vmovd .Lfirst_mask, R1ST; vmovd .L32_mask, R32; @@ -297,7 +299,7 @@ __cast6_enc_blk8: QBAR(11); popq %rbx; - popq %rbp; + popq %r15; vmovdqa .Lbswap_mask, RKM; @@ -310,15 +312,17 @@ ENDPROC(__cast6_enc_blk8) .align 8 __cast6_dec_blk8: /* input: - * %rdi: ctx, CTX + * %rdi: ctx * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks * output: * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks */ - pushq %rbp; + pushq %r15; pushq %rbx; + movq %rdi, CTX; + vmovdqa .Lbswap_mask, RKM; vmovd .Lfirst_mask, R1ST; vmovd .L32_mask, R32; @@ -343,7 +347,7 @@ __cast6_dec_blk8: QBAR(0); popq %rbx; - popq %rbp; + popq %r15; vmovdqa .Lbswap_mask, RKM; outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); @@ -354,12 +358,14 @@ ENDPROC(__cast6_dec_blk8) ENTRY(cast6_ecb_enc_8way) /* input: - * %rdi: ctx, CTX + * %rdi: ctx * %rsi: dst * %rdx: src */ FRAME_BEGIN + pushq %r15; + movq %rdi, CTX; movq %rsi, %r11; load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); @@ -368,18 +374,21 @@ ENTRY(cast6_ecb_enc_8way) store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + popq %r15; FRAME_END ret; ENDPROC(cast6_ecb_enc_8way) ENTRY(cast6_ecb_dec_8way) /* input: - * %rdi: ctx, CTX + * %rdi: ctx * %rsi: dst * %rdx: src */ FRAME_BEGIN + pushq %r15; + movq %rdi, CTX; movq %rsi, %r11; load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); @@ -388,20 +397,22 @@ ENTRY(cast6_ecb_dec_8way) store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + popq %r15; FRAME_END ret; ENDPROC(cast6_ecb_dec_8way) ENTRY(cast6_cbc_dec_8way) /* input: - * %rdi: ctx, CTX + * %rdi: ctx * %rsi: dst * %rdx: src */ FRAME_BEGIN - pushq %r12; + pushq %r15; + movq %rdi, CTX; movq %rsi, %r11; movq %rdx, %r12; @@ -411,8 +422,8 @@ ENTRY(cast6_cbc_dec_8way) store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + popq %r15; popq %r12; - FRAME_END ret; ENDPROC(cast6_cbc_dec_8way) @@ -425,9 +436,10 @@ ENTRY(cast6_ctr_8way) * %rcx: iv (little endian, 128bit) */ FRAME_BEGIN - pushq %r12; + pushq %r15 + movq %rdi, CTX; movq %rsi, %r11; movq %rdx, %r12; @@ -438,8 +450,8 @@ ENTRY(cast6_ctr_8way) store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + popq %r15; popq %r12; - FRAME_END ret; ENDPROC(cast6_ctr_8way) @@ -452,7 +464,9 @@ ENTRY(cast6_xts_enc_8way) * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) */ FRAME_BEGIN + pushq %r15; + movq %rdi, CTX movq %rsi, %r11; /* regs <= src, dst <= IVs, regs <= regs xor IVs */ @@ -464,6 +478,7 @@ ENTRY(cast6_xts_enc_8way) /* dst <= regs xor IVs(in dst) */ store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + popq %r15; FRAME_END ret; ENDPROC(cast6_xts_enc_8way) @@ -476,7 +491,9 @@ ENTRY(cast6_xts_dec_8way) * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) */ FRAME_BEGIN + pushq %r15; + movq %rdi, CTX movq %rsi, %r11; /* regs <= src, dst <= IVs, regs <= regs xor IVs */ @@ -488,6 +505,7 @@ ENTRY(cast6_xts_dec_8way) /* dst <= regs xor IVs(in dst) */ store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + popq %r15; FRAME_END ret; ENDPROC(cast6_xts_dec_8way) diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S b/arch/x86/crypto/chacha20-avx2-x86_64.S index 
3a2dc3dc6cac..f3cd26f48332 100644 --- a/arch/x86/crypto/chacha20-avx2-x86_64.S +++ b/arch/x86/crypto/chacha20-avx2-x86_64.S @@ -45,7 +45,7 @@ ENTRY(chacha20_8block_xor_avx2) vzeroupper # 4 * 32 byte stack, 32-byte aligned - mov %rsp, %r8 + lea 8(%rsp),%r10 and $~31, %rsp sub $0x80, %rsp @@ -443,6 +443,6 @@ ENTRY(chacha20_8block_xor_avx2) vmovdqu %ymm15,0x01e0(%rsi) vzeroupper - mov %r8,%rsp + lea -8(%r10),%rsp ret ENDPROC(chacha20_8block_xor_avx2) diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha20-ssse3-x86_64.S index 3f511a7d73b8..512a2b500fd1 100644 --- a/arch/x86/crypto/chacha20-ssse3-x86_64.S +++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S @@ -160,7 +160,7 @@ ENTRY(chacha20_4block_xor_ssse3) # done with the slightly better performing SSSE3 byte shuffling, # 7/12-bit word rotation uses traditional shift+OR. - mov %rsp,%r11 + lea 8(%rsp),%r10 sub $0x80,%rsp and $~63,%rsp @@ -625,6 +625,6 @@ ENTRY(chacha20_4block_xor_ssse3) pxor %xmm1,%xmm15 movdqu %xmm15,0xf0(%rsi) - mov %r11,%rsp + lea -8(%r10),%rsp ret ENDPROC(chacha20_4block_xor_ssse3) diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S index f3e91647ca27..8e49ce117494 100644 --- a/arch/x86/crypto/des3_ede-asm_64.S +++ b/arch/x86/crypto/des3_ede-asm_64.S @@ -64,12 +64,12 @@ #define RW2bh %ch #define RT0 %r15 -#define RT1 %rbp +#define RT1 %rsi #define RT2 %r14 #define RT3 %rdx #define RT0d %r15d -#define RT1d %ebp +#define RT1d %esi #define RT2d %r14d #define RT3d %edx @@ -177,13 +177,14 @@ ENTRY(des3_ede_x86_64_crypt_blk) * %rsi: dst * %rdx: src */ - pushq %rbp; pushq %rbx; pushq %r12; pushq %r13; pushq %r14; pushq %r15; + pushq %rsi; /* dst */ + read_block(%rdx, RL0, RR0); initial_permutation(RL0, RR0); @@ -241,6 +242,8 @@ ENTRY(des3_ede_x86_64_crypt_blk) round1(32+15, RL0, RR0, dummy2); final_permutation(RR0, RL0); + + popq %rsi /* dst */ write_block(%rsi, RR0, RL0); popq %r15; @@ -248,7 +251,6 @@ ENTRY(des3_ede_x86_64_crypt_blk) popq %r13; popq %r12; popq %rbx; - popq %rbp; ret; ENDPROC(des3_ede_x86_64_crypt_blk) @@ -432,13 +434,14 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way) * %rdx: src (3 blocks) */ - pushq %rbp; pushq %rbx; pushq %r12; pushq %r13; pushq %r14; pushq %r15; + pushq %rsi /* dst */ + /* load input */ movl 0 * 4(%rdx), RL0d; movl 1 * 4(%rdx), RR0d; @@ -520,6 +523,7 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way) bswapl RR2d; bswapl RL2d; + popq %rsi /* dst */ movl RR0d, 0 * 4(%rsi); movl RL0d, 1 * 4(%rsi); movl RR1d, 2 * 4(%rsi); @@ -532,7 +536,6 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way) popq %r13; popq %r12; popq %rbx; - popq %rbp; ret; ENDPROC(des3_ede_x86_64_crypt_blk_3way) diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c index d6fc59aaaadf..30c0a37f4882 100644 --- a/arch/x86/crypto/des3_ede_glue.c +++ b/arch/x86/crypto/des3_ede_glue.c @@ -277,8 +277,7 @@ static void ctr_crypt_final(struct des3_ede_x86_ctx *ctx, unsigned int nbytes = walk->nbytes; des3_ede_enc_blk(ctx, keystream, ctrblk); - crypto_xor(keystream, src, nbytes); - memcpy(dst, keystream, nbytes); + crypto_xor_cpy(dst, keystream, src, nbytes); crypto_inc(ctrblk, DES3_EDE_BLOCK_SIZE); } diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c index 24ac9fad832d..d61e57960fe0 100644 --- a/arch/x86/crypto/glue_helper.c +++ b/arch/x86/crypto/glue_helper.c @@ -176,9 +176,6 @@ __glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx, src -= 1; dst -= 1; } while (nbytes >= func_bytes); - - if (nbytes < bsize) - goto done; } } diff --git 
a/arch/x86/crypto/salsa20-x86_64-asm_64.S b/arch/x86/crypto/salsa20-x86_64-asm_64.S index 9279e0b2d60e..10db30d58006 100644 --- a/arch/x86/crypto/salsa20-x86_64-asm_64.S +++ b/arch/x86/crypto/salsa20-x86_64-asm_64.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <linux/linkage.h> # enter salsa20_encrypt_bytes diff --git a/arch/x86/crypto/sha1-mb/Makefile b/arch/x86/crypto/sha1-mb/Makefile index 2f8756375df5..815ded3ba90e 100644 --- a/arch/x86/crypto/sha1-mb/Makefile +++ b/arch/x86/crypto/sha1-mb/Makefile @@ -1,7 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0 # # Arch-specific CryptoAPI modules. # +OBJECT_FILES_NON_STANDARD := y + avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\ $(comma)4)$(comma)%ymm2,yes,no) ifeq ($(avx2_supported),yes) diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S index 1cd792db15ef..9f712a7dfd79 100644 --- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S +++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S @@ -89,7 +89,7 @@ #define REG_RE %rdx #define REG_RTA %r12 #define REG_RTB %rbx -#define REG_T1 %ebp +#define REG_T1 %r11d #define xmm_mov vmovups #define avx2_zeroupper vzeroupper #define RND_F1 1 @@ -117,11 +117,10 @@ .set T1, REG_T1 .endm -#define K_BASE %r8 #define HASH_PTR %r9 +#define BLOCKS_CTR %r8 #define BUFFER_PTR %r10 #define BUFFER_PTR2 %r13 -#define BUFFER_END %r11 #define PRECALC_BUF %r14 #define WK_BUF %r15 @@ -205,14 +204,14 @@ * blended AVX2 and ALU instruction scheduling * 1 vector iteration per 8 rounds */ - vmovdqu ((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP + vmovdqu (i * 2)(BUFFER_PTR), W_TMP .elseif ((i & 7) == 1) - vinsertf128 $1, (((i-1) * 2)+PRECALC_OFFSET)(BUFFER_PTR2),\ + vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\ WY_TMP, WY_TMP .elseif ((i & 7) == 2) vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY .elseif ((i & 7) == 4) - vpaddd K_XMM(K_BASE), WY, WY_TMP + vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP .elseif ((i & 7) == 7) vmovdqu WY_TMP, PRECALC_WK(i&~7) @@ -255,7 +254,7 @@ vpxor WY, WY_TMP, WY_TMP .elseif ((i & 7) == 7) vpxor WY_TMP2, WY_TMP, WY - vpaddd K_XMM(K_BASE), WY, WY_TMP + vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP vmovdqu WY_TMP, PRECALC_WK(i&~7) PRECALC_ROTATE_WY @@ -291,7 +290,7 @@ vpsrld $30, WY, WY vpor WY, WY_TMP, WY .elseif ((i & 7) == 7) - vpaddd K_XMM(K_BASE), WY, WY_TMP + vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP vmovdqu WY_TMP, PRECALC_WK(i&~7) PRECALC_ROTATE_WY @@ -446,6 +445,16 @@ .endm +/* Add constant only if (%2 > %3) condition met (uses RTA as temp) + * %1 + %2 >= %3 ? %4 : 0 + */ +.macro ADD_IF_GE a, b, c, d + mov \a, RTA + add $\d, RTA + cmp $\c, \b + cmovge RTA, \a +.endm + /* * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining */ @@ -463,13 +472,16 @@ lea (2*4*80+32)(%rsp), WK_BUF # Precalc WK for first 2 blocks - PRECALC_OFFSET = 0 + ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64 .set i, 0 .rept 160 PRECALC i .set i, i + 1 .endr - PRECALC_OFFSET = 128 + + /* Go to next block if needed */ + ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128 + ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 xchg WK_BUF, PRECALC_BUF .align 32 @@ -479,8 +491,8 @@ _loop: * we use K_BASE value as a signal of a last block, * it is set below by: cmovae BUFFER_PTR, K_BASE */ - cmp K_BASE, BUFFER_PTR - jne _begin + test BLOCKS_CTR, BLOCKS_CTR + jnz _begin .align 32 jmp _end .align 32 @@ -512,10 +524,10 @@ _loop0: .set j, j+2 .endr - add $(2*64), BUFFER_PTR /* move to next odd-64-byte block */ - cmp BUFFER_END, BUFFER_PTR /* is current block the last one? 
*/ - cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */ - + /* Update Counter */ + sub $1, BLOCKS_CTR + /* Move to the next block only if needed*/ + ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128 /* * rounds * 60,62,64,66,68 @@ -532,8 +544,8 @@ _loop0: UPDATE_HASH 12(HASH_PTR), D UPDATE_HASH 16(HASH_PTR), E - cmp K_BASE, BUFFER_PTR /* is current block the last one? */ - je _loop + test BLOCKS_CTR, BLOCKS_CTR + jz _loop mov TB, B @@ -575,10 +587,10 @@ _loop2: .set j, j+2 .endr - add $(2*64), BUFFER_PTR2 /* move to next even-64-byte block */ - - cmp BUFFER_END, BUFFER_PTR2 /* is current block the last one */ - cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */ + /* update counter */ + sub $1, BLOCKS_CTR + /* Move to the next block only if needed*/ + ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 jmp _loop3 _loop3: @@ -625,7 +637,6 @@ _loop3: ENTRY(\name) push %rbx - push %rbp push %r12 push %r13 push %r14 @@ -641,19 +652,12 @@ _loop3: avx2_zeroupper - lea K_XMM_AR(%rip), K_BASE - + /* Setup initial values */ mov CTX, HASH_PTR mov BUF, BUFFER_PTR - lea 64(BUF), BUFFER_PTR2 - - shl $6, CNT /* mul by 64 */ - add BUF, CNT - add $64, CNT - mov CNT, BUFFER_END - cmp BUFFER_END, BUFFER_PTR2 - cmovae K_BASE, BUFFER_PTR2 + mov BUF, BUFFER_PTR2 + mov CNT, BLOCKS_CTR xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP @@ -668,7 +672,6 @@ _loop3: pop %r14 pop %r13 pop %r12 - pop %rbp pop %rbx ret diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S index a4109506a5e8..6204bd53528c 100644 --- a/arch/x86/crypto/sha1_ssse3_asm.S +++ b/arch/x86/crypto/sha1_ssse3_asm.S @@ -37,7 +37,7 @@ #define REG_A %ecx #define REG_B %esi #define REG_C %edi -#define REG_D %ebp +#define REG_D %r12d #define REG_E %edx #define REG_T1 %eax @@ -74,10 +74,10 @@ ENTRY(\name) push %rbx - push %rbp push %r12 + push %rbp + mov %rsp, %rbp - mov %rsp, %r12 sub $64, %rsp # allocate workspace and $~15, %rsp # align stack @@ -99,10 +99,9 @@ xor %rax, %rax rep stosq - mov %r12, %rsp # deallocate workspace - - pop %r12 + mov %rbp, %rsp # deallocate workspace pop %rbp + pop %r12 pop %rbx ret diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S index e08888a1a5f2..001bbcf93c79 100644 --- a/arch/x86/crypto/sha256-avx-asm.S +++ b/arch/x86/crypto/sha256-avx-asm.S @@ -103,7 +103,7 @@ SRND = %rsi # clobbers INP c = %ecx d = %r8d e = %edx -TBL = %rbp +TBL = %r12 a = %eax b = %ebx @@ -350,13 +350,13 @@ a = TMP_ ENTRY(sha256_transform_avx) .align 32 pushq %rbx - pushq %rbp + pushq %r12 pushq %r13 pushq %r14 pushq %r15 - pushq %r12 + pushq %rbp + movq %rsp, %rbp - mov %rsp, %r12 subq $STACK_SIZE, %rsp # allocate stack space and $~15, %rsp # align stack pointer @@ -452,13 +452,12 @@ loop2: done_hash: - mov %r12, %rsp - - popq %r12 + mov %rbp, %rsp + popq %rbp popq %r15 popq %r14 popq %r13 - popq %rbp + popq %r12 popq %rbx ret ENDPROC(sha256_transform_avx) diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S index 89c8f09787d2..1420db15dcdd 100644 --- a/arch/x86/crypto/sha256-avx2-asm.S +++ b/arch/x86/crypto/sha256-avx2-asm.S @@ -98,8 +98,6 @@ d = %r8d e = %edx # clobbers NUM_BLKS y3 = %esi # clobbers INP - -TBL = %rbp SRND = CTX # SRND is same register as CTX a = %eax @@ -531,7 +529,6 @@ STACK_SIZE = _RSP + _RSP_SIZE ENTRY(sha256_transform_rorx) .align 32 pushq %rbx - pushq %rbp pushq %r12 pushq %r13 pushq %r14 @@ -568,8 +565,6 @@ ENTRY(sha256_transform_rorx) mov CTX, _CTX(%rsp) loop0: - lea K256(%rip), TBL - ## Load first 16 dwords from two blocks 
VMOVDQ 0*32(INP),XTMP0 VMOVDQ 1*32(INP),XTMP1 @@ -597,19 +592,19 @@ last_block_enter: .align 16 loop1: - vpaddd 0*32(TBL, SRND), X0, XFER + vpaddd K256+0*32(SRND), X0, XFER vmovdqa XFER, 0*32+_XFER(%rsp, SRND) FOUR_ROUNDS_AND_SCHED _XFER + 0*32 - vpaddd 1*32(TBL, SRND), X0, XFER + vpaddd K256+1*32(SRND), X0, XFER vmovdqa XFER, 1*32+_XFER(%rsp, SRND) FOUR_ROUNDS_AND_SCHED _XFER + 1*32 - vpaddd 2*32(TBL, SRND), X0, XFER + vpaddd K256+2*32(SRND), X0, XFER vmovdqa XFER, 2*32+_XFER(%rsp, SRND) FOUR_ROUNDS_AND_SCHED _XFER + 2*32 - vpaddd 3*32(TBL, SRND), X0, XFER + vpaddd K256+3*32(SRND), X0, XFER vmovdqa XFER, 3*32+_XFER(%rsp, SRND) FOUR_ROUNDS_AND_SCHED _XFER + 3*32 @@ -619,10 +614,11 @@ loop1: loop2: ## Do last 16 rounds with no scheduling - vpaddd 0*32(TBL, SRND), X0, XFER + vpaddd K256+0*32(SRND), X0, XFER vmovdqa XFER, 0*32+_XFER(%rsp, SRND) DO_4ROUNDS _XFER + 0*32 - vpaddd 1*32(TBL, SRND), X1, XFER + + vpaddd K256+1*32(SRND), X1, XFER vmovdqa XFER, 1*32+_XFER(%rsp, SRND) DO_4ROUNDS _XFER + 1*32 add $2*32, SRND @@ -676,9 +672,6 @@ loop3: ja done_hash do_last_block: - #### do last block - lea K256(%rip), TBL - VMOVDQ 0*16(INP),XWORD0 VMOVDQ 1*16(INP),XWORD1 VMOVDQ 2*16(INP),XWORD2 @@ -718,7 +711,6 @@ done_hash: popq %r14 popq %r13 popq %r12 - popq %rbp popq %rbx ret ENDPROC(sha256_transform_rorx) diff --git a/arch/x86/crypto/sha256-mb/Makefile b/arch/x86/crypto/sha256-mb/Makefile index 41089e7c400c..53ad6e7db747 100644 --- a/arch/x86/crypto/sha256-mb/Makefile +++ b/arch/x86/crypto/sha256-mb/Makefile @@ -1,7 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0 # # Arch-specific CryptoAPI modules. # +OBJECT_FILES_NON_STANDARD := y + avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\ $(comma)4)$(comma)%ymm2,yes,no) ifeq ($(avx2_supported),yes) diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S index 39b83c93e7fd..c6c05ed2c16a 100644 --- a/arch/x86/crypto/sha256-ssse3-asm.S +++ b/arch/x86/crypto/sha256-ssse3-asm.S @@ -95,7 +95,7 @@ SRND = %rsi # clobbers INP c = %ecx d = %r8d e = %edx -TBL = %rbp +TBL = %r12 a = %eax b = %ebx @@ -356,13 +356,13 @@ a = TMP_ ENTRY(sha256_transform_ssse3) .align 32 pushq %rbx - pushq %rbp + pushq %r12 pushq %r13 pushq %r14 pushq %r15 - pushq %r12 + pushq %rbp + mov %rsp, %rbp - mov %rsp, %r12 subq $STACK_SIZE, %rsp and $~15, %rsp @@ -462,13 +462,12 @@ loop2: done_hash: - mov %r12, %rsp - - popq %r12 + mov %rbp, %rsp + popq %rbp popq %r15 popq %r14 popq %r13 - popq %rbp + popq %r12 popq %rbx ret diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S index 7f5f6c6ec72e..b16d56005162 100644 --- a/arch/x86/crypto/sha512-avx2-asm.S +++ b/arch/x86/crypto/sha512-avx2-asm.S @@ -69,8 +69,9 @@ XFER = YTMP0 BYTE_FLIP_MASK = %ymm9 -# 1st arg -CTX = %rdi +# 1st arg is %rdi, which is saved to the stack and accessed later via %r12 +CTX1 = %rdi +CTX2 = %r12 # 2nd arg INP = %rsi # 3rd arg @@ -81,7 +82,7 @@ d = %r8 e = %rdx y3 = %rsi -TBL = %rbp +TBL = %rdi # clobbers CTX1 a = %rax b = %rbx @@ -91,26 +92,26 @@ g = %r10 h = %r11 old_h = %r11 -T1 = %r12 +T1 = %r12 # clobbers CTX2 y0 = %r13 y1 = %r14 y2 = %r15 -y4 = %r12 - # Local variables (stack frame) XFER_SIZE = 4*8 SRND_SIZE = 1*8 INP_SIZE = 1*8 INPEND_SIZE = 1*8 +CTX_SIZE = 1*8 RSPSAVE_SIZE = 1*8 -GPRSAVE_SIZE = 6*8 +GPRSAVE_SIZE = 5*8 frame_XFER = 0 frame_SRND = frame_XFER + XFER_SIZE frame_INP = frame_SRND + SRND_SIZE frame_INPEND = frame_INP + INP_SIZE -frame_RSPSAVE = frame_INPEND + INPEND_SIZE +frame_CTX = frame_INPEND + INPEND_SIZE 
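# Worked out from the sizes above (reference sketch only, not part of the
# patch): frame_XFER = 0, frame_SRND = 32, frame_INP = 40, frame_INPEND = 48,
# frame_CTX = 56, frame_RSPSAVE = 64, frame_GPRSAVE = 72, so
# frame_size = 72 + 5*8 = 112 bytes now that only five GPRs are saved.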
+frame_RSPSAVE = frame_CTX + CTX_SIZE frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE frame_size = frame_GPRSAVE + GPRSAVE_SIZE @@ -576,12 +577,11 @@ ENTRY(sha512_transform_rorx) mov %rax, frame_RSPSAVE(%rsp) # Save GPRs - mov %rbp, frame_GPRSAVE(%rsp) - mov %rbx, 8*1+frame_GPRSAVE(%rsp) - mov %r12, 8*2+frame_GPRSAVE(%rsp) - mov %r13, 8*3+frame_GPRSAVE(%rsp) - mov %r14, 8*4+frame_GPRSAVE(%rsp) - mov %r15, 8*5+frame_GPRSAVE(%rsp) + mov %rbx, 8*0+frame_GPRSAVE(%rsp) + mov %r12, 8*1+frame_GPRSAVE(%rsp) + mov %r13, 8*2+frame_GPRSAVE(%rsp) + mov %r14, 8*3+frame_GPRSAVE(%rsp) + mov %r15, 8*4+frame_GPRSAVE(%rsp) shl $7, NUM_BLKS # convert to bytes jz done_hash @@ -589,14 +589,17 @@ ENTRY(sha512_transform_rorx) mov NUM_BLKS, frame_INPEND(%rsp) ## load initial digest - mov 8*0(CTX),a - mov 8*1(CTX),b - mov 8*2(CTX),c - mov 8*3(CTX),d - mov 8*4(CTX),e - mov 8*5(CTX),f - mov 8*6(CTX),g - mov 8*7(CTX),h + mov 8*0(CTX1), a + mov 8*1(CTX1), b + mov 8*2(CTX1), c + mov 8*3(CTX1), d + mov 8*4(CTX1), e + mov 8*5(CTX1), f + mov 8*6(CTX1), g + mov 8*7(CTX1), h + + # save %rdi (CTX) before it gets clobbered + mov %rdi, frame_CTX(%rsp) vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK @@ -652,14 +655,15 @@ loop2: subq $1, frame_SRND(%rsp) jne loop2 - addm 8*0(CTX),a - addm 8*1(CTX),b - addm 8*2(CTX),c - addm 8*3(CTX),d - addm 8*4(CTX),e - addm 8*5(CTX),f - addm 8*6(CTX),g - addm 8*7(CTX),h + mov frame_CTX(%rsp), CTX2 + addm 8*0(CTX2), a + addm 8*1(CTX2), b + addm 8*2(CTX2), c + addm 8*3(CTX2), d + addm 8*4(CTX2), e + addm 8*5(CTX2), f + addm 8*6(CTX2), g + addm 8*7(CTX2), h mov frame_INP(%rsp), INP add $128, INP @@ -669,12 +673,11 @@ loop2: done_hash: # Restore GPRs - mov frame_GPRSAVE(%rsp) ,%rbp - mov 8*1+frame_GPRSAVE(%rsp) ,%rbx - mov 8*2+frame_GPRSAVE(%rsp) ,%r12 - mov 8*3+frame_GPRSAVE(%rsp) ,%r13 - mov 8*4+frame_GPRSAVE(%rsp) ,%r14 - mov 8*5+frame_GPRSAVE(%rsp) ,%r15 + mov 8*0+frame_GPRSAVE(%rsp), %rbx + mov 8*1+frame_GPRSAVE(%rsp), %r12 + mov 8*2+frame_GPRSAVE(%rsp), %r13 + mov 8*3+frame_GPRSAVE(%rsp), %r14 + mov 8*4+frame_GPRSAVE(%rsp), %r15 # Restore Stack Pointer mov frame_RSPSAVE(%rsp), %rsp diff --git a/arch/x86/crypto/sha512-mb/Makefile b/arch/x86/crypto/sha512-mb/Makefile index 0a57e2103980..90f1ef69152e 100644 --- a/arch/x86/crypto/sha512-mb/Makefile +++ b/arch/x86/crypto/sha512-mb/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Arch-specific CryptoAPI modules. # diff --git a/arch/x86/crypto/sha512-mb/sha512_mb.c b/arch/x86/crypto/sha512-mb/sha512_mb.c index 2dd3674b5a1e..458409b7568d 100644 --- a/arch/x86/crypto/sha512-mb/sha512_mb.c +++ b/arch/x86/crypto/sha512-mb/sha512_mb.c @@ -269,19 +269,19 @@ static struct sha512_hash_ctx * LAST */ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS; - return ctx; + goto unlock; } if (ctx->status & HASH_CTX_STS_PROCESSING) { /* Cannot submit to a currently processing job. */ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING; - return ctx; + goto unlock; } if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) { /* Cannot update a finished job. 
*/ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED; - return ctx; + goto unlock; } @@ -363,6 +363,7 @@ static struct sha512_hash_ctx } ctx = sha512_ctx_mgr_resubmit(mgr, ctx); +unlock: spin_unlock_irqrestore(&cstate->work_lock, irqflags); return ctx; } diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S index b3f49d286348..73b471da3622 100644 --- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S @@ -76,8 +76,8 @@ #define RT %xmm14 #define RR %xmm15 -#define RID1 %rbp -#define RID1d %ebp +#define RID1 %r13 +#define RID1d %r13d #define RID2 %rsi #define RID2d %esi @@ -259,7 +259,7 @@ __twofish_enc_blk8: vmovdqu w(CTX), RK1; - pushq %rbp; + pushq %r13; pushq %rbx; pushq %rcx; @@ -282,7 +282,7 @@ __twofish_enc_blk8: popq %rcx; popq %rbx; - popq %rbp; + popq %r13; outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2); outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2); @@ -301,7 +301,7 @@ __twofish_dec_blk8: vmovdqu (w+4*4)(CTX), RK1; - pushq %rbp; + pushq %r13; pushq %rbx; inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2); @@ -322,7 +322,7 @@ __twofish_dec_blk8: vmovdqu (w)(CTX), RK1; popq %rbx; - popq %rbp; + popq %r13; outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2); outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2); diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index 9976fcecd17e..06fc70cf5433 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -1,8 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for the x86 low level entry code # -OBJECT_FILES_NON_STANDARD_entry_$(BITS).o := y OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y CFLAGS_syscall_64.o += $(call cc-option,-Wno-override-init,) diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 05ed3d393da7..6e160031cfea 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -1,4 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <linux/jump_label.h> +#include <asm/unwind_hints.h> /* @@ -112,6 +114,7 @@ For 32-bit we have the following conventions - kernel is built with movq %rdx, 12*8+\offset(%rsp) movq %rsi, 13*8+\offset(%rsp) movq %rdi, 14*8+\offset(%rsp) + UNWIND_HINT_REGS offset=\offset extra=0 .endm .macro SAVE_C_REGS offset=0 SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1 @@ -136,6 +139,7 @@ For 32-bit we have the following conventions - kernel is built with movq %r12, 3*8+\offset(%rsp) movq %rbp, 4*8+\offset(%rsp) movq %rbx, 5*8+\offset(%rsp) + UNWIND_HINT_REGS offset=\offset .endm .macro RESTORE_EXTRA_REGS offset=0 @@ -145,6 +149,7 @@ For 32-bit we have the following conventions - kernel is built with movq 3*8+\offset(%rsp), %r12 movq 4*8+\offset(%rsp), %rbp movq 5*8+\offset(%rsp), %rbx + UNWIND_HINT_REGS offset=\offset extra=0 .endm .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 @@ -167,6 +172,7 @@ For 32-bit we have the following conventions - kernel is built with .endif movq 13*8(%rsp), %rsi movq 14*8(%rsp), %rdi + UNWIND_HINT_IRET_REGS offset=16*8 .endm .macro RESTORE_C_REGS RESTORE_C_REGS_HELPER 1,1,1,1,1 diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index cdefcfdd9e63..03505ffbe1b6 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -23,6 +23,7 @@ #include <linux/user-return-notifier.h> #include <linux/uprobes.h> #include <linux/livepatch.h> +#include <linux/syscalls.h> #include <asm/desc.h> #include <asm/traps.h> @@ -183,6 +184,8 @@ __visible inline 
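The N*8 slot numbers used by the SAVE_C_REGS/SAVE_EXTRA_REGS macros above (r12 at 3*8, rbp at 4*8, rbx at 5*8, rdx at 12*8, rsi at 13*8, rdi at 14*8, hardware frame at 16*8) index into the pt_regs area built on the kernel stack, which is what the new UNWIND_HINT_REGS annotations describe to objtool. A C picture of the layout those offsets imply; the field order is reproduced from memory of the x86-64 struct pt_regs, so treat it as a sketch rather than a reference.

/* One 8-byte slot per line; slot numbers match the N*8 offsets above. */
struct ptregs_sketch {
	unsigned long r15;      /* slot 0  */
	unsigned long r14;      /* slot 1  */
	unsigned long r13;      /* slot 2  */
	unsigned long r12;      /* slot 3  - SAVE_EXTRA_REGS 3*8 */
	unsigned long bp;       /* slot 4  - 4*8 */
	unsigned long bx;       /* slot 5  - 5*8 */
	unsigned long r11;      /* slot 6  */
	unsigned long r10;      /* slot 7  */
	unsigned long r9;       /* slot 8  */
	unsigned long r8;       /* slot 9  */
	unsigned long ax;       /* slot 10 */
	unsigned long cx;       /* slot 11 */
	unsigned long dx;       /* slot 12 - SAVE_C_REGS 12*8 */
	unsigned long si;       /* slot 13 - 13*8 */
	unsigned long di;       /* slot 14 - 14*8 */
	unsigned long orig_ax;  /* slot 15 */
	/* hardware iret frame, hence UNWIND_HINT_IRET_REGS offset=16*8 */
	unsigned long ip;       /* slot 16 */
	unsigned long cs;       /* slot 17 */
	unsigned long flags;    /* slot 18 */
	unsigned long sp;       /* slot 19 */
	unsigned long ss;       /* slot 20 */
};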
void prepare_exit_to_usermode(struct pt_regs *regs) struct thread_info *ti = current_thread_info(); u32 cached_flags; + addr_limit_user_check(); + if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled())) local_irq_disable(); diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 48ef7bb32c42..4838037f97f6 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 1991,1992 Linus Torvalds * @@ -176,7 +177,7 @@ /* * This is a sneaky trick to help the unwinder find pt_regs on the stack. The * frame pointer is replaced with an encoded pointer to pt_regs. The encoding - * is just setting the LSB, which makes it an invalid stack address and is also + * is just clearing the MSB, which makes it an invalid stack address and is also * a signal to the unwinder that it's a pt_regs pointer in disguise. * * NOTE: This macro must be used *after* SAVE_ALL because it corrupts the @@ -185,7 +186,7 @@ .macro ENCODE_FRAME_POINTER #ifdef CONFIG_FRAME_POINTER mov %esp, %ebp - orl $0x1, %ebp + andl $0x7fffffff, %ebp #endif .endm @@ -673,16 +674,8 @@ ENTRY(name) \ jmp ret_from_intr; \ ENDPROC(name) - -#ifdef CONFIG_TRACING -# define TRACE_BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name) -#else -# define TRACE_BUILD_INTERRUPT(name, nr) -#endif - #define BUILD_INTERRUPT(name, nr) \ BUILD_INTERRUPT3(name, nr, smp_##name); \ - TRACE_BUILD_INTERRUPT(name, nr) /* The include is where all of the SMP etc. interrupts come from */ #include <asm/entry_arch.h> @@ -880,25 +873,17 @@ ENTRY(xen_failsafe_callback) ENDPROC(xen_failsafe_callback) BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR, - xen_evtchn_do_upcall) + xen_evtchn_do_upcall) #endif /* CONFIG_XEN */ #if IS_ENABLED(CONFIG_HYPERV) BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR, - hyperv_vector_handler) + hyperv_vector_handler) #endif /* CONFIG_HYPERV */ -#ifdef CONFIG_TRACING -ENTRY(trace_page_fault) - ASM_CLAC - pushl $trace_do_page_fault - jmp common_exception -END(trace_page_fault) -#endif - ENTRY(page_fault) ASM_CLAC pushl $do_page_fault diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 4a4c0834f965..bcfc5668dcb2 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * linux/arch/x86_64/entry.S * @@ -36,6 +37,7 @@ #include <asm/smap.h> #include <asm/pgtable_types.h> #include <asm/export.h> +#include <asm/frame.h> #include <linux/err.h> .code64 @@ -43,9 +45,10 @@ #ifdef CONFIG_PARAVIRT ENTRY(native_usergs_sysret64) + UNWIND_HINT_EMPTY swapgs sysretq -ENDPROC(native_usergs_sysret64) +END(native_usergs_sysret64) #endif /* CONFIG_PARAVIRT */ .macro TRACE_IRQS_IRETQ @@ -134,19 +137,14 @@ ENDPROC(native_usergs_sysret64) */ ENTRY(entry_SYSCALL_64) + UNWIND_HINT_EMPTY /* * Interrupts are off on entry. * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, * it is too small to ever cause noticeable irq latency. */ - SWAPGS_UNSAFE_STACK - /* - * A hypervisor implementation might want to use a label - * after the swapgs, so that it can do the swapgs - * for the guest and jump here on syscall. 
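The 32-bit ENCODE_FRAME_POINTER above now clears the most-significant bit of the pt_regs address instead of setting the low bit. Kernel stack addresses on 32-bit have the MSB set, so the encoded value cannot be mistaken for a real frame pointer, and the unwinder can treat it as a pt_regs pointer in disguise. A small stand-alone sketch of the encode/decode step; these are illustrative helpers, not the unwinder's actual code.

#include <stdint.h>

#define FP_MSB	0x80000000u	/* bit the macro clears with andl $0x7fffffff */

/* mov %esp, %ebp; andl $0x7fffffff, %ebp */
static inline uint32_t encode_frame_pointer(uint32_t regs_addr)
{
	return regs_addr & ~FP_MSB;
}

/* The unwinder can spot an encoded value by its clear MSB ... */
static inline int is_encoded_frame_pointer(uint32_t bp)
{
	return (bp & FP_MSB) == 0;
}

/* ... and put the bit back to recover the pt_regs address. */
static inline uint32_t decode_frame_pointer(uint32_t bp)
{
	return bp | FP_MSB;
}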
- */ -GLOBAL(entry_SYSCALL_64_after_swapgs) + swapgs movq %rsp, PER_CPU_VAR(rsp_scratch) movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp @@ -158,6 +156,7 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) pushq %r11 /* pt_regs->flags */ pushq $__USER_CS /* pt_regs->cs */ pushq %rcx /* pt_regs->ip */ +GLOBAL(entry_SYSCALL_64_after_hwframe) pushq %rax /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ @@ -169,6 +168,7 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) pushq %r10 /* pt_regs->r10 */ pushq %r11 /* pt_regs->r11 */ sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ + UNWIND_HINT_REGS extra=0 /* * If we need to do entry work or if we guess we'll need to do @@ -223,6 +223,7 @@ entry_SYSCALL_64_fastpath: movq EFLAGS(%rsp), %r11 RESTORE_C_REGS_EXCEPT_RCX_R11 movq RSP(%rsp), %rsp + UNWIND_HINT_EMPTY USERGS_SYSRET64 1: @@ -265,7 +266,8 @@ return_from_SYSCALL_64: * If width of "canonical tail" ever becomes variable, this will need * to be updated to remain correct on both old and new CPUs. * - * Change top 16 bits to be the sign-extension of 47th bit + * Change top bits to match most significant bit (47th or 56th bit + * depending on paging mode) in the address. */ shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx @@ -315,6 +317,7 @@ syscall_return_via_sysret: /* rcx and r11 are already restored (see code above) */ RESTORE_C_REGS_EXCEPT_RCX_R11 movq RSP(%rsp), %rsp + UNWIND_HINT_EMPTY USERGS_SYSRET64 opportunistic_sysret_failed: @@ -342,6 +345,7 @@ ENTRY(stub_ptregs_64) DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF popq %rax + UNWIND_HINT_REGS extra=0 jmp entry_SYSCALL64_slow_path 1: @@ -350,6 +354,7 @@ END(stub_ptregs_64) .macro ptregs_stub func ENTRY(ptregs_\func) + UNWIND_HINT_FUNC leaq \func(%rip), %rax jmp stub_ptregs_64 END(ptregs_\func) @@ -366,6 +371,7 @@ END(ptregs_\func) * %rsi: next task */ ENTRY(__switch_to_asm) + UNWIND_HINT_FUNC /* * Save callee-saved registers * This must match the order in inactive_task_frame @@ -405,6 +411,7 @@ END(__switch_to_asm) * r12: kernel thread arg */ ENTRY(ret_from_fork) + UNWIND_HINT_EMPTY movq %rax, %rdi call schedule_tail /* rdi: 'prev' task parameter */ @@ -412,6 +419,7 @@ ENTRY(ret_from_fork) jnz 1f /* kernel threads are uncommon */ 2: + UNWIND_HINT_REGS movq %rsp, %rdi call syscall_return_slowpath /* returns with IRQs disabled */ TRACE_IRQS_ON /* user mode is traced as IRQS on */ @@ -439,13 +447,102 @@ END(ret_from_fork) ENTRY(irq_entries_start) vector=FIRST_EXTERNAL_VECTOR .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) + UNWIND_HINT_IRET_REGS pushq $(~vector+0x80) /* Note: always in signed byte range */ - vector=vector+1 jmp common_interrupt .align 8 + vector=vector+1 .endr END(irq_entries_start) +.macro DEBUG_ENTRY_ASSERT_IRQS_OFF +#ifdef CONFIG_DEBUG_ENTRY + pushfq + testl $X86_EFLAGS_IF, (%rsp) + jz .Lokay_\@ + ud2 +.Lokay_\@: + addq $8, %rsp +#endif +.endm + +/* + * Enters the IRQ stack if we're not already using it. NMI-safe. Clobbers + * flags and puts old RSP into old_rsp, and leaves all other GPRs alone. + * Requires kernel GSBASE. + * + * The invariant is that, if irq_count != -1, then the IRQ stack is in use. + */ +.macro ENTER_IRQ_STACK regs=1 old_rsp + DEBUG_ENTRY_ASSERT_IRQS_OFF + movq %rsp, \old_rsp + + .if \regs + UNWIND_HINT_REGS base=\old_rsp + .endif + + incl PER_CPU_VAR(irq_count) + jnz .Lirq_stack_push_old_rsp_\@ + + /* + * Right now, if we just incremented irq_count to zero, we've + * claimed the IRQ stack but we haven't switched to it yet. 
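The shl/sar pair whose comment was updated above sign-extends the saved return address from the top implemented virtual-address bit, which is what "change top bits to match the most significant bit (47th or 56th bit depending on paging mode)" means in practice. The same operation in C, with the shift width passed in explicitly since __VIRTUAL_MASK_SHIFT depends on the paging configuration:

#include <stdint.h>

/*
 * C equivalent of:  shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
 *                   sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
 * __VIRTUAL_MASK_SHIFT is 47 for 4-level paging and 56 for 5-level.
 */
static inline uint64_t sign_extend_vaddr(uint64_t addr, unsigned int virtual_mask_shift)
{
	unsigned int shift = 64 - (virtual_mask_shift + 1);

	return (uint64_t)(((int64_t)(addr << shift)) >> shift);
}

/*
 * Examples with 4-level paging (bit 47 replicated upward):
 *   sign_extend_vaddr(0x0000800000000000ULL, 47) == 0xffff800000000000ULL
 *   sign_extend_vaddr(0x00007fffffffffffULL, 47) == 0x00007fffffffffffULL
 */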
+ * + * If anything is added that can interrupt us here without using IST, + * it must be *extremely* careful to limit its stack usage. This + * could include kprobes and a hypothetical future IST-less #DB + * handler. + * + * The OOPS unwinder relies on the word at the top of the IRQ + * stack linking back to the previous RSP for the entire time we're + * on the IRQ stack. For this to work reliably, we need to write + * it before we actually move ourselves to the IRQ stack. + */ + + movq \old_rsp, PER_CPU_VAR(irq_stack_union + IRQ_STACK_SIZE - 8) + movq PER_CPU_VAR(irq_stack_ptr), %rsp + +#ifdef CONFIG_DEBUG_ENTRY + /* + * If the first movq above becomes wrong due to IRQ stack layout + * changes, the only way we'll notice is if we try to unwind right + * here. Assert that we set up the stack right to catch this type + * of bug quickly. + */ + cmpq -8(%rsp), \old_rsp + je .Lirq_stack_okay\@ + ud2 + .Lirq_stack_okay\@: +#endif + +.Lirq_stack_push_old_rsp_\@: + pushq \old_rsp + + .if \regs + UNWIND_HINT_REGS indirect=1 + .endif +.endm + +/* + * Undoes ENTER_IRQ_STACK. + */ +.macro LEAVE_IRQ_STACK regs=1 + DEBUG_ENTRY_ASSERT_IRQS_OFF + /* We need to be off the IRQ stack before decrementing irq_count. */ + popq %rsp + + .if \regs + UNWIND_HINT_REGS + .endif + + /* + * As in ENTER_IRQ_STACK, irq_count == 0, we are still claiming + * the irq stack but we're not on it. + */ + + decl PER_CPU_VAR(irq_count) +.endm + /* * Interrupt entry/exit. * @@ -484,17 +581,7 @@ END(irq_entries_start) CALL_enter_from_user_mode 1: - /* - * Save previous stack pointer, optionally switch to interrupt stack. - * irq_count is used to check if a CPU is already on an interrupt stack - * or not. While this is essentially redundant with preempt_count it is - * a little cheaper to use a separate counter in the PDA (short of - * moving irq_enter into assembly, which would be too much work) - */ - movq %rsp, %rdi - incl PER_CPU_VAR(irq_count) - cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp - pushq %rdi + ENTER_IRQ_STACK old_rsp=%rdi /* We entered an interrupt context - irqs are off: */ TRACE_IRQS_OFF @@ -514,10 +601,8 @@ common_interrupt: ret_from_intr: DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF - decl PER_CPU_VAR(irq_count) - /* Restore saved previous stack */ - popq %rsp + LEAVE_IRQ_STACK testb $3, CS(%rsp) jz retint_kernel @@ -560,6 +645,7 @@ restore_c_regs_and_iret: INTERRUPT_RETURN ENTRY(native_iret) + UNWIND_HINT_IRET_REGS /* * Are we returning to a stack segment from the LDT? Note: in * 64-bit mode SS:RSP on the exception stack is always valid. 
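ENTER_IRQ_STACK/LEAVE_IRQ_STACK above replace the open-coded irq_count/cmovzq sequence at the interrupt entry and in do_softirq_own_stack. The key invariants are spelled out in the comments: irq_count == -1 means the CPU is not on its IRQ stack, and the old RSP is written into the top word of the IRQ stack before the switch so the OOPS unwinder always finds a back-link. A much-simplified C model of that bookkeeping; per-CPU variables and the pushed copy of old RSP are flattened into plain globals, and the stack size is illustrative.

#define IRQ_STACK_WORDS	(16384 / sizeof(unsigned long))	/* size is illustrative */

static long irq_count = -1;			/* -1: not on the IRQ stack */
static unsigned long irq_stack[IRQ_STACK_WORDS];

/* Returns the stack pointer the interrupt body should run on. */
static unsigned long *enter_irq_stack(unsigned long *old_sp)
{
	if (++irq_count != 0)
		return old_sp;		/* nested: already on the IRQ stack */

	/*
	 * First entry: record the interrupted stack pointer in the top
	 * word *before* switching, so an oops taken while on the IRQ
	 * stack can still be unwound back to the interrupted context.
	 */
	irq_stack[IRQ_STACK_WORDS - 1] = (unsigned long)old_sp;
	return &irq_stack[IRQ_STACK_WORDS - 1];
}

/* Runs after the stack pointer has already been switched back (popq %rsp). */
static void leave_irq_stack(void)
{
	irq_count--;
}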
@@ -632,6 +718,7 @@ native_irq_return_ldt: orq PER_CPU_VAR(espfix_stack), %rax SWAPGS movq %rax, %rsp + UNWIND_HINT_IRET_REGS offset=8 /* * At this point, we cannot write to the stack any more, but we can @@ -653,6 +740,7 @@ END(common_interrupt) */ .macro apicinterrupt3 num sym do_sym ENTRY(\sym) + UNWIND_HINT_IRET_REGS ASM_CLAC pushq $~(\num) .Lcommon_\sym: @@ -661,31 +749,13 @@ ENTRY(\sym) END(\sym) .endm -#ifdef CONFIG_TRACING -#define trace(sym) trace_##sym -#define smp_trace(sym) smp_trace_##sym - -.macro trace_apicinterrupt num sym -apicinterrupt3 \num trace(\sym) smp_trace(\sym) -.endm -#else -.macro trace_apicinterrupt num sym do_sym -.endm -#endif - /* Make sure APIC interrupt handlers end up in the irqentry section: */ -#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) -# define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax" -# define POP_SECTION_IRQENTRY .popsection -#else -# define PUSH_SECTION_IRQENTRY -# define POP_SECTION_IRQENTRY -#endif +#define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax" +#define POP_SECTION_IRQENTRY .popsection .macro apicinterrupt num sym do_sym PUSH_SECTION_IRQENTRY apicinterrupt3 \num \sym \do_sym -trace_apicinterrupt \num \sym POP_SECTION_IRQENTRY .endm @@ -704,6 +774,7 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR x86_platform_ipi smp_x86_platform_ipi #ifdef CONFIG_HAVE_KVM apicinterrupt3 POSTED_INTR_VECTOR kvm_posted_intr_ipi smp_kvm_posted_intr_ipi apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi +apicinterrupt3 POSTED_INTR_NESTED_VECTOR kvm_posted_intr_nested_ipi smp_kvm_posted_intr_nested_ipi #endif #ifdef CONFIG_X86_MCE_THRESHOLD @@ -738,13 +809,14 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ENTRY(\sym) + UNWIND_HINT_IRET_REGS offset=\has_error_code*8 + /* Sanity check */ .if \shift_ist != -1 && \paranoid == 0 .error "using shift_ist requires paranoid=1" .endif ASM_CLAC - PARAVIRT_ADJUST_EXCEPTION_FRAME .ifeq \has_error_code pushq $-1 /* ORIG_RAX: no syscall to restart */ @@ -761,6 +833,7 @@ ENTRY(\sym) .else call error_entry .endif + UNWIND_HINT_REGS /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ .if \paranoid @@ -827,17 +900,6 @@ ENTRY(\sym) END(\sym) .endm -#ifdef CONFIG_TRACING -.macro trace_idtentry sym do_sym has_error_code:req -idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code -idtentry \sym \do_sym has_error_code=\has_error_code -.endm -#else -.macro trace_idtentry sym do_sym has_error_code:req -idtentry \sym \do_sym has_error_code=\has_error_code -.endm -#endif - idtentry divide_error do_divide_error has_error_code=0 idtentry overflow do_overflow has_error_code=0 idtentry bounds do_bounds has_error_code=0 @@ -858,6 +920,7 @@ idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 * edi: new selector */ ENTRY(native_load_gs_index) + FRAME_BEGIN pushfq DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) SWAPGS @@ -866,8 +929,9 @@ ENTRY(native_load_gs_index) 2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE SWAPGS popfq + FRAME_END ret -END(native_load_gs_index) +ENDPROC(native_load_gs_index) EXPORT_SYMBOL(native_load_gs_index) _ASM_EXTABLE(.Lgs_change, bad_gs) @@ -890,17 +954,15 @@ bad_gs: ENTRY(do_softirq_own_stack) pushq %rbp mov %rsp, %rbp - incl PER_CPU_VAR(irq_count) - cmove PER_CPU_VAR(irq_stack_ptr), %rsp - push %rbp /* frame pointer backlink */ + ENTER_IRQ_STACK regs=0 old_rsp=%r11 call __do_softirq + 
LEAVE_IRQ_STACK regs=0 leaveq - decl PER_CPU_VAR(irq_count) ret -END(do_softirq_own_stack) +ENDPROC(do_softirq_own_stack) #ifdef CONFIG_XEN -idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0 +idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0 /* * A note on the "critical region" in our callback handler. @@ -921,14 +983,14 @@ ENTRY(xen_do_hypervisor_callback) /* do_hypervisor_callback(struct *pt_regs) */ * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will * see the correct pointer to the pt_regs */ + UNWIND_HINT_FUNC movq %rdi, %rsp /* we don't return, adjust the stack frame */ -11: incl PER_CPU_VAR(irq_count) - movq %rsp, %rbp - cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp - pushq %rbp /* frame pointer backlink */ + UNWIND_HINT_REGS + + ENTER_IRQ_STACK old_rsp=%r10 call xen_evtchn_do_upcall - popq %rsp - decl PER_CPU_VAR(irq_count) + LEAVE_IRQ_STACK + #ifndef CONFIG_PREEMPT call xen_maybe_preempt_hcall #endif @@ -949,6 +1011,7 @@ END(xen_do_hypervisor_callback) * with its current contents: any discrepancy means we in category 1. */ ENTRY(xen_failsafe_callback) + UNWIND_HINT_EMPTY movl %ds, %ecx cmpw %cx, 0x10(%rsp) jne 1f @@ -966,13 +1029,13 @@ ENTRY(xen_failsafe_callback) movq 8(%rsp), %r11 addq $0x30, %rsp pushq $0 /* RIP */ - pushq %r11 - pushq %rcx + UNWIND_HINT_IRET_REGS offset=8 jmp general_protection 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ movq (%rsp), %rcx movq 8(%rsp), %r11 addq $0x30, %rsp + UNWIND_HINT_IRET_REGS pushq $-1 /* orig_ax = -1 => not a system call */ ALLOC_PT_GPREGS_ON_STACK SAVE_C_REGS @@ -996,13 +1059,12 @@ idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK idtentry stack_segment do_stack_segment has_error_code=1 #ifdef CONFIG_XEN -idtentry xen_debug do_debug has_error_code=0 -idtentry xen_int3 do_int3 has_error_code=0 -idtentry xen_stack_segment do_stack_segment has_error_code=1 +idtentry xendebug do_debug has_error_code=0 +idtentry xenint3 do_int3 has_error_code=0 #endif idtentry general_protection do_general_protection has_error_code=1 -trace_idtentry page_fault do_page_fault has_error_code=1 +idtentry page_fault do_page_fault has_error_code=1 #ifdef CONFIG_KVM_GUEST idtentry async_page_fault do_async_page_fault has_error_code=1 @@ -1018,6 +1080,7 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vec * Return: ebx=0: need swapgs on exit, ebx=1: otherwise */ ENTRY(paranoid_entry) + UNWIND_HINT_FUNC cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 @@ -1045,6 +1108,7 @@ END(paranoid_entry) * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ ENTRY(paranoid_exit) + UNWIND_HINT_REGS DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF_DEBUG testl %ebx, %ebx /* swapgs needed? */ @@ -1066,6 +1130,7 @@ END(paranoid_exit) * Return: EBX=0: came from user mode; EBX=1: otherwise */ ENTRY(error_entry) + UNWIND_HINT_FUNC cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 @@ -1150,6 +1215,7 @@ END(error_entry) * 0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode */ ENTRY(error_exit) + UNWIND_HINT_REGS DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF testl %ebx, %ebx @@ -1158,19 +1224,9 @@ ENTRY(error_exit) END(error_exit) /* Runs on exception stack */ +/* XXX: broken on Xen PV */ ENTRY(nmi) - /* - * Fix up the exception frame if we're on Xen. 
- * PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most - * one value to the stack on native, so it may clobber the rdx - * scratch slot, but it won't clobber any of the important - * slots past it. - * - * Xen is a different story, because the Xen frame itself overlaps - * the "NMI executing" variable. - */ - PARAVIRT_ADJUST_EXCEPTION_FRAME - + UNWIND_HINT_IRET_REGS /* * We allow breakpoints in NMIs. If a breakpoint occurs, then * the iretq it performs will take us out of NMI context. @@ -1209,6 +1265,8 @@ ENTRY(nmi) * other IST entries. */ + ASM_CLAC + /* Use %rdx as our temp variable throughout */ pushq %rdx @@ -1230,11 +1288,13 @@ ENTRY(nmi) cld movq %rsp, %rdx movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + UNWIND_HINT_IRET_REGS base=%rdx offset=8 pushq 5*8(%rdx) /* pt_regs->ss */ pushq 4*8(%rdx) /* pt_regs->rsp */ pushq 3*8(%rdx) /* pt_regs->flags */ pushq 2*8(%rdx) /* pt_regs->cs */ pushq 1*8(%rdx) /* pt_regs->rip */ + UNWIND_HINT_IRET_REGS pushq $-1 /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ @@ -1251,6 +1311,7 @@ ENTRY(nmi) pushq %r13 /* pt_regs->r13 */ pushq %r14 /* pt_regs->r14 */ pushq %r15 /* pt_regs->r15 */ + UNWIND_HINT_REGS ENCODE_FRAME_POINTER /* @@ -1405,6 +1466,7 @@ first_nmi: .rept 5 pushq 11*8(%rsp) .endr + UNWIND_HINT_IRET_REGS /* Everything up to here is safe from nested NMIs */ @@ -1420,6 +1482,7 @@ first_nmi: pushq $__KERNEL_CS /* CS */ pushq $1f /* RIP */ INTERRUPT_RETURN /* continues at repeat_nmi below */ + UNWIND_HINT_IRET_REGS 1: #endif @@ -1469,6 +1532,7 @@ end_repeat_nmi: * exceptions might do. */ call paranoid_entry + UNWIND_HINT_REGS /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ movq %rsp, %rdi @@ -1506,17 +1570,19 @@ nmi_restore: END(nmi) ENTRY(ignore_sysret) + UNWIND_HINT_EMPTY mov $-ENOSYS, %eax sysret END(ignore_sysret) ENTRY(rewind_stack_do_exit) + UNWIND_HINT_FUNC /* Prevent any naive code from trying to unwind to our caller. */ xorl %ebp, %ebp movq PER_CPU_VAR(cpu_current_top_of_stack), %rax - leaq -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%rax), %rsp + leaq -PTREGS_SIZE(%rax), %rsp + UNWIND_HINT_FUNC sp_offset=PTREGS_SIZE call do_exit -1: jmp 1b END(rewind_stack_do_exit) diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index e1721dafbcb1..b5c7a56ed256 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Compatibility mode system call entry point for x86-64. * @@ -183,21 +184,20 @@ ENDPROC(entry_SYSENTER_compat) */ ENTRY(entry_SYSCALL_compat) /* Interrupts are off on entry. */ - SWAPGS_UNSAFE_STACK + swapgs /* Stash user ESP and switch to the kernel stack. */ movl %esp, %r8d movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - /* Zero-extending 32-bit regs, do not remove */ - movl %eax, %eax - /* Construct struct pt_regs on stack */ pushq $__USER32_DS /* pt_regs->ss */ pushq %r8 /* pt_regs->sp */ pushq %r11 /* pt_regs->flags */ pushq $__USER32_CS /* pt_regs->cs */ pushq %rcx /* pt_regs->ip */ +GLOBAL(entry_SYSCALL_compat_after_hwframe) + movl %eax, %eax /* discard orig_ax high bits */ pushq %rax /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ @@ -294,7 +294,6 @@ ENTRY(entry_INT80_compat) /* * Interrupts are off on entry. 
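The NMI entry above rebuilds the hardware exception frame on the normal kernel stack by pushing the five words the CPU saved (SS, RSP, RFLAGS, CS, RIP, taken from 5*8..1*8 off the old stack pointer held in %rdx) and then constructing the rest of pt_regs by hand. Schematically in C; the struct mirrors the x86-64 IRET frame and the comments map each field back to the pushq in the hunk.

#include <stdint.h>

/* x86-64 hardware interrupt frame, lowest address first. */
struct hw_iret_frame {
	uint64_t ip;
	uint64_t cs;
	uint64_t flags;
	uint64_t sp;
	uint64_t ss;
};

/*
 * The asm pushes SS first and RIP last; because pushes grow the stack
 * downward, the copy ends up in the same order as the original frame.
 */
static void copy_hw_frame(struct hw_iret_frame *dst, const struct hw_iret_frame *src)
{
	dst->ss    = src->ss;		/* pushq 5*8(%rdx)  pt_regs->ss    */
	dst->sp    = src->sp;		/* pushq 4*8(%rdx)  pt_regs->rsp   */
	dst->flags = src->flags;	/* pushq 3*8(%rdx)  pt_regs->flags */
	dst->cs    = src->cs;		/* pushq 2*8(%rdx)  pt_regs->cs    */
	dst->ip    = src->ip;		/* pushq 1*8(%rdx)  pt_regs->rip   */
}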
*/ - PARAVIRT_ADJUST_EXCEPTION_FRAME ASM_CLAC /* Do this early to minimize exposure */ SWAPGS @@ -342,8 +341,7 @@ ENTRY(entry_INT80_compat) jmp restore_regs_and_iret END(entry_INT80_compat) - ALIGN -GLOBAL(stub32_clone) +ENTRY(stub32_clone) /* * The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr). * The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val). @@ -353,3 +351,4 @@ GLOBAL(stub32_clone) */ xchg %r8, %rcx jmp sys_clone +ENDPROC(stub32_clone) diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 8f895ee13a1c..95c294963612 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* System call table for i386. */ #include <linux/linkage.h> diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index 9dbc5abb6162..9c09775e589d 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* System call table for x86-64. */ #include <linux/linkage.h> diff --git a/arch/x86/entry/syscalls/Makefile b/arch/x86/entry/syscalls/Makefile index 57aa59fd140c..331f1dca5085 100644 --- a/arch/x86/entry/syscalls/Makefile +++ b/arch/x86/entry/syscalls/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 out := $(obj)/../../include/generated/asm uapi := $(obj)/../../include/generated/uapi/asm diff --git a/arch/x86/entry/syscalls/syscallhdr.sh b/arch/x86/entry/syscalls/syscallhdr.sh index 31fd5f1f38f7..12fbbcfe7ef3 100644 --- a/arch/x86/entry/syscalls/syscallhdr.sh +++ b/arch/x86/entry/syscalls/syscallhdr.sh @@ -1,4 +1,5 @@ #!/bin/sh +# SPDX-License-Identifier: GPL-2.0 in="$1" out="$2" diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh index 751d1f992630..d71ef4bd3615 100644 --- a/arch/x86/entry/syscalls/syscalltbl.sh +++ b/arch/x86/entry/syscalls/syscalltbl.sh @@ -1,4 +1,5 @@ #!/bin/sh +# SPDX-License-Identifier: GPL-2.0 in="$1" out="$2" diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index d5409660f5de..c366c0adeb40 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Building vDSO images for x86. # diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S index 8ebb4b6454fe..acfd5ba7d943 100644 --- a/arch/x86/entry/vdso/vdso-layout.lds.S +++ b/arch/x86/entry/vdso/vdso-layout.lds.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <asm/vdso.h> /* diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S index 6807932643c2..d3a2dce4cfa9 100644 --- a/arch/x86/entry/vdso/vdso.lds.S +++ b/arch/x86/entry/vdso/vdso.lds.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Linker script for 64-bit vDSO. * We #include the file to define the layout details. diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h index 3dab75f2a673..fa847a620f40 100644 --- a/arch/x86/entry/vdso/vdso2c.h +++ b/arch/x86/entry/vdso/vdso2c.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * This file is included twice from vdso2c.c. It generates code for 32-bit * and 64-bit vDSOs. 
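stub32_clone above only has to swap %rcx and %r8 because, as the comment in the hunk says, the 32-bit clone ABI passes tls_val before child_tidptr while the 64-bit ABI passes them the other way around. In C terms the shim amounts to the following; sys_clone_64 is a placeholder name for the real handler and the prototypes are simplified (the real syscall takes unsigned long arguments).

/* 64-bit order: clone(..., int *child_tidptr, int tls_val) */
long sys_clone_64(unsigned long flags, unsigned long newsp,
		  int *parent_tidptr, int *child_tidptr, unsigned long tls);

/* 32-bit order: clone(..., int tls_val, int *child_tidptr) */
static long stub32_clone_c(unsigned long flags, unsigned long newsp,
			   int *parent_tidptr, unsigned long tls,
			   int *child_tidptr)
{
	/* xchg %r8, %rcx: swap the last two arguments, then hand off. */
	return sys_clone_64(flags, newsp, parent_tidptr, child_tidptr, tls);
}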
We need both for 64-bit builds, since 32-bit vDSOs diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c index 3f9d1a83891a..42d4c89f990e 100644 --- a/arch/x86/entry/vdso/vdso32-setup.c +++ b/arch/x86/entry/vdso/vdso32-setup.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * (C) Copyright 2002 Linus Torvalds * Portions based on the vdso-randomization code from exec-shield: diff --git a/arch/x86/entry/vdso/vdso32/note.S b/arch/x86/entry/vdso/vdso32/note.S index c83f25734696..9fd51f206314 100644 --- a/arch/x86/entry/vdso/vdso32/note.S +++ b/arch/x86/entry/vdso/vdso32/note.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. * Here we can supply some information useful to userland. diff --git a/arch/x86/entry/vdso/vdso32/sigreturn.S b/arch/x86/entry/vdso/vdso32/sigreturn.S index 20633e026e82..c3233ee98a6b 100644 --- a/arch/x86/entry/vdso/vdso32/sigreturn.S +++ b/arch/x86/entry/vdso/vdso32/sigreturn.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <linux/linkage.h> #include <asm/unistd_32.h> #include <asm/asm-offsets.h> diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S index ed4bc9731cbb..263d7433dea8 100644 --- a/arch/x86/entry/vdso/vdso32/system_call.S +++ b/arch/x86/entry/vdso/vdso32/system_call.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * AT_SYSINFO entry point */ diff --git a/arch/x86/entry/vdso/vdso32/vclock_gettime.c b/arch/x86/entry/vdso/vdso32/vclock_gettime.c index 87a86e017f0e..7780bbfb06ef 100644 --- a/arch/x86/entry/vdso/vdso32/vclock_gettime.c +++ b/arch/x86/entry/vdso/vdso32/vclock_gettime.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #define BUILD_VDSO32 #ifndef CONFIG_CC_OPTIMIZE_FOR_SIZE diff --git a/arch/x86/entry/vdso/vdso32/vdso32.lds.S b/arch/x86/entry/vdso/vdso32/vdso32.lds.S index 31056cf294bf..422764a81d32 100644 --- a/arch/x86/entry/vdso/vdso32/vdso32.lds.S +++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Linker script for 32-bit vDSO. * We #include the file to define the layout details. diff --git a/arch/x86/entry/vdso/vdsox32.lds.S b/arch/x86/entry/vdso/vdsox32.lds.S index 697c11ece90c..05cd1c5c4a15 100644 --- a/arch/x86/entry/vdso/vdsox32.lds.S +++ b/arch/x86/entry/vdso/vdsox32.lds.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Linker script for x32 vDSO. * We #include the file to define the layout details. 
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 139ad7726e10..1911310959f8 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -78,9 +78,6 @@ static int vdso_mremap(const struct vm_special_mapping *sm, if (image->size != new_size) return -EINVAL; - if (WARN_ON_ONCE(current->mm != new_vma->vm_mm)) - return -EFAULT; - vdso_fix_landing(image, new_vma); current->mm->context.vdso = (void __user *)new_vma->vm_start; @@ -354,7 +351,7 @@ static void vgetcpu_cpu_init(void *arg) * and 8 bits for the node) */ d.limit0 = cpu | ((node & 0xf) << 12); - d.limit = node >> 4; + d.limit1 = node >> 4; d.type = 5; /* RO data, expand down, accessed */ d.dpl = 3; /* Visible to user code */ d.s = 1; /* Not a system segment */ diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index ce1d7534fa53..f279ba2643dc 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2012-2014 Andy Lutomirski <luto@amacapital.net> * diff --git a/arch/x86/entry/vsyscall/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c index 0fb3a104ac62..e1216dd95c04 100644 --- a/arch/x86/entry/vsyscall/vsyscall_gtod.c +++ b/arch/x86/entry/vsyscall/vsyscall_gtod.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE * Copyright 2003 Andi Kleen, SuSE Labs. diff --git a/arch/x86/entry/vsyscall/vsyscall_trace.h b/arch/x86/entry/vsyscall/vsyscall_trace.h index 9dd7359a38a8..3c3f9765a85c 100644 --- a/arch/x86/entry/vsyscall/vsyscall_trace.h +++ b/arch/x86/entry/vsyscall/vsyscall_trace.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM vsyscall diff --git a/arch/x86/events/Kconfig b/arch/x86/events/Kconfig index 98397db5ceae..9a7a1446cb3a 100644 --- a/arch/x86/events/Kconfig +++ b/arch/x86/events/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 menu "Performance monitoring" config PERF_EVENTS_INTEL_UNCORE diff --git a/arch/x86/events/amd/Makefile b/arch/x86/events/amd/Makefile index b1da46f396e0..fe8795a67385 100644 --- a/arch/x86/events/amd/Makefile +++ b/arch/x86/events/amd/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_CPU_SUP_AMD) += core.o uncore.o obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += power.o obj-$(CONFIG_X86_LOCAL_APIC) += ibs.o diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index ad44af0dd667..f5cbbba99283 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -400,11 +400,24 @@ static int amd_uncore_cpu_starting(unsigned int cpu) if (amd_uncore_llc) { unsigned int apicid = cpu_data(cpu).apicid; - unsigned int nshared; + unsigned int nshared, subleaf, prev_eax = 0; uncore = *per_cpu_ptr(amd_uncore_llc, cpu); - cpuid_count(0x8000001d, 2, &eax, &ebx, &ecx, &edx); - nshared = ((eax >> 14) & 0xfff) + 1; + /* + * Iterate over Cache Topology Definition leaves until no + * more cache descriptions are available. 
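vgetcpu_cpu_init() above packs the CPU number into the low 12 bits of the 20-bit segment limit and the node into the next 8 bits; the fix only renames the upper-bits field from "limit" to "limit1" to match the descriptor layout. The arithmetic pulled out into plain C (helper names are made up for the sketch):

#include <stdint.h>

/* 20-bit segment limit = node[7:0] << 12 | cpu[11:0] */
static inline uint32_t encode_cpu_node(unsigned int cpu, unsigned int node)
{
	return (cpu & 0xfff) | ((node & 0xff) << 12);
}

/* Split across the descriptor fields, as in vgetcpu_cpu_init():
 * limit0 holds bits 15..0, limit1 holds bits 19..16. */
static inline uint16_t limit0_of(uint32_t limit) { return limit & 0xffff; }
static inline uint8_t  limit1_of(uint32_t limit) { return (limit >> 16) & 0xf; }

/* What a user-space LSL-based reader recovers: */
static inline unsigned int cpu_of(uint32_t limit)  { return limit & 0xfff; }
static inline unsigned int node_of(uint32_t limit) { return (limit >> 12) & 0xff; }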
+ */ + for (subleaf = 0; subleaf < 5; subleaf++) { + cpuid_count(0x8000001d, subleaf, &eax, &ebx, &ecx, &edx); + + /* EAX[0:4] gives type of cache */ + if (!(eax & 0x1f)) + break; + + prev_eax = eax; + } + nshared = ((prev_eax >> 14) & 0xfff) + 1; + uncore->id = apicid - (apicid % nshared); uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_llc); @@ -555,7 +568,7 @@ static int __init amd_uncore_init(void) ret = 0; } - if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) { + if (boot_cpu_has(X86_FEATURE_PERFCTR_LLC)) { amd_uncore_llc = alloc_percpu(struct amd_uncore *); if (!amd_uncore_llc) { ret = -ENOMEM; diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 580b60f5ac83..80534d3c2480 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -191,8 +191,8 @@ static void release_pmc_hardware(void) {} static bool check_hw_exists(void) { - u64 val, val_fail, val_new= ~0; - int i, reg, reg_fail, ret = 0; + u64 val, val_fail = -1, val_new= ~0; + int i, reg, reg_fail = -1, ret = 0; int bios_fail = 0; int reg_safe = -1; @@ -487,22 +487,28 @@ static inline int precise_br_compat(struct perf_event *event) return m == b; } -int x86_pmu_hw_config(struct perf_event *event) +int x86_pmu_max_precise(void) { - if (event->attr.precise_ip) { - int precise = 0; + int precise = 0; + + /* Support for constant skid */ + if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { + precise++; - /* Support for constant skid */ - if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { + /* Support for IP fixup */ + if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2) precise++; - /* Support for IP fixup */ - if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2) - precise++; + if (x86_pmu.pebs_prec_dist) + precise++; + } + return precise; +} - if (x86_pmu.pebs_prec_dist) - precise++; - } +int x86_pmu_hw_config(struct perf_event *event) +{ + if (event->attr.precise_ip) { + int precise = x86_pmu_max_precise(); if (event->attr.precise_ip > precise) return -EOPNOTSUPP; @@ -1750,6 +1756,9 @@ ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event) return ret; } +static struct attribute_group x86_pmu_attr_group; +static struct attribute_group x86_pmu_caps_group; + static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; @@ -1797,6 +1806,14 @@ static int __init init_hw_perf_events(void) x86_pmu_format_group.attrs = x86_pmu.format_attrs; + if (x86_pmu.caps_attrs) { + struct attribute **tmp; + + tmp = merge_attr(x86_pmu_caps_group.attrs, x86_pmu.caps_attrs); + if (!WARN_ON(!tmp)) + x86_pmu_caps_group.attrs = tmp; + } + if (x86_pmu.event_attrs) x86_pmu_events_group.attrs = x86_pmu.event_attrs; @@ -1813,6 +1830,14 @@ static int __init init_hw_perf_events(void) x86_pmu_events_group.attrs = tmp; } + if (x86_pmu.attrs) { + struct attribute **tmp; + + tmp = merge_attr(x86_pmu_attr_group.attrs, x86_pmu.attrs); + if (!WARN_ON(!tmp)) + x86_pmu_attr_group.attrs = tmp; + } + pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.cntval_bits); pr_info("... 
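The amd_uncore fix above stops hard-coding sub-leaf 2 of CPUID leaf 0x8000001d and instead walks the cache-topology sub-leaves until the cache-type field (EAX[4:0]) reads zero, then takes the sharing count from the last valid EAX. A user-space sketch of the same walk; it assumes GCC/Clang's <cpuid.h> helper __get_cpuid_count() is available, and the EAX[25:14] field meaning is taken from the computation in the hunk.

#include <cpuid.h>	/* __get_cpuid_count(): GCC/Clang helper header */
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx, subleaf, prev_eax = 0;

	for (subleaf = 0; subleaf < 5; subleaf++) {
		if (!__get_cpuid_count(0x8000001d, subleaf, &eax, &ebx, &ecx, &edx))
			break;
		/* EAX[4:0] is the cache type; 0 means "no more caches". */
		if (!(eax & 0x1f))
			break;
		prev_eax = eax;
	}

	/* EAX[25:14]: number of logical CPUs sharing this cache, minus one. */
	printf("deepest cache level shared by %u CPUs\n",
	       ((prev_eax >> 14) & 0xfff) + 1);
	return 0;
}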
generic registers: %d\n", x86_pmu.num_counters); @@ -2101,11 +2126,10 @@ static int x86_pmu_event_init(struct perf_event *event) static void refresh_pce(void *ignored) { - if (current->active_mm) - load_mm_cr4(current->active_mm); + load_mm_cr4(this_cpu_read(cpu_tlbstate.loaded_mm)); } -static void x86_pmu_event_mapped(struct perf_event *event) +static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) { if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) return; @@ -2120,22 +2144,20 @@ static void x86_pmu_event_mapped(struct perf_event *event) * For now, this can't happen because all callers hold mmap_sem * for write. If this changes, we'll need a different solution. */ - lockdep_assert_held_exclusive(¤t->mm->mmap_sem); + lockdep_assert_held_exclusive(&mm->mmap_sem); - if (atomic_inc_return(¤t->mm->context.perf_rdpmc_allowed) == 1) - on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1); + if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1) + on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1); } -static void x86_pmu_event_unmapped(struct perf_event *event) +static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm) { - if (!current->mm) - return; if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) return; - if (atomic_dec_and_test(¤t->mm->context.perf_rdpmc_allowed)) - on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1); + if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed)) + on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1); } static int x86_pmu_event_idx(struct perf_event *event) @@ -2206,10 +2228,30 @@ static struct attribute_group x86_pmu_attr_group = { .attrs = x86_pmu_attrs, }; +static ssize_t max_precise_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise()); +} + +static DEVICE_ATTR_RO(max_precise); + +static struct attribute *x86_pmu_caps_attrs[] = { + &dev_attr_max_precise.attr, + NULL +}; + +static struct attribute_group x86_pmu_caps_group = { + .name = "caps", + .attrs = x86_pmu_caps_attrs, +}; + static const struct attribute_group *x86_pmu_attr_groups[] = { &x86_pmu_attr_group, &x86_pmu_format_group, &x86_pmu_events_group, + &x86_pmu_caps_group, NULL, }; @@ -2224,7 +2266,6 @@ void perf_check_microcode(void) if (x86_pmu.check_microcode) x86_pmu.check_microcode(); } -EXPORT_SYMBOL_GPL(perf_check_microcode); static struct pmu pmu = { .pmu_enable = x86_pmu_enable, @@ -2255,7 +2296,7 @@ static struct pmu pmu = { void arch_perf_update_userpage(struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now) { - struct cyc2ns_data *data; + struct cyc2ns_data data; u64 offset; userpg->cap_user_time = 0; @@ -2267,17 +2308,17 @@ void arch_perf_update_userpage(struct perf_event *event, if (!using_native_sched_clock() || !sched_clock_stable()) return; - data = cyc2ns_read_begin(); + cyc2ns_read_begin(&data); - offset = data->cyc2ns_offset + __sched_clock_offset; + offset = data.cyc2ns_offset + __sched_clock_offset; /* * Internal timekeeping for enabled/running/stopped times * is always in the local_clock domain. 
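arch_perf_update_userpage() now reads the cyc2ns data by value, but the contract with user space is unchanged: the time_mult, time_shift and time_offset it publishes let a reader turn a raw TSC value into nanoseconds. A sketch of that conversion using the split-multiply form commonly documented for cap_user_time; check the exact rounding against the perf mmap page documentation rather than this example.

#include <stdint.h>

/* Convert a TSC value to nanoseconds using the perf mmap page fields
 * that arch_perf_update_userpage() fills in. */
static uint64_t perf_cyc_to_ns(uint64_t cyc, uint16_t time_shift,
			       uint32_t time_mult, uint64_t time_offset)
{
	/* Split the multiply so it cannot overflow 64 bits. */
	uint64_t quot = cyc >> time_shift;
	uint64_t rem  = cyc & (((uint64_t)1 << time_shift) - 1);

	return time_offset + quot * time_mult + ((rem * time_mult) >> time_shift);
}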
*/ userpg->cap_user_time = 1; - userpg->time_mult = data->cyc2ns_mul; - userpg->time_shift = data->cyc2ns_shift; + userpg->time_mult = data.cyc2ns_mul; + userpg->time_shift = data.cyc2ns_shift; userpg->time_offset = offset - now; /* @@ -2289,7 +2330,7 @@ void arch_perf_update_userpage(struct perf_event *event, userpg->time_zero = offset; } - cyc2ns_read_end(data); + cyc2ns_read_end(); } void @@ -2329,12 +2370,9 @@ static unsigned long get_segment_base(unsigned int segment) #ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; - if (idx > LDT_ENTRIES) - return 0; - /* IRQs are off, so this synchronizes with smp_store_release */ ldt = lockless_dereference(current->active_mm->context.ldt); - if (!ldt || idx > ldt->size) + if (!ldt || idx >= ldt->nr_entries) return 0; desc = &ldt->entries[idx]; @@ -2342,7 +2380,7 @@ static unsigned long get_segment_base(unsigned int segment) return 0; #endif } else { - if (idx > GDT_ENTRIES) + if (idx >= GDT_ENTRIES) return 0; desc = raw_cpu_ptr(gdt_page.gdt) + idx; diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile index 06c2baa51814..3468b0c1dc7c 100644 --- a/arch/x86/events/intel/Makefile +++ b/arch/x86/events/intel/Makefile @@ -1,4 +1,5 @@ -obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o cqm.o +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o obj-$(CONFIG_CPU_SUP_INTEL) += ds.o knc.o obj-$(CONFIG_CPU_SUP_INTEL) += lbr.o p4.o p6.o pt.o obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += intel-rapl-perf.o diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 8ae8c5ce3a1f..141e07b06216 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -69,7 +69,7 @@ struct bts_buffer { struct bts_phys buf[0]; }; -struct pmu bts_pmu; +static struct pmu bts_pmu; static size_t buf_size(struct page *page) { @@ -268,7 +268,7 @@ static void bts_event_start(struct perf_event *event, int flags) bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum; bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold; - event->hw.itrace_started = 1; + perf_event_itrace_started(event); event->hw.state = 0; __bts_event_start(event); @@ -546,9 +546,6 @@ static int bts_event_init(struct perf_event *event) if (event->attr.type != bts_pmu.type) return -ENOENT; - if (x86_add_exclusive(x86_lbr_exclusive_bts)) - return -EBUSY; - /* * BTS leaks kernel addresses even when CPL0 tracing is * disabled, so disallow intel_bts driver for unprivileged @@ -562,6 +559,9 @@ static int bts_event_init(struct perf_event *event) !capable(CAP_SYS_ADMIN)) return -EACCES; + if (x86_add_exclusive(x86_lbr_exclusive_bts)) + return -EBUSY; + ret = x86_reserve_hardware(); if (ret) { x86_del_exclusive(x86_lbr_exclusive_bts); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 110ce8238466..9fb9a1f1e47b 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -1708,6 +1708,120 @@ static __initconst const u64 glm_hw_cache_extra_regs }, }; +static __initconst const u64 glp_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { + [C(L1D)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */ + [C(RESULT_MISS)] = 0x0, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */ + [C(RESULT_MISS)] = 0x0, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0, + [C(RESULT_MISS)] = 0x0, + }, + }, + [C(L1I)] = { + [C(OP_READ)] = { + 
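The get_segment_base() hunk above is an off-by-one fix: for a descriptor table with n entries the valid indices are 0..n-1, so the reject test has to be "idx >= n" rather than "idx > n", for both the LDT (now sized by nr_entries) and the GDT. Minimal illustration:

#include <stdbool.h>
#include <stddef.h>

/* Valid indices into a table of nr_entries elements are 0 .. nr_entries-1. */
static bool index_ok(size_t idx, size_t nr_entries)
{
	return idx < nr_entries;	/* i.e. reject when idx >= nr_entries */
}

/*
 * The old test "idx > nr_entries" wrongly accepted idx == nr_entries,
 * one element past the end of the array.
 */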
[C(RESULT_ACCESS)] = 0x0380, /* ICACHE.ACCESSES */ + [C(RESULT_MISS)] = 0x0280, /* ICACHE.MISSES */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0, + [C(RESULT_MISS)] = 0x0, + }, + }, + [C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */ + [C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x1b7, /* OFFCORE_RESPONSE */ + [C(RESULT_MISS)] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0, + [C(RESULT_MISS)] = 0x0, + }, + }, + [C(DTLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */ + [C(RESULT_MISS)] = 0xe08, /* DTLB_LOAD_MISSES.WALK_COMPLETED */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */ + [C(RESULT_MISS)] = 0xe49, /* DTLB_STORE_MISSES.WALK_COMPLETED */ + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0, + [C(RESULT_MISS)] = 0x0, + }, + }, + [C(ITLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x00c0, /* INST_RETIRED.ANY_P */ + [C(RESULT_MISS)] = 0x0481, /* ITLB.MISS */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + }, + [C(BPU)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [C(RESULT_MISS)] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + }, +}; + +static __initconst const u64 glp_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { + [C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = GLM_DEMAND_READ| + GLM_LLC_ACCESS, + [C(RESULT_MISS)] = GLM_DEMAND_READ| + GLM_LLC_MISS, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = GLM_DEMAND_WRITE| + GLM_LLC_ACCESS, + [C(RESULT_MISS)] = GLM_DEMAND_WRITE| + GLM_LLC_MISS, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0, + [C(RESULT_MISS)] = 0x0, + }, + }, +}; + #define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */ #define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */ #define KNL_MCDRAM_LOCAL BIT_ULL(21) @@ -3016,6 +3130,9 @@ static int hsw_hw_config(struct perf_event *event) return 0; } +static struct event_constraint counter0_constraint = + INTEL_ALL_EVENT_CONSTRAINT(0, 0x1); + static struct event_constraint counter2_constraint = EVENT_CONSTRAINT(0, 0x4, 0); @@ -3037,6 +3154,21 @@ hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx, return c; } +static struct event_constraint * +glp_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct event_constraint *c; + + /* :ppp means to do reduced skid PEBS which is PMC0 only. 
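The Goldmont Plus cache-event tables above appear to use the same value convention as the other hw_cache_event_ids tables in this file: a non-zero, non-minus-one entry is the raw event code for that (cache, op, result) combination, 0 usually means the PMU has no event for it, and -1 marks a combination that makes no sense (for example L1I writes). How such a table gets consulted, as an illustrative helper rather than the driver's actual lookup function:

#include <errno.h>
#include <stdint.h>

enum { CACHE_MAX = 7, OP_MAX = 3, RESULT_MAX = 2 };

/* [cache][op][result] -> raw event code; 0 = unsupported, -1 = invalid */
static const uint64_t cache_event_ids[CACHE_MAX][OP_MAX][RESULT_MAX];

static long cache_event_config(unsigned int cache, unsigned int op,
			       unsigned int result)
{
	uint64_t val;

	if (cache >= CACHE_MAX || op >= OP_MAX || result >= RESULT_MAX)
		return -EINVAL;

	val = cache_event_ids[cache][op][result];
	if (val == 0)
		return -ENOENT;		/* combination not supported here */
	if (val == (uint64_t)-1)
		return -EINVAL;		/* combination never makes sense */

	return (long)val;		/* raw config for the event */
}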
*/ + if (event->attr.precise_ip == 3) + return &counter0_constraint; + + c = intel_get_event_constraints(cpuc, idx, event); + + return c; +} + /* * Broadwell: * @@ -3160,6 +3292,19 @@ err: return -ENOMEM; } +static void flip_smm_bit(void *data) +{ + unsigned long set = *(unsigned long *)data; + + if (set > 0) { + msr_set_bit(MSR_IA32_DEBUGCTLMSR, + DEBUGCTLMSR_FREEZE_IN_SMM_BIT); + } else { + msr_clear_bit(MSR_IA32_DEBUGCTLMSR, + DEBUGCTLMSR_FREEZE_IN_SMM_BIT); + } +} + static void intel_pmu_cpu_starting(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); @@ -3174,6 +3319,8 @@ static void intel_pmu_cpu_starting(int cpu) cpuc->lbr_sel = NULL; + flip_smm_bit(&x86_pmu.attr_freeze_on_smi); + if (!cpuc->shared_regs) return; @@ -3250,10 +3397,8 @@ static void intel_pmu_cpu_dying(int cpu) static void intel_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) { - if (x86_pmu.pebs_active) - intel_pmu_pebs_sched_task(ctx, sched_in); - if (x86_pmu.lbr_nr) - intel_pmu_lbr_sched_task(ctx, sched_in); + intel_pmu_pebs_sched_task(ctx, sched_in); + intel_pmu_lbr_sched_task(ctx, sched_in); } PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); @@ -3270,12 +3415,26 @@ static struct attribute *intel_arch3_formats_attr[] = { &format_attr_any.attr, &format_attr_inv.attr, &format_attr_cmask.attr, + NULL, +}; + +static struct attribute *hsw_format_attr[] = { &format_attr_in_tx.attr, &format_attr_in_tx_cp.attr, + &format_attr_offcore_rsp.attr, + &format_attr_ldlat.attr, + NULL +}; - &format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */ - &format_attr_ldlat.attr, /* PEBS load latency */ - NULL, +static struct attribute *nhm_format_attr[] = { + &format_attr_offcore_rsp.attr, + &format_attr_ldlat.attr, + NULL +}; + +static struct attribute *slm_format_attr[] = { + &format_attr_offcore_rsp.attr, + NULL }; static struct attribute *skl_format_attr[] = { @@ -3410,12 +3569,10 @@ static void intel_snb_check_microcode(void) int pebs_broken = 0; int cpu; - get_online_cpus(); for_each_online_cpu(cpu) { if ((pebs_broken = intel_snb_pebs_broken(cpu))) break; } - put_online_cpus(); if (pebs_broken == x86_pmu.pebs_broken) return; @@ -3488,7 +3645,9 @@ static bool check_msr(unsigned long msr, u64 mask) static __init void intel_sandybridge_quirk(void) { x86_pmu.check_microcode = intel_snb_check_microcode; + cpus_read_lock(); intel_snb_check_microcode(); + cpus_read_unlock(); } static const struct { int id; char *name; } intel_arch_events_map[] __initconst = { @@ -3595,6 +3754,82 @@ static struct attribute *hsw_events_attrs[] = { NULL }; +static ssize_t freeze_on_smi_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%lu\n", x86_pmu.attr_freeze_on_smi); +} + +static DEFINE_MUTEX(freeze_on_smi_mutex); + +static ssize_t freeze_on_smi_store(struct device *cdev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned long val; + ssize_t ret; + + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; + + if (val > 1) + return -EINVAL; + + mutex_lock(&freeze_on_smi_mutex); + + if (x86_pmu.attr_freeze_on_smi == val) + goto done; + + x86_pmu.attr_freeze_on_smi = val; + + get_online_cpus(); + on_each_cpu(flip_smm_bit, &val, 1); + put_online_cpus(); +done: + mutex_unlock(&freeze_on_smi_mutex); + + return count; +} + +static DEVICE_ATTR_RW(freeze_on_smi); + +static ssize_t branches_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu.lbr_nr); +} + +static 
DEVICE_ATTR_RO(branches); + +static struct attribute *lbr_attrs[] = { + &dev_attr_branches.attr, + NULL +}; + +static char pmu_name_str[30]; + +static ssize_t pmu_name_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%s\n", pmu_name_str); +} + +static DEVICE_ATTR_RO(pmu_name); + +static struct attribute *intel_pmu_caps_attrs[] = { + &dev_attr_pmu_name.attr, + NULL +}; + +static struct attribute *intel_pmu_attrs[] = { + &dev_attr_freeze_on_smi.attr, + NULL, +}; + __init int intel_pmu_init(void) { union cpuid10_edx edx; @@ -3604,6 +3839,8 @@ __init int intel_pmu_init(void) unsigned int unused; struct extra_reg *er; int version, i; + struct attribute **extra_attr = NULL; + char *name; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { switch (boot_cpu_data.x86) { @@ -3641,6 +3878,8 @@ __init int intel_pmu_init(void) x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters); + + x86_pmu.attrs = intel_pmu_attrs; /* * Quirk: v2 perfmon does not report fixed-purpose events, so * assume at least 3 events, when not running in a hypervisor: @@ -3669,6 +3908,7 @@ __init int intel_pmu_init(void) switch (boot_cpu_data.x86_model) { case INTEL_FAM6_CORE_YONAH: pr_cont("Core events, "); + name = "core"; break; case INTEL_FAM6_CORE2_MEROM: @@ -3684,6 +3924,7 @@ __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_core2_event_constraints; x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints; pr_cont("Core2 events, "); + name = "core2"; break; case INTEL_FAM6_NEHALEM: @@ -3712,8 +3953,11 @@ __init int intel_pmu_init(void) intel_pmu_pebs_data_source_nhm(); x86_add_quirk(intel_nehalem_quirk); + x86_pmu.pebs_no_tlb = 1; + extra_attr = nhm_format_attr; pr_cont("Nehalem events, "); + name = "nehalem"; break; case INTEL_FAM6_ATOM_PINEVIEW: @@ -3730,6 +3974,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints; x86_pmu.pebs_aliases = intel_pebs_aliases_core2; pr_cont("Atom events, "); + name = "bonnell"; break; case INTEL_FAM6_ATOM_SILVERMONT1: @@ -3747,7 +3992,9 @@ __init int intel_pmu_init(void) x86_pmu.extra_regs = intel_slm_extra_regs; x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.cpu_events = slm_events_attrs; + extra_attr = slm_format_attr; pr_cont("Silvermont events, "); + name = "silvermont"; break; case INTEL_FAM6_ATOM_GOLDMONT: @@ -3772,7 +4019,37 @@ __init int intel_pmu_init(void) x86_pmu.lbr_pt_coexist = true; x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.cpu_events = glm_events_attrs; + extra_attr = slm_format_attr; pr_cont("Goldmont events, "); + name = "goldmont"; + break; + + case INTEL_FAM6_ATOM_GEMINI_LAKE: + memcpy(hw_cache_event_ids, glp_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, glp_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_skl(); + + x86_pmu.event_constraints = intel_slm_event_constraints; + x86_pmu.pebs_constraints = intel_glp_pebs_event_constraints; + x86_pmu.extra_regs = intel_glm_extra_regs; + /* + * It's recommended to use CPU_CLK_UNHALTED.CORE_P + NPEBS + * for precise cycles. 
+ */ + x86_pmu.pebs_aliases = NULL; + x86_pmu.pebs_prec_dist = true; + x86_pmu.lbr_pt_coexist = true; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.get_event_constraints = glp_get_event_constraints; + x86_pmu.cpu_events = glm_events_attrs; + /* Goldmont Plus has 4-wide pipeline */ + event_attr_td_total_slots_scale_glm.event_str = "4"; + extra_attr = slm_format_attr; + pr_cont("Goldmont plus events, "); + name = "goldmont_plus"; break; case INTEL_FAM6_WESTMERE: @@ -3801,7 +4078,9 @@ __init int intel_pmu_init(void) X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1); intel_pmu_pebs_data_source_nhm(); + extra_attr = nhm_format_attr; pr_cont("Westmere events, "); + name = "westmere"; break; case INTEL_FAM6_SANDYBRIDGE: @@ -3837,7 +4116,10 @@ __init int intel_pmu_init(void) intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1); + extra_attr = nhm_format_attr; + pr_cont("SandyBridge events, "); + name = "sandybridge"; break; case INTEL_FAM6_IVYBRIDGE: @@ -3871,7 +4153,10 @@ __init int intel_pmu_init(void) intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); + extra_attr = nhm_format_attr; + pr_cont("IvyBridge events, "); + name = "ivybridge"; break; @@ -3899,7 +4184,10 @@ __init int intel_pmu_init(void) x86_pmu.get_event_constraints = hsw_get_event_constraints; x86_pmu.cpu_events = hsw_events_attrs; x86_pmu.lbr_double_abort = true; + extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? + hsw_format_attr : nhm_format_attr; pr_cont("Haswell events, "); + name = "haswell"; break; case INTEL_FAM6_BROADWELL_CORE: @@ -3935,7 +4223,10 @@ __init int intel_pmu_init(void) x86_pmu.get_event_constraints = hsw_get_event_constraints; x86_pmu.cpu_events = hsw_events_attrs; x86_pmu.limit_period = bdw_limit_period; + extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? + hsw_format_attr : nhm_format_attr; pr_cont("Broadwell events, "); + name = "broadwell"; break; case INTEL_FAM6_XEON_PHI_KNL: @@ -3953,8 +4244,9 @@ __init int intel_pmu_init(void) /* all extra regs are per-cpu when HT is on */ x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_NO_HT_SHARING; - + extra_attr = slm_format_attr; pr_cont("Knights Landing/Mill events, "); + name = "knights-landing"; break; case INTEL_FAM6_SKYLAKE_MOBILE: @@ -3984,11 +4276,14 @@ __init int intel_pmu_init(void) x86_pmu.hw_config = hsw_hw_config; x86_pmu.get_event_constraints = hsw_get_event_constraints; - x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr, - skl_format_attr); - WARN_ON(!x86_pmu.format_attrs); + extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? 
+ hsw_format_attr : nhm_format_attr; + extra_attr = merge_attr(extra_attr, skl_format_attr); x86_pmu.cpu_events = hsw_events_attrs; + intel_pmu_pebs_data_source_skl( + boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X); pr_cont("Skylake events, "); + name = "skylake"; break; default: @@ -3996,6 +4291,7 @@ __init int intel_pmu_init(void) case 1: x86_pmu.event_constraints = intel_v1_event_constraints; pr_cont("generic architected perfmon v1, "); + name = "generic_arch_v1"; break; default: /* @@ -4003,10 +4299,19 @@ __init int intel_pmu_init(void) */ x86_pmu.event_constraints = intel_gen_event_constraints; pr_cont("generic architected perfmon, "); + name = "generic_arch_v2+"; break; } } + snprintf(pmu_name_str, sizeof pmu_name_str, "%s", name); + + if (version >= 2 && extra_attr) { + x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr, + extra_attr); + WARN_ON(!x86_pmu.format_attrs); + } + if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) { WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC); @@ -4053,8 +4358,13 @@ __init int intel_pmu_init(void) x86_pmu.lbr_nr = 0; } - if (x86_pmu.lbr_nr) + x86_pmu.caps_attrs = intel_pmu_caps_attrs; + + if (x86_pmu.lbr_nr) { + x86_pmu.caps_attrs = merge_attr(x86_pmu.caps_attrs, lbr_attrs); pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr); + } + /* * Access extra MSR may cause #GP under certain circumstances. * E.g. KVM doesn't support offcore event @@ -4099,10 +4409,9 @@ static __init int fixup_ht_bug(void) return 0; } - if (lockup_detector_suspend() != 0) { - pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n"); - return 0; - } + cpus_read_lock(); + + hardlockup_detector_perf_stop(); x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED); @@ -4110,15 +4419,12 @@ static __init int fixup_ht_bug(void) x86_pmu.commit_scheduling = NULL; x86_pmu.stop_scheduling = NULL; - lockup_detector_resume(); - - get_online_cpus(); + hardlockup_detector_perf_restart(); - for_each_online_cpu(c) { + for_each_online_cpu(c) free_excl_cntrs(c); - } - put_online_cpus(); + cpus_read_unlock(); pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n"); return 0; } diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c deleted file mode 100644 index 8c00dc09a5d2..000000000000 --- a/arch/x86/events/intel/cqm.c +++ /dev/null @@ -1,1766 +0,0 @@ -/* - * Intel Cache Quality-of-Service Monitoring (CQM) support. - * - * Based very, very heavily on work by Peter Zijlstra. - */ - -#include <linux/perf_event.h> -#include <linux/slab.h> -#include <asm/cpu_device_id.h> -#include <asm/intel_rdt_common.h> -#include "../perf_event.h" - -#define MSR_IA32_QM_CTR 0x0c8e -#define MSR_IA32_QM_EVTSEL 0x0c8d - -#define MBM_CNTR_WIDTH 24 -/* - * Guaranteed time in ms as per SDM where MBM counters will not overflow. - */ -#define MBM_CTR_OVERFLOW_TIME 1000 - -static u32 cqm_max_rmid = -1; -static unsigned int cqm_l3_scale; /* supposedly cacheline size */ -static bool cqm_enabled, mbm_enabled; -unsigned int mbm_socket_max; - -/* - * The cached intel_pqr_state is strictly per CPU and can never be - * updated from a remote CPU. Both functions which modify the state - * (intel_cqm_event_start and intel_cqm_event_stop) are called with - * interrupts disabled, which is sufficient for the protection. 
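Several hunks above route optional sysfs attributes through merge_attr(), which joins two NULL-terminated attribute arrays into a newly allocated one; that is why the callers only replace the old pointer when WARN_ON(!tmp) does not fire. Roughly what such a helper has to do, sketched here with generic pointer arrays rather than the function's real signature:

#include <stdlib.h>

/* Merge two NULL-terminated pointer arrays into a freshly allocated one.
 * Returns NULL on allocation failure, in which case the caller keeps
 * using the original array. */
static void **merge_ptr_arrays(void **a, void **b)
{
	size_t i, j, na = 0, nb = 0;
	void **merged;

	while (a && a[na])
		na++;
	while (b && b[nb])
		nb++;

	merged = malloc((na + nb + 1) * sizeof(*merged));
	if (!merged)
		return NULL;

	for (i = 0; i < na; i++)
		merged[i] = a[i];
	for (j = 0; j < nb; j++)
		merged[i + j] = b[j];
	merged[i + j] = NULL;	/* keep the result NULL-terminated */

	return merged;
}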
- */ -DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); -static struct hrtimer *mbm_timers; -/** - * struct sample - mbm event's (local or total) data - * @total_bytes #bytes since we began monitoring - * @prev_msr previous value of MSR - */ -struct sample { - u64 total_bytes; - u64 prev_msr; -}; - -/* - * samples profiled for total memory bandwidth type events - */ -static struct sample *mbm_total; -/* - * samples profiled for local memory bandwidth type events - */ -static struct sample *mbm_local; - -#define pkg_id topology_physical_package_id(smp_processor_id()) -/* - * rmid_2_index returns the index for the rmid in mbm_local/mbm_total array. - * mbm_total[] and mbm_local[] are linearly indexed by socket# * max number of - * rmids per socket, an example is given below - * RMID1 of Socket0: vrmid = 1 - * RMID1 of Socket1: vrmid = 1 * (cqm_max_rmid + 1) + 1 - * RMID1 of Socket2: vrmid = 2 * (cqm_max_rmid + 1) + 1 - */ -#define rmid_2_index(rmid) ((pkg_id * (cqm_max_rmid + 1)) + rmid) -/* - * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru. - * Also protects event->hw.cqm_rmid - * - * Hold either for stability, both for modification of ->hw.cqm_rmid. - */ -static DEFINE_MUTEX(cache_mutex); -static DEFINE_RAW_SPINLOCK(cache_lock); - -/* - * Groups of events that have the same target(s), one RMID per group. - */ -static LIST_HEAD(cache_groups); - -/* - * Mask of CPUs for reading CQM values. We only need one per-socket. - */ -static cpumask_t cqm_cpumask; - -#define RMID_VAL_ERROR (1ULL << 63) -#define RMID_VAL_UNAVAIL (1ULL << 62) - -/* - * Event IDs are used to program IA32_QM_EVTSEL before reading event - * counter from IA32_QM_CTR - */ -#define QOS_L3_OCCUP_EVENT_ID 0x01 -#define QOS_MBM_TOTAL_EVENT_ID 0x02 -#define QOS_MBM_LOCAL_EVENT_ID 0x03 - -/* - * This is central to the rotation algorithm in __intel_cqm_rmid_rotate(). - * - * This rmid is always free and is guaranteed to have an associated - * near-zero occupancy value, i.e. no cachelines are tagged with this - * RMID, once __intel_cqm_rmid_rotate() returns. - */ -static u32 intel_cqm_rotation_rmid; - -#define INVALID_RMID (-1) - -/* - * Is @rmid valid for programming the hardware? - * - * rmid 0 is reserved by the hardware for all non-monitored tasks, which - * means that we should never come across an rmid with that value. - * Likewise, an rmid value of -1 is used to indicate "no rmid currently - * assigned" and is used as part of the rotation code. - */ -static inline bool __rmid_valid(u32 rmid) -{ - if (!rmid || rmid == INVALID_RMID) - return false; - - return true; -} - -static u64 __rmid_read(u32 rmid) -{ - u64 val; - - /* - * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt, - * it just says that to increase confusion. - */ - wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid); - rdmsrl(MSR_IA32_QM_CTR, val); - - /* - * Aside from the ERROR and UNAVAIL bits, assume this thing returns - * the number of cachelines tagged with @rmid. - */ - return val; -} - -enum rmid_recycle_state { - RMID_YOUNG = 0, - RMID_AVAILABLE, - RMID_DIRTY, -}; - -struct cqm_rmid_entry { - u32 rmid; - enum rmid_recycle_state state; - struct list_head list; - unsigned long queue_time; -}; - -/* - * cqm_rmid_free_lru - A least recently used list of RMIDs. - * - * Oldest entry at the head, newest (most recently used) entry at the - * tail. This list is never traversed, it's only used to keep track of - * the lru order. That is, we only pick entries of the head or insert - * them on the tail. 
- * - * All entries on the list are 'free', and their RMIDs are not currently - * in use. To mark an RMID as in use, remove its entry from the lru - * list. - * - * - * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs. - * - * This list is contains RMIDs that no one is currently using but that - * may have a non-zero occupancy value associated with them. The - * rotation worker moves RMIDs from the limbo list to the free list once - * the occupancy value drops below __intel_cqm_threshold. - * - * Both lists are protected by cache_mutex. - */ -static LIST_HEAD(cqm_rmid_free_lru); -static LIST_HEAD(cqm_rmid_limbo_lru); - -/* - * We use a simple array of pointers so that we can lookup a struct - * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid() - * and __put_rmid() from having to worry about dealing with struct - * cqm_rmid_entry - they just deal with rmids, i.e. integers. - * - * Once this array is initialized it is read-only. No locks are required - * to access it. - * - * All entries for all RMIDs can be looked up in the this array at all - * times. - */ -static struct cqm_rmid_entry **cqm_rmid_ptrs; - -static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid) -{ - struct cqm_rmid_entry *entry; - - entry = cqm_rmid_ptrs[rmid]; - WARN_ON(entry->rmid != rmid); - - return entry; -} - -/* - * Returns < 0 on fail. - * - * We expect to be called with cache_mutex held. - */ -static u32 __get_rmid(void) -{ - struct cqm_rmid_entry *entry; - - lockdep_assert_held(&cache_mutex); - - if (list_empty(&cqm_rmid_free_lru)) - return INVALID_RMID; - - entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list); - list_del(&entry->list); - - return entry->rmid; -} - -static void __put_rmid(u32 rmid) -{ - struct cqm_rmid_entry *entry; - - lockdep_assert_held(&cache_mutex); - - WARN_ON(!__rmid_valid(rmid)); - entry = __rmid_entry(rmid); - - entry->queue_time = jiffies; - entry->state = RMID_YOUNG; - - list_add_tail(&entry->list, &cqm_rmid_limbo_lru); -} - -static void cqm_cleanup(void) -{ - int i; - - if (!cqm_rmid_ptrs) - return; - - for (i = 0; i < cqm_max_rmid; i++) - kfree(cqm_rmid_ptrs[i]); - - kfree(cqm_rmid_ptrs); - cqm_rmid_ptrs = NULL; - cqm_enabled = false; -} - -static int intel_cqm_setup_rmid_cache(void) -{ - struct cqm_rmid_entry *entry; - unsigned int nr_rmids; - int r = 0; - - nr_rmids = cqm_max_rmid + 1; - cqm_rmid_ptrs = kzalloc(sizeof(struct cqm_rmid_entry *) * - nr_rmids, GFP_KERNEL); - if (!cqm_rmid_ptrs) - return -ENOMEM; - - for (; r <= cqm_max_rmid; r++) { - struct cqm_rmid_entry *entry; - - entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) - goto fail; - - INIT_LIST_HEAD(&entry->list); - entry->rmid = r; - cqm_rmid_ptrs[r] = entry; - - list_add_tail(&entry->list, &cqm_rmid_free_lru); - } - - /* - * RMID 0 is special and is always allocated. It's used for all - * tasks that are not monitored. - */ - entry = __rmid_entry(0); - list_del(&entry->list); - - mutex_lock(&cache_mutex); - intel_cqm_rotation_rmid = __get_rmid(); - mutex_unlock(&cache_mutex); - - return 0; - -fail: - cqm_cleanup(); - return -ENOMEM; -} - -/* - * Determine if @a and @b measure the same set of tasks. - * - * If @a and @b measure the same set of tasks then we want to share a - * single RMID. 
- */ -static bool __match_event(struct perf_event *a, struct perf_event *b) -{ - /* Per-cpu and task events don't mix */ - if ((a->attach_state & PERF_ATTACH_TASK) != - (b->attach_state & PERF_ATTACH_TASK)) - return false; - -#ifdef CONFIG_CGROUP_PERF - if (a->cgrp != b->cgrp) - return false; -#endif - - /* If not task event, we're machine wide */ - if (!(b->attach_state & PERF_ATTACH_TASK)) - return true; - - /* - * Events that target same task are placed into the same cache group. - * Mark it as a multi event group, so that we update ->count - * for every event rather than just the group leader later. - */ - if (a->hw.target == b->hw.target) { - b->hw.is_group_event = true; - return true; - } - - /* - * Are we an inherited event? - */ - if (b->parent == a) - return true; - - return false; -} - -#ifdef CONFIG_CGROUP_PERF -static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event) -{ - if (event->attach_state & PERF_ATTACH_TASK) - return perf_cgroup_from_task(event->hw.target, event->ctx); - - return event->cgrp; -} -#endif - -/* - * Determine if @a's tasks intersect with @b's tasks - * - * There are combinations of events that we explicitly prohibit, - * - * PROHIBITS - * system-wide -> cgroup and task - * cgroup -> system-wide - * -> task in cgroup - * task -> system-wide - * -> task in cgroup - * - * Call this function before allocating an RMID. - */ -static bool __conflict_event(struct perf_event *a, struct perf_event *b) -{ -#ifdef CONFIG_CGROUP_PERF - /* - * We can have any number of cgroups but only one system-wide - * event at a time. - */ - if (a->cgrp && b->cgrp) { - struct perf_cgroup *ac = a->cgrp; - struct perf_cgroup *bc = b->cgrp; - - /* - * This condition should have been caught in - * __match_event() and we should be sharing an RMID. - */ - WARN_ON_ONCE(ac == bc); - - if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || - cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) - return true; - - return false; - } - - if (a->cgrp || b->cgrp) { - struct perf_cgroup *ac, *bc; - - /* - * cgroup and system-wide events are mutually exclusive - */ - if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) || - (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK))) - return true; - - /* - * Ensure neither event is part of the other's cgroup - */ - ac = event_to_cgroup(a); - bc = event_to_cgroup(b); - if (ac == bc) - return true; - - /* - * Must have cgroup and non-intersecting task events. - */ - if (!ac || !bc) - return false; - - /* - * We have cgroup and task events, and the task belongs - * to a cgroup. Check for for overlap. - */ - if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || - cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) - return true; - - return false; - } -#endif - /* - * If one of them is not a task, same story as above with cgroups. - */ - if (!(a->attach_state & PERF_ATTACH_TASK) || - !(b->attach_state & PERF_ATTACH_TASK)) - return true; - - /* - * Must be non-overlapping. 
- */ - return false; -} - -struct rmid_read { - u32 rmid; - u32 evt_type; - atomic64_t value; -}; - -static void __intel_cqm_event_count(void *info); -static void init_mbm_sample(u32 rmid, u32 evt_type); -static void __intel_mbm_event_count(void *info); - -static bool is_cqm_event(int e) -{ - return (e == QOS_L3_OCCUP_EVENT_ID); -} - -static bool is_mbm_event(int e) -{ - return (e >= QOS_MBM_TOTAL_EVENT_ID && e <= QOS_MBM_LOCAL_EVENT_ID); -} - -static void cqm_mask_call(struct rmid_read *rr) -{ - if (is_mbm_event(rr->evt_type)) - on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_count, rr, 1); - else - on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, rr, 1); -} - -/* - * Exchange the RMID of a group of events. - */ -static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid) -{ - struct perf_event *event; - struct list_head *head = &group->hw.cqm_group_entry; - u32 old_rmid = group->hw.cqm_rmid; - - lockdep_assert_held(&cache_mutex); - - /* - * If our RMID is being deallocated, perform a read now. - */ - if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) { - struct rmid_read rr = { - .rmid = old_rmid, - .evt_type = group->attr.config, - .value = ATOMIC64_INIT(0), - }; - - cqm_mask_call(&rr); - local64_set(&group->count, atomic64_read(&rr.value)); - } - - raw_spin_lock_irq(&cache_lock); - - group->hw.cqm_rmid = rmid; - list_for_each_entry(event, head, hw.cqm_group_entry) - event->hw.cqm_rmid = rmid; - - raw_spin_unlock_irq(&cache_lock); - - /* - * If the allocation is for mbm, init the mbm stats. - * Need to check if each event in the group is mbm event - * because there could be multiple type of events in the same group. - */ - if (__rmid_valid(rmid)) { - event = group; - if (is_mbm_event(event->attr.config)) - init_mbm_sample(rmid, event->attr.config); - - list_for_each_entry(event, head, hw.cqm_group_entry) { - if (is_mbm_event(event->attr.config)) - init_mbm_sample(rmid, event->attr.config); - } - } - - return old_rmid; -} - -/* - * If we fail to assign a new RMID for intel_cqm_rotation_rmid because - * cachelines are still tagged with RMIDs in limbo, we progressively - * increment the threshold until we find an RMID in limbo with <= - * __intel_cqm_threshold lines tagged. This is designed to mitigate the - * problem where cachelines tagged with an RMID are not steadily being - * evicted. - * - * On successful rotations we decrease the threshold back towards zero. - * - * __intel_cqm_max_threshold provides an upper bound on the threshold, - * and is measured in bytes because it's exposed to userland. - */ -static unsigned int __intel_cqm_threshold; -static unsigned int __intel_cqm_max_threshold; - -/* - * Test whether an RMID has a zero occupancy value on this cpu. - */ -static void intel_cqm_stable(void *arg) -{ - struct cqm_rmid_entry *entry; - - list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { - if (entry->state != RMID_AVAILABLE) - break; - - if (__rmid_read(entry->rmid) > __intel_cqm_threshold) - entry->state = RMID_DIRTY; - } -} - -/* - * If we have group events waiting for an RMID that don't conflict with - * events already running, assign @rmid. 
- */ -static bool intel_cqm_sched_in_event(u32 rmid) -{ - struct perf_event *leader, *event; - - lockdep_assert_held(&cache_mutex); - - leader = list_first_entry(&cache_groups, struct perf_event, - hw.cqm_groups_entry); - event = leader; - - list_for_each_entry_continue(event, &cache_groups, - hw.cqm_groups_entry) { - if (__rmid_valid(event->hw.cqm_rmid)) - continue; - - if (__conflict_event(event, leader)) - continue; - - intel_cqm_xchg_rmid(event, rmid); - return true; - } - - return false; -} - -/* - * Initially use this constant for both the limbo queue time and the - * rotation timer interval, pmu::hrtimer_interval_ms. - * - * They don't need to be the same, but the two are related since if you - * rotate faster than you recycle RMIDs, you may run out of available - * RMIDs. - */ -#define RMID_DEFAULT_QUEUE_TIME 250 /* ms */ - -static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME; - -/* - * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list - * @nr_available: number of freeable RMIDs on the limbo list - * - * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no - * cachelines are tagged with those RMIDs. After this we can reuse them - * and know that the current set of active RMIDs is stable. - * - * Return %true or %false depending on whether stabilization needs to be - * reattempted. - * - * If we return %true then @nr_available is updated to indicate the - * number of RMIDs on the limbo list that have been queued for the - * minimum queue time (RMID_AVAILABLE), but whose data occupancy values - * are above __intel_cqm_threshold. - */ -static bool intel_cqm_rmid_stabilize(unsigned int *available) -{ - struct cqm_rmid_entry *entry, *tmp; - - lockdep_assert_held(&cache_mutex); - - *available = 0; - list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { - unsigned long min_queue_time; - unsigned long now = jiffies; - - /* - * We hold RMIDs placed into limbo for a minimum queue - * time. Before the minimum queue time has elapsed we do - * not recycle RMIDs. - * - * The reasoning is that until a sufficient time has - * passed since we stopped using an RMID, any RMID - * placed onto the limbo list will likely still have - * data tagged in the cache, which means we'll probably - * fail to recycle it anyway. - * - * We can save ourselves an expensive IPI by skipping - * any RMIDs that have not been queued for the minimum - * time. - */ - min_queue_time = entry->queue_time + - msecs_to_jiffies(__rmid_queue_time_ms); - - if (time_after(min_queue_time, now)) - break; - - entry->state = RMID_AVAILABLE; - (*available)++; - } - - /* - * Fast return if none of the RMIDs on the limbo list have been - * sitting on the queue for the minimum queue time. - */ - if (!*available) - return false; - - /* - * Test whether an RMID is free for each package. - */ - on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true); - - list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) { - /* - * Exhausted all RMIDs that have waited min queue time. - */ - if (entry->state == RMID_YOUNG) - break; - - if (entry->state == RMID_DIRTY) - continue; - - list_del(&entry->list); /* remove from limbo */ - - /* - * The rotation RMID gets priority if it's - * currently invalid. In which case, skip adding - * the RMID to the the free lru. - */ - if (!__rmid_valid(intel_cqm_rotation_rmid)) { - intel_cqm_rotation_rmid = entry->rmid; - continue; - } - - /* - * If we have groups waiting for RMIDs, hand - * them one now provided they don't conflict. 
- */ - if (intel_cqm_sched_in_event(entry->rmid)) - continue; - - /* - * Otherwise place it onto the free list. - */ - list_add_tail(&entry->list, &cqm_rmid_free_lru); - } - - - return __rmid_valid(intel_cqm_rotation_rmid); -} - -/* - * Pick a victim group and move it to the tail of the group list. - * @next: The first group without an RMID - */ -static void __intel_cqm_pick_and_rotate(struct perf_event *next) -{ - struct perf_event *rotor; - u32 rmid; - - lockdep_assert_held(&cache_mutex); - - rotor = list_first_entry(&cache_groups, struct perf_event, - hw.cqm_groups_entry); - - /* - * The group at the front of the list should always have a valid - * RMID. If it doesn't then no groups have RMIDs assigned and we - * don't need to rotate the list. - */ - if (next == rotor) - return; - - rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID); - __put_rmid(rmid); - - list_rotate_left(&cache_groups); -} - -/* - * Deallocate the RMIDs from any events that conflict with @event, and - * place them on the back of the group list. - */ -static void intel_cqm_sched_out_conflicting_events(struct perf_event *event) -{ - struct perf_event *group, *g; - u32 rmid; - - lockdep_assert_held(&cache_mutex); - - list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) { - if (group == event) - continue; - - rmid = group->hw.cqm_rmid; - - /* - * Skip events that don't have a valid RMID. - */ - if (!__rmid_valid(rmid)) - continue; - - /* - * No conflict? No problem! Leave the event alone. - */ - if (!__conflict_event(group, event)) - continue; - - intel_cqm_xchg_rmid(group, INVALID_RMID); - __put_rmid(rmid); - } -} - -/* - * Attempt to rotate the groups and assign new RMIDs. - * - * We rotate for two reasons, - * 1. To handle the scheduling of conflicting events - * 2. To recycle RMIDs - * - * Rotating RMIDs is complicated because the hardware doesn't give us - * any clues. - * - * There's problems with the hardware interface; when you change the - * task:RMID map cachelines retain their 'old' tags, giving a skewed - * picture. In order to work around this, we must always keep one free - * RMID - intel_cqm_rotation_rmid. - * - * Rotation works by taking away an RMID from a group (the old RMID), - * and assigning the free RMID to another group (the new RMID). We must - * then wait for the old RMID to not be used (no cachelines tagged). - * This ensure that all cachelines are tagged with 'active' RMIDs. At - * this point we can start reading values for the new RMID and treat the - * old RMID as the free RMID for the next rotation. - * - * Return %true or %false depending on whether we did any rotating. - */ -static bool __intel_cqm_rmid_rotate(void) -{ - struct perf_event *group, *start = NULL; - unsigned int threshold_limit; - unsigned int nr_needed = 0; - unsigned int nr_available; - bool rotated = false; - - mutex_lock(&cache_mutex); - -again: - /* - * Fast path through this function if there are no groups and no - * RMIDs that need cleaning. - */ - if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru)) - goto out; - - list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) { - if (!__rmid_valid(group->hw.cqm_rmid)) { - if (!start) - start = group; - nr_needed++; - } - } - - /* - * We have some event groups, but they all have RMIDs assigned - * and no RMIDs need cleaning. 
- */ - if (!nr_needed && list_empty(&cqm_rmid_limbo_lru)) - goto out; - - if (!nr_needed) - goto stabilize; - - /* - * We have more event groups without RMIDs than available RMIDs, - * or we have event groups that conflict with the ones currently - * scheduled. - * - * We force deallocate the rmid of the group at the head of - * cache_groups. The first event group without an RMID then gets - * assigned intel_cqm_rotation_rmid. This ensures we always make - * forward progress. - * - * Rotate the cache_groups list so the previous head is now the - * tail. - */ - __intel_cqm_pick_and_rotate(start); - - /* - * If the rotation is going to succeed, reduce the threshold so - * that we don't needlessly reuse dirty RMIDs. - */ - if (__rmid_valid(intel_cqm_rotation_rmid)) { - intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid); - intel_cqm_rotation_rmid = __get_rmid(); - - intel_cqm_sched_out_conflicting_events(start); - - if (__intel_cqm_threshold) - __intel_cqm_threshold--; - } - - rotated = true; - -stabilize: - /* - * We now need to stablize the RMID we freed above (if any) to - * ensure that the next time we rotate we have an RMID with zero - * occupancy value. - * - * Alternatively, if we didn't need to perform any rotation, - * we'll have a bunch of RMIDs in limbo that need stabilizing. - */ - threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale; - - while (intel_cqm_rmid_stabilize(&nr_available) && - __intel_cqm_threshold < threshold_limit) { - unsigned int steal_limit; - - /* - * Don't spin if nobody is actively waiting for an RMID, - * the rotation worker will be kicked as soon as an - * event needs an RMID anyway. - */ - if (!nr_needed) - break; - - /* Allow max 25% of RMIDs to be in limbo. */ - steal_limit = (cqm_max_rmid + 1) / 4; - - /* - * We failed to stabilize any RMIDs so our rotation - * logic is now stuck. In order to make forward progress - * we have a few options: - * - * 1. rotate ("steal") another RMID - * 2. increase the threshold - * 3. do nothing - * - * We do both of 1. and 2. until we hit the steal limit. - * - * The steal limit prevents all RMIDs ending up on the - * limbo list. This can happen if every RMID has a - * non-zero occupancy above threshold_limit, and the - * occupancy values aren't dropping fast enough. - * - * Note that there is prioritisation at work here - we'd - * rather increase the number of RMIDs on the limbo list - * than increase the threshold, because increasing the - * threshold skews the event data (because we reuse - * dirty RMIDs) - threshold bumps are a last resort. 
- */ - if (nr_available < steal_limit) - goto again; - - __intel_cqm_threshold++; - } - -out: - mutex_unlock(&cache_mutex); - return rotated; -} - -static void intel_cqm_rmid_rotate(struct work_struct *work); - -static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate); - -static struct pmu intel_cqm_pmu; - -static void intel_cqm_rmid_rotate(struct work_struct *work) -{ - unsigned long delay; - - __intel_cqm_rmid_rotate(); - - delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms); - schedule_delayed_work(&intel_cqm_rmid_work, delay); -} - -static u64 update_sample(unsigned int rmid, u32 evt_type, int first) -{ - struct sample *mbm_current; - u32 vrmid = rmid_2_index(rmid); - u64 val, bytes, shift; - u32 eventid; - - if (evt_type == QOS_MBM_LOCAL_EVENT_ID) { - mbm_current = &mbm_local[vrmid]; - eventid = QOS_MBM_LOCAL_EVENT_ID; - } else { - mbm_current = &mbm_total[vrmid]; - eventid = QOS_MBM_TOTAL_EVENT_ID; - } - - wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); - rdmsrl(MSR_IA32_QM_CTR, val); - if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - return mbm_current->total_bytes; - - if (first) { - mbm_current->prev_msr = val; - mbm_current->total_bytes = 0; - return mbm_current->total_bytes; - } - - /* - * The h/w guarantees that counters will not overflow - * so long as we poll them at least once per second. - */ - shift = 64 - MBM_CNTR_WIDTH; - bytes = (val << shift) - (mbm_current->prev_msr << shift); - bytes >>= shift; - - bytes *= cqm_l3_scale; - - mbm_current->total_bytes += bytes; - mbm_current->prev_msr = val; - - return mbm_current->total_bytes; -} - -static u64 rmid_read_mbm(unsigned int rmid, u32 evt_type) -{ - return update_sample(rmid, evt_type, 0); -} - -static void __intel_mbm_event_init(void *info) -{ - struct rmid_read *rr = info; - - update_sample(rr->rmid, rr->evt_type, 1); -} - -static void init_mbm_sample(u32 rmid, u32 evt_type) -{ - struct rmid_read rr = { - .rmid = rmid, - .evt_type = evt_type, - .value = ATOMIC64_INIT(0), - }; - - /* on each socket, init sample */ - on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_init, &rr, 1); -} - -/* - * Find a group and setup RMID. - * - * If we're part of a group, we use the group's RMID. - */ -static void intel_cqm_setup_event(struct perf_event *event, - struct perf_event **group) -{ - struct perf_event *iter; - bool conflict = false; - u32 rmid; - - event->hw.is_group_event = false; - list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { - rmid = iter->hw.cqm_rmid; - - if (__match_event(iter, event)) { - /* All tasks in a group share an RMID */ - event->hw.cqm_rmid = rmid; - *group = iter; - if (is_mbm_event(event->attr.config) && __rmid_valid(rmid)) - init_mbm_sample(rmid, event->attr.config); - return; - } - - /* - * We only care about conflicts for events that are - * actually scheduled in (and hence have a valid RMID). - */ - if (__conflict_event(iter, event) && __rmid_valid(rmid)) - conflict = true; - } - - if (conflict) - rmid = INVALID_RMID; - else - rmid = __get_rmid(); - - if (is_mbm_event(event->attr.config) && __rmid_valid(rmid)) - init_mbm_sample(rmid, event->attr.config); - - event->hw.cqm_rmid = rmid; -} - -static void intel_cqm_event_read(struct perf_event *event) -{ - unsigned long flags; - u32 rmid; - u64 val; - - /* - * Task events are handled by intel_cqm_event_count(). 
- */ - if (event->cpu == -1) - return; - - raw_spin_lock_irqsave(&cache_lock, flags); - rmid = event->hw.cqm_rmid; - - if (!__rmid_valid(rmid)) - goto out; - - if (is_mbm_event(event->attr.config)) - val = rmid_read_mbm(rmid, event->attr.config); - else - val = __rmid_read(rmid); - - /* - * Ignore this reading on error states and do not update the value. - */ - if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - goto out; - - local64_set(&event->count, val); -out: - raw_spin_unlock_irqrestore(&cache_lock, flags); -} - -static void __intel_cqm_event_count(void *info) -{ - struct rmid_read *rr = info; - u64 val; - - val = __rmid_read(rr->rmid); - - if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - return; - - atomic64_add(val, &rr->value); -} - -static inline bool cqm_group_leader(struct perf_event *event) -{ - return !list_empty(&event->hw.cqm_groups_entry); -} - -static void __intel_mbm_event_count(void *info) -{ - struct rmid_read *rr = info; - u64 val; - - val = rmid_read_mbm(rr->rmid, rr->evt_type); - if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - return; - atomic64_add(val, &rr->value); -} - -static enum hrtimer_restart mbm_hrtimer_handle(struct hrtimer *hrtimer) -{ - struct perf_event *iter, *iter1; - int ret = HRTIMER_RESTART; - struct list_head *head; - unsigned long flags; - u32 grp_rmid; - - /* - * Need to cache_lock as the timer Event Select MSR reads - * can race with the mbm/cqm count() and mbm_init() reads. - */ - raw_spin_lock_irqsave(&cache_lock, flags); - - if (list_empty(&cache_groups)) { - ret = HRTIMER_NORESTART; - goto out; - } - - list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { - grp_rmid = iter->hw.cqm_rmid; - if (!__rmid_valid(grp_rmid)) - continue; - if (is_mbm_event(iter->attr.config)) - update_sample(grp_rmid, iter->attr.config, 0); - - head = &iter->hw.cqm_group_entry; - if (list_empty(head)) - continue; - list_for_each_entry(iter1, head, hw.cqm_group_entry) { - if (!iter1->hw.is_group_event) - break; - if (is_mbm_event(iter1->attr.config)) - update_sample(iter1->hw.cqm_rmid, - iter1->attr.config, 0); - } - } - - hrtimer_forward_now(hrtimer, ms_to_ktime(MBM_CTR_OVERFLOW_TIME)); -out: - raw_spin_unlock_irqrestore(&cache_lock, flags); - - return ret; -} - -static void __mbm_start_timer(void *info) -{ - hrtimer_start(&mbm_timers[pkg_id], ms_to_ktime(MBM_CTR_OVERFLOW_TIME), - HRTIMER_MODE_REL_PINNED); -} - -static void __mbm_stop_timer(void *info) -{ - hrtimer_cancel(&mbm_timers[pkg_id]); -} - -static void mbm_start_timers(void) -{ - on_each_cpu_mask(&cqm_cpumask, __mbm_start_timer, NULL, 1); -} - -static void mbm_stop_timers(void) -{ - on_each_cpu_mask(&cqm_cpumask, __mbm_stop_timer, NULL, 1); -} - -static void mbm_hrtimer_init(void) -{ - struct hrtimer *hr; - int i; - - for (i = 0; i < mbm_socket_max; i++) { - hr = &mbm_timers[i]; - hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hr->function = mbm_hrtimer_handle; - } -} - -static u64 intel_cqm_event_count(struct perf_event *event) -{ - unsigned long flags; - struct rmid_read rr = { - .evt_type = event->attr.config, - .value = ATOMIC64_INIT(0), - }; - - /* - * We only need to worry about task events. System-wide events - * are handled like usual, i.e. entirely with - * intel_cqm_event_read(). 
- */ - if (event->cpu != -1) - return __perf_event_count(event); - - /* - * Only the group leader gets to report values except in case of - * multiple events in the same group, we still need to read the - * other events.This stops us - * reporting duplicate values to userspace, and gives us a clear - * rule for which task gets to report the values. - * - * Note that it is impossible to attribute these values to - * specific packages - we forfeit that ability when we create - * task events. - */ - if (!cqm_group_leader(event) && !event->hw.is_group_event) - return 0; - - /* - * Getting up-to-date values requires an SMP IPI which is not - * possible if we're being called in interrupt context. Return - * the cached values instead. - */ - if (unlikely(in_interrupt())) - goto out; - - /* - * Notice that we don't perform the reading of an RMID - * atomically, because we can't hold a spin lock across the - * IPIs. - * - * Speculatively perform the read, since @event might be - * assigned a different (possibly invalid) RMID while we're - * busying performing the IPI calls. It's therefore necessary to - * check @event's RMID afterwards, and if it has changed, - * discard the result of the read. - */ - rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid); - - if (!__rmid_valid(rr.rmid)) - goto out; - - cqm_mask_call(&rr); - - raw_spin_lock_irqsave(&cache_lock, flags); - if (event->hw.cqm_rmid == rr.rmid) - local64_set(&event->count, atomic64_read(&rr.value)); - raw_spin_unlock_irqrestore(&cache_lock, flags); -out: - return __perf_event_count(event); -} - -static void intel_cqm_event_start(struct perf_event *event, int mode) -{ - struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - u32 rmid = event->hw.cqm_rmid; - - if (!(event->hw.cqm_state & PERF_HES_STOPPED)) - return; - - event->hw.cqm_state &= ~PERF_HES_STOPPED; - - if (state->rmid_usecnt++) { - if (!WARN_ON_ONCE(state->rmid != rmid)) - return; - } else { - WARN_ON_ONCE(state->rmid); - } - - state->rmid = rmid; - wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid); -} - -static void intel_cqm_event_stop(struct perf_event *event, int mode) -{ - struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - - if (event->hw.cqm_state & PERF_HES_STOPPED) - return; - - event->hw.cqm_state |= PERF_HES_STOPPED; - - intel_cqm_event_read(event); - - if (!--state->rmid_usecnt) { - state->rmid = 0; - wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid); - } else { - WARN_ON_ONCE(!state->rmid); - } -} - -static int intel_cqm_event_add(struct perf_event *event, int mode) -{ - unsigned long flags; - u32 rmid; - - raw_spin_lock_irqsave(&cache_lock, flags); - - event->hw.cqm_state = PERF_HES_STOPPED; - rmid = event->hw.cqm_rmid; - - if (__rmid_valid(rmid) && (mode & PERF_EF_START)) - intel_cqm_event_start(event, mode); - - raw_spin_unlock_irqrestore(&cache_lock, flags); - - return 0; -} - -static void intel_cqm_event_destroy(struct perf_event *event) -{ - struct perf_event *group_other = NULL; - unsigned long flags; - - mutex_lock(&cache_mutex); - /* - * Hold the cache_lock as mbm timer handlers could be - * scanning the list of events. - */ - raw_spin_lock_irqsave(&cache_lock, flags); - - /* - * If there's another event in this group... - */ - if (!list_empty(&event->hw.cqm_group_entry)) { - group_other = list_first_entry(&event->hw.cqm_group_entry, - struct perf_event, - hw.cqm_group_entry); - list_del(&event->hw.cqm_group_entry); - } - - /* - * And we're the group leader.. 
- */ - if (cqm_group_leader(event)) { - /* - * If there was a group_other, make that leader, otherwise - * destroy the group and return the RMID. - */ - if (group_other) { - list_replace(&event->hw.cqm_groups_entry, - &group_other->hw.cqm_groups_entry); - } else { - u32 rmid = event->hw.cqm_rmid; - - if (__rmid_valid(rmid)) - __put_rmid(rmid); - list_del(&event->hw.cqm_groups_entry); - } - } - - raw_spin_unlock_irqrestore(&cache_lock, flags); - - /* - * Stop the mbm overflow timers when the last event is destroyed. - */ - if (mbm_enabled && list_empty(&cache_groups)) - mbm_stop_timers(); - - mutex_unlock(&cache_mutex); -} - -static int intel_cqm_event_init(struct perf_event *event) -{ - struct perf_event *group = NULL; - bool rotate = false; - unsigned long flags; - - if (event->attr.type != intel_cqm_pmu.type) - return -ENOENT; - - if ((event->attr.config < QOS_L3_OCCUP_EVENT_ID) || - (event->attr.config > QOS_MBM_LOCAL_EVENT_ID)) - return -EINVAL; - - if ((is_cqm_event(event->attr.config) && !cqm_enabled) || - (is_mbm_event(event->attr.config) && !mbm_enabled)) - return -EINVAL; - - /* unsupported modes and filters */ - if (event->attr.exclude_user || - event->attr.exclude_kernel || - event->attr.exclude_hv || - event->attr.exclude_idle || - event->attr.exclude_host || - event->attr.exclude_guest || - event->attr.sample_period) /* no sampling */ - return -EINVAL; - - INIT_LIST_HEAD(&event->hw.cqm_group_entry); - INIT_LIST_HEAD(&event->hw.cqm_groups_entry); - - event->destroy = intel_cqm_event_destroy; - - mutex_lock(&cache_mutex); - - /* - * Start the mbm overflow timers when the first event is created. - */ - if (mbm_enabled && list_empty(&cache_groups)) - mbm_start_timers(); - - /* Will also set rmid */ - intel_cqm_setup_event(event, &group); - - /* - * Hold the cache_lock as mbm timer handlers be - * scanning the list of events. - */ - raw_spin_lock_irqsave(&cache_lock, flags); - - if (group) { - list_add_tail(&event->hw.cqm_group_entry, - &group->hw.cqm_group_entry); - } else { - list_add_tail(&event->hw.cqm_groups_entry, - &cache_groups); - - /* - * All RMIDs are either in use or have recently been - * used. Kick the rotation worker to clean/free some. - * - * We only do this for the group leader, rather than for - * every event in a group to save on needless work. 
- */ - if (!__rmid_valid(event->hw.cqm_rmid)) - rotate = true; - } - - raw_spin_unlock_irqrestore(&cache_lock, flags); - mutex_unlock(&cache_mutex); - - if (rotate) - schedule_delayed_work(&intel_cqm_rmid_work, 0); - - return 0; -} - -EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01"); -EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1"); -EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes"); -EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL); -EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1"); - -EVENT_ATTR_STR(total_bytes, intel_cqm_total_bytes, "event=0x02"); -EVENT_ATTR_STR(total_bytes.per-pkg, intel_cqm_total_bytes_pkg, "1"); -EVENT_ATTR_STR(total_bytes.unit, intel_cqm_total_bytes_unit, "MB"); -EVENT_ATTR_STR(total_bytes.scale, intel_cqm_total_bytes_scale, "1e-6"); - -EVENT_ATTR_STR(local_bytes, intel_cqm_local_bytes, "event=0x03"); -EVENT_ATTR_STR(local_bytes.per-pkg, intel_cqm_local_bytes_pkg, "1"); -EVENT_ATTR_STR(local_bytes.unit, intel_cqm_local_bytes_unit, "MB"); -EVENT_ATTR_STR(local_bytes.scale, intel_cqm_local_bytes_scale, "1e-6"); - -static struct attribute *intel_cqm_events_attr[] = { - EVENT_PTR(intel_cqm_llc), - EVENT_PTR(intel_cqm_llc_pkg), - EVENT_PTR(intel_cqm_llc_unit), - EVENT_PTR(intel_cqm_llc_scale), - EVENT_PTR(intel_cqm_llc_snapshot), - NULL, -}; - -static struct attribute *intel_mbm_events_attr[] = { - EVENT_PTR(intel_cqm_total_bytes), - EVENT_PTR(intel_cqm_local_bytes), - EVENT_PTR(intel_cqm_total_bytes_pkg), - EVENT_PTR(intel_cqm_local_bytes_pkg), - EVENT_PTR(intel_cqm_total_bytes_unit), - EVENT_PTR(intel_cqm_local_bytes_unit), - EVENT_PTR(intel_cqm_total_bytes_scale), - EVENT_PTR(intel_cqm_local_bytes_scale), - NULL, -}; - -static struct attribute *intel_cmt_mbm_events_attr[] = { - EVENT_PTR(intel_cqm_llc), - EVENT_PTR(intel_cqm_total_bytes), - EVENT_PTR(intel_cqm_local_bytes), - EVENT_PTR(intel_cqm_llc_pkg), - EVENT_PTR(intel_cqm_total_bytes_pkg), - EVENT_PTR(intel_cqm_local_bytes_pkg), - EVENT_PTR(intel_cqm_llc_unit), - EVENT_PTR(intel_cqm_total_bytes_unit), - EVENT_PTR(intel_cqm_local_bytes_unit), - EVENT_PTR(intel_cqm_llc_scale), - EVENT_PTR(intel_cqm_total_bytes_scale), - EVENT_PTR(intel_cqm_local_bytes_scale), - EVENT_PTR(intel_cqm_llc_snapshot), - NULL, -}; - -static struct attribute_group intel_cqm_events_group = { - .name = "events", - .attrs = NULL, -}; - -PMU_FORMAT_ATTR(event, "config:0-7"); -static struct attribute *intel_cqm_formats_attr[] = { - &format_attr_event.attr, - NULL, -}; - -static struct attribute_group intel_cqm_format_group = { - .name = "format", - .attrs = intel_cqm_formats_attr, -}; - -static ssize_t -max_recycle_threshold_show(struct device *dev, struct device_attribute *attr, - char *page) -{ - ssize_t rv; - - mutex_lock(&cache_mutex); - rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold); - mutex_unlock(&cache_mutex); - - return rv; -} - -static ssize_t -max_recycle_threshold_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - unsigned int bytes, cachelines; - int ret; - - ret = kstrtouint(buf, 0, &bytes); - if (ret) - return ret; - - mutex_lock(&cache_mutex); - - __intel_cqm_max_threshold = bytes; - cachelines = bytes / cqm_l3_scale; - - /* - * The new maximum takes effect immediately. 
- */ - if (__intel_cqm_threshold > cachelines) - __intel_cqm_threshold = cachelines; - - mutex_unlock(&cache_mutex); - - return count; -} - -static DEVICE_ATTR_RW(max_recycle_threshold); - -static struct attribute *intel_cqm_attrs[] = { - &dev_attr_max_recycle_threshold.attr, - NULL, -}; - -static const struct attribute_group intel_cqm_group = { - .attrs = intel_cqm_attrs, -}; - -static const struct attribute_group *intel_cqm_attr_groups[] = { - &intel_cqm_events_group, - &intel_cqm_format_group, - &intel_cqm_group, - NULL, -}; - -static struct pmu intel_cqm_pmu = { - .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME, - .attr_groups = intel_cqm_attr_groups, - .task_ctx_nr = perf_sw_context, - .event_init = intel_cqm_event_init, - .add = intel_cqm_event_add, - .del = intel_cqm_event_stop, - .start = intel_cqm_event_start, - .stop = intel_cqm_event_stop, - .read = intel_cqm_event_read, - .count = intel_cqm_event_count, -}; - -static inline void cqm_pick_event_reader(int cpu) -{ - int reader; - - /* First online cpu in package becomes the reader */ - reader = cpumask_any_and(&cqm_cpumask, topology_core_cpumask(cpu)); - if (reader >= nr_cpu_ids) - cpumask_set_cpu(cpu, &cqm_cpumask); -} - -static int intel_cqm_cpu_starting(unsigned int cpu) -{ - struct intel_pqr_state *state = &per_cpu(pqr_state, cpu); - struct cpuinfo_x86 *c = &cpu_data(cpu); - - state->rmid = 0; - state->closid = 0; - state->rmid_usecnt = 0; - - WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid); - WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale); - - cqm_pick_event_reader(cpu); - return 0; -} - -static int intel_cqm_cpu_exit(unsigned int cpu) -{ - int target; - - /* Is @cpu the current cqm reader for this package ? */ - if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask)) - return 0; - - /* Find another online reader in this package */ - target = cpumask_any_but(topology_core_cpumask(cpu), cpu); - - if (target < nr_cpu_ids) - cpumask_set_cpu(target, &cqm_cpumask); - - return 0; -} - -static const struct x86_cpu_id intel_cqm_match[] = { - { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC }, - {} -}; - -static void mbm_cleanup(void) -{ - if (!mbm_enabled) - return; - - kfree(mbm_local); - kfree(mbm_total); - mbm_enabled = false; -} - -static const struct x86_cpu_id intel_mbm_local_match[] = { - { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_LOCAL }, - {} -}; - -static const struct x86_cpu_id intel_mbm_total_match[] = { - { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_TOTAL }, - {} -}; - -static int intel_mbm_init(void) -{ - int ret = 0, array_size, maxid = cqm_max_rmid + 1; - - mbm_socket_max = topology_max_packages(); - array_size = sizeof(struct sample) * maxid * mbm_socket_max; - mbm_local = kmalloc(array_size, GFP_KERNEL); - if (!mbm_local) - return -ENOMEM; - - mbm_total = kmalloc(array_size, GFP_KERNEL); - if (!mbm_total) { - ret = -ENOMEM; - goto out; - } - - array_size = sizeof(struct hrtimer) * mbm_socket_max; - mbm_timers = kmalloc(array_size, GFP_KERNEL); - if (!mbm_timers) { - ret = -ENOMEM; - goto out; - } - mbm_hrtimer_init(); - -out: - if (ret) - mbm_cleanup(); - - return ret; -} - -static int __init intel_cqm_init(void) -{ - char *str = NULL, scale[20]; - int cpu, ret; - - if (x86_match_cpu(intel_cqm_match)) - cqm_enabled = true; - - if (x86_match_cpu(intel_mbm_local_match) && - x86_match_cpu(intel_mbm_total_match)) - mbm_enabled = true; - - if (!cqm_enabled && !mbm_enabled) - return -ENODEV; - - cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale; - - /* - * It's possible that 
not all resources support the same number - * of RMIDs. Instead of making scheduling much more complicated - * (where we have to match a task's RMID to a cpu that supports - * that many RMIDs) just find the minimum RMIDs supported across - * all cpus. - * - * Also, check that the scales match on all cpus. - */ - get_online_cpus(); - for_each_online_cpu(cpu) { - struct cpuinfo_x86 *c = &cpu_data(cpu); - - if (c->x86_cache_max_rmid < cqm_max_rmid) - cqm_max_rmid = c->x86_cache_max_rmid; - - if (c->x86_cache_occ_scale != cqm_l3_scale) { - pr_err("Multiple LLC scale values, disabling\n"); - ret = -EINVAL; - goto out; - } - } - - /* - * A reasonable upper limit on the max threshold is the number - * of lines tagged per RMID if all RMIDs have the same number of - * lines tagged in the LLC. - * - * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. - */ - __intel_cqm_max_threshold = - boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1); - - snprintf(scale, sizeof(scale), "%u", cqm_l3_scale); - str = kstrdup(scale, GFP_KERNEL); - if (!str) { - ret = -ENOMEM; - goto out; - } - - event_attr_intel_cqm_llc_scale.event_str = str; - - ret = intel_cqm_setup_rmid_cache(); - if (ret) - goto out; - - if (mbm_enabled) - ret = intel_mbm_init(); - if (ret && !cqm_enabled) - goto out; - - if (cqm_enabled && mbm_enabled) - intel_cqm_events_group.attrs = intel_cmt_mbm_events_attr; - else if (!cqm_enabled && mbm_enabled) - intel_cqm_events_group.attrs = intel_mbm_events_attr; - else if (cqm_enabled && !mbm_enabled) - intel_cqm_events_group.attrs = intel_cqm_events_attr; - - ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1); - if (ret) { - pr_err("Intel CQM perf registration failed: %d\n", ret); - goto out; - } - - if (cqm_enabled) - pr_info("Intel CQM monitoring enabled\n"); - if (mbm_enabled) - pr_info("Intel MBM enabled\n"); - - /* - * Setup the hot cpu notifier once we are sure cqm - * is enabled to avoid notifier leak. - */ - cpuhp_setup_state(CPUHP_AP_PERF_X86_CQM_STARTING, - "perf/x86/cqm:starting", - intel_cqm_cpu_starting, NULL); - cpuhp_setup_state(CPUHP_AP_PERF_X86_CQM_ONLINE, "perf/x86/cqm:online", - NULL, intel_cqm_cpu_exit); - -out: - put_online_cpus(); - - if (ret) { - kfree(str); - cqm_cleanup(); - mbm_cleanup(); - } - - return ret; -} -device_initcall(intel_cqm_init); diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 238ae3248ba5..72db0664a53d 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -40,16 +40,16 @@ * Model specific counters: * MSR_CORE_C1_RES: CORE C1 Residency Counter * perf code: 0x00 - * Available model: SLM,AMT + * Available model: SLM,AMT,GLM * Scope: Core (each processor core has a MSR) * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter * perf code: 0x01 - * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL + * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,GLM * Scope: Core * MSR_CORE_C6_RESIDENCY: CORE C6 Residency Counter * perf code: 0x02 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW - * SKL,KNL + * SKL,KNL,GLM * Scope: Core * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter * perf code: 0x03 @@ -57,16 +57,17 @@ * Scope: Core * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter. * perf code: 0x00 - * Available model: SNB,IVB,HSW,BDW,SKL,KNL + * Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM * Scope: Package (physical package) * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. 
* perf code: 0x01 * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL + * GLM * Scope: Package (physical package) * MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter. * perf code: 0x02 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW - * SKL,KNL + * SKL,KNL,GLM * Scope: Package (physical package) * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. * perf code: 0x03 @@ -82,7 +83,7 @@ * Scope: Package (physical package) * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter. * perf code: 0x06 - * Available model: HSW ULT only + * Available model: HSW ULT, GLM * Scope: Package (physical package) * */ @@ -504,6 +505,17 @@ static const struct cstate_model knl_cstates __initconst = { }; +static const struct cstate_model glm_cstates __initconst = { + .core_events = BIT(PERF_CSTATE_CORE_C1_RES) | + BIT(PERF_CSTATE_CORE_C3_RES) | + BIT(PERF_CSTATE_CORE_C6_RES), + + .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) | + BIT(PERF_CSTATE_PKG_C3_RES) | + BIT(PERF_CSTATE_PKG_C6_RES) | + BIT(PERF_CSTATE_PKG_C10_RES), +}; + #define X86_CSTATES_MODEL(model, states) \ { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long) &(states) } @@ -540,12 +552,18 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_MOBILE, snb_cstates), X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_DESKTOP, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_X, snb_cstates), X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_MOBILE, snb_cstates), X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_DESKTOP, snb_cstates), X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNL, knl_cstates), X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNM, knl_cstates), + + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT, glm_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_DENVERTON, glm_cstates), + + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GEMINI_LAKE, glm_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index c6d23ffe422d..3674a4b6f8bd 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/bitops.h> #include <linux/types.h> #include <linux/slab.h> @@ -49,34 +50,47 @@ union intel_x86_pebs_dse { */ #define P(a, b) PERF_MEM_S(a, b) #define OP_LH (P(OP, LOAD) | P(LVL, HIT)) +#define LEVEL(x) P(LVLNUM, x) +#define REM P(REMOTE, REMOTE) #define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS)) /* Version for Sandy Bridge and later */ static u64 pebs_data_source[] = { - P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */ - OP_LH | P(LVL, L1) | P(SNOOP, NONE), /* 0x01: L1 local */ - OP_LH | P(LVL, LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */ - OP_LH | P(LVL, L2) | P(SNOOP, NONE), /* 0x03: L2 hit */ - OP_LH | P(LVL, L3) | P(SNOOP, NONE), /* 0x04: L3 hit */ - OP_LH | P(LVL, L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */ - OP_LH | P(LVL, L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */ - OP_LH | P(LVL, L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */ - OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */ - OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/ - OP_LH | P(LVL, LOC_RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */ - OP_LH | P(LVL, REM_RAM1) | P(SNOOP, HIT), /* 0x0b: L3 miss, shared */ - OP_LH | P(LVL, LOC_RAM) | SNOOP_NONE_MISS,/* 0x0c: L3 miss, excl */ - OP_LH | P(LVL, REM_RAM1) | SNOOP_NONE_MISS,/* 0x0d: L3 miss, excl */ - OP_LH | P(LVL, IO) | P(SNOOP, NONE), /* 0x0e: I/O */ - OP_LH | P(LVL, UNC) | P(SNOOP, NONE), 
/* 0x0f: uncached */ + P(OP, LOAD) | P(LVL, MISS) | LEVEL(L3) | P(SNOOP, NA),/* 0x00:ukn L3 */ + OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x01: L1 local */ + OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */ + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, NONE), /* 0x03: L2 hit */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, NONE), /* 0x04: L3 hit */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */ + OP_LH | P(LVL, REM_CCE1) | REM | LEVEL(L3) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */ + OP_LH | P(LVL, REM_CCE1) | REM | LEVEL(L3) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/ + OP_LH | P(LVL, LOC_RAM) | LEVEL(RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */ + OP_LH | P(LVL, REM_RAM1) | REM | LEVEL(L3) | P(SNOOP, HIT), /* 0x0b: L3 miss, shared */ + OP_LH | P(LVL, LOC_RAM) | LEVEL(RAM) | SNOOP_NONE_MISS, /* 0x0c: L3 miss, excl */ + OP_LH | P(LVL, REM_RAM1) | LEVEL(RAM) | REM | SNOOP_NONE_MISS, /* 0x0d: L3 miss, excl */ + OP_LH | P(LVL, IO) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0e: I/O */ + OP_LH | P(LVL, UNC) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0f: uncached */ }; /* Patch up minor differences in the bits */ void __init intel_pmu_pebs_data_source_nhm(void) { - pebs_data_source[0x05] = OP_LH | P(LVL, L3) | P(SNOOP, HIT); - pebs_data_source[0x06] = OP_LH | P(LVL, L3) | P(SNOOP, HITM); - pebs_data_source[0x07] = OP_LH | P(LVL, L3) | P(SNOOP, HITM); + pebs_data_source[0x05] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT); + pebs_data_source[0x06] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM); + pebs_data_source[0x07] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM); +} + +void __init intel_pmu_pebs_data_source_skl(bool pmem) +{ + u64 pmem_or_l4 = pmem ? 
LEVEL(PMEM) : LEVEL(L4); + + pebs_data_source[0x08] = OP_LH | pmem_or_l4 | P(SNOOP, HIT); + pebs_data_source[0x09] = OP_LH | pmem_or_l4 | REM | P(SNOOP, HIT); + pebs_data_source[0x0b] = OP_LH | LEVEL(RAM) | REM | P(SNOOP, NONE); + pebs_data_source[0x0c] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOPX, FWD); + pebs_data_source[0x0d] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOP, HITM); } static u64 precise_store_data(u64 status) @@ -149,8 +163,6 @@ static u64 load_latency_data(u64 status) { union intel_x86_pebs_dse dse; u64 val; - int model = boot_cpu_data.x86_model; - int fam = boot_cpu_data.x86; dse.val = status; @@ -162,8 +174,7 @@ static u64 load_latency_data(u64 status) /* * Nehalem models do not support TLB, Lock infos */ - if (fam == 0x6 && (model == 26 || model == 30 - || model == 31 || model == 46)) { + if (x86_pmu.pebs_no_tlb) { val |= P(TLB, NA) | P(LOCK, NA); return val; } @@ -606,12 +617,6 @@ static inline void intel_pmu_drain_pebs_buffer(void) x86_pmu.drain_pebs(®s); } -void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in) -{ - if (!sched_in) - intel_pmu_drain_pebs_buffer(); -} - /* * PEBS */ @@ -651,6 +656,12 @@ struct event_constraint intel_glm_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; +struct event_constraint intel_glp_pebs_event_constraints[] = { + /* Allow all events as PEBS with no flags */ + INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), + EVENT_CONSTRAINT_END +}; + struct event_constraint intel_nehalem_pebs_event_constraints[] = { INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ @@ -816,6 +827,14 @@ static inline bool pebs_needs_sched_cb(struct cpu_hw_events *cpuc) return cpuc->n_pebs && (cpuc->n_pebs == cpuc->n_large_pebs); } +void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (!sched_in && pebs_needs_sched_cb(cpuc)) + intel_pmu_drain_pebs_buffer(); +} + static inline void pebs_update_threshold(struct cpu_hw_events *cpuc) { struct debug_store *ds = cpuc->ds; @@ -889,6 +908,8 @@ void intel_pmu_pebs_enable(struct perf_event *event) if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { ds->pebs_event_reset[hwc->idx] = (u64)(-hwc->sample_period) & x86_pmu.cntval_mask; + } else { + ds->pebs_event_reset[hwc->idx] = 0; } } @@ -1165,7 +1186,7 @@ static void setup_pebs_sample_data(struct perf_event *event, else regs->flags &= ~PERF_EFLAGS_EXACT; - if ((sample_type & PERF_SAMPLE_ADDR) && + if ((sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) && x86_pmu.intel_cap.pebs_format >= 1) data->addr = pebs->dla; diff --git a/arch/x86/events/intel/knc.c b/arch/x86/events/intel/knc.c index 548d5f774b07..618001c208e8 100644 --- a/arch/x86/events/intel/knc.c +++ b/arch/x86/events/intel/knc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Driver for Intel Xeon Phi "Knights Corner" PMU */ #include <linux/perf_event.h> diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index f924629836a8..ae64d0b69729 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/perf_event.h> #include <linux/types.h> @@ -18,7 +19,7 @@ enum { LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_TIME, }; -static enum { +static const enum { LBR_EIP_FLAGS = 1, LBR_TSX = 2, } lbr_desc[LBR_FORMAT_MAX_KNOWN + 1] = { @@ -109,6 +110,9 @@ enum { X86_BR_ZERO_CALL = 1 << 15,/* zero length call */ X86_BR_CALL_STACK = 1 << 16,/* call stack 
*/ X86_BR_IND_JMP = 1 << 17,/* indirect jump */ + + X86_BR_TYPE_SAVE = 1 << 18,/* indicate to save branch type */ + }; #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) @@ -287,7 +291,7 @@ inline u64 lbr_from_signext_quirk_wr(u64 val) /* * If quirk is needed, ensure sign extension is 61 bits: */ -u64 lbr_from_signext_quirk_rd(u64 val) +static u64 lbr_from_signext_quirk_rd(u64 val) { if (static_branch_unlikely(&lbr_from_quirk_key)) { /* @@ -380,8 +384,12 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct x86_perf_task_context *task_ctx; + if (!cpuc->lbr_users) + return; + /* * If LBR callstack feature is enabled and the stack was saved when * the task was scheduled out, restore the stack. Otherwise flush @@ -510,6 +518,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) cpuc->lbr_entries[i].in_tx = 0; cpuc->lbr_entries[i].abort = 0; cpuc->lbr_entries[i].cycles = 0; + cpuc->lbr_entries[i].type = 0; cpuc->lbr_entries[i].reserved = 0; } cpuc->lbr_stack.nr = i; @@ -596,6 +605,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) cpuc->lbr_entries[out].in_tx = in_tx; cpuc->lbr_entries[out].abort = abort; cpuc->lbr_entries[out].cycles = cycles; + cpuc->lbr_entries[out].type = 0; cpuc->lbr_entries[out].reserved = 0; out++; } @@ -673,6 +683,10 @@ static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event) if (br_type & PERF_SAMPLE_BRANCH_CALL) mask |= X86_BR_CALL | X86_BR_ZERO_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_TYPE_SAVE) + mask |= X86_BR_TYPE_SAVE; + /* * stash actual user request into reg, it may * be used by fixup code for some CPU @@ -926,6 +940,43 @@ static int branch_type(unsigned long from, unsigned long to, int abort) return ret; } +#define X86_BR_TYPE_MAP_MAX 16 + +static int branch_map[X86_BR_TYPE_MAP_MAX] = { + PERF_BR_CALL, /* X86_BR_CALL */ + PERF_BR_RET, /* X86_BR_RET */ + PERF_BR_SYSCALL, /* X86_BR_SYSCALL */ + PERF_BR_SYSRET, /* X86_BR_SYSRET */ + PERF_BR_UNKNOWN, /* X86_BR_INT */ + PERF_BR_UNKNOWN, /* X86_BR_IRET */ + PERF_BR_COND, /* X86_BR_JCC */ + PERF_BR_UNCOND, /* X86_BR_JMP */ + PERF_BR_UNKNOWN, /* X86_BR_IRQ */ + PERF_BR_IND_CALL, /* X86_BR_IND_CALL */ + PERF_BR_UNKNOWN, /* X86_BR_ABORT */ + PERF_BR_UNKNOWN, /* X86_BR_IN_TX */ + PERF_BR_UNKNOWN, /* X86_BR_NO_TX */ + PERF_BR_CALL, /* X86_BR_ZERO_CALL */ + PERF_BR_UNKNOWN, /* X86_BR_CALL_STACK */ + PERF_BR_IND, /* X86_BR_IND_JMP */ +}; + +static int +common_branch_type(int type) +{ + int i; + + type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */ + + if (type) { + i = __ffs(type); + if (i < X86_BR_TYPE_MAP_MAX) + return branch_map[i]; + } + + return PERF_BR_UNKNOWN; +} + /* * implement actual branch filter based on user demand. 
* Hardware may not exactly satisfy that request, thus @@ -942,7 +993,8 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) bool compress = false; /* if sampling all branches, then nothing to filter */ - if ((br_sel & X86_BR_ALL) == X86_BR_ALL) + if (((br_sel & X86_BR_ALL) == X86_BR_ALL) && + ((br_sel & X86_BR_TYPE_SAVE) != X86_BR_TYPE_SAVE)) return; for (i = 0; i < cpuc->lbr_stack.nr; i++) { @@ -963,6 +1015,9 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) cpuc->lbr_entries[i].from = 0; compress = true; } + + if ((br_sel & X86_BR_TYPE_SAVE) == X86_BR_TYPE_SAVE) + cpuc->lbr_entries[i].type = common_branch_type(type); } if (!compress) diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c index eb0533558c2b..d32c0eed38ca 100644 --- a/arch/x86/events/intel/p4.c +++ b/arch/x86/events/intel/p4.c @@ -587,7 +587,7 @@ static __initconst const u64 p4_hw_cache_event_ids * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are * either up to date automatically or not applicable at all. */ -struct p4_event_alias { +static struct p4_event_alias { u64 original; u64 alternative; } p4_event_aliases[] = { diff --git a/arch/x86/events/intel/p6.c b/arch/x86/events/intel/p6.c index 1f5c47ab4c65..a5604c352930 100644 --- a/arch/x86/events/intel/p6.c +++ b/arch/x86/events/intel/p6.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/perf_event.h> #include <linux/types.h> diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index ae8324d65e61..81fd41d5a0d9 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -471,8 +471,9 @@ static void pt_config(struct perf_event *event) struct pt *pt = this_cpu_ptr(&pt_ctx); u64 reg; - if (!event->hw.itrace_started) { - event->hw.itrace_started = 1; + /* First round: clear STATUS, in particular the PSB byte counter. 
*/ + if (!event->hw.config) { + perf_event_itrace_started(event); wrmsrl(MSR_IA32_RTIT_STATUS, 0); } diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index a45e2114a846..005908ee9333 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -559,7 +559,7 @@ static struct attribute_group rapl_pmu_format_group = { .attrs = rapl_formats_attr, }; -const struct attribute_group *rapl_attr_groups[] = { +static const struct attribute_group *rapl_attr_groups[] = { &rapl_pmu_attr_group, &rapl_pmu_format_group, &rapl_pmu_events_group, @@ -775,6 +775,9 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = { X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, skl_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT, hsw_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_DENVERTON, hsw_rapl_init), + + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GEMINI_LAKE, hsw_rapl_init), {}, }; diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 44ec523287f6..d45e06346f14 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -721,7 +721,7 @@ static struct attribute *uncore_pmu_attrs[] = { NULL, }; -static struct attribute_group uncore_pmu_attr_group = { +static const struct attribute_group uncore_pmu_attr_group = { .attrs = uncore_pmu_attrs, }; @@ -822,7 +822,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) pmus[i].type = type; pmus[i].boxes = kzalloc(size, GFP_KERNEL); if (!pmus[i].boxes) - return -ENOMEM; + goto err; } type->pmus = pmus; @@ -836,7 +836,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) attr_group = kzalloc(sizeof(struct attribute *) * (i + 1) + sizeof(*attr_group), GFP_KERNEL); if (!attr_group) - return -ENOMEM; + goto err; attrs = (struct attribute **)(attr_group + 1); attr_group->name = "events"; @@ -849,7 +849,15 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) } type->pmu_group = &uncore_pmu_attr_group; + return 0; + +err: + for (i = 0; i < type->num_boxes; i++) + kfree(pmus[i].boxes); + kfree(pmus); + + return -ENOMEM; } static int __init diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index df5989f27b1b..4364191e7c6b 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <linux/slab.h> #include <linux/pci.h> #include <asm/apicdef.h> diff --git a/arch/x86/events/intel/uncore_nhmex.c b/arch/x86/events/intel/uncore_nhmex.c index cda569332005..93e7a8397cde 100644 --- a/arch/x86/events/intel/uncore_nhmex.c +++ b/arch/x86/events/intel/uncore_nhmex.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Nehalem-EX/Westmere-EX uncore support */ #include "uncore.h" @@ -272,7 +273,7 @@ static struct attribute *nhmex_uncore_ubox_formats_attr[] = { NULL, }; -static struct attribute_group nhmex_uncore_ubox_format_group = { +static const struct attribute_group nhmex_uncore_ubox_format_group = { .name = "format", .attrs = nhmex_uncore_ubox_formats_attr, }; @@ -299,7 +300,7 @@ static struct attribute *nhmex_uncore_cbox_formats_attr[] = { NULL, }; -static struct attribute_group nhmex_uncore_cbox_format_group = { +static const struct attribute_group nhmex_uncore_cbox_format_group = { .name = "format", .attrs = nhmex_uncore_cbox_formats_attr, }; @@ -407,7 +408,7 @@ static struct attribute *nhmex_uncore_bbox_formats_attr[] = { NULL, }; -static struct attribute_group 
nhmex_uncore_bbox_format_group = { +static const struct attribute_group nhmex_uncore_bbox_format_group = { .name = "format", .attrs = nhmex_uncore_bbox_formats_attr, }; @@ -484,7 +485,7 @@ static struct attribute *nhmex_uncore_sbox_formats_attr[] = { NULL, }; -static struct attribute_group nhmex_uncore_sbox_format_group = { +static const struct attribute_group nhmex_uncore_sbox_format_group = { .name = "format", .attrs = nhmex_uncore_sbox_formats_attr, }; @@ -898,7 +899,7 @@ static struct attribute *nhmex_uncore_mbox_formats_attr[] = { NULL, }; -static struct attribute_group nhmex_uncore_mbox_format_group = { +static const struct attribute_group nhmex_uncore_mbox_format_group = { .name = "format", .attrs = nhmex_uncore_mbox_formats_attr, }; @@ -1163,7 +1164,7 @@ static struct attribute *nhmex_uncore_rbox_formats_attr[] = { NULL, }; -static struct attribute_group nhmex_uncore_rbox_format_group = { +static const struct attribute_group nhmex_uncore_rbox_format_group = { .name = "format", .attrs = nhmex_uncore_rbox_formats_attr, }; diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index a3dcc12bef4a..aee5e8496be4 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Nehalem/SandBridge/Haswell/Broadwell/Skylake uncore support */ #include "uncore.h" @@ -130,7 +131,7 @@ static struct attribute *snb_uncore_formats_attr[] = { NULL, }; -static struct attribute_group snb_uncore_format_group = { +static const struct attribute_group snb_uncore_format_group = { .name = "format", .attrs = snb_uncore_formats_attr, }; @@ -289,7 +290,7 @@ static struct attribute *snb_uncore_imc_formats_attr[] = { NULL, }; -static struct attribute_group snb_uncore_imc_format_group = { +static const struct attribute_group snb_uncore_imc_format_group = { .name = "format", .attrs = snb_uncore_imc_formats_attr, }; @@ -769,7 +770,7 @@ static struct attribute *nhm_uncore_formats_attr[] = { NULL, }; -static struct attribute_group nhm_uncore_format_group = { +static const struct attribute_group nhm_uncore_format_group = { .name = "format", .attrs = nhm_uncore_formats_attr, }; diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index dae2fedc1601..95cb19f4e06f 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* SandyBridge-EP/IvyTown uncore support */ #include "uncore.h" @@ -316,7 +317,7 @@ #define SKX_UPI_PCI_PMON_CTL0 0x350 #define SKX_UPI_PCI_PMON_CTR0 0x318 #define SKX_UPI_PCI_PMON_BOX_CTL 0x378 -#define SKX_PMON_CTL_UMASK_EXT 0xff +#define SKX_UPI_CTL_UMASK_EXT 0xffefff /* SKX M2M */ #define SKX_M2M_PCI_PMON_CTL0 0x228 @@ -328,7 +329,7 @@ DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6"); DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21"); DEFINE_UNCORE_FORMAT_ATTR(use_occ_ctr, use_occ_ctr, "config:7"); DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); -DEFINE_UNCORE_FORMAT_ATTR(umask_ext, umask, "config:8-15,32-39"); +DEFINE_UNCORE_FORMAT_ATTR(umask_ext, umask, "config:8-15,32-43,45-55"); DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16"); DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19"); @@ -351,7 +352,6 @@ DEFINE_UNCORE_FORMAT_ATTR(filter_cid, filter_cid, "config1:5"); DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8"); DEFINE_UNCORE_FORMAT_ATTR(filter_link2, filter_link, 
"config1:6-8"); DEFINE_UNCORE_FORMAT_ATTR(filter_link3, filter_link, "config1:12"); -DEFINE_UNCORE_FORMAT_ATTR(filter_link4, filter_link, "config1:9-12"); DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17"); DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47"); DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22"); @@ -603,27 +603,27 @@ static struct uncore_event_desc snbep_uncore_qpi_events[] = { { /* end: all zeroes */ }, }; -static struct attribute_group snbep_uncore_format_group = { +static const struct attribute_group snbep_uncore_format_group = { .name = "format", .attrs = snbep_uncore_formats_attr, }; -static struct attribute_group snbep_uncore_ubox_format_group = { +static const struct attribute_group snbep_uncore_ubox_format_group = { .name = "format", .attrs = snbep_uncore_ubox_formats_attr, }; -static struct attribute_group snbep_uncore_cbox_format_group = { +static const struct attribute_group snbep_uncore_cbox_format_group = { .name = "format", .attrs = snbep_uncore_cbox_formats_attr, }; -static struct attribute_group snbep_uncore_pcu_format_group = { +static const struct attribute_group snbep_uncore_pcu_format_group = { .name = "format", .attrs = snbep_uncore_pcu_formats_attr, }; -static struct attribute_group snbep_uncore_qpi_format_group = { +static const struct attribute_group snbep_uncore_qpi_format_group = { .name = "format", .attrs = snbep_uncore_qpi_formats_attr, }; @@ -1432,27 +1432,27 @@ static struct attribute *ivbep_uncore_qpi_formats_attr[] = { NULL, }; -static struct attribute_group ivbep_uncore_format_group = { +static const struct attribute_group ivbep_uncore_format_group = { .name = "format", .attrs = ivbep_uncore_formats_attr, }; -static struct attribute_group ivbep_uncore_ubox_format_group = { +static const struct attribute_group ivbep_uncore_ubox_format_group = { .name = "format", .attrs = ivbep_uncore_ubox_formats_attr, }; -static struct attribute_group ivbep_uncore_cbox_format_group = { +static const struct attribute_group ivbep_uncore_cbox_format_group = { .name = "format", .attrs = ivbep_uncore_cbox_formats_attr, }; -static struct attribute_group ivbep_uncore_pcu_format_group = { +static const struct attribute_group ivbep_uncore_pcu_format_group = { .name = "format", .attrs = ivbep_uncore_pcu_formats_attr, }; -static struct attribute_group ivbep_uncore_qpi_format_group = { +static const struct attribute_group ivbep_uncore_qpi_format_group = { .name = "format", .attrs = ivbep_uncore_qpi_formats_attr, }; @@ -1888,7 +1888,7 @@ static struct attribute *knl_uncore_ubox_formats_attr[] = { NULL, }; -static struct attribute_group knl_uncore_ubox_format_group = { +static const struct attribute_group knl_uncore_ubox_format_group = { .name = "format", .attrs = knl_uncore_ubox_formats_attr, }; @@ -1928,7 +1928,7 @@ static struct attribute *knl_uncore_cha_formats_attr[] = { NULL, }; -static struct attribute_group knl_uncore_cha_format_group = { +static const struct attribute_group knl_uncore_cha_format_group = { .name = "format", .attrs = knl_uncore_cha_formats_attr, }; @@ -2038,7 +2038,7 @@ static struct attribute *knl_uncore_pcu_formats_attr[] = { NULL, }; -static struct attribute_group knl_uncore_pcu_format_group = { +static const struct attribute_group knl_uncore_pcu_format_group = { .name = "format", .attrs = knl_uncore_pcu_formats_attr, }; @@ -2188,7 +2188,7 @@ static struct attribute *knl_uncore_irp_formats_attr[] = { NULL, }; -static struct attribute_group knl_uncore_irp_format_group = { +static const struct 
attribute_group knl_uncore_irp_format_group = { .name = "format", .attrs = knl_uncore_irp_formats_attr, }; @@ -2386,7 +2386,7 @@ static struct attribute *hswep_uncore_ubox_formats_attr[] = { NULL, }; -static struct attribute_group hswep_uncore_ubox_format_group = { +static const struct attribute_group hswep_uncore_ubox_format_group = { .name = "format", .attrs = hswep_uncore_ubox_formats_attr, }; @@ -2440,7 +2440,7 @@ static struct attribute *hswep_uncore_cbox_formats_attr[] = { NULL, }; -static struct attribute_group hswep_uncore_cbox_format_group = { +static const struct attribute_group hswep_uncore_cbox_format_group = { .name = "format", .attrs = hswep_uncore_cbox_formats_attr, }; @@ -2622,7 +2622,7 @@ static struct attribute *hswep_uncore_sbox_formats_attr[] = { NULL, }; -static struct attribute_group hswep_uncore_sbox_format_group = { +static const struct attribute_group hswep_uncore_sbox_format_group = { .name = "format", .attrs = hswep_uncore_sbox_formats_attr, }; @@ -3302,7 +3302,6 @@ static struct attribute *skx_uncore_cha_formats_attr[] = { &format_attr_inv.attr, &format_attr_thresh8.attr, &format_attr_filter_tid4.attr, - &format_attr_filter_link4.attr, &format_attr_filter_state5.attr, &format_attr_filter_rem.attr, &format_attr_filter_loc.attr, @@ -3312,12 +3311,11 @@ static struct attribute *skx_uncore_cha_formats_attr[] = { &format_attr_filter_opc_0.attr, &format_attr_filter_opc_1.attr, &format_attr_filter_nc.attr, - &format_attr_filter_c6.attr, &format_attr_filter_isoc.attr, NULL, }; -static struct attribute_group skx_uncore_chabox_format_group = { +static const struct attribute_group skx_uncore_chabox_format_group = { .name = "format", .attrs = skx_uncore_cha_formats_attr, }; @@ -3333,8 +3331,11 @@ static struct extra_reg skx_uncore_cha_extra_regs[] = { SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x2134, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x8134, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x3134, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x9134, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x35, 0xff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x36, 0xff, 0x8), + EVENT_EXTRA_END }; static u64 skx_cha_filter_mask(int fields) @@ -3347,6 +3348,17 @@ static u64 skx_cha_filter_mask(int fields) mask |= SKX_CHA_MSR_PMON_BOX_FILTER_LINK; if (fields & 0x4) mask |= SKX_CHA_MSR_PMON_BOX_FILTER_STATE; + if (fields & 0x8) { + mask |= SKX_CHA_MSR_PMON_BOX_FILTER_REM; + mask |= SKX_CHA_MSR_PMON_BOX_FILTER_LOC; + mask |= SKX_CHA_MSR_PMON_BOX_FILTER_ALL_OPC; + mask |= SKX_CHA_MSR_PMON_BOX_FILTER_NM; + mask |= SKX_CHA_MSR_PMON_BOX_FILTER_NOT_NM; + mask |= SKX_CHA_MSR_PMON_BOX_FILTER_OPC0; + mask |= SKX_CHA_MSR_PMON_BOX_FILTER_OPC1; + mask |= SKX_CHA_MSR_PMON_BOX_FILTER_NC; + mask |= SKX_CHA_MSR_PMON_BOX_FILTER_ISOC; + } return mask; } @@ -3416,7 +3428,7 @@ static struct attribute *skx_uncore_iio_formats_attr[] = { NULL, }; -static struct attribute_group skx_uncore_iio_format_group = { +static const struct attribute_group skx_uncore_iio_format_group = { .name = "format", .attrs = skx_uncore_iio_formats_attr, }; @@ -3451,7 +3463,7 @@ static struct intel_uncore_ops skx_uncore_iio_ops = { static struct intel_uncore_type skx_uncore_iio = { .name = "iio", .num_counters = 4, - .num_boxes = 5, + .num_boxes = 6, .perf_ctr_bits = 48, .event_ctl = SKX_IIO0_MSR_PMON_CTL0, .perf_ctr = SKX_IIO0_MSR_PMON_CTR0, @@ -3473,7 +3485,7 @@ static struct attribute *skx_uncore_formats_attr[] = { 
NULL, }; -static struct attribute_group skx_uncore_format_group = { +static const struct attribute_group skx_uncore_format_group = { .name = "format", .attrs = skx_uncore_formats_attr, }; @@ -3481,7 +3493,7 @@ static struct attribute_group skx_uncore_format_group = { static struct intel_uncore_type skx_uncore_irp = { .name = "irp", .num_counters = 2, - .num_boxes = 5, + .num_boxes = 6, .perf_ctr_bits = 48, .event_ctl = SKX_IRP0_MSR_PMON_CTL0, .perf_ctr = SKX_IRP0_MSR_PMON_CTR0, @@ -3492,6 +3504,26 @@ static struct intel_uncore_type skx_uncore_irp = { .format_group = &skx_uncore_format_group, }; +static struct attribute *skx_uncore_pcu_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_occ_invert.attr, + &format_attr_occ_edge_det.attr, + &format_attr_filter_band0.attr, + &format_attr_filter_band1.attr, + &format_attr_filter_band2.attr, + &format_attr_filter_band3.attr, + NULL, +}; + +static struct attribute_group skx_uncore_pcu_format_group = { + .name = "format", + .attrs = skx_uncore_pcu_formats_attr, +}; + static struct intel_uncore_ops skx_uncore_pcu_ops = { IVBEP_UNCORE_MSR_OPS_COMMON_INIT(), .hw_config = hswep_pcu_hw_config, @@ -3510,7 +3542,7 @@ static struct intel_uncore_type skx_uncore_pcu = { .box_ctl = HSWEP_PCU_MSR_PMON_BOX_CTL, .num_shared_regs = 1, .ops = &skx_uncore_pcu_ops, - .format_group = &snbep_uncore_pcu_format_group, + .format_group = &skx_uncore_pcu_format_group, }; static struct intel_uncore_type *skx_msr_uncores[] = { @@ -3574,7 +3606,7 @@ static struct attribute *skx_upi_uncore_formats_attr[] = { NULL, }; -static struct attribute_group skx_upi_uncore_format_group = { +static const struct attribute_group skx_upi_uncore_format_group = { .name = "format", .attrs = skx_upi_uncore_formats_attr, }; @@ -3603,8 +3635,8 @@ static struct intel_uncore_type skx_uncore_upi = { .perf_ctr_bits = 48, .perf_ctr = SKX_UPI_PCI_PMON_CTR0, .event_ctl = SKX_UPI_PCI_PMON_CTL0, - .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, - .event_mask_ext = SKX_PMON_CTL_UMASK_EXT, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .event_mask_ext = SKX_UPI_CTL_UMASK_EXT, .box_ctl = SKX_UPI_PCI_PMON_BOX_CTL, .ops = &skx_upi_uncore_pci_ops, .format_group = &skx_upi_uncore_format_group, diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index 4bb3ec69e8ea..14efaa0e8684 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/perf_event.h> #include <asm/intel-family.h> @@ -63,6 +64,14 @@ static bool test_intel(int idx) case INTEL_FAM6_ATOM_SILVERMONT1: case INTEL_FAM6_ATOM_SILVERMONT2: case INTEL_FAM6_ATOM_AIRMONT: + + case INTEL_FAM6_ATOM_GOLDMONT: + case INTEL_FAM6_ATOM_DENVERTON: + + case INTEL_FAM6_ATOM_GEMINI_LAKE: + + case INTEL_FAM6_XEON_PHI_KNL: + case INTEL_FAM6_XEON_PHI_KNM: if (idx == PERF_MSR_SMI) return true; break; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index be3d36254040..4196f81ec0e1 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -91,7 +91,7 @@ struct amd_nb { (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \ PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \ PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ - PERF_SAMPLE_TRANSACTION) + PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR) /* * A debug store configuration. 
@@ -558,10 +558,14 @@ struct x86_pmu { int attr_rdpmc; struct attribute **format_attrs; struct attribute **event_attrs; + struct attribute **caps_attrs; ssize_t (*events_sysfs_show)(char *page, u64 config); struct attribute **cpu_events; + unsigned long attr_freeze_on_smi; + struct attribute **attrs; + /* * CPU Hotplug hooks */ @@ -588,7 +592,8 @@ struct x86_pmu { pebs :1, pebs_active :1, pebs_broken :1, - pebs_prec_dist :1; + pebs_prec_dist :1, + pebs_no_tlb :1; int pebs_record_size; int pebs_buffer_size; void (*drain_pebs)(struct pt_regs *regs); @@ -738,6 +743,8 @@ int x86_reserve_hardware(void); void x86_release_hardware(void); +int x86_pmu_max_precise(void); + void hw_perf_lbr_event_destroy(struct perf_event *event); int x86_setup_perfctr(struct perf_event *event); @@ -876,6 +883,8 @@ extern struct event_constraint intel_slm_pebs_event_constraints[]; extern struct event_constraint intel_glm_pebs_event_constraints[]; +extern struct event_constraint intel_glp_pebs_event_constraints[]; + extern struct event_constraint intel_nehalem_pebs_event_constraints[]; extern struct event_constraint intel_westmere_pebs_event_constraints[]; @@ -942,6 +951,8 @@ void intel_pmu_lbr_init_knl(void); void intel_pmu_pebs_data_source_nhm(void); +void intel_pmu_pebs_data_source_skl(bool pmem); + int intel_pmu_setup_lbr_filter(struct perf_event *event); void intel_pt_interrupt(void); diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile index 171ae09864d7..367a8203cfcf 100644 --- a/arch/x86/hyperv/Makefile +++ b/arch/x86/hyperv/Makefile @@ -1 +1 @@ -obj-y := hv_init.o +obj-y := hv_init.o mmu.o diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 5b882cc0c0e9..a5db63f728a2 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -26,6 +26,8 @@ #include <linux/mm.h> #include <linux/clockchips.h> #include <linux/hyperv.h> +#include <linux/slab.h> +#include <linux/cpuhotplug.h> #ifdef CONFIG_HYPERV_TSCPAGE @@ -75,10 +77,30 @@ static struct clocksource hyperv_cs_msr = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; -static void *hypercall_pg; +void *hv_hypercall_pg; +EXPORT_SYMBOL_GPL(hv_hypercall_pg); struct clocksource *hyperv_cs; EXPORT_SYMBOL_GPL(hyperv_cs); +u32 *hv_vp_index; +EXPORT_SYMBOL_GPL(hv_vp_index); + +u32 hv_max_vp_index; + +static int hv_cpu_init(unsigned int cpu) +{ + u64 msr_vp_index; + + hv_get_vp_index(msr_vp_index); + + hv_vp_index[smp_processor_id()] = msr_vp_index; + + if (msr_vp_index > hv_max_vp_index) + hv_max_vp_index = msr_vp_index; + + return 0; +} + /* * This function is to be invoked early in the boot sequence after the * hypervisor has been detected. @@ -94,6 +116,16 @@ void hyperv_init(void) if (x86_hyper != &x86_hyper_ms_hyperv) return; + /* Allocate percpu VP index */ + hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index), + GFP_KERNEL); + if (!hv_vp_index) + return; + + if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online", + hv_cpu_init, NULL) < 0) + goto free_vp_index; + /* * Setup the hypercall page and enable hypercalls. * 1. 
Register the guest ID @@ -102,17 +134,19 @@ void hyperv_init(void) guest_id = generate_guest_id(0, LINUX_VERSION_CODE, 0); wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); - hypercall_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_RX); - if (hypercall_pg == NULL) { + hv_hypercall_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_RX); + if (hv_hypercall_pg == NULL) { wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); - return; + goto free_vp_index; } rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); hypercall_msr.enable = 1; - hypercall_msr.guest_physical_address = vmalloc_to_pfn(hypercall_pg); + hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg); wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + hyper_alloc_mmu(); + /* * Register Hyper-V specific clocksource. */ @@ -148,6 +182,12 @@ register_msr_cs: hyperv_cs = &hyperv_cs_msr; if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE) clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100); + + return; + +free_vp_index: + kfree(hv_vp_index); + hv_vp_index = NULL; } /* @@ -170,51 +210,6 @@ void hyperv_cleanup(void) } EXPORT_SYMBOL_GPL(hyperv_cleanup); -/* - * hv_do_hypercall- Invoke the specified hypercall - */ -u64 hv_do_hypercall(u64 control, void *input, void *output) -{ - u64 input_address = (input) ? virt_to_phys(input) : 0; - u64 output_address = (output) ? virt_to_phys(output) : 0; -#ifdef CONFIG_X86_64 - u64 hv_status = 0; - - if (!hypercall_pg) - return (u64)ULLONG_MAX; - - __asm__ __volatile__("mov %0, %%r8" : : "r" (output_address) : "r8"); - __asm__ __volatile__("call *%3" : "=a" (hv_status) : - "c" (control), "d" (input_address), - "m" (hypercall_pg)); - - return hv_status; - -#else - - u32 control_hi = control >> 32; - u32 control_lo = control & 0xFFFFFFFF; - u32 hv_status_hi = 1; - u32 hv_status_lo = 1; - u32 input_address_hi = input_address >> 32; - u32 input_address_lo = input_address & 0xFFFFFFFF; - u32 output_address_hi = output_address >> 32; - u32 output_address_lo = output_address & 0xFFFFFFFF; - - if (!hypercall_pg) - return (u64)ULLONG_MAX; - - __asm__ __volatile__ ("call *%8" : "=d"(hv_status_hi), - "=a"(hv_status_lo) : "d" (control_hi), - "a" (control_lo), "b" (input_address_hi), - "c" (input_address_lo), "D"(output_address_hi), - "S"(output_address_lo), "m" (hypercall_pg)); - - return hv_status_lo | ((u64)hv_status_hi << 32); -#endif /* !x86_64 */ -} -EXPORT_SYMBOL_GPL(hv_do_hypercall); - void hyperv_report_panic(struct pt_regs *regs) { static bool panic_reported; diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c new file mode 100644 index 000000000000..9cc9e1c1e2db --- /dev/null +++ b/arch/x86/hyperv/mmu.c @@ -0,0 +1,301 @@ +#define pr_fmt(fmt) "Hyper-V: " fmt + +#include <linux/hyperv.h> +#include <linux/log2.h> +#include <linux/slab.h> +#include <linux/types.h> + +#include <asm/fpu/api.h> +#include <asm/mshyperv.h> +#include <asm/msr.h> +#include <asm/tlbflush.h> + +#define CREATE_TRACE_POINTS +#include <asm/trace/hyperv.h> + +/* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */ +struct hv_flush_pcpu { + u64 address_space; + u64 flags; + u64 processor_mask; + u64 gva_list[]; +}; + +/* HvFlushVirtualAddressSpaceEx, HvFlushVirtualAddressListEx hypercalls */ +struct hv_flush_pcpu_ex { + u64 address_space; + u64 flags; + struct { + u64 format; + u64 valid_bank_mask; + u64 bank_contents[]; + } hv_vp_set; + u64 gva_list[]; +}; + +/* Each gva in gva_list encodes up to 4096 pages to flush */ +#define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE) + +static struct hv_flush_pcpu __percpu 
**pcpu_flush; + +static struct hv_flush_pcpu_ex __percpu **pcpu_flush_ex; + +/* + * Fills in gva_list starting from offset. Returns the number of items added. + */ +static inline int fill_gva_list(u64 gva_list[], int offset, + unsigned long start, unsigned long end) +{ + int gva_n = offset; + unsigned long cur = start, diff; + + do { + diff = end > cur ? end - cur : 0; + + gva_list[gva_n] = cur & PAGE_MASK; + /* + * Lower 12 bits encode the number of additional + * pages to flush (in addition to the 'cur' page). + */ + if (diff >= HV_TLB_FLUSH_UNIT) + gva_list[gva_n] |= ~PAGE_MASK; + else if (diff) + gva_list[gva_n] |= (diff - 1) >> PAGE_SHIFT; + + cur += HV_TLB_FLUSH_UNIT; + gva_n++; + + } while (cur < end); + + return gva_n - offset; +} + +/* Return the number of banks in the resulting vp_set */ +static inline int cpumask_to_vp_set(struct hv_flush_pcpu_ex *flush, + const struct cpumask *cpus) +{ + int cpu, vcpu, vcpu_bank, vcpu_offset, nr_bank = 1; + + /* valid_bank_mask can represent up to 64 banks */ + if (hv_max_vp_index / 64 >= 64) + return 0; + + /* + * Clear all banks up to the maximum possible bank as hv_flush_pcpu_ex + * structs are not cleared between calls, we risk flushing unneeded + * vCPUs otherwise. + */ + for (vcpu_bank = 0; vcpu_bank <= hv_max_vp_index / 64; vcpu_bank++) + flush->hv_vp_set.bank_contents[vcpu_bank] = 0; + + /* + * Some banks may end up being empty but this is acceptable. + */ + for_each_cpu(cpu, cpus) { + vcpu = hv_cpu_number_to_vp_number(cpu); + vcpu_bank = vcpu / 64; + vcpu_offset = vcpu % 64; + __set_bit(vcpu_offset, (unsigned long *) + &flush->hv_vp_set.bank_contents[vcpu_bank]); + if (vcpu_bank >= nr_bank) + nr_bank = vcpu_bank + 1; + } + flush->hv_vp_set.valid_bank_mask = GENMASK_ULL(nr_bank - 1, 0); + + return nr_bank; +} + +static void hyperv_flush_tlb_others(const struct cpumask *cpus, + const struct flush_tlb_info *info) +{ + int cpu, vcpu, gva_n, max_gvas; + struct hv_flush_pcpu **flush_pcpu; + struct hv_flush_pcpu *flush; + u64 status = U64_MAX; + unsigned long flags; + + trace_hyperv_mmu_flush_tlb_others(cpus, info); + + if (!pcpu_flush || !hv_hypercall_pg) + goto do_native; + + if (cpumask_empty(cpus)) + return; + + local_irq_save(flags); + + flush_pcpu = this_cpu_ptr(pcpu_flush); + + if (unlikely(!*flush_pcpu)) + *flush_pcpu = page_address(alloc_page(GFP_ATOMIC)); + + flush = *flush_pcpu; + + if (unlikely(!flush)) { + local_irq_restore(flags); + goto do_native; + } + + if (info->mm) { + flush->address_space = virt_to_phys(info->mm->pgd); + flush->flags = 0; + } else { + flush->address_space = 0; + flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + } + + flush->processor_mask = 0; + if (cpumask_equal(cpus, cpu_present_mask)) { + flush->flags |= HV_FLUSH_ALL_PROCESSORS; + } else { + for_each_cpu(cpu, cpus) { + vcpu = hv_cpu_number_to_vp_number(cpu); + if (vcpu >= 64) + goto do_native; + + __set_bit(vcpu, (unsigned long *) + &flush->processor_mask); + } + } + + /* + * We can flush not more than max_gvas with one hypercall. Flush the + * whole address space if we were asked to do more. 
+ */ + max_gvas = (PAGE_SIZE - sizeof(*flush)) / sizeof(flush->gva_list[0]); + + if (info->end == TLB_FLUSH_ALL) { + flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY; + status = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE, + flush, NULL); + } else if (info->end && + ((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) { + status = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE, + flush, NULL); + } else { + gva_n = fill_gva_list(flush->gva_list, 0, + info->start, info->end); + status = hv_do_rep_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST, + gva_n, 0, flush, NULL); + } + + local_irq_restore(flags); + + if (!(status & HV_HYPERCALL_RESULT_MASK)) + return; +do_native: + native_flush_tlb_others(cpus, info); +} + +static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus, + const struct flush_tlb_info *info) +{ + int nr_bank = 0, max_gvas, gva_n; + struct hv_flush_pcpu_ex **flush_pcpu; + struct hv_flush_pcpu_ex *flush; + u64 status = U64_MAX; + unsigned long flags; + + trace_hyperv_mmu_flush_tlb_others(cpus, info); + + if (!pcpu_flush_ex || !hv_hypercall_pg) + goto do_native; + + if (cpumask_empty(cpus)) + return; + + local_irq_save(flags); + + flush_pcpu = this_cpu_ptr(pcpu_flush_ex); + + if (unlikely(!*flush_pcpu)) + *flush_pcpu = page_address(alloc_page(GFP_ATOMIC)); + + flush = *flush_pcpu; + + if (unlikely(!flush)) { + local_irq_restore(flags); + goto do_native; + } + + if (info->mm) { + flush->address_space = virt_to_phys(info->mm->pgd); + flush->flags = 0; + } else { + flush->address_space = 0; + flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + } + + flush->hv_vp_set.valid_bank_mask = 0; + + if (!cpumask_equal(cpus, cpu_present_mask)) { + flush->hv_vp_set.format = HV_GENERIC_SET_SPARCE_4K; + nr_bank = cpumask_to_vp_set(flush, cpus); + } + + if (!nr_bank) { + flush->hv_vp_set.format = HV_GENERIC_SET_ALL; + flush->flags |= HV_FLUSH_ALL_PROCESSORS; + } + + /* + * We can flush not more than max_gvas with one hypercall. Flush the + * whole address space if we were asked to do more. 
+ */ + max_gvas = + (PAGE_SIZE - sizeof(*flush) - nr_bank * + sizeof(flush->hv_vp_set.bank_contents[0])) / + sizeof(flush->gva_list[0]); + + if (info->end == TLB_FLUSH_ALL) { + flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY; + status = hv_do_rep_hypercall( + HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX, + 0, nr_bank, flush, NULL); + } else if (info->end && + ((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) { + status = hv_do_rep_hypercall( + HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX, + 0, nr_bank, flush, NULL); + } else { + gva_n = fill_gva_list(flush->gva_list, nr_bank, + info->start, info->end); + status = hv_do_rep_hypercall( + HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX, + gva_n, nr_bank, flush, NULL); + } + + local_irq_restore(flags); + + if (!(status & HV_HYPERCALL_RESULT_MASK)) + return; +do_native: + native_flush_tlb_others(cpus, info); +} + +void hyperv_setup_mmu_ops(void) +{ + if (!(ms_hyperv.hints & HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED)) + return; + + setup_clear_cpu_cap(X86_FEATURE_PCID); + + if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) { + pr_info("Using hypercall for remote TLB flush\n"); + pv_mmu_ops.flush_tlb_others = hyperv_flush_tlb_others; + } else { + pr_info("Using ext hypercall for remote TLB flush\n"); + pv_mmu_ops.flush_tlb_others = hyperv_flush_tlb_others_ex; + } +} + +void hyper_alloc_mmu(void) +{ + if (!(ms_hyperv.hints & HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED)) + return; + + if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) + pcpu_flush = alloc_percpu(struct hv_flush_pcpu *); + else + pcpu_flush_ex = alloc_percpu(struct hv_flush_pcpu_ex *); +} diff --git a/arch/x86/ia32/audit.c b/arch/x86/ia32/audit.c index 2eccc8932ae6..3d21eab7aaed 100644 --- a/arch/x86/ia32/audit.c +++ b/arch/x86/ia32/audit.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <asm/unistd_32.h> unsigned ia32_dir_class[] = { diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 8d0879f1d42c..8e02b30cf08e 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -407,10 +407,10 @@ static int load_aout_library(struct file *file) unsigned long bss, start_addr, len, error; int retval; struct exec ex; - + loff_t pos = 0; retval = -ENOEXEC; - error = kernel_read(file, 0, (char *) &ex, sizeof(ex)); + error = kernel_read(file, &ex, sizeof(ex), &pos); if (error != sizeof(ex)) goto out; diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 724153797209..41c671854642 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/arch/x86_64/ia32/ia32_signal.c * @@ -226,12 +227,12 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, if (ksig->ka.sa.sa_flags & SA_ONSTACK) sp = sigsp(sp, ksig); /* This is the legacy signal stack switching. */ - else if ((regs->ss & 0xffff) != __USER32_DS && + else if (regs->ss != __USER32_DS && !(ksig->ka.sa.sa_flags & SA_RESTORER) && ksig->ka.sa.sa_restorer) sp = (unsigned long) ksig->ka.sa.sa_restorer; - if (fpu->fpstate_active) { + if (fpu->initialized) { unsigned long fx_aligned, math_size; sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size); diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 47956c6a4fd8..96cd33bbfc85 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * sys_ia32.c: Conversion between 32bit and 64bit native syscalls. 
Based on * sys_sparc32 diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 2efc768e4362..8d0ec9df1cbe 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -162,12 +162,13 @@ static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr) * you call efi_mem_attributes() during boot and at runtime, * you could theoretically see different attributes. * - * Since we are yet to see any x86 platforms that require - * anything other than PAGE_KERNEL (some arm64 platforms - * require the equivalent of PAGE_KERNEL_NOCACHE), return that - * until we know differently. + * We are yet to see any x86 platforms that require anything + * other than PAGE_KERNEL (some ARM64 platforms require the + * equivalent of PAGE_KERNEL_NOCACHE). Additionally, if SME + * is active, the ACPI information will not be encrypted, + * so return PAGE_KERNEL_NOENC until we know differently. */ - return PAGE_KERNEL; + return PAGE_KERNEL_NOENC; } #endif diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h index eec2a70d4376..8e25bf4f323a 100644 --- a/arch/x86/include/asm/agp.h +++ b/arch/x86/include/asm/agp.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_AGP_H #define _ASM_X86_AGP_H diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index e7636bac7372..31b627b43a8e 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_ALTERNATIVE_ASM_H #define _ASM_X86_ALTERNATIVE_ASM_H @@ -62,8 +63,10 @@ #define new_len2 145f-144f /* - * max without conditionals. Idea adapted from: + * gas compatible max based on the idea from: * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax + * + * The additional "-" is needed because gas uses a "true" value of -1. */ #define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 1b020381ab38..dbfd0854651f 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_ALTERNATIVE_H #define _ASM_X86_ALTERNATIVE_H @@ -103,12 +104,12 @@ static inline int alternatives_text_reserved(void *start, void *end) alt_end_marker ":\n" /* - * max without conditionals. Idea adapted from: + * gas compatible max based on the idea from: * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax * - * The additional "-" is needed because gas works with s32s. + * The additional "-" is needed because gas uses a "true" value of -1. */ -#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") - (" b ")))))" +#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") < (" b ")))))" /* * Pad the second replacement alternative with additional NOPs if it is @@ -218,10 +219,9 @@ static inline int alternatives_text_reserved(void *start, void *end) #define alternative_call_2(oldfunc, newfunc1, feature1, newfunc2, feature2, \ output, input...) 
\ { \ - register void *__sp asm(_ASM_SP); \ asm volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", feature1,\ "call %P[new2]", feature2) \ - : output, "+r" (__sp) \ + : output, ASM_CALL_CONSTRAINT \ : [old] "i" (oldfunc), [new1] "i" (newfunc1), \ [new2] "i" (newfunc2), ## input); \ } diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 00c88a01301d..fddb6d26239f 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -1,8 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_AMD_NB_H #define _ASM_X86_AMD_NB_H #include <linux/ioport.h> #include <linux/pci.h> +#include <linux/refcount.h> struct amd_nb_bus_dev_range { u8 bus; @@ -55,7 +57,7 @@ struct threshold_bank { struct threshold_block *blocks; /* initialized to the number of CPUs on the node sharing this bank */ - atomic_t cpus; + refcount_t cpus; }; struct amd_northbridge { diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index bdffcd9eab2b..5f01671c68f2 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -252,6 +252,8 @@ static inline int x2apic_enabled(void) { return 0; } #define x2apic_supported() (0) #endif /* !CONFIG_X86_X2APIC */ +struct irq_data; + /* * Copyright 2004 James Cleverdon, IBM. * Subject to the GNU Public License, v.2 @@ -296,9 +298,9 @@ struct apic { /* Can't be NULL on 64-bit */ unsigned long (*set_apic_id)(unsigned int id); - int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask, - const struct cpumask *andmask, - unsigned int *apicid); + int (*cpu_mask_to_apicid)(const struct cpumask *cpumask, + struct irq_data *irqdata, + unsigned int *apicid); /* ipi */ void (*send_IPI)(int cpu, int vector); @@ -540,28 +542,12 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb) #endif -static inline int -flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask, - unsigned int *apicid) -{ - unsigned long cpu_mask = cpumask_bits(cpumask)[0] & - cpumask_bits(andmask)[0] & - cpumask_bits(cpu_online_mask)[0] & - APIC_ALL_CPUS; - - if (likely(cpu_mask)) { - *apicid = (unsigned int)cpu_mask; - return 0; - } else { - return -EINVAL; - } -} - -extern int -default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask, - unsigned int *apicid); +extern int flat_cpu_mask_to_apicid(const struct cpumask *cpumask, + struct irq_data *irqdata, + unsigned int *apicid); +extern int default_cpu_mask_to_apicid(const struct cpumask *cpumask, + struct irq_data *irqdata, + unsigned int *apicid); static inline void flat_vector_allocation_domain(int cpu, struct cpumask *retmask, diff --git a/arch/x86/include/asm/apic_flat_64.h b/arch/x86/include/asm/apic_flat_64.h index a2d312796440..d3a2b3876ce6 100644 --- a/arch/x86/include/asm/apic_flat_64.h +++ b/arch/x86/include/asm/apic_flat_64.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_APIC_FLAT_64_H #define _ASM_X86_APIC_FLAT_64_H diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index c46bb99d5fb2..05e694ed8386 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_APICDEF_H #define _ASM_X86_APICDEF_H diff --git a/arch/x86/include/asm/apm.h b/arch/x86/include/asm/apm.h index 93eebc636c76..4d4015ddcf26 100644 --- a/arch/x86/include/asm/apm.h +++ b/arch/x86/include/asm/apm.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Machine specific 
APM BIOS functions for generic. * Split out from apm.c by Osamu Tomita <tomita@cinet.co.jp> diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h index e7cd63175de4..34a10b2d5b73 100644 --- a/arch/x86/include/asm/arch_hweight.h +++ b/arch/x86/include/asm/arch_hweight.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_HWEIGHT_H #define _ASM_X86_HWEIGHT_H diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index 830b19dbfa31..ff700d81e91e 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <asm/ftrace.h> #include <linux/uaccess.h> #include <asm/string.h> diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 7a9df3beb89b..219faaec51df 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_ASM_H #define _ASM_X86_ASM_H @@ -11,10 +12,12 @@ # define __ASM_FORM_COMMA(x) " " #x "," #endif -#ifdef CONFIG_X86_32 +#ifndef __x86_64__ +/* 32 bit */ # define __ASM_SEL(a,b) __ASM_FORM(a) # define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a) #else +/* 64 bit */ # define __ASM_SEL(a,b) __ASM_FORM(b) # define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(b) #endif @@ -74,6 +77,9 @@ # define _ASM_EXTABLE_EX(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) +# define _ASM_EXTABLE_REFCOUNT(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount) + # define _ASM_NOKPROBE(entry) \ .pushsection "_kprobe_blacklist","aw" ; \ _ASM_ALIGN ; \ @@ -123,7 +129,21 @@ # define _ASM_EXTABLE_EX(from, to) \ _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) +# define _ASM_EXTABLE_REFCOUNT(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount) + /* For C file, we already have NOKPROBE_SYMBOL macro */ #endif +#ifndef __ASSEMBLY__ +/* + * This output constraint should be used for any inline asm which has a "call" + * instruction. Otherwise the asm may be inserted before the frame pointer + * gets set up by the containing function. If you forget to do this, objtool + * may print a "call without frame pointer save/setup" warning. 
+ */ +register unsigned long current_stack_pointer asm(_ASM_SP); +#define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer) +#endif + #endif /* _ASM_X86_ASM_H */ diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index caa5798c92f4..72759f131cc5 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_ATOMIC_H #define _ASM_X86_ATOMIC_H @@ -197,35 +198,56 @@ static inline int atomic_xchg(atomic_t *v, int new) return xchg(&v->counter, new); } -#define ATOMIC_OP(op) \ -static inline void atomic_##op(int i, atomic_t *v) \ -{ \ - asm volatile(LOCK_PREFIX #op"l %1,%0" \ - : "+m" (v->counter) \ - : "ir" (i) \ - : "memory"); \ +static inline void atomic_and(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "andl %1,%0" + : "+m" (v->counter) + : "ir" (i) + : "memory"); } -#define ATOMIC_FETCH_OP(op, c_op) \ -static inline int atomic_fetch_##op(int i, atomic_t *v) \ -{ \ - int val = atomic_read(v); \ - do { \ - } while (!atomic_try_cmpxchg(v, &val, val c_op i)); \ - return val; \ +static inline int atomic_fetch_and(int i, atomic_t *v) +{ + int val = atomic_read(v); + + do { } while (!atomic_try_cmpxchg(v, &val, val & i)); + + return val; } -#define ATOMIC_OPS(op, c_op) \ - ATOMIC_OP(op) \ - ATOMIC_FETCH_OP(op, c_op) +static inline void atomic_or(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "orl %1,%0" + : "+m" (v->counter) + : "ir" (i) + : "memory"); +} -ATOMIC_OPS(and, &) -ATOMIC_OPS(or , |) -ATOMIC_OPS(xor, ^) +static inline int atomic_fetch_or(int i, atomic_t *v) +{ + int val = atomic_read(v); -#undef ATOMIC_OPS -#undef ATOMIC_FETCH_OP -#undef ATOMIC_OP + do { } while (!atomic_try_cmpxchg(v, &val, val | i)); + + return val; +} + +static inline void atomic_xor(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "xorl %1,%0" + : "+m" (v->counter) + : "ir" (i) + : "memory"); +} + +static inline int atomic_fetch_xor(int i, atomic_t *v) +{ + int val = atomic_read(v); + + do { } while (!atomic_try_cmpxchg(v, &val, val ^ i)); + + return val; +} /** * __atomic_add_unless - add unless the number is already a given value @@ -239,24 +261,13 @@ ATOMIC_OPS(xor, ^) static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u) { int c = atomic_read(v); + do { if (unlikely(c == u)) break; } while (!atomic_try_cmpxchg(v, &c, c + a)); - return c; -} -/** - * atomic_inc_short - increment of a short integer - * @v: pointer to type int - * - * Atomically adds 1 to @v - * Returns the new value of @u - */ -static __always_inline short int atomic_inc_short(short int *v) -{ - asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v)); - return *v; + return c; } #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 71d7705fb303..97c46b8169b7 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_ATOMIC64_32_H #define _ASM_X86_ATOMIC64_32_H @@ -312,37 +313,70 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v) #undef alternative_atomic64 #undef __alternative_atomic64 -#define ATOMIC64_OP(op, c_op) \ -static inline void atomic64_##op(long long i, atomic64_t *v) \ -{ \ - long long old, c = 0; \ - while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c) \ - c = old; \ +static inline void atomic64_and(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c & i)) != c) + c = old; } -#define 
ATOMIC64_FETCH_OP(op, c_op) \ -static inline long long atomic64_fetch_##op(long long i, atomic64_t *v) \ -{ \ - long long old, c = 0; \ - while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c) \ - c = old; \ - return old; \ +static inline long long atomic64_fetch_and(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c & i)) != c) + c = old; + + return old; } -ATOMIC64_FETCH_OP(add, +) +static inline void atomic64_or(long long i, atomic64_t *v) +{ + long long old, c = 0; -#define atomic64_fetch_sub(i, v) atomic64_fetch_add(-(i), (v)) + while ((old = atomic64_cmpxchg(v, c, c | i)) != c) + c = old; +} + +static inline long long atomic64_fetch_or(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c | i)) != c) + c = old; + + return old; +} -#define ATOMIC64_OPS(op, c_op) \ - ATOMIC64_OP(op, c_op) \ - ATOMIC64_FETCH_OP(op, c_op) +static inline void atomic64_xor(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c ^ i)) != c) + c = old; +} -ATOMIC64_OPS(and, &) -ATOMIC64_OPS(or, |) -ATOMIC64_OPS(xor, ^) +static inline long long atomic64_fetch_xor(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c ^ i)) != c) + c = old; + + return old; +} -#undef ATOMIC64_OPS -#undef ATOMIC64_FETCH_OP -#undef ATOMIC64_OP +static inline long long atomic64_fetch_add(long long i, atomic64_t *v) +{ + long long old, c = 0; + + while ((old = atomic64_cmpxchg(v, c, c + i)) != c) + c = old; + + return old; +} + +#define atomic64_fetch_sub(i, v) atomic64_fetch_add(-(i), (v)) #endif /* _ASM_X86_ATOMIC64_32_H */ diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 6189a433c9a9..738495caf05f 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_ATOMIC64_64_H #define _ASM_X86_ATOMIC64_64_H @@ -177,7 +178,7 @@ static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new) } #define atomic64_try_cmpxchg atomic64_try_cmpxchg -static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, long *old, long new) +static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, s64 *old, long new) { return try_cmpxchg(&v->counter, old, new); } @@ -198,7 +199,7 @@ static inline long atomic64_xchg(atomic64_t *v, long new) */ static inline bool atomic64_add_unless(atomic64_t *v, long a, long u) { - long c = atomic64_read(v); + s64 c = atomic64_read(v); do { if (unlikely(c == u)) return false; @@ -217,7 +218,7 @@ static inline bool atomic64_add_unless(atomic64_t *v, long a, long u) */ static inline long atomic64_dec_if_positive(atomic64_t *v) { - long dec, c = atomic64_read(v); + s64 dec, c = atomic64_read(v); do { dec = c - 1; if (unlikely(dec < 0)) @@ -226,34 +227,55 @@ static inline long atomic64_dec_if_positive(atomic64_t *v) return dec; } -#define ATOMIC64_OP(op) \ -static inline void atomic64_##op(long i, atomic64_t *v) \ -{ \ - asm volatile(LOCK_PREFIX #op"q %1,%0" \ - : "+m" (v->counter) \ - : "er" (i) \ - : "memory"); \ +static inline void atomic64_and(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "andq %1,%0" + : "+m" (v->counter) + : "er" (i) + : "memory"); } -#define ATOMIC64_FETCH_OP(op, c_op) \ -static inline long atomic64_fetch_##op(long i, atomic64_t *v) \ -{ \ - long val = atomic64_read(v); \ - do { \ - } while (!atomic64_try_cmpxchg(v, &val, val c_op i)); \ - return val; \ +static 
inline long atomic64_fetch_and(long i, atomic64_t *v) +{ + s64 val = atomic64_read(v); + + do { + } while (!atomic64_try_cmpxchg(v, &val, val & i)); + return val; } -#define ATOMIC64_OPS(op, c_op) \ - ATOMIC64_OP(op) \ - ATOMIC64_FETCH_OP(op, c_op) +static inline void atomic64_or(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "orq %1,%0" + : "+m" (v->counter) + : "er" (i) + : "memory"); +} -ATOMIC64_OPS(and, &) -ATOMIC64_OPS(or, |) -ATOMIC64_OPS(xor, ^) +static inline long atomic64_fetch_or(long i, atomic64_t *v) +{ + s64 val = atomic64_read(v); -#undef ATOMIC64_OPS -#undef ATOMIC64_FETCH_OP -#undef ATOMIC64_OP + do { + } while (!atomic64_try_cmpxchg(v, &val, val | i)); + return val; +} + +static inline void atomic64_xor(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "xorq %1,%0" + : "+m" (v->counter) + : "er" (i) + : "memory"); +} + +static inline long atomic64_fetch_xor(long i, atomic64_t *v) +{ + s64 val = atomic64_read(v); + + do { + } while (!atomic64_try_cmpxchg(v, &val, val ^ i)); + return val; +} #endif /* _ASM_X86_ATOMIC64_64_H */ diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index bfb28caf97b1..01727dbc294a 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_BARRIER_H #define _ASM_X86_BARRIER_H diff --git a/arch/x86/include/asm/bios_ebda.h b/arch/x86/include/asm/bios_ebda.h index 4b7b8e71607e..4d5a17e2febe 100644 --- a/arch/x86/include/asm/bios_ebda.h +++ b/arch/x86/include/asm/bios_ebda.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_BIOS_EBDA_H #define _ASM_X86_BIOS_EBDA_H diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 854022772c5b..2bcf47314959 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_BITOPS_H #define _ASM_X86_BITOPS_H diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index abd06b19ddd2..680c320363db 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_BOOT_H #define _ASM_X86_BOOT_H diff --git a/arch/x86/include/asm/bootparam_utils.h b/arch/x86/include/asm/bootparam_utils.h index 4a8cb8d7cbd5..a07ffd23e4dd 100644 --- a/arch/x86/include/asm/bootparam_utils.h +++ b/arch/x86/include/asm/bootparam_utils.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_BOOTPARAM_UTILS_H #define _ASM_X86_BOOTPARAM_UTILS_H diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 39e702d90cdb..34d99af43994 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_BUG_H #define _ASM_X86_BUG_H @@ -35,7 +36,7 @@ #define _BUG_FLAGS(ins, flags) \ do { \ asm volatile("1:\t" ins "\n" \ - ".pushsection __bug_table,\"a\"\n" \ + ".pushsection __bug_table,\"aw\"\n" \ "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n" \ "\t" __BUG_REL(%c0) "\t# bug_entry::file\n" \ "\t.word %c1" "\t# bug_entry::line\n" \ @@ -52,7 +53,7 @@ do { \ #define _BUG_FLAGS(ins, flags) \ do { \ asm volatile("1:\t" ins "\n" \ - ".pushsection __bug_table,\"a\"\n" \ + ".pushsection __bug_table,\"aw\"\n" \ "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n" \ "\t.word %c0" "\t# bug_entry::flags\n" \ "\t.org 2b+%c1\n" \ diff --git a/arch/x86/include/asm/bugs.h b/arch/x86/include/asm/bugs.h index 
5490bbaf71d5..542509b53e0f 100644 --- a/arch/x86/include/asm/bugs.h +++ b/arch/x86/include/asm/bugs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_BUGS_H #define _ASM_X86_BUGS_H diff --git a/arch/x86/include/asm/cache.h b/arch/x86/include/asm/cache.h index 48f99f15452e..abe08690a887 100644 --- a/arch/x86/include/asm/cache.h +++ b/arch/x86/include/asm/cache.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_CACHE_H #define _ASM_X86_CACHE_H diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 8b4140f6724f..63feaf2a5f93 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_CACHEFLUSH_H #define _ASM_X86_CACHEFLUSH_H @@ -7,6 +8,4 @@ void clflush_cache_range(void *addr, unsigned int size); -#define mmio_flush_range(addr, size) clflush_cache_range(addr, size) - #endif /* _ASM_X86_CACHEFLUSH_H */ diff --git a/arch/x86/include/asm/ce4100.h b/arch/x86/include/asm/ce4100.h index e656ad8c0a2e..2930f560d7f3 100644 --- a/arch/x86/include/asm/ce4100.h +++ b/arch/x86/include/asm/ce4100.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_CE4100_H_ #define _ASM_CE4100_H_ diff --git a/arch/x86/include/asm/checksum.h b/arch/x86/include/asm/checksum.h index 5f5bb0f97361..d79d1e622dcf 100644 --- a/arch/x86/include/asm/checksum.h +++ b/arch/x86/include/asm/checksum.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifdef CONFIG_X86_32 # include <asm/checksum_32.h> #else diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h index 7b53743ed267..7a659c74cd03 100644 --- a/arch/x86/include/asm/checksum_32.h +++ b/arch/x86/include/asm/checksum_32.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_CHECKSUM_32_H #define _ASM_X86_CHECKSUM_32_H diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h index 08e7efb2c140..3ec6d3267cf9 100644 --- a/arch/x86/include/asm/checksum_64.h +++ b/arch/x86/include/asm/checksum_64.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_CHECKSUM_64_H #define _ASM_X86_CHECKSUM_64_H diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h index 47bea8cadbd0..dc4cfc888d6d 100644 --- a/arch/x86/include/asm/clocksource.h +++ b/arch/x86/include/asm/clocksource.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* x86-specific clocksource additions */ #ifndef _ASM_X86_CLOCKSOURCE_H diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h index e01f7f7ccb0c..6faaf27e8899 100644 --- a/arch/x86/include/asm/cmdline.h +++ b/arch/x86/include/asm/cmdline.h @@ -1,6 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_CMDLINE_H #define _ASM_X86_CMDLINE_H int cmdline_find_option_bool(const char *cmdline_ptr, const char *option); +int cmdline_find_option(const char *cmdline_ptr, const char *option, + char *buffer, int bufsize); #endif /* _ASM_X86_CMDLINE_H */ diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index d90296d061e8..56bd436ed01b 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef ASM_X86_CMPXCHG_H #define ASM_X86_CMPXCHG_H @@ -157,7 +158,7 @@ extern void __add_wrong_size(void) #define __raw_try_cmpxchg(_ptr, _pold, _new, size, lock) \ ({ \ bool success; \ - __typeof__(_ptr) _old = (_pold); \ 
+ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ __typeof__(*(_ptr)) __old = *_old; \ __typeof__(*(_ptr)) __new = (_new); \ switch (size) { \ diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index e4959d023af8..1732704f0445 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_CMPXCHG_32_H #define _ASM_X86_CMPXCHG_32_H diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index caa23a34c963..03cad196a301 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_CMPXCHG_64_H #define _ASM_X86_CMPXCHG_64_H diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 24118c0b4640..9eef9cc64c68 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_COMPAT_H #define _ASM_X86_COMPAT_H @@ -116,7 +117,6 @@ struct compat_statfs { int f_spare[4]; }; -#define COMPAT_RLIM_OLD_INFINITY 0x7fffffff #define COMPAT_RLIM_INFINITY 0xffffffff typedef u32 compat_old_sigset_t; /* at least 32 bits */ diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index 9b7fa6313f1a..adc6cc86b062 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_CPU_H #define _ASM_X86_CPU_H diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h index ff501e511d91..baeba0567126 100644 --- a/arch/x86/include/asm/cpu_device_id.h +++ b/arch/x86/include/asm/cpu_device_id.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _CPU_DEVICE_ID #define _CPU_DEVICE_ID 1 diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index d59c15c3defd..0dfa68438e80 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_CPUFEATURE_H #define _ASM_X86_CPUFEATURE_H diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 2701e5f8145b..793690fbda36 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_CPUFEATURES_H #define _ASM_X86_CPUFEATURES_H @@ -177,7 +178,7 @@ #define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ #define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ #define X86_FEATURE_PTSC ( 6*32+27) /* performance time-stamp counter */ -#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ +#define X86_FEATURE_PERFCTR_LLC ( 6*32+28) /* Last Level Cache performance counter extensions */ #define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ /* @@ -196,6 +197,7 @@ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ +#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ @@ -286,6 +288,8 @@ #define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ #define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause 
filter threshold */ #define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ +#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ +#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */ #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ diff --git a/arch/x86/include/asm/cpumask.h b/arch/x86/include/asm/cpumask.h index 61c852fa346b..6722ffcef2e6 100644 --- a/arch/x86/include/asm/cpumask.h +++ b/arch/x86/include/asm/cpumask.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_CPUMASK_H #define _ASM_X86_CPUMASK_H #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/crash.h b/arch/x86/include/asm/crash.h index f498411f2500..a7adb2bfbf0b 100644 --- a/arch/x86/include/asm/crash.h +++ b/arch/x86/include/asm/crash.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_CRASH_H #define _ASM_X86_CRASH_H diff --git a/arch/x86/include/asm/crypto/aes.h b/arch/x86/include/asm/crypto/aes.h index 80545a1cbe39..c508521dd190 100644 --- a/arch/x86/include/asm/crypto/aes.h +++ b/arch/x86/include/asm/crypto/aes.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef ASM_X86_AES_H #define ASM_X86_AES_H diff --git a/arch/x86/include/asm/crypto/camellia.h b/arch/x86/include/asm/crypto/camellia.h index bb93333d9200..10f8d590bcfe 100644 --- a/arch/x86/include/asm/crypto/camellia.h +++ b/arch/x86/include/asm/crypto/camellia.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef ASM_X86_CAMELLIA_H #define ASM_X86_CAMELLIA_H diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h index ed8b66de541f..553a03de55c3 100644 --- a/arch/x86/include/asm/crypto/glue_helper.h +++ b/arch/x86/include/asm/crypto/glue_helper.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Shared glue code for 128bit block ciphers */ diff --git a/arch/x86/include/asm/crypto/serpent-avx.h b/arch/x86/include/asm/crypto/serpent-avx.h index 33c2b8a435da..c958b7bd0fcb 100644 --- a/arch/x86/include/asm/crypto/serpent-avx.h +++ b/arch/x86/include/asm/crypto/serpent-avx.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef ASM_X86_SERPENT_AVX_H #define ASM_X86_SERPENT_AVX_H diff --git a/arch/x86/include/asm/crypto/serpent-sse2.h b/arch/x86/include/asm/crypto/serpent-sse2.h index e6e77dffbdab..1a345e8a7496 100644 --- a/arch/x86/include/asm/crypto/serpent-sse2.h +++ b/arch/x86/include/asm/crypto/serpent-sse2.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef ASM_X86_SERPENT_SSE2_H #define ASM_X86_SERPENT_SSE2_H diff --git a/arch/x86/include/asm/crypto/twofish.h b/arch/x86/include/asm/crypto/twofish.h index 878c51ceebb5..65bb80adba3e 100644 --- a/arch/x86/include/asm/crypto/twofish.h +++ b/arch/x86/include/asm/crypto/twofish.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef ASM_X86_TWOFISH_H #define ASM_X86_TWOFISH_H diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index 9476c04ee635..3e204e6140b5 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_CURRENT_H #define _ASM_X86_CURRENT_H diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 12cb66f6d3a5..4505ac2735ad 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: 
GPL-2.0 */ #ifndef _ASM_X86_DEBUGREG_H #define _ASM_X86_DEBUGREG_H diff --git a/arch/x86/include/asm/delay.h b/arch/x86/include/asm/delay.h index 36a760bda462..de9e7841f953 100644 --- a/arch/x86/include/asm/delay.h +++ b/arch/x86/include/asm/delay.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_DELAY_H #define _ASM_X86_DELAY_H diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index d0a21b12dd58..0a3e808b9123 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_DESC_H #define _ASM_X86_DESC_H @@ -5,6 +6,7 @@ #include <asm/ldt.h> #include <asm/mmu.h> #include <asm/fixmap.h> +#include <asm/irq_vectors.h> #include <linux/smp.h> #include <linux/percpu.h> @@ -22,7 +24,7 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in desc->s = 1; desc->dpl = 0x3; desc->p = info->seg_not_present ^ 1; - desc->limit = (info->limit & 0xf0000) >> 16; + desc->limit1 = (info->limit & 0xf0000) >> 16; desc->avl = info->useable; desc->d = info->seg_32bit; desc->g = info->limit_in_pages; @@ -83,33 +85,25 @@ static inline phys_addr_t get_cpu_gdt_paddr(unsigned int cpu) return per_cpu_ptr_to_phys(get_cpu_gdt_rw(cpu)); } -#ifdef CONFIG_X86_64 - static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func, unsigned dpl, unsigned ist, unsigned seg) { - gate->offset_low = PTR_LOW(func); + gate->offset_low = (u16) func; + gate->bits.p = 1; + gate->bits.dpl = dpl; + gate->bits.zero = 0; + gate->bits.type = type; + gate->offset_middle = (u16) (func >> 16); +#ifdef CONFIG_X86_64 gate->segment = __KERNEL_CS; - gate->ist = ist; - gate->p = 1; - gate->dpl = dpl; - gate->zero0 = 0; - gate->zero1 = 0; - gate->type = type; - gate->offset_middle = PTR_MIDDLE(func); - gate->offset_high = PTR_HIGH(func); -} - + gate->bits.ist = ist; + gate->reserved = 0; + gate->offset_high = (u32) (func >> 32); #else -static inline void pack_gate(gate_desc *gate, unsigned char type, - unsigned long base, unsigned dpl, unsigned flags, - unsigned short seg) -{ - gate->a = (seg << 16) | (base & 0xffff); - gate->b = (base & 0xffff0000) | (((0x80 | type | (dpl << 5)) & 0xff) << 8); -} - + gate->segment = seg; + gate->bits.ist = 0; #endif +} static inline int desc_empty(const void *ptr) { @@ -128,7 +122,6 @@ static inline int desc_empty(const void *ptr) #define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt)) #define store_gdt(dtr) native_store_gdt(dtr) -#define store_idt(dtr) native_store_idt(dtr) #define store_tr(tr) (tr = native_store_tr()) #define load_TLS(t, cpu) native_load_tls(t, cpu) @@ -173,35 +166,22 @@ native_write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc, int memcpy(&gdt[entry], desc, size); } -static inline void pack_descriptor(struct desc_struct *desc, unsigned long base, - unsigned long limit, unsigned char type, - unsigned char flags) -{ - desc->a = ((base & 0xffff) << 16) | (limit & 0xffff); - desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) | - (limit & 0x000f0000) | ((type & 0xff) << 8) | - ((flags & 0xf) << 20); - desc->p = 1; -} - - -static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned type, unsigned size) +static inline void set_tssldt_descriptor(void *d, unsigned long addr, + unsigned type, unsigned size) { -#ifdef CONFIG_X86_64 - struct ldttss_desc64 *desc = d; + struct ldttss_desc *desc = d; memset(desc, 0, sizeof(*desc)); - desc->limit0 = size & 0xFFFF; - desc->base0 = PTR_LOW(addr); - 
desc->base1 = PTR_MIDDLE(addr) & 0xFF; + desc->limit0 = (u16) size; + desc->base0 = (u16) addr; + desc->base1 = (addr >> 16) & 0xFF; desc->type = type; desc->p = 1; desc->limit1 = (size >> 16) & 0xF; - desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF; - desc->base3 = PTR_HIGH(addr); -#else - pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0); + desc->base2 = (addr >> 24) & 0xFF; +#ifdef CONFIG_X86_64 + desc->base3 = (u32) (addr >> 32); #endif } @@ -248,7 +228,7 @@ static inline void native_store_gdt(struct desc_ptr *dtr) asm volatile("sgdt %0":"=m" (*dtr)); } -static inline void native_store_idt(struct desc_ptr *dtr) +static inline void store_idt(struct desc_ptr *dtr) { asm volatile("sidt %0":"=m" (*dtr)); } @@ -401,147 +381,20 @@ static inline void set_desc_base(struct desc_struct *desc, unsigned long base) static inline unsigned long get_desc_limit(const struct desc_struct *desc) { - return desc->limit0 | (desc->limit << 16); + return desc->limit0 | (desc->limit1 << 16); } static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) { desc->limit0 = limit & 0xffff; - desc->limit = (limit >> 16) & 0xf; -} - -#ifdef CONFIG_X86_64 -static inline void set_nmi_gate(int gate, void *addr) -{ - gate_desc s; - - pack_gate(&s, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS); - write_idt_entry(debug_idt_table, gate, &s); + desc->limit1 = (limit >> 16) & 0xf; } -#endif -#ifdef CONFIG_TRACING -extern struct desc_ptr trace_idt_descr; -extern gate_desc trace_idt_table[]; -static inline void write_trace_idt_entry(int entry, const gate_desc *gate) -{ - write_idt_entry(trace_idt_table, entry, gate); -} +void update_intr_gate(unsigned int n, const void *addr); +void alloc_intr_gate(unsigned int n, const void *addr); -static inline void _trace_set_gate(int gate, unsigned type, void *addr, - unsigned dpl, unsigned ist, unsigned seg) -{ - gate_desc s; - - pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg); - /* - * does not need to be atomic because it is only done once at - * setup time - */ - write_trace_idt_entry(gate, &s); -} -#else -static inline void write_trace_idt_entry(int entry, const gate_desc *gate) -{ -} - -#define _trace_set_gate(gate, type, addr, dpl, ist, seg) -#endif - -static inline void _set_gate(int gate, unsigned type, void *addr, - unsigned dpl, unsigned ist, unsigned seg) -{ - gate_desc s; - - pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg); - /* - * does not need to be atomic because it is only done once at - * setup time - */ - write_idt_entry(idt_table, gate, &s); - write_trace_idt_entry(gate, &s); -} - -/* - * This needs to use 'idt_table' rather than 'idt', and - * thus use the _nonmapped_ version of the IDT, as the - * Pentium F0 0F bugfix can have resulted in the mapped - * IDT being write-protected. 
- */ -#define set_intr_gate_notrace(n, addr) \ - do { \ - BUG_ON((unsigned)n > 0xFF); \ - _set_gate(n, GATE_INTERRUPT, (void *)addr, 0, 0, \ - __KERNEL_CS); \ - } while (0) - -#define set_intr_gate(n, addr) \ - do { \ - set_intr_gate_notrace(n, addr); \ - _trace_set_gate(n, GATE_INTERRUPT, (void *)trace_##addr,\ - 0, 0, __KERNEL_CS); \ - } while (0) - -extern int first_system_vector; -/* used_vectors is BITMAP for irq is not managed by percpu vector_irq */ extern unsigned long used_vectors[]; -static inline void alloc_system_vector(int vector) -{ - if (!test_bit(vector, used_vectors)) { - set_bit(vector, used_vectors); - if (first_system_vector > vector) - first_system_vector = vector; - } else { - BUG(); - } -} - -#define alloc_intr_gate(n, addr) \ - do { \ - alloc_system_vector(n); \ - set_intr_gate(n, addr); \ - } while (0) - -/* - * This routine sets up an interrupt gate at directory privilege level 3. - */ -static inline void set_system_intr_gate(unsigned int n, void *addr) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS); -} - -static inline void set_system_trap_gate(unsigned int n, void *addr) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS); -} - -static inline void set_trap_gate(unsigned int n, void *addr) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS); -} - -static inline void set_task_gate(unsigned int n, unsigned int gdt_entry) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3)); -} - -static inline void set_intr_gate_ist(int n, void *addr, unsigned ist) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS); -} - -static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist) -{ - BUG_ON((unsigned)n > 0xFF); - _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); -} - #ifdef CONFIG_X86_64 DECLARE_PER_CPU(u32, debug_idt_ctr); static inline bool is_debug_idt_enabled(void) @@ -567,31 +420,6 @@ static inline void load_debug_idt(void) } #endif -#ifdef CONFIG_TRACING -extern atomic_t trace_idt_ctr; -static inline bool is_trace_idt_enabled(void) -{ - if (atomic_read(&trace_idt_ctr)) - return true; - - return false; -} - -static inline void load_trace_idt(void) -{ - load_idt((const struct desc_ptr *)&trace_idt_descr); -} -#else -static inline bool is_trace_idt_enabled(void) -{ - return false; -} - -static inline void load_trace_idt(void) -{ -} -#endif - /* * The load_current_idt() must be called with interrupts disabled * to avoid races. 
That way the IDT will always be set back to the expected @@ -603,9 +431,25 @@ static inline void load_current_idt(void) { if (is_debug_idt_enabled()) load_debug_idt(); - else if (is_trace_idt_enabled()) - load_trace_idt(); else load_idt((const struct desc_ptr *)&idt_descr); } + +extern void idt_setup_early_handler(void); +extern void idt_setup_early_traps(void); +extern void idt_setup_traps(void); +extern void idt_setup_apic_and_irq_gates(void); + +#ifdef CONFIG_X86_64 +extern void idt_setup_early_pf(void); +extern void idt_setup_ist_traps(void); +extern void idt_setup_debugidt_traps(void); +#else +static inline void idt_setup_early_pf(void) { } +static inline void idt_setup_ist_traps(void) { } +static inline void idt_setup_debugidt_traps(void) { } +#endif + +extern void idt_invalidate(void *addr); + #endif /* _ASM_X86_DESC_H */ diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h index 49265345d4d2..a91f3b6e4f2a 100644 --- a/arch/x86/include/asm/desc_defs.h +++ b/arch/x86/include/asm/desc_defs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Written 2000 by Andi Kleen */ #ifndef _ASM_X86_DESC_DEFS_H #define _ASM_X86_DESC_DEFS_H @@ -11,34 +12,30 @@ #include <linux/types.h> -/* - * FIXME: Accessing the desc_struct through its fields is more elegant, - * and should be the one valid thing to do. However, a lot of open code - * still touches the a and b accessors, and doing this allow us to do it - * incrementally. We keep the signature as a struct, rather than a union, - * so we can get rid of it transparently in the future -- glommer - */ /* 8 byte segment descriptor */ struct desc_struct { - union { - struct { - unsigned int a; - unsigned int b; - }; - struct { - u16 limit0; - u16 base0; - unsigned base1: 8, type: 4, s: 1, dpl: 2, p: 1; - unsigned limit: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8; - }; - }; + u16 limit0; + u16 base0; + u16 base1: 8, type: 4, s: 1, dpl: 2, p: 1; + u16 limit1: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8; } __attribute__((packed)); -#define GDT_ENTRY_INIT(flags, base, limit) { { { \ - .a = ((limit) & 0xffff) | (((base) & 0xffff) << 16), \ - .b = (((base) & 0xff0000) >> 16) | (((flags) & 0xf0ff) << 8) | \ - ((limit) & 0xf0000) | ((base) & 0xff000000), \ - } } } +#define GDT_ENTRY_INIT(flags, base, limit) \ + { \ + .limit0 = (u16) (limit), \ + .limit1 = ((limit) >> 16) & 0x0F, \ + .base0 = (u16) (base), \ + .base1 = ((base) >> 16) & 0xFF, \ + .base2 = ((base) >> 24) & 0xFF, \ + .type = (flags & 0x0f), \ + .s = (flags >> 4) & 0x01, \ + .dpl = (flags >> 5) & 0x03, \ + .p = (flags >> 7) & 0x01, \ + .avl = (flags >> 12) & 0x01, \ + .l = (flags >> 13) & 0x01, \ + .d = (flags >> 14) & 0x01, \ + .g = (flags >> 15) & 0x01, \ + } enum { GATE_INTERRUPT = 0xE, @@ -47,49 +44,63 @@ enum { GATE_TASK = 0x5, }; -/* 16byte gate */ -struct gate_struct64 { - u16 offset_low; - u16 segment; - unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1; - u16 offset_middle; - u32 offset_high; - u32 zero1; -} __attribute__((packed)); - -#define PTR_LOW(x) ((unsigned long long)(x) & 0xFFFF) -#define PTR_MIDDLE(x) (((unsigned long long)(x) >> 16) & 0xFFFF) -#define PTR_HIGH(x) ((unsigned long long)(x) >> 32) - enum { DESC_TSS = 0x9, DESC_LDT = 0x2, DESCTYPE_S = 0x10, /* !system */ }; -/* LDT or TSS descriptor in the GDT. 16 bytes. */ -struct ldttss_desc64 { - u16 limit0; - u16 base0; - unsigned base1 : 8, type : 5, dpl : 2, p : 1; - unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8; - u32 base3; - u32 zero1; +/* LDT or TSS descriptor in the GDT. 
*/ +struct ldttss_desc { + u16 limit0; + u16 base0; + + u16 base1 : 8, type : 5, dpl : 2, p : 1; + u16 limit1 : 4, zero0 : 3, g : 1, base2 : 8; +#ifdef CONFIG_X86_64 + u32 base3; + u32 zero1; +#endif } __attribute__((packed)); +typedef struct ldttss_desc ldt_desc; +typedef struct ldttss_desc tss_desc; + +struct idt_bits { + u16 ist : 3, + zero : 5, + type : 5, + dpl : 2, + p : 1; +} __attribute__((packed)); + +struct gate_struct { + u16 offset_low; + u16 segment; + struct idt_bits bits; + u16 offset_middle; +#ifdef CONFIG_X86_64 + u32 offset_high; + u32 reserved; +#endif +} __attribute__((packed)); + +typedef struct gate_struct gate_desc; + +static inline unsigned long gate_offset(const gate_desc *g) +{ #ifdef CONFIG_X86_64 -typedef struct gate_struct64 gate_desc; -typedef struct ldttss_desc64 ldt_desc; -typedef struct ldttss_desc64 tss_desc; -#define gate_offset(g) ((g).offset_low | ((unsigned long)(g).offset_middle << 16) | ((unsigned long)(g).offset_high << 32)) -#define gate_segment(g) ((g).segment) + return g->offset_low | ((unsigned long)g->offset_middle << 16) | + ((unsigned long) g->offset_high << 32); #else -typedef struct desc_struct gate_desc; -typedef struct desc_struct ldt_desc; -typedef struct desc_struct tss_desc; -#define gate_offset(g) (((g).b & 0xffff0000) | ((g).a & 0x0000ffff)) -#define gate_segment(g) ((g).a >> 16) + return g->offset_low | ((unsigned long)g->offset_middle << 16); #endif +} + +static inline unsigned long gate_segment(const gate_desc *g) +{ + return g->segment; +} struct desc_ptr { unsigned short size; diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h index 1b3ef26e77df..5e12c63b47aa 100644 --- a/arch/x86/include/asm/device.h +++ b/arch/x86/include/asm/device.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_DEVICE_H #define _ASM_X86_DEVICE_H diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 5dff775af7cd..c10c9128f54e 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -21,11 +21,13 @@ # define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31)) # define DISABLE_CYRIX_ARR (1<<(X86_FEATURE_CYRIX_ARR & 31)) # define DISABLE_CENTAUR_MCR (1<<(X86_FEATURE_CENTAUR_MCR & 31)) +# define DISABLE_PCID 0 #else # define DISABLE_VME 0 # define DISABLE_K6_MTRR 0 # define DISABLE_CYRIX_ARR 0 # define DISABLE_CENTAUR_MCR 0 +# define DISABLE_PCID (1<<(X86_FEATURE_PCID & 31)) #endif /* CONFIG_X86_64 */ #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS @@ -49,7 +51,7 @@ #define DISABLED_MASK1 0 #define DISABLED_MASK2 0 #define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR) -#define DISABLED_MASK4 0 +#define DISABLED_MASK4 (DISABLE_PCID) #define DISABLED_MASK5 0 #define DISABLED_MASK6 0 #define DISABLED_MASK7 0 diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h index af95c47d5c9e..20a46150e0a8 100644 --- a/arch/x86/include/asm/div64.h +++ b/arch/x86/include/asm/div64.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_DIV64_H #define _ASM_X86_DIV64_H diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 08a0838b83fb..836ca1178a6a 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_DMA_MAPPING_H #define _ASM_X86_DMA_MAPPING_H @@ -12,6 +13,7 @@ #include <asm/io.h> #include <asm/swiotlb.h> #include 
<linux/dma-contiguous.h> +#include <linux/mem_encrypt.h> #ifdef CONFIG_ISA # define ISA_DMA_BIT_MASK DMA_BIT_MASK(24) @@ -19,8 +21,6 @@ # define ISA_DMA_BIT_MASK DMA_BIT_MASK(32) #endif -#define DMA_ERROR_CODE 0 - extern int iommu_merge; extern struct device x86_dma_fallback_dev; extern int panic_on_overflow; @@ -35,9 +35,6 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp); #define arch_dma_alloc_attrs arch_dma_alloc_attrs -#define HAVE_ARCH_DMA_SUPPORTED 1 -extern int dma_supported(struct device *hwdev, u64 mask); - extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag, unsigned long attrs); @@ -62,12 +59,12 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) { - return paddr; + return __sme_set(paddr); } static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) { - return daddr; + return __sme_clr(daddr); } #endif /* CONFIG_X86_DMA_REMAP */ diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h index fe884e18fa6e..00f7cf45e699 100644 --- a/arch/x86/include/asm/dma.h +++ b/arch/x86/include/asm/dma.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * linux/include/asm/dma.h: Defines for using and allocating dma channels. * Written by Hennus Bergman, 1992. diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h index 3c69fed215c5..0ab2ab27ad1f 100644 --- a/arch/x86/include/asm/dmi.h +++ b/arch/x86/include/asm/dmi.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_DMI_H #define _ASM_X86_DMI_H @@ -13,9 +14,9 @@ static __always_inline __init void *dmi_alloc(unsigned len) } /* Use early IO mappings for DMI because it's initialized early */ -#define dmi_early_remap early_ioremap -#define dmi_early_unmap early_iounmap -#define dmi_remap ioremap_cache -#define dmi_unmap iounmap +#define dmi_early_remap early_memremap +#define dmi_early_unmap early_memunmap +#define dmi_remap(_x, _l) memremap(_x, _l, MEMREMAP_WB) +#define dmi_unmap(_x) memunmap(_x) #endif /* _ASM_X86_DMI_H */ diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h index b7a1ab865d68..ae391f609840 100644 --- a/arch/x86/include/asm/dwarf2.h +++ b/arch/x86/include/asm/dwarf2.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_DWARF2_H #define _ASM_X86_DWARF2_H diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h index 8e0f8b85b209..62be73b23d5c 100644 --- a/arch/x86/include/asm/e820/api.h +++ b/arch/x86/include/asm/e820/api.h @@ -1,9 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_E820_API_H #define _ASM_E820_API_H #include <asm/e820/types.h> extern struct e820_table *e820_table; +extern struct e820_table *e820_table_kexec; extern struct e820_table *e820_table_firmware; extern unsigned long pci_mem_start; @@ -38,6 +40,8 @@ extern void e820__setup_pci_gap(void); extern void e820__reallocate_tables(void); extern void e820__register_nosave_regions(unsigned long limit_pfn); +extern int e820__get_entry_type(u64 start, u64 end); + /* * Returns true iff the specified range [start,end) is completely contained inside * the ISA region. 
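The e820/api.h hunk above exports e820__get_entry_type(), which reports the E820 (firmware memory map) entry type for a physical address range. A minimal sketch of a caller, assuming E820_TYPE_RAM from <asm/e820/types.h> (already pulled in by <asm/e820/api.h>) and treating any other return value as "not plain RAM"; the wrapper below is hypothetical and not part of this patch:

/*
 * Hypothetical user of the newly exported e820__get_entry_type().
 * Assumes E820_TYPE_RAM from <asm/e820/types.h>; not part of this patch.
 */
#include <linux/types.h>
#include <asm/e820/api.h>

static bool phys_range_is_plain_ram(u64 start, u64 end)
{
	/* Anything other than plain RAM (reserved, ACPI, NVS, ...) fails. */
	return e820__get_entry_type(start, end) == E820_TYPE_RAM;
}
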
diff --git a/arch/x86/include/asm/e820/types.h b/arch/x86/include/asm/e820/types.h index 4adeed03a9a1..c3aa4b5e49e2 100644 --- a/arch/x86/include/asm/e820/types.h +++ b/arch/x86/include/asm/e820/types.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_E820_TYPES_H #define _ASM_E820_TYPES_H diff --git a/arch/x86/include/asm/edac.h b/arch/x86/include/asm/edac.h index cf8fdf83b231..426fc53ff803 100644 --- a/arch/x86/include/asm/edac.h +++ b/arch/x86/include/asm/edac.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_EDAC_H #define _ASM_X86_EDAC_H diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 2f77bcefe6b4..85f6ccb80b91 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_EFI_H #define _ASM_X86_EFI_H @@ -33,7 +34,7 @@ #ifdef CONFIG_X86_32 -extern unsigned long asmlinkage efi_call_phys(void *, ...); +extern asmlinkage unsigned long efi_call_phys(void *, ...); #define arch_efi_call_virt_setup() kernel_fpu_begin() #define arch_efi_call_virt_teardown() kernel_fpu_end() @@ -52,7 +53,7 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...); #define EFI_LOADER_SIGNATURE "EL64" -extern u64 asmlinkage efi_call(void *fp, ...); +extern asmlinkage u64 efi_call(void *fp, ...); #define efi_call_phys(f, args...) efi_call((f), args) @@ -74,7 +75,7 @@ struct efi_scratch { __kernel_fpu_begin(); \ \ if (efi_scratch.use_pgd) { \ - efi_scratch.prev_cr3 = read_cr3(); \ + efi_scratch.prev_cr3 = __read_cr3(); \ write_cr3((unsigned long)efi_scratch.efi_pgt); \ __flush_tlb_all(); \ } \ diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index e8ab9a46bc68..c1a125e47ff3 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_ELF_H #define _ASM_X86_ELF_H @@ -126,15 +127,15 @@ do { \ pr_reg[4] = regs->di; \ pr_reg[5] = regs->bp; \ pr_reg[6] = regs->ax; \ - pr_reg[7] = regs->ds & 0xffff; \ - pr_reg[8] = regs->es & 0xffff; \ - pr_reg[9] = regs->fs & 0xffff; \ + pr_reg[7] = regs->ds; \ + pr_reg[8] = regs->es; \ + pr_reg[9] = regs->fs; \ pr_reg[11] = regs->orig_ax; \ pr_reg[12] = regs->ip; \ - pr_reg[13] = regs->cs & 0xffff; \ + pr_reg[13] = regs->cs; \ pr_reg[14] = regs->flags; \ pr_reg[15] = regs->sp; \ - pr_reg[16] = regs->ss & 0xffff; \ + pr_reg[16] = regs->ss; \ } while (0); #define ELF_CORE_COPY_REGS(pr_reg, regs) \ @@ -204,6 +205,7 @@ void set_personality_ia32(bool); #define ELF_CORE_COPY_REGS(pr_reg, regs) \ do { \ + unsigned long base; \ unsigned v; \ (pr_reg)[0] = (regs)->r15; \ (pr_reg)[1] = (regs)->r14; \ @@ -226,8 +228,8 @@ do { \ (pr_reg)[18] = (regs)->flags; \ (pr_reg)[19] = (regs)->sp; \ (pr_reg)[20] = (regs)->ss; \ - (pr_reg)[21] = current->thread.fsbase; \ - (pr_reg)[22] = current->thread.gsbase; \ + rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base; \ + rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base; \ asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \ asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \ asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \ @@ -245,12 +247,13 @@ extern int force_personality32; #define CORE_DUMP_USE_REGSET #define ELF_EXEC_PAGESIZE 4096 -/* This is the location that an ET_DYN program is loaded if exec'ed. Typical - use of this is to invoke "./ld.so someprog" to test out a new version of - the loader. 
We need to make sure that it is out of the way of the program - that it will "exec", and that there is sufficient room for the brk. */ - -#define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2) +/* + * This is the base location for PIE (ET_DYN with INTERP) loads. On + * 64-bit, this is above 4GB to leave the entire 32-bit address + * space open for things that want to use the area for 32-bit pointers. + */ +#define ELF_ET_DYN_BASE (mmap_is_ia32() ? 0x000400000UL : \ + (TASK_SIZE / 3 * 2)) /* This yields a mask that user programs can use to figure out what instruction set this CPU supports. This could be done in user space, @@ -303,8 +306,8 @@ static inline int mmap_is_ia32(void) test_thread_flag(TIF_ADDR32)); } -extern unsigned long tasksize_32bit(void); -extern unsigned long tasksize_64bit(void); +extern unsigned long task_size_32bit(void); +extern unsigned long task_size_64bit(int full_addr_space); extern unsigned long get_mmap_base(int is_legacy); #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/emergency-restart.h b/arch/x86/include/asm/emergency-restart.h index 77a99ac06d00..2abde717db31 100644 --- a/arch/x86/include/asm/emergency-restart.h +++ b/arch/x86/include/asm/emergency-restart.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_EMERGENCY_RESTART_H #define _ASM_X86_EMERGENCY_RESTART_H diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index df002992d8fd..416422762845 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * This file is designed to contain the BUILD_INTERRUPT specifications for * all of the extra named interrupt vectors used by the architecture. @@ -13,18 +14,14 @@ BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR) BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR) BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) -BUILD_INTERRUPT3(irq_move_cleanup_interrupt, IRQ_MOVE_CLEANUP_VECTOR, - smp_irq_move_cleanup_interrupt) -BUILD_INTERRUPT3(reboot_interrupt, REBOOT_VECTOR, smp_reboot_interrupt) +BUILD_INTERRUPT(irq_move_cleanup_interrupt, IRQ_MOVE_CLEANUP_VECTOR) +BUILD_INTERRUPT(reboot_interrupt, REBOOT_VECTOR) #endif -BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) - #ifdef CONFIG_HAVE_KVM -BUILD_INTERRUPT3(kvm_posted_intr_ipi, POSTED_INTR_VECTOR, - smp_kvm_posted_intr_ipi) -BUILD_INTERRUPT3(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR, - smp_kvm_posted_intr_wakeup_ipi) +BUILD_INTERRUPT(kvm_posted_intr_ipi, POSTED_INTR_VECTOR) +BUILD_INTERRUPT(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR) +BUILD_INTERRUPT(kvm_posted_intr_nested_ipi, POSTED_INTR_NESTED_VECTOR) #endif /* @@ -39,6 +36,7 @@ BUILD_INTERRUPT3(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR, BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR) BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) +BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) #ifdef CONFIG_IRQ_WORK BUILD_INTERRUPT(irq_work_interrupt, IRQ_WORK_VECTOR) diff --git a/arch/x86/include/asm/espfix.h b/arch/x86/include/asm/espfix.h index ca3ce9ab9385..0211029076ea 100644 --- a/arch/x86/include/asm/espfix.h +++ b/arch/x86/include/asm/espfix.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_ESPFIX_H #define _ASM_X86_ESPFIX_H diff --git a/arch/x86/include/asm/export.h b/arch/x86/include/asm/export.h index 138de56b13eb..2a51d66689c5 100644 --- 
a/arch/x86/include/asm/export.h +++ b/arch/x86/include/asm/export.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifdef CONFIG_64BIT #define KSYM_ALIGN 16 #endif diff --git a/arch/x86/include/asm/extable.h b/arch/x86/include/asm/extable.h index c66d19e3c23e..f9c3a5d502f4 100644 --- a/arch/x86/include/asm/extable.h +++ b/arch/x86/include/asm/extable.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_EXTABLE_H #define _ASM_X86_EXTABLE_H /* diff --git a/arch/x86/include/asm/fb.h b/arch/x86/include/asm/fb.h index c3dd5e71f439..ab4c960146e3 100644 --- a/arch/x86/include/asm/fb.h +++ b/arch/x86/include/asm/fb.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_FB_H #define _ASM_X86_FB_H diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index b65155cc3760..dcd9fb55e679 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -157,6 +157,26 @@ static inline void __set_fixmap(enum fixed_addresses idx, } #endif +/* + * FIXMAP_PAGE_NOCACHE is used for MMIO. Memory encryption is not + * supported for MMIO addresses, so make sure that the memory encryption + * mask is not part of the page attributes. + */ +#define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_IO_NOCACHE + +/* + * Early memremap routines used for in-place encryption. The mappings created + * by these routines are intended to be used as temporary mappings. + */ +void __init *early_memremap_encrypted(resource_size_t phys_addr, + unsigned long size); +void __init *early_memremap_encrypted_wp(resource_size_t phys_addr, + unsigned long size); +void __init *early_memremap_decrypted(resource_size_t phys_addr, + unsigned long size); +void __init *early_memremap_decrypted_wp(resource_size_t phys_addr, + unsigned long size); + #include <asm-generic/fixmap.h> #define __late_set_fixmap(idx, phys, flags) __set_fixmap(idx, phys, flags) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 0877ae018fc9..a9caac9d4a72 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 1994 Linus Torvalds * diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 255645f60ca2..a38bf5a1e37a 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 1994 Linus Torvalds * @@ -23,11 +24,9 @@ /* * High level FPU state handling functions: */ -extern void fpu__activate_curr(struct fpu *fpu); -extern void fpu__activate_fpstate_read(struct fpu *fpu); -extern void fpu__activate_fpstate_write(struct fpu *fpu); -extern void fpu__current_fpstate_write_begin(void); -extern void fpu__current_fpstate_write_end(void); +extern void fpu__initialize(struct fpu *fpu); +extern void fpu__prepare_read(struct fpu *fpu); +extern void fpu__prepare_write(struct fpu *fpu); extern void fpu__save(struct fpu *fpu); extern void fpu__restore(struct fpu *fpu); extern int fpu__restore_sig(void __user *buf, int ia32_frame); @@ -120,20 +119,11 @@ extern void fpstate_sanitize_xstate(struct fpu *fpu); err; \ }) -#define check_insn(insn, output, input...) \ -({ \ - int err; \ +#define kernel_insn(insn, output, input...) 
\ asm volatile("1:" #insn "\n\t" \ "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: movl $-1,%[err]\n" \ - " jmp 2b\n" \ - ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ - : [err] "=r" (err), output \ - : "0"(0), input); \ - err; \ -}) + _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_fprestore) \ + : output : input) static inline int copy_fregs_to_user(struct fregs_state __user *fx) { @@ -153,20 +143,16 @@ static inline int copy_fxregs_to_user(struct fxregs_state __user *fx) static inline void copy_kernel_to_fxregs(struct fxregs_state *fx) { - int err; - if (IS_ENABLED(CONFIG_X86_32)) { - err = check_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } else { if (IS_ENABLED(CONFIG_AS_FXSAVEQ)) { - err = check_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); + kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); } else { /* See comment in copy_fxregs_to_kernel() below. */ - err = check_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), "m" (*fx)); + kernel_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), "m" (*fx)); } } - /* Copying from a kernel buffer to FPU registers should never fail: */ - WARN_ON_FPU(err); } static inline int copy_user_to_fxregs(struct fxregs_state __user *fx) @@ -183,9 +169,7 @@ static inline int copy_user_to_fxregs(struct fxregs_state __user *fx) static inline void copy_kernel_to_fregs(struct fregs_state *fx) { - int err = check_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); - - WARN_ON_FPU(err); + kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline int copy_user_to_fregs(struct fregs_state __user *fx) @@ -281,18 +265,13 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu) * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact * XSAVE area format. */ -#define XSTATE_XRESTORE(st, lmask, hmask, err) \ +#define XSTATE_XRESTORE(st, lmask, hmask) \ asm volatile(ALTERNATIVE(XRSTOR, \ XRSTORS, X86_FEATURE_XSAVES) \ "\n" \ - "xor %[err], %[err]\n" \ "3:\n" \ - ".pushsection .fixup,\"ax\"\n" \ - "4: movl $-2, %[err]\n" \ - "jmp 3b\n" \ - ".popsection\n" \ - _ASM_EXTABLE(661b, 4b) \ - : [err] "=r" (err) \ + _ASM_EXTABLE_HANDLE(661b, 3b, ex_handler_fprestore)\ + : \ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") @@ -336,7 +315,10 @@ static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate) else XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); - /* We should never fault when copying from a kernel buffer: */ + /* + * We should never fault when copying from a kernel buffer, and the FPU + * state we set at boot time should be valid. 
+ */ WARN_ON_FPU(err); } @@ -350,7 +332,7 @@ static inline void copy_xregs_to_kernel(struct xregs_state *xstate) u32 hmask = mask >> 32; int err; - WARN_ON(!alternatives_patched); + WARN_ON_FPU(!alternatives_patched); XSTATE_XSAVE(xstate, lmask, hmask, err); @@ -365,12 +347,8 @@ static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask) { u32 lmask = mask; u32 hmask = mask >> 32; - int err; - - XSTATE_XRESTORE(xstate, lmask, hmask, err); - /* We should never fault when copying from a kernel buffer: */ - WARN_ON_FPU(err); + XSTATE_XRESTORE(xstate, lmask, hmask); } /* @@ -450,10 +428,10 @@ static inline int copy_fpregs_to_fpstate(struct fpu *fpu) return 0; } -static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate) +static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask) { if (use_xsave()) { - copy_kernel_to_xregs(&fpstate->xsave, -1); + copy_kernel_to_xregs(&fpstate->xsave, mask); } else { if (use_fxsr()) copy_kernel_to_fxregs(&fpstate->fxsave); @@ -477,7 +455,7 @@ static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate) : : [addr] "m" (fpstate)); } - __copy_kernel_to_fpregs(fpstate); + __copy_kernel_to_fpregs(fpstate, -1); } extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); @@ -526,38 +504,17 @@ static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) */ static inline void fpregs_deactivate(struct fpu *fpu) { - WARN_ON_FPU(!fpu->fpregs_active); - - fpu->fpregs_active = 0; this_cpu_write(fpu_fpregs_owner_ctx, NULL); trace_x86_fpu_regs_deactivated(fpu); } static inline void fpregs_activate(struct fpu *fpu) { - WARN_ON_FPU(fpu->fpregs_active); - - fpu->fpregs_active = 1; this_cpu_write(fpu_fpregs_owner_ctx, fpu); trace_x86_fpu_regs_activated(fpu); } /* - * The question "does this thread have fpu access?" - * is slightly racy, since preemption could come in - * and revoke it immediately after the test. - * - * However, even in that very unlikely scenario, - * we can just assume we have FPU access - typically - * to save the FP state - we'll just take a #NM - * fault and get the FPU access back. - */ -static inline int fpregs_active(void) -{ - return current->thread.fpu.fpregs_active; -} - -/* * FPU state switching for scheduling. * * This is a two-stage process: @@ -571,14 +528,13 @@ static inline int fpregs_active(void) static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) { - if (old_fpu->fpregs_active) { + if (old_fpu->initialized) { if (!copy_fpregs_to_fpstate(old_fpu)) old_fpu->last_cpu = -1; else old_fpu->last_cpu = cpu; /* But leave fpu_fpregs_owner_ctx! 
*/ - old_fpu->fpregs_active = 0; trace_x86_fpu_regs_deactivated(old_fpu); } else old_fpu->last_cpu = -1; @@ -595,7 +551,7 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu) static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu) { bool preload = static_cpu_has(X86_FEATURE_FPU) && - new_fpu->fpstate_active; + new_fpu->initialized; if (preload) { if (!fpregs_state_valid(new_fpu, cpu)) @@ -617,8 +573,7 @@ static inline void user_fpu_begin(void) struct fpu *fpu = &current->thread.fpu; preempt_disable(); - if (!fpregs_active()) - fpregs_activate(fpu); + fpregs_activate(fpu); preempt_enable(); } diff --git a/arch/x86/include/asm/fpu/regset.h b/arch/x86/include/asm/fpu/regset.h index 39d3107ac6c7..d5bdffb9d27f 100644 --- a/arch/x86/include/asm/fpu/regset.h +++ b/arch/x86/include/asm/fpu/regset.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * FPU regset handling methods: */ diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h index 20a1fbf7fe4e..4df2754ef380 100644 --- a/arch/x86/include/asm/fpu/signal.h +++ b/arch/x86/include/asm/fpu/signal.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * x86 FPU signal frame handling methods: */ diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 3c80f5b9c09d..202c53918ecf 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * FPU data structures: */ @@ -68,6 +69,9 @@ struct fxregs_state { /* Default value for fxregs_state.mxcsr: */ #define MXCSR_DEFAULT 0x1f80 +/* Copy both mxcsr & mxcsr_flags with a single u64 memcpy: */ +#define MXCSR_AND_FLAGS_SIZE sizeof(u64) + /* * Software based FPU emulation state. This is arbitrary really, * it matches the x87 format to make it easier to understand: @@ -290,36 +294,13 @@ struct fpu { unsigned int last_cpu; /* - * @fpstate_active: + * @initialized: * - * This flag indicates whether this context is active: if the task + * This flag indicates whether this context is initialized: if the task * is not running then we can restore from this context, if the task * is running then we should save into this context. */ - unsigned char fpstate_active; - - /* - * @fpregs_active: - * - * This flag determines whether a given context is actively - * loaded into the FPU's registers and that those registers - * represent the task's current FPU state. - * - * Note the interaction with fpstate_active: - * - * # task does not use the FPU: - * fpstate_active == 0 - * - * # task uses the FPU and regs are active: - * fpstate_active == 1 && fpregs_active == 1 - * - * # the regs are inactive but still match fpstate: - * fpstate_active == 1 && fpregs_active == 0 && fpregs_owner == fpu - * - * The third state is what we use for the lazy restore optimization - * on lazy-switching CPUs. 
- */ - unsigned char fpregs_active; + unsigned char initialized; /* * @state: diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 1b2799e0699a..48581988d78c 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ASM_X86_XSAVE_H #define __ASM_X86_XSAVE_H @@ -48,8 +49,12 @@ void fpu__xstate_clear_all_cpu_caps(void); void *get_xsave_addr(struct xregs_state *xsave, int xstate); const void *get_xsave_field_ptr(int xstate_field); int using_compacted_format(void); -int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf, - void __user *ubuf, struct xregs_state *xsave); -int copyin_to_xsaves(const void *kbuf, const void __user *ubuf, - struct xregs_state *xsave); +int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); +int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); +int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); +int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); + +/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ +extern int validate_xstate_header(const struct xstate_header *hdr); + #endif diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h index 6e4d170726b7..5cbce6fbb534 100644 --- a/arch/x86/include/asm/frame.h +++ b/arch/x86/include/asm/frame.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_FRAME_H #define _ASM_X86_FRAME_H diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index eccd0ac6bc38..09ad88572746 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_FTRACE_H #define _ASM_X86_FTRACE_H diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h index b4c1f5453436..de4d68852d3a 100644 --- a/arch/x86/include/asm/futex.h +++ b/arch/x86/include/asm/futex.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_FUTEX_H #define _ASM_X86_FUTEX_H @@ -41,20 +42,11 @@ "+m" (*uaddr), "=&r" (tem) \ : "r" (oparg), "i" (-EFAULT), "1" (0)) -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret, tem; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); switch (op) { @@ -80,30 +72,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (oldval == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (oldval != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (oldval < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (oldval >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (oldval <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (oldval > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h index 1d268098ac2e..318556574345 100644 --- 
a/arch/x86/include/asm/gart.h +++ b/arch/x86/include/asm/gart.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_GART_H #define _ASM_X86_GART_H diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 59405a248fc2..51cc979dd364 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_HARDIRQ_H #define _ASM_X86_HARDIRQ_H @@ -15,6 +16,7 @@ typedef struct { #ifdef CONFIG_HAVE_KVM unsigned int kvm_posted_intr_ipis; unsigned int kvm_posted_intr_wakeup_ipis; + unsigned int kvm_posted_intr_nested_ipis; #endif unsigned int x86_platform_ipis; /* arch dependent */ unsigned int apic_perf_irqs; @@ -22,8 +24,8 @@ typedef struct { #ifdef CONFIG_SMP unsigned int irq_resched_count; unsigned int irq_call_count; - unsigned int irq_tlb_count; #endif + unsigned int irq_tlb_count; #ifdef CONFIG_X86_THERMAL_VECTOR unsigned int irq_thermal_count; #endif diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 1c0b43724ce3..a8059930056d 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * highmem.h: virtual kernel memory mappings for high memory * diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index cc285ec4b2c1..67385d56d4f4 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_HPET_H #define _ASM_X86_HPET_H diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index 3a106165e03a..5ed826da5e07 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_HUGETLB_H #define _ASM_X86_HUGETLB_H @@ -85,4 +86,8 @@ static inline void arch_clear_hugepage_flags(struct page *page) { } +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE +static inline bool gigantic_page_supported(void) { return true; } +#endif + #endif /* _ASM_X86_HUGETLB_H */ diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h index 6c98be864a75..f59c39835a5a 100644 --- a/arch/x86/include/asm/hw_breakpoint.h +++ b/arch/x86/include/asm/hw_breakpoint.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _I386_HW_BREAKPOINT_H #define _I386_HW_BREAKPOINT_H diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index b90e1053049b..8ec99a55e6b9 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_HW_IRQ_H #define _ASM_X86_HW_IRQ_H @@ -30,6 +31,7 @@ extern asmlinkage void apic_timer_interrupt(void); extern asmlinkage void x86_platform_ipi(void); extern asmlinkage void kvm_posted_intr_ipi(void); extern asmlinkage void kvm_posted_intr_wakeup_ipi(void); +extern asmlinkage void kvm_posted_intr_nested_ipi(void); extern asmlinkage void error_interrupt(void); extern asmlinkage void irq_work_interrupt(void); @@ -45,25 +47,6 @@ extern asmlinkage void deferred_error_interrupt(void); extern asmlinkage void call_function_interrupt(void); extern asmlinkage void call_function_single_interrupt(void); -#ifdef CONFIG_TRACING -/* Interrupt handlers registered during init_IRQ */ -extern void trace_apic_timer_interrupt(void); -extern void trace_x86_platform_ipi(void); -extern void trace_error_interrupt(void); -extern void 
trace_irq_work_interrupt(void); -extern void trace_spurious_interrupt(void); -extern void trace_thermal_interrupt(void); -extern void trace_reschedule_interrupt(void); -extern void trace_threshold_interrupt(void); -extern void trace_deferred_error_interrupt(void); -extern void trace_call_function_interrupt(void); -extern void trace_call_function_single_interrupt(void); -#define trace_irq_move_cleanup_interrupt irq_move_cleanup_interrupt -#define trace_reboot_interrupt reboot_interrupt -#define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi -#define trace_kvm_posted_intr_wakeup_ipi kvm_posted_intr_wakeup_ipi -#endif /* CONFIG_TRACING */ - #ifdef CONFIG_X86_LOCAL_APIC struct irq_data; struct pci_dev; diff --git a/arch/x86/include/asm/hypertransport.h b/arch/x86/include/asm/hypertransport.h index 334b1a885d9c..5d55df352879 100644 --- a/arch/x86/include/asm/hypertransport.h +++ b/arch/x86/include/asm/hypertransport.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_HYPERTRANSPORT_H #define _ASM_X86_HYPERTRANSPORT_H diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 21126155a739..0ead9dbb9130 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -43,6 +43,9 @@ struct hypervisor_x86 { /* pin current vcpu to specified physical cpu (run rarely) */ void (*pin_vcpu)(int); + + /* called during init_mem_mapping() to setup early mappings. */ + void (*init_mem_mapping)(void); }; extern const struct hypervisor_x86 *x86_hyper; @@ -57,8 +60,15 @@ extern const struct hypervisor_x86 x86_hyper_kvm; extern void init_hypervisor_platform(void); extern bool hypervisor_x2apic_available(void); extern void hypervisor_pin_vcpu(int cpu); + +static inline void hypervisor_init_mem_mapping(void) +{ + if (x86_hyper && x86_hyper->init_mem_mapping) + x86_hyper->init_mem_mapping(); +} #else static inline void init_hypervisor_platform(void) { } static inline bool hypervisor_x2apic_available(void) { return false; } +static inline void hypervisor_init_mem_mapping(void) { } #endif /* CONFIG_HYPERVISOR_GUEST */ #endif /* _ASM_X86_HYPERVISOR_H */ diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index 39bcefc20de7..c8376b40e882 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_I8259_H #define _ASM_X86_I8259_H diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index a9bdf5569ab3..2c5f7861d373 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_IA32_H #define _ASM_X86_IA32_H diff --git a/arch/x86/include/asm/ia32_unistd.h b/arch/x86/include/asm/ia32_unistd.h index b0d5716ca1e4..aa065c94ccf5 100644 --- a/arch/x86/include/asm/ia32_unistd.h +++ b/arch/x86/include/asm/ia32_unistd.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_IA32_UNISTD_H #define _ASM_X86_IA32_UNISTD_H diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 474eb8c66fee..5f1d3c421f68 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_INIT_H #define _ASM_X86_INIT_H @@ -7,6 +8,7 @@ struct x86_mapping_info { unsigned long page_flag; /* page flag for PMD or PUD entry */ unsigned long offset; /* ident mapping offset */ bool direct_gbpages; /* PUD level 1GB page support */ + unsigned long kernpg_flag; 
/* kernel pagetable flag override */ }; int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h index 3e115273ed88..f5a796da07f8 100644 --- a/arch/x86/include/asm/inst.h +++ b/arch/x86/include/asm/inst.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Generate .byte code for some instructions not supported by old * binutils. diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 75b748a1deb8..35a6bc4da8ad 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_INTEL_FAMILY_H #define _ASM_X86_INTEL_FAMILY_H diff --git a/arch/x86/include/asm/intel_mid_vrtc.h b/arch/x86/include/asm/intel_mid_vrtc.h index 86ff4685c409..35555016b1be 100644 --- a/arch/x86/include/asm/intel_mid_vrtc.h +++ b/arch/x86/include/asm/intel_mid_vrtc.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _INTEL_MID_VRTC_H #define _INTEL_MID_VRTC_H diff --git a/arch/x86/include/asm/intel_pmc_ipc.h b/arch/x86/include/asm/intel_pmc_ipc.h index fac89eb78a6b..528ed4be4393 100644 --- a/arch/x86/include/asm/intel_pmc_ipc.h +++ b/arch/x86/include/asm/intel_pmc_ipc.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_INTEL_PMC_IPC_H_ #define _ASM_X86_INTEL_PMC_IPC_H_ diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index e1a411786bf5..b523f51c5400 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_INTEL_PT_H #define _ASM_X86_INTEL_PT_H diff --git a/arch/x86/include/asm/intel_punit_ipc.h b/arch/x86/include/asm/intel_punit_ipc.h index 201eb9dce595..ce16da719596 100644 --- a/arch/x86/include/asm/intel_punit_ipc.h +++ b/arch/x86/include/asm/intel_punit_ipc.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_INTEL_PUNIT_IPC_H_ #define _ASM_X86_INTEL_PUNIT_IPC_H_ diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h deleted file mode 100644 index 597dc4995678..000000000000 --- a/arch/x86/include/asm/intel_rdt.h +++ /dev/null @@ -1,286 +0,0 @@ -#ifndef _ASM_X86_INTEL_RDT_H -#define _ASM_X86_INTEL_RDT_H - -#ifdef CONFIG_INTEL_RDT_A - -#include <linux/sched.h> -#include <linux/kernfs.h> -#include <linux/jump_label.h> - -#include <asm/intel_rdt_common.h> - -#define IA32_L3_QOS_CFG 0xc81 -#define IA32_L3_CBM_BASE 0xc90 -#define IA32_L2_CBM_BASE 0xd10 -#define IA32_MBA_THRTL_BASE 0xd50 - -#define L3_QOS_CDP_ENABLE 0x01ULL - -/** - * struct rdtgroup - store rdtgroup's data in resctrl file system. 
- * @kn: kernfs node - * @rdtgroup_list: linked list for all rdtgroups - * @closid: closid for this rdtgroup - * @cpu_mask: CPUs assigned to this rdtgroup - * @flags: status bits - * @waitcount: how many cpus expect to find this - * group when they acquire rdtgroup_mutex - */ -struct rdtgroup { - struct kernfs_node *kn; - struct list_head rdtgroup_list; - int closid; - struct cpumask cpu_mask; - int flags; - atomic_t waitcount; -}; - -/* rdtgroup.flags */ -#define RDT_DELETED 1 - -/* rftype.flags */ -#define RFTYPE_FLAGS_CPUS_LIST 1 - -/* List of all resource groups */ -extern struct list_head rdt_all_groups; - -extern int max_name_width, max_data_width; - -int __init rdtgroup_init(void); - -/** - * struct rftype - describe each file in the resctrl file system - * @name: File name - * @mode: Access mode - * @kf_ops: File operations - * @flags: File specific RFTYPE_FLAGS_* flags - * @seq_show: Show content of the file - * @write: Write to the file - */ -struct rftype { - char *name; - umode_t mode; - struct kernfs_ops *kf_ops; - unsigned long flags; - - int (*seq_show)(struct kernfs_open_file *of, - struct seq_file *sf, void *v); - /* - * write() is the generic write callback which maps directly to - * kernfs write operation and overrides all other operations. - * Maximum write size is determined by ->max_write_len. - */ - ssize_t (*write)(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off); -}; - -/** - * struct rdt_domain - group of cpus sharing an RDT resource - * @list: all instances of this resource - * @id: unique id for this instance - * @cpu_mask: which cpus share this resource - * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) - * @new_ctrl: new ctrl value to be loaded - * @have_new_ctrl: did user provide new_ctrl for this domain - */ -struct rdt_domain { - struct list_head list; - int id; - struct cpumask cpu_mask; - u32 *ctrl_val; - u32 new_ctrl; - bool have_new_ctrl; -}; - -/** - * struct msr_param - set a range of MSRs from a domain - * @res: The resource to use - * @low: Beginning index from base MSR - * @high: End index - */ -struct msr_param { - struct rdt_resource *res; - int low; - int high; -}; - -/** - * struct rdt_cache - Cache allocation related data - * @cbm_len: Length of the cache bit mask - * @min_cbm_bits: Minimum number of consecutive bits to be set - * @cbm_idx_mult: Multiplier of CBM index - * @cbm_idx_offset: Offset of CBM index. CBM index is computed by: - * closid * cbm_idx_multi + cbm_idx_offset - * in a cache bit mask - */ -struct rdt_cache { - unsigned int cbm_len; - unsigned int min_cbm_bits; - unsigned int cbm_idx_mult; - unsigned int cbm_idx_offset; -}; - -/** - * struct rdt_membw - Memory bandwidth allocation related data - * @max_delay: Max throttle delay. Delay is the hardware - * representation for memory bandwidth. 
- * @min_bw: Minimum memory bandwidth percentage user can request - * @bw_gran: Granularity at which the memory bandwidth is allocated - * @delay_linear: True if memory B/W delay is in linear scale - * @mb_map: Mapping of memory B/W percentage to memory B/W delay - */ -struct rdt_membw { - u32 max_delay; - u32 min_bw; - u32 bw_gran; - u32 delay_linear; - u32 *mb_map; -}; - -/** - * struct rdt_resource - attributes of an RDT resource - * @enabled: Is this feature enabled on this machine - * @capable: Is this feature available on this machine - * @name: Name to use in "schemata" file - * @num_closid: Number of CLOSIDs available - * @cache_level: Which cache level defines scope of this resource - * @default_ctrl: Specifies default cache cbm or memory B/W percent. - * @msr_base: Base MSR address for CBMs - * @msr_update: Function pointer to update QOS MSRs - * @data_width: Character width of data when displaying - * @domains: All domains for this resource - * @cache: Cache allocation related data - * @info_files: resctrl info files for the resource - * @nr_info_files: Number of info files - * @format_str: Per resource format string to show domain value - * @parse_ctrlval: Per resource function pointer to parse control values - */ -struct rdt_resource { - bool enabled; - bool capable; - char *name; - int num_closid; - int cache_level; - u32 default_ctrl; - unsigned int msr_base; - void (*msr_update) (struct rdt_domain *d, struct msr_param *m, - struct rdt_resource *r); - int data_width; - struct list_head domains; - struct rdt_cache cache; - struct rdt_membw membw; - struct rftype *info_files; - int nr_info_files; - const char *format_str; - int (*parse_ctrlval) (char *buf, struct rdt_resource *r, - struct rdt_domain *d); -}; - -void rdt_get_cache_infofile(struct rdt_resource *r); -void rdt_get_mba_infofile(struct rdt_resource *r); -int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d); -int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d); - -extern struct mutex rdtgroup_mutex; - -extern struct rdt_resource rdt_resources_all[]; -extern struct rdtgroup rdtgroup_default; -DECLARE_STATIC_KEY_FALSE(rdt_enable_key); - -int __init rdtgroup_init(void); - -enum { - RDT_RESOURCE_L3, - RDT_RESOURCE_L3DATA, - RDT_RESOURCE_L3CODE, - RDT_RESOURCE_L2, - RDT_RESOURCE_MBA, - - /* Must be the last */ - RDT_NUM_RESOURCES, -}; - -#define for_each_capable_rdt_resource(r) \ - for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ - r++) \ - if (r->capable) - -#define for_each_enabled_rdt_resource(r) \ - for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ - r++) \ - if (r->enabled) - -/* CPUID.(EAX=10H, ECX=ResID=1).EAX */ -union cpuid_0x10_1_eax { - struct { - unsigned int cbm_len:5; - } split; - unsigned int full; -}; - -/* CPUID.(EAX=10H, ECX=ResID=3).EAX */ -union cpuid_0x10_3_eax { - struct { - unsigned int max_delay:12; - } split; - unsigned int full; -}; - -/* CPUID.(EAX=10H, ECX=ResID).EDX */ -union cpuid_0x10_x_edx { - struct { - unsigned int cos_max:16; - } split; - unsigned int full; -}; - -DECLARE_PER_CPU_READ_MOSTLY(int, cpu_closid); - -void rdt_ctrl_update(void *arg); -struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); -void rdtgroup_kn_unlock(struct kernfs_node *kn); -ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off); -int rdtgroup_schemata_show(struct kernfs_open_file *of, - struct seq_file *s, void *v); - -/* - * intel_rdt_sched_in() - Writes the task's CLOSid to 
IA32_PQR_MSR - * - * Following considerations are made so that this has minimal impact - * on scheduler hot path: - * - This will stay as no-op unless we are running on an Intel SKU - * which supports resource control and we enable by mounting the - * resctrl file system. - * - Caches the per cpu CLOSid values and does the MSR write only - * when a task with a different CLOSid is scheduled in. - * - * Must be called with preemption disabled. - */ -static inline void intel_rdt_sched_in(void) -{ - if (static_branch_likely(&rdt_enable_key)) { - struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - int closid; - - /* - * If this task has a closid assigned, use it. - * Else use the closid assigned to this cpu. - */ - closid = current->closid; - if (closid == 0) - closid = this_cpu_read(cpu_closid); - - if (closid != state->closid) { - state->closid = closid; - wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, closid); - } - } -} - -#else - -static inline void intel_rdt_sched_in(void) {} - -#endif /* CONFIG_INTEL_RDT_A */ -#endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/include/asm/intel_rdt_common.h b/arch/x86/include/asm/intel_rdt_common.h deleted file mode 100644 index b31081b89407..000000000000 --- a/arch/x86/include/asm/intel_rdt_common.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef _ASM_X86_INTEL_RDT_COMMON_H -#define _ASM_X86_INTEL_RDT_COMMON_H - -#define MSR_IA32_PQR_ASSOC 0x0c8f - -/** - * struct intel_pqr_state - State cache for the PQR MSR - * @rmid: The cached Resource Monitoring ID - * @closid: The cached Class Of Service ID - * @rmid_usecnt: The usage counter for rmid - * - * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the - * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always - * contains both parts, so we need to cache them. - * - * The cache also helps to avoid pointless updates if the value does - * not change. - */ -struct intel_pqr_state { - u32 rmid; - u32 closid; - int rmid_usecnt; -}; - -DECLARE_PER_CPU(struct intel_pqr_state, pqr_state); - -#endif /* _ASM_X86_INTEL_RDT_COMMON_H */ diff --git a/arch/x86/include/asm/intel_rdt_sched.h b/arch/x86/include/asm/intel_rdt_sched.h new file mode 100644 index 000000000000..9acb06b6f81e --- /dev/null +++ b/arch/x86/include/asm/intel_rdt_sched.h @@ -0,0 +1,93 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_INTEL_RDT_SCHED_H +#define _ASM_X86_INTEL_RDT_SCHED_H + +#ifdef CONFIG_INTEL_RDT + +#include <linux/sched.h> +#include <linux/jump_label.h> + +#define IA32_PQR_ASSOC 0x0c8f + +/** + * struct intel_pqr_state - State cache for the PQR MSR + * @cur_rmid: The cached Resource Monitoring ID + * @cur_closid: The cached Class Of Service ID + * @default_rmid: The user assigned Resource Monitoring ID + * @default_closid: The user assigned cached Class Of Service ID + * + * The upper 32 bits of IA32_PQR_ASSOC contain closid and the + * lower 10 bits rmid. The update to IA32_PQR_ASSOC always + * contains both parts, so we need to cache them. This also + * stores the user configured per cpu CLOSID and RMID. + * + * The cache also helps to avoid pointless updates if the value does + * not change. 
+ */ +struct intel_pqr_state { + u32 cur_rmid; + u32 cur_closid; + u32 default_rmid; + u32 default_closid; +}; + +DECLARE_PER_CPU(struct intel_pqr_state, pqr_state); + +DECLARE_STATIC_KEY_FALSE(rdt_enable_key); +DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); +DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); + +/* + * __intel_rdt_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR + * + * Following considerations are made so that this has minimal impact + * on scheduler hot path: + * - This will stay as no-op unless we are running on an Intel SKU + * which supports resource control or monitoring and we enable by + * mounting the resctrl file system. + * - Caches the per cpu CLOSid/RMID values and does the MSR write only + * when a task with a different CLOSid/RMID is scheduled in. + * - We allocate RMIDs/CLOSids globally in order to keep this as + * simple as possible. + * Must be called with preemption disabled. + */ +static void __intel_rdt_sched_in(void) +{ + struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); + u32 closid = state->default_closid; + u32 rmid = state->default_rmid; + + /* + * If this task has a closid/rmid assigned, use it. + * Else use the closid/rmid assigned to this cpu. + */ + if (static_branch_likely(&rdt_alloc_enable_key)) { + if (current->closid) + closid = current->closid; + } + + if (static_branch_likely(&rdt_mon_enable_key)) { + if (current->rmid) + rmid = current->rmid; + } + + if (closid != state->cur_closid || rmid != state->cur_rmid) { + state->cur_closid = closid; + state->cur_rmid = rmid; + wrmsr(IA32_PQR_ASSOC, rmid, closid); + } +} + +static inline void intel_rdt_sched_in(void) +{ + if (static_branch_likely(&rdt_enable_key)) + __intel_rdt_sched_in(); +} + +#else + +static inline void intel_rdt_sched_in(void) {} + +#endif /* CONFIG_INTEL_RDT */ + +#endif /* _ASM_X86_INTEL_RDT_SCHED_H */ diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h index 81d3d8776fd9..4a8c6e817398 100644 --- a/arch/x86/include/asm/intel_scu_ipc.h +++ b/arch/x86/include/asm/intel_scu_ipc.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_INTEL_SCU_IPC_H_ #define _ASM_X86_INTEL_SCU_IPC_H_ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 7afb0e2f07f4..11398d55aefa 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_IO_H #define _ASM_X86_IO_H @@ -69,6 +70,9 @@ build_mmio_write(__writeb, "b", unsigned char, "q", ) build_mmio_write(__writew, "w", unsigned short, "r", ) build_mmio_write(__writel, "l", unsigned int, "r", ) +#define readb readb +#define readw readw +#define readl readl #define readb_relaxed(a) __readb(a) #define readw_relaxed(a) __readw(a) #define readl_relaxed(a) __readl(a) @@ -76,6 +80,9 @@ build_mmio_write(__writel, "l", unsigned int, "r", ) #define __raw_readw __readw #define __raw_readl __readl +#define writeb writeb +#define writew writew +#define writel writel #define writeb_relaxed(v, a) __writeb(v, a) #define writew_relaxed(v, a) __writew(v, a) #define writel_relaxed(v, a) __writel(v, a) @@ -88,13 +95,15 @@ build_mmio_write(__writel, "l", unsigned int, "r", ) #ifdef CONFIG_X86_64 build_mmio_read(readq, "q", unsigned long, "=r", :"memory") +build_mmio_read(__readq, "q", unsigned long, "=r", ) build_mmio_write(writeq, "q", unsigned long, "r", :"memory") +build_mmio_write(__writeq, "q", unsigned long, "r", ) -#define readq_relaxed(a) readq(a) -#define writeq_relaxed(v, a) writeq(v, a) 
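The __intel_rdt_sched_in() helper shown a little earlier caches the last CLOSID/RMID written to IA32_PQR_ASSOC in per-CPU state and only issues the MSR write when the values actually change, keeping the context-switch hot path cheap; the static keys additionally keep the whole path a no-op until resctrl is mounted. The stand-alone sketch below (illustrative only; pqr_cache, fake_wrmsr and sched_in are invented names, not kernel API) models that write-avoidance pattern in plain C:

#include <stdint.h>
#include <stdio.h>

/* Invented stand-ins for the per-CPU PQR cache and the MSR write. */
struct pqr_cache {
	uint32_t cur_rmid;
	uint32_t cur_closid;
};

static unsigned long msr_writes;

static void fake_wrmsr(uint32_t rmid, uint32_t closid)
{
	/* In the kernel this would be wrmsr(IA32_PQR_ASSOC, rmid, closid). */
	msr_writes++;
	printf("PQR_ASSOC <- rmid=%u closid=%u\n", rmid, closid);
}

/* Same shape as __intel_rdt_sched_in(): skip the write when nothing changed. */
static void sched_in(struct pqr_cache *state, uint32_t closid, uint32_t rmid)
{
	if (closid != state->cur_closid || rmid != state->cur_rmid) {
		state->cur_closid = closid;
		state->cur_rmid = rmid;
		fake_wrmsr(rmid, closid);
	}
}

int main(void)
{
	struct pqr_cache cpu0 = { 0, 0 };

	sched_in(&cpu0, 1, 3);	/* new CLOSID/RMID: one MSR write     */
	sched_in(&cpu0, 1, 3);	/* same task class: cached, no write  */
	sched_in(&cpu0, 2, 3);	/* CLOSID changed: one more MSR write */
	printf("simulated MSR writes: %lu\n", msr_writes);
	return 0;
}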
+#define readq_relaxed(a) __readq(a) +#define writeq_relaxed(v, a) __writeq(v, a) -#define __raw_readq(a) readq(a) -#define __raw_writeq(val, addr) writeq(val, addr) +#define __raw_readq __readq +#define __raw_writeq __writeq /* Let people know that we have them */ #define readq readq @@ -119,6 +128,7 @@ static inline phys_addr_t virt_to_phys(volatile void *address) { return __pa(address); } +#define virt_to_phys virt_to_phys /** * phys_to_virt - map physical address to virtual @@ -137,6 +147,7 @@ static inline void *phys_to_virt(phys_addr_t address) { return __va(address); } +#define phys_to_virt phys_to_virt /* * Change "struct page" to physical address. @@ -169,11 +180,14 @@ static inline unsigned int isa_virt_to_bus(volatile void *address) * else, you probably want one of the following. */ extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size); +#define ioremap_nocache ioremap_nocache extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size); #define ioremap_uc ioremap_uc extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); +#define ioremap_cache ioremap_cache extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val); +#define ioremap_prot ioremap_prot /** * ioremap - map bus memory into CPU space @@ -193,8 +207,10 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) { return ioremap_nocache(offset, size); } +#define ioremap ioremap extern void iounmap(volatile void __iomem *addr); +#define iounmap iounmap extern void set_iounmap_nonlazy(void); @@ -203,53 +219,6 @@ extern void set_iounmap_nonlazy(void); #include <asm-generic/iomap.h> /* - * Convert a virtual cached pointer to an uncached pointer - */ -#define xlate_dev_kmem_ptr(p) p - -/** - * memset_io Set a range of I/O memory to a constant value - * @addr: The beginning of the I/O-memory range to set - * @val: The value to set the memory to - * @count: The number of bytes to set - * - * Set a range of I/O memory to a given value. - */ -static inline void -memset_io(volatile void __iomem *addr, unsigned char val, size_t count) -{ - memset((void __force *)addr, val, count); -} - -/** - * memcpy_fromio Copy a block of data from I/O memory - * @dst: The (RAM) destination for the copy - * @src: The (I/O memory) source for the data - * @count: The number of bytes to copy - * - * Copy a block of data from I/O memory. - */ -static inline void -memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count) -{ - memcpy(dst, (const void __force *)src, count); -} - -/** - * memcpy_toio Copy a block of data into I/O memory - * @dst: The (I/O memory) destination for the copy - * @src: The (RAM) source for the data - * @count: The number of bytes to copy - * - * Copy a block of data to I/O memory. - */ -static inline void -memcpy_toio(volatile void __iomem *dst, const void *src, size_t count) -{ - memcpy((void __force *)dst, src, count); -} - -/* * ISA space is 'always mapped' on a typical x86 system, no need to * explicitly ioremap() it. 
The fact that the ISA IO space is mapped * to PAGE_OFFSET is pure coincidence - it does not mean ISA values @@ -328,26 +297,51 @@ static inline unsigned type in##bwl##_p(int port) \ static inline void outs##bwl(int port, const void *addr, unsigned long count) \ { \ asm volatile("rep; outs" #bwl \ - : "+S"(addr), "+c"(count) : "d"(port)); \ + : "+S"(addr), "+c"(count) : "d"(port) : "memory"); \ } \ \ static inline void ins##bwl(int port, void *addr, unsigned long count) \ { \ asm volatile("rep; ins" #bwl \ - : "+D"(addr), "+c"(count) : "d"(port)); \ + : "+D"(addr), "+c"(count) : "d"(port) : "memory"); \ } BUILDIO(b, b, char) BUILDIO(w, w, short) BUILDIO(l, , int) +#define inb inb +#define inw inw +#define inl inl +#define inb_p inb_p +#define inw_p inw_p +#define inl_p inl_p +#define insb insb +#define insw insw +#define insl insl + +#define outb outb +#define outw outw +#define outl outl +#define outb_p outb_p +#define outw_p outw_p +#define outl_p outl_p +#define outsb outsb +#define outsw outsw +#define outsl outsl + extern void *xlate_dev_mem_ptr(phys_addr_t phys); extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); +#define xlate_dev_mem_ptr xlate_dev_mem_ptr +#define unxlate_dev_mem_ptr unxlate_dev_mem_ptr + extern int ioremap_change_attr(unsigned long vaddr, unsigned long size, enum page_cache_mode pcm); extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size); +#define ioremap_wc ioremap_wc extern void __iomem *ioremap_wt(resource_size_t offset, unsigned long size); +#define ioremap_wt ioremap_wt extern bool is_early_ioremap_ptep(pte_t *ptep); @@ -365,6 +359,9 @@ extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, #define IO_SPACE_LIMIT 0xffff +#include <asm-generic/io.h> +#undef PCI_IOBASE + #ifdef CONFIG_MTRR extern int __must_check arch_phys_wc_index(int handle); #define arch_phys_wc_index arch_phys_wc_index @@ -381,4 +378,12 @@ extern void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size) #define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc #endif +extern bool arch_memremap_can_ram_remap(resource_size_t offset, + unsigned long size, + unsigned long flags); +#define arch_memremap_can_ram_remap arch_memremap_can_ram_remap + +extern bool phys_mem_access_encrypted(unsigned long phys_addr, + unsigned long size); + #endif /* _ASM_X86_IO_H */ diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 6cbf2cfb3f8a..5c27e146a166 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_IO_APIC_H #define _ASM_X86_IO_APIC_H diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index 793869879464..1e5d5d92eb40 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_IOMMU_H #define _ASM_X86_IOMMU_H @@ -6,6 +7,8 @@ extern int force_iommu, no_iommu; extern int iommu_detected; extern int iommu_pass_through; +int x86_dma_supported(struct device *dev, u64 mask); + /* 10 seconds */ #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) diff --git a/arch/x86/include/asm/iommu_table.h b/arch/x86/include/asm/iommu_table.h index e37d6b3ad983..1fb3fd1a83c2 100644 --- a/arch/x86/include/asm/iommu_table.h +++ b/arch/x86/include/asm/iommu_table.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_IOMMU_TABLE_H #define _ASM_X86_IOMMU_TABLE_H diff --git 
a/arch/x86/include/asm/iosf_mbi.h b/arch/x86/include/asm/iosf_mbi.h index c313cac36f56..7d87437bd030 100644 --- a/arch/x86/include/asm/iosf_mbi.h +++ b/arch/x86/include/asm/iosf_mbi.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Intel OnChip System Fabric MailBox access support */ diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 16d3fa211962..d8632f8fa17d 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_IRQ_H #define _ASM_X86_IRQ_H /* @@ -29,7 +30,6 @@ struct irq_desc; #include <linux/cpumask.h> extern int check_irq_vectors_for_cpu_disable(void); extern void fixup_irqs(void); -extern void irq_force_complete_move(struct irq_desc *desc); #endif #ifdef CONFIG_HAVE_KVM @@ -43,10 +43,6 @@ extern bool handle_irq(struct irq_desc *desc, struct pt_regs *regs); extern __visible unsigned int do_IRQ(struct pt_regs *regs); -/* Interrupt vector management */ -extern DECLARE_BITMAP(used_vectors, NR_VECTORS); -extern int vector_used_by_percpu_irq(unsigned int vector); - extern void init_ISA_irqs(void); #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/include/asm/irq_regs.h b/arch/x86/include/asm/irq_regs.h index d82250b1debb..8f3bee821e6c 100644 --- a/arch/x86/include/asm/irq_regs.h +++ b/arch/x86/include/asm/irq_regs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Per-cpu current frame pointer - the location of the last exception frame on * the stack, stored in the per-cpu area. diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index a210eba2727c..023b4a9fc846 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -55,7 +55,8 @@ extern struct irq_domain * irq_remapping_get_irq_domain(struct irq_alloc_info *info); /* Create PCI MSI/MSIx irqdomain, use @parent as the parent irqdomain. 
*/ -extern struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent); +extern struct irq_domain * +arch_create_remap_msi_irq_domain(struct irq_domain *par, const char *n, int id); /* Get parent irqdomain for interrupt remapping irqdomain */ static inline struct irq_domain *arch_get_ir_parent_domain(void) diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 6ca9fd6234e1..c20ffca8fef1 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_IRQ_VECTORS_H #define _ASM_X86_IRQ_VECTORS_H @@ -83,7 +84,6 @@ */ #define X86_PLATFORM_IPI_VECTOR 0xf7 -#define POSTED_INTR_WAKEUP_VECTOR 0xf1 /* * IRQ work vector: */ @@ -98,6 +98,8 @@ /* Vector for KVM to deliver posted interrupt IPI */ #ifdef CONFIG_HAVE_KVM #define POSTED_INTR_VECTOR 0xf2 +#define POSTED_INTR_WAKEUP_VECTOR 0xf1 +#define POSTED_INTR_NESTED_VECTOR 0xf0 #endif /* diff --git a/arch/x86/include/asm/irq_work.h b/arch/x86/include/asm/irq_work.h index f70604125286..800ffce0db29 100644 --- a/arch/x86/include/asm/irq_work.h +++ b/arch/x86/include/asm/irq_work.h @@ -1,11 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_IRQ_WORK_H #define _ASM_IRQ_WORK_H #include <asm/cpufeature.h> +#ifdef CONFIG_X86_LOCAL_APIC static inline bool arch_irq_work_has_interrupt(void) { return boot_cpu_has(X86_FEATURE_APIC); } +extern void arch_irq_work_raise(void); +#else +static inline bool arch_irq_work_has_interrupt(void) +{ + return false; +} +#endif #endif /* _ASM_IRQ_WORK_H */ diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h index d26075b52885..423e112c1e8f 100644 --- a/arch/x86/include/asm/irqdomain.h +++ b/arch/x86/include/asm/irqdomain.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_IRQDOMAIN_H #define _ASM_IRQDOMAIN_H diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index ac7692dcfa2e..c8ef23f2c28f 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _X86_IRQFLAGS_H_ #define _X86_IRQFLAGS_H_ diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index adc54c12cbd1..8c0de4282659 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_JUMP_LABEL_H #define _ASM_X86_JUMP_LABEL_H diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h index f527b02a0ee3..b577dd0916aa 100644 --- a/arch/x86/include/asm/kasan.h +++ b/arch/x86/include/asm/kasan.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_KASAN_H #define _ASM_X86_KASAN_H diff --git a/arch/x86/include/asm/kaslr.h b/arch/x86/include/asm/kaslr.h index 1052a797d71d..460991e3b529 100644 --- a/arch/x86/include/asm/kaslr.h +++ b/arch/x86/include/asm/kaslr.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_KASLR_H_ #define _ASM_KASLR_H_ diff --git a/arch/x86/include/asm/kbdleds.h b/arch/x86/include/asm/kbdleds.h index f27ac5ff597d..197ea4fedd53 100644 --- a/arch/x86/include/asm/kbdleds.h +++ b/arch/x86/include/asm/kbdleds.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_KBDLEDS_H #define _ASM_X86_KBDLEDS_H diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index 29a594a3b82a..f86a8caa561e 100644 --- a/arch/x86/include/asm/kdebug.h +++ 
b/arch/x86/include/asm/kdebug.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_KDEBUG_H #define _ASM_X86_KDEBUG_H diff --git a/arch/x86/include/asm/kexec-bzimage64.h b/arch/x86/include/asm/kexec-bzimage64.h index d1b5d194e31d..9f07cff43705 100644 --- a/arch/x86/include/asm/kexec-bzimage64.h +++ b/arch/x86/include/asm/kexec-bzimage64.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_KEXEC_BZIMAGE64_H #define _ASM_KEXEC_BZIMAGE64_H diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 70ef205489f0..f327236f0fa7 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_KEXEC_H #define _ASM_X86_KEXEC_H @@ -147,7 +148,8 @@ unsigned long relocate_kernel(unsigned long indirection_page, unsigned long page_list, unsigned long start_address, - unsigned int preserve_context); + unsigned int preserve_context, + unsigned int sme_active); #endif #define ARCH_HAS_KIMAGE_ARCH @@ -207,6 +209,14 @@ struct kexec_entry64_regs { uint64_t r15; uint64_t rip; }; + +extern int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, + gfp_t gfp); +#define arch_kexec_post_alloc_pages arch_kexec_post_alloc_pages + +extern void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages); +#define arch_kexec_pre_free_pages arch_kexec_pre_free_pages + #endif typedef void crash_vmclear_fn(void); diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h index 22a8537eb780..aacaf2502bd2 100644 --- a/arch/x86/include/asm/kgdb.h +++ b/arch/x86/include/asm/kgdb.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_KGDB_H #define _ASM_X86_KGDB_H diff --git a/arch/x86/include/asm/kmap_types.h b/arch/x86/include/asm/kmap_types.h index 9e00a731a7fb..04ab8266e347 100644 --- a/arch/x86/include/asm/kmap_types.h +++ b/arch/x86/include/asm/kmap_types.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_KMAP_TYPES_H #define _ASM_X86_KMAP_TYPES_H diff --git a/arch/x86/include/asm/kmemcheck.h b/arch/x86/include/asm/kmemcheck.h index ed01518f297e..945a0337fbcf 100644 --- a/arch/x86/include/asm/kmemcheck.h +++ b/arch/x86/include/asm/kmemcheck.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef ASM_X86_KMEMCHECK_H #define ASM_X86_KMEMCHECK_H diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 34b984c60790..6cf65437b5e5 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -52,10 +52,10 @@ typedef u8 kprobe_opcode_t; #define flush_insn_slot(p) do { } while (0) /* optinsn template addresses */ -extern __visible kprobe_opcode_t optprobe_template_entry; -extern __visible kprobe_opcode_t optprobe_template_val; -extern __visible kprobe_opcode_t optprobe_template_call; -extern __visible kprobe_opcode_t optprobe_template_end; +extern __visible kprobe_opcode_t optprobe_template_entry[]; +extern __visible kprobe_opcode_t optprobe_template_val[]; +extern __visible kprobe_opcode_t optprobe_template_call[]; +extern __visible kprobe_opcode_t optprobe_template_end[]; #define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + RELATIVE_ADDR_SIZE) #define MAX_OPTINSN_SIZE \ (((unsigned long)&optprobe_template_end - \ diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 722d0e568863..ee23a43386a2 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ 
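The arch_kexec_post_alloc_pages()/arch_kexec_pre_free_pages() declarations above are each followed by a #define of the same name; that is the usual pattern for telling a generic header that the architecture already supplies an override, and it is the same idiom behind the readb/writeb and ioremap #defines added to <asm/io.h> earlier in this diff before it includes <asm-generic/io.h>. A minimal, hypothetical sketch of the mechanism (arch_post_alloc_hook and the surrounding names are invented for illustration, assuming the generic side tests the macro with #ifndef):

#include <stdio.h>

/* "Architecture" header: provide an override and advertise it by name. */
static inline int arch_post_alloc_hook(void *vaddr, unsigned int pages)
{
	printf("arch override ran for %u page(s) at %p\n", pages, vaddr);
	return 0;
}
#define arch_post_alloc_hook arch_post_alloc_hook

/* "Generic" header: only supply the no-op default if no override exists. */
#ifndef arch_post_alloc_hook
static inline int arch_post_alloc_hook(void *vaddr, unsigned int pages)
{
	(void)vaddr;
	(void)pages;
	return 0;
}
#endif

int main(void)
{
	static char region[4096];

	/* Generic code calls the hook unconditionally; the #define decides
	 * whether the arch version or the default gets compiled in. */
	return arch_post_alloc_hook(region, 1);
}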
/****************************************************************************** * x86_emulate.h * @@ -23,6 +24,7 @@ struct x86_exception { u16 error_code; bool nested_page_fault; u64 address; /* cr2 or nested page fault gpa */ + u8 async_page_fault; }; /* @@ -218,8 +220,8 @@ struct x86_emulate_ops { struct x86_instruction_info *info, enum x86_intercept_stage stage); - void (*get_cpuid)(struct x86_emulate_ctxt *ctxt, - u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); + bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx, + u32 *ecx, u32 *edx, bool check_limit); void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt); diff --git a/arch/x86/include/asm/kvm_guest.h b/arch/x86/include/asm/kvm_guest.h index a92b1763c419..46185263d9c2 100644 --- a/arch/x86/include/asm/kvm_guest.h +++ b/arch/x86/include/asm/kvm_guest.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_KVM_GUEST_H #define _ASM_X86_KVM_GUEST_H diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 695605eb1dfb..c73e493adf07 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -48,43 +48,45 @@ #define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS /* x86-specific vcpu->requests bit members */ -#define KVM_REQ_MIGRATE_TIMER 8 -#define KVM_REQ_REPORT_TPR_ACCESS 9 -#define KVM_REQ_TRIPLE_FAULT 10 -#define KVM_REQ_MMU_SYNC 11 -#define KVM_REQ_CLOCK_UPDATE 12 -#define KVM_REQ_EVENT 14 -#define KVM_REQ_APF_HALT 15 -#define KVM_REQ_STEAL_UPDATE 16 -#define KVM_REQ_NMI 17 -#define KVM_REQ_PMU 18 -#define KVM_REQ_PMI 19 -#define KVM_REQ_SMI 20 -#define KVM_REQ_MASTERCLOCK_UPDATE 21 -#define KVM_REQ_MCLOCK_INPROGRESS (22 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) -#define KVM_REQ_SCAN_IOAPIC (23 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) -#define KVM_REQ_GLOBAL_CLOCK_UPDATE 24 -#define KVM_REQ_APIC_PAGE_RELOAD (25 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) -#define KVM_REQ_HV_CRASH 26 -#define KVM_REQ_IOAPIC_EOI_EXIT 27 -#define KVM_REQ_HV_RESET 28 -#define KVM_REQ_HV_EXIT 29 -#define KVM_REQ_HV_STIMER 30 +#define KVM_REQ_MIGRATE_TIMER KVM_ARCH_REQ(0) +#define KVM_REQ_REPORT_TPR_ACCESS KVM_ARCH_REQ(1) +#define KVM_REQ_TRIPLE_FAULT KVM_ARCH_REQ(2) +#define KVM_REQ_MMU_SYNC KVM_ARCH_REQ(3) +#define KVM_REQ_CLOCK_UPDATE KVM_ARCH_REQ(4) +#define KVM_REQ_EVENT KVM_ARCH_REQ(6) +#define KVM_REQ_APF_HALT KVM_ARCH_REQ(7) +#define KVM_REQ_STEAL_UPDATE KVM_ARCH_REQ(8) +#define KVM_REQ_NMI KVM_ARCH_REQ(9) +#define KVM_REQ_PMU KVM_ARCH_REQ(10) +#define KVM_REQ_PMI KVM_ARCH_REQ(11) +#define KVM_REQ_SMI KVM_ARCH_REQ(12) +#define KVM_REQ_MASTERCLOCK_UPDATE KVM_ARCH_REQ(13) +#define KVM_REQ_MCLOCK_INPROGRESS \ + KVM_ARCH_REQ_FLAGS(14, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_SCAN_IOAPIC \ + KVM_ARCH_REQ_FLAGS(15, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_GLOBAL_CLOCK_UPDATE KVM_ARCH_REQ(16) +#define KVM_REQ_APIC_PAGE_RELOAD \ + KVM_ARCH_REQ_FLAGS(17, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_HV_CRASH KVM_ARCH_REQ(18) +#define KVM_REQ_IOAPIC_EOI_EXIT KVM_ARCH_REQ(19) +#define KVM_REQ_HV_RESET KVM_ARCH_REQ(20) +#define KVM_REQ_HV_EXIT KVM_ARCH_REQ(21) +#define KVM_REQ_HV_STIMER KVM_ARCH_REQ(22) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) -#define CR3_L_MODE_RESERVED_BITS 0xFFFFFF0000000000ULL #define 
CR3_PCID_INVD BIT_64(63) #define CR4_RESERVED_BITS \ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \ | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \ - | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE | X86_CR4_SMAP \ - | X86_CR4_PKE)) + | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \ + | X86_CR4_SMAP | X86_CR4_PKE)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) @@ -201,7 +203,6 @@ enum { #define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT) #define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \ - PFERR_USER_MASK | \ PFERR_WRITE_MASK | \ PFERR_PRESENT_MASK) @@ -254,7 +255,8 @@ union kvm_mmu_page_role { unsigned cr0_wp:1; unsigned smep_andnot_wp:1; unsigned smap_andnot_wp:1; - unsigned :8; + unsigned ad_disabled:1; + unsigned :7; /* * This is left at the top of the word so that @@ -313,15 +315,17 @@ struct kvm_pio_request { int size; }; +#define PT64_ROOT_MAX_LEVEL 5 + struct rsvd_bits_validate { - u64 rsvd_bits_mask[2][4]; + u64 rsvd_bits_mask[2][PT64_ROOT_MAX_LEVEL]; u64 bad_mt_xwr; }; /* - * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level - * 32-bit). The kvm_mmu structure abstracts the details of the current mmu - * mode. + * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit, + * and 2-level 32-bit). The kvm_mmu structure abstracts the details of the + * current mmu mode. */ struct kvm_mmu { void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); @@ -458,10 +462,12 @@ struct kvm_vcpu_hv_synic { DECLARE_BITMAP(auto_eoi_bitmap, 256); DECLARE_BITMAP(vec_bitmap, 256); bool active; + bool dont_zero_synic_pages; }; /* Hyper-V per vcpu emulation context */ struct kvm_vcpu_hv { + u32 vp_index; u64 hv_vapic; s64 runtime_offset; struct kvm_vcpu_hv_synic synic; @@ -486,6 +492,7 @@ struct kvm_vcpu_arch { unsigned long cr4; unsigned long cr4_guest_owned_bits; unsigned long cr8; + u32 pkru; u32 hflags; u64 efer; u64 apic_base; @@ -541,10 +548,11 @@ struct kvm_vcpu_arch { struct kvm_queued_exception { bool pending; + bool injected; bool has_error_code; - bool reinject; u8 nr; u32 error_code; + u8 nested_apf; } exception; struct kvm_queued_interrupt { @@ -645,6 +653,9 @@ struct kvm_vcpu_arch { u64 msr_val; u32 id; bool send_user_only; + u32 host_apf_reason; + unsigned long nested_apf_token; + bool delivery_as_pf_vmexit; } apf; /* OSVW MSRs (AMD only) */ @@ -676,8 +687,12 @@ struct kvm_vcpu_arch { int pending_ioapic_eoi; int pending_external_vector; - /* GPA available (AMD only) */ + /* GPA available */ bool gpa_available; + gpa_t gpa_val; + + /* be preempted when it's in kernel-mode(cpl=0) */ + bool preempted_in_kernel; }; struct kvm_lpage_info { @@ -799,6 +814,7 @@ struct kvm_arch { int audit_point; #endif + bool backwards_tsc_observed; bool boot_vcpu_runs_old_kvmclock; u32 bsp_vcpu_id; @@ -935,7 +951,6 @@ struct kvm_x86_ops { void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); - u32 (*get_pkru)(struct kvm_vcpu *vcpu); void (*tlb_flush)(struct kvm_vcpu *vcpu); @@ -948,9 +963,7 @@ struct kvm_x86_ops { unsigned char *hypercall_addr); void (*set_irq)(struct kvm_vcpu *vcpu); void (*set_nmi)(struct kvm_vcpu *vcpu); - void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, - bool has_error_code, u32 error_code, - bool reinject); + void (*queue_exception)(struct kvm_vcpu *vcpu); 
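In the kvm_mmu_page_role hunk a little above, the new ad_disabled bit is carved out of the anonymous padding so the role still fits within the union's single word, which is what keeps role comparison and hashing a one-integer operation. A rough sketch of that bitfield-union idiom follows (field names and widths are abridged for illustration, not the real KVM layout):

#include <stdint.h>
#include <stdio.h>

/* Pack several small role bits and compare them as one integer. */
union page_role {
	struct {
		unsigned level:4;
		unsigned cr4_pae:1;
		unsigned ad_disabled:1;
		unsigned reserved:26;
	};
	uint32_t word;		/* whole-role view used for compare/hash */
};

int main(void)
{
	union page_role a = { .word = 0 }, b = { .word = 0 };

	a.level = 4;
	b.level = 4;
	b.ad_disabled = 1;	/* one extra bit makes it a different role */

	printf("same role? %s\n", a.word == b.word ? "yes" : "no");
	return 0;
}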
void (*cancel_injection)(struct kvm_vcpu *vcpu); int (*interrupt_allowed)(struct kvm_vcpu *vcpu); int (*nmi_allowed)(struct kvm_vcpu *vcpu); @@ -959,7 +972,7 @@ struct kvm_x86_ops { void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); - bool (*get_enable_apicv)(void); + bool (*get_enable_apicv)(struct kvm_vcpu *vcpu); void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); @@ -969,7 +982,7 @@ struct kvm_x86_ops { void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); - int (*get_tdp_level)(void); + int (*get_tdp_level)(struct kvm_vcpu *vcpu); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); int (*get_lpage_level)(void); bool (*rdtscp_supported)(void); @@ -1069,7 +1082,7 @@ void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, - u64 acc_track_mask); + u64 acc_track_mask, u64 me_mask); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, @@ -1287,20 +1300,6 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); } -static inline u64 get_canonical(u64 la) -{ - return ((int64_t)la << 16) >> 16; -} - -static inline bool is_noncanonical_address(u64 la) -{ -#ifdef CONFIG_X86_64 - return get_canonical(la) != la; -#else - return false; -#endif -} - #define TSS_IOPB_BASE_OFFSET 0x66 #define TSS_BASE_SIZE 0x68 #define TSS_IOPB_SIZE (65536 / 8) @@ -1365,8 +1364,6 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu); -void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, - unsigned long address); void kvm_define_shared_msr(unsigned index, u32 msr); int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h index c4eda791f877..172f9749dbb2 100644 --- a/arch/x86/include/asm/kvm_page_track.h +++ b/arch/x86/include/asm/kvm_page_track.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_KVM_PAGE_TRACK_H #define _ASM_X86_KVM_PAGE_TRACK_H diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index bc62e7cbf1b1..c373e44049b1 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_KVM_PARA_H #define _ASM_X86_KVM_PARA_H @@ -88,7 +89,7 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, bool kvm_para_available(void); unsigned int kvm_arch_para_features(void); void __init kvm_guest_init(void); -void kvm_async_pf_task_wait(u32 token); +void kvm_async_pf_task_wait(u32 token, int interrupt_kernel); void kvm_async_pf_task_wake(u32 token); u32 kvm_read_and_reset_pf_reason(void); extern void kvm_disable_steal_time(void); @@ -103,7 +104,7 @@ static inline void kvm_spinlock_init(void) #else /* CONFIG_KVM_GUEST */ #define kvm_guest_init() do {} while (0) -#define 
kvm_async_pf_task_wait(T) do {} while(0) +#define kvm_async_pf_task_wait(T, I) do {} while(0) #define kvm_async_pf_task_wake(T) do {} while(0) static inline bool kvm_para_available(void) diff --git a/arch/x86/include/asm/kvmclock.h b/arch/x86/include/asm/kvmclock.h index f260bef63591..eceea9299097 100644 --- a/arch/x86/include/asm/kvmclock.h +++ b/arch/x86/include/asm/kvmclock.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_KVM_CLOCK_H #define _ASM_X86_KVM_CLOCK_H diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h deleted file mode 100644 index 73d0c9b92087..000000000000 --- a/arch/x86/include/asm/lguest.h +++ /dev/null @@ -1,91 +0,0 @@ -#ifndef _ASM_X86_LGUEST_H -#define _ASM_X86_LGUEST_H - -#define GDT_ENTRY_LGUEST_CS 10 -#define GDT_ENTRY_LGUEST_DS 11 -#define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8) -#define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8) - -#ifndef __ASSEMBLY__ -#include <asm/desc.h> - -#define GUEST_PL 1 - -/* Page for Switcher text itself, then two pages per cpu */ -#define SWITCHER_TEXT_PAGES (1) -#define SWITCHER_STACK_PAGES (2 * nr_cpu_ids) -#define TOTAL_SWITCHER_PAGES (SWITCHER_TEXT_PAGES + SWITCHER_STACK_PAGES) - -/* Where we map the Switcher, in both Host and Guest. */ -extern unsigned long switcher_addr; - -/* Found in switcher.S */ -extern unsigned long default_idt_entries[]; - -/* Declarations for definitions in arch/x86/lguest/head_32.S */ -extern char lguest_noirq_iret[]; -extern const char lgstart_cli[], lgend_cli[]; -extern const char lgstart_pushf[], lgend_pushf[]; - -extern void lguest_iret(void); -extern void lguest_init(void); - -struct lguest_regs { - /* Manually saved part. */ - unsigned long eax, ebx, ecx, edx; - unsigned long esi, edi, ebp; - unsigned long gs; - unsigned long fs, ds, es; - unsigned long trapnum, errcode; - /* Trap pushed part */ - unsigned long eip; - unsigned long cs; - unsigned long eflags; - unsigned long esp; - unsigned long ss; -}; - -/* This is a guest-specific page (mapped ro) into the guest. */ -struct lguest_ro_state { - /* Host information we need to restore when we switch back. */ - u32 host_cr3; - struct desc_ptr host_idt_desc; - struct desc_ptr host_gdt_desc; - u32 host_sp; - - /* Fields which are used when guest is running. */ - struct desc_ptr guest_idt_desc; - struct desc_ptr guest_gdt_desc; - struct x86_hw_tss guest_tss; - struct desc_struct guest_idt[IDT_ENTRIES]; - struct desc_struct guest_gdt[GDT_ENTRIES]; -}; - -struct lg_cpu_arch { - /* The GDT entries copied into lguest_ro_state when running. */ - struct desc_struct gdt[GDT_ENTRIES]; - - /* The IDT entries: some copied into lguest_ro_state when running. */ - struct desc_struct idt[IDT_ENTRIES]; - - /* The address of the last guest-visible pagefault (ie. cr2). */ - unsigned long last_pagefault; -}; - -static inline void lguest_set_ts(void) -{ - u32 cr0; - - cr0 = read_cr0(); - if (!(cr0 & 8)) - write_cr0(cr0 | 8); -} - -/* Full 4G segment descriptors, suitable for CS and DS. 
*/ -#define FULL_EXEC_SEGMENT \ - ((struct desc_struct)GDT_ENTRY_INIT(0xc09b, 0, 0xfffff)) -#define FULL_SEGMENT ((struct desc_struct)GDT_ENTRY_INIT(0xc093, 0, 0xfffff)) - -#endif /* __ASSEMBLY__ */ - -#endif /* _ASM_X86_LGUEST_H */ diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h deleted file mode 100644 index 6c119cfae218..000000000000 --- a/arch/x86/include/asm/lguest_hcall.h +++ /dev/null @@ -1,74 +0,0 @@ -/* Architecture specific portion of the lguest hypercalls */ -#ifndef _ASM_X86_LGUEST_HCALL_H -#define _ASM_X86_LGUEST_HCALL_H - -#define LHCALL_FLUSH_ASYNC 0 -#define LHCALL_LGUEST_INIT 1 -#define LHCALL_SHUTDOWN 2 -#define LHCALL_NEW_PGTABLE 4 -#define LHCALL_FLUSH_TLB 5 -#define LHCALL_LOAD_IDT_ENTRY 6 -#define LHCALL_SET_STACK 7 -#define LHCALL_SET_CLOCKEVENT 9 -#define LHCALL_HALT 10 -#define LHCALL_SET_PMD 13 -#define LHCALL_SET_PTE 14 -#define LHCALL_SET_PGD 15 -#define LHCALL_LOAD_TLS 16 -#define LHCALL_LOAD_GDT_ENTRY 18 -#define LHCALL_SEND_INTERRUPTS 19 - -#define LGUEST_TRAP_ENTRY 0x1F - -/* Argument number 3 to LHCALL_LGUEST_SHUTDOWN */ -#define LGUEST_SHUTDOWN_POWEROFF 1 -#define LGUEST_SHUTDOWN_RESTART 2 - -#ifndef __ASSEMBLY__ -#include <asm/hw_irq.h> - -/*G:030 - * But first, how does our Guest contact the Host to ask for privileged - * operations? There are two ways: the direct way is to make a "hypercall", - * to make requests of the Host Itself. - * - * Our hypercall mechanism uses the highest unused trap code (traps 32 and - * above are used by real hardware interrupts). Seventeen hypercalls are - * available: the hypercall number is put in the %eax register, and the - * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. - * If a return value makes sense, it's returned in %eax. - * - * Grossly invalid calls result in Sudden Death at the hands of the vengeful - * Host, rather than returning failure. This reflects Winston Churchill's - * definition of a gentleman: "someone who is only rude intentionally". - */ -static inline unsigned long -hcall(unsigned long call, - unsigned long arg1, unsigned long arg2, unsigned long arg3, - unsigned long arg4) -{ - /* "int" is the Intel instruction to trigger a trap. */ - asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY) - /* The call in %eax (aka "a") might be overwritten */ - : "=a"(call) - /* The arguments are in %eax, %ebx, %ecx, %edx & %esi */ - : "a"(call), "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4) - /* "memory" means this might write somewhere in memory. - * This isn't true for all calls, but it's safe to tell - * gcc that it might happen so it doesn't get clever. */ - : "memory"); - return call; -} -/*:*/ - -/* Can't use our min() macro here: needs to be a constant */ -#define LGUEST_IRQS (NR_IRQS < 32 ? 
NR_IRQS: 32) - -#define LHCALL_RING_SIZE 64 -struct hcall_args { - /* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */ - unsigned long arg0, arg1, arg2, arg3, arg4; -}; - -#endif /* !__ASSEMBLY__ */ -#endif /* _ASM_X86_LGUEST_HCALL_H */ diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index 0ccb26dda126..14caa9d9fb7f 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_LINKAGE_H #define _ASM_X86_LINKAGE_H diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h index 7511978093eb..c91083c59845 100644 --- a/arch/x86/include/asm/local.h +++ b/arch/x86/include/asm/local.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_LOCAL_H #define _ASM_X86_LOCAL_H diff --git a/arch/x86/include/asm/mach_timer.h b/arch/x86/include/asm/mach_timer.h index 88d0c3c74c13..044daf6fb8f6 100644 --- a/arch/x86/include/asm/mach_timer.h +++ b/arch/x86/include/asm/mach_timer.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Machine specific calibrate_tsc() for generic. * Split out from timer_tsc.c by Osamu Tomita <tomita@cinet.co.jp> diff --git a/arch/x86/include/asm/mach_traps.h b/arch/x86/include/asm/mach_traps.h index a01e7ec7d237..e39a517467cd 100644 --- a/arch/x86/include/asm/mach_traps.h +++ b/arch/x86/include/asm/mach_traps.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Machine specific NMI handling for generic. * Split out from traps.c by Osamu Tomita <tomita@cinet.co.jp> diff --git a/arch/x86/include/asm/math_emu.h b/arch/x86/include/asm/math_emu.h index 0d9b14f60d2c..3c42743083ed 100644 --- a/arch/x86/include/asm/math_emu.h +++ b/arch/x86/include/asm/math_emu.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_MATH_EMU_H #define _ASM_X86_MATH_EMU_H diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h index 24acd9ba7837..1775a32f7ea6 100644 --- a/arch/x86/include/asm/mc146818rtc.h +++ b/arch/x86/include/asm/mc146818rtc.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Machine dependent access functions for RTC registers. */ diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 3f9a3d2a5209..b1e8d8db921f 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_MCE_H #define _ASM_X86_MCE_H @@ -187,7 +188,6 @@ struct mca_msr_regs { extern struct mce_vendor_flags mce_flags; -extern struct mca_config mca_cfg; extern struct mca_msr_regs msr_ops; enum mce_notifier_prios { @@ -285,10 +285,6 @@ int mce_notify_irq(void); DECLARE_PER_CPU(struct mce, injectm); -extern void register_mce_write_callback(ssize_t (*)(struct file *filp, - const char __user *ubuf, - size_t usize, loff_t *off)); - /* Disable CMCI/polling for MCA bank claimed by firmware */ extern void mce_disable_bank(int bank); diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h new file mode 100644 index 000000000000..6a77c63540f7 --- /dev/null +++ b/arch/x86/include/asm/mem_encrypt.h @@ -0,0 +1,80 @@ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef __X86_MEM_ENCRYPT_H__ +#define __X86_MEM_ENCRYPT_H__ + +#ifndef __ASSEMBLY__ + +#include <linux/init.h> + +#include <asm/bootparam.h> + +#ifdef CONFIG_AMD_MEM_ENCRYPT + +extern u64 sme_me_mask; + +void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr, + unsigned long decrypted_kernel_vaddr, + unsigned long kernel_len, + unsigned long encryption_wa, + unsigned long encryption_pgd); + +void __init sme_early_encrypt(resource_size_t paddr, + unsigned long size); +void __init sme_early_decrypt(resource_size_t paddr, + unsigned long size); + +void __init sme_map_bootdata(char *real_mode_data); +void __init sme_unmap_bootdata(char *real_mode_data); + +void __init sme_early_init(void); + +void __init sme_encrypt_kernel(void); +void __init sme_enable(struct boot_params *bp); + +/* Architecture __weak replacement functions */ +void __init mem_encrypt_init(void); + +void swiotlb_set_mem_attributes(void *vaddr, unsigned long size); + +#else /* !CONFIG_AMD_MEM_ENCRYPT */ + +#define sme_me_mask 0ULL + +static inline void __init sme_early_encrypt(resource_size_t paddr, + unsigned long size) { } +static inline void __init sme_early_decrypt(resource_size_t paddr, + unsigned long size) { } + +static inline void __init sme_map_bootdata(char *real_mode_data) { } +static inline void __init sme_unmap_bootdata(char *real_mode_data) { } + +static inline void __init sme_early_init(void) { } + +static inline void __init sme_encrypt_kernel(void) { } +static inline void __init sme_enable(struct boot_params *bp) { } + +#endif /* CONFIG_AMD_MEM_ENCRYPT */ + +/* + * The __sme_pa() and __sme_pa_nodebug() macros are meant for use when + * writing to or comparing values from the cr3 register. Having the + * encryption mask set in cr3 enables the PGD entry to be encrypted and + * avoid special case handling of PGD allocations. 
+ */ +#define __sme_pa(x) (__pa(x) | sme_me_mask) +#define __sme_pa_nodebug(x) (__pa_nodebug(x) | sme_me_mask) + +#endif /* __ASSEMBLY__ */ + +#endif /* __X86_MEM_ENCRYPT_H__ */ diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index daadeeea00b1..55520cec8b27 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_MICROCODE_H #define _ASM_X86_MICROCODE_H diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h index 3d57009e168b..209492849566 100644 --- a/arch/x86/include/asm/microcode_amd.h +++ b/arch/x86/include/asm/microcode_amd.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_MICROCODE_AMD_H #define _ASM_X86_MICROCODE_AMD_H diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h index e793fc9a9b20..d85a07d7154f 100644 --- a/arch/x86/include/asm/microcode_intel.h +++ b/arch/x86/include/asm/microcode_intel.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_MICROCODE_INTEL_H #define _ASM_X86_MICROCODE_INTEL_H diff --git a/arch/x86/include/asm/misc.h b/arch/x86/include/asm/misc.h index 475f5bbc7f53..bb049cca3729 100644 --- a/arch/x86/include/asm/misc.h +++ b/arch/x86/include/asm/misc.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_MISC_H #define _ASM_X86_MISC_H diff --git a/arch/x86/include/asm/mmconfig.h b/arch/x86/include/asm/mmconfig.h index 04a3fed22cfe..976486447e10 100644 --- a/arch/x86/include/asm/mmconfig.h +++ b/arch/x86/include/asm/mmconfig.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_MMCONFIG_H #define _ASM_X86_MMCONFIG_H diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index f9813b6d8b80..9ea26f167497 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -1,14 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_MMU_H #define _ASM_X86_MMU_H #include <linux/spinlock.h> #include <linux/mutex.h> +#include <linux/atomic.h> /* - * The x86 doesn't have a mmu context, but - * we put the segment information here. + * x86 has arch-specific MMU state beyond what lives in mm_struct. */ typedef struct { + /* + * ctx_id uniquely identifies this mm_struct. A ctx_id will never + * be reused, and zero is not a valid ctx_id. + */ + u64 ctx_id; + + /* + * Any code that needs to do any sort of TLB flushing for this + * mm will first make its changes to the page tables, then + * increment tlb_gen, then flush. This lets the low-level + * flushing code keep track of what needs flushing. + * + * This is not used on Xen PV. 
+ */ + atomic64_t tlb_gen; + #ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; #endif @@ -37,12 +54,11 @@ typedef struct { #endif } mm_context_t; -#ifdef CONFIG_SMP +#define INIT_MM_CONTEXT(mm) \ + .context = { \ + .ctx_id = 1, \ + } + void leave_mm(int cpu); -#else -static inline void leave_mm(int cpu) -{ -} -#endif #endif /* _ASM_X86_MMU_H */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 68b329d77b3a..6699fc441644 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_MMU_CONTEXT_H #define _ASM_X86_MMU_CONTEXT_H @@ -12,6 +13,9 @@ #include <asm/tlbflush.h> #include <asm/paravirt.h> #include <asm/mpx.h> + +extern atomic64_t last_mm_ctx_id; + #ifndef CONFIG_PARAVIRT static inline void paravirt_activate_mm(struct mm_struct *prev, struct mm_struct *next) @@ -47,7 +51,7 @@ struct ldt_struct { * allocations, but it's not worth trying to optimize. */ struct desc_struct *entries; - unsigned int size; + unsigned int nr_entries; }; /* @@ -87,27 +91,50 @@ static inline void load_mm_ldt(struct mm_struct *mm) */ if (unlikely(ldt)) - set_ldt(ldt->entries, ldt->size); + set_ldt(ldt->entries, ldt->nr_entries); else clear_LDT(); #else clear_LDT(); #endif - - DEBUG_LOCKS_WARN_ON(preemptible()); } -static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) +static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next) { -#ifdef CONFIG_SMP - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) - this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); +#ifdef CONFIG_MODIFY_LDT_SYSCALL + /* + * Load the LDT if either the old or new mm had an LDT. + * + * An mm will never go from having an LDT to not having an LDT. Two + * mms never share an LDT, so we don't gain anything by checking to + * see whether the LDT changed. There's also no guarantee that + * prev->context.ldt actually matches LDTR, but, if LDTR is non-NULL, + * then prev->context.ldt will also be non-NULL. + * + * If we really cared, we could optimize the case where prev == next + * and we're exiting lazy mode. Most of the time, if this happens, + * we don't actually need to reload LDTR, but modify_ldt() is mostly + * used by legacy code and emulators where we don't need this level of + * performance. + * + * This uses | instead of || because it generates better code. 
+ */ + if (unlikely((unsigned long)prev->context.ldt | + (unsigned long)next->context.ldt)) + load_mm_ldt(next); #endif + + DEBUG_LOCKS_WARN_ON(preemptible()); } +void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk); + static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { + mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id); + atomic64_set(&mm->context.tlb_gen, 0); + #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { /* pkey 0 is the default and always allocated */ @@ -116,9 +143,7 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.execute_only_pkey = -1; } #endif - init_new_context_ldt(tsk, mm); - - return 0; + return init_new_context_ldt(tsk, mm); } static inline void destroy_context(struct mm_struct *mm) { @@ -220,18 +245,6 @@ static inline int vma_pkey(struct vm_area_struct *vma) } #endif -static inline bool __pkru_allows_pkey(u16 pkey, bool write) -{ - u32 pkru = read_pkru(); - - if (!__pkru_allows_read(pkru, pkey)) - return false; - if (write && !__pkru_allows_write(pkru, pkey)) - return false; - - return true; -} - /* * We only want to enforce protection keys on the current process * because we effectively have no access to PKRU for other @@ -268,4 +281,50 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, return __pkru_allows_pkey(vma_pkey(vma), write); } +/* + * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID + * bits. This serves two purposes. It prevents a nasty situation in + * which PCID-unaware code saves CR3, loads some other value (with PCID + * == 0), and then restores CR3, thus corrupting the TLB for ASID 0 if + * the saved ASID was nonzero. It also means that any bugs involving + * loading a PCID-enabled CR3 with CR4.PCIDE off will trigger + * deterministically. + */ + +static inline unsigned long build_cr3(struct mm_struct *mm, u16 asid) +{ + if (static_cpu_has(X86_FEATURE_PCID)) { + VM_WARN_ON_ONCE(asid > 4094); + return __sme_pa(mm->pgd) | (asid + 1); + } else { + VM_WARN_ON_ONCE(asid != 0); + return __sme_pa(mm->pgd); + } +} + +static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid) +{ + VM_WARN_ON_ONCE(asid > 4094); + return __sme_pa(mm->pgd) | (asid + 1) | CR3_NOFLUSH; +} + +/* + * This can be used from process context to figure out what the value of + * CR3 is without needing to do a (slow) __read_cr3(). + * + * It's intended to be used for code like KVM that sneakily changes CR3 + * and needs to restore it. It needs to be used very carefully. + */ +static inline unsigned long __get_current_cr3_fast(void) +{ + unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm), + this_cpu_read(cpu_tlbstate.loaded_mm_asid)); + + /* For now, be very restrictive about when this can be called. 
*/ + VM_WARN_ON(in_nmi() || preemptible()); + + VM_BUG_ON(cr3 != __read_cr3()); + return cr3; +} + #endif /* _ASM_X86_MMU_CONTEXT_H */ diff --git a/arch/x86/include/asm/mmx.h b/arch/x86/include/asm/mmx.h index 5cbf3135b971..f572d0f944bb 100644 --- a/arch/x86/include/asm/mmx.h +++ b/arch/x86/include/asm/mmx.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_MMX_H #define _ASM_X86_MMX_H diff --git a/arch/x86/include/asm/mmzone.h b/arch/x86/include/asm/mmzone.h index d497bc425cae..c41b41edd691 100644 --- a/arch/x86/include/asm/mmzone.h +++ b/arch/x86/include/asm/mmzone.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifdef CONFIG_X86_32 # include <asm/mmzone_32.h> #else diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h index 1ec990bd7dc0..73d8dd14dda2 100644 --- a/arch/x86/include/asm/mmzone_32.h +++ b/arch/x86/include/asm/mmzone_32.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Written by Pat Gaughen (gone@us.ibm.com) Mar 2002 * diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h index 129d9aa3ceb3..0c585046f744 100644 --- a/arch/x86/include/asm/mmzone_64.h +++ b/arch/x86/include/asm/mmzone_64.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* K8 NUMA support */ /* Copyright 2002,2003 by Andi Kleen, SuSE Labs */ /* 2.5 Version loosely based on the NUMAQ Code by Pat Gaughen. */ diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index e3b7819caeef..8546fafa21a9 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -1,7 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_MODULE_H #define _ASM_X86_MODULE_H #include <asm-generic/module.h> +#include <asm/orc_types.h> + +struct mod_arch_specific { +#ifdef CONFIG_ORC_UNWINDER + unsigned int num_orcs; + int *orc_unwind_ip; + struct orc_entry *orc_unwind; +#endif +}; #ifdef CONFIG_X86_64 /* X86_64 does not define MODULE_PROC_FAMILY */ diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index 831eb7895535..606cbaebd336 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_MPSPEC_H #define _ASM_X86_MPSPEC_H @@ -86,7 +87,6 @@ static inline void e820__memblock_alloc_reserved_mpc_new(void) { } #endif int generic_processor_info(int apicid, int version); -int __generic_processor_info(int apicid, int version, bool enabled); #define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC) diff --git a/arch/x86/include/asm/mpspec_def.h b/arch/x86/include/asm/mpspec_def.h index b31f8c098271..9492893aec52 100644 --- a/arch/x86/include/asm/mpspec_def.h +++ b/arch/x86/include/asm/mpspec_def.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_MPSPEC_DEF_H #define _ASM_X86_MPSPEC_DEF_H diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h index a0d662be4c5b..61eb4b63c5ec 100644 --- a/arch/x86/include/asm/mpx.h +++ b/arch/x86/include/asm/mpx.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_MPX_H #define _ASM_X86_MPX_H @@ -73,6 +74,9 @@ static inline void mpx_mm_init(struct mm_struct *mm) } void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long start, unsigned long end); + +unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len, + unsigned long flags); #else static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs) { @@ -94,6 +98,12 @@ static inline 
void mpx_notify_unmap(struct mm_struct *mm, unsigned long start, unsigned long end) { } + +static inline unsigned long mpx_unmapped_area_check(unsigned long addr, + unsigned long len, unsigned long flags) +{ + return addr; +} #endif /* CONFIG_X86_INTEL_MPX */ #endif /* _ASM_X86_MPX_H */ diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index d5acc27ed1cc..581bb54dd464 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -1,8 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_MSHYPER_H #define _ASM_X86_MSHYPER_H #include <linux/types.h> #include <linux/atomic.h> +#include <linux/nmi.h> +#include <asm/io.h> #include <asm/hyperv.h> /* @@ -28,6 +31,8 @@ struct ms_hyperv_info { u32 features; u32 misc_features; u32 hints; + u32 max_vp_index; + u32 max_lp_index; }; extern struct ms_hyperv_info ms_hyperv; @@ -136,7 +141,6 @@ static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) } } -#define hv_get_current_tick(tick) rdmsrl(HV_X64_MSR_TIME_REF_COUNT, tick) #define hv_init_timer(timer, tick) wrmsrl(timer, tick) #define hv_init_timer_config(config, val) wrmsrl(config, val) @@ -169,12 +173,154 @@ void hv_remove_crash_handler(void); #if IS_ENABLED(CONFIG_HYPERV) extern struct clocksource *hyperv_cs; +extern void *hv_hypercall_pg; + +static inline u64 hv_do_hypercall(u64 control, void *input, void *output) +{ + u64 input_address = input ? virt_to_phys(input) : 0; + u64 output_address = output ? virt_to_phys(output) : 0; + u64 hv_status; + +#ifdef CONFIG_X86_64 + if (!hv_hypercall_pg) + return U64_MAX; + + __asm__ __volatile__("mov %4, %%r8\n" + "call *%5" + : "=a" (hv_status), ASM_CALL_CONSTRAINT, + "+c" (control), "+d" (input_address) + : "r" (output_address), "m" (hv_hypercall_pg) + : "cc", "memory", "r8", "r9", "r10", "r11"); +#else + u32 input_address_hi = upper_32_bits(input_address); + u32 input_address_lo = lower_32_bits(input_address); + u32 output_address_hi = upper_32_bits(output_address); + u32 output_address_lo = lower_32_bits(output_address); + + if (!hv_hypercall_pg) + return U64_MAX; + + __asm__ __volatile__("call *%7" + : "=A" (hv_status), + "+c" (input_address_lo), ASM_CALL_CONSTRAINT + : "A" (control), + "b" (input_address_hi), + "D"(output_address_hi), "S"(output_address_lo), + "m" (hv_hypercall_pg) + : "cc", "memory"); +#endif /* !x86_64 */ + return hv_status; +} + +#define HV_HYPERCALL_RESULT_MASK GENMASK_ULL(15, 0) +#define HV_HYPERCALL_FAST_BIT BIT(16) +#define HV_HYPERCALL_VARHEAD_OFFSET 17 +#define HV_HYPERCALL_REP_COMP_OFFSET 32 +#define HV_HYPERCALL_REP_COMP_MASK GENMASK_ULL(43, 32) +#define HV_HYPERCALL_REP_START_OFFSET 48 +#define HV_HYPERCALL_REP_START_MASK GENMASK_ULL(59, 48) + +/* Fast hypercall with 8 bytes of input and no output */ +static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1) +{ + u64 hv_status, control = (u64)code | HV_HYPERCALL_FAST_BIT; + +#ifdef CONFIG_X86_64 + { + __asm__ __volatile__("call *%4" + : "=a" (hv_status), ASM_CALL_CONSTRAINT, + "+c" (control), "+d" (input1) + : "m" (hv_hypercall_pg) + : "cc", "r8", "r9", "r10", "r11"); + } +#else + { + u32 input1_hi = upper_32_bits(input1); + u32 input1_lo = lower_32_bits(input1); + + __asm__ __volatile__ ("call *%5" + : "=A"(hv_status), + "+c"(input1_lo), + ASM_CALL_CONSTRAINT + : "A" (control), + "b" (input1_hi), + "m" (hv_hypercall_pg) + : "cc", "edi", "esi"); + } +#endif + return hv_status; +} + +/* + * Rep hypercalls. 
Callers of this functions are supposed to ensure that + * rep_count and varhead_size comply with Hyper-V hypercall definition. + */ +static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size, + void *input, void *output) +{ + u64 control = code; + u64 status; + u16 rep_comp; + + control |= (u64)varhead_size << HV_HYPERCALL_VARHEAD_OFFSET; + control |= (u64)rep_count << HV_HYPERCALL_REP_COMP_OFFSET; + + do { + status = hv_do_hypercall(control, input, output); + if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) + return status; + + /* Bits 32-43 of status have 'Reps completed' data. */ + rep_comp = (status & HV_HYPERCALL_REP_COMP_MASK) >> + HV_HYPERCALL_REP_COMP_OFFSET; + + control &= ~HV_HYPERCALL_REP_START_MASK; + control |= (u64)rep_comp << HV_HYPERCALL_REP_START_OFFSET; + + touch_nmi_watchdog(); + } while (rep_comp < rep_count); + + return status; +} + +/* + * Hypervisor's notion of virtual processor ID is different from + * Linux' notion of CPU ID. This information can only be retrieved + * in the context of the calling CPU. Setup a map for easy access + * to this information. + */ +extern u32 *hv_vp_index; +extern u32 hv_max_vp_index; + +/** + * hv_cpu_number_to_vp_number() - Map CPU to VP. + * @cpu_number: CPU number in Linux terms + * + * This function returns the mapping between the Linux processor + * number and the hypervisor's virtual processor number, useful + * in making hypercalls and such that talk about specific + * processors. + * + * Return: Virtual processor number in Hyper-V terms + */ +static inline int hv_cpu_number_to_vp_number(int cpu_number) +{ + return hv_vp_index[cpu_number]; +} void hyperv_init(void); +void hyperv_setup_mmu_ops(void); +void hyper_alloc_mmu(void); void hyperv_report_panic(struct pt_regs *regs); bool hv_is_hypercall_page_setup(void); void hyperv_cleanup(void); -#endif +#else /* CONFIG_HYPERV */ +static inline void hyperv_init(void) {} +static inline bool hv_is_hypercall_page_setup(void) { return false; } +static inline void hyperv_cleanup(void) {} +static inline void hyperv_setup_mmu_ops(void) {} +#endif /* CONFIG_HYPERV */ + #ifdef CONFIG_HYPERV_TSCPAGE struct ms_hyperv_tsc_page *hv_get_tsc_page(void); static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg) diff --git a/arch/x86/include/asm/msi.h b/arch/x86/include/asm/msi.h index eb4b09b41df5..25ddd0916bb2 100644 --- a/arch/x86/include/asm/msi.h +++ b/arch/x86/include/asm/msi.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_MSI_H #define _ASM_X86_MSI_H #include <asm/hw_irq.h> diff --git a/arch/x86/include/asm/msidef.h b/arch/x86/include/asm/msidef.h index 4cc48af23fef..ee2f8ccc32d0 100644 --- a/arch/x86/include/asm/msidef.h +++ b/arch/x86/include/asm/msidef.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_MSIDEF_H #define _ASM_X86_MSIDEF_H diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 673f9ac50f6d..ab022618a50a 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_MSR_INDEX_H #define _ASM_X86_MSR_INDEX_H @@ -137,6 +138,8 @@ #define DEBUGCTLMSR_BTS_OFF_OS (1UL << 9) #define DEBUGCTLMSR_BTS_OFF_USR (1UL << 10) #define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI (1UL << 11) +#define DEBUGCTLMSR_FREEZE_IN_SMM_BIT 14 +#define DEBUGCTLMSR_FREEZE_IN_SMM (1UL << DEBUGCTLMSR_FREEZE_IN_SMM_BIT) #define MSR_PEBS_FRONTEND 0x000003f7 @@ -249,9 +252,13 @@ #define 
HWP_MIN_PERF(x) (x & 0xff) #define HWP_MAX_PERF(x) ((x & 0xff) << 8) #define HWP_DESIRED_PERF(x) ((x & 0xff) << 16) -#define HWP_ENERGY_PERF_PREFERENCE(x) ((x & 0xff) << 24) -#define HWP_ACTIVITY_WINDOW(x) ((x & 0xff3) << 32) -#define HWP_PACKAGE_CONTROL(x) ((x & 0x1) << 42) +#define HWP_ENERGY_PERF_PREFERENCE(x) (((unsigned long long) x & 0xff) << 24) +#define HWP_EPP_PERFORMANCE 0x00 +#define HWP_EPP_BALANCE_PERFORMANCE 0x80 +#define HWP_EPP_BALANCE_POWERSAVE 0xC0 +#define HWP_EPP_POWERSAVE 0xFF +#define HWP_ACTIVITY_WINDOW(x) ((unsigned long long)(x & 0xff3) << 32) +#define HWP_PACKAGE_CONTROL(x) ((unsigned long long)(x & 0x1) << 42) /* IA32_HWP_STATUS */ #define HWP_GUARANTEED_CHANGE(x) (x & 0x1) @@ -350,6 +357,8 @@ #define MSR_K8_TOP_MEM1 0xc001001a #define MSR_K8_TOP_MEM2 0xc001001d #define MSR_K8_SYSCFG 0xc0010010 +#define MSR_K8_SYSCFG_MEM_ENCRYPT_BIT 23 +#define MSR_K8_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_K8_SYSCFG_MEM_ENCRYPT_BIT) #define MSR_K8_INT_PENDING_MSG 0xc0010055 /* C1E active bits in int pending message */ #define K8_INTP_C1E_ACTIVE_MASK 0x18000000 @@ -420,6 +429,8 @@ #define MSR_IA32_TSC_ADJUST 0x0000003b #define MSR_IA32_BNDCFGS 0x00000d90 +#define MSR_IA32_BNDCFGS_RSVD 0x00000ffc + #define MSR_IA32_XSS 0x00000da0 #define FEATURE_CONTROL_LOCKED (1<<0) @@ -474,9 +485,11 @@ #define MSR_MISC_PWR_MGMT 0x000001aa #define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0 -#define ENERGY_PERF_BIAS_PERFORMANCE 0 -#define ENERGY_PERF_BIAS_NORMAL 6 -#define ENERGY_PERF_BIAS_POWERSAVE 15 +#define ENERGY_PERF_BIAS_PERFORMANCE 0 +#define ENERGY_PERF_BIAS_BALANCE_PERFORMANCE 4 +#define ENERGY_PERF_BIAS_NORMAL 6 +#define ENERGY_PERF_BIAS_BALANCE_POWERSAVE 8 +#define ENERGY_PERF_BIAS_POWERSAVE 15 #define MSR_IA32_PACKAGE_THERM_STATUS 0x000001b1 diff --git a/arch/x86/include/asm/msr-trace.h b/arch/x86/include/asm/msr-trace.h index 7567225747d8..f6adbe96856a 100644 --- a/arch/x86/include/asm/msr-trace.h +++ b/arch/x86/include/asm/msr-trace.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM msr diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 898dba2e2e2c..07962f5f6fba 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_MSR_H #define _ASM_X86_MSR_H diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index bda3c27f0da0..39a2fb29378a 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_MWAIT_H #define _ASM_X86_MWAIT_H diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 5f2fc4441b11..75ded1d13d98 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_NMI_H #define _ASM_X86_NMI_H diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h index aff2b3356101..12f12b5cf2ca 100644 --- a/arch/x86/include/asm/nops.h +++ b/arch/x86/include/asm/nops.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_NOPS_H #define _ASM_X86_NOPS_H diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 01b493e5a99b..bbfde3d2662f 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_NUMA_H #define _ASM_X86_NUMA_H diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h index 
e7d6b8254742..9c8e9e85be77 100644 --- a/arch/x86/include/asm/numa_32.h +++ b/arch/x86/include/asm/numa_32.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_NUMA_32_H #define _ASM_X86_NUMA_32_H diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h index 72f9adf6eca4..c2bf1de5d901 100644 --- a/arch/x86/include/asm/olpc.h +++ b/arch/x86/include/asm/olpc.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* OLPC machine specific definitions */ #ifndef _ASM_X86_OLPC_H diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h index 24487712e0b1..8c2a1daf7f72 100644 --- a/arch/x86/include/asm/olpc_ofw.h +++ b/arch/x86/include/asm/olpc_ofw.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_OLPC_OFW_H #define _ASM_X86_OLPC_OFW_H diff --git a/arch/x86/include/asm/orc_lookup.h b/arch/x86/include/asm/orc_lookup.h new file mode 100644 index 000000000000..91c8d868424d --- /dev/null +++ b/arch/x86/include/asm/orc_lookup.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ +#ifndef _ORC_LOOKUP_H +#define _ORC_LOOKUP_H + +/* + * This is a lookup table for speeding up access to the .orc_unwind table. + * Given an input address offset, the corresponding lookup table entry + * specifies a subset of the .orc_unwind table to search. + * + * Each block represents the end of the previous range and the start of the + * next range. An extra block is added to give the last range an end. + * + * The block size should be a power of 2 to avoid a costly 'div' instruction. + * + * A block size of 256 was chosen because it roughly doubles unwinder + * performance while only adding ~5% to the ORC data footprint. + */ +#define LOOKUP_BLOCK_ORDER 8 +#define LOOKUP_BLOCK_SIZE (1 << LOOKUP_BLOCK_ORDER) + +#ifndef LINKER_SCRIPT + +extern unsigned int orc_lookup[]; +extern unsigned int orc_lookup_end[]; + +#define LOOKUP_START_IP (unsigned long)_stext +#define LOOKUP_STOP_IP (unsigned long)_etext + +#endif /* LINKER_SCRIPT */ + +#endif /* _ORC_LOOKUP_H */ diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h new file mode 100644 index 000000000000..9c9dc579bd7d --- /dev/null +++ b/arch/x86/include/asm/orc_types.h @@ -0,0 +1,107 @@ +/* + * Copyright (C) 2017 Josh Poimboeuf <jpoimboe@redhat.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _ORC_TYPES_H +#define _ORC_TYPES_H + +#include <linux/types.h> +#include <linux/compiler.h> + +/* + * The ORC_REG_* registers are base registers which are used to find other + * registers on the stack. + * + * ORC_REG_PREV_SP, also known as DWARF Call Frame Address (CFA), is the + * address of the previous frame: the caller's SP before it called the current + * function. + * + * ORC_REG_UNDEFINED means the corresponding register's value didn't change in + * the current frame. + * + * The most commonly used base registers are SP and BP -- which the previous SP + * is usually based on -- and PREV_SP and UNDEFINED -- which the previous BP is + * usually based on. + * + * The rest of the base registers are needed for special cases like entry code + * and GCC realigned stacks. + */ +#define ORC_REG_UNDEFINED 0 +#define ORC_REG_PREV_SP 1 +#define ORC_REG_DX 2 +#define ORC_REG_DI 3 +#define ORC_REG_BP 4 +#define ORC_REG_SP 5 +#define ORC_REG_R10 6 +#define ORC_REG_R13 7 +#define ORC_REG_BP_INDIRECT 8 +#define ORC_REG_SP_INDIRECT 9 +#define ORC_REG_MAX 15 + +/* + * ORC_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP (the + * caller's SP right before it made the call). Used for all callable + * functions, i.e. all C code and all callable asm functions. + * + * ORC_TYPE_REGS: Used in entry code to indicate that sp_reg+sp_offset points + * to a fully populated pt_regs from a syscall, interrupt, or exception. + * + * ORC_TYPE_REGS_IRET: Used in entry code to indicate that sp_reg+sp_offset + * points to the iret return frame. + * + * The UNWIND_HINT macros are used only for the unwind_hint struct. They + * aren't used in struct orc_entry due to size and complexity constraints. + * Objtool converts them to real types when it converts the hints to orc + * entries. + */ +#define ORC_TYPE_CALL 0 +#define ORC_TYPE_REGS 1 +#define ORC_TYPE_REGS_IRET 2 +#define UNWIND_HINT_TYPE_SAVE 3 +#define UNWIND_HINT_TYPE_RESTORE 4 + +#ifndef __ASSEMBLY__ +/* + * This struct is more or less a vastly simplified version of the DWARF Call + * Frame Information standard. It contains only the necessary parts of DWARF + * CFI, simplified for ease of access by the in-kernel unwinder. It tells the + * unwinder how to find the previous SP and BP (and sometimes entry regs) on + * the stack for a given code address. Each instance of the struct corresponds + * to one or more code locations. + */ +struct orc_entry { + s16 sp_offset; + s16 bp_offset; + unsigned sp_reg:4; + unsigned bp_reg:4; + unsigned type:2; +} __packed; + +/* + * This struct is used by asm and inline asm code to manually annotate the + * location of registers on the stack for the ORC unwinder. + * + * Type can be either ORC_TYPE_* or UNWIND_HINT_TYPE_*. 
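(Editor's aside, not part of the patch: a small sketch of the orc_lookup[] fast path described in orc_lookup.h above — an instruction offset is mapped to a 256-byte block index, which bounds the slice of .orc_unwind that the unwinder then searches. All identifiers below are illustrative stand-ins.)

/* Illustrative only: block-index math for a 256-byte lookup granule. */
#include <stdint.h>
#include <stdio.h>

#define EX_LOOKUP_BLOCK_ORDER  8                      /* 2^8 = 256 bytes */
#define EX_LOOKUP_BLOCK_SIZE   (1u << EX_LOOKUP_BLOCK_ORDER)

/* Map a text offset to its lookup-table slot; a power-of-two block size
 * lets the compiler turn the division into a shift. */
static unsigned int ex_lookup_index(uint64_t ip, uint64_t text_start)
{
        return (unsigned int)((ip - text_start) / EX_LOOKUP_BLOCK_SIZE);
}

int main(void)
{
        uint64_t text_start = 0xffffffff81000000ULL;
        uint64_t ip = text_start + 0x1234;

        /* 0x1234 / 0x100 = 0x12 -> slot 18 */
        printf("slot %u\n", ex_lookup_index(ip, text_start));
        return 0;
}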
+ */ +struct unwind_hint { + u32 ip; + s16 sp_offset; + u8 sp_reg; + u8 type; +}; +#endif /* __ASSEMBLY__ */ + +#endif /* _ORC_TYPES_H */ diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index cf8f619b305f..7555b48803a8 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PAGE_H #define _ASM_X86_PAGE_H diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h index 904f528cc8e8..94dbd51df58f 100644 --- a/arch/x86/include/asm/page_32.h +++ b/arch/x86/include/asm/page_32.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PAGE_32_H #define _ASM_X86_PAGE_32_H diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index 3bae4969ac65..aa30c3241ea7 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PAGE_32_DEFS_H #define _ASM_X86_PAGE_32_DEFS_H diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index b4a0d43248cf..4baa6bceb232 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PAGE_64_H #define _ASM_X86_PAGE_64_H @@ -51,6 +52,10 @@ static inline void clear_page(void *page) void copy_page(void *to, void *from); +#ifdef CONFIG_X86_MCE +#define arch_unmap_kpfn arch_unmap_kpfn +#endif + #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_X86_VSYSCALL_EMULATION diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 3f5f08b010d0..e1407312c412 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PAGE_64_DEFS_H #define _ASM_X86_PAGE_64_DEFS_H diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 7bd0099384ca..1e53560a84bb 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -1,8 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PAGE_DEFS_H #define _ASM_X86_PAGE_DEFS_H #include <linux/const.h> #include <linux/types.h> +#include <linux/mem_encrypt.h> /* PAGE_SHIFT determines the page size */ #define PAGE_SHIFT 12 @@ -15,7 +17,7 @@ #define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) -#define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1)) +#define __PHYSICAL_MASK ((phys_addr_t)(__sme_clr((1ULL << __PHYSICAL_MASK_SHIFT) - 1))) #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) /* Cast *PAGE_MASK to a signed type so that it is sign-extended if diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 55fa56fe4e45..fd81228e8037 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PARAVIRT_H #define _ASM_X86_PARAVIRT_H /* Various instructions on x86 need to be replaced for @@ -61,7 +62,7 @@ static inline void write_cr2(unsigned long x) PVOP_VCALL1(pv_mmu_ops.write_cr2, x); } -static inline unsigned long read_cr3(void) +static inline unsigned long __read_cr3(void) { return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3); } @@ -71,11 +72,6 @@ static inline void write_cr3(unsigned long x) PVOP_VCALL1(pv_mmu_ops.write_cr3, x); } -static inline unsigned long 
__read_cr4(void) -{ - return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4); -} - static inline void __write_cr4(unsigned long x) { PVOP_VCALL1(pv_cpu_ops.write_cr4, x); @@ -118,7 +114,7 @@ static inline u64 paravirt_read_msr(unsigned msr) static inline void paravirt_write_msr(unsigned msr, unsigned low, unsigned high) { - return PVOP_VCALL3(pv_cpu_ops.write_msr, msr, low, high); + PVOP_VCALL3(pv_cpu_ops.write_msr, msr, low, high); } static inline u64 paravirt_read_msr_safe(unsigned msr, int *err) @@ -228,10 +224,6 @@ static inline void set_ldt(const void *addr, unsigned entries) { PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries); } -static inline void store_idt(struct desc_ptr *dtr) -{ - PVOP_VCALL1(pv_cpu_ops.store_idt, dtr); -} static inline unsigned long paravirt_store_tr(void) { return PVOP_CALL0(unsigned long, pv_cpu_ops.store_tr); @@ -312,11 +304,9 @@ static inline void __flush_tlb_single(unsigned long addr) } static inline void flush_tlb_others(const struct cpumask *cpumask, - struct mm_struct *mm, - unsigned long start, - unsigned long end) + const struct flush_tlb_info *info) { - PVOP_VCALL4(pv_mmu_ops.flush_tlb_others, cpumask, mm, start, end); + PVOP_VCALL2(pv_mmu_ops.flush_tlb_others, cpumask, info); } static inline int paravirt_pgd_alloc(struct mm_struct *mm) @@ -367,12 +357,6 @@ static inline void paravirt_release_p4d(unsigned long pfn) PVOP_VCALL1(pv_mmu_ops.release_p4d, pfn); } -static inline void pte_update(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep); -} - static inline pte_t __pte(pteval_t val) { pteval_t ret; @@ -474,28 +458,6 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte); } -static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, pmd_t pmd) -{ - if (sizeof(pmdval_t) > sizeof(long)) - /* 5 arg words */ - pv_mmu_ops.set_pmd_at(mm, addr, pmdp, pmd); - else - PVOP_VCALL4(pv_mmu_ops.set_pmd_at, mm, addr, pmdp, - native_pmd_val(pmd)); -} - -static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, - pud_t *pudp, pud_t pud) -{ - if (sizeof(pudval_t) > sizeof(long)) - /* 5 arg words */ - pv_mmu_ops.set_pud_at(mm, addr, pudp, pud); - else - PVOP_VCALL4(pv_mmu_ops.set_pud_at, mm, addr, pudp, - native_pud_val(pud)); -} - static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) { pmdval_t val = native_pmd_val(pmd); @@ -962,11 +924,6 @@ extern void default_banner(void); #define GET_CR2_INTO_RAX \ call PARA_INDIRECT(pv_mmu_ops+PV_MMU_read_cr2) -#define PARAVIRT_ADJUST_EXCEPTION_FRAME \ - PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_adjust_exception_frame), \ - CLBR_NONE, \ - call PARA_INDIRECT(pv_irq_ops+PV_IRQ_adjust_exception_frame)) - #define USERGS_SYSRET64 \ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \ CLBR_NONE, \ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 7465d6fe336f..10cc3b9709fe 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PARAVIRT_TYPES_H #define _ASM_X86_PARAVIRT_TYPES_H @@ -51,6 +52,7 @@ struct mm_struct; struct desc_struct; struct task_struct; struct cpumask; +struct flush_tlb_info; /* * Wrapper type for pointers to code which uses the non-standard @@ -83,7 +85,7 @@ struct pv_init_ops { */ unsigned (*patch)(u8 type, u16 clobber, void *insnbuf, unsigned long addr, unsigned len); -}; +} 
__no_randomize_layout; struct pv_lazy_ops { @@ -91,12 +93,12 @@ struct pv_lazy_ops { void (*enter)(void); void (*leave)(void); void (*flush)(void); -}; +} __no_randomize_layout; struct pv_time_ops { unsigned long long (*sched_clock)(void); unsigned long long (*steal_clock)(int cpu); -}; +} __no_randomize_layout; struct pv_cpu_ops { /* hooks for various privileged instructions */ @@ -106,7 +108,6 @@ struct pv_cpu_ops { unsigned long (*read_cr0)(void); void (*write_cr0)(unsigned long); - unsigned long (*read_cr4)(void); void (*write_cr4)(unsigned long); #ifdef CONFIG_X86_64 @@ -118,8 +119,6 @@ struct pv_cpu_ops { void (*load_tr_desc)(void); void (*load_gdt)(const struct desc_ptr *); void (*load_idt)(const struct desc_ptr *); - /* store_gdt has been removed. */ - void (*store_idt)(struct desc_ptr *); void (*set_ldt)(const void *desc, unsigned entries); unsigned long (*store_tr)(void); void (*load_tls)(struct thread_struct *t, unsigned int cpu); @@ -175,7 +174,7 @@ struct pv_cpu_ops { void (*start_context_switch)(struct task_struct *prev); void (*end_context_switch)(struct task_struct *next); -}; +} __no_randomize_layout; struct pv_irq_ops { /* @@ -195,10 +194,7 @@ struct pv_irq_ops { void (*safe_halt)(void); void (*halt)(void); -#ifdef CONFIG_X86_64 - void (*adjust_exception_frame)(void); -#endif -}; +} __no_randomize_layout; struct pv_mmu_ops { unsigned long (*read_cr2)(void); @@ -223,9 +219,7 @@ struct pv_mmu_ops { void (*flush_tlb_kernel)(void); void (*flush_tlb_single)(unsigned long addr); void (*flush_tlb_others)(const struct cpumask *cpus, - struct mm_struct *mm, - unsigned long start, - unsigned long end); + const struct flush_tlb_info *info); /* Hooks for allocating and freeing a pagetable top-level */ int (*pgd_alloc)(struct mm_struct *mm); @@ -249,12 +243,6 @@ struct pv_mmu_ops { void (*set_pte_at)(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval); void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); - void (*set_pmd_at)(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, pmd_t pmdval); - void (*set_pud_at)(struct mm_struct *mm, unsigned long addr, - pud_t *pudp, pud_t pudval); - void (*pte_update)(struct mm_struct *mm, unsigned long addr, - pte_t *ptep); pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); @@ -306,7 +294,7 @@ struct pv_mmu_ops { an mfn. We can tell which is which from the index. 
*/ void (*set_fixmap)(unsigned /* enum fixed_addresses */ idx, phys_addr_t phys, pgprot_t flags); -}; +} __no_randomize_layout; struct arch_spinlock; #ifdef CONFIG_SMP @@ -323,7 +311,7 @@ struct pv_lock_ops { void (*kick)(int cpu); struct paravirt_callee_save vcpu_is_preempted; -}; +} __no_randomize_layout; /* This contains all the paravirt structures: we get a convenient * number for each function using the offset which we use to indicate @@ -335,7 +323,7 @@ struct paravirt_patch_template { struct pv_irq_ops pv_irq_ops; struct pv_mmu_ops pv_mmu_ops; struct pv_lock_ops pv_lock_ops; -}; +} __no_randomize_layout; extern struct pv_info pv_info; extern struct pv_init_ops pv_init_ops; @@ -472,8 +460,8 @@ int paravirt_disable_iospace(void); */ #ifdef CONFIG_X86_32 #define PVOP_VCALL_ARGS \ - unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx; \ - register void *__sp asm("esp") + unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx; + #define PVOP_CALL_ARGS PVOP_VCALL_ARGS #define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x)) @@ -493,8 +481,8 @@ int paravirt_disable_iospace(void); /* [re]ax isn't an arg, but the return val */ #define PVOP_VCALL_ARGS \ unsigned long __edi = __edi, __esi = __esi, \ - __edx = __edx, __ecx = __ecx, __eax = __eax; \ - register void *__sp asm("rsp") + __edx = __edx, __ecx = __ecx, __eax = __eax; + #define PVOP_CALL_ARGS PVOP_VCALL_ARGS #define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x)) @@ -545,7 +533,7 @@ int paravirt_disable_iospace(void); asm volatile(pre \ paravirt_alt(PARAVIRT_CALL) \ post \ - : call_clbr, "+r" (__sp) \ + : call_clbr, ASM_CALL_CONSTRAINT \ : paravirt_type(op), \ paravirt_clobber(clbr), \ ##__VA_ARGS__ \ @@ -555,7 +543,7 @@ int paravirt_disable_iospace(void); asm volatile(pre \ paravirt_alt(PARAVIRT_CALL) \ post \ - : call_clbr, "+r" (__sp) \ + : call_clbr, ASM_CALL_CONSTRAINT \ : paravirt_type(op), \ paravirt_clobber(clbr), \ ##__VA_ARGS__ \ @@ -582,7 +570,7 @@ int paravirt_disable_iospace(void); asm volatile(pre \ paravirt_alt(PARAVIRT_CALL) \ post \ - : call_clbr, "+r" (__sp) \ + : call_clbr, ASM_CALL_CONSTRAINT \ : paravirt_type(op), \ paravirt_clobber(clbr), \ ##__VA_ARGS__ \ diff --git a/arch/x86/include/asm/parport.h b/arch/x86/include/asm/parport.h index 0d2d3b29118f..163f78259a7e 100644 --- a/arch/x86/include/asm/parport.h +++ b/arch/x86/include/asm/parport.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PARPORT_H #define _ASM_X86_PARPORT_H diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index 0b1ff4c1c14e..8a3ee355b422 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PAT_H #define _ASM_X86_PAT_H @@ -7,6 +8,7 @@ bool pat_enabled(void); void pat_disable(const char *reason); extern void pat_init(void); +extern void init_cache_modes(void); extern int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_pcm, enum page_cache_mode *ret_pcm); diff --git a/arch/x86/include/asm/pci-direct.h b/arch/x86/include/asm/pci-direct.h index b1e7a45d868a..e1084f71a295 100644 --- a/arch/x86/include/asm/pci-direct.h +++ b/arch/x86/include/asm/pci-direct.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PCI_DIRECT_H #define _ASM_X86_PCI_DIRECT_H diff --git a/arch/x86/include/asm/pci-functions.h b/arch/x86/include/asm/pci-functions.h index ed0bab427354..1bbc10812ae1 100644 --- a/arch/x86/include/asm/pci-functions.h +++ b/arch/x86/include/asm/pci-functions.h @@ -1,3 +1,4 @@ +/* 
SPDX-License-Identifier: GPL-2.0 */ /* * PCI BIOS function numbering for conventional PCI BIOS * systems diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index f513cc231151..09c06b0fb964 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PCI_H #define _ASM_X86_PCI_H @@ -77,14 +78,8 @@ static inline bool is_vmd(struct pci_bus *bus) extern unsigned int pcibios_assign_all_busses(void); extern int pci_legacy_init(void); -# ifdef CONFIG_ACPI -# define x86_default_pci_init pci_acpi_init -# else -# define x86_default_pci_init pci_legacy_init -# endif #else -# define pcibios_assign_all_busses() 0 -# define x86_default_pci_init NULL +static inline int pcibios_assign_all_busses(void) { return 0; } #endif extern unsigned long pci_mem_start; diff --git a/arch/x86/include/asm/pci_64.h b/arch/x86/include/asm/pci_64.h index fe15cfb21b9b..f5411de0ae11 100644 --- a/arch/x86/include/asm/pci_64.h +++ b/arch/x86/include/asm/pci_64.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PCI_64_H #define _ASM_X86_PCI_64_H diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index 9f1b21f372fe..7a5d6695abd3 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Low-Level PCI Access for i386 machines. * diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 9fa03604b2b3..377f1ffd18be 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PERCPU_H #define _ASM_X86_PERCPU_H diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index f353061bba1d..12f54082f4c8 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PERF_EVENT_H #define _ASM_X86_PERF_EVENT_H diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h index d725382c2ae0..94de1a05aeba 100644 --- a/arch/x86/include/asm/perf_event_p4.h +++ b/arch/x86/include/asm/perf_event_p4.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Netburst Performance Events (P4, old Xeon) */ diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index b2d0cd8288aa..4b5e1eafada7 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PGALLOC_H #define _ASM_X86_PGALLOC_H diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index a8b96e708c2b..685ffe8a0eaf 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PGTABLE_2LEVEL_H #define _ASM_X86_PGTABLE_2LEVEL_H diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h index 373ab1de909f..f982ef808e7e 100644 --- a/arch/x86/include/asm/pgtable-2level_types.h +++ b/arch/x86/include/asm/pgtable-2level_types.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PGTABLE_2LEVEL_DEFS_H #define _ASM_X86_PGTABLE_2LEVEL_DEFS_H diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 50d35e3185f5..bc4af5453802 100644 --- 
a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PGTABLE_3LEVEL_H #define _ASM_X86_PGTABLE_3LEVEL_H @@ -212,4 +213,51 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp) #define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high }) #define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } }) +#define gup_get_pte gup_get_pte +/* + * WARNING: only to be used in the get_user_pages_fast() implementation. + * + * With get_user_pages_fast(), we walk down the pagetables without taking + * any locks. For this we would like to load the pointers atomically, + * but that is not possible (without expensive cmpxchg8b) on PAE. What + * we do have is the guarantee that a PTE will only either go from not + * present to present, or present to not present or both -- it will not + * switch to a completely different present page without a TLB flush in + * between; something that we are blocking by holding interrupts off. + * + * Setting ptes from not present to present goes: + * + * ptep->pte_high = h; + * smp_wmb(); + * ptep->pte_low = l; + * + * And present to not present goes: + * + * ptep->pte_low = 0; + * smp_wmb(); + * ptep->pte_high = 0; + * + * We must ensure here that the load of pte_low sees 'l' iff pte_high + * sees 'h'. We load pte_high *after* loading pte_low, which ensures we + * don't see an older value of pte_high. *Then* we recheck pte_low, + * which ensures that we haven't picked up a changed pte high. We might + * have gotten rubbish values from pte_low and pte_high, but we are + * guaranteed that pte_low will not have the present bit set *unless* + * it is 'l'. Because get_user_pages_fast() only operates on present ptes + * we're safe. 
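(Editor's aside, not part of the patch: a user-space sketch of the torn-read avoidance that the comment above describes and that the PAE gup_get_pte() just below implements — a 64-bit value published as two 32-bit halves is re-read until the low half is stable, so the reader never acts on a torn pair. C11 acquire loads stand in for the kernel's smp_rmb()/smp_wmb(); the type names are hypothetical.)

/* Illustrative only: torn-read avoidance for a split 64-bit value. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct split_pte {
        _Atomic uint32_t low;
        _Atomic uint32_t high;
};

/* Reader: re-load until 'low' is unchanged across the pair of loads,
 * mirroring the do/while loop in the PAE gup_get_pte(). */
static uint64_t read_split(struct split_pte *p)
{
        uint32_t lo, hi;

        do {
                lo = atomic_load_explicit(&p->low, memory_order_acquire);
                hi = atomic_load_explicit(&p->high, memory_order_acquire);
        } while (lo != atomic_load_explicit(&p->low, memory_order_acquire));

        return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
        struct split_pte p;

        atomic_init(&p.low, 0x5678u);
        atomic_init(&p.high, 0x1234u);
        printf("0x%llx\n", (unsigned long long)read_split(&p));
        return 0;
}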
+ */ +static inline pte_t gup_get_pte(pte_t *ptep) +{ + pte_t pte; + + do { + pte.pte_low = ptep->pte_low; + smp_rmb(); + pte.pte_high = ptep->pte_high; + smp_rmb(); + } while (unlikely(pte.pte_low != ptep->pte_low)); + + return pte; +} + #endif /* _ASM_X86_PGTABLE_3LEVEL_H */ diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h index b8a4341faafa..876b4c77d983 100644 --- a/arch/x86/include/asm/pgtable-3level_types.h +++ b/arch/x86/include/asm/pgtable-3level_types.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PGTABLE_3LEVEL_DEFS_H #define _ASM_X86_PGTABLE_3LEVEL_DEFS_H diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index f5af95a0c6b8..f735c3016325 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1,6 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PGTABLE_H #define _ASM_X86_PGTABLE_H +#include <linux/mem_encrypt.h> #include <asm/page.h> #include <asm/pgtable_types.h> @@ -13,9 +15,18 @@ cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS))) \ : (prot)) +/* + * Macros to add or remove encryption attribute + */ +#define pgprot_encrypted(prot) __pgprot(__sme_set(pgprot_val(prot))) +#define pgprot_decrypted(prot) __pgprot(__sme_clr(pgprot_val(prot))) + #ifndef __ASSEMBLY__ #include <asm/x86_init.h> +extern pgd_t early_top_pgt[PTRS_PER_PGD]; +int __init __early_make_pgtable(unsigned long address, pmdval_t pmd); + void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); void ptdump_walk_pgd_level_checkwx(void); @@ -38,13 +49,13 @@ extern struct list_head pgd_list; extern struct mm_struct *pgd_page_get_mm(struct page *page); +extern pmdval_t early_pmd_flags; + #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #else /* !CONFIG_PARAVIRT */ #define set_pte(ptep, pte) native_set_pte(ptep, pte) #define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte) -#define set_pmd_at(mm, addr, pmdp, pmd) native_set_pmd_at(mm, addr, pmdp, pmd) -#define set_pud_at(mm, addr, pudp, pud) native_set_pud_at(mm, addr, pudp, pud) #define set_pte_atomic(ptep, pte) \ native_set_pte_atomic(ptep, pte) @@ -75,8 +86,6 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page); #define pte_clear(mm, addr, ptep) native_pte_clear(mm, addr, ptep) #define pmd_clear(pmd) native_pmd_clear(pmd) -#define pte_update(mm, addr, ptep) do { } while (0) - #define pgd_val(x) native_pgd_val(x) #define __pgd(x) native_make_pgd(x) @@ -195,6 +204,11 @@ static inline unsigned long p4d_pfn(p4d_t p4d) return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT; } +static inline unsigned long pgd_pfn(pgd_t pgd) +{ + return (pgd_val(pgd) & PTE_PFN_MASK) >> PAGE_SHIFT; +} + static inline int p4d_large(p4d_t p4d) { /* No 512 GiB pages yet */ @@ -244,6 +258,11 @@ static inline int pud_devmap(pud_t pud) return 0; } #endif + +static inline int pgd_devmap(pgd_t pgd) +{ + return 0; +} #endif #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -699,8 +718,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pmd_page(pmd) \ - pfn_to_page((pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT) +#define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd)) /* * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] @@ -768,8 +786,7 @@ static inline unsigned long pud_page_vaddr(pud_t pud) * Currently stuck as a macro due to indirect forward reference to * 
linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pud_page(pud) \ - pfn_to_page((pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT) +#define pud_page(pud) pfn_to_page(pud_pfn(pud)) /* Find an entry in the second-level page table.. */ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) @@ -819,8 +836,7 @@ static inline unsigned long p4d_page_vaddr(p4d_t p4d) * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define p4d_page(p4d) \ - pfn_to_page((p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT) +#define p4d_page(p4d) pfn_to_page(p4d_pfn(p4d)) /* Find an entry in the third-level page table.. */ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) @@ -854,7 +870,7 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd) * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) +#define pgd_page(pgd) pfn_to_page(pgd_pfn(pgd)) /* to find an entry in a page-table-directory. */ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) @@ -917,7 +933,7 @@ extern pgd_t trampoline_pgd_entry; static inline void __meminit init_trampoline_default(void) { /* Default trampoline pgd value */ - trampoline_pgd_entry = init_level4_pgt[pgd_index(__PAGE_OFFSET)]; + trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)]; } # ifdef CONFIG_RANDOMIZE_MEMORY void __meminit init_trampoline(void); @@ -960,31 +976,18 @@ static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr, native_set_pte(ptep, pte); } -static inline void native_set_pmd_at(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp , pmd_t pmd) +static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd) { native_set_pmd(pmdp, pmd); } -static inline void native_set_pud_at(struct mm_struct *mm, unsigned long addr, - pud_t *pudp, pud_t pud) +static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud) { native_set_pud(pudp, pud); } -#ifndef CONFIG_PARAVIRT -/* - * Rules for using pte_update - it must be called after any PTE update which - * has not been done using the set_pte / clear_pte interfaces. It is used by - * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE - * updates should either be sets, clears, or set_pte_atomic for P->P - * transitions, which means this hook should only be called for user PTEs. - * This hook implies a P->P protection or access change has taken place, which - * requires a subsequent TLB flush. 
- */ -#define pte_update(mm, addr, ptep) do { } while (0) -#endif - /* * We only update the dirty/accessed state if we set * the dirty bit by hand in the kernel, since the hardware @@ -1012,7 +1015,6 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t pte = native_ptep_get_and_clear(ptep); - pte_update(mm, addr, ptep); return pte; } @@ -1039,7 +1041,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte); - pte_update(mm, addr, ptep); } #define flush_tlb_fix_spurious_fault(vma, address) do { } while (0) @@ -1153,6 +1154,23 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) { return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); } + +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_SWP_SOFT_DIRTY); +} + +static inline int pmd_swp_soft_dirty(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_SWP_SOFT_DIRTY; +} + +static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_SWP_SOFT_DIRTY); +} +#endif #endif #define PKRU_AD_BIT 0x1 @@ -1185,6 +1203,54 @@ static inline u16 pte_flags_pkey(unsigned long pte_flags) #endif } +static inline bool __pkru_allows_pkey(u16 pkey, bool write) +{ + u32 pkru = read_pkru(); + + if (!__pkru_allows_read(pkru, pkey)) + return false; + if (write && !__pkru_allows_write(pkru, pkey)) + return false; + + return true; +} + +/* + * 'pteval' can come from a PTE, PMD or PUD. We only check + * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the + * same value on all 3 types. + */ +static inline bool __pte_access_permitted(unsigned long pteval, bool write) +{ + unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER; + + if (write) + need_pte_bits |= _PAGE_RW; + + if ((pteval & need_pte_bits) != need_pte_bits) + return 0; + + return __pkru_allows_pkey(pte_flags_pkey(pteval), write); +} + +#define pte_access_permitted pte_access_permitted +static inline bool pte_access_permitted(pte_t pte, bool write) +{ + return __pte_access_permitted(pte_val(pte), write); +} + +#define pmd_access_permitted pmd_access_permitted +static inline bool pmd_access_permitted(pmd_t pmd, bool write) +{ + return __pte_access_permitted(pmd_val(pmd), write); +} + +#define pud_access_permitted pud_access_permitted +static inline bool pud_access_permitted(pud_t pud, bool write) +{ + return __pte_access_permitted(pud_val(pud), write); +} + #include <asm-generic/pgtable.h> #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index bfab55675c16..e67c0620aec2 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PGTABLE_32_H #define _ASM_X86_PGTABLE_32_H diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h index 9fb2f2bc8245..f2ca9b28fd68 100644 --- a/arch/x86/include/asm/pgtable_32_types.h +++ b/arch/x86/include/asm/pgtable_32_types.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PGTABLE_32_DEFS_H #define _ASM_X86_PGTABLE_32_DEFS_H diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 9991224f6238..e9f05331e732 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef 
_ASM_X86_PGTABLE_64_H #define _ASM_X86_PGTABLE_64_H @@ -14,15 +15,17 @@ #include <linux/bitops.h> #include <linux/threads.h> +extern p4d_t level4_kernel_pgt[512]; +extern p4d_t level4_ident_pgt[512]; extern pud_t level3_kernel_pgt[512]; extern pud_t level3_ident_pgt[512]; extern pmd_t level2_kernel_pgt[512]; extern pmd_t level2_fixmap_pgt[512]; extern pmd_t level2_ident_pgt[512]; extern pte_t level1_fixmap_pgt[512]; -extern pgd_t init_level4_pgt[]; +extern pgd_t init_top_pgt[]; -#define swapper_pg_dir init_level4_pgt +#define swapper_pg_dir init_top_pgt extern void paging_init(void); @@ -178,15 +181,21 @@ static inline int pgd_large(pgd_t pgd) { return 0; } /* * Encode and de-code a swap entry * - * | ... | 11| 10| 9|8|7|6|5| 4| 3|2|1|0| <- bit number - * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U|W|P| <- bit names - * | OFFSET (14->63) | TYPE (9-13) |0|X|X|X| X| X|X|X|0| <- swp entry + * | ... | 11| 10| 9|8|7|6|5| 4| 3|2| 1|0| <- bit number + * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names + * | OFFSET (14->63) | TYPE (9-13) |0|0|X|X| X| X|X|SD|0| <- swp entry * * G (8) is aliased and used as a PROT_NONE indicator for * !present ptes. We need to start storing swap entries above * there. We also need to avoid using A and D because of an * erratum where they can be incorrectly set by hardware on * non-present PTEs. + * + * SD (1) in swp entry is used to store soft dirty bit, which helps us + * remember soft dirty over page migration + * + * Bit 7 in swp entry should be 0 because pmd_present checks not only P, + * but also L and G. */ #define SWP_TYPE_FIRST_BIT (_PAGE_BIT_PROTNONE + 1) #define SWP_TYPE_BITS 5 @@ -202,7 +211,9 @@ static inline int pgd_large(pgd_t pgd) { return 0; } ((type) << (SWP_TYPE_FIRST_BIT)) \ | ((offset) << SWP_OFFSET_FIRST_BIT) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) }) +#define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val((pmd)) }) #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) +#define __swp_entry_to_pmd(x) ((pmd_t) { .pmd = (x).val }) extern int kern_addr_valid(unsigned long addr); extern void cleanup_highmap(void); @@ -227,6 +238,20 @@ extern void cleanup_highmap(void); extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); -#endif /* !__ASSEMBLY__ */ +#define gup_fast_permitted gup_fast_permitted +static inline bool gup_fast_permitted(unsigned long start, int nr_pages, + int write) +{ + unsigned long len, end; + + len = (unsigned long)nr_pages << PAGE_SHIFT; + end = start + len; + if (end < start) + return false; + if (end >> __VIRTUAL_MASK_SHIFT) + return false; + return true; +} +#endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_64_H */ diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 06470da156ba..6d5f45dcd4a1 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PGTABLE_64_DEFS_H #define _ASM_X86_PGTABLE_64_DEFS_H diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index bf9638e1ee42..59df7b47a434 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -1,7 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PGTABLE_DEFS_H #define _ASM_X86_PGTABLE_DEFS_H #include <linux/const.h> +#include <linux/mem_encrypt.h> + #include <asm/page_types.h> #define 
FIRST_USER_ADDRESS 0UL @@ -97,15 +100,15 @@ /* * Tracking soft dirty bit when a page goes to a swap is tricky. * We need a bit which can be stored in pte _and_ not conflict - * with swap entry format. On x86 bits 6 and 7 are *not* involved - * into swap entry computation, but bit 6 is used for nonlinear - * file mapping, so we borrow bit 7 for soft dirty tracking. + * with swap entry format. On x86 bits 1-4 are *not* involved + * into swap entry computation, but bit 7 is used for thp migration, + * so we borrow bit 1 for soft dirty tracking. * * Please note that this bit must be treated as swap dirty page - * mark if and only if the PTE has present bit clear! + * mark if and only if the PTE/PMD has present bit clear! */ #ifdef CONFIG_MEM_SOFT_DIRTY -#define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE +#define _PAGE_SWP_SOFT_DIRTY _PAGE_RW #else #define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0)) #endif @@ -121,10 +124,10 @@ #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) -#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ - _PAGE_ACCESSED | _PAGE_DIRTY) -#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ - _PAGE_DIRTY) +#define _PAGE_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |\ + _PAGE_ACCESSED | _PAGE_DIRTY) +#define _KERNPG_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | \ + _PAGE_ACCESSED | _PAGE_DIRTY) /* * Set of bits not changed in pte_modify. The pte's @@ -159,6 +162,7 @@ enum page_cache_mode { #define _PAGE_CACHE_MASK (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT) #define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC)) +#define _PAGE_CACHE_WP (cachemode2protval(_PAGE_CACHE_MODE_WP)) #define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) #define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ @@ -187,22 +191,42 @@ enum page_cache_mode { #define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER) #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) +#define __PAGE_KERNEL_WP (__PAGE_KERNEL | _PAGE_CACHE_WP) #define __PAGE_KERNEL_IO (__PAGE_KERNEL) #define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE) -#define PAGE_KERNEL __pgprot(__PAGE_KERNEL) -#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) -#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) -#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX) -#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) -#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) -#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) -#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) -#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR) +#ifndef __ASSEMBLY__ + +#define _PAGE_ENC (_AT(pteval_t, sme_me_mask)) + +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_ENC) +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ + _PAGE_DIRTY | _PAGE_ENC) + +#define __PAGE_KERNEL_ENC (__PAGE_KERNEL | _PAGE_ENC) +#define __PAGE_KERNEL_ENC_WP (__PAGE_KERNEL_WP | _PAGE_ENC) + +#define __PAGE_KERNEL_NOENC (__PAGE_KERNEL) +#define __PAGE_KERNEL_NOENC_WP (__PAGE_KERNEL_WP) + +#define PAGE_KERNEL __pgprot(__PAGE_KERNEL | _PAGE_ENC) +#define PAGE_KERNEL_NOENC __pgprot(__PAGE_KERNEL) +#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO | _PAGE_ENC) +#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC) +#define PAGE_KERNEL_EXEC_NOENC __pgprot(__PAGE_KERNEL_EXEC) +#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX | _PAGE_ENC) +#define PAGE_KERNEL_NOCACHE 
__pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC) +#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC) +#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC | _PAGE_ENC) +#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL | _PAGE_ENC) +#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR | _PAGE_ENC) + +#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) +#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) -#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) -#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) +#endif /* __ASSEMBLY__ */ /* xwr */ #define __P000 PAGE_NONE @@ -287,6 +311,11 @@ static inline p4dval_t native_p4d_val(p4d_t p4d) #else #include <asm-generic/pgtable-nop4d.h> +static inline p4d_t native_make_p4d(pudval_t val) +{ + return (p4d_t) { .pgd = native_make_pgd((pgdval_t)val) }; +} + static inline p4dval_t native_p4d_val(p4d_t p4d) { return native_pgd_val(p4d.pgd); diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index b3b09b98896d..a0ba1ffda0df 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PKEYS_H #define _ASM_X86_PKEYS_H diff --git a/arch/x86/include/asm/pm-trace.h b/arch/x86/include/asm/pm-trace.h index 7b7ac42c3661..bfa32aa428e5 100644 --- a/arch/x86/include/asm/pm-trace.h +++ b/arch/x86/include/asm/pm-trace.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PM_TRACE_H #define _ASM_X86_PM_TRACE_H diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h deleted file mode 100644 index 0ff8fe71b255..000000000000 --- a/arch/x86/include/asm/pmem.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright(c) 2015 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ -#ifndef __ASM_X86_PMEM_H__ -#define __ASM_X86_PMEM_H__ - -#include <linux/uaccess.h> -#include <asm/cacheflush.h> -#include <asm/cpufeature.h> -#include <asm/special_insns.h> - -#ifdef CONFIG_ARCH_HAS_PMEM_API -/** - * arch_memcpy_to_pmem - copy data to persistent memory - * @dst: destination buffer for the copy - * @src: source buffer for the copy - * @n: length of the copy in bytes - * - * Copy data to persistent memory media via non-temporal stores so that - * a subsequent pmem driver flush operation will drain posted write queues. - */ -static inline void arch_memcpy_to_pmem(void *dst, const void *src, size_t n) -{ - int rem; - - /* - * We are copying between two kernel buffers, if - * __copy_from_user_inatomic_nocache() returns an error (page - * fault) we would have already reported a general protection fault - * before the WARN+BUG. - */ - rem = __copy_from_user_inatomic_nocache(dst, (void __user *) src, n); - if (WARN(rem, "%s: fault copying %p <- %p unwritten: %d\n", - __func__, dst, src, rem)) - BUG(); -} - -/** - * arch_wb_cache_pmem - write back a cache range with CLWB - * @vaddr: virtual start address - * @size: number of bytes to write back - * - * Write back a cache range using the CLWB (cache line write back) - * instruction. 
Note that @size is internally rounded up to be cache - * line size aligned. - */ -static inline void arch_wb_cache_pmem(void *addr, size_t size) -{ - u16 x86_clflush_size = boot_cpu_data.x86_clflush_size; - unsigned long clflush_mask = x86_clflush_size - 1; - void *vend = addr + size; - void *p; - - for (p = (void *)((unsigned long)addr & ~clflush_mask); - p < vend; p += x86_clflush_size) - clwb(p); -} - -/** - * arch_copy_from_iter_pmem - copy data from an iterator to PMEM - * @addr: PMEM destination address - * @bytes: number of bytes to copy - * @i: iterator with source data - * - * Copy data from the iterator 'i' to the PMEM buffer starting at 'addr'. - */ -static inline size_t arch_copy_from_iter_pmem(void *addr, size_t bytes, - struct iov_iter *i) -{ - size_t len; - - /* TODO: skip the write-back by always using non-temporal stores */ - len = copy_from_iter_nocache(addr, bytes, i); - - /* - * In the iovec case on x86_64 copy_from_iter_nocache() uses - * non-temporal stores for the bulk of the transfer, but we need - * to manually flush if the transfer is unaligned. A cached - * memory copy is used when destination or size is not naturally - * aligned. That is: - * - Require 8-byte alignment when size is 8 bytes or larger. - * - Require 4-byte alignment when size is 4 bytes. - * - * In the non-iovec case the entire destination needs to be - * flushed. - */ - if (iter_is_iovec(i)) { - unsigned long flushed, dest = (unsigned long) addr; - - if (bytes < 8) { - if (!IS_ALIGNED(dest, 4) || (bytes != 4)) - arch_wb_cache_pmem(addr, bytes); - } else { - if (!IS_ALIGNED(dest, 8)) { - dest = ALIGN(dest, boot_cpu_data.x86_clflush_size); - arch_wb_cache_pmem(addr, 1); - } - - flushed = dest - (unsigned long) addr; - if (bytes > flushed && !IS_ALIGNED(bytes - flushed, 8)) - arch_wb_cache_pmem(addr + bytes - 1, 1); - } - } else - arch_wb_cache_pmem(addr, bytes); - - return len; -} - -/** - * arch_clear_pmem - zero a PMEM memory range - * @addr: virtual start address - * @size: number of bytes to zero - * - * Write zeros into the memory range starting at 'addr' for 'size' bytes. 
- */ -static inline void arch_clear_pmem(void *addr, size_t size) -{ - memset(addr, 0, size); - arch_wb_cache_pmem(addr, size); -} - -static inline void arch_invalidate_pmem(void *addr, size_t size) -{ - clflush_cache_range(addr, size); -} -#endif /* CONFIG_ARCH_HAS_PMEM_API */ -#endif /* __ASM_X86_PMEM_H__ */ diff --git a/arch/x86/include/asm/posix_types.h b/arch/x86/include/asm/posix_types.h index f565f6dd59d4..374336e217ff 100644 --- a/arch/x86/include/asm/posix_types.h +++ b/arch/x86/include/asm/posix_types.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ # ifdef CONFIG_X86_32 # include <asm/posix_types_32.h> # else diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index ec1f3c651150..7f2dbd91fc74 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ASM_PREEMPT_H #define __ASM_PREEMPT_H @@ -100,19 +101,14 @@ static __always_inline bool should_resched(int preempt_offset) #ifdef CONFIG_PREEMPT extern asmlinkage void ___preempt_schedule(void); -# define __preempt_schedule() \ -({ \ - register void *__sp asm(_ASM_SP); \ - asm volatile ("call ___preempt_schedule" : "+r"(__sp)); \ -}) +# define __preempt_schedule() \ + asm volatile ("call ___preempt_schedule" : ASM_CALL_CONSTRAINT) extern asmlinkage void preempt_schedule(void); extern asmlinkage void ___preempt_schedule_notrace(void); -# define __preempt_schedule_notrace() \ -({ \ - register void *__sp asm(_ASM_SP); \ - asm volatile ("call ___preempt_schedule_notrace" : "+r"(__sp)); \ -}) +# define __preempt_schedule_notrace() \ + asm volatile ("call ___preempt_schedule_notrace" : ASM_CALL_CONSTRAINT) + extern asmlinkage void preempt_schedule_notrace(void); #endif diff --git a/arch/x86/include/asm/probe_roms.h b/arch/x86/include/asm/probe_roms.h index 4950a0b1d09c..1c7f3815bbd6 100644 --- a/arch/x86/include/asm/probe_roms.h +++ b/arch/x86/include/asm/probe_roms.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _PROBE_ROMS_H_ #define _PROBE_ROMS_H_ struct pci_dev; diff --git a/arch/x86/include/asm/processor-cyrix.h b/arch/x86/include/asm/processor-cyrix.h index 1198f2a0e42c..aaedd73ea2c6 100644 --- a/arch/x86/include/asm/processor-cyrix.h +++ b/arch/x86/include/asm/processor-cyrix.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * NSC/Cyrix CPU indexed register access. Must be inlined instead of * macros to ensure correct access ordering diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index 39fb618e2211..43212a43ee69 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -1,11 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PROCESSOR_FLAGS_H #define _ASM_X86_PROCESSOR_FLAGS_H #include <uapi/asm/processor-flags.h> +#include <linux/mem_encrypt.h> #ifdef CONFIG_VM86 #define X86_VM_MASK X86_EFLAGS_VM #else #define X86_VM_MASK 0 /* No VM86 support */ #endif + +/* + * CR3's layout varies depending on several things. + * + * If CR4.PCIDE is set (64-bit only), then CR3[11:0] is the address space ID. + * If PAE is enabled, then CR3[11:5] is part of the PDPT address + * (i.e. it's 32-byte aligned, not page-aligned) and CR3[4:0] is ignored. + * Otherwise (non-PAE, non-PCID), CR3[3] is PWT, CR3[4] is PCD, and + * CR3[2:0] and CR3[11:5] are ignored. + * + * In all cases, Linux puts zeros in the low ignored bits and in PWT and PCD. + * + * CR3[63] is always read as zero. 
If CR4.PCIDE is set, then CR3[63] may be + * written as 1 to prevent the write to CR3 from flushing the TLB. + * + * On systems with SME, one bit (in a variable position!) is stolen to indicate + * that the top-level paging structure is encrypted. + * + * All of the remaining bits indicate the physical address of the top-level + * paging structure. + * + * CR3_ADDR_MASK is the mask used by read_cr3_pa(). + */ +#ifdef CONFIG_X86_64 +/* Mask off the address space ID and SME encryption bits. */ +#define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull) +#define CR3_PCID_MASK 0xFFFull +#define CR3_NOFLUSH BIT_ULL(63) +#else +/* + * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save + * a tiny bit of code size by setting all the bits. + */ +#define CR3_ADDR_MASK 0xFFFFFFFFull +#define CR3_PCID_MASK 0ull +#define CR3_NOFLUSH 0 +#endif + #endif /* _ASM_X86_PROCESSOR_FLAGS_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index a28b671f1549..bdac19ab2488 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PROCESSOR_H #define _ASM_X86_PROCESSOR_H @@ -22,6 +23,7 @@ struct vm86; #include <asm/nops.h> #include <asm/special_insns.h> #include <asm/fpu/types.h> +#include <asm/unwind_hints.h> #include <linux/personality.h> #include <linux/cache.h> @@ -29,6 +31,7 @@ struct vm86; #include <linux/math64.h> #include <linux/err.h> #include <linux/irqflags.h> +#include <linux/mem_encrypt.h> /* * We handle most unaligned accesses in hardware. On the other hand @@ -129,7 +132,7 @@ struct cpuinfo_x86 { /* Index into per_cpu list: */ u16 cpu_index; u32 microcode; -}; +} __randomize_layout; struct cpuid_regs { u32 eax, ebx, ecx, edx; @@ -231,9 +234,22 @@ native_cpuid_reg(ebx) native_cpuid_reg(ecx) native_cpuid_reg(edx) +/* + * Friendlier CR3 helpers. + */ +static inline unsigned long read_cr3_pa(void) +{ + return __read_cr3() & CR3_ADDR_MASK; +} + +static inline unsigned long native_read_cr3_pa(void) +{ + return __native_read_cr3() & CR3_ADDR_MASK; +} + static inline void load_cr3(pgd_t *pgdir) { - write_cr3(__pa(pgdir)); + write_cr3(__sme_pa(pgdir)); } #ifdef CONFIG_X86_32 @@ -653,7 +669,7 @@ static inline void sync_core(void) * In case NMI unmasking or performance ever becomes a problem, * the next best option appears to be MOV-to-CR2 and an * unconditional jump. That sequence also works on all CPUs, - * but it will fault at CPL3 (i.e. Xen PV and lguest). + * but it will fault at CPL3 (i.e. Xen PV). * * CPUID is the conventional way, but it's nasty: it doesn't * exist on some 486-like CPUs, and it usually exits to a @@ -662,8 +678,6 @@ static inline void sync_core(void) * Like all of Linux's memory ordering operations, this is a * compiler barrier as well. 
*/ - register void *__sp asm(_ASM_SP); - #ifdef CONFIG_X86_32 asm volatile ( "pushfl\n\t" @@ -671,11 +685,12 @@ static inline void sync_core(void) "pushl $1f\n\t" "iret\n\t" "1:" - : "+r" (__sp) : : "memory"); + : ASM_CALL_CONSTRAINT : : "memory"); #else unsigned int tmp; asm volatile ( + UNWIND_HINT_SAVE "mov %%ss, %0\n\t" "pushq %q0\n\t" "pushq %%rsp\n\t" @@ -685,8 +700,9 @@ static inline void sync_core(void) "pushq %q0\n\t" "pushq $1f\n\t" "iretq\n\t" + UNWIND_HINT_RESTORE "1:" - : "=&r" (tmp), "+r" (__sp) : : "cc", "memory"); + : "=&r" (tmp), ASM_CALL_CONSTRAINT : : "cc", "memory"); #endif } @@ -794,7 +810,9 @@ static inline void spin_lock_prefetch(const void *x) */ #define IA32_PAGE_OFFSET PAGE_OFFSET #define TASK_SIZE PAGE_OFFSET +#define TASK_SIZE_LOW TASK_SIZE #define TASK_SIZE_MAX TASK_SIZE +#define DEFAULT_MAP_WINDOW TASK_SIZE #define STACK_TOP TASK_SIZE #define STACK_TOP_MAX STACK_TOP @@ -834,7 +852,9 @@ static inline void spin_lock_prefetch(const void *x) * particular problem by preventing anything from being mapped * at the maximum canonical address. */ -#define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE) +#define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) + +#define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE) /* This decides where the kernel will search for a free chunk of vm * space during mmap's. @@ -842,12 +862,14 @@ static inline void spin_lock_prefetch(const void *x) #define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \ 0xc0000000 : 0xFFFFe000) +#define TASK_SIZE_LOW (test_thread_flag(TIF_ADDR32) ? \ + IA32_PAGE_OFFSET : DEFAULT_MAP_WINDOW) #define TASK_SIZE (test_thread_flag(TIF_ADDR32) ? \ IA32_PAGE_OFFSET : TASK_SIZE_MAX) #define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_ADDR32)) ? \ IA32_PAGE_OFFSET : TASK_SIZE_MAX) -#define STACK_TOP TASK_SIZE +#define STACK_TOP TASK_SIZE_LOW #define STACK_TOP_MAX TASK_SIZE_MAX #define INIT_THREAD { \ @@ -868,7 +890,7 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, * space during mmap's. 
*/ #define __TASK_UNMAPPED_BASE(task_size) (PAGE_ALIGN(task_size / 3)) -#define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE) +#define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE_LOW) #define KSTK_EIP(task) (task_pt_regs(task)->ip) @@ -899,8 +921,13 @@ static inline int mpx_disable_management(void) } #endif /* CONFIG_X86_INTEL_MPX */ +#ifdef CONFIG_CPU_SUP_AMD extern u16 amd_get_nb_id(int cpu); extern u32 amd_get_nodes_per_socket(void); +#else +static inline u16 amd_get_nb_id(int cpu) { return 0; } +static inline u32 amd_get_nodes_per_socket(void) { return 0; } +#endif static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) { diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 8d3964fc5f91..6e81788a30c1 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PROTO_H #define _ASM_X86_PROTO_H @@ -24,6 +25,9 @@ void entry_SYSENTER_compat(void); void __end_entry_SYSENTER_compat(void); void entry_SYSCALL_compat(void); void entry_INT80_compat(void); +#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) +void xen_entry_INT80_compat(void); +#endif #endif void x86_configure_nx(void); diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 2b5d686ea9f3..c0e3c45cf6ab 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PTRACE_H #define _ASM_X86_PTRACE_H @@ -9,6 +10,20 @@ #ifdef __i386__ struct pt_regs { + /* + * NB: 32-bit x86 CPUs are inconsistent as what happens in the + * following cases (where %seg represents a segment register): + * + * - pushl %seg: some do a 16-bit write and leave the high + * bits alone + * - movl %seg, [mem]: some do a 16-bit write despite the movl + * - IDT entry: some (e.g. 486) will leave the high bits of CS + * and (if applicable) SS undefined. + * + * Fortunately, x86-32 doesn't read the high bits on POP or IRET, + * so we can just treat all of the segment registers as 16-bit + * values. + */ unsigned long bx; unsigned long cx; unsigned long dx; @@ -16,16 +31,22 @@ struct pt_regs { unsigned long di; unsigned long bp; unsigned long ax; - unsigned long ds; - unsigned long es; - unsigned long fs; - unsigned long gs; + unsigned short ds; + unsigned short __dsh; + unsigned short es; + unsigned short __esh; + unsigned short fs; + unsigned short __fsh; + unsigned short gs; + unsigned short __gsh; unsigned long orig_ax; unsigned long ip; - unsigned long cs; + unsigned short cs; + unsigned short __csh; unsigned long flags; unsigned long sp; - unsigned long ss; + unsigned short ss; + unsigned short __ssh; }; #else /* __i386__ */ @@ -176,6 +197,17 @@ static inline unsigned long regs_get_register(struct pt_regs *regs, if (offset == offsetof(struct pt_regs, sp) && regs->cs == __KERNEL_CS) return kernel_stack_pointer(regs); + + /* The selector fields are 16-bit. 
*/ + if (offset == offsetof(struct pt_regs, cs) || + offset == offsetof(struct pt_regs, ss) || + offset == offsetof(struct pt_regs, ds) || + offset == offsetof(struct pt_regs, es) || + offset == offsetof(struct pt_regs, fs) || + offset == offsetof(struct pt_regs, gs)) { + return *(u16 *)((unsigned long)regs + offset); + + } #endif return *(unsigned long *)((unsigned long)regs + offset); } diff --git a/arch/x86/include/asm/purgatory.h b/arch/x86/include/asm/purgatory.h index d7da2729903d..92c34e517da1 100644 --- a/arch/x86/include/asm/purgatory.h +++ b/arch/x86/include/asm/purgatory.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PURGATORY_H #define _ASM_X86_PURGATORY_H diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h index 67f08230103a..1436226efe3e 100644 --- a/arch/x86/include/asm/pvclock-abi.h +++ b/arch/x86/include/asm/pvclock-abi.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PVCLOCK_ABI_H #define _ASM_X86_PVCLOCK_ABI_H #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 448cfe1b48cf..3e4ed8fb5f91 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PVCLOCK_H #define _ASM_X86_PVCLOCK_H diff --git a/arch/x86/include/asm/qrwlock.h b/arch/x86/include/asm/qrwlock.h index c537cbb038a7..8656b5a6e8e7 100644 --- a/arch/x86/include/asm/qrwlock.h +++ b/arch/x86/include/asm/qrwlock.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_QRWLOCK_H #define _ASM_X86_QRWLOCK_H diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index 48a706f641f2..9982dd96f093 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_QSPINLOCK_H #define _ASM_X86_QSPINLOCK_H diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h index 9d55f9b6e167..923307ea11c7 100644 --- a/arch/x86/include/asm/qspinlock_paravirt.h +++ b/arch/x86/include/asm/qspinlock_paravirt.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ASM_QSPINLOCK_PARAVIRT_H #define __ASM_QSPINLOCK_PARAVIRT_H diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index 230e1903acf0..63b3393bd98e 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -1,6 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ARCH_X86_REALMODE_H #define _ARCH_X86_REALMODE_H +/* + * Flag bit definitions for use with the flags field of the trampoline header + * in the CONFIG_X86_64 variant. 
+ */ +#define TH_FLAGS_SME_ACTIVE_BIT 0 +#define TH_FLAGS_SME_ACTIVE BIT(TH_FLAGS_SME_ACTIVE_BIT) + +#ifndef __ASSEMBLY__ + #include <linux/types.h> #include <asm/io.h> @@ -38,6 +48,7 @@ struct trampoline_header { u64 start; u64 efer; u32 cr4; + u32 flags; #endif }; @@ -69,4 +80,6 @@ static inline size_t real_mode_size_needed(void) void set_real_mode_mem(phys_addr_t mem, size_t size); void reserve_real_mode(void); +#endif /* __ASSEMBLY__ */ + #endif /* _ARCH_X86_REALMODE_H */ diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index fc62ba8dce93..a671a1145906 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_REBOOT_H #define _ASM_X86_REBOOT_H diff --git a/arch/x86/include/asm/reboot_fixups.h b/arch/x86/include/asm/reboot_fixups.h index 765debe4c54c..96515658ce12 100644 --- a/arch/x86/include/asm/reboot_fixups.h +++ b/arch/x86/include/asm/reboot_fixups.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_REBOOT_FIXUPS_H #define _ASM_X86_REBOOT_FIXUPS_H diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h new file mode 100644 index 000000000000..ff871210b9f2 --- /dev/null +++ b/arch/x86/include/asm/refcount.h @@ -0,0 +1,109 @@ +#ifndef __ASM_X86_REFCOUNT_H +#define __ASM_X86_REFCOUNT_H +/* + * x86-specific implementation of refcount_t. Based on PAX_REFCOUNT from + * PaX/grsecurity. + */ +#include <linux/refcount.h> + +/* + * This is the first portion of the refcount error handling, which lives in + * .text.unlikely, and is jumped to from the CPU flag check (in the + * following macros). This saves the refcount value location into CX for + * the exception handler to use (in mm/extable.c), and then triggers the + * central refcount exception. The fixup address for the exception points + * back to the regular execution flow in .text. + */ +#define _REFCOUNT_EXCEPTION \ + ".pushsection .text.unlikely\n" \ + "111:\tlea %[counter], %%" _ASM_CX "\n" \ + "112:\t" ASM_UD0 "\n" \ + ASM_UNREACHABLE \ + ".popsection\n" \ + "113:\n" \ + _ASM_EXTABLE_REFCOUNT(112b, 113b) + +/* Trigger refcount exception if refcount result is negative. */ +#define REFCOUNT_CHECK_LT_ZERO \ + "js 111f\n\t" \ + _REFCOUNT_EXCEPTION + +/* Trigger refcount exception if refcount result is zero or negative. */ +#define REFCOUNT_CHECK_LE_ZERO \ + "jz 111f\n\t" \ + REFCOUNT_CHECK_LT_ZERO + +/* Trigger refcount exception unconditionally. 
*/ +#define REFCOUNT_ERROR \ + "jmp 111f\n\t" \ + _REFCOUNT_EXCEPTION + +static __always_inline void refcount_add(unsigned int i, refcount_t *r) +{ + asm volatile(LOCK_PREFIX "addl %1,%0\n\t" + REFCOUNT_CHECK_LT_ZERO + : [counter] "+m" (r->refs.counter) + : "ir" (i) + : "cc", "cx"); +} + +static __always_inline void refcount_inc(refcount_t *r) +{ + asm volatile(LOCK_PREFIX "incl %0\n\t" + REFCOUNT_CHECK_LT_ZERO + : [counter] "+m" (r->refs.counter) + : : "cc", "cx"); +} + +static __always_inline void refcount_dec(refcount_t *r) +{ + asm volatile(LOCK_PREFIX "decl %0\n\t" + REFCOUNT_CHECK_LE_ZERO + : [counter] "+m" (r->refs.counter) + : : "cc", "cx"); +} + +static __always_inline __must_check +bool refcount_sub_and_test(unsigned int i, refcount_t *r) +{ + GEN_BINARY_SUFFIXED_RMWcc(LOCK_PREFIX "subl", REFCOUNT_CHECK_LT_ZERO, + r->refs.counter, "er", i, "%0", e); +} + +static __always_inline __must_check bool refcount_dec_and_test(refcount_t *r) +{ + GEN_UNARY_SUFFIXED_RMWcc(LOCK_PREFIX "decl", REFCOUNT_CHECK_LT_ZERO, + r->refs.counter, "%0", e); +} + +static __always_inline __must_check +bool refcount_add_not_zero(unsigned int i, refcount_t *r) +{ + int c, result; + + c = atomic_read(&(r->refs)); + do { + if (unlikely(c == 0)) + return false; + + result = c + i; + + /* Did we try to increment from/to an undesirable state? */ + if (unlikely(c < 0 || c == INT_MAX || result < c)) { + asm volatile(REFCOUNT_ERROR + : : [counter] "m" (r->refs.counter) + : "cc", "cx"); + break; + } + + } while (!atomic_try_cmpxchg(&(r->refs), &c, result)); + + return c != 0; +} + +static __always_inline __must_check bool refcount_inc_not_zero(refcount_t *r) +{ + return refcount_add_not_zero(1, r); +} + +#endif diff --git a/arch/x86/include/asm/rio.h b/arch/x86/include/asm/rio.h index 97bab6388a92..0a21986d2238 100644 --- a/arch/x86/include/asm/rio.h +++ b/arch/x86/include/asm/rio.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Derived from include/asm-x86/mach-summit/mach_mpparse.h * and include/asm-x86/mach-default/bios_ebda.h diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h index 661dd305694a..d8f3a6ae9f6c 100644 --- a/arch/x86/include/asm/rmwcc.h +++ b/arch/x86/include/asm/rmwcc.h @@ -1,45 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_RMWcc #define _ASM_X86_RMWcc +#define __CLOBBERS_MEM "memory" +#define __CLOBBERS_MEM_CC_CX "memory", "cc", "cx" + #if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CC_HAVE_ASM_GOTO) /* Use asm goto */ -#define __GEN_RMWcc(fullop, var, cc, ...) \ +#define __GEN_RMWcc(fullop, var, cc, clobbers, ...) \ do { \ asm_volatile_goto (fullop "; j" #cc " %l[cc_label]" \ - : : "m" (var), ## __VA_ARGS__ \ - : "memory" : cc_label); \ + : : [counter] "m" (var), ## __VA_ARGS__ \ + : clobbers : cc_label); \ return 0; \ cc_label: \ return 1; \ } while (0) -#define GEN_UNARY_RMWcc(op, var, arg0, cc) \ - __GEN_RMWcc(op " " arg0, var, cc) +#define __BINARY_RMWcc_ARG " %1, " -#define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \ - __GEN_RMWcc(op " %1, " arg0, var, cc, vcon (val)) #else /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ /* Use flags output or a set instruction */ -#define __GEN_RMWcc(fullop, var, cc, ...) \ +#define __GEN_RMWcc(fullop, var, cc, clobbers, ...) 
\ do { \ bool c; \ asm volatile (fullop ";" CC_SET(cc) \ - : "+m" (var), CC_OUT(cc) (c) \ - : __VA_ARGS__ : "memory"); \ + : [counter] "+m" (var), CC_OUT(cc) (c) \ + : __VA_ARGS__ : clobbers); \ return c; \ } while (0) +#define __BINARY_RMWcc_ARG " %2, " + +#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ + #define GEN_UNARY_RMWcc(op, var, arg0, cc) \ - __GEN_RMWcc(op " " arg0, var, cc) + __GEN_RMWcc(op " " arg0, var, cc, __CLOBBERS_MEM) + +#define GEN_UNARY_SUFFIXED_RMWcc(op, suffix, var, arg0, cc) \ + __GEN_RMWcc(op " " arg0 "\n\t" suffix, var, cc, \ + __CLOBBERS_MEM_CC_CX) #define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \ - __GEN_RMWcc(op " %2, " arg0, var, cc, vcon (val)) + __GEN_RMWcc(op __BINARY_RMWcc_ARG arg0, var, cc, \ + __CLOBBERS_MEM, vcon (val)) -#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ +#define GEN_BINARY_SUFFIXED_RMWcc(op, suffix, var, vcon, val, arg0, cc) \ + __GEN_RMWcc(op __BINARY_RMWcc_ARG arg0 "\n\t" suffix, var, cc, \ + __CLOBBERS_MEM_CC_CX, vcon (val)) #endif /* _ASM_X86_RMWcc */ diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index a34e0d4b957d..4d38d85a16ad 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* rwsem.h: R/W semaphores implemented using XADD/CMPXCHG for i486+ * * Written by David Howells (dhowells@redhat.com). @@ -103,7 +104,6 @@ static inline bool __down_read_trylock(struct rw_semaphore *sem) ({ \ long tmp; \ struct rw_semaphore* ret; \ - register void *__sp asm(_ASM_SP); \ \ asm volatile("# beginning down_write\n\t" \ LOCK_PREFIX " xadd %1,(%4)\n\t" \ @@ -114,7 +114,8 @@ static inline bool __down_read_trylock(struct rw_semaphore *sem) " call " slow_path "\n" \ "1:\n" \ "# ending down_write" \ - : "+m" (sem->count), "=d" (tmp), "=a" (ret), "+r" (__sp) \ + : "+m" (sem->count), "=d" (tmp), \ + "=a" (ret), ASM_CALL_CONSTRAINT \ : "a" (sem), "1" (RWSEM_ACTIVE_WRITE_BIAS) \ : "memory", "cc"); \ ret; \ diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h index 0c8c7c8861b4..2bd1338de236 100644 --- a/arch/x86/include/asm/seccomp.h +++ b/arch/x86/include/asm/seccomp.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_SECCOMP_H #define _ASM_X86_SECCOMP_H diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h index 2f75f30cb2f6..d6baf23782bc 100644 --- a/arch/x86/include/asm/sections.h +++ b/arch/x86/include/asm/sections.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_SECTIONS_H #define _ASM_X86_SECTIONS_H diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 1549caa098f0..b20f9d623f9c 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_SEGMENT_H #define _ASM_X86_SEGMENT_H @@ -238,9 +239,7 @@ #ifndef __ASSEMBLY__ extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE]; -#ifdef CONFIG_TRACING -# define trace_early_idt_handler_array early_idt_handler_array -#endif +extern void early_ignore_irq(void); /* * Load a segment. 
Fall back on loading the zero segment if something goes diff --git a/arch/x86/include/asm/serial.h b/arch/x86/include/asm/serial.h index bb658211edad..ece8299d2695 100644 --- a/arch/x86/include/asm/serial.h +++ b/arch/x86/include/asm/serial.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_SERIAL_H #define _ASM_X86_SERIAL_H diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index eaec6c364e42..bd090367236c 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_SET_MEMORY_H #define _ASM_X86_SET_MEMORY_H @@ -11,6 +12,7 @@ * Executability : eXeutable, NoteXecutable * Read/Write : ReadOnly, ReadWrite * Presence : NotPresent + * Encryption : Encrypted, Decrypted * * Within a category, the attributes are mutually exclusive. * @@ -42,6 +44,8 @@ int set_memory_wt(unsigned long addr, int numpages); int set_memory_wb(unsigned long addr, int numpages); int set_memory_np(unsigned long addr, int numpages); int set_memory_4k(unsigned long addr, int numpages); +int set_memory_encrypted(unsigned long addr, int numpages); +int set_memory_decrypted(unsigned long addr, int numpages); int set_memory_array_uc(unsigned long *addr, int addrinarray); int set_memory_array_wc(unsigned long *addr, int addrinarray); diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index ac1d5da14734..ae13bc974416 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_SETUP_H #define _ASM_X86_SETUP_H @@ -39,12 +40,12 @@ static inline void vsmp_init(void) { } #endif void setup_bios_corruption_check(void); +void early_platform_quirks(void); extern unsigned long saved_video_mode; extern void reserve_standard_io_resources(void); extern void i386_reserve_resources(void); -extern void setup_default_timer_irq(void); #ifdef CONFIG_X86_INTEL_MID extern void x86_intel_mid_early_setup(void); diff --git a/arch/x86/include/asm/shmparam.h b/arch/x86/include/asm/shmparam.h index 0880cf0917b9..c4041819c3e5 100644 --- a/arch/x86/include/asm/shmparam.h +++ b/arch/x86/include/asm/shmparam.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_SHMPARAM_H #define _ASM_X86_SHMPARAM_H diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h index e6cd2c489dbb..140d890c2c98 100644 --- a/arch/x86/include/asm/sigcontext.h +++ b/arch/x86/include/asm/sigcontext.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_SIGCONTEXT_H #define _ASM_X86_SIGCONTEXT_H diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h index 34edd1650bae..f176114c04d4 100644 --- a/arch/x86/include/asm/sigframe.h +++ b/arch/x86/include/asm/sigframe.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_SIGFRAME_H #define _ASM_X86_SIGFRAME_H diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h index 452c88b8ad06..bd26834724e5 100644 --- a/arch/x86/include/asm/sighandling.h +++ b/arch/x86/include/asm/sighandling.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_SIGHANDLING_H #define _ASM_X86_SIGHANDLING_H diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index 8af22be0fe61..5f9012ff52ed 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ 
#ifndef _ASM_X86_SIGNAL_H #define _ASM_X86_SIGNAL_H diff --git a/arch/x86/include/asm/simd.h b/arch/x86/include/asm/simd.h index 6c8a7ed13365..a341c878e977 100644 --- a/arch/x86/include/asm/simd.h +++ b/arch/x86/include/asm/simd.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <asm/fpu/api.h> diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 47103eca3775..461f53d27708 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_SMP_H #define _ASM_X86_SMP_H #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h index 1f5bee2c202f..4fc1e9d3c43e 100644 --- a/arch/x86/include/asm/sparsemem.h +++ b/arch/x86/include/asm/sparsemem.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_SPARSEMEM_H #define _ASM_X86_SPARSEMEM_H diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 12af3e35edfa..317fc59b512c 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_SPECIAL_INSNS_H #define _ASM_X86_SPECIAL_INSNS_H @@ -39,7 +40,7 @@ static inline void native_write_cr2(unsigned long val) asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order)); } -static inline unsigned long native_read_cr3(void) +static inline unsigned long __native_read_cr3(void) { unsigned long val; asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order)); @@ -135,6 +136,11 @@ static inline void native_wbinvd(void) extern asmlinkage void native_load_gs_index(unsigned); +static inline unsigned long __read_cr4(void) +{ + return native_read_cr4(); +} + #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #else @@ -159,9 +165,13 @@ static inline void write_cr2(unsigned long x) native_write_cr2(x); } -static inline unsigned long read_cr3(void) +/* + * Careful! CR3 contains more than just an address. You probably want + * read_cr3_pa() instead. 
+ */ +static inline unsigned long __read_cr3(void) { - return native_read_cr3(); + return __native_read_cr3(); } static inline void write_cr3(unsigned long x) @@ -169,11 +179,6 @@ static inline void write_cr3(unsigned long x) native_write_cr3(x); } -static inline unsigned long __read_cr4(void) -{ - return native_read_cr4(); -} - static inline void __write_cr4(unsigned long x) { native_write_cr4(x); diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 6d391909e864..b34625796eb2 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_SPINLOCK_H #define _ASM_X86_SPINLOCK_H diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h index 25311ebb446c..bf3e34b25afc 100644 --- a/arch/x86/include/asm/spinlock_types.h +++ b/arch/x86/include/asm/spinlock_types.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_SPINLOCK_TYPES_H #define _ASM_X86_SPINLOCK_TYPES_H diff --git a/arch/x86/include/asm/sta2x11.h b/arch/x86/include/asm/sta2x11.h index e9d32df89ccc..e0975e9c4f47 100644 --- a/arch/x86/include/asm/sta2x11.h +++ b/arch/x86/include/asm/sta2x11.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Header file for STMicroelectronics ConneXt (STA2X11) IOHub */ diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index dcbd9bcce714..371b3a4af000 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * GCC stack protector support. * @@ -74,6 +75,7 @@ static __always_inline void boot_init_stack_canary(void) get_random_bytes(&canary, sizeof(canary)); tsc = rdtsc(); canary += tsc + (tsc << 32UL); + canary &= CANARY_MASK; current->stack_canary = canary; #ifdef CONFIG_X86_64 diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 2e41c50ddf47..8da111b3c342 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs diff --git a/arch/x86/include/asm/string.h b/arch/x86/include/asm/string.h index 09224d7a5862..c3c2c1914d65 100644 --- a/arch/x86/include/asm/string.h +++ b/arch/x86/include/asm/string.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifdef CONFIG_X86_32 # include <asm/string_32.h> #else diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h index 3d3e8353ee5c..076502241eae 100644 --- a/arch/x86/include/asm/string_32.h +++ b/arch/x86/include/asm/string_32.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_STRING_32_H #define _ASM_X86_STRING_32_H @@ -142,7 +143,9 @@ static __always_inline void *__constant_memcpy(void *to, const void *from, } #define __HAVE_ARCH_MEMCPY +extern void *memcpy(void *, const void *, size_t); +#ifndef CONFIG_FORTIFY_SOURCE #ifdef CONFIG_X86_USE_3DNOW #include <asm/mmx.h> @@ -195,11 +198,15 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len) #endif #endif +#endif /* !CONFIG_FORTIFY_SOURCE */ #define __HAVE_ARCH_MEMMOVE void *memmove(void *dest, const void *src, size_t n); +extern int memcmp(const void *, const void *, size_t); +#ifndef CONFIG_FORTIFY_SOURCE #define memcmp __builtin_memcmp +#endif #define __HAVE_ARCH_MEMCHR extern void *memchr(const 
void *cs, int c, size_t count); @@ -321,6 +328,8 @@ void *__constant_c_and_count_memset(void *s, unsigned long pattern, : __memset_generic((s), (c), (count))) #define __HAVE_ARCH_MEMSET +extern void *memset(void *, int, size_t); +#ifndef CONFIG_FORTIFY_SOURCE #if (__GNUC__ >= 4) #define memset(s, c, count) __builtin_memset(s, c, count) #else @@ -330,6 +339,31 @@ void *__constant_c_and_count_memset(void *s, unsigned long pattern, (count)) \ : __memset((s), (c), (count))) #endif +#endif /* !CONFIG_FORTIFY_SOURCE */ + +#define __HAVE_ARCH_MEMSET16 +static inline void *memset16(uint16_t *s, uint16_t v, size_t n) +{ + int d0, d1; + asm volatile("rep\n\t" + "stosw" + : "=&c" (d0), "=&D" (d1) + : "a" (v), "1" (s), "0" (n) + : "memory"); + return s; +} + +#define __HAVE_ARCH_MEMSET32 +static inline void *memset32(uint32_t *s, uint32_t v, size_t n) +{ + int d0, d1; + asm volatile("rep\n\t" + "stosl" + : "=&c" (d0), "=&D" (d1) + : "a" (v), "1" (s), "0" (n) + : "memory"); + return s; +} /* * find the first occurrence of byte 'c', or 1 past the area if none diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 733bae07fb29..0b1b4445f4c5 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_STRING_64_H #define _ASM_X86_STRING_64_H @@ -31,6 +32,7 @@ static __always_inline void *__inline_memcpy(void *to, const void *from, size_t extern void *memcpy(void *to, const void *from, size_t len); extern void *__memcpy(void *to, const void *from, size_t len); +#ifndef CONFIG_FORTIFY_SOURCE #ifndef CONFIG_KMEMCHECK #if (__GNUC__ == 4 && __GNUC_MINOR__ < 3) || __GNUC__ < 4 #define memcpy(dst, src, len) \ @@ -51,11 +53,48 @@ extern void *__memcpy(void *to, const void *from, size_t len); */ #define memcpy(dst, src, len) __inline_memcpy((dst), (src), (len)) #endif +#endif /* !CONFIG_FORTIFY_SOURCE */ #define __HAVE_ARCH_MEMSET void *memset(void *s, int c, size_t n); void *__memset(void *s, int c, size_t n); +#define __HAVE_ARCH_MEMSET16 +static inline void *memset16(uint16_t *s, uint16_t v, size_t n) +{ + long d0, d1; + asm volatile("rep\n\t" + "stosw" + : "=&c" (d0), "=&D" (d1) + : "a" (v), "1" (s), "0" (n) + : "memory"); + return s; +} + +#define __HAVE_ARCH_MEMSET32 +static inline void *memset32(uint32_t *s, uint32_t v, size_t n) +{ + long d0, d1; + asm volatile("rep\n\t" + "stosl" + : "=&c" (d0), "=&D" (d1) + : "a" (v), "1" (s), "0" (n) + : "memory"); + return s; +} + +#define __HAVE_ARCH_MEMSET64 +static inline void *memset64(uint64_t *s, uint64_t v, size_t n) +{ + long d0, d1; + asm volatile("rep\n\t" + "stosq" + : "=&c" (d0), "=&D" (d1) + : "a" (v), "1" (s), "0" (n) + : "memory"); + return s; +} + #define __HAVE_ARCH_MEMMOVE void *memmove(void *dest, const void *src, size_t count); void *__memmove(void *dest, const void *src, size_t count); @@ -77,6 +116,11 @@ int strcmp(const char *cs, const char *ct); #define memcpy(dst, src, len) __memcpy(dst, src, len) #define memmove(dst, src, len) __memmove(dst, src, len) #define memset(s, c, n) __memset(s, c, n) + +#ifndef __NO_FORTIFY +#define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. 
*/ +#endif + #endif #define __HAVE_ARCH_MEMCPY_MCSAFE 1 @@ -109,6 +153,11 @@ memcpy_mcsafe(void *dst, const void *src, size_t cnt) return 0; } +#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE +#define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1 +void memcpy_flushcache(void *dst, const void *src, size_t cnt); +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_X86_STRING_64_H */ diff --git a/arch/x86/include/asm/suspend.h b/arch/x86/include/asm/suspend.h index 2fab6c2c3575..ecffe81ff65c 100644 --- a/arch/x86/include/asm/suspend.h +++ b/arch/x86/include/asm/suspend.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifdef CONFIG_X86_32 # include <asm/suspend_32.h> #else diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h index 8e9dbe7b73a1..982c325dad33 100644 --- a/arch/x86/include/asm/suspend_32.h +++ b/arch/x86/include/asm/suspend_32.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright 2001-2002 Pavel Machek <pavel@suse.cz> * Based on code diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h index 6136a18152af..7306e911faee 100644 --- a/arch/x86/include/asm/suspend_64.h +++ b/arch/x86/include/asm/suspend_64.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright 2001-2003 Pavel Machek <pavel@suse.cz> * Based on code @@ -42,8 +43,7 @@ struct saved_context { set_debugreg((thread)->debugreg##register, register) /* routines for saving/restoring kernel state */ -extern int acpi_save_state_mem(void); -extern char core_restore_code; -extern char restore_registers; +extern char core_restore_code[]; +extern char restore_registers[]; #endif /* _ASM_X86_SUSPEND_64_H */ diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 14824fc78f7e..78dd9df88157 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __SVM_H #define __SVM_H @@ -83,7 +84,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u32 event_inj; u32 event_inj_err; u64 nested_cr3; - u64 lbr_ctl; + u64 virt_ext; u32 clean; u32 reserved_5; u64 next_rip; @@ -107,6 +108,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define V_IRQ_SHIFT 8 #define V_IRQ_MASK (1 << V_IRQ_SHIFT) +#define V_GIF_SHIFT 9 +#define V_GIF_MASK (1 << V_GIF_SHIFT) + #define V_INTR_PRIO_SHIFT 16 #define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT) @@ -116,9 +120,15 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define V_INTR_MASKING_SHIFT 24 #define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT) +#define V_GIF_ENABLE_SHIFT 25 +#define V_GIF_ENABLE_MASK (1 << V_GIF_ENABLE_SHIFT) + #define AVIC_ENABLE_SHIFT 31 #define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT) +#define LBR_CTL_ENABLE_MASK BIT_ULL(0) +#define VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK BIT_ULL(1) + #define SVM_INTERRUPT_SHADOW_MASK 1 #define SVM_IOIO_STR_SHIFT 2 diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h index d2f69b9ff732..bdf9aed40403 100644 --- a/arch/x86/include/asm/swiotlb.h +++ b/arch/x86/include/asm/swiotlb.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_SWIOTLB_H #define _ASM_X86_SWIOTLB_H diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index fcc5cd387fd1..899084b70412 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_SWITCH_TO_H #define _ASM_X86_SWITCH_TO_H diff --git 
a/arch/x86/include/asm/sync_bitops.h b/arch/x86/include/asm/sync_bitops.h index cbf8847d02a0..2fe745356fb1 100644 --- a/arch/x86/include/asm/sync_bitops.h +++ b/arch/x86/include/asm/sync_bitops.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_SYNC_BITOPS_H #define _ASM_X86_SYNC_BITOPS_H diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 90395063383c..2ecd34e2d46c 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_TEXT_PATCHING_H #define _ASM_X86_TEXT_PATCHING_H diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e00e1bd6e7b3..70f425947dc5 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* thread_info.h: low-level thread information * * Copyright (C) 2002 David Howells (dhowells@redhat.com) @@ -98,6 +99,7 @@ struct thread_info { #define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ #define TIF_ADDR32 29 /* 32-bit address space on 64 bits */ #define TIF_X32 30 /* 32-bit native x86-64 binary */ +#define TIF_FSCHECK 31 /* Check FS is USER_DS on return */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -122,6 +124,7 @@ struct thread_info { #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_ADDR32 (1 << TIF_ADDR32) #define _TIF_X32 (1 << TIF_X32) +#define _TIF_FSCHECK (1 << TIF_FSCHECK) /* * work to do in syscall_trace_enter(). Also includes TIF_NOHZ for @@ -137,7 +140,8 @@ struct thread_info { (_TIF_SYSCALL_TRACE | _TIF_NOTIFY_RESUME | _TIF_SIGPENDING | \ _TIF_NEED_RESCHED | _TIF_SINGLESTEP | _TIF_SYSCALL_EMU | \ _TIF_SYSCALL_AUDIT | _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE | \ - _TIF_PATCH_PENDING | _TIF_NOHZ | _TIF_SYSCALL_TRACEPOINT) + _TIF_PATCH_PENDING | _TIF_NOHZ | _TIF_SYSCALL_TRACEPOINT | \ + _TIF_FSCHECK) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ @@ -155,17 +159,6 @@ struct thread_info { */ #ifndef __ASSEMBLY__ -static inline unsigned long current_stack_pointer(void) -{ - unsigned long sp; -#ifdef CONFIG_X86_64 - asm("mov %%rsp,%0" : "=g" (sp)); -#else - asm("mov %%esp,%0" : "=g" (sp)); -#endif - return sp; -} - /* * Walks up the stack frames to make sure that the specified object is * entirely contained by a single stack frame. 
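A note on the memset16()/memset32()/memset64() helpers introduced in the string_32.h and string_64.h hunks above: their length argument is an element count, not a byte count, since it is loaded directly into the count register for rep stosw/stosl/stosq. The following is only a portable reference sketch of that contract, not the kernel implementation (which is the inline asm shown above):

#include <stddef.h>
#include <stdint.h>

/* Reference behaviour of memset32(): fill n 32-bit elements (4 * n bytes). */
static void *memset32_ref(uint32_t *s, uint32_t v, size_t n)
{
	size_t i;

	for (i = 0; i < n; i++)
		s[i] = v;
	return s;
}

For example, memset32(table, 0xdeadbeef, 16) stores 16 dwords (64 bytes), exactly what rep stosl does with the count register set to 16.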
diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h index 92b8aec06970..cef818b16045 100644 --- a/arch/x86/include/asm/time.h +++ b/arch/x86/include/asm/time.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_TIME_H #define _ASM_X86_TIME_H diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 27e9f9d769b8..47457ab975fd 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_TIMER_H #define _ASM_X86_TIMER_H #include <linux/pm.h> @@ -29,11 +30,9 @@ struct cyc2ns_data { u32 cyc2ns_mul; u32 cyc2ns_shift; u64 cyc2ns_offset; - u32 __count; - /* u32 hole */ -}; /* 24 bytes -- do not grow */ +}; /* 16 bytes */ -extern struct cyc2ns_data *cyc2ns_read_begin(void); -extern void cyc2ns_read_end(struct cyc2ns_data *); +extern void cyc2ns_read_begin(struct cyc2ns_data *); +extern void cyc2ns_read_end(void); #endif /* _ASM_X86_TIMER_H */ diff --git a/arch/x86/include/asm/timex.h b/arch/x86/include/asm/timex.h index 1375cfc93960..a4a8b1b16c0c 100644 --- a/arch/x86/include/asm/timex.h +++ b/arch/x86/include/asm/timex.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_TIMEX_H #define _ASM_X86_TIMEX_H diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h index c7797307fc2b..cb0a1f470980 100644 --- a/arch/x86/include/asm/tlb.h +++ b/arch/x86/include/asm/tlb.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_TLB_H #define _ASM_X86_TLB_H @@ -15,4 +16,18 @@ #include <asm-generic/tlb.h> +/* + * While x86 architecture in general requires an IPI to perform TLB + * shootdown, enablement code for several hypervisors overrides + * .flush_tlb_others hook in pv_mmu_ops and implements it by issuing + * a hypercall. To keep software pagetable walkers safe in this case we + * switch to RCU based table free (HAVE_RCU_TABLE_FREE). See the comment + * below 'ifdef CONFIG_HAVE_RCU_TABLE_FREE' in include/asm-generic/tlb.h + * for more details. + */ +static inline void __tlb_remove_table(void *table) +{ + free_page_and_swap_cache(table); +} + #endif /* _ASM_X86_TLB_H */ diff --git a/arch/x86/include/asm/tlbbatch.h b/arch/x86/include/asm/tlbbatch.h new file mode 100644 index 000000000000..1ad56eb3e8a8 --- /dev/null +++ b/arch/x86/include/asm/tlbbatch.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ARCH_X86_TLBBATCH_H +#define _ARCH_X86_TLBBATCH_H + +#include <linux/cpumask.h> + +struct arch_tlbflush_unmap_batch { + /* + * Each bit set is a CPU that potentially has a TLB entry for one of + * the PFNs being flushed.. + */ + struct cpumask cpumask; +}; + +#endif /* _ARCH_X86_TLBBATCH_H */ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 6ed9ea469b48..509046cfa5ce 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_TLBFLUSH_H #define _ASM_X86_TLBFLUSH_H @@ -7,6 +8,7 @@ #include <asm/processor.h> #include <asm/cpufeature.h> #include <asm/special_insns.h> +#include <asm/smp.h> static inline void __invpcid(unsigned long pcid, unsigned long addr, unsigned long type) @@ -56,6 +58,23 @@ static inline void invpcid_flush_all_nonglobals(void) __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); } +static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) +{ + u64 new_tlb_gen; + + /* + * Bump the generation count. 
This also serves as a full barrier + * that synchronizes with switch_mm(): callers are required to order + * their read of mm_cpumask after their writes to the paging + * structures. + */ + smp_mb__before_atomic(); + new_tlb_gen = atomic64_inc_return(&mm->context.tlb_gen); + smp_mb__after_atomic(); + + return new_tlb_gen; +} + #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #else @@ -64,17 +83,87 @@ static inline void invpcid_flush_all_nonglobals(void) #define __flush_tlb_single(addr) __native_flush_tlb_single(addr) #endif +static inline bool tlb_defer_switch_to_init_mm(void) +{ + /* + * If we have PCID, then switching to init_mm is reasonably + * fast. If we don't have PCID, then switching to init_mm is + * quite slow, so we try to defer it in the hopes that we can + * avoid it entirely. The latter approach runs the risk of + * receiving otherwise unnecessary IPIs. + * + * This choice is just a heuristic. The tlb code can handle this + * function returning true or false regardless of whether we have + * PCID. + */ + return !static_cpu_has(X86_FEATURE_PCID); +} + +/* + * 6 because 6 should be plenty and struct tlb_state will fit in + * two cache lines. + */ +#define TLB_NR_DYN_ASIDS 6 + +struct tlb_context { + u64 ctx_id; + u64 tlb_gen; +}; + struct tlb_state { -#ifdef CONFIG_SMP - struct mm_struct *active_mm; - int state; -#endif + /* + * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts + * are on. This means that it may not match current->active_mm, + * which will contain the previous user mm when we're in lazy TLB + * mode even if we've already switched back to swapper_pg_dir. + */ + struct mm_struct *loaded_mm; + u16 loaded_mm_asid; + u16 next_asid; + + /* + * We can be in one of several states: + * + * - Actively using an mm. Our CPU's bit will be set in + * mm_cpumask(loaded_mm) and is_lazy == false; + * + * - Not using a real mm. loaded_mm == &init_mm. Our CPU's bit + * will not be set in mm_cpumask(&init_mm) and is_lazy == false. + * + * - Lazily using a real mm. loaded_mm != &init_mm, our bit + * is set in mm_cpumask(loaded_mm), but is_lazy == true. + * We're heuristically guessing that the CR3 load we + * skipped more than makes up for the overhead added by + * lazy mode. + */ + bool is_lazy; /* * Access to this CR4 shadow and to H/W CR4 is protected by * disabling interrupts when modifying either one. */ unsigned long cr4; + + /* + * This is a list of all contexts that might exist in the TLB. + * There is one per ASID that we use, and the ASID (what the + * CPU calls PCID) is the index into ctxts. + * + * For each context, ctx_id indicates which mm the TLB's user + * entries came from. As an invariant, the TLB will never + * contain entries that are out-of-date as when that mm reached + * the tlb_gen in the list. + * + * To be clear, this means that it's legal for the TLB code to + * flush the TLB without updating tlb_gen. This can happen + * (for now, at least) due to paravirt remote flushes. + * + * NB: context 0 is a bit special, since it's also used by + * various bits of init code. This is fine -- code that + * isn't aware of PCID will end up harmlessly flushing + * context 0. 
+ */ + struct tlb_context ctxs[TLB_NR_DYN_ASIDS]; }; DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); @@ -143,6 +232,8 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) cr4_set_bits(mask); } +extern void initialize_tlbstate_and_flush(void); + static inline void __native_flush_tlb(void) { /* @@ -151,7 +242,7 @@ static inline void __native_flush_tlb(void) * back: */ preempt_disable(); - native_write_cr3(native_read_cr3()); + native_write_cr3(__native_read_cr3()); preempt_enable(); } @@ -202,6 +293,14 @@ static inline void __flush_tlb_all(void) __flush_tlb_global(); else __flush_tlb(); + + /* + * Note: if we somehow had PCID but not PGE, then this wouldn't work -- + * we'd end up flushing kernel translations for the current ASID but + * we might fail to flush kernel translations for other cached ASIDs. + * + * To avoid this issue, we force PCID off if PGE is off. + */ } static inline void __flush_tlb_one(unsigned long addr) @@ -220,84 +319,33 @@ static inline void __flush_tlb_one(unsigned long addr) * - flush_tlb_page(vma, vmaddr) flushes one page * - flush_tlb_range(vma, start, end) flushes a range of pages * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages - * - flush_tlb_others(cpumask, mm, start, end) flushes TLBs on other cpus + * - flush_tlb_others(cpumask, info) flushes TLBs on other cpus * * ..but the i386 has somewhat limited tlb flushing capabilities, * and page-granular flushes are available only on i486 and up. */ - -#ifndef CONFIG_SMP - -/* "_up" is for UniProcessor. - * - * This is a helper for other header functions. *Not* intended to be called - * directly. All global TLB flushes need to either call this, or to bump the - * vm statistics themselves. - */ -static inline void __flush_tlb_up(void) -{ - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - __flush_tlb(); -} - -static inline void flush_tlb_all(void) -{ - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - __flush_tlb_all(); -} - -static inline void local_flush_tlb(void) -{ - __flush_tlb_up(); -} - -static inline void flush_tlb_mm(struct mm_struct *mm) -{ - if (mm == current->active_mm) - __flush_tlb_up(); -} - -static inline void flush_tlb_page(struct vm_area_struct *vma, - unsigned long addr) -{ - if (vma->vm_mm == current->active_mm) - __flush_tlb_one(addr); -} - -static inline void flush_tlb_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ - if (vma->vm_mm == current->active_mm) - __flush_tlb_up(); -} - -static inline void flush_tlb_mm_range(struct mm_struct *mm, - unsigned long start, unsigned long end, unsigned long vmflag) -{ - if (mm == current->active_mm) - __flush_tlb_up(); -} - -static inline void native_flush_tlb_others(const struct cpumask *cpumask, - struct mm_struct *mm, - unsigned long start, - unsigned long end) -{ -} - -static inline void reset_lazy_tlbstate(void) -{ -} - -static inline void flush_tlb_kernel_range(unsigned long start, - unsigned long end) -{ - flush_tlb_all(); -} - -#else /* SMP */ - -#include <asm/smp.h> +struct flush_tlb_info { + /* + * We support several kinds of flushes. + * + * - Fully flush a single mm. .mm will be set, .end will be + * TLB_FLUSH_ALL, and .new_tlb_gen will be the tlb_gen to + * which the IPI sender is trying to catch us up. + * + * - Partially flush a single mm. .mm will be set, .start and + * .end will indicate the range, and .new_tlb_gen will be set + * such that the changes between generation .new_tlb_gen-1 and + * .new_tlb_gen are entirely contained in the indicated range. 
+ * + * - Fully flush all mms whose tlb_gens have been updated. .mm + * will be NULL, .end will be TLB_FLUSH_ALL, and .new_tlb_gen + * will be zero. + */ + struct mm_struct *mm; + unsigned long start; + unsigned long end; + u64 new_tlb_gen; +}; #define local_flush_tlb() __flush_tlb() @@ -307,29 +355,30 @@ static inline void flush_tlb_kernel_range(unsigned long start, flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags) extern void flush_tlb_all(void); -extern void flush_tlb_page(struct vm_area_struct *, unsigned long); extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long vmflag); extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); -void native_flush_tlb_others(const struct cpumask *cpumask, - struct mm_struct *mm, - unsigned long start, unsigned long end); +static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) +{ + flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE); +} -#define TLBSTATE_OK 1 -#define TLBSTATE_LAZY 2 +void native_flush_tlb_others(const struct cpumask *cpumask, + const struct flush_tlb_info *info); -static inline void reset_lazy_tlbstate(void) +static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, + struct mm_struct *mm) { - this_cpu_write(cpu_tlbstate.state, 0); - this_cpu_write(cpu_tlbstate.active_mm, &init_mm); + inc_mm_tlb_gen(mm); + cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); } -#endif /* SMP */ +extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch); #ifndef CONFIG_PARAVIRT -#define flush_tlb_others(mask, mm, start, end) \ - native_flush_tlb_others(mask, mm, start, end) +#define flush_tlb_others(mask, info) \ + native_flush_tlb_others(mask, info) #endif #endif /* _ASM_X86_TLBFLUSH_H */ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 6358a85e2270..c1d2a9892352 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -75,12 +75,6 @@ static inline const struct cpumask *cpumask_of_node(int node) extern void setup_node_to_cpumask_map(void); -/* - * Returns the number of the node containing Node 'node'. This - * architecture is flat, so it is a pretty simple function! 
- */ -#define parent_node(node) (node) - #define pcibus_to_node(bus) __pcibus_to_node(bus) extern int __node_distance(int, int); diff --git a/arch/x86/include/asm/trace/common.h b/arch/x86/include/asm/trace/common.h new file mode 100644 index 000000000000..57c8da027d99 --- /dev/null +++ b/arch/x86/include/asm/trace/common.h @@ -0,0 +1,16 @@ +#ifndef _ASM_TRACE_COMMON_H +#define _ASM_TRACE_COMMON_H + +#ifdef CONFIG_TRACING +DECLARE_STATIC_KEY_FALSE(trace_pagefault_key); +#define trace_pagefault_enabled() \ + static_branch_unlikely(&trace_pagefault_key) +DECLARE_STATIC_KEY_FALSE(trace_resched_ipi_key); +#define trace_resched_ipi_enabled() \ + static_branch_unlikely(&trace_resched_ipi_key) +#else +static inline bool trace_pagefault_enabled(void) { return false; } +static inline bool trace_resched_ipi_enabled(void) { return false; } +#endif + +#endif diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h index 2422b14c50a7..69615e387973 100644 --- a/arch/x86/include/asm/trace/exceptions.h +++ b/arch/x86/include/asm/trace/exceptions.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM exceptions @@ -5,9 +6,10 @@ #define _TRACE_PAGE_FAULT_H #include <linux/tracepoint.h> +#include <asm/trace/common.h> -extern int trace_irq_vector_regfunc(void); -extern void trace_irq_vector_unregfunc(void); +extern int trace_pagefault_reg(void); +extern void trace_pagefault_unreg(void); DECLARE_EVENT_CLASS(x86_exceptions, @@ -37,8 +39,7 @@ DEFINE_EVENT_FN(x86_exceptions, name, \ TP_PROTO(unsigned long address, struct pt_regs *regs, \ unsigned long error_code), \ TP_ARGS(address, regs, error_code), \ - trace_irq_vector_regfunc, \ - trace_irq_vector_unregfunc); + trace_pagefault_reg, trace_pagefault_unreg); DEFINE_PAGE_FAULT_EVENT(page_fault_user); DEFINE_PAGE_FAULT_EVENT(page_fault_kernel); diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h index 342e59789fcd..fa60398bbc3a 100644 --- a/arch/x86/include/asm/trace/fpu.h +++ b/arch/x86/include/asm/trace/fpu.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM x86_fpu @@ -12,25 +13,22 @@ DECLARE_EVENT_CLASS(x86_fpu, TP_STRUCT__entry( __field(struct fpu *, fpu) - __field(bool, fpregs_active) - __field(bool, fpstate_active) + __field(bool, initialized) __field(u64, xfeatures) __field(u64, xcomp_bv) ), TP_fast_assign( __entry->fpu = fpu; - __entry->fpregs_active = fpu->fpregs_active; - __entry->fpstate_active = fpu->fpstate_active; + __entry->initialized = fpu->initialized; if (boot_cpu_has(X86_FEATURE_OSXSAVE)) { __entry->xfeatures = fpu->state.xsave.header.xfeatures; __entry->xcomp_bv = fpu->state.xsave.header.xcomp_bv; } ), - TP_printk("x86/fpu: %p fpregs_active: %d fpstate_active: %d xfeatures: %llx xcomp_bv: %llx", + TP_printk("x86/fpu: %p initialized: %d xfeatures: %llx xcomp_bv: %llx", __entry->fpu, - __entry->fpregs_active, - __entry->fpstate_active, + __entry->initialized, __entry->xfeatures, __entry->xcomp_bv ) diff --git a/arch/x86/include/asm/trace/hyperv.h b/arch/x86/include/asm/trace/hyperv.h new file mode 100644 index 000000000000..4253bca99989 --- /dev/null +++ b/arch/x86/include/asm/trace/hyperv.h @@ -0,0 +1,40 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hyperv + +#if !defined(_TRACE_HYPERV_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HYPERV_H + +#include <linux/tracepoint.h> + +#if IS_ENABLED(CONFIG_HYPERV) + +TRACE_EVENT(hyperv_mmu_flush_tlb_others, + TP_PROTO(const struct cpumask 
*cpus, + const struct flush_tlb_info *info), + TP_ARGS(cpus, info), + TP_STRUCT__entry( + __field(unsigned int, ncpus) + __field(struct mm_struct *, mm) + __field(unsigned long, addr) + __field(unsigned long, end) + ), + TP_fast_assign(__entry->ncpus = cpumask_weight(cpus); + __entry->mm = info->mm; + __entry->addr = info->start; + __entry->end = info->end; + ), + TP_printk("ncpus %d mm %p addr %lx, end %lx", + __entry->ncpus, __entry->mm, + __entry->addr, __entry->end) + ); + +#endif /* CONFIG_HYPERV */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH asm/trace/ +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE hyperv +#endif /* _TRACE_HYPERV_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h index 32dd6a9e343c..8eb139ed1a03 100644 --- a/arch/x86/include/asm/trace/irq_vectors.h +++ b/arch/x86/include/asm/trace/irq_vectors.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM irq_vectors @@ -5,9 +6,12 @@ #define _TRACE_IRQ_VECTORS_H #include <linux/tracepoint.h> +#include <asm/trace/common.h> -extern int trace_irq_vector_regfunc(void); -extern void trace_irq_vector_unregfunc(void); +#ifdef CONFIG_X86_LOCAL_APIC + +extern int trace_resched_ipi_reg(void); +extern void trace_resched_ipi_unreg(void); DECLARE_EVENT_CLASS(x86_irq_vector, @@ -28,15 +32,22 @@ DECLARE_EVENT_CLASS(x86_irq_vector, #define DEFINE_IRQ_VECTOR_EVENT(name) \ DEFINE_EVENT_FN(x86_irq_vector, name##_entry, \ TP_PROTO(int vector), \ + TP_ARGS(vector), NULL, NULL); \ +DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \ + TP_PROTO(int vector), \ + TP_ARGS(vector), NULL, NULL); + +#define DEFINE_RESCHED_IPI_EVENT(name) \ +DEFINE_EVENT_FN(x86_irq_vector, name##_entry, \ + TP_PROTO(int vector), \ TP_ARGS(vector), \ - trace_irq_vector_regfunc, \ - trace_irq_vector_unregfunc); \ + trace_resched_ipi_reg, \ + trace_resched_ipi_unreg); \ DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \ TP_PROTO(int vector), \ TP_ARGS(vector), \ - trace_irq_vector_regfunc, \ - trace_irq_vector_unregfunc); - + trace_resched_ipi_reg, \ + trace_resched_ipi_unreg); /* * local_timer - called when entering/exiting a local timer interrupt @@ -45,11 +56,6 @@ DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \ DEFINE_IRQ_VECTOR_EVENT(local_timer); /* - * reschedule - called when entering/exiting a reschedule vector handler - */ -DEFINE_IRQ_VECTOR_EVENT(reschedule); - -/* * spurious_apic - called when entering/exiting a spurious apic vector handler */ DEFINE_IRQ_VECTOR_EVENT(spurious_apic); @@ -65,6 +71,7 @@ DEFINE_IRQ_VECTOR_EVENT(error_apic); */ DEFINE_IRQ_VECTOR_EVENT(x86_platform_ipi); +#ifdef CONFIG_IRQ_WORK /* * irq_work - called when entering/exiting a irq work interrupt * vector handler @@ -81,6 +88,18 @@ DEFINE_IRQ_VECTOR_EVENT(irq_work); * 4) goto 1 */ TRACE_EVENT_PERF_PERM(irq_work_exit, is_sampling_event(p_event) ? -EPERM : 0); +#endif + +/* + * The ifdef is required because that tracepoint macro hell emits tracepoint + * code in files which include this header even if the tracepoint is not + * enabled. Brilliant stuff that. 
+ */ +#ifdef CONFIG_SMP +/* + * reschedule - called when entering/exiting a reschedule vector handler + */ +DEFINE_RESCHED_IPI_EVENT(reschedule); /* * call_function - called when entering/exiting a call function interrupt @@ -93,24 +112,33 @@ DEFINE_IRQ_VECTOR_EVENT(call_function); * single interrupt vector handler */ DEFINE_IRQ_VECTOR_EVENT(call_function_single); +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD /* * threshold_apic - called when entering/exiting a threshold apic interrupt * vector handler */ DEFINE_IRQ_VECTOR_EVENT(threshold_apic); +#endif +#ifdef CONFIG_X86_MCE_AMD /* * deferred_error_apic - called when entering/exiting a deferred apic interrupt * vector handler */ DEFINE_IRQ_VECTOR_EVENT(deferred_error_apic); +#endif +#ifdef CONFIG_X86_THERMAL_VECTOR /* * thermal_apic - called when entering/exiting a thermal apic interrupt * vector handler */ DEFINE_IRQ_VECTOR_EVENT(thermal_apic); +#endif + +#endif /* CONFIG_X86_LOCAL_APIC */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . diff --git a/arch/x86/include/asm/trace/mpx.h b/arch/x86/include/asm/trace/mpx.h index 0f492fc50bce..7bd92db09e8d 100644 --- a/arch/x86/include/asm/trace/mpx.h +++ b/arch/x86/include/asm/trace/mpx.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM mpx diff --git a/arch/x86/include/asm/trace_clock.h b/arch/x86/include/asm/trace_clock.h index beab86cc282d..7061a5650969 100644 --- a/arch/x86/include/asm/trace_clock.h +++ b/arch/x86/include/asm/trace_clock.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_TRACE_CLOCK_H #define _ASM_X86_TRACE_CLOCK_H diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 01fd0a7f48cd..b0cced97a6ce 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_TRAPS_H #define _ASM_X86_TRAPS_H @@ -13,9 +14,6 @@ asmlinkage void divide_error(void); asmlinkage void debug(void); asmlinkage void nmi(void); asmlinkage void int3(void); -asmlinkage void xen_debug(void); -asmlinkage void xen_int3(void); -asmlinkage void xen_stack_segment(void); asmlinkage void overflow(void); asmlinkage void bounds(void); asmlinkage void invalid_op(void); @@ -38,22 +36,29 @@ asmlinkage void machine_check(void); #endif /* CONFIG_X86_MCE */ asmlinkage void simd_coprocessor_error(void); -#ifdef CONFIG_TRACING -asmlinkage void trace_page_fault(void); -#define trace_stack_segment stack_segment -#define trace_divide_error divide_error -#define trace_bounds bounds -#define trace_invalid_op invalid_op -#define trace_device_not_available device_not_available -#define trace_coprocessor_segment_overrun coprocessor_segment_overrun -#define trace_invalid_TSS invalid_TSS -#define trace_segment_not_present segment_not_present -#define trace_general_protection general_protection -#define trace_spurious_interrupt_bug spurious_interrupt_bug -#define trace_coprocessor_error coprocessor_error -#define trace_alignment_check alignment_check -#define trace_simd_coprocessor_error simd_coprocessor_error -#define trace_async_page_fault async_page_fault +#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) +asmlinkage void xen_divide_error(void); +asmlinkage void xen_xendebug(void); +asmlinkage void xen_xenint3(void); +asmlinkage void xen_nmi(void); +asmlinkage void xen_overflow(void); +asmlinkage void xen_bounds(void); +asmlinkage void xen_invalid_op(void); +asmlinkage void xen_device_not_available(void); +asmlinkage void xen_double_fault(void); 
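The trace/common.h and irq_vectors.h hunks above move the page-fault and reschedule-IPI tracepoints to static-key gating, so a disabled tracepoint costs no more than a patched-out branch at the call site. A minimal sketch of the intended pattern, assuming kernel context (the reg/unreg bodies and the caller below are illustrative only; the real definitions belong elsewhere in this series, e.g. arch/x86/kernel/tracepoint.c):

    #include <linux/jump_label.h>

    DEFINE_STATIC_KEY_FALSE(trace_pagefault_key);

    int trace_pagefault_reg(void)
    {
            static_branch_inc(&trace_pagefault_key);    /* turn the branch on */
            return 0;
    }

    void trace_pagefault_unreg(void)
    {
            static_branch_dec(&trace_pagefault_key);
    }

    /* A fault path can then skip all tracing work while the key is off: */
    static void maybe_trace_fault(unsigned long address, struct pt_regs *regs,
                                  unsigned long error_code)
    {
            if (trace_pagefault_enabled())      /* static_branch_unlikely() */
                    trace_page_fault_user(address, regs, error_code);
    }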
+asmlinkage void xen_coprocessor_segment_overrun(void); +asmlinkage void xen_invalid_TSS(void); +asmlinkage void xen_segment_not_present(void); +asmlinkage void xen_stack_segment(void); +asmlinkage void xen_general_protection(void); +asmlinkage void xen_page_fault(void); +asmlinkage void xen_spurious_interrupt_bug(void); +asmlinkage void xen_coprocessor_error(void); +asmlinkage void xen_alignment_check(void); +#ifdef CONFIG_X86_MCE +asmlinkage void xen_machine_check(void); +#endif /* CONFIG_X86_MCE */ +asmlinkage void xen_simd_coprocessor_error(void); #endif dotraplinkage void do_divide_error(struct pt_regs *, long); @@ -74,14 +79,6 @@ asmlinkage struct pt_regs *sync_regs(struct pt_regs *); #endif dotraplinkage void do_general_protection(struct pt_regs *, long); dotraplinkage void do_page_fault(struct pt_regs *, unsigned long); -#ifdef CONFIG_TRACING -dotraplinkage void trace_do_page_fault(struct pt_regs *, unsigned long); -#else -static inline void trace_do_page_fault(struct pt_regs *regs, unsigned long error) -{ - do_page_fault(regs, error); -} -#endif dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long); dotraplinkage void do_coprocessor_error(struct pt_regs *, long); dotraplinkage void do_alignment_check(struct pt_regs *, long); diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index f5e6f1c417df..8da0efb13544 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * x86 TSC related functions */ diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index a059aac9e937..574dff4d2913 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_UACCESS_H #define _ASM_X86_UACCESS_H /* @@ -26,7 +27,12 @@ #define get_ds() (KERNEL_DS) #define get_fs() (current->thread.addr_limit) -#define set_fs(x) (current->thread.addr_limit = (x)) +static inline void set_fs(mm_segment_t fs) +{ + current->thread.addr_limit = fs; + /* On user-mode return, check fs is correct */ + set_thread_flag(TIF_FSCHECK); +} #define segment_eq(a, b) ((a).seg == (b).seg) @@ -161,11 +167,11 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) ({ \ int __ret_gu; \ register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \ - register void *__sp asm(_ASM_SP); \ __chk_user_ptr(ptr); \ might_fault(); \ asm volatile("call __get_user_%P4" \ - : "=a" (__ret_gu), "=r" (__val_gu), "+r" (__sp) \ + : "=a" (__ret_gu), "=r" (__val_gu), \ + ASM_CALL_CONSTRAINT \ : "0" (ptr), "i" (sizeof(*(ptr)))); \ (x) = (__force __typeof__(*(ptr))) __val_gu; \ __builtin_expect(__ret_gu, 0); \ @@ -332,7 +338,7 @@ do { \ _ASM_EXTABLE(1b, 4b) \ _ASM_EXTABLE(2b, 4b) \ : "=r" (retval), "=&A"(x) \ - : "m" (__m(__ptr)), "m" __m(((u32 *)(__ptr)) + 1), \ + : "m" (__m(__ptr)), "m" __m(((u32 __user *)(__ptr)) + 1), \ "i" (errret), "0" (retval)); \ }) @@ -535,9 +541,6 @@ struct __large_struct { unsigned long buf[100]; }; #define __put_user(x, ptr) \ __put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr))) -#define __get_user_unaligned __get_user -#define __put_user_unaligned __put_user - /* * {get|put}_user_try and catch * @@ -565,7 +568,6 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n); extern __must_check long strncpy_from_user(char *dst, const char __user *src, long count); -extern __must_check long strlen_user(const char __user *str); extern __must_check long strnlen_user(const char 
__user *str, long n); unsigned long __must_check clear_user(void __user *mem, unsigned long len); diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index aeda9bb8af50..72950401b223 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_UACCESS_32_H #define _ASM_X86_UACCESS_32_H diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index c5504b9a472e..f07ef3c575db 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_UACCESS_64_H #define _ASM_X86_UACCESS_64_H @@ -171,6 +172,10 @@ unsigned long raw_copy_in_user(void __user *dst, const void __user *src, unsigne extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size, int zerorest); +extern long __copy_user_flushcache(void *dst, const void __user *src, unsigned size); +extern void memcpy_page_flushcache(char *to, struct page *page, size_t offset, + size_t len); + static inline int __copy_from_user_inatomic_nocache(void *dst, const void __user *src, unsigned size) @@ -179,6 +184,13 @@ __copy_from_user_inatomic_nocache(void *dst, const void __user *src, return __copy_user_nocache(dst, src, size, 0); } +static inline int +__copy_from_user_flushcache(void *dst, const void __user *src, unsigned size) +{ + kasan_check_write(dst, size); + return __copy_user_flushcache(dst, src, size); +} + unsigned long copy_user_handle_tail(char *to, char *from, unsigned len); diff --git a/arch/x86/include/asm/unaligned.h b/arch/x86/include/asm/unaligned.h index a7bd416b4763..9c754a7447aa 100644 --- a/arch/x86/include/asm/unaligned.h +++ b/arch/x86/include/asm/unaligned.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_UNALIGNED_H #define _ASM_X86_UNALIGNED_H diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index 1ba1536f627e..51c4eee00732 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_UNISTD_H #define _ASM_X86_UNISTD_H 1 diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index e6676495b125..87adc0d38c4a 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_UNWIND_H #define _ASM_X86_UNWIND_H @@ -12,11 +13,14 @@ struct unwind_state { struct task_struct *task; int graph_idx; bool error; -#ifdef CONFIG_FRAME_POINTER +#if defined(CONFIG_ORC_UNWINDER) + bool signal, full_regs; + unsigned long sp, bp, ip; + struct pt_regs *regs; +#elif defined(CONFIG_FRAME_POINTER_UNWINDER) bool got_irq; - unsigned long *bp, *orig_sp; + unsigned long *bp, *orig_sp, ip; struct pt_regs *regs; - unsigned long ip; #else unsigned long *sp; #endif @@ -24,41 +28,30 @@ struct unwind_state { void __unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long *first_frame); - bool unwind_next_frame(struct unwind_state *state); - unsigned long unwind_get_return_address(struct unwind_state *state); +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state); static inline bool unwind_done(struct unwind_state *state) { return state->stack_info.type == STACK_TYPE_UNKNOWN; } -static inline -void unwind_start(struct unwind_state *state, struct task_struct *task, - struct pt_regs 
*regs, unsigned long *first_frame) -{ - first_frame = first_frame ? : get_stack_pointer(task, regs); - - __unwind_start(state, task, regs, first_frame); -} - static inline bool unwind_error(struct unwind_state *state) { return state->error; } -#ifdef CONFIG_FRAME_POINTER - static inline -unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +void unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame) { - if (unwind_done(state)) - return NULL; + first_frame = first_frame ? : get_stack_pointer(task, regs); - return state->regs ? &state->regs->ip : state->bp + 1; + __unwind_start(state, task, regs, first_frame); } +#if defined(CONFIG_ORC_UNWINDER) || defined(CONFIG_FRAME_POINTER_UNWINDER) static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) { if (unwind_done(state)) @@ -66,20 +59,46 @@ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) return state->regs; } - -#else /* !CONFIG_FRAME_POINTER */ - -static inline -unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +#else +static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) { return NULL; } +#endif -static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) +#ifdef CONFIG_ORC_UNWINDER +void unwind_init(void); +void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, + void *orc, size_t orc_size); +#else +static inline void unwind_init(void) {} +static inline +void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, + void *orc, size_t orc_size) {} +#endif + +/* + * This disables KASAN checking when reading a value from another task's stack, + * since the other task could be running on another CPU and could have poisoned + * the stack in the meantime. + */ +#define READ_ONCE_TASK_STACK(task, x) \ +({ \ + unsigned long val; \ + if (task == current) \ + val = READ_ONCE(x); \ + else \ + val = READ_ONCE_NOCHECK(x); \ + val; \ +}) + +static inline bool task_on_another_cpu(struct task_struct *task) { - return NULL; +#ifdef CONFIG_SMP + return task != current && task->on_cpu; +#else + return false; +#endif } -#endif /* CONFIG_FRAME_POINTER */ - #endif /* _ASM_X86_UNWIND_H */ diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h new file mode 100644 index 000000000000..bae46fc6b9de --- /dev/null +++ b/arch/x86/include/asm/unwind_hints.h @@ -0,0 +1,105 @@ +#ifndef _ASM_X86_UNWIND_HINTS_H +#define _ASM_X86_UNWIND_HINTS_H + +#include "orc_types.h" + +#ifdef __ASSEMBLY__ + +/* + * In asm, there are two kinds of code: normal C-type callable functions and + * the rest. The normal callable functions can be called by other code, and + * don't do anything unusual with the stack. Such normal callable functions + * are annotated with the ENTRY/ENDPROC macros. Most asm code falls in this + * category. In this case, no special debugging annotations are needed because + * objtool can automatically generate the ORC data for the ORC unwinder to read + * at runtime. + * + * Anything which doesn't fall into the above category, such as syscall and + * interrupt handlers, tends to not be called directly by other functions, and + * often does unusual non-C-function-type things with the stack pointer. Such + * code needs to be annotated such that objtool can understand it. The + * following CFI hint macros are for this type of code. 
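The unwind.h rework above keeps the consumer-facing API identical regardless of which unwinder (ORC, frame pointer, guess) is configured: start, step, read the return address. A hedged sketch of that loop, assuming kernel context (the helper name is hypothetical; the dump-stack code consumes the interface in the same way):

    #include <asm/unwind.h>

    static void example_dump_return_addresses(struct task_struct *task,
                                              struct pt_regs *regs)
    {
            struct unwind_state state;
            unsigned long addr;

            for (unwind_start(&state, task, regs, NULL);
                 !unwind_done(&state);
                 unwind_next_frame(&state)) {
                    addr = unwind_get_return_address(&state);
                    if (!addr)
                            break;      /* no further return address available */
                    printk(KERN_DEBUG "  %pS\n", (void *)addr);
            }
    }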
+ * + * These macros provide hints to objtool about the state of the stack at each + * instruction. Objtool starts from the hints and follows the code flow, + * making automatic CFI adjustments when it sees pushes and pops, filling out + * the debuginfo as necessary. It will also warn if it sees any + * inconsistencies. + */ +.macro UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=0 type=ORC_TYPE_CALL +#ifdef CONFIG_STACK_VALIDATION +.Lunwind_hint_ip_\@: + .pushsection .discard.unwind_hints + /* struct unwind_hint */ + .long .Lunwind_hint_ip_\@ - . + .short \sp_offset + .byte \sp_reg + .byte \type + .popsection +#endif +.endm + +.macro UNWIND_HINT_EMPTY + UNWIND_HINT sp_reg=ORC_REG_UNDEFINED +.endm + +.macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 iret=0 + .if \base == %rsp + .if \indirect + .set sp_reg, ORC_REG_SP_INDIRECT + .else + .set sp_reg, ORC_REG_SP + .endif + .elseif \base == %rbp + .set sp_reg, ORC_REG_BP + .elseif \base == %rdi + .set sp_reg, ORC_REG_DI + .elseif \base == %rdx + .set sp_reg, ORC_REG_DX + .elseif \base == %r10 + .set sp_reg, ORC_REG_R10 + .else + .error "UNWIND_HINT_REGS: bad base register" + .endif + + .set sp_offset, \offset + + .if \iret + .set type, ORC_TYPE_REGS_IRET + .elseif \extra == 0 + .set type, ORC_TYPE_REGS_IRET + .set sp_offset, \offset + (16*8) + .else + .set type, ORC_TYPE_REGS + .endif + + UNWIND_HINT sp_reg=sp_reg sp_offset=sp_offset type=type +.endm + +.macro UNWIND_HINT_IRET_REGS base=%rsp offset=0 + UNWIND_HINT_REGS base=\base offset=\offset iret=1 +.endm + +.macro UNWIND_HINT_FUNC sp_offset=8 + UNWIND_HINT sp_offset=\sp_offset +.endm + +#else /* !__ASSEMBLY__ */ + +#define UNWIND_HINT(sp_reg, sp_offset, type) \ + "987: \n\t" \ + ".pushsection .discard.unwind_hints\n\t" \ + /* struct unwind_hint */ \ + ".long 987b - .\n\t" \ + ".short " __stringify(sp_offset) "\n\t" \ + ".byte " __stringify(sp_reg) "\n\t" \ + ".byte " __stringify(type) "\n\t" \ + ".popsection\n\t" + +#define UNWIND_HINT_SAVE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_SAVE) + +#define UNWIND_HINT_RESTORE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_RESTORE) + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_UNWIND_HINTS_H */ diff --git a/arch/x86/include/asm/user.h b/arch/x86/include/asm/user.h index 59a54e869f15..413c91746b27 100644 --- a/arch/x86/include/asm/user.h +++ b/arch/x86/include/asm/user.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_USER_H #define _ASM_X86_USER_H diff --git a/arch/x86/include/asm/user32.h b/arch/x86/include/asm/user32.h index 14cbb73ebcba..fa577312f63a 100644 --- a/arch/x86/include/asm/user32.h +++ b/arch/x86/include/asm/user32.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_USER32_H #define _ASM_X86_USER32_H diff --git a/arch/x86/include/asm/user_32.h b/arch/x86/include/asm/user_32.h index bebfd8644016..d72c3d66e94f 100644 --- a/arch/x86/include/asm/user_32.h +++ b/arch/x86/include/asm/user_32.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_USER_32_H #define _ASM_X86_USER_32_H diff --git a/arch/x86/include/asm/user_64.h b/arch/x86/include/asm/user_64.h index faf2cd3e0d76..db909923611c 100644 --- a/arch/x86/include/asm/user_64.h +++ b/arch/x86/include/asm/user_64.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_USER_64_H #define _ASM_X86_USER_64_H diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h index 6686820feae9..a80c0673798f 100644 --- a/arch/x86/include/asm/uv/uv.h +++ b/arch/x86/include/asm/uv/uv.h @@ -1,6 +1,9 @@ +/* 
SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_UV_UV_H #define _ASM_X86_UV_UV_H +#include <asm/tlbflush.h> + enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC}; struct cpumask; @@ -15,10 +18,7 @@ extern void uv_cpu_init(void); extern void uv_nmi_init(void); extern void uv_system_init(void); extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, - struct mm_struct *mm, - unsigned long start, - unsigned long end, - unsigned int cpu); + const struct flush_tlb_info *info); #else /* X86_UV */ @@ -28,8 +28,8 @@ static inline int is_uv_hubless(void) { return 0; } static inline void uv_cpu_init(void) { } static inline void uv_system_init(void) { } static inline const struct cpumask * -uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, - unsigned long start, unsigned long end, unsigned int cpu) +uv_flush_tlb_others(const struct cpumask *cpumask, + const struct flush_tlb_info *info) { return cpumask; } #endif /* X86_UV */ diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index bccdf4938ddf..27566e57e87d 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_VDSO_H #define _ASM_X86_VDSO_H diff --git a/arch/x86/include/asm/vga.h b/arch/x86/include/asm/vga.h index c4b9dc2f67c5..46f9b2deab4d 100644 --- a/arch/x86/include/asm/vga.h +++ b/arch/x86/include/asm/vga.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Access to VGA videoram * @@ -7,12 +8,24 @@ #ifndef _ASM_X86_VGA_H #define _ASM_X86_VGA_H +#include <asm/set_memory.h> + /* * On the PC, we can just recalculate addresses and then * access the videoram directly without any black magic. + * To support memory encryption however, we need to access + * the videoram as decrypted memory. 
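uv_flush_tlb_others() above now takes a single const struct flush_tlb_info * instead of separate mm/start/end/cpu arguments. A sketch of how a caller bundles the range, assuming kernel context (the wrapper is hypothetical; the field names are the ones read by the hyperv_mmu_flush_tlb_others tracepoint earlier in this diff):

    static void example_flush_range(const struct cpumask *cpumask,
                                    struct mm_struct *mm,
                                    unsigned long start, unsigned long end)
    {
            struct flush_tlb_info info = {
                    .mm    = mm,
                    .start = start,
                    .end   = end,
            };

            /* Return value (a possibly reduced cpumask) is ignored in this sketch. */
            uv_flush_tlb_others(cpumask, &info);
    }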
*/ -#define VGA_MAP_MEM(x, s) (unsigned long)phys_to_virt(x) +#define VGA_MAP_MEM(x, s) \ +({ \ + unsigned long start = (unsigned long)phys_to_virt(x); \ + \ + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) \ + set_memory_decrypted(start, (s) >> PAGE_SHIFT); \ + \ + start; \ +}) #define vga_readb(x) (*(x)) #define vga_writeb(x, y) (*(y) = (x)) diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index 022e59714562..52250681f68c 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_VGTOD_H #define _ASM_X86_VGTOD_H diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h index 1e491f3af317..26efbec94448 100644 --- a/arch/x86/include/asm/vm86.h +++ b/arch/x86/include/asm/vm86.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_VM86_H #define _ASM_X86_VM86_H diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 35cd06f636ab..caec8417539f 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -72,6 +72,7 @@ #define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 #define SECONDARY_EXEC_RDRAND 0x00000800 #define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 +#define SECONDARY_EXEC_ENABLE_VMFUNC 0x00002000 #define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 #define SECONDARY_EXEC_RDSEED 0x00010000 #define SECONDARY_EXEC_ENABLE_PML 0x00020000 @@ -114,6 +115,10 @@ #define VMX_MISC_SAVE_EFER_LMA 0x00000020 #define VMX_MISC_ACTIVITY_HLT 0x00000040 +/* VMFUNC functions */ +#define VMX_VMFUNC_EPTP_SWITCHING 0x00000001 +#define VMFUNC_EPTP_ENTRIES 512 + static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic) { return vmx_basic & GENMASK_ULL(30, 0); @@ -187,6 +192,8 @@ enum vmcs_field { APIC_ACCESS_ADDR_HIGH = 0x00002015, POSTED_INTR_DESC_ADDR = 0x00002016, POSTED_INTR_DESC_ADDR_HIGH = 0x00002017, + VM_FUNCTION_CONTROL = 0x00002018, + VM_FUNCTION_CONTROL_HIGH = 0x00002019, EPT_POINTER = 0x0000201a, EPT_POINTER_HIGH = 0x0000201b, EOI_EXIT_BITMAP0 = 0x0000201c, @@ -197,6 +204,8 @@ enum vmcs_field { EOI_EXIT_BITMAP2_HIGH = 0x00002021, EOI_EXIT_BITMAP3 = 0x00002022, EOI_EXIT_BITMAP3_HIGH = 0x00002023, + EPTP_LIST_ADDRESS = 0x00002024, + EPTP_LIST_ADDRESS_HIGH = 0x00002025, VMREAD_BITMAP = 0x00002026, VMWRITE_BITMAP = 0x00002028, XSS_EXIT_BITMAP = 0x0000202C, @@ -444,6 +453,7 @@ enum vmcs_field { #define VMX_EPT_EXECUTE_ONLY_BIT (1ull) #define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6) +#define VMX_EPT_PAGE_WALK_5_BIT (1ull << 7) #define VMX_EPTP_UC_BIT (1ull << 8) #define VMX_EPTP_WB_BIT (1ull << 14) #define VMX_EPT_2MB_PAGE_BIT (1ull << 16) @@ -459,12 +469,14 @@ enum vmcs_field { #define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */ #define VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT (1ull << 11) /* (43 - 32) */ -#define VMX_EPT_DEFAULT_GAW 3 -#define VMX_EPT_MAX_GAW 0x4 #define VMX_EPT_MT_EPTE_SHIFT 3 -#define VMX_EPT_GAW_EPTP_SHIFT 3 -#define VMX_EPT_AD_ENABLE_BIT (1ull << 6) -#define VMX_EPT_DEFAULT_MT 0x6ull +#define VMX_EPTP_PWL_MASK 0x38ull +#define VMX_EPTP_PWL_4 0x18ull +#define VMX_EPTP_PWL_5 0x20ull +#define VMX_EPTP_AD_ENABLE_BIT (1ull << 6) +#define VMX_EPTP_MT_MASK 0x7ull +#define VMX_EPTP_MT_WB 0x6ull +#define VMX_EPTP_MT_UC 0x0ull #define VMX_EPT_READABLE_MASK 0x1ull #define VMX_EPT_WRITABLE_MASK 0x2ull #define VMX_EPT_EXECUTABLE_MASK 0x4ull diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index 6ba66ee79710..d9a7c659009c 100644 --- a/arch/x86/include/asm/vsyscall.h +++ 
b/arch/x86/include/asm/vsyscall.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_VSYSCALL_H #define _ASM_X86_VSYSCALL_H diff --git a/arch/x86/include/asm/word-at-a-time.h b/arch/x86/include/asm/word-at-a-time.h index 5b238981542a..06006b0351f3 100644 --- a/arch/x86/include/asm/word-at-a-time.h +++ b/arch/x86/include/asm/word-at-a-time.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_WORD_AT_A_TIME_H #define _ASM_WORD_AT_A_TIME_H diff --git a/arch/x86/include/asm/x2apic.h b/arch/x86/include/asm/x2apic.h index f90f0a587c66..78ccf28d17db 100644 --- a/arch/x86/include/asm/x2apic.h +++ b/arch/x86/include/asm/x2apic.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Common bits for X2APIC cluster/physical modes. */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 7ba7e90a9ad6..8a1ebf9540dd 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PLATFORM_H #define _ASM_X86_PLATFORM_H diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h index e6911caf5bbf..d383140e1dc8 100644 --- a/arch/x86/include/asm/xen/events.h +++ b/arch/x86/include/asm/xen/events.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_XEN_EVENTS_H #define _ASM_X86_XEN_EVENTS_H diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index f6d20f6cca12..7cb282e9e587 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -43,6 +43,7 @@ #include <asm/page.h> #include <asm/pgtable.h> +#include <asm/smap.h> #include <xen/interface/xen.h> #include <xen/interface/sched.h> @@ -50,6 +51,8 @@ #include <xen/interface/platform.h> #include <xen/interface/xen-mca.h> +struct xen_dm_op_buf; + /* * The hypercall asms have to meet several constraints: * - Work on 32- and 64-bit. 
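The vmx.h hunk above replaces the old VMX_EPT_*_GAW macros with explicit EPTP field encodings (memory type, page-walk length, A/D-bit enable). A hedged sketch of how an EPT pointer could be composed from them (the helper is hypothetical; KVM's real construction lives in its VMX code):

    static u64 example_construct_eptp(u64 root_hpa, bool enable_ad, bool la57)
    {
            u64 eptp = VMX_EPTP_MT_WB;                       /* write-back memory type */

            eptp |= la57 ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;  /* 4- or 5-level walk */
            if (enable_ad)
                    eptp |= VMX_EPTP_AD_ENABLE_BIT;          /* accessed/dirty bits */

            return eptp | (root_hpa & PAGE_MASK);            /* root table address */
    }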
@@ -110,10 +113,9 @@ extern struct { char _entry[32]; } hypercall_page[]; register unsigned long __arg2 asm(__HYPERCALL_ARG2REG) = __arg2; \ register unsigned long __arg3 asm(__HYPERCALL_ARG3REG) = __arg3; \ register unsigned long __arg4 asm(__HYPERCALL_ARG4REG) = __arg4; \ - register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5; \ - register void *__sp asm(_ASM_SP); + register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5; -#define __HYPERCALL_0PARAM "=r" (__res), "+r" (__sp) +#define __HYPERCALL_0PARAM "=r" (__res), ASM_CALL_CONSTRAINT #define __HYPERCALL_1PARAM __HYPERCALL_0PARAM, "+r" (__arg1) #define __HYPERCALL_2PARAM __HYPERCALL_1PARAM, "+r" (__arg2) #define __HYPERCALL_3PARAM __HYPERCALL_2PARAM, "+r" (__arg3) @@ -214,10 +216,12 @@ privcmd_call(unsigned call, __HYPERCALL_DECLS; __HYPERCALL_5ARG(a1, a2, a3, a4, a5); + stac(); asm volatile("call *%[call]" : __HYPERCALL_5PARAM : [call] "a" (&hypercall_page[call]) : __HYPERCALL_CLOBBER5); + clac(); return (long)__res; } @@ -474,9 +478,13 @@ HYPERVISOR_xenpmu_op(unsigned int op, void *arg) static inline int HYPERVISOR_dm_op( - domid_t dom, unsigned int nr_bufs, void *bufs) + domid_t dom, unsigned int nr_bufs, struct xen_dm_op_buf *bufs) { - return _hypercall3(int, dm_op, dom, nr_bufs, bufs); + int ret; + stac(); + ret = _hypercall3(int, dm_op, dom, nr_bufs, bufs); + clac(); + return ret; } static inline void @@ -548,10 +556,12 @@ MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr, mcl->args[0] = maddr; mcl->args[1] = *(unsigned long *)&desc; } else { + u32 *p = (u32 *)&desc; + mcl->args[0] = maddr; mcl->args[1] = maddr >> 32; - mcl->args[2] = desc.a; - mcl->args[3] = desc.b; + mcl->args[2] = *p++; + mcl->args[3] = *p; } trace_xen_mc_entry(mcl, sizeof(maddr) == sizeof(long) ? 
2 : 4); diff --git a/arch/x86/include/asm/xen/interface_32.h b/arch/x86/include/asm/xen/interface_32.h index 8413688b2571..dc40578abded 100644 --- a/arch/x86/include/asm/xen/interface_32.h +++ b/arch/x86/include/asm/xen/interface_32.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /****************************************************************************** * arch-x86_32.h * diff --git a/arch/x86/include/asm/xen/interface_64.h b/arch/x86/include/asm/xen/interface_64.h index 839a4811cf98..c599ec269a25 100644 --- a/arch/x86/include/asm/xen/interface_64.h +++ b/arch/x86/include/asm/xen/interface_64.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_XEN_INTERFACE_64_H #define _ASM_X86_XEN_INTERFACE_64_H diff --git a/arch/x86/include/asm/xen/page-coherent.h b/arch/x86/include/asm/xen/page-coherent.h index f02f025ff988..116777e7f387 100644 --- a/arch/x86/include/asm/xen/page-coherent.h +++ b/arch/x86/include/asm/xen/page-coherent.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_XEN_PAGE_COHERENT_H #define _ASM_X86_XEN_PAGE_COHERENT_H diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 8417ef7c3885..c6b84245e5ab 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_XEN_PAGE_H #define _ASM_X86_XEN_PAGE_H @@ -158,9 +159,6 @@ static inline unsigned long mfn_to_pfn_no_overrides(unsigned long mfn) unsigned long pfn; int ret; - if (xen_feature(XENFEAT_auto_translated_physmap)) - return mfn; - if (unlikely(mfn >= machine_to_phys_nr)) return ~0; @@ -317,8 +315,6 @@ static inline pte_t __pte_ma(pteval_t x) #define p4d_val_ma(x) ((x).p4d) #endif -void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid); - xmaddr_t arbitrary_virt_to_machine(void *address); unsigned long arbitrary_virt_to_mfn(void *vaddr); void make_lowmem_page_readonly(void *vaddr); diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h index f320ee32d5a1..3506d8c598c1 100644 --- a/arch/x86/include/asm/xen/pci.h +++ b/arch/x86/include/asm/xen/pci.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_XEN_PCI_H #define _ASM_X86_XEN_PCI_H diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h b/arch/x86/include/asm/xen/swiotlb-xen.h index ee52fcac6f72..6b56d0d45d15 100644 --- a/arch/x86/include/asm/xen/swiotlb-xen.h +++ b/arch/x86/include/asm/xen/swiotlb-xen.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_SWIOTLB_XEN_H #define _ASM_X86_SWIOTLB_XEN_H diff --git a/arch/x86/include/asm/xen/trace_types.h b/arch/x86/include/asm/xen/trace_types.h index 21e1874c0a0b..2aad0abd68e2 100644 --- a/arch/x86/include/asm/xen/trace_types.h +++ b/arch/x86/include/asm/xen/trace_types.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_XEN_TRACE_TYPES_H #define _ASM_XEN_TRACE_TYPES_H diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h index 546f1e3b87cc..0307e4ec5044 100644 --- a/arch/x86/include/asm/xor_64.h +++ b/arch/x86/include/asm/xor_64.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_XOR_64_H #define _ASM_X86_XOR_64_H diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild index 83b6e9a0dce4..da1489cb64dc 100644 --- a/arch/x86/include/uapi/asm/Kbuild +++ b/arch/x86/include/uapi/asm/Kbuild @@ -1,6 +1,6 @@ # UAPI Header export list include include/uapi/asm-generic/Kbuild.asm -genhdr-y += 
unistd_32.h -genhdr-y += unistd_64.h -genhdr-y += unistd_x32.h +generated-y += unistd_32.h +generated-y += unistd_64.h +generated-y += unistd_x32.h diff --git a/arch/x86/include/uapi/asm/a.out.h b/arch/x86/include/uapi/asm/a.out.h index 4684f97a5bbd..094c49d8ea8e 100644 --- a/arch/x86/include/uapi/asm/a.out.h +++ b/arch/x86/include/uapi/asm/a.out.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _ASM_X86_A_OUT_H #define _ASM_X86_A_OUT_H diff --git a/arch/x86/include/uapi/asm/auxvec.h b/arch/x86/include/uapi/asm/auxvec.h index 77203ac352de..580e3c567046 100644 --- a/arch/x86/include/uapi/asm/auxvec.h +++ b/arch/x86/include/uapi/asm/auxvec.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _ASM_X86_AUXVEC_H #define _ASM_X86_AUXVEC_H /* diff --git a/arch/x86/include/uapi/asm/bitsperlong.h b/arch/x86/include/uapi/asm/bitsperlong.h index 217909b4d6f5..5d72c8458838 100644 --- a/arch/x86/include/uapi/asm/bitsperlong.h +++ b/arch/x86/include/uapi/asm/bitsperlong.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef __ASM_X86_BITSPERLONG_H #define __ASM_X86_BITSPERLONG_H diff --git a/arch/x86/include/uapi/asm/boot.h b/arch/x86/include/uapi/asm/boot.h index 94292c4c8122..88ffc5aee087 100644 --- a/arch/x86/include/uapi/asm/boot.h +++ b/arch/x86/include/uapi/asm/boot.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _UAPI_ASM_X86_BOOT_H #define _UAPI_ASM_X86_BOOT_H diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index ddef37b16af2..afdd5ae0fcc4 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _ASM_X86_BOOTPARAM_H #define _ASM_X86_BOOTPARAM_H @@ -201,7 +202,7 @@ struct boot_params { * * @X86_SUBARCH_PC: Should be used if the hardware is enumerable using standard * PC mechanisms (PCI, ACPI) and doesn't need a special boot flow. - * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest + * @X86_SUBARCH_LGUEST: Used for x86 hypervisor demo, lguest, deprecated * @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path, * which start at asm startup_xen() entry point and later jump to the C * xen_start_kernel() entry point. 
Both domU and dom0 type of guests are diff --git a/arch/x86/include/uapi/asm/byteorder.h b/arch/x86/include/uapi/asm/byteorder.h index b13a7a88f3eb..484e3cfd7ef2 100644 --- a/arch/x86/include/uapi/asm/byteorder.h +++ b/arch/x86/include/uapi/asm/byteorder.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_BYTEORDER_H #define _ASM_X86_BYTEORDER_H diff --git a/arch/x86/include/uapi/asm/debugreg.h b/arch/x86/include/uapi/asm/debugreg.h index 3c0874dd9861..d95d080b30e3 100644 --- a/arch/x86/include/uapi/asm/debugreg.h +++ b/arch/x86/include/uapi/asm/debugreg.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _UAPI_ASM_X86_DEBUGREG_H #define _UAPI_ASM_X86_DEBUGREG_H diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h index 9dafe59cf6e2..2f491efe3a12 100644 --- a/arch/x86/include/uapi/asm/e820.h +++ b/arch/x86/include/uapi/asm/e820.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _UAPI_ASM_X86_E820_H #define _UAPI_ASM_X86_E820_H #define E820MAP 0x2d0 /* our map */ diff --git a/arch/x86/include/uapi/asm/hw_breakpoint.h b/arch/x86/include/uapi/asm/hw_breakpoint.h index 79a9626b5500..6789884c7701 100644 --- a/arch/x86/include/uapi/asm/hw_breakpoint.h +++ b/arch/x86/include/uapi/asm/hw_breakpoint.h @@ -1 +1,2 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* */ diff --git a/arch/x86/include/uapi/asm/hwcap2.h b/arch/x86/include/uapi/asm/hwcap2.h index 0bd2be5c7617..6ebaae90e207 100644 --- a/arch/x86/include/uapi/asm/hwcap2.h +++ b/arch/x86/include/uapi/asm/hwcap2.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_HWCAP2_H #define _ASM_X86_HWCAP2_H diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index 432df4b1baec..1a5bfead93b4 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _ASM_X86_HYPERV_H #define _ASM_X86_HYPERV_H @@ -34,16 +35,10 @@ #define HV_X64_MSR_REFERENCE_TSC 0x40000021 /* - * There is a single feature flag that signifies the presence of the MSR - * that can be used to retrieve both the local APIC Timer frequency as - * well as the TSC frequency. + * There is a single feature flag that signifies if the partition has access + * to MSRs with local APIC and TSC frequencies. */ - -/* Local APIC timer frequency MSR (HV_X64_MSR_APIC_FREQUENCY) is available */ -#define HV_X64_MSR_APIC_FREQUENCY_AVAILABLE (1 << 11) - -/* TSC frequency MSR (HV_X64_MSR_TSC_FREQUENCY) is available */ -#define HV_X64_MSR_TSC_FREQUENCY_AVAILABLE (1 << 11) +#define HV_X64_ACCESS_FREQUENCY_MSRS (1 << 11) /* * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM @@ -73,6 +68,9 @@ */ #define HV_X64_MSR_STAT_PAGES_AVAILABLE (1 << 8) +/* Frequency MSRs available */ +#define HV_FEATURE_FREQUENCY_MSRS_AVAILABLE (1 << 8) + /* Crash MSR available */ #define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE (1 << 10) @@ -152,6 +150,9 @@ */ #define HV_X64_DEPRECATING_AEOI_RECOMMENDED (1 << 9) +/* Recommend using the newer ExProcessorMasks interface */ +#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED (1 << 11) + /* * Crash notification flag. */ @@ -239,7 +240,11 @@ (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1)) /* Declare the various hypercall operations. 
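Earlier in this hunk the two old frequency-MSR flags collapse into HV_X64_ACCESS_FREQUENCY_MSRS (a partition privilege) plus HV_FEATURE_FREQUENCY_MSRS_AVAILABLE (a feature bit). A sketch of how a guest might gate reads of HV_X64_MSR_TSC_FREQUENCY on them, assuming kernel context (ms_hyperv is the CPUID-derived feature structure; the helper name is hypothetical):

    static bool example_hv_freq_msrs_usable(void)
    {
            return (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS) &&
                   (ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE);
    }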
*/ +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE 0x0002 +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST 0x0003 #define HVCALL_NOTIFY_LONG_SPIN_WAIT 0x0008 +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX 0x0013 +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX 0x0014 #define HVCALL_POST_MESSAGE 0x005c #define HVCALL_SIGNAL_EVENT 0x005d @@ -256,6 +261,16 @@ #define HV_PROCESSOR_POWER_STATE_C2 2 #define HV_PROCESSOR_POWER_STATE_C3 3 +#define HV_FLUSH_ALL_PROCESSORS BIT(0) +#define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1) +#define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2) +#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3) + +enum HV_GENERIC_SET_FORMAT { + HV_GENERIC_SET_SPARCE_4K, + HV_GENERIC_SET_ALL, +}; + /* hypercall status code */ #define HV_STATUS_SUCCESS 0 #define HV_STATUS_INVALID_HYPERCALL_CODE 2 diff --git a/arch/x86/include/uapi/asm/ist.h b/arch/x86/include/uapi/asm/ist.h index bad9f5ea4070..eac5b207939d 100644 --- a/arch/x86/include/uapi/asm/ist.h +++ b/arch/x86/include/uapi/asm/ist.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ /* * Include file for the interface to IST BIOS * Copyright 2002 Andy Grover <andrew.grover@intel.com> diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index c2824d02ba37..f3a960488eae 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _ASM_X86_KVM_H #define _ASM_X86_KVM_H diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index cff0bb6556f8..554aa8f24f91 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _UAPI_ASM_X86_KVM_PARA_H #define _UAPI_ASM_X86_KVM_PARA_H @@ -67,6 +68,7 @@ struct kvm_clock_pairing { #define KVM_ASYNC_PF_ENABLED (1 << 0) #define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1) +#define KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT (1 << 2) /* Operations for KVM_HC_MMU_OP */ #define KVM_MMU_OP_WRITE_PTE 1 diff --git a/arch/x86/include/uapi/asm/kvm_perf.h b/arch/x86/include/uapi/asm/kvm_perf.h index 3bb964f88aa1..125cf5cdf6c5 100644 --- a/arch/x86/include/uapi/asm/kvm_perf.h +++ b/arch/x86/include/uapi/asm/kvm_perf.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _ASM_X86_KVM_PERF_H #define _ASM_X86_KVM_PERF_H diff --git a/arch/x86/include/uapi/asm/ldt.h b/arch/x86/include/uapi/asm/ldt.h index 6e1aaf73852a..d62ac5db093b 100644 --- a/arch/x86/include/uapi/asm/ldt.h +++ b/arch/x86/include/uapi/asm/ldt.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * ldt.h * diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h index eb6247a7009b..91723461dc1f 100644 --- a/arch/x86/include/uapi/asm/mce.h +++ b/arch/x86/include/uapi/asm/mce.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _UAPI_ASM_X86_MCE_H #define _UAPI_ASM_X86_MCE_H diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h index 39bca7fac087..d4a8d0424bfb 100644 --- a/arch/x86/include/uapi/asm/mman.h +++ b/arch/x86/include/uapi/asm/mman.h @@ -1,11 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _ASM_X86_MMAN_H #define _ASM_X86_MMAN_H #define MAP_32BIT 0x40 /* only give out 32bit addresses */ -#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) -#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) 
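The mman.h hunk above drops the x86 copies of MAP_HUGE_2MB/MAP_HUGE_1GB in favor of the generic definitions; the encoding is simply log2(page size) shifted by MAP_HUGE_SHIFT. A small userspace sketch of what the flag is for (the constants are re-derived locally in case the libc headers predate the move):

    #define _GNU_SOURCE
    #include <sys/mman.h>
    #include <stdio.h>

    #ifndef MAP_HUGE_SHIFT
    #define MAP_HUGE_SHIFT 26
    #endif
    #ifndef MAP_HUGE_2MB
    #define MAP_HUGE_2MB   (21 << MAP_HUGE_SHIFT)   /* log2(2 MiB) == 21 */
    #endif

    int main(void)
    {
            size_t len = 2UL << 20;                  /* one 2 MiB huge page */
            void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_2MB,
                           -1, 0);

            if (p == MAP_FAILED) {
                    perror("mmap");                  /* e.g. no hugetlb pool configured */
                    return 1;
            }
            munmap(p, len);
            return 0;
    }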
- #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS /* * Take the 4 protection key bits out of the vma->vm_flags diff --git a/arch/x86/include/uapi/asm/msr.h b/arch/x86/include/uapi/asm/msr.h index c41f4fe25483..e7516b402a00 100644 --- a/arch/x86/include/uapi/asm/msr.h +++ b/arch/x86/include/uapi/asm/msr.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _UAPI_ASM_X86_MSR_H #define _UAPI_ASM_X86_MSR_H diff --git a/arch/x86/include/uapi/asm/mtrr.h b/arch/x86/include/uapi/asm/mtrr.h index 7528dcf59691..376563f2bac1 100644 --- a/arch/x86/include/uapi/asm/mtrr.h +++ b/arch/x86/include/uapi/asm/mtrr.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.0+ WITH Linux-syscall-note */ /* Generic MTRR (Memory Type Range Register) ioctls. Copyright (C) 1997-1999 Richard Gooch diff --git a/arch/x86/include/uapi/asm/perf_regs.h b/arch/x86/include/uapi/asm/perf_regs.h index 3f2207bfd17b..f3329cabce5c 100644 --- a/arch/x86/include/uapi/asm/perf_regs.h +++ b/arch/x86/include/uapi/asm/perf_regs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _ASM_X86_PERF_REGS_H #define _ASM_X86_PERF_REGS_H diff --git a/arch/x86/include/uapi/asm/posix_types.h b/arch/x86/include/uapi/asm/posix_types.h index 85506b383627..c661e95f0134 100644 --- a/arch/x86/include/uapi/asm/posix_types.h +++ b/arch/x86/include/uapi/asm/posix_types.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef __KERNEL__ # ifdef __i386__ # include <asm/posix_types_32.h> diff --git a/arch/x86/include/uapi/asm/posix_types_32.h b/arch/x86/include/uapi/asm/posix_types_32.h index 8e525059e7d8..840659f4b96f 100644 --- a/arch/x86/include/uapi/asm/posix_types_32.h +++ b/arch/x86/include/uapi/asm/posix_types_32.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _ASM_X86_POSIX_TYPES_32_H #define _ASM_X86_POSIX_TYPES_32_H diff --git a/arch/x86/include/uapi/asm/posix_types_64.h b/arch/x86/include/uapi/asm/posix_types_64.h index cba0c1ead162..515afb8059ce 100644 --- a/arch/x86/include/uapi/asm/posix_types_64.h +++ b/arch/x86/include/uapi/asm/posix_types_64.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _ASM_X86_POSIX_TYPES_64_H #define _ASM_X86_POSIX_TYPES_64_H diff --git a/arch/x86/include/uapi/asm/posix_types_x32.h b/arch/x86/include/uapi/asm/posix_types_x32.h index 85f9bdafa93c..f60479b07fc8 100644 --- a/arch/x86/include/uapi/asm/posix_types_x32.h +++ b/arch/x86/include/uapi/asm/posix_types_x32.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _ASM_X86_POSIX_TYPES_X32_H #define _ASM_X86_POSIX_TYPES_X32_H diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h index c45765517092..5a6aac9fa41f 100644 --- a/arch/x86/include/uapi/asm/prctl.h +++ b/arch/x86/include/uapi/asm/prctl.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _ASM_X86_PRCTL_H #define _ASM_X86_PRCTL_H diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h index 567de50a4c2a..6f3355399665 100644 --- a/arch/x86/include/uapi/asm/processor-flags.h +++ b/arch/x86/include/uapi/asm/processor-flags.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _UAPI_ASM_X86_PROCESSOR_FLAGS_H #define _UAPI_ASM_X86_PROCESSOR_FLAGS_H /* Various flags defined: can be included from assembler. 
*/ @@ -104,6 +105,8 @@ #define X86_CR4_OSFXSR _BITUL(X86_CR4_OSFXSR_BIT) #define X86_CR4_OSXMMEXCPT_BIT 10 /* enable unmasked SSE exceptions */ #define X86_CR4_OSXMMEXCPT _BITUL(X86_CR4_OSXMMEXCPT_BIT) +#define X86_CR4_LA57_BIT 12 /* enable 5-level page tables */ +#define X86_CR4_LA57 _BITUL(X86_CR4_LA57_BIT) #define X86_CR4_VMXE_BIT 13 /* enable VMX virtualization */ #define X86_CR4_VMXE _BITUL(X86_CR4_VMXE_BIT) #define X86_CR4_SMXE_BIT 14 /* enable safer mode (TXT) */ diff --git a/arch/x86/include/uapi/asm/ptrace-abi.h b/arch/x86/include/uapi/asm/ptrace-abi.h index 580aee3072e0..16074b9c93bb 100644 --- a/arch/x86/include/uapi/asm/ptrace-abi.h +++ b/arch/x86/include/uapi/asm/ptrace-abi.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _ASM_X86_PTRACE_ABI_H #define _ASM_X86_PTRACE_ABI_H diff --git a/arch/x86/include/uapi/asm/ptrace.h b/arch/x86/include/uapi/asm/ptrace.h index bc16115af39b..85165c0edafc 100644 --- a/arch/x86/include/uapi/asm/ptrace.h +++ b/arch/x86/include/uapi/asm/ptrace.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _UAPI_ASM_X86_PTRACE_H #define _UAPI_ASM_X86_PTRACE_H diff --git a/arch/x86/include/uapi/asm/sembuf.h b/arch/x86/include/uapi/asm/sembuf.h index cc2d6a3aeae7..cabd7476bd6c 100644 --- a/arch/x86/include/uapi/asm/sembuf.h +++ b/arch/x86/include/uapi/asm/sembuf.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _ASM_X86_SEMBUF_H #define _ASM_X86_SEMBUF_H diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index 62d4111c1c54..844d60eb1882 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _UAPI_ASM_X86_SIGCONTEXT_H #define _UAPI_ASM_X86_SIGCONTEXT_H diff --git a/arch/x86/include/uapi/asm/sigcontext32.h b/arch/x86/include/uapi/asm/sigcontext32.h index a92b0f0dc09e..6b18e88de8a6 100644 --- a/arch/x86/include/uapi/asm/sigcontext32.h +++ b/arch/x86/include/uapi/asm/sigcontext32.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_SIGCONTEXT32_H #define _ASM_X86_SIGCONTEXT32_H diff --git a/arch/x86/include/uapi/asm/siginfo.h b/arch/x86/include/uapi/asm/siginfo.h index 34c47b3341c0..b3d157957177 100644 --- a/arch/x86/include/uapi/asm/siginfo.h +++ b/arch/x86/include/uapi/asm/siginfo.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _ASM_X86_SIGINFO_H #define _ASM_X86_SIGINFO_H diff --git a/arch/x86/include/uapi/asm/signal.h b/arch/x86/include/uapi/asm/signal.h index 8264f47cf53e..e5745d593dc7 100644 --- a/arch/x86/include/uapi/asm/signal.h +++ b/arch/x86/include/uapi/asm/signal.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _UAPI_ASM_X86_SIGNAL_H #define _UAPI_ASM_X86_SIGNAL_H diff --git a/arch/x86/include/uapi/asm/stat.h b/arch/x86/include/uapi/asm/stat.h index bc03eb5d6360..9e3982d95d0f 100644 --- a/arch/x86/include/uapi/asm/stat.h +++ b/arch/x86/include/uapi/asm/stat.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _ASM_X86_STAT_H #define _ASM_X86_STAT_H diff --git a/arch/x86/include/uapi/asm/statfs.h b/arch/x86/include/uapi/asm/statfs.h index 2d0adbf99a8e..13c2464cd2c4 100644 --- a/arch/x86/include/uapi/asm/statfs.h +++ b/arch/x86/include/uapi/asm/statfs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note 
*/ #ifndef _ASM_X86_STATFS_H #define _ASM_X86_STATFS_H diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index 3725e145aa58..a9731f8a480f 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _UAPI__SVM_H #define _UAPI__SVM_H diff --git a/arch/x86/include/uapi/asm/swab.h b/arch/x86/include/uapi/asm/swab.h index 7f235c7105c1..cd3fd8ddbe9a 100644 --- a/arch/x86/include/uapi/asm/swab.h +++ b/arch/x86/include/uapi/asm/swab.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _ASM_X86_SWAB_H #define _ASM_X86_SWAB_H diff --git a/arch/x86/include/uapi/asm/types.h b/arch/x86/include/uapi/asm/types.h index 8e8c23fef08c..df55e1ddb0c9 100644 --- a/arch/x86/include/uapi/asm/types.h +++ b/arch/x86/include/uapi/asm/types.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_TYPES_H #define _ASM_X86_TYPES_H diff --git a/arch/x86/include/uapi/asm/ucontext.h b/arch/x86/include/uapi/asm/ucontext.h index e3d1ec90616e..5657b7a49f03 100644 --- a/arch/x86/include/uapi/asm/ucontext.h +++ b/arch/x86/include/uapi/asm/ucontext.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _ASM_X86_UCONTEXT_H #define _ASM_X86_UCONTEXT_H diff --git a/arch/x86/include/uapi/asm/unistd.h b/arch/x86/include/uapi/asm/unistd.h index a26df0d75cd0..30d7d04d72d6 100644 --- a/arch/x86/include/uapi/asm/unistd.h +++ b/arch/x86/include/uapi/asm/unistd.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _UAPI_ASM_X86_UNISTD_H #define _UAPI_ASM_X86_UNISTD_H diff --git a/arch/x86/include/uapi/asm/vm86.h b/arch/x86/include/uapi/asm/vm86.h index e0b243e9d859..d2ee4e307ef8 100644 --- a/arch/x86/include/uapi/asm/vm86.h +++ b/arch/x86/include/uapi/asm/vm86.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _UAPI_ASM_X86_VM86_H #define _UAPI_ASM_X86_VM86_H diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 690a2dcf4078..f0b0c90dd398 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * vmx.h: VMX Architecture related definitions * Copyright (c) 2004, Intel Corporation. diff --git a/arch/x86/include/uapi/asm/vsyscall.h b/arch/x86/include/uapi/asm/vsyscall.h index b97dd6e263d2..75275f547444 100644 --- a/arch/x86/include/uapi/asm/vsyscall.h +++ b/arch/x86/include/uapi/asm/vsyscall.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _UAPI_ASM_X86_VSYSCALL_H #define _UAPI_ASM_X86_VSYSCALL_H diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 4b994232cb57..5f70044340ff 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for the linux kernel. 
# @@ -18,6 +19,7 @@ CFLAGS_REMOVE_pvclock.o = -pg CFLAGS_REMOVE_kvmclock.o = -pg CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_early_printk.o = -pg +CFLAGS_REMOVE_head64.o = -pg endif KASAN_SANITIZE_head$(BITS).o := n @@ -29,6 +31,7 @@ OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y OBJECT_FILES_NON_STANDARD_test_nx.o := y +OBJECT_FILES_NON_STANDARD_paravirt_patch_$(BITS).o := y # If instrumentation of this dir is enabled, boot hangs during first second. # Probably could be more selective here, but note that files related to irqs, @@ -40,7 +43,7 @@ CFLAGS_irq.o := -I$(src)/../include/asm/trace obj-y := process_$(BITS).o signal.o obj-$(CONFIG_COMPAT) += signal_compat.o -obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o +obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o dumpstack.o nmi.o obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o @@ -109,6 +112,7 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o +obj-$(CONFIG_EISA) += eisa.o obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o @@ -124,11 +128,9 @@ obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o -ifdef CONFIG_FRAME_POINTER -obj-y += unwind_frame.o -else -obj-y += unwind_guess.o -endif +obj-$(CONFIG_ORC_UNWINDER) += unwind_orc.o +obj-$(CONFIG_FRAME_POINTER_UNWINDER) += unwind_frame.o +obj-$(CONFIG_GUESS_UNWINDER) += unwind_guess.o ### # 64 bit specific files diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile index 26b78d86f25a..f1bb57b0e41e 100644 --- a/arch/x86/kernel/acpi/Makefile +++ b/arch/x86/kernel/acpi/Makefile @@ -1,3 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 +OBJECT_FILES_NON_STANDARD_wakeup_$(BITS).o := y + obj-$(CONFIG_ACPI) += boot.o obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o obj-$(CONFIG_ACPI_APEI) += apei.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 6bb680671088..079535e53e2a 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -115,24 +115,24 @@ static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = { #define ACPI_INVALID_GSI INT_MIN /* - * This is just a simple wrapper around early_ioremap(), + * This is just a simple wrapper around early_memremap(), * with sanity checks for phys == 0 and size == 0. 
*/ -char *__init __acpi_map_table(unsigned long phys, unsigned long size) +void __init __iomem *__acpi_map_table(unsigned long phys, unsigned long size) { if (!phys || !size) return NULL; - return early_ioremap(phys, size); + return early_memremap(phys, size); } -void __init __acpi_unmap_table(char *map, unsigned long size) +void __init __acpi_unmap_table(void __iomem *map, unsigned long size) { if (!map || !size) return; - early_iounmap(map, size); + early_memunmap(map, size); } #ifdef CONFIG_X86_LOCAL_APIC @@ -199,8 +199,10 @@ static int __init acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) { struct acpi_madt_local_x2apic *processor = NULL; +#ifdef CONFIG_X86_X2APIC int apic_id; u8 enabled; +#endif processor = (struct acpi_madt_local_x2apic *)header; @@ -209,9 +211,10 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) acpi_table_print_madt_entry(header); +#ifdef CONFIG_X86_X2APIC apic_id = processor->local_apic_id; enabled = processor->lapic_flags & ACPI_MADT_ENABLED; -#ifdef CONFIG_X86_X2APIC + /* * We need to register disabled CPU as well to permit * counting disabled CPUs. This allows us to size @@ -347,6 +350,14 @@ static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, struct mpc_intsrc mp_irq; /* + * Check bus_irq boundary. + */ + if (bus_irq >= NR_IRQS_LEGACY) { + pr_warn("Invalid bus_irq %u for legacy override\n", bus_irq); + return; + } + + /* * Convert 'gsi' to 'ioapic.pin'. */ ioapic = mp_find_ioapic(gsi); @@ -1075,7 +1086,7 @@ static void __init mp_config_acpi_legacy_irqs(void) mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; #endif set_bit(MP_ISA_BUS, mp_bus_not_pci); - pr_debug("Bus #%d is ISA\n", MP_ISA_BUS); + pr_debug("Bus #%d is ISA (nIRQs: %d)\n", MP_ISA_BUS, nr_legacy_irqs()); /* * Use the default configuration for the IRQs 0-15. Unless @@ -1362,7 +1373,7 @@ static void __init acpi_reduced_hw_init(void) * If your system is blacklisted here, but you find that acpi=force * works for you, please contact linux-acpi@vger.kernel.org */ -static struct dmi_system_id __initdata acpi_dmi_table[] = { +static const struct dmi_system_id acpi_dmi_table[] __initconst = { /* * Boxes that need ACPI disabled */ @@ -1437,7 +1448,7 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { }; /* second table for DMI checks that should run after early-quirks */ -static struct dmi_system_id __initdata acpi_dmi_table_late[] = { +static const struct dmi_system_id acpi_dmi_table_late[] __initconst = { /* * HP laptops which use a DSDT reporting as HP/SB400/10000, * which includes some code which overrides all temperature diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 8233a630280f..dde437f5d14f 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -167,7 +167,8 @@ static int __init ffh_cstate_init(void) { struct cpuinfo_x86 *c = &boot_cpu_data; - if (c->x86_vendor != X86_VENDOR_INTEL) + if (c->x86_vendor != X86_VENDOR_INTEL && + c->x86_vendor != X86_VENDOR_AMD) return -1; cpu_cstate_entry = alloc_percpu(struct cstate_entry); diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index ed014814ea35..7188aea91549 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * sleep.c - x86-specific ACPI sleep support. 
* diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h index 65c7b606b606..fbb60ca4255c 100644 --- a/arch/x86/kernel/acpi/sleep.h +++ b/arch/x86/kernel/acpi/sleep.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Variables and functions used by the code in sleep.c */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 32e14d137416..3344d3382e91 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -742,7 +742,16 @@ static void *bp_int3_handler, *bp_int3_addr; int poke_int3_handler(struct pt_regs *regs) { - /* bp_patching_in_progress */ + /* + * Having observed our INT3 instruction, we now must observe + * bp_patching_in_progress. + * + * in_progress = TRUE INT3 + * WMB RMB + * write INT3 if (in_progress) + * + * Idem for bp_int3_handler. + */ smp_rmb(); if (likely(!bp_patching_in_progress)) @@ -788,9 +797,8 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) bp_int3_addr = (u8 *)addr + sizeof(int3); bp_patching_in_progress = true; /* - * Corresponding read barrier in int3 notifier for - * making sure the in_progress flags is correctly ordered wrt. - * patching + * Corresponding read barrier in int3 notifier for making sure the + * in_progress and handler are correctly ordered wrt. patching. */ smp_wmb(); @@ -815,9 +823,11 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) text_poke(addr, opcode, sizeof(int3)); on_each_cpu(do_sync_core, NULL, 1); - + /* + * sync_core() implies an smp_mb() and orders this store against + * the writing of the new instruction. + */ bp_patching_in_progress = false; - smp_wmb(); return addr; } diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 815dd63f49d0..cc0e8bc0ea3f 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -704,6 +704,7 @@ static const struct dma_map_ops gart_dma_ops = { .alloc = gart_alloc_coherent, .free = gart_free_coherent, .mapping_error = gart_mapping_error, + .dma_supported = x86_dma_supported, }; static void gart_iommu_shutdown(void) diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 458da8509b75..6db28f17ff28 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -27,6 +27,8 @@ static const struct pci_device_id amd_root_ids[] = { {} }; +#define PCI_DEVICE_ID_AMD_CNB17H_F4 0x1704 + const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, @@ -37,6 +39,7 @@ const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) }, {} }; EXPORT_SYMBOL_GPL(amd_nb_misc_ids); @@ -48,6 +51,7 @@ static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) }, {} }; @@ -402,11 +406,48 @@ void amd_flush_garts(void) } EXPORT_SYMBOL_GPL(amd_flush_garts); +static void __fix_erratum_688(void *info) +{ +#define MSR_AMD64_IC_CFG 0xC0011021 + + msr_set_bit(MSR_AMD64_IC_CFG, 3); + msr_set_bit(MSR_AMD64_IC_CFG, 
14); +} + +/* Apply erratum 688 fix so machines without a BIOS fix work. */ +static __init void fix_erratum_688(void) +{ + struct pci_dev *F4; + u32 val; + + if (boot_cpu_data.x86 != 0x14) + return; + + if (!amd_northbridges.num) + return; + + F4 = node_to_amd_nb(0)->link; + if (!F4) + return; + + if (pci_read_config_dword(F4, 0x164, &val)) + return; + + if (val & BIT(2)) + return; + + on_each_cpu(__fix_erratum_688, NULL, 0); + + pr_info("x86/cpu/AMD: CPU erratum 688 worked around\n"); +} + static __init int init_amd_nbs(void) { amd_cache_northbridges(); amd_cache_gart(); + fix_erratum_688(); + return 0; } diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index ef2859f9fcce..f5d92bc3b884 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Firmware replacement code. * diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 8e63ebdcbd0b..2fb7309c6900 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for local APIC drivers and for the IO-APIC code # diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 2d75faf743f2..ff891772c9f8 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -54,6 +54,8 @@ #include <asm/mce.h> #include <asm/tsc.h> #include <asm/hypervisor.h> +#include <asm/cpu_device_id.h> +#include <asm/intel-family.h> unsigned int num_processors; @@ -175,8 +177,6 @@ static int disable_apic_timer __initdata; int local_apic_timer_c2_ok; EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); -int first_system_vector = FIRST_SYSTEM_VECTOR; - /* * Debug level, exported for io_apic.c */ @@ -545,6 +545,96 @@ static struct clock_event_device lapic_clockevent = { }; static DEFINE_PER_CPU(struct clock_event_device, lapic_events); +#define DEADLINE_MODEL_MATCH_FUNC(model, func) \ + { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&func } + +#define DEADLINE_MODEL_MATCH_REV(model, rev) \ + { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)rev } + +static u32 hsx_deadline_rev(void) +{ + switch (boot_cpu_data.x86_mask) { + case 0x02: return 0x3a; /* EP */ + case 0x04: return 0x0f; /* EX */ + } + + return ~0U; +} + +static u32 bdx_deadline_rev(void) +{ + switch (boot_cpu_data.x86_mask) { + case 0x02: return 0x00000011; + case 0x03: return 0x0700000e; + case 0x04: return 0x0f00000c; + case 0x05: return 0x0e000003; + } + + return ~0U; +} + +static u32 skx_deadline_rev(void) +{ + switch (boot_cpu_data.x86_mask) { + case 0x03: return 0x01000136; + case 0x04: return 0x02000014; + } + + return ~0U; +} + +static const struct x86_cpu_id deadline_match[] = { + DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_HASWELL_X, hsx_deadline_rev), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_X, 0x0b000020), + DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_BROADWELL_XEON_D, bdx_deadline_rev), + DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_SKYLAKE_X, skx_deadline_rev), + + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_CORE, 0x22), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_ULT, 0x20), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_GT3E, 0x17), + + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_CORE, 0x25), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_GT3E, 0x17), + + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_MOBILE, 0xb2), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_DESKTOP, 0xb2), + + DEADLINE_MODEL_MATCH_REV ( 
INTEL_FAM6_KABYLAKE_MOBILE, 0x52), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE_DESKTOP, 0x52), + + {}, +}; + +static void apic_check_deadline_errata(void) +{ + const struct x86_cpu_id *m; + u32 rev; + + if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER) || + boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return; + + m = x86_match_cpu(deadline_match); + if (!m) + return; + + /* + * Function pointers will have the MSB set due to address layout, + * immediate revisions will not. + */ + if ((long)m->driver_data < 0) + rev = ((u32 (*)(void))(m->driver_data))(); + else + rev = (u32)m->driver_data; + + if (boot_cpu_data.microcode >= rev) + return; + + setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); + pr_err(FW_BUG "TSC_DEADLINE disabled due to Errata; " + "please update microcode to version: 0x%x (or later)\n", rev); +} + /* * Setup the local APIC timer for this CPU. Copy the initialized values * of the boot CPU and register the clock event in the framework. @@ -563,6 +653,7 @@ static void setup_APIC_timer(void) levt->cpumask = cpumask_of(smp_processor_id()); if (this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) { + levt->name = "lapic-deadline"; levt->features &= ~(CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_DUMMY); levt->set_next_event = lapic_next_deadline; @@ -912,8 +1003,7 @@ void setup_secondary_APIC_clock(void) */ static void local_apic_timer_interrupt(void) { - int cpu = smp_processor_id(); - struct clock_event_device *evt = &per_cpu(lapic_events, cpu); + struct clock_event_device *evt = this_cpu_ptr(&lapic_events); /* * Normally we should not be here till LAPIC has been initialized but @@ -927,7 +1017,8 @@ static void local_apic_timer_interrupt(void) * spurious. */ if (!evt->event_handler) { - pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", cpu); + pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", + smp_processor_id()); /* Switch it off */ lapic_timer_shutdown(evt); return; @@ -962,25 +1053,6 @@ __visible void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) * interrupt lock, which is the WrongThing (tm) to do. */ entering_ack_irq(); - local_apic_timer_interrupt(); - exiting_irq(); - - set_irq_regs(old_regs); -} - -__visible void __irq_entry smp_trace_apic_timer_interrupt(struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - - /* - * NOTE! We'd better ACK the irq immediately, - * because timer handling can be slow. - * - * update_process_times() expects us to have done irq_enter(). - * Besides, if we don't timer interrupts ignore the global - * interrupt lock, which is the WrongThing (tm) to do. - */ - entering_ack_irq(); trace_local_timer_entry(LOCAL_TIMER_VECTOR); local_apic_timer_interrupt(); trace_local_timer_exit(LOCAL_TIMER_VECTOR); @@ -1779,6 +1851,8 @@ void __init init_apic_mappings(void) { unsigned int new_apicid; + apic_check_deadline_errata(); + if (x2apic_mode) { boot_cpu_physical_apicid = read_apic_id(); return; @@ -1840,10 +1914,14 @@ void __init register_lapic_address(unsigned long address) /* * This interrupt should _never_ happen with our APIC/SMP architecture */ -static void __smp_spurious_interrupt(u8 vector) +__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs) { + u8 vector = ~regs->orig_ax; u32 v; + entering_irq(); + trace_spurious_apic_entry(vector); + /* * Check if this really is a spurious interrupt and ACK it * if it is a vectored one. Just in case... 
@@ -1858,22 +1936,7 @@ static void __smp_spurious_interrupt(u8 vector) /* see sw-dev-man vol 3, chapter 7.4.13.5 */ pr_info("spurious APIC interrupt through vector %02x on CPU#%d, " "should never happen.\n", vector, smp_processor_id()); -} -__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs) -{ - entering_irq(); - __smp_spurious_interrupt(~regs->orig_ax); - exiting_irq(); -} - -__visible void __irq_entry smp_trace_spurious_interrupt(struct pt_regs *regs) -{ - u8 vector = ~regs->orig_ax; - - entering_irq(); - trace_spurious_apic_entry(vector); - __smp_spurious_interrupt(vector); trace_spurious_apic_exit(vector); exiting_irq(); } @@ -1881,10 +1944,8 @@ __visible void __irq_entry smp_trace_spurious_interrupt(struct pt_regs *regs) /* * This interrupt should never happen with our APIC/SMP architecture */ -static void __smp_error_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs) { - u32 v; - u32 i = 0; static const char * const error_interrupt_reason[] = { "Send CS error", /* APIC Error Bit 0 */ "Receive CS error", /* APIC Error Bit 1 */ @@ -1895,6 +1956,10 @@ static void __smp_error_interrupt(struct pt_regs *regs) "Received illegal vector", /* APIC Error Bit 6 */ "Illegal register address", /* APIC Error Bit 7 */ }; + u32 v, i = 0; + + entering_irq(); + trace_error_apic_entry(ERROR_APIC_VECTOR); /* First tickle the hardware, only then report what went on. -- REW */ if (lapic_get_maxlvt() > 3) /* Due to the Pentium erratum 3AP. */ @@ -1916,20 +1981,6 @@ static void __smp_error_interrupt(struct pt_regs *regs) apic_printk(APIC_DEBUG, KERN_CONT "\n"); -} - -__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs) -{ - entering_irq(); - __smp_error_interrupt(regs); - exiting_irq(); -} - -__visible void __irq_entry smp_trace_error_interrupt(struct pt_regs *regs) -{ - entering_irq(); - trace_error_apic_entry(ERROR_APIC_VECTOR); - __smp_error_interrupt(regs); trace_error_apic_exit(ERROR_APIC_VECTOR); exiting_irq(); } @@ -2057,7 +2108,7 @@ static int allocate_logical_cpuid(int apicid) /* Allocate a new cpuid. */ if (nr_logical_cpuids >= nr_cpu_ids) { - WARN_ONCE(1, "APIC: NR_CPUS/possible_cpus limit of %i reached. " + WARN_ONCE(1, "APIC: NR_CPUS/possible_cpus limit of %u reached. " "Processor %d/0x%x and the rest are ignored.\n", nr_cpu_ids, nr_logical_cpuids, apicid); return -EINVAL; @@ -2090,7 +2141,7 @@ int generic_processor_info(int apicid, int version) * Since fixing handling of boot_cpu_physical_apicid requires * another discussion and tests on each platform, we leave it * for now and here we use read_apic_id() directly in this - * function, __generic_processor_info(). + * function, generic_processor_info(). 
*/ if (disabled_cpu_apicid != BAD_APICID && disabled_cpu_apicid != read_apic_id() && @@ -2201,23 +2252,32 @@ void default_init_apic_ldr(void) apic_write(APIC_LDR, val); } -int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask, - unsigned int *apicid) +int default_cpu_mask_to_apicid(const struct cpumask *mask, + struct irq_data *irqdata, + unsigned int *apicid) { - unsigned int cpu; + unsigned int cpu = cpumask_first(mask); - for_each_cpu_and(cpu, cpumask, andmask) { - if (cpumask_test_cpu(cpu, cpu_online_mask)) - break; - } + if (cpu >= nr_cpu_ids) + return -EINVAL; + *apicid = per_cpu(x86_cpu_to_apicid, cpu); + irq_data_update_effective_affinity(irqdata, cpumask_of(cpu)); + return 0; +} - if (likely(cpu < nr_cpu_ids)) { - *apicid = per_cpu(x86_cpu_to_apicid, cpu); - return 0; - } +int flat_cpu_mask_to_apicid(const struct cpumask *mask, + struct irq_data *irqdata, + unsigned int *apicid) - return -EINVAL; +{ + struct cpumask *effmsk = irq_data_get_effective_affinity_mask(irqdata); + unsigned long cpu_mask = cpumask_bits(mask)[0] & APIC_ALL_CPUS; + + if (!cpu_mask) + return -EINVAL; + *apicid = (unsigned int)cpu_mask; + cpumask_bits(effmsk)[0] = cpu_mask; + return 0; } /* diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index a4d7ff20ed22..dedd5a41ba48 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -172,7 +172,7 @@ static struct apic apic_flat __ro_after_init = { .get_apic_id = flat_get_apic_id, .set_apic_id = set_apic_id, - .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, .send_IPI = default_send_IPI_single, .send_IPI_mask = flat_send_IPI_mask, @@ -268,7 +268,7 @@ static struct apic apic_physflat __ro_after_init = { .get_apic_id = flat_get_apic_id, .set_apic_id = set_apic_id, - .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = default_cpu_mask_to_apicid, .send_IPI = default_send_IPI_single_phys, .send_IPI_mask = default_send_IPI_mask_sequence_phys, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 2262eb6df796..c8d211277315 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * NOOP APIC driver. 
* @@ -141,7 +142,7 @@ struct apic apic_noop __ro_after_init = { .get_apic_id = noop_get_apic_id, .set_apic_id = NULL, - .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, .send_IPI = noop_send_IPI, .send_IPI_mask = noop_send_IPI_mask, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index e08fe2c8dd8c..2fda912219a6 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -267,7 +267,7 @@ static const struct apic apic_numachip1 __refconst = { .get_apic_id = numachip1_get_apic_id, .set_apic_id = numachip1_set_apic_id, - .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = default_cpu_mask_to_apicid, .send_IPI = numachip_send_IPI_one, .send_IPI_mask = numachip_send_IPI_mask, @@ -318,7 +318,7 @@ static const struct apic apic_numachip2 __refconst = { .get_apic_id = numachip2_get_apic_id, .set_apic_id = numachip2_set_apic_id, - .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = default_cpu_mask_to_apicid, .send_IPI = numachip_send_IPI_one, .send_IPI_mask = numachip_send_IPI_mask, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 56012010332c..e12fbcfc9571 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * APIC driver for "bigsmp" xAPIC machines with more than 8 virtual CPUs. * @@ -172,7 +173,7 @@ static struct apic apic_bigsmp __ro_after_init = { .get_apic_id = bigsmp_get_apic_id, .set_apic_id = NULL, - .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = default_cpu_mask_to_apicid, .send_IPI = default_send_IPI_single_phys, .send_IPI_mask = default_send_IPI_mask_sequence_phys, diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c index ae50d3454d78..56ccf9346b08 100644 --- a/arch/x86/kernel/apic/htirq.c +++ b/arch/x86/kernel/apic/htirq.c @@ -150,16 +150,27 @@ static const struct irq_domain_ops htirq_domain_ops = { .deactivate = htirq_domain_deactivate, }; -void arch_init_htirq_domain(struct irq_domain *parent) +void __init arch_init_htirq_domain(struct irq_domain *parent) { + struct fwnode_handle *fn; + if (disable_apic) return; - htirq_domain = irq_domain_add_tree(NULL, &htirq_domain_ops, NULL); + fn = irq_domain_alloc_named_fwnode("PCI-HT"); + if (!fn) + goto warn; + + htirq_domain = irq_domain_create_tree(fn, &htirq_domain_ops, NULL); + irq_domain_free_fwnode(fn); if (!htirq_domain) - pr_warn("failed to initialize irqdomain for HTIRQ.\n"); - else - htirq_domain->parent = parent; + goto warn; + + htirq_domain->parent = parent; + return; + +warn: + pr_warn("Failed to initialize irqdomain for HTIRQ.\n"); } int arch_setup_ht_irq(int idx, int pos, struct pci_dev *dev, diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index c73c9fb281e1..d1fc62a67320 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * HW NMI watchdog support * @@ -19,7 +20,7 @@ #include <linux/init.h> #include <linux/delay.h> -#ifdef CONFIG_HARDLOCKUP_DETECTOR +#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF u64 hw_nmi_get_sample_period(int watchdog_thresh) { return (u64)(cpu_khz) * 1000 * watchdog_thresh; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 347bb9f65737..3b89b27945ff 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ 
b/arch/x86/kernel/apic/io_apic.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Intel IO-APIC support for multi-Pentium hosts. * @@ -1200,28 +1201,6 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); static struct irq_chip ioapic_chip, ioapic_ir_chip; -#ifdef CONFIG_X86_32 -static inline int IO_APIC_irq_trigger(int irq) -{ - int apic, idx, pin; - - for_each_ioapic_pin(apic, pin) { - idx = find_irq_entry(apic, pin, mp_INT); - if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin, 0))) - return irq_trigger(idx); - } - /* - * nonexistent IRQs are edge default - */ - return 0; -} -#else -static inline int IO_APIC_irq_trigger(int irq) -{ - return 1; -} -#endif - static void __init setup_IO_APIC_irqs(void) { unsigned int ioapic, pin; @@ -1265,7 +1244,7 @@ static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) entry.vector, entry.irr, entry.delivery_status); if (ir_entry->format) printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n", - buf, (ir_entry->index << 15) | ir_entry->index, + buf, (ir_entry->index2 << 15) | ir_entry->index, ir_entry->zero); else printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n", @@ -2115,7 +2094,7 @@ static inline void __init check_timer(void) int idx; idx = find_irq_entry(apic1, pin1, mp_INT); if (idx != -1 && irq_trigger(idx)) - unmask_ioapic_irq(irq_get_chip_data(0)); + unmask_ioapic_irq(irq_get_irq_data(0)); } irq_domain_deactivate_irq(irq_data); irq_domain_activate_irq(irq_data); @@ -2223,6 +2202,8 @@ static int mp_irqdomain_create(int ioapic) struct ioapic *ip = &ioapics[ioapic]; struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg; struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic); + struct fwnode_handle *fn; + char *name = "IO-APIC"; if (cfg->type == IOAPIC_DOMAIN_INVALID) return 0; @@ -2233,9 +2214,25 @@ static int mp_irqdomain_create(int ioapic) parent = irq_remapping_get_ir_irq_domain(&info); if (!parent) parent = x86_vector_domain; + else + name = "IO-APIC-IR"; + + /* Handle device tree enumerated APICs proper */ + if (cfg->dev) { + fn = of_node_to_fwnode(cfg->dev); + } else { + fn = irq_domain_alloc_named_id_fwnode(name, ioapic); + if (!fn) + return -ENOMEM; + } + + ip->irqdomain = irq_domain_create_linear(fn, hwirqs, cfg->ops, + (void *)(long)ioapic); + + /* Release fw handle if it was allocated above */ + if (!cfg->dev) + irq_domain_free_fwnode(fn); - ip->irqdomain = irq_domain_add_linear(cfg->dev, hwirqs, cfg->ops, - (void *)(long)ioapic); if (!ip->irqdomain) return -ENOMEM; diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 3a205d4a12d0..82f9244fe61f 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/cpumask.h> #include <linux/interrupt.h> diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index c61aec7e65f4..9b18be764422 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -136,13 +136,20 @@ static struct msi_domain_info pci_msi_domain_info = { .handler_name = "edge", }; -void arch_init_msi_domain(struct irq_domain *parent) +void __init arch_init_msi_domain(struct irq_domain *parent) { + struct fwnode_handle *fn; + if (disable_apic) return; - msi_default_domain = pci_msi_create_irq_domain(NULL, - &pci_msi_domain_info, parent); + fn = irq_domain_alloc_named_fwnode("PCI-MSI"); + if (fn) { + msi_default_domain = + pci_msi_create_irq_domain(fn, &pci_msi_domain_info, + parent); + irq_domain_free_fwnode(fn); + } if (!msi_default_domain) pr_warn("failed to initialize irqdomain 
for MSI/MSI-x.\n"); } @@ -167,9 +174,18 @@ static struct msi_domain_info pci_msi_ir_domain_info = { .handler_name = "edge", }; -struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent) +struct irq_domain *arch_create_remap_msi_irq_domain(struct irq_domain *parent, + const char *name, int id) { - return pci_msi_create_irq_domain(NULL, &pci_msi_ir_domain_info, parent); + struct fwnode_handle *fn; + struct irq_domain *d; + + fn = irq_domain_alloc_named_id_fwnode(name, id); + if (!fn) + return NULL; + d = pci_msi_create_irq_domain(fn, &pci_msi_ir_domain_info, parent); + irq_domain_free_fwnode(fn); + return d; } #endif @@ -221,13 +237,20 @@ static struct irq_domain *dmar_get_irq_domain(void) { static struct irq_domain *dmar_domain; static DEFINE_MUTEX(dmar_lock); + struct fwnode_handle *fn; mutex_lock(&dmar_lock); - if (dmar_domain == NULL) - dmar_domain = msi_create_irq_domain(NULL, &dmar_msi_domain_info, + if (dmar_domain) + goto out; + + fn = irq_domain_alloc_named_fwnode("DMAR-MSI"); + if (fn) { + dmar_domain = msi_create_irq_domain(fn, &dmar_msi_domain_info, x86_vector_domain); + irq_domain_free_fwnode(fn); + } +out: mutex_unlock(&dmar_lock); - return dmar_domain; } @@ -317,9 +340,10 @@ static struct msi_domain_info hpet_msi_domain_info = { struct irq_domain *hpet_create_irq_domain(int hpet_id) { - struct irq_domain *parent; - struct irq_alloc_info info; struct msi_domain_info *domain_info; + struct irq_domain *parent, *d; + struct irq_alloc_info info; + struct fwnode_handle *fn; if (x86_vector_domain == NULL) return NULL; @@ -340,7 +364,16 @@ struct irq_domain *hpet_create_irq_domain(int hpet_id) else hpet_msi_controller.name = "IR-HPET-MSI"; - return msi_create_irq_domain(NULL, domain_info, parent); + fn = irq_domain_alloc_named_id_fwnode(hpet_msi_controller.name, + hpet_id); + if (!fn) { + kfree(domain_info); + return NULL; + } + + d = msi_create_irq_domain(fn, domain_info, parent); + irq_domain_free_fwnode(fn); + return d; } int hpet_assign_irq(struct irq_domain *domain, struct hpet_dev *dev, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 2e8f7f048f4f..63287659adb6 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -102,7 +102,7 @@ static struct apic apic_default __ro_after_init = { .get_apic_id = default_get_apic_id, .set_apic_id = NULL, - .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, .send_IPI = default_send_IPI_single, .send_IPI_mask = default_send_IPI_mask_logical, diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index f3557a1eb562..88c214e75a6b 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -103,7 +103,8 @@ static void free_apic_chip_data(struct apic_chip_data *data) } static int __assign_irq_vector(int irq, struct apic_chip_data *d, - const struct cpumask *mask) + const struct cpumask *mask, + struct irq_data *irqdata) { /* * NOTE! The local APIC isn't very good at handling @@ -141,7 +142,7 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d, /* * Clear the offline cpus from @vector_cpumask for searching * and verify whether the result overlaps with @mask. If true, - * then the call to apic->cpu_mask_to_apicid_and() will + * then the call to apic->cpu_mask_to_apicid() will * succeed as well. If not, no point in trying to find a * vector in this mask. 
*/ @@ -165,7 +166,7 @@ static int __assign_irq_vector(int irq, struct apic_chip_data *d, offset = current_offset; next: vector += 16; - if (vector >= first_system_vector) { + if (vector >= FIRST_SYSTEM_VECTOR) { offset = (offset + 1) % 16; vector = FIRST_EXTERNAL_VECTOR + offset; } @@ -221,34 +222,40 @@ success: * Cache destination APIC IDs into cfg->dest_apicid. This cannot fail * as we already established, that mask & d->domain & cpu_online_mask * is not empty. + * + * vector_searchmask is a subset of d->domain and has the offline + * cpus masked out. */ - BUG_ON(apic->cpu_mask_to_apicid_and(mask, d->domain, - &d->cfg.dest_apicid)); + cpumask_and(vector_searchmask, vector_searchmask, mask); + BUG_ON(apic->cpu_mask_to_apicid(vector_searchmask, irqdata, + &d->cfg.dest_apicid)); return 0; } static int assign_irq_vector(int irq, struct apic_chip_data *data, - const struct cpumask *mask) + const struct cpumask *mask, + struct irq_data *irqdata) { int err; unsigned long flags; raw_spin_lock_irqsave(&vector_lock, flags); - err = __assign_irq_vector(irq, data, mask); + err = __assign_irq_vector(irq, data, mask, irqdata); raw_spin_unlock_irqrestore(&vector_lock, flags); return err; } static int assign_irq_vector_policy(int irq, int node, struct apic_chip_data *data, - struct irq_alloc_info *info) + struct irq_alloc_info *info, + struct irq_data *irqdata) { if (info && info->mask) - return assign_irq_vector(irq, data, info->mask); + return assign_irq_vector(irq, data, info->mask, irqdata); if (node != NUMA_NO_NODE && - assign_irq_vector(irq, data, cpumask_of_node(node)) == 0) + assign_irq_vector(irq, data, cpumask_of_node(node), irqdata) == 0) return 0; - return assign_irq_vector(irq, data, apic->target_cpus()); + return assign_irq_vector(irq, data, apic->target_cpus(), irqdata); } static void clear_irq_vector(int irq, struct apic_chip_data *data) @@ -360,9 +367,17 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, irq_data->chip = &lapic_controller; irq_data->chip_data = data; irq_data->hwirq = virq + i; - err = assign_irq_vector_policy(virq + i, node, data, info); + err = assign_irq_vector_policy(virq + i, node, data, info, + irq_data); if (err) goto error; + /* + * If the apic destination mode is physical, then the + * effective affinity is restricted to a single target + * CPU. Mark the interrupt accordingly. 
+ */ + if (!apic->irq_dest_mode) + irqd_set_single_target(irq_data); } return 0; @@ -405,7 +420,7 @@ int __init arch_probe_nr_irqs(void) } #ifdef CONFIG_X86_IO_APIC -static void init_legacy_irqs(void) +static void __init init_legacy_irqs(void) { int i, node = cpu_to_node(0); struct apic_chip_data *data; @@ -424,16 +439,21 @@ static void init_legacy_irqs(void) } } #else -static void init_legacy_irqs(void) { } +static inline void init_legacy_irqs(void) { } #endif int __init arch_early_irq_init(void) { + struct fwnode_handle *fn; + init_legacy_irqs(); - x86_vector_domain = irq_domain_add_tree(NULL, &x86_vector_domain_ops, - NULL); + fn = irq_domain_alloc_named_fwnode("VECTOR"); + BUG_ON(!fn); + x86_vector_domain = irq_domain_create_tree(fn, &x86_vector_domain_ops, + NULL); BUG_ON(x86_vector_domain == NULL); + irq_domain_free_fwnode(fn); irq_set_default_host(x86_vector_domain); arch_init_msi_domain(x86_vector_domain); @@ -529,11 +549,12 @@ static int apic_set_affinity(struct irq_data *irq_data, if (!cpumask_intersects(dest, cpu_online_mask)) return -EINVAL; - err = assign_irq_vector(irq, data, dest); + err = assign_irq_vector(irq, data, dest, irq_data); return err ? err : IRQ_SET_MASK_OK; } static struct irq_chip lapic_controller = { + .name = "APIC", .irq_ack = apic_ack_edge, .irq_set_affinity = apic_set_affinity, .irq_retrigger = apic_retrigger_irq, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 5a35f208ed95..e216cf3d64d2 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -1,9 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/threads.h> #include <linux/cpumask.h> #include <linux/string.h> #include <linux/kernel.h> #include <linux/ctype.h> #include <linux/dmar.h> +#include <linux/irq.h> #include <linux/cpu.h> #include <asm/smp.h> @@ -104,35 +106,30 @@ static void x2apic_send_IPI_all(int vector) } static int -x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask, - unsigned int *apicid) +x2apic_cpu_mask_to_apicid(const struct cpumask *mask, struct irq_data *irqdata, + unsigned int *apicid) { + struct cpumask *effmsk = irq_data_get_effective_affinity_mask(irqdata); + unsigned int cpu; u32 dest = 0; u16 cluster; - int i; - - for_each_cpu_and(i, cpumask, andmask) { - if (!cpumask_test_cpu(i, cpu_online_mask)) - continue; - dest = per_cpu(x86_cpu_to_logical_apicid, i); - cluster = x2apic_cluster(i); - break; - } - if (!dest) + cpu = cpumask_first(mask); + if (cpu >= nr_cpu_ids) return -EINVAL; - for_each_cpu_and(i, cpumask, andmask) { - if (!cpumask_test_cpu(i, cpu_online_mask)) - continue; - if (cluster != x2apic_cluster(i)) + dest = per_cpu(x86_cpu_to_logical_apicid, cpu); + cluster = x2apic_cluster(cpu); + + cpumask_clear(effmsk); + for_each_cpu(cpu, mask) { + if (cluster != x2apic_cluster(cpu)) continue; - dest |= per_cpu(x86_cpu_to_logical_apicid, i); + dest |= per_cpu(x86_cpu_to_logical_apicid, cpu); + cpumask_set_cpu(cpu, effmsk); } *apicid = dest; - return 0; } @@ -256,7 +253,7 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .get_apic_id = x2apic_get_apic_id, .set_apic_id = x2apic_set_apic_id, - .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, .send_IPI = x2apic_send_IPI, .send_IPI_mask = x2apic_send_IPI_mask, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index ff111f05a314..b94d35320f85 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c 
+++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/threads.h> #include <linux/cpumask.h> #include <linux/string.h> @@ -127,7 +128,7 @@ static struct apic apic_x2apic_phys __ro_after_init = { .get_apic_id = x2apic_get_apic_id, .set_apic_id = x2apic_set_apic_id, - .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = default_cpu_mask_to_apicid, .send_IPI = x2apic_send_IPI, .send_IPI_mask = x2apic_send_IPI_mask, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index b487b3a01615..0d57bb9079c9 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -526,27 +526,15 @@ static void uv_init_apic_ldr(void) } static int -uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask, - unsigned int *apicid) +uv_cpu_mask_to_apicid(const struct cpumask *mask, struct irq_data *irqdata, + unsigned int *apicid) { - int unsigned cpu; + int ret = default_cpu_mask_to_apicid(mask, irqdata, apicid); - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. - */ - for_each_cpu_and(cpu, cpumask, andmask) { - if (cpumask_test_cpu(cpu, cpu_online_mask)) - break; - } - - if (likely(cpu < nr_cpu_ids)) { - *apicid = per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; - return 0; - } + if (!ret) + *apicid |= uv_apicid_hibits; - return -EINVAL; + return ret; } static unsigned int x2apic_get_apic_id(unsigned long x) @@ -614,7 +602,7 @@ static struct apic apic_x2apic_uv_x __ro_after_init = { .get_apic_id = x2apic_get_apic_id, .set_apic_id = set_apic_id, - .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, .send_IPI = uv_send_IPI_one, .send_IPI_mask = uv_send_IPI_mask, diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 446b0d3d4932..e4b0d92b3ae0 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -2043,7 +2043,7 @@ static int __init swab_apm_power_in_minutes(const struct dmi_system_id *d) return 0; } -static struct dmi_system_id __initdata apm_dmi_table[] = { +static const struct dmi_system_id apm_dmi_table[] __initconst = { { print_if_true, KERN_WARNING "IBM T23 - BIOS 1.03b+ and controller firmware 1.02+ may be needed for Linux APM.", diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index de827d6ac8c2..8ea78275480d 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Generate definitions needed by assembly language modules. 
* This code generates raw asm output which is post-processed to extract diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 880aa093268d..dedf428b20b6 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -1,12 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 #ifndef __LINUX_KBUILD_H # error "Please do not build this file directly, build asm-offsets.c instead" #endif #include <asm/ucontext.h> -#include <linux/lguest.h> -#include "../../../drivers/lguest/lg.h" - #define __SYSCALL_I386(nr, sym, qual) [nr] = 1, static char syscalls[] = { #include <asm/syscalls_32.h> @@ -62,23 +60,6 @@ void foo(void) OFFSET(stack_canary_offset, stack_canary, canary); #endif -#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) - BLANK(); - OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); - OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending); - - BLANK(); - OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); - OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc); - OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3); - OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp); - OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc); - OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc); - OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt); - OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum); - OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); - OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); -#endif BLANK(); DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); DEFINE(NR_syscalls, sizeof(syscalls)); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 99332f550c48..630212fa9b9d 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #ifndef __LINUX_KBUILD_H # error "Please do not build this file directly, build asm-offsets.c instead" #endif @@ -20,7 +21,6 @@ static char syscalls_ia32[] = { int main(void) { #ifdef CONFIG_PARAVIRT - OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame); OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); BLANK(); diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c index f3672508b249..e1efe44ebefc 100644 --- a/arch/x86/kernel/audit_64.c +++ b/arch/x86/kernel/audit_64.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/init.h> #include <linux/types.h> #include <linux/audit.h> diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c index 52c8e3c7789d..3fed7ae58b60 100644 --- a/arch/x86/kernel/bootflag.c +++ b/arch/x86/kernel/bootflag.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Implement 'Simple Boot Flag Specification 2.0' */ diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c index 145863d4d343..33399426793e 100644 --- a/arch/x86/kernel/check.c +++ b/arch/x86/kernel/check.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/init.h> #include <linux/sched.h> #include <linux/kthread.h> diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 52000010c62e..236999c54edc 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for x86-compatible CPU details, features and quirks # 
@@ -21,6 +22,7 @@ obj-y += common.o obj-y += rdrand.o obj-y += match.o obj-y += bugs.o +obj-y += aperfmperf.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o @@ -32,7 +34,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -obj-$(CONFIG_INTEL_RDT_A) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o +obj-$(CONFIG_INTEL_RDT) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_monitor.o intel_rdt_ctrlmondata.o obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index bb5abe8f5fd4..d58184b7cd44 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -134,6 +134,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c) n = K6_BUG_LOOP; f_vide = vide; + OPTIMIZER_HIDE_VAR(f_vide); d = rdtsc(); while (n--) f_vide(); @@ -296,13 +297,29 @@ static int nearby_node(int apicid) } #endif +#ifdef CONFIG_SMP +/* + * Fix up cpu_core_id for pre-F17h systems to be in the + * [0 .. cores_per_node - 1] range. Not really needed but + * kept so as not to break existing setups. + */ +static void legacy_fixup_core_id(struct cpuinfo_x86 *c) +{ + u32 cus_per_node; + + if (c->x86 >= 0x17) + return; + + cus_per_node = c->x86_max_cores / nodes_per_socket; + c->cpu_core_id %= cus_per_node; +} + /* * Fixup core topology information for * (1) AMD multi-node processors * Assumption: Number of cores in each internal node is the same. * (2) AMD processors supporting compute units */ -#ifdef CONFIG_SMP static void amd_get_topology(struct cpuinfo_x86 *c) { u8 node_id; @@ -353,15 +370,9 @@ static void amd_get_topology(struct cpuinfo_x86 *c) } else return; - /* fixup multi-node processor information */ if (nodes_per_socket > 1) { - u32 cus_per_node; - set_cpu_cap(c, X86_FEATURE_AMD_DCM); - cus_per_node = c->x86_max_cores / nodes_per_socket; - - /* core id has to be in the [0 .. cores_per_node - 1] range */ - c->cpu_core_id %= cus_per_node; + legacy_fixup_core_id(c); } } #endif @@ -547,8 +558,12 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) static void early_init_amd(struct cpuinfo_x86 *c) { + u32 dummy; + early_init_amd_mc(c); + rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); + /* * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate * with P/T states and does not stop in deep C-states @@ -611,6 +626,27 @@ static void early_init_amd(struct cpuinfo_x86 *c) */ if (cpu_has_amd_erratum(c, amd_erratum_400)) set_cpu_bug(c, X86_BUG_AMD_E400); + + /* + * BIOS support is required for SME. If BIOS has enabled SME then + * adjust x86_phys_bits by the SME physical address space reduction + * value. If BIOS has not enabled SME then don't advertise the + * feature (set in scattered.c). Also, since the SME support requires + * long mode, don't advertise the feature under CONFIG_X86_32. 
+ */ + if (cpu_has(c, X86_FEATURE_SME)) { + u64 msr; + + /* Check if SME is enabled */ + rdmsrl(MSR_K8_SYSCFG, msr); + if (msr & MSR_K8_SYSCFG_MEM_ENCRYPT) { + c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f; + if (IS_ENABLED(CONFIG_X86_32)) + clear_cpu_cap(c, X86_FEATURE_SME); + } else { + clear_cpu_cap(c, X86_FEATURE_SME); + } + } } static void init_amd_k8(struct cpuinfo_x86 *c) @@ -727,10 +763,18 @@ static void init_amd_bd(struct cpuinfo_x86 *c) } } -static void init_amd(struct cpuinfo_x86 *c) +static void init_amd_zn(struct cpuinfo_x86 *c) { - u32 dummy; + /* + * Fix erratum 1076: CPB feature bit not being set in CPUID. It affects + * all up to and including B1. + */ + if (c->x86_model <= 1 && c->x86_mask <= 1) + set_cpu_cap(c, X86_FEATURE_CPB); +} +static void init_amd(struct cpuinfo_x86 *c) +{ early_init_amd(c); /* @@ -757,6 +801,7 @@ static void init_amd(struct cpuinfo_x86 *c) case 0x10: init_amd_gh(c); break; case 0x12: init_amd_ln(c); break; case 0x15: init_amd_bd(c); break; + case 0x17: init_amd_zn(c); break; } /* Enable workaround for FXSAVE leak */ @@ -792,8 +837,6 @@ static void init_amd(struct cpuinfo_x86 *c) if (c->x86 > 0x11) set_cpu_cap(c, X86_FEATURE_ARAT); - rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); - /* 3DNow or LM implies PREFETCHW */ if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH)) if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM)) diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c new file mode 100644 index 000000000000..957813e0180d --- /dev/null +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -0,0 +1,97 @@ +/* + * x86 APERF/MPERF KHz calculation for + * /sys/.../cpufreq/scaling_cur_freq + * + * Copyright (C) 2017 Intel Corp. + * Author: Len Brown <len.brown@intel.com> + * + * This file is licensed under GPLv2. + */ + +#include <linux/delay.h> +#include <linux/ktime.h> +#include <linux/math64.h> +#include <linux/percpu.h> +#include <linux/smp.h> + +struct aperfmperf_sample { + unsigned int khz; + ktime_t time; + u64 aperf; + u64 mperf; +}; + +static DEFINE_PER_CPU(struct aperfmperf_sample, samples); + +#define APERFMPERF_CACHE_THRESHOLD_MS 10 +#define APERFMPERF_REFRESH_DELAY_MS 20 +#define APERFMPERF_STALE_THRESHOLD_MS 1000 + +/* + * aperfmperf_snapshot_khz() + * On the current CPU, snapshot APERF, MPERF, and jiffies + * unless we already did it within 10ms + * calculate kHz, save snapshot + */ +static void aperfmperf_snapshot_khz(void *dummy) +{ + u64 aperf, aperf_delta; + u64 mperf, mperf_delta; + struct aperfmperf_sample *s = this_cpu_ptr(&samples); + ktime_t now = ktime_get(); + s64 time_delta = ktime_ms_delta(now, s->time); + unsigned long flags; + + local_irq_save(flags); + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + local_irq_restore(flags); + + aperf_delta = aperf - s->aperf; + mperf_delta = mperf - s->mperf; + + /* + * There is no architectural guarantee that MPERF + * increments faster than we can read it. + */ + if (mperf_delta == 0) + return; + + s->time = now; + s->aperf = aperf; + s->mperf = mperf; + + /* If the previous iteration was too long ago, discard it. */ + if (time_delta > APERFMPERF_STALE_THRESHOLD_MS) + s->khz = 0; + else + s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta); +} + +unsigned int arch_freq_get_on_cpu(int cpu) +{ + s64 time_delta; + unsigned int khz; + + if (!cpu_khz) + return 0; + + if (!static_cpu_has(X86_FEATURE_APERFMPERF)) + return 0; + + /* Don't bother re-computing within the cache threshold time. 
*/ + time_delta = ktime_ms_delta(ktime_get(), per_cpu(samples.time, cpu)); + khz = per_cpu(samples.khz, cpu); + if (khz && time_delta < APERFMPERF_CACHE_THRESHOLD_MS) + return khz; + + smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1); + khz = per_cpu(samples.khz, cpu); + if (khz) + return khz; + + msleep(APERFMPERF_REFRESH_DELAY_MS); + smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1); + + return per_cpu(samples.khz, cpu); +} diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 0af86d9242da..ba0b2424c9b0 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1994 Linus Torvalds * diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 44207b71fee1..68bc6d9b3132 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/sched.h> #include <linux/sched/clock.h> diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c8b39870f33e..c9176bae7fd8 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -168,6 +168,24 @@ static int __init x86_mpx_setup(char *s) } __setup("nompx", x86_mpx_setup); +#ifdef CONFIG_X86_64 +static int __init x86_nopcid_setup(char *s) +{ + /* nopcid doesn't accept parameters */ + if (s) + return -EINVAL; + + /* do not emit a message if the feature is not present */ + if (!boot_cpu_has(X86_FEATURE_PCID)) + return 0; + + setup_clear_cpu_cap(X86_FEATURE_PCID); + pr_info("nopcid: PCID feature disabled\n"); + return 0; +} +early_param("nopcid", x86_nopcid_setup); +#endif + static int __init x86_noinvpcid_setup(char *s) { /* noinvpcid doesn't accept parameters */ @@ -886,6 +904,14 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_force_cpu_cap(X86_FEATURE_ALWAYS); fpu__init_system(c); + +#ifdef CONFIG_X86_32 + /* + * Regardless of whether PCID is enumerated, the SDM says + * that it can't be enabled in 32-bit mode. 
+ */ + setup_clear_cpu_cap(X86_FEATURE_PCID); +#endif } void __init early_cpu_init(void) @@ -1289,15 +1315,6 @@ static __init int setup_disablecpuid(char *arg) __setup("clearcpuid=", setup_disablecpuid); #ifdef CONFIG_X86_64 -struct desc_ptr idt_descr __ro_after_init = { - .size = NR_VECTORS * 16 - 1, - .address = (unsigned long) idt_table, -}; -const struct desc_ptr debug_idt_descr = { - .size = NR_VECTORS * 16 - 1, - .address = (unsigned long) debug_idt_table, -}; - DEFINE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __aligned(PAGE_SIZE) __visible; @@ -1552,6 +1569,7 @@ void cpu_init(void) mmgrab(&init_mm); me->active_mm = &init_mm; BUG_ON(me->mm); + initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, me); load_sp0(t, &current->thread); @@ -1606,6 +1624,7 @@ void cpu_init(void) mmgrab(&init_mm); curr->active_mm = &init_mm; BUG_ON(curr->mm); + initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, curr); load_sp0(t, thread); diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 2584265d4745..f52a370b6c00 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef ARCH_X86_CPU_H #define ARCH_X86_CPU_H diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 6f077445647a..6b4bb335641f 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/bitops.h> #include <linux/delay.h> #include <linux/pci.h> diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index dfa90a3a5145..b720dacac051 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/string.h> diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index c55fb2cb2acc..54d04d574148 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Routines to identify caches on Intel CPU. * @@ -811,10 +812,26 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, struct cacheinfo *this_leaf; int i, sibling; - if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + /* + * For L3, always use the pre-calculated cpu_llc_shared_mask + * to derive shared_cpu_map.
+ */ + if (index == 3) { + for_each_cpu(i, cpu_llc_shared_mask(cpu)) { + this_cpu_ci = get_cpu_cacheinfo(i); + if (!this_cpu_ci->info_list) + continue; + this_leaf = this_cpu_ci->info_list + index; + for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { + if (!cpu_online(sibling)) + continue; + cpumask_set_cpu(sibling, + &this_leaf->shared_cpu_map); + } + } + } else if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { unsigned int apicid, nshared, first, last; - this_leaf = this_cpu_ci->info_list + index; nshared = base->eax.split.num_threads_sharing + 1; apicid = cpu_data(cpu).apicid; first = apicid - (apicid % nshared); @@ -839,19 +856,6 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, &this_leaf->shared_cpu_map); } } - } else if (index == 3) { - for_each_cpu(i, cpu_llc_shared_mask(cpu)) { - this_cpu_ci = get_cpu_cacheinfo(i); - if (!this_cpu_ci->info_list) - continue; - this_leaf = this_cpu_ci->info_list + index; - for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { - if (!cpu_online(sibling)) - continue; - cpumask_set_cpu(sibling, - &this_leaf->shared_cpu_map); - } - } } else return 0; diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 5b366462f579..cd5fc61ba450 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -30,7 +30,8 @@ #include <linux/cpuhotplug.h> #include <asm/intel-family.h> -#include <asm/intel_rdt.h> +#include <asm/intel_rdt_sched.h> +#include "intel_rdt.h" #define MAX_MBA_BW 100u #define MBA_IS_LINEAR 0x4 @@ -38,7 +39,13 @@ /* Mutex to protect rdtgroup access. */ DEFINE_MUTEX(rdtgroup_mutex); -DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid); +/* + * The cached intel_pqr_state is strictly per CPU and can never be + * updated from a remote CPU. Functions which modify the state + * are called with interrupts disabled and no preemption, which + * is sufficient for the protection. + */ +DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); /* * Used to store the max resource name width and max resource data width @@ -46,6 +53,12 @@ DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid); */ int max_name_width, max_data_width; +/* + * Global boolean for rdt_alloc which is true if any + * resource allocation is enabled. 
+ */ +bool rdt_alloc_capable; + static void mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); static void @@ -54,7 +67,9 @@ cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); #define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains) struct rdt_resource rdt_resources_all[] = { + [RDT_RESOURCE_L3] = { + .rid = RDT_RESOURCE_L3, .name = "L3", .domains = domain_init(RDT_RESOURCE_L3), .msr_base = IA32_L3_CBM_BASE, @@ -67,8 +82,11 @@ struct rdt_resource rdt_resources_all[] = { }, .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, + [RDT_RESOURCE_L3DATA] = { + .rid = RDT_RESOURCE_L3DATA, .name = "L3DATA", .domains = domain_init(RDT_RESOURCE_L3DATA), .msr_base = IA32_L3_CBM_BASE, @@ -81,8 +99,11 @@ struct rdt_resource rdt_resources_all[] = { }, .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, + [RDT_RESOURCE_L3CODE] = { + .rid = RDT_RESOURCE_L3CODE, .name = "L3CODE", .domains = domain_init(RDT_RESOURCE_L3CODE), .msr_base = IA32_L3_CBM_BASE, @@ -95,8 +116,11 @@ struct rdt_resource rdt_resources_all[] = { }, .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, + [RDT_RESOURCE_L2] = { + .rid = RDT_RESOURCE_L2, .name = "L2", .domains = domain_init(RDT_RESOURCE_L2), .msr_base = IA32_L2_CBM_BASE, @@ -109,8 +133,11 @@ struct rdt_resource rdt_resources_all[] = { }, .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, + [RDT_RESOURCE_MBA] = { + .rid = RDT_RESOURCE_MBA, .name = "MB", .domains = domain_init(RDT_RESOURCE_MBA), .msr_base = IA32_MBA_THRTL_BASE, @@ -118,6 +145,7 @@ struct rdt_resource rdt_resources_all[] = { .cache_level = 3, .parse_ctrlval = parse_bw, .format_str = "%d=%*d", + .fflags = RFTYPE_RES_MB, }, }; @@ -144,33 +172,28 @@ static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid) * is always 20 on hsw server parts. The minimum cache bitmask length * allowed for HSW server is always 2 bits. Hardcode all of them. 
*/ -static inline bool cache_alloc_hsw_probe(void) +static inline void cache_alloc_hsw_probe(void) { - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - boot_cpu_data.x86 == 6 && - boot_cpu_data.x86_model == INTEL_FAM6_HASWELL_X) { - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; - u32 l, h, max_cbm = BIT_MASK(20) - 1; - - if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0)) - return false; - rdmsr(IA32_L3_CBM_BASE, l, h); + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; + u32 l, h, max_cbm = BIT_MASK(20) - 1; - /* If all the bits were set in MSR, return success */ - if (l != max_cbm) - return false; + if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0)) + return; + rdmsr(IA32_L3_CBM_BASE, l, h); - r->num_closid = 4; - r->default_ctrl = max_cbm; - r->cache.cbm_len = 20; - r->cache.min_cbm_bits = 2; - r->capable = true; - r->enabled = true; + /* If all the bits were set in MSR, return success */ + if (l != max_cbm) + return; - return true; - } + r->num_closid = 4; + r->default_ctrl = max_cbm; + r->cache.cbm_len = 20; + r->cache.shareable_bits = 0xc0000; + r->cache.min_cbm_bits = 2; + r->alloc_capable = true; + r->alloc_enabled = true; - return false; + rdt_alloc_capable = true; } /* @@ -213,15 +236,14 @@ static bool rdt_get_mem_config(struct rdt_resource *r) return false; } r->data_width = 3; - rdt_get_mba_infofile(r); - r->capable = true; - r->enabled = true; + r->alloc_capable = true; + r->alloc_enabled = true; return true; } -static void rdt_get_cache_config(int idx, struct rdt_resource *r) +static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r) { union cpuid_0x10_1_eax eax; union cpuid_0x10_x_edx edx; @@ -231,10 +253,10 @@ static void rdt_get_cache_config(int idx, struct rdt_resource *r) r->num_closid = edx.split.cos_max + 1; r->cache.cbm_len = eax.split.cbm_len + 1; r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1; + r->cache.shareable_bits = ebx & r->default_ctrl; r->data_width = (r->cache.cbm_len + 3) / 4; - rdt_get_cache_infofile(r); - r->capable = true; - r->enabled = true; + r->alloc_capable = true; + r->alloc_enabled = true; } static void rdt_get_cdp_l3_config(int type) @@ -246,12 +268,12 @@ static void rdt_get_cdp_l3_config(int type) r->cache.cbm_len = r_l3->cache.cbm_len; r->default_ctrl = r_l3->default_ctrl; r->data_width = (r->cache.cbm_len + 3) / 4; - r->capable = true; + r->alloc_capable = true; /* * By default, CDP is disabled. CDP can be enabled by mount parameter * "cdp" during resctrl file system mount time. 
*/ - r->enabled = false; + r->alloc_enabled = false; } static int get_cache_id(int cpu, int level) @@ -300,6 +322,19 @@ cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) wrmsrl(r->msr_base + cbm_idx(r, i), d->ctrl_val[i]); } +struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r) +{ + struct rdt_domain *d; + + list_for_each_entry(d, &r->domains, list) { + /* Find the domain that contains this CPU */ + if (cpumask_test_cpu(cpu, &d->cpu_mask)) + return d; + } + + return NULL; +} + void rdt_ctrl_update(void *arg) { struct msr_param *m = arg; @@ -307,12 +342,10 @@ void rdt_ctrl_update(void *arg) int cpu = smp_processor_id(); struct rdt_domain *d; - list_for_each_entry(d, &r->domains, list) { - /* Find the domain that contains this CPU */ - if (cpumask_test_cpu(cpu, &d->cpu_mask)) { - r->msr_update(d, m, r); - return; - } + d = get_domain_from_cpu(cpu, r); + if (d) { + r->msr_update(d, m, r); + return; } pr_warn_once("cpu %d not found in any domain for resource %s\n", cpu, r->name); @@ -326,8 +359,8 @@ void rdt_ctrl_update(void *arg) * caller, return the first domain whose id is bigger than the input id. * The domain list is sorted by id in ascending order. */ -static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, - struct list_head **pos) +struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, + struct list_head **pos) { struct rdt_domain *d; struct list_head *l; @@ -377,6 +410,44 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) return 0; } +static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) +{ + size_t tsize; + + if (is_llc_occupancy_enabled()) { + d->rmid_busy_llc = kcalloc(BITS_TO_LONGS(r->num_rmid), + sizeof(unsigned long), + GFP_KERNEL); + if (!d->rmid_busy_llc) + return -ENOMEM; + INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); + } + if (is_mbm_total_enabled()) { + tsize = sizeof(*d->mbm_total); + d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + if (!d->mbm_total) { + kfree(d->rmid_busy_llc); + return -ENOMEM; + } + } + if (is_mbm_local_enabled()) { + tsize = sizeof(*d->mbm_local); + d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + if (!d->mbm_local) { + kfree(d->rmid_busy_llc); + kfree(d->mbm_total); + return -ENOMEM; + } + } + + if (is_mbm_enabled()) { + INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); + mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL); + } + + return 0; +} + /* * domain_add_cpu - Add a cpu to a resource's domain list. * @@ -412,14 +483,26 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) return; d->id = id; + cpumask_set_cpu(cpu, &d->cpu_mask); - if (domain_setup_ctrlval(r, d)) { + if (r->alloc_capable && domain_setup_ctrlval(r, d)) { + kfree(d); + return; + } + + if (r->mon_capable && domain_setup_mon_state(r, d)) { kfree(d); return; } - cpumask_set_cpu(cpu, &d->cpu_mask); list_add_tail(&d->list, add_pos); + + /* + * If resctrl is mounted, add + * per domain monitor data directories. + */ + if (static_branch_unlikely(&rdt_mon_enable_key)) + mkdir_mondata_subdir_allrdtgrp(r, d); } static void domain_remove_cpu(int cpu, struct rdt_resource *r) @@ -435,19 +518,58 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) cpumask_clear_cpu(cpu, &d->cpu_mask); if (cpumask_empty(&d->cpu_mask)) { + /* + * If resctrl is mounted, remove all the + * per domain monitor data directories. 
+ */ + if (static_branch_unlikely(&rdt_mon_enable_key)) + rmdir_mondata_subdir_allrdtgrp(r, d->id); kfree(d->ctrl_val); + kfree(d->rmid_busy_llc); + kfree(d->mbm_total); + kfree(d->mbm_local); list_del(&d->list); + if (is_mbm_enabled()) + cancel_delayed_work(&d->mbm_over); + if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) { + /* + * When a package is going down, forcefully + * decrement rmid->ebusy. There is no way to know + * that the L3 was flushed and hence may lead to + * incorrect counts in rare scenarios, but leaving + * the RMID as busy creates RMID leaks if the + * package never comes back. + */ + __check_limbo(d, true); + cancel_delayed_work(&d->cqm_limbo); + } + kfree(d); + return; + } + + if (r == &rdt_resources_all[RDT_RESOURCE_L3]) { + if (is_mbm_enabled() && cpu == d->mbm_work_cpu) { + cancel_delayed_work(&d->mbm_over); + mbm_setup_overflow_handler(d, 0); + } + if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu && + has_busy_rmid(r, d)) { + cancel_delayed_work(&d->cqm_limbo); + cqm_setup_limbo_handler(d, 0); + } } } -static void clear_closid(int cpu) +static void clear_closid_rmid(int cpu) { struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - per_cpu(cpu_closid, cpu) = 0; - state->closid = 0; - wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, 0); + state->default_closid = 0; + state->default_rmid = 0; + state->cur_closid = 0; + state->cur_rmid = 0; + wrmsr(IA32_PQR_ASSOC, 0, 0); } static int intel_rdt_online_cpu(unsigned int cpu) @@ -459,12 +581,23 @@ static int intel_rdt_online_cpu(unsigned int cpu) domain_add_cpu(cpu, r); /* The cpu is set in default rdtgroup after online. */ cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); - clear_closid(cpu); + clear_closid_rmid(cpu); mutex_unlock(&rdtgroup_mutex); return 0; } +static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) +{ + struct rdtgroup *cr; + + list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { + if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) { + break; + } + } +} + static int intel_rdt_offline_cpu(unsigned int cpu) { struct rdtgroup *rdtgrp; @@ -474,10 +607,12 @@ static int intel_rdt_offline_cpu(unsigned int cpu) for_each_capable_rdt_resource(r) domain_remove_cpu(cpu, r); list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { - if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) + if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { + clear_childcpus(rdtgrp, cpu); break; + } } - clear_closid(cpu); + clear_closid_rmid(cpu); mutex_unlock(&rdtgroup_mutex); return 0; @@ -492,7 +627,7 @@ static __init void rdt_init_padding(void) struct rdt_resource *r; int cl; - for_each_capable_rdt_resource(r) { + for_each_alloc_capable_rdt_resource(r) { cl = strlen(r->name); if (cl > max_name_width) max_name_width = cl; @@ -502,38 +637,153 @@ static __init void rdt_init_padding(void) } } -static __init bool get_rdt_resources(void) +enum { + RDT_FLAG_CMT, + RDT_FLAG_MBM_TOTAL, + RDT_FLAG_MBM_LOCAL, + RDT_FLAG_L3_CAT, + RDT_FLAG_L3_CDP, + RDT_FLAG_L2_CAT, + RDT_FLAG_MBA, +}; + +#define RDT_OPT(idx, n, f) \ +[idx] = { \ + .name = n, \ + .flag = f \ +} + +struct rdt_options { + char *name; + int flag; + bool force_off, force_on; +}; + +static struct rdt_options rdt_options[] __initdata = { + RDT_OPT(RDT_FLAG_CMT, "cmt", X86_FEATURE_CQM_OCCUP_LLC), + RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL), + RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL), + RDT_OPT(RDT_FLAG_L3_CAT, "l3cat", X86_FEATURE_CAT_L3), + RDT_OPT(RDT_FLAG_L3_CDP, "l3cdp", 
X86_FEATURE_CDP_L3), + RDT_OPT(RDT_FLAG_L2_CAT, "l2cat", X86_FEATURE_CAT_L2), + RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA), +}; +#define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options) + +static int __init set_rdt_options(char *str) +{ + struct rdt_options *o; + bool force_off; + char *tok; + + if (*str == '=') + str++; + while ((tok = strsep(&str, ",")) != NULL) { + force_off = *tok == '!'; + if (force_off) + tok++; + for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) { + if (strcmp(tok, o->name) == 0) { + if (force_off) + o->force_off = true; + else + o->force_on = true; + break; + } + } + } + return 1; +} +__setup("rdt", set_rdt_options); + +static bool __init rdt_cpu_has(int flag) +{ + bool ret = boot_cpu_has(flag); + struct rdt_options *o; + + if (!ret) + return ret; + + for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) { + if (flag == o->flag) { + if (o->force_off) + ret = false; + if (o->force_on) + ret = true; + break; + } + } + return ret; +} + +static __init bool get_rdt_alloc_resources(void) { bool ret = false; - if (cache_alloc_hsw_probe()) + if (rdt_alloc_capable) return true; if (!boot_cpu_has(X86_FEATURE_RDT_A)) return false; - if (boot_cpu_has(X86_FEATURE_CAT_L3)) { - rdt_get_cache_config(1, &rdt_resources_all[RDT_RESOURCE_L3]); - if (boot_cpu_has(X86_FEATURE_CDP_L3)) { + if (rdt_cpu_has(X86_FEATURE_CAT_L3)) { + rdt_get_cache_alloc_cfg(1, &rdt_resources_all[RDT_RESOURCE_L3]); + if (rdt_cpu_has(X86_FEATURE_CDP_L3)) { rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA); rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE); } ret = true; } - if (boot_cpu_has(X86_FEATURE_CAT_L2)) { + if (rdt_cpu_has(X86_FEATURE_CAT_L2)) { /* CPUID 0x10.2 fields are same format at 0x10.1 */ - rdt_get_cache_config(2, &rdt_resources_all[RDT_RESOURCE_L2]); + rdt_get_cache_alloc_cfg(2, &rdt_resources_all[RDT_RESOURCE_L2]); ret = true; } - if (boot_cpu_has(X86_FEATURE_MBA)) { + if (rdt_cpu_has(X86_FEATURE_MBA)) { if (rdt_get_mem_config(&rdt_resources_all[RDT_RESOURCE_MBA])) ret = true; } - return ret; } +static __init bool get_rdt_mon_resources(void) +{ + if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) + rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID); + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) + rdt_mon_features |= (1 << QOS_L3_MBM_TOTAL_EVENT_ID); + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) + rdt_mon_features |= (1 << QOS_L3_MBM_LOCAL_EVENT_ID); + + if (!rdt_mon_features) + return false; + + return !rdt_get_mon_l3_config(&rdt_resources_all[RDT_RESOURCE_L3]); +} + +static __init void rdt_quirks(void) +{ + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_HASWELL_X: + if (!rdt_options[RDT_FLAG_L3_CAT].force_off) + cache_alloc_hsw_probe(); + break; + case INTEL_FAM6_SKYLAKE_X: + if (boot_cpu_data.x86_mask <= 4) + set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat"); + } +} + +static __init bool get_rdt_resources(void) +{ + rdt_quirks(); + rdt_alloc_capable = get_rdt_alloc_resources(); + rdt_mon_capable = get_rdt_mon_resources(); + + return (rdt_mon_capable || rdt_alloc_capable); +} + static int __init intel_rdt_late_init(void) { struct rdt_resource *r; @@ -556,9 +806,12 @@ static int __init intel_rdt_late_init(void) return ret; } - for_each_capable_rdt_resource(r) + for_each_alloc_capable_rdt_resource(r) pr_info("Intel RDT %s allocation detected\n", r->name); + for_each_mon_capable_rdt_resource(r) + pr_info("Intel RDT %s monitoring detected\n", r->name); + return 0; } diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h new file mode 100644 index 000000000000..a43a72d8e88e 
--- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -0,0 +1,441 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_INTEL_RDT_H +#define _ASM_X86_INTEL_RDT_H + +#include <linux/sched.h> +#include <linux/kernfs.h> +#include <linux/jump_label.h> + +#define IA32_L3_QOS_CFG 0xc81 +#define IA32_L3_CBM_BASE 0xc90 +#define IA32_L2_CBM_BASE 0xd10 +#define IA32_MBA_THRTL_BASE 0xd50 + +#define L3_QOS_CDP_ENABLE 0x01ULL + +/* + * Event IDs are used to program IA32_QM_EVTSEL before reading event + * counter from IA32_QM_CTR + */ +#define QOS_L3_OCCUP_EVENT_ID 0x01 +#define QOS_L3_MBM_TOTAL_EVENT_ID 0x02 +#define QOS_L3_MBM_LOCAL_EVENT_ID 0x03 + +#define CQM_LIMBOCHECK_INTERVAL 1000 + +#define MBM_CNTR_WIDTH 24 +#define MBM_OVERFLOW_INTERVAL 1000 + +#define RMID_VAL_ERROR BIT_ULL(63) +#define RMID_VAL_UNAVAIL BIT_ULL(62) + +DECLARE_STATIC_KEY_FALSE(rdt_enable_key); + +/** + * struct mon_evt - Entry in the event list of a resource + * @evtid: event id + * @name: name of the event + */ +struct mon_evt { + u32 evtid; + char *name; + struct list_head list; +}; + +/** + * struct mon_data_bits - Monitoring details for each event file + * @rid: Resource id associated with the event file. + * @evtid: Event id associated with the event file + * @domid: The domain to which the event file belongs + */ +union mon_data_bits { + void *priv; + struct { + unsigned int rid : 10; + unsigned int evtid : 8; + unsigned int domid : 14; + } u; +}; + +struct rmid_read { + struct rdtgroup *rgrp; + struct rdt_domain *d; + int evtid; + bool first; + u64 val; +}; + +extern unsigned int intel_cqm_threshold; +extern bool rdt_alloc_capable; +extern bool rdt_mon_capable; +extern unsigned int rdt_mon_features; + +enum rdt_group_type { + RDTCTRL_GROUP = 0, + RDTMON_GROUP, + RDT_NUM_GROUP, +}; + +/** + * struct mongroup - store mon group's data in resctrl fs. + * @mon_data_kn kernlfs node for the mon_data directory + * @parent: parent rdtgrp + * @crdtgrp_list: child rdtgroup node list + * @rmid: rmid for this rdtgroup + */ +struct mongroup { + struct kernfs_node *mon_data_kn; + struct rdtgroup *parent; + struct list_head crdtgrp_list; + u32 rmid; +}; + +/** + * struct rdtgroup - store rdtgroup's data in resctrl file system. + * @kn: kernfs node + * @rdtgroup_list: linked list for all rdtgroups + * @closid: closid for this rdtgroup + * @cpu_mask: CPUs assigned to this rdtgroup + * @flags: status bits + * @waitcount: how many cpus expect to find this + * group when they acquire rdtgroup_mutex + * @type: indicates type of this rdtgroup - either + * monitor only or ctrl_mon group + * @mon: mongroup related data + */ +struct rdtgroup { + struct kernfs_node *kn; + struct list_head rdtgroup_list; + u32 closid; + struct cpumask cpu_mask; + int flags; + atomic_t waitcount; + enum rdt_group_type type; + struct mongroup mon; +}; + +/* rdtgroup.flags */ +#define RDT_DELETED 1 + +/* rftype.flags */ +#define RFTYPE_FLAGS_CPUS_LIST 1 + +/* + * Define the file type flags for base and info directories. 
+ */ +#define RFTYPE_INFO BIT(0) +#define RFTYPE_BASE BIT(1) +#define RF_CTRLSHIFT 4 +#define RF_MONSHIFT 5 +#define RFTYPE_CTRL BIT(RF_CTRLSHIFT) +#define RFTYPE_MON BIT(RF_MONSHIFT) +#define RFTYPE_RES_CACHE BIT(8) +#define RFTYPE_RES_MB BIT(9) +#define RF_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) +#define RF_MON_INFO (RFTYPE_INFO | RFTYPE_MON) +#define RF_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) + +/* List of all resource groups */ +extern struct list_head rdt_all_groups; + +extern int max_name_width, max_data_width; + +int __init rdtgroup_init(void); + +/** + * struct rftype - describe each file in the resctrl file system + * @name: File name + * @mode: Access mode + * @kf_ops: File operations + * @flags: File specific RFTYPE_FLAGS_* flags + * @fflags: File specific RF_* or RFTYPE_* flags + * @seq_show: Show content of the file + * @write: Write to the file + */ +struct rftype { + char *name; + umode_t mode; + struct kernfs_ops *kf_ops; + unsigned long flags; + unsigned long fflags; + + int (*seq_show)(struct kernfs_open_file *of, + struct seq_file *sf, void *v); + /* + * write() is the generic write callback which maps directly to + * kernfs write operation and overrides all other operations. + * Maximum write size is determined by ->max_write_len. + */ + ssize_t (*write)(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +}; + +/** + * struct mbm_state - status for each MBM counter in each domain + * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes) + * @prev_msr Value of IA32_QM_CTR for this RMID last time we read it + */ +struct mbm_state { + u64 chunks; + u64 prev_msr; +}; + +/** + * struct rdt_domain - group of cpus sharing an RDT resource + * @list: all instances of this resource + * @id: unique id for this instance + * @cpu_mask: which cpus share this resource + * @rmid_busy_llc: + * bitmap of which limbo RMIDs are above threshold + * @mbm_total: saved state for MBM total bandwidth + * @mbm_local: saved state for MBM local bandwidth + * @mbm_over: worker to periodically read MBM h/w counters + * @cqm_limbo: worker to periodically read CQM h/w counters + * @mbm_work_cpu: + * worker cpu for MBM h/w counters + * @cqm_work_cpu: + * worker cpu for CQM h/w counters + * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) + * @new_ctrl: new ctrl value to be loaded + * @have_new_ctrl: did user provide new_ctrl for this domain + */ +struct rdt_domain { + struct list_head list; + int id; + struct cpumask cpu_mask; + unsigned long *rmid_busy_llc; + struct mbm_state *mbm_total; + struct mbm_state *mbm_local; + struct delayed_work mbm_over; + struct delayed_work cqm_limbo; + int mbm_work_cpu; + int cqm_work_cpu; + u32 *ctrl_val; + u32 new_ctrl; + bool have_new_ctrl; +}; + +/** + * struct msr_param - set a range of MSRs from a domain + * @res: The resource to use + * @low: Beginning index from base MSR + * @high: End index + */ +struct msr_param { + struct rdt_resource *res; + int low; + int high; +}; + +/** + * struct rdt_cache - Cache allocation related data + * @cbm_len: Length of the cache bit mask + * @min_cbm_bits: Minimum number of consecutive bits to be set + * @cbm_idx_mult: Multiplier of CBM index + * @cbm_idx_offset: Offset of CBM index. 
CBM index is computed by: + * closid * cbm_idx_multi + cbm_idx_offset + * in a cache bit mask + * @shareable_bits: Bitmask of shareable resource with other + * executing entities + */ +struct rdt_cache { + unsigned int cbm_len; + unsigned int min_cbm_bits; + unsigned int cbm_idx_mult; + unsigned int cbm_idx_offset; + unsigned int shareable_bits; +}; + +/** + * struct rdt_membw - Memory bandwidth allocation related data + * @max_delay: Max throttle delay. Delay is the hardware + * representation for memory bandwidth. + * @min_bw: Minimum memory bandwidth percentage user can request + * @bw_gran: Granularity at which the memory bandwidth is allocated + * @delay_linear: True if memory B/W delay is in linear scale + * @mb_map: Mapping of memory B/W percentage to memory B/W delay + */ +struct rdt_membw { + u32 max_delay; + u32 min_bw; + u32 bw_gran; + u32 delay_linear; + u32 *mb_map; +}; + +static inline bool is_llc_occupancy_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID)); +} + +static inline bool is_mbm_total_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID)); +} + +static inline bool is_mbm_local_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID)); +} + +static inline bool is_mbm_enabled(void) +{ + return (is_mbm_total_enabled() || is_mbm_local_enabled()); +} + +static inline bool is_mbm_event(int e) +{ + return (e >= QOS_L3_MBM_TOTAL_EVENT_ID && + e <= QOS_L3_MBM_LOCAL_EVENT_ID); +} + +/** + * struct rdt_resource - attributes of an RDT resource + * @rid: The index of the resource + * @alloc_enabled: Is allocation enabled on this machine + * @mon_enabled: Is monitoring enabled for this feature + * @alloc_capable: Is allocation available on this machine + * @mon_capable: Is monitor feature available on this machine + * @name: Name to use in "schemata" file + * @num_closid: Number of CLOSIDs available + * @cache_level: Which cache level defines scope of this resource + * @default_ctrl: Specifies default cache cbm or memory B/W percent. 
+ * @msr_base: Base MSR address for CBMs + * @msr_update: Function pointer to update QOS MSRs + * @data_width: Character width of data when displaying + * @domains: All domains for this resource + * @cache: Cache allocation related data + * @format_str: Per resource format string to show domain value + * @parse_ctrlval: Per resource function pointer to parse control values + * @evt_list: List of monitoring events + * @num_rmid: Number of RMIDs available + * @mon_scale: cqm counter * mon_scale = occupancy in bytes + * @fflags: flags to choose base and info files + */ +struct rdt_resource { + int rid; + bool alloc_enabled; + bool mon_enabled; + bool alloc_capable; + bool mon_capable; + char *name; + int num_closid; + int cache_level; + u32 default_ctrl; + unsigned int msr_base; + void (*msr_update) (struct rdt_domain *d, struct msr_param *m, + struct rdt_resource *r); + int data_width; + struct list_head domains; + struct rdt_cache cache; + struct rdt_membw membw; + const char *format_str; + int (*parse_ctrlval) (char *buf, struct rdt_resource *r, + struct rdt_domain *d); + struct list_head evt_list; + int num_rmid; + unsigned int mon_scale; + unsigned long fflags; +}; + +int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d); +int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d); + +extern struct mutex rdtgroup_mutex; + +extern struct rdt_resource rdt_resources_all[]; +extern struct rdtgroup rdtgroup_default; +DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); + +int __init rdtgroup_init(void); + +enum { + RDT_RESOURCE_L3, + RDT_RESOURCE_L3DATA, + RDT_RESOURCE_L3CODE, + RDT_RESOURCE_L2, + RDT_RESOURCE_MBA, + + /* Must be the last */ + RDT_NUM_RESOURCES, +}; + +#define for_each_capable_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->alloc_capable || r->mon_capable) + +#define for_each_alloc_capable_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->alloc_capable) + +#define for_each_mon_capable_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->mon_capable) + +#define for_each_alloc_enabled_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->alloc_enabled) + +#define for_each_mon_enabled_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->mon_enabled) + +/* CPUID.(EAX=10H, ECX=ResID=1).EAX */ +union cpuid_0x10_1_eax { + struct { + unsigned int cbm_len:5; + } split; + unsigned int full; +}; + +/* CPUID.(EAX=10H, ECX=ResID=3).EAX */ +union cpuid_0x10_3_eax { + struct { + unsigned int max_delay:12; + } split; + unsigned int full; +}; + +/* CPUID.(EAX=10H, ECX=ResID).EDX */ +union cpuid_0x10_x_edx { + struct { + unsigned int cos_max:16; + } split; + unsigned int full; +}; + +void rdt_ctrl_update(void *arg); +struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); +void rdtgroup_kn_unlock(struct kernfs_node *kn); +struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, + struct list_head **pos); +ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +int rdtgroup_schemata_show(struct kernfs_open_file *of, + struct seq_file *s, void *v); +struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); +int alloc_rmid(void); +void free_rmid(u32 rmid); +int rdt_get_mon_l3_config(struct rdt_resource *r); 
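
The cpuid_0x10_1_eax and cpuid_0x10_x_edx unions above are thin views onto the CPUID.(EAX=10H) sub-leaves that rdt_get_cache_alloc_cfg() decodes into cbm_len, num_closid and default_ctrl. For orientation only (this is not part of the patch), a minimal stand-alone user-space sketch of the same decode, assuming a GCC/clang toolchain for <cpuid.h> and an RDT-capable CPU, might look like:

	#include <stdio.h>
	#include <cpuid.h>	/* __get_cpuid_count(), GCC/clang */

	int main(void)
	{
		unsigned int eax, ebx, ecx, edx;
		unsigned int cbm_len, num_closid, default_ctrl;

		/* CPUID.(EAX=10H, ECX=0): EBX bit 1 set => L3 CAT (ResID 1) enumerated */
		if (!__get_cpuid_count(0x10, 0, &eax, &ebx, &ecx, &edx) || !(ebx & 0x2)) {
			fprintf(stderr, "L3 cache allocation not enumerated\n");
			return 1;
		}

		/* CPUID.(EAX=10H, ECX=1): EAX[4:0] = cbm_len - 1, EDX[15:0] = cos_max */
		__get_cpuid_count(0x10, 1, &eax, &ebx, &ecx, &edx);

		cbm_len      = (eax & 0x1f) + 1;
		num_closid   = (edx & 0xffff) + 1;
		default_ctrl = (cbm_len >= 32) ? 0xffffffffu : ((1u << cbm_len) - 1);

		printf("L3 CAT: cbm_len=%u num_closid=%u default cbm=0x%x shareable_bits=0x%x\n",
		       cbm_len, num_closid, default_ctrl, ebx & default_ctrl);
		return 0;
	}

The field widths match the unions in this header (cbm_len:5, cos_max:16); the shareable_bits value is EBX masked to the CBM width, as in rdt_get_cache_alloc_cfg().
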
+void mon_event_count(void *info); +int rdtgroup_mondata_show(struct seq_file *m, void *arg); +void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + unsigned int dom_id); +void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + struct rdt_domain *d); +void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, + struct rdtgroup *rdtgrp, int evtid, int first); +void mbm_setup_overflow_handler(struct rdt_domain *dom, + unsigned long delay_ms); +void mbm_handle_overflow(struct work_struct *work); +void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms); +void cqm_handle_limbo(struct work_struct *work); +bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); +void __check_limbo(struct rdt_domain *d, bool force_free); + +#endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/kernel/cpu/intel_rdt_schemata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index 406d7a6532f9..f6ea94f8954a 100644 --- a/arch/x86/kernel/cpu/intel_rdt_schemata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -26,7 +26,7 @@ #include <linux/kernfs.h> #include <linux/seq_file.h> #include <linux/slab.h> -#include <asm/intel_rdt.h> +#include "intel_rdt.h" /* * Check whether MBA bandwidth percentage value is correct. The value is @@ -192,7 +192,7 @@ static int rdtgroup_parse_resource(char *resname, char *tok, int closid) { struct rdt_resource *r; - for_each_enabled_rdt_resource(r) { + for_each_alloc_enabled_rdt_resource(r) { if (!strcmp(resname, r->name) && closid < r->num_closid) return parse_line(tok, r); } @@ -221,7 +221,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, closid = rdtgrp->closid; - for_each_enabled_rdt_resource(r) { + for_each_alloc_enabled_rdt_resource(r) { list_for_each_entry(dom, &r->domains, list) dom->have_new_ctrl = false; } @@ -237,7 +237,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, goto out; } - for_each_enabled_rdt_resource(r) { + for_each_alloc_enabled_rdt_resource(r) { ret = update_domains(r, closid); if (ret) goto out; @@ -269,12 +269,13 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, { struct rdtgroup *rdtgrp; struct rdt_resource *r; - int closid, ret = 0; + int ret = 0; + u32 closid; rdtgrp = rdtgroup_kn_lock_live(of->kn); if (rdtgrp) { closid = rdtgrp->closid; - for_each_enabled_rdt_resource(r) { + for_each_alloc_enabled_rdt_resource(r) { if (closid < r->num_closid) show_doms(s, r, closid); } @@ -284,3 +285,57 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, rdtgroup_kn_unlock(of->kn); return ret; } + +void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, + struct rdtgroup *rdtgrp, int evtid, int first) +{ + /* + * setup the parameters to send to the IPI to read the data. 
+ */ + rr->rgrp = rdtgrp; + rr->evtid = evtid; + rr->d = d; + rr->val = 0; + rr->first = first; + + smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1); +} + +int rdtgroup_mondata_show(struct seq_file *m, void *arg) +{ + struct kernfs_open_file *of = m->private; + u32 resid, evtid, domid; + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + union mon_data_bits md; + struct rdt_domain *d; + struct rmid_read rr; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + + md.priv = of->kn->priv; + resid = md.u.rid; + domid = md.u.domid; + evtid = md.u.evtid; + + r = &rdt_resources_all[resid]; + d = rdt_find_domain(r, domid, NULL); + if (!d) { + ret = -ENOENT; + goto out; + } + + mon_event_read(&rr, d, rdtgrp, evtid, false); + + if (rr.val & RMID_VAL_ERROR) + seq_puts(m, "Error\n"); + else if (rr.val & RMID_VAL_UNAVAIL) + seq_puts(m, "Unavailable\n"); + else + seq_printf(m, "%llu\n", rr.val * r->mon_scale); + +out: + rdtgroup_kn_unlock(of->kn); + return ret; +} diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c new file mode 100644 index 000000000000..30827510094b --- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c @@ -0,0 +1,499 @@ +/* + * Resource Director Technology(RDT) + * - Monitoring code + * + * Copyright (C) 2017 Intel Corporation + * + * Author: + * Vikas Shivappa <vikas.shivappa@intel.com> + * + * This replaces the cqm.c based on perf but we reuse a lot of + * code and datastructures originally from Peter Zijlstra and Matt Fleming. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * More information about RDT be found in the Intel (R) x86 Architecture + * Software Developer Manual June 2016, volume 3, section 17.17. + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <asm/cpu_device_id.h> +#include "intel_rdt.h" + +#define MSR_IA32_QM_CTR 0x0c8e +#define MSR_IA32_QM_EVTSEL 0x0c8d + +struct rmid_entry { + u32 rmid; + int busy; + struct list_head list; +}; + +/** + * @rmid_free_lru A least recently used list of free RMIDs + * These RMIDs are guaranteed to have an occupancy less than the + * threshold occupancy + */ +static LIST_HEAD(rmid_free_lru); + +/** + * @rmid_limbo_count count of currently unused but (potentially) + * dirty RMIDs. + * This counts RMIDs that no one is currently using but that + * may have a occupancy value > intel_cqm_threshold. User can change + * the threshold occupancy value. + */ +unsigned int rmid_limbo_count; + +/** + * @rmid_entry - The entry in the limbo and free lists. + */ +static struct rmid_entry *rmid_ptrs; + +/* + * Global boolean for rdt_monitor which is true if any + * resource monitoring is enabled. + */ +bool rdt_mon_capable; + +/* + * Global to indicate which monitoring events are enabled. + */ +unsigned int rdt_mon_features; + +/* + * This is the threshold cache occupancy at which we will consider an + * RMID available for re-allocation. 
+ */ +unsigned int intel_cqm_threshold; + +static inline struct rmid_entry *__rmid_entry(u32 rmid) +{ + struct rmid_entry *entry; + + entry = &rmid_ptrs[rmid]; + WARN_ON(entry->rmid != rmid); + + return entry; +} + +static u64 __rmid_read(u32 rmid, u32 eventid) +{ + u64 val; + + /* + * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured + * with a valid event code for supported resource type and the bits + * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID, + * IA32_QM_CTR.data (bits 61:0) reports the monitored data. + * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62) + * are error bits. + */ + wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); + rdmsrl(MSR_IA32_QM_CTR, val); + + return val; +} + +static bool rmid_dirty(struct rmid_entry *entry) +{ + u64 val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); + + return val >= intel_cqm_threshold; +} + +/* + * Check the RMIDs that are marked as busy for this domain. If the + * reported LLC occupancy is below the threshold clear the busy bit and + * decrement the count. If the busy count gets to zero on an RMID, we + * free the RMID + */ +void __check_limbo(struct rdt_domain *d, bool force_free) +{ + struct rmid_entry *entry; + struct rdt_resource *r; + u32 crmid = 1, nrmid; + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + + /* + * Skip RMID 0 and start from RMID 1 and check all the RMIDs that + * are marked as busy for occupancy < threshold. If the occupancy + * is less than the threshold decrement the busy counter of the + * RMID and move it to the free list when the counter reaches 0. + */ + for (;;) { + nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, crmid); + if (nrmid >= r->num_rmid) + break; + + entry = __rmid_entry(nrmid); + if (force_free || !rmid_dirty(entry)) { + clear_bit(entry->rmid, d->rmid_busy_llc); + if (!--entry->busy) { + rmid_limbo_count--; + list_add_tail(&entry->list, &rmid_free_lru); + } + } + crmid = nrmid + 1; + } +} + +bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d) +{ + return find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid; +} + +/* + * As of now the RMIDs allocation is global. + * However we keep track of which packages the RMIDs + * are used to optimize the limbo list management. + */ +int alloc_rmid(void) +{ + struct rmid_entry *entry; + + lockdep_assert_held(&rdtgroup_mutex); + + if (list_empty(&rmid_free_lru)) + return rmid_limbo_count ? -EBUSY : -ENOSPC; + + entry = list_first_entry(&rmid_free_lru, + struct rmid_entry, list); + list_del(&entry->list); + + return entry->rmid; +} + +static void add_rmid_to_limbo(struct rmid_entry *entry) +{ + struct rdt_resource *r; + struct rdt_domain *d; + int cpu; + u64 val; + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + + entry->busy = 0; + cpu = get_cpu(); + list_for_each_entry(d, &r->domains, list) { + if (cpumask_test_cpu(cpu, &d->cpu_mask)) { + val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); + if (val <= intel_cqm_threshold) + continue; + } + + /* + * For the first limbo RMID in the domain, + * setup up the limbo worker. 
+ */ + if (!has_busy_rmid(r, d)) + cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL); + set_bit(entry->rmid, d->rmid_busy_llc); + entry->busy++; + } + put_cpu(); + + if (entry->busy) + rmid_limbo_count++; + else + list_add_tail(&entry->list, &rmid_free_lru); +} + +void free_rmid(u32 rmid) +{ + struct rmid_entry *entry; + + if (!rmid) + return; + + lockdep_assert_held(&rdtgroup_mutex); + + entry = __rmid_entry(rmid); + + if (is_llc_occupancy_enabled()) + add_rmid_to_limbo(entry); + else + list_add_tail(&entry->list, &rmid_free_lru); +} + +static int __mon_event_count(u32 rmid, struct rmid_read *rr) +{ + u64 chunks, shift, tval; + struct mbm_state *m; + + tval = __rmid_read(rmid, rr->evtid); + if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) { + rr->val = tval; + return -EINVAL; + } + switch (rr->evtid) { + case QOS_L3_OCCUP_EVENT_ID: + rr->val += tval; + return 0; + case QOS_L3_MBM_TOTAL_EVENT_ID: + m = &rr->d->mbm_total[rmid]; + break; + case QOS_L3_MBM_LOCAL_EVENT_ID: + m = &rr->d->mbm_local[rmid]; + break; + default: + /* + * Code would never reach here because + * an invalid event id would fail the __rmid_read. + */ + return -EINVAL; + } + + if (rr->first) { + m->prev_msr = tval; + m->chunks = 0; + return 0; + } + + shift = 64 - MBM_CNTR_WIDTH; + chunks = (tval << shift) - (m->prev_msr << shift); + chunks >>= shift; + m->chunks += chunks; + m->prev_msr = tval; + + rr->val += m->chunks; + return 0; +} + +/* + * This is called via IPI to read the CQM/MBM counters + * on a domain. + */ +void mon_event_count(void *info) +{ + struct rdtgroup *rdtgrp, *entry; + struct rmid_read *rr = info; + struct list_head *head; + + rdtgrp = rr->rgrp; + + if (__mon_event_count(rdtgrp->mon.rmid, rr)) + return; + + /* + * For Ctrl groups read data from child monitor groups. + */ + head = &rdtgrp->mon.crdtgrp_list; + + if (rdtgrp->type == RDTCTRL_GROUP) { + list_for_each_entry(entry, head, mon.crdtgrp_list) { + if (__mon_event_count(entry->mon.rmid, rr)) + return; + } + } +} + +static void mbm_update(struct rdt_domain *d, int rmid) +{ + struct rmid_read rr; + + rr.first = false; + rr.d = d; + + /* + * This is protected from concurrent reads from user + * as both the user and we hold the global mutex. + */ + if (is_mbm_total_enabled()) { + rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID; + __mon_event_count(rmid, &rr); + } + if (is_mbm_local_enabled()) { + rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; + __mon_event_count(rmid, &rr); + } +} + +/* + * Handler to scan the limbo list and move the RMIDs + * to free list whose occupancy < threshold_occupancy. 
+ */ +void cqm_handle_limbo(struct work_struct *work) +{ + unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); + int cpu = smp_processor_id(); + struct rdt_resource *r; + struct rdt_domain *d; + + mutex_lock(&rdtgroup_mutex); + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + d = get_domain_from_cpu(cpu, r); + + if (!d) { + pr_warn_once("Failure to get domain for limbo worker\n"); + goto out_unlock; + } + + __check_limbo(d, false); + + if (has_busy_rmid(r, d)) + schedule_delayed_work_on(cpu, &d->cqm_limbo, delay); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); +} + +void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms) +{ + unsigned long delay = msecs_to_jiffies(delay_ms); + struct rdt_resource *r; + int cpu; + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + + cpu = cpumask_any(&dom->cpu_mask); + dom->cqm_work_cpu = cpu; + + schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); +} + +void mbm_handle_overflow(struct work_struct *work) +{ + unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); + struct rdtgroup *prgrp, *crgrp; + int cpu = smp_processor_id(); + struct list_head *head; + struct rdt_domain *d; + + mutex_lock(&rdtgroup_mutex); + + if (!static_branch_likely(&rdt_enable_key)) + goto out_unlock; + + d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]); + if (!d) + goto out_unlock; + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + mbm_update(d, prgrp->mon.rmid); + + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) + mbm_update(d, crgrp->mon.rmid); + } + + schedule_delayed_work_on(cpu, &d->mbm_over, delay); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); +} + +void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms) +{ + unsigned long delay = msecs_to_jiffies(delay_ms); + int cpu; + + if (!static_branch_likely(&rdt_enable_key)) + return; + cpu = cpumask_any(&dom->cpu_mask); + dom->mbm_work_cpu = cpu; + schedule_delayed_work_on(cpu, &dom->mbm_over, delay); +} + +static int dom_data_init(struct rdt_resource *r) +{ + struct rmid_entry *entry = NULL; + int i, nr_rmids; + + nr_rmids = r->num_rmid; + rmid_ptrs = kcalloc(nr_rmids, sizeof(struct rmid_entry), GFP_KERNEL); + if (!rmid_ptrs) + return -ENOMEM; + + for (i = 0; i < nr_rmids; i++) { + entry = &rmid_ptrs[i]; + INIT_LIST_HEAD(&entry->list); + + entry->rmid = i; + list_add_tail(&entry->list, &rmid_free_lru); + } + + /* + * RMID 0 is special and is always allocated. It's used for all + * tasks that are not monitored. + */ + entry = __rmid_entry(0); + list_del(&entry->list); + + return 0; +} + +static struct mon_evt llc_occupancy_event = { + .name = "llc_occupancy", + .evtid = QOS_L3_OCCUP_EVENT_ID, +}; + +static struct mon_evt mbm_total_event = { + .name = "mbm_total_bytes", + .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, +}; + +static struct mon_evt mbm_local_event = { + .name = "mbm_local_bytes", + .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, +}; + +/* + * Initialize the event list for the resource. + * + * Note that MBM events are also part of RDT_RESOURCE_L3 resource + * because as per the SDM the total and local memory bandwidth + * are enumerated as part of L3 monitoring. 
+ */ +static void l3_mon_evt_init(struct rdt_resource *r) +{ + INIT_LIST_HEAD(&r->evt_list); + + if (is_llc_occupancy_enabled()) + list_add_tail(&llc_occupancy_event.list, &r->evt_list); + if (is_mbm_total_enabled()) + list_add_tail(&mbm_total_event.list, &r->evt_list); + if (is_mbm_local_enabled()) + list_add_tail(&mbm_local_event.list, &r->evt_list); +} + +int rdt_get_mon_l3_config(struct rdt_resource *r) +{ + int ret; + + r->mon_scale = boot_cpu_data.x86_cache_occ_scale; + r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1; + + /* + * A reasonable upper limit on the max threshold is the number + * of lines tagged per RMID if all RMIDs have the same number of + * lines tagged in the LLC. + * + * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. + */ + intel_cqm_threshold = boot_cpu_data.x86_cache_size * 1024 / r->num_rmid; + + /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */ + intel_cqm_threshold /= r->mon_scale; + + ret = dom_data_init(r); + if (ret) + return ret; + + l3_mon_evt_init(r); + + r->mon_capable = true; + r->mon_enabled = true; + + return 0; +} diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 9257bd9dc664..a869d4a073c5 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -32,17 +32,25 @@ #include <uapi/linux/magic.h> -#include <asm/intel_rdt.h> -#include <asm/intel_rdt_common.h> +#include <asm/intel_rdt_sched.h> +#include "intel_rdt.h" DEFINE_STATIC_KEY_FALSE(rdt_enable_key); -struct kernfs_root *rdt_root; +DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key); +DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key); +static struct kernfs_root *rdt_root; struct rdtgroup rdtgroup_default; LIST_HEAD(rdt_all_groups); /* Kernel fs node for "info" directory under root */ static struct kernfs_node *kn_info; +/* Kernel fs node for "mon_groups" directory under root */ +static struct kernfs_node *kn_mongrp; + +/* Kernel fs node for "mon_data" directory under root */ +static struct kernfs_node *kn_mondata; + /* * Trivial allocator for CLOSIDs. Since h/w only supports a small number, * we can keep a bitmap of free CLOSIDs in a single integer. 
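
The next hunks touch the CLOSID allocator that the comment above describes: a free-map kept in a single integer, with bit n meaning CLOSID n is free and CLOSID 0 reserved for the default group. As an illustrative stand-alone sketch of that scheme (hypothetical *_sketch names, assuming at most 32 CLOSIDs; not the kernel code itself):

	#include <stdio.h>
	#include <strings.h>	/* ffs() */

	static unsigned int closid_free_map;

	static void closid_init_sketch(int num_closid)
	{
		closid_free_map = (num_closid < 32 ? (1u << num_closid) : 0u) - 1;
		closid_free_map &= ~1u;		/* CLOSID 0 stays with the default group */
	}

	static int closid_alloc_sketch(void)
	{
		int closid = ffs(closid_free_map);	/* 1-based index of lowest set bit */

		if (closid == 0)
			return -1;			/* no free CLOSID */
		closid--;
		closid_free_map &= ~(1u << closid);
		return closid;
	}

	static void closid_free_sketch(int closid)
	{
		closid_free_map |= 1u << closid;
	}

	int main(void)
	{
		closid_init_sketch(16);
		printf("first alloc:  %d\n", closid_alloc_sketch());	/* 1 */
		printf("second alloc: %d\n", closid_alloc_sketch());	/* 2 */
		closid_free_sketch(1);
		printf("after free:   %d\n", closid_alloc_sketch());	/* 1 again */
		return 0;
	}

The hunk below follows the same pattern: closid_init() computes the map from the smallest num_closid across resources, and closid_alloc() hands out the lowest free bit via ffs().
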
@@ -66,7 +74,7 @@ static void closid_init(void) int rdt_min_closid = 32; /* Compute rdt_min_closid across all resources */ - for_each_enabled_rdt_resource(r) + for_each_alloc_enabled_rdt_resource(r) rdt_min_closid = min(rdt_min_closid, r->num_closid); closid_free_map = BIT_MASK(rdt_min_closid) - 1; @@ -75,9 +83,9 @@ static void closid_init(void) closid_free_map &= ~1; } -int closid_alloc(void) +static int closid_alloc(void) { - int closid = ffs(closid_free_map); + u32 closid = ffs(closid_free_map); if (closid == 0) return -ENOSPC; @@ -125,28 +133,6 @@ static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft) return 0; } -static int rdtgroup_add_files(struct kernfs_node *kn, struct rftype *rfts, - int len) -{ - struct rftype *rft; - int ret; - - lockdep_assert_held(&rdtgroup_mutex); - - for (rft = rfts; rft < rfts + len; rft++) { - ret = rdtgroup_add_file(kn, rft); - if (ret) - goto error; - } - - return 0; -error: - pr_warn("Failed to add %s, err=%d\n", rft->name, ret); - while (--rft >= rfts) - kernfs_remove_by_name(kn, rft->name); - return ret; -} - static int rdtgroup_seqfile_show(struct seq_file *m, void *arg) { struct kernfs_open_file *of = m->private; @@ -174,6 +160,11 @@ static struct kernfs_ops rdtgroup_kf_single_ops = { .seq_show = rdtgroup_seqfile_show, }; +static struct kernfs_ops kf_mondata_ops = { + .atomic_write_len = PAGE_SIZE, + .seq_show = rdtgroup_mondata_show, +}; + static bool is_cpu_list(struct kernfs_open_file *of) { struct rftype *rft = of->kn->priv; @@ -203,13 +194,18 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of, /* * This is safe against intel_rdt_sched_in() called from __switch_to() * because __switch_to() is executed with interrupts disabled. A local call - * from rdt_update_closid() is proteced against __switch_to() because + * from update_closid_rmid() is proteced against __switch_to() because * preemption is disabled. */ -static void rdt_update_cpu_closid(void *closid) +static void update_cpu_closid_rmid(void *info) { - if (closid) - this_cpu_write(cpu_closid, *(int *)closid); + struct rdtgroup *r = info; + + if (r) { + this_cpu_write(pqr_state.default_closid, r->closid); + this_cpu_write(pqr_state.default_rmid, r->mon.rmid); + } + /* * We cannot unconditionally write the MSR because the current * executing task might have its own closid selected. Just reuse @@ -221,28 +217,128 @@ static void rdt_update_cpu_closid(void *closid) /* * Update the PGR_ASSOC MSR on all cpus in @cpu_mask, * - * Per task closids must have been set up before calling this function. - * - * The per cpu closids are updated with the smp function call, when @closid - * is not NULL. If @closid is NULL then all affected percpu closids must - * have been set up before calling this function. + * Per task closids/rmids must have been set up before calling this function. 
*/ static void -rdt_update_closid(const struct cpumask *cpu_mask, int *closid) +update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r) { int cpu = get_cpu(); if (cpumask_test_cpu(cpu, cpu_mask)) - rdt_update_cpu_closid(closid); - smp_call_function_many(cpu_mask, rdt_update_cpu_closid, closid, 1); + update_cpu_closid_rmid(r); + smp_call_function_many(cpu_mask, update_cpu_closid_rmid, r, 1); put_cpu(); } +static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, + cpumask_var_t tmpmask) +{ + struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp; + struct list_head *head; + + /* Check whether cpus belong to parent ctrl group */ + cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask); + if (cpumask_weight(tmpmask)) + return -EINVAL; + + /* Check whether cpus are dropped from this group */ + cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); + if (cpumask_weight(tmpmask)) { + /* Give any dropped cpus to parent rdtgroup */ + cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask); + update_closid_rmid(tmpmask, prgrp); + } + + /* + * If we added cpus, remove them from previous group that owned them + * and update per-cpu rmid + */ + cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); + if (cpumask_weight(tmpmask)) { + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + if (crgrp == rdtgrp) + continue; + cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask, + tmpmask); + } + update_closid_rmid(tmpmask, rdtgrp); + } + + /* Done pushing/pulling - update this group with new mask */ + cpumask_copy(&rdtgrp->cpu_mask, newmask); + + return 0; +} + +static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m) +{ + struct rdtgroup *crgrp; + + cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m); + /* update the child mon group masks as well*/ + list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list) + cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask); +} + +static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, + cpumask_var_t tmpmask, cpumask_var_t tmpmask1) +{ + struct rdtgroup *r, *crgrp; + struct list_head *head; + + /* Check whether cpus are dropped from this group */ + cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); + if (cpumask_weight(tmpmask)) { + /* Can't drop from default group */ + if (rdtgrp == &rdtgroup_default) + return -EINVAL; + + /* Give any dropped cpus to rdtgroup_default */ + cpumask_or(&rdtgroup_default.cpu_mask, + &rdtgroup_default.cpu_mask, tmpmask); + update_closid_rmid(tmpmask, &rdtgroup_default); + } + + /* + * If we added cpus, remove them from previous group and + * the prev group's child groups that owned them + * and update per-cpu closid/rmid. + */ + cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); + if (cpumask_weight(tmpmask)) { + list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { + if (r == rdtgrp) + continue; + cpumask_and(tmpmask1, &r->cpu_mask, tmpmask); + if (cpumask_weight(tmpmask1)) + cpumask_rdtgrp_clear(r, tmpmask1); + } + update_closid_rmid(tmpmask, rdtgrp); + } + + /* Done pushing/pulling - update this group with new mask */ + cpumask_copy(&rdtgrp->cpu_mask, newmask); + + /* + * Clear child mon group masks since there is a new parent mask + * now and update the rmid for the cpus the child lost. 
+ */ + head = &rdtgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask); + update_closid_rmid(tmpmask, rdtgrp); + cpumask_clear(&crgrp->cpu_mask); + } + + return 0; +} + static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - cpumask_var_t tmpmask, newmask; - struct rdtgroup *rdtgrp, *r; + cpumask_var_t tmpmask, newmask, tmpmask1; + struct rdtgroup *rdtgrp; int ret; if (!buf) @@ -254,6 +350,11 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, free_cpumask_var(tmpmask); return -ENOMEM; } + if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) { + free_cpumask_var(tmpmask); + free_cpumask_var(newmask); + return -ENOMEM; + } rdtgrp = rdtgroup_kn_lock_live(of->kn); if (!rdtgrp) { @@ -276,41 +377,18 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, goto unlock; } - /* Check whether cpus are dropped from this group */ - cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); - if (cpumask_weight(tmpmask)) { - /* Can't drop from default group */ - if (rdtgrp == &rdtgroup_default) { - ret = -EINVAL; - goto unlock; - } - /* Give any dropped cpus to rdtgroup_default */ - cpumask_or(&rdtgroup_default.cpu_mask, - &rdtgroup_default.cpu_mask, tmpmask); - rdt_update_closid(tmpmask, &rdtgroup_default.closid); - } - - /* - * If we added cpus, remove them from previous group that owned them - * and update per-cpu closid - */ - cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); - if (cpumask_weight(tmpmask)) { - list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { - if (r == rdtgrp) - continue; - cpumask_andnot(&r->cpu_mask, &r->cpu_mask, tmpmask); - } - rdt_update_closid(tmpmask, &rdtgrp->closid); - } - - /* Done pushing/pulling - update this group with new mask */ - cpumask_copy(&rdtgrp->cpu_mask, newmask); + if (rdtgrp->type == RDTCTRL_GROUP) + ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1); + else if (rdtgrp->type == RDTMON_GROUP) + ret = cpus_mon_write(rdtgrp, newmask, tmpmask); + else + ret = -EINVAL; unlock: rdtgroup_kn_unlock(of->kn); free_cpumask_var(tmpmask); free_cpumask_var(newmask); + free_cpumask_var(tmpmask1); return ret ?: nbytes; } @@ -336,6 +414,7 @@ static void move_myself(struct callback_head *head) if (atomic_dec_and_test(&rdtgrp->waitcount) && (rdtgrp->flags & RDT_DELETED)) { current->closid = 0; + current->rmid = 0; kfree(rdtgrp); } @@ -374,7 +453,20 @@ static int __rdtgroup_move_task(struct task_struct *tsk, atomic_dec(&rdtgrp->waitcount); kfree(callback); } else { - tsk->closid = rdtgrp->closid; + /* + * For ctrl_mon groups move both closid and rmid. + * For monitor groups, can move the tasks only from + * their parent CTRL group. 
+ */ + if (rdtgrp->type == RDTCTRL_GROUP) { + tsk->closid = rdtgrp->closid; + tsk->rmid = rdtgrp->mon.rmid; + } else if (rdtgrp->type == RDTMON_GROUP) { + if (rdtgrp->mon.parent->closid == tsk->closid) + tsk->rmid = rdtgrp->mon.rmid; + else + ret = -EINVAL; + } } return ret; } @@ -454,7 +546,8 @@ static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) rcu_read_lock(); for_each_process_thread(p, t) { - if (t->closid == r->closid) + if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) || + (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) seq_printf(s, "%d\n", t->pid); } rcu_read_unlock(); @@ -476,39 +569,6 @@ static int rdtgroup_tasks_show(struct kernfs_open_file *of, return ret; } -/* Files in each rdtgroup */ -static struct rftype rdtgroup_base_files[] = { - { - .name = "cpus", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_cpus_write, - .seq_show = rdtgroup_cpus_show, - }, - { - .name = "cpus_list", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_cpus_write, - .seq_show = rdtgroup_cpus_show, - .flags = RFTYPE_FLAGS_CPUS_LIST, - }, - { - .name = "tasks", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_tasks_write, - .seq_show = rdtgroup_tasks_show, - }, - { - .name = "schemata", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_schemata_write, - .seq_show = rdtgroup_schemata_show, - }, -}; - static int rdt_num_closids_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { @@ -536,6 +596,15 @@ static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, return 0; } +static int rdt_shareable_bits_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%x\n", r->cache.shareable_bits); + return 0; +} + static int rdt_min_bw_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { @@ -545,6 +614,28 @@ static int rdt_min_bw_show(struct kernfs_open_file *of, return 0; } +static int rdt_num_rmids_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%d\n", r->num_rmid); + + return 0; +} + +static int rdt_mon_features_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + struct mon_evt *mevt; + + list_for_each_entry(mevt, &r->evt_list, list) + seq_printf(seq, "%s\n", mevt->name); + + return 0; +} + static int rdt_bw_gran_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { @@ -563,74 +654,200 @@ static int rdt_delay_linear_show(struct kernfs_open_file *of, return 0; } +static int max_threshold_occ_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%u\n", intel_cqm_threshold * r->mon_scale); + + return 0; +} + +static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdt_resource *r = of->kn->parent->priv; + unsigned int bytes; + int ret; + + ret = kstrtouint(buf, 0, &bytes); + if (ret) + return ret; + + if (bytes > (boot_cpu_data.x86_cache_size * 1024)) + return -EINVAL; + + intel_cqm_threshold = bytes / r->mon_scale; + + return nbytes; +} + /* rdtgroup information files for one cache resource. 
*/ -static struct rftype res_cache_info_files[] = { +static struct rftype res_common_files[] = { { .name = "num_closids", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_num_closids_show, + .fflags = RF_CTRL_INFO, + }, + { + .name = "mon_features", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_mon_features_show, + .fflags = RF_MON_INFO, + }, + { + .name = "num_rmids", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_num_rmids_show, + .fflags = RF_MON_INFO, }, { .name = "cbm_mask", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_default_ctrl_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "min_cbm_bits", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_min_cbm_bits_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, }, -}; - -/* rdtgroup information files for memory bandwidth. */ -static struct rftype res_mba_info_files[] = { { - .name = "num_closids", + .name = "shareable_bits", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_num_closids_show, + .seq_show = rdt_shareable_bits_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "min_bandwidth", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_min_bw_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, }, { .name = "bandwidth_gran", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_bw_gran_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, }, { .name = "delay_linear", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_delay_linear_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, + }, + { + .name = "max_threshold_occupancy", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = max_threshold_occ_write, + .seq_show = max_threshold_occ_show, + .fflags = RF_MON_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "cpus", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_cpus_write, + .seq_show = rdtgroup_cpus_show, + .fflags = RFTYPE_BASE, + }, + { + .name = "cpus_list", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_cpus_write, + .seq_show = rdtgroup_cpus_show, + .flags = RFTYPE_FLAGS_CPUS_LIST, + .fflags = RFTYPE_BASE, + }, + { + .name = "tasks", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_tasks_write, + .seq_show = rdtgroup_tasks_show, + .fflags = RFTYPE_BASE, + }, + { + .name = "schemata", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_schemata_write, + .seq_show = rdtgroup_schemata_show, + .fflags = RF_CTRL_BASE, }, }; -void rdt_get_mba_infofile(struct rdt_resource *r) +static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) { - r->info_files = res_mba_info_files; - r->nr_info_files = ARRAY_SIZE(res_mba_info_files); + struct rftype *rfts, *rft; + int ret, len; + + rfts = res_common_files; + len = ARRAY_SIZE(res_common_files); + + lockdep_assert_held(&rdtgroup_mutex); + + for (rft = rfts; rft < rfts + len; rft++) { + if ((fflags & rft->fflags) == rft->fflags) { + ret = rdtgroup_add_file(kn, rft); + if (ret) + goto error; + } + } + + return 0; +error: + pr_warn("Failed to add %s, err=%d\n", rft->name, ret); + while (--rft >= rfts) { + if ((fflags & rft->fflags) == rft->fflags) + kernfs_remove_by_name(kn, rft->name); + } + return ret; } -void rdt_get_cache_infofile(struct rdt_resource *r) +static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name, + unsigned long fflags) { - r->info_files = res_cache_info_files; - r->nr_info_files 
= ARRAY_SIZE(res_cache_info_files); + struct kernfs_node *kn_subdir; + int ret; + + kn_subdir = kernfs_create_dir(kn_info, name, + kn_info->mode, r); + if (IS_ERR(kn_subdir)) + return PTR_ERR(kn_subdir); + + kernfs_get(kn_subdir); + ret = rdtgroup_kn_set_ugid(kn_subdir); + if (ret) + return ret; + + ret = rdtgroup_add_files(kn_subdir, fflags); + if (!ret) + kernfs_activate(kn_subdir); + + return ret; } static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) { - struct kernfs_node *kn_subdir; - struct rftype *res_info_files; struct rdt_resource *r; - int ret, len; + unsigned long fflags; + char name[32]; + int ret; /* create the directory */ kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); @@ -638,25 +855,19 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) return PTR_ERR(kn_info); kernfs_get(kn_info); - for_each_enabled_rdt_resource(r) { - kn_subdir = kernfs_create_dir(kn_info, r->name, - kn_info->mode, r); - if (IS_ERR(kn_subdir)) { - ret = PTR_ERR(kn_subdir); - goto out_destroy; - } - kernfs_get(kn_subdir); - ret = rdtgroup_kn_set_ugid(kn_subdir); + for_each_alloc_enabled_rdt_resource(r) { + fflags = r->fflags | RF_CTRL_INFO; + ret = rdtgroup_mkdir_info_resdir(r, r->name, fflags); if (ret) goto out_destroy; + } - res_info_files = r->info_files; - len = r->nr_info_files; - - ret = rdtgroup_add_files(kn_subdir, res_info_files, len); + for_each_mon_enabled_rdt_resource(r) { + fflags = r->fflags | RF_MON_INFO; + sprintf(name, "%s_MON", r->name); + ret = rdtgroup_mkdir_info_resdir(r, name, fflags); if (ret) goto out_destroy; - kernfs_activate(kn_subdir); } /* @@ -678,6 +889,39 @@ out_destroy: return ret; } +static int +mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp, + char *name, struct kernfs_node **dest_kn) +{ + struct kernfs_node *kn; + int ret; + + /* create the directory */ + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + if (dest_kn) + *dest_kn = kn; + + /* + * This extra ref will be put in kernfs_remove() and guarantees + * that @rdtgrp->kn is always accessible. 
+ */ + kernfs_get(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + + kernfs_activate(kn); + + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} static void l3_qos_cfg_update(void *arg) { bool *enable = arg; @@ -718,14 +962,15 @@ static int cdp_enable(void) struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3]; int ret; - if (!r_l3->capable || !r_l3data->capable || !r_l3code->capable) + if (!r_l3->alloc_capable || !r_l3data->alloc_capable || + !r_l3code->alloc_capable) return -EINVAL; ret = set_l3_qos_cfg(r_l3, true); if (!ret) { - r_l3->enabled = false; - r_l3data->enabled = true; - r_l3code->enabled = true; + r_l3->alloc_enabled = false; + r_l3data->alloc_enabled = true; + r_l3code->alloc_enabled = true; } return ret; } @@ -734,11 +979,11 @@ static void cdp_disable(void) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; - r->enabled = r->capable; + r->alloc_enabled = r->alloc_capable; - if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) { - rdt_resources_all[RDT_RESOURCE_L3DATA].enabled = false; - rdt_resources_all[RDT_RESOURCE_L3CODE].enabled = false; + if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) { + rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled = false; + rdt_resources_all[RDT_RESOURCE_L3CODE].alloc_enabled = false; set_l3_qos_cfg(r, false); } } @@ -823,10 +1068,16 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn) } } +static int mkdir_mondata_all(struct kernfs_node *parent_kn, + struct rdtgroup *prgrp, + struct kernfs_node **mon_data_kn); + static struct dentry *rdt_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { + struct rdt_domain *dom; + struct rdt_resource *r; struct dentry *dentry; int ret; @@ -853,15 +1104,54 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type, goto out_cdp; } + if (rdt_mon_capable) { + ret = mongroup_create_dir(rdtgroup_default.kn, + NULL, "mon_groups", + &kn_mongrp); + if (ret) { + dentry = ERR_PTR(ret); + goto out_info; + } + kernfs_get(kn_mongrp); + + ret = mkdir_mondata_all(rdtgroup_default.kn, + &rdtgroup_default, &kn_mondata); + if (ret) { + dentry = ERR_PTR(ret); + goto out_mongrp; + } + kernfs_get(kn_mondata); + rdtgroup_default.mon.mon_data_kn = kn_mondata; + } + dentry = kernfs_mount(fs_type, flags, rdt_root, RDTGROUP_SUPER_MAGIC, NULL); if (IS_ERR(dentry)) - goto out_destroy; + goto out_mondata; + + if (rdt_alloc_capable) + static_branch_enable(&rdt_alloc_enable_key); + if (rdt_mon_capable) + static_branch_enable(&rdt_mon_enable_key); + + if (rdt_alloc_capable || rdt_mon_capable) + static_branch_enable(&rdt_enable_key); + + if (is_mbm_enabled()) { + r = &rdt_resources_all[RDT_RESOURCE_L3]; + list_for_each_entry(dom, &r->domains, list) + mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL); + } - static_branch_enable(&rdt_enable_key); goto out; -out_destroy: +out_mondata: + if (rdt_mon_capable) + kernfs_remove(kn_mondata); +out_mongrp: + if (rdt_mon_capable) + kernfs_remove(kn_mongrp); +out_info: kernfs_remove(kn_info); out_cdp: cdp_disable(); @@ -909,6 +1199,18 @@ static int reset_all_ctrls(struct rdt_resource *r) return 0; } +static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (rdt_alloc_capable && + (r->type == RDTCTRL_GROUP) && (t->closid == r->closid)); +} + +static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (rdt_mon_capable && + (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid)); +} + /* * Move tasks from one to the other group. 
If @from is NULL, then all tasks * in the systems are moved unconditionally (used for teardown). @@ -924,8 +1226,11 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, read_lock(&tasklist_lock); for_each_process_thread(p, t) { - if (!from || t->closid == from->closid) { + if (!from || is_closid_match(t, from) || + is_rmid_match(t, from)) { t->closid = to->closid; + t->rmid = to->mon.rmid; + #ifdef CONFIG_SMP /* * This is safe on x86 w/o barriers as the ordering @@ -944,6 +1249,19 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, read_unlock(&tasklist_lock); } +static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) +{ + struct rdtgroup *sentry, *stmp; + struct list_head *head; + + head = &rdtgrp->mon.crdtgrp_list; + list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { + free_rmid(sentry->mon.rmid); + list_del(&sentry->mon.crdtgrp_list); + kfree(sentry); + } +} + /* * Forcibly remove all of subdirectories under root. */ @@ -955,6 +1273,9 @@ static void rmdir_all_sub(void) rdt_move_group_tasks(NULL, &rdtgroup_default, NULL); list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) { + /* Free any child rmids */ + free_all_child_rdtgrp(rdtgrp); + /* Remove each rdtgroup other than root */ if (rdtgrp == &rdtgroup_default) continue; @@ -967,16 +1288,20 @@ static void rmdir_all_sub(void) cpumask_or(&rdtgroup_default.cpu_mask, &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); + free_rmid(rdtgrp->mon.rmid); + kernfs_remove(rdtgrp->kn); list_del(&rdtgrp->rdtgroup_list); kfree(rdtgrp); } /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ get_online_cpus(); - rdt_update_closid(cpu_online_mask, &rdtgroup_default.closid); + update_closid_rmid(cpu_online_mask, &rdtgroup_default); put_online_cpus(); kernfs_remove(kn_info); + kernfs_remove(kn_mongrp); + kernfs_remove(kn_mondata); } static void rdt_kill_sb(struct super_block *sb) @@ -986,10 +1311,12 @@ static void rdt_kill_sb(struct super_block *sb) mutex_lock(&rdtgroup_mutex); /*Put everything back to default values. */ - for_each_enabled_rdt_resource(r) + for_each_alloc_enabled_rdt_resource(r) reset_all_ctrls(r); cdp_disable(); rmdir_all_sub(); + static_branch_disable(&rdt_alloc_enable_key); + static_branch_disable(&rdt_mon_enable_key); static_branch_disable(&rdt_enable_key); kernfs_kill_sb(sb); mutex_unlock(&rdtgroup_mutex); @@ -1001,46 +1328,223 @@ static struct file_system_type rdt_fs_type = { .kill_sb = rdt_kill_sb, }; -static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, - umode_t mode) +static int mon_addfile(struct kernfs_node *parent_kn, const char *name, + void *priv) { - struct rdtgroup *parent, *rdtgrp; struct kernfs_node *kn; - int ret, closid; + int ret = 0; - /* Only allow mkdir in the root directory */ - if (parent_kn != rdtgroup_default.kn) - return -EPERM; + kn = __kernfs_create_file(parent_kn, name, 0444, 0, + &kf_mondata_ops, priv, NULL, NULL); + if (IS_ERR(kn)) + return PTR_ERR(kn); - /* Do not accept '\n' to avoid unparsable situation. */ - if (strchr(name, '\n')) - return -EINVAL; + ret = rdtgroup_kn_set_ugid(kn); + if (ret) { + kernfs_remove(kn); + return ret; + } - parent = rdtgroup_kn_lock_live(parent_kn); - if (!parent) { - ret = -ENODEV; - goto out_unlock; + return ret; +} + +/* + * Remove all subdirectories of mon_data of ctrl_mon groups + * and monitor groups with given domain id. 
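[Editorial aside] The rdt_mount() and rdt_kill_sb() hunks above wire the new monitoring pieces (the "mon_groups" directory, the "mon_data" hierarchy and the alloc/mon static keys) into the mount and unmount paths. A minimal userspace sketch of driving those paths, not part of the patch: the filesystem type name "resctrl", the conventional /sys/fs/resctrl mount point and the "cdp" option (compare rdtgroup_show_options() below) are assumptions drawn from the existing resctrl interface, not defined by these hunks.

#include <stdio.h>
#include <sys/mount.h>

/*
 * Sketch only: mounting exercises rdt_mount(), unmounting rdt_kill_sb().
 * Paths, fstype name and the "cdp" option are assumed, see lead-in.
 */
int main(void)
{
    if (mount("resctrl", "/sys/fs/resctrl", "resctrl", 0, "cdp")) {
        perror("mount resctrl");
        return 1;
    }
    puts("resctrl mounted with CDP enabled");
    return 0;
}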
+ */ +void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id) +{ + struct rdtgroup *prgrp, *crgrp; + char name[32]; + + if (!r->mon_enabled) + return; + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + sprintf(name, "mon_%s_%02d", r->name, dom_id); + kernfs_remove_by_name(prgrp->mon.mon_data_kn, name); + + list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) + kernfs_remove_by_name(crgrp->mon.mon_data_kn, name); } +} - ret = closid_alloc(); - if (ret < 0) +static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, + struct rdt_domain *d, + struct rdt_resource *r, struct rdtgroup *prgrp) +{ + union mon_data_bits priv; + struct kernfs_node *kn; + struct mon_evt *mevt; + struct rmid_read rr; + char name[32]; + int ret; + + sprintf(name, "mon_%s_%02d", r->name, d->id); + /* create the directory */ + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + /* + * This extra ref will be put in kernfs_remove() and guarantees + * that kn is always accessible. + */ + kernfs_get(kn); + ret = rdtgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + + if (WARN_ON(list_empty(&r->evt_list))) { + ret = -EPERM; + goto out_destroy; + } + + priv.u.rid = r->rid; + priv.u.domid = d->id; + list_for_each_entry(mevt, &r->evt_list, list) { + priv.u.evtid = mevt->evtid; + ret = mon_addfile(kn, mevt->name, priv.priv); + if (ret) + goto out_destroy; + + if (is_mbm_event(mevt->evtid)) + mon_event_read(&rr, d, prgrp, mevt->evtid, true); + } + kernfs_activate(kn); + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} + +/* + * Add all subdirectories of mon_data for "ctrl_mon" groups + * and "monitor" groups with given domain id. + */ +void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + struct rdt_domain *d) +{ + struct kernfs_node *parent_kn; + struct rdtgroup *prgrp, *crgrp; + struct list_head *head; + + if (!r->mon_enabled) + return; + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + parent_kn = prgrp->mon.mon_data_kn; + mkdir_mondata_subdir(parent_kn, d, r, prgrp); + + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + parent_kn = crgrp->mon.mon_data_kn; + mkdir_mondata_subdir(parent_kn, d, r, crgrp); + } + } +} + +static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, + struct rdt_resource *r, + struct rdtgroup *prgrp) +{ + struct rdt_domain *dom; + int ret; + + list_for_each_entry(dom, &r->domains, list) { + ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); + if (ret) + return ret; + } + + return 0; +} + +/* + * This creates a directory mon_data which contains the monitored data. + * + * mon_data has one directory for each domain whic are named + * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data + * with L3 domain looks as below: + * ./mon_data: + * mon_L3_00 + * mon_L3_01 + * mon_L3_02 + * ... + * + * Each domain directory has one file per event: + * ./mon_L3_00/: + * llc_occupancy + * + */ +static int mkdir_mondata_all(struct kernfs_node *parent_kn, + struct rdtgroup *prgrp, + struct kernfs_node **dest_kn) +{ + struct rdt_resource *r; + struct kernfs_node *kn; + int ret; + + /* + * Create the mon_data directory first. + */ + ret = mongroup_create_dir(parent_kn, NULL, "mon_data", &kn); + if (ret) + return ret; + + if (dest_kn) + *dest_kn = kn; + + /* + * Create the subdirectories for each domain. 
Note that all events + * in a domain like L3 are grouped into a resource whose domain is L3 + */ + for_each_mon_enabled_rdt_resource(r) { + ret = mkdir_mondata_subdir_alldom(kn, r, prgrp); + if (ret) + goto out_destroy; + } + + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} + +static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, + struct kernfs_node *prgrp_kn, + const char *name, umode_t mode, + enum rdt_group_type rtype, struct rdtgroup **r) +{ + struct rdtgroup *prdtgrp, *rdtgrp; + struct kernfs_node *kn; + uint files = 0; + int ret; + + prdtgrp = rdtgroup_kn_lock_live(prgrp_kn); + if (!prdtgrp) { + ret = -ENODEV; goto out_unlock; - closid = ret; + } /* allocate the rdtgroup. */ rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); if (!rdtgrp) { ret = -ENOSPC; - goto out_closid_free; + goto out_unlock; } - rdtgrp->closid = closid; - list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); + *r = rdtgrp; + rdtgrp->mon.parent = prdtgrp; + rdtgrp->type = rtype; + INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list); /* kernfs creates the directory for rdtgrp */ - kn = kernfs_create_dir(parent->kn, name, mode, rdtgrp); + kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp); if (IS_ERR(kn)) { ret = PTR_ERR(kn); - goto out_cancel_ref; + goto out_free_rgrp; } rdtgrp->kn = kn; @@ -1056,43 +1560,211 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (ret) goto out_destroy; - ret = rdtgroup_add_files(kn, rdtgroup_base_files, - ARRAY_SIZE(rdtgroup_base_files)); + files = RFTYPE_BASE | RFTYPE_CTRL; + files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype); + ret = rdtgroup_add_files(kn, files); if (ret) goto out_destroy; + if (rdt_mon_capable) { + ret = alloc_rmid(); + if (ret < 0) + goto out_destroy; + rdtgrp->mon.rmid = ret; + + ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn); + if (ret) + goto out_idfree; + } kernfs_activate(kn); - ret = 0; - goto out_unlock; + /* + * The caller unlocks the prgrp_kn upon success. + */ + return 0; +out_idfree: + free_rmid(rdtgrp->mon.rmid); out_destroy: kernfs_remove(rdtgrp->kn); -out_cancel_ref: - list_del(&rdtgrp->rdtgroup_list); +out_free_rgrp: kfree(rdtgrp); -out_closid_free: - closid_free(closid); out_unlock: - rdtgroup_kn_unlock(parent_kn); + rdtgroup_kn_unlock(prgrp_kn); return ret; } -static int rdtgroup_rmdir(struct kernfs_node *kn) +static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) +{ + kernfs_remove(rgrp->kn); + free_rmid(rgrp->mon.rmid); + kfree(rgrp); +} + +/* + * Create a monitor group under "mon_groups" directory of a control + * and monitor group(ctrl_mon). This is a resource group + * to monitor a subset of tasks and cpus in its parent ctrl_mon group. + */ +static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, + struct kernfs_node *prgrp_kn, + const char *name, + umode_t mode) +{ + struct rdtgroup *rdtgrp, *prgrp; + int ret; + + ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTMON_GROUP, + &rdtgrp); + if (ret) + return ret; + + prgrp = rdtgrp->mon.parent; + rdtgrp->closid = prgrp->closid; + + /* + * Add the rdtgrp to the list of rdtgrps the parent + * ctrl_mon group has to track. + */ + list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list); + + rdtgroup_kn_unlock(prgrp_kn); + return ret; +} + +/* + * These are rdtgroups created under the root directory. Can be used + * to allocate and monitor resources. 
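[Editorial aside] mkdir_mondata_all() above creates one mon_<resource>_<domain id> directory per domain, each containing one file per event (for example llc_occupancy, as in the layout comment). A minimal sketch of how such a file is read from userspace, not part of the patch; the /sys/fs/resctrl mount point and the presence of an L3 domain 0 are assumptions about the local system.

#include <stdio.h>

/*
 * Sketch only: read one event file created by mkdir_mondata_subdir().
 * The path follows the mon_<resource>_<domid> naming described above.
 */
int main(void)
{
    const char *path = "/sys/fs/resctrl/mon_data/mon_L3_00/llc_occupancy";
    unsigned long long bytes;
    FILE *f = fopen(path, "r");

    if (!f) {
        perror(path);
        return 1;
    }
    if (fscanf(f, "%llu", &bytes) == 1)
        printf("LLC occupancy: %llu bytes\n", bytes);
    fclose(f);
    return 0;
}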
+ */ +static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, + struct kernfs_node *prgrp_kn, + const char *name, umode_t mode) { - int ret, cpu, closid = rdtgroup_default.closid; struct rdtgroup *rdtgrp; - cpumask_var_t tmpmask; + struct kernfs_node *kn; + u32 closid; + int ret; - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) - return -ENOMEM; + ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTCTRL_GROUP, + &rdtgrp); + if (ret) + return ret; - rdtgrp = rdtgroup_kn_lock_live(kn); - if (!rdtgrp) { - ret = -EPERM; - goto out; + kn = rdtgrp->kn; + ret = closid_alloc(); + if (ret < 0) + goto out_common_fail; + closid = ret; + + rdtgrp->closid = closid; + list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); + + if (rdt_mon_capable) { + /* + * Create an empty mon_groups directory to hold the subset + * of tasks and cpus to monitor. + */ + ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL); + if (ret) + goto out_id_free; } + goto out_unlock; + +out_id_free: + closid_free(closid); + list_del(&rdtgrp->rdtgroup_list); +out_common_fail: + mkdir_rdt_prepare_clean(rdtgrp); +out_unlock: + rdtgroup_kn_unlock(prgrp_kn); + return ret; +} + +/* + * We allow creating mon groups only with in a directory called "mon_groups" + * which is present in every ctrl_mon group. Check if this is a valid + * "mon_groups" directory. + * + * 1. The directory should be named "mon_groups". + * 2. The mon group itself should "not" be named "mon_groups". + * This makes sure "mon_groups" directory always has a ctrl_mon group + * as parent. + */ +static bool is_mon_groups(struct kernfs_node *kn, const char *name) +{ + return (!strcmp(kn->name, "mon_groups") && + strcmp(name, "mon_groups")); +} + +static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) +{ + /* Do not accept '\n' to avoid unparsable situation. */ + if (strchr(name, '\n')) + return -EINVAL; + + /* + * If the parent directory is the root directory and RDT + * allocation is supported, add a control and monitoring + * subdirectory + */ + if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn) + return rdtgroup_mkdir_ctrl_mon(parent_kn, parent_kn, name, mode); + + /* + * If RDT monitoring is supported and the parent directory is a valid + * "mon_groups" directory, add a monitoring subdirectory. + */ + if (rdt_mon_capable && is_mon_groups(parent_kn, name)) + return rdtgroup_mkdir_mon(parent_kn, parent_kn->parent, name, mode); + + return -EPERM; +} + +static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp, + cpumask_var_t tmpmask) +{ + struct rdtgroup *prdtgrp = rdtgrp->mon.parent; + int cpu; + + /* Give any tasks back to the parent group */ + rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask); + + /* Update per cpu rmid of the moved CPUs first */ + for_each_cpu(cpu, &rdtgrp->cpu_mask) + per_cpu(pqr_state.default_rmid, cpu) = prdtgrp->mon.rmid; + /* + * Update the MSR on moved CPUs and CPUs which have moved + * task running on them. 
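[Editorial aside] The reworked rdtgroup_mkdir() above dispatches on where the mkdir happens: directly under the resctrl root it creates a CTRL_MON group, and under a group's "mon_groups" directory it creates a MON group. A minimal userspace sketch of the two cases, not part of the patch; the /sys/fs/resctrl mount point and the group names are illustrative assumptions.

#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>

/*
 * Sketch only: the two mkdir() calls map onto the two branches of
 * rdtgroup_mkdir() - rdtgroup_mkdir_ctrl_mon() and rdtgroup_mkdir_mon().
 */
int main(void)
{
    if (mkdir("/sys/fs/resctrl/grp1", 0755))
        perror("create ctrl_mon group");

    if (mkdir("/sys/fs/resctrl/grp1/mon_groups/mon1", 0755))
        perror("create mon group");

    return 0;
}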
+ */ + cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); + update_closid_rmid(tmpmask, NULL); + + rdtgrp->flags = RDT_DELETED; + free_rmid(rdtgrp->mon.rmid); + + /* + * Remove the rdtgrp from the parent ctrl_mon group's list + */ + WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); + list_del(&rdtgrp->mon.crdtgrp_list); + + /* + * one extra hold on this, will drop when we kfree(rdtgrp) + * in rdtgroup_kn_unlock() + */ + kernfs_get(kn); + kernfs_remove(rdtgrp->kn); + + return 0; +} + +static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, + cpumask_var_t tmpmask) +{ + int cpu; + /* Give any tasks back to the default group */ rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask); @@ -1100,18 +1772,28 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) cpumask_or(&rdtgroup_default.cpu_mask, &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); - /* Update per cpu closid of the moved CPUs first */ - for_each_cpu(cpu, &rdtgrp->cpu_mask) - per_cpu(cpu_closid, cpu) = closid; + /* Update per cpu closid and rmid of the moved CPUs first */ + for_each_cpu(cpu, &rdtgrp->cpu_mask) { + per_cpu(pqr_state.default_closid, cpu) = rdtgroup_default.closid; + per_cpu(pqr_state.default_rmid, cpu) = rdtgroup_default.mon.rmid; + } + /* * Update the MSR on moved CPUs and CPUs which have moved * task running on them. */ cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); - rdt_update_closid(tmpmask, NULL); + update_closid_rmid(tmpmask, NULL); rdtgrp->flags = RDT_DELETED; closid_free(rdtgrp->closid); + free_rmid(rdtgrp->mon.rmid); + + /* + * Free all the child monitor group rmids. + */ + free_all_child_rdtgrp(rdtgrp); + list_del(&rdtgrp->rdtgroup_list); /* @@ -1120,7 +1802,41 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) */ kernfs_get(kn); kernfs_remove(rdtgrp->kn); - ret = 0; + + return 0; +} + +static int rdtgroup_rmdir(struct kernfs_node *kn) +{ + struct kernfs_node *parent_kn = kn->parent; + struct rdtgroup *rdtgrp; + cpumask_var_t tmpmask; + int ret = 0; + + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; + + rdtgrp = rdtgroup_kn_lock_live(kn); + if (!rdtgrp) { + ret = -EPERM; + goto out; + } + + /* + * If the rdtgroup is a ctrl_mon group and parent directory + * is the root directory, remove the ctrl_mon group. + * + * If the rdtgroup is a mon group and parent directory + * is a valid "mon_groups" directory, remove the mon group. 
+ */ + if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn) + ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask); + else if (rdtgrp->type == RDTMON_GROUP && + is_mon_groups(parent_kn, kn->name)) + ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask); + else + ret = -EPERM; + out: rdtgroup_kn_unlock(kn); free_cpumask_var(tmpmask); @@ -1129,7 +1845,7 @@ out: static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) { - if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) + if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) seq_puts(seq, ",cdp"); return 0; } @@ -1153,10 +1869,13 @@ static int __init rdtgroup_setup_root(void) mutex_lock(&rdtgroup_mutex); rdtgroup_default.closid = 0; + rdtgroup_default.mon.rmid = 0; + rdtgroup_default.type = RDTCTRL_GROUP; + INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list); + list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); - ret = rdtgroup_add_files(rdt_root->kn, rdtgroup_base_files, - ARRAY_SIZE(rdtgroup_base_files)); + ret = rdtgroup_add_files(rdt_root->kn, RF_CTRL_BASE); if (ret) { kernfs_destroy_root(rdt_root); goto out; diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index e42117d5f4d7..3fed38812eea 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <asm/cpu_device_id.h> #include <asm/cpufeature.h> #include <linux/cpu.h> diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 43051f0777d4..bcc7c54c7041 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 obj-y = mce.o mce-severity.o mce-genpool.o obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o diff --git a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c index 9c632cb88546..7f85b76f43bc 100644 --- a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c +++ b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c @@ -17,19 +17,13 @@ #include "mce-internal.h" +static BLOCKING_NOTIFIER_HEAD(mce_injector_chain); + static DEFINE_MUTEX(mce_chrdev_read_mutex); static char mce_helper[128]; static char *mce_helper_argv[2] = { mce_helper, NULL }; -#define mce_log_get_idx_check(p) \ -({ \ - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ - !lockdep_is_held(&mce_chrdev_read_mutex), \ - "suspicious mce_log_get_idx_check() usage"); \ - smp_load_acquire(&(p)); \ -}) - /* * Lockless MCE logging infrastructure. * This avoids deadlocks on printk locks without having to break locks. Also @@ -51,43 +45,32 @@ static int dev_mce_log(struct notifier_block *nb, unsigned long val, void *data) { struct mce *mce = (struct mce *)data; - unsigned int next, entry; - - wmb(); - for (;;) { - entry = mce_log_get_idx_check(mcelog.next); - for (;;) { - - /* - * When the buffer fills up discard new entries. - * Assume that the earlier errors are the more - * interesting ones: - */ - if (entry >= MCE_LOG_LEN) { - set_bit(MCE_OVERFLOW, - (unsigned long *)&mcelog.flags); - return NOTIFY_OK; - } - /* Old left over entry. Skip: */ - if (mcelog.entry[entry].finished) { - entry++; - continue; - } - break; - } - smp_rmb(); - next = entry + 1; - if (cmpxchg(&mcelog.next, entry, next) == entry) - break; + unsigned int entry; + + mutex_lock(&mce_chrdev_read_mutex); + + entry = mcelog.next; + + /* + * When the buffer fills up discard new entries. 
Assume that the + * earlier errors are the more interesting ones: + */ + if (entry >= MCE_LOG_LEN) { + set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags); + goto unlock; } + + mcelog.next = entry + 1; + memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); - wmb(); mcelog.entry[entry].finished = 1; - wmb(); /* wake processes polling /dev/mcelog */ wake_up_interruptible(&mce_chrdev_wait); +unlock: + mutex_unlock(&mce_chrdev_read_mutex); + return NOTIFY_OK; } @@ -175,13 +158,6 @@ static int mce_chrdev_release(struct inode *inode, struct file *file) return 0; } -static void collect_tscs(void *data) -{ - unsigned long *cpu_tsc = (unsigned long *)data; - - cpu_tsc[smp_processor_id()] = rdtsc(); -} - static int mce_apei_read_done; /* Collect MCE record of previous boot in persistent storage via APEI ERST. */ @@ -229,14 +205,9 @@ static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off) { char __user *buf = ubuf; - unsigned long *cpu_tsc; - unsigned prev, next; + unsigned next; int i, err; - cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); - if (!cpu_tsc) - return -ENOMEM; - mutex_lock(&mce_chrdev_read_mutex); if (!mce_apei_read_done) { @@ -245,65 +216,29 @@ static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf, goto out; } - next = mce_log_get_idx_check(mcelog.next); - /* Only supports full reads right now */ err = -EINVAL; if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) goto out; + next = mcelog.next; err = 0; - prev = 0; - do { - for (i = prev; i < next; i++) { - unsigned long start = jiffies; - struct mce *m = &mcelog.entry[i]; - - while (!m->finished) { - if (time_after_eq(jiffies, start + 2)) { - memset(m, 0, sizeof(*m)); - goto timeout; - } - cpu_relax(); - } - smp_rmb(); - err |= copy_to_user(buf, m, sizeof(*m)); - buf += sizeof(*m); -timeout: - ; - } - - memset(mcelog.entry + prev, 0, - (next - prev) * sizeof(struct mce)); - prev = next; - next = cmpxchg(&mcelog.next, prev, 0); - } while (next != prev); - - synchronize_sched(); - /* - * Collect entries that were still getting written before the - * synchronize. - */ - on_each_cpu(collect_tscs, cpu_tsc, 1); - - for (i = next; i < MCE_LOG_LEN; i++) { + for (i = 0; i < next; i++) { struct mce *m = &mcelog.entry[i]; - if (m->finished && m->tsc < cpu_tsc[m->cpu]) { - err |= copy_to_user(buf, m, sizeof(*m)); - smp_rmb(); - buf += sizeof(*m); - memset(m, 0, sizeof(*m)); - } + err |= copy_to_user(buf, m, sizeof(*m)); + buf += sizeof(*m); } + memset(mcelog.entry, 0, next * sizeof(struct mce)); + mcelog.next = 0; + if (err) err = -EFAULT; out: mutex_unlock(&mce_chrdev_read_mutex); - kfree(cpu_tsc); return err ? 
err : buf - ubuf; } @@ -345,24 +280,49 @@ static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, } } -static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf, - size_t usize, loff_t *off); +void mce_register_injector_chain(struct notifier_block *nb) +{ + blocking_notifier_chain_register(&mce_injector_chain, nb); +} +EXPORT_SYMBOL_GPL(mce_register_injector_chain); -void register_mce_write_callback(ssize_t (*fn)(struct file *filp, - const char __user *ubuf, - size_t usize, loff_t *off)) +void mce_unregister_injector_chain(struct notifier_block *nb) { - mce_write = fn; + blocking_notifier_chain_unregister(&mce_injector_chain, nb); } -EXPORT_SYMBOL_GPL(register_mce_write_callback); +EXPORT_SYMBOL_GPL(mce_unregister_injector_chain); static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, size_t usize, loff_t *off) { - if (mce_write) - return mce_write(filp, ubuf, usize, off); - else + struct mce m; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* + * There are some cases where real MSR reads could slip + * through. + */ + if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA)) + return -EIO; + + if ((unsigned long)usize > sizeof(struct mce)) + usize = sizeof(struct mce); + if (copy_from_user(&m, ubuf, usize)) + return -EFAULT; + + if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu)) return -EINVAL; + + /* + * Need to give user space some time to set everything up, + * so do it a jiffie or two later everywhere. + */ + schedule_timeout(2); + + blocking_notifier_call_chain(&mce_injector_chain, 0, &m); + + return usize; } static const struct file_operations mce_chrdev_ops = { @@ -388,9 +348,15 @@ static __init int dev_mcelog_init_device(void) /* register character device /dev/mcelog */ err = misc_register(&mce_chrdev_device); if (err) { - pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err); + if (err == -EBUSY) + /* Xen dom0 might have registered the device already. */ + pr_info("Unable to init device /dev/mcelog, already registered"); + else + pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err); + return err; } + mce_register_decode_chain(&dev_mcelog_nb); return 0; } diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 99165b206df3..231ad23b24a9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -10,23 +10,105 @@ * Authors: * Andi Kleen * Ying Huang + * + * The AMD part (from mce_amd_inj.c): a simple MCE injection facility + * for testing different aspects of the RAS code. This driver should be + * built as module so that it can be loaded on production kernels for + * testing purposes. + * + * This file may be distributed under the terms of the GNU General Public + * License version 2. + * + * Copyright (c) 2010-17: Borislav Petkov <bp@alien8.de> + * Advanced Micro Devices Inc. 
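[Editorial aside] The mce_chrdev_write() rework in the dev-mcelog.c hunk above now copies a struct mce from userspace (at most sizeof(struct mce) bytes, CAP_SYS_ADMIN required) and hands it to the new mce_injector_chain, where the mce-inject module below registers. A minimal sketch of that write path, not part of the patch; it assumes the userspace-visible struct mce from the exported <asm/mce.h> UAPI header, and the field values are made-up examples.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <asm/mce.h>    /* assumed: UAPI definition of struct mce */

int main(void)
{
    struct mce m;
    int fd;

    memset(&m, 0, sizeof(m));
    m.status = 0x9400000000000000ULL;   /* VAL|EN|ADDRV, example only */
    m.addr   = 0x1234000;
    m.bank   = 1;
    m.extcpu = 0;                       /* inject on CPU 0 */

    fd = open("/dev/mcelog", O_WRONLY);
    if (fd < 0) {
        perror("/dev/mcelog");
        return 1;
    }
    if (write(fd, &m, sizeof(m)) < 0)
        perror("write");
    close(fd);
    return 0;
}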
*/ -#include <linux/uaccess.h> -#include <linux/module.h> -#include <linux/timer.h> + +#include <linux/cpu.h> +#include <linux/debugfs.h> #include <linux/kernel.h> -#include <linux/string.h> -#include <linux/fs.h> -#include <linux/preempt.h> -#include <linux/smp.h> +#include <linux/module.h> #include <linux/notifier.h> -#include <linux/kdebug.h> -#include <linux/cpu.h> -#include <linux/sched.h> -#include <linux/gfp.h> -#include <asm/mce.h> +#include <linux/pci.h> +#include <linux/uaccess.h> + +#include <asm/amd_nb.h> #include <asm/apic.h> +#include <asm/irq_vectors.h> +#include <asm/mce.h> #include <asm/nmi.h> +#include <asm/smp.h> + +#include "mce-internal.h" + +/* + * Collect all the MCi_XXX settings + */ +static struct mce i_mce; +static struct dentry *dfs_inj; + +static u8 n_banks; + +#define MAX_FLAG_OPT_SIZE 3 +#define NBCFG 0x44 + +enum injection_type { + SW_INJ = 0, /* SW injection, simply decode the error */ + HW_INJ, /* Trigger a #MC */ + DFR_INT_INJ, /* Trigger Deferred error interrupt */ + THR_INT_INJ, /* Trigger threshold interrupt */ + N_INJ_TYPES, +}; + +static const char * const flags_options[] = { + [SW_INJ] = "sw", + [HW_INJ] = "hw", + [DFR_INT_INJ] = "df", + [THR_INT_INJ] = "th", + NULL +}; + +/* Set default injection to SW_INJ */ +static enum injection_type inj_type = SW_INJ; + +#define MCE_INJECT_SET(reg) \ +static int inj_##reg##_set(void *data, u64 val) \ +{ \ + struct mce *m = (struct mce *)data; \ + \ + m->reg = val; \ + return 0; \ +} + +MCE_INJECT_SET(status); +MCE_INJECT_SET(misc); +MCE_INJECT_SET(addr); +MCE_INJECT_SET(synd); + +#define MCE_INJECT_GET(reg) \ +static int inj_##reg##_get(void *data, u64 *val) \ +{ \ + struct mce *m = (struct mce *)data; \ + \ + *val = m->reg; \ + return 0; \ +} + +MCE_INJECT_GET(status); +MCE_INJECT_GET(misc); +MCE_INJECT_GET(addr); +MCE_INJECT_GET(synd); + +DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n"); + +static void setup_inj_struct(struct mce *m) +{ + memset(m, 0, sizeof(struct mce)); + + m->cpuvendor = boot_cpu_data.x86_vendor; +} /* Update fake mce registers on current CPU. */ static void inject_mce(struct mce *m) @@ -143,7 +225,7 @@ static int raise_local(void) return ret; } -static void raise_mce(struct mce *m) +static void __maybe_unused raise_mce(struct mce *m) { int context = MCJ_CTX(m->inject_flags); @@ -198,55 +280,454 @@ static void raise_mce(struct mce *m) } } -/* Error injection interface */ -static ssize_t mce_write(struct file *filp, const char __user *ubuf, - size_t usize, loff_t *off) +static int mce_inject_raise(struct notifier_block *nb, unsigned long val, + void *data) { - struct mce m; + struct mce *m = (struct mce *)data; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - /* - * There are some cases where real MSR reads could slip - * through. - */ - if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA)) - return -EIO; + if (!m) + return NOTIFY_DONE; + + mutex_lock(&mce_inject_mutex); + raise_mce(m); + mutex_unlock(&mce_inject_mutex); + + return NOTIFY_DONE; +} + +static struct notifier_block inject_nb = { + .notifier_call = mce_inject_raise, +}; + +/* + * Caller needs to be make sure this cpu doesn't disappear + * from under us, i.e.: get_cpu/put_cpu. 
+ */ +static int toggle_hw_mce_inject(unsigned int cpu, bool enable) +{ + u32 l, h; + int err; + + err = rdmsr_on_cpu(cpu, MSR_K7_HWCR, &l, &h); + if (err) { + pr_err("%s: error reading HWCR\n", __func__); + return err; + } + + enable ? (l |= BIT(18)) : (l &= ~BIT(18)); + + err = wrmsr_on_cpu(cpu, MSR_K7_HWCR, l, h); + if (err) + pr_err("%s: error writing HWCR\n", __func__); - if ((unsigned long)usize > sizeof(struct mce)) - usize = sizeof(struct mce); - if (copy_from_user(&m, ubuf, usize)) + return err; +} + +static int __set_inj(const char *buf) +{ + int i; + + for (i = 0; i < N_INJ_TYPES; i++) { + if (!strncmp(flags_options[i], buf, strlen(flags_options[i]))) { + inj_type = i; + return 0; + } + } + return -EINVAL; +} + +static ssize_t flags_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[MAX_FLAG_OPT_SIZE]; + int n; + + n = sprintf(buf, "%s\n", flags_options[inj_type]); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, n); +} + +static ssize_t flags_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[MAX_FLAG_OPT_SIZE], *__buf; + int err; + + if (cnt > MAX_FLAG_OPT_SIZE) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) return -EFAULT; - if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu)) + buf[cnt - 1] = 0; + + /* strip whitespace */ + __buf = strstrip(buf); + + err = __set_inj(__buf); + if (err) { + pr_err("%s: Invalid flags value: %s\n", __func__, __buf); + return err; + } + + *ppos += cnt; + + return cnt; +} + +static const struct file_operations flags_fops = { + .read = flags_read, + .write = flags_write, + .llseek = generic_file_llseek, +}; + +/* + * On which CPU to inject? + */ +MCE_INJECT_GET(extcpu); + +static int inj_extcpu_set(void *data, u64 val) +{ + struct mce *m = (struct mce *)data; + + if (val >= nr_cpu_ids || !cpu_online(val)) { + pr_err("%s: Invalid CPU: %llu\n", __func__, val); return -EINVAL; + } + m->extcpu = val; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(extcpu_fops, inj_extcpu_get, inj_extcpu_set, "%llu\n"); + +static void trigger_mce(void *info) +{ + asm volatile("int $18"); +} + +static void trigger_dfr_int(void *info) +{ + asm volatile("int %0" :: "i" (DEFERRED_ERROR_VECTOR)); +} + +static void trigger_thr_int(void *info) +{ + asm volatile("int %0" :: "i" (THRESHOLD_APIC_VECTOR)); +} + +static u32 get_nbc_for_node(int node_id) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + u32 cores_per_node; + + cores_per_node = (c->x86_max_cores * smp_num_siblings) / amd_get_nodes_per_socket(); + + return cores_per_node * node_id; +} + +static void toggle_nb_mca_mst_cpu(u16 nid) +{ + struct amd_northbridge *nb; + struct pci_dev *F3; + u32 val; + int err; + + nb = node_to_amd_nb(nid); + if (!nb) + return; + + F3 = nb->misc; + if (!F3) + return; + + err = pci_read_config_dword(F3, NBCFG, &val); + if (err) { + pr_err("%s: Error reading F%dx%03x.\n", + __func__, PCI_FUNC(F3->devfn), NBCFG); + return; + } + + if (val & BIT(27)) + return; + + pr_err("%s: Set D18F3x44[NbMcaToMstCpuEn] which BIOS hasn't done.\n", + __func__); + + val |= BIT(27); + err = pci_write_config_dword(F3, NBCFG, val); + if (err) + pr_err("%s: Error writing F%dx%03x.\n", + __func__, PCI_FUNC(F3->devfn), NBCFG); +} + +static void prepare_msrs(void *info) +{ + struct mce m = *(struct mce *)info; + u8 b = m.bank; + + wrmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + + if (boot_cpu_has(X86_FEATURE_SMCA)) { + if (m.inject_flags == DFR_INT_INJ) { + wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status); + 
wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr); + } else { + wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), m.status); + wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr); + } + + wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc); + wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd); + } else { + wrmsrl(MSR_IA32_MCx_STATUS(b), m.status); + wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr); + wrmsrl(MSR_IA32_MCx_MISC(b), m.misc); + } +} + +static void do_inject(void) +{ + u64 mcg_status = 0; + unsigned int cpu = i_mce.extcpu; + u8 b = i_mce.bank; + + rdtscll(i_mce.tsc); + + if (i_mce.misc) + i_mce.status |= MCI_STATUS_MISCV; + + if (i_mce.synd) + i_mce.status |= MCI_STATUS_SYNDV; + + if (inj_type == SW_INJ) { + mce_inject_log(&i_mce); + return; + } + + /* prep MCE global settings for the injection */ + mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV; + + if (!(i_mce.status & MCI_STATUS_PCC)) + mcg_status |= MCG_STATUS_RIPV; /* - * Need to give user space some time to set everything up, - * so do it a jiffie or two later everywhere. + * Ensure necessary status bits for deferred errors: + * - MCx_STATUS[Deferred]: make sure it is a deferred error + * - MCx_STATUS[UC] cleared: deferred errors are _not_ UC */ - schedule_timeout(2); + if (inj_type == DFR_INT_INJ) { + i_mce.status |= MCI_STATUS_DEFERRED; + i_mce.status |= (i_mce.status & ~MCI_STATUS_UC); + } - mutex_lock(&mce_inject_mutex); - raise_mce(&m); - mutex_unlock(&mce_inject_mutex); - return usize; + /* + * For multi node CPUs, logging and reporting of bank 4 errors happens + * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for + * Fam10h and later BKDGs. + */ + if (static_cpu_has(X86_FEATURE_AMD_DCM) && + b == 4 && + boot_cpu_data.x86 < 0x17) { + toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu)); + cpu = get_nbc_for_node(amd_get_nb_id(cpu)); + } + + get_online_cpus(); + if (!cpu_online(cpu)) + goto err; + + toggle_hw_mce_inject(cpu, true); + + i_mce.mcgstatus = mcg_status; + i_mce.inject_flags = inj_type; + smp_call_function_single(cpu, prepare_msrs, &i_mce, 0); + + toggle_hw_mce_inject(cpu, false); + + switch (inj_type) { + case DFR_INT_INJ: + smp_call_function_single(cpu, trigger_dfr_int, NULL, 0); + break; + case THR_INT_INJ: + smp_call_function_single(cpu, trigger_thr_int, NULL, 0); + break; + default: + smp_call_function_single(cpu, trigger_mce, NULL, 0); + } + +err: + put_online_cpus(); + +} + +/* + * This denotes into which bank we're injecting and triggers + * the injection, at the same time. + */ +static int inj_bank_set(void *data, u64 val) +{ + struct mce *m = (struct mce *)data; + + if (val >= n_banks) { + pr_err("Non-existent MCE bank: %llu\n", val); + return -EINVAL; + } + + m->bank = val; + do_inject(); + + return 0; +} + +MCE_INJECT_GET(bank); + +DEFINE_SIMPLE_ATTRIBUTE(bank_fops, inj_bank_get, inj_bank_set, "%llu\n"); + +static const char readme_msg[] = +"Description of the files and their usages:\n" +"\n" +"Note1: i refers to the bank number below.\n" +"Note2: See respective BKDGs for the exact bit definitions of the files below\n" +"as they mirror the hardware registers.\n" +"\n" +"status:\t Set MCi_STATUS: the bits in that MSR control the error type and\n" +"\t attributes of the error which caused the MCE.\n" +"\n" +"misc:\t Set MCi_MISC: provide auxiliary info about the error. It is mostly\n" +"\t used for error thresholding purposes and its validity is indicated by\n" +"\t MCi_STATUS[MiscV].\n" +"\n" +"synd:\t Set MCi_SYND: provide syndrome info about the error. 
Only valid on\n" +"\t Scalable MCA systems, and its validity is indicated by MCi_STATUS[SyndV].\n" +"\n" +"addr:\t Error address value to be written to MCi_ADDR. Log address information\n" +"\t associated with the error.\n" +"\n" +"cpu:\t The CPU to inject the error on.\n" +"\n" +"bank:\t Specify the bank you want to inject the error into: the number of\n" +"\t banks in a processor varies and is family/model-specific, therefore, the\n" +"\t supplied value is sanity-checked. Setting the bank value also triggers the\n" +"\t injection.\n" +"\n" +"flags:\t Injection type to be performed. Writing to this file will trigger a\n" +"\t real machine check, an APIC interrupt or invoke the error decoder routines\n" +"\t for AMD processors.\n" +"\n" +"\t Allowed error injection types:\n" +"\t - \"sw\": Software error injection. Decode error to a human-readable \n" +"\t format only. Safe to use.\n" +"\t - \"hw\": Hardware error injection. Causes the #MC exception handler to \n" +"\t handle the error. Be warned: might cause system panic if MCi_STATUS[PCC] \n" +"\t is set. Therefore, consider setting (debugfs_mountpoint)/mce/fake_panic \n" +"\t before injecting.\n" +"\t - \"df\": Trigger APIC interrupt for Deferred error. Causes deferred \n" +"\t error APIC interrupt handler to handle the error if the feature is \n" +"\t is present in hardware. \n" +"\t - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n" +"\t APIC interrupt handler to handle the error. \n" +"\n"; + +static ssize_t +inj_readme_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_read_from_buffer(ubuf, cnt, ppos, + readme_msg, strlen(readme_msg)); +} + +static const struct file_operations readme_fops = { + .read = inj_readme_read, +}; + +static struct dfs_node { + char *name; + struct dentry *d; + const struct file_operations *fops; + umode_t perm; +} dfs_fls[] = { + { .name = "status", .fops = &status_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "synd", .fops = &synd_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "README", .fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH }, +}; + +static int __init debugfs_init(void) +{ + unsigned int i; + u64 cap; + + rdmsrl(MSR_IA32_MCG_CAP, cap); + n_banks = cap & MCG_BANKCNT_MASK; + + dfs_inj = debugfs_create_dir("mce-inject", NULL); + if (!dfs_inj) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) { + dfs_fls[i].d = debugfs_create_file(dfs_fls[i].name, + dfs_fls[i].perm, + dfs_inj, + &i_mce, + dfs_fls[i].fops); + + if (!dfs_fls[i].d) + goto err_dfs_add; + } + + return 0; + +err_dfs_add: + while (i-- > 0) + debugfs_remove(dfs_fls[i].d); + + debugfs_remove(dfs_inj); + dfs_inj = NULL; + + return -ENODEV; } -static int inject_init(void) +static int __init inject_init(void) { + int err; + if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL)) return -ENOMEM; + + err = debugfs_init(); + if (err) { + free_cpumask_var(mce_inject_cpumask); + return err; + } + + register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify"); + mce_register_injector_chain(&inject_nb); + + setup_inj_struct(&i_mce); + pr_info("Machine check injector initialized\n"); - 
register_mce_write_callback(mce_write); - register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, - "mce_notify"); + return 0; } +static void __exit inject_exit(void) +{ + + mce_unregister_injector_chain(&inject_nb); + unregister_nmi_handler(NMI_LOCAL, "mce_notify"); + + debugfs_remove_recursive(dfs_inj); + dfs_inj = NULL; + + memset(&dfs_fls, 0, sizeof(dfs_fls)); + + free_cpumask_var(mce_inject_cpumask); +} + module_init(inject_init); -/* - * Cannot tolerate unloading currently because we cannot - * guarantee all openers of mce_chrdev will get a reference to us. - */ +module_exit(inject_exit); MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 654ad0668d72..aa0d5df9dc60 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -1,3 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __X86_MCE_INTERNAL_H__ +#define __X86_MCE_INTERNAL_H__ + #include <linux/device.h> #include <asm/mce.h> @@ -100,7 +104,15 @@ static inline bool mce_cmp(struct mce *m1, struct mce *m2) extern struct device_attribute dev_attr_trigger; #ifdef CONFIG_X86_MCELOG_LEGACY -extern void mce_work_trigger(void); +void mce_work_trigger(void); +void mce_register_injector_chain(struct notifier_block *nb); +void mce_unregister_injector_chain(struct notifier_block *nb); #else static inline void mce_work_trigger(void) { } +static inline void mce_register_injector_chain(struct notifier_block *nb) { } +static inline void mce_unregister_injector_chain(struct notifier_block *nb) { } #endif + +extern struct mca_config mca_cfg; + +#endif /* __X86_MCE_INTERNAL_H__ */ diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5cfbaeb6529a..3b413065c613 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -51,6 +51,7 @@ #include <asm/mce.h> #include <asm/msr.h> #include <asm/reboot.h> +#include <asm/set_memory.h> #include "mce-internal.h" @@ -673,7 +674,6 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) { bool error_seen = false; struct mce m; - int severity; int i; this_cpu_inc(mce_poll_count); @@ -710,11 +710,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) mce_read_aux(&m, i); - severity = mce_severity(&m, mca_cfg.tolerant, NULL, false); - - if (severity == MCE_DEFERRED_SEVERITY && mce_is_memory_error(&m)) - if (m.status & MCI_STATUS_ADDRV) - m.severity = severity; + m.severity = mce_severity(&m, mca_cfg.tolerant, NULL, false); /* * Don't get the IP here because it's unlikely to @@ -1056,6 +1052,48 @@ static int do_memory_failure(struct mce *m) return ret; } +#if defined(arch_unmap_kpfn) && defined(CONFIG_MEMORY_FAILURE) + +void arch_unmap_kpfn(unsigned long pfn) +{ + unsigned long decoy_addr; + + /* + * Unmap this page from the kernel 1:1 mappings to make sure + * we don't log more errors because of speculative access to + * the page. + * We would like to just call: + * set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1); + * but doing that would radically increase the odds of a + * speculative access to the posion page because we'd have + * the virtual address of the kernel 1:1 mapping sitting + * around in registers. + * Instead we get tricky. We create a non-canonical address + * that looks just like the one we want, but has bit 63 flipped. + * This relies on set_memory_np() not checking whether we passed + * a legal address. + */ + +/* + * Build time check to see if we have a spare virtual bit. 
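[Editorial aside] The debugfs interface set up by debugfs_init() in the mce-inject.c hunk above exposes status, misc, addr, synd, bank, flags, cpu and README; per inj_bank_set(), writing "bank" is what triggers do_inject(), so it should be written last. A minimal root-only userspace sketch, not part of the patch; the conventional /sys/kernel/debug mount point is an assumption.

#include <stdio.h>

/* Sketch only: write one mce-inject debugfs file. */
static int inj_write(const char *file, const char *val)
{
    char path[128];
    FILE *f;

    snprintf(path, sizeof(path), "/sys/kernel/debug/mce-inject/%s", file);
    f = fopen(path, "w");
    if (!f) {
        perror(path);
        return -1;
    }
    fprintf(f, "%s\n", val);
    fclose(f);
    return 0;
}

int main(void)
{
    inj_write("flags", "sw");                   /* decode only, safe */
    inj_write("status", "0x9400000000000000");
    inj_write("addr", "0x1234000");
    inj_write("cpu", "0");
    inj_write("bank", "1");                     /* triggers the injection */
    return 0;
}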
Don't want + * to leave this until run time because most developers don't have a + * system that can exercise this code path. This will only become a + * problem if/when we move beyond 5-level page tables. + * + * Hard code "9" here because cpp doesn't grok ilog2(PTRS_PER_PGD) + */ +#if PGDIR_SHIFT + 9 < 63 + decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); +#else +#error "no unused virtual bit available" +#endif + + if (set_memory_np(decoy_addr, 1)) + pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); + +} +#endif + /* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. @@ -1550,7 +1588,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) */ clear_bit(10, (unsigned long *)&mce_banks[4].ctl); } - if (c->x86 < 17 && cfg->bootlog < 0) { + if (c->x86 < 0x11 && cfg->bootlog < 0) { /* * Lots of broken BIOS around that don't clear them * by default and leave crap in there. Don't log: @@ -1832,7 +1870,8 @@ void mce_disable_bank(int bank) * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) * monarchtimeout is how long to wait for other CPUs on machine * check, or 0 to not wait - * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. + * mce=bootlog Log MCEs from before booting. Disabled by default on AMD Fam10h + and older. * mce=nobootlog Don't log MCEs from before booting. * mce=bios_cmci_threshold Don't program the CMCI threshold * mce=recovery force enable memcpy_mcsafe() @@ -1912,12 +1951,13 @@ static void mce_disable_error_reporting(void) static void vendor_disable_error_reporting(void) { /* - * Don't clear on Intel CPUs. Some of these MSRs are socket-wide. + * Don't clear on Intel or AMD CPUs. Some of these MSRs are socket-wide. * Disabling them for just a single offlined CPU is bad, since it will * inhibit reporting for all shared resources on the socket like the * last level cache (LLC), the integrated memory controller (iMC), etc. */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL || + boot_cpu_data.x86_vendor == X86_VENDOR_AMD) return; mce_disable_error_reporting(); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 6e4a047e4b68..486f640b02ef 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -28,6 +28,8 @@ #include <asm/msr.h> #include <asm/trace/irq_vectors.h> +#include "mce-internal.h" + #define NR_BLOCKS 5 #define THRESHOLD_MAX 0xFFF #define INT_TYPE_APIC 0x00020000 @@ -164,17 +166,48 @@ static void default_deferred_error_interrupt(void) } void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; -static void get_smca_bank_info(unsigned int bank) +static void smca_configure(unsigned int bank, unsigned int cpu) { - unsigned int i, hwid_mcatype, cpu = smp_processor_id(); + unsigned int i, hwid_mcatype; struct smca_hwid *s_hwid; - u32 high, instance_id; + u32 high, low; + u32 smca_config = MSR_AMD64_SMCA_MCx_CONFIG(bank); + + /* Set appropriate bits in MCA_CONFIG */ + if (!rdmsr_safe(smca_config, &low, &high)) { + /* + * OS is required to set the MCAX bit to acknowledge that it is + * now using the new MSR ranges and new registers under each + * bank. It also means that the OS will configure deferred + * errors in the new MCx_CONFIG register. If the bit is not set, + * uncorrectable errors will cause a system panic. + * + * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR.) 
+ */ + high |= BIT(0); + + /* + * SMCA sets the Deferred Error Interrupt type per bank. + * + * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us + * if the DeferredIntType bit field is available. + * + * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the + * high portion of the MSR). OS should set this to 0x1 to enable + * APIC based interrupt. First, check that no interrupt has been + * set. + */ + if ((low & BIT(5)) && !((high >> 5) & 0x3)) + high |= BIT(5); + + wrmsr(smca_config, low, high); + } - /* Collect bank_info using CPU 0 for now. */ - if (cpu) + /* Return early if this bank was already initialized. */ + if (smca_banks[bank].hwid) return; - if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instance_id, &high)) { + if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) { pr_warn("Failed to read MCA_IPID for bank %d\n", bank); return; } @@ -185,13 +218,8 @@ static void get_smca_bank_info(unsigned int bank) for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { s_hwid = &smca_hwid_mcatypes[i]; if (hwid_mcatype == s_hwid->hwid_mcatype) { - - WARN(smca_banks[bank].hwid, - "Bank %s already initialized!\n", - smca_get_name(s_hwid->bank_type)); - smca_banks[bank].hwid = s_hwid; - smca_banks[bank].id = instance_id; + smca_banks[bank].id = low; smca_banks[bank].sysfs_id = s_hwid->count++; break; } @@ -433,7 +461,7 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, int offset, u32 misc_high) { unsigned int cpu = smp_processor_id(); - u32 smca_low, smca_high, smca_addr; + u32 smca_low, smca_high; struct threshold_block b; int new; @@ -457,51 +485,6 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, goto set_offset; } - smca_addr = MSR_AMD64_SMCA_MCx_CONFIG(bank); - - if (!rdmsr_safe(smca_addr, &smca_low, &smca_high)) { - /* - * OS is required to set the MCAX bit to acknowledge that it is - * now using the new MSR ranges and new registers under each - * bank. It also means that the OS will configure deferred - * errors in the new MCx_CONFIG register. If the bit is not set, - * uncorrectable errors will cause a system panic. - * - * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR.) - */ - smca_high |= BIT(0); - - /* - * SMCA logs Deferred Error information in MCA_DE{STAT,ADDR} - * registers with the option of additionally logging to - * MCA_{STATUS,ADDR} if MCA_CONFIG[LogDeferredInMcaStat] is set. - * - * This bit is usually set by BIOS to retain the old behavior - * for OSes that don't use the new registers. Linux supports the - * new registers so let's disable that additional logging here. - * - * MCA_CONFIG[LogDeferredInMcaStat] is bit 34 (bit 2 in the high - * portion of the MSR). - */ - smca_high &= ~BIT(2); - - /* - * SMCA sets the Deferred Error Interrupt type per bank. - * - * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us - * if the DeferredIntType bit field is available. - * - * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the - * high portion of the MSR). OS should set this to 0x1 to enable - * APIC based interrupt. First, check that no interrupt has been - * set. 
- */ - if ((smca_low & BIT(5)) && !((smca_high >> 5) & 0x3)) - smca_high |= BIT(5); - - wrmsr(smca_addr, smca_low, smca_high); - } - /* Gather LVT offset for thresholding: */ if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high)) goto out; @@ -530,7 +513,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) for (bank = 0; bank < mca_cfg.banks; ++bank) { if (mce_flags.smca) - get_smca_bank_info(bank); + smca_configure(bank, cpu); for (block = 0; block < NR_BLOCKS; ++block) { address = get_block_address(cpu, address, low, high, bank, block); @@ -755,37 +738,19 @@ out_err: } EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr); -static void -__log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc) +static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) { - u32 msr_status = msr_ops.status(bank); - u32 msr_addr = msr_ops.addr(bank); struct mce m; - u64 status; - - WARN_ON_ONCE(deferred_err && threshold_err); - - if (deferred_err && mce_flags.smca) { - msr_status = MSR_AMD64_SMCA_MCx_DESTAT(bank); - msr_addr = MSR_AMD64_SMCA_MCx_DEADDR(bank); - } - - rdmsrl(msr_status, status); - - if (!(status & MCI_STATUS_VAL)) - return; mce_setup(&m); m.status = status; + m.misc = misc; m.bank = bank; m.tsc = rdtsc(); - if (threshold_err) - m.misc = misc; - if (m.status & MCI_STATUS_ADDRV) { - rdmsrl(msr_addr, m.addr); + m.addr = addr; /* * Extract [55:<lsb>] where lsb is the least significant @@ -806,110 +771,135 @@ __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc) } mce_log(&m); - - wrmsrl(msr_status, 0); } -static inline void __smp_deferred_error_interrupt(void) +asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(void) { + entering_irq(); + trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR); inc_irq_stat(irq_deferred_error_count); deferred_error_int_vector(); + trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR); + exiting_ack_irq(); } -asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(void) +/* + * Returns true if the logged error is deferred. False, otherwise. + */ +static inline bool +_log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc) { - entering_irq(); - __smp_deferred_error_interrupt(); - exiting_ack_irq(); + u64 status, addr = 0; + + rdmsrl(msr_stat, status); + if (!(status & MCI_STATUS_VAL)) + return false; + + if (status & MCI_STATUS_ADDRV) + rdmsrl(msr_addr, addr); + + __log_error(bank, status, addr, misc); + + wrmsrl(msr_stat, 0); + + return status & MCI_STATUS_DEFERRED; } -asmlinkage __visible void __irq_entry smp_trace_deferred_error_interrupt(void) +/* + * We have three scenarios for checking for Deferred errors: + * + * 1) Non-SMCA systems check MCA_STATUS and log error if found. + * 2) SMCA systems check MCA_STATUS. If error is found then log it and also + * clear MCA_DESTAT. + * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and + * log it. + */ +static void log_error_deferred(unsigned int bank) { - entering_irq(); - trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR); - __smp_deferred_error_interrupt(); - trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR); - exiting_ack_irq(); + bool defrd; + + defrd = _log_error_bank(bank, msr_ops.status(bank), + msr_ops.addr(bank), 0); + + if (!mce_flags.smca) + return; + + /* Clear MCA_DESTAT if we logged the deferred error from MCA_STATUS. 
*/ + if (defrd) { + wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0); + return; + } + + /* + * Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check + * for a valid error. + */ + _log_error_bank(bank, MSR_AMD64_SMCA_MCx_DESTAT(bank), + MSR_AMD64_SMCA_MCx_DEADDR(bank), 0); } /* APIC interrupt handler for deferred errors */ static void amd_deferred_error_interrupt(void) { unsigned int bank; - u32 msr_status; - u64 status; - for (bank = 0; bank < mca_cfg.banks; ++bank) { - msr_status = (mce_flags.smca) ? MSR_AMD64_SMCA_MCx_DESTAT(bank) - : msr_ops.status(bank); + for (bank = 0; bank < mca_cfg.banks; ++bank) + log_error_deferred(bank); +} - rdmsrl(msr_status, status); +static void log_error_thresholding(unsigned int bank, u64 misc) +{ + _log_error_bank(bank, msr_ops.status(bank), msr_ops.addr(bank), misc); +} - if (!(status & MCI_STATUS_VAL) || - !(status & MCI_STATUS_DEFERRED)) - continue; +static void log_and_reset_block(struct threshold_block *block) +{ + struct thresh_restart tr; + u32 low = 0, high = 0; - __log_error(bank, true, false, 0); - break; - } -} + if (!block) + return; -/* - * APIC Interrupt Handler - */ + if (rdmsr_safe(block->address, &low, &high)) + return; + + if (!(high & MASK_OVERFLOW_HI)) + return; + + /* Log the MCE which caused the threshold event. */ + log_error_thresholding(block->bank, ((u64)high << 32) | low); + + /* Reset threshold block after logging error. */ + memset(&tr, 0, sizeof(tr)); + tr.b = block; + threshold_restart_bank(&tr); +} /* - * threshold interrupt handler will service THRESHOLD_APIC_VECTOR. - * the interrupt goes off when error_count reaches threshold_limit. - * the handler will simply log mcelog w/ software defined bank number. + * Threshold interrupt handler will service THRESHOLD_APIC_VECTOR. The interrupt + * goes off when error_count reaches threshold_limit. */ - static void amd_threshold_interrupt(void) { - u32 low = 0, high = 0, address = 0; - unsigned int bank, block, cpu = smp_processor_id(); - struct thresh_restart tr; + struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL; + unsigned int bank, cpu = smp_processor_id(); - /* assume first bank caused it */ for (bank = 0; bank < mca_cfg.banks; ++bank) { if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; - for (block = 0; block < NR_BLOCKS; ++block) { - address = get_block_address(cpu, address, low, high, bank, block); - if (!address) - break; - if (rdmsr_safe(address, &low, &high)) - break; - - if (!(high & MASK_VALID_HI)) { - if (block) - continue; - else - break; - } - - if (!(high & MASK_CNTP_HI) || - (high & MASK_LOCKED_HI)) - continue; + first_block = per_cpu(threshold_banks, cpu)[bank]->blocks; + if (!first_block) + continue; - /* - * Log the machine check that caused the threshold - * event. - */ - if (high & MASK_OVERFLOW_HI) - goto log; - } + /* + * The first block is also the head of the list. Check it first + * before iterating over the rest. + */ + log_and_reset_block(first_block); + list_for_each_entry_safe(block, tmp, &first_block->miscj, miscj) + log_and_reset_block(block); } - return; - -log: - __log_error(bank, false, true, ((u64)high << 32) | low); - - /* Reset threshold block after logging error. 
*/ - memset(&tr, 0, sizeof(tr)); - tr.b = &per_cpu(threshold_banks, cpu)[bank]->blocks[block]; - threshold_restart_bank(&tr); } /* @@ -1202,7 +1192,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out; per_cpu(threshold_banks, cpu)[bank] = b; - atomic_inc(&b->cpus); + refcount_inc(&b->cpus); err = __threshold_add_blocks(b); @@ -1225,7 +1215,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) per_cpu(threshold_banks, cpu)[bank] = b; if (is_shared_bank(bank)) { - atomic_set(&b->cpus, 1); + refcount_set(&b->cpus, 1); /* nb is already initialized, see above */ if (nb) { @@ -1289,7 +1279,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) goto free_out; if (is_shared_bank(bank)) { - if (!atomic_dec_and_test(&b->cpus)) { + if (!refcount_dec_and_test(&b->cpus)) { __threshold_remove_blocks(b); per_cpu(threshold_banks, cpu)[bank] = NULL; return; diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index e84db79ef272..d05be307d081 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Intel specific MCE features. * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca> diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 2a0717bf8033..5cddf831720f 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * P5 specific Machine Check Exception Reporting * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index d7cc190ae457..2da67b70ba98 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -122,7 +122,7 @@ static struct attribute *thermal_throttle_attrs[] = { NULL }; -static struct attribute_group thermal_attr_group = { +static const struct attribute_group thermal_attr_group = { .attrs = thermal_throttle_attrs, .name = "thermal_throttle" }; @@ -390,26 +390,12 @@ static void unexpected_thermal_interrupt(void) static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; -static inline void __smp_thermal_interrupt(void) -{ - inc_irq_stat(irq_thermal_count); - smp_thermal_vector(); -} - -asmlinkage __visible void __irq_entry -smp_thermal_interrupt(struct pt_regs *regs) -{ - entering_irq(); - __smp_thermal_interrupt(); - exiting_ack_irq(); -} - -asmlinkage __visible void __irq_entry -smp_trace_thermal_interrupt(struct pt_regs *regs) +asmlinkage __visible void __irq_entry smp_thermal_interrupt(struct pt_regs *r) { entering_irq(); trace_thermal_apic_entry(THERMAL_APIC_VECTOR); - __smp_thermal_interrupt(); + inc_irq_stat(irq_thermal_count); + smp_thermal_vector(); trace_thermal_apic_exit(THERMAL_APIC_VECTOR); exiting_ack_irq(); } diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index bb0e75eed10a..2b584b319eff 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Common corrected MCE threshold handler code: */ @@ -17,24 +18,12 @@ static void default_threshold_interrupt(void) void (*mce_threshold_vector)(void) = default_threshold_interrupt; -static inline void __smp_threshold_interrupt(void) -{ - inc_irq_stat(irq_threshold_count); - mce_threshold_vector(); -} - asmlinkage __visible void 
__irq_entry smp_threshold_interrupt(void) { entering_irq(); - __smp_threshold_interrupt(); - exiting_ack_irq(); -} - -asmlinkage __visible void __irq_entry smp_trace_threshold_interrupt(void) -{ - entering_irq(); trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR); - __smp_threshold_interrupt(); + inc_irq_stat(irq_threshold_count); + mce_threshold_vector(); trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR); exiting_ack_irq(); } diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index c6a722e1d011..3b45b270a865 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * IDT Winchip specific Machine Check Exception Reporting * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index e9f4d762aa5b..c6daec4bdba5 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -251,7 +251,7 @@ static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family) #endif } -void __load_ucode_amd(unsigned int cpuid_1_eax, struct cpio_data *ret) +static void __load_ucode_amd(unsigned int cpuid_1_eax, struct cpio_data *ret) { struct ucode_cpu_info *uci; struct cpio_data cp; @@ -400,9 +400,12 @@ static void update_cache(struct ucode_patch *new_patch) list_for_each_entry(p, &microcode_cache, plist) { if (p->equiv_cpu == new_patch->equiv_cpu) { - if (p->patch_id >= new_patch->patch_id) + if (p->patch_id >= new_patch->patch_id) { /* we already have the latest patch */ + kfree(new_patch->data); + kfree(new_patch); return; + } list_replace(&p->plist, &new_patch->plist); kfree(p->data); diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index e53d3c909840..c4fa4a85d4cb 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -122,9 +122,6 @@ static bool __init check_loader_disabled_bsp(void) bool *res = &dis_ucode_ldr; #endif - if (!have_cpuid_p()) - return *res; - /* * CPUID(1).ECX[31]: reserved for hypervisor use. 
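The update_cache() hunk above plugs a leak: when the cached patch is already at least as new, the new_patch that was handed in used to be silently dropped. The underlying rule is that a function taking ownership of an allocation must free it on every path that does not store it. A simplified sketch of that pattern (toy types and names, not the patch's exact logic):

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/types.h>

struct demo_patch {
	u32			id;
	void			*data;
	struct list_head	plist;
};

static LIST_HEAD(demo_cache);

/* Takes ownership of 'new': either link it into the cache or free it. */
static void demo_update_cache(struct demo_patch *new)
{
	struct demo_patch *p;

	list_for_each_entry(p, &demo_cache, plist) {
		if (p->id >= new->id) {
			/* Cache already newer: we still own 'new', so free it. */
			kfree(new->data);
			kfree(new);
			return;
		}
	}

	list_add_tail(&new->plist, &demo_cache);
}
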
This is still not * completely accurate as xen pv guests don't see that CPUID bit set but @@ -166,24 +163,36 @@ bool get_builtin_firmware(struct cpio_data *cd, const char *name) void __init load_ucode_bsp(void) { unsigned int cpuid_1_eax; + bool intel = true; - if (check_loader_disabled_bsp()) + if (!have_cpuid_p()) return; cpuid_1_eax = native_cpuid_eax(1); switch (x86_cpuid_vendor()) { case X86_VENDOR_INTEL: - if (x86_family(cpuid_1_eax) >= 6) - load_ucode_intel_bsp(); + if (x86_family(cpuid_1_eax) < 6) + return; break; + case X86_VENDOR_AMD: - if (x86_family(cpuid_1_eax) >= 0x10) - load_ucode_amd_bsp(cpuid_1_eax); + if (x86_family(cpuid_1_eax) < 0x10) + return; + intel = false; break; + default: - break; + return; } + + if (check_loader_disabled_bsp()) + return; + + if (intel) + load_ucode_intel_bsp(); + else + load_ucode_amd_bsp(cpuid_1_eax); } static bool check_loader_disabled_ap(void) @@ -290,6 +299,17 @@ struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa) return (struct cpio_data){ NULL, 0, "" }; if (initrd_start) start = initrd_start; + } else { + /* + * The picture with physical addresses is a bit different: we + * need to get the *physical* address to which the ramdisk was + * relocated, i.e., relocated_ramdisk (not initrd_start) and + * since we're running from physical addresses, we need to access + * relocated_ramdisk through its *physical* address too. + */ + u64 *rr = (u64 *)__pa_nodebug(&relocated_ramdisk); + if (*rr) + start = *rr; } return find_cpio_data(path, (void *)start, size, NULL); @@ -550,7 +570,7 @@ static struct attribute *mc_default_attrs[] = { NULL }; -static struct attribute_group mc_attr_group = { +static const struct attribute_group mc_attr_group = { .attrs = mc_default_attrs, .name = "microcode", }; @@ -696,7 +716,7 @@ static struct attribute *cpu_root_microcode_attrs[] = { NULL }; -static struct attribute_group cpu_root_microcode_group = { +static const struct attribute_group cpu_root_microcode_group = { .name = "microcode", .attrs = cpu_root_microcode_attrs, }; diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index f522415bf9e5..7dbcb7adf797 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -34,6 +34,7 @@ #include <linux/mm.h> #include <asm/microcode_intel.h> +#include <asm/intel-family.h> #include <asm/processor.h> #include <asm/tlbflush.h> #include <asm/setup.h> @@ -42,7 +43,7 @@ static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin"; /* Current microcode patch used in early patching on the APs. 
*/ -struct microcode_intel *intel_ucode_patch; +static struct microcode_intel *intel_ucode_patch; static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1, unsigned int s2, unsigned int p2) @@ -146,18 +147,18 @@ static bool microcode_matches(struct microcode_header_intel *mc_header, return false; } -static struct ucode_patch *__alloc_microcode_buf(void *data, unsigned int size) +static struct ucode_patch *memdup_patch(void *data, unsigned int size) { struct ucode_patch *p; p = kzalloc(sizeof(struct ucode_patch), GFP_KERNEL); if (!p) - return ERR_PTR(-ENOMEM); + return NULL; p->data = kmemdup(data, size, GFP_KERNEL); if (!p->data) { kfree(p); - return ERR_PTR(-ENOMEM); + return NULL; } return p; @@ -166,7 +167,7 @@ static struct ucode_patch *__alloc_microcode_buf(void *data, unsigned int size) static void save_microcode_patch(void *data, unsigned int size) { struct microcode_header_intel *mc_hdr, *mc_saved_hdr; - struct ucode_patch *iter, *tmp, *p; + struct ucode_patch *iter, *tmp, *p = NULL; bool prev_found = false; unsigned int sig, pf; @@ -183,8 +184,8 @@ static void save_microcode_patch(void *data, unsigned int size) if (mc_hdr->rev <= mc_saved_hdr->rev) continue; - p = __alloc_microcode_buf(data, size); - if (IS_ERR(p)) + p = memdup_patch(data, size); + if (!p) pr_err("Error allocating buffer %p\n", data); else list_replace(&iter->plist, &p->plist); @@ -196,12 +197,25 @@ static void save_microcode_patch(void *data, unsigned int size) * newly found. */ if (!prev_found) { - p = __alloc_microcode_buf(data, size); - if (IS_ERR(p)) + p = memdup_patch(data, size); + if (!p) pr_err("Error allocating buffer for %p\n", data); else list_add_tail(&p->plist, µcode_cache); } + + if (!p) + return; + + /* + * Save for early loading. On 32-bit, that needs to be a physical + * address as the APs are running from physical addresses, before + * paging has been enabled. + */ + if (IS_ENABLED(CONFIG_X86_32)) + intel_ucode_patch = (struct microcode_intel *)__pa_nodebug(p->data); + else + intel_ucode_patch = p->data; } static int microcode_sanity_check(void *mc, int print_err) @@ -607,6 +621,14 @@ int __init save_microcode_in_initrd_intel(void) struct ucode_cpu_info uci; struct cpio_data cp; + /* + * initrd is going away, clear patch ptr. We will scan the microcode one + * last time before jettisoning and save a patch, if found. Then we will + * update that pointer too, with a stable patch address to use when + * resuming the cores. + */ + intel_ucode_patch = NULL; + if (!load_builtin_intel_microcode(&cp)) cp = find_microcode_in_initrd(ucode_path, false); @@ -619,9 +641,6 @@ int __init save_microcode_in_initrd_intel(void) show_saved_mc(); - /* initrd is going away, clear patch ptr. 
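Besides the rename, memdup_patch() above changes the failure convention of the old __alloc_microcode_buf(): it now returns NULL instead of ERR_PTR(-ENOMEM), so callers test !p rather than IS_ERR(p). A short sketch of the two conventions side by side, with illustrative helper names (only kmemdup(), ERR_PTR() and IS_ERR() are real kernel APIs here):

#include <linux/err.h>
#include <linux/slab.h>

/* NULL on failure: the natural choice when -ENOMEM is the only error. */
static void *dup_null(const void *src, size_t len)
{
	return kmemdup(src, len, GFP_KERNEL);
}

/* ERR_PTR on failure: useful when callers must tell error codes apart. */
static void *dup_errptr(const void *src, size_t len)
{
	void *p = kmemdup(src, len, GFP_KERNEL);

	return p ? p : ERR_PTR(-ENOMEM);
}

/*
 * Caller side, one check per convention:
 *
 *	p = dup_null(...);	if (!p)		-> allocation failed
 *	p = dup_errptr(...);	if (IS_ERR(p))	-> PTR_ERR(p) gives the reason
 */
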
*/ - intel_ucode_patch = NULL; - return 0; } @@ -900,6 +919,18 @@ static int get_ucode_fw(void *to, const void *from, size_t n) return 0; } +static bool is_blacklisted(unsigned int cpu) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->x86 == 6 && c->x86_model == INTEL_FAM6_BROADWELL_X) { + pr_err_once("late loading on model 79 is disabled.\n"); + return true; + } + + return false; +} + static enum ucode_state request_microcode_fw(int cpu, struct device *device, bool refresh_fw) { @@ -908,6 +939,9 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device, const struct firmware *firmware; enum ucode_state ret; + if (is_blacklisted(cpu)) + return UCODE_NFOUND; + sprintf(name, "intel-ucode/%02x-%02x-%02x", c->x86, c->x86_model, c->x86_mask); @@ -932,6 +966,9 @@ static int get_ucode_user(void *to, const void *from, size_t n) static enum ucode_state request_microcode_user(int cpu, const void __user *buf, size_t size) { + if (is_blacklisted(cpu)) + return UCODE_NFOUND; + return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); } diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh index 6988c74409a8..d0dfb892c72f 100644 --- a/arch/x86/kernel/cpu/mkcapflags.sh +++ b/arch/x86/kernel/cpu/mkcapflags.sh @@ -1,4 +1,5 @@ #!/bin/sh +# SPDX-License-Identifier: GPL-2.0 # # Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeatures.h # diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 04cb8d34ccb8..236324e83a3a 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -59,13 +59,6 @@ void hyperv_vector_handler(struct pt_regs *regs) void hv_setup_vmbus_irq(void (*handler)(void)) { vmbus_handler = handler; - /* - * Setup the IDT for hypervisor callback. Prevent reallocation - * at module reload. - */ - if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, - hyperv_callback_vector); } void hv_remove_vmbus_irq(void) @@ -161,6 +154,15 @@ static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs) } #endif +static unsigned long hv_get_tsc_khz(void) +{ + unsigned long freq; + + rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq); + + return freq / 1000; +} + static void __init ms_hyperv_init_platform(void) { int hv_host_info_eax; @@ -175,9 +177,15 @@ static void __init ms_hyperv_init_platform(void) ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES); ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); - pr_info("HyperV: features 0x%x, hints 0x%x\n", + pr_info("Hyper-V: features 0x%x, hints 0x%x\n", ms_hyperv.features, ms_hyperv.hints); + ms_hyperv.max_vp_index = cpuid_eax(HVCPUID_IMPLEMENTATION_LIMITS); + ms_hyperv.max_lp_index = cpuid_ebx(HVCPUID_IMPLEMENTATION_LIMITS); + + pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n", + ms_hyperv.max_vp_index, ms_hyperv.max_lp_index); + /* * Extract host information. 
*/ @@ -193,8 +201,15 @@ static void __init ms_hyperv_init_platform(void) hv_host_info_edx >> 24, hv_host_info_edx & 0xFFFFFF); } + if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS && + ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) { + x86_platform.calibrate_tsc = hv_get_tsc_khz; + x86_platform.calibrate_cpu = hv_get_tsc_khz; + } + #ifdef CONFIG_X86_LOCAL_APIC - if (ms_hyperv.features & HV_X64_MSR_APIC_FREQUENCY_AVAILABLE) { + if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS && + ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) { /* * Get the APIC frequency. */ @@ -203,7 +218,7 @@ static void __init ms_hyperv_init_platform(void) rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency); hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ); lapic_timer_frequency = hv_lapic_frequency; - pr_info("HyperV: LAPIC Timer Frequency: %#x\n", + pr_info("Hyper-V: LAPIC Timer Frequency: %#x\n", lapic_timer_frequency); } @@ -233,11 +248,14 @@ static void __init ms_hyperv_init_platform(void) * Setup the hook to get control post apic initialization. */ x86_platform.apic_post_init = hyperv_init; + hyperv_setup_mmu_ops(); + /* Setup the IDT for hypervisor callback */ + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector); #endif } const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { - .name = "Microsoft HyperV", + .name = "Microsoft Hyper-V", .detect = ms_hyperv_platform, .init_platform = ms_hyperv_init_platform, }; diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c index 92ba9cd31c9a..a65a0272096d 100644 --- a/arch/x86/kernel/cpu/mtrr/amd.c +++ b/arch/x86/kernel/cpu/mtrr/amd.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/init.h> #include <linux/mm.h> #include <asm/mtrr.h> diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c index 3d689937fc1b..f27177816569 100644 --- a/arch/x86/kernel/cpu/mtrr/centaur.c +++ b/arch/x86/kernel/cpu/mtrr/centaur.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/init.h> #include <linux/mm.h> diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index b1086f79e57e..4296c702a3f7 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/init.h> #include <linux/io.h> #include <linux/mm.h> diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index 6d9b45549109..558444b23923 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/capability.h> #include <linux/seq_file.h> #include <linux/uaccess.h> diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 2bce84d91c2b..40d5a8a75212 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -237,6 +237,18 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ stop_machine(mtrr_rendezvous_handler, &data, cpu_online_mask); } +static void set_mtrr_cpuslocked(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) +{ + struct set_mtrr_data data = { .smp_reg = reg, + .smp_base = base, + .smp_size = size, + .smp_type = type + }; + + stop_machine_cpuslocked(mtrr_rendezvous_handler, &data, cpu_online_mask); +} + static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) { @@ -370,7 +382,7 @@ int 
mtrr_add_page(unsigned long base, unsigned long size, /* Search for an empty MTRR */ i = mtrr_if->get_free_region(base, size, replace); if (i >= 0) { - set_mtrr(i, base, size, type); + set_mtrr_cpuslocked(i, base, size, type); if (likely(replace < 0)) { mtrr_usage_table[i] = 1; } else { @@ -378,7 +390,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, if (increment) mtrr_usage_table[i]++; if (unlikely(replace != i)) { - set_mtrr(replace, 0, 0, 0); + set_mtrr_cpuslocked(replace, 0, 0, 0); mtrr_usage_table[replace] = 0; } } @@ -506,7 +518,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) goto out; } if (--mtrr_usage_table[reg] < 1) - set_mtrr(reg, 0, 0, 0); + set_mtrr_cpuslocked(reg, 0, 0, 0); error = reg; out: mutex_unlock(&mtrr_mutex); @@ -807,10 +819,8 @@ void mtrr_save_state(void) if (!mtrr_enabled()) return; - get_online_cpus(); first_cpu = cpumask_first(cpu_online_mask); smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1); - put_online_cpus(); } void set_mtrr_aps_delayed_init(void) diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index ad8bd763efa5..2ac99e561181 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * local MTRR defines. */ diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 181eabecae25..d389083330c5 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * local apic based NMI watchdog for various CPUs. * diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c index 1dd8294fd730..fd6ec2aa0303 100644 --- a/arch/x86/kernel/cpu/powerflags.c +++ b/arch/x86/kernel/cpu/powerflags.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Strings for the various x86 power flags * diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 6df621ae62a7..4378a729b933 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/smp.h> #include <linux/timex.h> #include <linux/string.h> @@ -77,9 +78,11 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "microcode\t: 0x%x\n", c->microcode); if (cpu_has(c, X86_FEATURE_TSC)) { - unsigned int freq = cpufreq_quick_get(cpu); + unsigned int freq = arch_freq_get_on_cpu(cpu); if (!freq) + freq = cpufreq_quick_get(cpu); + if (!freq) freq = cpu_khz; seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000)); diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 23c23508c012..05459ad3db46 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -31,6 +31,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, + { X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index cd531355e838..b099024d339c 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Check for extended topology enumeration cpuid leaf 0xb and if it * exists, use it for populating initial_apicid and cpu topology diff --git 
a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index d77d07ab310b..42c939827621 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sched/clock.h> diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c index ef9c2a0078bd..65a58a390fc3 100644 --- a/arch/x86/kernel/cpu/umc.c +++ b/arch/x86/kernel/cpu/umc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <asm/processor.h> #include "cpu.h" diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 22217ece26c8..44404e2307bb 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -457,7 +457,7 @@ static int prepare_elf64_headers(struct crash_elf_data *ced, bufp += sizeof(Elf64_Phdr); phdr->p_type = PT_NOTE; phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note(); - phdr->p_filesz = phdr->p_memsz = sizeof(vmcoreinfo_note); + phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE; (ehdr->e_phnum)++; #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index 538fedea9b3f..33ee47670b99 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Memory preserving reboot related code. * diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index afa64adb75ee..4f2e0778feac 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Memory preserving reboot related code. * diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 3fe45f84ced4..76e07698e6d1 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Architecture specific OF callbacks. */ @@ -235,8 +236,7 @@ static void __init dtb_add_ioapic(struct device_node *dn) ret = of_address_to_resource(dn, 0, &r); if (ret) { - printk(KERN_ERR "Can't obtain address from node %s.\n", - dn->full_name); + printk(KERN_ERR "Can't obtain address from device node %pOF.\n", dn); return; } mp_register_ioapic(++ioapic_id, r.start, gsi_top, &cfg); diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c index f9c324e08d85..0e662c55ae90 100644 --- a/arch/x86/kernel/doublefault.c +++ b/arch/x86/kernel/doublefault.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/mm.h> #include <linux/sched.h> #include <linux/sched/debug.h> diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index dbce3cca94cb..f13b4c00a5de 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -94,6 +94,9 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, if (stack_name) printk("%s <%s>\n", log_lvl, stack_name); + if (regs && on_stack(&stack_info, regs, sizeof(*regs))) + __show_regs(regs, 0); + /* * Scan the stack, printing any text addresses we find. At the * same time, follow proper stack frames with the unwinder. @@ -118,10 +121,8 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * Don't print regs->ip again if it was already printed * by __show_regs() below. 
*/ - if (regs && stack == ®s->ip) { - unwind_next_frame(&state); - continue; - } + if (regs && stack == ®s->ip) + goto next; if (stack == ret_addr_p) reliable = 1; @@ -144,6 +145,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, if (!reliable) continue; +next: /* * Get the next frame from the unwinder. No need to * check for an error: if anything goes wrong, the rest @@ -153,7 +155,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, /* if the frame has entry regs, print them */ regs = unwind_get_entry_regs(&state); - if (regs) + if (regs && on_stack(&stack_info, regs, sizeof(*regs))) __show_regs(regs, 0); } @@ -265,7 +267,7 @@ int __die(const char *str, struct pt_regs *regs, long err) #ifdef CONFIG_X86_32 if (user_mode(regs)) { sp = regs->sp; - ss = regs->ss & 0xffff; + ss = regs->ss; } else { sp = kernel_stack_pointer(regs); savesegment(ss, ss); diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index e5f0b40e66d2..daefae83a3aa 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs @@ -37,7 +38,7 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. */ - if (stack < begin || stack > end) + if (stack <= begin || stack > end) return false; info->type = STACK_TYPE_IRQ; @@ -62,7 +63,7 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. */ - if (stack < begin || stack > end) + if (stack <= begin || stack > end) return false; info->type = STACK_TYPE_SOFTIRQ; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 3e1471d57487..88ce2ffdb110 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs @@ -55,7 +56,7 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info) begin = end - (exception_stack_sizes[k] / sizeof(long)); regs = (struct pt_regs *)end - 1; - if (stack < begin || stack >= end) + if (stack <= begin || stack >= end) continue; info->type = STACK_TYPE_EXCEPTION + k; @@ -78,7 +79,7 @@ static bool in_irq_stack(unsigned long *stack, struct stack_info *info) * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. */ - if (stack < begin || stack > end) + if (stack <= begin || stack > end) return false; info->type = STACK_TYPE_IRQ; diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index d78a586ba8dc..71c11ad5643e 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -20,10 +20,12 @@ #include <asm/setup.h> /* - * We organize the E820 table into two main data structures: + * We organize the E820 table into three main data structures: * * - 'e820_table_firmware': the original firmware version passed to us by the - * bootloader - not modified by the kernel. We use this to: + * bootloader - not modified by the kernel. It is composed of two parts: + * the first 128 E820 memory entries in boot_params.e820_table and the remaining + * (if any) entries of the SETUP_E820_EXT nodes. 
We use this to: * * - inform the user about the firmware's notion of memory layout * via /sys/firmware/memmap @@ -31,6 +33,14 @@ * - the hibernation code uses it to generate a kernel-independent MD5 * fingerprint of the physical memory layout of a system. * + * - 'e820_table_kexec': a slightly modified (by the kernel) firmware version + * passed to us by the bootloader - the major difference between + * e820_table_firmware[] and this one is that, the latter marks the setup_data + * list created by the EFI boot stub as reserved, so that kexec can reuse the + * setup_data information in the second kernel. Besides, e820_table_kexec[] + * might also be modified by the kexec itself to fake a mptable. + * We use this to: + * * - kexec, which is a bootloader in disguise, uses the original E820 * layout to pass to the kexec-ed kernel. This way the original kernel * can have a restricted E820 map while the kexec()-ed kexec-kernel @@ -46,9 +56,11 @@ * specific memory layout data during early bootup. */ static struct e820_table e820_table_init __initdata; +static struct e820_table e820_table_kexec_init __initdata; static struct e820_table e820_table_firmware_init __initdata; struct e820_table *e820_table __refdata = &e820_table_init; +struct e820_table *e820_table_kexec __refdata = &e820_table_kexec_init; struct e820_table *e820_table_firmware __refdata = &e820_table_firmware_init; /* For PCI or other memory-mapped resources */ @@ -84,7 +96,8 @@ EXPORT_SYMBOL_GPL(e820__mapped_any); * Note: this function only works correctly once the E820 table is sorted and * not-overlapping (at least for the range specified), which is the case normally. */ -bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type) +static struct e820_entry *__e820__mapped_all(u64 start, u64 end, + enum e820_type type) { int i; @@ -110,9 +123,28 @@ bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type) * coverage of the desired range exists: */ if (start >= end) - return 1; + return entry; } - return 0; + + return NULL; +} + +/* + * This function checks if the entire range <start,end> is mapped with type. + */ +bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type) +{ + return __e820__mapped_all(start, end, type); +} + +/* + * This function returns the type associated with the range <start,end>. + */ +int e820__get_entry_type(u64 start, u64 end) +{ + struct e820_entry *entry = __e820__mapped_all(start, end, 0); + + return entry ? 
entry->type : -EINVAL; } /* @@ -470,9 +502,9 @@ u64 __init e820__range_update(u64 start, u64 size, enum e820_type old_type, enum return __e820__range_update(e820_table, start, size, old_type, new_type); } -static u64 __init e820__range_update_firmware(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) +static u64 __init e820__range_update_kexec(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) { - return __e820__range_update(e820_table_firmware, start, size, old_type, new_type); + return __e820__range_update(e820_table_kexec, start, size, old_type, new_type); } /* Remove a range of memory from the E820 table: */ @@ -546,9 +578,9 @@ void __init e820__update_table_print(void) e820__print_table("modified"); } -static void __init e820__update_table_firmware(void) +static void __init e820__update_table_kexec(void) { - e820__update_table(e820_table_firmware); + e820__update_table(e820_table_kexec); } #define MAX_GAP_END 0x100000000ull @@ -623,7 +655,7 @@ __init void e820__setup_pci_gap(void) /* * Called late during init, in free_initmem(). * - * Initial e820_table and e820_table_firmware are largish __initdata arrays. + * Initial e820_table and e820_table_kexec are largish __initdata arrays. * * Copy them to a (usually much smaller) dynamically allocated area that is * sized precisely after the number of e820 entries. @@ -643,6 +675,12 @@ __init void e820__reallocate_tables(void) memcpy(n, e820_table, size); e820_table = n; + size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_kexec->nr_entries; + n = kmalloc(size, GFP_KERNEL); + BUG_ON(!n); + memcpy(n, e820_table_kexec, size); + e820_table_kexec = n; + size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_firmware->nr_entries; n = kmalloc(size, GFP_KERNEL); BUG_ON(!n); @@ -669,6 +707,9 @@ void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len) __append_e820_table(extmap, entries); e820__update_table(e820_table); + memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec)); + memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware)); + early_memunmap(sdata, data_len); pr_info("e820: extended physical RAM map:\n"); e820__print_table("extended"); @@ -727,7 +768,7 @@ core_initcall(e820__register_nvs_regions); /* * Allocate the requested number of bytes with the requsted alignment * and return (the physical address) to the caller. Also register this - * range in the 'firmware' E820 table as a reserved range. + * range in the 'kexec' E820 table as a reserved range. * * This allows kexec to fake a new mptable, as if it came from the real * system. 
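The new e820__get_entry_type() above returns the E820 type that covers <start, end>, or a negative value when the range is not uniformly mapped. A minimal sketch of how a caller might use it; the check_phys_range() helper and its messages are illustrative, only e820__get_entry_type() and the E820_TYPE_* constants come from the tree:

#include <linux/kernel.h>
#include <asm/e820/api.h>	/* e820__get_entry_type(), E820_TYPE_* */

/* Illustrative helper: report how the firmware map describes a range. */
static void check_phys_range(u64 start, u64 end)
{
	int type = e820__get_entry_type(start, end);

	switch (type) {
	case E820_TYPE_RAM:
		pr_info("%#llx-%#llx: ordinary RAM\n", start, end);
		break;
	case E820_TYPE_RESERVED:
		pr_info("%#llx-%#llx: firmware reserved\n", start, end);
		break;
	default:
		if (type < 0)
			pr_info("%#llx-%#llx: mixed or unknown E820 coverage\n", start, end);
		else
			pr_info("%#llx-%#llx: E820 type %d\n", start, end, type);
	}
}
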
@@ -738,9 +779,9 @@ u64 __init e820__memblock_alloc_reserved(u64 size, u64 align) addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); if (addr) { - e820__range_update_firmware(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED); - pr_info("e820: update e820_table_firmware for e820__memblock_alloc_reserved()\n"); - e820__update_table_firmware(); + e820__range_update_kexec(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED); + pr_info("e820: update e820_table_kexec for e820__memblock_alloc_reserved()\n"); + e820__update_table_kexec(); } return addr; @@ -923,13 +964,13 @@ void __init e820__reserve_setup_data(void) while (pa_data) { data = early_memremap(pa_data, sizeof(*data)); e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + e820__range_update_kexec(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); pa_data = data->next; early_memunmap(data, sizeof(*data)); } e820__update_table(e820_table); - - memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware)); + e820__update_table(e820_table_kexec); pr_info("extended physical RAM map:\n"); e820__print_table("reserve setup_data"); @@ -1062,6 +1103,7 @@ void __init e820__reserve_resources(void) res++; } + /* Expose the bootloader-provided memory layout to the sysfs. */ for (i = 0; i < e820_table_firmware->nr_entries; i++) { struct e820_entry *entry = e820_table_firmware->entries + i; @@ -1175,6 +1217,7 @@ void __init e820__memory_setup(void) who = x86_init.resources.memory_setup(); + memcpy(e820_table_kexec, e820_table, sizeof(*e820_table_kexec)); memcpy(e820_table_firmware, e820_table, sizeof(*e820_table_firmware)); pr_info("e820: BIOS-provided physical RAM map:\n"); diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index d907c3d8633f..1e82f787c160 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Various workarounds for chipset bugs. 
This code runs very early and can't use the regular PCI subsystem The entries are keyed to PCI bridges which usually identify chipsets @@ -12,10 +13,10 @@ #include <linux/pci.h> #include <linux/acpi.h> #include <linux/delay.h> -#include <linux/dmi.h> #include <linux/pci_ids.h> #include <linux/bcma/bcma.h> #include <linux/bcma/bcma_regs.h> +#include <linux/platform_data/x86/apple.h> #include <drm/i915_drm.h> #include <asm/pci-direct.h> #include <asm/dma.h> @@ -527,6 +528,7 @@ static const struct pci_device_id intel_early_ids[] __initconst = { INTEL_BXT_IDS(&gen9_early_ops), INTEL_KBL_IDS(&gen9_early_ops), INTEL_GLK_IDS(&gen9_early_ops), + INTEL_CNL_IDS(&gen9_early_ops), }; static void __init @@ -593,7 +595,7 @@ static void __init apple_airport_reset(int bus, int slot, int func) u64 addr; int i; - if (!dmi_match(DMI_SYS_VENDOR, "Apple Inc.")) + if (!x86_apple_machine) return; /* Card may have been put into PCI_D3hot by grub quirk */ diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 0f0840304452..5e801c8c8ce7 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/console.h> #include <linux/kernel.h> #include <linux/init.h> diff --git a/arch/x86/kernel/ebda.c b/arch/x86/kernel/ebda.c index 4312f8ae71b7..38e7d597b660 100644 --- a/arch/x86/kernel/ebda.c +++ b/arch/x86/kernel/ebda.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/init.h> #include <linux/memblock.h> diff --git a/arch/x86/kernel/eisa.c b/arch/x86/kernel/eisa.c new file mode 100644 index 000000000000..f260e452e4f8 --- /dev/null +++ b/arch/x86/kernel/eisa.c @@ -0,0 +1,19 @@ +/* + * EISA specific code + * + * This file is licensed under the GPL V2 + */ +#include <linux/ioport.h> +#include <linux/eisa.h> +#include <linux/io.h> + +static __init int eisa_bus_probe(void) +{ + void __iomem *p = ioremap(0x0FFFD9, 4); + + if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24)) + EISA_bus = 1; + iounmap(p); + return 0; +} +subsys_initcall(eisa_bus_probe); diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 8e598a1ad986..9c4e7ba6870c 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -125,7 +125,7 @@ void __init init_espfix_bsp(void) p4d_t *p4d; /* Install the espfix pud into the kernel page directory */ - pgd = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; + pgd = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)]; p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR); p4d_populate(&init_mm, p4d, espfix_pud_page); @@ -195,7 +195,7 @@ void init_espfix_ap(int cpu) pte_p = pte_offset_kernel(&pmd, addr); stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0)); - pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask)); + pte = __pte(__pa(stack_page) | ((__PAGE_KERNEL_RO | _PAGE_ENC) & ptemask)); for (n = 0; n < ESPFIX_PTE_CLONES; n++) set_pte(&pte_p[n*PTE_STRIDE], pte); diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c index d913047f832c..2954fab15e51 100644 --- a/arch/x86/kernel/fpu/bugs.c +++ b/arch/x86/kernel/fpu/bugs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * x86 FPU bug checks: */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index e1114f070c2d..f92a6593de1e 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -100,7 +100,7 @@ void __kernel_fpu_begin(void) kernel_fpu_disable(); - if (fpu->fpregs_active) { + if (fpu->initialized) { /* * 
Ignore return value -- we don't care if reg state * is clobbered. @@ -116,7 +116,7 @@ void __kernel_fpu_end(void) { struct fpu *fpu = ¤t->thread.fpu; - if (fpu->fpregs_active) + if (fpu->initialized) copy_kernel_to_fpregs(&fpu->state); kernel_fpu_enable(); @@ -148,7 +148,7 @@ void fpu__save(struct fpu *fpu) preempt_disable(); trace_x86_fpu_before_save(fpu); - if (fpu->fpregs_active) { + if (fpu->initialized) { if (!copy_fpregs_to_fpstate(fpu)) { copy_kernel_to_fpregs(&fpu->state); } @@ -189,10 +189,9 @@ EXPORT_SYMBOL_GPL(fpstate_init); int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) { - dst_fpu->fpregs_active = 0; dst_fpu->last_cpu = -1; - if (!src_fpu->fpstate_active || !static_cpu_has(X86_FEATURE_FPU)) + if (!src_fpu->initialized || !static_cpu_has(X86_FEATURE_FPU)) return 0; WARN_ON_FPU(src_fpu != ¤t->thread.fpu); @@ -206,26 +205,14 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) /* * Save current FPU registers directly into the child * FPU context, without any memory-to-memory copying. - * In lazy mode, if the FPU context isn't loaded into - * fpregs, CR0.TS will be set and do_device_not_available - * will load the FPU context. * - * We have to do all this with preemption disabled, - * mostly because of the FNSAVE case, because in that - * case we must not allow preemption in the window - * between the FNSAVE and us marking the context lazy. - * - * It shouldn't be an issue as even FNSAVE is plenty - * fast in terms of critical section length. + * ( The function 'fails' in the FNSAVE case, which destroys + * register contents so we have to copy them back. ) */ - preempt_disable(); if (!copy_fpregs_to_fpstate(dst_fpu)) { - memcpy(&src_fpu->state, &dst_fpu->state, - fpu_kernel_xstate_size); - + memcpy(&src_fpu->state, &dst_fpu->state, fpu_kernel_xstate_size); copy_kernel_to_fpregs(&src_fpu->state); } - preempt_enable(); trace_x86_fpu_copy_src(src_fpu); trace_x86_fpu_copy_dst(dst_fpu); @@ -237,45 +224,48 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) * Activate the current task's in-memory FPU context, * if it has not been used before: */ -void fpu__activate_curr(struct fpu *fpu) +void fpu__initialize(struct fpu *fpu) { WARN_ON_FPU(fpu != ¤t->thread.fpu); - if (!fpu->fpstate_active) { + if (!fpu->initialized) { fpstate_init(&fpu->state); trace_x86_fpu_init_state(fpu); trace_x86_fpu_activate_state(fpu); /* Safe to do for the current task: */ - fpu->fpstate_active = 1; + fpu->initialized = 1; } } -EXPORT_SYMBOL_GPL(fpu__activate_curr); +EXPORT_SYMBOL_GPL(fpu__initialize); /* * This function must be called before we read a task's fpstate. * - * If the task has not used the FPU before then initialize its - * fpstate. + * There's two cases where this gets called: + * + * - for the current task (when coredumping), in which case we have + * to save the latest FPU registers into the fpstate, + * + * - or it's called for stopped tasks (ptrace), in which case the + * registers were already saved by the context-switch code when + * the task scheduled out - we only have to initialize the registers + * if they've never been initialized. * * If the task has used the FPU before then save it. 
*/ -void fpu__activate_fpstate_read(struct fpu *fpu) +void fpu__prepare_read(struct fpu *fpu) { - /* - * If fpregs are active (in the current CPU), then - * copy them to the fpstate: - */ - if (fpu->fpregs_active) { + if (fpu == ¤t->thread.fpu) { fpu__save(fpu); } else { - if (!fpu->fpstate_active) { + if (!fpu->initialized) { fpstate_init(&fpu->state); trace_x86_fpu_init_state(fpu); trace_x86_fpu_activate_state(fpu); /* Safe to do for current and for stopped child tasks: */ - fpu->fpstate_active = 1; + fpu->initialized = 1; } } } @@ -283,17 +273,17 @@ void fpu__activate_fpstate_read(struct fpu *fpu) /* * This function must be called before we write a task's fpstate. * - * If the task has used the FPU before then unlazy it. + * If the task has used the FPU before then invalidate any cached FPU registers. * If the task has not used the FPU before then initialize its fpstate. * * After this function call, after registers in the fpstate are * modified and the child task has woken up, the child task will * restore the modified FPU state from the modified context. If we - * didn't clear its lazy status here then the lazy in-registers + * didn't clear its cached status here then the cached in-registers * state pending on its former CPU could be restored, corrupting * the modifications. */ -void fpu__activate_fpstate_write(struct fpu *fpu) +void fpu__prepare_write(struct fpu *fpu) { /* * Only stopped child tasks can be used to modify the FPU @@ -301,8 +291,8 @@ void fpu__activate_fpstate_write(struct fpu *fpu) */ WARN_ON_FPU(fpu == ¤t->thread.fpu); - if (fpu->fpstate_active) { - /* Invalidate any lazy state: */ + if (fpu->initialized) { + /* Invalidate any cached state: */ __fpu_invalidate_fpregs_state(fpu); } else { fpstate_init(&fpu->state); @@ -310,74 +300,11 @@ void fpu__activate_fpstate_write(struct fpu *fpu) trace_x86_fpu_activate_state(fpu); /* Safe to do for stopped child tasks: */ - fpu->fpstate_active = 1; + fpu->initialized = 1; } } /* - * This function must be called before we write the current - * task's fpstate. - * - * This call gets the current FPU register state and moves - * it in to the 'fpstate'. Preemption is disabled so that - * no writes to the 'fpstate' can occur from context - * swiches. - * - * Must be followed by a fpu__current_fpstate_write_end(). - */ -void fpu__current_fpstate_write_begin(void) -{ - struct fpu *fpu = ¤t->thread.fpu; - - /* - * Ensure that the context-switching code does not write - * over the fpstate while we are doing our update. - */ - preempt_disable(); - - /* - * Move the fpregs in to the fpu's 'fpstate'. - */ - fpu__activate_fpstate_read(fpu); - - /* - * The caller is about to write to 'fpu'. Ensure that no - * CPU thinks that its fpregs match the fpstate. This - * ensures we will not be lazy and skip a XRSTOR in the - * future. - */ - __fpu_invalidate_fpregs_state(fpu); -} - -/* - * This function must be paired with fpu__current_fpstate_write_begin() - * - * This will ensure that the modified fpstate gets placed back in - * the fpregs if necessary. - * - * Note: This function may be called whether or not an _actual_ - * write to the fpstate occurred. - */ -void fpu__current_fpstate_write_end(void) -{ - struct fpu *fpu = ¤t->thread.fpu; - - /* - * 'fpu' now has an updated copy of the state, but the - * registers may still be out of date. Update them with - * an XRSTOR if they are active. - */ - if (fpregs_active()) - copy_kernel_to_fpregs(&fpu->state); - - /* - * Our update is done and the fpregs/fpstate are in sync - * if necessary. 
Context switches can happen again. - */ - preempt_enable(); -} - -/* * 'fpu__restore()' is called to copy FPU registers from * the FPU fpstate to the live hw registers and to activate * access to the hardware registers, so that FPU instructions @@ -389,7 +316,7 @@ void fpu__current_fpstate_write_end(void) */ void fpu__restore(struct fpu *fpu) { - fpu__activate_curr(fpu); + fpu__initialize(fpu); /* Avoid __kernel_fpu_begin() right after fpregs_activate() */ kernel_fpu_disable(); @@ -414,15 +341,17 @@ void fpu__drop(struct fpu *fpu) { preempt_disable(); - if (fpu->fpregs_active) { - /* Ignore delayed exceptions from user space */ - asm volatile("1: fwait\n" - "2:\n" - _ASM_EXTABLE(1b, 2b)); - fpregs_deactivate(fpu); + if (fpu == ¤t->thread.fpu) { + if (fpu->initialized) { + /* Ignore delayed exceptions from user space */ + asm volatile("1: fwait\n" + "2:\n" + _ASM_EXTABLE(1b, 2b)); + fpregs_deactivate(fpu); + } } - fpu->fpstate_active = 0; + fpu->initialized = 0; trace_x86_fpu_dropped(fpu); @@ -462,9 +391,11 @@ void fpu__clear(struct fpu *fpu) * Make sure fpstate is cleared and initialized. */ if (static_cpu_has(X86_FEATURE_FPU)) { - fpu__activate_curr(fpu); + preempt_disable(); + fpu__initialize(fpu); user_fpu_begin(); copy_init_fpstate_to_fpregs(); + preempt_enable(); } } diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index d5d44c452624..7affb7e3d9a5 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -240,7 +240,7 @@ static void __init fpu__init_system_ctx_switch(void) WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; - WARN_ON_FPU(current->thread.fpu.fpstate_active); + WARN_ON_FPU(current->thread.fpu.initialized); } /* diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index b188b16841e3..bc02f5144b95 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * FPU register's regset abstraction, for ptrace, core dumps, etc. */ @@ -16,14 +17,14 @@ int regset_fpregs_active(struct task_struct *target, const struct user_regset *r { struct fpu *target_fpu = &target->thread.fpu; - return target_fpu->fpstate_active ? regset->n : 0; + return target_fpu->initialized ? 
regset->n : 0; } int regset_xregset_fpregs_active(struct task_struct *target, const struct user_regset *regset) { struct fpu *target_fpu = &target->thread.fpu; - if (boot_cpu_has(X86_FEATURE_FXSR) && target_fpu->fpstate_active) + if (boot_cpu_has(X86_FEATURE_FXSR) && target_fpu->initialized) return regset->n; else return 0; @@ -38,7 +39,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, if (!boot_cpu_has(X86_FEATURE_FXSR)) return -ENODEV; - fpu__activate_fpstate_read(fpu); + fpu__prepare_read(fpu); fpstate_sanitize_xstate(fpu); return user_regset_copyout(&pos, &count, &kbuf, &ubuf, @@ -55,7 +56,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, if (!boot_cpu_has(X86_FEATURE_FXSR)) return -ENODEV; - fpu__activate_fpstate_write(fpu); + fpu__prepare_write(fpu); fpstate_sanitize_xstate(fpu); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, @@ -89,10 +90,13 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, xsave = &fpu->state.xsave; - fpu__activate_fpstate_read(fpu); + fpu__prepare_read(fpu); if (using_compacted_format()) { - ret = copyout_from_xsaves(pos, count, kbuf, ubuf, xsave); + if (kbuf) + ret = copy_xstate_to_kernel(kbuf, xsave, pos, count); + else + ret = copy_xstate_to_user(ubuf, xsave, pos, count); } else { fpstate_sanitize_xstate(fpu); /* @@ -129,28 +133,29 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, xsave = &fpu->state.xsave; - fpu__activate_fpstate_write(fpu); + fpu__prepare_write(fpu); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - ret = copyin_to_xsaves(kbuf, ubuf, xsave); - else + if (using_compacted_format()) { + if (kbuf) + ret = copy_kernel_to_xstate(xsave, kbuf); + else + ret = copy_user_to_xstate(xsave, ubuf); + } else { ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); - - /* - * In case of failure, mark all states as init: - */ - if (ret) - fpstate_init(&fpu->state); + if (!ret) + ret = validate_xstate_header(&xsave->header); + } /* * mxcsr reserved bits must be masked to zero for security reasons. */ xsave->i387.mxcsr &= mxcsr_feature_mask; - xsave->header.xfeatures &= xfeatures_mask; + /* - * These bits must be zero. + * In case of failure, mark all states as init: */ - memset(&xsave->header.reserved, 0, 48); + if (ret) + fpstate_init(&fpu->state); return ret; } @@ -299,7 +304,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, struct fpu *fpu = &target->thread.fpu; struct user_i387_ia32_struct env; - fpu__activate_fpstate_read(fpu); + fpu__prepare_read(fpu); if (!boot_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); @@ -329,7 +334,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, struct user_i387_ia32_struct env; int ret; - fpu__activate_fpstate_write(fpu); + fpu__prepare_write(fpu); fpstate_sanitize_xstate(fpu); if (!boot_cpu_has(X86_FEATURE_FPU)) @@ -369,7 +374,7 @@ int dump_fpu(struct pt_regs *regs, struct user_i387_struct *ufpu) struct fpu *fpu = &tsk->thread.fpu; int fpvalid; - fpvalid = fpu->fpstate_active; + fpvalid = fpu->initialized; if (fpvalid) fpvalid = !fpregs_get(tsk, NULL, 0, sizeof(struct user_i387_ia32_struct), diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 83c23c230b4c..23f1691670b6 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * FPU signal frame handling routines. 
*/ @@ -155,7 +156,8 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) */ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) { - struct xregs_state *xsave = ¤t->thread.fpu.state.xsave; + struct fpu *fpu = ¤t->thread.fpu; + struct xregs_state *xsave = &fpu->state.xsave; struct task_struct *tsk = current; int ia32_fxstate = (buf != buf_fx); @@ -170,13 +172,13 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) sizeof(struct user_i387_ia32_struct), NULL, (struct _fpstate_32 __user *) buf) ? -1 : 1; - if (fpregs_active() || using_compacted_format()) { + if (fpu->initialized || using_compacted_format()) { /* Save the live register state to the user directly. */ if (copy_fpregs_to_sigframe(buf_fx)) return -1; /* Update the thread's fxstate to save the fsave header. */ if (ia32_fxstate) - copy_fxregs_to_kernel(&tsk->thread.fpu); + copy_fxregs_to_kernel(fpu); } else { /* * It is a *bug* if kernel uses compacted-format for xsave @@ -189,7 +191,7 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) return -1; } - fpstate_sanitize_xstate(&tsk->thread.fpu); + fpstate_sanitize_xstate(fpu); if (__copy_to_user(buf_fx, xsave, fpu_user_xstate_size)) return -1; } @@ -213,8 +215,11 @@ sanitize_restored_xstate(struct task_struct *tsk, struct xstate_header *header = &xsave->header; if (use_xsave()) { - /* These bits must be zero. */ - memset(header->reserved, 0, 48); + /* + * Note: we don't need to zero the reserved bits in the + * xstate_header here because we either didn't copy them at all, + * or we checked earlier that they aren't set. + */ /* * Init the state that is not present in the memory @@ -223,7 +228,7 @@ sanitize_restored_xstate(struct task_struct *tsk, if (fx_only) header->xfeatures = XFEATURE_MASK_FPSSE; else - header->xfeatures &= (xfeatures_mask & xfeatures); + header->xfeatures &= xfeatures; } if (use_fxsr()) { @@ -279,7 +284,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) if (!access_ok(VERIFY_READ, buf, size)) return -EACCES; - fpu__activate_curr(fpu); + fpu__initialize(fpu); if (!static_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_set(current, NULL, @@ -307,28 +312,29 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) /* * For 32-bit frames with fxstate, copy the user state to the * thread's fpu state, reconstruct fxstate from the fsave - * header. Sanitize the copied state etc. + * header. Validate and sanitize the copied state. */ struct fpu *fpu = &tsk->thread.fpu; struct user_i387_ia32_struct env; int err = 0; /* - * Drop the current fpu which clears fpu->fpstate_active. This ensures + * Drop the current fpu which clears fpu->initialized. This ensures * that any context-switch during the copy of the new state, * avoids the intermediate state from getting restored/saved. * Thus avoiding the new restored state from getting corrupted. * We will be ready to restore/save the state only after - * fpu->fpstate_active is again set. + * fpu->initialized is again set. 
*/ fpu__drop(fpu); if (using_compacted_format()) { - err = copyin_to_xsaves(NULL, buf_fx, - &fpu->state.xsave); + err = copy_user_to_xstate(&fpu->state.xsave, buf_fx); } else { - err = __copy_from_user(&fpu->state.xsave, - buf_fx, state_size); + err = __copy_from_user(&fpu->state.xsave, buf_fx, state_size); + + if (!err && state_size > offsetof(struct xregs_state, header)) + err = validate_xstate_header(&fpu->state.xsave.header); } if (err || __copy_from_user(&env, buf, sizeof(env))) { @@ -339,7 +345,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) sanitize_restored_xstate(tsk, &env, xfeatures, fx_only); } - fpu->fpstate_active = 1; + fpu->initialized = 1; preempt_disable(); fpu__restore(fpu); preempt_enable(); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index c24ac1efb12d..f1d5476c9022 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -483,6 +483,30 @@ int using_compacted_format(void) return boot_cpu_has(X86_FEATURE_XSAVES); } +/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ +int validate_xstate_header(const struct xstate_header *hdr) +{ + /* No unknown or supervisor features may be set */ + if (hdr->xfeatures & (~xfeatures_mask | XFEATURE_MASK_SUPERVISOR)) + return -EINVAL; + + /* Userspace must use the uncompacted format */ + if (hdr->xcomp_bv) + return -EINVAL; + + /* + * If 'reserved' is shrunken to add a new field, make sure to validate + * that new field here! + */ + BUILD_BUG_ON(sizeof(hdr->reserved) != 48); + + /* No reserved bits may be set */ + if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved))) + return -EINVAL; + + return 0; +} + static void __xstate_dump_leaves(void) { int i; @@ -867,7 +891,7 @@ const void *get_xsave_field_ptr(int xsave_state) { struct fpu *fpu = ¤t->thread.fpu; - if (!fpu->fpstate_active) + if (!fpu->initialized) return NULL; /* * fpu__save() takes the CPU's xstate registers @@ -921,38 +945,129 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, #endif /* ! CONFIG_ARCH_HAS_PKEYS */ /* + * Weird legacy quirk: SSE and YMM states store information in the + * MXCSR and MXCSR_FLAGS fields of the FP area. That means if the FP + * area is marked as unused in the xfeatures header, we need to copy + * MXCSR and MXCSR_FLAGS if either SSE or YMM are in use. + */ +static inline bool xfeatures_mxcsr_quirk(u64 xfeatures) +{ + if (!(xfeatures & (XFEATURE_MASK_SSE|XFEATURE_MASK_YMM))) + return false; + + if (xfeatures & XFEATURE_MASK_FP) + return false; + + return true; +} + +/* * This is similar to user_regset_copyout(), but will not add offset to * the source data pointer or increment pos, count, kbuf, and ubuf. */ -static inline int xstate_copyout(unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf, - const void *data, const int start_pos, - const int end_pos) +static inline void +__copy_xstate_to_kernel(void *kbuf, const void *data, + unsigned int offset, unsigned int size, unsigned int size_total) { - if ((count == 0) || (pos < start_pos)) - return 0; + if (offset < size_total) { + unsigned int copy = min(size, size_total - offset); - if (end_pos < 0 || pos < end_pos) { - unsigned int copy = (end_pos < 0 ? 
count : min(count, end_pos - pos)); + memcpy(kbuf + offset, data, copy); + } +} - if (kbuf) { - memcpy(kbuf + pos, data, copy); - } else { - if (__copy_to_user(ubuf + pos, data, copy)) - return -EFAULT; +/* + * Convert from kernel XSAVES compacted format to standard format and copy + * to a kernel-space ptrace buffer. + * + * It supports partial copy but pos always starts from zero. This is called + * from xstateregs_get() and there we check the CPU has XSAVES. + */ +int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total) +{ + unsigned int offset, size; + struct xstate_header header; + int i; + + /* + * Currently copy_regset_to_user() starts from pos 0: + */ + if (unlikely(offset_start != 0)) + return -EFAULT; + + /* + * The destination is a ptrace buffer; we put in only user xstates: + */ + memset(&header, 0, sizeof(header)); + header.xfeatures = xsave->header.xfeatures; + header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR; + + /* + * Copy xregs_state->header: + */ + offset = offsetof(struct xregs_state, header); + size = sizeof(header); + + __copy_xstate_to_kernel(kbuf, &header, offset, size, size_total); + + for (i = 0; i < XFEATURE_MAX; i++) { + /* + * Copy only in-use xstates: + */ + if ((header.xfeatures >> i) & 1) { + void *src = __raw_xsave_addr(xsave, 1 << i); + + offset = xstate_offsets[i]; + size = xstate_sizes[i]; + + /* The next component has to fit fully into the output buffer: */ + if (offset + size > size_total) + break; + + __copy_xstate_to_kernel(kbuf, src, offset, size, size_total); } + + } + + if (xfeatures_mxcsr_quirk(header.xfeatures)) { + offset = offsetof(struct fxregs_state, mxcsr); + size = MXCSR_AND_FLAGS_SIZE; + __copy_xstate_to_kernel(kbuf, &xsave->i387.mxcsr, offset, size, size_total); + } + + /* + * Fill xsave->i387.sw_reserved value for ptrace frame: + */ + offset = offsetof(struct fxregs_state, sw_reserved); + size = sizeof(xstate_fx_sw_bytes); + + __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, size_total); + + return 0; +} + +static inline int +__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int offset, unsigned int size, unsigned int size_total) +{ + if (!size) + return 0; + + if (offset < size_total) { + unsigned int copy = min(size, size_total - offset); + + if (__copy_to_user(ubuf + offset, data, copy)) + return -EFAULT; } return 0; } /* * Convert from kernel XSAVES compacted format to standard format and copy - * to a ptrace buffer. It supports partial copy but pos always starts from + * to a user-space buffer. It supports partial copy but pos always starts from * zero. This is called from xstateregs_get() and there we check the CPU * has XSAVES. 
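copy_xstate_to_kernel() above and its user-space counterpart exist so that ptrace always sees the xstate in standard (non-compacted) format even when the kernel itself saves with XSAVES. A hedged user-space sketch of the consumer side, using PTRACE_GETREGSET with NT_X86_XSTATE: the 512-byte legacy area followed by the xstate header (XSTATE_BV first) is the architectural layout; the buffer size, names and error handling are illustrative only.

#include <elf.h>		/* NT_X86_XSTATE */
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t child = fork();

	if (child == 0) {
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		raise(SIGSTOP);			/* let the parent inspect us */
		_exit(0);
	}

	waitpid(child, NULL, 0);		/* child is now stopped */

	uint8_t xsave[4096];			/* assumed large enough for this sketch */
	struct iovec iov = { .iov_base = xsave, .iov_len = sizeof(xsave) };

	if (ptrace(PTRACE_GETREGSET, child,
		   (void *)(uintptr_t)NT_X86_XSTATE, &iov) == 0) {
		uint64_t xstate_bv;

		/* Standard format: legacy area is 512 bytes, header follows. */
		memcpy(&xstate_bv, xsave + 512, sizeof(xstate_bv));
		printf("xstate components in use: %#llx (%zu bytes returned)\n",
		       (unsigned long long)xstate_bv, iov.iov_len);
	} else {
		perror("PTRACE_GETREGSET");
	}

	kill(child, SIGKILL);
	waitpid(child, NULL, 0);
	return 0;
}
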
*/ -int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf, - void __user *ubuf, struct xregs_state *xsave) +int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total) { unsigned int offset, size; int ret, i; @@ -961,7 +1076,7 @@ int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf, /* * Currently copy_regset_to_user() starts from pos 0: */ - if (unlikely(pos != 0)) + if (unlikely(offset_start != 0)) return -EFAULT; /* @@ -977,8 +1092,7 @@ int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf, offset = offsetof(struct xregs_state, header); size = sizeof(header); - ret = xstate_copyout(offset, size, kbuf, ubuf, &header, 0, count); - + ret = __copy_xstate_to_user(ubuf, &header, offset, size, size_total); if (ret) return ret; @@ -992,25 +1106,30 @@ int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf, offset = xstate_offsets[i]; size = xstate_sizes[i]; - ret = xstate_copyout(offset, size, kbuf, ubuf, src, 0, count); + /* The next component has to fit fully into the output buffer: */ + if (offset + size > size_total) + break; + ret = __copy_xstate_to_user(ubuf, src, offset, size, size_total); if (ret) return ret; - - if (offset + size >= count) - break; } } + if (xfeatures_mxcsr_quirk(header.xfeatures)) { + offset = offsetof(struct fxregs_state, mxcsr); + size = MXCSR_AND_FLAGS_SIZE; + __copy_xstate_to_user(ubuf, &xsave->i387.mxcsr, offset, size, size_total); + } + /* * Fill xsave->i387.sw_reserved value for ptrace frame: */ offset = offsetof(struct fxregs_state, sw_reserved); size = sizeof(xstate_fx_sw_bytes); - ret = xstate_copyout(offset, size, kbuf, ubuf, xstate_fx_sw_bytes, 0, count); - + ret = __copy_xstate_to_user(ubuf, xstate_fx_sw_bytes, offset, size, size_total); if (ret) return ret; @@ -1018,55 +1137,98 @@ int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf, } /* - * Convert from a ptrace standard-format buffer to kernel XSAVES format - * and copy to the target thread. This is called from xstateregs_set() and - * there we check the CPU has XSAVES and a whole standard-sized buffer - * exists. + * Convert from a ptrace standard-format kernel buffer to kernel XSAVES format + * and copy to the target thread. This is called from xstateregs_set(). */ -int copyin_to_xsaves(const void *kbuf, const void __user *ubuf, - struct xregs_state *xsave) +int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) { unsigned int offset, size; int i; - u64 xfeatures; - u64 allowed_features; + struct xstate_header hdr; offset = offsetof(struct xregs_state, header); - size = sizeof(xfeatures); + size = sizeof(hdr); - if (kbuf) { - memcpy(&xfeatures, kbuf + offset, size); - } else { - if (__copy_from_user(&xfeatures, ubuf + offset, size)) - return -EFAULT; + memcpy(&hdr, kbuf + offset, size); + + if (validate_xstate_header(&hdr)) + return -EINVAL; + + for (i = 0; i < XFEATURE_MAX; i++) { + u64 mask = ((u64)1 << i); + + if (hdr.xfeatures & mask) { + void *dst = __raw_xsave_addr(xsave, 1 << i); + + offset = xstate_offsets[i]; + size = xstate_sizes[i]; + + memcpy(dst, kbuf + offset, size); + } + } + + if (xfeatures_mxcsr_quirk(hdr.xfeatures)) { + offset = offsetof(struct fxregs_state, mxcsr); + size = MXCSR_AND_FLAGS_SIZE; + memcpy(&xsave->i387.mxcsr, kbuf + offset, size); } /* - * Reject if the user sets any disabled or supervisor features: + * The state that came in from userspace was user-state only. 
+ * Mask all the user states out of 'xfeatures': + */ + xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR; + + /* + * Add back in the features that came in from userspace: */ - allowed_features = xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR; + xsave->header.xfeatures |= hdr.xfeatures; - if (xfeatures & ~allowed_features) + return 0; +} + +/* + * Convert from a ptrace or sigreturn standard-format user-space buffer to + * kernel XSAVES format and copy to the target thread. This is called from + * xstateregs_set(), as well as potentially from the sigreturn() and + * rt_sigreturn() system calls. + */ +int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) +{ + unsigned int offset, size; + int i; + struct xstate_header hdr; + + offset = offsetof(struct xregs_state, header); + size = sizeof(hdr); + + if (__copy_from_user(&hdr, ubuf + offset, size)) + return -EFAULT; + + if (validate_xstate_header(&hdr)) return -EINVAL; for (i = 0; i < XFEATURE_MAX; i++) { u64 mask = ((u64)1 << i); - if (xfeatures & mask) { + if (hdr.xfeatures & mask) { void *dst = __raw_xsave_addr(xsave, 1 << i); offset = xstate_offsets[i]; size = xstate_sizes[i]; - if (kbuf) { - memcpy(dst, kbuf + offset, size); - } else { - if (__copy_from_user(dst, ubuf + offset, size)) - return -EFAULT; - } + if (__copy_from_user(dst, ubuf + offset, size)) + return -EFAULT; } } + if (xfeatures_mxcsr_quirk(hdr.xfeatures)) { + offset = offsetof(struct fxregs_state, mxcsr); + size = MXCSR_AND_FLAGS_SIZE; + if (__copy_from_user(&xsave->i387.mxcsr, ubuf + offset, size)) + return -EFAULT; + } + /* * The state that came in from userspace was user-state only. * Mask all the user states out of 'xfeatures': @@ -1076,7 +1238,7 @@ int copyin_to_xsaves(const void *kbuf, const void __user *ubuf, /* * Add back in the features that came in from userspace: */ - xsave->header.xfeatures |= xfeatures; + xsave->header.xfeatures |= hdr.xfeatures; return 0; } diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 9bef1bbeba63..01ebcb6f263e 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Dynamic function tracing support. * diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S index 722a145b4139..b6c6468e10bc 100644 --- a/arch/x86/kernel/ftrace_32.S +++ b/arch/x86/kernel/ftrace_32.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2017 Steven Rostedt, VMware Inc. 
*/ diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 1dfac634bbf7..c832291d948a 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2014 Steven Rostedt, Red Hat Inc */ diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 538ec012b371..ec6fefbfd3c0 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * linux/arch/i386/kernel/head32.c -- prepare to run common code * @@ -10,6 +11,7 @@ #include <linux/mm.h> #include <linux/memblock.h> +#include <asm/desc.h> #include <asm/setup.h> #include <asm/sections.h> #include <asm/e820/api.h> @@ -29,7 +31,11 @@ static void __init i386_default_early_setup(void) asmlinkage __visible void __init i386_start_kernel(void) { + /* Make sure IDT is set up before any exception happens */ + idt_setup_early_handler(); + cr4_init_shadow(); + sanitize_boot_params(&boot_params); x86_early_init_platform_quirks(); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 43b7002f44fb..6a5d757b9cfd 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * prepare to run common code * @@ -14,6 +15,7 @@ #include <linux/start_kernel.h> #include <linux/io.h> #include <linux/memblock.h> +#include <linux/mem_encrypt.h> #include <asm/processor.h> #include <asm/proto.h> @@ -33,33 +35,170 @@ /* * Manage page tables very early on. */ -extern pgd_t early_level4_pgt[PTRS_PER_PGD]; extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; -static unsigned int __initdata next_early_pgt = 2; +static unsigned int __initdata next_early_pgt; pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); +#define __head __section(.head.text) + +static void __head *fixup_pointer(void *ptr, unsigned long physaddr) +{ + return ptr - (void *)_text + (void *)physaddr; +} + +unsigned long __head __startup_64(unsigned long physaddr, + struct boot_params *bp) +{ + unsigned long load_delta, *p; + unsigned long pgtable_flags; + pgdval_t *pgd; + p4dval_t *p4d; + pudval_t *pud; + pmdval_t *pmd, pmd_entry; + int i; + unsigned int *next_pgt_ptr; + + /* Is the address too large? */ + if (physaddr >> MAX_PHYSMEM_BITS) + for (;;); + + /* + * Compute the delta between the address I am compiled to run at + * and the address I am actually running at. + */ + load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map); + + /* Is the address not 2M aligned? */ + if (load_delta & ~PMD_PAGE_MASK) + for (;;); + + /* Activate Secure Memory Encryption (SME) if supported and enabled */ + sme_enable(bp); + + /* Include the SME encryption mask in the fixup value */ + load_delta += sme_get_me_mask(); + + /* Fixup the physical addresses in the page table */ + + pgd = fixup_pointer(&early_top_pgt, physaddr); + pgd[pgd_index(__START_KERNEL_map)] += load_delta; + + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d = fixup_pointer(&level4_kernel_pgt, physaddr); + p4d[511] += load_delta; + } + + pud = fixup_pointer(&level3_kernel_pgt, physaddr); + pud[510] += load_delta; + pud[511] += load_delta; + + pmd = fixup_pointer(level2_fixmap_pgt, physaddr); + pmd[506] += load_delta; + + /* + * Set up the identity mapping for the switchover. These + * entries should *NOT* have the global bit set! This also + * creates a bunch of nonsense entries but that is fine -- + * it avoids problems around wraparound. 
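__startup_64() runs before relocations and the final page tables are in place, so every global it touches goes through fixup_pointer(): subtract the link-time address of _text, add the physical address the image actually sits at. The same arithmetic with the kernel types stripped away (illustration only):

/* Rebase a link-time pointer to the address the image really runs at. */
static void *rebase_ptr(void *ptr, unsigned long link_addr, unsigned long load_addr)
{
	return (void *)((unsigned long)ptr - link_addr + load_addr);
}

/*
 * load_delta above is the same idea applied to the kernel text itself:
 * physaddr - (_text - __START_KERNEL_map), plus the SME mask when active.
 */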
+ */ + + next_pgt_ptr = fixup_pointer(&next_early_pgt, physaddr); + pud = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr); + pmd = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr); + + pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask(); + + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); + + i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; + pgd[i + 0] = (pgdval_t)p4d + pgtable_flags; + pgd[i + 1] = (pgdval_t)p4d + pgtable_flags; + + i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D; + p4d[i + 0] = (pgdval_t)pud + pgtable_flags; + p4d[i + 1] = (pgdval_t)pud + pgtable_flags; + } else { + i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; + pgd[i + 0] = (pgdval_t)pud + pgtable_flags; + pgd[i + 1] = (pgdval_t)pud + pgtable_flags; + } + + i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD; + pud[i + 0] = (pudval_t)pmd + pgtable_flags; + pud[i + 1] = (pudval_t)pmd + pgtable_flags; + + pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL; + pmd_entry += sme_get_me_mask(); + pmd_entry += physaddr; + + for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) { + int idx = i + (physaddr >> PMD_SHIFT) % PTRS_PER_PMD; + pmd[idx] = pmd_entry + i * PMD_SIZE; + } + + /* + * Fixup the kernel text+data virtual addresses. Note that + * we might write invalid pmds, when the kernel is relocated + * cleanup_highmap() fixes this up along with the mappings + * beyond _end. + */ + + pmd = fixup_pointer(level2_kernel_pgt, physaddr); + for (i = 0; i < PTRS_PER_PMD; i++) { + if (pmd[i] & _PAGE_PRESENT) + pmd[i] += load_delta; + } + + /* + * Fixup phys_base - remove the memory encryption mask to obtain + * the true physical address. + */ + p = fixup_pointer(&phys_base, physaddr); + *p += load_delta - sme_get_me_mask(); + + /* Encrypt the kernel (if SME is active) */ + sme_encrypt_kernel(); + + /* + * Return the SME encryption mask (if SME is active) to be used as a + * modifier for the initial pgdir entry programmed into CR3. + */ + return sme_get_me_mask(); +} + +unsigned long __startup_secondary_64(void) +{ + /* + * Return the SME encryption mask (if SME is active) to be used as a + * modifier for the initial pgdir entry programmed into CR3. + */ + return sme_get_me_mask(); +} + /* Wipe all early page tables except for the kernel symbol map */ static void __init reset_early_page_tables(void) { - memset(early_level4_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1)); + memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1)); next_early_pgt = 0; - write_cr3(__pa_nodebug(early_level4_pgt)); + write_cr3(__sme_pa_nodebug(early_top_pgt)); } /* Create a new PMD entry */ -int __init early_make_pgtable(unsigned long address) +int __init __early_make_pgtable(unsigned long address, pmdval_t pmd) { unsigned long physaddr = address - __PAGE_OFFSET; pgdval_t pgd, *pgd_p; + p4dval_t p4d, *p4d_p; pudval_t pud, *pud_p; - pmdval_t pmd, *pmd_p; + pmdval_t *pmd_p; /* Invalid address or early pgt is done ? */ - if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt)) + if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt)) return -1; again: - pgd_p = &early_level4_pgt[pgd_index(address)].pgd; + pgd_p = &early_top_pgt[pgd_index(address)].pgd; pgd = *pgd_p; /* @@ -67,8 +206,25 @@ again: * critical -- __PAGE_OFFSET would point us back into the dynamic * range and we might end up looping forever... 
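The identity-map construction above indexes each paging level with plain shift-and-modulo arithmetic and fills two consecutive slots, so the mapping still works if the kernel image straddles a boundary at that level. A runnable toy version of the index math (standard 4-level constants assumed):

#include <stdio.h>

#define EX_PMD_SHIFT	21
#define EX_PUD_SHIFT	30
#define EX_PGDIR_SHIFT	39
#define EX_PTRS_PER_TBL	512

int main(void)
{
	unsigned long physaddr = 0x1000000UL;	/* example: kernel loaded at 16 MiB */

	printf("pgd slot %lu, pud slot %lu, pmd slot %lu\n",
	       (physaddr >> EX_PGDIR_SHIFT) % EX_PTRS_PER_TBL,
	       (physaddr >> EX_PUD_SHIFT)   % EX_PTRS_PER_TBL,
	       (physaddr >> EX_PMD_SHIFT)   % EX_PTRS_PER_TBL);
	return 0;
}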
*/ - if (pgd) - pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); + if (!IS_ENABLED(CONFIG_X86_5LEVEL)) + p4d_p = pgd_p; + else if (pgd) + p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); + else { + if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) { + reset_early_page_tables(); + goto again; + } + + p4d_p = (p4dval_t *)early_dynamic_pgts[next_early_pgt++]; + memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D); + *pgd_p = (pgdval_t)p4d_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; + } + p4d_p += p4d_index(address); + p4d = *p4d_p; + + if (p4d) + pud_p = (pudval_t *)((p4d & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); else { if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) { reset_early_page_tables(); @@ -77,7 +233,7 @@ again: pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++]; memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); - *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; + *p4d_p = (p4dval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; } pud_p += pud_index(address); pud = *pud_p; @@ -94,12 +250,21 @@ again: memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; } - pmd = (physaddr & PMD_MASK) + early_pmd_flags; pmd_p[pmd_index(address)] = pmd; return 0; } +int __init early_make_pgtable(unsigned long address) +{ + unsigned long physaddr = address - __PAGE_OFFSET; + pmdval_t pmd; + + pmd = (physaddr & PMD_MASK) + early_pmd_flags; + + return __early_make_pgtable(address, pmd); +} + /* Don't add a printk in there. printk relies on the PDA which is not initialized yet. */ static void __init clear_bss(void) @@ -122,6 +287,12 @@ static void __init copy_bootdata(char *real_mode_data) char * command_line; unsigned long cmd_line_ptr; + /* + * If SME is active, this will create decrypted mappings of the + * boot data in advance of the copy operations. + */ + sme_map_bootdata(real_mode_data); + memcpy(&boot_params, real_mode_data, sizeof boot_params); sanitize_boot_params(&boot_params); cmd_line_ptr = get_cmd_line_ptr(); @@ -129,12 +300,18 @@ static void __init copy_bootdata(char *real_mode_data) command_line = __va(cmd_line_ptr); memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); } + + /* + * The old boot data is no longer needed and won't be reserved, + * freeing up that memory for use by the system. If SME is active, + * we need to remove the mappings that were created so that the + * memory doesn't remain mapped as decrypted. + */ + sme_unmap_bootdata(real_mode_data); } asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) { - int i; - /* * Build-time sanity checks on the kernel image and module * area mappings. (these are purely build-time and produce no code) @@ -156,13 +333,18 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) clear_bss(); - clear_page(init_level4_pgt); + clear_page(init_top_pgt); + + /* + * SME support may update early_pmd_flags to include the memory + * encryption mask, so it needs to be called before anything + * that may generate a page fault. 
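The split keeps the old early_make_pgtable() entry point: it now only forms the 2 MiB PMD value for the faulting address and hands it to __early_make_pgtable(), which walks (and, if needed, allocates) the upper levels. The value formed is simply the 2 MiB-aligned physical address plus the large-page flags (sketch, flags left symbolic):

#define EX_PMD_SHIFT	21
#define EX_PMD_SIZE	(1UL << EX_PMD_SHIFT)
#define EX_PMD_MASK	(~(EX_PMD_SIZE - 1))

/* What early_make_pgtable() passes down for a faulting physical address. */
static unsigned long early_pmd_value(unsigned long physaddr, unsigned long large_page_flags)
{
	return (physaddr & EX_PMD_MASK) + large_page_flags;
}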
+ */ + sme_early_init(); kasan_early_init(); - for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) - set_intr_gate(i, early_idt_handler_array[i]); - load_idt((const struct desc_ptr *)&idt_descr); + idt_setup_early_handler(); copy_bootdata(__va(real_mode_data)); @@ -171,8 +353,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) */ load_ucode_bsp(); - /* set init_level4_pgt kernel high mapping*/ - init_level4_pgt[511] = early_level4_pgt[511]; + /* set init_top_pgt kernel high mapping*/ + init_top_pgt[511] = early_top_pgt[511]; x86_64_start_reservations(real_mode_data); } diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 1f85ee8f9439..f1d528bb66a6 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * * Copyright (C) 1991, 1992 Linus Torvalds @@ -155,7 +156,6 @@ ENTRY(startup_32) jmp *%eax .Lbad_subarch: -WEAK(lguest_entry) WEAK(xen_entry) /* Unknown implementation; there's really nothing we can do at this point. */ @@ -165,7 +165,6 @@ WEAK(xen_entry) subarch_entries: .long .Ldefault_entry /* normal x86/PC */ - .long lguest_entry /* lguest hypervisor */ .long xen_entry /* Xen hypervisor */ .long .Ldefault_entry /* Moorestown MID */ num_subarch_entries = (. - subarch_entries) / 4 @@ -347,7 +346,6 @@ ENTRY(startup_32_smp) movl %eax,%cr0 lgdt early_gdt_descr - lidt idt_descr ljmp $(__KERNEL_CS),$1f 1: movl $(__KERNEL_DS),%eax # reload all the segment registers movl %eax,%ss # after changing gdt. @@ -380,37 +378,6 @@ ENDPROC(startup_32_smp) */ __INIT setup_once: - /* - * Set up a idt with 256 interrupt gates that push zero if there - * is no error code and then jump to early_idt_handler_common. - * It doesn't actually load the idt - that needs to be done on - * each CPU. Interrupts are enabled elsewhere, when we can be - * relatively sure everything is ok. - */ - - movl $idt_table,%edi - movl $early_idt_handler_array,%eax - movl $NUM_EXCEPTION_VECTORS,%ecx -1: - movl %eax,(%edi) - movl %eax,4(%edi) - /* interrupt gate, dpl=0, present */ - movl $(0x8E000000 + __KERNEL_CS),2(%edi) - addl $EARLY_IDT_HANDLER_SIZE,%eax - addl $8,%edi - loop 1b - - movl $256 - NUM_EXCEPTION_VECTORS,%ecx - movl $ignore_int,%edx - movl $(__KERNEL_CS << 16),%eax - movw %dx,%ax /* selector = 0x0010 = cs */ - movw $0x8E00,%dx /* interrupt gate - dpl=0, present */ -2: - movl %eax,(%edi) - movl %edx,4(%edi) - addl $8,%edi - loop 2b - #ifdef CONFIG_CC_STACKPROTECTOR /* * Configure the stack canary. 
The linker can't handle this by @@ -457,12 +424,9 @@ early_idt_handler_common: /* The vector number is in pt_regs->gs */ cld - pushl %fs /* pt_regs->fs */ - movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */ - pushl %es /* pt_regs->es */ - movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */ - pushl %ds /* pt_regs->ds */ - movw $0, 2(%esp) /* clear high bits (some CPUs leave garbage) */ + pushl %fs /* pt_regs->fs (__fsh varies by model) */ + pushl %es /* pt_regs->es (__esh varies by model) */ + pushl %ds /* pt_regs->ds (__dsh varies by model) */ pushl %eax /* pt_regs->ax */ pushl %ebp /* pt_regs->bp */ pushl %edi /* pt_regs->di */ @@ -479,9 +443,8 @@ early_idt_handler_common: /* Load the vector number into EDX */ movl PT_GS(%esp), %edx - /* Load GS into pt_regs->gs and clear high bits */ + /* Load GS into pt_regs->gs (and maybe clobber __gsh) */ movw %gs, PT_GS(%esp) - movw $0, PT_GS+2(%esp) movl %esp, %eax /* args are pt_regs (EAX), trapnr (EDX) */ call early_fixup_exception @@ -493,18 +456,17 @@ early_idt_handler_common: popl %edi /* pt_regs->di */ popl %ebp /* pt_regs->bp */ popl %eax /* pt_regs->ax */ - popl %ds /* pt_regs->ds */ - popl %es /* pt_regs->es */ - popl %fs /* pt_regs->fs */ - popl %gs /* pt_regs->gs */ + popl %ds /* pt_regs->ds (always ignores __dsh) */ + popl %es /* pt_regs->es (always ignores __esh) */ + popl %fs /* pt_regs->fs (always ignores __fsh) */ + popl %gs /* pt_regs->gs (always ignores __gsh) */ decl %ss:early_recursion_flag addl $4, %esp /* pop pt_regs->orig_ax */ iret ENDPROC(early_idt_handler_common) /* This is the default interrupt "handler" :-) */ - ALIGN -ignore_int: +ENTRY(early_ignore_irq) cld #ifdef CONFIG_PRINTK pushl %eax @@ -539,7 +501,8 @@ ignore_int: hlt_loop: hlt jmp hlt_loop -ENDPROC(ignore_int) +ENDPROC(early_ignore_irq) + __INITDATA .align 4 GLOBAL(early_recursion_flag) @@ -628,7 +591,6 @@ int_msg: .data .globl boot_gdt_descr -.globl idt_descr ALIGN # early boot GDT descriptor (must use 1:1 address mapping) @@ -637,11 +599,6 @@ boot_gdt_descr: .word __BOOT_DS+7 .long boot_gdt - __PAGE_OFFSET - .word 0 # 32-bit align idt_desc.address -idt_descr: - .word IDT_ENTRIES*8-1 # idt contains 256 entries - .long idt_table - # boot GDT descriptor (later on used by CPU#0): .word 0 # 32 bit align gdt_desc.address ENTRY(early_gdt_descr) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index ac9d327d2e42..6dde3f3fc1f8 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * linux/arch/x86/kernel/head_64.S -- start in 32bit and switch to 64bit * @@ -37,10 +38,11 @@ * */ +#define p4d_index(x) (((x) >> P4D_SHIFT) & (PTRS_PER_P4D-1)) #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) -L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE) -L4_START_KERNEL = pgd_index(__START_KERNEL_map) +PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE) +PGD_START_KERNEL = pgd_index(__START_KERNEL_map) L3_START_KERNEL = pud_index(__START_KERNEL_map) .text @@ -73,100 +75,18 @@ startup_64: call verify_cpu /* - * Compute the delta between the address I am compiled to run at and the - * address I am actually running at. - */ - leaq _text(%rip), %rbp - subq $_text - __START_KERNEL_map, %rbp - - /* Is the address not 2M aligned? */ - testl $~PMD_PAGE_MASK, %ebp - jnz bad_address - - /* - * Is the address too large? 
- */ - leaq _text(%rip), %rax - shrq $MAX_PHYSMEM_BITS, %rax - jnz bad_address - - /* - * Fixup the physical addresses in the page table - */ - addq %rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip) - - addq %rbp, level3_kernel_pgt + (510*8)(%rip) - addq %rbp, level3_kernel_pgt + (511*8)(%rip) - - addq %rbp, level2_fixmap_pgt + (506*8)(%rip) - - /* - * Set up the identity mapping for the switchover. These - * entries should *NOT* have the global bit set! This also - * creates a bunch of nonsense entries but that is fine -- - * it avoids problems around wraparound. + * Perform pagetable fixups. Additionally, if SME is active, encrypt + * the kernel and retrieve the modifier (SME encryption mask if SME + * is active) to be added to the initial pgdir entry that will be + * programmed into CR3. */ leaq _text(%rip), %rdi - leaq early_level4_pgt(%rip), %rbx - - movq %rdi, %rax - shrq $PGDIR_SHIFT, %rax - - leaq (PAGE_SIZE + _KERNPG_TABLE)(%rbx), %rdx - movq %rdx, 0(%rbx,%rax,8) - movq %rdx, 8(%rbx,%rax,8) - - addq $PAGE_SIZE, %rdx - movq %rdi, %rax - shrq $PUD_SHIFT, %rax - andl $(PTRS_PER_PUD-1), %eax - movq %rdx, PAGE_SIZE(%rbx,%rax,8) - incl %eax - andl $(PTRS_PER_PUD-1), %eax - movq %rdx, PAGE_SIZE(%rbx,%rax,8) - - addq $PAGE_SIZE * 2, %rbx - movq %rdi, %rax - shrq $PMD_SHIFT, %rdi - addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax - leaq (_end - 1)(%rip), %rcx - shrq $PMD_SHIFT, %rcx - subq %rdi, %rcx - incl %ecx - -1: - andq $(PTRS_PER_PMD - 1), %rdi - movq %rax, (%rbx,%rdi,8) - incq %rdi - addq $PMD_SIZE, %rax - decl %ecx - jnz 1b - - test %rbp, %rbp - jz .Lskip_fixup + pushq %rsi + call __startup_64 + popq %rsi - /* - * Fixup the kernel text+data virtual addresses. Note that - * we might write invalid pmds, when the kernel is relocated - * cleanup_highmap() fixes this up along with the mappings - * beyond _end. - */ - leaq level2_kernel_pgt(%rip), %rdi - leaq PAGE_SIZE(%rdi), %r8 - /* See if it is a valid page table entry */ -1: testb $_PAGE_PRESENT, 0(%rdi) - jz 2f - addq %rbp, 0(%rdi) - /* Go to the next page */ -2: addq $8, %rdi - cmp %r8, %rdi - jne 1b - - /* Fixup phys_base */ - addq %rbp, phys_base(%rip) - -.Lskip_fixup: - movq $(early_level4_pgt - __START_KERNEL_map), %rax + /* Form the CR3 value being sure to include the CR3 modifier */ + addq $(early_top_pgt - __START_KERNEL_map), %rax jmp 1f ENTRY(secondary_startup_64) /* @@ -186,14 +106,26 @@ ENTRY(secondary_startup_64) /* Sanitize CPU configuration */ call verify_cpu - movq $(init_level4_pgt - __START_KERNEL_map), %rax + /* + * Retrieve the modifier (SME encryption mask if SME is active) to be + * added to the initial pgdir entry that will be programmed into CR3. + */ + pushq %rsi + call __startup_secondary_64 + popq %rsi + + /* Form the CR3 value being sure to include the CR3 modifier */ + addq $(init_top_pgt - __START_KERNEL_map), %rax 1: - /* Enable PAE mode and PGE */ + /* Enable PAE mode, PGE and LA57 */ movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx +#ifdef CONFIG_X86_5LEVEL + orl $X86_CR4_LA57, %ecx +#endif movq %rcx, %cr4 - /* Setup early boot stage 4 level pagetables. */ + /* Setup early boot stage 4-/5-level pagetables. 
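Just before CR3 is loaded, the 64-bit startup code now enables LA57 alongside PAE and PGE when the kernel is built for 5-level paging. In C terms the CR4 value being composed looks like this (bit positions per the architecture manuals, shown only for illustration):

#define EX_CR4_PAE	(1UL << 5)
#define EX_CR4_PGE	(1UL << 7)
#define EX_CR4_LA57	(1UL << 12)

static unsigned long early_boot_cr4(int five_level_paging)
{
	unsigned long cr4 = EX_CR4_PAE | EX_CR4_PGE;

	if (five_level_paging)		/* CONFIG_X86_5LEVEL */
		cr4 |= EX_CR4_LA57;
	return cr4;
}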
*/ addq phys_base(%rip), %rax movq %rax, %cr3 @@ -417,9 +349,13 @@ GLOBAL(name) .endr __INITDATA -NEXT_PAGE(early_level4_pgt) +NEXT_PAGE(early_top_pgt) .fill 511,8,0 - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE +#ifdef CONFIG_X86_5LEVEL + .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC +#else + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC +#endif NEXT_PAGE(early_dynamic_pgts) .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 @@ -427,19 +363,19 @@ NEXT_PAGE(early_dynamic_pgts) .data #ifndef CONFIG_XEN -NEXT_PAGE(init_level4_pgt) +NEXT_PAGE(init_top_pgt) .fill 512,8,0 #else -NEXT_PAGE(init_level4_pgt) - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .org init_level4_pgt + L4_START_KERNEL*8, 0 +NEXT_PAGE(init_top_pgt) + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC + .org init_top_pgt + PGD_PAGE_OFFSET*8, 0 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC + .org init_top_pgt + PGD_START_KERNEL*8, 0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC NEXT_PAGE(level3_ident_pgt) - .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .fill 511, 8, 0 NEXT_PAGE(level2_ident_pgt) /* Since I easily can, map the first 1G. @@ -448,11 +384,17 @@ NEXT_PAGE(level2_ident_pgt) PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) #endif +#ifdef CONFIG_X86_5LEVEL +NEXT_PAGE(level4_kernel_pgt) + .fill 511,8,0 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC +#endif + NEXT_PAGE(level3_kernel_pgt) .fill L3_START_KERNEL,8,0 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ - .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE - .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC + .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC NEXT_PAGE(level2_kernel_pgt) /* @@ -470,7 +412,7 @@ NEXT_PAGE(level2_kernel_pgt) NEXT_PAGE(level2_fixmap_pgt) .fill 506,8,0 - .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE + .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ .fill 5,8,0 diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 89ff7af2de50..8ce4212e2b8d 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -285,7 +285,7 @@ static void hpet_legacy_clockevent_register(void) * Start hpet with the boot cpu mask and make it * global after the IO_APIC has been initialized. 
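The '= 510' and '= 511' slot comments in the static page tables are easy to verify; for instance, __START_KERNEL_map (0xffffffff80000000) indexed at the PUD level lands in slot 510, matching L3_START_KERNEL. A runnable check (constants assumed):

#include <stdio.h>

#define EX_PUD_SHIFT	30
#define EX_PTRS_PER_PUD	512

int main(void)
{
	unsigned long start_kernel_map = 0xffffffff80000000UL;

	/* (2^64 - 2 GiB) >> 30, modulo 512  ->  510 */
	printf("L3_START_KERNEL = %lu\n",
	       (start_kernel_map >> EX_PUD_SHIFT) % EX_PTRS_PER_PUD);
	return 0;
}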
*/ - hpet_clockevent.cpumask = cpumask_of(smp_processor_id()); + hpet_clockevent.cpumask = cpumask_of(boot_cpu_data.cpu_index); clockevents_config_and_register(&hpet_clockevent, hpet_freq, HPET_MIN_PROG_DELTA, 0x7FFFFFFF); global_clock_event = &hpet_clockevent; @@ -345,21 +345,10 @@ static int hpet_shutdown(struct clock_event_device *evt, int timer) return 0; } -static int hpet_resume(struct clock_event_device *evt, int timer) +static int hpet_resume(struct clock_event_device *evt) { - if (!timer) { - hpet_enable_legacy_int(); - } else { - struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); - - irq_domain_deactivate_irq(irq_get_irq_data(hdev->irq)); - irq_domain_activate_irq(irq_get_irq_data(hdev->irq)); - disable_hardirq(hdev->irq); - irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu)); - enable_irq(hdev->irq); - } + hpet_enable_legacy_int(); hpet_print_config(); - return 0; } @@ -417,7 +406,7 @@ static int hpet_legacy_set_periodic(struct clock_event_device *evt) static int hpet_legacy_resume(struct clock_event_device *evt) { - return hpet_resume(evt, 0); + return hpet_resume(evt); } static int hpet_legacy_next_event(unsigned long delta, @@ -510,8 +499,14 @@ static int hpet_msi_set_periodic(struct clock_event_device *evt) static int hpet_msi_resume(struct clock_event_device *evt) { struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); + struct irq_data *data = irq_get_irq_data(hdev->irq); + struct msi_msg msg; - return hpet_resume(evt, hdev->num); + /* Restore the MSI msg and unmask the interrupt */ + irq_chip_compose_msi_msg(data, &msg); + hpet_msi_write(hdev, &msg); + hpet_msi_unmask(data); + return 0; } static int hpet_msi_next_event(unsigned long delta, diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 6ebe00cb4a3b..0d307a657abb 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * 8253/PIT functions * diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 4e3b8a587c88..8f5cb2c7060c 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/linkage.h> #include <linux/errno.h> #include <linux/signal.h> diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c new file mode 100644 index 000000000000..6107ee1cb8d5 --- /dev/null +++ b/arch/x86/kernel/idt.c @@ -0,0 +1,371 @@ +/* + * Interrupt descriptor table related code + * + * This file is licensed under the GPL V2 + */ +#include <linux/interrupt.h> + +#include <asm/traps.h> +#include <asm/proto.h> +#include <asm/desc.h> + +struct idt_data { + unsigned int vector; + unsigned int segment; + struct idt_bits bits; + const void *addr; +}; + +#define DPL0 0x0 +#define DPL3 0x3 + +#define DEFAULT_STACK 0 + +#define G(_vector, _addr, _ist, _type, _dpl, _segment) \ + { \ + .vector = _vector, \ + .bits.ist = _ist, \ + .bits.type = _type, \ + .bits.dpl = _dpl, \ + .bits.p = 1, \ + .addr = _addr, \ + .segment = _segment, \ + } + +/* Interrupt gate */ +#define INTG(_vector, _addr) \ + G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL0, __KERNEL_CS) + +/* System interrupt gate */ +#define SYSG(_vector, _addr) \ + G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS) + +/* Interrupt gate with interrupt stack */ +#define ISTG(_vector, _addr, _ist) \ + G(_vector, _addr, _ist, GATE_INTERRUPT, DPL0, __KERNEL_CS) + +/* System interrupt gate with interrupt stack */ +#define SISTG(_vector, _addr, _ist) \ + G(_vector, _addr, _ist, GATE_INTERRUPT, DPL3, __KERNEL_CS) + +/* Task 
gate */ +#define TSKG(_vector, _gdt) \ + G(_vector, NULL, DEFAULT_STACK, GATE_TASK, DPL0, _gdt << 3) + +/* + * Early traps running on the DEFAULT_STACK because the other interrupt + * stacks work only after cpu_init(). + */ +static const __initdata struct idt_data early_idts[] = { + INTG(X86_TRAP_DB, debug), + SYSG(X86_TRAP_BP, int3), +#ifdef CONFIG_X86_32 + INTG(X86_TRAP_PF, page_fault), +#endif +}; + +/* + * The default IDT entries which are set up in trap_init() before + * cpu_init() is invoked. Interrupt stacks cannot be used at that point and + * the traps which use them are reinitialized with IST after cpu_init() has + * set up TSS. + */ +static const __initdata struct idt_data def_idts[] = { + INTG(X86_TRAP_DE, divide_error), + INTG(X86_TRAP_NMI, nmi), + INTG(X86_TRAP_BR, bounds), + INTG(X86_TRAP_UD, invalid_op), + INTG(X86_TRAP_NM, device_not_available), + INTG(X86_TRAP_OLD_MF, coprocessor_segment_overrun), + INTG(X86_TRAP_TS, invalid_TSS), + INTG(X86_TRAP_NP, segment_not_present), + INTG(X86_TRAP_SS, stack_segment), + INTG(X86_TRAP_GP, general_protection), + INTG(X86_TRAP_SPURIOUS, spurious_interrupt_bug), + INTG(X86_TRAP_MF, coprocessor_error), + INTG(X86_TRAP_AC, alignment_check), + INTG(X86_TRAP_XF, simd_coprocessor_error), + +#ifdef CONFIG_X86_32 + TSKG(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS), +#else + INTG(X86_TRAP_DF, double_fault), +#endif + INTG(X86_TRAP_DB, debug), + INTG(X86_TRAP_NMI, nmi), + INTG(X86_TRAP_BP, int3), + +#ifdef CONFIG_X86_MCE + INTG(X86_TRAP_MC, &machine_check), +#endif + + SYSG(X86_TRAP_OF, overflow), +#if defined(CONFIG_IA32_EMULATION) + SYSG(IA32_SYSCALL_VECTOR, entry_INT80_compat), +#elif defined(CONFIG_X86_32) + SYSG(IA32_SYSCALL_VECTOR, entry_INT80_32), +#endif +}; + +/* + * The APIC and SMP idt entries + */ +static const __initdata struct idt_data apic_idts[] = { +#ifdef CONFIG_SMP + INTG(RESCHEDULE_VECTOR, reschedule_interrupt), + INTG(CALL_FUNCTION_VECTOR, call_function_interrupt), + INTG(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt), + INTG(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt), + INTG(REBOOT_VECTOR, reboot_interrupt), +#endif + +#ifdef CONFIG_X86_THERMAL_VECTOR + INTG(THERMAL_APIC_VECTOR, thermal_interrupt), +#endif + +#ifdef CONFIG_X86_MCE_THRESHOLD + INTG(THRESHOLD_APIC_VECTOR, threshold_interrupt), +#endif + +#ifdef CONFIG_X86_MCE_AMD + INTG(DEFERRED_ERROR_VECTOR, deferred_error_interrupt), +#endif + +#ifdef CONFIG_X86_LOCAL_APIC + INTG(LOCAL_TIMER_VECTOR, apic_timer_interrupt), + INTG(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi), +# ifdef CONFIG_HAVE_KVM + INTG(POSTED_INTR_VECTOR, kvm_posted_intr_ipi), + INTG(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi), + INTG(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi), +# endif +# ifdef CONFIG_IRQ_WORK + INTG(IRQ_WORK_VECTOR, irq_work_interrupt), +# endif + INTG(SPURIOUS_APIC_VECTOR, spurious_interrupt), + INTG(ERROR_APIC_VECTOR, error_interrupt), +#endif +}; + +#ifdef CONFIG_X86_64 +/* + * Early traps running on the DEFAULT_STACK because the other interrupt + * stacks work only after cpu_init(). + */ +static const __initdata struct idt_data early_pf_idts[] = { + INTG(X86_TRAP_PF, page_fault), +}; + +/* + * Override for the debug_idt. Same as the default, but with interrupt + * stack set to DEFAULT_STACK (0). Required for NMI trap handling. + */ +static const __initdata struct idt_data dbg_idts[] = { + INTG(X86_TRAP_DB, debug), + INTG(X86_TRAP_BP, int3), +}; +#endif + +/* Must be page-aligned because the real IDT is used in a fixmap. 
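Each table entry above comes from the G() family of macros, which expand to a designated initializer for struct idt_data. A stripped-down illustration of one expansion (simplified types; the selector value is assumed here purely for exposition):

struct ex_idt_bits {
	unsigned int ist  : 3,
		     zero : 5,
		     type : 5,
		     dpl  : 2,
		     p    : 1;
};

struct ex_idt_data {
	unsigned int		vector;
	unsigned int		segment;
	struct ex_idt_bits	bits;
	const void		*addr;
};

/* Roughly what INTG(X86_TRAP_PF, page_fault) boils down to: */
static const struct ex_idt_data ex_pf_gate = {
	.vector		= 14,		/* X86_TRAP_PF */
	.segment	= 0x10,		/* __KERNEL_CS -- assumed value, illustration only */
	.bits.type	= 0xE,		/* GATE_INTERRUPT */
	.bits.dpl	= 0,		/* DPL0 */
	.bits.p		= 1,
	.addr		= (const void *)0,	/* the real entry points at page_fault */
};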
*/ +gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss; + +struct desc_ptr idt_descr __ro_after_init = { + .size = (IDT_ENTRIES * 2 * sizeof(unsigned long)) - 1, + .address = (unsigned long) idt_table, +}; + +#ifdef CONFIG_X86_64 +/* No need to be aligned, but done to keep all IDTs defined the same way. */ +gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; + +/* + * The exceptions which use Interrupt stacks. They are setup after + * cpu_init() when the TSS has been initialized. + */ +static const __initdata struct idt_data ist_idts[] = { + ISTG(X86_TRAP_DB, debug, DEBUG_STACK), + ISTG(X86_TRAP_NMI, nmi, NMI_STACK), + SISTG(X86_TRAP_BP, int3, DEBUG_STACK), + ISTG(X86_TRAP_DF, double_fault, DOUBLEFAULT_STACK), +#ifdef CONFIG_X86_MCE + ISTG(X86_TRAP_MC, &machine_check, MCE_STACK), +#endif +}; + +/* + * Override for the debug_idt. Same as the default, but with interrupt + * stack set to DEFAULT_STACK (0). Required for NMI trap handling. + */ +const struct desc_ptr debug_idt_descr = { + .size = IDT_ENTRIES * 16 - 1, + .address = (unsigned long) debug_idt_table, +}; +#endif + +static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d) +{ + unsigned long addr = (unsigned long) d->addr; + + gate->offset_low = (u16) addr; + gate->segment = (u16) d->segment; + gate->bits = d->bits; + gate->offset_middle = (u16) (addr >> 16); +#ifdef CONFIG_X86_64 + gate->offset_high = (u32) (addr >> 32); + gate->reserved = 0; +#endif +} + +static void +idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sys) +{ + gate_desc desc; + + for (; size > 0; t++, size--) { + idt_init_desc(&desc, t); + write_idt_entry(idt, t->vector, &desc); + if (sys) + set_bit(t->vector, used_vectors); + } +} + +static void set_intr_gate(unsigned int n, const void *addr) +{ + struct idt_data data; + + BUG_ON(n > 0xFF); + + memset(&data, 0, sizeof(data)); + data.vector = n; + data.addr = addr; + data.segment = __KERNEL_CS; + data.bits.type = GATE_INTERRUPT; + data.bits.p = 1; + + idt_setup_from_table(idt_table, &data, 1, false); +} + +/** + * idt_setup_early_traps - Initialize the idt table with early traps + * + * On X8664 these traps do not use interrupt stacks as they can't work + * before cpu_init() is invoked and sets up TSS. The IST variants are + * installed after that. + */ +void __init idt_setup_early_traps(void) +{ + idt_setup_from_table(idt_table, early_idts, ARRAY_SIZE(early_idts), + true); + load_idt(&idt_descr); +} + +/** + * idt_setup_traps - Initialize the idt table with default traps + */ +void __init idt_setup_traps(void) +{ + idt_setup_from_table(idt_table, def_idts, ARRAY_SIZE(def_idts), true); +} + +#ifdef CONFIG_X86_64 +/** + * idt_setup_early_pf - Initialize the idt table with early pagefault handler + * + * On X8664 this does not use interrupt stacks as they can't work before + * cpu_init() is invoked and sets up TSS. The IST variant is installed + * after that. + * + * FIXME: Why is 32bit and 64bit installing the PF handler at different + * places in the early setup code? 
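idt_init_desc() above is the gate-packing step: the handler address is split into 16/16/32-bit slices across the descriptor, with the segment selector and type bits in between. The same packing as stand-alone C (sketch types, 64-bit layout):

#include <stdint.h>

struct ex_gate_desc {
	uint16_t offset_low;
	uint16_t segment;
	uint16_t bits;		/* ist/type/dpl/p, packed */
	uint16_t offset_middle;
	uint32_t offset_high;
	uint32_t reserved;
};

static void ex_pack_gate(struct ex_gate_desc *g, uint64_t handler,
			 uint16_t segment, uint16_t bits)
{
	g->offset_low    = (uint16_t)handler;
	g->segment       = segment;
	g->bits          = bits;
	g->offset_middle = (uint16_t)(handler >> 16);
	g->offset_high   = (uint32_t)(handler >> 32);
	g->reserved      = 0;
}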
+ */ +void __init idt_setup_early_pf(void) +{ + idt_setup_from_table(idt_table, early_pf_idts, + ARRAY_SIZE(early_pf_idts), true); +} + +/** + * idt_setup_ist_traps - Initialize the idt table with traps using IST + */ +void __init idt_setup_ist_traps(void) +{ + idt_setup_from_table(idt_table, ist_idts, ARRAY_SIZE(ist_idts), true); +} + +/** + * idt_setup_debugidt_traps - Initialize the debug idt table with debug traps + */ +void __init idt_setup_debugidt_traps(void) +{ + memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16); + + idt_setup_from_table(debug_idt_table, dbg_idts, ARRAY_SIZE(dbg_idts), false); +} +#endif + +/** + * idt_setup_apic_and_irq_gates - Setup APIC/SMP and normal interrupt gates + */ +void __init idt_setup_apic_and_irq_gates(void) +{ + int i = FIRST_EXTERNAL_VECTOR; + void *entry; + + idt_setup_from_table(idt_table, apic_idts, ARRAY_SIZE(apic_idts), true); + + for_each_clear_bit_from(i, used_vectors, FIRST_SYSTEM_VECTOR) { + entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR); + set_intr_gate(i, entry); + } + + for_each_clear_bit_from(i, used_vectors, NR_VECTORS) { +#ifdef CONFIG_X86_LOCAL_APIC + set_bit(i, used_vectors); + set_intr_gate(i, spurious_interrupt); +#else + entry = irq_entries_start + 8 * (i - FIRST_EXTERNAL_VECTOR); + set_intr_gate(i, entry); +#endif + } +} + +/** + * idt_setup_early_handler - Initializes the idt table with early handlers + */ +void __init idt_setup_early_handler(void) +{ + int i; + + for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) + set_intr_gate(i, early_idt_handler_array[i]); +#ifdef CONFIG_X86_32 + for ( ; i < NR_VECTORS; i++) + set_intr_gate(i, early_ignore_irq); +#endif + load_idt(&idt_descr); +} + +/** + * idt_invalidate - Invalidate interrupt descriptor table + * @addr: The virtual address of the 'invalid' IDT + */ +void idt_invalidate(void *addr) +{ + struct desc_ptr idt = { .address = (unsigned long) addr, .size = 0 }; + + load_idt(&idt); +} + +void __init update_intr_gate(unsigned int n, const void *addr) +{ + if (WARN_ON_ONCE(!test_bit(n, used_vectors))) + return; + set_intr_gate(n, addr); +} + +void alloc_intr_gate(unsigned int n, const void *addr) +{ + BUG_ON(n < FIRST_SYSTEM_VECTOR); + if (!test_and_set_bit(n, used_vectors)) + set_intr_gate(n, addr); +} diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c index 50c89e8a95f2..805b7a341aca 100644 --- a/arch/x86/kernel/io_delay.c +++ b/arch/x86/kernel/io_delay.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * I/O delay strategies for inb_p/outb_p * @@ -58,7 +59,7 @@ static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id) * Quirk table for systems that misbehave (lock up, etc.) if port * 0x80 is used: */ -static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = { +static const struct dmi_system_id io_delay_0xed_port_dmi_table[] __initconst = { { .callback = dmi_io_delay_0xed_port, .ident = "Compaq Presario V6000", diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 9c3cf0944bce..3feb648781c4 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * This contains the io-permission bitmap code - written by obz, with changes * by Linus. 32/64 bits code unification by Miguel Botón. 
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index f34fe7444836..52089c043160 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -29,9 +29,6 @@ EXPORT_PER_CPU_SYMBOL(irq_regs); atomic_t irq_err_count; -/* Function pointer for generic interrupt vector handling */ -void (*x86_platform_ipi_callback)(void) = NULL; - /* * 'what should we do if we get a hw irq event on an illegal vector'. * each architecture has to answer this themselves. @@ -87,13 +84,13 @@ int arch_show_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count); seq_puts(p, " APIC ICR read retries\n"); -#endif if (x86_platform_ipi_callback) { seq_printf(p, "%*s: ", prec, "PLT"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis); seq_puts(p, " Platform interrupts\n"); } +#endif #ifdef CONFIG_SMP seq_printf(p, "%*s: ", prec, "RES"); for_each_online_cpu(j) @@ -155,6 +152,12 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_ipis); seq_puts(p, " Posted-interrupt notification event\n"); + seq_printf(p, "%*s: ", prec, "NPI"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->kvm_posted_intr_nested_ipis); + seq_puts(p, " Nested posted-interrupt event\n"); + seq_printf(p, "%*s: ", prec, "PIW"); for_each_online_cpu(j) seq_printf(p, "%10u ", @@ -177,9 +180,9 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->apic_perf_irqs; sum += irq_stats(cpu)->apic_irq_work_irqs; sum += irq_stats(cpu)->icr_read_retry_count; -#endif if (x86_platform_ipi_callback) sum += irq_stats(cpu)->x86_platform_ipis; +#endif #ifdef CONFIG_SMP sum += irq_stats(cpu)->irq_resched_count; sum += irq_stats(cpu)->irq_call_count; @@ -253,26 +256,26 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) return 1; } +#ifdef CONFIG_X86_LOCAL_APIC +/* Function pointer for generic interrupt vector handling */ +void (*x86_platform_ipi_callback)(void) = NULL; /* * Handler for X86_PLATFORM_IPI_VECTOR. */ -void __smp_x86_platform_ipi(void) -{ - inc_irq_stat(x86_platform_ipis); - - if (x86_platform_ipi_callback) - x86_platform_ipi_callback(); -} - __visible void __irq_entry smp_x86_platform_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); entering_ack_irq(); - __smp_x86_platform_ipi(); + trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR); + inc_irq_stat(x86_platform_ipis); + if (x86_platform_ipi_callback) + x86_platform_ipi_callback(); + trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR); exiting_irq(); set_irq_regs(old_regs); } +#endif #ifdef CONFIG_HAVE_KVM static void dummy_handler(void) {} @@ -313,21 +316,21 @@ __visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs) exiting_irq(); set_irq_regs(old_regs); } -#endif -__visible void __irq_entry smp_trace_x86_platform_ipi(struct pt_regs *regs) +/* + * Handler for POSTED_INTERRUPT_NESTED_VECTOR. + */ +__visible void smp_kvm_posted_intr_nested_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); entering_ack_irq(); - trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR); - __smp_x86_platform_ipi(); - trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR); + inc_irq_stat(kvm_posted_intr_nested_ipis); exiting_irq(); set_irq_regs(old_regs); } +#endif -EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); #ifdef CONFIG_HOTPLUG_CPU @@ -412,7 +415,7 @@ int check_irq_vectors_for_cpu_disable(void) * this w/o holding vector_lock. 
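Further down, check_irq_vectors_for_cpu_disable() counts how many vectors would remain available: a vector qualifies only if it is not claimed as a system vector and has no interrupt installed. Reduced to plain C it is a walk over two parallel tables (stand-ins for test_bit() and the per-CPU vector_irq array):

/* Count vectors that are neither system vectors nor already assigned. */
static int ex_count_free_vectors(const unsigned char *is_system_vector,
				 void *const *assigned_irq,
				 int first, int last)
{
	int vector, free = 0;

	for (vector = first; vector < last; vector++)
		if (!is_system_vector[vector] && !assigned_irq[vector])
			free++;

	return free;
}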
*/ for (vector = FIRST_EXTERNAL_VECTOR; - vector < first_system_vector; vector++) { + vector < FIRST_SYSTEM_VECTOR; vector++) { if (!test_bit(vector, used_vectors) && IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector])) { if (++count == this_count) @@ -432,84 +435,12 @@ int check_irq_vectors_for_cpu_disable(void) /* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ void fixup_irqs(void) { - unsigned int irq, vector; - static int warned; + unsigned int irr, vector; struct irq_desc *desc; struct irq_data *data; struct irq_chip *chip; - int ret; - for_each_irq_desc(irq, desc) { - int break_affinity = 0; - int set_affinity = 1; - const struct cpumask *affinity; - - if (!desc) - continue; - if (irq == 2) - continue; - - /* interrupt's are disabled at this point */ - raw_spin_lock(&desc->lock); - - data = irq_desc_get_irq_data(desc); - affinity = irq_data_get_affinity_mask(data); - if (!irq_has_action(irq) || irqd_is_per_cpu(data) || - cpumask_subset(affinity, cpu_online_mask)) { - raw_spin_unlock(&desc->lock); - continue; - } - - /* - * Complete the irq move. This cpu is going down and for - * non intr-remapping case, we can't wait till this interrupt - * arrives at this cpu before completing the irq move. - */ - irq_force_complete_move(desc); - - if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { - break_affinity = 1; - affinity = cpu_online_mask; - } - - chip = irq_data_get_irq_chip(data); - /* - * The interrupt descriptor might have been cleaned up - * already, but it is not yet removed from the radix tree - */ - if (!chip) { - raw_spin_unlock(&desc->lock); - continue; - } - - if (!irqd_can_move_in_process_context(data) && chip->irq_mask) - chip->irq_mask(data); - - if (chip->irq_set_affinity) { - ret = chip->irq_set_affinity(data, affinity, true); - if (ret == -ENOSPC) - pr_crit("IRQ %d set affinity failed because there are no available vectors. The device assigned to this IRQ is unstable.\n", irq); - } else { - if (!(warned++)) - set_affinity = 0; - } - - /* - * We unmask if the irq was not marked masked by the - * core code. That respects the lazy irq disable - * behaviour. - */ - if (!irqd_can_move_in_process_context(data) && - !irqd_irq_masked(data) && chip->irq_unmask) - chip->irq_unmask(data); - - raw_spin_unlock(&desc->lock); - - if (break_affinity && set_affinity) - pr_notice("Broke affinity for irq %i\n", irq); - else if (!set_affinity) - pr_notice("Cannot set affinity for irq %i\n", irq); - } + irq_migrate_all_off_this_cpu(); /* * We can remove mdelay() and then send spuriuous interrupts to @@ -528,8 +459,6 @@ void fixup_irqs(void) * nothing else will touch it. 
*/ for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { - unsigned int irr; - if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector]))) continue; diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 1f38d9a4d9de..a83b3346a0e1 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar * @@ -64,7 +65,7 @@ static void call_on_stack(void *func, void *stack) static inline void *current_stack(void) { - return (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1)); + return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1)); } static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) @@ -88,7 +89,7 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) /* Save the next esp at the bottom of the stack */ prev_esp = (u32 *)irqstk; - *prev_esp = current_stack_pointer(); + *prev_esp = current_stack_pointer; if (unlikely(overflow)) call_on_stack(print_stack_overflow, isp); @@ -139,7 +140,7 @@ void do_softirq_own_stack(void) /* Push the previous esp onto the stack */ prev_esp = (u32 *)irqstk; - *prev_esp = current_stack_pointer(); + *prev_esp = current_stack_pointer; call_on_stack(__do_softirq, isp); } diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 3be74fbdeff2..020efbf5786b 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar * diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index 275487872be2..80bee7695a20 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * x86 specific code for irq_work * @@ -11,35 +12,23 @@ #include <asm/trace/irq_vectors.h> #include <linux/interrupt.h> -static inline void __smp_irq_work_interrupt(void) -{ - inc_irq_stat(apic_irq_work_irqs); - irq_work_run(); -} - +#ifdef CONFIG_X86_LOCAL_APIC __visible void __irq_entry smp_irq_work_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); - __smp_irq_work_interrupt(); - exiting_irq(); -} - -__visible void __irq_entry smp_trace_irq_work_interrupt(struct pt_regs *regs) -{ - ipi_entering_ack_irq(); trace_irq_work_entry(IRQ_WORK_VECTOR); - __smp_irq_work_interrupt(); + inc_irq_stat(apic_irq_work_irqs); + irq_work_run(); trace_irq_work_exit(IRQ_WORK_VECTOR); exiting_irq(); } void arch_irq_work_raise(void) { -#ifdef CONFIG_X86_LOCAL_APIC if (!arch_irq_work_has_interrupt()) return; apic->send_IPI_self(IRQ_WORK_VECTOR); apic_wait_icr_idle(); -#endif } +#endif diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 7468c6987547..1e4094eba15e 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/linkage.h> #include <linux/errno.h> #include <linux/signal.h> @@ -55,18 +56,6 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = { [0 ... 
NR_VECTORS - 1] = VECTOR_UNUSED, }; -int vector_used_by_percpu_irq(unsigned int vector) -{ - int cpu; - - for_each_online_cpu(cpu) { - if (!IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector])) - return 1; - } - - return 0; -} - void __init init_ISA_irqs(void) { struct irq_chip *chip = legacy_pic->chip; @@ -99,98 +88,12 @@ void __init init_IRQ(void) x86_init.irqs.intr_init(); } -static void __init smp_intr_init(void) -{ -#ifdef CONFIG_SMP - /* - * The reschedule interrupt is a CPU-to-CPU reschedule-helper - * IPI, driven by wakeup. - */ - alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); - - /* IPI for generic function call */ - alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - - /* IPI for generic single function call */ - alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, - call_function_single_interrupt); - - /* Low priority IPI to cleanup after moving an irq */ - set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); - set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); - - /* IPI used for rebooting/stopping */ - alloc_intr_gate(REBOOT_VECTOR, reboot_interrupt); -#endif /* CONFIG_SMP */ -} - -static void __init apic_intr_init(void) -{ - smp_intr_init(); - -#ifdef CONFIG_X86_THERMAL_VECTOR - alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); -#endif -#ifdef CONFIG_X86_MCE_THRESHOLD - alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); -#endif - -#ifdef CONFIG_X86_MCE_AMD - alloc_intr_gate(DEFERRED_ERROR_VECTOR, deferred_error_interrupt); -#endif - -#ifdef CONFIG_X86_LOCAL_APIC - /* self generated IPI for local APIC timer */ - alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - - /* IPI for X86 platform specific use */ - alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi); -#ifdef CONFIG_HAVE_KVM - /* IPI for KVM to deliver posted interrupt */ - alloc_intr_gate(POSTED_INTR_VECTOR, kvm_posted_intr_ipi); - /* IPI for KVM to deliver interrupt to wake up tasks */ - alloc_intr_gate(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi); -#endif - - /* IPI vectors for APIC spurious and error interrupts */ - alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); - alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); - - /* IRQ work interrupts: */ -# ifdef CONFIG_IRQ_WORK - alloc_intr_gate(IRQ_WORK_VECTOR, irq_work_interrupt); -# endif - -#endif -} - void __init native_init_IRQ(void) { - int i; - /* Execute any quirks before the call gates are initialised: */ x86_init.irqs.pre_vector_init(); - apic_intr_init(); - - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - i = FIRST_EXTERNAL_VECTOR; -#ifndef CONFIG_X86_LOCAL_APIC -#define first_system_vector NR_VECTORS -#endif - for_each_clear_bit_from(i, used_vectors, first_system_vector) { - /* IA32_SYSCALL_VECTOR could be used in trap_init already. 
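Both the open-coded loop removed from native_init_IRQ() below and its replacement in idt_setup_apic_and_irq_gates() rely on the same layout rule: the external-interrupt entry stubs are 8 bytes each, packed back to back starting at irq_entries_start. As arithmetic (sketch):

/* Address of the entry stub for a given external vector (8-byte stubs). */
static const void *ex_vector_stub(const char *irq_entries_start,
				  unsigned int vector,
				  unsigned int first_external_vector)
{
	return irq_entries_start + 8 * (vector - first_external_vector);
}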
*/ - set_intr_gate(i, irq_entries_start + - 8 * (i - FIRST_EXTERNAL_VECTOR)); - } -#ifdef CONFIG_X86_LOCAL_APIC - for_each_clear_bit_from(i, used_vectors, NR_VECTORS) - set_intr_gate(i, spurious_interrupt); -#endif + idt_setup_apic_and_irq_gates(); if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) setup_irq(2, &irq2); diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index c37bd0f39c70..e56c95be2808 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * jump label x86 support * @@ -105,11 +106,9 @@ static void __jump_label_transform(struct jump_entry *entry, void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type) { - get_online_cpus(); mutex_lock(&text_mutex); __jump_label_transform(entry, type, NULL, 0); mutex_unlock(&text_mutex); - put_online_cpus(); } static enum { diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index 38b64587b31b..fd6f8fbbe6f2 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -33,7 +33,6 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf, struct setup_data_node *node = file->private_data; unsigned long remain; loff_t pos = *ppos; - struct page *pg; void *p; u64 pa; @@ -47,18 +46,13 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf, count = node->len - pos; pa = node->paddr + sizeof(struct setup_data) + pos; - pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT); - if (PageHighMem(pg)) { - p = ioremap_cache(pa, count); - if (!p) - return -ENXIO; - } else - p = __va(pa); + p = memremap(pa, count, MEMREMAP_WB); + if (!p) + return -ENOMEM; remain = copy_to_user(user_buf, p, count); - if (PageHighMem(pg)) - iounmap(p); + memunmap(p); if (remain) return -EFAULT; @@ -109,7 +103,6 @@ static int __init create_setup_data_nodes(struct dentry *parent) struct setup_data *data; int error; struct dentry *d; - struct page *pg; u64 pa_data; int no = 0; @@ -126,16 +119,12 @@ static int __init create_setup_data_nodes(struct dentry *parent) goto err_dir; } - pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT); - if (PageHighMem(pg)) { - data = ioremap_cache(pa_data, sizeof(*data)); - if (!data) { - kfree(node); - error = -ENXIO; - goto err_dir; - } - } else - data = __va(pa_data); + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); + if (!data) { + kfree(node); + error = -ENOMEM; + goto err_dir; + } node->paddr = pa_data; node->type = data->type; @@ -143,8 +132,7 @@ static int __init create_setup_data_nodes(struct dentry *parent) error = create_setup_data_node(d, no, node); pa_data = data->next; - if (PageHighMem(pg)) - iounmap(data); + memunmap(data); if (error) goto err_dir; no++; diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 9d7fd5e6689a..fb095ba0c02f 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -100,14 +100,14 @@ static int setup_e820_entries(struct boot_params *params) { unsigned int nr_e820_entries; - nr_e820_entries = e820_table_firmware->nr_entries; + nr_e820_entries = e820_table_kexec->nr_entries; /* TODO: Pass entries more than E820_MAX_ENTRIES_ZEROPAGE in bootparams setup data */ if (nr_e820_entries > E820_MAX_ENTRIES_ZEROPAGE) nr_e820_entries = E820_MAX_ENTRIES_ZEROPAGE; params->e820_entries = nr_e820_entries; - memcpy(¶ms->e820_table, &e820_table_firmware->entries, nr_e820_entries*sizeof(struct e820_entry)); + memcpy(¶ms->e820_table, 
&e820_table_kexec->entries, nr_e820_entries*sizeof(struct e820_entry)); return 0; } diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h index db2182d63ed0..615105cf7d58 100644 --- a/arch/x86/kernel/kprobes/common.h +++ b/arch/x86/kernel/kprobes/common.h @@ -1,8 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __X86_KERNEL_KPROBES_COMMON_H #define __X86_KERNEL_KPROBES_COMMON_H /* Kprobes and Optprobes common header */ +#include <asm/asm.h> + +#ifdef CONFIG_FRAME_POINTER +# define SAVE_RBP_STRING " push %" _ASM_BP "\n" \ + " mov %" _ASM_SP ", %" _ASM_BP "\n" +#else +# define SAVE_RBP_STRING " push %" _ASM_BP "\n" +#endif + #ifdef CONFIG_X86_64 #define SAVE_REGS_STRING \ /* Skip cs, ip, orig_ax. */ \ @@ -17,7 +27,7 @@ " pushq %r10\n" \ " pushq %r11\n" \ " pushq %rbx\n" \ - " pushq %rbp\n" \ + SAVE_RBP_STRING \ " pushq %r12\n" \ " pushq %r13\n" \ " pushq %r14\n" \ @@ -48,7 +58,7 @@ " pushl %es\n" \ " pushl %ds\n" \ " pushl %eax\n" \ - " pushl %ebp\n" \ + SAVE_RBP_STRING \ " pushl %edi\n" \ " pushl %esi\n" \ " pushl %edx\n" \ diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 6b877807598b..0742491cbb73 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -457,6 +457,8 @@ static int arch_copy_kprobe(struct kprobe *p) int arch_prepare_kprobe(struct kprobe *p) { + int ret; + if (alternatives_text_reserved(p->addr, p->addr)) return -EINVAL; @@ -467,7 +469,13 @@ int arch_prepare_kprobe(struct kprobe *p) if (!p->ainsn.insn) return -ENOMEM; - return arch_copy_kprobe(p); + ret = arch_copy_kprobe(p); + if (ret) { + free_insn_slot(p->ainsn.insn, 0); + p->ainsn.insn = NULL; + } + + return ret; } void arch_arm_kprobe(struct kprobe *p) @@ -1072,8 +1080,6 @@ int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) * raw stack chunk with redzones: */ __memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, MIN_STACK_SIZE(addr)); - regs->flags &= ~X86_EFLAGS_IF; - trace_hardirqs_off(); regs->ip = (unsigned long)(jp->entry); /* diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 901c640d152f..4f98aad38237 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -28,6 +28,7 @@ #include <linux/kdebug.h> #include <linux/kallsyms.h> #include <linux/ftrace.h> +#include <linux/frame.h> #include <asm/text-patching.h> #include <asm/cacheflush.h> @@ -38,6 +39,7 @@ #include <asm/insn.h> #include <asm/debugreg.h> #include <asm/set_memory.h> +#include <asm/sections.h> #include "common.h" @@ -94,6 +96,7 @@ static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val) } asm ( + "optprobe_template_func:\n" ".global optprobe_template_entry\n" "optprobe_template_entry:\n" #ifdef CONFIG_X86_64 @@ -131,7 +134,12 @@ asm ( " popf\n" #endif ".global optprobe_template_end\n" - "optprobe_template_end:\n"); + "optprobe_template_end:\n" + ".type optprobe_template_func, @function\n" + ".size optprobe_template_func, .-optprobe_template_func\n"); + +void optprobe_template_func(void); +STACK_FRAME_NON_STANDARD(optprobe_template_func); #define TMPL_MOVE_IDX \ ((long)&optprobe_template_val - (long)&optprobe_template_entry) @@ -244,10 +252,12 @@ static int can_optimize(unsigned long paddr) /* * Do not optimize in the entry code due to the unstable - * stack handling. + * stack handling and registers setup. 
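can_optimize() now rejects probe addresses in either the entry or the irqentry text ranges. The test itself is two half-open interval checks (sketch with plain arguments in place of the linker symbols):

static int ex_in_range(unsigned long addr, unsigned long start, unsigned long end)
{
	return addr >= start && addr < end;
}

/* Returns 0 when the probe address must not be optimized. */
static int ex_may_optimize(unsigned long paddr,
			   unsigned long entry_start, unsigned long entry_end,
			   unsigned long irqentry_start, unsigned long irqentry_end)
{
	if (ex_in_range(paddr, entry_start, entry_end) ||
	    ex_in_range(paddr, irqentry_start, irqentry_end))
		return 0;
	return 1;
}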
*/ - if ((paddr >= (unsigned long)__entry_text_start) && - (paddr < (unsigned long)__entry_text_end)) + if (((paddr >= (unsigned long)__entry_text_start) && + (paddr < (unsigned long)__entry_text_end)) || + ((paddr >= (unsigned long)__irqentry_text_start) && + (paddr < (unsigned long)__irqentry_text_end))) return 0; /* Check there is enough space for a relative jump. */ diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c index 4afc67f5facc..8c1cc08f514f 100644 --- a/arch/x86/kernel/ksysfs.c +++ b/arch/x86/kernel/ksysfs.c @@ -16,8 +16,8 @@ #include <linux/stat.h> #include <linux/slab.h> #include <linux/mm.h> +#include <linux/io.h> -#include <asm/io.h> #include <asm/setup.h> static ssize_t version_show(struct kobject *kobj, @@ -55,7 +55,7 @@ static struct bin_attribute *boot_params_data_attrs[] = { NULL, }; -static struct attribute_group boot_params_attr_group = { +static const struct attribute_group boot_params_attr_group = { .attrs = boot_params_version_attrs, .bin_attrs = boot_params_data_attrs, }; @@ -79,12 +79,12 @@ static int get_setup_data_paddr(int nr, u64 *paddr) *paddr = pa_data; return 0; } - data = ioremap_cache(pa_data, sizeof(*data)); + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); if (!data) return -ENOMEM; pa_data = data->next; - iounmap(data); + memunmap(data); i++; } return -EINVAL; @@ -97,17 +97,17 @@ static int __init get_setup_data_size(int nr, size_t *size) u64 pa_data = boot_params.hdr.setup_data; while (pa_data) { - data = ioremap_cache(pa_data, sizeof(*data)); + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); if (!data) return -ENOMEM; if (nr == i) { *size = data->len; - iounmap(data); + memunmap(data); return 0; } pa_data = data->next; - iounmap(data); + memunmap(data); i++; } return -EINVAL; @@ -127,12 +127,12 @@ static ssize_t type_show(struct kobject *kobj, ret = get_setup_data_paddr(nr, &paddr); if (ret) return ret; - data = ioremap_cache(paddr, sizeof(*data)); + data = memremap(paddr, sizeof(*data), MEMREMAP_WB); if (!data) return -ENOMEM; ret = sprintf(buf, "0x%x\n", data->type); - iounmap(data); + memunmap(data); return ret; } @@ -154,7 +154,7 @@ static ssize_t setup_data_data_read(struct file *fp, ret = get_setup_data_paddr(nr, &paddr); if (ret) return ret; - data = ioremap_cache(paddr, sizeof(*data)); + data = memremap(paddr, sizeof(*data), MEMREMAP_WB); if (!data) return -ENOMEM; @@ -170,15 +170,15 @@ static ssize_t setup_data_data_read(struct file *fp, goto out; ret = count; - p = ioremap_cache(paddr + sizeof(*data), data->len); + p = memremap(paddr + sizeof(*data), data->len, MEMREMAP_WB); if (!p) { ret = -ENOMEM; goto out; } memcpy(buf, p + off, count); - iounmap(p); + memunmap(p); out: - iounmap(data); + memunmap(data); return ret; } @@ -202,7 +202,7 @@ static struct bin_attribute *setup_data_data_attrs[] = { NULL, }; -static struct attribute_group setup_data_attr_group = { +static const struct attribute_group setup_data_attr_group = { .attrs = setup_data_type_attrs, .bin_attrs = setup_data_data_attrs, }; @@ -250,13 +250,13 @@ static int __init get_setup_data_total_num(u64 pa_data, int *nr) *nr = 0; while (pa_data) { *nr += 1; - data = ioremap_cache(pa_data, sizeof(*data)); + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); if (!data) { ret = -ENOMEM; goto out; } pa_data = data->next; - iounmap(data); + memunmap(data); } out: @@ -299,7 +299,7 @@ static int __init create_setup_data_nodes(struct kobject *parent) return 0; out_clean_nodes: - for (j = i - 1; j > 0; j--) + for (j = i - 1; j >= 0; j--) 
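
The kdebugfs.c and ksysfs.c hunks above replace the ioremap_cache()/PageHighMem() special-casing with memremap(..., MEMREMAP_WB), which handles both lowmem and highmem backing the same way. The following kernel-context sketch shows the resulting access pattern; setup_data_type_of() is a made-up helper, the error handling is illustrative, and the include list is approximate.

#include <linux/io.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <asm/setup.h>

/* Walk the setup_data list and return the type of the nr-th entry. */
static int setup_data_type_of(u64 pa_data, int nr, u32 *type)
{
        struct setup_data *data;
        int i = 0;

        while (pa_data) {
                data = memremap(pa_data, sizeof(*data), MEMREMAP_WB);
                if (!data)
                        return -ENOMEM;

                if (i == nr) {
                        *type = data->type;
                        memunmap(data);
                        return 0;
                }

                pa_data = data->next;
                memunmap(data);
                i++;
        }
        return -EINVAL;
}
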
cleanup_setup_data_node(*(kobjp + j)); kfree(kobjp); out_setup_data_kobj: diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 43e10d6fdbed..8bb9594d0761 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -117,7 +117,11 @@ static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b, return NULL; } -void kvm_async_pf_task_wait(u32 token) +/* + * @interrupt_kernel: Is this called from a routine which interrupts the kernel + * (other than user space)? + */ +void kvm_async_pf_task_wait(u32 token, int interrupt_kernel) { u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; @@ -140,7 +144,10 @@ void kvm_async_pf_task_wait(u32 token) n.token = token; n.cpu = smp_processor_id(); - n.halted = is_idle_task(current) || preempt_count() > 1; + n.halted = is_idle_task(current) || + (IS_ENABLED(CONFIG_PREEMPT_COUNT) + ? preempt_count() > 1 || rcu_preempt_depth() + : interrupt_kernel); init_swait_queue_head(&n.wq); hlist_add_head(&n.link, &b->list); raw_spin_unlock(&b->lock); @@ -151,6 +158,8 @@ void kvm_async_pf_task_wait(u32 token) if (hlist_unhashed(&n.link)) break; + rcu_irq_exit(); + if (!n.halted) { local_irq_enable(); schedule(); @@ -159,11 +168,11 @@ void kvm_async_pf_task_wait(u32 token) /* * We cannot reschedule. So halt. */ - rcu_irq_exit(); native_safe_halt(); local_irq_disable(); - rcu_irq_enter(); } + + rcu_irq_enter(); } if (!n.halted) finish_swait(&n.wq, &wait); @@ -178,7 +187,7 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n) hlist_del_init(&n->link); if (n->halted) smp_send_reschedule(n->cpu); - else if (swait_active(&n->wq)) + else if (swq_has_sleeper(&n->wq)) swake_up(&n->wq); } @@ -261,12 +270,12 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code) switch (kvm_read_and_reset_pf_reason()) { default: - trace_do_page_fault(regs, error_code); + do_page_fault(regs, error_code); break; case KVM_PV_REASON_PAGE_NOT_PRESENT: /* page is swapped out by the host. 
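
The kvm_async_pf_task_wait() hunk above changes how the guest decides whether it may schedule() while waiting for the host to swap a page back in, or must spin in halt. The standalone function below restates that condition in isolation so the branches are easier to follow; all parameters are illustrative stand-ins for the kernel predicates (is_idle_task(), preempt_count(), rcu_preempt_depth()) used in the real code.

#include <stdbool.h>

/*
 * Halt (poll for the page in a hlt loop) instead of scheduling whenever
 * sleeping would be unsafe.
 */
bool must_halt(bool is_idle, bool have_preempt_count,
               int preempt_cnt, int rcu_depth, bool interrupted_kernel)
{
        if (is_idle)
                return true;    /* the idle task can never schedule() */

        if (have_preempt_count)
                /* nested context or RCU read-side section: cannot sleep */
                return preempt_cnt > 1 || rcu_depth > 0;

        /*
         * Without CONFIG_PREEMPT_COUNT there is no reliable nesting
         * information, so be conservative whenever the fault interrupted
         * kernel code rather than user space.
         */
        return interrupted_kernel;
}
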
*/ prev_state = exception_enter(); - kvm_async_pf_task_wait((u32)read_cr2()); + kvm_async_pf_task_wait((u32)read_cr2(), !user_mode(regs)); exception_exit(prev_state); break; case KVM_PV_REASON_PAGE_READY: @@ -330,7 +339,12 @@ static void kvm_guest_cpu_init(void) #ifdef CONFIG_PREEMPT pa |= KVM_ASYNC_PF_SEND_ALWAYS; #endif - wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED); + pa |= KVM_ASYNC_PF_ENABLED; + + /* Async page fault support for L1 hypervisor is optional */ + if (wrmsr_safe(MSR_KVM_ASYNC_PF_EN, + (pa | KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT) & 0xffffffff, pa >> 32) < 0) + wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); __this_cpu_write(apf_reason.enabled, 1); printk(KERN_INFO"KVM setup async PF for cpu %d\n", smp_processor_id()); @@ -448,7 +462,7 @@ static int kvm_cpu_down_prepare(unsigned int cpu) static void __init kvm_apf_trap_init(void) { - set_intr_gate(14, async_page_fault); + update_intr_gate(X86_TRAP_PF, async_page_fault); } void __init kvm_guest_init(void) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index d88967659098..5b609e28ce3f 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -79,7 +79,7 @@ static void kvm_get_wallclock(struct timespec *now) static int kvm_set_wallclock(const struct timespec *now) { - return -1; + return -ENODEV; } static u64 kvm_clock_read(void) diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index d4a15831ac58..4d17bacf4030 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> @@ -21,25 +22,47 @@ #include <asm/mmu_context.h> #include <asm/syscalls.h> +static void refresh_ldt_segments(void) +{ +#ifdef CONFIG_X86_64 + unsigned short sel; + + /* + * Make sure that the cached DS and ES descriptors match the updated + * LDT. + */ + savesegment(ds, sel); + if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) + loadsegment(ds, sel); + + savesegment(es, sel); + if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) + loadsegment(es, sel); +#endif +} + /* context.lock is held for us, so we don't need any locking. */ -static void flush_ldt(void *current_mm) +static void flush_ldt(void *__mm) { + struct mm_struct *mm = __mm; mm_context_t *pc; - if (current->active_mm != current_mm) + if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm) return; - pc = ¤t->active_mm->context; - set_ldt(pc->ldt->entries, pc->ldt->size); + pc = &mm->context; + set_ldt(pc->ldt->entries, pc->ldt->nr_entries); + + refresh_ldt_segments(); } /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */ -static struct ldt_struct *alloc_ldt_struct(unsigned int size) +static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) { struct ldt_struct *new_ldt; unsigned int alloc_size; - if (size > LDT_ENTRIES) + if (num_entries > LDT_ENTRIES) return NULL; new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL); @@ -47,7 +70,7 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int size) return NULL; BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct)); - alloc_size = size * LDT_ENTRY_SIZE; + alloc_size = num_entries * LDT_ENTRY_SIZE; /* * Xen is very picky: it requires a page-aligned LDT that has no @@ -65,14 +88,14 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int size) return NULL; } - new_ldt->size = size; + new_ldt->nr_entries = num_entries; return new_ldt; } /* After calling this, the LDT is immutable. 
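
The kvm_guest_cpu_init() hunk above probes an optional hypervisor feature by first attempting the MSR write with the extra bit set and falling back if the write faults. A condensed sketch of that probe-and-fall-back pattern follows; enable_async_pf() is a made-up wrapper, pa is assumed to already carry KVM_ASYNC_PF_ENABLED plus the per-CPU buffer address, and the headers are approximate.

#include <linux/types.h>
#include <asm/msr.h>
#include <asm/kvm_para.h>

static void enable_async_pf(u64 pa)
{
        u64 want = pa | KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;

        /*
         * Try the optional delivery-as-#PF-vmexit mode first; the write
         * faults if the hypervisor does not know the bit, so retry
         * without it.
         */
        if (wrmsr_safe(MSR_KVM_ASYNC_PF_EN, (u32)want, (u32)(want >> 32)) < 0)
                wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
}
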
*/ static void finalize_ldt_struct(struct ldt_struct *ldt) { - paravirt_alloc_ldt(ldt->entries, ldt->size); + paravirt_alloc_ldt(ldt->entries, ldt->nr_entries); } /* context.lock is held */ @@ -91,8 +114,8 @@ static void free_ldt_struct(struct ldt_struct *ldt) if (likely(!ldt)) return; - paravirt_free_ldt(ldt->entries, ldt->size); - if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) + paravirt_free_ldt(ldt->entries, ldt->nr_entries); + if (ldt->nr_entries * LDT_ENTRY_SIZE > PAGE_SIZE) vfree_atomic(ldt->entries); else free_page((unsigned long)ldt->entries); @@ -122,14 +145,14 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm) goto out_unlock; } - new_ldt = alloc_ldt_struct(old_mm->context.ldt->size); + new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries); if (!new_ldt) { retval = -ENOMEM; goto out_unlock; } memcpy(new_ldt->entries, old_mm->context.ldt->entries, - new_ldt->size * LDT_ENTRY_SIZE); + new_ldt->nr_entries * LDT_ENTRY_SIZE); finalize_ldt_struct(new_ldt); mm->context.ldt = new_ldt; @@ -152,9 +175,9 @@ void destroy_context_ldt(struct mm_struct *mm) static int read_ldt(void __user *ptr, unsigned long bytecount) { - int retval; - unsigned long size; struct mm_struct *mm = current->mm; + unsigned long entries_size; + int retval; mutex_lock(&mm->context.lock); @@ -166,18 +189,18 @@ static int read_ldt(void __user *ptr, unsigned long bytecount) if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES) bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES; - size = mm->context.ldt->size * LDT_ENTRY_SIZE; - if (size > bytecount) - size = bytecount; + entries_size = mm->context.ldt->nr_entries * LDT_ENTRY_SIZE; + if (entries_size > bytecount) + entries_size = bytecount; - if (copy_to_user(ptr, mm->context.ldt->entries, size)) { + if (copy_to_user(ptr, mm->context.ldt->entries, entries_size)) { retval = -EFAULT; goto out_unlock; } - if (size != bytecount) { + if (entries_size != bytecount) { /* Zero-fill the rest and pretend we read bytecount bytes. */ - if (clear_user(ptr + size, bytecount - size)) { + if (clear_user(ptr + entries_size, bytecount - entries_size)) { retval = -EFAULT; goto out_unlock; } @@ -208,7 +231,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) { struct mm_struct *mm = current->mm; struct ldt_struct *new_ldt, *old_ldt; - unsigned int oldsize, newsize; + unsigned int old_nr_entries, new_nr_entries; struct user_desc ldt_info; struct desc_struct ldt; int error; @@ -247,17 +270,18 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) mutex_lock(&mm->context.lock); - old_ldt = mm->context.ldt; - oldsize = old_ldt ? old_ldt->size : 0; - newsize = max(ldt_info.entry_number + 1, oldsize); + old_ldt = mm->context.ldt; + old_nr_entries = old_ldt ? 
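
The reworked read_ldt() above clamps the copy to nr_entries * LDT_ENTRY_SIZE and zero-fills the remainder so a short table still reads as a full-sized, zero-padded buffer. The small userspace sketch below shows that copy-out contract in isolation; plain memcpy()/memset() stand in for copy_to_user()/clear_user(), and copy_out_padded() is an invented name.

#include <stddef.h>
#include <string.h>

size_t copy_out_padded(void *dst, size_t bytecount,
                       const void *entries, size_t entries_size)
{
        size_t n = entries_size < bytecount ? entries_size : bytecount;

        memcpy(dst, entries, n);
        if (n < bytecount)
                memset((char *)dst + n, 0, bytecount - n);
        return bytecount;       /* caller is told it read bytecount bytes */
}
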
old_ldt->nr_entries : 0; + new_nr_entries = max(ldt_info.entry_number + 1, old_nr_entries); error = -ENOMEM; - new_ldt = alloc_ldt_struct(newsize); + new_ldt = alloc_ldt_struct(new_nr_entries); if (!new_ldt) goto out_unlock; if (old_ldt) - memcpy(new_ldt->entries, old_ldt->entries, oldsize * LDT_ENTRY_SIZE); + memcpy(new_ldt->entries, old_ldt->entries, old_nr_entries * LDT_ENTRY_SIZE); + new_ldt->entries[ldt_info.entry_number] = ldt; finalize_ldt_struct(new_ldt); diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 8c53c5d7a1bc..00bc751c861c 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -26,18 +26,6 @@ #include <asm/set_memory.h> #include <asm/debugreg.h> -static void set_idt(void *newidt, __u16 limit) -{ - struct desc_ptr curidt; - - /* ia32 supports unaliged loads & stores */ - curidt.size = limit; - curidt.address = (unsigned long)newidt; - - load_idt(&curidt); -} - - static void set_gdt(void *newgdt, __u16 limit) { struct desc_ptr curgdt; @@ -245,7 +233,7 @@ void machine_kexec(struct kimage *image) * If you want to load them you must set up your own idt & gdt. */ set_gdt(phys_to_virt(0), 0); - set_idt(phys_to_virt(0), 0); + idt_invalidate(phys_to_virt(0)); /* now call it */ image->start = relocate_kernel_ptr((unsigned long)image->head, diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 6f5ca4ebe6e5..1f790cf9d38f 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -87,7 +87,7 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); } pte = pte_offset_kernel(pmd, vaddr); - set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); + set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC)); return 0; err: free_transition_pgtable(image); @@ -115,6 +115,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) .alloc_pgt_page = alloc_pgt_page, .context = image, .page_flag = __PAGE_KERNEL_LARGE_EXEC, + .kernpg_flag = _KERNPG_TABLE_NOENC, }; unsigned long mstart, mend; pgd_t *level4p; @@ -334,7 +335,8 @@ void machine_kexec(struct kimage *image) image->start = relocate_kernel((unsigned long)image->head, (unsigned long)page_list, image->start, - image->preserve_context); + image->preserve_context, + sme_active()); #ifdef CONFIG_KEXEC_JUMP if (image->preserve_context) @@ -347,7 +349,7 @@ void machine_kexec(struct kimage *image) void arch_crash_save_vmcoreinfo(void) { VMCOREINFO_NUMBER(phys_base); - VMCOREINFO_SYMBOL(init_level4_pgt); + VMCOREINFO_SYMBOL(init_top_pgt); #ifdef CONFIG_NUMA VMCOREINFO_SYMBOL(node_data); @@ -602,3 +604,22 @@ void arch_kexec_unprotect_crashkres(void) { kexec_mark_crashkres(false); } + +int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) +{ + /* + * If SME is active we need to be sure that kexec pages are + * not encrypted because when we boot to the new kernel the + * pages won't be accessed encrypted (initially). + */ + return set_memory_decrypted((unsigned long)vaddr, pages); +} + +void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) +{ + /* + * If SME is active we need to reset the pages back to being + * an encrypted mapping before freeing them. 
+ */ + set_memory_encrypted((unsigned long)vaddr, pages); +} diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index f4c886d9165c..b5cb49e57df8 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * AMD Family 10h mmconfig enablement */ diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index f67bd3205df7..da0c160e5589 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -35,6 +35,7 @@ #include <asm/page.h> #include <asm/pgtable.h> #include <asm/setup.h> +#include <asm/unwind.h> #if 0 #define DEBUGP(fmt, ...) \ @@ -171,19 +172,27 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, case R_X86_64_NONE: break; case R_X86_64_64: + if (*(u64 *)loc != 0) + goto invalid_relocation; *(u64 *)loc = val; break; case R_X86_64_32: + if (*(u32 *)loc != 0) + goto invalid_relocation; *(u32 *)loc = val; if (val != *(u32 *)loc) goto overflow; break; case R_X86_64_32S: + if (*(s32 *)loc != 0) + goto invalid_relocation; *(s32 *)loc = val; if ((s64)val != *(s32 *)loc) goto overflow; break; case R_X86_64_PC32: + if (*(u32 *)loc != 0) + goto invalid_relocation; val -= (u64)loc; *(u32 *)loc = val; #if 0 @@ -199,6 +208,11 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, } return 0; +invalid_relocation: + pr_err("x86/modules: Skipping invalid relocation target, existing value is nonzero for type %d, loc %p, val %Lx\n", + (int)ELF64_R_TYPE(rel[i].r_info), loc, val); + return -ENOEXEC; + overflow: pr_err("overflow in relocation type %d val %Lx\n", (int)ELF64_R_TYPE(rel[i].r_info), val); @@ -213,7 +227,7 @@ int module_finalize(const Elf_Ehdr *hdr, struct module *me) { const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, - *para = NULL; + *para = NULL, *orc = NULL, *orc_ip = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { @@ -225,6 +239,10 @@ int module_finalize(const Elf_Ehdr *hdr, locks = s; if (!strcmp(".parainstructions", secstrings + s->sh_name)) para = s; + if (!strcmp(".orc_unwind", secstrings + s->sh_name)) + orc = s; + if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name)) + orc_ip = s; } if (alt) { @@ -248,6 +266,10 @@ int module_finalize(const Elf_Ehdr *hdr, /* make jump label nops */ jump_label_apply_nops(me); + if (orc && orc_ip) + unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size, + (void *)orc->sh_addr, orc->sh_size); + return 0; } diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 0d904d759ff1..410c5dadcee3 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Intel Multiprocessor Specification 1.1 and 1.4 * compliant MP-table parsing routines. 
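
The new arch_kexec_post_alloc_pages()/arch_kexec_pre_free_pages() hooks above pair set_memory_decrypted() with set_memory_encrypted() around the lifetime of pages that will be touched without SME. The kernel-context sketch below shows that pairing in a generic allocation helper; alloc_shared_pages()/free_shared_pages() are made-up examples, not kernel APIs, and the include list is approximate.

#include <linux/mm.h>
#include <linux/gfp.h>
#include <asm/set_memory.h>

static void *alloc_shared_pages(unsigned int order)
{
        struct page *page = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);

        if (!page)
                return NULL;
        /* Map decrypted while the consumer (e.g. the next kernel) uses it. */
        if (set_memory_decrypted((unsigned long)page_address(page), 1 << order)) {
                __free_pages(page, order);
                return NULL;
        }
        return page_address(page);
}

static void free_shared_pages(void *vaddr, unsigned int order)
{
        /* Restore the encrypted mapping before the allocator reuses it. */
        set_memory_encrypted((unsigned long)vaddr, 1 << order);
        free_pages((unsigned long)vaddr, order);
}
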
@@ -429,16 +430,16 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) } } -static struct mpf_intel *mpf_found; +static unsigned long mpf_base; static unsigned long __init get_mpc_size(unsigned long physptr) { struct mpc_table *mpc; unsigned long size; - mpc = early_ioremap(physptr, PAGE_SIZE); + mpc = early_memremap(physptr, PAGE_SIZE); size = mpc->length; - early_iounmap(mpc, PAGE_SIZE); + early_memunmap(mpc, PAGE_SIZE); apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size); return size; @@ -450,7 +451,8 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) unsigned long size; size = get_mpc_size(mpf->physptr); - mpc = early_ioremap(mpf->physptr, size); + mpc = early_memremap(mpf->physptr, size); + /* * Read the physical hardware table. Anything here will * override the defaults. @@ -461,10 +463,10 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) #endif pr_err("BIOS bug, MP table errors detected!...\n"); pr_cont("... disabling SMP support. (tell your hw vendor)\n"); - early_iounmap(mpc, size); + early_memunmap(mpc, size); return -1; } - early_iounmap(mpc, size); + early_memunmap(mpc, size); if (early) return -1; @@ -497,12 +499,12 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) */ void __init default_get_smp_config(unsigned int early) { - struct mpf_intel *mpf = mpf_found; + struct mpf_intel *mpf; if (!smp_found_config) return; - if (!mpf) + if (!mpf_base) return; if (acpi_lapic && early) @@ -515,6 +517,12 @@ void __init default_get_smp_config(unsigned int early) if (acpi_lapic && acpi_ioapic) return; + mpf = early_memremap(mpf_base, sizeof(*mpf)); + if (!mpf) { + pr_err("MPTABLE: error mapping MP table\n"); + return; + } + pr_info("Intel MultiProcessor Specification v1.%d\n", mpf->specification); #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) @@ -529,7 +537,7 @@ void __init default_get_smp_config(unsigned int early) /* * Now see if we need to read further. */ - if (mpf->feature1 != 0) { + if (mpf->feature1) { if (early) { /* * local APIC has default address @@ -542,8 +550,10 @@ void __init default_get_smp_config(unsigned int early) construct_default_ISA_mptable(mpf->feature1); } else if (mpf->physptr) { - if (check_physptr(mpf, early)) + if (check_physptr(mpf, early)) { + early_memunmap(mpf, sizeof(*mpf)); return; + } } else BUG(); @@ -552,6 +562,8 @@ void __init default_get_smp_config(unsigned int early) /* * Only use the first configuration found. 
*/ + + early_memunmap(mpf, sizeof(*mpf)); } static void __init smp_reserve_memory(struct mpf_intel *mpf) @@ -561,15 +573,16 @@ static void __init smp_reserve_memory(struct mpf_intel *mpf) static int __init smp_scan_config(unsigned long base, unsigned long length) { - unsigned int *bp = phys_to_virt(base); + unsigned int *bp; struct mpf_intel *mpf; - unsigned long mem; + int ret = 0; apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n", base, base + length - 1); BUILD_BUG_ON(sizeof(*mpf) != 16); while (length > 0) { + bp = early_memremap(base, length); mpf = (struct mpf_intel *)bp; if ((*bp == SMP_MAGIC_IDENT) && (mpf->length == 1) && @@ -579,24 +592,26 @@ static int __init smp_scan_config(unsigned long base, unsigned long length) #ifdef CONFIG_X86_LOCAL_APIC smp_found_config = 1; #endif - mpf_found = mpf; + mpf_base = base; - pr_info("found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n", - (unsigned long long) virt_to_phys(mpf), - (unsigned long long) virt_to_phys(mpf) + - sizeof(*mpf) - 1, mpf); + pr_info("found SMP MP-table at [mem %#010lx-%#010lx] mapped at [%p]\n", + base, base + sizeof(*mpf) - 1, mpf); - mem = virt_to_phys(mpf); - memblock_reserve(mem, sizeof(*mpf)); + memblock_reserve(base, sizeof(*mpf)); if (mpf->physptr) smp_reserve_memory(mpf); - return 1; + ret = 1; } - bp += 4; + early_memunmap(bp, length); + + if (ret) + break; + + base += 16; length -= 16; } - return 0; + return ret; } void __init default_find_smp_config(void) @@ -838,29 +853,40 @@ static int __init update_mp_table(void) char oem[10]; struct mpf_intel *mpf; struct mpc_table *mpc, *mpc_new; + unsigned long size; if (!enable_update_mptable) return 0; - mpf = mpf_found; - if (!mpf) + if (!mpf_base) return 0; + mpf = early_memremap(mpf_base, sizeof(*mpf)); + if (!mpf) { + pr_err("MPTABLE: mpf early_memremap() failed\n"); + return 0; + } + /* * Now see if we need to go further. 
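
The reworked smp_scan_config() above no longer assumes the scanned range is covered by the direct map; it maps each candidate window with early_memremap(), checks it, and unmaps before advancing. A stripped-down sketch of that scan loop follows; find_sig16() is a made-up helper that only checks a 4-byte magic (the real code also validates length and checksum of the MP floating pointer), and the header choice is approximate.

#include <linux/init.h>
#include <asm/io.h>

static unsigned long __init find_sig16(unsigned long base,
                                       unsigned long length,
                                       unsigned int magic)
{
        while (length >= 16) {
                unsigned int *p = early_memremap(base, 16);
                int hit = (*p == magic);

                early_memunmap(p, 16);
                if (hit)
                        return base;    /* caller re-maps what it needs */
                base += 16;
                length -= 16;
        }
        return 0;
}
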
*/ - if (mpf->feature1 != 0) - return 0; + if (mpf->feature1) + goto do_unmap_mpf; if (!mpf->physptr) - return 0; + goto do_unmap_mpf; - mpc = phys_to_virt(mpf->physptr); + size = get_mpc_size(mpf->physptr); + mpc = early_memremap(mpf->physptr, size); + if (!mpc) { + pr_err("MPTABLE: mpc early_memremap() failed\n"); + goto do_unmap_mpf; + } if (!smp_check_mpc(mpc, oem, str)) - return 0; + goto do_unmap_mpc; - pr_info("mpf: %llx\n", (u64)virt_to_phys(mpf)); + pr_info("mpf: %llx\n", (u64)mpf_base); pr_info("physptr: %x\n", mpf->physptr); if (mpc_new_phys && mpc->length > mpc_new_length) { @@ -878,21 +904,32 @@ static int __init update_mp_table(void) new = mpf_checksum((unsigned char *)mpc, mpc->length); if (old == new) { pr_info("mpc is readonly, please try alloc_mptable instead\n"); - return 0; + goto do_unmap_mpc; } pr_info("use in-position replacing\n"); } else { + mpc_new = early_memremap(mpc_new_phys, mpc_new_length); + if (!mpc_new) { + pr_err("MPTABLE: new mpc early_memremap() failed\n"); + goto do_unmap_mpc; + } mpf->physptr = mpc_new_phys; - mpc_new = phys_to_virt(mpc_new_phys); memcpy(mpc_new, mpc, mpc->length); + early_memunmap(mpc, size); mpc = mpc_new; + size = mpc_new_length; /* check if we can modify that */ if (mpc_new_phys - mpf->physptr) { struct mpf_intel *mpf_new; /* steal 16 bytes from [0, 1k) */ + mpf_new = early_memremap(0x400 - 16, sizeof(*mpf_new)); + if (!mpf_new) { + pr_err("MPTABLE: new mpf early_memremap() failed\n"); + goto do_unmap_mpc; + } pr_info("mpf new: %x\n", 0x400 - 16); - mpf_new = phys_to_virt(0x400 - 16); memcpy(mpf_new, mpf, 16); + early_memunmap(mpf, sizeof(*mpf)); mpf = mpf_new; mpf->physptr = mpc_new_phys; } @@ -909,6 +946,12 @@ static int __init update_mp_table(void) */ replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length); +do_unmap_mpc: + early_memunmap(mpc, size); + +do_unmap_mpf: + early_memunmap(mpf, sizeof(*mpf)); + return 0; } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 446c8aa09b9b..35aafc95e4b8 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -39,26 +39,26 @@ #include <trace/events/nmi.h> struct nmi_desc { - spinlock_t lock; + raw_spinlock_t lock; struct list_head head; }; static struct nmi_desc nmi_desc[NMI_MAX] = { { - .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock), + .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock), .head = LIST_HEAD_INIT(nmi_desc[0].head), }, { - .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock), + .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock), .head = LIST_HEAD_INIT(nmi_desc[1].head), }, { - .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock), + .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock), .head = LIST_HEAD_INIT(nmi_desc[2].head), }, { - .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock), + .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock), .head = LIST_HEAD_INIT(nmi_desc[3].head), }, @@ -163,7 +163,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action) init_irq_work(&action->irq_work, nmi_max_handler); - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); /* * Indicate if there are multiple registrations on the @@ -181,7 +181,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action) else list_add_tail_rcu(&action->list, &desc->head); - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } EXPORT_SYMBOL(__register_nmi_handler); @@ -192,7 +192,7 @@ void unregister_nmi_handler(unsigned int type, const char *name) struct nmiaction *n; unsigned long 
flags; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); list_for_each_entry_rcu(n, &desc->head, list) { /* @@ -207,7 +207,7 @@ void unregister_nmi_handler(unsigned int type, const char *name) } } - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); synchronize_rcu(); } EXPORT_SYMBOL_GPL(unregister_nmi_handler); diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index 6d9582ec0324..a1a96df3dff1 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * arch/x86/kernel/nmi-selftest.c * @@ -78,7 +79,7 @@ static void __init test_nmi_ipi(struct cpumask *mask) /* Don't wait longer than a second */ timeout = USEC_PER_SEC; - while (!cpumask_empty(mask) && timeout--) + while (!cpumask_empty(mask) && --timeout) udelay(1); /* What happens if we timeout, do we still unregister?? */ diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 8f2d1c9d43a8..71f2d1125ec0 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Split spinlock implementation out into its own file, so it can be * compiled in a FTRACE-compatible way. diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 3586996fc50d..19a3e8f961c7 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -319,9 +319,6 @@ __visible struct pv_irq_ops pv_irq_ops = { .irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable), .safe_halt = native_safe_halt, .halt = native_halt, -#ifdef CONFIG_X86_64 - .adjust_exception_frame = paravirt_nop, -#endif }; __visible struct pv_cpu_ops pv_cpu_ops = { @@ -330,7 +327,6 @@ __visible struct pv_cpu_ops pv_cpu_ops = { .set_debugreg = native_set_debugreg, .read_cr0 = native_read_cr0, .write_cr0 = native_write_cr0, - .read_cr4 = native_read_cr4, .write_cr4 = native_write_cr4, #ifdef CONFIG_X86_64 .read_cr8 = native_read_cr8, @@ -346,7 +342,6 @@ __visible struct pv_cpu_ops pv_cpu_ops = { .set_ldt = native_set_ldt, .load_gdt = native_load_gdt, .load_idt = native_load_idt, - .store_idt = native_store_idt, .store_tr = native_store_tr, .load_tls = native_load_tls, #ifdef CONFIG_X86_64 @@ -391,7 +386,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = { .read_cr2 = native_read_cr2, .write_cr2 = native_write_cr2, - .read_cr3 = native_read_cr3, + .read_cr3 = __native_read_cr3, .write_cr3 = native_write_cr3, .flush_tlb_user = native_flush_tlb, @@ -414,8 +409,6 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = { .set_pte = native_set_pte, .set_pte_at = native_set_pte_at, .set_pmd = native_set_pmd, - .set_pmd_at = native_set_pmd_at, - .pte_update = paravirt_nop, .ptep_modify_prot_start = __ptep_modify_prot_start, .ptep_modify_prot_commit = __ptep_modify_prot_commit, @@ -427,7 +420,6 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = { .pmd_clear = native_pmd_clear, #endif .set_pud = native_set_pud, - .set_pud_at = native_set_pud_at, .pmd_val = PTE_IDENT, .make_pmd = PTE_IDENT, diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c index 553acbbb4d32..758e69d72ebf 100644 --- a/arch/x86/kernel/paravirt_patch_32.c +++ b/arch/x86/kernel/paravirt_patch_32.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <asm/paravirt.h> DEF_NATIVE(pv_irq_ops, irq_disable, "cli"); diff --git a/arch/x86/kernel/paravirt_patch_64.c 
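
The test_nmi_ipi() hunk above changes the wait-loop guard from timeout-- to --timeout. The tiny standalone program below illustrates how the two guards differ at the boundary, which is the point of the fix: with post-decrement the counter is decremented once more on the failing test and does not end up at zero. Purely illustrative, not kernel code.

#include <stdio.h>

int main(void)
{
        unsigned long a = 3, b = 3;
        int iters_post = 0, iters_pre = 0;

        while (a--)             /* post-decrement: tests the old value */
                iters_post++;
        while (--b)             /* pre-decrement: tests the new value  */
                iters_pre++;

        /* post: 3 iterations, counter wraps to ULONG_MAX on exit */
        /* pre:  2 iterations, counter is exactly 0 on exit       */
        printf("post: %d iterations, a=%lu\n", iters_post, a);
        printf("pre:  %d iterations, b=%lu\n", iters_pre, b);
        return 0;
}
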
b/arch/x86/kernel/paravirt_patch_64.c index 11aaf1eaa0e4..ac0be8283325 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <asm/paravirt.h> #include <asm/asm-offsets.h> #include <linux/stringify.h> diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index fda7867046d0..5286a4a92cf7 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -50,6 +50,8 @@ #include <asm/x86_init.h> #include <asm/iommu_table.h> +#define CALGARY_MAPPING_ERROR 0 + #ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT int use_calgary __read_mostly = 1; #else @@ -252,7 +254,7 @@ static unsigned long iommu_range_alloc(struct device *dev, if (panic_on_overflow) panic("Calgary: fix the allocator.\n"); else - return DMA_ERROR_CODE; + return CALGARY_MAPPING_ERROR; } } @@ -272,10 +274,10 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, entry = iommu_range_alloc(dev, tbl, npages); - if (unlikely(entry == DMA_ERROR_CODE)) { + if (unlikely(entry == CALGARY_MAPPING_ERROR)) { pr_warn("failed to allocate %u pages in iommu %p\n", npages, tbl); - return DMA_ERROR_CODE; + return CALGARY_MAPPING_ERROR; } /* set the return dma address */ @@ -295,7 +297,7 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, unsigned long flags; /* were we called with bad_dma_address? */ - badend = DMA_ERROR_CODE + (EMERGENCY_PAGES * PAGE_SIZE); + badend = CALGARY_MAPPING_ERROR + (EMERGENCY_PAGES * PAGE_SIZE); if (unlikely(dma_addr < badend)) { WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA " "address 0x%Lx\n", dma_addr); @@ -380,7 +382,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE); entry = iommu_range_alloc(dev, tbl, npages); - if (entry == DMA_ERROR_CODE) { + if (entry == CALGARY_MAPPING_ERROR) { /* makes sure unmap knows to stop */ s->dma_length = 0; goto error; @@ -398,7 +400,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, error: calgary_unmap_sg(dev, sg, nelems, dir, 0); for_each_sg(sg, s, nelems, i) { - sg->dma_address = DMA_ERROR_CODE; + sg->dma_address = CALGARY_MAPPING_ERROR; sg->dma_length = 0; } return 0; @@ -453,7 +455,7 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size, /* set up tces to cover the allocated range */ mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); - if (mapping == DMA_ERROR_CODE) + if (mapping == CALGARY_MAPPING_ERROR) goto free; *dma_handle = mapping; return ret; @@ -478,6 +480,11 @@ static void calgary_free_coherent(struct device *dev, size_t size, free_pages((unsigned long)vaddr, get_order(size)); } +static int calgary_mapping_error(struct device *dev, dma_addr_t dma_addr) +{ + return dma_addr == CALGARY_MAPPING_ERROR; +} + static const struct dma_map_ops calgary_dma_ops = { .alloc = calgary_alloc_coherent, .free = calgary_free_coherent, @@ -485,6 +492,8 @@ static const struct dma_map_ops calgary_dma_ops = { .unmap_sg = calgary_unmap_sg, .map_page = calgary_map_page, .unmap_page = calgary_unmap_page, + .mapping_error = calgary_mapping_error, + .dma_supported = x86_dma_supported, }; static inline void __iomem * busno_to_bbar(unsigned char num) @@ -732,7 +741,7 @@ static void __init calgary_reserve_regions(struct pci_dev *dev) struct iommu_table *tbl = pci_iommu(dev->bus); /* reserve EMERGENCY_PAGES from bad_dma_address and up */ - iommu_range_reserve(tbl, 
DMA_ERROR_CODE, EMERGENCY_PAGES); + iommu_range_reserve(tbl, CALGARY_MAPPING_ERROR, EMERGENCY_PAGES); /* avoid the BIOS/VGA first 640KB-1MB region */ /* for CalIOC2 - avoid the entire first MB */ diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 3a216ec869cd..599d7462eccc 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/dma-mapping.h> #include <linux/dma-debug.h> #include <linux/dmar.h> @@ -93,9 +94,12 @@ again: if (gfpflags_allow_blocking(flag)) { page = dma_alloc_from_contiguous(dev, count, get_order(size), flag); - if (page && page_to_phys(page) + size > dma_mask) { - dma_release_from_contiguous(dev, page, count); - page = NULL; + if (page) { + addr = phys_to_dma(dev, page_to_phys(page)); + if (addr + size > dma_mask) { + dma_release_from_contiguous(dev, page, count); + page = NULL; + } } } /* fallback */ @@ -104,7 +108,7 @@ again: if (!page) return NULL; - addr = page_to_phys(page); + addr = phys_to_dma(dev, page_to_phys(page)); if (addr + size > dma_mask) { __free_pages(page, get_order(size)); @@ -213,10 +217,8 @@ static __init int iommu_setup(char *p) } early_param("iommu", iommu_setup); -int dma_supported(struct device *dev, u64 mask) +int x86_dma_supported(struct device *dev, u64 mask) { - const struct dma_map_ops *ops = get_dma_ops(dev); - #ifdef CONFIG_PCI if (mask > 0xffffffff && forbid_dac > 0) { dev_info(dev, "PCI: Disallowing DAC for device\n"); @@ -224,9 +226,6 @@ int dma_supported(struct device *dev, u64 mask) } #endif - if (ops->dma_supported) - return ops->dma_supported(dev, mask); - /* Copied from i386. Doesn't make much sense, because it will only work for pci_alloc_coherent. The caller just has to use GFP_DMA in this case. */ @@ -252,7 +251,6 @@ int dma_supported(struct device *dev, u64 mask) return 1; } -EXPORT_SYMBOL(dma_supported); static int __init pci_iommu_init(void) { diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c index f712dfdf1357..4dfd90a75e63 100644 --- a/arch/x86/kernel/pci-iommu_table.c +++ b/arch/x86/kernel/pci-iommu_table.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/dma-mapping.h> #include <asm/iommu_table.h> #include <linux/string.h> diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index a88952ef371c..b0caae27e1b7 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Fallback functions when the main IOMMU code is not compiled in. This code is roughly equivalent to i386. 
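
The Calgary hunks above drop the global DMA_ERROR_CODE in favour of a driver-local sentinel reported through the .mapping_error callback of struct dma_map_ops. The sketch below shows the minimal shape of that interface; "myiommu" is a made-up example driver and only the error-reporting part is filled in.

#include <linux/dma-mapping.h>

#define MYIOMMU_MAPPING_ERROR   0       /* driver-local "bad dma_addr" value */

static int myiommu_mapping_error(struct device *dev, dma_addr_t dma_addr)
{
        return dma_addr == MYIOMMU_MAPPING_ERROR;
}

static const struct dma_map_ops myiommu_dma_ops = {
        .mapping_error  = myiommu_mapping_error,
        /* .map_page, .unmap_page, .map_sg, ... would be filled in by a
         * real driver; they must return MYIOMMU_MAPPING_ERROR on failure. */
};
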
*/ #include <linux/dma-mapping.h> @@ -11,6 +12,8 @@ #include <asm/iommu.h> #include <asm/dma.h> +#define NOMMU_MAPPING_ERROR 0 + static int check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) { @@ -30,10 +33,10 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page, enum dma_data_direction dir, unsigned long attrs) { - dma_addr_t bus = page_to_phys(page) + offset; + dma_addr_t bus = phys_to_dma(dev, page_to_phys(page)) + offset; WARN_ON(size == 0); if (!check_addr("map_single", dev, bus, size)) - return DMA_ERROR_CODE; + return NOMMU_MAPPING_ERROR; flush_write_buffers(); return bus; } @@ -88,6 +91,11 @@ static void nommu_sync_sg_for_device(struct device *dev, flush_write_buffers(); } +static int nommu_mapping_error(struct device *dev, dma_addr_t dma_addr) +{ + return dma_addr == NOMMU_MAPPING_ERROR; +} + const struct dma_map_ops nommu_dma_ops = { .alloc = dma_generic_alloc_coherent, .free = dma_generic_free_coherent, @@ -96,4 +104,6 @@ const struct dma_map_ops nommu_dma_ops = { .sync_single_for_device = nommu_sync_single_for_device, .sync_sg_for_device = nommu_sync_sg_for_device, .is_phys = 1, + .mapping_error = nommu_mapping_error, + .dma_supported = x86_dma_supported, }; diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 1e23577e17cf..53bd05ea90d8 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Glue code to lib/swiotlb.c */ #include <linux/pci.h> @@ -6,12 +7,14 @@ #include <linux/swiotlb.h> #include <linux/bootmem.h> #include <linux/dma-mapping.h> +#include <linux/mem_encrypt.h> #include <asm/iommu.h> #include <asm/swiotlb.h> #include <asm/dma.h> #include <asm/xen/swiotlb-xen.h> #include <asm/iommu_table.h> + int swiotlb __read_mostly; void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, @@ -79,8 +82,8 @@ IOMMU_INIT_FINISH(pci_swiotlb_detect_override, pci_swiotlb_late_init); /* - * if 4GB or more detected (and iommu=off not set) return 1 - * and set swiotlb to 1. + * If 4GB or more detected (and iommu=off not set) or if SME is active + * then set swiotlb to 1 and return 1. */ int __init pci_swiotlb_detect_4gb(void) { @@ -89,6 +92,15 @@ int __init pci_swiotlb_detect_4gb(void) if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN) swiotlb = 1; #endif + + /* + * If SME is active then swiotlb will be set to 1 so that bounce + * buffers are allocated and used for devices that do not support + * the addressing range required for the encryption mask. 
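
The pci-dma.c and pci-nommu.c hunks above run the page's physical address through phys_to_dma() before comparing it against the device's DMA mask, so the check also accounts for the SME encryption bit that the translation adds. The kernel-context sketch below shows that ordering; alloc_for_device() is a made-up helper, the error handling is simplified, and the include list is approximate (phys_to_dma() comes from the arch DMA headers pulled in by the files above).

#include <linux/dma-mapping.h>
#include <linux/gfp.h>
#include <linux/mm.h>

static struct page *alloc_for_device(struct device *dev, size_t size,
                                     u64 dma_mask, gfp_t flag)
{
        struct page *page = alloc_pages(flag, get_order(size));
        dma_addr_t addr;

        if (!page)
                return NULL;

        /* Translate first, then range-check the bus address. */
        addr = phys_to_dma(dev, page_to_phys(page));
        if (addr + size > dma_mask) {
                __free_pages(page, get_order(size));
                return NULL;
        }
        return page;
}
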
+ */ + if (sme_active()) + swiotlb = 1; + return swiotlb; } IOMMU_INIT(pci_swiotlb_detect_4gb, diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c index a311ffcaad16..da5190a1ea16 100644 --- a/arch/x86/kernel/pcspeaker.c +++ b/arch/x86/kernel/pcspeaker.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/platform_device.h> #include <linux/err.h> #include <linux/init.h> diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index 587d887f7f17..e47b2dbbdef3 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched.h> diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c index 91271122f0df..39a59299bfa0 100644 --- a/arch/x86/kernel/platform-quirks.c +++ b/arch/x86/kernel/platform-quirks.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/init.h> @@ -16,7 +17,6 @@ void __init x86_early_init_platform_quirks(void) x86_platform.legacy.reserve_bios_regions = 1; break; case X86_SUBARCH_XEN: - case X86_SUBARCH_LGUEST: x86_platform.legacy.devices.pnpbios = 0; x86_platform.legacy.rtc = 0; break; diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c index 0c5315d322c8..3fe690067802 100644 --- a/arch/x86/kernel/pmem.c +++ b/arch/x86/kernel/pmem.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2015, Christoph Hellwig. * Copyright (c) 2015, Intel Corporation. diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c index 963e3fb56437..ee0286390a4c 100644 --- a/arch/x86/kernel/probe_roms.c +++ b/arch/x86/kernel/probe_roms.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/sched.h> #include <linux/mm.h> #include <linux/uaccess.h> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 3ca198080ea9..c67685337c5a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/errno.h> @@ -355,6 +356,7 @@ bool xen_set_default_idle(void) return ret; } #endif + void stop_this_cpu(void *dummy) { local_irq_disable(); @@ -365,8 +367,20 @@ void stop_this_cpu(void *dummy) disable_local_APIC(); mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); - for (;;) - halt(); + for (;;) { + /* + * Use wbinvd followed by hlt to stop the processor. This + * provides support for kexec on a processor that supports + * SME. With kexec, going from SME inactive to SME active + * requires clearing cache entries so that addresses without + * the encryption bit set don't corrupt the same physical + * address that has the encryption bit set when caches are + * flushed. To achieve this a wbinvd is performed followed by + * a hlt. Even if the processor is not in the kexec/SME + * scenario this only adds a wbinvd to a halting processor. 
+ */ + asm volatile("wbinvd; hlt" : : : "memory"); + } } /* diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index ffeae818aa7a..11966251cd42 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -56,7 +56,7 @@ #include <asm/debugreg.h> #include <asm/switch_to.h> #include <asm/vm86.h> -#include <asm/intel_rdt.h> +#include <asm/intel_rdt_sched.h> #include <asm/proto.h> void __show_regs(struct pt_regs *regs, int all) @@ -68,7 +68,7 @@ void __show_regs(struct pt_regs *regs, int all) if (user_mode(regs)) { sp = regs->sp; - ss = regs->ss & 0xffff; + ss = regs->ss; gs = get_user_gs(regs); } else { sp = kernel_stack_pointer(regs); @@ -92,7 +92,7 @@ void __show_regs(struct pt_regs *regs, int all) cr0 = read_cr0(); cr2 = read_cr2(); - cr3 = read_cr3(); + cr3 = __read_cr3(); cr4 = __read_cr4(); printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b6840bf3940b..302e7b2572d1 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -52,7 +52,7 @@ #include <asm/switch_to.h> #include <asm/xen/hypervisor.h> #include <asm/vdso.h> -#include <asm/intel_rdt.h> +#include <asm/intel_rdt_sched.h> #include <asm/unistd.h> #ifdef CONFIG_IA32_EMULATION /* Not included via unistd.h */ @@ -69,8 +69,7 @@ void __show_regs(struct pt_regs *regs, int all) unsigned int fsindex, gsindex; unsigned int ds, cs, es; - printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs & 0xffff, - (void *)regs->ip); + printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs, (void *)regs->ip); printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss, regs->sp, regs->flags); if (regs->orig_ax != -1) @@ -104,7 +103,7 @@ void __show_regs(struct pt_regs *regs, int all) cr0 = read_cr0(); cr2 = read_cr2(); - cr3 = read_cr3(); + cr3 = __read_cr3(); cr4 = __read_cr4(); printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", @@ -142,13 +141,130 @@ void release_thread(struct task_struct *dead_task) pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n", dead_task->comm, dead_task->mm->context.ldt->entries, - dead_task->mm->context.ldt->size); + dead_task->mm->context.ldt->nr_entries); BUG(); } #endif } } +enum which_selector { + FS, + GS +}; + +/* + * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are + * not available. The goal is to be reasonably fast on non-FSGSBASE systems. + * It's forcibly inlined because it'll generate better code and this function + * is hot. + */ +static __always_inline void save_base_legacy(struct task_struct *prev_p, + unsigned short selector, + enum which_selector which) +{ + if (likely(selector == 0)) { + /* + * On Intel (without X86_BUG_NULL_SEG), the segment base could + * be the pre-existing saved base or it could be zero. On AMD + * (with X86_BUG_NULL_SEG), the segment base could be almost + * anything. + * + * This branch is very hot (it's hit twice on almost every + * context switch between 64-bit programs), and avoiding + * the RDMSR helps a lot, so we just assume that whatever + * value is already saved is correct. This matches historical + * Linux behavior, so it won't break existing applications. + * + * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we + * report that the base is zero, it needs to actually be zero: + * see the corresponding logic in load_seg_legacy. 
+ */ + } else { + /* + * If the selector is 1, 2, or 3, then the base is zero on + * !X86_BUG_NULL_SEG CPUs and could be anything on + * X86_BUG_NULL_SEG CPUs. In the latter case, Linux + * has never attempted to preserve the base across context + * switches. + * + * If selector > 3, then it refers to a real segment, and + * saving the base isn't necessary. + */ + if (which == FS) + prev_p->thread.fsbase = 0; + else + prev_p->thread.gsbase = 0; + } +} + +static __always_inline void save_fsgs(struct task_struct *task) +{ + savesegment(fs, task->thread.fsindex); + savesegment(gs, task->thread.gsindex); + save_base_legacy(task, task->thread.fsindex, FS); + save_base_legacy(task, task->thread.gsindex, GS); +} + +static __always_inline void loadseg(enum which_selector which, + unsigned short sel) +{ + if (which == FS) + loadsegment(fs, sel); + else + load_gs_index(sel); +} + +static __always_inline void load_seg_legacy(unsigned short prev_index, + unsigned long prev_base, + unsigned short next_index, + unsigned long next_base, + enum which_selector which) +{ + if (likely(next_index <= 3)) { + /* + * The next task is using 64-bit TLS, is not using this + * segment at all, or is having fun with arcane CPU features. + */ + if (next_base == 0) { + /* + * Nasty case: on AMD CPUs, we need to forcibly zero + * the base. + */ + if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { + loadseg(which, __USER_DS); + loadseg(which, next_index); + } else { + /* + * We could try to exhaustively detect cases + * under which we can skip the segment load, + * but there's really only one case that matters + * for performance: if both the previous and + * next states are fully zeroed, we can skip + * the load. + * + * (This assumes that prev_base == 0 has no + * false positives. This is the case on + * Intel-style CPUs.) + */ + if (likely(prev_index | next_index | prev_base)) + loadseg(which, next_index); + } + } else { + if (prev_index != next_index) + loadseg(which, next_index); + wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE, + next_base); + } + } else { + /* + * The next task is using a real segment. Loading the selector + * is sufficient. + */ + loadseg(which, next_index); + } +} + int copy_thread_tls(unsigned long clone_flags, unsigned long sp, unsigned long arg, struct task_struct *p, unsigned long tls) { @@ -229,10 +345,19 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp, unsigned int _cs, unsigned int _ss, unsigned int _ds) { + WARN_ON_ONCE(regs != current_pt_regs()); + + if (static_cpu_has(X86_BUG_NULL_SEG)) { + /* Loading zero below won't clear the base. */ + loadsegment(fs, __USER_DS); + load_gs_index(__USER_DS); + } + loadsegment(fs, 0); loadsegment(es, _ds); loadsegment(ds, _ds); load_gs_index(0); + regs->ip = new_ip; regs->sp = new_sp; regs->cs = _cs; @@ -277,7 +402,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(cpu_tss, cpu); - unsigned prev_fsindex, prev_gsindex; + + WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && + this_cpu_read(irq_count) != -1); switch_fpu_prepare(prev_fpu, cpu); @@ -286,8 +413,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * * (e.g. 
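
The new load_seg_legacy() above decides, per segment, whether the incoming task needs a selector load, a selector plus base write, or nothing at all. The function below restates that decision tree in simplified form, returning an action code instead of touching segment registers; it is purely illustrative and deliberately omits the X86_BUG_NULL_SEG handling (loading __USER_DS first) and the "only reload the selector if it changed" refinement of the real code.

enum seg_action { SEG_NOTHING, SEG_LOAD_SEL, SEG_LOAD_SEL_AND_BASE };

enum seg_action next_seg_action(unsigned short prev_index,
                                unsigned long prev_base,
                                unsigned short next_index,
                                unsigned long next_base)
{
        if (next_index > 3)             /* real segment: selector is enough */
                return SEG_LOAD_SEL;

        if (next_base != 0)             /* 64-bit base set via arch_prctl() */
                return SEG_LOAD_SEL_AND_BASE;

        /* Next task's selector and base are (logically) zero. */
        if (prev_index | next_index | prev_base)
                return SEG_LOAD_SEL;    /* reload to clear stale state */

        return SEG_NOTHING;             /* fully zero on both sides: skip */
}
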
xen_load_tls()) */ - savesegment(fs, prev_fsindex); - savesegment(gs, prev_gsindex); + save_fsgs(prev_p); /* * Load TLS before restoring any segments so that segment loads @@ -326,108 +452,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (unlikely(next->ds | prev->ds)) loadsegment(ds, next->ds); - /* - * Switch FS and GS. - * - * These are even more complicated than DS and ES: they have - * 64-bit bases are that controlled by arch_prctl. The bases - * don't necessarily match the selectors, as user code can do - * any number of things to cause them to be inconsistent. - * - * We don't promise to preserve the bases if the selectors are - * nonzero. We also don't promise to preserve the base if the - * selector is zero and the base doesn't match whatever was - * most recently passed to ARCH_SET_FS/GS. (If/when the - * FSGSBASE instructions are enabled, we'll need to offer - * stronger guarantees.) - * - * As an invariant, - * (fsbase != 0 && fsindex != 0) || (gsbase != 0 && gsindex != 0) is - * impossible. - */ - if (next->fsindex) { - /* Loading a nonzero value into FS sets the index and base. */ - loadsegment(fs, next->fsindex); - } else { - if (next->fsbase) { - /* Next index is zero but next base is nonzero. */ - if (prev_fsindex) - loadsegment(fs, 0); - wrmsrl(MSR_FS_BASE, next->fsbase); - } else { - /* Next base and index are both zero. */ - if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { - /* - * We don't know the previous base and can't - * find out without RDMSR. Forcibly clear it. - */ - loadsegment(fs, __USER_DS); - loadsegment(fs, 0); - } else { - /* - * If the previous index is zero and ARCH_SET_FS - * didn't change the base, then the base is - * also zero and we don't need to do anything. - */ - if (prev->fsbase || prev_fsindex) - loadsegment(fs, 0); - } - } - } - /* - * Save the old state and preserve the invariant. - * NB: if prev_fsindex == 0, then we can't reliably learn the base - * without RDMSR because Intel user code can zero it without telling - * us and AMD user code can program any 32-bit value without telling - * us. - */ - if (prev_fsindex) - prev->fsbase = 0; - prev->fsindex = prev_fsindex; - - if (next->gsindex) { - /* Loading a nonzero value into GS sets the index and base. */ - load_gs_index(next->gsindex); - } else { - if (next->gsbase) { - /* Next index is zero but next base is nonzero. */ - if (prev_gsindex) - load_gs_index(0); - wrmsrl(MSR_KERNEL_GS_BASE, next->gsbase); - } else { - /* Next base and index are both zero. */ - if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { - /* - * We don't know the previous base and can't - * find out without RDMSR. Forcibly clear it. - * - * This contains a pointless SWAPGS pair. - * Fixing it would involve an explicit check - * for Xen or a new pvop. - */ - load_gs_index(__USER_DS); - load_gs_index(0); - } else { - /* - * If the previous index is zero and ARCH_SET_GS - * didn't change the base, then the base is - * also zero and we don't need to do anything. - */ - if (prev->gsbase || prev_gsindex) - load_gs_index(0); - } - } - } - /* - * Save the old state and preserve the invariant. - * NB: if prev_gsindex == 0, then we can't reliably learn the base - * without RDMSR because Intel user code can zero it without telling - * us and AMD user code can program any 32-bit value without telling - * us. 
- */ - if (prev_gsindex) - prev->gsbase = 0; - prev->gsindex = prev_gsindex; + load_seg_legacy(prev->fsindex, prev->fsbase, + next->fsindex, next->fsbase, FS); + load_seg_legacy(prev->gsindex, prev->gsbase, + next->gsindex, next->gsbase, GS); switch_fpu_finish(next_fpu, cpu); diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 0bee04d41bed..697a4ce04308 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -1,6 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 /* * This file contains work-arounds for x86 and x86_64 platform bugs. */ +#include <linux/dmi.h> #include <linux/pci.h> #include <linux/irq.h> @@ -656,3 +658,12 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, quirk_intel_brickland_xeon_ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2083, quirk_intel_purley_xeon_ras_cap); #endif #endif + +bool x86_apple_machine; +EXPORT_SYMBOL(x86_apple_machine); + +void __init early_platform_quirks(void) +{ + x86_apple_machine = dmi_match(DMI_SYS_VENDOR, "Apple Inc.") || + dmi_match(DMI_SYS_VENDOR, "Apple Computer, Inc."); +} diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 2544700a2a87..2126b9d27c34 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/export.h> @@ -9,6 +10,7 @@ #include <linux/sched.h> #include <linux/tboot.h> #include <linux/delay.h> +#include <linux/frame.h> #include <acpi/reboot.h> #include <asm/io.h> #include <asm/apic.h> @@ -37,8 +39,6 @@ void (*pm_power_off)(void); EXPORT_SYMBOL(pm_power_off); -static const struct desc_ptr no_idt = {}; - /* * This is set if we need to go through the 'emergency' path. * When machine_emergency_restart() is called, we may be on @@ -106,6 +106,10 @@ void __noreturn machine_real_restart(unsigned int type) load_cr3(initial_page_table); #else write_cr3(real_mode_header->trampoline_pgd); + + /* Exiting long mode will fail if CR4.PCIDE is set. */ + if (static_cpu_has(X86_FEATURE_PCID)) + cr4_clear_bits(X86_CR4_PCIDE); #endif /* Jump to the identity-mapped low memory code */ @@ -123,6 +127,7 @@ void __noreturn machine_real_restart(unsigned int type) #ifdef CONFIG_APM_MODULE EXPORT_SYMBOL(machine_real_restart); #endif +STACK_FRAME_NON_STANDARD(machine_real_restart); /* * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot @@ -150,7 +155,7 @@ static int __init set_kbd_reboot(const struct dmi_system_id *d) /* * This is a single dmi_table handling all reboot quirks. */ -static struct dmi_system_id __initdata reboot_dmi_table[] = { +static const struct dmi_system_id reboot_dmi_table[] __initconst = { /* Acer */ { /* Handle reboot issue on Acer Aspire one */ @@ -469,12 +474,12 @@ static int __init reboot_init(void) /* * The DMI quirks table takes precedence. If no quirks entry - * matches and the ACPI Hardware Reduced bit is set, force EFI - * reboot. + * matches and the ACPI Hardware Reduced bit is set and EFI + * runtime services are enabled, force EFI reboot. */ rv = dmi_check_system(reboot_dmi_table); - if (!rv && efi_reboot_required()) + if (!rv && efi_reboot_required() && !efi_runtime_disabled()) reboot_type = BOOT_EFI; return 0; @@ -636,7 +641,7 @@ static void native_machine_emergency_restart(void) break; case BOOT_TRIPLE: - load_idt(&no_idt); + idt_invalidate(NULL); __asm__ __volatile__("int3"); /* We're probably dead after this, but... 
*/ diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c index c8e41e90f59c..b7c0f142d026 100644 --- a/arch/x86/kernel/reboot_fixups_32.c +++ b/arch/x86/kernel/reboot_fixups_32.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * This is a good place to put board specific reboot fixups. * diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 98111b38ebfd..307d3bac5f04 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -47,6 +47,7 @@ relocate_kernel: * %rsi page_list * %rdx start address * %rcx preserve_context + * %r8 sme_active */ /* Save the CPU context, used for jumping back */ @@ -71,6 +72,9 @@ relocate_kernel: pushq $0 popfq + /* Save SME active flag */ + movq %r8, %r12 + /* * get physical address of control page now * this is impossible after page table switch @@ -132,6 +136,16 @@ identity_mapped: /* Flush the TLB (needed?) */ movq %r9, %cr3 + /* + * If SME is active, there could be old encrypted cache line + * entries that will conflict with the now unencrypted memory + * used by kexec. Flush the caches before copying the kernel. + */ + testq %r12, %r12 + jz 1f + wbinvd +1: + movq %rcx, %r11 call swap_pages diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c index 5ab3895516ac..9b9fb7882c20 100644 --- a/arch/x86/kernel/resource.c +++ b/arch/x86/kernel/resource.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/ioport.h> #include <asm/e820/api.h> diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 5b21cb7d84d6..69ac9cb9cac6 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * RTC related functions */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f81823695014..0957dd73d127 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -69,6 +69,7 @@ #include <linux/crash_dump.h> #include <linux/tboot.h> #include <linux/jiffies.h> +#include <linux/mem_encrypt.h> #include <linux/usb/xhci-dbgp.h> #include <video/edid.h> @@ -115,6 +116,7 @@ #include <asm/microcode.h> #include <asm/mmu_context.h> #include <asm/kaslr.h> +#include <asm/unwind.h> /* * max_low_pfn_mapped: highest direct mapped pfn under 4GB @@ -374,6 +376,14 @@ static void __init reserve_initrd(void) !ramdisk_image || !ramdisk_size) return; /* No initrd provided by bootloader */ + /* + * If SME is active, this memory will be marked encrypted by the + * kernel when it is accessed (including relocation). However, the + * ramdisk image was loaded decrypted by the bootloader, so make + * sure that it is encrypted before accessing it. + */ + sme_early_encrypt(ramdisk_image, ramdisk_end - ramdisk_image); + initrd_start = 0; mapped_size = memblock_mem_size(max_pfn_mapped); @@ -503,7 +513,7 @@ static int __init reserve_crashkernel_low(void) return 0; } - low_base = memblock_find_in_range(low_size, 1ULL << 32, low_size, CRASH_ALIGN); + low_base = memblock_find_in_range(0, 1ULL << 32, low_size, CRASH_ALIGN); if (!low_base) { pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n", (unsigned long)(low_size >> 20)); @@ -890,7 +900,7 @@ void __init setup_arch(char **cmdline_p) */ olpc_ofw_detect(); - early_trap_init(); + idt_setup_early_traps(); early_cpu_init(); early_ioremap_init(); @@ -1076,6 +1086,13 @@ void __init setup_arch(char **cmdline_p) max_possible_pfn = max_pfn; /* + * This call is required when the CPU does not support PAT. 
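
The reserve_crashkernel_low() hunk above fixes the memblock search window: the old call passed the allocation size as the *start* of the range, silently shrinking the usable area below 4G. The one-line kernel-context sketch below shows the corrected call shape; find_low_crash_base() is a made-up wrapper and the caller would still have to memblock_reserve() the returned base.

#include <linux/init.h>
#include <linux/memblock.h>

static phys_addr_t __init find_low_crash_base(phys_addr_t low_size,
                                              phys_addr_t align)
{
        /* Search all of [0, 4G) for a block of low_size bytes. */
        return memblock_find_in_range(0, 1ULL << 32, low_size, align);
}
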
If + * mtrr_bp_init() invoked it already via pat_init() the call has no + * effect. + */ + init_cache_modes(); + + /* * Define random base addresses for memory sections after max_pfn is * defined and before each memory section base is used. */ @@ -1154,15 +1171,18 @@ void __init setup_arch(char **cmdline_p) init_mem_mapping(); - early_trap_pf_init(); + idt_setup_early_pf(); /* * Update mmu_cr4_features (and, indirectly, trampoline_cr4_features) * with the current CR4 value. This may not be necessary, but * auditing all the early-boot CR4 manipulation would be needed to * rule it out. + * + * Mask off features that don't work outside long mode (just + * PCIDE for now). */ - mmu_cr4_features = __read_cr4(); + mmu_cr4_features = __read_cr4() & ~X86_CR4_PCIDE; memblock_set_current_limit(get_max_mapped()); @@ -1199,6 +1219,8 @@ void __init setup_arch(char **cmdline_p) io_delay_init(); + early_platform_quirks(); + /* * Parse the ACPI tables for possible boot-time SMP configuration. */ @@ -1303,6 +1325,8 @@ void __init setup_arch(char **cmdline_p) if (efi_enabled(EFI_BOOT)) efi_apply_memmap_quirks(); #endif + + unwind_init(); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 10edd1e69a68..497aa766fab3 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> @@ -155,13 +156,10 @@ static void __init pcpup_populate_pte(unsigned long addr) static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 - struct desc_struct gdt; + struct desc_struct d = GDT_ENTRY_INIT(0x8092, per_cpu_offset(cpu), + 0xFFFFF); - pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF, - 0x2 | DESCTYPE_S, 0x8); - gdt.s = 1; - write_gdt_entry(get_cpu_gdt_rw(cpu), - GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S); + write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PERCPU, &d, DESCTYPE_S); #endif } @@ -171,7 +169,7 @@ void __init setup_per_cpu_areas(void) unsigned long delta; int rc; - pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", + pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%u nr_node_ids:%d\n", NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); /* diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index cc30a74e4adb..b9e00e8f1c9b 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs @@ -256,14 +257,14 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, sp = current->sas_ss_sp + current->sas_ss_size; } else if (IS_ENABLED(CONFIG_X86_32) && !onsigstack && - (regs->ss & 0xffff) != __USER_DS && + regs->ss != __USER_DS && !(ka->sa.sa_flags & SA_RESTORER) && ka->sa.sa_restorer) { /* This is the legacy signal stack switching. 
*/ sp = (unsigned long) ka->sa.sa_restorer; } - if (fpu->fpstate_active) { + if (fpu->initialized) { sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32), &buf_fx, &math_size); *fpstate = (void __user *)sp; @@ -279,7 +280,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, return (void __user *)-1L; /* save i387 and extended state */ - if (fpu->fpstate_active && + if (fpu->initialized && copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size) < 0) return (void __user *)-1L; @@ -755,7 +756,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) /* * Ensure the signal handler starts with the new fpu state. */ - if (fpu->fpstate_active) + if (fpu->initialized) fpu__clear(fpu); } signal_setup_done(failed, ksig, stepping); diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index 71beb28600d4..8c6da1a643da 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/compat.h> #include <linux/uaccess.h> #include <linux/ptrace.h> @@ -129,7 +130,7 @@ int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from, 3 ints plus the relevant union member. */ put_user_ex(from->si_signo, &to->si_signo); put_user_ex(from->si_errno, &to->si_errno); - put_user_ex((short)from->si_code, &to->si_code); + put_user_ex(from->si_code, &to->si_code); if (from->si_code < 0) { put_user_ex(from->si_pid, &to->si_pid); @@ -142,8 +143,8 @@ int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from, */ put_user_ex(from->_sifields._pad[0], &to->_sifields._pad[0]); - switch (from->si_code >> 16) { - case __SI_FAULT >> 16: + switch (siginfo_layout(from->si_signo, from->si_code)) { + case SIL_FAULT: if (from->si_signo == SIGBUS && (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)) @@ -160,11 +161,11 @@ int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from, put_user_ex(from->si_pkey, &to->si_pkey); } break; - case __SI_SYS >> 16: + case SIL_SYS: put_user_ex(from->si_syscall, &to->si_syscall); put_user_ex(from->si_arch, &to->si_arch); break; - case __SI_CHLD >> 16: + case SIL_CHLD: if (!x32_ABI) { put_user_ex(from->si_utime, &to->si_utime); put_user_ex(from->si_stime, &to->si_stime); @@ -174,21 +175,18 @@ int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from, } put_user_ex(from->si_status, &to->si_status); /* FALL THROUGH */ - default: - case __SI_KILL >> 16: + case SIL_KILL: put_user_ex(from->si_uid, &to->si_uid); break; - case __SI_POLL >> 16: + case SIL_POLL: put_user_ex(from->si_fd, &to->si_fd); break; - case __SI_TIMER >> 16: + case SIL_TIMER: put_user_ex(from->si_overrun, &to->si_overrun); put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr); break; - /* This is not generated by the kernel as of now. */ - case __SI_RT >> 16: - case __SI_MESGQ >> 16: + case SIL_RT: put_user_ex(from->si_uid, &to->si_uid); put_user_ex(from->si_int, &to->si_int); break; diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index d798c0da451c..5c574dff4c1a 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -254,84 +254,45 @@ finish: } /* - * Reschedule call back. + * Reschedule call back. 
KVM uses this interrupt to force a cpu out of + * guest mode */ -static inline void __smp_reschedule_interrupt(void) -{ - inc_irq_stat(irq_resched_count); - scheduler_ipi(); -} - __visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); - __smp_reschedule_interrupt(); - /* - * KVM uses this interrupt to force a cpu out of guest mode - */ -} - -__visible void __irq_entry smp_trace_reschedule_interrupt(struct pt_regs *regs) -{ - /* - * Need to call irq_enter() before calling the trace point. - * __smp_reschedule_interrupt() calls irq_enter/exit() too (in - * scheduler_ipi(). This is OK, since those functions are allowed - * to nest. - */ - ipi_entering_ack_irq(); - trace_reschedule_entry(RESCHEDULE_VECTOR); - __smp_reschedule_interrupt(); - trace_reschedule_exit(RESCHEDULE_VECTOR); - exiting_irq(); - /* - * KVM uses this interrupt to force a cpu out of guest mode - */ -} + inc_irq_stat(irq_resched_count); -static inline void __smp_call_function_interrupt(void) -{ - generic_smp_call_function_interrupt(); - inc_irq_stat(irq_call_count); + if (trace_resched_ipi_enabled()) { + /* + * scheduler_ipi() might call irq_enter() as well, but + * nested calls are fine. + */ + irq_enter(); + trace_reschedule_entry(RESCHEDULE_VECTOR); + scheduler_ipi(); + trace_reschedule_exit(RESCHEDULE_VECTOR); + irq_exit(); + return; + } + scheduler_ipi(); } __visible void __irq_entry smp_call_function_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); - __smp_call_function_interrupt(); - exiting_irq(); -} - -__visible void __irq_entry -smp_trace_call_function_interrupt(struct pt_regs *regs) -{ - ipi_entering_ack_irq(); trace_call_function_entry(CALL_FUNCTION_VECTOR); - __smp_call_function_interrupt(); - trace_call_function_exit(CALL_FUNCTION_VECTOR); - exiting_irq(); -} - -static inline void __smp_call_function_single_interrupt(void) -{ - generic_smp_call_function_single_interrupt(); inc_irq_stat(irq_call_count); -} - -__visible void __irq_entry -smp_call_function_single_interrupt(struct pt_regs *regs) -{ - ipi_entering_ack_irq(); - __smp_call_function_single_interrupt(); + generic_smp_call_function_interrupt(); + trace_call_function_exit(CALL_FUNCTION_VECTOR); exiting_irq(); } -__visible void __irq_entry -smp_trace_call_function_single_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_call_function_single_interrupt(struct pt_regs *r) { ipi_entering_ack_irq(); trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); - __smp_call_function_single_interrupt(); + inc_irq_stat(irq_call_count); + generic_smp_call_function_single_interrupt(); trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR); exiting_irq(); } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index f04479a8f74f..ad59edd84de7 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -226,16 +226,12 @@ static int enable_start_cpu0; static void notrace start_secondary(void *unused) { /* - * Don't put *anything* before cpu_init(), SMP booting is too - * fragile that we want to limit the things done here to the - * most necessary things. + * Don't put *anything* except direct CPU state initialization + * before cpu_init(), SMP booting is too fragile that we want to + * limit the things done here to the most necessary things. 
*/ - cpu_init(); - x86_cpuinit.early_percpu_clock_init(); - preempt_disable(); - smp_callin(); - - enable_start_cpu0 = 0; + if (boot_cpu_has(X86_FEATURE_PCID)) + __write_cr4(__read_cr4() | X86_CR4_PCIDE); #ifdef CONFIG_X86_32 /* switch away from the initial page table */ @@ -243,6 +239,13 @@ static void notrace start_secondary(void *unused) __flush_tlb_all(); #endif + cpu_init(); + x86_cpuinit.early_percpu_clock_init(); + preempt_disable(); + smp_callin(); + + enable_start_cpu0 = 0; + /* otherwise gcc will move up smp_processor_id before the cpu_init */ barrier(); /* @@ -863,7 +866,7 @@ static void announce_cpu(int cpu, int apicid) if (cpu == 1) printk(KERN_INFO "x86: Booting SMP configuration:\n"); - if (system_state == SYSTEM_BOOTING) { + if (system_state < SYSTEM_RUNNING) { if (node != current_node) { if (current_node > (-1)) pr_cont("\n"); @@ -971,7 +974,8 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle) * Returns zero if CPU booted OK, else error code from * ->wakeup_secondary_cpu. */ -static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) +static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, + int *cpu0_nmi_registered) { volatile u32 *trampoline_status = (volatile u32 *) __va(real_mode_header->trampoline_status); @@ -979,7 +983,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) unsigned long start_ip = real_mode_header->trampoline_start; unsigned long boot_error = 0; - int cpu0_nmi_registered = 0; unsigned long timeout; idle->thread.sp = (unsigned long)task_pt_regs(idle); @@ -1035,7 +1038,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) boot_error = apic->wakeup_secondary_cpu(apicid, start_ip); else boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid, - &cpu0_nmi_registered); + cpu0_nmi_registered); if (!boot_error) { /* @@ -1080,12 +1083,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) */ smpboot_restore_warm_reset_vector(); } - /* - * Clean up the nmi handler. Do this after the callin and callout sync - * to avoid impact of possible long unregister time. - */ - if (cpu0_nmi_registered) - unregister_nmi_handler(NMI_LOCAL, "wake_cpu0"); return boot_error; } @@ -1093,8 +1090,9 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) int native_cpu_up(unsigned int cpu, struct task_struct *tidle) { int apicid = apic->cpu_present_to_apicid(cpu); + int cpu0_nmi_registered = 0; unsigned long flags; - int err; + int err, ret = 0; WARN_ON(irqs_disabled()); @@ -1131,10 +1129,11 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) common_cpu_up(cpu, tidle); - err = do_boot_cpu(apicid, cpu, tidle); + err = do_boot_cpu(apicid, cpu, tidle, &cpu0_nmi_registered); if (err) { pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu); - return -EIO; + ret = -EIO; + goto unreg_nmi; } /* @@ -1150,7 +1149,15 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) touch_nmi_watchdog(); } - return 0; +unreg_nmi: + /* + * Clean up the nmi handler. Do this after the callin and callout sync + * to avoid impact of possible long unregister time. 
+ */ + if (cpu0_nmi_registered) + unregister_nmi_handler(NMI_LOCAL, "wake_cpu0"); + + return ret; } /** @@ -1457,7 +1464,7 @@ __init void prefill_possible_map(void) /* nr_cpu_ids could be reduced via nr_cpus= */ if (possible > nr_cpu_ids) { - pr_warn("%d Processors exceeds NR_CPUS limit of %d\n", + pr_warn("%d Processors exceeds NR_CPUS limit of %u\n", possible, nr_cpu_ids); possible = nr_cpu_ids; } @@ -1589,7 +1596,6 @@ void native_cpu_die(unsigned int cpu) void play_dead_common(void) { idle_task_exit(); - reset_lazy_tlbstate(); /* Ack it */ (void)cpu_report_death(); diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index f07f83b3611b..60d2c3798ba2 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * x86 single-step support code, common to 32-bit and 64-bit. */ @@ -13,7 +14,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re unsigned long addr, seg; addr = regs->ip; - seg = regs->cs & 0xffff; + seg = regs->cs; if (v8086_mode(regs)) { addr = (addr & 0xffff) + (seg << 4); return addr; @@ -34,7 +35,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re mutex_lock(&child->mm->context.lock); if (unlikely(!child->mm->context.ldt || - seg >= child->mm->context.ldt->size)) + seg >= child->mm->context.ldt->nr_entries)) addr = -1L; /* bogus selector, access would fault */ else { desc = &child->mm->context.ldt->entries[seg]; diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 213ddf3e937d..a63fe77b3217 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/errno.h> #include <linux/sched.h> #include <linux/sched/mm.h> @@ -21,6 +22,7 @@ #include <asm/compat.h> #include <asm/ia32.h> #include <asm/syscalls.h> +#include <asm/mpx.h> /* * Align a virtual address to avoid aliasing in the I$ on AMD F15h. @@ -100,8 +102,8 @@ out: return error; } -static void find_start_end(unsigned long flags, unsigned long *begin, - unsigned long *end) +static void find_start_end(unsigned long addr, unsigned long flags, + unsigned long *begin, unsigned long *end) { if (!in_compat_syscall() && (flags & MAP_32BIT)) { /* This is usually used needed to map code in small @@ -120,7 +122,10 @@ static void find_start_end(unsigned long flags, unsigned long *begin, } *begin = get_mmap_base(1); - *end = in_compat_syscall() ? 
tasksize_32bit() : tasksize_64bit(); + if (in_compat_syscall()) + *end = task_size_32bit(); + else + *end = task_size_64bit(addr > DEFAULT_MAP_WINDOW); } unsigned long @@ -132,10 +137,14 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, struct vm_unmapped_area_info info; unsigned long begin, end; + addr = mpx_unmapped_area_check(addr, len, flags); + if (IS_ERR_VALUE(addr)) + return addr; + if (flags & MAP_FIXED) return addr; - find_start_end(flags, &begin, &end); + find_start_end(addr, flags, &begin, &end); if (len > end) return -ENOMEM; @@ -171,6 +180,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, unsigned long addr = addr0; struct vm_unmapped_area_info info; + addr = mpx_unmapped_area_check(addr, len, flags); + if (IS_ERR_VALUE(addr)) + return addr; + /* requested length too big for entire address space */ if (len > TASK_SIZE) return -ENOMEM; @@ -195,6 +208,16 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = get_mmap_base(0); + + /* + * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area + * in the full address space. + * + * !in_compat_syscall() check to avoid high addresses for x32. + */ + if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall()) + info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; + info.align_mask = 0; info.align_offset = pgoff << PAGE_SHIFT; if (filp) { diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index d39c09119db6..879af864d99a 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 1991,1992,1995 Linus Torvalds * Copyright (c) 1994 Alan Modra @@ -66,7 +67,7 @@ static struct irqaction irq0 = { .name = "timer" }; -void __init setup_default_timer_irq(void) +static void __init setup_default_timer_irq(void) { if (!nr_legacy_irqs()) return; diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index dcd699baea1b..9a9c9b076955 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/sched.h> @@ -93,7 +94,7 @@ static void set_tls_desc(struct task_struct *p, int idx, while (n-- > 0) { if (LDT_empty(info) || LDT_zero(info)) { - desc->a = desc->b = 0; + memset(desc, 0, sizeof(*desc)); } else { fill_ldt(desc, info); diff --git a/arch/x86/kernel/trace_clock.c b/arch/x86/kernel/trace_clock.c index 80bb24d9b880..b8e7abe00b06 100644 --- a/arch/x86/kernel/trace_clock.c +++ b/arch/x86/kernel/trace_clock.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * X86 trace clocks */ diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c index 15515132bf0d..5bd30c442794 100644 --- a/arch/x86/kernel/tracepoint.c +++ b/arch/x86/kernel/tracepoint.c @@ -1,60 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Code for supporting irq vector tracepoints. * * Copyright (C) 2013 Seiji Aguchi <seiji.aguchi@hds.com> * */ -#include <asm/hw_irq.h> -#include <asm/desc.h> +#include <linux/jump_label.h> #include <linux/atomic.h> -atomic_t trace_idt_ctr = ATOMIC_INIT(0); -struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1, - (unsigned long) trace_idt_table }; - -/* No need to be aligned, but done to keep all IDTs defined the same way. 
*/ -gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss; +#include <asm/hw_irq.h> +#include <asm/desc.h> -static int trace_irq_vector_refcount; -static DEFINE_MUTEX(irq_vector_mutex); +DEFINE_STATIC_KEY_FALSE(trace_pagefault_key); -static void set_trace_idt_ctr(int val) +int trace_pagefault_reg(void) { - atomic_set(&trace_idt_ctr, val); - /* Ensure the trace_idt_ctr is set before sending IPI */ - wmb(); + static_branch_inc(&trace_pagefault_key); + return 0; } -static void switch_idt(void *arg) +void trace_pagefault_unreg(void) { - unsigned long flags; - - local_irq_save(flags); - load_current_idt(); - local_irq_restore(flags); + static_branch_dec(&trace_pagefault_key); } -int trace_irq_vector_regfunc(void) +#ifdef CONFIG_SMP + +DEFINE_STATIC_KEY_FALSE(trace_resched_ipi_key); + +int trace_resched_ipi_reg(void) { - mutex_lock(&irq_vector_mutex); - if (!trace_irq_vector_refcount) { - set_trace_idt_ctr(1); - smp_call_function(switch_idt, NULL, 0); - switch_idt(NULL); - } - trace_irq_vector_refcount++; - mutex_unlock(&irq_vector_mutex); + static_branch_inc(&trace_resched_ipi_key); return 0; } -void trace_irq_vector_unregfunc(void) +void trace_resched_ipi_unreg(void) { - mutex_lock(&irq_vector_mutex); - trace_irq_vector_refcount--; - if (!trace_irq_vector_refcount) { - set_trace_idt_ctr(0); - smp_call_function(switch_idt, NULL, 0); - switch_idt(NULL); - } - mutex_unlock(&irq_vector_mutex); + static_branch_dec(&trace_resched_ipi_key); } + +#endif diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index bf54309b85da..67db4f43309e 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -38,11 +38,6 @@ #include <linux/smp.h> #include <linux/io.h> -#ifdef CONFIG_EISA -#include <linux/ioport.h> -#include <linux/eisa.h> -#endif - #if defined(CONFIG_EDAC) #include <linux/edac.h> #endif @@ -70,20 +65,13 @@ #include <asm/x86_init.h> #include <asm/pgalloc.h> #include <asm/proto.h> - -/* No need to be aligned, but done to keep all IDTs defined the same way. */ -gate_desc debug_idt_table[NR_VECTORS] __page_aligned_bss; #else #include <asm/processor-flags.h> #include <asm/setup.h> #include <asm/proto.h> #endif -/* Must be page-aligned because the real IDT is used in a fixmap. */ -gate_desc idt_table[NR_VECTORS] __page_aligned_bss; - DECLARE_BITMAP(used_vectors, NR_VECTORS); -EXPORT_SYMBOL_GPL(used_vectors); static inline void cond_local_irq_enable(struct pt_regs *regs) { @@ -154,7 +142,7 @@ void ist_begin_non_atomic(struct pt_regs *regs) * from double_fault. */ BUG_ON((unsigned long)(current_top_of_stack() - - current_stack_pointer()) >= THREAD_SIZE); + current_stack_pointer) >= THREAD_SIZE); preempt_enable_no_resched(); } @@ -935,87 +923,9 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) } #endif -/* Set of traps needed for early debugging. */ -void __init early_trap_init(void) -{ - /* - * Don't use IST to set DEBUG_STACK as it doesn't work until TSS - * is ready in cpu_init() <-- trap_init(). Before trap_init(), - * CPU runs at ring 0 so it is impossible to hit an invalid - * stack. Using the original stack works well enough at this - * early stage. DEBUG_STACK will be equipped after cpu_init() in - * trap_init(). - * - * We don't need to set trace_idt_table like set_intr_gate(), - * since we don't have trace_debug and it will be reset to - * 'debug' in trap_init() by set_intr_gate_ist(). 
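As an illustrative aside rather than part of the patch: the tracepoint.c rewrite above drops the trace-IDT switching entirely and gates the trace calls with static keys. A minimal sketch of that pattern follows, with hypothetical names; the *_enabled() helper is assumed to live in a header, as trace_resched_ipi_enabled() used by the smp.c hunk earlier apparently does.

#include <linux/jump_label.h>

/*
 * False by default: the hot-path branch is patched out until a tracer
 * registers, so untraced kernels pay essentially nothing.
 */
DEFINE_STATIC_KEY_FALSE(example_trace_key);

static inline bool example_trace_enabled(void)
{
	return static_branch_unlikely(&example_trace_key);
}

int example_trace_reg(void)
{
	static_branch_inc(&example_trace_key);	/* patch the branch in */
	return 0;
}

void example_trace_unreg(void)
{
	static_branch_dec(&example_trace_key);	/* patch it back out */
}

Compared with the removed code, enabling a tracepoint no longer swaps IDTs or broadcasts an IPI; it only flips a jump label.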
- */ - set_intr_gate_notrace(X86_TRAP_DB, debug); - /* int3 can be called from all */ - set_system_intr_gate(X86_TRAP_BP, &int3); -#ifdef CONFIG_X86_32 - set_intr_gate(X86_TRAP_PF, page_fault); -#endif - load_idt(&idt_descr); -} - -void __init early_trap_pf_init(void) -{ -#ifdef CONFIG_X86_64 - set_intr_gate(X86_TRAP_PF, page_fault); -#endif -} - void __init trap_init(void) { - int i; - -#ifdef CONFIG_EISA - void __iomem *p = early_ioremap(0x0FFFD9, 4); - - if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24)) - EISA_bus = 1; - early_iounmap(p, 4); -#endif - - set_intr_gate(X86_TRAP_DE, divide_error); - set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK); - /* int4 can be called from all */ - set_system_intr_gate(X86_TRAP_OF, &overflow); - set_intr_gate(X86_TRAP_BR, bounds); - set_intr_gate(X86_TRAP_UD, invalid_op); - set_intr_gate(X86_TRAP_NM, device_not_available); -#ifdef CONFIG_X86_32 - set_task_gate(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS); -#else - set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK); -#endif - set_intr_gate(X86_TRAP_OLD_MF, coprocessor_segment_overrun); - set_intr_gate(X86_TRAP_TS, invalid_TSS); - set_intr_gate(X86_TRAP_NP, segment_not_present); - set_intr_gate(X86_TRAP_SS, stack_segment); - set_intr_gate(X86_TRAP_GP, general_protection); - set_intr_gate(X86_TRAP_SPURIOUS, spurious_interrupt_bug); - set_intr_gate(X86_TRAP_MF, coprocessor_error); - set_intr_gate(X86_TRAP_AC, alignment_check); -#ifdef CONFIG_X86_MCE - set_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK); -#endif - set_intr_gate(X86_TRAP_XF, simd_coprocessor_error); - - /* Reserve all the builtin and the syscall vector: */ - for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) - set_bit(i, used_vectors); - -#ifdef CONFIG_IA32_EMULATION - set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_compat); - set_bit(IA32_SYSCALL_VECTOR, used_vectors); -#endif - -#ifdef CONFIG_X86_32 - set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_32); - set_bit(IA32_SYSCALL_VECTOR, used_vectors); -#endif + idt_setup_traps(); /* * Set the IDT descriptor to a fixed read-only location, so that the @@ -1030,20 +940,9 @@ void __init trap_init(void) */ cpu_init(); - /* - * X86_TRAP_DB and X86_TRAP_BP have been set - * in early_trap_init(). However, ITS works only after - * cpu_init() loads TSS. See comments in early_trap_init(). - */ - set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); - /* int3 can be called from all */ - set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); + idt_setup_ist_traps(); x86_init.irqs.trap_init(); -#ifdef CONFIG_X86_64 - memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16); - set_nmi_gate(X86_TRAP_DB, &debug); - set_nmi_gate(X86_TRAP_BP, &int3); -#endif + idt_setup_debugidt_traps(); } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 714dfba6a1e7..796d96bb0821 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -51,115 +51,34 @@ static u32 art_to_tsc_denominator; static u64 art_to_tsc_offset; struct clocksource *art_related_clocksource; -/* - * Use a ring-buffer like data structure, where a writer advances the head by - * writing a new data entry and a reader advances the tail when it observes a - * new entry. - * - * Writers are made to wait on readers until there's space to write a new - * entry. - * - * This means that we can always use an {offset, mul} pair to compute a ns - * value that is 'roughly' in the right direction, even if we're writing a new - * {offset, mul} pair during the clock read. 
- * - * The down-side is that we can no longer guarantee strict monotonicity anymore - * (assuming the TSC was that to begin with), because while we compute the - * intersection point of the two clock slopes and make sure the time is - * continuous at the point of switching; we can no longer guarantee a reader is - * strictly before or after the switch point. - * - * It does mean a reader no longer needs to disable IRQs in order to avoid - * CPU-Freq updates messing with his times, and similarly an NMI reader will - * no longer run the risk of hitting half-written state. - */ - struct cyc2ns { - struct cyc2ns_data data[2]; /* 0 + 2*24 = 48 */ - struct cyc2ns_data *head; /* 48 + 8 = 56 */ - struct cyc2ns_data *tail; /* 56 + 8 = 64 */ -}; /* exactly fits one cacheline */ - -static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); - -struct cyc2ns_data *cyc2ns_read_begin(void) -{ - struct cyc2ns_data *head; - - preempt_disable(); + struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */ + seqcount_t seq; /* 32 + 4 = 36 */ - head = this_cpu_read(cyc2ns.head); - /* - * Ensure we observe the entry when we observe the pointer to it. - * matches the wmb from cyc2ns_write_end(). - */ - smp_read_barrier_depends(); - head->__count++; - barrier(); - - return head; -} +}; /* fits one cacheline */ -void cyc2ns_read_end(struct cyc2ns_data *head) -{ - barrier(); - /* - * If we're the outer most nested read; update the tail pointer - * when we're done. This notifies possible pending writers - * that we've observed the head pointer and that the other - * entry is now free. - */ - if (!--head->__count) { - /* - * x86-TSO does not reorder writes with older reads; - * therefore once this write becomes visible to another - * cpu, we must be finished reading the cyc2ns_data. - * - * matches with cyc2ns_write_begin(). - */ - this_cpu_write(cyc2ns.tail, head); - } - preempt_enable(); -} +static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); -/* - * Begin writing a new @data entry for @cpu. - * - * Assumes some sort of write side lock; currently 'provided' by the assumption - * that cpufreq will call its notifiers sequentially. - */ -static struct cyc2ns_data *cyc2ns_write_begin(int cpu) +void cyc2ns_read_begin(struct cyc2ns_data *data) { - struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); - struct cyc2ns_data *data = c2n->data; + int seq, idx; - if (data == c2n->head) - data++; + preempt_disable_notrace(); - /* XXX send an IPI to @cpu in order to guarantee a read? */ + do { + seq = this_cpu_read(cyc2ns.seq.sequence); + idx = seq & 1; - /* - * When we observe the tail write from cyc2ns_read_end(), - * the cpu must be done with that entry and its safe - * to start writing to it. - */ - while (c2n->tail == data) - cpu_relax(); + data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset); + data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul); + data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift); - return data; + } while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence))); } -static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data) +void cyc2ns_read_end(void) { - struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); - - /* - * Ensure the @data writes are visible before we publish the - * entry. Matches the data-depencency in cyc2ns_read_begin(). 
- */ - smp_wmb(); - - ACCESS_ONCE(c2n->head) = data; + preempt_enable_notrace(); } /* @@ -191,7 +110,6 @@ static void cyc2ns_data_init(struct cyc2ns_data *data) data->cyc2ns_mul = 0; data->cyc2ns_shift = 0; data->cyc2ns_offset = 0; - data->__count = 0; } static void cyc2ns_init(int cpu) @@ -201,51 +119,29 @@ static void cyc2ns_init(int cpu) cyc2ns_data_init(&c2n->data[0]); cyc2ns_data_init(&c2n->data[1]); - c2n->head = c2n->data; - c2n->tail = c2n->data; + seqcount_init(&c2n->seq); } static inline unsigned long long cycles_2_ns(unsigned long long cyc) { - struct cyc2ns_data *data, *tail; + struct cyc2ns_data data; unsigned long long ns; - /* - * See cyc2ns_read_*() for details; replicated in order to avoid - * an extra few instructions that came with the abstraction. - * Notable, it allows us to only do the __count and tail update - * dance when its actually needed. - */ - - preempt_disable_notrace(); - data = this_cpu_read(cyc2ns.head); - tail = this_cpu_read(cyc2ns.tail); - - if (likely(data == tail)) { - ns = data->cyc2ns_offset; - ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift); - } else { - data->__count++; - - barrier(); + cyc2ns_read_begin(&data); - ns = data->cyc2ns_offset; - ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift); + ns = data.cyc2ns_offset; + ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift); - barrier(); - - if (!--data->__count) - this_cpu_write(cyc2ns.tail, data); - } - preempt_enable_notrace(); + cyc2ns_read_end(); return ns; } -static void set_cyc2ns_scale(unsigned long khz, int cpu) +static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) { - unsigned long long tsc_now, ns_now; - struct cyc2ns_data *data; + unsigned long long ns_now; + struct cyc2ns_data data; + struct cyc2ns *c2n; unsigned long flags; local_irq_save(flags); @@ -254,9 +150,6 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu) if (!khz) goto done; - data = cyc2ns_write_begin(cpu); - - tsc_now = rdtsc(); ns_now = cycles_2_ns(tsc_now); /* @@ -264,7 +157,7 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu) * time function is continuous; see the comment near struct * cyc2ns_data. */ - clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, khz, + clocks_calc_mult_shift(&data.cyc2ns_mul, &data.cyc2ns_shift, khz, NSEC_PER_MSEC, 0); /* @@ -273,20 +166,26 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu) * conversion algorithm shifting a 32-bit value (now specifies a 64-bit * value) - refer perf_event_mmap_page documentation in perf_event.h. */ - if (data->cyc2ns_shift == 32) { - data->cyc2ns_shift = 31; - data->cyc2ns_mul >>= 1; + if (data.cyc2ns_shift == 32) { + data.cyc2ns_shift = 31; + data.cyc2ns_mul >>= 1; } - data->cyc2ns_offset = ns_now - - mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift); + data.cyc2ns_offset = ns_now - + mul_u64_u32_shr(tsc_now, data.cyc2ns_mul, data.cyc2ns_shift); - cyc2ns_write_end(cpu, data); + c2n = per_cpu_ptr(&cyc2ns, cpu); + + raw_write_seqcount_latch(&c2n->seq); + c2n->data[0] = data; + raw_write_seqcount_latch(&c2n->seq); + c2n->data[1] = data; done: - sched_clock_idle_wakeup_event(0); + sched_clock_idle_wakeup_event(); local_irq_restore(flags); } + /* * Scheduler clock - returns current time in nanosec units. 
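Illustrative aside, not part of the patch: the new cyc2ns code above keeps two copies of the {mul, shift, offset} data and publishes them through a latched sequence count, so readers (including NMIs) never block and never see a half-written pair. Below is a stand-alone user-space sketch of the same latch idea, with simplified memory ordering and a single writer assumed; the names are this example's own, not the kernel API.

#include <stdatomic.h>
#include <stdint.h>

struct scale {
	uint32_t mul;
	uint32_t shift;
	uint64_t offset;
};

struct latch {
	_Atomic unsigned int seq;
	struct scale data[2];
};

/*
 * Writer: while seq is odd, new readers are steered to data[1]; while it
 * is even, to data[0].  Stragglers that raced with an update are caught
 * by the reader's retry check.  Writers are assumed to be serialized
 * externally (cpufreq notifiers run sequentially in the kernel case).
 */
static void latch_write(struct latch *l, struct scale s)
{
	atomic_fetch_add(&l->seq, 1);	/* odd: readers pick data[1] (old copy) */
	l->data[0] = s;
	atomic_fetch_add(&l->seq, 1);	/* even: readers pick data[0] (new copy) */
	l->data[1] = s;
}

/*
 * Reader: copy the slot selected by seq; if seq moved meanwhile, the copy
 * may be torn, so discard it and retry.
 */
static struct scale latch_read(struct latch *l)
{
	struct scale s;
	unsigned int seq;

	do {
		seq = atomic_load(&l->seq);
		s = l->data[seq & 1];
	} while (seq != atomic_load(&l->seq));

	return s;
}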
*/ @@ -374,6 +273,8 @@ static int __init tsc_setup(char *str) tsc_clocksource_reliable = 1; if (!strncmp(str, "noirqtime", 9)) no_sched_irq_time = 1; + if (!strcmp(str, "unstable")) + mark_tsc_unstable("boot parameter"); return 1; } @@ -986,7 +887,6 @@ void tsc_restore_sched_clock_state(void) } #ifdef CONFIG_CPU_FREQ - /* Frequency scaling support. Adjust the TSC based timer when the cpu frequency * changes. * @@ -1027,7 +927,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, if (!(freq->flags & CPUFREQ_CONST_LOOPS)) mark_tsc_unstable("cpufreq changes"); - set_cyc2ns_scale(tsc_khz, freq->cpu); + set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc()); } return 0; @@ -1127,6 +1027,15 @@ static void tsc_cs_mark_unstable(struct clocksource *cs) pr_info("Marking TSC unstable due to clocksource watchdog\n"); } +static void tsc_cs_tick_stable(struct clocksource *cs) +{ + if (tsc_unstable) + return; + + if (using_native_sched_clock()) + sched_clock_tick_stable(); +} + /* * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc() */ @@ -1140,6 +1049,7 @@ static struct clocksource clocksource_tsc = { .archdata = { .vclock_mode = VCLOCK_TSC }, .resume = tsc_resume, .mark_unstable = tsc_cs_mark_unstable, + .tick_stable = tsc_cs_tick_stable, }; void mark_tsc_unstable(char *reason) @@ -1255,6 +1165,7 @@ static void tsc_refine_calibration_work(struct work_struct *work) static int hpet; u64 tsc_stop, ref_stop, delta; unsigned long freq; + int cpu; /* Don't bother refining TSC on unstable systems */ if (check_tsc_unstable()) @@ -1305,6 +1216,10 @@ static void tsc_refine_calibration_work(struct work_struct *work) /* Inform the TSC deadline clockevent devices about the recalibration */ lapic_update_tsc_freq(); + /* Update the sched_clock() rate to match the clocksource one */ + for_each_possible_cpu(cpu) + set_cyc2ns_scale(tsc_khz, cpu, tsc_stop); + out: if (boot_cpu_has(X86_FEATURE_ART)) art_related_clocksource = &clocksource_tsc; @@ -1350,7 +1265,7 @@ device_initcall(init_tsc_clocksource); void __init tsc_init(void) { - u64 lpj; + u64 lpj, cyc; int cpu; if (!boot_cpu_has(X86_FEATURE_TSC)) { @@ -1390,9 +1305,10 @@ void __init tsc_init(void) * speed as the bootup CPU. (cpufreq notifiers will fix this * up if their speed diverges) */ + cyc = rdtsc(); for_each_possible_cpu(cpu) { cyc2ns_init(cpu); - set_cyc2ns_scale(tsc_khz, cpu); + set_cyc2ns_scale(tsc_khz, cpu, cyc); } if (tsc_disabled > 0) @@ -1412,11 +1328,11 @@ void __init tsc_init(void) use_tsc_delay(); + check_system_tsc_reliable(); + if (unsynchronized_tsc()) mark_tsc_unstable("TSCs unsynchronized"); - check_system_tsc_reliable(); - detect_art(); } diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 728f75378475..e76a9881306b 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * check TSC synchronization. * @@ -71,13 +72,8 @@ static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval, * non zero. We don't do that on non boot cpus because physical * hotplug should have set the ADJUST register to a value > 0 so * the TSC is in sync with the already running cpus. - * - * But we always force positive ADJUST values. Otherwise the TSC - * deadline timer creates an interrupt storm. We also have to - * prevent values > 0x7FFFFFFF as those wreckage the timer as well. 
*/ - if ((bootcpu && bootval != 0) || (!bootcpu && bootval < 0) || - (bootval > 0x7FFFFFFF)) { + if (bootcpu && bootval != 0) { pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n", cpu, bootval); wrmsrl(MSR_IA32_TSC_ADJUST, 0); @@ -451,20 +447,6 @@ retry: */ cur->adjusted += cur_max_warp; - /* - * TSC deadline timer stops working or creates an interrupt storm - * with adjust values < 0 and > x07ffffff. - * - * To allow adjust values > 0x7FFFFFFF we need to disable the - * deadline timer and use the local APIC timer, but that requires - * more intrusive changes and we do not have any useful information - * from Intel about the underlying HW wreckage yet. - */ - if (cur->adjusted < 0) - cur->adjusted = 0; - if (cur->adjusted > 0x7FFFFFFF) - cur->adjusted = 0x7FFFFFFF; - pr_warn("TSC ADJUST compensate: CPU%u observed %lld warp. Adjust: %lld\n", cpu, cur_max_warp, cur->adjusted); diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index b9389d72b2f7..3dc26f95d46e 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -10,20 +10,22 @@ #define FRAME_HEADER_SIZE (sizeof(long) * 2) -/* - * This disables KASAN checking when reading a value from another task's stack, - * since the other task could be running on another CPU and could have poisoned - * the stack in the meantime. - */ -#define READ_ONCE_TASK_STACK(task, x) \ -({ \ - unsigned long val; \ - if (task == current) \ - val = READ_ONCE(x); \ - else \ - val = READ_ONCE_NOCHECK(x); \ - val; \ -}) +unsigned long unwind_get_return_address(struct unwind_state *state) +{ + if (unwind_done(state)) + return 0; + + return __kernel_text_address(state->ip) ? state->ip : 0; +} +EXPORT_SYMBOL_GPL(unwind_get_return_address); + +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +{ + if (unwind_done(state)) + return NULL; + + return state->regs ? &state->regs->ip : state->bp + 1; +} static void unwind_dump(struct unwind_state *state) { @@ -42,7 +44,8 @@ static void unwind_dump(struct unwind_state *state) state->stack_info.type, state->stack_info.next_sp, state->stack_mask, state->graph_idx); - for (sp = state->orig_sp; sp; sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { + for (sp = PTR_ALIGN(state->orig_sp, sizeof(long)); sp; + sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { if (get_stack_info(sp, state->task, &stack_info, &visit_mask)) break; @@ -66,15 +69,6 @@ static void unwind_dump(struct unwind_state *state) } } -unsigned long unwind_get_return_address(struct unwind_state *state) -{ - if (unwind_done(state)) - return 0; - - return __kernel_text_address(state->ip) ? state->ip : 0; -} -EXPORT_SYMBOL_GPL(unwind_get_return_address); - static size_t regs_size(struct pt_regs *regs) { /* x86_32 regs from kernel mode are two words shorter: */ @@ -91,10 +85,8 @@ static bool in_entry_code(unsigned long ip) if (addr >= __entry_text_start && addr < __entry_text_end) return true; -#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) if (addr >= __irqentry_text_start && addr < __irqentry_text_end) return true; -#endif return false; } @@ -183,6 +175,7 @@ static bool is_last_task_frame(struct unwind_state *state) * This determines if the frame pointer actually contains an encoded pointer to * pt_regs on the stack. See ENCODE_FRAME_POINTER. 
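Illustrative aside, not part of the patch: the decode_frame_pointer() variants in the hunk that follows undo ENCODE_FRAME_POINTER, which tags a saved frame pointer that actually points at a pt_regs block. On 64-bit the tag is bit 0 (pt_regs is word-aligned, so that bit is otherwise clear); on 32-bit the tag is a cleared high bit (kernel addresses normally have it set), which the #else branch ORs back in. A tiny stand-alone sketch of the 64-bit convention:

#include <stddef.h>
#include <stdint.h>

/*
 * Tag a pt_regs pointer so the unwinder can tell it apart from an
 * ordinary frame pointer saved by a function prologue.
 */
static inline uintptr_t encode_frame_pointer(void *regs)
{
	return (uintptr_t)regs | 0x1;
}

/* Matches the 64-bit decode in the following hunk: no tag, no regs. */
static inline void *decode_frame_pointer_example(uintptr_t bp)
{
	if (!(bp & 0x1))
		return NULL;

	return (void *)(bp & ~(uintptr_t)0x1);
}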
*/ +#ifdef CONFIG_X86_64 static struct pt_regs *decode_frame_pointer(unsigned long *bp) { unsigned long regs = (unsigned long)bp; @@ -192,6 +185,23 @@ static struct pt_regs *decode_frame_pointer(unsigned long *bp) return (struct pt_regs *)(regs & ~0x1); } +#else +static struct pt_regs *decode_frame_pointer(unsigned long *bp) +{ + unsigned long regs = (unsigned long)bp; + + if (regs & 0x80000000) + return NULL; + + return (struct pt_regs *)(regs | 0x80000000); +} +#endif + +#ifdef CONFIG_X86_32 +#define KERNEL_REGS_SIZE (sizeof(struct pt_regs) - 2*sizeof(long)) +#else +#define KERNEL_REGS_SIZE (sizeof(struct pt_regs)) +#endif static bool update_stack_state(struct unwind_state *state, unsigned long *next_bp) @@ -211,7 +221,7 @@ static bool update_stack_state(struct unwind_state *state, regs = decode_frame_pointer(next_bp); if (regs) { frame = (unsigned long *)regs; - len = regs_size(regs); + len = KERNEL_REGS_SIZE; state->got_irq = true; } else { frame = next_bp; @@ -235,6 +245,14 @@ static bool update_stack_state(struct unwind_state *state, frame < prev_frame_end) return false; + /* + * On 32-bit with user mode regs, make sure the last two regs are safe + * to access: + */ + if (IS_ENABLED(CONFIG_X86_32) && regs && user_mode(regs) && + !on_stack(info, frame, len + 2*sizeof(long))) + return false; + /* Move state to the next frame: */ if (regs) { state->regs = regs; @@ -337,6 +355,13 @@ bad_address: state->regs->sp < (unsigned long)task_pt_regs(state->task)) goto the_end; + /* + * There are some known frame pointer issues on 32-bit. Disable + * unwinder warnings on 32-bit until it gets objtool support. + */ + if (IS_ENABLED(CONFIG_X86_32)) + goto the_end; + if (state->regs) { printk_deferred_once(KERN_WARNING "WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n", diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c index 039f36738e49..4f0e17b90463 100644 --- a/arch/x86/kernel/unwind_guess.c +++ b/arch/x86/kernel/unwind_guess.c @@ -19,6 +19,11 @@ unsigned long unwind_get_return_address(struct unwind_state *state) } EXPORT_SYMBOL_GPL(unwind_get_return_address); +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +{ + return NULL; +} + bool unwind_next_frame(struct unwind_state *state) { struct stack_info *info = &state->stack_info; diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c new file mode 100644 index 000000000000..b95007e7c1b3 --- /dev/null +++ b/arch/x86/kernel/unwind_orc.c @@ -0,0 +1,583 @@ +#include <linux/module.h> +#include <linux/sort.h> +#include <asm/ptrace.h> +#include <asm/stacktrace.h> +#include <asm/unwind.h> +#include <asm/orc_types.h> +#include <asm/orc_lookup.h> +#include <asm/sections.h> + +#define orc_warn(fmt, ...) 
\ + printk_deferred_once(KERN_WARNING pr_fmt("WARNING: " fmt), ##__VA_ARGS__) + +extern int __start_orc_unwind_ip[]; +extern int __stop_orc_unwind_ip[]; +extern struct orc_entry __start_orc_unwind[]; +extern struct orc_entry __stop_orc_unwind[]; + +static DEFINE_MUTEX(sort_mutex); +int *cur_orc_ip_table = __start_orc_unwind_ip; +struct orc_entry *cur_orc_table = __start_orc_unwind; + +unsigned int lookup_num_blocks; +bool orc_init; + +static inline unsigned long orc_ip(const int *ip) +{ + return (unsigned long)ip + *ip; +} + +static struct orc_entry *__orc_find(int *ip_table, struct orc_entry *u_table, + unsigned int num_entries, unsigned long ip) +{ + int *first = ip_table; + int *last = ip_table + num_entries - 1; + int *mid = first, *found = first; + + if (!num_entries) + return NULL; + + /* + * Do a binary range search to find the rightmost duplicate of a given + * starting address. Some entries are section terminators which are + * "weak" entries for ensuring there are no gaps. They should be + * ignored when they conflict with a real entry. + */ + while (first <= last) { + mid = first + ((last - first) / 2); + + if (orc_ip(mid) <= ip) { + found = mid; + first = mid + 1; + } else + last = mid - 1; + } + + return u_table + (found - ip_table); +} + +#ifdef CONFIG_MODULES +static struct orc_entry *orc_module_find(unsigned long ip) +{ + struct module *mod; + + mod = __module_address(ip); + if (!mod || !mod->arch.orc_unwind || !mod->arch.orc_unwind_ip) + return NULL; + return __orc_find(mod->arch.orc_unwind_ip, mod->arch.orc_unwind, + mod->arch.num_orcs, ip); +} +#else +static struct orc_entry *orc_module_find(unsigned long ip) +{ + return NULL; +} +#endif + +static struct orc_entry *orc_find(unsigned long ip) +{ + if (!orc_init) + return NULL; + + /* For non-init vmlinux addresses, use the fast lookup table: */ + if (ip >= LOOKUP_START_IP && ip < LOOKUP_STOP_IP) { + unsigned int idx, start, stop; + + idx = (ip - LOOKUP_START_IP) / LOOKUP_BLOCK_SIZE; + + if (unlikely((idx >= lookup_num_blocks-1))) { + orc_warn("WARNING: bad lookup idx: idx=%u num=%u ip=%pB\n", + idx, lookup_num_blocks, (void *)ip); + return NULL; + } + + start = orc_lookup[idx]; + stop = orc_lookup[idx + 1] + 1; + + if (unlikely((__start_orc_unwind + start >= __stop_orc_unwind) || + (__start_orc_unwind + stop > __stop_orc_unwind))) { + orc_warn("WARNING: bad lookup value: idx=%u num=%u start=%u stop=%u ip=%pB\n", + idx, lookup_num_blocks, start, stop, (void *)ip); + return NULL; + } + + return __orc_find(__start_orc_unwind_ip + start, + __start_orc_unwind + start, stop - start, ip); + } + + /* vmlinux .init slow lookup: */ + if (ip >= (unsigned long)_sinittext && ip < (unsigned long)_einittext) + return __orc_find(__start_orc_unwind_ip, __start_orc_unwind, + __stop_orc_unwind_ip - __start_orc_unwind_ip, ip); + + /* Module lookup: */ + return orc_module_find(ip); +} + +static void orc_sort_swap(void *_a, void *_b, int size) +{ + struct orc_entry *orc_a, *orc_b; + struct orc_entry orc_tmp; + int *a = _a, *b = _b, tmp; + int delta = _b - _a; + + /* Swap the .orc_unwind_ip entries: */ + tmp = *a; + *a = *b + delta; + *b = tmp - delta; + + /* Swap the corresponding .orc_unwind entries: */ + orc_a = cur_orc_table + (a - cur_orc_ip_table); + orc_b = cur_orc_table + (b - cur_orc_ip_table); + orc_tmp = *orc_a; + *orc_a = *orc_b; + *orc_b = orc_tmp; +} + +static int orc_sort_cmp(const void *_a, const void *_b) +{ + struct orc_entry *orc_a; + const int *a = _a, *b = _b; + unsigned long a_val = orc_ip(a); + unsigned long b_val = 
orc_ip(b); + + if (a_val > b_val) + return 1; + if (a_val < b_val) + return -1; + + /* + * The "weak" section terminator entries need to always be on the left + * to ensure the lookup code skips them in favor of real entries. + * These terminator entries exist to handle any gaps created by + * whitelisted .o files which didn't get objtool generation. + */ + orc_a = cur_orc_table + (a - cur_orc_ip_table); + return orc_a->sp_reg == ORC_REG_UNDEFINED ? -1 : 1; +} + +#ifdef CONFIG_MODULES +void unwind_module_init(struct module *mod, void *_orc_ip, size_t orc_ip_size, + void *_orc, size_t orc_size) +{ + int *orc_ip = _orc_ip; + struct orc_entry *orc = _orc; + unsigned int num_entries = orc_ip_size / sizeof(int); + + WARN_ON_ONCE(orc_ip_size % sizeof(int) != 0 || + orc_size % sizeof(*orc) != 0 || + num_entries != orc_size / sizeof(*orc)); + + /* + * The 'cur_orc_*' globals allow the orc_sort_swap() callback to + * associate an .orc_unwind_ip table entry with its corresponding + * .orc_unwind entry so they can both be swapped. + */ + mutex_lock(&sort_mutex); + cur_orc_ip_table = orc_ip; + cur_orc_table = orc; + sort(orc_ip, num_entries, sizeof(int), orc_sort_cmp, orc_sort_swap); + mutex_unlock(&sort_mutex); + + mod->arch.orc_unwind_ip = orc_ip; + mod->arch.orc_unwind = orc; + mod->arch.num_orcs = num_entries; +} +#endif + +void __init unwind_init(void) +{ + size_t orc_ip_size = (void *)__stop_orc_unwind_ip - (void *)__start_orc_unwind_ip; + size_t orc_size = (void *)__stop_orc_unwind - (void *)__start_orc_unwind; + size_t num_entries = orc_ip_size / sizeof(int); + struct orc_entry *orc; + int i; + + if (!num_entries || orc_ip_size % sizeof(int) != 0 || + orc_size % sizeof(struct orc_entry) != 0 || + num_entries != orc_size / sizeof(struct orc_entry)) { + orc_warn("WARNING: Bad or missing .orc_unwind table. Disabling unwinder.\n"); + return; + } + + /* Sort the .orc_unwind and .orc_unwind_ip tables: */ + sort(__start_orc_unwind_ip, num_entries, sizeof(int), orc_sort_cmp, + orc_sort_swap); + + /* Initialize the fast lookup table: */ + lookup_num_blocks = orc_lookup_end - orc_lookup; + for (i = 0; i < lookup_num_blocks-1; i++) { + orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind, + num_entries, + LOOKUP_START_IP + (LOOKUP_BLOCK_SIZE * i)); + if (!orc) { + orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n"); + return; + } + + orc_lookup[i] = orc - __start_orc_unwind; + } + + /* Initialize the ending block: */ + orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind, num_entries, + LOOKUP_STOP_IP); + if (!orc) { + orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n"); + return; + } + orc_lookup[lookup_num_blocks-1] = orc - __start_orc_unwind; + + orc_init = true; +} + +unsigned long unwind_get_return_address(struct unwind_state *state) +{ + if (unwind_done(state)) + return 0; + + return __kernel_text_address(state->ip) ? state->ip : 0; +} +EXPORT_SYMBOL_GPL(unwind_get_return_address); + +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +{ + if (unwind_done(state)) + return NULL; + + if (state->regs) + return &state->regs->ip; + + if (state->sp) + return (unsigned long *)state->sp - 1; + + return NULL; +} + +static bool stack_access_ok(struct unwind_state *state, unsigned long addr, + size_t len) +{ + struct stack_info *info = &state->stack_info; + + /* + * If the address isn't on the current stack, switch to the next one. 
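Aside (illustrative, not part of the patch): __orc_find() above performs a binary range search that returns the rightmost entry whose start address is <= ip, so that duplicates and the "weak" section-terminator entries sorted to their left are skipped. A stand-alone version of that search over a plain sorted array, assuming n > 0 and ascending order (names are this example's own):

#include <stddef.h>

static size_t find_rightmost_le(const unsigned long *starts, size_t n,
				unsigned long ip)
{
	size_t first = 0, last = n - 1, found = 0;

	while (first <= last) {
		size_t mid = first + (last - first) / 2;

		if (starts[mid] <= ip) {
			/* Candidate: remember it, keep looking to the right. */
			found = mid;
			first = mid + 1;
		} else {
			if (mid == 0)
				break;	/* avoid size_t underflow */
			last = mid - 1;
		}
	}

	/* Like __orc_find(), this returns the first entry if nothing is <= ip. */
	return found;
}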
+ * + * We may have to traverse multiple stacks to deal with the possibility + * that info->next_sp could point to an empty stack and the address + * could be on a subsequent stack. + */ + while (!on_stack(info, (void *)addr, len)) + if (get_stack_info(info->next_sp, state->task, info, + &state->stack_mask)) + return false; + + return true; +} + +static bool deref_stack_reg(struct unwind_state *state, unsigned long addr, + unsigned long *val) +{ + if (!stack_access_ok(state, addr, sizeof(long))) + return false; + + *val = READ_ONCE_TASK_STACK(state->task, *(unsigned long *)addr); + return true; +} + +#define REGS_SIZE (sizeof(struct pt_regs)) +#define SP_OFFSET (offsetof(struct pt_regs, sp)) +#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip)) +#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip)) + +static bool deref_stack_regs(struct unwind_state *state, unsigned long addr, + unsigned long *ip, unsigned long *sp, bool full) +{ + size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE; + size_t sp_offset = full ? SP_OFFSET : IRET_SP_OFFSET; + struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE); + + if (IS_ENABLED(CONFIG_X86_64)) { + if (!stack_access_ok(state, addr, regs_size)) + return false; + + *ip = regs->ip; + *sp = regs->sp; + + return true; + } + + if (!stack_access_ok(state, addr, sp_offset)) + return false; + + *ip = regs->ip; + + if (user_mode(regs)) { + if (!stack_access_ok(state, addr + sp_offset, + REGS_SIZE - SP_OFFSET)) + return false; + + *sp = regs->sp; + } else + *sp = (unsigned long)&regs->sp; + + return true; +} + +bool unwind_next_frame(struct unwind_state *state) +{ + unsigned long ip_p, sp, orig_ip, prev_sp = state->sp; + enum stack_type prev_type = state->stack_info.type; + struct orc_entry *orc; + struct pt_regs *ptregs; + bool indirect = false; + + if (unwind_done(state)) + return false; + + /* Don't let modules unload while we're reading their ORC data. */ + preempt_disable(); + + /* Have we reached the end? */ + if (state->regs && user_mode(state->regs)) + goto done; + + /* + * Find the orc_entry associated with the text address. + * + * Decrement call return addresses by one so they work for sibling + * calls and calls to noreturn functions. + */ + orc = orc_find(state->signal ? 
state->ip : state->ip - 1); + if (!orc || orc->sp_reg == ORC_REG_UNDEFINED) + goto done; + orig_ip = state->ip; + + /* Find the previous frame's stack: */ + switch (orc->sp_reg) { + case ORC_REG_SP: + sp = state->sp + orc->sp_offset; + break; + + case ORC_REG_BP: + sp = state->bp + orc->sp_offset; + break; + + case ORC_REG_SP_INDIRECT: + sp = state->sp + orc->sp_offset; + indirect = true; + break; + + case ORC_REG_BP_INDIRECT: + sp = state->bp + orc->sp_offset; + indirect = true; + break; + + case ORC_REG_R10: + if (!state->regs || !state->full_regs) { + orc_warn("missing regs for base reg R10 at ip %pB\n", + (void *)state->ip); + goto done; + } + sp = state->regs->r10; + break; + + case ORC_REG_R13: + if (!state->regs || !state->full_regs) { + orc_warn("missing regs for base reg R13 at ip %pB\n", + (void *)state->ip); + goto done; + } + sp = state->regs->r13; + break; + + case ORC_REG_DI: + if (!state->regs || !state->full_regs) { + orc_warn("missing regs for base reg DI at ip %pB\n", + (void *)state->ip); + goto done; + } + sp = state->regs->di; + break; + + case ORC_REG_DX: + if (!state->regs || !state->full_regs) { + orc_warn("missing regs for base reg DX at ip %pB\n", + (void *)state->ip); + goto done; + } + sp = state->regs->dx; + break; + + default: + orc_warn("unknown SP base reg %d for ip %pB\n", + orc->sp_reg, (void *)state->ip); + goto done; + } + + if (indirect) { + if (!deref_stack_reg(state, sp, &sp)) + goto done; + } + + /* Find IP, SP and possibly regs: */ + switch (orc->type) { + case ORC_TYPE_CALL: + ip_p = sp - sizeof(long); + + if (!deref_stack_reg(state, ip_p, &state->ip)) + goto done; + + state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, + state->ip, (void *)ip_p); + + state->sp = sp; + state->regs = NULL; + state->signal = false; + break; + + case ORC_TYPE_REGS: + if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) { + orc_warn("can't dereference registers at %p for ip %pB\n", + (void *)sp, (void *)orig_ip); + goto done; + } + + state->regs = (struct pt_regs *)sp; + state->full_regs = true; + state->signal = true; + break; + + case ORC_TYPE_REGS_IRET: + if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) { + orc_warn("can't dereference iret registers at %p for ip %pB\n", + (void *)sp, (void *)orig_ip); + goto done; + } + + ptregs = container_of((void *)sp, struct pt_regs, ip); + if ((unsigned long)ptregs >= prev_sp && + on_stack(&state->stack_info, ptregs, REGS_SIZE)) { + state->regs = ptregs; + state->full_regs = false; + } else + state->regs = NULL; + + state->signal = true; + break; + + default: + orc_warn("unknown .orc_unwind entry type %d for ip %pB\n", + orc->type, (void *)orig_ip); + break; + } + + /* Find BP: */ + switch (orc->bp_reg) { + case ORC_REG_UNDEFINED: + if (state->regs && state->full_regs) + state->bp = state->regs->bp; + break; + + case ORC_REG_PREV_SP: + if (!deref_stack_reg(state, sp + orc->bp_offset, &state->bp)) + goto done; + break; + + case ORC_REG_BP: + if (!deref_stack_reg(state, state->bp + orc->bp_offset, &state->bp)) + goto done; + break; + + default: + orc_warn("unknown BP base reg %d for ip %pB\n", + orc->bp_reg, (void *)orig_ip); + goto done; + } + + /* Prevent a recursive loop due to bad ORC data: */ + if (state->stack_info.type == prev_type && + on_stack(&state->stack_info, (void *)state->sp, sizeof(long)) && + state->sp <= prev_sp) { + orc_warn("stack going in the wrong direction? 
ip=%pB\n", + (void *)orig_ip); + goto done; + } + + preempt_enable(); + return true; + +done: + preempt_enable(); + state->stack_info.type = STACK_TYPE_UNKNOWN; + return false; +} +EXPORT_SYMBOL_GPL(unwind_next_frame); + +void __unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame) +{ + memset(state, 0, sizeof(*state)); + state->task = task; + + /* + * Refuse to unwind the stack of a task while it's executing on another + * CPU. This check is racy, but that's ok: the unwinder has other + * checks to prevent it from going off the rails. + */ + if (task_on_another_cpu(task)) + goto done; + + if (regs) { + if (user_mode(regs)) + goto done; + + state->ip = regs->ip; + state->sp = kernel_stack_pointer(regs); + state->bp = regs->bp; + state->regs = regs; + state->full_regs = true; + state->signal = true; + + } else if (task == current) { + asm volatile("lea (%%rip), %0\n\t" + "mov %%rsp, %1\n\t" + "mov %%rbp, %2\n\t" + : "=r" (state->ip), "=r" (state->sp), + "=r" (state->bp)); + + } else { + struct inactive_task_frame *frame = (void *)task->thread.sp; + + state->sp = task->thread.sp; + state->bp = READ_ONCE_NOCHECK(frame->bp); + state->ip = READ_ONCE_NOCHECK(frame->ret_addr); + } + + if (get_stack_info((unsigned long *)state->sp, state->task, + &state->stack_info, &state->stack_mask)) + return; + + /* + * The caller can provide the address of the first frame directly + * (first_frame) or indirectly (regs->sp) to indicate which stack frame + * to start unwinding at. Skip ahead until we reach it. + */ + + /* When starting from regs, skip the regs frame: */ + if (regs) { + unwind_next_frame(state); + return; + } + + /* Otherwise, skip ahead to the user-specified starting frame: */ + while (!unwind_done(state) && + (!on_stack(&state->stack_info, first_frame, sizeof(long)) || + state->sp <= (unsigned long)first_frame)) + unwind_next_frame(state); + + return; + +done: + state->stack_info.type = STACK_TYPE_UNKNOWN; + return; +} +EXPORT_SYMBOL_GPL(__unwind_start); diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 7924a5356c8a..68244742ecb0 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1994 Linus Torvalds * diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index c8a3b61be0aa..a4009fb9be87 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * ld script for the x86 kernel * @@ -24,6 +25,7 @@ #include <asm/asm-offsets.h> #include <asm/thread_info.h> #include <asm/page_types.h> +#include <asm/orc_lookup.h> #include <asm/cache.h> #include <asm/boot.h> @@ -148,6 +150,8 @@ SECTIONS BUG_TABLE + ORC_UNWIND_TABLE + . = ALIGN(PAGE_SIZE); __vvar_page = .; diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 760433b2574a..3df51c287844 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # KVM configuration # @@ -22,7 +23,8 @@ config KVM depends on HAVE_KVM depends on HIGH_RES_TIMERS # for TASKSTATS/TASK_DELAY_ACCT: - depends on NET + depends on NET && MULTIUSER + depends on X86_LOCAL_APIC select PREEMPT_NOTIFIERS select MMU_NOTIFIER select ANON_INODES @@ -89,6 +91,5 @@ config KVM_MMU_AUDIT # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. 
source drivers/vhost/Kconfig -source drivers/lguest/Kconfig endif # VIRTUALIZATION diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 09d4b17be022..dc4f2fdf5e57 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 ccflags-y += -Iarch/x86/kvm diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 59ca2eea522c..0099e10eb045 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -126,16 +126,20 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); /* - * The existing code assumes virtual address is 48-bit in the canonical - * address checks; exit if it is ever changed. + * The existing code assumes virtual address is 48-bit or 57-bit in the + * canonical address checks; exit if it is ever changed. */ best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); - if (best && ((best->eax & 0xff00) >> 8) != 48 && - ((best->eax & 0xff00) >> 8) != 0) - return -EINVAL; + if (best) { + int vaddr_bits = (best->eax & 0xff00) >> 8; + + if (vaddr_bits != 48 && vaddr_bits != 57 && vaddr_bits != 0) + return -EINVAL; + } /* Update physical-address width */ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); + kvm_mmu_reset_context(vcpu); kvm_pmu_refresh(vcpu); return 0; @@ -383,7 +387,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ecx*/ const u32 kvm_cpuid_7_0_ecx_x86_features = - F(AVX512VBMI) | F(PKU) | 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ); + F(AVX512VBMI) | F(LA57) | F(PKU) | + 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ); /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = @@ -469,7 +474,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->ecx &= kvm_cpuid_7_0_ecx_x86_features; cpuid_mask(&entry->ecx, CPUID_7_ECX); /* PKU is not yet implemented for shadow paging. 
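The 48/57-bit check above generalizes the canonical-address rule for 5-level paging (LA57). Below is a standalone model of that rule, written with the usual sign-extension trick that the get_canonical()/emul_is_noncanonical_address() helpers used later in this diff rely on; the function names here are local to the sketch.

#include <stdbool.h>
#include <stdint.h>

/*
 * An address is canonical for N virtual-address bits when sign-extending
 * bit N-1 through bit 63 reproduces the original value; N is 48 for
 * 4-level paging and 57 with LA57 enabled.
 */
static uint64_t canonical_sketch(uint64_t la, unsigned int vaddr_bits)
{
	return (uint64_t)(((int64_t)la << (64 - vaddr_bits)) >> (64 - vaddr_bits));
}

static bool is_noncanonical_sketch(uint64_t la, unsigned int vaddr_bits)
{
	return canonical_sketch(la, vaddr_bits) != la;
}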
*/ - if (!tdp_enabled) + if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) entry->ecx &= ~F(PKU); entry->edx &= kvm_cpuid_7_0_edx_x86_features; entry->edx &= get_scattered_cpuid_leaf(7, 0, CPUID_EDX); @@ -853,16 +858,24 @@ static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu, return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index); } -void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) +bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, + u32 *ecx, u32 *edx, bool check_limit) { u32 function = *eax, index = *ecx; struct kvm_cpuid_entry2 *best; + bool entry_found = true; best = kvm_find_cpuid_entry(vcpu, function, index); - if (!best) + if (!best) { + entry_found = false; + if (!check_limit) + goto out; + best = check_cpuid_limit(vcpu, function, index); + } +out: if (best) { *eax = best->eax; *ebx = best->ebx; @@ -870,7 +883,8 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) *edx = best->edx; } else *eax = *ebx = *ecx = *edx = 0; - trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx); + trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, entry_found); + return entry_found; } EXPORT_SYMBOL_GPL(kvm_cpuid); @@ -883,7 +897,7 @@ int kvm_emulate_cpuid(struct kvm_vcpu *vcpu) eax = kvm_register_read(vcpu, VCPU_REGS_RAX); ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); - kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx); + kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, true); kvm_register_write(vcpu, VCPU_REGS_RAX, eax); kvm_register_write(vcpu, VCPU_REGS_RBX, ebx); kvm_register_write(vcpu, VCPU_REGS_RCX, ecx); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index a6fd40aade7c..cdc70a3a6583 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -1,8 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef ARCH_X86_KVM_CPUID_H #define ARCH_X86_KVM_CPUID_H #include "x86.h" #include <asm/cpu.h> +#include <asm/processor.h> int kvm_update_cpuid(struct kvm_vcpu *vcpu); bool kvm_mpx_supported(void); @@ -20,7 +22,8 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries); -void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); +bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, + u32 *ecx, u32 *edx, bool check_limit); int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu); @@ -29,95 +32,86 @@ static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) return vcpu->arch.maxphyaddr; } -static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - - if (!static_cpu_has(X86_FEATURE_XSAVE)) - return false; +struct cpuid_reg { + u32 function; + u32 index; + int reg; +}; - best = kvm_find_cpuid_entry(vcpu, 1, 0); - return best && (best->ecx & bit(X86_FEATURE_XSAVE)); -} - -static inline bool guest_cpuid_has_mtrr(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, 1, 0); - return best && (best->edx & bit(X86_FEATURE_MTRR)); -} - -static inline bool guest_cpuid_has_tsc_adjust(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, 7, 0); - return best && (best->ebx & bit(X86_FEATURE_TSC_ADJUST)); -} +static const struct cpuid_reg reverse_cpuid[] = { + [CPUID_1_EDX] = { 1, 0, CPUID_EDX}, + [CPUID_8000_0001_EDX] = {0x80000001, 0, CPUID_EDX}, + [CPUID_8086_0001_EDX] = {0x80860001, 0, CPUID_EDX}, + [CPUID_1_ECX] = { 1, 0, CPUID_ECX}, + [CPUID_C000_0001_EDX] = {0xc0000001, 0, 
CPUID_EDX}, + [CPUID_8000_0001_ECX] = {0xc0000001, 0, CPUID_ECX}, + [CPUID_7_0_EBX] = { 7, 0, CPUID_EBX}, + [CPUID_D_1_EAX] = { 0xd, 1, CPUID_EAX}, + [CPUID_F_0_EDX] = { 0xf, 0, CPUID_EDX}, + [CPUID_F_1_EDX] = { 0xf, 1, CPUID_EDX}, + [CPUID_8000_0008_EBX] = {0x80000008, 0, CPUID_EBX}, + [CPUID_6_EAX] = { 6, 0, CPUID_EAX}, + [CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX}, + [CPUID_7_ECX] = { 7, 0, CPUID_ECX}, + [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX}, +}; -static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) +static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature) { - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, 7, 0); - return best && (best->ebx & bit(X86_FEATURE_SMEP)); -} + unsigned x86_leaf = x86_feature / 32; -static inline bool guest_cpuid_has_smap(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; + BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid)); + BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0); - best = kvm_find_cpuid_entry(vcpu, 7, 0); - return best && (best->ebx & bit(X86_FEATURE_SMAP)); + return reverse_cpuid[x86_leaf]; } -static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) +static __always_inline int *guest_cpuid_get_register(struct kvm_vcpu *vcpu, unsigned x86_feature) { - struct kvm_cpuid_entry2 *best; + struct kvm_cpuid_entry2 *entry; + const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature); - best = kvm_find_cpuid_entry(vcpu, 7, 0); - return best && (best->ebx & bit(X86_FEATURE_FSGSBASE)); -} + entry = kvm_find_cpuid_entry(vcpu, cpuid.function, cpuid.index); + if (!entry) + return NULL; -static inline bool guest_cpuid_has_pku(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, 7, 0); - return best && (best->ecx & bit(X86_FEATURE_PKU)); + switch (cpuid.reg) { + case CPUID_EAX: + return &entry->eax; + case CPUID_EBX: + return &entry->ebx; + case CPUID_ECX: + return &entry->ecx; + case CPUID_EDX: + return &entry->edx; + default: + BUILD_BUG(); + return NULL; + } } -static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu) +static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, unsigned x86_feature) { - struct kvm_cpuid_entry2 *best; + int *reg; - best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); - return best && (best->edx & bit(X86_FEATURE_LM)); -} - -static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); - return best && (best->ecx & bit(X86_FEATURE_OSVW)); -} + if (x86_feature == X86_FEATURE_XSAVE && + !static_cpu_has(X86_FEATURE_XSAVE)) + return false; -static inline bool guest_cpuid_has_pcid(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; + reg = guest_cpuid_get_register(vcpu, x86_feature); + if (!reg) + return false; - best = kvm_find_cpuid_entry(vcpu, 1, 0); - return best && (best->ecx & bit(X86_FEATURE_PCID)); + return *reg & bit(x86_feature); } -static inline bool guest_cpuid_has_x2apic(struct kvm_vcpu *vcpu) +static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu, unsigned x86_feature) { - struct kvm_cpuid_entry2 *best; + int *reg; - best = kvm_find_cpuid_entry(vcpu, 1, 0); - return best && (best->ecx & bit(X86_FEATURE_X2APIC)); + reg = guest_cpuid_get_register(vcpu, x86_feature); + if (reg) + *reg &= ~bit(x86_feature); } static inline bool guest_cpuid_is_amd(struct kvm_vcpu *vcpu) @@ -128,50 +122,6 @@ static inline bool guest_cpuid_is_amd(struct kvm_vcpu *vcpu) return 
best && best->ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx; } -static inline bool guest_cpuid_has_gbpages(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); - return best && (best->edx & bit(X86_FEATURE_GBPAGES)); -} - -static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, 7, 0); - return best && (best->ebx & bit(X86_FEATURE_RTM)); -} - -static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); - return best && (best->edx & bit(X86_FEATURE_RDTSCP)); -} - -/* - * NRIPS is provided through cpuidfn 0x8000000a.edx bit 3 - */ -#define BIT_NRIPS 3 - -static inline bool guest_cpuid_has_nrips(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, 0x8000000a, 0); - - /* - * NRIPS is a scattered cpuid feature, so we can't use - * X86_FEATURE_NRIPS here (X86_FEATURE_NRIPS would be bit - * position 8, not 3). - */ - return best && (best->edx & bit(BIT_NRIPS)); -} -#undef BIT_NRIPS - static inline int guest_cpuid_family(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 80890dee66ce..d90cdc77e077 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -28,6 +28,7 @@ #include "x86.h" #include "tss.h" +#include "mmu.h" /* * Operand types @@ -424,8 +425,10 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); #op " %al \n\t" \ FOP_RET -asm(".global kvm_fastop_exception \n" - "kvm_fastop_exception: xor %esi, %esi; ret"); +asm(".pushsection .fixup, \"ax\"\n" + ".global kvm_fastop_exception \n" + "kvm_fastop_exception: xor %esi, %esi; ret\n" + ".popsection"); FOP_START(setcc) FOP_SETCC(seto) @@ -688,16 +691,18 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, ulong la; u32 lim; u16 sel; + u8 va_bits; la = seg_base(ctxt, addr.seg) + addr.ea; *max_size = 0; switch (mode) { case X86EMUL_MODE_PROT64: *linear = la; - if (is_noncanonical_address(la)) + va_bits = ctxt_virt_addr_bits(ctxt); + if (get_canonical(la, va_bits) != la) goto bad; - *max_size = min_t(u64, ~0u, (1ull << 48) - la); + *max_size = min_t(u64, ~0u, (1ull << va_bits) - la); if (size > *max_size) goto bad; break; @@ -900,7 +905,7 @@ static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, if (rc != X86EMUL_CONTINUE) \ goto done; \ ctxt->_eip += sizeof(_type); \ - _x = *(_type __aligned(1) *) ctxt->fetch.ptr; \ + memcpy(&_x, ctxt->fetch.ptr, sizeof(_type)); \ ctxt->fetch.ptr += sizeof(_type); \ _x; \ }) @@ -1748,8 +1753,8 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, sizeof(base3), &ctxt->exception); if (ret != X86EMUL_CONTINUE) return ret; - if (is_noncanonical_address(get_desc_base(&seg_desc) | - ((u64)base3 << 32))) + if (emul_is_noncanonical_address(get_desc_base(&seg_desc) | + ((u64)base3 << 32), ctxt)) return emulate_gp(ctxt, 0); } load: @@ -2333,7 +2338,7 @@ static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt) eax = 0x80000001; ecx = 0; - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false); return edx & bit(X86_FEATURE_LM); } @@ -2636,7 +2641,7 @@ static bool vendor_intel(struct x86_emulate_ctxt *ctxt) u32 eax, ebx, ecx, edx; eax = ecx = 0; - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); + 
ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false); return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx; @@ -2656,7 +2661,7 @@ static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt) eax = 0x00000000; ecx = 0x00000000; - ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); + ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false); /* * Intel ("GenuineIntel") * remark: Intel CPUs only support "syscall" in 64bit @@ -2840,8 +2845,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) ss_sel = cs_sel + 8; cs.d = 0; cs.l = 1; - if (is_noncanonical_address(rcx) || - is_noncanonical_address(rdx)) + if (emul_is_noncanonical_address(rcx, ctxt) || + emul_is_noncanonical_address(rdx, ctxt)) return emulate_gp(ctxt, 0); break; } @@ -3551,7 +3556,7 @@ static int em_movbe(struct x86_emulate_ctxt *ctxt) /* * Check MOVBE is set in the guest-visible CPUID leaf. */ - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false); if (!(ecx & FFL(MOVBE))) return emulate_ud(ctxt); @@ -3756,7 +3761,7 @@ static int em_lgdt_lidt(struct x86_emulate_ctxt *ctxt, bool lgdt) if (rc != X86EMUL_CONTINUE) return rc; if (ctxt->mode == X86EMUL_MODE_PROT64 && - is_noncanonical_address(desc_ptr.address)) + emul_is_noncanonical_address(desc_ptr.address, ctxt)) return emulate_gp(ctxt, 0); if (lgdt) ctxt->ops->set_gdt(ctxt, &desc_ptr); @@ -3865,7 +3870,7 @@ static int em_cpuid(struct x86_emulate_ctxt *ctxt) eax = reg_read(ctxt, VCPU_REGS_RAX); ecx = reg_read(ctxt, VCPU_REGS_RCX); - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, true); *reg_write(ctxt, VCPU_REGS_RAX) = eax; *reg_write(ctxt, VCPU_REGS_RBX) = ebx; *reg_write(ctxt, VCPU_REGS_RCX) = ecx; @@ -3924,7 +3929,7 @@ static int check_fxsr(struct x86_emulate_ctxt *ctxt) { u32 eax = 1, ebx, ecx = 0, edx; - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false); if (!(edx & FFL(FXSR))) return emulate_ud(ctxt); @@ -3942,6 +3947,25 @@ static int check_fxsr(struct x86_emulate_ctxt *ctxt) } /* + * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but does save + * and restore MXCSR. + */ +static size_t __fxstate_size(int nregs) +{ + return offsetof(struct fxregs_state, xmm_space[0]) + nregs * 16; +} + +static inline size_t fxstate_size(struct x86_emulate_ctxt *ctxt) +{ + bool cr4_osfxsr; + if (ctxt->mode == X86EMUL_MODE_PROT64) + return __fxstate_size(16); + + cr4_osfxsr = ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR; + return __fxstate_size(cr4_osfxsr ? 
8 : 0); +} + +/* * FXSAVE and FXRSTOR have 4 different formats depending on execution mode, * 1) 16 bit mode * 2) 32 bit mode @@ -3962,7 +3986,6 @@ static int check_fxsr(struct x86_emulate_ctxt *ctxt) static int em_fxsave(struct x86_emulate_ctxt *ctxt) { struct fxregs_state fx_state; - size_t size; int rc; rc = check_fxsr(ctxt); @@ -3978,68 +4001,42 @@ static int em_fxsave(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; - if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR) - size = offsetof(struct fxregs_state, xmm_space[8 * 16/4]); - else - size = offsetof(struct fxregs_state, xmm_space[0]); - - return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state, size); -} - -static int fxrstor_fixup(struct x86_emulate_ctxt *ctxt, - struct fxregs_state *new) -{ - int rc = X86EMUL_CONTINUE; - struct fxregs_state old; - - rc = asm_safe("fxsave %[fx]", , [fx] "+m"(old)); - if (rc != X86EMUL_CONTINUE) - return rc; - - /* - * 64 bit host will restore XMM 8-15, which is not correct on non-64 - * bit guests. Load the current values in order to preserve 64 bit - * XMMs after fxrstor. - */ -#ifdef CONFIG_X86_64 - /* XXX: accessing XMM 8-15 very awkwardly */ - memcpy(&new->xmm_space[8 * 16/4], &old.xmm_space[8 * 16/4], 8 * 16); -#endif - - /* - * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but - * does save and restore MXCSR. - */ - if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR)) - memcpy(new->xmm_space, old.xmm_space, 8 * 16); - - return rc; + return segmented_write_std(ctxt, ctxt->memop.addr.mem, &fx_state, + fxstate_size(ctxt)); } static int em_fxrstor(struct x86_emulate_ctxt *ctxt) { struct fxregs_state fx_state; int rc; + size_t size; rc = check_fxsr(ctxt); if (rc != X86EMUL_CONTINUE) return rc; - rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, 512); - if (rc != X86EMUL_CONTINUE) - return rc; + ctxt->ops->get_fpu(ctxt); - if (fx_state.mxcsr >> 16) - return emulate_gp(ctxt, 0); + size = fxstate_size(ctxt); + if (size < __fxstate_size(16)) { + rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state)); + if (rc != X86EMUL_CONTINUE) + goto out; + } - ctxt->ops->get_fpu(ctxt); + rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, size); + if (rc != X86EMUL_CONTINUE) + goto out; - if (ctxt->mode < X86EMUL_MODE_PROT64) - rc = fxrstor_fixup(ctxt, &fx_state); + if (fx_state.mxcsr >> 16) { + rc = emulate_gp(ctxt, 0); + goto out; + } if (rc == X86EMUL_CONTINUE) rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state)); +out: ctxt->ops->put_fpu(ctxt); return rc; @@ -4105,8 +4102,19 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt) u64 rsvd = 0; ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); - if (efer & EFER_LMA) - rsvd = CR3_L_MODE_RESERVED_BITS & ~CR3_PCID_INVD; + if (efer & EFER_LMA) { + u64 maxphyaddr; + u32 eax, ebx, ecx, edx; + + eax = 0x80000008; + ecx = 0; + if (ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, + &edx, false)) + maxphyaddr = eax & 0xff; + else + maxphyaddr = 36; + rsvd = rsvd_bits(maxphyaddr, 62); + } if (new_val & rsvd) return emulate_gp(ctxt, 0); @@ -5292,7 +5300,6 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt, static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) { - register void *__sp asm(_ASM_SP); ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; if (!(ctxt->d & ByteOp)) @@ -5300,7 +5307,7 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n" : "+a"(ctxt->dst.val), 
"+d"(ctxt->src.val), [flags]"+D"(flags), - [fastop]"+S"(fop), "+r"(__sp) + [fastop]"+S"(fop), ASM_CALL_CONSTRAINT : "c"(ctxt->src2.val)); ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK); diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index ebae57ac5902..dc97f2544b6f 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -106,14 +106,27 @@ static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint, return 0; } -static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vcpu_id) +static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx) +{ + struct kvm_vcpu *vcpu = NULL; + int i; + + if (vpidx < KVM_MAX_VCPUS) + vcpu = kvm_get_vcpu(kvm, vpidx); + if (vcpu && vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx) + return vcpu; + kvm_for_each_vcpu(i, vcpu, kvm) + if (vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx) + return vcpu; + return NULL; +} + +static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vpidx) { struct kvm_vcpu *vcpu; struct kvm_vcpu_hv_synic *synic; - if (vcpu_id >= atomic_read(&kvm->online_vcpus)) - return NULL; - vcpu = kvm_get_vcpu(kvm, vcpu_id); + vcpu = get_vcpu_by_vpidx(kvm, vpidx); if (!vcpu) return NULL; synic = vcpu_to_synic(vcpu); @@ -221,7 +234,8 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, synic->version = data; break; case HV_X64_MSR_SIEFP: - if (data & HV_SYNIC_SIEFP_ENABLE) + if ((data & HV_SYNIC_SIEFP_ENABLE) && !host && + !synic->dont_zero_synic_pages) if (kvm_clear_guest(vcpu->kvm, data & PAGE_MASK, PAGE_SIZE)) { ret = 1; @@ -232,7 +246,8 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, synic_exit(synic, msr); break; case HV_X64_MSR_SIMP: - if (data & HV_SYNIC_SIMP_ENABLE) + if ((data & HV_SYNIC_SIMP_ENABLE) && !host && + !synic->dont_zero_synic_pages) if (kvm_clear_guest(vcpu->kvm, data & PAGE_MASK, PAGE_SIZE)) { ret = 1; @@ -318,11 +333,11 @@ static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) return ret; } -int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint) +int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vpidx, u32 sint) { struct kvm_vcpu_hv_synic *synic; - synic = synic_get(kvm, vcpu_id); + synic = synic_get(kvm, vpidx); if (!synic) return -EINVAL; @@ -341,11 +356,11 @@ void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector) kvm_hv_notify_acked_sint(vcpu, i); } -static int kvm_hv_set_sint_gsi(struct kvm *kvm, u32 vcpu_id, u32 sint, int gsi) +static int kvm_hv_set_sint_gsi(struct kvm *kvm, u32 vpidx, u32 sint, int gsi) { struct kvm_vcpu_hv_synic *synic; - synic = synic_get(kvm, vcpu_id); + synic = synic_get(kvm, vpidx); if (!synic) return -EINVAL; @@ -634,9 +649,10 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu) } if ((stimer->config & HV_STIMER_ENABLE) && - stimer->count) - stimer_start(stimer); - else + stimer->count) { + if (!stimer->msg_pending) + stimer_start(stimer); + } else stimer_cleanup(stimer); } } @@ -687,14 +703,24 @@ void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu) stimer_init(&hv_vcpu->stimer[i], i); } -int kvm_hv_activate_synic(struct kvm_vcpu *vcpu) +void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu) { + struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); + + hv_vcpu->vp_index = kvm_vcpu_get_idx(vcpu); +} + +int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages) +{ + struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); + /* * Hyper-V SynIC auto EOI SINT's are * not compatible with APICV, so deactivate APICV */ kvm_vcpu_deactivate_apicv(vcpu); - vcpu_to_synic(vcpu)->active = true; + 
synic->active = true; + synic->dont_zero_synic_pages = dont_zero_synic_pages; return 0; } @@ -978,6 +1004,11 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; switch (msr) { + case HV_X64_MSR_VP_INDEX: + if (!host) + return 1; + hv->vp_index = (u32)data; + break; case HV_X64_MSR_APIC_ASSIST_PAGE: { u64 gfn; unsigned long addr; @@ -1089,18 +1120,9 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; switch (msr) { - case HV_X64_MSR_VP_INDEX: { - int r; - struct kvm_vcpu *v; - - kvm_for_each_vcpu(r, v, vcpu->kvm) { - if (v == vcpu) { - data = r; - break; - } - } + case HV_X64_MSR_VP_INDEX: + data = hv->vp_index; break; - } case HV_X64_MSR_EOI: return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); case HV_X64_MSR_ICR: @@ -1138,6 +1160,12 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return stimer_get_count(vcpu_to_stimer(vcpu, timer_index), pdata); } + case HV_X64_MSR_TSC_FREQUENCY: + data = (u64)vcpu->arch.virtual_tsc_khz * 1000; + break; + case HV_X64_MSR_APIC_FREQUENCY: + data = APIC_BUS_FREQUENCY; + break; default: vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; @@ -1246,7 +1274,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) switch (code) { case HVCALL_NOTIFY_LONG_SPIN_WAIT: - kvm_vcpu_on_spin(vcpu); + kvm_vcpu_on_spin(vcpu, true); break; case HVCALL_POST_MESSAGE: case HVCALL_SIGNAL_EVENT: diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index cd1119538add..e637631a9574 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -56,9 +56,10 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu); void kvm_hv_irq_routing_update(struct kvm *kvm); int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint); void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector); -int kvm_hv_activate_synic(struct kvm_vcpu *vcpu); +int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages); void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu); +void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu); void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu); static inline struct kvm_vcpu_hv_stimer *vcpu_to_stimer(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index a78b445ce411..af192895b1fc 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -724,8 +724,10 @@ void kvm_free_pit(struct kvm *kvm) struct kvm_pit *pit = kvm->arch.vpit; if (pit) { + mutex_lock(&kvm->slots_lock); kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev); kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev); + mutex_unlock(&kvm->slots_lock); kvm_pit_set_reinject(pit, false); hrtimer_cancel(&pit->pit_state.timer); kthread_destroy_worker(pit->worker); diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 600bee9dcbbd..394d9527da7e 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __I8254_H #define __I8254_H diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 29ce19732ccf..ea1a4e0297da 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __KVM_IO_APIC_H #define __KVM_IO_APIC_H diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 762cdf2595f9..f500293dad8d 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -1,10 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ 
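With HV_X64_MSR_VP_INDEX now writable from the host side (guest writes still fault), a VMM can pin each vCPU's Hyper-V VP index, for example when restoring a migrated guest. A rough userspace sketch using the standard KVM_SET_MSRS ioctl follows; the helper and the MSR index macro are local to this sketch (0x40000002 is the architectural VP_INDEX MSR number), and error handling is simplified.

#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

#define HV_X64_MSR_VP_INDEX 0x40000002

static int set_vp_index(int vcpu_fd, uint32_t vp_index)
{
	struct kvm_msrs *msrs;
	int ret;

	msrs = calloc(1, sizeof(*msrs) + sizeof(struct kvm_msr_entry));
	if (!msrs)
		return -1;

	msrs->nmsrs = 1;
	msrs->entries[0].index = HV_X64_MSR_VP_INDEX;
	msrs->entries[0].data = vp_index;

	ret = ioctl(vcpu_fd, KVM_SET_MSRS, msrs);	/* returns nr of MSRs set */
	free(msrs);
	return ret;
}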
#ifndef ASM_KVM_CACHE_REGS_H #define ASM_KVM_CACHE_REGS_H #define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS #define KVM_POSSIBLE_CR4_GUEST_BITS \ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXMMEXCPT | X86_CR4_PGE) + | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_PGE) static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, enum kvm_reg reg) @@ -84,11 +85,6 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu) | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); } -static inline u32 kvm_read_pkru(struct kvm_vcpu *vcpu) -{ - return kvm_x86_ops->get_pkru(vcpu); -} - static inline void enter_guest_mode(struct kvm_vcpu *vcpu) { vcpu->arch.hflags |= HF_GUEST_MASK; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index d24c8742d9b0..36c90d631096 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -54,8 +54,6 @@ #define PRIu64 "u" #define PRIo64 "o" -#define APIC_BUS_CYCLE_NS 1 - /* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */ #define apic_debug(fmt, arg...) @@ -1326,6 +1324,10 @@ static void apic_timer_expired(struct kvm_lapic *apic) atomic_inc(&apic->lapic_timer.pending); kvm_set_pending_timer(vcpu); + /* + * For x86, the atomic_inc() is serialized, thus + * using swait_active() is safe. + */ if (swait_active(q)) swake_up(q); @@ -1495,59 +1497,95 @@ EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use); static void cancel_hv_timer(struct kvm_lapic *apic) { - preempt_disable(); + WARN_ON(preemptible()); + WARN_ON(!apic->lapic_timer.hv_timer_in_use); kvm_x86_ops->cancel_hv_timer(apic->vcpu); apic->lapic_timer.hv_timer_in_use = false; - preempt_enable(); } static bool start_hv_timer(struct kvm_lapic *apic) { - u64 tscdeadline = apic->lapic_timer.tscdeadline; + struct kvm_timer *ktimer = &apic->lapic_timer; + int r; - if ((atomic_read(&apic->lapic_timer.pending) && - !apic_lvtt_period(apic)) || - kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) { - if (apic->lapic_timer.hv_timer_in_use) - cancel_hv_timer(apic); - } else { - apic->lapic_timer.hv_timer_in_use = true; - hrtimer_cancel(&apic->lapic_timer.timer); + WARN_ON(preemptible()); + if (!kvm_x86_ops->set_hv_timer) + return false; + + if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending)) + return false; + + r = kvm_x86_ops->set_hv_timer(apic->vcpu, ktimer->tscdeadline); + if (r < 0) + return false; - /* In case the sw timer triggered in the window */ - if (atomic_read(&apic->lapic_timer.pending) && - !apic_lvtt_period(apic)) - cancel_hv_timer(apic); + ktimer->hv_timer_in_use = true; + hrtimer_cancel(&ktimer->timer); + + /* + * Also recheck ktimer->pending, in case the sw timer triggered in + * the window. For periodic timer, leave the hv timer running for + * simplicity, and the deadline will be recomputed on the next vmexit. 
+ */ + if (!apic_lvtt_period(apic) && (r || atomic_read(&ktimer->pending))) { + if (r) + apic_timer_expired(apic); + return false; } - trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, - apic->lapic_timer.hv_timer_in_use); - return apic->lapic_timer.hv_timer_in_use; + + trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, true); + return true; +} + +static void start_sw_timer(struct kvm_lapic *apic) +{ + struct kvm_timer *ktimer = &apic->lapic_timer; + + WARN_ON(preemptible()); + if (apic->lapic_timer.hv_timer_in_use) + cancel_hv_timer(apic); + if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending)) + return; + + if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) + start_sw_period(apic); + else if (apic_lvtt_tscdeadline(apic)) + start_sw_tscdeadline(apic); + trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, false); +} + +static void restart_apic_timer(struct kvm_lapic *apic) +{ + preempt_disable(); + if (!start_hv_timer(apic)) + start_sw_timer(apic); + preempt_enable(); } void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - WARN_ON(!apic->lapic_timer.hv_timer_in_use); + preempt_disable(); + /* If the preempt notifier has already run, it also called apic_timer_expired */ + if (!apic->lapic_timer.hv_timer_in_use) + goto out; WARN_ON(swait_active(&vcpu->wq)); cancel_hv_timer(apic); apic_timer_expired(apic); if (apic_lvtt_period(apic) && apic->lapic_timer.period) { advance_periodic_target_expiration(apic); - if (!start_hv_timer(apic)) - start_sw_period(apic); + restart_apic_timer(apic); } +out: + preempt_enable(); } EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer); void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu) { - struct kvm_lapic *apic = vcpu->arch.apic; - - WARN_ON(apic->lapic_timer.hv_timer_in_use); - - start_hv_timer(apic); + restart_apic_timer(vcpu->arch.apic); } EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer); @@ -1555,34 +1593,31 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; + preempt_disable(); /* Possibly the TSC deadline timer is not enabled yet */ - if (!apic->lapic_timer.hv_timer_in_use) - return; - - cancel_hv_timer(apic); + if (apic->lapic_timer.hv_timer_in_use) + start_sw_timer(apic); + preempt_enable(); +} +EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer); - if (atomic_read(&apic->lapic_timer.pending)) - return; +void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; - if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) - start_sw_period(apic); - else if (apic_lvtt_tscdeadline(apic)) - start_sw_tscdeadline(apic); + WARN_ON(!apic->lapic_timer.hv_timer_in_use); + restart_apic_timer(apic); } -EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer); static void start_apic_timer(struct kvm_lapic *apic) { atomic_set(&apic->lapic_timer.pending, 0); - if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { - if (set_target_expiration(apic) && - !(kvm_x86_ops->set_hv_timer && start_hv_timer(apic))) - start_sw_period(apic); - } else if (apic_lvtt_tscdeadline(apic)) { - if (!(kvm_x86_ops->set_hv_timer && start_hv_timer(apic))) - start_sw_tscdeadline(apic); - } + if ((apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) + && !set_target_expiration(apic)) + return; + + restart_apic_timer(apic); } static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) @@ -1813,16 +1848,6 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) * LAPIC interface *---------------------------------------------------------------------- */ -u64 
kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - if (!lapic_in_kernel(vcpu)) - return 0; - - return apic->lapic_timer.tscdeadline; -} - u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; @@ -1967,6 +1992,11 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.apic_base | MSR_IA32_APICBASE_BSP); vcpu->arch.pv_eoi.msr_val = 0; apic_update_ppr(apic); + if (vcpu->arch.apicv_active) { + kvm_x86_ops->apicv_post_state_restore(vcpu); + kvm_x86_ops->hwapic_irr_update(vcpu, -1); + kvm_x86_ops->hwapic_isr_update(vcpu, -1); + } vcpu->arch.apic_arb_prio = 0; vcpu->arch.apic_attention = 0; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index bcbe811f3b97..4b9935a38347 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __KVM_X86_LAPIC_H #define __KVM_X86_LAPIC_H @@ -12,6 +13,9 @@ #define KVM_APIC_SHORT_MASK 0xc0000 #define KVM_APIC_DEST_MASK 0x800 +#define APIC_BUS_CYCLE_NS 1 +#define APIC_BUS_FREQUENCY (1000000000ULL / APIC_BUS_CYCLE_NS) + struct kvm_timer { struct hrtimer timer; s64 period; /* unit: ns */ @@ -87,7 +91,6 @@ int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); -u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu); u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data); @@ -216,4 +219,5 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu); void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu); void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu); bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu); +void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu); #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index cb8225969255..7a69cf053711 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -46,6 +46,7 @@ #include <asm/io.h> #include <asm/vmx.h> #include <asm/kvm_page_track.h> +#include "trace.h" /* * When setting this variable to true it enables Two-Dimensional-Paging @@ -107,7 +108,7 @@ module_param(dbg, bool, 0644); (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) -#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) +#define PT64_BASE_ADDR_MASK __sme_clr((((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))) #define PT64_DIR_BASE_ADDR_MASK \ (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) #define PT64_LVL_ADDR_MASK(level) \ @@ -125,7 +126,7 @@ module_param(dbg, bool, 0644); * PT32_LEVEL_BITS))) - 1)) #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ - | shadow_x_mask | shadow_nx_mask) + | shadow_x_mask | shadow_nx_mask | shadow_me_mask) #define ACC_EXEC_MASK 1 #define ACC_WRITE_MASK PT_WRITABLE_MASK @@ -183,13 +184,14 @@ static u64 __read_mostly shadow_user_mask; static u64 __read_mostly shadow_accessed_mask; static u64 __read_mostly shadow_dirty_mask; static u64 __read_mostly shadow_mmio_mask; +static u64 __read_mostly shadow_mmio_value; static u64 __read_mostly shadow_present_mask; +static u64 __read_mostly shadow_me_mask; /* - * The mask/value to distinguish a PTE that has been marked not-present for - * access tracking purposes. 
- * The mask would be either 0 if access tracking is disabled, or - * SPTE_SPECIAL_MASK|VMX_EPT_RWX_MASK if access tracking is enabled. + * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value. + * Non-present SPTEs with shadow_acc_track_value set are in place for access + * tracking. */ static u64 __read_mostly shadow_acc_track_mask; static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK; @@ -207,16 +209,40 @@ static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIF static void mmu_spte_set(u64 *sptep, u64 spte); static void mmu_free_roots(struct kvm_vcpu *vcpu); -void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) +void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value) { + BUG_ON((mmio_mask & mmio_value) != mmio_value); + shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK; shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); +static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) +{ + return sp->role.ad_disabled; +} + +static inline bool spte_ad_enabled(u64 spte) +{ + MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); + return !(spte & shadow_acc_track_value); +} + +static inline u64 spte_shadow_accessed_mask(u64 spte) +{ + MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); + return spte_ad_enabled(spte) ? shadow_accessed_mask : 0; +} + +static inline u64 spte_shadow_dirty_mask(u64 spte) +{ + MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); + return spte_ad_enabled(spte) ? shadow_dirty_mask : 0; +} + static inline bool is_access_track_spte(u64 spte) { - /* Always false if shadow_acc_track_mask is zero. */ - return (spte & shadow_acc_track_mask) == shadow_acc_track_value; + return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0; } /* @@ -270,7 +296,7 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, u64 mask = generation_mmio_spte_mask(gen); access &= ACC_WRITE_MASK | ACC_USER_MASK; - mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT; + mask |= shadow_mmio_value | access | gfn << PAGE_SHIFT; trace_mark_mmio_spte(sptep, gfn, access, gen); mmu_spte_set(sptep, mask); @@ -278,7 +304,7 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, static bool is_mmio_spte(u64 spte) { - return (spte & shadow_mmio_mask) == shadow_mmio_mask; + return (spte & shadow_mmio_mask) == shadow_mmio_value; } static gfn_t get_mmio_spte_gfn(u64 spte) @@ -315,12 +341,20 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) return likely(kvm_gen == spte_gen); } +/* + * Sets the shadow PTE masks used by the MMU. 
+ * + * Assumptions: + * - Setting either @accessed_mask or @dirty_mask requires setting both + * - At least one of @accessed_mask or @acc_track_mask must be set + */ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, - u64 acc_track_mask) + u64 acc_track_mask, u64 me_mask) { - if (acc_track_mask != 0) - acc_track_mask |= SPTE_SPECIAL_MASK; + BUG_ON(!dirty_mask != !accessed_mask); + BUG_ON(!accessed_mask && !acc_track_mask); + BUG_ON(acc_track_mask & shadow_acc_track_value); shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; @@ -329,7 +363,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, shadow_x_mask = x_mask; shadow_present_mask = p_mask; shadow_acc_track_mask = acc_track_mask; - WARN_ON(shadow_accessed_mask != 0 && shadow_acc_track_mask != 0); + shadow_me_mask = me_mask; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); @@ -549,7 +583,7 @@ static bool spte_has_volatile_bits(u64 spte) is_access_track_spte(spte)) return true; - if (shadow_accessed_mask) { + if (spte_ad_enabled(spte)) { if ((spte & shadow_accessed_mask) == 0 || (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0)) return true; @@ -560,14 +594,17 @@ static bool spte_has_volatile_bits(u64 spte) static bool is_accessed_spte(u64 spte) { - return shadow_accessed_mask ? spte & shadow_accessed_mask - : !is_access_track_spte(spte); + u64 accessed_mask = spte_shadow_accessed_mask(spte); + + return accessed_mask ? spte & accessed_mask + : !is_access_track_spte(spte); } static bool is_dirty_spte(u64 spte) { - return shadow_dirty_mask ? spte & shadow_dirty_mask - : spte & PT_WRITABLE_MASK; + u64 dirty_mask = spte_shadow_dirty_mask(spte); + + return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK; } /* Rules for using mmu_spte_set: @@ -707,10 +744,10 @@ static u64 mmu_spte_get_lockless(u64 *sptep) static u64 mark_spte_for_access_track(u64 spte) { - if (shadow_accessed_mask != 0) + if (spte_ad_enabled(spte)) return spte & ~shadow_accessed_mask; - if (shadow_acc_track_mask == 0 || is_access_track_spte(spte)) + if (is_access_track_spte(spte)) return spte; /* @@ -729,7 +766,6 @@ static u64 mark_spte_for_access_track(u64 spte) spte |= (spte & shadow_acc_track_saved_bits_mask) << shadow_acc_track_saved_bits_shift; spte &= ~shadow_acc_track_mask; - spte |= shadow_acc_track_value; return spte; } @@ -741,6 +777,7 @@ static u64 restore_acc_track_spte(u64 spte) u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift) & shadow_acc_track_saved_bits_mask; + WARN_ON_ONCE(spte_ad_enabled(spte)); WARN_ON_ONCE(!is_access_track_spte(spte)); new_spte &= ~shadow_acc_track_mask; @@ -759,7 +796,7 @@ static bool mmu_spte_age(u64 *sptep) if (!is_accessed_spte(spte)) return false; - if (shadow_accessed_mask) { + if (spte_ad_enabled(spte)) { clear_bit((ffs(shadow_accessed_mask) - 1), (unsigned long *)sptep); } else { @@ -1390,6 +1427,22 @@ static bool spte_clear_dirty(u64 *sptep) return mmu_spte_update(sptep, spte); } +static bool wrprot_ad_disabled_spte(u64 *sptep) +{ + bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT, + (unsigned long *)sptep); + if (was_writable) + kvm_set_pfn_dirty(spte_to_pfn(*sptep)); + + return was_writable; +} + +/* + * Gets the GFN ready for another round of dirty logging by clearing the + * - D bit on ad-enabled SPTEs, and + * - W bit on ad-disabled SPTEs. + * Returns true iff any D or W bits were cleared. 
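A self-contained model of the A/D-disabled SPTE scheme and the dirty-logging rule described in the comment above. The bit positions are illustrative stand-ins, not the real KVM layout.

#include <stdbool.h>
#include <stdint.h>

#define SPTE_SPECIAL_BIT  (1ull << 62)	/* stand-in for shadow_acc_track_value */
#define SPTE_DIRTY_BIT    (1ull << 9)	/* stand-in for shadow_dirty_mask */
#define SPTE_WRITABLE_BIT (1ull << 1)	/* stand-in for PT_WRITABLE_MASK */

/* SPTEs created for an MMU without A/D bits carry the special bit. */
static bool sketch_spte_ad_enabled(uint64_t spte)
{
	return !(spte & SPTE_SPECIAL_BIT);
}

/*
 * Dirty logging clears the D bit when A/D bits are available and falls
 * back to write-protecting the SPTE otherwise, as in __rmap_clear_dirty().
 */
static uint64_t sketch_clear_dirty_for_logging(uint64_t spte)
{
	if (sketch_spte_ad_enabled(spte))
		return spte & ~SPTE_DIRTY_BIT;
	return spte & ~SPTE_WRITABLE_BIT;
}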
+ */ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) { u64 *sptep; @@ -1397,7 +1450,10 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) bool flush = false; for_each_rmap_spte(rmap_head, &iter, sptep) - flush |= spte_clear_dirty(sptep); + if (spte_ad_enabled(*sptep)) + flush |= spte_clear_dirty(sptep); + else + flush |= wrprot_ad_disabled_spte(sptep); return flush; } @@ -1420,7 +1476,8 @@ static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) bool flush = false; for_each_rmap_spte(rmap_head, &iter, sptep) - flush |= spte_set_dirty(sptep); + if (spte_ad_enabled(*sptep)) + flush |= spte_set_dirty(sptep); return flush; } @@ -1452,7 +1509,8 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, } /** - * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages + * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write + * protect the page if the D-bit isn't supported. * @kvm: kvm instance * @slot: slot to clear D-bit * @gfn_offset: start of the BITS_PER_LONG pages we care about @@ -1766,18 +1824,9 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, u64 *sptep; struct rmap_iterator iter; - /* - * If there's no access bit in the secondary pte set by the hardware and - * fast access tracking is also not enabled, it's up to gup-fast/gup to - * set the access bit in the primary pte or in the page structure. - */ - if (!shadow_accessed_mask && !shadow_acc_track_mask) - goto out; - for_each_rmap_spte(rmap_head, &iter, sptep) if (is_accessed_spte(*sptep)) return 1; -out: return 0; } @@ -1798,18 +1847,6 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) { - /* - * In case of absence of EPT Access and Dirty Bits supports, - * emulate the accessed bit for EPT, by checking if this page has - * an EPT mapping, and clearing it if it does. On the next access, - * a new EPT mapping will be established. - * This has some overhead, but not as much as the cost of swapping - * out actively used pages or breaking up actively used hugepages. 
- */ - if (!shadow_accessed_mask && !shadow_acc_track_mask) - return kvm_handle_hva_range(kvm, start, end, 0, - kvm_unmap_rmapp); - return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); } @@ -2132,8 +2169,8 @@ static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, } struct mmu_page_path { - struct kvm_mmu_page *parent[PT64_ROOT_LEVEL]; - unsigned int idx[PT64_ROOT_LEVEL]; + struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL]; + unsigned int idx[PT64_ROOT_MAX_LEVEL]; }; #define for_each_sp(pvec, sp, parents, i) \ @@ -2348,8 +2385,8 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, iterator->shadow_addr = vcpu->arch.mmu.root_hpa; iterator->level = vcpu->arch.mmu.shadow_root_level; - if (iterator->level == PT64_ROOT_LEVEL && - vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL && + if (iterator->level == PT64_ROOT_4LEVEL && + vcpu->arch.mmu.root_level < PT64_ROOT_4LEVEL && !vcpu->arch.mmu.direct_map) --iterator->level; @@ -2398,7 +2435,12 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK | - shadow_user_mask | shadow_x_mask | shadow_accessed_mask; + shadow_user_mask | shadow_x_mask | shadow_me_mask; + + if (sp_ad_disabled(sp)) + spte |= shadow_acc_track_value; + else + spte |= shadow_accessed_mask; mmu_spte_set(sptep, spte); @@ -2568,9 +2610,7 @@ static bool prepare_zap_oldest_mmu_page(struct kvm *kvm, sp = list_last_entry(&kvm->arch.active_mmu_pages, struct kvm_mmu_page, link); - kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); - - return true; + return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); } /* @@ -2666,10 +2706,15 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, { u64 spte = 0; int ret = 0; + struct kvm_mmu_page *sp; if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access)) return 0; + sp = page_header(__pa(sptep)); + if (sp_ad_disabled(sp)) + spte |= shadow_acc_track_value; + /* * For the EPT case, shadow_present_mask is 0 if hardware * supports exec-only page table entries. In that case, @@ -2678,7 +2723,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, */ spte |= shadow_present_mask; if (!speculative) - spte |= shadow_accessed_mask; + spte |= spte_shadow_accessed_mask(spte); if (pte_access & ACC_EXEC_MASK) spte |= shadow_x_mask; @@ -2700,6 +2745,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, pte_access &= ~ACC_WRITE_MASK; spte |= (u64)pfn << PAGE_SHIFT; + spte |= shadow_me_mask; if (pte_access & ACC_WRITE_MASK) { @@ -2735,7 +2781,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (pte_access & ACC_WRITE_MASK) { kvm_vcpu_mark_page_dirty(vcpu, gfn); - spte |= shadow_dirty_mask; + spte |= spte_shadow_dirty_mask(spte); } if (speculative) @@ -2877,16 +2923,16 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) { struct kvm_mmu_page *sp; + sp = page_header(__pa(sptep)); + /* - * Since it's no accessed bit on EPT, it's no way to - * distinguish between actually accessed translations - * and prefetched, so disable pte prefetch if EPT is - * enabled. + * Without accessed bits, there's no way to distinguish between + * actually accessed translations and prefetched, so disable pte + * prefetch if accessed bits aren't available. 
*/ - if (!shadow_accessed_mask) + if (sp_ad_disabled(sp)) return; - sp = page_header(__pa(sptep)); if (sp->role.level > PT_PAGE_TABLE_LEVEL) return; @@ -3214,7 +3260,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable); -static void make_mmu_pages_available(struct kvm_vcpu *vcpu); +static int make_mmu_pages_available(struct kvm_vcpu *vcpu); static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, gfn_t gfn, bool prefault) @@ -3254,7 +3300,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; - make_mmu_pages_available(vcpu); + if (make_mmu_pages_available(vcpu) < 0) + goto out_unlock; if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); @@ -3278,8 +3325,8 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL && - (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL || + if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL && + (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL || vcpu->arch.mmu.direct_map)) { hpa_t root = vcpu->arch.mmu.root_hpa; @@ -3331,10 +3378,14 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) struct kvm_mmu_page *sp; unsigned i; - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL) { spin_lock(&vcpu->kvm->mmu_lock); - make_mmu_pages_available(vcpu); - sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, 1, ACC_ALL); + if(make_mmu_pages_available(vcpu) < 0) { + spin_unlock(&vcpu->kvm->mmu_lock); + return 1; + } + sp = kvm_mmu_get_page(vcpu, 0, 0, + vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL); ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); vcpu->arch.mmu.root_hpa = __pa(sp->spt); @@ -3344,7 +3395,10 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) MMU_WARN_ON(VALID_PAGE(root)); spin_lock(&vcpu->kvm->mmu_lock); - make_mmu_pages_available(vcpu); + if (make_mmu_pages_available(vcpu) < 0) { + spin_unlock(&vcpu->kvm->mmu_lock); + return 1; + } sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL); root = __pa(sp->spt); @@ -3375,15 +3429,18 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) * Do we shadow a long mode page table? If so we need to * write-protect the guests page table root. */ - if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { + if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; MMU_WARN_ON(VALID_PAGE(root)); spin_lock(&vcpu->kvm->mmu_lock); - make_mmu_pages_available(vcpu); - sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, - 0, ACC_ALL); + if (make_mmu_pages_available(vcpu) < 0) { + spin_unlock(&vcpu->kvm->mmu_lock); + return 1; + } + sp = kvm_mmu_get_page(vcpu, root_gfn, 0, + vcpu->arch.mmu.shadow_root_level, 0, ACC_ALL); root = __pa(sp->spt); ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); @@ -3397,7 +3454,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) * the shadow page table may be a PAE or a long mode page table. 
*/ pm_mask = PT_PRESENT_MASK; - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; for (i = 0; i < 4; ++i) { @@ -3415,7 +3472,10 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) return 1; } spin_lock(&vcpu->kvm->mmu_lock); - make_mmu_pages_available(vcpu); + if (make_mmu_pages_available(vcpu) < 0) { + spin_unlock(&vcpu->kvm->mmu_lock); + return 1; + } sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, 0, ACC_ALL); root = __pa(sp->spt); @@ -3430,7 +3490,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) * If we shadow a 32 bit page table with a long mode page * table we enter this path. */ - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) { if (vcpu->arch.mmu.lm_root == NULL) { /* * The additional page necessary for this is only @@ -3475,7 +3535,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); - if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { + if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; sp = page_header(root); mmu_sync_children(vcpu, sp); @@ -3540,6 +3600,13 @@ static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level) static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) { + /* + * A nested guest cannot use the MMIO cache if it is using nested + * page tables, because cr2 is a nGPA while the cache stores GPAs. + */ + if (mmu_is_nested(vcpu)) + return false; + if (direct) return vcpu_match_mmio_gpa(vcpu, addr); @@ -3551,7 +3618,7 @@ static bool walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) { struct kvm_shadow_walk_iterator iterator; - u64 sptes[PT64_ROOT_LEVEL], spte = 0ull; + u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull; int root, leaf; bool reserved = false; @@ -3592,7 +3659,23 @@ exit: return reserved; } -int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) +/* + * Return values of handle_mmio_page_fault: + * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction + * directly. + * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page + * fault path update the mmio spte. + * RET_MMIO_PF_RETRY: let CPU fault again on the address. + * RET_MMIO_PF_BUG: a bug was detected (and a WARN was printed). 
+ */ +enum { + RET_MMIO_PF_EMULATE = 1, + RET_MMIO_PF_INVALID = 2, + RET_MMIO_PF_RETRY = 0, + RET_MMIO_PF_BUG = -1 +}; + +static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) { u64 spte; bool reserved; @@ -3704,7 +3787,7 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu) kvm_event_needs_reinjection(vcpu))) return false; - if (is_guest_mode(vcpu)) + if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu)) return false; return kvm_x86_ops->interrupt_allowed(vcpu); @@ -3736,6 +3819,38 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, return false; } +int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, + u64 fault_address, char *insn, int insn_len, + bool need_unprotect) +{ + int r = 1; + + switch (vcpu->arch.apf.host_apf_reason) { + default: + trace_kvm_page_fault(fault_address, error_code); + + if (need_unprotect && kvm_event_needs_reinjection(vcpu)) + kvm_mmu_unprotect_page_virt(vcpu, fault_address); + r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn, + insn_len); + break; + case KVM_PV_REASON_PAGE_NOT_PRESENT: + vcpu->arch.apf.host_apf_reason = 0; + local_irq_disable(); + kvm_async_pf_task_wait(fault_address, 0); + local_irq_enable(); + break; + case KVM_PV_REASON_PAGE_READY: + vcpu->arch.apf.host_apf_reason = 0; + local_irq_disable(); + kvm_async_pf_task_wake(fault_address); + local_irq_enable(); + break; + } + return r; +} +EXPORT_SYMBOL_GPL(kvm_handle_page_fault); + static bool check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level) { @@ -3792,7 +3907,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; - make_mmu_pages_available(vcpu); + if (make_mmu_pages_available(vcpu) < 0) + goto out_unlock; if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); @@ -3858,19 +3974,19 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte) { /* - * PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set - * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means - * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then. - */ - gpte |= level - PT_PAGE_TABLE_LEVEL - 1; - - /* * The RHS has bit 7 set iff level < mmu->last_nonleaf_level. * If it is clear, there are no large pages at this level, so clear * PT_PAGE_SIZE_MASK in gpte if that is the case. */ gpte &= level - mmu->last_nonleaf_level; + /* + * PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set + * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means + * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then. 
+ */ + gpte |= level - PT_PAGE_TABLE_LEVEL - 1; + return gpte & PT_PAGE_SIZE_MASK; } @@ -3945,7 +4061,13 @@ __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; break; - case PT64_ROOT_LEVEL: + case PT64_ROOT_5LEVEL: + rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd | + nonleaf_bit8_rsvd | rsvd_bits(7, 7) | + rsvd_bits(maxphyaddr, 51); + rsvd_check->rsvd_bits_mask[1][4] = + rsvd_check->rsvd_bits_mask[0][4]; + case PT64_ROOT_4LEVEL: rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd | nonleaf_bit8_rsvd | rsvd_bits(7, 7) | rsvd_bits(maxphyaddr, 51); @@ -3975,7 +4097,8 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, { __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check, cpuid_maxphyaddr(vcpu), context->root_level, - context->nx, guest_cpuid_has_gbpages(vcpu), + context->nx, + guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES), is_pse(vcpu), guest_cpuid_is_amd(vcpu)); } @@ -3985,6 +4108,8 @@ __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, { u64 bad_mt_xwr; + rsvd_check->rsvd_bits_mask[0][4] = + rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); rsvd_check->rsvd_bits_mask[0][3] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); rsvd_check->rsvd_bits_mask[0][2] = @@ -3994,6 +4119,7 @@ __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51); /* large page */ + rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4]; rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; rsvd_check->rsvd_bits_mask[1][2] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29); @@ -4029,16 +4155,28 @@ void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { bool uses_nx = context->nx || context->base_role.smep_andnot_wp; + struct rsvd_bits_validate *shadow_zero_check; + int i; /* * Passing "true" to the last argument is okay; it adds a check * on bit 8 of the SPTEs which KVM doesn't use anyway. 
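A standalone demo of the unsigned-wraparound trick used in is_last_gpte() above: bit 7 (PT_PAGE_SIZE_MASK) survives only when the level can actually hold a large page and the guest PTE really has the page-size bit set. last_nonleaf_level = 4 models ordinary 4-level paging; the function and values below are illustrative only.

#include <stdio.h>

#define PT_PAGE_SIZE_MASK   (1u << 7)
#define PT_PAGE_TABLE_LEVEL 1

static unsigned int is_last_gpte_model(unsigned int last_nonleaf_level,
				       unsigned int level, unsigned int gpte)
{
	/* bit 7 of (level - last_nonleaf_level) is set iff level < last_nonleaf_level */
	gpte &= level - last_nonleaf_level;
	/* bit 7 of (level - 2) is set iff level == PT_PAGE_TABLE_LEVEL */
	gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
	return gpte & PT_PAGE_SIZE_MASK;
}

int main(void)
{
	printf("%u\n", is_last_gpte_model(4, 2, PT_PAGE_SIZE_MASK)); /* 128: PDE with PS -> 2M leaf */
	printf("%u\n", is_last_gpte_model(4, 4, PT_PAGE_SIZE_MASK)); /* 0: PML4E bit 7 is not a size bit */
	printf("%u\n", is_last_gpte_model(4, 1, 0));                 /* 128: a 4K PTE always terminates */
	return 0;
}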
*/ - __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, + shadow_zero_check = &context->shadow_zero_check; + __reset_rsvds_bits_mask(vcpu, shadow_zero_check, boot_cpu_data.x86_phys_bits, context->shadow_root_level, uses_nx, - guest_cpuid_has_gbpages(vcpu), is_pse(vcpu), - true); + guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES), + is_pse(vcpu), true); + + if (!shadow_me_mask) + return; + + for (i = context->shadow_root_level; --i >= 0;) { + shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask; + shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask; + } + } EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask); @@ -4056,17 +4194,29 @@ static void reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { + struct rsvd_bits_validate *shadow_zero_check; + int i; + + shadow_zero_check = &context->shadow_zero_check; + if (boot_cpu_is_amd()) - __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, + __reset_rsvds_bits_mask(vcpu, shadow_zero_check, boot_cpu_data.x86_phys_bits, context->shadow_root_level, false, boot_cpu_has(X86_FEATURE_GBPAGES), true, true); else - __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, + __reset_rsvds_bits_mask_ept(shadow_zero_check, boot_cpu_data.x86_phys_bits, false); + if (!shadow_me_mask) + return; + + for (i = context->shadow_root_level; --i >= 0;) { + shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask; + shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask; + } } /* @@ -4081,66 +4231,85 @@ reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, boot_cpu_data.x86_phys_bits, execonly); } +#define BYTE_MASK(access) \ + ((1 & (access) ? 2 : 0) | \ + (2 & (access) ? 4 : 0) | \ + (3 & (access) ? 8 : 0) | \ + (4 & (access) ? 16 : 0) | \ + (5 & (access) ? 32 : 0) | \ + (6 & (access) ? 64 : 0) | \ + (7 & (access) ? 128 : 0)) + + static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, bool ept) { - unsigned bit, byte, pfec; - u8 map; - bool fault, x, w, u, wf, uf, ff, smapf, cr4_smap, cr4_smep, smap = 0; + unsigned byte; + + const u8 x = BYTE_MASK(ACC_EXEC_MASK); + const u8 w = BYTE_MASK(ACC_WRITE_MASK); + const u8 u = BYTE_MASK(ACC_USER_MASK); + + bool cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) != 0; + bool cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP) != 0; + bool cr0_wp = is_write_protection(vcpu); - cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); - cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) { - pfec = byte << 1; - map = 0; - wf = pfec & PFERR_WRITE_MASK; - uf = pfec & PFERR_USER_MASK; - ff = pfec & PFERR_FETCH_MASK; + unsigned pfec = byte << 1; + /* - * PFERR_RSVD_MASK bit is set in PFEC if the access is not - * subject to SMAP restrictions, and cleared otherwise. The - * bit is only meaningful if the SMAP bit is set in CR4. + * Each "*f" variable has a 1 bit for each UWX value + * that causes a fault with the given PFEC. */ - smapf = !(pfec & PFERR_RSVD_MASK); - for (bit = 0; bit < 8; ++bit) { - x = bit & ACC_EXEC_MASK; - w = bit & ACC_WRITE_MASK; - u = bit & ACC_USER_MASK; - - if (!ept) { - /* Not really needed: !nx will cause pte.nx to fault */ - x |= !mmu->nx; - /* Allow supervisor writes if !cr0.wp */ - w |= !is_write_protection(vcpu) && !uf; - /* Disallow supervisor fetches of user code if cr4.smep */ - x &= !(cr4_smep && u && !uf); - - /* - * SMAP:kernel-mode data accesses from user-mode - * mappings should fault. 
A fault is considered - * as a SMAP violation if all of the following - * conditions are ture: - * - X86_CR4_SMAP is set in CR4 - * - A user page is accessed - * - Page fault in kernel mode - * - if CPL = 3 or X86_EFLAGS_AC is clear - * - * Here, we cover the first three conditions. - * The fourth is computed dynamically in - * permission_fault() and is in smapf. - * - * Also, SMAP does not affect instruction - * fetches, add the !ff check here to make it - * clearer. - */ - smap = cr4_smap && u && !uf && !ff; - } - fault = (ff && !x) || (uf && !u) || (wf && !w) || - (smapf && smap); - map |= fault << bit; + /* Faults from writes to non-writable pages */ + u8 wf = (pfec & PFERR_WRITE_MASK) ? ~w : 0; + /* Faults from user mode accesses to supervisor pages */ + u8 uf = (pfec & PFERR_USER_MASK) ? ~u : 0; + /* Faults from fetches of non-executable pages */ + u8 ff = (pfec & PFERR_FETCH_MASK) ? ~x : 0; + /* Faults from kernel mode fetches of user pages */ + u8 smepf = 0; + /* Faults from kernel mode accesses of user pages */ + u8 smapf = 0; + + if (!ept) { + /* Faults from kernel mode accesses to user pages */ + u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u; + + /* Not really needed: !nx will cause pte.nx to fault */ + if (!mmu->nx) + ff = 0; + + /* Allow supervisor writes if !cr0.wp */ + if (!cr0_wp) + wf = (pfec & PFERR_USER_MASK) ? wf : 0; + + /* Disallow supervisor fetches of user code if cr4.smep */ + if (cr4_smep) + smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0; + + /* + * SMAP:kernel-mode data accesses from user-mode + * mappings should fault. A fault is considered + * as a SMAP violation if all of the following + * conditions are true: + * - X86_CR4_SMAP is set in CR4 + * - A user page is accessed + * - The access is not a fetch + * - Page fault in kernel mode + * - if CPL = 3 or X86_EFLAGS_AC is clear + * + * Here, we cover the first three conditions. + * The fourth is computed dynamically in permission_fault(); + * PFERR_RSVD_MASK bit will be set in PFEC if the access is + * *not* subject to SMAP restrictions. + */ + if (cr4_smap) + smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf; } - mmu->permissions[byte] = map; + + mmu->permissions[byte] = ff | uf | wf | smepf | smapf; } } @@ -4254,7 +4423,10 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu, static void paging64_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { - paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL); + int root_level = is_la57_mode(vcpu) ?
+ PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; reset_rsvds_bits_mask(vcpu, context); context->gva_to_gpa = paging64_gva_to_gpa; } else if (is_pae(vcpu)) { @@ -4365,7 +4539,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, MMU_WARN_ON(VALID_PAGE(context->root_hpa)); - context->shadow_root_level = kvm_x86_ops->get_tdp_level(); + context->shadow_root_level = PT64_ROOT_4LEVEL; context->nx = true; context->ept_ad = accessed_dirty; @@ -4374,12 +4548,14 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, context->sync_page = ept_sync_page; context->invlpg = ept_invlpg; context->update_pte = ept_update_pte; - context->root_level = context->shadow_root_level; + context->root_level = PT64_ROOT_4LEVEL; context->root_hpa = INVALID_PAGE; context->direct_map = false; + context->base_role.ad_disabled = !accessed_dirty; update_permission_bitmask(vcpu, context, true); update_pkru_bitmask(vcpu, context, true); + update_last_nonleaf_level(vcpu, context); reset_rsvds_bits_mask_ept(vcpu, context, execonly); reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); } @@ -4418,7 +4594,8 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; } else if (is_long_mode(vcpu)) { g_context->nx = is_nx(vcpu); - g_context->root_level = PT64_ROOT_LEVEL; + g_context->root_level = is_la57_mode(vcpu) ? + PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; reset_rsvds_bits_mask(vcpu, g_context); g_context->gva_to_gpa = paging64_gva_to_gpa_nested; } else if (is_pae(vcpu)) { @@ -4636,6 +4813,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, mask.smep_andnot_wp = 1; mask.smap_andnot_wp = 1; mask.smm = 1; + mask.ad_disabled = 1; /* * If we don't have indirect shadow pages, it means no page is @@ -4707,12 +4885,12 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) } EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); -static void make_mmu_pages_available(struct kvm_vcpu *vcpu) +static int make_mmu_pages_available(struct kvm_vcpu *vcpu) { LIST_HEAD(invalid_list); if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES)) - return; + return 0; while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) { if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list)) @@ -4721,6 +4899,10 @@ static void make_mmu_pages_available(struct kvm_vcpu *vcpu) ++vcpu->kvm->stat.mmu_recycled; } kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + + if (!kvm_mmu_available_pages(vcpu->kvm)) + return -ENOSPC; + return 0; } int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, @@ -4728,7 +4910,13 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, { int r, emulation_type = EMULTYPE_RETRY; enum emulation_result er; - bool direct = vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu); + bool direct = vcpu->arch.mmu.direct_map; + + /* With shadow page tables, fault_address contains a GVA or nGPA. */ + if (vcpu->arch.mmu.direct_map) { + vcpu->arch.gpa_available = true; + vcpu->arch.gpa_val = cr2; + } if (unlikely(error_code & PFERR_RSVD_MASK)) { r = handle_mmio_page_fault(vcpu, cr2, direct); @@ -4740,6 +4928,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, return 1; if (r < 0) return r; + /* Must be RET_MMIO_PF_INVALID. 
*/ } r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code), @@ -4755,11 +4944,9 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, * This can occur when using nested virtualization with nested * paging in both guests. If true, we simply unprotect the page * and resume the guest. - * - * Note: AMD only (since it supports the PFERR_GUEST_PAGE_MASK used - * in PFERR_NEXT_GUEST_PAGE) */ - if (error_code == PFERR_NESTED_GUEST_PAGE) { + if (vcpu->arch.mmu.direct_map && + (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) { kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2)); return 1; } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 330bf3a811fb..efc857615d8e 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __KVM_X86_MMU_H #define __KVM_X86_MMU_H @@ -37,7 +38,8 @@ #define PT32_DIR_PSE36_MASK \ (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) -#define PT64_ROOT_LEVEL 4 +#define PT64_ROOT_5LEVEL 5 +#define PT64_ROOT_4LEVEL 4 #define PT32_ROOT_LEVEL 2 #define PT32E_ROOT_LEVEL 3 @@ -48,35 +50,24 @@ static inline u64 rsvd_bits(int s, int e) { + if (e < s) + return 0; + return ((1ULL << (e - s + 1)) - 1) << s; } -void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask); +void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value); void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); -/* - * Return values of handle_mmio_page_fault: - * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction - * directly. - * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page - * fault path update the mmio spte. - * RET_MMIO_PF_RETRY: let CPU fault again on the address. - * RET_MMIO_PF_BUG: a bug was detected (and a WARN was printed). - */ -enum { - RET_MMIO_PF_EMULATE = 1, - RET_MMIO_PF_INVALID = 2, - RET_MMIO_PF_RETRY = 0, - RET_MMIO_PF_BUG = -1 -}; - -int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct); void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, bool accessed_dirty); bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); +int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, + u64 fault_address, char *insn, int insn_len, + bool need_unprotect); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { @@ -182,7 +173,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, * index of the protection domain, so pte_pkey * 2 is * is the index of the first bit for the domain. */ - pkru_bits = (kvm_read_pkru(vcpu) >> (pte_pkey * 2)) & 3; + pkru_bits = (vcpu->arch.pkru >> (pte_pkey * 2)) & 3; /* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. 
*/ offset = (pfec & ~1) + diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index dcce533d420c..d22ddbdf5e6e 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -62,11 +62,11 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; - if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { + if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; sp = page_header(root); - __mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_LEVEL); + __mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu.root_level); return; } diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 5a24b846a1cb..c73bf4e4988c 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #if !defined(_TRACE_KVMMMU_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_KVMMMU_H @@ -30,8 +31,9 @@ \ role.word = __entry->role; \ \ - trace_seq_printf(p, "sp gen %lx gfn %llx %u%s q%u%s %s%s" \ - " %snxe root %u %s%c", __entry->mmu_valid_gen, \ + trace_seq_printf(p, "sp gen %lx gfn %llx l%u%s q%u%s %s%s" \ + " %snxe %sad root %u %s%c", \ + __entry->mmu_valid_gen, \ __entry->gfn, role.level, \ role.cr4_pae ? " pae" : "", \ role.quadrant, \ @@ -39,6 +41,7 @@ access_str[role.access], \ role.invalid ? " invalid" : "", \ role.nxe ? "" : "!", \ + role.ad_disabled ? "!" : "", \ __entry->root_count, \ __entry->unsync ? "unsync" : "sync", 0); \ saved_ptr; \ diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index 0149ac59c273..e9ea2d45ae66 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -130,7 +130,7 @@ static u8 mtrr_disabled_type(struct kvm_vcpu *vcpu) * enable MTRRs and it is obviously undesirable to run the * guest entirely with UC memory and we use WB. */ - if (guest_cpuid_has_mtrr(vcpu)) + if (guest_cpuid_has(vcpu, X86_FEATURE_MTRR)) return MTRR_TYPE_UNCACHABLE; else return MTRR_TYPE_WRBACK; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index b0454c7e4cff..f18d1f8d332b 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -334,10 +334,11 @@ retry_walk: --walker->level; index = PT_INDEX(addr, walker->level); - table_gfn = gpte_to_gfn(pte); offset = index * sizeof(pt_element_t); pte_gpa = gfn_to_gpa(table_gfn) + offset; + + BUG_ON(walker->level < 1); walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; @@ -790,8 +791,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, &map_writable)) return 0; - if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 
0 : addr, - walker.gfn, pfn, walker.pte_access, &r)) + if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r)) return r; /* @@ -819,7 +819,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, goto out_unlock; kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); - make_mmu_pages_available(vcpu); + if (make_mmu_pages_available(vcpu) < 0) + goto out_unlock; if (!force_pt_level) transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); r = FNAME(fetch)(vcpu, addr, &walker, write_fault, diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index f96e1f962587..a9a62b9a73e2 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __KVM_X86_PMU_H #define __KVM_X86_PMU_H diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ba9891ac5c56..0e68f0b3cbf7 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -36,6 +36,7 @@ #include <linux/slab.h> #include <linux/amd-iommu.h> #include <linux/hashtable.h> +#include <linux/frame.h> #include <asm/apic.h> #include <asm/perf_event.h> @@ -189,10 +190,10 @@ struct vcpu_svm { struct nested_state nested; bool nmi_singlestep; + u64 nmi_singlestep_guest_rflags; unsigned int3_injected; unsigned long int3_rip; - u32 apf_reason; /* cached guest cpuid flags for faster access */ bool nrips_enabled : 1; @@ -275,9 +276,13 @@ static int avic; module_param(avic, int, S_IRUGO); #endif -/* AVIC VM ID bit masks and lock */ -static DECLARE_BITMAP(avic_vm_id_bitmap, AVIC_VM_ID_NR); -static DEFINE_SPINLOCK(avic_vm_id_lock); +/* enable/disable Virtual VMLOAD VMSAVE */ +static int vls = true; +module_param(vls, int, 0444); + +/* enable/disable Virtual GIF */ +static int vgif = true; +module_param(vgif, int, 0444); static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); static void svm_flush_tlb(struct kvm_vcpu *vcpu); @@ -474,19 +479,33 @@ static inline void clr_intercept(struct vcpu_svm *svm, int bit) recalc_intercepts(svm); } +static inline bool vgif_enabled(struct vcpu_svm *svm) +{ + return !!(svm->vmcb->control.int_ctl & V_GIF_ENABLE_MASK); +} + static inline void enable_gif(struct vcpu_svm *svm) { - svm->vcpu.arch.hflags |= HF_GIF_MASK; + if (vgif_enabled(svm)) + svm->vmcb->control.int_ctl |= V_GIF_MASK; + else + svm->vcpu.arch.hflags |= HF_GIF_MASK; } static inline void disable_gif(struct vcpu_svm *svm) { - svm->vcpu.arch.hflags &= ~HF_GIF_MASK; + if (vgif_enabled(svm)) + svm->vmcb->control.int_ctl &= ~V_GIF_MASK; + else + svm->vcpu.arch.hflags &= ~HF_GIF_MASK; } static inline bool gif_set(struct vcpu_svm *svm) { - return !!(svm->vcpu.arch.hflags & HF_GIF_MASK); + if (vgif_enabled(svm)) + return !!(svm->vmcb->control.int_ctl & V_GIF_MASK); + else + return !!(svm->vcpu.arch.hflags & HF_GIF_MASK); } static unsigned long iopm_base; @@ -562,10 +581,10 @@ static inline void invlpga(unsigned long addr, u32 asid) asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid)); } -static int get_npt_level(void) +static int get_npt_level(struct kvm_vcpu *vcpu) { #ifdef CONFIG_X86_64 - return PT64_ROOT_LEVEL; + return PT64_ROOT_4LEVEL; #else return PT32E_ROOT_LEVEL; #endif @@ -631,11 +650,13 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) svm_set_interrupt_shadow(vcpu, 0); } -static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, - bool has_error_code, u32 error_code, - bool reinject) +static void svm_queue_exception(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + unsigned nr = vcpu->arch.exception.nr; + bool 
has_error_code = vcpu->arch.exception.has_error_code; + bool reinject = vcpu->arch.exception.injected; + u32 error_code = vcpu->arch.exception.error_code; /* * If we are within a nested VM we'd better #VMEXIT and let the guest @@ -945,7 +966,7 @@ static void svm_enable_lbrv(struct vcpu_svm *svm) { u32 *msrpm = svm->msrpm; - svm->vmcb->control.lbr_ctl = 1; + svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); @@ -956,13 +977,26 @@ static void svm_disable_lbrv(struct vcpu_svm *svm) { u32 *msrpm = svm->msrpm; - svm->vmcb->control.lbr_ctl = 0; + svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0); } +static void disable_nmi_singlestep(struct vcpu_svm *svm) +{ + svm->nmi_singlestep = false; + + if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) { + /* Clear our flags if they were not set by the guest */ + if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF)) + svm->vmcb->save.rflags &= ~X86_EFLAGS_TF; + if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF)) + svm->vmcb->save.rflags &= ~X86_EFLAGS_RF; + } +} + /* Note: * This hash table is used to map VM_ID to a struct kvm_arch, * when handling AMD IOMMU GALOG notification to schedule in @@ -970,6 +1004,8 @@ static void svm_disable_lbrv(struct vcpu_svm *svm) */ #define SVM_VM_DATA_HASH_BITS 8 static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); +static u32 next_vm_id = 0; +static bool next_vm_id_wrapped = 0; static DEFINE_SPINLOCK(svm_vm_data_hash_lock); /* Note: @@ -1079,6 +1115,23 @@ static __init int svm_hardware_setup(void) } } + if (vls) { + if (!npt_enabled || + !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) || + !IS_ENABLED(CONFIG_X86_64)) { + vls = false; + } else { + pr_info("Virtual VMLOAD VMSAVE supported\n"); + } + } + + if (vgif) { + if (!boot_cpu_has(X86_FEATURE_VGIF)) + vgif = false; + else + pr_info("Virtual GIF supported\n"); + } + return 0; err: @@ -1138,16 +1191,15 @@ static void avic_init_vmcb(struct vcpu_svm *svm) { struct vmcb *vmcb = svm->vmcb; struct kvm_arch *vm_data = &svm->vcpu.kvm->arch; - phys_addr_t bpa = page_to_phys(svm->avic_backing_page); - phys_addr_t lpa = page_to_phys(vm_data->avic_logical_id_table_page); - phys_addr_t ppa = page_to_phys(vm_data->avic_physical_id_table_page); + phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page)); + phys_addr_t lpa = __sme_set(page_to_phys(vm_data->avic_logical_id_table_page)); + phys_addr_t ppa = __sme_set(page_to_phys(vm_data->avic_physical_id_table_page)); vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK; vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK; vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT; vmcb->control.int_ctl |= AVIC_ENABLE_MASK; - svm->vcpu.arch.apicv_active = true; } static void init_vmcb(struct vcpu_svm *svm) @@ -1203,8 +1255,8 @@ static void init_vmcb(struct vcpu_svm *svm) set_intercept(svm, INTERCEPT_MWAIT); } - control->iopm_base_pa = iopm_base; - control->msrpm_base_pa = __pa(svm->msrpm); + control->iopm_base_pa = __sme_set(iopm_base); + control->msrpm_base_pa = __sme_set(__pa(svm->msrpm)); control->int_ctl = V_INTR_MASKING_MASK; 
init_seg(&save->es); @@ -1263,9 +1315,25 @@ static void init_vmcb(struct vcpu_svm *svm) set_intercept(svm, INTERCEPT_PAUSE); } - if (avic) + if (kvm_vcpu_apicv_active(&svm->vcpu)) avic_init_vmcb(svm); + /* + * If hardware supports Virtual VMLOAD VMSAVE then enable it + * in VMCB and clear intercepts to avoid #VMEXIT. + */ + if (vls) { + clr_intercept(svm, INTERCEPT_VMLOAD); + clr_intercept(svm, INTERCEPT_VMSAVE); + svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + } + + if (vgif) { + clr_intercept(svm, INTERCEPT_STGI); + clr_intercept(svm, INTERCEPT_CLGI); + svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK; + } + mark_all_dirty(svm->vmcb); enable_gif(svm); @@ -1338,9 +1406,9 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) return -EINVAL; new_entry = READ_ONCE(*entry); - new_entry = (page_to_phys(svm->avic_backing_page) & - AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) | - AVIC_PHYSICAL_ID_ENTRY_VALID_MASK; + new_entry = __sme_set((page_to_phys(svm->avic_backing_page) & + AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) | + AVIC_PHYSICAL_ID_ENTRY_VALID_MASK); WRITE_ONCE(*entry, new_entry); svm->avic_physical_id_cache = entry; @@ -1348,34 +1416,6 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) return 0; } -static inline int avic_get_next_vm_id(void) -{ - int id; - - spin_lock(&avic_vm_id_lock); - - /* AVIC VM ID is one-based. */ - id = find_next_zero_bit(avic_vm_id_bitmap, AVIC_VM_ID_NR, 1); - if (id <= AVIC_VM_ID_MASK) - __set_bit(id, avic_vm_id_bitmap); - else - id = -EAGAIN; - - spin_unlock(&avic_vm_id_lock); - return id; -} - -static inline int avic_free_vm_id(int id) -{ - if (id <= 0 || id > AVIC_VM_ID_MASK) - return -EINVAL; - - spin_lock(&avic_vm_id_lock); - __clear_bit(id, avic_vm_id_bitmap); - spin_unlock(&avic_vm_id_lock); - return 0; -} - static void avic_vm_destroy(struct kvm *kvm) { unsigned long flags; @@ -1384,8 +1424,6 @@ static void avic_vm_destroy(struct kvm *kvm) if (!avic) return; - avic_free_vm_id(vm_data->avic_vm_id); - if (vm_data->avic_logical_id_table_page) __free_page(vm_data->avic_logical_id_table_page); if (vm_data->avic_physical_id_table_page) @@ -1399,19 +1437,16 @@ static void avic_vm_destroy(struct kvm *kvm) static int avic_vm_init(struct kvm *kvm) { unsigned long flags; - int vm_id, err = -ENOMEM; + int err = -ENOMEM; struct kvm_arch *vm_data = &kvm->arch; struct page *p_page; struct page *l_page; + struct kvm_arch *ka; + u32 vm_id; if (!avic) return 0; - vm_id = avic_get_next_vm_id(); - if (vm_id < 0) - return vm_id; - vm_data->avic_vm_id = (u32)vm_id; - /* Allocating physical APIC ID table (4KB) */ p_page = alloc_page(GFP_KERNEL); if (!p_page) @@ -1429,6 +1464,22 @@ static int avic_vm_init(struct kvm *kvm) clear_page(page_address(l_page)); spin_lock_irqsave(&svm_vm_data_hash_lock, flags); + again: + vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK; + if (vm_id == 0) { /* id is 1-based, zero is not okay */ + next_vm_id_wrapped = 1; + goto again; + } + /* Is it still in use? 
Only possible if wrapped at least once */ + if (next_vm_id_wrapped) { + hash_for_each_possible(svm_vm_data_hash, ka, hnode, vm_id) { + struct kvm *k2 = container_of(ka, struct kvm, arch); + struct kvm_arch *vd2 = &k2->arch; + if (vd2->avic_vm_id == vm_id) + goto again; + } + } + vm_data->avic_vm_id = vm_id; hash_add(svm_vm_data_hash, &vm_data->hnode, vm_data->avic_vm_id); spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); @@ -1541,13 +1592,30 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) } init_vmcb(svm); - kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); + kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true); kvm_register_write(vcpu, VCPU_REGS_RDX, eax); if (kvm_vcpu_apicv_active(vcpu) && !init_event) avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE); } +static int avic_init_vcpu(struct vcpu_svm *svm) +{ + int ret; + + if (!kvm_vcpu_apicv_active(&svm->vcpu)) + return 0; + + ret = avic_init_backing_page(&svm->vcpu); + if (ret) + return ret; + + INIT_LIST_HEAD(&svm->ir_list); + spin_lock_init(&svm->ir_list_lock); + + return ret; +} + static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) { struct vcpu_svm *svm; @@ -1584,14 +1652,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) if (!hsave_page) goto free_page3; - if (avic) { - err = avic_init_backing_page(&svm->vcpu); - if (err) - goto free_page4; - - INIT_LIST_HEAD(&svm->ir_list); - spin_lock_init(&svm->ir_list_lock); - } + err = avic_init_vcpu(svm); + if (err) + goto free_page4; /* We initialize this flag to true to make sure that the is_running * bit would be set the first time the vcpu is loaded. @@ -1608,7 +1671,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->vmcb = page_address(page); clear_page(svm->vmcb); - svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; + svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT); svm->asid_generation = 0; init_vmcb(svm); @@ -1636,7 +1699,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); + __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT)); __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); __free_page(virt_to_page(svm->nested.hsave)); __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); @@ -1712,11 +1775,24 @@ static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu) static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) { - return to_svm(vcpu)->vmcb->save.rflags; + struct vcpu_svm *svm = to_svm(vcpu); + unsigned long rflags = svm->vmcb->save.rflags; + + if (svm->nmi_singlestep) { + /* Hide our flags if they were not set by the guest */ + if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF)) + rflags &= ~X86_EFLAGS_TF; + if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF)) + rflags &= ~X86_EFLAGS_RF; + } + return rflags; } static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { + if (to_svm(vcpu)->nmi_singlestep) + rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); + /* * Any change of EFLAGS.VM is accompanied by a reload of SS * (caused by either a task switch or an inter-privilege IRET), @@ -1725,11 +1801,6 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) to_svm(vcpu)->vmcb->save.rflags = rflags; } -static u32 svm_get_pkru(struct kvm_vcpu *vcpu) -{ - return 0; -} - static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { switch (reg) { @@ -2069,34 +2140,11 @@ static void svm_set_dr7(struct kvm_vcpu 
*vcpu, unsigned long value) static int pf_interception(struct vcpu_svm *svm) { u64 fault_address = svm->vmcb->control.exit_info_2; - u64 error_code; - int r = 1; - - switch (svm->apf_reason) { - default: - error_code = svm->vmcb->control.exit_info_1; + u64 error_code = svm->vmcb->control.exit_info_1; - trace_kvm_page_fault(fault_address, error_code); - if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) - kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); - r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code, + return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address, svm->vmcb->control.insn_bytes, - svm->vmcb->control.insn_len); - break; - case KVM_PV_REASON_PAGE_NOT_PRESENT: - svm->apf_reason = 0; - local_irq_disable(); - kvm_async_pf_task_wait(fault_address); - local_irq_enable(); - break; - case KVM_PV_REASON_PAGE_READY: - svm->apf_reason = 0; - local_irq_disable(); - kvm_async_pf_task_wake(fault_address); - local_irq_enable(); - break; - } - return r; + svm->vmcb->control.insn_len, !npt_enabled); } static int db_interception(struct vcpu_svm *svm) @@ -2111,10 +2159,7 @@ static int db_interception(struct vcpu_svm *svm) } if (svm->nmi_singlestep) { - svm->nmi_singlestep = false; - if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) - svm->vmcb->save.rflags &= - ~(X86_EFLAGS_TF | X86_EFLAGS_RF); + disable_nmi_singlestep(svm); } if (svm->vcpu.guest_debug & @@ -2243,7 +2288,7 @@ static int io_interception(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ - int size, in, string; + int size, in, string, ret; unsigned port; ++svm->vcpu.stat.io_exits; @@ -2255,10 +2300,16 @@ static int io_interception(struct vcpu_svm *svm) port = io_info >> 16; size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; svm->next_rip = svm->vmcb->control.exit_info_2; - skip_emulated_instruction(&svm->vcpu); + ret = kvm_skip_emulated_instruction(&svm->vcpu); - return in ? kvm_fast_pio_in(vcpu, size, port) - : kvm_fast_pio_out(vcpu, size, port); + /* + * TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered + * KVM_EXIT_DEBUG here. 
+ */ + if (in) + return kvm_fast_pio_in(vcpu, size, port) && ret; + else + return kvm_fast_pio_out(vcpu, size, port) && ret; } static int nmi_interception(struct vcpu_svm *svm) @@ -2303,7 +2354,7 @@ static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) u64 pdpte; int ret; - ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte, + ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(__sme_clr(cr3)), &pdpte, offset_in_page(cr3) + index * 8, 8); if (ret) return 0; @@ -2315,7 +2366,7 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu, { struct vcpu_svm *svm = to_svm(vcpu); - svm->vmcb->control.nested_cr3 = root; + svm->vmcb->control.nested_cr3 = __sme_set(root); mark_dirty(svm->vmcb, VMCB_NPT); svm_flush_tlb(vcpu); } @@ -2357,7 +2408,7 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr; vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit; - vcpu->arch.mmu.shadow_root_level = get_npt_level(); + vcpu->arch.mmu.shadow_root_level = get_npt_level(vcpu); reset_shadow_zero_bits_mask(vcpu, &vcpu->arch.mmu); vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; } @@ -2369,8 +2420,8 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) static int nested_svm_check_permissions(struct vcpu_svm *svm) { - if (!(svm->vcpu.arch.efer & EFER_SVME) - || !is_paging(&svm->vcpu)) { + if (!(svm->vcpu.arch.efer & EFER_SVME) || + !is_paging(&svm->vcpu)) { kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; } @@ -2380,7 +2431,7 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm) return 1; } - return 0; + return 0; } static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, @@ -2391,15 +2442,29 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, if (!is_guest_mode(&svm->vcpu)) return 0; + vmexit = nested_svm_intercept(svm); + if (vmexit != NESTED_EXIT_DONE) + return 0; + svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; svm->vmcb->control.exit_code_hi = 0; svm->vmcb->control.exit_info_1 = error_code; - svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; - vmexit = nested_svm_intercept(svm); - if (vmexit == NESTED_EXIT_DONE) - svm->nested.exit_required = true; + /* + * FIXME: we should not write CR2 when L1 intercepts an L2 #PF exception. + * The fix is to add the ancillary datum (CR2 or DR6) to structs + * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6 can be + * written only when inject_pending_event runs (DR6 would be written here + * too). This should be conditional on a new capability---if the + * capability is disabled, kvm_multiple_exception would write the + * ancillary information to CR2 or DR6, for backwards ABI-compatibility. + */ + if (svm->vcpu.arch.exception.nested_apf) + svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token; + else + svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; + svm->nested.exit_required = true; return vmexit; } @@ -2533,6 +2598,31 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) return (value & mask) ?
NESTED_EXIT_DONE : NESTED_EXIT_HOST; } +/* DB exceptions for our internal use must not cause vmexit */ +static int nested_svm_intercept_db(struct vcpu_svm *svm) +{ + unsigned long dr6; + + /* if we're not singlestepping, it's not ours */ + if (!svm->nmi_singlestep) + return NESTED_EXIT_DONE; + + /* if it's not a singlestep exception, it's not ours */ + if (kvm_get_dr(&svm->vcpu, 6, &dr6)) + return NESTED_EXIT_DONE; + if (!(dr6 & DR6_BS)) + return NESTED_EXIT_DONE; + + /* if the guest is singlestepping, it should get the vmexit */ + if (svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF) { + disable_nmi_singlestep(svm); + return NESTED_EXIT_DONE; + } + + /* it's ours, the nested hypervisor must not see this one */ + return NESTED_EXIT_HOST; +} + static int nested_svm_exit_special(struct vcpu_svm *svm) { u32 exit_code = svm->vmcb->control.exit_code; @@ -2549,7 +2639,7 @@ static int nested_svm_exit_special(struct vcpu_svm *svm) break; case SVM_EXIT_EXCP_BASE + PF_VECTOR: /* When we're shadowing, trap PFs, but not async PF */ - if (!npt_enabled && svm->apf_reason == 0) + if (!npt_enabled && svm->vcpu.arch.apf.host_apf_reason == 0) return NESTED_EXIT_HOST; break; default: @@ -2588,11 +2678,15 @@ static int nested_svm_intercept(struct vcpu_svm *svm) } case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); - if (svm->nested.intercept_exceptions & excp_bits) - vmexit = NESTED_EXIT_DONE; + if (svm->nested.intercept_exceptions & excp_bits) { + if (exit_code == SVM_EXIT_EXCP_BASE + DB_VECTOR) + vmexit = nested_svm_intercept_db(svm); + else + vmexit = NESTED_EXIT_DONE; + } /* async page fault always cause vmexit */ else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) && - svm->apf_reason != 0) + svm->vcpu.arch.exception.nested_apf != 0) vmexit = NESTED_EXIT_DONE; break; } @@ -2649,7 +2743,7 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr dst->event_inj = from->event_inj; dst->event_inj_err = from->event_inj_err; dst->nested_cr3 = from->nested_cr3; - dst->lbr_ctl = from->lbr_ctl; + dst->virt_ext = from->virt_ext; } static int nested_svm_vmexit(struct vcpu_svm *svm) @@ -2803,7 +2897,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) svm->nested.msrpm[p] = svm->msrpm[p] | value; } - svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm); + svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm)); return true; } @@ -2955,7 +3049,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) /* We don't want to see VMMCALLs from a nested guest */ clr_intercept(svm, INTERCEPT_VMMCALL); - svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl; + svm->vmcb->control.virt_ext = nested_vmcb->control.virt_ext; svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; svm->vmcb->control.int_state = nested_vmcb->control.int_state; svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset; @@ -3002,6 +3096,7 @@ static int vmload_interception(struct vcpu_svm *svm) { struct vmcb *nested_vmcb; struct page *page; + int ret; if (nested_svm_check_permissions(svm)) return 1; @@ -3011,18 +3106,19 @@ static int vmload_interception(struct vcpu_svm *svm) return 1; svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); + ret = kvm_skip_emulated_instruction(&svm->vcpu); nested_svm_vmloadsave(nested_vmcb, svm->vmcb); nested_svm_unmap(page); - return 1; + return ret; } static int vmsave_interception(struct vcpu_svm *svm) { struct vmcb *nested_vmcb; struct page *page; + 
int ret; if (nested_svm_check_permissions(svm)) return 1; @@ -3032,12 +3128,12 @@ static int vmsave_interception(struct vcpu_svm *svm) return 1; svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); + ret = kvm_skip_emulated_instruction(&svm->vcpu); nested_svm_vmloadsave(svm->vmcb, nested_vmcb); nested_svm_unmap(page); - return 1; + return ret; } static int vmrun_interception(struct vcpu_svm *svm) @@ -3070,25 +3166,36 @@ failed: static int stgi_interception(struct vcpu_svm *svm) { + int ret; + if (nested_svm_check_permissions(svm)) return 1; + /* + * If VGIF is enabled, the STGI intercept is only added to + * detect the opening of the NMI window; remove it now. + */ + if (vgif_enabled(svm)) + clr_intercept(svm, INTERCEPT_STGI); + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); + ret = kvm_skip_emulated_instruction(&svm->vcpu); kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); enable_gif(svm); - return 1; + return ret; } static int clgi_interception(struct vcpu_svm *svm) { + int ret; + if (nested_svm_check_permissions(svm)) return 1; svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); + ret = kvm_skip_emulated_instruction(&svm->vcpu); disable_gif(svm); @@ -3099,7 +3206,7 @@ static int clgi_interception(struct vcpu_svm *svm) mark_dirty(svm->vmcb, VMCB_INTR); } - return 1; + return ret; } static int invlpga_interception(struct vcpu_svm *svm) @@ -3113,8 +3220,7 @@ static int invlpga_interception(struct vcpu_svm *svm) kvm_mmu_invlpg(vcpu, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX)); svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); - return 1; + return kvm_skip_emulated_instruction(&svm->vcpu); } static int skinit_interception(struct vcpu_svm *svm) @@ -3137,7 +3243,7 @@ static int xsetbv_interception(struct vcpu_svm *svm) if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) { svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); + return kvm_skip_emulated_instruction(&svm->vcpu); } return 1; @@ -3233,8 +3339,7 @@ static int invlpg_interception(struct vcpu_svm *svm) return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1); - skip_emulated_instruction(&svm->vcpu); - return 1; + return kvm_skip_emulated_instruction(&svm->vcpu); } static int emulate_on_interception(struct vcpu_svm *svm) @@ -3384,9 +3489,7 @@ static int dr_interception(struct vcpu_svm *svm) kvm_register_write(&svm->vcpu, reg, val); } - skip_emulated_instruction(&svm->vcpu); - - return 1; + return kvm_skip_emulated_instruction(&svm->vcpu); } static int cr8_write_interception(struct vcpu_svm *svm) @@ -3509,6 +3612,7 @@ static int rdmsr_interception(struct vcpu_svm *svm) if (svm_get_msr(&svm->vcpu, &msr_info)) { trace_kvm_msr_read_ex(ecx); kvm_inject_gp(&svm->vcpu, 0); + return 1; } else { trace_kvm_msr_read(ecx, msr_info.data); @@ -3517,9 +3621,8 @@ static int rdmsr_interception(struct vcpu_svm *svm) kvm_register_write(&svm->vcpu, VCPU_REGS_RDX, msr_info.data >> 32); svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; - skip_emulated_instruction(&svm->vcpu); + return kvm_skip_emulated_instruction(&svm->vcpu); } - return 1; } static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data) @@ -3645,11 +3748,11 @@ static int wrmsr_interception(struct vcpu_svm *svm) if (kvm_set_msr(&svm->vcpu, &msr)) { trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(&svm->vcpu, 0); + return 1; } else { trace_kvm_msr_write(ecx, data); - 
skip_emulated_instruction(&svm->vcpu); + return kvm_skip_emulated_instruction(&svm->vcpu); } - return 1; } static int msr_interception(struct vcpu_svm *svm) @@ -3672,14 +3775,16 @@ static int interrupt_window_interception(struct vcpu_svm *svm) static int pause_interception(struct vcpu_svm *svm) { - kvm_vcpu_on_spin(&(svm->vcpu)); + struct kvm_vcpu *vcpu = &svm->vcpu; + bool in_kernel = (svm_get_cpl(vcpu) == 0); + + kvm_vcpu_on_spin(vcpu, in_kernel); return 1; } static int nop_interception(struct vcpu_svm *svm) { - skip_emulated_instruction(&(svm->vcpu)); - return 1; + return kvm_skip_emulated_instruction(&(svm->vcpu)); } static int monitor_interception(struct vcpu_svm *svm) @@ -4064,7 +4169,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar); pr_err("%-20s%08x\n", "event_inj:", control->event_inj); pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err); - pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl); + pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext); pr_err("%-20s%016llx\n", "next_rip:", control->next_rip); pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page); pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id); @@ -4157,8 +4262,6 @@ static int handle_exit(struct kvm_vcpu *vcpu) trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM); - vcpu->arch.gpa_available = (exit_code == SVM_EXIT_NPF); - if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE)) vcpu->arch.cr0 = svm->vmcb->save.cr0; if (npt_enabled) @@ -4303,9 +4406,9 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) return; } -static bool svm_get_enable_apicv(void) +static bool svm_get_enable_apicv(struct kvm_vcpu *vcpu) { - return avic; + return avic && irqchip_split(vcpu->kvm); } static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) @@ -4322,7 +4425,7 @@ static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; - if (!avic) + if (!kvm_vcpu_apicv_active(&svm->vcpu)) return; vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; @@ -4435,7 +4538,7 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, pr_debug("SVM: %s: use GA mode for irq %u\n", __func__, irq.vector); *svm = to_svm(vcpu); - vcpu_info->pi_desc_addr = page_to_phys((*svm)->avic_backing_page); + vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page)); vcpu_info->vector = irq.vector; return 0; @@ -4486,7 +4589,8 @@ static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, struct amd_iommu_pi_data pi; /* Try to enable guest_mode in IRTE */ - pi.base = page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK; + pi.base = __sme_set(page_to_phys(svm->avic_backing_page) & + AVIC_HPA_MASK); pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id, svm->vcpu.vcpu_id); pi.is_guest_mode = true; @@ -4610,9 +4714,11 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes * 1, because that's a separate STGI/VMRUN intercept. The next time we * get that intercept, this function will be called again though and - * we'll get the vintr intercept. + * we'll get the vintr intercept. However, if the vGIF feature is + * enabled, the STGI interception will not occur. Enable the irq + * window under the assumption that the hardware will set the GIF. 
*/ - if (gif_set(svm) && nested_svm_intr(svm)) { + if ((vgif_enabled(svm) || gif_set(svm)) && nested_svm_intr(svm)) { svm_set_vintr(svm); svm_inject_irq(svm, 0x0); } @@ -4626,10 +4732,20 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) == HF_NMI_MASK) return; /* IRET will cause a vm exit */ + if (!gif_set(svm)) { + if (vgif_enabled(svm)) + set_intercept(svm, INTERCEPT_STGI); + return; /* STGI will cause a vm exit */ + } + + if (svm->nested.exit_required) + return; /* we're not going to run the guest yet */ + /* * Something prevents NMI from been injected. Single step over possible * problem (IRET or exception injection or interrupt shadow) */ + svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu); svm->nmi_singlestep = true; svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); } @@ -4770,6 +4886,22 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(svm->nested.exit_required)) return; + /* + * Disable singlestep if we're injecting an interrupt/exception. + * We don't want our modified rflags to be pushed on the stack where + * we might not be able to easily reset them if we disabled NMI + * singlestep later. + */ + if (svm->nmi_singlestep && svm->vmcb->control.event_inj) { + /* + * Event injection happens before external interrupts cause a + * vmexit and interrupts are disabled here, so smp_send_reschedule + * is enough to force an immediate vmexit. + */ + disable_nmi_singlestep(svm); + smp_send_reschedule(vcpu->cpu); + } + pre_svm_run(svm); sync_lapic_to_cr8(vcpu); @@ -4889,7 +5021,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) /* if exit due to PF check for async PF */ if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) - svm->apf_reason = kvm_read_and_reset_pf_reason(); + svm->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); if (npt_enabled) { vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); @@ -4906,12 +5038,13 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) mark_all_clean(svm->vmcb); } +STACK_FRAME_NON_STANDARD(svm_vcpu_run); static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) { struct vcpu_svm *svm = to_svm(vcpu); - svm->vmcb->save.cr3 = root; + svm->vmcb->save.cr3 = __sme_set(root); mark_dirty(svm->vmcb, VMCB_CR); svm_flush_tlb(vcpu); } @@ -4920,7 +5053,7 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) { struct vcpu_svm *svm = to_svm(vcpu); - svm->vmcb->control.nested_cr3 = root; + svm->vmcb->control.nested_cr3 = __sme_set(root); mark_dirty(svm->vmcb, VMCB_NPT); /* Also sync guest cr3 here in case we live migrate */ @@ -4975,17 +5108,14 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) static void svm_cpuid_update(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct kvm_cpuid_entry2 *entry; /* Update nrips enabled cache */ - svm->nrips_enabled = !!guest_cpuid_has_nrips(&svm->vcpu); + svm->nrips_enabled = !!guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS); if (!kvm_vcpu_apicv_active(vcpu)) return; - entry = kvm_find_cpuid_entry(vcpu, 1, 0); - if (entry) - entry->ecx &= ~bit(X86_FEATURE_X2APIC); + guest_cpuid_clear(vcpu, X86_FEATURE_X2APIC); } static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) @@ -5183,6 +5313,7 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, */ if (info->rep_prefix != REPE_PREFIX) goto out; + break; case SVM_EXIT_IOIO: { u64 exit_info; u32 bytes; @@ -5313,8 +5444,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, 
- .get_pkru = svm_get_pkru, - .tlb_flush = svm_flush_tlb, .run = svm_vcpu_run, diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 0a6cc6754ec5..9807c314c478 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_KVM_H @@ -151,8 +152,8 @@ TRACE_EVENT(kvm_fast_mmio, */ TRACE_EVENT(kvm_cpuid, TP_PROTO(unsigned int function, unsigned long rax, unsigned long rbx, - unsigned long rcx, unsigned long rdx), - TP_ARGS(function, rax, rbx, rcx, rdx), + unsigned long rcx, unsigned long rdx, bool found), + TP_ARGS(function, rax, rbx, rcx, rdx, found), TP_STRUCT__entry( __field( unsigned int, function ) @@ -160,6 +161,7 @@ TRACE_EVENT(kvm_cpuid, __field( unsigned long, rbx ) __field( unsigned long, rcx ) __field( unsigned long, rdx ) + __field( bool, found ) ), TP_fast_assign( @@ -168,11 +170,13 @@ TRACE_EVENT(kvm_cpuid, __entry->rbx = rbx; __entry->rcx = rcx; __entry->rdx = rdx; + __entry->found = found; ), - TP_printk("func %x rax %lx rbx %lx rcx %lx rdx %lx", + TP_printk("func %x rax %lx rbx %lx rcx %lx rdx %lx, cpuid entry %s", __entry->function, __entry->rax, - __entry->rbx, __entry->rcx, __entry->rdx) + __entry->rbx, __entry->rcx, __entry->rdx, + __entry->found ? "found" : "not found") ); #define AREG(x) { APIC_##x, "APIC_" #x } diff --git a/arch/x86/kvm/tss.h b/arch/x86/kvm/tss.h index 622aa10f692f..3f9150125e70 100644 --- a/arch/x86/kvm/tss.h +++ b/arch/x86/kvm/tss.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __TSS_SEGMENT_H #define __TSS_SEGMENT_H diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ca5d2b93385c..a6f4f095f8f4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -33,6 +33,7 @@ #include <linux/slab.h> #include <linux/tboot.h> #include <linux/hrtimer.h> +#include <linux/frame.h> #include "kvm_cache_regs.h" #include "x86.h" @@ -48,6 +49,7 @@ #include <asm/kexec.h> #include <asm/apic.h> #include <asm/irq_remapping.h> +#include <asm/mmu_context.h> #include "trace.h" #include "pmu.h" @@ -120,7 +122,7 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) #define KVM_CR4_GUEST_OWNED_BITS \ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXMMEXCPT | X86_CR4_TSD) + | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD) #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) @@ -196,7 +198,10 @@ struct loaded_vmcs { struct vmcs *vmcs; struct vmcs *shadow_vmcs; int cpu; - int launched; + bool launched; + bool nmi_known_unmasked; + unsigned long vmcs_host_cr3; /* May not match real cr3 */ + unsigned long vmcs_host_cr4; /* May not match real cr4 */ struct list_head loaded_vmcss_on_cpu_link; }; @@ -240,11 +245,13 @@ struct __packed vmcs12 { u64 virtual_apic_page_addr; u64 apic_access_addr; u64 posted_intr_desc_addr; + u64 vm_function_control; u64 ept_pointer; u64 eoi_exit_bitmap0; u64 eoi_exit_bitmap1; u64 eoi_exit_bitmap2; u64 eoi_exit_bitmap3; + u64 eptp_list_address; u64 xss_exit_bitmap; u64 guest_physical_address; u64 vmcs_link_pointer; @@ -413,13 +420,10 @@ struct nested_vmx { /* The guest-physical address of the current VMCS L1 keeps for L2 */ gpa_t current_vmptr; - /* The host-usable pointer to the above */ - struct page *current_vmcs12_page; - struct vmcs12 *current_vmcs12; /* * Cache of the guest's VMCS, 
existing outside of guest memory. * Loaded from guest memory during VMPTRLD. Flushed to guest - * memory during VMXOFF, VMCLEAR, VMPTRLD. + * memory during VMCLEAR and VMPTRLD. */ struct vmcs12 *cached_vmcs12; /* @@ -481,6 +485,7 @@ struct nested_vmx { u64 nested_vmx_cr4_fixed0; u64 nested_vmx_cr4_fixed1; u64 nested_vmx_vmcs_enum; + u64 nested_vmx_vmfunc_controls; }; #define POSTED_INTR_ON 0 @@ -560,7 +565,6 @@ struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; u8 fail; - bool nmi_known_unmasked; u32 exit_intr_info; u32 idt_vectoring_info; ulong rflags; @@ -574,6 +578,8 @@ struct vcpu_vmx { #endif u32 vm_entry_controls_shadow; u32 vm_exit_controls_shadow; + u32 secondary_exec_control; + /* * loaded_vmcs points to the VMCS currently used in this vcpu. For a * non-nested (L1) guest, it always points to vmcs01. For a nested @@ -596,7 +602,6 @@ struct vcpu_vmx { int gs_ldt_reload_needed; int fs_reload_needed; u64 msr_host_bndcfgs; - unsigned long vmcs_host_cr4; /* May not match real cr4 */ } host_state; struct { int vm86_active; @@ -636,8 +641,6 @@ struct vcpu_vmx { u64 current_tsc_ratio; - bool guest_pkru_valid; - u32 guest_pkru; u32 host_pkru; /* @@ -763,11 +766,13 @@ static const unsigned short vmcs_field_to_offset_table[] = { FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), FIELD64(APIC_ACCESS_ADDR, apic_access_addr), FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr), + FIELD64(VM_FUNCTION_CONTROL, vm_function_control), FIELD64(EPT_POINTER, ept_pointer), FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0), FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1), FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2), FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3), + FIELD64(EPTP_LIST_ADDRESS, eptp_list_address), FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap), FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address), FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer), @@ -891,27 +896,9 @@ static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) return to_vmx(vcpu)->nested.cached_vmcs12; } -static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr) -{ - struct page *page = kvm_vcpu_gfn_to_page(vcpu, addr >> PAGE_SHIFT); - if (is_error_page(page)) - return NULL; - - return page; -} - -static void nested_release_page(struct page *page) -{ - kvm_release_page_dirty(page); -} - -static void nested_release_page_clean(struct page *page) -{ - kvm_release_page_clean(page); -} - +static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu); static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); -static u64 construct_eptp(unsigned long root_hpa); +static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); static bool vmx_xsaves_supported(void); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, @@ -923,6 +910,10 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var); static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); static int alloc_identity_pagetable(struct kvm *kvm); +static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); +static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); +static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, + u16 error_code); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -1209,6 +1200,16 @@ static inline bool cpu_has_vmx_ept_4levels(void) return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; } +static inline bool cpu_has_vmx_ept_mt_wb(void) +{ + 
return vmx_capability.ept & VMX_EPTP_WB_BIT; +} + +static inline bool cpu_has_vmx_ept_5levels(void) +{ + return vmx_capability.ept & VMX_EPT_PAGE_WALK_5_BIT; +} + static inline bool cpu_has_vmx_ept_ad_bits(void) { return vmx_capability.ept & VMX_EPT_AD_BIT; @@ -1314,6 +1315,12 @@ static inline bool cpu_has_vmx_tsc_scaling(void) SECONDARY_EXEC_TSC_SCALING; } +static inline bool cpu_has_vmx_vmfunc(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_VMFUNC; +} + static inline bool report_flexpriority(void) { return flexpriority_enabled; @@ -1354,8 +1361,7 @@ static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12) static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12) { - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES) && - vmx_xsaves_supported(); + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); } static inline bool nested_cpu_has_pml(struct vmcs12 *vmcs12) @@ -1388,6 +1394,18 @@ static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12) return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR; } +static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC); +} + +static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12) +{ + return nested_cpu_has_vmfunc(vmcs12) && + (vmcs12->vm_function_control & + VMX_VMFUNC_EPTP_SWITCHING); +} + static inline bool is_nmi(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -2184,46 +2202,44 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) struct pi_desc old, new; unsigned int dest; - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) + /* + * In case of hot-plug or hot-unplug, we may have to undo + * vmx_vcpu_pi_put even if there is no assigned device. And we + * always keep PI.NDST up to date for simplicity: it makes the + * code easier, and CPU migration is not a fast path. + */ + if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) + return; + + /* + * First handle the simple case where no cmpxchg is necessary; just + * allow posting non-urgent interrupts. + * + * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change + * PI.NDST: pi_post_block will do it for us and the wakeup_handler + * expects the VCPU to be on the blocked_vcpu_list that matches + * PI.NDST. + */ + if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || + vcpu->cpu == cpu) { + pi_clear_sn(pi_desc); return; + } + /* The full case. */ do { old.control = new.control = pi_desc->control; - /* - * If 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there - * are two possible cases: - * 1. After running 'pre_block', context switch - * happened. For this case, 'sn' was set in - * vmx_vcpu_put(), so we need to clear it here. - * 2. After running 'pre_block', we were blocked, - * and woken up by some other guy. For this case, - * we don't need to do anything, 'pi_post_block' - * will do everything for us. However, we cannot - * check whether it is case #1 or case #2 here - * (maybe, not needed), so we also clear sn here, - * I think it is not a big deal. 
- */ - if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) { - if (vcpu->cpu != cpu) { - dest = cpu_physical_id(cpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - } + dest = cpu_physical_id(cpu); - /* set 'NV' to 'notification vector' */ - new.nv = POSTED_INTR_VECTOR; - } + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; - /* Allow posting non-urgent interrupts */ new.sn = 0; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); } static void decache_tsc_multiplier(struct vcpu_vmx *vmx) @@ -2322,6 +2338,11 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) __vmx_load_host_state(to_vmx(vcpu)); } +static bool emulation_required(struct kvm_vcpu *vcpu) +{ + return emulate_invalid_guest_state && !guest_state_valid(vcpu); +} + static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); /* @@ -2359,6 +2380,8 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { + unsigned long old_rflags = vmx_get_rflags(vcpu); + __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); to_vmx(vcpu)->rflags = rflags; if (to_vmx(vcpu)->rmode.vm86_active) { @@ -2366,11 +2389,9 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; } vmcs_writel(GUEST_RFLAGS, rflags); -} -static u32 vmx_get_pkru(struct kvm_vcpu *vcpu) -{ - return to_vmx(vcpu)->guest_pkru; + if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM) + to_vmx(vcpu)->emulation_required = emulation_required(vcpu); } static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) @@ -2414,34 +2435,79 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) vmx_set_interrupt_shadow(vcpu, 0); } +static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, + unsigned long exit_qual) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + unsigned int nr = vcpu->arch.exception.nr; + u32 intr_info = nr | INTR_INFO_VALID_MASK; + + if (vcpu->arch.exception.has_error_code) { + vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code; + intr_info |= INTR_INFO_DELIVER_CODE_MASK; + } + + if (kvm_exception_is_soft(nr)) + intr_info |= INTR_TYPE_SOFT_EXCEPTION; + else + intr_info |= INTR_TYPE_HARD_EXCEPTION; + + if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && + vmx_get_nmi_mask(vcpu)) + intr_info |= INTR_INFO_UNBLOCK_NMI; + + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); +} + /* * KVM wants to inject page-faults which it got to the guest. This function * checks whether in a nested guest, we need to inject them to L1 or L2. */ -static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr) +static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + unsigned int nr = vcpu->arch.exception.nr; - if (!(vmcs12->exception_bitmap & (1u << nr))) - return 0; + if (nr == PF_VECTOR) { + if (vcpu->arch.exception.nested_apf) { + *exit_qual = vcpu->arch.apf.nested_apf_token; + return 1; + } + /* + * FIXME: we must not write CR2 when L1 intercepts an L2 #PF exception. + * The fix is to add the ancillary datum (CR2 or DR6) to structs + * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6 + * can be written only when inject_pending_event runs. 
This should be + * conditional on a new capability---if the capability is disabled, + * kvm_multiple_exception would write the ancillary information to + * CR2 or DR6, for backwards ABI-compatibility. + */ + if (nested_vmx_is_page_fault_vmexit(vmcs12, + vcpu->arch.exception.error_code)) { + *exit_qual = vcpu->arch.cr2; + return 1; + } + } else { + if (vmcs12->exception_bitmap & (1u << nr)) { + if (nr == DB_VECTOR) + *exit_qual = vcpu->arch.dr6; + else + *exit_qual = 0; + return 1; + } + } - nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, - vmcs_read32(VM_EXIT_INTR_INFO), - vmcs_readl(EXIT_QUALIFICATION)); - return 1; + return 0; } -static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, - bool has_error_code, u32 error_code, - bool reinject) +static void vmx_queue_exception(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned nr = vcpu->arch.exception.nr; + bool has_error_code = vcpu->arch.exception.has_error_code; + u32 error_code = vcpu->arch.exception.error_code; u32 intr_info = nr | INTR_INFO_VALID_MASK; - if (!reinject && is_guest_mode(vcpu) && - nested_vmx_check_exception(vcpu, nr)) - return; - if (has_error_code) { vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); intr_info |= INTR_INFO_DELIVER_CODE_MASK; @@ -2540,7 +2606,7 @@ static void setup_msrs(struct vcpu_vmx *vmx) if (index >= 0) move_msr_up(vmx, index, save_nmsrs++); index = __find_msr_index(vmx, MSR_TSC_AUX); - if (index >= 0 && guest_cpuid_has_rdtscp(&vmx->vcpu)) + if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP)) move_msr_up(vmx, index, save_nmsrs++); /* * MSR_STAR is only needed on long mode guests, and only @@ -2600,12 +2666,6 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) } } -static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0); - return best && (best->ecx & (1 << (X86_FEATURE_VMX & 31))); -} - /* * nested_vmx_allowed() checks whether a guest should be allowed to use VMX * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for @@ -2614,7 +2674,7 @@ static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu) */ static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu) { - return nested && guest_cpuid_has_vmx(vcpu); + return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX); } /* @@ -2640,7 +2700,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) * reason is that if one of these bits is necessary, it will appear * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control * fields of vmcs01 and vmcs02, will turn these bits off - and - * nested_vmx_exit_handled() will not pass related exits to L1. + * nested_vmx_exit_reflected() will not pass related exits to L1. * These rules have exceptions below. */ @@ -2737,21 +2797,21 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_procbased_ctls_low &= ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); - /* secondary cpu-based controls */ + /* + * secondary cpu-based controls. Do not include those that + * depend on CPUID bits, they are added later by vmx_cpuid_update. 
+ */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, vmx->nested.nested_vmx_secondary_ctls_low, vmx->nested.nested_vmx_secondary_ctls_high); vmx->nested.nested_vmx_secondary_ctls_low = 0; vmx->nested.nested_vmx_secondary_ctls_high &= - SECONDARY_EXEC_RDRAND | SECONDARY_EXEC_RDSEED | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | - SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_DESC | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | - SECONDARY_EXEC_WBINVD_EXITING | - SECONDARY_EXEC_XSAVES; + SECONDARY_EXEC_WBINVD_EXITING; if (enable_ept) { /* nested EPT: emulate EPT also to L1 */ @@ -2769,11 +2829,22 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) if (enable_ept_ad_bits) { vmx->nested.nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_PML; - vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT; + vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT; } } else vmx->nested.nested_vmx_ept_caps = 0; + if (cpu_has_vmx_vmfunc()) { + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_ENABLE_VMFUNC; + /* + * Advertise EPTP switching unconditionally + * since we emulate it + */ + vmx->nested.nested_vmx_vmfunc_controls = + VMX_VMFUNC_EPTP_SWITCHING; + } + /* * Old versions of KVM use the single-context version without * checking for support, so declare that it is supported even @@ -3143,6 +3214,9 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) *pdata = vmx->nested.nested_vmx_ept_caps | ((u64)vmx->nested.nested_vmx_vpid_caps << 32); break; + case MSR_IA32_VMX_VMFUNC: + *pdata = vmx->nested.nested_vmx_vmfunc_controls; + break; default: return 1; } @@ -3195,7 +3269,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); break; case MSR_IA32_BNDCFGS: - if (!kvm_mpx_supported()) + if (!kvm_mpx_supported() || + (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_MPX))) return 1; msr_info->data = vmcs_read64(GUEST_BNDCFGS); break; @@ -3219,7 +3295,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.ia32_xss; break; case MSR_TSC_AUX: - if (!guest_cpuid_has_rdtscp(vcpu) && !msr_info->host_initiated) + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) return 1; /* Otherwise falls through */ default: @@ -3277,7 +3354,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_BNDCFGS: - if (!kvm_mpx_supported()) + if (!kvm_mpx_supported() || + (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_MPX))) + return 1; + if (is_noncanonical_address(data & PAGE_MASK, vcpu) || + (data & MSR_IA32_BNDCFGS_RSVD)) return 1; vmcs_write64(GUEST_BNDCFGS, data); break; @@ -3337,7 +3419,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) clear_atomic_switch_msr(vmx, MSR_IA32_XSS); break; case MSR_TSC_AUX: - if (!guest_cpuid_has_rdtscp(vcpu) && !msr_info->host_initiated) + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) return 1; /* Check reserved bit, higher 32 bits should be zero */ if ((data >> 32) != 0) @@ -3574,8 +3657,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_SHADOW_VMCS | SECONDARY_EXEC_XSAVES | + SECONDARY_EXEC_RDSEED | + SECONDARY_EXEC_RDRAND | SECONDARY_EXEC_ENABLE_PML | - SECONDARY_EXEC_TSC_SCALING; + 
SECONDARY_EXEC_TSC_SCALING | + SECONDARY_EXEC_ENABLE_VMFUNC; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -3755,6 +3841,25 @@ static void free_kvm_area(void) } } +enum vmcs_field_type { + VMCS_FIELD_TYPE_U16 = 0, + VMCS_FIELD_TYPE_U64 = 1, + VMCS_FIELD_TYPE_U32 = 2, + VMCS_FIELD_TYPE_NATURAL_WIDTH = 3 +}; + +static inline int vmcs_field_type(unsigned long field) +{ + if (0x1 & field) /* the *_HIGH fields are all 32 bit */ + return VMCS_FIELD_TYPE_U32; + return (field >> 13) & 0x3 ; +} + +static inline int vmcs_field_readonly(unsigned long field) +{ + return (((field >> 10) & 0x3) == 1); +} + static void init_vmcs_shadow_fields(void) { int i, j; @@ -3780,14 +3885,22 @@ static void init_vmcs_shadow_fields(void) /* shadowed fields guest access without vmexit */ for (i = 0; i < max_shadow_read_write_fields; i++) { - clear_bit(shadow_read_write_fields[i], - vmx_vmwrite_bitmap); - clear_bit(shadow_read_write_fields[i], - vmx_vmread_bitmap); + unsigned long field = shadow_read_write_fields[i]; + + clear_bit(field, vmx_vmwrite_bitmap); + clear_bit(field, vmx_vmread_bitmap); + if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64) { + clear_bit(field + 1, vmx_vmwrite_bitmap); + clear_bit(field + 1, vmx_vmread_bitmap); + } + } + for (i = 0; i < max_shadow_read_only_fields; i++) { + unsigned long field = shadow_read_only_fields[i]; + + clear_bit(field, vmx_vmread_bitmap); + if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64) + clear_bit(field + 1, vmx_vmread_bitmap); } - for (i = 0; i < max_shadow_read_only_fields; i++) - clear_bit(shadow_read_only_fields[i], - vmx_vmread_bitmap); } static __init int alloc_kvm_area(void) @@ -3808,11 +3921,6 @@ static __init int alloc_kvm_area(void) return 0; } -static bool emulation_required(struct kvm_vcpu *vcpu) -{ - return emulate_invalid_guest_state && !guest_state_valid(vcpu); -} - static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment *save) { @@ -4010,7 +4118,7 @@ static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid) if (enable_ept) { if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; - ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); + ept_sync_context(construct_eptp(vcpu, vcpu->arch.mmu.root_hpa)); } else { vpid_sync_context(vpid); } @@ -4185,15 +4293,22 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vmx->emulation_required = emulation_required(vcpu); } -static u64 construct_eptp(unsigned long root_hpa) +static int get_ept_level(struct kvm_vcpu *vcpu) { - u64 eptp; + if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48)) + return 5; + return 4; +} - /* TODO write the value reading from MSR */ - eptp = VMX_EPT_DEFAULT_MT | - VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT; - if (enable_ept_ad_bits) - eptp |= VMX_EPT_AD_ENABLE_BIT; +static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) +{ + u64 eptp = VMX_EPTP_MT_WB; + + eptp |= (get_ept_level(vcpu) == 5) ? 
VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; + + if (enable_ept_ad_bits && + (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) + eptp |= VMX_EPTP_AD_ENABLE_BIT; eptp |= (root_hpa & PAGE_MASK); return eptp; @@ -4206,7 +4321,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) guest_cr3 = cr3; if (enable_ept) { - eptp = construct_eptp(cr3); + eptp = construct_eptp(vcpu, cr3); vmcs_write64(EPT_POINTER, eptp); if (is_paging(vcpu) || is_guest_mode(vcpu)) guest_cr3 = kvm_read_cr3(vcpu); @@ -4624,6 +4739,11 @@ static bool guest_state_valid(struct kvm_vcpu *vcpu) return true; } +static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu)); +} + static int init_rmode_tss(struct kvm *kvm) { gfn_t fn; @@ -4890,11 +5010,33 @@ static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_activ } } -static bool vmx_get_enable_apicv(void) +static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu) { return enable_apicv; } +static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + gfn_t gfn; + + /* + * Don't need to mark the APIC access page dirty; it is never + * written to by the CPU during APIC virtualization. + */ + + if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { + gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; + kvm_vcpu_mark_page_dirty(vcpu, gfn); + } + + if (nested_cpu_has_posted_intr(vmcs12)) { + gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; + kvm_vcpu_mark_page_dirty(vcpu, gfn); + } +} + + static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -4902,18 +5044,15 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) void *vapic_page; u16 status; - if (vmx->nested.pi_desc && - vmx->nested.pi_pending) { - vmx->nested.pi_pending = false; - if (!pi_test_and_clear_on(vmx->nested.pi_desc)) - return; - - max_irr = find_last_bit( - (unsigned long *)vmx->nested.pi_desc->pir, 256); + if (!vmx->nested.pi_desc || !vmx->nested.pi_pending) + return; - if (max_irr == 256) - return; + vmx->nested.pi_pending = false; + if (!pi_test_and_clear_on(vmx->nested.pi_desc)) + return; + max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); + if (max_irr != 256) { vapic_page = kmap(vmx->nested.virtual_apic_page); __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); kunmap(vmx->nested.virtual_apic_page); @@ -4925,30 +5064,43 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) vmcs_write16(GUEST_INTR_STATUS, status); } } + + nested_mark_vmcs12_pages_dirty(vcpu); } -static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) +static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, + bool nested) { #ifdef CONFIG_SMP - if (vcpu->mode == IN_GUEST_MODE) { - struct vcpu_vmx *vmx = to_vmx(vcpu); + int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR; + if (vcpu->mode == IN_GUEST_MODE) { /* - * Currently, we don't support urgent interrupt, - * all interrupts are recognized as non-urgent - * interrupt, so we cannot post interrupts when - * 'SN' is set. + * The vector of interrupt to be delivered to vcpu had + * been set in PIR before this function. + * + * Following cases will be reached in this block, and + * we always send a notification event in all cases as + * explained below. + * + * Case 1: vcpu keeps in non-root mode. Sending a + * notification event posts the interrupt to vcpu. 
+ * + * Case 2: vcpu exits to root mode and is still + * runnable. PIR will be synced to vIRR before the + * next vcpu entry. Sending a notification event in + * this case has no effect, as vcpu is not in root + * mode. * - * If the vcpu is in guest mode, it means it is - * running instead of being scheduled out and - * waiting in the run queue, and that's the only - * case when 'SN' is set currently, warning if - * 'SN' is set. + * Case 3: vcpu exits to root mode and is blocked. + * vcpu_block() has already synced PIR to vIRR and + * never blocks vcpu if vIRR is not cleared. Therefore, + * a blocked vcpu here does not wait for any requested + * interrupts in PIR, and sending a notification event + * which has no effect is safe here. */ - WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc)); - apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), - POSTED_INTR_VECTOR); + apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); return true; } #endif @@ -4963,7 +5115,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, if (is_guest_mode(vcpu) && vector == vmx->nested.posted_intr_nv) { /* the PIR and ON have been set by L1. */ - kvm_vcpu_trigger_posted_interrupt(vcpu); + kvm_vcpu_trigger_posted_interrupt(vcpu, true); /* * If a posted intr is not recognized by hardware, * we will accomplish it in the next vmentry. @@ -4997,7 +5149,7 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) if (pi_test_and_set_on(&vmx->pi_desc)) return; - if (!kvm_vcpu_trigger_posted_interrupt(vcpu)) + if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false)) kvm_vcpu_kick(vcpu); } @@ -5012,17 +5164,24 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) u32 low32, high32; unsigned long tmpl; struct desc_ptr dt; - unsigned long cr0, cr4; + unsigned long cr0, cr3, cr4; cr0 = read_cr0(); WARN_ON(cr0 & X86_CR0_TS); vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */ - vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ + + /* + * Save the most likely value for this task's CR3 in the VMCS. + * We can't use __get_current_cr3_fast() because we're not atomic. + */ + cr3 = __read_cr3(); + vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ + vmx->loaded_vmcs->vmcs_host_cr3 = cr3; /* Save the most likely value for this task's CR4 in the VMCS. 
*/ cr4 = cr4_read_shadow(); vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ - vmx->host_state.vmcs_host_cr4 = cr4; + vmx->loaded_vmcs->vmcs_host_cr4 = cr4; vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ #ifdef CONFIG_X86_64 @@ -5040,7 +5199,7 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ - native_store_idt(&dt); + store_idt(&dt); vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ vmx->host_idt_base = dt.address; @@ -5120,10 +5279,24 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) return exec_control; } -static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) +static bool vmx_rdrand_supported(void) { + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_RDRAND; +} + +static bool vmx_rdseed_supported(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_RDSEED; +} + +static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) +{ + struct kvm_vcpu *vcpu = &vmx->vcpu; + u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; - if (!cpu_need_virtualize_apic_accesses(&vmx->vcpu)) + if (!cpu_need_virtualize_apic_accesses(vcpu)) exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; if (vmx->vpid == 0) exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; @@ -5137,7 +5310,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; if (!ple_gap) exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; - if (!kvm_vcpu_apicv_active(&vmx->vcpu)) + if (!kvm_vcpu_apicv_active(vcpu)) exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; @@ -5151,7 +5324,92 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) if (!enable_pml) exec_control &= ~SECONDARY_EXEC_ENABLE_PML; - return exec_control; + if (vmx_xsaves_supported()) { + /* Exposing XSAVES only when XSAVE is exposed */ + bool xsaves_enabled = + guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && + guest_cpuid_has(vcpu, X86_FEATURE_XSAVES); + + if (!xsaves_enabled) + exec_control &= ~SECONDARY_EXEC_XSAVES; + + if (nested) { + if (xsaves_enabled) + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_XSAVES; + else + vmx->nested.nested_vmx_secondary_ctls_high &= + ~SECONDARY_EXEC_XSAVES; + } + } + + if (vmx_rdtscp_supported()) { + bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP); + if (!rdtscp_enabled) + exec_control &= ~SECONDARY_EXEC_RDTSCP; + + if (nested) { + if (rdtscp_enabled) + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_RDTSCP; + else + vmx->nested.nested_vmx_secondary_ctls_high &= + ~SECONDARY_EXEC_RDTSCP; + } + } + + if (vmx_invpcid_supported()) { + /* Exposing INVPCID only when PCID is exposed */ + bool invpcid_enabled = + guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) && + guest_cpuid_has(vcpu, X86_FEATURE_PCID); + + if (!invpcid_enabled) { + exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; + guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID); + } + + if (nested) { + if (invpcid_enabled) + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_ENABLE_INVPCID; + else + vmx->nested.nested_vmx_secondary_ctls_high &= + ~SECONDARY_EXEC_ENABLE_INVPCID; + } + } + + if (vmx_rdrand_supported()) { + bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND); + if (rdrand_enabled) + exec_control &= ~SECONDARY_EXEC_RDRAND; + + if (nested) { + if (rdrand_enabled) + 
vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_RDRAND; + else + vmx->nested.nested_vmx_secondary_ctls_high &= + ~SECONDARY_EXEC_RDRAND; + } + } + + if (vmx_rdseed_supported()) { + bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED); + if (rdseed_enabled) + exec_control &= ~SECONDARY_EXEC_RDSEED; + + if (nested) { + if (rdseed_enabled) + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_RDSEED; + else + vmx->nested.nested_vmx_secondary_ctls_high &= + ~SECONDARY_EXEC_RDSEED; + } + } + + vmx->secondary_exec_control = exec_control; } static void ept_set_mmio_spte_mask(void) @@ -5160,7 +5418,8 @@ static void ept_set_mmio_spte_mask(void) * EPT Misconfigurations can be generated if the value of bits 2:0 * of an EPT paging-structure entry is 110b (write/execute). */ - kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE); + kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK, + VMX_EPT_MISCONFIG_WX_VALUE); } #define VMX_XSS_EXIT_BITMAP 0 @@ -5194,8 +5453,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); if (cpu_has_secondary_exec_ctrls()) { + vmx_compute_secondary_exec_control(vmx); vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - vmx_secondary_exec_control(vmx)); + vmx->secondary_exec_control); } if (kvm_vcpu_apicv_active(&vmx->vcpu)) { @@ -5233,6 +5493,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ #endif + if (cpu_has_vmx_vmfunc()) + vmcs_write64(VM_FUNCTION_CONTROL, 0); + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); @@ -5356,9 +5619,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - if (kvm_vcpu_apicv_active(vcpu)) - memset(&vmx->pi_desc, 0, sizeof(struct pi_desc)); - if (vmx->vpid != 0) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); @@ -5447,10 +5707,8 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!is_guest_mode(vcpu)) { - ++vcpu->stat.nmi_injections; - vmx->nmi_known_unmasked = false; - } + ++vcpu->stat.nmi_injections; + vmx->loaded_vmcs->nmi_known_unmasked = false; if (vmx->rmode.vm86_active) { if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE) @@ -5464,16 +5722,21 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) { - if (to_vmx(vcpu)->nmi_known_unmasked) + struct vcpu_vmx *vmx = to_vmx(vcpu); + bool masked; + + if (vmx->loaded_vmcs->nmi_known_unmasked) return false; - return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; + masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; + vmx->loaded_vmcs->nmi_known_unmasked = !masked; + return masked; } static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) { struct vcpu_vmx *vmx = to_vmx(vcpu); - vmx->nmi_known_unmasked = !masked; + vmx->loaded_vmcs->nmi_known_unmasked = !masked; if (masked) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); @@ -5646,14 +5909,11 @@ static int handle_exception(struct kvm_vcpu *vcpu) } if (is_page_fault(intr_info)) { - /* EPT won't cause page fault directly */ - BUG_ON(enable_ept); cr2 = vmcs_readl(EXIT_QUALIFICATION); - trace_kvm_page_fault(cr2, error_code); - - if (kvm_event_needs_reinjection(vcpu)) - kvm_mmu_unprotect_page_virt(vcpu, cr2); - return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0); + /* EPT 
won't cause page fault directly */ + WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept); + return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0, + true); } ex_no = intr_info & INTR_INFO_VECTOR_MASK; @@ -5711,6 +5971,7 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu) static int handle_triple_fault(struct kvm_vcpu *vcpu) { vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; + vcpu->mmio_needed = 0; return 0; } @@ -6206,21 +6467,10 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; gpa_t gpa; - u32 error_code; + u64 error_code; exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - if (is_guest_mode(vcpu) - && !(exit_qualification & EPT_VIOLATION_GVA_TRANSLATED)) { - /* - * Fix up exit_qualification according to whether guest - * page table accesses are reads or writes. - */ - u64 eptp = nested_ept_get_cr3(vcpu); - if (!(eptp & VMX_EPT_AD_ENABLE_BIT)) - exit_qualification &= ~EPT_VIOLATION_ACC_WRITE; - } - /* * EPT violation happened while executing iret from NMI, * "blocked by NMI" bit has to be set before next VM entry. @@ -6249,9 +6499,10 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) EPT_VIOLATION_EXECUTABLE)) ? PFERR_PRESENT_MASK : 0; - vcpu->arch.gpa_available = true; - vcpu->arch.exit_qualification = exit_qualification; + error_code |= (exit_qualification & 0x100) != 0 ? + PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; + vcpu->arch.exit_qualification = exit_qualification; return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); } @@ -6260,23 +6511,20 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) int ret; gpa_t gpa; + /* + * A nested guest cannot optimize MMIO vmexits, because we have an + * nGPA here instead of the required GPA. + */ gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); - if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { + if (!is_guest_mode(vcpu) && + !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { trace_kvm_fast_mmio(gpa); return kvm_skip_emulated_instruction(vcpu); } - ret = handle_mmio_page_fault(vcpu, gpa, true); - vcpu->arch.gpa_available = true; - if (likely(ret == RET_MMIO_PF_EMULATE)) - return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) == - EMULATE_DONE; - - if (unlikely(ret == RET_MMIO_PF_INVALID)) - return kvm_mmu_page_fault(vcpu, gpa, 0, NULL, 0); - - if (unlikely(ret == RET_MMIO_PF_RETRY)) - return 1; + ret = kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); + if (ret >= 0) + return ret; /* It is the real ept misconfig */ WARN_ON(1); @@ -6443,7 +6691,7 @@ void vmx_enable_tdp(void) enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, 0ull, VMX_EPT_EXECUTABLE_MASK, cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, - enable_ept_ad_bits ? 
0ull : VMX_EPT_RWX_MASK); + VMX_EPT_RWX_MASK, 0ull); ept_set_mmio_spte_mask(); kvm_enable_tdp(); @@ -6498,7 +6746,8 @@ static __init int hardware_setup(void) init_vmcs_shadow_fields(); if (!cpu_has_vmx_ept() || - !cpu_has_vmx_ept_4levels()) { + !cpu_has_vmx_ept_4levels() || + !cpu_has_vmx_ept_mt_wb()) { enable_ept = 0; enable_unrestricted_guest = 0; enable_ept_ad_bits = 0; @@ -6547,7 +6796,6 @@ static __init int hardware_setup(void) vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); - vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); memcpy(vmx_msr_bitmap_legacy_x2apic_apicv, vmx_msr_bitmap_legacy, PAGE_SIZE); @@ -6642,7 +6890,13 @@ static int handle_pause(struct kvm_vcpu *vcpu) if (ple_gap) grow_ple_window(vcpu); - kvm_vcpu_on_spin(vcpu); + /* + * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting" + * VM-execution control is ignored if CPL > 0. OTOH, KVM + * never set PAUSE_EXITING and just set PLE if supported, + * so the vcpu must be CPL=0 if it gets a PAUSE exit. + */ + kvm_vcpu_on_spin(vcpu, true); return kvm_skip_emulated_instruction(vcpu); } @@ -6657,6 +6911,12 @@ static int handle_mwait(struct kvm_vcpu *vcpu) return handle_nop(vcpu); } +static int handle_invalid_op(struct kvm_vcpu *vcpu) +{ + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; +} + static int handle_monitor_trap(struct kvm_vcpu *vcpu) { return 1; @@ -6873,7 +7133,7 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu, * non-canonical form. This is the only check on the memory * destination for long mode! */ - exn = is_noncanonical_address(*ret); + exn = is_noncanonical_address(*ret, vcpu); } else if (is_protmode(vcpu)) { /* Protected mode: apply checks for segment validity in the * following order: @@ -7037,19 +7297,19 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } - page = nested_get_page(vcpu, vmptr); - if (page == NULL) { + page = kvm_vcpu_gpa_to_page(vcpu, vmptr); + if (is_error_page(page)) { nested_vmx_failInvalid(vcpu); return kvm_skip_emulated_instruction(vcpu); } if (*(u32 *)kmap(page) != VMCS12_REVISION) { kunmap(page); - nested_release_page_clean(page); + kvm_release_page_clean(page); nested_vmx_failInvalid(vcpu); return kvm_skip_emulated_instruction(vcpu); } kunmap(page); - nested_release_page_clean(page); + kvm_release_page_clean(page); vmx->nested.vmxon_ptr = vmptr; ret = enter_vmx_operation(vcpu); @@ -7076,34 +7336,32 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) return 1; } +static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) +{ + vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS); + vmcs_write64(VMCS_LINK_POINTER, -1ull); +} + static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) { if (vmx->nested.current_vmptr == -1ull) return; - /* current_vmptr and current_vmcs12 are always set/reset together */ - if (WARN_ON(vmx->nested.current_vmcs12 == NULL)) - return; - if (enable_shadow_vmcs) { /* copy to memory all shadowed fields in case they were modified */ copy_shadow_to_vmcs12(vmx); vmx->nested.sync_shadow_vmcs = false; - vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, - SECONDARY_EXEC_SHADOW_VMCS); - vmcs_write64(VMCS_LINK_POINTER, -1ull); + vmx_disable_shadow_vmcs(vmx); } vmx->nested.posted_intr_nv = -1; /* Flush VMCS12 to guest memory */ - memcpy(vmx->nested.current_vmcs12, vmx->nested.cached_vmcs12, - VMCS12_SIZE); + kvm_vcpu_write_guest_page(&vmx->vcpu, + 
vmx->nested.current_vmptr >> PAGE_SHIFT, + vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); - kunmap(vmx->nested.current_vmcs12_page); - nested_release_page(vmx->nested.current_vmcs12_page); vmx->nested.current_vmptr = -1ull; - vmx->nested.current_vmcs12 = NULL; } /* @@ -7117,12 +7375,14 @@ static void free_nested(struct vcpu_vmx *vmx) vmx->nested.vmxon = false; free_vpid(vmx->nested.vpid02); - nested_release_vmcs12(vmx); + vmx->nested.posted_intr_nv = -1; + vmx->nested.current_vmptr = -1ull; if (vmx->nested.msr_bitmap) { free_page((unsigned long)vmx->nested.msr_bitmap); vmx->nested.msr_bitmap = NULL; } if (enable_shadow_vmcs) { + vmx_disable_shadow_vmcs(vmx); vmcs_clear(vmx->vmcs01.shadow_vmcs); free_vmcs(vmx->vmcs01.shadow_vmcs); vmx->vmcs01.shadow_vmcs = NULL; @@ -7130,16 +7390,16 @@ static void free_nested(struct vcpu_vmx *vmx) kfree(vmx->nested.cached_vmcs12); /* Unpin physical memory we referred to in current vmcs02 */ if (vmx->nested.apic_access_page) { - nested_release_page(vmx->nested.apic_access_page); + kvm_release_page_dirty(vmx->nested.apic_access_page); vmx->nested.apic_access_page = NULL; } if (vmx->nested.virtual_apic_page) { - nested_release_page(vmx->nested.virtual_apic_page); + kvm_release_page_dirty(vmx->nested.virtual_apic_page); vmx->nested.virtual_apic_page = NULL; } if (vmx->nested.pi_desc_page) { kunmap(vmx->nested.pi_desc_page); - nested_release_page(vmx->nested.pi_desc_page); + kvm_release_page_dirty(vmx->nested.pi_desc_page); vmx->nested.pi_desc_page = NULL; vmx->nested.pi_desc = NULL; } @@ -7208,25 +7468,6 @@ static int handle_vmresume(struct kvm_vcpu *vcpu) return nested_vmx_run(vcpu, false); } -enum vmcs_field_type { - VMCS_FIELD_TYPE_U16 = 0, - VMCS_FIELD_TYPE_U64 = 1, - VMCS_FIELD_TYPE_U32 = 2, - VMCS_FIELD_TYPE_NATURAL_WIDTH = 3 -}; - -static inline int vmcs_field_type(unsigned long field) -{ - if (0x1 & field) /* the *_HIGH fields are all 32 bit */ - return VMCS_FIELD_TYPE_U32; - return (field >> 13) & 0x3 ; -} - -static inline int vmcs_field_readonly(unsigned long field) -{ - return (((field >> 10) & 0x3) == 1); -} - /* * Read a vmcs12 field. Since these can have varying lengths and we return * one type, we chose the biggest type (u64) and zero-extend the return value @@ -7525,29 +7766,29 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) if (vmx->nested.current_vmptr != vmptr) { struct vmcs12 *new_vmcs12; struct page *page; - page = nested_get_page(vcpu, vmptr); - if (page == NULL) { + page = kvm_vcpu_gpa_to_page(vcpu, vmptr); + if (is_error_page(page)) { nested_vmx_failInvalid(vcpu); return kvm_skip_emulated_instruction(vcpu); } new_vmcs12 = kmap(page); if (new_vmcs12->revision_id != VMCS12_REVISION) { kunmap(page); - nested_release_page_clean(page); + kvm_release_page_clean(page); nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); return kvm_skip_emulated_instruction(vcpu); } nested_release_vmcs12(vmx); - vmx->nested.current_vmcs12 = new_vmcs12; - vmx->nested.current_vmcs12_page = page; /* * Load VMCS12 from guest memory since it is not already * cached. 
*/ - memcpy(vmx->nested.cached_vmcs12, - vmx->nested.current_vmcs12, VMCS12_SIZE); + memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE); + kunmap(page); + kvm_release_page_clean(page); + set_current_vmptr(vmx, vmptr); } @@ -7651,7 +7892,10 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) unsigned long type, types; gva_t gva; struct x86_exception e; - int vpid; + struct { + u64 vpid; + u64 gla; + } operand; if (!(vmx->nested.nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_VPID) || @@ -7681,17 +7925,28 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), vmx_instruction_info, false, &gva)) return 1; - if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vpid, - sizeof(u32), &e)) { + if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand, + sizeof(operand), &e)) { kvm_inject_page_fault(vcpu, &e); return 1; } + if (operand.vpid >> 16) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return kvm_skip_emulated_instruction(vcpu); + } switch (type) { case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: + if (is_noncanonical_address(operand.gla, vcpu)) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return kvm_skip_emulated_instruction(vcpu); + } + /* fall through */ case VMX_VPID_EXTENT_SINGLE_CONTEXT: case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: - if (!vpid) { + if (!operand.vpid) { nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); return kvm_skip_emulated_instruction(vcpu); @@ -7740,6 +7995,124 @@ static int handle_preemption_timer(struct kvm_vcpu *vcpu) return 1; } +static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int maxphyaddr = cpuid_maxphyaddr(vcpu); + + /* Check for memory type validity */ + switch (address & VMX_EPTP_MT_MASK) { + case VMX_EPTP_MT_UC: + if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPTP_UC_BIT)) + return false; + break; + case VMX_EPTP_MT_WB: + if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPTP_WB_BIT)) + return false; + break; + default: + return false; + } + + /* only 4 levels page-walk length are valid */ + if ((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4) + return false; + + /* Reserved bits should not be set */ + if (address >> maxphyaddr || ((address >> 7) & 0x1f)) + return false; + + /* AD, if set, should be supported */ + if (address & VMX_EPTP_AD_ENABLE_BIT) { + if (!(vmx->nested.nested_vmx_ept_caps & VMX_EPT_AD_BIT)) + return false; + } + + return true; +} + +static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + u32 index = vcpu->arch.regs[VCPU_REGS_RCX]; + u64 address; + bool accessed_dirty; + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + + if (!nested_cpu_has_eptp_switching(vmcs12) || + !nested_cpu_has_ept(vmcs12)) + return 1; + + if (index >= VMFUNC_EPTP_ENTRIES) + return 1; + + + if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, + &address, index * 8, 8)) + return 1; + + accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT); + + /* + * If the (L2) guest does a vmfunc to the currently + * active ept pointer, we don't have to do anything else + */ + if (vmcs12->ept_pointer != address) { + if (!valid_ept_address(vcpu, address)) + return 1; + + kvm_mmu_unload(vcpu); + mmu->ept_ad = accessed_dirty; + mmu->base_role.ad_disabled = !accessed_dirty; + vmcs12->ept_pointer = address; + /* + * TODO: Check what's the correct approach in case + * mmu reload fails. 
Currently, we just let the next + * reload potentially fail + */ + kvm_mmu_reload(vcpu); + } + + return 0; +} + +static int handle_vmfunc(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs12 *vmcs12; + u32 function = vcpu->arch.regs[VCPU_REGS_RAX]; + + /* + * VMFUNC is only supported for nested guests, but we always enable the + * secondary control for simplicity; for non-nested mode, fake that we + * didn't by injecting #UD. + */ + if (!is_guest_mode(vcpu)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + vmcs12 = get_vmcs12(vcpu); + if ((vmcs12->vm_function_control & (1 << function)) == 0) + goto fail; + + switch (function) { + case 0: + if (nested_vmx_eptp_switching(vcpu, vmcs12)) + goto fail; + break; + default: + goto fail; + } + return kvm_skip_emulated_instruction(vcpu); + +fail: + nested_vmx_vmexit(vcpu, vmx->exit_reason, + vmcs_read32(VM_EXIT_INTR_INFO), + vmcs_readl(EXIT_QUALIFICATION)); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -7787,9 +8160,12 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, [EXIT_REASON_INVEPT] = handle_invept, [EXIT_REASON_INVVPID] = handle_invvpid, + [EXIT_REASON_RDRAND] = handle_invalid_op, + [EXIT_REASON_RDSEED] = handle_invalid_op, [EXIT_REASON_XSAVES] = handle_xsaves, [EXIT_REASON_XRSTORS] = handle_xrstors, [EXIT_REASON_PML_FULL] = handle_pml_full, + [EXIT_REASON_VMFUNC] = handle_vmfunc, [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, }; @@ -7966,19 +8342,11 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, * should handle it ourselves in L0 (and then continue L2). Only call this * when in is_guest_mode (L2). */ -static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) +static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) { u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - u32 exit_reason = vmx->exit_reason; - - trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, - vmcs_readl(EXIT_QUALIFICATION), - vmx->idt_vectoring_info, - intr_info, - vmcs_read32(VM_EXIT_INTR_ERROR_CODE), - KVM_ISA_VMX); if (vmx->nested.nested_run_pending) return false; @@ -7989,12 +8357,31 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return true; } + /* + * The host physical addresses of some pages of guest memory + * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU + * may write to these pages via their host physical address while + * L2 is running, bypassing any address-translation-based dirty + * tracking (e.g. EPT write protection). + * + * Mark them dirty on every exit from L2 to prevent them from + * getting out of sync with dirty tracking. + */ + nested_mark_vmcs12_pages_dirty(vcpu); + + trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, + vmcs_readl(EXIT_QUALIFICATION), + vmx->idt_vectoring_info, + intr_info, + vmcs_read32(VM_EXIT_INTR_ERROR_CODE), + KVM_ISA_VMX); + switch (exit_reason) { case EXIT_REASON_EXCEPTION_NMI: if (is_nmi(intr_info)) return false; else if (is_page_fault(intr_info)) - return enable_ept; + return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept; else if (is_no_device(intr_info) && !(vmcs12->guest_cr0 & X86_CR0_TS)) return false; @@ -8094,6 +8481,10 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) * table is L0's fault. 
*/ return false; + case EXIT_REASON_INVPCID: + return + nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && + nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); case EXIT_REASON_WBINVD: return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); case EXIT_REASON_XSETBV: @@ -8111,11 +8502,37 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_PML_FULL: /* We emulate PML support to L1. */ return false; + case EXIT_REASON_VMFUNC: + /* VM functions are emulated through L2->L0 vmexits. */ + return false; default: return true; } } +static int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason) +{ + u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + /* + * At this point, the exit interruption info in exit_intr_info + * is only valid for EXCEPTION_NMI exits. For EXTERNAL_INTERRUPT + * we need to query the in-kernel LAPIC. + */ + WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT); + if ((exit_intr_info & + (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == + (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + vmcs12->vm_exit_intr_error_code = + vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + } + + nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, + vmcs_readl(EXIT_QUALIFICATION)); + return 1; +} + static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) { *info1 = vmcs_readl(EXIT_QUALIFICATION); @@ -8346,7 +8763,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) u32 vectoring_info = vmx->idt_vectoring_info; trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX); - vcpu->arch.gpa_available = false; /* * Flush logged GPAs PML buffer, this will make dirty_bitmap more @@ -8362,12 +8778,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) if (vmx->emulation_required) return handle_invalid_guest_state(vcpu); - if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) { - nested_vmx_vmexit(vcpu, exit_reason, - vmcs_read32(VM_EXIT_INTR_INFO), - vmcs_readl(EXIT_QUALIFICATION)); - return 1; - } + if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason)) + return nested_vmx_reflect_vmexit(vcpu, exit_reason); if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { dump_vmcs(); @@ -8398,9 +8810,15 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) exit_reason != EXIT_REASON_TASK_SWITCH)) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; - vcpu->run->internal.ndata = 2; + vcpu->run->internal.ndata = 3; vcpu->run->internal.data[0] = vectoring_info; vcpu->run->internal.data[1] = exit_reason; + vcpu->run->internal.data[2] = vcpu->arch.exit_qualification; + if (exit_reason == EXIT_REASON_EPT_MISCONFIG) { + vcpu->run->internal.ndata++; + vcpu->run->internal.data[3] = + vmcs_read64(GUEST_PHYSICAL_ADDRESS); + } return 0; } @@ -8591,17 +9009,24 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) { - u32 exit_intr_info; + u32 exit_intr_info = 0; + u16 basic_exit_reason = (u16)vmx->exit_reason; - if (!(vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY - || vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)) + if (!(basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY + || basic_exit_reason == EXIT_REASON_EXCEPTION_NMI)) return; - vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - exit_intr_info = vmx->exit_intr_info; + if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) + exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + 
vmx->exit_intr_info = exit_intr_info; + + /* if exit due to PF check for async PF */ + if (is_page_fault(exit_intr_info)) + vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); /* Handle machine checks before interrupts are enabled */ - if (is_machine_check(exit_intr_info)) + if (basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY || + is_machine_check(exit_intr_info)) kvm_machine_check(); /* We need to handle NMIs before interrupts are enabled */ @@ -8615,7 +9040,6 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) { u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - register void *__sp asm(_ASM_SP); if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK)) == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) { @@ -8629,7 +9053,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) vector = exit_intr_info & INTR_INFO_VECTOR_MASK; desc = (gate_desc *)vmx->host_idt_base + vector; - entry = gate_offset(*desc); + entry = gate_offset(desc); asm volatile( #ifdef CONFIG_X86_64 "mov %%" _ASM_SP ", %[sp]\n\t" @@ -8644,7 +9068,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) #ifdef CONFIG_X86_64 [sp]"=&r"(tmp), #endif - "+r"(__sp) + ASM_CALL_CONSTRAINT : [entry]"r"(entry), [ss]"i"(__KERNEL_DS), @@ -8652,6 +9076,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) ); } } +STACK_FRAME_NON_STANDARD(vmx_handle_external_intr); static bool vmx_has_high_real_mode_segbase(void) { @@ -8679,7 +9104,7 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; - if (vmx->nmi_known_unmasked) + if (vmx->loaded_vmcs->nmi_known_unmasked) return; /* * Can't use vmx->exit_intr_info since we're not sure what @@ -8703,7 +9128,7 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); else - vmx->nmi_known_unmasked = + vmx->loaded_vmcs->nmi_known_unmasked = !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI); } @@ -8820,7 +9245,7 @@ static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long debugctlmsr, cr4; + unsigned long debugctlmsr, cr3, cr4; /* Don't enter VMX if guest state is invalid, let the exit handler start emulation until we arrive back to a valid state */ @@ -8842,10 +9267,16 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); + cr3 = __get_current_cr3_fast(); + if (unlikely(cr3 != vmx->loaded_vmcs->vmcs_host_cr3)) { + vmcs_writel(HOST_CR3, cr3); + vmx->loaded_vmcs->vmcs_host_cr3 = cr3; + } + cr4 = cr4_read_shadow(); - if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) { + if (unlikely(cr4 != vmx->loaded_vmcs->vmcs_host_cr4)) { vmcs_writel(HOST_CR4, cr4); - vmx->host_state.vmcs_host_cr4 = cr4; + vmx->loaded_vmcs->vmcs_host_cr4 = cr4; } /* When single-stepping over STI and MOV SS, we must clear the @@ -8856,8 +9287,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) vmx_set_interrupt_shadow(vcpu, 0); - if (vmx->guest_pkru_valid) - __write_pkru(vmx->guest_pkru); + if (static_cpu_has(X86_FEATURE_PKU) && + kvm_read_cr4_bits(vcpu, X86_CR4_PKE) && + vcpu->arch.pkru != vmx->host_pkru) + __write_pkru(vcpu->arch.pkru); 
atomic_switch_perf_msrs(vmx); debugctlmsr = get_debugctlmsr(); @@ -8994,24 +9427,16 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | (1 << VCPU_EXREG_CR3)); vcpu->arch.regs_dirty = 0; - vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - - vmx->loaded_vmcs->launched = 1; - - vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); - /* * eager fpu is enabled if PKEY is supported and CR4 is switched * back on host, so it is safe to read guest PKRU from current * XSAVE. */ - if (boot_cpu_has(X86_FEATURE_OSPKE)) { - vmx->guest_pkru = __read_pkru(); - if (vmx->guest_pkru != vmx->host_pkru) { - vmx->guest_pkru_valid = true; + if (static_cpu_has(X86_FEATURE_PKU) && + kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) { + vcpu->arch.pkru = __read_pkru(); + if (vcpu->arch.pkru != vmx->host_pkru) __write_pkru(vmx->host_pkru); - } else - vmx->guest_pkru_valid = false; } /* @@ -9023,11 +9448,20 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); vmx->nested.nested_run_pending = 0; + vmx->idt_vectoring_info = 0; + + vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON); + if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) + return; + + vmx->loaded_vmcs->launched = 1; + vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); vmx_complete_atomic_exit(vmx); vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); } +STACK_FRAME_NON_STANDARD(vmx_vcpu_run); static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) { @@ -9149,10 +9583,16 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->nested.posted_intr_nv = -1; vmx->nested.current_vmptr = -1ull; - vmx->nested.current_vmcs12 = NULL; vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED; + /* + * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR + * or POSTED_INTR_WAKEUP_VECTOR. 
+ */ + vmx->pi_desc.nv = POSTED_INTR_VECTOR; + vmx->pi_desc.sn = 1; + return &vmx->vcpu; free_vmcs: @@ -9184,11 +9624,6 @@ static void __init vmx_check_processor_compat(void *rtn) } } -static int get_ept_level(void) -{ - return VMX_EPT_DEFAULT_GAW + 1; -} - static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { u8 cache; @@ -9305,39 +9740,13 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *best; struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 secondary_exec_ctl = vmx_secondary_exec_control(vmx); - - if (vmx_rdtscp_supported()) { - bool rdtscp_enabled = guest_cpuid_has_rdtscp(vcpu); - if (!rdtscp_enabled) - secondary_exec_ctl &= ~SECONDARY_EXEC_RDTSCP; - - if (nested) { - if (rdtscp_enabled) - vmx->nested.nested_vmx_secondary_ctls_high |= - SECONDARY_EXEC_RDTSCP; - else - vmx->nested.nested_vmx_secondary_ctls_high &= - ~SECONDARY_EXEC_RDTSCP; - } - } - - /* Exposing INVPCID only when PCID is exposed */ - best = kvm_find_cpuid_entry(vcpu, 0x7, 0); - if (vmx_invpcid_supported() && - (!best || !(best->ebx & bit(X86_FEATURE_INVPCID)) || - !guest_cpuid_has_pcid(vcpu))) { - secondary_exec_ctl &= ~SECONDARY_EXEC_ENABLE_INVPCID; - if (best) - best->ebx &= ~bit(X86_FEATURE_INVPCID); + if (cpu_has_secondary_exec_ctrls()) { + vmx_compute_secondary_exec_control(vmx); + vmcs_set_secondary_exec_control(vmx->secondary_exec_control); } - if (cpu_has_secondary_exec_ctrls()) - vmcs_set_secondary_exec_control(secondary_exec_ctl); - if (nested_vmx_allowed(vcpu)) to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; @@ -9376,6 +9785,11 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, vmcs12->guest_physical_address = fault->address; } +static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu) +{ + return nested_ept_get_cr3(vcpu) & VMX_EPTP_AD_ENABLE_BIT; +} + /* Callbacks for nested_ept_init_mmu_context: */ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) @@ -9386,18 +9800,15 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) { - u64 eptp; - WARN_ON(mmu_is_nested(vcpu)); - eptp = nested_ept_get_cr3(vcpu); - if ((eptp & VMX_EPT_AD_ENABLE_BIT) && !enable_ept_ad_bits) + if (!valid_ept_address(vcpu, nested_ept_get_cr3(vcpu))) return 1; kvm_mmu_unload(vcpu); kvm_init_shadow_ept_mmu(vcpu, to_vmx(vcpu)->nested.nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT, - eptp & VMX_EPT_AD_ENABLE_BIT); + nested_ept_ad_enabled(vcpu)); vcpu->arch.mmu.set_cr3 = vmx_set_cr3; vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; @@ -9430,12 +9841,16 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, WARN_ON(!is_guest_mode(vcpu)); - if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code)) - nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, - vmcs_read32(VM_EXIT_INTR_INFO), - vmcs_readl(EXIT_QUALIFICATION)); - else + if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) && + !to_vmx(vcpu)->nested.nested_run_pending) { + vmcs12->vm_exit_intr_error_code = fault->error_code; + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, + PF_VECTOR | INTR_TYPE_HARD_EXCEPTION | + INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK, + fault->address); + } else { kvm_inject_page_fault(vcpu, fault); + } } static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, @@ -9445,6 +9860,7 @@ static 
void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); + struct page *page; u64 hpa; if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { @@ -9454,17 +9870,19 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, * physical address remains valid. We keep a reference * to it so we can release it later. */ - if (vmx->nested.apic_access_page) /* shouldn't happen */ - nested_release_page(vmx->nested.apic_access_page); - vmx->nested.apic_access_page = - nested_get_page(vcpu, vmcs12->apic_access_addr); + if (vmx->nested.apic_access_page) { /* shouldn't happen */ + kvm_release_page_dirty(vmx->nested.apic_access_page); + vmx->nested.apic_access_page = NULL; + } + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr); /* * If translation failed, no matter: This feature asks * to exit when accessing the given address, and if it * can never be accessed, this feature won't do * anything anyway. */ - if (vmx->nested.apic_access_page) { + if (!is_error_page(page)) { + vmx->nested.apic_access_page = page; hpa = page_to_phys(vmx->nested.apic_access_page); vmcs_write64(APIC_ACCESS_ADDR, hpa); } else { @@ -9479,10 +9897,11 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, } if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { - if (vmx->nested.virtual_apic_page) /* shouldn't happen */ - nested_release_page(vmx->nested.virtual_apic_page); - vmx->nested.virtual_apic_page = - nested_get_page(vcpu, vmcs12->virtual_apic_page_addr); + if (vmx->nested.virtual_apic_page) { /* shouldn't happen */ + kvm_release_page_dirty(vmx->nested.virtual_apic_page); + vmx->nested.virtual_apic_page = NULL; + } + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->virtual_apic_page_addr); /* * If translation failed, VM entry will fail because @@ -9497,7 +9916,8 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, * control. But such a configuration is useless, so * let's keep the code simple. 
*/ - if (vmx->nested.virtual_apic_page) { + if (!is_error_page(page)) { + vmx->nested.virtual_apic_page = page; hpa = page_to_phys(vmx->nested.virtual_apic_page); vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa); } @@ -9506,16 +9926,14 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, if (nested_cpu_has_posted_intr(vmcs12)) { if (vmx->nested.pi_desc_page) { /* shouldn't happen */ kunmap(vmx->nested.pi_desc_page); - nested_release_page(vmx->nested.pi_desc_page); + kvm_release_page_dirty(vmx->nested.pi_desc_page); + vmx->nested.pi_desc_page = NULL; } - vmx->nested.pi_desc_page = - nested_get_page(vcpu, vmcs12->posted_intr_desc_addr); - vmx->nested.pi_desc = - (struct pi_desc *)kmap(vmx->nested.pi_desc_page); - if (!vmx->nested.pi_desc) { - nested_release_page_clean(vmx->nested.pi_desc_page); + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->posted_intr_desc_addr); + if (is_error_page(page)) return; - } + vmx->nested.pi_desc_page = page; + vmx->nested.pi_desc = kmap(vmx->nested.pi_desc_page); vmx->nested.pi_desc = (struct pi_desc *)((void *)vmx->nested.pi_desc + (unsigned long)(vmcs12->posted_intr_desc_addr & @@ -9556,23 +9974,38 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); } +static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) + return 0; + + if (!page_address_valid(vcpu, vmcs12->io_bitmap_a) || + !page_address_valid(vcpu, vmcs12->io_bitmap_b)) + return -EINVAL; + + return 0; +} + static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - int maxphyaddr; - u64 addr; - if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) return 0; - if (vmcs12_read_any(vcpu, MSR_BITMAP, &addr)) { - WARN_ON(1); + if (!page_address_valid(vcpu, vmcs12->msr_bitmap)) return -EINVAL; - } - maxphyaddr = cpuid_maxphyaddr(vcpu); - if (!PAGE_ALIGNED(vmcs12->msr_bitmap) || - ((addr + PAGE_SIZE) >> maxphyaddr)) + return 0; +} + +static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) + return 0; + + if (!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)) return -EINVAL; return 0; @@ -9594,8 +10027,8 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) return false; - page = nested_get_page(vcpu, vmcs12->msr_bitmap); - if (!page) + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap); + if (is_error_page(page)) return false; msr_bitmap_l1 = (unsigned long *)kmap(page); @@ -9625,7 +10058,7 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, } } kunmap(page); - nested_release_page_clean(page); + kvm_release_page_clean(page); return true; } @@ -9960,6 +10393,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->vm_entry_instruction_len); vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, vmcs12->guest_interruptibility_info); + vmx->loaded_vmcs->nmi_known_unmasked = + !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); } else { vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); } @@ -9984,13 +10419,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, /* Posted interrupts setting is only taken from vmcs12. */ if (nested_cpu_has_posted_intr(vmcs12)) { - /* - * Note that we use L0's vector here and in - * vmx_deliver_nested_posted_interrupt. 
- */ vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; vmx->nested.pi_pending = false; - vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); + vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); } else { exec_control &= ~PIN_BASED_POSTED_INTR; } @@ -10014,12 +10445,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when * !enable_ept, EB.PF is 1, so the "or" will always be 1. - * - * A problem with this approach (when !enable_ept) is that L1 may be - * injected with more page faults than it asked for. This could have - * caused problems, but in practice existing hypervisors don't care. - * To fix this, we will need to emulate the PFEC checking (on the L1 - * page tables), using walk_addr(), when injecting PFs to L1. */ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, enable_ept ? vmcs12->page_fault_error_code_mask : 0); @@ -10027,13 +10452,16 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, enable_ept ? vmcs12->page_fault_error_code_match : 0); if (cpu_has_secondary_exec_ctrls()) { - exec_control = vmx_secondary_exec_control(vmx); + exec_control = vmx->secondary_exec_control; /* Take the following fields only from vmcs12 */ exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_RDTSCP | + SECONDARY_EXEC_XSAVES | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | - SECONDARY_EXEC_APIC_REGISTER_VIRT); + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_ENABLE_VMFUNC); if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) { vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control & @@ -10041,6 +10469,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, exec_control |= vmcs12_exec_ctrl; } + /* All VMFUNCs are currently emulated through L0 vmexits. 
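The prepare_vmcs02() hunk above now starts from the precomputed vmx->secondary_exec_control, clears the bits that must be taken only from vmcs12, and then folds in whatever L1 enabled. A stand-alone sketch of that merge follows; the control-bit values are illustrative and the extra masking of the vmcs12 value done by the real code is omitted.

/* Stand-alone sketch of the secondary-control merging in prepare_vmcs02():
 * begin with L0's controls, clear the "vmcs12 only" bits, then add what L1
 * requested.  Bit values below are for the example only. */
#include <stdint.h>
#include <stdio.h>

#define CTL_VIRT_APIC_ACCESSES  (1u << 0)   /* taken only from vmcs12 */
#define CTL_ENABLE_EPT          (1u << 1)   /* inherited from L0 */
#define CTL_RDTSCP              (1u << 3)   /* taken only from vmcs12 */
#define CTL_ENABLE_VMFUNC       (1u << 13)  /* taken only from vmcs12 */

#define VMCS12_ONLY_MASK (CTL_VIRT_APIC_ACCESSES | CTL_RDTSCP | CTL_ENABLE_VMFUNC)

static uint32_t merge_secondary_controls(uint32_t l0_ctls, uint32_t vmcs12_ctls,
					 int l1_activates_secondary)
{
	uint32_t exec_control = l0_ctls;

	exec_control &= ~VMCS12_ONLY_MASK;     /* these come from L1 or not at all */
	if (l1_activates_secondary)
		exec_control |= vmcs12_ctls;   /* add what L1 asked for */

	return exec_control;
}

int main(void)
{
	uint32_t l0 = CTL_ENABLE_EPT | CTL_RDTSCP;
	uint32_t l1 = CTL_VIRT_APIC_ACCESSES;

	/* RDTSCP is dropped unless L1 requests it; EPT is kept from L0. */
	printf("merged = %#x\n", merge_secondary_controls(l0, l1, 1));
	return 0;
}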
*/ + if (exec_control & SECONDARY_EXEC_ENABLE_VMFUNC) + vmcs_write64(VM_FUNCTION_CONTROL, 0); + if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); @@ -10106,6 +10538,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if (exec_control & CPU_BASED_TPR_SHADOW) { vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull); vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); + } else { +#ifdef CONFIG_X86_64 + exec_control |= CPU_BASED_CR8_LOAD_EXITING | + CPU_BASED_CR8_STORE_EXITING; +#endif } /* @@ -10260,9 +10697,15 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_io_bitmap_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; @@ -10290,6 +10733,18 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmx->nested.nested_vmx_entry_ctls_high)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_cpu_has_vmfunc(vmcs12)) { + if (vmcs12->vm_function_control & + ~vmx->nested.nested_vmx_vmfunc_controls) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + if (nested_cpu_has_eptp_switching(vmcs12)) { + if (!nested_cpu_has_ept(vmcs12) || + !page_address_valid(vcpu, vmcs12->eptp_list_address)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + } + } + if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; @@ -10396,8 +10851,6 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) return 1; } - vmcs12->launch_state = 1; - /* * Note no nested_vmx_succeed or nested_vmx_fail here. At this point * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet @@ -10415,6 +10868,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) { struct vmcs12 *vmcs12; struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); u32 exit_qual; int ret; @@ -10439,6 +10893,12 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * for misconfigurations which will anyway be caught by the processor * when using the merged vmcs02. */ + if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) { + nested_vmx_failValid(vcpu, + VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); + goto out; + } + if (vmcs12->launch_state == launch) { nested_vmx_failValid(vcpu, launch ? 
VMXERR_VMLAUNCH_NONCLEAR_VMCS @@ -10531,7 +10991,7 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, u32 idt_vectoring; unsigned int nr; - if (vcpu->arch.exception.pending && vcpu->arch.exception.reinject) { + if (vcpu->arch.exception.injected) { nr = vcpu->arch.exception.nr; idt_vectoring = nr | VECTORING_INFO_VALID_MASK; @@ -10570,12 +11030,20 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) { struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long exit_qual; - if (vcpu->arch.exception.pending || - vcpu->arch.nmi_injected || - vcpu->arch.interrupt.pending) + if (kvm_event_needs_reinjection(vcpu)) return -EBUSY; + if (vcpu->arch.exception.pending && + nested_vmx_check_exception(vcpu, &exit_qual)) { + if (vmx->nested.nested_run_pending) + return -EBUSY; + nested_vmx_inject_exception_vmexit(vcpu, exit_qual); + vcpu->arch.exception.pending = false; + return 0; + } + if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && vmx->nested.preemption_timer_expired) { if (vmx->nested.nested_run_pending) @@ -10710,8 +11178,7 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); } - if (nested_cpu_has_ept(vmcs12)) - vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); + vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); if (nested_cpu_has_vid(vmcs12)) vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); @@ -10736,8 +11203,6 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); if (kvm_mpx_supported()) vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); - if (nested_cpu_has_xsaves(vmcs12)) - vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP); } /* @@ -10762,18 +11227,15 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->vm_exit_reason = exit_reason; vmcs12->exit_qualification = exit_qualification; - vmcs12->vm_exit_intr_info = exit_intr_info; - if ((vmcs12->vm_exit_intr_info & - (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == - (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) - vmcs12->vm_exit_intr_error_code = - vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + vmcs12->idt_vectoring_info_field = 0; vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { + vmcs12->launch_state = 1; + /* vm_entry_intr_info_field is cleared on exit. Emulate this * instead of reading the real value. */ vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; @@ -10832,7 +11294,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, /* Same as above - no reason to call set_cr4_guest_host_mask(). */ vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); - kvm_set_cr4(vcpu, vmcs12->host_cr4); + vmx_set_cr4(vcpu, vmcs12->host_cr4); nested_ept_uninit_mmu_context(vcpu); @@ -10854,7 +11316,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, */ vmx_flush_tlb(vcpu); } - + /* Restore posted intr vector. 
*/ + if (nested_cpu_has_posted_intr(vmcs12)) + vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); @@ -10942,39 +11406,30 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - u32 vm_inst_error = 0; /* trying to cancel vmlaunch/vmresume is a bug */ WARN_ON_ONCE(vmx->nested.nested_run_pending); - leave_guest_mode(vcpu); - prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, - exit_qualification); - - if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, - vmcs12->vm_exit_msr_store_count)) - nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); + /* + * The only expected VM-instruction error is "VM entry with + * invalid control field(s)." Anything else indicates a + * problem with L0. + */ + WARN_ON_ONCE(vmx->fail && (vmcs_read32(VM_INSTRUCTION_ERROR) != + VMXERR_ENTRY_INVALID_CONTROL_FIELD)); - if (unlikely(vmx->fail)) - vm_inst_error = vmcs_read32(VM_INSTRUCTION_ERROR); + leave_guest_mode(vcpu); - vmx_switch_vmcs(vcpu, &vmx->vmcs01); + if (likely(!vmx->fail)) { + prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, + exit_qualification); - if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) - && nested_exit_intr_ack_set(vcpu)) { - int irq = kvm_cpu_get_interrupt(vcpu); - WARN_ON(irq < 0); - vmcs12->vm_exit_intr_info = irq | - INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; + if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, + vmcs12->vm_exit_msr_store_count)) + nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); } - trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, - vmcs12->exit_qualification, - vmcs12->idt_vectoring_info_field, - vmcs12->vm_exit_intr_info, - vmcs12->vm_exit_intr_error_code, - KVM_ISA_VMX); - + vmx_switch_vmcs(vcpu, &vmx->vmcs01); vm_entry_controls_reset_shadow(vmx); vm_exit_controls_reset_shadow(vmx); vmx_segment_cache_clear(vmx); @@ -10983,8 +11438,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, if (VMCS02_POOL_SIZE == 0) nested_free_vmcs02(vmx, vmx->nested.current_vmptr); - load_vmcs12_host_state(vcpu, vmcs12); - /* Update any VMCS fields that might have changed while L2 ran */ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr); @@ -11013,16 +11466,16 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, /* Unpin physical memory we referred to in vmcs02 */ if (vmx->nested.apic_access_page) { - nested_release_page(vmx->nested.apic_access_page); + kvm_release_page_dirty(vmx->nested.apic_access_page); vmx->nested.apic_access_page = NULL; } if (vmx->nested.virtual_apic_page) { - nested_release_page(vmx->nested.virtual_apic_page); + kvm_release_page_dirty(vmx->nested.virtual_apic_page); vmx->nested.virtual_apic_page = NULL; } if (vmx->nested.pi_desc_page) { kunmap(vmx->nested.pi_desc_page); - nested_release_page(vmx->nested.pi_desc_page); + kvm_release_page_dirty(vmx->nested.pi_desc_page); vmx->nested.pi_desc_page = NULL; vmx->nested.pi_desc = NULL; } @@ -11033,21 +11486,57 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, */ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - /* - * Exiting from L2 to L1, we're now back to L1 which thinks it just - * finished a VMLAUNCH or VMRESUME instruction, so we need to set the - * success or failure flag accordingly. 
- */ - if (unlikely(vmx->fail)) { - vmx->fail = 0; - nested_vmx_failValid(vcpu, vm_inst_error); - } else - nested_vmx_succeed(vcpu); if (enable_shadow_vmcs) vmx->nested.sync_shadow_vmcs = true; /* in case we halted in L2 */ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + + if (likely(!vmx->fail)) { + /* + * TODO: SDM says that with acknowledge interrupt on + * exit, bit 31 of the VM-exit interrupt information + * (valid interrupt) is always set to 1 on + * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't + * need kvm_cpu_has_interrupt(). See the commit + * message for details. + */ + if (nested_exit_intr_ack_set(vcpu) && + exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && + kvm_cpu_has_interrupt(vcpu)) { + int irq = kvm_cpu_get_interrupt(vcpu); + WARN_ON(irq < 0); + vmcs12->vm_exit_intr_info = irq | + INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; + } + + trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, + vmcs12->exit_qualification, + vmcs12->idt_vectoring_info_field, + vmcs12->vm_exit_intr_info, + vmcs12->vm_exit_intr_error_code, + KVM_ISA_VMX); + + load_vmcs12_host_state(vcpu, vmcs12); + + return; + } + + /* + * After an early L2 VM-entry failure, we're now back + * in L1 which thinks it just finished a VMLAUNCH or + * VMRESUME instruction, so we need to set the failure + * flag and the VM-instruction error field of the VMCS + * accordingly. + */ + nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + /* + * The emulated instruction was already skipped in + * nested_vmx_run, but the updated RIP was never + * written back to the vmcs01. + */ + skip_emulated_instruction(vcpu); + vmx->fail = 0; } /* @@ -11134,7 +11623,8 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) vmx->hv_deadline_tsc = tscl + delta_tsc; vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_VMX_PREEMPTION_TIMER); - return 0; + + return delta_tsc == 0; } static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) @@ -11197,14 +11687,14 @@ static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull; - page = nested_get_page(vcpu, vmcs12->pml_address); - if (!page) + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->pml_address); + if (is_error_page(page)) return 0; pml_address = kmap(page); pml_address[vmcs12->guest_pml_index--] = gpa; kunmap(page); - nested_release_page_clean(page); + kvm_release_page_clean(page); } return 0; @@ -11217,6 +11707,37 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); } +static void __pi_post_block(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; + unsigned int dest; + + do { + old.control = new.control = pi_desc->control; + WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR, + "Wakeup handler not enabled while the VCPU is blocked\n"); + + dest = cpu_physical_id(vcpu->cpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + /* set 'NV' to 'notification vector' */ + new.nv = POSTED_INTR_VECTOR; + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); + + if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) { + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + list_del(&vcpu->blocked_vcpu_list); + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + vcpu->pre_pcpu = -1; + } +} + /* * This routine does the following things for vCPU which is going * to be blocked if VT-d PI is enabled. 
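The new __pi_post_block() above rewrites the posted-interrupt descriptor's destination and notification vector inside a cmpxchg64() retry loop, so a device posting an interrupt concurrently never observes a half-updated control word. A stand-alone C11 model of that loop follows; the field packing and vector numbers are illustrative only, not the real VT-d descriptor layout.

/* Stand-alone model of the cmpxchg retry loop in __pi_post_block().
 * The bit positions below are made up for the example. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define NV_WAKEUP  0xf1   /* stand-in for POSTED_INTR_WAKEUP_VECTOR */
#define NV_NOTIFY  0xf2   /* stand-in for POSTED_INTR_VECTOR */

struct pi_control {
	_Atomic uint64_t control;   /* packs ndst and nv, as in the diff */
};

/* illustrative packing: nv in bits 0-7, ndst in bits 8-39 */
static uint64_t pack(uint32_t ndst, uint8_t nv)
{
	return ((uint64_t)ndst << 8) | nv;
}

static void post_block_update(struct pi_control *pi, uint32_t dest)
{
	uint64_t old, new;

	/* Retry until no interrupt poster raced with us, mirroring the
	 * do { ... } while (cmpxchg64(...) != old.control) loop above. */
	do {
		old = atomic_load(&pi->control);
		new = pack(dest, NV_NOTIFY);   /* point NV back at the notification vector */
	} while (!atomic_compare_exchange_weak(&pi->control, &old, new));
}

int main(void)
{
	struct pi_control pi = { .control = pack(0, NV_WAKEUP) };

	post_block_update(&pi, 3);
	printf("control = %#llx\n", (unsigned long long)atomic_load(&pi.control));
	return 0;
}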
@@ -11232,7 +11753,6 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, */ static int pi_pre_block(struct kvm_vcpu *vcpu) { - unsigned long flags; unsigned int dest; struct pi_desc old, new; struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); @@ -11242,34 +11762,20 @@ static int pi_pre_block(struct kvm_vcpu *vcpu) !kvm_vcpu_apicv_active(vcpu)) return 0; - vcpu->pre_pcpu = vcpu->cpu; - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - list_add_tail(&vcpu->blocked_vcpu_list, - &per_cpu(blocked_vcpu_on_cpu, - vcpu->pre_pcpu)); - spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); + WARN_ON(irqs_disabled()); + local_irq_disable(); + if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) { + vcpu->pre_pcpu = vcpu->cpu; + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + list_add_tail(&vcpu->blocked_vcpu_list, + &per_cpu(blocked_vcpu_on_cpu, + vcpu->pre_pcpu)); + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + } do { old.control = new.control = pi_desc->control; - /* - * We should not block the vCPU if - * an interrupt is posted for it. - */ - if (pi_test_on(pi_desc) == 1) { - spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - list_del(&vcpu->blocked_vcpu_list); - spin_unlock_irqrestore( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - vcpu->pre_pcpu = -1; - - return 1; - } - WARN((pi_desc->sn == 1), "Warning: SN field of posted-interrupts " "is set before blocking\n"); @@ -11291,10 +11797,15 @@ static int pi_pre_block(struct kvm_vcpu *vcpu) /* set 'NV' to 'wakeup vector' */ new.nv = POSTED_INTR_WAKEUP_VECTOR; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); - return 0; + /* We should not block the vCPU if an interrupt is posted for it. 
*/ + if (pi_test_on(pi_desc) == 1) + __pi_post_block(vcpu); + + local_irq_enable(); + return (vcpu->pre_pcpu == -1); } static int vmx_pre_block(struct kvm_vcpu *vcpu) @@ -11310,44 +11821,13 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu) static void pi_post_block(struct kvm_vcpu *vcpu) { - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct pi_desc old, new; - unsigned int dest; - unsigned long flags; - - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) + if (vcpu->pre_pcpu == -1) return; - do { - old.control = new.control = pi_desc->control; - - dest = cpu_physical_id(vcpu->cpu); - - if (x2apic_enabled()) - new.ndst = dest; - else - new.ndst = (dest << 8) & 0xFF00; - - /* Allow posting non-urgent interrupts */ - new.sn = 0; - - /* set 'NV' to 'notification vector' */ - new.nv = POSTED_INTR_VECTOR; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); - - if(vcpu->pre_pcpu != -1) { - spin_lock_irqsave( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - list_del(&vcpu->blocked_vcpu_list); - spin_unlock_irqrestore( - &per_cpu(blocked_vcpu_on_cpu_lock, - vcpu->pre_pcpu), flags); - vcpu->pre_pcpu = -1; - } + WARN_ON(irqs_disabled()); + local_irq_disable(); + __pi_post_block(vcpu); + local_irq_enable(); } static void vmx_post_block(struct kvm_vcpu *vcpu) @@ -11375,7 +11855,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, struct kvm_lapic_irq irq; struct kvm_vcpu *vcpu; struct vcpu_data vcpu_info; - int idx, ret = -EINVAL; + int idx, ret = 0; if (!kvm_arch_has_assigned_device(kvm) || !irq_remapping_cap(IRQ_POSTING_CAP) || @@ -11384,7 +11864,12 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, idx = srcu_read_lock(&kvm->irq_srcu); irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); - BUG_ON(guest_irq >= irq_rt->nr_rt_entries); + if (guest_irq >= irq_rt->nr_rt_entries || + hlist_empty(&irq_rt->map[guest_irq])) { + pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", + guest_irq, irq_rt->nr_rt_entries); + goto out; + } hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { if (e->type != KVM_IRQ_ROUTING_MSI) @@ -11427,12 +11912,8 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, if (set) ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); - else { - /* suppress notification event before unposting */ - pi_set_sn(vcpu_to_pi_desc(vcpu)); + else ret = irq_set_vcpu_affinity(host_irq, NULL); - pi_clear_sn(vcpu_to_pi_desc(vcpu)); - } if (ret < 0) { printk(KERN_INFO "%s: failed to update PI IRTE\n", @@ -11503,8 +11984,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, - .get_pkru = vmx_get_pkru, - .tlb_flush = vmx_flush_tlb, .run = vmx_vcpu_run, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0e846f0cb83b..03869eb7fcd6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -54,6 +54,7 @@ #include <linux/kvm_irqfd.h> #include <linux/irqbypass.h> #include <linux/sched/stat.h> +#include <linux/mem_encrypt.h> #include <trace/events/kvm.h> @@ -134,8 +135,6 @@ module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR); static bool __read_mostly vector_hashing = true; module_param(vector_hashing, bool, S_IRUGO); -static bool __read_mostly backwards_tsc_observed = false; - #define KVM_NR_SHARED_MSRS 16 struct kvm_shared_msrs_global { @@ -312,13 +311,13 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct 
msr_data *msr_info) (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE); u64 new_state = msr_info->data & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE); - u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | - 0x2ff | (guest_cpuid_has_x2apic(vcpu) ? 0 : X2APIC_ENABLE); + u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | 0x2ff | + (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE); + if ((msr_info->data & reserved_bits) || new_state == X2APIC_ENABLE) + return 1; if (!msr_info->host_initiated && - ((msr_info->data & reserved_bits) != 0 || - new_state == X2APIC_ENABLE || - (new_state == MSR_IA32_APICBASE_ENABLE && + ((new_state == MSR_IA32_APICBASE_ENABLE && old_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) || (new_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE) && old_state == 0))) @@ -391,15 +390,28 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, kvm_make_request(KVM_REQ_EVENT, vcpu); - if (!vcpu->arch.exception.pending) { + if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) { queue: if (has_error && !is_protmode(vcpu)) has_error = false; - vcpu->arch.exception.pending = true; + if (reinject) { + /* + * On vmentry, vcpu->arch.exception.pending is only + * true if an event injection was blocked by + * nested_run_pending. In that case, however, + * vcpu_enter_guest requests an immediate exit, + * and the guest shouldn't proceed far enough to + * need reinjection. + */ + WARN_ON_ONCE(vcpu->arch.exception.pending); + vcpu->arch.exception.injected = true; + } else { + vcpu->arch.exception.pending = true; + vcpu->arch.exception.injected = false; + } vcpu->arch.exception.has_error_code = has_error; vcpu->arch.exception.nr = nr; vcpu->arch.exception.error_code = error_code; - vcpu->arch.exception.reinject = reinject; return; } @@ -414,8 +426,13 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, class2 = exception_class(nr); if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) { - /* generate double fault per SDM Table 5-5 */ + /* + * Generate double fault per SDM Table 5-5. Set + * exception.pending = true so that the double fault + * can trigger a nested vmexit. 
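The kvm_multiple_exception() hunk above keeps the existing merging rules and only adjusts the pending/injected bookkeeping: two contributory exceptions, or a page fault followed by any non-benign exception, collapse into #DF. A stand-alone sketch of that decision, mirroring the EXCPT_* classification used in the hunk, follows.

/* Stand-alone model of the exception-merging decision in
 * kvm_multiple_exception(): when a second exception arrives while one is
 * already queued, some combinations collapse into a double fault (#DF),
 * per SDM Table 5-5.  Vector numbers follow the x86 architecture. */
#include <stdio.h>

enum { EXCPT_BENIGN, EXCPT_CONTRIBUTORY, EXCPT_PF };

#define DF_VECTOR 8
#define GP_VECTOR 13
#define PF_VECTOR 14

static int exception_class(int vector)
{
	switch (vector) {
	case PF_VECTOR:
		return EXCPT_PF;
	case 0:  /* #DE */
	case 10: /* #TS */
	case 11: /* #NP */
	case 12: /* #SS */
	case GP_VECTOR:
		return EXCPT_CONTRIBUTORY;
	default:
		return EXCPT_BENIGN;
	}
}

/* Returns the vector that ends up queued: either the new one or #DF. */
static int merge_exceptions(int first, int second)
{
	int class1 = exception_class(first);
	int class2 = exception_class(second);

	if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) ||
	    (class1 == EXCPT_PF && class2 != EXCPT_BENIGN))
		return DF_VECTOR;   /* promote to double fault */

	return second;              /* otherwise deliver the new exception */
}

int main(void)
{
	printf("#GP then #PF -> vector %d\n", merge_exceptions(GP_VECTOR, PF_VECTOR));
	printf("#PF then #GP -> vector %d\n", merge_exceptions(PF_VECTOR, GP_VECTOR));
	printf("#DB then #GP -> vector %d\n", merge_exceptions(1, GP_VECTOR));
	return 0;
}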
+ */ vcpu->arch.exception.pending = true; + vcpu->arch.exception.injected = false; vcpu->arch.exception.has_error_code = true; vcpu->arch.exception.nr = DF_VECTOR; vcpu->arch.exception.error_code = 0; @@ -452,7 +469,12 @@ EXPORT_SYMBOL_GPL(kvm_complete_insn_gp); void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { ++vcpu->stat.pf_guest; - vcpu->arch.cr2 = fault->address; + vcpu->arch.exception.nested_apf = + is_guest_mode(vcpu) && fault->async_page_fault; + if (vcpu->arch.exception.nested_apf) + vcpu->arch.apf.nested_apf_token = fault->address; + else + vcpu->arch.cr2 = fault->address; kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); } EXPORT_SYMBOL_GPL(kvm_inject_page_fault); @@ -594,8 +616,8 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu) (unsigned long *)&vcpu->arch.regs_avail)) return true; - gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT; - offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1); + gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT; + offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1); r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), PFERR_USER_MASK | PFERR_WRITE_MASK); if (r < 0) @@ -751,19 +773,22 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (cr4 & CR4_RESERVED_BITS) return 1; - if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE)) + if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE)) + return 1; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP)) return 1; - if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP)) + if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP)) return 1; - if (!guest_cpuid_has_smap(vcpu) && (cr4 & X86_CR4_SMAP)) + if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE)) return 1; - if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE)) + if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE)) return 1; - if (!guest_cpuid_has_pku(vcpu) && (cr4 & X86_CR4_PKE)) + if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57)) return 1; if (is_long_mode(vcpu)) { @@ -776,7 +801,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) { - if (!guest_cpuid_has_pcid(vcpu)) + if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID)) return 1; /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */ @@ -810,10 +835,10 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) return 0; } - if (is_long_mode(vcpu)) { - if (cr3 & CR3_L_MODE_RESERVED_BITS) - return 1; - } else if (is_pae(vcpu) && is_paging(vcpu) && + if (is_long_mode(vcpu) && + (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 62))) + return 1; + else if (is_pae(vcpu) && is_paging(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) return 1; @@ -880,7 +905,7 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) { u64 fixed = DR6_FIXED_1; - if (!guest_cpuid_has_rtm(vcpu)) + if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM)) fixed |= DR6_RTM; return fixed; } @@ -990,6 +1015,7 @@ static u32 emulated_msrs[] = { MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, + HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY, HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, HV_X64_MSR_RESET, @@ -1018,21 +1044,11 @@ bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) 
if (efer & efer_reserved_bits) return false; - if (efer & EFER_FFXSR) { - struct kvm_cpuid_entry2 *feat; - - feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); - if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) + if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT)) return false; - } - if (efer & EFER_SVME) { - struct kvm_cpuid_entry2 *feat; - - feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); - if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) + if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM)) return false; - } return true; } @@ -1080,7 +1096,7 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) case MSR_KERNEL_GS_BASE: case MSR_CSTAR: case MSR_LSTAR: - if (is_noncanonical_address(msr->data)) + if (is_noncanonical_address(msr->data, vcpu)) return 1; break; case MSR_IA32_SYSENTER_EIP: @@ -1097,7 +1113,7 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) * value, and that something deterministic happens if the guest * invokes 64-bit SYSENTER. */ - msr->data = get_canonical(msr->data); + msr->data = get_canonical(msr->data, vcpu_virt_addr_bits(vcpu)); } return kvm_x86_ops->set_msr(vcpu, msr); } @@ -1530,8 +1546,9 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; - if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated) + if (!msr->host_initiated && guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) update_ia32_tsc_adjust_msr(vcpu, offset); + kvm_vcpu_write_tsc_offset(vcpu, offset); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); @@ -1719,7 +1736,7 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) &ka->master_cycle_now); ka->use_master_clock = host_tsc_clocksource && vcpus_matched - && !backwards_tsc_observed + && !ka->backwards_tsc_observed && !ka->boot_vcpu_runs_old_kvmclock; if (ka->use_master_clock) @@ -2060,8 +2077,8 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) { gpa_t gpa = data & ~0x3f; - /* Bits 2:5 are reserved, Should be zero */ - if (data & 0x3c) + /* Bits 3:5 are reserved, Should be zero */ + if (data & 0x38) return 1; vcpu->arch.apf.msr_val = data; @@ -2077,6 +2094,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) return 1; vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); + vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; kvm_async_pf_wakeup_all(vcpu); return 0; } @@ -2180,7 +2198,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) kvm_set_lapic_tscdeadline_msr(vcpu, data); break; case MSR_IA32_TSC_ADJUST: - if (guest_cpuid_has_tsc_adjust(vcpu)) { + if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) { if (!msr_info->host_initiated) { s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; adjust_tsc_offset_guest(vcpu, adj); @@ -2302,12 +2320,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n", msr, data); break; case MSR_AMD64_OSVW_ID_LENGTH: - if (!guest_cpuid_has_osvw(vcpu)) + if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) return 1; vcpu->arch.osvw.length = data; break; case MSR_AMD64_OSVW_STATUS: - if (!guest_cpuid_has_osvw(vcpu)) + if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) return 1; vcpu->arch.osvw.status = data; break; @@ -2532,12 +2550,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 0xbe702111; break; case 
MSR_AMD64_OSVW_ID_LENGTH: - if (!guest_cpuid_has_osvw(vcpu)) + if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) return 1; msr_info->data = vcpu->arch.osvw.length; break; case MSR_AMD64_OSVW_STATUS: - if (!guest_cpuid_has_osvw(vcpu)) + if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) return 1; msr_info->data = vcpu->arch.osvw.status; break; @@ -2661,6 +2679,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_VAPIC: case KVM_CAP_HYPERV_SPIN: case KVM_CAP_HYPERV_SYNIC: + case KVM_CAP_HYPERV_SYNIC2: + case KVM_CAP_HYPERV_VP_INDEX: case KVM_CAP_PCI_SEGMENT: case KVM_CAP_DEBUGREGS: case KVM_CAP_X86_ROBUST_SINGLESTEP: @@ -2841,10 +2861,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_vcpu_write_tsc_offset(vcpu, offset); vcpu->arch.tsc_catchup = 1; } - if (kvm_lapic_hv_timer_in_use(vcpu) && - kvm_x86_ops->set_hv_timer(vcpu, - kvm_get_lapic_target_expiration_tsc(vcpu))) - kvm_lapic_switch_to_sw_timer(vcpu); + + if (kvm_lapic_hv_timer_in_use(vcpu)) + kvm_lapic_restart_hv_timer(vcpu); + /* * On a host with synchronized TSC, there is no need to update * kvmclock on vcpu->cpu migration @@ -2875,6 +2895,10 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { int idx; + + if (vcpu->preempted) + vcpu->arch.preempted_in_kernel = !kvm_x86_ops->get_cpl(vcpu); + /* * Disable page faults because we're in atomic context here. * kvm_write_guest_offset_cached() would call might_fault() @@ -3067,8 +3091,14 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { process_nmi(vcpu); + /* + * FIXME: pass injected and pending separately. This is only + * needed for nested virtualization, whose state cannot be + * migrated yet. For now we can combine them. 
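The fill_xsave()/load_xsave() hunks above special-case PKRU: every other XSAVE component is copied to or from the guest FPU buffer, but PKRU now lives in a separate per-vCPU field. A stand-alone model of the serialisation loop is sketched below; the component offsets and sizes are made up here, whereas the real code reads them from CPUID leaf 0xD.

/* Stand-alone model of the fill_xsave() change above.  The component table
 * is a stand-in for the CPUID.(EAX=0DH,ECX=n) enumeration. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define XFEATURE_MASK_YMM   (1ull << 2)
#define XFEATURE_MASK_PKRU  (1ull << 9)

struct xcomp { uint32_t offset, size; };

/* Hypothetical offsets/sizes for the example. */
static struct xcomp comp_table[64] = {
	[2] = { .offset = 576,  .size = 256 },  /* YMM state  */
	[9] = { .offset = 2688, .size = 8 },    /* PKRU state */
};

static void fill_xsave(uint8_t *dest, const uint8_t *fpu_state,
		       uint64_t valid, uint32_t pkru)
{
	while (valid) {
		uint64_t feature = valid & -valid;      /* lowest set bit */
		int index = __builtin_ctzll(feature);
		struct xcomp *c = &comp_table[index];

		if (feature == XFEATURE_MASK_PKRU)      /* PKRU kept separately */
			memcpy(dest + c->offset, &pkru, sizeof(pkru));
		else
			memcpy(dest + c->offset, fpu_state + c->offset, c->size);

		valid -= feature;                       /* next component */
	}
}

int main(void)
{
	static uint8_t dest[4096], fpu_state[4096];

	fill_xsave(dest, fpu_state, XFEATURE_MASK_YMM | XFEATURE_MASK_PKRU, 0x55555554);
	printf("PKRU in dest: %#x\n", *(uint32_t *)(dest + comp_table[9].offset));
	return 0;
}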
+ */ events->exception.injected = - vcpu->arch.exception.pending && + (vcpu->arch.exception.pending || + vcpu->arch.exception.injected) && !kvm_exception_is_soft(vcpu->arch.exception.nr); events->exception.nr = vcpu->arch.exception.nr; events->exception.has_error_code = vcpu->arch.exception.has_error_code; @@ -3123,6 +3153,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, return -EINVAL; process_nmi(vcpu); + vcpu->arch.exception.injected = false; vcpu->arch.exception.pending = events->exception.injected; vcpu->arch.exception.nr = events->exception.nr; vcpu->arch.exception.has_error_code = events->exception.has_error_code; @@ -3153,15 +3184,18 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, kvm_set_hflags(vcpu, hflags); vcpu->arch.smi_pending = events->smi.pending; - if (events->smi.smm_inside_nmi) - vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; - else - vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK; - if (lapic_in_kernel(vcpu)) { - if (events->smi.latched_init) - set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); + + if (events->smi.smm) { + if (events->smi.smm_inside_nmi) + vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; else - clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); + vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK; + if (lapic_in_kernel(vcpu)) { + if (events->smi.latched_init) + set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); + else + clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); + } } } @@ -3236,7 +3270,12 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) u32 size, offset, ecx, edx; cpuid_count(XSTATE_CPUID, index, &size, &offset, &ecx, &edx); - memcpy(dest + offset, src, size); + if (feature == XFEATURE_MASK_PKRU) + memcpy(dest + offset, &vcpu->arch.pkru, + sizeof(vcpu->arch.pkru)); + else + memcpy(dest + offset, src, size); + } valid -= feature; @@ -3274,7 +3313,11 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) u32 size, offset, ecx, edx; cpuid_count(XSTATE_CPUID, index, &size, &offset, &ecx, &edx); - memcpy(dest, src + offset, size); + if (feature == XFEATURE_MASK_PKRU) + memcpy(&vcpu->arch.pkru, src + offset, + sizeof(vcpu->arch.pkru)); + else + memcpy(dest, src + offset, size); } valid -= feature; @@ -3384,10 +3427,14 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return -EINVAL; switch (cap->cap) { + case KVM_CAP_HYPERV_SYNIC2: + if (cap->args[0]) + return -EINVAL; case KVM_CAP_HYPERV_SYNIC: if (!irqchip_in_kernel(vcpu->kvm)) return -EINVAL; - return kvm_hv_activate_synic(vcpu); + return kvm_hv_activate_synic(vcpu, cap->cap == + KVM_CAP_HYPERV_SYNIC2); default: return -EINVAL; } @@ -4188,9 +4235,15 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; r = 0; + /* + * TODO: userspace has to take care of races with VCPU_RUN, so + * kvm_gen_update_masterclock() can be cut down to locked + * pvclock_update_vm_gtod_copy(). 
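The KVM_SET_CLOCK hunk that this TODO comment introduces (its body continues just below) sets the guest clock by adding the difference between the requested time and the current kvmclock reading to kvmclock_offset. A stand-alone sketch of that arithmetic follows, using a hypothetical host_ns value in place of the host's monotonic clock.

/* Stand-alone model of the KVM_SET_CLOCK offset arithmetic: the guest clock
 * reads host_ns + kvmclock_offset, so setting it to user_ns means adding
 * (user_ns - current reading) to the offset. */
#include <stdint.h>
#include <stdio.h>

struct kvmclock {
	int64_t offset;                 /* models kvm->arch.kvmclock_offset */
};

static int64_t get_kvmclock_ns(const struct kvmclock *kc, int64_t host_ns)
{
	return host_ns + kc->offset;    /* what the guest currently reads */
}

static void set_clock(struct kvmclock *kc, int64_t host_ns, int64_t user_ns)
{
	int64_t now_ns = get_kvmclock_ns(kc, host_ns);

	kc->offset += user_ns - now_ns; /* same update as the hunk */
}

int main(void)
{
	struct kvmclock kc = { .offset = 0 };
	int64_t host_ns = 1000000000;   /* pretend the host clock reads 1 s */

	set_clock(&kc, host_ns, 5000000000LL);   /* userspace asks for 5 s */
	printf("guest clock now: %lld ns\n",
	       (long long)get_kvmclock_ns(&kc, host_ns));
	return 0;
}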
+ */ + kvm_gen_update_masterclock(kvm); now_ns = get_kvmclock_ns(kvm); kvm->arch.kvmclock_offset += user_ns.clock - now_ns; - kvm_gen_update_masterclock(kvm); + kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE); break; } case KVM_GET_CLOCK: { @@ -4642,25 +4695,18 @@ static int emulator_read_write_onepage(unsigned long addr, void *val, */ if (vcpu->arch.gpa_available && emulator_can_use_gpa(ctxt) && - vcpu_is_mmio_gpa(vcpu, addr, exception->address, write) && - (addr & ~PAGE_MASK) == (exception->address & ~PAGE_MASK)) { - gpa = exception->address; - goto mmio; + (addr & ~PAGE_MASK) == (vcpu->arch.gpa_val & ~PAGE_MASK)) { + gpa = vcpu->arch.gpa_val; + ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write); + } else { + ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); + if (ret < 0) + return X86EMUL_PROPAGATE_FAULT; } - ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); - - if (ret < 0) - return X86EMUL_PROPAGATE_FAULT; - - /* For APIC access vmexit */ - if (ret) - goto mmio; - - if (ops->read_write_emulate(vcpu, gpa, val, bytes)) + if (!ret && ops->read_write_emulate(vcpu, gpa, val, bytes)) return X86EMUL_CONTINUE; -mmio: /* * Is this MMIO handled locally? */ @@ -5198,10 +5244,10 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt, return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage); } -static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, - u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) +static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, + u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit) { - kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx); + return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit); } static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg) @@ -6011,7 +6057,7 @@ static void kvm_set_mmio_spte_mask(void) mask &= ~1ull; #endif - kvm_mmu_set_mmio_spte_mask(mask); + kvm_mmu_set_mmio_spte_mask(mask, mask); } #ifdef CONFIG_X86_64 @@ -6097,7 +6143,7 @@ int kvm_arch_init(void *opaque) kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0, - PT_PRESENT_MASK, 0); + PT_PRESENT_MASK, 0, sme_me_mask); kvm_timer_init(); perf_register_guest_info_callbacks(&kvm_guest_cbs); @@ -6199,6 +6245,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) lapic_irq.shorthand = 0; lapic_irq.dest_mode = 0; + lapic_irq.level = 0; lapic_irq.dest_id = apicid; lapic_irq.msi_redir_hint = false; @@ -6332,11 +6379,42 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) int r; /* try to reinject previous events if any */ + if (vcpu->arch.exception.injected) { + kvm_x86_ops->queue_exception(vcpu); + return 0; + } + + /* + * Exceptions must be injected immediately, or the exception + * frame will have the address of the NMI or interrupt handler. 
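The reworked inject_pending_event() hunk (started above, continued just below) completes any previously injected exception first, refuses to re-inject NMIs or interrupts while an exception is pending, and only then considers new SMI/NMI/IRQ events. A simplified stand-alone model of that ordering follows; the vcpu_events structure is an illustrative stand-in, and the nested-event and window checks of the real code are omitted.

/* Stand-alone model of the injection order in inject_pending_event(). */
#include <stdbool.h>
#include <stdio.h>

struct vcpu_events {
	bool exception_injected;   /* exception already on its way into the guest */
	bool exception_pending;    /* exception queued but not yet injected */
	bool nmi_injected;
	bool irq_injected;
	bool smi_pending;
	bool nmi_pending;
	bool irq_pending;
};

static const char *next_event(const struct vcpu_events *e)
{
	if (e->exception_injected)
		return "re-inject exception";

	/* NMI/IRQ re-injection must not run ahead of a pending exception,
	 * or the exception frame would point at the handler instead of the
	 * faulting instruction. */
	if (!e->exception_pending) {
		if (e->nmi_injected)
			return "re-inject NMI";
		if (e->irq_injected)
			return "re-inject IRQ";
	}

	if (e->exception_pending)
		return "inject exception";
	if (e->smi_pending)
		return "inject SMI";
	if (e->nmi_pending)
		return "inject NMI";
	if (e->irq_pending)
		return "inject IRQ";
	return "nothing";
}

int main(void)
{
	struct vcpu_events e = { .exception_pending = true, .nmi_injected = true };

	/* The pending exception wins; the NMI re-injection waits a round. */
	printf("%s\n", next_event(&e));
	return 0;
}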
+ */ + if (!vcpu->arch.exception.pending) { + if (vcpu->arch.nmi_injected) { + kvm_x86_ops->set_nmi(vcpu); + return 0; + } + + if (vcpu->arch.interrupt.pending) { + kvm_x86_ops->set_irq(vcpu); + return 0; + } + } + + if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) { + r = kvm_x86_ops->check_nested_events(vcpu, req_int_win); + if (r != 0) + return r; + } + + /* try to inject new event if pending */ if (vcpu->arch.exception.pending) { trace_kvm_inj_exception(vcpu->arch.exception.nr, vcpu->arch.exception.has_error_code, vcpu->arch.exception.error_code); + vcpu->arch.exception.pending = false; + vcpu->arch.exception.injected = true; + if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT) __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) | X86_EFLAGS_RF); @@ -6347,31 +6425,8 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) kvm_update_dr7(vcpu); } - kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, - vcpu->arch.exception.has_error_code, - vcpu->arch.exception.error_code, - vcpu->arch.exception.reinject); - return 0; - } - - if (vcpu->arch.nmi_injected) { - kvm_x86_ops->set_nmi(vcpu); - return 0; - } - - if (vcpu->arch.interrupt.pending) { - kvm_x86_ops->set_irq(vcpu); - return 0; - } - - if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) { - r = kvm_x86_ops->check_nested_events(vcpu, req_int_win); - if (r != 0) - return r; - } - - /* try to inject new event if pending */ - if (vcpu->arch.smi_pending && !is_smm(vcpu)) { + kvm_x86_ops->queue_exception(vcpu); + } else if (vcpu->arch.smi_pending && !is_smm(vcpu)) { vcpu->arch.smi_pending = false; enter_smm(vcpu); } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) { @@ -6588,7 +6643,7 @@ static void enter_smm(struct kvm_vcpu *vcpu) trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true); vcpu->arch.hflags |= HF_SMM_MASK; memset(buf, 0, 512); - if (guest_cpuid_has_longmode(vcpu)) + if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) enter_smm_save_state_64(vcpu, buf); else enter_smm_save_state_32(vcpu, buf); @@ -6640,7 +6695,7 @@ static void enter_smm(struct kvm_vcpu *vcpu) kvm_set_segment(vcpu, &ds, VCPU_SREG_GS); kvm_set_segment(vcpu, &ds, VCPU_SREG_SS); - if (guest_cpuid_has_longmode(vcpu)) + if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) kvm_x86_ops->set_efer(vcpu, 0); kvm_update_cpuid(vcpu); @@ -6708,17 +6763,6 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page); -void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, - unsigned long address) -{ - /* - * The physical address of apic access page is stored in the VMCS. - * Update it when it becomes invalid. - */ - if (address == gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT)) - kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); -} - /* * Returns 1 to let vcpu_run() continue the guest execution loop without * exiting to the userspace. 
Otherwise, the value will be returned to the @@ -6733,7 +6777,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_immediate_exit = false; - if (vcpu->requests) { + if (kvm_request_pending(vcpu)) { if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) kvm_mmu_unload(vcpu); if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) @@ -6758,6 +6802,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; + vcpu->mmio_needed = 0; r = 0; goto out; } @@ -6846,6 +6891,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_ops->enable_nmi_window(vcpu); if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) kvm_x86_ops->enable_irq_window(vcpu); + WARN_ON(vcpu->arch.exception.pending); } if (kvm_lapic_enabled(vcpu)) { @@ -6897,7 +6943,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_ops->sync_pir_to_irr(vcpu); } - if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests + if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) || need_resched() || signal_pending(current)) { vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); @@ -6988,6 +7034,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (vcpu->arch.apic_attention) kvm_lapic_sync_from_vapic(vcpu); + vcpu->arch.gpa_available = false; r = kvm_x86_ops->handle_exit(vcpu); return r; @@ -7178,16 +7225,25 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int r; sigset_t sigsaved; - fpu__activate_curr(fpu); + fpu__initialize(fpu); if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { + if (kvm_run->immediate_exit) { + r = -EINTR; + goto out; + } kvm_vcpu_block(vcpu); kvm_apic_accept_events(vcpu); kvm_clear_request(KVM_REQ_UNHALT, vcpu); r = -EAGAIN; + if (signal_pending(current)) { + r = -EINTR; + vcpu->run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.signal_exits; + } goto out; } @@ -7406,7 +7462,13 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, int pending_vec, max_bits, idx; struct desc_ptr dt; - if (!guest_cpuid_has_xsave(vcpu) && (sregs->cr4 & X86_CR4_OSXSAVE)) + if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && + (sregs->cr4 & X86_CR4_OSXSAVE)) + return -EINVAL; + + apic_base_msr.data = sregs->apic_base; + apic_base_msr.host_initiated = true; + if (kvm_set_apic_base(vcpu, &apic_base_msr)) return -EINVAL; dt.size = sregs->idt.limit; @@ -7425,9 +7487,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, mmu_reset_needed |= vcpu->arch.efer != sregs->efer; kvm_x86_ops->set_efer(vcpu, sregs->efer); - apic_base_msr.data = sregs->apic_base; - apic_base_msr.host_initiated = true; - kvm_set_apic_base(vcpu, &apic_base_msr); mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; kvm_x86_ops->set_cr0(vcpu, sregs->cr0); @@ -7616,7 +7675,9 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) */ vcpu->guest_fpu_loaded = 1; __kernel_fpu_begin(); - __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state); + /* PKRU is separately restored in kvm_x86_ops->run. 
*/ + __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state, + ~XFEATURE_MASK_PKRU); trace_kvm_fpu(1); } @@ -7676,6 +7737,8 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) struct msr_data msr; struct kvm *kvm = vcpu->kvm; + kvm_hv_vcpu_postcreate(vcpu); + if (vcpu_load(vcpu)) return; msr.data = 0x0; @@ -7714,6 +7777,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.nmi_injected = false; kvm_clear_interrupt_queue(vcpu); kvm_clear_exception_queue(vcpu); + vcpu->arch.exception.pending = false; memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); kvm_update_dr0123(vcpu); @@ -7829,8 +7893,8 @@ int kvm_arch_hardware_enable(void) */ if (backwards_tsc) { u64 delta_cyc = max_tsc - local_tsc; - backwards_tsc_observed = true; list_for_each_entry(kvm, &vm_list, vm_list) { + kvm->arch.backwards_tsc_observed = true; kvm_for_each_vcpu(i, vcpu, kvm) { vcpu->arch.tsc_offset_adjustment += delta_cyc; vcpu->arch.last_host_tsc = local_tsc; @@ -7916,7 +7980,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) BUG_ON(vcpu->kvm == NULL); kvm = vcpu->kvm; - vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(); + vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu); vcpu->arch.pv.pv_unhalted = false; vcpu->arch.emulate_ctxt.ops = &emulate_ops; if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu)) @@ -7973,6 +8037,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) kvm_pmu_init(vcpu); vcpu->arch.pending_external_vector = -1; + vcpu->arch.preempted_in_kernel = false; kvm_hv_vcpu_init(vcpu); @@ -8396,6 +8461,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) if (vcpu->arch.pv.pv_unhalted) return true; + if (vcpu->arch.exception.pending) + return true; + if (kvm_test_request(KVM_REQ_NMI, vcpu) || (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu))) @@ -8420,6 +8488,11 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); } +bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.preempted_in_kernel; +} + int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) { return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; @@ -8558,6 +8631,13 @@ static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) sizeof(val)); } +static int apf_get_user(struct kvm_vcpu *vcpu, u32 *val) +{ + + return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, val, + sizeof(u32)); +} + void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { @@ -8576,6 +8656,7 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, fault.error_code = 0; fault.nested_page_fault = false; fault.address = work->arch.token; + fault.async_page_fault = true; kvm_inject_page_fault(vcpu, &fault); } } @@ -8584,6 +8665,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { struct x86_exception fault; + u32 val; if (work->wakeup_all) work->arch.token = ~0; /* broadcast wakeup */ @@ -8591,14 +8673,26 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, kvm_del_async_pf_gfn(vcpu, work->arch.gfn); trace_kvm_async_pf_ready(work->arch.token, work->gva); - if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) && - !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { - fault.vector = PF_VECTOR; - fault.error_code_valid = true; - fault.error_code = 0; - fault.nested_page_fault = false; - fault.address = work->arch.token; - kvm_inject_page_fault(vcpu, &fault); + if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED && + !apf_get_user(vcpu, &val)) { + if (val == 
KVM_PV_REASON_PAGE_NOT_PRESENT && + vcpu->arch.exception.pending && + vcpu->arch.exception.nr == PF_VECTOR && + !apf_put_user(vcpu, 0)) { + vcpu->arch.exception.injected = false; + vcpu->arch.exception.pending = false; + vcpu->arch.exception.nr = 0; + vcpu->arch.exception.has_error_code = false; + vcpu->arch.exception.error_code = 0; + } else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { + fault.vector = PF_VECTOR; + fault.error_code_valid = true; + fault.error_code = 0; + fault.nested_page_fault = false; + fault.address = work->arch.token; + fault.async_page_fault = true; + kvm_inject_page_fault(vcpu, &fault); + } } vcpu->arch.apf.halted = false; vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 612067074905..d0b95b7a90b4 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef ARCH_X86_KVM_X86_H #define ARCH_X86_KVM_X86_H @@ -11,7 +12,7 @@ static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) { - vcpu->arch.exception.pending = false; + vcpu->arch.exception.injected = false; } static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector, @@ -29,7 +30,7 @@ static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu) static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu) { - return vcpu->arch.exception.pending || vcpu->arch.interrupt.pending || + return vcpu->arch.exception.injected || vcpu->arch.interrupt.pending || vcpu->arch.nmi_injected; } @@ -62,6 +63,16 @@ static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu) return cs_l; } +static inline bool is_la57_mode(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_X86_64 + return (vcpu->arch.efer & EFER_LMA) && + kvm_read_cr4_bits(vcpu, X86_CR4_LA57); +#else + return 0; +#endif +} + static inline bool mmu_is_nested(struct kvm_vcpu *vcpu) { return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu; @@ -87,10 +98,48 @@ static inline u32 bit(int bitno) return 1 << (bitno & 31); } +static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr4_bits(vcpu, X86_CR4_LA57) ? 57 : 48; +} + +static inline u8 ctxt_virt_addr_bits(struct x86_emulate_ctxt *ctxt) +{ + return (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_LA57) ? 57 : 48; +} + +static inline u64 get_canonical(u64 la, u8 vaddr_bits) +{ + return ((int64_t)la << (64 - vaddr_bits)) >> (64 - vaddr_bits); +} + +static inline bool is_noncanonical_address(u64 la, struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_X86_64 + return get_canonical(la, vcpu_virt_addr_bits(vcpu)) != la; +#else + return false; +#endif +} + +static inline bool emul_is_noncanonical_address(u64 la, + struct x86_emulate_ctxt *ctxt) +{ +#ifdef CONFIG_X86_64 + return get_canonical(la, ctxt_virt_addr_bits(ctxt)) != la; +#else + return false; +#endif +} + static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, unsigned access) { - vcpu->arch.mmio_gva = gva & PAGE_MASK; + /* + * If this is a shadow nested page table, the "GVA" is + * actually a nGPA. + */ + vcpu->arch.mmio_gva = mmu_is_nested(vcpu) ? 
0 : gva & PAGE_MASK; vcpu->arch.access = access; vcpu->arch.mmio_gfn = gfn; vcpu->arch.mmio_gen = kvm_memslots(vcpu->kvm)->generation; diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig deleted file mode 100644 index 08f41caada45..000000000000 --- a/arch/x86/lguest/Kconfig +++ /dev/null @@ -1,14 +0,0 @@ -config LGUEST_GUEST - bool "Lguest guest support" - depends on X86_32 && PARAVIRT && PCI - select TTY - select VIRTUALIZATION - select VIRTIO - select VIRTIO_CONSOLE - help - Lguest is a tiny in-kernel hypervisor. Selecting this will - allow your kernel to boot under lguest. This option will increase - your kernel size by about 10k. If in doubt, say N. - - If you say Y here, make sure you say Y (or M) to the virtio block - and net drivers which lguest needs. diff --git a/arch/x86/lguest/Makefile b/arch/x86/lguest/Makefile deleted file mode 100644 index 8f38d577a2fa..000000000000 --- a/arch/x86/lguest/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -obj-y := head_32.o boot.o -CFLAGS_boot.o := $(call cc-option, -fno-stack-protector) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c deleted file mode 100644 index 99472698c931..000000000000 --- a/arch/x86/lguest/boot.c +++ /dev/null @@ -1,1558 +0,0 @@ -/*P:010 - * A hypervisor allows multiple Operating Systems to run on a single machine. - * To quote David Wheeler: "Any problem in computer science can be solved with - * another layer of indirection." - * - * We keep things simple in two ways. First, we start with a normal Linux - * kernel and insert a module (lg.ko) which allows us to run other Linux - * kernels the same way we'd run processes. We call the first kernel the Host, - * and the others the Guests. The program which sets up and configures Guests - * (such as the example in tools/lguest/lguest.c) is called the Launcher. - * - * Secondly, we only run specially modified Guests, not normal kernels: setting - * CONFIG_LGUEST_GUEST to "y" compiles this file into the kernel so it knows - * how to be a Guest at boot time. This means that you can use the same kernel - * you boot normally (ie. as a Host) as a Guest. - * - * These Guests know that they cannot do privileged operations, such as disable - * interrupts, and that they have to ask the Host to do such things explicitly. - * This file consists of all the replacements for such low-level native - * hardware operations: these special Guest versions call the Host. - * - * So how does the kernel know it's a Guest? We'll see that later, but let's - * just say that we end up here where we replace the native functions various - * "paravirt" structures with our Guest versions, then boot like normal. -:*/ - -/* - * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ -#include <linux/kernel.h> -#include <linux/start_kernel.h> -#include <linux/string.h> -#include <linux/console.h> -#include <linux/screen_info.h> -#include <linux/irq.h> -#include <linux/interrupt.h> -#include <linux/clocksource.h> -#include <linux/clockchips.h> -#include <linux/lguest.h> -#include <linux/lguest_launcher.h> -#include <linux/virtio_console.h> -#include <linux/pm.h> -#include <linux/export.h> -#include <linux/pci.h> -#include <linux/virtio_pci.h> -#include <asm/acpi.h> -#include <asm/apic.h> -#include <asm/lguest.h> -#include <asm/paravirt.h> -#include <asm/param.h> -#include <asm/page.h> -#include <asm/pgtable.h> -#include <asm/desc.h> -#include <asm/setup.h> -#include <asm/e820/api.h> -#include <asm/mce.h> -#include <asm/io.h> -#include <asm/fpu/api.h> -#include <asm/stackprotector.h> -#include <asm/reboot.h> /* for struct machine_ops */ -#include <asm/kvm_para.h> -#include <asm/pci_x86.h> -#include <asm/pci-direct.h> - -/*G:010 - * Welcome to the Guest! - * - * The Guest in our tale is a simple creature: identical to the Host but - * behaving in simplified but equivalent ways. In particular, the Guest is the - * same kernel as the Host (or at least, built from the same source code). -:*/ - -struct lguest_data lguest_data = { - .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, - .noirq_iret = (u32)lguest_noirq_iret, - .kernel_address = PAGE_OFFSET, - .blocked_interrupts = { 1 }, /* Block timer interrupts */ - .syscall_vec = IA32_SYSCALL_VECTOR, -}; - -/*G:037 - * async_hcall() is pretty simple: I'm quite proud of it really. We have a - * ring buffer of stored hypercalls which the Host will run though next time we - * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall - * arguments, and a "hcall_status" word which is 0 if the call is ready to go, - * and 255 once the Host has finished with it. - * - * If we come around to a slot which hasn't been finished, then the table is - * full and we just make the hypercall directly. This has the nice side - * effect of causing the Host to run all the stored calls in the ring buffer - * which empties it for next time! - */ -static void async_hcall(unsigned long call, unsigned long arg1, - unsigned long arg2, unsigned long arg3, - unsigned long arg4) -{ - /* Note: This code assumes we're uniprocessor. */ - static unsigned int next_call; - unsigned long flags; - - /* - * Disable interrupts if not already disabled: we don't want an - * interrupt handler making a hypercall while we're already doing - * one! - */ - local_irq_save(flags); - if (lguest_data.hcall_status[next_call] != 0xFF) { - /* Table full, so do normal hcall which will flush table. */ - hcall(call, arg1, arg2, arg3, arg4); - } else { - lguest_data.hcalls[next_call].arg0 = call; - lguest_data.hcalls[next_call].arg1 = arg1; - lguest_data.hcalls[next_call].arg2 = arg2; - lguest_data.hcalls[next_call].arg3 = arg3; - lguest_data.hcalls[next_call].arg4 = arg4; - /* Arguments must all be written before we mark it to go */ - wmb(); - lguest_data.hcall_status[next_call] = 0; - if (++next_call == LHCALL_RING_SIZE) - next_call = 0; - } - local_irq_restore(flags); -} - -/*G:035 - * Notice the lazy_hcall() above, rather than hcall(). This is our first real - * optimization trick! - * - * When lazy_mode is set, it means we're allowed to defer all hypercalls and do - * them as a batch when lazy_mode is eventually turned off. Because hypercalls - * are reasonably expensive, batching them up makes sense. 
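A toy model may make the async_hcall() ring described above easier to follow. This is a hypothetical userspace sketch, not lguest code: a printf stands in for the real hypercall trap, and the status convention (0xFF means the Host has finished with a slot, 0 means it is queued) is taken from the comment above.

#include <stdio.h>

#define RING_SIZE 64

struct pending_call { unsigned long call, arg1; };

/* GCC range initializer, mirroring the lguest_data initializer above. */
static unsigned char status[RING_SIZE] = { [0 ... RING_SIZE - 1] = 0xFF };
static struct pending_call ring[RING_SIZE];
static unsigned int next_call;

/* Stand-in for a real hypercall; a direct call also drains the ring. */
static void direct_hcall(unsigned long call, unsigned long arg1)
{
	printf("direct hcall %lu(%lu), Host drains the ring here\n", call, arg1);
	for (unsigned int i = 0; i < RING_SIZE; i++)
		status[i] = 0xFF;	/* Host marks every slot finished. */
}

static void async_hcall(unsigned long call, unsigned long arg1)
{
	if (status[next_call] != 0xFF) {
		/* Ring full: fall back to a direct call, which empties it. */
		direct_hcall(call, arg1);
		return;
	}
	ring[next_call].call = call;
	ring[next_call].arg1 = arg1;
	status[next_call] = 0;		/* Mark ready for the Host. */
	if (++next_call == RING_SIZE)
		next_call = 0;
	printf("queued hcall %lu(%lu)\n", call, arg1);
}

int main(void)
{
	for (unsigned long i = 0; i < RING_SIZE + 2; i++)
		async_hcall(1, i);	/* the 65th call overflows and drains */
	return 0;
}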
For example, a - * large munmap might update dozens of page table entries: that code calls - * paravirt_enter_lazy_mmu(), does the dozen updates, then calls - * lguest_leave_lazy_mode(). - * - * So, when we're in lazy mode, we call async_hcall() to store the call for - * future processing: - */ -static void lazy_hcall1(unsigned long call, unsigned long arg1) -{ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - hcall(call, arg1, 0, 0, 0); - else - async_hcall(call, arg1, 0, 0, 0); -} - -/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/ -static void lazy_hcall2(unsigned long call, - unsigned long arg1, - unsigned long arg2) -{ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - hcall(call, arg1, arg2, 0, 0); - else - async_hcall(call, arg1, arg2, 0, 0); -} - -static void lazy_hcall3(unsigned long call, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3) -{ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - hcall(call, arg1, arg2, arg3, 0); - else - async_hcall(call, arg1, arg2, arg3, 0); -} - -#ifdef CONFIG_X86_PAE -static void lazy_hcall4(unsigned long call, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3, - unsigned long arg4) -{ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - hcall(call, arg1, arg2, arg3, arg4); - else - async_hcall(call, arg1, arg2, arg3, arg4); -} -#endif - -/*G:036 - * When lazy mode is turned off, we issue the do-nothing hypercall to - * flush any stored calls, and call the generic helper to reset the - * per-cpu lazy mode variable. - */ -static void lguest_leave_lazy_mmu_mode(void) -{ - hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); - paravirt_leave_lazy_mmu(); -} - -/* - * We also catch the end of context switch; we enter lazy mode for much of - * that too, so again we need to flush here. - * - * (Technically, this is lazy CPU mode, and normally we're in lazy MMU - * mode, but unlike Xen, lguest doesn't care about the difference). - */ -static void lguest_end_context_switch(struct task_struct *next) -{ - hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); - paravirt_end_context_switch(next); -} - -/*G:032 - * After that diversion we return to our first native-instruction - * replacements: four functions for interrupt control. - * - * The simplest way of implementing these would be to have "turn interrupts - * off" and "turn interrupts on" hypercalls. Unfortunately, this is too slow: - * these are by far the most commonly called functions of those we override. - * - * So instead we keep an "irq_enabled" field inside our "struct lguest_data", - * which the Guest can update with a single instruction. The Host knows to - * check there before it tries to deliver an interrupt. - */ - -/* - * save_flags() is expected to return the processor state (ie. "flags"). The - * flags word contains all kind of stuff, but in practice Linux only cares - * about the interrupt flag. Our "save_flags()" just returns that. - */ -asmlinkage __visible unsigned long lguest_save_fl(void) -{ - return lguest_data.irq_enabled; -} - -/* Interrupts go off... */ -asmlinkage __visible void lguest_irq_disable(void) -{ - lguest_data.irq_enabled = 0; -} - -/* - * Let's pause a moment. Remember how I said these are called so often? - * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to - * break some rules. In particular, these functions are assumed to save their - * own registers if they need to: normal C functions assume they can trash the - * eax register. 
To use normal C functions, we use - * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the - * C function, then restores it. - */ -PV_CALLEE_SAVE_REGS_THUNK(lguest_save_fl); -PV_CALLEE_SAVE_REGS_THUNK(lguest_irq_disable); -/*:*/ - -/* These are in head_32.S */ -extern void lg_irq_enable(void); -extern void lg_restore_fl(unsigned long flags); - -/*M:003 - * We could be more efficient in our checking of outstanding interrupts, rather - * than using a branch. One way would be to put the "irq_enabled" field in a - * page by itself, and have the Host write-protect it when an interrupt comes - * in when irqs are disabled. There will then be a page fault as soon as - * interrupts are re-enabled. - * - * A better method is to implement soft interrupt disable generally for x86: - * instead of disabling interrupts, we set a flag. If an interrupt does come - * in, we then disable them for real. This is uncommon, so we could simply use - * a hypercall for interrupt control and not worry about efficiency. -:*/ - -/*G:034 - * The Interrupt Descriptor Table (IDT). - * - * The IDT tells the processor what to do when an interrupt comes in. Each - * entry in the table is a 64-bit descriptor: this holds the privilege level, - * address of the handler, and... well, who cares? The Guest just asks the - * Host to make the change anyway, because the Host controls the real IDT. - */ -static void lguest_write_idt_entry(gate_desc *dt, - int entrynum, const gate_desc *g) -{ - /* - * The gate_desc structure is 8 bytes long: we hand it to the Host in - * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors - * around like this; typesafety wasn't a big concern in Linux's early - * years. - */ - u32 *desc = (u32 *)g; - /* Keep the local copy up to date. */ - native_write_idt_entry(dt, entrynum, g); - /* Tell Host about this new entry. */ - hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1], 0); -} - -/* - * Changing to a different IDT is very rare: we keep the IDT up-to-date every - * time it is written, so we can simply loop through all entries and tell the - * Host about them. - */ -static void lguest_load_idt(const struct desc_ptr *desc) -{ - unsigned int i; - struct desc_struct *idt = (void *)desc->address; - - for (i = 0; i < (desc->size+1)/8; i++) - hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b, 0); -} - -/* - * The Global Descriptor Table. - * - * The Intel architecture defines another table, called the Global Descriptor - * Table (GDT). You tell the CPU where it is (and its size) using the "lgdt" - * instruction, and then several other instructions refer to entries in the - * table. There are three entries which the Switcher needs, so the Host simply - * controls the entire thing and the Guest asks it to make changes using the - * LOAD_GDT hypercall. - * - * This is the exactly like the IDT code. - */ -static void lguest_load_gdt(const struct desc_ptr *desc) -{ - unsigned int i; - struct desc_struct *gdt = (void *)desc->address; - - for (i = 0; i < (desc->size+1)/8; i++) - hcall(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b, 0); -} - -/* - * For a single GDT entry which changes, we simply change our copy and - * then tell the host about it. - */ -static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, - const void *desc, int type) -{ - native_write_gdt_entry(dt, entrynum, desc, type); - /* Tell Host about this new entry. 
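Since the comment above mentions handing the 8-byte gate descriptor to the Host as two 32-bit chunks, here is a small, purely illustrative split of a 64-bit descriptor into the desc[0]/desc[1] words those hypercalls take. The descriptor value is an example flat code segment and the printed call is only a stand-in, not a real hcall.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* An example flat code-segment descriptor (base 0, 4GB limit). */
	uint64_t desc = 0x00cf9a000000ffffULL;
	uint32_t lo = (uint32_t)desc;		/* desc[0] on little-endian */
	uint32_t hi = (uint32_t)(desc >> 32);	/* desc[1] */

	printf("hcall(LHCALL_LOAD_GDT_ENTRY, entry, %#010x, %#010x, 0)\n", lo, hi);
	return 0;
}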
*/ - hcall(LHCALL_LOAD_GDT_ENTRY, entrynum, - dt[entrynum].a, dt[entrynum].b, 0); -} - -/* - * There are three "thread local storage" GDT entries which change - * on every context switch (these three entries are how glibc implements - * __thread variables). As an optimization, we have a hypercall - * specifically for this case. - * - * Wouldn't it be nicer to have a general LOAD_GDT_ENTRIES hypercall - * which took a range of entries? - */ -static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) -{ - /* - * There's one problem which normal hardware doesn't have: the Host - * can't handle us removing entries we're currently using. So we clear - * the GS register here: if it's needed it'll be reloaded anyway. - */ - lazy_load_gs(0); - lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu); -} - -/*G:038 - * That's enough excitement for now, back to ploughing through each of the - * different pv_ops structures (we're about 1/3 of the way through). - * - * This is the Local Descriptor Table, another weird Intel thingy. Linux only - * uses this for some strange applications like Wine. We don't do anything - * here, so they'll get an informative and friendly Segmentation Fault. - */ -static void lguest_set_ldt(const void *addr, unsigned entries) -{ -} - -/* - * This loads a GDT entry into the "Task Register": that entry points to a - * structure called the Task State Segment. Some comments scattered though the - * kernel code indicate that this used for task switching in ages past, along - * with blood sacrifice and astrology. - * - * Now there's nothing interesting in here that we don't get told elsewhere. - * But the native version uses the "ltr" instruction, which makes the Host - * complain to the Guest about a Segmentation Fault and it'll oops. So we - * override the native version with a do-nothing version. - */ -static void lguest_load_tr_desc(void) -{ -} - -/* - * The "cpuid" instruction is a way of querying both the CPU identity - * (manufacturer, model, etc) and its features. It was introduced before the - * Pentium in 1993 and keeps getting extended by both Intel, AMD and others. - * As you might imagine, after a decade and a half this treatment, it is now a - * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. - * - * This instruction even it has its own Wikipedia entry. The Wikipedia entry - * has been translated into 6 languages. I am not making this up! - * - * We could get funky here and identify ourselves as "GenuineLguest", but - * instead we just use the real "cpuid" instruction. Then I pretty much turned - * off feature bits until the Guest booted. (Don't say that: you'll damage - * lguest sales!) Shut up, inner voice! (Hey, just pointing out that this is - * hardly future proof.) No one's listening! They don't like you anyway, - * parenthetic weirdo! - * - * Replacing the cpuid so we can turn features off is great for the kernel, but - * anyone (including userspace) can just use the raw "cpuid" instruction and - * the Host won't even notice since it isn't privileged. So we try not to get - * too worked up about it. - */ -static void lguest_cpuid(unsigned int *ax, unsigned int *bx, - unsigned int *cx, unsigned int *dx) -{ - int function = *ax; - - native_cpuid(ax, bx, cx, dx); - switch (function) { - /* - * CPUID 0 gives the highest legal CPUID number (and the ID string). - * We futureproof our code a little by sticking to known CPUID values. 
- */ - case 0: - if (*ax > 5) - *ax = 5; - break; - - /* - * CPUID 1 is a basic feature request. - * - * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3 - * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE. - */ - case 1: - *cx &= 0x00002201; - *dx &= 0x07808151; - /* - * The Host can do a nice optimization if it knows that the - * kernel mappings (addresses above 0xC0000000 or whatever - * PAGE_OFFSET is set to) haven't changed. But Linux calls - * flush_tlb_user() for both user and kernel mappings unless - * the Page Global Enable (PGE) feature bit is set. - */ - *dx |= 0x00002000; - /* - * We also lie, and say we're family id 5. 6 or greater - * leads to a rdmsr in early_init_intel which we can't handle. - * Family ID is returned as bits 8-12 in ax. - */ - *ax &= 0xFFFFF0FF; - *ax |= 0x00000500; - break; - - /* - * This is used to detect if we're running under KVM. We might be, - * but that's a Host matter, not us. So say we're not. - */ - case KVM_CPUID_SIGNATURE: - *bx = *cx = *dx = 0; - break; - - /* - * 0x80000000 returns the highest Extended Function, so we futureproof - * like we do above by limiting it to known fields. - */ - case 0x80000000: - if (*ax > 0x80000008) - *ax = 0x80000008; - break; - - /* - * PAE systems can mark pages as non-executable. Linux calls this the - * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced - * Virus Protection). We just switch it off here, since we don't - * support it. - */ - case 0x80000001: - *dx &= ~(1 << 20); - break; - } -} - -/* - * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. - * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother - * it. The Host needs to know when the Guest wants to change them, so we have - * a whole series of functions like read_cr0() and write_cr0(). - * - * We start with cr0. cr0 allows you to turn on and off all kinds of basic - * features, but the only cr0 bit that Linux ever used at runtime was the - * horrifically-named Task Switched (TS) bit at bit 3 (ie. 8) - * - * What does the TS bit do? Well, it causes the CPU to trap (interrupt 7) if - * the floating point unit is used. Which allows us to restore FPU state - * lazily after a task switch if we wanted to, but wouldn't a name like - * "FPUTRAP bit" be a little less cryptic? - * - * Fortunately, Linux keeps it simple and doesn't use TS, so we can ignore - * cr0. - */ -static void lguest_write_cr0(unsigned long val) -{ -} - -static unsigned long lguest_read_cr0(void) -{ - return 0; -} - -/* - * cr2 is the virtual address of the last page fault, which the Guest only ever - * reads. The Host kindly writes this into our "struct lguest_data", so we - * just read it out of there. - */ -static unsigned long lguest_read_cr2(void) -{ - return lguest_data.cr2; -} - -/* See lguest_set_pte() below. */ -static bool cr3_changed = false; -static unsigned long current_cr3; - -/* - * cr3 is the current toplevel pagetable page: the principle is the same as - * cr0. Keep a local copy, and tell the Host when it changes. - */ -static void lguest_write_cr3(unsigned long cr3) -{ - lazy_hcall1(LHCALL_NEW_PGTABLE, cr3); - current_cr3 = cr3; - - /* These two page tables are simple, linear, and used during boot */ - if (cr3 != __pa_symbol(swapper_pg_dir) && - cr3 != __pa_symbol(initial_page_table)) - cr3_changed = true; -} - -static unsigned long lguest_read_cr3(void) -{ - return current_cr3; -} - -/* cr4 is used to enable and disable PGE, but we don't care. 
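To see what lguest_cpuid() above actually leaves visible, a hedged userspace harness can run real CPUID and apply the same masks. This assumes GCC/Clang's <cpuid.h> on an x86 host and is only an illustration of the filtering, not guest code; the mask values are copied from the function above.

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int ax, bx, cx, dx;

	if (!__get_cpuid(1, &ax, &bx, &cx, &dx))
		return 1;

	printf("raw:      ecx=%08x edx=%08x family=%x\n",
	       cx, dx, (ax >> 8) & 0xf);

	cx &= 0x00002201;	/* SSE3, CMPXCHG16B, SSSE3 only */
	dx &= 0x07808151;	/* FPU, TSC, PAE, CX8, CMOV, MMX, FXSR, SSE, SSE2 */
	dx |= 0x00002000;	/* advertise PGE (see the comment above) */
	ax = (ax & 0xfffff0ff) | 0x00000500;	/* claim family 5 */

	printf("filtered: ecx=%08x edx=%08x family=%x\n",
	       cx, dx, (ax >> 8) & 0xf);
	return 0;
}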
*/ -static unsigned long lguest_read_cr4(void) -{ - return 0; -} - -static void lguest_write_cr4(unsigned long val) -{ -} - -/* - * Page Table Handling. - * - * Now would be a good time to take a rest and grab a coffee or similarly - * relaxing stimulant. The easy parts are behind us, and the trek gradually - * winds uphill from here. - * - * Quick refresher: memory is divided into "pages" of 4096 bytes each. The CPU - * maps virtual addresses to physical addresses using "page tables". We could - * use one huge index of 1 million entries: each address is 4 bytes, so that's - * 1024 pages just to hold the page tables. But since most virtual addresses - * are unused, we use a two level index which saves space. The cr3 register - * contains the physical address of the top level "page directory" page, which - * contains physical addresses of up to 1024 second-level pages. Each of these - * second level pages contains up to 1024 physical addresses of actual pages, - * or Page Table Entries (PTEs). - * - * Here's a diagram, where arrows indicate physical addresses: - * - * cr3 ---> +---------+ - * | --------->+---------+ - * | | | PADDR1 | - * Mid-level | | PADDR2 | - * (PMD) page | | | - * | | Lower-level | - * | | (PTE) page | - * | | | | - * .... .... - * - * So to convert a virtual address to a physical address, we look up the top - * level, which points us to the second level, which gives us the physical - * address of that page. If the top level entry was not present, or the second - * level entry was not present, then the virtual address is invalid (we - * say "the page was not mapped"). - * - * Put another way, a 32-bit virtual address is divided up like so: - * - * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - * |<---- 10 bits ---->|<---- 10 bits ---->|<------ 12 bits ------>| - * Index into top Index into second Offset within page - * page directory page pagetable page - * - * Now, unfortunately, this isn't the whole story: Intel added Physical Address - * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits). - * These are held in 64-bit page table entries, so we can now only fit 512 - * entries in a page, and the neat three-level tree breaks down. - * - * The result is a four level page table: - * - * cr3 --> [ 4 Upper ] - * [ Level ] - * [ Entries ] - * [(PUD Page)]---> +---------+ - * | --------->+---------+ - * | | | PADDR1 | - * Mid-level | | PADDR2 | - * (PMD) page | | | - * | | Lower-level | - * | | (PTE) page | - * | | | | - * .... .... - * - * - * And the virtual address is decoded as: - * - * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - * |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>| - * Index into Index into mid Index into lower Offset within page - * top entries directory page pagetable page - * - * It's too hard to switch between these two formats at runtime, so Linux only - * supports one or the other depending on whether CONFIG_X86_PAE is set. Many - * distributions turn it on, and not just for people with silly amounts of - * memory: the larger PTE entries allow room for the NX bit, which lets the - * kernel disable execution of pages and increase security. - * - * This was a problem for lguest, which couldn't run on these distributions; - * then Matias Zabaljauregui figured it all out and implemented it, and only a - * handful of puppies were crushed in the process! 
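The two address layouts in the diagram above are easy to check numerically. A short, self-contained sketch that splits one 32-bit virtual address both ways, classic two-level paging and PAE, purely for illustration:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t va = 0xc0123456;	/* an arbitrary kernel-ish address */

	/* Classic 32-bit paging: 10 + 10 + 12 bits. */
	printf("2-level: pgd=%u pte=%u offset=0x%03x\n",
	       (va >> 22) & 0x3ff, (va >> 12) & 0x3ff, va & 0xfff);

	/* PAE: 2 + 9 + 9 + 12 bits (512 64-bit entries per page). */
	printf("PAE:     pud=%u pmd=%u pte=%u offset=0x%03x\n",
	       va >> 30, (va >> 21) & 0x1ff, (va >> 12) & 0x1ff, va & 0xfff);
	return 0;
}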
- * - * Back to our point: the kernel spends a lot of time changing both the - * top-level page directory and lower-level pagetable pages. The Guest doesn't - * know physical addresses, so while it maintains these page tables exactly - * like normal, it also needs to keep the Host informed whenever it makes a - * change: the Host will create the real page tables based on the Guests'. - */ - -/* - * The Guest calls this after it has set a second-level entry (pte), ie. to map - * a page into a process' address space. We tell the Host the toplevel and - * address this corresponds to. The Guest uses one pagetable per process, so - * we need to tell the Host which one we're changing (mm->pgd). - */ -static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ -#ifdef CONFIG_X86_PAE - /* PAE needs to hand a 64 bit page table entry, so it uses two args. */ - lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, - ptep->pte_low, ptep->pte_high); -#else - lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low); -#endif -} - -/* This is the "set and update" combo-meal-deal version. */ -static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval) -{ - native_set_pte(ptep, pteval); - lguest_pte_update(mm, addr, ptep); -} - -/* - * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd - * to set a middle-level entry when PAE is activated. - * - * Again, we set the entry then tell the Host which page we changed, - * and the index of the entry we changed. - */ -#ifdef CONFIG_X86_PAE -static void lguest_set_pud(pud_t *pudp, pud_t pudval) -{ - native_set_pud(pudp, pudval); - - /* 32 bytes aligned pdpt address and the index. */ - lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0, - (__pa(pudp) & 0x1F) / sizeof(pud_t)); -} - -static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) -{ - native_set_pmd(pmdp, pmdval); - lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK, - (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); -} -#else - -/* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. */ -static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) -{ - native_set_pmd(pmdp, pmdval); - lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK, - (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); -} -#endif - -/* - * There are a couple of legacy places where the kernel sets a PTE, but we - * don't know the top level any more. This is useless for us, since we don't - * know which pagetable is changing or what address, so we just tell the Host - * to forget all of them. Fortunately, this is very rare. - * - * ... except in early boot when the kernel sets up the initial pagetables, - * which makes booting astonishingly slow: 48 seconds! So we don't even tell - * the Host anything changed until we've done the first real page table switch, - * which brings boot back to 4.3 seconds. - */ -static void lguest_set_pte(pte_t *ptep, pte_t pteval) -{ - native_set_pte(ptep, pteval); - if (cr3_changed) - lazy_hcall1(LHCALL_FLUSH_TLB, 1); -} - -#ifdef CONFIG_X86_PAE -/* - * With 64-bit PTE values, we need to be careful setting them: if we set 32 - * bits at a time, the hardware could see a weird half-set entry. These - * versions ensure we update all 64 bits at once. 
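The "weird half-set entry" problem is worth seeing concretely: a 64-bit PTE written as two 32-bit halves has a window in which an observer sees a mix of old and new bits. A hedged userspace analogy follows, using a union and the GCC/Clang __atomic builtins in place of the cmpxchg8b-based native helpers; none of this is kernel code.

#include <stdint.h>
#include <stdio.h>

/* A fake 64-bit "PTE", viewable as two 32-bit halves (little-endian). */
union pte {
	uint64_t whole;
	uint32_t half[2];
};

/*
 * Torn update: low half first, high half later. A concurrent page walk
 * between the two stores would see new low bits with the old high bits,
 * i.e. a bogus frame address.
 */
static void set_pte_torn(union pte *p, uint64_t val)
{
	p->half[0] = (uint32_t)val;
	/* ...window where the entry is inconsistent... */
	p->half[1] = (uint32_t)(val >> 32);
}

/* One 64-bit store: an observer sees either the old or the new entry. */
static void set_pte_whole(union pte *p, uint64_t val)
{
	__atomic_store_n(&p->whole, val, __ATOMIC_SEQ_CST);
}

int main(void)
{
	union pte p = { .whole = 0 };

	set_pte_torn(&p, 0x0000000100000fe7ULL);
	set_pte_whole(&p, 0x0000000200000fe7ULL);
	printf("pte = %#llx\n", (unsigned long long)p.whole);
	return 0;
}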
- */ -static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) -{ - native_set_pte_atomic(ptep, pte); - if (cr3_changed) - lazy_hcall1(LHCALL_FLUSH_TLB, 1); -} - -static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - native_pte_clear(mm, addr, ptep); - lguest_pte_update(mm, addr, ptep); -} - -static void lguest_pmd_clear(pmd_t *pmdp) -{ - lguest_set_pmd(pmdp, __pmd(0)); -} -#endif - -/* - * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on - * native page table operations. On native hardware you can set a new page - * table entry whenever you want, but if you want to remove one you have to do - * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). - * - * So the lguest_set_pte_at() and lguest_set_pmd() functions above are only - * called when a valid entry is written, not when it's removed (ie. marked not - * present). Instead, this is where we come when the Guest wants to remove a - * page table entry: we tell the Host to set that entry to 0 (ie. the present - * bit is zero). - */ -static void lguest_flush_tlb_single(unsigned long addr) -{ - /* Simply set it to zero: if it was not, it will fault back in. */ - lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0); -} - -/* - * This is what happens after the Guest has removed a large number of entries. - * This tells the Host that any of the page table entries for userspace might - * have changed, ie. virtual addresses below PAGE_OFFSET. - */ -static void lguest_flush_tlb_user(void) -{ - lazy_hcall1(LHCALL_FLUSH_TLB, 0); -} - -/* - * This is called when the kernel page tables have changed. That's not very - * common (unless the Guest is using highmem, which makes the Guest extremely - * slow), so it's worth separating this from the user flushing above. - */ -static void lguest_flush_tlb_kernel(void) -{ - lazy_hcall1(LHCALL_FLUSH_TLB, 1); -} - -/* - * The Unadvanced Programmable Interrupt Controller. - * - * This is an attempt to implement the simplest possible interrupt controller. - * I spent some time looking though routines like set_irq_chip_and_handler, - * set_irq_chip_and_handler_name, set_irq_chip_data and set_phasers_to_stun and - * I *think* this is as simple as it gets. - * - * We can tell the Host what interrupts we want blocked ready for using the - * lguest_data.interrupts bitmap, so disabling (aka "masking") them is as - * simple as setting a bit. We don't actually "ack" interrupts as such, we - * just mask and unmask them. I wonder if we should be cleverer? - */ -static void disable_lguest_irq(struct irq_data *data) -{ - set_bit(data->irq, lguest_data.blocked_interrupts); -} - -static void enable_lguest_irq(struct irq_data *data) -{ - clear_bit(data->irq, lguest_data.blocked_interrupts); -} - -/* This structure describes the lguest IRQ controller. */ -static struct irq_chip lguest_irq_controller = { - .name = "lguest", - .irq_mask = disable_lguest_irq, - .irq_mask_ack = disable_lguest_irq, - .irq_unmask = enable_lguest_irq, -}; - -/* - * Interrupt descriptors are allocated as-needed, but low-numbered ones are - * reserved by the generic x86 code. So we ignore irq_alloc_desc_at if it - * tells us the irq is already used: other errors (ie. ENOMEM) we take - * seriously. - */ -static int lguest_setup_irq(unsigned int irq) -{ - struct irq_desc *desc; - int err; - - /* Returns -ve error or vector number. 
*/ - err = irq_alloc_desc_at(irq, 0); - if (err < 0 && err != -EEXIST) - return err; - - /* - * Tell the Linux infrastructure that the interrupt is - * controlled by our level-based lguest interrupt controller. - */ - irq_set_chip_and_handler_name(irq, &lguest_irq_controller, - handle_level_irq, "level"); - - /* Some systems map "vectors" to interrupts weirdly. Not us! */ - desc = irq_to_desc(irq); - __this_cpu_write(vector_irq[FIRST_EXTERNAL_VECTOR + irq], desc); - return 0; -} - -static int lguest_enable_irq(struct pci_dev *dev) -{ - int err; - u8 line = 0; - - /* We literally use the PCI interrupt line as the irq number. */ - pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &line); - err = lguest_setup_irq(line); - if (!err) - dev->irq = line; - return err; -} - -/* We don't do hotplug PCI, so this shouldn't be called. */ -static void lguest_disable_irq(struct pci_dev *dev) -{ - WARN_ON(1); -} - -/* - * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware - * interrupt (except 128, which is used for system calls). - */ -static void __init lguest_init_IRQ(void) -{ - unsigned int i; - - for (i = FIRST_EXTERNAL_VECTOR; i < FIRST_SYSTEM_VECTOR; i++) { - if (i != IA32_SYSCALL_VECTOR) - set_intr_gate(i, irq_entries_start + - 8 * (i - FIRST_EXTERNAL_VECTOR)); - } - - /* - * This call is required to set up for 4k stacks, where we have - * separate stacks for hard and soft interrupts. - */ - irq_ctx_init(smp_processor_id()); -} - -/* - * Time. - * - * It would be far better for everyone if the Guest had its own clock, but - * until then the Host gives us the time on every interrupt. - */ -static void lguest_get_wallclock(struct timespec *now) -{ - *now = lguest_data.time; -} - -/* - * The TSC is an Intel thing called the Time Stamp Counter. The Host tells us - * what speed it runs at, or 0 if it's unusable as a reliable clock source. - * This matches what we want here: if we return 0 from this function, the x86 - * TSC clock will give up and not register itself. - */ -static unsigned long lguest_tsc_khz(void) -{ - return lguest_data.tsc_khz; -} - -/* - * If we can't use the TSC, the kernel falls back to our lower-priority - * "lguest_clock", where we read the time value given to us by the Host. - */ -static u64 lguest_clock_read(struct clocksource *cs) -{ - unsigned long sec, nsec; - - /* - * Since the time is in two parts (seconds and nanoseconds), we risk - * reading it just as it's changing from 99 & 0.999999999 to 100 and 0, - * and getting 99 and 0. As Linux tends to come apart under the stress - * of time travel, we must be careful: - */ - do { - /* First we read the seconds part. */ - sec = lguest_data.time.tv_sec; - /* - * This read memory barrier tells the compiler and the CPU that - * this can't be reordered: we have to complete the above - * before going on. - */ - rmb(); - /* Now we read the nanoseconds part. */ - nsec = lguest_data.time.tv_nsec; - /* Make sure we've done that. */ - rmb(); - /* Now if the seconds part has changed, try again. */ - } while (unlikely(lguest_data.time.tv_sec != sec)); - - /* Our lguest clock is in real nanoseconds. */ - return sec*1000000000ULL + nsec; -} - -/* This is the fallback clocksource: lower priority than the TSC clocksource. */ -static struct clocksource lguest_clock = { - .name = "lguest", - .rating = 200, - .read = lguest_clock_read, - .mask = CLOCKSOURCE_MASK(64), - .flags = CLOCK_SOURCE_IS_CONTINUOUS, -}; - -/* - * We also need a "struct clock_event_device": Linux asks us to set it to go - * off some time in the future. 
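The retry loop in lguest_clock_read() above is a general pattern for reading a value that is published in two halves. A simplified standalone version of the same idea, with a compiler barrier standing in for the rmb() the real code uses:

#include <stdint.h>
#include <stdio.h>

struct wallclock {
	volatile long tv_sec;
	volatile long tv_nsec;
};

#define barrier() __asm__ __volatile__("" ::: "memory")

/* Read seconds, then nanoseconds, and retry if seconds changed meanwhile. */
static uint64_t clock_read(const struct wallclock *t)
{
	long sec, nsec;

	do {
		sec = t->tv_sec;
		barrier();		/* complete the read above first */
		nsec = t->tv_nsec;
		barrier();
	} while (t->tv_sec != sec);	/* seconds rolled over: try again */

	return (uint64_t)sec * 1000000000ULL + (uint64_t)nsec;
}

int main(void)
{
	struct wallclock t = { .tv_sec = 99, .tv_nsec = 999999999 };

	printf("%llu ns\n", (unsigned long long)clock_read(&t));
	return 0;
}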
Actually, James Morris figured all this out, I - * just applied the patch. - */ -static int lguest_clockevent_set_next_event(unsigned long delta, - struct clock_event_device *evt) -{ - /* FIXME: I don't think this can ever happen, but James tells me he had - * to put this code in. Maybe we should remove it now. Anyone? */ - if (delta < LG_CLOCK_MIN_DELTA) { - if (printk_ratelimit()) - printk(KERN_DEBUG "%s: small delta %lu ns\n", - __func__, delta); - return -ETIME; - } - - /* Please wake us this far in the future. */ - hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0, 0); - return 0; -} - -static int lguest_clockevent_shutdown(struct clock_event_device *evt) -{ - /* A 0 argument shuts the clock down. */ - hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0, 0); - return 0; -} - -/* This describes our primitive timer chip. */ -static struct clock_event_device lguest_clockevent = { - .name = "lguest", - .features = CLOCK_EVT_FEAT_ONESHOT, - .set_next_event = lguest_clockevent_set_next_event, - .set_state_shutdown = lguest_clockevent_shutdown, - .rating = INT_MAX, - .mult = 1, - .shift = 0, - .min_delta_ns = LG_CLOCK_MIN_DELTA, - .min_delta_ticks = LG_CLOCK_MIN_DELTA, - .max_delta_ns = LG_CLOCK_MAX_DELTA, - .max_delta_ticks = LG_CLOCK_MAX_DELTA, -}; - -/* - * This is the Guest timer interrupt handler (hardware interrupt 0). We just - * call the clockevent infrastructure and it does whatever needs doing. - */ -static void lguest_time_irq(struct irq_desc *desc) -{ - unsigned long flags; - - /* Don't interrupt us while this is running. */ - local_irq_save(flags); - lguest_clockevent.event_handler(&lguest_clockevent); - local_irq_restore(flags); -} - -/* - * At some point in the boot process, we get asked to set up our timing - * infrastructure. The kernel doesn't expect timer interrupts before this, but - * we cleverly initialized the "blocked_interrupts" field of "struct - * lguest_data" so that timer interrupts were blocked until now. - */ -static void lguest_time_init(void) -{ - /* Set up the timer interrupt (0) to go to our simple timer routine */ - if (lguest_setup_irq(0) != 0) - panic("Could not set up timer irq"); - irq_set_handler(0, lguest_time_irq); - - clocksource_register_hz(&lguest_clock, NSEC_PER_SEC); - - /* We can't set cpumask in the initializer: damn C limitations! Set it - * here and register our timer device. */ - lguest_clockevent.cpumask = cpumask_of(0); - clockevents_register_device(&lguest_clockevent); - - /* Finally, we unblock the timer interrupt. */ - clear_bit(0, lguest_data.blocked_interrupts); -} - -/* - * Miscellaneous bits and pieces. - * - * Here is an oddball collection of functions which the Guest needs for things - * to work. They're pretty simple. - */ - -/* - * The Guest needs to tell the Host what stack it expects traps to use. For - * native hardware, this is part of the Task State Segment mentioned above in - * lguest_load_tr_desc(), but to help hypervisors there's this special call. - * - * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data - * segment), the privilege level (we're privilege level 1, the Host is 0 and - * will not tolerate us trying to use that), the stack pointer, and the number - * of pages in the stack. - */ -static void lguest_load_sp0(struct tss_struct *tss, - struct thread_struct *thread) -{ - lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0, - THREAD_SIZE / PAGE_SIZE); - tss->x86_tss.sp0 = thread->sp0; -} - -/* Let's just say, I wouldn't do debugging under a Guest. 
*/ -static unsigned long lguest_get_debugreg(int regno) -{ - /* FIXME: Implement */ - return 0; -} - -static void lguest_set_debugreg(int regno, unsigned long value) -{ - /* FIXME: Implement */ -} - -/* - * There are times when the kernel wants to make sure that no memory writes are - * caught in the cache (that they've all reached real hardware devices). This - * doesn't matter for the Guest which has virtual hardware. - * - * On the Pentium 4 and above, cpuid() indicates that the Cache Line Flush - * (clflush) instruction is available and the kernel uses that. Otherwise, it - * uses the older "Write Back and Invalidate Cache" (wbinvd) instruction. - * Unlike clflush, wbinvd can only be run at privilege level 0. So we can - * ignore clflush, but replace wbinvd. - */ -static void lguest_wbinvd(void) -{ -} - -/* - * If the Guest expects to have an Advanced Programmable Interrupt Controller, - * we play dumb by ignoring writes and returning 0 for reads. So it's no - * longer Programmable nor Controlling anything, and I don't think 8 lines of - * code qualifies for Advanced. It will also never interrupt anything. It - * does, however, allow us to get through the Linux boot code. - */ -#ifdef CONFIG_X86_LOCAL_APIC -static void lguest_apic_write(u32 reg, u32 v) -{ -} - -static u32 lguest_apic_read(u32 reg) -{ - return 0; -} - -static u64 lguest_apic_icr_read(void) -{ - return 0; -} - -static void lguest_apic_icr_write(u32 low, u32 id) -{ - /* Warn to see if there's any stray references */ - WARN_ON(1); -} - -static void lguest_apic_wait_icr_idle(void) -{ - return; -} - -static u32 lguest_apic_safe_wait_icr_idle(void) -{ - return 0; -} - -static void set_lguest_basic_apic_ops(void) -{ - apic->read = lguest_apic_read; - apic->write = lguest_apic_write; - apic->icr_read = lguest_apic_icr_read; - apic->icr_write = lguest_apic_icr_write; - apic->wait_icr_idle = lguest_apic_wait_icr_idle; - apic->safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle; -}; -#endif - -/* STOP! Until an interrupt comes in. */ -static void lguest_safe_halt(void) -{ - hcall(LHCALL_HALT, 0, 0, 0, 0); -} - -/* - * The SHUTDOWN hypercall takes a string to describe what's happening, and - * an argument which says whether this to restart (reboot) the Guest or not. - * - * Note that the Host always prefers that the Guest speak in physical addresses - * rather than virtual addresses, so we use __pa() here. - */ -static void lguest_power_off(void) -{ - hcall(LHCALL_SHUTDOWN, __pa("Power down"), - LGUEST_SHUTDOWN_POWEROFF, 0, 0); -} - -/* - * Panicing. - * - * Don't. But if you did, this is what happens. - */ -static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p) -{ - hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0, 0); - /* The hcall won't return, but to keep gcc happy, we're "done". */ - return NOTIFY_DONE; -} - -static struct notifier_block paniced = { - .notifier_call = lguest_panic -}; - -/* Setting up memory is fairly easy. */ -static __init char *lguest_memory_setup(void) -{ - /* - * The Linux bootloader header contains an "e820" memory map: the - * Launcher populated the first entry with our memory limit. - */ - e820__range_add(boot_params.e820_table[0].addr, - boot_params.e820_table[0].size, - boot_params.e820_table[0].type); - - /* This string is for the boot messages. */ - return "LGUEST"; -} - -/* Offset within PCI config space of BAR access capability. 
*/ -static int console_cfg_offset = 0; -static int console_access_cap; - -/* Set up so that we access off in bar0 (on bus 0, device 1, function 0) */ -static void set_cfg_window(u32 cfg_offset, u32 off) -{ - write_pci_config_byte(0, 1, 0, - cfg_offset + offsetof(struct virtio_pci_cap, bar), - 0); - write_pci_config(0, 1, 0, - cfg_offset + offsetof(struct virtio_pci_cap, length), - 4); - write_pci_config(0, 1, 0, - cfg_offset + offsetof(struct virtio_pci_cap, offset), - off); -} - -static void write_bar_via_cfg(u32 cfg_offset, u32 off, u32 val) -{ - /* - * We could set this up once, then leave it; nothing else in the * - * kernel should touch these registers. But if it went wrong, that - * would be a horrible bug to find. - */ - set_cfg_window(cfg_offset, off); - write_pci_config(0, 1, 0, - cfg_offset + sizeof(struct virtio_pci_cap), val); -} - -static void probe_pci_console(void) -{ - u8 cap, common_cap = 0, device_cap = 0; - u32 device_len; - - /* Avoid recursive printk into here. */ - console_cfg_offset = -1; - - if (!early_pci_allowed()) { - printk(KERN_ERR "lguest: early PCI access not allowed!\n"); - return; - } - - /* We expect a console PCI device at BUS0, slot 1. */ - if (read_pci_config(0, 1, 0, 0) != 0x10431AF4) { - printk(KERN_ERR "lguest: PCI device is %#x!\n", - read_pci_config(0, 1, 0, 0)); - return; - } - - /* Find the capabilities we need (must be in bar0) */ - cap = read_pci_config_byte(0, 1, 0, PCI_CAPABILITY_LIST); - while (cap) { - u8 vndr = read_pci_config_byte(0, 1, 0, cap); - if (vndr == PCI_CAP_ID_VNDR) { - u8 type, bar; - - type = read_pci_config_byte(0, 1, 0, - cap + offsetof(struct virtio_pci_cap, cfg_type)); - bar = read_pci_config_byte(0, 1, 0, - cap + offsetof(struct virtio_pci_cap, bar)); - - switch (type) { - case VIRTIO_PCI_CAP_DEVICE_CFG: - if (bar == 0) - device_cap = cap; - break; - case VIRTIO_PCI_CAP_PCI_CFG: - console_access_cap = cap; - break; - } - } - cap = read_pci_config_byte(0, 1, 0, cap + PCI_CAP_LIST_NEXT); - } - if (!device_cap || !console_access_cap) { - printk(KERN_ERR "lguest: No caps (%u/%u/%u) in console!\n", - common_cap, device_cap, console_access_cap); - return; - } - - /* - * Note that we can't check features, until we've set the DRIVER - * status bit. We don't want to do that until we have a real driver, - * so we just check that the device-specific config has room for - * emerg_wr. If it doesn't support VIRTIO_CONSOLE_F_EMERG_WRITE - * it should ignore the access. - */ - device_len = read_pci_config(0, 1, 0, - device_cap + offsetof(struct virtio_pci_cap, length)); - if (device_len < (offsetof(struct virtio_console_config, emerg_wr) - + sizeof(u32))) { - printk(KERN_ERR "lguest: console missing emerg_wr field\n"); - return; - } - - console_cfg_offset = read_pci_config(0, 1, 0, - device_cap + offsetof(struct virtio_pci_cap, offset)); - printk(KERN_INFO "lguest: Console via virtio-pci emerg_wr\n"); -} - -/* - * We will eventually use the virtio console device to produce console output, - * but before that is set up we use the virtio PCI console's backdoor mmio - * access and the "emergency" write facility (which is legal even before the - * device is configured). - */ -static __init int early_put_chars(u32 vtermno, const char *buf, int count) -{ - /* If we couldn't find PCI console, forget it. 
*/ - if (console_cfg_offset < 0) - return count; - - if (unlikely(!console_cfg_offset)) { - probe_pci_console(); - if (console_cfg_offset < 0) - return count; - } - - write_bar_via_cfg(console_access_cap, - console_cfg_offset - + offsetof(struct virtio_console_config, emerg_wr), - buf[0]); - return 1; -} - -/* - * Rebooting also tells the Host we're finished, but the RESTART flag tells the - * Launcher to reboot us. - */ -static void lguest_restart(char *reason) -{ - hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0, 0); -} - -/*G:050 - * Patching (Powerfully Placating Performance Pedants) - * - * We have already seen that pv_ops structures let us replace simple native - * instructions with calls to the appropriate back end all throughout the - * kernel. This allows the same kernel to run as a Guest and as a native - * kernel, but it's slow because of all the indirect branches. - * - * Remember that David Wheeler quote about "Any problem in computer science can - * be solved with another layer of indirection"? The rest of that quote is - * "... But that usually will create another problem." This is the first of - * those problems. - * - * Our current solution is to allow the paravirt back end to optionally patch - * over the indirect calls to replace them with something more efficient. We - * patch two of the simplest of the most commonly called functions: disable - * interrupts and save interrupts. We usually have 6 or 10 bytes to patch - * into: the Guest versions of these operations are small enough that we can - * fit comfortably. - * - * First we need assembly templates of each of the patchable Guest operations, - * and these are in head_32.S. - */ - -/*G:060 We construct a table from the assembler templates: */ -static const struct lguest_insns -{ - const char *start, *end; -} lguest_insns[] = { - [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli }, - [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, -}; - -/* - * Now our patch routine is fairly simple (based on the native one in - * paravirt.c). If we have a replacement, we copy it in and return how much of - * the available space we used. - */ -static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, - unsigned long addr, unsigned len) -{ - unsigned int insn_len; - - /* Don't do anything special if we don't have a replacement */ - if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start) - return paravirt_patch_default(type, clobber, ibuf, addr, len); - - insn_len = lguest_insns[type].end - lguest_insns[type].start; - - /* Similarly if it can't fit (doesn't happen, but let's be thorough). */ - if (len < insn_len) - return paravirt_patch_default(type, clobber, ibuf, addr, len); - - /* Copy in our instructions. */ - memcpy(ibuf, lguest_insns[type].start, insn_len); - return insn_len; -} - -/*G:029 - * Once we get to lguest_init(), we know we're a Guest. The various - * pv_ops structures in the kernel provide points for (almost) every routine we - * have to override to avoid privileged instructions. - */ -__init void lguest_init(void) -{ - /* We're under lguest. */ - pv_info.name = "lguest"; - /* We're running at privilege level 1, not 0 as normal. */ - pv_info.kernel_rpl = 1; - /* Everyone except Xen runs with this set. */ - pv_info.shared_kernel_pmd = 1; - - /* - * We set up all the lguest overrides for sensitive operations. These - * are detailed with the operations themselves. 
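Before the pv_ops assignments below, one more aside on lguest_patch() above: the whole scheme is "copy a byte template over the call site if one exists and fits, otherwise keep the indirect call". A hypothetical standalone sketch of that decision; the template bytes here are made up and patch_site() is not a real kernel function.

#include <stdio.h>
#include <string.h>

struct insn_template {
	const unsigned char *bytes;
	unsigned int len;
};

/* Made-up bytes standing in for the lgstart_cli/lgend_cli template range. */
static const unsigned char tmpl_cli[] = {
	0xc7, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};

static const struct insn_template templates[] = {
	{ tmpl_cli, sizeof(tmpl_cli) },
};

/* Copy the template over the call site if we have one and it fits. */
static unsigned int patch_site(unsigned int type, void *site, unsigned int room)
{
	if (type >= sizeof(templates) / sizeof(templates[0]) ||
	    !templates[type].bytes || templates[type].len > room)
		return 0;	/* caller keeps the slower indirect call */

	memcpy(site, templates[type].bytes, templates[type].len);
	return templates[type].len;
}

int main(void)
{
	unsigned char site[10];
	unsigned int used = patch_site(0, site, sizeof(site));

	printf("patched %u of %zu bytes\n", used, sizeof(site));
	return 0;
}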
- */ - - /* Interrupt-related operations */ - pv_irq_ops.save_fl = PV_CALLEE_SAVE(lguest_save_fl); - pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); - pv_irq_ops.irq_disable = PV_CALLEE_SAVE(lguest_irq_disable); - pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); - pv_irq_ops.safe_halt = lguest_safe_halt; - - /* Setup operations */ - pv_init_ops.patch = lguest_patch; - - /* Intercepts of various CPU instructions */ - pv_cpu_ops.load_gdt = lguest_load_gdt; - pv_cpu_ops.cpuid = lguest_cpuid; - pv_cpu_ops.load_idt = lguest_load_idt; - pv_cpu_ops.iret = lguest_iret; - pv_cpu_ops.load_sp0 = lguest_load_sp0; - pv_cpu_ops.load_tr_desc = lguest_load_tr_desc; - pv_cpu_ops.set_ldt = lguest_set_ldt; - pv_cpu_ops.load_tls = lguest_load_tls; - pv_cpu_ops.get_debugreg = lguest_get_debugreg; - pv_cpu_ops.set_debugreg = lguest_set_debugreg; - pv_cpu_ops.read_cr0 = lguest_read_cr0; - pv_cpu_ops.write_cr0 = lguest_write_cr0; - pv_cpu_ops.read_cr4 = lguest_read_cr4; - pv_cpu_ops.write_cr4 = lguest_write_cr4; - pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry; - pv_cpu_ops.write_idt_entry = lguest_write_idt_entry; - pv_cpu_ops.wbinvd = lguest_wbinvd; - pv_cpu_ops.start_context_switch = paravirt_start_context_switch; - pv_cpu_ops.end_context_switch = lguest_end_context_switch; - - /* Pagetable management */ - pv_mmu_ops.write_cr3 = lguest_write_cr3; - pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user; - pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single; - pv_mmu_ops.flush_tlb_kernel = lguest_flush_tlb_kernel; - pv_mmu_ops.set_pte = lguest_set_pte; - pv_mmu_ops.set_pte_at = lguest_set_pte_at; - pv_mmu_ops.set_pmd = lguest_set_pmd; -#ifdef CONFIG_X86_PAE - pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic; - pv_mmu_ops.pte_clear = lguest_pte_clear; - pv_mmu_ops.pmd_clear = lguest_pmd_clear; - pv_mmu_ops.set_pud = lguest_set_pud; -#endif - pv_mmu_ops.read_cr2 = lguest_read_cr2; - pv_mmu_ops.read_cr3 = lguest_read_cr3; - pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; - pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode; - pv_mmu_ops.lazy_mode.flush = paravirt_flush_lazy_mmu; - pv_mmu_ops.pte_update = lguest_pte_update; - -#ifdef CONFIG_X86_LOCAL_APIC - /* APIC read/write intercepts */ - set_lguest_basic_apic_ops(); -#endif - - x86_init.resources.memory_setup = lguest_memory_setup; - x86_init.irqs.intr_init = lguest_init_IRQ; - x86_init.timers.timer_init = lguest_time_init; - x86_platform.calibrate_tsc = lguest_tsc_khz; - x86_platform.get_wallclock = lguest_get_wallclock; - - /* - * Now is a good time to look at the implementations of these functions - * before returning to the rest of lguest_init(). - */ - - /*G:070 - * Now we've seen all the paravirt_ops, we return to - * lguest_init() where the rest of the fairly chaotic boot setup - * occurs. - */ - - /* - * The stack protector is a weird thing where gcc places a canary - * value on the stack and then checks it on return. This file is - * compiled with -fno-stack-protector it, so we got this far without - * problems. The value of the canary is kept at offset 20 from the - * %gs register, so we need to set that up before calling C functions - * in other files. - */ - setup_stack_canary_segment(0); - - /* - * We could just call load_stack_canary_segment(), but we might as well - * call switch_to_new_gdt() which loads the whole table and sets up the - * per-cpu segment descriptor register %fs as well. 
- */ - switch_to_new_gdt(0); - - /* - * The Host<->Guest Switcher lives at the top of our address space, and - * the Host told us how big it is when we made LGUEST_INIT hypercall: - * it put the answer in lguest_data.reserve_mem - */ - reserve_top_address(lguest_data.reserve_mem); - - /* Hook in our special panic hypercall code. */ - atomic_notifier_chain_register(&panic_notifier_list, &paniced); - - /* - * This is messy CPU setup stuff which the native boot code does before - * start_kernel, so we have to do, too: - */ - cpu_detect(&new_cpu_data); - /* head.S usually sets up the first capability word, so do it here. */ - new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1); - - /* Math is always hard! */ - set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU); - - /* We don't have features. We have puppies! Puppies! */ -#ifdef CONFIG_X86_MCE - mca_cfg.disabled = true; -#endif -#ifdef CONFIG_ACPI - acpi_disabled = 1; -#endif - - /* - * We set the preferred console to "hvc". This is the "hypervisor - * virtual console" driver written by the PowerPC people, which we also - * adapted for lguest's use. - */ - add_preferred_console("hvc", 0, NULL); - - /* Register our very early console. */ - virtio_cons_early_init(early_put_chars); - - /* Don't let ACPI try to control our PCI interrupts. */ - disable_acpi(); - - /* We control them ourselves, by overriding these two hooks. */ - pcibios_enable_irq = lguest_enable_irq; - pcibios_disable_irq = lguest_disable_irq; - - /* - * Last of all, we set the power management poweroff hook to point to - * the Guest routine to power off, and the reboot hook to our restart - * routine. - */ - pm_power_off = lguest_power_off; - machine_ops.restart = lguest_restart; - - /* - * Now we're set up, call i386_start_kernel() in head32.c and we proceed - * to boot as normal. It never returns. - */ - i386_start_kernel(); -} -/* - * This marks the end of stage II of our journey, The Guest. - * - * It is now time for us to explore the layer of virtual drivers and complete - * our understanding of the Guest in "make Drivers". - */ diff --git a/arch/x86/lguest/head_32.S b/arch/x86/lguest/head_32.S deleted file mode 100644 index d5ae63f5ec5d..000000000000 --- a/arch/x86/lguest/head_32.S +++ /dev/null @@ -1,192 +0,0 @@ -#include <linux/linkage.h> -#include <linux/lguest.h> -#include <asm/lguest_hcall.h> -#include <asm/asm-offsets.h> -#include <asm/thread_info.h> -#include <asm/processor-flags.h> - -/*G:020 - - * Our story starts with the bzImage: booting starts at startup_32 in - * arch/x86/boot/compressed/head_32.S. This merely uncompresses the real - * kernel in place and then jumps into it: startup_32 in - * arch/x86/kernel/head_32.S. Both routines expects a boot header in the %esi - * register, which is created by the bootloader (the Launcher in our case). - * - * The startup_32 function does very little: it clears the uninitialized global - * C variables which we expect to be zero (ie. BSS) and then copies the boot - * header and kernel command line somewhere safe, and populates some initial - * page tables. Finally it checks the 'hardware_subarch' field. This was - * introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's - * assigned number), then it calls us here. - * - * WARNING: be very careful here! We're running at addresses equal to physical - * addresses (around 0), not above PAGE_OFFSET as most code expects - * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any - * data without remembering to subtract __PAGE_OFFSET! 
- * - * The .section line puts this code in .init.text so it will be discarded after - * boot. - */ -.section .init.text, "ax", @progbits -ENTRY(lguest_entry) - /* - * We make the "initialization" hypercall now to tell the Host where - * our lguest_data struct is. - */ - movl $LHCALL_LGUEST_INIT, %eax - movl $lguest_data - __PAGE_OFFSET, %ebx - int $LGUEST_TRAP_ENTRY - - /* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. */ - movl $LHCALL_NEW_PGTABLE, %eax - movl $(initial_page_table - __PAGE_OFFSET), %ebx - int $LGUEST_TRAP_ENTRY - - /* Set up the initial stack so we can run C code. */ - movl $(init_thread_union+THREAD_SIZE),%esp - - /* Jumps are relative: we're running __PAGE_OFFSET too low. */ - jmp lguest_init+__PAGE_OFFSET - -/*G:055 - * We create a macro which puts the assembler code between lgstart_ and lgend_ - * markers. These templates are put in the .text section: they can't be - * discarded after boot as we may need to patch modules, too. - */ -.text -#define LGUEST_PATCH(name, insns...) \ - lgstart_##name: insns; lgend_##name:; \ - .globl lgstart_##name; .globl lgend_##name - -LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) -LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) - -/*G:033 - * But using those wrappers is inefficient (we'll see why that doesn't matter - * for save_fl and irq_disable later). If we write our routines carefully in - * assembler, we can avoid clobbering any registers and avoid jumping through - * the wrapper functions. - * - * I skipped over our first piece of assembler, but this one is worth studying - * in a bit more detail so I'll describe in easy stages. First, the routine to - * enable interrupts: - */ -ENTRY(lg_irq_enable) - /* - * The reverse of irq_disable, this sets lguest_data.irq_enabled to - * X86_EFLAGS_IF (ie. "Interrupts enabled"). - */ - movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled - /* - * But now we need to check if the Host wants to know: there might have - * been interrupts waiting to be delivered, in which case it will have - * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we - * jump to send_interrupts, otherwise we're done. - */ - cmpl $0, lguest_data+LGUEST_DATA_irq_pending - jnz send_interrupts - /* - * One cool thing about x86 is that you can do many things without using - * a register. In this case, the normal path hasn't needed to save or - * restore any registers at all! - */ - ret -send_interrupts: - /* - * OK, now we need a register: eax is used for the hypercall number, - * which is LHCALL_SEND_INTERRUPTS. - * - * We used not to bother with this pending detection at all, which was - * much simpler. Sooner or later the Host would realize it had to - * send us an interrupt. But that turns out to make performance 7 - * times worse on a simple tcp benchmark. So now we do this the hard - * way. - */ - pushl %eax - movl $LHCALL_SEND_INTERRUPTS, %eax - /* This is the actual hypercall trap. */ - int $LGUEST_TRAP_ENTRY - /* Put eax back the way we found it. */ - popl %eax - ret - -/* - * Finally, the "popf" or "restore flags" routine. The %eax register holds the - * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're - * enabling interrupts again, if it's 0 we're leaving them off. 
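lg_irq_enable above and lg_restore_fl below share one soft-disable pattern: store the flag, then pay for a hypercall only if the Host already left an interrupt pending. A hedged C rendition of that logic (the real routines are assembly precisely so they can avoid clobbering registers, which plain C cannot promise):

#include <stdio.h>

#define X86_EFLAGS_IF 0x200

/* Stand-ins for the struct lguest_data fields used by the assembly. */
static volatile unsigned long irq_enabled;
static volatile unsigned long irq_pending;

static void hcall_send_interrupts(void)
{
	printf("LHCALL_SEND_INTERRUPTS\n");	/* the slow path */
	irq_pending = 0;
}

/* lg_irq_enable: enable, then ask the Host to deliver anything pending. */
static void irq_enable(void)
{
	irq_enabled = X86_EFLAGS_IF;
	if (irq_pending)
		hcall_send_interrupts();
}

/* lg_restore_fl: only hypercall if we are enabling *and* something pends. */
static void restore_fl(unsigned long flags)
{
	irq_enabled = flags;
	if (flags & irq_pending)	/* both are X86_EFLAGS_IF or 0 */
		hcall_send_interrupts();
}

int main(void)
{
	irq_pending = X86_EFLAGS_IF;	/* pretend the Host queued something */
	restore_fl(0);			/* still disabled: no hypercall */
	irq_enable();			/* enabled: the hypercall fires once */
	return 0;
}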
- */ -ENTRY(lg_restore_fl) - /* This is just "lguest_data.irq_enabled = flags;" */ - movl %eax, lguest_data+LGUEST_DATA_irq_enabled - /* - * Now, if the %eax value has enabled interrupts and - * lguest_data.irq_pending is set, we want to tell the Host so it can - * deliver any outstanding interrupts. Fortunately, both values will - * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl" - * instruction will AND them together for us. If both are set, we - * jump to send_interrupts. - */ - testl lguest_data+LGUEST_DATA_irq_pending, %eax - jnz send_interrupts - /* Again, the normal path has used no extra registers. Clever, huh? */ - ret -/*:*/ - -/* These demark the EIP where host should never deliver interrupts. */ -.global lguest_noirq_iret - -/*M:004 - * When the Host reflects a trap or injects an interrupt into the Guest, it - * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled, - * so the Guest iret logic does the right thing when restoring it. However, - * when the Host sets the Guest up for direct traps, such as system calls, the - * processor is the one to push eflags onto the stack, and the interrupt bit - * will be 1 (in reality, interrupts are always enabled in the Guest). - * - * This turns out to be harmless: the only trap which should happen under Linux - * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc - * regions), which has to be reflected through the Host anyway. If another - * trap *does* go off when interrupts are disabled, the Guest will panic, and - * we'll never get to this iret! -:*/ - -/*G:045 - * There is one final paravirt_op that the Guest implements, and glancing at it - * you can see why I left it to last. It's *cool*! It's in *assembler*! - * - * The "iret" instruction is used to return from an interrupt or trap. The - * stack looks like this: - * old address - * old code segment & privilege level - * old processor flags ("eflags") - * - * The "iret" instruction pops those values off the stack and restores them all - * at once. The only problem is that eflags includes the Interrupt Flag which - * the Guest can't change: the CPU will simply ignore it when we do an "iret". - * So we have to copy eflags from the stack to lguest_data.irq_enabled before - * we do the "iret". - * - * There are two problems with this: firstly, we can't clobber any registers - * and secondly, the whole thing needs to be atomic. The first problem - * is solved by using "push memory"/"pop memory" instruction pair for copying. - * - * The second is harder: copying eflags to lguest_data.irq_enabled will turn - * interrupts on before we're finished, so we could be interrupted before we - * return to userspace or wherever. Our solution to this is to tell the - * Host that it is *never* to interrupt us there, even if interrupts seem to be - * enabled. (It's not necessary to protect pop instruction, since - * data gets updated only after it completes, so we only need to protect - * one instruction, iret). - */ -ENTRY(lguest_iret) - pushl 2*4(%esp) - /* - * Note the %ss: segment prefix here. Normal data accesses use the - * "ds" segment, but that will have already been restored for whatever - * we're returning to (such as userspace): we can't trust it. The %ss: - * prefix makes sure we use the stack segment, which is still valid. 
- */ - popl %ss:lguest_data+LGUEST_DATA_irq_enabled -lguest_noirq_iret: - iret diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 34a74131a12c..457f681ef379 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for x86 specific library files. # diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c index 216a629a4a1a..1811fa4a1b1a 100644 --- a/arch/x86/lib/cache-smp.c +++ b/arch/x86/lib/cache-smp.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/smp.h> #include <linux/export.h> diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c index 5cc78bf57232..3261abb21ef4 100644 --- a/arch/x86/lib/cmdline.c +++ b/arch/x86/lib/cmdline.c @@ -104,7 +104,112 @@ __cmdline_find_option_bool(const char *cmdline, int max_cmdline_size, return 0; /* Buffer overrun */ } +/* + * Find a non-boolean option (i.e. option=argument). In accordance with + * standard Linux practice, if this option is repeated, this returns the + * last instance on the command line. + * + * @cmdline: the cmdline string + * @max_cmdline_size: the maximum size of cmdline + * @option: option string to look for + * @buffer: memory buffer to return the option argument + * @bufsize: size of the supplied memory buffer + * + * Returns the length of the argument (regardless of if it was + * truncated to fit in the buffer), or -1 on not found. + */ +static int +__cmdline_find_option(const char *cmdline, int max_cmdline_size, + const char *option, char *buffer, int bufsize) +{ + char c; + int pos = 0, len = -1; + const char *opptr = NULL; + char *bufptr = buffer; + enum { + st_wordstart = 0, /* Start of word/after whitespace */ + st_wordcmp, /* Comparing this word */ + st_wordskip, /* Miscompare, skip */ + st_bufcpy, /* Copying this to buffer */ + } state = st_wordstart; + + if (!cmdline) + return -1; /* No command line */ + + /* + * This 'pos' check ensures we do not overrun + * a non-NULL-terminated 'cmdline' + */ + while (pos++ < max_cmdline_size) { + c = *(char *)cmdline++; + if (!c) + break; + + switch (state) { + case st_wordstart: + if (myisspace(c)) + break; + + state = st_wordcmp; + opptr = option; + /* fall through */ + + case st_wordcmp: + if ((c == '=') && !*opptr) { + /* + * We matched all the way to the end of the + * option we were looking for, prepare to + * copy the argument. + */ + len = 0; + bufptr = buffer; + state = st_bufcpy; + break; + } else if (c == *opptr++) { + /* + * We are currently matching, so continue + * to the next character on the cmdline. + */ + break; + } + state = st_wordskip; + /* fall through */ + + case st_wordskip: + if (myisspace(c)) + state = st_wordstart; + break; + + case st_bufcpy: + if (myisspace(c)) { + state = st_wordstart; + } else { + /* + * Increment len, but don't overrun the + * supplied buffer and leave room for the + * NULL terminator. 
+ */ + if (++len < bufsize) + *bufptr++ = c; + } + break; + } + } + + if (bufsize) + *bufptr = '\0'; + + return len; +} + int cmdline_find_option_bool(const char *cmdline, const char *option) { return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option); } + +int cmdline_find_option(const char *cmdline, const char *option, char *buffer, + int bufsize) +{ + return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option, + buffer, bufsize); +} diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index e8508156c99d..fd2d09afa097 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */ #include <linux/linkage.h> diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index c5959576c315..020f75cc8cf6 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -37,7 +37,7 @@ ENTRY(copy_user_generic_unrolled) movl %edx,%ecx andl $63,%edx shrl $6,%ecx - jz 17f + jz .L_copy_short_string 1: movq (%rsi),%r8 2: movq 1*8(%rsi),%r9 3: movq 2*8(%rsi),%r10 @@ -58,7 +58,8 @@ ENTRY(copy_user_generic_unrolled) leaq 64(%rdi),%rdi decl %ecx jnz 1b -17: movl %edx,%ecx +.L_copy_short_string: + movl %edx,%ecx andl $7,%edx shrl $3,%ecx jz 20f @@ -174,6 +175,8 @@ EXPORT_SYMBOL(copy_user_generic_string) */ ENTRY(copy_user_enhanced_fast_string) ASM_STAC + cmpl $64,%edx + jb .L_copy_short_string /* less than 64 bytes, avoid the costly 'rep' */ movl %edx,%ecx 1: rep movsb diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c index 378e5d5bf9b1..9baca3e054be 100644 --- a/arch/x86/lib/csum-partial_64.c +++ b/arch/x86/lib/csum-partial_64.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * arch/x86_64/lib/csum-partial.c * diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 29df077cb089..553f8fd23cc4 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Precise Delay Loops for i386 * diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index 37b62d412148..c97d935a29e8 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * __get_user functions. * diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S index 23d893cbc200..a14f9939c365 100644 --- a/arch/x86/lib/hweight.S +++ b/arch/x86/lib/hweight.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <linux/linkage.h> #include <asm/export.h> diff --git a/arch/x86/lib/kaslr.c b/arch/x86/lib/kaslr.c index ab2d1d73e9e7..79778ab200e4 100644 --- a/arch/x86/lib/kaslr.c +++ b/arch/x86/lib/kaslr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Entropy functions used on early boot for KASLR base and memory * randomization.
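For illustration, a hedged usage sketch of the cmdline_find_option() helper added above. The caller name and the example command line are hypothetical, not part of the patch, and the declaration is assumed to be visible via asm/cmdline.h. Per the parser's comment, the last instance of a repeated option wins, and the return value is the full argument length even when the copy into the buffer had to be truncated:

#include <linux/kernel.h>
#include <asm/cmdline.h>

/*
 * With cmdline = "quiet root=/dev/sda1 root=/dev/sdb2", buf ends up holding
 * "/dev/sdb2" (the last instance wins) and the return value is 9.
 */
static void example_parse_root(const char *cmdline)
{
        char buf[32];
        int len = cmdline_find_option(cmdline, "root", buf, sizeof(buf));

        if (len < 0)
                return;         /* option not present on the command line */

        if (len >= (int)sizeof(buf))
                pr_warn("root= argument truncated to %zu bytes\n",
                        sizeof(buf) - 1);
        else
                pr_info("root=%s (len %d)\n", buf, len);
}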
The base randomization is done in the compressed diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c index cad12634d6bd..e565d1c9019e 100644 --- a/arch/x86/lib/memcpy_32.c +++ b/arch/x86/lib/memcpy_32.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/string.h> #include <linux/export.h> @@ -6,7 +7,7 @@ __visible void *memcpy(void *to, const void *from, size_t n) { -#ifdef CONFIG_X86_USE_3DNOW +#if defined(CONFIG_X86_USE_3DNOW) && !defined(CONFIG_FORTIFY_SOURCE) return __memcpy3d(to, from, n); #else return __memcpy(to, from, n); diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index 15de86cd15b0..bbec69d8223b 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Normally compiler builtins are used, but sometimes the compiler calls out * of line code. Based on asm-i386/string.h. diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 55b95db30a61..9bc861c71e75 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Copyright 2002 Andi Kleen, SuSE Labs */ #include <linux/linkage.h> diff --git a/arch/x86/lib/misc.c b/arch/x86/lib/misc.c index 76b373af03f0..a018ec4fba53 100644 --- a/arch/x86/lib/misc.c +++ b/arch/x86/lib/misc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Count the digits of @val including a possible sign. * diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c index c2311a678332..4321fa02e18d 100644 --- a/arch/x86/lib/mmx_32.c +++ b/arch/x86/lib/mmx_32.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * MMX 3DNow! library helper functions * diff --git a/arch/x86/lib/msr-reg-export.c b/arch/x86/lib/msr-reg-export.c index ff29e8d39414..876b4168ab0a 100644 --- a/arch/x86/lib/msr-reg-export.c +++ b/arch/x86/lib/msr-reg-export.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/export.h> #include <asm/msr.h> diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S index c81556409bbb..ed33cbab3958 100644 --- a/arch/x86/lib/msr-reg.S +++ b/arch/x86/lib/msr-reg.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <linux/linkage.h> #include <linux/errno.h> #include <asm/asm.h> @@ -13,14 +14,14 @@ .macro op_safe_regs op ENTRY(\op\()_safe_regs) pushq %rbx - pushq %rbp + pushq %r12 movq %rdi, %r10 /* Save pointer */ xorl %r11d, %r11d /* Return value */ movl (%rdi), %eax movl 4(%rdi), %ecx movl 8(%rdi), %edx movl 12(%rdi), %ebx - movl 20(%rdi), %ebp + movl 20(%rdi), %r12d movl 24(%rdi), %esi movl 28(%rdi), %edi 1: \op @@ -29,10 +30,10 @@ ENTRY(\op\()_safe_regs) movl %ecx, 4(%r10) movl %edx, 8(%r10) movl %ebx, 12(%r10) - movl %ebp, 20(%r10) + movl %r12d, 20(%r10) movl %esi, 24(%r10) movl %edi, 28(%r10) - popq %rbp + popq %r12 popq %rbx ret 3: diff --git a/arch/x86/lib/msr-smp.c b/arch/x86/lib/msr-smp.c index ce68b6a9d7d1..693cce0be82d 100644 --- a/arch/x86/lib/msr-smp.c +++ b/arch/x86/lib/msr-smp.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/export.h> #include <linux/preempt.h> #include <linux/smp.h> diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c index 07764255b611..3bd905e10ee2 100644 --- a/arch/x86/lib/msr.c +++ b/arch/x86/lib/msr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/export.h> #include <linux/percpu.h> #include <linux/preempt.h> diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index cd5d716d2897..96dce5fe2a35 100644 --- 
a/arch/x86/lib/putuser.S +++ b/arch/x86/lib/putuser.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * __put_user functions. * diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c index dc0ad12f80bb..d15fdae9656e 100644 --- a/arch/x86/lib/string_32.c +++ b/arch/x86/lib/string_32.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Most of the string-functions are rather heavily hand-optimized, * see especially strsep,strstr,str[c]spn. They should work, but are not diff --git a/arch/x86/lib/strstr_32.c b/arch/x86/lib/strstr_32.c index a03b1c750bfe..38f37df056f7 100644 --- a/arch/x86/lib/strstr_32.c +++ b/arch/x86/lib/strstr_32.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/string.h> #include <linux/export.h> diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index bd057a4ffe6e..1b377f734e64 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * User address space access functions. * The non inlined parts of asm-i386/uaccess.h are here. diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 3b7c40a2e3e1..75d3776123cc 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -7,6 +7,7 @@ */ #include <linux/export.h> #include <linux/uaccess.h> +#include <linux/highmem.h> /* * Zero Userspace @@ -73,3 +74,136 @@ copy_user_handle_tail(char *to, char *from, unsigned len) clac(); return len; } + +#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE +/** + * clean_cache_range - write back a cache range with CLWB + * @vaddr: virtual start address + * @size: number of bytes to write back + * + * Write back a cache range using the CLWB (cache line write back) + * instruction. Note that @size is internally rounded up to be cache + * line size aligned. + */ +static void clean_cache_range(void *addr, size_t size) +{ + u16 x86_clflush_size = boot_cpu_data.x86_clflush_size; + unsigned long clflush_mask = x86_clflush_size - 1; + void *vend = addr + size; + void *p; + + for (p = (void *)((unsigned long)addr & ~clflush_mask); + p < vend; p += x86_clflush_size) + clwb(p); +} + +void arch_wb_cache_pmem(void *addr, size_t size) +{ + clean_cache_range(addr, size); +} +EXPORT_SYMBOL_GPL(arch_wb_cache_pmem); + +long __copy_user_flushcache(void *dst, const void __user *src, unsigned size) +{ + unsigned long flushed, dest = (unsigned long) dst; + long rc = __copy_user_nocache(dst, src, size, 0); + + /* + * __copy_user_nocache() uses non-temporal stores for the bulk + * of the transfer, but we need to manually flush if the + * transfer is unaligned. A cached memory copy is used when + * destination or size is not naturally aligned. That is: + * - Require 8-byte alignment when size is 8 bytes or larger. + * - Require 4-byte alignment when size is 4 bytes. 
+ */ + if (size < 8) { + if (!IS_ALIGNED(dest, 4) || size != 4) + clean_cache_range(dst, 1); + } else { + if (!IS_ALIGNED(dest, 8)) { + dest = ALIGN(dest, boot_cpu_data.x86_clflush_size); + clean_cache_range(dst, 1); + } + + flushed = dest - (unsigned long) dst; + if (size > flushed && !IS_ALIGNED(size - flushed, 8)) + clean_cache_range(dst + size - 1, 1); + } + + return rc; +} + +void memcpy_flushcache(void *_dst, const void *_src, size_t size) +{ + unsigned long dest = (unsigned long) _dst; + unsigned long source = (unsigned long) _src; + + /* cache copy and flush to align dest */ + if (!IS_ALIGNED(dest, 8)) { + unsigned len = min_t(unsigned, size, ALIGN(dest, 8) - dest); + + memcpy((void *) dest, (void *) source, len); + clean_cache_range((void *) dest, len); + dest += len; + source += len; + size -= len; + if (!size) + return; + } + + /* 4x8 movnti loop */ + while (size >= 32) { + asm("movq (%0), %%r8\n" + "movq 8(%0), %%r9\n" + "movq 16(%0), %%r10\n" + "movq 24(%0), %%r11\n" + "movnti %%r8, (%1)\n" + "movnti %%r9, 8(%1)\n" + "movnti %%r10, 16(%1)\n" + "movnti %%r11, 24(%1)\n" + :: "r" (source), "r" (dest) + : "memory", "r8", "r9", "r10", "r11"); + dest += 32; + source += 32; + size -= 32; + } + + /* 1x8 movnti loop */ + while (size >= 8) { + asm("movq (%0), %%r8\n" + "movnti %%r8, (%1)\n" + :: "r" (source), "r" (dest) + : "memory", "r8"); + dest += 8; + source += 8; + size -= 8; + } + + /* 1x4 movnti loop */ + while (size >= 4) { + asm("movl (%0), %%r8d\n" + "movnti %%r8d, (%1)\n" + :: "r" (source), "r" (dest) + : "memory", "r8"); + dest += 4; + source += 4; + size -= 4; + } + + /* cache copy for remaining bytes */ + if (size) { + memcpy((void *) dest, (void *) source, size); + clean_cache_range((void *) dest, size); + } +} +EXPORT_SYMBOL_GPL(memcpy_flushcache); + +void memcpy_page_flushcache(char *to, struct page *page, size_t offset, + size_t len) +{ + char *from = kmap_atomic(page); + + memcpy_flushcache(to, from + offset, len); + kunmap_atomic(from); +} +#endif diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 767be7c76034..12e377184ee4 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -1009,7 +1009,7 @@ GrpTable: Grp15 1: fxstor | RDGSBASE Ry (F3),(11B) 2: vldmxcsr Md (v1) | WRFSBASE Ry (F3),(11B) 3: vstmxcsr Md (v1) | WRGSBASE Ry (F3),(11B) -4: XSAVE +4: XSAVE | ptwrite Ey (F3),(11B) 5: XRSTOR | lfence (11B) 6: XSAVEOPT | clwb (66) | mfence (11B) 7: clflush | clflushopt (66) | sfence (11B) diff --git a/arch/x86/math-emu/Makefile b/arch/x86/math-emu/Makefile index 9b0c63b60302..02211fc6f2ec 100644 --- a/arch/x86/math-emu/Makefile +++ b/arch/x86/math-emu/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for wm-FPU-emu # @@ -5,8 +6,8 @@ #DEBUG = -DDEBUGGING DEBUG = PARANOID = -DPARANOID -EXTRA_CFLAGS := $(PARANOID) $(DEBUG) -fno-builtin $(MATH_EMULATION) -EXTRA_AFLAGS := $(PARANOID) +ccflags-y += $(PARANOID) $(DEBUG) -fno-builtin $(MATH_EMULATION) +asflags-y += $(PARANOID) # From 'C' language sources: C_OBJS =fpu_entry.o errors.o \ diff --git a/arch/x86/math-emu/control_w.h b/arch/x86/math-emu/control_w.h index ae2274dbd305..60f4dcc5edc3 100644 --- a/arch/x86/math-emu/control_w.h +++ b/arch/x86/math-emu/control_w.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /*---------------------------------------------------------------------------+ | control_w.h | | | diff --git a/arch/x86/math-emu/div_Xsig.S b/arch/x86/math-emu/div_Xsig.S index f77ba3058b31..ee08449d20fd 100644 --- 
a/arch/x86/math-emu/div_Xsig.S +++ b/arch/x86/math-emu/div_Xsig.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ .file "div_Xsig.S" /*---------------------------------------------------------------------------+ | div_Xsig.S | @@ -363,3 +364,4 @@ L_bugged_2: pop %ebx jmp L_exit #endif /* PARANOID */ +ENDPROC(div_Xsig) diff --git a/arch/x86/math-emu/div_small.S b/arch/x86/math-emu/div_small.S index 47099628fa4c..8f5025c80ee0 100644 --- a/arch/x86/math-emu/div_small.S +++ b/arch/x86/math-emu/div_small.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ .file "div_small.S" /*---------------------------------------------------------------------------+ | div_small.S | @@ -44,4 +45,4 @@ ENTRY(FPU_div_small) leave ret - +ENDPROC(FPU_div_small) diff --git a/arch/x86/math-emu/errors.c b/arch/x86/math-emu/errors.c index 2ccc424a57d9..6b468517ab71 100644 --- a/arch/x86/math-emu/errors.c +++ b/arch/x86/math-emu/errors.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /*---------------------------------------------------------------------------+ | errors.c | | | diff --git a/arch/x86/math-emu/exception.h b/arch/x86/math-emu/exception.h index 67f43a4683d5..75230b977577 100644 --- a/arch/x86/math-emu/exception.h +++ b/arch/x86/math-emu/exception.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /*---------------------------------------------------------------------------+ | exception.h | | | diff --git a/arch/x86/math-emu/fpu_arith.c b/arch/x86/math-emu/fpu_arith.c index aeab24e083c4..09006dc474a0 100644 --- a/arch/x86/math-emu/fpu_arith.c +++ b/arch/x86/math-emu/fpu_arith.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /*---------------------------------------------------------------------------+ | fpu_arith.c | | | diff --git a/arch/x86/math-emu/fpu_asm.h b/arch/x86/math-emu/fpu_asm.h index 955b932735a4..a83353d5271c 100644 --- a/arch/x86/math-emu/fpu_asm.h +++ b/arch/x86/math-emu/fpu_asm.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /*---------------------------------------------------------------------------+ | fpu_asm.h | | | diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c index 024f6e971174..034748459482 100644 --- a/arch/x86/math-emu/fpu_aux.c +++ b/arch/x86/math-emu/fpu_aux.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /*---------------------------------------------------------------------------+ | fpu_aux.c | | | diff --git a/arch/x86/math-emu/fpu_emu.h b/arch/x86/math-emu/fpu_emu.h index afbc4d805d66..a5a41ec58072 100644 --- a/arch/x86/math-emu/fpu_emu.h +++ b/arch/x86/math-emu/fpu_emu.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /*---------------------------------------------------------------------------+ | fpu_emu.h | | | @@ -157,7 +158,7 @@ extern u_char const data_sizes_16[32]; #define signbyte(a) (((u_char *)(a))[9]) #define getsign(a) (signbyte(a) & 0x80) -#define setsign(a,b) { if (b) signbyte(a) |= 0x80; else signbyte(a) &= 0x7f; } +#define setsign(a,b) { if ((b) != 0) signbyte(a) |= 0x80; else signbyte(a) &= 0x7f; } #define copysign(a,b) { if (getsign(a)) signbyte(b) |= 0x80; \ else signbyte(b) &= 0x7f; } #define changesign(a) { signbyte(a) ^= 0x80; } diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 0203baefb5c0..9e2ba7e667f6 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /*---------------------------------------------------------------------------+ | fpu_entry.c | | | @@ -114,7 
+115,7 @@ void math_emulate(struct math_emu_info *info) struct desc_struct code_descriptor; struct fpu *fpu = ¤t->thread.fpu; - fpu__activate_curr(fpu); + fpu__initialize(fpu); #ifdef RE_ENTRANT_CHECKING if (emulating) { @@ -147,7 +148,7 @@ void math_emulate(struct math_emu_info *info) } code_descriptor = FPU_get_ldt_descriptor(FPU_CS); - if (SEG_D_SIZE(code_descriptor)) { + if (code_descriptor.d) { /* The above test may be wrong, the book is not clear */ /* Segmented 32 bit protected mode */ addr_modes.default_mode = SEG32; @@ -155,11 +156,10 @@ void math_emulate(struct math_emu_info *info) /* 16 bit protected mode */ addr_modes.default_mode = PM16; } - FPU_EIP += code_base = SEG_BASE_ADDR(code_descriptor); - code_limit = code_base - + (SEG_LIMIT(code_descriptor) + - 1) * SEG_GRANULARITY(code_descriptor) - - 1; + FPU_EIP += code_base = seg_get_base(&code_descriptor); + code_limit = seg_get_limit(&code_descriptor) + 1; + code_limit *= seg_get_granularity(&code_descriptor); + code_limit += code_base - 1; if (code_limit < code_base) code_limit = 0xffffffff; } diff --git a/arch/x86/math-emu/fpu_etc.c b/arch/x86/math-emu/fpu_etc.c index 233e5af566f5..1b118fd93140 100644 --- a/arch/x86/math-emu/fpu_etc.c +++ b/arch/x86/math-emu/fpu_etc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /*---------------------------------------------------------------------------+ | fpu_etc.c | | | diff --git a/arch/x86/math-emu/fpu_proto.h b/arch/x86/math-emu/fpu_proto.h index caff438b9c1d..70d35c200945 100644 --- a/arch/x86/math-emu/fpu_proto.h +++ b/arch/x86/math-emu/fpu_proto.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _FPU_PROTO_H #define _FPU_PROTO_H diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h index 5e044d506b7a..c8b1b31ed7c4 100644 --- a/arch/x86/math-emu/fpu_system.h +++ b/arch/x86/math-emu/fpu_system.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /*---------------------------------------------------------------------------+ | fpu_system.h | | | @@ -27,24 +28,50 @@ static inline struct desc_struct FPU_get_ldt_descriptor(unsigned seg) #ifdef CONFIG_MODIFY_LDT_SYSCALL seg >>= 3; mutex_lock(¤t->mm->context.lock); - if (current->mm->context.ldt && seg < current->mm->context.ldt->size) + if (current->mm->context.ldt && seg < current->mm->context.ldt->nr_entries) ret = current->mm->context.ldt->entries[seg]; mutex_unlock(¤t->mm->context.lock); #endif return ret; } -#define SEG_D_SIZE(x) ((x).b & (3 << 21)) -#define SEG_G_BIT(x) ((x).b & (1 << 23)) -#define SEG_GRANULARITY(x) (((x).b & (1 << 23)) ? 
4096 : 1) -#define SEG_286_MODE(x) ((x).b & ( 0xff000000 | 0xf0000 | (1 << 23))) -#define SEG_BASE_ADDR(s) (((s).b & 0xff000000) \ - | (((s).b & 0xff) << 16) | ((s).a >> 16)) -#define SEG_LIMIT(s) (((s).b & 0xff0000) | ((s).a & 0xffff)) -#define SEG_EXECUTE_ONLY(s) (((s).b & ((1 << 11) | (1 << 9))) == (1 << 11)) -#define SEG_WRITE_PERM(s) (((s).b & ((1 << 11) | (1 << 9))) == (1 << 9)) -#define SEG_EXPAND_DOWN(s) (((s).b & ((1 << 11) | (1 << 10))) \ - == (1 << 10)) +#define SEG_TYPE_WRITABLE (1U << 1) +#define SEG_TYPE_EXPANDS_DOWN (1U << 2) +#define SEG_TYPE_EXECUTE (1U << 3) +#define SEG_TYPE_EXPAND_MASK (SEG_TYPE_EXPANDS_DOWN | SEG_TYPE_EXECUTE) +#define SEG_TYPE_EXECUTE_MASK (SEG_TYPE_WRITABLE | SEG_TYPE_EXECUTE) + +static inline unsigned long seg_get_base(struct desc_struct *d) +{ + unsigned long base = (unsigned long)d->base2 << 24; + + return base | ((unsigned long)d->base1 << 16) | d->base0; +} + +static inline unsigned long seg_get_limit(struct desc_struct *d) +{ + return ((unsigned long)d->limit1 << 16) | d->limit0; +} + +static inline unsigned long seg_get_granularity(struct desc_struct *d) +{ + return d->g ? 4096 : 1; +} + +static inline bool seg_expands_down(struct desc_struct *d) +{ + return (d->type & SEG_TYPE_EXPAND_MASK) == SEG_TYPE_EXPANDS_DOWN; +} + +static inline bool seg_execute_only(struct desc_struct *d) +{ + return (d->type & SEG_TYPE_EXECUTE_MASK) == SEG_TYPE_EXECUTE; +} + +static inline bool seg_writable(struct desc_struct *d) +{ + return (d->type & SEG_TYPE_EXECUTE_MASK) == SEG_TYPE_WRITABLE; +} #define I387 (¤t->thread.fpu.state) #define FPU_info (I387->soft.info) diff --git a/arch/x86/math-emu/fpu_tags.c b/arch/x86/math-emu/fpu_tags.c index d9c657cd7746..bff95d4e7dca 100644 --- a/arch/x86/math-emu/fpu_tags.c +++ b/arch/x86/math-emu/fpu_tags.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /*---------------------------------------------------------------------------+ | fpu_tags.c | | | diff --git a/arch/x86/math-emu/fpu_trig.c b/arch/x86/math-emu/fpu_trig.c index ecd06680581c..783c509f957a 100644 --- a/arch/x86/math-emu/fpu_trig.c +++ b/arch/x86/math-emu/fpu_trig.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /*---------------------------------------------------------------------------+ | fpu_trig.c | | | diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c index b8ef9f9d2ffc..b82ca14ba718 100644 --- a/arch/x86/math-emu/get_address.c +++ b/arch/x86/math-emu/get_address.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /*---------------------------------------------------------------------------+ | get_address.c | | | @@ -159,17 +160,18 @@ static long pm_address(u_char FPU_modrm, u_char segment, } descriptor = FPU_get_ldt_descriptor(addr->selector); - base_address = SEG_BASE_ADDR(descriptor); + base_address = seg_get_base(&descriptor); address = base_address + offset; - limit = base_address - + (SEG_LIMIT(descriptor) + 1) * SEG_GRANULARITY(descriptor) - 1; + limit = seg_get_limit(&descriptor) + 1; + limit *= seg_get_granularity(&descriptor); + limit += base_address - 1; if (limit < base_address) limit = 0xffffffff; - if (SEG_EXPAND_DOWN(descriptor)) { - if (SEG_G_BIT(descriptor)) + if (seg_expands_down(&descriptor)) { + if (descriptor.g) { seg_top = 0xffffffff; - else { + } else { seg_top = base_address + (1 << 20); if (seg_top < base_address) seg_top = 0xffffffff; @@ -182,8 +184,8 @@ static long pm_address(u_char FPU_modrm, u_char segment, (address > limit) || (address < base_address) ? 0 : ((limit - address) >= 254 ? 
255 : limit - address + 1); } - if (SEG_EXECUTE_ONLY(descriptor) || - (!SEG_WRITE_PERM(descriptor) && (FPU_modrm & FPU_WRITE_BIT))) { + if (seg_execute_only(&descriptor) || + (!seg_writable(&descriptor) && (FPU_modrm & FPU_WRITE_BIT))) { access_limit = 0; } return address; diff --git a/arch/x86/math-emu/load_store.c b/arch/x86/math-emu/load_store.c index 1643054766eb..f821a9cd7753 100644 --- a/arch/x86/math-emu/load_store.c +++ b/arch/x86/math-emu/load_store.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /*---------------------------------------------------------------------------+ | load_store.c | | | diff --git a/arch/x86/math-emu/mul_Xsig.S b/arch/x86/math-emu/mul_Xsig.S index 717785a53eb4..3e489122a2b0 100644 --- a/arch/x86/math-emu/mul_Xsig.S +++ b/arch/x86/math-emu/mul_Xsig.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /*---------------------------------------------------------------------------+ | mul_Xsig.S | | | @@ -62,6 +63,7 @@ ENTRY(mul32_Xsig) popl %esi leave ret +ENDPROC(mul32_Xsig) ENTRY(mul64_Xsig) @@ -114,6 +116,7 @@ ENTRY(mul64_Xsig) popl %esi leave ret +ENDPROC(mul64_Xsig) @@ -173,4 +176,4 @@ ENTRY(mul_Xsig_Xsig) popl %esi leave ret - +ENDPROC(mul_Xsig_Xsig) diff --git a/arch/x86/math-emu/poly.h b/arch/x86/math-emu/poly.h index 168eb44c93c8..fc1c887ca073 100644 --- a/arch/x86/math-emu/poly.h +++ b/arch/x86/math-emu/poly.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /*---------------------------------------------------------------------------+ | poly.h | | | diff --git a/arch/x86/math-emu/poly_2xm1.c b/arch/x86/math-emu/poly_2xm1.c index b00e9e10cdce..aa33006bafd5 100644 --- a/arch/x86/math-emu/poly_2xm1.c +++ b/arch/x86/math-emu/poly_2xm1.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /*---------------------------------------------------------------------------+ | poly_2xm1.c | | | diff --git a/arch/x86/math-emu/poly_atan.c b/arch/x86/math-emu/poly_atan.c index 20c28e58e2d4..7e7412c5a1fe 100644 --- a/arch/x86/math-emu/poly_atan.c +++ b/arch/x86/math-emu/poly_atan.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /*---------------------------------------------------------------------------+ | poly_atan.c | | | diff --git a/arch/x86/math-emu/poly_l2.c b/arch/x86/math-emu/poly_l2.c index 8e2ff4b28a0a..98b6949bb854 100644 --- a/arch/x86/math-emu/poly_l2.c +++ b/arch/x86/math-emu/poly_l2.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /*---------------------------------------------------------------------------+ | poly_l2.c | | | diff --git a/arch/x86/math-emu/poly_sin.c b/arch/x86/math-emu/poly_sin.c index b862039c728e..c192fba51526 100644 --- a/arch/x86/math-emu/poly_sin.c +++ b/arch/x86/math-emu/poly_sin.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /*---------------------------------------------------------------------------+ | poly_sin.c | | | diff --git a/arch/x86/math-emu/poly_tan.c b/arch/x86/math-emu/poly_tan.c index 1875763e0c02..1f5b1d712323 100644 --- a/arch/x86/math-emu/poly_tan.c +++ b/arch/x86/math-emu/poly_tan.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /*---------------------------------------------------------------------------+ | poly_tan.c | | | diff --git a/arch/x86/math-emu/polynom_Xsig.S b/arch/x86/math-emu/polynom_Xsig.S index 17315c89ff3d..604f0b2d17e8 100644 --- a/arch/x86/math-emu/polynom_Xsig.S +++ b/arch/x86/math-emu/polynom_Xsig.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /*---------------------------------------------------------------------------+ | 
polynomial_Xsig.S | | | @@ -133,3 +134,4 @@ L_accum_done: popl %esi leave ret +ENDPROC(polynomial_Xsig) diff --git a/arch/x86/math-emu/reg_add_sub.c b/arch/x86/math-emu/reg_add_sub.c index deea48b9f13a..29451dd07556 100644 --- a/arch/x86/math-emu/reg_add_sub.c +++ b/arch/x86/math-emu/reg_add_sub.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /*---------------------------------------------------------------------------+ | reg_add_sub.c | | | diff --git a/arch/x86/math-emu/reg_compare.c b/arch/x86/math-emu/reg_compare.c index b77360fdbf4a..eacb5128fc09 100644 --- a/arch/x86/math-emu/reg_compare.c +++ b/arch/x86/math-emu/reg_compare.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /*---------------------------------------------------------------------------+ | reg_compare.c | | | @@ -168,7 +169,7 @@ static int compare(FPU_REG const *b, int tagb) /* This function requires that st(0) is not empty */ int FPU_compare_st_data(FPU_REG const *loaded_data, u_char loaded_tag) { - int f = 0, c; + int f, c; c = compare(loaded_data, loaded_tag); @@ -189,12 +190,12 @@ int FPU_compare_st_data(FPU_REG const *loaded_data, u_char loaded_tag) case COMP_No_Comp: f = SW_C3 | SW_C2 | SW_C0; break; -#ifdef PARANOID default: +#ifdef PARANOID EXCEPTION(EX_INTERNAL | 0x121); +#endif /* PARANOID */ f = SW_C3 | SW_C2 | SW_C0; break; -#endif /* PARANOID */ } setcc(f); if (c & COMP_Denormal) { @@ -205,7 +206,7 @@ int FPU_compare_st_data(FPU_REG const *loaded_data, u_char loaded_tag) static int compare_st_st(int nr) { - int f = 0, c; + int f, c; FPU_REG *st_ptr; if (!NOT_EMPTY(0) || !NOT_EMPTY(nr)) { @@ -235,12 +236,12 @@ static int compare_st_st(int nr) case COMP_No_Comp: f = SW_C3 | SW_C2 | SW_C0; break; -#ifdef PARANOID default: +#ifdef PARANOID EXCEPTION(EX_INTERNAL | 0x122); +#endif /* PARANOID */ f = SW_C3 | SW_C2 | SW_C0; break; -#endif /* PARANOID */ } setcc(f); if (c & COMP_Denormal) { @@ -283,12 +284,12 @@ static int compare_i_st_st(int nr) case COMP_No_Comp: f = X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF; break; -#ifdef PARANOID default: +#ifdef PARANOID EXCEPTION(EX_INTERNAL | 0x122); +#endif /* PARANOID */ f = 0; break; -#endif /* PARANOID */ } FPU_EFLAGS = (FPU_EFLAGS & ~(X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF)) | f; if (c & COMP_Denormal) { diff --git a/arch/x86/math-emu/reg_constant.c b/arch/x86/math-emu/reg_constant.c index 00548354912f..8dc9095bab22 100644 --- a/arch/x86/math-emu/reg_constant.c +++ b/arch/x86/math-emu/reg_constant.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /*---------------------------------------------------------------------------+ | reg_constant.c | | | diff --git a/arch/x86/math-emu/reg_constant.h b/arch/x86/math-emu/reg_constant.h index 1bffaec3a134..f2fdd344dcaa 100644 --- a/arch/x86/math-emu/reg_constant.h +++ b/arch/x86/math-emu/reg_constant.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /*---------------------------------------------------------------------------+ | reg_constant.h | | | diff --git a/arch/x86/math-emu/reg_convert.c b/arch/x86/math-emu/reg_convert.c index 108060779977..251180623ce0 100644 --- a/arch/x86/math-emu/reg_convert.c +++ b/arch/x86/math-emu/reg_convert.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /*---------------------------------------------------------------------------+ | reg_convert.c | | | diff --git a/arch/x86/math-emu/reg_divide.c b/arch/x86/math-emu/reg_divide.c index 6827012db341..08c2f6de0288 100644 --- a/arch/x86/math-emu/reg_divide.c +++ b/arch/x86/math-emu/reg_divide.c @@ -1,3 +1,4 @@ 
+// SPDX-License-Identifier: GPL-2.0 /*---------------------------------------------------------------------------+ | reg_divide.c | | | diff --git a/arch/x86/math-emu/reg_ld_str.c b/arch/x86/math-emu/reg_ld_str.c index 2c98965a60ba..d40ff45497b9 100644 --- a/arch/x86/math-emu/reg_ld_str.c +++ b/arch/x86/math-emu/reg_ld_str.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /*---------------------------------------------------------------------------+ | reg_ld_str.c | | | diff --git a/arch/x86/math-emu/reg_mul.c b/arch/x86/math-emu/reg_mul.c index 36c37f71f713..d69618572f14 100644 --- a/arch/x86/math-emu/reg_mul.c +++ b/arch/x86/math-emu/reg_mul.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /*---------------------------------------------------------------------------+ | reg_mul.c | | | diff --git a/arch/x86/math-emu/reg_norm.S b/arch/x86/math-emu/reg_norm.S index 8b6352efceef..7f6b4392a15d 100644 --- a/arch/x86/math-emu/reg_norm.S +++ b/arch/x86/math-emu/reg_norm.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /*---------------------------------------------------------------------------+ | reg_norm.S | | | @@ -94,6 +95,7 @@ L_overflow: call arith_overflow pop %ebx jmp L_exit +ENDPROC(FPU_normalize) @@ -145,3 +147,4 @@ L_exit_nuo_zero: popl %ebx leave ret +ENDPROC(FPU_normalize_nuo) diff --git a/arch/x86/math-emu/reg_round.S b/arch/x86/math-emu/reg_round.S index d1d4e48b4f67..04563421ee7d 100644 --- a/arch/x86/math-emu/reg_round.S +++ b/arch/x86/math-emu/reg_round.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ .file "reg_round.S" /*---------------------------------------------------------------------------+ | reg_round.S | @@ -706,3 +707,5 @@ L_exception_exit: mov $-1,%eax jmp fpu_reg_round_special_exit #endif /* PARANOID */ + +ENDPROC(FPU_round) diff --git a/arch/x86/math-emu/reg_u_add.S b/arch/x86/math-emu/reg_u_add.S index 47c4c2434d85..50fe9f8c893c 100644 --- a/arch/x86/math-emu/reg_u_add.S +++ b/arch/x86/math-emu/reg_u_add.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ .file "reg_u_add.S" /*---------------------------------------------------------------------------+ | reg_u_add.S | @@ -165,3 +166,4 @@ L_exit: leave ret #endif /* PARANOID */ +ENDPROC(FPU_u_add) diff --git a/arch/x86/math-emu/reg_u_div.S b/arch/x86/math-emu/reg_u_div.S index cc00654b6f9a..94d545e118e4 100644 --- a/arch/x86/math-emu/reg_u_div.S +++ b/arch/x86/math-emu/reg_u_div.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ .file "reg_u_div.S" /*---------------------------------------------------------------------------+ | reg_u_div.S | @@ -469,3 +470,5 @@ L_exit: leave ret #endif /* PARANOID */ + +ENDPROC(FPU_u_div) diff --git a/arch/x86/math-emu/reg_u_mul.S b/arch/x86/math-emu/reg_u_mul.S index 973f12af97df..21cde47fb3e5 100644 --- a/arch/x86/math-emu/reg_u_mul.S +++ b/arch/x86/math-emu/reg_u_mul.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ .file "reg_u_mul.S" /*---------------------------------------------------------------------------+ | reg_u_mul.S | @@ -146,3 +147,4 @@ L_exit: ret #endif /* PARANOID */ +ENDPROC(FPU_u_mul) diff --git a/arch/x86/math-emu/reg_u_sub.S b/arch/x86/math-emu/reg_u_sub.S index 1b6c24801d22..f05dea7dec38 100644 --- a/arch/x86/math-emu/reg_u_sub.S +++ b/arch/x86/math-emu/reg_u_sub.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ .file "reg_u_sub.S" /*---------------------------------------------------------------------------+ | reg_u_sub.S | @@ -270,3 +271,4 @@ L_exit: popl %esi leave ret +ENDPROC(FPU_u_sub) diff 
--git a/arch/x86/math-emu/round_Xsig.S b/arch/x86/math-emu/round_Xsig.S index bbe0e87718e4..226a51e991f1 100644 --- a/arch/x86/math-emu/round_Xsig.S +++ b/arch/x86/math-emu/round_Xsig.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /*---------------------------------------------------------------------------+ | round_Xsig.S | | | @@ -78,7 +79,7 @@ L_exit: popl %ebx leave ret - +ENDPROC(round_Xsig) @@ -138,4 +139,4 @@ L_n_exit: popl %ebx leave ret - +ENDPROC(norm_Xsig) diff --git a/arch/x86/math-emu/shr_Xsig.S b/arch/x86/math-emu/shr_Xsig.S index 31cdd118e918..96f4779aa9c1 100644 --- a/arch/x86/math-emu/shr_Xsig.S +++ b/arch/x86/math-emu/shr_Xsig.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ .file "shr_Xsig.S" /*---------------------------------------------------------------------------+ | shr_Xsig.S | @@ -85,3 +86,4 @@ L_more_than_95: popl %esi leave ret +ENDPROC(shr_Xsig) diff --git a/arch/x86/math-emu/status_w.h b/arch/x86/math-emu/status_w.h index 54a3f226982d..b77bafec9526 100644 --- a/arch/x86/math-emu/status_w.h +++ b/arch/x86/math-emu/status_w.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /*---------------------------------------------------------------------------+ | status_w.h | | | diff --git a/arch/x86/math-emu/wm_shrx.S b/arch/x86/math-emu/wm_shrx.S index 518428317985..d588874eb6fb 100644 --- a/arch/x86/math-emu/wm_shrx.S +++ b/arch/x86/math-emu/wm_shrx.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ .file "wm_shrx.S" /*---------------------------------------------------------------------------+ | wm_shrx.S | @@ -92,6 +93,7 @@ L_more_than_95: popl %esi leave ret +ENDPROC(FPU_shrx) /*---------------------------------------------------------------------------+ @@ -202,3 +204,4 @@ Ls_more_than_95: popl %esi leave ret +ENDPROC(FPU_shrxs) diff --git a/arch/x86/math-emu/wm_sqrt.S b/arch/x86/math-emu/wm_sqrt.S index d258f59564e1..f031c0e19356 100644 --- a/arch/x86/math-emu/wm_sqrt.S +++ b/arch/x86/math-emu/wm_sqrt.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ .file "wm_sqrt.S" /*---------------------------------------------------------------------------+ | wm_sqrt.S | @@ -468,3 +469,4 @@ sqrt_more_prec_large: /* Our estimate is too large */ movl $0x7fffff00,%eax jmp sqrt_round_result +ENDPROC(wm_sqrt) diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 96d2b847e09e..7ba7f3d7f477 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,8 +1,16 @@ -# Kernel does not boot with instrumentation of tlb.c. 
-KCOV_INSTRUMENT_tlb.o := n +# SPDX-License-Identifier: GPL-2.0 +# Kernel does not boot with instrumentation of tlb.c and mem_encrypt.c +KCOV_INSTRUMENT_tlb.o := n +KCOV_INSTRUMENT_mem_encrypt.o := n + +KASAN_SANITIZE_mem_encrypt.o := n + +ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_mem_encrypt.o = -pg +endif obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ - pat.o pgtable.o physaddr.o gup.o setup_nx.o tlb.o + pat.o pgtable.o physaddr.o setup_nx.o tlb.o # Make sure __phys_addr has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) @@ -39,3 +47,5 @@ obj-$(CONFIG_X86_INTEL_MPX) += mpx.o obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o +obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o +obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c index 91f501b2da3b..048c761d97b0 100644 --- a/arch/x86/mm/amdtopology.c +++ b/arch/x86/mm/amdtopology.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * AMD NUMA support. * Discover the memory map and associated nodes. diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index bce6990b1d81..5e3ac6fe6c9e 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -13,12 +13,12 @@ */ #include <linux/debugfs.h> +#include <linux/kasan.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/seq_file.h> -#include <asm/kasan.h> #include <asm/pgtable.h> /* @@ -138,7 +138,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) { pgprotval_t pr = pgprot_val(prot); static const char * const level_name[] = - { "cr3", "pgd", "pud", "pmd", "pte" }; + { "cr3", "pgd", "p4d", "pud", "pmd", "pte" }; if (!pgprot_val(prot)) { /* Not present */ @@ -162,12 +162,12 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) pt_dump_cont_printf(m, dmsg, " "); /* Bit 7 has a different meaning on level 3 vs 4 */ - if (level <= 3 && pr & _PAGE_PSE) + if (level <= 4 && pr & _PAGE_PSE) pt_dump_cont_printf(m, dmsg, "PSE "); else pt_dump_cont_printf(m, dmsg, " "); - if ((level == 4 && pr & _PAGE_PAT) || - ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE)) + if ((level == 5 && pr & _PAGE_PAT) || + ((level == 4 || level == 3) && pr & _PAGE_PAT_LARGE)) pt_dump_cont_printf(m, dmsg, "PAT "); else pt_dump_cont_printf(m, dmsg, " "); @@ -188,11 +188,12 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg) */ static unsigned long normalize_addr(unsigned long u) { -#ifdef CONFIG_X86_64 - return (signed long)(u << 16) >> 16; -#else - return u; -#endif + int shift; + if (!IS_ENABLED(CONFIG_X86_64)) + return u; + + shift = 64 - (__VIRTUAL_MASK_SHIFT + 1); + return (signed long)(u << shift) >> shift; } /* @@ -297,32 +298,62 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, for (i = 0; i < PTRS_PER_PTE; i++) { prot = pte_flags(*start); st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT); - note_page(m, st, __pgprot(prot), 4); + note_page(m, st, __pgprot(prot), 5); start++; } } +#ifdef CONFIG_KASAN + +/* + * This is an optimization for KASAN=y case. Since all kasan page tables + * eventually point to the kasan_zero_page we could call note_page() + * right away without walking through lower level page tables. 
This saves + * us dozens of seconds (minutes for 5-level config) while checking for + * W+X mapping or reading kernel_page_tables debugfs file. + */ +static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st, + void *pt) +{ + if (__pa(pt) == __pa(kasan_zero_pmd) || +#ifdef CONFIG_X86_5LEVEL + __pa(pt) == __pa(kasan_zero_p4d) || +#endif + __pa(pt) == __pa(kasan_zero_pud)) { + pgprotval_t prot = pte_flags(kasan_zero_pte[0]); + note_page(m, st, __pgprot(prot), 5); + return true; + } + return false; +} +#else +static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st, + void *pt) +{ + return false; +} +#endif #if PTRS_PER_PMD > 1 static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P) { int i; - pmd_t *start; + pmd_t *start, *pmd_start; pgprotval_t prot; - start = (pmd_t *)pud_page_vaddr(addr); + pmd_start = start = (pmd_t *)pud_page_vaddr(addr); for (i = 0; i < PTRS_PER_PMD; i++) { st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); if (!pmd_none(*start)) { if (pmd_large(*start) || !pmd_present(*start)) { prot = pmd_flags(*start); - note_page(m, st, __pgprot(prot), 3); - } else { + note_page(m, st, __pgprot(prot), 4); + } else if (!kasan_page_table(m, st, pmd_start)) { walk_pte_level(m, st, *start, P + i * PMD_LEVEL_MULT); } } else - note_page(m, st, __pgprot(0), 3); + note_page(m, st, __pgprot(0), 4); start++; } } @@ -335,39 +366,27 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, #if PTRS_PER_PUD > 1 -/* - * This is an optimization for CONFIG_DEBUG_WX=y + CONFIG_KASAN=y - * KASAN fills page tables with the same values. Since there is no - * point in checking page table more than once we just skip repeated - * entries. This saves us dozens of seconds during boot. 
- */ -static bool pud_already_checked(pud_t *prev_pud, pud_t *pud, bool checkwx) -{ - return checkwx && prev_pud && (pud_val(*prev_pud) == pud_val(*pud)); -} - static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P) { int i; - pud_t *start; + pud_t *start, *pud_start; pgprotval_t prot; pud_t *prev_pud = NULL; - start = (pud_t *)p4d_page_vaddr(addr); + pud_start = start = (pud_t *)p4d_page_vaddr(addr); for (i = 0; i < PTRS_PER_PUD; i++) { st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); - if (!pud_none(*start) && - !pud_already_checked(prev_pud, start, st->check_wx)) { + if (!pud_none(*start)) { if (pud_large(*start) || !pud_present(*start)) { prot = pud_flags(*start); - note_page(m, st, __pgprot(prot), 2); - } else { + note_page(m, st, __pgprot(prot), 3); + } else if (!kasan_page_table(m, st, pud_start)) { walk_pmd_level(m, st, *start, P + i * PUD_LEVEL_MULT); } } else - note_page(m, st, __pgprot(0), 2); + note_page(m, st, __pgprot(0), 3); prev_pud = start; start++; @@ -385,10 +404,10 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P) { int i; - p4d_t *start; + p4d_t *start, *p4d_start; pgprotval_t prot; - start = (p4d_t *)pgd_page_vaddr(addr); + p4d_start = start = (p4d_t *)pgd_page_vaddr(addr); for (i = 0; i < PTRS_PER_P4D; i++) { st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT); @@ -396,7 +415,7 @@ static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, if (p4d_large(*start) || !p4d_present(*start)) { prot = p4d_flags(*start); note_page(m, st, __pgprot(prot), 2); - } else { + } else if (!kasan_page_table(m, st, p4d_start)) { walk_pud_level(m, st, *start, P + i * P4D_LEVEL_MULT); } @@ -431,7 +450,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, bool checkwx) { #ifdef CONFIG_X86_64 - pgd_t *start = (pgd_t *) &init_level4_pgt; + pgd_t *start = (pgd_t *) &init_top_pgt; #else pgd_t *start = swapper_pg_dir; #endif diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 0ea8afcb929c..c3521e2be396 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -2,6 +2,7 @@ #include <linux/uaccess.h> #include <linux/sched/debug.h> +#include <asm/fpu/internal.h> #include <asm/traps.h> #include <asm/kdebug.h> @@ -36,6 +37,71 @@ bool ex_handler_fault(const struct exception_table_entry *fixup, } EXPORT_SYMBOL_GPL(ex_handler_fault); +/* + * Handler for UD0 exception following a failed test against the + * result of a refcount inc/dec/add/sub. + */ +bool ex_handler_refcount(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + /* First unconditionally saturate the refcount. */ + *(int *)regs->cx = INT_MIN / 2; + + /* + * Strictly speaking, this reports the fixup destination, not + * the fault location, and not the actually overflowing + * instruction, which is the instruction before the "js", but + * since that instruction could be a variety of lengths, just + * report the location after the overflow, which should be close + * enough for finding the overflow, as it's at least back in + * the function, having returned from .text.unlikely. + */ + regs->ip = ex_fixup_addr(fixup); + + /* + * This function has been called because either a negative refcount + * value was seen by any of the refcount functions, or a zero + * refcount value was seen by refcount_dec(). 
+ * + * If we crossed from INT_MAX to INT_MIN, OF (Overflow Flag: result + * wrapped around) will be set. Additionally, seeing the refcount + * reach 0 will set ZF (Zero Flag: result was zero). In each of + * these cases we want a report, since it's a boundary condition. + * + */ + if (regs->flags & (X86_EFLAGS_OF | X86_EFLAGS_ZF)) { + bool zero = regs->flags & X86_EFLAGS_ZF; + + refcount_error_report(regs, zero ? "hit zero" : "overflow"); + } + + return true; +} +EXPORT_SYMBOL_GPL(ex_handler_refcount); + +/* + * Handler for when we fail to restore a task's FPU state. We should never get + * here because the FPU state of a task using the FPU (task->thread.fpu.state) + * should always be valid. However, past bugs have allowed userspace to set + * reserved bits in the XSAVE area using PTRACE_SETREGSET or sys_rt_sigreturn(). + * These caused XRSTOR to fail when switching to the task, leaking the FPU + * registers of the task previously executing on the CPU. Mitigate this class + * of vulnerability by restoring from the initial state (essentially, zeroing + * out all the FPU registers) if we can't restore from the task's FPU state. + */ +bool ex_handler_fprestore(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + regs->ip = ex_fixup_addr(fixup); + + WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.", + (void *)instruction_pointer(regs)); + + __copy_kernel_to_fpregs(&init_fpstate, -1); + return true; +} +EXPORT_SYMBOL_GPL(ex_handler_fprestore); + bool ex_handler_ext(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr) { @@ -142,7 +208,7 @@ void __init early_fixup_exception(struct pt_regs *regs, int trapnr) * undefined. I'm not sure which CPUs do this, but at least * the 486 DX works this way. */ - if ((regs->cs & 0xFFFF) != __KERNEL_CS) + if (regs->cs != __KERNEL_CS) goto fail; /* diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 8ad91a01cbc8..b0ff378650a9 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1995 Linus Torvalds * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. @@ -192,8 +193,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really * faulted on a pte with its pkey=4. */ -static void fill_sig_info_pkey(int si_code, siginfo_t *info, - struct vm_area_struct *vma) +static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey) { /* This is effectively an #ifdef */ if (!boot_cpu_has(X86_FEATURE_OSPKE)) @@ -209,7 +209,7 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info, * valid VMA, so we should never reach this without a * valid VMA. */ - if (!vma) { + if (!pkey) { WARN_ONCE(1, "PKU fault with no VMA passed in"); info->si_pkey = 0; return; @@ -219,13 +219,12 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info, * absolutely guranteed to be 100% accurate because of * the race explained above. 
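Stepping back to ex_handler_refcount() above, a plain-C illustration of the saturation scheme it implements. This is a hedged sketch, not the kernel implementation (which inspects the trapping instruction's OF/ZF flags); the helper name is made up and only mirrors the two reported conditions and the INT_MIN/2 saturation value:

#include <limits.h>
#include <stdio.h>

/*
 * Mirror of the two reported conditions: wrapping past INT_MAX ("overflow")
 * and reaching or crossing zero ("hit zero"). Either way the counter is
 * pinned at INT_MIN/2, far from both boundaries, so later increments or
 * decrements cannot walk it back into a plausible-looking range.
 */
static int refcount_add_checked(int *count, int delta)
{
        long long next = (long long)*count + delta;

        if (next > INT_MAX || next <= 0) {
                *count = INT_MIN / 2;
                fprintf(stderr, "refcount %s detected\n",
                        next <= 0 ? "hit zero" : "overflow");
                return -1;
        }
        *count = (int)next;
        return 0;
}

int main(void)
{
        int refs = INT_MAX;

        refcount_add_checked(&refs, 1); /* reports "overflow" */
        printf("refs = %d\n", refs);    /* pinned at INT_MIN / 2 */
        return 0;
}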
*/ - info->si_pkey = vma_pkey(vma); + info->si_pkey = *pkey; } static void force_sig_info_fault(int si_signo, int si_code, unsigned long address, - struct task_struct *tsk, struct vm_area_struct *vma, - int fault) + struct task_struct *tsk, u32 *pkey, int fault) { unsigned lsb = 0; siginfo_t info; @@ -240,7 +239,7 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address, lsb = PAGE_SHIFT; info.si_addr_lsb = lsb; - fill_sig_info_pkey(si_code, &info, vma); + fill_sig_info_pkey(si_code, &info, pkey); force_sig_info(si_signo, &info, tsk); } @@ -346,7 +345,7 @@ static noinline int vmalloc_fault(unsigned long address) * Do _not_ use "current" here. We might be inside * an interrupt in the middle of a task switch.. */ - pgd_paddr = read_cr3(); + pgd_paddr = read_cr3_pa(); pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); if (!pmd_k) return -1; @@ -388,7 +387,7 @@ static bool low_pfn(unsigned long pfn) static void dump_pagetable(unsigned long address) { - pgd_t *base = __va(read_cr3()); + pgd_t *base = __va(read_cr3_pa()); pgd_t *pgd = &base[pgd_index(address)]; p4d_t *p4d; pud_t *pud; @@ -396,14 +395,18 @@ static void dump_pagetable(unsigned long address) pte_t *pte; #ifdef CONFIG_X86_PAE - printk("*pdpt = %016Lx ", pgd_val(*pgd)); + pr_info("*pdpt = %016Lx ", pgd_val(*pgd)); if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) goto out; +#define pr_pde pr_cont +#else +#define pr_pde pr_info #endif p4d = p4d_offset(pgd, address); pud = pud_offset(p4d, address); pmd = pmd_offset(pud, address); - printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); + pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); +#undef pr_pde /* * We must not directly access the pte in the highpte @@ -415,9 +418,9 @@ static void dump_pagetable(unsigned long address) goto out; pte = pte_offset_kernel(pmd, address); - printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte)); + pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte)); out: - printk("\n"); + pr_cont("\n"); } #else /* CONFIG_X86_64: */ @@ -451,7 +454,7 @@ static noinline int vmalloc_fault(unsigned long address) * happen within a race in page table update. 
In the later * case just flush: */ - pgd = (pgd_t *)__va(read_cr3()) + pgd_index(address); + pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address); pgd_ref = pgd_offset_k(address); if (pgd_none(*pgd_ref)) return -1; @@ -555,7 +558,7 @@ static int bad_address(void *p) static void dump_pagetable(unsigned long address) { - pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK); + pgd_t *base = __va(read_cr3_pa()); pgd_t *pgd = base + pgd_index(address); p4d_t *p4d; pud_t *pud; @@ -565,7 +568,7 @@ static void dump_pagetable(unsigned long address) if (bad_address(pgd)) goto bad; - printk("PGD %lx ", pgd_val(*pgd)); + pr_info("PGD %lx ", pgd_val(*pgd)); if (!pgd_present(*pgd)) goto out; @@ -574,7 +577,7 @@ static void dump_pagetable(unsigned long address) if (bad_address(p4d)) goto bad; - printk("P4D %lx ", p4d_val(*p4d)); + pr_cont("P4D %lx ", p4d_val(*p4d)); if (!p4d_present(*p4d) || p4d_large(*p4d)) goto out; @@ -582,7 +585,7 @@ static void dump_pagetable(unsigned long address) if (bad_address(pud)) goto bad; - printk("PUD %lx ", pud_val(*pud)); + pr_cont("PUD %lx ", pud_val(*pud)); if (!pud_present(*pud) || pud_large(*pud)) goto out; @@ -590,7 +593,7 @@ static void dump_pagetable(unsigned long address) if (bad_address(pmd)) goto bad; - printk("PMD %lx ", pmd_val(*pmd)); + pr_cont("PMD %lx ", pmd_val(*pmd)); if (!pmd_present(*pmd) || pmd_large(*pmd)) goto out; @@ -598,12 +601,12 @@ static void dump_pagetable(unsigned long address) if (bad_address(pte)) goto bad; - printk("PTE %lx", pte_val(*pte)); + pr_cont("PTE %lx", pte_val(*pte)); out: - printk("\n"); + pr_cont("\n"); return; bad: - printk("BAD\n"); + pr_info("BAD\n"); } #endif /* CONFIG_X86_64 */ @@ -700,7 +703,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, pgd_t *pgd; pte_t *pte; - pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK); + pgd = __va(read_cr3_pa()); pgd += pgd_index(address); pte = lookup_address_in_pgd(pgd, address, &level); @@ -758,8 +761,6 @@ no_context(struct pt_regs *regs, unsigned long error_code, struct task_struct *tsk = current; unsigned long flags; int sig; - /* No context means no VMA to pass down */ - struct vm_area_struct *vma = NULL; /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs, X86_TRAP_PF)) { @@ -784,7 +785,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, /* XXX: hwpoison faults will set the wrong code. 
*/ force_sig_info_fault(signal, si_code, address, - tsk, vma, 0); + tsk, NULL, 0); } /* @@ -802,7 +803,6 @@ no_context(struct pt_regs *regs, unsigned long error_code, if (is_vmalloc_addr((void *)address) && (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) || address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) { - register void *__sp asm("rsp"); unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *); /* * We're likely to be running with very little stack space @@ -817,7 +817,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, asm volatile ("movq %[stack], %%rsp\n\t" "call handle_stack_overflow\n\t" "1: jmp 1b" - : "+r" (__sp) + : ASM_CALL_CONSTRAINT : "D" ("kernel stack overflow (page fault)"), "S" (regs), "d" (address), [stack] "rm" (stack)); @@ -893,8 +893,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, static void __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, - unsigned long address, struct vm_area_struct *vma, - int si_code) + unsigned long address, u32 *pkey, int si_code) { struct task_struct *tsk = current; @@ -942,7 +941,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_PF; - force_sig_info_fault(SIGSEGV, si_code, address, tsk, vma, 0); + force_sig_info_fault(SIGSEGV, si_code, address, tsk, pkey, 0); return; } @@ -955,9 +954,9 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, static noinline void bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, - unsigned long address, struct vm_area_struct *vma) + unsigned long address, u32 *pkey) { - __bad_area_nosemaphore(regs, error_code, address, vma, SEGV_MAPERR); + __bad_area_nosemaphore(regs, error_code, address, pkey, SEGV_MAPERR); } static void @@ -965,6 +964,10 @@ __bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address, struct vm_area_struct *vma, int si_code) { struct mm_struct *mm = current->mm; + u32 pkey; + + if (vma) + pkey = vma_pkey(vma); /* * Something tried to access memory that isn't in our memory map.. @@ -972,7 +975,8 @@ __bad_area(struct pt_regs *regs, unsigned long error_code, */ up_read(&mm->mmap_sem); - __bad_area_nosemaphore(regs, error_code, address, vma, si_code); + __bad_area_nosemaphore(regs, error_code, address, + (vma) ? 
&pkey : NULL, si_code); } static noinline void @@ -1015,7 +1019,7 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code, static void do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, - struct vm_area_struct *vma, unsigned int fault) + u32 *pkey, unsigned int fault) { struct task_struct *tsk = current; int code = BUS_ADRERR; @@ -1042,13 +1046,12 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, code = BUS_MCEERR_AR; } #endif - force_sig_info_fault(SIGBUS, code, address, tsk, vma, fault); + force_sig_info_fault(SIGBUS, code, address, tsk, pkey, fault); } static noinline void mm_fault_error(struct pt_regs *regs, unsigned long error_code, - unsigned long address, struct vm_area_struct *vma, - unsigned int fault) + unsigned long address, u32 *pkey, unsigned int fault) { if (fatal_signal_pending(current) && !(error_code & PF_USER)) { no_context(regs, error_code, address, 0, 0); @@ -1072,9 +1075,9 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, } else { if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| VM_FAULT_HWPOISON_LARGE)) - do_sigbus(regs, error_code, address, vma, fault); + do_sigbus(regs, error_code, address, pkey, fault); else if (fault & VM_FAULT_SIGSEGV) - bad_area_nosemaphore(regs, error_code, address, vma); + bad_area_nosemaphore(regs, error_code, address, pkey); else BUG(); } @@ -1254,10 +1257,6 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs) * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate * routines. - * - * This function must have noinline because both callers - * {,trace_}do_page_fault() have notrace on. Having this an actual function - * guarantees there's a function trace entry. */ static noinline void __do_page_fault(struct pt_regs *regs, unsigned long error_code, @@ -1268,6 +1267,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, struct mm_struct *mm; int fault, major = 0; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + u32 pkey; tsk = current; mm = tsk->mm; @@ -1441,7 +1441,17 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked. + * + * Note that handle_userfault() may also release and reacquire mmap_sem + * (and not return with VM_FAULT_RETRY), when returning to userland to + * repeat the page fault later with a VM_FAULT_NOPAGE retval + * (potentially after handling any pending signal during the return to + * userland). The return to userland is identified whenever + * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags. + * Thus we have to be careful about not touching vma after handling the + * fault, so we read the pkey beforehand. 
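The thread running through this hunk is that the protection-key value must be captured from the VMA before the fault is handled, because handle_mm_fault()/handle_userfault() may drop mmap_sem and the VMA can be gone by the time a signal is raised. A condensed sketch of the pattern, distilled from the changes above rather than new code:

u32 pkey = vma_pkey(vma);       /* snapshot while mmap_sem still pins the vma */

fault = handle_mm_fault(vma, address, flags);
/* From here on, 'vma' must not be dereferenced; only the copied value
 * (or NULL when no VMA was found) is passed down the signal path. */
if (unlikely(fault & VM_FAULT_ERROR))
        mm_fault_error(regs, error_code, address, &pkey, fault);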
*/ + pkey = vma_pkey(vma); fault = handle_mm_fault(vma, address, flags); major |= fault & VM_FAULT_MAJOR; @@ -1470,7 +1480,7 @@ good_area: up_read(&mm->mmap_sem); if (unlikely(fault & VM_FAULT_ERROR)) { - mm_fault_error(regs, error_code, address, vma, fault); + mm_fault_error(regs, error_code, address, &pkey, fault); return; } @@ -1490,27 +1500,6 @@ good_area: } NOKPROBE_SYMBOL(__do_page_fault); -dotraplinkage void notrace -do_page_fault(struct pt_regs *regs, unsigned long error_code) -{ - unsigned long address = read_cr2(); /* Get the faulting address */ - enum ctx_state prev_state; - - /* - * We must have this function tagged with __kprobes, notrace and call - * read_cr2() before calling anything else. To avoid calling any kind - * of tracing machinery before we've observed the CR2 value. - * - * exception_{enter,exit}() contain all sorts of tracepoints. - */ - - prev_state = exception_enter(); - __do_page_fault(regs, error_code, address); - exception_exit(prev_state); -} -NOKPROBE_SYMBOL(do_page_fault); - -#ifdef CONFIG_TRACING static nokprobe_inline void trace_page_fault_entries(unsigned long address, struct pt_regs *regs, unsigned long error_code) @@ -1521,22 +1510,24 @@ trace_page_fault_entries(unsigned long address, struct pt_regs *regs, trace_page_fault_kernel(address, regs, error_code); } +/* + * We must have this function blacklisted from kprobes, tagged with notrace + * and call read_cr2() before calling anything else. To avoid calling any + * kind of tracing machinery before we've observed the CR2 value. + * + * exception_{enter,exit}() contains all sorts of tracepoints. + */ dotraplinkage void notrace -trace_do_page_fault(struct pt_regs *regs, unsigned long error_code) +do_page_fault(struct pt_regs *regs, unsigned long error_code) { - /* - * The exception_enter and tracepoint processing could - * trigger another page faults (user space callchain - * reading) and destroy the original cr2 value, so read - * the faulting address now. - */ - unsigned long address = read_cr2(); + unsigned long address = read_cr2(); /* Get the faulting address */ enum ctx_state prev_state; prev_state = exception_enter(); - trace_page_fault_entries(address, regs, error_code); + if (trace_pagefault_enabled()) + trace_page_fault_entries(address, regs, error_code); + __do_page_fault(regs, error_code, address); exception_exit(prev_state); } -NOKPROBE_SYMBOL(trace_do_page_fault); -#endif /* CONFIG_TRACING */ +NOKPROBE_SYMBOL(do_page_fault); diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c deleted file mode 100644 index 456dfdfd2249..000000000000 --- a/arch/x86/mm/gup.c +++ /dev/null @@ -1,496 +0,0 @@ -/* - * Lockless get_user_pages_fast for x86 - * - * Copyright (C) 2008 Nick Piggin - * Copyright (C) 2008 Novell Inc. - */ -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/vmstat.h> -#include <linux/highmem.h> -#include <linux/swap.h> -#include <linux/memremap.h> - -#include <asm/mmu_context.h> -#include <asm/pgtable.h> - -static inline pte_t gup_get_pte(pte_t *ptep) -{ -#ifndef CONFIG_X86_PAE - return READ_ONCE(*ptep); -#else - /* - * With get_user_pages_fast, we walk down the pagetables without taking - * any locks. For this we would like to load the pointers atomically, - * but that is not possible (without expensive cmpxchg8b) on PAE. 
What - * we do have is the guarantee that a pte will only either go from not - * present to present, or present to not present or both -- it will not - * switch to a completely different present page without a TLB flush in - * between; something that we are blocking by holding interrupts off. - * - * Setting ptes from not present to present goes: - * ptep->pte_high = h; - * smp_wmb(); - * ptep->pte_low = l; - * - * And present to not present goes: - * ptep->pte_low = 0; - * smp_wmb(); - * ptep->pte_high = 0; - * - * We must ensure here that the load of pte_low sees l iff pte_high - * sees h. We load pte_high *after* loading pte_low, which ensures we - * don't see an older value of pte_high. *Then* we recheck pte_low, - * which ensures that we haven't picked up a changed pte high. We might - * have got rubbish values from pte_low and pte_high, but we are - * guaranteed that pte_low will not have the present bit set *unless* - * it is 'l'. And get_user_pages_fast only operates on present ptes, so - * we're safe. - * - * gup_get_pte should not be used or copied outside gup.c without being - * very careful -- it does not atomically load the pte or anything that - * is likely to be useful for you. - */ - pte_t pte; - -retry: - pte.pte_low = ptep->pte_low; - smp_rmb(); - pte.pte_high = ptep->pte_high; - smp_rmb(); - if (unlikely(pte.pte_low != ptep->pte_low)) - goto retry; - - return pte; -#endif -} - -static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) -{ - while ((*nr) - nr_start) { - struct page *page = pages[--(*nr)]; - - ClearPageReferenced(page); - put_page(page); - } -} - -/* - * 'pteval' can come from a pte, pmd, pud or p4d. We only check - * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the - * same value on all 4 types. - */ -static inline int pte_allows_gup(unsigned long pteval, int write) -{ - unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER; - - if (write) - need_pte_bits |= _PAGE_RW; - - if ((pteval & need_pte_bits) != need_pte_bits) - return 0; - - /* Check memory protection keys permissions. */ - if (!__pkru_allows_pkey(pte_flags_pkey(pteval), write)) - return 0; - - return 1; -} - -/* - * The performance critical leaf functions are made noinline otherwise gcc - * inlines everything into a single function which results in too much - * register pressure. - */ -static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - struct dev_pagemap *pgmap = NULL; - int nr_start = *nr, ret = 0; - pte_t *ptep, *ptem; - - /* - * Keep the original mapped PTE value (ptem) around since we - * might increment ptep off the end of the page when finishing - * our loop iteration. 
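The deleted gup_get_pte() comment above describes the lock-free protocol for reading a 64-bit PAE PTE as two 32-bit halves. A standalone model of that protocol, with C11 acquire loads standing in for the smp_rmb() barriers; this is an approximation for illustration, not the kernel code:

#include <stdatomic.h>
#include <stdint.h>

struct split_pte { _Atomic uint32_t low, high; };

static uint64_t read_split_pte(struct split_pte *p)
{
        uint32_t lo, hi;

        do {
                lo = atomic_load_explicit(&p->low,  memory_order_acquire);
                hi = atomic_load_explicit(&p->high, memory_order_acquire);
                /* Retry if 'low' changed underneath us: either a consistent
                 * PTE was observed, or 'low' lacks the present bit and the
                 * caller ignores the value anyway. */
        } while (lo != atomic_load_explicit(&p->low, memory_order_acquire));

        return ((uint64_t)hi << 32) | lo;
}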
- */ - ptem = ptep = pte_offset_map(&pmd, addr); - do { - pte_t pte = gup_get_pte(ptep); - struct page *page; - - /* Similar to the PMD case, NUMA hinting must take slow path */ - if (pte_protnone(pte)) - break; - - if (!pte_allows_gup(pte_val(pte), write)) - break; - - if (pte_devmap(pte)) { - pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); - if (unlikely(!pgmap)) { - undo_dev_pagemap(nr, nr_start, pages); - break; - } - } else if (pte_special(pte)) - break; - - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - page = pte_page(pte); - get_page(page); - put_dev_pagemap(pgmap); - SetPageReferenced(page); - pages[*nr] = page; - (*nr)++; - - } while (ptep++, addr += PAGE_SIZE, addr != end); - if (addr == end) - ret = 1; - pte_unmap(ptem); - - return ret; -} - -static inline void get_head_page_multiple(struct page *page, int nr) -{ - VM_BUG_ON_PAGE(page != compound_head(page), page); - VM_BUG_ON_PAGE(page_count(page) == 0, page); - page_ref_add(page, nr); - SetPageReferenced(page); -} - -static int __gup_device_huge(unsigned long pfn, unsigned long addr, - unsigned long end, struct page **pages, int *nr) -{ - int nr_start = *nr; - struct dev_pagemap *pgmap = NULL; - - do { - struct page *page = pfn_to_page(pfn); - - pgmap = get_dev_pagemap(pfn, pgmap); - if (unlikely(!pgmap)) { - undo_dev_pagemap(nr, nr_start, pages); - return 0; - } - SetPageReferenced(page); - pages[*nr] = page; - get_page(page); - put_dev_pagemap(pgmap); - (*nr)++; - pfn++; - } while (addr += PAGE_SIZE, addr != end); - return 1; -} - -static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, - unsigned long end, struct page **pages, int *nr) -{ - unsigned long fault_pfn; - - fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - return __gup_device_huge(fault_pfn, addr, end, pages, nr); -} - -static int __gup_device_huge_pud(pud_t pud, unsigned long addr, - unsigned long end, struct page **pages, int *nr) -{ - unsigned long fault_pfn; - - fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - return __gup_device_huge(fault_pfn, addr, end, pages, nr); -} - -static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - struct page *head, *page; - int refs; - - if (!pte_allows_gup(pmd_val(pmd), write)) - return 0; - - VM_BUG_ON(!pfn_valid(pmd_pfn(pmd))); - if (pmd_devmap(pmd)) - return __gup_device_huge_pmd(pmd, addr, end, pages, nr); - - /* hugepages are never "special" */ - VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL); - - refs = 0; - head = pmd_page(pmd); - page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - do { - VM_BUG_ON_PAGE(compound_head(page) != head, page); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); - get_head_page_multiple(head, refs); - - return 1; -} - -static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) -{ - unsigned long next; - pmd_t *pmdp; - - pmdp = pmd_offset(&pud, addr); - do { - pmd_t pmd = *pmdp; - - next = pmd_addr_end(addr, end); - if (pmd_none(pmd)) - return 0; - if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) { - /* - * NUMA hinting faults need to be handled in the GUP - * slowpath for accounting purposes and so that they - * can be serialised against THP migration. 
- */ - if (pmd_protnone(pmd)) - return 0; - if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) - return 0; - } else { - if (!gup_pte_range(pmd, addr, next, write, pages, nr)) - return 0; - } - } while (pmdp++, addr = next, addr != end); - - return 1; -} - -static noinline int gup_huge_pud(pud_t pud, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - struct page *head, *page; - int refs; - - if (!pte_allows_gup(pud_val(pud), write)) - return 0; - - VM_BUG_ON(!pfn_valid(pud_pfn(pud))); - if (pud_devmap(pud)) - return __gup_device_huge_pud(pud, addr, end, pages, nr); - - /* hugepages are never "special" */ - VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL); - - refs = 0; - head = pud_page(pud); - page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - do { - VM_BUG_ON_PAGE(compound_head(page) != head, page); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); - get_head_page_multiple(head, refs); - - return 1; -} - -static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) -{ - unsigned long next; - pud_t *pudp; - - pudp = pud_offset(&p4d, addr); - do { - pud_t pud = *pudp; - - next = pud_addr_end(addr, end); - if (pud_none(pud)) - return 0; - if (unlikely(pud_large(pud))) { - if (!gup_huge_pud(pud, addr, next, write, pages, nr)) - return 0; - } else { - if (!gup_pmd_range(pud, addr, next, write, pages, nr)) - return 0; - } - } while (pudp++, addr = next, addr != end); - - return 1; -} - -static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) -{ - unsigned long next; - p4d_t *p4dp; - - p4dp = p4d_offset(&pgd, addr); - do { - p4d_t p4d = *p4dp; - - next = p4d_addr_end(addr, end); - if (p4d_none(p4d)) - return 0; - BUILD_BUG_ON(p4d_large(p4d)); - if (!gup_pud_range(p4d, addr, next, write, pages, nr)) - return 0; - } while (p4dp++, addr = next, addr != end); - - return 1; -} - -/* - * Like get_user_pages_fast() except its IRQ-safe in that it won't fall - * back to the regular GUP. - */ -int __get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) -{ - struct mm_struct *mm = current->mm; - unsigned long addr, len, end; - unsigned long next; - unsigned long flags; - pgd_t *pgdp; - int nr = 0; - - start &= PAGE_MASK; - addr = start; - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, - (void __user *)start, len))) - return 0; - - /* - * XXX: batch / limit 'nr', to avoid large irq off latency - * needs some instrumenting to determine the common sizes used by - * important workloads (eg. DB2), and whether limiting the batch size - * will decrease performance. - * - * It seems like we're in the clear for the moment. Direct-IO is - * the main guy that batches up lots of get_user_pages, and even - * they are limited to 64-at-a-time which is not so many. - */ - /* - * This doesn't prevent pagetable teardown, but does prevent - * the pagetables and pages from being freed on x86. - * - * So long as we atomically load page table pointers versus teardown - * (which we do on x86, with the above PAE exception), we can follow the - * address down to the the page and take a ref on it. 
- */ - local_irq_save(flags); - pgdp = pgd_offset(mm, addr); - do { - pgd_t pgd = *pgdp; - - next = pgd_addr_end(addr, end); - if (pgd_none(pgd)) - break; - if (!gup_p4d_range(pgd, addr, next, write, pages, &nr)) - break; - } while (pgdp++, addr = next, addr != end); - local_irq_restore(flags); - - return nr; -} - -/** - * get_user_pages_fast() - pin user pages in memory - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. - * - * Attempt to pin user pages in memory without taking mm->mmap_sem. - * If not successful, it will fall back to taking the lock and - * calling get_user_pages(). - * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. - */ -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) -{ - struct mm_struct *mm = current->mm; - unsigned long addr, len, end; - unsigned long next; - pgd_t *pgdp; - int nr = 0; - - start &= PAGE_MASK; - addr = start; - len = (unsigned long) nr_pages << PAGE_SHIFT; - - end = start + len; - if (end < start) - goto slow_irqon; - -#ifdef CONFIG_X86_64 - if (end >> __VIRTUAL_MASK_SHIFT) - goto slow_irqon; -#endif - - /* - * XXX: batch / limit 'nr', to avoid large irq off latency - * needs some instrumenting to determine the common sizes used by - * important workloads (eg. DB2), and whether limiting the batch size - * will decrease performance. - * - * It seems like we're in the clear for the moment. Direct-IO is - * the main guy that batches up lots of get_user_pages, and even - * they are limited to 64-at-a-time which is not so many. - */ - /* - * This doesn't prevent pagetable teardown, but does prevent - * the pagetables and pages from being freed on x86. - * - * So long as we atomically load page table pointers versus teardown - * (which we do on x86, with the above PAE exception), we can follow the - * address down to the the page and take a ref on it. - */ - local_irq_disable(); - pgdp = pgd_offset(mm, addr); - do { - pgd_t pgd = *pgdp; - - next = pgd_addr_end(addr, end); - if (pgd_none(pgd)) - goto slow; - if (!gup_p4d_range(pgd, addr, next, write, pages, &nr)) - goto slow; - } while (pgdp++, addr = next, addr != end); - local_irq_enable(); - - VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); - return nr; - - { - int ret; - -slow: - local_irq_enable(); -slow_irqon: - /* Try to get the remaining pages with get_user_pages */ - start += nr << PAGE_SHIFT; - pages += nr; - - ret = get_user_pages_unlocked(start, - (end - start) >> PAGE_SHIFT, - pages, write ? FOLL_WRITE : 0); - - /* Have to be a bit careful with return values */ - if (nr > 0) { - if (ret < 0) - ret = nr; - else - ret += nr; - } - - return ret; - } -} diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index adad702b39cd..8ae0000cbdb3 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * IA-32 Huge TLB Page Support for Kernel. 
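The "have to be a bit careful with return values" logic at the end of the deleted get_user_pages_fast() boils down to one rule: pages already pinned by the lockless walk are never lost when the slow path fails. Distilled into a helper (the function name is mine, for illustration):

static int merge_gup_results(int nr_fast, int slow_ret)
{
        if (nr_fast <= 0)
                return slow_ret;                /* nothing pinned yet */
        if (slow_ret < 0)
                return nr_fast;                 /* report the partial pin, not the error */
        return nr_fast + slow_ret;              /* both paths contributed */
}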
* @@ -18,6 +19,7 @@ #include <asm/tlbflush.h> #include <asm/pgalloc.h> #include <asm/elf.h> +#include <asm/mpx.h> #if 0 /* This is just for testing */ struct page * @@ -33,7 +35,7 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) if (!vma || !is_vm_hugetlb_page(vma)) return ERR_PTR(-EINVAL); - pte = huge_pte_offset(mm, address); + pte = huge_pte_offset(mm, address, vma_mmu_pagesize(vma)); /* hugetlb should be locked, and hence, prefaulted */ WARN_ON(!pte || pte_none(*pte)); @@ -85,25 +87,38 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, info.flags = 0; info.length = len; info.low_limit = get_mmap_base(1); + + /* + * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area + * in the full address space. + */ info.high_limit = in_compat_syscall() ? - tasksize_32bit() : tasksize_64bit(); + task_size_32bit() : task_size_64bit(addr > DEFAULT_MAP_WINDOW); + info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; return vm_unmapped_area(&info); } static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, - unsigned long addr0, unsigned long len, + unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct hstate *h = hstate_file(file); struct vm_unmapped_area_info info; - unsigned long addr; info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = get_mmap_base(0); + + /* + * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area + * in the full address space. + */ + if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall()) + info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; + info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; addr = vm_unmapped_area(&info); @@ -118,7 +133,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = TASK_UNMAPPED_BASE; - info.high_limit = TASK_SIZE; + info.high_limit = TASK_SIZE_LOW; addr = vm_unmapped_area(&info); } @@ -135,6 +150,11 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (len & ~huge_page_mask(h)) return -EINVAL; + + addr = mpx_unmapped_area_check(addr, len, flags); + if (IS_ERR_VALUE(addr)) + return addr; + if (len > TASK_SIZE) return -ENOMEM; diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index adab1595f4bd..ab33a32df2a8 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Helper routines for building identity mapping page tables. This is * included by both the compressed kernel and the regular kernel. 
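The DEFAULT_MAP_WINDOW additions to the hugetlb get_unmapped_area helpers implement the larger-address-space opt-in: by default mappings stay below the legacy 47-bit window, and only an explicit hint above it widens the search. A userspace sketch of what that looks like, assuming hardware and a kernel with 5-level paging enabled and error handling trimmed:

#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdio.h>

int main(void)
{
        void *hint = (void *)(1UL << 47);       /* above DEFAULT_MAP_WINDOW */
        void *p = mmap(hint, 2UL << 20, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

        /* Without the high hint the same request is placed below the
         * 47-bit boundary for compatibility with legacy software. */
        printf("mapped at %p\n", p);
        return p == MAP_FAILED;
}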
@@ -51,7 +52,7 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, if (!pmd) return -ENOMEM; ident_pmd_init(info, pmd, addr, next); - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + set_pud(pud, __pud(__pa(pmd) | info->kernpg_flag)); } return 0; @@ -79,7 +80,7 @@ static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page, if (!pud) return -ENOMEM; ident_pud_init(info, pud, addr, next); - set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); + set_p4d(p4d, __p4d(__pa(pud) | info->kernpg_flag)); } return 0; @@ -93,6 +94,10 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, unsigned long next; int result; + /* Set the default pagetable flags if not supplied */ + if (!info->kernpg_flag) + info->kernpg_flag = _KERNPG_TABLE; + for (; addr < end; addr = next) { pgd_t *pgd = pgd_page + pgd_index(addr); p4d_t *p4d; @@ -116,14 +121,14 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, if (result) return result; if (IS_ENABLED(CONFIG_X86_5LEVEL)) { - set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE)); + set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag)); } else { /* * With p4d folded, pgd is equal to p4d. * The pgd entry has to point to the pud page table in this case. */ pud_t *pud = pud_offset(p4d, 0); - set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + set_pgd(pgd, __pgd(__pa(pud) | info->kernpg_flag)); } } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 9b3f9fa5b283..af5c1ed21d43 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -18,6 +18,8 @@ #include <asm/dma.h> /* for MAX_DMA_PFN */ #include <asm/microcode.h> #include <asm/kaslr.h> +#include <asm/hypervisor.h> +#include <asm/cpufeature.h> /* * We need to define the tracepoints somewhere, and tlb.c @@ -192,6 +194,38 @@ static void __init probe_page_size_mask(void) } } +static void setup_pcid(void) +{ +#ifdef CONFIG_X86_64 + if (boot_cpu_has(X86_FEATURE_PCID)) { + if (boot_cpu_has(X86_FEATURE_PGE)) { + /* + * This can't be cr4_set_bits_and_update_boot() -- + * the trampoline code can't handle CR4.PCIDE and + * it wouldn't do any good anyway. Despite the name, + * cr4_set_bits_and_update_boot() doesn't actually + * cause the bits in question to remain set all the + * way through the secondary boot asm. + * + * Instead, we brute-force it and set CR4.PCIDE + * manually in start_secondary(). + */ + cr4_set_bits(X86_CR4_PCIDE); + } else { + /* + * flush_tlb_all(), as currently implemented, won't + * work if PCID is on but PGE is not. Since that + * combination doesn't exist on real hardware, there's + * no reason to try to fully support it, but it's + * polite to avoid corrupting data if we're on + * an improperly configured VM. 
+ */ + setup_clear_cpu_cap(X86_FEATURE_PCID); + } + } +#endif +} + #ifdef CONFIG_X86_32 #define NR_RANGE_MR 3 #else /* CONFIG_X86_64 */ @@ -591,6 +625,7 @@ void __init init_mem_mapping(void) unsigned long end; probe_page_size_mask(); + setup_pcid(); #ifdef CONFIG_X86_64 end = max_pfn << PAGE_SHIFT; @@ -636,6 +671,8 @@ void __init init_mem_mapping(void) load_cr3(swapper_pg_dir); __flush_tlb_all(); + hypervisor_init_mem_mapping(); + early_memtest(0, max_pfn_mapped << PAGE_SHIFT); } @@ -811,10 +848,8 @@ void __init zone_sizes_init(void) } DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { -#ifdef CONFIG_SMP - .active_mm = &init_mm, - .state = 0, -#endif + .loaded_mm = &init_mm, + .next_asid = 1, .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ }; EXPORT_SYMBOL_GPL(cpu_tlbstate); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 99fb83819a5f..8a64a6f2848d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -823,15 +823,12 @@ void __init mem_init(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { - struct pglist_data *pgdata = NODE_DATA(nid); - struct zone *zone = pgdata->node_zones + - zone_for_memory(nid, start, size, ZONE_HIGHMEM, for_device); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - return __add_pages(nid, zone, start_pfn, nr_pages); + return __add_pages(nid, start_pfn, nr_pages, want_memblock); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 0a59daf799f8..048fbe8fc274 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -92,6 +92,44 @@ __setup("noexec32=", nonx32_setup); * When memory was added make sure all the processes MM have * suitable PGD entries in the local PGD level page. 
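setup_pcid(), added a little above, gates CR4.PCIDE on PGE being present because flush_tlb_all(), as implemented at the time, cannot cope with PCID on but PGE off; secondaries set the bit again manually in start_secondary(). The condensed policy, plus the consequence for later CR3 writes (the NOFLUSH detail is an assumption about the follow-on TLB code, not part of this hunk):

if (boot_cpu_has(X86_FEATURE_PCID)) {
        if (boot_cpu_has(X86_FEATURE_PGE))
                cr4_set_bits(X86_CR4_PCIDE);            /* re-set in start_secondary() */
        else
                setup_clear_cpu_cap(X86_FEATURE_PCID);  /* only odd VM configs hit this */
}

/* Once PCIDE=1, a CR3 load becomes "page-table PA | ASID", optionally with
 * bit 63 set to keep the previous context's TLB entries. */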
*/ +#ifdef CONFIG_X86_5LEVEL +void sync_global_pgds(unsigned long start, unsigned long end) +{ + unsigned long addr; + + for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) { + const pgd_t *pgd_ref = pgd_offset_k(addr); + struct page *page; + + /* Check for overflow */ + if (addr < start) + break; + + if (pgd_none(*pgd_ref)) + continue; + + spin_lock(&pgd_lock); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + spinlock_t *pgt_lock; + + pgd = (pgd_t *)page_address(page) + pgd_index(addr); + /* the pgt_lock only for Xen */ + pgt_lock = &pgd_page_get_mm(page)->page_table_lock; + spin_lock(pgt_lock); + + if (!pgd_none(*pgd_ref) && !pgd_none(*pgd)) + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + + spin_unlock(pgt_lock); + } + spin_unlock(&pgd_lock); + } +} +#else void sync_global_pgds(unsigned long start, unsigned long end) { unsigned long addr; @@ -135,6 +173,7 @@ void sync_global_pgds(unsigned long start, unsigned long end) spin_unlock(&pgd_lock); } } +#endif /* * NOTE: This function is marked __ref because it calls __init function @@ -585,6 +624,57 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, return paddr_last; } +static unsigned long __meminit +phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end, + unsigned long page_size_mask) +{ + unsigned long paddr_next, paddr_last = paddr_end; + unsigned long vaddr = (unsigned long)__va(paddr); + int i = p4d_index(vaddr); + + if (!IS_ENABLED(CONFIG_X86_5LEVEL)) + return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask); + + for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) { + p4d_t *p4d; + pud_t *pud; + + vaddr = (unsigned long)__va(paddr); + p4d = p4d_page + p4d_index(vaddr); + paddr_next = (paddr & P4D_MASK) + P4D_SIZE; + + if (paddr >= paddr_end) { + if (!after_bootmem && + !e820__mapped_any(paddr & P4D_MASK, paddr_next, + E820_TYPE_RAM) && + !e820__mapped_any(paddr & P4D_MASK, paddr_next, + E820_TYPE_RESERVED_KERN)) + set_p4d(p4d, __p4d(0)); + continue; + } + + if (!p4d_none(*p4d)) { + pud = pud_offset(p4d, 0); + paddr_last = phys_pud_init(pud, paddr, + paddr_end, + page_size_mask); + __flush_tlb_all(); + continue; + } + + pud = alloc_low_page(); + paddr_last = phys_pud_init(pud, paddr, paddr_end, + page_size_mask); + + spin_lock(&init_mm.page_table_lock); + p4d_populate(&init_mm, p4d, pud); + spin_unlock(&init_mm.page_table_lock); + } + __flush_tlb_all(); + + return paddr_last; +} + /* * Create page table mapping for the physical memory for specific physical * addresses. 
The virtual and physical addresses have to be aligned on PMD level @@ -606,26 +696,26 @@ kernel_physical_mapping_init(unsigned long paddr_start, for (; vaddr < vaddr_end; vaddr = vaddr_next) { pgd_t *pgd = pgd_offset_k(vaddr); p4d_t *p4d; - pud_t *pud; vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE; - BUILD_BUG_ON(pgd_none(*pgd)); - p4d = p4d_offset(pgd, vaddr); - if (p4d_val(*p4d)) { - pud = (pud_t *)p4d_page_vaddr(*p4d); - paddr_last = phys_pud_init(pud, __pa(vaddr), + if (pgd_val(*pgd)) { + p4d = (p4d_t *)pgd_page_vaddr(*pgd); + paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end), page_size_mask); continue; } - pud = alloc_low_page(); - paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end), + p4d = alloc_low_page(); + paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end), page_size_mask); spin_lock(&init_mm.page_table_lock); - p4d_populate(&init_mm, p4d, pud); + if (IS_ENABLED(CONFIG_X86_5LEVEL)) + pgd_populate(&init_mm, pgd, p4d); + else + p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d); spin_unlock(&init_mm.page_table_lock); pgd_changed = true; } @@ -671,7 +761,7 @@ void __init paging_init(void) * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need * updating. */ -static void update_end_of_memory_vars(u64 start, u64 size) +static void update_end_of_memory_vars(u64 start, u64 size) { unsigned long end_pfn = PFN_UP(start + size); @@ -682,29 +772,30 @@ static void update_end_of_memory_vars(u64 start, u64 size) } } -/* - * Memory is added always to NORMAL zone. This means you will never get - * additional DMA/DMA32 memory. - */ -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int add_pages(int nid, unsigned long start_pfn, + unsigned long nr_pages, bool want_memblock) { - struct pglist_data *pgdat = NODE_DATA(nid); - struct zone *zone = pgdat->node_zones + - zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - init_memory_mapping(start, start + size); - - ret = __add_pages(nid, zone, start_pfn, nr_pages); + ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); WARN_ON_ONCE(ret); /* update max_pfn, max_low_pfn and high_memory */ - update_end_of_memory_vars(start, size); + update_end_of_memory_vars(start_pfn << PAGE_SHIFT, + nr_pages << PAGE_SHIFT); return ret; } + +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + init_memory_mapping(start, start + size); + + return add_pages(nid, start_pfn, nr_pages, want_memblock); +} EXPORT_SYMBOL_GPL(arch_add_memory); #define PAGE_INUSE 0xFD diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index bbc558b88a88..34f0e1847dd6 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -13,6 +13,8 @@ #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/mmiotrace.h> +#include <linux/mem_encrypt.h> +#include <linux/efi.h> #include <asm/set_memory.h> #include <asm/e820/api.h> @@ -21,6 +23,7 @@ #include <asm/tlbflush.h> #include <asm/pgalloc.h> #include <asm/pat.h> +#include <asm/setup.h> #include "physaddr.h" @@ -106,12 +109,6 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, } /* - * Don't remap the low PCI/ISA area, it's always mapped.. 
- */ - if (is_ISA_range(phys_addr, last_addr)) - return (__force void __iomem *)phys_to_virt(phys_addr); - - /* * Don't allow anybody to remap normal RAM that we're using.. */ pfn = phys_addr >> PAGE_SHIFT; @@ -340,13 +337,17 @@ void iounmap(volatile void __iomem *addr) return; /* - * __ioremap special-cases the PCI/ISA range by not instantiating a - * vm_area and by simply returning an address into the kernel mapping - * of ISA space. So handle that here. + * The PCI/ISA range special-casing was removed from __ioremap() + * so this check, in theory, can be removed. However, there are + * cases where iounmap() is called for addresses not obtained via + * ioremap() (vga16fb for example). Add a warning so that these + * cases can be caught and fixed. */ if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) && - (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) + (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) { + WARN(1, "iounmap() called for ISA range not obtained using ioremap()\n"); return; + } addr = (volatile void __iomem *) (PAGE_MASK & (unsigned long __force)addr); @@ -399,12 +400,10 @@ void *xlate_dev_mem_ptr(phys_addr_t phys) unsigned long offset = phys & ~PAGE_MASK; void *vaddr; - /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */ - if (page_is_ram(start >> PAGE_SHIFT)) - return __va(phys); + /* memremap() maps if RAM, otherwise falls back to ioremap() */ + vaddr = memremap(start, PAGE_SIZE, MEMREMAP_WB); - vaddr = ioremap_cache(start, PAGE_SIZE); - /* Only add the offset on success and return NULL if the ioremap() failed: */ + /* Only add the offset on success and return NULL if memremap() failed */ if (vaddr) vaddr += offset; @@ -413,18 +412,270 @@ void *xlate_dev_mem_ptr(phys_addr_t phys) void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) { - if (page_is_ram(phys >> PAGE_SHIFT)) - return; + memunmap((void *)((unsigned long)addr & PAGE_MASK)); +} + +/* + * Examine the physical address to determine if it is an area of memory + * that should be mapped decrypted. If the memory is not part of the + * kernel usable area it was accessed and created decrypted, so these + * areas should be mapped decrypted. And since the encryption key can + * change across reboots, persistent memory should also be mapped + * decrypted. + */ +static bool memremap_should_map_decrypted(resource_size_t phys_addr, + unsigned long size) +{ + int is_pmem; + + /* + * Check if the address is part of a persistent memory region. + * This check covers areas added by E820, EFI and ACPI. + */ + is_pmem = region_intersects(phys_addr, size, IORESOURCE_MEM, + IORES_DESC_PERSISTENT_MEMORY); + if (is_pmem != REGION_DISJOINT) + return true; + + /* + * Check if the non-volatile attribute is set for an EFI + * reserved area. + */ + if (efi_enabled(EFI_BOOT)) { + switch (efi_mem_type(phys_addr)) { + case EFI_RESERVED_TYPE: + if (efi_mem_attributes(phys_addr) & EFI_MEMORY_NV) + return true; + break; + default: + break; + } + } + + /* Check if the address is outside kernel usable area */ + switch (e820__get_entry_type(phys_addr, phys_addr + size - 1)) { + case E820_TYPE_RESERVED: + case E820_TYPE_ACPI: + case E820_TYPE_NVS: + case E820_TYPE_UNUSABLE: + case E820_TYPE_PRAM: + return true; + default: + break; + } + + return false; +} + +/* + * Examine the physical address to determine if it is EFI data. Check + * it against the boot params structure and EFI tables and memory types. 
+ */ +static bool memremap_is_efi_data(resource_size_t phys_addr, + unsigned long size) +{ + u64 paddr; + + /* Check if the address is part of EFI boot/runtime data */ + if (!efi_enabled(EFI_BOOT)) + return false; + + paddr = boot_params.efi_info.efi_memmap_hi; + paddr <<= 32; + paddr |= boot_params.efi_info.efi_memmap; + if (phys_addr == paddr) + return true; + + paddr = boot_params.efi_info.efi_systab_hi; + paddr <<= 32; + paddr |= boot_params.efi_info.efi_systab; + if (phys_addr == paddr) + return true; + + if (efi_is_table_address(phys_addr)) + return true; + + switch (efi_mem_type(phys_addr)) { + case EFI_BOOT_SERVICES_DATA: + case EFI_RUNTIME_SERVICES_DATA: + return true; + default: + break; + } + + return false; +} + +/* + * Examine the physical address to determine if it is boot data by checking + * it against the boot params setup_data chain. + */ +static bool memremap_is_setup_data(resource_size_t phys_addr, + unsigned long size) +{ + struct setup_data *data; + u64 paddr, paddr_next; + + paddr = boot_params.hdr.setup_data; + while (paddr) { + unsigned int len; + + if (phys_addr == paddr) + return true; + + data = memremap(paddr, sizeof(*data), + MEMREMAP_WB | MEMREMAP_DEC); + + paddr_next = data->next; + len = data->len; + + memunmap(data); + + if ((phys_addr > paddr) && (phys_addr < (paddr + len))) + return true; + + paddr = paddr_next; + } + + return false; +} + +/* + * Examine the physical address to determine if it is boot data by checking + * it against the boot params setup_data chain (early boot version). + */ +static bool __init early_memremap_is_setup_data(resource_size_t phys_addr, + unsigned long size) +{ + struct setup_data *data; + u64 paddr, paddr_next; + + paddr = boot_params.hdr.setup_data; + while (paddr) { + unsigned int len; + + if (phys_addr == paddr) + return true; + + data = early_memremap_decrypted(paddr, sizeof(*data)); + + paddr_next = data->next; + len = data->len; + + early_memunmap(data, sizeof(*data)); + + if ((phys_addr > paddr) && (phys_addr < (paddr + len))) + return true; + + paddr = paddr_next; + } + + return false; +} + +/* + * Architecture function to determine if RAM remap is allowed. By default, a + * RAM remap will map the data as encrypted. Determine if a RAM remap should + * not be done so that the data will be mapped decrypted. + */ +bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size, + unsigned long flags) +{ + if (!sme_active()) + return true; + + if (flags & MEMREMAP_ENC) + return true; + + if (flags & MEMREMAP_DEC) + return false; + + if (memremap_is_setup_data(phys_addr, size) || + memremap_is_efi_data(phys_addr, size) || + memremap_should_map_decrypted(phys_addr, size)) + return false; + + return true; +} + +/* + * Architecture override of __weak function to adjust the protection attributes + * used when remapping memory. By default, early_memremap() will map the data + * as encrypted. Determine if an encrypted mapping should not be done and set + * the appropriate protection attributes. 
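arch_memremap_can_ram_remap() above decides the default encryption attribute of a memremap() RAM mapping under SME; a caller that already knows the answer can force it with the MEMREMAP_ENC/MEMREMAP_DEC flags used elsewhere in this file. A sketch of the explicit form, where shared_paddr and len are placeholders, not names from this patch:

/* Map a region that firmware or a hypervisor accesses unencrypted. */
void *virt = memremap(shared_paddr, len, MEMREMAP_WB | MEMREMAP_DEC);
if (!virt)
        return -ENOMEM;

/* ... use the shared buffer ... */

memunmap(virt);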
+ */ +pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr, + unsigned long size, + pgprot_t prot) +{ + if (!sme_active()) + return prot; + + if (early_memremap_is_setup_data(phys_addr, size) || + memremap_is_efi_data(phys_addr, size) || + memremap_should_map_decrypted(phys_addr, size)) + prot = pgprot_decrypted(prot); + else + prot = pgprot_encrypted(prot); + + return prot; +} + +bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size) +{ + return arch_memremap_can_ram_remap(phys_addr, size, 0); +} + +#ifdef CONFIG_ARCH_USE_MEMREMAP_PROT +/* Remap memory with encryption */ +void __init *early_memremap_encrypted(resource_size_t phys_addr, + unsigned long size) +{ + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC); +} + +/* + * Remap memory with encryption and write-protected - cannot be called + * before pat_init() is called + */ +void __init *early_memremap_encrypted_wp(resource_size_t phys_addr, + unsigned long size) +{ + /* Be sure the write-protect PAT entry is set for write-protect */ + if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP) + return NULL; + + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC_WP); +} + +/* Remap memory without encryption */ +void __init *early_memremap_decrypted(resource_size_t phys_addr, + unsigned long size) +{ + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC); +} + +/* + * Remap memory without encryption and write-protected - cannot be called + * before pat_init() is called + */ +void __init *early_memremap_decrypted_wp(resource_size_t phys_addr, + unsigned long size) +{ + /* Be sure the write-protect PAT entry is set for write-protect */ + if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP) + return NULL; - iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK)); + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC_WP); } +#endif /* CONFIG_ARCH_USE_MEMREMAP_PROT */ static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) { /* Don't assume we're using swapper_pg_dir at this point */ - pgd_t *base = __va(read_cr3()); + pgd_t *base = __va(read_cr3_pa()); pgd_t *pgd = &base[pgd_index(addr)]; p4d_t *p4d = p4d_offset(pgd, addr); pud_t *pud = pud_offset(p4d, addr); diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 0c7d8129bed6..8f5be3eb40dd 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #define DISABLE_BRANCH_PROFILING #define pr_fmt(fmt) "kasan: " fmt #include <linux/bootmem.h> @@ -11,8 +12,8 @@ #include <asm/e820/types.h> #include <asm/tlbflush.h> #include <asm/sections.h> +#include <asm/pgtable.h> -extern pgd_t early_level4_pgt[PTRS_PER_PGD]; extern struct range pfn_mapped[E820_MAX_ENTRIES]; static int __init map_range(struct range *range) @@ -23,12 +24,7 @@ static int __init map_range(struct range *range) start = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->start)); end = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->end)); - /* - * end + 1 here is intentional. We check several shadow bytes in advance - * to slightly speed up fastpath. In some rare cases we could cross - * boundary of mapped shadow, so we just map some more here. 
- */ - return vmemmap_populate(start, end + 1, NUMA_NO_NODE); + return vmemmap_populate(start, end, NUMA_NO_NODE); } static void __init clear_pgds(unsigned long start, @@ -92,7 +88,7 @@ static struct notifier_block kasan_die_notifier = { void __init kasan_early_init(void) { int i; - pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL; + pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL | _PAGE_ENC; pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE; pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE; p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE; @@ -109,8 +105,8 @@ void __init kasan_early_init(void) for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++) kasan_zero_p4d[i] = __p4d(p4d_val); - kasan_map_early_shadow(early_level4_pgt); - kasan_map_early_shadow(init_level4_pgt); + kasan_map_early_shadow(early_top_pgt); + kasan_map_early_shadow(init_top_pgt); } void __init kasan_init(void) @@ -121,8 +117,8 @@ void __init kasan_init(void) register_die_notifier(&kasan_die_notifier); #endif - memcpy(early_level4_pgt, init_level4_pgt, sizeof(early_level4_pgt)); - load_cr3(early_level4_pgt); + memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt)); + load_cr3(early_top_pgt); __flush_tlb_all(); clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); @@ -148,7 +144,7 @@ void __init kasan_init(void) kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), (void *)KASAN_SHADOW_END); - load_cr3(init_level4_pgt); + load_cr3(init_top_pgt); __flush_tlb_all(); /* @@ -158,7 +154,7 @@ void __init kasan_init(void) */ memset(kasan_zero_page, 0, PAGE_SIZE); for (i = 0; i < PTRS_PER_PTE; i++) { - pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO); + pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO | _PAGE_ENC); set_pte(&kasan_zero_pte[i], pte); } /* Flush TLBs again to be sure that write protection applied. */ diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index aed206475aa7..879ef930e2c2 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * This file implements KASLR memory randomization for x86_64. It randomizes * the virtual address space of kernel memory regions (physical memory @@ -6,12 +7,12 @@ * * Entropy is generated using the KASLR early boot functions now shared in * the lib directory (originally written by Kees Cook). Randomization is - * done on PGD & PUD page table levels to increase possible addresses. The - * physical memory mapping code was adapted to support PUD level virtual - * addresses. This implementation on the best configuration provides 30,000 - * possible virtual addresses in average for each memory region. An additional - * low memory page is used to ensure each CPU can start with a PGD aligned - * virtual address (for realmode). + * done on PGD & P4D/PUD page table levels to increase possible addresses. + * The physical memory mapping code was adapted to support P4D/PUD level + * virtual addresses. This implementation on the best configuration provides + * 30,000 possible virtual addresses in average for each memory region. + * An additional low memory page is used to ensure each CPU can start with + * a PGD aligned virtual address (for realmode). * * The order of each memory region is not changed. 
The feature looks at * the available space for the regions based on different configuration @@ -70,7 +71,7 @@ static __initdata struct kaslr_memory_region { unsigned long *base; unsigned long size_tb; } kaslr_regions[] = { - { &page_offset_base, 64/* Maximum */ }, + { &page_offset_base, 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT) /* Maximum */ }, { &vmalloc_base, VMALLOC_SIZE_TB }, { &vmemmap_base, 1 }, }; @@ -142,7 +143,10 @@ void __init kernel_randomize_memory(void) */ entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i); prandom_bytes_state(&rand_state, &rand, sizeof(rand)); - entropy = (rand % (entropy + 1)) & PUD_MASK; + if (IS_ENABLED(CONFIG_X86_5LEVEL)) + entropy = (rand % (entropy + 1)) & P4D_MASK; + else + entropy = (rand % (entropy + 1)) & PUD_MASK; vaddr += entropy; *kaslr_regions[i].base = vaddr; @@ -151,27 +155,21 @@ void __init kernel_randomize_memory(void) * randomization alignment. */ vaddr += get_padding(&kaslr_regions[i]); - vaddr = round_up(vaddr + 1, PUD_SIZE); + if (IS_ENABLED(CONFIG_X86_5LEVEL)) + vaddr = round_up(vaddr + 1, P4D_SIZE); + else + vaddr = round_up(vaddr + 1, PUD_SIZE); remain_entropy -= entropy; } } -/* - * Create PGD aligned trampoline table to allow real mode initialization - * of additional CPUs. Consume only 1 low memory page. - */ -void __meminit init_trampoline(void) +static void __meminit init_trampoline_pud(void) { unsigned long paddr, paddr_next; pgd_t *pgd; pud_t *pud_page, *pud_page_tramp; int i; - if (!kaslr_memory_enabled()) { - init_trampoline_default(); - return; - } - pud_page_tramp = alloc_low_page(); paddr = 0; @@ -192,3 +190,49 @@ void __meminit init_trampoline(void) set_pgd(&trampoline_pgd_entry, __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); } + +static void __meminit init_trampoline_p4d(void) +{ + unsigned long paddr, paddr_next; + pgd_t *pgd; + p4d_t *p4d_page, *p4d_page_tramp; + int i; + + p4d_page_tramp = alloc_low_page(); + + paddr = 0; + pgd = pgd_offset_k((unsigned long)__va(paddr)); + p4d_page = (p4d_t *) pgd_page_vaddr(*pgd); + + for (i = p4d_index(paddr); i < PTRS_PER_P4D; i++, paddr = paddr_next) { + p4d_t *p4d, *p4d_tramp; + unsigned long vaddr = (unsigned long)__va(paddr); + + p4d_tramp = p4d_page_tramp + p4d_index(paddr); + p4d = p4d_page + p4d_index(vaddr); + paddr_next = (paddr & P4D_MASK) + P4D_SIZE; + + *p4d_tramp = *p4d; + } + + set_pgd(&trampoline_pgd_entry, + __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp))); +} + +/* + * Create PGD aligned trampoline table to allow real mode initialization + * of additional CPUs. Consume only 1 low memory page. 
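The kaslr_regions[] change replaces the hard-coded 64 TB maximum for the direct mapping with 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT), so the limit follows the configured physical address width. A quick check with the values these kernels used (46 bits for 4-level paging, 52 with CONFIG_X86_5LEVEL; both assumed from the era of this diff):

#include <stdio.h>

int main(void)
{
        const int TB_SHIFT = 40;

        printf("4-level: %d TB\n", 1 << (46 - TB_SHIFT));   /* 64 TB, the old constant */
        printf("5-level: %d TB\n", 1 << (52 - TB_SHIFT));   /* 4096 TB */
        return 0;
}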
+ */ +void __meminit init_trampoline(void) +{ + + if (!kaslr_memory_enabled()) { + init_trampoline_default(); + return; + } + + if (IS_ENABLED(CONFIG_X86_5LEVEL)) + init_trampoline_p4d(); + else + init_trampoline_pud(); +} diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c index dab41876cdd5..872ec4159a68 100644 --- a/arch/x86/mm/kmemcheck/error.c +++ b/arch/x86/mm/kmemcheck/error.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/interrupt.h> #include <linux/kdebug.h> #include <linux/kmemcheck.h> diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h index 0efc2e8d0a20..39f80d7a874d 100644 --- a/arch/x86/mm/kmemcheck/error.h +++ b/arch/x86/mm/kmemcheck/error.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H #define ARCH__X86__MM__KMEMCHECK__ERROR_H diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c index 324aa3f07237..df8109ddf7fe 100644 --- a/arch/x86/mm/kmemcheck/opcode.c +++ b/arch/x86/mm/kmemcheck/opcode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/types.h> #include "opcode.h" diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h index 6956aad66b5b..51a1ce94c24a 100644 --- a/arch/x86/mm/kmemcheck/opcode.h +++ b/arch/x86/mm/kmemcheck/opcode.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H #define ARCH__X86__MM__KMEMCHECK__OPCODE_H diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c index 4ead26eeaf96..8a03be90272a 100644 --- a/arch/x86/mm/kmemcheck/pte.c +++ b/arch/x86/mm/kmemcheck/pte.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/mm.h> #include <asm/pgtable.h> diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h index 9f5966456492..b595612382c2 100644 --- a/arch/x86/mm/kmemcheck/pte.h +++ b/arch/x86/mm/kmemcheck/pte.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef ARCH__X86__MM__KMEMCHECK__PTE_H #define ARCH__X86__MM__KMEMCHECK__PTE_H diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c index aef7140c0063..7ce0be1f99eb 100644 --- a/arch/x86/mm/kmemcheck/selftest.c +++ b/arch/x86/mm/kmemcheck/selftest.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/bug.h> #include <linux/kernel.h> diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h index 8fed4fe11f95..8d759aae453d 100644 --- a/arch/x86/mm/kmemcheck/selftest.h +++ b/arch/x86/mm/kmemcheck/selftest.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H #define ARCH_X86_MM_KMEMCHECK_SELFTEST_H diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h index ff0b2f70fbcb..49768dc18664 100644 --- a/arch/x86/mm/kmemcheck/shadow.h +++ b/arch/x86/mm/kmemcheck/shadow.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H #define ARCH__X86__MM__KMEMCHECK__SHADOW_H diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index afc47f5c9531..c21c2ed04612 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* Support for MMIO probes. * Benfit many code from kprobes * (C) 2002 Louis Zhuang <louis.zhuang@intel.com>. 
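The repeated SPDX-License-Identifier hunks in the kmemcheck and kmmio files follow one small convention, visible in the error.c/error.h pair above: C sources take the C++-style form on their first line, while headers keep a block comment.

/* .c files: */
// SPDX-License-Identifier: GPL-2.0
/* .h files: */
/* SPDX-License-Identifier: GPL-2.0 */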
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c new file mode 100644 index 000000000000..16c5f37933a2 --- /dev/null +++ b/arch/x86/mm/mem_encrypt.c @@ -0,0 +1,595 @@ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#define DISABLE_BRANCH_PROFILING + +#include <linux/linkage.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/dma-mapping.h> +#include <linux/swiotlb.h> +#include <linux/mem_encrypt.h> + +#include <asm/tlbflush.h> +#include <asm/fixmap.h> +#include <asm/setup.h> +#include <asm/bootparam.h> +#include <asm/set_memory.h> +#include <asm/cacheflush.h> +#include <asm/sections.h> +#include <asm/processor-flags.h> +#include <asm/msr.h> +#include <asm/cmdline.h> + +static char sme_cmdline_arg[] __initdata = "mem_encrypt"; +static char sme_cmdline_on[] __initdata = "on"; +static char sme_cmdline_off[] __initdata = "off"; + +/* + * Since SME related variables are set early in the boot process they must + * reside in the .data section so as not to be zeroed out when the .bss + * section is later cleared. + */ +u64 sme_me_mask __section(.data) = 0; +EXPORT_SYMBOL_GPL(sme_me_mask); + +/* Buffer used for early in-place encryption by BSP, no locking needed */ +static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE); + +/* + * This routine does not change the underlying encryption setting of the + * page(s) that map this memory. It assumes that eventually the memory is + * meant to be accessed as either encrypted or decrypted but the contents + * are currently not in the desired state. + * + * This routine follows the steps outlined in the AMD64 Architecture + * Programmer's Manual Volume 2, Section 7.10.8 Encrypt-in-Place. + */ +static void __init __sme_early_enc_dec(resource_size_t paddr, + unsigned long size, bool enc) +{ + void *src, *dst; + size_t len; + + if (!sme_me_mask) + return; + + local_flush_tlb(); + wbinvd(); + + /* + * There are limited number of early mapping slots, so map (at most) + * one page at time. + */ + while (size) { + len = min_t(size_t, sizeof(sme_early_buffer), size); + + /* + * Create mappings for the current and desired format of + * the memory. Use a write-protected mapping for the source. + */ + src = enc ? early_memremap_decrypted_wp(paddr, len) : + early_memremap_encrypted_wp(paddr, len); + + dst = enc ? early_memremap_encrypted(paddr, len) : + early_memremap_decrypted(paddr, len); + + /* + * If a mapping can't be obtained to perform the operation, + * then eventual access of that area in the desired mode + * will cause a crash. + */ + BUG_ON(!src || !dst); + + /* + * Use a temporary buffer, of cache-line multiple size, to + * avoid data corruption as documented in the APM. 
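sme_me_mask, declared above, is the single physical-address bit OR-ed into mappings to request encryption. How it gets its value lies outside the hunks shown here, but the mechanism can be sketched from userspace: CPUID leaf 0x8000001f advertises SME in EAX bit 0 and the C-bit position in EBX[5:0]; the kernel's mask is assumed to be 1 shifted by that position. This is a contextual sketch, not code from this patch:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid(0x8000001f, &eax, &ebx, &ecx, &edx) || !(eax & 1)) {
                puts("SME not advertised");
                return 0;
        }
        printf("C-bit %u -> sme_me_mask 0x%llx\n",
               ebx & 0x3f, 1ULL << (ebx & 0x3f));
        return 0;
}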
+ */ + memcpy(sme_early_buffer, src, len); + memcpy(dst, sme_early_buffer, len); + + early_memunmap(dst, len); + early_memunmap(src, len); + + paddr += len; + size -= len; + } +} + +void __init sme_early_encrypt(resource_size_t paddr, unsigned long size) +{ + __sme_early_enc_dec(paddr, size, true); +} + +void __init sme_early_decrypt(resource_size_t paddr, unsigned long size) +{ + __sme_early_enc_dec(paddr, size, false); +} + +static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size, + bool map) +{ + unsigned long paddr = (unsigned long)vaddr - __PAGE_OFFSET; + pmdval_t pmd_flags, pmd; + + /* Use early_pmd_flags but remove the encryption mask */ + pmd_flags = __sme_clr(early_pmd_flags); + + do { + pmd = map ? (paddr & PMD_MASK) + pmd_flags : 0; + __early_make_pgtable((unsigned long)vaddr, pmd); + + vaddr += PMD_SIZE; + paddr += PMD_SIZE; + size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE; + } while (size); + + __native_flush_tlb(); +} + +void __init sme_unmap_bootdata(char *real_mode_data) +{ + struct boot_params *boot_data; + unsigned long cmdline_paddr; + + if (!sme_active()) + return; + + /* Get the command line address before unmapping the real_mode_data */ + boot_data = (struct boot_params *)real_mode_data; + cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32); + + __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), false); + + if (!cmdline_paddr) + return; + + __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, false); +} + +void __init sme_map_bootdata(char *real_mode_data) +{ + struct boot_params *boot_data; + unsigned long cmdline_paddr; + + if (!sme_active()) + return; + + __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), true); + + /* Get the command line address after mapping the real_mode_data */ + boot_data = (struct boot_params *)real_mode_data; + cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32); + + if (!cmdline_paddr) + return; + + __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true); +} + +void __init sme_early_init(void) +{ + unsigned int i; + + if (!sme_me_mask) + return; + + early_pmd_flags = __sme_set(early_pmd_flags); + + __supported_pte_mask = __sme_set(__supported_pte_mask); + + /* Update the protection map with memory encryption mask */ + for (i = 0; i < ARRAY_SIZE(protection_map); i++) + protection_map[i] = pgprot_encrypted(protection_map[i]); +} + +/* Architecture __weak replacement functions */ +void __init mem_encrypt_init(void) +{ + if (!sme_me_mask) + return; + + /* Call into SWIOTLB to update the SWIOTLB DMA buffers */ + swiotlb_update_mem_attributes(); + + pr_info("AMD Secure Memory Encryption (SME) active\n"); +} + +void swiotlb_set_mem_attributes(void *vaddr, unsigned long size) +{ + WARN(PAGE_ALIGN(size) != size, + "size is not page-aligned (%#lx)\n", size); + + /* Make the SWIOTLB buffer area decrypted */ + set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT); +} + +static void __init sme_clear_pgd(pgd_t *pgd_base, unsigned long start, + unsigned long end) +{ + unsigned long pgd_start, pgd_end, pgd_size; + pgd_t *pgd_p; + + pgd_start = start & PGDIR_MASK; + pgd_end = end & PGDIR_MASK; + + pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1); + pgd_size *= sizeof(pgd_t); + + pgd_p = pgd_base + pgd_index(start); + + memset(pgd_p, 0, pgd_size); +} + +#define PGD_FLAGS _KERNPG_TABLE_NOENC +#define P4D_FLAGS _KERNPG_TABLE_NOENC +#define PUD_FLAGS _KERNPG_TABLE_NOENC +#define PMD_FLAGS 
(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL) + +static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area, + unsigned long vaddr, pmdval_t pmd_val) +{ + pgd_t *pgd_p; + p4d_t *p4d_p; + pud_t *pud_p; + pmd_t *pmd_p; + + pgd_p = pgd_base + pgd_index(vaddr); + if (native_pgd_val(*pgd_p)) { + if (IS_ENABLED(CONFIG_X86_5LEVEL)) + p4d_p = (p4d_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK); + else + pud_p = (pud_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK); + } else { + pgd_t pgd; + + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d_p = pgtable_area; + memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D); + pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D; + + pgd = native_make_pgd((pgdval_t)p4d_p + PGD_FLAGS); + } else { + pud_p = pgtable_area; + memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); + pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; + + pgd = native_make_pgd((pgdval_t)pud_p + PGD_FLAGS); + } + native_set_pgd(pgd_p, pgd); + } + + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d_p += p4d_index(vaddr); + if (native_p4d_val(*p4d_p)) { + pud_p = (pud_t *)(native_p4d_val(*p4d_p) & ~PTE_FLAGS_MASK); + } else { + p4d_t p4d; + + pud_p = pgtable_area; + memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); + pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; + + p4d = native_make_p4d((pudval_t)pud_p + P4D_FLAGS); + native_set_p4d(p4d_p, p4d); + } + } + + pud_p += pud_index(vaddr); + if (native_pud_val(*pud_p)) { + if (native_pud_val(*pud_p) & _PAGE_PSE) + goto out; + + pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK); + } else { + pud_t pud; + + pmd_p = pgtable_area; + memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); + pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD; + + pud = native_make_pud((pmdval_t)pmd_p + PUD_FLAGS); + native_set_pud(pud_p, pud); + } + + pmd_p += pmd_index(vaddr); + if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE)) + native_set_pmd(pmd_p, native_make_pmd(pmd_val)); + +out: + return pgtable_area; +} + +static unsigned long __init sme_pgtable_calc(unsigned long len) +{ + unsigned long p4d_size, pud_size, pmd_size; + unsigned long total; + + /* + * Perform a relatively simplistic calculation of the pagetable + * entries that are needed. That mappings will be covered by 2MB + * PMD entries so we can conservatively calculate the required + * number of P4D, PUD and PMD structures needed to perform the + * mappings. Incrementing the count for each covers the case where + * the addresses cross entries. + */ + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1; + p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D; + pud_size = (ALIGN(len, P4D_SIZE) / P4D_SIZE) + 1; + pud_size *= sizeof(pud_t) * PTRS_PER_PUD; + } else { + p4d_size = 0; + pud_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1; + pud_size *= sizeof(pud_t) * PTRS_PER_PUD; + } + pmd_size = (ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1; + pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD; + + total = p4d_size + pud_size + pmd_size; + + /* + * Now calculate the added pagetable structures needed to populate + * the new pagetables. 
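+ * (This second pass is sized from 'total' rather than 'len': it accounts for the entries that will map the new pagetable area itself.)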
+ */ + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE; + p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D; + pud_size = ALIGN(total, P4D_SIZE) / P4D_SIZE; + pud_size *= sizeof(pud_t) * PTRS_PER_PUD; + } else { + p4d_size = 0; + pud_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE; + pud_size *= sizeof(pud_t) * PTRS_PER_PUD; + } + pmd_size = ALIGN(total, PUD_SIZE) / PUD_SIZE; + pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD; + + total += p4d_size + pud_size + pmd_size; + + return total; +} + +void __init sme_encrypt_kernel(void) +{ + unsigned long workarea_start, workarea_end, workarea_len; + unsigned long execute_start, execute_end, execute_len; + unsigned long kernel_start, kernel_end, kernel_len; + unsigned long pgtable_area_len; + unsigned long paddr, pmd_flags; + unsigned long decrypted_base; + void *pgtable_area; + pgd_t *pgd; + + if (!sme_active()) + return; + + /* + * Prepare for encrypting the kernel by building new pagetables with + * the necessary attributes needed to encrypt the kernel in place. + * + * One range of virtual addresses will map the memory occupied + * by the kernel as encrypted. + * + * Another range of virtual addresses will map the memory occupied + * by the kernel as decrypted and write-protected. + * + * The use of the write-protect attribute will prevent any of the + * memory from being cached. + */ + + /* Physical addresses give us the identity mapped virtual addresses */ + kernel_start = __pa_symbol(_text); + kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE); + kernel_len = kernel_end - kernel_start; + + /* Set the encryption workarea to be immediately after the kernel */ + workarea_start = kernel_end; + + /* + * Calculate the number of workarea bytes needed: + * executable encryption area size: + * stack page (PAGE_SIZE) + * encryption routine page (PAGE_SIZE) + * intermediate copy buffer (PMD_PAGE_SIZE) + * pagetable structures for the encryption of the kernel + * pagetable structures for workarea (in case not currently mapped) + */ + execute_start = workarea_start; + execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE; + execute_len = execute_end - execute_start; + + /* + * One PGD for both encrypted and decrypted mappings and a set of + * PUDs and PMDs for each of the encrypted and decrypted mappings. + */ + pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD; + pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2; + + /* PUDs and PMDs needed in the current pagetables for the workarea */ + pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len); + + /* + * The total workarea includes the executable encryption area and + * the pagetable area. + */ + workarea_len = execute_len + pgtable_area_len; + workarea_end = workarea_start + workarea_len; + + /* + * Set the address to the start of where newly created pagetable + * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable + * structures are created when the workarea is added to the current + * pagetables and when the new encrypted and decrypted kernel + * mappings are populated. + */ + pgtable_area = (void *)execute_end; + + /* + * Make sure the current pagetable structure has entries for + * addressing the workarea.
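+ * (The workarea sits just past the kernel image and may not be mapped yet, so entries are added to the live pagetables read from CR3 below before anything is written there.)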
+ */ + pgd = (pgd_t *)native_read_cr3_pa(); + paddr = workarea_start; + while (paddr < workarea_end) { + pgtable_area = sme_populate_pgd(pgd, pgtable_area, + paddr, + paddr + PMD_FLAGS); + + paddr += PMD_PAGE_SIZE; + } + + /* Flush the TLB - no globals so cr3 is enough */ + native_write_cr3(__native_read_cr3()); + + /* + * A new pagetable structure is being built to allow for the kernel + * to be encrypted. It starts with an empty PGD that will then be + * populated with new PUDs and PMDs as the encrypted and decrypted + * kernel mappings are created. + */ + pgd = pgtable_area; + memset(pgd, 0, sizeof(*pgd) * PTRS_PER_PGD); + pgtable_area += sizeof(*pgd) * PTRS_PER_PGD; + + /* Add encrypted kernel (identity) mappings */ + pmd_flags = PMD_FLAGS | _PAGE_ENC; + paddr = kernel_start; + while (paddr < kernel_end) { + pgtable_area = sme_populate_pgd(pgd, pgtable_area, + paddr, + paddr + pmd_flags); + + paddr += PMD_PAGE_SIZE; + } + + /* + * A different PGD index/entry must be used to get different + * pagetable entries for the decrypted mapping. Choose the next + * PGD index and convert it to a virtual address to be used as + * the base of the mapping. + */ + decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1); + decrypted_base <<= PGDIR_SHIFT; + + /* Add decrypted, write-protected kernel (non-identity) mappings */ + pmd_flags = (PMD_FLAGS & ~_PAGE_CACHE_MASK) | (_PAGE_PAT | _PAGE_PWT); + paddr = kernel_start; + while (paddr < kernel_end) { + pgtable_area = sme_populate_pgd(pgd, pgtable_area, + paddr + decrypted_base, + paddr + pmd_flags); + + paddr += PMD_PAGE_SIZE; + } + + /* Add decrypted workarea mappings to both kernel mappings */ + paddr = workarea_start; + while (paddr < workarea_end) { + pgtable_area = sme_populate_pgd(pgd, pgtable_area, + paddr, + paddr + PMD_FLAGS); + + pgtable_area = sme_populate_pgd(pgd, pgtable_area, + paddr + decrypted_base, + paddr + PMD_FLAGS); + + paddr += PMD_PAGE_SIZE; + } + + /* Perform the encryption */ + sme_encrypt_execute(kernel_start, kernel_start + decrypted_base, + kernel_len, workarea_start, (unsigned long)pgd); + + /* + * At this point we are running encrypted. Remove the mappings for + * the decrypted areas - all that is needed for this is to remove + * the PGD entry/entries. 
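+ * (sme_clear_pgd() below simply zeroes the PGD entries covering the two decrypted_base ranges; the CR3 write that follows flushes the stale non-global translations.)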
+ */ + sme_clear_pgd(pgd, kernel_start + decrypted_base, + kernel_end + decrypted_base); + + sme_clear_pgd(pgd, workarea_start + decrypted_base, + workarea_end + decrypted_base); + + /* Flush the TLB - no globals so cr3 is enough */ + native_write_cr3(__native_read_cr3()); +} + +void __init __nostackprotector sme_enable(struct boot_params *bp) +{ + const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off; + unsigned int eax, ebx, ecx, edx; + bool active_by_default; + unsigned long me_mask; + char buffer[16]; + u64 msr; + + /* Check for the SME support leaf */ + eax = 0x80000000; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + if (eax < 0x8000001f) + return; + + /* + * Check for the SME feature: + * CPUID Fn8000_001F[EAX] - Bit 0 + * Secure Memory Encryption support + * CPUID Fn8000_001F[EBX] - Bits 5:0 + * Pagetable bit position used to indicate encryption + */ + eax = 0x8000001f; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + if (!(eax & 1)) + return; + + me_mask = 1UL << (ebx & 0x3f); + + /* Check if SME is enabled */ + msr = __rdmsr(MSR_K8_SYSCFG); + if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + return; + + /* + * Fixups have not been applied to phys_base yet and we're running + * identity mapped, so we must obtain the address to the SME command + * line argument data using rip-relative addressing. + */ + asm ("lea sme_cmdline_arg(%%rip), %0" + : "=r" (cmdline_arg) + : "p" (sme_cmdline_arg)); + asm ("lea sme_cmdline_on(%%rip), %0" + : "=r" (cmdline_on) + : "p" (sme_cmdline_on)); + asm ("lea sme_cmdline_off(%%rip), %0" + : "=r" (cmdline_off) + : "p" (sme_cmdline_off)); + + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT)) + active_by_default = true; + else + active_by_default = false; + + cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr | + ((u64)bp->ext_cmd_line_ptr << 32)); + + cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)); + + if (!strncmp(buffer, cmdline_on, sizeof(buffer))) + sme_me_mask = me_mask; + else if (!strncmp(buffer, cmdline_off, sizeof(buffer))) + sme_me_mask = 0; + else + sme_me_mask = active_by_default ? me_mask : 0; +} diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S new file mode 100644 index 000000000000..730e6d541df1 --- /dev/null +++ b/arch/x86/mm/mem_encrypt_boot.S @@ -0,0 +1,149 @@ +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Tom Lendacky <thomas.lendacky@amd.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/linkage.h> +#include <asm/pgtable.h> +#include <asm/page.h> +#include <asm/processor-flags.h> +#include <asm/msr-index.h> + + .text + .code64 +ENTRY(sme_encrypt_execute) + + /* + * Entry parameters: + * RDI - virtual address for the encrypted kernel mapping + * RSI - virtual address for the decrypted kernel mapping + * RDX - length of kernel + * RCX - virtual address of the encryption workarea, including: + * - stack page (PAGE_SIZE) + * - encryption routine page (PAGE_SIZE) + * - intermediate copy buffer (PMD_PAGE_SIZE) + * R8 - physical address of the pagetables to use for encryption + */ + + push %rbp + movq %rsp, %rbp /* RBP now has original stack pointer */ + + /* Set up a one page stack in the non-encrypted memory area */ + movq %rcx, %rax /* Workarea stack page */ + leaq PAGE_SIZE(%rax), %rsp /* Set new stack pointer */ + addq $PAGE_SIZE, %rax /* Workarea encryption routine */ + + push %r12 + movq %rdi, %r10 /* Encrypted kernel */ + movq %rsi, %r11 /* Decrypted kernel */ + movq %rdx, %r12 /* Kernel length */ + + /* Copy encryption routine into the workarea */ + movq %rax, %rdi /* Workarea encryption routine */ + leaq __enc_copy(%rip), %rsi /* Encryption routine */ + movq $(.L__enc_copy_end - __enc_copy), %rcx /* Encryption routine length */ + rep movsb + + /* Setup registers for call */ + movq %r10, %rdi /* Encrypted kernel */ + movq %r11, %rsi /* Decrypted kernel */ + movq %r8, %rdx /* Pagetables used for encryption */ + movq %r12, %rcx /* Kernel length */ + movq %rax, %r8 /* Workarea encryption routine */ + addq $PAGE_SIZE, %r8 /* Workarea intermediate copy buffer */ + + call *%rax /* Call the encryption routine */ + + pop %r12 + + movq %rbp, %rsp /* Restore original stack pointer */ + pop %rbp + + ret +ENDPROC(sme_encrypt_execute) + +ENTRY(__enc_copy) +/* + * Routine used to encrypt the kernel. + * This routine must be run outside of the kernel proper since + * the kernel will be encrypted during the process. So this + * routine is defined here and then copied to an area outside + * of the kernel where it will remain and run decrypted + * during execution. + * + * On entry the registers must be: + * RDI - virtual address for the encrypted kernel mapping + * RSI - virtual address for the decrypted kernel mapping + * RDX - address of the pagetables to use for encryption + * RCX - length of kernel + * R8 - intermediate copy buffer + * + * RAX - points to this routine + * + * The kernel will be encrypted by copying from the non-encrypted + * kernel space to an intermediate buffer and then copying from the + * intermediate buffer back to the encrypted kernel space. The physical + * addresses of the two kernel space mappings are the same, which + * results in the kernel being encrypted "in place".
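+ * (The write-protect type for the decrypted source mapping comes from PAT entry PA5, which is temporarily reprogrammed to WP below and restored before returning.)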
+ */ + /* Enable the new page tables */ + mov %rdx, %cr3 + + /* Flush any global TLBs */ + mov %cr4, %rdx + andq $~X86_CR4_PGE, %rdx + mov %rdx, %cr4 + orq $X86_CR4_PGE, %rdx + mov %rdx, %cr4 + + /* Set the PAT register PA5 entry to write-protect */ + push %rcx + movl $MSR_IA32_CR_PAT, %ecx + rdmsr + push %rdx /* Save original PAT value */ + andl $0xffff00ff, %edx /* Clear PA5 */ + orl $0x00000500, %edx /* Set PA5 to WP */ + wrmsr + pop %rdx /* RDX contains original PAT value */ + pop %rcx + + movq %rcx, %r9 /* Save kernel length */ + movq %rdi, %r10 /* Save encrypted kernel address */ + movq %rsi, %r11 /* Save decrypted kernel address */ + + wbinvd /* Invalidate any cache entries */ + + /* Copy/encrypt 2MB at a time */ +1: + movq %r11, %rsi /* Source - decrypted kernel */ + movq %r8, %rdi /* Dest - intermediate copy buffer */ + movq $PMD_PAGE_SIZE, %rcx /* 2MB length */ + rep movsb + + movq %r8, %rsi /* Source - intermediate copy buffer */ + movq %r10, %rdi /* Dest - encrypted kernel */ + movq $PMD_PAGE_SIZE, %rcx /* 2MB length */ + rep movsb + + addq $PMD_PAGE_SIZE, %r11 + addq $PMD_PAGE_SIZE, %r10 + subq $PMD_PAGE_SIZE, %r9 /* Kernel length decrement */ + jnz 1b /* Kernel length not zero? */ + + /* Restore PAT register */ + push %rdx /* Save original PAT value */ + movl $MSR_IA32_CR_PAT, %ecx + rdmsr + pop %rdx /* Restore original PAT value */ + wrmsr + + ret +.L__enc_copy_end: +ENDPROC(__enc_copy) diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h index 62474ba66c8e..4e1f6e1b8159 100644 --- a/arch/x86/mm/mm_internal.h +++ b/arch/x86/mm/mm_internal.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __X86_MM_INTERNAL_H #define __X86_MM_INTERNAL_H diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 19ad095b41df..a99679826846 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -37,22 +37,21 @@ struct va_alignment __read_mostly va_align = { .flags = -1, }; -unsigned long tasksize_32bit(void) +unsigned long task_size_32bit(void) { return IA32_PAGE_OFFSET; } -unsigned long tasksize_64bit(void) +unsigned long task_size_64bit(int full_addr_space) { - return TASK_SIZE_MAX; + return full_addr_space ? TASK_SIZE_MAX : DEFAULT_MAP_WINDOW; } static unsigned long stack_maxrandom_size(unsigned long task_size) { unsigned long max = 0; - if ((current->flags & PF_RANDOMIZE) && - !(current->personality & ADDR_NO_RANDOMIZE)) { - max = (-1UL) & __STACK_RND_MASK(task_size == tasksize_32bit()); + if (current->flags & PF_RANDOMIZE) { + max = (-1UL) & __STACK_RND_MASK(task_size == task_size_32bit()); max <<= PAGE_SHIFT; } @@ -74,34 +73,36 @@ static int mmap_is_legacy(void) if (current->personality & ADDR_COMPAT_LAYOUT) return 1; - if (rlimit(RLIMIT_STACK) == RLIM_INFINITY) - return 1; - return sysctl_legacy_va_layout; } static unsigned long arch_rnd(unsigned int rndbits) { + if (!(current->flags & PF_RANDOMIZE)) + return 0; return (get_random_long() & ((1UL << rndbits) - 1)) << PAGE_SHIFT; } unsigned long arch_mmap_rnd(void) { - if (!(current->flags & PF_RANDOMIZE)) - return 0; return arch_rnd(mmap_is_ia32() ? mmap32_rnd_bits : mmap64_rnd_bits); } static unsigned long mmap_base(unsigned long rnd, unsigned long task_size) { unsigned long gap = rlimit(RLIMIT_STACK); + unsigned long pad = stack_maxrandom_size(task_size) + stack_guard_gap; unsigned long gap_min, gap_max; + /* Values close to RLIM_INFINITY can overflow. */ + if (gap + pad > gap) + gap += pad; + /* * Top of mmap area (just below the process stack). 
* Leave an at least ~128 MB hole with possible stack randomization. */ - gap_min = SIZE_128M + stack_maxrandom_size(task_size); + gap_min = SIZE_128M; gap_max = (task_size / 6) * 5; if (gap < gap_min) @@ -140,7 +141,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) mm->get_unmapped_area = arch_get_unmapped_area_topdown; arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base, - arch_rnd(mmap64_rnd_bits), tasksize_64bit()); + arch_rnd(mmap64_rnd_bits), task_size_64bit(0)); #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES /* @@ -150,7 +151,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) * mmap_base, the compat syscall uses mmap_compat_base. */ arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base, - arch_rnd(mmap32_rnd_bits), tasksize_32bit()); + arch_rnd(mmap32_rnd_bits), task_size_32bit()); #endif } diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 1c34b767c84c..7eb06701a935 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * mpx.c - Memory Protection eXtensions * @@ -355,10 +356,19 @@ int mpx_enable_management(void) */ bd_base = mpx_get_bounds_dir(); down_write(&mm->mmap_sem); + + /* MPX doesn't support addresses above 47 bits yet. */ + if (find_vma(mm, DEFAULT_MAP_WINDOW)) { + pr_warn_once("%s (%d): MPX cannot handle addresses " + "above 47-bits. Disabling.", + current->comm, current->pid); + ret = -ENXIO; + goto out; + } mm->context.bd_addr = bd_base; if (mm->context.bd_addr == MPX_INVALID_BOUNDS_DIR) ret = -ENXIO; - +out: up_write(&mm->mmap_sem); return ret; } @@ -1030,3 +1040,25 @@ void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, if (ret) force_sig(SIGSEGV, current); } + +/* MPX cannot handle addresses above 47 bits yet. */ +unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len, + unsigned long flags) +{ + if (!kernel_managing_mpx_tables(current->mm)) + return addr; + if (addr + len <= DEFAULT_MAP_WINDOW) + return addr; + if (flags & MAP_FIXED) + return -ENOMEM; + + /* + * Requested len is larger than the whole area we're allowed to map in. + * Resetting hinting address wouldn't do much good -- fail early. + */ + if (len > DEFAULT_MAP_WINDOW) + return -ENOMEM; + + /* Look for unmap area within DEFAULT_MAP_WINDOW */ + return 0; +} diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 9405ffc91502..066f3511d5f1 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Generic VM initialization for x86-64 NUMA setups. * Copyright 2002,2003 Andi Kleen, SuSE Labs. diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index a8f90ce3dedf..34a2a3bfde9c 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * NUMA emulation */ @@ -75,13 +76,15 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei, /* * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr - * to max_addr. The return value is the number of nodes allocated. + * to max_addr. + * + * Returns zero on success or negative on error. 
*/ static int __init split_nodes_interleave(struct numa_meminfo *ei, struct numa_meminfo *pi, u64 addr, u64 max_addr, int nr_nodes) { - nodemask_t physnode_mask = NODE_MASK_NONE; + nodemask_t physnode_mask = numa_nodes_parsed; u64 size; int big; int nid = 0; @@ -116,9 +119,6 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei, return -1; } - for (i = 0; i < pi->nr_blks; i++) - node_set(pi->blk[i].nid, physnode_mask); - /* * Continue to fill physical nodes with fake nodes until there is no * memory left on any of them. @@ -200,13 +200,15 @@ static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) /* * Sets up fake nodes of `size' interleaved over physical nodes ranging from - * `addr' to `max_addr'. The return value is the number of nodes allocated. + * `addr' to `max_addr'. + * + * Returns zero on success or negative on error. */ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, struct numa_meminfo *pi, u64 addr, u64 max_addr, u64 size) { - nodemask_t physnode_mask = NODE_MASK_NONE; + nodemask_t physnode_mask = numa_nodes_parsed; u64 min_size; int nid = 0; int i, ret; @@ -231,9 +233,6 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, } size &= FAKE_NODE_MIN_HASH_MASK; - for (i = 0; i < pi->nr_blks; i++) - node_set(pi->blk[i].nid, physnode_mask); - /* * Fill physical nodes with fake nodes of size until there is no memory * left on any of them. @@ -280,6 +279,22 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, return 0; } +int __init setup_emu2phys_nid(int *dfl_phys_nid) +{ + int i, max_emu_nid = 0; + + *dfl_phys_nid = NUMA_NO_NODE; + for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { + if (emu_nid_to_phys[i] != NUMA_NO_NODE) { + max_emu_nid = i; + if (*dfl_phys_nid == NUMA_NO_NODE) + *dfl_phys_nid = emu_nid_to_phys[i]; + } + } + + return max_emu_nid; +} + /** * numa_emulation - Emulate NUMA nodes * @numa_meminfo: NUMA configuration to massage @@ -376,23 +391,18 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) * Determine the max emulated nid and the default phys nid to use * for unmapped nodes. */ - max_emu_nid = 0; - dfl_phys_nid = NUMA_NO_NODE; - for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { - if (emu_nid_to_phys[i] != NUMA_NO_NODE) { - max_emu_nid = i; - if (dfl_phys_nid == NUMA_NO_NODE) - dfl_phys_nid = emu_nid_to_phys[i]; - } - } - if (dfl_phys_nid == NUMA_NO_NODE) { - pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n"); - goto no_emu; - } + max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid); /* commit */ *numa_meminfo = ei; + /* Make sure numa_nodes_parsed only contains emulated nodes */ + nodes_clear(numa_nodes_parsed); + for (i = 0; i < ARRAY_SIZE(ei.blk); i++) + if (ei.blk[i].start != ei.blk[i].end && + ei.blk[i].nid != NUMA_NO_NODE) + node_set(ei.blk[i].nid, numa_nodes_parsed); + /* * Transform __apicid_to_node table to use emulated nids by * reverse-mapping phys_nid. 
The maps should always exist but fall diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h index ad86ec91e640..86860f279662 100644 --- a/arch/x86/mm/numa_internal.h +++ b/arch/x86/mm/numa_internal.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __X86_MM_NUMA_INTERNAL_H #define __X86_MM_NUMA_INTERNAL_H diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index 5f169d5d76a8..a25588ad75ef 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * self test for change_page_attr. * diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index c8520b2c62d2..dfb7d657cf43 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -150,6 +150,12 @@ void clflush_cache_range(void *vaddr, unsigned int size) } EXPORT_SYMBOL_GPL(clflush_cache_range); +void arch_invalidate_pmem(void *addr, size_t size) +{ + clflush_cache_range(addr, size); +} +EXPORT_SYMBOL_GPL(arch_invalidate_pmem); + static void __cpa_flush_all(void *arg) { unsigned long cache = (unsigned long)arg; @@ -1769,6 +1775,70 @@ int set_memory_4k(unsigned long addr, int numpages) __pgprot(0), 1, 0, NULL); } +static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) +{ + struct cpa_data cpa; + unsigned long start; + int ret; + + /* Nothing to do if the SME is not active */ + if (!sme_active()) + return 0; + + /* Should not be working on unaligned addresses */ + if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr)) + addr &= PAGE_MASK; + + start = addr; + + memset(&cpa, 0, sizeof(cpa)); + cpa.vaddr = &addr; + cpa.numpages = numpages; + cpa.mask_set = enc ? __pgprot(_PAGE_ENC) : __pgprot(0); + cpa.mask_clr = enc ? __pgprot(0) : __pgprot(_PAGE_ENC); + cpa.pgd = init_mm.pgd; + + /* Must avoid aliasing mappings in the highmem code */ + kmap_flush_unused(); + vm_unmap_aliases(); + + /* + * Before changing the encryption attribute, we need to flush caches. + */ + if (static_cpu_has(X86_FEATURE_CLFLUSH)) + cpa_flush_range(start, numpages, 1); + else + cpa_flush_all(1); + + ret = __change_page_attr_set_clr(&cpa, 1); + + /* + * After changing the encryption attribute, we need to flush TLBs + * again in case any speculative TLB caching occurred (but no need + * to flush caches again). We could just use cpa_flush_all(), but + * in case TLB flushing gets optimized in the cpa_flush_range() + * path use the same logic as above. 
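+ * (The third argument to cpa_flush_range()/cpa_flush_all() requests a cache flush: it is 1 before the attribute change above and 0 here, where only the TLB needs flushing.)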
+ */ + if (static_cpu_has(X86_FEATURE_CLFLUSH)) + cpa_flush_range(start, numpages, 0); + else + cpa_flush_all(0); + + return ret; +} + +int set_memory_encrypted(unsigned long addr, int numpages) +{ + return __set_memory_enc_dec(addr, numpages, true); +} +EXPORT_SYMBOL_GPL(set_memory_encrypted); + +int set_memory_decrypted(unsigned long addr, int numpages) +{ + return __set_memory_enc_dec(addr, numpages, false); +} +EXPORT_SYMBOL_GPL(set_memory_decrypted); + int set_pages_uc(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); @@ -2014,6 +2084,9 @@ int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, if (!(page_flags & _PAGE_RW)) cpa.mask_clr = __pgprot(_PAGE_RW); + if (!(page_flags & _PAGE_ENC)) + cpa.mask_clr = pgprot_encrypted(cpa.mask_clr); + cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); retval = __change_page_attr_set_clr(&cpa, 0); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 9b78685b66e6..fe7d57a8fb60 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -37,14 +37,14 @@ #undef pr_fmt #define pr_fmt(fmt) "" fmt -static bool boot_cpu_done; - -static int __read_mostly __pat_enabled = IS_ENABLED(CONFIG_X86_PAT); -static void init_cache_modes(void); +static bool __read_mostly boot_cpu_done; +static bool __read_mostly pat_disabled = !IS_ENABLED(CONFIG_X86_PAT); +static bool __read_mostly pat_initialized; +static bool __read_mostly init_cm_done; void pat_disable(const char *reason) { - if (!__pat_enabled) + if (pat_disabled) return; if (boot_cpu_done) { @@ -52,10 +52,8 @@ void pat_disable(const char *reason) return; } - __pat_enabled = 0; + pat_disabled = true; pr_info("x86/PAT: %s\n", reason); - - init_cache_modes(); } static int __init nopat(char *str) @@ -67,7 +65,7 @@ early_param("nopat", nopat); bool pat_enabled(void) { - return !!__pat_enabled; + return pat_initialized; } EXPORT_SYMBOL_GPL(pat_enabled); @@ -205,6 +203,8 @@ static void __init_cache_modes(u64 pat) update_cache_mode_entry(i, cache); } pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg); + + init_cm_done = true; } #define PAT(x, y) ((u64)PAT_ ## y << ((x)*8)) @@ -225,6 +225,7 @@ static void pat_bsp_init(u64 pat) } wrmsrl(MSR_IA32_CR_PAT, pat); + pat_initialized = true; __init_cache_modes(pat); } @@ -242,10 +243,9 @@ static void pat_ap_init(u64 pat) wrmsrl(MSR_IA32_CR_PAT, pat); } -static void init_cache_modes(void) +void init_cache_modes(void) { u64 pat = 0; - static int init_cm_done; if (init_cm_done) return; @@ -287,15 +287,13 @@ static void init_cache_modes(void) } __init_cache_modes(pat); - - init_cm_done = 1; } /** * pat_init - Initialize PAT MSR and PAT table * * This function initializes PAT MSR and PAT table with an OS-defined value - * to enable additional cache attributes, WC and WT. + * to enable additional cache attributes, WC, WT and WP. * * This function must be called on all CPUs using the specific sequence of * operations defined in Intel SDM. 
mtrr_rendezvous_handler() provides this @@ -306,10 +304,8 @@ void pat_init(void) u64 pat; struct cpuinfo_x86 *c = &boot_cpu_data; - if (!pat_enabled()) { - init_cache_modes(); + if (pat_disabled) return; - } if ((c->x86_vendor == X86_VENDOR_INTEL) && (((c->x86 == 0x6) && (c->x86_model <= 0xd)) || @@ -356,7 +352,7 @@ void pat_init(void) * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS * 011 3 UC : _PAGE_CACHE_MODE_UC * 100 4 WB : Reserved - * 101 5 WC : Reserved + * 101 5 WP : _PAGE_CACHE_MODE_WP * 110 6 UC-: Reserved * 111 7 WT : _PAGE_CACHE_MODE_WT * @@ -364,7 +360,7 @@ void pat_init(void) * corresponding types in the presence of PAT errata. */ pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | - PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, WT); + PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT); } if (!boot_cpu_done) { @@ -748,6 +744,9 @@ EXPORT_SYMBOL(arch_io_free_memtype_wc); pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { + if (!phys_mem_access_encrypted(pfn << PAGE_SHIFT, size)) + vma_prot = pgprot_decrypted(vma_prot); + return vma_prot; } diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h index a739bfc40690..eeb5caeb089b 100644 --- a/arch/x86/mm/pat_internal.h +++ b/arch/x86/mm/pat_internal.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __PAT_INTERNAL_H_ #define __PAT_INTERNAL_H_ diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index d76485b22824..fa16036fa592 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Handle caching attributes in page tables (PAT) * diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 508a708eb9a6..17ebc5a978cc 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/mm.h> #include <linux/gfp.h> #include <asm/pgalloc.h> @@ -56,7 +57,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { pgtable_page_dtor(pte); paravirt_release_pte(page_to_pfn(pte)); - tlb_remove_page(tlb, pte); + tlb_remove_table(tlb, pte); } #if CONFIG_PGTABLE_LEVELS > 2 @@ -72,21 +73,21 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) tlb->need_flush_all = 1; #endif pgtable_pmd_page_dtor(page); - tlb_remove_page(tlb, page); + tlb_remove_table(tlb, page); } #if CONFIG_PGTABLE_LEVELS > 3 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); - tlb_remove_page(tlb, virt_to_page(pud)); + tlb_remove_table(tlb, virt_to_page(pud)); } #if CONFIG_PGTABLE_LEVELS > 4 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) { paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); - tlb_remove_page(tlb, virt_to_page(p4d)); + tlb_remove_table(tlb, virt_to_page(p4d)); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* CONFIG_PGTABLE_LEVELS > 3 */ @@ -426,10 +427,8 @@ int ptep_set_access_flags(struct vm_area_struct *vma, { int changed = !pte_same(*ptep, entry); - if (changed && dirty) { + if (changed && dirty) *ptep = entry; - pte_update(vma->vm_mm, address, ptep); - } return changed; } @@ -486,9 +485,6 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma, ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *) &ptep->pte); - if (ret) - pte_update(vma->vm_mm, addr, ptep); - return ret; } diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index b9bd5b8b14fa..6b9bf023a700 100644 --- a/arch/x86/mm/pgtable_32.c +++ 
b/arch/x86/mm/pgtable_32.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/sched.h> #include <linux/kernel.h> #include <linux/errno.h> diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c index cfc3b9121ce4..7f9acb68324c 100644 --- a/arch/x86/mm/physaddr.c +++ b/arch/x86/mm/physaddr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/bootmem.h> #include <linux/mmdebug.h> #include <linux/export.h> diff --git a/arch/x86/mm/physaddr.h b/arch/x86/mm/physaddr.h index a3cd5a0c97b3..9f6419cafc32 100644 --- a/arch/x86/mm/physaddr.h +++ b/arch/x86/mm/physaddr.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <asm/processor.h> static inline int phys_addr_valid(resource_size_t addr) diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index 2dab69a706ec..d7bc0eea20a5 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -18,7 +18,6 @@ #include <asm/cpufeature.h> /* boot_cpu_has, ... */ #include <asm/mmu_context.h> /* vma_pkey() */ -#include <asm/fpu/internal.h> /* fpregs_active() */ int __execute_only_pkey(struct mm_struct *mm) { @@ -45,7 +44,7 @@ int __execute_only_pkey(struct mm_struct *mm) */ preempt_disable(); if (!need_to_set_mm_pkey && - fpregs_active() && + current->thread.fpu.initialized && !__pkru_allows_read(read_pkru(), execute_only_pkey)) { preempt_enable(); return execute_only_pkey; diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c index f65a33f505b6..adb3c5784dac 100644 --- a/arch/x86/mm/setup_nx.c +++ b/arch/x86/mm/setup_nx.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/spinlock.h> #include <linux/errno.h> #include <linux/init.h> diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 3ea20d61b523..dac07e4f5834 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * ACPI 3.0 based NUMA setup * Copyright 2004 Andi Kleen, SuSE Labs. diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 6e7bedf69af7..3118392cdf75 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -15,7 +15,7 @@ #include <linux/debugfs.h> /* - * Smarter SMP flushing macros. + * TLB flushing, formerly SMP-only * c/o Linus Torvalds. * * These mean you can really definitely utterly forget about @@ -28,38 +28,64 @@ * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi */ -#ifdef CONFIG_SMP +atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); -struct flush_tlb_info { - struct mm_struct *flush_mm; - unsigned long flush_start; - unsigned long flush_end; -}; -/* - * We cannot call mmdrop() because we are in interrupt context, - * instead update mm->cpu_vm_mask. - */ -void leave_mm(int cpu) +static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, + u16 *new_asid, bool *need_flush) { - struct mm_struct *active_mm = this_cpu_read(cpu_tlbstate.active_mm); - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) - BUG(); - if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) { - cpumask_clear_cpu(cpu, mm_cpumask(active_mm)); - load_cr3(swapper_pg_dir); - /* - * This gets called in the idle path where RCU - * functions differently. Tracing normally - * uses RCU, so we have to call the tracepoint - * specially here. 
- */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + u16 asid; + + if (!static_cpu_has(X86_FEATURE_PCID)) { + *new_asid = 0; + *need_flush = true; + return; } + + for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { + if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) != + next->context.ctx_id) + continue; + + *new_asid = asid; + *need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) < + next_tlb_gen); + return; + } + + /* + * We don't currently own an ASID slot on this CPU. + * Allocate a slot. + */ + *new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1; + if (*new_asid >= TLB_NR_DYN_ASIDS) { + *new_asid = 0; + this_cpu_write(cpu_tlbstate.next_asid, 1); + } + *need_flush = true; } -EXPORT_SYMBOL_GPL(leave_mm); -#endif /* CONFIG_SMP */ +void leave_mm(int cpu) +{ + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + + /* + * It's plausible that we're in lazy TLB mode while our mm is init_mm. + * If so, our callers still expect us to flush the TLB, but there + * aren't any user TLB entries in init_mm to worry about. + * + * This needs to happen before any other sanity checks due to + * intel_idle's shenanigans. + */ + if (loaded_mm == &init_mm) + return; + + /* Warn if we're not lazy. */ + WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy)); + + switch_mm(NULL, &init_mm, NULL); +} +EXPORT_SYMBOL_GPL(leave_mm); void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) @@ -74,217 +100,385 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { + struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); + u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); unsigned cpu = smp_processor_id(); + u64 next_tlb_gen; + + /* + * NB: The scheduler will call us with prev == next when switching + * from lazy TLB mode to normal mode if active_mm isn't changing. + * When this happens, we don't assume that CR3 (and hence + * cpu_tlbstate.loaded_mm) matches next. + * + * NB: leave_mm() calls us with prev == NULL and tsk == NULL. + */ + + /* We don't want flush_tlb_func_* to run concurrently with us. */ + if (IS_ENABLED(CONFIG_PROVE_LOCKING)) + WARN_ON_ONCE(!irqs_disabled()); + + /* + * Verify that CR3 is what we think it is. This will catch + * hypothetical buggy code that directly switches to swapper_pg_dir + * without going through leave_mm() / switch_mm_irqs_off() or that + * does something like write_cr3(read_cr3_pa()). + * + * Only do this check if CONFIG_DEBUG_VM=y because __read_cr3() + * isn't free. + */ +#ifdef CONFIG_DEBUG_VM + if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) { + /* + * If we were to BUG here, we'd be very likely to kill + * the system so hard that we don't see the call trace. + * Try to recover instead by ignoring the error and doing + * a global flush to minimize the chance of corruption. + * + * (This is far from being a fully correct recovery. + * Architecturally, the CPU could prefetch something + * back into an incorrect ASID slot and leave it there + * to cause trouble down the road. It's better than + * nothing, though.) + */ + __flush_tlb_all(); + } +#endif + this_cpu_write(cpu_tlbstate.is_lazy, false); + + if (real_prev == next) { + VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != + next->context.ctx_id); + + /* + * We don't currently support having a real mm loaded without + * our cpu set in mm_cpumask(). 
We have all the bookkeeping + * in place to figure out whether we would need to flush + * if our cpu were cleared in mm_cpumask(), but we don't + * currently use it. + */ + if (WARN_ON_ONCE(real_prev != &init_mm && + !cpumask_test_cpu(cpu, mm_cpumask(next)))) + cpumask_set_cpu(cpu, mm_cpumask(next)); + + return; + } else { + u16 new_asid; + bool need_flush; - if (likely(prev != next)) { if (IS_ENABLED(CONFIG_VMAP_STACK)) { /* * If our current stack is in vmalloc space and isn't * mapped in the new pgd, we'll double-fault. Forcibly * map it. */ - unsigned int stack_pgd_index = pgd_index(current_stack_pointer()); - - pgd_t *pgd = next->pgd + stack_pgd_index; + unsigned int index = pgd_index(current_stack_pointer); + pgd_t *pgd = next->pgd + index; if (unlikely(pgd_none(*pgd))) - set_pgd(pgd, init_mm.pgd[stack_pgd_index]); + set_pgd(pgd, init_mm.pgd[index]); } -#ifdef CONFIG_SMP - this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); - this_cpu_write(cpu_tlbstate.active_mm, next); -#endif - - cpumask_set_cpu(cpu, mm_cpumask(next)); + /* Stop remote flushes for the previous mm */ + VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) && + real_prev != &init_mm); + cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); /* - * Re-load page tables. - * - * This logic has an ordering constraint: - * - * CPU 0: Write to a PTE for 'next' - * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. - * CPU 1: set bit 1 in next's mm_cpumask - * CPU 1: load from the PTE that CPU 0 writes (implicit) - * - * We need to prevent an outcome in which CPU 1 observes - * the new PTE value and CPU 0 observes bit 1 clear in - * mm_cpumask. (If that occurs, then the IPI will never - * be sent, and CPU 0's TLB will contain a stale entry.) - * - * The bad outcome can occur if either CPU's load is - * reordered before that CPU's store, so both CPUs must - * execute full barriers to prevent this from happening. - * - * Thus, switch_mm needs a full barrier between the - * store to mm_cpumask and any operation that could load - * from next->pgd. TLB fills are special and can happen - * due to instruction fetches or for no reason at all, - * and neither LOCK nor MFENCE orders them. - * Fortunately, load_cr3() is serializing and gives the - * ordering guarantee we need. - * + * Start remote flushes and then read tlb_gen. */ - load_cr3(next->pgd); - - trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); - - /* Stop flush ipis for the previous mm */ - cpumask_clear_cpu(cpu, mm_cpumask(prev)); - - /* Load per-mm CR4 state */ - load_mm_cr4(next); + cpumask_set_cpu(cpu, mm_cpumask(next)); + next_tlb_gen = atomic64_read(&next->context.tlb_gen); -#ifdef CONFIG_MODIFY_LDT_SYSCALL - /* - * Load the LDT, if the LDT is different. - * - * It's possible that prev->context.ldt doesn't match - * the LDT register. This can happen if leave_mm(prev) - * was called and then modify_ldt changed - * prev->context.ldt but suppressed an IPI to this CPU. - * In this case, prev->context.ldt != NULL, because we - * never set context.ldt to NULL while the mm still - * exists. That means that next->context.ldt != - * prev->context.ldt, because mms never share an LDT. 
- */ - if (unlikely(prev->context.ldt != next->context.ldt)) - load_mm_ldt(next); -#endif - } -#ifdef CONFIG_SMP - else { - this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); - BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next); + choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); - if (!cpumask_test_cpu(cpu, mm_cpumask(next))) { - /* - * On established mms, the mm_cpumask is only changed - * from irq context, from ptep_clear_flush() while in - * lazy tlb mode, and here. Irqs are blocked during - * schedule, protecting us from simultaneous changes. - */ - cpumask_set_cpu(cpu, mm_cpumask(next)); + if (need_flush) { + this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); + write_cr3(build_cr3(next, new_asid)); /* - * We were in lazy tlb mode and leave_mm disabled - * tlb flush IPI delivery. We must reload CR3 - * to make sure to use no freed page tables. + * NB: This gets called via leave_mm() in the idle path + * where RCU functions differently. Tracing normally + * uses RCU, so we need to use the _rcuidle variant. * - * As above, load_cr3() is serializing and orders TLB - * fills with respect to the mm_cpumask write. + * (There is no good reason for this. The idle code should + * be rearranged to call this before rcu_idle_enter().) */ - load_cr3(next->pgd); - trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); - load_mm_cr4(next); - load_mm_ldt(next); + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + } else { + /* The new ASID is already up to date. */ + write_cr3(build_cr3_noflush(next, new_asid)); + + /* See above wrt _rcuidle. */ + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); } + + this_cpu_write(cpu_tlbstate.loaded_mm, next); + this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); } -#endif + + load_mm_cr4(next); + switch_ldt(real_prev, next); } -#ifdef CONFIG_SMP +/* + * Please ignore the name of this function. It should be called + * switch_to_kernel_thread(). + * + * enter_lazy_tlb() is a hint from the scheduler that we are entering a + * kernel thread or other context without an mm. Acceptable implementations + * include doing nothing whatsoever, switching to init_mm, or various clever + * lazy tricks to try to minimize TLB flushes. + * + * The scheduler reserves the right to call enter_lazy_tlb() several times + * in a row. It will notify us that we're going back to a real mm by + * calling switch_mm_irqs_off(). + */ +void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) +{ + if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) + return; + + if (tlb_defer_switch_to_init_mm()) { + /* + * There's a significant optimization that may be possible + * here. We have accurate enough TLB flush tracking that we + * don't need to maintain coherence of TLB per se when we're + * lazy. We do, however, need to maintain coherence of + * paging-structure caches. We could, in principle, leave our + * old mm loaded and only switch to init_mm when + * tlb_remove_page() happens. + */ + this_cpu_write(cpu_tlbstate.is_lazy, true); + } else { + switch_mm(NULL, &init_mm, NULL); + } +} /* - * The flush IPI assumes that a thread switch happens in this order: - * [cpu0: the cpu that switches] - * 1) switch_mm() either 1a) or 1b) - * 1a) thread switch to a different mm - * 1a1) set cpu_tlbstate to TLBSTATE_OK - * Now the tlb flush NMI handler flush_tlb_func won't call leave_mm - * if cpu0 was in lazy tlb mode. 
- * 1a2) update cpu active_mm - * Now cpu0 accepts tlb flushes for the new mm. - * 1a3) cpu_set(cpu, new_mm->cpu_vm_mask); - * Now the other cpus will send tlb flush ipis. - * 1a4) change cr3. - * 1a5) cpu_clear(cpu, old_mm->cpu_vm_mask); - * Stop ipi delivery for the old mm. This is not synchronized with - * the other cpus, but flush_tlb_func ignore flush ipis for the wrong - * mm, and in the worst case we perform a superfluous tlb flush. - * 1b) thread switch without mm change - * cpu active_mm is correct, cpu0 already handles flush ipis. - * 1b1) set cpu_tlbstate to TLBSTATE_OK - * 1b2) test_and_set the cpu bit in cpu_vm_mask. - * Atomically set the bit [other cpus will start sending flush ipis], - * and test the bit. - * 1b3) if the bit was 0: leave_mm was called, flush the tlb. - * 2) switch %%esp, ie current + * Call this when reinitializing a CPU. It fixes the following potential + * problems: + * + * - The ASID changed from what cpu_tlbstate thinks it is (most likely + * because the CPU was taken down and came back up with CR3's PCID + * bits clear. CPU hotplug can do this. * - * The interrupt must handle 2 special cases: - * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. - * - the cpu performs speculative tlb reads, i.e. even if the cpu only - * runs in kernel space, the cpu could load tlb entries for user space - * pages. + * - The TLB contains junk in slots corresponding to inactive ASIDs. * - * The good news is that cpu_tlbstate is local to each cpu, no - * write/read ordering problems. + * - The CPU went so far out to lunch that it may have missed a TLB + * flush. */ +void initialize_tlbstate_and_flush(void) +{ + int i; + struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm); + u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen); + unsigned long cr3 = __read_cr3(); + + /* Assert that CR3 already references the right mm. */ + WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd)); + + /* + * Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization + * doesn't work like other CR4 bits because it can only be set from + * long mode.) + */ + WARN_ON(boot_cpu_has(X86_FEATURE_PCID) && + !(cr4_read_shadow() & X86_CR4_PCIDE)); + + /* Force ASID 0 and force a TLB flush. */ + write_cr3(build_cr3(mm, 0)); + + /* Reinitialize tlbstate. */ + this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0); + this_cpu_write(cpu_tlbstate.next_asid, 1); + this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen); + + for (i = 1; i < TLB_NR_DYN_ASIDS; i++) + this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0); +} /* - * TLB flush funcation: - * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. - * 2) Leave the mm if we are in the lazy tlb mode. + * flush_tlb_func_common()'s memory ordering requirement is that any + * TLB fills that happen after we flush the TLB are ordered after we + * read active_mm's tlb_gen. We don't need any explicit barriers + * because all x86 flush operations are serializing and the + * atomic64_read operation won't be reordered by the compiler. */ -static void flush_tlb_func(void *info) +static void flush_tlb_func_common(const struct flush_tlb_info *f, + bool local, enum tlb_flush_reason reason) { - struct flush_tlb_info *f = info; + /* + * We have three different tlb_gen values in here. They are: + * + * - mm_tlb_gen: the latest generation. + * - local_tlb_gen: the generation that this CPU has already caught + * up to. 
+ * - f->new_tlb_gen: the generation that the requester of the flush + * wants us to catch up to. + */ + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen); + u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); - inc_irq_stat(irq_tlb_count); + /* This code cannot presently handle being reentered. */ + VM_WARN_ON(!irqs_disabled()); - if (f->flush_mm && f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) + if (unlikely(loaded_mm == &init_mm)) return; - count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { - if (f->flush_end == TLB_FLUSH_ALL) { - local_flush_tlb(); - trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL); - } else { - unsigned long addr; - unsigned long nr_pages = - (f->flush_end - f->flush_start) / PAGE_SIZE; - addr = f->flush_start; - while (addr < f->flush_end) { - __flush_tlb_single(addr); - addr += PAGE_SIZE; - } - trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages); + VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) != + loaded_mm->context.ctx_id); + + if (this_cpu_read(cpu_tlbstate.is_lazy)) { + /* + * We're in lazy mode. We need to at least flush our + * paging-structure cache to avoid speculatively reading + * garbage into our TLB. Since switching to init_mm is barely + * slower than a minimal flush, just switch to init_mm. + */ + switch_mm_irqs_off(NULL, &init_mm, NULL); + return; + } + + if (unlikely(local_tlb_gen == mm_tlb_gen)) { + /* + * There's nothing to do: we're already up to date. This can + * happen if two concurrent flushes happen -- the first flush to + * be handled can catch us all the way up, leaving no work for + * the second flush. + */ + trace_tlb_flush(reason, 0); + return; + } + + WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen); + WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen); + + /* + * If we get to this point, we know that our TLB is out of date. + * This does not strictly imply that we need to flush (it's + * possible that f->new_tlb_gen <= local_tlb_gen), but we're + * going to need to flush in the very near future, so we might + * as well get it over with. + * + * The only question is whether to do a full or partial flush. + * + * We do a partial flush if requested and two extra conditions + * are met: + * + * 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that + * we've always done all needed flushes to catch up to + * local_tlb_gen. If, for example, local_tlb_gen == 2 and + * f->new_tlb_gen == 3, then we know that the flush needed to bring + * us up to date for tlb_gen 3 is the partial flush we're + * processing. + * + * As an example of why this check is needed, suppose that there + * are two concurrent flushes. The first is a full flush that + * changes context.tlb_gen from 1 to 2. The second is a partial + * flush that changes context.tlb_gen from 2 to 3. If they get + * processed on this CPU in reverse order, we'll see + * local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL. + * If we were to use __flush_tlb_single() and set local_tlb_gen to + * 3, we'd break the invariant: we'd update local_tlb_gen above + * 1 without the full flush that's needed for tlb_gen 2. + * + * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimization.
+ * Partial TLB flushes are not all that much cheaper than full TLB + * flushes, so it seems unlikely that it would be a performance win + * to do a partial flush if that won't bring our TLB fully up to + * date. By doing a full flush instead, we can increase + * local_tlb_gen all the way to mm_tlb_gen and we can probably + * avoid another flush in the very near future. + */ + if (f->end != TLB_FLUSH_ALL && + f->new_tlb_gen == local_tlb_gen + 1 && + f->new_tlb_gen == mm_tlb_gen) { + /* Partial flush */ + unsigned long addr; + unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT; + + addr = f->start; + while (addr < f->end) { + __flush_tlb_single(addr); + addr += PAGE_SIZE; } - } else - leave_mm(smp_processor_id()); + if (local) + count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages); + trace_tlb_flush(reason, nr_pages); + } else { + /* Full flush. */ + local_flush_tlb(); + if (local) + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + trace_tlb_flush(reason, TLB_FLUSH_ALL); + } + /* Both paths above update our state to mm_tlb_gen. */ + this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen); } -void native_flush_tlb_others(const struct cpumask *cpumask, - struct mm_struct *mm, unsigned long start, - unsigned long end) +static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason) { - struct flush_tlb_info info; + const struct flush_tlb_info *f = info; - info.flush_mm = mm; - info.flush_start = start; - info.flush_end = end; + flush_tlb_func_common(f, true, reason); +} + +static void flush_tlb_func_remote(void *info) +{ + const struct flush_tlb_info *f = info; + + inc_irq_stat(irq_tlb_count); + if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm)) + return; + + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); + flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN); +} + +void native_flush_tlb_others(const struct cpumask *cpumask, + const struct flush_tlb_info *info) +{ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); - if (end == TLB_FLUSH_ALL) + if (info->end == TLB_FLUSH_ALL) trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL); else trace_tlb_flush(TLB_REMOTE_SEND_IPI, - (end - start) >> PAGE_SHIFT); + (info->end - info->start) >> PAGE_SHIFT); if (is_uv_system()) { + /* + * This whole special case is confused. UV has a "Broadcast + * Assist Unit", which seems to be a fancy way to send IPIs. + * Back when x86 used an explicit TLB flush IPI, UV was + * optimized to use its own mechanism. These days, x86 uses + * smp_call_function_many(), but UV still uses a manual IPI, + * and that IPI's action is out of date -- it does a manual + * flush instead of calling flush_tlb_func_remote(). This + * means that the percpu tlb_gen variables won't be updated + * and we'll do pointless flushes on future context switches. + * + * Rather than hooking native_flush_tlb_others() here, I think + * that UV should be updated so that smp_call_function_many(), + * etc, are optimal on UV. 
+ */ unsigned int cpu; cpu = smp_processor_id(); - cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu); + cpumask = uv_flush_tlb_others(cpumask, info); if (cpumask) - smp_call_function_many(cpumask, flush_tlb_func, - &info, 1); + smp_call_function_many(cpumask, flush_tlb_func_remote, + (void *)info, 1); return; } - smp_call_function_many(cpumask, flush_tlb_func, &info, 1); + smp_call_function_many(cpumask, flush_tlb_func_remote, + (void *)info, 1); } /* @@ -302,91 +496,46 @@ static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long vmflag) { - unsigned long addr; - /* do a global flush by default */ - unsigned long base_pages_to_flush = TLB_FLUSH_ALL; + int cpu; - preempt_disable(); + struct flush_tlb_info info = { + .mm = mm, + }; - if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB)) - base_pages_to_flush = (end - start) >> PAGE_SHIFT; - if (base_pages_to_flush > tlb_single_page_flush_ceiling) - base_pages_to_flush = TLB_FLUSH_ALL; + cpu = get_cpu(); - if (current->active_mm != mm) { - /* Synchronize with switch_mm. */ - smp_mb(); + /* This is also a barrier that synchronizes with switch_mm(). */ + info.new_tlb_gen = inc_mm_tlb_gen(mm); - goto out; - } - - if (!current->mm) { - leave_mm(smp_processor_id()); - - /* Synchronize with switch_mm. */ - smp_mb(); - - goto out; - } - - /* - * Both branches below are implicit full barriers (MOV to CR or - * INVLPG) that synchronize with switch_mm. - */ - if (base_pages_to_flush == TLB_FLUSH_ALL) { - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - local_flush_tlb(); + /* Should we flush just the requested range? */ + if ((end != TLB_FLUSH_ALL) && + !(vmflag & VM_HUGETLB) && + ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) { + info.start = start; + info.end = end; } else { - /* flush range by one by one 'invlpg' */ - for (addr = start; addr < end; addr += PAGE_SIZE) { - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); - __flush_tlb_single(addr); - } - } - trace_tlb_flush(TLB_LOCAL_MM_SHOOTDOWN, base_pages_to_flush); -out: - if (base_pages_to_flush == TLB_FLUSH_ALL) { - start = 0UL; - end = TLB_FLUSH_ALL; + info.start = 0UL; + info.end = TLB_FLUSH_ALL; } - if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(mm_cpumask(mm), mm, start, end); - preempt_enable(); -} - -void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) -{ - struct mm_struct *mm = vma->vm_mm; - - preempt_disable(); - - if (current->active_mm == mm) { - if (current->mm) { - /* - * Implicit full barrier (INVLPG) that synchronizes - * with switch_mm. - */ - __flush_tlb_one(start); - } else { - leave_mm(smp_processor_id()); - /* Synchronize with switch_mm. 
*/ - smp_mb(); - } + if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) { + VM_WARN_ON(irqs_disabled()); + local_irq_disable(); + flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN); + local_irq_enable(); } - if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(mm_cpumask(mm), mm, start, start + PAGE_SIZE); + if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) + flush_tlb_others(mm_cpumask(mm), &info); - preempt_enable(); + put_cpu(); } + static void do_flush_tlb_all(void *info) { count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); __flush_tlb_all(); - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) - leave_mm(smp_processor_id()); } void flush_tlb_all(void) @@ -401,7 +550,7 @@ static void do_kernel_range_flush(void *info) unsigned long addr; /* flush range by one by one 'invlpg' */ - for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE) + for (addr = f->start; addr < f->end; addr += PAGE_SIZE) __flush_tlb_single(addr); } @@ -410,16 +559,41 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end) /* Balance as user space task's flush, a bit conservative */ if (end == TLB_FLUSH_ALL || - (end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) { + (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) { on_each_cpu(do_flush_tlb_all, NULL, 1); } else { struct flush_tlb_info info; - info.flush_start = start; - info.flush_end = end; + info.start = start; + info.end = end; on_each_cpu(do_kernel_range_flush, &info, 1); } } +void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) +{ + struct flush_tlb_info info = { + .mm = NULL, + .start = 0UL, + .end = TLB_FLUSH_ALL, + }; + + int cpu = get_cpu(); + + if (cpumask_test_cpu(cpu, &batch->cpumask)) { + VM_WARN_ON(irqs_disabled()); + local_irq_disable(); + flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN); + local_irq_enable(); + } + + if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) + flush_tlb_others(&batch->cpumask, &info); + + cpumask_clear(&batch->cpumask); + + put_cpu(); +} + static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { @@ -465,5 +639,3 @@ static int __init create_tlb_single_page_flush_ceiling(void) return 0; } late_initcall(create_tlb_single_page_flush_ceiling); - -#endif /* CONFIG_SMP */ diff --git a/arch/x86/net/Makefile b/arch/x86/net/Makefile index 90568c33ddb0..fefb4b619598 100644 --- a/arch/x86/net/Makefile +++ b/arch/x86/net/Makefile @@ -1,4 +1,6 @@ # # Arch-specific network modules # +OBJECT_FILES_NON_STANDARD_bpf_jit.o += y + obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S index f2a7faf4706e..b33093f84528 100644 --- a/arch/x86/net/bpf_jit.S +++ b/arch/x86/net/bpf_jit.S @@ -19,9 +19,6 @@ */ #define SKBDATA %r10 #define SKF_MAX_NEG_OFF $(-0x200000) /* SKF_LL_OFF from filter.h */ -#define MAX_BPF_STACK (512 /* from filter.h */ + \ - 32 /* space for rbx,r13,r14,r15 */ + \ - 8 /* space for skb_copy_bits */) #define FUNC(name) \ .globl name; \ @@ -66,7 +63,7 @@ FUNC(sk_load_byte_positive_offset) /* rsi contains offset and can be scratched */ #define bpf_slow_path_common(LEN) \ - lea -MAX_BPF_STACK + 32(%rbp), %rdx;\ + lea 32(%rbp), %rdx;\ FRAME_BEGIN; \ mov %rbx, %rdi; /* arg1 == skb */ \ push %r9; \ @@ -83,14 +80,14 @@ FUNC(sk_load_byte_positive_offset) bpf_slow_path_word: bpf_slow_path_common(4) js bpf_error - mov - MAX_BPF_STACK + 32(%rbp),%eax + mov 32(%rbp),%eax bswap %eax ret bpf_slow_path_half: bpf_slow_path_common(2) js 
bpf_error - mov - MAX_BPF_STACK + 32(%rbp),%ax + mov 32(%rbp),%ax rol $8,%ax movzwl %ax,%eax ret @@ -98,7 +95,7 @@ bpf_slow_path_half: bpf_slow_path_byte: bpf_slow_path_common(1) js bpf_error - movzbl - MAX_BPF_STACK + 32(%rbp),%eax + movzbl 32(%rbp),%eax ret #define sk_negative_common(SIZE) \ @@ -148,9 +145,10 @@ FUNC(sk_load_byte_negative_offset) bpf_error: # force a return 0 from jit handler xor %eax,%eax - mov - MAX_BPF_STACK(%rbp),%rbx - mov - MAX_BPF_STACK + 8(%rbp),%r13 - mov - MAX_BPF_STACK + 16(%rbp),%r14 - mov - MAX_BPF_STACK + 24(%rbp),%r15 + mov (%rbp),%rbx + mov 8(%rbp),%r13 + mov 16(%rbp),%r14 + mov 24(%rbp),%r15 + add $40, %rbp leaveq ret diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index f58939393eef..0554e8aef4d5 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -94,7 +94,9 @@ static int bpf_size_to_x86_bytes(int bpf_size) #define X86_JNE 0x75 #define X86_JBE 0x76 #define X86_JA 0x77 +#define X86_JL 0x7C #define X86_JGE 0x7D +#define X86_JLE 0x7E #define X86_JG 0x7F static void bpf_flush_icache(void *start, void *end) @@ -197,17 +199,16 @@ struct jit_context { #define BPF_MAX_INSN_SIZE 128 #define BPF_INSN_SAFETY 64 -#define STACKSIZE \ - (MAX_BPF_STACK + \ - 32 /* space for rbx, r13, r14, r15 */ + \ +#define AUX_STACK_SPACE \ + (32 /* space for rbx, r13, r14, r15 */ + \ 8 /* space for skb_copy_bits() buffer */) -#define PROLOGUE_SIZE 48 +#define PROLOGUE_SIZE 37 /* emit x64 prologue code for BPF program and check it's size. * bpf_tail_call helper will skip it while jumping into another program */ -static void emit_prologue(u8 **pprog) +static void emit_prologue(u8 **pprog, u32 stack_depth) { u8 *prog = *pprog; int cnt = 0; @@ -215,13 +216,17 @@ static void emit_prologue(u8 **pprog) EMIT1(0x55); /* push rbp */ EMIT3(0x48, 0x89, 0xE5); /* mov rbp,rsp */ - /* sub rsp, STACKSIZE */ - EMIT3_off32(0x48, 0x81, 0xEC, STACKSIZE); + /* sub rsp, rounded_stack_depth + AUX_STACK_SPACE */ + EMIT3_off32(0x48, 0x81, 0xEC, + round_up(stack_depth, 8) + AUX_STACK_SPACE); + + /* sub rbp, AUX_STACK_SPACE */ + EMIT4(0x48, 0x83, 0xED, AUX_STACK_SPACE); /* all classic BPF filters use R6(rbx) save it */ - /* mov qword ptr [rbp-X],rbx */ - EMIT3_off32(0x48, 0x89, 0x9D, -STACKSIZE); + /* mov qword ptr [rbp+0],rbx */ + EMIT4(0x48, 0x89, 0x5D, 0); /* bpf_convert_filter() maps classic BPF register X to R7 and uses R8 * as temporary, so all tcpdump filters need to spill/fill R7(r13) and @@ -231,12 +236,12 @@ static void emit_prologue(u8 **pprog) * than synthetic ones. Therefore not worth adding complexity. */ - /* mov qword ptr [rbp-X],r13 */ - EMIT3_off32(0x4C, 0x89, 0xAD, -STACKSIZE + 8); - /* mov qword ptr [rbp-X],r14 */ - EMIT3_off32(0x4C, 0x89, 0xB5, -STACKSIZE + 16); - /* mov qword ptr [rbp-X],r15 */ - EMIT3_off32(0x4C, 0x89, 0xBD, -STACKSIZE + 24); + /* mov qword ptr [rbp+8],r13 */ + EMIT4(0x4C, 0x89, 0x6D, 8); + /* mov qword ptr [rbp+16],r14 */ + EMIT4(0x4C, 0x89, 0x75, 16); + /* mov qword ptr [rbp+24],r15 */ + EMIT4(0x4C, 0x89, 0x7D, 24); /* Clear the tail call counter (tail_call_cnt): for eBPF tail calls * we need to reset the counter to 0. 
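The arithmetic below is a small standalone model of the new JIT frame layout above: the prologue now reserves round_up(stack_depth, 8) plus AUX_STACK_SPACE and drops RBP by AUX_STACK_SPACE, so the callee-saved spills land at small positive RBP offsets. The printed numbers are illustrative, not emitted machine code.

#include <stdint.h>
#include <stdio.h>

/* 32 bytes for rbx/r13/r14/r15 plus 8 bytes of skb_copy_bits() scratch,
 * matching the AUX_STACK_SPACE definition in the patch. */
#define AUX_STACK_SPACE	(32 + 8)

static uint32_t round_up8(uint32_t x)
{
	return (x + 7) & ~7u;
}

int main(void)
{
	uint32_t stack_depth = 20;	/* hypothetical BPF program stack usage */
	uint32_t frame = round_up8(stack_depth) + AUX_STACK_SPACE;

	printf("sub rsp, %u ; sub rbp, %u\n", frame, AUX_STACK_SPACE);
	/* After the rbp adjustment the spill slots sit at [rbp+0/8/16/24];
	 * the zeroed qword at [rbp+32] holds the tail-call counter, which
	 * the tail-call path reads as a dword at [rbp+36]. */
	printf("rbx@+0 r13@+8 r14@+16 r15@+24 tail_call_cnt@+32..36\n");
	return 0;
}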
It's done in two instructions, @@ -246,8 +251,8 @@ static void emit_prologue(u8 **pprog) /* xor eax, eax */ EMIT2(0x31, 0xc0); - /* mov qword ptr [rbp-X], rax */ - EMIT3_off32(0x48, 0x89, 0x85, -STACKSIZE + 32); + /* mov qword ptr [rbp+32], rax */ + EMIT4(0x48, 0x89, 0x45, 32); BUILD_BUG_ON(cnt != PROLOGUE_SIZE); *pprog = prog; @@ -279,33 +284,32 @@ static void emit_bpf_tail_call(u8 **pprog) /* if (index >= array->map.max_entries) * goto out; */ - EMIT4(0x48, 0x8B, 0x46, /* mov rax, qword ptr [rsi + 16] */ + EMIT2(0x89, 0xD2); /* mov edx, edx */ + EMIT3(0x39, 0x56, /* cmp dword ptr [rsi + 16], edx */ offsetof(struct bpf_array, map.max_entries)); - EMIT3(0x48, 0x39, 0xD0); /* cmp rax, rdx */ -#define OFFSET1 47 /* number of bytes to jump */ +#define OFFSET1 43 /* number of bytes to jump */ EMIT2(X86_JBE, OFFSET1); /* jbe out */ label1 = cnt; /* if (tail_call_cnt > MAX_TAIL_CALL_CNT) * goto out; */ - EMIT2_off32(0x8B, 0x85, -STACKSIZE + 36); /* mov eax, dword ptr [rbp - 516] */ + EMIT2_off32(0x8B, 0x85, 36); /* mov eax, dword ptr [rbp + 36] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ -#define OFFSET2 36 +#define OFFSET2 32 EMIT2(X86_JA, OFFSET2); /* ja out */ label2 = cnt; EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ - EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */ + EMIT2_off32(0x89, 0x85, 36); /* mov dword ptr [rbp + 36], eax */ /* prog = array->ptrs[index]; */ - EMIT4_off32(0x48, 0x8D, 0x84, 0xD6, /* lea rax, [rsi + rdx * 8 + offsetof(...)] */ + EMIT4_off32(0x48, 0x8B, 0x84, 0xD6, /* mov rax, [rsi + rdx * 8 + offsetof(...)] */ offsetof(struct bpf_array, ptrs)); - EMIT3(0x48, 0x8B, 0x00); /* mov rax, qword ptr [rax] */ /* if (prog == NULL) * goto out; */ - EMIT4(0x48, 0x83, 0xF8, 0x00); /* cmp rax, 0 */ + EMIT3(0x48, 0x85, 0xC0); /* test rax,rax */ #define OFFSET3 10 EMIT2(X86_JE, OFFSET3); /* je out */ label3 = cnt; @@ -361,7 +365,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int proglen = 0; u8 *prog = temp; - emit_prologue(&prog); + emit_prologue(&prog, bpf_prog->aux->stack_depth); if (seen_ld_abs) emit_load_skb_data_hlen(&prog); @@ -877,7 +881,7 @@ xadd: if (is_imm8(insn->off)) } break; - case BPF_JMP | BPF_CALL | BPF_X: + case BPF_JMP | BPF_TAIL_CALL: emit_bpf_tail_call(&prog); break; @@ -885,9 +889,13 @@ xadd: if (is_imm8(insn->off)) case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JNE | BPF_X: case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JLT | BPF_X: case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JLE | BPF_X: case BPF_JMP | BPF_JSGT | BPF_X: + case BPF_JMP | BPF_JSLT | BPF_X: case BPF_JMP | BPF_JSGE | BPF_X: + case BPF_JMP | BPF_JSLE | BPF_X: /* cmp dst_reg, src_reg */ EMIT3(add_2mod(0x48, dst_reg, src_reg), 0x39, add_2reg(0xC0, dst_reg, src_reg)); @@ -908,9 +916,13 @@ xadd: if (is_imm8(insn->off)) case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JNE | BPF_K: case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JLT | BPF_K: case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JLE | BPF_K: case BPF_JMP | BPF_JSGT | BPF_K: + case BPF_JMP | BPF_JSLT | BPF_K: case BPF_JMP | BPF_JSGE | BPF_K: + case BPF_JMP | BPF_JSLE | BPF_K: /* cmp dst_reg, imm8/32 */ EMIT1(add_1mod(0x48, dst_reg)); @@ -932,18 +944,34 @@ emit_cond_jmp: /* convert BPF opcode to x86 */ /* GT is unsigned '>', JA in x86 */ jmp_cond = X86_JA; break; + case BPF_JLT: + /* LT is unsigned '<', JB in x86 */ + jmp_cond = X86_JB; + break; case BPF_JGE: /* GE is unsigned '>=', JAE in x86 */ jmp_cond = X86_JAE; break; + case 
BPF_JLE: + /* LE is unsigned '<=', JBE in x86 */ + jmp_cond = X86_JBE; + break; case BPF_JSGT: /* signed '>', GT in x86 */ jmp_cond = X86_JG; break; + case BPF_JSLT: + /* signed '<', LT in x86 */ + jmp_cond = X86_JL; + break; case BPF_JSGE: /* signed '>=', GE in x86 */ jmp_cond = X86_JGE; break; + case BPF_JSLE: + /* signed '<=', LE in x86 */ + jmp_cond = X86_JLE; + break; default: /* to silence gcc warning */ return -EFAULT; } @@ -1036,15 +1064,17 @@ common_load: seen_exit = true; /* update cleanup_addr */ ctx->cleanup_addr = proglen; - /* mov rbx, qword ptr [rbp-X] */ - EMIT3_off32(0x48, 0x8B, 0x9D, -STACKSIZE); - /* mov r13, qword ptr [rbp-X] */ - EMIT3_off32(0x4C, 0x8B, 0xAD, -STACKSIZE + 8); - /* mov r14, qword ptr [rbp-X] */ - EMIT3_off32(0x4C, 0x8B, 0xB5, -STACKSIZE + 16); - /* mov r15, qword ptr [rbp-X] */ - EMIT3_off32(0x4C, 0x8B, 0xBD, -STACKSIZE + 24); - + /* mov rbx, qword ptr [rbp+0] */ + EMIT4(0x48, 0x8B, 0x5D, 0); + /* mov r13, qword ptr [rbp+8] */ + EMIT4(0x4C, 0x8B, 0x6D, 8); + /* mov r14, qword ptr [rbp+16] */ + EMIT4(0x4C, 0x8B, 0x75, 16); + /* mov r15, qword ptr [rbp+24] */ + EMIT4(0x4C, 0x8B, 0x7D, 24); + + /* add rbp, AUX_STACK_SPACE */ + EMIT4(0x48, 0x83, 0xC5, AUX_STACK_SPACE); EMIT1(0xC9); /* leave */ EMIT1(0xC3); /* ret */ break; @@ -1162,6 +1192,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) bpf_jit_binary_lock_ro(header); prog->bpf_func = (void *)image; prog->jited = 1; + prog->jited_len = proglen; } else { prog = orig_prog; } diff --git a/arch/x86/oprofile/Makefile b/arch/x86/oprofile/Makefile index 1599f568f0e2..4d49b5a27025 100644 --- a/arch/x86/oprofile/Makefile +++ b/arch/x86/oprofile/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_OPROFILE) += oprofile.o DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \ diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index 5c6fc3577a49..c806b57d3f22 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 obj-y := i386.o init.o obj-$(CONFIG_PCI_BIOS) += pcbios.o diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 3961103e9176..7df49c40665e 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/pci.h> #include <linux/acpi.h> #include <linux/init.h> diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index ae387e5ee6f7..649bdde63e32 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/init.h> #include <linux/pci.h> #include <linux/topology.h> diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c index 6eb3c8af96e2..2752c02e3f0e 100644 --- a/arch/x86/pci/bus_numa.c +++ b/arch/x86/pci/bus_numa.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/init.h> #include <linux/pci.h> #include <linux/range.h> diff --git a/arch/x86/pci/bus_numa.h b/arch/x86/pci/bus_numa.h index ff8f65b04574..697dd841b118 100644 --- a/arch/x86/pci/bus_numa.h +++ b/arch/x86/pci/bus_numa.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __BUS_NUMA_H #define __BUS_NUMA_H /* diff --git a/arch/x86/pci/ce4100.c b/arch/x86/pci/ce4100.c index b914e20b5a00..3353b76dcff0 100644 --- a/arch/x86/pci/ce4100.c +++ b/arch/x86/pci/ce4100.c @@ -65,6 +65,9 @@ struct sim_reg_op { { PCI_DEVFN(device, func), offset, init_op, read_op, write_op,\ {0, SIZE_TO_MASK(size)} }, +/* + * All read/write functions are called with pci_config_lock held. 
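The ce4100.c change starting here moves pci_config_lock out of the individual register callbacks and into single bus-1 read/write wrappers (see the hunks that follow). Below is a userspace model of that lock-hoisting pattern, with a pthread mutex standing in for the kernel spinlock and the struct and field names simplified.

#include <pthread.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static pthread_mutex_t config_lock = PTHREAD_MUTEX_INITIALIZER;

struct sim_reg {
	unsigned int dev_func;
	int reg;
	uint32_t value;
	void (*read)(struct sim_reg *reg, uint32_t *value);
};

/* Runs with config_lock held, so it takes no lock itself (like reg_read()). */
static void reg_read(struct sim_reg *reg, uint32_t *value)
{
	*value = reg->value;
}

static struct sim_reg bus1_fixups[] = {
	{ .dev_func = 0x08, .reg = 0x10, .value = 0xdeadbeef, .read = reg_read },
};

/* The only place the lock is taken: around the table lookup and callback. */
static int bus1_read(unsigned int devfn, int reg, uint32_t *value)
{
	size_t i;

	for (i = 0; i < sizeof(bus1_fixups) / sizeof(bus1_fixups[0]); i++) {
		if (bus1_fixups[i].dev_func == devfn &&
		    bus1_fixups[i].reg == (reg & ~3) && bus1_fixups[i].read) {
			pthread_mutex_lock(&config_lock);
			bus1_fixups[i].read(&bus1_fixups[i], value);
			pthread_mutex_unlock(&config_lock);
			return 0;
		}
	}
	return -1;
}

int main(void)
{
	uint32_t v;

	if (!bus1_read(0x08, 0x10, &v))
		printf("bus1 read: 0x%08x\n", v);
	return 0;
}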
+ */ static void reg_init(struct sim_dev_reg *reg) { pci_direct_conf1.read(0, 1, reg->dev_func, reg->reg, 4, @@ -73,21 +76,13 @@ static void reg_init(struct sim_dev_reg *reg) static void reg_read(struct sim_dev_reg *reg, u32 *value) { - unsigned long flags; - - raw_spin_lock_irqsave(&pci_config_lock, flags); *value = reg->sim_reg.value; - raw_spin_unlock_irqrestore(&pci_config_lock, flags); } static void reg_write(struct sim_dev_reg *reg, u32 value) { - unsigned long flags; - - raw_spin_lock_irqsave(&pci_config_lock, flags); reg->sim_reg.value = (value & reg->sim_reg.mask) | (reg->sim_reg.value & ~reg->sim_reg.mask); - raw_spin_unlock_irqrestore(&pci_config_lock, flags); } static void sata_reg_init(struct sim_dev_reg *reg) @@ -117,12 +112,8 @@ static void sata_revid_read(struct sim_dev_reg *reg, u32 *value) static void reg_noirq_read(struct sim_dev_reg *reg, u32 *value) { - unsigned long flags; - - raw_spin_lock_irqsave(&pci_config_lock, flags); /* force interrupt pin value to 0 */ *value = reg->sim_reg.value & 0xfff00ff; - raw_spin_unlock_irqrestore(&pci_config_lock, flags); } static struct sim_dev_reg bus1_fixups[] = { @@ -265,24 +256,33 @@ int bridge_read(unsigned int devfn, int reg, int len, u32 *value) return retval; } -static int ce4100_conf_read(unsigned int seg, unsigned int bus, - unsigned int devfn, int reg, int len, u32 *value) +static int ce4100_bus1_read(unsigned int devfn, int reg, int len, u32 *value) { + unsigned long flags; int i; - WARN_ON(seg); - if (bus == 1) { - for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) { - if (bus1_fixups[i].dev_func == devfn && - bus1_fixups[i].reg == (reg & ~3) && - bus1_fixups[i].read) { - bus1_fixups[i].read(&(bus1_fixups[i]), - value); - extract_bytes(value, reg, len); - return 0; - } + for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) { + if (bus1_fixups[i].dev_func == devfn && + bus1_fixups[i].reg == (reg & ~3) && + bus1_fixups[i].read) { + + raw_spin_lock_irqsave(&pci_config_lock, flags); + bus1_fixups[i].read(&(bus1_fixups[i]), value); + raw_spin_unlock_irqrestore(&pci_config_lock, flags); + extract_bytes(value, reg, len); + return 0; } } + return -1; +} + +static int ce4100_conf_read(unsigned int seg, unsigned int bus, + unsigned int devfn, int reg, int len, u32 *value) +{ + WARN_ON(seg); + + if (bus == 1 && !ce4100_bus1_read(devfn, reg, len, value)) + return 0; if (bus == 0 && (PCI_DEVFN(1, 0) == devfn) && !bridge_read(devfn, reg, len, value)) @@ -291,23 +291,32 @@ static int ce4100_conf_read(unsigned int seg, unsigned int bus, return pci_direct_conf1.read(seg, bus, devfn, reg, len, value); } -static int ce4100_conf_write(unsigned int seg, unsigned int bus, - unsigned int devfn, int reg, int len, u32 value) +static int ce4100_bus1_write(unsigned int devfn, int reg, int len, u32 value) { + unsigned long flags; int i; - WARN_ON(seg); - if (bus == 1) { - for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) { - if (bus1_fixups[i].dev_func == devfn && - bus1_fixups[i].reg == (reg & ~3) && - bus1_fixups[i].write) { - bus1_fixups[i].write(&(bus1_fixups[i]), - value); - return 0; - } + for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) { + if (bus1_fixups[i].dev_func == devfn && + bus1_fixups[i].reg == (reg & ~3) && + bus1_fixups[i].write) { + + raw_spin_lock_irqsave(&pci_config_lock, flags); + bus1_fixups[i].write(&(bus1_fixups[i]), value); + raw_spin_unlock_irqrestore(&pci_config_lock, flags); + return 0; } } + return -1; +} + +static int ce4100_conf_write(unsigned int seg, unsigned int bus, + unsigned int devfn, int reg, int len, u32 value) +{ + 
WARN_ON(seg); + + if (bus == 1 && !ce4100_bus1_write(devfn, reg, len, value)) + return 0; /* Discard writes to A/V bridge BAR. */ if (bus == 0 && PCI_DEVFN(1, 0) == devfn && @@ -318,8 +327,8 @@ static int ce4100_conf_write(unsigned int seg, unsigned int bus, } static const struct pci_raw_ops ce4100_pci_conf = { - .read = ce4100_conf_read, - .write = ce4100_conf_write, + .read = ce4100_conf_read, + .write = ce4100_conf_write, }; int __init ce4100_pci_init(void) diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 190e718694b1..7a5350d08cef 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -24,7 +24,6 @@ unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 | unsigned int pci_early_dump_regs; static int pci_bf_sort; -static int smbios_type_b1_flag; int pci_routeirq; int noioapicquirk; #ifdef CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS @@ -75,8 +74,8 @@ struct pci_ops pci_root_ops = { }; /* - * This interrupt-safe spinlock protects all accesses to PCI - * configuration space. + * This interrupt-safe spinlock protects all accesses to PCI configuration + * space, except for the mmconfig (ECAM) based operations. */ DEFINE_RAW_SPINLOCK(pci_config_lock); @@ -197,34 +196,18 @@ static int __init set_bf_sort(const struct dmi_system_id *d) static void __init read_dmi_type_b1(const struct dmi_header *dm, void *private_data) { - u8 *d = (u8 *)dm + 4; + u8 *data = (u8 *)dm + 4; if (dm->type != 0xB1) return; - switch (((*(u32 *)d) >> 9) & 0x03) { - case 0x00: - printk(KERN_INFO "dmi type 0xB1 record - unknown flag\n"); - break; - case 0x01: /* set pci=bfsort */ - smbios_type_b1_flag = 1; - break; - case 0x02: /* do not set pci=bfsort */ - smbios_type_b1_flag = 2; - break; - default: - break; - } + if ((((*(u32 *)data) >> 9) & 0x03) == 0x01) + set_bf_sort((const struct dmi_system_id *)private_data); } static int __init find_sort_method(const struct dmi_system_id *d) { - dmi_walk(read_dmi_type_b1, NULL); - - if (smbios_type_b1_flag == 1) { - set_bf_sort(d); - return 0; - } - return -1; + dmi_walk(read_dmi_type_b1, (void *)d); + return 0; } /* @@ -691,7 +674,7 @@ int pcibios_add_device(struct pci_dev *dev) pa_data = boot_params.hdr.setup_data; while (pa_data) { - data = ioremap(pa_data, sizeof(*rom)); + data = memremap(pa_data, sizeof(*rom), MEMREMAP_WB); if (!data) return -ENOMEM; @@ -710,7 +693,7 @@ int pcibios_add_device(struct pci_dev *dev) } } pa_data = data->next; - iounmap(data); + memunmap(data); } set_dma_domain_ops(dev); set_dev_domain_options(dev); diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c index 15460590b8c5..2d9503323d10 100644 --- a/arch/x86/pci/direct.c +++ b/arch/x86/pci/direct.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * direct.c - Low-level direct PCI config space access */ diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c index d1067d539bee..f0114007e915 100644 --- a/arch/x86/pci/early.c +++ b/arch/x86/pci/early.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/pci.h> #include <asm/pci-direct.h> diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 6d52b94f4bb9..4210da7b44de 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Exceptions for specific devices. Usually work-arounds for fatal design flaws. 
*/ @@ -571,3 +572,67 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, pci_invalid_bar); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6f60, pci_invalid_bar); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fa0, pci_invalid_bar); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, pci_invalid_bar); + +/* + * Device [1022:7808] + * 23. USB Wake on Connect/Disconnect with Low Speed Devices + * https://support.amd.com/TechDocs/46837.pdf + * Appendix A2 + * https://support.amd.com/TechDocs/42413.pdf + */ +static void pci_fixup_amd_ehci_pme(struct pci_dev *dev) +{ + dev_info(&dev->dev, "PME# does not work under D3, disabling it\n"); + dev->pme_support &= ~((PCI_PM_CAP_PME_D3 | PCI_PM_CAP_PME_D3cold) + >> PCI_PM_CAP_PME_SHIFT); +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x7808, pci_fixup_amd_ehci_pme); + +/* + * Apple MacBook Pro: Avoid [mem 0x7fa00000-0x7fbfffff] + * + * Using the [mem 0x7fa00000-0x7fbfffff] region, e.g., by assigning it to + * the 00:1c.0 Root Port, causes a conflict with [io 0x1804], which is used + * for soft poweroff and suspend-to-RAM. + * + * As far as we know, this is related to the address space, not to the Root + * Port itself. Attaching the quirk to the Root Port is a convenience, but + * it could probably also be a standalone DMI quirk. + * + * https://bugzilla.kernel.org/show_bug.cgi?id=103211 + */ +static void quirk_apple_mbp_poweroff(struct pci_dev *pdev) +{ + struct device *dev = &pdev->dev; + struct resource *res; + + if ((!dmi_match(DMI_PRODUCT_NAME, "MacBookPro11,4") && + !dmi_match(DMI_PRODUCT_NAME, "MacBookPro11,5")) || + pdev->bus->number != 0 || pdev->devfn != PCI_DEVFN(0x1c, 0)) + return; + + res = request_mem_region(0x7fa00000, 0x200000, + "MacBook Pro poweroff workaround"); + if (res) + dev_info(dev, "claimed %s %pR\n", res->name, res); + else + dev_info(dev, "can't work around MacBook Pro poweroff issue\n"); +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x8c10, quirk_apple_mbp_poweroff); + +/* + * VMD-enabled root ports will change the source ID for all messages + * to the VMD device. Rather than doing device matching with the source + * ID, the AER driver should traverse the child device tree, reading + * AER registers to find the faulting device. 
+ */ +static void quirk_no_aersid(struct pci_dev *pdev) +{ + /* VMD Domain */ + if (is_vmd(pdev->bus)) + pdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_AERSID; +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2031, quirk_no_aersid); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2032, quirk_no_aersid); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2033, quirk_no_aersid); diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 7b4307163eac..ed4ac215305d 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Low-Level PCI Access for i386 machines * diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c index adb62aaa7ecd..5fc617edf108 100644 --- a/arch/x86/pci/init.c +++ b/arch/x86/pci/init.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/pci.h> #include <linux/init.h> #include <asm/pci_x86.h> diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index 5a18aedcb341..1012a5f0f98d 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Intel MID PCI support * Copyright (c) 2008 Intel Corporation @@ -215,16 +216,23 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) struct irq_alloc_info info; int polarity; int ret; + u8 gsi; if (dev->irq_managed && dev->irq > 0) return 0; + ret = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi); + if (ret < 0) { + dev_warn(&dev->dev, "Failed to read interrupt line: %d\n", ret); + return ret; + } + switch (intel_mid_identify_cpu()) { case INTEL_MID_CPU_CHIP_TANGIER: polarity = IOAPIC_POL_HIGH; /* Special treatment for IRQ0 */ - if (dev->irq == 0) { + if (gsi == 0) { /* * Skip HS UART common registers device since it has * IRQ0 assigned and not used by the kernel. @@ -253,10 +261,11 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) * MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to * IOAPIC RTE entries, so we just enable RTE for the device. 
*/ - ret = mp_map_gsi_to_irq(dev->irq, IOAPIC_MAP_ALLOC, &info); + ret = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info); if (ret < 0) return ret; + dev->irq = ret; dev->irq_managed = 1; return 0; diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 9bd115484745..0452629148be 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Low-Level PCI Support for PC -- Routing of Interrupts * @@ -1092,7 +1093,7 @@ static int __init fix_acer_tm360_irqrouting(const struct dmi_system_id *d) return 0; } -static struct dmi_system_id __initdata pciirq_dmi_table[] = { +static const struct dmi_system_id pciirq_dmi_table[] __initconst = { { .callback = fix_broken_hp_bios_irq9, .ident = "HP Pavilion N5400 Series Laptop", diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c index ea6f3802c17b..1cb01abcb1be 100644 --- a/arch/x86/pci/legacy.c +++ b/arch/x86/pci/legacy.c @@ -24,12 +24,10 @@ static void pcibios_fixup_peer_bridges(void) int __init pci_legacy_init(void) { - if (!raw_pci_ops) { - printk("PCI: System does not support PCI\n"); - return 0; - } + if (!raw_pci_ops) + return 1; - printk("PCI: Probing PCI hardware\n"); + pr_info("PCI: Probing PCI hardware\n"); pcibios_scan_root(0); return 0; } @@ -46,7 +44,7 @@ void pcibios_scan_specific_bus(int busn) if (!raw_pci_read(0, busn, devfn, PCI_VENDOR_ID, 2, &l) && l != 0x0000 && l != 0xffff) { DBG("Found device at %02x:%02x [%04x]\n", busn, devfn, l); - printk(KERN_INFO "PCI: Discovered peer bus %02x\n", busn); + pr_info("PCI: Discovered peer bus %02x\n", busn); pcibios_scan_root(busn); return; } @@ -60,8 +58,12 @@ static int __init pci_subsys_init(void) * The init function returns an non zero value when * pci_legacy_init should be invoked. */ - if (x86_init.pci.init()) - pci_legacy_init(); + if (x86_init.pci.init()) { + if (pci_legacy_init()) { + pr_info("PCI: System does not support PCI\n"); + return -ENODEV; + } + } pcibios_fixup_peer_bridges(); x86_init.pci.init_irq(); diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index d1b47d5bc9c3..96684d0adcf9 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * mmconfig-shared.c - Low-level direct PCI config space access via * MMCONFIG - common code between i386 and x86-64. diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c index f1c1aa0430ae..887d181b769b 100644 --- a/arch/x86/pci/mmconfig_64.c +++ b/arch/x86/pci/mmconfig_64.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * mmconfig.c - Low-level direct PCI config space access via MMCONFIG * diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c index c1bdb9edcae7..9c97d814125e 100644 --- a/arch/x86/pci/pcbios.c +++ b/arch/x86/pci/pcbios.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * BIOS32 and PCI BIOS handling. */ @@ -46,7 +47,7 @@ static inline void set_bios_x(void) pcibios_enabled = 1; set_memory_x(PAGE_OFFSET + BIOS_BEGIN, (BIOS_END - BIOS_BEGIN) >> PAGE_SHIFT); if (__supported_pte_mask & _PAGE_NX) - printk(KERN_INFO "PCI : PCI BIOS area is rw and x. Use pci=nobios if you want it NX.\n"); + printk(KERN_INFO "PCI: PCI BIOS area is rw and x. 
Use pci=nobios if you want it NX.\n"); } /* diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c index ec008e800b45..53d600217973 100644 --- a/arch/x86/pci/sta2x11-fixup.c +++ b/arch/x86/pci/sta2x11-fixup.c @@ -26,6 +26,7 @@ #include <linux/pci_ids.h> #include <linux/export.h> #include <linux/list.h> +#include <asm/iommu.h> #define STA2X11_SWIOTLB_SIZE (4*1024*1024) extern int swiotlb_late_init_with_default_size(size_t default_size); @@ -191,7 +192,7 @@ static const struct dma_map_ops sta2x11_dma_ops = { .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, .sync_sg_for_device = swiotlb_sync_sg_for_device, .mapping_error = swiotlb_dma_mapping_error, - .dma_supported = NULL, /* FIXME: we should use this instead! */ + .dma_supported = x86_dma_supported, }; /* At setup time, we use our own ops if the device is a ConneXt one */ diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile index 184842ef332e..d0e835470d01 100644 --- a/arch/x86/platform/Makefile +++ b/arch/x86/platform/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # Platform specific code goes here obj-y += atom/ obj-y += ce4100/ diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile index f1d83b34c329..e4dc3862d423 100644 --- a/arch/x86/platform/efi/Makefile +++ b/arch/x86/platform/efi/Makefile @@ -1,4 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 OBJECT_FILES_NON_STANDARD_efi_thunk_$(BITS).o := y +OBJECT_FILES_NON_STANDARD_efi_stub_$(BITS).o := y obj-$(CONFIG_EFI) += quirks.o efi.o efi_$(BITS).o efi_stub_$(BITS).o obj-$(CONFIG_EARLY_PRINTK_EFI) += early_printk.o diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 43b96f5f78ba..9061babfbc83 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Common EFI (Extensible Firmware Interface) support functions * Based on Extensible Firmware Interface Specification version 1.0 @@ -1014,7 +1015,6 @@ static void __init __efi_enter_virtual_mode(void) * necessary relocation fixups for the new virtual addresses. 
*/ efi_runtime_update_mappings(); - efi_dump_pagetable(); /* clean DUMMY object */ efi_delete_dummy_variable(); @@ -1029,25 +1029,8 @@ void __init efi_enter_virtual_mode(void) kexec_enter_virtual_mode(); else __efi_enter_virtual_mode(); -} - -/* - * Convenience functions to obtain memory types and attributes - */ -u32 efi_mem_type(unsigned long phys_addr) -{ - efi_memory_desc_t *md; - if (!efi_enabled(EFI_MEMMAP)) - return 0; - - for_each_efi_memory_desc(md) { - if ((md->phys_addr <= phys_addr) && - (phys_addr < (md->phys_addr + - (md->num_pages << EFI_PAGE_SHIFT)))) - return md->type; - } - return 0; + efi_dump_pagetable(); } static int __init arch_parse_efi_cmdline(char *str) diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c index 3481268da3d0..324b93328b37 100644 --- a/arch/x86/platform/efi/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Extensible Firmware Interface * @@ -44,7 +45,14 @@ int __init efi_alloc_page_tables(void) } void efi_sync_low_kernel_mappings(void) {} -void __init efi_dump_pagetable(void) {} + +void __init efi_dump_pagetable(void) +{ +#ifdef CONFIG_EFI_PGT_DUMP + ptdump_walk_pgd_level(NULL, swapper_pg_dir); +#endif +} + int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) { return 0; diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index eb8dff15a7f6..20fb31579b69 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * x86_64 specific EFI support functions * Based on Extensible Firmware Interface Specification version 1.0 @@ -80,7 +81,7 @@ pgd_t * __init efi_call_phys_prolog(void) int n_pgds, i, j; if (!efi_enabled(EFI_OLD_MEMMAP)) { - save_pgd = (pgd_t *)read_cr3(); + save_pgd = (pgd_t *)__read_cr3(); write_cr3((unsigned long)efi_scratch.efi_pgt); goto out; } @@ -327,7 +328,7 @@ virt_to_phys_or_null_size(void *va, unsigned long size) int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) { - unsigned long pfn, text; + unsigned long pfn, text, pf; struct page *page; unsigned npages; pgd_t *pgd; @@ -335,7 +336,12 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) if (efi_enabled(EFI_OLD_MEMMAP)) return 0; - efi_scratch.efi_pgt = (pgd_t *)__pa(efi_pgd); + /* + * Since the PGD is encrypted, set the encryption mask so that when + * this value is loaded into cr3 the PGD will be decrypted during + * the pagetable walk. + */ + efi_scratch.efi_pgt = (pgd_t *)__sme_pa(efi_pgd); pgd = efi_pgd; /* @@ -345,7 +351,8 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) * phys_efi_set_virtual_address_map(). 
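With SME, both the CR3 value and the EFI page-table mappings in the hunk above need the memory-encryption bit. The snippet below only models the bit-OR composition (__sme_pa() and _PAGE_ENC analogues); the C-bit position is an assumption here, since the kernel derives sme_me_mask from CPUID at boot.

#include <stdint.h>
#include <stdio.h>

#define _PAGE_RW	(1ull << 1)
#define _PAGE_NX	(1ull << 63)

/* Assumed C-bit position for illustration only. */
static const uint64_t sme_me_mask = 1ull << 47;

/* Model of __sme_pa(): a physical address with the encryption bit set. */
static uint64_t sme_pa(uint64_t pa)
{
	return pa | sme_me_mask;
}

int main(void)
{
	uint64_t pf = _PAGE_NX | _PAGE_RW | sme_me_mask;	/* _PAGE_ENC analogue */

	printf("efi_pgt=%#llx, pf=%#llx\n",
	       (unsigned long long)sme_pa(0x1000000ull),
	       (unsigned long long)pf);
	return 0;
}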
*/ pfn = pa_memmap >> PAGE_SHIFT; - if (kernel_map_pages_in_pgd(pgd, pfn, pa_memmap, num_pages, _PAGE_NX | _PAGE_RW)) { + pf = _PAGE_NX | _PAGE_RW | _PAGE_ENC; + if (kernel_map_pages_in_pgd(pgd, pfn, pa_memmap, num_pages, pf)) { pr_err("Error ident-mapping new memmap (0x%lx)!\n", pa_memmap); return 1; } @@ -388,7 +395,8 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) text = __pa(_text); pfn = text >> PAGE_SHIFT; - if (kernel_map_pages_in_pgd(pgd, pfn, text, npages, _PAGE_RW)) { + pf = _PAGE_RW | _PAGE_ENC; + if (kernel_map_pages_in_pgd(pgd, pfn, text, npages, pf)) { pr_err("Failed to map kernel text 1:1\n"); return 1; } @@ -589,7 +597,10 @@ void __init efi_runtime_update_mappings(void) void __init efi_dump_pagetable(void) { #ifdef CONFIG_EFI_PGT_DUMP - ptdump_walk_pgd_level(NULL, efi_pgd); + if (efi_enabled(EFI_OLD_MEMMAP)) + ptdump_walk_pgd_level(NULL, swapper_pg_dir); + else + ptdump_walk_pgd_level(NULL, efi_pgd); #endif } @@ -646,7 +657,7 @@ efi_status_t efi_thunk_set_virtual_address_map( efi_sync_low_kernel_mappings(); local_irq_save(flags); - efi_scratch.prev_cr3 = read_cr3(); + efi_scratch.prev_cr3 = __read_cr3(); write_cr3((unsigned long)efi_scratch.efi_pgt); __flush_tlb_all(); diff --git a/arch/x86/platform/efi/efi_stub_32.S b/arch/x86/platform/efi/efi_stub_32.S index 040192b50d02..ab2e91e76894 100644 --- a/arch/x86/platform/efi/efi_stub_32.S +++ b/arch/x86/platform/efi/efi_stub_32.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * EFI call stub for IA32. * diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S index cd95075944ab..74628ec78f29 100644 --- a/arch/x86/platform/efi/efi_stub_64.S +++ b/arch/x86/platform/efi/efi_stub_64.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Function calling ABI conversion from Linux to EFI for x86_64 * diff --git a/arch/x86/platform/efi/efi_thunk_64.S b/arch/x86/platform/efi/efi_thunk_64.S index ff85d28c50f2..189b218da87c 100644 --- a/arch/x86/platform/efi/efi_thunk_64.S +++ b/arch/x86/platform/efi/efi_thunk_64.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2014 Intel Corporation; author Matt Fleming * diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index e0cf95a83f3f..8a99a2e96537 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -15,12 +15,66 @@ #include <asm/e820/api.h> #include <asm/efi.h> #include <asm/uv/uv.h> +#include <asm/cpu_device_id.h> #define EFI_MIN_RESERVE 5120 #define EFI_DUMMY_GUID \ EFI_GUID(0x4424ac57, 0xbe4b, 0x47dd, 0x9e, 0x97, 0xed, 0x50, 0xf0, 0x9f, 0x92, 0xa9) +#define QUARK_CSH_SIGNATURE 0x5f435348 /* _CSH */ +#define QUARK_SECURITY_HEADER_SIZE 0x400 + +/* + * Header prepended to the standard EFI capsule on Quark systems the are based + * on Intel firmware BSP. + * @csh_signature: Unique identifier to sanity check signed module + * presence ("_CSH"). + * @version: Current version of CSH used. Should be one for Quark A0. + * @modulesize: Size of the entire module including the module header + * and payload. + * @security_version_number_index: Index of SVN to use for validation of signed + * module. + * @security_version_number: Used to prevent against roll back of modules. + * @rsvd_module_id: Currently unused for Clanton (Quark). + * @rsvd_module_vendor: Vendor Identifier. For Intel products value is + * 0x00008086. + * @rsvd_date: BCD representation of build date as yyyymmdd, where + * yyyy=4 digit year, mm=1-12, dd=1-31. 
+ * @headersize: Total length of the header including including any + * padding optionally added by the signing tool. + * @hash_algo: What Hash is used in the module signing. + * @cryp_algo: What Crypto is used in the module signing. + * @keysize: Total length of the key data including including any + * padding optionally added by the signing tool. + * @signaturesize: Total length of the signature including including any + * padding optionally added by the signing tool. + * @rsvd_next_header: 32-bit pointer to the next Secure Boot Module in the + * chain, if there is a next header. + * @rsvd: Reserved, padding structure to required size. + * + * See also QuartSecurityHeader_t in + * Quark_EDKII_v1.2.1.1/QuarkPlatformPkg/Include/QuarkBootRom.h + * from https://downloadcenter.intel.com/download/23197/Intel-Quark-SoC-X1000-Board-Support-Package-BSP + */ +struct quark_security_header { + u32 csh_signature; + u32 version; + u32 modulesize; + u32 security_version_number_index; + u32 security_version_number; + u32 rsvd_module_id; + u32 rsvd_module_vendor; + u32 rsvd_date; + u32 headersize; + u32 hash_algo; + u32 cryp_algo; + u32 keysize; + u32 signaturesize; + u32 rsvd_next_header; + u32 rsvd[2]; +}; + static efi_char16_t efi_dummy_name[6] = { 'D', 'U', 'M', 'M', 'Y', 0 }; static bool efi_no_storage_paranoia; @@ -504,3 +558,86 @@ bool efi_poweroff_required(void) { return acpi_gbl_reduced_hardware || acpi_no_s5; } + +#ifdef CONFIG_EFI_CAPSULE_QUIRK_QUARK_CSH + +static int qrk_capsule_setup_info(struct capsule_info *cap_info, void **pkbuff, + size_t hdr_bytes) +{ + struct quark_security_header *csh = *pkbuff; + + /* Only process data block that is larger than the security header */ + if (hdr_bytes < sizeof(struct quark_security_header)) + return 0; + + if (csh->csh_signature != QUARK_CSH_SIGNATURE || + csh->headersize != QUARK_SECURITY_HEADER_SIZE) + return 1; + + /* Only process data block if EFI header is included */ + if (hdr_bytes < QUARK_SECURITY_HEADER_SIZE + + sizeof(efi_capsule_header_t)) + return 0; + + pr_debug("Quark security header detected\n"); + + if (csh->rsvd_next_header != 0) { + pr_err("multiple Quark security headers not supported\n"); + return -EINVAL; + } + + *pkbuff += csh->headersize; + cap_info->total_size = csh->headersize; + + /* + * Update the first page pointer to skip over the CSH header. 
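A minimal userspace model of the setup-quirk contract that qrk_capsule_setup_info() follows (and that the comment further down in this hunk spells out): return a positive value to continue after advancing the buffer past the vendor header, 0 when more header bytes are still needed, or a negative error. The names and the fixed header size below are stand-ins for the Quark-specific ones.

#include <stddef.h>
#include <stdio.h>

struct capsule_info {
	size_t total_size;
};

#define VENDOR_HDR_SIZE 0x400	/* stand-in for QUARK_SECURITY_HEADER_SIZE */

/* > 0: vendor header parsed, *pkbuff advanced past it
 *   0: not enough bytes buffered yet, caller retries with more data
 * < 0: hard error */
static int vendor_quirk(struct capsule_info *cap, void **pkbuff, size_t hdr_bytes)
{
	if (hdr_bytes < VENDOR_HDR_SIZE)
		return 0;
	*pkbuff = (char *)*pkbuff + VENDOR_HDR_SIZE;
	cap->total_size = VENDOR_HDR_SIZE;
	return 1;
}

int main(void)
{
	char buf[VENDOR_HDR_SIZE + 64] = { 0 };
	void *kbuff = buf;
	struct capsule_info cap = { 0 };
	int ret = vendor_quirk(&cap, &kbuff, sizeof(buf));

	printf("ret=%d, skipped=%td, total_size=%zu\n",
	       ret, (char *)kbuff - buf, cap.total_size);
	return 0;
}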
+ */ + cap_info->pages[0] += csh->headersize; + + return 1; +} + +#define ICPU(family, model, quirk_handler) \ + { X86_VENDOR_INTEL, family, model, X86_FEATURE_ANY, \ + (unsigned long)&quirk_handler } + +static const struct x86_cpu_id efi_capsule_quirk_ids[] = { + ICPU(5, 9, qrk_capsule_setup_info), /* Intel Quark X1000 */ + { } +}; + +int efi_capsule_setup_info(struct capsule_info *cap_info, void *kbuff, + size_t hdr_bytes) +{ + int (*quirk_handler)(struct capsule_info *, void **, size_t); + const struct x86_cpu_id *id; + int ret; + + if (hdr_bytes < sizeof(efi_capsule_header_t)) + return 0; + + cap_info->total_size = 0; + + id = x86_match_cpu(efi_capsule_quirk_ids); + if (id) { + /* + * The quirk handler is supposed to return + * - a value > 0 if the setup should continue, after advancing + * kbuff as needed + * - 0 if not enough hdr_bytes are available yet + * - a negative error code otherwise + */ + quirk_handler = (typeof(quirk_handler))id->driver_data; + ret = quirk_handler(cap_info, &kbuff, hdr_bytes); + if (ret <= 0) + return ret; + } + + memcpy(&cap_info->header, kbuff, sizeof(cap_info->header)); + + cap_info->total_size += cap_info->header.imagesize; + + return __efi_capsule_setup_info(cap_info); +} + +#endif diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile index 53e0235e308f..480fed21cc7d 100644 --- a/arch/x86/platform/intel-mid/device_libs/Makefile +++ b/arch/x86/platform/intel-mid/device_libs/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # Family-Level Interface Shim (FLIS) obj-$(subst m,y,$(CONFIG_PINCTRL_MERRIFIELD)) += platform_mrfld_pinctrl.o # SDHCI Devices diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bt.c b/arch/x86/platform/intel-mid/device_libs/platform_bt.c index 5a0483e7bf66..dc036e511f48 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_bt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_bt.c @@ -60,7 +60,7 @@ static int __init tng_bt_sfi_setup(struct bt_sfi_data *ddata) return 0; } -static struct bt_sfi_data tng_bt_sfi_data __initdata = { +static const struct bt_sfi_data tng_bt_sfi_data __initdata = { .setup = tng_bt_sfi_setup, }; diff --git a/arch/x86/platform/intel-mid/device_libs/platform_max7315.c b/arch/x86/platform/intel-mid/device_libs/platform_max7315.c index 6e075afa7877..58337b2bc682 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_max7315.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_max7315.c @@ -38,8 +38,10 @@ static void __init *max7315_platform_data(void *info) */ strcpy(i2c_info->type, "max7315"); if (nr++) { - sprintf(base_pin_name, "max7315_%d_base", nr); - sprintf(intr_pin_name, "max7315_%d_int", nr); + snprintf(base_pin_name, sizeof(base_pin_name), + "max7315_%d_base", nr); + snprintf(intr_pin_name, sizeof(intr_pin_name), + "max7315_%d_int", nr); } else { strcpy(base_pin_name, "max7315_base"); strcpy(intr_pin_name, "max7315_int"); diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c index 9e304e2ea4f5..4f5fa65a1011 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c @@ -30,13 +30,13 @@ static int tangier_probe(struct platform_device *pdev) { struct irq_alloc_info info; struct intel_mid_wdt_pdata *pdata = pdev->dev.platform_data; - int gsi, irq; + int gsi = TANGIER_EXT_TIMER0_MSI; + int irq; if (!pdata) return -EINVAL; /* IOAPIC builds 
identity mapping between GSI and IRQ on MID */ - gsi = pdata->irq; ioapic_set_alloc_attr(&info, cpu_to_node(0), 1, 0); irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info); if (irq < 0) { @@ -44,11 +44,11 @@ static int tangier_probe(struct platform_device *pdev) return irq; } + pdata->irq = irq; return 0; } static struct intel_mid_wdt_pdata tangier_pdata = { - .irq = TANGIER_EXT_TIMER0_MSI, .probe = tangier_probe, }; diff --git a/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c b/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c index b1526b95fd43..2905376559f1 100644 --- a/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c +++ b/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c @@ -11,7 +11,7 @@ */ #include <linux/gpio.h> -#include <linux/i2c/tc35876x.h> +#include <linux/platform_data/tc35876x.h> #include <asm/intel-mid.h> /*tc35876x DSI_LVDS bridge chip and panel platform data*/ diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c index 12a272582cdc..86676cec99a1 100644 --- a/arch/x86/platform/intel-mid/intel-mid.c +++ b/arch/x86/platform/intel-mid/intel-mid.c @@ -183,6 +183,7 @@ void __init x86_intel_mid_early_setup(void) x86_init.timers.timer_init = intel_mid_time_init; x86_init.timers.setup_percpu_clockev = x86_init_noop; + x86_init.timers.wallclock_init = intel_mid_rtc_init; x86_init.irqs.pre_vector_init = x86_init_noop; @@ -191,7 +192,6 @@ void __init x86_intel_mid_early_setup(void) x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock; x86_platform.calibrate_tsc = intel_mid_calibrate_tsc; - x86_init.timers.wallclock_init = intel_mid_rtc_init; x86_platform.get_nmi_reason = intel_mid_get_nmi_reason; x86_init.pci.init = intel_mid_pci_init; diff --git a/arch/x86/platform/intel-mid/pwr.c b/arch/x86/platform/intel-mid/pwr.c index ef03852ea6e8..49ec5b94c71f 100644 --- a/arch/x86/platform/intel-mid/pwr.c +++ b/arch/x86/platform/intel-mid/pwr.c @@ -444,7 +444,7 @@ static int mid_set_initial_state(struct mid_pwr *pwr, const u32 *states) static int pnw_set_initial_state(struct mid_pwr *pwr) { /* On Penwell SRAM must stay powered on */ - const u32 states[] = { + static const u32 states[] = { 0xf00fffff, /* PM_SSC(0) */ 0xffffffff, /* PM_SSC(1) */ 0xffffffff, /* PM_SSC(2) */ @@ -455,7 +455,7 @@ static int pnw_set_initial_state(struct mid_pwr *pwr) static int tng_set_initial_state(struct mid_pwr *pwr) { - const u32 states[] = { + static const u32 states[] = { 0xffffffff, /* PM_SSC(0) */ 0xffffffff, /* PM_SSC(1) */ 0xffffffff, /* PM_SSC(2) */ diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c index b8f562049cad..42f879b75f9b 100644 --- a/arch/x86/platform/intel-quark/imr_selftest.c +++ b/arch/x86/platform/intel-quark/imr_selftest.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /** * imr_selftest.c -- Intel Isolated Memory Region self-test driver * diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile index fd332c533947..049f92a9379d 100644 --- a/arch/x86/platform/olpc/Makefile +++ b/arch/x86/platform/olpc/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_OLPC) += olpc.o olpc_ofw.o olpc_dt.o obj-$(CONFIG_OLPC_XO1_PM) += olpc-xo1-pm.o xo1-wakeup.o obj-$(CONFIG_OLPC_XO1_RTC) += olpc-xo1-rtc.o diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c index c5350fd27d70..0668aaff8bfe 100644 --- a/arch/x86/platform/olpc/olpc-xo1-pm.c +++ 
b/arch/x86/platform/olpc/olpc-xo1-pm.c @@ -77,7 +77,7 @@ static int xo1_power_state_enter(suspend_state_t pm_state) asmlinkage __visible int xo1_do_sleep(u8 sleep_state) { - void *pgd_addr = __va(read_cr3()); + void *pgd_addr = __va(read_cr3_pa()); /* Program wakeup mask (using dword access to CS5536_PM1_EN) */ outl(wakeup_mask << 16, acpi_base + CS5536_PM1_STS); diff --git a/arch/x86/platform/olpc/xo1-wakeup.S b/arch/x86/platform/olpc/xo1-wakeup.S index 948deb289753..5fee3a2c2fd4 100644 --- a/arch/x86/platform/olpc/xo1-wakeup.S +++ b/arch/x86/platform/olpc/xo1-wakeup.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ .text #include <linux/linkage.h> #include <asm/segment.h> diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 42e65fee5673..f44c0bc95aa2 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -26,7 +26,7 @@ static struct bau_operations ops __ro_after_init; /* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */ -static int timeout_base_ns[] = { +static const int timeout_base_ns[] = { 20, 160, 1280, @@ -40,7 +40,6 @@ static int timeout_base_ns[] = { static int timeout_us; static bool nobau = true; static int nobau_perm; -static cycles_t congested_cycles; /* tunables: */ static int max_concurr = MAX_BAU_CONCURRENT; @@ -456,12 +455,13 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp) */ static inline unsigned long long cycles_2_ns(unsigned long long cyc) { - struct cyc2ns_data *data = cyc2ns_read_begin(); + struct cyc2ns_data data; unsigned long long ns; - ns = mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift); + cyc2ns_read_begin(&data); + ns = mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift); + cyc2ns_read_end(); - cyc2ns_read_end(data); return ns; } @@ -470,12 +470,13 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc) */ static inline unsigned long long ns_2_cycles(unsigned long long ns) { - struct cyc2ns_data *data = cyc2ns_read_begin(); + struct cyc2ns_data data; unsigned long long cyc; - cyc = (ns << data->cyc2ns_shift) / data->cyc2ns_mul; + cyc2ns_read_begin(&data); + cyc = (ns << data.cyc2ns_shift) / data.cyc2ns_mul; + cyc2ns_read_end(); - cyc2ns_read_end(data); return cyc; } @@ -586,31 +587,11 @@ static unsigned long uv2_3_read_status(unsigned long offset, int rshft, int desc } /* - * Return whether the status of the descriptor that is normally used for this - * cpu (the one indexed by its hub-relative cpu number) is busy. - * The status of the original 32 descriptors is always reflected in the 64 - * bits of UVH_LB_BAU_SB_ACTIVATION_STATUS_0. - * The bit provided by the activation_status_2 register is irrelevant to - * the status if it is only being tested for busy or not busy. - */ -int normal_busy(struct bau_control *bcp) -{ - int cpu = bcp->uvhub_cpu; - int mmr_offset; - int right_shift; - - mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; - right_shift = cpu * UV_ACT_STATUS_SIZE; - return (((((read_lmmr(mmr_offset) >> right_shift) & - UV_ACT_STATUS_MASK)) << 1) == UV2H_DESC_BUSY); -} - -/* * Entered when a bau descriptor has gone into a permanent busy wait because * of a hardware bug. * Workaround the bug. 
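The tlb_uv.c hunks above switch cyc2ns_read_begin() from handing back a pointer to filling a caller-owned snapshot. Below is a self-contained model of the resulting cycles-to-nanoseconds conversion; the seqlock protocol around the snapshot is omitted, and the mul/shift pair is made up for a hypothetical 2.5 GHz TSC.

#include <stdint.h>
#include <stdio.h>

/* Caller-owned snapshot, mirroring the shape used after the API change. */
struct cyc2ns_data {
	uint32_t cyc2ns_mul;
	uint32_t cyc2ns_shift;
};

static uint64_t mul_u64_u32_shr(uint64_t a, uint32_t mul, unsigned int shift)
{
	return (uint64_t)((__uint128_t)a * mul >> shift);
}

static uint64_t cycles_2_ns(uint64_t cyc, const struct cyc2ns_data *d)
{
	return mul_u64_u32_shr(cyc, d->cyc2ns_mul, d->cyc2ns_shift);
}

int main(void)
{
	/* Hypothetical 2.5 GHz TSC: ns = cyc * 0.4, encoded as mul / 2^shift. */
	struct cyc2ns_data d = { .cyc2ns_mul = 419430, .cyc2ns_shift = 20 };

	printf("%llu cycles ~= %llu ns\n", 2500000000ull,
	       (unsigned long long)cycles_2_ns(2500000000ull, &d));
	return 0;
}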
*/ -int handle_uv2_busy(struct bau_control *bcp) +static int handle_uv2_busy(struct bau_control *bcp) { struct ptc_stats *stat = bcp->statp; @@ -847,10 +828,10 @@ static void record_send_stats(cycles_t time1, cycles_t time2, if ((completion_status == FLUSH_COMPLETE) && (try == 1)) { bcp->period_requests++; bcp->period_time += elapsed; - if ((elapsed > congested_cycles) && + if ((elapsed > usec_2_cycles(bcp->cong_response_us)) && (bcp->period_requests > bcp->cong_reps) && ((bcp->period_time / bcp->period_requests) > - congested_cycles)) { + usec_2_cycles(bcp->cong_response_us))) { stat->s_congested++; disable_for_period(bcp, stat); } @@ -915,8 +896,9 @@ static void handle_cmplt(int completion_status, struct bau_desc *bau_desc, * Returns 1 if it gives up entirely and the original cpu mask is to be * returned to the kernel. */ -int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp, - struct bau_desc *bau_desc) +static int uv_flush_send_and_wait(struct cpumask *flush_mask, + struct bau_control *bcp, + struct bau_desc *bau_desc) { int seq_number = 0; int completion_stat = 0; @@ -1121,11 +1103,9 @@ static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp, * done. The returned pointer is valid till preemption is re-enabled. */ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, - struct mm_struct *mm, - unsigned long start, - unsigned long end, - unsigned int cpu) + const struct flush_tlb_info *info) { + unsigned int cpu = smp_processor_id(); int locals = 0, remotes = 0, hubs = 0; struct bau_desc *bau_desc; struct cpumask *flush_mask; @@ -1179,8 +1159,8 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, record_send_statistics(stat, locals, hubs, remotes, bau_desc); - if (!end || (end - start) <= PAGE_SIZE) - address = start; + if (!info->end || (info->end - info->start) <= PAGE_SIZE) + address = info->start; else address = TLB_FLUSH_ALL; @@ -1212,8 +1192,8 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, * Search the message queue for any 'other' unprocessed message with the * same software acknowledge resource bit vector as the 'msg' message. */ -struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg, - struct bau_control *bcp) +static struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg, + struct bau_control *bcp) { struct bau_pq_entry *msg_next = msg + 1; unsigned char swack_vec = msg->swack_vec; @@ -1236,7 +1216,7 @@ struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg, * set a bit in the UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE register. * Such a message must be ignored. 
*/ -void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp) +static void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp) { unsigned long mmr_image; unsigned char swack_vec; @@ -2241,14 +2221,17 @@ static int __init uv_bau_init(void) else if (is_uv1_hub()) ops = uv1_bau_ops; + nuvhubs = uv_num_possible_blades(); + if (nuvhubs < 2) { + pr_crit("UV: BAU disabled - insufficient hub count\n"); + goto err_bau_disable; + } + for_each_possible_cpu(cur_cpu) { mask = &per_cpu(uv_flush_tlb_mask, cur_cpu); zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu)); } - nuvhubs = uv_num_possible_blades(); - congested_cycles = usec_2_cycles(congested_respns_us); - uv_base_pnode = 0x7fffffff; for (uvhub = 0; uvhub < nuvhubs; uvhub++) { cpus = uv_blade_nr_possible_cpus(uvhub); @@ -2261,9 +2244,8 @@ static int __init uv_bau_init(void) enable_timeouts(); if (init_per_cpu(nuvhubs, uv_base_pnode)) { - set_bau_off(); - nobau_perm = 1; - return 0; + pr_crit("UV: BAU disabled - per CPU init failed\n"); + goto err_bau_disable; } vector = UV_BAU_MESSAGE; @@ -2289,6 +2271,16 @@ static int __init uv_bau_init(void) } return 0; + +err_bau_disable: + + for_each_possible_cpu(cur_cpu) + free_cpumask_var(per_cpu(uv_flush_tlb_mask, cur_cpu)); + + set_bau_off(); + nobau_perm = 1; + + return -EINVAL; } core_initcall(uv_bau_init); fs_initcall(uv_ptc_init); diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index 776c6592136c..03fc397335b7 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -160,13 +160,21 @@ static struct irq_domain *uv_get_irq_domain(void) { static struct irq_domain *uv_domain; static DEFINE_MUTEX(uv_lock); + struct fwnode_handle *fn; mutex_lock(&uv_lock); - if (uv_domain == NULL) { - uv_domain = irq_domain_add_tree(NULL, &uv_domain_ops, NULL); - if (uv_domain) - uv_domain->parent = x86_vector_domain; - } + if (uv_domain) + goto out; + + fn = irq_domain_alloc_named_fwnode("UV-CORE"); + if (!fn) + goto out; + + uv_domain = irq_domain_create_tree(fn, &uv_domain_ops, NULL); + irq_domain_free_fwnode(fn); + if (uv_domain) + uv_domain->parent = x86_vector_domain; +out: mutex_unlock(&uv_lock); return uv_domain; diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile index a6a198c33623..a4701389562c 100644 --- a/arch/x86/power/Makefile +++ b/arch/x86/power/Makefile @@ -1,3 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 +OBJECT_FILES_NON_STANDARD_hibernate_asm_$(BITS).o := y + # __restore_processor_state() restores %gs after S3 resume and so should not # itself be stack-protected nostackp := $(call cc-option, -fno-stack-protector) diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 6b05a9219ea2..84fcfde53f8f 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -129,7 +129,7 @@ static void __save_processor_state(struct saved_context *ctxt) */ ctxt->cr0 = read_cr0(); ctxt->cr2 = read_cr2(); - ctxt->cr3 = read_cr3(); + ctxt->cr3 = __read_cr3(); ctxt->cr4 = __read_cr4(); #ifdef CONFIG_X86_64 ctxt->cr8 = read_cr8(); @@ -181,6 +181,7 @@ static void fix_processor_context(void) #endif load_TR_desc(); /* This does ltr */ load_mm_ldt(current->active_mm); /* This does lldt */ + initialize_tlbstate_and_flush(); fpu__resume_cpu(); @@ -427,7 +428,7 @@ static int msr_initialize_bdw(const struct dmi_system_id *d) return msr_init_context(bdw_msr_id, ARRAY_SIZE(bdw_msr_id)); } -static struct dmi_system_id msr_save_dmi_table[] = { +static const struct dmi_system_id msr_save_dmi_table[] = { { .callback = 
msr_initialize_bdw, .ident = "BROADWELL BDX_EP", diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index a6e21fee22ea..f910c514438f 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -147,10 +147,11 @@ static int relocate_restore_code(void) if (!relocated_restore_code) return -ENOMEM; - memcpy((void *)relocated_restore_code, &core_restore_code, PAGE_SIZE); + memcpy((void *)relocated_restore_code, core_restore_code, PAGE_SIZE); /* Make the page containing the relocated code executable */ - pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code); + pgd = (pgd_t *)__va(read_cr3_pa()) + + pgd_index(relocated_restore_code); p4d = p4d_offset(pgd, relocated_restore_code); if (p4d_large(*p4d)) { set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX)); @@ -292,9 +293,28 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size) if (max_size < sizeof(struct restore_data_record)) return -EOVERFLOW; - rdr->jump_address = (unsigned long)&restore_registers; - rdr->jump_address_phys = __pa_symbol(&restore_registers); - rdr->cr3 = restore_cr3; + rdr->jump_address = (unsigned long)restore_registers; + rdr->jump_address_phys = __pa_symbol(restore_registers); + + /* + * The restore code fixes up CR3 and CR4 in the following sequence: + * + * [in hibernation asm] + * 1. CR3 <= temporary page tables + * 2. CR4 <= mmu_cr4_features (from the kernel that restores us) + * 3. CR3 <= rdr->cr3 + * 4. CR4 <= mmu_cr4_features (from us, i.e. the image kernel) + * [in restore_processor_state()] + * 5. CR4 <= saved CR4 + * 6. CR3 <= saved CR3 + * + * Our mmu_cr4_features has CR4.PCIDE=0, and toggling + * CR4.PCIDE while CR3's PCID bits are nonzero is illegal, so + * rdr->cr3 needs to point to valid page tables but must not + * have any of the PCID bits set. + */ + rdr->cr3 = restore_cr3 & ~CR3_PCID_MASK; + rdr->magic = RESTORE_MAGIC; hibernation_e820_save(rdr->e820_digest); diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S index 1d0fa0e24070..6e56815e13a0 100644 --- a/arch/x86/power/hibernate_asm_32.S +++ b/arch/x86/power/hibernate_asm_32.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * This may not use any stack, nor any variable that is not "NoSave": * diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 7dbdb780264d..2f15a2ac4209 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 OBJECT_FILES_NON_STANDARD := y purgatory-y := purgatory.o stack.o setup-x86_$(BITS).o sha256.o entry64.o string.o diff --git a/arch/x86/ras/Kconfig b/arch/x86/ras/Kconfig index 2a2d89d39af6..a9c3db125222 100644 --- a/arch/x86/ras/Kconfig +++ b/arch/x86/ras/Kconfig @@ -1,13 +1,4 @@ -config MCE_AMD_INJ - tristate "Simple MCE injection interface for AMD processors" - depends on RAS && X86_MCE && DEBUG_FS && AMD_NB - default n - help - This is a simple debugfs interface to inject MCEs and test different - aspects of the MCE handling code. - - WARNING: Do not even assume this interface is staying stable! - +# SPDX-License-Identifier: GPL-2.0 config RAS_CEC bool "Correctable Errors Collector" depends on X86_MCE && MEMORY_FAILURE && DEBUG_FS @@ -20,4 +11,3 @@ config RAS_CEC Bear in mind that this is absolutely useless if your platform doesn't have ECC DIMMs and doesn't have DRAM ECC checking enabled in the BIOS. 
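The hibernate_64.c comment above explains why rdr->cr3 must carry no PCID bits. The snippet below only demonstrates that masking, assuming CR3_PCID_MASK covers CR3 bits 11:0 as on x86-64, with a made-up CR3 value.

#include <stdint.h>
#include <stdio.h>

/* Assumes the PCID occupies CR3 bits 11:0, i.e. CR3_PCID_MASK == 0xFFF. */
#define CR3_PCID_MASK 0xFFFull

int main(void)
{
	uint64_t restore_cr3 = 0x1234005ull;	/* hypothetical: PGD at 0x1234000, PCID 5 */
	uint64_t rdr_cr3 = restore_cr3 & ~CR3_PCID_MASK;

	/* Page-table base kept, PCID bits cleared, so CR4.PCIDE can be
	 * toggled while this CR3 is loaded without breaking the rule the
	 * comment describes. */
	printf("cr3 %#llx -> %#llx\n",
	       (unsigned long long)restore_cr3, (unsigned long long)rdr_cr3);
	return 0;
}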
- diff --git a/arch/x86/ras/Makefile b/arch/x86/ras/Makefile deleted file mode 100644 index 5f94546db280..000000000000 --- a/arch/x86/ras/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -obj-$(CONFIG_MCE_AMD_INJ) += mce_amd_inj.o - diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c deleted file mode 100644 index 8730c2882fff..000000000000 --- a/arch/x86/ras/mce_amd_inj.c +++ /dev/null @@ -1,492 +0,0 @@ -/* - * A simple MCE injection facility for testing different aspects of the RAS - * code. This driver should be built as module so that it can be loaded - * on production kernels for testing purposes. - * - * This file may be distributed under the terms of the GNU General Public - * License version 2. - * - * Copyright (c) 2010-15: Borislav Petkov <bp@alien8.de> - * Advanced Micro Devices Inc. - */ - -#include <linux/kobject.h> -#include <linux/debugfs.h> -#include <linux/device.h> -#include <linux/module.h> -#include <linux/cpu.h> -#include <linux/string.h> -#include <linux/uaccess.h> -#include <linux/pci.h> - -#include <asm/mce.h> -#include <asm/smp.h> -#include <asm/amd_nb.h> -#include <asm/irq_vectors.h> - -#include "../kernel/cpu/mcheck/mce-internal.h" - -/* - * Collect all the MCi_XXX settings - */ -static struct mce i_mce; -static struct dentry *dfs_inj; - -static u8 n_banks; - -#define MAX_FLAG_OPT_SIZE 3 -#define NBCFG 0x44 - -enum injection_type { - SW_INJ = 0, /* SW injection, simply decode the error */ - HW_INJ, /* Trigger a #MC */ - DFR_INT_INJ, /* Trigger Deferred error interrupt */ - THR_INT_INJ, /* Trigger threshold interrupt */ - N_INJ_TYPES, -}; - -static const char * const flags_options[] = { - [SW_INJ] = "sw", - [HW_INJ] = "hw", - [DFR_INT_INJ] = "df", - [THR_INT_INJ] = "th", - NULL -}; - -/* Set default injection to SW_INJ */ -static enum injection_type inj_type = SW_INJ; - -#define MCE_INJECT_SET(reg) \ -static int inj_##reg##_set(void *data, u64 val) \ -{ \ - struct mce *m = (struct mce *)data; \ - \ - m->reg = val; \ - return 0; \ -} - -MCE_INJECT_SET(status); -MCE_INJECT_SET(misc); -MCE_INJECT_SET(addr); -MCE_INJECT_SET(synd); - -#define MCE_INJECT_GET(reg) \ -static int inj_##reg##_get(void *data, u64 *val) \ -{ \ - struct mce *m = (struct mce *)data; \ - \ - *val = m->reg; \ - return 0; \ -} - -MCE_INJECT_GET(status); -MCE_INJECT_GET(misc); -MCE_INJECT_GET(addr); -MCE_INJECT_GET(synd); - -DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n"); -DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n"); -DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n"); -DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n"); - -/* - * Caller needs to be make sure this cpu doesn't disappear - * from under us, i.e.: get_cpu/put_cpu. - */ -static int toggle_hw_mce_inject(unsigned int cpu, bool enable) -{ - u32 l, h; - int err; - - err = rdmsr_on_cpu(cpu, MSR_K7_HWCR, &l, &h); - if (err) { - pr_err("%s: error reading HWCR\n", __func__); - return err; - } - - enable ? 
(l |= BIT(18)) : (l &= ~BIT(18)); - - err = wrmsr_on_cpu(cpu, MSR_K7_HWCR, l, h); - if (err) - pr_err("%s: error writing HWCR\n", __func__); - - return err; -} - -static int __set_inj(const char *buf) -{ - int i; - - for (i = 0; i < N_INJ_TYPES; i++) { - if (!strncmp(flags_options[i], buf, strlen(flags_options[i]))) { - inj_type = i; - return 0; - } - } - return -EINVAL; -} - -static ssize_t flags_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[MAX_FLAG_OPT_SIZE]; - int n; - - n = sprintf(buf, "%s\n", flags_options[inj_type]); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, n); -} - -static ssize_t flags_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[MAX_FLAG_OPT_SIZE], *__buf; - int err; - - if (cnt > MAX_FLAG_OPT_SIZE) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt - 1] = 0; - - /* strip whitespace */ - __buf = strstrip(buf); - - err = __set_inj(__buf); - if (err) { - pr_err("%s: Invalid flags value: %s\n", __func__, __buf); - return err; - } - - *ppos += cnt; - - return cnt; -} - -static const struct file_operations flags_fops = { - .read = flags_read, - .write = flags_write, - .llseek = generic_file_llseek, -}; - -/* - * On which CPU to inject? - */ -MCE_INJECT_GET(extcpu); - -static int inj_extcpu_set(void *data, u64 val) -{ - struct mce *m = (struct mce *)data; - - if (val >= nr_cpu_ids || !cpu_online(val)) { - pr_err("%s: Invalid CPU: %llu\n", __func__, val); - return -EINVAL; - } - m->extcpu = val; - return 0; -} - -DEFINE_SIMPLE_ATTRIBUTE(extcpu_fops, inj_extcpu_get, inj_extcpu_set, "%llu\n"); - -static void trigger_mce(void *info) -{ - asm volatile("int $18"); -} - -static void trigger_dfr_int(void *info) -{ - asm volatile("int %0" :: "i" (DEFERRED_ERROR_VECTOR)); -} - -static void trigger_thr_int(void *info) -{ - asm volatile("int %0" :: "i" (THRESHOLD_APIC_VECTOR)); -} - -static u32 get_nbc_for_node(int node_id) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - u32 cores_per_node; - - cores_per_node = (c->x86_max_cores * smp_num_siblings) / amd_get_nodes_per_socket(); - - return cores_per_node * node_id; -} - -static void toggle_nb_mca_mst_cpu(u16 nid) -{ - struct pci_dev *F3 = node_to_amd_nb(nid)->misc; - u32 val; - int err; - - if (!F3) - return; - - err = pci_read_config_dword(F3, NBCFG, &val); - if (err) { - pr_err("%s: Error reading F%dx%03x.\n", - __func__, PCI_FUNC(F3->devfn), NBCFG); - return; - } - - if (val & BIT(27)) - return; - - pr_err("%s: Set D18F3x44[NbMcaToMstCpuEn] which BIOS hasn't done.\n", - __func__); - - val |= BIT(27); - err = pci_write_config_dword(F3, NBCFG, val); - if (err) - pr_err("%s: Error writing F%dx%03x.\n", - __func__, PCI_FUNC(F3->devfn), NBCFG); -} - -static void prepare_msrs(void *info) -{ - struct mce m = *(struct mce *)info; - u8 b = m.bank; - - wrmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); - - if (boot_cpu_has(X86_FEATURE_SMCA)) { - if (m.inject_flags == DFR_INT_INJ) { - wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status); - wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr); - } else { - wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), m.status); - wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr); - } - - wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc); - wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd); - } else { - wrmsrl(MSR_IA32_MCx_STATUS(b), m.status); - wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr); - wrmsrl(MSR_IA32_MCx_MISC(b), m.misc); - } -} - -static void do_inject(void) -{ - u64 mcg_status = 0; - unsigned int cpu = i_mce.extcpu; - u8 b = 
i_mce.bank; - - rdtscll(i_mce.tsc); - - if (i_mce.misc) - i_mce.status |= MCI_STATUS_MISCV; - - if (i_mce.synd) - i_mce.status |= MCI_STATUS_SYNDV; - - if (inj_type == SW_INJ) { - mce_inject_log(&i_mce); - return; - } - - /* prep MCE global settings for the injection */ - mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV; - - if (!(i_mce.status & MCI_STATUS_PCC)) - mcg_status |= MCG_STATUS_RIPV; - - /* - * Ensure necessary status bits for deferred errors: - * - MCx_STATUS[Deferred]: make sure it is a deferred error - * - MCx_STATUS[UC] cleared: deferred errors are _not_ UC - */ - if (inj_type == DFR_INT_INJ) { - i_mce.status |= MCI_STATUS_DEFERRED; - i_mce.status |= (i_mce.status & ~MCI_STATUS_UC); - } - - /* - * For multi node CPUs, logging and reporting of bank 4 errors happens - * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for - * Fam10h and later BKDGs. - */ - if (static_cpu_has(X86_FEATURE_AMD_DCM) && - b == 4 && - boot_cpu_data.x86 < 0x17) { - toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu)); - cpu = get_nbc_for_node(amd_get_nb_id(cpu)); - } - - get_online_cpus(); - if (!cpu_online(cpu)) - goto err; - - toggle_hw_mce_inject(cpu, true); - - i_mce.mcgstatus = mcg_status; - i_mce.inject_flags = inj_type; - smp_call_function_single(cpu, prepare_msrs, &i_mce, 0); - - toggle_hw_mce_inject(cpu, false); - - switch (inj_type) { - case DFR_INT_INJ: - smp_call_function_single(cpu, trigger_dfr_int, NULL, 0); - break; - case THR_INT_INJ: - smp_call_function_single(cpu, trigger_thr_int, NULL, 0); - break; - default: - smp_call_function_single(cpu, trigger_mce, NULL, 0); - } - -err: - put_online_cpus(); - -} - -/* - * This denotes into which bank we're injecting and triggers - * the injection, at the same time. - */ -static int inj_bank_set(void *data, u64 val) -{ - struct mce *m = (struct mce *)data; - - if (val >= n_banks) { - pr_err("Non-existent MCE bank: %llu\n", val); - return -EINVAL; - } - - m->bank = val; - do_inject(); - - return 0; -} - -MCE_INJECT_GET(bank); - -DEFINE_SIMPLE_ATTRIBUTE(bank_fops, inj_bank_get, inj_bank_set, "%llu\n"); - -static const char readme_msg[] = -"Description of the files and their usages:\n" -"\n" -"Note1: i refers to the bank number below.\n" -"Note2: See respective BKDGs for the exact bit definitions of the files below\n" -"as they mirror the hardware registers.\n" -"\n" -"status:\t Set MCi_STATUS: the bits in that MSR control the error type and\n" -"\t attributes of the error which caused the MCE.\n" -"\n" -"misc:\t Set MCi_MISC: provide auxiliary info about the error. It is mostly\n" -"\t used for error thresholding purposes and its validity is indicated by\n" -"\t MCi_STATUS[MiscV].\n" -"\n" -"synd:\t Set MCi_SYND: provide syndrome info about the error. Only valid on\n" -"\t Scalable MCA systems, and its validity is indicated by MCi_STATUS[SyndV].\n" -"\n" -"addr:\t Error address value to be written to MCi_ADDR. Log address information\n" -"\t associated with the error.\n" -"\n" -"cpu:\t The CPU to inject the error on.\n" -"\n" -"bank:\t Specify the bank you want to inject the error into: the number of\n" -"\t banks in a processor varies and is family/model-specific, therefore, the\n" -"\t supplied value is sanity-checked. Setting the bank value also triggers the\n" -"\t injection.\n" -"\n" -"flags:\t Injection type to be performed. 
Writing to this file will trigger a\n" -"\t real machine check, an APIC interrupt or invoke the error decoder routines\n" -"\t for AMD processors.\n" -"\n" -"\t Allowed error injection types:\n" -"\t - \"sw\": Software error injection. Decode error to a human-readable \n" -"\t format only. Safe to use.\n" -"\t - \"hw\": Hardware error injection. Causes the #MC exception handler to \n" -"\t handle the error. Be warned: might cause system panic if MCi_STATUS[PCC] \n" -"\t is set. Therefore, consider setting (debugfs_mountpoint)/mce/fake_panic \n" -"\t before injecting.\n" -"\t - \"df\": Trigger APIC interrupt for Deferred error. Causes deferred \n" -"\t error APIC interrupt handler to handle the error if the feature is \n" -"\t is present in hardware. \n" -"\t - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n" -"\t APIC interrupt handler to handle the error. \n" -"\n"; - -static ssize_t -inj_readme_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - return simple_read_from_buffer(ubuf, cnt, ppos, - readme_msg, strlen(readme_msg)); -} - -static const struct file_operations readme_fops = { - .read = inj_readme_read, -}; - -static struct dfs_node { - char *name; - struct dentry *d; - const struct file_operations *fops; - umode_t perm; -} dfs_fls[] = { - { .name = "status", .fops = &status_fops, .perm = S_IRUSR | S_IWUSR }, - { .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR }, - { .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR }, - { .name = "synd", .fops = &synd_fops, .perm = S_IRUSR | S_IWUSR }, - { .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR }, - { .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR }, - { .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR }, - { .name = "README", .fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH }, -}; - -static int __init init_mce_inject(void) -{ - unsigned int i; - u64 cap; - - rdmsrl(MSR_IA32_MCG_CAP, cap); - n_banks = cap & MCG_BANKCNT_MASK; - - dfs_inj = debugfs_create_dir("mce-inject", NULL); - if (!dfs_inj) - return -EINVAL; - - for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) { - dfs_fls[i].d = debugfs_create_file(dfs_fls[i].name, - dfs_fls[i].perm, - dfs_inj, - &i_mce, - dfs_fls[i].fops); - - if (!dfs_fls[i].d) - goto err_dfs_add; - } - - return 0; - -err_dfs_add: - while (i-- > 0) - debugfs_remove(dfs_fls[i].d); - - debugfs_remove(dfs_inj); - dfs_inj = NULL; - - return -ENODEV; -} - -static void __exit exit_mce_inject(void) -{ - - debugfs_remove_recursive(dfs_inj); - dfs_inj = NULL; - - memset(&dfs_fls, 0, sizeof(dfs_fls)); -} -module_init(init_mce_inject); -module_exit(exit_mce_inject); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Borislav Petkov <bp@alien8.de>"); -MODULE_AUTHOR("AMD Inc."); -MODULE_DESCRIPTION("MCE injection facility for RAS testing"); diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index a163a90af4aa..ed84d3917a59 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -1,6 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/io.h> #include <linux/slab.h> #include <linux/memblock.h> +#include <linux/mem_encrypt.h> #include <asm/set_memory.h> #include <asm/pgtable.h> @@ -59,6 +61,13 @@ static void __init setup_real_mode(void) base = (unsigned char *)real_mode_header; + /* + * If SME is active, the trampoline area will need to be in + * decrypted memory in order to bring up other processors + * successfully. 
+ */ + set_memory_decrypted((unsigned long)base, size >> PAGE_SHIFT); + memcpy(base, real_mode_blob, size); phys_base = __pa(base); @@ -100,9 +109,13 @@ static void __init setup_real_mode(void) trampoline_cr4_features = &trampoline_header->cr4; *trampoline_cr4_features = mmu_cr4_features; + trampoline_header->flags = 0; + if (sme_active()) + trampoline_header->flags |= TH_FLAGS_SME_ACTIVE; + trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); trampoline_pgd[0] = trampoline_pgd_entry.pgd; - trampoline_pgd[511] = init_level4_pgt[511].pgd; + trampoline_pgd[511] = init_top_pgt[511].pgd; #endif } diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S index a28221d94e69..30b0d30d861a 100644 --- a/arch/x86/realmode/rm/header.S +++ b/arch/x86/realmode/rm/header.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Real-mode blob header; this should match realmode.h and be * readonly; for mutable data instead add pointers into the .data diff --git a/arch/x86/realmode/rm/realmode.h b/arch/x86/realmode/rm/realmode.h index d74cff6350ed..c76041a35397 100644 --- a/arch/x86/realmode/rm/realmode.h +++ b/arch/x86/realmode/rm/realmode.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef ARCH_X86_REALMODE_RM_REALMODE_H #define ARCH_X86_REALMODE_RM_REALMODE_H diff --git a/arch/x86/realmode/rm/realmode.lds.S b/arch/x86/realmode/rm/realmode.lds.S index 86b2e8d6b1f1..df8e11e26bc3 100644 --- a/arch/x86/realmode/rm/realmode.lds.S +++ b/arch/x86/realmode/rm/realmode.lds.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * realmode.lds.S * diff --git a/arch/x86/realmode/rm/reboot.S b/arch/x86/realmode/rm/reboot.S index d66c607bdc58..cd2f97b9623b 100644 --- a/arch/x86/realmode/rm/reboot.S +++ b/arch/x86/realmode/rm/reboot.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <linux/linkage.h> #include <asm/segment.h> #include <asm/page_types.h> diff --git a/arch/x86/realmode/rm/stack.S b/arch/x86/realmode/rm/stack.S index 867ae87adfae..8d4cb64799ea 100644 --- a/arch/x86/realmode/rm/stack.S +++ b/arch/x86/realmode/rm/stack.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Common heap and stack allocations */ diff --git a/arch/x86/realmode/rm/trampoline_32.S b/arch/x86/realmode/rm/trampoline_32.S index 48ddd76bc4c3..2dd866c9e21e 100644 --- a/arch/x86/realmode/rm/trampoline_32.S +++ b/arch/x86/realmode/rm/trampoline_32.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * * Trampoline.S Derived from Setup.S by Linus Torvalds diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index dac7b20d2f9d..de53bd15df5a 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * * Trampoline.S Derived from Setup.S by Linus Torvalds @@ -30,6 +31,7 @@ #include <asm/msr.h> #include <asm/segment.h> #include <asm/processor-flags.h> +#include <asm/realmode.h> #include "realmode.h" .text @@ -92,6 +94,28 @@ ENTRY(startup_32) movl %edx, %fs movl %edx, %gs + /* + * Check for memory encryption support. This is a safety net in + * case BIOS hasn't done the necessary step of setting the bit in + * the MSR for this AP. If SME is active and we've gotten this far + * then it is safe for us to set the MSR bit and continue. If we + * don't we'll eventually crash trying to execute encrypted + * instructions. 
+ */ + bt $TH_FLAGS_SME_ACTIVE_BIT, pa_tr_flags + jnc .Ldone + movl $MSR_K8_SYSCFG, %ecx + rdmsr + bts $MSR_K8_SYSCFG_MEM_ENCRYPT_BIT, %eax + jc .Ldone + + /* + * Memory encryption is enabled but the SME enable bit for this + * CPU has not been set. It is safe to set it, so do so. + */ + wrmsr +.Ldone: + movl pa_tr_cr4, %eax movl %eax, %cr4 # Enable PAE mode @@ -147,6 +171,7 @@ GLOBAL(trampoline_header) tr_start: .space 8 GLOBAL(tr_efer) .space 8 GLOBAL(tr_cr4) .space 4 + GLOBAL(tr_flags) .space 4 END(trampoline_header) #include "trampoline_common.S" diff --git a/arch/x86/realmode/rm/trampoline_common.S b/arch/x86/realmode/rm/trampoline_common.S index b1ecdb9692ad..7c706772ab59 100644 --- a/arch/x86/realmode/rm/trampoline_common.S +++ b/arch/x86/realmode/rm/trampoline_common.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ .section ".rodata","a" .balign 16 tr_idt: .fill 1, 6, 0 diff --git a/arch/x86/realmode/rm/wakemain.c b/arch/x86/realmode/rm/wakemain.c index 91405d515ec6..1d6437e6d2ba 100644 --- a/arch/x86/realmode/rm/wakemain.c +++ b/arch/x86/realmode/rm/wakemain.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "wakeup.h" #include "boot.h" diff --git a/arch/x86/realmode/rm/wakeup.h b/arch/x86/realmode/rm/wakeup.h index 7dd86a419f5d..0e4fd08ae447 100644 --- a/arch/x86/realmode/rm/wakeup.h +++ b/arch/x86/realmode/rm/wakeup.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Definitions for the wakeup data structure at the head of the * wakeup code. diff --git a/arch/x86/realmode/rm/wakeup_asm.S b/arch/x86/realmode/rm/wakeup_asm.S index 9e7e14797a72..05ac9c17c811 100644 --- a/arch/x86/realmode/rm/wakeup_asm.S +++ b/arch/x86/realmode/rm/wakeup_asm.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * ACPI wakeup real mode startup stub */ diff --git a/arch/x86/realmode/rmpiggy.S b/arch/x86/realmode/rmpiggy.S index 204c6ece0e97..c078dba40cef 100644 --- a/arch/x86/realmode/rmpiggy.S +++ b/arch/x86/realmode/rmpiggy.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Wrapper script for the realmode binary as a transport object * before copying to low memory. diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile index 604a37efd4d5..972b8e8d939c 100644 --- a/arch/x86/tools/Makefile +++ b/arch/x86/tools/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 PHONY += posttest ifeq ($(KBUILD_VERBOSE),1) diff --git a/arch/x86/tools/distill.awk b/arch/x86/tools/distill.awk index c13c0ee48ab4..e0edeccc1429 100644 --- a/arch/x86/tools/distill.awk +++ b/arch/x86/tools/distill.awk @@ -1,4 +1,5 @@ #!/bin/awk -f +# SPDX-License-Identifier: GPL-2.0 # Usage: objdump -d a.out | awk -f distill.awk | ./test_get_len # Distills the disassembly as follows: # - Removes all lines except the disassembled instructions.
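The trampoline_64.S hunk above tests the SME flag passed in the trampoline header and only sets the memory-encryption enable bit in SYSCFG if the BIOS has not already done so. A C sketch of that decision logic, using a plain variable in place of the rdmsr/wrmsr pair; the flag and bit positions are illustrative stand-ins for the real TH_FLAGS_SME_ACTIVE and MSR_K8_SYSCFG definitions:

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins; the real values come from asm/msr-index.h and the trampoline header. */
#define TH_FLAGS_SME_ACTIVE	(1u << 0)
#define SYSCFG_MEM_ENCRYPT	(1ull << 23)

static uint64_t fake_syscfg_msr;	/* stands in for rdmsr/wrmsr on MSR_K8_SYSCFG */

static void ap_sme_fixup(uint32_t tr_flags)
{
	if (!(tr_flags & TH_FLAGS_SME_ACTIVE))
		return;				/* SME not active: nothing to do */
	if (fake_syscfg_msr & SYSCFG_MEM_ENCRYPT)
		return;				/* BIOS already set the bit (the jc .Ldone case) */
	fake_syscfg_msr |= SYSCFG_MEM_ENCRYPT;	/* safe to enable it ourselves */
}

int main(void)
{
	ap_sme_fixup(TH_FLAGS_SME_ACTIVE);
	printf("SYSCFG after fixup: %#llx\n", (unsigned long long)fake_syscfg_msr);
	return 0;
}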
diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk index a3d2c62fd805..b02a36b2c14f 100644 --- a/arch/x86/tools/gen-insn-attr-x86.awk +++ b/arch/x86/tools/gen-insn-attr-x86.awk @@ -1,4 +1,5 @@ #!/bin/awk -f +# SPDX-License-Identifier: GPL-2.0 # gen-insn-attr-x86.awk: Instruction attribute table generator # Written by Masami Hiramatsu <mhiramat@redhat.com> # diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 73eb7fd4aec4..5d73c443e778 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* This is included from relocs_32/64.c */ #define ElfW(type) _ElfW(ELF_BITS, type) diff --git a/arch/x86/tools/relocs.h b/arch/x86/tools/relocs.h index 1d23bf953a4a..43c83c0fd22c 100644 --- a/arch/x86/tools/relocs.h +++ b/arch/x86/tools/relocs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef RELOCS_H #define RELOCS_H diff --git a/arch/x86/tools/relocs_32.c b/arch/x86/tools/relocs_32.c index b2ade2bb4162..9442ff78be83 100644 --- a/arch/x86/tools/relocs_32.c +++ b/arch/x86/tools/relocs_32.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "relocs.h" #define ELF_BITS 32 diff --git a/arch/x86/tools/relocs_64.c b/arch/x86/tools/relocs_64.c index 56b61b743c4c..9029cb619cb1 100644 --- a/arch/x86/tools/relocs_64.c +++ b/arch/x86/tools/relocs_64.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "relocs.h" #define ELF_BITS 64 diff --git a/arch/x86/tools/relocs_common.c b/arch/x86/tools/relocs_common.c index acab636bcb34..6634352a20bc 100644 --- a/arch/x86/tools/relocs_common.c +++ b/arch/x86/tools/relocs_common.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "relocs.h" void die(char *fmt, ...) diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig index ed56a1c4ae73..13ed827c7c66 100644 --- a/arch/x86/um/Kconfig +++ b/arch/x86/um/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 mainmenu "User Mode Linux/$SUBARCH $KERNELVERSION Kernel Configuration" source "arch/um/Kconfig.common" diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index 46cbbfe03285..c2d3d7c51e9e 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) # diff --git a/arch/x86/um/asm/arch_hweight.h b/arch/x86/um/asm/arch_hweight.h index c656cf443f4a..0d2d5fbb3a39 100644 --- a/arch/x86/um/asm/arch_hweight.h +++ b/arch/x86/um/asm/arch_hweight.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_UM_HWEIGHT_H #define _ASM_UM_HWEIGHT_H diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h index 00c319048d52..b7d73400ea29 100644 --- a/arch/x86/um/asm/barrier.h +++ b/arch/x86/um/asm/barrier.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_UM_BARRIER_H_ #define _ASM_UM_BARRIER_H_ diff --git a/arch/x86/um/asm/checksum.h b/arch/x86/um/asm/checksum.h index 54d96f1e3594..2a56cac64687 100644 --- a/arch/x86/um/asm/checksum.h +++ b/arch/x86/um/asm/checksum.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __UM_CHECKSUM_H #define __UM_CHECKSUM_H diff --git a/arch/x86/um/asm/desc.h b/arch/x86/um/asm/desc.h index 4ec34a51b62c..34de4e93d945 100644 --- a/arch/x86/um/asm/desc.h +++ b/arch/x86/um/asm/desc.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __UM_DESC_H #define __UM_DESC_H diff --git a/arch/x86/um/asm/module.h b/arch/x86/um/asm/module.h index 
61af80e932eb..a3b061d66082 100644 --- a/arch/x86/um/asm/module.h +++ b/arch/x86/um/asm/module.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __UM_MODULE_H #define __UM_MODULE_H diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h index c77db2288982..593d5f3902bd 100644 --- a/arch/x86/um/asm/processor.h +++ b/arch/x86/um/asm/processor.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __UM_PROCESSOR_H #define __UM_PROCESSOR_H diff --git a/arch/x86/um/asm/ptrace.h b/arch/x86/um/asm/ptrace.h index b291ca5cf66b..83822fd42204 100644 --- a/arch/x86/um/asm/ptrace.h +++ b/arch/x86/um/asm/ptrace.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __UM_X86_PTRACE_H #define __UM_X86_PTRACE_H diff --git a/arch/x86/um/asm/segment.h b/arch/x86/um/asm/segment.h index 41dd5e1f3cd7..453db377150d 100644 --- a/arch/x86/um/asm/segment.h +++ b/arch/x86/um/asm/segment.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __UM_SEGMENT_H #define __UM_SEGMENT_H diff --git a/arch/x86/um/asm/syscall.h b/arch/x86/um/asm/syscall.h index 11ab90dc5f14..ef898af102d1 100644 --- a/arch/x86/um/asm/syscall.h +++ b/arch/x86/um/asm/syscall.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __UM_ASM_SYSCALL_H #define __UM_ASM_SYSCALL_H diff --git a/arch/x86/um/elfcore.c b/arch/x86/um/elfcore.c index 7bb89a27a5e4..48a3eb09d951 100644 --- a/arch/x86/um/elfcore.c +++ b/arch/x86/um/elfcore.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/elf.h> #include <linux/coredump.h> #include <linux/fs.h> diff --git a/arch/x86/um/mem_64.c b/arch/x86/um/mem_64.c index 7642e2e2aa61..c027e93d1002 100644 --- a/arch/x86/um/mem_64.c +++ b/arch/x86/um/mem_64.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/mm.h> #include <asm/elf.h> diff --git a/arch/x86/um/os-Linux/mcontext.c b/arch/x86/um/os-Linux/mcontext.c index 1d33d72c6284..49c3744cac37 100644 --- a/arch/x86/um/os-Linux/mcontext.c +++ b/arch/x86/um/os-Linux/mcontext.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <sys/ucontext.h> #define __FRAME_OFFSETS #include <asm/ptrace.h> diff --git a/arch/x86/um/os-Linux/registers.c b/arch/x86/um/os-Linux/registers.c index 00f54a91bb4b..3c423dfcd78b 100644 --- a/arch/x86/um/os-Linux/registers.c +++ b/arch/x86/um/os-Linux/registers.c @@ -5,6 +5,7 @@ */ #include <errno.h> +#include <stdlib.h> #include <sys/ptrace.h> #ifdef __i386__ #include <sys/user.h> @@ -26,17 +27,18 @@ int save_i387_registers(int pid, unsigned long *fp_regs) int save_fp_registers(int pid, unsigned long *fp_regs) { +#ifdef PTRACE_GETREGSET struct iovec iov; if (have_xstate_support) { iov.iov_base = fp_regs; - iov.iov_len = sizeof(struct _xstate); + iov.iov_len = FP_SIZE * sizeof(unsigned long); if (ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov) < 0) return -errno; return 0; - } else { + } else +#endif return save_i387_registers(pid, fp_regs); - } } int restore_i387_registers(int pid, unsigned long *fp_regs) @@ -48,17 +50,17 @@ int restore_i387_registers(int pid, unsigned long *fp_regs) int restore_fp_registers(int pid, unsigned long *fp_regs) { +#ifdef PTRACE_SETREGSET struct iovec iov; - if (have_xstate_support) { iov.iov_base = fp_regs; - iov.iov_len = sizeof(struct _xstate); + iov.iov_len = FP_SIZE * sizeof(unsigned long); if (ptrace(PTRACE_SETREGSET, pid, NT_X86_XSTATE, &iov) < 0) return -errno; return 0; - } else { + } else +#endif return restore_i387_registers(pid, fp_regs); - } } #ifdef __i386__ @@ -122,13 +124,21 @@ int 
put_fp_registers(int pid, unsigned long *regs) void arch_init_registers(int pid) { - struct _xstate fp_regs; +#ifdef PTRACE_GETREGSET + void * fp_regs; struct iovec iov; - iov.iov_base = &fp_regs; - iov.iov_len = sizeof(struct _xstate); + fp_regs = malloc(FP_SIZE * sizeof(unsigned long)); + if(fp_regs == NULL) + return; + + iov.iov_base = fp_regs; + iov.iov_len = FP_SIZE * sizeof(unsigned long); if (ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov) == 0) have_xstate_support = 1; + + free(fp_regs); +#endif } #endif diff --git a/arch/x86/um/os-Linux/task_size.c b/arch/x86/um/os-Linux/task_size.c index 5adb6a2fd117..e62174638f00 100644 --- a/arch/x86/um/os-Linux/task_size.c +++ b/arch/x86/um/os-Linux/task_size.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <stdio.h> #include <stdlib.h> #include <signal.h> diff --git a/arch/x86/um/os-Linux/tls.c b/arch/x86/um/os-Linux/tls.c index 9d94b3b76c74..3e1b1bf6acbc 100644 --- a/arch/x86/um/os-Linux/tls.c +++ b/arch/x86/um/os-Linux/tls.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <errno.h> #include <linux/unistd.h> @@ -37,7 +38,7 @@ void check_host_supports_tls(int *supports_tls, int *tls_min) continue; else if (errno == ENOSYS) *supports_tls = 0; - return; + return; } } diff --git a/arch/x86/um/setjmp_32.S b/arch/x86/um/setjmp_32.S index b766792c9933..62eaf8c80e04 100644 --- a/arch/x86/um/setjmp_32.S +++ b/arch/x86/um/setjmp_32.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ # # arch/i386/setjmp.S # @@ -16,9 +17,9 @@ .text .align 4 - .globl setjmp - .type setjmp, @function -setjmp: + .globl kernel_setjmp + .type kernel_setjmp, @function +kernel_setjmp: #ifdef _REGPARM movl %eax,%edx #else @@ -35,13 +36,13 @@ setjmp: movl %ecx,20(%edx) # Return address ret - .size setjmp,.-setjmp + .size kernel_setjmp,.-kernel_setjmp .text .align 4 - .globl longjmp - .type longjmp, @function -longjmp: + .globl kernel_longjmp + .type kernel_longjmp, @function +kernel_longjmp: #ifdef _REGPARM xchgl %eax,%edx #else @@ -55,4 +56,4 @@ longjmp: movl 16(%edx),%edi jmp *20(%edx) - .size longjmp,.-longjmp + .size kernel_longjmp,.-kernel_longjmp diff --git a/arch/x86/um/setjmp_64.S b/arch/x86/um/setjmp_64.S index 45f547b4043e..1b5d40d4ff46 100644 --- a/arch/x86/um/setjmp_64.S +++ b/arch/x86/um/setjmp_64.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ # # arch/x86_64/setjmp.S # @@ -18,9 +19,9 @@ .text .align 4 - .globl setjmp - .type setjmp, @function -setjmp: + .globl kernel_setjmp + .type kernel_setjmp, @function +kernel_setjmp: pop %rsi # Return address, and adjust the stack xorl %eax,%eax # Return value movq %rbx,(%rdi) @@ -34,13 +35,13 @@ setjmp: movq %rsi,56(%rdi) # Return address ret - .size setjmp,.-setjmp + .size kernel_setjmp,.-kernel_setjmp .text .align 4 - .globl longjmp - .type longjmp, @function -longjmp: + .globl kernel_longjmp + .type kernel_longjmp, @function +kernel_longjmp: movl %esi,%eax # Return value (int) movq (%rdi),%rbx movq 8(%rdi),%rsp @@ -51,4 +52,4 @@ longjmp: movq 48(%rdi),%r15 jmp *56(%rdi) - .size longjmp,.-longjmp + .size kernel_longjmp,.-kernel_longjmp diff --git a/arch/x86/um/shared/sysdep/archsetjmp.h b/arch/x86/um/shared/sysdep/archsetjmp.h index ff7766d28226..166cedbab926 100644 --- a/arch/x86/um/shared/sysdep/archsetjmp.h +++ b/arch/x86/um/shared/sysdep/archsetjmp.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifdef __i386__ #include "archsetjmp_32.h" #else diff --git a/arch/x86/um/shared/sysdep/archsetjmp_32.h b/arch/x86/um/shared/sysdep/archsetjmp_32.h index 
0f312085ce1d..fb08f2576438 100644 --- a/arch/x86/um/shared/sysdep/archsetjmp_32.h +++ b/arch/x86/um/shared/sysdep/archsetjmp_32.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * arch/um/include/sysdep-i386/archsetjmp.h */ diff --git a/arch/x86/um/shared/sysdep/archsetjmp_64.h b/arch/x86/um/shared/sysdep/archsetjmp_64.h index 2af8f12ca161..9b499e457ba0 100644 --- a/arch/x86/um/shared/sysdep/archsetjmp_64.h +++ b/arch/x86/um/shared/sysdep/archsetjmp_64.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * arch/um/include/sysdep-x86_64/archsetjmp.h */ diff --git a/arch/x86/um/shared/sysdep/faultinfo.h b/arch/x86/um/shared/sysdep/faultinfo.h index 862ecb1c7781..4390803e07a1 100644 --- a/arch/x86/um/shared/sysdep/faultinfo.h +++ b/arch/x86/um/shared/sysdep/faultinfo.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifdef __i386__ #include "faultinfo_32.h" #else diff --git a/arch/x86/um/shared/sysdep/kernel-offsets.h b/arch/x86/um/shared/sysdep/kernel-offsets.h index 7e1d35b6ad5c..a004bffb7b8d 100644 --- a/arch/x86/um/shared/sysdep/kernel-offsets.h +++ b/arch/x86/um/shared/sysdep/kernel-offsets.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <linux/stddef.h> #include <linux/sched.h> #include <linux/elf.h> diff --git a/arch/x86/um/shared/sysdep/ptrace.h b/arch/x86/um/shared/sysdep/ptrace.h index eb9356904ad3..6ca4ecabc55b 100644 --- a/arch/x86/um/shared/sysdep/ptrace.h +++ b/arch/x86/um/shared/sysdep/ptrace.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __SYSDEP_X86_PTRACE_H #define __SYSDEP_X86_PTRACE_H diff --git a/arch/x86/um/shared/sysdep/ptrace_user.h b/arch/x86/um/shared/sysdep/ptrace_user.h index 16cd6b5e71f7..44782bbad41e 100644 --- a/arch/x86/um/shared/sysdep/ptrace_user.h +++ b/arch/x86/um/shared/sysdep/ptrace_user.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <generated/user_constants.h> #define PT_OFFSET(r) ((r) * sizeof(long)) diff --git a/arch/x86/um/shared/sysdep/stub.h b/arch/x86/um/shared/sysdep/stub.h index 3f55e5bd3cec..ce0ca46ad383 100644 --- a/arch/x86/um/shared/sysdep/stub.h +++ b/arch/x86/um/shared/sysdep/stub.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <asm/unistd.h> #include <sys/mman.h> #include <signal.h> diff --git a/arch/x86/um/shared/sysdep/syscalls.h b/arch/x86/um/shared/sysdep/syscalls.h index bd9a89b67e41..b2060ac707f0 100644 --- a/arch/x86/um/shared/sysdep/syscalls.h +++ b/arch/x86/um/shared/sysdep/syscalls.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifdef __i386__ #include "syscalls_32.h" #else diff --git a/arch/x86/um/shared/sysdep/tls.h b/arch/x86/um/shared/sysdep/tls.h index a682db13df23..b968016aa379 100644 --- a/arch/x86/um/shared/sysdep/tls.h +++ b/arch/x86/um/shared/sysdep/tls.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _SYSDEP_TLS_H #define _SYSDEP_TLS_H diff --git a/arch/x86/um/stub_32.S b/arch/x86/um/stub_32.S index 98816804e131..a193e88536a9 100644 --- a/arch/x86/um/stub_32.S +++ b/arch/x86/um/stub_32.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <as-layout.h> .section .__syscall_stub, "ax" diff --git a/arch/x86/um/stub_64.S b/arch/x86/um/stub_64.S index ba914b3b8cc4..8a95c5b2eaf9 100644 --- a/arch/x86/um/stub_64.S +++ b/arch/x86/um/stub_64.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <as-layout.h> .section .__syscall_stub, "ax" diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index bfce503dffae..9649b5ad2ca2 100644 --- 
a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * System call table for UML/i386, copied from arch/x86/kernel/syscall_*.c * with some changes for UML. diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index f306413d3eb6..c8bc7fb8cbd6 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * System call table for UML/x86-64, copied from arch/x86/kernel/syscall_*.c * with some changes for UML. diff --git a/arch/x86/um/syscalls_32.c b/arch/x86/um/syscalls_32.c index 627d68836b16..5c65254c8f35 100644 --- a/arch/x86/um/syscalls_32.c +++ b/arch/x86/um/syscalls_32.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/syscalls.h> #include <os.h> diff --git a/arch/x86/um/tls_64.c b/arch/x86/um/tls_64.c index 3ad714373d7f..3a621e0d3925 100644 --- a/arch/x86/um/tls_64.c +++ b/arch/x86/um/tls_64.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/sched.h> #include <asm/ptrace-abi.h> diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c index cb3c22370cf5..5b37b7f952dd 100644 --- a/arch/x86/um/user-offsets.c +++ b/arch/x86/um/user-offsets.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <stdio.h> #include <stddef.h> #include <signal.h> @@ -5,7 +6,7 @@ #include <sys/mman.h> #include <sys/user.h> #define __FRAME_OFFSETS -#include <asm/ptrace.h> +#include <linux/ptrace.h> #include <asm/types.h> #ifdef __i386__ @@ -50,7 +51,11 @@ void foo(void) DEFINE(HOST_GS, GS); DEFINE(HOST_ORIG_AX, ORIG_EAX); #else - DEFINE(HOST_FP_SIZE, sizeof(struct _xstate) / sizeof(unsigned long)); +#ifdef FP_XSTATE_MAGIC1 + DEFINE_LONGS(HOST_FP_SIZE, 2696); +#else + DEFINE(HOST_FP_SIZE, sizeof(struct _fpstate) / sizeof(unsigned long)); +#endif DEFINE_LONGS(HOST_BX, RBX); DEFINE_LONGS(HOST_CX, RCX); DEFINE_LONGS(HOST_DI, RDI); diff --git a/arch/x86/um/vdso/Makefile b/arch/x86/um/vdso/Makefile index d72dec406ccb..10003359e633 100644 --- a/arch/x86/um/vdso/Makefile +++ b/arch/x86/um/vdso/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Building vDSO images for x86. # @@ -53,7 +54,7 @@ CFLAGS_REMOVE_vdso-note.o = -pg -fprofile-arcs -ftest-coverage CFLAGS_REMOVE_um_vdso.o = -pg -fprofile-arcs -ftest-coverage targets += vdso-syms.lds -obj-$(VDSO64-y) += vdso-syms.lds +extra-$(VDSO64-y) += vdso-syms.lds # # Match symbols in the DSO that look like VDSO*; produce a file of constants. diff --git a/arch/x86/um/vdso/checkundef.sh b/arch/x86/um/vdso/checkundef.sh index 7ee90a9b549d..8e3ea6bb956f 100644 --- a/arch/x86/um/vdso/checkundef.sh +++ b/arch/x86/um/vdso/checkundef.sh @@ -1,4 +1,5 @@ #!/bin/sh +# SPDX-License-Identifier: GPL-2.0 nm="$1" file="$2" $nm "$file" | grep '^ *U' > /dev/null 2>&1 diff --git a/arch/x86/um/vdso/vdso-layout.lds.S b/arch/x86/um/vdso/vdso-layout.lds.S index 634a2cf62046..439b790df18f 100644 --- a/arch/x86/um/vdso/vdso-layout.lds.S +++ b/arch/x86/um/vdso/vdso-layout.lds.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Linker script for vDSO. This is an ELF shared object prelinked to * its virtual address, and with only one read-only segment. 
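The um/os-Linux/registers.c and um/user-offsets.c hunks above size the XSTATE regset buffer as FP_SIZE longs (derived from the generated HOST_FP_SIZE constant) instead of sizeof(struct _xstate). A stand-alone sketch of that buffer and iovec setup, with FP_SIZE as an assumed placeholder value:

#include <stdio.h>
#include <stdlib.h>
#include <sys/uio.h>

#define FP_SIZE 337	/* placeholder; the real value is generated into user_constants.h */

int main(void)
{
	unsigned long *fp_regs = malloc(FP_SIZE * sizeof(unsigned long));
	struct iovec iov;

	if (fp_regs == NULL)
		return 1;

	iov.iov_base = fp_regs;
	iov.iov_len = FP_SIZE * sizeof(unsigned long);
	/* iov would now be handed to ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov) */
	printf("regset buffer: %zu bytes\n", iov.iov_len);

	free(fp_regs);
	return 0;
}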
diff --git a/arch/x86/um/vdso/vdso.S b/arch/x86/um/vdso/vdso.S index 1cb468adacbb..a4a3870dc059 100644 --- a/arch/x86/um/vdso/vdso.S +++ b/arch/x86/um/vdso/vdso.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <linux/init.h> __INITDATA diff --git a/arch/x86/um/vdso/vdso.lds.S b/arch/x86/um/vdso/vdso.lds.S index b96b2677cad8..73c508587a98 100644 --- a/arch/x86/um/vdso/vdso.lds.S +++ b/arch/x86/um/vdso/vdso.lds.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Linker script for 64-bit vDSO. * We #include the file to define the layout details. diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 027987638e98..f605825a04ab 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # This Kconfig describes xen options # @@ -17,6 +18,9 @@ config XEN_PV bool "Xen PV guest support" default y depends on XEN + # XEN_PV is not ready to work with 5-level paging. + # Changes to hypervisor are also required. + depends on !X86_5LEVEL select XEN_HAVE_PVMMU select XEN_HAVE_VPMU help @@ -75,4 +79,6 @@ config XEN_DEBUG_FS config XEN_PVH bool "Support for running as a PVH guest" depends on XEN && XEN_PVHVM && ACPI + # Pre-built page tables are not ready to handle 5-level paging. + depends on !X86_5LEVEL def_bool n diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index fffb0a16f9e3..d83cb5478f54 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -1,3 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0 +OBJECT_FILES_NON_STANDARD_xen-asm_$(BITS).o := y +OBJECT_FILES_NON_STANDARD_xen-pvh.o := y + ifdef CONFIG_FUNCTION_TRACER # Do not profile debug and lowlevel utilities CFLAGS_REMOVE_spinlock.o = -pg diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index bcea81f36fc5..30434b8708f2 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/init.h> #include <asm/x86_init.h> @@ -178,7 +179,7 @@ static struct apic xen_pv_apic = { .get_apic_id = xen_get_apic_id, .set_apic_id = xen_set_apic_id, /* Can be NULL on 32-bit. */ - .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, #ifdef CONFIG_SMP .send_IPI_mask = xen_send_IPI_mask, diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c index 1daff5545c0a..13da87918b4f 100644 --- a/arch/x86/xen/debugfs.c +++ b/arch/x86/xen/debugfs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/init.h> #include <linux/debugfs.h> #include <linux/slab.h> diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h index 12ebf3325c7b..6b813ad1091c 100644 --- a/arch/x86/xen/debugfs.h +++ b/arch/x86/xen/debugfs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _XEN_DEBUGFS_H #define _XEN_DEBUGFS_H diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c index 30bb2e80cfe7..a18703be9ead 100644 --- a/arch/x86/xen/efi.c +++ b/arch/x86/xen/efi.c @@ -54,38 +54,6 @@ static efi_system_table_t efi_systab_xen __initdata = { .tables = EFI_INVALID_TABLE_ADDR /* Initialized later. */ }; -static const struct efi efi_xen __initconst = { - .systab = NULL, /* Initialized later. */ - .runtime_version = 0, /* Initialized later. 
*/ - .mps = EFI_INVALID_TABLE_ADDR, - .acpi = EFI_INVALID_TABLE_ADDR, - .acpi20 = EFI_INVALID_TABLE_ADDR, - .smbios = EFI_INVALID_TABLE_ADDR, - .smbios3 = EFI_INVALID_TABLE_ADDR, - .sal_systab = EFI_INVALID_TABLE_ADDR, - .boot_info = EFI_INVALID_TABLE_ADDR, - .hcdp = EFI_INVALID_TABLE_ADDR, - .uga = EFI_INVALID_TABLE_ADDR, - .uv_systab = EFI_INVALID_TABLE_ADDR, - .fw_vendor = EFI_INVALID_TABLE_ADDR, - .runtime = EFI_INVALID_TABLE_ADDR, - .config_table = EFI_INVALID_TABLE_ADDR, - .get_time = xen_efi_get_time, - .set_time = xen_efi_set_time, - .get_wakeup_time = xen_efi_get_wakeup_time, - .set_wakeup_time = xen_efi_set_wakeup_time, - .get_variable = xen_efi_get_variable, - .get_next_variable = xen_efi_get_next_variable, - .set_variable = xen_efi_set_variable, - .query_variable_info = xen_efi_query_variable_info, - .update_capsule = xen_efi_update_capsule, - .query_capsule_caps = xen_efi_query_capsule_caps, - .get_next_high_mono_count = xen_efi_get_next_high_mono_count, - .reset_system = xen_efi_reset_system, - .set_virtual_address_map = NULL, /* Not used under Xen. */ - .flags = 0 /* Initialized later. */ -}; - static efi_system_table_t __init *xen_efi_probe(void) { struct xen_platform_op op = { @@ -102,7 +70,18 @@ static efi_system_table_t __init *xen_efi_probe(void) /* Here we know that Xen runs on EFI platform. */ - efi = efi_xen; + efi.get_time = xen_efi_get_time; + efi.set_time = xen_efi_set_time; + efi.get_wakeup_time = xen_efi_get_wakeup_time; + efi.set_wakeup_time = xen_efi_set_wakeup_time; + efi.get_variable = xen_efi_get_variable; + efi.get_next_variable = xen_efi_get_next_variable; + efi.set_variable = xen_efi_set_variable; + efi.query_variable_info = xen_efi_query_variable_info; + efi.update_capsule = xen_efi_update_capsule; + efi.query_capsule_caps = xen_efi_query_capsule_caps; + efi.get_next_high_mono_count = xen_efi_get_next_high_mono_count; + efi.reset_system = xen_efi_reset_system; efi_systab_xen.tables = info->cfg.addr; efi_systab_xen.nr_tables = info->cfg.nent; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index a5ffcbb20cc0..d669e9d89001 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -93,11 +93,11 @@ int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int), int rc; rc = cpuhp_setup_state_nocalls(CPUHP_XEN_PREPARE, - "x86/xen/hvm_guest:prepare", + "x86/xen/guest:prepare", cpu_up_prepare_cb, cpu_dead_cb); if (rc >= 0) { rc = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, - "x86/xen/hvm_guest:online", + "x86/xen/guest:online", xen_cpu_up_online, NULL); if (rc < 0) cpuhp_remove_state_nocalls(CPUHP_XEN_PREPARE); @@ -106,15 +106,83 @@ int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int), return rc >= 0 ? 0 : rc; } -static void clamp_max_cpus(void) +static int xen_vcpu_setup_restore(int cpu) { -#ifdef CONFIG_SMP - if (setup_max_cpus > MAX_VIRT_CPUS) - setup_max_cpus = MAX_VIRT_CPUS; -#endif + int rc = 0; + + /* Any per_cpu(xen_vcpu) is stale, so reset it */ + xen_vcpu_info_reset(cpu); + + /* + * For PVH and PVHVM, setup online VCPUs only. The rest will + * be handled by hotplug. + */ + if (xen_pv_domain() || + (xen_hvm_domain() && cpu_online(cpu))) { + rc = xen_vcpu_setup(cpu); + } + + return rc; +} + +/* + * On restore, set the vcpu placement up again. + * If it fails, then we're in a bad state, since + * we can't back out from using it... 
+ */ +void xen_vcpu_restore(void) +{ + int cpu, rc; + + for_each_possible_cpu(cpu) { + bool other_cpu = (cpu != smp_processor_id()); + bool is_up; + + if (xen_vcpu_nr(cpu) == XEN_VCPU_ID_INVALID) + continue; + + /* Only Xen 4.5 and higher support this. */ + is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up, + xen_vcpu_nr(cpu), NULL) > 0; + + if (other_cpu && is_up && + HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL)) + BUG(); + + if (xen_pv_domain() || xen_feature(XENFEAT_hvm_safe_pvclock)) + xen_setup_runstate_info(cpu); + + rc = xen_vcpu_setup_restore(cpu); + if (rc) + pr_emerg_once("vcpu restore failed for cpu=%d err=%d. " + "System will hang.\n", cpu, rc); + /* + * In case xen_vcpu_setup_restore() fails, do not bring up the + * VCPU. This helps us avoid the resulting OOPS when the VCPU + * accesses pvclock_vcpu_time via xen_vcpu (which is NULL.) + * Note that this does not improve the situation much -- now the + * VM hangs instead of OOPSing -- with the VCPUs that did not + * fail, spinning in stop_machine(), waiting for the failed + * VCPUs to come up. + */ + if (other_cpu && is_up && (rc == 0) && + HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL)) + BUG(); + } } -void xen_vcpu_setup(int cpu) +void xen_vcpu_info_reset(int cpu) +{ + if (xen_vcpu_nr(cpu) < MAX_VIRT_CPUS) { + per_cpu(xen_vcpu, cpu) = + &HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)]; + } else { + /* Set to NULL so that if somebody accesses it we get an OOPS */ + per_cpu(xen_vcpu, cpu) = NULL; + } +} + +int xen_vcpu_setup(int cpu) { struct vcpu_register_vcpu_info info; int err; @@ -123,11 +191,11 @@ void xen_vcpu_setup(int cpu) BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); /* - * This path is called twice on PVHVM - first during bootup via - * smp_init -> xen_hvm_cpu_notify, and then if the VCPU is being - * hotplugged: cpu_up -> xen_hvm_cpu_notify. - * As we can only do the VCPUOP_register_vcpu_info once lets - * not over-write its result. + * This path is called on PVHVM at bootup (xen_hvm_smp_prepare_boot_cpu) + * and at restore (xen_vcpu_restore). Also called for hotplugged + * VCPUs (cpu_init -> xen_hvm_cpu_prepare_hvm). + * However, the hypercall can only be done once (see below) so if a VCPU + * is offlined and comes back online then let's not redo the hypercall. * * For PV it is called during restore (xen_vcpu_restore) and bootup * (xen_setup_vcpu_info_placement). The hotplug mechanism does not @@ -135,42 +203,44 @@ void xen_vcpu_setup(int cpu) */ if (xen_hvm_domain()) { if (per_cpu(xen_vcpu, cpu) == &per_cpu(xen_vcpu_info, cpu)) - return; + return 0; } - if (xen_vcpu_nr(cpu) < MAX_VIRT_CPUS) - per_cpu(xen_vcpu, cpu) = - &HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)]; - if (!xen_have_vcpu_info_placement) { - if (cpu >= MAX_VIRT_CPUS) - clamp_max_cpus(); - return; + if (xen_have_vcpu_info_placement) { + vcpup = &per_cpu(xen_vcpu_info, cpu); + info.mfn = arbitrary_virt_to_mfn(vcpup); + info.offset = offset_in_page(vcpup); + + /* + * Check to see if the hypervisor will put the vcpu_info + * structure where we want it, which allows direct access via + * a percpu-variable. + * N.B. This hypercall can _only_ be called once per CPU. + * Subsequent calls will error out with -EINVAL. This is due to + * the fact that hypervisor has no unregister variant and this + * hypercall does not allow to over-write info.mfn and + * info.offset. 
+ */ + err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, + xen_vcpu_nr(cpu), &info); + + if (err) { + pr_warn_once("register_vcpu_info failed: cpu=%d err=%d\n", + cpu, err); + xen_have_vcpu_info_placement = 0; + } else { + /* + * This cpu is using the registered vcpu info, even if + * later ones fail to. + */ + per_cpu(xen_vcpu, cpu) = vcpup; + } } - vcpup = &per_cpu(xen_vcpu_info, cpu); - info.mfn = arbitrary_virt_to_mfn(vcpup); - info.offset = offset_in_page(vcpup); - - /* Check to see if the hypervisor will put the vcpu_info - structure where we want it, which allows direct access via - a percpu-variable. - N.B. This hypercall can _only_ be called once per CPU. Subsequent - calls will error out with -EINVAL. This is due to the fact that - hypervisor has no unregister variant and this hypercall does not - allow to over-write info.mfn and info.offset. - */ - err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, xen_vcpu_nr(cpu), - &info); + if (!xen_have_vcpu_info_placement) + xen_vcpu_info_reset(cpu); - if (err) { - printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err); - xen_have_vcpu_info_placement = 0; - clamp_max_cpus(); - } else { - /* This cpu is using the registered vcpu info, even if - later ones fail to. */ - per_cpu(xen_vcpu, cpu) = vcpup; - } + return ((per_cpu(xen_vcpu, cpu) == NULL) ? -ENODEV : 0); } void xen_reboot(int reason) diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index a6d014f47e52..de503c225ae1 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -1,5 +1,6 @@ #include <linux/cpu.h> #include <linux/kexec.h> +#include <linux/memblock.h> #include <xen/features.h> #include <xen/events.h> @@ -10,47 +11,59 @@ #include <asm/reboot.h> #include <asm/setup.h> #include <asm/hypervisor.h> +#include <asm/e820/api.h> +#include <asm/early_ioremap.h> #include <asm/xen/cpuid.h> #include <asm/xen/hypervisor.h> +#include <asm/xen/page.h> #include "xen-ops.h" #include "mmu.h" #include "smp.h" -void __ref xen_hvm_init_shared_info(void) +static unsigned long shared_info_pfn; + +void xen_hvm_init_shared_info(void) { - int cpu; struct xen_add_to_physmap xatp; - static struct shared_info *shared_info_page; - if (!shared_info_page) - shared_info_page = (struct shared_info *) - extend_brk(PAGE_SIZE, PAGE_SIZE); xatp.domid = DOMID_SELF; xatp.idx = 0; xatp.space = XENMAPSPACE_shared_info; - xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT; + xatp.gpfn = shared_info_pfn; if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) BUG(); +} - HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; +static void __init reserve_shared_info(void) +{ + u64 pa; - /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info - * page, we use it in the event channel upcall and in some pvclock - * related functions. We don't need the vcpu_info placement - * optimizations because we don't use any pv_mmu or pv_irq op on - * HVM. - * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is - * online but xen_hvm_init_shared_info is run at resume time too and - * in that case multiple vcpus might be online. */ - for_each_online_cpu(cpu) { - /* Leave it to be NULL. */ - if (xen_vcpu_nr(cpu) >= MAX_VIRT_CPUS) - continue; - per_cpu(xen_vcpu, cpu) = - &HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)]; - } + /* + * Search for a free page starting at 4kB physical address. + * Low memory is preferred to avoid an EPT large page split up + * by the mapping. 
+ * Starting below X86_RESERVE_LOW (usually 64kB) is fine as + * the BIOS used for HVM guests is well behaved and won't + * clobber memory other than the first 4kB. + */ + for (pa = PAGE_SIZE; + !e820__mapped_all(pa, pa + PAGE_SIZE, E820_TYPE_RAM) || + memblock_is_reserved(pa); + pa += PAGE_SIZE) + ; + + shared_info_pfn = PHYS_PFN(pa); + + memblock_reserve(pa, PAGE_SIZE); + HYPERVISOR_shared_info = early_memremap(pa, PAGE_SIZE); +} + +static void __init xen_hvm_init_mem_mapping(void) +{ + early_memunmap(HYPERVISOR_shared_info, PAGE_SIZE); + HYPERVISOR_shared_info = __va(PFN_PHYS(shared_info_pfn)); } static void __init init_hvm_pv_info(void) @@ -106,7 +119,7 @@ static void xen_hvm_crash_shutdown(struct pt_regs *regs) static int xen_cpu_up_prepare_hvm(unsigned int cpu) { - int rc; + int rc = 0; /* * This can happen if CPU was offlined earlier and @@ -121,7 +134,9 @@ static int xen_cpu_up_prepare_hvm(unsigned int cpu) per_cpu(xen_vcpu_id, cpu) = cpu_acpi_id(cpu); else per_cpu(xen_vcpu_id, cpu) = cpu; - xen_vcpu_setup(cpu); + rc = xen_vcpu_setup(cpu); + if (rc) + return rc; if (xen_have_vector_callback && xen_feature(XENFEAT_hvm_safe_pvclock)) xen_setup_timer(cpu); @@ -130,9 +145,8 @@ static int xen_cpu_up_prepare_hvm(unsigned int cpu) if (rc) { WARN(1, "xen_smp_intr_init() for CPU %d failed: %d\n", cpu, rc); - return rc; } - return 0; + return rc; } static int xen_cpu_dead_hvm(unsigned int cpu) @@ -152,8 +166,16 @@ static void __init xen_hvm_guest_init(void) init_hvm_pv_info(); + reserve_shared_info(); xen_hvm_init_shared_info(); + /* + * xen_vcpu is a pointer to the vcpu_info struct in the shared_info + * page, we use it in the event channel upcall and in some pvclock + * related functions. + */ + xen_vcpu_info_reset(0); + xen_panic_handler_init(); if (xen_feature(XENFEAT_hvm_callback_vector)) @@ -210,5 +232,6 @@ const struct hypervisor_x86 x86_hyper_xen_hvm = { .init_platform = xen_hvm_guest_init, .pin_vcpu = xen_pin_vcpu, .x2apic_available = xen_x2apic_para_available, + .init_mem_mapping = xen_hvm_init_mem_mapping, }; EXPORT_SYMBOL(x86_hyper_xen_hvm); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index f33eef4ebd12..d4396e27b1fb 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Core of Xen paravirt_ops implementation. * @@ -89,8 +90,6 @@ void *xen_initial_gdt; -RESERVE_BRK(shared_info_page_brk, PAGE_SIZE); - static int xen_cpu_up_prepare_pv(unsigned int cpu); static int xen_cpu_dead_pv(unsigned int cpu); @@ -107,35 +106,6 @@ struct tls_descs { */ static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc); -/* - * On restore, set the vcpu placement up again. - * If it fails, then we're in a bad state, since - * we can't back out from using it... 
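The enlighten_hvm.c reserve_shared_info() hunk above walks physical memory one page at a time until it finds a page that is both E820 RAM and not already reserved. A stand-alone sketch of that scan pattern, with the two predicates stubbed out (the address thresholds are invented for the demo):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096ull

/* Stub predicates standing in for e820__mapped_all() and memblock_is_reserved(). */
static int is_ram(uint64_t pa)      { return pa >= 0x10000; }	/* pretend RAM starts at 64kB */
static int is_reserved(uint64_t pa) { return pa < 0x11000; }	/* pretend one RAM page is reserved */

int main(void)
{
	uint64_t pa;

	for (pa = PAGE_SIZE; !is_ram(pa) || is_reserved(pa); pa += PAGE_SIZE)
		;	/* keep scanning until both conditions hold */

	printf("first usable page at %#llx (pfn %llu)\n",
	       (unsigned long long)pa, (unsigned long long)(pa / PAGE_SIZE));
	return 0;
}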
- */ -void xen_vcpu_restore(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { - bool other_cpu = (cpu != smp_processor_id()); - bool is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up, xen_vcpu_nr(cpu), - NULL); - - if (other_cpu && is_up && - HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL)) - BUG(); - - xen_setup_runstate_info(cpu); - - if (xen_have_vcpu_info_placement) - xen_vcpu_setup(cpu); - - if (other_cpu && is_up && - HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL)) - BUG(); - } -} - static void __init xen_banner(void) { unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL); @@ -294,6 +264,13 @@ static void __init xen_init_capabilities(void) setup_clear_cpu_cap(X86_FEATURE_MTRR); setup_clear_cpu_cap(X86_FEATURE_ACC); setup_clear_cpu_cap(X86_FEATURE_X2APIC); + setup_clear_cpu_cap(X86_FEATURE_SME); + + /* + * Xen PV would need some work to support PCID: CR3 handling as well + * as xen_flush_tlb_others() would need updating. + */ + setup_clear_cpu_cap(X86_FEATURE_PCID); if (!xen_initial_domain()) setup_clear_cpu_cap(X86_FEATURE_ACPI); @@ -525,7 +502,7 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr) static inline bool desc_equal(const struct desc_struct *d1, const struct desc_struct *d2) { - return d1->a == d2->a && d1->b == d2->b; + return !memcmp(d1, d2, sizeof(*d1)); } static void load_TLS_descriptor(struct thread_struct *t, @@ -610,59 +587,91 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, preempt_enable(); } +#ifdef CONFIG_X86_64 +struct trap_array_entry { + void (*orig)(void); + void (*xen)(void); + bool ist_okay; +}; + +static struct trap_array_entry trap_array[] = { + { debug, xen_xendebug, true }, + { int3, xen_xenint3, true }, + { double_fault, xen_double_fault, true }, +#ifdef CONFIG_X86_MCE + { machine_check, xen_machine_check, true }, +#endif + { nmi, xen_nmi, true }, + { overflow, xen_overflow, false }, +#ifdef CONFIG_IA32_EMULATION + { entry_INT80_compat, xen_entry_INT80_compat, false }, +#endif + { page_fault, xen_page_fault, false }, + { divide_error, xen_divide_error, false }, + { bounds, xen_bounds, false }, + { invalid_op, xen_invalid_op, false }, + { device_not_available, xen_device_not_available, false }, + { coprocessor_segment_overrun, xen_coprocessor_segment_overrun, false }, + { invalid_TSS, xen_invalid_TSS, false }, + { segment_not_present, xen_segment_not_present, false }, + { stack_segment, xen_stack_segment, false }, + { general_protection, xen_general_protection, false }, + { spurious_interrupt_bug, xen_spurious_interrupt_bug, false }, + { coprocessor_error, xen_coprocessor_error, false }, + { alignment_check, xen_alignment_check, false }, + { simd_coprocessor_error, xen_simd_coprocessor_error, false }, +}; + +static bool get_trap_addr(void **addr, unsigned int ist) +{ + unsigned int nr; + bool ist_okay = false; + + /* + * Replace trap handler addresses by Xen specific ones. + * Check for known traps using IST and whitelist them. + * The debugger ones are the only ones we care about. + * Xen will handle faults like double_fault, * so we should never see + * them. Warn if there's an unexpected IST-using fault handler. 
+ */ + for (nr = 0; nr < ARRAY_SIZE(trap_array); nr++) { + struct trap_array_entry *entry = trap_array + nr; + + if (*addr == entry->orig) { + *addr = entry->xen; + ist_okay = entry->ist_okay; + break; + } + } + + if (WARN_ON(ist != 0 && !ist_okay)) + return false; + + return true; +} +#endif + static int cvt_gate_to_trap(int vector, const gate_desc *val, struct trap_info *info) { unsigned long addr; - if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT) + if (val->bits.type != GATE_TRAP && val->bits.type != GATE_INTERRUPT) return 0; info->vector = vector; - addr = gate_offset(*val); + addr = gate_offset(val); #ifdef CONFIG_X86_64 - /* - * Look for known traps using IST, and substitute them - * appropriately. The debugger ones are the only ones we care - * about. Xen will handle faults like double_fault, - * so we should never see them. Warn if - * there's an unexpected IST-using fault handler. - */ - if (addr == (unsigned long)debug) - addr = (unsigned long)xen_debug; - else if (addr == (unsigned long)int3) - addr = (unsigned long)xen_int3; - else if (addr == (unsigned long)stack_segment) - addr = (unsigned long)xen_stack_segment; - else if (addr == (unsigned long)double_fault) { - /* Don't need to handle these */ + if (!get_trap_addr((void **)&addr, val->bits.ist)) return 0; -#ifdef CONFIG_X86_MCE - } else if (addr == (unsigned long)machine_check) { - /* - * when xen hypervisor inject vMCE to guest, - * use native mce handler to handle it - */ - ; -#endif - } else if (addr == (unsigned long)nmi) - /* - * Use the native version as well. - */ - ; - else { - /* Some other trap using IST? */ - if (WARN_ON(val->ist != 0)) - return 0; - } #endif /* CONFIG_X86_64 */ info->address = addr; - info->cs = gate_segment(*val); - info->flags = val->dpl; + info->cs = gate_segment(val); + info->flags = val->bits.dpl; /* interrupt gates clear IF */ - if (val->type == GATE_INTERRUPT) + if (val->bits.type == GATE_INTERRUPT) info->flags |= 1 << 2; return 1; @@ -960,30 +969,43 @@ void xen_setup_shared_info(void) HYPERVISOR_shared_info = (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); -#ifndef CONFIG_SMP - /* In UP this is as good a place as any to set up shared info */ - xen_setup_vcpu_info_placement(); -#endif - xen_setup_mfn_list_list(); - /* - * Now that shared info is set up we can start using routines that - * point to pvclock area. - */ - if (system_state == SYSTEM_BOOTING) + if (system_state == SYSTEM_BOOTING) { +#ifndef CONFIG_SMP + /* + * In UP this is as good a place as any to set up shared info. + * Limit this to boot only, at restore vcpu setup is done via + * xen_vcpu_restore(). + */ + xen_setup_vcpu_info_placement(); +#endif + /* + * Now that shared info is set up we can start using routines + * that point to pvclock area. + */ xen_init_time_ops(); + } } /* This is called once we have the cpu_possible_mask */ -void xen_setup_vcpu_info_placement(void) +void __ref xen_setup_vcpu_info_placement(void) { int cpu; for_each_possible_cpu(cpu) { /* Set up direct vCPU id mapping for PV guests. */ per_cpu(xen_vcpu_id, cpu) = cpu; - xen_vcpu_setup(cpu); + + /* + * xen_vcpu_setup(cpu) can fail -- in which case it + * falls back to the shared_info version for cpus + * where xen_vcpu_nr(cpu) < MAX_VIRT_CPUS. + * + * xen_cpu_up_prepare_pv() handles the rest by failing + * them in hotplug. 
+ */ + (void) xen_vcpu_setup(cpu); } /* @@ -999,59 +1021,6 @@ void xen_setup_vcpu_info_placement(void) } } -static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, - unsigned long addr, unsigned len) -{ - char *start, *end, *reloc; - unsigned ret; - - start = end = reloc = NULL; - -#define SITE(op, x) \ - case PARAVIRT_PATCH(op.x): \ - if (xen_have_vcpu_info_placement) { \ - start = (char *)xen_##x##_direct; \ - end = xen_##x##_direct_end; \ - reloc = xen_##x##_direct_reloc; \ - } \ - goto patch_site - - switch (type) { - SITE(pv_irq_ops, irq_enable); - SITE(pv_irq_ops, irq_disable); - SITE(pv_irq_ops, save_fl); - SITE(pv_irq_ops, restore_fl); -#undef SITE - - patch_site: - if (start == NULL || (end-start) > len) - goto default_patch; - - ret = paravirt_patch_insns(insnbuf, len, start, end); - - /* Note: because reloc is assigned from something that - appears to be an array, gcc assumes it's non-null, - but doesn't know its relationship with start and - end. */ - if (reloc > start && reloc < end) { - int reloc_off = reloc - start; - long *relocp = (long *)(insnbuf + reloc_off); - long delta = start - (char *)addr; - - *relocp += delta; - } - break; - - default_patch: - default: - ret = paravirt_patch_default(type, clobbers, insnbuf, - addr, len); - break; - } - - return ret; -} - static const struct pv_info xen_info __initconst = { .shared_kernel_pmd = 0, @@ -1061,10 +1030,6 @@ static const struct pv_info xen_info __initconst = { .name = "Xen", }; -static const struct pv_init_ops xen_init_ops __initconst = { - .patch = xen_patch, -}; - static const struct pv_cpu_ops xen_cpu_ops __initconst = { .cpuid = xen_cpuid, @@ -1074,7 +1039,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .read_cr0 = xen_read_cr0, .write_cr0 = xen_write_cr0, - .read_cr4 = native_read_cr4, .write_cr4 = xen_write_cr4, #ifdef CONFIG_X86_64 @@ -1109,7 +1073,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .alloc_ldt = xen_alloc_ldt, .free_ldt = xen_free_ldt, - .store_idt = native_store_idt, .store_tr = xen_store_tr, .write_ldt_entry = xen_write_ldt_entry, @@ -1262,7 +1225,7 @@ asmlinkage __visible void __init xen_start_kernel(void) /* Install Xen paravirt ops */ pv_info = xen_info; - pv_init_ops = xen_init_ops; + pv_init_ops.patch = paravirt_patch_default; pv_cpu_ops = xen_cpu_ops; x86_platform.get_nmi_reason = xen_get_nmi_reason; @@ -1332,9 +1295,17 @@ asmlinkage __visible void __init xen_start_kernel(void) */ acpi_numa = -1; #endif - /* Don't do the full vcpu_info placement stuff until we have a - possible map and a non-dummy shared_info. */ - per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; + /* Let's presume PV guests always boot on vCPU with id 0. */ + per_cpu(xen_vcpu_id, 0) = 0; + + /* + * Setup xen_vcpu early because start_kernel needs it for + * local_irq_disable(), irqs_disabled(). + * + * Don't do the full vcpu_info placement stuff until we have + * the cpu_possible_mask and a non-dummy shared_info. + */ + xen_vcpu_info_reset(0); WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_pv, xen_cpu_dead_pv)); @@ -1431,9 +1402,7 @@ asmlinkage __visible void __init xen_start_kernel(void) #endif xen_raw_console_write("about to get started...\n"); - /* Let's presume PV guests always boot on vCPU with id 0. 
*/ - per_cpu(xen_vcpu_id, 0) = 0; - + /* We need this for printk timestamps */ xen_setup_runstate_info(0); xen_efi_init(); @@ -1451,6 +1420,9 @@ static int xen_cpu_up_prepare_pv(unsigned int cpu) { int rc; + if (per_cpu(xen_vcpu, cpu) == NULL) + return -ENODEV; + xen_setup_timer(cpu); rc = xen_smp_intr_init(cpu); diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c index 98ab17673454..7bd3ee08393e 100644 --- a/arch/x86/xen/enlighten_pvh.c +++ b/arch/x86/xen/enlighten_pvh.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/acpi.h> #include <xen/hvc-console.h> diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 33e92955e09d..74179852e46c 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/hardirq.h> #include <asm/x86_init.h> @@ -123,9 +124,6 @@ static const struct pv_irq_ops xen_irq_ops __initconst = { .safe_halt = xen_safe_halt, .halt = xen_halt, -#ifdef CONFIG_X86_64 - .adjust_exception_frame = xen_adjust_exception_frame, -#endif }; void __init xen_init_irq_ops(void) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3be06f3caf3c..3e15345abfe7 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -84,7 +84,7 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, else rmd->mfn++; - rmd->mmu_update->ptr = virt_to_machine(ptep).maddr; + rmd->mmu_update->ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; rmd->mmu_update->val = pte_val_ma(pte); rmd->mmu_update++; diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 3fe2b3292915..a7e47cf7ec6c 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _XEN_MMU_H #include <linux/linkage.h> diff --git a/arch/x86/xen/mmu_hvm.c b/arch/x86/xen/mmu_hvm.c index 1c57f1cd545c..2cfcfe4f6b2a 100644 --- a/arch/x86/xen/mmu_hvm.c +++ b/arch/x86/xen/mmu_hvm.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/types.h> #include <linux/crash_dump.h> diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 1f386d7fdf70..71495f1a86d7 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -162,26 +162,6 @@ static bool xen_page_pinned(void *ptr) return PagePinned(page); } -void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid) -{ - struct multicall_space mcs; - struct mmu_update *u; - - trace_xen_mmu_set_domain_pte(ptep, pteval, domid); - - mcs = xen_mc_entry(sizeof(*u)); - u = mcs.args; - - /* ptep might be kmapped when using 32-bit HIGHPTE */ - u->ptr = virt_to_machine(ptep).maddr; - u->val = pte_val_ma(pteval); - - MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid); - - xen_mc_issue(PARAVIRT_LAZY_MMU); -} -EXPORT_SYMBOL_GPL(xen_set_domain_pte); - static void xen_extend_mmu_update(const struct mmu_update *update) { struct multicall_space mcs; @@ -975,69 +955,63 @@ static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) spin_unlock(&mm->page_table_lock); } - -#ifdef CONFIG_SMP -/* Another cpu may still have their %cr3 pointing at the pagetable, so - we need to repoint it somewhere else before we can unpin it. 
*/ -static void drop_other_mm_ref(void *info) +static void drop_mm_ref_this_cpu(void *info) { struct mm_struct *mm = info; - struct mm_struct *active_mm; - - active_mm = this_cpu_read(cpu_tlbstate.active_mm); - if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) + if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) leave_mm(smp_processor_id()); - /* If this cpu still has a stale cr3 reference, then make sure - it has been flushed. */ + /* + * If this cpu still has a stale cr3 reference, then make sure + * it has been flushed. + */ if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd)) - load_cr3(swapper_pg_dir); + xen_mc_flush(); } +#ifdef CONFIG_SMP +/* + * Another cpu may still have their %cr3 pointing at the pagetable, so + * we need to repoint it somewhere else before we can unpin it. + */ static void xen_drop_mm_ref(struct mm_struct *mm) { cpumask_var_t mask; unsigned cpu; - if (current->active_mm == mm) { - if (current->mm == mm) - load_cr3(swapper_pg_dir); - else - leave_mm(smp_processor_id()); - } + drop_mm_ref_this_cpu(mm); /* Get the "official" set of cpus referring to our pagetable. */ if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { for_each_online_cpu(cpu) { - if (!cpumask_test_cpu(cpu, mm_cpumask(mm)) - && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) + if (per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) continue; - smp_call_function_single(cpu, drop_other_mm_ref, mm, 1); + smp_call_function_single(cpu, drop_mm_ref_this_cpu, mm, 1); } return; } - cpumask_copy(mask, mm_cpumask(mm)); - /* It's possible that a vcpu may have a stale reference to our - cr3, because its in lazy mode, and it hasn't yet flushed - its set of pending hypercalls yet. In this case, we can - look at its actual current cr3 value, and force it to flush - if needed. */ + /* + * It's possible that a vcpu may have a stale reference to our + * cr3, because its in lazy mode, and it hasn't yet flushed + * its set of pending hypercalls yet. In this case, we can + * look at its actual current cr3 value, and force it to flush + * if needed. + */ + cpumask_clear(mask); for_each_online_cpu(cpu) { if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) cpumask_set_cpu(cpu, mask); } - if (!cpumask_empty(mask)) - smp_call_function_many(mask, drop_other_mm_ref, mm, 1); + smp_call_function_many(mask, drop_mm_ref_this_cpu, mm, 1); free_cpumask_var(mask); } #else static void xen_drop_mm_ref(struct mm_struct *mm) { - if (current->active_mm == mm) - load_cr3(swapper_pg_dir); + drop_mm_ref_this_cpu(mm); } #endif @@ -1264,21 +1238,16 @@ static void __init xen_pagetable_cleanhighmap(void) * from _brk_limit way up to the max_pfn_mapped (which is the end of * the ramdisk). We continue on, erasing PMD entries that point to page * tables - do note that they are accessible at this stage via __va. - * For good measure we also round up to the PMD - which means that if + * As Xen is aligning the memory end to a 4MB boundary, for good + * measure we also round up to PMD_SIZE * 2 - which means that if * anybody is using __ka address to the initial boot-stack - and try * to use it - they are going to crash. The xen_start_info has been * taken care of already in xen_setup_kernel_pagetable. 
*/ addr = xen_start_info->pt_base; - size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE); + size = xen_start_info->nr_pt_frames * PAGE_SIZE; - xen_cleanhighmap(addr, addr + size); + xen_cleanhighmap(addr, roundup(addr + size, PMD_SIZE * 2)); xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base)); -#ifdef DEBUG - /* This is superfluous and is not necessary, but you know what - * lets do it. The MODULES_VADDR -> MODULES_END should be clear of - * anything at this stage. */ - xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1); -#endif } #endif @@ -1366,8 +1335,7 @@ static void xen_flush_tlb_single(unsigned long addr) } static void xen_flush_tlb_others(const struct cpumask *cpus, - struct mm_struct *mm, unsigned long start, - unsigned long end) + const struct flush_tlb_info *info) { struct { struct mmuext_op op; @@ -1379,7 +1347,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus, } *args; struct multicall_space mcs; - trace_xen_mmu_flush_tlb_others(cpus, mm, start, end); + trace_xen_mmu_flush_tlb_others(cpus, info->mm, info->start, info->end); if (cpumask_empty(cpus)) return; /* nothing to do */ @@ -1393,9 +1361,10 @@ static void xen_flush_tlb_others(const struct cpumask *cpus, cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; - if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) { + if (info->end != TLB_FLUSH_ALL && + (info->end - info->start) <= PAGE_SIZE) { args->op.cmd = MMUEXT_INVLPG_MULTI; - args->op.arg1.linear_addr = start; + args->op.arg1.linear_addr = info->start; } MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); @@ -1470,8 +1439,8 @@ static void xen_write_cr3(unsigned long cr3) * At the start of the day - when Xen launches a guest, it has already * built pagetables for the guest. We diligently look over them * in xen_setup_kernel_pagetable and graft as appropriate them in the - * init_level4_pgt and its friends. Then when we are happy we load - * the new init_level4_pgt - and continue on. + * init_top_pgt and its friends. Then when we are happy we load + * the new init_top_pgt - and continue on. * * The generic code starts (start_kernel) and 'init_mem_mapping' sets * up the rest of the pagetables. When it has completed it loads the cr3. @@ -1914,12 +1883,12 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) pt_end = pt_base + xen_start_info->nr_pt_frames; /* Zap identity mapping */ - init_level4_pgt[0] = __pgd(0); + init_top_pgt[0] = __pgd(0); /* Pre-constructed entries are in pfn, so convert to mfn */ /* L4[272] -> level3_ident_pgt */ /* L4[511] -> level3_kernel_pgt */ - convert_pfn_mfn(init_level4_pgt); + convert_pfn_mfn(init_top_pgt); /* L3_i[0] -> level2_ident_pgt */ convert_pfn_mfn(level3_ident_pgt); @@ -1950,10 +1919,10 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) /* Copy the initial P->M table mappings if necessary. 
*/ i = pgd_index(xen_start_info->mfn_list); if (i && i < pgd_index(__START_KERNEL_map)) - init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i]; + init_top_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i]; /* Make pagetable pieces RO */ - set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); + set_page_prot(init_top_pgt, PAGE_KERNEL_RO); set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); @@ -1964,7 +1933,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) /* Pin down new L4 */ pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, - PFN_DOWN(__pa_symbol(init_level4_pgt))); + PFN_DOWN(__pa_symbol(init_top_pgt))); /* Unpin Xen-provided one */ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); @@ -1974,7 +1943,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) * attach it to, so make sure we just set kernel pgd. */ xen_mc_batch(); - __xen_write_cr3(true, __pa(init_level4_pgt)); + __xen_write_cr3(true, __pa(init_top_pgt)); xen_mc_issue(PARAVIRT_LAZY_CPU); /* We can't that easily rip out L3 and L2, as the Xen pagetables are @@ -2022,7 +1991,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) pmd_t pmd; pte_t pte; - pa = read_cr3(); + pa = read_cr3_pa(); pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) * sizeof(pgd))); if (!pgd_present(pgd)) @@ -2102,7 +2071,7 @@ void __init xen_relocate_p2m(void) pt_phys = pmd_phys + PFN_PHYS(n_pmd); p2m_pfn = PFN_DOWN(pt_phys) + n_pt; - pgd = __va(read_cr3()); + pgd = __va(read_cr3_pa()); new_p2m = (unsigned long *)(2 * PGDIR_SIZE); idx_p4d = 0; save_pud = n_pud; @@ -2209,7 +2178,7 @@ static void __init xen_write_cr3_init(unsigned long cr3) { unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir)); - BUG_ON(read_cr3() != __pa(initial_page_table)); + BUG_ON(read_cr3_pa() != __pa(initial_page_table)); BUG_ON(cr3 != __pa(swapper_pg_dir)); /* @@ -2246,7 +2215,7 @@ static void __init xen_write_cr3_init(unsigned long cr3) * not the first page table in the page table pool. * Iterate through the initial page tables to find the real page table base. */ -static phys_addr_t xen_find_pt_base(pmd_t *pmd) +static phys_addr_t __init xen_find_pt_base(pmd_t *pmd) { phys_addr_t pt_base, paddr; unsigned pmdidx; @@ -2435,8 +2404,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { .flush_tlb_single = xen_flush_tlb_single, .flush_tlb_others = xen_flush_tlb_others, - .pte_update = paravirt_nop, - .pgd_alloc = xen_pgd_alloc, .pgd_free = xen_pgd_free, @@ -2698,8 +2665,8 @@ EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region); phys_addr_t paddr_vmcoreinfo_note(void) { if (xen_pv_domain()) - return virt_to_machine(&vmcoreinfo_note).maddr; + return virt_to_machine(vmcoreinfo_note).maddr; else - return __pa_symbol(&vmcoreinfo_note); + return __pa(vmcoreinfo_note); } #endif /* CONFIG_KEXEC_CORE */ diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index ea54a08d8301..dc502ca8263e 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Xen hypercall batching. 
* diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h index 9c2e74f9096c..1c51b2c87f30 100644 --- a/arch/x86/xen/multicalls.h +++ b/arch/x86/xen/multicalls.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _XEN_MULTICALLS_H #define _XEN_MULTICALLS_H diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 276da636dd39..6083ba462f35 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -212,8 +212,7 @@ void __ref xen_build_mfn_list_list(void) unsigned int level, topidx, mididx; unsigned long *mid_mfn_p; - if (xen_feature(XENFEAT_auto_translated_physmap) || - xen_start_info->flags & SIF_VIRT_P2M_4TOOLS) + if (xen_start_info->flags & SIF_VIRT_P2M_4TOOLS) return; /* Pre-initialize p2m_top_mfn to be completely missing */ @@ -269,9 +268,6 @@ void __ref xen_build_mfn_list_list(void) void xen_setup_mfn_list_list(void) { - if (xen_feature(XENFEAT_auto_translated_physmap)) - return; - BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); if (xen_start_info->flags & SIF_VIRT_P2M_4TOOLS) @@ -291,9 +287,6 @@ void __init xen_build_dynamic_phys_to_machine(void) { unsigned long pfn; - if (xen_feature(XENFEAT_auto_translated_physmap)) - return; - xen_p2m_addr = (unsigned long *)xen_start_info->mfn_list; xen_p2m_size = ALIGN(xen_start_info->nr_pages, P2M_PER_PAGE); @@ -540,9 +533,6 @@ int xen_alloc_p2m_entry(unsigned long pfn) unsigned long addr = (unsigned long)(xen_p2m_addr + pfn); unsigned long p2m_pfn; - if (xen_feature(XENFEAT_auto_translated_physmap)) - return 0; - ptep = lookup_address(addr, &level); BUG_ON(!ptep || level != PG_LEVEL_4K); pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1)); @@ -640,9 +630,6 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s, if (unlikely(pfn_s >= xen_p2m_size)) return 0; - if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) - return pfn_e - pfn_s; - if (pfn_s > pfn_e) return 0; @@ -660,10 +647,6 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) pte_t *ptep; unsigned int level; - /* don't track P2M changes in autotranslate guests */ - if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) - return true; - if (unlikely(pfn >= xen_p2m_size)) { BUG_ON(mfn != INVALID_P2M_ENTRY); return true; @@ -711,9 +694,6 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, int i, ret = 0; pte_t *pte; - if (xen_feature(XENFEAT_auto_translated_physmap)) - return 0; - if (kmap_ops) { ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, kmap_ops, count); @@ -756,9 +736,6 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, { int i, ret = 0; - if (xen_feature(XENFEAT_auto_translated_physmap)) - return 0; - for (i = 0; i < count; i++) { unsigned long mfn = __pfn_to_mfn(page_to_pfn(pages[i])); unsigned long pfn = page_to_pfn(pages[i]); diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index 42b08f8fc2ca..37c6056a7bba 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -18,20 +18,6 @@ int xen_swiotlb __read_mostly; -static const struct dma_map_ops xen_swiotlb_dma_ops = { - .alloc = xen_swiotlb_alloc_coherent, - .free = xen_swiotlb_free_coherent, - .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu, - .sync_single_for_device = xen_swiotlb_sync_single_for_device, - .sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu, - .sync_sg_for_device = xen_swiotlb_sync_sg_for_device, - .map_sg = xen_swiotlb_map_sg_attrs, - .unmap_sg = xen_swiotlb_unmap_sg_attrs, - .map_page = xen_swiotlb_map_page, - .unmap_page = 
xen_swiotlb_unmap_page, - .dma_supported = xen_swiotlb_dma_supported, -}; - /* * pci_xen_swiotlb_detect - set xen_swiotlb to 1 if necessary * diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c index b9fc52556bcc..7d00d4ad44d4 100644 --- a/arch/x86/xen/pmu.c +++ b/arch/x86/xen/pmu.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/types.h> #include <linux/interrupt.h> diff --git a/arch/x86/xen/pmu.h b/arch/x86/xen/pmu.h index 4be5355b56f7..0e83a160589b 100644 --- a/arch/x86/xen/pmu.h +++ b/arch/x86/xen/pmu.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __XEN_PMU_H #define __XEN_PMU_H diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index a5bf7c451435..c114ca767b3b 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Machine specific setup for xen * @@ -340,8 +341,6 @@ static void __init xen_do_set_identity_and_remap_chunk( WARN_ON(size == 0); - BUG_ON(xen_feature(XENFEAT_auto_translated_physmap)); - mfn_save = virt_to_mfn(buf); for (ident_pfn_iter = start_pfn, remap_pfn_iter = remap_pfn; @@ -499,7 +498,7 @@ static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages, void __init xen_remap_memory(void) { unsigned long buf = (unsigned long)&xen_remap_buf; - unsigned long mfn_save, mfn, pfn; + unsigned long mfn_save, pfn; unsigned long remapped = 0; unsigned int i; unsigned long pfn_s = ~0UL; @@ -515,8 +514,7 @@ void __init xen_remap_memory(void) pfn = xen_remap_buf.target_pfn; for (i = 0; i < xen_remap_buf.size; i++) { - mfn = xen_remap_buf.mfns[i]; - xen_update_mem_tables(pfn, mfn); + xen_update_mem_tables(pfn, xen_remap_buf.mfns[i]); remapped++; pfn++; } @@ -530,8 +528,6 @@ void __init xen_remap_memory(void) pfn_s = xen_remap_buf.target_pfn; len = xen_remap_buf.size; } - - mfn = xen_remap_mfn; xen_remap_mfn = xen_remap_buf.next_area_mfn; } @@ -1027,8 +1023,7 @@ void __init xen_pvmmu_arch_setup(void) void __init xen_arch_setup(void) { xen_panic_handler_init(); - if (!xen_feature(XENFEAT_auto_translated_physmap)) - xen_pvmmu_arch_setup(); + xen_pvmmu_arch_setup(); #ifdef CONFIG_ACPI if (!(xen_start_info->flags & SIF_INITDOMAIN)) { diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 82ac611f2fc1..77c959cf81e7 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -1,4 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/smp.h> +#include <linux/cpu.h> #include <linux/slab.h> #include <linux/cpumask.h> #include <linux/percpu.h> @@ -114,6 +116,36 @@ int xen_smp_intr_init(unsigned int cpu) return rc; } +void __init xen_smp_cpus_done(unsigned int max_cpus) +{ + int cpu, rc, count = 0; + + if (xen_hvm_domain()) + native_smp_cpus_done(max_cpus); + + if (xen_have_vcpu_info_placement) + return; + + for_each_online_cpu(cpu) { + if (xen_vcpu_nr(cpu) < MAX_VIRT_CPUS) + continue; + + rc = cpu_down(cpu); + + if (rc == 0) { + /* + * Reset vcpu_info so this cpu cannot be onlined again. 
+ */ + xen_vcpu_info_reset(cpu); + count++; + } else { + pr_warn("%s: failed to bring CPU %d down, error %d\n", + __func__, cpu, rc); + } + } + WARN(count, "%s: brought %d CPUs offline\n", __func__, count); +} + void xen_smp_send_reschedule(int cpu) { xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h index 8ebb6acca64a..bd02f9d50107 100644 --- a/arch/x86/xen/smp.h +++ b/arch/x86/xen/smp.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _XEN_SMP_H #ifdef CONFIG_SMP @@ -14,6 +15,8 @@ extern void xen_smp_intr_free(unsigned int cpu); int xen_smp_intr_init_pv(unsigned int cpu); void xen_smp_intr_free_pv(unsigned int cpu); +void xen_smp_cpus_done(unsigned int max_cpus); + void xen_smp_send_reschedule(int cpu); void xen_smp_send_call_function_ipi(const struct cpumask *mask); void xen_smp_send_call_function_single_ipi(int cpu); diff --git a/arch/x86/xen/smp_hvm.c b/arch/x86/xen/smp_hvm.c index f18561bbf5c9..f8d39440b292 100644 --- a/arch/x86/xen/smp_hvm.c +++ b/arch/x86/xen/smp_hvm.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <asm/smp.h> #include <xen/events.h> @@ -12,7 +13,8 @@ static void __init xen_hvm_smp_prepare_boot_cpu(void) native_smp_prepare_boot_cpu(); /* - * Setup vcpu_info for boot CPU. + * Setup vcpu_info for boot CPU. Secondary CPUs get their vcpu_info + * in xen_cpu_up_prepare_hvm(). */ xen_vcpu_setup(0); @@ -27,10 +29,20 @@ static void __init xen_hvm_smp_prepare_boot_cpu(void) static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) { + int cpu; + native_smp_prepare_cpus(max_cpus); WARN_ON(xen_smp_intr_init(0)); xen_init_lock_cpu(0); + + for_each_possible_cpu(cpu) { + if (cpu == 0) + continue; + + /* Set default vcpu_id to make sure that we don't use cpu-0's */ + per_cpu(xen_vcpu_id, cpu) = XEN_VCPU_ID_INVALID; + } } #ifdef CONFIG_HOTPLUG_CPU @@ -60,4 +72,5 @@ void __init xen_hvm_smp_init(void) smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi; smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi; smp_ops.smp_prepare_boot_cpu = xen_hvm_smp_prepare_boot_cpu; + smp_ops.smp_cpus_done = xen_smp_cpus_done; } diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index aae32535f4ec..05f91ce9b55e 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Xen SMP support * @@ -19,6 +20,7 @@ #include <linux/irq_work.h> #include <linux/tick.h> #include <linux/nmi.h> +#include <linux/cpuhotplug.h> #include <asm/paravirt.h> #include <asm/desc.h> @@ -371,10 +373,6 @@ static int xen_pv_cpu_up(unsigned int cpu, struct task_struct *idle) return 0; } -static void xen_pv_smp_cpus_done(unsigned int max_cpus) -{ -} - #ifdef CONFIG_HOTPLUG_CPU static int xen_pv_cpu_disable(void) { @@ -417,7 +415,7 @@ static void xen_pv_play_dead(void) /* used only with HOTPLUG_CPU */ */ tick_nohz_idle_enter(); - cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); + cpuhp_online_idle(CPUHP_AP_ONLINE_IDLE); } #else /* !CONFIG_HOTPLUG_CPU */ @@ -469,7 +467,7 @@ static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id) static const struct smp_ops xen_smp_ops __initconst = { .smp_prepare_boot_cpu = xen_pv_smp_prepare_boot_cpu, .smp_prepare_cpus = xen_pv_smp_prepare_cpus, - .smp_cpus_done = xen_pv_smp_cpus_done, + .smp_cpus_done = xen_smp_cpus_done, .cpu_up = xen_pv_cpu_up, .cpu_die = xen_pv_cpu_die, diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 25a7c4302ce7..08324c64005d 100644 --- a/arch/x86/xen/spinlock.c +++ 
b/arch/x86/xen/spinlock.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Split spinlock implementation out into its own file, so it can be * compiled in a FTRACE-compatible way. diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index d6b1680693a9..92bf5ecb6baf 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/types.h> #include <linux/tick.h> diff --git a/arch/x86/xen/suspend_hvm.c b/arch/x86/xen/suspend_hvm.c index 01afcadde50a..e666b614cf6d 100644 --- a/arch/x86/xen/suspend_hvm.c +++ b/arch/x86/xen/suspend_hvm.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/types.h> #include <xen/xen.h> @@ -8,15 +9,10 @@ void xen_hvm_post_suspend(int suspend_cancelled) { - int cpu; - - if (!suspend_cancelled) + if (!suspend_cancelled) { xen_hvm_init_shared_info(); + xen_vcpu_restore(); + } xen_callback_vector(); xen_unplug_emulated_devices(); - if (xen_feature(XENFEAT_hvm_safe_pvclock)) { - for_each_online_cpu(cpu) { - xen_setup_runstate_info(cpu); - } - } } diff --git a/arch/x86/xen/suspend_pv.c b/arch/x86/xen/suspend_pv.c index 3abe4f07f34a..a2e0f110af56 100644 --- a/arch/x86/xen/suspend_pv.c +++ b/arch/x86/xen/suspend_pv.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/types.h> #include <asm/fixmap.h> diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index a1895a8e85c1..80c2a4bdf230 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Xen time implementation. * @@ -309,7 +310,6 @@ static irqreturn_t xen_timer_interrupt(int irq, void *dev_id) void xen_teardown_timer(int cpu) { struct clock_event_device *evt; - BUG_ON(cpu == 0); evt = &per_cpu(xen_clock_events, cpu).evt; if (evt->irq >= 0) { diff --git a/arch/x86/xen/trace.c b/arch/x86/xen/trace.c index a702ec2f5931..329f60eb95de 100644 --- a/arch/x86/xen/trace.c +++ b/arch/x86/xen/trace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/ftrace.h> #include <xen/interface/xen.h> #include <xen/interface/xen-mca.h> diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c index 6722e3733f02..e336f223f7f4 100644 --- a/arch/x86/xen/vga.c +++ b/arch/x86/xen/vga.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/screen_info.h> #include <linux/init.h> diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index eff224df813f..8019edd0125c 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -1,14 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* - * Asm versions of Xen pv-ops, suitable for either direct use or - * inlining. The inline versions are the same as the direct-use - * versions, with the pre- and post-amble chopped off. - * - * This code is encoded for size rather than absolute efficiency, with - * a view to being able to inline as much as possible. + * Asm versions of Xen pv-ops, suitable for direct use. * * We only bother with direct forms (ie, vcpu in percpu data) of the - * operations here; the indirect forms are better handled in C, since - * they're generally too large to inline anyway. + * operations here; the indirect forms are better handled in C. */ #include <asm/asm-offsets.h> @@ -16,7 +11,7 @@ #include <asm/processor-flags.h> #include <asm/frame.h> -#include "xen-asm.h" +#include <linux/linkage.h> /* * Enable events. 
This clears the event mask and tests the pending @@ -38,13 +33,11 @@ ENTRY(xen_irq_enable_direct) testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending jz 1f -2: call check_events + call check_events 1: -ENDPATCH(xen_irq_enable_direct) FRAME_END ret ENDPROC(xen_irq_enable_direct) - RELOC(xen_irq_enable_direct, 2b+1) /* @@ -53,10 +46,8 @@ ENDPATCH(xen_irq_enable_direct) */ ENTRY(xen_irq_disable_direct) movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask -ENDPATCH(xen_irq_disable_direct) ret - ENDPROC(xen_irq_disable_direct) - RELOC(xen_irq_disable_direct, 0) +ENDPROC(xen_irq_disable_direct) /* * (xen_)save_fl is used to get the current interrupt enable status. @@ -71,10 +62,8 @@ ENTRY(xen_save_fl_direct) testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask setz %ah addb %ah, %ah -ENDPATCH(xen_save_fl_direct) ret ENDPROC(xen_save_fl_direct) - RELOC(xen_save_fl_direct, 0) /* @@ -101,13 +90,11 @@ ENTRY(xen_restore_fl_direct) /* check for unmasked and pending */ cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending jnz 1f -2: call check_events + call check_events 1: -ENDPATCH(xen_restore_fl_direct) FRAME_END ret ENDPROC(xen_restore_fl_direct) - RELOC(xen_restore_fl_direct, 2b+1) /* diff --git a/arch/x86/xen/xen-asm.h b/arch/x86/xen/xen-asm.h deleted file mode 100644 index 465276467a47..000000000000 --- a/arch/x86/xen/xen-asm.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef _XEN_XEN_ASM_H -#define _XEN_XEN_ASM_H - -#include <linux/linkage.h> - -#define RELOC(x, v) .globl x##_reloc; x##_reloc=v -#define ENDPATCH(x) .globl x##_end; x##_end=. - -/* Pseudo-flag used for virtual NMI, which we don't implement yet */ -#define XEN_EFLAGS_NMI 0x80000000 - -#endif diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index feb6d40a0860..c15db060a242 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -1,14 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* - * Asm versions of Xen pv-ops, suitable for either direct use or - * inlining. The inline versions are the same as the direct-use - * versions, with the pre- and post-amble chopped off. - * - * This code is encoded for size rather than absolute efficiency, with - * a view to being able to inline as much as possible. + * Asm versions of Xen pv-ops, suitable for direct use. * * We only bother with direct forms (ie, vcpu in pda) of the - * operations here; the indirect forms are better handled in C, since - * they're generally too large to inline anyway. + * operations here; the indirect forms are better handled in C. */ #include <asm/thread_info.h> @@ -18,21 +13,10 @@ #include <xen/interface/xen.h> -#include "xen-asm.h" +#include <linux/linkage.h> -/* - * Force an event check by making a hypercall, but preserve regs - * before making the call. - */ -check_events: - push %eax - push %ecx - push %edx - call xen_force_evtchn_callback - pop %edx - pop %ecx - pop %eax - ret +/* Pseudo-flag used for virtual NMI, which we don't implement yet */ +#define XEN_EFLAGS_NMI 0x80000000 /* * This is run where a normal iret would be run, with the same stack setup: diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index c3df43141e70..c98a48c861fd 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -1,14 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* - * Asm versions of Xen pv-ops, suitable for either direct use or - * inlining. The inline versions are the same as the direct-use - * versions, with the pre- and post-amble chopped off. 
- * - * This code is encoded for size rather than absolute efficiency, with - * a view to being able to inline as much as possible. + * Asm versions of Xen pv-ops, suitable for direct use. * * We only bother with direct forms (ie, vcpu in pda) of the - * operations here; the indirect forms are better handled in C, since - * they're generally too large to inline anyway. + * operations here; the indirect forms are better handled in C. */ #include <asm/errno.h> @@ -20,13 +15,44 @@ #include <xen/interface/xen.h> -#include "xen-asm.h" +#include <linux/linkage.h> + +.macro xen_pv_trap name +ENTRY(xen_\name) + pop %rcx + pop %r11 + jmp \name +END(xen_\name) +.endm -ENTRY(xen_adjust_exception_frame) - mov 8+0(%rsp), %rcx - mov 8+8(%rsp), %r11 - ret $16 -ENDPROC(xen_adjust_exception_frame) +xen_pv_trap divide_error +xen_pv_trap debug +xen_pv_trap xendebug +xen_pv_trap int3 +xen_pv_trap xenint3 +xen_pv_trap nmi +xen_pv_trap overflow +xen_pv_trap bounds +xen_pv_trap invalid_op +xen_pv_trap device_not_available +xen_pv_trap double_fault +xen_pv_trap coprocessor_segment_overrun +xen_pv_trap invalid_TSS +xen_pv_trap segment_not_present +xen_pv_trap stack_segment +xen_pv_trap general_protection +xen_pv_trap page_fault +xen_pv_trap spurious_interrupt_bug +xen_pv_trap coprocessor_error +xen_pv_trap alignment_check +#ifdef CONFIG_X86_MCE +xen_pv_trap machine_check +#endif /* CONFIG_X86_MCE */ +xen_pv_trap simd_coprocessor_error +#ifdef CONFIG_IA32_EMULATION +xen_pv_trap entry_INT80_compat +#endif +xen_pv_trap hypervisor_callback hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 /* @@ -46,9 +72,7 @@ hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 */ ENTRY(xen_iret) pushq $0 -1: jmp hypercall_iret -ENDPATCH(xen_iret) -RELOC(xen_iret, 1b+1) + jmp hypercall_iret ENTRY(xen_sysret64) /* @@ -65,9 +89,7 @@ ENTRY(xen_sysret64) pushq %rcx pushq $VGCF_in_syscall -1: jmp hypercall_iret -ENDPATCH(xen_sysret64) -RELOC(xen_sysret64, 1b+1) + jmp hypercall_iret /* * Xen handles syscall callbacks much like ordinary exceptions, which @@ -82,34 +104,47 @@ RELOC(xen_sysret64, 1b+1) * rip * r11 * rsp->rcx - * - * In all the entrypoints, we undo all that to make it look like a - * CPU-generated syscall/sysenter and jump to the normal entrypoint. */ -.macro undo_xen_syscall - mov 0*8(%rsp), %rcx - mov 1*8(%rsp), %r11 - mov 5*8(%rsp), %rsp -.endm - /* Normal 64-bit system call target */ ENTRY(xen_syscall_target) - undo_xen_syscall - jmp entry_SYSCALL_64_after_swapgs + popq %rcx + popq %r11 + + /* + * Neither Xen nor the kernel really knows what the old SS and + * CS were. The kernel expects __USER_DS and __USER_CS, so + * report those values even though Xen will guess its own values. + */ + movq $__USER_DS, 4*8(%rsp) + movq $__USER_CS, 1*8(%rsp) + + jmp entry_SYSCALL_64_after_hwframe ENDPROC(xen_syscall_target) #ifdef CONFIG_IA32_EMULATION /* 32-bit compat syscall target */ ENTRY(xen_syscall32_target) - undo_xen_syscall - jmp entry_SYSCALL_compat + popq %rcx + popq %r11 + + /* + * Neither Xen nor the kernel really knows what the old SS and + * CS were. The kernel expects __USER32_DS and __USER32_CS, so + * report those values even though Xen will guess its own values. 
+ */ + movq $__USER32_DS, 4*8(%rsp) + movq $__USER32_CS, 1*8(%rsp) + + jmp entry_SYSCALL_compat_after_hwframe ENDPROC(xen_syscall32_target) /* 32-bit compat sysenter target */ ENTRY(xen_sysenter_target) - undo_xen_syscall + mov 0*8(%rsp), %rcx + mov 1*8(%rsp), %r11 + mov 5*8(%rsp), %rsp jmp entry_SYSENTER_compat ENDPROC(xen_sysenter_target) diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 72a8e6adebe6..b5b8d7f43557 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* Xen-specific pieces of head.S, intended to be included in the right place in head.S */ @@ -58,7 +59,7 @@ ENTRY(hypercall_page) #else ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map) /* Map the p2m table to a 512GB-aligned user address. */ - ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad PGDIR_SIZE) + ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad (PUD_SIZE * PTRS_PER_PUD)) #endif #ifdef CONFIG_XEN_PV ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen) diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 9a440a42c618..f377e1820c6c 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef XEN_OPS_H #define XEN_OPS_H @@ -78,7 +79,8 @@ bool xen_vcpu_stolen(int vcpu); extern int xen_have_vcpu_info_placement; -void xen_vcpu_setup(int cpu); +int xen_vcpu_setup(int cpu); +void xen_vcpu_info_reset(int cpu); void xen_setup_vcpu_info_placement(void); #ifdef CONFIG_SMP @@ -128,23 +130,15 @@ static inline void __init xen_efi_init(void) } #endif -/* Declare an asm function, along with symbols needed to make it - inlineable */ -#define DECL_ASM(ret, name, ...) \ - __visible ret name(__VA_ARGS__); \ - extern char name##_end[] __visible; \ - extern char name##_reloc[] __visible - -DECL_ASM(void, xen_irq_enable_direct, void); -DECL_ASM(void, xen_irq_disable_direct, void); -DECL_ASM(unsigned long, xen_save_fl_direct, void); -DECL_ASM(void, xen_restore_fl_direct, unsigned long); +__visible void xen_irq_enable_direct(void); +__visible void xen_irq_disable_direct(void); +__visible unsigned long xen_save_fl_direct(void); +__visible void xen_restore_fl_direct(unsigned long); /* These are not functions, and cannot be called normally */ __visible void xen_iret(void); __visible void xen_sysret32(void); __visible void xen_sysret64(void); -__visible void xen_adjust_exception_frame(void); extern int xen_panic_handler_init(void); diff --git a/arch/x86/xen/xen-pvh.S b/arch/x86/xen/xen-pvh.S index 5e246716d58f..e1a5fbeae08d 100644 --- a/arch/x86/xen/xen-pvh.S +++ b/arch/x86/xen/xen-pvh.S @@ -87,7 +87,7 @@ ENTRY(pvh_start_xen) wrmsr /* Enable pre-constructed page tables. */ - mov $_pa(init_level4_pgt), %eax + mov $_pa(init_top_pgt), %eax mov %eax, %cr3 mov $(X86_CR0_PG | X86_CR0_PE), %eax mov %eax, %cr0 |
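Note on the enlighten_pv.c hunk earlier in this diff: the open-coded if/else chain in cvt_gate_to_trap() is replaced by a table-driven lookup (trap_array plus get_trap_addr), which swaps each native trap handler for its Xen PV entry point and whitelists the handlers that are allowed to use an IST stack. The following is a minimal, self-contained sketch of that pattern only; the handler functions, the trap_handler_t typedef, and the simplified signature are stand-ins for illustration and are not the kernel's actual declarations.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Stand-in handlers; in the kernel these are the native and Xen entry points. */
static void native_debug(void)      { }
static void xen_debug_entry(void)   { }
static void native_page_fault(void) { }
static void xen_page_fault_entry(void) { }

typedef void (*trap_handler_t)(void);

struct trap_array_entry {
	trap_handler_t orig;    /* native handler installed in the IDT */
	trap_handler_t xen;     /* Xen PV replacement entry point */
	bool ist_okay;          /* true if an IST index is expected for this trap */
};

static const struct trap_array_entry trap_array[] = {
	{ native_debug,      xen_debug_entry,      true  },
	{ native_page_fault, xen_page_fault_entry, false },
};

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

/* Replace a native handler address with its Xen variant; reject unexpected IST use. */
static bool get_trap_addr(trap_handler_t *addr, unsigned int ist)
{
	bool ist_okay = false;
	size_t nr;

	for (nr = 0; nr < ARRAY_SIZE(trap_array); nr++) {
		if (*addr == trap_array[nr].orig) {
			*addr = trap_array[nr].xen;
			ist_okay = trap_array[nr].ist_okay;
			break;
		}
	}

	/* Unknown traps keep their address but must not request an IST stack. */
	return !(ist != 0 && !ist_okay);
}

int main(void)
{
	trap_handler_t addr = native_page_fault;

	if (get_trap_addr(&addr, 0))
		printf("remapped to xen handler: %d\n", addr == xen_page_fault_entry);
	return 0;
}

The design benefit visible in the patch is that the IST whitelist lives in one array, so adding config-dependent entries such as machine_check (CONFIG_X86_MCE) or entry_INT80_compat (CONFIG_IA32_EMULATION) becomes a single table line instead of another branch in cvt_gate_to_trap().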