summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig15
-rw-r--r--arch/x86/Kconfig.cpu6
-rw-r--r--arch/x86/boot/compressed/Makefile2
-rw-r--r--arch/x86/boot/compressed/acpi.c2
-rw-r--r--arch/x86/boot/compressed/ident_map_64.c5
-rw-r--r--arch/x86/boot/compressed/idt_64.c1
-rw-r--r--arch/x86/boot/compressed/idt_handlers_64.S1
-rw-r--r--arch/x86/boot/compressed/mem.c2
-rw-r--r--arch/x86/boot/compressed/misc.h1
-rw-r--r--arch/x86/boot/pm.c7
-rw-r--r--arch/x86/boot/string.c2
-rw-r--r--arch/x86/coco/tdx/tdx.c3
-rw-r--r--arch/x86/crypto/Kconfig8
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S2
-rw-r--r--arch/x86/crypto/aesni-intel_avx-x86_64.S2
-rw-r--r--arch/x86/crypto/crc32c-pcl-intel-asm_64.S2
-rw-r--r--arch/x86/crypto/sha1_ssse3_glue.c7
-rw-r--r--arch/x86/crypto/sha256_ssse3_glue.c7
-rw-r--r--arch/x86/crypto/sha512-avx-asm.S2
-rw-r--r--arch/x86/crypto/sha512-ssse3-asm.S2
-rw-r--r--arch/x86/crypto/sm4-aesni-avx-asm_64.S52
-rw-r--r--arch/x86/crypto/sm4-aesni-avx2-asm_64.S55
-rw-r--r--arch/x86/crypto/sm4-avx.h4
-rw-r--r--arch/x86/crypto/sm4_aesni_avx2_glue.c26
-rw-r--r--arch/x86/crypto/sm4_aesni_avx_glue.c130
-rw-r--r--arch/x86/entry/calling.h12
-rw-r--r--arch/x86/entry/common.c93
-rw-r--r--arch/x86/entry/entry_64.S33
-rw-r--r--arch/x86/entry/entry_64_compat.S77
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl5
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl5
-rw-r--r--arch/x86/entry/vdso/vclock_gettime.c10
-rw-r--r--arch/x86/events/amd/brs.c2
-rw-r--r--arch/x86/events/amd/core.c4
-rw-r--r--arch/x86/events/amd/ibs.c3
-rw-r--r--arch/x86/events/core.c4
-rw-r--r--arch/x86/events/intel/core.c154
-rw-r--r--arch/x86/events/intel/cstate.c158
-rw-r--r--arch/x86/events/intel/ds.c4
-rw-r--r--arch/x86/events/intel/lbr.c85
-rw-r--r--arch/x86/events/intel/uncore.c12
-rw-r--r--arch/x86/events/intel/uncore.h10
-rw-r--r--arch/x86/events/intel/uncore_discovery.c5
-rw-r--r--arch/x86/events/intel/uncore_discovery.h2
-rw-r--r--arch/x86/events/intel/uncore_nhmex.c2
-rw-r--r--arch/x86/events/intel/uncore_snbep.c208
-rw-r--r--arch/x86/events/perf_event.h12
-rw-r--r--arch/x86/events/perf_event_flags.h2
-rw-r--r--arch/x86/hyperv/hv_apic.c2
-rw-r--r--arch/x86/hyperv/irqdomain.c2
-rw-r--r--arch/x86/hyperv/ivm.c2
-rw-r--r--arch/x86/include/asm/alternative.h30
-rw-r--r--arch/x86/include/asm/amd_nb.h2
-rw-r--r--arch/x86/include/asm/apic.h2
-rw-r--r--arch/x86/include/asm/apicdef.h276
-rw-r--r--arch/x86/include/asm/barrier.h18
-rw-r--r--arch/x86/include/asm/cfi.h126
-rw-r--r--arch/x86/include/asm/cpufeatures.h8
-rw-r--r--arch/x86/include/asm/current.h1
-rw-r--r--arch/x86/include/asm/debugreg.h1
-rw-r--r--arch/x86/include/asm/desc_defs.h78
-rw-r--r--arch/x86/include/asm/elf.h2
-rw-r--r--arch/x86/include/asm/extable_fixup_types.h2
-rw-r--r--arch/x86/include/asm/fpu/types.h4
-rw-r--r--arch/x86/include/asm/ia32.h18
-rw-r--r--arch/x86/include/asm/idtentry.h4
-rw-r--r--arch/x86/include/asm/io.h8
-rw-r--r--arch/x86/include/asm/iosf_mbi.h2
-rw-r--r--arch/x86/include/asm/irq_work.h1
-rw-r--r--arch/x86/include/asm/kvm_host.h2
-rw-r--r--arch/x86/include/asm/mce.h4
-rw-r--r--arch/x86/include/asm/msr-index.h5
-rw-r--r--arch/x86/include/asm/mwait.h20
-rw-r--r--arch/x86/include/asm/nospec-branch.h4
-rw-r--r--arch/x86/include/asm/paravirt.h83
-rw-r--r--arch/x86/include/asm/paravirt_types.h88
-rw-r--r--arch/x86/include/asm/percpu.h2
-rw-r--r--arch/x86/include/asm/perf_event.h4
-rw-r--r--arch/x86/include/asm/pgtable.h7
-rw-r--r--arch/x86/include/asm/pgtable_64.h2
-rw-r--r--arch/x86/include/asm/preempt.h1
-rw-r--r--arch/x86/include/asm/processor.h18
-rw-r--r--arch/x86/include/asm/proto.h4
-rw-r--r--arch/x86/include/asm/qspinlock_paravirt.h4
-rw-r--r--arch/x86/include/asm/setup.h2
-rw-r--r--arch/x86/include/asm/syscall_wrapper.h34
-rw-r--r--arch/x86/include/asm/text-patching.h12
-rw-r--r--arch/x86/include/asm/traps.h1
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h2
-rw-r--r--arch/x86/include/asm/vdso/gettimeofday.h4
-rw-r--r--arch/x86/include/asm/xen/interface_64.h2
-rw-r--r--arch/x86/include/uapi/asm/amd_hsmp.h2
-rw-r--r--arch/x86/include/uapi/asm/signal.h1
-rw-r--r--arch/x86/kernel/acpi/boot.c2
-rw-r--r--arch/x86/kernel/alternative.c237
-rw-r--r--arch/x86/kernel/amd_gart_64.c2
-rw-r--r--arch/x86/kernel/apic/Makefile2
-rw-r--r--arch/x86/kernel/apic/apic.c2
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c2
-rw-r--r--arch/x86/kernel/apic/apic_noop.c1
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c2
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c1
-rw-r--r--arch/x86/kernel/apic/io_apic.c2
-rw-r--r--arch/x86/kernel/apic/probe_32.c1
-rw-r--r--arch/x86/kernel/apic/vector.c4
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c1
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c1
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c1
-rw-r--r--arch/x86/kernel/apm_32.c2
-rw-r--r--arch/x86/kernel/callthunks.c17
-rw-r--r--arch/x86/kernel/cfi.c4
-rw-r--r--arch/x86/kernel/cpu/amd.c271
-rw-r--r--arch/x86/kernel/cpu/common.c57
-rw-r--r--arch/x86/kernel/cpu/hygon.c3
-rw-r--r--arch/x86/kernel/cpu/intel_epb.c2
-rw-r--r--arch/x86/kernel/cpu/mce/amd.c80
-rw-r--r--arch/x86/kernel/cpu/mce/core.c72
-rw-r--r--arch/x86/kernel/cpu/mce/inject.c1
-rw-r--r--arch/x86/kernel/cpu/mce/intel.c304
-rw-r--r--arch/x86/kernel/cpu/mce/internal.h66
-rw-r--r--arch/x86/kernel/cpu/mce/threshold.c115
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c20
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c14
-rw-r--r--arch/x86/kernel/cpu/sgx/ioctl.c2
-rw-r--r--arch/x86/kernel/crash.c16
-rw-r--r--arch/x86/kernel/fpu/bugs.c1
-rw-r--r--arch/x86/kernel/fpu/core.c2
-rw-r--r--arch/x86/kernel/head64.c6
-rw-r--r--arch/x86/kernel/head_64.S53
-rw-r--r--arch/x86/kernel/hpet.c4
-rw-r--r--arch/x86/kernel/idt.c2
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c23
-rw-r--r--arch/x86/kernel/kprobes/core.c3
-rw-r--r--arch/x86/kernel/kvm.c6
-rw-r--r--arch/x86/kernel/kvmclock.c2
-rw-r--r--arch/x86/kernel/ldt.c6
-rw-r--r--arch/x86/kernel/machine_kexec_64.c7
-rw-r--r--arch/x86/kernel/module.c20
-rw-r--r--arch/x86/kernel/paravirt.c54
-rw-r--r--arch/x86/kernel/process.c2
-rw-r--r--arch/x86/kernel/setup.c4
-rw-r--r--arch/x86/kernel/setup_percpu.c4
-rw-r--r--arch/x86/kernel/sev-shared.c2
-rw-r--r--arch/x86/kernel/sev.c11
-rw-r--r--arch/x86/kernel/signal.c1
-rw-r--r--arch/x86/kernel/smpboot.c1
-rw-r--r--arch/x86/kernel/traps.c1
-rw-r--r--arch/x86/kernel/vmlinux.lds.S13
-rw-r--r--arch/x86/kvm/cpuid.c2
-rw-r--r--arch/x86/kvm/debugfs.c1
-rw-r--r--arch/x86/kvm/hyperv.c2
-rw-r--r--arch/x86/kvm/mmu/mmu.c4
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.c2
-rw-r--r--arch/x86/kvm/svm/sev.c19
-rw-r--r--arch/x86/kvm/svm/svm.c11
-rw-r--r--arch/x86/kvm/svm/svm.h2
-rw-r--r--arch/x86/kvm/vmx/nested.c2
-rw-r--r--arch/x86/kvm/vmx/vmx.c2
-rw-r--r--arch/x86/kvm/x86.c15
-rw-r--r--arch/x86/kvm/xen.c2
-rw-r--r--arch/x86/lib/cache-smp.c1
-rw-r--r--arch/x86/lib/csum-partial_64.c105
-rw-r--r--arch/x86/lib/delay.c2
-rw-r--r--arch/x86/lib/misc.c2
-rw-r--r--arch/x86/mm/fault.c2
-rw-r--r--arch/x86/mm/init_64.c6
-rw-r--r--arch/x86/mm/mem_encrypt_amd.c11
-rw-r--r--arch/x86/mm/numa.c34
-rw-r--r--arch/x86/mm/pat/memtype.c2
-rw-r--r--arch/x86/mm/pat/set_memory.c4
-rw-r--r--arch/x86/mm/pti.c2
-rw-r--r--arch/x86/mm/tlb.c2
-rw-r--r--arch/x86/net/bpf_jit_comp.c359
-rw-r--r--arch/x86/net/bpf_jit_comp32.c2
-rw-r--r--arch/x86/pci/sta2x11-fixup.c1
-rw-r--r--arch/x86/platform/intel-quark/imr_selftest.c2
-rw-r--r--arch/x86/platform/pvh/head.S9
-rw-r--r--arch/x86/platform/uv/uv_irq.c2
-rw-r--r--arch/x86/platform/uv/uv_nmi.c2
-rw-r--r--arch/x86/platform/uv/uv_time.c2
-rw-r--r--arch/x86/realmode/init.c2
-rw-r--r--arch/x86/realmode/rm/reboot.S3
-rw-r--r--arch/x86/tools/Makefile2
-rw-r--r--arch/x86/tools/chkobjdump.awk34
-rw-r--r--arch/x86/tools/objdump_reformat.awk6
-rw-r--r--arch/x86/tools/relocs.c2
-rw-r--r--arch/x86/um/asm/elf.h4
-rw-r--r--arch/x86/um/asm/processor_64.h3
-rw-r--r--arch/x86/um/os-Linux/Makefile1
-rw-r--r--arch/x86/um/os-Linux/prctl.c12
-rw-r--r--arch/x86/um/ptrace_32.c24
-rw-r--r--arch/x86/um/ptrace_64.c26
-rw-r--r--arch/x86/um/shared/sysdep/ptrace_32.h4
-rw-r--r--arch/x86/um/shared/sysdep/ptrace_user.h12
-rw-r--r--arch/x86/um/shared/sysdep/stub_32.h39
-rw-r--r--arch/x86/um/shared/sysdep/stub_64.h17
-rw-r--r--arch/x86/um/syscalls_64.c62
-rw-r--r--arch/x86/um/sysrq_64.c1
-rw-r--r--arch/x86/um/tls_64.c2
-rw-r--r--arch/x86/xen/Kconfig1
-rw-r--r--arch/x86/xen/enlighten.c6
-rw-r--r--arch/x86/xen/enlighten_pv.c2
-rw-r--r--arch/x86/xen/irq.c2
-rw-r--r--arch/x86/xen/mmu_pv.c2
-rw-r--r--arch/x86/xen/xen-asm.S2
-rw-r--r--arch/x86/xen/xen-ops.h2
206 files changed, 2452 insertions, 2117 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 3762f41bb092..53f2e7797b1d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -88,6 +88,7 @@ config X86
select ARCH_HAS_PMEM_API if X86_64
select ARCH_HAS_PTE_DEVMAP if X86_64
select ARCH_HAS_PTE_SPECIAL
+ select ARCH_HAS_HW_PTE_YOUNG
select ARCH_HAS_NONLEAF_PMD_YOUNG if PGTABLE_LEVELS > 2
select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64
select ARCH_HAS_COPY_MC if X86_64
@@ -169,7 +170,7 @@ config X86
select HAS_IOPORT
select HAVE_ACPI_APEI if ACPI
select HAVE_ACPI_APEI_NMI if ACPI
- select HAVE_ALIGNED_STRUCT_PAGE if SLUB
+ select HAVE_ALIGNED_STRUCT_PAGE
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE
select HAVE_ARCH_HUGE_VMALLOC if X86_64
@@ -384,10 +385,6 @@ config HAVE_INTEL_TXT
def_bool y
depends on INTEL_IOMMU && ACPI
-config X86_32_SMP
- def_bool y
- depends on X86_32 && SMP
-
config X86_64_SMP
def_bool y
depends on X86_64 && SMP
@@ -1415,7 +1412,7 @@ config HIGHMEM4G
config HIGHMEM64G
bool "64GB"
- depends on !M486SX && !M486 && !M586 && !M586TSC && !M586MMX && !MGEODE_LX && !MGEODEGX1 && !MCYRIXIII && !MELAN && !MWINCHIPC6 && !MWINCHIP3D && !MK6
+ depends on X86_HAVE_PAE
select X86_PAE
help
Select this if you have a 32-bit processor and more than 4
@@ -1472,7 +1469,7 @@ config HIGHMEM
config X86_PAE
bool "PAE (Physical Address Extension) Support"
- depends on X86_32 && !HIGHMEM4G
+ depends on X86_32 && X86_HAVE_PAE
select PHYS_ADDR_T_64BIT
select SWIOTLB
help
@@ -2072,7 +2069,7 @@ config ARCH_SUPPORTS_KEXEC
def_bool y
config ARCH_SUPPORTS_KEXEC_FILE
- def_bool X86_64 && CRYPTO && CRYPTO_SHA256
+ def_bool X86_64
config ARCH_SELECTS_KEXEC_FILE
def_bool y
@@ -2080,7 +2077,7 @@ config ARCH_SELECTS_KEXEC_FILE
select HAVE_IMA_KEXEC if IMA
config ARCH_SUPPORTS_KEXEC_PURGATORY
- def_bool KEXEC_FILE
+ def_bool y
config ARCH_SUPPORTS_KEXEC_SIG
def_bool y
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 00468adf180f..b9224cf2ee4d 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -362,9 +362,13 @@ config X86_TSC
def_bool y
depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) || X86_64
+config X86_HAVE_PAE
+ def_bool y
+ depends on MCRUSOE || MEFFICEON || MCYRIXIII || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC7 || MCORE2 || MATOM || X86_64
+
config X86_CMPXCHG64
def_bool y
- depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586TSC || M586MMX || MATOM || MGEODE_LX || MGEODEGX1 || MK6 || MK7 || MK8
+ depends on X86_HAVE_PAE || M586TSC || M586MMX || MK6 || MK7
# this should be set for all -march=.. options where the compiler
# generates cmov.
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 71fc531b95b4..f19c038409aa 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -53,7 +53,7 @@ KBUILD_CFLAGS += -D__DISABLE_EXPORTS
KBUILD_CFLAGS += $(call cc-option,-Wa$(comma)-mrelax-relocations=no)
KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h
-# sev.c indirectly inludes inat-table.h which is generated during
+# sev.c indirectly includes inat-table.h which is generated during
# compilation and stored in $(objtree). Add the directory to the includes so
# that the compiler finds it even with out-of-tree builds (make O=/some/path).
CFLAGS_sev.o += -I$(objtree)/arch/x86/lib/
diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c
index 55c98fdd67d2..18d15d1ce87d 100644
--- a/arch/x86/boot/compressed/acpi.c
+++ b/arch/x86/boot/compressed/acpi.c
@@ -178,7 +178,7 @@ static unsigned long get_cmdline_acpi_rsdp(void)
{
unsigned long addr = 0;
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
char val[MAX_ADDR_LEN] = { };
int ret;
diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c
index 473ba59b82a8..d040080d7edb 100644
--- a/arch/x86/boot/compressed/ident_map_64.c
+++ b/arch/x86/boot/compressed/ident_map_64.c
@@ -386,3 +386,8 @@ void do_boot_page_fault(struct pt_regs *regs, unsigned long error_code)
*/
kernel_add_identity_map(address, end);
}
+
+void do_boot_nmi_trap(struct pt_regs *regs, unsigned long error_code)
+{
+ /* Empty handler to ignore NMI during early boot */
+}
diff --git a/arch/x86/boot/compressed/idt_64.c b/arch/x86/boot/compressed/idt_64.c
index 3cdf94b41456..d100284bbef4 100644
--- a/arch/x86/boot/compressed/idt_64.c
+++ b/arch/x86/boot/compressed/idt_64.c
@@ -61,6 +61,7 @@ void load_stage2_idt(void)
boot_idt_desc.address = (unsigned long)boot_idt;
set_idt_entry(X86_TRAP_PF, boot_page_fault);
+ set_idt_entry(X86_TRAP_NMI, boot_nmi_trap);
#ifdef CONFIG_AMD_MEM_ENCRYPT
/*
diff --git a/arch/x86/boot/compressed/idt_handlers_64.S b/arch/x86/boot/compressed/idt_handlers_64.S
index 22890e199f5b..4d03c8562f63 100644
--- a/arch/x86/boot/compressed/idt_handlers_64.S
+++ b/arch/x86/boot/compressed/idt_handlers_64.S
@@ -70,6 +70,7 @@ SYM_FUNC_END(\name)
.code64
EXCEPTION_HANDLER boot_page_fault do_boot_page_fault error_code=1
+EXCEPTION_HANDLER boot_nmi_trap do_boot_nmi_trap error_code=0
#ifdef CONFIG_AMD_MEM_ENCRYPT
EXCEPTION_HANDLER boot_stage1_vc do_vc_no_ghcb error_code=1
diff --git a/arch/x86/boot/compressed/mem.c b/arch/x86/boot/compressed/mem.c
index b3c3a4be7471..dbba332e4a12 100644
--- a/arch/x86/boot/compressed/mem.c
+++ b/arch/x86/boot/compressed/mem.c
@@ -8,7 +8,7 @@
/*
* accept_memory() and process_unaccepted_memory() called from EFI stub which
- * runs before decompresser and its early_tdx_detect().
+ * runs before decompressor and its early_tdx_detect().
*
* Enumerate TDX directly from the early users.
*/
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index c0d502bd8716..bc2f0f17fb90 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -196,6 +196,7 @@ static inline void cleanup_exception_handling(void) { }
/* IDT Entry Points */
void boot_page_fault(void);
+void boot_nmi_trap(void);
void boot_stage1_vc(void);
void boot_stage2_vc(void);
diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c
index 40031a614712..5941f930f6c5 100644
--- a/arch/x86/boot/pm.c
+++ b/arch/x86/boot/pm.c
@@ -11,6 +11,7 @@
*/
#include "boot.h"
+#include <asm/desc_defs.h>
#include <asm/segment.h>
/*
@@ -67,13 +68,13 @@ static void setup_gdt(void)
being 8-byte unaligned. Intel recommends 16 byte alignment. */
static const u64 boot_gdt[] __attribute__((aligned(16))) = {
/* CS: code, read/execute, 4 GB, base 0 */
- [GDT_ENTRY_BOOT_CS] = GDT_ENTRY(0xc09b, 0, 0xfffff),
+ [GDT_ENTRY_BOOT_CS] = GDT_ENTRY(DESC_CODE32, 0, 0xfffff),
/* DS: data, read/write, 4 GB, base 0 */
- [GDT_ENTRY_BOOT_DS] = GDT_ENTRY(0xc093, 0, 0xfffff),
+ [GDT_ENTRY_BOOT_DS] = GDT_ENTRY(DESC_DATA32, 0, 0xfffff),
/* TSS: 32-bit tss, 104 bytes, base 4096 */
/* We only have a TSS here to keep Intel VT happy;
we don't actually use it for anything. */
- [GDT_ENTRY_BOOT_TSS] = GDT_ENTRY(0x0089, 4096, 103),
+ [GDT_ENTRY_BOOT_TSS] = GDT_ENTRY(DESC_TSS32, 4096, 103),
};
/* Xen HVM incorrectly stores a pointer to the gdt_ptr, instead
of the gdt_ptr contents. Thus, make it static so it will
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index 1c8541ae3b3a..c23f3b9c84fe 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -49,7 +49,7 @@ int strcmp(const char *str1, const char *str2)
{
const unsigned char *s1 = (const unsigned char *)str1;
const unsigned char *s2 = (const unsigned char *)str2;
- int delta = 0;
+ int delta;
while (*s1 || *s2) {
delta = *s1 - *s2;
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 1b5d17a9f70d..c1cb90369915 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -10,6 +10,7 @@
#include <asm/coco.h>
#include <asm/tdx.h>
#include <asm/vmx.h>
+#include <asm/ia32.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>
#include <asm/pgtable.h>
@@ -886,7 +887,7 @@ void __init tdx_early_init(void)
* there.
*
* Intel-TDX has a secure RDMSR hypercall, but that needs to be
- * implemented seperately in the low level startup ASM code.
+ * implemented separately in the low level startup ASM code.
* Until that is in place, disable parallel bringup for TDX.
*/
x86_cpuinit.parallel_bringup = false;
diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig
index 9bbfd01cfa2f..c9e59589a1ce 100644
--- a/arch/x86/crypto/Kconfig
+++ b/arch/x86/crypto/Kconfig
@@ -189,7 +189,7 @@ config CRYPTO_SERPENT_AVX2_X86_64
Processes 16 blocks in parallel.
config CRYPTO_SM4_AESNI_AVX_X86_64
- tristate "Ciphers: SM4 with modes: ECB, CBC, CFB, CTR (AES-NI/AVX)"
+ tristate "Ciphers: SM4 with modes: ECB, CBC, CTR (AES-NI/AVX)"
depends on X86 && 64BIT
select CRYPTO_SKCIPHER
select CRYPTO_SIMD
@@ -197,7 +197,7 @@ config CRYPTO_SM4_AESNI_AVX_X86_64
select CRYPTO_SM4
help
Length-preserving ciphers: SM4 cipher algorithms
- (OSCCA GB/T 32907-2016) with ECB, CBC, CFB, and CTR modes
+ (OSCCA GB/T 32907-2016) with ECB, CBC, and CTR modes
Architecture: x86_64 using:
- AES-NI (AES New Instructions)
@@ -210,7 +210,7 @@ config CRYPTO_SM4_AESNI_AVX_X86_64
If unsure, say N.
config CRYPTO_SM4_AESNI_AVX2_X86_64
- tristate "Ciphers: SM4 with modes: ECB, CBC, CFB, CTR (AES-NI/AVX2)"
+ tristate "Ciphers: SM4 with modes: ECB, CBC, CTR (AES-NI/AVX2)"
depends on X86 && 64BIT
select CRYPTO_SKCIPHER
select CRYPTO_SIMD
@@ -219,7 +219,7 @@ config CRYPTO_SM4_AESNI_AVX2_X86_64
select CRYPTO_SM4_AESNI_AVX_X86_64
help
Length-preserving ciphers: SM4 cipher algorithms
- (OSCCA GB/T 32907-2016) with ECB, CBC, CFB, and CTR modes
+ (OSCCA GB/T 32907-2016) with ECB, CBC, and CTR modes
Architecture: x86_64 using:
- AES-NI (AES New Instructions)
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 187f913cc239..411d8c83e88a 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -666,7 +666,7 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff
.ifc \operation, dec
movdqa %xmm1, %xmm3
- pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn)
+ pxor %xmm1, %xmm9 # Ciphertext XOR E(K, Yn)
mov \PLAIN_CYPH_LEN, %r10
add %r13, %r10
diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index 74dd230973cf..8c9749ed0651 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -747,7 +747,7 @@ VARIABLE_OFFSET = 16*8
.if \ENC_DEC == DEC
vmovdqa %xmm1, %xmm3
- pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn)
+ pxor %xmm1, %xmm9 # Ciphertext XOR E(K, Yn)
mov \PLAIN_CYPH_LEN, %r10
add %r13, %r10
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index 81ce0f4db555..bbcff1fb78cb 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -184,7 +184,7 @@ SYM_FUNC_START(crc_pcl)
xor crc1,crc1
xor crc2,crc2
- # Fall thruogh into top of crc array (crc_128)
+ # Fall through into top of crc array (crc_128)
################################################################
## 3) CRC Array:
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index 959afa705e95..ab8bc54f254d 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -2,8 +2,8 @@
/*
* Cryptographic API.
*
- * Glue code for the SHA1 Secure Hash Algorithm assembler implementation using
- * Supplemental SSE3 instructions.
+ * Glue code for the SHA1 Secure Hash Algorithm assembler implementations
+ * using SSSE3, AVX, AVX2, and SHA-NI instructions.
*
* This file is based on sha1_generic.c
*
@@ -28,6 +28,9 @@
#include <asm/simd.h>
static const struct x86_cpu_id module_cpu_ids[] = {
+#ifdef CONFIG_AS_SHA1_NI
+ X86_MATCH_FEATURE(X86_FEATURE_SHA_NI, NULL),
+#endif
X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL),
X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL),
X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL),
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
index 4c0383a90e11..e04a43d9f7d5 100644
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@@ -1,8 +1,8 @@
/*
* Cryptographic API.
*
- * Glue code for the SHA256 Secure Hash Algorithm assembler
- * implementation using supplemental SSE3 / AVX / AVX2 instructions.
+ * Glue code for the SHA256 Secure Hash Algorithm assembler implementations
+ * using SSSE3, AVX, AVX2, and SHA-NI instructions.
*
* This file is based on sha256_generic.c
*
@@ -45,6 +45,9 @@ asmlinkage void sha256_transform_ssse3(struct sha256_state *state,
const u8 *data, int blocks);
static const struct x86_cpu_id module_cpu_ids[] = {
+#ifdef CONFIG_AS_SHA256_NI
+ X86_MATCH_FEATURE(X86_FEATURE_SHA_NI, NULL),
+#endif
X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL),
X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL),
X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL),
diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S
index d902b8ea0721..5bfce4b045fd 100644
--- a/arch/x86/crypto/sha512-avx-asm.S
+++ b/arch/x86/crypto/sha512-avx-asm.S
@@ -84,7 +84,7 @@ frame_size = frame_WK + WK_SIZE
# Useful QWORD "arrays" for simpler memory references
# MSG, DIGEST, K_t, W_t are arrays
-# WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even
+# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
# Input message (arg1)
#define MSG(i) 8*i(msg)
diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S
index 65be30156816..30a2c4777f9d 100644
--- a/arch/x86/crypto/sha512-ssse3-asm.S
+++ b/arch/x86/crypto/sha512-ssse3-asm.S
@@ -82,7 +82,7 @@ frame_size = frame_WK + WK_SIZE
# Useful QWORD "arrays" for simpler memory references
# MSG, DIGEST, K_t, W_t are arrays
-# WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even
+# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
# Input message (arg1)
#define MSG(i) 8*i(msg)
diff --git a/arch/x86/crypto/sm4-aesni-avx-asm_64.S b/arch/x86/crypto/sm4-aesni-avx-asm_64.S
index e2668d2fe6ce..2bf611eaa191 100644
--- a/arch/x86/crypto/sm4-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/sm4-aesni-avx-asm_64.S
@@ -534,55 +534,3 @@ SYM_TYPED_FUNC_START(sm4_aesni_avx_cbc_dec_blk8)
FRAME_END
RET;
SYM_FUNC_END(sm4_aesni_avx_cbc_dec_blk8)
-
-/*
- * void sm4_aesni_avx_cfb_dec_blk8(const u32 *rk, u8 *dst,
- * const u8 *src, u8 *iv)
- */
-SYM_TYPED_FUNC_START(sm4_aesni_avx_cfb_dec_blk8)
- /* input:
- * %rdi: round key array, CTX
- * %rsi: dst (8 blocks)
- * %rdx: src (8 blocks)
- * %rcx: iv
- */
- FRAME_BEGIN
-
- /* Load input */
- vmovdqu (%rcx), RA0;
- vmovdqu 0 * 16(%rdx), RA1;
- vmovdqu 1 * 16(%rdx), RA2;
- vmovdqu 2 * 16(%rdx), RA3;
- vmovdqu 3 * 16(%rdx), RB0;
- vmovdqu 4 * 16(%rdx), RB1;
- vmovdqu 5 * 16(%rdx), RB2;
- vmovdqu 6 * 16(%rdx), RB3;
-
- /* Update IV */
- vmovdqu 7 * 16(%rdx), RNOT;
- vmovdqu RNOT, (%rcx);
-
- call __sm4_crypt_blk8;
-
- vpxor (0 * 16)(%rdx), RA0, RA0;
- vpxor (1 * 16)(%rdx), RA1, RA1;
- vpxor (2 * 16)(%rdx), RA2, RA2;
- vpxor (3 * 16)(%rdx), RA3, RA3;
- vpxor (4 * 16)(%rdx), RB0, RB0;
- vpxor (5 * 16)(%rdx), RB1, RB1;
- vpxor (6 * 16)(%rdx), RB2, RB2;
- vpxor (7 * 16)(%rdx), RB3, RB3;
-
- vmovdqu RA0, (0 * 16)(%rsi);
- vmovdqu RA1, (1 * 16)(%rsi);
- vmovdqu RA2, (2 * 16)(%rsi);
- vmovdqu RA3, (3 * 16)(%rsi);
- vmovdqu RB0, (4 * 16)(%rsi);
- vmovdqu RB1, (5 * 16)(%rsi);
- vmovdqu RB2, (6 * 16)(%rsi);
- vmovdqu RB3, (7 * 16)(%rsi);
-
- vzeroall;
- FRAME_END
- RET;
-SYM_FUNC_END(sm4_aesni_avx_cfb_dec_blk8)
diff --git a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S
index 98ede9459287..9ff5ba075591 100644
--- a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S
+++ b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S
@@ -439,58 +439,3 @@ SYM_TYPED_FUNC_START(sm4_aesni_avx2_cbc_dec_blk16)
FRAME_END
RET;
SYM_FUNC_END(sm4_aesni_avx2_cbc_dec_blk16)
-
-/*
- * void sm4_aesni_avx2_cfb_dec_blk16(const u32 *rk, u8 *dst,
- * const u8 *src, u8 *iv)
- */
-SYM_TYPED_FUNC_START(sm4_aesni_avx2_cfb_dec_blk16)
- /* input:
- * %rdi: round key array, CTX
- * %rsi: dst (16 blocks)
- * %rdx: src (16 blocks)
- * %rcx: iv
- */
- FRAME_BEGIN
-
- vzeroupper;
-
- /* Load input */
- vmovdqu (%rcx), RNOTx;
- vinserti128 $1, (%rdx), RNOT, RA0;
- vmovdqu (0 * 32 + 16)(%rdx), RA1;
- vmovdqu (1 * 32 + 16)(%rdx), RA2;
- vmovdqu (2 * 32 + 16)(%rdx), RA3;
- vmovdqu (3 * 32 + 16)(%rdx), RB0;
- vmovdqu (4 * 32 + 16)(%rdx), RB1;
- vmovdqu (5 * 32 + 16)(%rdx), RB2;
- vmovdqu (6 * 32 + 16)(%rdx), RB3;
-
- /* Update IV */
- vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
- vmovdqu RNOTx, (%rcx);
-
- call __sm4_crypt_blk16;
-
- vpxor (0 * 32)(%rdx), RA0, RA0;
- vpxor (1 * 32)(%rdx), RA1, RA1;
- vpxor (2 * 32)(%rdx), RA2, RA2;
- vpxor (3 * 32)(%rdx), RA3, RA3;
- vpxor (4 * 32)(%rdx), RB0, RB0;
- vpxor (5 * 32)(%rdx), RB1, RB1;
- vpxor (6 * 32)(%rdx), RB2, RB2;
- vpxor (7 * 32)(%rdx), RB3, RB3;
-
- vmovdqu RA0, (0 * 32)(%rsi);
- vmovdqu RA1, (1 * 32)(%rsi);
- vmovdqu RA2, (2 * 32)(%rsi);
- vmovdqu RA3, (3 * 32)(%rsi);
- vmovdqu RB0, (4 * 32)(%rsi);
- vmovdqu RB1, (5 * 32)(%rsi);
- vmovdqu RB2, (6 * 32)(%rsi);
- vmovdqu RB3, (7 * 32)(%rsi);
-
- vzeroall;
- FRAME_END
- RET;
-SYM_FUNC_END(sm4_aesni_avx2_cfb_dec_blk16)
diff --git a/arch/x86/crypto/sm4-avx.h b/arch/x86/crypto/sm4-avx.h
index 1bceab7516aa..b5b5e67e40ed 100644
--- a/arch/x86/crypto/sm4-avx.h
+++ b/arch/x86/crypto/sm4-avx.h
@@ -14,10 +14,6 @@ int sm4_cbc_encrypt(struct skcipher_request *req);
int sm4_avx_cbc_decrypt(struct skcipher_request *req,
unsigned int bsize, sm4_crypt_func func);
-int sm4_cfb_encrypt(struct skcipher_request *req);
-int sm4_avx_cfb_decrypt(struct skcipher_request *req,
- unsigned int bsize, sm4_crypt_func func);
-
int sm4_avx_ctr_crypt(struct skcipher_request *req,
unsigned int bsize, sm4_crypt_func func);
diff --git a/arch/x86/crypto/sm4_aesni_avx2_glue.c b/arch/x86/crypto/sm4_aesni_avx2_glue.c
index 84bc718f49a3..1148fd4cd57f 100644
--- a/arch/x86/crypto/sm4_aesni_avx2_glue.c
+++ b/arch/x86/crypto/sm4_aesni_avx2_glue.c
@@ -23,8 +23,6 @@ asmlinkage void sm4_aesni_avx2_ctr_enc_blk16(const u32 *rk, u8 *dst,
const u8 *src, u8 *iv);
asmlinkage void sm4_aesni_avx2_cbc_dec_blk16(const u32 *rk, u8 *dst,
const u8 *src, u8 *iv);
-asmlinkage void sm4_aesni_avx2_cfb_dec_blk16(const u32 *rk, u8 *dst,
- const u8 *src, u8 *iv);
static int sm4_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int key_len)
@@ -41,12 +39,6 @@ static int cbc_decrypt(struct skcipher_request *req)
}
-static int cfb_decrypt(struct skcipher_request *req)
-{
- return sm4_avx_cfb_decrypt(req, SM4_CRYPT16_BLOCK_SIZE,
- sm4_aesni_avx2_cfb_dec_blk16);
-}
-
static int ctr_crypt(struct skcipher_request *req)
{
return sm4_avx_ctr_crypt(req, SM4_CRYPT16_BLOCK_SIZE,
@@ -89,24 +81,6 @@ static struct skcipher_alg sm4_aesni_avx2_skciphers[] = {
.decrypt = cbc_decrypt,
}, {
.base = {
- .cra_name = "__cfb(sm4)",
- .cra_driver_name = "__cfb-sm4-aesni-avx2",
- .cra_priority = 500,
- .cra_flags = CRYPTO_ALG_INTERNAL,
- .cra_blocksize = 1,
- .cra_ctxsize = sizeof(struct sm4_ctx),
- .cra_module = THIS_MODULE,
- },
- .min_keysize = SM4_KEY_SIZE,
- .max_keysize = SM4_KEY_SIZE,
- .ivsize = SM4_BLOCK_SIZE,
- .chunksize = SM4_BLOCK_SIZE,
- .walksize = 16 * SM4_BLOCK_SIZE,
- .setkey = sm4_skcipher_setkey,
- .encrypt = sm4_cfb_encrypt,
- .decrypt = cfb_decrypt,
- }, {
- .base = {
.cra_name = "__ctr(sm4)",
.cra_driver_name = "__ctr-sm4-aesni-avx2",
.cra_priority = 500,
diff --git a/arch/x86/crypto/sm4_aesni_avx_glue.c b/arch/x86/crypto/sm4_aesni_avx_glue.c
index 7800f77d68ad..85b4ca78b47b 100644
--- a/arch/x86/crypto/sm4_aesni_avx_glue.c
+++ b/arch/x86/crypto/sm4_aesni_avx_glue.c
@@ -27,8 +27,6 @@ asmlinkage void sm4_aesni_avx_ctr_enc_blk8(const u32 *rk, u8 *dst,
const u8 *src, u8 *iv);
asmlinkage void sm4_aesni_avx_cbc_dec_blk8(const u32 *rk, u8 *dst,
const u8 *src, u8 *iv);
-asmlinkage void sm4_aesni_avx_cfb_dec_blk8(const u32 *rk, u8 *dst,
- const u8 *src, u8 *iv);
static int sm4_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int key_len)
@@ -188,116 +186,6 @@ static int cbc_decrypt(struct skcipher_request *req)
sm4_aesni_avx_cbc_dec_blk8);
}
-int sm4_cfb_encrypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct skcipher_walk walk;
- unsigned int nbytes;
- int err;
-
- err = skcipher_walk_virt(&walk, req, false);
-
- while ((nbytes = walk.nbytes) > 0) {
- u8 keystream[SM4_BLOCK_SIZE];
- const u8 *iv = walk.iv;
- const u8 *src = walk.src.virt.addr;
- u8 *dst = walk.dst.virt.addr;
-
- while (nbytes >= SM4_BLOCK_SIZE) {
- sm4_crypt_block(ctx->rkey_enc, keystream, iv);
- crypto_xor_cpy(dst, src, keystream, SM4_BLOCK_SIZE);
- iv = dst;
- src += SM4_BLOCK_SIZE;
- dst += SM4_BLOCK_SIZE;
- nbytes -= SM4_BLOCK_SIZE;
- }
- if (iv != walk.iv)
- memcpy(walk.iv, iv, SM4_BLOCK_SIZE);
-
- /* tail */
- if (walk.nbytes == walk.total && nbytes > 0) {
- sm4_crypt_block(ctx->rkey_enc, keystream, walk.iv);
- crypto_xor_cpy(dst, src, keystream, nbytes);
- nbytes = 0;
- }
-
- err = skcipher_walk_done(&walk, nbytes);
- }
-
- return err;
-}
-EXPORT_SYMBOL_GPL(sm4_cfb_encrypt);
-
-int sm4_avx_cfb_decrypt(struct skcipher_request *req,
- unsigned int bsize, sm4_crypt_func func)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct skcipher_walk walk;
- unsigned int nbytes;
- int err;
-
- err = skcipher_walk_virt(&walk, req, false);
-
- while ((nbytes = walk.nbytes) > 0) {
- const u8 *src = walk.src.virt.addr;
- u8 *dst = walk.dst.virt.addr;
-
- kernel_fpu_begin();
-
- while (nbytes >= bsize) {
- func(ctx->rkey_enc, dst, src, walk.iv);
- dst += bsize;
- src += bsize;
- nbytes -= bsize;
- }
-
- while (nbytes >= SM4_BLOCK_SIZE) {
- u8 keystream[SM4_BLOCK_SIZE * 8];
- unsigned int nblocks = min(nbytes >> 4, 8u);
-
- memcpy(keystream, walk.iv, SM4_BLOCK_SIZE);
- if (nblocks > 1)
- memcpy(&keystream[SM4_BLOCK_SIZE], src,
- (nblocks - 1) * SM4_BLOCK_SIZE);
- memcpy(walk.iv, src + (nblocks - 1) * SM4_BLOCK_SIZE,
- SM4_BLOCK_SIZE);
-
- sm4_aesni_avx_crypt8(ctx->rkey_enc, keystream,
- keystream, nblocks);
-
- crypto_xor_cpy(dst, src, keystream,
- nblocks * SM4_BLOCK_SIZE);
- dst += nblocks * SM4_BLOCK_SIZE;
- src += nblocks * SM4_BLOCK_SIZE;
- nbytes -= nblocks * SM4_BLOCK_SIZE;
- }
-
- kernel_fpu_end();
-
- /* tail */
- if (walk.nbytes == walk.total && nbytes > 0) {
- u8 keystream[SM4_BLOCK_SIZE];
-
- sm4_crypt_block(ctx->rkey_enc, keystream, walk.iv);
- crypto_xor_cpy(dst, src, keystream, nbytes);
- nbytes = 0;
- }
-
- err = skcipher_walk_done(&walk, nbytes);
- }
-
- return err;
-}
-EXPORT_SYMBOL_GPL(sm4_avx_cfb_decrypt);
-
-static int cfb_decrypt(struct skcipher_request *req)
-{
- return sm4_avx_cfb_decrypt(req, SM4_CRYPT8_BLOCK_SIZE,
- sm4_aesni_avx_cfb_dec_blk8);
-}
-
int sm4_avx_ctr_crypt(struct skcipher_request *req,
unsigned int bsize, sm4_crypt_func func)
{
@@ -408,24 +296,6 @@ static struct skcipher_alg sm4_aesni_avx_skciphers[] = {
.decrypt = cbc_decrypt,
}, {
.base = {
- .cra_name = "__cfb(sm4)",
- .cra_driver_name = "__cfb-sm4-aesni-avx",
- .cra_priority = 400,
- .cra_flags = CRYPTO_ALG_INTERNAL,
- .cra_blocksize = 1,
- .cra_ctxsize = sizeof(struct sm4_ctx),
- .cra_module = THIS_MODULE,
- },
- .min_keysize = SM4_KEY_SIZE,
- .max_keysize = SM4_KEY_SIZE,
- .ivsize = SM4_BLOCK_SIZE,
- .chunksize = SM4_BLOCK_SIZE,
- .walksize = 8 * SM4_BLOCK_SIZE,
- .setkey = sm4_skcipher_setkey,
- .encrypt = sm4_cfb_encrypt,
- .decrypt = cfb_decrypt,
- }, {
- .base = {
.cra_name = "__ctr(sm4)",
.cra_driver_name = "__ctr-sm4-aesni-avx",
.cra_priority = 400,
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index f6907627172b..9f1d94790a54 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -175,8 +175,7 @@ For 32-bit we have the following conventions - kernel is built with
#define THIS_CPU_user_pcid_flush_mask \
PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask
-.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
- ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+.macro SWITCH_TO_USER_CR3 scratch_reg:req scratch_reg2:req
mov %cr3, \scratch_reg
ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
@@ -206,13 +205,20 @@ For 32-bit we have the following conventions - kernel is built with
/* Flip the PGD to the user version */
orq $(PTI_USER_PGTABLE_MASK), \scratch_reg
mov \scratch_reg, %cr3
+.endm
+
+.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+ SWITCH_TO_USER_CR3 \scratch_reg \scratch_reg2
.Lend_\@:
.endm
.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
+ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
pushq %rax
- SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax
+ SWITCH_TO_USER_CR3 scratch_reg=\scratch_reg scratch_reg2=%rax
popq %rax
+.Lend_\@:
.endm
.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index d813160b14d8..6356060caaf3 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -26,6 +26,7 @@
#include <xen/events.h>
#endif
+#include <asm/apic.h>
#include <asm/desc.h>
#include <asm/traps.h>
#include <asm/vdso.h>
@@ -167,7 +168,96 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr)
}
}
-/* Handles int $0x80 */
+#ifdef CONFIG_IA32_EMULATION
+static __always_inline bool int80_is_external(void)
+{
+ const unsigned int offs = (0x80 / 32) * 0x10;
+ const u32 bit = BIT(0x80 % 32);
+
+ /* The local APIC on XENPV guests is fake */
+ if (cpu_feature_enabled(X86_FEATURE_XENPV))
+ return false;
+
+ /*
+ * If vector 0x80 is set in the APIC ISR then this is an external
+ * interrupt. Either from broken hardware or injected by a VMM.
+ *
+ * Note: In guest mode this is only valid for secure guests where
+ * the secure module fully controls the vAPIC exposed to the guest.
+ */
+ return apic_read(APIC_ISR + offs) & bit;
+}
+
+/**
+ * int80_emulation - 32-bit legacy syscall entry
+ *
+ * This entry point can be used by 32-bit and 64-bit programs to perform
+ * 32-bit system calls. Instances of INT $0x80 can be found inline in
+ * various programs and libraries. It is also used by the vDSO's
+ * __kernel_vsyscall fallback for hardware that doesn't support a faster
+ * entry method. Restarted 32-bit system calls also fall back to INT
+ * $0x80 regardless of what instruction was originally used to do the
+ * system call.
+ *
+ * This is considered a slow path. It is not used by most libc
+ * implementations on modern hardware except during process startup.
+ *
+ * The arguments for the INT $0x80 based syscall are on stack in the
+ * pt_regs structure:
+ * eax: system call number
+ * ebx, ecx, edx, esi, edi, ebp: arg1 - arg 6
+ */
+DEFINE_IDTENTRY_RAW(int80_emulation)
+{
+ int nr;
+
+ /* Kernel does not use INT $0x80! */
+ if (unlikely(!user_mode(regs))) {
+ irqentry_enter(regs);
+ instrumentation_begin();
+ panic("Unexpected external interrupt 0x80\n");
+ }
+
+ /*
+ * Establish kernel context for instrumentation, including for
+ * int80_is_external() below which calls into the APIC driver.
+ * Identical for soft and external interrupts.
+ */
+ enter_from_user_mode(regs);
+
+ instrumentation_begin();
+ add_random_kstack_offset();
+
+ /* Validate that this is a soft interrupt to the extent possible */
+ if (unlikely(int80_is_external()))
+ panic("Unexpected external interrupt 0x80\n");
+
+ /*
+ * The low level idtentry code pushed -1 into regs::orig_ax
+ * and regs::ax contains the syscall number.
+ *
+ * User tracing code (ptrace or signal handlers) might assume
+ * that the regs::orig_ax contains a 32-bit number on invoking
+ * a 32-bit syscall.
+ *
+ * Establish the syscall convention by saving the 32bit truncated
+ * syscall number in regs::orig_ax and by invalidating regs::ax.
+ */
+ regs->orig_ax = regs->ax & GENMASK(31, 0);
+ regs->ax = -ENOSYS;
+
+ nr = syscall_32_enter(regs);
+
+ local_irq_enable();
+ nr = syscall_enter_from_user_mode_work(regs, nr);
+ do_syscall_32_irqs_on(regs, nr);
+
+ instrumentation_end();
+ syscall_exit_to_user_mode(regs);
+}
+#else /* CONFIG_IA32_EMULATION */
+
+/* Handles int $0x80 on a 32bit kernel */
__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
{
int nr = syscall_32_enter(regs);
@@ -186,6 +276,7 @@ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
instrumentation_end();
syscall_exit_to_user_mode(regs);
}
+#endif /* !CONFIG_IA32_EMULATION */
static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
{
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index de6469dffe3a..c40f89ab1b4c 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -559,17 +559,27 @@ __irqentry_text_end:
SYM_CODE_START_LOCAL(common_interrupt_return)
SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
IBRS_EXIT
-#ifdef CONFIG_DEBUG_ENTRY
- /* Assert that pt_regs indicates user mode. */
- testb $3, CS(%rsp)
- jnz 1f
- ud2
-1:
-#endif
#ifdef CONFIG_XEN_PV
ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
#endif
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ ALTERNATIVE "", "jmp .Lpti_restore_regs_and_return_to_usermode", X86_FEATURE_PTI
+#endif
+
+ STACKLEAK_ERASE
+ POP_REGS
+ add $8, %rsp /* orig_ax */
+ UNWIND_HINT_IRET_REGS
+
+.Lswapgs_and_iret:
+ swapgs
+ /* Assert that the IRET frame indicates user mode. */
+ testb $3, 8(%rsp)
+ jnz .Lnative_iret
+ ud2
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+.Lpti_restore_regs_and_return_to_usermode:
POP_REGS pop_rdi=0
/*
@@ -596,13 +606,14 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
*/
STACKLEAK_ERASE_NOCLOBBER
- SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
+ push %rax
+ SWITCH_TO_USER_CR3 scratch_reg=%rdi scratch_reg2=%rax
+ pop %rax
/* Restore RDI. */
popq %rdi
- swapgs
- jmp .Lnative_iret
-
+ jmp .Lswapgs_and_iret
+#endif
SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL)
#ifdef CONFIG_DEBUG_ENTRY
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 27c05d08558a..de94e2e84ecc 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -275,80 +275,3 @@ SYM_INNER_LABEL(entry_SYSRETL_compat_end, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
int3
SYM_CODE_END(entry_SYSCALL_compat)
-
-/*
- * 32-bit legacy system call entry.
- *
- * 32-bit x86 Linux system calls traditionally used the INT $0x80
- * instruction. INT $0x80 lands here.
- *
- * This entry point can be used by 32-bit and 64-bit programs to perform
- * 32-bit system calls. Instances of INT $0x80 can be found inline in
- * various programs and libraries. It is also used by the vDSO's
- * __kernel_vsyscall fallback for hardware that doesn't support a faster
- * entry method. Restarted 32-bit system calls also fall back to INT
- * $0x80 regardless of what instruction was originally used to do the
- * system call.
- *
- * This is considered a slow path. It is not used by most libc
- * implementations on modern hardware except during process startup.
- *
- * Arguments:
- * eax system call number
- * ebx arg1
- * ecx arg2
- * edx arg3
- * esi arg4
- * edi arg5
- * ebp arg6
- */
-SYM_CODE_START(entry_INT80_compat)
- UNWIND_HINT_ENTRY
- ENDBR
- /*
- * Interrupts are off on entry.
- */
- ASM_CLAC /* Do this early to minimize exposure */
- ALTERNATIVE "swapgs", "", X86_FEATURE_XENPV
-
- /*
- * User tracing code (ptrace or signal handlers) might assume that
- * the saved RAX contains a 32-bit number when we're invoking a 32-bit
- * syscall. Just in case the high bits are nonzero, zero-extend
- * the syscall number. (This could almost certainly be deleted
- * with no ill effects.)
- */
- movl %eax, %eax
-
- /* switch to thread stack expects orig_ax and rdi to be pushed */
- pushq %rax /* pt_regs->orig_ax */
-
- /* Need to switch before accessing the thread stack. */
- SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
-
- /* In the Xen PV case we already run on the thread stack. */
- ALTERNATIVE "", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV
-
- movq %rsp, %rax
- movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
-
- pushq 5*8(%rax) /* regs->ss */
- pushq 4*8(%rax) /* regs->rsp */
- pushq 3*8(%rax) /* regs->eflags */
- pushq 2*8(%rax) /* regs->cs */
- pushq 1*8(%rax) /* regs->ip */
- pushq 0*8(%rax) /* regs->orig_ax */
-.Lint80_keep_stack:
-
- PUSH_AND_CLEAR_REGS rax=$-ENOSYS
- UNWIND_HINT_REGS
-
- cld
-
- IBRS_ENTER
- UNTRAIN_RET
-
- movq %rsp, %rdi
- call do_int80_syscall_32
- jmp swapgs_restore_regs_and_return_to_usermode
-SYM_CODE_END(entry_INT80_compat)
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index c8fac5205803..5f8591ce7f25 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -461,3 +461,8 @@
454 i386 futex_wake sys_futex_wake
455 i386 futex_wait sys_futex_wait
456 i386 futex_requeue sys_futex_requeue
+457 i386 statmount sys_statmount
+458 i386 listmount sys_listmount
+459 i386 lsm_get_self_attr sys_lsm_get_self_attr
+460 i386 lsm_set_self_attr sys_lsm_set_self_attr
+461 i386 lsm_list_modules sys_lsm_list_modules
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 8cb8bf68721c..7e8d46f4147f 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -378,6 +378,11 @@
454 common futex_wake sys_futex_wake
455 common futex_wait sys_futex_wait
456 common futex_requeue sys_futex_requeue
+457 common statmount sys_statmount
+458 common listmount sys_listmount
+459 common lsm_get_self_attr sys_lsm_get_self_attr
+460 common lsm_set_self_attr sys_lsm_set_self_attr
+461 common lsm_list_modules sys_lsm_list_modules
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index 7d70935b6758..0debc194bd78 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -11,12 +11,10 @@
#include <linux/time.h>
#include <linux/kernel.h>
#include <linux/types.h>
+#include <vdso/gettime.h>
#include "../../../../lib/vdso/gettimeofday.c"
-extern int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz);
-extern __kernel_old_time_t __vdso_time(__kernel_old_time_t *t);
-
int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz)
{
return __cvdso_gettimeofday(tv, tz);
@@ -35,9 +33,6 @@ __kernel_old_time_t time(__kernel_old_time_t *t) __attribute__((weak, alias("__v
#if defined(CONFIG_X86_64) && !defined(BUILD_VDSO32_64)
/* both 64-bit and x32 use these */
-extern int __vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts);
-extern int __vdso_clock_getres(clockid_t clock, struct __kernel_timespec *res);
-
int __vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts)
{
return __cvdso_clock_gettime(clock, ts);
@@ -56,9 +51,6 @@ int clock_getres(clockid_t, struct __kernel_timespec *)
#else
/* i386 only */
-extern int __vdso_clock_gettime(clockid_t clock, struct old_timespec32 *ts);
-extern int __vdso_clock_getres(clockid_t clock, struct old_timespec32 *res);
-
int __vdso_clock_gettime(clockid_t clock, struct old_timespec32 *ts)
{
return __cvdso_clock_gettime32(clock, ts);
diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c
index ed308719236c..780acd3dff22 100644
--- a/arch/x86/events/amd/brs.c
+++ b/arch/x86/events/amd/brs.c
@@ -125,7 +125,7 @@ int amd_brs_hw_config(struct perf_event *event)
* Where X is the number of taken branches due to interrupt
* skid. Skid is large.
*
- * Where Y is the occurences of the event while BRS is
+ * Where Y is the occurrences of the event while BRS is
* capturing the lbr_nr entries.
*
* By using retired taken branches, we limit the impact on the
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index e24976593a29..81f6d8275b6b 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -940,7 +940,7 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
continue;
if (has_branch_stack(event))
- perf_sample_save_brstack(&data, event, &cpuc->lbr_stack);
+ perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);
@@ -1184,7 +1184,7 @@ static void amd_put_event_constraints_f17h(struct cpu_hw_events *cpuc,
* period of each one and given that the BRS saturates, it would not be possible
* to guarantee correlated content for all events. Therefore, in situations
* where multiple events want to use BRS, the kernel enforces mutual exclusion.
- * Exclusion is enforced by chosing only one counter for events using BRS.
+ * Exclusion is enforced by choosing only one counter for events using BRS.
* The event scheduling logic will then automatically multiplex the
* events and ensure that at most one event is actively using BRS.
*
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index 6911c5399d02..e91970b01d62 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -287,6 +287,9 @@ static int perf_ibs_init(struct perf_event *event)
if (config & ~perf_ibs->config_mask)
return -EINVAL;
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
ret = validate_group(event);
if (ret)
return ret;
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 40ad1425ffa2..09050641ce5d 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -601,7 +601,7 @@ int x86_pmu_hw_config(struct perf_event *event)
}
}
- if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK)
+ if (branch_sample_call_stack(event))
event->attach_state |= PERF_ATTACH_TASK_DATA;
/*
@@ -1702,7 +1702,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
perf_sample_data_init(&data, 0, event->hw.last_period);
if (has_branch_stack(event))
- perf_sample_save_brstack(&data, event, &cpuc->lbr_stack);
+ perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index ce1c777227b4..3804f21ab049 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2527,9 +2527,14 @@ static void intel_pmu_assign_event(struct perf_event *event, int idx)
perf_report_aux_output_id(event, idx);
}
+static __always_inline bool intel_pmu_needs_branch_stack(struct perf_event *event)
+{
+ return event->hw.flags & PERF_X86_EVENT_NEEDS_BRANCH_STACK;
+}
+
static void intel_pmu_del_event(struct perf_event *event)
{
- if (needs_branch_stack(event))
+ if (intel_pmu_needs_branch_stack(event))
intel_pmu_lbr_del(event);
if (event->attr.precise_ip)
intel_pmu_pebs_del(event);
@@ -2787,6 +2792,7 @@ static void intel_pmu_enable_fixed(struct perf_event *event)
static void intel_pmu_enable_event(struct perf_event *event)
{
+ u64 enable_mask = ARCH_PERFMON_EVENTSEL_ENABLE;
struct hw_perf_event *hwc = &event->hw;
int idx = hwc->idx;
@@ -2795,8 +2801,10 @@ static void intel_pmu_enable_event(struct perf_event *event)
switch (idx) {
case 0 ... INTEL_PMC_IDX_FIXED - 1:
+ if (branch_sample_counters(event))
+ enable_mask |= ARCH_PERFMON_EVENTSEL_BR_CNTR;
intel_set_masks(event, idx);
- __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
+ __x86_pmu_enable_event(hwc, enable_mask);
break;
case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1:
case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END:
@@ -2820,7 +2828,7 @@ static void intel_pmu_add_event(struct perf_event *event)
{
if (event->attr.precise_ip)
intel_pmu_pebs_add(event);
- if (needs_branch_stack(event))
+ if (intel_pmu_needs_branch_stack(event))
intel_pmu_lbr_add(event);
}
@@ -3047,7 +3055,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
perf_sample_data_init(&data, 0, event->hw.last_period);
if (has_branch_stack(event))
- perf_sample_save_brstack(&data, event, &cpuc->lbr_stack);
+ intel_pmu_lbr_save_brstack(&data, cpuc, event);
if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);
@@ -3612,6 +3620,13 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
if (cpuc->excl_cntrs)
return intel_get_excl_constraints(cpuc, event, idx, c2);
+ /* Not all counters support the branch counter feature. */
+ if (branch_sample_counters(event)) {
+ c2 = dyn_constraint(cpuc, c2, idx);
+ c2->idxmsk64 &= x86_pmu.lbr_counters;
+ c2->weight = hweight64(c2->idxmsk64);
+ }
+
return c2;
}
@@ -3897,7 +3912,62 @@ static int intel_pmu_hw_config(struct perf_event *event)
x86_pmu.pebs_aliases(event);
}
- if (needs_branch_stack(event)) {
+ if (needs_branch_stack(event) && is_sampling_event(event))
+ event->hw.flags |= PERF_X86_EVENT_NEEDS_BRANCH_STACK;
+
+ if (branch_sample_counters(event)) {
+ struct perf_event *leader, *sibling;
+ int num = 0;
+
+ if (!(x86_pmu.flags & PMU_FL_BR_CNTR) ||
+ (event->attr.config & ~INTEL_ARCH_EVENT_MASK))
+ return -EINVAL;
+
+ /*
+ * The branch counter logging is not supported in the call stack
+ * mode yet, since we cannot simply flush the LBR during e.g.,
+ * multiplexing. Also, there is no obvious usage with the call
+ * stack mode. Simply forbids it for now.
+ *
+ * If any events in the group enable the branch counter logging
+ * feature, the group is treated as a branch counter logging
+ * group, which requires the extra space to store the counters.
+ */
+ leader = event->group_leader;
+ if (branch_sample_call_stack(leader))
+ return -EINVAL;
+ if (branch_sample_counters(leader))
+ num++;
+ leader->hw.flags |= PERF_X86_EVENT_BRANCH_COUNTERS;
+
+ for_each_sibling_event(sibling, leader) {
+ if (branch_sample_call_stack(sibling))
+ return -EINVAL;
+ if (branch_sample_counters(sibling))
+ num++;
+ }
+
+ if (num > fls(x86_pmu.lbr_counters))
+ return -EINVAL;
+ /*
+ * Only applying the PERF_SAMPLE_BRANCH_COUNTERS doesn't
+ * require any branch stack setup.
+ * Clear the bit to avoid unnecessary branch stack setup.
+ */
+ if (0 == (event->attr.branch_sample_type &
+ ~(PERF_SAMPLE_BRANCH_PLM_ALL |
+ PERF_SAMPLE_BRANCH_COUNTERS)))
+ event->hw.flags &= ~PERF_X86_EVENT_NEEDS_BRANCH_STACK;
+
+ /*
+ * Force the leader to be a LBR event. So LBRs can be reset
+ * with the leader event. See intel_pmu_lbr_del() for details.
+ */
+ if (!intel_pmu_needs_branch_stack(leader))
+ return -EINVAL;
+ }
+
+ if (intel_pmu_needs_branch_stack(event)) {
ret = intel_pmu_setup_lbr_filter(event);
if (ret)
return ret;
@@ -4027,7 +4097,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
/*
* Currently, the only caller of this function is the atomic_switch_perf_msrs().
- * The host perf conext helps to prepare the values of the real hardware for
+ * The host perf context helps to prepare the values of the real hardware for
* a set of msrs that need to be switched atomically in a vmx transaction.
*
* For example, the pseudocode needed to add a new msr should look like:
@@ -4051,12 +4121,17 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data)
u64 pebs_mask = cpuc->pebs_enabled & x86_pmu.pebs_capable;
int global_ctrl, pebs_enable;
+ /*
+ * In addition to obeying exclude_guest/exclude_host, remove bits being
+ * used for PEBS when running a guest, because PEBS writes to virtual
+ * addresses (not physical addresses).
+ */
*nr = 0;
global_ctrl = (*nr)++;
arr[global_ctrl] = (struct perf_guest_switch_msr){
.msr = MSR_CORE_PERF_GLOBAL_CTRL,
.host = intel_ctrl & ~cpuc->intel_ctrl_guest_mask,
- .guest = intel_ctrl & (~cpuc->intel_ctrl_host_mask | ~pebs_mask),
+ .guest = intel_ctrl & ~cpuc->intel_ctrl_host_mask & ~pebs_mask,
};
if (!x86_pmu.pebs)
@@ -4375,8 +4450,13 @@ cmt_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
*/
if (event->attr.precise_ip == 3) {
/* Force instruction:ppp on PMC0, 1 and Fixed counter 0 */
- if (constraint_match(&fixed0_constraint, event->hw.config))
- return &fixed0_counter0_1_constraint;
+ if (constraint_match(&fixed0_constraint, event->hw.config)) {
+ /* The fixed counter 0 doesn't support LBR event logging. */
+ if (branch_sample_counters(event))
+ return &counter0_1_constraint;
+ else
+ return &fixed0_counter0_1_constraint;
+ }
switch (c->idxmsk64 & 0x3ull) {
case 0x1:
@@ -4555,7 +4635,7 @@ int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu)
goto err;
}
- if (x86_pmu.flags & (PMU_FL_EXCL_CNTRS | PMU_FL_TFA)) {
+ if (x86_pmu.flags & (PMU_FL_EXCL_CNTRS | PMU_FL_TFA | PMU_FL_BR_CNTR)) {
size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint);
cpuc->constraint_list = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu));
@@ -5527,11 +5607,41 @@ static ssize_t branches_show(struct device *cdev,
static DEVICE_ATTR_RO(branches);
+static ssize_t branch_counter_nr_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", fls(x86_pmu.lbr_counters));
+}
+
+static DEVICE_ATTR_RO(branch_counter_nr);
+
+static ssize_t branch_counter_width_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", LBR_INFO_BR_CNTR_BITS);
+}
+
+static DEVICE_ATTR_RO(branch_counter_width);
+
static struct attribute *lbr_attrs[] = {
&dev_attr_branches.attr,
+ &dev_attr_branch_counter_nr.attr,
+ &dev_attr_branch_counter_width.attr,
NULL
};
+static umode_t
+lbr_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ /* branches */
+ if (i == 0)
+ return x86_pmu.lbr_nr ? attr->mode : 0;
+
+ return (x86_pmu.flags & PMU_FL_BR_CNTR) ? attr->mode : 0;
+}
+
static char pmu_name_str[30];
static ssize_t pmu_name_show(struct device *cdev,
@@ -5559,6 +5669,15 @@ static struct attribute *intel_pmu_attrs[] = {
};
static umode_t
+default_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ if (attr == &dev_attr_allow_tsx_force_abort.attr)
+ return x86_pmu.flags & PMU_FL_TFA ? attr->mode : 0;
+
+ return attr->mode;
+}
+
+static umode_t
tsx_is_visible(struct kobject *kobj, struct attribute *attr, int i)
{
return boot_cpu_has(X86_FEATURE_RTM) ? attr->mode : 0;
@@ -5580,26 +5699,11 @@ mem_is_visible(struct kobject *kobj, struct attribute *attr, int i)
}
static umode_t
-lbr_is_visible(struct kobject *kobj, struct attribute *attr, int i)
-{
- return x86_pmu.lbr_nr ? attr->mode : 0;
-}
-
-static umode_t
exra_is_visible(struct kobject *kobj, struct attribute *attr, int i)
{
return x86_pmu.version >= 2 ? attr->mode : 0;
}
-static umode_t
-default_is_visible(struct kobject *kobj, struct attribute *attr, int i)
-{
- if (attr == &dev_attr_allow_tsx_force_abort.attr)
- return x86_pmu.flags & PMU_FL_TFA ? attr->mode : 0;
-
- return attr->mode;
-}
-
static struct attribute_group group_events_td = {
.name = "events",
};
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index cbeb6d2bf5b4..4b50a3a9818a 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -41,7 +41,7 @@
* MSR_CORE_C1_RES: CORE C1 Residency Counter
* perf code: 0x00
* Available model: SLM,AMT,GLM,CNL,ICX,TNT,ADL,RPL
- * MTL
+ * MTL,SRF,GRR
* Scope: Core (each processor core has a MSR)
* MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter
* perf code: 0x01
@@ -52,7 +52,8 @@
* perf code: 0x02
* Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
* SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX,
- * TGL,TNT,RKL,ADL,RPL,SPR,MTL
+ * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF,
+ * GRR
* Scope: Core
* MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter
* perf code: 0x03
@@ -75,7 +76,7 @@
* perf code: 0x02
* Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
* SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX,
- * TGL,TNT,RKL,ADL,RPL,SPR,MTL
+ * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF
* Scope: Package (physical package)
* MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter.
* perf code: 0x03
@@ -97,6 +98,10 @@
* Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL,
* TNT,RKL,ADL,RPL,MTL
* Scope: Package (physical package)
+ * MSR_MODULE_C6_RES_MS: Module C6 Residency Counter.
+ * perf code: 0x00
+ * Available model: SRF,GRR
+ * Scope: A cluster of cores shared L2 cache
*
*/
@@ -130,6 +135,7 @@ static ssize_t cstate_get_attr_cpumask(struct device *dev,
struct cstate_model {
unsigned long core_events;
unsigned long pkg_events;
+ unsigned long module_events;
unsigned long quirks;
};
@@ -189,20 +195,20 @@ static struct attribute *attrs_empty[] = {
* "events" group (with empty attrs) before updating
* it with detected events.
*/
-static struct attribute_group core_events_attr_group = {
+static struct attribute_group cstate_events_attr_group = {
.name = "events",
.attrs = attrs_empty,
};
-DEFINE_CSTATE_FORMAT_ATTR(core_event, event, "config:0-63");
-static struct attribute *core_format_attrs[] = {
- &format_attr_core_event.attr,
+DEFINE_CSTATE_FORMAT_ATTR(cstate_event, event, "config:0-63");
+static struct attribute *cstate_format_attrs[] = {
+ &format_attr_cstate_event.attr,
NULL,
};
-static struct attribute_group core_format_attr_group = {
+static struct attribute_group cstate_format_attr_group = {
.name = "format",
- .attrs = core_format_attrs,
+ .attrs = cstate_format_attrs,
};
static cpumask_t cstate_core_cpu_mask;
@@ -217,9 +223,9 @@ static struct attribute_group cpumask_attr_group = {
.attrs = cstate_cpumask_attrs,
};
-static const struct attribute_group *core_attr_groups[] = {
- &core_events_attr_group,
- &core_format_attr_group,
+static const struct attribute_group *cstate_attr_groups[] = {
+ &cstate_events_attr_group,
+ &cstate_format_attr_group,
&cpumask_attr_group,
NULL,
};
@@ -268,30 +274,30 @@ static struct perf_msr pkg_msr[] = {
[PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &group_cstate_pkg_c10, test_msr },
};
-static struct attribute_group pkg_events_attr_group = {
- .name = "events",
- .attrs = attrs_empty,
-};
+static cpumask_t cstate_pkg_cpu_mask;
-DEFINE_CSTATE_FORMAT_ATTR(pkg_event, event, "config:0-63");
-static struct attribute *pkg_format_attrs[] = {
- &format_attr_pkg_event.attr,
- NULL,
-};
-static struct attribute_group pkg_format_attr_group = {
- .name = "format",
- .attrs = pkg_format_attrs,
+/* cstate_module PMU */
+static struct pmu cstate_module_pmu;
+static bool has_cstate_module;
+
+enum perf_cstate_module_events {
+ PERF_CSTATE_MODULE_C6_RES = 0,
+
+ PERF_CSTATE_MODULE_EVENT_MAX,
};
-static cpumask_t cstate_pkg_cpu_mask;
+PMU_EVENT_ATTR_STRING(c6-residency, attr_cstate_module_c6, "event=0x00");
-static const struct attribute_group *pkg_attr_groups[] = {
- &pkg_events_attr_group,
- &pkg_format_attr_group,
- &cpumask_attr_group,
- NULL,
+static unsigned long module_msr_mask;
+
+PMU_EVENT_GROUP(events, cstate_module_c6);
+
+static struct perf_msr module_msr[] = {
+ [PERF_CSTATE_MODULE_C6_RES] = { MSR_MODULE_C6_RES_MS, &group_cstate_module_c6, test_msr },
};
+static cpumask_t cstate_module_cpu_mask;
+
static ssize_t cstate_get_attr_cpumask(struct device *dev,
struct device_attribute *attr,
char *buf)
@@ -302,6 +308,8 @@ static ssize_t cstate_get_attr_cpumask(struct device *dev,
return cpumap_print_to_pagebuf(true, buf, &cstate_core_cpu_mask);
else if (pmu == &cstate_pkg_pmu)
return cpumap_print_to_pagebuf(true, buf, &cstate_pkg_cpu_mask);
+ else if (pmu == &cstate_module_pmu)
+ return cpumap_print_to_pagebuf(true, buf, &cstate_module_cpu_mask);
else
return 0;
}
@@ -342,6 +350,15 @@ static int cstate_pmu_event_init(struct perf_event *event)
event->hw.event_base = pkg_msr[cfg].msr;
cpu = cpumask_any_and(&cstate_pkg_cpu_mask,
topology_die_cpumask(event->cpu));
+ } else if (event->pmu == &cstate_module_pmu) {
+ if (cfg >= PERF_CSTATE_MODULE_EVENT_MAX)
+ return -EINVAL;
+ cfg = array_index_nospec((unsigned long)cfg, PERF_CSTATE_MODULE_EVENT_MAX);
+ if (!(module_msr_mask & (1 << cfg)))
+ return -EINVAL;
+ event->hw.event_base = module_msr[cfg].msr;
+ cpu = cpumask_any_and(&cstate_module_cpu_mask,
+ topology_cluster_cpumask(event->cpu));
} else {
return -ENOENT;
}
@@ -429,6 +446,17 @@ static int cstate_cpu_exit(unsigned int cpu)
perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target);
}
}
+
+ if (has_cstate_module &&
+ cpumask_test_and_clear_cpu(cpu, &cstate_module_cpu_mask)) {
+
+ target = cpumask_any_but(topology_cluster_cpumask(cpu), cpu);
+ /* Migrate events if there is a valid target */
+ if (target < nr_cpu_ids) {
+ cpumask_set_cpu(target, &cstate_module_cpu_mask);
+ perf_pmu_migrate_context(&cstate_module_pmu, cpu, target);
+ }
+ }
return 0;
}
@@ -455,6 +483,15 @@ static int cstate_cpu_init(unsigned int cpu)
if (has_cstate_pkg && target >= nr_cpu_ids)
cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask);
+ /*
+ * If this is the first online thread of that cluster, set it
+ * in the cluster cpu mask as the designated reader.
+ */
+ target = cpumask_any_and(&cstate_module_cpu_mask,
+ topology_cluster_cpumask(cpu));
+ if (has_cstate_module && target >= nr_cpu_ids)
+ cpumask_set_cpu(cpu, &cstate_module_cpu_mask);
+
return 0;
}
@@ -477,8 +514,13 @@ static const struct attribute_group *pkg_attr_update[] = {
NULL,
};
+static const struct attribute_group *module_attr_update[] = {
+ &group_cstate_module_c6,
+ NULL
+};
+
static struct pmu cstate_core_pmu = {
- .attr_groups = core_attr_groups,
+ .attr_groups = cstate_attr_groups,
.attr_update = core_attr_update,
.name = "cstate_core",
.task_ctx_nr = perf_invalid_context,
@@ -493,7 +535,7 @@ static struct pmu cstate_core_pmu = {
};
static struct pmu cstate_pkg_pmu = {
- .attr_groups = pkg_attr_groups,
+ .attr_groups = cstate_attr_groups,
.attr_update = pkg_attr_update,
.name = "cstate_pkg",
.task_ctx_nr = perf_invalid_context,
@@ -507,6 +549,21 @@ static struct pmu cstate_pkg_pmu = {
.module = THIS_MODULE,
};
+static struct pmu cstate_module_pmu = {
+ .attr_groups = cstate_attr_groups,
+ .attr_update = module_attr_update,
+ .name = "cstate_module",
+ .task_ctx_nr = perf_invalid_context,
+ .event_init = cstate_pmu_event_init,
+ .add = cstate_pmu_event_add,
+ .del = cstate_pmu_event_del,
+ .start = cstate_pmu_event_start,
+ .stop = cstate_pmu_event_stop,
+ .read = cstate_pmu_event_update,
+ .capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE,
+ .module = THIS_MODULE,
+};
+
static const struct cstate_model nhm_cstates __initconst = {
.core_events = BIT(PERF_CSTATE_CORE_C3_RES) |
BIT(PERF_CSTATE_CORE_C6_RES),
@@ -621,6 +678,22 @@ static const struct cstate_model glm_cstates __initconst = {
BIT(PERF_CSTATE_PKG_C10_RES),
};
+static const struct cstate_model grr_cstates __initconst = {
+ .core_events = BIT(PERF_CSTATE_CORE_C1_RES) |
+ BIT(PERF_CSTATE_CORE_C6_RES),
+
+ .module_events = BIT(PERF_CSTATE_MODULE_C6_RES),
+};
+
+static const struct cstate_model srf_cstates __initconst = {
+ .core_events = BIT(PERF_CSTATE_CORE_C1_RES) |
+ BIT(PERF_CSTATE_CORE_C6_RES),
+
+ .pkg_events = BIT(PERF_CSTATE_PKG_C6_RES),
+
+ .module_events = BIT(PERF_CSTATE_MODULE_C6_RES),
+};
+
static const struct x86_cpu_id intel_cstates_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(NEHALEM, &nhm_cstates),
@@ -673,6 +746,8 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, &glm_cstates),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, &glm_cstates),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, &adl_cstates),
+ X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT_X, &srf_cstates),
+ X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT, &grr_cstates),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &icl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_cstates),
@@ -714,10 +789,14 @@ static int __init cstate_probe(const struct cstate_model *cm)
pkg_msr_mask = perf_msr_probe(pkg_msr, PERF_CSTATE_PKG_EVENT_MAX,
true, (void *) &cm->pkg_events);
+ module_msr_mask = perf_msr_probe(module_msr, PERF_CSTATE_MODULE_EVENT_MAX,
+ true, (void *) &cm->module_events);
+
has_cstate_core = !!core_msr_mask;
has_cstate_pkg = !!pkg_msr_mask;
+ has_cstate_module = !!module_msr_mask;
- return (has_cstate_core || has_cstate_pkg) ? 0 : -ENODEV;
+ return (has_cstate_core || has_cstate_pkg || has_cstate_module) ? 0 : -ENODEV;
}
static inline void cstate_cleanup(void)
@@ -730,6 +809,9 @@ static inline void cstate_cleanup(void)
if (has_cstate_pkg)
perf_pmu_unregister(&cstate_pkg_pmu);
+
+ if (has_cstate_module)
+ perf_pmu_unregister(&cstate_module_pmu);
}
static int __init cstate_init(void)
@@ -766,6 +848,16 @@ static int __init cstate_init(void)
return err;
}
}
+
+ if (has_cstate_module) {
+ err = perf_pmu_register(&cstate_module_pmu, cstate_module_pmu.name, -1);
+ if (err) {
+ has_cstate_module = false;
+ pr_info("Failed to register cstate cluster pmu\n");
+ cstate_cleanup();
+ return err;
+ }
+ }
return 0;
}
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index bf97ab904d40..d49d661ec0a7 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1755,7 +1755,7 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
setup_pebs_time(event, data, pebs->tsc);
if (has_branch_stack(event))
- perf_sample_save_brstack(data, event, &cpuc->lbr_stack);
+ perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL);
}
static void adaptive_pebs_save_regs(struct pt_regs *regs,
@@ -1912,7 +1912,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
if (has_branch_stack(event)) {
intel_pmu_store_pebs_lbrs(lbr);
- perf_sample_save_brstack(data, event, &cpuc->lbr_stack);
+ intel_pmu_lbr_save_brstack(data, cpuc, event);
}
}
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index c3b0d15a9841..78cd5084104e 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -676,6 +676,25 @@ void intel_pmu_lbr_del(struct perf_event *event)
WARN_ON_ONCE(cpuc->lbr_users < 0);
WARN_ON_ONCE(cpuc->lbr_pebs_users < 0);
perf_sched_cb_dec(event->pmu);
+
+ /*
+ * The logged occurrences information is only valid for the
+ * current LBR group. If another LBR group is scheduled in
+ * later, the information from the stale LBRs will be wrongly
+ * interpreted. Reset the LBRs here.
+ *
+ * Only clear once for a branch counter group with the leader
+ * event. Because
+ * - Cannot simply reset the LBRs with the !cpuc->lbr_users.
+ * Because it's possible that the last LBR user is not in a
+ * branch counter group, e.g., a branch_counters group +
+ * several normal LBR events.
+ * - The LBR reset can be done with any one of the events in a
+ * branch counter group, since they are always scheduled together.
+ * It's easy to force the leader event an LBR event.
+ */
+ if (is_branch_counters_group(event) && event == event->group_leader)
+ intel_pmu_lbr_reset();
}
static inline bool vlbr_exclude_host(void)
@@ -866,6 +885,8 @@ static __always_inline u16 get_lbr_cycles(u64 info)
return cycles;
}
+static_assert((64 - PERF_BRANCH_ENTRY_INFO_BITS_MAX) > LBR_INFO_BR_CNTR_NUM * LBR_INFO_BR_CNTR_BITS);
+
static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc,
struct lbr_entry *entries)
{
@@ -898,11 +919,67 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc,
e->abort = !!(info & LBR_INFO_ABORT);
e->cycles = get_lbr_cycles(info);
e->type = get_lbr_br_type(info);
+
+ /*
+ * Leverage the reserved field of cpuc->lbr_entries[i] to
+ * temporarily store the branch counters information.
+ * The later code will decide what content can be disclosed
+ * to the perf tool. Pleae see intel_pmu_lbr_counters_reorder().
+ */
+ e->reserved = (info >> LBR_INFO_BR_CNTR_OFFSET) & LBR_INFO_BR_CNTR_FULL_MASK;
}
cpuc->lbr_stack.nr = i;
}
+/*
+ * The enabled order may be different from the counter order.
+ * Update the lbr_counters with the enabled order.
+ */
+static void intel_pmu_lbr_counters_reorder(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ int i, j, pos = 0, order[X86_PMC_IDX_MAX];
+ struct perf_event *leader, *sibling;
+ u64 src, dst, cnt;
+
+ leader = event->group_leader;
+ if (branch_sample_counters(leader))
+ order[pos++] = leader->hw.idx;
+
+ for_each_sibling_event(sibling, leader) {
+ if (!branch_sample_counters(sibling))
+ continue;
+ order[pos++] = sibling->hw.idx;
+ }
+
+ WARN_ON_ONCE(!pos);
+
+ for (i = 0; i < cpuc->lbr_stack.nr; i++) {
+ src = cpuc->lbr_entries[i].reserved;
+ dst = 0;
+ for (j = 0; j < pos; j++) {
+ cnt = (src >> (order[j] * LBR_INFO_BR_CNTR_BITS)) & LBR_INFO_BR_CNTR_MASK;
+ dst |= cnt << j * LBR_INFO_BR_CNTR_BITS;
+ }
+ cpuc->lbr_counters[i] = dst;
+ cpuc->lbr_entries[i].reserved = 0;
+ }
+}
+
+void intel_pmu_lbr_save_brstack(struct perf_sample_data *data,
+ struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ if (is_branch_counters_group(event)) {
+ intel_pmu_lbr_counters_reorder(cpuc, event);
+ perf_sample_save_brstack(data, event, &cpuc->lbr_stack, cpuc->lbr_counters);
+ return;
+ }
+
+ perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL);
+}
+
static void intel_pmu_arch_lbr_read(struct cpu_hw_events *cpuc)
{
intel_pmu_store_lbr(cpuc, NULL);
@@ -1173,8 +1250,10 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
for (i = 0; i < cpuc->lbr_stack.nr; ) {
if (!cpuc->lbr_entries[i].from) {
j = i;
- while (++j < cpuc->lbr_stack.nr)
+ while (++j < cpuc->lbr_stack.nr) {
cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j];
+ cpuc->lbr_counters[j-1] = cpuc->lbr_counters[j];
+ }
cpuc->lbr_stack.nr--;
if (!cpuc->lbr_entries[i].from)
continue;
@@ -1525,8 +1604,12 @@ void __init intel_pmu_arch_lbr_init(void)
x86_pmu.lbr_mispred = ecx.split.lbr_mispred;
x86_pmu.lbr_timed_lbr = ecx.split.lbr_timed_lbr;
x86_pmu.lbr_br_type = ecx.split.lbr_br_type;
+ x86_pmu.lbr_counters = ecx.split.lbr_counters;
x86_pmu.lbr_nr = lbr_nr;
+ if (!!x86_pmu.lbr_counters)
+ x86_pmu.flags |= PMU_FL_BR_CNTR;
+
if (x86_pmu.lbr_mispred)
static_branch_enable(&x86_lbr_mispred);
if (x86_pmu.lbr_timed_lbr)
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 01023aa5125b..7927c0b832fa 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1814,6 +1814,14 @@ static const struct intel_uncore_init_fun spr_uncore_init __initconst = {
.uncore_units_ignore = spr_uncore_units_ignore,
};
+static const struct intel_uncore_init_fun gnr_uncore_init __initconst = {
+ .cpu_init = gnr_uncore_cpu_init,
+ .pci_init = gnr_uncore_pci_init,
+ .mmio_init = gnr_uncore_mmio_init,
+ .use_discovery = true,
+ .uncore_units_ignore = gnr_uncore_units_ignore,
+};
+
static const struct intel_uncore_init_fun generic_uncore_init __initconst = {
.cpu_init = intel_uncore_generic_uncore_cpu_init,
.pci_init = intel_uncore_generic_uncore_pci_init,
@@ -1865,8 +1873,12 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, &mtl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &spr_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &spr_uncore_init),
+ X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_X, &gnr_uncore_init),
+ X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_D, &gnr_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, &adl_uncore_init),
+ X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT_X, &gnr_uncore_init),
+ X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT, &gnr_uncore_init),
{},
};
MODULE_DEVICE_TABLE(x86cpu, intel_uncore_match);
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index c30fb5bb1222..4838502d89ae 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -72,9 +72,9 @@ struct intel_uncore_type {
unsigned single_fixed:1;
unsigned pair_ctr_ctl:1;
union {
- unsigned *msr_offsets;
- unsigned *pci_offsets;
- unsigned *mmio_offsets;
+ u64 *msr_offsets;
+ u64 *pci_offsets;
+ u64 *mmio_offsets;
};
unsigned *box_ids;
struct event_constraint unconstrainted;
@@ -593,6 +593,7 @@ extern struct list_head pci2phy_map_head;
extern struct pci_extra_dev *uncore_extra_pci_dev;
extern struct event_constraint uncore_constraint_empty;
extern int spr_uncore_units_ignore[];
+extern int gnr_uncore_units_ignore[];
/* uncore_snb.c */
int snb_uncore_pci_init(void);
@@ -634,6 +635,9 @@ void icx_uncore_mmio_init(void);
int spr_uncore_pci_init(void);
void spr_uncore_cpu_init(void);
void spr_uncore_mmio_init(void);
+int gnr_uncore_pci_init(void);
+void gnr_uncore_cpu_init(void);
+void gnr_uncore_mmio_init(void);
/* uncore_nhmex.c */
void nhmex_uncore_cpu_init(void);
diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c
index cb488e41807c..9a698a92962a 100644
--- a/arch/x86/events/intel/uncore_discovery.c
+++ b/arch/x86/events/intel/uncore_discovery.c
@@ -125,7 +125,8 @@ uncore_insert_box_info(struct uncore_unit_discovery *unit,
int die, bool parsed)
{
struct intel_uncore_discovery_type *type;
- unsigned int *box_offset, *ids;
+ unsigned int *ids;
+ u64 *box_offset;
int i;
if (!unit->ctl || !unit->ctl_offset || !unit->ctr_offset) {
@@ -153,7 +154,7 @@ uncore_insert_box_info(struct uncore_unit_discovery *unit,
if (!type)
return;
- box_offset = kcalloc(type->num_boxes + 1, sizeof(unsigned int), GFP_KERNEL);
+ box_offset = kcalloc(type->num_boxes + 1, sizeof(u64), GFP_KERNEL);
if (!box_offset)
return;
diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h
index 6ee80ad3423e..22e769a81103 100644
--- a/arch/x86/events/intel/uncore_discovery.h
+++ b/arch/x86/events/intel/uncore_discovery.h
@@ -125,7 +125,7 @@ struct intel_uncore_discovery_type {
u8 ctr_offset; /* Counter 0 offset */
u16 num_boxes; /* number of boxes for the uncore block */
unsigned int *ids; /* Box IDs */
- unsigned int *box_offset; /* Box offset */
+ u64 *box_offset; /* Box offset */
};
bool intel_uncore_has_discovery_tables(int *ignore);
diff --git a/arch/x86/events/intel/uncore_nhmex.c b/arch/x86/events/intel/uncore_nhmex.c
index 173e2674be6e..56eea2c66cfb 100644
--- a/arch/x86/events/intel/uncore_nhmex.c
+++ b/arch/x86/events/intel/uncore_nhmex.c
@@ -306,7 +306,7 @@ static const struct attribute_group nhmex_uncore_cbox_format_group = {
};
/* msr offset for each instance of cbox */
-static unsigned nhmex_cbox_msr_offsets[] = {
+static u64 nhmex_cbox_msr_offsets[] = {
0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0,
};
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 8250f0f59c2b..a96496bef678 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -1396,6 +1396,29 @@ err:
return ret;
}
+static int topology_gidnid_map(int nodeid, u32 gidnid)
+{
+ int i, die_id = -1;
+
+ /*
+ * every three bits in the Node ID mapping register maps
+ * to a particular node.
+ */
+ for (i = 0; i < 8; i++) {
+ if (nodeid == GIDNIDMAP(gidnid, i)) {
+ if (topology_max_die_per_package() > 1)
+ die_id = i;
+ else
+ die_id = topology_phys_to_logical_pkg(i);
+ if (die_id < 0)
+ die_id = -ENODEV;
+ break;
+ }
+ }
+
+ return die_id;
+}
+
/*
* build pci bus to socket mapping
*/
@@ -1435,22 +1458,7 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
break;
}
- /*
- * every three bits in the Node ID mapping register maps
- * to a particular node.
- */
- for (i = 0; i < 8; i++) {
- if (nodeid == GIDNIDMAP(config, i)) {
- if (topology_max_die_per_package() > 1)
- die_id = i;
- else
- die_id = topology_phys_to_logical_pkg(i);
- if (die_id < 0)
- die_id = -ENODEV;
- map->pbus_to_dieid[bus] = die_id;
- break;
- }
- }
+ map->pbus_to_dieid[bus] = topology_gidnid_map(nodeid, config);
raw_spin_unlock(&pci2phy_map_lock);
} else {
segment = pci_domain_nr(ubox_dev->bus);
@@ -5278,7 +5286,7 @@ void snr_uncore_mmio_init(void)
/* ICX uncore support */
-static unsigned icx_cha_msr_offsets[] = {
+static u64 icx_cha_msr_offsets[] = {
0x2a0, 0x2ae, 0x2bc, 0x2ca, 0x2d8, 0x2e6, 0x2f4, 0x302, 0x310,
0x31e, 0x32c, 0x33a, 0x348, 0x356, 0x364, 0x372, 0x380, 0x38e,
0x3aa, 0x3b8, 0x3c6, 0x3d4, 0x3e2, 0x3f0, 0x3fe, 0x40c, 0x41a,
@@ -5326,7 +5334,7 @@ static struct intel_uncore_type icx_uncore_chabox = {
.format_group = &snr_uncore_chabox_format_group,
};
-static unsigned icx_msr_offsets[] = {
+static u64 icx_msr_offsets[] = {
0x0, 0x20, 0x40, 0x90, 0xb0, 0xd0,
};
@@ -5596,7 +5604,7 @@ static int discover_upi_topology(struct intel_uncore_type *type, int ubox_did, i
struct pci_dev *ubox = NULL;
struct pci_dev *dev = NULL;
u32 nid, gid;
- int i, idx, ret = -EPERM;
+ int idx, lgc_pkg, ret = -EPERM;
struct intel_uncore_topology *upi;
unsigned int devfn;
@@ -5611,20 +5619,21 @@ static int discover_upi_topology(struct intel_uncore_type *type, int ubox_did, i
break;
}
- for (i = 0; i < 8; i++) {
- if (nid != GIDNIDMAP(gid, i))
- continue;
- for (idx = 0; idx < type->num_boxes; idx++) {
- upi = &type->topology[nid][idx];
- devfn = PCI_DEVFN(dev_link0 + idx, ICX_UPI_REGS_ADDR_FUNCTION);
- dev = pci_get_domain_bus_and_slot(pci_domain_nr(ubox->bus),
- ubox->bus->number,
- devfn);
- if (dev) {
- ret = upi_fill_topology(dev, upi, idx);
- if (ret)
- goto err;
- }
+ lgc_pkg = topology_gidnid_map(nid, gid);
+ if (lgc_pkg < 0) {
+ ret = -EPERM;
+ goto err;
+ }
+ for (idx = 0; idx < type->num_boxes; idx++) {
+ upi = &type->topology[lgc_pkg][idx];
+ devfn = PCI_DEVFN(dev_link0 + idx, ICX_UPI_REGS_ADDR_FUNCTION);
+ dev = pci_get_domain_bus_and_slot(pci_domain_nr(ubox->bus),
+ ubox->bus->number,
+ devfn);
+ if (dev) {
+ ret = upi_fill_topology(dev, upi, idx);
+ if (ret)
+ goto err;
}
}
}
@@ -6079,13 +6088,16 @@ static struct uncore_event_desc spr_uncore_imc_events[] = {
{ /* end: all zeroes */ },
};
+#define SPR_UNCORE_MMIO_COMMON_FORMAT() \
+ SPR_UNCORE_COMMON_FORMAT(), \
+ .ops = &spr_uncore_mmio_ops
+
static struct intel_uncore_type spr_uncore_imc = {
- SPR_UNCORE_COMMON_FORMAT(),
+ SPR_UNCORE_MMIO_COMMON_FORMAT(),
.name = "imc",
.fixed_ctr_bits = 48,
.fixed_ctr = SNR_IMC_MMIO_PMON_FIXED_CTR,
.fixed_ctl = SNR_IMC_MMIO_PMON_FIXED_CTL,
- .ops = &spr_uncore_mmio_ops,
.event_descs = spr_uncore_imc_events,
};
@@ -6181,7 +6193,7 @@ static struct intel_uncore_type *spr_uncores[UNCORE_SPR_NUM_UNCORE_TYPES] = {
*/
#define SPR_UNCORE_UPI_NUM_BOXES 4
-static unsigned int spr_upi_pci_offsets[SPR_UNCORE_UPI_NUM_BOXES] = {
+static u64 spr_upi_pci_offsets[SPR_UNCORE_UPI_NUM_BOXES] = {
0, 0x8000, 0x10000, 0x18000
};
@@ -6412,7 +6424,8 @@ static void uncore_type_customized_copy(struct intel_uncore_type *to_type,
static struct intel_uncore_type **
uncore_get_uncores(enum uncore_access_type type_id, int num_extra,
- struct intel_uncore_type **extra)
+ struct intel_uncore_type **extra, int max_num_types,
+ struct intel_uncore_type **uncores)
{
struct intel_uncore_type **types, **start_types;
int i;
@@ -6421,9 +6434,9 @@ uncore_get_uncores(enum uncore_access_type type_id, int num_extra,
/* Only copy the customized features */
for (; *types; types++) {
- if ((*types)->type_id >= UNCORE_SPR_NUM_UNCORE_TYPES)
+ if ((*types)->type_id >= max_num_types)
continue;
- uncore_type_customized_copy(*types, spr_uncores[(*types)->type_id]);
+ uncore_type_customized_copy(*types, uncores[(*types)->type_id]);
}
for (i = 0; i < num_extra; i++, types++)
@@ -6470,7 +6483,9 @@ void spr_uncore_cpu_init(void)
uncore_msr_uncores = uncore_get_uncores(UNCORE_ACCESS_MSR,
UNCORE_SPR_MSR_EXTRA_UNCORES,
- spr_msr_uncores);
+ spr_msr_uncores,
+ UNCORE_SPR_NUM_UNCORE_TYPES,
+ spr_uncores);
type = uncore_find_type_by_id(uncore_msr_uncores, UNCORE_SPR_CHA);
if (type) {
@@ -6552,7 +6567,9 @@ int spr_uncore_pci_init(void)
spr_update_device_location(UNCORE_SPR_M3UPI);
uncore_pci_uncores = uncore_get_uncores(UNCORE_ACCESS_PCI,
UNCORE_SPR_PCI_EXTRA_UNCORES,
- spr_pci_uncores);
+ spr_pci_uncores,
+ UNCORE_SPR_NUM_UNCORE_TYPES,
+ spr_uncores);
return 0;
}
@@ -6560,15 +6577,116 @@ void spr_uncore_mmio_init(void)
{
int ret = snbep_pci2phy_map_init(0x3250, SKX_CPUNODEID, SKX_GIDNIDMAP, true);
- if (ret)
- uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO, 0, NULL);
- else {
+ if (ret) {
+ uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO, 0, NULL,
+ UNCORE_SPR_NUM_UNCORE_TYPES,
+ spr_uncores);
+ } else {
uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO,
UNCORE_SPR_MMIO_EXTRA_UNCORES,
- spr_mmio_uncores);
+ spr_mmio_uncores,
+ UNCORE_SPR_NUM_UNCORE_TYPES,
+ spr_uncores);
spr_uncore_imc_free_running.num_boxes = uncore_type_max_boxes(uncore_mmio_uncores, UNCORE_SPR_IMC) / 2;
}
}
/* end of SPR uncore support */
+
+/* GNR uncore support */
+
+#define UNCORE_GNR_NUM_UNCORE_TYPES 23
+#define UNCORE_GNR_TYPE_15 15
+#define UNCORE_GNR_B2UPI 18
+#define UNCORE_GNR_TYPE_21 21
+#define UNCORE_GNR_TYPE_22 22
+
+int gnr_uncore_units_ignore[] = {
+ UNCORE_SPR_UPI,
+ UNCORE_GNR_TYPE_15,
+ UNCORE_GNR_B2UPI,
+ UNCORE_GNR_TYPE_21,
+ UNCORE_GNR_TYPE_22,
+ UNCORE_IGNORE_END
+};
+
+static struct intel_uncore_type gnr_uncore_ubox = {
+ .name = "ubox",
+ .attr_update = uncore_alias_groups,
+};
+
+static struct intel_uncore_type gnr_uncore_b2cmi = {
+ SPR_UNCORE_PCI_COMMON_FORMAT(),
+ .name = "b2cmi",
+};
+
+static struct intel_uncore_type gnr_uncore_b2cxl = {
+ SPR_UNCORE_MMIO_COMMON_FORMAT(),
+ .name = "b2cxl",
+};
+
+static struct intel_uncore_type gnr_uncore_mdf_sbo = {
+ .name = "mdf_sbo",
+ .attr_update = uncore_alias_groups,
+};
+
+static struct intel_uncore_type *gnr_uncores[UNCORE_GNR_NUM_UNCORE_TYPES] = {
+ &spr_uncore_chabox,
+ &spr_uncore_iio,
+ &spr_uncore_irp,
+ NULL,
+ &spr_uncore_pcu,
+ &gnr_uncore_ubox,
+ &spr_uncore_imc,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ &gnr_uncore_b2cmi,
+ &gnr_uncore_b2cxl,
+ NULL,
+ NULL,
+ &gnr_uncore_mdf_sbo,
+ NULL,
+ NULL,
+};
+
+static struct freerunning_counters gnr_iio_freerunning[] = {
+ [SPR_IIO_MSR_IOCLK] = { 0x290e, 0x01, 0x10, 1, 48 },
+ [SPR_IIO_MSR_BW_IN] = { 0x360e, 0x10, 0x80, 8, 48 },
+ [SPR_IIO_MSR_BW_OUT] = { 0x2e0e, 0x10, 0x80, 8, 48 },
+};
+
+void gnr_uncore_cpu_init(void)
+{
+ uncore_msr_uncores = uncore_get_uncores(UNCORE_ACCESS_MSR,
+ UNCORE_SPR_MSR_EXTRA_UNCORES,
+ spr_msr_uncores,
+ UNCORE_GNR_NUM_UNCORE_TYPES,
+ gnr_uncores);
+ spr_uncore_iio_free_running.num_boxes = uncore_type_max_boxes(uncore_msr_uncores, UNCORE_SPR_IIO);
+ spr_uncore_iio_free_running.freerunning = gnr_iio_freerunning;
+}
+
+int gnr_uncore_pci_init(void)
+{
+ uncore_pci_uncores = uncore_get_uncores(UNCORE_ACCESS_PCI, 0, NULL,
+ UNCORE_GNR_NUM_UNCORE_TYPES,
+ gnr_uncores);
+ return 0;
+}
+
+void gnr_uncore_mmio_init(void)
+{
+ uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO, 0, NULL,
+ UNCORE_GNR_NUM_UNCORE_TYPES,
+ gnr_uncores);
+}
+
+/* end of GNR uncore support */
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 53dd5d495ba6..fb56518356ec 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -110,6 +110,11 @@ static inline bool is_topdown_event(struct perf_event *event)
return is_metric_event(event) || is_slots_event(event);
}
+static inline bool is_branch_counters_group(struct perf_event *event)
+{
+ return event->group_leader->hw.flags & PERF_X86_EVENT_BRANCH_COUNTERS;
+}
+
struct amd_nb {
int nb_id; /* NorthBridge id */
int refcnt; /* reference count */
@@ -283,6 +288,7 @@ struct cpu_hw_events {
int lbr_pebs_users;
struct perf_branch_stack lbr_stack;
struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
+ u64 lbr_counters[MAX_LBR_ENTRIES]; /* branch stack extra */
union {
struct er_account *lbr_sel;
struct er_account *lbr_ctl;
@@ -888,6 +894,7 @@ struct x86_pmu {
unsigned int lbr_mispred:1;
unsigned int lbr_timed_lbr:1;
unsigned int lbr_br_type:1;
+ unsigned int lbr_counters:4;
void (*lbr_reset)(void);
void (*lbr_read)(struct cpu_hw_events *cpuc);
@@ -1012,6 +1019,7 @@ do { \
#define PMU_FL_INSTR_LATENCY 0x80 /* Support Instruction Latency in PEBS Memory Info Record */
#define PMU_FL_MEM_LOADS_AUX 0x100 /* Require an auxiliary event for the complete memory info */
#define PMU_FL_RETIRE_LATENCY 0x200 /* Support Retire Latency in PEBS */
+#define PMU_FL_BR_CNTR 0x400 /* Support branch counter logging */
#define EVENT_VAR(_id) event_attr_##_id
#define EVENT_PTR(_id) &event_attr_##_id.attr.attr
@@ -1552,6 +1560,10 @@ void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr);
void intel_ds_init(void);
+void intel_pmu_lbr_save_brstack(struct perf_sample_data *data,
+ struct cpu_hw_events *cpuc,
+ struct perf_event *event);
+
void intel_pmu_lbr_swap_task_ctx(struct perf_event_pmu_context *prev_epc,
struct perf_event_pmu_context *next_epc);
diff --git a/arch/x86/events/perf_event_flags.h b/arch/x86/events/perf_event_flags.h
index 1dc19b9b4426..6c977c19f2cd 100644
--- a/arch/x86/events/perf_event_flags.h
+++ b/arch/x86/events/perf_event_flags.h
@@ -20,3 +20,5 @@ PERF_ARCH(TOPDOWN, 0x04000) /* Count Topdown slots/metrics events */
PERF_ARCH(PEBS_STLAT, 0x08000) /* st+stlat data address sampling */
PERF_ARCH(AMD_BRS, 0x10000) /* AMD Branch Sampling */
PERF_ARCH(PEBS_LAT_HYBRID, 0x20000) /* ld and st lat for hybrid */
+PERF_ARCH(NEEDS_BRANCH_STACK, 0x40000) /* require branch stack setup */
+PERF_ARCH(BRANCH_COUNTERS, 0x80000) /* logs the counters in the extra space of each branch */
diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
index 97bfe5f0531f..5fc45543e955 100644
--- a/arch/x86/hyperv/hv_apic.c
+++ b/arch/x86/hyperv/hv_apic.c
@@ -209,7 +209,7 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector,
/*
* This particular version of the IPI hypercall can
- * only target upto 64 CPUs.
+ * only target up to 64 CPUs.
*/
if (vcpu >= 64)
goto do_ex_hypercall;
diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c
index 42c70d28ef27..3215a4a07408 100644
--- a/arch/x86/hyperv/irqdomain.c
+++ b/arch/x86/hyperv/irqdomain.c
@@ -212,7 +212,7 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
* This interrupt is already mapped. Let's unmap first.
*
* We don't use retarget interrupt hypercalls here because
- * Microsoft Hypervisor doens't allow root to change the vector
+ * Microsoft Hypervisor doesn't allow root to change the vector
* or specify VPs outside of the set that is initially used
* during mapping.
*/
diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
index 02e55237d919..7dcbf153ad72 100644
--- a/arch/x86/hyperv/ivm.c
+++ b/arch/x86/hyperv/ivm.c
@@ -144,7 +144,7 @@ void __noreturn hv_ghcb_terminate(unsigned int set, unsigned int reason)
/* Tell the hypervisor what went wrong. */
val |= GHCB_SEV_TERM_REASON(set, reason);
- /* Request Guest Termination from Hypvervisor */
+ /* Request Guest Termination from Hypervisor */
wr_ghcb_msr(val);
VMGEXIT();
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 65f79092c9d9..fcd20c6dc7f9 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -10,6 +10,9 @@
#define ALT_FLAG_NOT (1 << 0)
#define ALT_NOT(feature) ((ALT_FLAG_NOT << ALT_FLAGS_SHIFT) | (feature))
+#define ALT_FLAG_DIRECT_CALL (1 << 1)
+#define ALT_DIRECT_CALL(feature) ((ALT_FLAG_DIRECT_CALL << ALT_FLAGS_SHIFT) | (feature))
+#define ALT_CALL_ALWAYS ALT_DIRECT_CALL(X86_FEATURE_ALWAYS)
#ifndef __ASSEMBLY__
@@ -86,6 +89,8 @@ struct alt_instr {
u8 replacementlen; /* length of new instruction */
} __packed;
+extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
+
/*
* Debug flag that can be tested to see whether alternative
* instructions were patched in already:
@@ -101,11 +106,10 @@ extern void apply_fineibt(s32 *start_retpoline, s32 *end_retpoine,
s32 *start_cfi, s32 *end_cfi);
struct module;
-struct paravirt_patch_site;
struct callthunk_sites {
s32 *call_start, *call_end;
- struct paravirt_patch_site *pv_start, *pv_end;
+ struct alt_instr *alt_start, *alt_end;
};
#ifdef CONFIG_CALL_THUNKS
@@ -150,6 +154,8 @@ static inline int alternatives_text_reserved(void *start, void *end)
}
#endif /* CONFIG_SMP */
+#define ALT_CALL_INSTR "call BUG_func"
+
#define b_replacement(num) "664"#num
#define e_replacement(num) "665"#num
@@ -330,6 +336,22 @@ static inline int alternatives_text_reserved(void *start, void *end)
*/
#define ASM_NO_INPUT_CLOBBER(clbr...) "i" (0) : clbr
+/* Macro for creating assembler functions avoiding any C magic. */
+#define DEFINE_ASM_FUNC(func, instr, sec) \
+ asm (".pushsection " #sec ", \"ax\"\n" \
+ ".global " #func "\n\t" \
+ ".type " #func ", @function\n\t" \
+ ASM_FUNC_ALIGN "\n" \
+ #func ":\n\t" \
+ ASM_ENDBR \
+ instr "\n\t" \
+ ASM_RET \
+ ".size " #func ", . - " #func "\n\t" \
+ ".popsection")
+
+void BUG_func(void);
+void nop_func(void);
+
#else /* __ASSEMBLY__ */
#ifdef CONFIG_SMP
@@ -370,6 +392,10 @@ static inline int alternatives_text_reserved(void *start, void *end)
.byte \alt_len
.endm
+.macro ALT_CALL_INSTR
+ call BUG_func
+.endm
+
/*
* Define an alternative between two instructions. If @feature is
* present, early code in apply_alternatives() replaces @oldinstr with
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index ed0eaf65c437..5c37944c8a5e 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -104,7 +104,7 @@ static inline bool amd_gart_present(void)
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
return false;
- /* GART present only on Fam15h, upto model 0fh */
+ /* GART present only on Fam15h, up to model 0fh */
if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
(boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model < 0x10))
return true;
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index d21f48f1c242..9d159b771dc8 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -272,8 +272,6 @@ struct apic {
void (*send_IPI_all)(int vector);
void (*send_IPI_self)(int vector);
- enum apic_delivery_modes delivery_mode;
-
u32 disable_esr : 1,
dest_mode_logical : 1,
x2apic_set_max_apicid : 1,
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 4b125e5b3187..094106b6a538 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -20,6 +20,13 @@
*/
#define IO_APIC_SLOT_SIZE 1024
+#define APIC_DELIVERY_MODE_FIXED 0
+#define APIC_DELIVERY_MODE_LOWESTPRIO 1
+#define APIC_DELIVERY_MODE_SMI 2
+#define APIC_DELIVERY_MODE_NMI 4
+#define APIC_DELIVERY_MODE_INIT 5
+#define APIC_DELIVERY_MODE_EXTINT 7
+
#define APIC_ID 0x20
#define APIC_LVR 0x30
@@ -165,279 +172,10 @@
#define APIC_CPUID(apicid) ((apicid) & XAPIC_DEST_CPUS_MASK)
#define NUM_APIC_CLUSTERS ((BAD_APICID + 1) >> XAPIC_DEST_CPUS_SHIFT)
-#ifndef __ASSEMBLY__
-/*
- * the local APIC register structure, memory mapped. Not terribly well
- * tested, but we might eventually use this one in the future - the
- * problem why we cannot use it right now is the P5 APIC, it has an
- * errata which cannot take 8-bit reads and writes, only 32-bit ones ...
- */
-#define u32 unsigned int
-
-struct local_apic {
-
-/*000*/ struct { u32 __reserved[4]; } __reserved_01;
-
-/*010*/ struct { u32 __reserved[4]; } __reserved_02;
-
-/*020*/ struct { /* APIC ID Register */
- u32 __reserved_1 : 24,
- phys_apic_id : 4,
- __reserved_2 : 4;
- u32 __reserved[3];
- } id;
-
-/*030*/ const
- struct { /* APIC Version Register */
- u32 version : 8,
- __reserved_1 : 8,
- max_lvt : 8,
- __reserved_2 : 8;
- u32 __reserved[3];
- } version;
-
-/*040*/ struct { u32 __reserved[4]; } __reserved_03;
-
-/*050*/ struct { u32 __reserved[4]; } __reserved_04;
-
-/*060*/ struct { u32 __reserved[4]; } __reserved_05;
-
-/*070*/ struct { u32 __reserved[4]; } __reserved_06;
-
-/*080*/ struct { /* Task Priority Register */
- u32 priority : 8,
- __reserved_1 : 24;
- u32 __reserved_2[3];
- } tpr;
-
-/*090*/ const
- struct { /* Arbitration Priority Register */
- u32 priority : 8,
- __reserved_1 : 24;
- u32 __reserved_2[3];
- } apr;
-
-/*0A0*/ const
- struct { /* Processor Priority Register */
- u32 priority : 8,
- __reserved_1 : 24;
- u32 __reserved_2[3];
- } ppr;
-
-/*0B0*/ struct { /* End Of Interrupt Register */
- u32 eoi;
- u32 __reserved[3];
- } eoi;
-
-/*0C0*/ struct { u32 __reserved[4]; } __reserved_07;
-
-/*0D0*/ struct { /* Logical Destination Register */
- u32 __reserved_1 : 24,
- logical_dest : 8;
- u32 __reserved_2[3];
- } ldr;
-
-/*0E0*/ struct { /* Destination Format Register */
- u32 __reserved_1 : 28,
- model : 4;
- u32 __reserved_2[3];
- } dfr;
-
-/*0F0*/ struct { /* Spurious Interrupt Vector Register */
- u32 spurious_vector : 8,
- apic_enabled : 1,
- focus_cpu : 1,
- __reserved_2 : 22;
- u32 __reserved_3[3];
- } svr;
-
-/*100*/ struct { /* In Service Register */
-/*170*/ u32 bitfield;
- u32 __reserved[3];
- } isr [8];
-
-/*180*/ struct { /* Trigger Mode Register */
-/*1F0*/ u32 bitfield;
- u32 __reserved[3];
- } tmr [8];
-
-/*200*/ struct { /* Interrupt Request Register */
-/*270*/ u32 bitfield;
- u32 __reserved[3];
- } irr [8];
-
-/*280*/ union { /* Error Status Register */
- struct {
- u32 send_cs_error : 1,
- receive_cs_error : 1,
- send_accept_error : 1,
- receive_accept_error : 1,
- __reserved_1 : 1,
- send_illegal_vector : 1,
- receive_illegal_vector : 1,
- illegal_register_address : 1,
- __reserved_2 : 24;
- u32 __reserved_3[3];
- } error_bits;
- struct {
- u32 errors;
- u32 __reserved_3[3];
- } all_errors;
- } esr;
-
-/*290*/ struct { u32 __reserved[4]; } __reserved_08;
-
-/*2A0*/ struct { u32 __reserved[4]; } __reserved_09;
-
-/*2B0*/ struct { u32 __reserved[4]; } __reserved_10;
-
-/*2C0*/ struct { u32 __reserved[4]; } __reserved_11;
-
-/*2D0*/ struct { u32 __reserved[4]; } __reserved_12;
-
-/*2E0*/ struct { u32 __reserved[4]; } __reserved_13;
-
-/*2F0*/ struct { u32 __reserved[4]; } __reserved_14;
-
-/*300*/ struct { /* Interrupt Command Register 1 */
- u32 vector : 8,
- delivery_mode : 3,
- destination_mode : 1,
- delivery_status : 1,
- __reserved_1 : 1,
- level : 1,
- trigger : 1,
- __reserved_2 : 2,
- shorthand : 2,
- __reserved_3 : 12;
- u32 __reserved_4[3];
- } icr1;
-
-/*310*/ struct { /* Interrupt Command Register 2 */
- union {
- u32 __reserved_1 : 24,
- phys_dest : 4,
- __reserved_2 : 4;
- u32 __reserved_3 : 24,
- logical_dest : 8;
- } dest;
- u32 __reserved_4[3];
- } icr2;
-
-/*320*/ struct { /* LVT - Timer */
- u32 vector : 8,
- __reserved_1 : 4,
- delivery_status : 1,
- __reserved_2 : 3,
- mask : 1,
- timer_mode : 1,
- __reserved_3 : 14;
- u32 __reserved_4[3];
- } lvt_timer;
-
-/*330*/ struct { /* LVT - Thermal Sensor */
- u32 vector : 8,
- delivery_mode : 3,
- __reserved_1 : 1,
- delivery_status : 1,
- __reserved_2 : 3,
- mask : 1,
- __reserved_3 : 15;
- u32 __reserved_4[3];
- } lvt_thermal;
-
-/*340*/ struct { /* LVT - Performance Counter */
- u32 vector : 8,
- delivery_mode : 3,
- __reserved_1 : 1,
- delivery_status : 1,
- __reserved_2 : 3,
- mask : 1,
- __reserved_3 : 15;
- u32 __reserved_4[3];
- } lvt_pc;
-
-/*350*/ struct { /* LVT - LINT0 */
- u32 vector : 8,
- delivery_mode : 3,
- __reserved_1 : 1,
- delivery_status : 1,
- polarity : 1,
- remote_irr : 1,
- trigger : 1,
- mask : 1,
- __reserved_2 : 15;
- u32 __reserved_3[3];
- } lvt_lint0;
-
-/*360*/ struct { /* LVT - LINT1 */
- u32 vector : 8,
- delivery_mode : 3,
- __reserved_1 : 1,
- delivery_status : 1,
- polarity : 1,
- remote_irr : 1,
- trigger : 1,
- mask : 1,
- __reserved_2 : 15;
- u32 __reserved_3[3];
- } lvt_lint1;
-
-/*370*/ struct { /* LVT - Error */
- u32 vector : 8,
- __reserved_1 : 4,
- delivery_status : 1,
- __reserved_2 : 3,
- mask : 1,
- __reserved_3 : 15;
- u32 __reserved_4[3];
- } lvt_error;
-
-/*380*/ struct { /* Timer Initial Count Register */
- u32 initial_count;
- u32 __reserved_2[3];
- } timer_icr;
-
-/*390*/ const
- struct { /* Timer Current Count Register */
- u32 curr_count;
- u32 __reserved_2[3];
- } timer_ccr;
-
-/*3A0*/ struct { u32 __reserved[4]; } __reserved_16;
-
-/*3B0*/ struct { u32 __reserved[4]; } __reserved_17;
-
-/*3C0*/ struct { u32 __reserved[4]; } __reserved_18;
-
-/*3D0*/ struct { u32 __reserved[4]; } __reserved_19;
-
-/*3E0*/ struct { /* Timer Divide Configuration Register */
- u32 divisor : 4,
- __reserved_1 : 28;
- u32 __reserved_2[3];
- } timer_dcr;
-
-/*3F0*/ struct { u32 __reserved[4]; } __reserved_20;
-
-} __attribute__ ((packed));
-
-#undef u32
-
#ifdef CONFIG_X86_32
#define BAD_APICID 0xFFu
#else
#define BAD_APICID 0xFFFFu
#endif
-enum apic_delivery_modes {
- APIC_DELIVERY_MODE_FIXED = 0,
- APIC_DELIVERY_MODE_LOWESTPRIO = 1,
- APIC_DELIVERY_MODE_SMI = 2,
- APIC_DELIVERY_MODE_NMI = 4,
- APIC_DELIVERY_MODE_INIT = 5,
- APIC_DELIVERY_MODE_EXTINT = 7,
-};
-
-#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_APICDEF_H */
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 35389b2af88e..0216f63a366b 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -81,22 +81,4 @@ do { \
#include <asm-generic/barrier.h>
-/*
- * Make previous memory operations globally visible before
- * a WRMSR.
- *
- * MFENCE makes writes visible, but only affects load/store
- * instructions. WRMSR is unfortunately not a load/store
- * instruction and is unaffected by MFENCE. The LFENCE ensures
- * that the WRMSR is not reordered.
- *
- * Most WRMSRs are full serializing instructions themselves and
- * do not require this barrier. This is only required for the
- * IA32_TSC_DEADLINE and X2APIC MSRs.
- */
-static inline void weak_wrmsr_fence(void)
-{
- asm volatile("mfence; lfence" : : : "memory");
-}
-
#endif /* _ASM_X86_BARRIER_H */
diff --git a/arch/x86/include/asm/cfi.h b/arch/x86/include/asm/cfi.h
index 58dacd90daef..7cd752557905 100644
--- a/arch/x86/include/asm/cfi.h
+++ b/arch/x86/include/asm/cfi.h
@@ -7,16 +7,140 @@
*
* Copyright (C) 2022 Google LLC
*/
+#include <linux/bug.h>
+#include <asm/ibt.h>
-#include <linux/cfi.h>
+/*
+ * An overview of the various calling conventions...
+ *
+ * Traditional:
+ *
+ * foo:
+ * ... code here ...
+ * ret
+ *
+ * direct caller:
+ * call foo
+ *
+ * indirect caller:
+ * lea foo(%rip), %r11
+ * ...
+ * call *%r11
+ *
+ *
+ * IBT:
+ *
+ * foo:
+ * endbr64
+ * ... code here ...
+ * ret
+ *
+ * direct caller:
+ * call foo / call foo+4
+ *
+ * indirect caller:
+ * lea foo(%rip), %r11
+ * ...
+ * call *%r11
+ *
+ *
+ * kCFI:
+ *
+ * __cfi_foo:
+ * movl $0x12345678, %eax
+ * # 11 nops when CONFIG_CALL_PADDING
+ * foo:
+ * endbr64 # when IBT
+ * ... code here ...
+ * ret
+ *
+ * direct call:
+ * call foo # / call foo+4 when IBT
+ *
+ * indirect call:
+ * lea foo(%rip), %r11
+ * ...
+ * movl $(-0x12345678), %r10d
+ * addl -4(%r11), %r10d # -15 when CONFIG_CALL_PADDING
+ * jz 1f
+ * ud2
+ * 1:call *%r11
+ *
+ *
+ * FineIBT (builds as kCFI + CALL_PADDING + IBT + RETPOLINE and runtime patches into):
+ *
+ * __cfi_foo:
+ * endbr64
+ * subl 0x12345678, %r10d
+ * jz foo
+ * ud2
+ * nop
+ * foo:
+ * osp nop3 # was endbr64
+ * ... code here ...
+ * ret
+ *
+ * direct caller:
+ * call foo / call foo+4
+ *
+ * indirect caller:
+ * lea foo(%rip), %r11
+ * ...
+ * movl $0x12345678, %r10d
+ * subl $16, %r11
+ * nop4
+ * call *%r11
+ *
+ */
+enum cfi_mode {
+ CFI_DEFAULT, /* FineIBT if hardware has IBT, otherwise kCFI */
+ CFI_OFF, /* Taditional / IBT depending on .config */
+ CFI_KCFI, /* Optionally CALL_PADDING, IBT, RETPOLINE */
+ CFI_FINEIBT, /* see arch/x86/kernel/alternative.c */
+};
+
+extern enum cfi_mode cfi_mode;
+
+struct pt_regs;
#ifdef CONFIG_CFI_CLANG
enum bug_trap_type handle_cfi_failure(struct pt_regs *regs);
+#define __bpfcall
+extern u32 cfi_bpf_hash;
+extern u32 cfi_bpf_subprog_hash;
+
+static inline int cfi_get_offset(void)
+{
+ switch (cfi_mode) {
+ case CFI_FINEIBT:
+ return 16;
+ case CFI_KCFI:
+ if (IS_ENABLED(CONFIG_CALL_PADDING))
+ return 16;
+ return 5;
+ default:
+ return 0;
+ }
+}
+#define cfi_get_offset cfi_get_offset
+
+extern u32 cfi_get_func_hash(void *func);
+
#else
static inline enum bug_trap_type handle_cfi_failure(struct pt_regs *regs)
{
return BUG_TRAP_TYPE_NONE;
}
+#define cfi_bpf_hash 0U
+#define cfi_bpf_subprog_hash 0U
+static inline u32 cfi_get_func_hash(void *func)
+{
+ return 0;
+}
#endif /* CONFIG_CFI_CLANG */
+#if HAS_KERNEL_IBT == 1
+#define CFI_NOSEAL(x) asm(IBT_NOSEAL(__stringify(x)))
+#endif
+
#endif /* _ASM_X86_CFI_H */
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 4af140cf5719..632c26cdeeda 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -218,7 +218,7 @@
#define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */
#define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */
#define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */
-#define X86_FEATURE_ZEN (7*32+28) /* "" CPU based on Zen microarchitecture */
+#define X86_FEATURE_ZEN ( 7*32+28) /* "" Generic flag for all Zen and newer */
#define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */
#define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */
#define X86_FEATURE_MSR_IA32_FEAT_CTL ( 7*32+31) /* "" MSR IA32_FEAT_CTL configured */
@@ -308,10 +308,14 @@
#define X86_FEATURE_SMBA (11*32+21) /* "" Slow Memory Bandwidth Allocation */
#define X86_FEATURE_BMEC (11*32+22) /* "" Bandwidth Monitoring Event Configuration */
#define X86_FEATURE_USER_SHSTK (11*32+23) /* Shadow stack support for user mode applications */
-
#define X86_FEATURE_SRSO (11*32+24) /* "" AMD BTB untrain RETs */
#define X86_FEATURE_SRSO_ALIAS (11*32+25) /* "" AMD BTB untrain RETs through aliasing */
#define X86_FEATURE_IBPB_ON_VMEXIT (11*32+26) /* "" Issue an IBPB only on VMEXIT */
+#define X86_FEATURE_APIC_MSRS_FENCE (11*32+27) /* "" IA32_TSC_DEADLINE and X2APIC MSRs need fencing */
+#define X86_FEATURE_ZEN2 (11*32+28) /* "" CPU based on Zen2 microarchitecture */
+#define X86_FEATURE_ZEN3 (11*32+29) /* "" CPU based on Zen3 microarchitecture */
+#define X86_FEATURE_ZEN4 (11*32+30) /* "" CPU based on Zen4 microarchitecture */
+#define X86_FEATURE_ZEN1 (11*32+31) /* "" CPU based on Zen1 microarchitecture */
/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */
diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h
index a1168e7b69e5..dd4b67101bb7 100644
--- a/arch/x86/include/asm/current.h
+++ b/arch/x86/include/asm/current.h
@@ -2,6 +2,7 @@
#ifndef _ASM_X86_CURRENT_H
#define _ASM_X86_CURRENT_H
+#include <linux/build_bug.h>
#include <linux/compiler.h>
#ifndef __ASSEMBLY__
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index 66eb5e1ac4fb..0cec92c430cc 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -5,6 +5,7 @@
#include <linux/bug.h>
#include <linux/percpu.h>
#include <uapi/asm/debugreg.h>
+#include <asm/cpufeature.h>
DECLARE_PER_CPU(unsigned long, cpu_dr7);
diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h
index f7e7099af595..d440a65af8f3 100644
--- a/arch/x86/include/asm/desc_defs.h
+++ b/arch/x86/include/asm/desc_defs.h
@@ -8,6 +8,56 @@
* archs.
*/
+/*
+ * Low-level interface mapping flags/field names to bits
+ */
+
+/* Flags for _DESC_S (non-system) descriptors */
+#define _DESC_ACCESSED 0x0001
+#define _DESC_DATA_WRITABLE 0x0002
+#define _DESC_CODE_READABLE 0x0002
+#define _DESC_DATA_EXPAND_DOWN 0x0004
+#define _DESC_CODE_CONFORMING 0x0004
+#define _DESC_CODE_EXECUTABLE 0x0008
+
+/* Common flags */
+#define _DESC_S 0x0010
+#define _DESC_DPL(dpl) ((dpl) << 5)
+#define _DESC_PRESENT 0x0080
+
+#define _DESC_LONG_CODE 0x2000
+#define _DESC_DB 0x4000
+#define _DESC_GRANULARITY_4K 0x8000
+
+/* System descriptors have a numeric "type" field instead of flags */
+#define _DESC_SYSTEM(code) (code)
+
+/*
+ * High-level interface mapping intended usage to low-level combinations
+ * of flags
+ */
+
+#define _DESC_DATA (_DESC_S | _DESC_PRESENT | _DESC_ACCESSED | \
+ _DESC_DATA_WRITABLE)
+#define _DESC_CODE (_DESC_S | _DESC_PRESENT | _DESC_ACCESSED | \
+ _DESC_CODE_READABLE | _DESC_CODE_EXECUTABLE)
+
+#define DESC_DATA16 (_DESC_DATA)
+#define DESC_CODE16 (_DESC_CODE)
+
+#define DESC_DATA32 (_DESC_DATA | _DESC_GRANULARITY_4K | _DESC_DB)
+#define DESC_DATA32_BIOS (_DESC_DATA | _DESC_DB)
+
+#define DESC_CODE32 (_DESC_CODE | _DESC_GRANULARITY_4K | _DESC_DB)
+#define DESC_CODE32_BIOS (_DESC_CODE | _DESC_DB)
+
+#define DESC_TSS32 (_DESC_SYSTEM(9) | _DESC_PRESENT)
+
+#define DESC_DATA64 (_DESC_DATA | _DESC_GRANULARITY_4K | _DESC_DB)
+#define DESC_CODE64 (_DESC_CODE | _DESC_GRANULARITY_4K | _DESC_LONG_CODE)
+
+#define DESC_USER (_DESC_DPL(3))
+
#ifndef __ASSEMBLY__
#include <linux/types.h>
@@ -22,19 +72,19 @@ struct desc_struct {
#define GDT_ENTRY_INIT(flags, base, limit) \
{ \
- .limit0 = (u16) (limit), \
- .limit1 = ((limit) >> 16) & 0x0F, \
- .base0 = (u16) (base), \
- .base1 = ((base) >> 16) & 0xFF, \
- .base2 = ((base) >> 24) & 0xFF, \
- .type = (flags & 0x0f), \
- .s = (flags >> 4) & 0x01, \
- .dpl = (flags >> 5) & 0x03, \
- .p = (flags >> 7) & 0x01, \
- .avl = (flags >> 12) & 0x01, \
- .l = (flags >> 13) & 0x01, \
- .d = (flags >> 14) & 0x01, \
- .g = (flags >> 15) & 0x01, \
+ .limit0 = ((limit) >> 0) & 0xFFFF, \
+ .limit1 = ((limit) >> 16) & 0x000F, \
+ .base0 = ((base) >> 0) & 0xFFFF, \
+ .base1 = ((base) >> 16) & 0x00FF, \
+ .base2 = ((base) >> 24) & 0x00FF, \
+ .type = ((flags) >> 0) & 0x000F, \
+ .s = ((flags) >> 4) & 0x0001, \
+ .dpl = ((flags) >> 5) & 0x0003, \
+ .p = ((flags) >> 7) & 0x0001, \
+ .avl = ((flags) >> 12) & 0x0001, \
+ .l = ((flags) >> 13) & 0x0001, \
+ .d = ((flags) >> 14) & 0x0001, \
+ .g = ((flags) >> 15) & 0x0001, \
}
enum {
@@ -94,6 +144,7 @@ struct gate_struct {
typedef struct gate_struct gate_desc;
+#ifndef _SETUP
static inline unsigned long gate_offset(const gate_desc *g)
{
#ifdef CONFIG_X86_64
@@ -108,6 +159,7 @@ static inline unsigned long gate_segment(const gate_desc *g)
{
return g->segment;
}
+#endif
struct desc_ptr {
unsigned short size;
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index a0234dfd1031..1e16bd5ac781 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -150,7 +150,7 @@ do { \
((x)->e_machine == EM_X86_64)
#define compat_elf_check_arch(x) \
- ((elf_check_arch_ia32(x) && ia32_enabled()) || \
+ ((elf_check_arch_ia32(x) && ia32_enabled_verbose()) || \
(IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64))
static inline void elf_common_init(struct thread_struct *t,
diff --git a/arch/x86/include/asm/extable_fixup_types.h b/arch/x86/include/asm/extable_fixup_types.h
index 991e31cfde94..fe6312045042 100644
--- a/arch/x86/include/asm/extable_fixup_types.h
+++ b/arch/x86/include/asm/extable_fixup_types.h
@@ -4,7 +4,7 @@
/*
* Our IMM is signed, as such it must live at the top end of the word. Also,
- * since C99 hex constants are of ambigious type, force cast the mask to 'int'
+ * since C99 hex constants are of ambiguous type, force cast the mask to 'int'
* so that FIELD_GET() will DTRT and sign extend the value when it extracts it.
*/
#define EX_DATA_TYPE_MASK ((int)0x000000FF)
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index eb810074f1e7..ace9aa3b78a3 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -5,6 +5,8 @@
#ifndef _ASM_X86_FPU_H
#define _ASM_X86_FPU_H
+#include <asm/page_types.h>
+
/*
* The legacy x87 FPU state format, as saved by FSAVE and
* restored by the FRSTOR instructions:
@@ -415,7 +417,7 @@ struct fpu_state_perm {
*
* This master permission field is only to be used when
* task.fpu.fpstate based checks fail to validate whether the task
- * is allowed to expand it's xfeatures set which requires to
+ * is allowed to expand its xfeatures set which requires to
* allocate a larger sized fpstate buffer.
*
* Do not access this field directly. Use the provided helper
diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h
index 5a2ae24b1204..c7ef6ea2fa99 100644
--- a/arch/x86/include/asm/ia32.h
+++ b/arch/x86/include/asm/ia32.h
@@ -2,7 +2,6 @@
#ifndef _ASM_X86_IA32_H
#define _ASM_X86_IA32_H
-
#ifdef CONFIG_IA32_EMULATION
#include <linux/compat.h>
@@ -75,6 +74,11 @@ static inline bool ia32_enabled(void)
return __ia32_enabled;
}
+static inline void ia32_disable(void)
+{
+ __ia32_enabled = false;
+}
+
#else /* !CONFIG_IA32_EMULATION */
static inline bool ia32_enabled(void)
@@ -82,6 +86,18 @@ static inline bool ia32_enabled(void)
return IS_ENABLED(CONFIG_X86_32);
}
+static inline void ia32_disable(void) {}
+
#endif
+static inline bool ia32_enabled_verbose(void)
+{
+ bool enabled = ia32_enabled();
+
+ if (IS_ENABLED(CONFIG_IA32_EMULATION) && !enabled)
+ pr_notice_once("32-bit emulation disabled. You can reenable with ia32_emulation=on\n");
+
+ return enabled;
+}
+
#endif /* _ASM_X86_IA32_H */
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 05fd175cec7d..13639e57e1f8 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -569,6 +569,10 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_UD, exc_invalid_op);
DECLARE_IDTENTRY_RAW(X86_TRAP_BP, exc_int3);
DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_PF, exc_page_fault);
+#if defined(CONFIG_IA32_EMULATION)
+DECLARE_IDTENTRY_RAW(IA32_SYSCALL_VECTOR, int80_emulation);
+#endif
+
#ifdef CONFIG_X86_MCE
#ifdef CONFIG_X86_64
DECLARE_IDTENTRY_MCE(X86_TRAP_MC, exc_machine_check);
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 76238842406a..3814a9263d64 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -242,7 +242,7 @@ static inline void slow_down_io(void)
#endif
-#define BUILDIO(bwl, bw, type) \
+#define BUILDIO(bwl, type) \
static inline void out##bwl##_p(type value, u16 port) \
{ \
out##bwl(value, port); \
@@ -288,9 +288,9 @@ static inline void ins##bwl(u16 port, void *addr, unsigned long count) \
} \
}
-BUILDIO(b, b, u8)
-BUILDIO(w, w, u16)
-BUILDIO(l, , u32)
+BUILDIO(b, u8)
+BUILDIO(w, u16)
+BUILDIO(l, u32)
#undef BUILDIO
#define inb_p inb_p
diff --git a/arch/x86/include/asm/iosf_mbi.h b/arch/x86/include/asm/iosf_mbi.h
index a1911fea8739..af7541c11821 100644
--- a/arch/x86/include/asm/iosf_mbi.h
+++ b/arch/x86/include/asm/iosf_mbi.h
@@ -111,7 +111,7 @@ int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask);
* This function will block all kernel access to the PMIC I2C bus, so that the
* P-Unit can safely access the PMIC over the shared I2C bus.
*
- * Note on these systems the i2c-bus driver will request a sempahore from the
+ * Note on these systems the i2c-bus driver will request a semaphore from the
* P-Unit for exclusive access to the PMIC bus when i2c drivers are accessing
* it, but this does not appear to be sufficient, we still need to avoid making
* certain P-Unit requests during the access window to avoid problems.
diff --git a/arch/x86/include/asm/irq_work.h b/arch/x86/include/asm/irq_work.h
index 800ffce0db29..6b4d36c95165 100644
--- a/arch/x86/include/asm/irq_work.h
+++ b/arch/x86/include/asm/irq_work.h
@@ -9,7 +9,6 @@ static inline bool arch_irq_work_has_interrupt(void)
{
return boot_cpu_has(X86_FEATURE_APIC);
}
-extern void arch_irq_work_raise(void);
#else
static inline bool arch_irq_work_has_interrupt(void)
{
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d7036982332e..6711da000bb7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1652,7 +1652,7 @@ struct kvm_x86_ops {
/* Whether or not a virtual NMI is pending in hardware. */
bool (*is_vnmi_pending)(struct kvm_vcpu *vcpu);
/*
- * Attempt to pend a virtual NMI in harware. Returns %true on success
+ * Attempt to pend a virtual NMI in hardware. Returns %true on success
* to allow using static_call_ret0 as the fallback.
*/
bool (*set_vnmi_pending)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 6de6e1d95952..de3118305838 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -311,6 +311,7 @@ enum smca_bank_types {
SMCA_PIE, /* Power, Interrupts, etc. */
SMCA_UMC, /* Unified Memory Controller */
SMCA_UMC_V2,
+ SMCA_MA_LLC, /* Memory Attached Last Level Cache */
SMCA_PB, /* Parameter Block */
SMCA_PSP, /* Platform Security Processor */
SMCA_PSP_V2,
@@ -326,6 +327,8 @@ enum smca_bank_types {
SMCA_SHUB, /* System HUB Unit */
SMCA_SATA, /* SATA Unit */
SMCA_USB, /* USB Unit */
+ SMCA_USR_DP, /* Ultra Short Reach Data Plane Controller */
+ SMCA_USR_CP, /* Ultra Short Reach Control Plane Controller */
SMCA_GMI_PCS, /* GMI PCS Unit */
SMCA_XGMI_PHY, /* xGMI PHY Unit */
SMCA_WAFL_PHY, /* WAFL PHY Unit */
@@ -333,7 +336,6 @@ enum smca_bank_types {
N_SMCA_BANK_TYPES
};
-extern const char *smca_get_long_name(enum smca_bank_types t);
extern bool amd_mce_is_memory_error(struct mce *m);
extern int mce_threshold_create_device(unsigned int cpu);
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 1d51e1850ed0..737a52b89e64 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -237,6 +237,11 @@
#define LBR_INFO_CYCLES 0xffff
#define LBR_INFO_BR_TYPE_OFFSET 56
#define LBR_INFO_BR_TYPE (0xfull << LBR_INFO_BR_TYPE_OFFSET)
+#define LBR_INFO_BR_CNTR_OFFSET 32
+#define LBR_INFO_BR_CNTR_NUM 4
+#define LBR_INFO_BR_CNTR_BITS 2
+#define LBR_INFO_BR_CNTR_MASK GENMASK_ULL(LBR_INFO_BR_CNTR_BITS - 1, 0)
+#define LBR_INFO_BR_CNTR_FULL_MASK GENMASK_ULL(LBR_INFO_BR_CNTR_NUM * LBR_INFO_BR_CNTR_BITS - 1, 0)
#define MSR_ARCH_LBR_CTL 0x000014ce
#define ARCH_LBR_CTL_LBREN BIT(0)
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index 778df05f8539..920426d691ce 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -87,6 +87,15 @@ static __always_inline void __mwaitx(unsigned long eax, unsigned long ebx,
:: "a" (eax), "b" (ebx), "c" (ecx));
}
+/*
+ * Re-enable interrupts right upon calling mwait in such a way that
+ * no interrupt can fire _before_ the execution of mwait, ie: no
+ * instruction must be placed between "sti" and "mwait".
+ *
+ * This is necessary because if an interrupt queues a timer before
+ * executing mwait, it would otherwise go unnoticed and the next tick
+ * would not be reprogrammed accordingly before mwait ever wakes up.
+ */
static __always_inline void __sti_mwait(unsigned long eax, unsigned long ecx)
{
mds_idle_clear_cpu_buffers();
@@ -115,8 +124,15 @@ static __always_inline void mwait_idle_with_hints(unsigned long eax, unsigned lo
}
__monitor((void *)&current_thread_info()->flags, 0, 0);
- if (!need_resched())
- __mwait(eax, ecx);
+
+ if (!need_resched()) {
+ if (ecx & 1) {
+ __mwait(eax, ecx);
+ } else {
+ __sti_mwait(eax, ecx);
+ raw_local_irq_disable();
+ }
+ }
}
current_clr_polling();
}
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index f93e9b96927a..262e65539f83 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -49,7 +49,7 @@
* but there is still a cushion vs. the RSB depth. The algorithm does not
* claim to be perfect and it can be speculated around by the CPU, but it
* is considered that it obfuscates the problem enough to make exploitation
- * extremly difficult.
+ * extremely difficult.
*/
#define RET_DEPTH_SHIFT 5
#define RSB_RET_STUFF_LOOPS 16
@@ -208,7 +208,7 @@
/*
* Abuse ANNOTATE_RETPOLINE_SAFE on a NOP to indicate UNRET_END, should
- * eventually turn into it's own annotation.
+ * eventually turn into its own annotation.
*/
.macro VALIDATE_UNRET_END
#if defined(CONFIG_NOINSTR_VALIDATION) && \
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 6c8ff12140ae..d4eb9e1d61b8 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -6,6 +6,10 @@
#include <asm/paravirt_types.h>
+#ifndef __ASSEMBLY__
+struct mm_struct;
+#endif
+
#ifdef CONFIG_PARAVIRT
#include <asm/pgtable_types.h>
#include <asm/asm.h>
@@ -142,8 +146,7 @@ static inline void write_cr0(unsigned long x)
static __always_inline unsigned long read_cr2(void)
{
return PVOP_ALT_CALLEE0(unsigned long, mmu.read_cr2,
- "mov %%cr2, %%rax;",
- ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%cr2, %%rax;", ALT_NOT_XEN);
}
static __always_inline void write_cr2(unsigned long x)
@@ -154,13 +157,12 @@ static __always_inline void write_cr2(unsigned long x)
static inline unsigned long __read_cr3(void)
{
return PVOP_ALT_CALL0(unsigned long, mmu.read_cr3,
- "mov %%cr3, %%rax;", ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%cr3, %%rax;", ALT_NOT_XEN);
}
static inline void write_cr3(unsigned long x)
{
- PVOP_ALT_VCALL1(mmu.write_cr3, x,
- "mov %%rdi, %%cr3", ALT_NOT(X86_FEATURE_XENPV));
+ PVOP_ALT_VCALL1(mmu.write_cr3, x, "mov %%rdi, %%cr3", ALT_NOT_XEN);
}
static inline void __write_cr4(unsigned long x)
@@ -182,7 +184,7 @@ extern noinstr void pv_native_wbinvd(void);
static __always_inline void wbinvd(void)
{
- PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT(X86_FEATURE_XENPV));
+ PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT_XEN);
}
static inline u64 paravirt_read_msr(unsigned msr)
@@ -390,27 +392,25 @@ static inline void paravirt_release_p4d(unsigned long pfn)
static inline pte_t __pte(pteval_t val)
{
return (pte_t) { PVOP_ALT_CALLEE1(pteval_t, mmu.make_pte, val,
- "mov %%rdi, %%rax",
- ALT_NOT(X86_FEATURE_XENPV)) };
+ "mov %%rdi, %%rax", ALT_NOT_XEN) };
}
static inline pteval_t pte_val(pte_t pte)
{
return PVOP_ALT_CALLEE1(pteval_t, mmu.pte_val, pte.pte,
- "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%rdi, %%rax", ALT_NOT_XEN);
}
static inline pgd_t __pgd(pgdval_t val)
{
return (pgd_t) { PVOP_ALT_CALLEE1(pgdval_t, mmu.make_pgd, val,
- "mov %%rdi, %%rax",
- ALT_NOT(X86_FEATURE_XENPV)) };
+ "mov %%rdi, %%rax", ALT_NOT_XEN) };
}
static inline pgdval_t pgd_val(pgd_t pgd)
{
return PVOP_ALT_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd,
- "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%rdi, %%rax", ALT_NOT_XEN);
}
#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
@@ -444,14 +444,13 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
static inline pmd_t __pmd(pmdval_t val)
{
return (pmd_t) { PVOP_ALT_CALLEE1(pmdval_t, mmu.make_pmd, val,
- "mov %%rdi, %%rax",
- ALT_NOT(X86_FEATURE_XENPV)) };
+ "mov %%rdi, %%rax", ALT_NOT_XEN) };
}
static inline pmdval_t pmd_val(pmd_t pmd)
{
return PVOP_ALT_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd,
- "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%rdi, %%rax", ALT_NOT_XEN);
}
static inline void set_pud(pud_t *pudp, pud_t pud)
@@ -464,7 +463,7 @@ static inline pud_t __pud(pudval_t val)
pudval_t ret;
ret = PVOP_ALT_CALLEE1(pudval_t, mmu.make_pud, val,
- "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%rdi, %%rax", ALT_NOT_XEN);
return (pud_t) { ret };
}
@@ -472,7 +471,7 @@ static inline pud_t __pud(pudval_t val)
static inline pudval_t pud_val(pud_t pud)
{
return PVOP_ALT_CALLEE1(pudval_t, mmu.pud_val, pud.pud,
- "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%rdi, %%rax", ALT_NOT_XEN);
}
static inline void pud_clear(pud_t *pudp)
@@ -492,8 +491,7 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
static inline p4d_t __p4d(p4dval_t val)
{
p4dval_t ret = PVOP_ALT_CALLEE1(p4dval_t, mmu.make_p4d, val,
- "mov %%rdi, %%rax",
- ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%rdi, %%rax", ALT_NOT_XEN);
return (p4d_t) { ret };
}
@@ -501,7 +499,7 @@ static inline p4d_t __p4d(p4dval_t val)
static inline p4dval_t p4d_val(p4d_t p4d)
{
return PVOP_ALT_CALLEE1(p4dval_t, mmu.p4d_val, p4d.p4d,
- "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%rdi, %%rax", ALT_NOT_XEN);
}
static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd)
@@ -687,17 +685,17 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu);
static __always_inline unsigned long arch_local_save_flags(void)
{
return PVOP_ALT_CALLEE0(unsigned long, irq.save_fl, "pushf; pop %%rax;",
- ALT_NOT(X86_FEATURE_XENPV));
+ ALT_NOT_XEN);
}
static __always_inline void arch_local_irq_disable(void)
{
- PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT(X86_FEATURE_XENPV));
+ PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT_XEN);
}
static __always_inline void arch_local_irq_enable(void)
{
- PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT(X86_FEATURE_XENPV));
+ PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT_XEN);
}
static __always_inline unsigned long arch_local_irq_save(void)
@@ -726,52 +724,25 @@ static __always_inline unsigned long arch_local_irq_save(void)
#undef PVOP_VCALL4
#undef PVOP_CALL4
-#define DEFINE_PARAVIRT_ASM(func, instr, sec) \
- asm (".pushsection " #sec ", \"ax\"\n" \
- ".global " #func "\n\t" \
- ".type " #func ", @function\n\t" \
- ASM_FUNC_ALIGN "\n" \
- #func ":\n\t" \
- ASM_ENDBR \
- instr "\n\t" \
- ASM_RET \
- ".size " #func ", . - " #func "\n\t" \
- ".popsection")
-
extern void default_banner(void);
void native_pv_lock_init(void) __init;
#else /* __ASSEMBLY__ */
-#define _PVSITE(ptype, ops, word, algn) \
-771:; \
- ops; \
-772:; \
- .pushsection .parainstructions,"a"; \
- .align algn; \
- word 771b; \
- .byte ptype; \
- .byte 772b-771b; \
- _ASM_ALIGN; \
- .popsection
-
-
#ifdef CONFIG_X86_64
#ifdef CONFIG_PARAVIRT_XXL
+#ifdef CONFIG_DEBUG_ENTRY
-#define PARA_PATCH(off) ((off) / 8)
-#define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .quad, 8)
#define PARA_INDIRECT(addr) *addr(%rip)
-#ifdef CONFIG_DEBUG_ENTRY
.macro PARA_IRQ_save_fl
- PARA_SITE(PARA_PATCH(PV_IRQ_save_fl),
- ANNOTATE_RETPOLINE_SAFE;
- call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);)
+ ANNOTATE_RETPOLINE_SAFE;
+ call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);
.endm
-#define SAVE_FLAGS ALTERNATIVE "PARA_IRQ_save_fl;", "pushf; pop %rax;", \
- ALT_NOT(X86_FEATURE_XENPV)
+#define SAVE_FLAGS ALTERNATIVE_2 "PARA_IRQ_save_fl;", \
+ "ALT_CALL_INSTR;", ALT_CALL_ALWAYS, \
+ "pushf; pop %rax;", ALT_NOT_XEN
#endif
#endif /* CONFIG_PARAVIRT_XXL */
#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 772d03487520..8d4fbe1be489 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -2,18 +2,10 @@
#ifndef _ASM_X86_PARAVIRT_TYPES_H
#define _ASM_X86_PARAVIRT_TYPES_H
-#ifndef __ASSEMBLY__
-/* These all sit in the .parainstructions section to tell us what to patch. */
-struct paravirt_patch_site {
- u8 *instr; /* original instructions */
- u8 type; /* type of this instruction */
- u8 len; /* length of original instruction */
-};
-#endif
-
#ifdef CONFIG_PARAVIRT
#ifndef __ASSEMBLY__
+#include <linux/types.h>
#include <asm/desc_defs.h>
#include <asm/pgtable_types.h>
@@ -250,43 +242,11 @@ struct paravirt_patch_template {
extern struct pv_info pv_info;
extern struct paravirt_patch_template pv_ops;
-#define PARAVIRT_PATCH(x) \
- (offsetof(struct paravirt_patch_template, x) / sizeof(void *))
-
-#define paravirt_type(op) \
- [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \
- [paravirt_opptr] "m" (pv_ops.op)
-/*
- * Generate some code, and mark it as patchable by the
- * apply_paravirt() alternate instruction patcher.
- */
-#define _paravirt_alt(insn_string, type) \
- "771:\n\t" insn_string "\n" "772:\n" \
- ".pushsection .parainstructions,\"a\"\n" \
- _ASM_ALIGN "\n" \
- _ASM_PTR " 771b\n" \
- " .byte " type "\n" \
- " .byte 772b-771b\n" \
- _ASM_ALIGN "\n" \
- ".popsection\n"
-
-/* Generate patchable code, with the default asm parameters. */
-#define paravirt_alt(insn_string) \
- _paravirt_alt(insn_string, "%c[paravirt_typenum]")
-
-/* Simple instruction patching code. */
-#define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t"
-
-unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr, unsigned int len);
+#define paravirt_ptr(op) [paravirt_opptr] "m" (pv_ops.op)
int paravirt_disable_iospace(void);
-/*
- * This generates an indirect call based on the operation type number.
- * The type number, computed in PARAVIRT_PATCH, is derived from the
- * offset into the paravirt_patch_template structure, and can therefore be
- * freely converted back into a structure offset.
- */
+/* This generates an indirect call based on the operation type number. */
#define PARAVIRT_CALL \
ANNOTATE_RETPOLINE_SAFE \
"call *%[paravirt_opptr];"
@@ -319,12 +279,6 @@ int paravirt_disable_iospace(void);
* However, x86_64 also has to clobber all caller saved registers, which
* unfortunately, are quite a bit (r8 - r11)
*
- * The call instruction itself is marked by placing its start address
- * and size into the .parainstructions section, so that
- * apply_paravirt() in arch/i386/kernel/alternative.c can do the
- * appropriate patching under the control of the backend pv_init_ops
- * implementation.
- *
* Unfortunately there's no way to get gcc to generate the args setup
* for the call, and then allow the call itself to be generated by an
* inline asm. Because of this, we must do the complete arg setup and
@@ -423,14 +377,27 @@ int paravirt_disable_iospace(void);
__mask & __eax; \
})
-
+/*
+ * Use alternative patching for paravirt calls:
+ * - For replacing an indirect call with a direct one, use the "normal"
+ * ALTERNATIVE() macro with the indirect call as the initial code sequence,
+ * which will be replaced with the related direct call by using the
+ * ALT_FLAG_DIRECT_CALL special case and the "always on" feature.
+ * - In case the replacement is either a direct call or a short code sequence
+ * depending on a feature bit, the ALTERNATIVE_2() macro is being used.
+ * The indirect call is the initial code sequence again, while the special
+ * code sequence is selected with the specified feature bit. In case the
+ * feature is not active, the direct call is used as above via the
+ * ALT_FLAG_DIRECT_CALL special case and the "always on" feature.
+ */
#define ____PVOP_CALL(ret, op, call_clbr, extra_clbr, ...) \
({ \
PVOP_CALL_ARGS; \
PVOP_TEST_NULL(op); \
- asm volatile(paravirt_alt(PARAVIRT_CALL) \
+ asm volatile(ALTERNATIVE(PARAVIRT_CALL, ALT_CALL_INSTR, \
+ ALT_CALL_ALWAYS) \
: call_clbr, ASM_CALL_CONSTRAINT \
- : paravirt_type(op), \
+ : paravirt_ptr(op), \
##__VA_ARGS__ \
: "memory", "cc" extra_clbr); \
ret; \
@@ -441,10 +408,11 @@ int paravirt_disable_iospace(void);
({ \
PVOP_CALL_ARGS; \
PVOP_TEST_NULL(op); \
- asm volatile(ALTERNATIVE(paravirt_alt(PARAVIRT_CALL), \
- alt, cond) \
+ asm volatile(ALTERNATIVE_2(PARAVIRT_CALL, \
+ ALT_CALL_INSTR, ALT_CALL_ALWAYS, \
+ alt, cond) \
: call_clbr, ASM_CALL_CONSTRAINT \
- : paravirt_type(op), \
+ : paravirt_ptr(op), \
##__VA_ARGS__ \
: "memory", "cc" extra_clbr); \
ret; \
@@ -542,8 +510,6 @@ int paravirt_disable_iospace(void);
__PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
-void _paravirt_nop(void);
-void paravirt_BUG(void);
unsigned long paravirt_ret0(void);
#ifdef CONFIG_PARAVIRT_XXL
u64 _paravirt_ident_64(u64);
@@ -553,11 +519,11 @@ void pv_native_irq_enable(void);
unsigned long pv_native_read_cr2(void);
#endif
-#define paravirt_nop ((void *)_paravirt_nop)
-
-extern struct paravirt_patch_site __parainstructions[],
- __parainstructions_end[];
+#define paravirt_nop ((void *)nop_func)
#endif /* __ASSEMBLY__ */
+
+#define ALT_NOT_XEN ALT_NOT(X86_FEATURE_XENPV)
+
#endif /* CONFIG_PARAVIRT */
#endif /* _ASM_X86_PARAVIRT_TYPES_H */
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 20624b80f890..5e01883eb51e 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -24,8 +24,8 @@
#else /* ...!ASSEMBLY */
-#include <linux/kernel.h>
#include <linux/stringify.h>
+#include <asm/asm.h>
#ifdef CONFIG_SMP
#define __percpu_prefix "%%"__stringify(__percpu_seg)":"
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 2618ec7c3d1d..3736b8a46c04 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -31,6 +31,7 @@
#define ARCH_PERFMON_EVENTSEL_ENABLE (1ULL << 22)
#define ARCH_PERFMON_EVENTSEL_INV (1ULL << 23)
#define ARCH_PERFMON_EVENTSEL_CMASK 0xFF000000ULL
+#define ARCH_PERFMON_EVENTSEL_BR_CNTR (1ULL << 35)
#define INTEL_FIXED_BITS_MASK 0xFULL
#define INTEL_FIXED_BITS_STRIDE 4
@@ -223,6 +224,9 @@ union cpuid28_ecx {
unsigned int lbr_timed_lbr:1;
/* Branch Type Field Supported */
unsigned int lbr_br_type:1;
+ unsigned int reserved:13;
+ /* Branch counters (Event Logging) Supported */
+ unsigned int lbr_counters:4;
} split;
unsigned int full;
};
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 57bab91bbf50..9d077bca6a10 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -141,6 +141,7 @@ static inline int pte_young(pte_t pte)
return pte_flags(pte) & _PAGE_ACCESSED;
}
+#define pmd_dirty pmd_dirty
static inline bool pmd_dirty(pmd_t pmd)
{
return pmd_flags(pmd) & _PAGE_DIRTY_BITS;
@@ -1679,12 +1680,6 @@ static inline bool arch_has_pfn_modify_check(void)
return boot_cpu_has_bug(X86_BUG_L1TF);
}
-#define arch_has_hw_pte_young arch_has_hw_pte_young
-static inline bool arch_has_hw_pte_young(void)
-{
- return true;
-}
-
#define arch_check_zapped_pte arch_check_zapped_pte
void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte);
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index a629b1b9f65a..24af25b1551a 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -203,7 +203,7 @@ static inline void native_pgd_clear(pgd_t *pgd)
* F (2) in swp entry is used to record when a pagetable is
* writeprotected by userfaultfd WP support.
*
- * E (3) in swp entry is used to rememeber PG_anon_exclusive.
+ * E (3) in swp entry is used to remember PG_anon_exclusive.
*
* Bit 7 in swp entry should be 0 because pmd_present checks not only P,
* but also L and G.
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 4527e1430c6d..af77235fded6 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -6,7 +6,6 @@
#include <asm/percpu.h>
#include <asm/current.h>
-#include <linux/thread_info.h>
#include <linux/static_call_types.h>
/* We use the MSB mostly because its available */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index ae81a7191c1c..26620d7642a9 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -749,4 +749,22 @@ enum mds_mitigations {
extern bool gds_ucode_mitigated(void);
+/*
+ * Make previous memory operations globally visible before
+ * a WRMSR.
+ *
+ * MFENCE makes writes visible, but only affects load/store
+ * instructions. WRMSR is unfortunately not a load/store
+ * instruction and is unaffected by MFENCE. The LFENCE ensures
+ * that the WRMSR is not reordered.
+ *
+ * Most WRMSRs are full serializing instructions themselves and
+ * do not require this barrier. This is only required for the
+ * IA32_TSC_DEADLINE and X2APIC MSRs.
+ */
+static inline void weak_wrmsr_fence(void)
+{
+ alternative("mfence; lfence", "", ALT_NOT(X86_FEATURE_APIC_MSRS_FENCE));
+}
+
#endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 4d84122bd643..484f4f0131a5 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -32,10 +32,6 @@ void entry_SYSCALL_compat(void);
void entry_SYSCALL_compat_safe_stack(void);
void entry_SYSRETL_compat_unsafe_stack(void);
void entry_SYSRETL_compat_end(void);
-void entry_INT80_compat(void);
-#ifdef CONFIG_XEN_PV
-void xen_entry_INT80_compat(void);
-#endif
#else /* !CONFIG_IA32_EMULATION */
#define entry_SYSCALL_compat NULL
#define entry_SYSENTER_compat NULL
diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h
index 85b6e3609cb9..ef9697f20129 100644
--- a/arch/x86/include/asm/qspinlock_paravirt.h
+++ b/arch/x86/include/asm/qspinlock_paravirt.h
@@ -56,8 +56,8 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text");
"pop %rdx\n\t" \
FRAME_END
-DEFINE_PARAVIRT_ASM(__raw_callee_save___pv_queued_spin_unlock,
- PV_UNLOCK_ASM, .spinlock.text);
+DEFINE_ASM_FUNC(__raw_callee_save___pv_queued_spin_unlock,
+ PV_UNLOCK_ASM, .spinlock.text);
#else /* CONFIG_64BIT */
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index bf483fcb4e57..5c83729c8e71 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -31,8 +31,6 @@
#include <asm/bootparam.h>
#include <asm/x86_init.h>
-extern u64 relocated_ramdisk;
-
/* Interrupt control for vSMPowered x86_64 systems */
#ifdef CONFIG_X86_64
void vsmp_init(void);
diff --git a/arch/x86/include/asm/syscall_wrapper.h b/arch/x86/include/asm/syscall_wrapper.h
index fd2669b1cb2d..21f9407be5d3 100644
--- a/arch/x86/include/asm/syscall_wrapper.h
+++ b/arch/x86/include/asm/syscall_wrapper.h
@@ -86,9 +86,6 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
return sys_ni_syscall(); \
}
-#define __SYS_NI(abi, name) \
- SYSCALL_ALIAS(__##abi##_##name, sys_ni_posix_timers);
-
#ifdef CONFIG_X86_64
#define __X64_SYS_STUB0(name) \
__SYS_STUB0(x64, sys_##name)
@@ -100,13 +97,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
#define __X64_COND_SYSCALL(name) \
__COND_SYSCALL(x64, sys_##name)
-#define __X64_SYS_NI(name) \
- __SYS_NI(x64, sys_##name)
#else /* CONFIG_X86_64 */
#define __X64_SYS_STUB0(name)
#define __X64_SYS_STUBx(x, name, ...)
#define __X64_COND_SYSCALL(name)
-#define __X64_SYS_NI(name)
#endif /* CONFIG_X86_64 */
#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
@@ -120,13 +114,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
#define __IA32_COND_SYSCALL(name) \
__COND_SYSCALL(ia32, sys_##name)
-#define __IA32_SYS_NI(name) \
- __SYS_NI(ia32, sys_##name)
#else /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */
#define __IA32_SYS_STUB0(name)
#define __IA32_SYS_STUBx(x, name, ...)
#define __IA32_COND_SYSCALL(name)
-#define __IA32_SYS_NI(name)
#endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */
#ifdef CONFIG_IA32_EMULATION
@@ -135,8 +126,7 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
* additional wrappers (aptly named __ia32_sys_xyzzy) which decode the
* ia32 regs in the proper order for shared or "common" syscalls. As some
* syscalls may not be implemented, we need to expand COND_SYSCALL in
- * kernel/sys_ni.c and SYS_NI in kernel/time/posix-stubs.c to cover this
- * case as well.
+ * kernel/sys_ni.c to cover this case as well.
*/
#define __IA32_COMPAT_SYS_STUB0(name) \
__SYS_STUB0(ia32, compat_sys_##name)
@@ -148,14 +138,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
#define __IA32_COMPAT_COND_SYSCALL(name) \
__COND_SYSCALL(ia32, compat_sys_##name)
-#define __IA32_COMPAT_SYS_NI(name) \
- __SYS_NI(ia32, compat_sys_##name)
-
#else /* CONFIG_IA32_EMULATION */
#define __IA32_COMPAT_SYS_STUB0(name)
#define __IA32_COMPAT_SYS_STUBx(x, name, ...)
#define __IA32_COMPAT_COND_SYSCALL(name)
-#define __IA32_COMPAT_SYS_NI(name)
#endif /* CONFIG_IA32_EMULATION */
@@ -175,13 +161,10 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
#define __X32_COMPAT_COND_SYSCALL(name) \
__COND_SYSCALL(x64, compat_sys_##name)
-#define __X32_COMPAT_SYS_NI(name) \
- __SYS_NI(x64, compat_sys_##name)
#else /* CONFIG_X86_X32_ABI */
#define __X32_COMPAT_SYS_STUB0(name)
#define __X32_COMPAT_SYS_STUBx(x, name, ...)
#define __X32_COMPAT_COND_SYSCALL(name)
-#define __X32_COMPAT_SYS_NI(name)
#endif /* CONFIG_X86_X32_ABI */
@@ -212,17 +195,12 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
/*
* As some compat syscalls may not be implemented, we need to expand
- * COND_SYSCALL_COMPAT in kernel/sys_ni.c and COMPAT_SYS_NI in
- * kernel/time/posix-stubs.c to cover this case as well.
+ * COND_SYSCALL_COMPAT in kernel/sys_ni.c to cover this case as well.
*/
#define COND_SYSCALL_COMPAT(name) \
__IA32_COMPAT_COND_SYSCALL(name) \
__X32_COMPAT_COND_SYSCALL(name)
-#define COMPAT_SYS_NI(name) \
- __IA32_COMPAT_SYS_NI(name) \
- __X32_COMPAT_SYS_NI(name)
-
#endif /* CONFIG_COMPAT */
#define __SYSCALL_DEFINEx(x, name, ...) \
@@ -243,8 +221,8 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
* As the generic SYSCALL_DEFINE0() macro does not decode any parameters for
* obvious reasons, and passing struct pt_regs *regs to it in %rdi does not
* hurt, we only need to re-define it here to keep the naming congruent to
- * SYSCALL_DEFINEx() -- which is essential for the COND_SYSCALL() and SYS_NI()
- * macros to work correctly.
+ * SYSCALL_DEFINEx() -- which is essential for the COND_SYSCALL() macro
+ * to work correctly.
*/
#define SYSCALL_DEFINE0(sname) \
SYSCALL_METADATA(_##sname, 0); \
@@ -257,10 +235,6 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
__X64_COND_SYSCALL(name) \
__IA32_COND_SYSCALL(name)
-#define SYS_NI(name) \
- __X64_SYS_NI(name) \
- __IA32_SYS_NI(name)
-
/*
* For VSYSCALLS, we need to declare these three syscalls with the new
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index 29832c338cdc..0b70653a98c1 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -6,18 +6,6 @@
#include <linux/stddef.h>
#include <asm/ptrace.h>
-struct paravirt_patch_site;
-#ifdef CONFIG_PARAVIRT
-void apply_paravirt(struct paravirt_patch_site *start,
- struct paravirt_patch_site *end);
-#else
-static inline void apply_paravirt(struct paravirt_patch_site *start,
- struct paravirt_patch_site *end)
-{}
-#define __parainstructions NULL
-#define __parainstructions_end NULL
-#endif
-
/*
* Currently, the max observed size in the kernel code is
* JUMP_LABEL_NOP_SIZE/RELATIVEJUMP_SIZE, which are 5.
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index b1c9cea6ba88..1f1deaecd364 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -14,7 +14,6 @@
asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs);
asmlinkage __visible notrace
struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs);
-void __init trap_init(void);
asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *eregs);
#endif
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 5fa76c2ced51..ea877fd83114 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -653,7 +653,7 @@ static inline int uv_blade_to_node(int blade)
return uv_socket_to_node(blade);
}
-/* Blade number of current cpu. Numnbered 0 .. <#blades -1> */
+/* Blade number of current cpu. Numbered 0 .. <#blades -1> */
static inline int uv_numa_blade_id(void)
{
return uv_hub_info->numa_blade_id;
diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h
index c81858d903dc..8e048ca980df 100644
--- a/arch/x86/include/asm/vdso/gettimeofday.h
+++ b/arch/x86/include/asm/vdso/gettimeofday.h
@@ -321,7 +321,7 @@ static __always_inline
u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
{
/*
- * Due to the MSB/Sign-bit being used as invald marker (see
+ * Due to the MSB/Sign-bit being used as invalid marker (see
* arch_vdso_cycles_valid() above), the effective mask is S64_MAX.
*/
u64 delta = (cycles - last) & S64_MAX;
@@ -337,8 +337,6 @@ u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
}
#define vdso_calc_delta vdso_calc_delta
-int __vdso_clock_gettime64(clockid_t clock, struct __kernel_timespec *ts);
-
#endif /* !__ASSEMBLY__ */
#endif /* __ASM_VDSO_GETTIMEOFDAY_H */
diff --git a/arch/x86/include/asm/xen/interface_64.h b/arch/x86/include/asm/xen/interface_64.h
index c599ec269a25..c10f279aae93 100644
--- a/arch/x86/include/asm/xen/interface_64.h
+++ b/arch/x86/include/asm/xen/interface_64.h
@@ -61,7 +61,7 @@
* RING1 -> RING3 kernel mode.
* RING2 -> RING3 kernel mode.
* RING3 -> RING3 user mode.
- * However RING0 indicates that the guest kernel should return to iteself
+ * However RING0 indicates that the guest kernel should return to itself
* directly with
* orb $3,1*8(%rsp)
* iretq
diff --git a/arch/x86/include/uapi/asm/amd_hsmp.h b/arch/x86/include/uapi/asm/amd_hsmp.h
index fce22686c834..e5d182c7373c 100644
--- a/arch/x86/include/uapi/asm/amd_hsmp.h
+++ b/arch/x86/include/uapi/asm/amd_hsmp.h
@@ -238,7 +238,7 @@ static const struct hsmp_msg_desc hsmp_msg_desc_table[] = {
/*
* HSMP_GET_DIMM_THERMAL, num_args = 1, response_sz = 1
* input: args[0] = DIMM address[7:0]
- * output: args[0] = temperature in degree celcius[31:21] + update rate in ms[16:8] +
+ * output: args[0] = temperature in degree celsius[31:21] + update rate in ms[16:8] +
* DIMM address[7:0]
*/
{1, 1, HSMP_GET},
diff --git a/arch/x86/include/uapi/asm/signal.h b/arch/x86/include/uapi/asm/signal.h
index 777c3a0f4e23..f777346450ec 100644
--- a/arch/x86/include/uapi/asm/signal.h
+++ b/arch/x86/include/uapi/asm/signal.h
@@ -4,7 +4,6 @@
#ifndef __ASSEMBLY__
#include <linux/types.h>
-#include <linux/time.h>
#include <linux/compiler.h>
/* Avoid too many header ordering problems. */
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 1a0dd80d81ac..85a3ce2a3666 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -293,6 +293,7 @@ acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end)
processor->processor_id, /* ACPI ID */
processor->lapic_flags & ACPI_MADT_ENABLED);
+ has_lapic_cpus = true;
return 0;
}
@@ -1134,7 +1135,6 @@ static int __init acpi_parse_madt_lapic_entries(void)
if (!count) {
count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC,
acpi_parse_lapic, MAX_LOCAL_APIC);
- has_lapic_cpus = count > 0;
x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC,
acpi_parse_x2apic, MAX_LOCAL_APIC);
}
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 73be3931e4f0..cc130b57542a 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -30,6 +30,7 @@
#include <asm/fixmap.h>
#include <asm/paravirt.h>
#include <asm/asm-prototypes.h>
+#include <asm/cfi.h>
int __read_mostly alternatives_patched;
@@ -160,7 +161,6 @@ extern s32 __retpoline_sites[], __retpoline_sites_end[];
extern s32 __return_sites[], __return_sites_end[];
extern s32 __cfi_sites[], __cfi_sites_end[];
extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[];
-extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
void text_poke_early(void *addr, const void *opcode, size_t len);
@@ -255,6 +255,16 @@ static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
}
}
+static void __init_or_module noinline optimize_nops_inplace(u8 *instr, size_t len)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ optimize_nops(instr, len);
+ sync_core();
+ local_irq_restore(flags);
+}
+
/*
* In this context, "source" is where the instructions are placed in the
* section .altinstr_replacement, for example during kernel build by the
@@ -385,6 +395,63 @@ apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
}
}
+/* Low-level backend functions usable from alternative code replacements. */
+DEFINE_ASM_FUNC(nop_func, "", .entry.text);
+EXPORT_SYMBOL_GPL(nop_func);
+
+noinstr void BUG_func(void)
+{
+ BUG();
+}
+EXPORT_SYMBOL_GPL(BUG_func);
+
+#define CALL_RIP_REL_OPCODE 0xff
+#define CALL_RIP_REL_MODRM 0x15
+
+/*
+ * Rewrite the "call BUG_func" replacement to point to the target of the
+ * indirect pv_ops call "call *disp(%ip)".
+ */
+static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a)
+{
+ void *target, *bug = &BUG_func;
+ s32 disp;
+
+ if (a->replacementlen != 5 || insn_buff[0] != CALL_INSN_OPCODE) {
+ pr_err("ALT_FLAG_DIRECT_CALL set for a non-call replacement instruction\n");
+ BUG();
+ }
+
+ if (a->instrlen != 6 ||
+ instr[0] != CALL_RIP_REL_OPCODE ||
+ instr[1] != CALL_RIP_REL_MODRM) {
+ pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n");
+ BUG();
+ }
+
+ /* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */
+ disp = *(s32 *)(instr + 2);
+#ifdef CONFIG_X86_64
+ /* ff 15 00 00 00 00 call *0x0(%rip) */
+ /* target address is stored at "next instruction + disp". */
+ target = *(void **)(instr + a->instrlen + disp);
+#else
+ /* ff 15 00 00 00 00 call *0x0 */
+ /* target address is stored at disp. */
+ target = *(void **)disp;
+#endif
+ if (!target)
+ target = bug;
+
+ /* (BUG_func - .) + (target - BUG_func) := target - . */
+ *(s32 *)(insn_buff + 1) += target - bug;
+
+ if (target == &nop_func)
+ return 0;
+
+ return 5;
+}
+
/*
* Replace instructions with better alternatives for this CPU type. This runs
* before SMP is initialized to avoid SMP problems with self modifying code.
@@ -438,20 +505,25 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
* patch if feature is *NOT* present.
*/
if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) {
- optimize_nops(instr, a->instrlen);
+ optimize_nops_inplace(instr, a->instrlen);
continue;
}
- DPRINTK(ALT, "feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
- (a->flags & ALT_FLAG_NOT) ? "!" : "",
+ DPRINTK(ALT, "feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d) flags: 0x%x",
a->cpuid >> 5,
a->cpuid & 0x1f,
instr, instr, a->instrlen,
- replacement, a->replacementlen);
+ replacement, a->replacementlen, a->flags);
memcpy(insn_buff, replacement, a->replacementlen);
insn_buff_sz = a->replacementlen;
+ if (a->flags & ALT_FLAG_DIRECT_CALL) {
+ insn_buff_sz = alt_replace_call(instr, insn_buff, a);
+ if (insn_buff_sz < 0)
+ continue;
+ }
+
for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
insn_buff[insn_buff_sz] = 0x90;
@@ -832,15 +904,82 @@ void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { }
#endif /* CONFIG_X86_KERNEL_IBT */
#ifdef CONFIG_FINEIBT
+#define __CFI_DEFAULT CFI_DEFAULT
+#elif defined(CONFIG_CFI_CLANG)
+#define __CFI_DEFAULT CFI_KCFI
+#else
+#define __CFI_DEFAULT CFI_OFF
+#endif
-enum cfi_mode {
- CFI_DEFAULT,
- CFI_OFF,
- CFI_KCFI,
- CFI_FINEIBT,
-};
+enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT;
+
+#ifdef CONFIG_CFI_CLANG
+struct bpf_insn;
+
+/* Must match bpf_func_t / DEFINE_BPF_PROG_RUN() */
+extern unsigned int __bpf_prog_runX(const void *ctx,
+ const struct bpf_insn *insn);
+
+/*
+ * Force a reference to the external symbol so the compiler generates
+ * __kcfi_typid.
+ */
+__ADDRESSABLE(__bpf_prog_runX);
+
+/* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid___bpf_prog_runX; */
+asm (
+" .pushsection .data..ro_after_init,\"aw\",@progbits \n"
+" .type cfi_bpf_hash,@object \n"
+" .globl cfi_bpf_hash \n"
+" .p2align 2, 0x0 \n"
+"cfi_bpf_hash: \n"
+" .long __kcfi_typeid___bpf_prog_runX \n"
+" .size cfi_bpf_hash, 4 \n"
+" .popsection \n"
+);
+
+/* Must match bpf_callback_t */
+extern u64 __bpf_callback_fn(u64, u64, u64, u64, u64);
+
+__ADDRESSABLE(__bpf_callback_fn);
+
+/* u32 __ro_after_init cfi_bpf_subprog_hash = __kcfi_typeid___bpf_callback_fn; */
+asm (
+" .pushsection .data..ro_after_init,\"aw\",@progbits \n"
+" .type cfi_bpf_subprog_hash,@object \n"
+" .globl cfi_bpf_subprog_hash \n"
+" .p2align 2, 0x0 \n"
+"cfi_bpf_subprog_hash: \n"
+" .long __kcfi_typeid___bpf_callback_fn \n"
+" .size cfi_bpf_subprog_hash, 4 \n"
+" .popsection \n"
+);
+
+u32 cfi_get_func_hash(void *func)
+{
+ u32 hash;
+
+ func -= cfi_get_offset();
+ switch (cfi_mode) {
+ case CFI_FINEIBT:
+ func += 7;
+ break;
+ case CFI_KCFI:
+ func += 1;
+ break;
+ default:
+ return 0;
+ }
+
+ if (get_kernel_nofault(hash, func))
+ return 0;
+
+ return hash;
+}
+#endif
+
+#ifdef CONFIG_FINEIBT
-static enum cfi_mode cfi_mode __ro_after_init = CFI_DEFAULT;
static bool cfi_rand __ro_after_init = true;
static u32 cfi_seed __ro_after_init;
@@ -1149,8 +1288,11 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
goto err;
if (cfi_rand) {
- if (builtin)
+ if (builtin) {
cfi_seed = get_random_u32();
+ cfi_bpf_hash = cfi_rehash(cfi_bpf_hash);
+ cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash);
+ }
ret = cfi_rand_preamble(start_cfi, end_cfi);
if (ret)
@@ -1411,46 +1553,6 @@ int alternatives_text_reserved(void *start, void *end)
}
#endif /* CONFIG_SMP */
-#ifdef CONFIG_PARAVIRT
-
-/* Use this to add nops to a buffer, then text_poke the whole buffer. */
-static void __init_or_module add_nops(void *insns, unsigned int len)
-{
- while (len > 0) {
- unsigned int noplen = len;
- if (noplen > ASM_NOP_MAX)
- noplen = ASM_NOP_MAX;
- memcpy(insns, x86_nops[noplen], noplen);
- insns += noplen;
- len -= noplen;
- }
-}
-
-void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
- struct paravirt_patch_site *end)
-{
- struct paravirt_patch_site *p;
- char insn_buff[MAX_PATCH_LEN];
-
- for (p = start; p < end; p++) {
- unsigned int used;
-
- BUG_ON(p->len > MAX_PATCH_LEN);
- /* prep the buffer with the original instructions */
- memcpy(insn_buff, p->instr, p->len);
- used = paravirt_patch(p->type, insn_buff, (unsigned long)p->instr, p->len);
-
- BUG_ON(used > p->len);
-
- /* Pad the rest with nops */
- add_nops(insn_buff + used, p->len - used);
- text_poke_early(p->instr, insn_buff, p->len);
- }
-}
-extern struct paravirt_patch_site __start_parainstructions[],
- __stop_parainstructions[];
-#endif /* CONFIG_PARAVIRT */
-
/*
* Self-test for the INT3 based CALL emulation code.
*
@@ -1586,28 +1688,11 @@ void __init alternative_instructions(void)
*/
/*
- * Paravirt patching and alternative patching can be combined to
- * replace a function call with a short direct code sequence (e.g.
- * by setting a constant return value instead of doing that in an
- * external function).
- * In order to make this work the following sequence is required:
- * 1. set (artificial) features depending on used paravirt
- * functions which can later influence alternative patching
- * 2. apply paravirt patching (generally replacing an indirect
- * function call with a direct one)
- * 3. apply alternative patching (e.g. replacing a direct function
- * call with a custom code sequence)
- * Doing paravirt patching after alternative patching would clobber
- * the optimization of the custom code with a function call again.
+ * Make sure to set (artificial) features depending on used paravirt
+ * functions which can later influence alternative patching.
*/
paravirt_set_cap();
- /*
- * First patch paravirt functions, such that we overwrite the indirect
- * call with the direct call.
- */
- apply_paravirt(__parainstructions, __parainstructions_end);
-
__apply_fineibt(__retpoline_sites, __retpoline_sites_end,
__cfi_sites, __cfi_sites_end, true);
@@ -1618,10 +1703,6 @@ void __init alternative_instructions(void)
apply_retpolines(__retpoline_sites, __retpoline_sites_end);
apply_returns(__return_sites, __return_sites_end);
- /*
- * Then patch alternatives, such that those paravirt calls that are in
- * alternatives can be overwritten by their immediate fragments.
- */
apply_alternatives(__alt_instructions, __alt_instructions_end);
/*
@@ -1685,8 +1766,8 @@ void __init_or_module text_poke_early(void *addr, const void *opcode,
} else {
local_irq_save(flags);
memcpy(addr, opcode, len);
- local_irq_restore(flags);
sync_core();
+ local_irq_restore(flags);
/*
* Could also do a CLFLUSH here to speed up CPU recovery; but
@@ -1896,7 +1977,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
* Note that the caller must ensure that if the modified code is part of a
* module, the module would not be removed during poking. This can be achieved
* by registering a module notifier, and ordering module removal and patching
- * trough a mutex.
+ * through a mutex.
*/
void *text_poke(void *addr, const void *opcode, size_t len)
{
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 56a917df410d..2ae98f754e59 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -776,7 +776,7 @@ int __init gart_iommu_init(void)
iommu_size >> PAGE_SHIFT);
/*
* Tricky. The GART table remaps the physical memory range,
- * so the CPU wont notice potential aliases and if the memory
+ * so the CPU won't notice potential aliases and if the memory
* is remapped to UC later on, we might surprise the PCI devices
* with a stray writeout of a cacheline. So play it sure and
* do an explicit, full-scale wbinvd() _after_ having marked all
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 2ee867d796d9..3bf0487cf3b7 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -4,7 +4,7 @@
#
# Leads to non-deterministic coverage that is not a function of syscall inputs.
-# In particualr, smp_apic_timer_interrupt() is called in random places.
+# In particular, smp_apic_timer_interrupt() is called in random places.
KCOV_INSTRUMENT := n
obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_common.o apic_noop.o ipi.o vector.o init.o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 41093cf20acd..4667bc4b00ab 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -782,7 +782,7 @@ bool __init apic_needs_pit(void)
/*
* If interrupt delivery mode is legacy PIC or virtual wire without
- * configuration, the local APIC timer wont be set up. Make sure
+ * configuration, the local APIC timer won't be set up. Make sure
* that the PIT is initialized.
*/
if (apic_intr_mode == APIC_PIC ||
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 7139867d69cd..b295a056a4fc 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -82,7 +82,6 @@ static struct apic apic_flat __ro_after_init = {
.acpi_madt_oem_check = flat_acpi_madt_oem_check,
.apic_id_registered = default_apic_id_registered,
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = true,
.disable_esr = 0,
@@ -154,7 +153,6 @@ static struct apic apic_physflat __ro_after_init = {
.acpi_madt_oem_check = physflat_acpi_madt_oem_check,
.apic_id_registered = default_apic_id_registered,
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = false,
.disable_esr = 0,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index b00d52ae84fa..9f1d553eb48f 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -47,7 +47,6 @@ static void noop_apic_write(u32 reg, u32 val)
struct apic apic_noop __ro_after_init = {
.name = "noop",
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = true,
.disable_esr = 0,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 456a14c44f67..7d0c51b9d3bc 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -222,7 +222,6 @@ static const struct apic apic_numachip1 __refconst = {
.probe = numachip1_probe,
.acpi_madt_oem_check = numachip1_acpi_madt_oem_check,
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = false,
.disable_esr = 0,
@@ -259,7 +258,6 @@ static const struct apic apic_numachip2 __refconst = {
.probe = numachip2_probe,
.acpi_madt_oem_check = numachip2_acpi_madt_oem_check,
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = false,
.disable_esr = 0,
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 7ee3c486cb33..5a0d60b38e6b 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -80,7 +80,6 @@ static struct apic apic_bigsmp __ro_after_init = {
.name = "bigsmp",
.probe = probe_bigsmp,
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = false,
.disable_esr = 1,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 00da6cf6b07d..40c7cf180c20 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -997,7 +997,7 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain,
/*
* Legacy ISA IRQ has already been allocated, just add pin to
* the pin list associated with this IRQ and program the IOAPIC
- * entry. The IOAPIC entry
+ * entry.
*/
if (irq_data && irq_data->parent_data) {
if (!mp_check_pin_attr(irq, info))
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 5eb3fbe472da..c0f78059f06a 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -45,7 +45,6 @@ static struct apic apic_default __ro_after_init = {
.probe = probe_default,
.apic_id_registered = default_apic_id_registered,
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = true,
.disable_esr = 0,
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 319448d87b99..185738c72766 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -738,8 +738,8 @@ int __init arch_probe_nr_irqs(void)
void lapic_assign_legacy_vector(unsigned int irq, bool replace)
{
/*
- * Use assign system here so it wont get accounted as allocated
- * and moveable in the cpu hotplug check and it prevents managed
+ * Use assign system here so it won't get accounted as allocated
+ * and movable in the cpu hotplug check and it prevents managed
* irq reservation from touching it.
*/
irq_matrix_assign_system(vector_matrix, ISA_IRQ_VECTOR(irq), replace);
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index a8306089c91b..28a7d3f2312d 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -227,7 +227,6 @@ static struct apic apic_x2apic_cluster __ro_after_init = {
.probe = x2apic_cluster_probe,
.acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = true,
.disable_esr = 0,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 558a4a8824f4..409815a40668 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -145,7 +145,6 @@ static struct apic apic_x2apic_phys __ro_after_init = {
.probe = x2apic_phys_probe,
.acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = false,
.disable_esr = 0,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 1b0d7336a28f..f1766b18dcd0 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -805,7 +805,6 @@ static struct apic apic_x2apic_uv_x __ro_after_init = {
.probe = uv_probe,
.acpi_madt_oem_check = uv_acpi_madt_oem_check,
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = false,
.disable_esr = 0,
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 5934ee5bc087..76a5ced278c2 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -420,7 +420,7 @@ static DEFINE_MUTEX(apm_mutex);
* This is for buggy BIOS's that refer to (real mode) segment 0x40
* even though they are called in protected mode.
*/
-static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092,
+static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(DESC_DATA32_BIOS,
(unsigned long)__va(0x400UL), PAGE_SIZE - 0x400 - 1);
static const char driver_version[] = "1.16ac"; /* no spaces */
diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c
index e9ad518a5003..64ad2ddea121 100644
--- a/arch/x86/kernel/callthunks.c
+++ b/arch/x86/kernel/callthunks.c
@@ -233,14 +233,13 @@ patch_call_sites(s32 *start, s32 *end, const struct core_text *ct)
}
static __init_or_module void
-patch_paravirt_call_sites(struct paravirt_patch_site *start,
- struct paravirt_patch_site *end,
- const struct core_text *ct)
+patch_alt_call_sites(struct alt_instr *start, struct alt_instr *end,
+ const struct core_text *ct)
{
- struct paravirt_patch_site *p;
+ struct alt_instr *a;
- for (p = start; p < end; p++)
- patch_call(p->instr, ct);
+ for (a = start; a < end; a++)
+ patch_call((void *)&a->instr_offset + a->instr_offset, ct);
}
static __init_or_module void
@@ -248,7 +247,7 @@ callthunks_setup(struct callthunk_sites *cs, const struct core_text *ct)
{
prdbg("Patching call sites %s\n", ct->name);
patch_call_sites(cs->call_start, cs->call_end, ct);
- patch_paravirt_call_sites(cs->pv_start, cs->pv_end, ct);
+ patch_alt_call_sites(cs->alt_start, cs->alt_end, ct);
prdbg("Patching call sites done%s\n", ct->name);
}
@@ -257,8 +256,8 @@ void __init callthunks_patch_builtin_calls(void)
struct callthunk_sites cs = {
.call_start = __call_sites,
.call_end = __call_sites_end,
- .pv_start = __parainstructions,
- .pv_end = __parainstructions_end
+ .alt_start = __alt_instructions,
+ .alt_end = __alt_instructions_end
};
if (!cpu_feature_enabled(X86_FEATURE_CALL_DEPTH))
diff --git a/arch/x86/kernel/cfi.c b/arch/x86/kernel/cfi.c
index 8674a5c0c031..e6bf78fac146 100644
--- a/arch/x86/kernel/cfi.c
+++ b/arch/x86/kernel/cfi.c
@@ -4,10 +4,10 @@
*
* Copyright (C) 2022 Google LLC
*/
-#include <asm/cfi.h>
+#include <linux/string.h>
+#include <linux/cfi.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>
-#include <linux/string.h>
/*
* Returns the target address and the expected type when regs->ip points
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index a7eab05e5f29..9f42d1c59e09 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -34,87 +34,6 @@
*/
static u32 nodes_per_socket = 1;
-/*
- * AMD errata checking
- *
- * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or
- * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that
- * have an OSVW id assigned, which it takes as first argument. Both take a
- * variable number of family-specific model-stepping ranges created by
- * AMD_MODEL_RANGE().
- *
- * Example:
- *
- * const int amd_erratum_319[] =
- * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2),
- * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0),
- * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0));
- */
-
-#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 }
-#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 }
-#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \
- ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end))
-#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff)
-#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff)
-#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff)
-
-static const int amd_erratum_400[] =
- AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf),
- AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf));
-
-static const int amd_erratum_383[] =
- AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf));
-
-/* #1054: Instructions Retired Performance Counter May Be Inaccurate */
-static const int amd_erratum_1054[] =
- AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf));
-
-static const int amd_zenbleed[] =
- AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x30, 0x0, 0x4f, 0xf),
- AMD_MODEL_RANGE(0x17, 0x60, 0x0, 0x7f, 0xf),
- AMD_MODEL_RANGE(0x17, 0x90, 0x0, 0x91, 0xf),
- AMD_MODEL_RANGE(0x17, 0xa0, 0x0, 0xaf, 0xf));
-
-static const int amd_div0[] =
- AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x00, 0x0, 0x2f, 0xf),
- AMD_MODEL_RANGE(0x17, 0x50, 0x0, 0x5f, 0xf));
-
-static const int amd_erratum_1485[] =
- AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x19, 0x10, 0x0, 0x1f, 0xf),
- AMD_MODEL_RANGE(0x19, 0x60, 0x0, 0xaf, 0xf));
-
-static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum)
-{
- int osvw_id = *erratum++;
- u32 range;
- u32 ms;
-
- if (osvw_id >= 0 && osvw_id < 65536 &&
- cpu_has(cpu, X86_FEATURE_OSVW)) {
- u64 osvw_len;
-
- rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len);
- if (osvw_id < osvw_len) {
- u64 osvw_bits;
-
- rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6),
- osvw_bits);
- return osvw_bits & (1ULL << (osvw_id & 0x3f));
- }
- }
-
- /* OSVW unavailable or ID unknown, match family-model-stepping range */
- ms = (cpu->x86_model << 4) | cpu->x86_stepping;
- while ((range = *erratum++))
- if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) &&
- (ms >= AMD_MODEL_RANGE_START(range)) &&
- (ms <= AMD_MODEL_RANGE_END(range)))
- return true;
-
- return false;
-}
-
static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
{
u32 gprs[8] = { 0 };
@@ -616,6 +535,49 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
}
resctrl_cpu_detect(c);
+
+ /* Figure out Zen generations: */
+ switch (c->x86) {
+ case 0x17: {
+ switch (c->x86_model) {
+ case 0x00 ... 0x2f:
+ case 0x50 ... 0x5f:
+ setup_force_cpu_cap(X86_FEATURE_ZEN1);
+ break;
+ case 0x30 ... 0x4f:
+ case 0x60 ... 0x7f:
+ case 0x90 ... 0x91:
+ case 0xa0 ... 0xaf:
+ setup_force_cpu_cap(X86_FEATURE_ZEN2);
+ break;
+ default:
+ goto warn;
+ }
+ break;
+ }
+ case 0x19: {
+ switch (c->x86_model) {
+ case 0x00 ... 0x0f:
+ case 0x20 ... 0x5f:
+ setup_force_cpu_cap(X86_FEATURE_ZEN3);
+ break;
+ case 0x10 ... 0x1f:
+ case 0x60 ... 0xaf:
+ setup_force_cpu_cap(X86_FEATURE_ZEN4);
+ break;
+ default:
+ goto warn;
+ }
+ break;
+ }
+ default:
+ break;
+ }
+
+ return;
+
+warn:
+ WARN_ONCE(1, "Family 0x%x, model: 0x%x??\n", c->x86, c->x86_model);
}
static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
@@ -739,15 +701,6 @@ static void early_init_amd(struct cpuinfo_x86 *c)
if (c->x86 == 0x16 && c->x86_model <= 0xf)
msr_set_bit(MSR_AMD64_LS_CFG, 15);
- /*
- * Check whether the machine is affected by erratum 400. This is
- * used to select the proper idle routine and to enable the check
- * whether the machine is affected in arch_post_acpi_init(), which
- * sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check.
- */
- if (cpu_has_amd_erratum(c, amd_erratum_400))
- set_cpu_bug(c, X86_BUG_AMD_E400);
-
early_detect_mem_encrypt(c);
/* Re-enable TopologyExtensions if switched off by BIOS */
@@ -814,6 +767,16 @@ static void init_amd_k8(struct cpuinfo_x86 *c)
msr_set_bit(MSR_K7_HWCR, 6);
#endif
set_cpu_bug(c, X86_BUG_SWAPGS_FENCE);
+
+ /*
+ * Check models and steppings affected by erratum 400. This is
+ * used to select the proper idle routine and to enable the
+ * check whether the machine is affected in arch_post_acpi_subsys_init()
+ * which sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check.
+ */
+ if (c->x86_model > 0x41 ||
+ (c->x86_model == 0x41 && c->x86_stepping >= 0x2))
+ setup_force_cpu_bug(X86_BUG_AMD_E400);
}
static void init_amd_gh(struct cpuinfo_x86 *c)
@@ -847,8 +810,17 @@ static void init_amd_gh(struct cpuinfo_x86 *c)
*/
msr_clear_bit(MSR_AMD64_BU_CFG2, 24);
- if (cpu_has_amd_erratum(c, amd_erratum_383))
- set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
+ set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
+
+ /*
+ * Check models and steppings affected by erratum 400. This is
+ * used to select the proper idle routine and to enable the
+ * check whether the machine is affected in arch_post_acpi_subsys_init()
+ * which sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check.
+ */
+ if (c->x86_model > 0x2 ||
+ (c->x86_model == 0x2 && c->x86_stepping >= 0x1))
+ setup_force_cpu_bug(X86_BUG_AMD_E400);
}
static void init_amd_ln(struct cpuinfo_x86 *c)
@@ -941,6 +913,19 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
clear_rdrand_cpuid_bit(c);
}
+static void fix_erratum_1386(struct cpuinfo_x86 *c)
+{
+ /*
+ * Work around Erratum 1386. The XSAVES instruction malfunctions in
+ * certain circumstances on Zen1/2 uarch, and not all parts have had
+ * updated microcode at the time of writing (March 2023).
+ *
+ * Affected parts all have no supervisor XSAVE states, meaning that
+ * the XSAVEC instruction (which works fine) is equivalent.
+ */
+ clear_cpu_cap(c, X86_FEATURE_XSAVES);
+}
+
void init_spectral_chicken(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_CPU_UNRET_ENTRY
@@ -951,34 +936,28 @@ void init_spectral_chicken(struct cpuinfo_x86 *c)
*
* This suppresses speculation from the middle of a basic block, i.e. it
* suppresses non-branch predictions.
- *
- * We use STIBP as a heuristic to filter out Zen2 from the rest of F17H
*/
- if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && cpu_has(c, X86_FEATURE_AMD_STIBP)) {
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) {
if (!rdmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, &value)) {
value |= MSR_ZEN2_SPECTRAL_CHICKEN_BIT;
wrmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, value);
}
}
#endif
- /*
- * Work around Erratum 1386. The XSAVES instruction malfunctions in
- * certain circumstances on Zen1/2 uarch, and not all parts have had
- * updated microcode at the time of writing (March 2023).
- *
- * Affected parts all have no supervisor XSAVE states, meaning that
- * the XSAVEC instruction (which works fine) is equivalent.
- */
- clear_cpu_cap(c, X86_FEATURE_XSAVES);
}
-static void init_amd_zn(struct cpuinfo_x86 *c)
+static void init_amd_zen_common(void)
{
- set_cpu_cap(c, X86_FEATURE_ZEN);
-
+ setup_force_cpu_cap(X86_FEATURE_ZEN);
#ifdef CONFIG_NUMA
node_reclaim_distance = 32;
#endif
+}
+
+static void init_amd_zen1(struct cpuinfo_x86 *c)
+{
+ init_amd_zen_common();
+ fix_erratum_1386(c);
/* Fix up CPUID bits, but only if not virtualised. */
if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) {
@@ -986,15 +965,10 @@ static void init_amd_zn(struct cpuinfo_x86 *c)
/* Erratum 1076: CPB feature bit not being set in CPUID. */
if (!cpu_has(c, X86_FEATURE_CPB))
set_cpu_cap(c, X86_FEATURE_CPB);
-
- /*
- * Zen3 (Fam19 model < 0x10) parts are not susceptible to
- * Branch Type Confusion, but predate the allocation of the
- * BTC_NO bit.
- */
- if (c->x86 == 0x19 && !cpu_has(c, X86_FEATURE_BTC_NO))
- set_cpu_cap(c, X86_FEATURE_BTC_NO);
}
+
+ pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n");
+ setup_force_cpu_bug(X86_BUG_DIV0);
}
static bool cpu_has_zenbleed_microcode(void)
@@ -1018,11 +992,8 @@ static bool cpu_has_zenbleed_microcode(void)
return true;
}
-static void zenbleed_check(struct cpuinfo_x86 *c)
+static void zen2_zenbleed_check(struct cpuinfo_x86 *c)
{
- if (!cpu_has_amd_erratum(c, amd_zenbleed))
- return;
-
if (cpu_has(c, X86_FEATURE_HYPERVISOR))
return;
@@ -1037,6 +1008,37 @@ static void zenbleed_check(struct cpuinfo_x86 *c)
}
}
+static void init_amd_zen2(struct cpuinfo_x86 *c)
+{
+ init_amd_zen_common();
+ init_spectral_chicken(c);
+ fix_erratum_1386(c);
+ zen2_zenbleed_check(c);
+}
+
+static void init_amd_zen3(struct cpuinfo_x86 *c)
+{
+ init_amd_zen_common();
+
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) {
+ /*
+ * Zen3 (Fam19 model < 0x10) parts are not susceptible to
+ * Branch Type Confusion, but predate the allocation of the
+ * BTC_NO bit.
+ */
+ if (!cpu_has(c, X86_FEATURE_BTC_NO))
+ set_cpu_cap(c, X86_FEATURE_BTC_NO);
+ }
+}
+
+static void init_amd_zen4(struct cpuinfo_x86 *c)
+{
+ init_amd_zen_common();
+
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR))
+ msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT);
+}
+
static void init_amd(struct cpuinfo_x86 *c)
{
u64 vm_cr;
@@ -1072,11 +1074,17 @@ static void init_amd(struct cpuinfo_x86 *c)
case 0x12: init_amd_ln(c); break;
case 0x15: init_amd_bd(c); break;
case 0x16: init_amd_jg(c); break;
- case 0x17: init_spectral_chicken(c);
- fallthrough;
- case 0x19: init_amd_zn(c); break;
}
+ if (boot_cpu_has(X86_FEATURE_ZEN1))
+ init_amd_zen1(c);
+ else if (boot_cpu_has(X86_FEATURE_ZEN2))
+ init_amd_zen2(c);
+ else if (boot_cpu_has(X86_FEATURE_ZEN3))
+ init_amd_zen3(c);
+ else if (boot_cpu_has(X86_FEATURE_ZEN4))
+ init_amd_zen4(c);
+
/*
* Enable workaround for FXSAVE leak on CPUs
* without a XSaveErPtr feature
@@ -1136,7 +1144,7 @@ static void init_amd(struct cpuinfo_x86 *c)
* Counter May Be Inaccurate".
*/
if (cpu_has(c, X86_FEATURE_IRPERF) &&
- !cpu_has_amd_erratum(c, amd_erratum_1054))
+ (boot_cpu_has(X86_FEATURE_ZEN1) && c->x86_model > 0x2f))
msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT);
check_null_seg_clears_base(c);
@@ -1152,16 +1160,8 @@ static void init_amd(struct cpuinfo_x86 *c)
cpu_has(c, X86_FEATURE_AUTOIBRS))
WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS));
- zenbleed_check(c);
-
- if (cpu_has_amd_erratum(c, amd_div0)) {
- pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n");
- setup_force_cpu_bug(X86_BUG_DIV0);
- }
-
- if (!cpu_has(c, X86_FEATURE_HYPERVISOR) &&
- cpu_has_amd_erratum(c, amd_erratum_1485))
- msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT);
+ /* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
+ clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
}
#ifdef CONFIG_X86_32
@@ -1315,11 +1315,14 @@ static void zenbleed_check_cpu(void *unused)
{
struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
- zenbleed_check(c);
+ zen2_zenbleed_check(c);
}
void amd_check_microcode(void)
{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ return;
+
on_each_cpu(zenbleed_check_cpu, NULL, 1);
}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b14fc8c1c953..94bff381ef20 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -188,45 +188,37 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
* TLS descriptors are currently at a different place compared to i386.
* Hopefully nobody expects them at a fixed place (Wine?)
*/
- [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
- [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
- [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff),
- [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff),
- [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff),
- [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(DESC_CODE32, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE64, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA64, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(DESC_CODE32 | DESC_USER, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(DESC_DATA64 | DESC_USER, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(DESC_CODE64 | DESC_USER, 0, 0xfffff),
#else
- [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff),
- [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
- [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff),
- [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE32, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA32, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(DESC_CODE32 | DESC_USER, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(DESC_DATA32 | DESC_USER, 0, 0xfffff),
/*
* Segments used for calling PnP BIOS have byte granularity.
* They code segments and data segments have fixed 64k limits,
* the transfer segment sizes are set at run time.
*/
- /* 32-bit code */
- [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
- /* 16-bit code */
- [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
- /* 16-bit data */
- [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(0x0092, 0, 0xffff),
- /* 16-bit data */
- [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(0x0092, 0, 0),
- /* 16-bit data */
- [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(0x0092, 0, 0),
+ [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(DESC_CODE32_BIOS, 0, 0xffff),
+ [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(DESC_CODE16, 0, 0xffff),
+ [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(DESC_DATA16, 0, 0xffff),
+ [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(DESC_DATA16, 0, 0),
+ [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(DESC_DATA16, 0, 0),
/*
* The APM segments have byte granularity and their bases
* are set at run time. All have 64k limits.
*/
- /* 32-bit code */
- [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
- /* 16-bit code */
- [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
- /* data */
- [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(0x4092, 0, 0xffff),
-
- [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
- [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
+ [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(DESC_CODE32_BIOS, 0, 0xffff),
+ [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(DESC_CODE16, 0, 0xffff),
+ [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(DESC_DATA32_BIOS, 0, 0xffff),
+
+ [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(DESC_DATA32, 0, 0xfffff),
+ [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(DESC_DATA32, 0, 0xfffff),
#endif
} };
EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
@@ -1856,6 +1848,13 @@ static void identify_cpu(struct cpuinfo_x86 *c)
c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0);
#endif
+
+ /*
+ * Set default APIC and TSC_DEADLINE MSR fencing flag. AMD and
+ * Hygon will clear it in ->c_init() below.
+ */
+ set_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
+
/*
* Vendor-specific initialization. In this section we
* canonicalize the feature flags, meaning if there are
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index 6f247d66758d..f0cd95502faa 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -354,6 +354,9 @@ static void init_hygon(struct cpuinfo_x86 *c)
set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
check_null_seg_clears_base(c);
+
+ /* Hygon CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
+ clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
}
static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c
index e4c3ba91321c..f18d35fe27a9 100644
--- a/arch/x86/kernel/cpu/intel_epb.c
+++ b/arch/x86/kernel/cpu/intel_epb.c
@@ -237,4 +237,4 @@ err_out_online:
cpuhp_remove_state(CPUHP_AP_X86_INTEL_EPB_ONLINE);
return ret;
}
-subsys_initcall(intel_epb_init);
+late_initcall(intel_epb_init);
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index f3517b8a8e91..2b46eb0fdf3a 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -87,42 +87,40 @@ struct smca_bank {
static DEFINE_PER_CPU_READ_MOSTLY(struct smca_bank[MAX_NR_BANKS], smca_banks);
static DEFINE_PER_CPU_READ_MOSTLY(u8[N_SMCA_BANK_TYPES], smca_bank_counts);
-struct smca_bank_name {
- const char *name; /* Short name for sysfs */
- const char *long_name; /* Long name for pretty-printing */
-};
-
-static struct smca_bank_name smca_names[] = {
- [SMCA_LS ... SMCA_LS_V2] = { "load_store", "Load Store Unit" },
- [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" },
- [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" },
- [SMCA_DE] = { "decode_unit", "Decode Unit" },
- [SMCA_RESERVED] = { "reserved", "Reserved" },
- [SMCA_EX] = { "execution_unit", "Execution Unit" },
- [SMCA_FP] = { "floating_point", "Floating Point Unit" },
- [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" },
- [SMCA_CS ... SMCA_CS_V2] = { "coherent_slave", "Coherent Slave" },
- [SMCA_PIE] = { "pie", "Power, Interrupts, etc." },
+static const char * const smca_names[] = {
+ [SMCA_LS ... SMCA_LS_V2] = "load_store",
+ [SMCA_IF] = "insn_fetch",
+ [SMCA_L2_CACHE] = "l2_cache",
+ [SMCA_DE] = "decode_unit",
+ [SMCA_RESERVED] = "reserved",
+ [SMCA_EX] = "execution_unit",
+ [SMCA_FP] = "floating_point",
+ [SMCA_L3_CACHE] = "l3_cache",
+ [SMCA_CS ... SMCA_CS_V2] = "coherent_slave",
+ [SMCA_PIE] = "pie",
/* UMC v2 is separate because both of them can exist in a single system. */
- [SMCA_UMC] = { "umc", "Unified Memory Controller" },
- [SMCA_UMC_V2] = { "umc_v2", "Unified Memory Controller v2" },
- [SMCA_PB] = { "param_block", "Parameter Block" },
- [SMCA_PSP ... SMCA_PSP_V2] = { "psp", "Platform Security Processor" },
- [SMCA_SMU ... SMCA_SMU_V2] = { "smu", "System Management Unit" },
- [SMCA_MP5] = { "mp5", "Microprocessor 5 Unit" },
- [SMCA_MPDMA] = { "mpdma", "MPDMA Unit" },
- [SMCA_NBIO] = { "nbio", "Northbridge IO Unit" },
- [SMCA_PCIE ... SMCA_PCIE_V2] = { "pcie", "PCI Express Unit" },
- [SMCA_XGMI_PCS] = { "xgmi_pcs", "Ext Global Memory Interconnect PCS Unit" },
- [SMCA_NBIF] = { "nbif", "NBIF Unit" },
- [SMCA_SHUB] = { "shub", "System Hub Unit" },
- [SMCA_SATA] = { "sata", "SATA Unit" },
- [SMCA_USB] = { "usb", "USB Unit" },
- [SMCA_GMI_PCS] = { "gmi_pcs", "Global Memory Interconnect PCS Unit" },
- [SMCA_XGMI_PHY] = { "xgmi_phy", "Ext Global Memory Interconnect PHY Unit" },
- [SMCA_WAFL_PHY] = { "wafl_phy", "WAFL PHY Unit" },
- [SMCA_GMI_PHY] = { "gmi_phy", "Global Memory Interconnect PHY Unit" },
+ [SMCA_UMC] = "umc",
+ [SMCA_UMC_V2] = "umc_v2",
+ [SMCA_MA_LLC] = "ma_llc",
+ [SMCA_PB] = "param_block",
+ [SMCA_PSP ... SMCA_PSP_V2] = "psp",
+ [SMCA_SMU ... SMCA_SMU_V2] = "smu",
+ [SMCA_MP5] = "mp5",
+ [SMCA_MPDMA] = "mpdma",
+ [SMCA_NBIO] = "nbio",
+ [SMCA_PCIE ... SMCA_PCIE_V2] = "pcie",
+ [SMCA_XGMI_PCS] = "xgmi_pcs",
+ [SMCA_NBIF] = "nbif",
+ [SMCA_SHUB] = "shub",
+ [SMCA_SATA] = "sata",
+ [SMCA_USB] = "usb",
+ [SMCA_USR_DP] = "usr_dp",
+ [SMCA_USR_CP] = "usr_cp",
+ [SMCA_GMI_PCS] = "gmi_pcs",
+ [SMCA_XGMI_PHY] = "xgmi_phy",
+ [SMCA_WAFL_PHY] = "wafl_phy",
+ [SMCA_GMI_PHY] = "gmi_phy",
};
static const char *smca_get_name(enum smca_bank_types t)
@@ -130,17 +128,8 @@ static const char *smca_get_name(enum smca_bank_types t)
if (t >= N_SMCA_BANK_TYPES)
return NULL;
- return smca_names[t].name;
-}
-
-const char *smca_get_long_name(enum smca_bank_types t)
-{
- if (t >= N_SMCA_BANK_TYPES)
- return NULL;
-
- return smca_names[t].long_name;
+ return smca_names[t];
}
-EXPORT_SYMBOL_GPL(smca_get_long_name);
enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank)
{
@@ -178,6 +167,7 @@ static const struct smca_hwid smca_hwid_mcatypes[] = {
{ SMCA_CS, HWID_MCATYPE(0x2E, 0x0) },
{ SMCA_PIE, HWID_MCATYPE(0x2E, 0x1) },
{ SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2) },
+ { SMCA_MA_LLC, HWID_MCATYPE(0x2E, 0x4) },
/* Unified Memory Controller MCA type */
{ SMCA_UMC, HWID_MCATYPE(0x96, 0x0) },
@@ -212,6 +202,8 @@ static const struct smca_hwid smca_hwid_mcatypes[] = {
{ SMCA_SHUB, HWID_MCATYPE(0x80, 0x0) },
{ SMCA_SATA, HWID_MCATYPE(0xA8, 0x0) },
{ SMCA_USB, HWID_MCATYPE(0xAA, 0x0) },
+ { SMCA_USR_DP, HWID_MCATYPE(0x170, 0x0) },
+ { SMCA_USR_CP, HWID_MCATYPE(0x180, 0x0) },
{ SMCA_GMI_PCS, HWID_MCATYPE(0x241, 0x0) },
{ SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0) },
{ SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0) },
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 7b397370b4d6..fd5ce12c4f9a 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -44,6 +44,7 @@
#include <linux/sync_core.h>
#include <linux/task_work.h>
#include <linux/hardirq.h>
+#include <linux/kexec.h>
#include <asm/intel-family.h>
#include <asm/processor.h>
@@ -233,6 +234,7 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
struct llist_node *pending;
struct mce_evt_llist *l;
int apei_err = 0;
+ struct page *p;
/*
* Allow instrumentation around external facilities usage. Not that it
@@ -286,6 +288,20 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
if (!fake_panic) {
if (panic_timeout == 0)
panic_timeout = mca_cfg.panic_timeout;
+
+ /*
+ * Kdump skips the poisoned page in order to avoid
+ * touching the error bits again. Poison the page even
+ * if the error is fatal and the machine is about to
+ * panic.
+ */
+ if (kexec_crash_loaded()) {
+ if (final && (final->status & MCI_STATUS_ADDRV)) {
+ p = pfn_to_online_page(final->addr >> PAGE_SHIFT);
+ if (p)
+ SetPageHWPoison(p);
+ }
+ }
panic(msg);
} else
pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
@@ -670,6 +686,16 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
barrier();
m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
+ /*
+ * Update storm tracking here, before checking for the
+ * MCI_STATUS_VAL bit. Valid corrected errors count
+ * towards declaring, or maintaining, storm status. No
+ * error in a bank counts towards avoiding, or ending,
+ * storm status.
+ */
+ if (!mca_cfg.cmci_disabled)
+ mce_track_storm(&m);
+
/* If this entry is not valid, ignore it */
if (!(m.status & MCI_STATUS_VAL))
continue;
@@ -1601,13 +1627,6 @@ static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);
-static unsigned long mce_adjust_timer_default(unsigned long interval)
-{
- return interval;
-}
-
-static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
-
static void __start_timer(struct timer_list *t, unsigned long interval)
{
unsigned long when = jiffies + interval;
@@ -1637,15 +1656,9 @@ static void mce_timer_fn(struct timer_list *t)
iv = __this_cpu_read(mce_next_interval);
- if (mce_available(this_cpu_ptr(&cpu_info))) {
+ if (mce_available(this_cpu_ptr(&cpu_info)))
mc_poll_banks();
- if (mce_intel_cmci_poll()) {
- iv = mce_adjust_timer(iv);
- goto done;
- }
- }
-
/*
* Alert userspace if needed. If we logged an MCE, reduce the polling
* interval, otherwise increase the polling interval.
@@ -1655,23 +1668,29 @@ static void mce_timer_fn(struct timer_list *t)
else
iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
-done:
- __this_cpu_write(mce_next_interval, iv);
- __start_timer(t, iv);
+ if (mce_get_storm_mode()) {
+ __start_timer(t, HZ);
+ } else {
+ __this_cpu_write(mce_next_interval, iv);
+ __start_timer(t, iv);
+ }
}
/*
- * Ensure that the timer is firing in @interval from now.
+ * When a storm starts on any bank on this CPU, switch to polling
+ * once per second. When the storm ends, revert to the default
+ * polling interval.
*/
-void mce_timer_kick(unsigned long interval)
+void mce_timer_kick(bool storm)
{
struct timer_list *t = this_cpu_ptr(&mce_timer);
- unsigned long iv = __this_cpu_read(mce_next_interval);
- __start_timer(t, interval);
+ mce_set_storm_mode(storm);
- if (interval < iv)
- __this_cpu_write(mce_next_interval, interval);
+ if (storm)
+ __start_timer(t, HZ);
+ else
+ __this_cpu_write(mce_next_interval, check_interval * HZ);
}
/* Must not be called in IRQ context where del_timer_sync() can deadlock */
@@ -1995,7 +2014,6 @@ static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c)
intel_init_cmci();
intel_init_lmce();
- mce_adjust_timer = cmci_intel_adjust_timer;
}
static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
@@ -2008,7 +2026,6 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
mce_intel_feature_init(c);
- mce_adjust_timer = cmci_intel_adjust_timer;
break;
case X86_VENDOR_AMD: {
@@ -2568,9 +2585,6 @@ static int mce_device_create(unsigned int cpu)
int err;
int i, j;
- if (!mce_available(&boot_cpu_data))
- return -EIO;
-
dev = per_cpu(mce_device, cpu);
if (dev)
return 0;
@@ -2665,8 +2679,6 @@ static void mce_reenable_cpu(void)
static int mce_cpu_dead(unsigned int cpu)
{
- mce_intel_hcpu_update(cpu);
-
/* intentionally ignoring frozen here */
if (!cpuhp_tasks_frozen)
cmci_rediscover();
diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c
index 4d8d4bcf915d..72f0695c3dc1 100644
--- a/arch/x86/kernel/cpu/mce/inject.c
+++ b/arch/x86/kernel/cpu/mce/inject.c
@@ -746,6 +746,7 @@ static void check_hw_inj_possible(void)
wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), status);
rdmsrl_safe(mca_msr_reg(bank, MCA_STATUS), &status);
+ wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), 0);
if (!status) {
hw_injection_possible = false;
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index 52bce533ddcc..399b62e223d2 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -42,15 +42,6 @@
static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
/*
- * CMCI storm detection backoff counter
- *
- * During storm, we reset this counter to INITIAL_CHECK_INTERVAL in case we've
- * encountered an error. If not, we decrement it by one. We signal the end of
- * the CMCI storm when it reaches 0.
- */
-static DEFINE_PER_CPU(int, cmci_backoff_cnt);
-
-/*
* cmci_discover_lock protects against parallel discovery attempts
* which could race against each other.
*/
@@ -63,22 +54,26 @@ static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
*/
static DEFINE_SPINLOCK(cmci_poll_lock);
+/* Linux non-storm CMCI threshold (may be overridden by BIOS) */
#define CMCI_THRESHOLD 1
-#define CMCI_POLL_INTERVAL (30 * HZ)
-#define CMCI_STORM_INTERVAL (HZ)
-#define CMCI_STORM_THRESHOLD 15
-static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
-static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt);
-static DEFINE_PER_CPU(unsigned int, cmci_storm_state);
-
-enum {
- CMCI_STORM_NONE,
- CMCI_STORM_ACTIVE,
- CMCI_STORM_SUBSIDED,
-};
+/*
+ * MCi_CTL2 threshold for each bank when there is no storm.
+ * Default value for each bank may have been set by BIOS.
+ */
+static u16 cmci_threshold[MAX_NR_BANKS];
-static atomic_t cmci_storm_on_cpus;
+/*
+ * High threshold to limit CMCI rate during storms. Max supported is
+ * 0x7FFF. Use this slightly smaller value so it has a distinctive
+ * signature when some asks "Why am I not seeing all corrected errors?"
+ * A high threshold is used instead of just disabling CMCI for a
+ * bank because both corrected and uncorrected errors may be logged
+ * in the same bank and signalled with CMCI. The threshold only applies
+ * to corrected errors, so keeping CMCI enabled means that uncorrected
+ * errors will still be processed in a timely fashion.
+ */
+#define CMCI_STORM_THRESHOLD 32749
static int cmci_supported(int *banks)
{
@@ -134,204 +129,166 @@ static bool lmce_supported(void)
return tmp & FEAT_CTL_LMCE_ENABLED;
}
-bool mce_intel_cmci_poll(void)
+/*
+ * Set a new CMCI threshold value. Preserve the state of the
+ * MCI_CTL2_CMCI_EN bit in case this happens during a
+ * cmci_rediscover() operation.
+ */
+static void cmci_set_threshold(int bank, int thresh)
{
- if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
- return false;
-
- /*
- * Reset the counter if we've logged an error in the last poll
- * during the storm.
- */
- if (machine_check_poll(0, this_cpu_ptr(&mce_banks_owned)))
- this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
- else
- this_cpu_dec(cmci_backoff_cnt);
+ unsigned long flags;
+ u64 val;
- return true;
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+ rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
+ wrmsrl(MSR_IA32_MCx_CTL2(bank), val | thresh);
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
-void mce_intel_hcpu_update(unsigned long cpu)
+void mce_intel_handle_storm(int bank, bool on)
{
- if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE)
- atomic_dec(&cmci_storm_on_cpus);
+ if (on)
+ cmci_set_threshold(bank, CMCI_STORM_THRESHOLD);
+ else
+ cmci_set_threshold(bank, cmci_threshold[bank]);
+}
- per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
+/*
+ * The interrupt handler. This is called on every event.
+ * Just call the poller directly to log any events.
+ * This could in theory increase the threshold under high load,
+ * but doesn't for now.
+ */
+static void intel_threshold_interrupt(void)
+{
+ machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
}
-static void cmci_toggle_interrupt_mode(bool on)
+/*
+ * Check all the reasons why current CPU cannot claim
+ * ownership of a bank.
+ * 1: CPU already owns this bank
+ * 2: BIOS owns this bank
+ * 3: Some other CPU owns this bank
+ */
+static bool cmci_skip_bank(int bank, u64 *val)
{
- unsigned long flags, *owned;
- int bank;
- u64 val;
+ unsigned long *owned = (void *)this_cpu_ptr(&mce_banks_owned);
- raw_spin_lock_irqsave(&cmci_discover_lock, flags);
- owned = this_cpu_ptr(mce_banks_owned);
- for_each_set_bit(bank, owned, MAX_NR_BANKS) {
- rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ if (test_bit(bank, owned))
+ return true;
- if (on)
- val |= MCI_CTL2_CMCI_EN;
- else
- val &= ~MCI_CTL2_CMCI_EN;
+ /* Skip banks in firmware first mode */
+ if (test_bit(bank, mce_banks_ce_disabled))
+ return true;
- wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
- }
- raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
-}
+ rdmsrl(MSR_IA32_MCx_CTL2(bank), *val);
-unsigned long cmci_intel_adjust_timer(unsigned long interval)
-{
- if ((this_cpu_read(cmci_backoff_cnt) > 0) &&
- (__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) {
- mce_notify_irq();
- return CMCI_STORM_INTERVAL;
+ /* Already owned by someone else? */
+ if (*val & MCI_CTL2_CMCI_EN) {
+ clear_bit(bank, owned);
+ __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
+ return true;
}
- switch (__this_cpu_read(cmci_storm_state)) {
- case CMCI_STORM_ACTIVE:
-
- /*
- * We switch back to interrupt mode once the poll timer has
- * silenced itself. That means no events recorded and the timer
- * interval is back to our poll interval.
- */
- __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
- if (!atomic_sub_return(1, &cmci_storm_on_cpus))
- pr_notice("CMCI storm subsided: switching to interrupt mode\n");
+ return false;
+}
- fallthrough;
+/*
+ * Decide which CMCI interrupt threshold to use:
+ * 1: If this bank is in storm mode from whichever CPU was
+ * the previous owner, stay in storm mode.
+ * 2: If ignoring any threshold set by BIOS, set Linux default
+ * 3: Try to honor BIOS threshold (unless buggy BIOS set it at zero).
+ */
+static u64 cmci_pick_threshold(u64 val, int *bios_zero_thresh)
+{
+ if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD)
+ return val;
- case CMCI_STORM_SUBSIDED:
+ if (!mca_cfg.bios_cmci_threshold) {
+ val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
+ val |= CMCI_THRESHOLD;
+ } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) {
/*
- * We wait for all CPUs to go back to SUBSIDED state. When that
- * happens we switch back to interrupt mode.
+ * If bios_cmci_threshold boot option was specified
+ * but the threshold is zero, we'll try to initialize
+ * it to 1.
*/
- if (!atomic_read(&cmci_storm_on_cpus)) {
- __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
- cmci_toggle_interrupt_mode(true);
- cmci_recheck();
- }
- return CMCI_POLL_INTERVAL;
- default:
-
- /* We have shiny weather. Let the poll do whatever it thinks. */
- return interval;
+ *bios_zero_thresh = 1;
+ val |= CMCI_THRESHOLD;
}
+
+ return val;
}
-static bool cmci_storm_detect(void)
+/*
+ * Try to claim ownership of a bank.
+ */
+static void cmci_claim_bank(int bank, u64 val, int bios_zero_thresh, int *bios_wrong_thresh)
{
- unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
- unsigned long ts = __this_cpu_read(cmci_time_stamp);
- unsigned long now = jiffies;
- int r;
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
- if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE)
- return true;
+ val |= MCI_CTL2_CMCI_EN;
+ wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
- if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) {
- cnt++;
- } else {
- cnt = 1;
- __this_cpu_write(cmci_time_stamp, now);
+ /* If the enable bit did not stick, this bank should be polled. */
+ if (!(val & MCI_CTL2_CMCI_EN)) {
+ WARN_ON(!test_bit(bank, this_cpu_ptr(mce_poll_banks)));
+ storm->banks[bank].poll_only = true;
+ return;
}
- __this_cpu_write(cmci_storm_cnt, cnt);
- if (cnt <= CMCI_STORM_THRESHOLD)
- return false;
-
- cmci_toggle_interrupt_mode(false);
- __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
- r = atomic_add_return(1, &cmci_storm_on_cpus);
- mce_timer_kick(CMCI_STORM_INTERVAL);
- this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
+ /* This CPU successfully set the enable bit. */
+ set_bit(bank, (void *)this_cpu_ptr(&mce_banks_owned));
- if (r == 1)
- pr_notice("CMCI storm detected: switching to poll mode\n");
- return true;
-}
+ if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD) {
+ pr_notice("CPU%d BANK%d CMCI inherited storm\n", smp_processor_id(), bank);
+ mce_inherit_storm(bank);
+ cmci_storm_begin(bank);
+ } else {
+ __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
+ }
-/*
- * The interrupt handler. This is called on every event.
- * Just call the poller directly to log any events.
- * This could in theory increase the threshold under high load,
- * but doesn't for now.
- */
-static void intel_threshold_interrupt(void)
-{
- if (cmci_storm_detect())
- return;
+ /*
+ * We are able to set thresholds for some banks that
+ * had a threshold of 0. This means the BIOS has not
+ * set the thresholds properly or does not work with
+ * this boot option. Note down now and report later.
+ */
+ if (mca_cfg.bios_cmci_threshold && bios_zero_thresh &&
+ (val & MCI_CTL2_CMCI_THRESHOLD_MASK))
+ *bios_wrong_thresh = 1;
- machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
+ /* Save default threshold for each bank */
+ if (cmci_threshold[bank] == 0)
+ cmci_threshold[bank] = val & MCI_CTL2_CMCI_THRESHOLD_MASK;
}
/*
* Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
* on this CPU. Use the algorithm recommended in the SDM to discover shared
- * banks.
+ * banks. Called during initial bootstrap, and also for hotplug CPU operations
+ * to rediscover/reassign machine check banks.
*/
static void cmci_discover(int banks)
{
- unsigned long *owned = (void *)this_cpu_ptr(&mce_banks_owned);
+ int bios_wrong_thresh = 0;
unsigned long flags;
int i;
- int bios_wrong_thresh = 0;
raw_spin_lock_irqsave(&cmci_discover_lock, flags);
for (i = 0; i < banks; i++) {
u64 val;
int bios_zero_thresh = 0;
- if (test_bit(i, owned))
+ if (cmci_skip_bank(i, &val))
continue;
- /* Skip banks in firmware first mode */
- if (test_bit(i, mce_banks_ce_disabled))
- continue;
-
- rdmsrl(MSR_IA32_MCx_CTL2(i), val);
-
- /* Already owned by someone else? */
- if (val & MCI_CTL2_CMCI_EN) {
- clear_bit(i, owned);
- __clear_bit(i, this_cpu_ptr(mce_poll_banks));
- continue;
- }
-
- if (!mca_cfg.bios_cmci_threshold) {
- val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
- val |= CMCI_THRESHOLD;
- } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) {
- /*
- * If bios_cmci_threshold boot option was specified
- * but the threshold is zero, we'll try to initialize
- * it to 1.
- */
- bios_zero_thresh = 1;
- val |= CMCI_THRESHOLD;
- }
-
- val |= MCI_CTL2_CMCI_EN;
- wrmsrl(MSR_IA32_MCx_CTL2(i), val);
- rdmsrl(MSR_IA32_MCx_CTL2(i), val);
-
- /* Did the enable bit stick? -- the bank supports CMCI */
- if (val & MCI_CTL2_CMCI_EN) {
- set_bit(i, owned);
- __clear_bit(i, this_cpu_ptr(mce_poll_banks));
- /*
- * We are able to set thresholds for some banks that
- * had a threshold of 0. This means the BIOS has not
- * set the thresholds properly or does not work with
- * this boot option. Note down now and report later.
- */
- if (mca_cfg.bios_cmci_threshold && bios_zero_thresh &&
- (val & MCI_CTL2_CMCI_THRESHOLD_MASK))
- bios_wrong_thresh = 1;
- } else {
- WARN_ON(!test_bit(i, this_cpu_ptr(mce_poll_banks)));
- }
+ val = cmci_pick_threshold(val, &bios_zero_thresh);
+ cmci_claim_bank(i, val, bios_zero_thresh, &bios_wrong_thresh);
}
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) {
@@ -370,6 +327,9 @@ static void __cmci_disable_bank(int bank)
val &= ~MCI_CTL2_CMCI_EN;
wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
__clear_bit(bank, this_cpu_ptr(mce_banks_owned));
+
+ if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD)
+ cmci_storm_end(bank);
}
/*
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index e13a26c9c0ac..01f8f03969e6 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -41,9 +41,7 @@ struct dentry *mce_get_debugfs_dir(void);
extern mce_banks_t mce_banks_ce_disabled;
#ifdef CONFIG_X86_MCE_INTEL
-unsigned long cmci_intel_adjust_timer(unsigned long interval);
-bool mce_intel_cmci_poll(void);
-void mce_intel_hcpu_update(unsigned long cpu);
+void mce_intel_handle_storm(int bank, bool on);
void cmci_disable_bank(int bank);
void intel_init_cmci(void);
void intel_init_lmce(void);
@@ -51,9 +49,7 @@ void intel_clear_lmce(void);
bool intel_filter_mce(struct mce *m);
bool intel_mce_usable_address(struct mce *m);
#else
-# define cmci_intel_adjust_timer mce_adjust_timer_default
-static inline bool mce_intel_cmci_poll(void) { return false; }
-static inline void mce_intel_hcpu_update(unsigned long cpu) { }
+static inline void mce_intel_handle_storm(int bank, bool on) { }
static inline void cmci_disable_bank(int bank) { }
static inline void intel_init_cmci(void) { }
static inline void intel_init_lmce(void) { }
@@ -62,7 +58,63 @@ static inline bool intel_filter_mce(struct mce *m) { return false; }
static inline bool intel_mce_usable_address(struct mce *m) { return false; }
#endif
-void mce_timer_kick(unsigned long interval);
+void mce_timer_kick(bool storm);
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+void cmci_storm_begin(unsigned int bank);
+void cmci_storm_end(unsigned int bank);
+void mce_track_storm(struct mce *mce);
+void mce_inherit_storm(unsigned int bank);
+bool mce_get_storm_mode(void);
+void mce_set_storm_mode(bool storm);
+#else
+static inline void cmci_storm_begin(unsigned int bank) {}
+static inline void cmci_storm_end(unsigned int bank) {}
+static inline void mce_track_storm(struct mce *mce) {}
+static inline void mce_inherit_storm(unsigned int bank) {}
+static inline bool mce_get_storm_mode(void) { return false; }
+static inline void mce_set_storm_mode(bool storm) {}
+#endif
+
+/*
+ * history: Bitmask tracking errors occurrence. Each set bit
+ * represents an error seen.
+ *
+ * timestamp: Last time (in jiffies) that the bank was polled.
+ * in_storm_mode: Is this bank in storm mode?
+ * poll_only: Bank does not support CMCI, skip storm tracking.
+ */
+struct storm_bank {
+ u64 history;
+ u64 timestamp;
+ bool in_storm_mode;
+ bool poll_only;
+};
+
+#define NUM_HISTORY_BITS (sizeof(u64) * BITS_PER_BYTE)
+
+/* How many errors within the history buffer mark the start of a storm. */
+#define STORM_BEGIN_THRESHOLD 5
+
+/*
+ * How many polls of machine check bank without an error before declaring
+ * the storm is over. Since it is tracked by the bitmasks in the history
+ * field of struct storm_bank the mask is 30 bits [0 ... 29].
+ */
+#define STORM_END_POLL_THRESHOLD 29
+
+/*
+ * banks: per-cpu, per-bank details
+ * stormy_bank_count: count of MC banks in storm state
+ * poll_mode: CPU is in poll mode
+ */
+struct mca_storm_desc {
+ struct storm_bank banks[MAX_NR_BANKS];
+ u8 stormy_bank_count;
+ bool poll_mode;
+};
+
+DECLARE_PER_CPU(struct mca_storm_desc, storm_desc);
#ifdef CONFIG_ACPI_APEI
int apei_write_mce(struct mce *m);
diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c
index ef4e7bb5fd88..89e31e1e5c9c 100644
--- a/arch/x86/kernel/cpu/mce/threshold.c
+++ b/arch/x86/kernel/cpu/mce/threshold.c
@@ -29,3 +29,118 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_threshold)
trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR);
apic_eoi();
}
+
+DEFINE_PER_CPU(struct mca_storm_desc, storm_desc);
+
+void mce_inherit_storm(unsigned int bank)
+{
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
+
+ /*
+ * Previous CPU owning this bank had put it into storm mode,
+ * but the precise history of that storm is unknown. Assume
+ * the worst (all recent polls of the bank found a valid error
+ * logged). This will avoid the new owner prematurely declaring
+ * the storm has ended.
+ */
+ storm->banks[bank].history = ~0ull;
+ storm->banks[bank].timestamp = jiffies;
+}
+
+bool mce_get_storm_mode(void)
+{
+ return __this_cpu_read(storm_desc.poll_mode);
+}
+
+void mce_set_storm_mode(bool storm)
+{
+ __this_cpu_write(storm_desc.poll_mode, storm);
+}
+
+static void mce_handle_storm(unsigned int bank, bool on)
+{
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_INTEL:
+ mce_intel_handle_storm(bank, on);
+ break;
+ }
+}
+
+void cmci_storm_begin(unsigned int bank)
+{
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
+
+ __set_bit(bank, this_cpu_ptr(mce_poll_banks));
+ storm->banks[bank].in_storm_mode = true;
+
+ /*
+ * If this is the first bank on this CPU to enter storm mode
+ * start polling.
+ */
+ if (++storm->stormy_bank_count == 1)
+ mce_timer_kick(true);
+}
+
+void cmci_storm_end(unsigned int bank)
+{
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
+
+ __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
+ storm->banks[bank].history = 0;
+ storm->banks[bank].in_storm_mode = false;
+
+ /* If no banks left in storm mode, stop polling. */
+ if (!this_cpu_dec_return(storm_desc.stormy_bank_count))
+ mce_timer_kick(false);
+}
+
+void mce_track_storm(struct mce *mce)
+{
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
+ unsigned long now = jiffies, delta;
+ unsigned int shift = 1;
+ u64 history = 0;
+
+ /* No tracking needed for banks that do not support CMCI */
+ if (storm->banks[mce->bank].poll_only)
+ return;
+
+ /*
+ * When a bank is in storm mode it is polled once per second and
+ * the history mask will record about the last minute of poll results.
+ * If it is not in storm mode, then the bank is only checked when
+ * there is a CMCI interrupt. Check how long it has been since
+ * this bank was last checked, and adjust the amount of "shift"
+ * to apply to history.
+ */
+ if (!storm->banks[mce->bank].in_storm_mode) {
+ delta = now - storm->banks[mce->bank].timestamp;
+ shift = (delta + HZ) / HZ;
+ }
+
+ /* If it has been a long time since the last poll, clear history. */
+ if (shift < NUM_HISTORY_BITS)
+ history = storm->banks[mce->bank].history << shift;
+
+ storm->banks[mce->bank].timestamp = now;
+
+ /* History keeps track of corrected errors. VAL=1 && UC=0 */
+ if ((mce->status & MCI_STATUS_VAL) && mce_is_correctable(mce))
+ history |= 1;
+
+ storm->banks[mce->bank].history = history;
+
+ if (storm->banks[mce->bank].in_storm_mode) {
+ if (history & GENMASK_ULL(STORM_END_POLL_THRESHOLD, 0))
+ return;
+ printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm subsided\n", smp_processor_id(), mce->bank);
+ mce_handle_storm(mce->bank, false);
+ cmci_storm_end(mce->bank);
+ } else {
+ if (hweight64(history) < STORM_BEGIN_THRESHOLD)
+ return;
+ printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm detected\n", smp_processor_id(), mce->bank);
+ mce_handle_storm(mce->bank, true);
+ cmci_storm_begin(mce->bank);
+ }
+}
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 070426b9895f..857e608af641 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -370,14 +370,14 @@ static __init struct microcode_intel *get_microcode_blob(struct ucode_cpu_info *
{
struct cpio_data cp;
+ intel_collect_cpu_info(&uci->cpu_sig);
+
if (!load_builtin_intel_microcode(&cp))
cp = find_microcode_in_initrd(ucode_path);
if (!(cp.data && cp.size))
return NULL;
- intel_collect_cpu_info(&uci->cpu_sig);
-
return scan_microcode(cp.data, cp.size, uci, save);
}
@@ -410,13 +410,13 @@ void __init load_ucode_intel_bsp(struct early_load_data *ed)
{
struct ucode_cpu_info uci;
- ed->old_rev = intel_get_microcode_revision();
-
uci.mc = get_microcode_blob(&uci, false);
- if (uci.mc && apply_microcode_early(&uci) == UCODE_UPDATED)
- ucode_patch_va = UCODE_BSP_LOADED;
+ ed->old_rev = uci.cpu_sig.rev;
- ed->new_rev = uci.cpu_sig.rev;
+ if (uci.mc && apply_microcode_early(&uci) == UCODE_UPDATED) {
+ ucode_patch_va = UCODE_BSP_LOADED;
+ ed->new_rev = uci.cpu_sig.rev;
+ }
}
void load_ucode_intel_ap(void)
@@ -457,12 +457,6 @@ static enum ucode_state apply_microcode_late(int cpu)
if (ret != UCODE_UPDATED && ret != UCODE_OK)
return ret;
- if (!cpu && uci->cpu_sig.rev != cur_rev) {
- pr_info("Updated to revision 0x%x, date = %04x-%02x-%02x\n",
- uci->cpu_sig.rev, mc->hdr.date & 0xffff, mc->hdr.date >> 24,
- (mc->hdr.date >> 16) & 0xff);
- }
-
cpu_data(cpu).microcode = uci->cpu_sig.rev;
if (!cpu)
boot_cpu_data.microcode = uci->cpu_sig.rev;
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 2d6aa5d2e3d7..d3524778a545 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -428,6 +428,10 @@ void __init mtrr_copy_map(void)
* from the x86_init.hyper.init_platform() hook. It can be called only once.
* The MTRR state can't be changed afterwards. To ensure that, X86_FEATURE_MTRR
* is cleared.
+ *
+ * @var: MTRR variable range array to use
+ * @num_var: length of the @var array
+ * @def_type: default caching type
*/
void mtrr_overwrite_state(struct mtrr_var_range *var, unsigned int num_var,
mtrr_type def_type)
@@ -492,13 +496,15 @@ static u8 type_merge(u8 type, u8 new_type, u8 *uniform)
/**
* mtrr_type_lookup - look up memory type in MTRR
*
+ * @start: Begin of the physical address range
+ * @end: End of the physical address range
+ * @uniform: output argument:
+ * - 1: the returned MTRR type is valid for the whole region
+ * - 0: otherwise
+ *
* Return Values:
* MTRR_TYPE_(type) - The effective MTRR type for the region
* MTRR_TYPE_INVALID - MTRR is disabled
- *
- * Output Argument:
- * uniform - Set to 1 when the returned MTRR type is valid for the whole
- * region, set to 0 else.
*/
u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform)
{
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
index 5d390df21440..b65ab214bdf5 100644
--- a/arch/x86/kernel/cpu/sgx/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -581,7 +581,7 @@ err_out:
*
* Flush any outstanding enqueued EADD operations and perform EINIT. The
* Launch Enclave Public Key Hash MSRs are rewritten as necessary to match
- * the enclave's MRSIGNER, which is caculated from the provided sigstruct.
+ * the enclave's MRSIGNER, which is calculated from the provided sigstruct.
*
* Return:
* - 0: Success.
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index c92d88680dbf..b6b044356f1b 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -170,7 +170,7 @@ static int elf_header_exclude_ranges(struct crash_mem *cmem)
int ret = 0;
/* Exclude the low 1M because it is always reserved */
- ret = crash_exclude_mem_range(cmem, 0, (1<<20)-1);
+ ret = crash_exclude_mem_range(cmem, 0, SZ_1M - 1);
if (ret)
return ret;
@@ -198,8 +198,8 @@ static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg)
}
/* Prepare elf headers. Return addr and size */
-static int prepare_elf_headers(struct kimage *image, void **addr,
- unsigned long *sz, unsigned long *nr_mem_ranges)
+static int prepare_elf_headers(void **addr, unsigned long *sz,
+ unsigned long *nr_mem_ranges)
{
struct crash_mem *cmem;
int ret;
@@ -221,7 +221,7 @@ static int prepare_elf_headers(struct kimage *image, void **addr,
*nr_mem_ranges = cmem->nr_ranges;
/* By default prepare 64bit headers */
- ret = crash_prepare_elf64_headers(cmem, IS_ENABLED(CONFIG_X86_64), addr, sz);
+ ret = crash_prepare_elf64_headers(cmem, IS_ENABLED(CONFIG_X86_64), addr, sz);
out:
vfree(cmem);
@@ -349,7 +349,7 @@ int crash_load_segments(struct kimage *image)
.buf_max = ULONG_MAX, .top_down = false };
/* Prepare elf headers and add a segment */
- ret = prepare_elf_headers(image, &kbuf.buffer, &kbuf.bufsz, &pnum);
+ ret = prepare_elf_headers(&kbuf.buffer, &kbuf.bufsz, &pnum);
if (ret)
return ret;
@@ -386,8 +386,8 @@ int crash_load_segments(struct kimage *image)
if (ret)
return ret;
image->elf_load_addr = kbuf.mem;
- pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
- image->elf_load_addr, kbuf.bufsz, kbuf.memsz);
+ kexec_dprintk("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ image->elf_load_addr, kbuf.bufsz, kbuf.memsz);
return ret;
}
@@ -452,7 +452,7 @@ void arch_crash_handle_hotplug_event(struct kimage *image)
* Create the new elfcorehdr reflecting the changes to CPU and/or
* memory resources.
*/
- if (prepare_elf_headers(image, &elfbuf, &elfsz, &nr_mem_ranges)) {
+ if (prepare_elf_headers(&elfbuf, &elfsz, &nr_mem_ranges)) {
pr_err("unable to create new elfcorehdr");
goto out;
}
diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c
index 794e70151203..a06b876bbf2d 100644
--- a/arch/x86/kernel/fpu/bugs.c
+++ b/arch/x86/kernel/fpu/bugs.c
@@ -2,6 +2,7 @@
/*
* x86 FPU bug checks:
*/
+#include <asm/cpufeature.h>
#include <asm/fpu/api.h>
/*
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index a21a4d0ecc34..520deb411a70 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -308,7 +308,7 @@ EXPORT_SYMBOL_GPL(fpu_update_guest_xfd);
* Must be invoked from KVM after a VMEXIT before enabling interrupts when
* XFD write emulation is disabled. This is required because the guest can
* freely modify XFD and the state at VMEXIT is not guaranteed to be the
- * same as the state on VMENTER. So software state has to be udpated before
+ * same as the state on VMENTER. So software state has to be updated before
* any operation which depends on it can take place.
*
* Note: It can be invoked unconditionally even when write emulation is
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 05a110c97111..dc0956067944 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -71,9 +71,9 @@ EXPORT_SYMBOL(vmemmap_base);
* GDT used on the boot CPU before switching to virtual addresses.
*/
static struct desc_struct startup_gdt[GDT_ENTRIES] __initdata = {
- [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
- [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
- [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(DESC_CODE32, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE64, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA64, 0, 0xfffff),
};
/*
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 086a2c3aaaa0..d4918d03efb4 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -114,6 +114,28 @@ SYM_CODE_START_NOALIGN(startup_64)
/* Form the CR3 value being sure to include the CR3 modifier */
addq $(early_top_pgt - __START_KERNEL_map), %rax
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ mov %rax, %rdi
+ mov %rax, %r14
+
+ addq phys_base(%rip), %rdi
+
+ /*
+ * For SEV guests: Verify that the C-bit is correct. A malicious
+ * hypervisor could lie about the C-bit position to perform a ROP
+ * attack on the guest by writing to the unencrypted stack and wait for
+ * the next RET instruction.
+ */
+ call sev_verify_cbit
+
+ /*
+ * Restore CR3 value without the phys_base which will be added
+ * below, before writing %cr3.
+ */
+ mov %r14, %rax
+#endif
+
jmp 1f
SYM_CODE_END(startup_64)
@@ -182,7 +204,7 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
/* Enable PAE mode, PSE, PGE and LA57 */
orl $(X86_CR4_PAE | X86_CR4_PSE | X86_CR4_PGE), %ecx
#ifdef CONFIG_X86_5LEVEL
- testl $1, __pgtable_l5_enabled(%rip)
+ testb $1, __pgtable_l5_enabled(%rip)
jz 1f
orl $X86_CR4_LA57, %ecx
1:
@@ -193,21 +215,12 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
addq phys_base(%rip), %rax
/*
- * For SEV guests: Verify that the C-bit is correct. A malicious
- * hypervisor could lie about the C-bit position to perform a ROP
- * attack on the guest by writing to the unencrypted stack and wait for
- * the next RET instruction.
- */
- movq %rax, %rdi
- call sev_verify_cbit
-
- /*
* Switch to new page-table
*
* For the boot CPU this switches to early_top_pgt which still has the
- * indentity mappings present. The secondary CPUs will switch to the
+ * identity mappings present. The secondary CPUs will switch to the
* init_top_pgt here, away from the trampoline_pgd and unmap the
- * indentity mapped ranges.
+ * identity mapped ranges.
*/
movq %rax, %cr3
@@ -255,6 +268,22 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
testl $X2APIC_ENABLE, %eax
jnz .Lread_apicid_msr
+#ifdef CONFIG_X86_X2APIC
+ /*
+ * If system is in X2APIC mode then MMIO base might not be
+ * mapped causing the MMIO read below to fault. Faults can't
+ * be handled at that point.
+ */
+ cmpl $0, x2apic_mode(%rip)
+ jz .Lread_apicid_mmio
+
+ /* Force the AP into X2APIC mode. */
+ orl $X2APIC_ENABLE, %eax
+ wrmsr
+ jmp .Lread_apicid_msr
+#endif
+
+.Lread_apicid_mmio:
/* Read the APIC ID from the fix-mapped MMIO space. */
movq apic_mmio_base(%rip), %rcx
addq $APIC_ID, %rcx
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 41eecf180b7f..8ff2bf921519 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -707,7 +707,7 @@ static void __init hpet_select_clockevents(void)
hpet_base.nr_clockevents = 0;
- /* No point if MSI is disabled or CPU has an Always Runing APIC Timer */
+ /* No point if MSI is disabled or CPU has an Always Running APIC Timer */
if (hpet_msi_disable || boot_cpu_has(X86_FEATURE_ARAT))
return;
@@ -965,7 +965,7 @@ static bool __init mwait_pc10_supported(void)
* and per CPU timer interrupts.
*
* The probability that this problem is going to be solved in the
- * forseeable future is close to zero, so the kernel has to be cluttered
+ * foreseeable future is close to zero, so the kernel has to be cluttered
* with heuristics to keep up with the ever growing amount of hardware and
* firmware trainwrecks. Hopefully some day hardware people will understand
* that the approach of "This can be fixed in software" is not sustainable.
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 8857abc706e4..660b601f1d6c 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -121,7 +121,7 @@ static const __initconst struct idt_data def_idts[] = {
static const struct idt_data ia32_idt[] __initconst = {
#if defined(CONFIG_IA32_EMULATION)
- SYSG(IA32_SYSCALL_VECTOR, entry_INT80_compat),
+ SYSG(IA32_SYSCALL_VECTOR, asm_int80_emulation),
#elif defined(CONFIG_X86_32)
SYSG(IA32_SYSCALL_VECTOR, entry_INT80_32),
#endif
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index a61c12c01270..2a422e00ed4b 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -82,7 +82,7 @@ static int setup_cmdline(struct kimage *image, struct boot_params *params,
cmdline_ptr[cmdline_len - 1] = '\0';
- pr_debug("Final command line is: %s\n", cmdline_ptr);
+ kexec_dprintk("Final command line is: %s\n", cmdline_ptr);
cmdline_ptr_phys = bootparams_load_addr + cmdline_offset;
cmdline_low_32 = cmdline_ptr_phys & 0xffffffffUL;
cmdline_ext_32 = cmdline_ptr_phys >> 32;
@@ -272,7 +272,12 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params,
nr_e820_entries = params->e820_entries;
+ kexec_dprintk("E820 memmap:\n");
for (i = 0; i < nr_e820_entries; i++) {
+ kexec_dprintk("%016llx-%016llx (%d)\n",
+ params->e820_table[i].addr,
+ params->e820_table[i].addr + params->e820_table[i].size - 1,
+ params->e820_table[i].type);
if (params->e820_table[i].type != E820_TYPE_RAM)
continue;
start = params->e820_table[i].addr;
@@ -424,7 +429,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
* command line. Make sure it does not overflow
*/
if (cmdline_len + MAX_ELFCOREHDR_STR_LEN > header->cmdline_size) {
- pr_debug("Appending elfcorehdr=<addr> to command line exceeds maximum allowed length\n");
+ pr_err("Appending elfcorehdr=<addr> to command line exceeds maximum allowed length\n");
return ERR_PTR(-EINVAL);
}
@@ -445,7 +450,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
return ERR_PTR(ret);
}
- pr_debug("Loaded purgatory at 0x%lx\n", pbuf.mem);
+ kexec_dprintk("Loaded purgatory at 0x%lx\n", pbuf.mem);
/*
@@ -490,8 +495,8 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
if (ret)
goto out_free_params;
bootparam_load_addr = kbuf.mem;
- pr_debug("Loaded boot_param, command line and misc at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
- bootparam_load_addr, kbuf.bufsz, kbuf.bufsz);
+ kexec_dprintk("Loaded boot_param, command line and misc at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ bootparam_load_addr, kbuf.bufsz, kbuf.memsz);
/* Load kernel */
kbuf.buffer = kernel + kern16_size;
@@ -505,8 +510,8 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
goto out_free_params;
kernel_load_addr = kbuf.mem;
- pr_debug("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
- kernel_load_addr, kbuf.bufsz, kbuf.memsz);
+ kexec_dprintk("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ kernel_load_addr, kbuf.bufsz, kbuf.memsz);
/* Load initrd high */
if (initrd) {
@@ -520,8 +525,8 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
goto out_free_params;
initrd_load_addr = kbuf.mem;
- pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
- initrd_load_addr, initrd_len, initrd_len);
+ kexec_dprintk("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ initrd_load_addr, initrd_len, initrd_len);
setup_initrd(params, initrd_load_addr, initrd_len);
}
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index e8babebad7b8..a0ce46c0a2d8 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -576,7 +576,8 @@ static void kprobe_emulate_call_indirect(struct kprobe *p, struct pt_regs *regs)
{
unsigned long offs = addrmode_regoffs[p->ainsn.indirect.reg];
- int3_emulate_call(regs, regs_get_register(regs, offs));
+ int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + p->ainsn.size);
+ int3_emulate_jmp(regs, regs_get_register(regs, offs));
}
NOKPROBE_SYMBOL(kprobe_emulate_call_indirect);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 0ddb3bd0f1aa..dfe9945b9bec 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -803,8 +803,8 @@ extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax)\n\t" \
"setne %al\n\t"
-DEFINE_PARAVIRT_ASM(__raw_callee_save___kvm_vcpu_is_preempted,
- PV_VCPU_PREEMPTED_ASM, .text);
+DEFINE_ASM_FUNC(__raw_callee_save___kvm_vcpu_is_preempted,
+ PV_VCPU_PREEMPTED_ASM, .text);
#endif
static void __init kvm_guest_init(void)
@@ -942,7 +942,7 @@ static void __init kvm_init_platform(void)
* Reset the host's shared pages list related to kernel
* specific page encryption status settings before we load a
* new kernel by kexec. Reset the page encryption status
- * during early boot intead of just before kexec to avoid SMP
+ * during early boot instead of just before kexec to avoid SMP
* races during kvm_pv_guest_cpu_reboot().
* NOTE: We cannot reset the complete shared pages list
* here as we need to retain the UEFI/OVMF firmware
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index fb8f52149be9..a95d0900e8c6 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -42,7 +42,7 @@ static int __init parse_no_kvmclock_vsyscall(char *arg)
}
early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
-/* Aligned to page sizes to match whats mapped via vsyscalls to userspace */
+/* Aligned to page sizes to match what's mapped via vsyscalls to userspace */
#define HVC_BOOT_ARRAY_SIZE \
(PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info))
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index adc67f98819a..7a814b41402d 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -7,7 +7,7 @@
* This handles calls from both 32bit and 64bit mode.
*
* Lock order:
- * contex.ldt_usr_sem
+ * context.ldt_usr_sem
* mmap_lock
* context.lock
*/
@@ -49,7 +49,7 @@ void load_mm_ldt(struct mm_struct *mm)
/*
* Any change to mm->context.ldt is followed by an IPI to all
* CPUs with the mm active. The LDT will not be freed until
- * after the IPI is handled by all such CPUs. This means that,
+ * after the IPI is handled by all such CPUs. This means that
* if the ldt_struct changes before we return, the values we see
* will be safe, and the new values will be loaded before we run
* any user code.
@@ -685,7 +685,7 @@ SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr ,
}
/*
* The SYSCALL_DEFINE() macros give us an 'unsigned long'
- * return type, but tht ABI for sys_modify_ldt() expects
+ * return type, but the ABI for sys_modify_ldt() expects
* 'int'. This cast gives us an int-sized value in %rax
* for the return code. The 'unsigned' is necessary so
* the compiler does not try to sign-extend the negative
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 1a3e2c05a8a5..bc0a5348b4a6 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -42,12 +42,9 @@ struct init_pgtable_data {
static int mem_region_callback(struct resource *res, void *arg)
{
struct init_pgtable_data *data = arg;
- unsigned long mstart, mend;
-
- mstart = res->start;
- mend = mstart + resource_size(res) - 1;
- return kernel_ident_mapping_init(data->info, data->level4p, mstart, mend);
+ return kernel_ident_mapping_init(data->info, data->level4p,
+ res->start, res->end + 1);
}
static int
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 5f71a0cf4399..e18914c0e38a 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -276,7 +276,7 @@ int module_finalize(const Elf_Ehdr *hdr,
struct module *me)
{
const Elf_Shdr *s, *alt = NULL, *locks = NULL,
- *para = NULL, *orc = NULL, *orc_ip = NULL,
+ *orc = NULL, *orc_ip = NULL,
*retpolines = NULL, *returns = NULL, *ibt_endbr = NULL,
*calls = NULL, *cfi = NULL;
char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
@@ -286,8 +286,6 @@ int module_finalize(const Elf_Ehdr *hdr,
alt = s;
if (!strcmp(".smp_locks", secstrings + s->sh_name))
locks = s;
- if (!strcmp(".parainstructions", secstrings + s->sh_name))
- para = s;
if (!strcmp(".orc_unwind", secstrings + s->sh_name))
orc = s;
if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name))
@@ -304,14 +302,6 @@ int module_finalize(const Elf_Ehdr *hdr,
ibt_endbr = s;
}
- /*
- * See alternative_instructions() for the ordering rules between the
- * various patching types.
- */
- if (para) {
- void *pseg = (void *)para->sh_addr;
- apply_paravirt(pseg, pseg + para->sh_size);
- }
if (retpolines || cfi) {
void *rseg = NULL, *cseg = NULL;
unsigned int rsize = 0, csize = 0;
@@ -341,7 +331,7 @@ int module_finalize(const Elf_Ehdr *hdr,
void *aseg = (void *)alt->sh_addr;
apply_alternatives(aseg, aseg + alt->sh_size);
}
- if (calls || para) {
+ if (calls || alt) {
struct callthunk_sites cs = {};
if (calls) {
@@ -349,9 +339,9 @@ int module_finalize(const Elf_Ehdr *hdr,
cs.call_end = (void *)calls->sh_addr + calls->sh_size;
}
- if (para) {
- cs.pv_start = (void *)para->sh_addr;
- cs.pv_end = (void *)para->sh_addr + para->sh_size;
+ if (alt) {
+ cs.alt_start = (void *)alt->sh_addr;
+ cs.alt_end = (void *)alt->sh_addr + alt->sh_size;
}
callthunks_patch_module_calls(&cs, me);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 97f1436c1a20..5358d43886ad 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -34,14 +34,8 @@
#include <asm/io_bitmap.h>
#include <asm/gsseg.h>
-/*
- * nop stub, which must not clobber anything *including the stack* to
- * avoid confusing the entry prologues.
- */
-DEFINE_PARAVIRT_ASM(_paravirt_nop, "", .entry.text);
-
/* stub always returning 0. */
-DEFINE_PARAVIRT_ASM(paravirt_ret0, "xor %eax,%eax", .entry.text);
+DEFINE_ASM_FUNC(paravirt_ret0, "xor %eax,%eax", .entry.text);
void __init default_banner(void)
{
@@ -49,26 +43,12 @@ void __init default_banner(void)
pv_info.name);
}
-/* Undefined instruction for dealing with missing ops pointers. */
-noinstr void paravirt_BUG(void)
-{
- BUG();
-}
-
-static unsigned paravirt_patch_call(void *insn_buff, const void *target,
- unsigned long addr, unsigned len)
-{
- __text_gen_insn(insn_buff, CALL_INSN_OPCODE,
- (void *)addr, target, CALL_INSN_SIZE);
- return CALL_INSN_SIZE;
-}
-
#ifdef CONFIG_PARAVIRT_XXL
-DEFINE_PARAVIRT_ASM(_paravirt_ident_64, "mov %rdi, %rax", .text);
-DEFINE_PARAVIRT_ASM(pv_native_save_fl, "pushf; pop %rax", .noinstr.text);
-DEFINE_PARAVIRT_ASM(pv_native_irq_disable, "cli", .noinstr.text);
-DEFINE_PARAVIRT_ASM(pv_native_irq_enable, "sti", .noinstr.text);
-DEFINE_PARAVIRT_ASM(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text);
+DEFINE_ASM_FUNC(_paravirt_ident_64, "mov %rdi, %rax", .text);
+DEFINE_ASM_FUNC(pv_native_save_fl, "pushf; pop %rax", .noinstr.text);
+DEFINE_ASM_FUNC(pv_native_irq_disable, "cli", .noinstr.text);
+DEFINE_ASM_FUNC(pv_native_irq_enable, "sti", .noinstr.text);
+DEFINE_ASM_FUNC(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text);
#endif
DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key);
@@ -85,28 +65,6 @@ static void native_tlb_remove_table(struct mmu_gather *tlb, void *table)
tlb_remove_page(tlb, table);
}
-unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr,
- unsigned int len)
-{
- /*
- * Neat trick to map patch type back to the call within the
- * corresponding structure.
- */
- void *opfunc = *((void **)&pv_ops + type);
- unsigned ret;
-
- if (opfunc == NULL)
- /* If there's no function, patch it with paravirt_BUG() */
- ret = paravirt_patch_call(insn_buff, paravirt_BUG, addr, len);
- else if (opfunc == _paravirt_nop)
- ret = 0;
- else
- /* Otherwise call the function. */
- ret = paravirt_patch_call(insn_buff, opfunc, addr, len);
-
- return ret;
-}
-
struct static_key paravirt_steal_enabled;
struct static_key paravirt_steal_rq_enabled;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index b6f4e8399fca..ab49ade31b0d 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -477,7 +477,7 @@ void native_tss_update_io_bitmap(void)
/*
* Make sure that the TSS limit is covering the IO bitmap. It might have
* been cut down by a VMEXIT to 0x67 which would cause a subsequent I/O
- * access from user space to trigger a #GP because tbe bitmap is outside
+ * access from user space to trigger a #GP because the bitmap is outside
* the TSS limit.
*/
refresh_tss_limit();
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 1526747bedf2..ec2c21a1844e 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -226,8 +226,6 @@ static void __init reserve_brk(void)
_brk_start = 0;
}
-u64 relocated_ramdisk;
-
#ifdef CONFIG_BLK_DEV_INITRD
static u64 __init get_ramdisk_image(void)
@@ -261,7 +259,7 @@ static void __init relocate_initrd(void)
u64 area_size = PAGE_ALIGN(ramdisk_size);
/* We need to move the initrd down into directly mapped mem */
- relocated_ramdisk = memblock_phys_alloc_range(area_size, PAGE_SIZE, 0,
+ u64 relocated_ramdisk = memblock_phys_alloc_range(area_size, PAGE_SIZE, 0,
PFN_PHYS(max_pfn_mapped));
if (!relocated_ramdisk)
panic("Cannot find place for new RAMDISK of size %lld\n",
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 2c97bf7b56ae..b30d6e180df7 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -106,8 +106,8 @@ void __init pcpu_populate_pte(unsigned long addr)
static inline void setup_percpu_segment(int cpu)
{
#ifdef CONFIG_X86_32
- struct desc_struct d = GDT_ENTRY_INIT(0x8092, per_cpu_offset(cpu),
- 0xFFFFF);
+ struct desc_struct d = GDT_ENTRY_INIT(DESC_DATA32,
+ per_cpu_offset(cpu), 0xFFFFF);
write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PERCPU, &d, DESCTYPE_S);
#endif
diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c
index ccb0915e84e1..1d24ec679915 100644
--- a/arch/x86/kernel/sev-shared.c
+++ b/arch/x86/kernel/sev-shared.c
@@ -96,7 +96,7 @@ static void __noreturn sev_es_terminate(unsigned int set, unsigned int reason)
/* Tell the hypervisor what went wrong. */
val |= GHCB_SEV_TERM_REASON(set, reason);
- /* Request Guest Termination from Hypvervisor */
+ /* Request Guest Termination from Hypervisor */
sev_es_wr_ghcb_msr(val);
VMGEXIT();
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 70472eebe719..c67285824e82 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -1234,10 +1234,6 @@ void setup_ghcb(void)
if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
return;
- /* First make sure the hypervisor talks a supported protocol. */
- if (!sev_es_negotiate_protocol())
- sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
-
/*
* Check whether the runtime #VC exception handler is active. It uses
* the per-CPU GHCB page which is set up by sev_es_init_vc_handling().
@@ -1255,6 +1251,13 @@ void setup_ghcb(void)
}
/*
+ * Make sure the hypervisor talks a supported protocol.
+ * This gets called only in the BSP boot phase.
+ */
+ if (!sev_es_negotiate_protocol())
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
+
+ /*
* Clear the boot_ghcb. The first exception comes in before the bss
* section is cleared.
*/
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 65fe2094da59..31b6f5dddfc2 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -27,6 +27,7 @@
#include <linux/context_tracking.h>
#include <linux/entry-common.h>
#include <linux/syscalls.h>
+#include <linux/rseq.h>
#include <asm/processor.h>
#include <asm/ucontext.h>
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 2cc2aa120b4b..3f57ce68a3f1 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -757,6 +757,7 @@ const struct cpumask *cpu_clustergroup_mask(int cpu)
{
return cpu_l2c_shared_mask(cpu);
}
+EXPORT_SYMBOL_GPL(cpu_clustergroup_mask);
static void impress_friends(void)
{
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index c876f1d36a81..b0737a15c470 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -37,6 +37,7 @@
#include <linux/nmi.h>
#include <linux/mm.h>
#include <linux/smp.h>
+#include <linux/cpu.h>
#include <linux/io.h>
#include <linux/hardirq.h>
#include <linux/atomic.h>
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 54a5596adaa6..a349dbfc6d5a 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -267,19 +267,6 @@ SECTIONS
}
#endif
- /*
- * start address and size of operations which during runtime
- * can be patched with virtualization friendly instructions or
- * baremetal native ones. Think page table operations.
- * Details in paravirt_types.h
- */
- . = ALIGN(8);
- .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
- __parainstructions = .;
- *(.parainstructions)
- __parainstructions_end = .;
- }
-
#ifdef CONFIG_RETPOLINE
/*
* List of instructions that call/jmp/jcc to retpoline thunks
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index dda6fc4cfae8..42d3f47f4c07 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -105,7 +105,7 @@ static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
/*
* If the index isn't significant, use the first entry with a
- * matching function. It's userspace's responsibilty to not
+ * matching function. It's userspace's responsibility to not
* provide "duplicate" entries in all cases.
*/
if (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) || e->index == index)
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
index ee8c4c3496ed..eea6ea7f14af 100644
--- a/arch/x86/kvm/debugfs.c
+++ b/arch/x86/kvm/debugfs.c
@@ -182,6 +182,7 @@ static int kvm_mmu_rmaps_stat_release(struct inode *inode, struct file *file)
}
static const struct file_operations mmu_rmaps_stat_fops = {
+ .owner = THIS_MODULE,
.open = kvm_mmu_rmaps_stat_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 238afd7335e4..4943f6b2bbee 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -2388,7 +2388,7 @@ static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *h
if (!eventfd)
return HV_STATUS_INVALID_PORT_ID;
- eventfd_signal(eventfd, 1);
+ eventfd_signal(eventfd);
return HV_STATUS_SUCCESS;
}
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index c57e181bba21..0b1f991b9a31 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -987,7 +987,7 @@ static void pte_list_desc_remove_entry(struct kvm *kvm,
/*
* The head descriptor is empty. If there are no tail descriptors,
- * nullify the rmap head to mark the list as emtpy, else point the rmap
+ * nullify the rmap head to mark the list as empty, else point the rmap
* head at the next descriptor, i.e. the new head.
*/
if (!head_desc->more)
@@ -6544,7 +6544,7 @@ void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, false);
/*
- * A TLB flush is unnecessary at this point for the same resons as in
+ * A TLB flush is unnecessary at this point for the same reasons as in
* kvm_mmu_slot_try_split_huge_pages().
*/
}
diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index bd30ebfb2f2c..04c247bfe318 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -146,7 +146,7 @@ static bool try_step_up(struct tdp_iter *iter)
* Step to the next SPTE in a pre-order traversal of the paging structure.
* To get to the next SPTE, the iterator either steps down towards the goal
* GFN, if at a present, non-last-level SPTE, or over to a SPTE mapping a
- * highter GFN.
+ * higher GFN.
*
* The basic algorithm is as follows:
* 1. If the current SPTE is a non-last-level SPTE, step down into the page
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 4900c078045a..6ee925d66648 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -2972,6 +2972,25 @@ static void sev_es_vcpu_after_set_cpuid(struct vcpu_svm *svm)
set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, v_tsc_aux, v_tsc_aux);
}
+
+ /*
+ * For SEV-ES, accesses to MSR_IA32_XSS should not be intercepted if
+ * the host/guest supports its use.
+ *
+ * guest_can_use() checks a number of requirements on the host/guest to
+ * ensure that MSR_IA32_XSS is available, but it might report true even
+ * if X86_FEATURE_XSAVES isn't configured in the guest to ensure host
+ * MSR_IA32_XSS is always properly restored. For SEV-ES, it is better
+ * to further check that the guest CPUID actually supports
+ * X86_FEATURE_XSAVES so that accesses to MSR_IA32_XSS by misbehaved
+ * guests will still get intercepted and caught in the normal
+ * kvm_emulate_rdmsr()/kvm_emulated_wrmsr() paths.
+ */
+ if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 1, 1);
+ else
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_XSS, 0, 0);
}
void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 712146312358..7fb51424fc74 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -103,6 +103,7 @@ static const struct svm_direct_access_msrs {
{ .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
{ .index = MSR_IA32_LASTINTFROMIP, .always = false },
{ .index = MSR_IA32_LASTINTTOIP, .always = false },
+ { .index = MSR_IA32_XSS, .always = false },
{ .index = MSR_EFER, .always = false },
{ .index = MSR_IA32_CR_PAT, .always = false },
{ .index = MSR_AMD64_SEV_ES_GHCB, .always = true },
@@ -1855,15 +1856,17 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
bool old_paging = is_paging(vcpu);
#ifdef CONFIG_X86_64
- if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) {
+ if (vcpu->arch.efer & EFER_LME) {
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
vcpu->arch.efer |= EFER_LMA;
- svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
+ if (!vcpu->arch.guest_state_protected)
+ svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
}
if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
vcpu->arch.efer &= ~EFER_LMA;
- svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
+ if (!vcpu->arch.guest_state_protected)
+ svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
}
}
#endif
@@ -4741,7 +4744,7 @@ static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
* Emulation is possible for SEV guests if and only if a prefilled
* buffer containing the bytes of the intercepted instruction is
* available. SEV guest memory is encrypted with a guest specific key
- * and cannot be decrypted by KVM, i.e. KVM would read cyphertext and
+ * and cannot be decrypted by KVM, i.e. KVM would read ciphertext and
* decode garbage.
*
* If KVM is NOT trying to simply skip an instruction, inject #UD if
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index be67ab7fdd10..c409f934c377 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -30,7 +30,7 @@
#define IOPM_SIZE PAGE_SIZE * 3
#define MSRPM_SIZE PAGE_SIZE * 2
-#define MAX_DIRECT_ACCESS_MSRS 46
+#define MAX_DIRECT_ACCESS_MSRS 47
#define MSRPM_OFFSETS 32
extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
extern bool npt_enabled;
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index c5ec0ef51ff7..65826fe23f33 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -6561,7 +6561,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
* code was changed such that flag signals vmcs12 should
* be copied into eVMCS in guest memory.
*
- * To preserve backwards compatability, allow user
+ * To preserve backwards compatibility, allow user
* to set this flag even when there is no VMXON region.
*/
if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index be20a60047b1..e0f86f11c345 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1809,7 +1809,7 @@ static void vmx_inject_exception(struct kvm_vcpu *vcpu)
* do generate error codes with bits 31:16 set, and so KVM's
* ABI lets userspace shove in arbitrary 32-bit values. Drop
* the upper bits to avoid VM-Fail, losing information that
- * does't really exist is preferable to killing the VM.
+ * doesn't really exist is preferable to killing the VM.
*/
vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code);
intr_info |= INTR_INFO_DELIVER_CODE_MASK;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2c924075f6f1..cec0fc2a4b1c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5518,8 +5518,8 @@ static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
struct kvm_xsave *guest_xsave)
{
- return kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
- sizeof(guest_xsave->region));
+ kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
+ sizeof(guest_xsave->region));
}
static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
@@ -10165,7 +10165,7 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu)
*
* But, if a VM-Exit occurs during instruction execution, and KVM does NOT skip
* the instruction or inject an exception, then KVM can incorrecty inject a new
- * asynchrounous event if the event became pending after the CPU fetched the
+ * asynchronous event if the event became pending after the CPU fetched the
* instruction (in the guest). E.g. if a page fault (#PF, #NPF, EPT violation)
* occurs and is resolved by KVM, a coincident NMI, SMI, IRQ, etc... can be
* injected on the restarted instruction instead of being deferred until the
@@ -10186,7 +10186,7 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
int r;
/*
- * Process nested events first, as nested VM-Exit supercedes event
+ * Process nested events first, as nested VM-Exit supersedes event
* re-injection. If there's an event queued for re-injection, it will
* be saved into the appropriate vmc{b,s}12 fields on nested VM-Exit.
*/
@@ -10884,7 +10884,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
/*
* Assert that vCPU vs. VM APICv state is consistent. An APICv
* update must kick and wait for all vCPUs before toggling the
- * per-VM state, and responsing vCPUs must wait for the update
+ * per-VM state, and responding vCPUs must wait for the update
* to complete before servicing KVM_REQ_APICV_UPDATE.
*/
WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
@@ -13031,7 +13031,10 @@ bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
if (vcpu->arch.guest_state_protected)
return true;
- return vcpu->arch.preempted_in_kernel;
+ if (vcpu != kvm_get_running_vcpu())
+ return vcpu->arch.preempted_in_kernel;
+
+ return static_call(kvm_x86_get_cpl)(vcpu) == 0;
}
unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index e53fad915a62..523bb6df5ac9 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -2088,7 +2088,7 @@ static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r)
if (ret < 0 && ret != -ENOTCONN)
return false;
} else {
- eventfd_signal(evtchnfd->deliver.eventfd.ctx, 1);
+ eventfd_signal(evtchnfd->deliver.eventfd.ctx);
}
*r = 0;
diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c
index 7c48ff4ae8d1..7af743bd3b13 100644
--- a/arch/x86/lib/cache-smp.c
+++ b/arch/x86/lib/cache-smp.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+#include <asm/paravirt.h>
#include <linux/smp.h>
#include <linux/export.h>
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index cea25ca8b8cf..c9dae65ac01b 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -11,26 +11,23 @@
#include <asm/checksum.h>
#include <asm/word-at-a-time.h>
-static inline unsigned short from32to16(unsigned a)
+static inline __wsum csum_finalize_sum(u64 temp64)
{
- unsigned short b = a >> 16;
- asm("addw %w2,%w0\n\t"
- "adcw $0,%w0\n"
- : "=r" (b)
- : "0" (b), "r" (a));
- return b;
+ return (__force __wsum)((temp64 + ror64(temp64, 32)) >> 32);
}
-static inline __wsum csum_tail(u64 temp64, int odd)
+static inline unsigned long update_csum_40b(unsigned long sum, const unsigned long m[5])
{
- unsigned int result;
-
- result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
- if (unlikely(odd)) {
- result = from32to16(result);
- result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
- }
- return (__force __wsum)result;
+ asm("addq %1,%0\n\t"
+ "adcq %2,%0\n\t"
+ "adcq %3,%0\n\t"
+ "adcq %4,%0\n\t"
+ "adcq %5,%0\n\t"
+ "adcq $0,%0"
+ :"+r" (sum)
+ :"m" (m[0]), "m" (m[1]), "m" (m[2]),
+ "m" (m[3]), "m" (m[4]));
+ return sum;
}
/*
@@ -47,64 +44,32 @@ static inline __wsum csum_tail(u64 temp64, int odd)
__wsum csum_partial(const void *buff, int len, __wsum sum)
{
u64 temp64 = (__force u64)sum;
- unsigned odd;
- odd = 1 & (unsigned long) buff;
- if (unlikely(odd)) {
- if (unlikely(len == 0))
- return sum;
- temp64 = ror32((__force u32)sum, 8);
- temp64 += (*(unsigned char *)buff << 8);
- len--;
- buff++;
+ /* Do two 40-byte chunks in parallel to get better ILP */
+ if (likely(len >= 80)) {
+ u64 temp64_2 = 0;
+ do {
+ temp64 = update_csum_40b(temp64, buff);
+ temp64_2 = update_csum_40b(temp64_2, buff + 40);
+ buff += 80;
+ len -= 80;
+ } while (len >= 80);
+
+ asm("addq %1,%0\n\t"
+ "adcq $0,%0"
+ :"+r" (temp64): "r" (temp64_2));
}
/*
- * len == 40 is the hot case due to IPv6 headers, but annotating it likely()
- * has noticeable negative affect on codegen for all other cases with
- * minimal performance benefit here.
+ * len == 40 is the hot case due to IPv6 headers, so return
+ * early for that exact case without checking the tail bytes.
*/
- if (len == 40) {
- asm("addq 0*8(%[src]),%[res]\n\t"
- "adcq 1*8(%[src]),%[res]\n\t"
- "adcq 2*8(%[src]),%[res]\n\t"
- "adcq 3*8(%[src]),%[res]\n\t"
- "adcq 4*8(%[src]),%[res]\n\t"
- "adcq $0,%[res]"
- : [res] "+r"(temp64)
- : [src] "r"(buff), "m"(*(const char(*)[40])buff));
- return csum_tail(temp64, odd);
- }
- if (unlikely(len >= 64)) {
- /*
- * Extra accumulators for better ILP in the loop.
- */
- u64 tmp_accum, tmp_carries;
-
- asm("xorl %k[tmp_accum],%k[tmp_accum]\n\t"
- "xorl %k[tmp_carries],%k[tmp_carries]\n\t"
- "subl $64, %[len]\n\t"
- "1:\n\t"
- "addq 0*8(%[src]),%[res]\n\t"
- "adcq 1*8(%[src]),%[res]\n\t"
- "adcq 2*8(%[src]),%[res]\n\t"
- "adcq 3*8(%[src]),%[res]\n\t"
- "adcl $0,%k[tmp_carries]\n\t"
- "addq 4*8(%[src]),%[tmp_accum]\n\t"
- "adcq 5*8(%[src]),%[tmp_accum]\n\t"
- "adcq 6*8(%[src]),%[tmp_accum]\n\t"
- "adcq 7*8(%[src]),%[tmp_accum]\n\t"
- "adcl $0,%k[tmp_carries]\n\t"
- "addq $64, %[src]\n\t"
- "subl $64, %[len]\n\t"
- "jge 1b\n\t"
- "addq %[tmp_accum],%[res]\n\t"
- "adcq %[tmp_carries],%[res]\n\t"
- "adcq $0,%[res]"
- : [tmp_accum] "=&r"(tmp_accum),
- [tmp_carries] "=&r"(tmp_carries), [res] "+r"(temp64),
- [len] "+r"(len), [src] "+r"(buff)
- : "m"(*(const char *)buff));
+ if (len >= 40) {
+ temp64 = update_csum_40b(temp64, buff);
+ len -= 40;
+ if (!len)
+ return csum_finalize_sum(temp64);
+ buff += 40;
}
if (len & 32) {
@@ -143,7 +108,7 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
: [res] "+r"(temp64)
: [trail] "r"(trail));
}
- return csum_tail(temp64, odd);
+ return csum_finalize_sum(temp64);
}
EXPORT_SYMBOL(csum_partial);
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index 0e65d00e2339..23f81ca3f06b 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -128,7 +128,7 @@ static void delay_halt_mwaitx(u64 unused, u64 cycles)
delay = min_t(u64, MWAITX_MAX_WAIT_CYCLES, cycles);
/*
- * Use cpu_tss_rw as a cacheline-aligned, seldomly accessed per-cpu
+ * Use cpu_tss_rw as a cacheline-aligned, seldom accessed per-cpu
* variable as the monitor target.
*/
__monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);
diff --git a/arch/x86/lib/misc.c b/arch/x86/lib/misc.c
index 92cd8ecc3a2c..40b81c338ae5 100644
--- a/arch/x86/lib/misc.c
+++ b/arch/x86/lib/misc.c
@@ -8,7 +8,7 @@
*/
int num_digits(int val)
{
- int m = 10;
+ long long m = 10;
int d = 1;
if (val < 0) {
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index ab778eac1952..679b09cfe241 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1370,6 +1370,8 @@ void do_user_addr_fault(struct pt_regs *regs,
goto done;
}
count_vm_vma_lock_event(VMA_LOCK_RETRY);
+ if (fault & VM_FAULT_MAJOR)
+ flags |= FAULT_FLAG_TRIED;
/* Quick path to respond to signals */
if (fault_signal_pending(fault, regs)) {
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index a190aae8ceaf..a0dffaca6d2b 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1013,7 +1013,7 @@ static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
return;
}
- /* free a pte talbe */
+ /* free a pte table */
free_pagetable(pmd_page(*pmd), 0);
spin_lock(&init_mm.page_table_lock);
pmd_clear(pmd);
@@ -1031,7 +1031,7 @@ static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
return;
}
- /* free a pmd talbe */
+ /* free a pmd table */
free_pagetable(pud_page(*pud), 0);
spin_lock(&init_mm.page_table_lock);
pud_clear(pud);
@@ -1049,7 +1049,7 @@ static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
return;
}
- /* free a pud talbe */
+ /* free a pud table */
free_pagetable(p4d_page(*p4d), 0);
spin_lock(&init_mm.page_table_lock);
p4d_clear(p4d);
diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c
index a68f2dda0948..70b91de2e053 100644
--- a/arch/x86/mm/mem_encrypt_amd.c
+++ b/arch/x86/mm/mem_encrypt_amd.c
@@ -32,6 +32,7 @@
#include <asm/msr.h>
#include <asm/cmdline.h>
#include <asm/sev.h>
+#include <asm/ia32.h>
#include "mm_internal.h"
@@ -481,6 +482,16 @@ void __init sme_early_init(void)
*/
if (sev_status & MSR_AMD64_SEV_ES_ENABLED)
x86_cpuinit.parallel_bringup = false;
+
+ /*
+ * The VMM is capable of injecting interrupt 0x80 and triggering the
+ * compatibility syscall path.
+ *
+ * By default, the 32-bit emulation is disabled in order to ensure
+ * the safety of the VM.
+ */
+ if (sev_status & MSR_AMD64_SEV_ENABLED)
+ ia32_disable();
}
void __init mem_encrypt_free_decrypted_mem(void)
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index b29ceb19e46e..adc497b93f03 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -450,37 +450,6 @@ int __node_distance(int from, int to)
EXPORT_SYMBOL(__node_distance);
/*
- * Sanity check to catch more bad NUMA configurations (they are amazingly
- * common). Make sure the nodes cover all memory.
- */
-static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
-{
- u64 numaram, e820ram;
- int i;
-
- numaram = 0;
- for (i = 0; i < mi->nr_blks; i++) {
- u64 s = mi->blk[i].start >> PAGE_SHIFT;
- u64 e = mi->blk[i].end >> PAGE_SHIFT;
- numaram += e - s;
- numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
- if ((s64)numaram < 0)
- numaram = 0;
- }
-
- e820ram = max_pfn - absent_pages_in_range(0, max_pfn);
-
- /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
- if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
- printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n",
- (numaram << PAGE_SHIFT) >> 20,
- (e820ram << PAGE_SHIFT) >> 20);
- return false;
- }
- return true;
-}
-
-/*
* Mark all currently memblock-reserved physical memory (which covers the
* kernel's own memory ranges) as hot-unswappable.
*/
@@ -585,7 +554,8 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
return -EINVAL;
}
}
- if (!numa_meminfo_cover_memory(mi))
+
+ if (!memblock_validate_numa_coverage(SZ_1M))
return -EINVAL;
/* Finally register nodes. */
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index de10800cd4dd..0904d7e8e126 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -14,7 +14,7 @@
* memory ranges: uncached, write-combining, write-through, write-protected,
* and the most commonly used and default attribute: write-back caching.
*
- * PAT support supercedes and augments MTRR support in a compatible fashion: MTRR is
+ * PAT support supersedes and augments MTRR support in a compatible fashion: MTRR is
* a hardware interface to enumerate a limited number of physical memory ranges
* and set their caching attributes explicitly, programmed into the CPU via MSRs.
* Even modern CPUs have MTRRs enabled - but these are typically not touched
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index bda9f129835e..e9b448d1b1b7 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -1621,7 +1621,7 @@ repeat:
/*
* We need to keep the pfn from the existing PTE,
- * after all we're only going to change it's attributes
+ * after all we're only going to change its attributes
* not the memory it points to
*/
new_pte = pfn_pte(pfn, new_prot);
@@ -2447,7 +2447,7 @@ int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address,
/*
* The typical sequence for unmapping is to find a pte through
* lookup_address_in_pgd() (ideally, it should never return NULL because
- * the address is already mapped) and change it's protections. As pfn is
+ * the address is already mapped) and change its protections. As pfn is
* the *target* of a mapping, it's not useful while unmapping.
*/
struct cpa_data cpa = {
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 5dd733944629..669ba1c345b3 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -6,7 +6,7 @@
*
* https://github.com/IAIK/KAISER
*
- * The original work was written by and and signed off by for the Linux
+ * The original work was written by and signed off by for the Linux
* kernel by:
*
* Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at>
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 453ea95b667d..5768d386efab 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -355,7 +355,7 @@ static void l1d_flush_evaluate(unsigned long prev_mm, unsigned long next_mm,
/*
* Validate that it is not running on an SMT sibling as this would
- * make the excercise pointless because the siblings share L1D. If
+ * make the exercise pointless because the siblings share L1D. If
* it runs on a SMT sibling, notify it with SIGBUS on return to
* user/guest
*/
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 8c10d9abc239..919f647c740f 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -17,6 +17,7 @@
#include <asm/nospec-branch.h>
#include <asm/text-patching.h>
#include <asm/unwind.h>
+#include <asm/cfi.h>
static bool all_callee_regs_used[4] = {true, true, true, true};
@@ -51,9 +52,11 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
do { EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
#ifdef CONFIG_X86_KERNEL_IBT
-#define EMIT_ENDBR() EMIT(gen_endbr(), 4)
+#define EMIT_ENDBR() EMIT(gen_endbr(), 4)
+#define EMIT_ENDBR_POISON() EMIT(gen_endbr_poison(), 4)
#else
#define EMIT_ENDBR()
+#define EMIT_ENDBR_POISON()
#endif
static bool is_imm8(int value)
@@ -304,6 +307,88 @@ static void pop_callee_regs(u8 **pprog, bool *callee_regs_used)
*pprog = prog;
}
+static void emit_nops(u8 **pprog, int len)
+{
+ u8 *prog = *pprog;
+ int i, noplen;
+
+ while (len > 0) {
+ noplen = len;
+
+ if (noplen > ASM_NOP_MAX)
+ noplen = ASM_NOP_MAX;
+
+ for (i = 0; i < noplen; i++)
+ EMIT1(x86_nops[noplen][i]);
+ len -= noplen;
+ }
+
+ *pprog = prog;
+}
+
+/*
+ * Emit the various CFI preambles, see asm/cfi.h and the comments about FineIBT
+ * in arch/x86/kernel/alternative.c
+ */
+
+static void emit_fineibt(u8 **pprog, u32 hash)
+{
+ u8 *prog = *pprog;
+
+ EMIT_ENDBR();
+ EMIT3_off32(0x41, 0x81, 0xea, hash); /* subl $hash, %r10d */
+ EMIT2(0x74, 0x07); /* jz.d8 +7 */
+ EMIT2(0x0f, 0x0b); /* ud2 */
+ EMIT1(0x90); /* nop */
+ EMIT_ENDBR_POISON();
+
+ *pprog = prog;
+}
+
+static void emit_kcfi(u8 **pprog, u32 hash)
+{
+ u8 *prog = *pprog;
+
+ EMIT1_off32(0xb8, hash); /* movl $hash, %eax */
+#ifdef CONFIG_CALL_PADDING
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+ EMIT1(0x90);
+#endif
+ EMIT_ENDBR();
+
+ *pprog = prog;
+}
+
+static void emit_cfi(u8 **pprog, u32 hash)
+{
+ u8 *prog = *pprog;
+
+ switch (cfi_mode) {
+ case CFI_FINEIBT:
+ emit_fineibt(&prog, hash);
+ break;
+
+ case CFI_KCFI:
+ emit_kcfi(&prog, hash);
+ break;
+
+ default:
+ EMIT_ENDBR();
+ break;
+ }
+
+ *pprog = prog;
+}
+
/*
* Emit x86-64 prologue code for BPF program.
* bpf_tail_call helper will skip the first X86_TAIL_CALL_OFFSET bytes
@@ -315,12 +400,11 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
{
u8 *prog = *pprog;
+ emit_cfi(&prog, is_subprog ? cfi_bpf_subprog_hash : cfi_bpf_hash);
/* BPF trampoline can be made to work without these nops,
* but let's waste 5 bytes for now and optimize later
*/
- EMIT_ENDBR();
- memcpy(prog, x86_nops[5], X86_PATCH_SIZE);
- prog += X86_PATCH_SIZE;
+ emit_nops(&prog, X86_PATCH_SIZE);
if (!ebpf_from_cbpf) {
if (tail_call_reachable && !is_subprog)
/* When it's the entry of the whole tailcall context,
@@ -626,8 +710,7 @@ static void emit_bpf_tail_call_direct(struct bpf_prog *bpf_prog,
if (stack_depth)
EMIT3_off32(0x48, 0x81, 0xC4, round_up(stack_depth, 8));
- memcpy(prog, x86_nops[5], X86_PATCH_SIZE);
- prog += X86_PATCH_SIZE;
+ emit_nops(&prog, X86_PATCH_SIZE);
/* out: */
ctx->tail_call_direct_label = prog - start;
@@ -989,25 +1072,6 @@ static void detect_reg_usage(struct bpf_insn *insn, int insn_cnt,
}
}
-static void emit_nops(u8 **pprog, int len)
-{
- u8 *prog = *pprog;
- int i, noplen;
-
- while (len > 0) {
- noplen = len;
-
- if (noplen > ASM_NOP_MAX)
- noplen = ASM_NOP_MAX;
-
- for (i = 0; i < noplen; i++)
- EMIT1(x86_nops[noplen][i]);
- len -= noplen;
- }
-
- *pprog = prog;
-}
-
/* emit the 3-byte VEX prefix
*
* r: same as rex.r, extra bit for ModRM reg field
@@ -2143,7 +2207,7 @@ static void save_args(const struct btf_func_model *m, u8 **prog,
} else {
/* Only copy the arguments on-stack to current
* 'stack_size' and ignore the regs, used to
- * prepare the arguments on-stack for orign call.
+ * prepare the arguments on-stack for origin call.
*/
if (for_call_origin) {
nr_regs += arg_regs;
@@ -2198,7 +2262,8 @@ static void restore_regs(const struct btf_func_model *m, u8 **prog,
static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
struct bpf_tramp_link *l, int stack_size,
- int run_ctx_off, bool save_ret)
+ int run_ctx_off, bool save_ret,
+ void *image, void *rw_image)
{
u8 *prog = *pprog;
u8 *jmp_insn;
@@ -2226,7 +2291,7 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
else
EMIT4(0x48, 0x8D, 0x75, -run_ctx_off);
- if (emit_rsb_call(&prog, bpf_trampoline_enter(p), prog))
+ if (emit_rsb_call(&prog, bpf_trampoline_enter(p), image + (prog - (u8 *)rw_image)))
return -EINVAL;
/* remember prog start time returned by __bpf_prog_enter */
emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
@@ -2250,7 +2315,7 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
(long) p->insnsi >> 32,
(u32) (long) p->insnsi);
/* call JITed bpf program or interpreter */
- if (emit_rsb_call(&prog, p->bpf_func, prog))
+ if (emit_rsb_call(&prog, p->bpf_func, image + (prog - (u8 *)rw_image)))
return -EINVAL;
/*
@@ -2277,7 +2342,7 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
EMIT3_off32(0x48, 0x8D, 0x95, -run_ctx_off);
else
EMIT4(0x48, 0x8D, 0x55, -run_ctx_off);
- if (emit_rsb_call(&prog, bpf_trampoline_exit(p), prog))
+ if (emit_rsb_call(&prog, bpf_trampoline_exit(p), image + (prog - (u8 *)rw_image)))
return -EINVAL;
*pprog = prog;
@@ -2312,14 +2377,15 @@ static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond)
static int invoke_bpf(const struct btf_func_model *m, u8 **pprog,
struct bpf_tramp_links *tl, int stack_size,
- int run_ctx_off, bool save_ret)
+ int run_ctx_off, bool save_ret,
+ void *image, void *rw_image)
{
int i;
u8 *prog = *pprog;
for (i = 0; i < tl->nr_links; i++) {
if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size,
- run_ctx_off, save_ret))
+ run_ctx_off, save_ret, image, rw_image))
return -EINVAL;
}
*pprog = prog;
@@ -2328,7 +2394,8 @@ static int invoke_bpf(const struct btf_func_model *m, u8 **pprog,
static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog,
struct bpf_tramp_links *tl, int stack_size,
- int run_ctx_off, u8 **branches)
+ int run_ctx_off, u8 **branches,
+ void *image, void *rw_image)
{
u8 *prog = *pprog;
int i;
@@ -2339,7 +2406,8 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog,
emit_mov_imm32(&prog, false, BPF_REG_0, 0);
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
for (i = 0; i < tl->nr_links; i++) {
- if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, run_ctx_off, true))
+ if (invoke_bpf_prog(m, &prog, tl->links[i], stack_size, run_ctx_off, true,
+ image, rw_image))
return -EINVAL;
/* mod_ret prog stored return value into [rbp - 8]. Emit:
@@ -2422,10 +2490,11 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog,
* add rsp, 8 // skip eth_type_trans's frame
* ret // return to its caller
*/
-int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
- const struct btf_func_model *m, u32 flags,
- struct bpf_tramp_links *tlinks,
- void *func_addr)
+static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_image,
+ void *rw_image_end, void *image,
+ const struct btf_func_model *m, u32 flags,
+ struct bpf_tramp_links *tlinks,
+ void *func_addr)
{
int i, ret, nr_regs = m->nr_args, stack_size = 0;
int regs_off, nregs_off, ip_off, run_ctx_off, arg_stack_off, rbx_off;
@@ -2437,10 +2506,19 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
u8 *prog;
bool save_ret;
+ /*
+ * F_INDIRECT is only compatible with F_RET_FENTRY_RET, it is
+ * explicitly incompatible with F_CALL_ORIG | F_SKIP_FRAME | F_IP_ARG
+ * because @func_addr.
+ */
+ WARN_ON_ONCE((flags & BPF_TRAMP_F_INDIRECT) &&
+ (flags & ~(BPF_TRAMP_F_INDIRECT | BPF_TRAMP_F_RET_FENTRY_RET)));
+
/* extra registers for struct arguments */
- for (i = 0; i < m->nr_args; i++)
+ for (i = 0; i < m->nr_args; i++) {
if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG)
nr_regs += (m->arg_size[i] + 7) / 8 - 1;
+ }
/* x86-64 supports up to MAX_BPF_FUNC_ARGS arguments. 1-6
* are passed through regs, the remains are through stack.
@@ -2521,22 +2599,29 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
orig_call += X86_PATCH_SIZE;
}
- prog = image;
+ prog = rw_image;
- EMIT_ENDBR();
- /*
- * This is the direct-call trampoline, as such it needs accounting
- * for the __fentry__ call.
- */
- x86_call_depth_emit_accounting(&prog, NULL);
+ if (flags & BPF_TRAMP_F_INDIRECT) {
+ /*
+ * Indirect call for bpf_struct_ops
+ */
+ emit_cfi(&prog, cfi_get_func_hash(func_addr));
+ } else {
+ /*
+ * Direct-call fentry stub, as such it needs accounting for the
+ * __fentry__ call.
+ */
+ x86_call_depth_emit_accounting(&prog, NULL);
+ }
EMIT1(0x55); /* push rbp */
EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
- if (!is_imm8(stack_size))
+ if (!is_imm8(stack_size)) {
/* sub rsp, stack_size */
EMIT3_off32(0x48, 0x81, 0xEC, stack_size);
- else
+ } else {
/* sub rsp, stack_size */
EMIT4(0x48, 0x83, 0xEC, stack_size);
+ }
if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
EMIT1(0x50); /* push rax */
/* mov QWORD PTR [rbp - rbx_off], rbx */
@@ -2563,16 +2648,18 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
if (flags & BPF_TRAMP_F_CALL_ORIG) {
/* arg1: mov rdi, im */
emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im);
- if (emit_rsb_call(&prog, __bpf_tramp_enter, prog)) {
+ if (emit_rsb_call(&prog, __bpf_tramp_enter,
+ image + (prog - (u8 *)rw_image))) {
ret = -EINVAL;
goto cleanup;
}
}
- if (fentry->nr_links)
+ if (fentry->nr_links) {
if (invoke_bpf(m, &prog, fentry, regs_off, run_ctx_off,
- flags & BPF_TRAMP_F_RET_FENTRY_RET))
+ flags & BPF_TRAMP_F_RET_FENTRY_RET, image, rw_image))
return -EINVAL;
+ }
if (fmod_ret->nr_links) {
branches = kcalloc(fmod_ret->nr_links, sizeof(u8 *),
@@ -2581,7 +2668,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
return -ENOMEM;
if (invoke_bpf_mod_ret(m, &prog, fmod_ret, regs_off,
- run_ctx_off, branches)) {
+ run_ctx_off, branches, image, rw_image)) {
ret = -EINVAL;
goto cleanup;
}
@@ -2591,27 +2678,27 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
restore_regs(m, &prog, regs_off);
save_args(m, &prog, arg_stack_off, true);
- if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
+ if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) {
/* Before calling the original function, restore the
* tail_call_cnt from stack to rax.
*/
RESTORE_TAIL_CALL_CNT(stack_size);
+ }
if (flags & BPF_TRAMP_F_ORIG_STACK) {
emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, 8);
EMIT2(0xff, 0xd3); /* call *rbx */
} else {
/* call original function */
- if (emit_rsb_call(&prog, orig_call, prog)) {
+ if (emit_rsb_call(&prog, orig_call, image + (prog - (u8 *)rw_image))) {
ret = -EINVAL;
goto cleanup;
}
}
/* remember return value in a stack for bpf prog to access */
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
- im->ip_after_call = prog;
- memcpy(prog, x86_nops[5], X86_PATCH_SIZE);
- prog += X86_PATCH_SIZE;
+ im->ip_after_call = image + (prog - (u8 *)rw_image);
+ emit_nops(&prog, X86_PATCH_SIZE);
}
if (fmod_ret->nr_links) {
@@ -2624,16 +2711,19 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
/* Update the branches saved in invoke_bpf_mod_ret with the
* aligned address of do_fexit.
*/
- for (i = 0; i < fmod_ret->nr_links; i++)
- emit_cond_near_jump(&branches[i], prog, branches[i],
- X86_JNE);
+ for (i = 0; i < fmod_ret->nr_links; i++) {
+ emit_cond_near_jump(&branches[i], image + (prog - (u8 *)rw_image),
+ image + (branches[i] - (u8 *)rw_image), X86_JNE);
+ }
}
- if (fexit->nr_links)
- if (invoke_bpf(m, &prog, fexit, regs_off, run_ctx_off, false)) {
+ if (fexit->nr_links) {
+ if (invoke_bpf(m, &prog, fexit, regs_off, run_ctx_off,
+ false, image, rw_image)) {
ret = -EINVAL;
goto cleanup;
}
+ }
if (flags & BPF_TRAMP_F_RESTORE_REGS)
restore_regs(m, &prog, regs_off);
@@ -2643,18 +2733,19 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
* restored to R0.
*/
if (flags & BPF_TRAMP_F_CALL_ORIG) {
- im->ip_epilogue = prog;
+ im->ip_epilogue = image + (prog - (u8 *)rw_image);
/* arg1: mov rdi, im */
emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im);
- if (emit_rsb_call(&prog, __bpf_tramp_exit, prog)) {
+ if (emit_rsb_call(&prog, __bpf_tramp_exit, image + (prog - (u8 *)rw_image))) {
ret = -EINVAL;
goto cleanup;
}
- } else if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
+ } else if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) {
/* Before running the original function, restore the
* tail_call_cnt from stack to rax.
*/
RESTORE_TAIL_CALL_CNT(stack_size);
+ }
/* restore return value of orig_call or fentry prog back into RAX */
if (save_ret)
@@ -2662,22 +2753,94 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, -rbx_off);
EMIT1(0xC9); /* leave */
- if (flags & BPF_TRAMP_F_SKIP_FRAME)
+ if (flags & BPF_TRAMP_F_SKIP_FRAME) {
/* skip our return address and return to parent */
EMIT4(0x48, 0x83, 0xC4, 8); /* add rsp, 8 */
- emit_return(&prog, prog);
+ }
+ emit_return(&prog, image + (prog - (u8 *)rw_image));
/* Make sure the trampoline generation logic doesn't overflow */
- if (WARN_ON_ONCE(prog > (u8 *)image_end - BPF_INSN_SAFETY)) {
+ if (WARN_ON_ONCE(prog > (u8 *)rw_image_end - BPF_INSN_SAFETY)) {
ret = -EFAULT;
goto cleanup;
}
- ret = prog - (u8 *)image;
+ ret = prog - (u8 *)rw_image + BPF_INSN_SAFETY;
cleanup:
kfree(branches);
return ret;
}
+void *arch_alloc_bpf_trampoline(unsigned int size)
+{
+ return bpf_prog_pack_alloc(size, jit_fill_hole);
+}
+
+void arch_free_bpf_trampoline(void *image, unsigned int size)
+{
+ bpf_prog_pack_free(image, size);
+}
+
+void arch_protect_bpf_trampoline(void *image, unsigned int size)
+{
+}
+
+void arch_unprotect_bpf_trampoline(void *image, unsigned int size)
+{
+}
+
+int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
+ const struct btf_func_model *m, u32 flags,
+ struct bpf_tramp_links *tlinks,
+ void *func_addr)
+{
+ void *rw_image, *tmp;
+ int ret;
+ u32 size = image_end - image;
+
+ /* rw_image doesn't need to be in module memory range, so we can
+ * use kvmalloc.
+ */
+ rw_image = kvmalloc(size, GFP_KERNEL);
+ if (!rw_image)
+ return -ENOMEM;
+
+ ret = __arch_prepare_bpf_trampoline(im, rw_image, rw_image + size, image, m,
+ flags, tlinks, func_addr);
+ if (ret < 0)
+ goto out;
+
+ tmp = bpf_arch_text_copy(image, rw_image, size);
+ if (IS_ERR(tmp))
+ ret = PTR_ERR(tmp);
+out:
+ kvfree(rw_image);
+ return ret;
+}
+
+int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
+ struct bpf_tramp_links *tlinks, void *func_addr)
+{
+ struct bpf_tramp_image im;
+ void *image;
+ int ret;
+
+ /* Allocate a temporary buffer for __arch_prepare_bpf_trampoline().
+ * This will NOT cause fragmentation in direct map, as we do not
+ * call set_memory_*() on this buffer.
+ *
+ * We cannot use kvmalloc here, because we need image to be in
+ * module memory range.
+ */
+ image = bpf_jit_alloc_exec(PAGE_SIZE);
+ if (!image)
+ return -ENOMEM;
+
+ ret = __arch_prepare_bpf_trampoline(&im, image, image + PAGE_SIZE, image,
+ m, flags, tlinks, func_addr);
+ bpf_jit_free_exec(image);
+ return ret;
+}
+
static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs, u8 *image, u8 *buf)
{
u8 *jg_reloc, *prog = *pprog;
@@ -2935,9 +3098,16 @@ out_image:
jit_data->header = header;
jit_data->rw_header = rw_header;
}
- prog->bpf_func = (void *)image;
+ /*
+ * ctx.prog_offset is used when CFI preambles put code *before*
+ * the function. See emit_cfi(). For FineIBT specifically this code
+ * can also be executed and bpf_prog_kallsyms_add() will
+ * generate an additional symbol to cover this, hence also
+ * decrement proglen.
+ */
+ prog->bpf_func = (void *)image + cfi_get_offset();
prog->jited = 1;
- prog->jited_len = proglen;
+ prog->jited_len = proglen - cfi_get_offset();
} else {
prog = orig_prog;
}
@@ -2992,6 +3162,7 @@ void bpf_jit_free(struct bpf_prog *prog)
kvfree(jit_data->addrs);
kfree(jit_data);
}
+ prog->bpf_func = (void *)prog->bpf_func - cfi_get_offset();
hdr = bpf_jit_binary_pack_hdr(prog);
bpf_jit_binary_pack_free(hdr, NULL);
WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog));
@@ -3025,3 +3196,49 @@ void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp
#endif
WARN(1, "verification of programs using bpf_throw should have failed\n");
}
+
+void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
+ struct bpf_prog *new, struct bpf_prog *old)
+{
+ u8 *old_addr, *new_addr, *old_bypass_addr;
+ int ret;
+
+ old_bypass_addr = old ? NULL : poke->bypass_addr;
+ old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL;
+ new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL;
+
+ /*
+ * On program loading or teardown, the program's kallsym entry
+ * might not be in place, so we use __bpf_arch_text_poke to skip
+ * the kallsyms check.
+ */
+ if (new) {
+ ret = __bpf_arch_text_poke(poke->tailcall_target,
+ BPF_MOD_JUMP,
+ old_addr, new_addr);
+ BUG_ON(ret < 0);
+ if (!old) {
+ ret = __bpf_arch_text_poke(poke->tailcall_bypass,
+ BPF_MOD_JUMP,
+ poke->bypass_addr,
+ NULL);
+ BUG_ON(ret < 0);
+ }
+ } else {
+ ret = __bpf_arch_text_poke(poke->tailcall_bypass,
+ BPF_MOD_JUMP,
+ old_bypass_addr,
+ poke->bypass_addr);
+ BUG_ON(ret < 0);
+ /* let other CPUs finish the execution of program
+ * so that it will not possible to expose them
+ * to invalid nop, stack unwind, nop state
+ */
+ if (!ret)
+ synchronize_rcu();
+ ret = __bpf_arch_text_poke(poke->tailcall_target,
+ BPF_MOD_JUMP,
+ old_addr, NULL);
+ BUG_ON(ret < 0);
+ }
+}
diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
index 429a89c5468b..b18ce19981ec 100644
--- a/arch/x86/net/bpf_jit_comp32.c
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -1194,7 +1194,7 @@ struct jit_context {
#define PROLOGUE_SIZE 35
/*
- * Emit prologue code for BPF program and check it's size.
+ * Emit prologue code for BPF program and check its size.
* bpf_tail_call helper will skip it while jumping into another program.
*/
static void emit_prologue(u8 **pprog, u32 stack_depth)
diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c
index 7368afc03998..8c8ddc4dcc08 100644
--- a/arch/x86/pci/sta2x11-fixup.c
+++ b/arch/x86/pci/sta2x11-fixup.c
@@ -14,6 +14,7 @@
#include <linux/dma-map-ops.h>
#include <linux/swiotlb.h>
#include <asm/iommu.h>
+#include <asm/sta2x11.h>
#define STA2X11_SWIOTLB_SIZE (4*1024*1024)
diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c
index 761f3689f60a..84ba715f44d1 100644
--- a/arch/x86/platform/intel-quark/imr_selftest.c
+++ b/arch/x86/platform/intel-quark/imr_selftest.c
@@ -6,7 +6,7 @@
* Copyright(c) 2015 Bryan O'Donoghue <pure.logic@nexus-software.ie>
*
* IMR self test. The purpose of this module is to run a set of tests on the
- * IMR API to validate it's sanity. We check for overlapping, reserved
+ * IMR API to validate its sanity. We check for overlapping, reserved
* addresses and setup/teardown sanity.
*
*/
diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S
index c4365a05ab83..f7235ef87bc3 100644
--- a/arch/x86/platform/pvh/head.S
+++ b/arch/x86/platform/pvh/head.S
@@ -11,6 +11,7 @@
#include <linux/elfnote.h>
#include <linux/init.h>
#include <linux/linkage.h>
+#include <asm/desc_defs.h>
#include <asm/segment.h>
#include <asm/asm.h>
#include <asm/boot.h>
@@ -41,7 +42,7 @@
* Bit 8 (TF) must be cleared. Other bits are all unspecified.
*
* All other processor registers and flag bits are unspecified. The OS is in
- * charge of setting up it's own stack, GDT and IDT.
+ * charge of setting up its own stack, GDT and IDT.
*/
#define PVH_GDT_ENTRY_CS 1
@@ -148,11 +149,11 @@ SYM_DATA_END(gdt)
SYM_DATA_START_LOCAL(gdt_start)
.quad 0x0000000000000000 /* NULL descriptor */
#ifdef CONFIG_X86_64
- .quad GDT_ENTRY(0xa09a, 0, 0xfffff) /* PVH_CS_SEL */
+ .quad GDT_ENTRY(DESC_CODE64, 0, 0xfffff) /* PVH_CS_SEL */
#else
- .quad GDT_ENTRY(0xc09a, 0, 0xfffff) /* PVH_CS_SEL */
+ .quad GDT_ENTRY(DESC_CODE32, 0, 0xfffff) /* PVH_CS_SEL */
#endif
- .quad GDT_ENTRY(0xc092, 0, 0xfffff) /* PVH_DS_SEL */
+ .quad GDT_ENTRY(DESC_DATA32, 0, 0xfffff) /* PVH_DS_SEL */
SYM_DATA_END_LABEL(gdt_start, SYM_L_LOCAL, gdt_end)
.balign 16
diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index 4221259a5870..a379501b7a69 100644
--- a/arch/x86/platform/uv/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -35,7 +35,7 @@ static void uv_program_mmr(struct irq_cfg *cfg, struct uv_irq_2_mmr_pnode *info)
mmr_value = 0;
entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
entry->vector = cfg->vector;
- entry->delivery_mode = apic->delivery_mode;
+ entry->delivery_mode = APIC_DELIVERY_MODE_FIXED;
entry->dest_mode = apic->dest_mode_logical;
entry->polarity = 0;
entry->trigger = 0;
diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c
index e03207de2880..5c50e550ab63 100644
--- a/arch/x86/platform/uv/uv_nmi.c
+++ b/arch/x86/platform/uv/uv_nmi.c
@@ -741,7 +741,7 @@ static void uv_nmi_dump_state_cpu(int cpu, struct pt_regs *regs)
this_cpu_write(uv_cpu_nmi.state, UV_NMI_STATE_DUMP_DONE);
}
-/* Trigger a slave CPU to dump it's state */
+/* Trigger a slave CPU to dump its state */
static void uv_nmi_trigger_dump(int cpu)
{
int retry = uv_nmi_trigger_delay;
diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c
index ff5afc8a5a41..3712afc3534d 100644
--- a/arch/x86/platform/uv/uv_time.c
+++ b/arch/x86/platform/uv/uv_time.c
@@ -270,7 +270,7 @@ static int uv_rtc_unset_timer(int cpu, int force)
* Read the RTC.
*
* Starting with HUB rev 2.0, the UV RTC register is replicated across all
- * cachelines of it's own page. This allows faster simultaneous reads
+ * cachelines of its own page. This allows faster simultaneous reads
* from a given socket.
*/
static u64 uv_read_rtc(struct clocksource *cs)
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index 788e5559549f..f9bc444a3064 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -61,7 +61,7 @@ void __init reserve_real_mode(void)
set_real_mode_mem(mem);
/*
- * Unconditionally reserve the entire fisrt 1M, see comment in
+ * Unconditionally reserve the entire first 1M, see comment in
* setup_arch().
*/
memblock_reserve(0, SZ_1M);
diff --git a/arch/x86/realmode/rm/reboot.S b/arch/x86/realmode/rm/reboot.S
index f10515b10e0a..e714b4624e36 100644
--- a/arch/x86/realmode/rm/reboot.S
+++ b/arch/x86/realmode/rm/reboot.S
@@ -1,5 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/linkage.h>
+#include <asm/desc_defs.h>
#include <asm/segment.h>
#include <asm/page_types.h>
#include <asm/processor-flags.h>
@@ -153,5 +154,5 @@ SYM_DATA_START(machine_real_restart_gdt)
* base value 0x100; since this is consistent with real mode
* semantics we don't have to reload the segments once CR0.PE = 0.
*/
- .quad GDT_ENTRY(0x0093, 0x100, 0xffff)
+ .quad GDT_ENTRY(DESC_DATA16, 0x100, 0xffff)
SYM_DATA_END(machine_real_restart_gdt)
diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile
index 90e820ac9771..7278e2545c35 100644
--- a/arch/x86/tools/Makefile
+++ b/arch/x86/tools/Makefile
@@ -17,7 +17,7 @@ reformatter = $(srctree)/arch/x86/tools/objdump_reformat.awk
chkobjdump = $(srctree)/arch/x86/tools/chkobjdump.awk
quiet_cmd_posttest = TEST $@
- cmd_posttest = ($(OBJDUMP) -v | $(AWK) -f $(chkobjdump)) || $(OBJDUMP) -d -j .text $(objtree)/vmlinux | $(AWK) -f $(reformatter) | $(obj)/insn_decoder_test $(posttest_64bit) $(posttest_verbose)
+ cmd_posttest = $(OBJDUMP) -d -j .text $(objtree)/vmlinux | $(AWK) -f $(reformatter) | $(obj)/insn_decoder_test $(posttest_64bit) $(posttest_verbose)
quiet_cmd_sanitytest = TEST $@
cmd_sanitytest = $(obj)/insn_sanity $(posttest_64bit) -m 1000000
diff --git a/arch/x86/tools/chkobjdump.awk b/arch/x86/tools/chkobjdump.awk
deleted file mode 100644
index a4cf678cf5c8..000000000000
--- a/arch/x86/tools/chkobjdump.awk
+++ /dev/null
@@ -1,34 +0,0 @@
-# GNU objdump version checker
-#
-# Usage:
-# objdump -v | awk -f chkobjdump.awk
-BEGIN {
- # objdump version 2.19 or later is OK for the test.
- od_ver = 2;
- od_sver = 19;
-}
-
-/^GNU objdump/ {
- verstr = ""
- gsub(/\(.*\)/, "");
- for (i = 3; i <= NF; i++)
- if (match($(i), "^[0-9]")) {
- verstr = $(i);
- break;
- }
- if (verstr == "") {
- printf("Warning: Failed to find objdump version number.\n");
- exit 0;
- }
- split(verstr, ver, ".");
- if (ver[1] > od_ver ||
- (ver[1] == od_ver && ver[2] >= od_sver)) {
- exit 1;
- } else {
- printf("Warning: objdump version %s is older than %d.%d\n",
- verstr, od_ver, od_sver);
- print("Warning: Skipping posttest.");
- # Logic is inverted, because we just skip test without error.
- exit 0;
- }
-}
diff --git a/arch/x86/tools/objdump_reformat.awk b/arch/x86/tools/objdump_reformat.awk
index f418c91b71f0..20b08a6c4d33 100644
--- a/arch/x86/tools/objdump_reformat.awk
+++ b/arch/x86/tools/objdump_reformat.awk
@@ -11,8 +11,8 @@ BEGIN {
prev_addr = ""
prev_hex = ""
prev_mnemonic = ""
- bad_expr = "(\\(bad\\)|^rex|^.byte|^rep(z|nz)$|^lock$|^es$|^cs$|^ss$|^ds$|^fs$|^gs$|^data(16|32)$|^addr(16|32|64))"
- fwait_expr = "^9b "
+ bad_expr = "(\\(bad\\)|<unknown>|^rex|^.byte|^rep(z|nz)$|^lock$|^es$|^cs$|^ss$|^ds$|^fs$|^gs$|^data(16|32)$|^addr(16|32|64))"
+ fwait_expr = "^9b[ \t]*fwait"
fwait_str="9b\tfwait"
}
@@ -22,7 +22,7 @@ BEGIN {
}
/^ *[0-9a-f]+:/ {
- if (split($0, field, "\t") < 3) {
+ if (split($0, field, /: |\t/) < 3) {
# This is a continuation of the same insn.
prev_hex = prev_hex field[2]
} else {
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index d30949e25ebd..a3bae2b24626 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -66,7 +66,7 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = {
[S_REL] =
"^(__init_(begin|end)|"
"__x86_cpu_dev_(start|end)|"
- "(__parainstructions|__alt_instructions)(_end)?|"
+ "__alt_instructions(_end)?|"
"(__iommu_table|__apicdrivers|__smp_locks)(_end)?|"
"__(start|end)_pci_.*|"
#if CONFIG_FW_LOADER
diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h
index 6523eb7c3bd1..6052200fe925 100644
--- a/arch/x86/um/asm/elf.h
+++ b/arch/x86/um/asm/elf.h
@@ -168,8 +168,8 @@ do { \
(pr_reg)[18] = (_regs)->regs.gp[18]; \
(pr_reg)[19] = (_regs)->regs.gp[19]; \
(pr_reg)[20] = (_regs)->regs.gp[20]; \
- (pr_reg)[21] = current->thread.arch.fs; \
- (pr_reg)[22] = 0; \
+ (pr_reg)[21] = (_regs)->regs.gp[21]; \
+ (pr_reg)[22] = (_regs)->regs.gp[22]; \
(pr_reg)[23] = 0; \
(pr_reg)[24] = 0; \
(pr_reg)[25] = 0; \
diff --git a/arch/x86/um/asm/processor_64.h b/arch/x86/um/asm/processor_64.h
index 1ef9c21877bc..f90159508936 100644
--- a/arch/x86/um/asm/processor_64.h
+++ b/arch/x86/um/asm/processor_64.h
@@ -10,13 +10,11 @@
struct arch_thread {
unsigned long debugregs[8];
int debugregs_seq;
- unsigned long fs;
struct faultinfo faultinfo;
};
#define INIT_ARCH_THREAD { .debugregs = { [ 0 ... 7 ] = 0 }, \
.debugregs_seq = 0, \
- .fs = 0, \
.faultinfo = { 0, 0, 0 } }
#define STACKSLOTS_PER_LINE 4
@@ -28,7 +26,6 @@ static inline void arch_flush_thread(struct arch_thread *thread)
static inline void arch_copy_thread(struct arch_thread *from,
struct arch_thread *to)
{
- to->fs = from->fs;
}
#define current_sp() ({ void *sp; __asm__("movq %%rsp, %0" : "=r" (sp) : ); sp; })
diff --git a/arch/x86/um/os-Linux/Makefile b/arch/x86/um/os-Linux/Makefile
index ae169125d03f..5249bbc30dcd 100644
--- a/arch/x86/um/os-Linux/Makefile
+++ b/arch/x86/um/os-Linux/Makefile
@@ -6,7 +6,6 @@
obj-y = registers.o task_size.o mcontext.o
obj-$(CONFIG_X86_32) += tls.o
-obj-$(CONFIG_64BIT) += prctl.o
USER_OBJS := $(obj-y)
diff --git a/arch/x86/um/os-Linux/prctl.c b/arch/x86/um/os-Linux/prctl.c
deleted file mode 100644
index 8431e87ac333..000000000000
--- a/arch/x86/um/os-Linux/prctl.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * Copyright (C) 2007 Jeff Dike (jdike@{addtoit.com,linux.intel.com})
- * Licensed under the GPL
- */
-
-#include <sys/ptrace.h>
-#include <asm/ptrace.h>
-
-int os_arch_prctl(int pid, int option, unsigned long *arg2)
-{
- return ptrace(PTRACE_ARCH_PRCTL, pid, (unsigned long) arg2, option);
-}
diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c
index 0bc4b73a9cde..7f1abde2c84b 100644
--- a/arch/x86/um/ptrace_32.c
+++ b/arch/x86/um/ptrace_32.c
@@ -25,30 +25,6 @@ void arch_switch_to(struct task_struct *to)
printk(KERN_WARNING "arch_switch_tls failed, errno = EINVAL\n");
}
-int is_syscall(unsigned long addr)
-{
- unsigned short instr;
- int n;
-
- n = copy_from_user(&instr, (void __user *) addr, sizeof(instr));
- if (n) {
- /* access_process_vm() grants access to vsyscall and stub,
- * while copy_from_user doesn't. Maybe access_process_vm is
- * slow, but that doesn't matter, since it will be called only
- * in case of singlestepping, if copy_from_user failed.
- */
- n = access_process_vm(current, addr, &instr, sizeof(instr),
- FOLL_FORCE);
- if (n != sizeof(instr)) {
- printk(KERN_ERR "is_syscall : failed to read "
- "instruction from 0x%lx\n", addr);
- return 1;
- }
- }
- /* int 0x80 or sysenter */
- return (instr == 0x80cd) || (instr == 0x340f);
-}
-
/* determines which flags the user has access to. */
/* 1 = access 0 = no access */
#define FLAG_MASK 0x00044dd5
diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c
index 289d0159b041..aa68d83d3f44 100644
--- a/arch/x86/um/ptrace_64.c
+++ b/arch/x86/um/ptrace_64.c
@@ -188,32 +188,6 @@ int peek_user(struct task_struct *child, long addr, long data)
return put_user(tmp, (unsigned long *) data);
}
-/* XXX Mostly copied from sys-i386 */
-int is_syscall(unsigned long addr)
-{
- unsigned short instr;
- int n;
-
- n = copy_from_user(&instr, (void __user *) addr, sizeof(instr));
- if (n) {
- /*
- * access_process_vm() grants access to vsyscall and stub,
- * while copy_from_user doesn't. Maybe access_process_vm is
- * slow, but that doesn't matter, since it will be called only
- * in case of singlestepping, if copy_from_user failed.
- */
- n = access_process_vm(current, addr, &instr, sizeof(instr),
- FOLL_FORCE);
- if (n != sizeof(instr)) {
- printk("is_syscall : failed to read instruction from "
- "0x%lx\n", addr);
- return 1;
- }
- }
- /* sysenter */
- return instr == 0x050f;
-}
-
static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *child)
{
int err, n, cpu = ((struct thread_info *) child->stack)->cpu;
diff --git a/arch/x86/um/shared/sysdep/ptrace_32.h b/arch/x86/um/shared/sysdep/ptrace_32.h
index db8478a83a09..0c4989842fbe 100644
--- a/arch/x86/um/shared/sysdep/ptrace_32.h
+++ b/arch/x86/um/shared/sysdep/ptrace_32.h
@@ -8,10 +8,6 @@
#define MAX_FP_NR HOST_FPX_SIZE
-void set_using_sysemu(int value);
-int get_using_sysemu(void);
-extern int sysemu_supported;
-
#define UPT_SYSCALL_ARG1(r) UPT_BX(r)
#define UPT_SYSCALL_ARG2(r) UPT_CX(r)
#define UPT_SYSCALL_ARG3(r) UPT_DX(r)
diff --git a/arch/x86/um/shared/sysdep/ptrace_user.h b/arch/x86/um/shared/sysdep/ptrace_user.h
index 44782bbad41e..1d1a824fa652 100644
--- a/arch/x86/um/shared/sysdep/ptrace_user.h
+++ b/arch/x86/um/shared/sysdep/ptrace_user.h
@@ -15,14 +15,12 @@
#define FP_SIZE ((HOST_FPX_SIZE > HOST_FP_SIZE) ? HOST_FPX_SIZE : HOST_FP_SIZE)
#else
#define FP_SIZE HOST_FP_SIZE
+#endif
/*
- * x86_64 FC3 doesn't define this in /usr/include/linux/ptrace.h even though
- * it's defined in the kernel's include/linux/ptrace.h. Additionally, use the
- * 2.4 name and value for 2.4 host compatibility.
+ * glibc before 2.27 does not include PTRACE_SYSEMU_SINGLESTEP in its enum,
+ * ensure we have a definition by (re-)defining it here.
*/
-#ifndef PTRACE_OLDSETOPTIONS
-#define PTRACE_OLDSETOPTIONS 21
-#endif
-
+#ifndef PTRACE_SYSEMU_SINGLESTEP
+#define PTRACE_SYSEMU_SINGLESTEP 32
#endif
diff --git a/arch/x86/um/shared/sysdep/stub_32.h b/arch/x86/um/shared/sysdep/stub_32.h
index 38fa894b65d0..ea8b5a2d67af 100644
--- a/arch/x86/um/shared/sysdep/stub_32.h
+++ b/arch/x86/um/shared/sysdep/stub_32.h
@@ -12,72 +12,79 @@
#define STUB_MMAP_NR __NR_mmap2
#define MMAP_OFFSET(o) ((o) >> UM_KERN_PAGE_SHIFT)
-static inline long stub_syscall0(long syscall)
+static __always_inline long stub_syscall0(long syscall)
{
long ret;
- __asm__ volatile ("int $0x80" : "=a" (ret) : "0" (syscall));
+ __asm__ volatile ("int $0x80" : "=a" (ret) : "0" (syscall)
+ : "memory");
return ret;
}
-static inline long stub_syscall1(long syscall, long arg1)
+static __always_inline long stub_syscall1(long syscall, long arg1)
{
long ret;
- __asm__ volatile ("int $0x80" : "=a" (ret) : "0" (syscall), "b" (arg1));
+ __asm__ volatile ("int $0x80" : "=a" (ret) : "0" (syscall), "b" (arg1)
+ : "memory");
return ret;
}
-static inline long stub_syscall2(long syscall, long arg1, long arg2)
+static __always_inline long stub_syscall2(long syscall, long arg1, long arg2)
{
long ret;
__asm__ volatile ("int $0x80" : "=a" (ret) : "0" (syscall), "b" (arg1),
- "c" (arg2));
+ "c" (arg2)
+ : "memory");
return ret;
}
-static inline long stub_syscall3(long syscall, long arg1, long arg2, long arg3)
+static __always_inline long stub_syscall3(long syscall, long arg1, long arg2,
+ long arg3)
{
long ret;
__asm__ volatile ("int $0x80" : "=a" (ret) : "0" (syscall), "b" (arg1),
- "c" (arg2), "d" (arg3));
+ "c" (arg2), "d" (arg3)
+ : "memory");
return ret;
}
-static inline long stub_syscall4(long syscall, long arg1, long arg2, long arg3,
- long arg4)
+static __always_inline long stub_syscall4(long syscall, long arg1, long arg2,
+ long arg3, long arg4)
{
long ret;
__asm__ volatile ("int $0x80" : "=a" (ret) : "0" (syscall), "b" (arg1),
- "c" (arg2), "d" (arg3), "S" (arg4));
+ "c" (arg2), "d" (arg3), "S" (arg4)
+ : "memory");
return ret;
}
-static inline long stub_syscall5(long syscall, long arg1, long arg2, long arg3,
- long arg4, long arg5)
+static __always_inline long stub_syscall5(long syscall, long arg1, long arg2,
+ long arg3, long arg4, long arg5)
{
long ret;
__asm__ volatile ("int $0x80" : "=a" (ret) : "0" (syscall), "b" (arg1),
- "c" (arg2), "d" (arg3), "S" (arg4), "D" (arg5));
+ "c" (arg2), "d" (arg3), "S" (arg4), "D" (arg5)
+ : "memory");
return ret;
}
-static inline void trap_myself(void)
+static __always_inline void trap_myself(void)
{
__asm("int3");
}
-static inline void remap_stack_and_trap(void)
+static __always_inline void remap_stack_and_trap(void)
{
__asm__ volatile (
"movl %%esp,%%ebx ;"
diff --git a/arch/x86/um/shared/sysdep/stub_64.h b/arch/x86/um/shared/sysdep/stub_64.h
index 2de1c8f88173..b24168ef0ac4 100644
--- a/arch/x86/um/shared/sysdep/stub_64.h
+++ b/arch/x86/um/shared/sysdep/stub_64.h
@@ -16,7 +16,7 @@
#define __syscall_clobber "r11","rcx","memory"
#define __syscall "syscall"
-static inline long stub_syscall0(long syscall)
+static __always_inline long stub_syscall0(long syscall)
{
long ret;
@@ -27,7 +27,7 @@ static inline long stub_syscall0(long syscall)
return ret;
}
-static inline long stub_syscall2(long syscall, long arg1, long arg2)
+static __always_inline long stub_syscall2(long syscall, long arg1, long arg2)
{
long ret;
@@ -38,7 +38,8 @@ static inline long stub_syscall2(long syscall, long arg1, long arg2)
return ret;
}
-static inline long stub_syscall3(long syscall, long arg1, long arg2, long arg3)
+static __always_inline long stub_syscall3(long syscall, long arg1, long arg2,
+ long arg3)
{
long ret;
@@ -50,7 +51,7 @@ static inline long stub_syscall3(long syscall, long arg1, long arg2, long arg3)
return ret;
}
-static inline long stub_syscall4(long syscall, long arg1, long arg2, long arg3,
+static __always_inline long stub_syscall4(long syscall, long arg1, long arg2, long arg3,
long arg4)
{
long ret;
@@ -64,8 +65,8 @@ static inline long stub_syscall4(long syscall, long arg1, long arg2, long arg3,
return ret;
}
-static inline long stub_syscall5(long syscall, long arg1, long arg2, long arg3,
- long arg4, long arg5)
+static __always_inline long stub_syscall5(long syscall, long arg1, long arg2,
+ long arg3, long arg4, long arg5)
{
long ret;
@@ -78,12 +79,12 @@ static inline long stub_syscall5(long syscall, long arg1, long arg2, long arg3,
return ret;
}
-static inline void trap_myself(void)
+static __always_inline void trap_myself(void)
{
__asm("int3");
}
-static inline void remap_stack_and_trap(void)
+static __always_inline void remap_stack_and_trap(void)
{
__asm__ volatile (
"movq %0,%%rax ;"
diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c
index 27b29ae6c471..6a00a28c9cca 100644
--- a/arch/x86/um/syscalls_64.c
+++ b/arch/x86/um/syscalls_64.c
@@ -16,60 +16,24 @@
long arch_prctl(struct task_struct *task, int option,
unsigned long __user *arg2)
{
- unsigned long *ptr = arg2, tmp;
- long ret;
- int pid = task->mm->context.id.u.pid;
-
- /*
- * With ARCH_SET_FS (and ARCH_SET_GS is treated similarly to
- * be safe), we need to call arch_prctl on the host because
- * setting %fs may result in something else happening (like a
- * GDT or thread.fs being set instead). So, we let the host
- * fiddle the registers and thread struct and restore the
- * registers afterwards.
- *
- * So, the saved registers are stored to the process (this
- * needed because a stub may have been the last thing to run),
- * arch_prctl is run on the host, then the registers are read
- * back.
- */
- switch (option) {
- case ARCH_SET_FS:
- case ARCH_SET_GS:
- ret = restore_pid_registers(pid, &current->thread.regs.regs);
- if (ret)
- return ret;
- break;
- case ARCH_GET_FS:
- case ARCH_GET_GS:
- /*
- * With these two, we read to a local pointer and
- * put_user it to the userspace pointer that we were
- * given. If addr isn't valid (because it hasn't been
- * faulted in or is just bogus), we want put_user to
- * fault it in (or return -EFAULT) instead of having
- * the host return -EFAULT.
- */
- ptr = &tmp;
- }
-
- ret = os_arch_prctl(pid, option, ptr);
- if (ret)
- return ret;
+ long ret = -EINVAL;
switch (option) {
case ARCH_SET_FS:
- current->thread.arch.fs = (unsigned long) ptr;
- ret = save_registers(pid, &current->thread.regs.regs);
+ current->thread.regs.regs.gp[FS_BASE / sizeof(unsigned long)] =
+ (unsigned long) arg2;
+ ret = 0;
break;
case ARCH_SET_GS:
- ret = save_registers(pid, &current->thread.regs.regs);
+ current->thread.regs.regs.gp[GS_BASE / sizeof(unsigned long)] =
+ (unsigned long) arg2;
+ ret = 0;
break;
case ARCH_GET_FS:
- ret = put_user(tmp, arg2);
+ ret = put_user(current->thread.regs.regs.gp[FS_BASE / sizeof(unsigned long)], arg2);
break;
case ARCH_GET_GS:
- ret = put_user(tmp, arg2);
+ ret = put_user(current->thread.regs.regs.gp[GS_BASE / sizeof(unsigned long)], arg2);
break;
}
@@ -83,10 +47,10 @@ SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
void arch_switch_to(struct task_struct *to)
{
- if ((to->thread.arch.fs == 0) || (to->mm == NULL))
- return;
-
- arch_prctl(to, ARCH_SET_FS, (void __user *) to->thread.arch.fs);
+ /*
+ * Nothing needs to be done on x86_64.
+ * The FS_BASE/GS_BASE registers are saved in the ptrace register set.
+ */
}
SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
diff --git a/arch/x86/um/sysrq_64.c b/arch/x86/um/sysrq_64.c
index ef1eb4f4f612..0bf6de40abff 100644
--- a/arch/x86/um/sysrq_64.c
+++ b/arch/x86/um/sysrq_64.c
@@ -6,6 +6,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
+#include <linux/pid.h>
#include <linux/sched.h>
#include <linux/sched/debug.h>
#include <linux/utsname.h>
diff --git a/arch/x86/um/tls_64.c b/arch/x86/um/tls_64.c
index ebd3855d9b13..c51a613f6f5c 100644
--- a/arch/x86/um/tls_64.c
+++ b/arch/x86/um/tls_64.c
@@ -12,7 +12,7 @@ int arch_set_tls(struct task_struct *t, unsigned long tls)
* If CLONE_SETTLS is set, we need to save the thread id
* so it can be set during context switches.
*/
- t->thread.arch.fs = tls;
+ t->thread.regs.regs.gp[FS_BASE / sizeof(unsigned long)] = tls;
return 0;
}
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 9b1ec5d8c99c..a65fc2ae15b4 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -9,6 +9,7 @@ config XEN
select PARAVIRT_CLOCK
select X86_HV_CALLBACK_VECTOR
depends on X86_64 || (X86_32 && X86_PAE)
+ depends on X86_64 || (X86_GENERIC || MPENTIUM4 || MCORE2 || MATOM || MK8)
depends on X86_LOCAL_APIC && X86_TSC
help
This is the Linux Xen port. Enabling this will allow the
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 0337392a3121..3c61bb98c10e 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -33,9 +33,12 @@ EXPORT_SYMBOL_GPL(hypercall_page);
* and xen_vcpu_setup for details. By default it points to share_info->vcpu_info
* but during boot it is switched to point to xen_vcpu_info.
* The pointer is used in xen_evtchn_do_upcall to acknowledge pending events.
+ * Make sure that xen_vcpu_info doesn't cross a page boundary by making it
+ * cache-line aligned (the struct is guaranteed to have a size of 64 bytes,
+ * which matches the cache line size of 64-bit x86 processors).
*/
DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
-DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
+DEFINE_PER_CPU_ALIGNED(struct vcpu_info, xen_vcpu_info);
/* Linux <-> Xen vCPU id mapping */
DEFINE_PER_CPU(uint32_t, xen_vcpu_id);
@@ -160,6 +163,7 @@ void xen_vcpu_setup(int cpu)
int err;
struct vcpu_info *vcpup;
+ BUILD_BUG_ON(sizeof(*vcpup) > SMP_CACHE_BYTES);
BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
/*
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index bbbfdd495ebd..aeb33e0a3f76 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -704,7 +704,7 @@ static struct trap_array_entry trap_array[] = {
TRAP_ENTRY(exc_int3, false ),
TRAP_ENTRY(exc_overflow, false ),
#ifdef CONFIG_IA32_EMULATION
- { entry_INT80_compat, xen_entry_INT80_compat, false },
+ TRAP_ENTRY(int80_emulation, false ),
#endif
TRAP_ENTRY(exc_page_fault, false ),
TRAP_ENTRY(exc_divide_error, false ),
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 6092fea7d651..39982f955cfe 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -45,7 +45,7 @@ static const typeof(pv_ops) xen_irq_ops __initconst = {
/* Initial interrupt flag handling only called while interrupts off. */
.save_fl = __PV_IS_CALLEE_SAVE(paravirt_ret0),
.irq_disable = __PV_IS_CALLEE_SAVE(paravirt_nop),
- .irq_enable = __PV_IS_CALLEE_SAVE(paravirt_BUG),
+ .irq_enable = __PV_IS_CALLEE_SAVE(BUG_func),
.safe_halt = xen_safe_halt,
.halt = xen_halt,
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index b6830554ff69..72af496a160c 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -34,7 +34,7 @@
* would need to validate the whole pagetable before going on.
* Naturally, this is quite slow. The solution is to "pin" a
* pagetable, which enforces all the constraints on the pagetable even
- * when it is not actively in use. This menas that Xen can be assured
+ * when it is not actively in use. This means that Xen can be assured
* that it is still valid when you do load it into %cr3, and doesn't
* need to revalidate it.
*
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index 9e5e68008785..1a9cd18dfbd3 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -156,7 +156,7 @@ xen_pv_trap asm_xenpv_exc_machine_check
#endif /* CONFIG_X86_MCE */
xen_pv_trap asm_exc_simd_coprocessor_error
#ifdef CONFIG_IA32_EMULATION
-xen_pv_trap entry_INT80_compat
+xen_pv_trap asm_int80_emulation
#endif
xen_pv_trap asm_exc_xen_unknown_trap
xen_pv_trap asm_exc_xen_hypervisor_callback
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 408a2aa66c69..a87ab36889e7 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -21,7 +21,7 @@ extern void *xen_initial_gdt;
struct trap_info;
void xen_copy_trap_info(struct trap_info *traps);
-DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info);
+DECLARE_PER_CPU_ALIGNED(struct vcpu_info, xen_vcpu_info);
DECLARE_PER_CPU(unsigned long, xen_cr3);
DECLARE_PER_CPU(unsigned long, xen_current_cr3);