summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig3
-rw-r--r--arch/x86/Makefile2
-rw-r--r--arch/x86/boot/compressed/head_64.S170
-rw-r--r--arch/x86/boot/compressed/idt_64.c14
-rw-r--r--arch/x86/boot/compressed/mem_encrypt.S130
-rw-r--r--arch/x86/boot/compressed/misc.c9
-rw-r--r--arch/x86/boot/compressed/misc.h6
-rw-r--r--arch/x86/boot/compressed/sev-es.c12
-rw-r--r--arch/x86/crypto/poly1305_glue.c6
-rw-r--r--arch/x86/entry/entry_32.S6
-rw-r--r--arch/x86/entry/entry_64.S2
-rw-r--r--arch/x86/entry/vdso/vdso32/system_call.S2
-rw-r--r--arch/x86/events/intel/core.c2
-rw-r--r--arch/x86/events/intel/uncore_snbep.c61
-rw-r--r--arch/x86/include/asm/alternative-asm.h114
-rw-r--r--arch/x86/include/asm/alternative.h142
-rw-r--r--arch/x86/include/asm/cpufeature.h41
-rw-r--r--arch/x86/include/asm/cpufeatures.h4
-rw-r--r--arch/x86/include/asm/irqflags.h7
-rw-r--r--arch/x86/include/asm/kfence.h7
-rw-r--r--arch/x86/include/asm/mshyperv.h2
-rw-r--r--arch/x86/include/asm/nospec-branch.h1
-rw-r--r--arch/x86/include/asm/paravirt.h167
-rw-r--r--arch/x86/include/asm/paravirt_types.h210
-rw-r--r--arch/x86/include/asm/sgx.h (renamed from arch/x86/kernel/cpu/sgx/arch.h)50
-rw-r--r--arch/x86/include/asm/smap.h5
-rw-r--r--arch/x86/include/asm/smp.h1
-rw-r--r--arch/x86/include/asm/xen/page.h12
-rw-r--r--arch/x86/kernel/Makefile3
-rw-r--r--arch/x86/kernel/acpi/boot.c25
-rw-r--r--arch/x86/kernel/acpi/wakeup_64.S2
-rw-r--r--arch/x86/kernel/alternative.c52
-rw-r--r--arch/x86/kernel/asm-offsets.c7
-rw-r--r--arch/x86/kernel/cpu/cpuid-deps.c3
-rw-r--r--arch/x86/kernel/cpu/feat_ctl.c71
-rw-r--r--arch/x86/kernel/cpu/mce/inject.c6
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c8
-rw-r--r--arch/x86/kernel/cpu/scattered.c2
-rw-r--r--arch/x86/kernel/cpu/sgx/Makefile1
-rw-r--r--arch/x86/kernel/cpu/sgx/driver.c17
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.c33
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.h1
-rw-r--r--arch/x86/kernel/cpu/sgx/encls.h30
-rw-r--r--arch/x86/kernel/cpu/sgx/ioctl.c43
-rw-r--r--arch/x86/kernel/cpu/sgx/main.c264
-rw-r--r--arch/x86/kernel/cpu/sgx/sgx.h40
-rw-r--r--arch/x86/kernel/cpu/sgx/virt.c376
-rw-r--r--arch/x86/kernel/cpu/vmware.c7
-rw-r--r--arch/x86/kernel/crash.c2
-rw-r--r--arch/x86/kernel/kvm.c2
-rw-r--r--arch/x86/kernel/kvmclock.c2
-rw-r--r--arch/x86/kernel/paravirt-spinlocks.c9
-rw-r--r--arch/x86/kernel/paravirt.c75
-rw-r--r--arch/x86/kernel/paravirt_patch.c99
-rw-r--r--arch/x86/kernel/setup.c116
-rw-r--r--arch/x86/kernel/sev-es-shared.c16
-rw-r--r--arch/x86/kernel/sev-es.c38
-rw-r--r--arch/x86/kernel/smpboot.c26
-rw-r--r--arch/x86/kernel/tboot.c44
-rw-r--r--arch/x86/kernel/traps.c4
-rw-r--r--arch/x86/kernel/tsc.c3
-rw-r--r--arch/x86/kvm/Kconfig12
-rw-r--r--arch/x86/kvm/Makefile2
-rw-r--r--arch/x86/kvm/mmu/mmu.c9
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c26
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h24
-rw-r--r--arch/x86/kvm/svm/nested.c28
-rw-r--r--arch/x86/kvm/svm/pmu.c8
-rw-r--r--arch/x86/kvm/vmx/vmx.c10
-rw-r--r--arch/x86/kvm/x86.c77
-rw-r--r--arch/x86/kvm/x86.h1
-rw-r--r--arch/x86/lib/atomic64_386_32.S2
-rw-r--r--arch/x86/lib/atomic64_cx8_32.S2
-rw-r--r--arch/x86/lib/copy_page_64.S2
-rw-r--r--arch/x86/lib/copy_user_64.S2
-rw-r--r--arch/x86/lib/memcpy_64.S2
-rw-r--r--arch/x86/lib/memmove_64.S2
-rw-r--r--arch/x86/lib/memset_64.S2
-rw-r--r--arch/x86/lib/retpoline.S2
-rw-r--r--arch/x86/mm/mem_encrypt.c8
-rw-r--r--arch/x86/mm/mem_encrypt_identity.c35
-rw-r--r--arch/x86/net/bpf_jit_comp.c42
-rw-r--r--arch/x86/net/bpf_jit_comp32.c11
-rw-r--r--arch/x86/xen/enlighten_pv.c4
-rw-r--r--arch/x86/xen/p2m.c7
-rw-r--r--arch/x86/xen/setup.c16
-rw-r--r--arch/x86/xen/time.c26
87 files changed, 1890 insertions, 1087 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2792879d398e..489531ed236e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -777,6 +777,7 @@ if HYPERVISOR_GUEST
config PARAVIRT
bool "Enable paravirtualization code"
+ depends on HAVE_STATIC_CALL
help
This changes the kernel so it can modify itself when it is run
under a hypervisor, potentially improving performance significantly
@@ -1518,6 +1519,7 @@ config AMD_MEM_ENCRYPT
select ARCH_USE_MEMREMAP_PROT
select ARCH_HAS_FORCE_DMA_UNENCRYPTED
select INSTRUCTION_DECODER
+ select ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS
help
Say yes to enable support for the encryption of system memory.
This requires an AMD processor that supports Secure Memory
@@ -1931,6 +1933,7 @@ config X86_SGX
depends on CRYPTO_SHA256=y
select SRCU
select MMU_NOTIFIER
+ select NUMA_KEEP_MEMINFO if NUMA
help
Intel(R) Software Guard eXtensions (SGX) is a set of CPU instructions
that can be used by applications to set aside private regions of code
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 2d6d5a28c3bf..9a85eae37b17 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -27,7 +27,7 @@ endif
REALMODE_CFLAGS := -m16 -g -Os -DDISABLE_BRANCH_PROFILING \
-Wall -Wstrict-prototypes -march=i386 -mregparm=3 \
-fno-strict-aliasing -fomit-frame-pointer -fno-pic \
- -mno-mmx -mno-sse
+ -mno-mmx -mno-sse $(call cc-option,-fcf-protection=none)
REALMODE_CFLAGS += -ffreestanding
REALMODE_CFLAGS += -fno-stack-protector
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index a8c4095ee115..a2347ded77ea 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -34,6 +34,7 @@
#include <asm/asm-offsets.h>
#include <asm/bootparam.h>
#include <asm/desc_defs.h>
+#include <asm/trapnr.h>
#include "pgtable.h"
/*
@@ -107,9 +108,19 @@ SYM_FUNC_START(startup_32)
movl %eax, %gs
movl %eax, %ss
-/* setup a stack and make sure cpu supports long mode. */
+ /* Setup a stack and load CS from current GDT */
leal rva(boot_stack_end)(%ebp), %esp
+ pushl $__KERNEL32_CS
+ leal rva(1f)(%ebp), %eax
+ pushl %eax
+ lretl
+1:
+
+ /* Setup Exception handling for SEV-ES */
+ call startup32_load_idt
+
+ /* Make sure cpu supports long mode. */
call verify_cpu
testl %eax, %eax
jnz .Lno_longmode
@@ -172,11 +183,21 @@ SYM_FUNC_START(startup_32)
*/
call get_sev_encryption_bit
xorl %edx, %edx
+#ifdef CONFIG_AMD_MEM_ENCRYPT
testl %eax, %eax
jz 1f
subl $32, %eax /* Encryption bit is always above bit 31 */
bts %eax, %edx /* Set encryption mask for page tables */
+ /*
+ * Mark SEV as active in sev_status so that startup32_check_sev_cbit()
+ * will do a check. The sev_status memory will be fully initialized
+ * with the contents of MSR_AMD_SEV_STATUS later in
+ * set_sev_encryption_mask(). For now it is sufficient to know that SEV
+ * is active.
+ */
+ movl $1, rva(sev_status)(%ebp)
1:
+#endif
/* Initialize Page tables to 0 */
leal rva(pgtable)(%ebx), %edi
@@ -261,6 +282,9 @@ SYM_FUNC_START(startup_32)
movl %esi, %edx
1:
#endif
+ /* Check if the C-bit position is correct when SEV is active */
+ call startup32_check_sev_cbit
+
pushl $__KERNEL_CS
pushl %eax
@@ -694,6 +718,19 @@ SYM_DATA_START(boot_idt)
.endr
SYM_DATA_END_LABEL(boot_idt, SYM_L_GLOBAL, boot_idt_end)
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+SYM_DATA_START(boot32_idt_desc)
+ .word boot32_idt_end - boot32_idt - 1
+ .long 0
+SYM_DATA_END(boot32_idt_desc)
+ .balign 8
+SYM_DATA_START(boot32_idt)
+ .rept 32
+ .quad 0
+ .endr
+SYM_DATA_END_LABEL(boot32_idt, SYM_L_GLOBAL, boot32_idt_end)
+#endif
+
#ifdef CONFIG_EFI_STUB
SYM_DATA(image_offset, .long 0)
#endif
@@ -786,6 +823,137 @@ SYM_DATA_START_LOCAL(loaded_image_proto)
SYM_DATA_END(loaded_image_proto)
#endif
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ __HEAD
+ .code32
+/*
+ * Write an IDT entry into boot32_idt
+ *
+ * Parameters:
+ *
+ * %eax: Handler address
+ * %edx: Vector number
+ *
+ * Physical offset is expected in %ebp
+ */
+SYM_FUNC_START(startup32_set_idt_entry)
+ push %ebx
+ push %ecx
+
+ /* IDT entry address to %ebx */
+ leal rva(boot32_idt)(%ebp), %ebx
+ shl $3, %edx
+ addl %edx, %ebx
+
+ /* Build IDT entry, lower 4 bytes */
+ movl %eax, %edx
+ andl $0x0000ffff, %edx # Target code segment offset [15:0]
+ movl $__KERNEL32_CS, %ecx # Target code segment selector
+ shl $16, %ecx
+ orl %ecx, %edx
+
+ /* Store lower 4 bytes to IDT */
+ movl %edx, (%ebx)
+
+ /* Build IDT entry, upper 4 bytes */
+ movl %eax, %edx
+ andl $0xffff0000, %edx # Target code segment offset [31:16]
+ orl $0x00008e00, %edx # Present, Type 32-bit Interrupt Gate
+
+ /* Store upper 4 bytes to IDT */
+ movl %edx, 4(%ebx)
+
+ pop %ecx
+ pop %ebx
+ ret
+SYM_FUNC_END(startup32_set_idt_entry)
+#endif
+
+SYM_FUNC_START(startup32_load_idt)
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ /* #VC handler */
+ leal rva(startup32_vc_handler)(%ebp), %eax
+ movl $X86_TRAP_VC, %edx
+ call startup32_set_idt_entry
+
+ /* Load IDT */
+ leal rva(boot32_idt)(%ebp), %eax
+ movl %eax, rva(boot32_idt_desc+2)(%ebp)
+ lidt rva(boot32_idt_desc)(%ebp)
+#endif
+ ret
+SYM_FUNC_END(startup32_load_idt)
+
+/*
+ * Check for the correct C-bit position when the startup_32 boot-path is used.
+ *
+ * The check makes use of the fact that all memory is encrypted when paging is
+ * disabled. The function creates 64 bits of random data using the RDRAND
+ * instruction. RDRAND is mandatory for SEV guests, so always available. If the
+ * hypervisor violates that the kernel will crash right here.
+ *
+ * The 64 bits of random data are stored to a memory location and at the same
+ * time kept in the %eax and %ebx registers. Since encryption is always active
+ * when paging is off the random data will be stored encrypted in main memory.
+ *
+ * Then paging is enabled. When the C-bit position is correct all memory is
+ * still mapped encrypted and comparing the register values with memory will
+ * succeed. An incorrect C-bit position will map all memory unencrypted, so that
+ * the compare will use the encrypted random data and fail.
+ */
+SYM_FUNC_START(startup32_check_sev_cbit)
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+
+ /* Check for non-zero sev_status */
+ movl rva(sev_status)(%ebp), %eax
+ testl %eax, %eax
+ jz 4f
+
+ /*
+ * Get two 32-bit random values - Don't bail out if RDRAND fails
+ * because it is better to prevent forward progress if no random value
+ * can be gathered.
+ */
+1: rdrand %eax
+ jnc 1b
+2: rdrand %ebx
+ jnc 2b
+
+ /* Store to memory and keep it in the registers */
+ movl %eax, rva(sev_check_data)(%ebp)
+ movl %ebx, rva(sev_check_data+4)(%ebp)
+
+ /* Enable paging to see if encryption is active */
+ movl %cr0, %edx /* Backup %cr0 in %edx */
+ movl $(X86_CR0_PG | X86_CR0_PE), %ecx /* Enable Paging and Protected mode */
+ movl %ecx, %cr0
+
+ cmpl %eax, rva(sev_check_data)(%ebp)
+ jne 3f
+ cmpl %ebx, rva(sev_check_data+4)(%ebp)
+ jne 3f
+
+ movl %edx, %cr0 /* Restore previous %cr0 */
+
+ jmp 4f
+
+3: /* Check failed - hlt the machine */
+ hlt
+ jmp 3b
+
+4:
+ popl %edx
+ popl %ecx
+ popl %ebx
+ popl %eax
+#endif
+ ret
+SYM_FUNC_END(startup32_check_sev_cbit)
+
/*
* Stack and heap for uncompression
*/
diff --git a/arch/x86/boot/compressed/idt_64.c b/arch/x86/boot/compressed/idt_64.c
index 804a502ee0d2..9b93567d663a 100644
--- a/arch/x86/boot/compressed/idt_64.c
+++ b/arch/x86/boot/compressed/idt_64.c
@@ -52,3 +52,17 @@ void load_stage2_idt(void)
load_boot_idt(&boot_idt_desc);
}
+
+void cleanup_exception_handling(void)
+{
+ /*
+ * Flush GHCB from cache and map it encrypted again when running as
+ * SEV-ES guest.
+ */
+ sev_es_shutdown_ghcb();
+
+ /* Set a null-idt, disabling #PF and #VC handling */
+ boot_idt_desc.size = 0;
+ boot_idt_desc.address = 0;
+ load_boot_idt(&boot_idt_desc);
+}
diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S
index aa561795efd1..c1e81a848b2a 100644
--- a/arch/x86/boot/compressed/mem_encrypt.S
+++ b/arch/x86/boot/compressed/mem_encrypt.S
@@ -23,12 +23,6 @@ SYM_FUNC_START(get_sev_encryption_bit)
push %ecx
push %edx
- /* Check if running under a hypervisor */
- movl $1, %eax
- cpuid
- bt $31, %ecx /* Check the hypervisor bit */
- jnc .Lno_sev
-
movl $0x80000000, %eax /* CPUID to check the highest leaf */
cpuid
cmpl $0x8000001f, %eax /* See if 0x8000001f is available */
@@ -67,10 +61,132 @@ SYM_FUNC_START(get_sev_encryption_bit)
ret
SYM_FUNC_END(get_sev_encryption_bit)
+/**
+ * sev_es_req_cpuid - Request a CPUID value from the Hypervisor using
+ * the GHCB MSR protocol
+ *
+ * @%eax: Register to request (0=EAX, 1=EBX, 2=ECX, 3=EDX)
+ * @%edx: CPUID Function
+ *
+ * Returns 0 in %eax on success, non-zero on failure
+ * %edx returns CPUID value on success
+ */
+SYM_CODE_START_LOCAL(sev_es_req_cpuid)
+ shll $30, %eax
+ orl $0x00000004, %eax
+ movl $MSR_AMD64_SEV_ES_GHCB, %ecx
+ wrmsr
+ rep; vmmcall # VMGEXIT
+ rdmsr
+
+ /* Check response */
+ movl %eax, %ecx
+ andl $0x3ffff000, %ecx # Bits [12-29] MBZ
+ jnz 2f
+
+ /* Check return code */
+ andl $0xfff, %eax
+ cmpl $5, %eax
+ jne 2f
+
+ /* All good - return success */
+ xorl %eax, %eax
+1:
+ ret
+2:
+ movl $-1, %eax
+ jmp 1b
+SYM_CODE_END(sev_es_req_cpuid)
+
+SYM_CODE_START(startup32_vc_handler)
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+
+ /* Keep CPUID function in %ebx */
+ movl %eax, %ebx
+
+ /* Check if error-code == SVM_EXIT_CPUID */
+ cmpl $0x72, 16(%esp)
+ jne .Lfail
+
+ movl $0, %eax # Request CPUID[fn].EAX
+ movl %ebx, %edx # CPUID fn
+ call sev_es_req_cpuid # Call helper
+ testl %eax, %eax # Check return code
+ jnz .Lfail
+ movl %edx, 12(%esp) # Store result
+
+ movl $1, %eax # Request CPUID[fn].EBX
+ movl %ebx, %edx # CPUID fn
+ call sev_es_req_cpuid # Call helper
+ testl %eax, %eax # Check return code
+ jnz .Lfail
+ movl %edx, 8(%esp) # Store result
+
+ movl $2, %eax # Request CPUID[fn].ECX
+ movl %ebx, %edx # CPUID fn
+ call sev_es_req_cpuid # Call helper
+ testl %eax, %eax # Check return code
+ jnz .Lfail
+ movl %edx, 4(%esp) # Store result
+
+ movl $3, %eax # Request CPUID[fn].EDX
+ movl %ebx, %edx # CPUID fn
+ call sev_es_req_cpuid # Call helper
+ testl %eax, %eax # Check return code
+ jnz .Lfail
+ movl %edx, 0(%esp) # Store result
+
+ /*
+ * Sanity check CPUID results from the Hypervisor. See comment in
+ * do_vc_no_ghcb() for more details on why this is necessary.
+ */
+
+ /* Fail if SEV leaf not available in CPUID[0x80000000].EAX */
+ cmpl $0x80000000, %ebx
+ jne .Lcheck_sev
+ cmpl $0x8000001f, 12(%esp)
+ jb .Lfail
+ jmp .Ldone
+
+.Lcheck_sev:
+ /* Fail if SEV bit not set in CPUID[0x8000001f].EAX[1] */
+ cmpl $0x8000001f, %ebx
+ jne .Ldone
+ btl $1, 12(%esp)
+ jnc .Lfail
+
+.Ldone:
+ popl %edx
+ popl %ecx
+ popl %ebx
+ popl %eax
+
+ /* Remove error code */
+ addl $4, %esp
+
+ /* Jump over CPUID instruction */
+ addl $2, (%esp)
+
+ iret
+.Lfail:
+ /* Send terminate request to Hypervisor */
+ movl $0x100, %eax
+ xorl %edx, %edx
+ movl $MSR_AMD64_SEV_ES_GHCB, %ecx
+ wrmsr
+ rep; vmmcall
+
+ /* If request fails, go to hlt loop */
+ hlt
+ jmp .Lfail
+SYM_CODE_END(startup32_vc_handler)
+
.code64
#include "../../kernel/sev_verify_cbit.S"
-
SYM_FUNC_START(set_sev_encryption_mask)
#ifdef CONFIG_AMD_MEM_ENCRYPT
push %rbp
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 267e7f93050e..dde042f64cca 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -430,8 +430,6 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
error("Destination address too large");
#endif
#ifndef CONFIG_RELOCATABLE
- if ((unsigned long)output != LOAD_PHYSICAL_ADDR)
- error("Destination address does not match LOAD_PHYSICAL_ADDR");
if (virt_addr != LOAD_PHYSICAL_ADDR)
error("Destination virtual address changed when not relocatable");
#endif
@@ -443,11 +441,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
handle_relocations(output, output_len, virt_addr);
debug_putstr("done.\nBooting the kernel.\n");
- /*
- * Flush GHCB from cache and map it encrypted again when running as
- * SEV-ES guest.
- */
- sev_es_shutdown_ghcb();
+ /* Disable exception handling before booting the kernel */
+ cleanup_exception_handling();
return output;
}
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 901ea5ebec22..e5612f035498 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -155,6 +155,12 @@ extern pteval_t __default_kernel_pte_mask;
extern gate_desc boot_idt[BOOT_IDT_ENTRIES];
extern struct desc_ptr boot_idt_desc;
+#ifdef CONFIG_X86_64
+void cleanup_exception_handling(void);
+#else
+static inline void cleanup_exception_handling(void) { }
+#endif
+
/* IDT Entry Points */
void boot_page_fault(void);
void boot_stage1_vc(void);
diff --git a/arch/x86/boot/compressed/sev-es.c b/arch/x86/boot/compressed/sev-es.c
index 27826c265aab..d904bd56b3e3 100644
--- a/arch/x86/boot/compressed/sev-es.c
+++ b/arch/x86/boot/compressed/sev-es.c
@@ -200,14 +200,8 @@ void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code)
}
finish:
- if (result == ES_OK) {
+ if (result == ES_OK)
vc_finish_insn(&ctxt);
- } else if (result != ES_RETRY) {
- /*
- * For now, just halt the machine. That makes debugging easier,
- * later we just call sev_es_terminate() here.
- */
- while (true)
- asm volatile("hlt\n");
- }
+ else if (result != ES_RETRY)
+ sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
}
diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c
index 646da46e8d10..1dfb8af48a3c 100644
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -16,7 +16,7 @@
#include <asm/simd.h>
asmlinkage void poly1305_init_x86_64(void *ctx,
- const u8 key[POLY1305_KEY_SIZE]);
+ const u8 key[POLY1305_BLOCK_SIZE]);
asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp,
const size_t len, const u32 padbit);
asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
@@ -81,7 +81,7 @@ static void convert_to_base2_64(void *ctx)
state->is_base2_26 = 0;
}
-static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_KEY_SIZE])
+static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_BLOCK_SIZE])
{
poly1305_init_x86_64(ctx, key);
}
@@ -129,7 +129,7 @@ static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
poly1305_emit_avx(ctx, mac, nonce);
}
-void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
+void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE])
{
poly1305_simd_init(&dctx->h, key);
dctx->s[0] = get_unaligned_le32(&key[16]);
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index cc7745aa4630..ff0034740900 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -40,7 +40,7 @@
#include <asm/processor-flags.h>
#include <asm/irq_vectors.h>
#include <asm/cpufeatures.h>
-#include <asm/alternative-asm.h>
+#include <asm/alternative.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/frame.h>
@@ -430,7 +430,7 @@
* will soon execute iret and the tracer was already set to
* the irqstate after the IRET:
*/
- DISABLE_INTERRUPTS(CLBR_ANY)
+ cli
lss (%esp), %esp /* switch to espfix segment */
.Lend_\@:
#endif /* CONFIG_X86_ESPFIX32 */
@@ -1077,7 +1077,7 @@ restore_all_switch_stack:
* when returning from IPI handler and when returning from
* scheduler to user-space.
*/
- INTERRUPT_RETURN
+ iret
.section .fixup, "ax"
SYM_CODE_START(asm_iret_error)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 0a7e9647e84a..a16a5294d55f 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -305,7 +305,7 @@ SYM_CODE_END(ret_from_fork)
.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
#ifdef CONFIG_DEBUG_ENTRY
pushq %rax
- SAVE_FLAGS(CLBR_RAX)
+ SAVE_FLAGS
testl $X86_EFLAGS_IF, %eax
jz .Lokay_\@
ud2
diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S
index b15adf7f5ef8..6ddd7a937b3e 100644
--- a/arch/x86/entry/vdso/vdso32/system_call.S
+++ b/arch/x86/entry/vdso/vdso32/system_call.S
@@ -6,7 +6,7 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/cpufeatures.h>
-#include <asm/alternative-asm.h>
+#include <asm/alternative.h>
.text
.globl __kernel_vsyscall
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index f9b638e72e20..3fd69bd5fa6e 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4516,7 +4516,7 @@ static const struct x86_cpu_desc isolation_ucodes[] = {
INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 3, 0x07000009),
INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 4, 0x0f000009),
INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 5, 0x0e000002),
- INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_X, 2, 0x0b000014),
+ INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_X, 1, 0x0b000014),
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 3, 0x00000021),
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 4, 0x00000000),
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 5, 0x00000000),
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 324158107b22..4bba0491068c 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -1159,7 +1159,6 @@ enum {
SNBEP_PCI_QPI_PORT0_FILTER,
SNBEP_PCI_QPI_PORT1_FILTER,
BDX_PCI_QPI_PORT2_FILTER,
- HSWEP_PCI_PCU_3,
};
static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event)
@@ -2857,22 +2856,33 @@ static struct intel_uncore_type *hswep_msr_uncores[] = {
NULL,
};
-void hswep_uncore_cpu_init(void)
+#define HSWEP_PCU_DID 0x2fc0
+#define HSWEP_PCU_CAPID4_OFFET 0x94
+#define hswep_get_chop(_cap) (((_cap) >> 6) & 0x3)
+
+static bool hswep_has_limit_sbox(unsigned int device)
{
- int pkg = boot_cpu_data.logical_proc_id;
+ struct pci_dev *dev = pci_get_device(PCI_VENDOR_ID_INTEL, device, NULL);
+ u32 capid4;
+
+ if (!dev)
+ return false;
+
+ pci_read_config_dword(dev, HSWEP_PCU_CAPID4_OFFET, &capid4);
+ if (!hswep_get_chop(capid4))
+ return true;
+ return false;
+}
+
+void hswep_uncore_cpu_init(void)
+{
if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
/* Detect 6-8 core systems with only two SBOXes */
- if (uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3]) {
- u32 capid4;
-
- pci_read_config_dword(uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3],
- 0x94, &capid4);
- if (((capid4 >> 6) & 0x3) == 0)
- hswep_uncore_sbox.num_boxes = 2;
- }
+ if (hswep_has_limit_sbox(HSWEP_PCU_DID))
+ hswep_uncore_sbox.num_boxes = 2;
uncore_msr_uncores = hswep_msr_uncores;
}
@@ -3135,11 +3145,6 @@ static const struct pci_device_id hswep_uncore_pci_ids[] = {
.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
SNBEP_PCI_QPI_PORT1_FILTER),
},
- { /* PCU.3 (for Capability registers) */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fc0),
- .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
- HSWEP_PCI_PCU_3),
- },
{ /* end: all zeroes */ }
};
@@ -3231,27 +3236,18 @@ static struct event_constraint bdx_uncore_pcu_constraints[] = {
EVENT_CONSTRAINT_END
};
+#define BDX_PCU_DID 0x6fc0
+
void bdx_uncore_cpu_init(void)
{
- int pkg = topology_phys_to_logical_pkg(boot_cpu_data.phys_proc_id);
-
if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
uncore_msr_uncores = bdx_msr_uncores;
- /* BDX-DE doesn't have SBOX */
- if (boot_cpu_data.x86_model == 86) {
- uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL;
/* Detect systems with no SBOXes */
- } else if (uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3]) {
- struct pci_dev *pdev;
- u32 capid4;
-
- pdev = uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3];
- pci_read_config_dword(pdev, 0x94, &capid4);
- if (((capid4 >> 6) & 0x3) == 0)
- bdx_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL;
- }
+ if ((boot_cpu_data.x86_model == 86) || hswep_has_limit_sbox(BDX_PCU_DID))
+ uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL;
+
hswep_uncore_pcu.constraints = bdx_uncore_pcu_constraints;
}
@@ -3472,11 +3468,6 @@ static const struct pci_device_id bdx_uncore_pci_ids[] = {
.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
BDX_PCI_QPI_PORT2_FILTER),
},
- { /* PCU.3 (for Capability registers) */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fc0),
- .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
- HSWEP_PCI_PCU_3),
- },
{ /* end: all zeroes */ }
};
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
deleted file mode 100644
index 464034db299f..000000000000
--- a/arch/x86/include/asm/alternative-asm.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_ALTERNATIVE_ASM_H
-#define _ASM_X86_ALTERNATIVE_ASM_H
-
-#ifdef __ASSEMBLY__
-
-#include <asm/asm.h>
-
-#ifdef CONFIG_SMP
- .macro LOCK_PREFIX
-672: lock
- .pushsection .smp_locks,"a"
- .balign 4
- .long 672b - .
- .popsection
- .endm
-#else
- .macro LOCK_PREFIX
- .endm
-#endif
-
-/*
- * objtool annotation to ignore the alternatives and only consider the original
- * instruction(s).
- */
-.macro ANNOTATE_IGNORE_ALTERNATIVE
- .Lannotate_\@:
- .pushsection .discard.ignore_alts
- .long .Lannotate_\@ - .
- .popsection
-.endm
-
-/*
- * Issue one struct alt_instr descriptor entry (need to put it into
- * the section .altinstructions, see below). This entry contains
- * enough information for the alternatives patching code to patch an
- * instruction. See apply_alternatives().
- */
-.macro altinstruction_entry orig alt feature orig_len alt_len pad_len
- .long \orig - .
- .long \alt - .
- .word \feature
- .byte \orig_len
- .byte \alt_len
- .byte \pad_len
-.endm
-
-/*
- * Define an alternative between two instructions. If @feature is
- * present, early code in apply_alternatives() replaces @oldinstr with
- * @newinstr. ".skip" directive takes care of proper instruction padding
- * in case @newinstr is longer than @oldinstr.
- */
-.macro ALTERNATIVE oldinstr, newinstr, feature
-140:
- \oldinstr
-141:
- .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90
-142:
-
- .pushsection .altinstructions,"a"
- altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b
- .popsection
-
- .pushsection .altinstr_replacement,"ax"
-143:
- \newinstr
-144:
- .popsection
-.endm
-
-#define old_len 141b-140b
-#define new_len1 144f-143f
-#define new_len2 145f-144f
-
-/*
- * gas compatible max based on the idea from:
- * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
- *
- * The additional "-" is needed because gas uses a "true" value of -1.
- */
-#define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b)))))
-
-
-/*
- * Same as ALTERNATIVE macro above but for two alternatives. If CPU
- * has @feature1, it replaces @oldinstr with @newinstr1. If CPU has
- * @feature2, it replaces @oldinstr with @feature2.
- */
-.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2
-140:
- \oldinstr
-141:
- .skip -((alt_max_short(new_len1, new_len2) - (old_len)) > 0) * \
- (alt_max_short(new_len1, new_len2) - (old_len)),0x90
-142:
-
- .pushsection .altinstructions,"a"
- altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f,142b-141b
- altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f,142b-141b
- .popsection
-
- .pushsection .altinstr_replacement,"ax"
-143:
- \newinstr1
-144:
- \newinstr2
-145:
- .popsection
-.endm
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* _ASM_X86_ALTERNATIVE_ASM_H */
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 13adca37c99a..17b36090d448 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -2,13 +2,17 @@
#ifndef _ASM_X86_ALTERNATIVE_H
#define _ASM_X86_ALTERNATIVE_H
-#ifndef __ASSEMBLY__
-
#include <linux/types.h>
-#include <linux/stddef.h>
#include <linux/stringify.h>
#include <asm/asm.h>
+#define ALTINSTR_FLAG_INV (1 << 15)
+#define ALT_NOT(feat) ((feat) | ALTINSTR_FLAG_INV)
+
+#ifndef __ASSEMBLY__
+
+#include <linux/stddef.h>
+
/*
* Alternative inline assembly for SMP.
*
@@ -150,7 +154,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
" .byte " alt_rlen(num) "\n" /* replacement len */ \
" .byte " alt_pad_len "\n" /* pad len */
-#define ALTINSTR_REPLACEMENT(newinstr, feature, num) /* replacement */ \
+#define ALTINSTR_REPLACEMENT(newinstr, num) /* replacement */ \
"# ALT: replacement " #num "\n" \
b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n"
@@ -161,7 +165,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
ALTINSTR_ENTRY(feature, 1) \
".popsection\n" \
".pushsection .altinstr_replacement, \"ax\"\n" \
- ALTINSTR_REPLACEMENT(newinstr, feature, 1) \
+ ALTINSTR_REPLACEMENT(newinstr, 1) \
".popsection\n"
#define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\
@@ -171,10 +175,15 @@ static inline int alternatives_text_reserved(void *start, void *end)
ALTINSTR_ENTRY(feature2, 2) \
".popsection\n" \
".pushsection .altinstr_replacement, \"ax\"\n" \
- ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \
- ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \
+ ALTINSTR_REPLACEMENT(newinstr1, 1) \
+ ALTINSTR_REPLACEMENT(newinstr2, 2) \
".popsection\n"
+/* If @feature is set, patch in @newinstr_yes, otherwise @newinstr_no. */
+#define ALTERNATIVE_TERNARY(oldinstr, feature, newinstr_yes, newinstr_no) \
+ ALTERNATIVE_2(oldinstr, newinstr_no, X86_FEATURE_ALWAYS, \
+ newinstr_yes, feature)
+
#define ALTERNATIVE_3(oldinsn, newinsn1, feat1, newinsn2, feat2, newinsn3, feat3) \
OLDINSTR_3(oldinsn, 1, 2, 3) \
".pushsection .altinstructions,\"a\"\n" \
@@ -183,9 +192,9 @@ static inline int alternatives_text_reserved(void *start, void *end)
ALTINSTR_ENTRY(feat3, 3) \
".popsection\n" \
".pushsection .altinstr_replacement, \"ax\"\n" \
- ALTINSTR_REPLACEMENT(newinsn1, feat1, 1) \
- ALTINSTR_REPLACEMENT(newinsn2, feat2, 2) \
- ALTINSTR_REPLACEMENT(newinsn3, feat3, 3) \
+ ALTINSTR_REPLACEMENT(newinsn1, 1) \
+ ALTINSTR_REPLACEMENT(newinsn2, 2) \
+ ALTINSTR_REPLACEMENT(newinsn3, 3) \
".popsection\n"
/*
@@ -206,6 +215,9 @@ static inline int alternatives_text_reserved(void *start, void *end)
#define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \
asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory")
+#define alternative_ternary(oldinstr, feature, newinstr_yes, newinstr_no) \
+ asm_inline volatile(ALTERNATIVE_TERNARY(oldinstr, feature, newinstr_yes, newinstr_no) ::: "memory")
+
/*
* Alternative inline assembly with input.
*
@@ -271,6 +283,116 @@ static inline int alternatives_text_reserved(void *start, void *end)
*/
#define ASM_NO_INPUT_CLOBBER(clbr...) "i" (0) : clbr
+#else /* __ASSEMBLY__ */
+
+#ifdef CONFIG_SMP
+ .macro LOCK_PREFIX
+672: lock
+ .pushsection .smp_locks,"a"
+ .balign 4
+ .long 672b - .
+ .popsection
+ .endm
+#else
+ .macro LOCK_PREFIX
+ .endm
+#endif
+
+/*
+ * objtool annotation to ignore the alternatives and only consider the original
+ * instruction(s).
+ */
+.macro ANNOTATE_IGNORE_ALTERNATIVE
+ .Lannotate_\@:
+ .pushsection .discard.ignore_alts
+ .long .Lannotate_\@ - .
+ .popsection
+.endm
+
+/*
+ * Issue one struct alt_instr descriptor entry (need to put it into
+ * the section .altinstructions, see below). This entry contains
+ * enough information for the alternatives patching code to patch an
+ * instruction. See apply_alternatives().
+ */
+.macro altinstruction_entry orig alt feature orig_len alt_len pad_len
+ .long \orig - .
+ .long \alt - .
+ .word \feature
+ .byte \orig_len
+ .byte \alt_len
+ .byte \pad_len
+.endm
+
+/*
+ * Define an alternative between two instructions. If @feature is
+ * present, early code in apply_alternatives() replaces @oldinstr with
+ * @newinstr. ".skip" directive takes care of proper instruction padding
+ * in case @newinstr is longer than @oldinstr.
+ */
+.macro ALTERNATIVE oldinstr, newinstr, feature
+140:
+ \oldinstr
+141:
+ .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90
+142:
+
+ .pushsection .altinstructions,"a"
+ altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b
+ .popsection
+
+ .pushsection .altinstr_replacement,"ax"
+143:
+ \newinstr
+144:
+ .popsection
+.endm
+
+#define old_len 141b-140b
+#define new_len1 144f-143f
+#define new_len2 145f-144f
+
+/*
+ * gas compatible max based on the idea from:
+ * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
+ *
+ * The additional "-" is needed because gas uses a "true" value of -1.
+ */
+#define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b)))))
+
+
+/*
+ * Same as ALTERNATIVE macro above but for two alternatives. If CPU
+ * has @feature1, it replaces @oldinstr with @newinstr1. If CPU has
+ * @feature2, it replaces @oldinstr with @feature2.
+ */
+.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2
+140:
+ \oldinstr
+141:
+ .skip -((alt_max_short(new_len1, new_len2) - (old_len)) > 0) * \
+ (alt_max_short(new_len1, new_len2) - (old_len)),0x90
+142:
+
+ .pushsection .altinstructions,"a"
+ altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f,142b-141b
+ altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f,142b-141b
+ .popsection
+
+ .pushsection .altinstr_replacement,"ax"
+143:
+ \newinstr1
+144:
+ \newinstr2
+145:
+ .popsection
+.endm
+
+/* If @feature is set, patch in @newinstr_yes, otherwise @newinstr_no. */
+#define ALTERNATIVE_TERNARY(oldinstr, feature, newinstr_yes, newinstr_no) \
+ ALTERNATIVE_2 oldinstr, newinstr_no, X86_FEATURE_ALWAYS, \
+ newinstr_yes, feature
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_ALTERNATIVE_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 1728d4ce5730..16a51e7288d5 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -8,6 +8,7 @@
#include <asm/asm.h>
#include <linux/bitops.h>
+#include <asm/alternative.h>
enum cpuid_leafs
{
@@ -175,39 +176,15 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
*/
static __always_inline bool _static_cpu_has(u16 bit)
{
- asm_volatile_goto("1: jmp 6f\n"
- "2:\n"
- ".skip -(((5f-4f) - (2b-1b)) > 0) * "
- "((5f-4f) - (2b-1b)),0x90\n"
- "3:\n"
- ".section .altinstructions,\"a\"\n"
- " .long 1b - .\n" /* src offset */
- " .long 4f - .\n" /* repl offset */
- " .word %P[always]\n" /* always replace */
- " .byte 3b - 1b\n" /* src len */
- " .byte 5f - 4f\n" /* repl len */
- " .byte 3b - 2b\n" /* pad len */
- ".previous\n"
- ".section .altinstr_replacement,\"ax\"\n"
- "4: jmp %l[t_no]\n"
- "5:\n"
- ".previous\n"
- ".section .altinstructions,\"a\"\n"
- " .long 1b - .\n" /* src offset */
- " .long 0\n" /* no replacement */
- " .word %P[feature]\n" /* feature bit */
- " .byte 3b - 1b\n" /* src len */
- " .byte 0\n" /* repl len */
- " .byte 0\n" /* pad len */
- ".previous\n"
- ".section .altinstr_aux,\"ax\"\n"
- "6:\n"
- " testb %[bitnum],%[cap_byte]\n"
- " jnz %l[t_yes]\n"
- " jmp %l[t_no]\n"
- ".previous\n"
+ asm_volatile_goto(
+ ALTERNATIVE_TERNARY("jmp 6f", %P[feature], "", "jmp %l[t_no]")
+ ".section .altinstr_aux,\"ax\"\n"
+ "6:\n"
+ " testb %[bitnum],%[cap_byte]\n"
+ " jnz %l[t_yes]\n"
+ " jmp %l[t_no]\n"
+ ".previous\n"
: : [feature] "i" (bit),
- [always] "i" (X86_FEATURE_ALWAYS),
[bitnum] "i" (1 << (bit & 7)),
[cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3])
: : t_yes, t_no);
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index cc96e26d69f7..5e2624a9d890 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -236,6 +236,8 @@
#define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */
#define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */
#define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */
+#define X86_FEATURE_PVUNLOCK ( 8*32+20) /* "" PV unlock function */
+#define X86_FEATURE_VCPUPREEMPT ( 8*32+21) /* "" PV vcpu_is_preempted function */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */
#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/
@@ -290,6 +292,8 @@
#define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */
#define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* #AC for split lock */
#define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */
+#define X86_FEATURE_SGX1 (11*32+ 8) /* "" Basic SGX */
+#define X86_FEATURE_SGX2 (11*32+ 9) /* "" SGX Enclave Dynamic Memory Management (EDMM) */
/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 144d70ea4393..c5ce9845c999 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -109,18 +109,13 @@ static __always_inline unsigned long arch_local_irq_save(void)
}
#else
-#define ENABLE_INTERRUPTS(x) sti
-#define DISABLE_INTERRUPTS(x) cli
-
#ifdef CONFIG_X86_64
#ifdef CONFIG_DEBUG_ENTRY
-#define SAVE_FLAGS(x) pushfq; popq %rax
+#define SAVE_FLAGS pushfq; popq %rax
#endif
#define INTERRUPT_RETURN jmp native_iret
-#else
-#define INTERRUPT_RETURN iret
#endif
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/kfence.h b/arch/x86/include/asm/kfence.h
index 97bbb4a9083a..05b48b33baf0 100644
--- a/arch/x86/include/asm/kfence.h
+++ b/arch/x86/include/asm/kfence.h
@@ -56,8 +56,13 @@ static inline bool kfence_protect_page(unsigned long addr, bool protect)
else
set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
- /* Flush this CPU's TLB. */
+ /*
+ * Flush this CPU's TLB, assuming whoever did the allocation/free is
+ * likely to continue running on this CPU.
+ */
+ preempt_disable();
flush_tlb_one_kernel(addr);
+ preempt_enable();
return true;
}
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index ccf60a809a17..e7be720062a8 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -63,7 +63,7 @@ typedef int (*hyperv_fill_flush_list_func)(
static __always_inline void hv_setup_sched_clock(void *sched_clock)
{
#ifdef CONFIG_PARAVIRT
- pv_ops.time.sched_clock = sched_clock;
+ paravirt_set_sched_clock(sched_clock);
#endif
}
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index d83ea9e5db47..c14fb80b9a07 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -7,7 +7,6 @@
#include <linux/objtool.h>
#include <asm/alternative.h>
-#include <asm/alternative-asm.h>
#include <asm/cpufeatures.h>
#include <asm/msr-index.h>
#include <asm/unwind_hints.h>
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 4abf110e2243..43992e5c52c2 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -15,11 +15,20 @@
#include <linux/bug.h>
#include <linux/types.h>
#include <linux/cpumask.h>
+#include <linux/static_call_types.h>
#include <asm/frame.h>
-static inline unsigned long long paravirt_sched_clock(void)
+u64 dummy_steal_clock(int cpu);
+u64 dummy_sched_clock(void);
+
+DECLARE_STATIC_CALL(pv_steal_clock, dummy_steal_clock);
+DECLARE_STATIC_CALL(pv_sched_clock, dummy_sched_clock);
+
+void paravirt_set_sched_clock(u64 (*func)(void));
+
+static inline u64 paravirt_sched_clock(void)
{
- return PVOP_CALL0(unsigned long long, time.sched_clock);
+ return static_call(pv_sched_clock)();
}
struct static_key;
@@ -33,9 +42,13 @@ bool pv_is_native_vcpu_is_preempted(void);
static inline u64 paravirt_steal_clock(int cpu)
{
- return PVOP_CALL1(u64, time.steal_clock, cpu);
+ return static_call(pv_steal_clock)(cpu);
}
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+void __init paravirt_set_cap(void);
+#endif
+
/* The paravirtualized I/O functions */
static inline void slow_down_io(void)
{
@@ -122,7 +135,9 @@ static inline void write_cr0(unsigned long x)
static inline unsigned long read_cr2(void)
{
- return PVOP_CALLEE0(unsigned long, mmu.read_cr2);
+ return PVOP_ALT_CALLEE0(unsigned long, mmu.read_cr2,
+ "mov %%cr2, %%rax;",
+ ALT_NOT(X86_FEATURE_XENPV));
}
static inline void write_cr2(unsigned long x)
@@ -132,12 +147,14 @@ static inline void write_cr2(unsigned long x)
static inline unsigned long __read_cr3(void)
{
- return PVOP_CALL0(unsigned long, mmu.read_cr3);
+ return PVOP_ALT_CALL0(unsigned long, mmu.read_cr3,
+ "mov %%cr3, %%rax;", ALT_NOT(X86_FEATURE_XENPV));
}
static inline void write_cr3(unsigned long x)
{
- PVOP_VCALL1(mmu.write_cr3, x);
+ PVOP_ALT_VCALL1(mmu.write_cr3, x,
+ "mov %%rdi, %%cr3", ALT_NOT(X86_FEATURE_XENPV));
}
static inline void __write_cr4(unsigned long x)
@@ -157,7 +174,7 @@ static inline void halt(void)
static inline void wbinvd(void)
{
- PVOP_VCALL0(cpu.wbinvd);
+ PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT(X86_FEATURE_XENPV));
}
static inline u64 paravirt_read_msr(unsigned msr)
@@ -371,22 +388,28 @@ static inline void paravirt_release_p4d(unsigned long pfn)
static inline pte_t __pte(pteval_t val)
{
- return (pte_t) { PVOP_CALLEE1(pteval_t, mmu.make_pte, val) };
+ return (pte_t) { PVOP_ALT_CALLEE1(pteval_t, mmu.make_pte, val,
+ "mov %%rdi, %%rax",
+ ALT_NOT(X86_FEATURE_XENPV)) };
}
static inline pteval_t pte_val(pte_t pte)
{
- return PVOP_CALLEE1(pteval_t, mmu.pte_val, pte.pte);
+ return PVOP_ALT_CALLEE1(pteval_t, mmu.pte_val, pte.pte,
+ "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
}
static inline pgd_t __pgd(pgdval_t val)
{
- return (pgd_t) { PVOP_CALLEE1(pgdval_t, mmu.make_pgd, val) };
+ return (pgd_t) { PVOP_ALT_CALLEE1(pgdval_t, mmu.make_pgd, val,
+ "mov %%rdi, %%rax",
+ ALT_NOT(X86_FEATURE_XENPV)) };
}
static inline pgdval_t pgd_val(pgd_t pgd)
{
- return PVOP_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd);
+ return PVOP_ALT_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd,
+ "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
}
#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
@@ -419,12 +442,15 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
static inline pmd_t __pmd(pmdval_t val)
{
- return (pmd_t) { PVOP_CALLEE1(pmdval_t, mmu.make_pmd, val) };
+ return (pmd_t) { PVOP_ALT_CALLEE1(pmdval_t, mmu.make_pmd, val,
+ "mov %%rdi, %%rax",
+ ALT_NOT(X86_FEATURE_XENPV)) };
}
static inline pmdval_t pmd_val(pmd_t pmd)
{
- return PVOP_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd);
+ return PVOP_ALT_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd,
+ "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
}
static inline void set_pud(pud_t *pudp, pud_t pud)
@@ -436,14 +462,16 @@ static inline pud_t __pud(pudval_t val)
{
pudval_t ret;
- ret = PVOP_CALLEE1(pudval_t, mmu.make_pud, val);
+ ret = PVOP_ALT_CALLEE1(pudval_t, mmu.make_pud, val,
+ "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
return (pud_t) { ret };
}
static inline pudval_t pud_val(pud_t pud)
{
- return PVOP_CALLEE1(pudval_t, mmu.pud_val, pud.pud);
+ return PVOP_ALT_CALLEE1(pudval_t, mmu.pud_val, pud.pud,
+ "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
}
static inline void pud_clear(pud_t *pudp)
@@ -462,14 +490,17 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
static inline p4d_t __p4d(p4dval_t val)
{
- p4dval_t ret = PVOP_CALLEE1(p4dval_t, mmu.make_p4d, val);
+ p4dval_t ret = PVOP_ALT_CALLEE1(p4dval_t, mmu.make_p4d, val,
+ "mov %%rdi, %%rax",
+ ALT_NOT(X86_FEATURE_XENPV));
return (p4d_t) { ret };
}
static inline p4dval_t p4d_val(p4d_t p4d)
{
- return PVOP_CALLEE1(p4dval_t, mmu.p4d_val, p4d.p4d);
+ return PVOP_ALT_CALLEE1(p4dval_t, mmu.p4d_val, p4d.p4d,
+ "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
}
static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd)
@@ -556,7 +587,9 @@ static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock,
static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock)
{
- PVOP_VCALLEE1(lock.queued_spin_unlock, lock);
+ PVOP_ALT_VCALLEE1(lock.queued_spin_unlock, lock,
+ "movb $0, (%%" _ASM_ARG1 ");",
+ ALT_NOT(X86_FEATURE_PVUNLOCK));
}
static __always_inline void pv_wait(u8 *ptr, u8 val)
@@ -571,7 +604,9 @@ static __always_inline void pv_kick(int cpu)
static __always_inline bool pv_vcpu_is_preempted(long cpu)
{
- return PVOP_CALLEE1(bool, lock.vcpu_is_preempted, cpu);
+ return PVOP_ALT_CALLEE1(bool, lock.vcpu_is_preempted, cpu,
+ "xor %%" _ASM_AX ", %%" _ASM_AX ";",
+ ALT_NOT(X86_FEATURE_VCPUPREEMPT));
}
void __raw_callee_save___native_queued_spin_unlock(struct qspinlock *lock);
@@ -645,17 +680,18 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu);
#ifdef CONFIG_PARAVIRT_XXL
static inline notrace unsigned long arch_local_save_flags(void)
{
- return PVOP_CALLEE0(unsigned long, irq.save_fl);
+ return PVOP_ALT_CALLEE0(unsigned long, irq.save_fl, "pushf; pop %%rax;",
+ ALT_NOT(X86_FEATURE_XENPV));
}
static inline notrace void arch_local_irq_disable(void)
{
- PVOP_VCALLEE0(irq.irq_disable);
+ PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT(X86_FEATURE_XENPV));
}
static inline notrace void arch_local_irq_enable(void)
{
- PVOP_VCALLEE0(irq.irq_enable);
+ PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT(X86_FEATURE_XENPV));
}
static inline notrace unsigned long arch_local_irq_save(void)
@@ -700,84 +736,27 @@ extern void default_banner(void);
.popsection
-#define COND_PUSH(set, mask, reg) \
- .if ((~(set)) & mask); push %reg; .endif
-#define COND_POP(set, mask, reg) \
- .if ((~(set)) & mask); pop %reg; .endif
-
#ifdef CONFIG_X86_64
-
-#define PV_SAVE_REGS(set) \
- COND_PUSH(set, CLBR_RAX, rax); \
- COND_PUSH(set, CLBR_RCX, rcx); \
- COND_PUSH(set, CLBR_RDX, rdx); \
- COND_PUSH(set, CLBR_RSI, rsi); \
- COND_PUSH(set, CLBR_RDI, rdi); \
- COND_PUSH(set, CLBR_R8, r8); \
- COND_PUSH(set, CLBR_R9, r9); \
- COND_PUSH(set, CLBR_R10, r10); \
- COND_PUSH(set, CLBR_R11, r11)
-#define PV_RESTORE_REGS(set) \
- COND_POP(set, CLBR_R11, r11); \
- COND_POP(set, CLBR_R10, r10); \
- COND_POP(set, CLBR_R9, r9); \
- COND_POP(set, CLBR_R8, r8); \
- COND_POP(set, CLBR_RDI, rdi); \
- COND_POP(set, CLBR_RSI, rsi); \
- COND_POP(set, CLBR_RDX, rdx); \
- COND_POP(set, CLBR_RCX, rcx); \
- COND_POP(set, CLBR_RAX, rax)
+#ifdef CONFIG_PARAVIRT_XXL
#define PARA_PATCH(off) ((off) / 8)
#define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .quad, 8)
#define PARA_INDIRECT(addr) *addr(%rip)
-#else
-#define PV_SAVE_REGS(set) \
- COND_PUSH(set, CLBR_EAX, eax); \
- COND_PUSH(set, CLBR_EDI, edi); \
- COND_PUSH(set, CLBR_ECX, ecx); \
- COND_PUSH(set, CLBR_EDX, edx)
-#define PV_RESTORE_REGS(set) \
- COND_POP(set, CLBR_EDX, edx); \
- COND_POP(set, CLBR_ECX, ecx); \
- COND_POP(set, CLBR_EDI, edi); \
- COND_POP(set, CLBR_EAX, eax)
-
-#define PARA_PATCH(off) ((off) / 4)
-#define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .long, 4)
-#define PARA_INDIRECT(addr) *%cs:addr
-#endif
-#ifdef CONFIG_PARAVIRT_XXL
#define INTERRUPT_RETURN \
- PARA_SITE(PARA_PATCH(PV_CPU_iret), \
- ANNOTATE_RETPOLINE_SAFE; \
- jmp PARA_INDIRECT(pv_ops+PV_CPU_iret);)
-
-#define DISABLE_INTERRUPTS(clobbers) \
- PARA_SITE(PARA_PATCH(PV_IRQ_irq_disable), \
- PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
- ANNOTATE_RETPOLINE_SAFE; \
- call PARA_INDIRECT(pv_ops+PV_IRQ_irq_disable); \
- PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
-
-#define ENABLE_INTERRUPTS(clobbers) \
- PARA_SITE(PARA_PATCH(PV_IRQ_irq_enable), \
- PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
- ANNOTATE_RETPOLINE_SAFE; \
- call PARA_INDIRECT(pv_ops+PV_IRQ_irq_enable); \
- PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
-#endif
+ ANNOTATE_RETPOLINE_SAFE; \
+ ALTERNATIVE_TERNARY("jmp *paravirt_iret(%rip);", \
+ X86_FEATURE_XENPV, "jmp xen_iret;", "jmp native_iret;")
-#ifdef CONFIG_X86_64
-#ifdef CONFIG_PARAVIRT_XXL
#ifdef CONFIG_DEBUG_ENTRY
-#define SAVE_FLAGS(clobbers) \
- PARA_SITE(PARA_PATCH(PV_IRQ_save_fl), \
- PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
- ANNOTATE_RETPOLINE_SAFE; \
- call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl); \
- PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
+.macro PARA_IRQ_save_fl
+ PARA_SITE(PARA_PATCH(PV_IRQ_save_fl),
+ ANNOTATE_RETPOLINE_SAFE;
+ call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);)
+.endm
+
+#define SAVE_FLAGS ALTERNATIVE "PARA_IRQ_save_fl;", "pushf; pop %rax;", \
+ ALT_NOT(X86_FEATURE_XENPV)
#endif
#endif /* CONFIG_PARAVIRT_XXL */
#endif /* CONFIG_X86_64 */
@@ -800,5 +779,11 @@ static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
{
}
#endif
+
+#ifndef CONFIG_PARAVIRT_SPINLOCKS
+static inline void paravirt_set_cap(void)
+{
+}
+#endif
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_PARAVIRT_H */
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index c490308fbe6f..ae692c3194e9 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -3,7 +3,6 @@
#define _ASM_X86_PARAVIRT_TYPES_H
/* Bitmask of what can be clobbered: usually at least eax. */
-#define CLBR_NONE 0
#define CLBR_EAX (1 << 0)
#define CLBR_ECX (1 << 1)
#define CLBR_EDX (1 << 2)
@@ -15,7 +14,6 @@
#define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX)
#define CLBR_RET_REG (CLBR_EAX | CLBR_EDX)
-#define CLBR_SCRATCH (0)
#else
#define CLBR_RAX CLBR_EAX
#define CLBR_RCX CLBR_ECX
@@ -32,12 +30,9 @@
#define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \
CLBR_RCX | CLBR_R8 | CLBR_R9)
#define CLBR_RET_REG (CLBR_RAX)
-#define CLBR_SCRATCH (CLBR_R10 | CLBR_R11)
#endif /* X86_64 */
-#define CLBR_CALLEE_SAVE ((CLBR_ARG_REGS | CLBR_SCRATCH) & ~CLBR_RET_REG)
-
#ifndef __ASSEMBLY__
#include <asm/desc_defs.h>
@@ -73,19 +68,6 @@ struct pv_info {
const char *name;
};
-struct pv_init_ops {
- /*
- * Patch may replace one of the defined code sequences with
- * arbitrary code, subject to the same register constraints.
- * This generally means the code is not free to clobber any
- * registers other than EAX. The patch function should return
- * the number of bytes of code generated, as we nop pad the
- * rest in generic code.
- */
- unsigned (*patch)(u8 type, void *insn_buff,
- unsigned long addr, unsigned len);
-} __no_randomize_layout;
-
#ifdef CONFIG_PARAVIRT_XXL
struct pv_lazy_ops {
/* Set deferred update mode, used for batching operations. */
@@ -95,11 +77,6 @@ struct pv_lazy_ops {
} __no_randomize_layout;
#endif
-struct pv_time_ops {
- unsigned long long (*sched_clock)(void);
- unsigned long long (*steal_clock)(int cpu);
-} __no_randomize_layout;
-
struct pv_cpu_ops {
/* hooks for various privileged instructions */
void (*io_delay)(void);
@@ -156,10 +133,6 @@ struct pv_cpu_ops {
u64 (*read_pmc)(int counter);
- /* Normal iret. Jump to this with the standard iret stack
- frame set up. */
- void (*iret)(void);
-
void (*start_context_switch)(struct task_struct *prev);
void (*end_context_switch)(struct task_struct *next);
#endif
@@ -290,8 +263,6 @@ struct pv_lock_ops {
* number for each function using the offset which we use to indicate
* what to patch. */
struct paravirt_patch_template {
- struct pv_init_ops init;
- struct pv_time_ops time;
struct pv_cpu_ops cpu;
struct pv_irq_ops irq;
struct pv_mmu_ops mmu;
@@ -300,6 +271,7 @@ struct paravirt_patch_template {
extern struct pv_info pv_info;
extern struct paravirt_patch_template pv_ops;
+extern void (*paravirt_iret)(void);
#define PARAVIRT_PATCH(x) \
(offsetof(struct paravirt_patch_template, x) / sizeof(void *))
@@ -331,11 +303,7 @@ extern struct paravirt_patch_template pv_ops;
/* Simple instruction patching code. */
#define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t"
-unsigned paravirt_patch_ident_64(void *insn_buff, unsigned len);
-unsigned paravirt_patch_default(u8 type, void *insn_buff, unsigned long addr, unsigned len);
-unsigned paravirt_patch_insns(void *insn_buff, unsigned len, const char *start, const char *end);
-
-unsigned native_patch(u8 type, void *insn_buff, unsigned long addr, unsigned len);
+unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr, unsigned int len);
int paravirt_disable_iospace(void);
@@ -414,11 +382,9 @@ int paravirt_disable_iospace(void);
* makes sure the incoming and outgoing types are always correct.
*/
#ifdef CONFIG_X86_32
-#define PVOP_VCALL_ARGS \
+#define PVOP_CALL_ARGS \
unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx;
-#define PVOP_CALL_ARGS PVOP_VCALL_ARGS
-
#define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x))
#define PVOP_CALL_ARG2(x) "d" ((unsigned long)(x))
#define PVOP_CALL_ARG3(x) "c" ((unsigned long)(x))
@@ -434,12 +400,10 @@ int paravirt_disable_iospace(void);
#define VEXTRA_CLOBBERS
#else /* CONFIG_X86_64 */
/* [re]ax isn't an arg, but the return val */
-#define PVOP_VCALL_ARGS \
+#define PVOP_CALL_ARGS \
unsigned long __edi = __edi, __esi = __esi, \
__edx = __edx, __ecx = __ecx, __eax = __eax;
-#define PVOP_CALL_ARGS PVOP_VCALL_ARGS
-
#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x))
#define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x))
#define PVOP_CALL_ARG3(x) "d" ((unsigned long)(x))
@@ -464,152 +428,138 @@ int paravirt_disable_iospace(void);
#define PVOP_TEST_NULL(op) ((void)pv_ops.op)
#endif
-#define PVOP_RETMASK(rettype) \
+#define PVOP_RETVAL(rettype) \
({ unsigned long __mask = ~0UL; \
+ BUILD_BUG_ON(sizeof(rettype) > sizeof(unsigned long)); \
switch (sizeof(rettype)) { \
case 1: __mask = 0xffUL; break; \
case 2: __mask = 0xffffUL; break; \
case 4: __mask = 0xffffffffUL; break; \
default: break; \
} \
- __mask; \
+ __mask & __eax; \
})
-#define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr, \
- pre, post, ...) \
+#define ____PVOP_CALL(ret, op, clbr, call_clbr, extra_clbr, ...) \
({ \
- rettype __ret; \
PVOP_CALL_ARGS; \
PVOP_TEST_NULL(op); \
- /* This is 32-bit specific, but is okay in 64-bit */ \
- /* since this condition will never hold */ \
- if (sizeof(rettype) > sizeof(unsigned long)) { \
- asm volatile(pre \
- paravirt_alt(PARAVIRT_CALL) \
- post \
- : call_clbr, ASM_CALL_CONSTRAINT \
- : paravirt_type(op), \
- paravirt_clobber(clbr), \
- ##__VA_ARGS__ \
- : "memory", "cc" extra_clbr); \
- __ret = (rettype)((((u64)__edx) << 32) | __eax); \
- } else { \
- asm volatile(pre \
- paravirt_alt(PARAVIRT_CALL) \
- post \
- : call_clbr, ASM_CALL_CONSTRAINT \
- : paravirt_type(op), \
- paravirt_clobber(clbr), \
- ##__VA_ARGS__ \
- : "memory", "cc" extra_clbr); \
- __ret = (rettype)(__eax & PVOP_RETMASK(rettype)); \
- } \
- __ret; \
+ asm volatile(paravirt_alt(PARAVIRT_CALL) \
+ : call_clbr, ASM_CALL_CONSTRAINT \
+ : paravirt_type(op), \
+ paravirt_clobber(clbr), \
+ ##__VA_ARGS__ \
+ : "memory", "cc" extra_clbr); \
+ ret; \
})
-#define __PVOP_CALL(rettype, op, pre, post, ...) \
- ____PVOP_CALL(rettype, op, CLBR_ANY, PVOP_CALL_CLOBBERS, \
- EXTRA_CLOBBERS, pre, post, ##__VA_ARGS__)
-
-#define __PVOP_CALLEESAVE(rettype, op, pre, post, ...) \
- ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \
- PVOP_CALLEE_CLOBBERS, , \
- pre, post, ##__VA_ARGS__)
-
-
-#define ____PVOP_VCALL(op, clbr, call_clbr, extra_clbr, pre, post, ...) \
+#define ____PVOP_ALT_CALL(ret, op, alt, cond, clbr, call_clbr, \
+ extra_clbr, ...) \
({ \
- PVOP_VCALL_ARGS; \
+ PVOP_CALL_ARGS; \
PVOP_TEST_NULL(op); \
- asm volatile(pre \
- paravirt_alt(PARAVIRT_CALL) \
- post \
+ asm volatile(ALTERNATIVE(paravirt_alt(PARAVIRT_CALL), \
+ alt, cond) \
: call_clbr, ASM_CALL_CONSTRAINT \
: paravirt_type(op), \
paravirt_clobber(clbr), \
##__VA_ARGS__ \
: "memory", "cc" extra_clbr); \
+ ret; \
})
-#define __PVOP_VCALL(op, pre, post, ...) \
- ____PVOP_VCALL(op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \
- VEXTRA_CLOBBERS, \
- pre, post, ##__VA_ARGS__)
+#define __PVOP_CALL(rettype, op, ...) \
+ ____PVOP_CALL(PVOP_RETVAL(rettype), op, CLBR_ANY, \
+ PVOP_CALL_CLOBBERS, EXTRA_CLOBBERS, ##__VA_ARGS__)
+
+#define __PVOP_ALT_CALL(rettype, op, alt, cond, ...) \
+ ____PVOP_ALT_CALL(PVOP_RETVAL(rettype), op, alt, cond, CLBR_ANY,\
+ PVOP_CALL_CLOBBERS, EXTRA_CLOBBERS, \
+ ##__VA_ARGS__)
+
+#define __PVOP_CALLEESAVE(rettype, op, ...) \
+ ____PVOP_CALL(PVOP_RETVAL(rettype), op.func, CLBR_RET_REG, \
+ PVOP_CALLEE_CLOBBERS, , ##__VA_ARGS__)
+
+#define __PVOP_ALT_CALLEESAVE(rettype, op, alt, cond, ...) \
+ ____PVOP_ALT_CALL(PVOP_RETVAL(rettype), op.func, alt, cond, \
+ CLBR_RET_REG, PVOP_CALLEE_CLOBBERS, , ##__VA_ARGS__)
+
+
+#define __PVOP_VCALL(op, ...) \
+ (void)____PVOP_CALL(, op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \
+ VEXTRA_CLOBBERS, ##__VA_ARGS__)
+
+#define __PVOP_ALT_VCALL(op, alt, cond, ...) \
+ (void)____PVOP_ALT_CALL(, op, alt, cond, CLBR_ANY, \
+ PVOP_VCALL_CLOBBERS, VEXTRA_CLOBBERS, \
+ ##__VA_ARGS__)
-#define __PVOP_VCALLEESAVE(op, pre, post, ...) \
- ____PVOP_VCALL(op.func, CLBR_RET_REG, \
- PVOP_VCALLEE_CLOBBERS, , \
- pre, post, ##__VA_ARGS__)
+#define __PVOP_VCALLEESAVE(op, ...) \
+ (void)____PVOP_CALL(, op.func, CLBR_RET_REG, \
+ PVOP_VCALLEE_CLOBBERS, , ##__VA_ARGS__)
+#define __PVOP_ALT_VCALLEESAVE(op, alt, cond, ...) \
+ (void)____PVOP_ALT_CALL(, op.func, alt, cond, CLBR_RET_REG, \
+ PVOP_VCALLEE_CLOBBERS, , ##__VA_ARGS__)
#define PVOP_CALL0(rettype, op) \
- __PVOP_CALL(rettype, op, "", "")
+ __PVOP_CALL(rettype, op)
#define PVOP_VCALL0(op) \
- __PVOP_VCALL(op, "", "")
+ __PVOP_VCALL(op)
+#define PVOP_ALT_CALL0(rettype, op, alt, cond) \
+ __PVOP_ALT_CALL(rettype, op, alt, cond)
+#define PVOP_ALT_VCALL0(op, alt, cond) \
+ __PVOP_ALT_VCALL(op, alt, cond)
#define PVOP_CALLEE0(rettype, op) \
- __PVOP_CALLEESAVE(rettype, op, "", "")
+ __PVOP_CALLEESAVE(rettype, op)
#define PVOP_VCALLEE0(op) \
- __PVOP_VCALLEESAVE(op, "", "")
+ __PVOP_VCALLEESAVE(op)
+#define PVOP_ALT_CALLEE0(rettype, op, alt, cond) \
+ __PVOP_ALT_CALLEESAVE(rettype, op, alt, cond)
+#define PVOP_ALT_VCALLEE0(op, alt, cond) \
+ __PVOP_ALT_VCALLEESAVE(op, alt, cond)
#define PVOP_CALL1(rettype, op, arg1) \
- __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1))
+ __PVOP_CALL(rettype, op, PVOP_CALL_ARG1(arg1))
#define PVOP_VCALL1(op, arg1) \
- __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1))
+ __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1))
+#define PVOP_ALT_VCALL1(op, arg1, alt, cond) \
+ __PVOP_ALT_VCALL(op, alt, cond, PVOP_CALL_ARG1(arg1))
#define PVOP_CALLEE1(rettype, op, arg1) \
- __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1))
+ __PVOP_CALLEESAVE(rettype, op, PVOP_CALL_ARG1(arg1))
#define PVOP_VCALLEE1(op, arg1) \
- __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1))
+ __PVOP_VCALLEESAVE(op, PVOP_CALL_ARG1(arg1))
+#define PVOP_ALT_CALLEE1(rettype, op, arg1, alt, cond) \
+ __PVOP_ALT_CALLEESAVE(rettype, op, alt, cond, PVOP_CALL_ARG1(arg1))
+#define PVOP_ALT_VCALLEE1(op, arg1, alt, cond) \
+ __PVOP_ALT_VCALLEESAVE(op, alt, cond, PVOP_CALL_ARG1(arg1))
#define PVOP_CALL2(rettype, op, arg1, arg2) \
- __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \
- PVOP_CALL_ARG2(arg2))
+ __PVOP_CALL(rettype, op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2))
#define PVOP_VCALL2(op, arg1, arg2) \
- __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \
- PVOP_CALL_ARG2(arg2))
-
-#define PVOP_CALLEE2(rettype, op, arg1, arg2) \
- __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \
- PVOP_CALL_ARG2(arg2))
-#define PVOP_VCALLEE2(op, arg1, arg2) \
- __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1), \
- PVOP_CALL_ARG2(arg2))
-
+ __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2))
#define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \
- __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \
+ __PVOP_CALL(rettype, op, PVOP_CALL_ARG1(arg1), \
PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3))
#define PVOP_VCALL3(op, arg1, arg2, arg3) \
- __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \
+ __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), \
PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3))
-/* This is the only difference in x86_64. We can make it much simpler */
-#ifdef CONFIG_X86_32
#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \
__PVOP_CALL(rettype, op, \
- "push %[_arg4];", "lea 4(%%esp),%%esp;", \
- PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
- PVOP_CALL_ARG3(arg3), [_arg4] "mr" ((u32)(arg4)))
-#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \
- __PVOP_VCALL(op, \
- "push %[_arg4];", "lea 4(%%esp),%%esp;", \
- "0" ((u32)(arg1)), "1" ((u32)(arg2)), \
- "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))
-#else
-#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \
- __PVOP_CALL(rettype, op, "", "", \
PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \
- __PVOP_VCALL(op, "", "", \
- PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
+ __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
-#endif
/* Lazy mode for batching updates / context switch */
enum paravirt_lazy_mode {
diff --git a/arch/x86/kernel/cpu/sgx/arch.h b/arch/x86/include/asm/sgx.h
index 26315bea1cb4..9c31e0ebc55b 100644
--- a/arch/x86/kernel/cpu/sgx/arch.h
+++ b/arch/x86/include/asm/sgx.h
@@ -2,15 +2,20 @@
/**
* Copyright(c) 2016-20 Intel Corporation.
*
- * Contains data structures defined by the SGX architecture. Data structures
- * defined by the Linux software stack should not be placed here.
+ * Intel Software Guard Extensions (SGX) support.
*/
-#ifndef _ASM_X86_SGX_ARCH_H
-#define _ASM_X86_SGX_ARCH_H
+#ifndef _ASM_X86_SGX_H
+#define _ASM_X86_SGX_H
#include <linux/bits.h>
#include <linux/types.h>
+/*
+ * This file contains both data structures defined by SGX architecture and Linux
+ * defined software data structures and functions. The two should not be mixed
+ * together for better readibility. The architectural definitions come first.
+ */
+
/* The SGX specific CPUID function. */
#define SGX_CPUID 0x12
/* EPC enumeration. */
@@ -22,16 +27,36 @@
/* The bitmask for the EPC section type. */
#define SGX_CPUID_EPC_MASK GENMASK(3, 0)
+enum sgx_encls_function {
+ ECREATE = 0x00,
+ EADD = 0x01,
+ EINIT = 0x02,
+ EREMOVE = 0x03,
+ EDGBRD = 0x04,
+ EDGBWR = 0x05,
+ EEXTEND = 0x06,
+ ELDU = 0x08,
+ EBLOCK = 0x09,
+ EPA = 0x0A,
+ EWB = 0x0B,
+ ETRACK = 0x0C,
+ EAUG = 0x0D,
+ EMODPR = 0x0E,
+ EMODT = 0x0F,
+};
+
/**
* enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV
* %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not
* been completed yet.
+ * %SGX_CHILD_PRESENT SECS has child pages present in the EPC.
* %SGX_INVALID_EINITTOKEN: EINITTOKEN is invalid and enclave signer's
* public key does not match IA32_SGXLEPUBKEYHASH.
* %SGX_UNMASKED_EVENT: An unmasked event, e.g. INTR, was received
*/
enum sgx_return_code {
SGX_NOT_TRACKED = 11,
+ SGX_CHILD_PRESENT = 13,
SGX_INVALID_EINITTOKEN = 16,
SGX_UNMASKED_EVENT = 128,
};
@@ -335,4 +360,19 @@ struct sgx_sigstruct {
#define SGX_LAUNCH_TOKEN_SIZE 304
-#endif /* _ASM_X86_SGX_ARCH_H */
+/*
+ * Do not put any hardware-defined SGX structure representations below this
+ * comment!
+ */
+
+#ifdef CONFIG_X86_SGX_KVM
+int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
+ int *trapnr);
+int sgx_virt_einit(void __user *sigstruct, void __user *token,
+ void __user *secs, u64 *lepubkeyhash, int *trapnr);
+#endif
+
+int sgx_set_attribute(unsigned long *allowed_attributes,
+ unsigned int attribute_fd);
+
+#endif /* _ASM_X86_SGX_H */
diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h
index 0bc9b0895f33..d17b39893b79 100644
--- a/arch/x86/include/asm/smap.h
+++ b/arch/x86/include/asm/smap.h
@@ -11,6 +11,7 @@
#include <asm/nops.h>
#include <asm/cpufeatures.h>
+#include <asm/alternative.h>
/* "Raw" instruction opcodes */
#define __ASM_CLAC ".byte 0x0f,0x01,0xca"
@@ -18,8 +19,6 @@
#ifdef __ASSEMBLY__
-#include <asm/alternative-asm.h>
-
#ifdef CONFIG_X86_SMAP
#define ASM_CLAC \
@@ -37,8 +36,6 @@
#else /* __ASSEMBLY__ */
-#include <asm/alternative.h>
-
#ifdef CONFIG_X86_SMAP
static __always_inline void clac(void)
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index c0538f82c9a2..630ff08532be 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -132,6 +132,7 @@ void native_play_dead(void);
void play_dead_common(void);
void wbinvd_on_cpu(int cpu);
int wbinvd_on_all_cpus(void);
+void cond_wakeup_cpu0(void);
void native_smp_send_reschedule(int cpu);
void native_send_call_func_ipi(const struct cpumask *mask);
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 7068e4bb057d..1a162e559753 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -87,18 +87,6 @@ clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
#endif
/*
- * The maximum amount of extra memory compared to the base size. The
- * main scaling factor is the size of struct page. At extreme ratios
- * of base:extra, all the base memory can be filled with page
- * structures for the extra memory, leaving no space for anything
- * else.
- *
- * 10x seems like a reasonable balance between scaling flexibility and
- * leaving a practically usable system.
- */
-#define XEN_EXTRA_MEM_RATIO (10)
-
-/*
* Helper functions to write or read unsigned long values to/from
* memory, when the access may fault.
*/
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 2ddf08351f0b..0704c2a94272 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -35,7 +35,6 @@ KASAN_SANITIZE_sev-es.o := n
KCSAN_SANITIZE := n
OBJECT_FILES_NON_STANDARD_test_nx.o := y
-OBJECT_FILES_NON_STANDARD_paravirt_patch.o := y
ifdef CONFIG_FRAME_POINTER
OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y
@@ -121,7 +120,7 @@ obj-$(CONFIG_AMD_NB) += amd_nb.o
obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o
-obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch.o
+obj-$(CONFIG_PARAVIRT) += paravirt.o
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index c2eee6e8aaf2..e90310cbe73a 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1554,10 +1554,18 @@ void __init acpi_boot_table_init(void)
/*
* Initialize the ACPI boot-time table parser.
*/
- if (acpi_table_init()) {
+ if (acpi_locate_initial_tables())
disable_acpi();
- return;
- }
+ else
+ acpi_reserve_initial_tables();
+}
+
+int __init early_acpi_boot_init(void)
+{
+ if (acpi_disabled)
+ return 1;
+
+ acpi_table_init_complete();
acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
@@ -1570,18 +1578,9 @@ void __init acpi_boot_table_init(void)
} else {
printk(KERN_WARNING PREFIX "Disabling ACPI support\n");
disable_acpi();
- return;
+ return 1;
}
}
-}
-
-int __init early_acpi_boot_init(void)
-{
- /*
- * If acpi_disabled, bail out
- */
- if (acpi_disabled)
- return 1;
/*
* Process the Multiple APIC Description Table (MADT), if present
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index 56b6865afb2a..d5d8a352eafa 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -115,7 +115,7 @@ SYM_FUNC_START(do_suspend_lowlevel)
movq pt_regs_r14(%rax), %r14
movq pt_regs_r15(%rax), %r15
-#if defined(CONFIG_KASAN) && CONFIG_KASAN_STACK
+#if defined(CONFIG_KASAN) && defined(CONFIG_KASAN_STACK)
/*
* The suspend path may have poisoned some areas deeper in the stack,
* which we now need to unpoison.
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 8d778e46725d..f810e6fececd 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -28,6 +28,7 @@
#include <asm/insn.h>
#include <asm/io.h>
#include <asm/fixmap.h>
+#include <asm/paravirt.h>
int __read_mostly alternatives_patched;
@@ -388,21 +389,31 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
*/
for (a = start; a < end; a++) {
int insn_buff_sz = 0;
+ /* Mask away "NOT" flag bit for feature to test. */
+ u16 feature = a->cpuid & ~ALTINSTR_FLAG_INV;
instr = (u8 *)&a->instr_offset + a->instr_offset;
replacement = (u8 *)&a->repl_offset + a->repl_offset;
BUG_ON(a->instrlen > sizeof(insn_buff));
- BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
- if (!boot_cpu_has(a->cpuid)) {
+ BUG_ON(feature >= (NCAPINTS + NBUGINTS) * 32);
+
+ /*
+ * Patch if either:
+ * - feature is present
+ * - feature not present but ALTINSTR_FLAG_INV is set to mean,
+ * patch if feature is *NOT* present.
+ */
+ if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV)) {
if (a->padlen > 1)
optimize_nops(a, instr);
continue;
}
- DPRINTK("feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d), pad: %d",
- a->cpuid >> 5,
- a->cpuid & 0x1f,
+ DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d), pad: %d",
+ (a->cpuid & ALTINSTR_FLAG_INV) ? "!" : "",
+ feature >> 5,
+ feature & 0x1f,
instr, instr, a->instrlen,
replacement, a->replacementlen, a->padlen);
@@ -605,7 +616,7 @@ void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
BUG_ON(p->len > MAX_PATCH_LEN);
/* prep the buffer with the original instructions */
memcpy(insn_buff, p->instr, p->len);
- used = pv_ops.init.patch(p->type, insn_buff, (unsigned long)p->instr, p->len);
+ used = paravirt_patch(p->type, insn_buff, (unsigned long)p->instr, p->len);
BUG_ON(used > p->len);
@@ -723,6 +734,33 @@ void __init alternative_instructions(void)
* patching.
*/
+ /*
+ * Paravirt patching and alternative patching can be combined to
+ * replace a function call with a short direct code sequence (e.g.
+ * by setting a constant return value instead of doing that in an
+ * external function).
+ * In order to make this work the following sequence is required:
+ * 1. set (artificial) features depending on used paravirt
+ * functions which can later influence alternative patching
+ * 2. apply paravirt patching (generally replacing an indirect
+ * function call with a direct one)
+ * 3. apply alternative patching (e.g. replacing a direct function
+ * call with a custom code sequence)
+ * Doing paravirt patching after alternative patching would clobber
+ * the optimization of the custom code with a function call again.
+ */
+ paravirt_set_cap();
+
+ /*
+ * First patch paravirt functions, such that we overwrite the indirect
+ * call with the direct call.
+ */
+ apply_paravirt(__parainstructions, __parainstructions_end);
+
+ /*
+ * Then patch alternatives, such that those paravirt calls that are in
+ * alternatives can be overwritten by their immediate fragments.
+ */
apply_alternatives(__alt_instructions, __alt_instructions_end);
#ifdef CONFIG_SMP
@@ -741,8 +779,6 @@ void __init alternative_instructions(void)
}
#endif
- apply_paravirt(__parainstructions, __parainstructions_end);
-
restart_nmi();
alternatives_patched = 1;
}
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 60b9f42ce3c1..ecd3fd6993d1 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -61,13 +61,6 @@ static void __used common(void)
OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext);
#endif
-#ifdef CONFIG_PARAVIRT_XXL
- BLANK();
- OFFSET(PV_IRQ_irq_disable, paravirt_patch_template, irq.irq_disable);
- OFFSET(PV_IRQ_irq_enable, paravirt_patch_template, irq.irq_enable);
- OFFSET(PV_CPU_iret, paravirt_patch_template, cpu.iret);
-#endif
-
#ifdef CONFIG_XEN
BLANK();
OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index 42af31b64c2c..defda61f372d 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -72,6 +72,9 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW },
{ X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES },
{ X86_FEATURE_PER_THREAD_MBA, X86_FEATURE_MBA },
+ { X86_FEATURE_SGX_LC, X86_FEATURE_SGX },
+ { X86_FEATURE_SGX1, X86_FEATURE_SGX },
+ { X86_FEATURE_SGX2, X86_FEATURE_SGX1 },
{}
};
diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c
index 3b1b01f2b248..da696eb4821a 100644
--- a/arch/x86/kernel/cpu/feat_ctl.c
+++ b/arch/x86/kernel/cpu/feat_ctl.c
@@ -93,15 +93,9 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c)
}
#endif /* CONFIG_X86_VMX_FEATURE_NAMES */
-static void clear_sgx_caps(void)
-{
- setup_clear_cpu_cap(X86_FEATURE_SGX);
- setup_clear_cpu_cap(X86_FEATURE_SGX_LC);
-}
-
static int __init nosgx(char *str)
{
- clear_sgx_caps();
+ setup_clear_cpu_cap(X86_FEATURE_SGX);
return 0;
}
@@ -110,23 +104,30 @@ early_param("nosgx", nosgx);
void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
{
+ bool enable_sgx_kvm = false, enable_sgx_driver = false;
bool tboot = tboot_enabled();
- bool enable_sgx;
+ bool enable_vmx;
u64 msr;
if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) {
clear_cpu_cap(c, X86_FEATURE_VMX);
- clear_sgx_caps();
+ clear_cpu_cap(c, X86_FEATURE_SGX);
return;
}
- /*
- * Enable SGX if and only if the kernel supports SGX and Launch Control
- * is supported, i.e. disable SGX if the LE hash MSRs can't be written.
- */
- enable_sgx = cpu_has(c, X86_FEATURE_SGX) &&
- cpu_has(c, X86_FEATURE_SGX_LC) &&
- IS_ENABLED(CONFIG_X86_SGX);
+ enable_vmx = cpu_has(c, X86_FEATURE_VMX) &&
+ IS_ENABLED(CONFIG_KVM_INTEL);
+
+ if (cpu_has(c, X86_FEATURE_SGX) && IS_ENABLED(CONFIG_X86_SGX)) {
+ /*
+ * Separate out SGX driver enabling from KVM. This allows KVM
+ * guests to use SGX even if the kernel SGX driver refuses to
+ * use it. This happens if flexible Launch Control is not
+ * available.
+ */
+ enable_sgx_driver = cpu_has(c, X86_FEATURE_SGX_LC);
+ enable_sgx_kvm = enable_vmx && IS_ENABLED(CONFIG_X86_SGX_KVM);
+ }
if (msr & FEAT_CTL_LOCKED)
goto update_caps;
@@ -142,15 +143,18 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
* i.e. KVM is enabled, to avoid unnecessarily adding an attack vector
* for the kernel, e.g. using VMX to hide malicious code.
*/
- if (cpu_has(c, X86_FEATURE_VMX) && IS_ENABLED(CONFIG_KVM_INTEL)) {
+ if (enable_vmx) {
msr |= FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
if (tboot)
msr |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX;
}
- if (enable_sgx)
- msr |= FEAT_CTL_SGX_ENABLED | FEAT_CTL_SGX_LC_ENABLED;
+ if (enable_sgx_kvm || enable_sgx_driver) {
+ msr |= FEAT_CTL_SGX_ENABLED;
+ if (enable_sgx_driver)
+ msr |= FEAT_CTL_SGX_LC_ENABLED;
+ }
wrmsrl(MSR_IA32_FEAT_CTL, msr);
@@ -173,10 +177,29 @@ update_caps:
}
update_sgx:
- if (!(msr & FEAT_CTL_SGX_ENABLED) ||
- !(msr & FEAT_CTL_SGX_LC_ENABLED) || !enable_sgx) {
- if (enable_sgx)
- pr_err_once("SGX disabled by BIOS\n");
- clear_sgx_caps();
+ if (!(msr & FEAT_CTL_SGX_ENABLED)) {
+ if (enable_sgx_kvm || enable_sgx_driver)
+ pr_err_once("SGX disabled by BIOS.\n");
+ clear_cpu_cap(c, X86_FEATURE_SGX);
+ return;
+ }
+
+ /*
+ * VMX feature bit may be cleared due to being disabled in BIOS,
+ * in which case SGX virtualization cannot be supported either.
+ */
+ if (!cpu_has(c, X86_FEATURE_VMX) && enable_sgx_kvm) {
+ pr_err_once("SGX virtualization disabled due to lack of VMX.\n");
+ enable_sgx_kvm = 0;
+ }
+
+ if (!(msr & FEAT_CTL_SGX_LC_ENABLED) && enable_sgx_driver) {
+ if (!enable_sgx_kvm) {
+ pr_err_once("SGX Launch Control is locked. Disable SGX.\n");
+ clear_cpu_cap(c, X86_FEATURE_SGX);
+ } else {
+ pr_err_once("SGX Launch Control is locked. Support SGX virtualization only.\n");
+ clear_cpu_cap(c, X86_FEATURE_SGX_LC);
+ }
}
}
diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c
index 7b360731fc2d..4e86d97f9653 100644
--- a/arch/x86/kernel/cpu/mce/inject.c
+++ b/arch/x86/kernel/cpu/mce/inject.c
@@ -74,6 +74,7 @@ MCE_INJECT_SET(status);
MCE_INJECT_SET(misc);
MCE_INJECT_SET(addr);
MCE_INJECT_SET(synd);
+MCE_INJECT_SET(ipid);
#define MCE_INJECT_GET(reg) \
static int inj_##reg##_get(void *data, u64 *val) \
@@ -88,11 +89,13 @@ MCE_INJECT_GET(status);
MCE_INJECT_GET(misc);
MCE_INJECT_GET(addr);
MCE_INJECT_GET(synd);
+MCE_INJECT_GET(ipid);
DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n");
DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n");
DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n");
DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(ipid_fops, inj_ipid_get, inj_ipid_set, "%llx\n");
static void setup_inj_struct(struct mce *m)
{
@@ -629,6 +632,8 @@ static const char readme_msg[] =
"\t is present in hardware. \n"
"\t - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n"
"\t APIC interrupt handler to handle the error. \n"
+"\n"
+"ipid:\t IPID (AMD-specific)\n"
"\n";
static ssize_t
@@ -652,6 +657,7 @@ static struct dfs_node {
{ .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "synd", .fops = &synd_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "ipid", .fops = &ipid_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR },
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index b935e1b5f115..6a6318e9590c 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -629,16 +629,16 @@ static ssize_t reload_store(struct device *dev,
if (val != 1)
return size;
- tmp_ret = microcode_ops->request_microcode_fw(bsp, &microcode_pdev->dev, true);
- if (tmp_ret != UCODE_NEW)
- return size;
-
get_online_cpus();
ret = check_online_cpus();
if (ret)
goto put;
+ tmp_ret = microcode_ops->request_microcode_fw(bsp, &microcode_pdev->dev, true);
+ if (tmp_ret != UCODE_NEW)
+ goto put;
+
mutex_lock(&microcode_mutex);
ret = microcode_reload_late();
mutex_unlock(&microcode_mutex);
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 972ec3bfa9c0..21d1f062895a 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -36,6 +36,8 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 },
{ X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 },
{ X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 },
+ { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 },
+ { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 },
{ X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
{ X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile
index 91d3dc784a29..9c1656779b2a 100644
--- a/arch/x86/kernel/cpu/sgx/Makefile
+++ b/arch/x86/kernel/cpu/sgx/Makefile
@@ -3,3 +3,4 @@ obj-y += \
encl.o \
ioctl.o \
main.o
+obj-$(CONFIG_X86_SGX_KVM) += virt.o
diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
index 8ce6d8371cfb..aa9b8b868867 100644
--- a/arch/x86/kernel/cpu/sgx/driver.c
+++ b/arch/x86/kernel/cpu/sgx/driver.c
@@ -136,10 +136,6 @@ static const struct file_operations sgx_encl_fops = {
.get_unmapped_area = sgx_get_unmapped_area,
};
-const struct file_operations sgx_provision_fops = {
- .owner = THIS_MODULE,
-};
-
static struct miscdevice sgx_dev_enclave = {
.minor = MISC_DYNAMIC_MINOR,
.name = "sgx_enclave",
@@ -147,13 +143,6 @@ static struct miscdevice sgx_dev_enclave = {
.fops = &sgx_encl_fops,
};
-static struct miscdevice sgx_dev_provision = {
- .minor = MISC_DYNAMIC_MINOR,
- .name = "sgx_provision",
- .nodename = "sgx_provision",
- .fops = &sgx_provision_fops,
-};
-
int __init sgx_drv_init(void)
{
unsigned int eax, ebx, ecx, edx;
@@ -187,11 +176,5 @@ int __init sgx_drv_init(void)
if (ret)
return ret;
- ret = misc_register(&sgx_dev_provision);
- if (ret) {
- misc_deregister(&sgx_dev_enclave);
- return ret;
- }
-
return 0;
}
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 7449ef33f081..3be203297988 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -7,7 +7,7 @@
#include <linux/shmem_fs.h>
#include <linux/suspend.h>
#include <linux/sched/mm.h>
-#include "arch.h"
+#include <asm/sgx.h>
#include "encl.h"
#include "encls.h"
#include "sgx.h"
@@ -78,7 +78,7 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page,
ret = __sgx_encl_eldu(encl_page, epc_page, secs_page);
if (ret) {
- sgx_free_epc_page(epc_page);
+ sgx_encl_free_epc_page(epc_page);
return ERR_PTR(ret);
}
@@ -404,7 +404,7 @@ void sgx_encl_release(struct kref *ref)
if (sgx_unmark_page_reclaimable(entry->epc_page))
continue;
- sgx_free_epc_page(entry->epc_page);
+ sgx_encl_free_epc_page(entry->epc_page);
encl->secs_child_cnt--;
entry->epc_page = NULL;
}
@@ -415,7 +415,7 @@ void sgx_encl_release(struct kref *ref)
xa_destroy(&encl->page_array);
if (!encl->secs_child_cnt && encl->secs.epc_page) {
- sgx_free_epc_page(encl->secs.epc_page);
+ sgx_encl_free_epc_page(encl->secs.epc_page);
encl->secs.epc_page = NULL;
}
@@ -423,7 +423,7 @@ void sgx_encl_release(struct kref *ref)
va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
list);
list_del(&va_page->list);
- sgx_free_epc_page(va_page->epc_page);
+ sgx_encl_free_epc_page(va_page->epc_page);
kfree(va_page);
}
@@ -686,7 +686,7 @@ struct sgx_epc_page *sgx_alloc_va_page(void)
ret = __epa(sgx_get_epc_virt_addr(epc_page));
if (ret) {
WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret);
- sgx_free_epc_page(epc_page);
+ sgx_encl_free_epc_page(epc_page);
return ERR_PTR(-EFAULT);
}
@@ -735,3 +735,24 @@ bool sgx_va_page_full(struct sgx_va_page *va_page)
return slot == SGX_VA_SLOT_COUNT;
}
+
+/**
+ * sgx_encl_free_epc_page - free an EPC page assigned to an enclave
+ * @page: EPC page to be freed
+ *
+ * Free an EPC page assigned to an enclave. It does EREMOVE for the page, and
+ * only upon success, it puts the page back to free page list. Otherwise, it
+ * gives a WARNING to indicate page is leaked.
+ */
+void sgx_encl_free_epc_page(struct sgx_epc_page *page)
+{
+ int ret;
+
+ WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);
+
+ ret = __eremove(sgx_get_epc_virt_addr(page));
+ if (WARN_ONCE(ret, EREMOVE_ERROR_MESSAGE, ret, ret))
+ return;
+
+ sgx_free_epc_page(page);
+}
diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h
index d8d30ccbef4c..6e74f85b6264 100644
--- a/arch/x86/kernel/cpu/sgx/encl.h
+++ b/arch/x86/kernel/cpu/sgx/encl.h
@@ -115,5 +115,6 @@ struct sgx_epc_page *sgx_alloc_va_page(void);
unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page);
void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset);
bool sgx_va_page_full(struct sgx_va_page *va_page);
+void sgx_encl_free_epc_page(struct sgx_epc_page *page);
#endif /* _X86_ENCL_H */
diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h
index 443188fe7e70..9b204843b78d 100644
--- a/arch/x86/kernel/cpu/sgx/encls.h
+++ b/arch/x86/kernel/cpu/sgx/encls.h
@@ -11,21 +11,6 @@
#include <asm/traps.h>
#include "sgx.h"
-enum sgx_encls_function {
- ECREATE = 0x00,
- EADD = 0x01,
- EINIT = 0x02,
- EREMOVE = 0x03,
- EDGBRD = 0x04,
- EDGBWR = 0x05,
- EEXTEND = 0x06,
- ELDU = 0x08,
- EBLOCK = 0x09,
- EPA = 0x0A,
- EWB = 0x0B,
- ETRACK = 0x0C,
-};
-
/**
* ENCLS_FAULT_FLAG - flag signifying an ENCLS return code is a trapnr
*
@@ -55,6 +40,19 @@ enum sgx_encls_function {
} while (0); \
}
+/*
+ * encls_faulted() - Check if an ENCLS leaf faulted given an error code
+ * @ret: the return value of an ENCLS leaf function call
+ *
+ * Return:
+ * - true: ENCLS leaf faulted.
+ * - false: Otherwise.
+ */
+static inline bool encls_faulted(int ret)
+{
+ return ret & ENCLS_FAULT_FLAG;
+}
+
/**
* encls_failed() - Check if an ENCLS function failed
* @ret: the return value of an ENCLS function call
@@ -65,7 +63,7 @@ enum sgx_encls_function {
*/
static inline bool encls_failed(int ret)
{
- if (ret & ENCLS_FAULT_FLAG)
+ if (encls_faulted(ret))
return ENCLS_TRAPNR(ret) != X86_TRAP_PF;
return !!ret;
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
index 90a5caf76939..83df20e3e633 100644
--- a/arch/x86/kernel/cpu/sgx/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -2,6 +2,7 @@
/* Copyright(c) 2016-20 Intel Corporation. */
#include <asm/mman.h>
+#include <asm/sgx.h>
#include <linux/mman.h>
#include <linux/delay.h>
#include <linux/file.h>
@@ -47,7 +48,7 @@ static void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page)
encl->page_cnt--;
if (va_page) {
- sgx_free_epc_page(va_page->epc_page);
+ sgx_encl_free_epc_page(va_page->epc_page);
list_del(&va_page->list);
kfree(va_page);
}
@@ -117,7 +118,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs)
return 0;
err_out:
- sgx_free_epc_page(encl->secs.epc_page);
+ sgx_encl_free_epc_page(encl->secs.epc_page);
encl->secs.epc_page = NULL;
err_out_backing:
@@ -365,7 +366,7 @@ err_out_unlock:
mmap_read_unlock(current->mm);
err_out_free:
- sgx_free_epc_page(epc_page);
+ sgx_encl_free_epc_page(epc_page);
kfree(encl_page);
return ret;
@@ -495,7 +496,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
void *token)
{
u64 mrsigner[4];
- int i, j, k;
+ int i, j;
void *addr;
int ret;
@@ -544,8 +545,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
preempt_disable();
- for (k = 0; k < 4; k++)
- wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + k, mrsigner[k]);
+ sgx_update_lepubkeyhash(mrsigner);
ret = __einit(sigstruct, token, addr);
@@ -568,7 +568,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
}
}
- if (ret & ENCLS_FAULT_FLAG) {
+ if (encls_faulted(ret)) {
if (encls_failed(ret))
ENCLS_WARN(ret, "EINIT");
@@ -604,7 +604,6 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg)
{
struct sgx_sigstruct *sigstruct;
struct sgx_enclave_init init_arg;
- struct page *initp_page;
void *token;
int ret;
@@ -615,11 +614,15 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg)
if (copy_from_user(&init_arg, arg, sizeof(init_arg)))
return -EFAULT;
- initp_page = alloc_page(GFP_KERNEL);
- if (!initp_page)
+ /*
+ * 'sigstruct' must be on a page boundary and 'token' on a 512 byte
+ * boundary. kmalloc() will give this alignment when allocating
+ * PAGE_SIZE bytes.
+ */
+ sigstruct = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!sigstruct)
return -ENOMEM;
- sigstruct = kmap(initp_page);
token = (void *)((unsigned long)sigstruct + PAGE_SIZE / 2);
memset(token, 0, SGX_LAUNCH_TOKEN_SIZE);
@@ -645,8 +648,7 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg)
ret = sgx_encl_init(encl, sigstruct, token);
out:
- kunmap(initp_page);
- __free_page(initp_page);
+ kfree(sigstruct);
return ret;
}
@@ -665,24 +667,11 @@ out:
static long sgx_ioc_enclave_provision(struct sgx_encl *encl, void __user *arg)
{
struct sgx_enclave_provision params;
- struct file *file;
if (copy_from_user(&params, arg, sizeof(params)))
return -EFAULT;
- file = fget(params.fd);
- if (!file)
- return -EINVAL;
-
- if (file->f_op != &sgx_provision_fops) {
- fput(file);
- return -EINVAL;
- }
-
- encl->attributes_mask |= SGX_ATTR_PROVISIONKEY;
-
- fput(file);
- return 0;
+ return sgx_set_attribute(&encl->attributes_mask, params.fd);
}
long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 9ea55fdaf322..63d3de02bbcc 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -1,14 +1,17 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2016-20 Intel Corporation. */
+#include <linux/file.h>
#include <linux/freezer.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
+#include <linux/miscdevice.h>
#include <linux/pagemap.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
+#include <asm/sgx.h>
#include "driver.h"
#include "encl.h"
#include "encls.h"
@@ -23,42 +26,58 @@ static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
* with sgx_reclaimer_lock acquired.
*/
static LIST_HEAD(sgx_active_page_list);
-
static DEFINE_SPINLOCK(sgx_reclaimer_lock);
+/* The free page list lock protected variables prepend the lock. */
+static unsigned long sgx_nr_free_pages;
+
+/* Nodes with one or more EPC sections. */
+static nodemask_t sgx_numa_mask;
+
+/*
+ * Array with one list_head for each possible NUMA node. Each
+ * list contains all the sgx_epc_section's which are on that
+ * node.
+ */
+static struct sgx_numa_node *sgx_numa_nodes;
+
+static LIST_HEAD(sgx_dirty_page_list);
+
/*
- * Reset dirty EPC pages to uninitialized state. Laundry can be left with SECS
- * pages whose child pages blocked EREMOVE.
+ * Reset post-kexec EPC pages to the uninitialized state. The pages are removed
+ * from the input list, and made available for the page allocator. SECS pages
+ * prepending their children in the input list are left intact.
*/
-static void sgx_sanitize_section(struct sgx_epc_section *section)
+static void __sgx_sanitize_pages(struct list_head *dirty_page_list)
{
struct sgx_epc_page *page;
LIST_HEAD(dirty);
int ret;
- /* init_laundry_list is thread-local, no need for a lock: */
- while (!list_empty(&section->init_laundry_list)) {
+ /* dirty_page_list is thread-local, no need for a lock: */
+ while (!list_empty(dirty_page_list)) {
if (kthread_should_stop())
return;
- /* needed for access to ->page_list: */
- spin_lock(&section->lock);
-
- page = list_first_entry(&section->init_laundry_list,
- struct sgx_epc_page, list);
+ page = list_first_entry(dirty_page_list, struct sgx_epc_page, list);
ret = __eremove(sgx_get_epc_virt_addr(page));
- if (!ret)
- list_move(&page->list, &section->page_list);
- else
+ if (!ret) {
+ /*
+ * page is now sanitized. Make it available via the SGX
+ * page allocator:
+ */
+ list_del(&page->list);
+ sgx_free_epc_page(page);
+ } else {
+ /* The page is not yet clean - move to the dirty list. */
list_move_tail(&page->list, &dirty);
-
- spin_unlock(&section->lock);
+ }
cond_resched();
}
- list_splice(&dirty, &section->init_laundry_list);
+ list_splice(&dirty, dirty_page_list);
}
static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page)
@@ -278,7 +297,7 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
sgx_encl_ewb(encl->secs.epc_page, &secs_backing);
- sgx_free_epc_page(encl->secs.epc_page);
+ sgx_encl_free_epc_page(encl->secs.epc_page);
encl->secs.epc_page = NULL;
sgx_encl_put_backing(&secs_backing, true);
@@ -308,6 +327,7 @@ static void sgx_reclaim_pages(void)
struct sgx_epc_section *section;
struct sgx_encl_page *encl_page;
struct sgx_epc_page *epc_page;
+ struct sgx_numa_node *node;
pgoff_t page_index;
int cnt = 0;
int ret;
@@ -379,50 +399,33 @@ skip:
epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
section = &sgx_epc_sections[epc_page->section];
- spin_lock(&section->lock);
- list_add_tail(&epc_page->list, &section->page_list);
- section->free_cnt++;
- spin_unlock(&section->lock);
- }
-}
-
-static unsigned long sgx_nr_free_pages(void)
-{
- unsigned long cnt = 0;
- int i;
-
- for (i = 0; i < sgx_nr_epc_sections; i++)
- cnt += sgx_epc_sections[i].free_cnt;
+ node = section->node;
- return cnt;
+ spin_lock(&node->lock);
+ list_add_tail(&epc_page->list, &node->free_page_list);
+ sgx_nr_free_pages++;
+ spin_unlock(&node->lock);
+ }
}
static bool sgx_should_reclaim(unsigned long watermark)
{
- return sgx_nr_free_pages() < watermark &&
- !list_empty(&sgx_active_page_list);
+ return sgx_nr_free_pages < watermark && !list_empty(&sgx_active_page_list);
}
static int ksgxd(void *p)
{
- int i;
-
set_freezable();
/*
* Sanitize pages in order to recover from kexec(). The 2nd pass is
* required for SECS pages, whose child pages blocked EREMOVE.
*/
- for (i = 0; i < sgx_nr_epc_sections; i++)
- sgx_sanitize_section(&sgx_epc_sections[i]);
-
- for (i = 0; i < sgx_nr_epc_sections; i++) {
- sgx_sanitize_section(&sgx_epc_sections[i]);
+ __sgx_sanitize_pages(&sgx_dirty_page_list);
+ __sgx_sanitize_pages(&sgx_dirty_page_list);
- /* Should never happen. */
- if (!list_empty(&sgx_epc_sections[i].init_laundry_list))
- WARN(1, "EPC section %d has unsanitized pages.\n", i);
- }
+ /* sanity check: */
+ WARN_ON(!list_empty(&sgx_dirty_page_list));
while (!kthread_should_stop()) {
if (try_to_freeze())
@@ -454,45 +457,56 @@ static bool __init sgx_page_reclaimer_init(void)
return true;
}
-static struct sgx_epc_page *__sgx_alloc_epc_page_from_section(struct sgx_epc_section *section)
+static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
{
- struct sgx_epc_page *page;
+ struct sgx_numa_node *node = &sgx_numa_nodes[nid];
+ struct sgx_epc_page *page = NULL;
- spin_lock(&section->lock);
+ spin_lock(&node->lock);
- if (list_empty(&section->page_list)) {
- spin_unlock(&section->lock);
+ if (list_empty(&node->free_page_list)) {
+ spin_unlock(&node->lock);
return NULL;
}
- page = list_first_entry(&section->page_list, struct sgx_epc_page, list);
+ page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list);
list_del_init(&page->list);
- section->free_cnt--;
+ sgx_nr_free_pages--;
+
+ spin_unlock(&node->lock);
- spin_unlock(&section->lock);
return page;
}
/**
* __sgx_alloc_epc_page() - Allocate an EPC page
*
- * Iterate through EPC sections and borrow a free EPC page to the caller. When a
- * page is no longer needed it must be released with sgx_free_epc_page().
+ * Iterate through NUMA nodes and reserve ia free EPC page to the caller. Start
+ * from the NUMA node, where the caller is executing.
*
* Return:
- * an EPC page,
- * -errno on error
+ * - an EPC page: A borrowed EPC pages were available.
+ * - NULL: Out of EPC pages.
*/
struct sgx_epc_page *__sgx_alloc_epc_page(void)
{
- struct sgx_epc_section *section;
struct sgx_epc_page *page;
- int i;
+ int nid_of_current = numa_node_id();
+ int nid = nid_of_current;
- for (i = 0; i < sgx_nr_epc_sections; i++) {
- section = &sgx_epc_sections[i];
+ if (node_isset(nid_of_current, sgx_numa_mask)) {
+ page = __sgx_alloc_epc_page_from_node(nid_of_current);
+ if (page)
+ return page;
+ }
+
+ /* Fall back to the non-local NUMA nodes: */
+ while (true) {
+ nid = next_node_in(nid, sgx_numa_mask);
+ if (nid == nid_of_current)
+ break;
- page = __sgx_alloc_epc_page_from_section(section);
+ page = __sgx_alloc_epc_page_from_node(nid);
if (page)
return page;
}
@@ -598,23 +612,22 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
* sgx_free_epc_page() - Free an EPC page
* @page: an EPC page
*
- * Call EREMOVE for an EPC page and insert it back to the list of free pages.
+ * Put the EPC page back to the list of free pages. It's the caller's
+ * responsibility to make sure that the page is in uninitialized state. In other
+ * words, do EREMOVE, EWB or whatever operation is necessary before calling
+ * this function.
*/
void sgx_free_epc_page(struct sgx_epc_page *page)
{
struct sgx_epc_section *section = &sgx_epc_sections[page->section];
- int ret;
+ struct sgx_numa_node *node = section->node;
- WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);
+ spin_lock(&node->lock);
- ret = __eremove(sgx_get_epc_virt_addr(page));
- if (WARN_ONCE(ret, "EREMOVE returned %d (0x%x)", ret, ret))
- return;
+ list_add_tail(&page->list, &node->free_page_list);
+ sgx_nr_free_pages++;
- spin_lock(&section->lock);
- list_add_tail(&page->list, &section->page_list);
- section->free_cnt++;
- spin_unlock(&section->lock);
+ spin_unlock(&node->lock);
}
static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
@@ -635,18 +648,14 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
}
section->phys_addr = phys_addr;
- spin_lock_init(&section->lock);
- INIT_LIST_HEAD(&section->page_list);
- INIT_LIST_HEAD(&section->init_laundry_list);
for (i = 0; i < nr_pages; i++) {
section->pages[i].section = index;
section->pages[i].flags = 0;
section->pages[i].owner = NULL;
- list_add_tail(&section->pages[i].list, &section->init_laundry_list);
+ list_add_tail(&section->pages[i].list, &sgx_dirty_page_list);
}
- section->free_cnt = nr_pages;
return true;
}
@@ -665,8 +674,13 @@ static bool __init sgx_page_cache_init(void)
{
u32 eax, ebx, ecx, edx, type;
u64 pa, size;
+ int nid;
int i;
+ sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL);
+ if (!sgx_numa_nodes)
+ return false;
+
for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) {
cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx);
@@ -689,6 +703,21 @@ static bool __init sgx_page_cache_init(void)
break;
}
+ nid = numa_map_to_online_node(phys_to_target_node(pa));
+ if (nid == NUMA_NO_NODE) {
+ /* The physical address is already printed above. */
+ pr_warn(FW_BUG "Unable to map EPC section to online node. Fallback to the NUMA node 0.\n");
+ nid = 0;
+ }
+
+ if (!node_isset(nid, sgx_numa_mask)) {
+ spin_lock_init(&sgx_numa_nodes[nid].lock);
+ INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list);
+ node_set(nid, sgx_numa_mask);
+ }
+
+ sgx_epc_sections[i].node = &sgx_numa_nodes[nid];
+
sgx_nr_epc_sections++;
}
@@ -700,6 +729,67 @@ static bool __init sgx_page_cache_init(void)
return true;
}
+/*
+ * Update the SGX_LEPUBKEYHASH MSRs to the values specified by caller.
+ * Bare-metal driver requires to update them to hash of enclave's signer
+ * before EINIT. KVM needs to update them to guest's virtual MSR values
+ * before doing EINIT from guest.
+ */
+void sgx_update_lepubkeyhash(u64 *lepubkeyhash)
+{
+ int i;
+
+ WARN_ON_ONCE(preemptible());
+
+ for (i = 0; i < 4; i++)
+ wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]);
+}
+
+const struct file_operations sgx_provision_fops = {
+ .owner = THIS_MODULE,
+};
+
+static struct miscdevice sgx_dev_provision = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "sgx_provision",
+ .nodename = "sgx_provision",
+ .fops = &sgx_provision_fops,
+};
+
+/**
+ * sgx_set_attribute() - Update allowed attributes given file descriptor
+ * @allowed_attributes: Pointer to allowed enclave attributes
+ * @attribute_fd: File descriptor for specific attribute
+ *
+ * Append enclave attribute indicated by file descriptor to allowed
+ * attributes. Currently only SGX_ATTR_PROVISIONKEY indicated by
+ * /dev/sgx_provision is supported.
+ *
+ * Return:
+ * -0: SGX_ATTR_PROVISIONKEY is appended to allowed_attributes
+ * -EINVAL: Invalid, or not supported file descriptor
+ */
+int sgx_set_attribute(unsigned long *allowed_attributes,
+ unsigned int attribute_fd)
+{
+ struct file *file;
+
+ file = fget(attribute_fd);
+ if (!file)
+ return -EINVAL;
+
+ if (file->f_op != &sgx_provision_fops) {
+ fput(file);
+ return -EINVAL;
+ }
+
+ *allowed_attributes |= SGX_ATTR_PROVISIONKEY;
+
+ fput(file);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sgx_set_attribute);
+
static int __init sgx_init(void)
{
int ret;
@@ -716,12 +806,28 @@ static int __init sgx_init(void)
goto err_page_cache;
}
- ret = sgx_drv_init();
+ ret = misc_register(&sgx_dev_provision);
if (ret)
goto err_kthread;
+ /*
+ * Always try to initialize the native *and* KVM drivers.
+ * The KVM driver is less picky than the native one and
+ * can function if the native one is not supported on the
+ * current system or fails to initialize.
+ *
+ * Error out only if both fail to initialize.
+ */
+ ret = sgx_drv_init();
+
+ if (sgx_vepc_init() && ret)
+ goto err_provision;
+
return 0;
+err_provision:
+ misc_deregister(&sgx_dev_provision);
+
err_kthread:
kthread_stop(ksgxd_tsk);
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
index 5fa42d143feb..4628acec0009 100644
--- a/arch/x86/kernel/cpu/sgx/sgx.h
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -8,11 +8,15 @@
#include <linux/rwsem.h>
#include <linux/types.h>
#include <asm/asm.h>
-#include "arch.h"
+#include <asm/sgx.h>
#undef pr_fmt
#define pr_fmt(fmt) "sgx: " fmt
+#define EREMOVE_ERROR_MESSAGE \
+ "EREMOVE returned %d (0x%x) and an EPC page was leaked. SGX may become unusable. " \
+ "Refer to Documentation/x86/sgx.rst for more information."
+
#define SGX_MAX_EPC_SECTIONS 8
#define SGX_EEXTEND_BLOCK_SIZE 256
#define SGX_NR_TO_SCAN 16
@@ -30,28 +34,25 @@ struct sgx_epc_page {
};
/*
+ * Contains the tracking data for NUMA nodes having EPC pages. Most importantly,
+ * the free page list local to the node is stored here.
+ */
+struct sgx_numa_node {
+ struct list_head free_page_list;
+ spinlock_t lock;
+};
+
+/*
* The firmware can define multiple chunks of EPC to the different areas of the
* physical memory e.g. for memory areas of the each node. This structure is
* used to store EPC pages for one EPC section and virtual memory area where
* the pages have been mapped.
- *
- * 'lock' must be held before accessing 'page_list' or 'free_cnt'.
*/
struct sgx_epc_section {
unsigned long phys_addr;
void *virt_addr;
struct sgx_epc_page *pages;
-
- spinlock_t lock;
- struct list_head page_list;
- unsigned long free_cnt;
-
- /*
- * Pages which need EREMOVE run on them before they can be
- * used. Only safe to be accessed in ksgxd and init code.
- * Not protected by locks.
- */
- struct list_head init_laundry_list;
+ struct sgx_numa_node *node;
};
extern struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
@@ -83,4 +84,15 @@ void sgx_mark_page_reclaimable(struct sgx_epc_page *page);
int sgx_unmark_page_reclaimable(struct sgx_epc_page *page);
struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim);
+#ifdef CONFIG_X86_SGX_KVM
+int __init sgx_vepc_init(void);
+#else
+static inline int __init sgx_vepc_init(void)
+{
+ return -ENODEV;
+}
+#endif
+
+void sgx_update_lepubkeyhash(u64 *lepubkeyhash);
+
#endif /* _X86_SGX_H */
diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
new file mode 100644
index 000000000000..6ad165a5c0cc
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -0,0 +1,376 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Device driver to expose SGX enclave memory to KVM guests.
+ *
+ * Copyright(c) 2021 Intel Corporation.
+ */
+
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/slab.h>
+#include <linux/xarray.h>
+#include <asm/sgx.h>
+#include <uapi/asm/sgx.h>
+
+#include "encls.h"
+#include "sgx.h"
+
+struct sgx_vepc {
+ struct xarray page_array;
+ struct mutex lock;
+};
+
+/*
+ * Temporary SECS pages that cannot be EREMOVE'd due to having child in other
+ * virtual EPC instances, and the lock to protect it.
+ */
+static struct mutex zombie_secs_pages_lock;
+static struct list_head zombie_secs_pages;
+
+static int __sgx_vepc_fault(struct sgx_vepc *vepc,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ struct sgx_epc_page *epc_page;
+ unsigned long index, pfn;
+ int ret;
+
+ WARN_ON(!mutex_is_locked(&vepc->lock));
+
+ /* Calculate index of EPC page in virtual EPC's page_array */
+ index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start);
+
+ epc_page = xa_load(&vepc->page_array, index);
+ if (epc_page)
+ return 0;
+
+ epc_page = sgx_alloc_epc_page(vepc, false);
+ if (IS_ERR(epc_page))
+ return PTR_ERR(epc_page);
+
+ ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL));
+ if (ret)
+ goto err_free;
+
+ pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page));
+
+ ret = vmf_insert_pfn(vma, addr, pfn);
+ if (ret != VM_FAULT_NOPAGE) {
+ ret = -EFAULT;
+ goto err_delete;
+ }
+
+ return 0;
+
+err_delete:
+ xa_erase(&vepc->page_array, index);
+err_free:
+ sgx_free_epc_page(epc_page);
+ return ret;
+}
+
+static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct sgx_vepc *vepc = vma->vm_private_data;
+ int ret;
+
+ mutex_lock(&vepc->lock);
+ ret = __sgx_vepc_fault(vepc, vma, vmf->address);
+ mutex_unlock(&vepc->lock);
+
+ if (!ret)
+ return VM_FAULT_NOPAGE;
+
+ if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) {
+ mmap_read_unlock(vma->vm_mm);
+ return VM_FAULT_RETRY;
+ }
+
+ return VM_FAULT_SIGBUS;
+}
+
+static const struct vm_operations_struct sgx_vepc_vm_ops = {
+ .fault = sgx_vepc_fault,
+};
+
+static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct sgx_vepc *vepc = file->private_data;
+
+ if (!(vma->vm_flags & VM_SHARED))
+ return -EINVAL;
+
+ vma->vm_ops = &sgx_vepc_vm_ops;
+ /* Don't copy VMA in fork() */
+ vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY;
+ vma->vm_private_data = vepc;
+
+ return 0;
+}
+
+static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
+{
+ int ret;
+
+ /*
+ * Take a previously guest-owned EPC page and return it to the
+ * general EPC page pool.
+ *
+ * Guests can not be trusted to have left this page in a good
+ * state, so run EREMOVE on the page unconditionally. In the
+ * case that a guest properly EREMOVE'd this page, a superfluous
+ * EREMOVE is harmless.
+ */
+ ret = __eremove(sgx_get_epc_virt_addr(epc_page));
+ if (ret) {
+ /*
+ * Only SGX_CHILD_PRESENT is expected, which is because of
+ * EREMOVE'ing an SECS still with child, in which case it can
+ * be handled by EREMOVE'ing the SECS again after all pages in
+ * virtual EPC have been EREMOVE'd. See comments in below in
+ * sgx_vepc_release().
+ *
+ * The user of virtual EPC (KVM) needs to guarantee there's no
+ * logical processor is still running in the enclave in guest,
+ * otherwise EREMOVE will get SGX_ENCLAVE_ACT which cannot be
+ * handled here.
+ */
+ WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE,
+ ret, ret);
+ return ret;
+ }
+
+ sgx_free_epc_page(epc_page);
+
+ return 0;
+}
+
+static int sgx_vepc_release(struct inode *inode, struct file *file)
+{
+ struct sgx_vepc *vepc = file->private_data;
+ struct sgx_epc_page *epc_page, *tmp, *entry;
+ unsigned long index;
+
+ LIST_HEAD(secs_pages);
+
+ xa_for_each(&vepc->page_array, index, entry) {
+ /*
+ * Remove all normal, child pages. sgx_vepc_free_page()
+ * will fail if EREMOVE fails, but this is OK and expected on
+ * SECS pages. Those can only be EREMOVE'd *after* all their
+ * child pages. Retries below will clean them up.
+ */
+ if (sgx_vepc_free_page(entry))
+ continue;
+
+ xa_erase(&vepc->page_array, index);
+ }
+
+ /*
+ * Retry EREMOVE'ing pages. This will clean up any SECS pages that
+ * only had children in this 'epc' area.
+ */
+ xa_for_each(&vepc->page_array, index, entry) {
+ epc_page = entry;
+ /*
+ * An EREMOVE failure here means that the SECS page still
+ * has children. But, since all children in this 'sgx_vepc'
+ * have been removed, the SECS page must have a child on
+ * another instance.
+ */
+ if (sgx_vepc_free_page(epc_page))
+ list_add_tail(&epc_page->list, &secs_pages);
+
+ xa_erase(&vepc->page_array, index);
+ }
+
+ /*
+ * SECS pages are "pinned" by child pages, and "unpinned" once all
+ * children have been EREMOVE'd. A child page in this instance
+ * may have pinned an SECS page encountered in an earlier release(),
+ * creating a zombie. Since some children were EREMOVE'd above,
+ * try to EREMOVE all zombies in the hopes that one was unpinned.
+ */
+ mutex_lock(&zombie_secs_pages_lock);
+ list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) {
+ /*
+ * Speculatively remove the page from the list of zombies,
+ * if the page is successfully EREMOVE'd it will be added to
+ * the list of free pages. If EREMOVE fails, throw the page
+ * on the local list, which will be spliced on at the end.
+ */
+ list_del(&epc_page->list);
+
+ if (sgx_vepc_free_page(epc_page))
+ list_add_tail(&epc_page->list, &secs_pages);
+ }
+
+ if (!list_empty(&secs_pages))
+ list_splice_tail(&secs_pages, &zombie_secs_pages);
+ mutex_unlock(&zombie_secs_pages_lock);
+
+ kfree(vepc);
+
+ return 0;
+}
+
+static int sgx_vepc_open(struct inode *inode, struct file *file)
+{
+ struct sgx_vepc *vepc;
+
+ vepc = kzalloc(sizeof(struct sgx_vepc), GFP_KERNEL);
+ if (!vepc)
+ return -ENOMEM;
+ mutex_init(&vepc->lock);
+ xa_init(&vepc->page_array);
+
+ file->private_data = vepc;
+
+ return 0;
+}
+
+static const struct file_operations sgx_vepc_fops = {
+ .owner = THIS_MODULE,
+ .open = sgx_vepc_open,
+ .release = sgx_vepc_release,
+ .mmap = sgx_vepc_mmap,
+};
+
+static struct miscdevice sgx_vepc_dev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "sgx_vepc",
+ .nodename = "sgx_vepc",
+ .fops = &sgx_vepc_fops,
+};
+
+int __init sgx_vepc_init(void)
+{
+ /* SGX virtualization requires KVM to work */
+ if (!cpu_feature_enabled(X86_FEATURE_VMX))
+ return -ENODEV;
+
+ INIT_LIST_HEAD(&zombie_secs_pages);
+ mutex_init(&zombie_secs_pages_lock);
+
+ return misc_register(&sgx_vepc_dev);
+}
+
+/**
+ * sgx_virt_ecreate() - Run ECREATE on behalf of guest
+ * @pageinfo: Pointer to PAGEINFO structure
+ * @secs: Userspace pointer to SECS page
+ * @trapnr: trap number injected to guest in case of ECREATE error
+ *
+ * Run ECREATE on behalf of guest after KVM traps ECREATE for the purpose
+ * of enforcing policies of guest's enclaves, and return the trap number
+ * which should be injected to guest in case of any ECREATE error.
+ *
+ * Return:
+ * - 0: ECREATE was successful.
+ * - <0: on error.
+ */
+int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
+ int *trapnr)
+{
+ int ret;
+
+ /*
+ * @secs is an untrusted, userspace-provided address. It comes from
+ * KVM and is assumed to be a valid pointer which points somewhere in
+ * userspace. This can fault and call SGX or other fault handlers when
+ * userspace mapping @secs doesn't exist.
+ *
+ * Add a WARN() to make sure @secs is already valid userspace pointer
+ * from caller (KVM), who should already have handled invalid pointer
+ * case (for instance, made by malicious guest). All other checks,
+ * such as alignment of @secs, are deferred to ENCLS itself.
+ */
+ if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE)))
+ return -EINVAL;
+
+ __uaccess_begin();
+ ret = __ecreate(pageinfo, (void *)secs);
+ __uaccess_end();
+
+ if (encls_faulted(ret)) {
+ *trapnr = ENCLS_TRAPNR(ret);
+ return -EFAULT;
+ }
+
+ /* ECREATE doesn't return an error code, it faults or succeeds. */
+ WARN_ON_ONCE(ret);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sgx_virt_ecreate);
+
+static int __sgx_virt_einit(void __user *sigstruct, void __user *token,
+ void __user *secs)
+{
+ int ret;
+
+ /*
+ * Make sure all userspace pointers from caller (KVM) are valid.
+ * All other checks deferred to ENCLS itself. Also see comment
+ * for @secs in sgx_virt_ecreate().
+ */
+#define SGX_EINITTOKEN_SIZE 304
+ if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) ||
+ !access_ok(token, SGX_EINITTOKEN_SIZE) ||
+ !access_ok(secs, PAGE_SIZE)))
+ return -EINVAL;
+
+ __uaccess_begin();
+ ret = __einit((void *)sigstruct, (void *)token, (void *)secs);
+ __uaccess_end();
+
+ return ret;
+}
+
+/**
+ * sgx_virt_einit() - Run EINIT on behalf of guest
+ * @sigstruct: Userspace pointer to SIGSTRUCT structure
+ * @token: Userspace pointer to EINITTOKEN structure
+ * @secs: Userspace pointer to SECS page
+ * @lepubkeyhash: Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values
+ * @trapnr: trap number injected to guest in case of EINIT error
+ *
+ * Run EINIT on behalf of guest after KVM traps EINIT. If SGX_LC is available
+ * in host, SGX driver may rewrite the hardware values at wish, therefore KVM
+ * needs to update hardware values to guest's virtual MSR values in order to
+ * ensure EINIT is executed with expected hardware values.
+ *
+ * Return:
+ * - 0: EINIT was successful.
+ * - <0: on error.
+ */
+int sgx_virt_einit(void __user *sigstruct, void __user *token,
+ void __user *secs, u64 *lepubkeyhash, int *trapnr)
+{
+ int ret;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) {
+ ret = __sgx_virt_einit(sigstruct, token, secs);
+ } else {
+ preempt_disable();
+
+ sgx_update_lepubkeyhash(lepubkeyhash);
+
+ ret = __sgx_virt_einit(sigstruct, token, secs);
+ preempt_enable();
+ }
+
+ /* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */
+ if (ret == -EINVAL)
+ return ret;
+
+ if (encls_faulted(ret)) {
+ *trapnr = ENCLS_TRAPNR(ret);
+ return -EFAULT;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sgx_virt_einit);
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index c6ede3b3d302..c04b933f48d3 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -27,6 +27,7 @@
#include <linux/clocksource.h>
#include <linux/cpu.h>
#include <linux/reboot.h>
+#include <linux/static_call.h>
#include <asm/div64.h>
#include <asm/x86_init.h>
#include <asm/hypervisor.h>
@@ -336,11 +337,11 @@ static void __init vmware_paravirt_ops_setup(void)
vmware_cyc2ns_setup();
if (vmw_sched_clock)
- pv_ops.time.sched_clock = vmware_sched_clock;
+ paravirt_set_sched_clock(vmware_sched_clock);
if (vmware_is_stealclock_available()) {
has_steal_clock = true;
- pv_ops.time.steal_clock = vmware_steal_clock;
+ static_call_update(pv_steal_clock, vmware_steal_clock);
/* We use reboot notifier only to disable steal clock */
register_reboot_notifier(&vmware_pv_reboot_nb);
@@ -378,6 +379,8 @@ static void __init vmware_set_capabilities(void)
{
setup_force_cpu_cap(X86_FEATURE_CONSTANT_TSC);
setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+ if (vmware_tsc_khz)
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMCALL)
setup_force_cpu_cap(X86_FEATURE_VMCALL);
else if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMMCALL)
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index a8f3af257e26..b1deacbeb266 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -337,7 +337,7 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
struct crash_memmap_data cmd;
struct crash_mem *cmem;
- cmem = vzalloc(sizeof(struct crash_mem));
+ cmem = vzalloc(struct_size(cmem, ranges, 1));
if (!cmem)
return -ENOMEM;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 78bb0fae3982..172c947240b9 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -650,7 +650,7 @@ static void __init kvm_guest_init(void)
if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
has_steal_clock = 1;
- pv_ops.time.steal_clock = kvm_steal_clock;
+ static_call_update(pv_steal_clock, kvm_steal_clock);
}
if (pv_tlb_flush_supported()) {
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 1fc0962c89c0..d37ed4e1d033 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -106,7 +106,7 @@ static inline void kvm_sched_clock_init(bool stable)
if (!stable)
clear_sched_clock_stable();
kvm_sched_clock_offset = kvm_clock_read();
- pv_ops.time.sched_clock = kvm_sched_clock_read;
+ paravirt_set_sched_clock(kvm_sched_clock_read);
pr_info("kvm-clock: using sched offset of %llu cycles",
kvm_sched_clock_offset);
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 4f75d0cf6305..9e1ea99ad9df 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -32,3 +32,12 @@ bool pv_is_native_vcpu_is_preempted(void)
return pv_ops.lock.vcpu_is_preempted.func ==
__raw_callee_save___native_vcpu_is_preempted;
}
+
+void __init paravirt_set_cap(void)
+{
+ if (!pv_is_native_spin_unlock())
+ setup_force_cpu_cap(X86_FEATURE_PVUNLOCK);
+
+ if (!pv_is_native_vcpu_is_preempted())
+ setup_force_cpu_cap(X86_FEATURE_VCPUPREEMPT);
+}
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index c60222ab8ab9..d0730264786b 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -14,6 +14,7 @@
#include <linux/highmem.h>
#include <linux/kprobes.h>
#include <linux/pgtable.h>
+#include <linux/static_call.h>
#include <asm/bug.h>
#include <asm/paravirt.h>
@@ -52,7 +53,10 @@ void __init default_banner(void)
}
/* Undefined instruction for dealing with missing ops pointers. */
-static const unsigned char ud2a[] = { 0x0f, 0x0b };
+static void paravirt_BUG(void)
+{
+ BUG();
+}
struct branch {
unsigned char opcode;
@@ -85,25 +89,6 @@ u64 notrace _paravirt_ident_64(u64 x)
{
return x;
}
-
-static unsigned paravirt_patch_jmp(void *insn_buff, const void *target,
- unsigned long addr, unsigned len)
-{
- struct branch *b = insn_buff;
- unsigned long delta = (unsigned long)target - (addr+5);
-
- if (len < 5) {
-#ifdef CONFIG_RETPOLINE
- WARN_ONCE(1, "Failing to patch indirect JMP in %ps\n", (void *)addr);
-#endif
- return len; /* call too long for patch site */
- }
-
- b->opcode = 0xe9; /* jmp */
- b->delta = delta;
-
- return 5;
-}
#endif
DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key);
@@ -114,8 +99,8 @@ void __init native_pv_lock_init(void)
static_branch_disable(&virt_spin_lock_key);
}
-unsigned paravirt_patch_default(u8 type, void *insn_buff,
- unsigned long addr, unsigned len)
+unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr,
+ unsigned int len)
{
/*
* Neat trick to map patch type back to the call within the
@@ -125,20 +110,10 @@ unsigned paravirt_patch_default(u8 type, void *insn_buff,
unsigned ret;
if (opfunc == NULL)
- /* If there's no function, patch it with a ud2a (BUG) */
- ret = paravirt_patch_insns(insn_buff, len, ud2a, ud2a+sizeof(ud2a));
+ /* If there's no function, patch it with paravirt_BUG() */
+ ret = paravirt_patch_call(insn_buff, paravirt_BUG, addr, len);
else if (opfunc == _paravirt_nop)
ret = 0;
-
-#ifdef CONFIG_PARAVIRT_XXL
- /* identity functions just return their single argument */
- else if (opfunc == _paravirt_ident_64)
- ret = paravirt_patch_ident_64(insn_buff, len);
-
- else if (type == PARAVIRT_PATCH(cpu.iret))
- /* If operation requires a jmp, then jmp */
- ret = paravirt_patch_jmp(insn_buff, opfunc, addr, len);
-#endif
else
/* Otherwise call the function. */
ret = paravirt_patch_call(insn_buff, opfunc, addr, len);
@@ -146,19 +121,6 @@ unsigned paravirt_patch_default(u8 type, void *insn_buff,
return ret;
}
-unsigned paravirt_patch_insns(void *insn_buff, unsigned len,
- const char *start, const char *end)
-{
- unsigned insn_len = end - start;
-
- /* Alternative instruction is too large for the patch site and we cannot continue: */
- BUG_ON(insn_len > len || start == NULL);
-
- memcpy(insn_buff, start, insn_len);
-
- return insn_len;
-}
-
struct static_key paravirt_steal_enabled;
struct static_key paravirt_steal_rq_enabled;
@@ -167,6 +129,14 @@ static u64 native_steal_clock(int cpu)
return 0;
}
+DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock);
+DEFINE_STATIC_CALL(pv_sched_clock, native_sched_clock);
+
+void paravirt_set_sched_clock(u64 (*func)(void))
+{
+ static_call_update(pv_sched_clock, func);
+}
+
/* These are in entry.S */
extern void native_iret(void);
@@ -269,13 +239,6 @@ struct pv_info pv_info = {
#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64)
struct paravirt_patch_template pv_ops = {
- /* Init ops. */
- .init.patch = native_patch,
-
- /* Time ops. */
- .time.sched_clock = native_sched_clock,
- .time.steal_clock = native_steal_clock,
-
/* Cpu ops. */
.cpu.io_delay = native_io_delay,
@@ -308,8 +271,6 @@ struct paravirt_patch_template pv_ops = {
.cpu.load_sp0 = native_load_sp0,
- .cpu.iret = native_iret,
-
#ifdef CONFIG_X86_IOPL_IOPERM
.cpu.invalidate_io_bitmap = native_tss_invalidate_io_bitmap,
.cpu.update_io_bitmap = native_tss_update_io_bitmap,
@@ -414,6 +375,8 @@ struct paravirt_patch_template pv_ops = {
NOKPROBE_SYMBOL(native_get_debugreg);
NOKPROBE_SYMBOL(native_set_debugreg);
NOKPROBE_SYMBOL(native_load_idt);
+
+void (*paravirt_iret)(void) = native_iret;
#endif
EXPORT_SYMBOL(pv_ops);
diff --git a/arch/x86/kernel/paravirt_patch.c b/arch/x86/kernel/paravirt_patch.c
deleted file mode 100644
index abd27ec67397..000000000000
--- a/arch/x86/kernel/paravirt_patch.c
+++ /dev/null
@@ -1,99 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/stringify.h>
-
-#include <asm/paravirt.h>
-#include <asm/asm-offsets.h>
-
-#define PSTART(d, m) \
- patch_data_##d.m
-
-#define PEND(d, m) \
- (PSTART(d, m) + sizeof(patch_data_##d.m))
-
-#define PATCH(d, m, insn_buff, len) \
- paravirt_patch_insns(insn_buff, len, PSTART(d, m), PEND(d, m))
-
-#define PATCH_CASE(ops, m, data, insn_buff, len) \
- case PARAVIRT_PATCH(ops.m): \
- return PATCH(data, ops##_##m, insn_buff, len)
-
-#ifdef CONFIG_PARAVIRT_XXL
-struct patch_xxl {
- const unsigned char irq_irq_disable[1];
- const unsigned char irq_irq_enable[1];
- const unsigned char irq_save_fl[2];
- const unsigned char mmu_read_cr2[3];
- const unsigned char mmu_read_cr3[3];
- const unsigned char mmu_write_cr3[3];
- const unsigned char cpu_wbinvd[2];
- const unsigned char mov64[3];
-};
-
-static const struct patch_xxl patch_data_xxl = {
- .irq_irq_disable = { 0xfa }, // cli
- .irq_irq_enable = { 0xfb }, // sti
- .irq_save_fl = { 0x9c, 0x58 }, // pushf; pop %[re]ax
- .mmu_read_cr2 = { 0x0f, 0x20, 0xd0 }, // mov %cr2, %[re]ax
- .mmu_read_cr3 = { 0x0f, 0x20, 0xd8 }, // mov %cr3, %[re]ax
- .mmu_write_cr3 = { 0x0f, 0x22, 0xdf }, // mov %rdi, %cr3
- .cpu_wbinvd = { 0x0f, 0x09 }, // wbinvd
- .mov64 = { 0x48, 0x89, 0xf8 }, // mov %rdi, %rax
-};
-
-unsigned int paravirt_patch_ident_64(void *insn_buff, unsigned int len)
-{
- return PATCH(xxl, mov64, insn_buff, len);
-}
-# endif /* CONFIG_PARAVIRT_XXL */
-
-#ifdef CONFIG_PARAVIRT_SPINLOCKS
-struct patch_lock {
- unsigned char queued_spin_unlock[3];
- unsigned char vcpu_is_preempted[2];
-};
-
-static const struct patch_lock patch_data_lock = {
- .vcpu_is_preempted = { 0x31, 0xc0 }, // xor %eax, %eax
-
-# ifdef CONFIG_X86_64
- .queued_spin_unlock = { 0xc6, 0x07, 0x00 }, // movb $0, (%rdi)
-# else
- .queued_spin_unlock = { 0xc6, 0x00, 0x00 }, // movb $0, (%eax)
-# endif
-};
-#endif /* CONFIG_PARAVIRT_SPINLOCKS */
-
-unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr,
- unsigned int len)
-{
- switch (type) {
-
-#ifdef CONFIG_PARAVIRT_XXL
- PATCH_CASE(irq, save_fl, xxl, insn_buff, len);
- PATCH_CASE(irq, irq_enable, xxl, insn_buff, len);
- PATCH_CASE(irq, irq_disable, xxl, insn_buff, len);
-
- PATCH_CASE(mmu, read_cr2, xxl, insn_buff, len);
- PATCH_CASE(mmu, read_cr3, xxl, insn_buff, len);
- PATCH_CASE(mmu, write_cr3, xxl, insn_buff, len);
-
- PATCH_CASE(cpu, wbinvd, xxl, insn_buff, len);
-#endif
-
-#ifdef CONFIG_PARAVIRT_SPINLOCKS
- case PARAVIRT_PATCH(lock.queued_spin_unlock):
- if (pv_is_native_spin_unlock())
- return PATCH(lock, queued_spin_unlock, insn_buff, len);
- break;
-
- case PARAVIRT_PATCH(lock.vcpu_is_preempted):
- if (pv_is_native_vcpu_is_preempted())
- return PATCH(lock, vcpu_is_preempted, insn_buff, len);
- break;
-#endif
- default:
- break;
- }
-
- return paravirt_patch_default(type, insn_buff, addr, len);
-}
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index cdf7bbdc07e2..69757fac7462 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -633,11 +633,16 @@ static void __init trim_snb_memory(void)
printk(KERN_DEBUG "reserving inaccessible SNB gfx pages\n");
/*
- * Reserve all memory below the 1 MB mark that has not
- * already been reserved.
+ * SandyBridge integrated graphics devices have a bug that prevents
+ * them from accessing certain memory ranges, namely anything below
+ * 1M and in the pages listed in bad_pages[] above.
+ *
+ * To avoid these pages being ever accessed by SNB gfx devices
+ * reserve all memory below the 1 MB mark and bad_pages that have
+ * not already been reserved at boot time.
*/
memblock_reserve(0, 1<<20);
-
+
for (i = 0; i < ARRAY_SIZE(bad_pages); i++) {
if (memblock_reserve(bad_pages[i], PAGE_SIZE))
printk(KERN_WARNING "failed to reserve 0x%08lx\n",
@@ -645,18 +650,6 @@ static void __init trim_snb_memory(void)
}
}
-/*
- * Here we put platform-specific memory range workarounds, i.e.
- * memory known to be corrupt or otherwise in need to be reserved on
- * specific platforms.
- *
- * If this gets used more widely it could use a real dispatch mechanism.
- */
-static void __init trim_platform_memory_ranges(void)
-{
- trim_snb_memory();
-}
-
static void __init trim_bios_range(void)
{
/*
@@ -725,11 +718,41 @@ static int __init parse_reservelow(char *p)
early_param("reservelow", parse_reservelow);
-static void __init trim_low_memory_range(void)
+static void __init early_reserve_memory(void)
{
+ /*
+ * Reserve the memory occupied by the kernel between _text and
+ * __end_of_kernel_reserve symbols. Any kernel sections after the
+ * __end_of_kernel_reserve symbol must be explicitly reserved with a
+ * separate memblock_reserve() or they will be discarded.
+ */
+ memblock_reserve(__pa_symbol(_text),
+ (unsigned long)__end_of_kernel_reserve - (unsigned long)_text);
+
+ /*
+ * The first 4Kb of memory is a BIOS owned area, but generally it is
+ * not listed as such in the E820 table.
+ *
+ * Reserve the first memory page and typically some additional
+ * memory (64KiB by default) since some BIOSes are known to corrupt
+ * low memory. See the Kconfig help text for X86_RESERVE_LOW.
+ *
+ * In addition, make sure page 0 is always reserved because on
+ * systems with L1TF its contents can be leaked to user processes.
+ */
memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE));
+
+ early_reserve_initrd();
+
+ if (efi_enabled(EFI_BOOT))
+ efi_memblock_x86_reserve_range();
+
+ memblock_x86_reserve_range_setup_data();
+
+ reserve_ibft_region();
+ reserve_bios_regions();
}
-
+
/*
* Dump out kernel offset information on panic.
*/
@@ -764,29 +787,6 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
void __init setup_arch(char **cmdline_p)
{
- /*
- * Reserve the memory occupied by the kernel between _text and
- * __end_of_kernel_reserve symbols. Any kernel sections after the
- * __end_of_kernel_reserve symbol must be explicitly reserved with a
- * separate memblock_reserve() or they will be discarded.
- */
- memblock_reserve(__pa_symbol(_text),
- (unsigned long)__end_of_kernel_reserve - (unsigned long)_text);
-
- /*
- * Make sure page 0 is always reserved because on systems with
- * L1TF its contents can be leaked to user processes.
- */
- memblock_reserve(0, PAGE_SIZE);
-
- early_reserve_initrd();
-
- /*
- * At this point everything still needed from the boot loader
- * or BIOS or kernel text should be early reserved or marked not
- * RAM in e820. All other memory is free game.
- */
-
#ifdef CONFIG_X86_32
memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
@@ -910,8 +910,18 @@ void __init setup_arch(char **cmdline_p)
parse_early_param();
- if (efi_enabled(EFI_BOOT))
- efi_memblock_x86_reserve_range();
+ /*
+ * Do some memory reservations *before* memory is added to
+ * memblock, so memblock allocations won't overwrite it.
+ * Do it after early param, so we could get (unlikely) panic from
+ * serial.
+ *
+ * After this point everything still needed from the boot loader or
+ * firmware or kernel text should be early reserved or marked not
+ * RAM in e820. All other memory is free game.
+ */
+ early_reserve_memory();
+
#ifdef CONFIG_MEMORY_HOTPLUG
/*
* Memory used by the kernel cannot be hot-removed because Linux
@@ -938,9 +948,6 @@ void __init setup_arch(char **cmdline_p)
x86_report_nx();
- /* after early param, so could get panic from serial */
- memblock_x86_reserve_range_setup_data();
-
if (acpi_mps_check()) {
#ifdef CONFIG_X86_LOCAL_APIC
disable_apic = 1;
@@ -1032,8 +1039,6 @@ void __init setup_arch(char **cmdline_p)
*/
find_smp_config();
- reserve_ibft_region();
-
early_alloc_pgt_buf();
/*
@@ -1054,8 +1059,6 @@ void __init setup_arch(char **cmdline_p)
*/
sev_setup_arch();
- reserve_bios_regions();
-
efi_fake_memmap();
efi_find_mirror();
efi_esrt_init();
@@ -1081,8 +1084,12 @@ void __init setup_arch(char **cmdline_p)
reserve_real_mode();
- trim_platform_memory_ranges();
- trim_low_memory_range();
+ /*
+ * Reserving memory causing GPU hangs on Sandy Bridge integrated
+ * graphics devices should be done after we allocated memory under
+ * 1M for the real mode trampoline.
+ */
+ trim_snb_memory();
init_mem_mapping();
@@ -1129,6 +1136,8 @@ void __init setup_arch(char **cmdline_p)
reserve_initrd();
acpi_table_upgrade();
+ /* Look for ACPI tables and reserve memory occupied by them. */
+ acpi_boot_table_init();
vsmp_init();
@@ -1136,11 +1145,6 @@ void __init setup_arch(char **cmdline_p)
early_platform_quirks();
- /*
- * Parse the ACPI tables for possible boot-time SMP configuration.
- */
- acpi_boot_table_init();
-
early_acpi_boot_init();
initmem_init();
diff --git a/arch/x86/kernel/sev-es-shared.c b/arch/x86/kernel/sev-es-shared.c
index cdc04d091242..0aa9f13efd57 100644
--- a/arch/x86/kernel/sev-es-shared.c
+++ b/arch/x86/kernel/sev-es-shared.c
@@ -24,7 +24,7 @@ static bool __init sev_es_check_cpu_features(void)
return true;
}
-static void sev_es_terminate(unsigned int reason)
+static void __noreturn sev_es_terminate(unsigned int reason)
{
u64 val = GHCB_SEV_TERMINATE;
@@ -186,7 +186,6 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
* make it accessible to the hypervisor.
*
* In particular, check for:
- * - Hypervisor CPUID bit
* - Availability of CPUID leaf 0x8000001f
* - SEV CPUID bit.
*
@@ -194,10 +193,7 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
* can't be checked here.
*/
- if ((fn == 1 && !(regs->cx & BIT(31))))
- /* Hypervisor bit */
- goto fail;
- else if (fn == 0x80000000 && (regs->ax < 0x8000001f))
+ if (fn == 0x80000000 && (regs->ax < 0x8000001f))
/* SEV leaf check */
goto fail;
else if ((fn == 0x8000001f && !(regs->ax & BIT(1))))
@@ -210,12 +206,8 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
return;
fail:
- sev_es_wr_ghcb_msr(GHCB_SEV_TERMINATE);
- VMGEXIT();
-
- /* Shouldn't get here - if we do halt the machine */
- while (true)
- asm volatile("hlt\n");
+ /* Terminate the guest */
+ sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
}
static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt,
diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c
index 04a780abb512..26f5479a97a8 100644
--- a/arch/x86/kernel/sev-es.c
+++ b/arch/x86/kernel/sev-es.c
@@ -137,29 +137,41 @@ static __always_inline bool on_vc_stack(struct pt_regs *regs)
}
/*
- * This function handles the case when an NMI is raised in the #VC exception
- * handler entry code. In this case, the IST entry for #VC must be adjusted, so
- * that any subsequent #VC exception will not overwrite the stack contents of the
- * interrupted #VC handler.
+ * This function handles the case when an NMI is raised in the #VC
+ * exception handler entry code, before the #VC handler has switched off
+ * its IST stack. In this case, the IST entry for #VC must be adjusted,
+ * so that any nested #VC exception will not overwrite the stack
+ * contents of the interrupted #VC handler.
*
* The IST entry is adjusted unconditionally so that it can be also be
- * unconditionally adjusted back in sev_es_ist_exit(). Otherwise a nested
- * sev_es_ist_exit() call may adjust back the IST entry too early.
+ * unconditionally adjusted back in __sev_es_ist_exit(). Otherwise a
+ * nested sev_es_ist_exit() call may adjust back the IST entry too
+ * early.
+ *
+ * The __sev_es_ist_enter() and __sev_es_ist_exit() functions always run
+ * on the NMI IST stack, as they are only called from NMI handling code
+ * right now.
*/
void noinstr __sev_es_ist_enter(struct pt_regs *regs)
{
unsigned long old_ist, new_ist;
/* Read old IST entry */
- old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
+ new_ist = old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
- /* Make room on the IST stack */
+ /*
+ * If NMI happened while on the #VC IST stack, set the new IST
+ * value below regs->sp, so that the interrupted stack frame is
+ * not overwritten by subsequent #VC exceptions.
+ */
if (on_vc_stack(regs))
- new_ist = ALIGN_DOWN(regs->sp, 8) - sizeof(old_ist);
- else
- new_ist = old_ist - sizeof(old_ist);
+ new_ist = regs->sp;
- /* Store old IST entry */
+ /*
+ * Reserve additional 8 bytes and store old IST value so this
+ * adjustment can be unrolled in __sev_es_ist_exit().
+ */
+ new_ist -= sizeof(old_ist);
*(unsigned long *)new_ist = old_ist;
/* Set new IST entry */
@@ -277,7 +289,7 @@ static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
return ES_EXCEPTION;
}
- insn_init(&ctxt->insn, buffer, MAX_INSN_SIZE - res, 1);
+ insn_init(&ctxt->insn, buffer, MAX_INSN_SIZE, 1);
insn_get_length(&ctxt->insn);
}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 2406c7fb05ab..1e2050c4f94a 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1659,13 +1659,17 @@ void play_dead_common(void)
local_irq_disable();
}
-static bool wakeup_cpu0(void)
+/**
+ * cond_wakeup_cpu0 - Wake up CPU0 if needed.
+ *
+ * If NMI wants to wake up CPU0, start CPU0.
+ */
+void cond_wakeup_cpu0(void)
{
if (smp_processor_id() == 0 && enable_start_cpu0)
- return true;
-
- return false;
+ start_cpu0();
}
+EXPORT_SYMBOL_GPL(cond_wakeup_cpu0);
/*
* We need to flush the caches before going to sleep, lest we have
@@ -1734,11 +1738,8 @@ static inline void mwait_play_dead(void)
__monitor(mwait_ptr, 0, 0);
mb();
__mwait(eax, 0);
- /*
- * If NMI wants to wake up CPU0, start CPU0.
- */
- if (wakeup_cpu0())
- start_cpu0();
+
+ cond_wakeup_cpu0();
}
}
@@ -1749,11 +1750,8 @@ void hlt_play_dead(void)
while (1) {
native_halt();
- /*
- * If NMI wants to wake up CPU0, start CPU0.
- */
- if (wakeup_cpu0())
- start_cpu0();
+
+ cond_wakeup_cpu0();
}
}
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 4c09ba110204..f9af561c3cd4 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -49,6 +49,30 @@ bool tboot_enabled(void)
return tboot != NULL;
}
+/* noinline to prevent gcc from warning about dereferencing constant fixaddr */
+static noinline __init bool check_tboot_version(void)
+{
+ if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) {
+ pr_warn("tboot at 0x%llx is invalid\n", boot_params.tboot_addr);
+ return false;
+ }
+
+ if (tboot->version < 5) {
+ pr_warn("tboot version is invalid: %u\n", tboot->version);
+ return false;
+ }
+
+ pr_info("found shared page at phys addr 0x%llx:\n",
+ boot_params.tboot_addr);
+ pr_debug("version: %d\n", tboot->version);
+ pr_debug("log_addr: 0x%08x\n", tboot->log_addr);
+ pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry);
+ pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base);
+ pr_debug("tboot_size: 0x%x\n", tboot->tboot_size);
+
+ return true;
+}
+
void __init tboot_probe(void)
{
/* Look for valid page-aligned address for shared page. */
@@ -66,25 +90,9 @@ void __init tboot_probe(void)
/* Map and check for tboot UUID. */
set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr);
- tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE);
- if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) {
- pr_warn("tboot at 0x%llx is invalid\n", boot_params.tboot_addr);
+ tboot = (void *)fix_to_virt(FIX_TBOOT_BASE);
+ if (!check_tboot_version())
tboot = NULL;
- return;
- }
- if (tboot->version < 5) {
- pr_warn("tboot version is invalid: %u\n", tboot->version);
- tboot = NULL;
- return;
- }
-
- pr_info("found shared page at phys addr 0x%llx:\n",
- boot_params.tboot_addr);
- pr_debug("version: %d\n", tboot->version);
- pr_debug("log_addr: 0x%08x\n", tboot->log_addr);
- pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry);
- pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base);
- pr_debug("tboot_size: 0x%x\n", tboot->tboot_size);
}
static pgd_t *tboot_pg_dir;
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 9a1b23cb776f..48881db8dca1 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -556,7 +556,7 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
tsk->thread.trap_nr = X86_TRAP_GP;
if (fixup_vdso_exception(regs, X86_TRAP_GP, error_code, 0))
- return;
+ goto exit;
show_signal(tsk, SIGSEGV, "", desc, regs, error_code);
force_sig(SIGSEGV);
@@ -1057,7 +1057,7 @@ static void math_error(struct pt_regs *regs, int trapnr)
goto exit;
if (fixup_vdso_exception(regs, trapnr, 0, 0))
- return;
+ goto exit;
force_sig_fault(SIGFPE, si_code,
(void __user *)uprobe_get_trap_addr(regs));
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 5d042b359d54..57ec01192180 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -14,6 +14,7 @@
#include <linux/percpu.h>
#include <linux/timex.h>
#include <linux/static_key.h>
+#include <linux/static_call.h>
#include <asm/hpet.h>
#include <asm/timer.h>
@@ -254,7 +255,7 @@ unsigned long long sched_clock(void)
bool using_native_sched_clock(void)
{
- return pv_ops.time.sched_clock == native_sched_clock;
+ return static_call_query(pv_sched_clock) == native_sched_clock;
}
#else
unsigned long long
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index a788d5120d4d..f6b93a35ce14 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -84,6 +84,18 @@ config KVM_INTEL
To compile this as a module, choose M here: the module
will be called kvm-intel.
+config X86_SGX_KVM
+ bool "Software Guard eXtensions (SGX) Virtualization"
+ depends on X86_SGX && KVM_INTEL
+ help
+
+ Enables KVM guests to create SGX enclaves.
+
+ This includes support to expose "raw" unreclaimable enclave memory to
+ guests via a device node, e.g. /dev/sgx_vepc.
+
+ If unsure, say N.
+
config KVM_AMD
tristate "KVM for AMD processors support"
depends on KVM
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 1b4766fe1de2..eafc4d601f25 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
-ccflags-y += -Iarch/x86/kvm
+ccflags-y += -I $(srctree)/arch/x86/kvm
ccflags-$(CONFIG_KVM_WERROR) += -Werror
ifeq ($(CONFIG_FRAME_POINTER),y)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index b6f9a817d9b1..62b1729277ef 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5884,6 +5884,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
struct kvm_mmu_page *sp;
unsigned int ratio;
LIST_HEAD(invalid_list);
+ bool flush = false;
ulong to_zap;
rcu_idx = srcu_read_lock(&kvm->srcu);
@@ -5905,19 +5906,19 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
lpage_disallowed_link);
WARN_ON_ONCE(!sp->lpage_disallowed);
if (is_tdp_mmu_page(sp)) {
- kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn,
- sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level));
+ flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
} else {
kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
WARN_ON_ONCE(sp->lpage_disallowed);
}
if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
- kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
cond_resched_rwlock_write(&kvm->mmu_lock);
+ flush = false;
}
}
- kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
write_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, rcu_idx);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 1436b1d1f4c0..34207b874886 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -86,7 +86,7 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
- gfn_t start, gfn_t end, bool can_yield);
+ gfn_t start, gfn_t end, bool can_yield, bool flush);
void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
@@ -99,7 +99,7 @@ void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
list_del(&root->link);
- zap_gfn_range(kvm, root, 0, max_gfn, false);
+ zap_gfn_range(kvm, root, 0, max_gfn, false, false);
free_page((unsigned long)root->spt);
kmem_cache_free(mmu_page_header_cache, root);
@@ -668,20 +668,21 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
* scheduler needs the CPU or there is contention on the MMU lock. If this
* function cannot yield, it will not release the MMU lock or reschedule and
* the caller must ensure it does not supply too large a GFN range, or the
- * operation can cause a soft lockup.
+ * operation can cause a soft lockup. Note, in some use cases a flush may be
+ * required by prior actions. Ensure the pending flush is performed prior to
+ * yielding.
*/
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
- gfn_t start, gfn_t end, bool can_yield)
+ gfn_t start, gfn_t end, bool can_yield, bool flush)
{
struct tdp_iter iter;
- bool flush_needed = false;
rcu_read_lock();
tdp_root_for_each_pte(iter, root, start, end) {
if (can_yield &&
- tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) {
- flush_needed = false;
+ tdp_mmu_iter_cond_resched(kvm, &iter, flush)) {
+ flush = false;
continue;
}
@@ -699,11 +700,11 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
continue;
tdp_mmu_set_spte(kvm, &iter, 0);
- flush_needed = true;
+ flush = true;
}
rcu_read_unlock();
- return flush_needed;
+ return flush;
}
/*
@@ -712,13 +713,14 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
* SPTEs have been cleared and a TLB flush is needed before releasing the
* MMU lock.
*/
-bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
+bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end,
+ bool can_yield)
{
struct kvm_mmu_page *root;
bool flush = false;
for_each_tdp_mmu_root_yield_safe(kvm, root)
- flush |= zap_gfn_range(kvm, root, start, end, true);
+ flush = zap_gfn_range(kvm, root, start, end, can_yield, flush);
return flush;
}
@@ -930,7 +932,7 @@ static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
struct kvm_mmu_page *root, gfn_t start,
gfn_t end, unsigned long unused)
{
- return zap_gfn_range(kvm, root, start, end, false);
+ return zap_gfn_range(kvm, root, start, end, false, false);
}
int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 3b761c111bff..31096ece9b14 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -8,7 +8,29 @@
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root);
-bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end);
+bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end,
+ bool can_yield);
+static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start,
+ gfn_t end)
+{
+ return __kvm_tdp_mmu_zap_gfn_range(kvm, start, end, true);
+}
+static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ gfn_t end = sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level);
+
+ /*
+ * Don't allow yielding, as the caller may have a flush pending. Note,
+ * if mmu_lock is held for write, zapping will never yield in this case,
+ * but explicitly disallow it for safety. The TDP MMU does not yield
+ * until it has made forward progress (steps sideways), and when zapping
+ * a single shadow page that it's guaranteed to see (thus the mmu_lock
+ * requirement), its "step sideways" will always step beyond the bounds
+ * of the shadow page's gfn range and stop iterating before yielding.
+ */
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ return __kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, end, false);
+}
void kvm_tdp_mmu_zap_all(struct kvm *kvm);
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 35891d9a1099..fb204eaa8bb3 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -246,11 +246,18 @@ static bool nested_vmcb_check_controls(struct vmcb_control_area *control)
return true;
}
-static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12)
+static bool nested_vmcb_check_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
bool vmcb12_lma;
+ /*
+ * FIXME: these should be done after copying the fields,
+ * to avoid TOC/TOU races. For these save area checks
+ * the possible damage is limited since kvm_set_cr0 and
+ * kvm_set_cr4 handle failure; EFER_SVME is an exception
+ * so it is force-set later in nested_prepare_vmcb_save.
+ */
if ((vmcb12->save.efer & EFER_SVME) == 0)
return false;
@@ -271,7 +278,7 @@ static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12)
if (!kvm_is_valid_cr4(&svm->vcpu, vmcb12->save.cr4))
return false;
- return nested_vmcb_check_controls(&vmcb12->control);
+ return true;
}
static void load_nested_vmcb_control(struct vcpu_svm *svm,
@@ -396,7 +403,14 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
svm->vmcb->save.gdtr = vmcb12->save.gdtr;
svm->vmcb->save.idtr = vmcb12->save.idtr;
kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);
- svm_set_efer(&svm->vcpu, vmcb12->save.efer);
+
+ /*
+ * Force-set EFER_SVME even though it is checked earlier on the
+ * VMCB12, because the guest can flip the bit between the check
+ * and now. Clearing EFER_SVME would call svm_free_nested.
+ */
+ svm_set_efer(&svm->vcpu, vmcb12->save.efer | EFER_SVME);
+
svm_set_cr0(&svm->vcpu, vmcb12->save.cr0);
svm_set_cr4(&svm->vcpu, vmcb12->save.cr4);
svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = vmcb12->save.cr2;
@@ -468,7 +482,6 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
svm->nested.vmcb12_gpa = vmcb12_gpa;
- load_nested_vmcb_control(svm, &vmcb12->control);
nested_prepare_vmcb_control(svm);
nested_prepare_vmcb_save(svm, vmcb12);
@@ -515,7 +528,10 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
if (WARN_ON_ONCE(!svm->nested.initialized))
return -EINVAL;
- if (!nested_vmcb_checks(svm, vmcb12)) {
+ load_nested_vmcb_control(svm, &vmcb12->control);
+
+ if (!nested_vmcb_check_save(svm, vmcb12) ||
+ !nested_vmcb_check_controls(&svm->nested.ctl)) {
vmcb12->control.exit_code = SVM_EXIT_ERR;
vmcb12->control.exit_code_hi = 0;
vmcb12->control.exit_info_1 = 0;
@@ -1209,6 +1225,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
*/
if (!(save->cr0 & X86_CR0_PG))
goto out_free;
+ if (!(save->efer & EFER_SVME))
+ goto out_free;
/*
* All checks done, we can enter guest mode. L1 control fields
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index 035da07500e8..fdf587f19c5f 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -98,6 +98,8 @@ static enum index msr_to_index(u32 msr)
static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
enum pmu_type type)
{
+ struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
+
switch (msr) {
case MSR_F15H_PERF_CTL0:
case MSR_F15H_PERF_CTL1:
@@ -105,6 +107,9 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
case MSR_F15H_PERF_CTL3:
case MSR_F15H_PERF_CTL4:
case MSR_F15H_PERF_CTL5:
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE))
+ return NULL;
+ fallthrough;
case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
if (type != PMU_TYPE_EVNTSEL)
return NULL;
@@ -115,6 +120,9 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
case MSR_F15H_PERF_CTR3:
case MSR_F15H_PERF_CTR4:
case MSR_F15H_PERF_CTR5:
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE))
+ return NULL;
+ fallthrough;
case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
if (type != PMU_TYPE_COUNTER)
return NULL;
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index bdd94bf23b75..bcbf0d2139e9 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6027,19 +6027,19 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
exit_reason.basic != EXIT_REASON_PML_FULL &&
exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
exit_reason.basic != EXIT_REASON_TASK_SWITCH)) {
+ int ndata = 3;
+
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
- vcpu->run->internal.ndata = 3;
vcpu->run->internal.data[0] = vectoring_info;
vcpu->run->internal.data[1] = exit_reason.full;
vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) {
- vcpu->run->internal.ndata++;
- vcpu->run->internal.data[3] =
+ vcpu->run->internal.data[ndata++] =
vmcs_read64(GUEST_PHYSICAL_ADDRESS);
}
- vcpu->run->internal.data[vcpu->run->internal.ndata++] =
- vcpu->arch.last_vmentry_cpu;
+ vcpu->run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu;
+ vcpu->run->internal.ndata = ndata;
return 0;
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cb42c201adef..efc7a82ab140 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -271,8 +271,7 @@ static struct kmem_cache *x86_emulator_cache;
* When called, it means the previous get/set msr reached an invalid msr.
* Return true if we want to ignore/silent this failed msr access.
*/
-static bool kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr,
- u64 data, bool write)
+static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write)
{
const char *op = write ? "wrmsr" : "rdmsr";
@@ -1445,7 +1444,7 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
if (r == KVM_MSR_RET_INVALID) {
/* Unconditionally clear the output for simplicity */
*data = 0;
- if (kvm_msr_ignored_check(vcpu, index, 0, false))
+ if (kvm_msr_ignored_check(index, 0, false))
r = 0;
}
@@ -1620,7 +1619,7 @@ static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
int ret = __kvm_set_msr(vcpu, index, data, host_initiated);
if (ret == KVM_MSR_RET_INVALID)
- if (kvm_msr_ignored_check(vcpu, index, data, true))
+ if (kvm_msr_ignored_check(index, data, true))
ret = 0;
return ret;
@@ -1658,7 +1657,7 @@ static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
if (ret == KVM_MSR_RET_INVALID) {
/* Unconditionally clear *data for simplicity */
*data = 0;
- if (kvm_msr_ignored_check(vcpu, index, 0, false))
+ if (kvm_msr_ignored_check(index, 0, false))
ret = 0;
}
@@ -2329,7 +2328,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
kvm_vcpu_write_tsc_offset(vcpu, offset);
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
- spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
+ spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags);
if (!matched) {
kvm->arch.nr_vcpus_matched_tsc = 0;
} else if (!already_matched) {
@@ -2337,7 +2336,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
}
kvm_track_tsc_matching(vcpu);
- spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
+ spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags);
}
static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
@@ -2559,13 +2558,16 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
int i;
struct kvm_vcpu *vcpu;
struct kvm_arch *ka = &kvm->arch;
+ unsigned long flags;
kvm_hv_invalidate_tsc_page(kvm);
- spin_lock(&ka->pvclock_gtod_sync_lock);
kvm_make_mclock_inprogress_request(kvm);
+
/* no guest entries from this point */
+ spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
pvclock_update_vm_gtod_copy(kvm);
+ spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -2573,8 +2575,6 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
/* guest entries allowed */
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
-
- spin_unlock(&ka->pvclock_gtod_sync_lock);
#endif
}
@@ -2582,17 +2582,18 @@ u64 get_kvmclock_ns(struct kvm *kvm)
{
struct kvm_arch *ka = &kvm->arch;
struct pvclock_vcpu_time_info hv_clock;
+ unsigned long flags;
u64 ret;
- spin_lock(&ka->pvclock_gtod_sync_lock);
+ spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
if (!ka->use_master_clock) {
- spin_unlock(&ka->pvclock_gtod_sync_lock);
+ spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
return get_kvmclock_base_ns() + ka->kvmclock_offset;
}
hv_clock.tsc_timestamp = ka->master_cycle_now;
hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
- spin_unlock(&ka->pvclock_gtod_sync_lock);
+ spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
/* both __this_cpu_read() and rdtsc() should be on the same cpu */
get_cpu();
@@ -2686,13 +2687,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
* If the host uses TSC clock, then passthrough TSC as stable
* to the guest.
*/
- spin_lock(&ka->pvclock_gtod_sync_lock);
+ spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
use_master_clock = ka->use_master_clock;
if (use_master_clock) {
host_tsc = ka->master_cycle_now;
kernel_ns = ka->master_kernel_ns;
}
- spin_unlock(&ka->pvclock_gtod_sync_lock);
+ spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
@@ -4024,7 +4025,6 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
{
struct kvm_host_map map;
struct kvm_steal_time *st;
- int idx;
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
@@ -4032,15 +4032,9 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
if (vcpu->arch.st.preempted)
return;
- /*
- * Take the srcu lock as memslots will be accessed to check the gfn
- * cache generation against the memslots generation.
- */
- idx = srcu_read_lock(&vcpu->kvm->srcu);
-
if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
&vcpu->arch.st.cache, true))
- goto out;
+ return;
st = map.hva +
offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
@@ -4048,20 +4042,25 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
-
-out:
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
+ int idx;
+
if (vcpu->preempted && !vcpu->arch.guest_state_protected)
vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
+ /*
+ * Take the srcu lock as memslots will be accessed to check the gfn
+ * cache generation against the memslots generation.
+ */
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
if (kvm_xen_msr_enabled(vcpu->kvm))
kvm_xen_runstate_set_preempted(vcpu);
else
kvm_steal_time_set_preempted(vcpu);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
static_call(kvm_x86_vcpu_put)(vcpu);
vcpu->arch.last_host_tsc = rdtsc();
@@ -5726,6 +5725,7 @@ set_pit2_out:
}
#endif
case KVM_SET_CLOCK: {
+ struct kvm_arch *ka = &kvm->arch;
struct kvm_clock_data user_ns;
u64 now_ns;
@@ -5744,8 +5744,22 @@ set_pit2_out:
* pvclock_update_vm_gtod_copy().
*/
kvm_gen_update_masterclock(kvm);
- now_ns = get_kvmclock_ns(kvm);
- kvm->arch.kvmclock_offset += user_ns.clock - now_ns;
+
+ /*
+ * This pairs with kvm_guest_time_update(): when masterclock is
+ * in use, we use master_kernel_ns + kvmclock_offset to set
+ * unsigned 'system_time' so if we use get_kvmclock_ns() (which
+ * is slightly ahead) here we risk going negative on unsigned
+ * 'system_time' when 'user_ns.clock' is very small.
+ */
+ spin_lock_irq(&ka->pvclock_gtod_sync_lock);
+ if (kvm->arch.use_master_clock)
+ now_ns = ka->master_kernel_ns;
+ else
+ now_ns = get_kvmclock_base_ns();
+ ka->kvmclock_offset = user_ns.clock - now_ns;
+ spin_unlock_irq(&ka->pvclock_gtod_sync_lock);
+
kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
break;
}
@@ -7724,6 +7738,7 @@ static void kvm_hyperv_tsc_notifier(void)
struct kvm *kvm;
struct kvm_vcpu *vcpu;
int cpu;
+ unsigned long flags;
mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
@@ -7739,17 +7754,15 @@ static void kvm_hyperv_tsc_notifier(void)
list_for_each_entry(kvm, &vm_list, vm_list) {
struct kvm_arch *ka = &kvm->arch;
- spin_lock(&ka->pvclock_gtod_sync_lock);
-
+ spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
pvclock_update_vm_gtod_copy(kvm);
+ spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
kvm_for_each_vcpu(cpu, vcpu, kvm)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
kvm_for_each_vcpu(cpu, vcpu, kvm)
kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
-
- spin_unlock(&ka->pvclock_gtod_sync_lock);
}
mutex_unlock(&kvm_lock);
}
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 39eb04887141..9035e34aa156 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -250,7 +250,6 @@ static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu)
void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs);
void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
-void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
u64 get_kvmclock_ns(struct kvm *kvm);
int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S
index 3b6544111ac9..16bc9130e7a5 100644
--- a/arch/x86/lib/atomic64_386_32.S
+++ b/arch/x86/lib/atomic64_386_32.S
@@ -6,7 +6,7 @@
*/
#include <linux/linkage.h>
-#include <asm/alternative-asm.h>
+#include <asm/alternative.h>
/* if you want SMP support, implement these with real spinlocks */
.macro LOCK reg
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
index 1c5c81c16b06..ce6935690766 100644
--- a/arch/x86/lib/atomic64_cx8_32.S
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -6,7 +6,7 @@
*/
#include <linux/linkage.h>
-#include <asm/alternative-asm.h>
+#include <asm/alternative.h>
.macro read64 reg
movl %ebx, %eax
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 2402d4c489d2..db4b4f9197c7 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -3,7 +3,7 @@
#include <linux/linkage.h>
#include <asm/cpufeatures.h>
-#include <asm/alternative-asm.h>
+#include <asm/alternative.h>
#include <asm/export.h>
/*
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 77b9b2a3b5c8..57b79c577496 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -11,7 +11,7 @@
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/cpufeatures.h>
-#include <asm/alternative-asm.h>
+#include <asm/alternative.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/export.h>
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 1e299ac73c86..1cc9da6e29c7 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -4,7 +4,7 @@
#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
-#include <asm/alternative-asm.h>
+#include <asm/alternative.h>
#include <asm/export.h>
.pushsection .noinstr.text, "ax"
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 41902fe8b859..64801010d312 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -8,7 +8,7 @@
*/
#include <linux/linkage.h>
#include <asm/cpufeatures.h>
-#include <asm/alternative-asm.h>
+#include <asm/alternative.h>
#include <asm/export.h>
#undef memmove
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 0bfd26e4ca9e..9827ae267f96 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -3,7 +3,7 @@
#include <linux/linkage.h>
#include <asm/cpufeatures.h>
-#include <asm/alternative-asm.h>
+#include <asm/alternative.h>
#include <asm/export.h>
/*
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index f6fb1d218dcc..6bb74b5c238c 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -4,7 +4,7 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/cpufeatures.h>
-#include <asm/alternative-asm.h>
+#include <asm/alternative.h>
#include <asm/export.h>
#include <asm/nospec-branch.h>
#include <asm/unwind_hints.h>
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 4b01f7dbaf30..f633f9e23b8f 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -19,6 +19,7 @@
#include <linux/kernel.h>
#include <linux/bitops.h>
#include <linux/dma-mapping.h>
+#include <linux/virtio_config.h>
#include <asm/tlbflush.h>
#include <asm/fixmap.h>
@@ -262,7 +263,7 @@ static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc)
if (pgprot_val(old_prot) == pgprot_val(new_prot))
return;
- pa = pfn << page_level_shift(level);
+ pa = pfn << PAGE_SHIFT;
size = page_level_size(level);
/*
@@ -484,3 +485,8 @@ void __init mem_encrypt_init(void)
print_mem_encrypt_feature_info();
}
+int arch_has_restricted_virtio_memory_access(void)
+{
+ return sev_active();
+}
+EXPORT_SYMBOL_GPL(arch_has_restricted_virtio_memory_access);
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index 6c5eb6f3f14f..a19374d26101 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -503,14 +503,10 @@ void __init sme_enable(struct boot_params *bp)
#define AMD_SME_BIT BIT(0)
#define AMD_SEV_BIT BIT(1)
- /*
- * Set the feature mask (SME or SEV) based on whether we are
- * running under a hypervisor.
- */
- eax = 1;
- ecx = 0;
- native_cpuid(&eax, &ebx, &ecx, &edx);
- feature_mask = (ecx & BIT(31)) ? AMD_SEV_BIT : AMD_SME_BIT;
+
+ /* Check the SEV MSR whether SEV or SME is enabled */
+ sev_status = __rdmsr(MSR_AMD64_SEV);
+ feature_mask = (sev_status & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT;
/*
* Check for the SME/SEV feature:
@@ -530,19 +526,26 @@ void __init sme_enable(struct boot_params *bp)
/* Check if memory encryption is enabled */
if (feature_mask == AMD_SME_BIT) {
+ /*
+ * No SME if Hypervisor bit is set. This check is here to
+ * prevent a guest from trying to enable SME. For running as a
+ * KVM guest the MSR_K8_SYSCFG will be sufficient, but there
+ * might be other hypervisors which emulate that MSR as non-zero
+ * or even pass it through to the guest.
+ * A malicious hypervisor can still trick a guest into this
+ * path, but there is no way to protect against that.
+ */
+ eax = 1;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ if (ecx & BIT(31))
+ return;
+
/* For SME, check the SYSCFG MSR */
msr = __rdmsr(MSR_K8_SYSCFG);
if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
return;
} else {
- /* For SEV, check the SEV MSR */
- msr = __rdmsr(MSR_AMD64_SEV);
- if (!(msr & MSR_AMD64_SEV_ENABLED))
- return;
-
- /* Save SEV_STATUS to avoid reading MSR again */
- sev_status = msr;
-
/* SEV state cannot be controlled by a command line option */
sme_me_mask = me_mask;
sev_enabled = true;
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 3a310daa8e31..220e72434f3c 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1689,7 +1689,16 @@ emit_jmp:
}
if (image) {
- if (unlikely(proglen + ilen > oldproglen)) {
+ /*
+ * When populating the image, assert that:
+ *
+ * i) We do not write beyond the allocated space, and
+ * ii) addrs[i] did not change from the prior run, in order
+ * to validate assumptions made for computing branch
+ * displacements.
+ */
+ if (unlikely(proglen + ilen > oldproglen ||
+ proglen + ilen != addrs[i])) {
pr_err("bpf_jit: fatal error\n");
return -EFAULT;
}
@@ -1936,7 +1945,7 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog,
* add rsp, 8 // skip eth_type_trans's frame
* ret // return to its caller
*/
-int arch_prepare_bpf_trampoline(void *image, void *image_end,
+int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
const struct btf_func_model *m, u32 flags,
struct bpf_tramp_progs *tprogs,
void *orig_call)
@@ -1975,6 +1984,15 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end,
save_regs(m, &prog, nr_args, stack_size);
+ if (flags & BPF_TRAMP_F_CALL_ORIG) {
+ /* arg1: mov rdi, im */
+ emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im);
+ if (emit_call(&prog, __bpf_tramp_enter, prog)) {
+ ret = -EINVAL;
+ goto cleanup;
+ }
+ }
+
if (fentry->nr_progs)
if (invoke_bpf(m, &prog, fentry, stack_size))
return -EINVAL;
@@ -1993,8 +2011,7 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end,
}
if (flags & BPF_TRAMP_F_CALL_ORIG) {
- if (fentry->nr_progs || fmod_ret->nr_progs)
- restore_regs(m, &prog, nr_args, stack_size);
+ restore_regs(m, &prog, nr_args, stack_size);
/* call original function */
if (emit_call(&prog, orig_call, prog)) {
@@ -2003,6 +2020,9 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end,
}
/* remember return value in a stack for bpf prog to access */
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
+ im->ip_after_call = prog;
+ memcpy(prog, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE);
+ prog += X86_PATCH_SIZE;
}
if (fmod_ret->nr_progs) {
@@ -2033,9 +2053,17 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end,
* the return value is only updated on the stack and still needs to be
* restored to R0.
*/
- if (flags & BPF_TRAMP_F_CALL_ORIG)
+ if (flags & BPF_TRAMP_F_CALL_ORIG) {
+ im->ip_epilogue = prog;
+ /* arg1: mov rdi, im */
+ emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im);
+ if (emit_call(&prog, __bpf_tramp_exit, prog)) {
+ ret = -EINVAL;
+ goto cleanup;
+ }
/* restore original return value back into RAX */
emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8);
+ }
EMIT1(0x5B); /* pop rbx */
EMIT1(0xC9); /* leave */
@@ -2225,7 +2253,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
padding = true;
goto skip_init_addrs;
}
- addrs = kmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL);
+ addrs = kvmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL);
if (!addrs) {
prog = orig_prog;
goto out_addrs;
@@ -2317,7 +2345,7 @@ out_image:
if (image)
bpf_prog_fill_jited_linfo(prog, addrs + 1);
out_addrs:
- kfree(addrs);
+ kvfree(addrs);
kfree(jit_data);
prog->aux->jit_data = NULL;
}
diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
index d17b67c69f89..6a99def7d315 100644
--- a/arch/x86/net/bpf_jit_comp32.c
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -2276,7 +2276,16 @@ notyet:
}
if (image) {
- if (unlikely(proglen + ilen > oldproglen)) {
+ /*
+ * When populating the image, assert that:
+ *
+ * i) We do not write beyond the allocated space, and
+ * ii) addrs[i] did not change from the prior run, in order
+ * to validate assumptions made for computing branch
+ * displacements.
+ */
+ if (unlikely(proglen + ilen > oldproglen ||
+ proglen + ilen != addrs[i])) {
pr_err("bpf_jit: fatal error\n");
return -EFAULT;
}
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index dc0a337f985b..4f18cd9eacd8 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -1070,8 +1070,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.read_pmc = xen_read_pmc,
- .iret = xen_iret,
-
.load_tr_desc = paravirt_nop,
.set_ldt = xen_set_ldt,
.load_gdt = xen_load_gdt,
@@ -1233,8 +1231,8 @@ asmlinkage __visible void __init xen_start_kernel(void)
/* Install Xen paravirt ops */
pv_info = xen_info;
- pv_ops.init.patch = paravirt_patch_default;
pv_ops.cpu = xen_cpu_ops;
+ paravirt_iret = xen_iret;
xen_init_irq_ops();
/*
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 17d80f751fcb..ac06ca32e9ef 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -98,8 +98,8 @@ EXPORT_SYMBOL_GPL(xen_p2m_size);
unsigned long xen_max_p2m_pfn __read_mostly;
EXPORT_SYMBOL_GPL(xen_max_p2m_pfn);
-#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT
-#define P2M_LIMIT CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT
+#ifdef CONFIG_XEN_MEMORY_HOTPLUG_LIMIT
+#define P2M_LIMIT CONFIG_XEN_MEMORY_HOTPLUG_LIMIT
#else
#define P2M_LIMIT 0
#endif
@@ -416,9 +416,6 @@ void __init xen_vmalloc_p2m_tree(void)
xen_p2m_last_pfn = xen_max_p2m_pfn;
p2m_limit = (phys_addr_t)P2M_LIMIT * 1024 * 1024 * 1024 / PAGE_SIZE;
- if (!p2m_limit && IS_ENABLED(CONFIG_XEN_UNPOPULATED_ALLOC))
- p2m_limit = xen_start_info->nr_pages * XEN_EXTRA_MEM_RATIO;
-
vm.flags = VM_ALLOC;
vm.size = ALIGN(sizeof(unsigned long) * max(xen_max_p2m_pfn, p2m_limit),
PMD_SIZE * PMDS_PER_MID_PAGE);
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 1a3b75652fa4..8bfc10330107 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -59,6 +59,18 @@ static struct {
} xen_remap_buf __initdata __aligned(PAGE_SIZE);
static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;
+/*
+ * The maximum amount of extra memory compared to the base size. The
+ * main scaling factor is the size of struct page. At extreme ratios
+ * of base:extra, all the base memory can be filled with page
+ * structures for the extra memory, leaving no space for anything
+ * else.
+ *
+ * 10x seems like a reasonable balance between scaling flexibility and
+ * leaving a practically usable system.
+ */
+#define EXTRA_MEM_RATIO (10)
+
static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB);
static void __init xen_parse_512gb(void)
@@ -778,13 +790,13 @@ char * __init xen_memory_setup(void)
extra_pages += max_pages - max_pfn;
/*
- * Clamp the amount of extra memory to a XEN_EXTRA_MEM_RATIO
+ * Clamp the amount of extra memory to a EXTRA_MEM_RATIO
* factor the base size.
*
* Make sure we have no memory above max_pages, as this area
* isn't handled by the p2m management.
*/
- extra_pages = min3(XEN_EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
+ extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
extra_pages, max_pages - max_pfn);
i = 0;
addr = xen_e820_table.entries[0].addr;
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 91f5b330dcc6..d9c945ee1100 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -379,11 +379,6 @@ void xen_timer_resume(void)
}
}
-static const struct pv_time_ops xen_time_ops __initconst = {
- .sched_clock = xen_sched_clock,
- .steal_clock = xen_steal_clock,
-};
-
static struct pvclock_vsyscall_time_info *xen_clock __read_mostly;
static u64 xen_clock_value_saved;
@@ -525,17 +520,24 @@ static void __init xen_time_init(void)
pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier);
}
-void __init xen_init_time_ops(void)
+static void __init xen_init_time_common(void)
{
xen_sched_clock_offset = xen_clocksource_read();
- pv_ops.time = xen_time_ops;
+ static_call_update(pv_steal_clock, xen_steal_clock);
+ paravirt_set_sched_clock(xen_sched_clock);
+
+ x86_platform.calibrate_tsc = xen_tsc_khz;
+ x86_platform.get_wallclock = xen_get_wallclock;
+}
+
+void __init xen_init_time_ops(void)
+{
+ xen_init_time_common();
x86_init.timers.timer_init = xen_time_init;
x86_init.timers.setup_percpu_clockev = x86_init_noop;
x86_cpuinit.setup_percpu_clockev = x86_init_noop;
- x86_platform.calibrate_tsc = xen_tsc_khz;
- x86_platform.get_wallclock = xen_get_wallclock;
/* Dom0 uses the native method to set the hardware RTC. */
if (!xen_initial_domain())
x86_platform.set_wallclock = xen_set_wallclock;
@@ -569,13 +571,11 @@ void __init xen_hvm_init_time_ops(void)
return;
}
- xen_sched_clock_offset = xen_clocksource_read();
- pv_ops.time = xen_time_ops;
+ xen_init_time_common();
+
x86_init.timers.setup_percpu_clockev = xen_time_init;
x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents;
- x86_platform.calibrate_tsc = xen_tsc_khz;
- x86_platform.get_wallclock = xen_get_wallclock;
x86_platform.set_wallclock = xen_set_wallclock;
}
#endif