diff options
Diffstat (limited to 'arch/x86')
29 files changed, 3778 insertions, 19 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fbf26e0f7a6a..52e36adb5112 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1931,6 +1931,23 @@ config X86_INTEL_TSX_MODE_AUTO side channel attacks- equals the tsx=auto command line parameter. endchoice +config X86_SGX + bool "Software Guard eXtensions (SGX)" + depends on X86_64 && CPU_SUP_INTEL + depends on CRYPTO=y + depends on CRYPTO_SHA256=y + select SRCU + select MMU_NOTIFIER + help + Intel(R) Software Guard eXtensions (SGX) is a set of CPU instructions + that can be used by applications to set aside private regions of code + and data, referred to as enclaves. An enclave's private memory can + only be accessed by code running within the enclave. Accesses from + outside the enclave, including other enclaves, are disallowed by + hardware. + + If unsure, say N. + config EFI bool "EFI runtime service support" depends on ACPI diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 21243747965d..02e3e42f380b 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -27,9 +27,10 @@ VDSO32-$(CONFIG_IA32_EMULATION) := y vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vobjs32-y := vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o vobjs32-y += vdso32/vclock_gettime.o +vobjs-$(CONFIG_X86_SGX) += vsgx.o # files to link into kernel -obj-y += vma.o +obj-y += vma.o extable.o KASAN_SANITIZE_vma.o := y UBSAN_SANITIZE_vma.o := y KCSAN_SANITIZE_vma.o := y @@ -98,6 +99,7 @@ $(vobjs): KBUILD_CFLAGS := $(filter-out $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS CFLAGS_REMOVE_vclock_gettime.o = -pg CFLAGS_REMOVE_vdso32/vclock_gettime.o = -pg CFLAGS_REMOVE_vgetcpu.o = -pg +CFLAGS_REMOVE_vsgx.o = -pg # # X32 processes use x32 vDSO to access 64bit kernel data. @@ -128,8 +130,8 @@ $(obj)/%-x32.o: $(obj)/%.o FORCE targets += vdsox32.lds $(vobjx32s-y) -$(obj)/%.so: OBJCOPYFLAGS := -S -$(obj)/%.so: $(obj)/%.so.dbg FORCE +$(obj)/%.so: OBJCOPYFLAGS := -S --remove-section __ex_table +$(obj)/%.so: $(obj)/%.so.dbg $(call if_changed,objcopy) $(obj)/vdsox32.so.dbg: $(obj)/vdsox32.lds $(vobjx32s) FORCE diff --git a/arch/x86/entry/vdso/extable.c b/arch/x86/entry/vdso/extable.c new file mode 100644 index 000000000000..afcf5b65beef --- /dev/null +++ b/arch/x86/entry/vdso/extable.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/err.h> +#include <linux/mm.h> +#include <asm/current.h> +#include <asm/traps.h> +#include <asm/vdso.h> + +struct vdso_exception_table_entry { + int insn, fixup; +}; + +bool fixup_vdso_exception(struct pt_regs *regs, int trapnr, + unsigned long error_code, unsigned long fault_addr) +{ + const struct vdso_image *image = current->mm->context.vdso_image; + const struct vdso_exception_table_entry *extable; + unsigned int nr_entries, i; + unsigned long base; + + /* + * Do not attempt to fixup #DB or #BP. It's impossible to identify + * whether or not a #DB/#BP originated from within an SGX enclave and + * SGX enclaves are currently the only use case for vDSO fixup. + */ + if (trapnr == X86_TRAP_DB || trapnr == X86_TRAP_BP) + return false; + + if (!current->mm->context.vdso) + return false; + + base = (unsigned long)current->mm->context.vdso + image->extable_base; + nr_entries = image->extable_len / (sizeof(*extable)); + extable = image->extable; + + for (i = 0; i < nr_entries; i++) { + if (regs->ip == base + extable[i].insn) { + regs->ip = base + extable[i].fixup; + regs->di = trapnr; + regs->si = error_code; + regs->dx = fault_addr; + return true; + } + } + + return false; +} diff --git a/arch/x86/entry/vdso/extable.h b/arch/x86/entry/vdso/extable.h new file mode 100644 index 000000000000..b56f6b012941 --- /dev/null +++ b/arch/x86/entry/vdso/extable.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __VDSO_EXTABLE_H +#define __VDSO_EXTABLE_H + +/* + * Inject exception fixup for vDSO code. Unlike normal exception fixup, + * vDSO uses a dedicated handler the addresses are relative to the overall + * exception table, not each individual entry. + */ +#ifdef __ASSEMBLY__ +#define _ASM_VDSO_EXTABLE_HANDLE(from, to) \ + ASM_VDSO_EXTABLE_HANDLE from to + +.macro ASM_VDSO_EXTABLE_HANDLE from:req to:req + .pushsection __ex_table, "a" + .long (\from) - __ex_table + .long (\to) - __ex_table + .popsection +.endm +#else +#define _ASM_VDSO_EXTABLE_HANDLE(from, to) \ + ".pushsection __ex_table, \"a\"\n" \ + ".long (" #from ") - __ex_table\n" \ + ".long (" #to ") - __ex_table\n" \ + ".popsection\n" +#endif + +#endif /* __VDSO_EXTABLE_H */ diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S index 4d152933547d..dc8da7695859 100644 --- a/arch/x86/entry/vdso/vdso-layout.lds.S +++ b/arch/x86/entry/vdso/vdso-layout.lds.S @@ -75,11 +75,18 @@ SECTIONS * stuff that isn't used at runtime in between. */ - .text : { *(.text*) } :text =0x90909090, + .text : { + *(.text*) + *(.fixup) + } :text =0x90909090, + + .altinstructions : { *(.altinstructions) } :text .altinstr_replacement : { *(.altinstr_replacement) } :text + __ex_table : { *(__ex_table) } :text + /DISCARD/ : { *(.discard) *(.discard.*) diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S index 36b644e16272..4bf48462fca7 100644 --- a/arch/x86/entry/vdso/vdso.lds.S +++ b/arch/x86/entry/vdso/vdso.lds.S @@ -27,6 +27,7 @@ VERSION { __vdso_time; clock_getres; __vdso_clock_getres; + __vdso_sgx_enter_enclave; local: *; }; } diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h index 6f46e11ce539..1c7cfac7e64a 100644 --- a/arch/x86/entry/vdso/vdso2c.h +++ b/arch/x86/entry/vdso/vdso2c.h @@ -5,6 +5,41 @@ * are built for 32-bit userspace. */ +static void BITSFUNC(copy)(FILE *outfile, const unsigned char *data, size_t len) +{ + size_t i; + + for (i = 0; i < len; i++) { + if (i % 10 == 0) + fprintf(outfile, "\n\t"); + fprintf(outfile, "0x%02X, ", (int)(data)[i]); + } +} + + +/* + * Extract a section from the input data into a standalone blob. Used to + * capture kernel-only data that needs to persist indefinitely, e.g. the + * exception fixup tables, but only in the kernel, i.e. the section can + * be stripped from the final vDSO image. + */ +static void BITSFUNC(extract)(const unsigned char *data, size_t data_len, + FILE *outfile, ELF(Shdr) *sec, const char *name) +{ + unsigned long offset; + size_t len; + + offset = (unsigned long)GET_LE(&sec->sh_offset); + len = (size_t)GET_LE(&sec->sh_size); + + if (offset + len > data_len) + fail("section to extract overruns input data"); + + fprintf(outfile, "static const unsigned char %s[%lu] = {", name, len); + BITSFUNC(copy)(outfile, data + offset, len); + fprintf(outfile, "\n};\n\n"); +} + static void BITSFUNC(go)(void *raw_addr, size_t raw_len, void *stripped_addr, size_t stripped_len, FILE *outfile, const char *image_name) @@ -15,7 +50,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, ELF(Ehdr) *hdr = (ELF(Ehdr) *)raw_addr; unsigned long i, syms_nr; ELF(Shdr) *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr, - *alt_sec = NULL; + *alt_sec = NULL, *extable_sec = NULL; ELF(Dyn) *dyn = 0, *dyn_end = 0; const char *secstrings; INT_BITS syms[NSYMS] = {}; @@ -77,6 +112,8 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, if (!strcmp(secstrings + GET_LE(&sh->sh_name), ".altinstructions")) alt_sec = sh; + if (!strcmp(secstrings + GET_LE(&sh->sh_name), "__ex_table")) + extable_sec = sh; } if (!symtab_hdr) @@ -155,6 +192,9 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, (int)((unsigned char *)stripped_addr)[i]); } fprintf(outfile, "\n};\n\n"); + if (extable_sec) + BITSFUNC(extract)(raw_addr, raw_len, outfile, + extable_sec, "extable"); fprintf(outfile, "const struct vdso_image %s = {\n", image_name); fprintf(outfile, "\t.data = raw_data,\n"); @@ -165,6 +205,14 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, fprintf(outfile, "\t.alt_len = %lu,\n", (unsigned long)GET_LE(&alt_sec->sh_size)); } + if (extable_sec) { + fprintf(outfile, "\t.extable_base = %lu,\n", + (unsigned long)GET_LE(&extable_sec->sh_offset)); + fprintf(outfile, "\t.extable_len = %lu,\n", + (unsigned long)GET_LE(&extable_sec->sh_size)); + fprintf(outfile, "\t.extable = extable,\n"); + } + for (i = 0; i < NSYMS; i++) { if (required_syms[i].export && syms[i]) fprintf(outfile, "\t.sym_%s = %" PRIi64 ",\n", diff --git a/arch/x86/entry/vdso/vsgx.S b/arch/x86/entry/vdso/vsgx.S new file mode 100644 index 000000000000..86a0e94f68df --- /dev/null +++ b/arch/x86/entry/vdso/vsgx.S @@ -0,0 +1,151 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/linkage.h> +#include <asm/export.h> +#include <asm/errno.h> +#include <asm/enclu.h> + +#include "extable.h" + +/* Relative to %rbp. */ +#define SGX_ENCLAVE_OFFSET_OF_RUN 16 + +/* The offsets relative to struct sgx_enclave_run. */ +#define SGX_ENCLAVE_RUN_TCS 0 +#define SGX_ENCLAVE_RUN_LEAF 8 +#define SGX_ENCLAVE_RUN_EXCEPTION_VECTOR 12 +#define SGX_ENCLAVE_RUN_EXCEPTION_ERROR_CODE 14 +#define SGX_ENCLAVE_RUN_EXCEPTION_ADDR 16 +#define SGX_ENCLAVE_RUN_USER_HANDLER 24 +#define SGX_ENCLAVE_RUN_USER_DATA 32 /* not used */ +#define SGX_ENCLAVE_RUN_RESERVED_START 40 +#define SGX_ENCLAVE_RUN_RESERVED_END 256 + +.code64 +.section .text, "ax" + +SYM_FUNC_START(__vdso_sgx_enter_enclave) + /* Prolog */ + .cfi_startproc + push %rbp + .cfi_adjust_cfa_offset 8 + .cfi_rel_offset %rbp, 0 + mov %rsp, %rbp + .cfi_def_cfa_register %rbp + push %rbx + .cfi_rel_offset %rbx, -8 + + mov %ecx, %eax +.Lenter_enclave: + /* EENTER <= function <= ERESUME */ + cmp $EENTER, %eax + jb .Linvalid_input + cmp $ERESUME, %eax + ja .Linvalid_input + + mov SGX_ENCLAVE_OFFSET_OF_RUN(%rbp), %rcx + + /* Validate that the reserved area contains only zeros. */ + mov $SGX_ENCLAVE_RUN_RESERVED_START, %rbx +1: + cmpq $0, (%rcx, %rbx) + jne .Linvalid_input + add $8, %rbx + cmpq $SGX_ENCLAVE_RUN_RESERVED_END, %rbx + jne 1b + + /* Load TCS and AEP */ + mov SGX_ENCLAVE_RUN_TCS(%rcx), %rbx + lea .Lasync_exit_pointer(%rip), %rcx + + /* Single ENCLU serving as both EENTER and AEP (ERESUME) */ +.Lasync_exit_pointer: +.Lenclu_eenter_eresume: + enclu + + /* EEXIT jumps here unless the enclave is doing something fancy. */ + mov SGX_ENCLAVE_OFFSET_OF_RUN(%rbp), %rbx + + /* Set exit_reason. */ + movl $EEXIT, SGX_ENCLAVE_RUN_LEAF(%rbx) + + /* Invoke userspace's exit handler if one was provided. */ +.Lhandle_exit: + cmpq $0, SGX_ENCLAVE_RUN_USER_HANDLER(%rbx) + jne .Linvoke_userspace_handler + + /* Success, in the sense that ENCLU was attempted. */ + xor %eax, %eax + +.Lout: + pop %rbx + leave + .cfi_def_cfa %rsp, 8 + ret + + /* The out-of-line code runs with the pre-leave stack frame. */ + .cfi_def_cfa %rbp, 16 + +.Linvalid_input: + mov $(-EINVAL), %eax + jmp .Lout + +.Lhandle_exception: + mov SGX_ENCLAVE_OFFSET_OF_RUN(%rbp), %rbx + + /* Set the exception info. */ + mov %eax, (SGX_ENCLAVE_RUN_LEAF)(%rbx) + mov %di, (SGX_ENCLAVE_RUN_EXCEPTION_VECTOR)(%rbx) + mov %si, (SGX_ENCLAVE_RUN_EXCEPTION_ERROR_CODE)(%rbx) + mov %rdx, (SGX_ENCLAVE_RUN_EXCEPTION_ADDR)(%rbx) + jmp .Lhandle_exit + +.Linvoke_userspace_handler: + /* Pass the untrusted RSP (at exit) to the callback via %rcx. */ + mov %rsp, %rcx + + /* Save struct sgx_enclave_exception %rbx is about to be clobbered. */ + mov %rbx, %rax + + /* Save the untrusted RSP offset in %rbx (non-volatile register). */ + mov %rsp, %rbx + and $0xf, %rbx + + /* + * Align stack per x86_64 ABI. Note, %rsp needs to be 16-byte aligned + * _after_ pushing the parameters on the stack, hence the bonus push. + */ + and $-0x10, %rsp + push %rax + + /* Push struct sgx_enclave_exception as a param to the callback. */ + push %rax + + /* Clear RFLAGS.DF per x86_64 ABI */ + cld + + /* + * Load the callback pointer to %rax and lfence for LVI (load value + * injection) protection before making the call. + */ + mov SGX_ENCLAVE_RUN_USER_HANDLER(%rax), %rax + lfence + call *%rax + + /* Undo the post-exit %rsp adjustment. */ + lea 0x10(%rsp, %rbx), %rsp + + /* + * If the return from callback is zero or negative, return immediately, + * else re-execute ENCLU with the postive return value interpreted as + * the requested ENCLU function. + */ + cmp $0, %eax + jle .Lout + jmp .Lenter_enclave + + .cfi_endproc + +_ASM_VDSO_EXTABLE_HANDLE(.Lenclu_eenter_eresume, .Lhandle_exception) + +SYM_FUNC_END(__vdso_sgx_enter_enclave) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index dad350d42ecf..f5ef2d5b9231 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -241,6 +241,7 @@ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ #define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ #define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3B */ +#define X86_FEATURE_SGX ( 9*32+ 2) /* Software Guard Extensions */ #define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ #define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ #define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ @@ -356,6 +357,7 @@ #define X86_FEATURE_MOVDIRI (16*32+27) /* MOVDIRI instruction */ #define X86_FEATURE_MOVDIR64B (16*32+28) /* MOVDIR64B instruction */ #define X86_FEATURE_ENQCMD (16*32+29) /* ENQCMD and ENQCMDS instructions */ +#define X86_FEATURE_SGX_LC (16*32+30) /* Software Guard Extensions Launch Control */ /* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */ #define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* MCA overflow recovery support */ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 5861d34f9771..7947cb1782da 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -62,6 +62,12 @@ # define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31)) #endif +#ifdef CONFIG_X86_SGX +# define DISABLE_SGX 0 +#else +# define DISABLE_SGX (1 << (X86_FEATURE_SGX & 31)) +#endif + /* * Make sure to add features to the correct mask */ @@ -74,7 +80,7 @@ #define DISABLED_MASK6 0 #define DISABLED_MASK7 (DISABLE_PTI) #define DISABLED_MASK8 0 -#define DISABLED_MASK9 (DISABLE_SMAP) +#define DISABLED_MASK9 (DISABLE_SMAP|DISABLE_SGX) #define DISABLED_MASK10 0 #define DISABLED_MASK11 0 #define DISABLED_MASK12 0 diff --git a/arch/x86/include/asm/enclu.h b/arch/x86/include/asm/enclu.h new file mode 100644 index 000000000000..b1314e41a744 --- /dev/null +++ b/arch/x86/include/asm/enclu.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_ENCLU_H +#define _ASM_X86_ENCLU_H + +#define EENTER 0x02 +#define ERESUME 0x03 +#define EEXIT 0x04 + +#endif /* _ASM_X86_ENCLU_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b2dd2648c0e2..6a5085a5f9d8 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -610,6 +610,8 @@ #define FEAT_CTL_LOCKED BIT(0) #define FEAT_CTL_VMX_ENABLED_INSIDE_SMX BIT(1) #define FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX BIT(2) +#define FEAT_CTL_SGX_LC_ENABLED BIT(17) +#define FEAT_CTL_SGX_ENABLED BIT(18) #define FEAT_CTL_LMCE_ENABLED BIT(20) #define MSR_IA32_TSC_ADJUST 0x0000003b @@ -629,6 +631,12 @@ #define MSR_IA32_UCODE_WRITE 0x00000079 #define MSR_IA32_UCODE_REV 0x0000008b +/* Intel SGX Launch Enclave Public Key Hash MSRs */ +#define MSR_IA32_SGXLEPUBKEYHASH0 0x0000008C +#define MSR_IA32_SGXLEPUBKEYHASH1 0x0000008D +#define MSR_IA32_SGXLEPUBKEYHASH2 0x0000008E +#define MSR_IA32_SGXLEPUBKEYHASH3 0x0000008F + #define MSR_IA32_SMM_MONITOR_CTL 0x0000009b #define MSR_IA32_SMBASE 0x0000009e diff --git a/arch/x86/include/asm/trap_pf.h b/arch/x86/include/asm/trap_pf.h index 305bc1214aef..10b1de500ab1 100644 --- a/arch/x86/include/asm/trap_pf.h +++ b/arch/x86/include/asm/trap_pf.h @@ -11,6 +11,7 @@ * bit 3 == 1: use of reserved bit detected * bit 4 == 1: fault was an instruction fetch * bit 5 == 1: protection keys block access + * bit 15 == 1: SGX MMU page-fault */ enum x86_pf_error_code { X86_PF_PROT = 1 << 0, @@ -19,6 +20,7 @@ enum x86_pf_error_code { X86_PF_RSVD = 1 << 3, X86_PF_INSTR = 1 << 4, X86_PF_PK = 1 << 5, + X86_PF_SGX = 1 << 15, }; #endif /* _ASM_X86_TRAP_PF_H */ diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index bbcdc7b8f963..b5d23470f56b 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -15,6 +15,8 @@ struct vdso_image { unsigned long size; /* Always a multiple of PAGE_SIZE */ unsigned long alt, alt_len; + unsigned long extable_base, extable_len; + const void *extable; long sym_vvar_start; /* Negative offset to the vvar area */ @@ -45,6 +47,9 @@ extern void __init init_vdso_image(const struct vdso_image *image); extern int map_vdso_once(const struct vdso_image *image, unsigned long addr); +extern bool fixup_vdso_exception(struct pt_regs *regs, int trapnr, + unsigned long error_code, + unsigned long fault_addr); #endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_VDSO_H */ diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h new file mode 100644 index 000000000000..9034f3007c4e --- /dev/null +++ b/arch/x86/include/uapi/asm/sgx.h @@ -0,0 +1,168 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright(c) 2016-20 Intel Corporation. + */ +#ifndef _UAPI_ASM_X86_SGX_H +#define _UAPI_ASM_X86_SGX_H + +#include <linux/types.h> +#include <linux/ioctl.h> + +/** + * enum sgx_page_flags - page control flags + * %SGX_PAGE_MEASURE: Measure the page contents with a sequence of + * ENCLS[EEXTEND] operations. + */ +enum sgx_page_flags { + SGX_PAGE_MEASURE = 0x01, +}; + +#define SGX_MAGIC 0xA4 + +#define SGX_IOC_ENCLAVE_CREATE \ + _IOW(SGX_MAGIC, 0x00, struct sgx_enclave_create) +#define SGX_IOC_ENCLAVE_ADD_PAGES \ + _IOWR(SGX_MAGIC, 0x01, struct sgx_enclave_add_pages) +#define SGX_IOC_ENCLAVE_INIT \ + _IOW(SGX_MAGIC, 0x02, struct sgx_enclave_init) +#define SGX_IOC_ENCLAVE_PROVISION \ + _IOW(SGX_MAGIC, 0x03, struct sgx_enclave_provision) + +/** + * struct sgx_enclave_create - parameter structure for the + * %SGX_IOC_ENCLAVE_CREATE ioctl + * @src: address for the SECS page data + */ +struct sgx_enclave_create { + __u64 src; +}; + +/** + * struct sgx_enclave_add_pages - parameter structure for the + * %SGX_IOC_ENCLAVE_ADD_PAGE ioctl + * @src: start address for the page data + * @offset: starting page offset + * @length: length of the data (multiple of the page size) + * @secinfo: address for the SECINFO data + * @flags: page control flags + * @count: number of bytes added (multiple of the page size) + */ +struct sgx_enclave_add_pages { + __u64 src; + __u64 offset; + __u64 length; + __u64 secinfo; + __u64 flags; + __u64 count; +}; + +/** + * struct sgx_enclave_init - parameter structure for the + * %SGX_IOC_ENCLAVE_INIT ioctl + * @sigstruct: address for the SIGSTRUCT data + */ +struct sgx_enclave_init { + __u64 sigstruct; +}; + +/** + * struct sgx_enclave_provision - parameter structure for the + * %SGX_IOC_ENCLAVE_PROVISION ioctl + * @fd: file handle of /dev/sgx_provision + */ +struct sgx_enclave_provision { + __u64 fd; +}; + +struct sgx_enclave_run; + +/** + * typedef sgx_enclave_user_handler_t - Exit handler function accepted by + * __vdso_sgx_enter_enclave() + * @run: The run instance given by the caller + * + * The register parameters contain the snapshot of their values at enclave + * exit. An invalid ENCLU function number will cause -EINVAL to be returned + * to the caller. + * + * Return: + * - <= 0: The given value is returned back to the caller. + * - > 0: ENCLU function to invoke, either EENTER or ERESUME. + */ +typedef int (*sgx_enclave_user_handler_t)(long rdi, long rsi, long rdx, + long rsp, long r8, long r9, + struct sgx_enclave_run *run); + +/** + * struct sgx_enclave_run - the execution context of __vdso_sgx_enter_enclave() + * @tcs: TCS used to enter the enclave + * @function: The last seen ENCLU function (EENTER, ERESUME or EEXIT) + * @exception_vector: The interrupt vector of the exception + * @exception_error_code: The exception error code pulled out of the stack + * @exception_addr: The address that triggered the exception + * @user_handler: User provided callback run on exception + * @user_data: Data passed to the user handler + * @reserved Reserved for future extensions + * + * If @user_handler is provided, the handler will be invoked on all return paths + * of the normal flow. The user handler may transfer control, e.g. via a + * longjmp() call or a C++ exception, without returning to + * __vdso_sgx_enter_enclave(). + */ +struct sgx_enclave_run { + __u64 tcs; + __u32 function; + __u16 exception_vector; + __u16 exception_error_code; + __u64 exception_addr; + __u64 user_handler; + __u64 user_data; + __u8 reserved[216]; +}; + +/** + * typedef vdso_sgx_enter_enclave_t - Prototype for __vdso_sgx_enter_enclave(), + * a vDSO function to enter an SGX enclave. + * @rdi: Pass-through value for RDI + * @rsi: Pass-through value for RSI + * @rdx: Pass-through value for RDX + * @function: ENCLU function, must be EENTER or ERESUME + * @r8: Pass-through value for R8 + * @r9: Pass-through value for R9 + * @run: struct sgx_enclave_run, must be non-NULL + * + * NOTE: __vdso_sgx_enter_enclave() does not ensure full compliance with the + * x86-64 ABI, e.g. doesn't handle XSAVE state. Except for non-volatile + * general purpose registers, EFLAGS.DF, and RSP alignment, preserving/setting + * state in accordance with the x86-64 ABI is the responsibility of the enclave + * and its runtime, i.e. __vdso_sgx_enter_enclave() cannot be called from C + * code without careful consideration by both the enclave and its runtime. + * + * All general purpose registers except RAX, RBX and RCX are passed as-is to the + * enclave. RAX, RBX and RCX are consumed by EENTER and ERESUME and are loaded + * with @function, asynchronous exit pointer, and @run.tcs respectively. + * + * RBP and the stack are used to anchor __vdso_sgx_enter_enclave() to the + * pre-enclave state, e.g. to retrieve @run.exception and @run.user_handler + * after an enclave exit. All other registers are available for use by the + * enclave and its runtime, e.g. an enclave can push additional data onto the + * stack (and modify RSP) to pass information to the optional user handler (see + * below). + * + * Most exceptions reported on ENCLU, including those that occur within the + * enclave, are fixed up and reported synchronously instead of being delivered + * via a standard signal. Debug Exceptions (#DB) and Breakpoints (#BP) are + * never fixed up and are always delivered via standard signals. On synchrously + * reported exceptions, -EFAULT is returned and details about the exception are + * recorded in @run.exception, the optional sgx_enclave_exception struct. + * + * Return: + * - 0: ENCLU function was successfully executed. + * - -EINVAL: Invalid ENCL number (neither EENTER nor ERESUME). + */ +typedef int (*vdso_sgx_enter_enclave_t)(unsigned long rdi, unsigned long rsi, + unsigned long rdx, unsigned int function, + unsigned long r8, unsigned long r9, + struct sgx_enclave_run *run); + +#endif /* _UAPI_ASM_X86_SGX_H */ diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 93792b457b81..637b499450d1 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -48,6 +48,7 @@ obj-$(CONFIG_X86_MCE) += mce/ obj-$(CONFIG_MTRR) += mtrr/ obj-$(CONFIG_MICROCODE) += microcode/ obj-$(CONFIG_X86_CPU_RESCTRL) += resctrl/ +obj-$(CONFIG_X86_SGX) += sgx/ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c index 29a3bedabd06..3b1b01f2b248 100644 --- a/arch/x86/kernel/cpu/feat_ctl.c +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -93,16 +93,41 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c) } #endif /* CONFIG_X86_VMX_FEATURE_NAMES */ +static void clear_sgx_caps(void) +{ + setup_clear_cpu_cap(X86_FEATURE_SGX); + setup_clear_cpu_cap(X86_FEATURE_SGX_LC); +} + +static int __init nosgx(char *str) +{ + clear_sgx_caps(); + + return 0; +} + +early_param("nosgx", nosgx); + void init_ia32_feat_ctl(struct cpuinfo_x86 *c) { bool tboot = tboot_enabled(); + bool enable_sgx; u64 msr; if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) { clear_cpu_cap(c, X86_FEATURE_VMX); + clear_sgx_caps(); return; } + /* + * Enable SGX if and only if the kernel supports SGX and Launch Control + * is supported, i.e. disable SGX if the LE hash MSRs can't be written. + */ + enable_sgx = cpu_has(c, X86_FEATURE_SGX) && + cpu_has(c, X86_FEATURE_SGX_LC) && + IS_ENABLED(CONFIG_X86_SGX); + if (msr & FEAT_CTL_LOCKED) goto update_caps; @@ -124,13 +149,16 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c) msr |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX; } + if (enable_sgx) + msr |= FEAT_CTL_SGX_ENABLED | FEAT_CTL_SGX_LC_ENABLED; + wrmsrl(MSR_IA32_FEAT_CTL, msr); update_caps: set_cpu_cap(c, X86_FEATURE_MSR_IA32_FEAT_CTL); if (!cpu_has(c, X86_FEATURE_VMX)) - return; + goto update_sgx; if ( (tboot && !(msr & FEAT_CTL_VMX_ENABLED_INSIDE_SMX)) || (!tboot && !(msr & FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX))) { @@ -143,4 +171,12 @@ update_caps: init_vmx_capabilities(c); #endif } + +update_sgx: + if (!(msr & FEAT_CTL_SGX_ENABLED) || + !(msr & FEAT_CTL_SGX_LC_ENABLED) || !enable_sgx) { + if (enable_sgx) + pr_err_once("SGX disabled by BIOS\n"); + clear_sgx_caps(); + } } diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile new file mode 100644 index 000000000000..91d3dc784a29 --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/Makefile @@ -0,0 +1,5 @@ +obj-y += \ + driver.o \ + encl.o \ + ioctl.o \ + main.o diff --git a/arch/x86/kernel/cpu/sgx/arch.h b/arch/x86/kernel/cpu/sgx/arch.h new file mode 100644 index 000000000000..dd7602c44c72 --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/arch.h @@ -0,0 +1,338 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/** + * Copyright(c) 2016-20 Intel Corporation. + * + * Contains data structures defined by the SGX architecture. Data structures + * defined by the Linux software stack should not be placed here. + */ +#ifndef _ASM_X86_SGX_ARCH_H +#define _ASM_X86_SGX_ARCH_H + +#include <linux/bits.h> +#include <linux/types.h> + +/* The SGX specific CPUID function. */ +#define SGX_CPUID 0x12 +/* EPC enumeration. */ +#define SGX_CPUID_EPC 2 +/* An invalid EPC section, i.e. the end marker. */ +#define SGX_CPUID_EPC_INVALID 0x0 +/* A valid EPC section. */ +#define SGX_CPUID_EPC_SECTION 0x1 +/* The bitmask for the EPC section type. */ +#define SGX_CPUID_EPC_MASK GENMASK(3, 0) + +/** + * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV + * %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not + * been completed yet. + * %SGX_INVALID_EINITTOKEN: EINITTOKEN is invalid and enclave signer's + * public key does not match IA32_SGXLEPUBKEYHASH. + * %SGX_UNMASKED_EVENT: An unmasked event, e.g. INTR, was received + */ +enum sgx_return_code { + SGX_NOT_TRACKED = 11, + SGX_INVALID_EINITTOKEN = 16, + SGX_UNMASKED_EVENT = 128, +}; + +/* The modulus size for 3072-bit RSA keys. */ +#define SGX_MODULUS_SIZE 384 + +/** + * enum sgx_miscselect - additional information to an SSA frame + * %SGX_MISC_EXINFO: Report #PF or #GP to the SSA frame. + * + * Save State Area (SSA) is a stack inside the enclave used to store processor + * state when an exception or interrupt occurs. This enum defines additional + * information stored to an SSA frame. + */ +enum sgx_miscselect { + SGX_MISC_EXINFO = BIT(0), +}; + +#define SGX_MISC_RESERVED_MASK GENMASK_ULL(63, 1) + +#define SGX_SSA_GPRS_SIZE 184 +#define SGX_SSA_MISC_EXINFO_SIZE 16 + +/** + * enum sgx_attributes - the attributes field in &struct sgx_secs + * %SGX_ATTR_INIT: Enclave can be entered (is initialized). + * %SGX_ATTR_DEBUG: Allow ENCLS(EDBGRD) and ENCLS(EDBGWR). + * %SGX_ATTR_MODE64BIT: Tell that this a 64-bit enclave. + * %SGX_ATTR_PROVISIONKEY: Allow to use provisioning keys for remote + * attestation. + * %SGX_ATTR_KSS: Allow to use key separation and sharing (KSS). + * %SGX_ATTR_EINITTOKENKEY: Allow to use token signing key that is used to + * sign cryptographic tokens that can be passed to + * EINIT as an authorization to run an enclave. + */ +enum sgx_attribute { + SGX_ATTR_INIT = BIT(0), + SGX_ATTR_DEBUG = BIT(1), + SGX_ATTR_MODE64BIT = BIT(2), + SGX_ATTR_PROVISIONKEY = BIT(4), + SGX_ATTR_EINITTOKENKEY = BIT(5), + SGX_ATTR_KSS = BIT(7), +}; + +#define SGX_ATTR_RESERVED_MASK (BIT_ULL(3) | BIT_ULL(6) | GENMASK_ULL(63, 8)) + +/** + * struct sgx_secs - SGX Enclave Control Structure (SECS) + * @size: size of the address space + * @base: base address of the address space + * @ssa_frame_size: size of an SSA frame + * @miscselect: additional information stored to an SSA frame + * @attributes: attributes for enclave + * @xfrm: XSave-Feature Request Mask (subset of XCR0) + * @mrenclave: SHA256-hash of the enclave contents + * @mrsigner: SHA256-hash of the public key used to sign the SIGSTRUCT + * @config_id: a user-defined value that is used in key derivation + * @isv_prod_id: a user-defined value that is used in key derivation + * @isv_svn: a user-defined value that is used in key derivation + * @config_svn: a user-defined value that is used in key derivation + * + * SGX Enclave Control Structure (SECS) is a special enclave page that is not + * visible in the address space. In fact, this structure defines the address + * range and other global attributes for the enclave and it is the first EPC + * page created for any enclave. It is moved from a temporary buffer to an EPC + * by the means of ENCLS[ECREATE] function. + */ +struct sgx_secs { + u64 size; + u64 base; + u32 ssa_frame_size; + u32 miscselect; + u8 reserved1[24]; + u64 attributes; + u64 xfrm; + u32 mrenclave[8]; + u8 reserved2[32]; + u32 mrsigner[8]; + u8 reserved3[32]; + u32 config_id[16]; + u16 isv_prod_id; + u16 isv_svn; + u16 config_svn; + u8 reserved4[3834]; +} __packed; + +/** + * enum sgx_tcs_flags - execution flags for TCS + * %SGX_TCS_DBGOPTIN: If enabled allows single-stepping and breakpoints + * inside an enclave. It is cleared by EADD but can + * be set later with EDBGWR. + */ +enum sgx_tcs_flags { + SGX_TCS_DBGOPTIN = 0x01, +}; + +#define SGX_TCS_RESERVED_MASK GENMASK_ULL(63, 1) +#define SGX_TCS_RESERVED_SIZE 4024 + +/** + * struct sgx_tcs - Thread Control Structure (TCS) + * @state: used to mark an entered TCS + * @flags: execution flags (cleared by EADD) + * @ssa_offset: SSA stack offset relative to the enclave base + * @ssa_index: the current SSA frame index (cleard by EADD) + * @nr_ssa_frames: the number of frame in the SSA stack + * @entry_offset: entry point offset relative to the enclave base + * @exit_addr: address outside the enclave to exit on an exception or + * interrupt + * @fs_offset: offset relative to the enclave base to become FS + * segment inside the enclave + * @gs_offset: offset relative to the enclave base to become GS + * segment inside the enclave + * @fs_limit: size to become a new FS-limit (only 32-bit enclaves) + * @gs_limit: size to become a new GS-limit (only 32-bit enclaves) + * + * Thread Control Structure (TCS) is an enclave page visible in its address + * space that defines an entry point inside the enclave. A thread enters inside + * an enclave by supplying address of TCS to ENCLU(EENTER). A TCS can be entered + * by only one thread at a time. + */ +struct sgx_tcs { + u64 state; + u64 flags; + u64 ssa_offset; + u32 ssa_index; + u32 nr_ssa_frames; + u64 entry_offset; + u64 exit_addr; + u64 fs_offset; + u64 gs_offset; + u32 fs_limit; + u32 gs_limit; + u8 reserved[SGX_TCS_RESERVED_SIZE]; +} __packed; + +/** + * struct sgx_pageinfo - an enclave page descriptor + * @addr: address of the enclave page + * @contents: pointer to the page contents + * @metadata: pointer either to a SECINFO or PCMD instance + * @secs: address of the SECS page + */ +struct sgx_pageinfo { + u64 addr; + u64 contents; + u64 metadata; + u64 secs; +} __packed __aligned(32); + + +/** + * enum sgx_page_type - bits in the SECINFO flags defining the page type + * %SGX_PAGE_TYPE_SECS: a SECS page + * %SGX_PAGE_TYPE_TCS: a TCS page + * %SGX_PAGE_TYPE_REG: a regular page + * %SGX_PAGE_TYPE_VA: a VA page + * %SGX_PAGE_TYPE_TRIM: a page in trimmed state + */ +enum sgx_page_type { + SGX_PAGE_TYPE_SECS, + SGX_PAGE_TYPE_TCS, + SGX_PAGE_TYPE_REG, + SGX_PAGE_TYPE_VA, + SGX_PAGE_TYPE_TRIM, +}; + +#define SGX_NR_PAGE_TYPES 5 +#define SGX_PAGE_TYPE_MASK GENMASK(7, 0) + +/** + * enum sgx_secinfo_flags - the flags field in &struct sgx_secinfo + * %SGX_SECINFO_R: allow read + * %SGX_SECINFO_W: allow write + * %SGX_SECINFO_X: allow execution + * %SGX_SECINFO_SECS: a SECS page + * %SGX_SECINFO_TCS: a TCS page + * %SGX_SECINFO_REG: a regular page + * %SGX_SECINFO_VA: a VA page + * %SGX_SECINFO_TRIM: a page in trimmed state + */ +enum sgx_secinfo_flags { + SGX_SECINFO_R = BIT(0), + SGX_SECINFO_W = BIT(1), + SGX_SECINFO_X = BIT(2), + SGX_SECINFO_SECS = (SGX_PAGE_TYPE_SECS << 8), + SGX_SECINFO_TCS = (SGX_PAGE_TYPE_TCS << 8), + SGX_SECINFO_REG = (SGX_PAGE_TYPE_REG << 8), + SGX_SECINFO_VA = (SGX_PAGE_TYPE_VA << 8), + SGX_SECINFO_TRIM = (SGX_PAGE_TYPE_TRIM << 8), +}; + +#define SGX_SECINFO_PERMISSION_MASK GENMASK_ULL(2, 0) +#define SGX_SECINFO_PAGE_TYPE_MASK (SGX_PAGE_TYPE_MASK << 8) +#define SGX_SECINFO_RESERVED_MASK ~(SGX_SECINFO_PERMISSION_MASK | \ + SGX_SECINFO_PAGE_TYPE_MASK) + +/** + * struct sgx_secinfo - describes attributes of an EPC page + * @flags: permissions and type + * + * Used together with ENCLS leaves that add or modify an EPC page to an + * enclave to define page permissions and type. + */ +struct sgx_secinfo { + u64 flags; + u8 reserved[56]; +} __packed __aligned(64); + +#define SGX_PCMD_RESERVED_SIZE 40 + +/** + * struct sgx_pcmd - Paging Crypto Metadata (PCMD) + * @enclave_id: enclave identifier + * @mac: MAC over PCMD, page contents and isvsvn + * + * PCMD is stored for every swapped page to the regular memory. When ELDU loads + * the page back it recalculates the MAC by using a isvsvn number stored in a + * VA page. Together these two structures bring integrity and rollback + * protection. + */ +struct sgx_pcmd { + struct sgx_secinfo secinfo; + u64 enclave_id; + u8 reserved[SGX_PCMD_RESERVED_SIZE]; + u8 mac[16]; +} __packed __aligned(128); + +#define SGX_SIGSTRUCT_RESERVED1_SIZE 84 +#define SGX_SIGSTRUCT_RESERVED2_SIZE 20 +#define SGX_SIGSTRUCT_RESERVED3_SIZE 32 +#define SGX_SIGSTRUCT_RESERVED4_SIZE 12 + +/** + * struct sgx_sigstruct_header - defines author of the enclave + * @header1: constant byte string + * @vendor: must be either 0x0000 or 0x8086 + * @date: YYYYMMDD in BCD + * @header2: costant byte string + * @swdefined: software defined value + */ +struct sgx_sigstruct_header { + u64 header1[2]; + u32 vendor; + u32 date; + u64 header2[2]; + u32 swdefined; + u8 reserved1[84]; +} __packed; + +/** + * struct sgx_sigstruct_body - defines contents of the enclave + * @miscselect: additional information stored to an SSA frame + * @misc_mask: required miscselect in SECS + * @attributes: attributes for enclave + * @xfrm: XSave-Feature Request Mask (subset of XCR0) + * @attributes_mask: required attributes in SECS + * @xfrm_mask: required XFRM in SECS + * @mrenclave: SHA256-hash of the enclave contents + * @isvprodid: a user-defined value that is used in key derivation + * @isvsvn: a user-defined value that is used in key derivation + */ +struct sgx_sigstruct_body { + u32 miscselect; + u32 misc_mask; + u8 reserved2[20]; + u64 attributes; + u64 xfrm; + u64 attributes_mask; + u64 xfrm_mask; + u8 mrenclave[32]; + u8 reserved3[32]; + u16 isvprodid; + u16 isvsvn; +} __packed; + +/** + * struct sgx_sigstruct - an enclave signature + * @header: defines author of the enclave + * @modulus: the modulus of the public key + * @exponent: the exponent of the public key + * @signature: the signature calculated over the fields except modulus, + * @body: defines contents of the enclave + * @q1: a value used in RSA signature verification + * @q2: a value used in RSA signature verification + * + * Header and body are the parts that are actual signed. The remaining fields + * define the signature of the enclave. + */ +struct sgx_sigstruct { + struct sgx_sigstruct_header header; + u8 modulus[SGX_MODULUS_SIZE]; + u32 exponent; + u8 signature[SGX_MODULUS_SIZE]; + struct sgx_sigstruct_body body; + u8 reserved4[12]; + u8 q1[SGX_MODULUS_SIZE]; + u8 q2[SGX_MODULUS_SIZE]; +} __packed; + +#define SGX_LAUNCH_TOKEN_SIZE 304 + +#endif /* _ASM_X86_SGX_ARCH_H */ diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c new file mode 100644 index 000000000000..f2eac41bb4ff --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/driver.c @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016-20 Intel Corporation. */ + +#include <linux/acpi.h> +#include <linux/miscdevice.h> +#include <linux/mman.h> +#include <linux/security.h> +#include <linux/suspend.h> +#include <asm/traps.h> +#include "driver.h" +#include "encl.h" + +u64 sgx_attributes_reserved_mask; +u64 sgx_xfrm_reserved_mask = ~0x3; +u32 sgx_misc_reserved_mask; + +static int sgx_open(struct inode *inode, struct file *file) +{ + struct sgx_encl *encl; + int ret; + + encl = kzalloc(sizeof(*encl), GFP_KERNEL); + if (!encl) + return -ENOMEM; + + kref_init(&encl->refcount); + xa_init(&encl->page_array); + mutex_init(&encl->lock); + INIT_LIST_HEAD(&encl->va_pages); + INIT_LIST_HEAD(&encl->mm_list); + spin_lock_init(&encl->mm_lock); + + ret = init_srcu_struct(&encl->srcu); + if (ret) { + kfree(encl); + return ret; + } + + file->private_data = encl; + + return 0; +} + +static int sgx_release(struct inode *inode, struct file *file) +{ + struct sgx_encl *encl = file->private_data; + struct sgx_encl_mm *encl_mm; + + /* + * Drain the remaining mm_list entries. At this point the list contains + * entries for processes, which have closed the enclave file but have + * not exited yet. The processes, which have exited, are gone from the + * list by sgx_mmu_notifier_release(). + */ + for ( ; ; ) { + spin_lock(&encl->mm_lock); + + if (list_empty(&encl->mm_list)) { + encl_mm = NULL; + } else { + encl_mm = list_first_entry(&encl->mm_list, + struct sgx_encl_mm, list); + list_del_rcu(&encl_mm->list); + } + + spin_unlock(&encl->mm_lock); + + /* The enclave is no longer mapped by any mm. */ + if (!encl_mm) + break; + + synchronize_srcu(&encl->srcu); + mmu_notifier_unregister(&encl_mm->mmu_notifier, encl_mm->mm); + kfree(encl_mm); + } + + kref_put(&encl->refcount, sgx_encl_release); + return 0; +} + +static int sgx_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct sgx_encl *encl = file->private_data; + int ret; + + ret = sgx_encl_may_map(encl, vma->vm_start, vma->vm_end, vma->vm_flags); + if (ret) + return ret; + + ret = sgx_encl_mm_add(encl, vma->vm_mm); + if (ret) + return ret; + + vma->vm_ops = &sgx_vm_ops; + vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO; + vma->vm_private_data = encl; + + return 0; +} + +static unsigned long sgx_get_unmapped_area(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long pgoff, + unsigned long flags) +{ + if ((flags & MAP_TYPE) == MAP_PRIVATE) + return -EINVAL; + + if (flags & MAP_FIXED) + return addr; + + return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); +} + +#ifdef CONFIG_COMPAT +static long sgx_compat_ioctl(struct file *filep, unsigned int cmd, + unsigned long arg) +{ + return sgx_ioctl(filep, cmd, arg); +} +#endif + +static const struct file_operations sgx_encl_fops = { + .owner = THIS_MODULE, + .open = sgx_open, + .release = sgx_release, + .unlocked_ioctl = sgx_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = sgx_compat_ioctl, +#endif + .mmap = sgx_mmap, + .get_unmapped_area = sgx_get_unmapped_area, +}; + +const struct file_operations sgx_provision_fops = { + .owner = THIS_MODULE, +}; + +static struct miscdevice sgx_dev_enclave = { + .minor = MISC_DYNAMIC_MINOR, + .name = "sgx_enclave", + .nodename = "sgx_enclave", + .fops = &sgx_encl_fops, +}; + +static struct miscdevice sgx_dev_provision = { + .minor = MISC_DYNAMIC_MINOR, + .name = "sgx_provision", + .nodename = "sgx_provision", + .fops = &sgx_provision_fops, +}; + +int __init sgx_drv_init(void) +{ + unsigned int eax, ebx, ecx, edx; + u64 attr_mask; + u64 xfrm_mask; + int ret; + + if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) + return -ENODEV; + + cpuid_count(SGX_CPUID, 0, &eax, &ebx, &ecx, &edx); + + if (!(eax & 1)) { + pr_err("SGX disabled: SGX1 instruction support not available.\n"); + return -ENODEV; + } + + sgx_misc_reserved_mask = ~ebx | SGX_MISC_RESERVED_MASK; + + cpuid_count(SGX_CPUID, 1, &eax, &ebx, &ecx, &edx); + + attr_mask = (((u64)ebx) << 32) + (u64)eax; + sgx_attributes_reserved_mask = ~attr_mask | SGX_ATTR_RESERVED_MASK; + + if (cpu_feature_enabled(X86_FEATURE_OSXSAVE)) { + xfrm_mask = (((u64)edx) << 32) + (u64)ecx; + sgx_xfrm_reserved_mask = ~xfrm_mask; + } + + ret = misc_register(&sgx_dev_enclave); + if (ret) + return ret; + + ret = misc_register(&sgx_dev_provision); + if (ret) { + misc_deregister(&sgx_dev_enclave); + return ret; + } + + return 0; +} diff --git a/arch/x86/kernel/cpu/sgx/driver.h b/arch/x86/kernel/cpu/sgx/driver.h new file mode 100644 index 000000000000..4eddb4d571ef --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/driver.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ARCH_SGX_DRIVER_H__ +#define __ARCH_SGX_DRIVER_H__ + +#include <crypto/hash.h> +#include <linux/kref.h> +#include <linux/mmu_notifier.h> +#include <linux/radix-tree.h> +#include <linux/rwsem.h> +#include <linux/sched.h> +#include <linux/workqueue.h> +#include <uapi/asm/sgx.h> +#include "sgx.h" + +#define SGX_EINIT_SPIN_COUNT 20 +#define SGX_EINIT_SLEEP_COUNT 50 +#define SGX_EINIT_SLEEP_TIME 20 + +extern u64 sgx_attributes_reserved_mask; +extern u64 sgx_xfrm_reserved_mask; +extern u32 sgx_misc_reserved_mask; + +extern const struct file_operations sgx_provision_fops; + +long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); + +int sgx_drv_init(void); + +#endif /* __ARCH_X86_SGX_DRIVER_H__ */ diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c new file mode 100644 index 000000000000..ee50a5010277 --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -0,0 +1,740 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016-20 Intel Corporation. */ + +#include <linux/lockdep.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/shmem_fs.h> +#include <linux/suspend.h> +#include <linux/sched/mm.h> +#include "arch.h" +#include "encl.h" +#include "encls.h" +#include "sgx.h" + +/* + * ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC + * Pages" in the SDM. + */ +static int __sgx_encl_eldu(struct sgx_encl_page *encl_page, + struct sgx_epc_page *epc_page, + struct sgx_epc_page *secs_page) +{ + unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK; + struct sgx_encl *encl = encl_page->encl; + struct sgx_pageinfo pginfo; + struct sgx_backing b; + pgoff_t page_index; + int ret; + + if (secs_page) + page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base); + else + page_index = PFN_DOWN(encl->size); + + ret = sgx_encl_get_backing(encl, page_index, &b); + if (ret) + return ret; + + pginfo.addr = encl_page->desc & PAGE_MASK; + pginfo.contents = (unsigned long)kmap_atomic(b.contents); + pginfo.metadata = (unsigned long)kmap_atomic(b.pcmd) + + b.pcmd_offset; + + if (secs_page) + pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page); + else + pginfo.secs = 0; + + ret = __eldu(&pginfo, sgx_get_epc_virt_addr(epc_page), + sgx_get_epc_virt_addr(encl_page->va_page->epc_page) + va_offset); + if (ret) { + if (encls_failed(ret)) + ENCLS_WARN(ret, "ELDU"); + + ret = -EFAULT; + } + + kunmap_atomic((void *)(unsigned long)(pginfo.metadata - b.pcmd_offset)); + kunmap_atomic((void *)(unsigned long)pginfo.contents); + + sgx_encl_put_backing(&b, false); + + return ret; +} + +static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page, + struct sgx_epc_page *secs_page) +{ + + unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK; + struct sgx_encl *encl = encl_page->encl; + struct sgx_epc_page *epc_page; + int ret; + + epc_page = sgx_alloc_epc_page(encl_page, false); + if (IS_ERR(epc_page)) + return epc_page; + + ret = __sgx_encl_eldu(encl_page, epc_page, secs_page); + if (ret) { + sgx_free_epc_page(epc_page); + return ERR_PTR(ret); + } + + sgx_free_va_slot(encl_page->va_page, va_offset); + list_move(&encl_page->va_page->list, &encl->va_pages); + encl_page->desc &= ~SGX_ENCL_PAGE_VA_OFFSET_MASK; + encl_page->epc_page = epc_page; + + return epc_page; +} + +static struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl, + unsigned long addr, + unsigned long vm_flags) +{ + unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC); + struct sgx_epc_page *epc_page; + struct sgx_encl_page *entry; + + entry = xa_load(&encl->page_array, PFN_DOWN(addr)); + if (!entry) + return ERR_PTR(-EFAULT); + + /* + * Verify that the faulted page has equal or higher build time + * permissions than the VMA permissions (i.e. the subset of {VM_READ, + * VM_WRITE, VM_EXECUTE} in vma->vm_flags). + */ + if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits) + return ERR_PTR(-EFAULT); + + /* Entry successfully located. */ + if (entry->epc_page) { + if (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED) + return ERR_PTR(-EBUSY); + + return entry; + } + + if (!(encl->secs.epc_page)) { + epc_page = sgx_encl_eldu(&encl->secs, NULL); + if (IS_ERR(epc_page)) + return ERR_CAST(epc_page); + } + + epc_page = sgx_encl_eldu(entry, encl->secs.epc_page); + if (IS_ERR(epc_page)) + return ERR_CAST(epc_page); + + encl->secs_child_cnt++; + sgx_mark_page_reclaimable(entry->epc_page); + + return entry; +} + +static vm_fault_t sgx_vma_fault(struct vm_fault *vmf) +{ + unsigned long addr = (unsigned long)vmf->address; + struct vm_area_struct *vma = vmf->vma; + struct sgx_encl_page *entry; + unsigned long phys_addr; + struct sgx_encl *encl; + unsigned long pfn; + vm_fault_t ret; + + encl = vma->vm_private_data; + + /* + * It's very unlikely but possible that allocating memory for the + * mm_list entry of a forked process failed in sgx_vma_open(). When + * this happens, vm_private_data is set to NULL. + */ + if (unlikely(!encl)) + return VM_FAULT_SIGBUS; + + mutex_lock(&encl->lock); + + entry = sgx_encl_load_page(encl, addr, vma->vm_flags); + if (IS_ERR(entry)) { + mutex_unlock(&encl->lock); + + if (PTR_ERR(entry) == -EBUSY) + return VM_FAULT_NOPAGE; + + return VM_FAULT_SIGBUS; + } + + phys_addr = sgx_get_epc_phys_addr(entry->epc_page); + + /* Check if another thread got here first to insert the PTE. */ + if (!follow_pfn(vma, addr, &pfn)) { + mutex_unlock(&encl->lock); + + return VM_FAULT_NOPAGE; + } + + ret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr)); + if (ret != VM_FAULT_NOPAGE) { + mutex_unlock(&encl->lock); + + return VM_FAULT_SIGBUS; + } + + sgx_encl_test_and_clear_young(vma->vm_mm, entry); + mutex_unlock(&encl->lock); + + return VM_FAULT_NOPAGE; +} + +static void sgx_vma_open(struct vm_area_struct *vma) +{ + struct sgx_encl *encl = vma->vm_private_data; + + /* + * It's possible but unlikely that vm_private_data is NULL. This can + * happen in a grandchild of a process, when sgx_encl_mm_add() had + * failed to allocate memory in this callback. + */ + if (unlikely(!encl)) + return; + + if (sgx_encl_mm_add(encl, vma->vm_mm)) + vma->vm_private_data = NULL; +} + + +/** + * sgx_encl_may_map() - Check if a requested VMA mapping is allowed + * @encl: an enclave pointer + * @start: lower bound of the address range, inclusive + * @end: upper bound of the address range, exclusive + * @vm_flags: VMA flags + * + * Iterate through the enclave pages contained within [@start, @end) to verify + * that the permissions requested by a subset of {VM_READ, VM_WRITE, VM_EXEC} + * do not contain any permissions that are not contained in the build time + * permissions of any of the enclave pages within the given address range. + * + * An enclave creator must declare the strongest permissions that will be + * needed for each enclave page. This ensures that mappings have the identical + * or weaker permissions than the earlier declared permissions. + * + * Return: 0 on success, -EACCES otherwise + */ +int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, + unsigned long end, unsigned long vm_flags) +{ + unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC); + struct sgx_encl_page *page; + unsigned long count = 0; + int ret = 0; + + XA_STATE(xas, &encl->page_array, PFN_DOWN(start)); + + /* + * Disallow READ_IMPLIES_EXEC tasks as their VMA permissions might + * conflict with the enclave page permissions. + */ + if (current->personality & READ_IMPLIES_EXEC) + return -EACCES; + + mutex_lock(&encl->lock); + xas_lock(&xas); + xas_for_each(&xas, page, PFN_DOWN(end - 1)) { + if (~page->vm_max_prot_bits & vm_prot_bits) { + ret = -EACCES; + break; + } + + /* Reschedule on every XA_CHECK_SCHED iteration. */ + if (!(++count % XA_CHECK_SCHED)) { + xas_pause(&xas); + xas_unlock(&xas); + mutex_unlock(&encl->lock); + + cond_resched(); + + mutex_lock(&encl->lock); + xas_lock(&xas); + } + } + xas_unlock(&xas); + mutex_unlock(&encl->lock); + + return ret; +} + +static int sgx_vma_mprotect(struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned long newflags) +{ + return sgx_encl_may_map(vma->vm_private_data, start, end, newflags); +} + +static int sgx_encl_debug_read(struct sgx_encl *encl, struct sgx_encl_page *page, + unsigned long addr, void *data) +{ + unsigned long offset = addr & ~PAGE_MASK; + int ret; + + + ret = __edbgrd(sgx_get_epc_virt_addr(page->epc_page) + offset, data); + if (ret) + return -EIO; + + return 0; +} + +static int sgx_encl_debug_write(struct sgx_encl *encl, struct sgx_encl_page *page, + unsigned long addr, void *data) +{ + unsigned long offset = addr & ~PAGE_MASK; + int ret; + + ret = __edbgwr(sgx_get_epc_virt_addr(page->epc_page) + offset, data); + if (ret) + return -EIO; + + return 0; +} + +/* + * Load an enclave page to EPC if required, and take encl->lock. + */ +static struct sgx_encl_page *sgx_encl_reserve_page(struct sgx_encl *encl, + unsigned long addr, + unsigned long vm_flags) +{ + struct sgx_encl_page *entry; + + for ( ; ; ) { + mutex_lock(&encl->lock); + + entry = sgx_encl_load_page(encl, addr, vm_flags); + if (PTR_ERR(entry) != -EBUSY) + break; + + mutex_unlock(&encl->lock); + } + + if (IS_ERR(entry)) + mutex_unlock(&encl->lock); + + return entry; +} + +static int sgx_vma_access(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write) +{ + struct sgx_encl *encl = vma->vm_private_data; + struct sgx_encl_page *entry = NULL; + char data[sizeof(unsigned long)]; + unsigned long align; + int offset; + int cnt; + int ret = 0; + int i; + + /* + * If process was forked, VMA is still there but vm_private_data is set + * to NULL. + */ + if (!encl) + return -EFAULT; + + if (!test_bit(SGX_ENCL_DEBUG, &encl->flags)) + return -EFAULT; + + for (i = 0; i < len; i += cnt) { + entry = sgx_encl_reserve_page(encl, (addr + i) & PAGE_MASK, + vma->vm_flags); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry); + break; + } + + align = ALIGN_DOWN(addr + i, sizeof(unsigned long)); + offset = (addr + i) & (sizeof(unsigned long) - 1); + cnt = sizeof(unsigned long) - offset; + cnt = min(cnt, len - i); + + ret = sgx_encl_debug_read(encl, entry, align, data); + if (ret) + goto out; + + if (write) { + memcpy(data + offset, buf + i, cnt); + ret = sgx_encl_debug_write(encl, entry, align, data); + if (ret) + goto out; + } else { + memcpy(buf + i, data + offset, cnt); + } + +out: + mutex_unlock(&encl->lock); + + if (ret) + break; + } + + return ret < 0 ? ret : i; +} + +const struct vm_operations_struct sgx_vm_ops = { + .fault = sgx_vma_fault, + .mprotect = sgx_vma_mprotect, + .open = sgx_vma_open, + .access = sgx_vma_access, +}; + +/** + * sgx_encl_release - Destroy an enclave instance + * @kref: address of a kref inside &sgx_encl + * + * Used together with kref_put(). Frees all the resources associated with the + * enclave and the instance itself. + */ +void sgx_encl_release(struct kref *ref) +{ + struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount); + struct sgx_va_page *va_page; + struct sgx_encl_page *entry; + unsigned long index; + + xa_for_each(&encl->page_array, index, entry) { + if (entry->epc_page) { + /* + * The page and its radix tree entry cannot be freed + * if the page is being held by the reclaimer. + */ + if (sgx_unmark_page_reclaimable(entry->epc_page)) + continue; + + sgx_free_epc_page(entry->epc_page); + encl->secs_child_cnt--; + entry->epc_page = NULL; + } + + kfree(entry); + } + + xa_destroy(&encl->page_array); + + if (!encl->secs_child_cnt && encl->secs.epc_page) { + sgx_free_epc_page(encl->secs.epc_page); + encl->secs.epc_page = NULL; + } + + while (!list_empty(&encl->va_pages)) { + va_page = list_first_entry(&encl->va_pages, struct sgx_va_page, + list); + list_del(&va_page->list); + sgx_free_epc_page(va_page->epc_page); + kfree(va_page); + } + + if (encl->backing) + fput(encl->backing); + + cleanup_srcu_struct(&encl->srcu); + + WARN_ON_ONCE(!list_empty(&encl->mm_list)); + + /* Detect EPC page leak's. */ + WARN_ON_ONCE(encl->secs_child_cnt); + WARN_ON_ONCE(encl->secs.epc_page); + + kfree(encl); +} + +/* + * 'mm' is exiting and no longer needs mmu notifications. + */ +static void sgx_mmu_notifier_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier); + struct sgx_encl_mm *tmp = NULL; + + /* + * The enclave itself can remove encl_mm. Note, objects can't be moved + * off an RCU protected list, but deletion is ok. + */ + spin_lock(&encl_mm->encl->mm_lock); + list_for_each_entry(tmp, &encl_mm->encl->mm_list, list) { + if (tmp == encl_mm) { + list_del_rcu(&encl_mm->list); + break; + } + } + spin_unlock(&encl_mm->encl->mm_lock); + + if (tmp == encl_mm) { + synchronize_srcu(&encl_mm->encl->srcu); + mmu_notifier_put(mn); + } +} + +static void sgx_mmu_notifier_free(struct mmu_notifier *mn) +{ + struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier); + + kfree(encl_mm); +} + +static const struct mmu_notifier_ops sgx_mmu_notifier_ops = { + .release = sgx_mmu_notifier_release, + .free_notifier = sgx_mmu_notifier_free, +}; + +static struct sgx_encl_mm *sgx_encl_find_mm(struct sgx_encl *encl, + struct mm_struct *mm) +{ + struct sgx_encl_mm *encl_mm = NULL; + struct sgx_encl_mm *tmp; + int idx; + + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(tmp, &encl->mm_list, list) { + if (tmp->mm == mm) { + encl_mm = tmp; + break; + } + } + + srcu_read_unlock(&encl->srcu, idx); + + return encl_mm; +} + +int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm) +{ + struct sgx_encl_mm *encl_mm; + int ret; + + /* + * Even though a single enclave may be mapped into an mm more than once, + * each 'mm' only appears once on encl->mm_list. This is guaranteed by + * holding the mm's mmap lock for write before an mm can be added or + * remove to an encl->mm_list. + */ + mmap_assert_write_locked(mm); + + /* + * It's possible that an entry already exists in the mm_list, because it + * is removed only on VFS release or process exit. + */ + if (sgx_encl_find_mm(encl, mm)) + return 0; + + encl_mm = kzalloc(sizeof(*encl_mm), GFP_KERNEL); + if (!encl_mm) + return -ENOMEM; + + encl_mm->encl = encl; + encl_mm->mm = mm; + encl_mm->mmu_notifier.ops = &sgx_mmu_notifier_ops; + + ret = __mmu_notifier_register(&encl_mm->mmu_notifier, mm); + if (ret) { + kfree(encl_mm); + return ret; + } + + spin_lock(&encl->mm_lock); + list_add_rcu(&encl_mm->list, &encl->mm_list); + /* Pairs with smp_rmb() in sgx_reclaimer_block(). */ + smp_wmb(); + encl->mm_list_version++; + spin_unlock(&encl->mm_lock); + + return 0; +} + +static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl, + pgoff_t index) +{ + struct inode *inode = encl->backing->f_path.dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + gfp_t gfpmask = mapping_gfp_mask(mapping); + + return shmem_read_mapping_page_gfp(mapping, index, gfpmask); +} + +/** + * sgx_encl_get_backing() - Pin the backing storage + * @encl: an enclave pointer + * @page_index: enclave page index + * @backing: data for accessing backing storage for the page + * + * Pin the backing storage pages for storing the encrypted contents and Paging + * Crypto MetaData (PCMD) of an enclave page. + * + * Return: + * 0 on success, + * -errno otherwise. + */ +int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing) +{ + pgoff_t pcmd_index = PFN_DOWN(encl->size) + 1 + (page_index >> 5); + struct page *contents; + struct page *pcmd; + + contents = sgx_encl_get_backing_page(encl, page_index); + if (IS_ERR(contents)) + return PTR_ERR(contents); + + pcmd = sgx_encl_get_backing_page(encl, pcmd_index); + if (IS_ERR(pcmd)) { + put_page(contents); + return PTR_ERR(pcmd); + } + + backing->page_index = page_index; + backing->contents = contents; + backing->pcmd = pcmd; + backing->pcmd_offset = + (page_index & (PAGE_SIZE / sizeof(struct sgx_pcmd) - 1)) * + sizeof(struct sgx_pcmd); + + return 0; +} + +/** + * sgx_encl_put_backing() - Unpin the backing storage + * @backing: data for accessing backing storage for the page + * @do_write: mark pages dirty + */ +void sgx_encl_put_backing(struct sgx_backing *backing, bool do_write) +{ + if (do_write) { + set_page_dirty(backing->pcmd); + set_page_dirty(backing->contents); + } + + put_page(backing->pcmd); + put_page(backing->contents); +} + +static int sgx_encl_test_and_clear_young_cb(pte_t *ptep, unsigned long addr, + void *data) +{ + pte_t pte; + int ret; + + ret = pte_young(*ptep); + if (ret) { + pte = pte_mkold(*ptep); + set_pte_at((struct mm_struct *)data, addr, ptep, pte); + } + + return ret; +} + +/** + * sgx_encl_test_and_clear_young() - Test and reset the accessed bit + * @mm: mm_struct that is checked + * @page: enclave page to be tested for recent access + * + * Checks the Access (A) bit from the PTE corresponding to the enclave page and + * clears it. + * + * Return: 1 if the page has been recently accessed and 0 if not. + */ +int sgx_encl_test_and_clear_young(struct mm_struct *mm, + struct sgx_encl_page *page) +{ + unsigned long addr = page->desc & PAGE_MASK; + struct sgx_encl *encl = page->encl; + struct vm_area_struct *vma; + int ret; + + ret = sgx_encl_find(mm, addr, &vma); + if (ret) + return 0; + + if (encl != vma->vm_private_data) + return 0; + + ret = apply_to_page_range(vma->vm_mm, addr, PAGE_SIZE, + sgx_encl_test_and_clear_young_cb, vma->vm_mm); + if (ret < 0) + return 0; + + return ret; +} + +/** + * sgx_alloc_va_page() - Allocate a Version Array (VA) page + * + * Allocate a free EPC page and convert it to a Version Array (VA) page. + * + * Return: + * a VA page, + * -errno otherwise + */ +struct sgx_epc_page *sgx_alloc_va_page(void) +{ + struct sgx_epc_page *epc_page; + int ret; + + epc_page = sgx_alloc_epc_page(NULL, true); + if (IS_ERR(epc_page)) + return ERR_CAST(epc_page); + + ret = __epa(sgx_get_epc_virt_addr(epc_page)); + if (ret) { + WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret); + sgx_free_epc_page(epc_page); + return ERR_PTR(-EFAULT); + } + + return epc_page; +} + +/** + * sgx_alloc_va_slot - allocate a VA slot + * @va_page: a &struct sgx_va_page instance + * + * Allocates a slot from a &struct sgx_va_page instance. + * + * Return: offset of the slot inside the VA page + */ +unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page) +{ + int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT); + + if (slot < SGX_VA_SLOT_COUNT) + set_bit(slot, va_page->slots); + + return slot << 3; +} + +/** + * sgx_free_va_slot - free a VA slot + * @va_page: a &struct sgx_va_page instance + * @offset: offset of the slot inside the VA page + * + * Frees a slot from a &struct sgx_va_page instance. + */ +void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset) +{ + clear_bit(offset >> 3, va_page->slots); +} + +/** + * sgx_va_page_full - is the VA page full? + * @va_page: a &struct sgx_va_page instance + * + * Return: true if all slots have been taken + */ +bool sgx_va_page_full(struct sgx_va_page *va_page) +{ + int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT); + + return slot == SGX_VA_SLOT_COUNT; +} diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h new file mode 100644 index 000000000000..d8d30ccbef4c --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/** + * Copyright(c) 2016-20 Intel Corporation. + * + * Contains the software defined data structures for enclaves. + */ +#ifndef _X86_ENCL_H +#define _X86_ENCL_H + +#include <linux/cpumask.h> +#include <linux/kref.h> +#include <linux/list.h> +#include <linux/mm_types.h> +#include <linux/mmu_notifier.h> +#include <linux/mutex.h> +#include <linux/notifier.h> +#include <linux/srcu.h> +#include <linux/workqueue.h> +#include <linux/xarray.h> +#include "sgx.h" + +/* 'desc' bits holding the offset in the VA (version array) page. */ +#define SGX_ENCL_PAGE_VA_OFFSET_MASK GENMASK_ULL(11, 3) + +/* 'desc' bit marking that the page is being reclaimed. */ +#define SGX_ENCL_PAGE_BEING_RECLAIMED BIT(3) + +struct sgx_encl_page { + unsigned long desc; + unsigned long vm_max_prot_bits; + struct sgx_epc_page *epc_page; + struct sgx_encl *encl; + struct sgx_va_page *va_page; +}; + +enum sgx_encl_flags { + SGX_ENCL_IOCTL = BIT(0), + SGX_ENCL_DEBUG = BIT(1), + SGX_ENCL_CREATED = BIT(2), + SGX_ENCL_INITIALIZED = BIT(3), +}; + +struct sgx_encl_mm { + struct sgx_encl *encl; + struct mm_struct *mm; + struct list_head list; + struct mmu_notifier mmu_notifier; +}; + +struct sgx_encl { + unsigned long base; + unsigned long size; + unsigned long flags; + unsigned int page_cnt; + unsigned int secs_child_cnt; + struct mutex lock; + struct xarray page_array; + struct sgx_encl_page secs; + unsigned long attributes; + unsigned long attributes_mask; + + cpumask_t cpumask; + struct file *backing; + struct kref refcount; + struct list_head va_pages; + unsigned long mm_list_version; + struct list_head mm_list; + spinlock_t mm_lock; + struct srcu_struct srcu; +}; + +#define SGX_VA_SLOT_COUNT 512 + +struct sgx_va_page { + struct sgx_epc_page *epc_page; + DECLARE_BITMAP(slots, SGX_VA_SLOT_COUNT); + struct list_head list; +}; + +struct sgx_backing { + pgoff_t page_index; + struct page *contents; + struct page *pcmd; + unsigned long pcmd_offset; +}; + +extern const struct vm_operations_struct sgx_vm_ops; + +static inline int sgx_encl_find(struct mm_struct *mm, unsigned long addr, + struct vm_area_struct **vma) +{ + struct vm_area_struct *result; + + result = find_vma(mm, addr); + if (!result || result->vm_ops != &sgx_vm_ops || addr < result->vm_start) + return -EINVAL; + + *vma = result; + + return 0; +} + +int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, + unsigned long end, unsigned long vm_flags); + +void sgx_encl_release(struct kref *ref); +int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm); +int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, + struct sgx_backing *backing); +void sgx_encl_put_backing(struct sgx_backing *backing, bool do_write); +int sgx_encl_test_and_clear_young(struct mm_struct *mm, + struct sgx_encl_page *page); + +struct sgx_epc_page *sgx_alloc_va_page(void); +unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page); +void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset); +bool sgx_va_page_full(struct sgx_va_page *va_page); + +#endif /* _X86_ENCL_H */ diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h new file mode 100644 index 000000000000..443188fe7e70 --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/encls.h @@ -0,0 +1,231 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _X86_ENCLS_H +#define _X86_ENCLS_H + +#include <linux/bitops.h> +#include <linux/err.h> +#include <linux/io.h> +#include <linux/rwsem.h> +#include <linux/types.h> +#include <asm/asm.h> +#include <asm/traps.h> +#include "sgx.h" + +enum sgx_encls_function { + ECREATE = 0x00, + EADD = 0x01, + EINIT = 0x02, + EREMOVE = 0x03, + EDGBRD = 0x04, + EDGBWR = 0x05, + EEXTEND = 0x06, + ELDU = 0x08, + EBLOCK = 0x09, + EPA = 0x0A, + EWB = 0x0B, + ETRACK = 0x0C, +}; + +/** + * ENCLS_FAULT_FLAG - flag signifying an ENCLS return code is a trapnr + * + * ENCLS has its own (positive value) error codes and also generates + * ENCLS specific #GP and #PF faults. And the ENCLS values get munged + * with system error codes as everything percolates back up the stack. + * Unfortunately (for us), we need to precisely identify each unique + * error code, e.g. the action taken if EWB fails varies based on the + * type of fault and on the exact SGX error code, i.e. we can't simply + * convert all faults to -EFAULT. + * + * To make all three error types coexist, we set bit 30 to identify an + * ENCLS fault. Bit 31 (technically bits N:31) is used to differentiate + * between positive (faults and SGX error codes) and negative (system + * error codes) values. + */ +#define ENCLS_FAULT_FLAG 0x40000000 + +/* Retrieve the encoded trapnr from the specified return code. */ +#define ENCLS_TRAPNR(r) ((r) & ~ENCLS_FAULT_FLAG) + +/* Issue a WARN() about an ENCLS function. */ +#define ENCLS_WARN(r, name) { \ + do { \ + int _r = (r); \ + WARN_ONCE(_r, "%s returned %d (0x%x)\n", (name), _r, _r); \ + } while (0); \ +} + +/** + * encls_failed() - Check if an ENCLS function failed + * @ret: the return value of an ENCLS function call + * + * Check if an ENCLS function failed. This happens when the function causes a + * fault that is not caused by an EPCM conflict or when the function returns a + * non-zero value. + */ +static inline bool encls_failed(int ret) +{ + if (ret & ENCLS_FAULT_FLAG) + return ENCLS_TRAPNR(ret) != X86_TRAP_PF; + + return !!ret; +} + +/** + * __encls_ret_N - encode an ENCLS function that returns an error code in EAX + * @rax: function number + * @inputs: asm inputs for the function + * + * Emit assembly for an ENCLS function that returns an error code, e.g. EREMOVE. + * And because SGX isn't complex enough as it is, function that return an error + * code also modify flags. + * + * Return: + * 0 on success, + * SGX error code on failure + */ +#define __encls_ret_N(rax, inputs...) \ + ({ \ + int ret; \ + asm volatile( \ + "1: .byte 0x0f, 0x01, 0xcf;\n\t" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: orl $"__stringify(ENCLS_FAULT_FLAG)",%%eax\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE_FAULT(1b, 3b) \ + : "=a"(ret) \ + : "a"(rax), inputs \ + : "memory", "cc"); \ + ret; \ + }) + +#define __encls_ret_1(rax, rcx) \ + ({ \ + __encls_ret_N(rax, "c"(rcx)); \ + }) + +#define __encls_ret_2(rax, rbx, rcx) \ + ({ \ + __encls_ret_N(rax, "b"(rbx), "c"(rcx)); \ + }) + +#define __encls_ret_3(rax, rbx, rcx, rdx) \ + ({ \ + __encls_ret_N(rax, "b"(rbx), "c"(rcx), "d"(rdx)); \ + }) + +/** + * __encls_N - encode an ENCLS function that doesn't return an error code + * @rax: function number + * @rbx_out: optional output variable + * @inputs: asm inputs for the function + * + * Emit assembly for an ENCLS function that does not return an error code, e.g. + * ECREATE. Leaves without error codes either succeed or fault. @rbx_out is an + * optional parameter for use by EDGBRD, which returns the requested value in + * RBX. + * + * Return: + * 0 on success, + * trapnr with ENCLS_FAULT_FLAG set on fault + */ +#define __encls_N(rax, rbx_out, inputs...) \ + ({ \ + int ret; \ + asm volatile( \ + "1: .byte 0x0f, 0x01, 0xcf;\n\t" \ + " xor %%eax,%%eax;\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: orl $"__stringify(ENCLS_FAULT_FLAG)",%%eax\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE_FAULT(1b, 3b) \ + : "=a"(ret), "=b"(rbx_out) \ + : "a"(rax), inputs \ + : "memory"); \ + ret; \ + }) + +#define __encls_2(rax, rbx, rcx) \ + ({ \ + unsigned long ign_rbx_out; \ + __encls_N(rax, ign_rbx_out, "b"(rbx), "c"(rcx)); \ + }) + +#define __encls_1_1(rax, data, rcx) \ + ({ \ + unsigned long rbx_out; \ + int ret = __encls_N(rax, rbx_out, "c"(rcx)); \ + if (!ret) \ + data = rbx_out; \ + ret; \ + }) + +static inline int __ecreate(struct sgx_pageinfo *pginfo, void *secs) +{ + return __encls_2(ECREATE, pginfo, secs); +} + +static inline int __eextend(void *secs, void *addr) +{ + return __encls_2(EEXTEND, secs, addr); +} + +static inline int __eadd(struct sgx_pageinfo *pginfo, void *addr) +{ + return __encls_2(EADD, pginfo, addr); +} + +static inline int __einit(void *sigstruct, void *token, void *secs) +{ + return __encls_ret_3(EINIT, sigstruct, secs, token); +} + +static inline int __eremove(void *addr) +{ + return __encls_ret_1(EREMOVE, addr); +} + +static inline int __edbgwr(void *addr, unsigned long *data) +{ + return __encls_2(EDGBWR, *data, addr); +} + +static inline int __edbgrd(void *addr, unsigned long *data) +{ + return __encls_1_1(EDGBRD, *data, addr); +} + +static inline int __etrack(void *addr) +{ + return __encls_ret_1(ETRACK, addr); +} + +static inline int __eldu(struct sgx_pageinfo *pginfo, void *addr, + void *va) +{ + return __encls_ret_3(ELDU, pginfo, addr, va); +} + +static inline int __eblock(void *addr) +{ + return __encls_ret_1(EBLOCK, addr); +} + +static inline int __epa(void *addr) +{ + unsigned long rbx = SGX_PAGE_TYPE_VA; + + return __encls_2(EPA, rbx, addr); +} + +static inline int __ewb(struct sgx_pageinfo *pginfo, void *addr, + void *va) +{ + return __encls_ret_3(EWB, pginfo, addr, va); +} + +#endif /* _X86_ENCLS_H */ diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c new file mode 100644 index 000000000000..90a5caf76939 --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -0,0 +1,716 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016-20 Intel Corporation. */ + +#include <asm/mman.h> +#include <linux/mman.h> +#include <linux/delay.h> +#include <linux/file.h> +#include <linux/hashtable.h> +#include <linux/highmem.h> +#include <linux/ratelimit.h> +#include <linux/sched/signal.h> +#include <linux/shmem_fs.h> +#include <linux/slab.h> +#include <linux/suspend.h> +#include "driver.h" +#include "encl.h" +#include "encls.h" + +static struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl) +{ + struct sgx_va_page *va_page = NULL; + void *err; + + BUILD_BUG_ON(SGX_VA_SLOT_COUNT != + (SGX_ENCL_PAGE_VA_OFFSET_MASK >> 3) + 1); + + if (!(encl->page_cnt % SGX_VA_SLOT_COUNT)) { + va_page = kzalloc(sizeof(*va_page), GFP_KERNEL); + if (!va_page) + return ERR_PTR(-ENOMEM); + + va_page->epc_page = sgx_alloc_va_page(); + if (IS_ERR(va_page->epc_page)) { + err = ERR_CAST(va_page->epc_page); + kfree(va_page); + return err; + } + + WARN_ON_ONCE(encl->page_cnt % SGX_VA_SLOT_COUNT); + } + encl->page_cnt++; + return va_page; +} + +static void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page) +{ + encl->page_cnt--; + + if (va_page) { + sgx_free_epc_page(va_page->epc_page); + list_del(&va_page->list); + kfree(va_page); + } +} + +static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) +{ + struct sgx_epc_page *secs_epc; + struct sgx_va_page *va_page; + struct sgx_pageinfo pginfo; + struct sgx_secinfo secinfo; + unsigned long encl_size; + struct file *backing; + long ret; + + va_page = sgx_encl_grow(encl); + if (IS_ERR(va_page)) + return PTR_ERR(va_page); + else if (va_page) + list_add(&va_page->list, &encl->va_pages); + /* else the tail page of the VA page list had free slots. */ + + /* The extra page goes to SECS. */ + encl_size = secs->size + PAGE_SIZE; + + backing = shmem_file_setup("SGX backing", encl_size + (encl_size >> 5), + VM_NORESERVE); + if (IS_ERR(backing)) { + ret = PTR_ERR(backing); + goto err_out_shrink; + } + + encl->backing = backing; + + secs_epc = sgx_alloc_epc_page(&encl->secs, true); + if (IS_ERR(secs_epc)) { + ret = PTR_ERR(secs_epc); + goto err_out_backing; + } + + encl->secs.epc_page = secs_epc; + + pginfo.addr = 0; + pginfo.contents = (unsigned long)secs; + pginfo.metadata = (unsigned long)&secinfo; + pginfo.secs = 0; + memset(&secinfo, 0, sizeof(secinfo)); + + ret = __ecreate((void *)&pginfo, sgx_get_epc_virt_addr(secs_epc)); + if (ret) { + ret = -EIO; + goto err_out; + } + + if (secs->attributes & SGX_ATTR_DEBUG) + set_bit(SGX_ENCL_DEBUG, &encl->flags); + + encl->secs.encl = encl; + encl->base = secs->base; + encl->size = secs->size; + encl->attributes = secs->attributes; + encl->attributes_mask = SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT | SGX_ATTR_KSS; + + /* Set only after completion, as encl->lock has not been taken. */ + set_bit(SGX_ENCL_CREATED, &encl->flags); + + return 0; + +err_out: + sgx_free_epc_page(encl->secs.epc_page); + encl->secs.epc_page = NULL; + +err_out_backing: + fput(encl->backing); + encl->backing = NULL; + +err_out_shrink: + sgx_encl_shrink(encl, va_page); + + return ret; +} + +/** + * sgx_ioc_enclave_create() - handler for %SGX_IOC_ENCLAVE_CREATE + * @encl: An enclave pointer. + * @arg: The ioctl argument. + * + * Allocate kernel data structures for the enclave and invoke ECREATE. + * + * Return: + * - 0: Success. + * - -EIO: ECREATE failed. + * - -errno: POSIX error. + */ +static long sgx_ioc_enclave_create(struct sgx_encl *encl, void __user *arg) +{ + struct sgx_enclave_create create_arg; + void *secs; + int ret; + + if (test_bit(SGX_ENCL_CREATED, &encl->flags)) + return -EINVAL; + + if (copy_from_user(&create_arg, arg, sizeof(create_arg))) + return -EFAULT; + + secs = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!secs) + return -ENOMEM; + + if (copy_from_user(secs, (void __user *)create_arg.src, PAGE_SIZE)) + ret = -EFAULT; + else + ret = sgx_encl_create(encl, secs); + + kfree(secs); + return ret; +} + +static struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl, + unsigned long offset, + u64 secinfo_flags) +{ + struct sgx_encl_page *encl_page; + unsigned long prot; + + encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL); + if (!encl_page) + return ERR_PTR(-ENOMEM); + + encl_page->desc = encl->base + offset; + encl_page->encl = encl; + + prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ) | + _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) | + _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC); + + /* + * TCS pages must always RW set for CPU access while the SECINFO + * permissions are *always* zero - the CPU ignores the user provided + * values and silently overwrites them with zero permissions. + */ + if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS) + prot |= PROT_READ | PROT_WRITE; + + /* Calculate maximum of the VM flags for the page. */ + encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0); + + return encl_page; +} + +static int sgx_validate_secinfo(struct sgx_secinfo *secinfo) +{ + u64 perm = secinfo->flags & SGX_SECINFO_PERMISSION_MASK; + u64 pt = secinfo->flags & SGX_SECINFO_PAGE_TYPE_MASK; + + if (pt != SGX_SECINFO_REG && pt != SGX_SECINFO_TCS) + return -EINVAL; + + if ((perm & SGX_SECINFO_W) && !(perm & SGX_SECINFO_R)) + return -EINVAL; + + /* + * CPU will silently overwrite the permissions as zero, which means + * that we need to validate it ourselves. + */ + if (pt == SGX_SECINFO_TCS && perm) + return -EINVAL; + + if (secinfo->flags & SGX_SECINFO_RESERVED_MASK) + return -EINVAL; + + if (memchr_inv(secinfo->reserved, 0, sizeof(secinfo->reserved))) + return -EINVAL; + + return 0; +} + +static int __sgx_encl_add_page(struct sgx_encl *encl, + struct sgx_encl_page *encl_page, + struct sgx_epc_page *epc_page, + struct sgx_secinfo *secinfo, unsigned long src) +{ + struct sgx_pageinfo pginfo; + struct vm_area_struct *vma; + struct page *src_page; + int ret; + + /* Deny noexec. */ + vma = find_vma(current->mm, src); + if (!vma) + return -EFAULT; + + if (!(vma->vm_flags & VM_MAYEXEC)) + return -EACCES; + + ret = get_user_pages(src, 1, 0, &src_page, NULL); + if (ret < 1) + return -EFAULT; + + pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page); + pginfo.addr = encl_page->desc & PAGE_MASK; + pginfo.metadata = (unsigned long)secinfo; + pginfo.contents = (unsigned long)kmap_atomic(src_page); + + ret = __eadd(&pginfo, sgx_get_epc_virt_addr(epc_page)); + + kunmap_atomic((void *)pginfo.contents); + put_page(src_page); + + return ret ? -EIO : 0; +} + +/* + * If the caller requires measurement of the page as a proof for the content, + * use EEXTEND to add a measurement for 256 bytes of the page. Repeat this + * operation until the entire page is measured." + */ +static int __sgx_encl_extend(struct sgx_encl *encl, + struct sgx_epc_page *epc_page) +{ + unsigned long offset; + int ret; + + for (offset = 0; offset < PAGE_SIZE; offset += SGX_EEXTEND_BLOCK_SIZE) { + ret = __eextend(sgx_get_epc_virt_addr(encl->secs.epc_page), + sgx_get_epc_virt_addr(epc_page) + offset); + if (ret) { + if (encls_failed(ret)) + ENCLS_WARN(ret, "EEXTEND"); + + return -EIO; + } + } + + return 0; +} + +static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src, + unsigned long offset, struct sgx_secinfo *secinfo, + unsigned long flags) +{ + struct sgx_encl_page *encl_page; + struct sgx_epc_page *epc_page; + struct sgx_va_page *va_page; + int ret; + + encl_page = sgx_encl_page_alloc(encl, offset, secinfo->flags); + if (IS_ERR(encl_page)) + return PTR_ERR(encl_page); + + epc_page = sgx_alloc_epc_page(encl_page, true); + if (IS_ERR(epc_page)) { + kfree(encl_page); + return PTR_ERR(epc_page); + } + + va_page = sgx_encl_grow(encl); + if (IS_ERR(va_page)) { + ret = PTR_ERR(va_page); + goto err_out_free; + } + + mmap_read_lock(current->mm); + mutex_lock(&encl->lock); + + /* + * Adding to encl->va_pages must be done under encl->lock. Ditto for + * deleting (via sgx_encl_shrink()) in the error path. + */ + if (va_page) + list_add(&va_page->list, &encl->va_pages); + + /* + * Insert prior to EADD in case of OOM. EADD modifies MRENCLAVE, i.e. + * can't be gracefully unwound, while failure on EADD/EXTEND is limited + * to userspace errors (or kernel/hardware bugs). + */ + ret = xa_insert(&encl->page_array, PFN_DOWN(encl_page->desc), + encl_page, GFP_KERNEL); + if (ret) + goto err_out_unlock; + + ret = __sgx_encl_add_page(encl, encl_page, epc_page, secinfo, + src); + if (ret) + goto err_out; + + /* + * Complete the "add" before doing the "extend" so that the "add" + * isn't in a half-baked state in the extremely unlikely scenario + * the enclave will be destroyed in response to EEXTEND failure. + */ + encl_page->encl = encl; + encl_page->epc_page = epc_page; + encl->secs_child_cnt++; + + if (flags & SGX_PAGE_MEASURE) { + ret = __sgx_encl_extend(encl, epc_page); + if (ret) + goto err_out; + } + + sgx_mark_page_reclaimable(encl_page->epc_page); + mutex_unlock(&encl->lock); + mmap_read_unlock(current->mm); + return ret; + +err_out: + xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc)); + +err_out_unlock: + sgx_encl_shrink(encl, va_page); + mutex_unlock(&encl->lock); + mmap_read_unlock(current->mm); + +err_out_free: + sgx_free_epc_page(epc_page); + kfree(encl_page); + + return ret; +} + +/** + * sgx_ioc_enclave_add_pages() - The handler for %SGX_IOC_ENCLAVE_ADD_PAGES + * @encl: an enclave pointer + * @arg: a user pointer to a struct sgx_enclave_add_pages instance + * + * Add one or more pages to an uninitialized enclave, and optionally extend the + * measurement with the contents of the page. The SECINFO and measurement mask + * are applied to all pages. + * + * A SECINFO for a TCS is required to always contain zero permissions because + * CPU silently zeros them. Allowing anything else would cause a mismatch in + * the measurement. + * + * mmap()'s protection bits are capped by the page permissions. For each page + * address, the maximum protection bits are computed with the following + * heuristics: + * + * 1. A regular page: PROT_R, PROT_W and PROT_X match the SECINFO permissions. + * 2. A TCS page: PROT_R | PROT_W. + * + * mmap() is not allowed to surpass the minimum of the maximum protection bits + * within the given address range. + * + * The function deinitializes kernel data structures for enclave and returns + * -EIO in any of the following conditions: + * + * - Enclave Page Cache (EPC), the physical memory holding enclaves, has + * been invalidated. This will cause EADD and EEXTEND to fail. + * - If the source address is corrupted somehow when executing EADD. + * + * Return: + * - 0: Success. + * - -EACCES: The source page is located in a noexec partition. + * - -ENOMEM: Out of EPC pages. + * - -EINTR: The call was interrupted before data was processed. + * - -EIO: Either EADD or EEXTEND failed because invalid source address + * or power cycle. + * - -errno: POSIX error. + */ +static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg) +{ + struct sgx_enclave_add_pages add_arg; + struct sgx_secinfo secinfo; + unsigned long c; + int ret; + + if (!test_bit(SGX_ENCL_CREATED, &encl->flags) || + test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) + return -EINVAL; + + if (copy_from_user(&add_arg, arg, sizeof(add_arg))) + return -EFAULT; + + if (!IS_ALIGNED(add_arg.offset, PAGE_SIZE) || + !IS_ALIGNED(add_arg.src, PAGE_SIZE)) + return -EINVAL; + + if (!add_arg.length || add_arg.length & (PAGE_SIZE - 1)) + return -EINVAL; + + if (add_arg.offset + add_arg.length - PAGE_SIZE >= encl->size) + return -EINVAL; + + if (copy_from_user(&secinfo, (void __user *)add_arg.secinfo, + sizeof(secinfo))) + return -EFAULT; + + if (sgx_validate_secinfo(&secinfo)) + return -EINVAL; + + for (c = 0 ; c < add_arg.length; c += PAGE_SIZE) { + if (signal_pending(current)) { + if (!c) + ret = -ERESTARTSYS; + + break; + } + + if (need_resched()) + cond_resched(); + + ret = sgx_encl_add_page(encl, add_arg.src + c, add_arg.offset + c, + &secinfo, add_arg.flags); + if (ret) + break; + } + + add_arg.count = c; + + if (copy_to_user(arg, &add_arg, sizeof(add_arg))) + return -EFAULT; + + return ret; +} + +static int __sgx_get_key_hash(struct crypto_shash *tfm, const void *modulus, + void *hash) +{ + SHASH_DESC_ON_STACK(shash, tfm); + + shash->tfm = tfm; + + return crypto_shash_digest(shash, modulus, SGX_MODULUS_SIZE, hash); +} + +static int sgx_get_key_hash(const void *modulus, void *hash) +{ + struct crypto_shash *tfm; + int ret; + + tfm = crypto_alloc_shash("sha256", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + ret = __sgx_get_key_hash(tfm, modulus, hash); + + crypto_free_shash(tfm); + return ret; +} + +static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, + void *token) +{ + u64 mrsigner[4]; + int i, j, k; + void *addr; + int ret; + + /* + * Deny initializing enclaves with attributes (namely provisioning) + * that have not been explicitly allowed. + */ + if (encl->attributes & ~encl->attributes_mask) + return -EACCES; + + /* + * Attributes should not be enforced *only* against what's available on + * platform (done in sgx_encl_create) but checked and enforced against + * the mask for enforcement in sigstruct. For example an enclave could + * opt to sign with AVX bit in xfrm, but still be loadable on a platform + * without it if the sigstruct->body.attributes_mask does not turn that + * bit on. + */ + if (sigstruct->body.attributes & sigstruct->body.attributes_mask & + sgx_attributes_reserved_mask) + return -EINVAL; + + if (sigstruct->body.miscselect & sigstruct->body.misc_mask & + sgx_misc_reserved_mask) + return -EINVAL; + + if (sigstruct->body.xfrm & sigstruct->body.xfrm_mask & + sgx_xfrm_reserved_mask) + return -EINVAL; + + ret = sgx_get_key_hash(sigstruct->modulus, mrsigner); + if (ret) + return ret; + + mutex_lock(&encl->lock); + + /* + * ENCLS[EINIT] is interruptible because it has such a high latency, + * e.g. 50k+ cycles on success. If an IRQ/NMI/SMI becomes pending, + * EINIT may fail with SGX_UNMASKED_EVENT so that the event can be + * serviced. + */ + for (i = 0; i < SGX_EINIT_SLEEP_COUNT; i++) { + for (j = 0; j < SGX_EINIT_SPIN_COUNT; j++) { + addr = sgx_get_epc_virt_addr(encl->secs.epc_page); + + preempt_disable(); + + for (k = 0; k < 4; k++) + wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + k, mrsigner[k]); + + ret = __einit(sigstruct, token, addr); + + preempt_enable(); + + if (ret == SGX_UNMASKED_EVENT) + continue; + else + break; + } + + if (ret != SGX_UNMASKED_EVENT) + break; + + msleep_interruptible(SGX_EINIT_SLEEP_TIME); + + if (signal_pending(current)) { + ret = -ERESTARTSYS; + goto err_out; + } + } + + if (ret & ENCLS_FAULT_FLAG) { + if (encls_failed(ret)) + ENCLS_WARN(ret, "EINIT"); + + ret = -EIO; + } else if (ret) { + pr_debug("EINIT returned %d\n", ret); + ret = -EPERM; + } else { + set_bit(SGX_ENCL_INITIALIZED, &encl->flags); + } + +err_out: + mutex_unlock(&encl->lock); + return ret; +} + +/** + * sgx_ioc_enclave_init() - handler for %SGX_IOC_ENCLAVE_INIT + * @encl: an enclave pointer + * @arg: userspace pointer to a struct sgx_enclave_init instance + * + * Flush any outstanding enqueued EADD operations and perform EINIT. The + * Launch Enclave Public Key Hash MSRs are rewritten as necessary to match + * the enclave's MRSIGNER, which is caculated from the provided sigstruct. + * + * Return: + * - 0: Success. + * - -EPERM: Invalid SIGSTRUCT. + * - -EIO: EINIT failed because of a power cycle. + * - -errno: POSIX error. + */ +static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg) +{ + struct sgx_sigstruct *sigstruct; + struct sgx_enclave_init init_arg; + struct page *initp_page; + void *token; + int ret; + + if (!test_bit(SGX_ENCL_CREATED, &encl->flags) || + test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) + return -EINVAL; + + if (copy_from_user(&init_arg, arg, sizeof(init_arg))) + return -EFAULT; + + initp_page = alloc_page(GFP_KERNEL); + if (!initp_page) + return -ENOMEM; + + sigstruct = kmap(initp_page); + token = (void *)((unsigned long)sigstruct + PAGE_SIZE / 2); + memset(token, 0, SGX_LAUNCH_TOKEN_SIZE); + + if (copy_from_user(sigstruct, (void __user *)init_arg.sigstruct, + sizeof(*sigstruct))) { + ret = -EFAULT; + goto out; + } + + /* + * A legacy field used with Intel signed enclaves. These used to mean + * regular and architectural enclaves. The CPU only accepts these values + * but they do not have any other meaning. + * + * Thus, reject any other values. + */ + if (sigstruct->header.vendor != 0x0000 && + sigstruct->header.vendor != 0x8086) { + ret = -EINVAL; + goto out; + } + + ret = sgx_encl_init(encl, sigstruct, token); + +out: + kunmap(initp_page); + __free_page(initp_page); + return ret; +} + +/** + * sgx_ioc_enclave_provision() - handler for %SGX_IOC_ENCLAVE_PROVISION + * @encl: an enclave pointer + * @arg: userspace pointer to a struct sgx_enclave_provision instance + * + * Allow ATTRIBUTE.PROVISION_KEY for an enclave by providing a file handle to + * /dev/sgx_provision. + * + * Return: + * - 0: Success. + * - -errno: Otherwise. + */ +static long sgx_ioc_enclave_provision(struct sgx_encl *encl, void __user *arg) +{ + struct sgx_enclave_provision params; + struct file *file; + + if (copy_from_user(¶ms, arg, sizeof(params))) + return -EFAULT; + + file = fget(params.fd); + if (!file) + return -EINVAL; + + if (file->f_op != &sgx_provision_fops) { + fput(file); + return -EINVAL; + } + + encl->attributes_mask |= SGX_ATTR_PROVISIONKEY; + + fput(file); + return 0; +} + +long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) +{ + struct sgx_encl *encl = filep->private_data; + int ret; + + if (test_and_set_bit(SGX_ENCL_IOCTL, &encl->flags)) + return -EBUSY; + + switch (cmd) { + case SGX_IOC_ENCLAVE_CREATE: + ret = sgx_ioc_enclave_create(encl, (void __user *)arg); + break; + case SGX_IOC_ENCLAVE_ADD_PAGES: + ret = sgx_ioc_enclave_add_pages(encl, (void __user *)arg); + break; + case SGX_IOC_ENCLAVE_INIT: + ret = sgx_ioc_enclave_init(encl, (void __user *)arg); + break; + case SGX_IOC_ENCLAVE_PROVISION: + ret = sgx_ioc_enclave_provision(encl, (void __user *)arg); + break; + default: + ret = -ENOIOCTLCMD; + break; + } + + clear_bit(SGX_ENCL_IOCTL, &encl->flags); + return ret; +} diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c new file mode 100644 index 000000000000..c519fc5f6948 --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -0,0 +1,733 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016-20 Intel Corporation. */ + +#include <linux/freezer.h> +#include <linux/highmem.h> +#include <linux/kthread.h> +#include <linux/pagemap.h> +#include <linux/ratelimit.h> +#include <linux/sched/mm.h> +#include <linux/sched/signal.h> +#include <linux/slab.h> +#include "driver.h" +#include "encl.h" +#include "encls.h" + +struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS]; +static int sgx_nr_epc_sections; +static struct task_struct *ksgxd_tsk; +static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq); + +/* + * These variables are part of the state of the reclaimer, and must be accessed + * with sgx_reclaimer_lock acquired. + */ +static LIST_HEAD(sgx_active_page_list); + +static DEFINE_SPINLOCK(sgx_reclaimer_lock); + +/* + * Reset dirty EPC pages to uninitialized state. Laundry can be left with SECS + * pages whose child pages blocked EREMOVE. + */ +static void sgx_sanitize_section(struct sgx_epc_section *section) +{ + struct sgx_epc_page *page; + LIST_HEAD(dirty); + int ret; + + /* init_laundry_list is thread-local, no need for a lock: */ + while (!list_empty(§ion->init_laundry_list)) { + if (kthread_should_stop()) + return; + + /* needed for access to ->page_list: */ + spin_lock(§ion->lock); + + page = list_first_entry(§ion->init_laundry_list, + struct sgx_epc_page, list); + + ret = __eremove(sgx_get_epc_virt_addr(page)); + if (!ret) + list_move(&page->list, §ion->page_list); + else + list_move_tail(&page->list, &dirty); + + spin_unlock(§ion->lock); + + cond_resched(); + } + + list_splice(&dirty, §ion->init_laundry_list); +} + +static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page) +{ + struct sgx_encl_page *page = epc_page->owner; + struct sgx_encl *encl = page->encl; + struct sgx_encl_mm *encl_mm; + bool ret = true; + int idx; + + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { + if (!mmget_not_zero(encl_mm->mm)) + continue; + + mmap_read_lock(encl_mm->mm); + ret = !sgx_encl_test_and_clear_young(encl_mm->mm, page); + mmap_read_unlock(encl_mm->mm); + + mmput_async(encl_mm->mm); + + if (!ret) + break; + } + + srcu_read_unlock(&encl->srcu, idx); + + if (!ret) + return false; + + return true; +} + +static void sgx_reclaimer_block(struct sgx_epc_page *epc_page) +{ + struct sgx_encl_page *page = epc_page->owner; + unsigned long addr = page->desc & PAGE_MASK; + struct sgx_encl *encl = page->encl; + unsigned long mm_list_version; + struct sgx_encl_mm *encl_mm; + struct vm_area_struct *vma; + int idx, ret; + + do { + mm_list_version = encl->mm_list_version; + + /* Pairs with smp_rmb() in sgx_encl_mm_add(). */ + smp_rmb(); + + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { + if (!mmget_not_zero(encl_mm->mm)) + continue; + + mmap_read_lock(encl_mm->mm); + + ret = sgx_encl_find(encl_mm->mm, addr, &vma); + if (!ret && encl == vma->vm_private_data) + zap_vma_ptes(vma, addr, PAGE_SIZE); + + mmap_read_unlock(encl_mm->mm); + + mmput_async(encl_mm->mm); + } + + srcu_read_unlock(&encl->srcu, idx); + } while (unlikely(encl->mm_list_version != mm_list_version)); + + mutex_lock(&encl->lock); + + ret = __eblock(sgx_get_epc_virt_addr(epc_page)); + if (encls_failed(ret)) + ENCLS_WARN(ret, "EBLOCK"); + + mutex_unlock(&encl->lock); +} + +static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot, + struct sgx_backing *backing) +{ + struct sgx_pageinfo pginfo; + int ret; + + pginfo.addr = 0; + pginfo.secs = 0; + + pginfo.contents = (unsigned long)kmap_atomic(backing->contents); + pginfo.metadata = (unsigned long)kmap_atomic(backing->pcmd) + + backing->pcmd_offset; + + ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot); + + kunmap_atomic((void *)(unsigned long)(pginfo.metadata - + backing->pcmd_offset)); + kunmap_atomic((void *)(unsigned long)pginfo.contents); + + return ret; +} + +static void sgx_ipi_cb(void *info) +{ +} + +static const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl) +{ + cpumask_t *cpumask = &encl->cpumask; + struct sgx_encl_mm *encl_mm; + int idx; + + /* + * Can race with sgx_encl_mm_add(), but ETRACK has already been + * executed, which means that the CPUs running in the new mm will enter + * into the enclave with a fresh epoch. + */ + cpumask_clear(cpumask); + + idx = srcu_read_lock(&encl->srcu); + + list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) { + if (!mmget_not_zero(encl_mm->mm)) + continue; + + cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm)); + + mmput_async(encl_mm->mm); + } + + srcu_read_unlock(&encl->srcu, idx); + + return cpumask; +} + +/* + * Swap page to the regular memory transformed to the blocked state by using + * EBLOCK, which means that it can no loger be referenced (no new TLB entries). + * + * The first trial just tries to write the page assuming that some other thread + * has reset the count for threads inside the enlave by using ETRACK, and + * previous thread count has been zeroed out. The second trial calls ETRACK + * before EWB. If that fails we kick all the HW threads out, and then do EWB, + * which should be guaranteed the succeed. + */ +static void sgx_encl_ewb(struct sgx_epc_page *epc_page, + struct sgx_backing *backing) +{ + struct sgx_encl_page *encl_page = epc_page->owner; + struct sgx_encl *encl = encl_page->encl; + struct sgx_va_page *va_page; + unsigned int va_offset; + void *va_slot; + int ret; + + encl_page->desc &= ~SGX_ENCL_PAGE_BEING_RECLAIMED; + + va_page = list_first_entry(&encl->va_pages, struct sgx_va_page, + list); + va_offset = sgx_alloc_va_slot(va_page); + va_slot = sgx_get_epc_virt_addr(va_page->epc_page) + va_offset; + if (sgx_va_page_full(va_page)) + list_move_tail(&va_page->list, &encl->va_pages); + + ret = __sgx_encl_ewb(epc_page, va_slot, backing); + if (ret == SGX_NOT_TRACKED) { + ret = __etrack(sgx_get_epc_virt_addr(encl->secs.epc_page)); + if (ret) { + if (encls_failed(ret)) + ENCLS_WARN(ret, "ETRACK"); + } + + ret = __sgx_encl_ewb(epc_page, va_slot, backing); + if (ret == SGX_NOT_TRACKED) { + /* + * Slow path, send IPIs to kick cpus out of the + * enclave. Note, it's imperative that the cpu + * mask is generated *after* ETRACK, else we'll + * miss cpus that entered the enclave between + * generating the mask and incrementing epoch. + */ + on_each_cpu_mask(sgx_encl_ewb_cpumask(encl), + sgx_ipi_cb, NULL, 1); + ret = __sgx_encl_ewb(epc_page, va_slot, backing); + } + } + + if (ret) { + if (encls_failed(ret)) + ENCLS_WARN(ret, "EWB"); + + sgx_free_va_slot(va_page, va_offset); + } else { + encl_page->desc |= va_offset; + encl_page->va_page = va_page; + } +} + +static void sgx_reclaimer_write(struct sgx_epc_page *epc_page, + struct sgx_backing *backing) +{ + struct sgx_encl_page *encl_page = epc_page->owner; + struct sgx_encl *encl = encl_page->encl; + struct sgx_backing secs_backing; + int ret; + + mutex_lock(&encl->lock); + + sgx_encl_ewb(epc_page, backing); + encl_page->epc_page = NULL; + encl->secs_child_cnt--; + + if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) { + ret = sgx_encl_get_backing(encl, PFN_DOWN(encl->size), + &secs_backing); + if (ret) + goto out; + + sgx_encl_ewb(encl->secs.epc_page, &secs_backing); + + sgx_free_epc_page(encl->secs.epc_page); + encl->secs.epc_page = NULL; + + sgx_encl_put_backing(&secs_backing, true); + } + +out: + mutex_unlock(&encl->lock); +} + +/* + * Take a fixed number of pages from the head of the active page pool and + * reclaim them to the enclave's private shmem files. Skip the pages, which have + * been accessed since the last scan. Move those pages to the tail of active + * page pool so that the pages get scanned in LRU like fashion. + * + * Batch process a chunk of pages (at the moment 16) in order to degrade amount + * of IPI's and ETRACK's potentially required. sgx_encl_ewb() does degrade a bit + * among the HW threads with three stage EWB pipeline (EWB, ETRACK + EWB and IPI + * + EWB) but not sufficiently. Reclaiming one page at a time would also be + * problematic as it would increase the lock contention too much, which would + * halt forward progress. + */ +static void sgx_reclaim_pages(void) +{ + struct sgx_epc_page *chunk[SGX_NR_TO_SCAN]; + struct sgx_backing backing[SGX_NR_TO_SCAN]; + struct sgx_epc_section *section; + struct sgx_encl_page *encl_page; + struct sgx_epc_page *epc_page; + pgoff_t page_index; + int cnt = 0; + int ret; + int i; + + spin_lock(&sgx_reclaimer_lock); + for (i = 0; i < SGX_NR_TO_SCAN; i++) { + if (list_empty(&sgx_active_page_list)) + break; + + epc_page = list_first_entry(&sgx_active_page_list, + struct sgx_epc_page, list); + list_del_init(&epc_page->list); + encl_page = epc_page->owner; + + if (kref_get_unless_zero(&encl_page->encl->refcount) != 0) + chunk[cnt++] = epc_page; + else + /* The owner is freeing the page. No need to add the + * page back to the list of reclaimable pages. + */ + epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; + } + spin_unlock(&sgx_reclaimer_lock); + + for (i = 0; i < cnt; i++) { + epc_page = chunk[i]; + encl_page = epc_page->owner; + + if (!sgx_reclaimer_age(epc_page)) + goto skip; + + page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base); + ret = sgx_encl_get_backing(encl_page->encl, page_index, &backing[i]); + if (ret) + goto skip; + + mutex_lock(&encl_page->encl->lock); + encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED; + mutex_unlock(&encl_page->encl->lock); + continue; + +skip: + spin_lock(&sgx_reclaimer_lock); + list_add_tail(&epc_page->list, &sgx_active_page_list); + spin_unlock(&sgx_reclaimer_lock); + + kref_put(&encl_page->encl->refcount, sgx_encl_release); + + chunk[i] = NULL; + } + + for (i = 0; i < cnt; i++) { + epc_page = chunk[i]; + if (epc_page) + sgx_reclaimer_block(epc_page); + } + + for (i = 0; i < cnt; i++) { + epc_page = chunk[i]; + if (!epc_page) + continue; + + encl_page = epc_page->owner; + sgx_reclaimer_write(epc_page, &backing[i]); + sgx_encl_put_backing(&backing[i], true); + + kref_put(&encl_page->encl->refcount, sgx_encl_release); + epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; + + section = &sgx_epc_sections[epc_page->section]; + spin_lock(§ion->lock); + list_add_tail(&epc_page->list, §ion->page_list); + section->free_cnt++; + spin_unlock(§ion->lock); + } +} + +static unsigned long sgx_nr_free_pages(void) +{ + unsigned long cnt = 0; + int i; + + for (i = 0; i < sgx_nr_epc_sections; i++) + cnt += sgx_epc_sections[i].free_cnt; + + return cnt; +} + +static bool sgx_should_reclaim(unsigned long watermark) +{ + return sgx_nr_free_pages() < watermark && + !list_empty(&sgx_active_page_list); +} + +static int ksgxd(void *p) +{ + int i; + + set_freezable(); + + /* + * Sanitize pages in order to recover from kexec(). The 2nd pass is + * required for SECS pages, whose child pages blocked EREMOVE. + */ + for (i = 0; i < sgx_nr_epc_sections; i++) + sgx_sanitize_section(&sgx_epc_sections[i]); + + for (i = 0; i < sgx_nr_epc_sections; i++) { + sgx_sanitize_section(&sgx_epc_sections[i]); + + /* Should never happen. */ + if (!list_empty(&sgx_epc_sections[i].init_laundry_list)) + WARN(1, "EPC section %d has unsanitized pages.\n", i); + } + + while (!kthread_should_stop()) { + if (try_to_freeze()) + continue; + + wait_event_freezable(ksgxd_waitq, + kthread_should_stop() || + sgx_should_reclaim(SGX_NR_HIGH_PAGES)); + + if (sgx_should_reclaim(SGX_NR_HIGH_PAGES)) + sgx_reclaim_pages(); + + cond_resched(); + } + + return 0; +} + +static bool __init sgx_page_reclaimer_init(void) +{ + struct task_struct *tsk; + + tsk = kthread_run(ksgxd, NULL, "ksgxd"); + if (IS_ERR(tsk)) + return false; + + ksgxd_tsk = tsk; + + return true; +} + +static struct sgx_epc_page *__sgx_alloc_epc_page_from_section(struct sgx_epc_section *section) +{ + struct sgx_epc_page *page; + + spin_lock(§ion->lock); + + if (list_empty(§ion->page_list)) { + spin_unlock(§ion->lock); + return NULL; + } + + page = list_first_entry(§ion->page_list, struct sgx_epc_page, list); + list_del_init(&page->list); + section->free_cnt--; + + spin_unlock(§ion->lock); + return page; +} + +/** + * __sgx_alloc_epc_page() - Allocate an EPC page + * + * Iterate through EPC sections and borrow a free EPC page to the caller. When a + * page is no longer needed it must be released with sgx_free_epc_page(). + * + * Return: + * an EPC page, + * -errno on error + */ +struct sgx_epc_page *__sgx_alloc_epc_page(void) +{ + struct sgx_epc_section *section; + struct sgx_epc_page *page; + int i; + + for (i = 0; i < sgx_nr_epc_sections; i++) { + section = &sgx_epc_sections[i]; + + page = __sgx_alloc_epc_page_from_section(section); + if (page) + return page; + } + + return ERR_PTR(-ENOMEM); +} + +/** + * sgx_mark_page_reclaimable() - Mark a page as reclaimable + * @page: EPC page + * + * Mark a page as reclaimable and add it to the active page list. Pages + * are automatically removed from the active list when freed. + */ +void sgx_mark_page_reclaimable(struct sgx_epc_page *page) +{ + spin_lock(&sgx_reclaimer_lock); + page->flags |= SGX_EPC_PAGE_RECLAIMER_TRACKED; + list_add_tail(&page->list, &sgx_active_page_list); + spin_unlock(&sgx_reclaimer_lock); +} + +/** + * sgx_unmark_page_reclaimable() - Remove a page from the reclaim list + * @page: EPC page + * + * Clear the reclaimable flag and remove the page from the active page list. + * + * Return: + * 0 on success, + * -EBUSY if the page is in the process of being reclaimed + */ +int sgx_unmark_page_reclaimable(struct sgx_epc_page *page) +{ + spin_lock(&sgx_reclaimer_lock); + if (page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED) { + /* The page is being reclaimed. */ + if (list_empty(&page->list)) { + spin_unlock(&sgx_reclaimer_lock); + return -EBUSY; + } + + list_del(&page->list); + page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; + } + spin_unlock(&sgx_reclaimer_lock); + + return 0; +} + +/** + * sgx_alloc_epc_page() - Allocate an EPC page + * @owner: the owner of the EPC page + * @reclaim: reclaim pages if necessary + * + * Iterate through EPC sections and borrow a free EPC page to the caller. When a + * page is no longer needed it must be released with sgx_free_epc_page(). If + * @reclaim is set to true, directly reclaim pages when we are out of pages. No + * mm's can be locked when @reclaim is set to true. + * + * Finally, wake up ksgxd when the number of pages goes below the watermark + * before returning back to the caller. + * + * Return: + * an EPC page, + * -errno on error + */ +struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim) +{ + struct sgx_epc_page *page; + + for ( ; ; ) { + page = __sgx_alloc_epc_page(); + if (!IS_ERR(page)) { + page->owner = owner; + break; + } + + if (list_empty(&sgx_active_page_list)) + return ERR_PTR(-ENOMEM); + + if (!reclaim) { + page = ERR_PTR(-EBUSY); + break; + } + + if (signal_pending(current)) { + page = ERR_PTR(-ERESTARTSYS); + break; + } + + sgx_reclaim_pages(); + cond_resched(); + } + + if (sgx_should_reclaim(SGX_NR_LOW_PAGES)) + wake_up(&ksgxd_waitq); + + return page; +} + +/** + * sgx_free_epc_page() - Free an EPC page + * @page: an EPC page + * + * Call EREMOVE for an EPC page and insert it back to the list of free pages. + */ +void sgx_free_epc_page(struct sgx_epc_page *page) +{ + struct sgx_epc_section *section = &sgx_epc_sections[page->section]; + int ret; + + WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED); + + ret = __eremove(sgx_get_epc_virt_addr(page)); + if (WARN_ONCE(ret, "EREMOVE returned %d (0x%x)", ret, ret)) + return; + + spin_lock(§ion->lock); + list_add_tail(&page->list, §ion->page_list); + section->free_cnt++; + spin_unlock(§ion->lock); +} + +static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, + unsigned long index, + struct sgx_epc_section *section) +{ + unsigned long nr_pages = size >> PAGE_SHIFT; + unsigned long i; + + section->virt_addr = memremap(phys_addr, size, MEMREMAP_WB); + if (!section->virt_addr) + return false; + + section->pages = vmalloc(nr_pages * sizeof(struct sgx_epc_page)); + if (!section->pages) { + memunmap(section->virt_addr); + return false; + } + + section->phys_addr = phys_addr; + spin_lock_init(§ion->lock); + INIT_LIST_HEAD(§ion->page_list); + INIT_LIST_HEAD(§ion->init_laundry_list); + + for (i = 0; i < nr_pages; i++) { + section->pages[i].section = index; + section->pages[i].flags = 0; + section->pages[i].owner = NULL; + list_add_tail(§ion->pages[i].list, §ion->init_laundry_list); + } + + section->free_cnt = nr_pages; + return true; +} + +/** + * A section metric is concatenated in a way that @low bits 12-31 define the + * bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the + * metric. + */ +static inline u64 __init sgx_calc_section_metric(u64 low, u64 high) +{ + return (low & GENMASK_ULL(31, 12)) + + ((high & GENMASK_ULL(19, 0)) << 32); +} + +static bool __init sgx_page_cache_init(void) +{ + u32 eax, ebx, ecx, edx, type; + u64 pa, size; + int i; + + for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) { + cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx); + + type = eax & SGX_CPUID_EPC_MASK; + if (type == SGX_CPUID_EPC_INVALID) + break; + + if (type != SGX_CPUID_EPC_SECTION) { + pr_err_once("Unknown EPC section type: %u\n", type); + break; + } + + pa = sgx_calc_section_metric(eax, ebx); + size = sgx_calc_section_metric(ecx, edx); + + pr_info("EPC section 0x%llx-0x%llx\n", pa, pa + size - 1); + + if (!sgx_setup_epc_section(pa, size, i, &sgx_epc_sections[i])) { + pr_err("No free memory for an EPC section\n"); + break; + } + + sgx_nr_epc_sections++; + } + + if (!sgx_nr_epc_sections) { + pr_err("There are zero EPC sections.\n"); + return false; + } + + return true; +} + +static void __init sgx_init(void) +{ + int ret; + int i; + + if (!cpu_feature_enabled(X86_FEATURE_SGX)) + return; + + if (!sgx_page_cache_init()) + return; + + if (!sgx_page_reclaimer_init()) + goto err_page_cache; + + ret = sgx_drv_init(); + if (ret) + goto err_kthread; + + return; + +err_kthread: + kthread_stop(ksgxd_tsk); + +err_page_cache: + for (i = 0; i < sgx_nr_epc_sections; i++) { + vfree(sgx_epc_sections[i].pages); + memunmap(sgx_epc_sections[i].virt_addr); + } +} + +device_initcall(sgx_init); diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h new file mode 100644 index 000000000000..5fa42d143feb --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _X86_SGX_H +#define _X86_SGX_H + +#include <linux/bitops.h> +#include <linux/err.h> +#include <linux/io.h> +#include <linux/rwsem.h> +#include <linux/types.h> +#include <asm/asm.h> +#include "arch.h" + +#undef pr_fmt +#define pr_fmt(fmt) "sgx: " fmt + +#define SGX_MAX_EPC_SECTIONS 8 +#define SGX_EEXTEND_BLOCK_SIZE 256 +#define SGX_NR_TO_SCAN 16 +#define SGX_NR_LOW_PAGES 32 +#define SGX_NR_HIGH_PAGES 64 + +/* Pages, which are being tracked by the page reclaimer. */ +#define SGX_EPC_PAGE_RECLAIMER_TRACKED BIT(0) + +struct sgx_epc_page { + unsigned int section; + unsigned int flags; + struct sgx_encl_page *owner; + struct list_head list; +}; + +/* + * The firmware can define multiple chunks of EPC to the different areas of the + * physical memory e.g. for memory areas of the each node. This structure is + * used to store EPC pages for one EPC section and virtual memory area where + * the pages have been mapped. + * + * 'lock' must be held before accessing 'page_list' or 'free_cnt'. + */ +struct sgx_epc_section { + unsigned long phys_addr; + void *virt_addr; + struct sgx_epc_page *pages; + + spinlock_t lock; + struct list_head page_list; + unsigned long free_cnt; + + /* + * Pages which need EREMOVE run on them before they can be + * used. Only safe to be accessed in ksgxd and init code. + * Not protected by locks. + */ + struct list_head init_laundry_list; +}; + +extern struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS]; + +static inline unsigned long sgx_get_epc_phys_addr(struct sgx_epc_page *page) +{ + struct sgx_epc_section *section = &sgx_epc_sections[page->section]; + unsigned long index; + + index = ((unsigned long)page - (unsigned long)section->pages) / sizeof(*page); + + return section->phys_addr + index * PAGE_SIZE; +} + +static inline void *sgx_get_epc_virt_addr(struct sgx_epc_page *page) +{ + struct sgx_epc_section *section = &sgx_epc_sections[page->section]; + unsigned long index; + + index = ((unsigned long)page - (unsigned long)section->pages) / sizeof(*page); + + return section->virt_addr + index * PAGE_SIZE; +} + +struct sgx_epc_page *__sgx_alloc_epc_page(void); +void sgx_free_epc_page(struct sgx_epc_page *page); + +void sgx_mark_page_reclaimable(struct sgx_epc_page *page); +int sgx_unmark_page_reclaimable(struct sgx_epc_page *page); +struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim); + +#endif /* _X86_SGX_H */ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index e19df6cde35d..7798d862983f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -60,6 +60,7 @@ #include <asm/umip.h> #include <asm/insn.h> #include <asm/insn-eval.h> +#include <asm/vdso.h> #ifdef CONFIG_X86_64 #include <asm/x86_init.h> @@ -117,6 +118,9 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str, tsk->thread.error_code = error_code; tsk->thread.trap_nr = trapnr; die(str, regs, error_code); + } else { + if (fixup_vdso_exception(regs, trapnr, error_code, 0)) + return 0; } /* @@ -550,6 +554,9 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_GP; + if (fixup_vdso_exception(regs, X86_TRAP_GP, error_code, 0)) + return; + show_signal(tsk, SIGSEGV, "", desc, regs, error_code); force_sig(SIGSEGV); goto exit; @@ -1048,6 +1055,9 @@ static void math_error(struct pt_regs *regs, int trapnr) if (!si_code) goto exit; + if (fixup_vdso_exception(regs, trapnr, 0, 0)) + return; + force_sig_fault(SIGFPE, si_code, (void __user *)uprobe_get_trap_addr(regs)); exit: diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 82bf37a5c9ec..f1f1b5a0956a 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -30,6 +30,7 @@ #include <asm/cpu_entry_area.h> /* exception stack */ #include <asm/pgtable_areas.h> /* VMALLOC_START, ... */ #include <asm/kvm_para.h> /* kvm_handle_async_pf */ +#include <asm/vdso.h> /* fixup_vdso_exception() */ #define CREATE_TRACE_POINTS #include <asm/trace/exceptions.h> @@ -602,11 +603,9 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code, oops_end(flags, regs, sig); } -static void set_signal_archinfo(unsigned long address, - unsigned long error_code) +static void sanitize_error_code(unsigned long address, + unsigned long *error_code) { - struct task_struct *tsk = current; - /* * To avoid leaking information about the kernel page * table layout, pretend that user-mode accesses to @@ -617,7 +616,13 @@ static void set_signal_archinfo(unsigned long address, * information and does not appear to cause any problems. */ if (address >= TASK_SIZE_MAX) - error_code |= X86_PF_PROT; + *error_code |= X86_PF_PROT; +} + +static void set_signal_archinfo(unsigned long address, + unsigned long error_code) +{ + struct task_struct *tsk = current; tsk->thread.trap_nr = X86_TRAP_PF; tsk->thread.error_code = error_code | X86_PF_USER; @@ -658,6 +663,8 @@ no_context(struct pt_regs *regs, unsigned long error_code, * faulting through the emulate_vsyscall() logic. */ if (current->thread.sig_on_uaccess_err && signal) { + sanitize_error_code(address, &error_code); + set_signal_archinfo(address, error_code); /* XXX: hwpoison faults will set the wrong code. */ @@ -806,13 +813,10 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, if (is_errata100(regs, address)) return; - /* - * To avoid leaking information about the kernel page table - * layout, pretend that user-mode accesses to kernel addresses - * are always protection faults. - */ - if (address >= TASK_SIZE_MAX) - error_code |= X86_PF_PROT; + sanitize_error_code(address, &error_code); + + if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address)) + return; if (likely(show_unhandled_signals)) show_signal_msg(regs, error_code, address, tsk); @@ -931,6 +935,11 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, if (is_prefetch(regs, error_code, address)) return; + sanitize_error_code(address, &error_code); + + if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address)) + return; + set_signal_archinfo(address, error_code); #ifdef CONFIG_MEMORY_FAILURE @@ -1102,6 +1111,18 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) return 1; /* + * SGX hardware blocked the access. This usually happens + * when the enclave memory contents have been destroyed, like + * after a suspend/resume cycle. In any case, the kernel can't + * fix the cause of the fault. Handle the fault as an access + * error even in cases where no actual access violation + * occurred. This allows userspace to rebuild the enclave in + * response to the signal. + */ + if (unlikely(error_code & X86_PF_SGX)) + return 1; + + /* * Make sure to check the VMA so that we do not perform * faults just to hit a X86_PF_PK as soon as we fill in a * page. |